evaluation

package

v0.4.0 (latest) · Published: May 23, 2026 · License: MIT · Imports: 4 · Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/plexusone/structured-evaluation

Links

Open Source Insights

Documentation ¶

Overview ¶

Package evaluation provides types for detailed evaluation reports with severity-based findings and recommendations. This is suited for LLM-as-Judge style reviews like PRD and ARB evaluations.

Index ¶

func AllRequiredPassing(results []CategoryResult, rubric *RubricSet) bool
type ActionItem
type AggregationMethod
type Annotation
type Category
- func NewCategory(id, name, description string) *Category
- func (c *Category) AddOption(value, label string, criteria ...string) *Category
- func (c *Category) GetOptionForValue(value string) *ScaleOption
- func (c *Category) SetEvaluationPrompt(prompt string) *Category
- func (c *Category) SetExamples(examples *CategoryExamples) *Category
- func (c *Category) SetRequired(required bool) *Category
- func (c *Category) SetWeight(weight float64) *Category
- func (c *Category) WithBinary(passCriteria, failCriteria []string) *Category
- func (c *Category) WithChecklist(required, optional []string, threshold *ChecklistThreshold) *Category
- func (c *Category) WithPassPartialFail(passCriteria, partialCriteria, failCriteria []string) *Category
type CategoryExamples
type CategoryResult
- func NewCategoryResult(category string, score ScoreValue, reasoning string) *CategoryResult
- func (cr *CategoryResult) AddEvidence(evidence ...string) *CategoryResult
- func (cr *CategoryResult) AddFinding(f Finding) *CategoryResult
- func (cr *CategoryResult) IsPassing() bool
- func (cr *CategoryResult) SetChecklistResults(results *ChecklistResults) *CategoryResult
type CategoryResultCounts
- func CountResults(results []CategoryResult) CategoryResultCounts
- func (c CategoryResultCounts) AllPassing() bool
type ChecklistResults
type ChecklistThreshold
type Decision
- func Evaluate(results []CategoryResult, findings []Finding, criteria PassCriteria, rubric *RubricSet) Decision
type DecisionStatus
type EvaluationReport
- func NewEvaluationReport(reviewType, document string) *EvaluationReport
- func (r *EvaluationReport) AddCategoryResult(cr CategoryResult)
- func (r *EvaluationReport) AddFinding(f Finding)
- func (r *EvaluationReport) Evaluate(rubric *RubricSet) Decision
- func (r *EvaluationReport) Finalize(rubric *RubricSet, rerunCommand string)
- func (r *EvaluationReport) GenerateNextSteps(rerunCommand string)
- func (r *EvaluationReport) GenerateSummary() string
- func (r *EvaluationReport) GetCategoryResult(categoryID string) *CategoryResult
- func (r *EvaluationReport) SetJudge(judge *JudgeMetadata)
- func (r *EvaluationReport) SetPassCriteria(criteria PassCriteria)
- func (r *EvaluationReport) SetReference(ref *ReferenceData)
- func (r *EvaluationReport) SetRubric(rubricID, rubricVersion string)
type EvaluationType
type Example
type Finding
- func (f *Finding) IsBlocking() bool
type FindingCounts
- func CountFindings(findings []Finding) FindingCounts
- func (c FindingCounts) BlockingCount() int
- func (c FindingCounts) HasBlocking() bool
type FindingLimits
type JudgeCategoricalScore
type JudgeDisagreement
type JudgeMetadata
- func NewJudgeMetadata(model string) *JudgeMetadata
- func (j *JudgeMetadata) SetLatency(d time.Duration)
- func (j *JudgeMetadata) WithPrompt(template, version string) *JudgeMetadata
- func (j *JudgeMetadata) WithProvider(provider string) *JudgeMetadata
- func (j *JudgeMetadata) WithRubric(id, version string) *JudgeMetadata
- func (j *JudgeMetadata) WithTemperature(temp float64) *JudgeMetadata
- func (j *JudgeMetadata) WithTokenUsage(input, output int) *JudgeMetadata
- func (j *JudgeMetadata) WithTrace(traceID, spanID string) *JudgeMetadata
type MultiJudgeResult
- func AggregateEvaluations(evaluations []*EvaluationReport, method AggregationMethod) *MultiJudgeResult
type NextSteps
type PairwiseCategoryScore
type PairwiseComparison
- func NewPairwiseComparison(input, outputA, outputB string) *PairwiseComparison
- func (p *PairwiseComparison) AddCategoryScore(category string, winner PairwiseWinner, reasoning string, margin float64)
- func (p *PairwiseComparison) SetWinner(winner PairwiseWinner, reasoning string, confidence float64)
- func (p *PairwiseComparison) SwappedComparison() *PairwiseComparison
type PairwiseResult
- func ComputePairwiseResult(comparisons []PairwiseComparison) *PairwiseResult
type PairwiseWinner
type PassCriteria
- func DefaultPassCriteria() PassCriteria
- func StrictPassCriteria() PassCriteria
type ReferenceData
- func NewReferenceData(input, expectedOutput string) *ReferenceData
- func (r *ReferenceData) WithAnnotation(name string, score float64, annotatorID string) *ReferenceData
- func (r *ReferenceData) WithContext(ctx ...string) *ReferenceData
type ReferenceDataset
- func NewReferenceDataset(id, name string) *ReferenceDataset
- func (d *ReferenceDataset) AddItem(item ReferenceData)
- func (d *ReferenceDataset) GetByID(id string) *ReferenceData
type ReportMetadata
type RubricMetadata
type RubricPassCriteria
type RubricSet
- func NewRubricSet(id, name, version string) *RubricSet
- func (rs *RubricSet) AddCategory(cat Category) *RubricSet
- func (rs *RubricSet) GetCategory(id string) *Category
- func (rs *RubricSet) GetRequiredCategories() []Category
- func (rs *RubricSet) SetJudgePrompt(template string) *RubricSet
- func (rs *RubricSet) SetMetadata(meta *RubricMetadata) *RubricSet
- func (rs *RubricSet) SetPassCriteria(criteria RubricPassCriteria) *RubricSet
- func (rs *RubricSet) ToJSON() ([]byte, error)
- func (rs *RubricSet) Validate() []string
type Scale
type ScaleOption
type ScaleType
type ScoreValue
- func (s ScoreValue) Icon() string
- func (s ScoreValue) IsFailing() bool
- func (s ScoreValue) IsPartial() bool
- func (s ScoreValue) IsPassing() bool
type Severity
- func AllSeverities() []Severity
- func (s Severity) Icon() string
- func (s Severity) IsBlocking() bool
- func (s Severity) Weight() int
type TokenUsage

Constants ¶

This section is empty.

Variables ¶

This section is empty.

Functions ¶

func AllRequiredPassing ¶ added in v0.4.0

func AllRequiredPassing(results []CategoryResult, rubric *RubricSet) bool

AllRequiredPassing checks if all required categories passed.

Types ¶

type ActionItem ¶

// ActionItem is a specific action to take, optionally tied to the evaluation
// category and finding severity it relates to.
type ActionItem struct {
	// Action describes what needs to be done. It is the only field without
	// omitempty, so it is always present in the JSON output.
	Action string `json:"action"`

	// Category is the related evaluation category.
	Category string `json:"category,omitempty"`

	// Severity is the related finding severity.
	Severity Severity `json:"severity,omitempty"`

	// Owner suggests who should do this.
	Owner string `json:"owner,omitempty"`

	// Effort estimates the work required.
	Effort string `json:"effort,omitempty"`
}

ActionItem is a specific action to take.

type AggregationMethod ¶

// AggregationMethod names the strategy used to combine multiple judge scores.
// The values declared in this package are "majority", "conservative",
// "optimistic", and "unanimous".
type AggregationMethod string

AggregationMethod specifies how to combine multiple judge scores.

// Aggregation methods accepted wherever an AggregationMethod is expected.
const (
	// AggregationMajority uses majority vote for pass/partial/fail.
	AggregationMajority AggregationMethod = "majority"

	// AggregationConservative uses the lowest/most critical score.
	AggregationConservative AggregationMethod = "conservative"

	// AggregationOptimistic uses the highest/most lenient score.
	AggregationOptimistic AggregationMethod = "optimistic"

	// AggregationUnanimous requires all judges to agree.
	AggregationUnanimous AggregationMethod = "unanimous"
)

type Annotation ¶

// Annotation is a label or score attached to reference data, together with
// information about who or what produced it.
type Annotation struct {
	// Name is the annotation type (e.g., "quality", "relevance").
	Name string `json:"name"`

	// Score is a numeric score (if applicable).
	// Because of omitempty, a score of exactly 0 is omitted from the JSON
	// output and is indistinguishable from "no score" after a round trip.
	Score float64 `json:"score,omitempty"`

	// Label is a categorical label (if applicable).
	Label string `json:"label,omitempty"`

	// Explanation provides reasoning for the annotation.
	Explanation string `json:"explanation,omitempty"`

	// AnnotatorID identifies who provided this annotation.
	AnnotatorID string `json:"annotator_id,omitempty"`

	// AnnotatorType indicates human vs automated (e.g., "human", "llm", "rule").
	AnnotatorType string `json:"annotator_type,omitempty"`
}

Annotation represents a human-provided label or score.

type Category ¶ added in v0.4.0

// Category is a single evaluation dimension of a rubric: what is measured,
// how much it counts, and the scale it is scored on.
type Category struct {
	// ID uniquely identifies this category within the rubric.
	ID string `json:"id"`

	// Name is the human-readable category name.
	Name string `json:"name"`

	// Description explains what this category measures.
	Description string `json:"description"`

	// Weight is the relative importance (default 1.0).
	// A weight of 0 is omitted from the JSON output because of omitempty.
	// NOTE(review): where the 1.0 default is applied is not visible here.
	Weight float64 `json:"weight,omitempty"`

	// Required indicates if this category must pass for overall pass.
	Required bool `json:"required,omitempty"`

	// Scale defines how this category is scored.
	Scale Scale `json:"scale"`

	// EvaluationPrompt is a specific prompt for evaluating this category.
	EvaluationPrompt string `json:"evaluationPrompt,omitempty"`

	// Examples provides few-shot examples for LLM evaluation.
	// Research shows 1 example per level improves LLM alignment.
	// A nil pointer is omitted from the JSON output.
	Examples *CategoryExamples `json:"examples,omitempty"`
}

Category is a single evaluation dimension.

func NewCategory ¶ added in v0.4.0

func NewCategory(id, name, description string) *Category

NewCategory creates a new category with a categorical scale.

func (*Category) AddOption ¶ added in v0.4.0

func (c *Category) AddOption(value, label string, criteria ...string) *Category

AddOption adds a scale option to a categorical category.

func (*Category) GetOptionForValue ¶ added in v0.4.0

func (c *Category) GetOptionForValue(value string) *ScaleOption

GetOptionForValue returns the scale option for a given value.

func (*Category) SetEvaluationPrompt ¶ added in v0.4.0

func (c *Category) SetEvaluationPrompt(prompt string) *Category

SetEvaluationPrompt sets the evaluation prompt for this category.

func (*Category) SetExamples ¶ added in v0.4.0

func (c *Category) SetExamples(examples *CategoryExamples) *Category

SetExamples sets few-shot examples for the category.

func (*Category) SetRequired ¶ added in v0.4.0

func (c *Category) SetRequired(required bool) *Category

SetRequired marks this category as required for pass.

func (*Category) SetWeight ¶ added in v0.4.0

func (c *Category) SetWeight(weight float64) *Category

SetWeight sets the category weight.

func (*Category) WithBinary ¶ added in v0.4.0

func (c *Category) WithBinary(passCriteria, failCriteria []string) *Category

WithBinary sets up a binary pass/fail scale.

func (*Category) WithChecklist ¶ added in v0.4.0

func (c *Category) WithChecklist(required, optional []string, threshold *ChecklistThreshold) *Category

WithChecklist sets up a checklist scale.

func (*Category) WithPassPartialFail ¶ added in v0.4.0

func (c *Category) WithPassPartialFail(passCriteria, partialCriteria, failCriteria []string) *Category

WithPassPartialFail sets up a standard pass/partial/fail scale.

type CategoryExamples ¶ added in v0.4.0

// CategoryExamples holds at most one few-shot example per score level
// (pass, partial, fail). Each level is optional; nil entries are omitted
// from the JSON output.
type CategoryExamples struct {
	// Pass is an example that earns a passing score.
	Pass    *Example `json:"pass,omitempty"`
	// Partial is an example that earns a partial score.
	Partial *Example `json:"partial,omitempty"`
	// Fail is an example that earns a failing score.
	Fail    *Example `json:"fail,omitempty"`
}

CategoryExamples provides few-shot examples for a category. Research shows 1 example per level improves LLM alignment.

type CategoryResult ¶ added in v0.4.0

// CategoryResult is the evaluation result for a single category: the score,
// the reasoning behind it, and any supporting evidence and findings.
type CategoryResult struct {
	// Category is the category ID.
	Category string `json:"category"`

	// Score is the assigned score (pass, partial, fail).
	Score ScoreValue `json:"score"`

	// Reasoning explains the score (chain-of-thought).
	Reasoning string `json:"reasoning"`

	// Evidence are specific quotes or observations.
	Evidence []string `json:"evidence,omitempty"`

	// Findings are issues discovered in this category.
	Findings []Finding `json:"findings,omitempty"`

	// ChecklistResults tracks checklist items (for checklist scales).
	// It is nil, and omitted from JSON, when not set.
	ChecklistResults *ChecklistResults `json:"checklistResults,omitempty"`
}

CategoryResult is the evaluation result for a single category.

func NewCategoryResult ¶ added in v0.4.0

func NewCategoryResult(category string, score ScoreValue, reasoning string) *CategoryResult

NewCategoryResult creates a category result with the given score.

func (*CategoryResult) AddEvidence ¶ added in v0.4.0

func (cr *CategoryResult) AddEvidence(evidence ...string) *CategoryResult

AddEvidence adds evidence to the result.

func (*CategoryResult) AddFinding ¶ added in v0.4.0

func (cr *CategoryResult) AddFinding(f Finding) *CategoryResult

AddFinding adds a finding to the result.

func (*CategoryResult) IsPassing ¶ added in v0.4.0

func (cr *CategoryResult) IsPassing() bool

IsPassing returns true if this category passed.

func (*CategoryResult) SetChecklistResults ¶ added in v0.4.0

func (cr *CategoryResult) SetChecklistResults(results *ChecklistResults) *CategoryResult

SetChecklistResults sets the checklist results.

type CategoryResultCounts ¶ added in v0.4.0

// CategoryResultCounts holds the number of category results per score value.
// All four counters are always serialized, including zeros.
type CategoryResultCounts struct {
	// Pass is the number of results with a passing score.
	Pass    int `json:"pass"`
	// Partial is the number of results with a partial score.
	Partial int `json:"partial"`
	// Fail is the number of results with a failing score.
	Fail    int `json:"fail"`
	// Total is the overall number of results counted.
	Total   int `json:"total"`
}

CategoryResultCounts holds the number of category results for each score value, plus the total.

func CountResults ¶ added in v0.4.0

func CountResults(results []CategoryResult) CategoryResultCounts

CountResults counts category results by score.

func (CategoryResultCounts) AllPassing ¶ added in v0.4.0

func (c CategoryResultCounts) AllPassing() bool

AllPassing returns true if all results are passing.

type ChecklistResults ¶ added in v0.4.0

// ChecklistResults tracks which checklist items were found and which were
// missing, split into required and optional items. Empty lists are omitted
// from the JSON output.
type ChecklistResults struct {
	// RequiredPresent are required items that were found.
	RequiredPresent []string `json:"requiredPresent,omitempty"`

	// RequiredMissing are required items that were not found.
	RequiredMissing []string `json:"requiredMissing,omitempty"`

	// OptionalPresent are optional items that were found.
	OptionalPresent []string `json:"optionalPresent,omitempty"`

	// OptionalMissing are optional items that were not found.
	OptionalMissing []string `json:"optionalMissing,omitempty"`
}

ChecklistResults tracks which items were found for checklist scales.

type ChecklistThreshold ¶ added in v0.4.0

// ChecklistThreshold defines pass criteria for checklist scales.
type ChecklistThreshold struct {
	// Required is "all" or a number of required items that must be present.
	// The number is carried as a string (e.g. "3") because the field is a
	// string; an empty value is omitted from JSON.
	Required string `json:"required,omitempty"`

	// Optional is the minimum number of optional items needed.
	// A minimum of 0 is omitted from the JSON output because of omitempty.
	Optional int `json:"optional,omitempty"`
}

ChecklistThreshold defines pass criteria for checklist scales.

type Decision ¶

// Decision represents the evaluation decision: the outcome, its rationale,
// and the finding and category tallies it is reported with. Every field is
// always serialized.
type Decision struct {
	// Status is the decision outcome.
	Status DecisionStatus `json:"status"`

	// Passed indicates if the evaluation passed.
	Passed bool `json:"passed"`

	// Rationale explains the decision.
	Rationale string `json:"rationale"`

	// FindingCounts summarizes findings by severity.
	FindingCounts FindingCounts `json:"findingCounts"`

	// CategoryCounts summarizes category results.
	CategoryCounts CategoryResultCounts `json:"categoryCounts"`
}

Decision represents the evaluation decision.

func Evaluate ¶

func Evaluate(results []CategoryResult, findings []Finding, criteria PassCriteria, rubric *RubricSet) Decision

Evaluate checks category results and findings against criteria.

type DecisionStatus ¶

// DecisionStatus represents the decision outcome. The values declared in
// this package are "pass", "conditional", "fail", and "human_review".
type DecisionStatus string

DecisionStatus represents the decision outcome.

// Decision outcomes that a Decision's Status field can take.
const (
	DecisionPass        DecisionStatus = "pass"         // Meets all criteria
	DecisionConditional DecisionStatus = "conditional"  // Partial scores or non-blocking findings
	DecisionFail        DecisionStatus = "fail"         // Has blocking findings or required categories failed
	DecisionHumanReview DecisionStatus = "human_review" // Requires human judgment
)

type EvaluationReport ¶

// EvaluationReport is the detailed evaluation report for LLM-as-Judge
// reviews. It carries the per-category results and findings together with
// the pass criteria, the resulting decision, next steps, and a summary.
type EvaluationReport struct {
	// Schema is the JSON Schema URL. It is serialized under the "$schema" key.
	Schema string `json:"$schema,omitempty"`

	// Metadata contains report identification and audit info.
	Metadata ReportMetadata `json:"metadata"`

	// ReviewType identifies the type of review (prd, arb, security, article, etc.).
	ReviewType string `json:"reviewType"`

	// Judge contains metadata about the LLM judge.
	Judge *JudgeMetadata `json:"judge,omitempty"`

	// RubricID references the rubric used for scoring.
	RubricID string `json:"rubricId,omitempty"`

	// RubricVersion is the version of the rubric used.
	RubricVersion string `json:"rubricVersion,omitempty"`

	// Reference contains gold/expected data for comparison.
	Reference *ReferenceData `json:"reference,omitempty"`

	// Categories contains results for each evaluation dimension.
	// There is no omitempty, so a nil slice is encoded as JSON null.
	// NOTE(review): confirm the constructor initializes it if consumers expect [].
	Categories []CategoryResult `json:"categories"`

	// Findings are all issues discovered during evaluation.
	// As with Categories, a nil slice is encoded as JSON null.
	Findings []Finding `json:"findings"`

	// PassCriteria defines the requirements for approval.
	PassCriteria PassCriteria `json:"passCriteria"`

	// Decision is the evaluation outcome.
	Decision Decision `json:"decision"`

	// OverallDecision is a simplified pass/conditional/fail status.
	OverallDecision string `json:"overallDecision"`

	// NextSteps provides actionable guidance.
	NextSteps NextSteps `json:"nextSteps"`

	// Summary is the overall assessment.
	Summary string `json:"summary"`
}

EvaluationReport is the detailed evaluation report for LLM-as-Judge reviews.

func NewEvaluationReport ¶

func NewEvaluationReport(reviewType, document string) *EvaluationReport

NewEvaluationReport creates a new evaluation report.

func (*EvaluationReport) AddCategoryResult ¶ added in v0.4.0

func (r *EvaluationReport) AddCategoryResult(cr CategoryResult)

AddCategoryResult adds a category result.

func (*EvaluationReport) AddFinding ¶

func (r *EvaluationReport) AddFinding(f Finding)

AddFinding adds a finding.

func (*EvaluationReport) Evaluate ¶

func (r *EvaluationReport) Evaluate(rubric *RubricSet) Decision

Evaluate computes the decision based on findings and category results.

func (*EvaluationReport) Finalize ¶

func (r *EvaluationReport) Finalize(rubric *RubricSet, rerunCommand string)

Finalize computes all derived fields.

func (*EvaluationReport) GenerateNextSteps ¶

func (r *EvaluationReport) GenerateNextSteps(rerunCommand string)

GenerateNextSteps creates actionable next steps.

func (*EvaluationReport) GenerateSummary ¶

func (r *EvaluationReport) GenerateSummary() string

GenerateSummary creates the summary text.

func (*EvaluationReport) GetCategoryResult ¶ added in v0.4.0

func (r *EvaluationReport) GetCategoryResult(categoryID string) *CategoryResult

GetCategoryResult returns a category result by ID, or nil if not found.

func (*EvaluationReport) SetJudge ¶

func (r *EvaluationReport) SetJudge(judge *JudgeMetadata)

SetJudge sets the judge metadata.

func (*EvaluationReport) SetPassCriteria ¶ added in v0.4.0

func (r *EvaluationReport) SetPassCriteria(criteria PassCriteria)

SetPassCriteria sets the pass criteria.

func (*EvaluationReport) SetReference ¶

func (r *EvaluationReport) SetReference(ref *ReferenceData)

SetReference sets the reference data for comparison.

func (*EvaluationReport) SetRubric ¶

func (r *EvaluationReport) SetRubric(rubricID, rubricVersion string)

SetRubric sets the rubric ID and version.

type EvaluationType ¶ added in v0.4.0

// EvaluationType defines how evaluation is performed. The values declared
// in this package are "analytic" and "holistic".
type EvaluationType string

EvaluationType defines how evaluation is performed.

// Evaluation types accepted wherever an EvaluationType is expected.
const (
	// EvaluationTypeAnalytic scores each category independently (recommended for LLM-as-Judge).
	EvaluationTypeAnalytic EvaluationType = "analytic"

	// EvaluationTypeHolistic provides a single overall score.
	EvaluationTypeHolistic EvaluationType = "holistic"
)

type Example ¶ added in v0.4.0

// Example is a few-shot example for LLM evaluation: a piece of content plus
// the reasoning that justifies its score. Both fields are always serialized.
type Example struct {
	// Excerpt is example content from a document.
	Excerpt string `json:"excerpt"`

	// Reasoning explains why this gets this score.
	// Including reasoning improves LLM alignment (chain-of-thought).
	Reasoning string `json:"reasoning"`
}

Example is a few-shot example for LLM evaluation.

type Finding ¶

// Finding represents an issue discovered during evaluation, with its
// severity, a description, and a recommended fix. Evidence, Owner, and
// Effort are optional and omitted from JSON when empty.
type Finding struct {
	// ID is the unique identifier for this finding.
	ID string `json:"id"`

	// Category is the evaluation category this relates to.
	Category string `json:"category"`

	// Severity indicates the impact level.
	Severity Severity `json:"severity"`

	// Title is a brief summary of the finding.
	Title string `json:"title"`

	// Description provides detailed explanation.
	Description string `json:"description"`

	// Recommendation explains how to fix the issue.
	Recommendation string `json:"recommendation"`

	// Evidence provides specific examples or references.
	Evidence string `json:"evidence,omitempty"`

	// Owner suggests who should address this finding.
	Owner string `json:"owner,omitempty"`

	// Effort estimates the work required (low, medium, high).
	Effort string `json:"effort,omitempty"`
}

Finding represents an issue discovered during evaluation.

func (*Finding) IsBlocking ¶

func (f *Finding) IsBlocking() bool

IsBlocking returns true if this finding blocks approval.

type FindingCounts ¶

// FindingCounts tracks the number of findings by severity. All counters are
// always serialized, including zeros.
type FindingCounts struct {
	// Critical is the number of critical-severity findings.
	Critical int `json:"critical"`
	// High is the number of high-severity findings.
	High     int `json:"high"`
	// Medium is the number of medium-severity findings.
	Medium   int `json:"medium"`
	// Low is the number of low-severity findings.
	Low      int `json:"low"`
	// Info is the number of informational findings.
	Info     int `json:"info"`
	// Total is the overall number of findings counted.
	Total    int `json:"total"`
}

FindingCounts tracks the number of findings by severity.

func CountFindings ¶

func CountFindings(findings []Finding) FindingCounts

CountFindings counts findings by severity.

func (FindingCounts) BlockingCount ¶

func (c FindingCounts) BlockingCount() int

BlockingCount returns the number of blocking findings.

func (FindingCounts) HasBlocking ¶

func (c FindingCounts) HasBlocking() bool

HasBlocking returns true if there are any blocking findings.

type FindingLimits ¶ added in v0.4.0

// FindingLimits sets maximum allowed findings per severity.
// Use -1 for unlimited.
type FindingLimits struct {
	// Critical is the maximum number of critical findings allowed.
	Critical int `json:"critical"`
	// High is the maximum number of high findings allowed.
	High     int `json:"high"`
	// Medium is the maximum number of medium findings allowed.
	Medium   int `json:"medium"`
	// Low is the maximum number of low findings allowed.
	// Unlike the other limits it carries omitempty, so a limit of 0 is
	// omitted from the JSON output.
	// NOTE(review): confirm whether an absent "low" key is meant to read as
	// 0 (none allowed) or as unlimited.
	Low      int `json:"low,omitempty"`
}

FindingLimits sets maximum allowed findings per severity. Use -1 for unlimited.

type JudgeCategoricalScore ¶ added in v0.4.0

// JudgeCategoricalScore is a categorical score from a specific judge.
type JudgeCategoricalScore struct {
	// JudgeID identifies the judge. It is serialized as "judgeId".
	JudgeID string `json:"judgeId"`

	// Score is the judge's categorical score.
	Score ScoreValue `json:"score"`
}

JudgeCategoricalScore is a categorical score from a specific judge.

type JudgeDisagreement ¶

// JudgeDisagreement captures a category where judges gave significantly
// different scores, keeping each judge's score alongside the number of
// distinct scores.
type JudgeDisagreement struct {
	// Category is the evaluation dimension.
	Category string `json:"category"`

	// Scores are the individual judge scores.
	Scores []JudgeCategoricalScore `json:"scores"`

	// UniqueScores is the number of distinct scores given.
	UniqueScores int `json:"uniqueScores"`
}

JudgeDisagreement captures where judges had significantly different scores.

type JudgeMetadata ¶

// JudgeMetadata tracks information about the LLM judge that produced an
// evaluation. This enables reproducibility, debugging, and comparison of
// different judge configurations. Model is the only field without
// omitempty; JSON keys use snake_case.
type JudgeMetadata struct {
	// JudgeID is a unique identifier for this judge configuration.
	JudgeID string `json:"judge_id,omitempty"`

	// Model is the LLM model used (e.g., "claude-3-opus-20240229", "gpt-4-turbo").
	Model string `json:"model"`

	// ModelProvider is the API provider (e.g., "anthropic", "openai", "bedrock").
	ModelProvider string `json:"model_provider,omitempty"`

	// ModelVersion is the specific model version if applicable.
	ModelVersion string `json:"model_version,omitempty"`

	// PromptTemplate is the name/ID of the prompt template used.
	PromptTemplate string `json:"prompt_template,omitempty"`

	// PromptVersion is the version of the prompt template.
	PromptVersion string `json:"prompt_version,omitempty"`

	// SystemPrompt is the system prompt used (or hash/reference if too long).
	SystemPrompt string `json:"system_prompt,omitempty"`

	// Temperature is the sampling temperature used.
	// Because of omitempty, a temperature of exactly 0 is omitted from the
	// JSON output and cannot be told apart from "not recorded".
	Temperature float64 `json:"temperature,omitempty"`

	// MaxTokens is the max tokens setting.
	MaxTokens int `json:"max_tokens,omitempty"`

	// RubricID references the rubric set used for scoring.
	RubricID string `json:"rubric_id,omitempty"`

	// RubricVersion is the version of the rubric used.
	RubricVersion string `json:"rubric_version,omitempty"`

	// EvaluatedAt is when this evaluation was performed.
	// omitempty has no effect on struct values such as time.Time in
	// encoding/json, so a zero time is still serialized.
	EvaluatedAt time.Time `json:"evaluated_at,omitempty"`

	// Latency is the evaluation duration.
	// time.Duration has no custom JSON encoding, so it is serialized as an
	// integer number of nanoseconds.
	Latency time.Duration `json:"latency,omitempty"`

	// TokensUsed tracks token consumption.
	TokensUsed *TokenUsage `json:"tokens_used,omitempty"`

	// TraceID links to observability trace (e.g., for Opik/Phoenix/Langfuse).
	TraceID string `json:"trace_id,omitempty"`

	// SpanID links to observability span.
	SpanID string `json:"span_id,omitempty"`
}

JudgeMetadata tracks information about the LLM judge that produced an evaluation. This enables reproducibility, debugging, and comparison of different judge configurations.

func NewJudgeMetadata ¶

func NewJudgeMetadata(model string) *JudgeMetadata

NewJudgeMetadata creates judge metadata with required fields.

func (*JudgeMetadata) SetLatency ¶

func (j *JudgeMetadata) SetLatency(d time.Duration)

SetLatency records the evaluation duration.

func (*JudgeMetadata) WithPrompt ¶

func (j *JudgeMetadata) WithPrompt(template, version string) *JudgeMetadata

WithPrompt sets the prompt template info.

func (*JudgeMetadata) WithProvider ¶

func (j *JudgeMetadata) WithProvider(provider string) *JudgeMetadata

WithProvider sets the model provider.

func (*JudgeMetadata) WithRubric ¶

func (j *JudgeMetadata) WithRubric(id, version string) *JudgeMetadata

WithRubric sets the rubric reference.

func (*JudgeMetadata) WithTemperature ¶

func (j *JudgeMetadata) WithTemperature(temp float64) *JudgeMetadata

WithTemperature sets the sampling temperature.

func (*JudgeMetadata) WithTokenUsage ¶

func (j *JudgeMetadata) WithTokenUsage(input, output int) *JudgeMetadata

WithTokenUsage sets the token usage.

func (*JudgeMetadata) WithTrace ¶

func (j *JudgeMetadata) WithTrace(traceID, spanID string) *JudgeMetadata

WithTrace sets the trace ID and span ID that link this evaluation to an observability trace.

type MultiJudgeResult ¶

// MultiJudgeResult aggregates evaluations from multiple judges. It keeps the
// individual evaluations next to the aggregated categories, the agreement
// measure, and the consolidated decision and findings.
type MultiJudgeResult struct {
	// Evaluations are the individual judge evaluations.
	Evaluations []*EvaluationReport `json:"evaluations"`

	// Judges contains metadata for each judge.
	Judges []*JudgeMetadata `json:"judges"`

	// AggregatedCategories are the combined category results.
	AggregatedCategories []CategoryResult `json:"aggregatedCategories"`

	// AggregationMethod describes how scores were combined.
	AggregationMethod AggregationMethod `json:"aggregationMethod"`

	// Agreement measures inter-judge agreement (0-1, higher = more agreement).
	Agreement float64 `json:"agreement"`

	// Disagreements lists categories where judges significantly disagreed.
	// It is the only field with omitempty and is dropped from JSON when empty.
	Disagreements []JudgeDisagreement `json:"disagreements,omitempty"`

	// ConsolidatedDecision is the final decision after aggregation.
	ConsolidatedDecision Decision `json:"consolidatedDecision"`

	// ConsolidatedFindings merges findings from all judges.
	// There is no omitempty, so a nil slice is encoded as JSON null.
	ConsolidatedFindings []Finding `json:"consolidatedFindings"`
}

MultiJudgeResult aggregates evaluations from multiple judges. This improves reliability by combining perspectives and detecting disagreement.

func AggregateEvaluations ¶

func AggregateEvaluations(evaluations []*EvaluationReport, method AggregationMethod) *MultiJudgeResult

AggregateEvaluations combines multiple evaluation reports.

type NextSteps ¶

// NextSteps provides actionable workflow guidance, separating blocking
// actions from suggested improvements. Every field is omitted from JSON
// when empty.
type NextSteps struct {
	// RerunCommand is the command to re-run evaluation.
	RerunCommand string `json:"rerunCommand,omitempty"`

	// Immediate are blocking actions that must be completed.
	Immediate []ActionItem `json:"immediate,omitempty"`

	// Recommended are suggested improvements.
	Recommended []ActionItem `json:"recommended,omitempty"`
}

NextSteps provides actionable workflow guidance.

type PairwiseCategoryScore ¶

// PairwiseCategoryScore compares two outputs on a specific evaluation
// dimension.
type PairwiseCategoryScore struct {
	// Category is the evaluation dimension.
	Category string `json:"category"`

	// Winner indicates which output won for this category.
	Winner PairwiseWinner `json:"winner"`

	// Margin indicates how much better the winner is (0-1, higher = larger gap).
	// A margin of exactly 0 is omitted from the JSON output because of omitempty.
	Margin float64 `json:"margin,omitempty"`

	// Reasoning explains the category-level comparison.
	Reasoning string `json:"reasoning,omitempty"`
}

PairwiseCategoryScore compares outputs on a specific dimension.

type PairwiseComparison ¶

// PairwiseComparison represents a comparison between two outputs produced
// for the same input. This is an alternative to absolute scoring that can
// reduce position bias and improve reliability of LLM-as-Judge evaluations.
type PairwiseComparison struct {
	// ID is the unique identifier for this comparison.
	ID string `json:"id,omitempty"`

	// Input is the shared input/prompt for both outputs.
	Input string `json:"input"`

	// OutputA is the first output being compared.
	OutputA string `json:"output_a"`

	// OutputB is the second output being compared.
	OutputB string `json:"output_b"`

	// Winner indicates which output won ("A", "B", "tie", or "uncertain").
	Winner PairwiseWinner `json:"winner"`

	// Confidence is the judge's confidence in the decision (0-1).
	// A confidence of exactly 0 is omitted from the JSON output because of
	// omitempty.
	Confidence float64 `json:"confidence,omitempty"`

	// Reasoning explains why this winner was chosen.
	Reasoning string `json:"reasoning"`

	// CategoryScores provides per-category comparisons if applicable.
	CategoryScores []PairwiseCategoryScore `json:"category_scores,omitempty"`

	// Judge contains metadata about the LLM judge.
	Judge *JudgeMetadata `json:"judge,omitempty"`

	// Metadata contains additional comparison context.
	Metadata map[string]any `json:"metadata,omitempty"`

	// CreatedAt is when this comparison was made.
	// omitempty has no effect on struct values such as time.Time in
	// encoding/json, so a zero time is still serialized.
	CreatedAt time.Time `json:"created_at,omitempty"`
}

PairwiseComparison represents a comparison between two outputs. This is an alternative to absolute scoring that can reduce position bias and improve reliability of LLM-as-Judge evaluations.

func NewPairwiseComparison ¶

func NewPairwiseComparison(input, outputA, outputB string) *PairwiseComparison

NewPairwiseComparison creates a new pairwise comparison.

func (*PairwiseComparison) AddCategoryScore ¶

func (p *PairwiseComparison) AddCategoryScore(category string, winner PairwiseWinner, reasoning string, margin float64)

AddCategoryScore adds a per-category comparison.

func (*PairwiseComparison) SetWinner ¶

func (p *PairwiseComparison) SetWinner(winner PairwiseWinner, reasoning string, confidence float64)

SetWinner sets the comparison result.

func (*PairwiseComparison) SwappedComparison ¶

func (p *PairwiseComparison) SwappedComparison() *PairwiseComparison

SwappedComparison creates a comparison with A and B swapped. Running both orders helps detect position bias in the judge.

type PairwiseResult ¶

// PairwiseResult aggregates multiple pairwise comparisons into win/tie
// rates and an overall winner. Every field is always serialized.
type PairwiseResult struct {
	// Comparisons are all the individual comparisons.
	Comparisons []PairwiseComparison `json:"comparisons"`

	// WinRateA is the percentage of comparisons won by A.
	// NOTE(review): the scale (0-1 vs 0-100) is not visible here — confirm.
	WinRateA float64 `json:"win_rate_a"`

	// WinRateB is the percentage of comparisons won by B.
	WinRateB float64 `json:"win_rate_b"`

	// TieRate is the percentage of ties.
	TieRate float64 `json:"tie_rate"`

	// OverallWinner is the aggregated winner.
	OverallWinner PairwiseWinner `json:"overall_winner"`

	// Confidence is the overall confidence in the result.
	Confidence float64 `json:"confidence"`
}

PairwiseResult aggregates multiple pairwise comparisons.

func ComputePairwiseResult ¶

func ComputePairwiseResult(comparisons []PairwiseComparison) *PairwiseResult

ComputePairwiseResult aggregates multiple comparisons into a result.

type PairwiseWinner ¶

// PairwiseWinner indicates the winner of a pairwise comparison. The values
// declared in this package are "A", "B", "tie", and "uncertain".
type PairwiseWinner string

PairwiseWinner indicates the winner of a pairwise comparison.

// Outcomes that a pairwise comparison can report.
const (
	// WinnerA indicates output A is better.
	WinnerA PairwiseWinner = "A"

	// WinnerB indicates output B is better.
	WinnerB PairwiseWinner = "B"

	// WinnerTie indicates both outputs are roughly equal.
	WinnerTie PairwiseWinner = "tie"

	// WinnerUncertain indicates the judge couldn't determine a winner.
	WinnerUncertain PairwiseWinner = "uncertain"
)

type PassCriteria ¶

// PassCriteria defines the requirements for approval: how many categories
// must pass and how many findings of each severity are tolerated.
type PassCriteria struct {
	// MinCategoriesPassing specifies how many categories must pass.
	// Values: "all", "all_required", or a number like "3".
	MinCategoriesPassing string `json:"minCategoriesPassing"`

	// MaxFindings limits findings by severity.
	// Use -1 for unlimited.
	// A nil pointer is omitted from the JSON output.
	// NOTE(review): the JSON key is "maxFindingsSeverity", not "maxFindings" —
	// confirm this matches the published schema.
	MaxFindings *FindingLimits `json:"maxFindingsSeverity,omitempty"`
}

PassCriteria defines the requirements for approval. Aligned with LLM-as-Judge best practices.

func DefaultPassCriteria ¶

func DefaultPassCriteria() PassCriteria

DefaultPassCriteria returns standard pass criteria. All required categories must pass, 0 critical/high findings allowed.

func StrictPassCriteria ¶

func StrictPassCriteria() PassCriteria

StrictPassCriteria returns strict pass criteria. All categories must pass, max 3 medium findings.

type ReferenceData ¶

// ReferenceData holds ground-truth or expected data that an output can be
// compared against. Every field is tagged omitempty, so empty fields are
// left out of the JSON encoding.
type ReferenceData struct {
	// ID is the unique identifier for this reference.
	ID string `json:"id,omitempty"`

	// Input is the input/prompt that produced the reference output.
	Input string `json:"input,omitempty"`

	// ExpectedOutput is the gold/reference output.
	ExpectedOutput string `json:"expected_output,omitempty"`

	// ExpectedOutputs allows multiple acceptable outputs.
	// NOTE(review): how this interacts with ExpectedOutput when both are
	// set is not specified here; confirm with the consuming code.
	ExpectedOutputs []string `json:"expected_outputs,omitempty"`

	// Context provides additional context (e.g., retrieved documents for RAG).
	Context []string `json:"context,omitempty"`

	// Annotations are human-provided labels or scores.
	Annotations []Annotation `json:"annotations,omitempty"`

	// Source indicates where this reference came from.
	Source string `json:"source,omitempty"`

	// Tags categorize or filter references.
	Tags []string `json:"tags,omitempty"`

	// Metadata contains additional reference data.
	// Values are untyped (any), keyed by string.
	Metadata map[string]any `json:"metadata,omitempty"`
}

ReferenceData contains ground truth or expected data for evaluation. This enables reference-based evaluation where outputs are compared against known-good examples.

func NewReferenceData ¶

func NewReferenceData(input, expectedOutput string) *ReferenceData

NewReferenceData creates a new reference data item.

func (*ReferenceData) WithAnnotation ¶

func (r *ReferenceData) WithAnnotation(name string, score float64, annotatorID string) *ReferenceData

WithAnnotation adds a human annotation.

func (*ReferenceData) WithContext ¶

func (r *ReferenceData) WithContext(ctx ...string) *ReferenceData

WithContext adds context documents.

type ReferenceDataset ¶

// ReferenceDataset is a named, optionally versioned collection of
// ReferenceData items. ID, Name and Items are always present in the JSON
// encoding; the remaining fields are omitted when empty.
type ReferenceDataset struct {
	// ID is the unique identifier for this dataset.
	ID string `json:"id"`

	// Name is the display name.
	Name string `json:"name"`

	// Description explains what this dataset contains.
	Description string `json:"description,omitempty"`

	// Version tracks dataset iterations.
	// It is a free-form string; no format is enforced by the type.
	Version string `json:"version,omitempty"`

	// Items are the reference data items.
	// There is no omitempty, so the "items" key is always emitted
	// (a nil slice encodes as JSON null).
	Items []ReferenceData `json:"items"`

	// Tags categorize the dataset.
	Tags []string `json:"tags,omitempty"`

	// Metadata contains additional dataset info.
	// Values are untyped (any), keyed by string.
	Metadata map[string]any `json:"metadata,omitempty"`
}

ReferenceDataset is a collection of reference data items.

func NewReferenceDataset ¶

func NewReferenceDataset(id, name string) *ReferenceDataset

NewReferenceDataset creates a new reference dataset.

func (*ReferenceDataset) AddItem ¶

func (d *ReferenceDataset) AddItem(item ReferenceData)

AddItem adds a reference data item to the dataset.

func (*ReferenceDataset) GetByID ¶

func (d *ReferenceDataset) GetByID(id string) *ReferenceData

GetByID retrieves a reference item by ID.

type ReportMetadata ¶

// ReportMetadata identifies a report: the document that was evaluated,
// when and by what the report was generated, and who reviewed it.
// Document and GeneratedAt are always serialized; the other fields are
// omitted from JSON when empty.
type ReportMetadata struct {
	// Document is the filename or path being evaluated.
	Document string `json:"document"`

	// DocumentID is the document identifier (e.g., PRD ID).
	DocumentID string `json:"documentId,omitempty"`

	// DocumentTitle is the document title.
	DocumentTitle string `json:"documentTitle,omitempty"`

	// DocumentVersion is the document version.
	DocumentVersion string `json:"documentVersion,omitempty"`

	// GeneratedAt is when the report was created.
	// It has no omitempty tag, so even the zero time is written out.
	GeneratedAt time.Time `json:"generatedAt"`

	// GeneratedBy identifies what created this report.
	GeneratedBy string `json:"generatedBy,omitempty"`

	// ReviewerID identifies the reviewer (agent or human).
	ReviewerID string `json:"reviewerId,omitempty"`
}

ReportMetadata identifies a report: the document that was evaluated, when and by what the report was generated, and who reviewed it.

type RubricMetadata ¶ added in v0.4.0

// RubricMetadata carries optional, descriptive information about a rubric.
// All fields are omitted from JSON when empty.
type RubricMetadata struct {
	// CreatedAt is stored as a plain string rather than a time.Time.
	// NOTE(review): presumably a creation timestamp; the expected format
	// is not specified here — confirm.
	CreatedAt string   `json:"createdAt,omitempty"`
	// Author is a free-form string.
	// NOTE(review): presumably the rubric's author — confirm.
	Author    string   `json:"author,omitempty"`
	// BasedOn is a list of strings.
	// NOTE(review): presumably the sources or references the rubric was
	// derived from — confirm.
	BasedOn   []string `json:"basedOn,omitempty"`
}

RubricMetadata contains additional rubric information.

type RubricPassCriteria ¶ added in v0.4.0

// RubricPassCriteria defines requirements for overall pass/fail
// determination of a rubric. It has the same two fields and JSON keys as
// PassCriteria, except that both are omitted from JSON when empty.
type RubricPassCriteria struct {
	// MinCategoriesPassing is "all", "all_required", or a number.
	// The number is carried as a string, not as an integer.
	MinCategoriesPassing string `json:"minCategoriesPassing,omitempty"`

	// MaxFindings limits findings by severity.
	// It is serialized under the key "maxFindingsSeverity", not
	// "maxFindings"; a nil pointer is omitted.
	MaxFindings *FindingLimits `json:"maxFindingsSeverity,omitempty"`
}

RubricPassCriteria defines requirements for overall pass/fail determination.

type RubricSet ¶

// RubricSet describes a complete evaluation rubric: identification,
// pass criteria, the categories to score, and an optional judge prompt.
// ID, Name, Version, PassCriteria and Categories are always serialized;
// the remaining fields are omitted from JSON when empty.
type RubricSet struct {
	// ID uniquely identifies this rubric set.
	ID string `json:"id"`

	// Name is the human-readable name.
	Name string `json:"name"`

	// Version is the semantic version of this rubric.
	// It is stored as a string; the type itself does not enforce a format.
	Version string `json:"version"`

	// Description explains what this rubric set evaluates.
	Description string `json:"description,omitempty"`

	// EvaluationType is "analytic" (per-category) or "holistic" (single score).
	// Analytic is recommended for LLM-as-Judge.
	EvaluationType EvaluationType `json:"evaluationType,omitempty"`

	// PassCriteria defines requirements for overall pass/fail.
	// It is embedded by value, so the "passCriteria" key is always emitted.
	PassCriteria RubricPassCriteria `json:"passCriteria"`

	// Categories are the evaluation dimensions.
	Categories []Category `json:"categories"`

	// JudgePromptTemplate is the prompt template for LLM evaluation.
	// Supports placeholders: {content}, {categories}, etc.
	JudgePromptTemplate string `json:"judgePromptTemplate,omitempty"`

	// Metadata contains additional information about the rubric.
	// Optional: a nil pointer is omitted from JSON.
	Metadata *RubricMetadata `json:"metadata,omitempty"`
}

RubricSet is a collection of rubrics for a complete evaluation. It follows Go-first principles: the Go types are the source of truth, and the JSON Schema is generated from them.

func NewRubricSet ¶ added in v0.4.0

func NewRubricSet(id, name, version string) *RubricSet

NewRubricSet creates a new rubric set with required fields.

func (*RubricSet) AddCategory ¶ added in v0.4.0

func (rs *RubricSet) AddCategory(cat Category) *RubricSet

AddCategory adds a category to the rubric set.

func (*RubricSet) GetCategory ¶ added in v0.4.0

func (rs *RubricSet) GetCategory(id string) *Category

GetCategory returns a category by ID, or nil if not found.

func (*RubricSet) GetRequiredCategories ¶ added in v0.4.0

func (rs *RubricSet) GetRequiredCategories() []Category

GetRequiredCategories returns all required categories.

func (*RubricSet) SetJudgePrompt ¶ added in v0.4.0

func (rs *RubricSet) SetJudgePrompt(template string) *RubricSet

SetJudgePrompt sets the judge prompt template.

func (*RubricSet) SetMetadata ¶ added in v0.4.0

func (rs *RubricSet) SetMetadata(meta *RubricMetadata) *RubricSet

SetMetadata sets the rubric metadata.

func (*RubricSet) SetPassCriteria ¶ added in v0.4.0

func (rs *RubricSet) SetPassCriteria(criteria RubricPassCriteria) *RubricSet

SetPassCriteria sets the pass criteria.

func (*RubricSet) ToJSON ¶ added in v0.4.0

func (rs *RubricSet) ToJSON() ([]byte, error)

ToJSON serializes a rubric set to JSON.

func (*RubricSet) Validate ¶ added in v0.4.0

func (rs *RubricSet) Validate() []string

Validate checks the rubric for common issues.

type Scale ¶ added in v0.4.0

// Scale defines the scoring mechanism for a category. Type selects the
// kind of scale; per the field comments, Options applies to categorical
// scales and the item lists and threshold apply to checklist scales.
// Only Type is always serialized; the other fields are omitted when empty.
type Scale struct {
	// Type is "categorical", "checklist", or "binary".
	// Categorical with 2-3 options is recommended for LLM-as-Judge.
	Type ScaleType `json:"type"`

	// Options are the scoring options (for categorical scales).
	// NOTE(review): whether binary scales also populate Options is not
	// stated here; confirm.
	Options []ScaleOption `json:"options,omitempty"`

	// RequiredItems are items that must be present (for checklist scales).
	RequiredItems []string `json:"requiredItems,omitempty"`

	// OptionalItems are items that add value (for checklist scales).
	OptionalItems []string `json:"optionalItems,omitempty"`

	// PassingThreshold defines pass criteria (for checklist scales).
	// Optional: a nil pointer is omitted from JSON.
	PassingThreshold *ChecklistThreshold `json:"passingThreshold,omitempty"`
}

Scale defines the scoring mechanism for a category.

type ScaleOption ¶ added in v0.4.0

// ScaleOption is a single option in a categorical scale: a machine-readable
// value, a display label, and the criteria that justify choosing it.
// None of the fields use omitempty, so all three keys are always emitted.
type ScaleOption struct {
	// Value is the machine-readable value (e.g., "pass", "partial", "fail").
	Value string `json:"value"`

	// Label is the human-readable label.
	Label string `json:"label"`

	// Criteria are specific requirements for this score level.
	// A nil slice encodes as JSON null rather than an empty array.
	Criteria []string `json:"criteria"`
}

ScaleOption is a single option in a categorical scale.

type ScaleType ¶ added in v0.4.0

// ScaleType is a string-typed name for the kind of scoring scale. The
// declared values are ScaleTypeCategorical, ScaleTypeChecklist and
// ScaleTypeBinary.
type ScaleType string

ScaleType defines the type of scoring scale.

// Declared ScaleType values; the underlying strings are the values
// documented for Scale.Type.
const (
	// ScaleTypeCategorical uses discrete categories (pass/partial/fail).
	// Recommended for LLM-as-Judge - better calibrated than numeric scales.
	ScaleTypeCategorical ScaleType = "categorical"

	// ScaleTypeChecklist uses a list of required/optional items.
	ScaleTypeChecklist ScaleType = "checklist"

	// ScaleTypeBinary is simple pass/fail.
	ScaleTypeBinary ScaleType = "binary"
)

type ScoreValue ¶ added in v0.4.0

// ScoreValue is a string-typed categorical score. The declared values are
// ScorePass, ScorePartial and ScoreFail.
type ScoreValue string

ScoreValue represents a categorical score value.

// Declared ScoreValue values. Which of them count as passing, partial or
// failing is decided by the IsPassing, IsPartial and IsFailing methods,
// whose implementations are not shown here.
const (
	ScorePass    ScoreValue = "pass"    // ScorePass is the "pass" score.
	ScorePartial ScoreValue = "partial" // ScorePartial is the "partial" score.
	ScoreFail    ScoreValue = "fail"    // ScoreFail is the "fail" score.
)

func (ScoreValue) Icon ¶ added in v0.4.0

func (s ScoreValue) Icon() string

Icon returns the emoji icon for the score.

func (ScoreValue) IsFailing ¶ added in v0.4.0

func (s ScoreValue) IsFailing() bool

IsFailing returns true if this score is failing.

func (ScoreValue) IsPartial ¶ added in v0.4.0

func (s ScoreValue) IsPartial() bool

IsPartial returns true if this score is partial.

func (ScoreValue) IsPassing ¶ added in v0.4.0

func (s ScoreValue) IsPassing() bool

IsPassing returns true if this score is considered passing.

type Severity ¶

// Severity is a string-typed severity level for a finding. The declared
// values are SeverityCritical, SeverityHigh, SeverityMedium, SeverityLow
// and SeverityInfo.
type Severity string

Severity represents the severity level of a finding. Based on InfoSec severity classifications.

// Declared Severity values, listed from critical down to info. Per the
// trailing comments, critical and high block approval; the authoritative
// check is Severity.IsBlocking, whose implementation is not shown here.
const (
	SeverityCritical Severity = "critical" // Blocks approval, must fix
	SeverityHigh     Severity = "high"     // Blocks approval, must fix
	SeverityMedium   Severity = "medium"   // Should fix before approval
	SeverityLow      Severity = "low"      // Nice to fix
	SeverityInfo     Severity = "info"     // Informational only
)

func AllSeverities ¶

func AllSeverities() []Severity

AllSeverities returns all severity levels in order of severity.

func (Severity) Icon ¶

func (s Severity) Icon() string

Icon returns the emoji icon for the severity.

func (Severity) IsBlocking ¶

func (s Severity) IsBlocking() bool

IsBlocking returns true if this severity blocks approval.

func (Severity) Weight ¶

func (s Severity) Weight() int

Weight returns a numeric weight for sorting (higher = more severe).

type TokenUsage ¶

// TokenUsage tracks token consumption for an evaluation. The three main
// counters are always serialized (snake_case keys); the two cache
// counters are omitted from JSON when zero.
type TokenUsage struct {
	// InputTokens is the number of input/prompt tokens.
	InputTokens int `json:"input_tokens"`

	// OutputTokens is the number of output/completion tokens.
	OutputTokens int `json:"output_tokens"`

	// TotalTokens is the total tokens used.
	// NOTE(review): whether this includes the cache counters below, and
	// whether it is computed or caller-supplied, is not stated here.
	TotalTokens int `json:"total_tokens"`

	// CacheReadTokens is tokens read from cache (if applicable).
	CacheReadTokens int `json:"cache_read_tokens,omitempty"`

	// CacheWriteTokens is tokens written to cache (if applicable).
	CacheWriteTokens int `json:"cache_write_tokens,omitempty"`
}

TokenUsage tracks token consumption for an evaluation.

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL