Documentation ¶
Overview ¶
Package evaluation provides types for detailed evaluation reports with severity-based findings and recommendations. It is suited for LLM-as-Judge style reviews such as PRD and ARB evaluations.
Index ¶
- type ActionItem
- type AggregationMethod
- type Annotation
- type CategoryScore
- type Decision
- type DecisionStatus
- type EvaluationReport
- func (r *EvaluationReport) AddCategory(cs CategoryScore)
- func (r *EvaluationReport) AddFinding(f Finding)
- func (r *EvaluationReport) ComputeWeightedScore() float64
- func (r *EvaluationReport) Evaluate() Decision
- func (r *EvaluationReport) Finalize(rerunCommand string)
- func (r *EvaluationReport) GenerateNextSteps(rerunCommand string)
- func (r *EvaluationReport) GenerateSummary() string
- func (r *EvaluationReport) SetJudge(judge *JudgeMetadata)
- func (r *EvaluationReport) SetReference(ref *ReferenceData)
- func (r *EvaluationReport) SetRubric(rubricID string)
- type Finding
- type FindingCounts
- type JudgeDisagreement
- type JudgeMetadata
- func (j *JudgeMetadata) SetLatency(d time.Duration)
- func (j *JudgeMetadata) WithPrompt(template, version string) *JudgeMetadata
- func (j *JudgeMetadata) WithProvider(provider string) *JudgeMetadata
- func (j *JudgeMetadata) WithRubric(id, version string) *JudgeMetadata
- func (j *JudgeMetadata) WithTemperature(temp float64) *JudgeMetadata
- func (j *JudgeMetadata) WithTokenUsage(input, output int) *JudgeMetadata
- func (j *JudgeMetadata) WithTrace(traceID, spanID string) *JudgeMetadata
- type JudgeScore
- type MultiJudgeResult
- type NextSteps
- type PairwiseCategoryScore
- type PairwiseComparison
- type PairwiseResult
- type PairwiseWinner
- type PassCriteria
- type ReferenceData
- type ReferenceDataset
- type ReportMetadata
- type Rubric
- func (r *Rubric) AddAnchor(score float64, label, description string, criteria ...string) *Rubric
- func (r *Rubric) AddExample(score float64, output, explanation string) *Rubric
- func (r *Rubric) AddRangeAnchor(minScore, maxScore float64, label, description string, criteria ...string) *Rubric
- func (r *Rubric) GetAnchorForScore(score float64) *ScoreAnchor
- type RubricExample
- type RubricSet
- type ScoreAnchor
- type ScoreStatus
- type Severity
- type TokenUsage
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type ActionItem ¶
type ActionItem struct {
// Action describes what needs to be done.
Action string `json:"action"`
// Category is the related evaluation category.
Category string `json:"category,omitempty"`
// Severity is the related finding severity.
Severity Severity `json:"severity,omitempty"`
// Owner suggests who should do this.
Owner string `json:"owner,omitempty"`
// Effort estimates work required.
Effort string `json:"effort,omitempty"`
}
ActionItem is a specific action to take.
type AggregationMethod ¶ added in v0.2.0
type AggregationMethod string
AggregationMethod specifies how to combine multiple judge scores.
const (
	// AggregationMean uses the arithmetic mean of scores.
	AggregationMean AggregationMethod = "mean"
	// AggregationMedian uses the median score.
	AggregationMedian AggregationMethod = "median"
	// AggregationWeighted uses weighted average based on judge confidence.
	AggregationWeighted AggregationMethod = "weighted"
	// AggregationMajority uses majority vote for pass/fail.
	AggregationMajority AggregationMethod = "majority"
	// AggregationConservative uses the lowest/most critical score.
	AggregationConservative AggregationMethod = "conservative"
)
type Annotation ¶ added in v0.2.0
type Annotation struct {
// Name is the annotation type (e.g., "quality", "relevance").
Name string `json:"name"`
// Score is a numeric score (if applicable).
Score float64 `json:"score,omitempty"`
// Label is a categorical label (if applicable).
Label string `json:"label,omitempty"`
// Explanation provides reasoning for the annotation.
Explanation string `json:"explanation,omitempty"`
// AnnotatorID identifies who provided this annotation.
AnnotatorID string `json:"annotator_id,omitempty"`
// AnnotatorType indicates human vs automated (e.g., "human", "llm", "rule").
AnnotatorType string `json:"annotator_type,omitempty"`
}
Annotation represents a human-provided label or score.
type CategoryScore ¶
type CategoryScore struct {
// Category is the name/ID of the category.
Category string `json:"category"`
// Weight is the category weight (0.0-1.0, should sum to 1.0).
Weight float64 `json:"weight"`
// Score is the category score (0.0-10.0).
Score float64 `json:"score"`
// MaxScore is the maximum possible score (default 10.0).
MaxScore float64 `json:"max_score"`
// Status is the derived status (pass/warn/fail).
Status ScoreStatus `json:"status"`
// Justification explains why this score was given.
Justification string `json:"justification"`
// Evidence provides specific supporting evidence.
Evidence string `json:"evidence,omitempty"`
// Findings are issues found in this category.
Findings []Finding `json:"findings,omitempty"`
}
CategoryScore represents a score for a single evaluation category.
func NewCategoryScore ¶
func NewCategoryScore(category string, weight, score float64, justification string) CategoryScore
NewCategoryScore creates a category score with computed status.
func (*CategoryScore) ComputeStatus ¶
func (c *CategoryScore) ComputeStatus() ScoreStatus
ComputeStatus calculates the status from the score.
func (*CategoryScore) ComputeWeightedScore ¶
func (c *CategoryScore) ComputeWeightedScore() float64
ComputeWeightedScore calculates the weighted contribution of this category.
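Example (illustrative sketch, not shipped with the package). It uses only the constructor and methods documented above; the category name, weight, score, and justification are placeholders:

func ExampleNewCategoryScore() {
	// Weight 0.25 means this category contributes up to 25% of the overall weighted score.
	cs := NewCategoryScore("clarity", 0.25, 8.0, "Requirements are specific and testable")

	// Status is derived from the score (pass/warn/fail).
	_ = cs.Status

	// ComputeWeightedScore returns this category's weighted contribution to the report total.
	_ = cs.ComputeWeightedScore()
}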
type Decision ¶
type Decision struct {
// Status is the decision outcome.
Status DecisionStatus `json:"status"`
// Passed indicates if the evaluation passed.
Passed bool `json:"passed"`
// Rationale explains the decision.
Rationale string `json:"rationale"`
// FindingCounts summarizes findings by severity.
FindingCounts FindingCounts `json:"finding_counts"`
// WeightedScore is the final weighted score.
WeightedScore float64 `json:"weighted_score"`
}
Decision represents the evaluation decision.
type DecisionStatus ¶
type DecisionStatus string
DecisionStatus represents the decision outcome.
const (
	DecisionPass        DecisionStatus = "pass"         // Meets all criteria
	DecisionConditional DecisionStatus = "conditional"  // Meets score but has findings
	DecisionFail        DecisionStatus = "fail"         // Has blocking findings
	DecisionHumanReview DecisionStatus = "human_review" // Requires human judgment
)
type EvaluationReport ¶
type EvaluationReport struct {
// Schema is the JSON Schema URL.
Schema string `json:"$schema,omitempty"`
// Metadata contains report identification and audit info.
Metadata ReportMetadata `json:"metadata"`
// ReviewType identifies the type of review (prd, arb, security, etc.).
ReviewType string `json:"review_type"`
// Judge contains metadata about the LLM judge (v0.2.0).
Judge *JudgeMetadata `json:"judge,omitempty"`
// RubricID references the rubric used for scoring (v0.2.0).
RubricID string `json:"rubric_id,omitempty"`
// Reference contains gold/expected data for comparison (v0.2.0).
Reference *ReferenceData `json:"reference,omitempty"`
// Categories contains scores for each evaluation dimension.
Categories []CategoryScore `json:"categories"`
// Findings are all issues discovered during evaluation.
Findings []Finding `json:"findings"`
// WeightedScore is the overall weighted score.
WeightedScore float64 `json:"weighted_score"`
// PassCriteria defines the requirements for approval.
PassCriteria PassCriteria `json:"pass_criteria"`
// Decision is the evaluation outcome.
Decision Decision `json:"decision"`
// NextSteps provides actionable guidance.
NextSteps NextSteps `json:"next_steps"`
// Summary is the overall assessment.
Summary string `json:"summary"`
}
EvaluationReport is the detailed evaluation report for LLM-as-Judge reviews.
func NewEvaluationReport ¶
func NewEvaluationReport(reviewType, document string) *EvaluationReport
NewEvaluationReport creates a new evaluation report.
func (*EvaluationReport) AddCategory ¶
func (r *EvaluationReport) AddCategory(cs CategoryScore)
AddCategory adds a category score.
func (*EvaluationReport) AddFinding ¶
func (r *EvaluationReport) AddFinding(f Finding)
AddFinding adds a finding.
func (*EvaluationReport) ComputeWeightedScore ¶
func (r *EvaluationReport) ComputeWeightedScore() float64
ComputeWeightedScore calculates the overall weighted score.
func (*EvaluationReport) Evaluate ¶
func (r *EvaluationReport) Evaluate() Decision
Evaluate computes the decision based on findings and score.
func (*EvaluationReport) Finalize ¶
func (r *EvaluationReport) Finalize(rerunCommand string)
Finalize computes all derived fields.
func (*EvaluationReport) GenerateNextSteps ¶
func (r *EvaluationReport) GenerateNextSteps(rerunCommand string)
GenerateNextSteps creates actionable next steps.
func (*EvaluationReport) GenerateSummary ¶
func (r *EvaluationReport) GenerateSummary() string
GenerateSummary creates the summary text.
func (*EvaluationReport) SetJudge ¶ added in v0.2.0
func (r *EvaluationReport) SetJudge(judge *JudgeMetadata)
SetJudge sets the judge metadata.
func (*EvaluationReport) SetReference ¶ added in v0.2.0
func (r *EvaluationReport) SetReference(ref *ReferenceData)
SetReference sets the reference data for comparison.
func (*EvaluationReport) SetRubric ¶ added in v0.2.0
func (r *EvaluationReport) SetRubric(rubricID string)
SetRubric sets the rubric ID.
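Example (illustrative sketch, not shipped with the package). A minimal end-to-end report-building flow using only the constructors and methods documented here; the document path, scores, severity value, and rerun command are placeholders:

func ExampleEvaluationReport() {
	r := NewEvaluationReport("prd", "docs/checkout-prd.md") // placeholder document

	// Attach judge metadata and the rubric used for scoring.
	r.SetJudge(NewJudgeMetadata("claude-3-opus-20240229").WithProvider("anthropic"))
	r.SetRubric("prd-rubric")

	// Record category scores and any findings discovered.
	r.AddCategory(NewCategoryScore("clarity", 0.5, 8.0, "Goals are unambiguous"))
	r.AddCategory(NewCategoryScore("completeness", 0.5, 6.5, "Missing rollout plan"))
	r.AddFinding(Finding{
		ID:             "F-001",
		Category:       "completeness",
		Severity:       "high", // placeholder; the package's Severity constants are not shown in this section
		Title:          "No rollout plan",
		Description:    "The PRD does not describe how the feature will be rolled out.",
		Recommendation: "Add a phased rollout section.",
	})

	// Finalize computes the weighted score, decision, next steps, and summary.
	r.Finalize("make evaluate-prd") // placeholder rerun command

	_ = r.WeightedScore
	_ = r.Decision.Status
}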
type Finding ¶
type Finding struct {
// ID is the unique identifier for this finding.
ID string `json:"id"`
// Category is the evaluation category this relates to.
Category string `json:"category"`
// Severity indicates the impact level.
Severity Severity `json:"severity"`
// Title is a brief summary of the finding.
Title string `json:"title"`
// Description provides detailed explanation.
Description string `json:"description"`
// Recommendation explains how to fix the issue.
Recommendation string `json:"recommendation"`
// Evidence provides specific examples or references.
Evidence string `json:"evidence,omitempty"`
// Owner suggests who should address this finding.
Owner string `json:"owner,omitempty"`
// Effort estimates the work required (low, medium, high).
Effort string `json:"effort,omitempty"`
}
Finding represents an issue discovered during evaluation.
func (*Finding) IsBlocking ¶
func (f *Finding) IsBlocking() bool
IsBlocking returns true if this finding blocks approval.
type FindingCounts ¶
type FindingCounts struct {
Critical int `json:"critical"`
High int `json:"high"`
Medium int `json:"medium"`
Low int `json:"low"`
Info int `json:"info"`
Total int `json:"total"`
}
FindingCounts tracks the number of findings by severity.
func CountFindings ¶
func CountFindings(findings []Finding) FindingCounts
CountFindings counts findings by severity.
func (FindingCounts) BlockingCount ¶
func (c FindingCounts) BlockingCount() int
BlockingCount returns the number of blocking findings.
func (FindingCounts) HasBlocking ¶
func (c FindingCounts) HasBlocking() bool
HasBlocking returns true if there are any blocking findings.
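Example (illustrative sketch, not shipped with the package). The findings and severity values below are placeholders:

func ExampleCountFindings() {
	findings := []Finding{
		{ID: "F-001", Severity: "critical"}, // placeholder severity values
		{ID: "F-002", Severity: "low"},
	}

	counts := CountFindings(findings)
	if counts.HasBlocking() {
		// BlockingCount reports how many findings block approval.
		_ = counts.BlockingCount()
	}
}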
type JudgeDisagreement ¶ added in v0.2.0
type JudgeDisagreement struct {
// Category is the evaluation dimension.
Category string `json:"category"`
// Scores are the individual judge scores.
Scores []JudgeScore `json:"scores"`
// Range is the difference between max and min scores.
Range float64 `json:"range"`
// StandardDeviation measures score spread.
StandardDeviation float64 `json:"standard_deviation"`
}
JudgeDisagreement captures where judges had significantly different scores.
type JudgeMetadata ¶ added in v0.2.0
type JudgeMetadata struct {
// JudgeID is a unique identifier for this judge configuration.
JudgeID string `json:"judge_id,omitempty"`
// Model is the LLM model used (e.g., "claude-3-opus-20240229", "gpt-4-turbo").
Model string `json:"model"`
// ModelProvider is the API provider (e.g., "anthropic", "openai", "bedrock").
ModelProvider string `json:"model_provider,omitempty"`
// ModelVersion is the specific model version if applicable.
ModelVersion string `json:"model_version,omitempty"`
// PromptTemplate is the name/ID of the prompt template used.
PromptTemplate string `json:"prompt_template,omitempty"`
// PromptVersion is the version of the prompt template.
PromptVersion string `json:"prompt_version,omitempty"`
// SystemPrompt is the system prompt used (or hash/reference if too long).
SystemPrompt string `json:"system_prompt,omitempty"`
// Temperature is the sampling temperature used.
Temperature float64 `json:"temperature,omitempty"`
// MaxTokens is the max tokens setting.
MaxTokens int `json:"max_tokens,omitempty"`
// RubricID references the rubric set used for scoring.
RubricID string `json:"rubric_id,omitempty"`
// RubricVersion is the version of the rubric used.
RubricVersion string `json:"rubric_version,omitempty"`
// EvaluatedAt is when this evaluation was performed.
EvaluatedAt time.Time `json:"evaluated_at,omitempty"`
// Latency is the evaluation duration.
Latency time.Duration `json:"latency,omitempty"`
// TokensUsed tracks token consumption.
TokensUsed *TokenUsage `json:"tokens_used,omitempty"`
// TraceID links to observability trace (e.g., for Opik/Phoenix/Langfuse).
TraceID string `json:"trace_id,omitempty"`
// SpanID links to observability span.
SpanID string `json:"span_id,omitempty"`
}
JudgeMetadata tracks information about the LLM judge that produced an evaluation. This enables reproducibility, debugging, and comparison of different judge configurations.
func NewJudgeMetadata ¶ added in v0.2.0
func NewJudgeMetadata(model string) *JudgeMetadata
NewJudgeMetadata creates judge metadata with required fields.
func (*JudgeMetadata) SetLatency ¶ added in v0.2.0
func (j *JudgeMetadata) SetLatency(d time.Duration)
SetLatency records the evaluation duration.
func (*JudgeMetadata) WithPrompt ¶ added in v0.2.0
func (j *JudgeMetadata) WithPrompt(template, version string) *JudgeMetadata
WithPrompt sets the prompt template info.
func (*JudgeMetadata) WithProvider ¶ added in v0.2.0
func (j *JudgeMetadata) WithProvider(provider string) *JudgeMetadata
WithProvider sets the model provider.
func (*JudgeMetadata) WithRubric ¶ added in v0.2.0
func (j *JudgeMetadata) WithRubric(id, version string) *JudgeMetadata
WithRubric sets the rubric reference.
func (*JudgeMetadata) WithTemperature ¶ added in v0.2.0
func (j *JudgeMetadata) WithTemperature(temp float64) *JudgeMetadata
WithTemperature sets the sampling temperature.
func (*JudgeMetadata) WithTokenUsage ¶ added in v0.2.0
func (j *JudgeMetadata) WithTokenUsage(input, output int) *JudgeMetadata
WithTokenUsage sets the token usage.
func (*JudgeMetadata) WithTrace ¶ added in v0.2.0
func (j *JudgeMetadata) WithTrace(traceID, spanID string) *JudgeMetadata
WithTrace links to observability.
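Example (illustrative sketch, not shipped with the package). It chains the builder-style With* methods documented above; the model, provider, versions, token counts, and trace IDs are placeholders, and the time package is assumed to be imported:

func ExampleNewJudgeMetadata() {
	j := NewJudgeMetadata("gpt-4-turbo").
		WithProvider("openai").
		WithTemperature(0.0).
		WithPrompt("prd-judge", "v3").
		WithRubric("prd-rubric", "1.2.0").
		WithTokenUsage(1850, 640).
		WithTrace("trace-123", "span-456")

	// SetLatency records how long the evaluation took.
	j.SetLatency(2300 * time.Millisecond)
}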
type JudgeScore ¶ added in v0.2.0
type JudgeScore struct {
// JudgeID identifies the judge.
JudgeID string `json:"judge_id"`
// Score is the judge's score.
Score float64 `json:"score"`
}
JudgeScore is a score from a specific judge.
type MultiJudgeResult ¶ added in v0.2.0
type MultiJudgeResult struct {
// Evaluations are the individual judge evaluations.
Evaluations []*EvaluationReport `json:"evaluations"`
// Judges contains metadata for each judge.
Judges []*JudgeMetadata `json:"judges"`
// AggregatedScore is the combined score (mean, median, or weighted).
AggregatedScore float64 `json:"aggregated_score"`
// AggregationMethod describes how scores were combined.
AggregationMethod AggregationMethod `json:"aggregation_method"`
// Agreement measures inter-judge agreement (0-1, higher = more agreement).
Agreement float64 `json:"agreement"`
// Disagreements lists categories where judges significantly disagreed.
Disagreements []JudgeDisagreement `json:"disagreements,omitempty"`
// ConsolidatedDecision is the final decision after aggregation.
ConsolidatedDecision Decision `json:"consolidated_decision"`
// ConsolidatedFindings merges findings from all judges.
ConsolidatedFindings []Finding `json:"consolidated_findings"`
}
MultiJudgeResult aggregates evaluations from multiple judges. This improves reliability by combining perspectives and detecting disagreement.
func AggregateEvaluations ¶ added in v0.2.0
func AggregateEvaluations(evaluations []*EvaluationReport, method AggregationMethod) *MultiJudgeResult
AggregateEvaluations combines multiple evaluation reports.
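Example (illustrative sketch, not shipped with the package). It assumes the individual reports have been scored and finalized by their own judges:

func ExampleAggregateEvaluations() {
	reportA := NewEvaluationReport("prd", "docs/checkout-prd.md")
	reportB := NewEvaluationReport("prd", "docs/checkout-prd.md")
	// ... each report would be scored and finalized by a separate judge ...

	result := AggregateEvaluations([]*EvaluationReport{reportA, reportB}, AggregationMedian)

	// Agreement close to 1 means the judges largely agreed; Disagreements lists
	// the categories where their scores diverged.
	_ = result.Agreement
	_ = result.Disagreements
	_ = result.ConsolidatedDecision
}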
type NextSteps ¶
type NextSteps struct {
// RerunCommand is the command to re-run evaluation.
RerunCommand string `json:"rerun_command"`
// Immediate are blocking actions that must be completed.
Immediate []ActionItem `json:"immediate,omitempty"`
// Recommended are suggested improvements.
Recommended []ActionItem `json:"recommended,omitempty"`
}
NextSteps provides actionable workflow guidance.
type PairwiseCategoryScore ¶ added in v0.2.0
type PairwiseCategoryScore struct {
// Category is the evaluation dimension.
Category string `json:"category"`
// Winner indicates which output won for this category.
Winner PairwiseWinner `json:"winner"`
// Margin indicates how much better the winner is (0-1, higher = larger gap).
Margin float64 `json:"margin,omitempty"`
// Reasoning explains the category-level comparison.
Reasoning string `json:"reasoning,omitempty"`
}
PairwiseCategoryScore compares outputs on a specific dimension.
type PairwiseComparison ¶ added in v0.2.0
type PairwiseComparison struct {
// ID is the unique identifier for this comparison.
ID string `json:"id,omitempty"`
// Input is the shared input/prompt for both outputs.
Input string `json:"input"`
// OutputA is the first output being compared.
OutputA string `json:"output_a"`
// OutputB is the second output being compared.
OutputB string `json:"output_b"`
// Winner indicates which output won ("A", "B", or "tie").
Winner PairwiseWinner `json:"winner"`
// Confidence is the judge's confidence in the decision (0-1).
Confidence float64 `json:"confidence,omitempty"`
// Reasoning explains why this winner was chosen.
Reasoning string `json:"reasoning"`
// CategoryScores provides per-category comparisons if applicable.
CategoryScores []PairwiseCategoryScore `json:"category_scores,omitempty"`
// Judge contains metadata about the LLM judge.
Judge *JudgeMetadata `json:"judge,omitempty"`
// Metadata contains additional comparison context.
Metadata map[string]any `json:"metadata,omitempty"`
// CreatedAt is when this comparison was made.
CreatedAt time.Time `json:"created_at,omitempty"`
}
PairwiseComparison represents a comparison between two outputs. This is an alternative to absolute scoring that can reduce position bias and improve reliability of LLM-as-Judge evaluations.
func NewPairwiseComparison ¶ added in v0.2.0
func NewPairwiseComparison(input, outputA, outputB string) *PairwiseComparison
NewPairwiseComparison creates a new pairwise comparison.
func (*PairwiseComparison) AddCategoryScore ¶ added in v0.2.0
func (p *PairwiseComparison) AddCategoryScore(category string, winner PairwiseWinner, reasoning string, margin float64)
AddCategoryScore adds a per-category comparison.
func (*PairwiseComparison) SetWinner ¶ added in v0.2.0
func (p *PairwiseComparison) SetWinner(winner PairwiseWinner, reasoning string, confidence float64)
SetWinner sets the comparison result.
func (*PairwiseComparison) SwappedComparison ¶ added in v0.2.0
func (p *PairwiseComparison) SwappedComparison() *PairwiseComparison
SwappedComparison creates a comparison with A and B swapped. Running both orders helps detect position bias in the judge.
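Example (illustrative sketch, not shipped with the package). A single pairwise judgment, including the swapped-order run used to check for position bias; the input, outputs, reasoning, and confidence values are placeholders:

func ExamplePairwiseComparison() {
	cmp := NewPairwiseComparison(
		"Summarize the incident report",         // shared input
		"Summary A: concise, covers root cause", // output A
		"Summary B: verbose, misses root cause", // output B
	)
	cmp.AddCategoryScore("faithfulness", WinnerA, "A reflects the root cause correctly", 0.4)
	cmp.SetWinner(WinnerA, "A is more accurate and concise", 0.85)

	// Re-run the judge on the swapped comparison; if the swapped run prefers the
	// other position, the judge may be position-biased.
	swapped := cmp.SwappedComparison()
	_ = swapped
}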
type PairwiseResult ¶ added in v0.2.0
type PairwiseResult struct {
// Comparisons are all the individual comparisons.
Comparisons []PairwiseComparison `json:"comparisons"`
// WinRateA is the percentage of comparisons won by A.
WinRateA float64 `json:"win_rate_a"`
// WinRateB is the percentage of comparisons won by B.
WinRateB float64 `json:"win_rate_b"`
// TieRate is the percentage of ties.
TieRate float64 `json:"tie_rate"`
// OverallWinner is the aggregated winner.
OverallWinner PairwiseWinner `json:"overall_winner"`
// Confidence is the overall confidence in the result.
Confidence float64 `json:"confidence"`
}
PairwiseResult aggregates multiple pairwise comparisons.
func ComputePairwiseResult ¶ added in v0.2.0
func ComputePairwiseResult(comparisons []PairwiseComparison) *PairwiseResult
ComputePairwiseResult aggregates multiple comparisons into a result.
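Example (illustrative sketch, not shipped with the package). It assumes the comparisons have already been judged, for instance by running both orders via SwappedComparison:

func ExampleComputePairwiseResult() {
	comparisons := []PairwiseComparison{
		{Winner: WinnerA, Confidence: 0.9},
		{Winner: WinnerA, Confidence: 0.7},
		{Winner: WinnerTie, Confidence: 0.6},
	}

	result := ComputePairwiseResult(comparisons)
	_ = result.OverallWinner // aggregated winner across all comparisons
	_ = result.WinRateA      // share of comparisons won by A
}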
type PairwiseWinner ¶ added in v0.2.0
type PairwiseWinner string
PairwiseWinner indicates the winner of a pairwise comparison.
const (
	// WinnerA indicates output A is better.
	WinnerA PairwiseWinner = "A"
	// WinnerB indicates output B is better.
	WinnerB PairwiseWinner = "B"
	// WinnerTie indicates both outputs are roughly equal.
	WinnerTie PairwiseWinner = "tie"
	// WinnerUncertain indicates the judge couldn't determine a winner.
	WinnerUncertain PairwiseWinner = "uncertain"
)
type PassCriteria ¶
type PassCriteria struct {
// MaxCritical is the maximum allowed critical findings (default 0).
MaxCritical int `json:"max_critical"`
// MaxHigh is the maximum allowed high severity findings (default 0).
MaxHigh int `json:"max_high"`
// MaxMedium is the maximum allowed medium findings (-1 = unlimited).
MaxMedium int `json:"max_medium,omitempty"`
// MinScore is the minimum weighted score required.
MinScore float64 `json:"min_score"`
}
PassCriteria defines the requirements for approval.
func DefaultPassCriteria ¶
func DefaultPassCriteria() PassCriteria
DefaultPassCriteria returns standard pass criteria. Zero Critical/High, minimum score 7.0.
func StrictPassCriteria ¶
func StrictPassCriteria() PassCriteria
StrictPassCriteria returns strict pass criteria. Zero Critical/High, max 3 Medium, minimum score 8.0.
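Example (illustrative sketch, not shipped with the package). It starts from the standard criteria and adjusts only the documented fields; the thresholds are placeholders:

func ExamplePassCriteria() {
	r := NewEvaluationReport("arb", "docs/payments-arb.md") // placeholder document

	criteria := DefaultPassCriteria()
	criteria.MinScore = 8.5
	criteria.MaxMedium = 5

	r.PassCriteria = criteria
	// StrictPassCriteria() could be used instead for the stricter defaults.
}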
type ReferenceData ¶ added in v0.2.0
type ReferenceData struct {
// ID is the unique identifier for this reference.
ID string `json:"id,omitempty"`
// Input is the input/prompt that produced the reference output.
Input string `json:"input,omitempty"`
// ExpectedOutput is the gold/reference output.
ExpectedOutput string `json:"expected_output,omitempty"`
// ExpectedOutputs allows multiple acceptable outputs.
ExpectedOutputs []string `json:"expected_outputs,omitempty"`
// Context provides additional context (e.g., retrieved documents for RAG).
Context []string `json:"context,omitempty"`
// Annotations are human-provided labels or scores.
Annotations []Annotation `json:"annotations,omitempty"`
// Source indicates where this reference came from.
Source string `json:"source,omitempty"`
// Tags categorize or filter references.
Tags []string `json:"tags,omitempty"`
// Metadata contains additional reference data.
Metadata map[string]any `json:"metadata,omitempty"`
}
ReferenceData contains ground truth or expected data for evaluation. This enables reference-based evaluation where outputs are compared against known-good examples.
func NewReferenceData ¶ added in v0.2.0
func NewReferenceData(input, expectedOutput string) *ReferenceData
NewReferenceData creates a new reference data item.
func (*ReferenceData) WithAnnotation ¶ added in v0.2.0
func (r *ReferenceData) WithAnnotation(name string, score float64, annotatorID string) *ReferenceData
WithAnnotation adds a human annotation.
func (*ReferenceData) WithContext ¶ added in v0.2.0
func (r *ReferenceData) WithContext(ctx ...string) *ReferenceData
WithContext adds context documents.
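Example (illustrative sketch, not shipped with the package). It attaches reference (gold) data to a report for reference-based evaluation; the input, expected output, context, and annotator ID are placeholders:

func ExampleNewReferenceData() {
	ref := NewReferenceData(
		"What is our refund policy?",                        // input
		"Refunds are available within 30 days of purchase.", // expected output
	).
		WithContext("Policy doc section 4.2: refunds within 30 days"). // e.g., retrieved RAG context
		WithAnnotation("quality", 9.0, "reviewer-42")

	r := NewEvaluationReport("rag", "transcripts/refund-question.json") // placeholder document
	r.SetReference(ref)
}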
type ReferenceDataset ¶ added in v0.2.0
type ReferenceDataset struct {
// ID is the unique identifier for this dataset.
ID string `json:"id"`
// Name is the display name.
Name string `json:"name"`
// Description explains what this dataset contains.
Description string `json:"description,omitempty"`
// Version tracks dataset iterations.
Version string `json:"version,omitempty"`
// Items are the reference data items.
Items []ReferenceData `json:"items"`
// Tags categorize the dataset.
Tags []string `json:"tags,omitempty"`
// Metadata contains additional dataset info.
Metadata map[string]any `json:"metadata,omitempty"`
}
ReferenceDataset is a collection of reference data items.
func NewReferenceDataset ¶ added in v0.2.0
func NewReferenceDataset(id, name string) *ReferenceDataset
NewReferenceDataset creates a new reference dataset.
func (*ReferenceDataset) AddItem ¶ added in v0.2.0
func (d *ReferenceDataset) AddItem(item ReferenceData)
AddItem adds a reference data item to the dataset.
func (*ReferenceDataset) GetByID ¶ added in v0.2.0
func (d *ReferenceDataset) GetByID(id string) *ReferenceData
GetByID retrieves a reference item by ID.
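Example (illustrative sketch, not shipped with the package). It groups reference items into a dataset and looks one up by ID; the IDs and names are placeholders:

func ExampleNewReferenceDataset() {
	ds := NewReferenceDataset("refund-qa-v1", "Refund Q&A references")

	item := NewReferenceData("What is our refund policy?", "Refunds within 30 days")
	item.ID = "ref-001"
	ds.AddItem(*item)

	if found := ds.GetByID("ref-001"); found != nil {
		_ = found.ExpectedOutput
	}
}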
type ReportMetadata ¶
type ReportMetadata struct {
// Document is the filename or path being evaluated.
Document string `json:"document"`
// DocumentID is the document identifier (e.g., PRD ID).
DocumentID string `json:"document_id,omitempty"`
// DocumentTitle is the document title.
DocumentTitle string `json:"document_title,omitempty"`
// DocumentVersion is the document version.
DocumentVersion string `json:"document_version,omitempty"`
// GeneratedAt is when the report was created.
GeneratedAt time.Time `json:"generated_at"`
// GeneratedBy identifies what created this report.
GeneratedBy string `json:"generated_by,omitempty"`
// ReviewerID identifies the reviewer (agent or human).
ReviewerID string `json:"reviewer_id,omitempty"`
}
ReportMetadata contains report identification.
type Rubric ¶ added in v0.2.0
type Rubric struct {
// Category is the evaluation dimension this rubric applies to.
Category string `json:"category"`
// Description explains what this category measures.
Description string `json:"description"`
// Anchors define what each score level means.
// Each anchor covers either a specific score (e.g., 10, 7, 5, 3, 1) or a range (e.g., 8-10).
Anchors []ScoreAnchor `json:"anchors"`
// Examples provide sample inputs/outputs for each score level.
Examples []RubricExample `json:"examples,omitempty"`
}
Rubric defines the scoring criteria for an evaluation category. It provides explicit anchors for what each score level means, improving consistency and reproducibility of LLM-as-Judge evaluations.
func (*Rubric) AddAnchor ¶ added in v0.2.0
func (r *Rubric) AddAnchor(score float64, label, description string, criteria ...string) *Rubric
AddAnchor adds a score anchor to the rubric.
func (*Rubric) AddExample ¶ added in v0.2.0
func (r *Rubric) AddExample(score float64, output, explanation string) *Rubric
AddExample adds an example to the rubric.
func (*Rubric) AddRangeAnchor ¶ added in v0.2.0
func (r *Rubric) AddRangeAnchor(minScore, maxScore float64, label, description string, criteria ...string) *Rubric
AddRangeAnchor adds a range-based score anchor.
func (*Rubric) GetAnchorForScore ¶ added in v0.2.0
func (r *Rubric) GetAnchorForScore(score float64) *ScoreAnchor
GetAnchorForScore returns the anchor that applies to the given score.
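Example (illustrative sketch, not shipped with the package). It defines a rubric with range anchors and an example, then resolves the anchor for a given score; the labels, descriptions, and criteria are placeholders:

func ExampleRubric() {
	r := &Rubric{
		Category:    "clarity",
		Description: "How clearly the document states goals and requirements",
	}
	r.AddRangeAnchor(8, 10, "Excellent", "Goals are specific, measurable, and unambiguous",
		"Every requirement is testable",
		"No undefined terms").
		AddRangeAnchor(5, 7.9, "Adequate", "Mostly clear with some vague requirements").
		AddRangeAnchor(0, 4.9, "Poor", "Goals are vague or contradictory").
		AddExample(9, "Checkout must complete in under 2 seconds at p95", "Specific and measurable")

	if anchor := r.GetAnchorForScore(8.5); anchor != nil {
		_ = anchor.Label // "Excellent" under the anchors defined above
	}
}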
type RubricExample ¶ added in v0.2.0
type RubricExample struct {
// Score is the score this example demonstrates.
Score float64 `json:"score"`
// Input is the example input/prompt.
Input string `json:"input,omitempty"`
// Output is the example output being scored.
Output string `json:"output"`
// Explanation describes why this output receives this score.
Explanation string `json:"explanation"`
}
RubricExample provides a concrete example for a score level.
type RubricSet ¶ added in v0.2.0
type RubricSet struct {
// ID is the unique identifier for this rubric set.
ID string `json:"id"`
// Name is the display name.
Name string `json:"name"`
// Version tracks rubric iterations.
Version string `json:"version"`
// Description explains what this rubric set evaluates.
Description string `json:"description,omitempty"`
// Rubrics are the category-specific rubrics.
Rubrics []Rubric `json:"rubrics"`
}
RubricSet is a collection of rubrics for a complete evaluation.
func DefaultPRDRubricSet ¶ added in v0.2.0
func DefaultPRDRubricSet() *RubricSet
DefaultPRDRubricSet returns a standard rubric set for PRD evaluation.
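Example (illustrative sketch, not shipped with the package). It wires the default PRD rubric set into a report and its judge metadata; the model name and document path are placeholders:

func ExampleDefaultPRDRubricSet() {
	rs := DefaultPRDRubricSet()

	judge := NewJudgeMetadata("claude-3-opus-20240229").WithRubric(rs.ID, rs.Version)

	r := NewEvaluationReport("prd", "docs/checkout-prd.md") // placeholder document
	r.SetJudge(judge)
	r.SetRubric(rs.ID)
}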
type ScoreAnchor ¶ added in v0.2.0
type ScoreAnchor struct {
// Score is the numeric score this anchor represents.
// Use -1 for range-based anchors where MinScore/MaxScore are set.
Score float64 `json:"score,omitempty"`
// MinScore is the minimum score for range-based anchors.
MinScore float64 `json:"min_score,omitempty"`
// MaxScore is the maximum score for range-based anchors.
MaxScore float64 `json:"max_score,omitempty"`
// Label is a short name for this level (e.g., "Excellent", "Good", "Poor").
Label string `json:"label"`
// Description explains what qualifies for this score.
Description string `json:"description"`
// Criteria are specific requirements for this score level.
Criteria []string `json:"criteria,omitempty"`
}
ScoreAnchor defines the criteria for a specific score or score range.
type ScoreStatus ¶
type ScoreStatus string
ScoreStatus represents the pass/warn/fail status for a category score.
const (
	ScoreStatusPass          ScoreStatus = "pass"              // Score >= 7.0
	ScoreStatusWarn          ScoreStatus = "warn"              // Score >= 5.0 && < 7.0
	ScoreStatusFail          ScoreStatus = "fail"              // Score < 5.0
	CategoryPending          ScoreStatus = "pending"           // Not yet evaluated
	CategoryNeedsImprovement ScoreStatus = "needs_improvement" // Requires attention
)
func (ScoreStatus) Icon ¶
func (s ScoreStatus) Icon() string
Icon returns the emoji icon for the score status.
type Severity ¶
type Severity string
Severity represents the severity level of a finding. Based on InfoSec severity classifications.
func AllSeverities ¶
func AllSeverities() []Severity
AllSeverities returns all severity levels in order of severity.
func (Severity) IsBlocking ¶
func (s Severity) IsBlocking() bool
IsBlocking returns true if this severity blocks approval.
type TokenUsage ¶ added in v0.2.0
type TokenUsage struct {
// InputTokens is the number of input/prompt tokens.
InputTokens int `json:"input_tokens"`
// OutputTokens is the number of output/completion tokens.
OutputTokens int `json:"output_tokens"`
// TotalTokens is the total tokens used.
TotalTokens int `json:"total_tokens"`
// CacheReadTokens is tokens read from cache (if applicable).
CacheReadTokens int `json:"cache_read_tokens,omitempty"`
// CacheWriteTokens is tokens written to cache (if applicable).
CacheWriteTokens int `json:"cache_write_tokens,omitempty"`
}
TokenUsage tracks token consumption for an evaluation.