evaluation

package
v0.2.0
Published: Jan 26, 2026 License: MIT Imports: 4 Imported by: 0

Documentation

Overview

Package evaluation provides types for detailed evaluation reports with severity-based findings and recommendations. It is suited to LLM-as-Judge style reviews such as PRD and ARB evaluations.
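
A minimal end-to-end sketch using only the constructors and methods documented below (the document path, category names, finding details, and rerun command are hypothetical; the module import path is not shown on this page, so imports are elided):

report := evaluation.NewEvaluationReport("prd", "docs/checkout-prd.md")

report.AddCategory(evaluation.NewCategoryScore(
	"clarity", 0.4, 8.5, "Goals and scope are unambiguous."))
report.AddCategory(evaluation.NewCategoryScore(
	"completeness", 0.6, 6.0, "No rollback plan is described."))

report.AddFinding(evaluation.Finding{
	ID:             "PRD-001",
	Category:       "completeness",
	Severity:       evaluation.SeverityHigh,
	Title:          "Missing rollback plan",
	Description:    "The PRD does not explain how the launch would be rolled back.",
	Recommendation: "Add a rollback section covering data migration.",
})

// Finalize computes all derived fields (weighted score, decision, next steps, summary).
report.Finalize("make evaluate-prd")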

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type ActionItem

type ActionItem struct {
	// Action describes what needs to be done.
	Action string `json:"action"`

	// Category is the related evaluation category.
	Category string `json:"category,omitempty"`

	// Severity is the related finding severity.
	Severity Severity `json:"severity,omitempty"`

	// Owner suggests who should do this.
	Owner string `json:"owner,omitempty"`

	// Effort estimates work required.
	Effort string `json:"effort,omitempty"`
}

ActionItem is a specific action to take.

type AggregationMethod added in v0.2.0

type AggregationMethod string

AggregationMethod specifies how to combine multiple judge scores.

const (
	// AggregationMean uses the arithmetic mean of scores.
	AggregationMean AggregationMethod = "mean"

	// AggregationMedian uses the median score.
	AggregationMedian AggregationMethod = "median"

	// AggregationWeighted uses weighted average based on judge confidence.
	AggregationWeighted AggregationMethod = "weighted"

	// AggregationMajority uses majority vote for pass/fail.
	AggregationMajority AggregationMethod = "majority"

	// AggregationConservative uses the lowest/most critical score.
	AggregationConservative AggregationMethod = "conservative"
)

type Annotation added in v0.2.0

type Annotation struct {
	// Name is the annotation type (e.g., "quality", "relevance").
	Name string `json:"name"`

	// Score is a numeric score (if applicable).
	Score float64 `json:"score,omitempty"`

	// Label is a categorical label (if applicable).
	Label string `json:"label,omitempty"`

	// Explanation provides reasoning for the annotation.
	Explanation string `json:"explanation,omitempty"`

	// AnnotatorID identifies who provided this annotation.
	AnnotatorID string `json:"annotator_id,omitempty"`

	// AnnotatorType indicates human vs automated (e.g., "human", "llm", "rule").
	AnnotatorType string `json:"annotator_type,omitempty"`
}

Annotation represents a human-provided label or score.

type CategoryScore

type CategoryScore struct {
	// Category is the name/ID of the category.
	Category string `json:"category"`

	// Weight is the category weight (0.0-1.0, should sum to 1.0).
	Weight float64 `json:"weight"`

	// Score is the category score (0.0-10.0).
	Score float64 `json:"score"`

	// MaxScore is the maximum possible score (default 10.0).
	MaxScore float64 `json:"max_score"`

	// Status is the derived status (pass/warn/fail).
	Status ScoreStatus `json:"status"`

	// Justification explains why this score was given.
	Justification string `json:"justification"`

	// Evidence provides specific supporting evidence.
	Evidence string `json:"evidence,omitempty"`

	// Findings are issues found in this category.
	Findings []Finding `json:"findings,omitempty"`
}

CategoryScore represents a score for a single evaluation category.

func NewCategoryScore

func NewCategoryScore(category string, weight, score float64, justification string) CategoryScore

NewCategoryScore creates a category score with computed status.

func (*CategoryScore) ComputeStatus

func (c *CategoryScore) ComputeStatus() ScoreStatus

ComputeStatus calculates the status from the score.

func (*CategoryScore) ComputeWeightedScore

func (c *CategoryScore) ComputeWeightedScore() float64

ComputeWeightedScore calculates the weighted contribution of this category.
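
A small sketch of scoring a single category (the category name, weight, score, and justification are placeholders; fmt import elided):

cs := evaluation.NewCategoryScore("testability", 0.25, 6.5,
	"Acceptance criteria exist but only half are measurable.")

// Status (pass/warn/fail) is derived from the score; the weighted value is
// this category's contribution to the report's overall score.
fmt.Println(cs.ComputeStatus(), cs.ComputeWeightedScore())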

type Decision

type Decision struct {
	// Status is the decision outcome.
	Status DecisionStatus `json:"status"`

	// Passed indicates if the evaluation passed.
	Passed bool `json:"passed"`

	// Rationale explains the decision.
	Rationale string `json:"rationale"`

	// FindingCounts summarizes findings by severity.
	FindingCounts FindingCounts `json:"finding_counts"`

	// WeightedScore is the final weighted score.
	WeightedScore float64 `json:"weighted_score"`
}

Decision represents the evaluation decision.

func Evaluate

func Evaluate(findings []Finding, weightedScore float64, criteria PassCriteria) Decision

Evaluate checks findings and score against criteria.
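
A sketch of calling Evaluate directly, outside a full report (the finding and the weighted score are placeholders):

findings := []evaluation.Finding{{
	ID:       "SEC-003",
	Category: "security",
	Severity: evaluation.SeverityMedium,
	Title:    "Verbose error messages",
}}

decision := evaluation.Evaluate(findings, 7.8, evaluation.DefaultPassCriteria())
fmt.Println(decision.Status, decision.Passed, decision.Rationale)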

type DecisionStatus

type DecisionStatus string

DecisionStatus represents the decision outcome.

const (
	DecisionPass        DecisionStatus = "pass"         // Meets all criteria
	DecisionConditional DecisionStatus = "conditional"  // Meets score but has findings
	DecisionFail        DecisionStatus = "fail"         // Has blocking findings
	DecisionHumanReview DecisionStatus = "human_review" // Requires human judgment
)

type EvaluationReport

type EvaluationReport struct {
	// Schema is the JSON Schema URL.
	Schema string `json:"$schema,omitempty"`

	// Metadata contains report identification and audit info.
	Metadata ReportMetadata `json:"metadata"`

	// ReviewType identifies the type of review (prd, arb, security, etc.).
	ReviewType string `json:"review_type"`

	// Judge contains metadata about the LLM judge (v0.2.0).
	Judge *JudgeMetadata `json:"judge,omitempty"`

	// RubricID references the rubric used for scoring (v0.2.0).
	RubricID string `json:"rubric_id,omitempty"`

	// Reference contains gold/expected data for comparison (v0.2.0).
	Reference *ReferenceData `json:"reference,omitempty"`

	// Categories contains scores for each evaluation dimension.
	Categories []CategoryScore `json:"categories"`

	// Findings are all issues discovered during evaluation.
	Findings []Finding `json:"findings"`

	// WeightedScore is the overall weighted score.
	WeightedScore float64 `json:"weighted_score"`

	// PassCriteria defines the requirements for approval.
	PassCriteria PassCriteria `json:"pass_criteria"`

	// Decision is the evaluation outcome.
	Decision Decision `json:"decision"`

	// NextSteps provides actionable guidance.
	NextSteps NextSteps `json:"next_steps"`

	// Summary is the overall assessment.
	Summary string `json:"summary"`
}

EvaluationReport is the detailed evaluation report for LLM-as-Judge reviews.

func NewEvaluationReport

func NewEvaluationReport(reviewType, document string) *EvaluationReport

NewEvaluationReport creates a new evaluation report.

func (*EvaluationReport) AddCategory

func (r *EvaluationReport) AddCategory(cs CategoryScore)

AddCategory adds a category score.

func (*EvaluationReport) AddFinding

func (r *EvaluationReport) AddFinding(f Finding)

AddFinding adds a finding.

func (*EvaluationReport) ComputeWeightedScore

func (r *EvaluationReport) ComputeWeightedScore() float64

ComputeWeightedScore calculates the overall weighted score.

func (*EvaluationReport) Evaluate

func (r *EvaluationReport) Evaluate() Decision

Evaluate computes the decision based on findings and score.

func (*EvaluationReport) Finalize

func (r *EvaluationReport) Finalize(rerunCommand string)

Finalize computes all derived fields.

func (*EvaluationReport) GenerateNextSteps

func (r *EvaluationReport) GenerateNextSteps(rerunCommand string)

GenerateNextSteps creates actionable next steps.

func (*EvaluationReport) GenerateSummary

func (r *EvaluationReport) GenerateSummary() string

GenerateSummary creates the summary text.

func (*EvaluationReport) SetJudge added in v0.2.0

func (r *EvaluationReport) SetJudge(judge *JudgeMetadata)

SetJudge sets the judge metadata.

func (*EvaluationReport) SetReference added in v0.2.0

func (r *EvaluationReport) SetReference(ref *ReferenceData)

SetReference sets the reference data for comparison.

func (*EvaluationReport) SetRubric added in v0.2.0

func (r *EvaluationReport) SetRubric(rubricID string)

SetRubric sets the rubric ID.

type Finding

type Finding struct {
	// ID is the unique identifier for this finding.
	ID string `json:"id"`

	// Category is the evaluation category this relates to.
	Category string `json:"category"`

	// Severity indicates the impact level.
	Severity Severity `json:"severity"`

	// Title is a brief summary of the finding.
	Title string `json:"title"`

	// Description provides detailed explanation.
	Description string `json:"description"`

	// Recommendation explains how to fix the issue.
	Recommendation string `json:"recommendation"`

	// Evidence provides specific examples or references.
	Evidence string `json:"evidence,omitempty"`

	// Owner suggests who should address this finding.
	Owner string `json:"owner,omitempty"`

	// Effort estimates the work required (low, medium, high).
	Effort string `json:"effort,omitempty"`
}

Finding represents an issue discovered during evaluation.

func (*Finding) IsBlocking

func (f *Finding) IsBlocking() bool

IsBlocking returns true if this finding blocks approval.

type FindingCounts

type FindingCounts struct {
	Critical int `json:"critical"`
	High     int `json:"high"`
	Medium   int `json:"medium"`
	Low      int `json:"low"`
	Info     int `json:"info"`
	Total    int `json:"total"`
}

FindingCounts tracks the number of findings by severity.

func CountFindings

func CountFindings(findings []Finding) FindingCounts

CountFindings counts findings by severity.

func (FindingCounts) BlockingCount

func (c FindingCounts) BlockingCount() int

BlockingCount returns the number of blocking findings.

func (FindingCounts) HasBlocking

func (c FindingCounts) HasBlocking() bool

HasBlocking returns true if there are any blocking findings.
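
A short sketch of summarizing findings by severity (findings is any []Finding, e.g. report.Findings):

counts := evaluation.CountFindings(findings)
if counts.HasBlocking() {
	fmt.Printf("%d blocking finding(s) out of %d total\n",
		counts.BlockingCount(), counts.Total)
}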

type JudgeDisagreement added in v0.2.0

type JudgeDisagreement struct {
	// Category is the evaluation dimension.
	Category string `json:"category"`

	// Scores are the individual judge scores.
	Scores []JudgeScore `json:"scores"`

	// Range is the difference between max and min scores.
	Range float64 `json:"range"`

	// StandardDeviation measures score spread.
	StandardDeviation float64 `json:"standard_deviation"`
}

JudgeDisagreement captures where judges had significantly different scores.

type JudgeMetadata added in v0.2.0

type JudgeMetadata struct {
	// JudgeID is a unique identifier for this judge configuration.
	JudgeID string `json:"judge_id,omitempty"`

	// Model is the LLM model used (e.g., "claude-3-opus-20240229", "gpt-4-turbo").
	Model string `json:"model"`

	// ModelProvider is the API provider (e.g., "anthropic", "openai", "bedrock").
	ModelProvider string `json:"model_provider,omitempty"`

	// ModelVersion is the specific model version if applicable.
	ModelVersion string `json:"model_version,omitempty"`

	// PromptTemplate is the name/ID of the prompt template used.
	PromptTemplate string `json:"prompt_template,omitempty"`

	// PromptVersion is the version of the prompt template.
	PromptVersion string `json:"prompt_version,omitempty"`

	// SystemPrompt is the system prompt used (or hash/reference if too long).
	SystemPrompt string `json:"system_prompt,omitempty"`

	// Temperature is the sampling temperature used.
	Temperature float64 `json:"temperature,omitempty"`

	// MaxTokens is the max tokens setting.
	MaxTokens int `json:"max_tokens,omitempty"`

	// RubricID references the rubric set used for scoring.
	RubricID string `json:"rubric_id,omitempty"`

	// RubricVersion is the version of the rubric used.
	RubricVersion string `json:"rubric_version,omitempty"`

	// EvaluatedAt is when this evaluation was performed.
	EvaluatedAt time.Time `json:"evaluated_at,omitempty"`

	// Latency is the evaluation duration.
	Latency time.Duration `json:"latency,omitempty"`

	// TokensUsed tracks token consumption.
	TokensUsed *TokenUsage `json:"tokens_used,omitempty"`

	// TraceID links to observability trace (e.g., for Opik/Phoenix/Langfuse).
	TraceID string `json:"trace_id,omitempty"`

	// SpanID links to observability span.
	SpanID string `json:"span_id,omitempty"`
}

JudgeMetadata tracks information about the LLM judge that produced an evaluation. This enables reproducibility, debugging, and comparison of different judge configurations.

func NewJudgeMetadata added in v0.2.0

func NewJudgeMetadata(model string) *JudgeMetadata

NewJudgeMetadata creates judge metadata with required fields.

func (*JudgeMetadata) SetLatency added in v0.2.0

func (j *JudgeMetadata) SetLatency(d time.Duration)

SetLatency records the evaluation duration.

func (*JudgeMetadata) WithPrompt added in v0.2.0

func (j *JudgeMetadata) WithPrompt(template, version string) *JudgeMetadata

WithPrompt sets the prompt template info.

func (*JudgeMetadata) WithProvider added in v0.2.0

func (j *JudgeMetadata) WithProvider(provider string) *JudgeMetadata

WithProvider sets the model provider.

func (*JudgeMetadata) WithRubric added in v0.2.0

func (j *JudgeMetadata) WithRubric(id, version string) *JudgeMetadata

WithRubric sets the rubric reference.

func (*JudgeMetadata) WithTemperature added in v0.2.0

func (j *JudgeMetadata) WithTemperature(temp float64) *JudgeMetadata

WithTemperature sets the sampling temperature.

func (*JudgeMetadata) WithTokenUsage added in v0.2.0

func (j *JudgeMetadata) WithTokenUsage(input, output int) *JudgeMetadata

WithTokenUsage sets the token usage.

func (*JudgeMetadata) WithTrace added in v0.2.0

func (j *JudgeMetadata) WithTrace(traceID, spanID string) *JudgeMetadata

WithTrace links to observability.
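
A sketch of recording judge metadata with the builder methods (the template names, versions, token counts, trace IDs, and latency are illustrative placeholders):

judge := evaluation.NewJudgeMetadata("claude-3-opus-20240229").
	WithProvider("anthropic").
	WithPrompt("prd-review", "1.2.0").
	WithTemperature(0.0).
	WithRubric("prd-default", "1.0.0").
	WithTokenUsage(4231, 912).
	WithTrace("trace-abc123", "span-def456")

judge.SetLatency(2300 * time.Millisecond) // SetLatency does not chain

report.SetJudge(judge)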

type JudgeScore added in v0.2.0

type JudgeScore struct {
	// JudgeID identifies the judge.
	JudgeID string `json:"judge_id"`

	// Score is the judge's score.
	Score float64 `json:"score"`
}

JudgeScore is a score from a specific judge.

type MultiJudgeResult added in v0.2.0

type MultiJudgeResult struct {
	// Evaluations are the individual judge evaluations.
	Evaluations []*EvaluationReport `json:"evaluations"`

	// Judges contains metadata for each judge.
	Judges []*JudgeMetadata `json:"judges"`

	// AggregatedScore is the combined score (mean, median, or weighted).
	AggregatedScore float64 `json:"aggregated_score"`

	// AggregationMethod describes how scores were combined.
	AggregationMethod AggregationMethod `json:"aggregation_method"`

	// Agreement measures inter-judge agreement (0-1, higher = more agreement).
	Agreement float64 `json:"agreement"`

	// Disagreements lists categories where judges significantly disagreed.
	Disagreements []JudgeDisagreement `json:"disagreements,omitempty"`

	// ConsolidatedDecision is the final decision after aggregation.
	ConsolidatedDecision Decision `json:"consolidated_decision"`

	// ConsolidatedFindings merges findings from all judges.
	ConsolidatedFindings []Finding `json:"consolidated_findings"`
}

MultiJudgeResult aggregates evaluations from multiple judges. This improves reliability by combining perspectives and detecting disagreement.

func AggregateEvaluations added in v0.2.0

func AggregateEvaluations(evaluations []*EvaluationReport, method AggregationMethod) *MultiJudgeResult

AggregateEvaluations combines multiple evaluation reports.
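
A sketch of aggregating reports from several judges (evals would hold one finalized report per judge configuration for the same document):

var evals []*evaluation.EvaluationReport // one report per judge

result := evaluation.AggregateEvaluations(evals, evaluation.AggregationMedian)

fmt.Printf("aggregated score %.1f, agreement %.2f\n",
	result.AggregatedScore, result.Agreement)
for _, d := range result.Disagreements {
	fmt.Printf("judges disagreed on %q (range %.1f)\n", d.Category, d.Range)
}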

type NextSteps

type NextSteps struct {
	// RerunCommand is the command to re-run evaluation.
	RerunCommand string `json:"rerun_command"`

	// Immediate are blocking actions that must be completed.
	Immediate []ActionItem `json:"immediate,omitempty"`

	// Recommended are suggested improvements.
	Recommended []ActionItem `json:"recommended,omitempty"`
}

NextSteps provides actionable workflow guidance.

type PairwiseCategoryScore added in v0.2.0

type PairwiseCategoryScore struct {
	// Category is the evaluation dimension.
	Category string `json:"category"`

	// Winner indicates which output won for this category.
	Winner PairwiseWinner `json:"winner"`

	// Margin indicates how much better the winner is (0-1, higher = larger gap).
	Margin float64 `json:"margin,omitempty"`

	// Reasoning explains the category-level comparison.
	Reasoning string `json:"reasoning,omitempty"`
}

PairwiseCategoryScore compares outputs on a specific dimension.

type PairwiseComparison added in v0.2.0

type PairwiseComparison struct {
	// ID is the unique identifier for this comparison.
	ID string `json:"id,omitempty"`

	// Input is the shared input/prompt for both outputs.
	Input string `json:"input"`

	// OutputA is the first output being compared.
	OutputA string `json:"output_a"`

	// OutputB is the second output being compared.
	OutputB string `json:"output_b"`

	// Winner indicates which output won ("A", "B", or "tie").
	Winner PairwiseWinner `json:"winner"`

	// Confidence is the judge's confidence in the decision (0-1).
	Confidence float64 `json:"confidence,omitempty"`

	// Reasoning explains why this winner was chosen.
	Reasoning string `json:"reasoning"`

	// CategoryScores provides per-category comparisons if applicable.
	CategoryScores []PairwiseCategoryScore `json:"category_scores,omitempty"`

	// Judge contains metadata about the LLM judge.
	Judge *JudgeMetadata `json:"judge,omitempty"`

	// Metadata contains additional comparison context.
	Metadata map[string]any `json:"metadata,omitempty"`

	// CreatedAt is when this comparison was made.
	CreatedAt time.Time `json:"created_at,omitempty"`
}

PairwiseComparison represents a comparison between two outputs. This is an alternative to absolute scoring that can reduce position bias and improve reliability of LLM-as-Judge evaluations.

func NewPairwiseComparison added in v0.2.0

func NewPairwiseComparison(input, outputA, outputB string) *PairwiseComparison

NewPairwiseComparison creates a new pairwise comparison.

func (*PairwiseComparison) AddCategoryScore added in v0.2.0

func (p *PairwiseComparison) AddCategoryScore(category string, winner PairwiseWinner, reasoning string, margin float64)

AddCategoryScore adds a per-category comparison.

func (*PairwiseComparison) SetWinner added in v0.2.0

func (p *PairwiseComparison) SetWinner(winner PairwiseWinner, reasoning string, confidence float64)

SetWinner sets the comparison result.

func (*PairwiseComparison) SwappedComparison added in v0.2.0

func (p *PairwiseComparison) SwappedComparison() *PairwiseComparison

SwappedComparison creates a comparison with A and B swapped. Running both orders helps detect position bias in the judge.
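
A sketch of a single pairwise comparison (prompt, outputA, and outputB are strings supplied by the caller; the category, reasoning, and numbers are placeholders, and producing the actual judgment is up to the caller):

cmp := evaluation.NewPairwiseComparison(prompt, outputA, outputB)

cmp.AddCategoryScore("accuracy", evaluation.WinnerA,
	"A cites the correct figures from the source.", 0.3)
cmp.SetWinner(evaluation.WinnerA,
	"A is more accurate and equally concise.", 0.8)

// Judge the same pair again with the outputs swapped to detect position bias.
swapped := cmp.SwappedComparison()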

type PairwiseResult added in v0.2.0

type PairwiseResult struct {
	// Comparisons are all the individual comparisons.
	Comparisons []PairwiseComparison `json:"comparisons"`

	// WinRateA is the percentage of comparisons won by A.
	WinRateA float64 `json:"win_rate_a"`

	// WinRateB is the percentage of comparisons won by B.
	WinRateB float64 `json:"win_rate_b"`

	// TieRate is the percentage of ties.
	TieRate float64 `json:"tie_rate"`

	// OverallWinner is the aggregated winner.
	OverallWinner PairwiseWinner `json:"overall_winner"`

	// Confidence is the overall confidence in the result.
	Confidence float64 `json:"confidence"`
}

PairwiseResult aggregates multiple pairwise comparisons.

func ComputePairwiseResult added in v0.2.0

func ComputePairwiseResult(comparisons []PairwiseComparison) *PairwiseResult

ComputePairwiseResult aggregates multiple comparisons into a result.
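
Continuing the sketch above, both orderings can be aggregated into a single result (in practice the swapped comparison is judged the same way before aggregating):

comparisons := []evaluation.PairwiseComparison{*cmp, *swapped}
result := evaluation.ComputePairwiseResult(comparisons)

fmt.Println("overall winner:", result.OverallWinner,
	"confidence:", result.Confidence)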

type PairwiseWinner added in v0.2.0

type PairwiseWinner string

PairwiseWinner indicates the winner of a pairwise comparison.

const (
	// WinnerA indicates output A is better.
	WinnerA PairwiseWinner = "A"

	// WinnerB indicates output B is better.
	WinnerB PairwiseWinner = "B"

	// WinnerTie indicates both outputs are roughly equal.
	WinnerTie PairwiseWinner = "tie"

	// WinnerUncertain indicates the judge couldn't determine a winner.
	WinnerUncertain PairwiseWinner = "uncertain"
)

type PassCriteria

type PassCriteria struct {
	// MaxCritical is the maximum allowed critical findings (default 0).
	MaxCritical int `json:"max_critical"`

	// MaxHigh is the maximum allowed high severity findings (default 0).
	MaxHigh int `json:"max_high"`

	// MaxMedium is the maximum allowed medium findings (-1 = unlimited).
	MaxMedium int `json:"max_medium,omitempty"`

	// MinScore is the minimum weighted score required.
	MinScore float64 `json:"min_score"`
}

PassCriteria defines the requirements for approval.

func DefaultPassCriteria

func DefaultPassCriteria() PassCriteria

DefaultPassCriteria returns standard pass criteria: zero critical/high findings, minimum weighted score 7.0.

func StrictPassCriteria

func StrictPassCriteria() PassCriteria

StrictPassCriteria returns strict pass criteria: zero critical/high findings, at most 3 medium findings, minimum weighted score 8.0.
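
A sketch of choosing pass criteria for a report (the custom values are illustrative):

// Presets cover common cases; a struct literal defines a custom policy.
report.PassCriteria = evaluation.StrictPassCriteria()

custom := evaluation.PassCriteria{
	MaxCritical: 0,
	MaxHigh:     1,
	MaxMedium:   -1, // -1 means unlimited medium findings
	MinScore:    6.0,
}
_ = custom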

type ReferenceData added in v0.2.0

type ReferenceData struct {
	// ID is the unique identifier for this reference.
	ID string `json:"id,omitempty"`

	// Input is the input/prompt that produced the reference output.
	Input string `json:"input,omitempty"`

	// ExpectedOutput is the gold/reference output.
	ExpectedOutput string `json:"expected_output,omitempty"`

	// ExpectedOutputs allows multiple acceptable outputs.
	ExpectedOutputs []string `json:"expected_outputs,omitempty"`

	// Context provides additional context (e.g., retrieved documents for RAG).
	Context []string `json:"context,omitempty"`

	// Annotations are human-provided labels or scores.
	Annotations []Annotation `json:"annotations,omitempty"`

	// Source indicates where this reference came from.
	Source string `json:"source,omitempty"`

	// Tags categorize or filter references.
	Tags []string `json:"tags,omitempty"`

	// Metadata contains additional reference data.
	Metadata map[string]any `json:"metadata,omitempty"`
}

ReferenceData contains ground truth or expected data for evaluation. This enables reference-based evaluation where outputs are compared against known-good examples.

func NewReferenceData added in v0.2.0

func NewReferenceData(input, expectedOutput string) *ReferenceData

NewReferenceData creates a new reference data item.

func (*ReferenceData) WithAnnotation added in v0.2.0

func (r *ReferenceData) WithAnnotation(name string, score float64, annotatorID string) *ReferenceData

WithAnnotation adds a human annotation.

func (*ReferenceData) WithContext added in v0.2.0

func (r *ReferenceData) WithContext(ctx ...string) *ReferenceData

WithContext adds context documents.
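
A sketch of attaching reference data to a report (the input, expected output, context string, and annotator ID are placeholders):

ref := evaluation.NewReferenceData(
	"Summarize the incident report.",
	"The outage lasted 42 minutes and was caused by an expired TLS certificate.").
	WithContext("retrieved: incident-2024-017.md").
	WithAnnotation("quality", 9.0, "reviewer-17")

report.SetReference(ref)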

type ReferenceDataset added in v0.2.0

type ReferenceDataset struct {
	// ID is the unique identifier for this dataset.
	ID string `json:"id"`

	// Name is the display name.
	Name string `json:"name"`

	// Description explains what this dataset contains.
	Description string `json:"description,omitempty"`

	// Version tracks dataset iterations.
	Version string `json:"version,omitempty"`

	// Items are the reference data items.
	Items []ReferenceData `json:"items"`

	// Tags categorize the dataset.
	Tags []string `json:"tags,omitempty"`

	// Metadata contains additional dataset info.
	Metadata map[string]any `json:"metadata,omitempty"`
}

ReferenceDataset is a collection of reference data items.

func NewReferenceDataset added in v0.2.0

func NewReferenceDataset(id, name string) *ReferenceDataset

NewReferenceDataset creates a new reference dataset.

func (*ReferenceDataset) AddItem added in v0.2.0

func (d *ReferenceDataset) AddItem(item ReferenceData)

AddItem adds a reference data item to the dataset.

func (*ReferenceDataset) GetByID added in v0.2.0

func (d *ReferenceDataset) GetByID(id string) *ReferenceData

GetByID retrieves a reference item by ID.
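
Continuing with the ref built above, a sketch of grouping reference items into a dataset (the IDs and names are placeholders):

ref.ID = "golden-001"

ds := evaluation.NewReferenceDataset("prd-goldens", "PRD golden examples")
ds.AddItem(*ref) // AddItem takes a ReferenceData value

if item := ds.GetByID("golden-001"); item != nil {
	report.SetReference(item)
}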

type ReportMetadata

type ReportMetadata struct {
	// Document is the filename or path being evaluated.
	Document string `json:"document"`

	// DocumentID is the document identifier (e.g., PRD ID).
	DocumentID string `json:"document_id,omitempty"`

	// DocumentTitle is the document title.
	DocumentTitle string `json:"document_title,omitempty"`

	// DocumentVersion is the document version.
	DocumentVersion string `json:"document_version,omitempty"`

	// GeneratedAt is when the report was created.
	GeneratedAt time.Time `json:"generated_at"`

	// GeneratedBy identifies what created this report.
	GeneratedBy string `json:"generated_by,omitempty"`

	// ReviewerID identifies the reviewer (agent or human).
	ReviewerID string `json:"reviewer_id,omitempty"`
}

ReportMetadata contains report identification.

type Rubric added in v0.2.0

type Rubric struct {
	// Category is the evaluation dimension this rubric applies to.
	Category string `json:"category"`

	// Description explains what this category measures.
	Description string `json:"description"`

	// Anchors define what each score level means.
	// Each anchor covers either a single score (e.g., 10, 7, 5, 3, 1) or a range (e.g., 8-10).
	Anchors []ScoreAnchor `json:"anchors"`

	// Examples provide sample inputs/outputs for each score level.
	Examples []RubricExample `json:"examples,omitempty"`
}

Rubric defines the scoring criteria for an evaluation category. It provides explicit anchors for what each score level means, improving consistency and reproducibility of LLM-as-Judge evaluations.

func NewRubric added in v0.2.0

func NewRubric(category, description string) *Rubric

NewRubric creates a new rubric for a category.

func (*Rubric) AddAnchor added in v0.2.0

func (r *Rubric) AddAnchor(score float64, label, description string, criteria ...string) *Rubric

AddAnchor adds a score anchor to the rubric.

func (*Rubric) AddExample added in v0.2.0

func (r *Rubric) AddExample(score float64, output, explanation string) *Rubric

AddExample adds an example to the rubric.

func (*Rubric) AddRangeAnchor added in v0.2.0

func (r *Rubric) AddRangeAnchor(minScore, maxScore float64, label, description string, criteria ...string) *Rubric

AddRangeAnchor adds a range-based score anchor.

func (*Rubric) GetAnchorForScore added in v0.2.0

func (r *Rubric) GetAnchorForScore(score float64) *ScoreAnchor

GetAnchorForScore returns the anchor that applies to the given score.
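
A sketch of defining a rubric with range anchors (the category, labels, criteria, and example text are placeholders):

r := evaluation.NewRubric("clarity", "How unambiguous and well-structured the document is.").
	AddRangeAnchor(8, 10, "Excellent", "Goals, scope, and terminology are precise.",
		"No undefined acronyms", "Every requirement is testable").
	AddRangeAnchor(5, 7.9, "Adequate", "Mostly clear, with some vague sections.").
	AddRangeAnchor(0, 4.9, "Poor", "The reader cannot tell what is being built.").
	AddExample(9, "…", "Concrete, measurable goals with explicit non-goals.")

if a := r.GetAnchorForScore(6.5); a != nil {
	fmt.Println(a.Label) // expected to be "Adequate" given the ranges above
}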

type RubricExample added in v0.2.0

type RubricExample struct {
	// Score is the score this example demonstrates.
	Score float64 `json:"score"`

	// Input is the example input/prompt.
	Input string `json:"input,omitempty"`

	// Output is the example output being scored.
	Output string `json:"output"`

	// Explanation describes why this output receives this score.
	Explanation string `json:"explanation"`
}

RubricExample provides a concrete example for a score level.

type RubricSet added in v0.2.0

type RubricSet struct {
	// ID is the unique identifier for this rubric set.
	ID string `json:"id"`

	// Name is the display name.
	Name string `json:"name"`

	// Version tracks rubric iterations.
	Version string `json:"version"`

	// Description explains what this rubric set evaluates.
	Description string `json:"description,omitempty"`

	// Rubrics are the category-specific rubrics.
	Rubrics []Rubric `json:"rubrics"`
}

RubricSet is a collection of rubrics for a complete evaluation.

func DefaultPRDRubricSet added in v0.2.0

func DefaultPRDRubricSet() *RubricSet

DefaultPRDRubricSet returns a standard rubric set for PRD evaluation.

type ScoreAnchor added in v0.2.0

type ScoreAnchor struct {
	// Score is the numeric score this anchor represents.
	// Use -1 for range-based anchors where MinScore/MaxScore are set.
	Score float64 `json:"score,omitempty"`

	// MinScore is the minimum score for range-based anchors.
	MinScore float64 `json:"min_score,omitempty"`

	// MaxScore is the maximum score for range-based anchors.
	MaxScore float64 `json:"max_score,omitempty"`

	// Label is a short name for this level (e.g., "Excellent", "Good", "Poor").
	Label string `json:"label"`

	// Description explains what qualifies for this score.
	Description string `json:"description"`

	// Criteria are specific requirements for this score level.
	Criteria []string `json:"criteria,omitempty"`
}

ScoreAnchor defines the criteria for a specific score or score range.

type ScoreStatus

type ScoreStatus string

ScoreStatus represents the pass/warn/fail status for a category score.

const (
	ScoreStatusPass          ScoreStatus = "pass"              // Score >= 7.0
	ScoreStatusWarn          ScoreStatus = "warn"              // Score >= 5.0 && < 7.0
	ScoreStatusFail          ScoreStatus = "fail"              // Score < 5.0
	CategoryPending          ScoreStatus = "pending"           // Not yet evaluated
	CategoryNeedsImprovement ScoreStatus = "needs_improvement" // Requires attention
)

func (ScoreStatus) Icon

func (s ScoreStatus) Icon() string

Icon returns the emoji icon for the score status.

type Severity

type Severity string

Severity represents the severity level of a finding. Based on InfoSec severity classifications.

const (
	SeverityCritical Severity = "critical" // Blocks approval, must fix
	SeverityHigh     Severity = "high"     // Blocks approval, must fix
	SeverityMedium   Severity = "medium"   // Should fix before approval
	SeverityLow      Severity = "low"      // Nice to fix
	SeverityInfo     Severity = "info"     // Informational only
)

func AllSeverities

func AllSeverities() []Severity

AllSeverities returns all severity levels in order of severity.

func (Severity) Icon

func (s Severity) Icon() string

Icon returns the emoji icon for the severity.

func (Severity) IsBlocking

func (s Severity) IsBlocking() bool

IsBlocking returns true if this severity blocks approval.

func (Severity) Weight

func (s Severity) Weight() int

Weight returns a numeric weight for sorting (higher = more severe).
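
A short sketch of ordering findings by severity using Weight (findings is any []Finding; sort and fmt imports elided):

sort.Slice(findings, func(i, j int) bool {
	return findings[i].Severity.Weight() > findings[j].Severity.Weight() // most severe first
})

for _, f := range findings {
	fmt.Printf("%s %s: %s\n", f.Severity.Icon(), f.Severity, f.Title)
}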

type TokenUsage added in v0.2.0

type TokenUsage struct {
	// InputTokens is the number of input/prompt tokens.
	InputTokens int `json:"input_tokens"`

	// OutputTokens is the number of output/completion tokens.
	OutputTokens int `json:"output_tokens"`

	// TotalTokens is the total tokens used.
	TotalTokens int `json:"total_tokens"`

	// CacheReadTokens is tokens read from cache (if applicable).
	CacheReadTokens int `json:"cache_read_tokens,omitempty"`

	// CacheWriteTokens is tokens written to cache (if applicable).
	CacheWriteTokens int `json:"cache_write_tokens,omitempty"`
}

TokenUsage tracks token consumption for an evaluation.
