Documentation ¶
Overview ¶
Package evaluation provides types for detailed evaluation reports with severity-based findings and recommendations. It is suited for LLM-as-Judge style reviews such as PRD and ARB evaluations.
Index ¶
- type ActionItem
- type AggregationMethod
- type Annotation
- type CategoryScore
- type Decision
- type DecisionStatus
- type EvaluationReport
- func (r *EvaluationReport) AddCategory(cs CategoryScore)
- func (r *EvaluationReport) AddFinding(f Finding)
- func (r *EvaluationReport) ComputeWeightedScore() float64
- func (r *EvaluationReport) Evaluate() Decision
- func (r *EvaluationReport) Finalize(rerunCommand string)
- func (r *EvaluationReport) GenerateNextSteps(rerunCommand string)
- func (r *EvaluationReport) GenerateSummary() string
- func (r *EvaluationReport) SetJudge(judge *JudgeMetadata)
- func (r *EvaluationReport) SetReference(ref *ReferenceData)
- func (r *EvaluationReport) SetRubric(rubricID string)
- type Finding
- type FindingCounts
- type JudgeDisagreement
- type JudgeMetadata
- func (j *JudgeMetadata) SetLatency(d time.Duration)
- func (j *JudgeMetadata) WithPrompt(template, version string) *JudgeMetadata
- func (j *JudgeMetadata) WithProvider(provider string) *JudgeMetadata
- func (j *JudgeMetadata) WithRubric(id, version string) *JudgeMetadata
- func (j *JudgeMetadata) WithTemperature(temp float64) *JudgeMetadata
- func (j *JudgeMetadata) WithTokenUsage(input, output int) *JudgeMetadata
- func (j *JudgeMetadata) WithTrace(traceID, spanID string) *JudgeMetadata
- type JudgeScore
- type MultiJudgeResult
- type NextSteps
- type PairwiseCategoryScore
- type PairwiseComparison
- type PairwiseResult
- type PairwiseWinner
- type PassCriteria
- type ReferenceData
- type ReferenceDataset
- type ReportMetadata
- type Rubric
- func (r *Rubric) AddAnchor(score float64, label, description string, criteria ...string) *Rubric
- func (r *Rubric) AddExample(score float64, output, explanation string) *Rubric
- func (r *Rubric) AddRangeAnchor(minScore, maxScore float64, label, description string, criteria ...string) *Rubric
- func (r *Rubric) GetAnchorForScore(score float64) *ScoreAnchor
- type RubricExample
- type RubricSet
- type ScoreAnchor
- type ScoreStatus
- type Severity
- type TokenUsage
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type ActionItem ¶
type ActionItem struct {
// Action describes what needs to be done.
Action string `json:"action"`
// Category is the related evaluation category.
Category string `json:"category,omitempty"`
// Severity is the related finding severity.
Severity Severity `json:"severity,omitempty"`
// Owner suggests who should do this.
Owner string `json:"owner,omitempty"`
// Effort estimates work required.
Effort string `json:"effort,omitempty"`
}
ActionItem is a specific action to take.
type AggregationMethod ¶ added in v0.2.0
type AggregationMethod string
AggregationMethod specifies how to combine multiple judge scores.
const (
	// AggregationMean uses the arithmetic mean of scores.
	AggregationMean AggregationMethod = "mean"
	// AggregationMedian uses the median score.
	AggregationMedian AggregationMethod = "median"
	// AggregationWeighted uses weighted average based on judge confidence.
	AggregationWeighted AggregationMethod = "weighted"
	// AggregationMajority uses majority vote for pass/fail.
	AggregationMajority AggregationMethod = "majority"
	// AggregationConservative uses the lowest/most critical score.
	AggregationConservative AggregationMethod = "conservative"
)
type Annotation ¶ added in v0.2.0
type Annotation struct {
// Name is the annotation type (e.g., "quality", "relevance").
Name string `json:"name"`
// Score is a numeric score (if applicable).
Score float64 `json:"score,omitempty"`
// Label is a categorical label (if applicable).
Label string `json:"label,omitempty"`
// Explanation provides reasoning for the annotation.
Explanation string `json:"explanation,omitempty"`
// AnnotatorID identifies who provided this annotation.
AnnotatorID string `json:"annotator_id,omitempty"`
// AnnotatorType indicates human vs automated (e.g., "human", "llm", "rule").
AnnotatorType string `json:"annotator_type,omitempty"`
}
Annotation represents a human-provided label or score.
type CategoryScore ¶
type CategoryScore struct {
// Category is the name/ID of the category.
Category string `json:"category"`
// Weight is the category weight (0.0-1.0, should sum to 1.0).
Weight float64 `json:"weight"`
// Score is the category score (0.0-10.0).
Score float64 `json:"score"`
// MaxScore is the maximum possible score (default 10.0).
MaxScore float64 `json:"max_score"`
// Status is the derived status (pass/warn/fail).
Status ScoreStatus `json:"status"`
// Justification explains why this score was given.
Justification string `json:"justification"`
// Evidence provides specific supporting evidence.
Evidence string `json:"evidence,omitempty"`
// Findings are issues found in this category.
Findings []Finding `json:"findings,omitempty"`
}
CategoryScore represents a score for a single evaluation category.
func NewCategoryScore ¶
func NewCategoryScore(category string, weight, score float64, justification string) CategoryScore
NewCategoryScore creates a category score with computed status.
func (*CategoryScore) ComputeStatus ¶
func (c *CategoryScore) ComputeStatus() ScoreStatus
ComputeStatus calculates the status from the score.
func (*CategoryScore) ComputeWeightedScore ¶
func (c *CategoryScore) ComputeWeightedScore() float64
ComputeWeightedScore calculates the weighted contribution of this category.
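Example (illustrative sketch, not shipped with the package). It uses only the constructor and methods documented above; the category name, weight, score, and justification are placeholders:

func ExampleNewCategoryScore() {
	// Weight 0.25 means this category contributes up to 25% of the overall weighted score.
	cs := NewCategoryScore("clarity", 0.25, 8.0, "Requirements are specific and testable")

	// Status is derived from the score (pass/warn/fail).
	_ = cs.Status

	// ComputeWeightedScore returns this category's weighted contribution to the report total.
	_ = cs.ComputeWeightedScore()
}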
type Decision ¶
type Decision struct {
// Status is the decision outcome.
Status DecisionStatus `json:"status"`
// Passed indicates if the evaluation passed.
Passed bool `json:"passed"`
// Rationale explains the decision.
Rationale string `json:"rationale"`
// FindingCounts summarizes findings by severity.
FindingCounts FindingCounts `json:"finding_counts"`
// WeightedScore is the final weighted score.
WeightedScore float64 `json:"weighted_score"`
}
Decision represents the evaluation decision.
type DecisionStatus ¶
type DecisionStatus string
DecisionStatus represents the decision outcome.
const (
	DecisionPass        DecisionStatus = "pass"         // Meets all criteria
	DecisionConditional DecisionStatus = "conditional"  // Meets score but has findings
	DecisionFail        DecisionStatus = "fail"         // Has blocking findings
	DecisionHumanReview DecisionStatus = "human_review" // Requires human judgment
)
type EvaluationReport ¶
type EvaluationReport struct {
// Schema is the JSON Schema URL.
Schema string `json:"$schema,omitempty"`
// Metadata contains report identification and audit info.
Metadata ReportMetadata `json:"metadata"`
// ReviewType identifies the type of review (prd, arb, security, etc.).
ReviewType string `json:"review_type"`
// Judge contains metadata about the LLM judge (v0.2.0).
Judge *JudgeMetadata `json:"judge,omitempty"`
// RubricID references the rubric used for scoring (v0.2.0).
RubricID string `json:"rubric_id,omitempty"`
// Reference contains gold/expected data for comparison (v0.2.0).
Reference *ReferenceData `json:"reference,omitempty"`
// Categories contains scores for each evaluation dimension.
Categories []CategoryScore `json:"categories"`
// Findings are all issues discovered during evaluation.
Findings []Finding `json:"findings"`
// WeightedScore is the overall weighted score.
WeightedScore float64 `json:"weighted_score"`
// PassCriteria defines the requirements for approval.
PassCriteria PassCriteria `json:"pass_criteria"`
// Decision is the evaluation outcome.
Decision Decision `json:"decision"`
// NextSteps provides actionable guidance.
NextSteps NextSteps `json:"next_steps"`
// Summary is the overall assessment.
Summary string `json:"summary"`
}
EvaluationReport is the detailed evaluation report for LLM-as-Judge reviews.
func NewEvaluationReport ¶
func NewEvaluationReport(reviewType, document string) *EvaluationReport
NewEvaluationReport creates a new evaluation report.
func (*EvaluationReport) AddCategory ¶
func (r *EvaluationReport) AddCategory(cs CategoryScore)
AddCategory adds a category score.
func (*EvaluationReport) AddFinding ¶
func (r *EvaluationReport) AddFinding(f Finding)
AddFinding adds a finding.
func (*EvaluationReport) ComputeWeightedScore ¶
func (r *EvaluationReport) ComputeWeightedScore() float64
ComputeWeightedScore calculates the overall weighted score.
func (*EvaluationReport) Evaluate ¶
func (r *EvaluationReport) Evaluate() Decision
Evaluate computes the decision based on findings and score.
func (*EvaluationReport) Finalize ¶
func (r *EvaluationReport) Finalize(rerunCommand string)
Finalize computes all derived fields.
func (*EvaluationReport) GenerateNextSteps ¶
func (r *EvaluationReport) GenerateNextSteps(rerunCommand string)
GenerateNextSteps creates actionable next steps.
func (*EvaluationReport) GenerateSummary ¶
func (r *EvaluationReport) GenerateSummary() string
GenerateSummary creates the summary text.
func (*EvaluationReport) SetJudge ¶ added in v0.2.0
func (r *EvaluationReport) SetJudge(judge *JudgeMetadata)
SetJudge sets the judge metadata.
func (*EvaluationReport) SetReference ¶ added in v0.2.0
func (r *EvaluationReport) SetReference(ref *ReferenceData)
SetReference sets the reference data for comparison.
func (*EvaluationReport) SetRubric ¶ added in v0.2.0
func (r *EvaluationReport) SetRubric(rubricID string)
SetRubric sets the rubric ID.
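Example (illustrative sketch, not shipped with the package). A minimal end-to-end report-building flow using only the constructors and methods documented here; the document path, scores, severity value, and rerun command are placeholders:

func ExampleEvaluationReport() {
	r := NewEvaluationReport("prd", "docs/checkout-prd.md") // placeholder document

	// Attach judge metadata and the rubric used for scoring.
	r.SetJudge(NewJudgeMetadata("claude-3-opus-20240229").WithProvider("anthropic"))
	r.SetRubric("prd-rubric")

	// Record category scores and any findings discovered.
	r.AddCategory(NewCategoryScore("clarity", 0.5, 8.0, "Goals are unambiguous"))
	r.AddCategory(NewCategoryScore("completeness", 0.5, 6.5, "Missing rollout plan"))
	r.AddFinding(Finding{
		ID:             "F-001",
		Category:       "completeness",
		Severity:       "high", // placeholder; the package's Severity constants are not shown in this section
		Title:          "No rollout plan",
		Description:    "The PRD does not describe how the feature will be rolled out.",
		Recommendation: "Add a phased rollout section.",
	})

	// Finalize computes the weighted score, decision, next steps, and summary.
	r.Finalize("make evaluate-prd") // placeholder rerun command

	_ = r.WeightedScore
	_ = r.Decision.Status
}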
type Finding ¶
type Finding struct {
// ID is the unique identifier for this finding.
ID string `json:"id"`
// Category is the evaluation category this relates to.
Category string `json:"category"`
// Severity indicates the impact level.
Severity Severity `json:"severity"`
// Title is a brief summary of the finding.
Title string `json:"title"`
// Description provides detailed explanation.
Description string `json:"description"`
// Recommendation explains how to fix the issue.
Recommendation string `json:"recommendation"`
// Evidence provides specific examples or references.
Evidence string `json:"evidence,omitempty"`
// Owner suggests who should address this finding.
Owner string `json:"owner,omitempty"`
// Effort estimates the work required (low, medium, high).
Effort string `json:"effort,omitempty"`
}
Finding represents an issue discovered during evaluation.
func (*Finding) IsBlocking ¶
func (f *Finding) IsBlocking() bool
IsBlocking returns true if this finding blocks approval.
type FindingCounts ¶
type FindingCounts struct {
Critical int `json:"critical"`
High int `json:"high"`
Medium int `json:"medium"`
Low int `json:"low"`
Info int `json:"info"`
Total int `json:"total"`
}
FindingCounts tracks the number of findings by severity.
func CountFindings ¶
func CountFindings(findings []Finding) FindingCounts
CountFindings counts findings by severity.
func (FindingCounts) BlockingCount ¶
func (c FindingCounts) BlockingCount() int
BlockingCount returns the number of blocking findings.
func (FindingCounts) HasBlocking ¶
func (c FindingCounts) HasBlocking() bool
HasBlocking returns true if there are any blocking findings.
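Example (illustrative sketch, not shipped with the package). The findings and severity values below are placeholders:

func ExampleCountFindings() {
	findings := []Finding{
		{ID: "F-001", Severity: "critical"}, // placeholder severity values
		{ID: "F-002", Severity: "low"},
	}

	counts := CountFindings(findings)
	if counts.HasBlocking() {
		// BlockingCount reports how many findings block approval.
		_ = counts.BlockingCount()
	}
}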
type JudgeDisagreement ¶ added in v0.2.0
type JudgeDisagreement struct {
// Category is the evaluation dimension.
Category string `json:"category"`
// Scores are the individual judge scores.
Scores []JudgeScore `json:"scores"`
// Range is the difference between max and min scores.
Range float64 `json:"range"`
// StandardDeviation measures score spread.
StandardDeviation float64 `json:"standard_deviation"`
}
JudgeDisagreement captures where judges had significantly different scores.
type JudgeMetadata ¶ added in v0.2.0
type JudgeMetadata struct {
// JudgeID is a unique identifier for this judge configuration.
JudgeID string `json:"judge_id,omitempty"`
// Model is the LLM model used (e.g., "claude-3-opus-20240229", "gpt-4-turbo").
Model string `json:"model"`
// ModelProvider is the API provider (e.g., "anthropic", "openai", "bedrock").
ModelProvider string `json:"model_provider,omitempty"`
// ModelVersion is the specific model version if applicable.
ModelVersion string `json:"model_version,omitempty"`
// PromptTemplate is the name/ID of the prompt template used.
PromptTemplate string `json:"prompt_template,omitempty"`
// PromptVersion is the version of the prompt template.
PromptVersion string `json:"prompt_version,omitempty"`
// SystemPrompt is the system prompt used (or hash/reference if too long).
SystemPrompt string `json:"system_prompt,omitempty"`
// Temperature is the sampling temperature used.
Temperature float64 `json:"temperature,omitempty"`
// MaxTokens is the max tokens setting.
MaxTokens int `json:"max_tokens,omitempty"`
// RubricID references the rubric set used for scoring.
RubricID string `json:"rubric_id,omitempty"`
// RubricVersion is the version of the rubric used.
RubricVersion string `json:"rubric_version,omitempty"`
// EvaluatedAt is when this evaluation was performed.
EvaluatedAt time.Time `json:"evaluated_at,omitempty"`
// Latency is the evaluation duration.
Latency time.Duration `json:"latency,omitempty"`
// TokensUsed tracks token consumption.
TokensUsed *TokenUsage `json:"tokens_used,omitempty"`
// TraceID links to observability trace (e.g., for Opik/Phoenix/Langfuse).
TraceID string `json:"trace_id,omitempty"`
// SpanID links to observability span.
SpanID string `json:"span_id,omitempty"`
}
JudgeMetadata tracks information about the LLM judge that produced an evaluation. This enables reproducibility, debugging, and comparison of different judge configurations.
func NewJudgeMetadata ¶ added in v0.2.0
func NewJudgeMetadata(model string) *JudgeMetadata
NewJudgeMetadata creates judge metadata with required fields.
func (*JudgeMetadata) SetLatency ¶ added in v0.2.0
func (j *JudgeMetadata) SetLatency(d time.Duration)
SetLatency records the evaluation duration.
func (*JudgeMetadata) WithPrompt ¶ added in v0.2.0
func (j *JudgeMetadata) WithPrompt(template, version string) *JudgeMetadata
WithPrompt sets the prompt template info.
func (*JudgeMetadata) WithProvider ¶ added in v0.2.0
func (j *JudgeMetadata) WithProvider(provider string) *JudgeMetadata
WithProvider sets the model provider.
func (*JudgeMetadata) WithRubric ¶ added in v0.2.0
func (j *JudgeMetadata) WithRubric(id, version string) *JudgeMetadata
WithRubric sets the rubric reference.
func (*JudgeMetadata) WithTemperature ¶ added in v0.2.0
func (j *JudgeMetadata) WithTemperature(temp float64) *JudgeMetadata
WithTemperature sets the sampling temperature.
func (*JudgeMetadata) WithTokenUsage ¶ added in v0.2.0
func (j *JudgeMetadata) WithTokenUsage(input, output int) *JudgeMetadata
WithTokenUsage sets the token usage.
func (*JudgeMetadata) WithTrace ¶ added in v0.2.0
func (j *JudgeMetadata) WithTrace(traceID, spanID string) *JudgeMetadata
WithTrace links to observability.
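Example (illustrative sketch, not shipped with the package). It chains the builder-style With* methods documented above; the model, provider, versions, token counts, and trace IDs are placeholders, and the time package is assumed to be imported:

func ExampleNewJudgeMetadata() {
	j := NewJudgeMetadata("gpt-4-turbo").
		WithProvider("openai").
		WithTemperature(0.0).
		WithPrompt("prd-judge", "v3").
		WithRubric("prd-rubric", "1.2.0").
		WithTokenUsage(1850, 640).
		WithTrace("trace-123", "span-456")

	// SetLatency records how long the evaluation took.
	j.SetLatency(2300 * time.Millisecond)
}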
type JudgeScore ¶ added in v0.2.0
type JudgeScore struct {
// JudgeID identifies the judge.
JudgeID string `json:"judge_id"`
// Score is the judge's score.
Score float64 `json:"score"`
}
JudgeScore is a score from a specific judge.
type MultiJudgeResult ¶ added in v0.2.0
type MultiJudgeResult struct {
// Evaluations are the individual judge evaluations.
Evaluations []*EvaluationReport `json:"evaluations"`
// Judges contains metadata for each judge.
Judges []*JudgeMetadata `json:"judges"`
// AggregatedScore is the combined score (mean, median, or weighted).
AggregatedScore float64 `json:"aggregated_score"`
// AggregationMethod describes how scores were combined.
AggregationMethod AggregationMethod `json:"aggregation_method"`
// Agreement measures inter-judge agreement (0-1, higher = more agreement).
Agreement float64 `json:"agreement"`
// Disagreements lists categories where judges significantly disagreed.
Disagreements []JudgeDisagreement `json:"disagreements,omitempty"`
// ConsolidatedDecision is the final decision after aggregation.
ConsolidatedDecision Decision `json:"consolidated_decision"`
// ConsolidatedFindings merges findings from all judges.
ConsolidatedFindings []Finding `json:"consolidated_findings"`
}
MultiJudgeResult aggregates evaluations from multiple judges. This improves reliability by combining perspectives and detecting disagreement.
func AggregateEvaluations ¶ added in v0.2.0
func AggregateEvaluations(evaluations []*EvaluationReport, method AggregationMethod) *MultiJudgeResult
AggregateEvaluations combines multiple evaluation reports.
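Example (illustrative sketch, not shipped with the package). It assumes the individual reports have been scored and finalized by their own judges:

func ExampleAggregateEvaluations() {
	reportA := NewEvaluationReport("prd", "docs/checkout-prd.md")
	reportB := NewEvaluationReport("prd", "docs/checkout-prd.md")
	// ... each report would be scored and finalized by a separate judge ...

	result := AggregateEvaluations([]*EvaluationReport{reportA, reportB}, AggregationMedian)

	// Agreement close to 1 means the judges largely agreed; Disagreements lists
	// the categories where their scores diverged.
	_ = result.Agreement
	_ = result.Disagreements
	_ = result.ConsolidatedDecision
}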
type NextSteps ¶
type NextSteps struct {
// RerunCommand is the command to re-run evaluation.
RerunCommand string `json:"rerun_command"`
// Immediate are blocking actions that must be completed.
Immediate []ActionItem `json:"immediate,omitempty"`
// Recommended are suggested improvements.
Recommended []ActionItem `json:"recommended,omitempty"`
}
NextSteps provides actionable workflow guidance.
type PairwiseCategoryScore ¶ added in v0.2.0
type PairwiseCategoryScore struct {
// Category is the evaluation dimension.
Category string `json:"category"`
// Winner indicates which output won for this category.
Winner PairwiseWinner `json:"winner"`
// Margin indicates how much better the winner is (0-1, higher = larger gap).
Margin float64 `json:"margin,omitempty"`
// Reasoning explains the category-level comparison.
Reasoning string `json:"reasoning,omitempty"`
}
PairwiseCategoryScore compares outputs on a specific dimension.
type PairwiseComparison ¶ added in v0.2.0
type PairwiseComparison struct {
// ID is the unique identifier for this comparison.
ID string `json:"id,omitempty"`
// Input is the shared input/prompt for both outputs.
Input string `json:"input"`
// OutputA is the first output being compared.
OutputA string `json:"output_a"`
// OutputB is the second output being compared.
OutputB string `json:"output_b"`
// Winner indicates which output won ("A", "B", or "tie").
Winner PairwiseWinner `json:"winner"`
// Confidence is the judge's confidence in the decision (0-1).
Confidence float64 `json:"confidence,omitempty"`
// Reasoning explains why this winner was chosen.
Reasoning string `json:"reasoning"`
// CategoryScores provides per-category comparisons if applicable.
CategoryScores []PairwiseCategoryScore `json:"category_scores,omitempty"`
// Judge contains metadata about the LLM judge.
Judge *JudgeMetadata `json:"judge,omitempty"`
// Metadata contains additional comparison context.
Metadata map[string]any `json:"metadata,omitempty"`
// CreatedAt is when this comparison was made.
CreatedAt time.Time `json:"created_at,omitempty"`
}
PairwiseComparison represents a comparison between two outputs. This is an alternative to absolute scoring that can reduce position bias and improve reliability of LLM-as-Judge evaluations.
func NewPairwiseComparison ¶ added in v0.2.0
func NewPairwiseComparison(input, outputA, outputB string) *PairwiseComparison
NewPairwiseComparison creates a new pairwise comparison.
func (*PairwiseComparison) AddCategoryScore ¶ added in v0.2.0
func (p *PairwiseComparison) AddCategoryScore(category string, winner PairwiseWinner, reasoning string, margin float64)
AddCategoryScore adds a per-category comparison.
func (*PairwiseComparison) SetWinner ¶ added in v0.2.0
func (p *PairwiseComparison) SetWinner(winner PairwiseWinner, reasoning string, confidence float64)
SetWinner sets the comparison result.
func (*PairwiseComparison) SwappedComparison ¶ added in v0.2.0
func (p *PairwiseComparison) SwappedComparison() *PairwiseComparison
SwappedComparison creates a comparison with A and B swapped. Running both orders helps detect position bias in the judge.
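Example (illustrative sketch, not shipped with the package). A single pairwise judgment, including the swapped-order run used to check for position bias; the input, outputs, reasoning, and confidence values are placeholders:

func ExamplePairwiseComparison() {
	cmp := NewPairwiseComparison(
		"Summarize the incident report",         // shared input
		"Summary A: concise, covers root cause", // output A
		"Summary B: verbose, misses root cause", // output B
	)
	cmp.AddCategoryScore("faithfulness", WinnerA, "A reflects the root cause correctly", 0.4)
	cmp.SetWinner(WinnerA, "A is more accurate and concise", 0.85)

	// Re-run the judge on the swapped comparison; if the swapped run prefers the
	// other position, the judge may be position-biased.
	swapped := cmp.SwappedComparison()
	_ = swapped
}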
type PairwiseResult ¶ added in v0.2.0
type PairwiseResult struct {
// Comparisons are all the individual comparisons.
Comparisons []PairwiseComparison `json:"comparisons"`
// WinRateA is the percentage of comparisons won by A.
WinRateA float64 `json:"win_rate_a"`
// WinRateB is the percentage of comparisons won by B.
WinRateB float64 `json:"win_rate_b"`
// TieRate is the percentage of ties.
TieRate float64 `json:"tie_rate"`
// OverallWinner is the aggregated winner.
OverallWinner PairwiseWinner `json:"overall_winner"`
// Confidence is the overall confidence in the result.
Confidence float64 `json:"confidence"`
}
PairwiseResult aggregates multiple pairwise comparisons.
func ComputePairwiseResult ¶ added in v0.2.0
func ComputePairwiseResult(comparisons []PairwiseComparison) *PairwiseResult
ComputePairwiseResult aggregates multiple comparisons into a result.
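Example (illustrative sketch, not shipped with the package). It assumes the comparisons have already been judged, for instance by running both orders via SwappedComparison:

func ExampleComputePairwiseResult() {
	comparisons := []PairwiseComparison{
		{Winner: WinnerA, Confidence: 0.9},
		{Winner: WinnerA, Confidence: 0.7},
		{Winner: WinnerTie, Confidence: 0.6},
	}

	result := ComputePairwiseResult(comparisons)
	_ = result.OverallWinner // aggregated winner across all comparisons
	_ = result.WinRateA      // share of comparisons won by A
}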
type PairwiseWinner ¶ added in v0.2.0
type PairwiseWinner string
PairwiseWinner indicates the winner of a pairwise comparison.
const (
	// WinnerA indicates output A is better.
	WinnerA PairwiseWinner = "A"
	// WinnerB indicates output B is better.
	WinnerB PairwiseWinner = "B"
	// WinnerTie indicates both outputs are roughly equal.
	WinnerTie PairwiseWinner = "tie"
	// WinnerUncertain indicates the judge couldn't determine a winner.
	WinnerUncertain PairwiseWinner = "uncertain"
)
type PassCriteria ¶
type PassCriteria struct {
// MaxCritical is the maximum allowed critical findings (default 0).
MaxCritical int `json:"max_critical"`
// MaxHigh is the maximum allowed high severity findings (default 0).
MaxHigh int `json:"max_high"`
// MaxMedium is the maximum allowed medium findings (-1 = unlimited).
MaxMedium int `json:"max_medium,omitempty"`
// MinScore is the minimum weighted score required.
MinScore float64 `json:"min_score"`
}
PassCriteria defines the requirements for approval.
func DefaultPassCriteria ¶
func DefaultPassCriteria() PassCriteria
DefaultPassCriteria returns standard pass criteria. Zero Critical/High, minimum score 7.0.
func StrictPassCriteria ¶
func StrictPassCriteria() PassCriteria
StrictPassCriteria returns strict pass criteria. Zero Critical/High, max 3 Medium, minimum score 8.0.
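Example (illustrative sketch, not shipped with the package). It starts from the standard criteria and adjusts only the documented fields; the thresholds are placeholders:

func ExamplePassCriteria() {
	r := NewEvaluationReport("arb", "docs/payments-arb.md") // placeholder document

	criteria := DefaultPassCriteria()
	criteria.MinScore = 8.5
	criteria.MaxMedium = 5

	r.PassCriteria = criteria
	// StrictPassCriteria() could be used instead for the stricter defaults.
}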
type ReferenceData ¶ added in v0.2.0
type ReferenceData struct {
// ID is the unique identifier for this reference.
ID string `json:"id,omitempty"`
// Input is the input/prompt that produced the reference output.
Input string `json:"input,omitempty"`
// ExpectedOutput is the gold/reference output.
ExpectedOutput string `json:"expected_output,omitempty"`
// ExpectedOutputs allows multiple acceptable outputs.
ExpectedOutputs []string `json:"expected_outputs,omitempty"`
// Context provides additional context (e.g., retrieved documents for RAG).
Context []string `json:"context,omitempty"`
// Annotations are human-provided labels or scores.
Annotations []Annotation `json:"annotations,omitempty"`
// Source indicates where this reference came from.
Source string `json:"source,omitempty"`
// Tags categorize or filter references.
Tags []string `json:"tags,omitempty"`
// Metadata contains additional reference data.
Metadata map[string]any `json:"metadata,omitempty"`
}
ReferenceData contains ground truth or expected data for evaluation. This enables reference-based evaluation where outputs are compared against known-good examples.
func NewReferenceData ¶ added in v0.2.0
func NewReferenceData(input, expectedOutput string) *ReferenceData
NewReferenceData creates a new reference data item.
func (*ReferenceData) WithAnnotation ¶ added in v0.2.0
func (r *ReferenceData) WithAnnotation(name string, score float64, annotatorID string) *ReferenceData
WithAnnotation adds a human annotation.
func (*ReferenceData) WithContext ¶ added in v0.2.0
func (r *ReferenceData) WithContext(ctx ...string) *ReferenceData
WithContext adds context documents.
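Example (illustrative sketch, not shipped with the package). It attaches reference (gold) data to a report for reference-based evaluation; the input, expected output, context, and annotator ID are placeholders:

func ExampleNewReferenceData() {
	ref := NewReferenceData(
		"What is our refund policy?",                        // input
		"Refunds are available within 30 days of purchase.", // expected output
	).
		WithContext("Policy doc section 4.2: refunds within 30 days"). // e.g., retrieved RAG context
		WithAnnotation("quality", 9.0, "reviewer-42")

	r := NewEvaluationReport("rag", "transcripts/refund-question.json") // placeholder document
	r.SetReference(ref)
}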
type ReferenceDataset ¶ added in v0.2.0
type ReferenceDataset struct {
// ID is the unique identifier for this dataset.
ID string `json:"id"`
// Name is the display name.
Name string `json:"name"`
// Description explains what this dataset contains.
Description string `json:"description,omitempty"`
// Version tracks dataset iterations.
Version string `json:"version,omitempty"`
// Items are the reference data items.
Items []ReferenceData `json:"items"`
// Tags categorize the dataset.
Tags []string `json:"tags,omitempty"`
// Metadata contains additional dataset info.
Metadata map[string]any `json:"metadata,omitempty"`
}
ReferenceDataset is a collection of reference data items.
func NewReferenceDataset ¶ added in v0.2.0
func NewReferenceDataset(id, name string) *ReferenceDataset
NewReferenceDataset creates a new reference dataset.
func (*ReferenceDataset) AddItem ¶ added in v0.2.0
func (d *ReferenceDataset) AddItem(item ReferenceData)
AddItem adds a reference data item to the dataset.
func (*ReferenceDataset) GetByID ¶ added in v0.2.0
func (d *ReferenceDataset) GetByID(id string) *ReferenceData
GetByID retrieves a reference item by ID.
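Example (illustrative sketch, not shipped with the package). It groups reference items into a dataset and looks one up by ID; the IDs and names are placeholders:

func ExampleNewReferenceDataset() {
	ds := NewReferenceDataset("refund-qa-v1", "Refund Q&A references")

	item := NewReferenceData("What is our refund policy?", "Refunds within 30 days")
	item.ID = "ref-001"
	ds.AddItem(*item)

	if found := ds.GetByID("ref-001"); found != nil {
		_ = found.ExpectedOutput
	}
}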
type ReportMetadata ¶
type ReportMetadata struct {
// Document is the filename or path being evaluated.
Document string `json:"document"`
// DocumentID is the document identifier (e.g., PRD ID).
DocumentID string `json:"document_id,omitempty"`
// DocumentTitle is the document title.
DocumentTitle string `json:"document_title,omitempty"`
// DocumentVersion is the document version.
DocumentVersion string `json:"document_version,omitempty"`
// GeneratedAt is when the report was created.
GeneratedAt time.Time `json:"generated_at"`
// GeneratedBy identifies what created this report.
GeneratedBy string `json:"generated_by,omitempty"`
// ReviewerID identifies the reviewer (agent or human).
ReviewerID string `json:"reviewer_id,omitempty"`
}
ReportMetadata contains report identification.
type Rubric ¶ added in v0.2.0
type Rubric struct {
// Category is the evaluation dimension this rubric applies to.
Category string `json:"category"`
// Description explains what this category measures.
Description string `json:"description"`
// Anchors define what each score level means.
// Each anchor covers either a specific score (e.g., 10, 7, 5, 3, 1) or a range (e.g., 8-10).
Anchors []ScoreAnchor `json:"anchors"`
// Examples provide sample inputs/outputs for each score level.
Examples []RubricExample `json:"examples,omitempty"`
}
Rubric defines the scoring criteria for an evaluation category. It provides explicit anchors for what each score level means, improving consistency and reproducibility of LLM-as-Judge evaluations.
func (*Rubric) AddAnchor ¶ added in v0.2.0
func (r *Rubric) AddAnchor(score float64, label, description string, criteria ...string) *Rubric
AddAnchor adds a score anchor to the rubric.
func (*Rubric) AddExample ¶ added in v0.2.0
func (r *Rubric) AddExample(score float64, output, explanation string) *Rubric
AddExample adds an example to the rubric.
func (*Rubric) AddRangeAnchor ¶ added in v0.2.0
func (r *Rubric) AddRangeAnchor(minScore, maxScore float64, label, description string, criteria ...string) *Rubric
AddRangeAnchor adds a range-based score anchor.
func (*Rubric) GetAnchorForScore ¶ added in v0.2.0
func (r *Rubric) GetAnchorForScore(score float64) *ScoreAnchor
GetAnchorForScore returns the anchor that applies to the given score.
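Example (illustrative sketch, not shipped with the package). It defines a rubric with range anchors and an example, then resolves the anchor for a given score; the labels, descriptions, and criteria are placeholders:

func ExampleRubric() {
	r := &Rubric{
		Category:    "clarity",
		Description: "How clearly the document states goals and requirements",
	}
	r.AddRangeAnchor(8, 10, "Excellent", "Goals are specific, measurable, and unambiguous",
		"Every requirement is testable",
		"No undefined terms").
		AddRangeAnchor(5, 7.9, "Adequate", "Mostly clear with some vague requirements").
		AddRangeAnchor(0, 4.9, "Poor", "Goals are vague or contradictory").
		AddExample(9, "Checkout must complete in under 2 seconds at p95", "Specific and measurable")

	if anchor := r.GetAnchorForScore(8.5); anchor != nil {
		_ = anchor.Label // "Excellent" under the anchors defined above
	}
}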
type RubricExample ¶ added in v0.2.0
type RubricExample struct {
// Score is the score this example demonstrates.
Score float64 `json:"score"`
// Input is the example input/prompt.
Input string `json:"input,omitempty"`
// Output is the example output being scored.
Output string `json:"output"`
// Explanation describes why this output receives this score.
Explanation string `json:"explanation"`
}
RubricExample provides a concrete example for a score level.
type RubricSet ¶ added in v0.2.0
type RubricSet struct {
// ID is the unique identifier for this rubric set.
ID string `json:"id"`
// Name is the display name.
Name string `json:"name"`
// Version tracks rubric iterations.
Version string `json:"version"`
// Description explains what this rubric set evaluates.
Description string `json:"description,omitempty"`
// Rubrics are the category-specific rubrics.
Rubrics []Rubric `json:"rubrics"`
}
RubricSet is a collection of rubrics for a complete evaluation.
func DefaultPRDRubricSet ¶ added in v0.2.0
func DefaultPRDRubricSet() *RubricSet
DefaultPRDRubricSet returns a standard rubric set for PRD evaluation.
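Example (illustrative sketch, not shipped with the package). It wires the default PRD rubric set into a report and its judge metadata; the model name and document path are placeholders:

func ExampleDefaultPRDRubricSet() {
	rs := DefaultPRDRubricSet()

	judge := NewJudgeMetadata("claude-3-opus-20240229").WithRubric(rs.ID, rs.Version)

	r := NewEvaluationReport("prd", "docs/checkout-prd.md") // placeholder document
	r.SetJudge(judge)
	r.SetRubric(rs.ID)
}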
type ScoreAnchor ¶ added in v0.2.0
type ScoreAnchor struct {
// Score is the numeric score this anchor represents.
// Use -1 for range-based anchors where MinScore/MaxScore are set.
Score float64 `json:"score,omitempty"`
// MinScore is the minimum score for range-based anchors.
MinScore float64 `json:"min_score,omitempty"`
// MaxScore is the maximum score for range-based anchors.
MaxScore float64 `json:"max_score,omitempty"`
// Label is a short name for this level (e.g., "Excellent", "Good", "Poor").
Label string `json:"label"`
// Description explains what qualifies for this score.
Description string `json:"description"`
// Criteria are specific requirements for this score level.
Criteria []string `json:"criteria,omitempty"`
}
ScoreAnchor defines the criteria for a specific score or score range.
type ScoreStatus ¶
type ScoreStatus string
ScoreStatus represents the pass/warn/fail status for a category score.
const (
	ScoreStatusPass          ScoreStatus = "pass"              // Score >= 7.0
	ScoreStatusWarn          ScoreStatus = "warn"              // Score >= 5.0 && < 7.0
	ScoreStatusFail          ScoreStatus = "fail"              // Score < 5.0
	CategoryPending          ScoreStatus = "pending"           // Not yet evaluated
	CategoryNeedsImprovement ScoreStatus = "needs_improvement" // Requires attention
)
func (ScoreStatus) Icon ¶
func (s ScoreStatus) Icon() string
Icon returns the emoji icon for the score status.
type Severity ¶
type Severity string
Severity represents the severity level of a finding. Based on InfoSec severity classifications.
func AllSeverities ¶
func AllSeverities() []Severity
AllSeverities returns all severity levels in order of severity.
func (Severity) IsBlocking ¶
func (s Severity) IsBlocking() bool
IsBlocking returns true if this severity blocks approval.
type TokenUsage ¶ added in v0.2.0
type TokenUsage struct {
// InputTokens is the number of input/prompt tokens.
InputTokens int `json:"input_tokens"`
// OutputTokens is the number of output/completion tokens.
OutputTokens int `json:"output_tokens"`
// TotalTokens is the total tokens used.
TotalTokens int `json:"total_tokens"`
// CacheReadTokens is tokens read from cache (if applicable).
CacheReadTokens int `json:"cache_read_tokens,omitempty"`
// CacheWriteTokens is tokens written to cache (if applicable).
CacheWriteTokens int `json:"cache_write_tokens,omitempty"`
}
TokenUsage tracks token consumption for an evaluation.