Documentation
¶
Overview ¶
Package judge provides LLM-as-Judge evaluation for API style rules.
This package integrates with structured-evaluation to enable AI-powered assessment of API specifications beyond what deterministic linting can achieve.
Key concepts:
- Evaluator: The main interface for running LLM evaluations
- RubricSet: A collection of evaluation criteria built from APIStyleSpec rules
- EvaluationReport: The outcome of evaluating an API spec against criteria
Usage:
spec, _ := profile.Load("azure")
evaluator := judge.NewClaudeEvaluator(provider, spec)
report, err := evaluator.Evaluate(ctx, specBytes, judge.DefaultOptions())
The evaluator builds prompts from rule JudgeCriteria and uses an LLM provider to assess compliance. Results include scores, reasoning, and specific findings mapped back to rule IDs.
Index ¶
- Constants
- Variables
- func AggregateScores(scores []float64, weights []float64) float64
- func ScoreToStatus(score float64) types.Status
- func TruncateSpec(content string, maxChars int) (string, bool)
- type AnthropicProvider
- type CategoryEvaluationResponse
- type CategoryResult
- type ClaudeEvaluator
- func (e *ClaudeEvaluator) Evaluate(ctx context.Context, specBytes []byte, opts *Options) (*EvaluationReport, error)
- func (e *ClaudeEvaluator) EvaluateCategory(ctx context.Context, specBytes []byte, category string, opts *Options) (*CategoryResult, error)
- func (e *ClaudeEvaluator) RubricSet() *RubricSet
- type CompletionRequest
- type CompletionResponse
- type Criterion
- type EvaluationReport
- type EvaluationSummary
- type Evaluator
- type Finding
- type FindingResponse
- type Options
- type PromptBuilder
- func (pb *PromptBuilder) BuildBatchEvaluation(rubricSet *RubricSet, specContent string) string
- func (pb *PromptBuilder) BuildCategoryEvaluation(category string, criteria []*Criterion, specContent string) string
- func (pb *PromptBuilder) BuildSingleEvaluation(criterion *Criterion, specContent string) string
- type Provider
- type ProviderConfig
- type ReportMetadata
- type RubricSet
- type SingleEvaluationResponse
- type TokenUsage
Constants ¶
const ( ModelClaudeOpus = "claude-opus-4-20250514" ModelClaudeSonnet = "claude-sonnet-4-20250514" ModelClaudeHaiku = "claude-3-5-haiku-20241022" )
Model constants for Anthropic Claude models.
Variables ¶
var ErrNoJSONFound = errors.New("no JSON block found in response")
ErrNoJSONFound indicates no JSON block was found in the response.
Functions ¶
func AggregateScores ¶
AggregateScores computes an aggregate score from multiple scores with weights.
func ScoreToStatus ¶
ScoreToStatus converts a numeric score to a pass/fail status.
Types ¶
type AnthropicProvider ¶
type AnthropicProvider struct {
// contains filtered or unexported fields
}
AnthropicProvider implements Provider using the Anthropic Claude HTTP API. It calls the HTTP API directly to avoid SDK dependency conflicts.
func NewAnthropicProvider ¶
func NewAnthropicProvider(apiKey string, config *ProviderConfig) *AnthropicProvider
NewAnthropicProvider creates a new Anthropic provider. If apiKey is empty, it reads the key from the ANTHROPIC_API_KEY environment variable.
func (*AnthropicProvider) Complete ¶
func (p *AnthropicProvider) Complete(ctx context.Context, req *CompletionRequest) (*CompletionResponse, error)
Complete sends a completion request to the Claude API.
func (*AnthropicProvider) DefaultModel ¶
func (p *AnthropicProvider) DefaultModel() string
DefaultModel returns the configured default model.
func (*AnthropicProvider) Name ¶
func (p *AnthropicProvider) Name() string
Name returns the provider name.
func (*AnthropicProvider) SetDefaultModel ¶
func (p *AnthropicProvider) SetDefaultModel(model string)
SetDefaultModel updates the default model.
type CategoryEvaluationResponse ¶
type CategoryEvaluationResponse struct {
Findings []FindingResponse `json:"findings"`
}
CategoryEvaluationResponse is the expected JSON structure for category evaluation.
type CategoryResult ¶
type CategoryResult struct {
// Name is the category identifier.
Name string `json:"name"`
// Score is the category's aggregate score (0.0-1.0).
Score float64 `json:"score"`
// Findings contains rule evaluations in this category.
Findings []Finding `json:"findings"`
}
CategoryResult contains evaluation results for a single category.
type ClaudeEvaluator ¶
type ClaudeEvaluator struct {
// contains filtered or unexported fields
}
ClaudeEvaluator implements Evaluator using the Anthropic Claude API.
func NewClaudeEvaluator ¶
func NewClaudeEvaluator(provider Provider, spec *types.APIStyleSpec) *ClaudeEvaluator
NewClaudeEvaluator creates a new evaluator with a provider and style spec.
func NewClaudeEvaluatorWithRubric ¶
func NewClaudeEvaluatorWithRubric(provider Provider, rubricSet *RubricSet) *ClaudeEvaluator
NewClaudeEvaluatorWithRubric creates an evaluator with a pre-built rubric set.
func (*ClaudeEvaluator) Evaluate ¶
func (e *ClaudeEvaluator) Evaluate(ctx context.Context, specBytes []byte, opts *Options) (*EvaluationReport, error)
Evaluate assesses an API specification against all criteria.
func (*ClaudeEvaluator) EvaluateCategory ¶
func (e *ClaudeEvaluator) EvaluateCategory(ctx context.Context, specBytes []byte, category string, opts *Options) (*CategoryResult, error)
EvaluateCategory evaluates a single category of rules.
func (*ClaudeEvaluator) RubricSet ¶
func (e *ClaudeEvaluator) RubricSet() *RubricSet
RubricSet returns the underlying rubric set.
type CompletionRequest ¶
type CompletionRequest struct {
// SystemPrompt sets the system/context instruction.
SystemPrompt string
// UserPrompt is the main user message/question.
UserPrompt string
// Model specifies the model to use (provider-specific).
Model string
// Temperature controls randomness (0.0-1.0).
Temperature float64
// MaxTokens limits response length.
MaxTokens int
// StopSequences are strings that stop generation.
StopSequences []string
// Metadata is optional context for logging/tracing.
Metadata map[string]string
}
CompletionRequest contains parameters for an LLM completion.
type CompletionResponse ¶
type CompletionResponse struct {
// Content is the generated text.
Content string
// Model is the model that was used.
Model string
// Usage contains token usage statistics.
Usage *TokenUsage
// FinishReason indicates why generation stopped.
FinishReason string
}
CompletionResponse contains the LLM's response.
type Criterion ¶
type Criterion struct {
// RuleID links back to the source rule.
RuleID string
// RuleTitle is the rule's display name.
RuleTitle string
// Category is the rule's category for grouping.
Category string
// Prompt is the evaluation instruction for the LLM.
Prompt string
// Weight influences scoring (0.0-1.0, default 1.0).
Weight float64
// Severity is the rule's severity level.
Severity types.Severity
// RequiresContext indicates if broader context is needed.
RequiresContext bool
// Examples from the rule (if available).
GoodExamples []string
BadExamples []string
// Rationale from the rule (if available).
Rationale string
// References from the rule.
References []types.Reference
}
Criterion defines how a single rule should be evaluated by an LLM.
type EvaluationReport ¶
type EvaluationReport struct {
// Status indicates overall evaluation outcome.
Status types.Status `json:"status"`
// Summary provides aggregate statistics.
Summary EvaluationSummary `json:"summary"`
// Categories contains results grouped by category.
Categories []CategoryResult `json:"categories"`
// Findings contains all individual rule evaluations.
Findings []Finding `json:"findings"`
// Metadata contains evaluation context.
Metadata ReportMetadata `json:"metadata"`
}
EvaluationReport contains the full results of an LLM evaluation.
func NewEvaluationReport ¶
func NewEvaluationReport() *EvaluationReport
NewEvaluationReport creates a new empty evaluation report.
func (*EvaluationReport) AddFinding ¶
func (r *EvaluationReport) AddFinding(f Finding)
AddFinding adds a finding and updates summary statistics.
func (*EvaluationReport) CalculateScores ¶
func (r *EvaluationReport) CalculateScores()
CalculateScores computes final scores after all findings are added.
func (*EvaluationReport) HasCriticalFailures ¶
func (r *EvaluationReport) HasCriticalFailures() bool
HasCriticalFailures returns true if any error-severity rules failed.
func (*EvaluationReport) HasFailures ¶
func (r *EvaluationReport) HasFailures() bool
HasFailures returns true if any findings failed.
type EvaluationSummary ¶
type EvaluationSummary struct {
// TotalRules is the number of rules evaluated.
TotalRules int `json:"totalRules"`
// PassedRules is the number of rules that passed.
PassedRules int `json:"passedRules"`
// FailedRules is the number of rules that failed.
FailedRules int `json:"failedRules"`
// SkippedRules is the number of rules skipped.
SkippedRules int `json:"skippedRules"`
// OverallScore is the weighted average score (0.0-1.0).
OverallScore float64 `json:"overallScore"`
// CategoryScores maps category names to their scores.
CategoryScores map[string]float64 `json:"categoryScores,omitempty"`
}
EvaluationSummary provides aggregate statistics.
type Evaluator ¶
type Evaluator interface {
// Evaluate assesses an API specification against the configured rubric.
Evaluate(ctx context.Context, specBytes []byte, opts *Options) (*EvaluationReport, error)
// EvaluateCategory evaluates a single category of rules.
EvaluateCategory(ctx context.Context, specBytes []byte, category string, opts *Options) (*CategoryResult, error)
}
Evaluator runs LLM-based evaluation of API specifications.
type Finding ¶
type Finding struct {
// RuleID is the evaluated rule's identifier.
RuleID string `json:"ruleId"`
// RuleTitle is the rule's display name.
RuleTitle string `json:"ruleTitle"`
// Category is the rule's category.
Category string `json:"category"`
// Score is the evaluation score (0.0-1.0).
Score float64 `json:"score"`
// Passed indicates if the rule passed (score >= 0.5).
Passed bool `json:"passed"`
// Reasoning explains the evaluation decision.
Reasoning string `json:"reasoning,omitempty"`
// Examples are specific instances found in the spec.
Examples []string `json:"examples,omitempty"`
// Suggestions provides improvement recommendations.
Suggestions []string `json:"suggestions,omitempty"`
// Locations identifies paths in the spec related to findings.
Locations []string `json:"locations,omitempty"`
// Severity reflects the rule's configured severity.
Severity types.Severity `json:"severity"`
// Weight is the rule's evaluation weight.
Weight float64 `json:"weight"`
}
Finding represents an individual rule evaluation result.
func ParseCategoryEvaluation ¶
ParseCategoryEvaluation extracts multiple findings from an LLM response.
type FindingResponse ¶
type FindingResponse struct {
RuleID string `json:"ruleId"`
Score float64 `json:"score"`
Passed bool `json:"passed"`
Reasoning string `json:"reasoning"`
Examples []string `json:"examples"`
Suggestions []string `json:"suggestions"`
Locations []string `json:"locations"`
}
FindingResponse is the expected JSON structure for individual findings.
type Options ¶
type Options struct {
// Categories limits evaluation to specific categories.
// If empty, all categories are evaluated.
Categories []string
// RuleIDs limits evaluation to specific rules.
// If empty, all rules with Judge criteria are evaluated.
RuleIDs []string
// FileName is the name of the spec file (for context in prompts).
FileName string
// IncludeReasoning enables detailed reasoning in results.
IncludeReasoning bool
// MaxConcurrency limits parallel LLM calls (default: 1).
MaxConcurrency int
// Model specifies the LLM model to use.
// Provider-specific (e.g., ModelClaudeHaiku, "claude-3-5-haiku-20241022", for Anthropic).
Model string
// Temperature controls response randomness (0.0-1.0).
Temperature float64
// MaxTokens limits response length.
MaxTokens int
}
Options configures the evaluation behavior.
func DefaultOptions ¶
func DefaultOptions() *Options
DefaultOptions returns options with sensible defaults.
type PromptBuilder ¶
type PromptBuilder struct {
// SystemPrompt is the base system instruction.
SystemPrompt string
}
PromptBuilder constructs evaluation prompts for LLM evaluation.
func NewPromptBuilder ¶
func NewPromptBuilder() *PromptBuilder
NewPromptBuilder creates a new prompt builder with default system prompt.
func (*PromptBuilder) BuildBatchEvaluation ¶
func (pb *PromptBuilder) BuildBatchEvaluation(rubricSet *RubricSet, specContent string) string
BuildBatchEvaluation creates a prompt for evaluating all rules at once. Use sparingly - can hit token limits on large specs.
func (*PromptBuilder) BuildCategoryEvaluation ¶
func (pb *PromptBuilder) BuildCategoryEvaluation(category string, criteria []*Criterion, specContent string) string
BuildCategoryEvaluation creates a prompt for evaluating multiple criteria in a category.
func (*PromptBuilder) BuildSingleEvaluation ¶
func (pb *PromptBuilder) BuildSingleEvaluation(criterion *Criterion, specContent string) string
BuildSingleEvaluation creates a prompt for evaluating a single criterion.
type Provider ¶
type Provider interface {
// Complete sends a completion request to the LLM.
Complete(ctx context.Context, req *CompletionRequest) (*CompletionResponse, error)
// Name returns the provider name (e.g., "anthropic", "openai").
Name() string
// DefaultModel returns the provider's default model.
DefaultModel() string
}
Provider defines the interface for LLM providers.
type ProviderConfig ¶
type ProviderConfig struct {
// APIKey is the authentication key.
APIKey string
// BaseURL overrides the default API endpoint.
BaseURL string
// DefaultModel sets the default model for requests.
DefaultModel string
// Timeout is the time limit for API calls, in seconds.
Timeout int
// MaxRetries is the maximum number of retries for failed requests.
MaxRetries int
// RetryDelay is the delay between retries, in milliseconds.
RetryDelay int
}
ProviderConfig holds common provider configuration.
func DefaultProviderConfig ¶
func DefaultProviderConfig() *ProviderConfig
DefaultProviderConfig returns a config with sensible defaults.
type ReportMetadata ¶
type ReportMetadata struct {
// FileName is the evaluated spec file name.
FileName string `json:"fileName,omitempty"`
// ProfileName is the style profile used.
ProfileName string `json:"profileName,omitempty"`
// Model is the LLM model used.
Model string `json:"model,omitempty"`
// Duration is the total evaluation time.
Duration string `json:"duration,omitempty"`
// Timestamp is when the evaluation was performed.
Timestamp string `json:"timestamp,omitempty"`
}
ReportMetadata contains evaluation context information.
type RubricSet ¶
type RubricSet struct {
// Name is the rubric set identifier (from spec name).
Name string
// Criteria contains all evaluation criteria keyed by rule ID.
Criteria map[string]*Criterion
// Categories groups criteria by category name.
Categories map[string][]*Criterion
}
RubricSet is a collection of evaluation criteria built from an APIStyleSpec.
func BuildRubricSet ¶
func BuildRubricSet(spec *types.APIStyleSpec) *RubricSet
BuildRubricSet creates a RubricSet from an APIStyleSpec. Only rules with JudgeCriteria are included.
func (*RubricSet) AllCriteria ¶
AllCriteria returns all criteria as a slice.
func (*RubricSet) CategoryNames ¶
CategoryNames returns all category names in the rubric set.
func (*RubricSet) FilterByCategory ¶
FilterByCategory returns criteria for a specific category.
func (*RubricSet) FilterByRuleIDs ¶
FilterByRuleIDs returns criteria for specific rule IDs.
type SingleEvaluationResponse ¶
type SingleEvaluationResponse struct {
Score float64 `json:"score"`
Passed bool `json:"passed"`
Reasoning string `json:"reasoning"`
Examples []string `json:"examples"`
Suggestions []string `json:"suggestions"`
Locations []string `json:"locations"`
}
SingleEvaluationResponse is the expected JSON structure for single rule evaluation.
type TokenUsage ¶
type TokenUsage struct {
// InputTokens is the number of tokens in the request.
InputTokens int
// OutputTokens is the number of tokens in the response.
OutputTokens int
// TotalTokens is the sum of input and output tokens.
TotalTokens int
}
TokenUsage tracks token consumption.