Documentation ¶
Overview ¶
Package llm provides LLM-based evaluation metrics.
These metrics use an LLM to evaluate the quality of outputs. They are more expensive than heuristic metrics but can evaluate subjective qualities like relevance, helpfulness, and coherence.
Provider Interface ¶
All LLM metrics require a Provider implementation. The package provides:
- SimpleProvider: Wraps a completion function
- MockProvider: For testing
- CachingProvider: Wraps another provider with caching
Available Metrics ¶
- GEval: G-EVAL framework with chain-of-thought evaluation
- AnswerRelevance: How relevant an answer is to the question
- Hallucination: Detects fabricated information
- ContextRecall: How well the response uses provided context
- ContextPrecision: Whether the response sticks to the provided context
- Moderation: Content policy violation detection
- Factuality: Factual accuracy evaluation
- Coherence: Logical coherence assessment
- Helpfulness: How helpful the response is
- CustomJudge: Create metrics with custom prompts
Usage Example ¶
// Create a provider (see integrations package for OpenAI, Anthropic)
provider := llm.NewSimpleProvider("custom", "model-name", func(ctx context.Context, req llm.CompletionRequest) (*llm.CompletionResponse, error) {
// Call your LLM API here
return &llm.CompletionResponse{Content: "..."}, nil
})
// Create metrics
relevance := llm.NewAnswerRelevance(provider)
hallucination := llm.NewHallucination(provider)
// Evaluate
input := evaluation.NewMetricInput("What is 2+2?", "The answer is 4.").WithExpected("4")
relevanceResult := relevance.Score(ctx, input)
hallucinationResult := hallucination.Score(ctx, input)
Custom Judge Example ¶
judge := llm.NewCustomJudge("tone_check", `
Evaluate whether the following response maintains a professional tone.
User message: {{input}}
AI response: {{output}}
Return your response in JSON format:
{"score": <0.0-1.0>, "reason": "<explanation>"}
`, provider)
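Continuing the example above, the custom judge then scores like any built-in metric:

result := judge.Score(ctx, input)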
Index ¶
- func FormatPromptTemplate(template string, vars map[string]string) string
- func ParseJSONResponse(response string, v any) error
- func ParseReasonFromResponse(response string) string
- func ParseScoreFromResponse(response string) (float64, error)
- type AnswerRelevance
- type BaseJudge
- type CachingProvider
- type Coherence
- type CompletionRequest
- type CompletionResponse
- type ContextPrecision
- type ContextRecall
- type CustomJudge
- type Factuality
- type GEval
- type Hallucination
- type Helpfulness
- type JudgeOption
- type Message
- type MockProvider
- type Moderation
- type Provider
- type ProviderOption
- type ScoreResponse
- type SimpleProvider
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func FormatPromptTemplate ¶
FormatPromptTemplate formats a prompt template with variable substitution.
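A minimal sketch, assuming the {{name}} placeholder syntax shown in the CustomJudge example above:

prompt := llm.FormatPromptTemplate(
	"User message: {{input}}\nAI response: {{output}}",
	map[string]string{"input": "What is 2+2?", "output": "The answer is 4."},
)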
func ParseJSONResponse ¶
ParseJSONResponse extracts JSON from an LLM response.
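A minimal sketch; that the function tolerates prose surrounding the JSON is an assumption based on "extracts JSON":

var parsed struct {
	Score  float64 `json:"score"`
	Reason string  `json:"reason"`
}
raw := `Here is my evaluation: {"score": 0.9, "reason": "on topic"}`
if err := llm.ParseJSONResponse(raw, &parsed); err != nil {
	// handle a malformed response
}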
func ParseReasonFromResponse ¶
ParseReasonFromResponse extracts a reason/explanation from an LLM response.
func ParseScoreFromResponse ¶
ParseScoreFromResponse extracts a numeric score from an LLM response.
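For example, given a response containing a numeric score:

score, err := llm.ParseScoreFromResponse(`{"score": 0.85, "reason": "mostly relevant"}`)
if err != nil {
	// fall back or retry
}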
Types ¶
type AnswerRelevance ¶
type AnswerRelevance struct {
*BaseJudge
}
AnswerRelevance evaluates how relevant an answer is to the question.
func NewAnswerRelevance ¶
func NewAnswerRelevance(provider Provider, opts ...JudgeOption) *AnswerRelevance
NewAnswerRelevance creates a new AnswerRelevance metric.
func (*AnswerRelevance) Score ¶
func (m *AnswerRelevance) Score(ctx context.Context, input evaluation.MetricInput) *evaluation.ScoreResult
Score evaluates answer relevance.
type BaseJudge ¶
type BaseJudge struct {
evaluation.BaseMetric
// contains filtered or unexported fields
}
BaseJudge provides common functionality for LLM-based evaluation metrics.
func NewBaseJudge ¶
func NewBaseJudge(name string, provider Provider, opts ...JudgeOption) *BaseJudge
NewBaseJudge creates a new base judge.
type CachingProvider ¶
type CachingProvider struct {
// contains filtered or unexported fields
}
CachingProvider wraps a provider with response caching.
func NewCachingProvider ¶
func NewCachingProvider(inner Provider) *CachingProvider
NewCachingProvider creates a caching wrapper around a provider.
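Wrapping a provider is a one-liner; repeated identical requests are then served from cache, which helps when several metrics score the same input:

cached := llm.NewCachingProvider(provider)
relevance := llm.NewAnswerRelevance(cached)
coherence := llm.NewCoherence(cached)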
func (*CachingProvider) Complete ¶
func (p *CachingProvider) Complete(ctx context.Context, req CompletionRequest) (*CompletionResponse, error)
Complete returns cached response or calls inner provider.
func (*CachingProvider) DefaultModel ¶
func (p *CachingProvider) DefaultModel() string
DefaultModel returns the inner provider's default model.
func (*CachingProvider) Name ¶
func (p *CachingProvider) Name() string
Name returns the inner provider name.
type Coherence ¶
type Coherence struct {
*BaseJudge
}
Coherence evaluates the logical coherence of a response.
func NewCoherence ¶
func NewCoherence(provider Provider, opts ...JudgeOption) *Coherence
NewCoherence creates a new Coherence metric.
func (*Coherence) Score ¶
func (m *Coherence) Score(ctx context.Context, input evaluation.MetricInput) *evaluation.ScoreResult
Score evaluates coherence.
type CompletionRequest ¶
type CompletionRequest struct {
Messages []Message `json:"messages"`
Model string `json:"model,omitempty"`
Temperature float64 `json:"temperature,omitempty"`
MaxTokens int `json:"max_tokens,omitempty"`
}
CompletionRequest represents a request for chat completion.
type CompletionResponse ¶
type CompletionResponse struct {
Content string `json:"content"`
Model string `json:"model,omitempty"`
PromptTokens int `json:"prompt_tokens,omitempty"`
OutputTokens int `json:"output_tokens,omitempty"`
}
CompletionResponse represents a chat completion response.
type ContextPrecision ¶
type ContextPrecision struct {
*BaseJudge
}
ContextPrecision evaluates whether the response sticks to the context.
func NewContextPrecision ¶
func NewContextPrecision(provider Provider, opts ...JudgeOption) *ContextPrecision
NewContextPrecision creates a new ContextPrecision metric.
func (*ContextPrecision) Score ¶
func (m *ContextPrecision) Score(ctx context.Context, input evaluation.MetricInput) *evaluation.ScoreResult
Score evaluates context precision.
type ContextRecall ¶
type ContextRecall struct {
*BaseJudge
}
ContextRecall evaluates how well the response uses the provided context.
func NewContextRecall ¶
func NewContextRecall(provider Provider, opts ...JudgeOption) *ContextRecall
NewContextRecall creates a new ContextRecall metric.
func (*ContextRecall) Score ¶
func (m *ContextRecall) Score(ctx context.Context, input evaluation.MetricInput) *evaluation.ScoreResult
Score evaluates context recall.
type CustomJudge ¶
type CustomJudge struct {
*BaseJudge
// contains filtered or unexported fields
}
CustomJudge allows creating metrics with custom prompts.
func NewCustomJudge ¶
func NewCustomJudge(name, promptTemplate string, provider Provider, opts ...JudgeOption) *CustomJudge
NewCustomJudge creates a custom judge metric.
func (*CustomJudge) Score ¶
func (m *CustomJudge) Score(ctx context.Context, input evaluation.MetricInput) *evaluation.ScoreResult
Score evaluates using the custom prompt.
type Factuality ¶
type Factuality struct {
*BaseJudge
}
Factuality evaluates factual accuracy of responses.
func NewFactuality ¶
func NewFactuality(provider Provider, opts ...JudgeOption) *Factuality
NewFactuality creates a new Factuality metric.
func (*Factuality) Score ¶
func (m *Factuality) Score(ctx context.Context, input evaluation.MetricInput) *evaluation.ScoreResult
Score evaluates factual accuracy.
type GEval ¶
type GEval struct {
*BaseJudge
// contains filtered or unexported fields
}
GEval implements the G-EVAL framework for LLM evaluation. It uses chain-of-thought prompting to evaluate outputs.
func NewGEval ¶
func NewGEval(provider Provider, criteria string, opts ...JudgeOption) *GEval
NewGEval creates a new G-EVAL metric.
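A sketch with free-form criteria (the criteria string here is illustrative):

geval := llm.NewGEval(provider, "Evaluate the response for conciseness and clarity.")
result := geval.Score(ctx, input)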
func (*GEval) Score ¶
func (g *GEval) Score(ctx context.Context, input evaluation.MetricInput) *evaluation.ScoreResult
Score evaluates using G-EVAL.
func (*GEval) WithEvaluationSteps ¶
func (g *GEval) WithEvaluationSteps(steps []string) *GEval
WithEvaluationSteps adds custom evaluation steps.
type Hallucination ¶
type Hallucination struct {
*BaseJudge
}
Hallucination detects hallucinations in LLM outputs.
func NewHallucination ¶
func NewHallucination(provider Provider, opts ...JudgeOption) *Hallucination
NewHallucination creates a new Hallucination detection metric.
func (*Hallucination) Score ¶
func (m *Hallucination) Score(ctx context.Context, input evaluation.MetricInput) *evaluation.ScoreResult
Score detects hallucinations (higher score = more hallucination detected).
type Helpfulness ¶
type Helpfulness struct {
*BaseJudge
}
Helpfulness evaluates how helpful a response is.
func NewHelpfulness ¶
func NewHelpfulness(provider Provider, opts ...JudgeOption) *Helpfulness
NewHelpfulness creates a new Helpfulness metric.
func (*Helpfulness) Score ¶
func (m *Helpfulness) Score(ctx context.Context, input evaluation.MetricInput) *evaluation.ScoreResult
Score evaluates helpfulness.
type JudgeOption ¶
type JudgeOption func(*BaseJudge)
JudgeOption configures a judge metric.
func WithJudgeModel ¶
func WithJudgeModel(model string) JudgeOption
WithJudgeModel sets the model for the judge.
func WithJudgeTemperature ¶
func WithJudgeTemperature(temp float64) JudgeOption
WithJudgeTemperature sets the temperature for the judge.
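Options apply to any judge constructor; for example, pinning the judge to a specific model with deterministic sampling:

factuality := llm.NewFactuality(provider,
	llm.WithJudgeModel("model-name"),
	llm.WithJudgeTemperature(0),
)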
type Message ¶
type Message struct {
Role string `json:"role"` // "system", "user", "assistant"
Content string `json:"content"`
}
Message represents a chat message.
type MockProvider ¶
type MockProvider struct {
// contains filtered or unexported fields
}
MockProvider is a provider that returns predefined responses for testing.
func NewMockProvider ¶
func NewMockProvider(responses map[string]string, defaultResp string) *MockProvider
NewMockProvider creates a mock provider for testing.
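A testing sketch; how the map keys are matched against prompts is not documented here, so the key below is illustrative:

mock := llm.NewMockProvider(map[string]string{
	"relevant": `{"score": 0.9, "reason": "on topic"}`,
}, `{"score": 0.5, "reason": "default"}`)
metric := llm.NewAnswerRelevance(mock)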
func (*MockProvider) Complete ¶
func (p *MockProvider) Complete(ctx context.Context, req CompletionRequest) (*CompletionResponse, error)
Complete returns a predefined response.
func (*MockProvider) DefaultModel ¶
func (p *MockProvider) DefaultModel() string
DefaultModel returns the default model.
type Moderation ¶
type Moderation struct {
*BaseJudge
// contains filtered or unexported fields
}
Moderation evaluates content for policy violations.
func NewModeration ¶
func NewModeration(provider Provider, opts ...JudgeOption) *Moderation
NewModeration creates a new Moderation metric.
func (*Moderation) Score ¶
func (m *Moderation) Score(ctx context.Context, input evaluation.MetricInput) *evaluation.ScoreResult
Score evaluates content moderation (higher = more violations).
func (*Moderation) WithCategories ¶
func (m *Moderation) WithCategories(categories []string) *Moderation
WithCategories sets custom moderation categories.
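For example, restricting moderation to specific categories (the category names are illustrative):

moderation := llm.NewModeration(provider).
	WithCategories([]string{"harassment", "medical-advice"})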
type Provider ¶
type Provider interface {
// Complete sends a completion request and returns the response.
Complete(ctx context.Context, req CompletionRequest) (*CompletionResponse, error)
// Name returns the provider name (e.g., "openai", "anthropic").
Name() string
// DefaultModel returns the default model for this provider.
DefaultModel() string
}
Provider is an interface for LLM providers used in evaluation.
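Any type with these three methods satisfies the interface; a minimal stub:

type stubProvider struct{}

func (stubProvider) Complete(ctx context.Context, req llm.CompletionRequest) (*llm.CompletionResponse, error) {
	return &llm.CompletionResponse{Content: `{"score": 1.0, "reason": "stub"}`}, nil
}

func (stubProvider) Name() string         { return "stub" }
func (stubProvider) DefaultModel() string { return "stub-model" }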
type ProviderOption ¶
type ProviderOption func(*providerConfig)
ProviderOption configures a provider.
func WithMaxTokens ¶
func WithMaxTokens(max int) ProviderOption
WithMaxTokens sets the maximum tokens for generation.
func WithTemperature ¶
func WithTemperature(temp float64) ProviderOption
WithTemperature sets the temperature for generation.
type ScoreResponse ¶
type ScoreResponse struct {
	Score  float64 `json:"score"`
	Reason string  `json:"reason"`
}
ScoreResponse represents a structured scoring response.
func ParseScoreResponse ¶
func ParseScoreResponse(response string) (*ScoreResponse, error)
ParseScoreResponse parses a JSON score response.
func ScoreWithRetry ¶
func ScoreWithRetry(ctx context.Context, j *BaseJudge, messages []Message, maxRetries int) (*ScoreResponse, error)
ScoreWithRetry attempts to score with retries on failure.
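A sketch, assuming a judge built with NewBaseJudge and an already-formatted prompt string:

judge := llm.NewBaseJudge("tone_check", provider)
resp, err := llm.ScoreWithRetry(ctx, judge, []llm.Message{
	{Role: "user", Content: prompt},
}, 3)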
type SimpleProvider ¶
type SimpleProvider struct {
// contains filtered or unexported fields
}
SimpleProvider is a basic provider implementation using a function.
func NewSimpleProvider ¶
func NewSimpleProvider(name, defaultModel string, fn func(ctx context.Context, req CompletionRequest) (*CompletionResponse, error)) *SimpleProvider
NewSimpleProvider creates a provider from a completion function.
func (*SimpleProvider) Complete ¶
func (p *SimpleProvider) Complete(ctx context.Context, req CompletionRequest) (*CompletionResponse, error)
Complete sends a completion request.
func (*SimpleProvider) DefaultModel ¶
func (p *SimpleProvider) DefaultModel() string
DefaultModel returns the default model.
func (*SimpleProvider) Name ¶
func (p *SimpleProvider) Name() string
Name returns the provider name.