Documentation ¶
Overview ¶
Package llm provides LLM-based evaluation metrics.
These metrics use an LLM to evaluate the quality of outputs. They are more expensive than heuristic metrics but can evaluate subjective qualities like relevance, helpfulness, and coherence.
Provider Interface ¶
All LLM metrics require a Provider implementation. The package provides:
- SimpleProvider: Wraps a completion function
- MockProvider: For testing
- CachingProvider: Wraps another provider with caching
Available Metrics ¶
- GEval: G-EVAL framework with chain-of-thought evaluation
- AnswerRelevance: How relevant an answer is to the question
- Hallucination: Detects fabricated information
- ContextRecall: How well the response uses provided context
- ContextPrecision: Whether the response sticks to the provided context
- Moderation: Content policy violation detection
- Factuality: Factual accuracy evaluation
- Coherence: Logical coherence assessment
- Helpfulness: How helpful the response is
- CustomJudge: Create metrics with custom prompts
Usage Example ¶
// Create a provider (see integrations package for OpenAI, Anthropic)
provider := llm.NewSimpleProvider("custom", "model-name", func(ctx context.Context, req llm.CompletionRequest) (*llm.CompletionResponse, error) {
// Call your LLM API here
return &llm.CompletionResponse{Content: "..."}, nil
})
// Create metrics
relevance := llm.NewAnswerRelevance(provider)
hallucination := llm.NewHallucination(provider)
// Evaluate
input := evaluation.NewMetricInput("What is 2+2?", "The answer is 4.").WithExpected("4")
relevanceResult := relevance.Score(ctx, input)
hallucinationResult := hallucination.Score(ctx, input)
Custom Judge Example ¶
judge := llm.NewCustomJudge("tone_check", `
Evaluate whether the following response maintains a professional tone.
User message: {{input}}
AI response: {{output}}
Return your response in JSON format:
{"score": <0.0-1.0>, "reason": "<explanation>"}
`, provider)
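Continuing the example above, the custom judge then scores like any built-in metric:

result := judge.Score(ctx, input)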
Index ¶
- func FormatPromptTemplate(template string, vars map[string]string) string
- func ParseJSONResponse(response string, v any) error
- func ParseReasonFromResponse(response string) string
- func ParseScoreFromResponse(response string) (float64, error)
- type AnswerRelevance
- type BaseJudge
- type CachingProvider
- type Coherence
- type CompletionRequest
- type CompletionResponse
- type ContextPrecision
- type ContextRecall
- type CustomJudge
- type Factuality
- type GEval
- type Hallucination
- type Helpfulness
- type JudgeOption
- type Message
- type MockProvider
- type Moderation
- type Provider
- type ProviderOption
- type ScoreResponse
- type SimpleProvider
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func FormatPromptTemplate ¶
FormatPromptTemplate formats a prompt template with variable substitution.
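A minimal sketch, assuming the {{name}} placeholder syntax shown in the CustomJudge example above:

prompt := llm.FormatPromptTemplate(
	"User message: {{input}}\nAI response: {{output}}",
	map[string]string{"input": "What is 2+2?", "output": "The answer is 4."},
)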
func ParseJSONResponse ¶
ParseJSONResponse extracts JSON from an LLM response.
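A minimal sketch; that the function tolerates prose surrounding the JSON is an assumption based on "extracts JSON":

var parsed struct {
	Score  float64 `json:"score"`
	Reason string  `json:"reason"`
}
raw := `Here is my evaluation: {"score": 0.9, "reason": "on topic"}`
if err := llm.ParseJSONResponse(raw, &parsed); err != nil {
	// handle a malformed response
}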
func ParseReasonFromResponse ¶
ParseReasonFromResponse extracts a reason/explanation from an LLM response.
func ParseScoreFromResponse ¶
ParseScoreFromResponse extracts a numeric score from an LLM response.
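For example, given a response containing a numeric score:

score, err := llm.ParseScoreFromResponse(`{"score": 0.85, "reason": "mostly relevant"}`)
if err != nil {
	// fall back or retry
}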
Types ¶
type AnswerRelevance ¶
type AnswerRelevance struct {
*BaseJudge
}
AnswerRelevance evaluates how relevant an answer is to the question.
func NewAnswerRelevance ¶
func NewAnswerRelevance(provider Provider, opts ...JudgeOption) *AnswerRelevance
NewAnswerRelevance creates a new AnswerRelevance metric.
func (*AnswerRelevance) Score ¶
func (m *AnswerRelevance) Score(ctx context.Context, input evaluation.MetricInput) *evaluation.ScoreResult
Score evaluates answer relevance.
type BaseJudge ¶
type BaseJudge struct {
evaluation.BaseMetric
// contains filtered or unexported fields
}
BaseJudge provides common functionality for LLM-based evaluation metrics.
func NewBaseJudge ¶
func NewBaseJudge(name string, provider Provider, opts ...JudgeOption) *BaseJudge
NewBaseJudge creates a new base judge.
type CachingProvider ¶
type CachingProvider struct {
// contains filtered or unexported fields
}
CachingProvider wraps a provider with response caching.
func NewCachingProvider ¶
func NewCachingProvider(inner Provider) *CachingProvider
NewCachingProvider creates a caching wrapper around a provider.
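Wrapping a provider is a one-liner; repeated identical requests are then served from cache, which helps when several metrics score the same input:

cached := llm.NewCachingProvider(provider)
relevance := llm.NewAnswerRelevance(cached)
coherence := llm.NewCoherence(cached)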
func (*CachingProvider) Complete ¶
func (p *CachingProvider) Complete(ctx context.Context, req CompletionRequest) (*CompletionResponse, error)
Complete returns cached response or calls inner provider.
func (*CachingProvider) DefaultModel ¶
func (p *CachingProvider) DefaultModel() string
DefaultModel returns the inner provider's default model.
func (*CachingProvider) Name ¶
func (p *CachingProvider) Name() string
Name returns the inner provider name.
type Coherence ¶
type Coherence struct {
*BaseJudge
}
Coherence evaluates the logical coherence of a response.
func NewCoherence ¶
func NewCoherence(provider Provider, opts ...JudgeOption) *Coherence
NewCoherence creates a new Coherence metric.
func (*Coherence) Score ¶
func (m *Coherence) Score(ctx context.Context, input evaluation.MetricInput) *evaluation.ScoreResult
Score evaluates coherence.
type CompletionRequest ¶
type CompletionRequest struct {
Messages []Message `json:"messages"`
Model string `json:"model,omitempty"`
Temperature float64 `json:"temperature,omitempty"`
MaxTokens int `json:"max_tokens,omitempty"`
}
CompletionRequest represents a request for chat completion.
type CompletionResponse ¶
type CompletionResponse struct {
Content string `json:"content"`
Model string `json:"model,omitempty"`
PromptTokens int `json:"prompt_tokens,omitempty"`
OutputTokens int `json:"output_tokens,omitempty"`
}
CompletionResponse represents a chat completion response.
type ContextPrecision ¶
type ContextPrecision struct {
*BaseJudge
}
ContextPrecision evaluates whether the response sticks to the context.
func NewContextPrecision ¶
func NewContextPrecision(provider Provider, opts ...JudgeOption) *ContextPrecision
NewContextPrecision creates a new ContextPrecision metric.
func (*ContextPrecision) Score ¶
func (m *ContextPrecision) Score(ctx context.Context, input evaluation.MetricInput) *evaluation.ScoreResult
Score evaluates context precision.
type ContextRecall ¶
type ContextRecall struct {
*BaseJudge
}
ContextRecall evaluates how well the response uses the provided context.
func NewContextRecall ¶
func NewContextRecall(provider Provider, opts ...JudgeOption) *ContextRecall
NewContextRecall creates a new ContextRecall metric.
func (*ContextRecall) Score ¶
func (m *ContextRecall) Score(ctx context.Context, input evaluation.MetricInput) *evaluation.ScoreResult
Score evaluates context recall.
type CustomJudge ¶
type CustomJudge struct {
*BaseJudge
// contains filtered or unexported fields
}
CustomJudge allows creating metrics with custom prompts.
func NewCustomJudge ¶
func NewCustomJudge(name, promptTemplate string, provider Provider, opts ...JudgeOption) *CustomJudge
NewCustomJudge creates a custom judge metric.
func (*CustomJudge) Score ¶
func (m *CustomJudge) Score(ctx context.Context, input evaluation.MetricInput) *evaluation.ScoreResult
Score evaluates using the custom prompt.
type Factuality ¶
type Factuality struct {
*BaseJudge
}
Factuality evaluates factual accuracy of responses.
func NewFactuality ¶
func NewFactuality(provider Provider, opts ...JudgeOption) *Factuality
NewFactuality creates a new Factuality metric.
func (*Factuality) Score ¶
func (m *Factuality) Score(ctx context.Context, input evaluation.MetricInput) *evaluation.ScoreResult
Score evaluates factual accuracy.
type GEval ¶
type GEval struct {
*BaseJudge
// contains filtered or unexported fields
}
GEval implements the G-EVAL framework for LLM evaluation. It uses chain-of-thought prompting to evaluate outputs.
func NewGEval ¶
func NewGEval(provider Provider, criteria string, opts ...JudgeOption) *GEval
NewGEval creates a new G-EVAL metric.
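A sketch with free-form criteria (the criteria string here is illustrative):

geval := llm.NewGEval(provider, "Evaluate the response for conciseness and clarity.")
result := geval.Score(ctx, input)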
func (*GEval) Score ¶
func (g *GEval) Score(ctx context.Context, input evaluation.MetricInput) *evaluation.ScoreResult
Score evaluates using G-EVAL.
func (*GEval) WithEvaluationSteps ¶
func (g *GEval) WithEvaluationSteps(steps []string) *GEval
WithEvaluationSteps adds custom evaluation steps.
type Hallucination ¶
type Hallucination struct {
*BaseJudge
}
Hallucination detects hallucinations in LLM outputs.
func NewHallucination ¶
func NewHallucination(provider Provider, opts ...JudgeOption) *Hallucination
NewHallucination creates a new Hallucination detection metric.
func (*Hallucination) Score ¶
func (m *Hallucination) Score(ctx context.Context, input evaluation.MetricInput) *evaluation.ScoreResult
Score detects hallucinations (higher score = more hallucination detected).
type Helpfulness ¶
type Helpfulness struct {
*BaseJudge
}
Helpfulness evaluates how helpful a response is.
func NewHelpfulness ¶
func NewHelpfulness(provider Provider, opts ...JudgeOption) *Helpfulness
NewHelpfulness creates a new Helpfulness metric.
func (*Helpfulness) Score ¶
func (m *Helpfulness) Score(ctx context.Context, input evaluation.MetricInput) *evaluation.ScoreResult
Score evaluates helpfulness.
type JudgeOption ¶
type JudgeOption func(*BaseJudge)
JudgeOption configures a judge metric.
func WithJudgeModel ¶
func WithJudgeModel(model string) JudgeOption
WithJudgeModel sets the model for the judge.
func WithJudgeTemperature ¶
func WithJudgeTemperature(temp float64) JudgeOption
WithJudgeTemperature sets the temperature for the judge.
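Options apply to any judge constructor; for example, pinning the judge to a specific model with deterministic sampling:

factuality := llm.NewFactuality(provider,
	llm.WithJudgeModel("model-name"),
	llm.WithJudgeTemperature(0),
)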
type Message ¶
type Message struct {
Role string `json:"role"` // "system", "user", "assistant"
Content string `json:"content"`
}
Message represents a chat message.
type MockProvider ¶
type MockProvider struct {
// contains filtered or unexported fields
}
MockProvider is a provider that returns predefined responses for testing.
func NewMockProvider ¶
func NewMockProvider(responses map[string]string, defaultResp string) *MockProvider
NewMockProvider creates a mock provider for testing.
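A testing sketch; how the map keys are matched against prompts is not documented here, so the key below is illustrative:

mock := llm.NewMockProvider(map[string]string{
	"relevant": `{"score": 0.9, "reason": "on topic"}`,
}, `{"score": 0.5, "reason": "default"}`)
metric := llm.NewAnswerRelevance(mock)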
func (*MockProvider) Complete ¶
func (p *MockProvider) Complete(ctx context.Context, req CompletionRequest) (*CompletionResponse, error)
Complete returns a predefined response.
func (*MockProvider) DefaultModel ¶
func (p *MockProvider) DefaultModel() string
DefaultModel returns the default model.
type Moderation ¶
type Moderation struct {
*BaseJudge
// contains filtered or unexported fields
}
Moderation evaluates content for policy violations.
func NewModeration ¶
func NewModeration(provider Provider, opts ...JudgeOption) *Moderation
NewModeration creates a new Moderation metric.
func (*Moderation) Score ¶
func (m *Moderation) Score(ctx context.Context, input evaluation.MetricInput) *evaluation.ScoreResult
Score evaluates content moderation (higher = more violations).
func (*Moderation) WithCategories ¶
func (m *Moderation) WithCategories(categories []string) *Moderation
WithCategories sets custom moderation categories.
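For example, restricting moderation to specific categories (the category names are illustrative):

moderation := llm.NewModeration(provider).
	WithCategories([]string{"harassment", "medical-advice"})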
type Provider ¶
type Provider interface {
// Complete sends a completion request and returns the response.
Complete(ctx context.Context, req CompletionRequest) (*CompletionResponse, error)
// Name returns the provider name (e.g., "openai", "anthropic").
Name() string
// DefaultModel returns the default model for this provider.
DefaultModel() string
}
Provider is an interface for LLM providers used in evaluation.
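Any type with these three methods satisfies the interface; a minimal stub:

type stubProvider struct{}

func (stubProvider) Complete(ctx context.Context, req llm.CompletionRequest) (*llm.CompletionResponse, error) {
	return &llm.CompletionResponse{Content: `{"score": 1.0, "reason": "stub"}`}, nil
}

func (stubProvider) Name() string         { return "stub" }
func (stubProvider) DefaultModel() string { return "stub-model" }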
type ProviderOption ¶
type ProviderOption func(*providerConfig)
ProviderOption configures a provider.
func WithMaxTokens ¶
func WithMaxTokens(max int) ProviderOption
WithMaxTokens sets the maximum tokens for generation.
func WithTemperature ¶
func WithTemperature(temp float64) ProviderOption
WithTemperature sets the temperature for generation.
type ScoreResponse ¶
type ScoreResponse struct {
	Score  float64 `json:"score"`
	Reason string  `json:"reason"`
}
ScoreResponse represents a structured scoring response.
func ParseScoreResponse ¶
func ParseScoreResponse(response string) (*ScoreResponse, error)
ParseScoreResponse parses a JSON score response.
func ScoreWithRetry ¶
func ScoreWithRetry(ctx context.Context, j *BaseJudge, messages []Message, maxRetries int) (*ScoreResponse, error)
ScoreWithRetry attempts to score with retries on failure.
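A sketch, assuming a judge built with NewBaseJudge and an already-formatted prompt string:

judge := llm.NewBaseJudge("tone_check", provider)
resp, err := llm.ScoreWithRetry(ctx, judge, []llm.Message{
	{Role: "user", Content: prompt},
}, 3)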
type SimpleProvider ¶
type SimpleProvider struct {
// contains filtered or unexported fields
}
SimpleProvider is a basic provider implementation using a function.
func NewSimpleProvider ¶
func NewSimpleProvider(name, defaultModel string, fn func(ctx context.Context, req CompletionRequest) (*CompletionResponse, error)) *SimpleProvider
NewSimpleProvider creates a provider from a completion function.
func (*SimpleProvider) Complete ¶
func (p *SimpleProvider) Complete(ctx context.Context, req CompletionRequest) (*CompletionResponse, error)
Complete sends a completion request.
func (*SimpleProvider) DefaultModel ¶
func (p *SimpleProvider) DefaultModel() string
DefaultModel returns the default model.
func (*SimpleProvider) Name ¶
func (p *SimpleProvider) Name() string
Name returns the provider name.