eval

package
v0.7.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jun 1, 2026 License: MIT Imports: 14 Imported by: 0

Documentation

Overview

Package eval provides evaluation orchestration for spec documents.

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func ClaimsFromCategoryResults added in v0.7.0

func ClaimsFromCategoryResults(categories []CategoryResult, timestamp time.Time) []claims.Claim

ClaimsFromCategoryResults creates claims from category results.

func CreateMultiEvalSummary added in v0.7.0

func CreateMultiEvalSummary(
	project string,
	version string,
	results map[string]*Result,
	evalReports map[string]*rubric.Rubric,
	claimsReports map[string]*claims.ClaimsReport,
) *summary.SummaryReport

CreateMultiEvalSummary creates a summary with multiple evaluations.

func CreateSingleEvalSummary added in v0.7.0

func CreateSingleEvalSummary(
	project string,
	specType string,
	result *Result,
	evalReport *rubric.Rubric,
	claimsReport *claims.ClaimsReport,
) *summary.SummaryReport

CreateSingleEvalSummary creates a summary with a single evaluation.

func RenderEvaluationReportMarkdown added in v0.7.0

func RenderEvaluationReportMarkdown(w io.Writer, report *rubric.Rubric) error

RenderEvaluationReportMarkdown renders a structured-evaluation report to markdown.

Types

type CategoryResult

type CategoryResult struct {
	ID          string  `json:"id"`
	Name        string  `json:"name"`
	Score       float64 `json:"score"`
	Weight      float64 `json:"weight"`
	Explanation string  `json:"explanation"`
}

CategoryResult contains the evaluation result for a category.

type EvalSummary added in v0.7.0

type EvalSummary struct {
	// Project is the project name.
	Project string

	// Version is the project version.
	Version string

	// Results are the individual evaluation results keyed by spec type.
	Results map[string]*Result

	// EvaluationReports are the structured evaluation reports.
	EvaluationReports map[string]*rubric.Rubric

	// ClaimsReports are the claims extracted from findings.
	ClaimsReports map[string]*claims.ClaimsReport
}

EvalSummary aggregates multiple evaluation results with embedded reports.

func NewEvalSummary added in v0.7.0

func NewEvalSummary(project, version string) *EvalSummary

NewEvalSummary creates a new evaluation summary.

func (*EvalSummary) AddResult added in v0.7.0

func (s *EvalSummary) AddResult(specType string, result *Result, evalReport *rubric.Rubric, claimsReport *claims.ClaimsReport)

AddResult adds an evaluation result to the summary.

func (*EvalSummary) IsAllPassing added in v0.7.0

func (s *EvalSummary) IsAllPassing() bool

IsAllPassing returns true if all evaluations passed.

func (*EvalSummary) ToSummaryReport added in v0.7.0

func (s *EvalSummary) ToSummaryReport(phase string) *summary.SummaryReport

ToSummaryReport converts the EvalSummary to a structured-evaluation SummaryReport. The resulting report embeds the full-fidelity evaluation reports and claims reports held by the summary.

func (*EvalSummary) TotalScore added in v0.7.0

func (s *EvalSummary) TotalScore() float64

TotalScore returns the average (mean) score across all evaluations; despite the name, it is not a sum.

type Evaluator

type Evaluator struct {
	// contains filtered or unexported fields
}

Evaluator performs evaluations using an LLM judge.

func NewEvaluator

func NewEvaluator(llm *LLMClient) *Evaluator

NewEvaluator creates a new evaluator with the given LLM client.

func (*Evaluator) Evaluate

func (e *Evaluator) Evaluate(ctx context.Context, specType types.SpecType, content string) (*Result, error)

Evaluate runs evaluation on content against the rubric for the given spec type.

func (*Evaluator) SetRubricLoader

func (e *Evaluator) SetRubricLoader(loader rubrics.Loader)

SetRubricLoader sets a custom rubric loader for evaluation.

type Finding

type Finding struct {
	Severity       string `json:"severity"`
	Category       string `json:"category"`
	Title          string `json:"title"`
	Description    string `json:"description"`
	Recommendation string `json:"recommendation"`
	Evidence       string `json:"evidence,omitempty"`
}

Finding represents an issue found during evaluation.

type JudgeMetadata

type JudgeMetadata struct {
	Model       string  `json:"model"`
	Provider    string  `json:"provider"`
	Temperature float64 `json:"temperature"`
	Tokens      int     `json:"tokens"`
}

JudgeMetadata records information about the LLM judge.

type LLMClient

type LLMClient struct {
	// contains filtered or unexported fields
}

LLMClient wraps omnillm for evaluation requests.

func NewLLMClient

func NewLLMClient(cfg LLMConfig) (*LLMClient, error)

NewLLMClient creates a new LLM client with the given configuration.

func NewLLMClientFromEnv

func NewLLMClientFromEnv() (*LLMClient, error)

NewLLMClientFromEnv creates an LLM client using environment configuration. It tries providers in order: ANTHROPIC, OPENAI, GEMINI.

func NewLLMClientFromProject

func NewLLMClientFromProject(projectCfg *types.LLMConfig) (*LLMClient, error)

NewLLMClientFromProject creates an LLM client using project configuration. Project config values take precedence; missing values fall back to environment defaults.

func (*LLMClient) Close

func (c *LLMClient) Close() error

Close releases resources.

func (*LLMClient) Complete

func (c *LLMClient) Complete(ctx context.Context, prompt string) (string, JudgeMetadata, error)

Complete sends a prompt to the LLM and returns the response.

type LLMConfig

type LLMConfig struct {
	Provider    string  // Provider name (openai, anthropic, gemini, etc.)
	Model       string  // Model name
	APIKey      string  // API key (optional if env var is set)
	Temperature float64 // Temperature for generation (default 0.0 for deterministic)
	MaxTokens   int     // Max tokens for response (default 4096)
}

LLMConfig configures the LLM client.

func DefaultLLMConfig

func DefaultLLMConfig() LLMConfig

DefaultLLMConfig returns default configuration for evaluation.

type MarkdownRenderer added in v0.7.0

type MarkdownRenderer struct{}

MarkdownRenderer renders results to markdown format.

func NewMarkdownRenderer added in v0.7.0

func NewMarkdownRenderer() *MarkdownRenderer

NewMarkdownRenderer creates a new markdown renderer.

func (*MarkdownRenderer) Render added in v0.7.0

func (r *MarkdownRenderer) Render(w io.Writer, result *Result) error

Render writes the result to w as Markdown.

type Renderer added in v0.7.0

type Renderer interface {
	Render(w io.Writer, result *Result) error
}

Renderer renders evaluation results to various formats.

type Result

type Result struct {
	SpecType   types.SpecType   `json:"spec_type"`
	Timestamp  time.Time        `json:"timestamp"`
	Score      float64          `json:"score"`
	Passed     bool             `json:"passed"`
	Categories []CategoryResult `json:"categories"`
	Findings   []Finding        `json:"findings"`
	Decision   string           `json:"decision"`
	Summary    string           `json:"summary"`
	Judge      JudgeMetadata    `json:"judge"`
}

Result represents the outcome of an evaluation.

func (*Result) ToClaimsReport added in v0.7.0

func (r *Result) ToClaimsReport(document string) *claims.ClaimsReport

ToClaimsReport extracts claims from evaluation findings. Each finding becomes a claim with internal validation based on the evaluation.

func (*Result) ToEvaluationReport

func (r *Result) ToEvaluationReport(rubricSet *rubrics.RubricSet) *rubric.Rubric

ToEvaluationReport converts the result to a structured-evaluation report. The rubricSet parameter is required for finalization.

type TerminalRenderer added in v0.7.0

type TerminalRenderer struct {
	Verbose bool
}

TerminalRenderer renders results for terminal output.

func NewTerminalRenderer added in v0.7.0

func NewTerminalRenderer(verbose bool) *TerminalRenderer

NewTerminalRenderer creates a new terminal renderer.

func (*TerminalRenderer) Render added in v0.7.0

func (r *TerminalRenderer) Render(w io.Writer, result *Result) error

Render writes the result to w, formatted for terminal output.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL