package eval (v0.1.0)

Published: Feb 4, 2026 License: Apache-2.0 Imports: 11 Imported by: 0

Documentation

Constants

const (
	DifficultyEasy      = "easy"
	DifficultyMedium    = "medium"
	DifficultyHard      = "hard"
	DifficultyComplex   = "complex" // Used by legacy ComplexDataset(); not part of ALTAVision eval.
	DifficultySuperHard = "super-hard"
)

Difficulty levels for evaluation datasets.

Variables

This section is empty.

Functions

func ALTAVisionAllDatasets

func ALTAVisionAllDatasets() map[string]Dataset

ALTAVisionAllDatasets returns all ALTAVision datasets keyed by difficulty.
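
A minimal sketch of picking one dataset by difficulty. It assumes the package is imported as eval (its module path is not shown here) along with fmt:

datasets := eval.ALTAVisionAllDatasets()
easy := datasets[eval.DifficultyEasy]
fmt.Printf("%s (%s): %d tests\n", easy.Name, easy.Difficulty, len(easy.Tests))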

func FormatReport

func FormatReport(r *Report) string

FormatReport produces a human-readable report string.

func PDFComplexityReport

func PDFComplexityReport(results []PDFComplexityResult) string

PDFComplexityReport summarizes PDF complexity evaluation results.

Types

type AggregateMetrics

type AggregateMetrics struct {
	AvgFaithfulness       float64 `json:"avg_faithfulness"`
	AvgRelevance          float64 `json:"avg_relevance"`
	AvgAccuracy           float64 `json:"avg_accuracy"`
	AvgCitationQuality    float64 `json:"avg_citation_quality"`
	AvgConfidence         float64 `json:"avg_confidence"`
	AvgClaimGrounding     float64 `json:"avg_claim_grounding"`
	AvgHallucinationScore float64 `json:"avg_hallucination_score"`
}

AggregateMetrics holds averaged metrics across all tests.

type Dataset

type Dataset struct {
	Name       string     `json:"name"`
	Difficulty string     `json:"difficulty"` // easy, medium, hard, complex, super-hard
	Tests      []TestCase `json:"tests"`
}

Dataset is a collection of test cases for evaluation.

func ALTAVisionEasyDataset

func ALTAVisionEasyDataset() Dataset

ALTAVisionEasyDataset returns 30 easy (single-fact lookup) test cases from the ALTAVision AV-FM/AV-FF technical manual.

Expected facts use pipe-separated alternatives (e.g. "Spanish|English") so accuracy scoring works regardless of the LLM's answer language.

func ALTAVisionHardDataset

func ALTAVisionHardDataset() Dataset

ALTAVisionHardDataset returns 30 hard (multi-hop reasoning) test cases.

func ALTAVisionMediumDataset

func ALTAVisionMediumDataset() Dataset

ALTAVisionMediumDataset returns 30 medium (multi-fact, context-dependent) test cases.

func ALTAVisionSuperHardDataset

func ALTAVisionSuperHardDataset() Dataset

ALTAVisionSuperHardDataset returns 50 super-hard (synthesis/inference) test cases. Includes the original 30 (with fixes to Q2, Q19, Q25, Q30) plus 20 new tests in categories: graph-multi-hop, anti-hallucination, numerical, reasoning.
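
As an illustration of the category breakdown, the snippet below tallies the super-hard tests by Category; it assumes eval and fmt are imported:

counts := map[string]int{}
for _, tc := range eval.ALTAVisionSuperHardDataset().Tests {
	counts[tc.Category]++
}
fmt.Println(counts) // counts per category, e.g. graph-multi-hop, anti-hallucination, numerical, reasoning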

func ComplexDataset

func ComplexDataset() Dataset

ComplexDataset returns sample complex (cross-document) test cases.

func EasyDataset

func EasyDataset() Dataset

EasyDataset returns sample easy (single-fact) test cases.

func MediumDataset

func MediumDataset() Dataset

MediumDataset returns sample medium (multi-hop) test cases.

type Evaluator

type Evaluator struct {
	// contains filtered or unexported fields
}

Evaluator runs evaluation test sets against a GoReason engine.

func NewEvaluator

func NewEvaluator(engine goreason.Engine) *Evaluator

NewEvaluator creates a new evaluator.

func (*Evaluator) Run

func (e *Evaluator) Run(ctx context.Context, dataset Dataset, opts ...goreason.QueryOption) (*Report, error)

Run executes an evaluation dataset against the engine.
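
A minimal end-to-end sketch: runEasyEval is a hypothetical wrapper that assumes an already-constructed goreason.Engine (engine setup belongs to the goreason package) and elides import paths:

func runEasyEval(ctx context.Context, engine goreason.Engine) error {
	ev := eval.NewEvaluator(engine)
	report, err := ev.Run(ctx, eval.ALTAVisionEasyDataset())
	if err != nil {
		return err
	}
	// FormatReport turns the *Report into a human-readable summary.
	fmt.Println(eval.FormatReport(report))
	return nil
}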

type FactCheck

type FactCheck struct {
	Fact      string `json:"fact"`
	Found     bool   `json:"found"`
	ChunkID   int64  `json:"chunk_id,omitempty"`
	ChunkRank int    `json:"chunk_rank,omitempty"`
	Details   string `json:"details,omitempty"`
}

FactCheck records whether a single expected fact was found at a pipeline stage.

type GroundTruthCheck

type GroundTruthCheck struct {
	FactsInDB      []FactCheck `json:"facts_in_db"`
	FactsEmbedded  []FactCheck `json:"facts_embedded"`
	FactsRetrieved []FactCheck `json:"facts_retrieved"`
	FactsInAnswer  []FactCheck `json:"facts_in_answer"`
	Diagnosis      string      `json:"diagnosis"`
}

GroundTruthCheck diagnoses where each expected fact was lost in the pipeline.
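
As one way to consume the per-stage checks, the hypothetical helper below (not part of this package) reports the first pipeline stage at which any expected fact has Found == false:

func firstLostStage(gt *eval.GroundTruthCheck) string {
	stages := []struct {
		name   string
		checks []eval.FactCheck
	}{
		{"db", gt.FactsInDB},
		{"embedding", gt.FactsEmbedded},
		{"retrieval", gt.FactsRetrieved},
		{"answer", gt.FactsInAnswer},
	}
	for _, stage := range stages {
		for _, fc := range stage.checks {
			if !fc.Found {
				return stage.name
			}
		}
	}
	return "" // every expected fact survived to the answer
}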

type PDFComplexityResult

type PDFComplexityResult struct {
	Path            string  `json:"path"`
	ExpectedComplex bool    `json:"expected_complex"`
	DetectedComplex bool    `json:"detected_complex"`
	Score           float64 `json:"score"`
	Correct         bool    `json:"correct"`
	Details         string  `json:"details"`
}

PDFComplexityResult holds the evaluation of PDF complexity detection.

func EvaluatePDFComplexity

func EvaluatePDFComplexity(testCases []PDFComplexityTestCase) []PDFComplexityResult

EvaluatePDFComplexity tests the PDF complexity detector against known files.

type PDFComplexityTestCase

type PDFComplexityTestCase struct {
	Path            string `json:"path"`
	ExpectedComplex bool   `json:"expected_complex"`
	Description     string `json:"description"`
}

PDFComplexityTestCase defines a test for the complexity detector.
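
A sketch of driving the complexity evaluation end to end; the file paths and descriptions are invented for illustration, and eval plus fmt are assumed to be imported:

cases := []eval.PDFComplexityTestCase{
	{Path: "testdata/plain-text.pdf", ExpectedComplex: false, Description: "single-column text, no tables"},
	{Path: "testdata/scanned-tables.pdf", ExpectedComplex: true, Description: "scanned pages with dense tables"},
}
results := eval.EvaluatePDFComplexity(cases)
fmt.Println(eval.PDFComplexityReport(results))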

type ReasoningStep

type ReasoningStep struct {
	Round     int      `json:"round"`
	Action    string   `json:"action"`
	Prompt    string   `json:"prompt,omitempty"`
	Response  string   `json:"response,omitempty"`
	Tokens    int      `json:"tokens,omitempty"`
	ElapsedMs int64    `json:"elapsed_ms,omitempty"`
	Issues    []string `json:"issues,omitempty"`
}

ReasoningStep records a single round of reasoning with full context for replay.

type Report

type Report struct {
	Dataset         string                      `json:"dataset"`
	Difficulty      string                      `json:"difficulty,omitempty"`
	TotalTests      int                         `json:"total_tests"`
	Passed          int                         `json:"passed"`
	Failed          int                         `json:"failed"`
	Metrics         AggregateMetrics            `json:"metrics"`
	CategoryMetrics map[string]AggregateMetrics `json:"category_metrics,omitempty"`
	Results         []TestResult                `json:"results"`
	RunTime         time.Duration               `json:"run_time"`
	TokenUsage      TokenUsage                  `json:"token_usage"`
}

Report holds the results of an evaluation run.
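
A small sketch that derives a pass rate and archives the full report as JSON; archiveReport and the output file name are hypothetical, and encoding/json, fmt, and os are assumed to be imported:

func archiveReport(r *eval.Report) error {
	rate := 0.0
	if r.TotalTests > 0 {
		rate = float64(r.Passed) / float64(r.TotalTests)
	}
	fmt.Printf("%s: %.1f%% passed in %s\n", r.Dataset, rate*100, r.RunTime)
	data, err := json.MarshalIndent(r, "", "  ")
	if err != nil {
		return err
	}
	return os.WriteFile("eval-report.json", data, 0o644)
}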

type RetrievalTrace

type RetrievalTrace struct {
	VecResults          int      `json:"vec_results"`
	FTSResults          int      `json:"fts_results"`
	GraphResults        int      `json:"graph_results"`
	FusedResults        int      `json:"fused_results"`
	VecWeight           float64  `json:"vec_weight"`
	FTSWeight           float64  `json:"fts_weight"`
	GraphWeight         float64  `json:"graph_weight"`
	IdentifiersDetected bool     `json:"identifiers_detected"`
	FTSQuery            string   `json:"fts_query"`
	GraphEntities       []string `json:"graph_entities"`
	ElapsedMs           int64    `json:"elapsed_ms"`
}

RetrievalTrace holds the full retrieval breakdown for a query.

type SourceTrace

type SourceTrace struct {
	ChunkID    int64    `json:"chunk_id"`
	Heading    string   `json:"heading"`
	Content    string   `json:"content"`
	PageNumber int      `json:"page_number"`
	Score      float64  `json:"score"`
	Methods    []string `json:"methods,omitempty"`
	VecRank    int      `json:"vec_rank,omitempty"`
	FTSRank    int      `json:"fts_rank,omitempty"`
	GraphRank  int      `json:"graph_rank,omitempty"`
}

SourceTrace records a single retrieved chunk with its retrieval metadata.

type TestCase

type TestCase struct {
	Question      string   `json:"question"`
	ExpectedFacts []string `json:"expected_facts"` // Facts that should appear in the answer
	Category      string   `json:"category"`       // single-fact, multi-hop, cross-document, multi-fact, synthesis
	Explanation   string   `json:"explanation"`    // Ground truth reference with page citations
}

TestCase defines a single evaluation question.
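
A hypothetical test case showing the pipe-separated alternatives convention noted under ALTAVisionEasyDataset; the question and explanation text are invented for illustration:

tc := eval.TestCase{
	Question:      "Which interface languages does the device support?",
	ExpectedFacts: []string{"Spanish|English"}, // either alternative satisfies the fact
	Category:      "single-fact",
	Explanation:   "Invented ground-truth note; the real datasets cite manual pages here.",
}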

type TestResult

type TestResult struct {
	Question           string   `json:"question"`
	ExpectedFacts      []string `json:"expected_facts"`
	Category           string   `json:"category,omitempty"`
	Explanation        string   `json:"explanation,omitempty"`
	Answer             string   `json:"answer"`
	Confidence         float64  `json:"confidence"`
	Faithfulness       float64  `json:"faithfulness"`
	Relevance          float64  `json:"relevance"`
	Accuracy           float64  `json:"accuracy"`
	CitationQuality    float64  `json:"citation_quality"`
	ClaimGrounding     float64  `json:"claim_grounding"`
	HallucinationScore float64  `json:"hallucination_score"`
	Passed             bool     `json:"passed"`
	Error              string   `json:"error,omitempty"`
	PromptTokens       int      `json:"prompt_tokens"`
	CompletionTokens   int      `json:"completion_tokens"`
	TotalTokens        int      `json:"total_tokens"`

	// Timing
	ElapsedMs int64 `json:"elapsed_ms"`

	// Sources (the chunks the model actually saw)
	Sources []SourceTrace `json:"sources,omitempty"`

	// Retrieval breakdown
	Retrieval *RetrievalTrace `json:"retrieval,omitempty"`

	// Reasoning trace
	ReasoningSteps []ReasoningStep `json:"reasoning_steps,omitempty"`

	// Ground truth diagnosis
	GroundTruth *GroundTruthCheck `json:"ground_truth,omitempty"`
}

TestResult holds the result of a single test case with full diagnostics.
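
Given a report returned by (*Evaluator).Run, the sketch below surfaces failing tests with their hallucination scores and any ground-truth diagnosis, reusing the hypothetical firstLostStage helper shown under GroundTruthCheck:

for _, res := range report.Results {
	if res.Passed {
		continue
	}
	fmt.Printf("FAIL %q hallucination=%.2f", res.Question, res.HallucinationScore)
	if res.GroundTruth != nil {
		fmt.Printf(" lost-at=%s (%s)", firstLostStage(res.GroundTruth), res.GroundTruth.Diagnosis)
	}
	fmt.Println()
}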

type TokenUsage

type TokenUsage struct {
	PromptTokens     int `json:"prompt_tokens"`
	CompletionTokens int `json:"completion_tokens"`
	TotalTokens      int `json:"total_tokens"`
}

TokenUsage aggregates LLM token consumption across an evaluation run.
