Documentation ¶
Index ¶
- Constants
- func ALTAVisionAllDatasets() map[string]Dataset
- func FormatReport(r *Report) string
- func PDFComplexityReport(results []PDFComplexityResult) string
- type AggregateMetrics
- type Dataset
- type Evaluator
- type FactCheck
- type GroundTruthCheck
- type PDFComplexityResult
- type PDFComplexityTestCase
- type ReasoningStep
- type Report
- type RetrievalTrace
- type SourceTrace
- type TestCase
- type TestResult
- type TokenUsage
Constants ¶
const (
	DifficultyEasy      = "easy"
	DifficultyMedium    = "medium"
	DifficultyHard      = "hard"
	DifficultyComplex   = "complex" // Used by legacy ComplexDataset(); not part of ALTAVision eval.
	DifficultySuperHard = "super-hard"
)
Difficulty levels for evaluation datasets.
Variables ¶
This section is empty.
Functions ¶
func ALTAVisionAllDatasets ¶
func ALTAVisionAllDatasets() map[string]Dataset
ALTAVisionAllDatasets returns all ALTAVision datasets keyed by difficulty.
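A minimal usage sketch: list each ALTAVision dataset with its test count. The import path shown is a placeholder; substitute the package's real module path.

package main

import (
	"fmt"

	eval "example.com/goreason/eval" // placeholder import path
)

func main() {
	// Iterate the datasets keyed by difficulty and print a one-line summary each.
	for difficulty, ds := range eval.ALTAVisionAllDatasets() {
		fmt.Printf("%-12s %-30s %d tests\n", difficulty, ds.Name, len(ds.Tests))
	}
}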
func FormatReport ¶
func FormatReport(r *Report) string
FormatReport produces a human-readable report string.
func PDFComplexityReport ¶
func PDFComplexityReport(results []PDFComplexityResult) string
PDFComplexityReport summarizes PDF complexity evaluation results.
Types ¶
type AggregateMetrics ¶
type AggregateMetrics struct {
AvgFaithfulness float64 `json:"avg_faithfulness"`
AvgRelevance float64 `json:"avg_relevance"`
AvgAccuracy float64 `json:"avg_accuracy"`
AvgCitationQuality float64 `json:"avg_citation_quality"`
AvgConfidence float64 `json:"avg_confidence"`
AvgClaimGrounding float64 `json:"avg_claim_grounding"`
AvgHallucinationScore float64 `json:"avg_hallucination_score"`
}
AggregateMetrics holds averaged metrics across all tests.
type Dataset ¶
type Dataset struct {
Name string `json:"name"`
Difficulty string `json:"difficulty"` // easy, medium, hard, complex, super-hard
Tests []TestCase `json:"tests"`
}
Dataset is a collection of test cases for evaluation.
func ALTAVisionEasyDataset ¶
func ALTAVisionEasyDataset() Dataset
ALTAVisionEasyDataset returns 30 easy (single-fact lookup) test cases from the ALTAVision AV-FM/AV-FF technical manual.
Expected facts use pipe-separated alternatives (e.g. "Spanish|English") so accuracy scoring works regardless of the LLM's answer language.
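The sketch below illustrates how the pipe convention can be interpreted when checking an answer. It is not the package's internal accuracy scorer; containsFact is a hypothetical helper, and the package name is a placeholder.

package evalutil // placeholder package for the sketch

import "strings"

// containsFact (hypothetical helper) reports whether an answer satisfies one
// expected fact, treating "|" as separating acceptable alternatives, so
// "Spanish|English" is satisfied by either word regardless of answer language.
func containsFact(answer, fact string) bool {
	answer = strings.ToLower(answer)
	for _, alt := range strings.Split(fact, "|") {
		if strings.Contains(answer, strings.ToLower(alt)) {
			return true // any one alternative is enough
		}
	}
	return false
}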
func ALTAVisionHardDataset ¶
func ALTAVisionHardDataset() Dataset
ALTAVisionHardDataset returns 30 hard (multi-hop reasoning) test cases.
func ALTAVisionMediumDataset ¶
func ALTAVisionMediumDataset() Dataset
ALTAVisionMediumDataset returns 30 medium (multi-fact, context-dependent) test cases.
func ALTAVisionSuperHardDataset ¶
func ALTAVisionSuperHardDataset() Dataset
ALTAVisionSuperHardDataset returns 50 super-hard (synthesis/inference) test cases. Includes the original 30 (with fixes to Q2, Q19, Q25, Q30) plus 20 new tests in categories: graph-multi-hop, anti-hallucination, numerical, reasoning.
func ComplexDataset ¶
func ComplexDataset() Dataset
ComplexDataset returns sample complex (cross-document) test cases.
func EasyDataset ¶
func EasyDataset() Dataset
EasyDataset returns sample easy (single-fact) test cases.
func MediumDataset ¶
func MediumDataset() Dataset
MediumDataset returns sample medium (multi-hop) test cases.
type Evaluator ¶
type Evaluator struct {
// contains filtered or unexported fields
}
Evaluator runs evaluation test sets against a GoReason engine.
func NewEvaluator ¶
func NewEvaluator(engine goreason.Engine) *Evaluator
NewEvaluator creates a new evaluator.
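A hedged sketch of constructing an evaluator. Only NewEvaluator and the dataset constructor are taken from the documented signatures; the import paths are placeholders, engine setup is out of scope here, and the Evaluator's run methods are not shown in this section.

package main

import (
	"fmt"

	goreason "example.com/goreason"  // placeholder import path
	eval "example.com/goreason/eval" // placeholder import path
)

func main() {
	// Engine construction is not documented in this section; wire up a real
	// goreason.Engine here before running anything meaningful.
	var engine goreason.Engine

	ev := eval.NewEvaluator(engine) // documented constructor
	ds := eval.ALTAVisionEasyDataset()
	fmt.Printf("evaluator %T ready; dataset %q has %d tests\n", ev, ds.Name, len(ds.Tests))
}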
type FactCheck ¶
type FactCheck struct {
Fact string `json:"fact"`
Found bool `json:"found"`
ChunkID int64 `json:"chunk_id,omitempty"`
ChunkRank int `json:"chunk_rank,omitempty"`
Details string `json:"details,omitempty"`
}
FactCheck records whether a single expected fact was found at a pipeline stage.
type GroundTruthCheck ¶
type GroundTruthCheck struct {
FactsInDB []FactCheck `json:"facts_in_db"`
FactsEmbedded []FactCheck `json:"facts_embedded"`
FactsRetrieved []FactCheck `json:"facts_retrieved"`
FactsInAnswer []FactCheck `json:"facts_in_answer"`
Diagnosis string `json:"diagnosis"`
}
GroundTruthCheck diagnoses where each expected fact was lost in the pipeline.
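An illustrative helper (not part of the package) showing how the four stage slices can be walked in pipeline order to find where an expected fact first goes missing. The package and import names are placeholders.

package evalutil // placeholder package for the sketch

import eval "example.com/goreason/eval" // placeholder import path

// firstMissingStage walks the pipeline stages in order and returns the first
// stage at which the given expected fact is not marked as found.
func firstMissingStage(gt eval.GroundTruthCheck, fact string) string {
	stages := []struct {
		name   string
		checks []eval.FactCheck
	}{
		{"db", gt.FactsInDB},
		{"embedding", gt.FactsEmbedded},
		{"retrieval", gt.FactsRetrieved},
		{"answer", gt.FactsInAnswer},
	}
	for _, st := range stages {
		found := false
		for _, c := range st.checks {
			if c.Fact == fact && c.Found {
				found = true
				break
			}
		}
		if !found {
			return st.name // the fact was lost at (or before) this stage
		}
	}
	return "" // the fact survived every stage
}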
type PDFComplexityResult ¶
type PDFComplexityResult struct {
Path string `json:"path"`
ExpectedComplex bool `json:"expected_complex"`
DetectedComplex bool `json:"detected_complex"`
Score float64 `json:"score"`
Correct bool `json:"correct"`
Details string `json:"details"`
}
PDFComplexityResult holds the evaluation of PDF complexity detection.
func EvaluatePDFComplexity ¶
func EvaluatePDFComplexity(testCases []PDFComplexityTestCase) []PDFComplexityResult
EvaluatePDFComplexity tests the PDF complexity detector against known files.
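A usage sketch combining EvaluatePDFComplexity with PDFComplexityReport. The file paths, descriptions, and import path are placeholders.

package main

import (
	"fmt"

	eval "example.com/goreason/eval" // placeholder import path
)

func main() {
	// Describe a couple of known files with their expected classification.
	cases := []eval.PDFComplexityTestCase{
		{Path: "testdata/simple.pdf", ExpectedComplex: false, Description: "plain single-column text"},
		{Path: "testdata/scanned-tables.pdf", ExpectedComplex: true, Description: "scanned pages with tables"},
	}
	// Run the detector and print the human-readable summary.
	fmt.Println(eval.PDFComplexityReport(eval.EvaluatePDFComplexity(cases)))
}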
type PDFComplexityTestCase ¶
type PDFComplexityTestCase struct {
Path string `json:"path"`
ExpectedComplex bool `json:"expected_complex"`
Description string `json:"description"`
}
PDFComplexityTestCase defines a test for the complexity detector.
type ReasoningStep ¶
type ReasoningStep struct {
Round int `json:"round"`
Action string `json:"action"`
Prompt string `json:"prompt,omitempty"`
Response string `json:"response,omitempty"`
Tokens int `json:"tokens,omitempty"`
ElapsedMs int64 `json:"elapsed_ms,omitempty"`
Issues []string `json:"issues,omitempty"`
}
ReasoningStep records a single round of reasoning with full context for replay.
type Report ¶
type Report struct {
Dataset string `json:"dataset"`
Difficulty string `json:"difficulty,omitempty"`
TotalTests int `json:"total_tests"`
Passed int `json:"passed"`
Failed int `json:"failed"`
Metrics AggregateMetrics `json:"metrics"`
CategoryMetrics map[string]AggregateMetrics `json:"category_metrics,omitempty"`
Results []TestResult `json:"results"`
RunTime time.Duration `json:"run_time"`
TokenUsage TokenUsage `json:"token_usage"`
}
Report holds the results of an evaluation run.
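A sketch of consuming a finished report: print the documented FormatReport summary and archive the full diagnostics with standard-library JSON encoding. saveReport is an illustrative helper, producing the *Report itself is out of scope here, and the package and import names are placeholders.

package evalutil // placeholder package for the sketch

import (
	"encoding/json"
	"fmt"
	"os"

	eval "example.com/goreason/eval" // placeholder import path
)

// saveReport prints the human-readable summary and archives the full report,
// including per-test diagnostics, as indented JSON for later inspection.
func saveReport(r *eval.Report, path string) error {
	fmt.Println(eval.FormatReport(r)) // documented formatter
	data, err := json.MarshalIndent(r, "", "  ")
	if err != nil {
		return err
	}
	return os.WriteFile(path, data, 0o644)
}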
type RetrievalTrace ¶
type RetrievalTrace struct {
VecResults int `json:"vec_results"`
FTSResults int `json:"fts_results"`
GraphResults int `json:"graph_results"`
FusedResults int `json:"fused_results"`
VecWeight float64 `json:"vec_weight"`
FTSWeight float64 `json:"fts_weight"`
GraphWeight float64 `json:"graph_weight"`
IdentifiersDetected bool `json:"identifiers_detected"`
FTSQuery string `json:"fts_query"`
GraphEntities []string `json:"graph_entities"`
ElapsedMs int64 `json:"elapsed_ms"`
}
RetrievalTrace holds the full retrieval breakdown for a query.
type SourceTrace ¶
type SourceTrace struct {
ChunkID int64 `json:"chunk_id"`
Heading string `json:"heading"`
Content string `json:"content"`
PageNumber int `json:"page_number"`
Score float64 `json:"score"`
Methods []string `json:"methods,omitempty"`
VecRank int `json:"vec_rank,omitempty"`
FTSRank int `json:"fts_rank,omitempty"`
GraphRank int `json:"graph_rank,omitempty"`
}
SourceTrace records a single retrieved chunk with its retrieval metadata.
type TestCase ¶
type TestCase struct {
Question string `json:"question"`
ExpectedFacts []string `json:"expected_facts"` // Facts that should appear in the answer
Category string `json:"category"` // single-fact, multi-hop, cross-document, multi-fact, synthesis
Explanation string `json:"explanation"` // Ground truth reference with page citations
}
TestCase defines a single evaluation question.
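An illustrative dataset literal built from the documented fields and constants. The question, expected facts, and names below are invented examples, not part of any shipped dataset, and the import path is a placeholder.

package evalutil // placeholder package for the sketch

import eval "example.com/goreason/eval" // placeholder import path

// customDataset shows how a small hand-written dataset can be assembled.
var customDataset = eval.Dataset{
	Name:       "my-manual-smoke-test",
	Difficulty: eval.DifficultyEasy,
	Tests: []eval.TestCase{
		{
			Question:      "What is the rated supply voltage?",
			ExpectedFacts: []string{"24 V|24V"}, // pipe-separated alternatives
			Category:      "single-fact",
			Explanation:   "Stated in the electrical specifications section.",
		},
	},
}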
type TestResult ¶
type TestResult struct {
Question string `json:"question"`
ExpectedFacts []string `json:"expected_facts"`
Category string `json:"category,omitempty"`
Explanation string `json:"explanation,omitempty"`
Answer string `json:"answer"`
Confidence float64 `json:"confidence"`
Faithfulness float64 `json:"faithfulness"`
Relevance float64 `json:"relevance"`
Accuracy float64 `json:"accuracy"`
CitationQuality float64 `json:"citation_quality"`
ClaimGrounding float64 `json:"claim_grounding"`
HallucinationScore float64 `json:"hallucination_score"`
Passed bool `json:"passed"`
Error string `json:"error,omitempty"`
PromptTokens int `json:"prompt_tokens"`
CompletionTokens int `json:"completion_tokens"`
TotalTokens int `json:"total_tokens"`
// Timing
ElapsedMs int64 `json:"elapsed_ms"`
// Sources (the chunks the model actually saw)
Sources []SourceTrace `json:"sources,omitempty"`
// Retrieval breakdown
Retrieval *RetrievalTrace `json:"retrieval,omitempty"`
// Reasoning trace
ReasoningSteps []ReasoningStep `json:"reasoning_steps,omitempty"`
// Ground truth diagnosis
GroundTruth *GroundTruthCheck `json:"ground_truth,omitempty"`
}
TestResult holds the result of a single test case with full diagnostics.
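The relationship between per-test scores and AggregateMetrics can be shown with a simple averaging sketch; the package presumably computes something similar internally, so this is illustration only, with placeholder package and import names.

package evalutil // placeholder package for the sketch

import eval "example.com/goreason/eval" // placeholder import path

// aggregate averages the per-test scores into a single AggregateMetrics value.
func aggregate(results []eval.TestResult) eval.AggregateMetrics {
	var m eval.AggregateMetrics
	if len(results) == 0 {
		return m
	}
	for _, r := range results {
		m.AvgFaithfulness += r.Faithfulness
		m.AvgRelevance += r.Relevance
		m.AvgAccuracy += r.Accuracy
		m.AvgCitationQuality += r.CitationQuality
		m.AvgConfidence += r.Confidence
		m.AvgClaimGrounding += r.ClaimGrounding
		m.AvgHallucinationScore += r.HallucinationScore
	}
	n := float64(len(results))
	m.AvgFaithfulness /= n
	m.AvgRelevance /= n
	m.AvgAccuracy /= n
	m.AvgCitationQuality /= n
	m.AvgConfidence /= n
	m.AvgClaimGrounding /= n
	m.AvgHallucinationScore /= n
	return m
}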
type TokenUsage ¶
type TokenUsage struct {
PromptTokens int `json:"prompt_tokens"`
CompletionTokens int `json:"completion_tokens"`
TotalTokens int `json:"total_tokens"`
}
TokenUsage aggregates LLM token consumption across an evaluation run.
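Similarly, per-test token counts roll up into a run-level TokenUsage total. The summing helper below is illustrative only, with placeholder package and import names.

package evalutil // placeholder package for the sketch

import eval "example.com/goreason/eval" // placeholder import path

// sumTokens totals the token counters across all test results.
func sumTokens(results []eval.TestResult) eval.TokenUsage {
	var u eval.TokenUsage
	for _, r := range results {
		u.PromptTokens += r.PromptTokens
		u.CompletionTokens += r.CompletionTokens
		u.TotalTokens += r.TotalTokens
	}
	return u
}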