Documentation
¶
Index ¶
Examples ¶
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type Evaluation ¶
Evaluation represents the output of an evaluator.
type Evaluator ¶
type Evaluator interface {
// Name returns the evaluator's name, used to label its evaluation results.
Name() string
// Run scores the task output produced for the given dataset record and
// returns the evaluation value (e.g. a bool or float score) or an error.
Run(ctx context.Context, rec dataset.Record, output any) (any, error)
}
Evaluator represents an evaluator for an Experiment.
func NewEvaluator ¶
func NewEvaluator(name string, fn EvaluatorFunc) Evaluator
NewEvaluator creates a new Evaluator.
type EvaluatorFunc ¶
EvaluatorFunc is the type for Evaluator functions.
type Experiment ¶
type Experiment struct {
Name string // Name of the experiment, as passed to New
// contains filtered or unexported fields
}
Experiment represents a Datadog LLM Observability experiment.
func New ¶
func New(name string, task Task, ds *dataset.Dataset, evaluators []Evaluator, opts ...Option) (*Experiment, error)
New creates a new Experiment.
Example ¶
package main
import (
"context"
"fmt"
"log"
"strings"
"github.com/DataDog/dd-trace-go/v2/ddtrace/tracer"
"github.com/DataDog/dd-trace-go/v2/llmobs/dataset"
"github.com/DataDog/dd-trace-go/v2/llmobs/experiment"
)
// main demonstrates a complete LLM Observability experiment run: it enables
// LLM Observability on the tracer, pulls a dataset, defines a task and three
// evaluators, runs the experiment, and prints every per-record result.
func main() {
	// LLM Observability must be enabled on the tracer before running experiments.
	if err := tracer.Start(tracer.WithLLMObsEnabled(true)); err != nil {
		log.Fatal(err)
	}
	defer tracer.Stop()

	// Pull the dataset the experiment will iterate over.
	ds, err := dataset.Pull(context.TODO(), "capitals-of-the-world")
	if err != nil {
		log.Fatal(err)
	}

	// The task produces an output for each dataset record.
	task := experiment.NewTask("capitals-of-the-world", func(ctx context.Context, rec dataset.Record, experimentCfg map[string]any) (any, error) {
		// Guard the type assertions so malformed records surface as task
		// errors instead of panics (matches the comma-ok style used in the
		// evaluators below).
		inputMap, ok := rec.Input.(map[string]any)
		if !ok {
			return nil, fmt.Errorf("wanted record input to be a map, got: %T", rec.Input)
		}
		question, ok := inputMap["question"].(string)
		if !ok {
			return nil, fmt.Errorf("wanted question to be a string, got: %T", inputMap["question"])
		}
		// Your LLM or processing logic here
		if strings.Contains(question, "China") {
			return "Beijing", nil
		}
		return "Unknown", nil
	})

	evs := []experiment.Evaluator{
		// exact-match: boolean equality against the record's expected output.
		experiment.NewEvaluator("exact-match", func(ctx context.Context, rec dataset.Record, output any) (any, error) {
			return output == rec.ExpectedOutput, nil
		}),
		// overlap: Jaccard similarity between the rune sets of the actual
		// and expected outputs.
		experiment.NewEvaluator("overlap", func(ctx context.Context, rec dataset.Record, output any) (any, error) {
			outStr, ok := output.(string)
			if !ok {
				return nil, fmt.Errorf("wanted output to be a string, got: %T", output)
			}
			expStr, ok := rec.ExpectedOutput.(string)
			if !ok {
				return nil, fmt.Errorf("wanted expectedOutput to be a string, got: %T", rec.ExpectedOutput)
			}
			outSet := make(map[rune]struct{})
			for _, r := range outStr {
				outSet[r] = struct{}{}
			}
			expSet := make(map[rune]struct{})
			for _, r := range expStr {
				expSet[r] = struct{}{}
			}
			// Intersection size
			intersection := 0
			for r := range outSet {
				if _, ok := expSet[r]; ok {
					intersection++
				}
			}
			// |A ∪ B| = |A| + |B| − |A ∩ B|
			union := len(outSet) + len(expSet) - intersection
			// Jaccard similarity. Define both-empty as a perfect match.
			var score float64
			if union == 0 {
				score = 1.0
			} else {
				score = float64(intersection) / float64(union)
			}
			return score, nil
		}),
		// fake-llm-as-a-judge: stand-in for an evaluator that would call an
		// LLM to grade the output.
		experiment.NewEvaluator("fake-llm-as-a-judge", func(ctx context.Context, rec dataset.Record, output any) (any, error) {
			return "excellent", nil
		}),
	}

	exp, err := experiment.New(
		"my-experiment",
		task,
		ds,
		evs,
		experiment.WithDescription("Testing capital cities knowledge"),
		experiment.WithExperimentConfig(
			map[string]any{
				"model_name": "gpt-4",
				"version":    "1.0",
			},
		),
	)
	if err != nil {
		log.Fatal(err)
	}

	results, err := exp.Run(context.TODO())
	if err != nil {
		log.Fatal(err)
	}

	// Print one line per field. The original example omitted the trailing
	// "\n" on every Printf, which ran all output together on a single line.
	for _, res := range results.Results {
		fmt.Printf("Record ID: %s\n", res.Record.ID())
		fmt.Printf("Input: %v\n", res.Record.Input)
		fmt.Printf("Expected Output: %v\n", res.Record.ExpectedOutput)
		fmt.Printf("Output: %v\n", res.Output)
		for _, ev := range res.Evaluations {
			fmt.Printf("Evaluator score (%s): %v\n", ev.Name, ev.Value)
			if ev.Error != nil {
				fmt.Printf("Evaluator error (%s): %v\n", ev.Name, ev.Error)
			}
		}
		if res.Error != nil {
			fmt.Printf("Error: %v\n", res.Error)
		}
	}
}
func (*Experiment) Run ¶
func (e *Experiment) Run(ctx context.Context, opts ...RunOption) (*ExperimentResult, error)
Run executes the experiment, running the task and evaluators on each record in the dataset, then running summary evaluators on the aggregated results.
func (*Experiment) URL ¶
func (e *Experiment) URL() string
type ExperimentResult ¶
type ExperimentResult struct {
ExperimentName string // Name of the experiment that produced these results
DatasetName string // Name of the dataset the experiment ran against
Results []*RecordResult // Per-record task outputs and evaluations
SummaryEvaluations []*Evaluation // Outputs of the summary evaluators, run over all record results
}
ExperimentResult represents the complete results of an experiment run.
type Option ¶
type Option func(cfg *newCfg)
func WithDescription ¶
func WithExperimentConfig ¶
func WithProjectName ¶
func WithSummaryEvaluators ¶
func WithSummaryEvaluators(summaryEvaluators ...SummaryEvaluator) Option
WithSummaryEvaluators sets the summary evaluators for the experiment. Summary evaluators run after all tasks and evaluators have completed, receiving all experiment results to compute aggregate metrics.
type RecordResult ¶
type RecordResult struct {
Record *dataset.Record // The dataset record containing input, expected output, and metadata
Output any // The task output for this record
Evaluations []*Evaluation // Evaluation results for this record
// Experiment execution metadata
RecordIndex int // Index of the record in the dataset
SpanID string // Span ID for tracing
TraceID string // Trace ID for tracing
Timestamp time.Time // When the task was executed
Error error // Any error that occurred during task execution
}
RecordResult represents an experiment result for a single record.
type RunOption ¶
type RunOption func(cfg *runCfg)
func WithAbortOnError ¶
func WithMaxConcurrency ¶
func WithSampleSize ¶
type SummaryEvaluator ¶
type SummaryEvaluator interface {
// Name returns the summary evaluator's name, used to label its result.
Name() string
// Run computes an aggregate value from all per-record results of the
// experiment run, after every task and evaluator has completed.
Run(ctx context.Context, results []*RecordResult) (any, error)
}
SummaryEvaluator represents a summary evaluator for an Experiment. Summary evaluators run after all tasks and evaluators have completed, receiving all experiment results to compute aggregate metrics.
func NewSummaryEvaluator ¶
func NewSummaryEvaluator(name string, fn SummaryEvaluatorFunc) SummaryEvaluator
NewSummaryEvaluator creates a new SummaryEvaluator.
type SummaryEvaluatorFunc ¶
type SummaryEvaluatorFunc func(ctx context.Context, results []*RecordResult) (any, error)
SummaryEvaluatorFunc is the type for SummaryEvaluator functions.