Documentation
¶
Index ¶
Examples ¶
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type Evaluation ¶
Evaluation represents the output of an evaluator.
type Evaluator ¶
type Evaluator interface {
// Name returns the evaluator's name. Presumably this is the name passed to
// NewEvaluator and the value surfaced as Evaluation.Name in results — confirm.
Name() string
// Run evaluates the task output produced for the dataset record rec and
// returns the evaluation value, or an error if the evaluation could not be
// computed. The value is untyped; the New example returns a bool, a float64,
// and a string from different evaluators.
Run(ctx context.Context, rec dataset.Record, output any) (any, error)
}
Evaluator represents an evaluator for an Experiment.
func NewEvaluator ¶
func NewEvaluator(name string, fn EvaluatorFunc) Evaluator
NewEvaluator creates a new Evaluator.
type EvaluatorFunc ¶
EvaluatorFunc is the type for Evaluator functions.
type Experiment ¶
type Experiment struct {
// Name is the experiment's name — presumably the name argument given to New
// (confirm against the constructor).
Name string
// contains filtered or unexported fields
}
Experiment represents a Datadog LLM Observability experiment.
func New ¶
func New(name string, task Task, ds *dataset.Dataset, evaluators []Evaluator, opts ...Option) (*Experiment, error)
New creates a new Experiment.
Example ¶
package main
import (
"context"
"fmt"
"log"
"strings"
"github.com/DataDog/dd-trace-go/v2/ddtrace/tracer"
"github.com/DataDog/dd-trace-go/v2/llmobs/dataset"
"github.com/DataDog/dd-trace-go/v2/llmobs/experiment"
)
// main runs the example experiment and terminates the process with a logged
// error if any step fails.
//
// All of the work is delegated to run so that run's deferred tracer.Stop is
// executed before log.Fatal ends the process: log.Fatal exits via os.Exit,
// which does not run deferred calls.
func main() {
	if err := run(); err != nil {
		log.Fatal(err)
	}
}

// run starts the tracer with LLM Observability enabled, pulls the
// "capitals-of-the-world" dataset, runs an experiment made of one task and
// three evaluators against it, and prints every record result.
//
// It returns the first error encountered while starting the tracer, pulling
// the dataset, creating the experiment, or running it.
func run() error {
	if err := tracer.Start(tracer.WithLLMObsEnabled(true)); err != nil {
		return fmt.Errorf("starting tracer: %w", err)
	}
	defer tracer.Stop()

	ds, err := dataset.Pull(context.TODO(), "capitals-of-the-world")
	if err != nil {
		return fmt.Errorf("pulling dataset: %w", err)
	}

	task := experiment.NewTask("capitals-of-the-world", func(ctx context.Context, rec dataset.Record, experimentCfg map[string]any) (any, error) {
		// Use checked assertions: a record with an unexpected shape should be
		// reported as a task error for that record, not panic the whole run.
		inputMap, ok := rec.Input.(map[string]any)
		if !ok {
			return nil, fmt.Errorf("wanted input to be a map[string]any, got: %T", rec.Input)
		}
		question, ok := inputMap["question"].(string)
		if !ok {
			return nil, fmt.Errorf("wanted input question to be a string, got: %T", inputMap["question"])
		}

		// Your LLM or processing logic here
		if strings.Contains(question, "China") {
			return "Beijing", nil
		}
		return "Unknown", nil
	})

	evs := []experiment.Evaluator{
		// exact-match yields a bool: whether the output equals the expected output.
		experiment.NewEvaluator("exact-match", func(ctx context.Context, rec dataset.Record, output any) (any, error) {
			return output == rec.ExpectedOutput, nil
		}),
		// overlap yields a float64: the Jaccard similarity between the sets of
		// runes found in the output and in the expected output.
		experiment.NewEvaluator("overlap", func(ctx context.Context, rec dataset.Record, output any) (any, error) {
			outStr, ok := output.(string)
			if !ok {
				return nil, fmt.Errorf("wanted output to be a string, got: %T", output)
			}
			expStr, ok := rec.ExpectedOutput.(string)
			if !ok {
				return nil, fmt.Errorf("wanted expectedOutput to be a string, got: %T", rec.ExpectedOutput)
			}

			outSet := make(map[rune]struct{})
			for _, r := range outStr {
				outSet[r] = struct{}{}
			}
			expSet := make(map[rune]struct{})
			for _, r := range expStr {
				expSet[r] = struct{}{}
			}

			// Intersection size
			intersection := 0
			for r := range outSet {
				if _, ok := expSet[r]; ok {
					intersection++
				}
			}

			// |A ∪ B| = |A| + |B| − |A ∩ B|
			union := len(outSet) + len(expSet) - intersection

			// Jaccard similarity. Define both-empty as a perfect match.
			if union == 0 {
				return 1.0, nil
			}
			return float64(intersection) / float64(union), nil
		}),
		// fake-llm-as-a-judge yields a string label, standing in for a
		// model-produced verdict.
		experiment.NewEvaluator("fake-llm-as-a-judge", func(ctx context.Context, rec dataset.Record, output any) (any, error) {
			return "excellent", nil
		}),
	}

	exp, err := experiment.New(
		"my-experiment",
		task,
		ds,
		evs,
		experiment.WithDescription("Testing capital cities knowledge"),
		experiment.WithExperimentConfig(
			map[string]any{
				"model_name": "gpt-4",
				"version":    "1.0",
			},
		),
	)
	if err != nil {
		return fmt.Errorf("creating experiment: %w", err)
	}

	results, err := exp.Run(context.TODO())
	if err != nil {
		return fmt.Errorf("running experiment: %w", err)
	}

	// Iterate over Runs rather than the deprecated top-level Results field.
	// A single-run experiment has exactly one entry, so this prints the same
	// records while also covering experiments configured with WithRuns(n).
	for _, rr := range results.Runs {
		for _, res := range rr.Results {
			fmt.Printf("Record ID: %s\n", res.Record.ID())
			fmt.Printf("Input: %v\n", res.Record.Input)
			fmt.Printf("Expected Output: %v\n", res.Record.ExpectedOutput)
			fmt.Printf("Output: %v\n", res.Output)
			for _, ev := range res.Evaluations {
				fmt.Printf("Evaluator score (%s): %v\n", ev.Name, ev.Value)
				if ev.Error != nil {
					fmt.Printf("Evaluator error (%s): %v\n", ev.Name, ev.Error)
				}
			}
			if res.Error != nil {
				fmt.Printf("Error: %v\n", res.Error)
			}
		}
	}
	return nil
}
Output:
func (*Experiment) Run ¶
func (e *Experiment) Run(ctx context.Context, opts ...RunOption) (result *ExperimentResult, retErr error)
Run executes the experiment, running the task and evaluators on each record in the dataset, then running summary evaluators on the aggregated results. When configured with WithRuns(n), the full experiment loop is executed n times.
func (*Experiment) URL ¶
func (e *Experiment) URL() string
type ExperimentResult ¶
type ExperimentResult struct {
// ExperimentName is the name of the experiment as provided to New.
ExperimentName string
// DatasetName is the name of the dataset the experiment ran against.
DatasetName string
// Runs holds the results for each run iteration, in order. For a single-run
// experiment this slice has exactly one element; for an experiment
// configured with WithRuns(n) it has one entry per iteration.
Runs []*RunResult
// Results is kept for single-run backward compatibility and points to Runs[0].Results.
//
// Deprecated: Use Runs[0].Results instead.
Results []*RecordResult
// SummaryEvaluations is kept for single-run backward compatibility and points to Runs[0].SummaryEvaluations.
//
// Deprecated: Use Runs[0].SummaryEvaluations instead.
SummaryEvaluations []*Evaluation
}
ExperimentResult represents the complete results of an experiment execution. For multi-run experiments (WithRuns > 1), Runs contains one entry per iteration.
type Option ¶
// Option configures an Experiment at construction time. Options are passed as
// the trailing variadic arguments of New; WithDescription,
// WithExperimentConfig, WithProjectName, WithRuns and WithSummaryEvaluators
// are listed under this type.
type Option func(cfg *newCfg)
func WithDescription ¶
func WithExperimentConfig ¶
func WithProjectName ¶
func WithRuns ¶ added in v2.8.0
WithRuns sets the number of times the experiment will be executed end-to-end. Each run gets a unique run ID and a 1-indexed iteration number, both propagated as tags on spans and evaluation metric events. Defaults to 1.
func WithSummaryEvaluators ¶
func WithSummaryEvaluators(summaryEvaluators ...SummaryEvaluator) Option
WithSummaryEvaluators sets the summary evaluators for the experiment. Summary evaluators run after all tasks and evaluators have completed, receiving all experiment results to compute aggregate metrics.
type RecordResult ¶
type RecordResult struct {
// Record is the dataset record containing input, expected output, and metadata.
Record *dataset.Record
// Output is the task output for this record.
Output any
// Evaluations holds the evaluation results for this record. The New example
// reads a Name, Value and Error from each entry.
Evaluations []*Evaluation
// RecordIndex is the index of the record in the dataset.
RecordIndex int
// SpanID is the span ID for tracing.
SpanID string
// TraceID is the trace ID for tracing.
TraceID string
// Timestamp is when the task was executed.
Timestamp time.Time
// Error is any error that occurred during task execution. Evaluator errors
// are not reported here; the New example reads them from the Error field of
// the individual entries in Evaluations.
Error error
}
RecordResult represents an experiment result for a single record.
type RunInfo ¶ added in v2.8.0
// RunInfo identifies one end-to-end execution of an experiment. Per the
// WithRuns documentation, each run gets a unique run ID and a 1-indexed
// iteration number, both propagated as tags on spans and evaluation metric
// events.
type RunInfo struct {
ID string // UUID uniquely identifying this run
Iteration int // 1-indexed iteration number
}
RunInfo contains metadata for a single experiment run iteration.
type RunOption ¶
// RunOption configures a single call to (*Experiment).Run. Options are passed
// as the trailing variadic arguments of Run; WithAbortOnError,
// WithMaxConcurrency and WithSampleSize are listed under this type.
type RunOption func(cfg *runCfg)
func WithAbortOnError ¶
func WithMaxConcurrency ¶
func WithSampleSize ¶
type RunResult ¶ added in v2.8.0
type RunResult struct {
// Run identifies this iteration: its unique run ID and 1-indexed iteration number.
Run RunInfo
// Results holds the per-record results produced during this iteration.
Results []*RecordResult
// SummaryEvaluations holds the summary evaluation results for this
// iteration — presumably one entry per configured summary evaluator (confirm).
SummaryEvaluations []*Evaluation
}
RunResult contains the results for a single run iteration.
type SummaryEvaluator ¶
type SummaryEvaluator interface {
// Name returns the summary evaluator's name — presumably the name passed to
// NewSummaryEvaluator (confirm).
Name() string
// Run computes an aggregate value from the record results of an experiment
// run, after all tasks and per-record evaluators have completed. It returns
// the untyped aggregate value, or an error if it could not be computed.
Run(ctx context.Context, results []*RecordResult) (any, error)
}
SummaryEvaluator represents a summary evaluator for an Experiment. Summary evaluators run after all tasks and evaluators have completed, receiving all experiment results to compute aggregate metrics.
func NewSummaryEvaluator ¶
func NewSummaryEvaluator(name string, fn SummaryEvaluatorFunc) SummaryEvaluator
NewSummaryEvaluator creates a new SummaryEvaluator.
type SummaryEvaluatorFunc ¶
// SummaryEvaluatorFunc is the function form of a summary evaluator: it
// receives the record results and returns an untyped aggregate value or an
// error. NewSummaryEvaluator pairs such a function with a name to build a
// SummaryEvaluator.
type SummaryEvaluatorFunc func(ctx context.Context, results []*RecordResult) (any, error)
SummaryEvaluatorFunc is the type for SummaryEvaluator functions.