Documentation
¶
Index ¶
Examples ¶
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type Evaluation ¶
Evaluation represents the output of an evaluator.
type Evaluator ¶
type Evaluator interface {
// Name returns the evaluator's name, used to label its evaluation results.
Name() string
// Run scores the task output produced for the given dataset record and
// returns the evaluation value (e.g. a bool or float score) or an error.
Run(ctx context.Context, rec dataset.Record, output any) (any, error)
}
Evaluator represents an evaluator for an Experiment.
func NewEvaluator ¶
func NewEvaluator(name string, fn EvaluatorFunc) Evaluator
NewEvaluator creates a new Evaluator.
type EvaluatorFunc ¶
EvaluatorFunc is the type for Evaluator functions.
type Experiment ¶
type Experiment struct {
Name string // Name of the experiment, as passed to New
// contains filtered or unexported fields
}
Experiment represents a Datadog LLM Observability experiment.
func New ¶
func New(name string, task Task, ds *dataset.Dataset, evaluators []Evaluator, opts ...Option) (*Experiment, error)
New creates a new Experiment.
Example ¶
package main
import (
"context"
"fmt"
"log"
"strings"
"github.com/DataDog/dd-trace-go/v2/ddtrace/tracer"
"github.com/DataDog/dd-trace-go/v2/llmobs/dataset"
"github.com/DataDog/dd-trace-go/v2/llmobs/experiment"
)
// main demonstrates a complete LLM Observability experiment run: it enables
// LLM Observability on the tracer, pulls a dataset, defines a task and three
// evaluators, runs the experiment, and prints every per-record result.
func main() {
	// LLM Observability must be enabled on the tracer before running experiments.
	if err := tracer.Start(tracer.WithLLMObsEnabled(true)); err != nil {
		log.Fatal(err)
	}
	defer tracer.Stop()

	// Pull the dataset the experiment will iterate over.
	ds, err := dataset.Pull(context.TODO(), "capitals-of-the-world")
	if err != nil {
		log.Fatal(err)
	}

	// The task produces an output for each dataset record.
	task := experiment.NewTask("capitals-of-the-world", func(ctx context.Context, rec dataset.Record, experimentCfg map[string]any) (any, error) {
		// Guard the type assertions so malformed records surface as task
		// errors instead of panics (matches the comma-ok style used in the
		// evaluators below).
		inputMap, ok := rec.Input.(map[string]any)
		if !ok {
			return nil, fmt.Errorf("wanted record input to be a map, got: %T", rec.Input)
		}
		question, ok := inputMap["question"].(string)
		if !ok {
			return nil, fmt.Errorf("wanted question to be a string, got: %T", inputMap["question"])
		}
		// Your LLM or processing logic here
		if strings.Contains(question, "China") {
			return "Beijing", nil
		}
		return "Unknown", nil
	})

	evs := []experiment.Evaluator{
		// exact-match: boolean equality against the record's expected output.
		experiment.NewEvaluator("exact-match", func(ctx context.Context, rec dataset.Record, output any) (any, error) {
			return output == rec.ExpectedOutput, nil
		}),
		// overlap: Jaccard similarity between the rune sets of the actual
		// and expected outputs.
		experiment.NewEvaluator("overlap", func(ctx context.Context, rec dataset.Record, output any) (any, error) {
			outStr, ok := output.(string)
			if !ok {
				return nil, fmt.Errorf("wanted output to be a string, got: %T", output)
			}
			expStr, ok := rec.ExpectedOutput.(string)
			if !ok {
				return nil, fmt.Errorf("wanted expectedOutput to be a string, got: %T", rec.ExpectedOutput)
			}
			outSet := make(map[rune]struct{})
			for _, r := range outStr {
				outSet[r] = struct{}{}
			}
			expSet := make(map[rune]struct{})
			for _, r := range expStr {
				expSet[r] = struct{}{}
			}
			// Intersection size
			intersection := 0
			for r := range outSet {
				if _, ok := expSet[r]; ok {
					intersection++
				}
			}
			// |A ∪ B| = |A| + |B| − |A ∩ B|
			union := len(outSet) + len(expSet) - intersection
			// Jaccard similarity. Define both-empty as a perfect match.
			var score float64
			if union == 0 {
				score = 1.0
			} else {
				score = float64(intersection) / float64(union)
			}
			return score, nil
		}),
		// fake-llm-as-a-judge: stand-in for an evaluator that would call an
		// LLM to grade the output.
		experiment.NewEvaluator("fake-llm-as-a-judge", func(ctx context.Context, rec dataset.Record, output any) (any, error) {
			return "excellent", nil
		}),
	}

	exp, err := experiment.New(
		"my-experiment",
		task,
		ds,
		evs,
		experiment.WithDescription("Testing capital cities knowledge"),
		experiment.WithExperimentConfig(
			map[string]any{
				"model_name": "gpt-4",
				"version":    "1.0",
			},
		),
	)
	if err != nil {
		log.Fatal(err)
	}

	results, err := exp.Run(context.TODO())
	if err != nil {
		log.Fatal(err)
	}

	// Print one line per field. The original example omitted the trailing
	// "\n" on every Printf, which ran all output together on a single line.
	for _, res := range results.Results {
		fmt.Printf("Record ID: %s\n", res.Record.ID())
		fmt.Printf("Input: %v\n", res.Record.Input)
		fmt.Printf("Expected Output: %v\n", res.Record.ExpectedOutput)
		fmt.Printf("Output: %v\n", res.Output)
		for _, ev := range res.Evaluations {
			fmt.Printf("Evaluator score (%s): %v\n", ev.Name, ev.Value)
			if ev.Error != nil {
				fmt.Printf("Evaluator error (%s): %v\n", ev.Name, ev.Error)
			}
		}
		if res.Error != nil {
			fmt.Printf("Error: %v\n", res.Error)
		}
	}
}
func (*Experiment) Run ¶
func (e *Experiment) Run(ctx context.Context, opts ...RunOption) (*ExperimentResult, error)
Run executes the experiment, running the task and evaluators on each record in the dataset, then running summary evaluators on the aggregated results.
func (*Experiment) URL ¶
func (e *Experiment) URL() string
type ExperimentResult ¶
type ExperimentResult struct {
ExperimentName string // Name of the experiment that produced these results
DatasetName string // Name of the dataset the experiment ran against
Results []*RecordResult // Per-record task outputs and evaluations
SummaryEvaluations []*Evaluation // Outputs of the summary evaluators, run over all record results
}
ExperimentResult represents the complete results of an experiment run.
type Option ¶
type Option func(cfg *newCfg)
func WithDescription ¶
func WithExperimentConfig ¶
func WithProjectName ¶
func WithSummaryEvaluators ¶
func WithSummaryEvaluators(summaryEvaluators ...SummaryEvaluator) Option
WithSummaryEvaluators sets the summary evaluators for the experiment. Summary evaluators run after all tasks and evaluators have completed, receiving all experiment results to compute aggregate metrics.
type RecordResult ¶
type RecordResult struct {
Record *dataset.Record // The dataset record containing input, expected output, and metadata
Output any // The task output for this record
Evaluations []*Evaluation // Evaluation results for this record
// Experiment execution metadata
RecordIndex int // Index of the record in the dataset
SpanID string // Span ID for tracing
TraceID string // Trace ID for tracing
Timestamp time.Time // When the task was executed
Error error // Any error that occurred during task execution
}
RecordResult represents an experiment result for a single record.
type RunOption ¶
type RunOption func(cfg *runCfg)
func WithAbortOnError ¶
func WithMaxConcurrency ¶
func WithSampleSize ¶
type SummaryEvaluator ¶
type SummaryEvaluator interface {
// Name returns the summary evaluator's name, used to label its result.
Name() string
// Run computes an aggregate value from all per-record results of the
// experiment run, after every task and evaluator has completed.
Run(ctx context.Context, results []*RecordResult) (any, error)
}
SummaryEvaluator represents a summary evaluator for an Experiment. Summary evaluators run after all tasks and evaluators have completed, receiving all experiment results to compute aggregate metrics.
func NewSummaryEvaluator ¶
func NewSummaryEvaluator(name string, fn SummaryEvaluatorFunc) SummaryEvaluator
NewSummaryEvaluator creates a new SummaryEvaluator.
type SummaryEvaluatorFunc ¶
type SummaryEvaluatorFunc func(ctx context.Context, results []*RecordResult) (any, error)
SummaryEvaluatorFunc is the type for SummaryEvaluator functions.