experiment

package

v2.10.0-dev Latest Latest Go to latest Published: May 12, 2026 License: Apache-2.0, BSD-3-Clause, Apache-2.0 Imports: 15 Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/DataDog/dd-trace-go

Links

Open Source Insights

Documentation ¶

Index ¶

type Evaluation
type Evaluator
- func NewEvaluator(name string, fn EvaluatorFunc) Evaluator
type EvaluatorFunc
type Experiment
- func New(name string, task Task, ds *dataset.Dataset, evaluators []Evaluator, ...) (*Experiment, error)
- func (e *Experiment) Run(ctx context.Context, opts ...RunOption) (result *ExperimentResult, retErr error)
- func (e *Experiment) URL() string
type ExperimentResult
type Option
- func WithDescription(description string) Option
- func WithExperimentConfig(experimentCfg map[string]any) Option
- func WithProjectName(name string) Option
- func WithRuns(n int) Option
- func WithSummaryEvaluators(summaryEvaluators ...SummaryEvaluator) Option
- func WithTags(tags map[string]string) Option
type RecordResult
type RunInfo
type RunOption
- func WithAbortOnError(abortOnError bool) RunOption
- func WithMaxConcurrency(maxConcurrency int) RunOption
- func WithSampleSize(sampleSize int) RunOption
type RunResult
type SummaryEvaluator
- func NewSummaryEvaluator(name string, fn SummaryEvaluatorFunc) SummaryEvaluator
type SummaryEvaluatorFunc
type Task
- func NewTask(name string, fn TaskFunc) Task
type TaskFunc

Examples ¶

New

Constants ¶

This section is empty.

Variables ¶

This section is empty.

Functions ¶

This section is empty.

Types ¶

type Evaluation ¶

// Evaluation represents the output of an evaluator.
type Evaluation struct {
	Name  string // presumably the name of the evaluator that produced this result — confirm against the implementation
	Value any    // presumably the value returned by the evaluator (the package example prints it as the score) — confirm
	Error error  // presumably the error returned by the evaluator, nil on success — confirm
}

Evaluation represents the output of an evaluator.

type Evaluator ¶

// Evaluator represents an evaluator for an Experiment.
type Evaluator interface {
	// Name returns a string identifying the evaluator (presumably the name
	// given to NewEvaluator — confirm against the implementation).
	Name() string
	// Run has the same signature as EvaluatorFunc: it receives a dataset
	// record and an output value (presumably the task's output for that
	// record — confirm) and returns an evaluation value or an error.
	Run(ctx context.Context, rec dataset.Record, output any) (any, error)
}

Evaluator represents an evaluator for an Experiment.

type EvaluatorFunc ¶

type EvaluatorFunc func(ctx context.Context, rec dataset.Record, output any) (any, error)

EvaluatorFunc is the type for Evaluator functions.

type Experiment ¶

type Experiment struct {
	Name string
	// contains filtered or unexported fields
}

Experiment represents a Datadog LLM Observability experiment.

func New ¶

func New(name string, task Task, ds *dataset.Dataset, evaluators []Evaluator, opts ...Option) (*Experiment, error)

New creates a new Experiment.

Example ¶

package main

import (
	"context"
	"fmt"
	"log"
	"strings"

	"github.com/DataDog/dd-trace-go/v2/ddtrace/tracer"
	"github.com/DataDog/dd-trace-go/v2/llmobs/dataset"
	"github.com/DataDog/dd-trace-go/v2/llmobs/experiment"
)

// main demonstrates an end-to-end experiment: it starts the tracer with LLM
// Observability enabled, pulls a dataset, defines a task and three evaluators,
// runs the experiment, and prints the per-record results of every run.
func main() {
	if err := tracer.Start(tracer.WithLLMObsEnabled(true)); err != nil {
		log.Fatal(err)
	}
	// NOTE: log.Fatal exits the process without running deferred calls, so
	// tracer.Stop is skipped on the error paths below.
	defer tracer.Stop()

	ds, err := dataset.Pull(context.TODO(), "capitals-of-the-world")
	if err != nil {
		log.Fatal(err)
	}

	task := experiment.NewTask("capitals-of-the-world", func(ctx context.Context, rec dataset.Record, experimentCfg map[string]any) (any, error) {
		// Use checked type assertions so an unexpected record shape is
		// reported as a task error instead of panicking.
		inputMap, ok := rec.Input.(map[string]any)
		if !ok {
			return nil, fmt.Errorf("wanted input to be a map[string]any, got: %T", rec.Input)
		}
		question, ok := inputMap["question"].(string)
		if !ok {
			return nil, fmt.Errorf("wanted input %q to be a string, got: %T", "question", inputMap["question"])
		}
		// Your LLM or processing logic here
		if strings.Contains(question, "China") {
			return "Beijing", nil
		}
		return "Unknown", nil
	})

	evs := []experiment.Evaluator{
		// exact-match: true when the output equals the expected output.
		experiment.NewEvaluator("exact-match", func(ctx context.Context, rec dataset.Record, output any) (any, error) {
			return output == rec.ExpectedOutput, nil
		}),
		// overlap: Jaccard similarity between the rune sets of the output
		// and the expected output.
		experiment.NewEvaluator("overlap", func(ctx context.Context, rec dataset.Record, output any) (any, error) {
			outStr, ok := output.(string)
			if !ok {
				return nil, fmt.Errorf("wanted output to be a string, got: %T", output)
			}
			expStr, ok := rec.ExpectedOutput.(string)
			if !ok {
				return nil, fmt.Errorf("wanted expectedOutput to be a string, got: %T", rec.ExpectedOutput)
			}

			outSet := make(map[rune]struct{})
			for _, r := range outStr {
				outSet[r] = struct{}{}
			}
			expSet := make(map[rune]struct{})
			for _, r := range expStr {
				expSet[r] = struct{}{}
			}

			// Intersection size
			intersection := 0
			for r := range outSet {
				if _, ok := expSet[r]; ok {
					intersection++
				}
			}
			// |A ∪ B| = |A| + |B| − |A ∩ B|
			union := len(outSet) + len(expSet) - intersection

			// Jaccard similarity. Define both-empty as a perfect match.
			if union == 0 {
				return 1.0, nil
			}
			return float64(intersection) / float64(union), nil
		}),
		// fake-llm-as-a-judge: stands in for a categorical LLM-based verdict.
		experiment.NewEvaluator("fake-llm-as-a-judge", func(ctx context.Context, rec dataset.Record, output any) (any, error) {
			return "excellent", nil
		}),
	}

	exp, err := experiment.New(
		"my-experiment",
		task,
		ds,
		evs,
		experiment.WithDescription("Testing capital cities knowledge"),
		experiment.WithExperimentConfig(
			map[string]any{
				"model_name": "gpt-4",
				"version":    "1.0",
			},
		),
	)
	if err != nil {
		log.Fatal(err)
	}

	results, err := exp.Run(context.TODO())
	if err != nil {
		log.Fatal(err)
	}

	// Iterate over Runs rather than the deprecated ExperimentResult.Results
	// field; a single-run experiment has exactly one entry in Runs.
	for _, run := range results.Runs {
		for _, res := range run.Results {
			fmt.Printf("Record ID: %s\n", res.Record.ID())
			fmt.Printf("Input: %v\n", res.Record.Input)
			fmt.Printf("Expected Output: %v\n", res.Record.ExpectedOutput)
			fmt.Printf("Output: %v\n", res.Output)
			for _, ev := range res.Evaluations {
				fmt.Printf("Evaluator score (%s): %v\n", ev.Name, ev.Value)
				if ev.Error != nil {
					fmt.Printf("Evaluator error (%s): %v\n", ev.Name, ev.Error)
				}
			}
			if res.Error != nil {
				fmt.Printf("Error: %v\n", res.Error)
			}
		}
	}
}

Output:

func (*Experiment) Run ¶

func (e *Experiment) Run(ctx context.Context, opts ...RunOption) (result *ExperimentResult, retErr error)

Run executes the experiment, running the task and evaluators on each record in the dataset, then running summary evaluators on the aggregated results. When configured with WithRuns(n), the full experiment loop is executed n times.

func (*Experiment) URL ¶

func (e *Experiment) URL() string

type ExperimentResult ¶

type ExperimentResult struct {
	// ExperimentName is the name of the experiment as provided to New.
	ExperimentName string
	// DatasetName is the name of the dataset the experiment ran against.
	DatasetName string
	// Runs holds the results for each run iteration, in order. For a single-run
	// experiment this slice has exactly one element.
	Runs []*RunResult
	// Results is kept for single-run backward compatibility and points to Runs[0].Results.
	//
	// Deprecated: Use Runs[0].Results instead.
	Results []*RecordResult
	// SummaryEvaluations is kept for single-run backward compatibility and points to Runs[0].SummaryEvaluations.
	//
	// Deprecated: Use Runs[0].SummaryEvaluations instead.
	SummaryEvaluations []*Evaluation
}

ExperimentResult represents the complete results of an experiment execution. For multi-run experiments (configured via WithRuns(n) with n > 1), Runs contains one entry per iteration.

type Option ¶

// Option is a functional option accepted by New; it operates on the
// package's unexported newCfg.
type Option func(cfg *newCfg)

func WithDescription ¶

func WithDescription(description string) Option

func WithExperimentConfig ¶

func WithExperimentConfig(experimentCfg map[string]any) Option

func WithProjectName ¶

func WithProjectName(name string) Option

func WithRuns ¶ added in v2.8.0

func WithRuns(n int) Option

WithRuns sets the number of times the experiment will be executed end-to-end. Each run gets a unique run ID and a 1-indexed iteration number, both propagated as tags on spans and evaluation metric events. Defaults to 1.

func WithSummaryEvaluators ¶

func WithSummaryEvaluators(summaryEvaluators ...SummaryEvaluator) Option

WithSummaryEvaluators sets the summary evaluators for the experiment. Summary evaluators run after all tasks and evaluators have completed, receiving all experiment results to compute aggregate metrics.

func WithTags ¶

func WithTags(tags map[string]string) Option

type RecordResult ¶

type RecordResult struct {
	// Record is the dataset record containing input, expected output, and metadata.
	Record *dataset.Record
	// Output is the task output for this record.
	Output any
	// Evaluations holds the evaluation results for this record.
	Evaluations []*Evaluation

	// RecordIndex is the index of the record in the dataset.
	RecordIndex int
	// SpanID is the span ID for tracing.
	SpanID string
	// TraceID is the trace ID for tracing.
	TraceID string
	// Timestamp is when the task was executed.
	Timestamp time.Time
	// Error is any error that occurred during task execution.
	Error error
}

RecordResult represents an experiment result for a single record.

type RunInfo ¶ added in v2.8.0

type RunInfo struct {
	ID        string // UUID uniquely identifying this run
	Iteration int    // 1-indexed iteration number
}

RunInfo contains metadata for a single experiment run iteration.

type RunOption ¶

// RunOption is a functional option accepted by Experiment.Run; it operates on
// the package's unexported runCfg.
type RunOption func(cfg *runCfg)

func WithAbortOnError ¶

func WithAbortOnError(abortOnError bool) RunOption

func WithMaxConcurrency ¶

func WithMaxConcurrency(maxConcurrency int) RunOption

func WithSampleSize ¶

func WithSampleSize(sampleSize int) RunOption

type RunResult ¶ added in v2.8.0

// RunResult contains the results for a single run iteration.
type RunResult struct {
	Run                RunInfo         // metadata (ID and iteration number) for this run
	Results            []*RecordResult // per-record results for this run
	SummaryEvaluations []*Evaluation   // presumably the outputs of the summary evaluators for this run — confirm
}

RunResult contains the results for a single run iteration.

type SummaryEvaluator ¶

// SummaryEvaluator represents a summary evaluator for an Experiment.
type SummaryEvaluator interface {
	// Name returns a string identifying the summary evaluator (presumably the
	// name given to NewSummaryEvaluator — confirm against the implementation).
	Name() string
	// Run has the same signature as SummaryEvaluatorFunc: it receives the
	// record results and returns an aggregate value or an error.
	Run(ctx context.Context, results []*RecordResult) (any, error)
}

SummaryEvaluator represents a summary evaluator for an Experiment. Summary evaluators run after all tasks and evaluators have completed, receiving all experiment results to compute aggregate metrics.

func NewSummaryEvaluator ¶

func NewSummaryEvaluator(name string, fn SummaryEvaluatorFunc) SummaryEvaluator

NewSummaryEvaluator creates a new SummaryEvaluator.

type SummaryEvaluatorFunc ¶

type SummaryEvaluatorFunc func(ctx context.Context, results []*RecordResult) (any, error)

SummaryEvaluatorFunc is the type for SummaryEvaluator functions.

type Task ¶

// Task represents the task to run for an Experiment.
type Task interface {
	// Name returns a string identifying the task (presumably the name given
	// to NewTask — confirm against the implementation).
	Name() string
	// Run has the same signature as TaskFunc: it receives a dataset record
	// and a configuration map (presumably the one supplied through
	// WithExperimentConfig — confirm) and returns the task output or an error.
	Run(ctx context.Context, rec dataset.Record, experimentCfg map[string]any) (any, error)
}

Task represents the task to run for an Experiment.

func NewTask ¶

func NewTask(name string, fn TaskFunc) Task

NewTask creates a new Task.

type TaskFunc ¶

type TaskFunc func(ctx context.Context, rec dataset.Record, experimentCfg map[string]any) (any, error)

TaskFunc is the type for Task functions.

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL

Documentation ¶

Index ¶

Examples ¶

Constants ¶

Variables ¶

Functions ¶

Types ¶

type Evaluation ¶

type Evaluator ¶

func NewEvaluator ¶

type EvaluatorFunc ¶

type Experiment ¶

func New ¶

func (*Experiment) Run ¶

func (*Experiment) URL ¶

type ExperimentResult ¶

type Option ¶

func WithDescription ¶

func WithExperimentConfig ¶

func WithProjectName ¶

func WithRuns ¶ added in v2.8.0

func WithSummaryEvaluators ¶

func WithTags ¶

type RecordResult ¶

type RunInfo ¶ added in v2.8.0

type RunOption ¶

func WithAbortOnError ¶

func WithMaxConcurrency ¶

func WithSampleSize ¶

type RunResult ¶ added in v2.8.0

type SummaryEvaluator ¶

func NewSummaryEvaluator ¶

type SummaryEvaluatorFunc ¶

type Task ¶

func NewTask ¶

type TaskFunc ¶

Source Files ¶