experiment

package
v2.8.2 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: May 15, 2026 License: Apache-2.0, BSD-3-Clause, Apache-2.0 Imports: 15 Imported by: 0

Documentation

Index

Examples

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type Evaluation

type Evaluation struct {
	Name  string // evaluator name; presumably the value of Evaluator.Name (or SummaryEvaluator.Name) — confirm
	Value any    // evaluation result; presumably the first value returned by the evaluator's Run — confirm
	Error error  // evaluation failure, if any; presumably the error returned by the evaluator's Run — confirm
}

Evaluation represents the output of an evaluator.

type Evaluator

type Evaluator interface {
	// Name returns the evaluator's name.
	// NOTE(review): presumably this is what ends up in Evaluation.Name — confirm.
	Name() string
	// Run evaluates output for the dataset record rec and returns the
	// evaluation value, or an error if the evaluation could not be computed.
	// NOTE(review): output is presumably the task's output for rec — confirm.
	Run(ctx context.Context, rec dataset.Record, output any) (any, error)
}

Evaluator represents an evaluator for an Experiment.

func NewEvaluator

func NewEvaluator(name string, fn EvaluatorFunc) Evaluator

NewEvaluator creates a new Evaluator.

type EvaluatorFunc

type EvaluatorFunc func(ctx context.Context, rec dataset.Record, output any) (any, error)

EvaluatorFunc is the type for Evaluator functions.

type Experiment

type Experiment struct {
	// Name is the experiment's name.
	// NOTE(review): presumably the name passed to New — confirm.
	Name string
	// contains filtered or unexported fields
}

Experiment represents a Datadog LLM Observability experiment.

func New

func New(name string, task Task, ds *dataset.Dataset, evaluators []Evaluator, opts ...Option) (*Experiment, error)

New creates a new Experiment.

Example
package main

import (
	"context"
	"fmt"
	"log"
	"strings"

	"github.com/DataDog/dd-trace-go/v2/ddtrace/tracer"
	"github.com/DataDog/dd-trace-go/v2/llmobs/dataset"
	"github.com/DataDog/dd-trace-go/v2/llmobs/experiment"
)

// main runs the example and exits with a non-zero status if any step fails.
//
// The actual work lives in run so that its deferred tracer.Stop executes
// before log.Fatal terminates the process: log.Fatal calls os.Exit, which
// does not run deferred functions.
func main() {
	if err := run(context.Background()); err != nil {
		log.Fatal(err)
	}
}

// run starts the tracer with LLM Observability enabled, pulls the
// "capitals-of-the-world" dataset, runs an experiment with three evaluators
// against it and prints every record result of every run.
//
// It returns the first error encountered while starting the tracer, pulling
// the dataset, creating the experiment or running it.
func run(ctx context.Context) error {
	if err := tracer.Start(tracer.WithLLMObsEnabled(true)); err != nil {
		return fmt.Errorf("starting tracer: %w", err)
	}
	defer tracer.Stop()

	ds, err := dataset.Pull(ctx, "capitals-of-the-world")
	if err != nil {
		return fmt.Errorf("pulling dataset: %w", err)
	}

	task := experiment.NewTask("capitals-of-the-world", func(ctx context.Context, rec dataset.Record, experimentCfg map[string]any) (any, error) {
		// Use checked assertions so a record with an unexpected shape is
		// reported as a task error instead of panicking.
		inputMap, ok := rec.Input.(map[string]any)
		if !ok {
			return nil, fmt.Errorf("wanted input to be a map[string]any, got: %T", rec.Input)
		}
		question, ok := inputMap["question"].(string)
		if !ok {
			return nil, fmt.Errorf("wanted input[%q] to be a string, got: %T", "question", inputMap["question"])
		}
		// Your LLM or processing logic here
		if strings.Contains(question, "China") {
			return "Beijing", nil
		}
		return "Unknown", nil
	})

	evs := []experiment.Evaluator{
		experiment.NewEvaluator("exact-match", func(ctx context.Context, rec dataset.Record, output any) (any, error) {
			// Comparing interface values with == panics only when both hold
			// the same uncomparable dynamic type. The task above returns a
			// string (or nil on error), so the comparison is safe here.
			return output == rec.ExpectedOutput, nil
		}),
		experiment.NewEvaluator("overlap", func(ctx context.Context, rec dataset.Record, output any) (any, error) {
			outStr, ok := output.(string)
			if !ok {
				return nil, fmt.Errorf("wanted output to be a string, got: %T", output)
			}
			expStr, ok := rec.ExpectedOutput.(string)
			if !ok {
				return nil, fmt.Errorf("wanted expectedOutput to be a string, got: %T", rec.ExpectedOutput)
			}

			// Build the set of distinct runes of each string.
			outSet := make(map[rune]struct{})
			for _, r := range outStr {
				outSet[r] = struct{}{}
			}
			expSet := make(map[rune]struct{})
			for _, r := range expStr {
				expSet[r] = struct{}{}
			}

			// Intersection size
			intersection := 0
			for r := range outSet {
				if _, ok := expSet[r]; ok {
					intersection++
				}
			}
			// |A ∪ B| = |A| + |B| − |A ∩ B|
			union := len(outSet) + len(expSet) - intersection

			// Jaccard similarity. Define both-empty as a perfect match.
			if union == 0 {
				return 1.0, nil
			}
			return float64(intersection) / float64(union), nil
		}),
		experiment.NewEvaluator("fake-llm-as-a-judge", func(ctx context.Context, rec dataset.Record, output any) (any, error) {
			return "excellent", nil
		}),
	}

	exp, err := experiment.New(
		"my-experiment",
		task,
		ds,
		evs,
		experiment.WithDescription("Testing capital cities knowledge"),
		experiment.WithExperimentConfig(
			map[string]any{
				"model_name": "gpt-4",
				"version":    "1.0",
			},
		),
	)
	if err != nil {
		return fmt.Errorf("creating experiment: %w", err)
	}

	results, err := exp.Run(ctx)
	if err != nil {
		return fmt.Errorf("running experiment: %w", err)
	}

	// Iterate Runs rather than the deprecated ExperimentResult.Results field,
	// which only mirrors Runs[0].Results.
	for _, runRes := range results.Runs {
		for _, res := range runRes.Results {
			fmt.Printf("Record ID: %s\n", res.Record.ID())
			fmt.Printf("Input: %v\n", res.Record.Input)
			fmt.Printf("Expected Output: %v\n", res.Record.ExpectedOutput)
			fmt.Printf("Output: %v\n", res.Output)
			for _, ev := range res.Evaluations {
				fmt.Printf("Evaluator score (%s): %v\n", ev.Name, ev.Value)
				if ev.Error != nil {
					fmt.Printf("Evaluator error (%s): %v\n", ev.Name, ev.Error)
				}
			}
			if res.Error != nil {
				fmt.Printf("Error: %v\n", res.Error)
			}
		}
	}
	return nil
}

func (*Experiment) Run

func (e *Experiment) Run(ctx context.Context, opts ...RunOption) (result *ExperimentResult, retErr error)

Run executes the experiment, running the task and evaluators on each record in the dataset, then running summary evaluators on the aggregated results. When configured with WithRuns(n), the full experiment loop is executed n times.

func (*Experiment) URL

func (e *Experiment) URL() string

type ExperimentResult

type ExperimentResult struct {
	// ExperimentName is the name of the experiment as provided to New.
	ExperimentName string
	// DatasetName is the name of the dataset the experiment ran against.
	DatasetName string
	// Runs holds the results for each run iteration, in order. For a single-run
	// experiment this slice has exactly one element.
	Runs []*RunResult
	// Results is kept for single-run backward compatibility and points to Runs[0].Results.
	// For a multi-run experiment it therefore covers only the first run.
	//
	// Deprecated: Use Runs[0].Results instead.
	Results []*RecordResult
	// SummaryEvaluations is kept for single-run backward compatibility and points to Runs[0].SummaryEvaluations.
	// For a multi-run experiment it therefore covers only the first run.
	//
	// Deprecated: Use Runs[0].SummaryEvaluations instead.
	SummaryEvaluations []*Evaluation
}

ExperimentResult represents the complete results of an experiment execution. For multi-run experiments (WithRuns > 1), Runs contains one entry per iteration.

type Option

type Option func(cfg *newCfg)

func WithDescription

func WithDescription(description string) Option

func WithExperimentConfig

func WithExperimentConfig(experimentCfg map[string]any) Option

func WithProjectName

func WithProjectName(name string) Option

func WithRuns added in v2.8.0

func WithRuns(n int) Option

WithRuns sets the number of times the experiment will be executed end-to-end. Each run gets a unique run ID and a 1-indexed iteration number, both propagated as tags on spans and evaluation metric events. Defaults to 1.

func WithSummaryEvaluators

func WithSummaryEvaluators(summaryEvaluators ...SummaryEvaluator) Option

WithSummaryEvaluators sets the summary evaluators for the experiment. Summary evaluators run after all tasks and evaluators have completed, receiving all experiment results to compute aggregate metrics.

func WithTags

func WithTags(tags map[string]string) Option

type RecordResult

type RecordResult struct {
	// Record is the dataset record containing input, expected output, and metadata.
	Record *dataset.Record
	// Output is the task output for this record.
	Output any
	// Evaluations holds the evaluation results for this record.
	Evaluations []*Evaluation

	// RecordIndex is the index of the record in the dataset.
	RecordIndex int
	// SpanID is the span ID for tracing.
	SpanID string
	// TraceID is the trace ID for tracing.
	TraceID string
	// Timestamp is when the task was executed.
	Timestamp time.Time
	// Error is any error that occurred during task execution.
	Error error
}

RecordResult represents an experiment result for a single record.

type RunInfo added in v2.8.0

type RunInfo struct {
	ID        string // UUID uniquely identifying this run
	Iteration int    // 1-indexed iteration number
}

RunInfo contains metadata for a single experiment run iteration.

type RunOption

type RunOption func(cfg *runCfg)

func WithAbortOnError

func WithAbortOnError(abortOnError bool) RunOption

func WithMaxConcurrency

func WithMaxConcurrency(maxConcurrency int) RunOption

func WithSampleSize

func WithSampleSize(sampleSize int) RunOption

type RunResult added in v2.8.0

type RunResult struct {
	Run                RunInfo         // metadata (ID and iteration number) of this run
	Results            []*RecordResult // record results produced by this run
	SummaryEvaluations []*Evaluation   // presumably the summary evaluators' outputs for this run — confirm
}

RunResult contains the results for a single run iteration.

type SummaryEvaluator

type SummaryEvaluator interface {
	// Name returns the summary evaluator's name.
	// NOTE(review): presumably this is what ends up in Evaluation.Name — confirm.
	Name() string
	// Run receives the experiment's record results and returns an aggregate
	// value, or an error if it could not be computed.
	Run(ctx context.Context, results []*RecordResult) (any, error)
}

SummaryEvaluator represents a summary evaluator for an Experiment. Summary evaluators run after all tasks and evaluators have completed, receiving all experiment results to compute aggregate metrics.

func NewSummaryEvaluator

func NewSummaryEvaluator(name string, fn SummaryEvaluatorFunc) SummaryEvaluator

NewSummaryEvaluator creates a new SummaryEvaluator.

type SummaryEvaluatorFunc

type SummaryEvaluatorFunc func(ctx context.Context, results []*RecordResult) (any, error)

SummaryEvaluatorFunc is the type for SummaryEvaluator functions.

type Task

type Task interface {
	// Name returns the task's name.
	Name() string
	// Run executes the task for the dataset record rec and returns its
	// output, or an error if the task failed.
	// NOTE(review): experimentCfg is presumably the map supplied through
	// WithExperimentConfig — confirm.
	Run(ctx context.Context, rec dataset.Record, experimentCfg map[string]any) (any, error)
}

Task represents the task to run for an Experiment.

func NewTask

func NewTask(name string, fn TaskFunc) Task

NewTask creates a new Task.

type TaskFunc

type TaskFunc func(ctx context.Context, rec dataset.Record, experimentCfg map[string]any) (any, error)

TaskFunc is the type for Task functions.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL