experiment

package
v2.3.1 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Oct 30, 2025 License: Apache-2.0, BSD-3-Clause Imports: 12 Imported by: 0

Documentation

Index

Examples

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type Evaluation

type Evaluation struct {
	Name  string
	Value any
	Error error
}

Evaluation represents the output of an evaluator.

type Evaluator

type Evaluator interface {
	Name() string
	Run(ctx context.Context, rec dataset.Record, output any) (any, error)
}

Evaluator represents an evaluator for an Experiment.

func NewEvaluator

func NewEvaluator(name string, fn EvaluatorFunc) Evaluator

NewEvaluator creates a new Evaluator.

type EvaluatorFunc

type EvaluatorFunc func(ctx context.Context, rec dataset.Record, output any) (any, error)

EvaluatorFunc is the type for Evaluator functions.

type Experiment

type Experiment struct {
	Name string
	// contains filtered or unexported fields
}

Experiment represents a DataDog LLM Observability experiment.

func New

func New(name string, task Task, ds *dataset.Dataset, evaluators []Evaluator, opts ...Option) (*Experiment, error)

New creates a new Experiment.

Example
package main

import (
	"context"
	"fmt"
	"log"
	"strings"

	"github.com/DataDog/dd-trace-go/v2/ddtrace/tracer"
	"github.com/DataDog/dd-trace-go/v2/llmobs/dataset"
	"github.com/DataDog/dd-trace-go/v2/llmobs/experiment"
)

// main demonstrates the full lifecycle of an LLM Observability experiment:
// start the tracer with LLM Observability enabled, pull a dataset, define a
// task and evaluators, run the experiment, and print the per-record results.
func main() {
	if err := tracer.Start(tracer.WithLLMObsEnabled(true)); err != nil {
		log.Fatal(err)
	}
	defer tracer.Stop()

	// A single background context for the whole example run.
	ctx := context.Background()

	ds, err := dataset.Pull(ctx, "capitals-of-the-world")
	if err != nil {
		log.Fatal(err)
	}

	// The task produces an output for each dataset record. A real task would
	// call an LLM; this one answers from a trivial string check.
	task := experiment.NewTask("capitals-of-the-world", func(ctx context.Context, rec dataset.Record, experimentCfg map[string]any) (any, error) {
		inputMap := rec.Input.(map[string]any)
		question := inputMap["question"].(string)
		// Your LLM or processing logic here
		if strings.Contains(question, "China") {
			return "Beijing", nil
		}
		return "Unknown", nil
	})

	evs := []experiment.Evaluator{
		// Boolean score: exact equality between task output and expectation.
		experiment.NewEvaluator("exact-match", func(ctx context.Context, rec dataset.Record, output any) (any, error) {
			return output == rec.ExpectedOutput, nil
		}),
		// Numeric score: Jaccard similarity over the rune sets of the two strings.
		experiment.NewEvaluator("overlap", func(ctx context.Context, rec dataset.Record, output any) (any, error) {
			outStr, ok := output.(string)
			if !ok {
				return nil, fmt.Errorf("wanted output to be a string, got: %T", output)
			}
			expStr, ok := rec.ExpectedOutput.(string)
			if !ok {
				return nil, fmt.Errorf("wanted expectedOutput to be a string, got: %T", rec.ExpectedOutput)
			}

			outSet := make(map[rune]struct{})
			for _, r := range outStr {
				outSet[r] = struct{}{}
			}
			expSet := make(map[rune]struct{})
			for _, r := range expStr {
				expSet[r] = struct{}{}
			}

			// Intersection size
			intersection := 0
			for r := range outSet {
				if _, ok := expSet[r]; ok {
					intersection++
				}
			}
			// |A ∪ B| = |A| + |B| − |A ∩ B|
			union := len(outSet) + len(expSet) - intersection

			// Jaccard similarity. Define both-empty as a perfect match.
			var score float64
			if union == 0 {
				score = 1.0
			} else {
				score = float64(intersection) / float64(union)
			}

			return score, nil
		}),
		// Categorical score: a stand-in for an LLM-as-a-judge evaluator.
		experiment.NewEvaluator("fake-llm-as-a-judge", func(ctx context.Context, rec dataset.Record, output any) (any, error) {
			return "excellent", nil
		}),
	}

	exp, err := experiment.New(
		"my-experiment",
		task,
		ds,
		evs,
		experiment.WithDescription("Testing capital cities knowledge"),
		experiment.WithExperimentConfig(
			map[string]any{
				"model_name": "gpt-4",
				"version":    "1.0",
			},
		),
	)
	if err != nil {
		log.Fatal(err)
	}

	results, err := exp.Run(ctx)
	if err != nil {
		log.Fatal(err)
	}

	// Print one line per field; the original format strings were missing the
	// trailing newline, which ran all output together on a single line.
	for _, res := range results.Results {
		fmt.Printf("Record ID: %s\n", res.Record.ID())
		fmt.Printf("Input: %v\n", res.Record.Input)
		fmt.Printf("Expected Output: %v\n", res.Record.ExpectedOutput)
		fmt.Printf("Output: %v\n", res.Output)
		for _, ev := range res.Evaluations {
			fmt.Printf("Evaluator score (%s): %v\n", ev.Name, ev.Value)
			if ev.Error != nil {
				fmt.Printf("Evaluator error (%s): %v\n", ev.Name, ev.Error)
			}
		}
		if res.Error != nil {
			fmt.Printf("Error: %v\n", res.Error)
		}
	}
}

func (*Experiment) Run

func (e *Experiment) Run(ctx context.Context, opts ...RunOption) (*ExperimentResult, error)

Run executes the experiment, running the task and evaluators on each record in the dataset, then running summary evaluators on the aggregated results.

func (*Experiment) URL

func (e *Experiment) URL() string

type ExperimentResult

type ExperimentResult struct {
	ExperimentName     string
	DatasetName        string
	Results            []*RecordResult
	SummaryEvaluations []*Evaluation
}

ExperimentResult represents the complete results of an experiment run.

type Option

type Option func(cfg *newCfg)

func WithDescription

func WithDescription(description string) Option

func WithExperimentConfig

func WithExperimentConfig(experimentCfg map[string]any) Option

func WithProjectName

func WithProjectName(name string) Option

func WithSummaryEvaluators

func WithSummaryEvaluators(summaryEvaluators ...SummaryEvaluator) Option

WithSummaryEvaluators sets the summary evaluators for the experiment. Summary evaluators run after all tasks and evaluators have completed, receiving all experiment results to compute aggregate metrics.

func WithTags

func WithTags(tags map[string]string) Option

type RecordResult

type RecordResult struct {
	Record      *dataset.Record // The dataset record containing input, expected output, and metadata
	Output      any             // The task output for this record
	Evaluations []*Evaluation   // Evaluation results for this record

	// Experiment execution metadata
	RecordIndex int       // Index of the record in the dataset
	SpanID      string    // Span ID for tracing
	TraceID     string    // Trace ID for tracing
	Timestamp   time.Time // When the task was executed
	Error       error     // Any error that occurred during task execution
}

RecordResult represents an experiment result for a single record.

type RunOption

type RunOption func(cfg *runCfg)

func WithAbortOnError

func WithAbortOnError(abortOnError bool) RunOption

func WithMaxConcurrency

func WithMaxConcurrency(maxConcurrency int) RunOption

func WithSampleSize

func WithSampleSize(sampleSize int) RunOption

type SummaryEvaluator

type SummaryEvaluator interface {
	Name() string
	Run(ctx context.Context, results []*RecordResult) (any, error)
}

SummaryEvaluator represents a summary evaluator for an Experiment. Summary evaluators run after all tasks and evaluators have completed, receiving all experiment results to compute aggregate metrics.

func NewSummaryEvaluator

func NewSummaryEvaluator(name string, fn SummaryEvaluatorFunc) SummaryEvaluator

NewSummaryEvaluator creates a new SummaryEvaluator.

type SummaryEvaluatorFunc

type SummaryEvaluatorFunc func(ctx context.Context, results []*RecordResult) (any, error)

SummaryEvaluatorFunc is the type for SummaryEvaluator functions.

type Task

type Task interface {
	Name() string
	Run(ctx context.Context, rec dataset.Record, experimentCfg map[string]any) (any, error)
}

Task represents the task to run for an Experiment.

func NewTask

func NewTask(name string, fn TaskFunc) Task

NewTask creates a new Task.

type TaskFunc

type TaskFunc func(ctx context.Context, rec dataset.Record, experimentCfg map[string]any) (any, error)

TaskFunc is the type for Task functions.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL