Documentation
Overview ¶
Package engine orchestrates test execution across scenarios, providers, and configurations.
The engine package is the core execution layer of the Arena testing tool. It manages:
- Conversation lifecycle and message flow
- Provider and model configuration
- Telemetry and metrics collection
- Result aggregation and validation
- Concurrent test execution across multiple scenarios
Key types:
- Engine: Main orchestration struct that executes test runs
- RunResult: Contains execution results, metrics, and conversation history
- RunPlan and RunCombination: The execution plan and the individual run combinations it contains
Example usage:
eng, err := engine.NewEngineFromConfigFile("arena.yaml")
if err != nil {
    log.Fatal(err)
}
defer eng.Close()

plan, err := eng.GenerateRunPlan(nil, nil, nil, nil)
if err != nil {
    log.Fatal(err)
}
runIDs, err := eng.ExecuteRuns(ctx, plan, 4)
// Individual run results are stored in the engine's state store (GetStateStore).
Index ¶
- Constants
- func AggregateTrialResults(store *statestore.ArenaStateStore, runIDs []string, combos []RunCombination) []string
- func ApplyPerturbation(turns []config.TurnDefinition, variant PerturbationVariant) []config.TurnDefinition
- func FormatFileSize(bytes int64) string
- func FormatMediaType(mediaType string) string
- type A2AAgentInfo
- type A2ASkillInfo
- type AssertionTrialStats
- type AssertionsSummary
- type CompositeConversationExecutor
- func (ce *CompositeConversationExecutor) ExecuteConversation(ctx context.Context, req ConversationRequest) *ConversationResult
- func (ce *CompositeConversationExecutor) ExecuteConversationStream(ctx context.Context, req ConversationRequest) (<-chan ConversationStreamChunk, error)
- func (ce *CompositeConversationExecutor) GetDefaultExecutor() *DefaultConversationExecutor
- func (ce *CompositeConversationExecutor) GetDuplexExecutor() *DuplexConversationExecutor
- func (ce *CompositeConversationExecutor) GetEvalExecutor() *EvalConversationExecutor
- type ConversationExecutor
- type ConversationRequest
- type ConversationResult
- type ConversationStreamChunk
- type DefaultConversationExecutor
- type DuplexConversationExecutor
- type Engine
- func (e *Engine) Close() error
- func (e *Engine) ConfigureSessionRecordingFromConfig() error
- func (e *Engine) EnableMessageEvents()
- func (e *Engine) EnableMockProviderMode(mockConfigPath string) error
- func (e *Engine) EnableSessionRecording(recordingDir string) error
- func (e *Engine) ExecuteRuns(ctx context.Context, plan *RunPlan, concurrency int) ([]string, error)
- func (e *Engine) GenerateRunPlan(regionFilter, providerFilter, scenarioFilter, evalFilter []string) (*RunPlan, error)
- func (e *Engine) GetConfig() *config.Config
- func (e *Engine) GetRecordingDir() string
- func (e *Engine) GetRecordingPath(runID string) string
- func (e *Engine) GetStateStore() statestore.Store
- func (e *Engine) SetEventBus(bus events.Bus, opts ...EventBusOption)
- func (e *Engine) SetMetrics(collector *metrics.Collector, instanceLabels map[string]string)
- func (e *Engine) SetTracerProvider(tp trace.TracerProvider)
- type EvalConversationExecutor
- type EvalOrchestrator
- func (h *EvalOrchestrator) Clone() *EvalOrchestrator
- func (h *EvalOrchestrator) HasEvals() bool
- func (h *EvalOrchestrator) RunAssertionsAsConversationResults(ctx context.Context, assertionConfigs []assertions.AssertionConfig, ...) []assertions.ConversationValidationResult
- func (h *EvalOrchestrator) RunAssertionsAsEvals(ctx context.Context, assertionConfigs []assertions.AssertionConfig, ...) []evals.EvalResult
- func (h *EvalOrchestrator) RunConversationEvals(ctx context.Context, messages []types.Message, sessionID string) []assertions.ConversationValidationResult
- func (h *EvalOrchestrator) RunSessionEvals(ctx context.Context, messages []types.Message, sessionID string) []assertions.ConversationValidationResult
- func (h *EvalOrchestrator) RunTurnEvals(ctx context.Context, messages []types.Message, turnIndex int, sessionID string) []assertions.ConversationValidationResult
- func (h *EvalOrchestrator) SetEventBus(bus events.Bus)
- func (h *EvalOrchestrator) SetMetadata(metadata map[string]any)
- func (h *EvalOrchestrator) SetWorkflowMetadataProvider(provider WorkflowMetadataProvider)
- type EventBusOption
- type MediaOutput
- type MediaOutputStats
- type PerturbationVariant
- type RunCombination
- type RunPlan
- type RunResult
- type SelfPlayRoleInfo
- type StateStoreConfig
- type TrialGroupKey
- type TrialResults
- type WorkflowMetadataProvider
Constants ¶
const DefaultRunTimeout = 5 * time.Minute
DefaultRunTimeout is the default maximum duration for a single run execution. If a provider call hangs, the run will be canceled after this timeout, freeing up the semaphore slot for other runs.
Variables ¶
This section is empty.
Functions ¶
func AggregateTrialResults ¶ added in v1.3.10
func AggregateTrialResults(store *statestore.ArenaStateStore, runIDs []string, combos []RunCombination) []string
AggregateTrialResults groups trial run results by scenario+provider+region, computes statistical metrics, and updates the first run in each group with the aggregated TrialResults. Returns the run IDs that represent trial groups (i.e., the first run ID of each group that now carries the summary).
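The grouping step can be illustrated with a small sketch; groupKey and runRecord below are hypothetical stand-ins for the package's TrialGroupKey and stored run results:

```go
package main

import "fmt"

// groupKey mirrors the scenario+provider+region grouping described for
// AggregateTrialResults (illustrative only).
type groupKey struct{ Scenario, Provider, Region string }

type runRecord struct {
	RunID string
	Key   groupKey
}

// groupRuns collects run IDs per group in input order; the first ID of each
// group is the one that would carry the aggregated TrialResults summary.
func groupRuns(runs []runRecord) map[groupKey][]string {
	groups := make(map[groupKey][]string)
	for _, r := range runs {
		groups[r.Key] = append(groups[r.Key], r.RunID)
	}
	return groups
}

func main() {
	runs := []runRecord{
		{"run-1", groupKey{"checkout", "openai", "us"}},
		{"run-2", groupKey{"checkout", "openai", "us"}},
		{"run-3", groupKey{"checkout", "anthropic", "us"}},
	}
	groups := groupRuns(runs)
	fmt.Println(len(groups), groups[groupKey{"checkout", "openai", "us"}][0]) // 2 run-1
}
```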
func ApplyPerturbation ¶ added in v1.3.10
func ApplyPerturbation(turns []config.TurnDefinition, variant PerturbationVariant) []config.TurnDefinition
ApplyPerturbation substitutes perturbation variables in turn content. Placeholders use {key} syntax (e.g., "Book a flight from {city}" with city=NYC becomes "Book a flight from NYC").
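A minimal sketch of the documented {key} substitution, assuming simple string replacement over turn content (applyVars is a hypothetical helper, not the package's API):

```go
package main

import (
	"fmt"
	"strings"
)

// applyVars substitutes {key} placeholders in content, mirroring the
// behavior documented for ApplyPerturbation (illustrative sketch).
func applyVars(content string, subs map[string]string) string {
	for key, val := range subs {
		content = strings.ReplaceAll(content, "{"+key+"}", val)
	}
	return content
}

func main() {
	out := applyVars("Book a flight from {city}", map[string]string{"city": "NYC"})
	fmt.Println(out) // Book a flight from NYC
}
```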
func FormatFileSize ¶ added in v1.1.0
func FormatFileSize(bytes int64) string
FormatFileSize formats a byte count as a human-readable size.
func FormatMediaType ¶ added in v1.1.0
func FormatMediaType(mediaType string) string
FormatMediaType returns a human-readable label for a media type.
Types ¶
type A2AAgentInfo ¶ added in v1.3.1
type A2AAgentInfo struct {
Name string `json:"name"`
Description string `json:"description"`
Skills []A2ASkillInfo `json:"skills,omitempty"`
}
A2AAgentInfo contains metadata about an A2A agent for report rendering.
type A2ASkillInfo ¶ added in v1.3.1
type A2ASkillInfo struct {
ID string `json:"id"`
Name string `json:"name"`
Description string `json:"description,omitempty"`
Tags []string `json:"tags,omitempty"`
}
A2ASkillInfo contains metadata about a single A2A agent skill.
type AssertionTrialStats ¶ added in v1.3.10
type AssertionTrialStats struct {
// PassRate is the fraction of trials where this assertion passed.
PassRate float64 `json:"pass_rate"`
// PassCount is the number of trials where this assertion passed.
PassCount int `json:"pass_count"`
// FailCount is the number of trials where this assertion failed.
FailCount int `json:"fail_count"`
// FlakinessScore for this specific assertion.
FlakinessScore float64 `json:"flakiness_score"`
}
AssertionTrialStats holds per-assertion statistics across trial runs.
type AssertionsSummary ¶ added in v1.1.3
type AssertionsSummary struct {
Failed int `json:"failed"`
Passed bool `json:"passed"`
Results []assertions.ConversationValidationResult `json:"results"`
Total int `json:"total"`
}
AssertionsSummary matches the structure used for turn-level assertions in message metadata.
type CompositeConversationExecutor ¶ added in v1.1.6
type CompositeConversationExecutor struct {
// contains filtered or unexported fields
}
CompositeConversationExecutor routes conversation execution to the appropriate executor based on scenario configuration. It selects between:
- EvalConversationExecutor for evaluation mode (recording replay with assertions)
- DefaultConversationExecutor for standard turn-based conversations
- DuplexConversationExecutor for bidirectional streaming scenarios
func NewCompositeConversationExecutor ¶ added in v1.1.6
func NewCompositeConversationExecutor(
	defaultExecutor *DefaultConversationExecutor,
	duplexExecutor *DuplexConversationExecutor,
	evalExecutor *EvalConversationExecutor,
) *CompositeConversationExecutor
NewCompositeConversationExecutor creates a new composite executor.
func (*CompositeConversationExecutor) ExecuteConversation ¶ added in v1.1.6
func (ce *CompositeConversationExecutor) ExecuteConversation(ctx context.Context, req ConversationRequest) *ConversationResult
ExecuteConversation routes to the appropriate executor based on scenario config. If the request is for eval mode, uses EvalConversationExecutor. If the scenario has duplex configuration, uses DuplexConversationExecutor. Otherwise, uses DefaultConversationExecutor.
func (*CompositeConversationExecutor) ExecuteConversationStream ¶ added in v1.1.6
func (ce *CompositeConversationExecutor) ExecuteConversationStream(ctx context.Context, req ConversationRequest) (<-chan ConversationStreamChunk, error)
ExecuteConversationStream routes streaming execution to the appropriate executor.
func (*CompositeConversationExecutor) GetDefaultExecutor ¶ added in v1.1.6
func (ce *CompositeConversationExecutor) GetDefaultExecutor() *DefaultConversationExecutor
GetDefaultExecutor returns the default executor for direct access if needed.
func (*CompositeConversationExecutor) GetDuplexExecutor ¶ added in v1.1.6
func (ce *CompositeConversationExecutor) GetDuplexExecutor() *DuplexConversationExecutor
GetDuplexExecutor returns the duplex executor for direct access if needed.
func (*CompositeConversationExecutor) GetEvalExecutor ¶ added in v1.1.9
func (ce *CompositeConversationExecutor) GetEvalExecutor() *EvalConversationExecutor
GetEvalExecutor returns the eval executor for direct access if needed.
type ConversationExecutor ¶
type ConversationExecutor interface {
// ExecuteConversation runs a complete conversation based on scenario
ExecuteConversation(ctx context.Context, req ConversationRequest) *ConversationResult
// ExecuteConversationStream runs a conversation with streaming
ExecuteConversationStream(ctx context.Context, req ConversationRequest) (<-chan ConversationStreamChunk, error)
}
ConversationExecutor orchestrates full conversation flows.
func BuildEngineComponents ¶ added in v1.1.9
func BuildEngineComponents(cfg *config.Config) (
	providerRegistry *providers.Registry,
	promptRegistry *prompt.Registry,
	mcpRegistry *mcp.RegistryImpl,
	convExecutor ConversationExecutor,
	adapterReg *adapters.Registry,
	a2aCleanup func(),
	toolReg *tools.Registry,
	err error,
)
BuildEngineComponents builds all engine components from a loaded Config object. This function creates and initializes:
- MCP registry and tools (if configured)
- Provider registry (for main assistant)
- Prompt registry (if configured)
- Tool registry (static + MCP tools)
- Turn executor
- Self-play provider registry (if enabled)
- Self-play registry (if enabled)
- Conversation executor
This function is exported to enable programmatic creation of Arena engines without requiring file-based configuration. Users can construct a *config.Config programmatically and pass it to this function to get all required registries for use with NewEngine.
Returns all components needed to construct an Engine, or an error if any component fails to build.
type ConversationRequest ¶
type ConversationRequest struct {
// Required fields
Provider providers.Provider
Scenario *config.Scenario
Eval *config.Eval // Eval configuration (mutually exclusive with Scenario)
Config *config.Config
Region string
// Optional overrides (for future use)
Temperature *float64 // Override scenario temperature
MaxTokens *int // Override scenario max tokens
Timeout *int // Timeout in seconds
// For distributed execution and tracing (v0.2.0+)
RunID string // Unique identifier for this run
Metadata map[string]string // Additional metadata for debugging/tracing
// Event bus for runtime/TUI events
EventBus events.Bus
// State management
StateStoreConfig *StateStoreConfig // Optional state store configuration
ConversationID string // Conversation identifier for state persistence
// Per-run eval orchestrator override (for workflow scenarios that need
// isolated workflow metadata). If nil, the executor's shared orchestrator is used.
EvalOrchestrator *EvalOrchestrator
// RecordingConfig enables RecordingStage in the pipeline for message.created events.
// If nil, no recording stages are added.
RecordingConfig *stage.RecordingStageConfig
// PostTurnHook is called after each turn completes. Used by the workflow
// engine to commit deferred transitions after the pipeline finishes.
PostTurnHook func() error
}
ConversationRequest contains all data needed for conversation execution. Using a request object makes the API extensible without breaking changes.
type ConversationResult ¶
type ConversationResult struct {
Messages []types.Message // Flat list of all messages in the conversation
Cost types.CostInfo // Total cost across all messages
ToolStats *types.ToolStats // Tool usage statistics
Violations []types.ValidationError // Validation errors
MediaOutputs []MediaOutput // Media outputs generated by LLMs
// Conversation-level assertions
ConversationAssertionResults []assertions.ConversationValidationResult `json:"conv_assertions_results,omitempty"`
// Self-play metadata
SelfPlay bool `json:"self_play,omitempty"`
PersonaID string `json:"persona_id,omitempty"`
// Error handling
Error string `json:"error,omitempty"` // Error message if execution failed
Failed bool `json:"failed,omitempty"` // Whether execution failed (but partial results may be available)
}
ConversationResult contains the outcome of conversation execution.
type ConversationStreamChunk ¶
type ConversationStreamChunk struct {
// Current turn number (0-indexed)
TurnIndex int
// Delta content from this specific chunk
Delta string
// Token count (accumulated for current turn)
TokenCount int
// Finish reason for current turn (only in last chunk of turn)
FinishReason *string
// Complete conversation result (accumulated, updated with each chunk)
Result *ConversationResult
// Error if streaming failed
Error error
// Metadata
Metadata map[string]interface{}
}
ConversationStreamChunk represents a streaming chunk during conversation execution.
type DefaultConversationExecutor ¶
type DefaultConversationExecutor struct {
// contains filtered or unexported fields
}
DefaultConversationExecutor implements the ConversationExecutor interface.
func NewDefaultConversationExecutor ¶
func NewDefaultConversationExecutor(
	scriptedExecutor turnexecutors.TurnExecutor,
	selfPlayExecutor turnexecutors.TurnExecutor,
	selfPlayRegistry *selfplay.Registry,
	promptRegistry *prompt.Registry,
	evalOrchestrator *EvalOrchestrator,
) *DefaultConversationExecutor
NewDefaultConversationExecutor creates a new conversation executor.
func (*DefaultConversationExecutor) ExecuteConversation ¶
func (ce *DefaultConversationExecutor) ExecuteConversation(ctx context.Context, req ConversationRequest) *ConversationResult
ExecuteConversation runs a complete conversation based on the scenario using the new Turn model.
func (*DefaultConversationExecutor) ExecuteConversationStream ¶
func (ce *DefaultConversationExecutor) ExecuteConversationStream(ctx context.Context, req ConversationRequest) (<-chan ConversationStreamChunk, error)
ExecuteConversationStream runs a complete conversation with streaming.
type DuplexConversationExecutor ¶ added in v1.1.6
type DuplexConversationExecutor struct {
// contains filtered or unexported fields
}
DuplexConversationExecutor handles duplex (bidirectional streaming) conversations. Unlike the standard executor which processes turns sequentially, this executor establishes a persistent streaming session and handles real-time audio I/O.
func NewDuplexConversationExecutor ¶ added in v1.1.6
func NewDuplexConversationExecutor(
	selfPlayRegistry *selfplay.Registry,
	promptRegistry *prompt.Registry,
	toolRegistry *tools.Registry,
	mediaStorage storage.MediaStorageService,
	evalOrchestrator *EvalOrchestrator,
) *DuplexConversationExecutor
NewDuplexConversationExecutor creates a new duplex conversation executor.
func (*DuplexConversationExecutor) ExecuteConversation ¶ added in v1.1.6
func (de *DuplexConversationExecutor) ExecuteConversation(ctx context.Context, req ConversationRequest) *ConversationResult
ExecuteConversation runs a duplex conversation based on scenario. For duplex mode, this establishes a streaming session and processes audio turns in real-time.
func (*DuplexConversationExecutor) ExecuteConversationStream ¶ added in v1.1.6
func (de *DuplexConversationExecutor) ExecuteConversationStream(ctx context.Context, req ConversationRequest) (<-chan ConversationStreamChunk, error)
ExecuteConversationStream runs a duplex conversation with streaming output. For duplex mode, this returns chunks as they arrive from the provider.
type Engine ¶
type Engine struct {
// contains filtered or unexported fields
}
Engine manages the execution of prompt testing scenarios across multiple providers, regions, and configurations. It coordinates conversation execution, tool calling, validation, and result collection.
The engine supports both scripted conversations and self-play mode where an LLM simulates user behavior. It handles provider initialization, concurrent execution, and comprehensive result tracking including costs and tool usage.
func NewEngine ¶
func NewEngine(
	cfg *config.Config,
	providerRegistry *providers.Registry,
	promptRegistry *prompt.Registry,
	mcpRegistry *mcp.RegistryImpl,
	convExecutor ConversationExecutor,
	adapterRegistry *adapters.Registry,
) (*Engine, error)
NewEngine creates a new simulation engine from pre-built components. This is the primary constructor for the Engine and is preferred for testing where components can be created and configured independently.
This constructor uses dependency injection, accepting all required registries and executors as parameters. This makes testing easier and follows better architectural practices.
Parameters:
- cfg: Fully loaded and validated Config object
- providerRegistry: Registry for looking up providers by ID
- promptRegistry: Registry for system prompts and task types
- mcpRegistry: Registry for MCP server connections and tools
- convExecutor: Executor for full conversations
- adapterRegistry: Registry for recording adapters (used for eval enumeration)
Returns an initialized Engine ready for test execution.
func NewEngineFromConfig ¶ added in v1.1.11
NewEngineFromConfig creates a new Engine from a pre-loaded configuration. This allows CLI or programmatic callers to modify the config before engine creation.
func NewEngineFromConfigFile ¶
NewEngineFromConfigFile creates a new simulation engine from a configuration file. It loads the configuration, validates it, initializes all registries, and sets up the execution pipeline for conversation testing.
The configuration file is loaded along with all referenced resources (scenarios, providers, tools, personas), making the Config object fully self-contained.
This constructor performs all necessary initialization steps in the correct order:
1. Load and validate configuration from file (including all referenced resources)
2. Build registries from loaded resources
3. Initialize executors (turn, conversation, self-play if enabled)
4. Create Engine with all components
Note: Logger verbosity should be configured at application startup, not here. This function does not modify global logger settings.
Parameters:
- configPath: Path to the arena.yaml configuration file
Returns an initialized Engine ready for test execution, or an error if:
- Configuration file cannot be read or parsed
- Configuration validation fails
- Any resource file cannot be loaded
- Provider type is unsupported
func (*Engine) Close ¶
func (e *Engine) Close() error
Close shuts down the engine and cleans up resources. This includes closing all MCP server connections, provider HTTP clients, and the event store if session recording is enabled.
func (*Engine) ConfigureSessionRecordingFromConfig ¶ added in v1.1.6
func (e *Engine) ConfigureSessionRecordingFromConfig() error
ConfigureSessionRecordingFromConfig enables session recording if configured. It reads the recording configuration from the engine's config and enables session recording with the appropriate directory path. Returns nil if recording is not enabled in the config.
func (*Engine) EnableMessageEvents ¶ added in v1.3.22
func (e *Engine) EnableMessageEvents()
EnableMessageEvents enables RecordingStage in pipelines so message.created events are published to the event bus. This does NOT write session recordings to disk — use EnableSessionRecording for that. Requires an event bus to be configured via SetEventBus.
func (*Engine) EnableMockProviderMode ¶ added in v1.1.0
func (e *Engine) EnableMockProviderMode(mockConfigPath string) error
EnableMockProviderMode replaces all providers in the registry with mock providers. This enables testing of scenario behavior without making real API calls. Mock providers can use either file-based configuration for scenario-specific responses or default in-memory responses.
Parameters:
- mockConfigPath: Optional path to YAML configuration file for mock responses
Returns an error if the mock configuration file cannot be loaded or parsed.
func (*Engine) EnableSessionRecording ¶ added in v1.1.6
func (e *Engine) EnableSessionRecording(recordingDir string) error
EnableSessionRecording enables session recording for all runs. Recordings are stored in the specified directory as JSONL files, one file per session (using RunID as session ID). Returns an error if the directory cannot be created.
func (*Engine) ExecuteRuns ¶
func (e *Engine) ExecuteRuns(ctx context.Context, plan *RunPlan, concurrency int) ([]string, error)
ExecuteRuns executes all run combinations in the given plan. Runs are executed concurrently up to the specified concurrency limit, with run IDs collected in order matching the input plan.
Each run executes independently:
- Loads scenario and provider
- Executes conversation turns (with self-play if configured)
- Runs validators on the results
- Tracks costs, timing, and tool calls
- Saves results to StateStore
The context can be used to cancel all in-flight executions. Run IDs are returned for all combinations, with errors captured in individual RunResult (accessible via StateStore).
Parameters:
- ctx: Context for cancellation
- plan: RunPlan containing combinations to execute
- concurrency: Maximum number of simultaneous executions
Returns a slice of RunIDs in the same order as plan.Combinations, or an error if execution setup fails. Individual run errors are stored in StateStore, not returned here.
func (*Engine) GenerateRunPlan ¶
func (e *Engine) GenerateRunPlan(regionFilter, providerFilter, scenarioFilter, evalFilter []string) (*RunPlan, error)
GenerateRunPlan creates a comprehensive test execution plan from filter criteria. The plan contains all combinations of regions × providers × scenarios OR evals that match the provided filters. Scenarios and evals are mutually exclusive.
For scenarios:
- regionFilter: Empty = all regions from prompt configs (or default)
- providerFilter: Empty = all registered providers (or scenario-specified providers)
- scenarioFilter: Empty = all loaded scenarios
For evals:
- evalFilter: Empty = all loaded evals
- Regions and providers are not used (they come from recordings)
Provider selection logic (scenarios only):
1. If the scenario specifies providers: use those (intersected with the CLI filter if provided)
2. If the scenario doesn't specify providers: use all arena providers (intersected with the CLI filter)
Returns a RunPlan containing all matching combinations, ready for execution. Each combination represents one independent test run that will be executed and validated separately.
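The provider selection rules can be sketched as a pure function (selectProviders is a hypothetical helper, not the engine's implementation):

```go
package main

import "fmt"

// selectProviders sketches the documented rule: scenario-specified providers
// win over the full arena set, and either set is intersected with the CLI
// filter when one is given (illustrative only).
func selectProviders(scenarioProviders, allProviders, cliFilter []string) []string {
	base := allProviders
	if len(scenarioProviders) > 0 {
		base = scenarioProviders
	}
	if len(cliFilter) == 0 {
		return base
	}
	allowed := make(map[string]bool, len(cliFilter))
	for _, p := range cliFilter {
		allowed[p] = true
	}
	var out []string
	for _, p := range base {
		if allowed[p] {
			out = append(out, p)
		}
	}
	return out
}

func main() {
	all := []string{"openai", "anthropic", "vertex"}
	fmt.Println(selectProviders(nil, all, []string{"anthropic"}))        // [anthropic]
	fmt.Println(selectProviders([]string{"openai", "vertex"}, all, nil)) // [openai vertex]
}
```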
func (*Engine) GetRecordingDir ¶ added in v1.1.6
func (e *Engine) GetRecordingDir() string
GetRecordingDir returns the directory where session recordings are stored. Returns the empty string if recording is not enabled.
func (*Engine) GetRecordingPath ¶ added in v1.1.6
func (e *Engine) GetRecordingPath(runID string) string
GetRecordingPath returns the path to the recording file for a given run ID. Returns the empty string if recording is not enabled.
func (*Engine) GetStateStore ¶
func (e *Engine) GetStateStore() statestore.Store
GetStateStore returns the engine's state store for accessing run results.
func (*Engine) SetEventBus ¶ added in v1.1.4
func (e *Engine) SetEventBus(bus events.Bus, opts ...EventBusOption)
SetEventBus configures the shared event bus used for runtime and TUI observability. If session recording is enabled, the event store is subscribed to the bus.
func (*Engine) SetMetrics ¶ added in v1.3.20
func (e *Engine) SetMetrics(collector *metrics.Collector, instanceLabels map[string]string)
SetMetrics configures Prometheus metrics collection for the engine. When set, a MetricContext is created and subscribed to the event bus, recording provider call durations, token counts, costs, and tool call metrics. An event bus must be configured via SetEventBus before calling this method.
func (*Engine) SetTracerProvider ¶ added in v1.3.20
func (e *Engine) SetTracerProvider(tp trace.TracerProvider)
SetTracerProvider configures OpenTelemetry distributed tracing for the engine. When set, an OTelEventListener is created and subscribed to the event bus, converting provider call, tool call, and pipeline events into OTel spans. An event bus must be configured via SetEventBus before calling this method.
type EvalConversationExecutor ¶ added in v1.1.9
type EvalConversationExecutor struct {
// contains filtered or unexported fields
}
EvalConversationExecutor handles evaluation mode: replaying saved conversations with assertions. Unlike scenario execution, eval mode:
- Loads turns from recordings (no prompt building)
- Applies assertions to pre-recorded assistant messages
- Skips tool execution (tool calls are metadata only)
- Returns results in the same schema as scenario execution for output parity
func NewEvalConversationExecutor ¶ added in v1.1.9
func NewEvalConversationExecutor(
	adapterRegistry *adapters.Registry,
	promptRegistry *prompt.Registry,
	providerRegistry *providers.Registry,
	evalOrchestrator *EvalOrchestrator,
) *EvalConversationExecutor
NewEvalConversationExecutor creates a new eval conversation executor.
func (*EvalConversationExecutor) ExecuteConversation ¶ added in v1.1.9
func (e *EvalConversationExecutor) ExecuteConversation(ctx context.Context, req ConversationRequest) *ConversationResult
ExecuteConversation runs an evaluation on a saved conversation.
func (*EvalConversationExecutor) ExecuteConversationStream ¶ added in v1.1.9
func (e *EvalConversationExecutor) ExecuteConversationStream(ctx context.Context, req ConversationRequest) (<-chan ConversationStreamChunk, error)
ExecuteConversationStream runs evaluation with streaming output. For eval mode, we don't have true streaming since we're replaying, but we implement this to satisfy the interface.
type EvalOrchestrator ¶ added in v1.3.20
type EvalOrchestrator struct {
// contains filtered or unexported fields
}
EvalOrchestrator orchestrates eval and assertion execution during Arena runs.
func NewEvalOrchestrator ¶ added in v1.3.20
func NewEvalOrchestrator(
	registry *evals.EvalTypeRegistry,
	defs []evals.EvalDef,
	skipEvals bool,
	evalTypeFilter []string,
	taskType string,
) *EvalOrchestrator
NewEvalOrchestrator creates a hook for executing pack evals during Arena runs. If skipEvals is true, the runner is nil and all methods are no-ops. The evalTypeFilter, when non-empty, restricts execution to matching eval types.
func (*EvalOrchestrator) Clone ¶ added in v1.3.20
func (h *EvalOrchestrator) Clone() *EvalOrchestrator
Clone creates a shallow copy suitable for per-run use. The runner and defs are shared (immutable after construction), but metadata and workflow provider are independent. This avoids data races when concurrent runs set different workflow metadata providers.
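The clone-per-run pattern can be illustrated with a minimal stand-in type; orchestrator below is hypothetical and only mirrors the documented sharing rules (shared immutable defs, independent per-run metadata):

```go
package main

import "fmt"

// orchestrator sketches the Clone contract: defs are shared (read-only after
// construction) while per-run metadata must not be shared across goroutines.
type orchestrator struct {
	defs     []string       // shared, immutable after construction
	metadata map[string]any // per-run, independent
}

// clone returns a shallow copy with a fresh metadata map, avoiding data
// races when concurrent runs set different metadata (the real Clone may
// also copy existing metadata; this is illustrative only).
func (o *orchestrator) clone() *orchestrator {
	return &orchestrator{
		defs:     o.defs,           // share the immutable slice
		metadata: map[string]any{}, // fresh map per run
	}
}

func main() {
	base := &orchestrator{defs: []string{"tool_called"}, metadata: map[string]any{}}
	perRun := base.clone()
	perRun.metadata["workflow_state"] = "checkout"
	fmt.Println(len(base.metadata), len(perRun.metadata)) // 0 1
}
```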
func (*EvalOrchestrator) HasEvals ¶ added in v1.3.20
func (h *EvalOrchestrator) HasEvals() bool
HasEvals returns true if there are eval defs to execute.
func (*EvalOrchestrator) RunAssertionsAsConversationResults ¶ added in v1.3.20
func (h *EvalOrchestrator) RunAssertionsAsConversationResults(
	ctx context.Context,
	assertionConfigs []assertions.AssertionConfig,
	messages []types.Message,
	turnIndex int,
	sessionID string,
	trigger evals.EvalTrigger,
) []assertions.ConversationValidationResult
RunAssertionsAsConversationResults converts assertion configs to EvalDefs, runs them through the runner, and wraps results in ConversationValidationResult. The results use the original assertion type names (not pack_eval: prefixed).
func (*EvalOrchestrator) RunAssertionsAsEvals ¶ added in v1.3.20
func (h *EvalOrchestrator) RunAssertionsAsEvals(
	ctx context.Context,
	assertionConfigs []assertions.AssertionConfig,
	messages []types.Message,
	turnIndex int,
	sessionID string,
	trigger evals.EvalTrigger,
) []evals.EvalResult
RunAssertionsAsEvals converts assertion configs to EvalDefs and runs them through the runner. Returns raw EvalResults (not converted to assertion format). The trigger parameter overrides the default trigger on each converted def.
Each assertion is converted to an EvalDef with type "assertion", which the runner dispatches to AssertionEvalHandler. The wrapper resolves the inner eval handler from the registry, executes it, and applies min_score/max_score thresholds to determine pass/fail.
func (*EvalOrchestrator) RunConversationEvals ¶ added in v1.3.20
func (h *EvalOrchestrator) RunConversationEvals(ctx context.Context, messages []types.Message, sessionID string) []assertions.ConversationValidationResult
RunConversationEvals runs conversation-complete evals after all turns finish. Returns converted ConversationValidationResult entries.
func (*EvalOrchestrator) RunSessionEvals ¶ added in v1.3.20
func (h *EvalOrchestrator) RunSessionEvals(ctx context.Context, messages []types.Message, sessionID string) []assertions.ConversationValidationResult
RunSessionEvals runs session-complete evals after conversation finishes. Returns converted ConversationValidationResult entries.
func (*EvalOrchestrator) RunTurnEvals ¶ added in v1.3.20
func (h *EvalOrchestrator) RunTurnEvals(ctx context.Context, messages []types.Message, turnIndex int, sessionID string) []assertions.ConversationValidationResult
RunTurnEvals runs turn-triggered evals after a turn completes. Returns converted ConversationValidationResult entries.
func (*EvalOrchestrator) SetEventBus ¶ added in v1.3.20
func (h *EvalOrchestrator) SetEventBus(bus events.Bus)
SetEventBus configures the event bus for provider call telemetry in eval handlers. When set, an emitter is injected into each EvalContext's metadata so that LLM judge provider calls emit ProviderCallStarted/Completed/Failed events.
func (*EvalOrchestrator) SetMetadata ¶ added in v1.3.20
func (h *EvalOrchestrator) SetMetadata(metadata map[string]any)
SetMetadata sets metadata that will be injected into every EvalContext. Used to pass judge_targets, prompt_registry, and other config to eval handlers.
func (*EvalOrchestrator) SetWorkflowMetadataProvider ¶ added in v1.3.20
func (h *EvalOrchestrator) SetWorkflowMetadataProvider(provider WorkflowMetadataProvider)
SetWorkflowMetadataProvider sets the workflow state provider for eval context injection. Called per-run for workflow scenarios so assertions can access the current workflow state.
type EventBusOption ¶ added in v1.3.22
type EventBusOption func(*eventBusConfig)
EventBusOption configures optional behavior when setting the event bus.
func WithMessageEvents ¶ added in v1.3.22
func WithMessageEvents() EventBusOption
WithMessageEvents enables RecordingStage in pipelines so message.created events are published to the event bus. This does NOT write session recordings to disk — use EnableSessionRecording for that.
type MediaOutput ¶ added in v1.1.0
type MediaOutput struct {
Type string `json:"type"` // "image", "audio", "video"
MIMEType string `json:"mime_type"` // e.g., "image/jpeg", "audio/mp3"
SizeBytes int64 `json:"size_bytes"` // Size of the media file
Duration *int `json:"duration,omitempty"` // Duration in seconds (for audio/video)
Width *int `json:"width,omitempty"` // Width in pixels (for image/video)
Height *int `json:"height,omitempty"` // Height in pixels (for image/video)
FilePath string `json:"file_path,omitempty"` // Path where media was saved
Thumbnail string `json:"thumbnail,omitempty"` // Base64-encoded thumbnail for HTML reports
MessageIdx int `json:"message_index"` // Index of message containing this media
PartIdx int `json:"part_index"` // Index of content part within the message
}
MediaOutput represents media content generated by an LLM during test execution.
func CollectMediaOutputs ¶ added in v1.1.0
func CollectMediaOutputs(messages []types.Message) []MediaOutput
CollectMediaOutputs extracts media outputs from conversation messages. Returns a slice of MediaOutput for tracking in RunResult.
type MediaOutputStats ¶ added in v1.1.0
type MediaOutputStats struct {
Total int `json:"total"`
ImageCount int `json:"image_count"`
AudioCount int `json:"audio_count"`
VideoCount int `json:"video_count"`
TotalSizeBytes int64 `json:"total_size_bytes"`
ByType map[string]int `json:"by_type"`
}
MediaOutputStats contains summary statistics for media outputs.
func GetMediaOutputStatistics ¶ added in v1.1.0
func GetMediaOutputStatistics(outputs []MediaOutput) MediaOutputStats
GetMediaOutputStatistics calculates summary statistics for media outputs.
type PerturbationVariant ¶ added in v1.3.10
type PerturbationVariant struct {
// Substitutions maps placeholder names to their values for this variant.
Substitutions map[string]string
}
PerturbationVariant represents a single set of variable substitutions.
func ExpandPerturbations ¶ added in v1.3.10
func ExpandPerturbations(scenario *config.Scenario) []PerturbationVariant
ExpandPerturbations computes all perturbation variants for a scenario. It collects all perturbation maps across turns and computes the Cartesian product. Returns nil if no perturbations are defined.
type RunCombination ¶
type RunCombination struct {
Region string
ScenarioID string // For scenario-based runs
EvalID string // For eval-based runs (mutually exclusive with ScenarioID)
ProviderID string // Not used for eval runs (provider comes from recording)
RecordingRef string // For batch evals: specific recording reference ID (resolved by adapter)
TrialIndex int // Trial number (0-based) when scenario has Trials > 1
TotalTrials int // Total number of trials for this scenario (0 or 1 = single run)
// PerturbationIndex identifies which perturbation variant this run uses (-1 = no perturbation).
PerturbationIndex int
}
RunCombination represents a single test execution.
type RunPlan ¶
type RunPlan struct {
Combinations []RunCombination
}
RunPlan defines the test execution plan.
type RunResult ¶
type RunResult struct {
RunID string `json:"RunID"`
PromptPack string `json:"PromptPack"`
Region string `json:"Region"`
ScenarioID string `json:"ScenarioID"`
ProviderID string `json:"ProviderID"`
Params map[string]interface{} `json:"Params"`
Messages []types.Message `json:"Messages"`
Commit map[string]interface{} `json:"Commit"`
Cost types.CostInfo `json:"Cost"`
ToolStats *types.ToolStats `json:"ToolStats"`
Violations []types.ValidationError `json:"Violations"`
StartTime time.Time `json:"StartTime"`
EndTime time.Time `json:"EndTime"`
Duration time.Duration `json:"Duration"`
Error string `json:"Error"`
SelfPlay bool `json:"SelfPlay"`
PersonaID string `json:"PersonaID"`
UserFeedback *statestore.Feedback `json:"UserFeedback"`
SessionTags []string `json:"SessionTags"`
AssistantRole *SelfPlayRoleInfo `json:"AssistantRole"`
UserRole *SelfPlayRoleInfo `json:"UserRole"`
// Media outputs generated by LLMs during test execution
MediaOutputs []MediaOutput `json:"MediaOutputs,omitempty"`
// Session recording path (if recording was enabled)
RecordingPath string `json:"RecordingPath,omitempty"`
// Conversation-level assertions evaluated after the conversation completes (summary format)
ConversationAssertions AssertionsSummary `json:"conversation_assertions,omitempty"`
// A2A agent metadata (populated from config for report rendering)
A2AAgents []A2AAgentInfo `json:"A2AAgents,omitempty"`
// TrialResults holds aggregated statistics when a scenario is run with Trials > 1.
TrialResults *TrialResults `json:"trial_results,omitempty"`
}
RunResult contains the complete results of a single test execution.
type SelfPlayRoleInfo ¶
SelfPlayRoleInfo contains provider information for self-play roles.
type StateStoreConfig ¶
type StateStoreConfig struct {
Store interface{} // State store implementation (statestore.Store)
UserID string // User identifier (optional)
Metadata map[string]interface{} // Additional metadata to store (optional)
}
StateStoreConfig wraps the pipeline StateStore configuration for Arena.
type TrialGroupKey ¶ added in v1.3.10
TrialGroupKey identifies a unique scenario+provider+region combination for trial grouping.
type TrialResults ¶ added in v1.3.10
type TrialResults struct {
// TrialCount is the number of trials executed.
TrialCount int `json:"trial_count"`
// PassRate is the fraction of trials where all assertions passed (0.0-1.0).
PassRate float64 `json:"pass_rate"`
// FlakinessScore ranges from 0 (deterministic) to 1 (maximally flaky, 50/50).
FlakinessScore float64 `json:"flakiness_score"`
// PerAssertionStats maps assertion ID to its pass rate across trials.
PerAssertionStats map[string]AssertionTrialStats `json:"per_assertion_stats,omitempty"`
}
TrialResults holds aggregated results from multiple trial executions of a scenario.
type WorkflowMetadataProvider ¶ added in v1.3.20
WorkflowMetadataProvider is implemented by types that provide workflow state metadata for injection into the eval context during assertion evaluation.
Source Files
¶
- builder_integration.go
- composite_conversation_executor.go
- conversation_executor.go
- duplex_conversation_executor.go
- duplex_executor_assertions_integration.go
- duplex_executor_events_integration.go
- duplex_executor_pipeline_integration.go
- duplex_executor_results_integration.go
- duplex_executor_tools_integration.go
- duplex_executor_turns_integration.go
- duplex_executor_types.go
- engine.go
- eval_conversation_executor.go
- eval_orchestrator.go
- execution.go
- execution_memory_integration.go
- execution_workflow_integration.go
- interfaces.go
- media_collector.go
- perturbations.go
- trials.go
- types.go
- workflow_tool_executor.go