agent

package
v0.3.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jan 24, 2026 | License: MIT | Imports: 3 | Imported by: 0

Documentation

Overview

Package agent provides voice agent orchestration for real-time conversations.

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type Config

// Config configures a voice agent.
//
// A Config is passed by value to Provider.CreateSession when a new session
// is created.
type Config struct {
	// Name is a human-readable name for the agent.
	Name string

	// SystemPrompt is the initial system prompt for the LLM.
	SystemPrompt string

	// VoiceID is the TTS voice to use.
	// NOTE(review): valid values presumably depend on TTSProvider — confirm.
	VoiceID string

	// Language is the primary language, given as a BCP-47 code.
	Language string

	// STTProvider is the speech-to-text provider name.
	STTProvider string

	// TTSProvider is the text-to-speech provider name.
	TTSProvider string

	// LLMProvider is the LLM provider name.
	LLMProvider string

	// LLMModel is the specific LLM model to use.
	LLMModel string

	// MaxTurnDuration is the maximum duration for a single turn.
	// NOTE(review): the meaning of the zero value (no limit vs. a default)
	// is not specified here — confirm with the provider implementation.
	MaxTurnDuration time.Duration

	// MaxSessionDuration is the maximum total session duration.
	// NOTE(review): the meaning of the zero value is not specified here —
	// confirm with the provider implementation.
	MaxSessionDuration time.Duration

	// InterruptionMode controls how interruptions are handled. The defined
	// modes are InterruptImmediate, InterruptAfterSentence, and
	// InterruptDisabled.
	InterruptionMode InterruptionMode

	// Tools defines functions the agent can call.
	Tools []Tool

	// Webhooks configures event webhooks.
	Webhooks WebhookConfig
}

Config configures a voice agent.

type Event

// Event represents a session event.
//
// Events are received from the channel returned by Session.Events.
type Event struct {
	// Type is the event type (one of the EventType constants).
	Type EventType

	// Timestamp is when the event occurred.
	Timestamp time.Time

	// Data contains event-specific data.
	// NOTE(review): the concrete type stored here for each EventType is not
	// documented in this view — confirm before type-asserting.
	Data any

	// Error contains any error details.
	// NOTE(review): presumably non-nil only for EventError — confirm.
	Error error
}

Event represents a session event.

type EventType

// EventType identifies the type of session event.
//
// It is a string type; the defined values are the Event* constants.
type EventType string

EventType identifies the type of session event.

// Defined EventType values, as carried in Event.Type.
const (
	// EventSessionStarted indicates the session has started.
	EventSessionStarted EventType = "session_started"

	// EventSessionEnded indicates the session has ended.
	EventSessionEnded EventType = "session_ended"

	// EventUserSpeechStart indicates the user started speaking.
	EventUserSpeechStart EventType = "user_speech_start"

	// EventUserSpeechEnd indicates the user stopped speaking.
	EventUserSpeechEnd EventType = "user_speech_end"

	// EventUserTranscript contains user speech transcription.
	// NOTE(review): the transcription is presumably carried in Event.Data —
	// confirm its concrete type.
	EventUserTranscript EventType = "user_transcript"

	// EventAgentThinking indicates the agent is processing.
	EventAgentThinking EventType = "agent_thinking"

	// EventAgentSpeechStart indicates the agent started speaking.
	EventAgentSpeechStart EventType = "agent_speech_start"

	// EventAgentSpeechEnd indicates the agent stopped speaking.
	EventAgentSpeechEnd EventType = "agent_speech_end"

	// EventAgentTranscript contains agent response text.
	// NOTE(review): the text is presumably carried in Event.Data — confirm
	// its concrete type.
	EventAgentTranscript EventType = "agent_transcript"

	// EventToolCall indicates a tool was called.
	EventToolCall EventType = "tool_call"

	// EventInterruption indicates the user interrupted.
	EventInterruption EventType = "interruption"

	// EventError indicates an error occurred.
	EventError EventType = "error"
)

type InterruptionMode

// InterruptionMode controls how user interruptions are handled.
//
// It is a string type; the defined values are InterruptImmediate,
// InterruptAfterSentence, and InterruptDisabled.
type InterruptionMode string

InterruptionMode controls how user interruptions are handled.

// Defined InterruptionMode values, for use in Config.InterruptionMode.
const (
	// InterruptImmediate stops TTS immediately when the user speaks.
	InterruptImmediate InterruptionMode = "immediate"

	// InterruptAfterSentence finishes the current sentence before stopping.
	InterruptAfterSentence InterruptionMode = "after_sentence"

	// InterruptDisabled ignores interruptions.
	InterruptDisabled InterruptionMode = "disabled"
)

type Metrics

// Metrics contains session performance metrics.
//
// A Metrics value is returned by Session.Metrics. Fields with an Ms suffix
// are expressed in milliseconds.
type Metrics struct {
	// SessionDurationMs is the total session duration, in milliseconds.
	SessionDurationMs int

	// TurnCount is the number of conversation turns.
	TurnCount int

	// UserSpeechDurationMs is the total user speech time, in milliseconds.
	UserSpeechDurationMs int

	// AgentSpeechDurationMs is the total agent speech time, in milliseconds.
	AgentSpeechDurationMs int

	// AvgSTTLatencyMs is the average STT processing time, in milliseconds.
	AvgSTTLatencyMs int

	// AvgLLMLatencyMs is the average LLM processing time, in milliseconds.
	AvgLLMLatencyMs int

	// AvgTTSLatencyMs is the average TTS processing time, in milliseconds.
	AvgTTSLatencyMs int

	// AvgTotalLatencyMs is the average end-to-end latency, in milliseconds.
	// NOTE(review): the exact start and end points of "end-to-end" are not
	// specified here — confirm with the provider implementation.
	AvgTotalLatencyMs int

	// InterruptionCount is the number of user interruptions.
	InterruptionCount int

	// ToolCallCount is the number of tool invocations.
	ToolCallCount int

	// ErrorCount is the number of errors encountered.
	ErrorCount int
}

Metrics contains session performance metrics.

type Provider

// Provider defines the interface for voice agent providers.
//
// A Provider creates new sessions from a Config and looks up or enumerates
// existing ones.
type Provider interface {
	// Name returns the provider name.
	Name() string

	// CreateSession creates a new voice session from config.
	// NOTE(review): whether the returned Session is already started or still
	// requires a call to Session.Start is not specified here — confirm.
	CreateSession(ctx context.Context, config Config) (Session, error)

	// GetSession retrieves an existing session by ID.
	// NOTE(review): the error returned for an unknown sessionID is not
	// specified here — confirm with the implementation.
	GetSession(ctx context.Context, sessionID string) (Session, error)

	// ListSessions lists active sessions.
	// NOTE(review): the returned strings are presumably session IDs usable
	// with GetSession — confirm.
	ListSessions(ctx context.Context) ([]string, error)
}

Provider defines the interface for voice agent providers.

type Session

// Session represents an active voice conversation session.
//
// It exposes lifecycle control (Start, Stop), audio and text input, agent
// audio output, an event stream, and accessors for the transcript and
// metrics.
type Session interface {
	// ID returns the unique session identifier.
	ID() string

	// Start begins the voice session.
	Start(ctx context.Context) error

	// Stop ends the voice session gracefully.
	Stop(ctx context.Context) error

	// SendAudio sends audio data to the agent.
	// NOTE(review): the expected audio encoding and sample rate are not
	// specified here — confirm with the provider implementation.
	SendAudio(audio []byte) error

	// ReceiveAudio returns a channel for receiving agent audio.
	// NOTE(review): whether the channel is closed when the session stops is
	// not specified here — confirm before ranging over it.
	ReceiveAudio() <-chan []byte

	// SendText sends text input to the agent, bypassing STT.
	SendText(text string) error

	// Events returns a channel for session events.
	// NOTE(review): whether the channel is closed when the session stops is
	// not specified here — confirm before ranging over it.
	Events() <-chan Event

	// Transcript returns the conversation transcript so far.
	Transcript() []Turn

	// Metrics returns session performance metrics.
	Metrics() Metrics
}

Session represents an active voice conversation session.

type Tool

// Tool defines a function the voice agent can call.
//
// Tools are supplied to an agent through Config.Tools.
type Tool struct {
	// Name is the function name.
	Name string

	// Description describes what the function does.
	Description string

	// Parameters defines the function parameters as a JSON Schema document
	// represented as a map.
	Parameters map[string]any

	// Handler is called when the tool is invoked.
	// NOTE(review): behavior when Handler is nil is not specified here —
	// confirm with the provider implementation.
	Handler ToolHandler
}

Tool defines a function the voice agent can call.

type ToolCall

// ToolCall represents a tool invocation during conversation.
//
// Tool calls are recorded per turn in Turn.ToolCalls.
type ToolCall struct {
	// Name is the tool name.
	Name string

	// Arguments is the parsed arguments.
	Arguments map[string]any

	// Result is the tool result.
	Result string

	// Error is any error from the tool call, stored as a string.
	// NOTE(review): presumably empty when the call succeeded — confirm.
	Error string

	// DurationMs is the tool execution time, in milliseconds.
	DurationMs int
}

ToolCall represents a tool invocation during conversation.

type ToolHandler

// ToolHandler processes a tool call and returns a result.
//
// It receives a context and the call arguments as a map, and returns the
// result as a string or an error.
// NOTE(review): args presumably matches the schema in Tool.Parameters —
// confirm whether the caller validates it before invoking the handler.
type ToolHandler func(ctx context.Context, args map[string]any) (string, error)

ToolHandler processes a tool call and returns a result.

type TransportAdapter

// TransportAdapter adapts a transport to a voice agent session.
type TransportAdapter interface {
	// Connect connects the transport to a session.
	Connect(ctx context.Context, session Session) error

	// Disconnect disconnects the transport from the session.
	Disconnect(ctx context.Context) error

	// AudioIn returns a writer for incoming audio.
	// NOTE(review): "incoming" presumably means audio flowing from the
	// transport toward the session — confirm the direction.
	AudioIn() io.Writer

	// AudioOut returns a reader for outgoing audio.
	// NOTE(review): "outgoing" presumably means agent audio flowing from the
	// session toward the transport — confirm the direction.
	AudioOut() io.Reader
}

TransportAdapter adapts a transport to a voice agent session.

type Turn

// Turn represents a single conversation turn.
//
// The turns of a session are returned by Session.Transcript.
type Turn struct {
	// Role is "user" or "agent".
	Role string

	// Text is the transcribed (user) or generated (agent) text.
	Text string

	// Timestamp is when the turn occurred.
	Timestamp time.Time

	// DurationMs is the turn duration in milliseconds.
	DurationMs int

	// ToolCalls contains any tool calls made during this turn.
	ToolCalls []ToolCall
}

Turn represents a single conversation turn.

type WebhookConfig

// WebhookConfig configures event webhooks.
//
// Each field is a string identifying the webhook to call for one kind of
// event.
// NOTE(review): the fields are presumably URLs, and an empty string
// presumably disables that webhook — confirm with the implementation.
type WebhookConfig struct {
	// OnSessionStart is the webhook called when a session begins.
	OnSessionStart string

	// OnSessionEnd is the webhook called when a session ends.
	OnSessionEnd string

	// OnTurnComplete is the webhook called after each conversation turn.
	OnTurnComplete string

	// OnToolCall is the webhook called when a tool is invoked.
	OnToolCall string
}

WebhookConfig configures event webhooks.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL