agent

package

Version: v0.0.2 (latest) | Published: Apr 14, 2026 | License: MIT | Imports: 37 | Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/cavos-io/rtp-agent

Links

Open Source Insights

Documentation ¶

Index ¶

Constants
func PerformToolExecutions(ctx context.Context, functionCh <-chan *llm.FunctionToolCall, ...) <-chan ToolExecutionOutput
func RegisterPlugin(p Plugin)
func UploadSessionReport(cloudURL string, apiKey string, apiSecret string, agentName string, ...) error
func WithRunContext(ctx context.Context, rc *RunContext) context.Context
type Agent
- func NewAgent(instructions string) *Agent
- func (a *Agent) GetActivity() *AgentActivity
- func (a *Agent) GetAgent() *Agent
- func (a *Agent) OnEnter()
- func (a *Agent) OnExit()
- func (a *Agent) OnUserTurnCompleted(ctx context.Context, chatCtx *llm.ChatContext, newMsg *llm.ChatMessage) error
- func (a *Agent) Start(session *AgentSession, agentIntf AgentInterface) *AgentActivity
- func (a *Agent) UpdateInstructions(ctx context.Context, instructions string) error
- func (a *Agent) UpdateTools(ctx context.Context, tools []llm.Tool) error
type AgentActivity
- func NewAgentActivity(agentIntf AgentInterface, session *AgentSession) *AgentActivity
- func (a *AgentActivity) OnEndOfSpeech(ev *vad.VADEvent)
- func (a *AgentActivity) OnFinalTranscript(ev *stt.SpeechEvent)
- func (a *AgentActivity) OnStartOfSpeech(ev *vad.VADEvent)
- func (a *AgentActivity) ScheduleSpeech(speech *SpeechHandle, priority int, force bool) error
- func (a *AgentActivity) Start()
- func (a *AgentActivity) Stop()
type AgentInterface
type AgentSession
- func NewAgentSession(agent AgentInterface, room *lksdk.Room, opts AgentSessionOptions) *AgentSession
- func (s *AgentSession) GenerateReply(ctx context.Context, userInput string) error
- func (s *AgentSession) GetAgentTrackSID() string
- func (s *AgentSession) OnAudioFrame(ctx context.Context, frame *model.AudioFrame)
- func (s *AgentSession) PublishAgentTranscript(text string)
- func (s *AgentSession) PublishUserTranscript(text string)
- func (s *AgentSession) SetAgentTrackSID(sid string)
- func (s *AgentSession) SetRemoteTrackSID(sid string)
- func (s *AgentSession) SetRemoteUserIdentity(identity string)
- func (s *AgentSession) Start(ctx context.Context) error
- func (s *AgentSession) Stop(ctx context.Context) error
- func (s *AgentSession) UpdateAgentState(state AgentState)
- func (s *AgentSession) UpdateUserState(state UserState)
type AgentSessionOptions
type AgentState
type AgentStateChangedEvent
type AgentTask
- func NewAgentTask[T any](instructions string) *AgentTask[T]
- func (t *AgentTask[T]) Complete(result T)
- func (t *AgentTask[T]) Fail(err error)
- func (t *AgentTask[T]) WaitAny(ctx context.Context) (any, error)
type AudioConfig
type AudioRecognition
- func NewAudioRecognition(session *AgentSession, hooks RecognitionHooks, s stt.STT, v vad.VAD) *AudioRecognition
- func (ar *AudioRecognition) PushAudio(frame *model.AudioFrame) error
- func (ar *AudioRecognition) Start(ctx context.Context) error
type AudioSource
type Avatar
- func NewAvatar() *Avatar
- func (a *Avatar) Start(ctx context.Context) error
type AvatarIO
type AvatarRunner
- func NewAvatarRunner(io AvatarIO) *AvatarRunner
- func (r *AvatarRunner) SimulateLipSync(text string)
- func (r *AvatarRunner) Start(ctx context.Context) error
- func (r *AvatarRunner) Stop()
type AvatarState
type BackgroundAudioPlayer
- func NewBackgroundAudioPlayer(ambientSound, thinkingSound interface{}) *BackgroundAudioPlayer
- func (p *BackgroundAudioPlayer) AgentStateChanged(newState AgentState)
- func (p *BackgroundAudioPlayer) Close() error
- func (p *BackgroundAudioPlayer) Play(audio interface{}, loop bool) *PlayHandle
- func (p *BackgroundAudioPlayer) Start(room *lksdk.Room, agentSession *AgentSession) error
type BuiltinAudioClip
- func (b BuiltinAudioClip) Path() string
type ClientEventPayload
type ClientEventsDispatcher
- func NewClientEventsDispatcher(room *lksdk.Room) *ClientEventsDispatcher
- func (d *ClientEventsDispatcher) DispatchAgentState(state AgentState)
- func (d *ClientEventsDispatcher) DispatchUserState(state UserState)
type CloseEvent
- func (e *CloseEvent) GetType() string
type CloseReason
type ConversationItemAddedEvent
- func (e *ConversationItemAddedEvent) GetType() string
type DataStreamIO
- func NewDataStreamIO(room *lksdk.Room) *DataStreamIO
- func (io *DataStreamIO) SendAvatarData(ctx context.Context, data []byte) error
type DtmfEvent
type EndOfTurnInfo
type EvaluationResult
type Event
type IVRActivity
- func NewIVRActivity(agentIntf AgentInterface) *IVRActivity
- func (i *IVRActivity) OnDtmf(digit string)
- func (i *IVRActivity) SetDigitCallback(timeout time.Duration, cb func(buffer string) (bool, error))
- func (i *IVRActivity) Start()
- func (i *IVRActivity) Stop()
type InputDetails
- func DefaultInputDetails() InputDetails
type LLMGenerationData
- func PerformLLMInference(ctx context.Context, l llm.LLM, chatCtx *llm.ChatContext, tools []llm.Tool) (*LLMGenerationData, error)
type LLMTurnDetector
- func NewLLMTurnDetector(llmInstance llm.LLM) *LLMTurnDetector
- func (m *LLMTurnDetector) PredictEndOfTurn(ctx context.Context, chatCtx *llm.ChatContext) (float64, error)
type MetricsCollectedEvent
- func (e *MetricsCollectedEvent) GetType() string
type MultimodalAgent
- func NewMultimodalAgent(m llm.RealtimeModel, chatCtx *llm.ChatContext) *MultimodalAgent
- func (ma *MultimodalAgent) OnAudioFrame(ctx context.Context, frame *model.AudioFrame)
- func (ma *MultimodalAgent) Start(ctx context.Context, s *AgentSession) error
type PipelineAgent
- func NewPipelineAgent(vad vad.VAD, stt stt.STT, llmObj llm.LLM, tts tts.TTS, ...) *PipelineAgent
- func (va *PipelineAgent) OnAudioFrame(ctx context.Context, frame *model.AudioFrame)
- func (va *PipelineAgent) Start(ctx context.Context, s *AgentSession) error
type PlayHandle
- func (h *PlayHandle) Done() bool
- func (h *PlayHandle) Stop()
- func (h *PlayHandle) WaitForPlayout()
type Plugin
- func RegisteredPlugins() []Plugin
type QueueIO
- func NewQueueIO() *QueueIO
- func (io *QueueIO) ReadQueue() <-chan []byte
- func (io *QueueIO) SendAvatarData(ctx context.Context, data []byte) error
type RecognitionHooks
type RecordingOptions
type RunAssert
- func (a *RunAssert) ContainsMessage(role llm.ChatRole, content string) *RunAssert
- func (a *RunAssert) HasError() error
- func (a *RunAssert) IsFunctionCall(name string) *RunAssert
- func (a *RunAssert) Judge(ctx context.Context, evaluator evals.Evaluator, llmInstance llm.LLM) (*RunAssert, error)
type RunContext
- func GetRunContext(ctx context.Context) *RunContext
- func (r *RunContext) WaitForPlayout(ctx context.Context) error
type RunResult
- func NewRunResult(chatCtx *llm.ChatContext) *RunResult
type SessionReport
- func NewSessionReport() *SessionReport
type SpeechCreatedEvent
- func (e *SpeechCreatedEvent) GetType() string
type SpeechHandle
- func NewSpeechHandle(allowInterruptions bool, inputDetails InputDetails) *SpeechHandle
- func (s *SpeechHandle) Interrupt(force bool) error
- func (s *SpeechHandle) IsDone() bool
- func (s *SpeechHandle) IsInterrupted() bool
- func (s *SpeechHandle) IsScheduled() bool
- func (s *SpeechHandle) MarkDone()
- func (s *SpeechHandle) MarkScheduled()
- func (s *SpeechHandle) Wait(ctx context.Context) error
type TTSGenerationData
- func PerformTTSInference(ctx context.Context, t tts.TTS, textCh <-chan string) (*TTSGenerationData, error)
type Tagger
- func NewTagger() *Tagger
- func (t *Tagger) Add(tag string)
- func (t *Tagger) Evaluation(result *EvaluationResult)
- func (t *Tagger) Fail(reason string)
- func (t *Tagger) OutcomeReason() string
- func (t *Tagger) Remove(tag string)
- func (t *Tagger) Success(reason string)
- func (t *Tagger) Tags() []string
type TaskWaiter
type ToolExecutionOutput
type TranscriptSynchronizer
- func NewTranscriptSynchronizer(speakingRate float64) *TranscriptSynchronizer
- func (s *TranscriptSynchronizer) Close()
- func (s *TranscriptSynchronizer) EventCh() <-chan string
- func (s *TranscriptSynchronizer) Interrupt()
- func (s *TranscriptSynchronizer) PushAudio(frame *model.AudioFrame)
- func (s *TranscriptSynchronizer) PushText(text string)
type TranscriptionFilter
- func NewTranscriptionFilter() *TranscriptionFilter
type TurnDetectionMode
type TurnDetector
type UserInputTranscribedEvent
- func (e *UserInputTranscribedEvent) GetType() string
type UserState
type UserStateChangedEvent
type VoiceActivityVideoSampler
- func NewVoiceActivityVideoSampler(session *AgentSession, sampleRate float64, opts images.EncodeOptions) *VoiceActivityVideoSampler
- func (s *VoiceActivityVideoSampler) OnVideoFrame(ctx context.Context, frame *images.VideoFrame) bool
- func (s *VoiceActivityVideoSampler) SetSpeaking(speaking bool)

Constants ¶

View Source

const (
	UserStateSpeaking  UserState = "speaking"
	UserStateListening UserState = "listening"
	UserStateAway      UserState = "away"

	AgentStateInitializing AgentState = "initializing"
	AgentStateIdle         AgentState = "idle"
	AgentStateListening    AgentState = "listening"
	AgentStateThinking     AgentState = "thinking"
	AgentStateSpeaking     AgentState = "speaking"
)

View Source

const (
	SpeechPriorityLow    = 0
	SpeechPriorityNormal = 5
	SpeechPriorityHigh   = 10
	InterruptionTimeout  = 5 * time.Second
)

Variables ¶

This section is empty.

Functions ¶

func PerformToolExecutions ¶

func PerformToolExecutions(
	ctx context.Context,
	functionCh <-chan *llm.FunctionToolCall,
	toolCtx *llm.ToolContext,
) <-chan ToolExecutionOutput

func RegisterPlugin ¶

func RegisterPlugin(p Plugin)

func UploadSessionReport ¶

func UploadSessionReport(
	cloudURL string,
	apiKey string,
	apiSecret string,
	agentName string,
	report *SessionReport,
) error

func WithRunContext ¶

func WithRunContext(ctx context.Context, rc *RunContext) context.Context

Types ¶

type Agent ¶

type Agent struct {
	ID           string
	Instructions string
	ChatCtx      *llm.ChatContext
	Tools        []llm.Tool

	TurnDetection TurnDetectionMode
	TurnDetector  TurnDetector
	STT           stt.STT
	VAD           vad.VAD
	LLM           llm.LLM
	TTS           tts.TTS

	AllowInterruptions        bool
	MinConsecutiveSpeechDelay float64
	UseTTSAlignedTranscript   bool
	MinEndpointingDelay       float64
	MaxEndpointingDelay       float64
	// contains filtered or unexported fields
}

func NewAgent ¶

func NewAgent(instructions string) *Agent

func (*Agent) GetActivity ¶

func (a *Agent) GetActivity() *AgentActivity

func (*Agent) GetAgent ¶

func (a *Agent) GetAgent() *Agent

func (*Agent) OnEnter ¶

func (a *Agent) OnEnter()

func (*Agent) OnExit ¶

func (a *Agent) OnExit()

func (*Agent) OnUserTurnCompleted ¶

func (a *Agent) OnUserTurnCompleted(ctx context.Context, chatCtx *llm.ChatContext, newMsg *llm.ChatMessage) error

func (*Agent) Start ¶

func (a *Agent) Start(session *AgentSession, agentIntf AgentInterface) *AgentActivity

func (*Agent) UpdateInstructions ¶

func (a *Agent) UpdateInstructions(ctx context.Context, instructions string) error

func (*Agent) UpdateTools ¶

func (a *Agent) UpdateTools(ctx context.Context, tools []llm.Tool) error

type AgentActivity ¶

type AgentActivity struct {
	AgentIntf AgentInterface
	Agent     *Agent
	Session   *AgentSession
	// contains filtered or unexported fields
}

AgentActivity handles the internal event loops, I/O processing, and speech generation queue for an Agent.

func NewAgentActivity ¶

func NewAgentActivity(agentIntf AgentInterface, session *AgentSession) *AgentActivity

func (*AgentActivity) OnEndOfSpeech ¶

func (a *AgentActivity) OnEndOfSpeech(ev *vad.VADEvent)

func (*AgentActivity) OnFinalTranscript ¶

func (a *AgentActivity) OnFinalTranscript(ev *stt.SpeechEvent)

func (*AgentActivity) OnStartOfSpeech ¶

func (a *AgentActivity) OnStartOfSpeech(ev *vad.VADEvent)

OnStartOfSpeech, together with OnEndOfSpeech and OnFinalTranscript, is one of the event callbacks declared by RecognitionHooks.

func (*AgentActivity) ScheduleSpeech ¶

func (a *AgentActivity) ScheduleSpeech(speech *SpeechHandle, priority int, force bool) error

func (*AgentActivity) Start ¶

func (a *AgentActivity) Start()

func (*AgentActivity) Stop ¶

func (a *AgentActivity) Stop()

type AgentInterface ¶

type AgentInterface interface {
	OnEnter()
	OnExit()
	OnUserTurnCompleted(ctx context.Context, chatCtx *llm.ChatContext, newMsg *llm.ChatMessage) error
	GetAgent() *Agent
	GetActivity() *AgentActivity
}

type AgentSession ¶

type AgentSession struct {
	Options AgentSessionOptions

	ChatCtx   *llm.ChatContext
	Agent     AgentInterface
	STT       stt.STT
	VAD       vad.VAD
	LLM       llm.LLM
	TTS       tts.TTS
	Tools     []llm.Tool
	Assistant *PipelineAgent
	Room      *lksdk.Room

	MetricsCollector *telemetry.UsageCollector

	UserState  UserState
	AgentState AgentState

	// Transcript attribution — set by RoomIO when tracks are established.
	RemoteUserIdentity string
	RemoteTrackSID     string
	AgentTrackSID      string

	// Event channels
	AgentStateChangedCh chan AgentStateChangedEvent
	UserStateChangedCh  chan UserStateChangedEvent
	// contains filtered or unexported fields
}

func NewAgentSession ¶

func NewAgentSession(agent AgentInterface, room *lksdk.Room, opts AgentSessionOptions) *AgentSession

func (*AgentSession) GenerateReply ¶

func (s *AgentSession) GenerateReply(ctx context.Context, userInput string) error

func (*AgentSession) GetAgentTrackSID ¶

func (s *AgentSession) GetAgentTrackSID() string

func (*AgentSession) OnAudioFrame ¶

func (s *AgentSession) OnAudioFrame(ctx context.Context, frame *model.AudioFrame)

func (*AgentSession) PublishAgentTranscript ¶

func (s *AgentSession) PublishAgentTranscript(text string)

PublishAgentTranscript publishes the agent's LLM response to the Playground. It sends both a ChatMessage (for the chat panel) and a Transcription packet (for the transcript overlay).

func (*AgentSession) PublishUserTranscript ¶

func (s *AgentSession) PublishUserTranscript(text string)

PublishUserTranscript publishes the user's STT transcript to the Playground. It sends both a ChatMessage (for the chat panel) and a Transcription packet (for the transcript overlay).

func (*AgentSession) SetAgentTrackSID ¶

func (s *AgentSession) SetAgentTrackSID(sid string)

func (*AgentSession) SetRemoteTrackSID ¶

func (s *AgentSession) SetRemoteTrackSID(sid string)

func (*AgentSession) SetRemoteUserIdentity ¶

func (s *AgentSession) SetRemoteUserIdentity(identity string)

func (*AgentSession) Start ¶

func (s *AgentSession) Start(ctx context.Context) error

func (*AgentSession) Stop ¶

func (s *AgentSession) Stop(ctx context.Context) error

func (*AgentSession) UpdateAgentState ¶

func (s *AgentSession) UpdateAgentState(state AgentState)

func (*AgentSession) UpdateUserState ¶

func (s *AgentSession) UpdateUserState(state UserState)

type AgentSessionOptions ¶

type AgentSessionOptions struct {
	AllowInterruptions            bool
	DiscardAudioIfUninterruptible bool
	MinInterruptionDuration       float64
	MinInterruptionWords          int
	MinEndpointingDelay           float64
	MaxEndpointingDelay           float64
	MaxToolSteps                  int
	UserAwayTimeout               float64
	FalseInterruptionTimeout      float64
	ResumeFalseInterruption       bool
	MinConsecutiveSpeechDelay     float64
	UseTTSAlignedTranscript       bool
	PreemptiveGeneration          bool
	AECWarmupDuration             float64
}

type AgentState ¶

type AgentState string

type AgentStateChangedEvent ¶

type AgentStateChangedEvent struct {
	OldState AgentState
	NewState AgentState
}

type AgentTask ¶

type AgentTask[T any] struct {
	Agent
	Result chan T
	Err    chan error
}

AgentTask represents a sub-agent execution that returns a result.

func NewAgentTask ¶

func NewAgentTask[T any](instructions string) *AgentTask[T]

func (*AgentTask[T]) Complete ¶

func (t *AgentTask[T]) Complete(result T)

func (*AgentTask[T]) Fail ¶

func (t *AgentTask[T]) Fail(err error)

func (*AgentTask[T]) WaitAny ¶

func (t *AgentTask[T]) WaitAny(ctx context.Context) (any, error)

type AudioConfig ¶

type AudioConfig struct {
	Source      AudioSource
	Volume      float64
	Probability float64
}

type AudioRecognition ¶

type AudioRecognition struct {
	// contains filtered or unexported fields
}

func NewAudioRecognition ¶

func NewAudioRecognition(session *AgentSession, hooks RecognitionHooks, s stt.STT, v vad.VAD) *AudioRecognition

func (*AudioRecognition) PushAudio ¶

func (ar *AudioRecognition) PushAudio(frame *model.AudioFrame) error

func (*AudioRecognition) Start ¶

func (ar *AudioRecognition) Start(ctx context.Context) error

type AudioSource ¶

type AudioSource interface{} // Can be string, BuiltinAudioClip, or <-chan *model.AudioFrame

type Avatar ¶

type Avatar struct {
	State AvatarState
}

func NewAvatar ¶

func NewAvatar() *Avatar

func (*Avatar) Start ¶

func (a *Avatar) Start(ctx context.Context) error

type AvatarIO ¶

type AvatarIO interface {
	SendAvatarData(ctx context.Context, data []byte) error
}

AvatarIO defines how Avatar commands/data are sent.

type AvatarRunner ¶

type AvatarRunner struct {
	// contains filtered or unexported fields
}

AvatarRunner coordinates Avatar IO and LipSync events.

func NewAvatarRunner ¶

func NewAvatarRunner(io AvatarIO) *AvatarRunner

func (*AvatarRunner) SimulateLipSync ¶

func (r *AvatarRunner) SimulateLipSync(text string)

SimulateLipSync takes text (from TranscriptSynchronizer) and simulates basic lip movements.

func (*AvatarRunner) Start ¶

func (r *AvatarRunner) Start(ctx context.Context) error

func (*AvatarRunner) Stop ¶

func (r *AvatarRunner) Stop()

type AvatarState ¶

type AvatarState string

const (
	AvatarStateIdle     AvatarState = "idle"
	AvatarStateSpeaking AvatarState = "speaking"
)

type BackgroundAudioPlayer ¶

type BackgroundAudioPlayer struct {
	// contains filtered or unexported fields
}

func NewBackgroundAudioPlayer ¶

func NewBackgroundAudioPlayer(ambientSound, thinkingSound interface{}) *BackgroundAudioPlayer

func (*BackgroundAudioPlayer) AgentStateChanged ¶

func (p *BackgroundAudioPlayer) AgentStateChanged(newState AgentState)

func (*BackgroundAudioPlayer) Close ¶

func (p *BackgroundAudioPlayer) Close() error

func (*BackgroundAudioPlayer) Play ¶

func (p *BackgroundAudioPlayer) Play(audio interface{}, loop bool) *PlayHandle

func (*BackgroundAudioPlayer) Start ¶

func (p *BackgroundAudioPlayer) Start(room *lksdk.Room, agentSession *AgentSession) error

type BuiltinAudioClip ¶

type BuiltinAudioClip string

const (
	CityAmbience    BuiltinAudioClip = "city-ambience.ogg"
	ForestAmbience  BuiltinAudioClip = "forest-ambience.ogg"
	OfficeAmbience  BuiltinAudioClip = "office-ambience.ogg"
	CrowdedRoom     BuiltinAudioClip = "crowded-room.ogg"
	KeyboardTyping  BuiltinAudioClip = "keyboard-typing.ogg"
	KeyboardTyping2 BuiltinAudioClip = "keyboard-typing2.ogg"
	HoldMusic       BuiltinAudioClip = "hold_music.ogg"
)

func (BuiltinAudioClip) Path ¶

func (b BuiltinAudioClip) Path() string

type ClientEventPayload ¶

type ClientEventPayload struct {
	Type  string `json:"type"`
	State string `json:"state,omitempty"`
}

type ClientEventsDispatcher ¶

type ClientEventsDispatcher struct {
	// contains filtered or unexported fields
}

ClientEventsDispatcher manages sending agent and user states to the LiveKit Room DataChannel.

func NewClientEventsDispatcher ¶

func NewClientEventsDispatcher(room *lksdk.Room) *ClientEventsDispatcher

func (*ClientEventsDispatcher) DispatchAgentState ¶

func (d *ClientEventsDispatcher) DispatchAgentState(state AgentState)

DispatchAgentState emits AgentStateIdle, AgentStateThinking, or AgentStateSpeaking.

func (*ClientEventsDispatcher) DispatchUserState ¶

func (d *ClientEventsDispatcher) DispatchUserState(state UserState)

DispatchUserState emits UserStateListening or UserStateSpeaking.

type CloseEvent ¶

type CloseEvent struct {
	Reason    CloseReason
	Error     error
	CreatedAt time.Time
}

func (*CloseEvent) GetType ¶

func (e *CloseEvent) GetType() string

type CloseReason ¶

type CloseReason string

const (
	CloseReasonError                   CloseReason = "error"
	CloseReasonJobShutdown             CloseReason = "job_shutdown"
	CloseReasonParticipantDisconnected CloseReason = "participant_disconnected"
	CloseReasonUserInitiated           CloseReason = "user_initiated"
)

type ConversationItemAddedEvent ¶

type ConversationItemAddedEvent struct {
	Item      llm.ChatItem
	CreatedAt time.Time
}

func (*ConversationItemAddedEvent) GetType ¶

func (e *ConversationItemAddedEvent) GetType() string

type DataStreamIO ¶

type DataStreamIO struct {
	// contains filtered or unexported fields
}

func NewDataStreamIO ¶

func NewDataStreamIO(room *lksdk.Room) *DataStreamIO

func (*DataStreamIO) SendAvatarData ¶

func (io *DataStreamIO) SendAvatarData(ctx context.Context, data []byte) error

type DtmfEvent ¶

type DtmfEvent struct {
	Digit string
	Time  time.Time
}

type EndOfTurnInfo ¶

type EndOfTurnInfo struct {
	SkipReply            bool
	NewTranscript        string
	TranscriptConfidence float64
	StartedSpeakingAt    *float64
	StoppedSpeakingAt    *float64
}

type EvaluationResult ¶

type EvaluationResult struct {
	Judgments map[string]string
}

type Event ¶

type Event interface {
	GetType() string
}

type IVRActivity ¶

type IVRActivity struct {
	AgentIntf AgentInterface
	Agent     *Agent
	// contains filtered or unexported fields
}

func NewIVRActivity ¶

func NewIVRActivity(agentIntf AgentInterface) *IVRActivity

func (*IVRActivity) OnDtmf ¶

func (i *IVRActivity) OnDtmf(digit string)

func (*IVRActivity) SetDigitCallback ¶

func (i *IVRActivity) SetDigitCallback(timeout time.Duration, cb func(buffer string) (bool, error))

func (*IVRActivity) Start ¶

func (i *IVRActivity) Start()

func (*IVRActivity) Stop ¶

func (i *IVRActivity) Stop()

type InputDetails ¶

type InputDetails struct {
	Modality string
}

func DefaultInputDetails ¶

func DefaultInputDetails() InputDetails

type LLMGenerationData ¶

type LLMGenerationData struct {
	TextCh     chan string
	FunctionCh chan *llm.FunctionToolCall
	FullTextCh chan string // receives the complete assembled text when streaming is done
	Usage      *llm.CompletionUsage
}

func PerformLLMInference ¶

func PerformLLMInference(ctx context.Context, l llm.LLM, chatCtx *llm.ChatContext, tools []llm.Tool) (*LLMGenerationData, error)

type LLMTurnDetector ¶

type LLMTurnDetector struct {
	// contains filtered or unexported fields
}

LLMTurnDetector uses an LLM to predict if the user has finished speaking. It sends the recent conversation history to the LLM and asks for a probability score.

func NewLLMTurnDetector ¶

func NewLLMTurnDetector(llmInstance llm.LLM) *LLMTurnDetector

func (*LLMTurnDetector) PredictEndOfTurn ¶

func (m *LLMTurnDetector) PredictEndOfTurn(ctx context.Context, chatCtx *llm.ChatContext) (float64, error)

type MetricsCollectedEvent ¶

type MetricsCollectedEvent struct {
	Metrics   telemetry.AgentMetrics
	CreatedAt time.Time
}

func (*MetricsCollectedEvent) GetType ¶

func (e *MetricsCollectedEvent) GetType() string

type MultimodalAgent ¶

type MultimodalAgent struct {
	PublishAudio func(frame *model.AudioFrame) error
	// contains filtered or unexported fields
}

func NewMultimodalAgent ¶

func NewMultimodalAgent(
	m llm.RealtimeModel,
	chatCtx *llm.ChatContext,
) *MultimodalAgent

func (*MultimodalAgent) OnAudioFrame ¶

func (ma *MultimodalAgent) OnAudioFrame(ctx context.Context, frame *model.AudioFrame)

func (*MultimodalAgent) Start ¶

func (ma *MultimodalAgent) Start(ctx context.Context, s *AgentSession) error

type PipelineAgent ¶

type PipelineAgent struct {
	LLM llm.LLM

	PublishAudio func(frame *model.AudioFrame) error
	// contains filtered or unexported fields
}

func NewPipelineAgent ¶

func NewPipelineAgent(
	vad vad.VAD,
	stt stt.STT,
	llmObj llm.LLM,
	tts tts.TTS,
	chatCtx *llm.ChatContext,
) *PipelineAgent

func (*PipelineAgent) OnAudioFrame ¶

func (va *PipelineAgent) OnAudioFrame(ctx context.Context, frame *model.AudioFrame)

func (*PipelineAgent) Start ¶

func (va *PipelineAgent) Start(ctx context.Context, s *AgentSession) error

type PlayHandle ¶

type PlayHandle struct {
	// contains filtered or unexported fields
}

func (*PlayHandle) Done ¶

func (h *PlayHandle) Done() bool

func (*PlayHandle) Stop ¶

func (h *PlayHandle) Stop()

func (*PlayHandle) WaitForPlayout ¶

func (h *PlayHandle) WaitForPlayout()

type Plugin ¶

type Plugin interface {
	Title() string
	Version() string
	Package() string
	DownloadFiles() error
}

func RegisteredPlugins ¶

func RegisteredPlugins() []Plugin

type QueueIO ¶

type QueueIO struct {
	// contains filtered or unexported fields
}

func NewQueueIO ¶

func NewQueueIO() *QueueIO

func (*QueueIO) ReadQueue ¶

func (io *QueueIO) ReadQueue() <-chan []byte

func (*QueueIO) SendAvatarData ¶

func (io *QueueIO) SendAvatarData(ctx context.Context, data []byte) error

type RecognitionHooks ¶

type RecognitionHooks interface {
	OnStartOfSpeech(ev *vad.VADEvent)
	OnEndOfSpeech(ev *vad.VADEvent)
	OnFinalTranscript(ev *stt.SpeechEvent)
}

type RecordingOptions ¶

type RecordingOptions struct {
	Audio      bool `json:"audio"`
	Traces     bool `json:"traces"`
	Logs       bool `json:"logs"`
	Transcript bool `json:"transcript"`
}

type RunAssert ¶

type RunAssert struct {
	ChatCtx *llm.ChatContext
	// contains filtered or unexported fields
}

func (*RunAssert) ContainsMessage ¶

func (a *RunAssert) ContainsMessage(role llm.ChatRole, content string) *RunAssert

func (*RunAssert) HasError ¶

func (a *RunAssert) HasError() error

func (*RunAssert) IsFunctionCall ¶

func (a *RunAssert) IsFunctionCall(name string) *RunAssert

func (*RunAssert) Judge ¶

func (a *RunAssert) Judge(ctx context.Context, evaluator evals.Evaluator, llmInstance llm.LLM) (*RunAssert, error)

type RunContext ¶

type RunContext struct {
	Session      *AgentSession
	SpeechHandle *SpeechHandle
	FunctionCall *llm.FunctionCall
}

func GetRunContext ¶

func GetRunContext(ctx context.Context) *RunContext

func (*RunContext) WaitForPlayout ¶

func (r *RunContext) WaitForPlayout(ctx context.Context) error

type RunResult ¶

type RunResult struct {
	ChatCtx *llm.ChatContext
	Expect  *RunAssert
}

func NewRunResult ¶

func NewRunResult(chatCtx *llm.ChatContext) *RunResult

type SessionReport ¶

type SessionReport struct {
	RecordingOptions        RecordingOptions    `json:"recording_options"`
	JobID                   string              `json:"job_id"`
	RoomID                  string              `json:"room_id"`
	Room                    string              `json:"room"`
	Options                 AgentSessionOptions `json:"options"`
	Events                  []any               `json:"events"`
	ChatHistory             *llm.ChatContext    `json:"chat_history"`
	AudioRecordingPath      *string             `json:"audio_recording_path,omitempty"`
	AudioRecordingStartedAt *float64            `json:"audio_recording_started_at,omitempty"`
	Duration                *float64            `json:"duration,omitempty"`
	StartedAt               *float64            `json:"started_at,omitempty"`
	Timestamp               float64             `json:"timestamp"`
}

func NewSessionReport ¶

func NewSessionReport() *SessionReport

type SpeechCreatedEvent ¶

type SpeechCreatedEvent struct {
	UserInitiated bool
	Source        string // "say" or "generate_reply"
	SpeechHandle  *SpeechHandle
	CreatedAt     time.Time
}

func (*SpeechCreatedEvent) GetType ¶

func (e *SpeechCreatedEvent) GetType() string

type SpeechHandle ¶

type SpeechHandle struct {
	ID                 string
	AllowInterruptions bool
	InputDetails       InputDetails
	Priority           int
	CreatedAt          time.Time
	// contains filtered or unexported fields
}

func NewSpeechHandle ¶

func NewSpeechHandle(allowInterruptions bool, inputDetails InputDetails) *SpeechHandle

func (*SpeechHandle) Interrupt ¶

func (s *SpeechHandle) Interrupt(force bool) error

func (*SpeechHandle) IsDone ¶

func (s *SpeechHandle) IsDone() bool

func (*SpeechHandle) IsInterrupted ¶

func (s *SpeechHandle) IsInterrupted() bool

func (*SpeechHandle) IsScheduled ¶

func (s *SpeechHandle) IsScheduled() bool

func (*SpeechHandle) MarkDone ¶

func (s *SpeechHandle) MarkDone()

func (*SpeechHandle) MarkScheduled ¶

func (s *SpeechHandle) MarkScheduled()

func (*SpeechHandle) Wait ¶

func (s *SpeechHandle) Wait(ctx context.Context) error

type TTSGenerationData ¶

type TTSGenerationData struct {
	AudioCh chan *model.AudioFrame
	TTFB    time.Duration
}

func PerformTTSInference ¶

func PerformTTSInference(ctx context.Context, t tts.TTS, textCh <-chan string) (*TTSGenerationData, error)

type Tagger ¶

type Tagger struct {
	// contains filtered or unexported fields
}

func NewTagger ¶

func NewTagger() *Tagger

func (*Tagger) Add ¶

func (t *Tagger) Add(tag string)

func (*Tagger) Evaluation ¶

func (t *Tagger) Evaluation(result *EvaluationResult)

func (*Tagger) Fail ¶

func (t *Tagger) Fail(reason string)

func (*Tagger) OutcomeReason ¶

func (t *Tagger) OutcomeReason() string

func (*Tagger) Remove ¶

func (t *Tagger) Remove(tag string)

func (*Tagger) Success ¶

func (t *Tagger) Success(reason string)

func (*Tagger) Tags ¶

func (t *Tagger) Tags() []string

type TaskWaiter ¶

type TaskWaiter interface {
	WaitAny(ctx context.Context) (any, error)
}

type ToolExecutionOutput ¶

type ToolExecutionOutput struct {
	FncCall    llm.FunctionCall
	FncCallOut *llm.FunctionCallOutput
	RawOutput  any
	RawError   error
}

type TranscriptSynchronizer ¶

type TranscriptSynchronizer struct {
	// contains filtered or unexported fields
}

TranscriptSynchronizer drip-feeds text to match the playout speed of audio.

func NewTranscriptSynchronizer ¶

func NewTranscriptSynchronizer(speakingRate float64) *TranscriptSynchronizer

NewTranscriptSynchronizer initializes the synchronizer. The default speaking rate is usually about 3.83 syllables per second.

func (*TranscriptSynchronizer) Close ¶

func (s *TranscriptSynchronizer) Close()

func (*TranscriptSynchronizer) EventCh ¶

func (s *TranscriptSynchronizer) EventCh() <-chan string

func (*TranscriptSynchronizer) Interrupt ¶

func (s *TranscriptSynchronizer) Interrupt()

Interrupt immediately flushes the remaining text buffer to the event channel and stops syncing.

func (*TranscriptSynchronizer) PushAudio ¶

func (s *TranscriptSynchronizer) PushAudio(frame *model.AudioFrame)

func (*TranscriptSynchronizer) PushText ¶

func (s *TranscriptSynchronizer) PushText(text string)

type TranscriptionFilter ¶

type TranscriptionFilter struct {
	SpeakingRate float64
}

func NewTranscriptionFilter ¶

func NewTranscriptionFilter() *TranscriptionFilter

type TurnDetectionMode ¶

type TurnDetectionMode string

const (
	TurnDetectionModeSTT         TurnDetectionMode = "stt"
	TurnDetectionModeVAD         TurnDetectionMode = "vad"
	TurnDetectionModeRealtimeLLM TurnDetectionMode = "realtime_llm"
	TurnDetectionModeManual      TurnDetectionMode = "manual"
)

type TurnDetector ¶

type TurnDetector interface {
	PredictEndOfTurn(ctx context.Context, chatCtx *llm.ChatContext) (float64, error)
}

type UserInputTranscribedEvent ¶

type UserInputTranscribedEvent struct {
	Language   string
	Transcript string
	IsFinal    bool
	SpeakerID  string
	CreatedAt  time.Time
}

func (*UserInputTranscribedEvent) GetType ¶

func (e *UserInputTranscribedEvent) GetType() string

type UserState ¶

type UserState string

type UserStateChangedEvent ¶

type UserStateChangedEvent struct {
	OldState UserState
	NewState UserState
}

type VoiceActivityVideoSampler ¶

type VoiceActivityVideoSampler struct {
	// contains filtered or unexported fields
}

VoiceActivityVideoSampler samples video frames at a reduced rate (e.g. 1 fps) only when the user is speaking, to reduce LLM context token usage.

func NewVoiceActivityVideoSampler ¶

func NewVoiceActivityVideoSampler(session *AgentSession, sampleRate float64, opts images.EncodeOptions) *VoiceActivityVideoSampler

func (*VoiceActivityVideoSampler) OnVideoFrame ¶

func (s *VoiceActivityVideoSampler) OnVideoFrame(ctx context.Context, frame *images.VideoFrame) bool

OnVideoFrame should be called for every incoming WebRTC video frame. It returns true if the frame should be forwarded to the LLM.

func (*VoiceActivityVideoSampler) SetSpeaking ¶

func (s *VoiceActivityVideoSampler) SetSpeaking(speaking bool)

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL