dialog

package
v1.2.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jun 6, 2026 | License: MIT | Imports: 8 | Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type Command

// Command is the envelope the dialog app sends to the voice plane.
//
// It is serialized as JSON: "type" and "call_id" are always emitted, while
// every other field carries omitempty and is dropped when it holds its zero
// value (empty string, false, or nil pointer).
type Command struct {
	// Type selects the command (see the Cmd* constants); CallID is carried
	// under the JSON key "call_id".
	Type   CommandType `json:"type"`
	CallID string      `json:"call_id"`

	// NOTE(review): Text is presumably the text to synthesize for tts.speak /
	// tts.stream, and UtteranceID an identifier for that utterance — confirm
	// against HandleCommand.
	Text        string `json:"text,omitempty"`
	UtteranceID string `json:"utterance_id,omitempty"`
	// StreamEnd on tts.stream marks the final LLM chunk (same as tts.stream.end).
	StreamEnd bool `json:"stream_end,omitempty"`

	// NOTE(review): Reason is presumably used by hangup / tts.interrupt — confirm.
	Reason string `json:"reason,omitempty"`

	// Meta is optional turn-level metadata; a nil pointer is omitted from the JSON.
	Meta *CommandMeta `json:"meta,omitempty"`
}

Command is the envelope the dialog app sends to the voice plane.

type CommandMeta

// CommandMeta carries optional turn-level metadata from the dialog app.
//
// Its JSON keys are camelCase ("llmModel", "userText"), unlike the snake_case
// keys used by Command and Event ("call_id", "utterance_id"). Every field is
// omitted from the JSON when it holds its zero value.
//
// NOTE(review): the *Ms fields look like LLM latencies in milliseconds (time
// to first output and total wall time), and UserText like the user utterance
// behind the turn — confirm with the dialog app.
type CommandMeta struct {
	LLMModel   string `json:"llmModel,omitempty"`
	LLMFirstMs int    `json:"llmFirstMs,omitempty"`
	LLMWallMs  int    `json:"llmWallMs,omitempty"`
	UserText   string `json:"userText,omitempty"`
}

CommandMeta carries optional turn-level metadata from the dialog app.

type CommandType

// CommandType enumerates messages the dialog app sends to the voice plane.
// Its string value is what travels in the "type" field of a Command.
type CommandType string

CommandType enumerates messages the dialog app sends to the voice plane.

// Command types sent by the dialog app. The string values are the wire names
// carried in Command.Type. A tts.stream command with StreamEnd set is
// documented on Command as equivalent to tts.stream.end.
const (
	CmdTTSSpeak     CommandType = "tts.speak"
	CmdTTSStream    CommandType = "tts.stream"     // LLM streaming token/chunk
	CmdTTSStreamEnd CommandType = "tts.stream.end" // flush segmenter tail
	CmdTTSInterrupt CommandType = "tts.interrupt"
	CmdHangup       CommandType = "hangup"
)

type Config

// Config wires the voice plane for a single call. It is passed by value to
// NewSession.
//
// Optional toggles are pointers (*bool, *float64) and each field comment
// states its default.
// NOTE(review): a nil pointer presumably selects that default — confirm in
// NewSession.
// NOTE(review): OnAudioOut presumably receives downlink audio frames, and
// TextSegmenterConfig presumably tunes the segmenter enabled by
// EnableStreamSegmenter — confirm.
type Config struct {
	// CallID identifies the call; Meta describes the call at session start
	// and is emitted in call.started.
	CallID string
	Meta   StartMeta

	// NOTE(review): Engine is presumably the ASR engine used for uplink
	// recognition — confirm.
	Engine asr.Engine

	// NOTE(review): InputCodec / PCMSampleRate presumably describe the uplink
	// audio handed to ProcessAudio ("encoded or PCM per session config") —
	// confirm accepted codec names and rates.
	InputCodec    string
	PCMSampleRate int

	// NOTE(review): TTSService is presumably the synthesizer used for downlink
	// TTS — confirm.
	TTSService tts.TTSService
	// TTSCache wraps TTSService with process-level PCM caching when set.
	TTSCache   *tts.CacheConfig
	OnAudioOut func([]byte) error

	// OnEvent receives voice-plane events (see EventHandler). OnTurn receives
	// a TurnEvent after each tts.speak completes.
	// NOTE(review): OnHangup is presumably invoked with the hangup reason —
	// confirm when it fires.
	OnEvent  EventHandler
	OnHangup func(reason string)
	OnTurn   func(TurnEvent)

	// EnableVAD enables barge-in during downlink playback (default true).
	EnableVAD *bool
	// VADConfig overrides barge-in thresholds; nil uses DefaultBargeInVADConfig.
	VADConfig *asr.VADConfig
	// EnableEchoFilter suppresses uplink while TTS active (default true).
	EnableEchoFilter *bool
	// EchoTail extends echo suppression after playback ends (default 150ms).
	EchoTail time.Duration
	// Denoiser optional uplink noise/AEC (RNNoise, WebRTC AEC3, hardware AEC).
	// When non-nil, runs after decode and before VAD.
	Denoiser asr.Denoiser
	// CoalesceUplink buffers small PCM chunks before ASR (default true).
	CoalesceUplink *bool
	// PaceRealtime paces TTS frames at wall-clock rate for smooth playback (default true).
	PaceRealtime *bool
	// TTSFrameDuration sets downlink frame size (default 60ms).
	TTSFrameDuration time.Duration
	// OutputCodec is the downlink wire codec ("pcm" default, "opus" uses AudioSender).
	OutputCodec string

	// EnableSentenceFilter enables ASR sentence-boundary filtering (default true).
	EnableSentenceFilter *bool
	// SentenceFilterSimilarity sets dedup threshold; nil → 0.85, explicit 0 disables dedup only.
	SentenceFilterSimilarity *float64
	// SentenceFilter overrides auto-created filter when set explicitly.
	SentenceFilter *asr.SentenceFilter

	// EnableStreamSegmenter wires TextSegmenter for tts.stream commands (default true).
	EnableStreamSegmenter *bool
	TextSegmenterConfig   *tts.TextSegmenterConfig
}

Config wires the voice plane for a single call.

type Event

// Event is the envelope the voice plane sends to the dialog app.
//
// It is serialized as JSON: "type" and "call_id" are always emitted; all other
// fields carry omitempty. Because the bool fields (Fatal, End, OK) are also
// omitempty, a false value is absent from the JSON, so receivers must treat a
// missing key as false.
type Event struct {
	// Type selects the event (see the Ev* constants); CallID is carried under
	// the JSON key "call_id".
	Type   EventType `json:"type"`
	CallID string    `json:"call_id"`

	// Call description; these mirror the fields of StartMeta, which is
	// emitted in call.started.
	From  string `json:"from,omitempty"`
	To    string `json:"to,omitempty"`
	Codec string `json:"codec,omitempty"`
	PCMHz int    `json:"pcm_hz,omitempty"`

	// NOTE(review): Reason is presumably the close reason reported with
	// call.ended (Session.Close takes a reason) — confirm.
	Reason string `json:"reason,omitempty"`

	// NOTE(review): Text is presumably the transcript for asr.partial /
	// asr.final — confirm.
	Text string `json:"text,omitempty"`

	// NOTE(review): Message / Fatal presumably describe an asr.error —
	// confirm.
	Message string `json:"message,omitempty"`
	Fatal   bool   `json:"fatal,omitempty"`

	// DTMF payload; Session.PushDTMF takes the same (digit, end) pair.
	Digit string `json:"digit,omitempty"`
	End   bool   `json:"end,omitempty"`

	// NOTE(review): UtteranceID / OK presumably accompany the tts.* events —
	// confirm which events set them.
	UtteranceID string `json:"utterance_id,omitempty"`
	OK          bool   `json:"ok,omitempty"`

	// NOTE(review): Target is presumably the transfer destination passed to
	// Session.ForwardTransferRequest — confirm.
	Target string `json:"target,omitempty"`
}

Event is the envelope the voice plane sends to the dialog app.

type EventHandler

// EventHandler receives voice-plane events; each Event is passed by value.
// Implementations should return quickly; heavy work (LLM calls) belongs in a
// separate goroutine.
type EventHandler func(Event)

EventHandler receives voice-plane events. Implementations should return quickly; heavy work (LLM calls) belongs in a separate goroutine.

type EventType

// EventType enumerates messages the voice plane sends to the dialog app.
// Its string value is what travels in the "type" field of an Event.
type EventType string

EventType enumerates messages the voice plane sends to the dialog app.

// Event types emitted by the voice plane. The string values are the wire
// names carried in Event.Type. Note that "tts.interrupt" is also the wire name
// of the CmdTTSInterrupt command, so the direction of travel (Event vs.
// Command) is what distinguishes the two.
const (
	EvCallStarted     EventType = "call.started"
	EvCallEnded       EventType = "call.ended"
	EvASRPartial      EventType = "asr.partial"
	EvASRFinal        EventType = "asr.final"
	EvASRError        EventType = "asr.error"
	EvDTMF            EventType = "dtmf"
	EvTTSStarted      EventType = "tts.started"
	EvTTSEnded        EventType = "tts.ended"
	EvTTSInterrupt    EventType = "tts.interrupt"
	EvTransferRequest EventType = "transfer.request"
)

type Session

// Session is a transport-agnostic voice call session. It runs uplink ASR and
// downlink TTS, emitting events to an external dialog app and accepting
// commands. Construct one with NewSession; all of its state is unexported.
type Session struct {
	// contains filtered or unexported fields
}

Session is a transport-agnostic voice call session. It runs uplink ASR and downlink TTS, emitting events to an external dialog app and accepting commands.

func NewSession

func NewSession(ctx context.Context, cfg Config) (*Session, error)

NewSession builds and returns a voice session from cfg.

func (*Session) Close

func (s *Session) Close(reason string)

Close tears down the session and emits call.ended.

func (*Session) ForwardTransferRequest

func (s *Session) ForwardTransferRequest(target string)

ForwardTransferRequest notifies the dialog app of a transfer request.

func (*Session) HandleCommand

func (s *Session) HandleCommand(cmd Command)

HandleCommand processes a dialog-plane command.

func (*Session) IsTTSPlaying

func (s *Session) IsTTSPlaying() bool

IsTTSPlaying reports whether downlink TTS is active.

func (*Session) ProcessAudio

func (s *Session) ProcessAudio(ctx context.Context, data []byte) error

ProcessAudio feeds one uplink audio chunk (encoded or PCM per session config).

func (*Session) PushDTMF

func (s *Session) PushDTMF(digit string, end bool)

PushDTMF forwards a DTMF digit as an event to the dialog app.

func (*Session) Start

func (s *Session) Start(ctx context.Context) error

Start activates the session and emits call.started.

type StartMeta

// StartMeta describes the call at session start (emitted in call.started).
// Its fields share names with the From, To, Codec and PCMHz fields of Event.
//
// NOTE(review): From / To are presumably the calling and called parties, and
// PCMHz the PCM sample rate in Hz — confirm the expected formats.
type StartMeta struct {
	From  string
	To    string
	Codec string
	PCMHz int
}

StartMeta describes the call at session start (emitted in call.started).

type TurnEvent

// TurnEvent is delivered to OnTurn after each tts.speak completes.
//
// Meta is a *CommandMeta and may be nil, so check it before dereferencing.
//
// NOTE(review): the remaining field meanings are inferred from names only —
// confirm before relying on them: LLMText looks like the spoken text,
// DurationMs / TTSFirstByteMs / E2EFirstByteMs like millisecond timings for
// the turn, MoreSpeaksQueued like a flag that further tts.speak commands are
// pending, and OK like a success indicator.
type TurnEvent struct {
	UtteranceID      string
	LLMText          string
	Meta             *CommandMeta
	DurationMs       int
	TTSFirstByteMs   int
	E2EFirstByteMs   int
	MoreSpeaksQueued bool
	OK               bool
}

TurnEvent is delivered to OnTurn after each tts.speak completes.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL