Documentation
¶
Overview ¶
Package voice provides processors that wire STT, LLM, and TTS into a pipeline.
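It includes processors for turn detection (TurnProcessor), speech-to-text (STTProcessor), LLM inference (LLMProcessor), text-to-speech (TTSProcessor), and barge-in handling (InterruptionController).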
Index ¶
Constants ¶
const DefaultTurnMaxDurationSecs = 8
DefaultTurnMaxDurationSecs is the default maximum turn duration, in seconds, used when maxDurationSecs <= 0.
const MinSTTBufferMs = 500
MinSTTBufferMs is the default minimum amount of audio, in milliseconds, to buffer before calling STT (500ms). Sending very short chunks (e.g. 20ms) to STT APIs yields empty transcripts.
Variables ¶
This section is empty.
Functions ¶
func NewInterruptionControllerFromOptions ¶
func NewInterruptionControllerFromOptions(name string, opts json.RawMessage) processors.Processor
NewInterruptionControllerFromOptions builds an InterruptionController from JSON plugin options.
Types ¶
type InterruptionController ¶
type InterruptionController struct {
*processors.BaseProcessor
// Strategy decides when to interrupt based on accumulated user text.
Strategy interruptions.Strategy
// contains filtered or unexported fields
}
InterruptionController observes bot speech and user transcripts and, when allowed, emits an InterruptionFrame to clear downstream playback (barge-in).
It relies on a configured interruptions.Strategy to decide when the user has spoken “enough” to warrant an interruption.
func NewInterruptionController ¶
func NewInterruptionController(name string, strategy interruptions.Strategy) *InterruptionController
NewInterruptionController constructs a controller with the provided strategy. When strategy is nil, the controller behaves as a pass-through processor.
func (*InterruptionController) ProcessFrame ¶
func (p *InterruptionController) ProcessFrame(ctx context.Context, f frames.Frame, dir processors.Direction) error
ProcessFrame tracks bot/user state and, when appropriate, emits an InterruptionFrame downstream. All frames are forwarded unchanged.
type InterruptionControllerOptions ¶
type InterruptionControllerOptions struct {
// Strategy selects the interruption strategy, e.g. "min_words".
Strategy string `json:"strategy,omitempty"`
// MinWords configures MinWordsStrategy when Strategy is "min_words".
MinWords int `json:"min_words,omitempty"`
}
InterruptionControllerOptions describes JSON options for the "interruption_controller" processor when used via plugin_options.
type LLMProcessor ¶
type LLMProcessor struct {
*processors.BaseProcessor
LLM services.LLMService
SystemPrompt string // optional; when set, sent as system message so the LLM replies as assistant
OnContextUpdate OnContextUpdate // optional; called when msgs is updated (e.g. for IVR SetSavedMessages)
// contains filtered or unexported fields
}
LLMProcessor runs the LLM on transcription/context and streams LLMTextFrame downstream.
func NewLLMProcessor ¶
func NewLLMProcessor(name string, llm services.LLMService) *LLMProcessor
NewLLMProcessor returns a processor that runs the LLM and streams text downstream.
func NewLLMProcessorWithSystemPrompt ¶
func NewLLMProcessorWithSystemPrompt(name string, llm services.LLMService, systemPrompt string) *LLMProcessor
NewLLMProcessorWithSystemPrompt returns a processor that runs the LLM with an optional system prompt (e.g. "You are a helpful voice assistant. Reply briefly.").
func (*LLMProcessor) ProcessFrame ¶
func (p *LLMProcessor) ProcessFrame(ctx context.Context, f frames.Frame, dir processors.Direction) error
ProcessFrame runs the LLM when it receives a TranscriptionFrame (appending it as a user message), an LLMRunFrame, or an LLMMessagesUpdateFrame, and streams LLMTextFrame downstream. All other frames are forwarded.
type OnContextUpdate ¶
OnContextUpdate is called whenever the LLM context (msgs) is updated. Used by IVR to capture conversation for mode switching.
type STTProcessor ¶
type STTProcessor struct {
*processors.BaseProcessor
STT services.STTService
SampleRate int
Channels int
MinBufferBytes int // min bytes before calling Transcribe (e.g. 500ms at 16kHz mono = 16000 bytes, assuming 16-bit samples)
// contains filtered or unexported fields
}
STTProcessor turns AudioRawFrame into TranscriptionFrame using an STTService. It buffers incoming audio and only calls the STT service when at least MinBufferBytes have been accumulated, so the API receives enough audio to return transcripts.
func NewSTTProcessor ¶
func NewSTTProcessor(name string, stt services.STTService, sampleRate, channels int) *STTProcessor
NewSTTProcessor returns a processor that transcribes audio and pushes TranscriptionFrame(s) downstream. It buffers audio until at least MinSTTBufferMs (500ms) of audio is available before calling STT; use NewSTTProcessorWithBuffer to choose a different minimum.
func NewSTTProcessorWithBuffer ¶
func NewSTTProcessorWithBuffer(name string, stt services.STTService, sampleRate, channels, minBufferMs int) *STTProcessor
NewSTTProcessorWithBuffer is like NewSTTProcessor but allows setting minBufferMs (e.g. 300–800).
func (*STTProcessor) ProcessFrame ¶
func (p *STTProcessor) ProcessFrame(ctx context.Context, f frames.Frame, dir processors.Direction) error
ProcessFrame buffers AudioRawFrame and transcribes when enough audio is accumulated; forwards other frames.
type TTSProcessor ¶
type TTSProcessor struct {
*processors.BaseProcessor
TTS services.TTSService
SampleRate int
MaxBatchRunes int // max runes before flushing without sentence end (0 = use default)
// contains filtered or unexported fields
}
TTSProcessor turns LLMTextFrame, TextFrame, or TTSSpeakFrame into TTSAudioRawFrame using a TTSService. It batches streamed LLM text by sentence (or MaxBatchRunes) so each TTS API call gets a full phrase, reducing choppy playback. Emits BotStartedSpeakingFrame before the first TTS audio in a response and BotStoppedSpeakingFrame after each segment for observers.
func NewTTSProcessor ¶
func NewTTSProcessor(name string, tts services.TTSService, sampleRate int) *TTSProcessor
NewTTSProcessor returns a processor that speaks text and pushes TTSAudioRawFrame(s) downstream.
func (*TTSProcessor) ProcessFrame ¶
func (p *TTSProcessor) ProcessFrame(ctx context.Context, f frames.Frame, dir processors.Direction) error
ProcessFrame buffers LLMTextFrame/TextFrame until a sentence boundary or limit, then speaks; TTSSpeakFrame is spoken immediately. Forwards other frames (flushing any pending text first).
type TurnProcessor ¶
type TurnProcessor struct {
*processors.BaseProcessor
VAD vad.Detector
Analyzer turn.Analyzer
SampleRate int
Channels int
// contains filtered or unexported fields
}
TurnProcessor buffers AudioRawFrame chunks, runs VAD and turn detection, and forwards concatenated audio downstream only when the turn is complete (end of speech).
func NewTurnProcessor ¶
func NewTurnProcessor(name string, v vad.Detector, a turn.Analyzer, sampleRate, channels int, useAsync bool) *TurnProcessor
NewTurnProcessor returns a processor that buffers audio and forwards one segment per turn. When useAsync is true, end-of-turn detection is driven via Analyzer.AnalyzeEndOfTurnAsync; otherwise the synchronous AppendAudio return value is used.
func NewTurnProcessorWithUserTurn ¶
func NewTurnProcessorWithUserTurn( name string, v vad.Detector, a turn.Analyzer, sampleRate, channels int, useAsync bool, userTurnStopTimeout float64, userIdleTimeout float64, maxDurationSecs float64, ) *TurnProcessor
NewTurnProcessorWithUserTurn is like NewTurnProcessor but allows callers to configure the user turn stop and idle timeouts. When maxDurationSecs is <= 0, DefaultTurnMaxDurationSecs is used; the turn buffer is pre-allocated to this duration to avoid repeated growth and GC.
func (*TurnProcessor) ProcessFrame ¶
func (p *TurnProcessor) ProcessFrame(ctx context.Context, f frames.Frame, dir processors.Direction) error
ProcessFrame buffers AudioRawFrame, runs VAD and turn detection; on turn complete pushes audio downstream.