Documentation
¶
Index ¶
- Constants
- type AudioSink
- type AudioSource
- type ClientAECMode
- type DeviceType
- type ErrorCode
- type Event
- type EventType
- type InterruptReason
- type Pipeline
- func (p *Pipeline) Abort() bool
- func (p *Pipeline) RunAudio(ctx context.Context, input audio.Frame) (audio.Stream[Event], error)
- func (p *Pipeline) RunAudioStream(ctx context.Context, input audio.Stream[audio.Frame]) (audio.Stream[Event], error)
- func (p *Pipeline) RunText(ctx context.Context, text string) (audio.Stream[Event], error)
- type PipelineOption
- func WithContextID(id string) PipelineOption
- func WithDynamicTTSOptions(fn func() []tts.TTSOption) PipelineOption
- func WithSTTOptions(opts ...stt.STTOption) PipelineOption
- func WithSegmenterOptions(opts ...tts.SegmenterOption) PipelineOption
- func WithTTSOptions(opts ...tts.TTSOption) PipelineOption
- func WithTimeouts(timeouts PipelineTimeouts) PipelineOption
- func WithTurnHistory(msgs []model.Message) PipelineOption
- type PipelineTimeouts
- type PlaybackMode
- type PlaybackReferenceProvider
- type Session
- func (s *Session) Capabilities() SessionCapabilities
- func (s *Session) CommitInput() bool
- func (s *Session) Run(ctx context.Context) error
- func (s *Session) Send(text string) bool
- func (s *Session) SessionID() string
- func (s *Session) StopSpeaking() bool
- func (s *Session) VoiceProfile() (VoiceProfile, bool)
- type SessionCapabilities
- type SessionOption
- func WithBargeInConfirm(n int) SessionOption
- func WithCapabilities(capabilities SessionCapabilities) SessionOption
- func WithDetector(d detect.SpeechDetector) SessionOption
- func WithEndpointDecider(decider endpoint.Decider) SessionOption
- func WithEventHandler(fn func(Event)) SessionOption
- func WithFrameSize(d time.Duration) SessionOption
- func WithMetricsHook(hook speechmetrics.Hook) SessionOption
- func WithPlaybackDrainTimeout(d time.Duration) SessionOption
- func WithPreprocessor(processor preprocess.Processor) SessionOption
- func WithPreprocessors(processors ...preprocess.Processor) SessionOption
- func WithSilenceDuration(d time.Duration) SessionOption
- func WithVoiceProfile(profile VoiceProfile) SessionOption
- type SessionState
- type VoiceProfile
- type VoiceProfileScene
Constants ¶
// Extra-key names, all under the "speech." prefix.
// NOTE(review): presumably keys into a per-message/per-event extras map that
// carries voice hints (language, emotion, volume, scene) — confirm against callers.
const (
	ExtraKeyLanguage = "speech.language"
	ExtraKeyEmotion  = "speech.emotion"
	ExtraKeyVolume   = "speech.volume"
	ExtraKeyScene    = "speech.scene"
)
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type AudioSink ¶
// AudioSink is an abstraction for audio output (speaker, WebSocket, file, etc.).
type AudioSink interface {
// Play starts playing utterances from the stream asynchronously.
// It returns a channel that is closed when playback finishes (drained or aborted).
// The channel's element type is struct{}: completion is signalled by the close itself.
//
// Stream termination semantics:
// - io.EOF means the turn ended normally — drain the hardware buffer before signalling done.
// - Any other error means interruption — discard buffered audio and signal done immediately.
Play(stream audio.Stream[tts.Utterance]) <-chan struct{}
}
AudioSink is an abstraction for audio output (speaker, WebSocket, file, etc.).
type AudioSource ¶
AudioSource is an abstraction for audio input (microphone, WebSocket, file, etc.).
Implementation contract: when the context passed to Start (or the source's owning context) is cancelled, Stream().Read() must return an error (typically via Pipe.Interrupt). This is required for Session.Run to exit promptly on Ctrl+C. Implementations that do not honour this contract will cause Session.Run to block indefinitely on src.Read().
type ClientAECMode ¶
// ClientAECMode names the AEC mode reported for the client; it is the type of
// SessionCapabilities.ClientAEC.
type ClientAECMode string

// Known ClientAECMode values.
const (
	ClientAECUnknown  ClientAECMode = "unknown"
	ClientAECHardware ClientAECMode = "hardware"
	ClientAECOS       ClientAECMode = "os"
	ClientAECBrowser  ClientAECMode = "browser"
	ClientAECDisabled ClientAECMode = "disabled"
)
type DeviceType ¶
// DeviceType names the kind of client device; it is the type of
// SessionCapabilities.DeviceType.
type DeviceType string

// Known DeviceType values.
const (
	DeviceTypeUnknown DeviceType = "unknown"
	DeviceTypeDesktop DeviceType = "desktop"
	DeviceTypeBrowser DeviceType = "browser"
	DeviceTypeMobile  DeviceType = "mobile"
)
type Event ¶
// Event is emitted by the voice pipeline.
//
// NOTE(review): which fields are populated for which EventType is not visible
// from this declaration; the per-field notes below are assumptions to confirm.
type Event struct {
// Type identifies the voice pipeline event (one of the Event* constants).
Type EventType
// Text: presumably transcript or response text, depending on Type — confirm.
Text string
// Audio: presumably the audio payload for audio events — confirm.
Audio audio.Frame
// Lang, Confidence, Duration, Words: presumably speech-to-text result
// metadata (Words uses stt.WordTiming) — confirm.
Lang string
Confidence float64
Duration time.Duration
Words []stt.WordTiming
// TranscriptRevision: presumably a revision counter for transcript updates — confirm.
TranscriptRevision int
// Data holds free-form key/value data; the schema is not visible here.
Data map[string]any
// RunID, TurnID, SessionID: string identifiers; presumably correlate the
// event with a run, a turn, and the owning session — confirm.
RunID string
TurnID string
SessionID string
// ErrorCode: presumably meaningful for error events — confirm.
ErrorCode ErrorCode
// InterruptReason takes one of the InterruptReason* values; the empty
// string is InterruptReasonUnknown.
InterruptReason InterruptReason
}
Event is emitted by the voice pipeline.
type EventType ¶
// EventType identifies a voice pipeline event.
type EventType string

// Known EventType values; every value carries the "voice." prefix.
const (
	EventTurnStarted        EventType = "voice.turn.started"
	EventTranscriptRevision EventType = "voice.transcript.revision"
	EventTranscriptPartial  EventType = "voice.transcript.partial"
	EventTranscriptFinal    EventType = "voice.transcript.final"
	EventTextDelta          EventType = "voice.text.delta"
	EventAudio              EventType = "voice.audio"
	EventResponseDone       EventType = "voice.response.done"
	EventAudioDone          EventType = "voice.audio.done"
	EventPlayStarted        EventType = "voice.play.started"
	EventToolCall           EventType = "voice.tool.call"
	EventToolResult         EventType = "voice.tool.result"
	EventTurnInterrupted    EventType = "voice.turn.interrupted"
	EventTurnDone           EventType = "voice.turn.done"
	EventDone               EventType = "voice.done"
	EventPlayDone           EventType = "voice.play.done"
	EventError              EventType = "voice.error"
)
type InterruptReason ¶
// InterruptReason names why a turn was interrupted; it is the type of
// Event.InterruptReason.
type InterruptReason string

// Known InterruptReason values. InterruptReasonUnknown is the empty string,
// i.e. the zero value of the type.
const (
	InterruptReasonUnknown         InterruptReason = ""
	InterruptReasonUserBargeIn     InterruptReason = "user_barge_in"
	InterruptReasonManualInterrupt InterruptReason = "manual_interrupt"
	InterruptReasonTextInterrupt   InterruptReason = "text_interrupt"
)
type Pipeline ¶
// Pipeline chains STT → Runtime.Run → TTS and exposes the result as a stream
// of voice events. Its fields are unexported; construct one with NewPipeline.
type Pipeline struct {
// contains filtered or unexported fields
}
Pipeline orchestrates STT → Runtime.Run → TTS into a stream of voice events using a linear pipeline architecture.
func NewPipeline ¶
func NewPipeline(s stt.STT, t tts.TTS, rt workflow.Runtime, agent workflow.Agent, opts ...PipelineOption) *Pipeline
NewPipeline creates a new voice pipeline. The STT provider s may be nil if only text input (RunText) is used.
func (*Pipeline) RunAudio ¶
RunAudio processes a complete audio input in one shot. Requires a non-nil STT provider.
type PipelineOption ¶
type PipelineOption func(*Pipeline)
PipelineOption configures a Pipeline.
func WithContextID ¶
func WithContextID(id string) PipelineOption
WithContextID sets the memory context identifier passed to Runtime.Run. When the Runtime is configured with a MemoryFactory, this enables automatic history load/save across turns.
func WithDynamicTTSOptions ¶
func WithDynamicTTSOptions(fn func() []tts.TTSOption) PipelineOption
WithDynamicTTSOptions registers a callback that is invoked at the start of each turn to obtain the current TTS options. The returned options are appended after any static options set via WithTTSOptions, so they can override values like Speed, Pitch, Emotion, etc.
func WithSTTOptions ¶
func WithSTTOptions(opts ...stt.STTOption) PipelineOption
func WithSegmenterOptions ¶
func WithSegmenterOptions(opts ...tts.SegmenterOption) PipelineOption
func WithTTSOptions ¶
func WithTTSOptions(opts ...tts.TTSOption) PipelineOption
func WithTimeouts ¶
func WithTimeouts(timeouts PipelineTimeouts) PipelineOption
func WithTurnHistory ¶
func WithTurnHistory(msgs []model.Message) PipelineOption
WithTurnHistory sets message history for the current turn. Only effective when the Runtime has no MemoryFactory configured; when a MemoryFactory is present, use WithContextID instead.
type PipelineTimeouts ¶
type PlaybackMode ¶
// PlaybackMode names how the client plays audio back; it is the type of
// SessionCapabilities.PlaybackMode.
type PlaybackMode string

// Known PlaybackMode values.
const (
	PlaybackModeUnknown PlaybackMode = "unknown"
	PlaybackModeSpeaker PlaybackMode = "speaker"
	PlaybackModeHeadset PlaybackMode = "headset"
)
type PlaybackReferenceProvider ¶
PlaybackReferenceProvider is an optional interface for sinks that can expose the audio they are currently playing as a reference stream for echo suppression or AEC.
type Session ¶
// Session is a voice conversation state machine that moves between the
// SessionState values (StateIdle, StateHearing, StateResponding, StatePlayback).
// Its fields are unexported; construct one with NewSession.
type Session struct {
// contains filtered or unexported fields
}
Session implements a voice conversation state machine.
Audio: IDLE → HEARING → RESPONDING → PLAYBACK → IDLE
Text: IDLE → RESPONDING → PLAYBACK → IDLE
Both audio barge-in and text input (Send) can interrupt any state.
func NewSession ¶
func NewSession(p *Pipeline, src AudioSource, sink AudioSink, opts ...SessionOption) *Session
NewSession creates a new voice session. The src parameter may be nil for text-only sessions (use Send to inject text).
func (*Session) Capabilities ¶
func (s *Session) Capabilities() SessionCapabilities
func (*Session) CommitInput ¶
CommitInput explicitly commits the current audio input turn. It is a no-op when the session is not currently hearing audio.
func (*Session) Run ¶
Run starts the session loop. It blocks until ctx is cancelled or the audio source is exhausted. For text-only sessions (src == nil in NewSession), Run blocks until ctx is cancelled.
func (*Session) Send ¶
Send injects a text message into the session, skipping STT. If the session is currently responding or playing, it triggers a barge-in. Returns false if the internal buffer is full (message dropped). Safe to call from any goroutine.
func (*Session) StopSpeaking ¶
StopSpeaking signals that the current speaking phase should stop. During hearing it behaves like CommitInput; during responding/playback it acts as a turn interruption hint.
func (*Session) VoiceProfile ¶
func (s *Session) VoiceProfile() (VoiceProfile, bool)
type SessionCapabilities ¶
// SessionCapabilities describes client-side media processing and device hints.
type SessionCapabilities struct {
// ClientAEC is the client's AEC mode (one of the ClientAEC* values).
ClientAEC ClientAECMode
// NOTE(review): ClientNS / ClientAGC presumably report whether client-side
// noise suppression / automatic gain control is active — confirm.
ClientNS bool
ClientAGC bool
// DeviceType is one of the DeviceType* values.
DeviceType DeviceType
// PlaybackMode is one of the PlaybackMode* values.
PlaybackMode PlaybackMode
}
SessionCapabilities describes client-side media processing and device hints.
type SessionOption ¶
type SessionOption func(*sessionConfig)
SessionOption configures a Session.
func WithBargeInConfirm ¶
func WithBargeInConfirm(n int) SessionOption
func WithCapabilities ¶
func WithCapabilities(capabilities SessionCapabilities) SessionOption
func WithDetector ¶
func WithDetector(d detect.SpeechDetector) SessionOption
WithDetector sets a custom SpeechDetector. If not set, a default EnergyDetector is used.
func WithEndpointDecider ¶
func WithEndpointDecider(decider endpoint.Decider) SessionOption
func WithEventHandler ¶
func WithEventHandler(fn func(Event)) SessionOption
func WithFrameSize ¶
func WithFrameSize(d time.Duration) SessionOption
func WithMetricsHook ¶
func WithMetricsHook(hook speechmetrics.Hook) SessionOption
func WithPlaybackDrainTimeout ¶
func WithPlaybackDrainTimeout(d time.Duration) SessionOption
func WithPreprocessor ¶
func WithPreprocessor(processor preprocess.Processor) SessionOption
func WithPreprocessors ¶
func WithPreprocessors(processors ...preprocess.Processor) SessionOption
func WithSilenceDuration ¶
func WithSilenceDuration(d time.Duration) SessionOption
func WithVoiceProfile ¶
func WithVoiceProfile(profile VoiceProfile) SessionOption
type SessionState ¶
// SessionState represents the current state of the voice session.
type SessionState int

// Session states, numbered from zero via iota; StateIdle is the zero value.
const (
	StateIdle SessionState = iota
	StateHearing
	StateResponding
	StatePlayback
)
type VoiceProfile ¶
// VoiceProfile groups voice settings. Its TTSOptions method returns them as
// []tts.TTSOption, and the package provides preset constructors
// (CommandAssistantVoiceProfile, CompanionVoiceProfile, CustomerServiceVoiceProfile).
//
// NOTE(review): units, valid ranges and zero-value handling of the fields are
// not visible from this declaration — confirm before relying on them.
type VoiceProfile struct {
Language string
Voice string
Speed float64
Emotion string
Volume float64
Codec audio.Codec
// NOTE(review): unit of Rate is not shown here (audio sample rate vs. speaking rate) — confirm.
Rate int
// Scene is one of the VoiceProfileScene* values.
Scene VoiceProfileScene
}
func CommandAssistantVoiceProfile ¶
func CommandAssistantVoiceProfile() VoiceProfile
func CompanionVoiceProfile ¶
func CompanionVoiceProfile() VoiceProfile
func CustomerServiceVoiceProfile ¶
func CustomerServiceVoiceProfile() VoiceProfile
func (VoiceProfile) TTSOptions ¶
func (p VoiceProfile) TTSOptions() []tts.TTSOption
type VoiceProfileScene ¶
// VoiceProfileScene names the usage scene of a VoiceProfile; it is the type of
// VoiceProfile.Scene.
type VoiceProfileScene string

// Known VoiceProfileScene values.
const (
	VoiceProfileSceneCustomerService  VoiceProfileScene = "customer_service"
	VoiceProfileSceneCompanion        VoiceProfileScene = "companion"
	VoiceProfileSceneCommandAssistant VoiceProfileScene = "command_assistant"
)