Documentation
¶
Index ¶
- Variables
- func SanitizeSpeech(text string) string
- func Warmup(ctx context.Context, svc TTSService)
- type AudioFrame
- type AudioSender
- func (s *AudioSender) Close() error
- func (s *AudioSender) GetBufferLevel() int
- func (s *AudioSender) GetPendingCount() int
- func (s *AudioSender) ProcessFrame(frame AudioFrame) error
- func (s *AudioSender) Reset()
- func (s *AudioSender) SetLogger(callback func(string))
- func (s *AudioSender) SetOutputCodec(codec string) error
- func (s *AudioSender) Start(ctx context.Context) error
- func (s *AudioSender) Stop() error
- type AudioSenderConfig
- type Cache
- type CacheConfig
- type CachingTTSService
- func (c *CachingTTSService) Cache() *Cache
- func (c *CachingTTSService) CacheKey(text string) string
- func (c *CachingTTSService) Prewarm(ctx context.Context, texts []string, onErr func(text string, err error))
- func (c *CachingTTSService) Synthesize(ctx context.Context, text string, onPCMChunk func([]byte) error) error
- type Capabilities
- type EncodedFrame
- type Speaker
- func (s *Speaker) DrainQueue()
- func (s *Speaker) Enqueue(text, utteranceID string, e2eAnchor *time.Time) bool
- func (s *Speaker) Interrupt()
- func (s *Speaker) IsActive() bool
- func (s *Speaker) IsPlaying() bool
- func (s *Speaker) QueueDepth() int
- func (s *Speaker) SetCallbacks(onStarted func(utteranceID, text string, chained bool), ...)
- func (s *Speaker) SetCallbacksWithFirstFrame(onStarted func(utteranceID, text string, chained bool), ...)
- func (s *Speaker) Start(ctx context.Context)
- func (s *Speaker) Stop()
- type SpeakerConfig
- type TTSPipeline
- func (p *TTSPipeline) ArmFirstFrameHook(fn func())
- func (p *TTSPipeline) Close() error
- func (p *TTSPipeline) GetConfig() TTSPipelineConfig
- func (p *TTSPipeline) Interrupt()
- func (p *TTSPipeline) IsPlaying() bool
- func (p *TTSPipeline) PlayFrames(ctx context.Context, frames [][]byte) error
- func (p *TTSPipeline) SetLogger(callback func(string))
- func (p *TTSPipeline) SetOnCompleteFunc(callback func())
- func (p *TTSPipeline) Speak(text string) error
- func (p *TTSPipeline) Start(ctx context.Context) error
- func (p *TTSPipeline) Stop() error
- func (p *TTSPipeline) Synthesize(ctx context.Context, text string) error
- func (p *TTSPipeline) Warmup(ctx context.Context)
- type TTSPipelineComponent
- type TTSPipelineConfig
- type TTSService
- type TTSWorkerConfig
- type TTSWorkerPool
- type TextSegment
- type TextSegmenterComponent
- func (s *TextSegmenterComponent) Name() string
- func (s *TextSegmenterComponent) OnComplete()
- func (s *TextSegmenterComponent) Process(ctx context.Context, data interface{}) (interface{}, bool, error)
- func (s *TextSegmenterComponent) Reset()
- func (s *TextSegmenterComponent) SetLogger(callback func(string))
- func (s *TextSegmenterComponent) SetPlayID(playID string)
- type TextSegmenterConfig
Constants ¶
This section is empty.
Variables ¶
var (
	// ErrTTSServiceRequired is returned when TTSService is not provided.
	ErrTTSServiceRequired = errors.New("TTS service is required")
	// ErrSendCallbackRequired is returned when SendCallback is not provided.
	ErrSendCallbackRequired = errors.New("send callback is required")
	// ErrPipelineNotStarted is returned when pipeline operations are called before Start.
	ErrPipelineNotStarted = errors.New("pipeline not started")
	// ErrInvalidDataType is returned when data type is invalid.
	ErrInvalidDataType = errors.New("invalid data type")
	// ErrEmptyText is returned when text is empty.
	ErrEmptyText = errors.New("empty text")
)
var DefaultCache = NewCache(128, 32<<20)
DefaultCache is the process-wide TTS PCM cache (128 entries / 32 MiB).
var ErrInterrupted = errors.New("tts: interrupted")
ErrInterrupted is returned when Speak is cancelled via Interrupt.
Functions ¶
func SanitizeSpeech ¶
SanitizeSpeech prepares text for cloud TTS synthesis. It returns an empty string when the segment has no speakable content (punctuation-only, SSML, emoji, etc.).
func Warmup ¶ added in v1.3.0
func Warmup(ctx context.Context, svc TTSService)
Warmup issues a tiny synthesis request so the next real segment avoids cold-start latency.
Types ¶
type AudioFrame ¶
AudioFrame represents a frame of audio data.
type AudioSender ¶
type AudioSender struct {
// contains filtered or unexported fields
}
AudioSender handles audio encoding and sending.
func NewAudioSender ¶
func NewAudioSender(config AudioSenderConfig) (*AudioSender, error)
NewAudioSender creates a new audio sender.
func (*AudioSender) GetBufferLevel ¶
func (s *AudioSender) GetBufferLevel() int
GetBufferLevel returns the current buffer level.
func (*AudioSender) GetPendingCount ¶
func (s *AudioSender) GetPendingCount() int
GetPendingCount returns the pending packet count.
func (*AudioSender) ProcessFrame ¶
func (s *AudioSender) ProcessFrame(frame AudioFrame) error
ProcessFrame processes a PCM audio frame (encode + buffer).
func (*AudioSender) SetLogger ¶
func (s *AudioSender) SetLogger(callback func(string))
SetLogger sets the logging callback.
func (*AudioSender) SetOutputCodec ¶
func (s *AudioSender) SetOutputCodec(codec string) error
SetOutputCodec sets the output codec.
type AudioSenderConfig ¶
type AudioSenderConfig struct {
// OutputCodec: output codec (e.g., "opus", "pcm")
OutputCodec string
// TargetSampleRate: target sample rate (default: 16000)
TargetSampleRate int
// FrameDuration: frame duration (default: 60ms)
FrameDuration time.Duration
// SendCallback: callback for sending encoded audio
SendCallback func(data []byte) error
// GetPendingCountFunc: optional callback to get pending packet count
GetPendingCountFunc func() int
// Logger: optional logging callback
Logger func(string)
}
AudioSenderConfig contains configuration for the audio sender.
type Cache ¶
type Cache struct {
// contains filtered or unexported fields
}
Cache holds rendered PCM keyed by an opaque string. Safe for concurrent access. Use DefaultCache to share entries across calls in one process.
func NewCache ¶
NewCache returns an empty cache with the given caps. maxEntries and maxBytes both apply; whichever is hit first triggers eviction. Pass 0 for either parameter to disable the corresponding cap.
type CacheConfig ¶
type CacheConfig struct {
// Cache to use. nil → DefaultCache.
Cache *Cache
// VoiceKey identifies vendor + voice + sample rate + speed. Required.
VoiceKey string
// MaxRunes skips cache writes for longer texts. 0 = no limit.
MaxRunes int
// ChunkBytes controls replay chunk size. 0 → one shot.
ChunkBytes int
}
CacheConfig configures a CachingTTSService.
type CachingTTSService ¶
type CachingTTSService struct {
// contains filtered or unexported fields
}
CachingTTSService wraps TTSService with a PCM cache.
func NewCachingTTSService ¶
func NewCachingTTSService(inner TTSService, cfg CacheConfig) (*CachingTTSService, error)
NewCachingTTSService validates cfg and returns a cache-aware TTSService.
func (*CachingTTSService) Cache ¶
func (c *CachingTTSService) Cache() *Cache
Cache returns the underlying cache.
func (*CachingTTSService) CacheKey ¶
func (c *CachingTTSService) CacheKey(text string) string
CacheKey returns the canonical key for text.
func (*CachingTTSService) Prewarm ¶
func (c *CachingTTSService) Prewarm(ctx context.Context, texts []string, onErr func(text string, err error))
Prewarm renders texts once and stores PCM. Skips already-cached entries.
func (*CachingTTSService) Synthesize ¶
func (c *CachingTTSService) Synthesize(ctx context.Context, text string, onPCMChunk func([]byte) error) error
Synthesize implements TTSService.
type Capabilities ¶ added in v1.3.0
Capabilities describes vendor synthesis behavior for pipeline tuning.
func CapabilitiesFrom ¶ added in v1.3.0
func CapabilitiesFrom(svc TTSService) Capabilities
CapabilitiesFrom inspects a TTSService for optional vendor hints.
func DefaultCapabilities ¶ added in v1.3.0
func DefaultCapabilities() Capabilities
DefaultCapabilities returns conservative batch-oriented defaults.
type EncodedFrame ¶
EncodedFrame represents an encoded audio frame.
type Speaker ¶
type Speaker struct {
// contains filtered or unexported fields
}
Speaker serializes TTS playback and pipelines synthesis ahead of playback so LLM stream segments can be synthesized while the previous segment is playing.
func NewSpeaker ¶
func NewSpeaker(cfg SpeakerConfig) (*Speaker, error)
NewSpeaker creates a pipelined TTS speaker.
func (*Speaker) DrainQueue ¶
func (s *Speaker) DrainQueue()
DrainQueue drops pending jobs without interrupting the current utterance.
func (*Speaker) Enqueue ¶
Enqueue schedules text for synthesis. It is non-blocking; the text is dropped when the queue is full.
func (*Speaker) Interrupt ¶
func (s *Speaker) Interrupt()
Interrupt stops the current utterance and drains pending jobs.
func (*Speaker) QueueDepth ¶
QueueDepth returns pending text segment count.
func (*Speaker) SetCallbacks ¶
func (s *Speaker) SetCallbacks( onStarted func(utteranceID, text string, chained bool), onEnded func(utteranceID string, ok bool, duration time.Duration, ttsFirstMs, e2eFirstMs int, moreQueued bool), )
SetCallbacks updates lifecycle hooks (safe before Start).
func (*Speaker) SetCallbacksWithFirstFrame ¶ added in v1.3.0
func (s *Speaker) SetCallbacksWithFirstFrame( onStarted func(utteranceID, text string, chained bool), onEnded func(utteranceID string, ok bool, duration time.Duration, ttsFirstMs, e2eFirstMs int, moreQueued bool), onFirstFrame func(utteranceID string, ttsFirstMs, e2eFirstMs int), )
SetCallbacksWithFirstFrame also wires an optional first-frame hook (once per utterance).
type SpeakerConfig ¶
type SpeakerConfig struct {
Pipeline *TTSPipeline
QueueSize int
Prefetch int // buffered segments between synth and play (default 2)
OnStarted func(utteranceID, text string, chained bool)
OnEnded func(utteranceID string, ok bool, duration time.Duration, ttsFirstMs, e2eFirstMs int, moreQueued bool)
OnFirstFrame func(utteranceID string, ttsFirstMs, e2eFirstMs int)
}
SpeakerConfig configures a serial TTS speaker.
type TTSPipeline ¶
type TTSPipeline struct {
// contains filtered or unexported fields
}
TTSPipeline manages text-to-speech synthesis with pluggable components.
func NewTTSPipeline ¶
func NewTTSPipeline(config TTSPipelineConfig) (*TTSPipeline, error)
NewTTSPipeline creates a new TTS pipeline with the given configuration.
func (*TTSPipeline) ArmFirstFrameHook ¶
func (p *TTSPipeline) ArmFirstFrameHook(fn func())
ArmFirstFrameHook installs a one-shot callback fired on the first emitted frame.
func (*TTSPipeline) GetConfig ¶
func (p *TTSPipeline) GetConfig() TTSPipelineConfig
GetConfig returns the current pipeline configuration.
func (*TTSPipeline) Interrupt ¶
func (p *TTSPipeline) Interrupt()
Interrupt cancels the in-flight Speak. The pipeline remains usable.
func (*TTSPipeline) IsPlaying ¶
func (p *TTSPipeline) IsPlaying() bool
IsPlaying reports whether a Speak call is streaming audio.
func (*TTSPipeline) PlayFrames ¶ added in v1.3.0
func (p *TTSPipeline) PlayFrames(ctx context.Context, frames [][]byte) error
PlayFrames emits pre-collected frames with pacing and hooks.
func (*TTSPipeline) SetLogger ¶
func (p *TTSPipeline) SetLogger(callback func(string))
SetLogger sets the logging callback.
func (*TTSPipeline) SetOnCompleteFunc ¶
func (p *TTSPipeline) SetOnCompleteFunc(callback func())
SetOnCompleteFunc sets the completion callback.
func (*TTSPipeline) Speak ¶
func (p *TTSPipeline) Speak(text string) error
Speak synthesizes text with per-utterance cancellation (barge-in). Blocks until synthesis completes, is interrupted, or the pipeline stops.
func (*TTSPipeline) Start ¶
func (p *TTSPipeline) Start(ctx context.Context) error
Start starts the TTS pipeline.
func (*TTSPipeline) Synthesize ¶
func (p *TTSPipeline) Synthesize(ctx context.Context, text string) error
Synthesize synthesizes text to audio using the provided context.
func (*TTSPipeline) Warmup ¶ added in v1.3.0
func (p *TTSPipeline) Warmup(ctx context.Context)
Warmup warms the pipeline's configured TTS service.
type TTSPipelineComponent ¶
type TTSPipelineComponent interface {
// Name returns the component name.
Name() string
// Process processes data through the component.
// For text processors: input is string, output is string
// For audio processors: input is []byte, output is []byte
// Returns (processedData, shouldContinue, error)
Process(ctx context.Context, data interface{}) (interface{}, bool, error)
}
TTSPipelineComponent defines the interface for TTS pipeline components.
type TTSPipelineConfig ¶
type TTSPipelineConfig struct {
// TTSService: the TTS service to use for synthesis
TTSService TTSService
// OutputCodec: output audio codec (e.g., "opus", "pcm")
OutputCodec string
// TargetSampleRate: target sample rate for output audio (default: 16000)
TargetSampleRate int
// FrameDuration: frame duration for audio processing (default: 60ms)
FrameDuration time.Duration
// TextProcessors: optional text processing components (e.g., text normalization)
TextProcessors []TTSPipelineComponent
// AudioProcessors: optional audio processing components (e.g., encoding)
AudioProcessors []TTSPipelineComponent
// SendCallback: callback for sending synthesized audio
SendCallback func(data []byte) error
// RecordCallback: optional callback for recording synthesized audio
RecordCallback func(data []byte) error
// Logger: optional logging callback
Logger func(string)
// PaceRealtime sleeps between frames so playback matches wall-clock (required for RTP/VoIP).
PaceRealtime bool
}
TTSPipelineConfig contains configuration for the TTS pipeline.
func DefaultTTSPipelineConfig ¶
func DefaultTTSPipelineConfig(ttsService TTSService) TTSPipelineConfig
DefaultTTSPipelineConfig returns default TTS pipeline configuration.
type TTSService ¶
type TTSService interface {
// Synthesize synthesizes text to audio and calls the callback for each audio chunk.
// The callback receives PCM audio data (typically 16-bit mono at 16kHz).
Synthesize(ctx context.Context, text string, callback func([]byte) error) error
}
TTSService defines the interface for text-to-speech synthesis.
func FromSynthesisEngine ¶
func FromSynthesisEngine(engine synthesizer.AudioSynthesisEngine) TTSService
FromSynthesisEngine wraps synthesizer.AudioSynthesisEngine as TTSService.
type TTSWorkerConfig ¶
type TTSWorkerConfig struct {
// TTSService: the TTS service to use
TTSService TTSService
// WorkerCount: number of worker goroutines (default: 1)
WorkerCount int
// Logger: optional logging callback
Logger func(string)
}
TTSWorkerConfig contains configuration for TTS workers.
type TTSWorkerPool ¶
type TTSWorkerPool struct {
// contains filtered or unexported fields
}
TTSWorkerPool manages multiple TTS worker goroutines.
func NewTTSWorkerPool ¶
func NewTTSWorkerPool( config TTSWorkerConfig, inputCh <-chan TextSegment, outputCh chan<- AudioFrame, ) (*TTSWorkerPool, error)
NewTTSWorkerPool creates a new TTS worker pool.
func (*TTSWorkerPool) GetGlobalSequence ¶
func (p *TTSWorkerPool) GetGlobalSequence() uint32
GetGlobalSequence returns the current global sequence number.
func (*TTSWorkerPool) Start ¶
func (p *TTSWorkerPool) Start(ctx context.Context) error
Start starts the worker pool.
func (*TTSWorkerPool) UpdateTTSService ¶
func (p *TTSWorkerPool) UpdateTTSService(newService TTSService) error
UpdateTTSService updates the TTS service (for speaker switching).
type TextSegment ¶
TextSegment represents a text segment for TTS synthesis.
type TextSegmenterComponent ¶
type TextSegmenterComponent struct {
// contains filtered or unexported fields
}
TextSegmenterComponent segments streaming LLM text for TTS synthesis.
func NewTextSegmenterComponent ¶
func NewTextSegmenterComponent(config TextSegmenterConfig, outputFunc func(TextSegment)) *TextSegmenterComponent
NewTextSegmenterComponent creates a new text segmenter component.
func (*TextSegmenterComponent) Name ¶
func (s *TextSegmenterComponent) Name() string
Name returns the component name.
func (*TextSegmenterComponent) OnComplete ¶
func (s *TextSegmenterComponent) OnComplete()
OnComplete flushes the tail when the LLM stream ends.
func (*TextSegmenterComponent) Process ¶
func (s *TextSegmenterComponent) Process(ctx context.Context, data interface{}) (interface{}, bool, error)
Process ingests streaming LLM text and emits segments when rules fire.
func (*TextSegmenterComponent) Reset ¶
func (s *TextSegmenterComponent) Reset()
Reset resets the segmenter state.
func (*TextSegmenterComponent) SetLogger ¶
func (s *TextSegmenterComponent) SetLogger(callback func(string))
SetLogger sets the logging callback.
func (*TextSegmenterComponent) SetPlayID ¶
func (s *TextSegmenterComponent) SetPlayID(playID string)
SetPlayID sets the current play ID for segments.
type TextSegmenterConfig ¶
type TextSegmenterConfig struct {
DelayTimeout time.Duration
// First segment (latency-optimized)
FirstMinChars int // comma/pause flush once buffer has this many runes (default 6)
FirstMaxChars int // first-chunk safety cap with punctuation-aware cut (default 18, 0=off)
// Later segments (semantic priority)
RestForceMaxChars int // emergency split only after this many runes without sentence end (default 120, 0=wait for OnComplete)
// Deprecated: use FirstMinChars / FirstMaxChars. Kept for backward compatibility.
MinChars int
MaxChars int
}
TextSegmenterConfig controls LLM→TTS streaming segmentation.
Strategy:
- First segment: lower latency — break on sentence end, comma (≥FirstMinChars), or FirstMaxChars with punctuation-aware boundary.
- Later segments: semantic priority — only sentence-ending punctuation or stream end; no comma/char chops unless RestForceMaxChars safety triggers.
func DefaultTextSegmenterConfig ¶
func DefaultTextSegmenterConfig() TextSegmenterConfig
DefaultTextSegmenterConfig returns punctuation-first segmentation defaults.