Documentation
¶
Index ¶
- Variables
- func SanitizeSpeech(text string) string
- type AudioFrame
- type AudioSender
- func (s *AudioSender) Close() error
- func (s *AudioSender) GetBufferLevel() int
- func (s *AudioSender) GetPendingCount() int
- func (s *AudioSender) ProcessFrame(frame AudioFrame) error
- func (s *AudioSender) Reset()
- func (s *AudioSender) SetLogger(callback func(string))
- func (s *AudioSender) SetOutputCodec(codec string) error
- func (s *AudioSender) Start(ctx context.Context) error
- func (s *AudioSender) Stop() error
- type AudioSenderConfig
- type Cache
- type CacheConfig
- type CachingTTSService
- func (c *CachingTTSService) Cache() *Cache
- func (c *CachingTTSService) CacheKey(text string) string
- func (c *CachingTTSService) Prewarm(ctx context.Context, texts []string, onErr func(text string, err error))
- func (c *CachingTTSService) Synthesize(ctx context.Context, text string, onPCMChunk func([]byte) error) error
- type EncodedFrame
- type Speaker
- func (s *Speaker) DrainQueue()
- func (s *Speaker) Enqueue(text, utteranceID string, e2eAnchor *time.Time) bool
- func (s *Speaker) Interrupt()
- func (s *Speaker) IsActive() bool
- func (s *Speaker) IsPlaying() bool
- func (s *Speaker) QueueDepth() int
- func (s *Speaker) SetCallbacks(onStarted func(utteranceID, text string, chained bool), ...)
- func (s *Speaker) Start(ctx context.Context)
- func (s *Speaker) Stop()
- type SpeakerConfig
- type TTSPipeline
- func (p *TTSPipeline) ArmFirstFrameHook(fn func())
- func (p *TTSPipeline) Close() error
- func (p *TTSPipeline) GetConfig() TTSPipelineConfig
- func (p *TTSPipeline) Interrupt()
- func (p *TTSPipeline) IsPlaying() bool
- func (p *TTSPipeline) SetLogger(callback func(string))
- func (p *TTSPipeline) SetOnCompleteFunc(callback func())
- func (p *TTSPipeline) Speak(text string) error
- func (p *TTSPipeline) Start(ctx context.Context) error
- func (p *TTSPipeline) Stop() error
- func (p *TTSPipeline) Synthesize(ctx context.Context, text string) error
- type TTSPipelineComponent
- type TTSPipelineConfig
- type TTSService
- type TTSWorkerConfig
- type TTSWorkerPool
- type TextSegment
- type TextSegmenterComponent
- func (s *TextSegmenterComponent) Name() string
- func (s *TextSegmenterComponent) OnComplete()
- func (s *TextSegmenterComponent) Process(ctx context.Context, data interface{}) (interface{}, bool, error)
- func (s *TextSegmenterComponent) Reset()
- func (s *TextSegmenterComponent) SetLogger(callback func(string))
- func (s *TextSegmenterComponent) SetPlayID(playID string)
- type TextSegmenterConfig
Constants ¶
This section is empty.
Variables ¶
var (
	// ErrTTSServiceRequired is returned when TTSService is not provided.
	ErrTTSServiceRequired = errors.New("TTS service is required")
	// ErrSendCallbackRequired is returned when SendCallback is not provided.
	ErrSendCallbackRequired = errors.New("send callback is required")
	// ErrPipelineNotStarted is returned when pipeline operations are called before Start.
	ErrPipelineNotStarted = errors.New("pipeline not started")
	// ErrInvalidDataType is returned when data type is invalid.
	ErrInvalidDataType = errors.New("invalid data type")
	// ErrEmptyText is returned when text is empty.
	ErrEmptyText = errors.New("empty text")
)
var DefaultCache = NewCache(128, 32<<20)
DefaultCache is the process-wide TTS PCM cache (128 entries / 32 MiB).
var ErrInterrupted = errors.New("tts: interrupted")
ErrInterrupted is returned when Speak is cancelled via Interrupt.
Functions ¶
func SanitizeSpeech ¶
SanitizeSpeech prepares text for cloud TTS synthesis.
Types ¶
type AudioFrame ¶
AudioFrame represents a frame of audio data.
type AudioSender ¶
type AudioSender struct {
// contains filtered or unexported fields
}
AudioSender handles audio encoding and sending.
func NewAudioSender ¶
func NewAudioSender(config AudioSenderConfig) (*AudioSender, error)
NewAudioSender creates a new audio sender.
func (*AudioSender) GetBufferLevel ¶
func (s *AudioSender) GetBufferLevel() int
GetBufferLevel returns the current buffer level.
func (*AudioSender) GetPendingCount ¶
func (s *AudioSender) GetPendingCount() int
GetPendingCount returns the pending packet count.
func (*AudioSender) ProcessFrame ¶
func (s *AudioSender) ProcessFrame(frame AudioFrame) error
ProcessFrame processes a PCM audio frame (encode + buffer).
func (*AudioSender) SetLogger ¶
func (s *AudioSender) SetLogger(callback func(string))
SetLogger sets the logging callback.
func (*AudioSender) SetOutputCodec ¶
func (s *AudioSender) SetOutputCodec(codec string) error
SetOutputCodec sets the output codec.
type AudioSenderConfig ¶
type AudioSenderConfig struct {
// OutputCodec: output codec (e.g., "opus", "pcm")
OutputCodec string
// TargetSampleRate: target sample rate (default: 16000)
TargetSampleRate int
// FrameDuration: frame duration (default: 60ms)
FrameDuration time.Duration
// SendCallback: callback for sending encoded audio
SendCallback func(data []byte) error
// GetPendingCountFunc: optional callback to get pending packet count
GetPendingCountFunc func() int
// Logger: optional logging callback
Logger func(string)
}
AudioSenderConfig contains configuration for the audio sender.
type Cache ¶
type Cache struct {
// contains filtered or unexported fields
}
Cache holds rendered PCM keyed by an opaque string. Safe for concurrent access. Use DefaultCache to share entries across calls in one process.
func NewCache ¶
NewCache returns an empty cache with the given caps. maxEntries and maxBytes both apply; whichever is hit first triggers eviction. Pass 0 for either argument to disable that particular cap.
type CacheConfig ¶
type CacheConfig struct {
// Cache to use. nil → DefaultCache.
Cache *Cache
// VoiceKey identifies vendor + voice + sample rate + speed. Required.
VoiceKey string
// MaxRunes skips cache writes for longer texts. 0 = no limit.
MaxRunes int
// ChunkBytes controls replay chunk size. 0 → one shot.
ChunkBytes int
}
CacheConfig configures a CachingTTSService.
type CachingTTSService ¶
type CachingTTSService struct {
// contains filtered or unexported fields
}
CachingTTSService wraps TTSService with a PCM cache.
func NewCachingTTSService ¶
func NewCachingTTSService(inner TTSService, cfg CacheConfig) (*CachingTTSService, error)
NewCachingTTSService validates cfg and returns a cache-aware TTSService.
func (*CachingTTSService) Cache ¶
func (c *CachingTTSService) Cache() *Cache
Cache returns the underlying cache.
func (*CachingTTSService) CacheKey ¶
func (c *CachingTTSService) CacheKey(text string) string
CacheKey returns the canonical key for text.
func (*CachingTTSService) Prewarm ¶
func (c *CachingTTSService) Prewarm(ctx context.Context, texts []string, onErr func(text string, err error))
Prewarm renders texts once and stores PCM. Skips already-cached entries.
func (*CachingTTSService) Synthesize ¶
func (c *CachingTTSService) Synthesize(ctx context.Context, text string, onPCMChunk func([]byte) error) error
Synthesize implements TTSService.
type EncodedFrame ¶
EncodedFrame represents an encoded audio frame.
type Speaker ¶
type Speaker struct {
// contains filtered or unexported fields
}
Speaker serializes TTS playback so adjacent utterances never overlap.
func NewSpeaker ¶
func NewSpeaker(cfg SpeakerConfig) (*Speaker, error)
NewSpeaker creates a serial TTS speaker.
func (*Speaker) DrainQueue ¶
func (s *Speaker) DrainQueue()
DrainQueue drops pending jobs without interrupting the current utterance.
func (*Speaker) Enqueue ¶
Enqueue schedules text for synthesis. Non-blocking; drops when queue is full.
func (*Speaker) Interrupt ¶
func (s *Speaker) Interrupt()
Interrupt stops the current utterance and drains pending jobs.
func (*Speaker) QueueDepth ¶
QueueDepth returns pending utterance count.
func (*Speaker) SetCallbacks ¶
func (s *Speaker) SetCallbacks(
	onStarted func(utteranceID, text string, chained bool),
	onEnded func(utteranceID string, ok bool, duration time.Duration, ttsFirstMs, e2eFirstMs int, moreQueued bool),
)
SetCallbacks updates lifecycle hooks (safe before Start).
type SpeakerConfig ¶
type SpeakerConfig struct {
Pipeline *TTSPipeline
QueueSize int
OnStarted func(utteranceID, text string, chained bool)
OnEnded func(utteranceID string, ok bool, duration time.Duration, ttsFirstMs, e2eFirstMs int, moreQueued bool)
}
SpeakerConfig configures a serial TTS speaker.
type TTSPipeline ¶
type TTSPipeline struct {
// contains filtered or unexported fields
}
TTSPipeline manages text-to-speech synthesis with pluggable components.
func NewTTSPipeline ¶
func NewTTSPipeline(config TTSPipelineConfig) (*TTSPipeline, error)
NewTTSPipeline creates a new TTS pipeline with the given configuration.
func (*TTSPipeline) ArmFirstFrameHook ¶
func (p *TTSPipeline) ArmFirstFrameHook(fn func())
ArmFirstFrameHook installs a one-shot callback fired on the first emitted frame.
func (*TTSPipeline) GetConfig ¶
func (p *TTSPipeline) GetConfig() TTSPipelineConfig
GetConfig returns the current pipeline configuration.
func (*TTSPipeline) Interrupt ¶
func (p *TTSPipeline) Interrupt()
Interrupt cancels the in-flight Speak. The pipeline remains usable.
func (*TTSPipeline) IsPlaying ¶
func (p *TTSPipeline) IsPlaying() bool
IsPlaying reports whether a Speak call is streaming audio.
func (*TTSPipeline) SetLogger ¶
func (p *TTSPipeline) SetLogger(callback func(string))
SetLogger sets the logging callback.
func (*TTSPipeline) SetOnCompleteFunc ¶
func (p *TTSPipeline) SetOnCompleteFunc(callback func())
SetOnCompleteFunc sets the completion callback.
func (*TTSPipeline) Speak ¶
func (p *TTSPipeline) Speak(text string) error
Speak synthesizes text with per-utterance cancellation (barge-in). Blocks until synthesis completes, is interrupted, or the pipeline stops.
func (*TTSPipeline) Start ¶
func (p *TTSPipeline) Start(ctx context.Context) error
Start starts the TTS pipeline.
func (*TTSPipeline) Synthesize ¶
func (p *TTSPipeline) Synthesize(ctx context.Context, text string) error
Synthesize synthesizes text to audio using the provided context.
type TTSPipelineComponent ¶
type TTSPipelineComponent interface {
// Name returns the component name.
Name() string
// Process processes data through the component.
// For text processors: input is string, output is string
// For audio processors: input is []byte, output is []byte
// Returns (processedData, shouldContinue, error)
Process(ctx context.Context, data interface{}) (interface{}, bool, error)
}
TTSPipelineComponent defines the interface for TTS pipeline components.
type TTSPipelineConfig ¶
type TTSPipelineConfig struct {
// TTSService: the TTS service to use for synthesis
TTSService TTSService
// OutputCodec: output audio codec (e.g., "opus", "pcm")
OutputCodec string
// TargetSampleRate: target sample rate for output audio (default: 16000)
TargetSampleRate int
// FrameDuration: frame duration for audio processing (default: 60ms)
FrameDuration time.Duration
// TextProcessors: optional text processing components (e.g., text normalization)
TextProcessors []TTSPipelineComponent
// AudioProcessors: optional audio processing components (e.g., encoding)
AudioProcessors []TTSPipelineComponent
// SendCallback: callback for sending synthesized audio
SendCallback func(data []byte) error
// RecordCallback: optional callback for recording synthesized audio
RecordCallback func(data []byte) error
// Logger: optional logging callback
Logger func(string)
// PaceRealtime sleeps between frames so playback matches wall-clock (required for RTP/VoIP).
PaceRealtime bool
}
TTSPipelineConfig contains configuration for the TTS pipeline.
func DefaultTTSPipelineConfig ¶
func DefaultTTSPipelineConfig(ttsService TTSService) TTSPipelineConfig
DefaultTTSPipelineConfig returns default TTS pipeline configuration.
type TTSService ¶
type TTSService interface {
// Synthesize synthesizes text to audio and calls the callback for each audio chunk.
// The callback receives PCM audio data (typically 16-bit mono at 16kHz).
Synthesize(ctx context.Context, text string, callback func([]byte) error) error
}
TTSService defines the interface for text-to-speech synthesis.
func FromSynthesisEngine ¶
func FromSynthesisEngine(engine synthesizer.AudioSynthesisEngine) TTSService
FromSynthesisEngine wraps synthesizer.AudioSynthesisEngine as TTSService.
type TTSWorkerConfig ¶
type TTSWorkerConfig struct {
// TTSService: the TTS service to use
TTSService TTSService
// WorkerCount: number of worker goroutines (default: 1)
WorkerCount int
// Logger: optional logging callback
Logger func(string)
}
TTSWorkerConfig contains configuration for TTS workers.
type TTSWorkerPool ¶
type TTSWorkerPool struct {
// contains filtered or unexported fields
}
TTSWorkerPool manages multiple TTS worker goroutines.
func NewTTSWorkerPool ¶
func NewTTSWorkerPool(
	config TTSWorkerConfig,
	inputCh <-chan TextSegment,
	outputCh chan<- AudioFrame,
) (*TTSWorkerPool, error)
NewTTSWorkerPool creates a new TTS worker pool.
func (*TTSWorkerPool) GetGlobalSequence ¶
func (p *TTSWorkerPool) GetGlobalSequence() uint32
GetGlobalSequence returns the current global sequence number.
func (*TTSWorkerPool) Start ¶
func (p *TTSWorkerPool) Start(ctx context.Context) error
Start starts the worker pool.
func (*TTSWorkerPool) UpdateTTSService ¶
func (p *TTSWorkerPool) UpdateTTSService(newService TTSService) error
UpdateTTSService updates the TTS service (for speaker switching).
type TextSegment ¶
TextSegment represents a text segment for TTS synthesis.
type TextSegmenterComponent ¶
type TextSegmenterComponent struct {
// contains filtered or unexported fields
}
TextSegmenterComponent segments text for streaming TTS synthesis. It intelligently breaks text at sentence boundaries and accumulates text based on character count and punctuation.
func NewTextSegmenterComponent ¶
func NewTextSegmenterComponent(config TextSegmenterConfig, outputFunc func(TextSegment)) *TextSegmenterComponent
NewTextSegmenterComponent creates a new text segmenter component.
func (*TextSegmenterComponent) Name ¶
func (s *TextSegmenterComponent) Name() string
Name returns the component name.
func (*TextSegmenterComponent) OnComplete ¶
func (s *TextSegmenterComponent) OnComplete()
OnComplete marks the end of text input.
func (*TextSegmenterComponent) Process ¶
func (s *TextSegmenterComponent) Process(ctx context.Context, data interface{}) (interface{}, bool, error)
Process processes text for segmentation. It returns (remainingText, shouldContinue, error).
func (*TextSegmenterComponent) Reset ¶
func (s *TextSegmenterComponent) Reset()
Reset resets the segmenter state.
func (*TextSegmenterComponent) SetLogger ¶
func (s *TextSegmenterComponent) SetLogger(callback func(string))
SetLogger sets the logging callback.
func (*TextSegmenterComponent) SetPlayID ¶
func (s *TextSegmenterComponent) SetPlayID(playID string)
SetPlayID sets the current play ID for segments.
type TextSegmenterConfig ¶
type TextSegmenterConfig struct {
// DelayTimeout: delay before sending a segment (default: 50ms)
DelayTimeout time.Duration
// MinChars: minimum characters before sending a segment (default: 15)
MinChars int
// MaxChars: maximum characters in a segment (default: 35)
MaxChars int
}
TextSegmenterConfig contains configuration for text segmentation.
func DefaultTextSegmenterConfig ¶
func DefaultTextSegmenterConfig() TextSegmenterConfig
DefaultTextSegmenterConfig returns default text segmenter configuration.