tts

package
v1.2.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jun 6, 2026 | License: MIT | Imports: 13 | Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
var (
	// ErrTTSServiceRequired is returned when TTSService is not provided.
	ErrTTSServiceRequired = errors.New("TTS service is required")

	// ErrSendCallbackRequired is returned when SendCallback is not provided.
	ErrSendCallbackRequired = errors.New("send callback is required")

	// ErrPipelineNotStarted is returned when pipeline operations are called before Start.
	ErrPipelineNotStarted = errors.New("pipeline not started")

	// ErrInvalidDataType is returned when data type is invalid.
	ErrInvalidDataType = errors.New("invalid data type")

	// ErrEmptyText is returned when text is empty.
	ErrEmptyText = errors.New("empty text")
)
View Source
var DefaultCache = NewCache(128, 32<<20)

DefaultCache is the process-wide TTS PCM cache (128 entries / 32 MiB).

View Source
var ErrInterrupted = errors.New("tts: interrupted")

ErrInterrupted is returned when Speak is cancelled via Interrupt.

Functions

func SanitizeSpeech

func SanitizeSpeech(text string) string

SanitizeSpeech prepares text for cloud TTS synthesis.

Types

type AudioFrame

type AudioFrame struct {
	Data       []byte
	SampleRate int
	Channels   int
	PlayID     string
	Sequence   uint32
}

AudioFrame represents a frame of audio data.

type AudioSender

type AudioSender struct {
	// contains filtered or unexported fields
}

AudioSender handles audio encoding and sending.

func NewAudioSender

func NewAudioSender(config AudioSenderConfig) (*AudioSender, error)

NewAudioSender creates a new audio sender.

func (*AudioSender) Close

func (s *AudioSender) Close() error

Close closes the audio sender.

func (*AudioSender) GetBufferLevel

func (s *AudioSender) GetBufferLevel() int

GetBufferLevel returns the current buffer level.

func (*AudioSender) GetPendingCount

func (s *AudioSender) GetPendingCount() int

GetPendingCount returns the pending packet count.

func (*AudioSender) ProcessFrame

func (s *AudioSender) ProcessFrame(frame AudioFrame) error

ProcessFrame processes a PCM audio frame (encode + buffer).

func (*AudioSender) Reset

func (s *AudioSender) Reset()

Reset resets the sender state.

func (*AudioSender) SetLogger

func (s *AudioSender) SetLogger(callback func(string))

SetLogger sets the logging callback.

func (*AudioSender) SetOutputCodec

func (s *AudioSender) SetOutputCodec(codec string) error

SetOutputCodec sets the output codec.

func (*AudioSender) Start

func (s *AudioSender) Start(ctx context.Context) error

Start starts the audio sender.

func (*AudioSender) Stop

func (s *AudioSender) Stop() error

Stop stops the audio sender.

type AudioSenderConfig

type AudioSenderConfig struct {
	// OutputCodec: output codec (e.g., "opus", "pcm")
	OutputCodec string
	// TargetSampleRate: target sample rate (default: 16000)
	TargetSampleRate int
	// FrameDuration: frame duration (default: 60ms)
	FrameDuration time.Duration
	// SendCallback: callback for sending encoded audio
	SendCallback func(data []byte) error
	// GetPendingCountFunc: optional callback to get pending packet count
	GetPendingCountFunc func() int
	// Logger: optional logging callback
	Logger func(string)
}

AudioSenderConfig contains configuration for the audio sender.

type Cache

type Cache struct {
	// contains filtered or unexported fields
}

Cache holds rendered PCM keyed by an opaque string. Safe for concurrent access. Use DefaultCache to share entries across calls in one process.

func NewCache

func NewCache(maxEntries, maxBytes int) *Cache

NewCache returns an empty cache with the given caps. maxEntries and maxBytes both apply; whichever is hit first triggers eviction. Pass 0 for either argument to disable that cap.

func (*Cache) Bytes

func (c *Cache) Bytes() int

Bytes returns the current total PCM bytes held.

func (*Cache) Get

func (c *Cache) Get(key string) ([]byte, bool)

Get returns (pcm, true) on a hit. The returned slice is owned by the cache; callers must not modify it.

func (*Cache) Len

func (c *Cache) Len() int

Len returns the number of cached entries.

func (*Cache) Put

func (c *Cache) Put(key string, pcm []byte)

Put stores a copy of pcm under key, evicting oldest entries when over cap.

type CacheConfig

type CacheConfig struct {
	// Cache to use. nil → DefaultCache.
	Cache *Cache
	// VoiceKey identifies vendor + voice + sample rate + speed. Required.
	VoiceKey string
	// MaxRunes skips cache writes for longer texts. 0 = no limit.
	MaxRunes int
	// ChunkBytes controls replay chunk size. 0 → one shot.
	ChunkBytes int
}

CacheConfig configures a CachingTTSService.

type CachingTTSService

type CachingTTSService struct {
	// contains filtered or unexported fields
}

CachingTTSService wraps TTSService with a PCM cache.

func NewCachingTTSService

func NewCachingTTSService(inner TTSService, cfg CacheConfig) (*CachingTTSService, error)

NewCachingTTSService validates cfg and returns a cache-aware TTSService.

func (*CachingTTSService) Cache

func (c *CachingTTSService) Cache() *Cache

Cache returns the underlying cache.

func (*CachingTTSService) CacheKey

func (c *CachingTTSService) CacheKey(text string) string

CacheKey returns the canonical key for text.

func (*CachingTTSService) Prewarm

func (c *CachingTTSService) Prewarm(ctx context.Context, texts []string, onErr func(text string, err error))

Prewarm renders texts once and stores PCM. Skips already-cached entries.

func (*CachingTTSService) Synthesize

func (c *CachingTTSService) Synthesize(ctx context.Context, text string, onPCMChunk func([]byte) error) error

Synthesize implements TTSService.

type EncodedFrame

type EncodedFrame struct {
	Data     []byte
	PlayID   string
	Sequence uint32
}

EncodedFrame represents an encoded audio frame.

type Speaker

type Speaker struct {
	// contains filtered or unexported fields
}

Speaker serializes TTS playback so adjacent utterances never overlap.

func NewSpeaker

func NewSpeaker(cfg SpeakerConfig) (*Speaker, error)

NewSpeaker creates a serial TTS speaker.

func (*Speaker) DrainQueue

func (s *Speaker) DrainQueue()

DrainQueue drops pending jobs without interrupting the current utterance.

func (*Speaker) Enqueue

func (s *Speaker) Enqueue(text, utteranceID string, e2eAnchor *time.Time) bool

Enqueue schedules text for synthesis. Non-blocking; drops when queue is full.

func (*Speaker) Interrupt

func (s *Speaker) Interrupt()

Interrupt stops the current utterance and drains pending jobs.

func (*Speaker) IsActive

func (s *Speaker) IsActive() bool

IsActive reports whether the speaker is streaming audio or still has utterances queued.

func (*Speaker) IsPlaying

func (s *Speaker) IsPlaying() bool

IsPlaying reports whether TTS audio is actively streaming.

func (*Speaker) QueueDepth

func (s *Speaker) QueueDepth() int

QueueDepth returns pending utterance count.

func (*Speaker) SetCallbacks

func (s *Speaker) SetCallbacks(
	onStarted func(utteranceID, text string, chained bool),
	onEnded func(utteranceID string, ok bool, duration time.Duration, ttsFirstMs, e2eFirstMs int, moreQueued bool),
)

SetCallbacks updates lifecycle hooks (safe before Start).

func (*Speaker) Start

func (s *Speaker) Start(ctx context.Context)

Start begins draining the speak queue.

func (*Speaker) Stop

func (s *Speaker) Stop()

Stop drains and shuts down the speaker.

type SpeakerConfig

type SpeakerConfig struct {
	Pipeline  *TTSPipeline
	QueueSize int
	OnStarted func(utteranceID, text string, chained bool)
	OnEnded   func(utteranceID string, ok bool, duration time.Duration, ttsFirstMs, e2eFirstMs int, moreQueued bool)
}

SpeakerConfig configures a serial TTS speaker.

type TTSPipeline

type TTSPipeline struct {
	// contains filtered or unexported fields
}

TTSPipeline manages text-to-speech synthesis with pluggable components.

func NewTTSPipeline

func NewTTSPipeline(config TTSPipelineConfig) (*TTSPipeline, error)

NewTTSPipeline creates a new TTS pipeline with the given configuration.

func (*TTSPipeline) ArmFirstFrameHook

func (p *TTSPipeline) ArmFirstFrameHook(fn func())

ArmFirstFrameHook installs a one-shot callback fired on the first emitted frame.

func (*TTSPipeline) Close

func (p *TTSPipeline) Close() error

Close closes the TTS pipeline.

func (*TTSPipeline) GetConfig

func (p *TTSPipeline) GetConfig() TTSPipelineConfig

GetConfig returns the current pipeline configuration.

func (*TTSPipeline) Interrupt

func (p *TTSPipeline) Interrupt()

Interrupt cancels the in-flight Speak. The pipeline remains usable.

func (*TTSPipeline) IsPlaying

func (p *TTSPipeline) IsPlaying() bool

IsPlaying reports whether a Speak call is streaming audio.

func (*TTSPipeline) SetLogger

func (p *TTSPipeline) SetLogger(callback func(string))

SetLogger sets the logging callback.

func (*TTSPipeline) SetOnCompleteFunc

func (p *TTSPipeline) SetOnCompleteFunc(callback func())

SetOnCompleteFunc sets the completion callback.

func (*TTSPipeline) Speak

func (p *TTSPipeline) Speak(text string) error

Speak synthesizes text with per-utterance cancellation (barge-in). Blocks until synthesis completes, is interrupted, or the pipeline stops.

func (*TTSPipeline) Start

func (p *TTSPipeline) Start(ctx context.Context) error

Start starts the TTS pipeline.

func (*TTSPipeline) Stop

func (p *TTSPipeline) Stop() error

Stop stops the TTS pipeline.

func (*TTSPipeline) Synthesize

func (p *TTSPipeline) Synthesize(ctx context.Context, text string) error

Synthesize synthesizes text to audio using the provided context.

type TTSPipelineComponent

type TTSPipelineComponent interface {
	// Name returns the component name.
	Name() string

	// Process processes data through the component.
	// For text processors: input is string, output is string
	// For audio processors: input is []byte, output is []byte
	// Returns (processedData, shouldContinue, error)
	Process(ctx context.Context, data interface{}) (interface{}, bool, error)
}

TTSPipelineComponent defines the interface for TTS pipeline components.

type TTSPipelineConfig

type TTSPipelineConfig struct {
	// TTSService: the TTS service to use for synthesis
	TTSService TTSService
	// OutputCodec: output audio codec (e.g., "opus", "pcm")
	OutputCodec string
	// TargetSampleRate: target sample rate for output audio (default: 16000)
	TargetSampleRate int
	// FrameDuration: frame duration for audio processing (default: 60ms)
	FrameDuration time.Duration
	// TextProcessors: optional text processing components (e.g., text normalization)
	TextProcessors []TTSPipelineComponent
	// AudioProcessors: optional audio processing components (e.g., encoding)
	AudioProcessors []TTSPipelineComponent
	// SendCallback: callback for sending synthesized audio
	SendCallback func(data []byte) error
	// RecordCallback: optional callback for recording synthesized audio
	RecordCallback func(data []byte) error
	// Logger: optional logging callback
	Logger func(string)
	// PaceRealtime sleeps between frames so playback matches wall-clock (required for RTP/VoIP).
	PaceRealtime bool
}

TTSPipelineConfig contains configuration for the TTS pipeline.

func DefaultTTSPipelineConfig

func DefaultTTSPipelineConfig(ttsService TTSService) TTSPipelineConfig

DefaultTTSPipelineConfig returns default TTS pipeline configuration.

type TTSService

type TTSService interface {
	// Synthesize synthesizes text to audio and calls the callback for each audio chunk.
	// The callback receives PCM audio data (typically 16-bit mono at 16kHz).
	Synthesize(ctx context.Context, text string, callback func([]byte) error) error
}

TTSService defines the interface for text-to-speech synthesis.

func FromSynthesisEngine

func FromSynthesisEngine(engine synthesizer.AudioSynthesisEngine) TTSService

FromSynthesisEngine wraps synthesizer.AudioSynthesisEngine as TTSService.

type TTSWorkerConfig

type TTSWorkerConfig struct {
	// TTSService: the TTS service to use
	TTSService TTSService
	// WorkerCount: number of worker goroutines (default: 1)
	WorkerCount int
	// Logger: optional logging callback
	Logger func(string)
}

TTSWorkerConfig contains configuration for TTS workers.

type TTSWorkerPool

type TTSWorkerPool struct {
	// contains filtered or unexported fields
}

TTSWorkerPool manages multiple TTS worker goroutines.

func NewTTSWorkerPool

func NewTTSWorkerPool(
	config TTSWorkerConfig,
	inputCh <-chan TextSegment,
	outputCh chan<- AudioFrame,
) (*TTSWorkerPool, error)

NewTTSWorkerPool creates a new TTS worker pool.

func (*TTSWorkerPool) Close

func (p *TTSWorkerPool) Close() error

Close closes the worker pool.

func (*TTSWorkerPool) GetGlobalSequence

func (p *TTSWorkerPool) GetGlobalSequence() uint32

GetGlobalSequence returns the current global sequence number.

func (*TTSWorkerPool) Start

func (p *TTSWorkerPool) Start(ctx context.Context) error

Start starts the worker pool.

func (*TTSWorkerPool) Stop

func (p *TTSWorkerPool) Stop() error

Stop stops the worker pool.

func (*TTSWorkerPool) UpdateTTSService

func (p *TTSWorkerPool) UpdateTTSService(newService TTSService) error

UpdateTTSService updates the TTS service (for speaker switching).

type TextSegment

type TextSegment struct {
	Text      string
	IsFinal   bool
	Timestamp time.Time
	PlayID    string
}

TextSegment represents a text segment for TTS synthesis.

type TextSegmenterComponent

type TextSegmenterComponent struct {
	// contains filtered or unexported fields
}

TextSegmenterComponent segments text for streaming TTS synthesis. It intelligently breaks text at sentence boundaries and accumulates text based on character count and punctuation.

func NewTextSegmenterComponent

func NewTextSegmenterComponent(config TextSegmenterConfig, outputFunc func(TextSegment)) *TextSegmenterComponent

NewTextSegmenterComponent creates a new text segmenter component.

func (*TextSegmenterComponent) Name

func (s *TextSegmenterComponent) Name() string

Name returns the component name.

func (*TextSegmenterComponent) OnComplete

func (s *TextSegmenterComponent) OnComplete()

OnComplete marks the end of text input.

func (*TextSegmenterComponent) Process

func (s *TextSegmenterComponent) Process(ctx context.Context, data interface{}) (interface{}, bool, error)

Process processes text for segmentation. It returns (remainingText, shouldContinue, error).

func (*TextSegmenterComponent) Reset

func (s *TextSegmenterComponent) Reset()

Reset resets the segmenter state.

func (*TextSegmenterComponent) SetLogger

func (s *TextSegmenterComponent) SetLogger(callback func(string))

SetLogger sets the logging callback.

func (*TextSegmenterComponent) SetPlayID

func (s *TextSegmenterComponent) SetPlayID(playID string)

SetPlayID sets the current play ID for segments.

type TextSegmenterConfig

type TextSegmenterConfig struct {
	// DelayTimeout: delay before sending a segment (default: 50ms)
	DelayTimeout time.Duration
	// MinChars: minimum characters before sending a segment (default: 15)
	MinChars int
	// MaxChars: maximum characters in a segment (default: 35)
	MaxChars int
}

TextSegmenterConfig contains configuration for text segmentation.

func DefaultTextSegmenterConfig

func DefaultTextSegmenterConfig() TextSegmenterConfig

DefaultTextSegmenterConfig returns default text segmenter configuration.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL