tts

package

Version: v1.2.0 (Latest) | Published: Jun 6, 2026 | License: MIT | Imports: 13 | Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/LingByte/lingllm

Links

Open Source Insights

Documentation ¶

Index ¶

Variables
func SanitizeSpeech(text string) string
type AudioFrame
type AudioSender
- func NewAudioSender(config AudioSenderConfig) (*AudioSender, error)
- func (s *AudioSender) Close() error
- func (s *AudioSender) GetBufferLevel() int
- func (s *AudioSender) GetPendingCount() int
- func (s *AudioSender) ProcessFrame(frame AudioFrame) error
- func (s *AudioSender) Reset()
- func (s *AudioSender) SetLogger(callback func(string))
- func (s *AudioSender) SetOutputCodec(codec string) error
- func (s *AudioSender) Start(ctx context.Context) error
- func (s *AudioSender) Stop() error
type AudioSenderConfig
type Cache
- func NewCache(maxEntries, maxBytes int) *Cache
- func (c *Cache) Bytes() int
- func (c *Cache) Get(key string) ([]byte, bool)
- func (c *Cache) Len() int
- func (c *Cache) Put(key string, pcm []byte)
type CacheConfig
type CachingTTSService
- func NewCachingTTSService(inner TTSService, cfg CacheConfig) (*CachingTTSService, error)
- func (c *CachingTTSService) Cache() *Cache
- func (c *CachingTTSService) CacheKey(text string) string
- func (c *CachingTTSService) Prewarm(ctx context.Context, texts []string, onErr func(text string, err error))
- func (c *CachingTTSService) Synthesize(ctx context.Context, text string, onPCMChunk func([]byte) error) error
type EncodedFrame
type Speaker
- func NewSpeaker(cfg SpeakerConfig) (*Speaker, error)
- func (s *Speaker) DrainQueue()
- func (s *Speaker) Enqueue(text, utteranceID string, e2eAnchor *time.Time) bool
- func (s *Speaker) Interrupt()
- func (s *Speaker) IsActive() bool
- func (s *Speaker) IsPlaying() bool
- func (s *Speaker) QueueDepth() int
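- func (s *Speaker) SetCallbacks(onStarted func(utteranceID, text string, chained bool), onEnded func(utteranceID string, ok bool, duration time.Duration, ttsFirstMs, e2eFirstMs int, moreQueued bool))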
- func (s *Speaker) Start(ctx context.Context)
- func (s *Speaker) Stop()
type SpeakerConfig
type TTSPipeline
- func NewTTSPipeline(config TTSPipelineConfig) (*TTSPipeline, error)
- func (p *TTSPipeline) ArmFirstFrameHook(fn func())
- func (p *TTSPipeline) Close() error
- func (p *TTSPipeline) GetConfig() TTSPipelineConfig
- func (p *TTSPipeline) Interrupt()
- func (p *TTSPipeline) IsPlaying() bool
- func (p *TTSPipeline) SetLogger(callback func(string))
- func (p *TTSPipeline) SetOnCompleteFunc(callback func())
- func (p *TTSPipeline) Speak(text string) error
- func (p *TTSPipeline) Start(ctx context.Context) error
- func (p *TTSPipeline) Stop() error
- func (p *TTSPipeline) Synthesize(ctx context.Context, text string) error
type TTSPipelineComponent
type TTSPipelineConfig
- func DefaultTTSPipelineConfig(ttsService TTSService) TTSPipelineConfig
type TTSService
- func FromSynthesisEngine(engine synthesizer.AudioSynthesisEngine) TTSService
type TTSWorkerConfig
type TTSWorkerPool
- func NewTTSWorkerPool(config TTSWorkerConfig, inputCh <-chan TextSegment, outputCh chan<- AudioFrame) (*TTSWorkerPool, error)
- func (p *TTSWorkerPool) Close() error
- func (p *TTSWorkerPool) GetGlobalSequence() uint32
- func (p *TTSWorkerPool) Start(ctx context.Context) error
- func (p *TTSWorkerPool) Stop() error
- func (p *TTSWorkerPool) UpdateTTSService(newService TTSService) error
type TextSegment
type TextSegmenterComponent
- func NewTextSegmenterComponent(config TextSegmenterConfig, outputFunc func(TextSegment)) *TextSegmenterComponent
- func (s *TextSegmenterComponent) Name() string
- func (s *TextSegmenterComponent) OnComplete()
- func (s *TextSegmenterComponent) Process(ctx context.Context, data interface{}) (interface{}, bool, error)
- func (s *TextSegmenterComponent) Reset()
- func (s *TextSegmenterComponent) SetLogger(callback func(string))
- func (s *TextSegmenterComponent) SetPlayID(playID string)
type TextSegmenterConfig
- func DefaultTextSegmenterConfig() TextSegmenterConfig

Constants ¶

This section is empty.

Variables ¶

View Source

var (
	// ErrTTSServiceRequired is returned when no TTSService is provided.
	ErrTTSServiceRequired = errors.New("TTS service is required")

	// ErrSendCallbackRequired is returned when no SendCallback is provided.
	ErrSendCallbackRequired = errors.New("send callback is required")

	// ErrPipelineNotStarted is returned when a pipeline operation is called before Start.
	ErrPipelineNotStarted = errors.New("pipeline not started")

	// ErrInvalidDataType is returned when a value has an invalid data type.
	// NOTE(review): presumably raised by TTSPipelineComponent.Process for
	// unexpected input types; confirm against the implementations.
	ErrInvalidDataType = errors.New("invalid data type")

	// ErrEmptyText is returned when the supplied text is empty.
	ErrEmptyText = errors.New("empty text")
)

View Source

// DefaultCache is the process-wide TTS PCM cache, capped at 128 entries
// and 32<<20 bytes (32 MiB).
var DefaultCache = NewCache(128, 32<<20)

DefaultCache is the process-wide TTS PCM cache (128 entries / 32 MiB).

View Source

// ErrInterrupted is returned when Speak is cancelled via Interrupt.
var ErrInterrupted = errors.New("tts: interrupted")

ErrInterrupted is returned when Speak is cancelled via Interrupt.

Functions ¶

func SanitizeSpeech ¶

func SanitizeSpeech(text string) string

SanitizeSpeech prepares text for cloud TTS synthesis.

Types ¶

type AudioFrame ¶

// AudioFrame represents a frame of audio data.
// NOTE(review): the field descriptions below are inferred from field names
// and from AudioSender.ProcessFrame, which documents its input as a PCM
// frame; confirm against the producers.
type AudioFrame struct {
	Data       []byte // audio bytes of the frame (presumably PCM)
	SampleRate int    // sample rate of Data (presumably Hz)
	Channels   int    // channel count of Data
	PlayID     string // playback identifier (presumably matches TextSegment.PlayID)
	Sequence   uint32 // sequence number of the frame
}

AudioFrame represents a frame of audio data.

type AudioSender ¶

type AudioSender struct {
	// contains filtered or unexported fields
}

AudioSender handles audio encoding and sending.

func NewAudioSender ¶

func NewAudioSender(config AudioSenderConfig) (*AudioSender, error)

NewAudioSender creates a new audio sender.

func (*AudioSender) Close ¶

func (s *AudioSender) Close() error

Close closes the audio sender.

func (*AudioSender) GetBufferLevel ¶

func (s *AudioSender) GetBufferLevel() int

GetBufferLevel returns the current buffer level.

func (*AudioSender) GetPendingCount ¶

func (s *AudioSender) GetPendingCount() int

GetPendingCount returns the pending packet count.

func (*AudioSender) ProcessFrame ¶

func (s *AudioSender) ProcessFrame(frame AudioFrame) error

ProcessFrame processes a PCM audio frame (encode + buffer).

func (*AudioSender) Reset ¶

func (s *AudioSender) Reset()

Reset resets the sender state.

func (*AudioSender) SetLogger ¶

func (s *AudioSender) SetLogger(callback func(string))

SetLogger sets the logging callback.

func (*AudioSender) SetOutputCodec ¶

func (s *AudioSender) SetOutputCodec(codec string) error

SetOutputCodec sets the output codec.

func (*AudioSender) Start ¶

func (s *AudioSender) Start(ctx context.Context) error

Start starts the audio sender.

func (*AudioSender) Stop ¶

func (s *AudioSender) Stop() error

Stop stops the audio sender.

type AudioSenderConfig ¶

// AudioSenderConfig contains configuration for the audio sender.
type AudioSenderConfig struct {
	// OutputCodec names the output codec (e.g., "opus", "pcm").
	OutputCodec string
	// TargetSampleRate is the target sample rate (default: 16000).
	TargetSampleRate int
	// FrameDuration is the duration of one frame (default: 60ms).
	FrameDuration time.Duration
	// SendCallback is called to send encoded audio.
	SendCallback func(data []byte) error
	// GetPendingCountFunc is an optional callback that reports the pending packet count.
	GetPendingCountFunc func() int
	// Logger is an optional logging callback.
	Logger func(string)
}

AudioSenderConfig contains configuration for the audio sender.

type Cache ¶

type Cache struct {
	// contains filtered or unexported fields
}

Cache holds rendered PCM keyed by an opaque string. Safe for concurrent access. Use DefaultCache to share entries across calls in one process.

func NewCache ¶

func NewCache(maxEntries, maxBytes int) *Cache

NewCache returns an empty cache with the given caps. maxEntries and maxBytes both apply; whichever is hit first triggers eviction. Pass 0 for either argument to disable the corresponding cap.

func (*Cache) Bytes ¶

func (c *Cache) Bytes() int

Bytes returns the current total PCM bytes held.

func (*Cache) Get ¶

func (c *Cache) Get(key string) ([]byte, bool)

Get returns (pcm, true) on a hit. The returned slice is owned by the cache, so callers must not modify it.

func (*Cache) Len ¶

func (c *Cache) Len() int

Len returns the number of cached entries.

func (*Cache) Put ¶

func (c *Cache) Put(key string, pcm []byte)

Put stores a copy of pcm under key, evicting oldest entries when over cap.

type CacheConfig ¶

// CacheConfig configures a CachingTTSService.
type CacheConfig struct {
	// Cache is the cache to use; nil selects DefaultCache.
	Cache *Cache
	// VoiceKey identifies vendor + voice + sample rate + speed. Required.
	VoiceKey string
	// MaxRunes is the text length above which cache writes are skipped; 0 means no limit.
	MaxRunes int
	// ChunkBytes is the chunk size used when replaying cached PCM; 0 replays in one shot.
	ChunkBytes int
}

CacheConfig configures a CachingTTSService.

type CachingTTSService ¶

type CachingTTSService struct {
	// contains filtered or unexported fields
}

CachingTTSService wraps TTSService with a PCM cache.

func NewCachingTTSService ¶

func NewCachingTTSService(inner TTSService, cfg CacheConfig) (*CachingTTSService, error)

NewCachingTTSService validates cfg and returns a cache-aware TTSService.

func (*CachingTTSService) Cache ¶

func (c *CachingTTSService) Cache() *Cache

Cache returns the underlying cache.

func (*CachingTTSService) CacheKey ¶

func (c *CachingTTSService) CacheKey(text string) string

CacheKey returns the canonical key for text.

func (*CachingTTSService) Prewarm ¶

func (c *CachingTTSService) Prewarm(ctx context.Context, texts []string, onErr func(text string, err error))

Prewarm renders texts once and stores PCM. Skips already-cached entries.

func (*CachingTTSService) Synthesize ¶

func (c *CachingTTSService) Synthesize(ctx context.Context, text string, onPCMChunk func([]byte) error) error

Synthesize implements TTSService.

type EncodedFrame ¶

// EncodedFrame represents an encoded audio frame.
// NOTE(review): the field descriptions below are inferred from field names;
// confirm against the encoder that produces these frames.
type EncodedFrame struct {
	Data     []byte // encoded audio bytes
	PlayID   string // playback identifier of the frame
	Sequence uint32 // sequence number of the frame
}

EncodedFrame represents an encoded audio frame.

type Speaker ¶

type Speaker struct {
	// contains filtered or unexported fields
}

Speaker serializes TTS playback so adjacent utterances never overlap.

func NewSpeaker ¶

func NewSpeaker(cfg SpeakerConfig) (*Speaker, error)

NewSpeaker creates a serial TTS speaker.

func (*Speaker) DrainQueue ¶

func (s *Speaker) DrainQueue()

DrainQueue drops pending jobs without interrupting the current utterance.

func (*Speaker) Enqueue ¶

func (s *Speaker) Enqueue(text, utteranceID string, e2eAnchor *time.Time) bool

Enqueue schedules text for synthesis. Non-blocking; drops when queue is full.

func (*Speaker) Interrupt ¶

func (s *Speaker) Interrupt()

Interrupt stops the current utterance and drains pending jobs.

func (*Speaker) IsActive ¶

func (s *Speaker) IsActive() bool

IsActive reports whether the speaker is streaming audio or still has utterances queued.

func (*Speaker) IsPlaying ¶

func (s *Speaker) IsPlaying() bool

IsPlaying reports whether TTS audio is actively streaming.

func (*Speaker) QueueDepth ¶

func (s *Speaker) QueueDepth() int

QueueDepth returns pending utterance count.

func (*Speaker) SetCallbacks ¶

func (s *Speaker) SetCallbacks(
	onStarted func(utteranceID, text string, chained bool),
	onEnded func(utteranceID string, ok bool, duration time.Duration, ttsFirstMs, e2eFirstMs int, moreQueued bool),
)

SetCallbacks updates the lifecycle hooks. It is safe to call before Start.

func (*Speaker) Start ¶

func (s *Speaker) Start(ctx context.Context)

Start begins draining the speak queue.

func (*Speaker) Stop ¶

func (s *Speaker) Stop()

Stop drains and shuts down the speaker.

type SpeakerConfig ¶

// SpeakerConfig configures a serial TTS speaker.
//
// OnStarted and OnEnded have the same signatures as the two hooks accepted
// by Speaker.SetCallbacks.
//
// NOTE(review): Pipeline is presumably the TTSPipeline the speaker drives,
// and QueueSize presumably the capacity of the pending-utterance queue
// (Enqueue drops when the queue is full); confirm against NewSpeaker.
type SpeakerConfig struct {
	Pipeline  *TTSPipeline
	QueueSize int
	OnStarted func(utteranceID, text string, chained bool)
	OnEnded   func(utteranceID string, ok bool, duration time.Duration, ttsFirstMs, e2eFirstMs int, moreQueued bool)
}

SpeakerConfig configures a serial TTS speaker.

type TTSPipeline ¶

type TTSPipeline struct {
	// contains filtered or unexported fields
}

TTSPipeline manages text-to-speech synthesis with pluggable components.

func NewTTSPipeline ¶

func NewTTSPipeline(config TTSPipelineConfig) (*TTSPipeline, error)

NewTTSPipeline creates a new TTS pipeline with the given configuration.

func (*TTSPipeline) ArmFirstFrameHook ¶

func (p *TTSPipeline) ArmFirstFrameHook(fn func())

ArmFirstFrameHook installs a one-shot callback fired on the first emitted frame.

func (*TTSPipeline) Close ¶

func (p *TTSPipeline) Close() error

Close closes the TTS pipeline.

func (*TTSPipeline) GetConfig ¶

func (p *TTSPipeline) GetConfig() TTSPipelineConfig

GetConfig returns the current pipeline configuration.

func (*TTSPipeline) Interrupt ¶

func (p *TTSPipeline) Interrupt()

Interrupt cancels the in-flight Speak. The pipeline remains usable.

func (*TTSPipeline) IsPlaying ¶

func (p *TTSPipeline) IsPlaying() bool

IsPlaying reports whether a Speak call is streaming audio.

func (*TTSPipeline) SetLogger ¶

func (p *TTSPipeline) SetLogger(callback func(string))

SetLogger sets the logging callback.

func (*TTSPipeline) SetOnCompleteFunc ¶

func (p *TTSPipeline) SetOnCompleteFunc(callback func())

SetOnCompleteFunc sets the completion callback.

func (*TTSPipeline) Speak ¶

func (p *TTSPipeline) Speak(text string) error

Speak synthesizes text with per-utterance cancellation (barge-in). Blocks until synthesis completes, is interrupted, or the pipeline stops.

func (*TTSPipeline) Start ¶

func (p *TTSPipeline) Start(ctx context.Context) error

Start starts the TTS pipeline.

func (*TTSPipeline) Stop ¶

func (p *TTSPipeline) Stop() error

Stop stops the TTS pipeline.

func (*TTSPipeline) Synthesize ¶

func (p *TTSPipeline) Synthesize(ctx context.Context, text string) error

Synthesize synthesizes text to audio using the provided context.

type TTSPipelineComponent ¶

// TTSPipelineComponent defines the interface for TTS pipeline components.
type TTSPipelineComponent interface {
	// Name returns the component name.
	Name() string

	// Process runs data through the component.
	//
	// Text processors take a string and produce a string; audio processors
	// take a []byte and produce a []byte.
	//
	// It returns (processedData, shouldContinue, error).
	// NOTE(review): shouldContinue presumably tells the pipeline whether to
	// keep running subsequent components; confirm against the pipeline.
	Process(ctx context.Context, data interface{}) (interface{}, bool, error)
}

TTSPipelineComponent defines the interface for TTS pipeline components.

type TTSPipelineConfig ¶

// TTSPipelineConfig contains configuration for the TTS pipeline.
type TTSPipelineConfig struct {
	// TTSService is the TTS service used for synthesis.
	TTSService TTSService
	// OutputCodec names the output audio codec (e.g., "opus", "pcm").
	OutputCodec string
	// TargetSampleRate is the target sample rate for output audio (default: 16000).
	TargetSampleRate int
	// FrameDuration is the frame duration for audio processing (default: 60ms).
	FrameDuration time.Duration
	// TextProcessors are optional text processing components (e.g., text normalization).
	TextProcessors []TTSPipelineComponent
	// AudioProcessors are optional audio processing components (e.g., encoding).
	AudioProcessors []TTSPipelineComponent
	// SendCallback is called to send synthesized audio.
	SendCallback func(data []byte) error
	// RecordCallback is an optional callback for recording synthesized audio.
	RecordCallback func(data []byte) error
	// Logger is an optional logging callback.
	Logger func(string)
	// PaceRealtime sleeps between frames so playback matches wall-clock time (required for RTP/VoIP).
	PaceRealtime bool
}

TTSPipelineConfig contains configuration for the TTS pipeline.

func DefaultTTSPipelineConfig ¶

func DefaultTTSPipelineConfig(ttsService TTSService) TTSPipelineConfig

DefaultTTSPipelineConfig returns default TTS pipeline configuration.

type TTSService ¶

// TTSService defines the interface for text-to-speech synthesis.
type TTSService interface {
	// Synthesize converts text to audio, invoking callback once per audio chunk.
	// The callback receives PCM audio data (typically 16-bit mono at 16kHz).
	Synthesize(ctx context.Context, text string, callback func([]byte) error) error
}

TTSService defines the interface for text-to-speech synthesis.

func FromSynthesisEngine ¶

func FromSynthesisEngine(engine synthesizer.AudioSynthesisEngine) TTSService

FromSynthesisEngine wraps synthesizer.AudioSynthesisEngine as TTSService.

type TTSWorkerConfig ¶

// TTSWorkerConfig contains configuration for TTS workers.
type TTSWorkerConfig struct {
	// TTSService is the TTS service the workers use.
	TTSService TTSService
	// WorkerCount is the number of worker goroutines (default: 1).
	WorkerCount int
	// Logger is an optional logging callback.
	Logger func(string)
}

TTSWorkerConfig contains configuration for TTS workers.

type TTSWorkerPool ¶

type TTSWorkerPool struct {
	// contains filtered or unexported fields
}

TTSWorkerPool manages multiple TTS worker goroutines.

func NewTTSWorkerPool ¶

func NewTTSWorkerPool(
	config TTSWorkerConfig,
	inputCh <-chan TextSegment,
	outputCh chan<- AudioFrame,
) (*TTSWorkerPool, error)

NewTTSWorkerPool creates a new TTS worker pool.

func (*TTSWorkerPool) Close ¶

func (p *TTSWorkerPool) Close() error

Close closes the worker pool.

func (*TTSWorkerPool) GetGlobalSequence ¶

func (p *TTSWorkerPool) GetGlobalSequence() uint32

GetGlobalSequence returns the current global sequence number.

func (*TTSWorkerPool) Start ¶

func (p *TTSWorkerPool) Start(ctx context.Context) error

Start starts the worker pool.

func (*TTSWorkerPool) Stop ¶

func (p *TTSWorkerPool) Stop() error

Stop stops the worker pool.

func (*TTSWorkerPool) UpdateTTSService ¶

func (p *TTSWorkerPool) UpdateTTSService(newService TTSService) error

UpdateTTSService updates the TTS service (for speaker switching).

type TextSegment ¶

// TextSegment represents a text segment for TTS synthesis.
// NOTE(review): the IsFinal and Timestamp descriptions are inferred from
// the field names; confirm against TextSegmenterComponent.
type TextSegment struct {
	Text      string    // text of the segment
	IsFinal   bool      // presumably marks the last segment of an input
	Timestamp time.Time // presumably when the segment was produced
	PlayID    string    // play ID of the segment (see TextSegmenterComponent.SetPlayID)
}

TextSegment represents a text segment for TTS synthesis.

type TextSegmenterComponent ¶

type TextSegmenterComponent struct {
	// contains filtered or unexported fields
}

TextSegmenterComponent segments text for streaming TTS synthesis. It intelligently breaks text at sentence boundaries and accumulates text based on character count and punctuation.

func NewTextSegmenterComponent ¶

func NewTextSegmenterComponent(config TextSegmenterConfig, outputFunc func(TextSegment)) *TextSegmenterComponent

NewTextSegmenterComponent creates a new text segmenter component.

func (*TextSegmenterComponent) Name ¶

func (s *TextSegmenterComponent) Name() string

Name returns the component name.

func (*TextSegmenterComponent) OnComplete ¶

func (s *TextSegmenterComponent) OnComplete()

OnComplete marks the end of text input.

func (*TextSegmenterComponent) Process ¶

func (s *TextSegmenterComponent) Process(ctx context.Context, data interface{}) (interface{}, bool, error)

Process processes text for segmentation. It returns (remainingText, shouldContinue, error).

func (*TextSegmenterComponent) Reset ¶

func (s *TextSegmenterComponent) Reset()

Reset resets the segmenter state.

func (*TextSegmenterComponent) SetLogger ¶

func (s *TextSegmenterComponent) SetLogger(callback func(string))

SetLogger sets the logging callback.

func (*TextSegmenterComponent) SetPlayID ¶

func (s *TextSegmenterComponent) SetPlayID(playID string)

SetPlayID sets the current play ID for segments.

type TextSegmenterConfig ¶

// TextSegmenterConfig contains configuration for text segmentation.
type TextSegmenterConfig struct {
	// DelayTimeout is the delay before a segment is sent (default: 50ms).
	DelayTimeout time.Duration
	// MinChars is the minimum number of characters before a segment is sent (default: 15).
	MinChars int
	// MaxChars is the maximum number of characters in a segment (default: 35).
	MaxChars int
}

TextSegmenterConfig contains configuration for text segmentation.

func DefaultTextSegmenterConfig ¶

func DefaultTextSegmenterConfig() TextSegmenterConfig

DefaultTextSegmenterConfig returns default text segmenter configuration.

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL