tts

package

Version: v1.4.1 (latest) · Published: Jun 8, 2026 · License: MIT · Imports: 15 · Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/LingByte/lingllm

Links

Open Source Insights

Documentation ¶

Index ¶

Variables
func SanitizeSpeech(text string) string
func Warmup(ctx context.Context, svc TTSService)
type AudioFrame
type AudioSender
- func NewAudioSender(config AudioSenderConfig) (*AudioSender, error)
- func (s *AudioSender) Close() error
- func (s *AudioSender) GetBufferLevel() int
- func (s *AudioSender) GetPendingCount() int
- func (s *AudioSender) ProcessFrame(frame AudioFrame) error
- func (s *AudioSender) Reset()
- func (s *AudioSender) SetLogger(callback func(string))
- func (s *AudioSender) SetOutputCodec(codec string) error
- func (s *AudioSender) Start(ctx context.Context) error
- func (s *AudioSender) Stop() error
type AudioSenderConfig
type Cache
- func NewCache(maxEntries, maxBytes int) *Cache
- func (c *Cache) Bytes() int
- func (c *Cache) Get(key string) ([]byte, bool)
- func (c *Cache) Len() int
- func (c *Cache) Put(key string, pcm []byte)
type CacheConfig
type CachingTTSService
- func NewCachingTTSService(inner TTSService, cfg CacheConfig) (*CachingTTSService, error)
- func (c *CachingTTSService) Cache() *Cache
- func (c *CachingTTSService) CacheKey(text string) string
- func (c *CachingTTSService) Prewarm(ctx context.Context, texts []string, onErr func(text string, err error))
- func (c *CachingTTSService) Synthesize(ctx context.Context, text string, onPCMChunk func([]byte) error) error
type Capabilities
- func CapabilitiesFrom(svc TTSService) Capabilities
- func DefaultCapabilities() Capabilities
type EncodedFrame
type Speaker
- func NewSpeaker(cfg SpeakerConfig) (*Speaker, error)
- func (s *Speaker) DrainQueue()
- func (s *Speaker) Enqueue(text, utteranceID string, e2eAnchor *time.Time) bool
- func (s *Speaker) Interrupt()
- func (s *Speaker) IsActive() bool
- func (s *Speaker) IsPlaying() bool
- func (s *Speaker) QueueDepth() int
- func (s *Speaker) SetCallbacks(onStarted func(utteranceID, text string, chained bool), ...)
- func (s *Speaker) SetCallbacksWithFirstFrame(onStarted func(utteranceID, text string, chained bool), ...)
- func (s *Speaker) Start(ctx context.Context)
- func (s *Speaker) Stop()
type SpeakerConfig
type TTSPipeline
- func NewTTSPipeline(config TTSPipelineConfig) (*TTSPipeline, error)
- func (p *TTSPipeline) ArmFirstFrameHook(fn func())
- func (p *TTSPipeline) Close() error
- func (p *TTSPipeline) GetConfig() TTSPipelineConfig
- func (p *TTSPipeline) Interrupt()
- func (p *TTSPipeline) IsPlaying() bool
- func (p *TTSPipeline) PlayFrames(ctx context.Context, frames [][]byte) error
- func (p *TTSPipeline) SetLogger(callback func(string))
- func (p *TTSPipeline) SetOnCompleteFunc(callback func())
- func (p *TTSPipeline) Speak(text string) error
- func (p *TTSPipeline) Start(ctx context.Context) error
- func (p *TTSPipeline) Stop() error
- func (p *TTSPipeline) Synthesize(ctx context.Context, text string) error
- func (p *TTSPipeline) Warmup(ctx context.Context)
type TTSPipelineComponent
type TTSPipelineConfig
- func DefaultTTSPipelineConfig(ttsService TTSService) TTSPipelineConfig
type TTSService
- func FromSynthesisEngine(engine synthesizer.AudioSynthesisEngine) TTSService
type TTSWorkerConfig
type TTSWorkerPool
- func NewTTSWorkerPool(config TTSWorkerConfig, inputCh <-chan TextSegment, outputCh chan<- AudioFrame) (*TTSWorkerPool, error)
- func (p *TTSWorkerPool) Close() error
- func (p *TTSWorkerPool) GetGlobalSequence() uint32
- func (p *TTSWorkerPool) Start(ctx context.Context) error
- func (p *TTSWorkerPool) Stop() error
- func (p *TTSWorkerPool) UpdateTTSService(newService TTSService) error
type TextSegment
type TextSegmenterComponent
- func NewTextSegmenterComponent(config TextSegmenterConfig, outputFunc func(TextSegment)) *TextSegmenterComponent
- func (s *TextSegmenterComponent) Name() string
- func (s *TextSegmenterComponent) OnComplete()
- func (s *TextSegmenterComponent) Process(ctx context.Context, data interface{}) (interface{}, bool, error)
- func (s *TextSegmenterComponent) Reset()
- func (s *TextSegmenterComponent) SetLogger(callback func(string))
- func (s *TextSegmenterComponent) SetPlayID(playID string)
type TextSegmenterConfig
- func DefaultTextSegmenterConfig() TextSegmenterConfig

Constants ¶

This section is empty.

Variables ¶

View Source

var (
	// ErrTTSServiceRequired is returned when TTSService is not provided.
	ErrTTSServiceRequired = errors.New("TTS service is required")

	// ErrSendCallbackRequired is returned when SendCallback is not provided.
	ErrSendCallbackRequired = errors.New("send callback is required")

	// ErrPipelineNotStarted is returned when pipeline operations are called before Start.
	ErrPipelineNotStarted = errors.New("pipeline not started")

	// ErrInvalidDataType is returned when data type is invalid.
	ErrInvalidDataType = errors.New("invalid data type")

	// ErrEmptyText is returned when text is empty.
	ErrEmptyText = errors.New("empty text")
)

View Source

var DefaultCache = NewCache(128, 32<<20)

DefaultCache is the process-wide TTS PCM cache (128 entries / 32 MiB).

View Source

var ErrInterrupted = errors.New("tts: interrupted")

ErrInterrupted is returned when Speak is cancelled via Interrupt.

Functions ¶

func SanitizeSpeech ¶

func SanitizeSpeech(text string) string

SanitizeSpeech prepares text for cloud TTS synthesis. It returns an empty string when the segment has no speakable content (punctuation-only, SSML, emoji, etc.).

func Warmup ¶ added in v1.3.0

func Warmup(ctx context.Context, svc TTSService)

Warmup issues a tiny synthesis request so the next real segment avoids cold-start latency.

Types ¶

type AudioFrame ¶

type AudioFrame struct {
	Data       []byte
	SampleRate int
	Channels   int
	PlayID     string
	Sequence   uint32
}

AudioFrame represents a frame of audio data.

type AudioSender ¶

type AudioSender struct {
	// contains filtered or unexported fields
}

AudioSender handles audio encoding and sending.

func NewAudioSender ¶

func NewAudioSender(config AudioSenderConfig) (*AudioSender, error)

NewAudioSender creates a new audio sender.

func (*AudioSender) Close ¶

func (s *AudioSender) Close() error

Close closes the audio sender.

func (*AudioSender) GetBufferLevel ¶

func (s *AudioSender) GetBufferLevel() int

GetBufferLevel returns the current buffer level.

func (*AudioSender) GetPendingCount ¶

func (s *AudioSender) GetPendingCount() int

GetPendingCount returns the pending packet count.

func (*AudioSender) ProcessFrame ¶

func (s *AudioSender) ProcessFrame(frame AudioFrame) error

ProcessFrame processes a PCM audio frame (encode + buffer).

func (*AudioSender) Reset ¶

func (s *AudioSender) Reset()

Reset resets the sender state.

func (*AudioSender) SetLogger ¶

func (s *AudioSender) SetLogger(callback func(string))

SetLogger sets the logging callback.

func (*AudioSender) SetOutputCodec ¶

func (s *AudioSender) SetOutputCodec(codec string) error

SetOutputCodec sets the output codec.

func (*AudioSender) Start ¶

func (s *AudioSender) Start(ctx context.Context) error

Start starts the audio sender.

func (*AudioSender) Stop ¶

func (s *AudioSender) Stop() error

Stop stops the audio sender.

type AudioSenderConfig ¶

type AudioSenderConfig struct {
	// OutputCodec: output codec (e.g., "opus", "pcm")
	OutputCodec string
	// TargetSampleRate: target sample rate (default: 16000)
	TargetSampleRate int
	// FrameDuration: frame duration (default: 60ms)
	FrameDuration time.Duration
	// SendCallback: callback for sending encoded audio
	SendCallback func(data []byte) error
	// GetPendingCountFunc: optional callback to get pending packet count
	GetPendingCountFunc func() int
	// Logger: optional logging callback
	Logger func(string)
}

AudioSenderConfig contains configuration for the audio sender.

type Cache ¶

type Cache struct {
	// contains filtered or unexported fields
}

Cache holds rendered PCM keyed by an opaque string. Safe for concurrent access. Use DefaultCache to share entries across calls in one process.

func NewCache ¶

func NewCache(maxEntries, maxBytes int) *Cache

NewCache returns an empty cache with the given caps. maxEntries and maxBytes both apply; whichever is hit first triggers eviction. Pass 0 for either argument to disable that cap.

func (*Cache) Bytes ¶

func (c *Cache) Bytes() int

Bytes returns the current total PCM bytes held.

func (*Cache) Get ¶

func (c *Cache) Get(key string) ([]byte, bool)

Get returns (pcm, true) on a hit. The returned slice is owned by the cache.

func (*Cache) Len ¶

func (c *Cache) Len() int

Len returns the number of cached entries.

func (*Cache) Put ¶

func (c *Cache) Put(key string, pcm []byte)

Put stores a copy of pcm under key, evicting oldest entries when over cap.

type CacheConfig ¶

type CacheConfig struct {
	// Cache to use. nil → DefaultCache.
	Cache *Cache
	// VoiceKey identifies vendor + voice + sample rate + speed. Required.
	VoiceKey string
	// MaxRunes skips cache writes for longer texts. 0 = no limit.
	MaxRunes int
	// ChunkBytes controls replay chunk size. 0 → one shot.
	ChunkBytes int
}

CacheConfig configures a CachingTTSService.

type CachingTTSService ¶

type CachingTTSService struct {
	// contains filtered or unexported fields
}

CachingTTSService wraps TTSService with a PCM cache.

func NewCachingTTSService ¶

func NewCachingTTSService(inner TTSService, cfg CacheConfig) (*CachingTTSService, error)

NewCachingTTSService validates cfg and returns a cache-aware TTSService.

func (*CachingTTSService) Cache ¶

func (c *CachingTTSService) Cache() *Cache

Cache returns the underlying cache.

func (*CachingTTSService) CacheKey ¶

func (c *CachingTTSService) CacheKey(text string) string

CacheKey returns the canonical key for text.

func (*CachingTTSService) Prewarm ¶

func (c *CachingTTSService) Prewarm(ctx context.Context, texts []string, onErr func(text string, err error))

Prewarm renders texts once and stores PCM. Skips already-cached entries.

func (*CachingTTSService) Synthesize ¶

func (c *CachingTTSService) Synthesize(ctx context.Context, text string, onPCMChunk func([]byte) error) error

Synthesize implements TTSService.

type Capabilities ¶ added in v1.3.0

type Capabilities struct {
	StreamingTTFB bool
	FirstMaxChars int
	FirstMinChars int
}

Capabilities describes vendor synthesis behavior for pipeline tuning.

func CapabilitiesFrom ¶ added in v1.3.0

func CapabilitiesFrom(svc TTSService) Capabilities

CapabilitiesFrom inspects a TTSService for optional vendor hints.

func DefaultCapabilities ¶ added in v1.3.0

func DefaultCapabilities() Capabilities

DefaultCapabilities returns conservative batch-oriented defaults.

type EncodedFrame ¶

type EncodedFrame struct {
	Data     []byte
	PlayID   string
	Sequence uint32
}

EncodedFrame represents an encoded audio frame.

type Speaker ¶

type Speaker struct {
	// contains filtered or unexported fields
}

Speaker serializes TTS playback and pipelines synthesis ahead of playback so LLM stream segments can be synthesized while the previous segment is playing.

func NewSpeaker ¶

func NewSpeaker(cfg SpeakerConfig) (*Speaker, error)

NewSpeaker creates a pipelined TTS speaker.

func (*Speaker) DrainQueue ¶

func (s *Speaker) DrainQueue()

DrainQueue drops pending jobs without interrupting the current utterance.

func (*Speaker) Enqueue ¶

func (s *Speaker) Enqueue(text, utteranceID string, e2eAnchor *time.Time) bool

Enqueue schedules text for synthesis. It is non-blocking; the text is dropped when the queue is full.

func (*Speaker) Interrupt ¶

func (s *Speaker) Interrupt()

Interrupt stops the current utterance and drains pending jobs.

func (*Speaker) IsActive ¶

func (s *Speaker) IsActive() bool

IsActive is true while streaming or while utterances remain queued.

func (*Speaker) IsPlaying ¶

func (s *Speaker) IsPlaying() bool

IsPlaying reports whether TTS audio is actively streaming.

func (*Speaker) QueueDepth ¶

func (s *Speaker) QueueDepth() int

QueueDepth returns pending text segment count.

func (*Speaker) SetCallbacks ¶

func (s *Speaker) SetCallbacks(
	onStarted func(utteranceID, text string, chained bool),
	onEnded func(utteranceID string, ok bool, duration time.Duration, ttsFirstMs, e2eFirstMs int, moreQueued bool),
)

SetCallbacks updates lifecycle hooks (safe before Start).

func (*Speaker) SetCallbacksWithFirstFrame ¶ added in v1.3.0

func (s *Speaker) SetCallbacksWithFirstFrame(
	onStarted func(utteranceID, text string, chained bool),
	onEnded func(utteranceID string, ok bool, duration time.Duration, ttsFirstMs, e2eFirstMs int, moreQueued bool),
	onFirstFrame func(utteranceID string, ttsFirstMs, e2eFirstMs int),
)

SetCallbacksWithFirstFrame also wires an optional first-frame hook (once per utterance).

func (*Speaker) Start ¶

func (s *Speaker) Start(ctx context.Context)

Start begins synth and play workers.

func (*Speaker) Stop ¶

func (s *Speaker) Stop()

Stop drains and shuts down the speaker.

type SpeakerConfig ¶

type SpeakerConfig struct {
	Pipeline     *TTSPipeline
	QueueSize    int
	Prefetch     int // buffered segments between synth and play (default 2)
	OnStarted    func(utteranceID, text string, chained bool)
	OnEnded      func(utteranceID string, ok bool, duration time.Duration, ttsFirstMs, e2eFirstMs int, moreQueued bool)
	OnFirstFrame func(utteranceID string, ttsFirstMs, e2eFirstMs int)
}

SpeakerConfig configures a serial TTS speaker.

type TTSPipeline ¶

type TTSPipeline struct {
	// contains filtered or unexported fields
}

TTSPipeline manages text-to-speech synthesis with pluggable components.

func NewTTSPipeline ¶

func NewTTSPipeline(config TTSPipelineConfig) (*TTSPipeline, error)

NewTTSPipeline creates a new TTS pipeline with the given configuration.

func (*TTSPipeline) ArmFirstFrameHook ¶

func (p *TTSPipeline) ArmFirstFrameHook(fn func())

ArmFirstFrameHook installs a one-shot callback fired on the first emitted frame.

func (*TTSPipeline) Close ¶

func (p *TTSPipeline) Close() error

Close closes the TTS pipeline.

func (*TTSPipeline) GetConfig ¶

func (p *TTSPipeline) GetConfig() TTSPipelineConfig

GetConfig returns the current pipeline configuration.

func (*TTSPipeline) Interrupt ¶

func (p *TTSPipeline) Interrupt()

Interrupt cancels the in-flight Speak. The pipeline remains usable.

func (*TTSPipeline) IsPlaying ¶

func (p *TTSPipeline) IsPlaying() bool

IsPlaying reports whether a Speak call is streaming audio.

func (*TTSPipeline) PlayFrames ¶ added in v1.3.0

func (p *TTSPipeline) PlayFrames(ctx context.Context, frames [][]byte) error

PlayFrames emits pre-collected frames with pacing and hooks.

func (*TTSPipeline) SetLogger ¶

func (p *TTSPipeline) SetLogger(callback func(string))

SetLogger sets the logging callback.

func (*TTSPipeline) SetOnCompleteFunc ¶

func (p *TTSPipeline) SetOnCompleteFunc(callback func())

SetOnCompleteFunc sets the completion callback.

func (*TTSPipeline) Speak ¶

func (p *TTSPipeline) Speak(text string) error

Speak synthesizes text with per-utterance cancellation (barge-in). Blocks until synthesis completes, is interrupted, or the pipeline stops.

func (*TTSPipeline) Start ¶

func (p *TTSPipeline) Start(ctx context.Context) error

Start starts the TTS pipeline.

func (*TTSPipeline) Stop ¶

func (p *TTSPipeline) Stop() error

Stop stops the TTS pipeline.

func (*TTSPipeline) Synthesize ¶

func (p *TTSPipeline) Synthesize(ctx context.Context, text string) error

Synthesize synthesizes text to audio using the provided context.

func (*TTSPipeline) Warmup ¶ added in v1.3.0

func (p *TTSPipeline) Warmup(ctx context.Context)

Warmup warms the pipeline's configured TTS service.

type TTSPipelineComponent ¶

type TTSPipelineComponent interface {
	// Name returns the component name.
	Name() string

	// Process processes data through the component.
	// For text processors: input is string, output is string
	// For audio processors: input is []byte, output is []byte
	// Returns (processedData, shouldContinue, error)
	Process(ctx context.Context, data interface{}) (interface{}, bool, error)
}

TTSPipelineComponent defines the interface for TTS pipeline components.

type TTSPipelineConfig ¶

type TTSPipelineConfig struct {
	// TTSService: the TTS service to use for synthesis
	TTSService TTSService
	// OutputCodec: output audio codec (e.g., "opus", "pcm")
	OutputCodec string
	// TargetSampleRate: target sample rate for output audio (default: 16000)
	TargetSampleRate int
	// FrameDuration: frame duration for audio processing (default: 60ms)
	FrameDuration time.Duration
	// TextProcessors: optional text processing components (e.g., text normalization)
	TextProcessors []TTSPipelineComponent
	// AudioProcessors: optional audio processing components (e.g., encoding)
	AudioProcessors []TTSPipelineComponent
	// SendCallback: callback for sending synthesized audio
	SendCallback func(data []byte) error
	// RecordCallback: optional callback for recording synthesized audio
	RecordCallback func(data []byte) error
	// Logger: optional logging callback
	Logger func(string)
	// PaceRealtime sleeps between frames so playback matches wall-clock (required for RTP/VoIP).
	PaceRealtime bool
}

TTSPipelineConfig contains configuration for the TTS pipeline.

func DefaultTTSPipelineConfig ¶

func DefaultTTSPipelineConfig(ttsService TTSService) TTSPipelineConfig

DefaultTTSPipelineConfig returns default TTS pipeline configuration.

type TTSService ¶

type TTSService interface {
	// Synthesize synthesizes text to audio and calls the callback for each audio chunk.
	// The callback receives PCM audio data (typically 16-bit mono at 16kHz).
	Synthesize(ctx context.Context, text string, callback func([]byte) error) error
}

TTSService defines the interface for text-to-speech synthesis.

func FromSynthesisEngine ¶

func FromSynthesisEngine(engine synthesizer.AudioSynthesisEngine) TTSService

FromSynthesisEngine wraps synthesizer.AudioSynthesisEngine as TTSService.

type TTSWorkerConfig ¶

type TTSWorkerConfig struct {
	// TTSService: the TTS service to use
	TTSService TTSService
	// WorkerCount: number of worker goroutines (default: 1)
	WorkerCount int
	// Logger: optional logging callback
	Logger func(string)
}

TTSWorkerConfig contains configuration for TTS workers.

type TTSWorkerPool ¶

type TTSWorkerPool struct {
	// contains filtered or unexported fields
}

TTSWorkerPool manages multiple TTS worker goroutines.

func NewTTSWorkerPool ¶

func NewTTSWorkerPool(
	config TTSWorkerConfig,
	inputCh <-chan TextSegment,
	outputCh chan<- AudioFrame,
) (*TTSWorkerPool, error)

NewTTSWorkerPool creates a new TTS worker pool.

func (*TTSWorkerPool) Close ¶

func (p *TTSWorkerPool) Close() error

Close closes the worker pool.

func (*TTSWorkerPool) GetGlobalSequence ¶

func (p *TTSWorkerPool) GetGlobalSequence() uint32

GetGlobalSequence returns the current global sequence number.

func (*TTSWorkerPool) Start ¶

func (p *TTSWorkerPool) Start(ctx context.Context) error

Start starts the worker pool.

func (*TTSWorkerPool) Stop ¶

func (p *TTSWorkerPool) Stop() error

Stop stops the worker pool.

func (*TTSWorkerPool) UpdateTTSService ¶

func (p *TTSWorkerPool) UpdateTTSService(newService TTSService) error

UpdateTTSService updates the TTS service (for speaker switching).

type TextSegment ¶

type TextSegment struct {
	Text      string
	IsFinal   bool
	Timestamp time.Time
	PlayID    string
}

TextSegment represents a text segment for TTS synthesis.

type TextSegmenterComponent ¶

type TextSegmenterComponent struct {
	// contains filtered or unexported fields
}

TextSegmenterComponent segments streaming LLM text for TTS synthesis.

func NewTextSegmenterComponent ¶

func NewTextSegmenterComponent(config TextSegmenterConfig, outputFunc func(TextSegment)) *TextSegmenterComponent

NewTextSegmenterComponent creates a new text segmenter component.

func (*TextSegmenterComponent) Name ¶

func (s *TextSegmenterComponent) Name() string

Name returns the component name.

func (*TextSegmenterComponent) OnComplete ¶

func (s *TextSegmenterComponent) OnComplete()

OnComplete flushes the tail when the LLM stream ends.

func (*TextSegmenterComponent) Process ¶

func (s *TextSegmenterComponent) Process(ctx context.Context, data interface{}) (interface{}, bool, error)

Process ingests streaming LLM text and emits segments when rules fire.

func (*TextSegmenterComponent) Reset ¶

func (s *TextSegmenterComponent) Reset()

Reset resets the segmenter state.

func (*TextSegmenterComponent) SetLogger ¶

func (s *TextSegmenterComponent) SetLogger(callback func(string))

SetLogger sets the logging callback.

func (*TextSegmenterComponent) SetPlayID ¶

func (s *TextSegmenterComponent) SetPlayID(playID string)

SetPlayID sets the current play ID for segments.

type TextSegmenterConfig ¶

type TextSegmenterConfig struct {
	DelayTimeout time.Duration

	// First segment (latency-optimized)
	FirstMinChars int // comma/pause flush once buffer has this many runes (default 6)
	FirstMaxChars int // first-chunk safety cap with punctuation-aware cut (default 18, 0=off)

	// Later segments (semantic priority)
	RestForceMaxChars int // emergency split only after this many runes without sentence end (default 120, 0=wait for OnComplete)

	// Deprecated: use FirstMinChars / FirstMaxChars. Kept for backward compatibility.
	MinChars int
	MaxChars int
}

TextSegmenterConfig controls LLM→TTS streaming segmentation.

Strategy:

  - First segment: lower latency — break on sentence end, comma (≥FirstMinChars), or FirstMaxChars with punctuation-aware boundary.
  - Later segments: semantic priority — only sentence-ending punctuation or stream end; no comma/char chops unless RestForceMaxChars safety triggers.

func DefaultTextSegmenterConfig ¶

func DefaultTextSegmenterConfig() TextSegmenterConfig

DefaultTextSegmenterConfig returns punctuation-first segmentation defaults.

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL