tts

package
v0.0.0-...-8acab51 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Apr 26, 2026 License: MIT Imports: 18 Imported by: 0

Documentation

Overview

Package tts provides text-to-speech functionality. This package implements TTS provider interfaces for synthesizing speech from text.

Index

Constants

This section is empty.

Variables

View Source
var (
	// ErrProviderNotFound is returned when a provider is not found.
	ErrProviderNotFound = errors.New("TTS provider not found")
	// ErrProviderDisabled is returned when a provider is disabled.
	ErrProviderDisabled = errors.New("TTS provider disabled")
	// ErrInvalidText is returned when the text is invalid.
	ErrInvalidText = errors.New("invalid text")
	// ErrSynthesisFailed is returned when synthesis fails.
	ErrSynthesisFailed = errors.New("synthesis failed")
	// ErrTextTooLong is returned when the text is too long.
	ErrTextTooLong = errors.New("text too long")
	// ErrVoiceNotFound is returned when the voice is not found.
	ErrVoiceNotFound = errors.New("voice not found")
	// ErrNoProviderConfigured is returned when no provider is configured.
	ErrNoProviderConfigured = errors.New("no TTS provider configured, please configure one in settings")
)

Functions

func DetectLanguage

func DetectLanguage(text string) (lang string, ratio float64)

DetectLanguage detects the primary language of text based on Unicode ranges. Returns the detected language code and confidence ratio (0.0-1.0).

func KokoroAvailable

func KokoroAvailable() bool

KokoroAvailable reports whether Kokoro was compiled in.

func NewKokoroProvider

func NewKokoroProvider(_ string) *kokoroStub

func WindowsNativeAvailable

func WindowsNativeAvailable() bool

WindowsNativeAvailable reports whether Windows native TTS is available.

Types

type AudioFormat

type AudioFormat string

AudioFormat represents the output audio format.

const (
	// FormatMP3 is MP3 audio format.
	FormatMP3 AudioFormat = "mp3"
	// FormatOPUS is OPUS audio format.
	FormatOPUS AudioFormat = "opus"
	// FormatAAC is AAC audio format.
	FormatAAC AudioFormat = "aac"
	// FormatFLAC is FLAC audio format.
	FormatFLAC AudioFormat = "flac"
	// FormatWAV is WAV audio format.
	FormatWAV AudioFormat = "wav"
	// FormatPCM is raw PCM audio format.
	FormatPCM AudioFormat = "pcm"
)

type AudioPreprocessor

type AudioPreprocessor struct {
	// contains filtered or unexported fields
}

AudioPreprocessor handles audio preprocessing before the vocoder.

func NewAudioPreprocessor

func NewAudioPreprocessor(sampleRate int) *AudioPreprocessor

NewAudioPreprocessor creates a new audio preprocessor

func (*AudioPreprocessor) ApplyEQ

func (p *AudioPreprocessor) ApplyEQ(samples []int16) []int16

ApplyEQ applies an EQ filter (boosts 1 kHz by 3 dB).

func (*AudioPreprocessor) ApplyLowpass

func (p *AudioPreprocessor) ApplyLowpass(samples []int16) []int16

ApplyLowpass applies a lowpass filter at 7 kHz.

func (*AudioPreprocessor) Preprocess

func (p *AudioPreprocessor) Preprocess(samples []int16) []int16

Preprocess applies the full preprocessing chain.

type Consent struct {
	ID             string     `json:"id"`
	UserID         string     `json:"user_id"`
	Service        string     `json:"service"` // 'edge-tts', 'espeak'
	ConsentGiven   bool       `json:"consent_given"`
	ConsentDate    *time.Time `json:"consent_date,omitempty"`
	ConsentVersion string     `json:"consent_version"`
	CreatedAt      time.Time  `json:"created_at"`
	UpdatedAt      time.Time  `json:"updated_at"`
}

Consent represents a user's consent for a service

type ConsentManager

type ConsentManager struct {
	// contains filtered or unexported fields
}

ConsentManager handles user privacy consent for online TTS services

func NewConsentManager

func NewConsentManager(db *sql.DB) *ConsentManager

NewConsentManager creates a new consent manager

func NewConsentManagerWithReadDB

func NewConsentManagerWithReadDB(writeDB, readDB *sql.DB) *ConsentManager

NewConsentManagerWithReadDB creates a new consent manager with separate write and read database handles.

func (*ConsentManager) GetConsent

func (cm *ConsentManager) GetConsent(ctx context.Context, userID, service string) (*Consent, error)

GetConsent retrieves user consent for a service

func (*ConsentManager) HasConsent

func (cm *ConsentManager) HasConsent(ctx context.Context, userID, service string) (bool, error)

HasConsent checks if user has given consent

func (*ConsentManager) SetConsent

func (cm *ConsentManager) SetConsent(ctx context.Context, userID, service string, given bool, version string) error

SetConsent saves or updates user consent

type EdgeTTSProvider

type EdgeTTSProvider struct {
	// contains filtered or unexported fields
}

EdgeTTSProvider implements the Provider interface for Microsoft Edge TTS.

func NewEdgeTTSProvider

func NewEdgeTTSProvider() *EdgeTTSProvider

NewEdgeTTSProvider creates a new Edge TTS provider.

func (*EdgeTTSProvider) IsAvailable

func (p *EdgeTTSProvider) IsAvailable() bool

IsAvailable always returns true for Edge TTS.

func (*EdgeTTSProvider) ListVoices

func (p *EdgeTTSProvider) ListVoices(ctx context.Context) ([]Voice, error)

ListVoices returns available voices.

func (*EdgeTTSProvider) MaxTextLength

func (p *EdgeTTSProvider) MaxTextLength() int

MaxTextLength returns the maximum text length.

func (*EdgeTTSProvider) Name

func (p *EdgeTTSProvider) Name() string

Name returns the provider name.

func (*EdgeTTSProvider) SupportedFormats

func (p *EdgeTTSProvider) SupportedFormats() []AudioFormat

SupportedFormats returns the supported audio formats.

func (*EdgeTTSProvider) Synthesize

Synthesize synthesizes text to speech using the edge-tts-go library.

func (*EdgeTTSProvider) SynthesizeStream

func (p *EdgeTTSProvider) SynthesizeStream(ctx context.Context, req *SynthesizeRequest, callback StreamCallback) error

SynthesizeStream synthesizes text with streaming audio output. It sends the complete audio in a single callback because the SSE frontend closes the EventSource after the first audio event.

func (*EdgeTTSProvider) Type

func (p *EdgeTTSProvider) Type() ProviderType

Type returns the provider type.

type EspeakModelManager

type EspeakModelManager struct{}

func NewEspeakModelManager

func NewEspeakModelManager(dataPath string) *EspeakModelManager

func (*EspeakModelManager) EnsureVoiceData

func (m *EspeakModelManager) EnsureVoiceData(ctx context.Context) error

type EspeakNGAdapter

type EspeakNGAdapter struct{}

EspeakNGAdapter is a stub used when built without the espeak tag.

func NewEspeakNGAdapter

func NewEspeakNGAdapter(_ string) *EspeakNGAdapter

NewEspeakNGAdapter returns a stub adapter.

func (*EspeakNGAdapter) Available

func (a *EspeakNGAdapter) Available() bool

func (*EspeakNGAdapter) Close

func (a *EspeakNGAdapter) Close()

func (*EspeakNGAdapter) GetProvider

func (a *EspeakNGAdapter) GetProvider() *EspeakNGProvider

func (*EspeakNGAdapter) ListVoices

func (a *EspeakNGAdapter) ListVoices(_ context.Context) ([]Voice, error)

func (*EspeakNGAdapter) MaxTextLength

func (a *EspeakNGAdapter) MaxTextLength() int

func (*EspeakNGAdapter) Name

func (a *EspeakNGAdapter) Name() string

func (*EspeakNGAdapter) SupportedFormats

func (a *EspeakNGAdapter) SupportedFormats() []AudioFormat

func (*EspeakNGAdapter) Synthesize

func (*EspeakNGAdapter) SynthesizeStream

func (a *EspeakNGAdapter) SynthesizeStream(_ context.Context, _ *SynthesizeRequest, _ StreamCallback) error

func (*EspeakNGAdapter) Type

func (a *EspeakNGAdapter) Type() ProviderType

type EspeakNGProvider

type EspeakNGProvider struct{}

EspeakNGProvider is a stub used when built without the espeak tag.

func NewEspeakNGProvider

func NewEspeakNGProvider(_ string) *EspeakNGProvider

NewEspeakNGProvider returns a stub provider.

func (*EspeakNGProvider) Close

func (p *EspeakNGProvider) Close()

func (*EspeakNGProvider) Initialize

func (p *EspeakNGProvider) Initialize() error

func (*EspeakNGProvider) Name

func (p *EspeakNGProvider) Name() string

func (*EspeakNGProvider) SupportedLanguages

func (p *EspeakNGProvider) SupportedLanguages() []string

func (*EspeakNGProvider) Synthesize

func (*EspeakNGProvider) TextToPhonemes

func (p *EspeakNGProvider) TextToPhonemes(_, _ string) (string, error)

func (*EspeakNGProvider) Type

func (p *EspeakNGProvider) Type() string

type EspeakNGRequest

type EspeakNGRequest struct {
	Text     string  `json:"text"`
	Language string  `json:"language"`
	Rate     float32 `json:"rate"`
	Pitch    float32 `json:"pitch"`
	Volume   float32 `json:"volume"`
}

EspeakNGRequest represents a synthesis request (stub).

type Handler

type Handler struct {
	// contains filtered or unexported fields
}

Handler handles TTS management HTTP requests.

func NewHandler

func NewHandler(service Service) *Handler

NewHandler creates a new TTS handler.

func (*Handler) ListProviders

func (h *Handler) ListProviders(c echo.Context) error

ListProviders returns available TTS providers.

func (*Handler) ListVoices

func (h *Handler) ListVoices(c echo.Context) error

ListVoices returns available TTS voices.

func (*Handler) RegisterRoutes

func (h *Handler) RegisterRoutes(g *echo.Group)

RegisterRoutes registers the TTS management routes.

type KokoroModelManager

type KokoroModelManager struct{}

func NewKokoroModelManager

func NewKokoroModelManager(dataPath string) *KokoroModelManager

func (*KokoroModelManager) CancelDownload

func (m *KokoroModelManager) CancelDownload() error

func (*KokoroModelManager) DownloadModel

func (m *KokoroModelManager) DownloadModel(ctx context.Context) error

func (*KokoroModelManager) GetDownloadProgress

func (m *KokoroModelManager) GetDownloadProgress() (float64, string, error)

func (*KokoroModelManager) GetModelPath

func (m *KokoroModelManager) GetModelPath() string

func (*KokoroModelManager) GetModelStatus

func (m *KokoroModelManager) GetModelStatus() map[string]interface{}

func (*KokoroModelManager) IsModelReady

func (m *KokoroModelManager) IsModelReady() bool

type LanguageStats

type LanguageStats struct {
	CJKHan   int // CJK Unified Ideographs (shared: zh, ja, ko)
	Kana     int // Hiragana + Katakana (unique to Japanese)
	Hangul   int // Korean Hangul (unique to Korean)
	Latin    int // Latin script (en, fr, de, es, pt, etc.)
	Cyrillic int // Cyrillic script (ru, uk, bg, etc.)
	Arabic   int // Arabic script (ar, fa, ur, etc.)
	Thai     int // Thai script
	Devanag  int // Devanagari script (hi, mr, ne, etc.)
	Other    int // Other letters/numbers
	Total    int // Total scored characters
}

LanguageStats holds language detection statistics. CJKHan counts are shared across Chinese/Japanese/Korean since Han ideographs are used by all three languages. Disambiguating scripts (Kana, Hangul) break ties.

func AnalyzeText

func AnalyzeText(text string) *LanguageStats

AnalyzeText analyzes text and returns language statistics.

func (*LanguageStats) GetLanguageRatios

func (s *LanguageStats) GetLanguageRatios() map[string]float64

GetLanguageRatios returns all language ratios.

func (*LanguageStats) IsMixedLanguage

func (s *LanguageStats) IsMixedLanguage() bool

IsMixedLanguage returns true if text contains multiple language scripts.

func (*LanguageStats) PrimaryLanguage

func (s *LanguageStats) PrimaryLanguage() (lang string, ratio float64)

PrimaryLanguage returns the primary language and its confidence ratio. Han ideographs are distributed to CJK languages based on disambiguating scripts:

  • If Kana present → Han counts toward Japanese
  • If Hangul present → Han counts toward Korean
  • If neither → Han counts toward Chinese
  • If both Kana and Hangul → Han split proportionally

type LocalSpeaker

type LocalSpeaker interface {
	SpeakLocally(ctx context.Context, text string, speed float32) error
	StopSpeaking()
}

LocalSpeaker is an optional interface for providers that can play audio locally.

type MacOSNativeTTS

type MacOSNativeTTS struct{}

MacOSNativeTTS is a stub for non-macOS systems.

func NewMacOSNativeTTS

func NewMacOSNativeTTS() *MacOSNativeTTS

NewMacOSNativeTTS creates a stub provider.

func (*MacOSNativeTTS) Available

func (p *MacOSNativeTTS) Available() bool

Available returns false on non-macOS systems.

func (*MacOSNativeTTS) Close

func (p *MacOSNativeTTS) Close()

Close does nothing.

func (*MacOSNativeTTS) Initialize

func (p *MacOSNativeTTS) Initialize() error

Initialize returns an error on non-macOS systems.

func (*MacOSNativeTTS) ListVoices

func (p *MacOSNativeTTS) ListVoices(ctx context.Context) ([]Voice, error)

ListVoices returns an empty list.

func (*MacOSNativeTTS) MaxTextLength

func (p *MacOSNativeTTS) MaxTextLength() int

MaxTextLength returns 0.

func (*MacOSNativeTTS) Name

func (p *MacOSNativeTTS) Name() string

Name returns the provider name.

func (*MacOSNativeTTS) SupportedFormats

func (p *MacOSNativeTTS) SupportedFormats() []AudioFormat

SupportedFormats returns an empty list.

func (*MacOSNativeTTS) Synthesize

Synthesize returns an error.

func (*MacOSNativeTTS) SynthesizeStream

func (p *MacOSNativeTTS) SynthesizeStream(ctx context.Context, req *SynthesizeRequest, callback StreamCallback) error

SynthesizeStream returns an error.

func (*MacOSNativeTTS) Type

func (p *MacOSNativeTTS) Type() ProviderType

Type returns the provider type.

type Phonemizer

type Phonemizer interface {
	// TextToPhonemes converts text to IPA phonemes.
	// lang is a language code (e.g. "en", "cmn", "ja").
	TextToPhonemes(text, lang string) (string, error)
}

Phonemizer converts text to IPA phoneme strings.

type Provider

type Provider interface {
	// Name returns the provider name.
	Name() string
	// Type returns the provider type.
	Type() ProviderType
	// Synthesize synthesizes text to speech.
	Synthesize(ctx context.Context, req *SynthesizeRequest) (*SynthesizeResponse, error)
	// SynthesizeStream synthesizes text with streaming audio output.
	SynthesizeStream(ctx context.Context, req *SynthesizeRequest, callback StreamCallback) error
	// ListVoices returns available voices.
	ListVoices(ctx context.Context) ([]Voice, error)
	// SupportedFormats returns the supported audio formats.
	SupportedFormats() []AudioFormat
	// MaxTextLength returns the maximum text length.
	MaxTextLength() int
}

Provider defines the TTS provider interface.

func NewWindowsNativeTTSProvider

func NewWindowsNativeTTSProvider() Provider

NewWindowsNativeTTSProvider returns nil on non-Windows platforms.

type ProviderConfig

type ProviderConfig struct {
	// Type is the provider type.
	Type ProviderType `json:"type" yaml:"type"`
	// Enabled indicates if this provider is enabled.
	Enabled bool `json:"enabled" yaml:"enabled"`
	// APIKey is the API key for the provider.
	APIKey string `json:"api_key,omitempty" yaml:"api_key,omitempty"`
	// BaseURL is the base URL for the API.
	BaseURL string `json:"base_url,omitempty" yaml:"base_url,omitempty"`
	// DefaultVoice is the default voice ID.
	DefaultVoice string `json:"default_voice,omitempty" yaml:"default_voice,omitempty"`
	// DefaultFormat is the default audio format.
	DefaultFormat AudioFormat `json:"default_format,omitempty" yaml:"default_format,omitempty"`
	// MaxTextLength is the maximum text length.
	MaxTextLength int `json:"max_text_length,omitempty" yaml:"max_text_length,omitempty"`
}

ProviderConfig holds the configuration for a TTS provider.

type ProviderFactory

type ProviderFactory struct{}

ProviderFactory creates TTS providers based on their type.

func (*ProviderFactory) CreateProvider

func (pf *ProviderFactory) CreateProvider(providerType string) (Provider, error)

CreateProvider creates a TTS provider based on the provider type.

func (*ProviderFactory) GetAvailableProviders

func (pf *ProviderFactory) GetAvailableProviders() []string

GetAvailableProviders returns the list of available provider types.

type ProviderType

type ProviderType string

ProviderType represents the type of TTS provider.

const (
	// ProviderOpenAI is OpenAI's TTS API.
	ProviderOpenAI ProviderType = "openai"
	// ProviderElevenLabs is ElevenLabs TTS.
	ProviderElevenLabs ProviderType = "elevenlabs"
	// ProviderPiper is local Piper TTS.
	ProviderPiper ProviderType = "piper"
	// ProviderKokoro is local Kokoro TTS (legacy, Python-based).
	ProviderKokoro ProviderType = "kokoro"
	// ProviderSherpa is local TTS using sherpa-onnx (native Go, no Python).
	ProviderSherpa ProviderType = "sherpa"
	// ProviderEspeakNG is local eSpeak-NG TTS.
	ProviderEspeakNG ProviderType = "espeak-ng"
	// ProviderEdge is Microsoft Edge TTS.
	ProviderEdge ProviderType = "edge-tts"
	// ProviderMacOSNative is macOS native TTS using AVSpeechSynthesizer.
	ProviderMacOSNative ProviderType = "macos-native"
	// ProviderWindowsNative is Windows native TTS using WinRT SpeechSynthesis.
	ProviderWindowsNative ProviderType = "windows-native"
)

type Service

type Service interface {
	// Synthesize synthesizes text using the default provider.
	Synthesize(ctx context.Context, req *SynthesizeRequest) (*SynthesizeResponse, error)
	// SynthesizeWithProvider synthesizes text using a specific provider.
	SynthesizeWithProvider(ctx context.Context, providerType ProviderType, req *SynthesizeRequest) (*SynthesizeResponse, error)
	// SynthesizeStream synthesizes text with streaming audio output.
	SynthesizeStream(ctx context.Context, req *SynthesizeRequest, callback StreamCallback) error
	// ListVoices returns available voices from the default provider.
	ListVoices(ctx context.Context) ([]Voice, error)
	// ListProviders returns all available providers.
	ListProviders() []ProviderType
	// GetDefaultProvider returns the default provider type.
	GetDefaultProvider() ProviderType
	// SetDefaultProvider sets the default provider type.
	SetDefaultProvider(providerType ProviderType) error
	// GetProvider returns the provider instance for a given provider type.
	GetProvider(providerType ProviderType) Provider
	// GetConfig returns the current TTS configuration (speed, pitch, volume).
	GetConfig() (speed, pitch, volume float32)
	// SetConfig sets the TTS configuration (speed, pitch, volume).
	SetConfig(speed, pitch, volume float32)
	// GetVocoderStatus returns vocoder model status.
	GetVocoderStatus() map[string]interface{}
	// DownloadVocoderModel starts downloading the vocoder model.
	DownloadVocoderModel(ctx context.Context) error
	// CancelVocoderDownload cancels the vocoder download.
	CancelVocoderDownload()
	// GetKokoroStatus returns Kokoro model status.
	GetKokoroStatus() map[string]interface{}
	// DownloadKokoroModel starts downloading the Kokoro model.
	DownloadKokoroModel(ctx context.Context) error
	// CancelKokoroDownload cancels the Kokoro download.
	CancelKokoroDownload()
	// Close cleans up all provider resources.
	Close()
}

Service defines the TTS service interface.

func NewService

func NewService(cfg *ServiceConfig) (Service, error)

NewService creates a new TTS service.

type ServiceConfig

type ServiceConfig struct {
	DefaultProvider ProviderType
	Providers       []ProviderConfig
	DataPath        string // Path for model downloads
}

ServiceConfig holds the configuration for the TTS service.

type StreamCallback

type StreamCallback func(chunk []byte) error

StreamCallback is called when streaming audio chunks are available.

type SynthesizeRequest

type SynthesizeRequest struct {
	// Text is the text to synthesize.
	Text string
	// Voice is the voice ID to use.
	Voice string
	// Format is the output audio format.
	Format AudioFormat
	// Speed is the speech speed (0.25-4.0, default 1.0).
	Speed float32
	// Pitch is the speech pitch adjustment.
	Pitch float32
	// Volume is the speech volume (0-200, default 100).
	Volume float32
}

SynthesizeRequest represents a synthesis request.

type SynthesizeResponse

type SynthesizeResponse struct {
	// Audio is the synthesized audio data.
	Audio io.ReadCloser
	// Format is the audio format.
	Format AudioFormat
	// ContentType is the MIME content type.
	ContentType string
	// Duration is the estimated audio duration in seconds.
	Duration float64
}

SynthesizeResponse represents a synthesis response.

type VocoderModelManager

type VocoderModelManager struct{}

VocoderModelManager is a stub used when espeak is not compiled in.

func NewVocoderModelManager

func NewVocoderModelManager(dataPath string) *VocoderModelManager

func (*VocoderModelManager) CancelDownload

func (m *VocoderModelManager) CancelDownload()

func (*VocoderModelManager) DownloadModel

func (m *VocoderModelManager) DownloadModel(ctx context.Context) error

func (*VocoderModelManager) GetModelStatus

func (m *VocoderModelManager) GetModelStatus() map[string]interface{}

func (*VocoderModelManager) IsReady

func (m *VocoderModelManager) IsReady() bool

type Voice

type Voice struct {
	// ID is the voice identifier.
	ID string `json:"id"`
	// Name is the display name.
	Name string `json:"name"`
	// Language is the voice language.
	Language string `json:"language"`
	// Gender is the voice gender.
	Gender string `json:"gender,omitempty"`
	// Description is the voice description.
	Description string `json:"description,omitempty"`
	// PreviewURL is a URL to preview the voice.
	PreviewURL string `json:"preview_url,omitempty"`
	// Provider is the TTS provider name (e.g., "Kokoro", "eSpeak-NG (Robotic)").
	Provider string `json:"provider,omitempty"`
	// Quality indicates voice quality level (e.g., "high", "medium", "low").
	Quality string `json:"quality,omitempty"`
}

Voice represents a TTS voice.

func GetVoicesByLanguage

func GetVoicesByLanguage(voices []Voice, langPrefix string) []Voice

GetVoicesByLanguage returns voices filtered by language prefix.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL