tts

package

Version: v0.0.0-...-8acab51 (latest) | Published: Apr 26, 2026 | License: MIT | Imports: 18 | Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/IceWhaleTech/ZimaOS-Blue

Links

Open Source Insights

Documentation ¶

Rendered for

Overview ¶

Package tts provides text-to-speech functionality. This package implements TTS provider interfaces for synthesizing speech from text.

Index ¶

Variables
func DetectLanguage(text string) (lang string, ratio float64)
func KokoroAvailable() bool
func NewKokoroProvider(_ string) *kokoroStub
func WindowsNativeAvailable() bool
type AudioFormat
type AudioPreprocessor
- func NewAudioPreprocessor(sampleRate int) *AudioPreprocessor
- func (p *AudioPreprocessor) ApplyEQ(samples []int16) []int16
- func (p *AudioPreprocessor) ApplyLowpass(samples []int16) []int16
- func (p *AudioPreprocessor) Preprocess(samples []int16) []int16
type Consent
type ConsentManager
- func NewConsentManager(db *sql.DB) *ConsentManager
- func NewConsentManagerWithReadDB(writeDB, readDB *sql.DB) *ConsentManager
- func (cm *ConsentManager) GetConsent(ctx context.Context, userID, service string) (*Consent, error)
- func (cm *ConsentManager) HasConsent(ctx context.Context, userID, service string) (bool, error)
- func (cm *ConsentManager) SetConsent(ctx context.Context, userID, service string, given bool, version string) error
type EdgeTTSProvider
- func NewEdgeTTSProvider() *EdgeTTSProvider
- func (p *EdgeTTSProvider) IsAvailable() bool
- func (p *EdgeTTSProvider) ListVoices(ctx context.Context) ([]Voice, error)
- func (p *EdgeTTSProvider) MaxTextLength() int
- func (p *EdgeTTSProvider) Name() string
- func (p *EdgeTTSProvider) SupportedFormats() []AudioFormat
- func (p *EdgeTTSProvider) Synthesize(ctx context.Context, req *SynthesizeRequest) (*SynthesizeResponse, error)
- func (p *EdgeTTSProvider) SynthesizeStream(ctx context.Context, req *SynthesizeRequest, callback StreamCallback) error
- func (p *EdgeTTSProvider) Type() ProviderType
type EspeakModelManager
- func NewEspeakModelManager(dataPath string) *EspeakModelManager
- func (m *EspeakModelManager) EnsureVoiceData(ctx context.Context) error
type EspeakNGAdapter
- func NewEspeakNGAdapter(_ string) *EspeakNGAdapter
- func (a *EspeakNGAdapter) Available() bool
- func (a *EspeakNGAdapter) Close()
- func (a *EspeakNGAdapter) GetProvider() *EspeakNGProvider
- func (a *EspeakNGAdapter) ListVoices(_ context.Context) ([]Voice, error)
- func (a *EspeakNGAdapter) MaxTextLength() int
- func (a *EspeakNGAdapter) Name() string
- func (a *EspeakNGAdapter) SupportedFormats() []AudioFormat
- func (a *EspeakNGAdapter) Synthesize(_ context.Context, _ *SynthesizeRequest) (*SynthesizeResponse, error)
- func (a *EspeakNGAdapter) SynthesizeStream(_ context.Context, _ *SynthesizeRequest, _ StreamCallback) error
- func (a *EspeakNGAdapter) Type() ProviderType
type EspeakNGProvider
- func NewEspeakNGProvider(_ string) *EspeakNGProvider
- func (p *EspeakNGProvider) Close()
- func (p *EspeakNGProvider) Initialize() error
- func (p *EspeakNGProvider) Name() string
- func (p *EspeakNGProvider) SupportedLanguages() []string
- func (p *EspeakNGProvider) Synthesize(_ context.Context, _ *EspeakNGRequest) (io.ReadCloser, error)
- func (p *EspeakNGProvider) TextToPhonemes(_, _ string) (string, error)
- func (p *EspeakNGProvider) Type() string
type EspeakNGRequest
type Handler
- func NewHandler(service Service) *Handler
- func (h *Handler) ListProviders(c echo.Context) error
- func (h *Handler) ListVoices(c echo.Context) error
- func (h *Handler) RegisterRoutes(g *echo.Group)
type KokoroModelManager
- func NewKokoroModelManager(dataPath string) *KokoroModelManager
- func (m *KokoroModelManager) CancelDownload() error
- func (m *KokoroModelManager) DownloadModel(ctx context.Context) error
- func (m *KokoroModelManager) GetDownloadProgress() (float64, string, error)
- func (m *KokoroModelManager) GetModelPath() string
- func (m *KokoroModelManager) GetModelStatus() map[string]interface{}
- func (m *KokoroModelManager) IsModelReady() bool
type LanguageStats
- func AnalyzeText(text string) *LanguageStats
- func (s *LanguageStats) GetLanguageRatios() map[string]float64
- func (s *LanguageStats) IsMixedLanguage() bool
- func (s *LanguageStats) PrimaryLanguage() (lang string, ratio float64)
type LocalSpeaker
type MacOSNativeTTS
- func NewMacOSNativeTTS() *MacOSNativeTTS
- func (p *MacOSNativeTTS) Available() bool
- func (p *MacOSNativeTTS) Close()
- func (p *MacOSNativeTTS) Initialize() error
- func (p *MacOSNativeTTS) ListVoices(ctx context.Context) ([]Voice, error)
- func (p *MacOSNativeTTS) MaxTextLength() int
- func (p *MacOSNativeTTS) Name() string
- func (p *MacOSNativeTTS) SupportedFormats() []AudioFormat
- func (p *MacOSNativeTTS) Synthesize(ctx context.Context, req *SynthesizeRequest) (*SynthesizeResponse, error)
- func (p *MacOSNativeTTS) SynthesizeStream(ctx context.Context, req *SynthesizeRequest, callback StreamCallback) error
- func (p *MacOSNativeTTS) Type() ProviderType
type Phonemizer
type Provider
- func NewWindowsNativeTTSProvider() Provider
type ProviderConfig
type ProviderFactory
- func (pf *ProviderFactory) CreateProvider(providerType string) (Provider, error)
- func (pf *ProviderFactory) GetAvailableProviders() []string
type ProviderType
type Service
- func NewService(cfg *ServiceConfig) (Service, error)
type ServiceConfig
type StreamCallback
type SynthesizeRequest
type SynthesizeResponse
type VocoderModelManager
- func NewVocoderModelManager(dataPath string) *VocoderModelManager
- func (m *VocoderModelManager) CancelDownload()
- func (m *VocoderModelManager) DownloadModel(ctx context.Context) error
- func (m *VocoderModelManager) GetModelStatus() map[string]interface{}
- func (m *VocoderModelManager) IsReady() bool
type Voice
- func GetVoicesByLanguage(voices []Voice, langPrefix string) []Voice

Constants ¶

This section is empty.

Variables ¶

View Source

var (
	// ErrProviderNotFound is returned when a provider is not found.
	ErrProviderNotFound = errors.New("TTS provider not found")
	// ErrProviderDisabled is returned when a provider is disabled.
	ErrProviderDisabled = errors.New("TTS provider disabled")
	// ErrInvalidText is returned when the text is invalid.
	ErrInvalidText = errors.New("invalid text")
	// ErrSynthesisFailed is returned when synthesis fails.
	ErrSynthesisFailed = errors.New("synthesis failed")
	// ErrTextTooLong is returned when the text is too long.
	ErrTextTooLong = errors.New("text too long")
	// ErrVoiceNotFound is returned when the voice is not found.
	ErrVoiceNotFound = errors.New("voice not found")
	// ErrNoProviderConfigured is returned when no provider is configured.
	ErrNoProviderConfigured = errors.New("no TTS provider configured, please configure one in settings")
)

Functions ¶

func DetectLanguage ¶

func DetectLanguage(text string) (lang string, ratio float64)

DetectLanguage detects the primary language of text based on Unicode ranges. It returns the detected language code and a confidence ratio (0.0-1.0).

func KokoroAvailable ¶

func KokoroAvailable() bool

KokoroAvailable reports whether Kokoro was compiled in.

func NewKokoroProvider ¶

func NewKokoroProvider(_ string) *kokoroStub

func WindowsNativeAvailable ¶

func WindowsNativeAvailable() bool

WindowsNativeAvailable reports whether Windows native TTS is available.

Types ¶

type AudioFormat ¶

type AudioFormat string

AudioFormat represents the output audio format.

const (
	// FormatMP3 is MP3 audio format.
	FormatMP3 AudioFormat = "mp3"
	// FormatOPUS is OPUS audio format.
	FormatOPUS AudioFormat = "opus"
	// FormatAAC is AAC audio format.
	FormatAAC AudioFormat = "aac"
	// FormatFLAC is FLAC audio format.
	FormatFLAC AudioFormat = "flac"
	// FormatWAV is WAV audio format.
	FormatWAV AudioFormat = "wav"
	// FormatPCM is raw PCM audio format.
	FormatPCM AudioFormat = "pcm"
)

type AudioPreprocessor ¶

type AudioPreprocessor struct {
	// contains filtered or unexported fields
}

AudioPreprocessor handles audio preprocessing before the vocoder.

func NewAudioPreprocessor ¶

func NewAudioPreprocessor(sampleRate int) *AudioPreprocessor

NewAudioPreprocessor creates a new audio preprocessor

func (*AudioPreprocessor) ApplyEQ ¶

func (p *AudioPreprocessor) ApplyEQ(samples []int16) []int16

ApplyEQ applies an EQ filter (boosts 1 kHz by 3 dB).

func (*AudioPreprocessor) ApplyLowpass ¶

func (p *AudioPreprocessor) ApplyLowpass(samples []int16) []int16

ApplyLowpass applies a low-pass filter at 7 kHz.

func (*AudioPreprocessor) Preprocess ¶

func (p *AudioPreprocessor) Preprocess(samples []int16) []int16

Preprocess applies the full preprocessing chain.

type Consent ¶

type Consent struct {
	ID             string     `json:"id"`
	UserID         string     `json:"user_id"`
	Service        string     `json:"service"` // 'edge-tts', 'espeak'
	ConsentGiven   bool       `json:"consent_given"`
	ConsentDate    *time.Time `json:"consent_date,omitempty"`
	ConsentVersion string     `json:"consent_version"`
	CreatedAt      time.Time  `json:"created_at"`
	UpdatedAt      time.Time  `json:"updated_at"`
}

Consent represents a user's consent for a service

type ConsentManager ¶

type ConsentManager struct {
	// contains filtered or unexported fields
}

ConsentManager handles user privacy consent for online TTS services.

func NewConsentManager ¶

func NewConsentManager(db *sql.DB) *ConsentManager

NewConsentManager creates a new consent manager

func NewConsentManagerWithReadDB ¶

func NewConsentManagerWithReadDB(writeDB, readDB *sql.DB) *ConsentManager

NewConsentManagerWithReadDB creates a new consent manager with separate write and read database handles.

func (*ConsentManager) GetConsent ¶

func (cm *ConsentManager) GetConsent(ctx context.Context, userID, service string) (*Consent, error)

GetConsent retrieves user consent for a service

func (*ConsentManager) HasConsent ¶

func (cm *ConsentManager) HasConsent(ctx context.Context, userID, service string) (bool, error)

HasConsent checks if user has given consent

func (*ConsentManager) SetConsent ¶

func (cm *ConsentManager) SetConsent(ctx context.Context, userID, service string, given bool, version string) error

SetConsent saves or updates user consent

type EdgeTTSProvider ¶

type EdgeTTSProvider struct {
	// contains filtered or unexported fields
}

EdgeTTSProvider implements the Provider interface for Microsoft Edge TTS.

func NewEdgeTTSProvider ¶

func NewEdgeTTSProvider() *EdgeTTSProvider

NewEdgeTTSProvider creates a new Edge TTS provider.

func (*EdgeTTSProvider) IsAvailable ¶

func (p *EdgeTTSProvider) IsAvailable() bool

IsAvailable always returns true for Edge TTS.

func (*EdgeTTSProvider) ListVoices ¶

func (p *EdgeTTSProvider) ListVoices(ctx context.Context) ([]Voice, error)

ListVoices returns available voices.

func (*EdgeTTSProvider) MaxTextLength ¶

func (p *EdgeTTSProvider) MaxTextLength() int

MaxTextLength returns the maximum text length.

func (*EdgeTTSProvider) Name ¶

func (p *EdgeTTSProvider) Name() string

Name returns the provider name.

func (*EdgeTTSProvider) SupportedFormats ¶

func (p *EdgeTTSProvider) SupportedFormats() []AudioFormat

SupportedFormats returns the supported audio formats.

func (*EdgeTTSProvider) Synthesize ¶

func (p *EdgeTTSProvider) Synthesize(ctx context.Context, req *SynthesizeRequest) (*SynthesizeResponse, error)

Synthesize synthesizes text to speech using edge-tts-go library.

func (*EdgeTTSProvider) SynthesizeStream ¶

func (p *EdgeTTSProvider) SynthesizeStream(ctx context.Context, req *SynthesizeRequest, callback StreamCallback) error

SynthesizeStream synthesizes text with streaming audio output. It sends the complete audio in a single callback because the SSE frontend closes the EventSource after the first audio event.

func (*EdgeTTSProvider) Type ¶

func (p *EdgeTTSProvider) Type() ProviderType

Type returns the provider type.

type EspeakModelManager ¶

type EspeakModelManager struct{}

func NewEspeakModelManager ¶

func NewEspeakModelManager(dataPath string) *EspeakModelManager

func (*EspeakModelManager) EnsureVoiceData ¶

func (m *EspeakModelManager) EnsureVoiceData(ctx context.Context) error

type EspeakNGAdapter ¶

type EspeakNGAdapter struct{}

EspeakNGAdapter is a stub used when the package is built without the espeak build tag.

func NewEspeakNGAdapter ¶

func NewEspeakNGAdapter(_ string) *EspeakNGAdapter

NewEspeakNGAdapter returns a stub adapter.

func (*EspeakNGAdapter) Available ¶

func (a *EspeakNGAdapter) Available() bool

func (*EspeakNGAdapter) Close ¶

func (a *EspeakNGAdapter) Close()

func (*EspeakNGAdapter) GetProvider ¶

func (a *EspeakNGAdapter) GetProvider() *EspeakNGProvider

func (*EspeakNGAdapter) ListVoices ¶

func (a *EspeakNGAdapter) ListVoices(_ context.Context) ([]Voice, error)

func (*EspeakNGAdapter) MaxTextLength ¶

func (a *EspeakNGAdapter) MaxTextLength() int

func (*EspeakNGAdapter) Name ¶

func (a *EspeakNGAdapter) Name() string

func (*EspeakNGAdapter) SupportedFormats ¶

func (a *EspeakNGAdapter) SupportedFormats() []AudioFormat

func (*EspeakNGAdapter) Synthesize ¶

func (a *EspeakNGAdapter) Synthesize(_ context.Context, _ *SynthesizeRequest) (*SynthesizeResponse, error)

func (*EspeakNGAdapter) SynthesizeStream ¶

func (a *EspeakNGAdapter) SynthesizeStream(_ context.Context, _ *SynthesizeRequest, _ StreamCallback) error

func (*EspeakNGAdapter) Type ¶

func (a *EspeakNGAdapter) Type() ProviderType

type EspeakNGProvider ¶

type EspeakNGProvider struct{}

EspeakNGProvider is a stub used when the package is built without the espeak build tag.

func NewEspeakNGProvider ¶

func NewEspeakNGProvider(_ string) *EspeakNGProvider

NewEspeakNGProvider returns a stub provider.

func (*EspeakNGProvider) Close ¶

func (p *EspeakNGProvider) Close()

func (*EspeakNGProvider) Initialize ¶

func (p *EspeakNGProvider) Initialize() error

func (*EspeakNGProvider) Name ¶

func (p *EspeakNGProvider) Name() string

func (*EspeakNGProvider) SupportedLanguages ¶

func (p *EspeakNGProvider) SupportedLanguages() []string

func (*EspeakNGProvider) Synthesize ¶

func (p *EspeakNGProvider) Synthesize(_ context.Context, _ *EspeakNGRequest) (io.ReadCloser, error)

func (*EspeakNGProvider) TextToPhonemes ¶

func (p *EspeakNGProvider) TextToPhonemes(_, _ string) (string, error)

func (*EspeakNGProvider) Type ¶

func (p *EspeakNGProvider) Type() string

type EspeakNGRequest ¶

type EspeakNGRequest struct {
	Text     string  `json:"text"`
	Language string  `json:"language"`
	Rate     float32 `json:"rate"`
	Pitch    float32 `json:"pitch"`
	Volume   float32 `json:"volume"`
}

EspeakNGRequest represents a synthesis request (stub).

type Handler ¶

type Handler struct {
	// contains filtered or unexported fields
}

Handler handles TTS management HTTP requests.

func (*Handler) ListProviders ¶

func (h *Handler) ListProviders(c echo.Context) error

ListProviders returns available TTS providers.

func (*Handler) ListVoices ¶

func (h *Handler) ListVoices(c echo.Context) error

ListVoices returns available TTS voices.

func (*Handler) RegisterRoutes ¶

func (h *Handler) RegisterRoutes(g *echo.Group)

RegisterRoutes registers the TTS management routes.

type KokoroModelManager ¶

type KokoroModelManager struct{}

func NewKokoroModelManager ¶

func NewKokoroModelManager(dataPath string) *KokoroModelManager

func (*KokoroModelManager) CancelDownload ¶

func (m *KokoroModelManager) CancelDownload() error

func (*KokoroModelManager) DownloadModel ¶

func (m *KokoroModelManager) DownloadModel(ctx context.Context) error

func (*KokoroModelManager) GetDownloadProgress ¶

func (m *KokoroModelManager) GetDownloadProgress() (float64, string, error)

func (*KokoroModelManager) GetModelPath ¶

func (m *KokoroModelManager) GetModelPath() string

func (*KokoroModelManager) GetModelStatus ¶

func (m *KokoroModelManager) GetModelStatus() map[string]interface{}

func (*KokoroModelManager) IsModelReady ¶

func (m *KokoroModelManager) IsModelReady() bool

type LanguageStats ¶

type LanguageStats struct {
	CJKHan   int // CJK Unified Ideographs (shared: zh, ja, ko)
	Kana     int // Hiragana + Katakana (unique to Japanese)
	Hangul   int // Korean Hangul (unique to Korean)
	Latin    int // Latin script (en, fr, de, es, pt, etc.)
	Cyrillic int // Cyrillic script (ru, uk, bg, etc.)
	Arabic   int // Arabic script (ar, fa, ur, etc.)
	Thai     int // Thai script
	Devanag  int // Devanagari script (hi, mr, ne, etc.)
	Other    int // Other letters/numbers
	Total    int // Total scored characters
}

LanguageStats holds language detection statistics. CJKHan counts are shared across Chinese/Japanese/Korean since Han ideographs are used by all three languages. Disambiguating scripts (Kana, Hangul) break ties.

func AnalyzeText ¶

func AnalyzeText(text string) *LanguageStats

AnalyzeText analyzes text and returns language statistics.

func (*LanguageStats) GetLanguageRatios ¶

func (s *LanguageStats) GetLanguageRatios() map[string]float64

GetLanguageRatios returns all language ratios.

func (*LanguageStats) IsMixedLanguage ¶

func (s *LanguageStats) IsMixedLanguage() bool

IsMixedLanguage returns true if text contains multiple language scripts.

func (*LanguageStats) PrimaryLanguage ¶

func (s *LanguageStats) PrimaryLanguage() (lang string, ratio float64)

PrimaryLanguage returns the primary language and its confidence ratio. Han ideographs are distributed to CJK languages based on disambiguating scripts:

- If Kana is present → Han counts toward Japanese.
- If Hangul is present → Han counts toward Korean.
- If neither is present → Han counts toward Chinese.
- If both Kana and Hangul are present → Han is split proportionally.

type LocalSpeaker ¶

type LocalSpeaker interface {
	SpeakLocally(ctx context.Context, text string, speed float32) error
	StopSpeaking()
}

LocalSpeaker is an optional interface for providers that can play audio locally.

type MacOSNativeTTS ¶

type MacOSNativeTTS struct{}

MacOSNativeTTS is a stub for non-macOS systems.

func NewMacOSNativeTTS ¶

func NewMacOSNativeTTS() *MacOSNativeTTS

NewMacOSNativeTTS creates a stub provider

func (*MacOSNativeTTS) Available ¶

func (p *MacOSNativeTTS) Available() bool

Available returns false on non-macOS systems

func (*MacOSNativeTTS) Close ¶

func (p *MacOSNativeTTS) Close()

Close does nothing

func (*MacOSNativeTTS) Initialize ¶

func (p *MacOSNativeTTS) Initialize() error

Initialize returns an error on non-macOS systems.

func (*MacOSNativeTTS) ListVoices ¶

func (p *MacOSNativeTTS) ListVoices(ctx context.Context) ([]Voice, error)

ListVoices returns an empty list.

func (*MacOSNativeTTS) MaxTextLength ¶

func (p *MacOSNativeTTS) MaxTextLength() int

MaxTextLength returns 0

func (*MacOSNativeTTS) Name ¶

func (p *MacOSNativeTTS) Name() string

Name returns provider name

func (*MacOSNativeTTS) SupportedFormats ¶

func (p *MacOSNativeTTS) SupportedFormats() []AudioFormat

SupportedFormats returns an empty list.

func (*MacOSNativeTTS) Synthesize ¶

func (p *MacOSNativeTTS) Synthesize(ctx context.Context, req *SynthesizeRequest) (*SynthesizeResponse, error)

Synthesize returns an error.

func (*MacOSNativeTTS) SynthesizeStream ¶

func (p *MacOSNativeTTS) SynthesizeStream(ctx context.Context, req *SynthesizeRequest, callback StreamCallback) error

SynthesizeStream returns an error.

func (*MacOSNativeTTS) Type ¶

func (p *MacOSNativeTTS) Type() ProviderType

Type returns provider type

type Phonemizer ¶

type Phonemizer interface {
	// TextToPhonemes converts text to IPA phonemes.
	// lang is a language code (e.g. "en", "cmn", "ja").
	TextToPhonemes(text, lang string) (string, error)
}

Phonemizer converts text to IPA phoneme strings.

type Provider ¶

type Provider interface {
	// Name returns the provider name.
	Name() string
	// Type returns the provider type.
	Type() ProviderType
	// Synthesize synthesizes text to speech.
	Synthesize(ctx context.Context, req *SynthesizeRequest) (*SynthesizeResponse, error)
	// SynthesizeStream synthesizes text with streaming audio output.
	SynthesizeStream(ctx context.Context, req *SynthesizeRequest, callback StreamCallback) error
	// ListVoices returns available voices.
	ListVoices(ctx context.Context) ([]Voice, error)
	// SupportedFormats returns the supported audio formats.
	SupportedFormats() []AudioFormat
	// MaxTextLength returns the maximum text length.
	MaxTextLength() int
}

Provider defines the TTS provider interface.

func NewWindowsNativeTTSProvider ¶

func NewWindowsNativeTTSProvider() Provider

NewWindowsNativeTTSProvider returns nil on non-Windows platforms.

type ProviderConfig ¶

type ProviderConfig struct {
	// Type is the provider type.
	Type ProviderType `json:"type" yaml:"type"`
	// Enabled indicates if this provider is enabled.
	Enabled bool `json:"enabled" yaml:"enabled"`
	// APIKey is the API key for the provider.
	APIKey string `json:"api_key,omitempty" yaml:"api_key,omitempty"`
	// BaseURL is the base URL for the API.
	BaseURL string `json:"base_url,omitempty" yaml:"base_url,omitempty"`
	// DefaultVoice is the default voice ID.
	DefaultVoice string `json:"default_voice,omitempty" yaml:"default_voice,omitempty"`
	// DefaultFormat is the default audio format.
	DefaultFormat AudioFormat `json:"default_format,omitempty" yaml:"default_format,omitempty"`
	// MaxTextLength is the maximum text length.
	MaxTextLength int `json:"max_text_length,omitempty" yaml:"max_text_length,omitempty"`
}

ProviderConfig holds the configuration for a TTS provider.

type ProviderFactory ¶

type ProviderFactory struct{}

ProviderFactory creates TTS providers based on type.

func (*ProviderFactory) CreateProvider ¶

func (pf *ProviderFactory) CreateProvider(providerType string) (Provider, error)

CreateProvider creates a TTS provider based on the provider type

func (*ProviderFactory) GetAvailableProviders ¶

func (pf *ProviderFactory) GetAvailableProviders() []string

GetAvailableProviders returns list of available provider types

type ProviderType ¶

type ProviderType string

ProviderType represents the type of TTS provider.

const (
	// ProviderOpenAI is OpenAI's TTS API.
	ProviderOpenAI ProviderType = "openai"
	// ProviderElevenLabs is ElevenLabs TTS.
	ProviderElevenLabs ProviderType = "elevenlabs"
	// ProviderPiper is local Piper TTS.
	ProviderPiper ProviderType = "piper"
	// ProviderKokoro is local Kokoro TTS (legacy, Python-based).
	ProviderKokoro ProviderType = "kokoro"
	// ProviderSherpa is local TTS using sherpa-onnx (native Go, no Python).
	ProviderSherpa ProviderType = "sherpa"
	// ProviderEspeakNG is local eSpeak-NG TTS.
	ProviderEspeakNG ProviderType = "espeak-ng"
	// ProviderEdge is Microsoft Edge TTS.
	ProviderEdge ProviderType = "edge-tts"
	// ProviderMacOSNative is macOS native TTS using AVSpeechSynthesizer.
	ProviderMacOSNative ProviderType = "macos-native"
	// ProviderWindowsNative is Windows native TTS using WinRT SpeechSynthesis.
	ProviderWindowsNative ProviderType = "windows-native"
)

type Service ¶

type Service interface {
	// Synthesize synthesizes text using the default provider.
	Synthesize(ctx context.Context, req *SynthesizeRequest) (*SynthesizeResponse, error)
	// SynthesizeWithProvider synthesizes text using a specific provider.
	SynthesizeWithProvider(ctx context.Context, providerType ProviderType, req *SynthesizeRequest) (*SynthesizeResponse, error)
	// SynthesizeStream synthesizes text with streaming audio output.
	SynthesizeStream(ctx context.Context, req *SynthesizeRequest, callback StreamCallback) error
	// ListVoices returns available voices from the default provider.
	ListVoices(ctx context.Context) ([]Voice, error)
	// ListProviders returns all available providers.
	ListProviders() []ProviderType
	// GetDefaultProvider returns the default provider type.
	GetDefaultProvider() ProviderType
	// SetDefaultProvider sets the default provider type.
	SetDefaultProvider(providerType ProviderType) error
	// GetProvider returns the provider instance for a given provider type.
	GetProvider(providerType ProviderType) Provider
	// GetConfig returns the current TTS configuration (speed, pitch, volume).
	GetConfig() (speed, pitch, volume float32)
	// SetConfig sets the TTS configuration (speed, pitch, volume).
	SetConfig(speed, pitch, volume float32)
	// GetVocoderStatus returns vocoder model status.
	GetVocoderStatus() map[string]interface{}
	// DownloadVocoderModel starts downloading the vocoder model.
	DownloadVocoderModel(ctx context.Context) error
	// CancelVocoderDownload cancels the vocoder download.
	CancelVocoderDownload()
	// GetKokoroStatus returns Kokoro model status.
	GetKokoroStatus() map[string]interface{}
	// DownloadKokoroModel starts downloading the Kokoro model.
	DownloadKokoroModel(ctx context.Context) error
	// CancelKokoroDownload cancels the Kokoro download.
	CancelKokoroDownload()
	// Close cleans up all provider resources.
	Close()
}

Service defines the TTS service interface.

func NewService ¶

func NewService(cfg *ServiceConfig) (Service, error)

NewService creates a new TTS service.

type ServiceConfig ¶

type ServiceConfig struct {
	DefaultProvider ProviderType
	Providers       []ProviderConfig
	DataPath        string // Path for model downloads
}

ServiceConfig holds the configuration for the TTS service.

type StreamCallback ¶

type StreamCallback func(chunk []byte) error

StreamCallback is called when streaming audio chunks are available.

type SynthesizeRequest ¶

type SynthesizeRequest struct {
	// Text is the text to synthesize.
	Text string
	// Voice is the voice ID to use.
	Voice string
	// Format is the output audio format.
	Format AudioFormat
	// Speed is the speech speed (0.25-4.0, default 1.0).
	Speed float32
	// Pitch is the speech pitch adjustment.
	Pitch float32
	// Volume is the speech volume (0-200, default 100).
	Volume float32
}

SynthesizeRequest represents a synthesis request.

type SynthesizeResponse ¶

type SynthesizeResponse struct {
	// Audio is the synthesized audio data.
	Audio io.ReadCloser
	// Format is the audio format.
	Format AudioFormat
	// ContentType is the MIME content type.
	ContentType string
	// Duration is the estimated audio duration in seconds.
	Duration float64
}

SynthesizeResponse represents a synthesis response.

type VocoderModelManager ¶

type VocoderModelManager struct{}

VocoderModelManager is a stub used when espeak is not compiled in.

func NewVocoderModelManager ¶

func NewVocoderModelManager(dataPath string) *VocoderModelManager

func (*VocoderModelManager) CancelDownload ¶

func (m *VocoderModelManager) CancelDownload()

func (*VocoderModelManager) DownloadModel ¶

func (m *VocoderModelManager) DownloadModel(ctx context.Context) error

func (*VocoderModelManager) GetModelStatus ¶

func (m *VocoderModelManager) GetModelStatus() map[string]interface{}

func (*VocoderModelManager) IsReady ¶

func (m *VocoderModelManager) IsReady() bool

type Voice ¶

type Voice struct {
	// ID is the voice identifier.
	ID string `json:"id"`
	// Name is the display name.
	Name string `json:"name"`
	// Language is the voice language.
	Language string `json:"language"`
	// Gender is the voice gender.
	Gender string `json:"gender,omitempty"`
	// Description is the voice description.
	Description string `json:"description,omitempty"`
	// PreviewURL is a URL to preview the voice.
	PreviewURL string `json:"preview_url,omitempty"`
	// Provider is the TTS provider name (e.g., "Kokoro", "eSpeak-NG (Robotic)").
	Provider string `json:"provider,omitempty"`
	// Quality indicates voice quality level (e.g., "high", "medium", "low").
	Quality string `json:"quality,omitempty"`
}

Voice represents a TTS voice.

func GetVoicesByLanguage ¶

func GetVoicesByLanguage(voices []Voice, langPrefix string) []Voice

GetVoicesByLanguage returns voices filtered by language prefix.

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL