voice

package
v0.4.2 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: May 25, 2026 | License: MIT | Imports: 15 | Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
// HTTPClient is the HTTP client used for API requests.
//
// Its Timeout is set to 60 seconds. Because it is an exported package-level
// variable, callers can replace or reconfigure it.
// NOTE(review): whether every provider routes its requests through this
// client is not visible here — confirm before relying on the timeout.
var HTTPClient = &http.Client{
	Timeout: 60 * time.Second,
}

HTTPClient is the HTTP client used for API requests

Functions

func ConvertAudioFormat added in v0.3.0

func ConvertAudioFormat(input []byte, outputFormat string) ([]byte, error)

ConvertAudioFormat converts audio to a different format using ffmpeg.

func DetectAudioFormat added in v0.3.0

func DetectAudioFormat(data []byte) string

DetectAudioFormat detects audio format from data

func DownloadAudio added in v0.3.0

func DownloadAudio(ctx context.Context, url string) ([]byte, error)

DownloadAudio downloads audio from URL

func GetAudioDuration added in v0.3.0

func GetAudioDuration(audio []byte, format string) float64

GetAudioDuration estimates audio duration in seconds

func GetAvailableVoices added in v0.3.0

func GetAvailableVoices(provider, language string) []string

GetAvailableVoices returns available voices for a provider and language

func GetSupportedLanguages added in v0.3.0

func GetSupportedLanguages(provider string) []string

GetSupportedLanguages returns supported languages for a provider

func PlayAudio added in v0.3.0

func PlayAudio(audio []byte) error

PlayAudio plays audio using system audio player

func ValidateAudio added in v0.3.0

func ValidateAudio(audio []byte, maxSize int) error

ValidateAudio validates audio data

Types

type AzureProvider

// AzureProvider implements Azure Speech Services.
//
// Docs: https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/
type AzureProvider struct {
	// contains filtered or unexported fields
}

AzureProvider implements Azure Speech Services. Docs: https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/

func NewAzureProvider added in v0.3.0

func NewAzureProvider(config *ProviderConfig) *AzureProvider

NewAzureProvider creates a new Azure Speech provider

func (*AzureProvider) ASR

func (p *AzureProvider) ASR(ctx context.Context, audio []byte) (string, error)

func (*AzureProvider) IsAvailable

func (p *AzureProvider) IsAvailable() bool

func (*AzureProvider) Name

func (p *AzureProvider) Name() string

func (*AzureProvider) StreamASR

func (p *AzureProvider) StreamASR(ctx context.Context, audio <-chan []byte) (<-chan string, error)

func (*AzureProvider) StreamTTS

func (p *AzureProvider) StreamTTS(ctx context.Context, text string) (<-chan []byte, error)

func (*AzureProvider) TTS

func (p *AzureProvider) TTS(ctx context.Context, text string) ([]byte, error)

type ConversationSession

// ConversationSession represents a voice conversation session.
//
// Fields:
//   - ID: string identifier of the session.
//   - StartTime: timestamp associated with the session; presumably the moment
//     it was started — TODO confirm against Manager.StartSession.
//   - Messages: the VoiceMessage values recorded for this session.
//   - Language: language string for the session; presumably the value passed
//     to Manager.StartSession — TODO confirm.
//   - IsActive: flag marking the session as active; where it is set or
//     cleared is not visible here.
type ConversationSession struct {
	ID        string
	StartTime time.Time
	Messages  []VoiceMessage
	Language  string
	IsActive  bool
}

ConversationSession represents a voice conversation session

type ElevenLabsProvider

// ElevenLabsProvider implements ElevenLabs TTS.
//
// Docs: https://elevenlabs.io/docs/api-reference/text-to-speech
type ElevenLabsProvider struct {
	// contains filtered or unexported fields
}

ElevenLabsProvider implements ElevenLabs TTS. Docs: https://elevenlabs.io/docs/api-reference/text-to-speech

func NewElevenLabsProvider added in v0.3.0

func NewElevenLabsProvider(config *ProviderConfig) *ElevenLabsProvider

NewElevenLabsProvider creates a new ElevenLabs TTS provider

func (*ElevenLabsProvider) ASR

func (p *ElevenLabsProvider) ASR(ctx context.Context, audio []byte) (string, error)

func (*ElevenLabsProvider) IsAvailable

func (p *ElevenLabsProvider) IsAvailable() bool

func (*ElevenLabsProvider) Name

func (p *ElevenLabsProvider) Name() string

func (*ElevenLabsProvider) StreamASR

func (p *ElevenLabsProvider) StreamASR(ctx context.Context, audio <-chan []byte) (<-chan string, error)

func (*ElevenLabsProvider) StreamTTS

func (p *ElevenLabsProvider) StreamTTS(ctx context.Context, text string) (<-chan []byte, error)

func (*ElevenLabsProvider) TTS

func (p *ElevenLabsProvider) TTS(ctx context.Context, text string) ([]byte, error)

type GoogleTTSProvider added in v0.3.0

// GoogleTTSProvider implements Google Cloud Text-to-Speech.
//
// Docs: https://cloud.google.com/text-to-speech/docs
type GoogleTTSProvider struct {
	// contains filtered or unexported fields
}

GoogleTTSProvider implements Google Cloud Text-to-Speech. Docs: https://cloud.google.com/text-to-speech/docs

func NewGoogleTTSProvider added in v0.3.0

func NewGoogleTTSProvider(config *ProviderConfig) *GoogleTTSProvider

NewGoogleTTSProvider creates a new Google TTS provider

func (*GoogleTTSProvider) ASR added in v0.3.0

func (p *GoogleTTSProvider) ASR(ctx context.Context, audio []byte) (string, error)

func (*GoogleTTSProvider) IsAvailable added in v0.3.0

func (p *GoogleTTSProvider) IsAvailable() bool

func (*GoogleTTSProvider) Name added in v0.3.0

func (p *GoogleTTSProvider) Name() string

func (*GoogleTTSProvider) StreamASR added in v0.3.0

func (p *GoogleTTSProvider) StreamASR(ctx context.Context, audio <-chan []byte) (<-chan string, error)

func (*GoogleTTSProvider) StreamTTS added in v0.3.0

func (p *GoogleTTSProvider) StreamTTS(ctx context.Context, text string) (<-chan []byte, error)

func (*GoogleTTSProvider) TTS added in v0.3.0

func (p *GoogleTTSProvider) TTS(ctx context.Context, text string) ([]byte, error)

type Manager

// Manager manages voice operations.
//
// It is created with NewManager from a *VoiceConfig; its state is held in
// unexported fields.
type Manager struct {
	// contains filtered or unexported fields
}

Manager manages voice operations

func NewManager

func NewManager(config *VoiceConfig) *Manager

NewManager creates a new voice manager

func (*Manager) ASR

func (m *Manager) ASR(ctx context.Context, audio []byte) (*VoiceResult, error)

ASR converts speech to text

func (*Manager) ASRWithProvider added in v0.3.0

func (m *Manager) ASRWithProvider(ctx context.Context, providerName string, audio []byte) (*VoiceResult, error)

ASRWithProvider converts speech to text using a specific provider

func (*Manager) Base64Decode

func (m *Manager) Base64Decode(data string) ([]byte, error)

Base64Decode decodes a base64 string into audio bytes.

func (*Manager) Base64Encode

func (m *Manager) Base64Encode(audio []byte) string

Base64Encode encodes audio to base64

func (*Manager) ConvertAudioFormat added in v0.3.0

func (m *Manager) ConvertAudioFormat(audio []byte, format string) ([]byte, error)

ConvertAudioFormat converts audio to a different format.

func (*Manager) DownloadAudio added in v0.3.0

func (m *Manager) DownloadAudio(ctx context.Context, url string) ([]byte, error)

DownloadAudio downloads audio from URL

func (*Manager) GenerateVoiceResponse

func (m *Manager) GenerateVoiceResponse(ctx context.Context, session *ConversationSession, text string) ([]byte, error)

GenerateVoiceResponse generates voice response for text

func (*Manager) GetActiveProvider added in v0.3.0

func (m *Manager) GetActiveProvider() VoiceProvider

GetActiveProvider returns the currently active provider

func (*Manager) GetAvailableProviders added in v0.3.0

func (m *Manager) GetAvailableProviders() []string

GetAvailableProviders returns the list of providers that are currently available (i.e. have API keys).

func (*Manager) GetAvailableVoices added in v0.3.0

func (m *Manager) GetAvailableVoices(provider, language string) []string

GetAvailableVoices returns available voices for a provider and language

func (*Manager) GetConfig added in v0.3.0

func (m *Manager) GetConfig() *VoiceConfig

GetConfig returns the current voice configuration

func (*Manager) GetPresets

func (m *Manager) GetPresets() []VoicePreset

GetPresets returns built-in voice presets

func (*Manager) GetProvider added in v0.3.0

func (m *Manager) GetProvider(name string) (VoiceProvider, error)

GetProvider returns a specific provider by name

func (*Manager) GetProviders

func (m *Manager) GetProviders() []string

GetProviders returns list of available providers

func (*Manager) GetSupportedLanguages added in v0.3.0

func (m *Manager) GetSupportedLanguages(provider string) []string

GetSupportedLanguages returns supported languages for a provider

func (*Manager) LoadAudio

func (m *Manager) LoadAudio(path string) ([]byte, error)

LoadAudio loads audio data from file

func (*Manager) PlayAudio added in v0.3.0

func (m *Manager) PlayAudio(audio []byte) error

PlayAudio plays audio using system audio player

func (*Manager) ProcessVoiceInput

func (m *Manager) ProcessVoiceInput(ctx context.Context, session *ConversationSession, audio []byte) (string, error)

ProcessVoiceInput processes voice input and returns text

func (*Manager) SaveAudio

func (m *Manager) SaveAudio(audio []byte, path string) error

SaveAudio saves audio data to file

func (*Manager) SetProvider

func (m *Manager) SetProvider(name string) error

SetProvider sets the active voice provider

func (*Manager) StartSession

func (m *Manager) StartSession(ctx context.Context, language string) (*ConversationSession, error)

StartSession starts a new voice conversation

func (*Manager) StreamASR

func (m *Manager) StreamASR(ctx context.Context, audio <-chan []byte) (<-chan string, error)

StreamASR streams speech to text

func (*Manager) StreamTTS

func (m *Manager) StreamTTS(ctx context.Context, text string) (<-chan *VoiceResult, error)

StreamTTS streams text to speech

func (*Manager) TTS

func (m *Manager) TTS(ctx context.Context, text string) (*VoiceResult, error)

TTS converts text to speech

func (*Manager) TTSWithProvider added in v0.3.0

func (m *Manager) TTSWithProvider(ctx context.Context, providerName, text string) (*VoiceResult, error)

TTSWithProvider converts text to speech using a specific provider

func (*Manager) UpdateConfig added in v0.3.0

func (m *Manager) UpdateConfig(config *VoiceConfig) error

UpdateConfig updates the voice configuration

type OpenAIProvider

// OpenAIProvider implements OpenAI TTS.
//
// Docs: https://platform.openai.com/docs/guides/text-to-speech
type OpenAIProvider struct {
	// contains filtered or unexported fields
}

OpenAIProvider implements OpenAI TTS. Docs: https://platform.openai.com/docs/guides/text-to-speech

func NewOpenAIProvider added in v0.3.0

func NewOpenAIProvider(config *ProviderConfig) *OpenAIProvider

NewOpenAIProvider creates a new OpenAI TTS provider

func (*OpenAIProvider) ASR

func (p *OpenAIProvider) ASR(ctx context.Context, audio []byte) (string, error)

func (*OpenAIProvider) IsAvailable

func (p *OpenAIProvider) IsAvailable() bool

func (*OpenAIProvider) Name

func (p *OpenAIProvider) Name() string

func (*OpenAIProvider) StreamASR

func (p *OpenAIProvider) StreamASR(ctx context.Context, audio <-chan []byte) (<-chan string, error)

func (*OpenAIProvider) StreamTTS

func (p *OpenAIProvider) StreamTTS(ctx context.Context, text string) (<-chan []byte, error)

func (*OpenAIProvider) TTS

func (p *OpenAIProvider) TTS(ctx context.Context, text string) ([]byte, error)

type ProviderConfig added in v0.3.0

// ProviderConfig holds configuration for voice providers. It is the argument
// accepted by the provider constructors (NewAzureProvider,
// NewElevenLabsProvider, NewGoogleTTSProvider, NewOpenAIProvider,
// NewWhisperProvider).
//
// The field names mirror those of VoiceConfig; the meanings below are taken
// from that correspondence and should be confirmed per provider:
//   - APIKey: API key.
//   - APIURL: custom API URL.
//   - Region: region (VoiceConfig describes it as the Azure region).
//   - Model: TTS/ASR model.
//   - Voice: voice ID.
//   - Language: language code.
//   - Speed: speech speed.
//   - Pitch: voice pitch.
//
// NOTE(review): which fields each provider actually reads, and what defaults
// apply to zero values, is not visible here.
type ProviderConfig struct {
	APIKey   string
	APIURL   string
	Region   string
	Model    string
	Voice    string
	Language string
	Speed    float64
	Pitch    float64
}

ProviderConfig holds configuration for voice providers

type ProviderCredentials added in v0.3.0

// ProviderCredentials holds credentials for a specific provider. It is the
// value type of the VoiceConfig.Providers map.
//
// The fields are serialized to JSON under the keys "api_key", "api_url" and
// "region".
type ProviderCredentials struct {
	APIKey string `json:"api_key"`
	APIURL string `json:"api_url"`
	Region string `json:"region"`
}

ProviderCredentials holds credentials for a specific provider

type VoiceConfig

// VoiceConfig holds voice configuration. It is what NewManager and
// Manager.UpdateConfig accept, and what DefaultVoiceConfig and
// Manager.GetConfig return.
//
// Every field carries a JSON tag with a snake_case key and no omitempty.
// NOTE(review): APIKey and Providers hold secrets and are part of the
// marshaled JSON, so be careful about logging or persisting the encoded form.
// NOTE(review): how the top-level APIKey/APIURL/Region interact with the
// per-provider entries in Providers is not visible here — confirm precedence.
type VoiceConfig struct {
	Provider     string                         `json:"provider"`     // openai, elevenlabs, azure, google, whisper
	Model        string                         `json:"model"`        // TTS/ASR model
	Voice        string                         `json:"voice"`        // Voice ID
	Language     string                         `json:"language"`     // Language code (en, zh, etc.)
	Speed        float64                        `json:"speed"`        // Speech speed (0.5-2.0)
	Pitch        float64                        `json:"pitch"`        // Voice pitch (-20 to 20)
	APIKey       string                         `json:"api_key"`      // API key
	APIURL       string                         `json:"api_url"`      // Custom API URL
	Region       string                         `json:"region"`       // Azure region
	Latency      string                         `json:"latency"`      // ultra_low, low, medium
	Instructions string                         `json:"instructions"` // Voice instructions
	Providers    map[string]ProviderCredentials `json:"providers"`    // Provider-specific credentials
}

VoiceConfig holds voice configuration

func DefaultVoiceConfig

func DefaultVoiceConfig() *VoiceConfig

DefaultVoiceConfig returns a default voice configuration

type VoiceMessage

// VoiceMessage represents a voice message in a conversation. It is the
// element type of ConversationSession.Messages.
//
// Timestamp is a time.Time attached to the message; presumably when the
// message was recorded — TODO confirm where it is set.
type VoiceMessage struct {
	Role      string // user, assistant
	Text      string // transcribed text
	Audio     []byte // audio data
	Timestamp time.Time
}

VoiceMessage represents a voice message in a conversation

type VoicePreset

// VoicePreset represents a voice preset. Manager.GetPresets returns a slice
// of these.
//
// Each field is serialized to JSON under the snake_case key in its tag. The
// voice identifier is named VoiceID here (key "voice_id"), whereas
// VoiceConfig calls the corresponding field Voice (key "voice").
// NOTE(review): how a preset is applied to a VoiceConfig is not visible here.
type VoicePreset struct {
	Name         string  `json:"name"`
	Provider     string  `json:"provider"`
	Model        string  `json:"model"`
	VoiceID      string  `json:"voice_id"`
	Speed        float64 `json:"speed"`
	Pitch        float64 `json:"pitch"`
	Instructions string  `json:"instructions"`
	Language     string  `json:"language"`
}

VoicePreset represents a voice preset

type VoiceProvider

// VoiceProvider defines the interface for TTS/ASR providers.
//
// The pointer types *AzureProvider, *ElevenLabsProvider, *GoogleTTSProvider,
// *OpenAIProvider and *WhisperProvider each declare all six methods.
// NOTE(review): some providers are described as TTS-only or ASR-only; what
// their unsupported methods return is not visible here — presumably an
// error, confirm before relying on it.
type VoiceProvider interface {
	// Name returns the provider name
	Name() string

	// IsAvailable checks if the provider is available
	IsAvailable() bool

	// TTS converts text to speech
	TTS(ctx context.Context, text string) ([]byte, error)

	// ASR converts speech to text
	ASR(ctx context.Context, audio []byte) (string, error)

	// StreamTTS streams text to speech
	StreamTTS(ctx context.Context, text string) (<-chan []byte, error)

	// StreamASR streams speech to text
	StreamASR(ctx context.Context, audio <-chan []byte) (<-chan string, error)
}

VoiceProvider defines the interface for TTS/ASR providers

type VoiceResult

// VoiceResult represents a voice operation result. Manager.TTS, Manager.ASR,
// Manager.TTSWithProvider and Manager.ASRWithProvider return a *VoiceResult,
// and Manager.StreamTTS delivers them over a channel.
//
// NOTE(review): which fields are populated for TTS versus ASR results is not
// visible here — confirm before reading Audio or Text unconditionally.
type VoiceResult struct {
	Audio      []byte  // Audio data
	Text       string  // Text content
	Duration   float64 // Duration in seconds
	Format     string  // Audio format (mp3, wav, ogg)
	SampleRate int     // Sample rate
	Provider   string  // Provider used
}

VoiceResult represents a voice operation result

type WhisperProvider

// WhisperProvider implements OpenAI Whisper for ASR only.
//
// This is a specialized provider for local Whisper installations.
type WhisperProvider struct {
	// contains filtered or unexported fields
}

WhisperProvider implements OpenAI Whisper for ASR only. This is a specialized provider for local Whisper installations.

func NewWhisperProvider added in v0.3.0

func NewWhisperProvider(config *ProviderConfig) *WhisperProvider

NewWhisperProvider creates a new Whisper ASR provider

func (*WhisperProvider) ASR

func (p *WhisperProvider) ASR(ctx context.Context, audio []byte) (string, error)

func (*WhisperProvider) IsAvailable

func (p *WhisperProvider) IsAvailable() bool

func (*WhisperProvider) Name

func (p *WhisperProvider) Name() string

func (*WhisperProvider) StreamASR

func (p *WhisperProvider) StreamASR(ctx context.Context, audio <-chan []byte) (<-chan string, error)

func (*WhisperProvider) StreamTTS

func (p *WhisperProvider) StreamTTS(ctx context.Context, text string) (<-chan []byte, error)

func (*WhisperProvider) TTS

func (p *WhisperProvider) TTS(ctx context.Context, text string) ([]byte, error)

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL