speech

package
v0.0.0-...-8acab51 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Apr 26, 2026 License: MIT Imports: 17 Imported by: 0

Documentation

Index

Constants

View Source
// ProviderMacOSNative is the stt.ProviderType identifier "macos-native".
// NOTE(review): presumably this is the value reported by MacOSNativeSTT.Type — confirm.
const ProviderMacOSNative stt.ProviderType = "macos-native"

Variables

View Source
// Error definitions.
var (
	// ErrNoASRProvider is a *SpeechError with code "NO_ASR_PROVIDER" and
	// message "no ASR provider available".
	ErrNoASRProvider = &SpeechError{Code: "NO_ASR_PROVIDER", Message: "no ASR provider available"}
)

Error definitions

Functions

func CurrentSTTAuthorizationState

func CurrentSTTAuthorizationState() (status int, authErr error, initialized bool)

CurrentSTTAuthorizationState reports that macOS STT is unavailable on non-darwin builds.

func GetTCCAppName

func GetTCCAppName() string

GetTCCAppName returns empty string on non-macOS.

func NewWindowsNativeASR

func NewWindowsNativeASR() stt.Provider

func RequestSTTAuthorization

func RequestSTTAuthorization() (int, error)

RequestSTTAuthorization is a no-op on non-macOS.

func RunMainRunLoop

func RunMainRunLoop()

RunMainRunLoop is a no-op on non-macOS.

func StopMainRunLoop

func StopMainRunLoop()

StopMainRunLoop is a no-op on non-macOS.

func SubmitToMainThread

func SubmitToMainThread(fn func())

SubmitToMainThread runs fn inline on non-macOS; there is no main-thread dispatch.

Types

type ASRConfig

// ASRConfig holds ASR-specific configuration.
//
// Every field carries exactly one json key and one yaml key, both using the
// same snake_case name. A struct tag must not repeat a key: reflect.StructTag
// only honours the first occurrence, and linters report the duplicate.
type ASRConfig struct {
	Enabled        bool   `json:"enabled" yaml:"enabled"`
	Provider       string `json:"provider" yaml:"provider"`
	Model          string `json:"model" yaml:"model"`
	ModelDir       string `json:"model_dir" yaml:"model_dir"`
	DefaultLang    string `json:"default_language" yaml:"default_language"`
	EditBeforeSend bool   `json:"edit_before_send" yaml:"edit_before_send"`
	// MaxDuration has no custom JSON encoding, so encoding/json writes it as
	// an integer count of nanoseconds.
	MaxDuration time.Duration `json:"max_duration" yaml:"max_duration"`
}

ASRConfig holds ASR-specific configuration.

type ASRStatus

// ASRStatus represents ASR status.
//
// Fields tagged omitempty are left out of the JSON output when they hold
// their zero value (false, "", nil or an empty slice).
type ASRStatus struct {
	Ready              bool             `json:"ready"`
	Provider           string           `json:"provider"`
	ModelName          string           `json:"model_name,omitempty"`
	StreamingSupported bool             `json:"streaming_supported"`
	EditBeforeSend     bool             `json:"edit_before_send"`
	Downloading        bool             `json:"downloading"`
	// Progress is a pointer, so a nil value is omitted from the JSON output.
	Progress           *Progress        `json:"progress,omitempty"`
	Downloads          []DownloadStatus `json:"downloads,omitempty"`
	HasPending         bool             `json:"has_pending"`
	PermissionDenied   bool             `json:"permission_denied,omitempty"`
	PermissionError    string           `json:"permission_error,omitempty"`
	PermissionAppName  string           `json:"permission_app_name,omitempty"`
	OnDeviceSupported  bool             `json:"on_device_supported,omitempty"`
	OnDeviceOnly       bool             `json:"on_device_only,omitempty"`
	DictationAvailable bool             `json:"dictation_available,omitempty"`
	OfflineLanguages   []string         `json:"offline_languages,omitempty"`
	// Models has no omitempty, so a nil slice is encoded as JSON null.
	Models             []interface{}    `json:"models"`
	AvailableProviders []string         `json:"available_providers,omitempty"`
}

ASRStatus represents ASR status.

type ComponentDownloadStatus

// ComponentDownloadStatus represents the download/readiness status of a TTS
// component (e.g. Kokoro model, vocoder).
type ComponentDownloadStatus struct {
	Ready          bool    `json:"ready"`
	Downloading    bool    `json:"downloading"`
	// NOTE(review): the scale of Progress (0-1 vs 0-100) is not visible here — confirm.
	Progress       float64 `json:"progress"`
	Error          string  `json:"error,omitempty"`
	InitStage      string  `json:"init_stage,omitempty"`
	Speed          string  `json:"speed,omitempty"`
	ETA            string  `json:"eta,omitempty"`
	File           string  `json:"file,omitempty"`
	FileIndex      int     `json:"file_index,omitempty"`
	TotalFiles     int     `json:"total_files,omitempty"`
	// DownloadedSize is serialized under the JSON key "downloaded_human",
	// which differs from the Go field name.
	DownloadedSize string  `json:"downloaded_human,omitempty"`
}

ComponentDownloadStatus represents the download/readiness status of a TTS component (e.g. Kokoro model, vocoder).

type Config

// Config holds unified speech configuration.
//
// Each field carries exactly one json key and one yaml key. A struct tag must
// not repeat a key: reflect.StructTag only honours the first occurrence, and
// linters report the duplicate.
type Config struct {
	TTS TTSConfig `json:"tts" yaml:"tts"`
	ASR ASRConfig `json:"asr" yaml:"asr"`
}

Config holds unified speech configuration.

type DownloadRequest

// DownloadRequest represents a model download request.
type DownloadRequest struct {
	// ModelType is read from the JSON key "model_type".
	ModelType string `json:"model_type"`
}

DownloadRequest represents a model download request.

type DownloadStatus

// DownloadStatus represents a single model download status.
type DownloadStatus struct {
	ModelType string   `json:"model_type"`
	// Progress is embedded by value, so it is always present in the JSON output.
	Progress  Progress `json:"progress"`
}

DownloadStatus represents a single model download status.

type EspeakLanguagePack

// EspeakLanguagePack represents an eSpeak-NG language dictionary.
type EspeakLanguagePack struct {
	Code       string `json:"code"`
	Name       string `json:"name"`
	Downloaded bool   `json:"downloaded"`
	Size       int64  `json:"size"` // actual file size in bytes
}

EspeakLanguagePack represents an eSpeak-NG language dictionary.

type EspeakManager

type EspeakManager struct {
	// contains filtered or unexported fields
}

EspeakManager checks the real espeak-ng-data directory for available language dictionaries. The eSpeak-NG engine is statically linked via CGO, so there is no library to download — only the data directory needs to be present at runtime.

func NewEspeakManager

func NewEspeakManager(dataDir string) *EspeakManager

NewEspeakManager creates a new EspeakManager that detects the real espeak-ng-data directory using the same search logic as the CGO provider.

func (*EspeakManager) GetDataSize

func (em *EspeakManager) GetDataSize() int64

GetDataSize returns the total size of the espeak-ng-data directory in bytes.

func (*EspeakManager) GetLibraryPath

func (em *EspeakManager) GetLibraryPath() string

GetLibraryPath returns the detected espeak-ng-data path.

func (*EspeakManager) GetPlatformInfo

func (em *EspeakManager) GetPlatformInfo() (string, string)

GetPlatformInfo returns OS/arch info (useful for status display).

func (*EspeakManager) IsLibraryInstalled

func (em *EspeakManager) IsLibraryInstalled() bool

IsLibraryInstalled returns true if the espeak-ng-data directory was found. The engine itself is statically linked; this checks for runtime data.

func (*EspeakManager) LanguageCount

func (em *EspeakManager) LanguageCount() int

LanguageCount returns the number of *_dict files found.

func (*EspeakManager) ListLanguagePacks

func (em *EspeakManager) ListLanguagePacks() []*EspeakLanguagePack

ListLanguagePacks scans the espeak-ng-data directory for *_dict files and returns the list with real file sizes.

func (*EspeakManager) Refresh

func (em *EspeakManager) Refresh()

Refresh re-scans for the espeak-ng-data directory.

type EspeakStatus

// EspeakStatus represents the eSpeak-NG runtime data status.
type EspeakStatus struct {
	Installed     bool   `json:"installed"`
	// Path is omitted from the JSON output when empty.
	Path          string `json:"path,omitempty"`
	LanguageCount int    `json:"language_count"`
	// NOTE(review): presumably bytes, matching EspeakManager.GetDataSize — confirm.
	DataSize      int64  `json:"data_size"`
	StaticLinked  bool   `json:"static_linked"`
}

EspeakStatus represents the eSpeak-NG runtime data status.

type Handler

type Handler struct {
	// contains filtered or unexported fields
}

Handler handles unified speech HTTP requests.

func NewHandler

func NewHandler(svc Service, kv kvstore.Store, dataPath string) *Handler

NewHandler creates a new speech handler.

func (*Handler) CancelASRDownload

func (h *Handler) CancelASRDownload(c echo.Context) error

CancelASRDownload cancels the current ASR model download.

func (*Handler) CancelKokoroDownload

func (h *Handler) CancelKokoroDownload(c echo.Context) error

CancelKokoroDownload cancels the Kokoro download.

func (*Handler) CancelVocoderDownload

func (h *Handler) CancelVocoderDownload(c echo.Context) error

CancelVocoderDownload cancels the vocoder download.

func (*Handler) DownloadASRModel

func (h *Handler) DownloadASRModel(c echo.Context) error

DownloadASRModel starts downloading an ASR model.

func (*Handler) DownloadKokoro

func (h *Handler) DownloadKokoro(c echo.Context) error

DownloadKokoro starts downloading the Kokoro model.

func (*Handler) DownloadVocoder

func (h *Handler) DownloadVocoder(c echo.Context) error

DownloadVocoder starts downloading the vocoder model.

func (*Handler) GetEspeakManager

func (h *Handler) GetEspeakManager() *EspeakManager

GetEspeakManager returns the EspeakManager instance.

func (*Handler) GetOfflineLanguages

func (h *Handler) GetOfflineLanguages(c echo.Context) error

GetOfflineLanguages returns installed offline dictation languages (macOS/Windows native). Separated from /status because it reads system settings which is non-critical info.

func (*Handler) GetPersistedTTSProvider

func (h *Handler) GetPersistedTTSProvider() string

GetPersistedTTSProvider returns the persisted TTS provider from kvstore, or empty string if not set.

func (*Handler) GetStatus

func (h *Handler) GetStatus(c echo.Context) error

GetStatus returns the unified speech status.

func (*Handler) GetTTSConfig

func (h *Handler) GetTTSConfig(c echo.Context) error

GetTTSConfig returns the current TTS configuration.

func (*Handler) Init

func (h *Handler) Init(c echo.Context) error

Init initializes TTS/STT services lazily.

func (*Handler) ListEspeakLanguages

func (h *Handler) ListEspeakLanguages(c echo.Context) error

ListEspeakLanguages returns available eSpeak-NG language dictionaries by scanning the real espeak-ng-data directory.

func (*Handler) RegisterRoutes

func (h *Handler) RegisterRoutes(g *echo.Group)

RegisterRoutes registers unified speech management routes.

func (*Handler) RestoreEditBeforeSend

func (h *Handler) RestoreEditBeforeSend()

RestoreEditBeforeSend restores the persisted edit-before-send setting from kvstore.

func (*Handler) Service

func (h *Handler) Service() Service

Service returns the speech service.

func (*Handler) SetASROnDevice

func (h *Handler) SetASROnDevice(c echo.Context) error

SetASROnDevice toggles on-device-only mode for macOS native STT.

func (*Handler) SetEditBeforeSend

func (h *Handler) SetEditBeforeSend(c echo.Context) error

SetEditBeforeSend toggles the edit-before-send setting for ASR.

func (*Handler) SetTTSConfig

func (h *Handler) SetTTSConfig(c echo.Context) error

SetTTSConfig updates the TTS configuration.

func (*Handler) SwitchASRModel

func (h *Handler) SwitchASRModel(c echo.Context) error

SwitchASRModel switches to a different ASR model or provider.

func (*Handler) SwitchTTSProvider

func (h *Handler) SwitchTTSProvider(c echo.Context) error

SwitchTTSProvider switches the TTS provider.

func (*Handler) Transcribe

func (h *Handler) Transcribe(c echo.Context) error

Transcribe transcribes audio and returns editable result.

type InitConfig

// InitConfig holds configuration for lazy initialization of TTS/STT services.
// The fields carry no struct tags.
type InitConfig struct {
	DataDir      string
	// NOTE(review): presumably a credential — avoid logging it; confirm usage.
	OpenAIAPIKey string
}

InitConfig holds configuration for lazy initialization of TTS/STT services.

type MacOSNativeSTT

// MacOSNativeSTT is a stub for non-macOS systems. It has no fields.
type MacOSNativeSTT struct{}

MacOSNativeSTT is a stub for non-macOS systems.

func NewMacOSNativeSTT

func NewMacOSNativeSTT() *MacOSNativeSTT

NewMacOSNativeSTT creates a stub provider

func (*MacOSNativeSTT) Available

func (p *MacOSNativeSTT) Available() bool

Available returns false on non-macOS

func (*MacOSNativeSTT) Close

func (p *MacOSNativeSTT) Close()

Close does nothing

func (*MacOSNativeSTT) DictationAvailable

func (p *MacOSNativeSTT) DictationAvailable() bool

func (*MacOSNativeSTT) Initialize

func (p *MacOSNativeSTT) Initialize() error

Initialize returns error on non-macOS

func (*MacOSNativeSTT) MaxDuration

func (p *MacOSNativeSTT) MaxDuration() time.Duration

MaxDuration returns zero

func (*MacOSNativeSTT) Name

func (p *MacOSNativeSTT) Name() string

Name returns provider name

func (*MacOSNativeSTT) OfflineDictationLanguages

func (p *MacOSNativeSTT) OfflineDictationLanguages() []string

func (*MacOSNativeSTT) RequireOnDevice

func (p *MacOSNativeSTT) RequireOnDevice() bool

func (*MacOSNativeSTT) SetRequireOnDevice

func (p *MacOSNativeSTT) SetRequireOnDevice(_ bool)

func (*MacOSNativeSTT) SupportedFormats

func (p *MacOSNativeSTT) SupportedFormats() []stt.AudioFormat

SupportedFormats returns empty list

func (*MacOSNativeSTT) SupportsOnDevice

func (p *MacOSNativeSTT) SupportsOnDevice() bool

func (*MacOSNativeSTT) Transcribe

Transcribe returns error

func (*MacOSNativeSTT) TranscribeStream

TranscribeStream returns error

func (*MacOSNativeSTT) Type

func (p *MacOSNativeSTT) Type() stt.ProviderType

Type returns the provider type

type ModelInfo

// ModelInfo represents information about a speech model.
type ModelInfo struct {
	ID          string   `json:"id"`
	Name        string   `json:"name"`
	Description string   `json:"description"`
	Type        string   `json:"type"` // "tts" or "asr"
	// Languages and Streaming are omitted from the JSON output when empty/false.
	Languages   []string `json:"languages,omitempty"`
	// Size is a string, not a byte count.
	Size        string   `json:"size"`
	Streaming   bool     `json:"streaming,omitempty"`
	Downloaded  bool     `json:"downloaded"`
}

ModelInfo represents information about a speech model.

type ModelsResponse

// ModelsResponse represents the list of available models.
// Neither slice uses omitempty, so a nil slice is encoded as JSON null.
type ModelsResponse struct {
	TTS []ModelInfo `json:"tts"`
	ASR []ModelInfo `json:"asr"`
}

ModelsResponse represents the list of available models.

type OnDeviceUnavailableError

// OnDeviceUnavailableError indicates on-device recognition failed for the locale.
type OnDeviceUnavailableError struct {
	// Locale is the locale the failure refers to.
	Locale string
	// NOTE(review): presumably extra free-form failure detail — confirm.
	Detail string
}

OnDeviceUnavailableError indicates on-device recognition failed for the locale.

func (*OnDeviceUnavailableError) Error

func (e *OnDeviceUnavailableError) Error() string

type Progress

// Progress represents download progress.
// No field uses omitempty, so every key is always present in the JSON output.
type Progress struct {
	File       string  `json:"file"`
	// NOTE(review): Downloaded and Total are presumably byte counts — confirm.
	Downloaded int64   `json:"downloaded"`
	Total      int64   `json:"total"`
	Percentage float64 `json:"percentage"`
	SpeedHuman string  `json:"speed_human"`
	ETA        string  `json:"eta"`
}

Progress represents download progress.

type Service

// Service provides unified speech functionality (TTS + ASR).
type Service interface {
	// GetStatus returns the unified speech status.
	GetStatus() *StatusResponse
	// GetModels returns all available models.
	GetModels() *ModelsResponse
	// GetASRProvider returns the ASR provider if available.
	GetASRProvider() stt.Provider
	// GetTTSProvider returns the TTS provider if available.
	GetTTSProvider() tts.Provider
	// GetSTTService returns the underlying STT service.
	GetSTTService() stt.Service
	// GetTTSService returns the underlying TTS service.
	GetTTSService() tts.Service
	// IsEditBeforeSendEnabled returns whether edit-before-send is enabled.
	IsEditBeforeSendEnabled() bool
	// SetEditBeforeSend enables or disables edit-before-send.
	SetEditBeforeSend(enabled bool)
	// Initialize initializes TTS/STT services lazily.
	Initialize() error
	// IsInitialized returns whether services have been initialized.
	IsInitialized() bool
	// SetTTSProvider sets the TTS provider.
	SetTTSProvider(provider tts.Provider)
	// SetASRProvider sets the ASR provider.
	SetASRProvider(provider stt.Provider)
	// SetASRPermissionDenied records that ASR permission was denied (e.g. macOS TCC).
	SetASRPermissionDenied(errMsg string)
	// SetEspeakManager sets the eSpeak manager for status reporting.
	SetEspeakManager(em *EspeakManager)
	// SetStatusPrewarmEnabled controls whether GetStatus may trigger lazy STT initialization.
	SetStatusPrewarmEnabled(enabled bool)
}

Service provides unified speech functionality (TTS + ASR).

func NewService

func NewService(cfg *Config, sttSvc stt.Service, ttsSvc tts.Service) Service

NewService creates a new unified speech service.

func NewServiceWithInitConfig

func NewServiceWithInitConfig(cfg *Config, initCfg *InitConfig) Service

NewServiceWithInitConfig creates a service with lazy initialization config.

func NewServiceWithProviders

func NewServiceWithProviders(cfg *Config, sttSvc stt.Service, ttsSvc tts.Service, asrProvider stt.Provider, ttsProvider tts.Provider) Service

NewServiceWithProviders creates a service with explicit providers.

type SpeechError

// SpeechError represents a speech-related error.
type SpeechError struct {
	// Code is an identifier for the error, e.g. "NO_ASR_PROVIDER".
	Code    string
	// Message is the descriptive text, e.g. "no ASR provider available".
	Message string
}

SpeechError represents a speech-related error.

func (*SpeechError) Error

func (e *SpeechError) Error() string

type StatusResponse

// StatusResponse represents the unified speech status.
type StatusResponse struct {
	TTS    TTSStatus     `json:"tts"`
	ASR    ASRStatus     `json:"asr"`
	// Espeak is a pointer with omitempty, so it is left out of the JSON output when nil.
	Espeak *EspeakStatus `json:"espeak,omitempty"`
}

StatusResponse represents the unified speech status.

type SwitchProviderRequest

// SwitchProviderRequest represents a TTS provider switch request.
type SwitchProviderRequest struct {
	// Provider is read from the JSON key "provider".
	Provider string `json:"provider"`
}

SwitchProviderRequest represents a TTS provider switch request.

type SwitchRequest

// SwitchRequest represents a model switch request.
type SwitchRequest struct {
	// ModelType is read from the JSON key "model_type".
	ModelType string `json:"model_type"`
}

SwitchRequest represents a model switch request.

type TTSConfig

// TTSConfig holds TTS-specific configuration.
//
// Every field carries exactly one json key and one yaml key, both using the
// same name. A struct tag must not repeat a key: reflect.StructTag only
// honours the first occurrence, and linters report the duplicate.
type TTSConfig struct {
	Provider string  `json:"provider" yaml:"provider"`
	Model    string  `json:"model" yaml:"model"`
	Speed    float32 `json:"speed" yaml:"speed"`   // Speech rate (0.5-2.0, default 1.0)
	Pitch    float32 `json:"pitch" yaml:"pitch"`   // Pitch adjustment (-10 to 10, default 0)
	Volume   float32 `json:"volume" yaml:"volume"` // Volume (0-100, default 100)
}

TTSConfig holds TTS-specific configuration.

type TTSConfigRequest

// TTSConfigRequest represents a TTS config update request.
// No field uses omitempty or a pointer type, so a key absent from the request
// body decodes to 0 and cannot be told apart from an explicit 0.
// NOTE(review): valid ranges presumably match those documented on TTSConfig — confirm.
type TTSConfigRequest struct {
	Speed  float32 `json:"speed"`
	Pitch  float32 `json:"pitch"`
	Volume float32 `json:"volume"`
}

TTSConfigRequest represents a TTS config update request.

type TTSStatus

// TTSStatus represents TTS status.
type TTSStatus struct {
	Ready              bool                                `json:"ready"`
	Provider           string                              `json:"provider"`
	ModelName          string                              `json:"model_name,omitempty"`
	AvailableProviders []string                            `json:"available_providers,omitempty"`
	// Models has no omitempty, so a nil slice is encoded as JSON null.
	Models             []interface{}                       `json:"models"`
	// Components is omitted from the JSON output when the map is nil or empty.
	// NOTE(review): map keys are presumably component names such as "kokoro" — confirm.
	Components         map[string]*ComponentDownloadStatus `json:"components,omitempty"`
}

TTSStatus represents TTS status.

type TranscriptionResult

// TranscriptionResult is a transcription result with edit support.
type TranscriptionResult struct {
	Text       string  `json:"text"`
	Language   string  `json:"language,omitempty"`
	// NOTE(review): the unit of Duration (presumably seconds) is not visible here — confirm.
	Duration   float64 `json:"duration,omitempty"`
	Confidence float64 `json:"confidence,omitempty"`
	// Editable has no omitempty, so it is always present in the JSON output.
	Editable   bool    `json:"editable"`
	SessionID  string  `json:"session_id,omitempty"`
}

TranscriptionResult is a transcription result with edit support.

Directories

Path Synopsis

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL