stt

package
v0.17.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Apr 12, 2026 License: Apache-2.0 Imports: 17 Imported by: 0

Documentation

Index

Constants

View Source
const MinWhisperModelBytes = 50_000_000

MinWhisperModelBytes is the minimum file size we expect for a valid ggml model. ggml-base.bin is ~150 MB; anything under 50 MB is clearly corrupt/truncated.

Variables

This section is empty.

Functions

This section is empty.

Types

type GoogleSTTProvider

type GoogleSTTProvider struct {
	APIKey  string
	Model   string // "chirp_3", "chirp_2", "latest_long"
	BaseURL string // Override for testing; defaults to googleSTTBaseURL
	// contains filtered or unexported fields
}

GoogleSTTProvider implements STTProvider for the Google Cloud Speech-to-Text v1 REST API.

func NewGoogleSTTProvider

func NewGoogleSTTProvider(apiKey, model string) *GoogleSTTProvider

NewGoogleSTTProvider creates a provider for Google Cloud Speech-to-Text. Model defaults to "chirp_3" if empty.

func (*GoogleSTTProvider) Health

func (p *GoogleSTTProvider) Health(ctx context.Context) error

Health checks if the Google Speech API is reachable.

func (*GoogleSTTProvider) Name

func (p *GoogleSTTProvider) Name() string

Name returns the provider identifier.

func (*GoogleSTTProvider) Transcribe

func (p *GoogleSTTProvider) Transcribe(ctx context.Context, audio []byte, opts TranscribeOpts) (*Result, error)

Transcribe sends audio to the Google Cloud Speech-to-Text v1 REST API.

type HuggingFaceProvider

type HuggingFaceProvider struct {
	Model   string
	Token   string
	BaseURL string // Override for testing; defaults to hfBaseURL
	// contains filtered or unexported fields
}

HuggingFaceProvider implements STTProvider for Tier 3: HuggingFace Inference API.

func NewHuggingFaceProvider

func NewHuggingFaceProvider(model, token string) *HuggingFaceProvider

func (*HuggingFaceProvider) Health

func (p *HuggingFaceProvider) Health(ctx context.Context) error

func (*HuggingFaceProvider) Name

func (p *HuggingFaceProvider) Name() string

func (*HuggingFaceProvider) Transcribe

func (p *HuggingFaceProvider) Transcribe(ctx context.Context, audio []byte, opts TranscribeOpts) (*Result, error)

type InstallStatus added in v0.17.0

type InstallStatus struct {
	BinaryFound bool     `json:"binaryFound"`
	BinaryPath  string   `json:"binaryPath"`
	ModelFound  bool     `json:"modelFound"`
	ModelPath   string   `json:"modelPath"`
	ModelBytes  int64    `json:"modelBytes"`
	ServerReady bool     `json:"serverReady"`
	Problems    []string `json:"problems,omitempty"`
}

InstallStatus describes what's present and what's missing for local STT.

type LocalProvider

type LocalProvider struct {
	BaseURL   string // e.g. "http://127.0.0.1:8080"
	Port      int
	ModelPath string
	GPU       string
	// contains filtered or unexported fields
}

LocalProvider implements STTProvider for Tier 1: localhost whisper.cpp server.

func NewLocalProvider

func NewLocalProvider(port int, modelPath, gpu string) *LocalProvider

func (*LocalProvider) Health

func (p *LocalProvider) Health(ctx context.Context) error

func (*LocalProvider) IsReady

func (p *LocalProvider) IsReady() bool

IsReady returns true if the whisper-server subprocess is running and responding.

func (*LocalProvider) Name

func (p *LocalProvider) Name() string

func (*LocalProvider) StartServer

func (p *LocalProvider) StartServer(ctx context.Context) error

StartServer starts the whisper.cpp server subprocess. It blocks until the server is ready or the context is cancelled.

func (*LocalProvider) StopServer

func (p *LocalProvider) StopServer()

StopServer terminates the whisper-server subprocess.

func (*LocalProvider) Transcribe

func (p *LocalProvider) Transcribe(ctx context.Context, audio []byte, opts TranscribeOpts) (*Result, error)

func (*LocalProvider) VerifyInstallation added in v0.17.0

func (p *LocalProvider) VerifyInstallation() InstallStatus

VerifyInstallation checks binary and model availability without starting the server.

type OpenAICompatibleProvider

type OpenAICompatibleProvider struct {
	BaseURL string
	APIKey  string
	Model   string
	// contains filtered or unexported fields
}

OpenAICompatibleProvider implements STTProvider for any endpoint speaking the OpenAI /v1/audio/transcriptions API (OpenAI, Groq, VPS whisper-server, etc.).

func NewGroqSTTProvider

func NewGroqSTTProvider(apiKey string) *OpenAICompatibleProvider

NewGroqSTTProvider creates a provider for the Groq Whisper API.

func NewOpenAICompatibleProvider

func NewOpenAICompatibleProvider(name, baseURL, apiKey, model string) *OpenAICompatibleProvider

NewOpenAICompatibleProvider creates a provider for any OpenAI-compatible STT endpoint.

func NewOpenAISTTProvider

func NewOpenAISTTProvider(apiKey string) *OpenAICompatibleProvider

NewOpenAISTTProvider creates a provider for the OpenAI Whisper API.

func NewVPSProvider

func NewVPSProvider(baseURL, apiKey string) *OpenAICompatibleProvider

NewVPSProvider creates a provider for a self-hosted whisper-server.

func (*OpenAICompatibleProvider) Health

func (p *OpenAICompatibleProvider) Health(ctx context.Context) error

Health checks provider reachability. It tries GET /health first (whisper-server), then falls back to GET /v1/models (OpenAI, Groq).

func (*OpenAICompatibleProvider) Name

func (p *OpenAICompatibleProvider) Name() string

Name returns the provider identifier.

func (*OpenAICompatibleProvider) Transcribe

func (p *OpenAICompatibleProvider) Transcribe(ctx context.Context, audio []byte, opts TranscribeOpts) (*Result, error)

Transcribe sends audio to the OpenAI-compatible /v1/audio/transcriptions endpoint.

type Result

type Result struct {
	Text       string
	Language   string
	Duration   time.Duration
	Provider   string
	Model      string
	Confidence float64 // If available from the provider
}

Result holds the output of a transcription.

type STTProvider

type STTProvider interface {
	// Transcribe sends audio data to the STT backend and returns the transcription.
	Transcribe(ctx context.Context, audio []byte, opts TranscribeOpts) (*Result, error)

	// Name returns the provider identifier (e.g. "local", "vps", "huggingface").
	Name() string

	// Health checks if the provider is reachable and ready.
	Health(ctx context.Context) error
}

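STTProvider defines the interface for all speech-to-text backends. OpenAICompatibleProvider targets the OpenAI-compatible /v1/audio/transcriptions API; other implementations, such as GoogleSTTProvider and HuggingFaceProvider, wrap their own provider-specific APIs.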

type TranscribeOpts

type TranscribeOpts struct {
	Language string // "de", "en", "auto"
	Model    string // Optional: model override
	Prompt   string // Optional: provider-specific hint prompt for better recognition
}

TranscribeOpts configures a single transcription request.

type VPSProvider

type VPSProvider = OpenAICompatibleProvider

VPSProvider is an alias for backward compatibility. Use OpenAICompatibleProvider directly for new code.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL