Documentation
¶
Overview ¶
Package stt provides a unified interface for Speech-to-Text providers.
Index ¶
- Variables
- type Client
- func (c *Client) Provider(name string) (Provider, bool)
- func (c *Client) SetFallbacks(names ...string)
- func (c *Client) SetPrimary(name string)
- func (c *Client) Transcribe(ctx context.Context, audio []byte, config TranscriptionConfig) (*TranscriptionResult, error)
- func (c *Client) TranscribeStream(ctx context.Context, config TranscriptionConfig) (io.WriteCloser, <-chan StreamEvent, error)
- type Provider
- type Segment
- type StreamEvent
- type StreamEventType
- type StreamingProvider
- type TranscriptionConfig
- type TranscriptionResult
- type Word
Constants ¶
This section is empty.
Variables ¶
// Sentinel errors returned by the stt package. Every message carries the
// "stt: " prefix. Callers should test for them with errors.Is rather than by
// comparing message text.
var (
	// ErrNoAvailableProvider is returned when no provider is available.
	ErrNoAvailableProvider = errors.New("stt: no available provider")

	// ErrStreamingNotSupported is returned when streaming is not supported.
	ErrStreamingNotSupported = errors.New("stt: streaming not supported by any provider")

	// ErrInvalidAudio is returned when the audio data is invalid.
	ErrInvalidAudio = errors.New("stt: invalid audio data")

	// ErrInvalidConfig is returned when the transcription config is invalid.
	ErrInvalidConfig = errors.New("stt: invalid configuration")

	// ErrAudioTooLong is returned when audio exceeds provider limits.
	ErrAudioTooLong = errors.New("stt: audio too long")

	// ErrAudioTooShort is returned when audio is too short to transcribe.
	ErrAudioTooShort = errors.New("stt: audio too short")

	// ErrRateLimited is returned when the provider rate limits the request.
	ErrRateLimited = errors.New("stt: rate limited")

	// ErrQuotaExceeded is returned when the provider quota is exceeded.
	ErrQuotaExceeded = errors.New("stt: quota exceeded")

	// ErrUnsupportedLanguage is returned when the language is not supported.
	ErrUnsupportedLanguage = errors.New("stt: unsupported language")

	// ErrUnsupportedFormat is returned when the audio format is not supported.
	ErrUnsupportedFormat = errors.New("stt: unsupported audio format")

	// ErrStreamClosed is returned when attempting to use a closed stream.
	ErrStreamClosed = errors.New("stt: stream closed")
)
Functions ¶
This section is empty.
Types ¶
type Client ¶
// Client provides a unified interface across multiple STT providers.
// Its fields are unexported; it is used through its methods (Provider,
// SetPrimary, SetFallbacks, Transcribe and TranscribeStream).
type Client struct {
// contains filtered or unexported fields
}
Client provides a unified interface across multiple STT providers.
func (*Client) SetFallbacks ¶
func (c *Client) SetFallbacks(names ...string)
SetFallbacks sets the fallback provider order; providers are given by name.
func (*Client) SetPrimary ¶
func (c *Client) SetPrimary(name string)
SetPrimary sets the primary provider by name.
func (*Client) Transcribe ¶
func (c *Client) Transcribe(ctx context.Context, audio []byte, config TranscriptionConfig) (*TranscriptionResult, error)
Transcribe uses the primary provider with automatic fallback.
func (*Client) TranscribeStream ¶
func (c *Client) TranscribeStream(ctx context.Context, config TranscriptionConfig) (io.WriteCloser, <-chan StreamEvent, error)
TranscribeStream attempts streaming transcription with the primary provider. It falls back to batch transcription if streaming is not available.
type Provider ¶
// Provider is the interface an STT provider implements. Its three
// transcription methods share the same config and result types and differ in
// where the audio comes from: a byte slice, a file path, or a URL.
type Provider interface {
	// Name reports the provider's name.
	Name() string

	// Transcribe turns the given audio bytes into text in batch mode.
	Transcribe(ctx context.Context, audio []byte, config TranscriptionConfig) (*TranscriptionResult, error)

	// TranscribeFile transcribes the audio found at filePath.
	TranscribeFile(ctx context.Context, filePath string, config TranscriptionConfig) (*TranscriptionResult, error)

	// TranscribeURL transcribes the audio found at url.
	TranscribeURL(ctx context.Context, url string, config TranscriptionConfig) (*TranscriptionResult, error)
}
Provider defines the interface for STT providers.
type Segment ¶
// Segment is one stretch of a transcription, such as a sentence or phrase.
type Segment struct {
	// Text holds the transcribed text of the segment.
	Text string

	// StartTime marks where the segment begins.
	StartTime time.Duration

	// EndTime marks where the segment finishes.
	EndTime time.Duration

	// Confidence is the segment's average confidence.
	Confidence float64

	// Speaker identifies the speaker; set only when diarization is enabled.
	Speaker string

	// Words lists word-level details; set only when enabled.
	Words []Word

	// Language is the language detected for the segment.
	Language string
}
Segment represents a segment of transcription (sentence, phrase).
type StreamEvent ¶
// StreamEvent is a single event produced by a streaming transcription.
type StreamEvent struct {
	// Type tells which kind of event this is.
	Type StreamEventType

	// Transcript is the current transcript, whether partial or final.
	Transcript string

	// IsFinal reports whether the result is final rather than interim.
	IsFinal bool

	// Segment carries the segment details that accompany a final result.
	Segment *Segment

	// SpeechStarted signals that voice activity has begun.
	SpeechStarted bool

	// SpeechEnded signals that voice activity has stopped.
	SpeechEnded bool

	// Error holds the error, if one occurred.
	Error error
}
StreamEvent represents an event from streaming transcription.
type StreamEventType ¶
// StreamEventType identifies the type of stream event. It is a string type;
// the values defined for it are EventTranscript, EventSpeechStart,
// EventSpeechEnd and EventError.
type StreamEventType string
StreamEventType identifies the type of stream event.
const ( // EventTranscript is a transcription result (partial or final). EventTranscript StreamEventType = "transcript" // EventSpeechStart indicates the user started speaking. EventSpeechStart StreamEventType = "speech_start" // EventSpeechEnd indicates the user stopped speaking. EventSpeechEnd StreamEventType = "speech_end" // EventError indicates an error occurred. EventError StreamEventType = "error" )
type StreamingProvider ¶
// StreamingProvider is a Provider that can also transcribe in real time.
type StreamingProvider interface {
	Provider

	// TranscribeStream opens a streaming transcription session. Audio is
	// sent through the returned writer and events arrive on the returned
	// channel.
	TranscribeStream(ctx context.Context, config TranscriptionConfig) (io.WriteCloser, <-chan StreamEvent, error)
}
StreamingProvider extends Provider with real-time streaming support.
type TranscriptionConfig ¶
// TranscriptionConfig holds the settings for one STT transcription request.
type TranscriptionConfig struct {
	// Language is a BCP-47 language code such as "en-US".
	// An empty value requests automatic detection.
	Language string

	// Model optionally names a provider-specific model.
	Model string

	// SampleRate is the sample rate of the audio, in Hz.
	SampleRate int

	// Channels is the audio channel count (1 = mono, 2 = stereo).
	Channels int

	// Encoding names the audio encoding ("pcm", "mp3", "wav", "opus", "flac").
	Encoding string

	// EnablePunctuation adds punctuation to the transcription.
	EnablePunctuation bool

	// EnableWordTimestamps includes timestamps at word level.
	EnableWordTimestamps bool

	// EnableSpeakerDiarization identifies the different speakers.
	EnableSpeakerDiarization bool

	// MaxSpeakers caps how many speakers are detected (for diarization).
	MaxSpeakers int

	// Keywords lists words or phrases whose recognition accuracy to boost.
	Keywords []string

	// VocabularyID is the ID of a provider-specific custom vocabulary.
	VocabularyID string
}
TranscriptionConfig configures an STT transcription request.
type TranscriptionResult ¶
// TranscriptionResult holds what an STT transcription produced.
type TranscriptionResult struct {
	// Text is the complete transcription text.
	Text string

	// Segments holds the segment-level details.
	Segments []Segment

	// Language is the language that was detected.
	Language string

	// LanguageConfidence is how confident the language detection is.
	LanguageConfidence float64

	// Duration is the length of the audio.
	Duration time.Duration
}
TranscriptionResult contains the result of an STT transcription.
type Word ¶
// Word is one transcribed word together with its timing.
type Word struct {
	// Text holds the transcribed word.
	Text string

	// StartTime marks where the word begins.
	StartTime time.Duration

	// EndTime marks where the word finishes.
	EndTime time.Duration

	// Confidence is the recognition confidence, from 0.0 to 1.0.
	Confidence float64

	// Speaker identifies the speaker; set only when diarization is enabled.
	Speaker string
}
Word represents a single transcribed word with timing.
Directories
¶
| Path | Synopsis |
|---|---|
| providertest | Package providertest provides conformance tests for STT provider implementations. |