package stt

Version: v0.0.2 (latest) | Published: Apr 14, 2026 | License: MIT | Imports: 9 | Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/cavos-io/rtp-agent

Links

Open Source Insights

Documentation ¶

Index ¶

type FallbackAdapter
- func NewFallbackAdapter(stts []STT) *FallbackAdapter
type MultiSpeakerAdapter
- func NewMultiSpeakerAdapter(stt STT, detectPrimary bool, suppressBackground bool, primaryFormat string, backgroundFormat string, opt *PrimarySpeakerDetectionOptions) (*MultiSpeakerAdapter, error)
type PrimarySpeakerDetectionOptions
- func DefaultPrimarySpeakerDetectionOptions() PrimarySpeakerDetectionOptions
type RecognizeStream
type STT
type STTCapabilities
type SearchStream
type SpeechData
type SpeechEvent
type SpeechEventType
type StreamAdapter
- func NewStreamAdapter(stt STT, vad vad.VAD) *StreamAdapter

Constants ¶

This section is empty.

Variables ¶

This section is empty.

Functions ¶

This section is empty.

Types ¶

type FallbackAdapter ¶

type FallbackAdapter struct {
	// contains filtered or unexported fields
}

func NewFallbackAdapter ¶

func NewFallbackAdapter(stts []STT) *FallbackAdapter

func (*FallbackAdapter) Capabilities ¶

func (f *FallbackAdapter) Capabilities() STTCapabilities

func (*FallbackAdapter) Label ¶

func (f *FallbackAdapter) Label() string

func (*FallbackAdapter) Recognize ¶

func (f *FallbackAdapter) Recognize(ctx context.Context, frames []*model.AudioFrame, language string) (*SpeechEvent, error)

func (*FallbackAdapter) Stream ¶

func (f *FallbackAdapter) Stream(ctx context.Context, language string) (RecognizeStream, error)

type MultiSpeakerAdapter ¶

type MultiSpeakerAdapter struct {
	// contains filtered or unexported fields
}

func NewMultiSpeakerAdapter ¶

func NewMultiSpeakerAdapter(stt STT, detectPrimary bool, suppressBackground bool, primaryFormat string, backgroundFormat string, opt *PrimarySpeakerDetectionOptions) (*MultiSpeakerAdapter, error)

func (*MultiSpeakerAdapter) Capabilities ¶

func (a *MultiSpeakerAdapter) Capabilities() STTCapabilities

func (*MultiSpeakerAdapter) Label ¶

func (a *MultiSpeakerAdapter) Label() string

func (*MultiSpeakerAdapter) Recognize ¶

func (a *MultiSpeakerAdapter) Recognize(ctx context.Context, frames []*model.AudioFrame, language string) (*SpeechEvent, error)

func (*MultiSpeakerAdapter) Stream ¶

func (a *MultiSpeakerAdapter) Stream(ctx context.Context, language string) (RecognizeStream, error)

type PrimarySpeakerDetectionOptions ¶

type PrimarySpeakerDetectionOptions struct {
	FrameSizeMs            int
	RMSBufferDuration      float64
	MinRMSSamples          int
	RMSSmoothingFactor     float64
	ThresholdMultiplier    float64
	DecayToEqualTime       float64
	ThresholdMinMultiplier float64
}

func DefaultPrimarySpeakerDetectionOptions ¶

func DefaultPrimarySpeakerDetectionOptions() PrimarySpeakerDetectionOptions

type RecognizeStream ¶

type RecognizeStream interface {
	PushFrame(frame *model.AudioFrame) error
	Flush() error
	Close() error
	Next() (*SpeechEvent, error)
}

type STT ¶

type STT interface {
	Label() string
	Capabilities() STTCapabilities
	Stream(ctx context.Context, language string) (RecognizeStream, error)
	Recognize(ctx context.Context, frames []*model.AudioFrame, language string) (*SpeechEvent, error)
}

type STTCapabilities ¶

type STTCapabilities struct {
	Streaming        bool
	InterimResults   bool
	Diarization      bool
	OfflineRecognize bool
}

type SearchStream ¶

type SearchStream interface {
	PushFrame(frame *model.AudioFrame) error
	Close() error
	Next() (*SpeechEvent, error)
}

type SpeechData ¶

type SpeechData struct {
	Language         string
	Text             string
	StartTime        float64
	EndTime          float64
	Confidence       float64
	SpeakerID        string
	IsPrimarySpeaker *bool
}

type SpeechEvent ¶

type SpeechEvent struct {
	Type         SpeechEventType
	RequestID    string
	Alternatives []SpeechData
	Interrupted  bool
}

type SpeechEventType ¶

type SpeechEventType string

const (
	SpeechEventStartOfSpeech       SpeechEventType = "start_of_speech"
	SpeechEventInterimTranscript   SpeechEventType = "interim_transcript"
	SpeechEventPreflightTranscript SpeechEventType = "preflight_transcript"
	SpeechEventFinalTranscript     SpeechEventType = "final_transcript"
	SpeechEventRecognitionUsage    SpeechEventType = "recognition_usage"
	SpeechEventEndOfSpeech         SpeechEventType = "end_of_speech"
)

type StreamAdapter ¶

type StreamAdapter struct {
	// contains filtered or unexported fields
}

StreamAdapter converts a non-streaming STT into a streaming STT by coupling it with a VAD. It buffers audio frames and sends them to the underlying STT's Recognize method when the VAD detects speech.

func NewStreamAdapter ¶

func NewStreamAdapter(stt STT, vad vad.VAD) *StreamAdapter

func (*StreamAdapter) Capabilities ¶

func (a *StreamAdapter) Capabilities() STTCapabilities

func (*StreamAdapter) Label ¶

func (a *StreamAdapter) Label() string

func (*StreamAdapter) Recognize ¶

func (a *StreamAdapter) Recognize(ctx context.Context, frames []*model.AudioFrame, language string) (*SpeechEvent, error)

func (*StreamAdapter) Stream ¶

func (a *StreamAdapter) Stream(ctx context.Context, language string) (RecognizeStream, error)

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL