voice

package
v1.3.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 26, 2026 License: MIT Imports: 5 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type AudioChunk

// AudioChunk represents a chunk of audio data sent to a speech-to-text stream.
type AudioChunk struct {
	Data       []byte    `json:"data"`
	SampleRate int       `json:"sample_rate"`
	Channels   int       `json:"channels"`
	Timestamp  time.Time `json:"timestamp"`
	IsFinal    bool      `json:"is_final"` // presumably marks the final chunk of an utterance — confirm against provider
}

AudioChunk represents a chunk of audio data.

type AudioFrame

// AudioFrame represents a single audio frame.
type AudioFrame struct {
	Data       []byte    `json:"data"`
	SampleRate int       `json:"sample_rate"`
	Channels   int       `json:"channels"`
	Duration   int       `json:"duration_ms"` // duration in milliseconds (per the JSON tag)
	Timestamp  time.Time `json:"timestamp"`
}

AudioFrame represents a single audio frame.

type AudioMetrics

// AudioMetrics tracks audio-processing metrics.
type AudioMetrics struct {
	TotalRequests  int64         `json:"total_requests"`
	AverageLatency time.Duration `json:"average_latency"`
	P95Latency     time.Duration `json:"p95_latency"`
	TargetHitRate  float64       `json:"target_hit_rate"` // fraction of requests meeting the latency target — TODO confirm
	TotalAudioMS   int64         `json:"total_audio_ms"`
	Interruptions  int64         `json:"interruptions"`
	// contains filtered or unexported fields
}

AudioMetrics tracks audio-processing metrics.

type LLMHandler

// LLMHandler handles LLM interaction for voice processing.
type LLMHandler interface {
	// ProcessStream sends the input text to the model and returns a
	// channel of streamed response fragments.
	ProcessStream(ctx context.Context, input string) (<-chan string, error)
}

LLMHandler handles LLM interaction for voice processing.

type MultimodalInput

// MultimodalInput represents the input to native audio reasoning; audio,
// text, and image are each optional.
type MultimodalInput struct {
	Audio     []AudioFrame   `json:"audio,omitempty"`
	Text      string         `json:"text,omitempty"`
	Image     []byte         `json:"image,omitempty"`
	Timestamp time.Time      `json:"timestamp"`
	Metadata  map[string]any `json:"metadata,omitempty"`
}

MultimodalInput represents the input to native audio reasoning.

type MultimodalOutput

// MultimodalOutput represents the output of native audio reasoning.
type MultimodalOutput struct {
	Audio       []AudioFrame `json:"audio,omitempty"`
	Text        string       `json:"text,omitempty"`
	Transcript  string       `json:"transcript,omitempty"`
	LatencyMS   int64        `json:"latency_ms"` // latency in milliseconds (per the JSON tag)
	TokensUsed  int          `json:"tokens_used"`
	Confidence  float64      `json:"confidence"`
	Interrupted bool         `json:"interrupted"`
}

MultimodalOutput represents the output of native audio reasoning.

type NativeAudioConfig

// NativeAudioConfig configures native audio reasoning.
type NativeAudioConfig struct {
	TargetLatencyMS int           `json:"target_latency_ms"` // Target: 232ms
	SampleRate      int           `json:"sample_rate"`
	ChunkSizeMS     int           `json:"chunk_size_ms"`
	BufferSize      int           `json:"buffer_size"`
	EnableVAD       bool          `json:"enable_vad"` // voice activity detection
	Timeout         time.Duration `json:"timeout"`
}

NativeAudioConfig configures native audio reasoning.

func DefaultNativeAudioConfig

func DefaultNativeAudioConfig() NativeAudioConfig

DefaultNativeAudioConfig returns defaults optimized for low latency.

type NativeAudioProvider

// NativeAudioProvider defines the interface to a native audio model.
type NativeAudioProvider interface {
	// ProcessAudio performs one-shot processing of a multimodal input.
	ProcessAudio(ctx context.Context, input MultimodalInput) (*MultimodalOutput, error)
	// StreamAudio processes a stream of input frames and returns a stream
	// of output frames.
	StreamAudio(ctx context.Context, input <-chan AudioFrame) (<-chan AudioFrame, error)
	// Name reports the provider's name.
	Name() string
}

NativeAudioProvider defines the interface to a native audio model.

type NativeAudioReasoner

// NativeAudioReasoner provides GPT-4o-style native audio reasoning.
type NativeAudioReasoner struct {
	// contains filtered or unexported fields
}

NativeAudioReasoner provides GPT-4o-style native audio reasoning.

func NewNativeAudioReasoner

func NewNativeAudioReasoner(provider NativeAudioProvider, config NativeAudioConfig, logger *zap.Logger) *NativeAudioReasoner

NewNativeAudioReasoner creates a new NativeAudioReasoner.

func (*NativeAudioReasoner) GetMetrics

func (r *NativeAudioReasoner) GetMetrics() AudioMetrics

GetMetrics returns the current metrics.

func (*NativeAudioReasoner) Interrupt

func (r *NativeAudioReasoner) Interrupt()

Interrupt interrupts the current audio processing.

func (*NativeAudioReasoner) Process

Process handles multimodal input using native audio reasoning.

func (*NativeAudioReasoner) StreamProcess

func (r *NativeAudioReasoner) StreamProcess(ctx context.Context, inputChan <-chan AudioFrame) (<-chan AudioFrame, error)

StreamProcess processes audio in streaming mode for minimal latency.

type STTProvider

// STTProvider defines the speech-to-text interface.
type STTProvider interface {
	// StartStream opens a streaming transcription session at the given
	// sample rate.
	StartStream(ctx context.Context, sampleRate int) (STTStream, error)
	// Name reports the provider's name.
	Name() string
}

STTProvider defines the speech-to-text interface.

type STTStream

// STTStream represents a streaming speech-to-text session.
type STTStream interface {
	// Send submits an audio chunk for transcription.
	Send(chunk AudioChunk) error
	// Receive returns the channel on which transcript events are delivered.
	Receive() <-chan TranscriptEvent
	// Close terminates the stream.
	Close() error
}

STTStream represents a streaming STT session.

type SpeechEvent

// SpeechEvent represents a text-to-speech event.
type SpeechEvent struct {
	Audio     []byte    `json:"audio"`
	Text      string    `json:"text"`
	IsFinal   bool      `json:"is_final"`
	Timestamp time.Time `json:"timestamp"`
}

SpeechEvent represents a text-to-speech event.

type TTSProvider

// TTSProvider defines the text-to-speech interface.
type TTSProvider interface {
	// Synthesize converts a single text into a stream of speech events.
	Synthesize(ctx context.Context, text string) (<-chan SpeechEvent, error)
	// SynthesizeStream converts a stream of text fragments into a stream
	// of speech events.
	SynthesizeStream(ctx context.Context, textChan <-chan string) (<-chan SpeechEvent, error)
	// Name reports the provider's name.
	Name() string
}

TTSProvider defines the text-to-speech interface.

type TranscriptEvent

// TranscriptEvent represents a speech-to-text event.
type TranscriptEvent struct {
	Text       string    `json:"text"`
	IsFinal    bool      `json:"is_final"`
	Confidence float64   `json:"confidence"`
	StartTime  float64   `json:"start_time"` // units unspecified here — presumably seconds; confirm with provider
	EndTime    float64   `json:"end_time"`
	Timestamp  time.Time `json:"timestamp"`
}

TranscriptEvent represents a speech-to-text event.

type VoiceAgent

// VoiceAgent implements a real-time voice agent.
type VoiceAgent struct {
	// contains filtered or unexported fields
}

VoiceAgent implements a real-time voice agent.

func NewVoiceAgent

func NewVoiceAgent(config VoiceConfig, stt STTProvider, tts TTSProvider, llm LLMHandler, logger *zap.Logger) *VoiceAgent

NewVoiceAgent creates a new voice agent.

func (*VoiceAgent) GetMetrics

func (v *VoiceAgent) GetMetrics() VoiceMetrics

GetMetrics returns the current metrics.

func (*VoiceAgent) GetState

func (v *VoiceAgent) GetState() VoiceState

GetState returns the current state.

func (*VoiceAgent) Start

func (v *VoiceAgent) Start(ctx context.Context) (*VoiceSession, error)

Start begins a voice conversation.

type VoiceConfig

// VoiceConfig configures the voice agent.
type VoiceConfig struct {
	STTProvider      string        `json:"stt_provider"`      // deepgram, assemblyai, whisper
	TTSProvider      string        `json:"tts_provider"`      // elevenlabs, openai, azure
	SampleRate       int           `json:"sample_rate"`       // 16000, 24000, 48000
	MaxLatencyMS     int           `json:"max_latency_ms"`    // Target latency
	VADEnabled       bool          `json:"vad_enabled"`       // Voice Activity Detection
	InterruptEnabled bool          `json:"interrupt_enabled"` // Allow interruptions
	BufferDuration   time.Duration `json:"buffer_duration"`
}

VoiceConfig configures the voice agent.

func DefaultVoiceConfig

func DefaultVoiceConfig() VoiceConfig

DefaultVoiceConfig returns defaults optimized for low latency.

type VoiceMetrics

// VoiceMetrics tracks voice-agent performance.
type VoiceMetrics struct {
	TotalSessions     int64         `json:"total_sessions"`
	AverageLatency    time.Duration `json:"average_latency"`
	P95Latency        time.Duration `json:"p95_latency"`
	InterruptionCount int64         `json:"interruption_count"`
	TotalAudioSeconds float64       `json:"total_audio_seconds"`
}

VoiceMetrics tracks voice-agent performance.

type VoiceSession

// VoiceSession represents an active voice conversation.
type VoiceSession struct {
	ID string // session identifier
	// contains filtered or unexported fields
}

VoiceSession represents an active voice conversation.

func (*VoiceSession) Close

func (s *VoiceSession) Close() error

Close closes the session.

func (*VoiceSession) Interrupt

func (s *VoiceSession) Interrupt()

Interrupt interrupts the current speech.

func (*VoiceSession) ReceiveSpeech

func (s *VoiceSession) ReceiveSpeech() <-chan SpeechEvent

ReceiveSpeech returns a channel for receiving synthesized speech.

func (*VoiceSession) SendAudio

func (s *VoiceSession) SendAudio(chunk AudioChunk) error

SendAudio sends audio data to the session.

type VoiceState

type VoiceState string

VoiceState represents the current state of the voice agent.

// The possible voice-agent states.
const (
	StateIdle        VoiceState = "idle"
	StateListening   VoiceState = "listening"
	StateProcessing  VoiceState = "processing"
	StateSpeaking    VoiceState = "speaking"
	StateInterrupted VoiceState = "interrupted"
)

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL