Documentation
¶
Index ¶
- type AudioChunk
- type AudioFrame
- type AudioMetrics
- type LLMHandler
- type MultimodalInput
- type MultimodalOutput
- type NativeAudioConfig
- type NativeAudioProvider
- type NativeAudioReasoner
- func (r *NativeAudioReasoner) GetMetrics() AudioMetrics
- func (r *NativeAudioReasoner) Interrupt()
- func (r *NativeAudioReasoner) Process(ctx context.Context, input MultimodalInput) (*MultimodalOutput, error)
- func (r *NativeAudioReasoner) StreamProcess(ctx context.Context, inputChan <-chan AudioFrame) (<-chan AudioFrame, error)
- type STTProvider
- type STTStream
- type SpeechEvent
- type TTSProvider
- type TranscriptEvent
- type VoiceAgent
- type VoiceConfig
- type VoiceMetrics
- type VoiceSession
- type VoiceState
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type AudioChunk ¶
type AudioChunk struct {
Data []byte `json:"data"`
SampleRate int `json:"sample_rate"`
Channels int `json:"channels"`
Timestamp time.Time `json:"timestamp"`
IsFinal bool `json:"is_final"`
}
AudioChunk represents a chunk of audio data.
type AudioFrame ¶
type AudioFrame struct {
Data []byte `json:"data"`
SampleRate int `json:"sample_rate"`
Channels int `json:"channels"`
Duration int `json:"duration_ms"`
Timestamp time.Time `json:"timestamp"`
}
AudioFrame represents a single frame of audio.
type AudioMetrics ¶
type AudioMetrics struct {
TotalRequests int64 `json:"total_requests"`
AverageLatency time.Duration `json:"average_latency"`
P95Latency time.Duration `json:"p95_latency"`
TargetHitRate float64 `json:"target_hit_rate"`
TotalAudioMS int64 `json:"total_audio_ms"`
Interruptions int64 `json:"interruptions"`
// contains filtered or unexported fields
}
AudioMetrics tracks audio processing metrics.
type LLMHandler ¶
type LLMHandler interface {
ProcessStream(ctx context.Context, input string) (<-chan string, error)
}
LLMHandler handles LLM interaction for voice processing.
type MultimodalInput ¶
type MultimodalInput struct {
Audio []AudioFrame `json:"audio,omitempty"`
Text string `json:"text,omitempty"`
Image []byte `json:"image,omitempty"`
Timestamp time.Time `json:"timestamp"`
Metadata map[string]any `json:"metadata,omitempty"`
}
MultimodalInput represents the input for native audio reasoning.
type MultimodalOutput ¶
type MultimodalOutput struct {
Audio []AudioFrame `json:"audio,omitempty"`
Text string `json:"text,omitempty"`
Transcript string `json:"transcript,omitempty"`
LatencyMS int64 `json:"latency_ms"`
TokensUsed int `json:"tokens_used"`
Confidence float64 `json:"confidence"`
Interrupted bool `json:"interrupted"`
}
MultimodalOutput represents the output of native audio reasoning.
type NativeAudioConfig ¶
type NativeAudioConfig struct {
TargetLatencyMS int `json:"target_latency_ms"` // Target: 232ms
SampleRate int `json:"sample_rate"`
ChunkSizeMS int `json:"chunk_size_ms"`
BufferSize int `json:"buffer_size"`
EnableVAD bool `json:"enable_vad"`
Timeout time.Duration `json:"timeout"`
}
NativeAudioConfig configures native audio reasoning.
func DefaultNativeAudioConfig ¶
func DefaultNativeAudioConfig() NativeAudioConfig
DefaultNativeAudioConfig returns defaults optimized for low latency.
type NativeAudioProvider ¶
type NativeAudioProvider interface {
ProcessAudio(ctx context.Context, input MultimodalInput) (*MultimodalOutput, error)
StreamAudio(ctx context.Context, input <-chan AudioFrame) (<-chan AudioFrame, error)
Name() string
}
NativeAudioProvider defines the interface for native audio models.
type NativeAudioReasoner ¶
type NativeAudioReasoner struct {
// contains filtered or unexported fields
}
NativeAudioReasoner provides GPT-4o-style native audio reasoning.
func NewNativeAudioReasoner ¶
func NewNativeAudioReasoner(provider NativeAudioProvider, config NativeAudioConfig, logger *zap.Logger) *NativeAudioReasoner
NewNativeAudioReasoner creates a new NativeAudioReasoner.
func (*NativeAudioReasoner) GetMetrics ¶
func (r *NativeAudioReasoner) GetMetrics() AudioMetrics
GetMetrics returns the current metrics.
func (*NativeAudioReasoner) Process ¶
func (r *NativeAudioReasoner) Process(ctx context.Context, input MultimodalInput) (*MultimodalOutput, error)
Process handles multimodal input using native audio reasoning.
func (*NativeAudioReasoner) StreamProcess ¶
func (r *NativeAudioReasoner) StreamProcess(ctx context.Context, inputChan <-chan AudioFrame) (<-chan AudioFrame, error)
StreamProcess processes audio in streaming mode for minimal latency.
type STTProvider ¶
type STTProvider interface {
StartStream(ctx context.Context, sampleRate int) (STTStream, error)
Name() string
}
STTProvider defines the speech-to-text interface.
type STTStream ¶
type STTStream interface {
Send(chunk AudioChunk) error
Receive() <-chan TranscriptEvent
Close() error
}
STTStream represents a streaming STT session.
type SpeechEvent ¶
type SpeechEvent struct {
Audio []byte `json:"audio"`
Text string `json:"text"`
IsFinal bool `json:"is_final"`
Timestamp time.Time `json:"timestamp"`
}
SpeechEvent represents a text-to-speech event.
type TTSProvider ¶
type TTSProvider interface {
Synthesize(ctx context.Context, text string) (<-chan SpeechEvent, error)
SynthesizeStream(ctx context.Context, textChan <-chan string) (<-chan SpeechEvent, error)
Name() string
}
TTSProvider defines the text-to-speech interface.
type TranscriptEvent ¶
type TranscriptEvent struct {
Text string `json:"text"`
IsFinal bool `json:"is_final"`
Confidence float64 `json:"confidence"`
StartTime float64 `json:"start_time"`
EndTime float64 `json:"end_time"`
Timestamp time.Time `json:"timestamp"`
}
TranscriptEvent represents a speech-to-text event.
type VoiceAgent ¶
type VoiceAgent struct {
// contains filtered or unexported fields
}
VoiceAgent implements a real-time voice agent.
func NewVoiceAgent ¶
func NewVoiceAgent(config VoiceConfig, stt STTProvider, tts TTSProvider, llm LLMHandler, logger *zap.Logger) *VoiceAgent
NewVoiceAgent creates a new VoiceAgent.
func (*VoiceAgent) Start ¶
func (v *VoiceAgent) Start(ctx context.Context) (*VoiceSession, error)
Start begins a voice conversation.
type VoiceConfig ¶
type VoiceConfig struct {
STTProvider string `json:"stt_provider"` // deepgram, assemblyai, whisper
TTSProvider string `json:"tts_provider"` // elevenlabs, openai, azure
SampleRate int `json:"sample_rate"` // 16000, 24000, 48000
MaxLatencyMS int `json:"max_latency_ms"` // Target latency
VADEnabled bool `json:"vad_enabled"` // Voice Activity Detection
InterruptEnabled bool `json:"interrupt_enabled"` // Allow interruptions
BufferDuration time.Duration `json:"buffer_duration"`
}
VoiceConfig configures the voice agent.
type VoiceMetrics ¶
type VoiceMetrics struct {
TotalSessions int64 `json:"total_sessions"`
AverageLatency time.Duration `json:"average_latency"`
P95Latency time.Duration `json:"p95_latency"`
InterruptionCount int64 `json:"interruption_count"`
TotalAudioSeconds float64 `json:"total_audio_seconds"`
}
VoiceMetrics tracks voice agent performance.
type VoiceSession ¶
type VoiceSession struct {
ID string
// contains filtered or unexported fields
}
VoiceSession represents an active voice conversation.
func (*VoiceSession) ReceiveSpeech ¶
func (s *VoiceSession) ReceiveSpeech() <-chan SpeechEvent
ReceiveSpeech returns a channel for receiving synthesized speech.
func (*VoiceSession) SendAudio ¶
func (s *VoiceSession) SendAudio(chunk AudioChunk) error
SendAudio sends audio data to the session.
type VoiceState ¶
type VoiceState string
VoiceState represents the current state of the voice agent.
const (
	StateIdle        VoiceState = "idle"
	StateListening   VoiceState = "listening"
	StateProcessing  VoiceState = "processing"
	StateSpeaking    VoiceState = "speaking"
	StateInterrupted VoiceState = "interrupted"
)