voice

package
v1.3.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 26, 2026 License: MIT Imports: 5 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type AudioChunk

// AudioChunk represents a chunk of audio data sent to a speech-to-text stream.
type AudioChunk struct {
	Data       []byte    `json:"data"`
	SampleRate int       `json:"sample_rate"`
	Channels   int       `json:"channels"`
	Timestamp  time.Time `json:"timestamp"`
	IsFinal    bool      `json:"is_final"` // presumably marks the final chunk of an utterance — confirm against provider
}

AudioChunk represents a chunk of audio data.

type AudioFrame

// AudioFrame represents a single audio frame.
type AudioFrame struct {
	Data       []byte    `json:"data"`
	SampleRate int       `json:"sample_rate"`
	Channels   int       `json:"channels"`
	Duration   int       `json:"duration_ms"` // duration in milliseconds (per the JSON tag)
	Timestamp  time.Time `json:"timestamp"`
}

AudioFrame represents a single audio frame.

type AudioMetrics

// AudioMetrics tracks audio-processing metrics.
type AudioMetrics struct {
	TotalRequests  int64         `json:"total_requests"`
	AverageLatency time.Duration `json:"average_latency"`
	P95Latency     time.Duration `json:"p95_latency"`
	TargetHitRate  float64       `json:"target_hit_rate"` // fraction of requests meeting the latency target — TODO confirm
	TotalAudioMS   int64         `json:"total_audio_ms"`
	Interruptions  int64         `json:"interruptions"`
	// contains filtered or unexported fields
}

AudioMetrics tracks audio-processing metrics.

type LLMHandler

// LLMHandler handles LLM interaction for voice processing.
type LLMHandler interface {
	// ProcessStream sends the input text to the model and returns a
	// channel of streamed response fragments.
	ProcessStream(ctx context.Context, input string) (<-chan string, error)
}

LLMHandler handles LLM interaction for voice processing.

type MultimodalInput

// MultimodalInput represents the input to native audio reasoning; audio,
// text, and image are each optional.
type MultimodalInput struct {
	Audio     []AudioFrame   `json:"audio,omitempty"`
	Text      string         `json:"text,omitempty"`
	Image     []byte         `json:"image,omitempty"`
	Timestamp time.Time      `json:"timestamp"`
	Metadata  map[string]any `json:"metadata,omitempty"`
}

MultimodalInput represents the input to native audio reasoning.

type MultimodalOutput

// MultimodalOutput represents the output of native audio reasoning.
type MultimodalOutput struct {
	Audio       []AudioFrame `json:"audio,omitempty"`
	Text        string       `json:"text,omitempty"`
	Transcript  string       `json:"transcript,omitempty"`
	LatencyMS   int64        `json:"latency_ms"` // latency in milliseconds (per the JSON tag)
	TokensUsed  int          `json:"tokens_used"`
	Confidence  float64      `json:"confidence"`
	Interrupted bool         `json:"interrupted"`
}

MultimodalOutput represents the output of native audio reasoning.

type NativeAudioConfig

// NativeAudioConfig configures native audio reasoning.
type NativeAudioConfig struct {
	TargetLatencyMS int           `json:"target_latency_ms"` // Target: 232ms
	SampleRate      int           `json:"sample_rate"`
	ChunkSizeMS     int           `json:"chunk_size_ms"`
	BufferSize      int           `json:"buffer_size"`
	EnableVAD       bool          `json:"enable_vad"` // voice activity detection
	Timeout         time.Duration `json:"timeout"`
}

NativeAudioConfig configures native audio reasoning.

func DefaultNativeAudioConfig

func DefaultNativeAudioConfig() NativeAudioConfig

DefaultNativeAudioConfig returns defaults optimized for low latency.

type NativeAudioProvider

// NativeAudioProvider defines the interface to a native audio model.
type NativeAudioProvider interface {
	// ProcessAudio performs one-shot processing of a multimodal input.
	ProcessAudio(ctx context.Context, input MultimodalInput) (*MultimodalOutput, error)
	// StreamAudio processes a stream of input frames and returns a stream
	// of output frames.
	StreamAudio(ctx context.Context, input <-chan AudioFrame) (<-chan AudioFrame, error)
	// Name reports the provider's name.
	Name() string
}

NativeAudioProvider defines the interface to a native audio model.

type NativeAudioReasoner

// NativeAudioReasoner provides GPT-4o-style native audio reasoning.
type NativeAudioReasoner struct {
	// contains filtered or unexported fields
}

NativeAudioReasoner provides GPT-4o-style native audio reasoning.

func NewNativeAudioReasoner

func NewNativeAudioReasoner(provider NativeAudioProvider, config NativeAudioConfig, logger *zap.Logger) *NativeAudioReasoner

NewNativeAudioReasoner creates a new NativeAudioReasoner.

func (*NativeAudioReasoner) GetMetrics

func (r *NativeAudioReasoner) GetMetrics() AudioMetrics

GetMetrics returns the current metrics.

func (*NativeAudioReasoner) Interrupt

func (r *NativeAudioReasoner) Interrupt()

Interrupt interrupts the current audio processing.

func (*NativeAudioReasoner) Process

Process handles multimodal input using native audio reasoning.

func (*NativeAudioReasoner) StreamProcess

func (r *NativeAudioReasoner) StreamProcess(ctx context.Context, inputChan <-chan AudioFrame) (<-chan AudioFrame, error)

StreamProcess processes audio in streaming mode for minimal latency.

type STTProvider

// STTProvider defines the speech-to-text interface.
type STTProvider interface {
	// StartStream opens a streaming transcription session at the given
	// sample rate.
	StartStream(ctx context.Context, sampleRate int) (STTStream, error)
	// Name reports the provider's name.
	Name() string
}

STTProvider defines the speech-to-text interface.

type STTStream

// STTStream represents a streaming speech-to-text session.
type STTStream interface {
	// Send submits an audio chunk for transcription.
	Send(chunk AudioChunk) error
	// Receive returns the channel on which transcript events are delivered.
	Receive() <-chan TranscriptEvent
	// Close terminates the stream.
	Close() error
}

STTStream represents a streaming STT session.

type SpeechEvent

// SpeechEvent represents a text-to-speech event.
type SpeechEvent struct {
	Audio     []byte    `json:"audio"`
	Text      string    `json:"text"`
	IsFinal   bool      `json:"is_final"`
	Timestamp time.Time `json:"timestamp"`
}

SpeechEvent represents a text-to-speech event.

type TTSProvider

// TTSProvider defines the text-to-speech interface.
type TTSProvider interface {
	// Synthesize converts a single text into a stream of speech events.
	Synthesize(ctx context.Context, text string) (<-chan SpeechEvent, error)
	// SynthesizeStream converts a stream of text fragments into a stream
	// of speech events.
	SynthesizeStream(ctx context.Context, textChan <-chan string) (<-chan SpeechEvent, error)
	// Name reports the provider's name.
	Name() string
}

TTSProvider defines the text-to-speech interface.

type TranscriptEvent

// TranscriptEvent represents a speech-to-text event.
type TranscriptEvent struct {
	Text       string    `json:"text"`
	IsFinal    bool      `json:"is_final"`
	Confidence float64   `json:"confidence"`
	StartTime  float64   `json:"start_time"` // units unspecified here — presumably seconds; confirm with provider
	EndTime    float64   `json:"end_time"`
	Timestamp  time.Time `json:"timestamp"`
}

TranscriptEvent represents a speech-to-text event.

type VoiceAgent

// VoiceAgent implements a real-time voice agent.
type VoiceAgent struct {
	// contains filtered or unexported fields
}

VoiceAgent implements a real-time voice agent.

func NewVoiceAgent

func NewVoiceAgent(config VoiceConfig, stt STTProvider, tts TTSProvider, llm LLMHandler, logger *zap.Logger) *VoiceAgent

NewVoiceAgent creates a new voice agent.

func (*VoiceAgent) GetMetrics

func (v *VoiceAgent) GetMetrics() VoiceMetrics

GetMetrics returns the current metrics.

func (*VoiceAgent) GetState

func (v *VoiceAgent) GetState() VoiceState

GetState returns the current state.

func (*VoiceAgent) Start

func (v *VoiceAgent) Start(ctx context.Context) (*VoiceSession, error)

Start begins a voice conversation.

type VoiceConfig

// VoiceConfig configures the voice agent.
type VoiceConfig struct {
	STTProvider      string        `json:"stt_provider"`      // deepgram, assemblyai, whisper
	TTSProvider      string        `json:"tts_provider"`      // elevenlabs, openai, azure
	SampleRate       int           `json:"sample_rate"`       // 16000, 24000, 48000
	MaxLatencyMS     int           `json:"max_latency_ms"`    // Target latency
	VADEnabled       bool          `json:"vad_enabled"`       // Voice Activity Detection
	InterruptEnabled bool          `json:"interrupt_enabled"` // Allow interruptions
	BufferDuration   time.Duration `json:"buffer_duration"`
}

VoiceConfig configures the voice agent.

func DefaultVoiceConfig

func DefaultVoiceConfig() VoiceConfig

DefaultVoiceConfig returns defaults optimized for low latency.

type VoiceMetrics

// VoiceMetrics tracks voice-agent performance.
type VoiceMetrics struct {
	TotalSessions     int64         `json:"total_sessions"`
	AverageLatency    time.Duration `json:"average_latency"`
	P95Latency        time.Duration `json:"p95_latency"`
	InterruptionCount int64         `json:"interruption_count"`
	TotalAudioSeconds float64       `json:"total_audio_seconds"`
}

VoiceMetrics tracks voice-agent performance.

type VoiceSession

// VoiceSession represents an active voice conversation.
type VoiceSession struct {
	ID string // session identifier
	// contains filtered or unexported fields
}

VoiceSession represents an active voice conversation.

func (*VoiceSession) Close

func (s *VoiceSession) Close() error

Close closes the session.

func (*VoiceSession) Interrupt

func (s *VoiceSession) Interrupt()

Interrupt interrupts the current speech.

func (*VoiceSession) ReceiveSpeech

func (s *VoiceSession) ReceiveSpeech() <-chan SpeechEvent

ReceiveSpeech returns a channel for receiving synthesized speech.

func (*VoiceSession) SendAudio

func (s *VoiceSession) SendAudio(chunk AudioChunk) error

SendAudio sends audio data to the session.

type VoiceState

type VoiceState string

VoiceState represents the current state of the voice agent.

// The possible voice-agent states.
const (
	StateIdle        VoiceState = "idle"
	StateListening   VoiceState = "listening"
	StateProcessing  VoiceState = "processing"
	StateSpeaking    VoiceState = "speaking"
	StateInterrupted VoiceState = "interrupted"
)

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL