Documentation
¶
Index ¶
- Variables
- func FindLastSentenceEnding(text string) int
- func NormalizeForCompare(text string) string
- func Similarity(a, b string) float64
- type DecoderComponent
- type DecoderConfig
- type Denoiser
- type DenoiserComponent
- type EchoFilterComponent
- type Engine
- type LoggingComponent
- type Metrics
- type PCMCoalesceComponent
- type PCMInputComponent
- type PassthroughComponent
- type Pipeline
- type PipelineComponent
- type PlaybackGate
- type RecognizerComponent
- func (c *RecognizerComponent) Close() error
- func (c *RecognizerComponent) Name() string
- func (c *RecognizerComponent) Process(ctx context.Context, data interface{}) (interface{}, bool, error)
- func (c *RecognizerComponent) SetOnError(fn func(err error, fatal bool))
- func (c *RecognizerComponent) SetOnTranscript(fn func(text string, isFinal bool))
- func (c *RecognizerComponent) Start() error
- type RecognizerEngine
- type SensitiveFilterComponent
- func (s *SensitiveFilterComponent) Name() string
- func (s *SensitiveFilterComponent) Process(ctx context.Context, data interface{}) (interface{}, bool, error)
- func (s *SensitiveFilterComponent) SetBlacklist(patterns []string) error
- func (s *SensitiveFilterComponent) SetFilterEmoji(enabled bool)
- func (s *SensitiveFilterComponent) SetReplaceWith(replacement string)
- func (s *SensitiveFilterComponent) SetWhitelist(patterns []string) error
- type SensitiveFilterConfig
- type SentenceFilter
- type StandardPipeline
- func (p *StandardPipeline) ClearTTSState()
- func (p *StandardPipeline) Close() error
- func (p *StandardPipeline) GetMetrics() Metrics
- func (p *StandardPipeline) IsTTSPlaying() bool
- func (p *StandardPipeline) Process(ctx context.Context, data interface{}) (interface{}, error)
- func (p *StandardPipeline) ProcessOutput(ctx context.Context, text string, isFinal bool)
- func (p *StandardPipeline) ResetState()
- func (p *StandardPipeline) SetBargeInCallback(callback func())
- func (p *StandardPipeline) SetOutputCallback(callback func(text string, isFinal bool))
- func (p *StandardPipeline) SetPCMAudioCallback(callback func(data []byte) error)
- func (p *StandardPipeline) SetTTSPlaying(playing bool)
- func (p *StandardPipeline) WirePlaybackGate(gate *PlaybackGate)
- type VADComponent
- func (v *VADComponent) Name() string
- func (v *VADComponent) Process(ctx context.Context, data interface{}) (interface{}, bool, error)
- func (v *VADComponent) Reset()
- func (v *VADComponent) SetBargeInCallback(callback func())
- func (v *VADComponent) SetConsecutiveFrames(frames int)
- func (v *VADComponent) SetEnabled(enabled bool)
- func (v *VADComponent) SetLogger(callback func(string))
- func (v *VADComponent) SetThreshold(threshold float64)
- type VADConfig
Constants ¶
This section is empty.
Variables ¶
var ( ErrEmptyInputStages = errors.New("asr: input stages cannot be empty") ErrNilData = errors.New("asr: data is nil") ErrInvalidDataType = errors.New("asr: invalid data type") ErrPipelineClosed = errors.New("asr: pipeline already closed") ErrNilEngine = errors.New("asr: nil engine") )
Common errors
var SentenceEndings = []rune{'。', '！', '？', '.', '!', '?', '\n'}
SentenceEndings are treated as sentence boundaries.
Functions ¶
func FindLastSentenceEnding ¶
FindLastSentenceEnding returns the byte offset of the last sentence terminator in text, or -1 if text contains none.
func NormalizeForCompare ¶
NormalizeForCompare strips punctuation/whitespace for fuzzy ASR dedup.
func Similarity ¶
Similarity returns normalized Levenshtein similarity in [0, 1].
Types ¶
type DecoderComponent ¶
type DecoderComponent struct {
// contains filtered or unexported fields
}
DecoderComponent decodes compressed audio (e.g., Opus) to PCM. It uses the media/encoder package to handle codec-specific decoding.
func NewDecoderComponent ¶
func NewDecoderComponent(config DecoderConfig) (*DecoderComponent, error)
NewDecoderComponent creates a new decoder component with the given configuration.
func (*DecoderComponent) GetConfig ¶
func (d *DecoderComponent) GetConfig() DecoderConfig
GetConfig returns the current decoder configuration.
func (*DecoderComponent) Name ¶
func (d *DecoderComponent) Name() string
Name returns the component name.
func (*DecoderComponent) Process ¶
func (d *DecoderComponent) Process(ctx context.Context, data interface{}) (interface{}, bool, error)
Process decodes compressed audio data to PCM. It returns (pcmData, shouldContinue, error).
func (*DecoderComponent) SetLogger ¶
func (d *DecoderComponent) SetLogger(callback func(string))
SetLogger sets the logging callback.
type DecoderConfig ¶
type DecoderConfig struct {
// SourceCodec: source codec name (e.g., "opus", "pcmu", "pcma", "g722", "pcm")
SourceCodec string
// SourceSampleRate: source audio sample rate (e.g., 48000 for opus, 16000 for pcmu)
SourceSampleRate int
// SourceChannels: source audio channels (usually 1 or 2)
SourceChannels int
// TargetSampleRate: target PCM sample rate (e.g., 16000)
TargetSampleRate int
// TargetChannels: target PCM channels (usually 1)
TargetChannels int
// FrameDuration: frame duration string (e.g., "20ms", "40ms", "60ms")
FrameDuration string
}
DecoderConfig contains configuration for the decoder component.
func DefaultDecoderConfig ¶
func DefaultDecoderConfig() DecoderConfig
DefaultDecoderConfig returns a default decoder configuration. Assumes OPUS input at 48kHz and outputs PCM at 16kHz mono.
type Denoiser ¶
Denoiser performs optional uplink PCM processing (RNNoise, WebRTC AEC3, hardware AEC, ...). Implementations must not mutate the input slice and may return the input unchanged when they have nothing to do.
type DenoiserComponent ¶
type DenoiserComponent struct {
// contains filtered or unexported fields
}
DenoiserComponent applies a Denoiser in the ASR input chain. Place it after decode/PCM input and before VAD so barge-in still sees processed mic energy.
func NewDenoiserComponent ¶
func NewDenoiserComponent(dn Denoiser) *DenoiserComponent
NewDenoiserComponent creates a denoiser stage. dn must be non-nil.
func (*DenoiserComponent) Name ¶
func (d *DenoiserComponent) Name() string
Name returns the component identifier.
type EchoFilterComponent ¶
type EchoFilterComponent struct {
// contains filtered or unexported fields
}
EchoFilterComponent suppresses uplink audio while downlink TTS is active so ASR does not transcribe the AI's own voice. VAD for barge-in must run *before* this stage so it still sees raw microphone energy.
func NewEchoFilterComponent ¶
func NewEchoFilterComponent(gate *PlaybackGate) *EchoFilterComponent
NewEchoFilterComponent creates an echo suppressor backed by a PlaybackGate.
func (*EchoFilterComponent) Name ¶
func (e *EchoFilterComponent) Name() string
Name returns the component name.
type Engine ¶
type Engine interface {
Start() error
Stop()
SendPCM(pcm []byte, end bool) error
OnResult(callback func(text string, isFinal bool))
OnError(callback func(err error, fatal bool))
}
Engine is the minimal ASR contract the voice pipeline consumes. It abstracts over recognizer.Recognizer and other vendor backends.
type LoggingComponent ¶
type LoggingComponent struct {
// contains filtered or unexported fields
}
LoggingComponent logs data passing through (useful for debugging).
func NewLoggingComponent ¶
func NewLoggingComponent(name string, logger func(string)) *LoggingComponent
NewLoggingComponent creates a logging component.
func (*LoggingComponent) Name ¶
func (l *LoggingComponent) Name() string
Name returns the component name.
type Metrics ¶
type Metrics struct {
FirstPacketTime time.Time // First audio packet time
LastPacketTime time.Time // Last audio packet time
ASRFirstResult time.Time // First ASR result time
ASRLatency time.Duration // ASR latency (from last packet to first result)
TotalAudioBytes int // Total audio bytes (PCM)
AudioDuration time.Duration // Total audio duration
RTF float64 // Real-Time Factor (processing time / audio duration)
// contains filtered or unexported fields
}
Metrics contains ASR pipeline performance metrics.
type PCMCoalesceComponent ¶
type PCMCoalesceComponent struct {
// contains filtered or unexported fields
}
PCMCoalesceComponent buffers small uplink chunks into recognizer-friendly frames. Default target is 20 ms of PCM16 mono at the configured sample rate.
func NewPCMCoalesceComponent ¶
func NewPCMCoalesceComponent(sampleRateHz, minMs int) *PCMCoalesceComponent
NewPCMCoalesceComponent creates a coalescer. sampleRateHz is the PCM rate after decode (e.g. 16000). minMs is the minimum buffer duration (default 20).
func (*PCMCoalesceComponent) Flush ¶
func (c *PCMCoalesceComponent) Flush() []byte
Flush emits any remaining buffered PCM (call on utterance end / session close).
func (*PCMCoalesceComponent) Name ¶
func (c *PCMCoalesceComponent) Name() string
Name returns the component identifier.
type PCMInputComponent ¶
type PCMInputComponent struct{}
PCMInputComponent handles raw PCM audio input.
func (*PCMInputComponent) Name ¶
func (p *PCMInputComponent) Name() string
Name returns the component name.
type PassthroughComponent ¶
type PassthroughComponent struct {
// contains filtered or unexported fields
}
PassthroughComponent passes data through unchanged (useful for testing/logging).
func NewPassthroughComponent ¶
func NewPassthroughComponent(name string) *PassthroughComponent
NewPassthroughComponent creates a passthrough component.
func (*PassthroughComponent) Name ¶
func (p *PassthroughComponent) Name() string
Name returns the component name.
type Pipeline ¶
type Pipeline interface {
// Process processes one audio frame through the entire pipeline.
Process(ctx context.Context, data interface{}) (interface{}, error)
// ProcessOutput runs recognized text through output stages and fires callbacks.
ProcessOutput(ctx context.Context, text string, isFinal bool)
// SetOutputCallback sets the callback for final output.
SetOutputCallback(callback func(text string, isFinal bool))
// SetPCMAudioCallback sets the callback for PCM audio recording.
SetPCMAudioCallback(callback func(data []byte) error)
// SetBargeInCallback sets the callback for barge-in detection.
SetBargeInCallback(callback func())
// SetTTSPlaying sets the TTS playing state.
SetTTSPlaying(playing bool)
// IsTTSPlaying checks if TTS is playing.
IsTTSPlaying() bool
// ClearTTSState clears the TTS state.
ClearTTSState()
// ResetState resets the pipeline state for a new conversation round.
ResetState()
// GetMetrics returns the pipeline performance metrics.
GetMetrics() Metrics
// Close tears down the pipeline and releases resources.
Close() error
}
Pipeline is the main audio processing pipeline.
type PipelineComponent ¶
type PipelineComponent interface {
// Name returns the component's identifier (e.g., "vad", "echo_filter", "asr_input")
Name() string
// Process handles one audio frame and returns the result.
// data: input data (typically []byte for audio or string for text)
// Returns: (output data, shouldContinue, error)
// If shouldContinue is false, the pipeline stops processing this frame.
Process(ctx context.Context, data interface{}) (interface{}, bool, error)
}
PipelineComponent is a single step in the audio processing pipeline. Implementations MUST be safe for concurrent calls.
type PlaybackGate ¶
type PlaybackGate struct {
// contains filtered or unexported fields
}
PlaybackGate tracks downlink TTS activity for echo suppression and barge-in. It treats queued utterances and a configurable post-playback tail as "active" so uplink echo does not leak into ASR right after the speaker goes quiet.
func NewPlaybackGate ¶
func NewPlaybackGate(isPlaying func() bool, queueDepth func() int, tail time.Duration) *PlaybackGate
NewPlaybackGate creates a gate. tail is how long after playback ends uplink remains suppressed (room echo). 0 disables tail extension.
func (*PlaybackGate) IsBargeInWindow ¶
func (g *PlaybackGate) IsBargeInWindow() bool
IsBargeInWindow is true when user interrupt should be considered: streaming, queued, or within the post-playback tail.
func (*PlaybackGate) IsEchoSuppressActive ¶
func (g *PlaybackGate) IsEchoSuppressActive() bool
IsEchoSuppressActive is true when uplink should not be fed to ASR (echo tail included). Slightly longer than barge-in window when tail > 0.
func (*PlaybackGate) IsQueued ¶
func (g *PlaybackGate) IsQueued() bool
IsQueued is true when additional utterances wait on the speak queue.
func (*PlaybackGate) IsStreaming ¶
func (g *PlaybackGate) IsStreaming() bool
IsStreaming is true while audio frames are actively leaving the TTS pipeline.
func (*PlaybackGate) Reset ¶
func (g *PlaybackGate) Reset()
Reset clears tail memory (e.g. on session teardown).
type RecognizerComponent ¶
type RecognizerComponent struct {
// contains filtered or unexported fields
}
RecognizerComponent is the terminal input stage that feeds PCM into an ASR engine. Recognition results are delivered via the callbacks registered with SetOnTranscript / SetOnError before Start; the session wires those into ProcessOutput and events.
func NewRecognizerComponent ¶
func NewRecognizerComponent(engine Engine) (*RecognizerComponent, error)
NewRecognizerComponent creates a recognizer pipeline stage.
func (*RecognizerComponent) Close ¶
func (c *RecognizerComponent) Close() error
Close stops the recognizer engine.
func (*RecognizerComponent) Name ¶
func (c *RecognizerComponent) Name() string
Name returns the component identifier.
func (*RecognizerComponent) Process ¶
func (c *RecognizerComponent) Process(ctx context.Context, data interface{}) (interface{}, bool, error)
Process feeds one PCM frame into the recognizer. Audio does not pass to downstream input stages (this is the terminal uplink stage).
func (*RecognizerComponent) SetOnError ¶
func (c *RecognizerComponent) SetOnError(fn func(err error, fatal bool))
SetOnError registers the error sink.
func (*RecognizerComponent) SetOnTranscript ¶
func (c *RecognizerComponent) SetOnTranscript(fn func(text string, isFinal bool))
SetOnTranscript registers the transcript sink.
func (*RecognizerComponent) Start ¶
func (c *RecognizerComponent) Start() error
Start connects the underlying engine. Safe to call multiple times.
type RecognizerEngine ¶
type RecognizerEngine struct {
// contains filtered or unexported fields
}
RecognizerEngine adapts recognizer.Recognizer to Engine.
func NewRecognizerEngine ¶
func NewRecognizerEngine(r *recognizer.Recognizer) *RecognizerEngine
NewRecognizerEngine wraps a recognizer.Recognizer.
func (*RecognizerEngine) OnError ¶
func (e *RecognizerEngine) OnError(callback func(err error, fatal bool))
func (*RecognizerEngine) OnResult ¶
func (e *RecognizerEngine) OnResult(callback func(text string, isFinal bool))
func (*RecognizerEngine) Start ¶
func (e *RecognizerEngine) Start() error
func (*RecognizerEngine) Stop ¶
func (e *RecognizerEngine) Stop()
type SensitiveFilterComponent ¶
type SensitiveFilterComponent struct {
// contains filtered or unexported fields
}
SensitiveFilterComponent filters sensitive information from recognized text. It supports blacklist/whitelist filtering, emoji filtering, and case-insensitive matching.
func NewSensitiveFilterComponent ¶
func NewSensitiveFilterComponent(config SensitiveFilterConfig) (*SensitiveFilterComponent, error)
NewSensitiveFilterComponent creates a new sensitive filter component with the given configuration.
func (*SensitiveFilterComponent) Name ¶
func (s *SensitiveFilterComponent) Name() string
Name returns the component name.
func (*SensitiveFilterComponent) Process ¶
func (s *SensitiveFilterComponent) Process(ctx context.Context, data interface{}) (interface{}, bool, error)
Process filters sensitive information from text. It returns (filteredText, shouldContinue, error).
func (*SensitiveFilterComponent) SetBlacklist ¶
func (s *SensitiveFilterComponent) SetBlacklist(patterns []string) error
SetBlacklist updates the blacklist patterns.
func (*SensitiveFilterComponent) SetFilterEmoji ¶
func (s *SensitiveFilterComponent) SetFilterEmoji(enabled bool)
SetFilterEmoji enables or disables emoji filtering.
func (*SensitiveFilterComponent) SetReplaceWith ¶
func (s *SensitiveFilterComponent) SetReplaceWith(replacement string)
SetReplaceWith sets the replacement string substituted for filtered content.
func (*SensitiveFilterComponent) SetWhitelist ¶
func (s *SensitiveFilterComponent) SetWhitelist(patterns []string) error
SetWhitelist updates the whitelist patterns.
type SensitiveFilterConfig ¶
type SensitiveFilterConfig struct {
// Blacklist: words/patterns to filter out
Blacklist []string
// Whitelist: words/patterns to allow (takes precedence over blacklist)
Whitelist []string
// FilterEmoji: whether to filter out emoji characters (default: true)
FilterEmoji bool
// CaseSensitive: whether filtering is case-sensitive (default: false)
CaseSensitive bool
// ReplaceWith: character to replace filtered content with (default: "*")
ReplaceWith string
}
SensitiveFilterConfig contains configuration for sensitive filter component.
func DefaultSensitiveFilterConfig ¶
func DefaultSensitiveFilterConfig() SensitiveFilterConfig
DefaultSensitiveFilterConfig returns the default sensitive filter configuration.
type SentenceFilter ¶
type SentenceFilter struct {
// contains filtered or unexported fields
}
SentenceFilter buffers ASR partials and emits complete sentences to reduce LLM thrash. Finals always pass through.
func NewSentenceFilter ¶
func NewSentenceFilter(similarityThreshold float64) *SentenceFilter
NewSentenceFilter returns a filter with the given similarity threshold (0 disables).
func (*SentenceFilter) Reset ¶
func (f *SentenceFilter) Reset()
Reset clears filter state between turns.
type StandardPipeline ¶
type StandardPipeline struct {
// contains filtered or unexported fields
}
StandardPipeline implements Pipeline with a chain of components.
func NewStandardPipeline ¶
func NewStandardPipeline( inputStages []PipelineComponent, outputStages []PipelineComponent, ) (*StandardPipeline, error)
NewStandardPipeline creates a new standard ASR processing pipeline.
func (*StandardPipeline) ClearTTSState ¶
func (p *StandardPipeline) ClearTTSState()
ClearTTSState clears the TTS state.
func (*StandardPipeline) Close ¶
func (p *StandardPipeline) Close() error
Close tears down the pipeline.
func (*StandardPipeline) GetMetrics ¶
func (p *StandardPipeline) GetMetrics() Metrics
GetMetrics returns the pipeline performance metrics.
func (*StandardPipeline) IsTTSPlaying ¶
func (p *StandardPipeline) IsTTSPlaying() bool
IsTTSPlaying checks if TTS is playing.
func (*StandardPipeline) Process ¶
func (p *StandardPipeline) Process(ctx context.Context, data interface{}) (interface{}, error)
Process processes one audio frame through the entire pipeline.
func (*StandardPipeline) ProcessOutput ¶
func (p *StandardPipeline) ProcessOutput(ctx context.Context, text string, isFinal bool)
ProcessOutput runs recognized text through the output stages and fires callbacks.
func (*StandardPipeline) ResetState ¶
func (p *StandardPipeline) ResetState()
ResetState resets the pipeline state for a new conversation round.
func (*StandardPipeline) SetBargeInCallback ¶
func (p *StandardPipeline) SetBargeInCallback(callback func())
SetBargeInCallback sets the callback for barge-in detection on all VAD stages.
func (*StandardPipeline) SetOutputCallback ¶
func (p *StandardPipeline) SetOutputCallback(callback func(text string, isFinal bool))
SetOutputCallback sets the callback for final output.
func (*StandardPipeline) SetPCMAudioCallback ¶
func (p *StandardPipeline) SetPCMAudioCallback(callback func(data []byte) error)
SetPCMAudioCallback sets the callback for PCM audio recording.
func (*StandardPipeline) SetTTSPlaying ¶
func (p *StandardPipeline) SetTTSPlaying(playing bool)
SetTTSPlaying sets the TTS playing state.
func (*StandardPipeline) WirePlaybackGate ¶
func (p *StandardPipeline) WirePlaybackGate(gate *PlaybackGate)
WirePlaybackGate attaches a shared gate to VAD and echo-filter stages.
type VADComponent ¶
type VADComponent struct {
// contains filtered or unexported fields
}
VADComponent performs energy-based barge-in detection during downlink playback. It must sit before EchoFilterComponent so it analyzes raw microphone PCM.
func NewVADComponent ¶
func NewVADComponent(config VADConfig, gate *PlaybackGate) *VADComponent
NewVADComponent creates a VAD stage. gate may be nil (detection disabled).
func (*VADComponent) Process ¶
func (v *VADComponent) Process(ctx context.Context, data interface{}) (interface{}, bool, error)
Process analyzes PCM for barge-in; audio passes through unchanged.
func (*VADComponent) Reset ¶
func (v *VADComponent) Reset()
Reset clears internal state between turns.
func (*VADComponent) SetBargeInCallback ¶
func (v *VADComponent) SetBargeInCallback(callback func())
SetBargeInCallback sets the callback invoked on barge-in (edge-triggered per window).
func (*VADComponent) SetConsecutiveFrames ¶
func (v *VADComponent) SetConsecutiveFrames(frames int)
SetConsecutiveFrames sets frames required before barge-in fires.
func (*VADComponent) SetEnabled ¶
func (v *VADComponent) SetEnabled(enabled bool)
SetEnabled enables or disables VAD.
func (*VADComponent) SetLogger ¶
func (v *VADComponent) SetLogger(callback func(string))
SetLogger sets the logging callback.
func (*VADComponent) SetThreshold ¶
func (v *VADComponent) SetThreshold(threshold float64)
SetThreshold sets the RMS energy threshold.
type VADConfig ¶
type VADConfig struct {
Enabled bool
Threshold float64
ConsecutiveFramesNeeded int
MaxNoiseSamples int
}
VADConfig contains configuration for VAD component.
func DefaultBargeInVADConfig ¶
func DefaultBargeInVADConfig() VADConfig
DefaultBargeInVADConfig returns thresholds calibrated for interrupting TTS when the speaker output is not echo-cancelled.
func DefaultVADConfig ¶
func DefaultVADConfig() VADConfig
DefaultVADConfig returns general-purpose VAD settings (not barge-in tuned).