speech

package

Version: v0.1.13 (latest) | Published: Apr 23, 2026 | License: MIT | Imports: 18 | Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/GizClaw/flowcraft

Links

Open Source Insights

Documentation ¶

Index ¶

Constants
type AudioSink
type AudioSource
type ClientAECMode
type DeviceType
type ErrorCode
- func ClassifyError(err error) ErrorCode
type Event
type EventType
type InterruptReason
type Pipeline
- func NewPipeline(s stt.STT, t tts.TTS, rt workflow.Runtime, agent workflow.Agent, ...) *Pipeline
- func (p *Pipeline) Abort() bool
- func (p *Pipeline) RunAudio(ctx context.Context, input audio.Frame) (audio.Stream[Event], error)
- func (p *Pipeline) RunAudioStream(ctx context.Context, input audio.Stream[audio.Frame]) (audio.Stream[Event], error)
- func (p *Pipeline) RunText(ctx context.Context, text string) (audio.Stream[Event], error)
type PipelineOption
- func WithContextID(id string) PipelineOption
- func WithDynamicTTSOptions(fn func() []tts.TTSOption) PipelineOption
- func WithSTTOptions(opts ...stt.STTOption) PipelineOption
- func WithSegmenterOptions(opts ...tts.SegmenterOption) PipelineOption
- func WithTTSOptions(opts ...tts.TTSOption) PipelineOption
- func WithTimeouts(timeouts PipelineTimeouts) PipelineOption
- func WithTurnHistory(msgs []model.Message) PipelineOption
type PipelineTimeouts
type PlaybackMode
type PlaybackReferenceProvider
type Session
- func NewSession(p *Pipeline, src AudioSource, sink AudioSink, opts ...SessionOption) *Session
- func (s *Session) Capabilities() SessionCapabilities
- func (s *Session) CommitInput() bool
- func (s *Session) Run(ctx context.Context) error
- func (s *Session) Send(text string) bool
- func (s *Session) SessionID() string
- func (s *Session) StopSpeaking() bool
- func (s *Session) VoiceProfile() (VoiceProfile, bool)
type SessionCapabilities
type SessionOption
- func WithBargeInConfirm(n int) SessionOption
- func WithCapabilities(capabilities SessionCapabilities) SessionOption
- func WithDetector(d detect.SpeechDetector) SessionOption
- func WithEndpointDecider(decider endpoint.Decider) SessionOption
- func WithEventHandler(fn func(Event)) SessionOption
- func WithFrameSize(d time.Duration) SessionOption
- func WithMetricsHook(hook speechmetrics.Hook) SessionOption
- func WithPlaybackDrainTimeout(d time.Duration) SessionOption
- func WithPreprocessor(processor preprocess.Processor) SessionOption
- func WithPreprocessors(processors ...preprocess.Processor) SessionOption
- func WithSilenceDuration(d time.Duration) SessionOption
- func WithVoiceProfile(profile VoiceProfile) SessionOption
type SessionState
type VoiceProfile
- func CommandAssistantVoiceProfile() VoiceProfile
- func CompanionVoiceProfile() VoiceProfile
- func CustomerServiceVoiceProfile() VoiceProfile
- func (p VoiceProfile) TTSOptions() []tts.TTSOption
type VoiceProfileScene

Constants ¶

View Source

const (
	ExtraKeyLanguage = "speech.language"
	ExtraKeyEmotion  = "speech.emotion"
	ExtraKeyVolume   = "speech.volume"
	ExtraKeyScene    = "speech.scene"
)

Variables ¶

This section is empty.

Functions ¶

This section is empty.

Types ¶

type AudioSink ¶

type AudioSink interface {
	// Play starts playing utterances from the stream asynchronously.
	// It returns a channel that is closed when playback finishes (drained or aborted).
	//
	// Stream termination semantics:
	//   - io.EOF means the turn ended normally — drain the hardware buffer before signalling done.
	//   - Any other error means interruption — discard buffered audio and signal done immediately.
	Play(stream audio.Stream[tts.Utterance]) <-chan struct{}
}

AudioSink is an abstraction for audio output (speaker, WebSocket, file, etc.).

type AudioSource ¶

type AudioSource interface {
	Stream() audio.Stream[audio.Frame]
}

AudioSource is an abstraction for audio input (microphone, WebSocket, file, etc.).

Implementation contract: when the context that governs the source is cancelled (for example, a context passed to an implementation-specific Start method; the AudioSource interface itself declares only Stream), Stream().Read() must return an error (typically via Pipe.Interrupt). This is required for Session.Run to exit promptly on Ctrl+C. Implementations that do not honour this contract will cause Session.Run to block indefinitely in Read on the source's stream.

type ClientAECMode ¶

type ClientAECMode string

const (
	ClientAECUnknown  ClientAECMode = "unknown"
	ClientAECHardware ClientAECMode = "hardware"
	ClientAECOS       ClientAECMode = "os"
	ClientAECBrowser  ClientAECMode = "browser"
	ClientAECDisabled ClientAECMode = "disabled"
)

type DeviceType ¶

type DeviceType string

const (
	DeviceTypeUnknown DeviceType = "unknown"
	DeviceTypeDesktop DeviceType = "desktop"
	DeviceTypeBrowser DeviceType = "browser"
	DeviceTypeMobile  DeviceType = "mobile"
)

type ErrorCode ¶

type ErrorCode string

const (
	ErrorCodeUnknown             ErrorCode = "unknown"
	ErrorCodeTimeout             ErrorCode = "timeout"
	ErrorCodeProviderUnavailable ErrorCode = "provider_unavailable"
	ErrorCodeBadAudio            ErrorCode = "bad_audio"
	ErrorCodeTransport           ErrorCode = "transport_error"
	ErrorCodeInterrupted         ErrorCode = "interrupted"
	ErrorCodeInternal            ErrorCode = "internal_error"
)

func ClassifyError ¶

func ClassifyError(err error) ErrorCode

type Event ¶

type Event struct {
	Type               EventType
	Text               string
	Audio              audio.Frame
	Lang               string
	Confidence         float64
	Duration           time.Duration
	Words              []stt.WordTiming
	TranscriptRevision int
	Data               map[string]any
	RunID              string
	TurnID             string
	SessionID          string
	ErrorCode          ErrorCode
	InterruptReason    InterruptReason
}

Event is emitted by the voice pipeline.

type EventType ¶

type EventType string

EventType identifies a voice pipeline event.

const (
	EventTurnStarted        EventType = "voice.turn.started"
	EventTranscriptRevision EventType = "voice.transcript.revision"
	EventTranscriptPartial  EventType = "voice.transcript.partial"
	EventTranscriptFinal    EventType = "voice.transcript.final"
	EventTextDelta          EventType = "voice.text.delta"
	EventAudio              EventType = "voice.audio"
	EventResponseDone       EventType = "voice.response.done"
	EventAudioDone          EventType = "voice.audio.done"
	EventPlayStarted        EventType = "voice.play.started"
	EventToolCall           EventType = "voice.tool.call"
	EventToolResult         EventType = "voice.tool.result"
	EventTurnInterrupted    EventType = "voice.turn.interrupted"
	EventTurnDone           EventType = "voice.turn.done"
	EventDone               EventType = "voice.done"
	EventPlayDone           EventType = "voice.play.done"
	EventError              EventType = "voice.error"
)

type InterruptReason ¶

type InterruptReason string

const (
	InterruptReasonUnknown         InterruptReason = ""
	InterruptReasonUserBargeIn     InterruptReason = "user_barge_in"
	InterruptReasonManualInterrupt InterruptReason = "manual_interrupt"
	InterruptReasonTextInterrupt   InterruptReason = "text_interrupt"
)

type Pipeline ¶

type Pipeline struct {
	// contains filtered or unexported fields
}

Pipeline orchestrates STT → Runtime.Run → TTS into a stream of voice events using a linear pipeline architecture.

func NewPipeline ¶

func NewPipeline(s stt.STT, t tts.TTS, rt workflow.Runtime, agent workflow.Agent, opts ...PipelineOption) *Pipeline

NewPipeline creates a new voice pipeline. The STT provider argument (s) may be nil if only text input (RunText) is used.

func (*Pipeline) Abort ¶

func (p *Pipeline) Abort() bool

Abort cancels the currently running execution via context cancellation.

func (*Pipeline) RunAudio ¶

func (p *Pipeline) RunAudio(ctx context.Context, input audio.Frame) (audio.Stream[Event], error)

RunAudio processes a complete audio input in one shot. Requires a non-nil STT provider.

func (*Pipeline) RunAudioStream ¶

func (p *Pipeline) RunAudioStream(ctx context.Context, input audio.Stream[audio.Frame]) (audio.Stream[Event], error)

RunAudioStream processes streaming audio input. Requires a non-nil STT provider.

func (*Pipeline) RunText ¶

func (p *Pipeline) RunText(ctx context.Context, text string) (audio.Stream[Event], error)

RunText processes text input directly, skipping STT. Produces the same Event stream as RunAudio/RunAudioStream.

type PipelineOption ¶

type PipelineOption func(*Pipeline)

PipelineOption configures a Pipeline.

func WithContextID ¶

func WithContextID(id string) PipelineOption

WithContextID sets the memory context identifier passed to Runtime.Run. When the Runtime is configured with a MemoryFactory, this enables automatic history load/save across turns.

func WithDynamicTTSOptions ¶

func WithDynamicTTSOptions(fn func() []tts.TTSOption) PipelineOption

WithDynamicTTSOptions registers a callback that is invoked at the start of each turn to obtain the current TTS options. The returned options are appended after any static options set via WithTTSOptions, so they can override values like Speed, Pitch, Emotion, etc.

func WithSTTOptions ¶

func WithSTTOptions(opts ...stt.STTOption) PipelineOption

func WithSegmenterOptions ¶

func WithSegmenterOptions(opts ...tts.SegmenterOption) PipelineOption

func WithTTSOptions ¶

func WithTTSOptions(opts ...tts.TTSOption) PipelineOption

func WithTimeouts ¶

func WithTimeouts(timeouts PipelineTimeouts) PipelineOption

func WithTurnHistory ¶

func WithTurnHistory(msgs []model.Message) PipelineOption

WithTurnHistory sets message history for the current turn. Only effective when the Runtime has no MemoryFactory configured; when a MemoryFactory is present, use WithContextID instead.

type PipelineTimeouts ¶

type PipelineTimeouts struct {
	STTFirstPartial  time.Duration
	STTFinal         time.Duration
	RunnerFirstToken time.Duration
	TTSFirstAudio    time.Duration
}

type PlaybackMode ¶

type PlaybackMode string

const (
	PlaybackModeUnknown PlaybackMode = "unknown"
	PlaybackModeSpeaker PlaybackMode = "speaker"
	PlaybackModeHeadset PlaybackMode = "headset"
)

type PlaybackReferenceProvider ¶

type PlaybackReferenceProvider interface {
	PlaybackReference() audio.Stream[audio.Frame]
}

PlaybackReferenceProvider is an optional interface for sinks that can expose the audio they are currently playing as a reference stream for echo suppression or AEC.

type Session ¶

type Session struct {
	// contains filtered or unexported fields
}

Session implements a voice conversation state machine.

Audio:  IDLE → HEARING → RESPONDING → PLAYBACK → IDLE
Text:   IDLE → RESPONDING → PLAYBACK → IDLE

Both audio barge-in and text input (Send) can interrupt any state.

func NewSession ¶

func NewSession(p *Pipeline, src AudioSource, sink AudioSink, opts ...SessionOption) *Session

NewSession creates a new voice session. The src parameter may be nil for text-only sessions (use Send to inject text).

func (*Session) Capabilities ¶

func (s *Session) Capabilities() SessionCapabilities

func (*Session) CommitInput ¶

func (s *Session) CommitInput() bool

CommitInput explicitly commits the current audio input turn. It is a no-op when the session is not currently hearing audio.

func (*Session) Run ¶

func (s *Session) Run(ctx context.Context) error

Run starts the session loop. It blocks until ctx is cancelled or the audio source is exhausted. For text-only sessions (src == nil), Run blocks until ctx is cancelled.

func (*Session) Send ¶

func (s *Session) Send(text string) bool

Send injects a text message into the session, skipping STT. If the session is currently responding or playing, it triggers a barge-in. Returns false if the internal buffer is full (message dropped). Safe to call from any goroutine.

func (*Session) SessionID ¶

func (s *Session) SessionID() string

func (*Session) StopSpeaking ¶

func (s *Session) StopSpeaking() bool

StopSpeaking signals that the current speaking phase should stop. During hearing it behaves like CommitInput; during responding/playback it acts as a turn interruption hint.

func (*Session) VoiceProfile ¶

func (s *Session) VoiceProfile() (VoiceProfile, bool)

type SessionCapabilities ¶

type SessionCapabilities struct {
	ClientAEC    ClientAECMode
	ClientNS     bool
	ClientAGC    bool
	DeviceType   DeviceType
	PlaybackMode PlaybackMode
}

SessionCapabilities describes client-side media processing and device hints.

type SessionOption ¶

type SessionOption func(*sessionConfig)

SessionOption configures a Session.

func WithBargeInConfirm ¶

func WithBargeInConfirm(n int) SessionOption

func WithCapabilities ¶

func WithCapabilities(capabilities SessionCapabilities) SessionOption

func WithDetector ¶

func WithDetector(d detect.SpeechDetector) SessionOption

WithDetector sets a custom SpeechDetector. If not set, a default EnergyDetector is used.

func WithEndpointDecider ¶

func WithEndpointDecider(decider endpoint.Decider) SessionOption

func WithEventHandler ¶

func WithEventHandler(fn func(Event)) SessionOption

func WithFrameSize ¶

func WithFrameSize(d time.Duration) SessionOption

func WithMetricsHook ¶

func WithMetricsHook(hook speechmetrics.Hook) SessionOption

func WithPlaybackDrainTimeout ¶

func WithPlaybackDrainTimeout(d time.Duration) SessionOption

func WithPreprocessor ¶

func WithPreprocessor(processor preprocess.Processor) SessionOption

func WithPreprocessors ¶

func WithPreprocessors(processors ...preprocess.Processor) SessionOption

func WithSilenceDuration ¶

func WithSilenceDuration(d time.Duration) SessionOption

func WithVoiceProfile ¶

func WithVoiceProfile(profile VoiceProfile) SessionOption

type SessionState ¶

type SessionState int

SessionState represents the current state of the voice session.

const (
	StateIdle SessionState = iota
	StateHearing
	StateResponding
	StatePlayback
)

type VoiceProfile ¶

type VoiceProfile struct {
	Language string
	Voice    string
	Speed    float64
	Emotion  string
	Volume   float64
	Codec    audio.Codec
	Rate     int
	Scene    VoiceProfileScene
}

func CommandAssistantVoiceProfile ¶

func CommandAssistantVoiceProfile() VoiceProfile

func CompanionVoiceProfile ¶

func CompanionVoiceProfile() VoiceProfile

func CustomerServiceVoiceProfile ¶

func CustomerServiceVoiceProfile() VoiceProfile

func (VoiceProfile) TTSOptions ¶

func (p VoiceProfile) TTSOptions() []tts.TTSOption

type VoiceProfileScene ¶

type VoiceProfileScene string

const (
	VoiceProfileSceneCustomerService  VoiceProfileScene = "customer_service"
	VoiceProfileSceneCompanion        VoiceProfileScene = "companion"
	VoiceProfileSceneCommandAssistant VoiceProfileScene = "command_assistant"
)

Source Files ¶

View all Source files

Directories ¶

Path	Synopsis
audio
detect
endpoint
metrics
preprocess
provider
stt
tts
vad
webrtc

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL