voice

package
v0.0.0-...-8acab51 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Apr 26, 2026 License: MIT Imports: 31 Imported by: 0

Documentation

Overview

Package voice provides voice assistant functionality. It combines speech-to-text (STT) and text-to-speech (TTS) services with WebSocket streaming for real-time voice interaction.

Index

Constants

View Source
const (
	// NOTE(review): these values are presumably carried in
	// WebSocketMessage.Type — confirm against the WebSocket handler.

	// MsgTypeAudio is the type of a message that carries audio data.
	MsgTypeAudio = "audio"
	// MsgTypeTranscript is the type of a message that carries a transcription result.
	MsgTypeTranscript = "transcript"
	// MsgTypeResponse is the type of a message that carries an assistant response.
	MsgTypeResponse = "response"
	// MsgTypeAudioResponse is the type of a message that carries an audio response.
	MsgTypeAudioResponse = "audio_response"
	// MsgTypeStateChange is the type of a message that reports a state change.
	MsgTypeStateChange = "state_change"
	// MsgTypeError is the type of a message that reports an error.
	MsgTypeError = "error"
	// MsgTypeConfig is the type of a message that carries configuration.
	MsgTypeConfig = "config"
	// MsgTypeWakeWord is the type of a message that reports a detected wake word.
	MsgTypeWakeWord = "wake_word"
	// MsgTypePing is the type of a ping message.
	MsgTypePing = "ping"
	// MsgTypePong is the type of a pong message.
	MsgTypePong = "pong"
)

WebSocket message types

Variables

View Source
var (
	// Sentinel errors for the voice package. Match them with errors.Is
	// rather than by comparing message text.

	// ErrSessionNotFound is returned when a session is not found.
	ErrSessionNotFound = errors.New("voice session not found")
	// ErrSessionExpired is returned when a session has expired.
	ErrSessionExpired = errors.New("voice session expired")
	// ErrInvalidAudio is returned when the supplied audio data is invalid.
	ErrInvalidAudio = errors.New("invalid audio data")
	// ErrProcessingFailed is returned when voice processing fails.
	ErrProcessingFailed = errors.New("voice processing failed")
)

Functions

This section is empty.

Types

type ChatFunc

type ChatFunc func(ctx context.Context, req llm.ChatRequest) (*llm.ChatResponse, error)

ChatFunc is a function that sends a chat request and returns a response.

type Handler

type Handler struct {
	// contains filtered or unexported fields
}

Handler handles voice HTTP requests.

func NewHandler

func NewHandler(service Service) *Handler

NewHandler creates a new voice handler.

func (*Handler) CloseSession

func (h *Handler) CloseSession(c echo.Context) error

CloseSession closes a voice session.

func (*Handler) CreateSession

func (h *Handler) CreateSession(c echo.Context) error

CreateSession creates a new voice session.

func (*Handler) GetSession

func (h *Handler) GetSession(c echo.Context) error

GetSession gets a voice session.

func (*Handler) ListVoices

func (h *Handler) ListVoices(c echo.Context) error

ListVoices returns available TTS voices.

func (*Handler) RegisterRoutes

func (h *Handler) RegisterRoutes(g *echo.Group)

RegisterRoutes registers the voice routes.

func (*Handler) Service

func (h *Handler) Service() Service

Service returns the voice service.

func (*Handler) StopSpeaking

func (h *Handler) StopSpeaking(c echo.Context) error

StopSpeaking stops any currently running local speech.

func (*Handler) Synthesize

func (h *Handler) Synthesize(c echo.Context) error

Synthesize handles text-to-speech requests.

func (*Handler) SynthesizeStream

func (h *Handler) SynthesizeStream(c echo.Context) error

SynthesizeStream handles streaming text-to-speech requests via Server-Sent Events (SSE). It uses true chunk-by-chunk streaming: the audio for each sentence is sent as soon as it is ready.

func (*Handler) Transcribe

func (h *Handler) Transcribe(c echo.Context) error

Transcribe handles audio transcription requests.

type Service

type Service interface {
	// CreateSession creates a new voice session for userID using config.
	CreateSession(ctx context.Context, userID string, config *VoiceConfig) (*Session, error)
	// GetSession returns the session identified by sessionID.
	GetSession(ctx context.Context, sessionID string) (*Session, error)
	// UpdateSessionState sets the state of the session identified by sessionID.
	UpdateSessionState(ctx context.Context, sessionID string, state SessionState) error
	// CloseSession closes the session identified by sessionID.
	CloseSession(ctx context.Context, sessionID string) error
	// Transcribe transcribes audio to text. req describes the audio
	// (session, language, format); audio holds the raw bytes.
	Transcribe(ctx context.Context, req *TranscribeRequest, audio []byte) (*TranscribeResponse, error)
	// Synthesize synthesizes text to speech and returns the audio bytes.
	// NOTE(review): the string result is presumably the audio content type,
	// mirroring the SynthesizeStream callback — confirm.
	Synthesize(ctx context.Context, req *SynthesizeRequest) ([]byte, string, error)
	// SynthesizeStream synthesizes text and invokes callback with each audio
	// chunk and its content type.
	// This enables true streaming: the first chunk arrives while later chunks are still being synthesized.
	SynthesizeStream(ctx context.Context, req *SynthesizeRequest, callback func(audio []byte, contentType string) error) error
	// SpeakLocally plays text through local audio output (blocking until done).
	// Returns false if the current provider doesn't support local playback.
	SpeakLocally(ctx context.Context, text string) (bool, error)
	// StopSpeaking stops any currently running local speech.
	StopSpeaking()
	// SetSTTService replaces the STT (speech-to-text) service at runtime.
	SetSTTService(svc stt.Service)
	// SetChatFunc sets the LLM chat function used by ProcessVoiceInput.
	SetChatFunc(fn ChatFunc)
	// ProcessVoiceInput processes the given text for the session identified
	// by sessionID and returns a response.
	ProcessVoiceInput(ctx context.Context, sessionID string, text string) (string, error)
}

Service defines the voice service interface.

func NewService

func NewService(cfg *ServiceConfig) Service

NewService creates a new voice service.

type ServiceConfig

type ServiceConfig struct {
	// STTService is the speech-to-text service the voice service uses.
	STTService stt.Service
	// TTSService is the text-to-speech service the voice service uses.
	TTSService tts.Service
}

ServiceConfig holds the configuration for the voice service.

type Session

type Session struct {
	// ID is the unique identifier of the session.
	ID string `json:"id"`
	// UserID identifies the user who owns the session.
	UserID string `json:"user_id"`
	// State is the current session state (one of the State* constants).
	State SessionState `json:"state"`
	// Language is the language used by the session.
	Language string `json:"language"`
	// Voice is the ID of the TTS voice used by the session.
	Voice string `json:"voice"`
	// CreatedAt is the time at which the session was created.
	CreatedAt time.Time `json:"created_at"`
	// LastActivity is the time of the most recent activity on the session.
	LastActivity time.Time `json:"last_activity"`
	// ConversationID is the ID of the linked conversation.
	// It is omitted from the JSON encoding when empty.
	ConversationID string `json:"conversation_id,omitempty"`
}

Session represents a voice session.

type SessionState

type SessionState string

SessionState represents the state of a voice session.

const (
	// StateIdle means the session is idle.
	StateIdle SessionState = "idle"
	// StateListening means the session is listening for audio.
	StateListening SessionState = "listening"
	// StateProcessing means the session is processing audio.
	StateProcessing SessionState = "processing"
	// StateSpeaking means the session is playing an audio response.
	StateSpeaking SessionState = "speaking"
)

type SynthesizeRequest

type SynthesizeRequest struct {
	// Text is the text to synthesize into speech.
	Text string `json:"text"`
	// Format is the requested output format.
	// It is omitted from the JSON encoding when empty.
	Format string `json:"format,omitempty"`
}

SynthesizeRequest represents a synthesis request.

type TranscribeRequest

type TranscribeRequest struct {
	// SessionID is the identifier of the associated session.
	// It is omitted from the JSON encoding when empty.
	SessionID string `json:"session_id,omitempty"`
	// Language is the language code of the audio.
	// It is omitted from the JSON encoding when empty.
	Language string `json:"language,omitempty"`
	// Format is the format of the audio being transcribed.
	// Unlike the other fields, it is always present in the JSON encoding.
	Format string `json:"format"`
}

TranscribeRequest represents a transcription request.

type TranscribeResponse

type TranscribeResponse struct {
	// Text is the transcribed text.
	Text string `json:"text"`
	// Language is the detected language.
	// It is omitted from the JSON encoding when empty.
	Language string `json:"language,omitempty"`
	// Duration is the audio duration; omitted from the JSON encoding when zero.
	// NOTE(review): presumably in seconds, as for VoiceMessage.Duration — confirm.
	Duration float64 `json:"duration,omitempty"`
	// Confidence is the confidence score of the transcription; omitted from
	// the JSON encoding when zero.
	Confidence float64 `json:"confidence,omitempty"`
}

TranscribeResponse represents a transcription response.

type VoiceConfig

type VoiceConfig struct {
	// Language is the default language.
	Language string `json:"language"`
	// Voice is the default TTS voice.
	Voice string `json:"voice"`
	// WakeWord is the optional wake word.
	// It is omitted from the JSON encoding when empty.
	WakeWord string `json:"wake_word,omitempty"`
	// WakeWordEnabled reports whether wake word detection is enabled.
	WakeWordEnabled bool `json:"wake_word_enabled"`
	// ContinuousListening reports whether continuous listening is enabled.
	ContinuousListening bool `json:"continuous_listening"`
	// AutoPlayResponse reports whether responses should be played automatically.
	AutoPlayResponse bool `json:"auto_play_response"`
}

VoiceConfig represents voice configuration.

type VoiceMessage

type VoiceMessage struct {
	// ID is the unique identifier of the message.
	ID string `json:"id"`
	// SessionID is the identifier of the session the message belongs to.
	SessionID string `json:"session_id"`
	// Role is the message role (user or assistant).
	Role string `json:"role"`
	// Text is the transcribed or generated text.
	Text string `json:"text"`
	// AudioURL is the URL of the audio file, if one was stored.
	// It is omitted from the JSON encoding when empty.
	AudioURL string `json:"audio_url,omitempty"`
	// Duration is the audio duration in seconds.
	// It is omitted from the JSON encoding when zero.
	Duration float64 `json:"duration,omitempty"`
	// Timestamp is the time at which the message was created.
	Timestamp time.Time `json:"timestamp"`
}

VoiceMessage represents a voice message in the conversation.

type WSHandler

type WSHandler struct {
	// contains filtered or unexported fields
}

WSHandler handles WebSocket connections for voice streaming.

func NewWSHandler

func NewWSHandler(service Service) *WSHandler

NewWSHandler creates a new WebSocket handler.

func (*WSHandler) Close

func (h *WSHandler) Close()

Close closes all active voice WebSocket connections gracefully.

func (*WSHandler) HandleStream

func (h *WSHandler) HandleStream(c echo.Context) error

HandleStream handles WebSocket connections for voice streaming.

func (*WSHandler) RegisterRoutes

func (h *WSHandler) RegisterRoutes(g *echo.Group)

RegisterRoutes registers the WebSocket routes.

type WebSocketMessage

type WebSocketMessage struct {
	// Type is the message type.
	// NOTE(review): presumably one of the MsgType* constants — confirm.
	Type string `json:"type"`
	// Data is the message payload; its concrete type is not fixed by this
	// struct. It is omitted from the JSON encoding when nil.
	Data interface{} `json:"data,omitempty"`
	// Error is the error message, if any.
	// It is omitted from the JSON encoding when empty.
	Error string `json:"error,omitempty"`
}

WebSocketMessage represents a WebSocket message.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL