Documentation
¶
Overview ¶
包 multimodal 提供多模态输入处理能力。
该包包含 AgentFlow 在 `llm/multimodal` 目录下的核心实现。
Index ¶
- type AudioConfig
- type AudioFormat
- type Capability
- type Content
- func LoadAudioFromFile(path string) (Content, error)
- func LoadImageFromFile(path string) (Content, error)
- func LoadImageFromURL(url string) (Content, error)
- func NewAudioBase64Content(data string, format AudioFormat) Content
- func NewAudioURLContent(url string) Content
- func NewImageBase64Content(data string, format ImageFormat) Content
- func NewImageURLContent(url string) Content
- func NewTextContent(text string) Content
- type ContentType
- type ImageDimensions
- type ImageFormat
- type MultimodalMessage
- type MultimodalProvider
- func (m *MultimodalProvider) Completion(ctx context.Context, req *MultimodalRequest) (*llm.ChatResponse, error)
- func (m *MultimodalProvider) Name() string
- func (m *MultimodalProvider) Stream(ctx context.Context, req *MultimodalRequest) (<-chan llm.StreamChunk, error)
- func (m *MultimodalProvider) SupportedModalities() []ContentType
- func (m *MultimodalProvider) SupportsMultimodal() bool
- type MultimodalRequest
- type Processor
- type ResolutionPreset
- type Router
- func (r *Router) Embed(ctx context.Context, req *embedding.EmbeddingRequest, providerName string) (*embedding.EmbeddingResponse, error)
- func (r *Router) Embedding(name string) (embedding.Provider, error)
- func (r *Router) Generate3D(ctx context.Context, req *threed.GenerateRequest, providerName string) (*threed.GenerateResponse, error)
- func (r *Router) GenerateImage(ctx context.Context, req *image.GenerateRequest, providerName string) (*image.GenerateResponse, error)
- func (r *Router) GenerateMusic(ctx context.Context, req *music.GenerateRequest, providerName string) (*music.GenerateResponse, error)
- func (r *Router) GenerateVideo(ctx context.Context, req *video.GenerateRequest, providerName string) (*video.GenerateResponse, error)
- func (r *Router) HasCapability(cap Capability) bool
- func (r *Router) Image(name string) (image.Provider, error)
- func (r *Router) ListProviders() map[Capability][]string
- func (r *Router) Moderate(ctx context.Context, req *moderation.ModerationRequest, providerName string) (*moderation.ModerationResponse, error)
- func (r *Router) Moderation(name string) (moderation.ModerationProvider, error)
- func (r *Router) Music(name string) (music.MusicProvider, error)
- func (r *Router) RegisterEmbedding(name string, provider embedding.Provider, isDefault bool)
- func (r *Router) RegisterImage(name string, provider image.Provider, isDefault bool)
- func (r *Router) RegisterModeration(name string, provider moderation.ModerationProvider, isDefault bool)
- func (r *Router) RegisterMusic(name string, provider music.MusicProvider, isDefault bool)
- func (r *Router) RegisterRerank(name string, provider rerank.Provider, isDefault bool)
- func (r *Router) RegisterSTT(name string, provider speech.STTProvider, isDefault bool)
- func (r *Router) RegisterTTS(name string, provider speech.TTSProvider, isDefault bool)
- func (r *Router) RegisterThreeD(name string, provider threed.ThreeDProvider, isDefault bool)
- func (r *Router) RegisterVideo(name string, provider video.Provider, isDefault bool)
- func (r *Router) Rerank(name string) (rerank.Provider, error)
- func (r *Router) RerankDocs(ctx context.Context, req *rerank.RerankRequest, providerName string) (*rerank.RerankResponse, error)
- func (r *Router) STT(name string) (speech.STTProvider, error)
- func (r *Router) Synthesize(ctx context.Context, req *speech.TTSRequest, providerName string) (*speech.TTSResponse, error)
- func (r *Router) TTS(name string) (speech.TTSProvider, error)
- func (r *Router) ThreeD(name string) (threed.ThreeDProvider, error)
- func (r *Router) Transcribe(ctx context.Context, req *speech.STTRequest, providerName string) (*speech.STTResponse, error)
- func (r *Router) Video(name string) (video.Provider, error)
- type VisionConfig
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type AudioConfig ¶
type AudioConfig struct {
MaxDuration float64 `json:"max_duration"` // Max duration in seconds
MaxFileSize int64 `json:"max_file_size"` // Max file size in bytes
SampleRate int `json:"sample_rate"` // Target sample rate
AllowedFormats []AudioFormat `json:"allowed_formats"`
}
AudioConfig 配置音频处理.
type AudioFormat ¶
type AudioFormat string
AudioFormat 表示支持的音频格式.
const ( AudioFormatMP3 AudioFormat = "mp3" AudioFormatWAV AudioFormat = "wav" AudioFormatOGG AudioFormat = "ogg" AudioFormatFLAC AudioFormat = "flac" AudioFormatM4A AudioFormat = "m4a" )
type Capability ¶
type Capability string
Capability 表示一种 AI 能力.
const ( CapabilityEmbedding Capability = "embedding" CapabilityRerank Capability = "rerank" CapabilityTTS Capability = "tts" CapabilitySTT Capability = "stt" CapabilityImage Capability = "image" CapabilityVideo Capability = "video" CapabilityMusic Capability = "music" CapabilityThreeD Capability = "3d" CapabilityModeration Capability = "moderation" )
type Content ¶
type Content struct {
Type ContentType `json:"type"`
Text string `json:"text,omitempty"`
ImageURL string `json:"image_url,omitempty"`
AudioURL string `json:"audio_url,omitempty"`
VideoURL string `json:"video_url,omitempty"`
// Base64 编码数据(可作为 URL 的替代方式)
Data string `json:"data,omitempty"`
MediaType string `json:"media_type,omitempty"` // e.g., "image/png", "audio/mp3"
// 元数据
FileName string `json:"file_name,omitempty"`
FileSize int64 `json:"file_size,omitempty"`
Duration float64 `json:"duration,omitempty"` // For audio/video in seconds
Dimensions *ImageDimensions `json:"dimensions,omitempty"`
Metadata map[string]string `json:"metadata,omitempty"`
}
Content 表示一个多模态内容项.
func LoadAudioFromFile ¶
LoadAudioFromFile 加载音频文件.
func LoadImageFromFile ¶
LoadImageFromFile 从文件路径加载图像.
func LoadImageFromURL ¶
LoadImageFromURL 从 URL 加载图像.
func NewAudioBase64Content ¶
func NewAudioBase64Content(data string, format AudioFormat) Content
NewAudioBase64Content 从 Base64 数据创建音频内容.
func NewAudioURLContent ¶
NewAudioURLContent 从 URL 创建音频内容.
func NewImageBase64Content ¶
func NewImageBase64Content(data string, format ImageFormat) Content
NewImageBase64Content 从 Base64 数据创建图像内容.
func NewImageURLContent ¶
NewImageURLContent 从 URL 创建图像内容.
type ContentType ¶
type ContentType string
ContentType 表示多模态内容的类型.
const ( ContentTypeText ContentType = "text" ContentTypeImage ContentType = "image" ContentTypeAudio ContentType = "audio" ContentTypeVideo ContentType = "video" ContentTypeDocument ContentType = "document" )
type ImageDimensions ¶
ImageDimensions 表示图像尺寸.
type ImageFormat ¶
type ImageFormat string
ImageFormat 表示支持的图像格式.
const ( ImageFormatPNG ImageFormat = "png" ImageFormatJPEG ImageFormat = "jpeg" ImageFormatGIF ImageFormat = "gif" ImageFormatWebP ImageFormat = "webp" )
type MultimodalMessage ¶
MultimodalMessage 表示包含多种内容类型的消息.
type MultimodalProvider ¶
type MultimodalProvider struct {
// contains filtered or unexported fields
}
MultimodalProvider 包装一个提供者, 为其增加多模态支持.
func NewMultimodalProvider ¶
func NewMultimodalProvider(provider llm.Provider, processor *Processor) *MultimodalProvider
NewMultimodalProvider 创建支持多模态的提供者包装.
func (*MultimodalProvider) Completion ¶
func (m *MultimodalProvider) Completion(ctx context.Context, req *MultimodalRequest) (*llm.ChatResponse, error)
Completion 发送多模态补全请求.
func (*MultimodalProvider) Stream ¶
func (m *MultimodalProvider) Stream(ctx context.Context, req *MultimodalRequest) (<-chan llm.StreamChunk, error)
Stream 发送多模态流请求.
func (*MultimodalProvider) SupportedModalities ¶
func (m *MultimodalProvider) SupportedModalities() []ContentType
SupportedModalities 返回提供者支持的模态.
func (*MultimodalProvider) SupportsMultimodal ¶
func (m *MultimodalProvider) SupportsMultimodal() bool
SupportsMultimodal 检查提供者是否支持多模态输入.
type MultimodalRequest ¶
type MultimodalRequest struct {
llm.ChatRequest
MultimodalMessages []MultimodalMessage `json:"multimodal_messages,omitempty"`
}
MultimodalRequest 以多模态内容扩展聊天请求.
type Processor ¶
type Processor struct {
// contains filtered or unexported fields
}
Processor 处理不同提供者的多模态内容转换.
func NewProcessor ¶
func NewProcessor(visionCfg VisionConfig, audioCfg AudioConfig) *Processor
NewProcessor 创建新的多模态处理器.
func (*Processor) ConvertToProviderFormat ¶
func (p *Processor) ConvertToProviderFormat(provider string, messages []MultimodalMessage) ([]llm.Message, error)
ConvertToProviderFormat 将多模态消息转换为提供者专用格式.
type ResolutionPreset ¶
type ResolutionPreset string
ResolutionPreset 表示视觉模型的图像分辨率预设.
const (
ResolutionLow ResolutionPreset = "low" // 512x512 or similar
ResolutionMedium ResolutionPreset = "medium" // 1024x1024 or similar
ResolutionHigh ResolutionPreset = "high" // Original resolution
ResolutionAuto ResolutionPreset = "auto" // Let the model decide
)
type Router ¶
type Router struct {
// contains filtered or unexported fields
}
Router 向所有多模态提供者提供统一访问。
func (*Router) Embed ¶
func (r *Router) Embed(ctx context.Context, req *embedding.EmbeddingRequest, providerName string) (*embedding.EmbeddingResponse, error)
Embed 使用默认或指定的提供者生成嵌入.
func (*Router) Generate3D ¶
func (r *Router) Generate3D(ctx context.Context, req *threed.GenerateRequest, providerName string) (*threed.GenerateResponse, error)
Generate3D 使用默认或指定的提供者生成 3D 模型.
func (*Router) GenerateImage ¶
func (r *Router) GenerateImage(ctx context.Context, req *image.GenerateRequest, providerName string) (*image.GenerateResponse, error)
GenerateImage 使用默认或指定的提供者生成图像.
func (*Router) GenerateMusic ¶
func (r *Router) GenerateMusic(ctx context.Context, req *music.GenerateRequest, providerName string) (*music.GenerateResponse, error)
GenerateMusic 使用默认或指定的提供者生成音乐.
func (*Router) GenerateVideo ¶
func (r *Router) GenerateVideo(ctx context.Context, req *video.GenerateRequest, providerName string) (*video.GenerateResponse, error)
GenerateVideo 使用默认或指定的提供者生成视频.
func (*Router) ListProviders ¶
func (r *Router) ListProviders() map[Capability][]string
ListProviders 按能力返回所有已注册的提供者名称.
func (*Router) Moderate ¶
func (r *Router) Moderate(ctx context.Context, req *moderation.ModerationRequest, providerName string) (*moderation.ModerationResponse, error)
Moderate 检查内容是否违反策略.
func (*Router) Moderation ¶
func (r *Router) Moderation(name string) (moderation.ModerationProvider, error)
Moderation 按名称或默认返回一个内容审核提供者。
func (*Router) Music ¶
func (r *Router) Music(name string) (music.MusicProvider, error)
Music 按名称或默认返回一个音乐提供者.
func (*Router) RegisterEmbedding ¶
RegisterEmbedding 注册一个嵌入提供者.
func (*Router) RegisterImage ¶
RegisterImage 注册一个图像提供者。
func (*Router) RegisterModeration ¶
func (r *Router) RegisterModeration(name string, provider moderation.ModerationProvider, isDefault bool)
RegisterModeration 注册一个内容审核提供者。
func (*Router) RegisterMusic ¶
func (r *Router) RegisterMusic(name string, provider music.MusicProvider, isDefault bool)
RegisterMusic 注册一个音乐提供者。
func (*Router) RegisterRerank ¶
RegisterRerank 注册一个重排序提供者。
func (*Router) RegisterSTT ¶
func (r *Router) RegisterSTT(name string, provider speech.STTProvider, isDefault bool)
RegisterSTT 注册一个 STT 提供者。
func (*Router) RegisterTTS ¶
func (r *Router) RegisterTTS(name string, provider speech.TTSProvider, isDefault bool)
RegisterTTS 注册一个 TTS 提供者.
func (*Router) RegisterThreeD ¶
func (r *Router) RegisterThreeD(name string, provider threed.ThreeDProvider, isDefault bool)
RegisterThreeD 注册一个 3D 提供者.
func (*Router) RegisterVideo ¶
RegisterVideo 注册一个视频提供者。
func (*Router) RerankDocs ¶
func (r *Router) RerankDocs(ctx context.Context, req *rerank.RerankRequest, providerName string) (*rerank.RerankResponse, error)
RerankDocs 使用默认或指定的提供者对文档重新排序。
func (*Router) STT ¶
func (r *Router) STT(name string) (speech.STTProvider, error)
STT 按名称或默认返回一个 STT 提供者。
func (*Router) Synthesize ¶
func (r *Router) Synthesize(ctx context.Context, req *speech.TTSRequest, providerName string) (*speech.TTSResponse, error)
Synthesize 使用默认或指定的提供者生成语音.
func (*Router) TTS ¶
func (r *Router) TTS(name string) (speech.TTSProvider, error)
TTS 按名称或默认返回一个 TTS 提供者.
func (*Router) ThreeD ¶
func (r *Router) ThreeD(name string) (threed.ThreeDProvider, error)
ThreeD 按名称或默认返回一个 3D 提供者.
func (*Router) Transcribe ¶
func (r *Router) Transcribe(ctx context.Context, req *speech.STTRequest, providerName string) (*speech.STTResponse, error)
Transcribe 使用默认或指定的提供者将语音转换为文本。
type VisionConfig ¶
type VisionConfig struct {
Resolution ResolutionPreset `json:"resolution"`
MaxImageSize int64 `json:"max_image_size"` // Max file size in bytes
MaxDimension int `json:"max_dimension"` // Max width/height
AllowedFormats []ImageFormat `json:"allowed_formats"`
}
VisionConfig 配置视觉处理.