Documentation
¶
Index ¶
- Constants
- Variables
- func List() []string
- func Register(factory ChunkerFactory) error
- func Supports(provider string) bool
- type Chunk
- type ChunkOptions
- type ChunkResult
- type ChunkStats
- type Chunker
- type ChunkerFactory
- type Config
- type DefaultFactory
- type DocumentType
- type DocumentTypeDetector
- type LLMChunker
- type LLMChunkerFactory
- type RouterChunkerFactory
- type RoutingChunker
- type RuleBasedDocumentTypeDetector
- type StructuredChunkerFactory
- type StructuredRuleChunker
- type TableKVChunker
- type TableKVChunkerFactory
Constants ¶
View Source
// Default character-count settings for LLM-based chunking: maximum chunk
// size, overlap between chunks, and minimum chunk size.
// NOTE(review): where these defaults are applied is not visible here.
const (
	DefaultLLMChunkMaxChars     = 600
	DefaultLLMChunkOverlapChars = 80
	DefaultLLMChunkMinChars     = 40
)
View Source
// Default character-count settings for rule-based chunking: maximum and
// minimum chunk size.
// NOTE(review): where these defaults are applied is not visible here.
const (
	DefaultRuleChunkMaxChars = 1200
	DefaultRuleChunkMinChars = 80
)
Variables ¶
View Source
// Sentinel errors exported by the package. Callers should compare against
// them with errors.Is rather than by matching message text.
// NOTE(review): which functions return each error is not visible here.
var (
	ErrEmptyText        = errors.New("text is empty")
	ErrInvalidChunkOpt  = errors.New("invalid chunk options")
	ErrNoChunks         = errors.New("no chunks produced")
	ErrChunkerNotFound  = errors.New("chunker not found")
	ErrProviderNotFound = errors.New("provider not found")
	ErrInvalidConfig    = errors.New("invalid configuration")
	ErrChunkFailed      = errors.New("chunking failed")
	ErrDetectionFailed  = errors.New("document type detection failed")
)
Functions ¶
Types ¶
type Chunk ¶
// Chunk is a single chunking result.
type Chunk struct {
Index int // chunk index
Title string // chunk title (optional)
Text string // chunk text
Metadata map[string]interface{}
}
Chunk 分块结果
type ChunkOptions ¶
// ChunkOptions holds the options for a chunking call.
type ChunkOptions struct {
// MaxChars is the target maximum number of characters.
MaxChars int
// OverlapChars is the number of characters overlapped between adjacent chunks.
OverlapChars int
// MinChars is the minimum number of characters.
MinChars int
// DocumentTitle is the document title.
DocumentTitle string
// PreChunkClean holds the cleanup options applied before chunking.
PreChunkClean map[string]interface{}
}
ChunkOptions 分块选项
type ChunkResult ¶
// ChunkResult is the result of chunking.
type ChunkResult struct {
Chunks []Chunk
DocumentType DocumentType
Duration int64 // milliseconds
}
ChunkResult 分块结果
type ChunkStats ¶
// ChunkStats holds chunking statistics.
// NOTE(review): the units of the size fields are presumably characters,
// matching the *Chars naming elsewhere — confirm against the implementation.
type ChunkStats struct {
TotalChunks int
TotalChars int
AvgChunkSize int
MinChunkSize int
MaxChunkSize int
OverlapChars int
DocumentType DocumentType
}
ChunkStats 分块统计
type Chunker ¶
// Chunker is the chunking interface.
type Chunker interface {
// Provider returns the provider name.
Provider() string
// Chunk splits the text into chunks.
Chunk(ctx context.Context, text string, opts *ChunkOptions) ([]Chunk, error)
}
Chunker 分块接口
type ChunkerFactory ¶
// ChunkerFactory is the chunker factory interface.
type ChunkerFactory interface {
// Create creates a chunker.
Create(ctx context.Context, cfg *Config) (Chunker, error)
// Name returns the factory name.
Name() string
// Supports reports whether the given provider is supported.
Supports(provider string) bool
}
ChunkerFactory 分块工厂接口
type Config ¶
// Config is the chunker configuration.
type Config struct {
// Provider is the provider name (llm, rules_structured, rules_table_kv, router, etc.).
Provider string
// Model is the LLM model name (used only by the LLM chunker).
Model string
// ChatModel is the LLM chat model (used only by the LLM chunker).
ChatModel interface{}
// Detector is the document type detector (used only by the router chunker).
Detector DocumentTypeDetector
// MaxChars is the maximum number of characters.
MaxChars int
// MinChars is the minimum number of characters.
MinChars int
// OverlapChars is the number of overlapping characters.
OverlapChars int
// CustomConfig holds custom configuration.
CustomConfig map[string]interface{}
}
Config 分块器配置
type DefaultFactory ¶
// DefaultFactory is the default chunker factory.
type DefaultFactory struct {
// contains filtered or unexported fields
}
DefaultFactory 默认分块工厂
func (*DefaultFactory) Register ¶
func (f *DefaultFactory) Register(factory ChunkerFactory) error
Register 注册工厂
func (*DefaultFactory) Supports ¶
func (f *DefaultFactory) Supports(provider string) bool
Supports 检查是否支持该提供商
type DocumentType ¶
// DocumentType identifies the type of a document.
type DocumentType int
DocumentType 文档类型
const ( DocumentTypeUnknown DocumentType = iota DocumentTypeStructured // 结构化文档(有标题、段落等) DocumentTypeTableKV // 表格/键值对文档 DocumentTypeUnstructured // 非结构化文档(OCR、噪声文本等) )
type DocumentTypeDetector ¶
// DocumentTypeDetector is a document type detector.
type DocumentTypeDetector interface {
// DetectDocumentType detects the document type of the given text.
DetectDocumentType(ctx context.Context, text string) (DocumentType, error)
}
DocumentTypeDetector 文档类型检测器
type LLMChunker ¶
// LLMChunker performs chunking using an LLM.
type LLMChunker struct {
// contains filtered or unexported fields
}
LLMChunker 使用 LLM 进行分块
func (*LLMChunker) Chunk ¶
func (c *LLMChunker) Chunk(ctx context.Context, text string, opts *ChunkOptions) ([]Chunk, error)
func (*LLMChunker) Provider ¶
func (c *LLMChunker) Provider() string
type LLMChunkerFactory ¶
// LLMChunkerFactory is the LLM chunker factory.
type LLMChunkerFactory struct{}
LLMChunkerFactory LLM 分块工厂
func (*LLMChunkerFactory) Name ¶
func (f *LLMChunkerFactory) Name() string
func (*LLMChunkerFactory) Supports ¶
func (f *LLMChunkerFactory) Supports(provider string) bool
type RouterChunkerFactory ¶
// RouterChunkerFactory is the routing chunker factory.
type RouterChunkerFactory struct{}
RouterChunkerFactory 路由分块工厂
func (*RouterChunkerFactory) Name ¶
func (f *RouterChunkerFactory) Name() string
func (*RouterChunkerFactory) Supports ¶
func (f *RouterChunkerFactory) Supports(provider string) bool
type RoutingChunker ¶
// RoutingChunker is a routing chunker: it selects a suitable chunking
// strategy based on the document type.
type RoutingChunker struct {
// contains filtered or unexported fields
}
RoutingChunker 路由分块器 - 根据文档类型选择合适的分块策略
func NewRoutingChunker ¶
func NewRoutingChunker(cfg *Config) *RoutingChunker
NewRoutingChunker 创建路由分块器
func (*RoutingChunker) Chunk ¶
func (c *RoutingChunker) Chunk(ctx context.Context, text string, opts *ChunkOptions) ([]Chunk, error)
func (*RoutingChunker) Provider ¶
func (c *RoutingChunker) Provider() string
type RuleBasedDocumentTypeDetector ¶
// RuleBasedDocumentTypeDetector is a rule-based document type detector.
type RuleBasedDocumentTypeDetector struct{}
RuleBasedDocumentTypeDetector 基于规则的文档类型检测器
func (*RuleBasedDocumentTypeDetector) DetectDocumentType ¶
func (d *RuleBasedDocumentTypeDetector) DetectDocumentType(ctx context.Context, text string) (DocumentType, error)
type StructuredChunkerFactory ¶
// StructuredChunkerFactory is the structured chunker factory.
type StructuredChunkerFactory struct{}
StructuredChunkerFactory 结构化分块工厂
func (*StructuredChunkerFactory) Name ¶
func (f *StructuredChunkerFactory) Name() string
func (*StructuredChunkerFactory) Supports ¶
func (f *StructuredChunkerFactory) Supports(provider string) bool
type StructuredRuleChunker ¶
// StructuredRuleChunker is a rule-based chunker for structured documents.
type StructuredRuleChunker struct {
// contains filtered or unexported fields
}
StructuredRuleChunker 结构化文档规则分块器
func NewStructuredRuleChunker ¶
func NewStructuredRuleChunker(cfg *Config) *StructuredRuleChunker
NewStructuredRuleChunker 创建结构化规则分块器
func (*StructuredRuleChunker) Chunk ¶
func (c *StructuredRuleChunker) Chunk(ctx context.Context, text string, opts *ChunkOptions) ([]Chunk, error)
func (*StructuredRuleChunker) Provider ¶
func (c *StructuredRuleChunker) Provider() string
type TableKVChunker ¶
// TableKVChunker is a chunker for table / key-value documents.
type TableKVChunker struct {
// contains filtered or unexported fields
}
TableKVChunker 表格/键值对分块器
func NewTableKVChunker ¶
func NewTableKVChunker(cfg *Config) *TableKVChunker
NewTableKVChunker 创建表格/键值对分块器
func (*TableKVChunker) Chunk ¶
func (c *TableKVChunker) Chunk(ctx context.Context, text string, opts *ChunkOptions) ([]Chunk, error)
func (*TableKVChunker) Provider ¶
func (c *TableKVChunker) Provider() string
type TableKVChunkerFactory ¶
// TableKVChunkerFactory is the table / key-value chunker factory.
type TableKVChunkerFactory struct{}
TableKVChunkerFactory 表格/键值对分块工厂
func (*TableKVChunkerFactory) Name ¶
func (f *TableKVChunkerFactory) Name() string
func (*TableKVChunkerFactory) Supports ¶
func (f *TableKVChunkerFactory) Supports(provider string) bool
Click to show internal directories.
Click to hide internal directories.