chunk

package
v1.1.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jun 3, 2026 License: MIT Imports: 8 Imported by: 0

Documentation

Index

Constants

View Source
// Default character-count values whose names tie them to LLM chunking.
// NOTE(review): presumably these back the MaxChars / OverlapChars / MinChars
// options when none are supplied; where they are applied is not visible
// here — confirm against the implementation.
const (
	DefaultLLMChunkMaxChars     = 600
	DefaultLLMChunkOverlapChars = 80
	DefaultLLMChunkMinChars     = 40
)
View Source
// Default character-count values whose names tie them to rule-based chunking.
// NOTE(review): presumably these back the MaxChars / MinChars options when
// none are supplied; where they are applied is not visible here — confirm
// against the implementation.
const (
	DefaultRuleChunkMaxChars = 1200
	DefaultRuleChunkMinChars = 80
)

Variables

View Source
// Sentinel errors declared by this package, each created with errors.New.
// NOTE(review): the call sites that return them are not visible here;
// callers would presumably match them with errors.Is — confirm.
var (
	ErrEmptyText        = errors.New("text is empty")
	ErrInvalidChunkOpt  = errors.New("invalid chunk options")
	ErrNoChunks         = errors.New("no chunks produced")
	ErrChunkerNotFound  = errors.New("chunker not found")
	ErrProviderNotFound = errors.New("provider not found")
	ErrInvalidConfig    = errors.New("invalid configuration")
	ErrChunkFailed      = errors.New("chunking failed")
	ErrDetectionFailed  = errors.New("document type detection failed")
)

Functions

func List

func List() []string

List 列出所有支持的提供商

func Register

func Register(factory ChunkerFactory) error

Register 向全局工厂注册提供商

func Supports

func Supports(provider string) bool

Supports 检查是否支持该提供商

Types

type Chunk

// Chunk is a single chunk produced by chunking.
type Chunk struct {
	Index    int    // chunk index
	Title    string // chunk title (optional)
	Text     string // chunk text
	// Metadata carries arbitrary key/value data attached to the chunk; the
	// set of keys is not defined in this declaration.
	Metadata map[string]interface{}
}

Chunk 单个分块(包含索引、可选标题、文本和元数据)

type ChunkOptions

// ChunkOptions holds the options for a chunking call.
type ChunkOptions struct {
	// MaxChars is the target maximum number of characters.
	MaxChars int
	// OverlapChars is the number of characters shared between adjacent chunks.
	OverlapChars int
	// MinChars is the minimum number of characters.
	MinChars int
	// DocumentTitle is the title of the document.
	DocumentTitle string
	// PreChunkClean holds the cleanup options applied before chunking.
	PreChunkClean map[string]interface{}
}

ChunkOptions 分块选项

type ChunkResult

// ChunkResult is the result of chunking: the chunks, a document type, and a
// duration.
type ChunkResult struct {
	Chunks       []Chunk
	DocumentType DocumentType
	Duration     int64 // milliseconds
}

ChunkResult 一次分块操作的结果(分块列表、文档类型和耗时)

type ChunkStats

// ChunkStats holds chunking statistics.
// NOTE(review): field meanings (counts and sizes, presumably in characters)
// are inferred from the field names only — confirm against the code that
// populates this struct.
type ChunkStats struct {
	TotalChunks  int
	TotalChars   int
	AvgChunkSize int
	MinChunkSize int
	MaxChunkSize int
	OverlapChars int
	DocumentType DocumentType
}

ChunkStats 分块统计

type Chunker

// Chunker is the chunking interface.
type Chunker interface {
	// Provider returns the provider name.
	Provider() string

	// Chunk splits text into chunks.
	Chunk(ctx context.Context, text string, opts *ChunkOptions) ([]Chunk, error)
}

Chunker 分块接口

func Create

func Create(ctx context.Context, cfg *Config) (Chunker, error)

Create 使用全局工厂创建分块器

type ChunkerFactory

// ChunkerFactory is the factory interface for chunkers.
type ChunkerFactory interface {
	// Create creates a chunker.
	Create(ctx context.Context, cfg *Config) (Chunker, error)

	// Name returns the factory name.
	Name() string

	// Supports reports whether the given provider is supported.
	Supports(provider string) bool
}

ChunkerFactory 分块工厂接口

type Config

// Config is the chunker configuration.
type Config struct {
	// Provider is the provider name (llm, rules_structured, rules_table_kv, router, etc.).
	Provider string

	// Model is the LLM model name (used only by the LLM chunker).
	Model string

	// ChatModel is the LLM chat model (used only by the LLM chunker).
	ChatModel interface{}

	// Detector is the document type detector (used only by the router chunker).
	Detector DocumentTypeDetector

	// MaxChars is the maximum number of characters.
	MaxChars int

	// MinChars is the minimum number of characters.
	MinChars int

	// OverlapChars is the number of overlapping characters.
	OverlapChars int

	// CustomConfig holds custom configuration.
	CustomConfig map[string]interface{}
}

Config 分块器配置

type DefaultFactory

type DefaultFactory struct {
	// contains filtered or unexported fields
}

DefaultFactory 默认分块工厂

func GetFactory

func GetFactory() *DefaultFactory

GetFactory 获取全局工厂实例

func NewDefaultFactory

func NewDefaultFactory() *DefaultFactory

NewDefaultFactory 创建默认工厂

func (*DefaultFactory) Create

func (f *DefaultFactory) Create(ctx context.Context, cfg *Config) (Chunker, error)

Create 创建分块器

func (*DefaultFactory) List

func (f *DefaultFactory) List() []string

List 列出所有支持的提供商

func (*DefaultFactory) Register

func (f *DefaultFactory) Register(factory ChunkerFactory) error

Register 注册工厂

func (*DefaultFactory) Supports

func (f *DefaultFactory) Supports(provider string) bool

Supports 检查是否支持该提供商

type DocumentType

// DocumentType is the document type, represented as an int.
type DocumentType int

DocumentType 文档类型

// DocumentType values, numbered from zero by iota in declaration order.
const (
	DocumentTypeUnknown      DocumentType = iota
	DocumentTypeStructured                // structured document (with headings, paragraphs, etc.)
	DocumentTypeTableKV                   // table / key-value document
	DocumentTypeUnstructured              // unstructured document (OCR output, noisy text, etc.)
)

type DocumentTypeDetector

// DocumentTypeDetector is the document type detector interface.
type DocumentTypeDetector interface {
	// DetectDocumentType detects the document type of text.
	DetectDocumentType(ctx context.Context, text string) (DocumentType, error)
}

DocumentTypeDetector 文档类型检测器

type LLMChunker

type LLMChunker struct {
	// contains filtered or unexported fields
}

LLMChunker 使用 LLM 进行分块

func NewLLMChunker

func NewLLMChunker(cfg *Config) *LLMChunker

NewLLMChunker 创建 LLM 分块器

func (*LLMChunker) Chunk

func (c *LLMChunker) Chunk(ctx context.Context, text string, opts *ChunkOptions) ([]Chunk, error)

func (*LLMChunker) Provider

func (c *LLMChunker) Provider() string

type LLMChunkerFactory

type LLMChunkerFactory struct{}

LLMChunkerFactory LLM 分块工厂

func (*LLMChunkerFactory) Create

func (f *LLMChunkerFactory) Create(ctx context.Context, cfg *Config) (Chunker, error)

func (*LLMChunkerFactory) Name

func (f *LLMChunkerFactory) Name() string

func (*LLMChunkerFactory) Supports

func (f *LLMChunkerFactory) Supports(provider string) bool

type RouterChunkerFactory

type RouterChunkerFactory struct{}

RouterChunkerFactory 路由分块工厂

func (*RouterChunkerFactory) Create

func (f *RouterChunkerFactory) Create(ctx context.Context, cfg *Config) (Chunker, error)

func (*RouterChunkerFactory) Name

func (f *RouterChunkerFactory) Name() string

func (*RouterChunkerFactory) Supports

func (f *RouterChunkerFactory) Supports(provider string) bool

type RoutingChunker

type RoutingChunker struct {
	// contains filtered or unexported fields
}

RoutingChunker 路由分块器 - 根据文档类型选择合适的分块策略

func NewRoutingChunker

func NewRoutingChunker(cfg *Config) *RoutingChunker

NewRoutingChunker 创建路由分块器

func (*RoutingChunker) Chunk

func (c *RoutingChunker) Chunk(ctx context.Context, text string, opts *ChunkOptions) ([]Chunk, error)

func (*RoutingChunker) Provider

func (c *RoutingChunker) Provider() string

type RuleBasedDocumentTypeDetector

type RuleBasedDocumentTypeDetector struct{}

RuleBasedDocumentTypeDetector 基于规则的文档类型检测器

func (*RuleBasedDocumentTypeDetector) DetectDocumentType

func (d *RuleBasedDocumentTypeDetector) DetectDocumentType(ctx context.Context, text string) (DocumentType, error)

type StructuredChunkerFactory

type StructuredChunkerFactory struct{}

StructuredChunkerFactory 结构化分块工厂

func (*StructuredChunkerFactory) Create

func (f *StructuredChunkerFactory) Create(ctx context.Context, cfg *Config) (Chunker, error)

func (*StructuredChunkerFactory) Name

func (f *StructuredChunkerFactory) Name() string

func (*StructuredChunkerFactory) Supports

func (f *StructuredChunkerFactory) Supports(provider string) bool

type StructuredRuleChunker

type StructuredRuleChunker struct {
	// contains filtered or unexported fields
}

StructuredRuleChunker 结构化文档规则分块器

func NewStructuredRuleChunker

func NewStructuredRuleChunker(cfg *Config) *StructuredRuleChunker

NewStructuredRuleChunker 创建结构化规则分块器

func (*StructuredRuleChunker) Chunk

func (c *StructuredRuleChunker) Chunk(ctx context.Context, text string, opts *ChunkOptions) ([]Chunk, error)

func (*StructuredRuleChunker) Provider

func (c *StructuredRuleChunker) Provider() string

type TableKVChunker

type TableKVChunker struct {
	// contains filtered or unexported fields
}

TableKVChunker 表格/键值对分块器

func NewTableKVChunker

func NewTableKVChunker(cfg *Config) *TableKVChunker

NewTableKVChunker 创建表格/键值对分块器

func (*TableKVChunker) Chunk

func (c *TableKVChunker) Chunk(ctx context.Context, text string, opts *ChunkOptions) ([]Chunk, error)

func (*TableKVChunker) Provider

func (c *TableKVChunker) Provider() string

type TableKVChunkerFactory

type TableKVChunkerFactory struct{}

TableKVChunkerFactory 表格/键值对分块工厂

func (*TableKVChunkerFactory) Create

func (f *TableKVChunkerFactory) Create(ctx context.Context, cfg *Config) (Chunker, error)

func (*TableKVChunkerFactory) Name

func (f *TableKVChunkerFactory) Name() string

func (*TableKVChunkerFactory) Supports

func (f *TableKVChunkerFactory) Supports(provider string) bool

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL