textsplitter

package
Version: v0.6.0
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jul 21, 2025 License: MIT Imports: 12 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
var (
	ErrInvalidChunkSize       = errors.New("invalid chunk size")
	ErrEmptyContent           = errors.New("content is empty or contains only whitespace")
	ErrTokenizerNotConfigured = errors.New("tokenizer service is not configured")
	ErrModelRequired          = errors.New("model name is required")
)

Functions

This section is empty.

Types

type ChunkType

type ChunkType string

ChunkType represents the type of content in a chunk.

const (
	ChunkTypeFunction ChunkType = "function"
	ChunkTypeClass    ChunkType = "class"
	ChunkTypeImports  ChunkType = "imports"
	ChunkTypeComment  ChunkType = "comment"
	ChunkTypeCode     ChunkType = "code"
	ChunkTypeText     ChunkType = "text"
)

type CodeAwareTextSplitter

type CodeAwareTextSplitter struct {
	// contains filtered or unexported fields
}

func NewCodeAware

func NewCodeAware(
	registry parsers.ParserRegistry,
	tokenizer Tokenizer,
	logger *slog.Logger,
	opts ...Option,
) (*CodeAwareTextSplitter, error)

func (*CodeAwareTextSplitter) ChunkFileWithFileInfo

func (c *CodeAwareTextSplitter) ChunkFileWithFileInfo(
	ctx context.Context,
	content, filePath, modelName string,
	fileInfo fs.FileInfo,
	opts *schema.CodeChunkingOptions,
) ([]schema.CodeChunk, error)

func (*CodeAwareTextSplitter) EnrichChunkWithContext

func (c *CodeAwareTextSplitter) EnrichChunkWithContext(
	ctx context.Context,
	chunk model.CodeChunk,
	fileContent string,
	metadata model.FileMetadata,
	parentChunks []model.CodeChunk,
	modelName string,
) model.CodeChunk

EnrichChunkWithContext adds file and hierarchical context to a chunk.

func (*CodeAwareTextSplitter) GetRecommendedChunkSize

func (c *CodeAwareTextSplitter) GetRecommendedChunkSize(ctx context.Context, filePath, modelName string, contentLength int) int

GetRecommendedChunkSize returns a recommended chunk size based on file type and content length.

func (*CodeAwareTextSplitter) SplitDocuments

func (c *CodeAwareTextSplitter) SplitDocuments(ctx context.Context, docs []schema.Document) ([]schema.Document, error)

func (*CodeAwareTextSplitter) ValidateChunkingOptions

func (c *CodeAwareTextSplitter) ValidateChunkingOptions(opts *model.CodeChunkingOptions) error

ValidateChunkingOptions validates the provided chunking options for correctness.

type Option

type Option func(*options)

Option is a function type for configuring the splitter.

func WithChunkOverlap

func WithChunkOverlap(overlap int) Option

WithChunkOverlap sets the chunk overlap.

func WithChunkSize

func WithChunkSize(size int) Option

WithChunkSize sets the target chunk size.

func WithEstimationRatio

func WithEstimationRatio(ratio float64) Option

WithEstimationRatio sets the character-to-token estimation ratio.

func WithMaxChunkSize

func WithMaxChunkSize(size int) Option

func WithMinChunkSize

func WithMinChunkSize(size int) Option

WithMinChunkSize sets the minimum number of characters for a chunk to be valid.

func WithModelName

func WithModelName(name string) Option

WithModelName sets the model name for token-aware splitting.

type TextSplitter

type TextSplitter interface {
	SplitDocuments(ctx context.Context, docs []schema.Document) ([]schema.Document, error)
}

type Tokenizer

type Tokenizer interface {
	CountTokens(ctx context.Context, modelName, text string) int
	EstimateTokens(ctx context.Context, modelName, text string) int
	SplitTextByTokens(ctx context.Context, modelName, text string, maxTokens int) ([]string, error)
	GetRecommendedChunkSize(ctx context.Context, modelName string) int
	GetOptimalOverlapTokens(ctx context.Context, modelName string) int
	GetMaxContextWindow(ctx context.Context, modelName string) int
}

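Tokenizer is an interface for components that count or estimate tokens for a given model, split text by a maximum token count, and report per-model values for recommended chunk size, optimal overlap, and maximum context window.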

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL