textsplitter

package
v0.21.1 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 4, 2026 License: MIT Imports: 15 Imported by: 0

Documentation

Index

Constants

View Source
const (
	// MaxParentTextLength defines the default limit for parent context storage
	MaxParentTextLength = 2000
	// DefaultChunkSize is the fallback size if not provided
	DefaultChunkSize = 2048
)

Variables

View Source
var (
	ErrInvalidChunkSize       = errors.New("invalid chunk size")
	ErrEmptyContent           = errors.New("content is empty or contains only whitespace")
	ErrTokenizerNotConfigured = errors.New("tokenizer service is not configured")
	ErrModelRequired          = errors.New("model name is required")
)

Functions

func TruncateParentText added in v0.15.0

func TruncateParentText(text string, maxLen int) string

TruncateParentText reduces text length while preserving start and end context.

Types

type ChunkType

type ChunkType string

ChunkType represents the type of content in a chunk.

const (
	ChunkTypeFunction ChunkType = "function"
	ChunkTypeClass    ChunkType = "class"
	ChunkTypeImports  ChunkType = "imports"
	ChunkTypeComment  ChunkType = "comment"
	ChunkTypeCode     ChunkType = "code"
	ChunkTypeText     ChunkType = "text"
)

type CodeAwareTextSplitter

type CodeAwareTextSplitter struct {
	// contains filtered or unexported fields
}

func NewCodeAware

func NewCodeAware(
	registry parsers.ParserRegistry,
	tokenizer Tokenizer,
	logger *slog.Logger,
	opts ...Option,
) (*CodeAwareTextSplitter, error)

func (*CodeAwareTextSplitter) ChunkFileWithFileInfo

func (c *CodeAwareTextSplitter) ChunkFileWithFileInfo(
	ctx context.Context,
	content, filePath, modelName string,
	fileInfo fs.FileInfo,
	opts *schema.CodeChunkingOptions,
) ([]schema.CodeChunk, error)

func (*CodeAwareTextSplitter) EnrichChunkWithContext

func (c *CodeAwareTextSplitter) EnrichChunkWithContext(
	ctx context.Context,
	chunk model.CodeChunk,
	fileContent string,
	metadata model.FileMetadata,
	parentChunks []model.CodeChunk,
	modelName string,
) model.CodeChunk

EnrichChunkWithContext adds file and hierarchical context to a chunk.

func (*CodeAwareTextSplitter) GetRecommendedChunkSize

func (c *CodeAwareTextSplitter) GetRecommendedChunkSize(ctx context.Context, filePath, modelName string, contentLength int) int

GetRecommendedChunkSize returns a recommended chunk size based on file type and content length.

func (*CodeAwareTextSplitter) SplitDocuments

func (c *CodeAwareTextSplitter) SplitDocuments(ctx context.Context, docs []schema.Document) ([]schema.Document, error)

SplitDocuments takes a slice of documents and returns a new slice with split content.

func (*CodeAwareTextSplitter) ValidateChunkingOptions

func (c *CodeAwareTextSplitter) ValidateChunkingOptions(opts *model.CodeChunkingOptions) error

ValidateChunkingOptions validates the provided chunking options for correctness.

type Option

type Option func(*options)

Option is a function type for configuring the splitter.

func WithChunkOverlap

func WithChunkOverlap(overlap int) Option

WithChunkOverlap sets the chunk overlap.

func WithChunkSize

func WithChunkSize(size int) Option

WithChunkSize sets the target chunk size.

func WithEstimationRatio

func WithEstimationRatio(ratio float64) Option

WithEstimationRatio sets the character-to-token estimation ratio.

func WithMaxChunkSize

func WithMaxChunkSize(size int) Option

WithMaxChunkSize sets the maximum allowed size for a chunk.

func WithMinChunkSize

func WithMinChunkSize(size int) Option

WithMinChunkSize sets the minimum number of characters for a chunk to be valid.

func WithModelName

func WithModelName(name string) Option

WithModelName sets the model name for token-aware splitting.

func WithParentContextConfig added in v0.15.0

func WithParentContextConfig(config ParentContextConfig) Option

WithParentContextConfig sets the parent context configuration.

type ParentContextConfig added in v0.15.0

type ParentContextConfig struct {
	Enabled       bool
	MaxTextLength int
}

ParentContextConfig controls whether parent context is attached to chunks and limits the length of the stored parent text.
type RecursiveCharacter added in v0.15.0

type RecursiveCharacter struct {
	// contains filtered or unexported fields
}

RecursiveCharacter is a text splitter that recursively tries to split text using a list of separators. It aims to keep semantically related parts of the text together as long as possible.

func NewRecursiveCharacter added in v0.15.0

func NewRecursiveCharacter(opts ...Option) *RecursiveCharacter

NewRecursiveCharacter creates a new RecursiveCharacter text splitter.

func (*RecursiveCharacter) SplitText added in v0.15.0

func (s *RecursiveCharacter) SplitText(_ context.Context, text string) ([]string, error)

SplitText splits a single text document into multiple chunks.

type TextSplitter

type TextSplitter interface {
	SplitDocuments(ctx context.Context, docs []schema.Document) ([]schema.Document, error)
}

type Tokenizer

type Tokenizer interface {
	CountTokens(ctx context.Context, modelName, text string) int
	EstimateTokens(ctx context.Context, modelName, text string) int
	SplitTextByTokens(ctx context.Context, modelName, text string, maxTokens int) ([]string, error)
	GetRecommendedChunkSize(ctx context.Context, modelName string) int
	GetOptimalOverlapTokens(ctx context.Context, modelName string) int
	GetMaxContextWindow(ctx context.Context, modelName string) int
}

Tokenizer is an interface for components that can count and estimate tokens, split text by token limits, and recommend model-aware chunk sizes.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL