textsplitter

package
v0.21.1 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 4, 2026 License: MIT Imports: 15 Imported by: 0

Documentation

Index

Constants

View Source
const (
	// MaxParentTextLength defines the default limit for parent context storage
	MaxParentTextLength = 2000
	// DefaultChunkSize is the fallback size if not provided
	DefaultChunkSize = 2048
)

Variables

View Source
var (
	ErrInvalidChunkSize       = errors.New("invalid chunk size")
	ErrEmptyContent           = errors.New("content is empty or contains only whitespace")
	ErrTokenizerNotConfigured = errors.New("tokenizer service is not configured")
	ErrModelRequired          = errors.New("model name is required")
)

Functions

func TruncateParentText added in v0.15.0

func TruncateParentText(text string, maxLen int) string

TruncateParentText reduces text length while preserving start and end context.

Types

type ChunkType

type ChunkType string

ChunkType represents the type of content in a chunk.

const (
	ChunkTypeFunction ChunkType = "function"
	ChunkTypeClass    ChunkType = "class"
	ChunkTypeImports  ChunkType = "imports"
	ChunkTypeComment  ChunkType = "comment"
	ChunkTypeCode     ChunkType = "code"
	ChunkTypeText     ChunkType = "text"
)

type CodeAwareTextSplitter

type CodeAwareTextSplitter struct {
	// contains filtered or unexported fields
}

func NewCodeAware

func NewCodeAware(
	registry parsers.ParserRegistry,
	tokenizer Tokenizer,
	logger *slog.Logger,
	opts ...Option,
) (*CodeAwareTextSplitter, error)

func (*CodeAwareTextSplitter) ChunkFileWithFileInfo

func (c *CodeAwareTextSplitter) ChunkFileWithFileInfo(
	ctx context.Context,
	content, filePath, modelName string,
	fileInfo fs.FileInfo,
	opts *schema.CodeChunkingOptions,
) ([]schema.CodeChunk, error)

func (*CodeAwareTextSplitter) EnrichChunkWithContext

func (c *CodeAwareTextSplitter) EnrichChunkWithContext(
	ctx context.Context,
	chunk model.CodeChunk,
	fileContent string,
	metadata model.FileMetadata,
	parentChunks []model.CodeChunk,
	modelName string,
) model.CodeChunk

EnrichChunkWithContext adds file and hierarchical context to a chunk.

func (*CodeAwareTextSplitter) GetRecommendedChunkSize

func (c *CodeAwareTextSplitter) GetRecommendedChunkSize(ctx context.Context, filePath, modelName string, contentLength int) int

GetRecommendedChunkSize returns a recommended chunk size based on file type and content length.

func (*CodeAwareTextSplitter) SplitDocuments

func (c *CodeAwareTextSplitter) SplitDocuments(ctx context.Context, docs []schema.Document) ([]schema.Document, error)

SplitDocuments takes a slice of documents and returns a new slice with split content.

func (*CodeAwareTextSplitter) ValidateChunkingOptions

func (c *CodeAwareTextSplitter) ValidateChunkingOptions(opts *model.CodeChunkingOptions) error

ValidateChunkingOptions validates the provided chunking options for correctness.

type Option

type Option func(*options)

Option is a function type for configuring the splitter.

func WithChunkOverlap

func WithChunkOverlap(overlap int) Option

WithChunkOverlap sets the chunk overlap.

func WithChunkSize

func WithChunkSize(size int) Option

WithChunkSize sets the target chunk size.

func WithEstimationRatio

func WithEstimationRatio(ratio float64) Option

WithEstimationRatio sets the character-to-token estimation ratio.

func WithMaxChunkSize

func WithMaxChunkSize(size int) Option

WithMaxChunkSize sets the maximum allowed size for a chunk.

func WithMinChunkSize

func WithMinChunkSize(size int) Option

WithMinChunkSize sets the minimum number of characters for a chunk to be valid.

func WithModelName

func WithModelName(name string) Option

WithModelName sets the model name for token-aware splitting.

func WithParentContextConfig added in v0.15.0

func WithParentContextConfig(config ParentContextConfig) Option

WithParentContextConfig sets the parent context configuration.

type ParentContextConfig added in v0.15.0

type ParentContextConfig struct {
	Enabled       bool
	MaxTextLength int
}

ParentContextConfig controls whether parent context is attached to chunks and limits the length of the stored parent text.
type RecursiveCharacter added in v0.15.0

type RecursiveCharacter struct {
	// contains filtered or unexported fields
}

RecursiveCharacter is a text splitter that recursively tries to split text using a list of separators. It aims to keep semantically related parts of the text together as long as possible.

func NewRecursiveCharacter added in v0.15.0

func NewRecursiveCharacter(opts ...Option) *RecursiveCharacter

NewRecursiveCharacter creates a new RecursiveCharacter text splitter.

func (*RecursiveCharacter) SplitText added in v0.15.0

func (s *RecursiveCharacter) SplitText(_ context.Context, text string) ([]string, error)

SplitText splits a single text document into multiple chunks.

type TextSplitter

type TextSplitter interface {
	SplitDocuments(ctx context.Context, docs []schema.Document) ([]schema.Document, error)
}

type Tokenizer

type Tokenizer interface {
	CountTokens(ctx context.Context, modelName, text string) int
	EstimateTokens(ctx context.Context, modelName, text string) int
	SplitTextByTokens(ctx context.Context, modelName, text string, maxTokens int) ([]string, error)
	GetRecommendedChunkSize(ctx context.Context, modelName string) int
	GetOptimalOverlapTokens(ctx context.Context, modelName string) int
	GetMaxContextWindow(ctx context.Context, modelName string) int
}

Tokenizer is an interface for components that can count and estimate tokens, split text by token limits, and recommend model-aware chunk sizes.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL