Documentation ¶
Overview ¶
Package splitter defines text splitters for chunking documents.
Index ¶
- type Chunk
- type Graphemes
- type GraphemesTokenCounter
- type Markdown
- type Option
- type Options
- func (o *Options) Buffer() *bytes.Buffer
- func (o *Options) Chunks() []string
- func (o *Options) Read(p []byte) (int, error)
- func (o *Options) Scan() error
- func (o *Options) Scanner() Scanner
- func (o *Options) Size() int
- func (o *Options) SplitText(txt string) []string
- func (o *Options) TokenCount(txt string) int
- func (o *Options) Write(p []byte) (int, error)
- type Phrases
- type PhrasesTokenCounter
- type Scanner
- type Sentences
- type SentencesTokenCounter
- type TikTokenCounter
- type TokenCounter
- type Words
- type WordsTokenCounter
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type Chunk ¶
type Chunk struct {
// Buffer contains the actual content of the chunk
Buffer *bytes.Buffer
// TokenSize represents the number of tokens in this chunk
TokenSize int
// Start is the index of the first part in this chunk
Start int
// End is the index just past the last part in this chunk (exclusive)
End int
}
Chunk represents a piece of text with associated metadata for tracking its position and size within the original document.
type GraphemesTokenCounter ¶
type GraphemesTokenCounter struct{}
func (*GraphemesTokenCounter) Count ¶
func (c *GraphemesTokenCounter) Count(p []byte) int
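The standard library has no grapheme-cluster segmenter, so the sketch below counts runes as a rough stand-in for what a grapheme-based counter does. This is a hedged approximation, not the package's actual implementation, which may use a dedicated grapheme library:

```go
package main

import (
	"fmt"
	"unicode/utf8"
)

// runeCount approximates grapheme counting by counting runes. True grapheme
// segmentation (combining marks, emoji sequences, etc.) requires a dedicated
// library; runes are only a first-order stand-in.
func runeCount(p []byte) int {
	return utf8.RuneCount(p)
}

func main() {
	fmt.Println(runeCount([]byte("hello"))) // 5
	fmt.Println(runeCount([]byte("日本語"))) // 3 runes, not 9 bytes
}
```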
type Markdown ¶
type Markdown struct {
Options
// contains filtered or unexported fields
}
Markdown is a Markdown header text splitter.
If your original document is HTML, sanitize it and convert it to Markdown first, then split it.
func NewMarkdown ¶
NewMarkdown creates a new Markdown text splitter.
func (*Markdown) EnableCodeBlocks ¶
func (*Markdown) EnableHeadingHierarchy ¶
func (*Markdown) EnableReferenceLinks ¶
func (*Markdown) SetSecondSplitter ¶
type Option ¶
type Option func(*Options)
Option is a function type for configuring chunker Options. This follows the functional options pattern for clean and flexible configuration.
func WithBuffer ¶
func WithChunkSize ¶
func WithOverlap ¶
func WithTokenCounter ¶
func WithTokenCounter(counter TokenCounter) Option
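The functional options pattern behind Option can be sketched in isolation. The names below mirror WithChunkSize and WithOverlap above but are illustrative, as are the defaults:

```go
package main

import "fmt"

// options holds splitter configuration; fields and defaults are illustrative.
type options struct {
	chunkSize int
	overlap   int
}

// option mutates an options value, mirroring the package's Option type.
type option func(*options)

func withChunkSize(n int) option { return func(o *options) { o.chunkSize = n } }
func withOverlap(n int) option   { return func(o *options) { o.overlap = n } }

// newOptions applies defaults first, then each caller-supplied option in order.
func newOptions(opts ...option) *options {
	o := &options{chunkSize: 512, overlap: 0} // assumed defaults
	for _, fn := range opts {
		fn(o)
	}
	return o
}

func main() {
	o := newOptions(withChunkSize(256), withOverlap(32))
	fmt.Println(o.chunkSize, o.overlap) // 256 32
}
```

Because each option is just a function, callers pass only the settings they care about, and new options can be added without breaking existing call sites.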
type Options ¶
type Options struct {
// contains filtered or unexported fields
}
func (*Options) TokenCount ¶
type PhrasesTokenCounter ¶ added in v1.2.2
type PhrasesTokenCounter struct{}
func (PhrasesTokenCounter) Count ¶ added in v1.2.2
func (c PhrasesTokenCounter) Count(p []byte) int
type SentencesTokenCounter ¶
type SentencesTokenCounter struct{}
func (SentencesTokenCounter) Count ¶
func (c SentencesTokenCounter) Count(p []byte) int
type TikTokenCounter ¶
type TikTokenCounter struct {
// contains filtered or unexported fields
}
TikTokenCounter provides accurate token counting using the tiktoken library, which implements the tokenization schemes used by OpenAI models.
func NewTikTokenCounter ¶
func NewTikTokenCounter(encoding string) (*TikTokenCounter, error)
NewTikTokenCounter creates a new TikTokenCounter using the specified encoding. Common encodings include:
- "cl100k_base" (GPT-4, ChatGPT)
- "p50k_base" (GPT-3)
- "r50k_base" (Codex)
func (*TikTokenCounter) Count ¶
func (ttc *TikTokenCounter) Count(p []byte) int
Count returns the exact number of tokens in the text according to the specified tiktoken encoding.
type TokenCounter ¶
type TokenCounter interface {
// Count returns the number of tokens in the given text according to the
// implementation's tokenization strategy.
Count(p []byte) int
}
TokenCounter defines the interface for counting tokens in a string. This abstraction allows for different tokenization strategies (e.g., words, subwords).
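Any tokenization strategy can plug in by satisfying this one-method interface. A self-contained sketch (the interface is restated locally so the example compiles alone; the whitespace counter is illustrative, not the package's WordsTokenCounter):

```go
package main

import (
	"bufio"
	"bytes"
	"fmt"
)

// tokenCounter restates the TokenCounter interface locally.
type tokenCounter interface {
	Count(p []byte) int
}

// whitespaceCounter counts whitespace-delimited words as tokens.
type whitespaceCounter struct{}

func (whitespaceCounter) Count(p []byte) int {
	sc := bufio.NewScanner(bytes.NewReader(p))
	sc.Split(bufio.ScanWords) // token per whitespace-separated word
	n := 0
	for sc.Scan() {
		n++
	}
	return n
}

func main() {
	var tc tokenCounter = whitespaceCounter{}
	fmt.Println(tc.Count([]byte("one two three"))) // 3
}
```

A splitter configured with WithTokenCounter would then size its chunks by whatever notion of "token" the supplied implementation uses.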
type WordsTokenCounter ¶
type WordsTokenCounter struct{}
func (WordsTokenCounter) Count ¶
func (c WordsTokenCounter) Count(p []byte) int