tokenize

package
v0.0.2 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Apr 14, 2026 | License: MIT | Imports: 8 | Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type AdvancedSentenceTokenizer

// AdvancedSentenceTokenizer is an opaque struct (all fields unexported) that is
// obtained from NewAdvancedSentenceTokenizer. The methods listed for it,
// Tokenize(text, language) and Stream(language), match the method set of the
// SentenceTokenizer interface.
type AdvancedSentenceTokenizer struct {
	// contains filtered or unexported fields
}

AdvancedSentenceTokenizer provides robust, multilingual sentence boundary detection. It removes the need for the CGO-bound BlingFire C++ library in this Go implementation.

func NewAdvancedSentenceTokenizer

func NewAdvancedSentenceTokenizer() *AdvancedSentenceTokenizer

func (*AdvancedSentenceTokenizer) Stream

func (t *AdvancedSentenceTokenizer) Stream(language string) SentenceStream

func (*AdvancedSentenceTokenizer) Tokenize

func (t *AdvancedSentenceTokenizer) Tokenize(text string, language string) []string

type BasicSentenceTokenizer

// BasicSentenceTokenizer is a field-less struct obtained from
// NewBasicSentenceTokenizer. Its listed methods, Tokenize(text, language) and
// Stream(language), match the method set of the SentenceTokenizer interface.
type BasicSentenceTokenizer struct{}

func NewBasicSentenceTokenizer

func NewBasicSentenceTokenizer() *BasicSentenceTokenizer

func (*BasicSentenceTokenizer) Stream

func (t *BasicSentenceTokenizer) Stream(language string) SentenceStream

func (*BasicSentenceTokenizer) Tokenize

func (t *BasicSentenceTokenizer) Tokenize(text string, language string) []string

type BasicWordTokenizer

// BasicWordTokenizer is a field-less struct obtained from
// NewBasicWordTokenizer. Its listed methods, Tokenize(text, language) and
// Stream(language), match the method set of the WordTokenizer interface.
type BasicWordTokenizer struct{}

func NewBasicWordTokenizer

func NewBasicWordTokenizer() *BasicWordTokenizer

func (*BasicWordTokenizer) Stream

func (t *BasicWordTokenizer) Stream(language string) WordStream

func (*BasicWordTokenizer) Tokenize

func (t *BasicWordTokenizer) Tokenize(text string, language string) []string

type BufferedTokenStream

// BufferedTokenStream is an opaque struct (all fields unexported) created by
// NewBufferedTokenStream from a func(string) []string and two ints
// (minTokenLen, minCtxLen). Its listed methods (PushText, Flush, Close, Next)
// have the same signatures as those of SentenceStream and WordStream.
//
// NOTE(review): presumably the function argument is the tokenizer applied to
// buffered text and the two ints are minimum-length thresholds; neither is
// shown on this page, so confirm against the implementation.
type BufferedTokenStream struct {
	// contains filtered or unexported fields
}

func NewBufferedTokenStream

func NewBufferedTokenStream(fnc func(string) []string, minTokenLen, minCtxLen int) *BufferedTokenStream

func (*BufferedTokenStream) Close

func (s *BufferedTokenStream) Close() error

func (*BufferedTokenStream) Flush

func (s *BufferedTokenStream) Flush() error

func (*BufferedTokenStream) Next

func (s *BufferedTokenStream) Next() (*TokenData, error)

func (*BufferedTokenStream) PushText

func (s *BufferedTokenStream) PushText(text string) error

type SentenceStream

// SentenceStream is the streaming interface returned by
// SentenceTokenizer.Stream. Its method set is identical to WordStream's, and
// *BufferedTokenStream exposes the same four methods.
type SentenceStream interface {
	// PushText hands a piece of text to the stream and returns an error.
	// NOTE(review): whether the text is buffered until Flush is not visible
	// here; confirm against the implementation.
	PushText(text string) error
	// Flush returns an error; presumably it forces any pending text to be
	// tokenized. TODO confirm.
	Flush() error
	// Close returns an error; presumably it ends the stream so that no
	// further text is accepted. TODO confirm.
	Close() error
	// Next returns a pointer to a TokenData or an error. NOTE(review): how
	// end-of-stream is signaled (nil token, io.EOF, ...) is not shown here.
	Next() (*TokenData, error)
}

type SentenceTokenizer

// SentenceTokenizer is the interface for sentence tokenizers. The method sets
// listed for *AdvancedSentenceTokenizer and *BasicSentenceTokenizer match it.
type SentenceTokenizer interface {
	// Tokenize takes a text and a language string and returns a slice of
	// strings. NOTE(review): the accepted language values (e.g. ISO codes)
	// are not shown here; confirm.
	Tokenize(text string, language string) []string
	// Stream takes a language string and returns a SentenceStream.
	Stream(language string) SentenceStream
}

type TokenData

// TokenData is the token record returned (by pointer) from the Next method of
// the stream types and (as a slice) from SplitParagraphs, SplitSentences and
// SplitWords.
//
// Fields:
//   - SegmentID: a string identifier; presumably it groups tokens belonging
//     to the same segment. TODO confirm against the producer.
//   - Token: presumably the token text itself.
//   - Start, End: integer positions. NOTE(review): presumably offsets of Token
//     within the input; the unit (bytes vs. runes) and whether End is
//     exclusive are not shown here, so confirm.
type TokenData struct {
	SegmentID string
	Token     string
	Start     int
	End       int
}

func SplitParagraphs

func SplitParagraphs(text string) []TokenData

func SplitSentences

func SplitSentences(text string, minSentenceLen int, retainFormat bool) []TokenData

func SplitWords

func SplitWords(text string, ignorePunctuation bool, splitCharacter bool, retainFormat bool) []TokenData

type WordStream

// WordStream is the streaming interface returned by WordTokenizer.Stream. Its
// method set is identical to SentenceStream's, and *BufferedTokenStream
// exposes the same four methods.
type WordStream interface {
	// PushText hands a piece of text to the stream and returns an error.
	// NOTE(review): whether the text is buffered until Flush is not visible
	// here; confirm against the implementation.
	PushText(text string) error
	// Flush returns an error; presumably it forces any pending text to be
	// tokenized. TODO confirm.
	Flush() error
	// Close returns an error; presumably it ends the stream so that no
	// further text is accepted. TODO confirm.
	Close() error
	// Next returns a pointer to a TokenData or an error. NOTE(review): how
	// end-of-stream is signaled (nil token, io.EOF, ...) is not shown here.
	Next() (*TokenData, error)
}

type WordTokenizer

// WordTokenizer is the interface for word tokenizers. The method set listed
// for *BasicWordTokenizer matches it.
type WordTokenizer interface {
	// Tokenize takes a text and a language string and returns a slice of
	// strings. NOTE(review): the accepted language values are not shown
	// here; confirm.
	Tokenize(text string, language string) []string
	// Stream takes a language string and returns a WordStream.
	Stream(language string) WordStream
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL