package blingfire

Version: v0.0.13 (latest) | Published: May 28, 2026 | License: MIT | Imports: 3 | Imported by: 0

Details

This section is empty.

This section is empty.

func TextToSentences(text string) []string

TextToSentences splits text into sentences using a robust regex.

func TextToWords(text string) []string

TextToWords splits text into words.

type Offset struct {
	Start int
	End   int
}

Offset represents the start and end positions of a token within the original string.

func TextToSentencesWithOffsets(text string) ([]string, []Offset)

TextToSentencesWithOffsets splits text into sentences and returns their offsets.

func TextToWordsWithOffsets(text string) ([]string, []Offset)

TextToWordsWithOffsets splits text into words and returns their offsets.

type SentenceTokenizer struct {
	Language         string
	MinSentenceLen   int
	StreamContextLen int
}

func NewSentenceTokenizer(language string, minSentenceLen, streamContextLen int) *SentenceTokenizer

func (t *SentenceTokenizer) Stream(language string) tokenize.SentenceStream

func (t *SentenceTokenizer) Tokenize(text string, language string) []string

type WordTokenizer struct {
	Language string
}

func NewWordTokenizer(language string) *WordTokenizer

func (t *WordTokenizer) Stream(language string) tokenize.WordStream

func (t *WordTokenizer) Tokenize(text string, language string) []string