tokenizer

package
v0.17.1 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 25, 2026 | License: MIT | Imports: 13 | Imported by: 0

Documentation

Index

Constants

View Source
// Token type identifiers. These are untyped integer constants numbered
// consecutively from 1 (iota + 1), so TOKEN_TYPE_NORMAL is 1 and
// TOKEN_TYPE_BYTE is 6; the value 0 does not correspond to any named type.
//
// NOTE(review): presumably these are the values stored in Vocabulary.Types;
// confirm against the code that populates that slice.
const (
	TOKEN_TYPE_NORMAL = iota + 1
	TOKEN_TYPE_UNKNOWN
	TOKEN_TYPE_CONTROL
	TOKEN_TYPE_USER_DEFINED
	TOKEN_TYPE_UNUSED
	TOKEN_TYPE_BYTE
)

Variables

This section is empty.

Functions

This section is empty.

Types

type BytePairEncoding

// BytePairEncoding is created with NewBytePairEncoding, which takes a
// *Vocabulary and optional pretokenizer strings. It declares Encode, Decode,
// Is and Vocabulary methods on a value receiver.
type BytePairEncoding struct {
	// contains filtered or unexported fields
}

func NewBytePairEncoding

func NewBytePairEncoding(vocab *Vocabulary, pretokenizer ...string) BytePairEncoding

func (BytePairEncoding) Decode

func (bpe BytePairEncoding) Decode(ids []int32) (string, error)

func (BytePairEncoding) Encode

func (bpe BytePairEncoding) Encode(s string, addSpecial bool) ([]int32, error)

func (BytePairEncoding) Is

func (bpe BytePairEncoding) Is(id int32, special Special) bool

func (BytePairEncoding) Vocabulary

func (bpe BytePairEncoding) Vocabulary() *Vocabulary

type SentencePiece

// SentencePiece is created with NewSentencePiece, which takes a *Vocabulary.
// It declares Encode, Decode, Is and Vocabulary methods on a value receiver.
type SentencePiece struct {
	// contains filtered or unexported fields
}

func NewSentencePiece

func NewSentencePiece(vocab *Vocabulary) SentencePiece

func (SentencePiece) Decode

func (spm SentencePiece) Decode(ids []int32) (string, error)

func (SentencePiece) Encode

func (spm SentencePiece) Encode(s string, addSpecial bool) ([]int32, error)

func (SentencePiece) Is

func (spm SentencePiece) Is(id int32, special Special) bool

func (SentencePiece) Vocabulary

func (spm SentencePiece) Vocabulary() *Vocabulary

type Special

// Special selects which kind of special token the Is methods test an id
// against.
type Special int32

// Values of Special. Numbering starts at 0, so the zero value of Special is
// SpecialBOS.
//
// NOTE(review): BOS/EOS presumably stand for beginning-/end-of-sequence and
// correspond to Vocabulary.BOS / Vocabulary.EOS; confirm.
const (
	SpecialBOS Special = iota
	SpecialEOS
)

type Tokenizer

// Tokenizer is the method set shared by the tokenizer types in this package:
// BytePairEncoding, SentencePiece and WordPiece each declare all four methods
// with these signatures on value receivers.
type Tokenizer interface {
	// Encode takes a string and returns a slice of int32 ids, or an error.
	// NOTE(review): addSpecial presumably controls whether BOS/EOS ids are
	// added to the result; confirm against the implementations.
	Encode(s string, addSpecial bool) ([]int32, error)
	// Decode takes a slice of int32 ids and returns a string, or an error.
	Decode([]int32) (string, error)
	// Is reports a boolean for the given id and Special kind.
	Is(int32, Special) bool
	// Vocabulary returns a pointer to a Vocabulary.
	Vocabulary() *Vocabulary
}

type Vocabulary

// Vocabulary holds the exported token data passed to the tokenizer
// constructors (NewBytePairEncoding, NewSentencePiece, NewWordPiece). It has
// Encode (string to int32), Decode (int32 to string), Is, Merge and
// SpecialVocabulary methods on a pointer receiver.
//
// NOTE(review): Values, Types and Scores look like parallel slices indexed by
// token id, with Types holding TOKEN_TYPE_* values; confirm before relying
// on it.
type Vocabulary struct {
	Values []string
	Types  []int32
	Scores []float32
	Merges []string

	// BOS and EOS are slices, so each may hold more than one id.
	// NOTE(review): AddBOS/AddEOS presumably control whether Encode adds
	// these ids when addSpecial is true; confirm.
	BOS, EOS       []int32
	AddBOS, AddEOS bool
	// contains filtered or unexported fields
}

func (*Vocabulary) Decode

func (v *Vocabulary) Decode(id int32) string

func (*Vocabulary) Encode

func (v *Vocabulary) Encode(s string) int32

func (*Vocabulary) Is

func (v *Vocabulary) Is(id int32, special Special) bool

func (*Vocabulary) Merge

func (v *Vocabulary) Merge(left, right string) int

func (*Vocabulary) SpecialVocabulary

func (v *Vocabulary) SpecialVocabulary() []string

type WordPiece

// WordPiece is created with NewWordPiece, which takes a *Vocabulary and a
// lowercase flag. Its Encode, Decode, Is and Vocabulary methods are each
// documented as implementing Tokenizer.
type WordPiece struct {
	// contains filtered or unexported fields
}

func NewWordPiece

func NewWordPiece(vocab *Vocabulary, lowercase bool) WordPiece

func (WordPiece) Decode

func (wpm WordPiece) Decode(ids []int32) (string, error)

Decode implements Tokenizer.

func (WordPiece) Encode

func (wpm WordPiece) Encode(s string, addSpecial bool) ([]int32, error)

Encode implements Tokenizer.

func (WordPiece) Is

func (wpm WordPiece) Is(id int32, special Special) bool

Is implements Tokenizer.

func (WordPiece) Vocabulary

func (wpm WordPiece) Vocabulary() *Vocabulary

Vocabulary implements Tokenizer.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL