bpe

package

v0.2.4 Latest Latest Go to latest Published: Apr 10, 2025 License: Apache-2.0 Imports: 21 Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/Canva/tokenizer

Links

Open Source Insights

Documentation ¶

Index ¶

Constants
type BPE
- func DefaultBPE() (*BPE, error)
- func New(vocab model.Vocab, mergesData []string, dropout *float32, unkToken *string, ...) (*BPE, error)
- func NewBPE(vocab model.Vocab, merges Merges) *BPE
- func NewBpeFromFiles(vocab, merges string) (*BPE, error)
- func (b *BPE) ClearCache()
- func (b *BPE) FromFiles(vocab string, merges string) *BpeBuilder
- func (b *BPE) GetContinuingSubwordPrfix() *string
- func (b *BPE) GetUnkToken() *string
- func (b BPE) GetVocab() map[string]int
- func (b BPE) GetVocabSize() int
- func (b BPE) IdToToken(id int) (token string, ok bool)
- func (b *BPE) MergeWord(w string) *Word
- func (b *BPE) ReadFiles(vocabF string, mergesF string) (*model.Vocab, *Merges, error)
- func (b BPE) Save(dir string, nameOpt ...string) error
- func (b BPE) TokenToId(token string) (id int, ok bool)
- func (b BPE) Tokenize(sequence string) (retVal []tokenizer.Token, err error)
- func (b BPE) TokenizeWithCache(sequence string) (retVal []tokenizer.Token)
- func (b *BPE) WordToTokens(word Word) []tokenizer.Token
type BpeBuilder
- func NewBpeBuilder() *BpeBuilder
- func (bb *BpeBuilder) Build() (*BPE, error)
- func (bb *BpeBuilder) CacheCapacity(capacity int)
- func (bb *BpeBuilder) ContinuingSubwordPrefix(continuingSubwordPrefix string)
- func (bb *BpeBuilder) Dropout(dropout float32)
- func (bb *BpeBuilder) EndOfWordSuffix(endOfWordSuffix string)
- func (bb *BpeBuilder) Files(vocab string, merges string)
- func (bb *BpeBuilder) UnkToken(unkTok string)
- func (bb *BpeBuilder) VocabAndMerges(vocab model.Vocab, merges Merges)
type BpeTrainer
- func NewBpeTrainer(minFreq int, vocabSize int) *BpeTrainer
- func (bt *BpeTrainer) ProcessTokens(words map[string]int, tokens []string)
- func (bt *BpeTrainer) Train(wordCounts map[string]int) (tokenizer.Model, []tokenizer.AddedToken)
- func (bt *BpeTrainer) WithProgressBar() bool
type BpeTrainerBuilder
- func NewBPETrainerBuilder() *BpeTrainerBuilder
- func (btb *BpeTrainerBuilder) Build() *BpeTrainer
- func (btb *BpeTrainerBuilder) ContinuingSubwordPrefix(prefix string)
- func (btb *BpeTrainerBuilder) EndOfWordSuffix(suffix string)
- func (btb *BpeTrainerBuilder) InitialAlphabet(alphabet CharSet)
- func (btb *BpeTrainerBuilder) LimitAlphabet(limit int)
- func (btb *BpeTrainerBuilder) MinFrequency(freq int)
- func (btb *BpeTrainerBuilder) ShowProgress(show bool)
- func (btb *BpeTrainerBuilder) SpecialTokens(tokens []tokenizer.AddedToken)
- func (btb *BpeTrainerBuilder) VocabSize(size int)
type Cache
- func NewCache(capacity int) *Cache
- func (c *Cache) Clear()
- func (c *Cache) Fresh()
- func (c *Cache) GetValues(keys []string) []Word
- func (c *Cache) SetValues(values []CacheItem)
type CacheItem
type CharSet
type Config
type Merge
- func (m *Merge) Cmp(other *Merge) Ordering
- func (m *Merge) Eq(other *Merge) bool
- func (m *Merge) PartialCmp(other *Merge) (Ordering, error)
type Merges
- func CreateMerges(vocab map[string]int, mergesData []string) (*Merges, error)
type Ordering
type Pair
type PairVal
type Symbol
- func (s *Symbol) MergeWith(other *Symbol, newC int)
type Symbols
- func (ss *Symbols) Insert(s Symbol, i int) error
- func (ss *Symbols) Remove(i int) error
type TConfig
type TMerge
type UintSet
type WChange
type Word
- func NewWord() *Word
- func (w *Word) Add(c int, byteLen int)
- func (w *Word) GetChars() []int
- func (w *Word) GetOffsets() [][]int
- func (w *Word) Merge(c1, c2, replacement int) ([]WChange, error)
- func (w *Word) MergeAll(merges map[Pair]PairVal, dropoutOpt ...float32)

Constants ¶

View Source

const DefaultCacheCapacity int = 10000

Variables ¶

This section is empty.

Functions ¶

This section is empty.

Types ¶

type BPE ¶

type BPE struct {
	// Vocab is the vocabulary that assigns a number to each token.
	Vocab *model.Vocab

	// VocabR is Reversed vocabulary, to rebuild sentences.
	VocabR *model.VocabR

	// Merges contains the mapping between Pairs and their (rank, newId).
	Merges *Merges

	// Cache contains the cache for optimizing the encoding step.
	// It is a `map[string]Word`
	Cache *Cache

	// Dropout probability for merges.
	// 0 = no dropout is the default.
	// At 1.0, tokenization will perform no merges, so the result will just be characters.
	Dropout *float32

	// UnkToken is the unknown token to be used when we encounter an unknown char
	UnkToken *string

	// ContinuingSubwordPrefix is an optional prefix
	// to use on any subword that exists only behind another one
	ContinuingSubwordPrefix *string

	// EndOfWordSuffix is an optional suffix
	// to characterize an end-of-word subword
	EndOfWordSuffix *string
}

BPE is a struct for the byte pair encoding model. Ref. https://www.aclweb.org/anthology/P16-1162/

func DefaultBPE ¶

func DefaultBPE() (*BPE, error)

func New ¶

func New(

	vocab model.Vocab,
	mergesData []string,
	dropout *float32,
	unkToken *string,
	continuingSubwordPrefix *string,
	endOfWordSuffix *string,
) (*BPE, error)

New creates a new BPE model.

func NewBPE ¶

func NewBPE(vocab model.Vocab, merges Merges) *BPE

NewBPE creates a new BPE model with the given vocab and merges.

func NewBpeFromFiles ¶

func NewBpeFromFiles(vocab, merges string) (*BPE, error)

NewBpeFromFiles creates a BPE model from vocab and merges files.

func (*BPE) ClearCache ¶

func (b *BPE) ClearCache()

ClearCache resets the cache.

func (*BPE) FromFiles ¶

func (b *BPE) FromFiles(vocab string, merges string) *BpeBuilder

FromFiles creates a `BpeBuilder` from vocab and merges files.

func (*BPE) GetContinuingSubwordPrfix ¶

func (b *BPE) GetContinuingSubwordPrfix() *string

GetContinuingSubwordPrfix returns the continuing subword prefix. (Note: the method name is spelled `Prfix` in the API.)

func (*BPE) GetUnkToken ¶

func (b *BPE) GetUnkToken() *string

GetUnkToken returns `unk` token

func (BPE) GetVocab ¶

func (b BPE) GetVocab() map[string]int

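GetVocab returns the BPE vocab. NOTE: the source comment also quotes the signature `func (b *BPE) GetVocab() *model.Vocab`, which differs from the current declaration.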

func (BPE) GetVocabSize ¶

func (b BPE) GetVocabSize() int

func (BPE) IdToToken ¶

func (b BPE) IdToToken(id int) (token string, ok bool)

func (*BPE) MergeWord ¶

func (b *BPE) MergeWord(w string) *Word

MergeWord merges given word

func (*BPE) ReadFiles ¶

func (b *BPE) ReadFiles(vocabF string, mergesF string) (*model.Vocab, *Merges, error)

ReadFiles reads the given files to extract vocab and merges.

func (BPE) Save ¶

func (b BPE) Save(dir string, nameOpt ...string) error

func (BPE) TokenToId ¶

func (b BPE) TokenToId(token string) (id int, ok bool)

func (BPE) Tokenize ¶

func (b BPE) Tokenize(sequence string) (retVal []tokenizer.Token, err error)

Tokenize tokenizes sentences into tokens. NOTE: sentence is []PreToken struct{Value string, Offsets Offsets}

func (BPE) TokenizeWithCache ¶

func (b BPE) TokenizeWithCache(sequence string) (retVal []tokenizer.Token)

func (*BPE) WordToTokens ¶

func (b *BPE) WordToTokens(word Word) []tokenizer.Token

WordToTokens slices word to tokens

type BpeBuilder ¶

type BpeBuilder struct {
	// contains filtered or unexported fields
}

BpeBuilder can be used to create a `BPE` model with a custom configuration

func NewBpeBuilder ¶

func NewBpeBuilder() *BpeBuilder

func (*BpeBuilder) Build ¶

func (bb *BpeBuilder) Build() (*BPE, error)

Build returns a `BPE` model that uses the BpeBuilder configuration

func (*BpeBuilder) CacheCapacity ¶

func (bb *BpeBuilder) CacheCapacity(capacity int)

CacheCapacity sets the cache capacity. Disable cache by setting it to 0

func (*BpeBuilder) ContinuingSubwordPrefix ¶

func (bb *BpeBuilder) ContinuingSubwordPrefix(continuingSubwordPrefix string)

ContinuingSubwordPrefix sets the `continuingSubwordPrefix` option.

func (*BpeBuilder) Dropout ¶

func (bb *BpeBuilder) Dropout(dropout float32)

Dropout sets the dropout for the model. Ref. https://arxiv.org/abs/1910.13267

func (*BpeBuilder) EndOfWordSuffix ¶

func (bb *BpeBuilder) EndOfWordSuffix(endOfWordSuffix string)

EndOfWordSuffix sets the `endOfWordSuffix` option.

func (*BpeBuilder) Files ¶

func (bb *BpeBuilder) Files(vocab string, merges string)

Files sets input files for the model

func (*BpeBuilder) UnkToken ¶

func (bb *BpeBuilder) UnkToken(unkTok string)

UnkToken sets the `UNK` token for the vocab.

func (*BpeBuilder) VocabAndMerges ¶

func (bb *BpeBuilder) VocabAndMerges(vocab model.Vocab, merges Merges)

VocabAndMerges sets vocab and merges

type BpeTrainer ¶

type BpeTrainer struct {
	// The minimum frequency a pair must have to produce a merge operation
	MinFrequency int
	// The target vocabulary size
	VocabSize int
	// Whether to show progress while training
	ShowProgress bool
	// A list of special tokens that the model should know of
	SpecialTokens []tokenizer.AddedToken
	// Whether to limit the number of initial tokens that can be kept before
	// computing merges
	LimitAlphabet *int // TODO: replace with int and `None` value = -1
	// The initial alphabet we want absolutely to include. This allows to cover
	// some characters that are not necessarily in the training set
	InitialAlphabet CharSet
	// An optional prefix to use on any subword that exists only behind another one
	ContinuingSubwordPrefix *string
	// An optional suffix to characterize an end-of-word subword
	EndOfWordSuffix *string
}

BpeTrainer is in charge of training a `BPE` model from a mapping of words to word counts.

Example:

wordCounts := map[string]int{
	"Hello": 1,
	"World": 1,
}

trainer := NewBPETrainerBuilder().Build()
model, specialTokens := trainer.Train(wordCounts)

func NewBpeTrainer ¶

func NewBpeTrainer(minFreq int, vocabSize int) *BpeTrainer

func (*BpeTrainer) ProcessTokens ¶

func (bt *BpeTrainer) ProcessTokens(words map[string]int, tokens []string)

ProcessTokens processes a bunch of tokens, counting them.

func (*BpeTrainer) Train ¶

func (bt *BpeTrainer) Train(wordCounts map[string]int) (tokenizer.Model, []tokenizer.AddedToken)

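Train trains a BPE model on the input wordCounts and returns (1) the model and (2) a slice of tokenizer.AddedToken. NOTE: the source comment still lists the second result as merges and quotes the signature `func (bt *BpeTrainer) Train(wordCounts map[string]int) (BPE, []string)`, which does not match the current declaration.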

func (*BpeTrainer) WithProgressBar ¶

func (bt *BpeTrainer) WithProgressBar() bool

type BpeTrainerBuilder ¶

type BpeTrainerBuilder struct {
	Config *TConfig
}

BpeTrainerBuilder can be used to create a `BpeTrainer` with a custom configuration

func NewBPETrainerBuilder ¶

func NewBPETrainerBuilder() *BpeTrainerBuilder

func (*BpeTrainerBuilder) Build ¶

func (btb *BpeTrainerBuilder) Build() *BpeTrainer

Build constructs the final BpeTrainer

func (*BpeTrainerBuilder) ContinuingSubwordPrefix ¶

func (btb *BpeTrainerBuilder) ContinuingSubwordPrefix(prefix string)

ContinuingSubwordPrefix sets the ContinuingSubwordPrefix.

func (*BpeTrainerBuilder) EndOfWordSuffix ¶

func (btb *BpeTrainerBuilder) EndOfWordSuffix(suffix string)

EndOfWordSuffix sets the EndOfWordSuffix.

func (*BpeTrainerBuilder) InitialAlphabet ¶

func (btb *BpeTrainerBuilder) InitialAlphabet(alphabet CharSet)

InitialAlphabet sets the initial alphabet.

func (*BpeTrainerBuilder) LimitAlphabet ¶

func (btb *BpeTrainerBuilder) LimitAlphabet(limit int)

LimitAlphabet sets the alphabet limit.

func (*BpeTrainerBuilder) MinFrequency ¶

func (btb *BpeTrainerBuilder) MinFrequency(freq int)

MinFrequency sets the minimum frequency.

func (*BpeTrainerBuilder) ShowProgress ¶

func (btb *BpeTrainerBuilder) ShowProgress(show bool)

ShowProgress sets whether to show progress.

func (*BpeTrainerBuilder) SpecialTokens ¶

func (btb *BpeTrainerBuilder) SpecialTokens(tokens []tokenizer.AddedToken)

SpecialTokens sets the special tokens.

func (*BpeTrainerBuilder) VocabSize ¶

func (btb *BpeTrainerBuilder) VocabSize(size int)

VocabSize sets the vocabulary size.

type Cache ¶

type Cache struct {
	Capacity int
	// contains filtered or unexported fields
}

Cache is a map with a read-write mutex included, used to hold a map of `word` strings. E.g. https://tour.golang.org/concurrency/9 NOTE: can we use the sync.Map struct instead?

func NewCache ¶

func NewCache(capacity int) *Cache

NewCache creates an empty Cache with a specified capacity.

func (*Cache) Clear ¶

func (c *Cache) Clear()

Clear clears the cache

func (*Cache) Fresh ¶

func (c *Cache) Fresh()

Fresh creates a fresh `Cache` with the same configuration.

func (*Cache) GetValues ¶

func (c *Cache) GetValues(keys []string) []Word

GetValues returns slices of values associated with input keys

func (*Cache) SetValues ¶

func (c *Cache) SetValues(values []CacheItem)

type CacheItem ¶

type CacheItem struct {
	// Key   interface{}
	// Value interface{}
	Key   string
	Value Word // `word` string
}

type CharSet ¶

type CharSet map[string]struct{}

type Config ¶

type Config struct {
	// contains filtered or unexported fields
}

type Merge ¶

type Merge struct {
	Pos   int
	Rank  int
	NewId int
	Time  time.Time
}

func (*Merge) Cmp ¶

func (m *Merge) Cmp(other *Merge) Ordering

func (*Merge) Eq ¶

func (m *Merge) Eq(other *Merge) bool

NOTE: Should we implement comparing methods? - Eq - PartialCmp - Cmp

func (*Merge) PartialCmp ¶

func (m *Merge) PartialCmp(other *Merge) (Ordering, error)

type Merges ¶

type Merges map[Pair]PairVal

func CreateMerges ¶

func CreateMerges(vocab map[string]int, mergesData []string) (*Merges, error)

type Ordering ¶

type Ordering int

Ordering is an enum of Less, Equal, and Greater.

const (
	Less    Ordering = -1
	Equal   Ordering = 0
	Greater Ordering = 1
)

type Pair ¶

type Pair struct {
	C1 int
	C2 int
}

type PairVal ¶

type PairVal struct {
	Rank  int
	NewId int
}

PairVal holds pair's rank and NewId

type Symbol ¶

type Symbol struct {
	C    int
	Prev int
	Next int
	Len  int
}

func (*Symbol) MergeWith ¶

func (s *Symbol) MergeWith(other *Symbol, newC int)

type Symbols ¶

type Symbols []Symbol

Symbols is a slice of Symbol with some methods to manipulate it.

func (*Symbols) Insert ¶

func (ss *Symbols) Insert(s Symbol, i int) error

Insert inserts a symbol into the slice at index `i`.

func (*Symbols) Remove ¶

func (ss *Symbols) Remove(i int) error

Remove removes a symbol from the slice at index `i`.

type TConfig ¶

type TConfig struct {
	MinFrequency            int
	VocabSize               int
	ShowProgress            bool
	SpecialTokens           []tokenizer.AddedToken
	LimitAlphabet           *int
	InitialAlphabet         CharSet
	ContinuingSubwordPrefix *string
	EndOfWordSuffix         *string
}

NOTE: there exists `Config`

type TMerge ¶

type TMerge struct {
	Pair  Pair
	Count int
	Pos   UintSet
	Time  time.Time
}

type UintSet ¶

type UintSet map[int]struct{}

Map with no value Ref: https://stackoverflow.com/questions/57620170

type WChange ¶

type WChange struct {
	C1     int
	C2     int
	Change int
}

type Word ¶

type Word struct {
	Symbols Symbols
}

func NewWord ¶

func NewWord() *Word

func (*Word) Add ¶

func (w *Word) Add(c int, byteLen int)

func (*Word) GetChars ¶

func (w *Word) GetChars() []int

func (*Word) GetOffsets ¶

func (w *Word) GetOffsets() [][]int

func (*Word) Merge ¶

func (w *Word) Merge(c1, c2, replacement int) ([]WChange, error)

Merge finds any pairs of (c1, c2) and removes in place. It also maps changes depending on the position of the pair in word.

func (*Word) MergeAll ¶

func (w *Word) MergeAll(merges map[Pair]PairVal, dropoutOpt ...float32)

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL