Documentation
¶
Index ¶
- Constants
- type BPE
- func (b *BPE) ClearCache()
- func (b *BPE) FromFiles(vocab string, merges string) *BpeBuilder
- func (b *BPE) GetContinuingSubwordPrfix() *string
- func (b *BPE) GetUnkToken() *string
- func (b BPE) GetVocab() map[string]int
- func (b BPE) GetVocabSize() int
- func (b BPE) IdToToken(id int) (token string, ok bool)
- func (b *BPE) MergeWord(w string) *Word
- func (b *BPE) ReadFiles(vocabF string, mergesF string) (*model.Vocab, *Merges, error)
- func (b BPE) Save(dir string, nameOpt ...string) error
- func (b BPE) TokenToId(token string) (id int, ok bool)
- func (b BPE) Tokenize(sequence string) (retVal []tokenizer.Token, err error)
- func (b BPE) TokenizeWithCache(sequence string) (retVal []tokenizer.Token)
- func (b *BPE) WordToTokens(word Word) []tokenizer.Token
- type BpeBuilder
- func (bb *BpeBuilder) Build() (*BPE, error)
- func (bb *BpeBuilder) CacheCapacity(capacity int)
- func (bb *BpeBuilder) ContinuingSubwordPrefix(continuingSubwordPrefix string)
- func (bb *BpeBuilder) Dropout(dropout float32)
- func (bb *BpeBuilder) EndOfWordSuffix(endOfWordSuffix string)
- func (bb *BpeBuilder) Files(vocab string, merges string)
- func (bb *BpeBuilder) UnkToken(unkTok string)
- func (bb *BpeBuilder) VocabAndMerges(vocab model.Vocab, merges Merges)
- type BpeTrainer
- type BpeTrainerBuilder
- func (btb *BpeTrainerBuilder) Build() *BpeTrainer
- func (btb *BpeTrainerBuilder) ContinuingSubwordPrefix(prefix string)
- func (btb *BpeTrainerBuilder) EndOfWordSuffix(suffix string)
- func (btb *BpeTrainerBuilder) InitialAlphabet(alphabet CharSet)
- func (btb *BpeTrainerBuilder) LimitAlphabet(limit int)
- func (btb *BpeTrainerBuilder) MinFrequency(freq int)
- func (btb *BpeTrainerBuilder) ShowProgress(show bool)
- func (btb *BpeTrainerBuilder) SpecialTokens(tokens []tokenizer.AddedToken)
- func (btb *BpeTrainerBuilder) VocabSize(size int)
- type Cache
- type CacheItem
- type CharSet
- type Config
- type Merge
- type Merges
- type Ordering
- type Pair
- type PairVal
- type Symbol
- type Symbols
- type TConfig
- type TMerge
- type UintSet
- type WChange
- type Word
Constants ¶
const DefaultCacheCapacity int = 10000
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type BPE ¶
type BPE struct {
// Vocab is the vocabulary that assigns a number to each token.
Vocab *model.Vocab
// VocabR is Reversed vocabulary, to rebuild sentences.
VocabR *model.VocabR
// Merges contains the mapping between Pairs and their (rank, newId).
Merges *Merges
// Cache contains the cache for optimizing the encoding step.
// It is a `map[string]Word`
Cache *Cache
// Dropout probability for merges.
// 0 = no dropout is the default.
// At 1.0, tokenization will perform no merges, so the result will just be characters.
Dropout *float32
// UnkToken is the unknown token to be used when we encounter an unknown char
UnkToken *string
// ContinuingSubwordPrefix is an optional prefix
// to use on any subword that exists only behind another one
ContinuingSubwordPrefix *string
// EndOfWordSuffix is an optional suffix
// to characterize an end-of-word subword
EndOfWordSuffix *string
}
BPE is a struct for the byte pair encoding model. Ref. https://www.aclweb.org/anthology/P16-1162/
func DefaultBPE ¶
func New ¶
func New( vocab model.Vocab, mergesData []string, dropout *float32, unkToken *string, continuingSubwordPrefix *string, endOfWordSuffix *string, ) (*BPE, error)
New creates a new BPE model.
func NewBpeFromFiles ¶
NewBpeFromFiles creates a BPE model from vocab and merges files.
func (*BPE) FromFiles ¶
func (b *BPE) FromFiles(vocab string, merges string) *BpeBuilder
FromFiles creates a `BpeBuilder` from vocab and merges files.
func (*BPE) GetContinuingSubwordPrfix ¶
GetContinuingSubwordPrfix returns the continuing subword prefix.
func (BPE) GetVocabSize ¶
func (BPE) Tokenize ¶
Tokenize tokenizes sentences into tokens. NOTE: sentence is []PreToken struct{Value string, Offsets Offsets}
func (BPE) TokenizeWithCache ¶
type BpeBuilder ¶
type BpeBuilder struct {
// contains filtered or unexported fields
}
BpeBuilder can be used to create a `BPE` model with a custom configuration
func NewBpeBuilder ¶
func NewBpeBuilder() *BpeBuilder
func (*BpeBuilder) Build ¶
func (bb *BpeBuilder) Build() (*BPE, error)
Build returns a `BPE` model that uses the BpeBuilder configuration
func (*BpeBuilder) CacheCapacity ¶
func (bb *BpeBuilder) CacheCapacity(capacity int)
CacheCapacity sets the cache capacity. Disable cache by setting it to 0
func (*BpeBuilder) ContinuingSubwordPrefix ¶
func (bb *BpeBuilder) ContinuingSubwordPrefix(continuingSubwordPrefix string)
ContinuingSubwordPrefix sets the `continuingSubwordPrefix` option.
func (*BpeBuilder) Dropout ¶
func (bb *BpeBuilder) Dropout(dropout float32)
Dropout sets the dropout for the model. Ref. https://arxiv.org/abs/1910.13267
func (*BpeBuilder) EndOfWordSuffix ¶
func (bb *BpeBuilder) EndOfWordSuffix(endOfWordSuffix string)
EndOfWordSuffix sets the `endOfWordSuffix` option.
func (*BpeBuilder) Files ¶
func (bb *BpeBuilder) Files(vocab string, merges string)
Files sets input files for the model
func (*BpeBuilder) UnkToken ¶
func (bb *BpeBuilder) UnkToken(unkTok string)
UnkToken sets the `UNK` token for the vocab.
func (*BpeBuilder) VocabAndMerges ¶
func (bb *BpeBuilder) VocabAndMerges(vocab model.Vocab, merges Merges)
VocabAndMerges sets vocab and merges
type BpeTrainer ¶
type BpeTrainer struct {
// The minimum frequency a pair must have to produce a merge operation
MinFrequency int
// The target vocabulary size
VocabSize int
// Whether to show progress while training
ShowProgress bool
// A list of special tokens that the model should know of
SpecialTokens []tokenizer.AddedToken
// Whether to limit the number of initial tokens that can be kept before
// computing merges
LimitAlphabet *int // TODO: replace with int and `None` value = -1
// The initial alphabet we absolutely want to include. This allows covering
// some characters that are not necessarily in the training set
InitialAlphabet CharSet
// An optional prefix to use on any subword that exists only behind another one
ContinuingSubwordPrefix *string
// An optional suffix to characterize an end-of-word subword
EndOfWordSuffix *string
}
BpeTrainer is in charge of training a `BPE` model from a mapping of words to word counts.
Example:
wordCounts := map[string]int{
"Hello": 1,
"World": 1,
}
trainer := NewBpeTrainer(minFreq, vocabSize)
model, specialTokens := trainer.Train(wordCounts)
func NewBpeTrainer ¶
func NewBpeTrainer(minFreq int, vocabSize int) *BpeTrainer
func (*BpeTrainer) ProcessTokens ¶
func (bt *BpeTrainer) ProcessTokens(words map[string]int, tokens []string)
ProcessTokens processes a bunch of tokens, counting them.
func (*BpeTrainer) Train ¶
func (bt *BpeTrainer) Train(wordCounts map[string]int) (tokenizer.Model, []tokenizer.AddedToken)
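Train trains a BPE model on the input wordCounts and returns 1. the BPE model (as a tokenizer.Model); 2. a []tokenizer.AddedToken. NOTE: an older form of this comment listed merges as the second result, with the signature `func (bt *BpeTrainer) Train(wordCounts map[string]int) (BPE, []string)`; that does not match the signature shown above.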
func (*BpeTrainer) WithProgressBar ¶
func (bt *BpeTrainer) WithProgressBar() bool
type BpeTrainerBuilder ¶
type BpeTrainerBuilder struct {
Config *TConfig
}
BpeTrainerBuilder can be used to create a `BpeTrainer` with a custom configuration
func NewBPETrainerBuilder ¶
func NewBPETrainerBuilder() *BpeTrainerBuilder
func (*BpeTrainerBuilder) Build ¶
func (btb *BpeTrainerBuilder) Build() *BpeTrainer
Build constructs the final BpeTrainer
func (*BpeTrainerBuilder) ContinuingSubwordPrefix ¶
func (btb *BpeTrainerBuilder) ContinuingSubwordPrefix(prefix string)
ContinuingSubwordPrefix sets the ContinuingSubwordPrefix.
func (*BpeTrainerBuilder) EndOfWordSuffix ¶
func (btb *BpeTrainerBuilder) EndOfWordSuffix(suffix string)
EndOfWordSuffix sets the EndOfWordSuffix.
func (*BpeTrainerBuilder) InitialAlphabet ¶
func (btb *BpeTrainerBuilder) InitialAlphabet(alphabet CharSet)
InitialAlphabet sets the initial alphabet.
func (*BpeTrainerBuilder) LimitAlphabet ¶
func (btb *BpeTrainerBuilder) LimitAlphabet(limit int)
LimitAlphabet sets the alphabet limit.
func (*BpeTrainerBuilder) MinFrequency ¶
func (btb *BpeTrainerBuilder) MinFrequency(freq int)
MinFrequency sets the minimum frequency.
func (*BpeTrainerBuilder) ShowProgress ¶
func (btb *BpeTrainerBuilder) ShowProgress(show bool)
ShowProgress sets whether to show progress.
func (*BpeTrainerBuilder) SpecialTokens ¶
func (btb *BpeTrainerBuilder) SpecialTokens(tokens []tokenizer.AddedToken)
SpecialTokens sets the special tokens.
func (*BpeTrainerBuilder) VocabSize ¶
func (btb *BpeTrainerBuilder) VocabSize(size int)
VocabSize sets the vocabulary size.
type Cache ¶
type Cache struct {
Capacity int
// contains filtered or unexported fields
}
Cache is a map with a read-write mutex included, used to hold a map of `word` strings. E.g. https://tour.golang.org/concurrency/9 NOTE: can we use the sync.Map struct instead?
func (*Cache) Fresh ¶
func (c *Cache) Fresh()
Fresh creates a fresh `Cache` with the same configuration.
type Symbols ¶
type Symbols []Symbol
Symbols is a slice of Symbol; it has some slice methods to manipulate a slice of Symbol structs.
type TConfig ¶
type TConfig struct {
MinFrequency int
VocabSize int
ShowProgress bool
SpecialTokens []tokenizer.AddedToken
LimitAlphabet *int
InitialAlphabet CharSet
ContinuingSubwordPrefix *string
EndOfWordSuffix *string
}
NOTE: there exists `Config`
type UintSet ¶
type UintSet map[int]struct{}
UintSet is a map with no value. Ref: https://stackoverflow.com/questions/57620170