Documentation ¶
Index ¶
- Constants
- type Config
- type TokenScore
- type Unigram
- func New(vocab []TokenScore, opts *util.Params) (*Unigram, error)
- func (u *Unigram) GetVocab() map[string]int
- func (u *Unigram) GetVocabSize() int
- func (u *Unigram) IdToToken(id int) (string, bool)
- func (u *Unigram) Save(dir string, prefixOpt ...string) error
- func (u *Unigram) TokenToId(token string) (int, bool)
- func (u *Unigram) Tokenize(sequence string) ([]tokenizer.Token, error)
- type UnigramBuilder
- func NewUnigramBuilder() *UnigramBuilder
- func (ub *UnigramBuilder) Build() (*Unigram, error)
- func (ub *UnigramBuilder) BytesFallback(bytesFallback bool) *UnigramBuilder
- func (ub *UnigramBuilder) FuseUnk(fuseUnk bool) *UnigramBuilder
- func (ub *UnigramBuilder) UnkID(unkID int) *UnigramBuilder
- func (ub *UnigramBuilder) Vocab(vocab []TokenScore) *UnigramBuilder
Constants ¶
const (
	CacheExpiredTime = 5
	CacheCleanTime   = 10
)
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type Config ¶
type Config struct {
// contains filtered or unexported fields
}
Config holds the configuration for the Unigram model.
type TokenScore ¶
TokenScore represents a token and its score in the Unigram model.
type Unigram ¶
type Unigram struct {
// contains filtered or unexported fields
}
Unigram implements the Unigram language model for tokenization.
func New ¶
func New(vocab []TokenScore, opts *util.Params) (*Unigram, error)
New creates a new Unigram model with the given vocabulary and options.
func (*Unigram) GetVocabSize ¶
func (u *Unigram) GetVocabSize() int
GetVocabSize returns the size of the vocabulary.
type UnigramBuilder ¶
type UnigramBuilder struct {
// contains filtered or unexported fields
}
UnigramBuilder can be used to create a Unigram model with a custom configuration.
func NewUnigramBuilder ¶
func NewUnigramBuilder() *UnigramBuilder
NewUnigramBuilder creates a new UnigramBuilder with the default configuration.
func (*UnigramBuilder) Build ¶
func (ub *UnigramBuilder) Build() (*Unigram, error)
Build creates a new Unigram model with the configured parameters.
func (*UnigramBuilder) BytesFallback ¶
func (ub *UnigramBuilder) BytesFallback(bytesFallback bool) *UnigramBuilder
BytesFallback sets whether to use byte fallback for unknown tokens.
func (*UnigramBuilder) FuseUnk ¶
func (ub *UnigramBuilder) FuseUnk(fuseUnk bool) *UnigramBuilder
FuseUnk sets whether to fuse unknown tokens together.
func (*UnigramBuilder) UnkID ¶
func (ub *UnigramBuilder) UnkID(unkID int) *UnigramBuilder
UnkID sets the unknown token ID for the Unigram model.
func (*UnigramBuilder) Vocab ¶
func (ub *UnigramBuilder) Vocab(vocab []TokenScore) *UnigramBuilder
Vocab sets the vocabulary for the Unigram model.