Documentation
¶
Index ¶
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type BpeDecoder ¶
type BpeDecoder struct {
*DecoderBase
// contains filtered or unexported fields
}
BpeDecoder allows decoding original BPE output by joining all the tokens and then replacing the suffix used to identify end-of-words with whitespace.
func DefaultBpeDecoder ¶
func DefaultBpeDecoder() *BpeDecoder
DefaultBpeDecoder creates a new BpeDecoder with the default suffix (`</w>`).
func NewBpeDecoder ¶
func NewBpeDecoder(suffix string) *BpeDecoder
NewBpeDecoder creates a new BpeDecoder
func (*BpeDecoder) DecodeChain ¶
func (bd *BpeDecoder) DecodeChain(tokens []string) []string
type ByteFallback ¶
type ByteFallback struct {
*DecoderBase
// contains filtered or unexported fields
}
func NewByteFallback ¶
func NewByteFallback() *ByteFallback
func (*ByteFallback) DecodeChain ¶
func (d *ByteFallback) DecodeChain(tokens []string) []string
type CTC ¶
type CTC struct {
*DecoderBase
PadToken string // the pad token used by CTC to delimit a new token
WordDelimiterToken string // the word delimiter token. It will be replaced by a `<space>`
Cleanup bool // whether to cleanup some tokenization artifacts, mainly spaces before punctuation and some abbreviated english forms
}
func DefaultCTC ¶
func DefaultCTC() *CTC
func (*CTC) DecodeChain ¶
type DecoderBase ¶
type DecoderBase struct {
tokenizer.Decoder // Embed the Decoder interface here so that a struct embedding `DecoderBase` can override its methods.
}
func (*DecoderBase) Decode ¶
func (d *DecoderBase) Decode(tokens []string) string
func (*DecoderBase) DecodeChain ¶
func (d *DecoderBase) DecodeChain(tokens []string) []string
NOTE: this method is here for validation only. It will be overridden if a struct embedding `DecoderBase` overwrites it.
type Fuse ¶
type Fuse struct {
*DecoderBase
}
Fuse is a decoder that simply fuses all tokens into one big string.
func (*Fuse) DecodeChain ¶
type Sequence ¶
type Sequence struct {
*DecoderBase
// contains filtered or unexported fields
}
func NewSequence ¶
func (*Sequence) DecodeChain ¶
DecodeChain implements the `tokenizer.Decoder` interface.
type Strip ¶
type Strip struct {
*DecoderBase
Content string
Start int
Stop int
}
func (*Strip) DecodeChain ¶
type WordPieceDecoder ¶
type WordPieceDecoder struct {
*DecoderBase
// contains filtered or unexported fields
}
WordPieceDecoder takes care of decoding a list of wordpiece tokens back into a readable string.
func DefaultWordpieceDecoder ¶
func DefaultWordpieceDecoder() *WordPieceDecoder
DefaultWordpieceDecoder creates a new WordPieceDecoder with default settings.
func NewWordPieceDecoder ¶
func NewWordPieceDecoder(prefix string, cleanup bool) *WordPieceDecoder
NewWordPieceDecoder creates a new WordPieceDecoder.
func (*WordPieceDecoder) Cleanup ¶
func (wd *WordPieceDecoder) Cleanup(tok string) string
func (*WordPieceDecoder) DecodeChain ¶
func (wd *WordPieceDecoder) DecodeChain(tokens []string) []string