Documentation
¶
Index ¶
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type BpeDecoder ¶
type BpeDecoder struct {
*DecoderBase
// contains filtered or unexported fields
}
BpeDecoder allows decoding original BPE output by joining all the tokens and then replacing the suffix used to identify end-of-words with whitespace.
func DefaultBpeDecoder ¶
func DefaultBpeDecoder() *BpeDecoder
DefaultBpeDecoder creates a new BpeDecoder with the default suffix (`</w>`).
func NewBpeDecoder ¶
func NewBpeDecoder(suffix string) *BpeDecoder
NewBpeDecoder creates a new BpeDecoder
func (*BpeDecoder) DecodeChain ¶
func (bd *BpeDecoder) DecodeChain(tokens []string) []string
type ByteFallback ¶
type ByteFallback struct {
*DecoderBase
// contains filtered or unexported fields
}
func NewByteFallback ¶
func NewByteFallback() *ByteFallback
func (*ByteFallback) DecodeChain ¶
func (d *ByteFallback) DecodeChain(tokens []string) []string
type CTC ¶
type CTC struct {
*DecoderBase
PadToken string // the pad token used by CTC to delimit a new token
WordDelimiterToken string // the word delimiter token. It will be replaced by a `<space>`
Cleanup bool // whether to cleanup some tokenization artifacts, mainly spaces before punctuation and some abbreviated english forms
}
func DefaultCTC ¶
func DefaultCTC() *CTC
func (*CTC) DecodeChain ¶
type DecoderBase ¶
type DecoderBase struct {
tokenizer.Decoder // Embed the Decoder interface here so that a struct embedding `DecoderBase` can override its methods.
}
func (*DecoderBase) Decode ¶
func (d *DecoderBase) Decode(tokens []string) string
func (*DecoderBase) DecodeChain ¶
func (d *DecoderBase) DecodeChain(tokens []string) []string
NOTE: this method is here for validation only. It will be overridden if a struct embedding `DecoderBase` overwrites it.
type Fuse ¶
type Fuse struct {
*DecoderBase
}
Fuse is a decoder that simply fuses all tokens into one big string.
func (*Fuse) DecodeChain ¶
type Sequence ¶
type Sequence struct {
*DecoderBase
// contains filtered or unexported fields
}
func NewSequence ¶
func (*Sequence) DecodeChain ¶
DecodeChain implements the `tokenizer.Decoder` interface.
type Strip ¶
type Strip struct {
*DecoderBase
Content string
Start int
Stop int
}
func (*Strip) DecodeChain ¶
type WordPieceDecoder ¶
type WordPieceDecoder struct {
*DecoderBase
// contains filtered or unexported fields
}
WordPieceDecoder takes care of decoding a list of wordpiece tokens back into a readable string.
func DefaultWordpieceDecoder ¶
func DefaultWordpieceDecoder() *WordPieceDecoder
DefaultWordpieceDecoder creates a new WordPieceDecoder with default settings.
func NewWordPieceDecoder ¶
func NewWordPieceDecoder(prefix string, cleanup bool) *WordPieceDecoder
NewWordPieceDecoder creates a new WordPieceDecoder.
func (*WordPieceDecoder) Cleanup ¶
func (wd *WordPieceDecoder) Cleanup(tok string) string
func (*WordPieceDecoder) DecodeChain ¶
func (wd *WordPieceDecoder) DecodeChain(tokens []string) []string