pretokenizer

package
v0.2.1 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jun 24, 2023 License: Apache-2.0 Imports: 6 Imported by: 27

Documentation

Index

Constants

This section is empty.

Variables

View Source
// BytesChar maps each byte value (0-255) to a printable unicode `char`
// representation, as produced by GenerateBytesChar.
var BytesChar map[uint8]string = GenerateBytesChar()
View Source
// CharBytes is the inverse of BytesChar: it maps each printable unicode
// `char` representation back to its original byte value (0-255).
// It is built once at package initialization.
var CharBytes map[string]uint8 = func() map[string]uint8 {
	cb := make(map[string]uint8)
	for b, c := range GenerateBytesChar() {
		cb[c] = b
	}
	return cb
}()

Functions

func DefaultSplit added in v0.2.0

func DefaultSplit() normalizer.SplitDelimiterBehavior

func FixedScript added in v0.2.0

func FixedScript(c rune) string

func GenerateBytesChar

func GenerateBytesChar() map[uint8]string

BytesChar maps first 0-255 (byte) to first 0-255 `char` in unicode Ref. https://en.wikipedia.org/wiki/List_of_Unicode_characters Ref. https://rosettacode.org/wiki/UTF-8_encode_and_decode See example: https://play.golang.org/p/_1W0ni2uZWm

func GetScript added in v0.2.0

func GetScript(r rune) string

GetScript returns key to script in `unicode.Scripts`.

func ProcessOffsets

func ProcessOffsets(encoding *tokenizer.Encoding, addPrefixSpace bool) *tokenizer.Encoding

Types

type BertPreTokenizer

type BertPreTokenizer struct{}

func NewBertPreTokenizer

func NewBertPreTokenizer() *BertPreTokenizer

func (*BertPreTokenizer) PreTokenize

PreTokenize implements PreTokenizer interface for BertPreTokenizer

type ByteLevel

type ByteLevel struct {
	// whether to add a leading space to the first word.
	// It allows to treat the leading word just as any other words.
	AddPrefixSpace bool

	// Whether the post processing step should trim offsets
	// to avoid including whitespaces.
	TrimOffsets bool
}

ByteLevel provides all the necessary steps to handle the BPE tokenization at byte-level. It takes care of all the required processing steps to transform a utf-8 string as needed before and after the BPE model does its job.

func NewByteLevel

func NewByteLevel() *ByteLevel

NewByteLevel returns a default ByteLevel with both AddPrefixSpace and TrimOffsets set to true.

func (*ByteLevel) AddedToken

func (bl *ByteLevel) AddedToken(isPair bool) int

func (*ByteLevel) Alphabet

func (bl *ByteLevel) Alphabet() map[string]struct{}

Alphabet returns set of first 256 unicode `char`

func (*ByteLevel) Decode

func (bl *ByteLevel) Decode(tokens []string) string

Decode converts any byte-level characters to their unicode counterpart before merging everything back into a single string

func (*ByteLevel) DecodeChain added in v0.2.0

func (bl *ByteLevel) DecodeChain(tokens []string) []string

func (*ByteLevel) PreTokenize

func (bl *ByteLevel) PreTokenize(pretokenized *tokenizer.PreTokenizedString) (*tokenizer.PreTokenizedString, error)

PreTokenize implements the `PreTokenizer` interface: `ByteLevel` is in charge of transforming all the unicode characters into their byte-level counterpart. It also splits the input according to the configured regex.

func (*ByteLevel) Process

func (bl *ByteLevel) Process(encoding, pairEncoding *tokenizer.Encoding, addSpecialTokens bool) *tokenizer.Encoding

func (*ByteLevel) SetAddPrefixSpace

func (bl *ByteLevel) SetAddPrefixSpace(v bool)

SetAddPrefixSpace set `AddPrefixSpace` property

func (*ByteLevel) SetTrimOffsets

func (bl *ByteLevel) SetTrimOffsets(v bool)

SetTrimOffsets set `TrimOffsets` property

type CharDelimiterSplit added in v0.2.0

type CharDelimiterSplit struct {
	Delimiter rune
}

func NewCharDelimiterSplit added in v0.2.0

func NewCharDelimiterSplit(delimiter rune) *CharDelimiterSplit

func (*CharDelimiterSplit) PreTokenize added in v0.2.0

type Digits added in v0.2.0

type Digits struct {
	IndividualDigits bool
}

func DefaultDigits added in v0.2.0

func DefaultDigits() *Digits

func NewDigits added in v0.2.0

func NewDigits(individualDigits bool) *Digits

func (*Digits) PreTokenize added in v0.2.0

func (p *Digits) PreTokenize(pretokenized *tokenizer.PreTokenizedString) (*tokenizer.PreTokenizedString, error)

PreTokenize implements tokenizer.PreTokenizer.

type Metaspace added in v0.2.0

type Metaspace struct {
	Replacement    string
	AddPrefixSpace bool
	StrRep         string
}

Metaspace is a pre-tokenizer that replaces all the whitespaces with the provided meta character and then splits on this character.

func DefaultMetaspace added in v0.2.0

func DefaultMetaspace() *Metaspace

func NewMetaspace added in v0.2.0

func NewMetaspace(replacement string, addPrefixSpace bool) *Metaspace

func (*Metaspace) Decode added in v0.2.0

func (m *Metaspace) Decode(tokens []string) string

func (*Metaspace) DecodeChain added in v0.2.0

func (m *Metaspace) DecodeChain(tokens []string) []string

DecodeChain implements Decoder interface.

func (*Metaspace) GetReplacement added in v0.2.0

func (m *Metaspace) GetReplacement() string

func (*Metaspace) PreTokenize added in v0.2.0

func (m *Metaspace) PreTokenize(pretokenized *tokenizer.PreTokenizedString) (*tokenizer.PreTokenizedString, error)

PreTokenize implements PreTokenizer interface

func (*Metaspace) SetReplacement added in v0.2.0

func (m *Metaspace) SetReplacement(replacement string)

type Punctuation added in v0.2.0

type Punctuation struct {
	Behavior normalizer.SplitDelimiterBehavior
}

func DefaultPunctuation added in v0.2.0

func DefaultPunctuation() *Punctuation

func NewPunctuation added in v0.2.0

func NewPunctuation(behavior normalizer.SplitDelimiterBehavior) *Punctuation

func (*Punctuation) PreTokenize added in v0.2.0

func (p *Punctuation) PreTokenize(pretokenized *tokenizer.PreTokenizedString) (*tokenizer.PreTokenizedString, error)
// NOTE(review): this is Rust, not Go — it appears to be the equivalent
// implementation pasted in from the upstream Rust `tokenizers` crate,
// presumably for reference. Confirm it should remain in this document.
impl PreTokenizer for Punctuation {
    // Splits the pretokenized string on punctuation characters
    // (`is_punc`) using the configured split-delimiter behavior.
    fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
        pretokenized.split(|_, s| s.split(is_punc, self.behavior))
    }
}

PreTokenize implements tokenizer.PreTokenizer.

type Sequence added in v0.2.0

type Sequence struct {
	// contains filtered or unexported fields
}

func NewSequence added in v0.2.0

func NewSequence(pretokenizers []tokenizer.PreTokenizer) *Sequence

func (*Sequence) PreTokenize added in v0.2.0

type Split added in v0.2.0

type Split struct {
	Pattern  normalizer.Pattern
	Behavior normalizer.SplitDelimiterBehavior
	Invert   bool
}

func NewSplit added in v0.2.0

func NewSplit(pattern normalizer.Pattern, behavior normalizer.SplitDelimiterBehavior, invert bool) *Split

func (*Split) PreTokenize added in v0.2.0

func (s *Split) PreTokenize(pretokenized *tokenizer.PreTokenizedString) (*tokenizer.PreTokenizedString, error)

type UnicodeScript added in v0.2.0

type UnicodeScript struct{}

func DefaultUnicodeScript added in v0.2.0

func DefaultUnicodeScript() *UnicodeScript

func NewUnicodeScript added in v0.2.0

func NewUnicodeScript() *UnicodeScript

func (*UnicodeScript) PreTokenize added in v0.2.0

func (us *UnicodeScript) PreTokenize(pretokenized *tokenizer.PreTokenizedString) (*tokenizer.PreTokenizedString, error)

type Whitespace added in v0.2.0

type Whitespace struct{}

func DefaultWhitespace added in v0.2.0

func DefaultWhitespace() *Whitespace

func NewWhitespace added in v0.2.0

func NewWhitespace() *Whitespace

func (*Whitespace) PreTokenize added in v0.2.0

func (p *Whitespace) PreTokenize(pretokenized *tokenizer.PreTokenizedString) (*tokenizer.PreTokenizedString, error)

type WhitespaceSplit added in v0.2.0

type WhitespaceSplit struct{}

func NewWhitespaceSplit added in v0.2.0

func NewWhitespaceSplit() *WhitespaceSplit

func (*WhitespaceSplit) PreTokenize added in v0.2.0

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL