Documentation
¶
Overview ¶
Package tokenizer provides text tokenization for LLM inference.
The tokenizer package implements various tokenization strategies:
- tiktoken: BPE tokenizer used by GPT-3/GPT-4 (cl100k_base, p50k_base)
- BPE: Byte-Pair Encoding from HuggingFace tokenizer.json
- Chat templates: Format messages for conversational models
Supported formats:
- ChatML: <|im_start|>role\ncontent<|im_end|> (DeepSeek, OpenAI)
- LLaMA: [INST] user message [/INST] assistant response
- Mistral: Similar to LLaMA with variations
Example usage:
// Load tiktoken
tok, err := tiktoken.NewTiktoken("cl100k_base")
if err != nil {
log.Fatal(err)
}
// Encode text
tokens, err := tok.Encode("Hello, world!")
if err != nil {
log.Fatal(err)
}
// Decode tokens
text, err := tok.Decode(tokens)
if err != nil {
log.Fatal(err)
}
// Apply chat template
messages := []ChatMessage{
{Role: "system", Content: "You are helpful."},
{Role: "user", Content: "Hi!"},
}
prompt := ApplyChatMLTemplate(messages)
Index ¶
- type BPETokenizer
- func (b *BPETokenizer) BosToken() int32
- func (b *BPETokenizer) Decode(tokens []int32) (string, error)
- func (b *BPETokenizer) Encode(text string) ([]int32, error)
- func (b *BPETokenizer) EosToken() int32
- func (b *BPETokenizer) IsSpecialToken(token int32) bool
- func (b *BPETokenizer) PadToken() int32
- func (b *BPETokenizer) SetSpecialTokens(bos, eos, pad, unk int32)
- func (b *BPETokenizer) UnkToken() int32
- func (b *BPETokenizer) VocabSize() int
- type ChatMLTemplate
- type ChatMessage
- type ChatTemplate
- type HFTokenizerMetadata
- type HFTokenizerType
- type HuggingFaceTokenizerConfig
- type LLaMATemplate
- type MistralTemplate
- type TikToken
- func (t *TikToken) BosToken() int32
- func (t *TikToken) Decode(tokens []int32) (string, error)
- func (t *TikToken) Encode(text string) ([]int32, error)
- func (t *TikToken) EosToken() int32
- func (t *TikToken) IsSpecialToken(token int32) bool
- func (t *TikToken) Name() string
- func (t *TikToken) PadToken() int32
- func (t *TikToken) UnkToken() int32
- func (t *TikToken) VocabSize() int
- type Tokenizer
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type BPETokenizer ¶
type BPETokenizer struct {
// contains filtered or unexported fields
}
BPETokenizer implements Byte-Pair Encoding tokenization.
This is a pure Go implementation that can load HuggingFace tokenizer.json files.
func ExampleBPEVocab ¶
func ExampleBPEVocab() *BPETokenizer
ExampleBPEVocab creates a minimal BPE tokenizer for testing.
func LoadBPEFromHuggingFace ¶
func LoadBPEFromHuggingFace(path string) (*BPETokenizer, error)
LoadBPEFromHuggingFace loads a BPE tokenizer from tokenizer.json.
This is a simplified loader that handles the most common HuggingFace format.
func NewBPETokenizer ¶
func NewBPETokenizer(vocab map[string]int32, merges []pair) *BPETokenizer
NewBPETokenizer creates a new BPE tokenizer from vocab and merges.
func (*BPETokenizer) BosToken ¶
func (b *BPETokenizer) BosToken() int32
BosToken returns the beginning-of-sequence token ID.
func (*BPETokenizer) Decode ¶
func (b *BPETokenizer) Decode(tokens []int32) (string, error)
Decode converts token IDs back to text.
func (*BPETokenizer) Encode ¶
func (b *BPETokenizer) Encode(text string) ([]int32, error)
Encode converts text to token IDs using BPE.
func (*BPETokenizer) EosToken ¶
func (b *BPETokenizer) EosToken() int32
EosToken returns the end-of-sequence token ID.
func (*BPETokenizer) IsSpecialToken ¶
func (b *BPETokenizer) IsSpecialToken(token int32) bool
IsSpecialToken checks if a token ID is a special token.
func (*BPETokenizer) PadToken ¶
func (b *BPETokenizer) PadToken() int32
PadToken returns the padding token ID.
func (*BPETokenizer) SetSpecialTokens ¶
func (b *BPETokenizer) SetSpecialTokens(bos, eos, pad, unk int32)
SetSpecialTokens configures special token IDs.
func (*BPETokenizer) UnkToken ¶
func (b *BPETokenizer) UnkToken() int32
UnkToken returns the unknown token ID.
func (*BPETokenizer) VocabSize ¶
func (b *BPETokenizer) VocabSize() int
VocabSize returns the total vocabulary size.
type ChatMLTemplate ¶
type ChatMLTemplate struct{}
ChatMLTemplate implements the ChatML format used by OpenAI and DeepSeek.
Format: <|im_start|>role\ncontent<|im_end|>.
func NewChatMLTemplate ¶
func NewChatMLTemplate() *ChatMLTemplate
NewChatMLTemplate creates a new ChatML template.
func (*ChatMLTemplate) Apply ¶
func (t *ChatMLTemplate) Apply(messages []ChatMessage) string
Apply formats messages in ChatML format.
func (*ChatMLTemplate) Name ¶
func (t *ChatMLTemplate) Name() string
Name returns the template name.
type ChatMessage ¶
type ChatMessage struct {
// Role specifies the message role ("system", "user", "assistant").
Role string
// Content is the message text.
Content string
}
ChatMessage represents a single message in a conversation.
type ChatTemplate ¶
type ChatTemplate interface {
// Apply formats a sequence of messages into a prompt string.
Apply(messages []ChatMessage) string
// Name returns the template name (e.g., "ChatML", "LLaMA").
Name() string
}
ChatTemplate formats messages for conversational models.
func GetChatTemplate ¶
func GetChatTemplate(name string) (ChatTemplate, error)
GetChatTemplate returns a chat template by name.
type HFTokenizerMetadata ¶
type HFTokenizerMetadata struct {
Type HFTokenizerType
VocabSize int
HasBOS bool
HasEOS bool
HasPAD bool
HasUNK bool
ModelName string
TokenizerType string
}
HFTokenizerMetadata contains metadata from tokenizer.json.
func DetectHFTokenizerType ¶
func DetectHFTokenizerType(path string) (*HFTokenizerMetadata, error)
DetectHFTokenizerType determines the tokenizer type from tokenizer.json.
type HFTokenizerType ¶
type HFTokenizerType string
HFTokenizerType identifies the tokenizer implementation type.
const (
	// HFTypeBPE indicates Byte-Pair Encoding tokenizer.
	HFTypeBPE HFTokenizerType = "BPE"
	// HFTypeWordPiece indicates WordPiece tokenizer (BERT-style).
	HFTypeWordPiece HFTokenizerType = "WordPiece"
	// HFTypeUnigram indicates Unigram tokenizer (SentencePiece-style).
	HFTypeUnigram HFTokenizerType = "Unigram"
	// HFTypeUnknown indicates an unknown or unsupported tokenizer type.
	HFTypeUnknown HFTokenizerType = "Unknown"
)
type HuggingFaceTokenizerConfig ¶
type HuggingFaceTokenizerConfig struct {
Model struct {
Vocab map[string]int `json:"vocab"`
Merges []string `json:"merges"`
} `json:"model"`
AddedTokens []struct {
ID int `json:"id"`
Content string `json:"content"`
Special bool `json:"special"`
} `json:"added_tokens"`
}
HuggingFaceTokenizerConfig represents a subset of tokenizer.json structure.
type LLaMATemplate ¶
type LLaMATemplate struct {
// contains filtered or unexported fields
}
LLaMATemplate implements the LLaMA chat format.
Format: [INST] user message [/INST] assistant response.
func NewLLaMATemplate ¶
func NewLLaMATemplate() *LLaMATemplate
NewLLaMATemplate creates a new LLaMA chat template.
func (*LLaMATemplate) Apply ¶
func (t *LLaMATemplate) Apply(messages []ChatMessage) string
Apply formats messages in LLaMA format.
type MistralTemplate ¶
type MistralTemplate struct {
// contains filtered or unexported fields
}
MistralTemplate implements the Mistral chat format.
Similar to LLaMA but with slight variations.
func NewMistralTemplate ¶
func NewMistralTemplate() *MistralTemplate
NewMistralTemplate creates a new Mistral chat template.
func (*MistralTemplate) Apply ¶
func (t *MistralTemplate) Apply(messages []ChatMessage) string
Apply formats messages in Mistral format.
func (*MistralTemplate) Name ¶
func (t *MistralTemplate) Name() string
Name returns the template name.
type TikToken ¶
type TikToken struct {
// contains filtered or unexported fields
}
TikToken wraps the pkoukk/tiktoken-go library for OpenAI tokenizers.
Supported encodings:
- cl100k_base: GPT-4, GPT-3.5-turbo, text-embedding-ada-002
- p50k_base: Codex, text-davinci-002, text-davinci-003
- r50k_base: legacy GPT-3 models (davinci, curie, babbage, ada)
func NewTikToken ¶
NewTikToken creates a new TikToken tokenizer with the specified encoding.
Supported encodings: "cl100k_base" (GPT-4), "p50k_base" (GPT-3).
func NewTikTokenForModel ¶
NewTikTokenForModel creates a TikToken tokenizer for a specific model.
Example models: "gpt-4", "gpt-3.5-turbo", "text-embedding-ada-002".
func (*TikToken) BosToken ¶
BosToken returns the beginning-of-sequence token ID. tiktoken doesn't use BOS tokens, returns -1.
func (*TikToken) EosToken ¶
EosToken returns the end-of-sequence token ID. tiktoken uses <|endoftext|> (token 100257 for cl100k_base, 50256 for p50k_base).
func (*TikToken) IsSpecialToken ¶
IsSpecialToken checks if a token ID is a special token.
For tiktoken, special tokens are primarily <|endoftext|> and role markers.
func (*TikToken) PadToken ¶
PadToken returns the padding token ID. tiktoken doesn't define a padding token, returns -1.
type Tokenizer ¶
type Tokenizer interface {
// Encode converts text to token IDs.
Encode(text string) ([]int32, error)
// Decode converts token IDs back to text.
Decode(tokens []int32) (string, error)
// VocabSize returns the total vocabulary size.
VocabSize() int
// BosToken returns the beginning-of-sequence token ID.
// Returns -1 if not applicable.
BosToken() int32
// EosToken returns the end-of-sequence token ID.
// Returns -1 if not applicable.
EosToken() int32
// PadToken returns the padding token ID.
// Returns -1 if not applicable.
PadToken() int32
// UnkToken returns the unknown token ID.
// Returns -1 if not applicable.
UnkToken() int32
// IsSpecialToken checks if a token ID is a special token.
IsSpecialToken(token int32) bool
}
Tokenizer is the core interface for text tokenization.
All tokenizer implementations (tiktoken, BPE, etc.) must implement this interface.
func AutoLoadTokenizer ¶
AutoLoadTokenizer attempts to automatically load the correct tokenizer.
It tries multiple strategies:
- Load from HuggingFace model directory (tokenizer.json)
- Load tiktoken by model name
- Load tiktoken by encoding name
func LoadFromHuggingFace ¶
LoadFromHuggingFace loads a tokenizer from a HuggingFace model directory.
The directory should contain tokenizer.json and optionally tokenizer_config.json.
func TryLoadTikToken ¶
TryLoadTikToken attempts to load a tiktoken-compatible tokenizer.
This is a fallback for models that use OpenAI-style tokenizers.