tinybpe

package module
Version: v1.0.0 (Latest)
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 31, 2025 | License: MIT | Imports: 9 | Imported by: 0

README

TinyBPE Library

Go Reference GitHub go.mod Go version Go Report Card GitHub Release GitHub Downloads (all assets, all releases)

Installation

go get github.com/shadowy-pycoder/tinybpe@latest

Usage

package main

import (
	"fmt"
	"os"

	"github.com/shadowy-pycoder/tinybpe"
)

// main demonstrates the full tinybpe workflow: it creates a tokenizer, trains
// it on the contents of ./testdata/t8.shakespeare.txt, saves the model under
// the name "test", and then passes "Hello World" through Encode and Decode,
// printing the token ids and the decoded text. Any error aborts with a panic.
func main() {
	bpe := tinybpe.NewTokenizer()

	corpus, readErr := os.ReadFile("./testdata/t8.shakespeare.txt")
	if readErr != nil {
		panic(readErr)
	}

	// Arguments for Train, named so the call site documents itself.
	const (
		vocabSize = 512
		verbose   = true
	)
	bpe.Train(corpus, vocabSize, verbose)

	saveErr := bpe.Save("test")
	if saveErr != nil {
		panic(saveErr)
	}

	ids := bpe.Encode([]byte("Hello World"))
	fmt.Println(ids)

	decoded, decodeErr := bpe.Decode(ids)
	if decodeErr != nil {
		panic(decodeErr)
	}
	fmt.Println(decoded)
}

Documentation

Index

Constants

View Source
// Bounds declared for vocabulary sizes.
const (
	// MinVocabSize is the lower bound, 256.
	// NOTE(review): presumably one entry per possible byte value — confirm.
	MinVocabSize int = 256
	// MaxVocabSize is the largest value representable by int on the target
	// platform: all bits of a uint set, shifted right by one to clear the
	// sign bit, then converted to int.
	MaxVocabSize int = int(^uint(0) >> 1)
)
View Source
// Version is the version string of this release, in the form "tinybpe vX.Y.Z".
const Version string = "tinybpe v1.0.0"

Variables

This section is empty.

Functions

This section is empty.

Types

type Pair

// Pair holds two token ids, Left and Right.
// NOTE(review): presumably an adjacent pair of tokens considered for merging
// during training — confirm against the implementation.
type Pair struct {
	Left  TokenId
	Right TokenId
}

type TokenId

// TokenId is the integer identifier of a token. It is the element type of the
// slices returned by Encode and accepted by Decode.
type TokenId int

type Tokenizer

// Tokenizer is an opaque type: all of its fields are unexported. Values are
// obtained from NewTokenizer or Load and used through the Train, Encode,
// Decode and Save methods.
type Tokenizer struct {
	// contains filtered or unexported fields
}

func Load

// Load takes a model name and returns a *Tokenizer, or an error.
// NOTE(review): presumably reads a model previously written by Save under the
// same modelName — confirm against the implementation.
func Load(modelName string) (*Tokenizer, error)

func NewTokenizer

// NewTokenizer returns a new *Tokenizer. It takes no arguments and cannot fail.
func NewTokenizer() *Tokenizer

func (*Tokenizer) Decode

// Decode converts a slice of token ids into a string, or returns an error.
// NOTE(review): the conditions under which an error is returned are not
// visible here — confirm against the implementation.
func (t *Tokenizer) Decode(ids []TokenId) (string, error)

func (*Tokenizer) Encode

// Encode converts raw bytes into a slice of token ids. It does not return an
// error.
func (t *Tokenizer) Encode(raw []byte) []TokenId

func (*Tokenizer) Save

// Save takes a model name and returns an error on failure.
// NOTE(review): presumably writes the trained model to storage under a path
// derived from modelName — confirm against the implementation.
func (t *Tokenizer) Save(modelName string) error

func (*Tokenizer) Train

// Train takes the raw training bytes, a target vocabulary size and a verbose
// flag. It returns nothing, so failures are not reported through an error.
// NOTE(review): presumably vocabSize is expected to lie between MinVocabSize
// and MaxVocabSize, and verbose enables progress output — confirm.
func (t *Tokenizer) Train(raw []byte, vocabSize int, verbose bool)

Directories

Path Synopsis
cmd
tinybpe command

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL