Documentation
¶
Index ¶
- Constants
- Variables
- func ClearHFCache() error
- func ClearHFCachePattern(pattern string) (int, error)
- func ClearHFModelCache(modelID string) error
- func ClearLibraryCache() error
- func DownloadAndCacheLibrary() error
- func DownloadAndCacheLibraryWithVersion(version string) error
- func DownloadLibraryFromGitHub(destPath string) error
- func DownloadLibraryFromGitHubWithVersion(destPath, version string) error
- func GetAvailableVersions() ([]string, error)
- func GetCachedLibraryPath() string
- func GetHFCacheInfo(modelID string) (map[string]interface{}, error)
- func GetLibraryInfo() map[string]interface{}
- func GetLibraryVersion() string
- func IsLibraryCached() bool
- func LoadTokenizerLibrary(userPath string) (uintptr, error)
- func MasksFromBuf(buf Buffer) (special, attention []uint32)
- func SetLibraryVersion(version string)
- func TokensFromBuf(buf Buffer) []string
- type Buffer
- type EncodeOption
- type EncodeOptions
- type EncodeResult
- type GitHubAsset
- type GitHubRelease
- type HFConfig
- type PaddingOptions
- type PaddingStrategy
- type PaddingStrategyTag
- type StringResult
- type Tokenizer
- func (t *Tokenizer) Close() error
- func (t *Tokenizer) Decode(ids []uint32, skipSpecialTokens bool) (string, error)
- func (t *Tokenizer) Encode(message string, opts ...EncodeOption) (*EncodeResult, error)
- func (t *Tokenizer) EncodePair(sequence string, pair string, opts ...EncodeOption) (*EncodeResult, error)
- func (t *Tokenizer) EncodePairs(sequences []string, pairs []string, opts ...EncodeOption) ([]*EncodeResult, error)
- func (t *Tokenizer) GetLibraryVersion() string
- func (t *Tokenizer) VocabSize() (uint32, error)
- type TokenizerOption
- func WithHFCacheDir(dir string) TokenizerOption
- func WithHFCacheTTL(ttl time.Duration) TokenizerOption
- func WithHFMaxTokenizerSize(maxSize int64) TokenizerOption
- func WithHFOfflineMode(offline bool) TokenizerOption
- func WithHFRevision(revision string) TokenizerOption
- func WithHFTimeout(timeout time.Duration) TokenizerOption
- func WithHFToken(token string) TokenizerOption
- func WithHFUseLocalCache(useCache bool) TokenizerOption
- func WithLibraryPath(path string) TokenizerOption
- func WithPadding(enabled bool, strategy PaddingStrategy) TokenizerOption
- func WithTruncation(maxLen uintptr, direction TruncationDirection, strategy TruncationStrategy) TokenizerOption
- type TokenizerOptions
- type TokenizerResult
- type TruncationDirection
- type TruncationOptions
- type TruncationStrategy
- type VocabSizeResult
Constants ¶
const ( GitHubRepo = "amikos-tech/pure-tokenizers" DefaultTag = "latest" DownloadTimeout = 30 * time.Second )
const ( HFDefaultRevision = "main" HFDefaultTimeout = 30 * time.Second HFMaxRetries = 3 HFRetryDelay = time.Second // HFMaxRetryAfterDelay caps the maximum delay from Retry-After headers // to prevent excessive waits from misconfigured or malicious servers HFMaxRetryAfterDelay = 5 * time.Minute // DefaultMaxTokenizerSize is the default maximum size for tokenizer files (500MB) // This prevents OOM errors from excessively large downloads DefaultMaxTokenizerSize = 500 * 1024 * 1024 )
const ( SUCCESS = 0 ErrInvalidUTF8 = -1 ErrEncodingFailed = -2 ErrNullOutput = -3 ErrInvalidTokenizerRef = -4 ErrNullInput = -5 ErrTokenizerCreationFailed = -6 ErrInvalidPath = -7 ErrFileNotFound = -8 ErrTruncationFailed = -9 ErrPaddingFailed = -10 ErrDecodeFailed = -11 ErrCStringConversionFailed = -12 ErrInvalidIDs = -13 ErrInvalidOptions = -14 )
const AbiCompatibilityConstraint = "^0.1.x"
AbiCompatibilityConstraint defines the required version range for ABI compatibility. The library version from Cargo.toml is used as the ABI version. Update this constraint when making breaking changes to the FFI interface.
const LibName = "tokenizers"
const TruncationMaxLengthDefault uintptr = 512 // Default truncation length, can be overridden by user
Variables ¶
var ( HFHubBaseURL = "https://huggingface.co" // Variable to allow testing with mock server // ErrCacheNotFound is returned when a requested cache file does not exist ErrCacheNotFound = errors.New("cache file not found") )
Functions ¶
func ClearHFCache ¶ added in v0.1.1
func ClearHFCache() error
ClearHFCache clears the entire HuggingFace tokenizer cache.
func ClearHFCachePattern ΒΆ added in v0.1.1
ClearHFCachePattern clears cache entries matching a glob pattern. The pattern is matched against model IDs (e.g., "bert-*", "huggingface/*"). Patterns use standard glob syntax: * matches any sequence, ? matches any single character.
Examples:
- "bert-*" matches all BERT model variants
- "huggingface/*" matches all models from the huggingface organization
- "*/bert-*" matches BERT models from any organization
For security, patterns that are absolute paths or that contain ".." as a path segment are rejected. Returns the number of cache entries cleared and any error encountered.
func ClearHFModelCache ΒΆ added in v0.1.1
ClearHFModelCache clears the cache for a specific model
func ClearLibraryCache ΒΆ
func ClearLibraryCache() error
ClearLibraryCache removes the cached library file
func DownloadAndCacheLibrary ΒΆ
func DownloadAndCacheLibrary() error
DownloadAndCacheLibrary downloads and caches the library for the current platform
func DownloadAndCacheLibraryWithVersion ΒΆ
DownloadAndCacheLibraryWithVersion downloads and caches a specific version of the library
func DownloadLibraryFromGitHub ΒΆ
DownloadLibraryFromGitHub downloads the platform-specific library from GitHub releases
func DownloadLibraryFromGitHubWithVersion ΒΆ
DownloadLibraryFromGitHubWithVersion downloads a specific version of the library
func GetAvailableVersions ΒΆ
GetAvailableVersions fetches available versions from GitHub releases
func GetCachedLibraryPath ΒΆ
func GetCachedLibraryPath() string
GetCachedLibraryPath returns the path where the library would be cached
func GetHFCacheInfo ΒΆ added in v0.1.1
GetHFCacheInfo returns information about the HuggingFace cache for a model
func GetLibraryInfo ΒΆ
func GetLibraryInfo() map[string]interface{}
GetLibraryInfo returns information about the current library setup
func GetLibraryVersion ΒΆ added in v0.1.1
func GetLibraryVersion() string
GetLibraryVersion returns the current library version used in User-Agent
func IsLibraryCached ΒΆ
func IsLibraryCached() bool
IsLibraryCached checks if the library is already cached and valid
func LoadTokenizerLibrary ¶
LoadTokenizerLibrary loads the tokenizer shared library from the specified path or attempts to find it through various fallback mechanisms:
1. User-provided path
2. TOKENIZERS_LIB_PATH environment variable
3. Cached library in platform-specific directory
4. Automatic download from GitHub releases
func MasksFromBuf ΒΆ
func SetLibraryVersion ΒΆ added in v0.1.1
func SetLibraryVersion(version string)
SetLibraryVersion sets the library version for User-Agent headers
func TokensFromBuf ΒΆ
Types ¶
type EncodeOption ΒΆ
type EncodeOption func(eo *EncodeOptions) error
func WithAddSpecialTokens ΒΆ
func WithAddSpecialTokens() EncodeOption
func WithReturnAllAttributes ΒΆ
func WithReturnAllAttributes() EncodeOption
func WithReturnAttentionMask ΒΆ
func WithReturnAttentionMask() EncodeOption
func WithReturnOffsets ΒΆ
func WithReturnOffsets() EncodeOption
func WithReturnSpecialTokensMask ΒΆ
func WithReturnSpecialTokensMask() EncodeOption
func WithReturnTokens ΒΆ
func WithReturnTokens() EncodeOption
func WithReturnTypeIDs ΒΆ
func WithReturnTypeIDs() EncodeOption
type EncodeOptions ΒΆ
type EncodeResult ΒΆ
type GitHubAsset ΒΆ
type GitHubRelease ΒΆ
type GitHubRelease struct {
TagName string `json:"tag_name"`
Assets []GitHubAsset `json:"assets"`
}
GitHub API structures
type HFConfig ΒΆ added in v0.1.1
type HFConfig struct {
Token string
Revision string
CacheDir string
Timeout time.Duration
MaxRetries int
OfflineMode bool
// UseLocalCache enables checking the HuggingFace hub cache before downloading
UseLocalCache bool
// CacheTTL specifies how long cached tokenizers are considered valid (0 = forever)
CacheTTL time.Duration
// MaxTokenizerSize is the maximum allowed size for tokenizer files in bytes
// (env: HF_MAX_TOKENIZER_SIZE, default: 500MB).
// When set to 0 (zero value), falls back to HF_MAX_TOKENIZER_SIZE environment variable,
// or DefaultMaxTokenizerSize (500MB) if the environment variable is not set.
// Use WithHFMaxTokenizerSize to explicitly set this value.
MaxTokenizerSize int64
// BaseURL is the base URL for HuggingFace Hub API (defaults to HFHubBaseURL if empty)
// This is primarily used for testing with mock servers
BaseURL string
// HTTP client pooling configuration
// These settings control connection reuse for improved performance.
// Config fields take priority over environment variables.
//
// IMPORTANT: The HTTP client is initialized once per process using sync.Once.
// Changes to these configuration values after the first HuggingFace download
// will NOT take effect. Set these values before any HuggingFace operations.
//
// Performance trade-offs:
// - Higher values: Better connection reuse, reduced latency for subsequent requests, but increased memory usage
// - Lower values: Reduced memory footprint, but more connection establishment overhead
//
// Recommended configurations:
// - High-throughput services: Increase HTTPMaxIdleConnsPerHost (e.g., 20-50) for parallel downloads
// - Resource-constrained environments: Reduce both values (e.g., 50/5) to minimize memory usage
// - Short-lived scripts: Reduce HTTPIdleTimeout (e.g., 10s) to release resources quickly
//
// Note: HTTPMaxIdleConns will be automatically adjusted to be >= HTTPMaxIdleConnsPerHost for logical consistency
//
// Debug mode: Set DEBUG=1 environment variable to see actual configuration values being used
HTTPMaxIdleConns int // Maximum idle connections across all hosts (env: HF_HTTP_MAX_IDLE_CONNS, default: 100, max: 1000)
HTTPMaxIdleConnsPerHost int // Maximum idle connections per host (env: HF_HTTP_MAX_IDLE_CONNS_PER_HOST, default: 10, max: 100)
HTTPIdleTimeout time.Duration // How long to keep idle connections open (env: HF_HTTP_IDLE_TIMEOUT, default: 90s)
}
HFConfig holds HuggingFace-specific configuration
type PaddingOptions ΒΆ
type PaddingOptions struct {
Enabled bool
Strategy PaddingStrategy
}
type PaddingStrategy ΒΆ
type PaddingStrategy struct {
Tag PaddingStrategyTag
FixedSize uintptr // Only valid if Tag == PaddingStrategyFixed
}
type PaddingStrategyTag ΒΆ
type PaddingStrategyTag int
const ( PaddingStrategyBatchLongest PaddingStrategyTag = iota PaddingStrategyFixed )
type StringResult ΒΆ
type Tokenizer ΒΆ
type Tokenizer struct {
LibraryPath string // Path to the shared library
TruncationEnabled bool
TruncationDirection TruncationDirection
TruncationStrategy TruncationStrategy
TruncationMaxLength uintptr // Maximum length for truncation
PaddingEnabled bool
PaddingStrategy PaddingStrategy // Strategy for padding
// contains filtered or unexported fields
}
func FromHuggingFace ΒΆ added in v0.1.1
func FromHuggingFace(modelID string, opts ...TokenizerOption) (*Tokenizer, error)
FromHuggingFace loads a tokenizer from HuggingFace Hub using the model identifier.
The model identifier can be in the format "organization/model" or just "model". For example: "bert-base-uncased", "google/flan-t5-base", "meta-llama/Llama-2-7b-hf".
By default, it loads from the "main" branch/revision. Use WithHFRevision to specify a different revision (branch, tag, or commit hash).
For private or gated models, authentication is required. Set the HF_TOKEN environment variable or use WithHFToken option.
The tokenizer is cached locally for faster subsequent loads. The cache location is platform-specific and can be overridden with WithHFCacheDir.
Example:
tokenizer, err := FromHuggingFace("bert-base-uncased")
if err != nil {
log.Fatal(err)
}
defer tokenizer.Close()
func (*Tokenizer) Encode ΒΆ
func (t *Tokenizer) Encode(message string, opts ...EncodeOption) (*EncodeResult, error)
func (*Tokenizer) EncodePair ΒΆ added in v0.1.2
func (t *Tokenizer) EncodePair(sequence string, pair string, opts ...EncodeOption) (*EncodeResult, error)
EncodePair encodes a single sequence pair. This is a convenience wrapper around EncodePairs for encoding a single pair.
func (*Tokenizer) EncodePairs ΒΆ added in v0.1.2
func (t *Tokenizer) EncodePairs(sequences []string, pairs []string, opts ...EncodeOption) ([]*EncodeResult, error)
EncodePairs encodes multiple sequence pairs in parallel. This is useful for reranking tasks where you need to encode query-document pairs.
func (*Tokenizer) GetLibraryVersion ΒΆ added in v0.1.1
GetLibraryVersion returns the version of the tokenizer library
type TokenizerOption ΒΆ
func WithHFCacheDir ΒΆ added in v0.1.1
func WithHFCacheDir(dir string) TokenizerOption
WithHFCacheDir sets a custom cache directory for HuggingFace tokenizers
func WithHFCacheTTL ΒΆ added in v0.1.1
func WithHFCacheTTL(ttl time.Duration) TokenizerOption
WithHFCacheTTL sets the cache time-to-live for cached tokenizers
func WithHFMaxTokenizerSize ¶ added in v0.1.1
func WithHFMaxTokenizerSize(maxSize int64) TokenizerOption
WithHFMaxTokenizerSize sets the maximum allowed size for tokenizer files, in bytes. The default is 500MB. Set to a very large value to effectively disable size validation.
func WithHFOfflineMode ΒΆ added in v0.1.1
func WithHFOfflineMode(offline bool) TokenizerOption
WithHFOfflineMode forces the tokenizer to only use cached versions
func WithHFRevision ΒΆ added in v0.1.1
func WithHFRevision(revision string) TokenizerOption
WithHFRevision sets the model revision (branch, tag, or commit hash)
func WithHFTimeout ΒΆ added in v0.1.1
func WithHFTimeout(timeout time.Duration) TokenizerOption
WithHFTimeout sets the download timeout for HuggingFace requests
func WithHFToken ΒΆ added in v0.1.1
func WithHFToken(token string) TokenizerOption
WithHFToken sets the HuggingFace API token for authentication
func WithHFUseLocalCache ΒΆ added in v0.1.1
func WithHFUseLocalCache(useCache bool) TokenizerOption
WithHFUseLocalCache enables or disables checking the HuggingFace hub cache
func WithLibraryPath ¶
func WithLibraryPath(path string) TokenizerOption
WithLibraryPath sets the path to the shared library for the tokenizer. This must be the path to the .so/.dylib/.dll file that contains the tokenizer implementation.
func WithPadding ΒΆ
func WithPadding(enabled bool, strategy PaddingStrategy) TokenizerOption
func WithTruncation ΒΆ
func WithTruncation(maxLen uintptr, direction TruncationDirection, strategy TruncationStrategy) TokenizerOption
type TokenizerOptions ΒΆ
type TokenizerOptions struct {
AddSpecialTokens bool
Trunc TruncationOptions
Pad PaddingOptions
}
type TokenizerResult ΒΆ
type TruncationDirection ΒΆ
type TruncationDirection uint8
const ( TruncationDirectionLeft TruncationDirection = iota TruncationDirectionRight )
const TruncationDirectionDefault TruncationDirection = TruncationDirectionRight
type TruncationOptions ΒΆ
type TruncationOptions struct {
Enabled bool
MaxLen uintptr
Strategy TruncationStrategy
Direction TruncationDirection
Stride uintptr
}
type TruncationStrategy ΒΆ
type TruncationStrategy uint8
const ( TruncationStrategyLongestFirst TruncationStrategy = iota TruncationStrategyOnlyFirst TruncationStrategyOnlySecond )
const TruncationStrategyDefault TruncationStrategy = TruncationStrategyLongestFirst