Documentation
¶
Index ¶
- Variables
- func ApplyBuiltinTemplate(templateName string, messages []ChatMessage, addAssistant bool) (string, error)
- func ApplyTemplateWithModel(model *Model, tmpl string, messages []ChatMessage, addAssistant bool) (string, error)
- func BuiltinTemplates() []string
- func CacheCount() int32
- func Cleanup()
- func ClearCache()
- func CosineSimilarity(a, b []float32) float64
- func DisableLogging()
- func DotProduct(a, b []float32) float64
- func EuclideanDistance(a, b []float32) float64
- func GPUBackendName() string
- func GPUCount() int
- func HasCUDA() bool
- func HasMetal() bool
- func HasVulkan() bool
- func Init() error
- func IsInitialized() bool
- func NormalizeEmbeddings(embd []float32)
- func SetLogCallback(fn LogCallback)
- func SetLogLevel(level LogLevel)
- type AttentionType
- type Batch
- func (b *Batch) Add(token Token, pos int32, seqID int32, logits bool) error
- func (b *Batch) AddSeq(token Token, pos int32, seqIDs []int32, logits bool) error
- func (b *Batch) AddTokens(tokens []Token, posStart int32, seqID int32, logitsLast bool) (int32, error)
- func (b *Batch) Capacity() int32
- func (b *Batch) Clear()
- func (b *Batch) Close() error
- func (b *Batch) Decode(ctx *Context) error
- func (b *Batch) Encode(ctx *Context) error
- func (b *Batch) NumTokens() int32
- func (b *Batch) SetLogits(idx int32, logits bool) error
- type BatchEmbeddings
- type ChatMessage
- type CompletionOptions
- type Context
- func (c *Context) BatchSize() uint32
- func (c *Context) Close() error
- func (ctx *Context) Complete(prompt string, opts CompletionOptions) (string, error)
- func (ctx *Context) CompleteNative(prompt string, opts CompletionOptions) (string, error)
- func (ctx *Context) CompleteNativeWithStopInfo(prompt string, opts CompletionOptions) (string, bool, error)
- func (ctx *Context) CompleteWithMessages(messages []ChatMessage, opts CompletionOptions) (string, error)
- func (ctx *Context) ComputeEmbedding(model *Model, text string, opts EmbeddingOptions) ([]float32, error)
- func (ctx *Context) ComputeEmbeddings(model *Model, texts []string, opts EmbeddingOptions) (*BatchEmbeddings, error)
- func (c *Context) ContextSize() uint32
- func (c *Context) CtxSeq() uint32
- func (ctx *Context) GetAllEmbeddings(nOutputs int) ([]float32, error)
- func (ctx *Context) GetEmbeddings(idx int32) ([]float32, error)
- func (ctx *Context) GetEmbeddingsBySeq(seqID int32) ([]float32, error)
- func (ctx *Context) GetLogits(idx int32) ([]float32, error)
- func (ctx *Context) Info() (ContextInfo, error)
- func (c *Context) KVCacheTypeK() GGMLType
- func (c *Context) KVCacheTypeV() GGMLType
- func (ctx *Context) MemoryCanShift() bool
- func (ctx *Context) MemoryClear(clearData bool) error
- func (ctx *Context) MemorySeqAdd(seqID, p0, p1, delta int32) error
- func (ctx *Context) MemorySeqCp(seqIDSrc, seqIDDst, p0, p1 int32) error
- func (ctx *Context) MemorySeqDiv(seqID, p0, p1, d int32)
- func (ctx *Context) MemorySeqKeep(seqID int32)
- func (ctx *Context) MemorySeqLength(seqID int32) int32
- func (ctx *Context) MemorySeqPosMax(seqID int32) int32
- func (ctx *Context) MemorySeqPosMin(seqID int32) int32
- func (ctx *Context) MemorySeqRm(seqID, p0, p1 int32) error
- func (c *Context) Model() *Model
- func (ctx *Context) NEmbd() int32
- func (ctx *Context) NVocab() int32
- func (ctx *Context) Perf() (PerfData, error)
- func (ctx *Context) PerfReset()
- func (ctx *Context) PoolingType() PoolingType
- func (c *Context) SeqMax() uint32
- func (ctx *Context) SetEmbeddings(enabled bool)
- func (c *Context) StateGetData() ([]byte, error)
- func (c *Context) StateGetSize() uint64
- func (c *Context) StateLoadFile(path string, maxTokens int) ([]Token, error)
- func (c *Context) StateSaveFile(path string, tokens []Token) error
- func (c *Context) StateSeqGetData(seqID int32) ([]byte, error)
- func (c *Context) StateSeqGetSize(seqID int32) uint64
- func (c *Context) StateSeqLoadFile(path string, seqID int32, maxTokens int) ([]Token, uint64, error)
- func (c *Context) StateSeqSaveFile(path string, seqID int32, tokens []Token) (uint64, error)
- func (c *Context) StateSeqSetData(seqID int32, data []byte) (uint64, error)
- func (c *Context) StateSetData(data []byte) (uint64, error)
- func (ctx *Context) Synchronize() error
- func (c *Context) UBatchSize() uint32
- type ContextInfo
- type ContextParams
- type DetokenizeOptions
- type EmbeddingOptions
- type FlashAttnType
- type GGMLType
- type GPUInfo
- type GrammarSampler
- type LazyGrammarOptions
- type LazyGrammarSampler
- type LogCallback
- type LogLevel
- type MirostatV2Params
- type MirostatV2Sampler
- type Model
- func (m *Model) AllMetadata() (map[string]string, error)
- func (m *Model) Arch() (string, error)
- func (m *Model) BOS() Token
- func (m *Model) ChatTemplate(templateName string) string
- func (m *Model) Close() error
- func (m *Model) ContextSize() int32
- func (m *Model) Description() (string, error)
- func (m *Model) Detokenize(tokens []Token, opts DetokenizeOptions) (string, error)
- func (m *Model) EOS() Token
- func (m *Model) EOT() Token
- func (m *Model) EmbeddingSize() int32
- func (m *Model) HasChatTemplate() bool
- func (m *Model) Info() (ModelInfo, error)
- func (m *Model) IsControl(token Token) bool
- func (m *Model) IsEOG(token Token) bool
- func (m *Model) LayerCount() int32
- func (m *Model) MetaCount() int
- func (m *Model) MetaKey(index int) (string, error)
- func (m *Model) MetaValue(key string) (string, error)
- func (m *Model) Metadata(key string) string
- func (m *Model) NCtxTrain() int32
- func (m *Model) NEmbd() int32
- func (m *Model) NHead() int32
- func (m *Model) NHeadKV() int32
- func (m *Model) NL() Token
- func (m *Model) NLayer() int32
- func (m *Model) Name() (string, error)
- func (m *Model) PAD() Token
- func (m *Model) ParamCount() uint64
- func (m *Model) Path() string
- func (m *Model) SizeBytes() uint64
- func (m *Model) SizeString() string
- func (m *Model) TokenToString(token Token) (string, error)
- func (m *Model) Tokenize(text string, opts TokenizeOptions) ([]Token, error)
- func (m *Model) VocabSize() int32
- type ModelInfo
- type ModelParams
- type PerfData
- type PoolingType
- type Sampler
- type SamplerChain
- func (sc *SamplerChain) Accept(token Token)
- func (sc *SamplerChain) AddDist(seed uint32)
- func (sc *SamplerChain) AddGreedy()
- func (sc *SamplerChain) AddMinP(p float32, minKeep int)
- func (sc *SamplerChain) AddPenalties(lastN int32, repeat, freq, presence float32)
- func (sc *SamplerChain) AddTemp(temp float32)
- func (sc *SamplerChain) AddTopK(k int32)
- func (sc *SamplerChain) AddTopP(p float32, minKeep int)
- func (sc *SamplerChain) Close() error
- func (sc *SamplerChain) Length() int32
- func (sc *SamplerChain) Reset()
- func (sc *SamplerChain) Sample(ctx *Context, idx int32) (Token, error)
- type SamplerParams
- type Token
- type TokenizeOptions
- type XTCParams
- type XTCSampler
Constants ¶
This section is empty.
Variables ¶
var ( ErrInvalidModel = llama.ErrInvalidModel ErrInvalidContext = llama.ErrInvalidContext ErrInvalidArgument = llama.ErrInvalidArgument ErrNoKVSlot = llama.ErrNoKVSlot ErrBatchFull = llama.ErrBatchFull ErrInvalidBatch = llama.ErrInvalidBatch ErrKeyNotFound = llama.ErrKeyNotFound ErrIndexOutOfRange = llama.ErrIndexOutOfRange ErrInvalidToken = llama.ErrInvalidToken )
var ErrPartialRemovalNotSupported = errors.New("partial memory removal not supported")
ErrPartialRemovalNotSupported is returned when partial KV cache removal is not supported.
Functions ¶
func ApplyBuiltinTemplate ¶
func ApplyBuiltinTemplate(templateName string, messages []ChatMessage, addAssistant bool) (string, error)
ApplyBuiltinTemplate formats messages using a named built-in template. Common templates: "llama2", "llama3", "chatml", "gemma", "phi3", etc.
func ApplyTemplateWithModel ¶
func ApplyTemplateWithModel(model *Model, tmpl string, messages []ChatMessage, addAssistant bool) (string, error)
ApplyTemplateWithModel formats messages using a specific template. If tmpl is empty, uses the model's default template. If model is nil, tmpl must be provided.
func BuiltinTemplates ¶
func BuiltinTemplates() []string
BuiltinTemplates returns the list of built-in chat template names
func Cleanup ¶
func Cleanup()
Cleanup frees all resources and shuts down the backend. This should be called when completely done with llama.cpp.
func CosineSimilarity ¶
CosineSimilarity computes the cosine similarity between two embedding vectors. Both vectors should be normalized for best results. Returns a value between -1 and 1.
func DotProduct ¶
DotProduct computes the dot product between two embedding vectors.
func EuclideanDistance ¶
EuclideanDistance computes the Euclidean distance between two embedding vectors.
func GPUBackendName ¶
func GPUBackendName() string
GPUBackendName returns the name of the GPU backend. Returns "Metal", "CUDA", "Vulkan", or "CPU".
func GPUCount ¶
func GPUCount() int
GPUCount returns the number of available GPU devices. Returns 0 if no GPU backend is available.
func Init ¶
func Init() error
Init initializes the llama.cpp backend. This is called automatically when loading a model, but can be called explicitly for early initialization. This function is idempotent - multiple calls are safe.
func IsInitialized ¶
func IsInitialized() bool
IsInitialized returns true if the backend has been initialized
func NormalizeEmbeddings ¶
func NormalizeEmbeddings(embd []float32)
NormalizeEmbeddings performs L2 normalization on an embedding vector in-place. This is commonly needed before computing cosine similarity.
func SetLogCallback ¶
func SetLogCallback(fn LogCallback)
SetLogCallback sets a custom log callback function. Pass nil to disable custom logging and restore default behavior. The callback receives the log level and message text.
func SetLogLevel ¶
func SetLogLevel(level LogLevel)
SetLogLevel sets the minimum log level. Messages below this level will be discarded.
Types ¶
type AttentionType ¶
type AttentionType int32
AttentionType specifies the attention pattern for embeddings
const ( AttentionUnspecified AttentionType = -1 // Auto-detect from model AttentionCausal AttentionType = 0 // Causal/autoregressive (GPT-style) AttentionNonCausal AttentionType = 1 // Non-causal/bidirectional (BERT-style) )
type Batch ¶
type Batch struct {
// contains filtered or unexported fields
}
Batch manages tokens to be processed by the model
func BatchFromTokens ¶
BatchFromTokens creates a batch pre-populated with the given tokens. All tokens are assigned to sequence 0 with sequential positions starting from posStart. If logitsLast is true, only the last token will have logits computed.
func NewBatch ¶
NewBatch creates a new batch that can hold up to nTokens tokens. nSeqMax is the maximum number of sequence IDs per token (usually 1).
func (*Batch) Add ¶
Add adds a single token to the batch. pos is the position in the sequence. seqID is the sequence ID. logits indicates whether to compute logits for this token. Returns ErrInvalidBatch if batch is closed, ErrBatchFull if batch is full.
func (*Batch) AddSeq ¶
AddSeq adds a single token with multiple sequence IDs. This is useful for shared prefixes across multiple sequences. Returns ErrInvalidBatch if batch is closed or seqIDs is empty, ErrBatchFull if batch is full.
func (*Batch) AddTokens ¶
func (b *Batch) AddTokens(tokens []Token, posStart int32, seqID int32, logitsLast bool) (int32, error)
AddTokens adds multiple tokens to the batch, all with the same sequence ID. posStart is the starting position for the first token. If logitsLast is true, only the last token will have logits computed. Returns the number of tokens actually added (may be less if batch fills up), or error. Returns ErrInvalidBatch if batch is closed.
func (*Batch) Decode ¶
Decode processes the batch using the given context. This runs the model forward pass and populates the KV cache. Returns nil on success. Returns ErrNoKVSlot if no KV cache slot is available (try smaller batch or larger context).
func (*Batch) Encode ¶
Encode processes the batch using the encoder (for encoder-decoder models). Returns nil on success.
type BatchEmbeddings ¶
type BatchEmbeddings struct {
Embeddings [][]float32 // One embedding vector per sequence/token
Dimension int // Embedding dimension
}
BatchEmbeddings holds multiple embeddings with their sequence information.
func ExtractBatchEmbeddings ¶
func ExtractBatchEmbeddings(flat []float32, nOutputs, nEmbd int) BatchEmbeddings
ExtractBatchEmbeddings extracts individual embeddings from a flattened buffer. nOutputs is the number of embeddings, nEmbd is the dimension.
func (*BatchEmbeddings) MostSimilar ¶
func (be *BatchEmbeddings) MostSimilar(queryIdx int, k int) []int
MostSimilar returns indices of the k most similar embeddings to the given query index. Excludes the query itself from results.
func (*BatchEmbeddings) Normalize ¶
func (be *BatchEmbeddings) Normalize()
Normalize normalizes all embeddings in the batch.
func (*BatchEmbeddings) SimilarityMatrix ¶
func (be *BatchEmbeddings) SimilarityMatrix() [][]float64
SimilarityMatrix computes pairwise cosine similarity between all embeddings. Returns an NxN matrix where result[i][j] is the similarity between embeddings i and j.
type ChatMessage ¶
type ChatMessage struct {
Role string // "system", "user", or "assistant"
Content string // The message content
}
ChatMessage represents a single message in a conversation
type CompletionOptions ¶
type CompletionOptions struct {
// MaxTokens is the maximum number of tokens to generate (0 = default 128)
MaxTokens int
// StopWords are strings that stop generation when encountered
StopWords []string
// OnToken callback is called for each generated token
// Return false to stop generation early
OnToken func(token string) bool
// Sampler parameters
SamplerParams SamplerParams
// EnablePrefixCaching reuses KV cache for matching prompt prefix
EnablePrefixCaching bool
// AbortContext cancels generation when done
AbortContext context.Context
}
CompletionOptions configures text generation
func DefaultCompletionOptions ¶
func DefaultCompletionOptions() CompletionOptions
DefaultCompletionOptions returns sensible defaults
type Context ¶
type Context struct {
// contains filtered or unexported fields
}
Context represents an inference context for a model
func NewContext ¶
func NewContext(model *Model, params ContextParams) (*Context, error)
NewContext creates a new inference context from a model
func (*Context) Complete ¶
func (ctx *Context) Complete(prompt string, opts CompletionOptions) (string, error)
Complete generates text completion for the given prompt. This Go implementation makes multiple CGO calls per token. For better performance, use CompleteNative, which keeps the generation loop in C++.
func (*Context) CompleteNative ¶
func (ctx *Context) CompleteNative(prompt string, opts CompletionOptions) (string, error)
CompleteNative generates text completion using the C++ generation loop. This minimizes CGO overhead by keeping the entire generation loop in C++.
func (*Context) CompleteNativeWithStopInfo ¶
func (ctx *Context) CompleteNativeWithStopInfo(prompt string, opts CompletionOptions) (string, bool, error)
CompleteNativeWithStopInfo generates text completion and returns whether a stop sequence was hit
func (*Context) CompleteWithMessages ¶
func (ctx *Context) CompleteWithMessages(messages []ChatMessage, opts CompletionOptions) (string, error)
CompleteWithMessages generates a completion using chat messages. The messages are formatted using the model's chat template.
func (*Context) ComputeEmbedding ¶
func (ctx *Context) ComputeEmbedding(model *Model, text string, opts EmbeddingOptions) ([]float32, error)
ComputeEmbedding computes embedding for a single text. Convenience wrapper around ComputeEmbeddings.
func (*Context) ComputeEmbeddings ¶
func (ctx *Context) ComputeEmbeddings(model *Model, texts []string, opts EmbeddingOptions) (*BatchEmbeddings, error)
ComputeEmbeddings computes embeddings for multiple texts in a single batch. This is the high-level API for batch embedding computation. Returns one embedding vector per input text.
func (*Context) ContextSize ¶
ContextSize returns the actual context size
func (*Context) GetAllEmbeddings ¶
GetAllEmbeddings returns all embeddings from the last decode. Use nOutputs to specify how many embeddings to extract (number of tokens with logits=true). The returned slice has size nOutputs * nEmbd.
func (*Context) GetEmbeddings ¶
GetEmbeddings returns the embeddings for the token at the given index. Use idx=-1 to get embeddings for the last token. Only available when context was created with embeddings=true.
func (*Context) GetEmbeddingsBySeq ¶
GetEmbeddingsBySeq returns the pooled embeddings for a specific sequence. This only works when pooling_type is not NONE. The sequence must have been processed in a batch.
func (*Context) GetLogits ¶
GetLogits returns the logits for the token at the given index after a decode. Use idx=-1 to get logits for the last token that had logits=true in the batch. Returns a slice of n_vocab floats representing log-probabilities for each token.
func (*Context) Info ¶
func (ctx *Context) Info() (ContextInfo, error)
Info returns runtime information about the context.
func (*Context) KVCacheTypeK ¶
KVCacheTypeK returns the data type used for the K cache
func (*Context) KVCacheTypeV ¶
KVCacheTypeV returns the data type used for the V cache
func (*Context) MemoryCanShift ¶
MemoryCanShift returns whether the memory supports context shifting
func (*Context) MemoryClear ¶
MemoryClear clears the memory (KV cache) contents. If clearData is true, the data buffers will also be cleared. Returns ErrInvalidContext if context is closed.
func (*Context) MemorySeqAdd ¶
MemorySeqAdd shifts positions in memory (for context shifting). Returns ErrInvalidContext if context is closed.
func (*Context) MemorySeqCp ¶
MemorySeqCp copies a sequence in memory. Returns ErrInvalidContext if context is closed.
func (*Context) MemorySeqDiv ¶
MemorySeqDiv divides positions in a sequence range [p0, p1) by d (integer division).
func (*Context) MemorySeqKeep ¶
MemorySeqKeep removes all tokens that do not belong to the specified sequence.
func (*Context) MemorySeqLength ¶
MemorySeqLength returns the number of tokens cached for a sequence. Returns 0 if the sequence is empty.
func (*Context) MemorySeqPosMax ¶
MemorySeqPosMax returns the maximum position for a sequence (-1 if empty)
func (*Context) MemorySeqPosMin ¶
MemorySeqPosMin returns the minimum position for a sequence (-1 if empty)
func (*Context) MemorySeqRm ¶
MemorySeqRm removes tokens from memory for a sequence. seqID: sequence ID (-1 for all sequences); p0: start position (inclusive, -1 for 0); p1: end position (exclusive, -1 for end). Returns ErrInvalidContext if context is closed, ErrPartialRemovalNotSupported if partial removal is not supported.
func (*Context) PerfReset ¶
func (ctx *Context) PerfReset()
PerfReset resets the performance counters.
func (*Context) PoolingType ¶
func (ctx *Context) PoolingType() PoolingType
PoolingType returns the pooling type configured for this context.
func (*Context) SetEmbeddings ¶
SetEmbeddings enables or disables embedding extraction mode. When enabled, embeddings will be computed during decode.
func (*Context) StateGetData ¶
StateGetData copies full context state into a byte slice. Returns the number of bytes written.
func (*Context) StateGetSize ¶
StateGetSize returns the size in bytes needed to save full context state.
func (*Context) StateLoadFile ¶
StateLoadFile loads full context state from a file. If maxTokens > 0, also loads tokens into the returned slice. Returns tokens and error.
func (*Context) StateSaveFile ¶
StateSaveFile saves full context state to a file. tokens: optional slice of tokens to save alongside the state. Returns nil on success, or an error on failure.
func (*Context) StateSeqGetData ¶
StateSeqGetData copies a sequence's state into a byte slice. Returns the number of bytes written.
func (*Context) StateSeqGetSize ¶
StateSeqGetSize returns the size in bytes needed to save a sequence's state.
func (*Context) StateSeqLoadFile ¶
func (c *Context) StateSeqLoadFile(path string, seqID int32, maxTokens int) ([]Token, uint64, error)
StateSeqLoadFile loads a sequence's state from a file. If maxTokens > 0, also loads tokens into the returned slice. Returns tokens read, bytes read, and error.
func (*Context) StateSeqSaveFile ¶
StateSeqSaveFile saves a sequence's state to a file. tokens: optional slice of tokens to save alongside the state. Returns the number of bytes written, or an error on failure.
func (*Context) StateSeqSetData ¶
StateSeqSetData restores a sequence's state from a byte slice. Returns the number of bytes read, or an error if restoration failed.
func (*Context) StateSetData ¶
StateSetData restores full context state from a byte slice. Returns the number of bytes read, or an error if restoration failed.
func (*Context) Synchronize ¶
Synchronize waits for all GPU operations to complete. Returns ErrInvalidContext if context is closed.
func (*Context) UBatchSize ¶
UBatchSize returns the physical maximum batch size
type ContextInfo ¶
type ContextInfo struct {
NCtx uint32 // Context size
NBatch uint32 // Batch size
NUBatch uint32 // Micro-batch size
NSeqMax uint32 // Max sequences
NThreads int32 // Thread count
}
ContextInfo contains runtime information about a context.
type ContextParams ¶
type ContextParams struct {
NCtx uint32 // Context size (0 = from model)
NBatch uint32 // Logical maximum batch size
NUBatch uint32 // Physical maximum batch size
NSeqMax uint32 // Max number of sequences
NThreads int32 // Threads for generation
NThreadsBatch int32 // Threads for batch processing
RopeFreqBase float32 // RoPE base frequency (0 = from model)
RopeFreqScale float32 // RoPE frequency scaling (0 = from model)
TypeK GGMLType // KV cache K type (-1 = default F16)
TypeV GGMLType // KV cache V type (-1 = default F16)
AttentionType AttentionType // Attention type for embeddings (-1 = auto)
FlashAttn FlashAttnType // Flash attention mode (-1 = auto, 0 = disabled, 1 = enabled)
Embeddings bool // Extract embeddings
OffloadKQV bool // Offload KQV ops to GPU
KVUnified bool // Use unified KV cache (required for encoder/BERT models)
NoPerf bool // Disable performance timings
}
ContextParams contains configuration for creating a context
func DefaultContextParams ¶
func DefaultContextParams() ContextParams
DefaultContextParams returns default context parameters
type DetokenizeOptions ¶
type DetokenizeOptions struct {
RemoveSpecial bool // Remove BOS/EOS tokens from output
UnparseSpecial bool // Render special tokens as text
}
DetokenizeOptions configures detokenization behavior
func DefaultDetokenizeOptions ¶
func DefaultDetokenizeOptions() DetokenizeOptions
DefaultDetokenizeOptions returns default detokenization options
type EmbeddingOptions ¶
type EmbeddingOptions struct {
// Normalize whether to L2-normalize embeddings (recommended for cosine similarity)
Normalize bool
// AddBOS whether to add BOS token at the start
AddBOS bool
// AddEOS whether to add EOS token at the end
AddEOS bool
}
EmbeddingOptions configures how embeddings are computed.
func DefaultEmbeddingOptions ¶
func DefaultEmbeddingOptions() EmbeddingOptions
DefaultEmbeddingOptions returns default options for embedding computation.
type FlashAttnType ¶
type FlashAttnType int32
FlashAttnType specifies flash attention mode
const ( FlashAttnAuto FlashAttnType = -1 // Auto-detect (recommended for embedding models) FlashAttnDisabled FlashAttnType = 0 // Disable flash attention FlashAttnEnabled FlashAttnType = 1 // Enable flash attention )
type GGMLType ¶
type GGMLType int32
GGMLType represents the data type for KV cache quantization
const ( GGMLTypeF32 GGMLType = 0 // 32-bit float (highest precision, most memory) GGMLTypeF16 GGMLType = 1 // 16-bit float (default, good balance) GGMLTypeQ4_0 GGMLType = 2 // 4-bit quantization GGMLTypeQ4_1 GGMLType = 3 // 4-bit quantization with offset GGMLTypeQ5_0 GGMLType = 6 // 5-bit quantization GGMLTypeQ5_1 GGMLType = 7 // 5-bit quantization with offset GGMLTypeQ8_0 GGMLType = 8 // 8-bit quantization (good quality, saves memory) GGMLTypeQ8_1 GGMLType = 9 // 8-bit quantization with offset GGMLTypeBF16 GGMLType = 30 // Brain float 16 )
type GPUInfo ¶
type GPUInfo struct {
DeviceID int32
DeviceName string
FreeMemoryBytes int64 // -1 if unknown
TotalMemoryBytes int64 // -1 if unknown
}
GPUInfo contains information about a GPU device
func GPUGetInfo ¶
GPUGetInfo returns information about a specific GPU device. Returns nil if the device_id is invalid or GPU is not available.
func (GPUInfo) FreeMemoryMB ¶
FreeMemoryMB returns free memory in megabytes, or -1 if unknown
func (GPUInfo) TotalMemoryMB ¶
TotalMemoryMB returns total memory in megabytes, or -1 if unknown
type GrammarSampler ¶
type GrammarSampler struct {
Sampler
}
GrammarSampler is a sampler that constrains generation using GBNF grammar. It's a specialized Sampler that enforces grammar rules on token selection.
func NewGrammarSampler ¶
func NewGrammarSampler(model *Model, grammarStr, grammarRoot string) (*GrammarSampler, error)
NewGrammarSampler creates a grammar-constrained sampler from GBNF grammar string The grammar uses GBNF (GGML BNF) format - see llama.cpp/grammars/README.md Example:
grammar := `root ::= "Yes" | "No"` sampler, err := NewGrammarSampler(model, grammar, "root")
type LazyGrammarOptions ¶
type LazyGrammarOptions struct {
// GrammarStr is the GBNF grammar string
GrammarStr string
// GrammarRoot is the start symbol name (default: "root")
GrammarRoot string
// TriggerPatterns are regex patterns that activate the grammar
// Pattern matches from start of generation, grammar gets content from first match group
TriggerPatterns []string
// TriggerTokens are token IDs that activate the grammar
// Grammar gets content starting from the trigger token (included)
TriggerTokens []Token
}
LazyGrammarOptions configures when grammar activation occurs
type LazyGrammarSampler ¶
type LazyGrammarSampler struct {
Sampler
}
LazyGrammarSampler creates a grammar sampler that activates based on triggers. Grammar is only applied after specific patterns or tokens appear in output. Useful for conditional grammar application (e.g., only enforce JSON after seeing "{").
func NewLazyGrammarSampler ¶
func NewLazyGrammarSampler(model *Model, opts LazyGrammarOptions) (*LazyGrammarSampler, error)
NewLazyGrammarSampler creates a grammar sampler triggered by patterns or tokens Example:
opts := LazyGrammarOptions{
GrammarStr: `root ::= object`,
TriggerPatterns: []string{`\{`}, // Activate when "{" appears
}
sampler, err := NewLazyGrammarSampler(model, opts)
type LogCallback ¶
LogCallback is a function that receives log messages
type LogLevel ¶
type LogLevel int
LogLevel represents the severity of a log message
const ( LogLevelNone LogLevel = C.LLAMA_GO_LOG_LEVEL_NONE LogLevelDebug LogLevel = C.LLAMA_GO_LOG_LEVEL_DEBUG LogLevelInfo LogLevel = C.LLAMA_GO_LOG_LEVEL_INFO LogLevelWarn LogLevel = C.LLAMA_GO_LOG_LEVEL_WARN LogLevelError LogLevel = C.LLAMA_GO_LOG_LEVEL_ERROR )
type MirostatV2Params ¶
type MirostatV2Params struct {
Seed uint32 // Random seed
Tau float32 // Target cross-entropy (higher = more random)
Eta float32 // Learning rate for mu updates
}
MirostatV2Params configures Mirostat v2 sampling
func DefaultMirostatV2Params ¶
func DefaultMirostatV2Params() MirostatV2Params
DefaultMirostatV2Params returns sensible Mirostat v2 defaults
type MirostatV2Sampler ¶
type MirostatV2Sampler struct {
Sampler
}
MirostatV2Sampler uses simplified Mirostat algorithm
func NewMirostatV2Sampler ¶
func NewMirostatV2Sampler(params MirostatV2Params) (*MirostatV2Sampler, error)
NewMirostatV2Sampler creates a Mirostat v2 sampler
type Model ¶
type Model struct {
// contains filtered or unexported fields
}
Model represents a loaded LLM model with caching support
func LoadModel ¶
func LoadModel(path string, params ModelParams) (*Model, error)
LoadModel loads a model from the given path. Models are cached and reference-counted; loading the same path multiple times returns the same cached model with an incremented reference count.
func (*Model) AllMetadata ¶
AllMetadata returns all metadata as a map. Returns ErrInvalidModel if model is closed.
func (*Model) Arch ¶
Arch returns the model architecture (e.g., "llama", "mistral", "phi"). Returns ErrInvalidModel if model is closed, ErrKeyNotFound if architecture not set.
func (*Model) ChatTemplate ¶
ChatTemplate returns the chat template from model metadata. The templateName can be empty for default, or a specific name like "tool_use".
func (*Model) Close ¶
Close releases the model reference. The underlying model is freed when the last reference is released.
func (*Model) ContextSize ¶
ContextSize returns the model's training context size
func (*Model) Description ¶
Description returns the model description from metadata. Returns ErrInvalidModel if model is closed, ErrKeyNotFound if description not set.
func (*Model) Detokenize ¶
func (m *Model) Detokenize(tokens []Token, opts DetokenizeOptions) (string, error)
Detokenize converts tokens back to text
func (*Model) EmbeddingSize ¶
EmbeddingSize returns the model's embedding dimension
func (*Model) HasChatTemplate ¶
HasChatTemplate returns true if the model has a chat template
func (*Model) LayerCount ¶
LayerCount returns the number of layers in the model
func (*Model) MetaKey ¶
MetaKey returns the metadata key at the given index. Returns ErrInvalidModel if model is closed, ErrIndexOutOfRange if index is invalid.
func (*Model) MetaValue ¶
MetaValue returns the metadata value for the given key. Returns ErrInvalidModel if model is closed, ErrKeyNotFound if key doesn't exist.
func (*Model) Name ¶
Name returns the model name from metadata. Returns ErrInvalidModel if model is closed, ErrKeyNotFound if name not set.
func (*Model) ParamCount ¶
ParamCount returns the total number of parameters in the model.
func (*Model) SizeString ¶
SizeString returns a human-readable model size (e.g., "4.2 GB").
func (*Model) TokenToString ¶
TokenToString converts a single token to its string representation. Returns ErrInvalidModel if model is closed, ErrInvalidToken if token is invalid.
type ModelInfo ¶
type ModelInfo struct {
NLayer int32 // Total number of layers
NHead int32 // Number of attention heads
NHeadKV int32 // Number of KV heads (for GQA/MQA)
NEmbd int32 // Embedding dimension
NCtxTrain int32 // Training context length
NParams uint64 // Total parameter count
ModelSize uint64 // Model size in bytes
}
ModelInfo contains runtime information about a loaded model.
type ModelParams ¶
type ModelParams struct {
NGPULayers int32 // Number of layers to offload to GPU (-1 = all)
MainGPU int32 // Main GPU index
UseMmap bool // Use memory mapping for model loading
UseMlock bool // Lock model in memory
}
ModelParams contains configuration for loading a model
func DefaultModelParams ¶
func DefaultModelParams() ModelParams
DefaultModelParams returns default model loading parameters
type PerfData ¶
type PerfData struct {
StartMs float64 // Absolute start time (ms)
LoadMs float64 // Model loading time (ms)
PromptMs float64 // Prompt processing time (ms)
GenerateMs float64 // Token generation time (ms)
PromptCount int32 // Prompt tokens processed
TokenCount int32 // Tokens generated
}
PerfData contains performance timing data.
func (PerfData) PromptTokensPerSecond ¶
PromptTokensPerSecond returns the prompt processing rate.
func (PerfData) TokensPerSecond ¶
TokensPerSecond returns the token generation rate.
type PoolingType ¶
type PoolingType int32
PoolingType represents the pooling strategy for embeddings
const ( PoolingUnspecified PoolingType = -1 PoolingNone PoolingType = 0 PoolingMean PoolingType = 1 PoolingCLS PoolingType = 2 PoolingLast PoolingType = 3 PoolingRank PoolingType = 4 )
func (PoolingType) String ¶
func (p PoolingType) String() string
String returns a human-readable name for the pooling type
type Sampler ¶
type Sampler struct {
// contains filtered or unexported fields
}
Sampler handles token selection from model output
func NewSampler ¶
func NewSampler(model *Model, params SamplerParams) (*Sampler, error)
NewSampler creates a sampler chain with the given parameters
func (*Sampler) ChainLength ¶
ChainLength returns the number of samplers in the chain
type SamplerChain ¶
type SamplerChain struct {
// contains filtered or unexported fields
}
SamplerChain allows building custom sampler chains
func NewSamplerChain ¶
func NewSamplerChain() *SamplerChain
NewSamplerChain creates a new empty sampler chain
func (*SamplerChain) Accept ¶
func (sc *SamplerChain) Accept(token Token)
Accept records a token for penalty tracking
func (*SamplerChain) AddDist ¶
func (sc *SamplerChain) AddDist(seed uint32)
AddDist adds a random distribution sampler
func (*SamplerChain) AddGreedy ¶
func (sc *SamplerChain) AddGreedy()
AddGreedy adds a greedy sampler (always pick highest probability)
func (*SamplerChain) AddMinP ¶
func (sc *SamplerChain) AddMinP(p float32, minKeep int)
AddMinP adds min-P filtering
func (*SamplerChain) AddPenalties ¶
func (sc *SamplerChain) AddPenalties(lastN int32, repeat, freq, presence float32)
AddPenalties adds repetition/frequency/presence penalties
func (*SamplerChain) AddTemp ¶
func (sc *SamplerChain) AddTemp(temp float32)
AddTemp adds temperature scaling
func (*SamplerChain) AddTopK ¶
func (sc *SamplerChain) AddTopK(k int32)
AddTopK adds top-K filtering
func (*SamplerChain) AddTopP ¶
func (sc *SamplerChain) AddTopP(p float32, minKeep int)
AddTopP adds top-P (nucleus) filtering
func (*SamplerChain) Length ¶
func (sc *SamplerChain) Length() int32
Length returns the number of samplers in the chain
type SamplerParams ¶
type SamplerParams struct {
// Seed for random sampling (0 = random seed)
Seed uint32
// Temperature controls randomness (1.0 = normal, <1.0 = more deterministic, >1.0 = more random)
// Set to 0 for greedy sampling
Temperature float32
// TopK limits sampling to top K tokens (0 = disabled)
TopK int32
// TopP (nucleus sampling) limits to tokens with cumulative probability <= P (1.0 = disabled)
TopP float32
// MinP filters tokens with probability < P * max_prob (0.0 = disabled)
MinP float32
// RepeatPenalty penalizes repeated tokens (1.0 = disabled)
RepeatPenalty float32
// RepeatLastN is the number of tokens to consider for repetition penalty
RepeatLastN int32
// FrequencyPenalty reduces probability of frequent tokens (0.0 = disabled)
FrequencyPenalty float32
// PresencePenalty reduces probability of tokens that appeared at all (0.0 = disabled)
PresencePenalty float32
}
SamplerParams configures sampling behavior
func DefaultSamplerParams ¶
func DefaultSamplerParams() SamplerParams
DefaultSamplerParams returns sensible default sampling parameters
func GreedySamplerParams ¶
func GreedySamplerParams() SamplerParams
GreedySamplerParams returns parameters for greedy (deterministic) sampling
type TokenizeOptions ¶
type TokenizeOptions struct {
AddSpecial bool // Add BOS/EOS tokens
ParseSpecial bool // Parse special tokens in text (like <|endoftext|>)
}
TokenizeOptions configures tokenization behavior
func DefaultTokenizeOptions ¶
func DefaultTokenizeOptions() TokenizeOptions
DefaultTokenizeOptions returns default tokenization options
type XTCParams ¶
type XTCParams struct {
Probability float32 // Probability threshold (0.0-1.0)
Threshold float32 // Logit threshold
MinKeep int // Minimum tokens to keep
Seed uint32 // Random seed
}
XTCParams configures XTC (Exclude Top Choices) sampling
func DefaultXTCParams ¶
func DefaultXTCParams() XTCParams
DefaultXTCParams returns sensible XTC defaults
type XTCSampler ¶
type XTCSampler struct {
Sampler
}
XTCSampler excludes top choices for diversity
func NewXTCSampler ¶
func NewXTCSampler(params XTCParams) (*XTCSampler, error)
NewXTCSampler creates an XTC sampler for diversity