Documentation
¶
Index ¶
- Constants
- func SpanName(args ...string) string
- type CachedModel
- type ChatChunk
- type ChatMessage
- type ChatRequest
- type ChatResponse
- type CompletionChunk
- type CompletionRequest
- type CompletionResponse
- type ContextRequest
- type DetokenizeRequest
- type DetokenizeResponse
- type EmbedRequest
- type EmbedResponse
- type GPUDevice
- type GPUInfo
- type LoadModelRequest
- type Model
- type ModelPullProgress
- type ModelRuntime
- type PullModelRequest
- type ServerModel
- type Token
- type TokenizeRequest
- type TokenizeResponse
- type Usage
Constants ¶
const ( CompletionFinishReasonMaxTokens = "max_tokens" CompletionFinishReasonStop = "stop" CompletionFinishReasonEOS = "eos" )
const ( CompletionStreamDeltaType = "completion.delta" CompletionStreamDoneType = "completion.done" CompletionStreamErrorType = "completion.error" ModelPullProgressType = "model.pull.progress" ModelPullCompleteType = "model.pull.complete" ModelPullErrorType = "model.pull.error" )
Variables ¶
This section is empty.
Functions ¶
Types ¶
type CachedModel ¶
type CachedModel struct {
ServerModel
Model
LoadedAt time.Time `json:"loaded_at,omitzero"`
Runtime *ModelRuntime `json:"runtime,omitempty"`
}
CachedModel represents a model loaded in memory. For server builds, it embeds ServerModel which provides the Handle and RWMutex.
func (CachedModel) String ¶
func (m CachedModel) String() string
type ChatChunk ¶ added in v0.0.5
type ChatChunk struct {
Message ChatMessage `json:"message"`
}
ChatChunk contains a streamed chat chunk.
type ChatMessage ¶ added in v0.0.5
type ChatMessage struct {
Role string `json:"role"` // "system", "user", "assistant", or "tool"
Content string `json:"content"` // The message content
}
ChatMessage represents a single message in a conversation.
type ChatRequest ¶ added in v0.0.5
type ChatRequest struct {
CompletionRequest
Messages []ChatMessage `json:"messages"`
}
ChatRequest contains parameters for chat completion. It embeds CompletionRequest to reuse sampling and model options.
func (ChatRequest) String ¶ added in v0.0.5
func (r ChatRequest) String() string
type ChatResponse ¶ added in v0.0.5
type ChatResponse struct {
Model string `json:"model"` // Model used
Thinking *ChatMessage `json:"thinking,omitempty"` // Optional reasoning message
Message ChatMessage `json:"message"` // Assistant message
Usage Usage `json:"usage"` // Token usage
FinishReason string `json:"finish_reason,omitempty"` // Reason generation ended
}
ChatResponse contains the generated assistant message.
func (ChatResponse) String ¶ added in v0.0.5
func (r ChatResponse) String() string
type CompletionChunk ¶
type CompletionChunk struct {
Text string `json:"text"` // Chunk text
}
CompletionChunk contains a streamed completion chunk.
func (CompletionChunk) String ¶
func (r CompletionChunk) String() string
type CompletionRequest ¶
type CompletionRequest struct {
Model string `json:"model"` // Model name
Prompt string `json:"prompt"` // Prompt to complete
MaxTokens *int32 `json:"max_tokens,omitempty"` // Max tokens to generate
Temperature *float32 `json:"temperature,omitempty"` // Sampling temperature
TopP *float32 `json:"top_p,omitempty"` // Nucleus sampling
TopK *int32 `json:"top_k,omitempty"` // Top-k sampling
RepeatPenalty *float32 `json:"repeat_penalty,omitempty"` // Penalize repeats (1.0 = disabled)
RepeatLastN *int32 `json:"repeat_last_n,omitempty"` // Repeat penalty window size
Seed *uint32 `json:"seed,omitempty"` // RNG seed
Stop []string `json:"stop,omitempty"` // Stop words
PrefixCache *bool `json:"prefix_cache,omitempty"` // Enable prefix caching
}
CompletionRequest contains parameters for text completion.
func (CompletionRequest) String ¶
func (r CompletionRequest) String() string
type CompletionResponse ¶
type CompletionResponse struct {
Model string `json:"model"` // Model used
Text string `json:"text"` // Completion text
Usage Usage `json:"usage"` // Token usage
FinishReason string `json:"finish_reason,omitempty"` // Reason generation ended
}
CompletionResponse contains the generated completion.
func (CompletionResponse) String ¶
func (r CompletionResponse) String() string
type ContextRequest ¶
type ContextRequest struct {
LoadModelRequest
ContextSize *uint32 `json:"context_size,omitempty"` // Context size (nil = from model)
BatchSize *uint32 `json:"batch_size,omitempty"` // Logical batch size (nil = default)
UBatchSize *uint32 `json:"ubatch_size,omitempty"` // Physical/micro batch size (nil = default, must equal batch_size for encoder models)
Threads *int32 `json:"threads,omitempty"` // Number of threads (nil = default)
AttentionType *int32 `json:"attention_type,omitempty"` // Attention type: -1=auto, 0=causal, 1=non-causal (nil = auto)
FlashAttn *int32 `json:"flash_attn,omitempty"` // Flash attention: -1=auto, 0=disabled, 1=enabled (nil = auto)
Embeddings *bool `json:"embeddings,omitempty"` // Enable embeddings extraction (nil = false)
KVUnified *bool `json:"kv_unified,omitempty"` // Use unified KV cache (nil = default, required for BERT)
}
ContextRequest contains parameters for creating an inference context.
func (ContextRequest) String ¶
func (r ContextRequest) String() string
type DetokenizeRequest ¶
type DetokenizeRequest struct {
Model string `json:"model"` // Model name or path (must be loaded)
Tokens []Token `json:"tokens"` // Tokens to detokenize
RemoveSpecial *bool `json:"remove_special,omitempty"` // Remove BOS/EOS tokens (default: false)
UnparseSpecial *bool `json:"unparse_special,omitempty"` // Render special tokens as text (default: true)
}
DetokenizeRequest contains parameters for detokenizing tokens.
func (DetokenizeRequest) String ¶
func (r DetokenizeRequest) String() string
type DetokenizeResponse ¶
type DetokenizeResponse struct {
Text string `json:"text"`
}
DetokenizeResponse contains the result of detokenization.
func (DetokenizeResponse) String ¶
func (r DetokenizeResponse) String() string
type EmbedRequest ¶
type EmbedRequest struct {
Model string `json:"model"` // Model name
Input []string `json:"input"` // Text(s) to embed
Normalize *bool `json:"normalize,omitempty"` // L2-normalize embeddings (default: true)
}
EmbedRequest contains parameters for generating embeddings.
func (EmbedRequest) String ¶
func (r EmbedRequest) String() string
type EmbedResponse ¶
type EmbedResponse struct {
Model string `json:"model"` // Model used
Embeddings [][]float32 `json:"embeddings"` // One embedding vector per input
Dimension int `json:"dimension"` // Embedding dimension
Usage Usage `json:"usage"` // Token usage
}
EmbedResponse contains the generated embeddings.
func (EmbedResponse) String ¶
func (r EmbedResponse) String() string
type GPUDevice ¶
type GPUDevice struct {
ID int32 `json:"id"`
Name string `json:"name"`
FreeMemoryBytes int64 `json:"free_memory_bytes"` // -1 if unknown
TotalMemoryBytes int64 `json:"total_memory_bytes"` // -1 if unknown
}
GPUDevice represents information about a single GPU device.
type GPUInfo ¶
type GPUInfo struct {
Backend string `json:"backend"` // "Metal", "CUDA", "Vulkan", "CPU"
Devices []GPUDevice `json:"devices"`
}
GPUInfo represents the GPU/accelerator configuration.
type LoadModelRequest ¶
type LoadModelRequest struct {
Name string `json:"name"` // Model name or path to load
Load *bool `json:"load,omitempty"` // Load (true) or unload (false) model (nil = load)
Gpu *int32 `json:"gpu,omitempty"` // Main GPU index (nil = default)
Layers *int32 `json:"gpu_layers,omitempty"` // Number of layers to offload to GPU (nil = default, -1 = all)
Mmap *bool `json:"use_mmap,omitempty"` // Use memory mapping for model loading (nil = default)
Mlock *bool `json:"use_mlock,omitempty"` // Lock model in memory (nil = default)
}
LoadModelRequest contains the parameters for loading a model into memory.
func (LoadModelRequest) String ¶
func (r LoadModelRequest) String() string
type Model ¶
type Model struct {
// Identity
Path string `json:"path,omitempty"`
Name string `json:"name,omitempty"`
Architecture string `json:"architecture,omitempty"`
Description string `json:"description,omitempty"`
// Chat template
ChatTemplate string `json:"chatTemplate,omitempty"`
// Dimensions
ContextSize int32 `json:"contextSize,omitempty"`
EmbeddingSize int32 `json:"embeddingSize,omitempty"`
LayerCount int32 `json:"layerCount,omitempty"`
HeadCount int32 `json:"headCount,omitempty"`
HeadKVCount int32 `json:"headKVCount,omitempty"`
// Raw metadata key/value pairs from the model
Meta map[string]any `json:"meta,omitempty"`
}
Model represents model metadata and capabilities (excluding load params).
func NewModelFromGGUF ¶
NewModelFromGGUF builds a schema Model from a GGUF file context. The relPath is the relative path from basePath to the model file. This is a lightweight way to get model metadata without loading the full model.
type ModelPullProgress ¶
type ModelPullProgress struct {
Filename string `json:"model"`
BytesReceived uint64 `json:"bytes_received"`
TotalBytes uint64 `json:"total_bytes,omitempty"`
Percentage float64 `json:"percent,omitempty"`
}
ModelPullProgress represents progress information during model download.
type ModelRuntime ¶
type ModelRuntime struct {
NLayer int32 `json:"layerCount,omitempty"`
NHead int32 `json:"headCount,omitempty"`
NHeadKV int32 `json:"headKVCount,omitempty"`
NEmbd int32 `json:"embeddingSize,omitempty"`
NCtxTrain int32 `json:"contextSize,omitempty"`
NParams uint64 `json:"paramCount,omitempty"`
ModelSize uint64 `json:"modelSizeBytes,omitempty"`
}
ModelRuntime represents runtime statistics for a loaded model.
func (ModelRuntime) String ¶ added in v0.0.3
func (m ModelRuntime) String() string
type PullModelRequest ¶
type PullModelRequest struct {
URL string `json:"url"` // URL to download the model from (supports hf:// and https://)
}
PullModelRequest contains the parameters for downloading a model from a URL.
func (PullModelRequest) String ¶
func (r PullModelRequest) String() string
type ServerModel ¶ added in v0.0.3
ServerModel represents a model loaded in memory on the server. It includes the C model handle and synchronization primitives.
type TokenizeRequest ¶
type TokenizeRequest struct {
Model string `json:"model"` // Model name or path (must be loaded)
Text string `json:"text"` // Text to tokenize
AddSpecial *bool `json:"add_special,omitempty"` // Add BOS/EOS tokens (default: true)
ParseSpecial *bool `json:"parse_special,omitempty"` // Parse special tokens in text (default: false)
}
TokenizeRequest contains parameters for tokenizing text.
func (TokenizeRequest) String ¶
func (r TokenizeRequest) String() string
type TokenizeResponse ¶
type TokenizeResponse struct {
Tokens []Token `json:"tokens"`
}
TokenizeResponse contains the result of tokenization.
func (TokenizeResponse) String ¶
func (r TokenizeResponse) String() string
type Usage ¶
type Usage struct {
InputTokens int `json:"input_tokens"` // Tokens in input (prompt/text to embed)
OutputTokens int `json:"output_tokens"` // Tokens generated (0 for embeddings)
}
Usage tracks token usage for requests.
func (Usage) TotalTokens ¶
TotalTokens returns the sum of input and output tokens.