Documentation
¶
Index ¶
- Constants
- func SpanName(args ...string) string
- type CachedModel
- type ChatChunk
- type ChatMessage
- type ChatRequest
- type ChatResponse
- type CompletionChunk
- type CompletionRequest
- type CompletionResponse
- type ContextRequest
- type DetokenizeRequest
- type DetokenizeResponse
- type EmbedRequest
- type EmbedResponse
- type GPUDevice
- type GPUInfo
- type LoadModelRequest
- type Model
- type ModelPullProgress
- type ModelRuntime
- type PullModelRequest
- type ServerModel
- type Token
- type TokenizeRequest
- type TokenizeResponse
- type Usage
Constants ¶
const ( CompletionFinishReasonMaxTokens = "max_tokens" CompletionFinishReasonStop = "stop" CompletionFinishReasonEOS = "eos" )
const ( CompletionStreamDeltaType = "completion.delta" CompletionStreamDoneType = "completion.done" CompletionStreamErrorType = "completion.error" ModelPullProgressType = "model.pull.progress" ModelPullCompleteType = "model.pull.complete" ModelPullErrorType = "model.pull.error" )
Variables ¶
This section is empty.
Functions ¶
Types ¶
type CachedModel ¶
type CachedModel struct {
ServerModel
Model
LoadedAt time.Time `json:"loaded_at,omitzero"`
Runtime *ModelRuntime `json:"runtime,omitempty"`
}
CachedModel represents a model loaded in memory. For server builds, it embeds ServerModel which provides the Handle and RWMutex.
func (CachedModel) String ¶
func (m CachedModel) String() string
type ChatChunk ¶ added in v0.0.5
type ChatChunk struct {
Message ChatMessage `json:"message"`
}
ChatChunk contains a streamed chat chunk.
type ChatMessage ¶ added in v0.0.5
type ChatMessage struct {
Role string `json:"role"` // "system", "user", "assistant", or "tool"
Content string `json:"content"` // The message content
}
ChatMessage represents a single message in a conversation.
type ChatRequest ¶ added in v0.0.5
type ChatRequest struct {
CompletionRequest
Messages []ChatMessage `json:"messages"`
}
ChatRequest contains parameters for chat completion. It embeds CompletionRequest to reuse sampling and model options.
func (ChatRequest) String ¶ added in v0.0.5
func (r ChatRequest) String() string
type ChatResponse ¶ added in v0.0.5
type ChatResponse struct {
Model string `json:"model"` // Model used
Thinking *ChatMessage `json:"thinking,omitempty"` // Optional reasoning message
Message ChatMessage `json:"message"` // Assistant message
Usage Usage `json:"usage"` // Token usage
FinishReason string `json:"finish_reason,omitempty"` // Reason generation ended
}
ChatResponse contains the generated assistant message.
func (ChatResponse) String ¶ added in v0.0.5
func (r ChatResponse) String() string
type CompletionChunk ¶
type CompletionChunk struct {
Text string `json:"text"` // Chunk text
}
CompletionChunk contains a streamed completion chunk.
func (CompletionChunk) String ¶
func (r CompletionChunk) String() string
type CompletionRequest ¶
type CompletionRequest struct {
Model string `json:"model"` // Model name
Prompt string `json:"prompt"` // Prompt to complete
MaxTokens *int32 `json:"max_tokens,omitempty"` // Max tokens to generate
Temperature *float32 `json:"temperature,omitempty"` // Sampling temperature
TopP *float32 `json:"top_p,omitempty"` // Nucleus sampling
TopK *int32 `json:"top_k,omitempty"` // Top-k sampling
RepeatPenalty *float32 `json:"repeat_penalty,omitempty"` // Penalize repeats (1.0 = disabled)
RepeatLastN *int32 `json:"repeat_last_n,omitempty"` // Repeat penalty window size
Seed *uint32 `json:"seed,omitempty"` // RNG seed
Stop []string `json:"stop,omitempty"` // Stop words
PrefixCache *bool `json:"prefix_cache,omitempty"` // Enable prefix caching
}
CompletionRequest contains parameters for text completion.
func (CompletionRequest) String ¶
func (r CompletionRequest) String() string
type CompletionResponse ¶
type CompletionResponse struct {
Model string `json:"model"` // Model used
Text string `json:"text"` // Completion text
Usage Usage `json:"usage"` // Token usage
FinishReason string `json:"finish_reason,omitempty"` // Reason generation ended
}
CompletionResponse contains the generated completion.
func (CompletionResponse) String ¶
func (r CompletionResponse) String() string
type ContextRequest ¶
type ContextRequest struct {
LoadModelRequest
ContextSize *uint32 `json:"context_size,omitempty"` // Context size (nil = from model)
BatchSize *uint32 `json:"batch_size,omitempty"` // Logical batch size (nil = default)
UBatchSize *uint32 `json:"ubatch_size,omitempty"` // Physical/micro batch size (nil = default, must equal batch_size for encoder models)
Threads *int32 `json:"threads,omitempty"` // Number of threads (nil = default)
AttentionType *int32 `json:"attention_type,omitempty"` // Attention type: -1=auto, 0=causal, 1=non-causal (nil = auto)
FlashAttn *int32 `json:"flash_attn,omitempty"` // Flash attention: -1=auto, 0=disabled, 1=enabled (nil = auto)
Embeddings *bool `json:"embeddings,omitempty"` // Enable embeddings extraction (nil = false)
KVUnified *bool `json:"kv_unified,omitempty"` // Use unified KV cache (nil = default, required for BERT)
}
ContextRequest contains parameters for creating an inference context.
func (ContextRequest) String ¶
func (r ContextRequest) String() string
type DetokenizeRequest ¶
type DetokenizeRequest struct {
Model string `json:"model"` // Model name or path (must be loaded)
Tokens []Token `json:"tokens"` // Tokens to detokenize
RemoveSpecial *bool `json:"remove_special,omitempty"` // Remove BOS/EOS tokens (default: false)
UnparseSpecial *bool `json:"unparse_special,omitempty"` // Render special tokens as text (default: true)
}
DetokenizeRequest contains parameters for detokenizing tokens.
func (DetokenizeRequest) String ¶
func (r DetokenizeRequest) String() string
type DetokenizeResponse ¶
type DetokenizeResponse struct {
Text string `json:"text"`
}
DetokenizeResponse contains the result of detokenization.
func (DetokenizeResponse) String ¶
func (r DetokenizeResponse) String() string
type EmbedRequest ¶
type EmbedRequest struct {
Model string `json:"model"` // Model name
Input []string `json:"input"` // Text(s) to embed
Normalize *bool `json:"normalize,omitempty"` // L2-normalize embeddings (default: true)
}
EmbedRequest contains parameters for generating embeddings.
func (EmbedRequest) String ¶
func (r EmbedRequest) String() string
type EmbedResponse ¶
type EmbedResponse struct {
Model string `json:"model"` // Model used
Embeddings [][]float32 `json:"embeddings"` // One embedding vector per input
Dimension int `json:"dimension"` // Embedding dimension
Usage Usage `json:"usage"` // Token usage
}
EmbedResponse contains the generated embeddings.
func (EmbedResponse) String ¶
func (r EmbedResponse) String() string
type GPUDevice ¶
type GPUDevice struct {
ID int32 `json:"id"`
Name string `json:"name"`
FreeMemoryBytes int64 `json:"free_memory_bytes"` // -1 if unknown
TotalMemoryBytes int64 `json:"total_memory_bytes"` // -1 if unknown
}
GPUDevice represents information about a single GPU device.
type GPUInfo ¶
type GPUInfo struct {
Backend string `json:"backend"` // "Metal", "CUDA", "Vulkan", "CPU"
Devices []GPUDevice `json:"devices"`
}
GPUInfo represents the GPU/accelerator configuration.
type LoadModelRequest ¶
type LoadModelRequest struct {
Name string `json:"name"` // Model name or path to load
Load *bool `json:"load,omitempty"` // Load (true) or unload (false) model (nil = load)
Gpu *int32 `json:"gpu,omitempty"` // Main GPU index (nil = default)
Layers *int32 `json:"gpu_layers,omitempty"` // Number of layers to offload to GPU (nil = default, -1 = all)
Mmap *bool `json:"use_mmap,omitempty"` // Use memory mapping for model loading (nil = default)
Mlock *bool `json:"use_mlock,omitempty"` // Lock model in memory (nil = default)
}
LoadModelRequest contains the parameters for loading a model into memory.
func (LoadModelRequest) String ¶
func (r LoadModelRequest) String() string
type Model ¶
type Model struct {
// Identity
Path string `json:"path,omitempty"`
Name string `json:"name,omitempty"`
Architecture string `json:"architecture,omitempty"`
Description string `json:"description,omitempty"`
// Chat template
ChatTemplate string `json:"chatTemplate,omitempty"`
// Dimensions
ContextSize int32 `json:"contextSize,omitempty"`
EmbeddingSize int32 `json:"embeddingSize,omitempty"`
LayerCount int32 `json:"layerCount,omitempty"`
HeadCount int32 `json:"headCount,omitempty"`
HeadKVCount int32 `json:"headKVCount,omitempty"`
// Raw metadata key/value pairs from the model
Meta map[string]any `json:"meta,omitempty"`
}
Model represents model metadata and capabilities (excluding load params).
func NewModelFromGGUF ¶
NewModelFromGGUF builds a schema Model from a GGUF file context. The relPath is the relative path from basePath to the model file. This is a lightweight way to get model metadata without loading the full model.
type ModelPullProgress ¶
type ModelPullProgress struct {
Filename string `json:"model"`
BytesReceived uint64 `json:"bytes_received"`
TotalBytes uint64 `json:"total_bytes,omitempty"`
Percentage float64 `json:"percent,omitempty"`
}
ModelPullProgress represents progress information during model download.
type ModelRuntime ¶
type ModelRuntime struct {
NLayer int32 `json:"layerCount,omitempty"`
NHead int32 `json:"headCount,omitempty"`
NHeadKV int32 `json:"headKVCount,omitempty"`
NEmbd int32 `json:"embeddingSize,omitempty"`
NCtxTrain int32 `json:"contextSize,omitempty"`
NParams uint64 `json:"paramCount,omitempty"`
ModelSize uint64 `json:"modelSizeBytes,omitempty"`
}
ModelRuntime represents runtime statistics for a loaded model.
func (ModelRuntime) String ¶ added in v0.0.3
func (m ModelRuntime) String() string
type PullModelRequest ¶
type PullModelRequest struct {
URL string `json:"url"` // URL to download the model from (supports hf:// and https://)
}
PullModelRequest contains the parameters for downloading a model from a URL.
func (PullModelRequest) String ¶
func (r PullModelRequest) String() string
type ServerModel ¶ added in v0.0.3
ServerModel represents a model loaded in memory on the server. It includes the C model handle and synchronization primitives.
type TokenizeRequest ¶
type TokenizeRequest struct {
Model string `json:"model"` // Model name or path (must be loaded)
Text string `json:"text"` // Text to tokenize
AddSpecial *bool `json:"add_special,omitempty"` // Add BOS/EOS tokens (default: true)
ParseSpecial *bool `json:"parse_special,omitempty"` // Parse special tokens in text (default: false)
}
TokenizeRequest contains parameters for tokenizing text.
func (TokenizeRequest) String ¶
func (r TokenizeRequest) String() string
type TokenizeResponse ¶
type TokenizeResponse struct {
Tokens []Token `json:"tokens"`
}
TokenizeResponse contains the result of tokenization.
func (TokenizeResponse) String ¶
func (r TokenizeResponse) String() string
type Usage ¶
type Usage struct {
InputTokens int `json:"input_tokens"` // Tokens in input (prompt/text to embed)
OutputTokens int `json:"output_tokens"` // Tokens generated (0 for embeddings)
}
Usage tracks token usage for requests.
func (Usage) TotalTokens ¶
TotalTokens returns the sum of input and output tokens.