llm

package
v0.5.13 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 3, 2025 License: MIT Imports: 28 Imported by: 11

Documentation

Index

Constants

This section is empty.

Variables

View Source
var LlamaServerSysProcAttr = &syscall.SysProcAttr{}

Functions

func LoadModel added in v0.1.33

func LoadModel(model string, maxArraySize int) (*ggml.GGML, error)

LoadModel will load a model from disk. The model must be in the GGML format.

It collects array values for arrays with a size less than or equal to maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If the maxArraySize is negative, all arrays are collected.

func PredictServerFit added in v0.1.33

func PredictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options) (bool, uint64)

This algorithm looks for a complete fit to determine if we need to unload other models.

Types

type CompletionRequest added in v0.1.32

type CompletionRequest struct {
	Prompt  string
	Format  json.RawMessage
	Images  []ImageData
	Options *api.Options
}

type CompletionResponse added in v0.1.32

type CompletionResponse struct {
	Content            string
	DoneReason         string
	Done               bool
	PromptEvalCount    int
	PromptEvalDuration time.Duration
	EvalCount          int
	EvalDuration       time.Duration
}

type DetokenizeRequest

type DetokenizeRequest struct {
	Tokens []int `json:"tokens"`
}

type DetokenizeResponse

type DetokenizeResponse struct {
	Content string `json:"content"`
}

type EmbeddingRequest

type EmbeddingRequest struct {
	Content string `json:"content"`
}

type EmbeddingResponse

type EmbeddingResponse struct {
	Embedding []float32 `json:"embedding"`
}

type ImageData

type ImageData struct {
	Data          []byte `json:"data"`
	ID            int    `json:"id"`
	AspectRatioID int    `json:"aspect_ratio_id"`
}

type LlamaServer added in v0.1.32

type LlamaServer interface {
	Ping(ctx context.Context) error
	WaitUntilRunning(ctx context.Context) error
	Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
	Embedding(ctx context.Context, input string) ([]float32, error)
	Tokenize(ctx context.Context, content string) ([]int, error)
	Detokenize(ctx context.Context, tokens []int) (string, error)
	Close() error
	EstimatedVRAM() uint64 // Total VRAM across all GPUs
	EstimatedTotal() uint64
	EstimatedVRAMByGPU(gpuID string) uint64
}

func NewLlamaServer added in v0.1.32

func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error)

NewLlamaServer will run a server for the given GPUs. The GPU list must be a single family.

type MemoryEstimate added in v0.1.45

type MemoryEstimate struct {
	// How many layers we predict we can load
	Layers int

	// The size of the graph which occupies the main GPU
	Graph uint64

	// How much VRAM will be allocated given the number of layers we predict
	VRAMSize uint64

	// The total size of the model if loaded into VRAM.  If all layers are loaded, VRAMSize == TotalSize
	TotalSize uint64

	// For multi-GPU scenarios, this provides the tensor split parameter
	TensorSplit string

	// For multi-GPU scenarios, this is the size in bytes per GPU
	GPUSizes []uint64
	// contains filtered or unexported fields
}

func EstimateGPULayers added in v0.1.33

func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []string, opts api.Options) MemoryEstimate

Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size. The GPUs provided must all be the same Library.

func (MemoryEstimate) LogValue added in v0.5.12

func (m MemoryEstimate) LogValue() slog.Value

type ServerStatus added in v0.1.32

type ServerStatus int
const (
	ServerStatusReady ServerStatus = iota
	ServerStatusNoSlotsAvailable
	ServerStatusLoadingModel
	ServerStatusNotResponding
	ServerStatusError
)

func (ServerStatus) ToString added in v0.1.33

func (s ServerStatus) ToString() string

type ServerStatusResp added in v0.1.32

type ServerStatusResp struct {
	Status          string  `json:"status"`
	SlotsIdle       int     `json:"slots_idle"`
	SlotsProcessing int     `json:"slots_processing"`
	Error           string  `json:"error"`
	Progress        float32 `json:"progress"`
}

type StatusWriter added in v0.1.32

type StatusWriter struct {
	LastErrMsg string
	// contains filtered or unexported fields
}

StatusWriter is a writer that captures error messages from the llama runner process.

func NewStatusWriter added in v0.1.32

func NewStatusWriter(out *os.File) *StatusWriter

func (*StatusWriter) Write added in v0.1.32

func (w *StatusWriter) Write(b []byte) (int, error)

type TokenizeRequest

type TokenizeRequest struct {
	Content string `json:"content"`
}

type TokenizeResponse

type TokenizeResponse struct {
	Tokens []int `json:"tokens"`
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL