Documentation
Index ¶
- Variables
- type BackendStatus
- type ConfigureRequest
- type DiskUsage
- type OpenAIErrorResponse
- type OpenAIInferenceRequest
- type Scheduler
- func (s *Scheduler) Configure(w http.ResponseWriter, r *http.Request)
- func (s *Scheduler) GetAllActiveRunners() []metrics.ActiveRunner
- func (s *Scheduler) GetBackendStatus(w http.ResponseWriter, r *http.Request)
- func (s *Scheduler) GetDiskUsage(w http.ResponseWriter, _ *http.Request)
- func (s *Scheduler) GetLlamaCppSocket() (string, error)
- func (s *Scheduler) GetRunningBackends(w http.ResponseWriter, r *http.Request)
- func (s *Scheduler) GetRunningBackendsInfo(ctx context.Context) []BackendStatus
- func (s *Scheduler) RebuildRoutes(allowedOrigins []string)
- func (s *Scheduler) ResetInstaller(httpClient *http.Client)
- func (s *Scheduler) Run(ctx context.Context) error
- func (s *Scheduler) ServeHTTP(w http.ResponseWriter, r *http.Request)
- func (s *Scheduler) Unload(w http.ResponseWriter, r *http.Request)
- type UnloadRequest
- type UnloadResponse
Constants ¶
This section is empty.
Variables ¶
var ErrBackendNotFound = errors.New("backend not found")
ErrBackendNotFound indicates that an unknown backend was requested. When returned while servicing an HTTP request, it should be paired with a 404 response status.
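Because call sites typically wrap this sentinel, handlers should match it with errors.Is before mapping it to a 404. A minimal sketch; lookupBackend and the route are hypothetical, not part of this package:

package main

import (
	"errors"
	"fmt"
	"net/http"
)

var ErrBackendNotFound = errors.New("backend not found")

// lookupBackend is a hypothetical stand-in for a scheduler-side lookup
// that can fail with ErrBackendNotFound.
func lookupBackend(name string) error {
	if name != "llama.cpp" {
		return fmt.Errorf("resolving backend %q: %w", name, ErrBackendNotFound)
	}
	return nil
}

func statusHandler(w http.ResponseWriter, r *http.Request) {
	if err := lookupBackend(r.URL.Query().Get("backend")); err != nil {
		if errors.Is(err, ErrBackendNotFound) {
			// Pair the sentinel error with a 404, as documented.
			http.Error(w, err.Error(), http.StatusNotFound)
			return
		}
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	w.WriteHeader(http.StatusOK)
}

func main() {
	http.HandleFunc("/status", statusHandler)
	http.ListenAndServe("localhost:8080", nil)
}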
var RunnerSocketPath = func(slot int) (string, error) { return fmt.Sprintf("inference-runner-%d.sock", slot), nil }
RunnerSocketPath determines the Unix domain socket path used to communicate with runners at the specified slot. It can be overridden during init().
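A caller that needs sockets in a specific location can swap the variable before the scheduler starts, for example from an importing package's init(). A sketch, with a hypothetical import path and an assumed directory policy:

package main

import (
	"fmt"
	"os"
	"path/filepath"

	"example.com/inference/scheduling" // hypothetical import path for this package
)

func init() {
	// Place runner sockets under the user's runtime directory instead of
	// the default relative path. The directory choice is an assumption.
	dir := os.Getenv("XDG_RUNTIME_DIR")
	if dir == "" {
		dir = os.TempDir()
	}
	scheduling.RunnerSocketPath = func(slot int) (string, error) {
		if slot < 0 {
			return "", fmt.Errorf("invalid runner slot %d", slot)
		}
		return filepath.Join(dir, fmt.Sprintf("inference-runner-%d.sock", slot)), nil
	}
}

func main() {}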
Functions ¶
This section is empty.
Types ¶
type BackendStatus ¶
type BackendStatus struct {
// BackendName is the name of the backend
BackendName string `json:"backend_name"`
// ModelName is the name of the model loaded in the backend
ModelName string `json:"model_name"`
// Mode is the mode the backend is operating in
Mode string `json:"mode"`
// LastUsed represents when this (backend, model, mode) tuple was last used
LastUsed time.Time `json:"last_used,omitempty"`
// InUse indicates whether this backend is currently handling a request
InUse bool `json:"in_use,omitempty"`
}
BackendStatus represents information about a running backend.
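Clients of the status endpoint can decode the payload into this type directly. A sketch that assumes the handler returns a JSON array and that the server listens at a hypothetical address and path:

package main

import (
	"encoding/json"
	"fmt"
	"net/http"
	"time"
)

// BackendStatus mirrors the type documented above.
type BackendStatus struct {
	BackendName string    `json:"backend_name"`
	ModelName   string    `json:"model_name"`
	Mode        string    `json:"mode"`
	LastUsed    time.Time `json:"last_used,omitempty"`
	InUse       bool      `json:"in_use,omitempty"`
}

func main() {
	// Address and path are assumptions; adjust to the server's routes.
	resp, err := http.Get("http://localhost:12434/engines/status")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var statuses []BackendStatus
	if err := json.NewDecoder(resp.Body).Decode(&statuses); err != nil {
		panic(err)
	}
	for _, s := range statuses {
		fmt.Printf("%s/%s (%s) in use: %t\n", s.BackendName, s.ModelName, s.Mode, s.InUse)
	}
}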
type ConfigureRequest ¶
type ConfigureRequest struct {
Model string `json:"model"`
ContextSize int64 `json:"context-size,omitempty"`
RuntimeFlags []string `json:"runtime-flags,omitempty"`
RawRuntimeFlags string `json:"raw-runtime-flags,omitempty"`
Speculative *inference.SpeculativeDecodingConfig `json:"speculative,omitempty"`
}
ConfigureRequest specifies per-model runtime configuration options.
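Note the hyphenated JSON keys ("context-size", not "context_size"). A client-side sketch of submitting a configuration; the endpoint path and model name are assumptions, and the Speculative field is omitted for brevity:

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

// ConfigureRequest mirrors the documented type, minus Speculative.
type ConfigureRequest struct {
	Model        string   `json:"model"`
	ContextSize  int64    `json:"context-size,omitempty"`
	RuntimeFlags []string `json:"runtime-flags,omitempty"`
}

func main() {
	body, _ := json.Marshal(ConfigureRequest{
		Model:        "ai/llama3.2", // placeholder model reference
		ContextSize:  8192,
		RuntimeFlags: []string{"--temp", "0.2"},
	})
	// The endpoint path is an assumption, not part of the documented API.
	resp, err := http.Post("http://localhost:12434/engines/llama.cpp/v1/configure",
		"application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Println(resp.Status)
}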
type DiskUsage ¶
type DiskUsage struct {
ModelsDiskUsage int64 `json:"models_disk_usage"`
DefaultBackendDiskUsage int64 `json:"default_backend_disk_usage"`
}
DiskUsage represents the disk usage of the models and default backend.
type OpenAIErrorResponse ¶
type OpenAIErrorResponse struct {
Type string `json:"type"` // always "error"
Code *string `json:"code"`
Message string `json:"message"`
Param *string `json:"param"`
SequenceNumber int `json:"sequence_number"`
}
OpenAIErrorResponse is used to format an OpenAI API compatible error response (see https://platform.openai.com/docs/api-reference/responses-streaming/error)
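A handler that fails an OpenAI-compatible request can serialize this shape directly; Code and Param are pointers so that JSON null is representable. A sketch with a hypothetical writeOpenAIError helper:

package main

import (
	"encoding/json"
	"net/http"
)

// OpenAIErrorResponse mirrors the documented type.
type OpenAIErrorResponse struct {
	Type           string  `json:"type"` // always "error"
	Code           *string `json:"code"`
	Message        string  `json:"message"`
	Param          *string `json:"param"`
	SequenceNumber int     `json:"sequence_number"`
}

// writeOpenAIError is a hypothetical helper that emits an
// OpenAI-compatible error body alongside an HTTP status code.
func writeOpenAIError(w http.ResponseWriter, status int, code, message string) {
	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(status)
	json.NewEncoder(w).Encode(OpenAIErrorResponse{
		Type:    "error",
		Code:    &code,
		Message: message, // Param stays null, SequenceNumber stays 0
	})
}

func main() {
	http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		writeOpenAIError(w, http.StatusNotFound, "model_not_found", "no such model")
	})
	http.ListenAndServe("localhost:8080", nil)
}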
type OpenAIInferenceRequest ¶
type OpenAIInferenceRequest struct {
// Model is the requested model name.
Model string `json:"model"`
}
OpenAIInferenceRequest is used to extract the model specification from either a chat completion or embedding request in the OpenAI API.
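Because a scheduler must pick a backend before the request body reaches it, the model field has to be read without consuming the body. A sketch of that pattern; peekModel and the route are illustrative, not this package's actual code:

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
)

// OpenAIInferenceRequest mirrors the documented type.
type OpenAIInferenceRequest struct {
	Model string `json:"model"`
}

// peekModel reads the request body far enough to extract the "model"
// field, then restores the body so a downstream handler can still
// consume it. Production code would cap the read with http.MaxBytesReader.
func peekModel(r *http.Request) (string, error) {
	body, err := io.ReadAll(r.Body)
	if err != nil {
		return "", err
	}
	r.Body = io.NopCloser(bytes.NewReader(body)) // restore for downstream use

	var req OpenAIInferenceRequest
	if err := json.Unmarshal(body, &req); err != nil {
		return "", fmt.Errorf("decoding inference request: %w", err)
	}
	return req.Model, nil
}

func main() {
	http.HandleFunc("/v1/chat/completions", func(w http.ResponseWriter, r *http.Request) {
		model, err := peekModel(r)
		if err != nil {
			http.Error(w, err.Error(), http.StatusBadRequest)
			return
		}
		fmt.Fprintf(w, "routing request for model %q\n", model)
	})
	http.ListenAndServe("localhost:8080", nil)
}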
type Scheduler ¶
type Scheduler struct {
// contains filtered or unexported fields
}
Scheduler is used to coordinate inference scheduling across multiple backends and models.
func NewScheduler ¶
func NewScheduler(
	log logging.Logger,
	backends map[string]inference.Backend,
	defaultBackend inference.Backend,
	modelManager *models.Manager,
	httpClient *http.Client,
	allowedOrigins []string,
	tracker *metrics.Tracker,
	sysMemInfo memory.SystemMemoryInfo,
) *Scheduler
NewScheduler creates a new inference scheduler.
func (*Scheduler) Configure ¶
func (s *Scheduler) Configure(w http.ResponseWriter, r *http.Request)
func (*Scheduler) GetAllActiveRunners ¶
func (s *Scheduler) GetAllActiveRunners() []metrics.ActiveRunner
GetAllActiveRunners returns information about all active runners.
func (*Scheduler) GetBackendStatus ¶
func (s *Scheduler) GetBackendStatus(w http.ResponseWriter, r *http.Request)
func (*Scheduler) GetDiskUsage ¶
func (s *Scheduler) GetDiskUsage(w http.ResponseWriter, _ *http.Request)
func (*Scheduler) GetLlamaCppSocket ¶
func (s *Scheduler) GetLlamaCppSocket() (string, error)
GetLlamaCppSocket returns the Unix socket path for an active llama.cpp runner.
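The returned path can be wired into an http.Transport whose DialContext ignores the URL's host and dials the socket instead. A sketch; the hard-coded socket path and the /health route are placeholders:

package main

import (
	"context"
	"fmt"
	"net"
	"net/http"
)

func main() {
	// Assumed path for illustration; use GetLlamaCppSocket in practice.
	socketPath := "/var/run/inference-runner-0.sock"

	client := &http.Client{
		Transport: &http.Transport{
			// Dial the Unix socket regardless of the URL's host.
			DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) {
				var d net.Dialer
				return d.DialContext(ctx, "unix", socketPath)
			},
		},
	}

	// The host is a placeholder; routing happens via DialContext.
	resp, err := client.Get("http://runner/health")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Println(resp.Status)
}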
func (*Scheduler) GetRunningBackends ¶
func (s *Scheduler) GetRunningBackends(w http.ResponseWriter, r *http.Request)
GetRunningBackends returns information about all running backends.
func (*Scheduler) GetRunningBackendsInfo ¶ added in v1.0.5
func (s *Scheduler) GetRunningBackendsInfo(ctx context.Context) []BackendStatus
GetRunningBackendsInfo returns information about all running backends as a slice.
func (*Scheduler) RebuildRoutes ¶
func (s *Scheduler) RebuildRoutes(allowedOrigins []string)
func (*Scheduler) ResetInstaller ¶
func (s *Scheduler) ResetInstaller(httpClient *http.Client)
func (*Scheduler) Run ¶
func (s *Scheduler) Run(ctx context.Context) error
Run is the scheduler's main run loop. By the time it returns, all inference backends will have been unloaded from memory.
func (*Scheduler) ServeHTTP ¶
func (s *Scheduler) ServeHTTP(w http.ResponseWriter, r *http.Request)
func (*Scheduler) Unload ¶
func (s *Scheduler) Unload(w http.ResponseWriter, r *http.Request)
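Taken together, Run and ServeHTTP are all a host process needs: one goroutine drives the scheduling loop while an http.Server feeds it requests, and cancelling the context triggers the unload guarantee documented above. A sketch; the listen address and the small local interface are illustrative:

package main

import (
	"context"
	"net/http"
	"os/signal"
	"syscall"
)

// scheduler captures the two methods this sketch needs;
// *Scheduler satisfies it.
type scheduler interface {
	Run(ctx context.Context) error
	ServeHTTP(w http.ResponseWriter, r *http.Request)
}

func serve(s scheduler) error {
	ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
	defer stop()

	errs := make(chan error, 1)
	go func() { errs <- s.Run(ctx) }() // main scheduling loop

	srv := &http.Server{Addr: "localhost:12434", Handler: s}
	go srv.ListenAndServe()

	<-ctx.Done() // wait for SIGINT/SIGTERM
	srv.Shutdown(context.Background())
	return <-errs // once Run returns, all backends are unloaded
}

func main() {
	// In real use: serve(NewScheduler(...)) with concrete dependencies.
}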
type UnloadRequest ¶
type UnloadRequest struct {
All bool `json:"all"`
Backend string `json:"backend"`
Models []string `json:"models"`
}
UnloadRequest is used to specify which models to unload.
type UnloadResponse ¶
type UnloadResponse struct {
UnloadedRunners int `json:"unloaded_runners"`
}
UnloadResponse is used to return the number of unloaded runners, where each runner corresponds to a (backend, model) pair.
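A client-side round trip through the unload endpoint, decoding the runner count on the way back. A sketch; the endpoint path is an assumption:

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

// UnloadRequest and UnloadResponse mirror the documented types.
type UnloadRequest struct {
	All     bool     `json:"all"`
	Backend string   `json:"backend"`
	Models  []string `json:"models"`
}

type UnloadResponse struct {
	UnloadedRunners int `json:"unloaded_runners"`
}

func main() {
	body, _ := json.Marshal(UnloadRequest{All: true}) // unload everything
	// The endpoint path is an assumption; adjust to the server's routes.
	resp, err := http.Post("http://localhost:12434/engines/unload",
		"application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var result UnloadResponse
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		panic(err)
	}
	fmt.Printf("unloaded %d runner(s)\n", result.UnloadedRunners)
}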