Documentation ¶
Index ¶
- Variables
- type BackendStatus
- type ConfigureRequest
- type DiskUsage
- type HTTPHandler
- func (h *HTTPHandler) Configure(w http.ResponseWriter, r *http.Request)
- func (h *HTTPHandler) GetAllActiveRunners() []metrics.ActiveRunner
- func (h *HTTPHandler) GetBackendStatus(w http.ResponseWriter, r *http.Request)
- func (h *HTTPHandler) GetDiskUsage(w http.ResponseWriter, _ *http.Request)
- func (h *HTTPHandler) GetLlamaCppSocket() (string, error)
- func (h *HTTPHandler) GetRunningBackends(w http.ResponseWriter, r *http.Request)
- func (h *HTTPHandler) RebuildRoutes(allowedOrigins []string)
- func (h *HTTPHandler) ServeHTTP(w http.ResponseWriter, r *http.Request)
- func (h *HTTPHandler) Unload(w http.ResponseWriter, r *http.Request)
- type OpenAIErrorResponse
- type OpenAIInferenceRequest
- type Scheduler
- func (s *Scheduler) ConfigureRunner(ctx context.Context, backend inference.Backend, req ConfigureRequest, ...) (inference.Backend, error)
- func (s *Scheduler) GetAllActiveRunners() []metrics.ActiveRunner
- func (s *Scheduler) GetLlamaCppSocket() (string, error)
- func (s *Scheduler) GetRunningBackendsInfo(ctx context.Context) []BackendStatus
- func (s *Scheduler) ResetInstaller(httpClient *http.Client)
- func (s *Scheduler) Run(ctx context.Context) error
- type UnloadRequest
- type UnloadResponse
Constants ¶
This section is empty.
Variables ¶
var ErrBackendNotFound = errors.New("backend not found")
ErrBackendNotFound indicates that an unknown backend was requested. If returned in conjunction with an HTTP request, it should be paired with a 404 response status.
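In HTTP-facing code this typically means an errors.Is check that maps the error to a 404. A minimal sketch, written as if inside the package (assumes errors, fmt, and net/http are imported); lookupBackend is a hypothetical stand-in for any call that can return this error:

// lookupBackend is a hypothetical helper; a real caller would invoke
// whichever scheduler method can fail with ErrBackendNotFound.
func lookupBackend(name string) error {
    return fmt.Errorf("backend %q: %w", name, ErrBackendNotFound)
}

func handleBackendLookup(w http.ResponseWriter, name string) {
    if err := lookupBackend(name); err != nil {
        if errors.Is(err, ErrBackendNotFound) {
            // Pair the error with a 404 status, per the note above.
            http.Error(w, err.Error(), http.StatusNotFound)
            return
        }
        http.Error(w, err.Error(), http.StatusInternalServerError)
    }
}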
var RunnerSocketPath = func(slot int) (string, error) {
    return fmt.Sprintf("inference-runner-%d.sock", slot), nil
}
RunnerSocketPath determines the Unix domain socket path used to communicate with runners at the specified slot. It can be overridden during init().
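Since it is a plain package-level variable, an embedding application can replace it before any runner starts. A minimal sketch (assumes fmt and path/filepath are imported); the /run/inference directory is purely illustrative:

func init() {
    // Hypothetical override: place runner sockets in a dedicated
    // runtime directory instead of using a bare file name.
    RunnerSocketPath = func(slot int) (string, error) {
        return filepath.Join("/run/inference", fmt.Sprintf("inference-runner-%d.sock", slot)), nil
    }
}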
Functions ¶
This section is empty.
Types ¶
type BackendStatus ¶
type BackendStatus struct {
    // BackendName is the name of the backend
    BackendName string `json:"backend_name"`
    // ModelName is the name of the model loaded in the backend
    ModelName string `json:"model_name"`
    // Mode is the mode the backend is operating in
    Mode string `json:"mode"`
    // LastUsed represents when this (backend, model, mode) tuple was last used
    LastUsed time.Time `json:"last_used,omitempty"`
    // InUse indicates whether this backend is currently handling a request
    InUse bool `json:"in_use,omitempty"`
}
BackendStatus represents information about a running backend.
type ConfigureRequest ¶
type ConfigureRequest struct {
    Model string `json:"model"`
    Mode *inference.BackendMode `json:"mode,omitempty"`
    inference.BackendConfiguration
}
ConfigureRequest specifies per-model runtime configuration options.
type DiskUsage ¶
type DiskUsage struct {
    ModelsDiskUsage int64 `json:"models_disk_usage"`
    DefaultBackendDiskUsage int64 `json:"default_backend_disk_usage"`
}
DiskUsage represents the disk usage of the models and default backend.
type HTTPHandler ¶ added in v1.0.7
type HTTPHandler struct {
    // contains filtered or unexported fields
}
HTTPHandler handles HTTP requests for the scheduler. It wraps the Scheduler to provide HTTP endpoint functionality without coupling the core scheduling logic to HTTP concerns.
func NewHTTPHandler ¶ added in v1.0.7
func NewHTTPHandler(s *Scheduler, modelHandler *models.HTTPHandler, allowedOrigins []string) *HTTPHandler
NewHTTPHandler creates a new HTTP handler that wraps the scheduler. This is the primary HTTP interface for the scheduling package.
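Because HTTPHandler implements http.Handler (see ServeHTTP below), it can be handed straight to the standard library server. A minimal sketch, with the scheduler and model handler assumed to be constructed elsewhere and the address and origin purely illustrative:

func serveAPI(s *Scheduler, modelHTTP *models.HTTPHandler) error {
    h := NewHTTPHandler(s, modelHTTP, []string{"http://localhost:3000"})
    return http.ListenAndServe("localhost:8080", h)
}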
func (*HTTPHandler) Configure ¶ added in v1.0.7
func (h *HTTPHandler) Configure(w http.ResponseWriter, r *http.Request)
Configure handles POST <inference-prefix>/{backend}/_configure requests.
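A client-side sketch of this endpoint using only the documented "model" field (assumes bytes, context, encoding/json, fmt, and net/http are imported). The base URL, prefix, and backend parameters mirror the route placeholders above, and the success-status check is an assumption:

func configureModel(ctx context.Context, baseURL, prefix, backend, model string) error {
    body, err := json.Marshal(map[string]any{"model": model})
    if err != nil {
        return err
    }
    url := fmt.Sprintf("%s%s/%s/_configure", baseURL, prefix, backend)
    req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(body))
    if err != nil {
        return err
    }
    req.Header.Set("Content-Type", "application/json")
    resp, err := http.DefaultClient.Do(req)
    if err != nil {
        return err
    }
    defer resp.Body.Close()
    if resp.StatusCode >= 300 {
        return fmt.Errorf("configure failed: %s", resp.Status)
    }
    return nil
}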
func (*HTTPHandler) GetAllActiveRunners ¶ added in v1.0.7
func (h *HTTPHandler) GetAllActiveRunners() []metrics.ActiveRunner
GetAllActiveRunners delegates to the scheduler's business logic. Required by metrics.SchedulerInterface.
func (*HTTPHandler) GetBackendStatus ¶ added in v1.0.7
func (h *HTTPHandler) GetBackendStatus(w http.ResponseWriter, r *http.Request)
GetBackendStatus returns the status of all backends.
func (*HTTPHandler) GetDiskUsage ¶ added in v1.0.7
func (h *HTTPHandler) GetDiskUsage(w http.ResponseWriter, _ *http.Request)
GetDiskUsage returns disk usage information for models and backends.
func (*HTTPHandler) GetLlamaCppSocket ¶ added in v1.0.7
func (h *HTTPHandler) GetLlamaCppSocket() (string, error)
GetLlamaCppSocket delegates to the scheduler's business logic. Required by metrics.SchedulerInterface.
func (*HTTPHandler) GetRunningBackends ¶ added in v1.0.7
func (h *HTTPHandler) GetRunningBackends(w http.ResponseWriter, r *http.Request)
GetRunningBackends returns information about all running backends.
func (*HTTPHandler) RebuildRoutes ¶ added in v1.0.7
func (h *HTTPHandler) RebuildRoutes(allowedOrigins []string)
RebuildRoutes updates the HTTP routes with new allowed origins.
func (*HTTPHandler) ServeHTTP ¶ added in v1.0.7
func (h *HTTPHandler) ServeHTTP(w http.ResponseWriter, r *http.Request)
ServeHTTP implements net/http.Handler.ServeHTTP.
func (*HTTPHandler) Unload ¶ added in v1.0.7
func (h *HTTPHandler) Unload(w http.ResponseWriter, r *http.Request)
Unload unloads the specified (backend, model) runners. Currently, this doesn't work for runners that are handling an OpenAI request.
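A client-side sketch of an unload round trip (assumes bytes, context, encoding/json, and net/http are imported). The route for this handler is not shown in this documentation, so unloadURL is a caller-supplied placeholder; the bodies mirror the UnloadRequest and UnloadResponse types below:

func unloadModels(ctx context.Context, unloadURL, backend string, modelNames []string) (int, error) {
    body, err := json.Marshal(map[string]any{
        "all":     false,
        "backend": backend,
        "models":  modelNames,
    })
    if err != nil {
        return 0, err
    }
    req, err := http.NewRequestWithContext(ctx, http.MethodPost, unloadURL, bytes.NewReader(body))
    if err != nil {
        return 0, err
    }
    req.Header.Set("Content-Type", "application/json")
    resp, err := http.DefaultClient.Do(req)
    if err != nil {
        return 0, err
    }
    defer resp.Body.Close()
    // Decode the documented response shape.
    var out struct {
        UnloadedRunners int `json:"unloaded_runners"`
    }
    if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
        return 0, err
    }
    return out.UnloadedRunners, nil
}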
type OpenAIErrorResponse ¶
type OpenAIErrorResponse struct {
    Type string `json:"type"` // always "error"
    Code *string `json:"code"`
    Message string `json:"message"`
    Param *string `json:"param"`
    SequenceNumber int `json:"sequence_number"`
}
OpenAIErrorResponse is used to format an OpenAI API-compatible error response (see https://platform.openai.com/docs/api-reference/responses-streaming/error).
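A sketch of emitting this shape from a handler, written as if inside the package (assumes encoding/json and net/http are imported); the status code is the caller's choice:

func writeOpenAIError(w http.ResponseWriter, status int, msg string) {
    w.Header().Set("Content-Type", "application/json")
    w.WriteHeader(status)
    // Code and Param are pointers, so leaving them unset serializes them
    // as JSON null; Type is always "error" per the field comment.
    _ = json.NewEncoder(w).Encode(OpenAIErrorResponse{
        Type:    "error",
        Message: msg,
    })
}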
type OpenAIInferenceRequest ¶
type OpenAIInferenceRequest struct {
    // Model is the requested model name.
    Model string `json:"model"`
}
OpenAIInferenceRequest is used to extract the model specification from either a chat completion or embedding request in the OpenAI API.
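Because encoding/json ignores unknown fields, decoding a request body into this type peels off just the model name regardless of which OpenAI endpoint the body targets. A sketch, as if inside the package (assumes encoding/json and net/http are imported):

func modelFromBody(r *http.Request) (string, error) {
    var req OpenAIInferenceRequest
    if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
        return "", err
    }
    return req.Model, nil
}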
type Scheduler ¶
type Scheduler struct {
    // contains filtered or unexported fields
}
Scheduler is used to coordinate inference scheduling across multiple backends and models.
func NewScheduler ¶
func NewScheduler(
    log logging.Logger,
    backends map[string]inference.Backend,
    defaultBackend inference.Backend,
    modelManager *models.Manager,
    httpClient *http.Client,
    tracker *metrics.Tracker,
    sysMemInfo memory.SystemMemoryInfo,
) *Scheduler
NewScheduler creates a new inference scheduler.
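A wiring sketch, written as if inside the package, with every dependency assumed to be constructed elsewhere (assumes context and net/http are imported); it also assumes Run blocks until ctx is cancelled:

func startScheduler(
    ctx context.Context,
    log logging.Logger,
    backends map[string]inference.Backend,
    defaultBackend inference.Backend,
    mgr *models.Manager,
    tracker *metrics.Tracker,
    sysMem memory.SystemMemoryInfo,
) (*Scheduler, <-chan error) {
    s := NewScheduler(log, backends, defaultBackend, mgr, http.DefaultClient, tracker, sysMem)
    errCh := make(chan error, 1)
    go func() {
        // Run drives the scheduling loop; cancel ctx to stop it.
        errCh <- s.Run(ctx)
    }()
    return s, errCh
}

Exposing the error channel lets the caller decide whether a scheduler failure should take the whole process down.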
func (*Scheduler) ConfigureRunner ¶ added in v1.0.7
func (s *Scheduler) ConfigureRunner(ctx context.Context, backend inference.Backend, req ConfigureRequest, userAgent string) (inference.Backend, error)
ConfigureRunner configures a runner for a specific model and backend. It handles all the business logic of configuration including parsing flags, determining mode, selecting backend, and setting runner configuration.
func (*Scheduler) GetAllActiveRunners ¶
func (s *Scheduler) GetAllActiveRunners() []metrics.ActiveRunner
GetAllActiveRunners returns information about all active runners.
func (*Scheduler) GetLlamaCppSocket ¶
func (s *Scheduler) GetLlamaCppSocket() (string, error)
GetLlamaCppSocket returns the Unix socket path for an active llama.cpp runner.
func (*Scheduler) GetRunningBackendsInfo ¶ added in v1.0.5
func (s *Scheduler) GetRunningBackendsInfo(ctx context.Context) []BackendStatus
GetRunningBackendsInfo returns information about all running backends as a slice.
func (*Scheduler) ResetInstaller ¶
func (s *Scheduler) ResetInstaller(httpClient *http.Client)
ResetInstaller resets the backend installer with a new HTTP client.
type UnloadRequest ¶
type UnloadRequest struct {
    All bool `json:"all"`
    Backend string `json:"backend"`
    Models []string `json:"models"`
}
UnloadRequest is used to specify which models to unload.
type UnloadResponse ¶
type UnloadResponse struct {
    UnloadedRunners int `json:"unloaded_runners"`
}
UnloadResponse is used to return the number of unloaded runners (backend, model).