Documentation
¶
Index ¶
- Variables
- type Config
- type HypervisorManager
- type Manager
- func (m *Manager) Backend() framework.Backend
- func (m *Manager) DeviceController() framework.DeviceController
- func (m *Manager) GetDeviceMetrics() (map[string]*api.GPUUsageMetrics, error)
- func (m *Manager) GetStateDir() string
- func (m *Manager) GetVendor() string
- func (m *Manager) GetWorkerAllocation(workerUID string) (*api.WorkerAllocation, bool)
- func (m *Manager) IsStarted() bool
- func (m *Manager) ListDevices() ([]*api.DeviceInfo, error)
- func (m *Manager) ListWorkers() []*api.WorkerInfo
- func (m *Manager) RegisterDeviceHandler(handler framework.DeviceChangeHandler)
- func (m *Manager) RegisterWorkerHandler(handler framework.WorkerChangeHandler) error
- func (m *Manager) Start() error
- func (m *Manager) StartWorker(workerInfo *api.WorkerInfo) error
- func (m *Manager) Stop() error
- func (m *Manager) StopWorker(workerUID string) error
- type Reconciler
- type ReconcilerConfig
- type ReconcilerStatus
Constants ¶
This section is empty.
Variables ¶
var ErrNotStarted = errors.New("hypervisor manager not started")
ErrNotStarted is returned when the manager is not started
Functions ¶
This section is empty.
Types ¶
type Config ¶
type Config struct {
// LibPath is the path to the accelerator library (e.g., libaccelerator_nvidia.so)
LibPath string
// Vendor identifier (e.g., "nvidia", "amd", "stub")
Vendor string
// IsolationMode for worker processes (shared, soft, partitioned)
IsolationMode tfv1.IsolationModeType
// StateDir for tensor-fusion state files (workers.json, devices.json)
StateDir string
}
Config holds configuration for the hypervisor manager
type HypervisorManager ¶
type HypervisorManager interface {
Start() error
Stop() error
IsStarted() bool
ListDevices() ([]*api.DeviceInfo, error)
ListWorkers() []*api.WorkerInfo
StartWorker(workerInfo *api.WorkerInfo) error
StopWorker(workerUID string) error
GetDeviceMetrics() (map[string]*api.GPUUsageMetrics, error)
GetWorkerAllocation(workerUID string) (*api.WorkerAllocation, bool)
RegisterWorkerHandler(handler framework.WorkerChangeHandler) error
RegisterDeviceHandler(handler framework.DeviceChangeHandler)
}
HypervisorManager defines the interface for hypervisor operations
type Manager ¶
type Manager struct {
// contains filtered or unexported fields
}
Manager wraps tensor-fusion hypervisor components for single-node GPU management
func NewManager ¶
NewManager creates a new hypervisor manager
func (*Manager) DeviceController ¶
func (m *Manager) DeviceController() framework.DeviceController
DeviceController returns the underlying device controller for advanced use cases
func (*Manager) GetDeviceMetrics ¶
func (m *Manager) GetDeviceMetrics() (map[string]*api.GPUUsageMetrics, error)
GetDeviceMetrics returns metrics for all devices
func (*Manager) GetStateDir ¶
func (m *Manager) GetStateDir() string
GetStateDir returns the state directory path
func (*Manager) GetWorkerAllocation ¶
func (m *Manager) GetWorkerAllocation(workerUID string) (*api.WorkerAllocation, bool)
GetWorkerAllocation returns the allocation for a specific worker
func (*Manager) ListDevices ¶
func (m *Manager) ListDevices() ([]*api.DeviceInfo, error)
ListDevices returns all discovered GPU devices
func (*Manager) ListWorkers ¶
func (m *Manager) ListWorkers() []*api.WorkerInfo
ListWorkers returns all workers from the backend
func (*Manager) RegisterDeviceHandler ¶
func (m *Manager) RegisterDeviceHandler(handler framework.DeviceChangeHandler)
RegisterDeviceHandler registers a handler for device change events
func (*Manager) RegisterWorkerHandler ¶
func (m *Manager) RegisterWorkerHandler(handler framework.WorkerChangeHandler) error
RegisterWorkerHandler registers a handler for worker change events
func (*Manager) StartWorker ¶
func (m *Manager) StartWorker(workerInfo *api.WorkerInfo) error
StartWorker starts a worker with the given configuration
func (*Manager) StopWorker ¶
func (m *Manager) StopWorker(workerUID string) error
StopWorker stops a worker by its UID
type Reconciler ¶
type Reconciler struct {
// contains filtered or unexported fields
}
Reconciler reconciles cloud-desired workers with hypervisor-actual workers
func NewReconciler ¶
func NewReconciler(cfg ReconcilerConfig) *Reconciler
NewReconciler creates a new worker reconciler
func (*Reconciler) GetStatus ¶
func (r *Reconciler) GetStatus() ReconcilerStatus
GetStatus returns current reconciler status
func (*Reconciler) SetDesiredWorkers ¶
func (r *Reconciler) SetDesiredWorkers(infos []*api.WorkerInfo)
SetDesiredWorkers updates the desired worker state. This should be called when the cloud backend config is pulled.
func (*Reconciler) TriggerReconcile ¶
func (r *Reconciler) TriggerReconcile()
TriggerReconcile triggers an immediate reconciliation
type ReconcilerConfig ¶
type ReconcilerConfig struct {
Manager HypervisorManager
// Optional callbacks
OnWorkerStarted func(workerID string)
OnWorkerStopped func(workerID string)
OnReconcileComplete func(added, removed, updated int)
}
ReconcilerConfig holds configuration for the reconciler
type ReconcilerStatus ¶
ReconcilerStatus represents the current reconciliation status
func (*ReconcilerStatus) String ¶
func (s *ReconcilerStatus) String() string
String returns a human-readable representation