hypervisor

package
v1.1.11 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 4, 2026 License: Apache-2.0 Imports: 17 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
// ErrNotStarted is returned when the manager is not started.
// It is a package-level sentinel error, so callers can test for it with
// errors.Is(err, ErrNotStarted).
var ErrNotStarted = errors.New("hypervisor manager not started")

ErrNotStarted is returned when the manager is not started

Functions

This section is empty.

Types

type Config

// Config holds configuration for the hypervisor manager.
// It is the argument accepted by NewManager.
type Config struct {
	// LibPath is the path to the accelerator library (e.g., libaccelerator_nvidia.so).
	LibPath string

	// Vendor is the accelerator vendor identifier (e.g., "nvidia", "amd", "stub").
	Vendor string

	// IsolationMode is the isolation mode for worker processes (shared, soft, partitioned).
	IsolationMode tfv1.IsolationModeType

	// StateDir is the directory for tensor-fusion state files (workers.json, devices.json).
	StateDir string
}

Config holds configuration for the hypervisor manager

type HypervisorManager

// HypervisorManager defines the interface for hypervisor operations.
//
// ReconcilerConfig.Manager is declared with this type, and *Manager declares a
// method with a matching signature for every entry below. The per-method notes
// mirror the documentation of the corresponding *Manager methods; other
// implementations may differ.
type HypervisorManager interface {
	// Start initializes and starts all hypervisor components.
	Start() error
	// Stop gracefully shuts down all hypervisor components.
	Stop() error
	// IsStarted reports whether the manager is running.
	IsStarted() bool
	// ListDevices returns all discovered GPU devices.
	ListDevices() ([]*api.DeviceInfo, error)
	// ListWorkers returns all workers from the backend.
	ListWorkers() []*api.WorkerInfo
	// StartWorker starts a worker with the given configuration.
	StartWorker(workerInfo *api.WorkerInfo) error
	// StopWorker stops a worker by its UID.
	StopWorker(workerUID string) error
	// GetDeviceMetrics returns metrics for all devices.
	// NOTE(review): the map key is presumably a device identifier — confirm.
	GetDeviceMetrics() (map[string]*api.GPUUsageMetrics, error)
	// GetWorkerAllocation returns the allocation for a specific worker.
	// NOTE(review): the bool presumably reports whether the worker was found — confirm.
	GetWorkerAllocation(workerUID string) (*api.WorkerAllocation, bool)
	// RegisterWorkerHandler registers a handler for worker change events.
	RegisterWorkerHandler(handler framework.WorkerChangeHandler) error
	// RegisterDeviceHandler registers a handler for device change events.
	RegisterDeviceHandler(handler framework.DeviceChangeHandler)
}

HypervisorManager defines the interface for hypervisor operations

type Manager

type Manager struct {
	// contains filtered or unexported fields
}

Manager wraps tensor-fusion hypervisor components for single-node GPU management

func NewManager

func NewManager(cfg Config) (*Manager, error)

NewManager creates a new hypervisor manager

func (*Manager) Backend

func (m *Manager) Backend() framework.Backend

Backend returns the underlying backend for advanced use cases

func (*Manager) DeviceController

func (m *Manager) DeviceController() framework.DeviceController

DeviceController returns the underlying device controller for advanced use cases

func (*Manager) GetDeviceMetrics

func (m *Manager) GetDeviceMetrics() (map[string]*api.GPUUsageMetrics, error)

GetDeviceMetrics returns metrics for all devices

func (*Manager) GetStateDir

func (m *Manager) GetStateDir() string

GetStateDir returns the state directory path

func (*Manager) GetVendor

func (m *Manager) GetVendor() string

GetVendor returns the accelerator vendor

func (*Manager) GetWorkerAllocation

func (m *Manager) GetWorkerAllocation(workerUID string) (*api.WorkerAllocation, bool)

GetWorkerAllocation returns the allocation for a specific worker

func (*Manager) IsStarted

func (m *Manager) IsStarted() bool

IsStarted returns whether the manager is running

func (*Manager) ListDevices

func (m *Manager) ListDevices() ([]*api.DeviceInfo, error)

ListDevices returns all discovered GPU devices

func (*Manager) ListWorkers

func (m *Manager) ListWorkers() []*api.WorkerInfo

ListWorkers returns all workers from the backend

func (*Manager) RegisterDeviceHandler

func (m *Manager) RegisterDeviceHandler(handler framework.DeviceChangeHandler)

RegisterDeviceHandler registers a handler for device change events

func (*Manager) RegisterWorkerHandler

func (m *Manager) RegisterWorkerHandler(handler framework.WorkerChangeHandler) error

RegisterWorkerHandler registers a handler for worker change events

func (*Manager) Start

func (m *Manager) Start() error

Start initializes and starts all hypervisor components

func (*Manager) StartWorker

func (m *Manager) StartWorker(workerInfo *api.WorkerInfo) error

StartWorker starts a worker with the given configuration

func (*Manager) Stop

func (m *Manager) Stop() error

Stop gracefully shuts down all hypervisor components

func (*Manager) StopWorker

func (m *Manager) StopWorker(workerUID string) error

StopWorker stops a worker by its UID

type Reconciler

type Reconciler struct {
	// contains filtered or unexported fields
}

Reconciler reconciles cloud-desired workers with hypervisor-actual workers

func NewReconciler

func NewReconciler(cfg ReconcilerConfig) *Reconciler

NewReconciler creates a new worker reconciler

func (*Reconciler) GetStatus

func (r *Reconciler) GetStatus() ReconcilerStatus

GetStatus returns current reconciler status

func (*Reconciler) SetDesiredWorkers

func (r *Reconciler) SetDesiredWorkers(infos []*api.WorkerInfo)

SetDesiredWorkers updates the desired worker state. This should be called when the cloud backend config is pulled.

func (*Reconciler) Start

func (r *Reconciler) Start()

Start begins the reconciliation loop

func (*Reconciler) Stop

func (r *Reconciler) Stop()

Stop stops the reconciliation loop

func (*Reconciler) TriggerReconcile

func (r *Reconciler) TriggerReconcile()

TriggerReconcile triggers an immediate reconciliation

type ReconcilerConfig

// ReconcilerConfig holds configuration for the reconciler.
// It is the argument accepted by NewReconciler.
type ReconcilerConfig struct {
	// Manager is the hypervisor the reconciler operates against.
	// NOTE(review): presumably must be non-nil — confirm against NewReconciler.
	Manager HypervisorManager

	// Optional callbacks
	//
	// OnWorkerStarted and OnWorkerStopped each receive a worker ID;
	// OnReconcileComplete receives three counts named added, removed and updated.
	// NOTE(review): presumably nil callbacks are skipped and each fires after the
	// event its name describes — confirm in the reconciler implementation.
	OnWorkerStarted     func(workerID string)
	OnWorkerStopped     func(workerID string)
	OnReconcileComplete func(added, removed, updated int)
}

ReconcilerConfig holds configuration for the reconciler

type ReconcilerStatus

// ReconcilerStatus represents the current reconciliation status.
// It is the value returned by (*Reconciler).GetStatus.
//
// NOTE(review): the field meanings below are inferred from the names — confirm
// against the reconciler implementation:
//   - DesiredCount: presumably the number of workers last passed to SetDesiredWorkers.
//   - ActualCount: presumably the number of workers currently reported by the hypervisor.
//   - InSync: presumably true when the desired and actual worker sets match.
type ReconcilerStatus struct {
	DesiredCount int
	ActualCount  int
	InSync       bool
}

ReconcilerStatus represents the current reconciliation status

func (*ReconcilerStatus) String

func (s *ReconcilerStatus) String() string

String returns a human-readable representation

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL