Documentation
¶
Index ¶
- Constants
- type Manager
- func (m *Manager) GetStatus(workerID string) (*WorkerState, error)
- func (m *Manager) List() []*WorkerState
- func (m *Manager) LoadStateFile() ([]WorkerConfig, error)
- func (m *Manager) Reconcile(desiredWorkers []WorkerConfig) error
- func (m *Manager) SaveStateFile() error
- func (m *Manager) Shutdown()
- func (m *Manager) Start(config WorkerConfig) error
- func (m *Manager) Stop(workerID string) error
- type TensorFusionWorkerInfo
- type WorkerConfig
- type WorkerMode
- type WorkerState
- type WorkerStatus
Constants ¶
const DefaultWorkerPort = 42352
DefaultWorkerPort is the default worker port.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type Manager ¶
type Manager struct {
// contains filtered or unexported fields
}
Manager manages worker processes
func NewManager ¶
NewManager creates a new worker manager
func (*Manager) GetStatus ¶
func (m *Manager) GetStatus(workerID string) (*WorkerState, error)
GetStatus returns the status of a worker
func (*Manager) LoadStateFile ¶
func (m *Manager) LoadStateFile() ([]WorkerConfig, error)
LoadStateFile loads worker state from a file in tensor-fusion format
func (*Manager) Reconcile ¶
func (m *Manager) Reconcile(desiredWorkers []WorkerConfig) error
Reconcile reconciles the desired worker state with the actual state
func (*Manager) SaveStateFile ¶
func (m *Manager) SaveStateFile() error
SaveStateFile saves worker state to a file in tensor-fusion format
func (*Manager) Start ¶
func (m *Manager) Start(config WorkerConfig) error
Start starts a worker process
type TensorFusionWorkerInfo ¶
type TensorFusionWorkerInfo struct {
WorkerUID string `json:"WorkerUID"`
Namespace string `json:"Namespace,omitempty"`
WorkerName string `json:"WorkerName,omitempty"`
AllocatedDevices []string `json:"AllocatedDevices"`
Status string `json:"Status"` // "Pending", "Running", "Terminated"
}
TensorFusionWorkerInfo represents the worker info format expected by the tensor-fusion hypervisor
type WorkerConfig ¶
type WorkerConfig struct {
WorkerID string `json:"worker_id"`
GPUIDs []string `json:"gpu_ids"`
ListenPort int `json:"listen_port"`
Mode WorkerMode `json:"mode"`
ShmemFile string `json:"shmem_file,omitempty"`
ShmemSizeMB int `json:"shmem_size_mb,omitempty"`
Enabled bool `json:"enabled"`
WorkerBinaryPath string `json:"worker_binary_path,omitempty"`
}
WorkerConfig represents configuration for a worker process
type WorkerMode ¶
type WorkerMode string
WorkerMode represents the worker network mode
const (
WorkerModeTCP WorkerMode = "tcp"
WorkerModeShmem WorkerMode = "shmem"
)
type WorkerState ¶
type WorkerState struct {
Config WorkerConfig `json:"config"`
Status WorkerStatus `json:"status"`
PID int `json:"pid,omitempty"`
StartedAt *time.Time `json:"started_at,omitempty"`
Error string `json:"error,omitempty"`
}
WorkerState represents the runtime state of a worker
type WorkerStatus ¶
type WorkerStatus string
WorkerStatus represents the current status of a worker
const (
WorkerStatusPending WorkerStatus = "Pending"
WorkerStatusRunning WorkerStatus = "Running"
WorkerStatusStopping WorkerStatus = "Stopping"
WorkerStatusStopped WorkerStatus = "Stopped"
WorkerStatusTerminated WorkerStatus = "Terminated"
WorkerStatusError WorkerStatus = "Error"
)