device

package
v1.54.7-0...-0a23e25 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 27, 2026 License: Apache-2.0 Imports: 21 Imported by: 0

Documentation

Index

Constants

View Source
// MaxDeviceProperties is the capacity of the fixed-size
// DeviceProperties.Properties array.
const MaxDeviceProperties = 64
View Source
// MaxExtraMetrics is the capacity of the fixed-size
// DeviceMetrics.ExtraMetrics array.
const MaxExtraMetrics = 64
View Source
// MaxMountPath is the size, in bytes, of the fixed-size Mount.HostPath and
// Mount.GuestPath buffers.
const MaxMountPath = 512
View Source
// MaxProcesses is a fixed upper bound related to process entries.
// NOTE(review): its use is not visible in this listing — presumably the
// capacity of the buffer used when querying process information from the C
// library; confirm against the implementation.
const MaxProcesses = 1024
View Source
const (
	// MaxTopologyDevices is the capacity of the fixed-size
	// DeviceTopologyInfo.Peers and ExtendedDeviceTopology.Devices arrays.
	MaxTopologyDevices = 64
)

Variables

This section is empty.

Functions

func GetDiskInfo

func GetDiskInfo(path string) (total int64)

func GetTotalHostRAMBytes

func GetTotalHostRAMBytes() int64

func IsNonFatalAcceleratorError

func IsNonFatalAcceleratorError(err error) bool

IsNonFatalAcceleratorError returns true if the error is a non-fatal accelerator error that should not block the main flow (e.g., NOT_SUPPORTED, NOT_FOUND)

Types

type AcceleratorError

// AcceleratorError is an error type that carries a Result code, so callers can
// tell accelerator failures apart by code after a type check.
type AcceleratorError struct {
	Code    Result // result code wrapped by this error
	Message string // NOTE(review): presumably the text reported by Error(); confirm
}

AcceleratorError wraps a Result code as an error for type checking

func (*AcceleratorError) Error

func (e *AcceleratorError) Error() string

type AcceleratorInterface

type AcceleratorInterface struct {
	// contains filtered or unexported fields
}

AcceleratorInterface provides Go bindings for the C accelerator library using purego

func NewAcceleratorInterface

func NewAcceleratorInterface(libPath string) (*AcceleratorInterface, error)

NewAcceleratorInterface creates a new accelerator interface and loads the library

func (*AcceleratorInterface) AssignPartition

func (a *AcceleratorInterface) AssignPartition(templateID, deviceUUID string) (*AssignPartitionResult, error)

AssignPartition assigns a partition to a device using a template (e.g., creating a MIG instance). It returns the assigned partition result, including the partition UUID and environment variables.

func (*AcceleratorInterface) Close

func (a *AcceleratorInterface) Close() error

Close unloads the accelerator library

func (*AcceleratorInterface) GetAllDevices

func (a *AcceleratorInterface) GetAllDevices() ([]*api.DeviceInfo, error)

GetAllDevices retrieves all available devices from the accelerator library

func (*AcceleratorInterface) GetDeviceMetrics

func (a *AcceleratorInterface) GetDeviceMetrics(deviceUUIDs []string) ([]*api.GPUUsageMetrics, error)

GetDeviceMetrics retrieves device metrics for the specified device UUIDs

func (*AcceleratorInterface) GetProcessInformation

func (a *AcceleratorInterface) GetProcessInformation() ([]api.ProcessInformation, error)

GetProcessInformation retrieves process information (compute and memory utilization) for all processes on all devices. This combines the functionality of GetProcessComputeUtilization and GetProcessMemoryUtilization following AMD SMI style API design. Note: This directly calls the C API which returns all GPU processes, regardless of what Go tracks internally.

func (*AcceleratorInterface) GetTotalProcessCount

func (a *AcceleratorInterface) GetTotalProcessCount() int

GetTotalProcessCount returns the total number of processes across all devices

func (*AcceleratorInterface) GetVendorMountLibs

func (a *AcceleratorInterface) GetVendorMountLibs() ([]*api.Mount, error)

GetVendorMountLibs retrieves vendor mount libs

func (*AcceleratorInterface) Load

func (a *AcceleratorInterface) Load() error

Load loads the accelerator library dynamically using purego on Unix systems

func (*AcceleratorInterface) RemovePartition

func (a *AcceleratorInterface) RemovePartition(templateID, deviceUUID string) error

RemovePartition removes a partition from a device. templateID is the template ID that was used to create the partition.

func (*AcceleratorInterface) SetComputeUnitHardLimit

func (a *AcceleratorInterface) SetComputeUnitHardLimit(deviceUUID string, computeUnitLimit uint32) error

SetComputeUnitHardLimit sets hard compute unit limit for a device (one-time, called at worker start)

func (*AcceleratorInterface) SetMemHardLimit

func (a *AcceleratorInterface) SetMemHardLimit(deviceUUID string, memoryLimitBytes uint64) error

SetMemHardLimit sets hard memory limit for a device (one-time, called at worker start by limiter.so)

type AssignPartitionResult

// AssignPartitionResult is the Go-side result of AssignPartition.
type AssignPartitionResult struct {
	PartitionUUID string // UUID of the assigned partition
	EnvVars       map[string]string // environment variables returned with the assignment (presumably name -> value; confirm)
	DeviceNodes   map[string]string // NOTE(review): presumably device-node mappings relevant when Type is PartitionTypeDeviceNode; key/value meaning is not shown here — confirm
	Type          PartitionResultType // kind of partition result; see the PartitionResultType constants
}

AssignPartitionResult represents the result of assigning a partition

type Controller

type Controller struct {
	// contains filtered or unexported fields
}

Controller manages GPU device discovery and lifecycle

func NewController

func NewController(
	ctx context.Context,
	acceleratorLibPath string,
	acceleratorVendor string,
	discoveryInterval time.Duration,
	isolationMode string,
) (*Controller, error)

NewController creates a new device manager

func (*Controller) AggregateNodeInfo

func (m *Controller) AggregateNodeInfo() *api.NodeInfo

func (*Controller) DiscoverDevices

func (m *Controller) DiscoverDevices() error

DiscoverDevices implements framework.DeviceController

func (*Controller) GetAcceleratorVendor

func (m *Controller) GetAcceleratorVendor() string

func (*Controller) GetDevice

func (m *Controller) GetDevice(deviceUUID string) (*api.DeviceInfo, bool)

GetDevice implements framework.DeviceController

func (*Controller) GetDeviceMetrics

func (m *Controller) GetDeviceMetrics() (map[string]*api.GPUUsageMetrics, error)

GetDeviceMetrics implements framework.DeviceController

func (*Controller) GetDevices

func (m *Controller) GetDevices() []*api.DeviceInfo

GetDevices returns all discovered devices

func (*Controller) GetProcessInformation

func (m *Controller) GetProcessInformation() ([]api.ProcessInformation, error)

GetProcessInformation implements framework.DeviceController. It returns process-level GPU metrics for all processes on all devices.

func (*Controller) GetVendorMountLibs

func (m *Controller) GetVendorMountLibs() ([]*api.Mount, error)

func (*Controller) ListDevices

func (m *Controller) ListDevices() ([]*api.DeviceInfo, error)

ListDevices implements framework.DeviceController

func (*Controller) RegisterDeviceUpdateHandler

func (m *Controller) RegisterDeviceUpdateHandler(handler framework.DeviceChangeHandler)

func (*Controller) RemovePartitionedDevice

func (m *Controller) RemovePartitionedDevice(partitionUUID, deviceUUID string) error

func (*Controller) SetAllocationController

func (m *Controller) SetAllocationController(allocationController framework.WorkerAllocationController)

SetAllocationController sets the allocation controller for telemetry purposes

func (*Controller) SplitDevice

func (m *Controller) SplitDevice(deviceUUID string, partitionTemplateID string) (*api.DeviceInfo, error)

func (*Controller) Start

func (m *Controller) Start() error

Start implements framework.DeviceController

func (*Controller) StartDiscoverDevices

func (m *Controller) StartDiscoverDevices() error

StartDiscoverDevices discovers all available GPU devices.

func (*Controller) Stop

func (m *Controller) Stop() error

type DeviceBasicInfo

// Layout note: DeviceBasicInfo is shared with the C accelerator library, so
// field order, field types and array sizes must stay in sync with the C struct
// DeviceBasicInfo in provider/accelerator.h. Do not reorder or resize fields.
// NOTE(review): the char-array fields are presumably NUL-terminated C strings;
// confirm before converting them to Go strings.
type DeviceBasicInfo struct {
	UUID              [64]byte  // C: char uuid[64]
	Vendor            [32]byte  // C: char vendor[32]
	Model             [128]byte // C: char model[128]
	DriverVersion     [80]byte  // C: char driverVersion[80]
	FirmwareVersion   [64]byte  // C: char firmwareVersion[64]
	DeviceNode        [64]byte  // C: char deviceNode[64]
	Index             int32     // C: int32_t index
	NUMANode          int32     // C: int32_t numaNode
	TotalMemoryBytes  uint64    // C: uint64_t totalMemoryBytes
	TotalComputeUnits uint64    // C: uint64_t totalComputeUnits
	MaxTflops         float64   // C: double maxTflops
	PCIeGen           uint32    // C: uint32_t pcieGen
	PCIeWidth         uint32    // C: uint32_t pcieWidth
}

DeviceBasicInfo matches the C struct DeviceBasicInfo in provider/accelerator.h. Field names in Go are capitalized for export, but the memory layout must match the C struct exactly. C struct fields: uuid, vendor, model, driverVersion, firmwareVersion, deviceNode, index, numaNode, totalMemoryBytes, totalComputeUnits, maxTflops, pcieGen, pcieWidth.

type DeviceMetrics

// DeviceMetrics is a fixed-size metrics record for one device, identified by
// DeviceUUID.
// NOTE(review): presumably mirrors a C struct in provider/accelerator.h (as
// DeviceBasicInfo and PartitionResult are documented to) — confirm before
// reordering fields or changing their types.
type DeviceMetrics struct {
	DeviceUUID         [64]byte
	PowerUsageWatts    float64
	TemperatureCelsius float64
	PCIeRxBytes        uint64
	PCIeTxBytes        uint64
	UtilizationPercent uint32
	MemoryUsedBytes    uint64
	// ExtraMetrics holds up to MaxExtraMetrics additional key/value metrics.
	ExtraMetrics       [MaxExtraMetrics]ExtraMetric
	// NOTE(review): presumably the number of populated ExtraMetrics entries; confirm.
	ExtraMetricsCount  uintptr
}

type DeviceProperties

// DeviceProperties is a fixed-capacity list of key/value device properties,
// holding at most MaxDeviceProperties entries.
type DeviceProperties struct {
	Properties [MaxDeviceProperties]DevicePropertyKV
	// NOTE(review): presumably the number of populated Properties entries; confirm.
	Count      uintptr
}

type DevicePropertyKV

// DevicePropertyKV is a single key/value property entry stored in fixed-size
// byte buffers (64 bytes for the key, 256 bytes for the value).
// NOTE(review): the buffers are presumably NUL-terminated C strings; confirm.
type DevicePropertyKV struct {
	Key   [64]byte
	Value [256]byte
}

type DeviceTopoNode

// DeviceTopoNode describes the connection from one device to a single peer
// device: which peer it is and the topology level between them.
type DeviceTopoNode struct {
	PeerUUID  [64]byte      // Peer device UUID
	PeerIndex int32         // Peer device index
	TopoLevel TopoLevelType // Topology level to this peer
}

DeviceTopoNode represents connection to another device

type DeviceTopologyInfo

// DeviceTopologyInfo is one device's topology row: the device's own identity
// (UUID, index, NUMA node) plus a fixed-capacity list of its peers.
// Peers can hold at most MaxTopologyDevices entries; PeerCount gives the
// number of peers.
type DeviceTopologyInfo struct {
	DeviceUUID  [64]byte                           // This device's UUID
	DeviceIndex int32                              // This device's index
	NUMANode    int32                              // This device's NUMA node
	Peers       [MaxTopologyDevices]DeviceTopoNode // Topology to all other devices
	PeerCount   uintptr                            // Number of peers
}

DeviceTopologyInfo represents a device and its topology to all other devices

type ExtendedDeviceInfo

// ExtendedDeviceInfo bundles everything reported for one device: its basic
// info, its key/value properties, and its virtualization capabilities.
// NOTE(review): presumably mirrors a C struct, like the DeviceBasicInfo it
// embeds — confirm before reordering fields.
type ExtendedDeviceInfo struct {
	Basic        DeviceBasicInfo
	Props        DeviceProperties
	Capabilities VirtualizationCapabilities
}

type ExtendedDeviceTopology

// ExtendedDeviceTopology holds the topology rows for all devices in a
// fixed-capacity array of MaxTopologyDevices entries; DeviceCount gives the
// number of devices.
type ExtendedDeviceTopology struct {
	Devices     [MaxTopologyDevices]DeviceTopologyInfo // Array of device topology rows
	DeviceCount uintptr                                // Number of devices
}

ExtendedDeviceTopology contains topology for all devices

type ExtraMetric

// ExtraMetric is one additional named metric: a key held in a fixed 64-byte
// buffer and a float64 value. It is the element type of
// DeviceMetrics.ExtraMetrics.
// NOTE(review): Key is presumably a NUL-terminated C string; confirm.
type ExtraMetric struct {
	Key   [64]byte
	Value float64
}

type Mount

// Mount pairs a host path with a guest path, each stored in a fixed-size
// buffer of MaxMountPath bytes.
// NOTE(review): presumably the C-side counterpart of the api.Mount values
// returned by GetVendorMountLibs, with NUL-terminated paths — confirm.
type Mount struct {
	HostPath  [MaxMountPath]byte
	GuestPath [MaxMountPath]byte
}

type PartitionResult

// Layout note: PartitionResult is shared with the C accelerator library, so
// field order, field types and array sizes must stay in sync with the C struct
// PartitionResult in provider/accelerator.h. Do not reorder or resize fields.
type PartitionResult struct {
	Type       PartitionResultType // C: PartitionResultType type
	DeviceUUID [64]byte            // C: char deviceUUID[64]
	EnvVars    [10][256]byte       // C: char envVars[10][256], key-value pairs like "A=B"
}

PartitionResult matches the C struct PartitionResult in provider/accelerator.h. Field names in Go are capitalized for export, but the memory layout must match the C struct exactly. C struct fields: type, deviceUUID, envVars.

type PartitionResultType

// PartitionResultType is an int32-backed enumeration of partition result kinds.
// It is the type of PartitionResult.Type, which is annotated as the C type
// PartitionResultType, so its size and values must match the C side.
type PartitionResultType int32

PartitionResultType represents the type of partition result

// Values of PartitionResultType.
// NOTE(review): the meanings below are inferred from the constant names and
// the EnvVars/DeviceNodes fields of AssignPartitionResult — confirm against
// provider/accelerator.h.
const (
	// Presumably: the partition is exposed to workloads via environment variables.
	PartitionTypeEnvironmentVariable PartitionResultType = 0
	// Presumably: the partition is exposed to workloads via device nodes.
	PartitionTypeDeviceNode          PartitionResultType = 1
)

type ProcessInformation

// ProcessInformation is a fixed-size per-process record combining compute and
// memory utilization for one process on one device.
// NOTE(review): presumably mirrors a C struct filled in by the accelerator
// library — confirm before reordering fields or changing their types.
type ProcessInformation struct {
	// ProcessID is a 32-byte buffer rather than an integer.
	// NOTE(review): presumably a textual, NUL-terminated process ID; confirm.
	ProcessID                 [32]byte
	DeviceUUID                [64]byte
	ComputeUtilizationPercent float64
	ActiveSMs                 uint64
	TotalSMs                  uint64
	MemoryUsedBytes           uint64
	MemoryReservedBytes       uint64
	MemoryUtilizationPercent  float64
}

ProcessInformation combines compute and memory utilization (AMD SMI style)

type Result

// Result is an int32-backed status code; AcceleratorError wraps a Result as a
// Go error.
// NOTE(review): presumably mirrors the result enum of the C accelerator
// library, so the numeric values must not be changed — confirm.
type Result int32
const (
	ResultSuccess                Result = 0
	ResultErrorInvalidParam      Result = 1
	// NOTE(review): presumably the NOT_FOUND code that the
	// IsNonFatalAcceleratorError documentation lists as non-fatal; confirm.
	ResultErrorNotFound          Result = 2
	// NOTE(review): presumably the NOT_SUPPORTED code that the
	// IsNonFatalAcceleratorError documentation lists as non-fatal; confirm.
	ResultErrorNotSupported      Result = 3
	ResultErrorResourceExhausted Result = 4
	ResultErrorOperationFailed   Result = 5
	ResultErrorInternal          Result = 6
)

type SnapshotContext

// SnapshotContext selects the target of a snapshot/resume operation: either a
// set of processes (ProcessIDs and ProcessCount) or a single device
// (DeviceUUID). Per the field comments, the fields for the unused mode are
// left NULL/0.
type SnapshotContext struct {
	ProcessIDs   *int32  // Pointer to array of process IDs (for process-level snapshot, NULL for device-level)
	ProcessCount uintptr // Number of processes (0 for device-level snapshot)
	DeviceUUID   *byte   // Device UUID (for device-level snapshot, NULL for process-level)
}

SnapshotContext is the context for snapshot/resume operations. It supports both process-level (CUDA) and device-level (other vendors) snapshots.

type TopoLevelType

// TopoLevelType is an int32-backed enumeration of connection types between two
// devices; it is the type of DeviceTopoNode.TopoLevel.
type TopoLevelType int32

TopoLevelType represents GPU-to-GPU connection type

// Values of TopoLevelType.
// NOTE(review): going by the comments below, values 0-5 appear to run from the
// closest connection to the most distant one, while Self and Unknown are
// special values outside that ordering — confirm before comparing levels
// numerically.
const (
	TopoLevelInternal     TopoLevelType = 0 // e.g. Tesla K80 (same board)
	TopoLevelSingleSwitch TopoLevelType = 1 // single PCIe switch
	TopoLevelMultiSwitch  TopoLevelType = 2 // multiple PCIe switches (no host bridge traversal)
	TopoLevelHostBridge   TopoLevelType = 3 // same host bridge
	TopoLevelNUMANode     TopoLevelType = 4 // same NUMA node
	TopoLevelSystem       TopoLevelType = 5 // cross NUMA (system level)
	TopoLevelSelf         TopoLevelType = 6 // same device
	TopoLevelUnknown      TopoLevelType = 7 // unknown or error
)

type VirtualizationCapabilities

// VirtualizationCapabilities reports which virtualization features a device
// supports (one boolean flag per feature), plus its partition and
// per-device worker limits.
// NOTE(review): presumably mirrors a C struct (it is embedded in
// ExtendedDeviceInfo next to DeviceBasicInfo) — confirm before reordering
// fields. The meaning of a zero MaxPartitions or MaxWorkersPerDevice is not
// shown here.
type VirtualizationCapabilities struct {
	SupportsPartitioning  bool
	SupportsSoftIsolation bool
	SupportsHardIsolation bool
	SupportsSnapshot      bool
	SupportsMetrics       bool
	SupportsRemoting      bool
	MaxPartitions         uint32
	MaxWorkersPerDevice   uint32
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL