package nvml

v0.7.0 (not the latest version of its module)
Published: Aug 25, 2025 License: Apache-2.0 Imports: 20 Imported by: 0

Documentation

Overview

Package nvml implements the NVIDIA Management Library (NVML) interface. See https://docs.nvidia.com/deploy/nvml-api/nvml-api-reference.html#nvml-api-reference for more details.

Index

Constants

This section is empty.

Variables

View Source
var BAD_CUDA_ENV_KEYS = map[string]string{
	"NSIGHT_CUDA_DEBUGGER":              "Setting NSIGHT_CUDA_DEBUGGER=1 can degrade the performance of an application, since the debugger is made resident. See https://docs.nvidia.com/nsight-visual-studio-edition/3.2/Content/Attach_CUDA_to_Process.htm.",
	"CUDA_INJECTION32_PATH":             "Captures information about CUDA execution trace. See https://docs.nvidia.com/nsight-systems/2020.3/tracing/index.html.",
	"CUDA_INJECTION64_PATH":             "Captures information about CUDA execution trace. See https://docs.nvidia.com/nsight-systems/2020.3/tracing/index.html.",
	"CUDA_AUTO_BOOST":                   "Automatically selects the highest possible clock rate allowed by the thermal and power budget. Independent of the global default setting the autoboost behavior can be overridden by setting the environment variable CUDA_AUTO_BOOST. Set CUDA_AUTO_BOOST=0 to disable frequency throttling/boosting. You may run 'nvidia-smi --auto-boost-default=0' to disable autoboost by default. See https://developer.nvidia.com/blog/increase-performance-gpu-boost-k80-autoboost/.",
	"CUDA_ENABLE_COREDUMP_ON_EXCEPTION": "Enables GPU core dumps.",
	"CUDA_COREDUMP_FILE":                "Enables GPU core dumps.",
	"CUDA_DEVICE_WAITS_ON_EXCEPTION":    "CUDA kernel will pause when an exception occurs. This is only useful for debugging.",
	"CUDA_PROFILE":                      "Enables CUDA profiling.",
	"COMPUTE_PROFILE":                   "Enables compute profiling.",
	"OPENCL_PROFILE":                    "Enables OpenCL profiling.",
}

BAD_CUDA_ENV_KEYS lists environment variables that are known to hurt CUDA. This is derived from "DCGM_FR_BAD_CUDA_ENV" in DCGM. ref. https://github.com/NVIDIA/DCGM/blob/903d745504f50153be8293f8566346f9de3b3c93/nvvs/plugin_src/software/Software.cpp#L839-L876
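
This DCGM-derived check can be sketched as a scan over a process's environment. The map below is an abbreviated stand-in for BAD_CUDA_ENV_KEYS above, and the hardcoded env slice stands in for a real process environment; the actual implementation may differ:

```go
package main

import (
	"fmt"
	"strings"
)

// badCUDAEnvKeys is an abbreviated stand-in for BAD_CUDA_ENV_KEYS above.
var badCUDAEnvKeys = map[string]string{
	"CUDA_PROFILE":                      "Enables CUDA profiling.",
	"CUDA_ENABLE_COREDUMP_ON_EXCEPTION": "Enables GPU core dumps.",
}

// findBadCUDAEnvVars returns the subset of env entries ("KEY=VALUE")
// whose keys are known to hurt CUDA workloads.
func findBadCUDAEnvVars(environ []string) map[string]string {
	found := map[string]string{}
	for _, kv := range environ {
		key, _, ok := strings.Cut(kv, "=")
		if !ok {
			continue
		}
		if desc, bad := badCUDAEnvKeys[key]; bad {
			found[key] = desc
		}
	}
	return found
}

func main() {
	env := []string{"PATH=/usr/bin", "CUDA_PROFILE=1"}
	fmt.Println(findBadCUDAEnvVars(env))
}
```

Note that DCGM may additionally check the variable's value (e.g., NSIGHT_CUDA_DEBUGGER only matters when set to 1); this sketch flags any occurrence of the key.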

View Source
var (
	// ErrGPULost is an error that indicates the GPU is lost.
	// Likely due to the GPU is physically removed from the machine.
	// Also manifested as Xid 79 (GPU has fallen off the bus).
	// ref. https://github.com/leptonai/gpud/issues/604
	ErrGPULost = errors.New("gpu lost")
)

Functions

func ClockEventsSupportedByDevice

func ClockEventsSupportedByDevice(dev device.Device) (bool, error)

Returns true if clock events are supported by this device.

func ClockEventsSupportedVersion

func ClockEventsSupportedVersion(major int) bool

Clock events are supported in driver versions 535 and above; otherwise, the CGO call exits with "undefined symbol: nvmlDeviceGetCurrentClocksEventReasons".
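
The version gate composes with driver-version parsing. The sketch below assumes the usual "major.minor.patch" driver version format (e.g., "535.129.03"); the package's actual ParseDriverVersion may handle more cases:

```go
package main

import (
	"fmt"
	"strconv"
	"strings"
)

// parseDriverVersion splits an NVIDIA driver version string such as
// "535.129.03" into its numeric components.
func parseDriverVersion(version string) (major, minor, patch int, err error) {
	parts := strings.Split(version, ".")
	if len(parts) < 2 {
		return 0, 0, 0, fmt.Errorf("unexpected version %q", version)
	}
	if major, err = strconv.Atoi(parts[0]); err != nil {
		return 0, 0, 0, err
	}
	if minor, err = strconv.Atoi(parts[1]); err != nil {
		return 0, 0, 0, err
	}
	if len(parts) > 2 {
		if patch, err = strconv.Atoi(parts[2]); err != nil {
			return 0, 0, 0, err
		}
	}
	return major, minor, patch, nil
}

// clockEventsSupportedVersion mirrors the documented rule: the clock
// events API only exists in driver versions 535 and above.
func clockEventsSupportedVersion(major int) bool {
	return major >= 535
}

func main() {
	major, _, _, _ := parseDriverVersion("535.129.03")
	fmt.Println(clockEventsSupportedVersion(major))
}
```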

func GPMSupportedByDevice

func GPMSupportedByDevice(dev device.Device) (bool, error)

func GetArchFamily added in v0.5.0

func GetArchFamily(dev device.Device) (string, error)

GetArchFamily returns the GPU architecture family name based on the given device CUDA compute capability. ref. https://github.com/NVIDIA/k8s-device-plugin/blob/f666bc3f836a09ae2fda439f3d7a8d8b06b48ac4/internal/lm/resource.go#L283C6-L283C19

func GetBrand added in v0.5.0

func GetBrand(dev device.Device) (string, error)

func GetCUDAVersion added in v0.4.5

func GetCUDAVersion() (string, error)

func GetDriverVersion

func GetDriverVersion() (string, error)

func GetGPMMetrics

func GetGPMMetrics(ctx context.Context, dev device.Device, sampleDuration time.Duration, metricIDs ...nvml.GpmMetricId) (map[nvml.GpmMetricId]float64, error)

Returns a map from metric ID to value for this device. Do not call this in parallel for multiple devices: doing so causes a "SIGSEGV: segmentation violation" in the cgo execution. Returns nil if GPM is not supported. ref. https://github.com/NVIDIA/go-nvml/blob/main/examples/gpm-metrics/main.go

func GetProductName added in v0.5.0

func GetProductName(dev device.Device) (string, error)

func GetSystemDriverVersion added in v0.5.0

func GetSystemDriverVersion(nvmlLib nvml.Interface) (string, error)

func IsGPULostError added in v0.5.0

func IsGPULostError(ret nvml.Return) bool

IsGPULostError returns true if the error indicates that the GPU is lost. "if the target GPU has fallen off the bus or is otherwise inaccessible".

func IsNoSuchFileOrDirectoryError added in v0.5.0

func IsNoSuchFileOrDirectoryError(err error) bool

func IsNotFoundError added in v0.4.5

func IsNotFoundError(ret nvml.Return) bool

IsNotFoundError returns true if the error indicates that the object/instance is not found. e.g., process not found from nvml

func IsNotReadyError added in v0.4.5

func IsNotReadyError(ret nvml.Return) bool

IsNotReadyError returns true if the error indicates that the system is not ready, meaning that the GPU is not yet initialized. e.g., "nvml.CLOCK_GRAPHICS: System is not in ready state"

func IsNotSupportError

func IsNotSupportError(ret nvml.Return) bool

IsNotSupportError returns true if the error indicates that the operation is not supported.

func IsVersionMismatchError

func IsVersionMismatchError(ret nvml.Return) bool

IsVersionMismatchError returns true if the error indicates a version mismatch.

func LoadGPUDeviceName added in v0.4.5

func LoadGPUDeviceName() (string, error)

Loads the product name of the NVIDIA GPU device.

func ParseDriverVersion

func ParseDriverVersion(version string) (major, minor, patch int, err error)

func SanitizeProductName added in v0.5.0

func SanitizeProductName(productName string) string

SanitizeProductName sanitizes the product name as in NVIDIA device plugin.

e.g., "NVIDIA H100 80GB HBM3" becomes "NVIDIA-H100-80GB-HBM3"

ref. https://github.com/NVIDIA/k8s-device-plugin/blob/f666bc3f836a09ae2fda439f3d7a8d8b06b48ac4/internal/lm/resource.go#L187-L204 ref. https://github.com/NVIDIA/k8s-device-plugin/blob/f666bc3f836a09ae2fda439f3d7a8d8b06b48ac4/internal/lm/resource.go#L314-L322
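
A minimal sketch of the sanitization, assuming (per the example above) that runs of whitespace become single dashes; the referenced NVIDIA device plugin code also strips other characters that are invalid in Kubernetes label values:

```go
package main

import (
	"fmt"
	"strings"
)

// sanitizeProductName collapses whitespace in the GPU product name into
// single dashes so it can be used as a Kubernetes label value.
func sanitizeProductName(productName string) string {
	return strings.Join(strings.Fields(productName), "-")
}

func main() {
	fmt.Println(sanitizeProductName("NVIDIA H100 80GB HBM3"))
	// NVIDIA-H100-80GB-HBM3
}
```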

func SupportedFMByGPUProduct added in v0.5.0

func SupportedFMByGPUProduct(gpuProductName string) bool

SupportedFMByGPUProduct returns the GPU fabric manager support status based on the GPU product name.

Types

type AllECCErrorCounts

type AllECCErrorCounts struct {
	// Total ECC error counts for the device.
	// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g9748430b6aa6cdbb2349c5e835d70b0f
	Total ECCErrorCounts `json:"total"`

	// GPU L1 Cache.
	// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1g9bcbee49054a953d333d4aa11e8b9c25
	L1Cache ECCErrorCounts `json:"l1_cache"`

	// GPU L2 Cache.
	// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1g9bcbee49054a953d333d4aa11e8b9c25
	L2Cache ECCErrorCounts `json:"l2_cache"`

	// Turing+ DRAM.
	// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1g9bcbee49054a953d333d4aa11e8b9c25
	DRAM ECCErrorCounts `json:"dram"`

	// Turing+ SRAM.
	// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1g9bcbee49054a953d333d4aa11e8b9c25
	SRAM ECCErrorCounts `json:"sram"`

	// GPU Device Memory.
	// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1g9bcbee49054a953d333d4aa11e8b9c25
	GPUDeviceMemory ECCErrorCounts `json:"gpu_device_memory"`

	// GPU Texture Memory.
	// Specialized memory optimized for 2D spatial locality.
	// Read-only from kernels (in most cases).
	// Optimized for specific access patterns common in graphics/image processing.
	// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1g9bcbee49054a953d333d4aa11e8b9c25
	GPUTextureMemory ECCErrorCounts `json:"gpu_texture_memory"`

	// Shared memory. Not texture memory.
	// Used for inter-thread communication and data caching within a block.
	// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1g9bcbee49054a953d333d4aa11e8b9c25
	SharedMemory ECCErrorCounts `json:"shared_memory"`

	// GPU Register File.
	// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1g9bcbee49054a953d333d4aa11e8b9c25
	GPURegisterFile ECCErrorCounts `json:"gpu_register_file"`
}

func (AllECCErrorCounts) FindUncorrectedErrs

func (allCounts AllECCErrorCounts) FindUncorrectedErrs() []string
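
FindUncorrectedErrs can be sketched as a walk over the named memory locations, reporting each with a non-zero uncorrected count. The local types below are trimmed stand-ins for ECCErrorCounts/AllECCErrorCounts, and the message format is illustrative:

```go
package main

import "fmt"

// eccErrorCounts and allECCErrorCounts are local stand-ins for the
// ECCErrorCounts / AllECCErrorCounts types, trimmed to a few fields.
type eccErrorCounts struct {
	Corrected   uint64
	Uncorrected uint64
}

type allECCErrorCounts struct {
	Total   eccErrorCounts
	L1Cache eccErrorCounts
	L2Cache eccErrorCounts
	DRAM    eccErrorCounts
}

// findUncorrectedErrs returns one human-readable line per memory
// location that has a non-zero uncorrected error count.
func (c allECCErrorCounts) findUncorrectedErrs() []string {
	named := []struct {
		name   string
		counts eccErrorCounts
	}{
		{"total", c.Total},
		{"l1 cache", c.L1Cache},
		{"l2 cache", c.L2Cache},
		{"dram", c.DRAM},
	}
	errs := []string{}
	for _, n := range named {
		if n.counts.Uncorrected > 0 {
			errs = append(errs, fmt.Sprintf("%s: %d uncorrected", n.name, n.counts.Uncorrected))
		}
	}
	return errs
}

func main() {
	c := allECCErrorCounts{
		Total: eccErrorCounts{Uncorrected: 2},
		DRAM:  eccErrorCounts{Uncorrected: 2},
	}
	fmt.Println(c.findUncorrectedErrs())
}
```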

type ClockEvents

type ClockEvents struct {
	// Time is the time the metrics were collected.
	Time metav1.Time `json:"time"`

	// Represents the GPU UUID.
	UUID string `json:"uuid"`

	// BusID is the GPU bus ID from the nvml API.
	//  e.g., "0000:0f:00.0"
	BusID string `json:"bus_id"`

	// Represents the bitmask of active clocks event reasons.
	// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlClocksEventReasons.html#group__nvmlClocksEventReasons
	ReasonsBitmask uint64 `json:"reasons_bitmask"`

	// Represents the hardware slowdown reasons.
	HWSlowdownReasons []string `json:"hw_slowdown_reasons,omitempty"`

	// Represents other human-readable reasons for the clock events.
	Reasons []string `json:"reasons,omitempty"`

	// Set true if the HW Slowdown reason due to the high temperature is active.
	HWSlowdown bool `json:"hw_slowdown"`
	// Set true if the HW Thermal Slowdown reason due to the high temperature is active.
	HWSlowdownThermal bool `json:"hw_thermal_slowdown"`
	// Set true if the HW Power Brake Slowdown reason due to the external power brake assertion is active.
	HWSlowdownPowerBrake bool `json:"hw_slowdown_power_brake"`

	// Supported is true if the clock events are supported by the device.
	Supported bool `json:"supported"`
}

ClockEvents represents the current clock events from the nvmlDeviceGetCurrentClocksEventReasons API. ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g7e505374454a0d4fc7339b6c885656d6 ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1ga115e41a14b747cb334a0e7b49ae1941 ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlClocksEventReasons.html#group__nvmlClocksEventReasons
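
Decoding ReasonsBitmask into the Reasons/HWSlowdownReasons strings can be sketched as a bit test per reason. The bit values below follow the nvmlClocksEventReasons constants (formerly nvmlClocksThrottleReasons) in the NVML headers; treat them and the strings as illustrative rather than authoritative:

```go
package main

import "fmt"

// Reason bits per the NVML nvmlClocksEventReasons constants.
const (
	reasonGpuIdle              uint64 = 0x1
	reasonSwPowerCap           uint64 = 0x4
	reasonHwSlowdown           uint64 = 0x8
	reasonHwThermalSlowdown    uint64 = 0x40
	reasonHwPowerBrakeSlowdown uint64 = 0x80
)

// decodeReasons turns a reasons bitmask into human-readable strings,
// mirroring how ClockEvents derives its Reasons fields.
func decodeReasons(bitmask uint64) []string {
	reasons := []string{}
	for bit, desc := range map[uint64]string{
		reasonGpuIdle:              "gpu idle",
		reasonSwPowerCap:           "sw power cap",
		reasonHwSlowdown:           "hw slowdown",
		reasonHwThermalSlowdown:    "hw thermal slowdown",
		reasonHwPowerBrakeSlowdown: "hw power brake slowdown",
	} {
		if bitmask&bit != 0 {
			reasons = append(reasons, desc)
		}
	}
	return reasons
}

func main() {
	fmt.Println(decodeReasons(reasonHwSlowdown | reasonHwThermalSlowdown))
}
```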

func GetClockEvents

func GetClockEvents(uuid string, dev device.Device) (ClockEvents, error)

func (*ClockEvents) Event added in v0.5.0

func (evs *ClockEvents) Event() *eventstore.Event

Event creates an eventstore.Event from ClockEvents if there are hardware slowdown reasons. Returns nil if there are no hardware slowdown reasons.

type ClockSpeed

type ClockSpeed struct {
	// Represents the GPU UUID.
	UUID string `json:"uuid"`

	// BusID is the GPU bus ID from the nvml API.
	//  e.g., "0000:0f:00.0"
	BusID string `json:"bus_id"`

	GraphicsMHz uint32 `json:"graphics_mhz"`
	MemoryMHz   uint32 `json:"memory_mhz"`

	// ClockGraphicsSupported is true if the clock speed is supported by the device.
	ClockGraphicsSupported bool `json:"clock_graphics_supported"`

	// ClockMemorySupported is true if the clock speed is supported by the device.
	ClockMemorySupported bool `json:"clock_memory_supported"`
}

ClockSpeed represents the data from the nvmlDeviceGetClockInfo API. Returns the graphics and memory clock speeds in MHz. ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g2efc4dd4096173f01d80b2a8bbfd97ad

func GetClockSpeed

func GetClockSpeed(uuid string, dev device.Device) (ClockSpeed, error)

type ECCErrorCounts

type ECCErrorCounts struct {
	// A memory error that was corrected.
	// For ECC errors, these are single bit errors.
	// For Texture memory, these are errors fixed by resend.
	// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1gc5469bd68b9fdcf78734471d86becb24
	Corrected uint64 `json:"corrected"`

	// A memory error that was not corrected.
	// For ECC errors, these are double bit errors.
	// For Texture memory, these are errors where the resend fails.
	// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1gc5469bd68b9fdcf78734471d86becb24
	Uncorrected uint64 `json:"uncorrected"`
}

type ECCErrors

type ECCErrors struct {
	// Represents the GPU UUID.
	UUID string `json:"uuid"`

	// BusID is the GPU bus ID from the nvml API.
	//  e.g., "0000:0f:00.0"
	BusID string `json:"bus_id"`

	// Aggregate counts persist across reboots (i.e. for the lifetime of the device).
	// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1g08978d1c4fb52b6a4c72b39de144f1d9
	Aggregate AllECCErrorCounts `json:"aggregate"`

	// Volatile counts are reset each time the driver loads.
	// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1g08978d1c4fb52b6a4c72b39de144f1d9
	Volatile AllECCErrorCounts `json:"volatile"`

	// Supported is true if the ECC errors are supported by the device.
	// Set to true if any of the ECC error counts are supported.
	Supported bool `json:"supported"`
}

func GetECCErrors

func GetECCErrors(uuid string, dev device.Device, eccModeEnabledCurrent bool) (ECCErrors, error)

type ECCMode

type ECCMode struct {
	// Represents the GPU UUID.
	UUID string `json:"uuid"`

	// BusID is the GPU bus ID from the nvml API.
	//  e.g., "0000:0f:00.0"
	BusID string `json:"bus_id"`

	EnabledCurrent bool `json:"enabled_current"`

	// "pending" ECC mode refers to the target mode following the next reboot.
	EnabledPending bool `json:"enabled_pending"`

	// Supported is true if the ECC mode is supported by the device.
	Supported bool `json:"supported"`
}

func GetECCModeEnabled

func GetECCModeEnabled(uuid string, dev device.Device) (ECCMode, error)

Returns the current and pending ECC modes. "pending" ECC mode refers to the target mode following the next reboot.

type GPMMetrics

type GPMMetrics struct {
	// Time is the time the metrics were collected.
	Time metav1.Time `json:"time"`

	// Device UUID that these GPM metrics belong to.
	UUID string `json:"uuid"`

	// The duration of the sample.
	SampleDuration metav1.Duration `json:"sample_duration"`

	// The metrics.
	Metrics map[nvml.GpmMetricId]float64 `json:"metrics"`
}

GPMMetrics contains the GPM metrics for a device.

type GSPFirmwareMode

type GSPFirmwareMode struct {
	UUID      string `json:"uuid"`
	BusID     string `json:"bus_id"`
	Enabled   bool   `json:"enabled"`
	Supported bool   `json:"supported"`
}

GSPFirmwareMode is the GSP firmware mode of the device. ref. https://www.nvidia.com.tw/Download/driverResults.aspx/224886/tw ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g37f644e70bd4853a78ca2bbf70861f67

func GetGSPFirmwareMode

func GetGSPFirmwareMode(uuid string, dev device.Device) (GSPFirmwareMode, error)

type Instance

type Instance interface {
	// NVMLExists returns true if the NVML library is installed.
	NVMLExists() bool

	// Library returns the NVML library.
	Library() nvmllib.Library

	// Devices returns the current devices in the system.
	// The key is the UUID of the GPU device.
	Devices() map[string]device.Device

	// ProductName returns the product name of the GPU.
	// Note that some machines have nvml library but the driver is not installed,
	// returning empty value for the GPU product name.
	ProductName() string

	// Architecture returns the architecture of the GPU.
	// GB200 may return "NVIDIA-Graphics-Device" for the product name
	// but "blackwell" for architecture.
	Architecture() string

	// Brand returns the brand of the GPU.
	Brand() string

	// DriverVersion returns the driver version of the GPU.
	DriverVersion() string

	// DriverMajor returns the major version of the driver.
	DriverMajor() int

	// CUDAVersion returns the CUDA version of the GPU.
	CUDAVersion() string

	// FabricManagerSupported returns true if the fabric manager is supported.
	FabricManagerSupported() bool

	// GetMemoryErrorManagementCapabilities returns the memory error management capabilities of the GPU.
	GetMemoryErrorManagementCapabilities() MemoryErrorManagementCapabilities

	// Shutdown shuts down the NVML library.
	Shutdown() error
}

Instance is the interface for the NVML library connector.

func New added in v0.5.0

func New() (Instance, error)

New creates a new instance of the NVML library. If NVML is not installed, it returns no-op nvml instance.

func NewNoOp added in v0.5.0

func NewNoOp() Instance

func NewWithExitOnSuccessfulLoad added in v0.5.0

func NewWithExitOnSuccessfulLoad(ctx context.Context) (Instance, error)

NewWithExitOnSuccessfulLoad creates a new instance of the NVML library and calls the exit function once NVML is successfully loaded. If the NVML library is not found, it returns a no-op nvml instance without calling the exit function. Other errors are returned as is.

type Memory

type Memory struct {
	// Represents the GPU UUID.
	UUID string `json:"uuid"`

	// BusID is the GPU bus ID from the nvml API.
	//  e.g., "0000:0f:00.0"
	BusID string `json:"bus_id"`

	TotalBytes     uint64 `json:"total_bytes"`
	TotalHumanized string `json:"total_humanized"`

	ReservedBytes     uint64 `json:"reserved_bytes"`
	ReservedHumanized string `json:"reserved_humanized"`

	UsedBytes     uint64 `json:"used_bytes"`
	UsedHumanized string `json:"used_humanized"`

	FreeBytes     uint64 `json:"free_bytes"`
	FreeHumanized string `json:"free_humanized"`

	UsedPercent string `json:"used_percent"`

	// Supported is true if the memory is supported by the device.
	Supported bool `json:"supported"`
}

func GetMemory

func GetMemory(uuid string, dev device.Device) (Memory, error)

func (Memory) GetUsedPercent

func (mem Memory) GetUsedPercent() (float64, error)
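
Since Memory stores UsedPercent as a string, the computation can be sketched as a ratio formatted to two decimal places; the exact formatting in the package may differ:

```go
package main

import (
	"fmt"
	"strconv"
)

// usedPercent computes used/total as a percentage string with two
// decimal places, guarding against a zero total.
func usedPercent(usedBytes, totalBytes uint64) (string, error) {
	if totalBytes == 0 {
		return "", fmt.Errorf("total bytes is zero")
	}
	pct := float64(usedBytes) / float64(totalBytes) * 100
	return strconv.FormatFloat(pct, 'f', 2, 64), nil
}

func main() {
	s, _ := usedPercent(40, 80)
	fmt.Println(s) // 50.00
}
```

GetUsedPercent then only needs to parse the stored string back into a float64.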

type MemoryErrorManagementCapabilities added in v0.4.5

type MemoryErrorManagementCapabilities struct {
	// (If supported) GPU can limit the impact of uncorrectable ECC errors to GPU applications.
	// Existing/new workloads will run unaffected, both in terms of accuracy and performance.
	// Thus, does not require a GPU reset when memory errors occur.
	//
	// Note that there are some rarer cases where uncorrectable errors are still uncontained,
	// thus impacting all other workloads being processed in the GPU.
	//
	// ref. https://docs.nvidia.com/deploy/a100-gpu-mem-error-mgmt/index.html#error-containments
	ErrorContainment bool `json:"error_containment"`

	// (If supported) GPU can dynamically mark the page containing uncorrectable errors
	// as unusable, and any existing or new workloads will not be allocating this page.
	//
	// Thus, does not require a GPU reset to recover from most uncorrectable ECC errors.
	//
	// ref. https://docs.nvidia.com/deploy/a100-gpu-mem-error-mgmt/index.html#dynamic-page-offlining
	DynamicPageOfflining bool `json:"dynamic_page_offlining"`

	// (If supported) GPU can replace degrading memory cells with spare ones
	// to avoid offlining regions of memory. And the row remapping is different
	// from dynamic page offlining which is fixed at a hardware level.
	//
	// The row remapping requires a GPU reset to take effect.
	//
	// Even for "NVIDIA GeForce RTX 4090", NVML API returns no error on the remapped rows API,
	// thus "NVML.Supported" is not a reliable way to check if row remapping is supported.
	// We track a separate boolean value based on the GPU product name.
	//
	// ref. https://docs.nvidia.com/deploy/a100-gpu-mem-error-mgmt/index.html#row-remapping
	RowRemapping bool `json:"row_remapping"`

	// Message contains the message to the user about the memory error management capabilities.
	Message string `json:"message,omitempty"`
}

Contains information about the GPU's memory error management capabilities. ref. https://docs.nvidia.com/deploy/a100-gpu-mem-error-mgmt/index.html#supported-gpus

func SupportedMemoryMgmtCapsByGPUProduct added in v0.4.5

func SupportedMemoryMgmtCapsByGPUProduct(gpuProductName string) MemoryErrorManagementCapabilities

SupportedMemoryMgmtCapsByGPUProduct returns the GPU memory error management capabilities based on the GPU product name. ref. https://docs.nvidia.com/deploy/a100-gpu-mem-error-mgmt/index.html#supported-gpus

type NVLink

type NVLink struct {
	// Represents the GPU UUID.
	UUID string `json:"uuid"`

	// BusID is the GPU bus ID from the nvml API.
	//  e.g., "0000:0f:00.0"
	BusID string `json:"bus_id"`

	// States is the list of nvlink states.
	States NVLinkStates `json:"states"`

	// Supported is true if the NVLink is supported by the device.
	Supported bool `json:"supported"`
}

func GetNVLink

func GetNVLink(uuid string, dev device.Device) (NVLink, error)

Queries the nvlink information.

type NVLinkState

type NVLinkState struct {
	// Link is the nvlink link number.
	Link int `json:"link"`

	// FeatureEnabled is true if the nvlink feature is enabled.
	FeatureEnabled bool `json:"feature_enabled"`
	// ReplayErrors is the number of replay errors.
	ReplayErrors uint64 `json:"replay_errors"`
	// RecoveryErrors is the number of recovery errors.
	RecoveryErrors uint64 `json:"recovery_errors"`
	// CRCErrors is the number of crc errors.
	CRCErrors uint64 `json:"crc_errors"`

	// ThroughputRawTxBytes is the NVLink TX Data throughput + protocol overhead in bytes.
	ThroughputRawTxBytes uint64 `json:"throughput_raw_tx_bytes"`
	// ThroughputRawRxBytes is the NVLink RX Data throughput + protocol overhead in bytes.
	ThroughputRawRxBytes uint64 `json:"throughput_raw_rx_bytes"`
}

type NVLinkStates

type NVLinkStates []NVLinkState

func (NVLinkStates) AllFeatureEnabled

func (s NVLinkStates) AllFeatureEnabled() bool

func (NVLinkStates) TotalCRCErrors

func (s NVLinkStates) TotalCRCErrors() uint64

func (NVLinkStates) TotalRecoveryErrors

func (s NVLinkStates) TotalRecoveryErrors() uint64

func (NVLinkStates) TotalReplayErrors added in v0.5.0

func (s NVLinkStates) TotalReplayErrors() uint64

type Op

type Op struct {
	// contains filtered or unexported fields
}

type OpOption

type OpOption func(*Op)

func WithDBRO

func WithDBRO(db *sql.DB) OpOption

Specifies the read-only database instance. If not specified, a new in-memory database is created.

func WithDBRW

func WithDBRW(db *sql.DB) OpOption

Specifies the database instance to persist nvidia components data (e.g., xid/sxid events). Must be a writable database instance. If not specified, a new in-memory database is created.

func WithHWSlowdownEventBucket added in v0.4.5

func WithHWSlowdownEventBucket(bucket eventstore.Bucket) OpOption

type PersistenceMode

type PersistenceMode struct {
	UUID    string `json:"uuid"`
	BusID   string `json:"bus_id"`
	Enabled bool   `json:"enabled"`
	// Supported is true if the persistence mode is supported by the device.
	Supported bool `json:"supported"`
}

PersistenceMode is the persistence mode of the device. Implements "DCGM_FR_PERSISTENCE_MODE" in DCGM. ref. https://github.com/NVIDIA/DCGM/blob/903d745504f50153be8293f8566346f9de3b3c93/nvvs/plugin_src/software/Software.cpp#L526-L553

Persistence mode controls whether the NVIDIA driver stays loaded when no active clients are connected to the GPU. ref. https://developer.nvidia.com/management-library-nvml

Once all clients have closed the device file, the GPU state will be unloaded unless persistence mode is enabled. ref. https://docs.nvidia.com/deploy/driver-persistence/index.html

NVIDIA Persistence Daemon provides a more robust implementation of persistence mode on Linux. ref. https://docs.nvidia.com/deploy/driver-persistence/index.html#usage

To enable persistence mode, check whether "nvidia-persistenced" is running, or run "nvidia-smi -pm 1".

func GetPersistenceMode

func GetPersistenceMode(uuid string, dev device.Device) (PersistenceMode, error)

type Power

type Power struct {
	// Represents the GPU UUID.
	UUID string `json:"uuid"`

	// BusID is the GPU bus ID from the nvml API.
	//  e.g., "0000:0f:00.0"
	BusID string `json:"bus_id"`

	UsageMilliWatts           uint32 `json:"usage_milli_watts"`
	EnforcedLimitMilliWatts   uint32 `json:"enforced_limit_milli_watts"`
	ManagementLimitMilliWatts uint32 `json:"management_limit_milli_watts"`

	UsedPercent string `json:"used_percent"`

	GetPowerUsageSupported           bool `json:"get_power_usage_supported"`
	GetPowerLimitSupported           bool `json:"get_power_limit_supported"`
	GetPowerManagementLimitSupported bool `json:"get_power_management_limit_supported"`
}

func GetPower

func GetPower(uuid string, dev device.Device) (Power, error)

func (Power) GetUsedPercent

func (power Power) GetUsedPercent() (float64, error)

type Process

type Process struct {
	PID    uint32   `json:"pid"`
	Status []string `json:"status,omitempty"`

	// ZombieStatus is set to true if the process is defunct
	// (terminated but not reaped by its parent).
	ZombieStatus bool `json:"zombie_status,omitempty"`

	// BadEnvVarsForCUDA is a map of environment variables that are known to hurt CUDA
	// that is set for this specific process.
	// Empty if there is no bad environment variable found for this process.
	// This implements "DCGM_FR_BAD_CUDA_ENV" logic in DCGM.
	BadEnvVarsForCUDA map[string]string `json:"bad_env_vars_for_cuda,omitempty"`

	CmdArgs                     []string    `json:"cmd_args,omitempty"`
	CreateTime                  metav1.Time `json:"create_time,omitempty"`
	GPUUsedPercent              uint32      `json:"gpu_used_percent,omitempty"`
	GPUUsedMemoryBytes          uint64      `json:"gpu_used_memory_bytes,omitempty"`
	GPUUsedMemoryBytesHumanized string      `json:"gpu_used_memory_bytes_humanized,omitempty"`
}

type Processes

type Processes struct {
	// Represents the GPU UUID.
	UUID string `json:"uuid"`

	// BusID is the GPU bus ID from the nvml API.
	//  e.g., "0000:0f:00.0"
	BusID string `json:"bus_id"`

	// A list of running processes.
	RunningProcesses []Process `json:"running_processes"`

	// GetComputeRunningProcessesSupported is true if the device supports the getComputeRunningProcesses API.
	GetComputeRunningProcessesSupported bool `json:"get_compute_running_processes_supported"`

	// GetProcessUtilizationSupported is true if the device supports the getProcessUtilization API.
	GetProcessUtilizationSupported bool `json:"get_process_utilization_supported"`
}

Processes represents the processes currently running on the GPU, along with their utilization, from the NVML running-process and process-utilization queries.

func GetProcesses

func GetProcesses(uuid string, dev device.Device) (Processes, error)

type RemappedRows

type RemappedRows struct {
	// Represents the GPU UUID.
	UUID string `json:"uuid"`

	// BusID is the GPU bus ID from the nvml API.
	//  e.g., "0000:0f:00.0"
	BusID string `json:"bus_id"`

	// The number of rows remapped due to correctable errors.
	RemappedDueToCorrectableErrors int `json:"remapped_due_to_correctable_errors"`

	// The number of rows remapped due to uncorrectable errors.
	RemappedDueToUncorrectableErrors int `json:"remapped_due_to_uncorrectable_errors"`

	// Indicates whether or not remappings are pending.
	// If true, GPU requires a reset to actually remap the row.
	//
	// A pending remapping won't affect future work on the GPU
	// since error-containment and dynamic page blacklisting will take care of that.
	RemappingPending bool `json:"remapping_pending"`

	// Set to true when a remapping has failed in the past.
	// A pending remapping won't affect future work on the GPU
	// since error-containment and dynamic page blacklisting will take care of that.
	RemappingFailed bool `json:"remapping_failed"`

	// Supported is true if the remapped rows are supported by the device.
	// Even for "NVIDIA GeForce RTX 4090", this "GetRemappedRows" returns no error,
	// thus "Supported" is not a reliable way to check if row remapping is supported.
	Supported bool `json:"supported"`
}

RemappedRows represents the row remapping data. The row remapping feature is used to prevent known degraded memory locations from being used. But may require a GPU reset to actually remap the rows. ref. https://docs.nvidia.com/deploy/a100-gpu-mem-error-mgmt/index.html#row-remapping ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g055e7c34f7f15b6ae9aac1dabd60870d

func GetRemappedRows

func GetRemappedRows(uuid string, dev device.Device) (RemappedRows, error)

func (RemappedRows) QualifiesForRMA

func (r RemappedRows) QualifiesForRMA() bool

Returns true if a GPU qualifies for RMA. ref. https://docs.nvidia.com/deploy/a100-gpu-mem-error-mgmt/index.html#rma-policy-thresholds-for-row-remapping

func (RemappedRows) RequiresReset

func (r RemappedRows) RequiresReset() bool

Returns true if a GPU requires a reset to remap the rows.
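
The two predicates can be sketched directly from the struct fields, assuming (per the NVIDIA row-remapping document referenced above) that a reset is needed while a remapping is pending and that the RMA policy keys off row-remapping failures; the package's actual thresholds may be more involved:

```go
package main

import "fmt"

// remappedRows is a local stand-in for the RemappedRows type above.
type remappedRows struct {
	RemappedDueToUncorrectableErrors int
	RemappingPending                 bool
	RemappingFailed                  bool
}

// requiresReset mirrors RequiresReset: a pending remapping only takes
// effect after a GPU reset.
func (r remappedRows) requiresReset() bool {
	return r.RemappingPending
}

// qualifiesForRMA is a sketch of QualifiesForRMA following the NVIDIA
// RMA policy for row remapping, which keys off remapping failures.
func (r remappedRows) qualifiesForRMA() bool {
	return r.RemappingFailed
}

func main() {
	r := remappedRows{RemappingPending: true}
	fmt.Println(r.requiresReset(), r.qualifiesForRMA())
}
```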

type Temperature

type Temperature struct {
	// Represents the GPU UUID.
	UUID string `json:"uuid"`

	// BusID is the GPU bus ID from the nvml API.
	//  e.g., "0000:0f:00.0"
	BusID string `json:"bus_id"`

	CurrentCelsiusGPUCore uint32 `json:"current_celsius_gpu_core"`

	// Threshold at which the GPU starts to shut down to prevent hardware damage.
	ThresholdCelsiusShutdown uint32 `json:"threshold_celsius_shutdown"`
	// Threshold at which the GPU starts to throttle its performance.
	ThresholdCelsiusSlowdown uint32 `json:"threshold_celsius_slowdown"`
	// Maximum safe operating temperature for the GPU's memory.
	ThresholdCelsiusMemMax uint32 `json:"threshold_celsius_mem_max"`
	// Maximum safe operating temperature for the GPU core.
	ThresholdCelsiusGPUMax uint32 `json:"threshold_celsius_gpu_max"`

	UsedPercentShutdown string `json:"used_percent_shutdown"`
	UsedPercentSlowdown string `json:"used_percent_slowdown"`
	UsedPercentMemMax   string `json:"used_percent_mem_max"`
	UsedPercentGPUMax   string `json:"used_percent_gpu_max"`
}

func GetTemperature

func GetTemperature(uuid string, dev device.Device) (Temperature, error)

func (Temperature) GetUsedPercentGPUMax

func (temp Temperature) GetUsedPercentGPUMax() (float64, error)

func (Temperature) GetUsedPercentMemMax

func (temp Temperature) GetUsedPercentMemMax() (float64, error)

func (Temperature) GetUsedPercentShutdown

func (temp Temperature) GetUsedPercentShutdown() (float64, error)

func (Temperature) GetUsedPercentSlowdown

func (temp Temperature) GetUsedPercentSlowdown() (float64, error)

type Utilization

type Utilization struct {
	// Represents the GPU UUID.
	UUID string `json:"uuid"`

	// BusID is the GPU bus ID from the nvml API.
	//  e.g., "0000:0f:00.0"
	BusID string `json:"bus_id"`

	// Percent of time over the past sample period during which one or more kernels was executing on the GPU.
	GPUUsedPercent uint32 `json:"gpu_used_percent"`
	// Percent of time over the past sample period during which global (device) memory was being read or written.
	MemoryUsedPercent uint32 `json:"memory_used_percent"`

	// Supported is true if the utilization is supported by the device.
	Supported bool `json:"supported"`
}

Utilization represents the data from the nvmlDeviceGetUtilizationRates API. Utilization information for a device. Each sample period may be between 1 second and 1/6 second, depending on the product being queried. ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g540824faa6cef45500e0d1dc2f50b321 ref. https://docs.nvidia.com/deploy/nvml-api/structnvmlUtilization__t.html#structnvmlUtilization__t c.f., "DCGM_FI_PROF_GR_ENGINE_ACTIVE" https://docs.nvidia.com/datacenter/dcgm/1.7/dcgm-api/group__dcgmFieldIdentifiers.html#group__dcgmFieldIdentifiers_1g5a93634d6e8574ab6af4bfab102709dc

func GetUtilization

func GetUtilization(uuid string, dev device.Device) (Utilization, error)

Directories

Path Synopsis
device	Package device provides a wrapper around the "github.com/NVIDIA/go-nvlib/pkg/nvlib/device".Device type that adds a PCIBusID method.
lib	Package lib implements the NVIDIA Management Library (NVML) interface.
