Documentation
¶
Overview ¶
Package ecc tracks NVIDIA per-GPU ECC errors and other ECC-related information.
Package ecc provides NVIDIA ECC metrics collection and reporting.
Index ¶
Constants ¶
View Source
// Name is the identifier string of the NVIDIA ECC component.
// NOTE(review): presumably the name reported by the component that New
// returns — confirm against the implementation.
const Name = "accelerator-nvidia-ecc"
View Source
// SubSystem is the underscore-separated form of the component name.
// NOTE(review): presumably used as the metrics subsystem label for the
// ECC metrics this package reports — confirm at the usage site.
const SubSystem = "accelerator_nvidia_ecc"
Variables ¶
This section is empty.
Functions ¶
func New ¶
func New(gpudInstance *components.GPUdInstance) (components.Component, error)
Types ¶
type AllECCErrorCounts ¶ added in v0.9.0
// AllECCErrorCounts holds one ECCErrorCounts (corrected and uncorrected
// counters) for the device total and for each memory location listed below.
// It is serialized to JSON with one object per location.
type AllECCErrorCounts struct {
// Total ECC error counts for the device.
// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g9748430b6aa6cdbb2349c5e835d70b0f
Total ECCErrorCounts `json:"total"`
// GPU L1 Cache.
// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1g9bcbee49054a953d333d4aa11e8b9c25
L1Cache ECCErrorCounts `json:"l1_cache"`
// GPU L2 Cache.
// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1g9bcbee49054a953d333d4aa11e8b9c25
L2Cache ECCErrorCounts `json:"l2_cache"`
// Turing+ DRAM.
// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1g9bcbee49054a953d333d4aa11e8b9c25
DRAM ECCErrorCounts `json:"dram"`
// Turing+ SRAM.
// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1g9bcbee49054a953d333d4aa11e8b9c25
SRAM ECCErrorCounts `json:"sram"`
// GPU Device Memory.
// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1g9bcbee49054a953d333d4aa11e8b9c25
GPUDeviceMemory ECCErrorCounts `json:"gpu_device_memory"`
// GPU Texture Memory.
// Specialized memory optimized for 2D spatial locality.
// Read-only from kernels (in most cases).
// Optimized for specific access patterns common in graphics/image processing.
// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1g9bcbee49054a953d333d4aa11e8b9c25
GPUTextureMemory ECCErrorCounts `json:"gpu_texture_memory"`
// Shared Memory.
// Used for inter-thread communication and data caching within a block.
// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1g9bcbee49054a953d333d4aa11e8b9c25
SharedMemory ECCErrorCounts `json:"shared_memory"`
// GPU Register File.
// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1g9bcbee49054a953d333d4aa11e8b9c25
GPURegisterFile ECCErrorCounts `json:"gpu_register_file"`
}
func (AllECCErrorCounts) FindUncorrectedErrs ¶ added in v0.9.0
func (allCounts AllECCErrorCounts) FindUncorrectedErrs() []string
type ECCErrorCounts ¶ added in v0.9.0
// ECCErrorCounts pairs the corrected and uncorrected memory error counters
// for a single memory location (or for the device total).
type ECCErrorCounts struct {
// A memory error that was corrected.
// For ECC errors, these are single bit errors.
// For Texture memory, these are errors fixed by resend.
// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1gc5469bd68b9fdcf78734471d86becb24
Corrected uint64 `json:"corrected"`
// A memory error that was not corrected.
// For ECC errors, these are double bit errors.
// For Texture memory, these are errors where the resend fails.
// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1gc5469bd68b9fdcf78734471d86becb24
Uncorrected uint64 `json:"uncorrected"`
}
type ECCErrors ¶ added in v0.9.0
// ECCErrors carries the ECC error counters of a single GPU, identified by
// its UUID and bus ID, split into aggregate (device-lifetime) and volatile
// (since driver load) counts.
type ECCErrors struct {
// UUID represents the GPU UUID.
UUID string `json:"uuid"`
// BusID is the GPU bus ID from the nvml API.
// e.g., "0000:0f:00.0"
BusID string `json:"bus_id"`
// Aggregate counts persist across reboots (i.e. for the lifetime of the device).
// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1g08978d1c4fb52b6a4c72b39de144f1d9
Aggregate AllECCErrorCounts `json:"aggregate"`
// Volatile counts are reset each time the driver loads.
// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1g08978d1c4fb52b6a4c72b39de144f1d9
Volatile AllECCErrorCounts `json:"volatile"`
// Supported is true if the ECC errors are supported by the device.
// Set to true if any of the ECC error counts are supported.
Supported bool `json:"supported"`
}
type ECCMode ¶ added in v0.9.0
// ECCMode describes the ECC mode of a single GPU, identified by its UUID
// and bus ID: the current setting, the pending setting, and whether the
// device supports ECC mode at all.
type ECCMode struct {
// UUID represents the GPU UUID.
UUID string `json:"uuid"`
// BusID is the GPU bus ID from the nvml API.
// e.g., "0000:0f:00.0"
BusID string `json:"bus_id"`
// EnabledCurrent is the "current" ECC mode.
// NOTE(review): presumably true when ECC is enabled in the mode that is
// in effect right now — confirm against the NVML ECC mode query.
EnabledCurrent bool `json:"enabled_current"`
// "pending" ECC mode refers to the target mode following the next reboot.
EnabledPending bool `json:"enabled_pending"`
// Supported is true if the ECC mode is supported by the device.
Supported bool `json:"supported"`
}
Click to show internal directories.
Click to hide internal directories.