gpu

package
v0.12.0-rc7 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Apr 24, 2026 License: Apache-2.0 Imports: 14 Imported by: 0

Documentation

Overview

Package gpu collects GPU hardware and driver configuration data using a two-phase detection model.

Two-Phase Collection

The collector runs two independent detection phases, each producing a separate measurement subtype:

Phase 1 ("hardware"): NFD-based PCI enumeration — detects NVIDIA GPUs
    via sysfs PCI device scan and checks nvidia kernel module state.
    No GPU drivers required. Requires Linux with sysfs mounted.

Phase 2 ("smi"): nvidia-smi XML query — collects driver version, CUDA
    version, per-GPU hardware specs, and runtime settings. Requires
    nvidia-smi in PATH with a loaded NVIDIA driver.

Phase 1 enables day-0 GPU detection on freshly provisioned nodes where drivers have not yet been installed. Phase 2 provides the full telemetry used for recipe generation and validation.

Graceful Degradation

Each phase degrades independently:

  • Phase 1 failure (e.g., no sysfs on macOS): logged as warning, skipped. Only the "smi" subtype is returned.
  • Phase 2 failure (e.g., nvidia-smi not installed): logged as warning. A zero-GPU "smi" subtype is returned with gpu-count=0.
  • Both phases fail: measurement contains only the zero-GPU "smi" subtype.
  • No HardwareDetector configured (detector is nil): Phase 1 is skipped entirely, preserving the pre-NFD single-phase behavior.

Measurement Structure

A successful two-phase collection produces:

Measurement{
    Type: "GPU",
    Subtypes: [
        {Name: "hardware", Data: {gpu-present, gpu-count, driver-loaded, detection-source}},
        {Name: "smi",      Data: {gpu-count, driver, cuda-version, gpu.model, ...}},
    ],
}

The "hardware" subtype keys are defined in pkg/measurement:

  • KeyGPUPresent: bool — true if at least one NVIDIA GPU found via PCI
  • KeyGPUCount: int — number of NVIDIA GPUs detected
  • KeyGPUDriverLoaded: bool — true if nvidia kernel module is loaded
  • KeyGPUDetectionSource: string — detection method (e.g., "nfd")

The "smi" subtype contains driver telemetry and per-GPU hardware details.

Usage

The collector is created by the factory with NFD wiring:

collector := gpu.NewCollector(
    gpu.WithHardwareDetector(&gpu.NFDHardwareDetector{}),
)
m, err := collector.Collect(ctx)

Without WithHardwareDetector, Phase 1 is skipped (backward compatible).

Context and Timeouts

The collector respects context cancellation and applies a bounded timeout (defaults.CollectorTimeout). NFD detection has its own sub-timeout (defaults.NFDDetectionTimeout = 5s). The context is passed to each phase, so cancellation is respected within each phase's I/O operations.

Platform Support

  • Linux with sysfs: Both phases run (full two-phase detection)
  • macOS / containers without /sys: Phase 1 fails gracefully, Phase 2 only
  • No nvidia-smi: Phase 2 returns zero-GPU subtype

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type Aggregate

// Aggregate is the decode target for the <aggregate> element nested under
// <ecc_errors> in nvidia-smi's XML output (see EccErrors.Aggregate).
//
// It carries the same counters as Volatile plus SramThresholdExceeded.
// Every value is stored as the raw string read from the XML; nothing is
// converted to a number by this type.
type Aggregate struct {
	SramCorrectable         string `xml:"sram_correctable" json:"sramCorrectable" yaml:"sramCorrectable"`
	SramUncorrectableParity string `xml:"sram_uncorrectable_parity" json:"sramUncorrectableParity" yaml:"sramUncorrectableParity"`
	SramUncorrectableSecded string `xml:"sram_uncorrectable_secded" json:"sramUncorrectableSecded" yaml:"sramUncorrectableSecded"`
	DramCorrectable         string `xml:"dram_correctable" json:"dramCorrectable" yaml:"dramCorrectable"`
	DramUncorrectable       string `xml:"dram_uncorrectable" json:"dramUncorrectable" yaml:"dramUncorrectable"`
	SramThresholdExceeded   string `xml:"sram_threshold_exceeded" json:"sramThresholdExceeded" yaml:"sramThresholdExceeded"`
}

type AggregateUncorrectableSramSources

// AggregateUncorrectableSramSources is the decode target for the
// <aggregate_uncorrectable_sram_sources> element nested under <ecc_errors>
// (see EccErrors.AggregateUncorrectableSramSources).
//
// Each field maps one sram_* child element and is kept as the raw string.
type AggregateUncorrectableSramSources struct {
	SramL2              string `xml:"sram_l2" json:"sramL2" yaml:"sramL2"`
	SramSm              string `xml:"sram_sm" json:"sramSm" yaml:"sramSm"`
	SramMicrocontroller string `xml:"sram_microcontroller" json:"sramMicrocontroller" yaml:"sramMicrocontroller"`
	SramPcie            string `xml:"sram_pcie" json:"sramPcie" yaml:"sramPcie"`
	SramOther           string `xml:"sram_other" json:"sramOther" yaml:"sramOther"`
}

type ApplicationsClocks

// ApplicationsClocks is the decode target for the <applications_clocks>
// element of a <gpu> entry (see GPU.ApplicationsClocks).
//
// It has the same field set as DefaultApplicationsClocks. Values are kept
// as raw strings.
type ApplicationsClocks struct {
	GraphicsClock string `xml:"graphics_clock" json:"graphicsClock" yaml:"graphicsClock"`
	MemClock      string `xml:"mem_clock" json:"memClock" yaml:"memClock"`
}

type Bar1MemoryUsage

// Bar1MemoryUsage is the decode target for the <bar1_memory_usage> element
// of a <gpu> entry (see GPU.Bar1MemoryUsage).
//
// Total, Used and Free are kept as the raw strings found in the XML; this
// type does not parse them into numbers.
type Bar1MemoryUsage struct {
	Total string `xml:"total" json:"total" yaml:"total"`
	Used  string `xml:"used" json:"used" yaml:"used"`
	Free  string `xml:"free" json:"free" yaml:"free"`
}

type Capabilities

// Capabilities is the decode target for the <capabilities> element of a
// <gpu> entry (see GPU.Capabilities). Only the <egm> child is mapped, as a
// raw string.
type Capabilities struct {
	Egm string `xml:"egm" json:"egm" yaml:"egm"`
}

type CcProtectedMemoryUsage

// CcProtectedMemoryUsage is the decode target for the
// <cc_protected_memory_usage> element of a <gpu> entry
// (see GPU.CcProtectedMemoryUsage).
//
// It has the same Total/Used/Free layout as Bar1MemoryUsage; values are
// kept as raw strings.
type CcProtectedMemoryUsage struct {
	Total string `xml:"total" json:"total" yaml:"total"`
	Used  string `xml:"used" json:"used" yaml:"used"`
	Free  string `xml:"free" json:"free" yaml:"free"`
}

type ClockPolicy

// ClockPolicy is the decode target for the <clock_policy> element of a
// <gpu> entry (see GPU.ClockPolicy). Both auto-boost values are kept as
// raw strings.
type ClockPolicy struct {
	AutoBoost        string `xml:"auto_boost" json:"autoBoost" yaml:"autoBoost"`
	AutoBoostDefault string `xml:"auto_boost_default" json:"autoBoostDefault" yaml:"autoBoostDefault"`
}

type Clocks

// Clocks is the decode target for the <clocks> element of a <gpu> entry
// (see GPU.Clocks).
//
// It has the same four fields as MaxClocks. Values are kept as raw strings.
type Clocks struct {
	GraphicsClock string `xml:"graphics_clock" json:"graphicsClock" yaml:"graphicsClock"`
	SmClock       string `xml:"sm_clock" json:"smClock" yaml:"smClock"`
	MemClock      string `xml:"mem_clock" json:"memClock" yaml:"memClock"`
	VideoClock    string `xml:"video_clock" json:"videoClock" yaml:"videoClock"`
}

type ClocksEventReasons

// ClocksEventReasons is the decode target for the <clocks_event_reasons>
// element of a <gpu> entry (see GPU.ClocksEventReasons). Every reason is
// kept as a raw string.
//
// NOTE(review): five of the struct tags below are shown as empty with a
// "string literal not displayed" placeholder. That is a rendering
// truncation of long tags, not the real declaration; consult the package
// source for the actual xml/json/yaml names of those fields. They
// presumably follow the same clocks_event_reason_* pattern as the visible
// tags, but that is not confirmed here.
type ClocksEventReasons struct {
	ClocksEventReasonGpuIdle                   string `xml:"clocks_event_reason_gpu_idle" json:"clocksEventReasonGPUIdle" yaml:"clocksEventReasonGPUIdle"`
	ClocksEventReasonApplicationsClocksSetting string `` /* 153-byte string literal not displayed */
	ClocksEventReasonSwPowerCap                string `xml:"clocks_event_reason_sw_power_cap" json:"clocksEventReasonSwPowerCap" yaml:"clocksEventReasonSwPowerCap"`
	ClocksEventReasonHwSlowdown                string `xml:"clocks_event_reason_hw_slowdown" json:"clocksEventReasonHwSlowdown" yaml:"clocksEventReasonHwSlowdown"`
	ClocksEventReasonHwThermalSlowdown         string `` /* 129-byte string literal not displayed */
	ClocksEventReasonHwPowerBrakeSlowdown      string `` /* 139-byte string literal not displayed */
	ClocksEventReasonSyncBoost                 string `xml:"clocks_event_reason_sync_boost" json:"clocksEventReasonSyncBoost" yaml:"clocksEventReasonSyncBoost"`
	ClocksEventReasonSwThermalSlowdown         string `` /* 129-byte string literal not displayed */
	ClocksEventReasonDisplayClocksSetting      string `` /* 138-byte string literal not displayed */
}

type Collector

// Collector gathers GPU measurements. Its state is unexported: create one
// with NewCollector and configure it through CollectorOption values such as
// WithHardwareDetector and WithCommandRunner, then call Collect.
type Collector struct {
	// contains filtered or unexported fields
}

Collector collects GPU data in up to two phases: an optional hardware-detection phase (run only when a HardwareDetector is configured) and an nvidia-smi phase that parses the command's XML output into NVSMIDevice structures.

func NewCollector added in v0.12.0

func NewCollector(opts ...CollectorOption) *Collector

NewCollector creates a GPU collector with the given options.

func (*Collector) Collect

func (s *Collector) Collect(ctx context.Context) (*measurement.Measurement, error)

Collect retrieves GPU information in two phases:

  • Phase 1 (hardware): NFD-based PCI detection when hardwareDetector is set
  • Phase 2 (smi): existing nvidia-smi collection (always runs)

When hardwareDetector is nil, Phase 1 is skipped, preserving pre-NFD behavior. On Phase 1 failure, the collector logs a warning and proceeds with Phase 2.

type CollectorOption added in v0.12.0

// CollectorOption is a functional option: a function that receives the
// Collector under construction and mutates it. Options are passed to
// NewCollector.
type CollectorOption func(*Collector)

CollectorOption configures a Collector.

func WithCommandRunner added in v0.12.0

func WithCommandRunner(runner commandRunner) CollectorOption

WithCommandRunner sets a custom command runner for executing external tools. Used in tests to mock nvidia-smi execution.

func WithHardwareDetector added in v0.12.0

func WithHardwareDetector(d HardwareDetector) CollectorOption

WithHardwareDetector sets the hardware detector for Phase 1 GPU detection. When not set, Phase 1 is skipped and only nvidia-smi collection runs.

type DefaultApplicationsClocks

// DefaultApplicationsClocks is the decode target for the
// <default_applications_clocks> element of a <gpu> entry
// (see GPU.DefaultApplicationsClocks).
//
// It has the same field set as ApplicationsClocks. Values are kept as raw
// strings.
type DefaultApplicationsClocks struct {
	GraphicsClock string `xml:"graphics_clock" json:"graphicsClock" yaml:"graphicsClock"`
	MemClock      string `xml:"mem_clock" json:"memClock" yaml:"memClock"`
}

type DeferredClocks

// DeferredClocks is the decode target for the <deferred_clocks> element of
// a <gpu> entry (see GPU.DeferredClocks). Only <mem_clock> is mapped, as a
// raw string.
type DeferredClocks struct {
	MemClock string `xml:"mem_clock" json:"memClock" yaml:"memClock"`
}

type DoubleBitRetirement

// DoubleBitRetirement is the decode target for the <double_bit_retirement>
// element nested under <retired_pages>
// (see RetiredPages.DoubleBitRetirement).
//
// It has the same field set as MultipleSingleBitRetirement. Both the count
// and the page list are kept as raw strings.
type DoubleBitRetirement struct {
	RetiredCount    string `xml:"retired_count" json:"retiredCount" yaml:"retiredCount"`
	RetiredPagelist string `xml:"retired_pagelist" json:"retiredPagelist" yaml:"retiredPagelist"`
}

type DramEncryptionMode

// DramEncryptionMode is the decode target for the <dram_encryption_mode>
// element of a <gpu> entry (see GPU.DramEncryptionMode). It records a
// current and a pending value, both as raw strings.
type DramEncryptionMode struct {
	CurrentDramEncryption string `xml:"current_dram_encryption" json:"currentDramEncryption" yaml:"currentDramEncryption"`
	PendingDramEncryption string `xml:"pending_dram_encryption" json:"pendingDramEncryption" yaml:"pendingDramEncryption"`
}

type DriverModel

// DriverModel is the decode target for the <driver_model> element of a
// <gpu> entry (see GPU.DriverModel). It records a current and a pending
// value, both as raw strings.
type DriverModel struct {
	CurrentDm string `xml:"current_dm" json:"currentDm" yaml:"currentDm"`
	PendingDm string `xml:"pending_dm" json:"pendingDm" yaml:"pendingDm"`
}

type EccErrors

// EccErrors is the decode target for the <ecc_errors> element of a <gpu>
// entry (see GPU.EccErrors). It groups three nested sections: <volatile>,
// <aggregate> and <aggregate_uncorrectable_sram_sources>.
type EccErrors struct {
	Volatile                          Volatile                          `xml:"volatile" json:"volatile" yaml:"volatile"`
	Aggregate                         Aggregate                         `xml:"aggregate" json:"aggregate" yaml:"aggregate"`
	AggregateUncorrectableSramSources AggregateUncorrectableSramSources `xml:"aggregate_uncorrectable_sram_sources" json:"aggregateUncorrectableSramSources" yaml:"aggregateUncorrectableSramSources"`
}

type EccMode

// EccMode is the decode target for the <ecc_mode> element of a <gpu> entry
// (see GPU.EccMode). It records a current and a pending value, both as raw
// strings.
type EccMode struct {
	CurrentEcc string `xml:"current_ecc" json:"currentEcc" yaml:"currentEcc"`
	PendingEcc string `xml:"pending_ecc" json:"pendingEcc" yaml:"pendingEcc"`
}

type EncoderStats

// EncoderStats is the decode target for the <encoder_stats> element of a
// <gpu> entry (see GPU.EncoderStats).
//
// It has the same field set as FbcStats. Values are kept as raw strings.
type EncoderStats struct {
	SessionCount   string `xml:"session_count" json:"sessionCount" yaml:"sessionCount"`
	AverageFps     string `xml:"average_fps" json:"averageFps" yaml:"averageFps"`
	AverageLatency string `xml:"average_latency" json:"averageLatency" yaml:"averageLatency"`
}

type Fabric

// Fabric is the decode target for the <fabric> element of a <gpu> entry
// (see GPU.Fabric). Scalar values are kept as raw strings; the nested
// <health> section is decoded into Health.
//
// NOTE(review): Cliqueid and Clusteruuid use camelCase XML element names
// ("cliqueId", "clusterUuid") while most other tags in this package are
// snake_case. encoding/xml matches element names case-sensitively, so
// these presumably mirror what nvidia-smi emits; confirm against real
// output before changing them.
type Fabric struct {
	State       string `xml:"state" json:"state" yaml:"state"`
	Status      string `xml:"status" json:"status" yaml:"status"`
	Cliqueid    string `xml:"cliqueId" json:"cliqueId" yaml:"cliqueId"`
	Clusteruuid string `xml:"clusterUuid" json:"clusterUuid" yaml:"clusterUuid"`
	Health      Health `xml:"health" json:"health" yaml:"health"`
}

type FbMemoryUsage

// FbMemoryUsage is the decode target for the <fb_memory_usage> element of a
// <gpu> entry (see GPU.FbMemoryUsage).
//
// Unlike Bar1MemoryUsage and CcProtectedMemoryUsage it also maps a
// <reserved> child. All values are kept as raw strings.
type FbMemoryUsage struct {
	Total    string `xml:"total" json:"total" yaml:"total"`
	Reserved string `xml:"reserved" json:"reserved" yaml:"reserved"`
	Used     string `xml:"used" json:"used" yaml:"used"`
	Free     string `xml:"free" json:"free" yaml:"free"`
}

type FbcStats

// FbcStats is the decode target for the <fbc_stats> element of a <gpu>
// entry (see GPU.FbcStats).
//
// It has the same field set as EncoderStats. Values are kept as raw strings.
type FbcStats struct {
	SessionCount   string `xml:"session_count" json:"sessionCount" yaml:"sessionCount"`
	AverageFps     string `xml:"average_fps" json:"averageFps" yaml:"averageFps"`
	AverageLatency string `xml:"average_latency" json:"averageLatency" yaml:"averageLatency"`
}

type GPU

// GPU is the decode target for one <gpu> element of nvidia-smi's XML output;
// NVSMIDevice.GPUs holds one GPU value per such element.
//
// Scalar elements are stored as raw strings (no numeric or unit parsing is
// done by this type). Grouped elements are decoded into the dedicated
// struct types declared in this package (Pci, Clocks, EccErrors, ...).
// Each field carries matching xml, json and yaml tags: the xml name is the
// element read from nvidia-smi, the json/yaml names are the camelCase keys
// used when the value is re-serialized.
//
// NOTE(review): MigDevices, Processes and AccountedProcesses are declared
// as plain strings. With encoding/xml a string field receives only the
// element's character data, so if nvidia-smi nests child elements under
// <mig_devices>, <processes> or <accounted_processes> their content is not
// captured here; confirm that this is intended.
//
// NOTE(review): PlatformInfo is tagged xml:"platformInfo" (camelCase),
// unlike the snake_case names used by the other fields. XML name matching
// is case-sensitive, so verify the spelling against real nvidia-smi output.
type GPU struct {
	ProductName               string                    `xml:"product_name" json:"productName" yaml:"productName"`
	ProductBrand              string                    `xml:"product_brand" json:"productBrand" yaml:"productBrand"`
	ProductArchitecture       string                    `xml:"product_architecture" json:"productArchitecture" yaml:"productArchitecture"`
	DisplayMode               string                    `xml:"display_mode" json:"displayMode" yaml:"displayMode"`
	DisplayActive             string                    `xml:"display_active" json:"displayActive" yaml:"displayActive"`
	PersistenceMode           string                    `xml:"persistence_mode" json:"persistenceMode" yaml:"persistenceMode"`
	AddressingMode            string                    `xml:"addressing_mode" json:"addressingMode" yaml:"addressingMode"`
	MigMode                   MigMode                   `xml:"mig_mode" json:"migMode" yaml:"migMode"`
	MigDevices                string                    `xml:"mig_devices" json:"migDevices" yaml:"migDevices"`
	AccountingMode            string                    `xml:"accounting_mode" json:"accountingMode" yaml:"accountingMode"`
	AccountingModeBufferSize  string                    `xml:"accounting_mode_buffer_size" json:"accountingModeBufferSize" yaml:"accountingModeBufferSize"`
	DriverModel               DriverModel               `xml:"driver_model" json:"driverModel" yaml:"driverModel"`
	Serial                    string                    `xml:"serial" json:"serial" yaml:"serial"`
	UUID                      string                    `xml:"uuid" json:"uuid" yaml:"uuid"`
	MinorNumber               string                    `xml:"minor_number" json:"minorNumber" yaml:"minorNumber"`
	VbiosVersion              string                    `xml:"vbios_version" json:"vbiosVersion" yaml:"vbiosVersion"`
	MultigpuBoard             string                    `xml:"multigpu_board" json:"multiGPUBoard" yaml:"multiGPUBoard"`
	BoardID                   string                    `xml:"board_id" json:"boardId" yaml:"boardId"`
	BoardPartNumber           string                    `xml:"board_part_number" json:"boardPartNumber" yaml:"boardPartNumber"`
	GpuPartNumber             string                    `xml:"gpu_part_number" json:"gpuPartNumber" yaml:"gpuPartNumber"`
	GpuFruPartNumber          string                    `xml:"gpu_fru_part_number" json:"gpuFRUPartNumber" yaml:"gpuFRUPartNumber"`
	PlatformInfo              PlatformInfo              `xml:"platformInfo" json:"platformInfo" yaml:"platformInfo"`
	InforomVersion            InforomVersion            `xml:"inforom_version" json:"inforomVersion" yaml:"inforomVersion"`
	InforomBbxFlush           InforomBbxFlush           `xml:"inforom_bbx_flush" json:"inforomBBXFlush" yaml:"inforomBBXFlush"`
	GpuOperationMode          OperationMode             `xml:"gpu_operation_mode" json:"gpuOperationMode" yaml:"gpuOperationMode"`
	C2cMode                   string                    `xml:"c2c_mode" json:"c2cMode" yaml:"c2cMode"`
	GpuVirtualizationMode     VirtualizationMode        `xml:"gpu_virtualization_mode" json:"gpuVirtualizationMode" yaml:"gpuVirtualizationMode"`
	GpuResetStatus            ResetStatus               `xml:"gpu_reset_status" json:"gpuResetStatus" yaml:"gpuResetStatus"`
	GpuRecoveryAction         string                    `xml:"gpu_recovery_action" json:"gpuRecoveryAction" yaml:"gpuRecoveryAction"`
	GspFirmwareVersion        string                    `xml:"gsp_firmware_version" json:"gspFirmwareVersion" yaml:"gspFirmwareVersion"`
	Ibmnpu                    Ibmnpu                    `xml:"ibmnpu" json:"ibmnpu" yaml:"ibmnpu"`
	Pci                       Pci                       `xml:"pci" json:"pci" yaml:"pci"`
	FanSpeed                  string                    `xml:"fan_speed" json:"fanSpeed" yaml:"fanSpeed"`
	PerformanceState          string                    `xml:"performance_state" json:"performanceState" yaml:"performanceState"`
	ClocksEventReasons        ClocksEventReasons        `xml:"clocks_event_reasons" json:"clocksEventReasons" yaml:"clocksEventReasons"`
	SparseOperationMode       string                    `xml:"sparse_operation_mode" json:"sparseOperationMode" yaml:"sparseOperationMode"`
	FbMemoryUsage             FbMemoryUsage             `xml:"fb_memory_usage" json:"fbMemoryUsage" yaml:"fbMemoryUsage"`
	Bar1MemoryUsage           Bar1MemoryUsage           `xml:"bar1_memory_usage" json:"bar1MemoryUsage" yaml:"bar1MemoryUsage"`
	CcProtectedMemoryUsage    CcProtectedMemoryUsage    `xml:"cc_protected_memory_usage" json:"ccProtectedMemoryUsage" yaml:"ccProtectedMemoryUsage"`
	ComputeMode               string                    `xml:"compute_mode" json:"computeMode" yaml:"computeMode"`
	Utilization               Utilization               `xml:"utilization" json:"utilization" yaml:"utilization"`
	EncoderStats              EncoderStats              `xml:"encoder_stats" json:"encoderStats" yaml:"encoderStats"`
	FbcStats                  FbcStats                  `xml:"fbc_stats" json:"fbcStats" yaml:"fbcStats"`
	DramEncryptionMode        DramEncryptionMode        `xml:"dram_encryption_mode" json:"dramEncryptionMode" yaml:"dramEncryptionMode"`
	EccMode                   EccMode                   `xml:"ecc_mode" json:"eccMode" yaml:"eccMode"`
	EccErrors                 EccErrors                 `xml:"ecc_errors" json:"eccErrors" yaml:"eccErrors"`
	RetiredPages              RetiredPages              `xml:"retired_pages" json:"retiredPages" yaml:"retiredPages"`
	RemappedRows              RemappedRows              `xml:"remapped_rows" json:"remappedRows" yaml:"remappedRows"`
	Temperature               Temperature               `xml:"temperature" json:"temperature" yaml:"temperature"`
	SupportedGpuTargetTemp    SupportedGpuTargetTemp    `xml:"supported_gpu_target_temp" json:"supportedGpuTargetTemp" yaml:"supportedGpuTargetTemp"`
	GpuPowerReadings          PowerReadings             `xml:"gpu_power_readings" json:"gpuPowerReadings" yaml:"gpuPowerReadings"`
	GpuMemoryPowerReadings    MemoryPowerReadings       `xml:"gpu_memory_power_readings" json:"gpuMemoryPowerReadings" yaml:"gpuMemoryPowerReadings"`
	ModulePowerReadings       ModulePowerReadings       `xml:"module_power_readings" json:"modulePowerReadings" yaml:"modulePowerReadings"`
	PowerSmoothing            string                    `xml:"power_smoothing" json:"powerSmoothing" yaml:"powerSmoothing"`
	PowerProfiles             PowerProfiles             `xml:"power_profiles" json:"powerProfiles" yaml:"powerProfiles"`
	Clocks                    Clocks                    `xml:"clocks" json:"clocks" yaml:"clocks"`
	ApplicationsClocks        ApplicationsClocks        `xml:"applications_clocks" json:"applicationsClocks" yaml:"applicationsClocks"`
	DefaultApplicationsClocks DefaultApplicationsClocks `xml:"default_applications_clocks" json:"defaultApplicationsClocks" yaml:"defaultApplicationsClocks"`
	DeferredClocks            DeferredClocks            `xml:"deferred_clocks" json:"deferredClocks" yaml:"deferredClocks"`
	MaxClocks                 MaxClocks                 `xml:"max_clocks" json:"maxClocks" yaml:"maxClocks"`
	MaxCustomerBoostClocks    MaxCustomerBoostClocks    `xml:"max_customer_boost_clocks" json:"maxCustomerBoostClocks" yaml:"maxCustomerBoostClocks"`
	ClockPolicy               ClockPolicy               `xml:"clock_policy" json:"clockPolicy" yaml:"clockPolicy"`
	Voltage                   Voltage                   `xml:"voltage" json:"voltage" yaml:"voltage"`
	Fabric                    Fabric                    `xml:"fabric" json:"fabric" yaml:"fabric"`
	SupportedClocks           SupportedClocks           `xml:"supported_clocks" json:"supportedClocks" yaml:"supportedClocks"`
	Processes                 string                    `xml:"processes" json:"processes" yaml:"processes"`
	AccountedProcesses        string                    `xml:"accounted_processes" json:"accountedProcesses" yaml:"accountedProcesses"`
	Capabilities              Capabilities              `xml:"capabilities" json:"capabilities" yaml:"capabilities"`
}

type HardwareDetector added in v0.12.0

// HardwareDetector is the seam used for Phase 1 ("hardware") detection.
// A Collector only runs Phase 1 when a detector is supplied through
// WithHardwareDetector; NFDHardwareDetector is the implementation shown in
// the package usage example, and tests can substitute their own.
type HardwareDetector interface {
	// Detect discovers GPU hardware and driver module state.
	// Returns HardwareInfo describing what was found, or an error if
	// detection could not be performed (e.g., sysfs not available).
	Detect(ctx context.Context) (*HardwareInfo, error)
}

HardwareDetector abstracts GPU hardware detection for testability. Implementations enumerate PCI devices and kernel module state without requiring GPU drivers to be installed.

type HardwareInfo added in v0.12.0

// HardwareInfo is the result returned by HardwareDetector.Detect. Its four
// fields correspond to the keys of the "hardware" measurement subtype
// (gpu-present, gpu-count, driver-loaded, detection-source).
type HardwareInfo struct {
	// GPUPresent is true if at least one NVIDIA GPU was found via PCI enumeration.
	GPUPresent bool

	// GPUCount is the number of NVIDIA GPUs detected via PCI enumeration.
	GPUCount int

	// DriverLoaded is true if the nvidia kernel module is currently loaded.
	DriverLoaded bool

	// DetectionSource identifies which detection method produced this result
	// (e.g., "nfd", "sysfs").
	DetectionSource string
}

HardwareInfo describes the GPU hardware state detected without drivers.

type Health

// Health is the decode target for the <health> element nested under
// <fabric> (see Fabric.Health). Every value is kept as a raw string.
type Health struct {
	Bandwidth               string `xml:"bandwidth" json:"bandwidth" yaml:"bandwidth"`
	RouteRecoveryInProgress string `xml:"route_recovery_in_progress" json:"routeRecoveryInProgress" yaml:"routeRecoveryInProgress"`
	RouteUnhealthy          string `xml:"route_unhealthy" json:"routeUnhealthy" yaml:"routeUnhealthy"`
	AccessTimeoutRecovery   string `xml:"access_timeout_recovery" json:"accessTimeoutRecovery" yaml:"accessTimeoutRecovery"`
}

type Ibmnpu

// Ibmnpu is the decode target for the <ibmnpu> element of a <gpu> entry
// (see GPU.Ibmnpu). Only <relaxed_ordering_mode> is mapped, as a raw string.
type Ibmnpu struct {
	RelaxedOrderingMode string `xml:"relaxed_ordering_mode" json:"relaxedOrderingMode" yaml:"relaxedOrderingMode"`
}

type InforomBbxFlush

// InforomBbxFlush is the decode target for the <inforom_bbx_flush> element
// of a <gpu> entry (see GPU.InforomBbxFlush). The timestamp and duration
// are kept as raw strings; no time parsing is done by this type.
type InforomBbxFlush struct {
	LatestTimestamp string `xml:"latest_timestamp" json:"latestTimestamp" yaml:"latestTimestamp"`
	LatestDuration  string `xml:"latest_duration" json:"latestDuration" yaml:"latestDuration"`
}

type InforomVersion

// InforomVersion is the decode target for the <inforom_version> element of
// a <gpu> entry (see GPU.InforomVersion). Each version value is kept as a
// raw string.
type InforomVersion struct {
	ImgVersion string `xml:"img_version" json:"imgVersion" yaml:"imgVersion"`
	OemObject  string `xml:"oem_object" json:"oemObject" yaml:"oemObject"`
	EccObject  string `xml:"ecc_object" json:"eccObject" yaml:"eccObject"`
	PwrObject  string `xml:"pwr_object" json:"pwrObject" yaml:"pwrObject"`
}

type LinkWidths

// LinkWidths is the decode target for the <link_widths> element nested
// under <pci_gpu_link_info> (see PciGpuLinkInfo.LinkWidths). The maximum
// and current widths are kept as raw strings.
type LinkWidths struct {
	MaxLinkWidth     string `xml:"max_link_width" json:"maxLinkWidth" yaml:"maxLinkWidth"`
	CurrentLinkWidth string `xml:"current_link_width" json:"currentLinkWidth" yaml:"currentLinkWidth"`
}

type MaxClocks

// MaxClocks is the decode target for the <max_clocks> element of a <gpu>
// entry (see GPU.MaxClocks).
//
// It has the same four fields as Clocks. Values are kept as raw strings.
type MaxClocks struct {
	GraphicsClock string `xml:"graphics_clock" json:"graphicsClock" yaml:"graphicsClock"`
	SmClock       string `xml:"sm_clock" json:"smClock" yaml:"smClock"`
	MemClock      string `xml:"mem_clock" json:"memClock" yaml:"memClock"`
	VideoClock    string `xml:"video_clock" json:"videoClock" yaml:"videoClock"`
}

type MaxCustomerBoostClocks

// MaxCustomerBoostClocks is the decode target for the
// <max_customer_boost_clocks> element of a <gpu> entry
// (see GPU.MaxCustomerBoostClocks). Only <graphics_clock> is mapped, as a
// raw string.
type MaxCustomerBoostClocks struct {
	GraphicsClock string `xml:"graphics_clock" json:"graphicsClock" yaml:"graphicsClock"`
}

type MemoryPowerReadings

// MemoryPowerReadings is the decode target for the
// <gpu_memory_power_readings> element of a <gpu> entry
// (see GPU.GpuMemoryPowerReadings). Only <power_draw> is mapped, as a raw
// string.
type MemoryPowerReadings struct {
	PowerDraw string `xml:"power_draw" json:"powerDraw" yaml:"powerDraw"`
}

type MigMode

// MigMode is the decode target for the <mig_mode> element of a <gpu> entry
// (see GPU.MigMode). It records a current and a pending value, both as raw
// strings.
type MigMode struct {
	CurrentMig string `xml:"current_mig" json:"currentMig" yaml:"currentMig"`
	PendingMig string `xml:"pending_mig" json:"pendingMig" yaml:"pendingMig"`
}

type ModulePowerReadings

// ModulePowerReadings is the decode target for the <module_power_readings>
// element of a <gpu> entry (see GPU.ModulePowerReadings).
//
// It has the same field set as PowerReadings, which decodes
// <gpu_power_readings>. Values are kept as raw strings.
type ModulePowerReadings struct {
	PowerState          string `xml:"power_state" json:"powerState" yaml:"powerState"`
	PowerDraw           string `xml:"power_draw" json:"powerDraw" yaml:"powerDraw"`
	CurrentPowerLimit   string `xml:"current_power_limit" json:"currentPowerLimit" yaml:"currentPowerLimit"`
	RequestedPowerLimit string `xml:"requested_power_limit" json:"requestedPowerLimit" yaml:"requestedPowerLimit"`
	DefaultPowerLimit   string `xml:"default_power_limit" json:"defaultPowerLimit" yaml:"defaultPowerLimit"`
	MinPowerLimit       string `xml:"min_power_limit" json:"minPowerLimit" yaml:"minPowerLimit"`
	MaxPowerLimit       string `xml:"max_power_limit" json:"maxPowerLimit" yaml:"maxPowerLimit"`
}

type MultipleSingleBitRetirement

// MultipleSingleBitRetirement is the decode target for the
// <multiple_single_bit_retirement> element nested under <retired_pages>
// (see RetiredPages.MultipleSingleBitRetirement).
//
// It has the same field set as DoubleBitRetirement. Both the count and the
// page list are kept as raw strings.
type MultipleSingleBitRetirement struct {
	RetiredCount    string `xml:"retired_count" json:"retiredCount" yaml:"retiredCount"`
	RetiredPagelist string `xml:"retired_pagelist" json:"retiredPagelist" yaml:"retiredPagelist"`
}

type NFDHardwareDetector added in v0.12.0

// NFDHardwareDetector is the NFD-backed hardware detector for Phase 1. It
// declares no fields, so the zero value &NFDHardwareDetector{} is what gets
// passed to WithHardwareDetector.
type NFDHardwareDetector struct{}

NFDHardwareDetector uses NFD source packages to detect GPU hardware via PCI enumeration and kernel module state from sysfs/procfs.

NFDHardwareDetector is not safe for concurrent use. NFD source singletons are shared package-level state without synchronization. In AICR's architecture the GPU collector runs once per snapshot, so this is not a practical concern.

func (*NFDHardwareDetector) Detect added in v0.12.0

Detect discovers GPU hardware using NFD PCI and kernel sources. PCI discovery is required; kernel module detection is best-effort.

This method requires Linux with sysfs/procfs mounted. On other platforms (macOS, containers without /sys), PCI discovery will fail and an error is returned. The caller (Collector.Collect) handles this gracefully by falling back to nvidia-smi-only collection.

type NVSMIDevice

// NVSMIDevice is the top-level decode target for nvidia-smi's XML output:
// driver-wide values plus one GPU entry per <gpu> child element.
//
// AttachedGpus is the only scalar decoded as an int; the other scalars are
// kept as raw strings. GPUs collects every repeated <gpu> element in
// document order.
//
// No XMLName field is declared, so with encoding/xml this type does not
// constrain the name of the root element it is decoded from.
type NVSMIDevice struct {
	Timestamp     string `xml:"timestamp" json:"timestamp" yaml:"timestamp"`
	DriverVersion string `xml:"driver_version" json:"driverVersion" yaml:"driverVersion"`
	CudaVersion   string `xml:"cuda_version" json:"cudaVersion" yaml:"cudaVersion"`
	AttachedGpus  int    `xml:"attached_gpus" json:"attachedGPUs" yaml:"attachedGPUs"`
	GPUs          []GPU  `xml:"gpu" json:"gpu" yaml:"gpu"`
}

type OperationMode

// OperationMode is the decode target for the <gpu_operation_mode> element
// of a <gpu> entry (see GPU.GpuOperationMode). It records a current and a
// pending value, both as raw strings.
type OperationMode struct {
	CurrentGom string `xml:"current_gom" json:"currentGom" yaml:"currentGom"`
	PendingGom string `xml:"pending_gom" json:"pendingGom" yaml:"pendingGom"`
}

type Pci

// Pci is the decode target for the <pci> element of a <gpu> entry
// (see GPU.Pci).
//
// Identification values, counters and utilization figures are kept as raw
// strings. The nested <pci_gpu_link_info> and <pci_bridge_chip> sections
// are decoded into PciGpuLinkInfo and PciBridgeChip respectively.
type Pci struct {
	PciBus                string         `xml:"pci_bus" json:"pciBus" yaml:"pciBus"`
	PciDevice             string         `xml:"pci_device" json:"pciDevice" yaml:"pciDevice"`
	PciDomain             string         `xml:"pci_domain" json:"pciDomain" yaml:"pciDomain"`
	PciBaseClass          string         `xml:"pci_base_class" json:"pciBaseClass" yaml:"pciBaseClass"`
	PciSubClass           string         `xml:"pci_sub_class" json:"pciSubClass" yaml:"pciSubClass"`
	PciDeviceID           string         `xml:"pci_device_id" json:"pciDeviceId" yaml:"pciDeviceId"`
	PciBusID              string         `xml:"pci_bus_id" json:"pciBusId" yaml:"pciBusId"`
	PciSubSystemID        string         `xml:"pci_sub_system_id" json:"pciSubSystemId" yaml:"pciSubSystemId"`
	PciGpuLinkInfo        PciGpuLinkInfo `xml:"pci_gpu_link_info" json:"pciGPULinkInfo" yaml:"pciGPULinkInfo"`
	PciBridgeChip         PciBridgeChip  `xml:"pci_bridge_chip" json:"pciBridgeChip" yaml:"pciBridgeChip"`
	ReplayCounter         string         `xml:"replay_counter" json:"replayCounter" yaml:"replayCounter"`
	ReplayRolloverCounter string         `xml:"replay_rollover_counter" json:"replayRolloverCounter" yaml:"replayRolloverCounter"`
	TxUtil                string         `xml:"tx_util" json:"txUtil" yaml:"txUtil"`
	RxUtil                string         `xml:"rx_util" json:"rxUtil" yaml:"rxUtil"`
	AtomicCapsOutbound    string         `xml:"atomic_caps_outbound" json:"atomicCapsOutbound" yaml:"atomicCapsOutbound"`
	AtomicCapsInbound     string         `xml:"atomic_caps_inbound" json:"atomicCapsInbound" yaml:"atomicCapsInbound"`
}

type PciBridgeChip

// PciBridgeChip is the decode target for the <pci_bridge_chip> element
// nested under <pci> (see Pci.PciBridgeChip). The chip type and firmware
// values are kept as raw strings.
type PciBridgeChip struct {
	BridgeChipType string `xml:"bridge_chip_type" json:"bridgeChipType" yaml:"bridgeChipType"`
	BridgeChipFw   string `xml:"bridge_chip_fw" json:"bridgeChipFw" yaml:"bridgeChipFw"`
}

type PciGpuLinkInfo

// PciGpuLinkInfo is the decode target for the <pci_gpu_link_info> element
// nested under <pci> (see Pci.PciGpuLinkInfo). It groups the <pcie_gen> and
// <link_widths> sections.
type PciGpuLinkInfo struct {
	PcieGen    PcieGen    `xml:"pcie_gen" json:"pcieGen" yaml:"pcieGen"`
	LinkWidths LinkWidths `xml:"link_widths" json:"linkWidths" yaml:"linkWidths"`
}

type PcieGen

// PcieGen is the decode target for the <pcie_gen> element nested under
// <pci_gpu_link_info> (see PciGpuLinkInfo.PcieGen). Each link-generation
// value is kept as a raw string.
type PcieGen struct {
	MaxLinkGen           string `xml:"max_link_gen" json:"maxLinkGen" yaml:"maxLinkGen"`
	CurrentLinkGen       string `xml:"current_link_gen" json:"currentLinkGen" yaml:"currentLinkGen"`
	DeviceCurrentLinkGen string `xml:"device_current_link_gen" json:"deviceCurrentLinkGen" yaml:"deviceCurrentLinkGen"`
	MaxDeviceLinkGen     string `xml:"max_device_link_gen" json:"maxDeviceLinkGen" yaml:"maxDeviceLinkGen"`
	MaxHostLinkGen       string `xml:"max_host_link_gen" json:"maxHostLinkGen" yaml:"maxHostLinkGen"`
}

type PlatformInfo

// PlatformInfo is the decode target for the platform section of a <gpu>
// entry; GPU.PlatformInfo maps it from the element named "platformInfo".
// Every value is kept as a raw string.
type PlatformInfo struct {
	ChassisSerialNumber string `xml:"chassis_serial_number" json:"chassisSerialNumber" yaml:"chassisSerialNumber"`
	SlotNumber          string `xml:"slot_number" json:"slotNumber" yaml:"slotNumber"`
	TrayIndex           string `xml:"tray_index" json:"trayIndex" yaml:"trayIndex"`
	HostID              string `xml:"host_id" json:"hostId" yaml:"hostId"`
	PeerType            string `xml:"peer_type" json:"peerType" yaml:"peerType"`
	ModuleID            string `xml:"module_id" json:"moduleId" yaml:"moduleId"`
}

type PowerProfiles

// PowerProfiles is the decode target for the <power_profiles> element of a
// <gpu> entry (see GPU.PowerProfiles). The requested and enforced profile
// values are kept as raw strings.
type PowerProfiles struct {
	PowerProfileRequestedProfiles string `xml:"power_profile_requested_profiles" json:"powerProfileRequestedProfiles" yaml:"powerProfileRequestedProfiles"`
	PowerProfileEnforcedProfiles  string `xml:"power_profile_enforced_profiles" json:"powerProfileEnforcedProfiles" yaml:"powerProfileEnforcedProfiles"`
}

type PowerReadings

// PowerReadings is the decode target for the <gpu_power_readings> element
// of a <gpu> entry (see GPU.GpuPowerReadings).
//
// It has the same field set as ModulePowerReadings, which decodes
// <module_power_readings>. Values are kept as raw strings.
type PowerReadings struct {
	PowerState          string `xml:"power_state" json:"powerState" yaml:"powerState"`
	PowerDraw           string `xml:"power_draw" json:"powerDraw" yaml:"powerDraw"`
	CurrentPowerLimit   string `xml:"current_power_limit" json:"currentPowerLimit" yaml:"currentPowerLimit"`
	RequestedPowerLimit string `xml:"requested_power_limit" json:"requestedPowerLimit" yaml:"requestedPowerLimit"`
	DefaultPowerLimit   string `xml:"default_power_limit" json:"defaultPowerLimit" yaml:"defaultPowerLimit"`
	MinPowerLimit       string `xml:"min_power_limit" json:"minPowerLimit" yaml:"minPowerLimit"`
	MaxPowerLimit       string `xml:"max_power_limit" json:"maxPowerLimit" yaml:"maxPowerLimit"`
}

type RemappedRows

// RemappedRows is the decode target for the <remapped_rows> element of a
// <gpu> entry (see GPU.RemappedRows). The four remapped_row_* values are
// kept as raw strings; the nested <row_remapper_histogram> section is
// decoded into RowRemapperHistogram.
type RemappedRows struct {
	RemappedRowCorr      string               `xml:"remapped_row_corr" json:"remappedRowCorr" yaml:"remappedRowCorr"`
	RemappedRowUnc       string               `xml:"remapped_row_unc" json:"remappedRowUnc" yaml:"remappedRowUnc"`
	RemappedRowPending   string               `xml:"remapped_row_pending" json:"remappedRowPending" yaml:"remappedRowPending"`
	RemappedRowFailure   string               `xml:"remapped_row_failure" json:"remappedRowFailure" yaml:"remappedRowFailure"`
	RowRemapperHistogram RowRemapperHistogram `xml:"row_remapper_histogram" json:"rowRemapperHistogram" yaml:"rowRemapperHistogram"`
}

type ResetStatus

// ResetStatus is the decode target for the <gpu_reset_status> element of a
// <gpu> entry (see GPU.GpuResetStatus). Both flags are kept as the raw
// strings found in the XML rather than as booleans.
type ResetStatus struct {
	ResetRequired            string `xml:"reset_required" json:"resetRequired" yaml:"resetRequired"`
	DrainAndResetRecommended string `xml:"drain_and_reset_recommended" json:"drainAndResetRecommended" yaml:"drainAndResetRecommended"`
}

type RetiredPages

// RetiredPages is the decode target for the <retired_pages> element of a
// <gpu> entry (see GPU.RetiredPages). It groups two nested retirement
// sections and two pending_* values that are kept as raw strings.
type RetiredPages struct {
	MultipleSingleBitRetirement MultipleSingleBitRetirement `xml:"multiple_single_bit_retirement" json:"multipleSingleBitRetirement" yaml:"multipleSingleBitRetirement"`
	DoubleBitRetirement         DoubleBitRetirement         `xml:"double_bit_retirement" json:"doubleBitRetirement" yaml:"doubleBitRetirement"`
	PendingBlacklist            string                      `xml:"pending_blacklist" json:"pendingBlacklist" yaml:"pendingBlacklist"`
	PendingRetirement           string                      `xml:"pending_retirement" json:"pendingRetirement" yaml:"pendingRetirement"`
}

type RowRemapperHistogram

// RowRemapperHistogram is the decode target for the
// <row_remapper_histogram> element nested under <remapped_rows>
// (see RemappedRows.RowRemapperHistogram). Each of the five bucket values
// (max, high, partial, low, none) is kept as a raw string.
type RowRemapperHistogram struct {
	RowRemapperHistogramMax     string `xml:"row_remapper_histogram_max" json:"rowRemapperHistogramMax" yaml:"rowRemapperHistogramMax"`
	RowRemapperHistogramHigh    string `xml:"row_remapper_histogram_high" json:"rowRemapperHistogramHigh" yaml:"rowRemapperHistogramHigh"`
	RowRemapperHistogramPartial string `xml:"row_remapper_histogram_partial" json:"rowRemapperHistogramPartial" yaml:"rowRemapperHistogramPartial"`
	RowRemapperHistogramLow     string `xml:"row_remapper_histogram_low" json:"rowRemapperHistogramLow" yaml:"rowRemapperHistogramLow"`
	RowRemapperHistogramNone    string `xml:"row_remapper_histogram_none" json:"rowRemapperHistogramNone" yaml:"rowRemapperHistogramNone"`
}

type SupportedClocks

// SupportedClocks is the decode target for the <supported_clocks> element
// of a <gpu> entry (see GPU.SupportedClocks).
//
// NOTE(review): SupportedMemClock is a single struct, not a slice. If
// nvidia-smi emits several <supported_mem_clock> children, this field
// cannot hold them all; confirm whether a slice is needed.
type SupportedClocks struct {
	SupportedMemClock SupportedMemClock `xml:"supported_mem_clock" json:"supportedMemClock" yaml:"supportedMemClock"`
}

type SupportedGpuTargetTemp

// SupportedGpuTargetTemp is the decode target for the
// <supported_gpu_target_temp> element of a <gpu> entry
// (see GPU.SupportedGpuTargetTemp). The minimum and maximum values are kept
// as raw strings.
type SupportedGpuTargetTemp struct {
	GpuTargetTempMin string `xml:"gpu_target_temp_min" json:"gpuTargetTempMin" yaml:"gpuTargetTempMin"`
	GpuTargetTempMax string `xml:"gpu_target_temp_max" json:"gpuTargetTempMax" yaml:"gpuTargetTempMax"`
}

type SupportedMemClock

// SupportedMemClock is the decode target for the <supported_mem_clock>
// element nested under <supported_clocks>
// (see SupportedClocks.SupportedMemClock). It maps a <value> child and a
// <supported_graphics_clock> child, both as raw strings.
//
// NOTE(review): SupportedGraphicsClock is a single string; if the element
// repeats in real output, only one value can be retained. Confirm.
type SupportedMemClock struct {
	Value                  string `xml:"value" json:"value" yaml:"value"`
	SupportedGraphicsClock string `xml:"supported_graphics_clock" json:"supportedGraphicsClock" yaml:"supportedGraphicsClock"`
}

type Temperature

// Temperature is the decode target for the <temperature> element of a <gpu>
// entry (see GPU.Temperature). Readings and thresholds are kept as the raw
// strings found in the XML; no numeric or unit parsing is done by this type.
type Temperature struct {
	GpuTemp                      string `xml:"gpu_temp" json:"gpuTemp" yaml:"gpuTemp"`
	GpuTempTlimit                string `xml:"gpu_temp_tlimit" json:"gpuTempTlimit" yaml:"gpuTempTlimit"`
	GpuTempMaxTlimitThreshold    string `xml:"gpu_temp_max_tlimit_threshold" json:"gpuTempMaxTlimitThreshold" yaml:"gpuTempMaxTlimitThreshold"`
	GpuTempSlowTlimitThreshold   string `xml:"gpu_temp_slow_tlimit_threshold" json:"gpuTempSlowTlimitThreshold" yaml:"gpuTempSlowTlimitThreshold"`
	GpuTempMaxGpuTlimitThreshold string `xml:"gpu_temp_max_gpu_tlimit_threshold" json:"gpuTempMaxGPUTlimitThreshold" yaml:"gpuTempMaxGPUTlimitThreshold"`
	GpuTargetTemperature         string `xml:"gpu_target_temperature" json:"gpuTargetTemperature" yaml:"gpuTargetTemperature"`
	MemoryTemp                   string `xml:"memory_temp" json:"memoryTemp" yaml:"memoryTemp"`
	GpuTempMaxMemTlimitThreshold string `xml:"gpu_temp_max_mem_tlimit_threshold" json:"gpuTempMaxMemTlimitThreshold" yaml:"gpuTempMaxMemTlimitThreshold"`
}

type Utilization

// Utilization is the decode target for the <utilization> element of a <gpu>
// entry (see GPU.Utilization). Each *_util value is kept as a raw string.
type Utilization struct {
	GpuUtil     string `xml:"gpu_util" json:"gpuUtil" yaml:"gpuUtil"`
	MemoryUtil  string `xml:"memory_util" json:"memoryUtil" yaml:"memoryUtil"`
	EncoderUtil string `xml:"encoder_util" json:"encoderUtil" yaml:"encoderUtil"`
	DecoderUtil string `xml:"decoder_util" json:"decoderUtil" yaml:"decoderUtil"`
	JpegUtil    string `xml:"jpeg_util" json:"jpegUtil" yaml:"jpegUtil"`
	OfaUtil     string `xml:"ofa_util" json:"ofaUtil" yaml:"ofaUtil"`
}

type VirtualizationMode

// VirtualizationMode is the decode target for the <gpu_virtualization_mode>
// element of a <gpu> entry (see GPU.GpuVirtualizationMode). Its three
// values are kept as raw strings.
type VirtualizationMode struct {
	VirtualizationMode    string `xml:"virtualization_mode" json:"virtualizationMode" yaml:"virtualizationMode"`
	HostVgpuMode          string `xml:"host_vgpu_mode" json:"hostVGPUMode" yaml:"hostVGPUMode"`
	VgpuHeterogeneousMode string `xml:"vgpu_heterogeneous_mode" json:"vgpuHeterogeneousMode" yaml:"vgpuHeterogeneousMode"`
}

type Volatile

// Volatile is the decode target for the <volatile> element nested under
// <ecc_errors> (see EccErrors.Volatile).
//
// It carries the same counters as Aggregate except that it has no
// SramThresholdExceeded field. Values are kept as raw strings.
type Volatile struct {
	SramCorrectable         string `xml:"sram_correctable" json:"sramCorrectable" yaml:"sramCorrectable"`
	SramUncorrectableParity string `xml:"sram_uncorrectable_parity" json:"sramUncorrectableParity" yaml:"sramUncorrectableParity"`
	SramUncorrectableSecded string `xml:"sram_uncorrectable_secded" json:"sramUncorrectableSecded" yaml:"sramUncorrectableSecded"`
	DramCorrectable         string `xml:"dram_correctable" json:"dramCorrectable" yaml:"dramCorrectable"`
	DramUncorrectable       string `xml:"dram_uncorrectable" json:"dramUncorrectable" yaml:"dramUncorrectable"`
}

type Voltage

// Voltage is the decode target for the <voltage> element of a <gpu> entry
// (see GPU.Voltage). Only <graphics_volt> is mapped, as a raw string.
type Voltage struct {
	GraphicsVolt string `xml:"graphics_volt" json:"graphicsVolt" yaml:"graphicsVolt"`
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL