Documentation
¶
Overview ¶
Package processes tracks the NVIDIA per-GPU processes.
Index ¶
Constants ¶
const Name = "accelerator-nvidia-processes"
const SubSystem = "accelerator_nvidia_processes"
Variables ¶
var BAD_CUDA_ENV_KEYS = map[string]string{
"NSIGHT_CUDA_DEBUGGER": "Setting NSIGHT_CUDA_DEBUGGER=1 can degrade the performance of an application, since the debugger is made resident. See https://docs.nvidia.com/nsight-visual-studio-edition/3.2/Content/Attach_CUDA_to_Process.htm.",
"CUDA_INJECTION32_PATH": "Captures information about CUDA execution trace. See https://docs.nvidia.com/nsight-systems/2020.3/tracing/index.html.",
"CUDA_INJECTION64_PATH": "Captures information about CUDA execution trace. See https://docs.nvidia.com/nsight-systems/2020.3/tracing/index.html.",
"CUDA_AUTO_BOOST": "Automatically selects the highest possible clock rate allowed by the thermal and power budget. Independent of the global default setting the autoboost behavior can be overridden by setting the environment variable CUDA_AUTO_BOOST. Set CUDA_AUTO_BOOST=0 to disable frequency throttling/boosting. You may run 'nvidia-smi --auto-boost-default=0' to disable autoboost by default. See https://developer.nvidia.com/blog/increase-performance-gpu-boost-k80-autoboost/.",
"CUDA_ENABLE_COREDUMP_ON_EXCEPTION": "Enables GPU core dumps.",
"CUDA_COREDUMP_FILE": "Enables GPU core dumps.",
"CUDA_DEVICE_WAITS_ON_EXCEPTION": "CUDA kernel will pause when an exception occurs. This is only useful for debugging.",
"CUDA_PROFILE": "Enables CUDA profiling.",
"COMPUTE_PROFILE": "Enables compute profiling.",
"OPENCL_PROFILE": "Enables OpenCL profiling.",
}
BAD_CUDA_ENV_KEYS reports "DCGM_FR_BAD_CUDA_ENV"; the environment has variables that hurt CUDA. This is derived from "DCGM_FR_BAD_CUDA_ENV" in DCGM. ref. https://github.com/NVIDIA/DCGM/blob/903d745504f50153be8293f8566346f9de3b3c93/nvvs/plugin_src/software/Software.cpp#L839-L876
Functions ¶
func New ¶
func New(gpudInstance *components.GPUdInstance) (components.Component, error)
Types ¶
type Process ¶ added in v0.9.0
type Process struct {
PID uint32 `json:"pid"`
Status []string `json:"status,omitempty"`
// ZombieStatus is set to true if the process is defunct
// (terminated but not reaped by its parent).
ZombieStatus bool `json:"zombie_status,omitempty"`
// BadEnvVarsForCUDA is a map of environment variables that are known to hurt CUDA
// that is set for this specific process.
// Empty if there is no bad environment variable found for this process.
// This implements "DCGM_FR_BAD_CUDA_ENV" logic in DCGM.
BadEnvVarsForCUDA map[string]string `json:"bad_env_vars_for_cuda,omitempty"`
CmdArgs []string `json:"cmd_args,omitempty"`
CreateTime metav1.Time `json:"create_time,omitempty"`
GPUUsedPercent uint32 `json:"gpu_used_percent,omitempty"`
GPUUsedMemoryBytes uint64 `json:"gpu_used_memory_bytes,omitempty"`
GPUUsedMemoryBytesHumanized string `json:"gpu_used_memory_bytes_humanized,omitempty"`
}
type Processes ¶ added in v0.9.0
type Processes struct {
// Represents the GPU UUID.
UUID string `json:"uuid"`
// BusID is the GPU bus ID from the nvml API.
// e.g., "0000:0f:00.0"
BusID string `json:"bus_id"`
// A list of running processes.
RunningProcesses []Process `json:"running_processes"`
// GetComputeRunningProcessesSupported is true if the device supports the getComputeRunningProcesses API.
GetComputeRunningProcessesSupported bool `json:"get_compute_running_processes_supported"`
// GetProcessUtilizationSupported is true if the device supports the getProcessUtilization API.
GetProcessUtilizationSupported bool `json:"get_process_utilization_supported"`
}
Processes represents the processes running on a single GPU, as reported by the NVML process queries (e.g., nvmlDeviceGetComputeRunningProcesses and nvmlDeviceGetProcessUtilization; see the GetComputeRunningProcessesSupported and GetProcessUtilizationSupported fields for per-device API support). ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html