Documentation
¶
Index ¶
Constants ¶
// Single-letter keys that select which class of entity to monitor.
const (
	// FlexKey monitors all GPUs if MIG is disabled, or all GPU instances if
	// MIG is enabled.
	FlexKey = "f"
	// MajorKey monitors top-level entities: GPUs, NvSwitches, or CPUs.
	MajorKey = "g"
	// MinorKey monitors sub-level entities: GPU instances, NvLinks, or CPU
	// cores. A GPU instance (GPUI) cannot be specified if MIG is disabled.
	MinorKey = "i"
)
// CLI option names. Each constant holds the kebab-case string for one
// option; presumably these are the command-line flag names (confirm against
// the flag definitions, which are not visible here).
const (
	CLIFieldsFile                       = "collectors"
	CLIAddress                          = "address"
	CLICollectInterval                  = "collect-interval"
	CLIKubernetes                       = "kubernetes"
	CLIKubernetesEnablePodLabels        = "kubernetes-enable-pod-labels"
	CLIKubernetesEnablePodUID           = "kubernetes-enable-pod-uid"
	CLIKubernetesGPUIDType              = "kubernetes-gpu-id-type"
	CLIKubernetesPodLabelAllowlistRegex = "kubernetes-pod-label-allowlist-regex"
	CLIUseOldNamespace                  = "use-old-namespace"
	CLIRemoteHEInfo                     = "remote-hostengine-info"
	CLIGPUDevices                       = "devices"
	CLISwitchDevices                    = "switch-devices"
	CLICPUDevices                       = "cpu-devices"
	CLINoHostname                       = "no-hostname"
	CLIUseFakeGPUs                      = "fake-gpus"
	CLIConfigMapData                    = "configmap-data"
	CLIWebSystemdSocket                 = "web-systemd-socket"
	CLIWebConfigFile                    = "web-config-file"
	CLIXIDCountWindowSize               = "xid-count-window-size"
	CLIReplaceBlanksInModelName         = "replace-blanks-in-model-name"
	CLIDebugMode                        = "debug"
	CLIClockEventsCountWindowSize       = "clock-events-count-window-size"
	CLIEnableDCGMLog                    = "enable-dcgm-log"
	CLIDCGMLogLevel                     = "dcgm-log-level"
	CLILogFormat                        = "log-format"
	CLIPodResourcesKubeletSocket        = "pod-resources-kubelet-socket"
	CLIHPCJobMappingDir                 = "hpc-job-mapping-dir"
	CLINvidiaResourceNames              = "nvidia-resource-names"
	CLIKubernetesVirtualGPUs            = "kubernetes-virtual-gpus"
	CLIDumpEnabled                      = "dump-enabled"
	CLIDumpDirectory                    = "dump-directory"
	CLIDumpRetention                    = "dump-retention"
	CLIDumpCompression                  = "dump-compression"
	CLIKubernetesEnableDRA              = "kubernetes-enable-dra"
	CLIDisableStartupValidate           = "disable-startup-validate"
	CLIEnableGPUBindUnbindWatch         = "enable-gpu-bind-unbind-watch"
	CLIGPUBindUnbindPollInterval        = "gpu-bind-unbind-poll-interval"
)
// DCGM library debug levels, as the string each level is spelled with.
const (
	DCGMDbgLvlNone  = "NONE"
	DCGMDbgLvlFatal = "FATAL"
	DCGMDbgLvlError = "ERROR"
	DCGMDbgLvlWarn  = "WARN"
	DCGMDbgLvlInfo  = "INFO"
	DCGMDbgLvlDebug = "DEBUG"
	DCGMDbgLvlVerb  = "VERB"
)
The DCGMDbgLvl constants enumerate the DCGM library debug levels.
Variables ¶
// DCGMDbgLvlValues lists every DCGM debug level constant, from
// DCGMDbgLvlNone through DCGMDbgLvlVerb, in declaration order.
var DCGMDbgLvlValues = []string{
	DCGMDbgLvlNone,
	DCGMDbgLvlFatal,
	DCGMDbgLvlError,
	DCGMDbgLvlWarn,
	DCGMDbgLvlInfo,
	DCGMDbgLvlDebug,
	DCGMDbgLvlVerb,
}
Functions ¶
func StartDCGMExporterWithSignalSource ¶
func StartDCGMExporterWithSignalSource(c *cli.Context, sigSource SignalSource) error
StartDCGMExporterWithSignalSource starts the exporter with a custom signal source. This variant allows dependency injection for testing.
Types ¶
type OSSignalSource ¶
type OSSignalSource struct {
// contains filtered or unexported fields
}
OSSignalSource watches actual OS signals; it is intended for production use.
func NewOSSignalSource ¶
func NewOSSignalSource(sigs ...os.Signal) *OSSignalSource
NewOSSignalSource creates a signal source that watches the given OS signals.
func (*OSSignalSource) Cleanup ¶
func (s *OSSignalSource) Cleanup()
Cleanup stops watching OS signals and closes the channel.
func (*OSSignalSource) Signals ¶
func (s *OSSignalSource) Signals() <-chan os.Signal
Signals returns the channel that receives OS signals.
type SignalSource ¶
// SignalSource supplies the signals that trigger a reload or shutdown.
// Keeping it an interface lets tests inject their own source.
type SignalSource interface {
	// Cleanup stops signal watching and cleans up resources.
	Cleanup()

	// Signals returns the receive-only channel on which signals arrive.
	Signals() <-chan os.Signal
}
SignalSource provides signals that trigger reload or shutdown. This interface allows dependency injection for testing.
type TestSignalSource ¶
type TestSignalSource struct {
// contains filtered or unexported fields
}
TestSignalSource allows programmatic signal injection for testing.
func NewTestSignalSource ¶
func NewTestSignalSource() *TestSignalSource
NewTestSignalSource creates a signal source for testing.
func (*TestSignalSource) Cleanup ¶
func (s *TestSignalSource) Cleanup()
Cleanup closes the signal channel.
func (*TestSignalSource) SendSignal ¶
func (s *TestSignalSource) SendSignal(sig os.Signal)
SendSignal injects a signal into the channel; it is a test helper.
func (*TestSignalSource) Signals ¶
func (s *TestSignalSource) Signals() <-chan os.Signal
Signals returns the channel that receives test signals.