cmd

package
v0.0.0-...-6aa6421 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jul 16, 2025 License: Apache-2.0 Imports: 37 Imported by: 0

Documentation

Index

Constants

View Source
// Entity-selection keys.
const (
	// FlexKey: monitor all GPUs if MIG is disabled, or all GPU instances if
	// MIG is enabled.
	FlexKey = "f"
	// MajorKey: monitor top-level entities (GPUs, NvSwitches, or CPUs).
	MajorKey = "g"
	// MinorKey: monitor sub-level entities (GPU instances, NvLinks, or CPU
	// cores). GPUI cannot be specified if MIG is disabled.
	MinorKey = "i"
)
View Source
// Option-name strings, one per CLI* constant, grouped by the subject that
// appears in each name.
// NOTE(review): presumably these are the command-line flag names registered by
// NewApp — confirm against the flag definitions.
const (
	// Miscellaneous.
	CLIFieldsFile               = "collectors"
	CLIAddress                  = "address"
	CLICollectInterval          = "collect-interval"
	CLIUseOldNamespace          = "use-old-namespace"
	CLIRemoteHEInfo             = "remote-hostengine-info"
	CLINoHostname               = "no-hostname"
	CLIUseFakeGPUs              = "fake-gpus"
	CLIConfigMapData            = "configmap-data"
	CLIReplaceBlanksInModelName = "replace-blanks-in-model-name"
	CLIHPCJobMappingDir         = "hpc-job-mapping-dir"
	CLINvidiaResourceNames      = "nvidia-resource-names"

	// Kubernetes and kubelet.
	CLIKubernetes                = "kubernetes"
	CLIKubernetesEnablePodLabels = "kubernetes-enable-pod-labels"
	CLIKubernetesGPUIDType       = "kubernetes-gpu-id-type"
	CLIKubernetesVirtualGPUs     = "kubernetes-virtual-gpus"
	CLIPodResourcesKubeletSocket = "pod-resources-kubelet-socket"

	// Devices.
	CLIGPUDevices    = "devices"
	CLISwitchDevices = "switch-devices"
	CLICPUDevices    = "cpu-devices"

	// Web.
	CLIWebSystemdSocket = "web-systemd-socket"
	CLIWebConfigFile    = "web-config-file"

	// Count-window sizes.
	CLIXIDCountWindowSize         = "xid-count-window-size"
	CLIClockEventsCountWindowSize = "clock-events-count-window-size"

	// Debug and logging.
	CLIDebugMode     = "debug"
	CLIEnableDCGMLog = "enable-dcgm-log"
	CLIDCGMLogLevel  = "dcgm-log-level"
	CLILogFormat     = "log-format"

	// Dump.
	CLIDumpEnabled     = "dump-enabled"
	CLIDumpDirectory   = "dump-directory"
	CLIDumpRetention   = "dump-retention"
	CLIDumpCompression = "dump-compression"
)
View Source
// The DCGMDbgLvl* constants are the DCGM library debug levels, each spelled
// as its upper-case level name.
const (
	DCGMDbgLvlNone  = "NONE"
	DCGMDbgLvlFatal = "FATAL"
	DCGMDbgLvlError = "ERROR"
	DCGMDbgLvlWarn  = "WARN"
	DCGMDbgLvlInfo  = "INFO"
	DCGMDbgLvlDebug = "DEBUG"
	DCGMDbgLvlVerb  = "VERB"
)

The DCGMDbgLvl constants are the DCGM library debug levels.

Variables

Functions

func NewApp

func NewApp(buildVersion ...string) *cli.App

Types

type DcgmCollector

type DcgmCollector struct {
	// contains filtered or unexported fields
}

DcgmCollector is intended for external use as a prometheus.Collector.

func GetCollector

func GetCollector() (*DcgmCollector, error)

func (*DcgmCollector) Collect

func (d *DcgmCollector) Collect(ch chan<- prometheus.Metric)

Collect gathers metrics from dcgm-exporter by parsing the buffer returned by the server's Render method.

func (*DcgmCollector) Describe

func (d *DcgmCollector) Describe(ch chan<- *prometheus.Desc)

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL