nodemon

package
v0.0.75 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Apr 27, 2026 | License: Apache-2.0 | Imports: 18 | Imported by: 0

Documentation

Index

Constants

View Source
// Kind* are untyped string constants naming resource kinds.
// NOTE(review): presumably these are the values that Workload.Kind and
// GPUMetric.WorkloadKind can carry — confirm against the resolver code,
// which is not visible here.
const (
	KindPod         = "Pod"
	KindJob         = "Job"
	KindCronJob     = "CronJob"
	KindRollout     = "Rollout"
	KindDaemonSet   = "DaemonSet"
	KindDeployment  = "Deployment"
	KindStatefulSet = "StatefulSet"
	KindReplicaSet  = "ReplicaSet"
)

Variables

EnabledMetrics is the set of DCGM metrics to scrape and process.

Functions

func NewContainerMetricsHandler

func NewContainerMetricsHandler(querier MetricsQuerier, log logr.Logger) http.Handler

NewContainerMetricsHandler creates an HTTP handler for GET /container/metrics.

func NewServerMux

func NewServerMux(containerMetricsHandler http.Handler) *http.ServeMux

NewServerMux creates the HTTP mux for the nodemon.

Types

type Exporter

// Exporter ties together the scraper, mapper, and DCGM pod discovery.
// It exposes no exported fields; obtain one from NewExporter and read
// metrics through its QueryMetrics method.
type Exporter struct {
	// contains filtered or unexported fields
}

Exporter ties together the scraper, mapper, and DCGM pod discovery.

func NewExporter

func NewExporter(
	cfg ExporterConfig,
	dynClient dynamic.Interface,
	scraper Scraper,
	mapper MetricMapper,
	log logr.Logger,
) *Exporter

NewExporter creates a new GPU metrics exporter.

func (*Exporter) QueryMetrics

func (e *Exporter) QueryMetrics(ctx context.Context) ([]GPUMetric, error)

QueryMetrics scrapes DCGM exporters on demand and returns mapped GPU metrics.

type ExporterConfig

// ExporterConfig holds environment-driven configuration. It is the type of
// the cfg argument accepted by NewExporter.
//
// NOTE(review): DCGMHost, DCGMPort and DCGMMetricsEndpoint presumably combine
// into the URLs handed to Scraper.Scrape, and NodeName presumably ends up in
// GPUMetric.NodeName — confirm in the exporter implementation.
type ExporterConfig struct {
	HTTPListenPort      int
	DCGMHost            string
	DCGMPort            int
	DCGMMetricsEndpoint string
	DCGMLabels          string // label selector, e.g. "app.kubernetes.io/name=dcgm-exporter"
	NodeName            string
}

ExporterConfig holds environment-driven configuration.

type GPUMetric

// GPUMetric represents a single GPU's metrics for a container. It is the
// element type returned by MetricsQuerier.QueryMetrics and
// MetricMapper.MapToGPUMetrics.
//
// Every field carries a snake_case JSON tag. Only MIGProfile, MIGInstanceID,
// WorkloadName and WorkloadKind use omitempty; all other fields are always
// emitted, so a zero float64 serializes as 0 rather than being dropped.
type GPUMetric struct {
	// Node and device identification. The two MIG fields are omitted from
	// JSON when empty.
	NodeName      string `json:"node_name"`
	ModelName     string `json:"model_name"`
	Device        string `json:"device"`
	DeviceID      string `json:"device_id"`
	DeviceUUID    string `json:"device_uuid"`
	MIGProfile    string `json:"mig_profile,omitempty"`
	MIGInstanceID string `json:"mig_instance_id,omitempty"`

	// Pod/container attribution. WorkloadName and WorkloadKind are omitted
	// from JSON when empty.
	Pod          string `json:"pod"`
	Container    string `json:"container"`
	Namespace    string `json:"namespace"`
	WorkloadName string `json:"workload_name,omitempty"`
	WorkloadKind string `json:"workload_kind,omitempty"`

	// Metric values, all float64.
	// NOTE(review): there are 28 fields here and 28 Metric* name constants;
	// presumably they correspond one-to-one — confirm in the mapper. Units
	// are not stated in this declaration.
	SMActive             float64 `json:"sm_active"`
	SMOccupancy          float64 `json:"sm_occupancy"`
	TensorActive         float64 `json:"tensor_active"`
	DRAMActive           float64 `json:"dram_active"`
	PCIeTXBytes          float64 `json:"pcie_tx_bytes"`
	PCIeRXBytes          float64 `json:"pcie_rx_bytes"`
	NVLinkTXBytes        float64 `json:"nvlink_tx_bytes"`
	NVLinkRXBytes        float64 `json:"nvlink_rx_bytes"`
	GraphicsEngineActive float64 `json:"graphics_engine_active"`
	FramebufferTotal     float64 `json:"framebuffer_total"`
	FramebufferUsed      float64 `json:"framebuffer_used"`
	FramebufferFree      float64 `json:"framebuffer_free"`
	PCIeLinkGen          float64 `json:"pcie_link_gen"`
	PCIeLinkWidth        float64 `json:"pcie_link_width"`
	Temperature          float64 `json:"temperature"`
	MemoryTemperature    float64 `json:"memory_temperature"`
	PowerUsage           float64 `json:"power_usage"`
	GPUUtilization       float64 `json:"gpu_utilization"`
	IntPipeActive        float64 `json:"int_pipe_active"`
	FP16PipeActive       float64 `json:"fp16_pipe_active"`
	FP32PipeActive       float64 `json:"fp32_pipe_active"`
	FP64PipeActive       float64 `json:"fp64_pipe_active"`
	ClocksEventReasons   float64 `json:"clocks_event_reasons"`
	XIDErrors            float64 `json:"xid_errors"`
	PowerViolation       float64 `json:"power_violation"`
	ThermalViolation     float64 `json:"thermal_violation"`
	SMClock              float64 `json:"sm_clock"`
	MemClock             float64 `json:"mem_clock"`

	// NOTE(review): whether this is scrape time or a DCGM-reported time is
	// not visible here — confirm in the mapper.
	Timestamp time.Time `json:"timestamp"`
}

GPUMetric represents a single GPU's metrics for a container.

type GPUMetricResponse

// GPUMetricResponse is the JSON API contract for the /container/metrics endpoint.
//
// As declared, its fields match GPUMetric one-for-one: same names, same
// types, same order, and same JSON tags. Any field added to one type needs a
// matching change in the other to keep them in step.
// NOTE(review): the duplication is presumably deliberate, to decouple the wire
// format from the internal type — confirm before merging the two.
type GPUMetricResponse struct {
	// Node and device identification. The two MIG fields are omitted from
	// JSON when empty.
	NodeName      string `json:"node_name"`
	ModelName     string `json:"model_name"`
	Device        string `json:"device"`
	DeviceID      string `json:"device_id"`
	DeviceUUID    string `json:"device_uuid"`
	MIGProfile    string `json:"mig_profile,omitempty"`
	MIGInstanceID string `json:"mig_instance_id,omitempty"`

	// Pod/container attribution. WorkloadName and WorkloadKind are omitted
	// from JSON when empty.
	Pod          string `json:"pod"`
	Container    string `json:"container"`
	Namespace    string `json:"namespace"`
	WorkloadName string `json:"workload_name,omitempty"`
	WorkloadKind string `json:"workload_kind,omitempty"`

	// Metric values, all float64 and always emitted (no omitempty).
	SMActive             float64 `json:"sm_active"`
	SMOccupancy          float64 `json:"sm_occupancy"`
	TensorActive         float64 `json:"tensor_active"`
	DRAMActive           float64 `json:"dram_active"`
	PCIeTXBytes          float64 `json:"pcie_tx_bytes"`
	PCIeRXBytes          float64 `json:"pcie_rx_bytes"`
	NVLinkTXBytes        float64 `json:"nvlink_tx_bytes"`
	NVLinkRXBytes        float64 `json:"nvlink_rx_bytes"`
	GraphicsEngineActive float64 `json:"graphics_engine_active"`
	FramebufferTotal     float64 `json:"framebuffer_total"`
	FramebufferUsed      float64 `json:"framebuffer_used"`
	FramebufferFree      float64 `json:"framebuffer_free"`
	PCIeLinkGen          float64 `json:"pcie_link_gen"`
	PCIeLinkWidth        float64 `json:"pcie_link_width"`
	Temperature          float64 `json:"temperature"`
	MemoryTemperature    float64 `json:"memory_temperature"`
	PowerUsage           float64 `json:"power_usage"`
	GPUUtilization       float64 `json:"gpu_utilization"`
	IntPipeActive        float64 `json:"int_pipe_active"`
	FP16PipeActive       float64 `json:"fp16_pipe_active"`
	FP32PipeActive       float64 `json:"fp32_pipe_active"`
	FP64PipeActive       float64 `json:"fp64_pipe_active"`
	ClocksEventReasons   float64 `json:"clocks_event_reasons"`
	XIDErrors            float64 `json:"xid_errors"`
	PowerViolation       float64 `json:"power_violation"`
	ThermalViolation     float64 `json:"thermal_violation"`
	SMClock              float64 `json:"sm_clock"`
	MemClock             float64 `json:"mem_clock"`

	Timestamp time.Time `json:"timestamp"`
}

GPUMetricResponse is the JSON API contract for the /container/metrics endpoint.

type HTTPClient

// HTTPClient abstracts *http.Client for testing. It declares only Do, so
// NewScraper can be given either a real *http.Client or a test double.
type HTTPClient interface {
	// Do sends req and returns the response or an error, with the same
	// signature as (*http.Client).Do.
	Do(req *http.Request) (*http.Response, error)
}

HTTPClient abstracts *http.Client for testing.

type MetricFamilyMap

// MetricFamilyMap maps a Prometheus metric name to its parsed metric family.
// Scraper.Scrape returns a slice of these and MetricMapper.MapToGPUMetrics
// accepts a slice of these.
type MetricFamilyMap map[string]*dto.MetricFamily

MetricFamilyMap maps a Prometheus metric name to its parsed metric family.

type MetricMapper

// MetricMapper maps scraped DCGM metric families into structured GPU metrics.
// Create one with NewMapper.
type MetricMapper interface {
	// MapToGPUMetrics converts the scraped metric families into GPUMetric
	// values. It has no error return, so failures cannot be reported to the
	// caller through the signature.
	MapToGPUMetrics(ctx context.Context, metrics []MetricFamilyMap) []GPUMetric
}

MetricMapper maps scraped DCGM metric families into structured GPU metrics.

func NewMapper

func NewMapper(nodeName string, resolver WorkloadResolver, log logr.Logger) MetricMapper

NewMapper creates a new MetricMapper.

type MetricName

// MetricName identifies a DCGM metric by its Prometheus metric name.
// It is declared with "=", making it an alias for string rather than a
// distinct type: a MetricName and a plain string are interchangeable without
// conversion, and the compiler will not stop an arbitrary string being used
// where a MetricName is expected.
type MetricName = string

MetricName identifies a DCGM metric by its Prometheus metric name.

// DCGM metric names scraped from the DCGM exporter.
// Each value is the Prometheus metric name string. The values fall into two
// prefix families, DCGM_FI_PROF_* and DCGM_FI_DEV_*.
// NOTE(review): presumably these prefixes are DCGM's profiling and device
// field groups — confirm against the NVIDIA DCGM field identifier reference.
const (
	MetricStreamingMultiProcessorActive    MetricName = "DCGM_FI_PROF_SM_ACTIVE"
	MetricStreamingMultiProcessorOccupancy MetricName = "DCGM_FI_PROF_SM_OCCUPANCY"
	MetricStreamingMultiProcessorTensor    MetricName = "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE"
	MetricDRAMActive                       MetricName = "DCGM_FI_PROF_DRAM_ACTIVE"
	MetricPCIeTXBytes                      MetricName = "DCGM_FI_PROF_PCIE_TX_BYTES"
	MetricPCIeRXBytes                      MetricName = "DCGM_FI_PROF_PCIE_RX_BYTES"
	MetricNVLinkTXBytes                    MetricName = "DCGM_FI_PROF_NVLINK_TX_BYTES"
	MetricNVLinkRXBytes                    MetricName = "DCGM_FI_PROF_NVLINK_RX_BYTES"
	MetricGraphicsEngineActive             MetricName = "DCGM_FI_PROF_GR_ENGINE_ACTIVE"
	MetricFrameBufferTotal                 MetricName = "DCGM_FI_DEV_FB_TOTAL"
	MetricFrameBufferUsed                  MetricName = "DCGM_FI_DEV_FB_USED"
	MetricFrameBufferFree                  MetricName = "DCGM_FI_DEV_FB_FREE"
	MetricPCIeLinkGen                      MetricName = "DCGM_FI_DEV_PCIE_LINK_GEN"
	MetricPCIeLinkWidth                    MetricName = "DCGM_FI_DEV_PCIE_LINK_WIDTH"
	MetricGPUTemperature                   MetricName = "DCGM_FI_DEV_GPU_TEMP"
	MetricMemoryTemperature                MetricName = "DCGM_FI_DEV_MEMORY_TEMP"
	MetricPowerUsage                       MetricName = "DCGM_FI_DEV_POWER_USAGE"
	MetricGPUUtilization                   MetricName = "DCGM_FI_DEV_GPU_UTIL"
	MetricIntPipeActive                    MetricName = "DCGM_FI_PROF_PIPE_INT_ACTIVE"
	MetricFloat16PipeActive                MetricName = "DCGM_FI_PROF_PIPE_FP16_ACTIVE"
	MetricFloat32PipeActive                MetricName = "DCGM_FI_PROF_PIPE_FP32_ACTIVE"
	MetricFloat64PipeActive                MetricName = "DCGM_FI_PROF_PIPE_FP64_ACTIVE"
	MetricClocksEventReasons               MetricName = "DCGM_FI_DEV_CLOCKS_EVENT_REASONS"
	MetricXIDErrors                        MetricName = "DCGM_FI_DEV_XID_ERRORS"
	MetricPowerViolation                   MetricName = "DCGM_FI_DEV_POWER_VIOLATION"
	MetricThermalViolation                 MetricName = "DCGM_FI_DEV_THERMAL_VIOLATION"
	MetricSMClock                          MetricName = "DCGM_FI_DEV_SM_CLOCK"
	MetricMemClock                         MetricName = "DCGM_FI_DEV_MEM_CLOCK"
)

DCGM metric names scraped from the DCGM exporter.

type MetricsQuerier

// MetricsQuerier provides on-demand GPU metrics. It is the dependency taken
// by NewContainerMetricsHandler, and (*Exporter).QueryMetrics has the same
// signature as its single method.
type MetricsQuerier interface {
	// QueryMetrics returns the current GPU metrics, or an error.
	QueryMetrics(ctx context.Context) ([]GPUMetric, error)
}

MetricsQuerier provides on-demand GPU metrics.

type Scraper

// Scraper fetches and parses Prometheus-format metrics from DCGM exporter endpoints.
// Create one with NewScraper.
type Scraper interface {
	// Scrape takes a list of endpoint URLs and returns the parsed metric
	// families as a slice of MetricFamilyMap, or an error.
	// NOTE(review): presumably one map per URL, but whether a single failing
	// URL fails the whole call is not visible here — confirm in the
	// implementation.
	Scrape(ctx context.Context, urls []string) ([]MetricFamilyMap, error)
}

Scraper fetches and parses Prometheus-format metrics from DCGM exporter endpoints.

func NewScraper

func NewScraper(httpClient HTTPClient, log logr.Logger) Scraper

NewScraper creates a new DCGM metrics scraper.

type Workload

// Workload represents a resolved Kubernetes workload.
// NOTE(review): Kind presumably holds one of the Kind* constants (e.g.
// KindDeployment) — confirm where Workload values are constructed.
type Workload struct {
	Name      string
	Namespace string
	Kind      string
}

Workload represents a resolved Kubernetes workload.

type WorkloadResolver

// WorkloadResolver resolves the top-level owning workload for a pod.
// Create one with NewWorkloadResolver.
type WorkloadResolver interface {
	// FindWorkloadForPod looks up the pod identified by name and namespace
	// and returns the owning workload's kind and name.
	//
	// Both the two parameters and the two results are adjacent strings in a
	// fixed order (name before namespace; kind before workloadName), so a
	// swapped argument or result will still compile.
	FindWorkloadForPod(
		ctx context.Context,
		name, namespace string,
	) (kind, workloadName string, err error)
}

WorkloadResolver resolves the top-level owning workload for a pod.

func NewWorkloadResolver

func NewWorkloadResolver(
	dynClient dynamic.Interface,
	cfg WorkloadResolverConfig,
	log logr.Logger,
) WorkloadResolver

NewWorkloadResolver creates a WorkloadResolver that uses the K8s dynamic client to walk owner references and find the top-level owning workload. The resolver supports LRU caching and label-based workload name resolution.

type WorkloadResolverConfig

// WorkloadResolverConfig holds configuration for the workload resolver. It is
// the cfg argument of NewWorkloadResolver.
// NOTE(review): LabelKeys presumably drives the label-based workload name
// resolution and CacheSize the LRU cache capacity mentioned in the
// NewWorkloadResolver documentation; the behavior for an empty LabelKeys or a
// zero CacheSize is not visible here — confirm in the implementation.
type WorkloadResolverConfig struct {
	LabelKeys []string
	CacheSize int
}

WorkloadResolverConfig holds configuration for the workload resolver.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL