Documentation
¶
Index ¶
- Constants
- Variables
- func NewContainerMetricsHandler(querier MetricsQuerier, log logr.Logger) http.Handler
- func NewServerMux(containerMetricsHandler http.Handler) *http.ServeMux
- type Exporter
- type ExporterConfig
- type GPUMetric
- type GPUMetricResponse
- type HTTPClient
- type MetricFamilyMap
- type MetricMapper
- type MetricName
- type MetricsQuerier
- type Scraper
- type Workload
- type WorkloadResolver
- type WorkloadResolverConfig
Constants ¶
// Kind* constants are the string names of the workload kinds this package
// distinguishes.
// NOTE(review): presumably these are the values reported as `kind` by
// WorkloadResolver.FindWorkloadForPod and stored in GPUMetric.WorkloadKind —
// confirm against the resolver implementation.
const (
	KindPod         = "Pod"
	KindJob         = "Job"
	KindCronJob     = "CronJob"
	KindRollout     = "Rollout"
	KindDaemonSet   = "DaemonSet"
	KindDeployment  = "Deployment"
	KindStatefulSet = "StatefulSet"
	KindReplicaSet  = "ReplicaSet"
)
Variables ¶
// EnabledMetrics is the set of DCGM metrics to scrape and process.
//
// The map is used as a set: keys are MetricName values and the empty-struct
// values carry no data. Every Metric* constant declared by this package is
// listed. Because this is a package-level variable rather than a constant,
// it can be modified by any importer.
var EnabledMetrics = map[MetricName]struct{}{
	MetricStreamingMultiProcessorActive:    {},
	MetricStreamingMultiProcessorOccupancy: {},
	MetricStreamingMultiProcessorTensor:    {},
	MetricDRAMActive:                       {},
	MetricPCIeTXBytes:                      {},
	MetricPCIeRXBytes:                      {},
	MetricNVLinkTXBytes:                    {},
	MetricNVLinkRXBytes:                    {},
	MetricGraphicsEngineActive:             {},
	MetricFrameBufferTotal:                 {},
	MetricFrameBufferUsed:                  {},
	MetricFrameBufferFree:                  {},
	MetricPCIeLinkGen:                      {},
	MetricPCIeLinkWidth:                    {},
	MetricGPUTemperature:                   {},
	MetricMemoryTemperature:                {},
	MetricPowerUsage:                       {},
	MetricGPUUtilization:                   {},
	MetricIntPipeActive:                    {},
	MetricFloat16PipeActive:                {},
	MetricFloat32PipeActive:                {},
	MetricFloat64PipeActive:                {},
	MetricClocksEventReasons:               {},
	MetricXIDErrors:                        {},
	MetricPowerViolation:                   {},
	MetricThermalViolation:                 {},
	MetricSMClock:                          {},
	MetricMemClock:                         {},
}
EnabledMetrics is the set of DCGM metrics to scrape and process.
Functions ¶
func NewContainerMetricsHandler ¶
func NewContainerMetricsHandler(querier MetricsQuerier, log logr.Logger) http.Handler
NewContainerMetricsHandler creates an HTTP handler for GET /container/metrics.
Types ¶
type Exporter ¶
// Exporter ties together the scraper, mapper, and DCGM pod discovery.
// All of its fields are unexported, so values are obtained through
// NewExporter rather than built with a struct literal.
type Exporter struct {
// contains filtered or unexported fields
}
Exporter ties together the scraper, mapper, and DCGM pod discovery.
func NewExporter ¶
func NewExporter( cfg ExporterConfig, dynClient dynamic.Interface, scraper Scraper, mapper MetricMapper, log logr.Logger, ) *Exporter
NewExporter creates a new GPU metrics exporter.
type ExporterConfig ¶
// ExporterConfig holds environment-driven configuration for the exporter.
// It is passed by value to NewExporter.
type ExporterConfig struct {
// NOTE(review): presumably the port the exporter's own HTTP server listens on — confirm.
HTTPListenPort int
// NOTE(review): DCGMHost, DCGMPort and DCGMMetricsEndpoint look like the pieces
// of the DCGM exporter scrape URL; how they are combined (and whether DCGMHost
// takes precedence over pod discovery via DCGMLabels) is not visible here — confirm.
DCGMHost string
DCGMPort int
DCGMMetricsEndpoint string
DCGMLabels string // label selector, e.g. "app.kubernetes.io/name=dcgm-exporter"
// NOTE(review): presumably the Kubernetes node this exporter runs on — confirm.
NodeName string
}
ExporterConfig holds environment-driven configuration.
type GPUMetric ¶
// GPUMetric represents a single GPU's metrics for a container.
// Its fields, their order, and their JSON tags are identical to those of
// GPUMetricResponse.
type GPUMetric struct {
// Node and device identity. The two MIG fields are omitted from JSON when empty.
NodeName string `json:"node_name"`
ModelName string `json:"model_name"`
Device string `json:"device"`
DeviceID string `json:"device_id"`
DeviceUUID string `json:"device_uuid"`
MIGProfile string `json:"mig_profile,omitempty"`
MIGInstanceID string `json:"mig_instance_id,omitempty"`
// Container attribution. WorkloadName and WorkloadKind are omitted from JSON when empty.
Pod string `json:"pod"`
Container string `json:"container"`
Namespace string `json:"namespace"`
WorkloadName string `json:"workload_name,omitempty"`
WorkloadKind string `json:"workload_kind,omitempty"`
// Metric values: 28 float64 fields, the same count as the Metric* constants.
// NOTE(review): each field presumably carries the latest sample of the
// similarly named DCGM metric (e.g. SMActive for DCGM_FI_PROF_SM_ACTIVE);
// units are not stated here — confirm against the mapper and DCGM docs.
SMActive float64 `json:"sm_active"`
SMOccupancy float64 `json:"sm_occupancy"`
TensorActive float64 `json:"tensor_active"`
DRAMActive float64 `json:"dram_active"`
PCIeTXBytes float64 `json:"pcie_tx_bytes"`
PCIeRXBytes float64 `json:"pcie_rx_bytes"`
NVLinkTXBytes float64 `json:"nvlink_tx_bytes"`
NVLinkRXBytes float64 `json:"nvlink_rx_bytes"`
GraphicsEngineActive float64 `json:"graphics_engine_active"`
FramebufferTotal float64 `json:"framebuffer_total"`
FramebufferUsed float64 `json:"framebuffer_used"`
FramebufferFree float64 `json:"framebuffer_free"`
PCIeLinkGen float64 `json:"pcie_link_gen"`
PCIeLinkWidth float64 `json:"pcie_link_width"`
Temperature float64 `json:"temperature"`
MemoryTemperature float64 `json:"memory_temperature"`
PowerUsage float64 `json:"power_usage"`
GPUUtilization float64 `json:"gpu_utilization"`
IntPipeActive float64 `json:"int_pipe_active"`
FP16PipeActive float64 `json:"fp16_pipe_active"`
FP32PipeActive float64 `json:"fp32_pipe_active"`
FP64PipeActive float64 `json:"fp64_pipe_active"`
ClocksEventReasons float64 `json:"clocks_event_reasons"`
XIDErrors float64 `json:"xid_errors"`
PowerViolation float64 `json:"power_violation"`
ThermalViolation float64 `json:"thermal_violation"`
SMClock float64 `json:"sm_clock"`
MemClock float64 `json:"mem_clock"`
// NOTE(review): presumably the time the sample was scraped or mapped — confirm.
Timestamp time.Time `json:"timestamp"`
}
GPUMetric represents a single GPU's metrics for a container.
type GPUMetricResponse ¶
// GPUMetricResponse is the JSON API contract for the /container/metrics endpoint.
// Its fields, their order, and their JSON tags are identical to those of
// GPUMetric; the JSON tag names are therefore part of the public API.
type GPUMetricResponse struct {
// Node and device identity. The two MIG fields are omitted from JSON when empty.
NodeName string `json:"node_name"`
ModelName string `json:"model_name"`
Device string `json:"device"`
DeviceID string `json:"device_id"`
DeviceUUID string `json:"device_uuid"`
MIGProfile string `json:"mig_profile,omitempty"`
MIGInstanceID string `json:"mig_instance_id,omitempty"`
// Container attribution. WorkloadName and WorkloadKind are omitted from JSON when empty.
Pod string `json:"pod"`
Container string `json:"container"`
Namespace string `json:"namespace"`
WorkloadName string `json:"workload_name,omitempty"`
WorkloadKind string `json:"workload_kind,omitempty"`
// Metric values: 28 float64 fields, none with omitempty, so every one is
// always present in the encoded JSON (a zero value encodes as 0).
// NOTE(review): units are not stated here — confirm against DCGM docs
// before documenting them for API consumers.
SMActive float64 `json:"sm_active"`
SMOccupancy float64 `json:"sm_occupancy"`
TensorActive float64 `json:"tensor_active"`
DRAMActive float64 `json:"dram_active"`
PCIeTXBytes float64 `json:"pcie_tx_bytes"`
PCIeRXBytes float64 `json:"pcie_rx_bytes"`
NVLinkTXBytes float64 `json:"nvlink_tx_bytes"`
NVLinkRXBytes float64 `json:"nvlink_rx_bytes"`
GraphicsEngineActive float64 `json:"graphics_engine_active"`
FramebufferTotal float64 `json:"framebuffer_total"`
FramebufferUsed float64 `json:"framebuffer_used"`
FramebufferFree float64 `json:"framebuffer_free"`
PCIeLinkGen float64 `json:"pcie_link_gen"`
PCIeLinkWidth float64 `json:"pcie_link_width"`
Temperature float64 `json:"temperature"`
MemoryTemperature float64 `json:"memory_temperature"`
PowerUsage float64 `json:"power_usage"`
GPUUtilization float64 `json:"gpu_utilization"`
IntPipeActive float64 `json:"int_pipe_active"`
FP16PipeActive float64 `json:"fp16_pipe_active"`
FP32PipeActive float64 `json:"fp32_pipe_active"`
FP64PipeActive float64 `json:"fp64_pipe_active"`
ClocksEventReasons float64 `json:"clocks_event_reasons"`
XIDErrors float64 `json:"xid_errors"`
PowerViolation float64 `json:"power_violation"`
ThermalViolation float64 `json:"thermal_violation"`
SMClock float64 `json:"sm_clock"`
MemClock float64 `json:"mem_clock"`
// time.Time is encoded by encoding/json as an RFC 3339 string.
Timestamp time.Time `json:"timestamp"`
}
GPUMetricResponse is the JSON API contract for the /container/metrics endpoint.
type HTTPClient ¶
HTTPClient abstracts *http.Client for testing.
type MetricFamilyMap ¶
// MetricFamilyMap maps a Prometheus metric name to its parsed metric family.
// Values are pointers, so copies of the map share the underlying families.
// NOTE(review): dto is presumably the Prometheus client_model package — confirm
// against the file's imports.
type MetricFamilyMap map[string]*dto.MetricFamily
MetricFamilyMap maps a Prometheus metric name to its parsed metric family.
type MetricMapper ¶
// MetricMapper maps scraped DCGM metric families into structured GPU metrics.
type MetricMapper interface {
// MapToGPUMetrics converts a slice of MetricFamilyMap values (the same type
// Scraper.Scrape returns) into GPUMetric values. It has no error result.
// NOTE(review): how unmappable input is handled (skipped, logged) is not
// visible from the signature — confirm in the implementation.
MapToGPUMetrics(ctx context.Context, metrics []MetricFamilyMap) []GPUMetric
}
MetricMapper maps scraped DCGM metric families into structured GPU metrics.
func NewMapper ¶
func NewMapper(nodeName string, resolver WorkloadResolver, log logr.Logger) MetricMapper
NewMapper creates a new MetricMapper.
type MetricName ¶
// MetricName identifies a DCGM metric by its Prometheus metric name.
// This is an alias declaration (note the "="), so MetricName and string are
// the same type: any string can be used where a MetricName is expected, and
// the compiler provides no extra type checking.
type MetricName = string
MetricName identifies a DCGM metric by its Prometheus metric name.
// DCGM metric names scraped from the DCGM exporter.
//
// Names with the DCGM_FI_PROF_ prefix and names with the DCGM_FI_DEV_ prefix
// are both present. Each constant also appears as a key in EnabledMetrics.
const (
	MetricStreamingMultiProcessorActive    MetricName = "DCGM_FI_PROF_SM_ACTIVE"
	MetricStreamingMultiProcessorOccupancy MetricName = "DCGM_FI_PROF_SM_OCCUPANCY"
	MetricStreamingMultiProcessorTensor    MetricName = "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE"
	MetricDRAMActive                       MetricName = "DCGM_FI_PROF_DRAM_ACTIVE"
	MetricPCIeTXBytes                      MetricName = "DCGM_FI_PROF_PCIE_TX_BYTES"
	MetricPCIeRXBytes                      MetricName = "DCGM_FI_PROF_PCIE_RX_BYTES"
	MetricNVLinkTXBytes                    MetricName = "DCGM_FI_PROF_NVLINK_TX_BYTES"
	MetricNVLinkRXBytes                    MetricName = "DCGM_FI_PROF_NVLINK_RX_BYTES"
	MetricGraphicsEngineActive             MetricName = "DCGM_FI_PROF_GR_ENGINE_ACTIVE"
	MetricFrameBufferTotal                 MetricName = "DCGM_FI_DEV_FB_TOTAL"
	MetricFrameBufferUsed                  MetricName = "DCGM_FI_DEV_FB_USED"
	MetricFrameBufferFree                  MetricName = "DCGM_FI_DEV_FB_FREE"
	MetricPCIeLinkGen                      MetricName = "DCGM_FI_DEV_PCIE_LINK_GEN"
	MetricPCIeLinkWidth                    MetricName = "DCGM_FI_DEV_PCIE_LINK_WIDTH"
	MetricGPUTemperature                   MetricName = "DCGM_FI_DEV_GPU_TEMP"
	MetricMemoryTemperature                MetricName = "DCGM_FI_DEV_MEMORY_TEMP"
	MetricPowerUsage                       MetricName = "DCGM_FI_DEV_POWER_USAGE"
	MetricGPUUtilization                   MetricName = "DCGM_FI_DEV_GPU_UTIL"
	MetricIntPipeActive                    MetricName = "DCGM_FI_PROF_PIPE_INT_ACTIVE"
	MetricFloat16PipeActive                MetricName = "DCGM_FI_PROF_PIPE_FP16_ACTIVE"
	MetricFloat32PipeActive                MetricName = "DCGM_FI_PROF_PIPE_FP32_ACTIVE"
	MetricFloat64PipeActive                MetricName = "DCGM_FI_PROF_PIPE_FP64_ACTIVE"
	MetricClocksEventReasons               MetricName = "DCGM_FI_DEV_CLOCKS_EVENT_REASONS"
	MetricXIDErrors                        MetricName = "DCGM_FI_DEV_XID_ERRORS"
	MetricPowerViolation                   MetricName = "DCGM_FI_DEV_POWER_VIOLATION"
	MetricThermalViolation                 MetricName = "DCGM_FI_DEV_THERMAL_VIOLATION"
	MetricSMClock                          MetricName = "DCGM_FI_DEV_SM_CLOCK"
	MetricMemClock                         MetricName = "DCGM_FI_DEV_MEM_CLOCK"
)
DCGM metric names scraped from the DCGM exporter.
type MetricsQuerier ¶
MetricsQuerier provides on-demand GPU metrics.
type Scraper ¶
// Scraper fetches and parses Prometheus-format metrics from DCGM exporter endpoints.
type Scraper interface {
// Scrape takes the endpoint URLs to fetch and returns the parsed metric
// families as a slice of MetricFamilyMap, or an error.
// NOTE(review): whether the result holds one map per URL, and whether a
// single failing URL fails the whole call or is skipped, is not visible
// from the signature — confirm in the implementation.
Scrape(ctx context.Context, urls []string) ([]MetricFamilyMap, error)
}
Scraper fetches and parses Prometheus-format metrics from DCGM exporter endpoints.
func NewScraper ¶
func NewScraper(httpClient HTTPClient, log logr.Logger) Scraper
NewScraper creates a new DCGM metrics scraper.
type WorkloadResolver ¶
// WorkloadResolver resolves the top-level owning workload for a pod.
type WorkloadResolver interface {
// FindWorkloadForPod looks up the pod identified by name and namespace
// (in that argument order) and returns the owning workload's kind and
// name (in that result order), or an error.
// NOTE(review): kind is presumably one of the Kind* constants, and a pod
// with no owner presumably resolves to itself with KindPod — confirm.
FindWorkloadForPod(
ctx context.Context,
name, namespace string,
) (kind, workloadName string, err error)
}
WorkloadResolver resolves the top-level owning workload for a pod.
func NewWorkloadResolver ¶
func NewWorkloadResolver( dynClient dynamic.Interface, cfg WorkloadResolverConfig, log logr.Logger, ) WorkloadResolver
NewWorkloadResolver creates a WorkloadResolver that uses the Kubernetes dynamic client to walk owner references and find the top-level owning workload. It supports LRU caching and label-based workload name resolution.
type WorkloadResolverConfig ¶
WorkloadResolverConfig holds configuration for the workload resolver.