models

package
v0.3.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jan 31, 2026 License: MIT Imports: 3 Imported by: 0

Documentation

Overview

Package models defines data structures and types for representing metrics, nodes, jobs, and other observability entities. It provides standardized interfaces for metric collection, type-safe data handling, and consistent serialization across the observability plugin components.

Index

Constants

This section is empty.

Variables

View Source
// CommonAggregations maps an aggregation name ("avg", "min", "max", "sum",
// "count") to the function that reduces a slice of samples to a single value.
// "avg", "min" and "max" yield 0 when given an empty slice; "sum" and "count"
// naturally yield 0 in that case as well.
var CommonAggregations = map[string]AggregationFunc{
	// Arithmetic mean of the samples.
	"avg": func(values []float64) float64 {
		n := len(values)
		if n == 0 {
			return 0
		}
		var total float64
		for i := 0; i < n; i++ {
			total += values[i]
		}
		return total / float64(n)
	},
	// Smallest sample; the first element seeds the running minimum.
	"min": func(values []float64) float64 {
		if len(values) == 0 {
			return 0
		}
		lowest := values[0]
		for i := 1; i < len(values); i++ {
			if candidate := values[i]; candidate < lowest {
				lowest = candidate
			}
		}
		return lowest
	},
	// Largest sample; the first element seeds the running maximum.
	"max": func(values []float64) float64 {
		if len(values) == 0 {
			return 0
		}
		highest := values[0]
		for i := 1; i < len(values); i++ {
			if candidate := values[i]; candidate > highest {
				highest = candidate
			}
		}
		return highest
	},
	// Left-to-right total of the samples.
	"sum": func(values []float64) float64 {
		var total float64
		for i := range values {
			total += values[i]
		}
		return total
	},
	// Number of samples, as a float64.
	"count": func(values []float64) float64 {
		n := len(values)
		return float64(n)
	},
}

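CommonAggregations maps the aggregation names "avg", "min", "max", "sum", and "count" to their implementations. The "avg", "min", and "max" functions return 0 for an empty slice.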

Functions

func FormatJobMetrics

func FormatJobMetrics(job *JobMetrics) string

FormatJobMetrics formats job metrics for display

func FormatNodeMetrics

func FormatNodeMetrics(node *NodeMetrics) string

FormatNodeMetrics formats node metrics for display

func FormatValue

func FormatValue(value float64, unit string) string

FormatValue formats a metric value with appropriate units

func GetColorForUsage

func GetColorForUsage(usage float64) string

GetColorForUsage returns a color based on usage percentage

func GetEfficiencyColor

func GetEfficiencyColor(efficiency float64) string

GetEfficiencyColor returns a color based on efficiency percentage

Types

type AggregateJobMetrics

// AggregateJobMetrics represents cluster-wide job aggregate metrics: the
// running-job count, allocated/used/wasted CPU and memory totals, and
// utilization and efficiency figures, stamped with Timestamp.
//
// NOTE(review): the memory totals are presumably bytes (JobMetrics.AllocatedMem
// is documented as bytes) and the utilization/efficiency values presumably
// percentages — confirm against the code that fills this struct.
type AggregateJobMetrics struct {
	Timestamp            time.Time `json:"timestamp"`
	RunningJobs          int       `json:"running_jobs"`
	TotalAllocatedCPUs   int       `json:"total_allocated_cpus"`
	TotalAllocatedMem    uint64    `json:"total_allocated_mem"`
	TotalUsedCPUs        float64   `json:"total_used_cpus"`
	TotalUsedMem         uint64    `json:"total_used_mem"`
	TotalWastedCPUs      float64   `json:"total_wasted_cpus"`
	TotalWastedMem       uint64    `json:"total_wasted_mem"`
	CPUUtilization       float64   `json:"cpu_utilization"`
	MemUtilization       float64   `json:"mem_utilization"`
	AverageEfficiency    float64   `json:"average_efficiency"`
	TotalEfficiencyScore float64   `json:"total_efficiency_score"`
}

AggregateJobMetrics represents cluster-wide job aggregate metrics

type AggregateNodeMetrics

// AggregateNodeMetrics represents cluster-wide aggregate metrics over nodes:
// node and core counts, memory totals, CPU usage and load figures, disk and
// network totals, and the total job count, stamped with Timestamp.
//
// NOTE(review): units are not stated here; memory is presumably bytes and
// disk/network totals presumably bytes/sec (matching DiskMetrics and
// NetworkMetrics) — confirm against the aggregation code.
type AggregateNodeMetrics struct {
	Timestamp          time.Time `json:"timestamp"`
	ActiveNodes        int       `json:"active_nodes"`
	TotalCPUCores      int       `json:"total_cpu_cores"`
	TotalMemory        uint64    `json:"total_memory"`
	UsedMemory         uint64    `json:"used_memory"`
	MemoryUsagePercent float64   `json:"memory_usage_percent"`
	TotalCPUUsage      float64   `json:"total_cpu_usage"`
	AverageCPUUsage    float64   `json:"average_cpu_usage"`
	TotalLoadAverage   float64   `json:"total_load_average"`
	AverageLoadPerCore float64   `json:"average_load_per_core"`
	TotalDiskRead      float64   `json:"total_disk_read"`
	TotalDiskWrite     float64   `json:"total_disk_write"`
	TotalNetworkRx     float64   `json:"total_network_rx"`
	TotalNetworkTx     float64   `json:"total_network_tx"`
	TotalJobs          int       `json:"total_jobs"`
}

AggregateNodeMetrics represents cluster-wide aggregate node metrics

type AggregationFunc

// AggregationFunc represents a function for aggregating metric values: it
// takes a slice of float64 samples and reduces it to a single float64.
type AggregationFunc func([]float64) float64

AggregationFunc represents a function for aggregating metric values

type Alert

// Alert represents a monitoring alert: its identity, severity and lifecycle
// state, the metric value and threshold it relates to, and resolution details.
type Alert struct {
	ID         string            `json:"id"`
	Name       string            `json:"name"`
	Severity   string            `json:"severity"` // "info", "warning", "critical"
	State      string            `json:"state"`    // "pending", "firing", "resolved"
	Message    string            `json:"message"`
	Resolution string            `json:"resolution,omitempty"`
	Source     string            `json:"source"`
	Metric     string            `json:"metric"`
	Value      float64           `json:"value"`
	Threshold  float64           `json:"threshold"`
	Timestamp  time.Time         `json:"timestamp"`
	Resolved   bool              `json:"resolved"`
	// NOTE(review): encoding/json's omitempty never omits a struct value, so a
	// zero ResolvedAt is still serialized (as "0001-01-01T00:00:00Z"); consumers
	// should rely on Resolved rather than on the key being absent.
	ResolvedAt time.Time         `json:"resolved_at,omitempty"`
	Labels     map[string]string `json:"labels,omitempty"`
}

Alert represents a monitoring alert

type CPUMetrics

// CPUMetrics represents CPU-related metrics: overall usage, core count, load
// averages, and the throttled/system/user/IO-wait breakdown, plus a CPU limit.
//
// NOTE(review): Limit is documented as "cores or millicores"; which unit
// applies is not determinable from this declaration — confirm with the producer.
type CPUMetrics struct {
	Usage     float64 `json:"usage"`     // Percentage (0-100)
	Cores     int     `json:"cores"`     // Number of cores
	Load1m    float64 `json:"load_1m"`   // 1-minute load average
	Load5m    float64 `json:"load_5m"`   // 5-minute load average
	Load15m   float64 `json:"load_15m"`  // 15-minute load average
	Throttled float64 `json:"throttled"` // Throttled percentage
	System    float64 `json:"system"`    // System CPU percentage
	User      float64 `json:"user"`      // User CPU percentage
	IOWait    float64 `json:"io_wait"`   // IO wait percentage
	Limit     float64 `json:"limit"`     // CPU limit (cores or millicores)
}

CPUMetrics represents CPU-related metrics

type DiskMetrics

// DiskMetrics represents disk I/O metrics: read/write throughput in bytes per
// second, read/write operation rates, and an I/O utilization percentage.
type DiskMetrics struct {
	ReadBytesPerSec  float64 `json:"read_bytes_per_sec"`  // Bytes/sec
	WriteBytesPerSec float64 `json:"write_bytes_per_sec"` // Bytes/sec
	ReadOpsPerSec    float64 `json:"read_ops_per_sec"`    // Operations/sec
	WriteOpsPerSec   float64 `json:"write_ops_per_sec"`   // Operations/sec
	IOUtilization    float64 `json:"io_utilization"`      // Percentage (0-100)
}

DiskMetrics represents disk I/O metrics

type EfficiencyMetrics

// EfficiencyMetrics tracks resource efficiency: how much of the allocated CPU
// and memory is actually used, a combined score, and the amounts left unused.
//
// NOTE(review): how OverallEfficiency combines the CPU and memory figures is
// not visible in this declaration.
type EfficiencyMetrics struct {
	CPUEfficiency     float64 `json:"cpu_efficiency"`     // Actual vs Allocated CPU %
	MemEfficiency     float64 `json:"mem_efficiency"`     // Actual vs Allocated Memory %
	OverallEfficiency float64 `json:"overall_efficiency"` // Combined efficiency score
	CPUWasted         float64 `json:"cpu_wasted"`         // Wasted CPU cores
	MemWasted         uint64  `json:"mem_wasted"`         // Wasted memory bytes
}

EfficiencyMetrics tracks resource efficiency

type JobInfo

// JobInfo contains SLURM job information. Its fields share names and types
// with the identically named fields of JobMetrics, and it is the payload
// accepted by JobMetricsCollector.UpdateJobInfo.
//
// The struct carries no JSON tags, so encoding/json would use the Go field
// names as keys if it were marshaled.
// NOTE(review): AllocatedMem is presumably bytes, as in JobMetrics — confirm.
type JobInfo struct {
	JobName       string
	User          string
	State         string
	NodeList      []string
	StartTime     time.Time
	AllocatedCPUs int
	AllocatedMem  uint64
}

JobInfo contains SLURM job information

type JobMetrics

// JobMetrics represents metrics for a SLURM job: its identity and scheduling
// details (name, user, state, nodes, start time, allocation), the measured
// resource usage and derived efficiency, the cgroup path, Prometheus labels,
// and the time of the last update.
type JobMetrics struct {
	JobID         string            `json:"job_id"`
	JobName       string            `json:"job_name"`
	User          string            `json:"user"`
	State         string            `json:"state"`
	NodeList      []string          `json:"node_list"`
	StartTime     time.Time         `json:"start_time"`
	AllocatedCPUs int               `json:"allocated_cpus"`
	AllocatedMem  uint64            `json:"allocated_mem"` // in bytes
	Resources     ResourceMetrics   `json:"resources"`
	Efficiency    EfficiencyMetrics `json:"efficiency"`
	CgroupPath    string            `json:"cgroup_path"`
	Labels        map[string]string `json:"labels"` // Prometheus labels
	LastUpdate    time.Time         `json:"last_update"`
}

JobMetrics represents metrics for a SLURM job

func (*JobMetrics) IsEfficient

func (j *JobMetrics) IsEfficient(threshold float64) bool

IsEfficient returns whether a job is using resources efficiently

type JobMetricsCollector

// JobMetricsCollector collects and manages job metrics. All of its fields are
// unexported; obtain an instance from NewJobMetricsCollector and interact with
// it through its methods.
type JobMetricsCollector struct {
	// contains filtered or unexported fields
}

JobMetricsCollector collects and manages job metrics

func NewJobMetricsCollector

func NewJobMetricsCollector(cgroupPattern string) *JobMetricsCollector

NewJobMetricsCollector creates a new job metrics collector

func (*JobMetricsCollector) GetActiveJobs

func (jmc *JobMetricsCollector) GetActiveJobs() map[string]*JobMetrics

GetActiveJobs returns only running jobs

func (*JobMetricsCollector) GetAggregateMetrics

func (jmc *JobMetricsCollector) GetAggregateMetrics() *AggregateJobMetrics

GetAggregateMetrics returns aggregate metrics across all jobs

func (*JobMetricsCollector) GetAllJobs

func (jmc *JobMetricsCollector) GetAllJobs() map[string]*JobMetrics

GetAllJobs returns all job metrics

func (*JobMetricsCollector) GetInefficiientJobs

func (jmc *JobMetricsCollector) GetInefficiientJobs(threshold float64) []*JobMetrics

GetInefficiientJobs returns jobs with low resource efficiency

func (*JobMetricsCollector) GetJob

func (jmc *JobMetricsCollector) GetJob(jobID string) (*JobMetrics, bool)

GetJob returns metrics for a specific job

func (*JobMetricsCollector) GetJobsSummary

func (jmc *JobMetricsCollector) GetJobsSummary() map[string]int

GetJobsSummary returns a summary of jobs by state

func (*JobMetricsCollector) RemoveJob

func (jmc *JobMetricsCollector) RemoveJob(jobID string)

RemoveJob removes a job from tracking

func (*JobMetricsCollector) UpdateFromPrometheus

func (jmc *JobMetricsCollector) UpdateFromPrometheus(jobID string, metrics map[string]*TimeSeries)

UpdateFromPrometheus updates job metrics from Prometheus data

func (*JobMetricsCollector) UpdateJobInfo

func (jmc *JobMetricsCollector) UpdateJobInfo(jobID string, info *JobInfo)

UpdateJobInfo updates SLURM job information

type MemoryMetrics

// MemoryMetrics represents memory-related metrics. All sizes are in bytes;
// Usage is a percentage in the range 0-100.
type MemoryMetrics struct {
	Total     uint64  `json:"total"`      // Total memory in bytes
	Used      uint64  `json:"used"`       // Used memory in bytes
	Available uint64  `json:"available"`  // Available memory in bytes
	Cache     uint64  `json:"cache"`      // Cache memory in bytes
	Buffer    uint64  `json:"buffer"`     // Buffer memory in bytes
	Usage     float64 `json:"usage"`      // Usage percentage (0-100)
	SwapTotal uint64  `json:"swap_total"` // Total swap in bytes
	SwapUsed  uint64  `json:"swap_used"`  // Used swap in bytes
	Limit     uint64  `json:"limit"`      // Memory limit in bytes (for containers/jobs)
}

MemoryMetrics represents memory-related metrics

type MetricCollection

// MetricCollection represents a collection of related metrics: a set of named
// time series belonging to one entity identified by ID, Name and Type.
//
// NOTE(review): Metrics stores TimeSeries by value, while AddMetric accepts a
// *TimeSeries and GetMetric returns one; presumably series are copied on the
// way in and/or out, so changes made through a pointer may not be reflected in
// the map — verify against the method implementations.
type MetricCollection struct {
	ID         string                `json:"id"`
	Name       string                `json:"name"`
	Type       string                `json:"type"` // "node", "job", "cluster"
	Metrics    map[string]TimeSeries `json:"metrics"`
	LastUpdate time.Time             `json:"last_update"`
}

MetricCollection represents a collection of related metrics

func NewMetricCollection

func NewMetricCollection(id, name, collectionType string) *MetricCollection

NewMetricCollection creates a new metric collection

func (*MetricCollection) AddMetric

func (mc *MetricCollection) AddMetric(name string, ts *TimeSeries)

AddMetric adds a time series to the collection

func (*MetricCollection) GetMetric

func (mc *MetricCollection) GetMetric(name string) (*TimeSeries, bool)

GetMetric retrieves a specific metric by name

type MetricType

// MetricType represents the type of metric. It is a string type; the
// MetricType* constants declare the predefined values.
type MetricType string

MetricType represents the type of metric

// Predefined MetricType values. Each constant's string value is the lowercase
// name of the metric category it denotes.
const (
	// MetricTypeCPU is the metric type for CPU metrics.
	MetricTypeCPU MetricType = "cpu"
	// MetricTypeMemory is the metric type for memory metrics.
	MetricTypeMemory MetricType = "memory"
	// MetricTypeDisk is the metric type for disk metrics.
	MetricTypeDisk MetricType = "disk"
	// MetricTypeNetwork is the metric type for network metrics.
	MetricTypeNetwork MetricType = "network"
	// MetricTypeLoad is the metric type for load metrics.
	MetricTypeLoad MetricType = "load"
	// MetricTypeTemperature is the metric type for temperature metrics.
	MetricTypeTemperature MetricType = "temperature"
	// MetricTypeCustom is the metric type for custom metrics.
	MetricTypeCustom MetricType = "custom"
)

type MetricValue

// MetricValue represents a single metric measurement: a float64 value taken at
// Timestamp, with its labels, unit, and optional free-form metadata (omitted
// from JSON when the map is nil or empty).
type MetricValue struct {
	Timestamp time.Time              `json:"timestamp"`
	Value     float64                `json:"value"`
	Labels    map[string]string      `json:"labels"`
	Unit      string                 `json:"unit"`
	Metadata  map[string]interface{} `json:"metadata,omitempty"`
}

MetricValue represents a single metric measurement

type NetworkMetrics

// NetworkMetrics represents network I/O metrics: receive/transmit throughput
// in bytes per second, packet rates, and cumulative error counts.
type NetworkMetrics struct {
	ReceiveBytesPerSec    float64 `json:"receive_bytes_per_sec"`    // Bytes/sec
	TransmitBytesPerSec   float64 `json:"transmit_bytes_per_sec"`   // Bytes/sec
	ReceivePacketsPerSec  float64 `json:"receive_packets_per_sec"`  // Packets/sec
	TransmitPacketsPerSec float64 `json:"transmit_packets_per_sec"` // Packets/sec
	ReceiveErrors         uint64  `json:"receive_errors"`           // Total errors
	TransmitErrors        uint64  `json:"transmit_errors"`          // Total errors
}

NetworkMetrics represents network I/O metrics

type NodeMetrics

// NodeMetrics represents metrics for a compute node: its name and SLURM state,
// the measured resource usage, the number of jobs on it, Prometheus labels,
// any additional custom metrics, and the time of the last update.
type NodeMetrics struct {
	NodeName      string             `json:"node_name"`
	NodeState     string             `json:"node_state"` // SLURM state
	LastUpdate    time.Time          `json:"last_update"`
	Resources     ResourceMetrics    `json:"resources"`
	JobCount      int                `json:"job_count"`
	Labels        map[string]string  `json:"labels"`         // Prometheus labels
	CustomMetrics map[string]float64 `json:"custom_metrics"` // Additional metrics
}

NodeMetrics represents metrics for a compute node

func (*NodeMetrics) GetHealthStatus

func (n *NodeMetrics) GetHealthStatus() string

GetHealthStatus returns the health status of a node based on metrics

type NodeMetricsCollector

// NodeMetricsCollector collects and manages node metrics. All of its fields
// are unexported; obtain an instance from NewNodeMetricsCollector and interact
// with it through its methods.
type NodeMetricsCollector struct {
	// contains filtered or unexported fields
}

NodeMetricsCollector collects and manages node metrics

func NewNodeMetricsCollector

func NewNodeMetricsCollector(nodeLabel string) *NodeMetricsCollector

NewNodeMetricsCollector creates a new node metrics collector

func (*NodeMetricsCollector) GetAggregateMetrics

func (nmc *NodeMetricsCollector) GetAggregateMetrics() *AggregateNodeMetrics

GetAggregateMetrics returns aggregate metrics across all nodes

func (*NodeMetricsCollector) GetAllNodes

func (nmc *NodeMetricsCollector) GetAllNodes() map[string]*NodeMetrics

GetAllNodes returns all node metrics

func (*NodeMetricsCollector) GetNode

func (nmc *NodeMetricsCollector) GetNode(nodeName string) (*NodeMetrics, bool)

GetNode returns metrics for a specific node

func (*NodeMetricsCollector) GetNodesSummary

func (nmc *NodeMetricsCollector) GetNodesSummary() map[string]int

GetNodesSummary returns a summary of all nodes by state

func (*NodeMetricsCollector) MapSLURMToPrometheus

func (nmc *NodeMetricsCollector) MapSLURMToPrometheus(slurmName string) string

MapSLURMToPrometheus maps SLURM node names to Prometheus labels

func (*NodeMetricsCollector) UpdateFromPrometheus

func (nmc *NodeMetricsCollector) UpdateFromPrometheus(nodeName string, metrics map[string]*TimeSeries)

UpdateFromPrometheus updates node metrics from Prometheus data

func (*NodeMetricsCollector) UpdateNodeState

func (nmc *NodeMetricsCollector) UpdateNodeState(nodeName, state string, jobCount int)

UpdateNodeState updates the SLURM state for a node

type ResourceMetrics

// ResourceMetrics represents aggregated resource metrics: it bundles the CPU,
// memory, disk and network metric groups together with a Timestamp. It is
// embedded as the Resources field of both JobMetrics and NodeMetrics.
type ResourceMetrics struct {
	CPU       CPUMetrics     `json:"cpu"`
	Memory    MemoryMetrics  `json:"memory"`
	Disk      DiskMetrics    `json:"disk"`
	Network   NetworkMetrics `json:"network"`
	Timestamp time.Time      `json:"timestamp"`
}

ResourceMetrics represents aggregated resource metrics

type TimeSeries

// TimeSeries represents a time series of metric values: a named, labeled
// sequence of MetricValue samples with a unit and a MetricType.
//
// NOTE(review): whether Values is kept in chronological order is not visible
// from this declaration — confirm before relying on index order.
type TimeSeries struct {
	Name   string            `json:"name"`
	Labels map[string]string `json:"labels"`
	Values []MetricValue     `json:"values"`
	Unit   string            `json:"unit"`
	Type   MetricType        `json:"type"`
}

TimeSeries represents a time series of metric values

func (*TimeSeries) Add

func (ts *TimeSeries) Add(value MetricValue)

Add adds a metric value to the time series

func (*TimeSeries) Average

func (ts *TimeSeries) Average() float64

Average calculates the average value over the time series

func (*TimeSeries) Latest

func (ts *TimeSeries) Latest() *MetricValue

Latest returns the most recent value

func (*TimeSeries) Max

func (ts *TimeSeries) Max() float64

Max returns the maximum value in the time series

func (*TimeSeries) Min

func (ts *TimeSeries) Min() float64

Min returns the minimum value in the time series

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL