Documentation ¶
Overview ¶
Package models defines the data structures used to represent metrics, nodes, jobs, and other observability entities. It provides standardized types for metric collection, type-safe data handling, and consistent serialization across the observability plugin's components.
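For orientation, a minimal sketch of wiring a collector and formatter together; the import path, node name, and SLURM state string below are placeholders, not values defined by this package:

package main

import (
	"fmt"

	// Assumed import path; substitute the real module path of this package.
	"example.com/observability/models"
)

func main() {
	// Track node metrics, matching Prometheus series on a (hypothetical) "instance" label.
	nodes := models.NewNodeMetricsCollector("instance")
	nodes.UpdateNodeState("node001", "allocated", 2)

	if node, ok := nodes.GetNode("node001"); ok {
		fmt.Println(models.FormatNodeMetrics(node))
	}
}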
Index ¶
- Variables
- func FormatJobMetrics(job *JobMetrics) string
- func FormatNodeMetrics(node *NodeMetrics) string
- func FormatValue(value float64, unit string) string
- func GetColorForUsage(usage float64) string
- func GetEfficiencyColor(efficiency float64) string
- type AggregateJobMetrics
- type AggregateNodeMetrics
- type AggregationFunc
- type Alert
- type CPUMetrics
- type DiskMetrics
- type EfficiencyMetrics
- type JobInfo
- type JobMetrics
- func (j *JobMetrics) IsEfficient(threshold float64) bool
- type JobMetricsCollector
- func NewJobMetricsCollector(cgroupPattern string) *JobMetricsCollector
- func (jmc *JobMetricsCollector) GetActiveJobs() map[string]*JobMetrics
- func (jmc *JobMetricsCollector) GetAggregateMetrics() *AggregateJobMetrics
- func (jmc *JobMetricsCollector) GetAllJobs() map[string]*JobMetrics
- func (jmc *JobMetricsCollector) GetInefficiientJobs(threshold float64) []*JobMetrics
- func (jmc *JobMetricsCollector) GetJob(jobID string) (*JobMetrics, bool)
- func (jmc *JobMetricsCollector) GetJobsSummary() map[string]int
- func (jmc *JobMetricsCollector) RemoveJob(jobID string)
- func (jmc *JobMetricsCollector) UpdateFromPrometheus(jobID string, metrics map[string]*TimeSeries)
- func (jmc *JobMetricsCollector) UpdateJobInfo(jobID string, info JobInfo)
- type MemoryMetrics
- type MetricCollection
- func NewMetricCollection(id, name, collectionType string) *MetricCollection
- func (mc *MetricCollection) AddMetric(name string, ts TimeSeries)
- func (mc *MetricCollection) GetMetric(name string) (*TimeSeries, bool)
- type MetricType
- type MetricValue
- type NetworkMetrics
- type NodeMetrics
- func (n *NodeMetrics) GetHealthStatus() string
- type NodeMetricsCollector
- func NewNodeMetricsCollector(nodeLabel string) *NodeMetricsCollector
- func (nmc *NodeMetricsCollector) GetAggregateMetrics() *AggregateNodeMetrics
- func (nmc *NodeMetricsCollector) GetAllNodes() map[string]*NodeMetrics
- func (nmc *NodeMetricsCollector) GetNode(nodeName string) (*NodeMetrics, bool)
- func (nmc *NodeMetricsCollector) GetNodesSummary() map[string]int
- func (nmc *NodeMetricsCollector) MapSLURMToPrometheus(slurmName string) string
- func (nmc *NodeMetricsCollector) UpdateFromPrometheus(nodeName string, metrics map[string]*TimeSeries)
- func (nmc *NodeMetricsCollector) UpdateNodeState(nodeName, state string, jobCount int)
- type ResourceMetrics
- type TimeSeries
- func (ts *TimeSeries) Add(value MetricValue)
- func (ts *TimeSeries) Average() float64
- func (ts *TimeSeries) Latest() *MetricValue
- func (ts *TimeSeries) Max() float64
- func (ts *TimeSeries) Min() float64
Constants ¶
This section is empty.
Variables ¶
var CommonAggregations = map[string]AggregationFunc{
	"avg": func(values []float64) float64 {
		if len(values) == 0 {
			return 0
		}
		sum := 0.0
		for _, v := range values {
			sum += v
		}
		return sum / float64(len(values))
	},
	"min": func(values []float64) float64 {
		if len(values) == 0 {
			return 0
		}
		min := values[0]
		for _, v := range values[1:] {
			if v < min {
				min = v
			}
		}
		return min
	},
	"max": func(values []float64) float64 {
		if len(values) == 0 {
			return 0
		}
		max := values[0]
		for _, v := range values[1:] {
			if v > max {
				max = v
			}
		}
		return max
	},
	"sum": func(values []float64) float64 {
		sum := 0.0
		for _, v := range values {
			sum += v
		}
		return sum
	},
	"count": func(values []float64) float64 {
		return float64(len(values))
	},
}
CommonAggregations provides common aggregation functions
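A short sketch of calling these aggregations directly on a slice of samples (the values are arbitrary):

values := []float64{12.5, 80.0, 43.2}
avg := models.CommonAggregations["avg"](values)   // 45.2
max := models.CommonAggregations["max"](values)   // 80.0
cnt := models.CommonAggregations["count"](values) // 3
fmt.Printf("avg=%.1f max=%.1f count=%.0f\n", avg, max, cnt)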
Functions ¶
func FormatJobMetrics ¶
func FormatJobMetrics(job *JobMetrics) string
FormatJobMetrics formats job metrics for display
func FormatNodeMetrics ¶
func FormatNodeMetrics(node *NodeMetrics) string
FormatNodeMetrics formats node metrics for display
func FormatValue ¶
func FormatValue(value float64, unit string) string
FormatValue formats a metric value with appropriate units
func GetColorForUsage ¶
func GetColorForUsage(usage float64) string
GetColorForUsage returns a color based on usage percentage
func GetEfficiencyColor ¶
func GetEfficiencyColor(efficiency float64) string
GetEfficiencyColor returns a color based on efficiency percentage
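Taken together, the formatting and color helpers can render a usage figure for display. The "percent" unit string and the exact color strings returned are assumptions here; consult the implementation for the supported units and color values:

usage := 87.5
label := models.FormatValue(usage, "percent") // unit handling is implementation-defined
color := models.GetColorForUsage(usage)       // higher usage is expected to map to a more severe color
fmt.Println(color, label)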
Types ¶
type AggregateJobMetrics ¶
type AggregateJobMetrics struct {
Timestamp time.Time `json:"timestamp"`
RunningJobs int `json:"running_jobs"`
TotalAllocatedCPUs int `json:"total_allocated_cpus"`
TotalAllocatedMem uint64 `json:"total_allocated_mem"`
TotalUsedCPUs float64 `json:"total_used_cpus"`
TotalUsedMem uint64 `json:"total_used_mem"`
TotalWastedCPUs float64 `json:"total_wasted_cpus"`
TotalWastedMem uint64 `json:"total_wasted_mem"`
CPUUtilization float64 `json:"cpu_utilization"`
MemUtilization float64 `json:"mem_utilization"`
AverageEfficiency float64 `json:"average_efficiency"`
TotalEfficiencyScore float64 `json:"total_efficiency_score"`
}
AggregateJobMetrics represents cluster-wide job aggregate metrics
type AggregateNodeMetrics ¶
type AggregateNodeMetrics struct {
Timestamp time.Time `json:"timestamp"`
ActiveNodes int `json:"active_nodes"`
TotalCPUCores int `json:"total_cpu_cores"`
TotalMemory uint64 `json:"total_memory"`
UsedMemory uint64 `json:"used_memory"`
MemoryUsagePercent float64 `json:"memory_usage_percent"`
TotalCPUUsage float64 `json:"total_cpu_usage"`
AverageCPUUsage float64 `json:"average_cpu_usage"`
TotalLoadAverage float64 `json:"total_load_average"`
AverageLoadPerCore float64 `json:"average_load_per_core"`
TotalDiskRead float64 `json:"total_disk_read"`
TotalDiskWrite float64 `json:"total_disk_write"`
TotalNetworkRx float64 `json:"total_network_rx"`
TotalNetworkTx float64 `json:"total_network_tx"`
TotalJobs int `json:"total_jobs"`
}
AggregateNodeMetrics represents cluster-wide aggregate metrics
type AggregationFunc ¶
type AggregationFunc func(values []float64) float64
AggregationFunc represents a function for aggregating metric values
type Alert ¶
type Alert struct {
ID string `json:"id"`
Name string `json:"name"`
Severity string `json:"severity"` // "info", "warning", "critical"
State string `json:"state"` // "pending", "firing", "resolved"
Message string `json:"message"`
Resolution string `json:"resolution,omitempty"`
Source string `json:"source"`
Metric string `json:"metric"`
Value float64 `json:"value"`
Threshold float64 `json:"threshold"`
Timestamp time.Time `json:"timestamp"`
Resolved bool `json:"resolved"`
ResolvedAt time.Time `json:"resolved_at,omitempty"`
Labels map[string]string `json:"labels,omitempty"`
}
Alert represents a monitoring alert
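For illustration, an Alert might be populated like this; every field value below is a made-up example, but the severity and state strings follow the documented sets:

alert := models.Alert{
	ID:        "node001-cpu-high",
	Name:      "HighCPUUsage",
	Severity:  "warning", // one of "info", "warning", "critical"
	State:     "firing",  // one of "pending", "firing", "resolved"
	Message:   "CPU usage above threshold on node001",
	Source:    "node-exporter",
	Metric:    "cpu_usage",
	Value:     92.3,
	Threshold: 90.0,
	Timestamp: time.Now(),
	Labels:    map[string]string{"node": "node001"},
}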
type CPUMetrics ¶
type CPUMetrics struct {
Usage float64 `json:"usage"` // Percentage (0-100)
Cores int `json:"cores"` // Number of cores
Load1m float64 `json:"load_1m"` // 1-minute load average
Load5m float64 `json:"load_5m"` // 5-minute load average
Load15m float64 `json:"load_15m"` // 15-minute load average
Throttled float64 `json:"throttled"` // Throttled percentage
System float64 `json:"system"` // System CPU percentage
User float64 `json:"user"` // User CPU percentage
IOWait float64 `json:"io_wait"` // IO wait percentage
Limit float64 `json:"limit"` // CPU limit (cores or millicores)
}
CPUMetrics represents CPU-related metrics
type DiskMetrics ¶
type DiskMetrics struct {
ReadBytesPerSec float64 `json:"read_bytes_per_sec"` // Bytes/sec
WriteBytesPerSec float64 `json:"write_bytes_per_sec"` // Bytes/sec
ReadOpsPerSec float64 `json:"read_ops_per_sec"` // Operations/sec
WriteOpsPerSec float64 `json:"write_ops_per_sec"` // Operations/sec
IOUtilization float64 `json:"io_utilization"` // Percentage (0-100)
}
DiskMetrics represents disk I/O metrics
type EfficiencyMetrics ¶
type EfficiencyMetrics struct {
CPUEfficiency float64 `json:"cpu_efficiency"` // Actual vs Allocated CPU %
MemEfficiency float64 `json:"mem_efficiency"` // Actual vs Allocated Memory %
OverallEfficiency float64 `json:"overall_efficiency"` // Combined efficiency score
CPUWasted float64 `json:"cpu_wasted"` // Wasted CPU cores
MemWasted uint64 `json:"mem_wasted"` // Wasted memory bytes
}
EfficiencyMetrics tracks resource efficiency
type JobInfo ¶
type JobInfo struct {
JobName string
User string
State string
NodeList []string
StartTime time.Time
AllocatedCPUs int
AllocatedMem uint64
}
JobInfo contains SLURM job information
type JobMetrics ¶
type JobMetrics struct {
JobID string `json:"job_id"`
JobName string `json:"job_name"`
User string `json:"user"`
State string `json:"state"`
NodeList []string `json:"node_list"`
StartTime time.Time `json:"start_time"`
AllocatedCPUs int `json:"allocated_cpus"`
AllocatedMem uint64 `json:"allocated_mem"` // in bytes
Resources ResourceMetrics `json:"resources"`
Efficiency EfficiencyMetrics `json:"efficiency"`
CgroupPath string `json:"cgroup_path"`
Labels map[string]string `json:"labels"` // Prometheus labels
LastUpdate time.Time `json:"last_update"`
}
JobMetrics represents metrics for a SLURM job
func (*JobMetrics) IsEfficient ¶
func (j *JobMetrics) IsEfficient(threshold float64) bool
IsEfficient returns whether a job is using resources efficiently
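A minimal usage sketch, assuming jobs is a *JobMetricsCollector (described below) and a 70% efficiency threshold:

for id, job := range jobs.GetActiveJobs() {
	if !job.IsEfficient(70.0) {
		fmt.Printf("job %s (%s): overall efficiency %.1f%%\n",
			id, job.JobName, job.Efficiency.OverallEfficiency)
	}
}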
type JobMetricsCollector ¶
type JobMetricsCollector struct {
// contains filtered or unexported fields
}
JobMetricsCollector collects and manages job metrics
func NewJobMetricsCollector ¶
func NewJobMetricsCollector(cgroupPattern string) *JobMetricsCollector
NewJobMetricsCollector creates a new job metrics collector
func (*JobMetricsCollector) GetActiveJobs ¶
func (jmc *JobMetricsCollector) GetActiveJobs() map[string]*JobMetrics
GetActiveJobs returns only running jobs
func (*JobMetricsCollector) GetAggregateMetrics ¶
func (jmc *JobMetricsCollector) GetAggregateMetrics() *AggregateJobMetrics
GetAggregateMetrics returns aggregate metrics across all jobs
func (*JobMetricsCollector) GetAllJobs ¶
func (jmc *JobMetricsCollector) GetAllJobs() map[string]*JobMetrics
GetAllJobs returns all job metrics
func (*JobMetricsCollector) GetInefficiientJobs ¶
func (jmc *JobMetricsCollector) GetInefficiientJobs(threshold float64) []*JobMetrics
GetInefficiientJobs returns jobs with low resource efficiency
func (*JobMetricsCollector) GetJob ¶
func (jmc *JobMetricsCollector) GetJob(jobID string) (*JobMetrics, bool)
GetJob returns metrics for a specific job
func (*JobMetricsCollector) GetJobsSummary ¶
func (jmc *JobMetricsCollector) GetJobsSummary() map[string]int
GetJobsSummary returns a summary of jobs by state
func (*JobMetricsCollector) RemoveJob ¶
func (jmc *JobMetricsCollector) RemoveJob(jobID string)
RemoveJob removes a job from tracking
func (*JobMetricsCollector) UpdateFromPrometheus ¶
func (jmc *JobMetricsCollector) UpdateFromPrometheus(jobID string, metrics map[string]*TimeSeries)
UpdateFromPrometheus updates job metrics from Prometheus data
func (*JobMetricsCollector) UpdateJobInfo ¶
func (jmc *JobMetricsCollector) UpdateJobInfo(jobID string, info JobInfo)
UpdateJobInfo updates SLURM job information
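Putting the collector methods together, a sketch of the expected update flow. The cgroup pattern, job ID, and the "cpu_usage" metric key are placeholders; the real names depend on the SLURM and Prometheus integration layers:

jobs := models.NewJobMetricsCollector("/sys/fs/cgroup/slurm/job_%s") // pattern is illustrative
jobs.UpdateJobInfo("12345", models.JobInfo{
	JobName:       "train-model",
	User:          "alice",
	State:         "RUNNING",
	NodeList:      []string{"node001"},
	StartTime:     time.Now().Add(-30 * time.Minute),
	AllocatedCPUs: 8,
	AllocatedMem:  32 << 30, // 32 GiB
})

cpu := &models.TimeSeries{Name: "cpu_usage", Unit: "percent", Type: models.MetricTypeCPU}
cpu.Add(models.MetricValue{Timestamp: time.Now(), Value: 55.0, Unit: "percent"})
jobs.UpdateFromPrometheus("12345", map[string]*models.TimeSeries{"cpu_usage": cpu})

if job, ok := jobs.GetJob("12345"); ok {
	fmt.Println(models.FormatJobMetrics(job))
}
fmt.Println(jobs.GetJobsSummary()) // e.g. map[RUNNING:1]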
type MemoryMetrics ¶
type MemoryMetrics struct {
Total uint64 `json:"total"` // Total memory in bytes
Used uint64 `json:"used"` // Used memory in bytes
Available uint64 `json:"available"` // Available memory in bytes
Cache uint64 `json:"cache"` // Cache memory in bytes
Buffer uint64 `json:"buffer"` // Buffer memory in bytes
Usage float64 `json:"usage"` // Usage percentage (0-100)
SwapTotal uint64 `json:"swap_total"` // Total swap in bytes
SwapUsed uint64 `json:"swap_used"` // Used swap in bytes
Limit uint64 `json:"limit"` // Memory limit in bytes (for containers/jobs)
}
MemoryMetrics represents memory-related metrics
type MetricCollection ¶
type MetricCollection struct {
ID string `json:"id"`
Name string `json:"name"`
Type string `json:"type"` // "node", "job", "cluster"
Metrics map[string]TimeSeries `json:"metrics"`
LastUpdate time.Time `json:"last_update"`
}
MetricCollection represents a collection of related metrics
func NewMetricCollection ¶
func NewMetricCollection(id, name, collectionType string) *MetricCollection
NewMetricCollection creates a new metric collection
func (*MetricCollection) AddMetric ¶
func (mc *MetricCollection) AddMetric(name string, ts TimeSeries)
AddMetric adds a time series to the collection
func (*MetricCollection) GetMetric ¶
func (mc *MetricCollection) GetMetric(name string) (*TimeSeries, bool)
GetMetric retrieves a specific metric by name
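A short sketch of building a collection and reading a metric back out:

mc := models.NewMetricCollection("node001", "node001 metrics", "node")

cpu := models.TimeSeries{Name: "cpu_usage", Unit: "percent", Type: models.MetricTypeCPU}
cpu.Add(models.MetricValue{Timestamp: time.Now(), Value: 42.0, Unit: "percent"})
mc.AddMetric("cpu_usage", cpu)

if ts, ok := mc.GetMetric("cpu_usage"); ok {
	fmt.Printf("%s latest: %.1f\n", ts.Name, ts.Latest().Value)
}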
type MetricType ¶
type MetricType string
MetricType represents the type of metric
const (
	MetricTypeCPU         MetricType = "cpu"
	MetricTypeMemory      MetricType = "memory"
	MetricTypeDisk        MetricType = "disk"
	MetricTypeNetwork     MetricType = "network"
	MetricTypeLoad        MetricType = "load"
	MetricTypeTemperature MetricType = "temperature"
	MetricTypeCustom      MetricType = "custom"
)
type MetricValue ¶
type MetricValue struct {
Timestamp time.Time `json:"timestamp"`
Value float64 `json:"value"`
Labels map[string]string `json:"labels"`
Unit string `json:"unit"`
Metadata map[string]interface{} `json:"metadata,omitempty"`
}
MetricValue represents a single metric measurement
type NetworkMetrics ¶
type NetworkMetrics struct {
ReceiveBytesPerSec float64 `json:"receive_bytes_per_sec"` // Bytes/sec
TransmitBytesPerSec float64 `json:"transmit_bytes_per_sec"` // Bytes/sec
ReceivePacketsPerSec float64 `json:"receive_packets_per_sec"` // Packets/sec
TransmitPacketsPerSec float64 `json:"transmit_packets_per_sec"` // Packets/sec
ReceiveErrors uint64 `json:"receive_errors"` // Total errors
TransmitErrors uint64 `json:"transmit_errors"` // Total errors
}
NetworkMetrics represents network I/O metrics
type NodeMetrics ¶
type NodeMetrics struct {
NodeName string `json:"node_name"`
NodeState string `json:"node_state"` // SLURM state
LastUpdate time.Time `json:"last_update"`
Resources ResourceMetrics `json:"resources"`
JobCount int `json:"job_count"`
Labels map[string]string `json:"labels"` // Prometheus labels
CustomMetrics map[string]float64 `json:"custom_metrics"` // Additional metrics
}
NodeMetrics represents metrics for a compute node
func (*NodeMetrics) GetHealthStatus ¶
func (n *NodeMetrics) GetHealthStatus() string
GetHealthStatus returns the health status of a node based on metrics
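For example, health can be reported across the cluster, assuming nodes is a *NodeMetricsCollector (described below); the status strings returned by GetHealthStatus are printed as-is since their exact values are not specified here:

for name, node := range nodes.GetAllNodes() {
	fmt.Printf("%-12s %-10s cpu=%.1f%% mem=%.1f%% jobs=%d\n",
		name, node.GetHealthStatus(),
		node.Resources.CPU.Usage, node.Resources.Memory.Usage, node.JobCount)
}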
type NodeMetricsCollector ¶
type NodeMetricsCollector struct {
// contains filtered or unexported fields
}
NodeMetricsCollector collects and manages node metrics
func NewNodeMetricsCollector ¶
func NewNodeMetricsCollector(nodeLabel string) *NodeMetricsCollector
NewNodeMetricsCollector creates a new node metrics collector
func (*NodeMetricsCollector) GetAggregateMetrics ¶
func (nmc *NodeMetricsCollector) GetAggregateMetrics() *AggregateNodeMetrics
GetAggregateMetrics returns aggregate metrics across all nodes
func (*NodeMetricsCollector) GetAllNodes ¶
func (nmc *NodeMetricsCollector) GetAllNodes() map[string]*NodeMetrics
GetAllNodes returns all node metrics
func (*NodeMetricsCollector) GetNode ¶
func (nmc *NodeMetricsCollector) GetNode(nodeName string) (*NodeMetrics, bool)
GetNode returns metrics for a specific node
func (*NodeMetricsCollector) GetNodesSummary ¶
func (nmc *NodeMetricsCollector) GetNodesSummary() map[string]int
GetNodesSummary returns a summary of all nodes by state
func (*NodeMetricsCollector) MapSLURMToPrometheus ¶
func (nmc *NodeMetricsCollector) MapSLURMToPrometheus(slurmName string) string
MapSLURMToPrometheus maps SLURM node names to Prometheus labels
func (*NodeMetricsCollector) UpdateFromPrometheus ¶
func (nmc *NodeMetricsCollector) UpdateFromPrometheus(nodeName string, metrics map[string]*TimeSeries)
UpdateFromPrometheus updates node metrics from Prometheus data
func (*NodeMetricsCollector) UpdateNodeState ¶
func (nmc *NodeMetricsCollector) UpdateNodeState(nodeName, state string, jobCount int)
UpdateNodeState updates the SLURM state for a node
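A sketch of the node-side update flow, mirroring the job collector above; the node name, SLURM state, and "load_1m" metric key are placeholders:

nodes := models.NewNodeMetricsCollector("instance") // label used to match Prometheus series to nodes
nodes.UpdateNodeState("node001", "allocated", 3)

load := &models.TimeSeries{Name: "load_1m", Unit: "load", Type: models.MetricTypeLoad}
load.Add(models.MetricValue{Timestamp: time.Now(), Value: 4.2})
nodes.UpdateFromPrometheus("node001", map[string]*models.TimeSeries{"load_1m": load})

fmt.Println(nodes.GetNodesSummary()) // e.g. map[allocated:1]
fmt.Printf("average CPU: %.1f%%\n", nodes.GetAggregateMetrics().AverageCPUUsage)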
type ResourceMetrics ¶
type ResourceMetrics struct {
CPU CPUMetrics `json:"cpu"`
Memory MemoryMetrics `json:"memory"`
Disk DiskMetrics `json:"disk"`
Network NetworkMetrics `json:"network"`
Timestamp time.Time `json:"timestamp"`
}
ResourceMetrics represents aggregated resource metrics
type TimeSeries ¶
type TimeSeries struct {
Name string `json:"name"`
Labels map[string]string `json:"labels"`
Values []MetricValue `json:"values"`
Unit string `json:"unit"`
Type MetricType `json:"type"`
}
TimeSeries represents a time series of metric values
func (*TimeSeries) Add ¶
func (ts *TimeSeries) Add(value MetricValue)
Add adds a metric value to the time series
func (*TimeSeries) Average ¶
func (ts *TimeSeries) Average() float64
Average calculates the average value over the time series
func (*TimeSeries) Latest ¶
func (ts *TimeSeries) Latest() *MetricValue
Latest returns the most recent value
func (*TimeSeries) Max ¶
func (ts *TimeSeries) Max() float64
Max returns the maximum value in the time series
func (*TimeSeries) Min ¶
func (ts *TimeSeries) Min() float64
Min returns the minimum value in the time series
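Finally, a sketch exercising the TimeSeries helpers directly (fmt and time imports assumed):

ts := models.TimeSeries{Name: "cpu_usage", Unit: "percent", Type: models.MetricTypeCPU}
for i, v := range []float64{35.0, 72.5, 51.0} {
	ts.Add(models.MetricValue{
		Timestamp: time.Now().Add(time.Duration(i) * time.Minute),
		Value:     v,
		Unit:      "percent",
	})
}
fmt.Printf("latest=%.1f avg=%.1f min=%.1f max=%.1f\n",
	ts.Latest().Value, ts.Average(), ts.Min(), ts.Max())
// latest=51.0 avg=52.8 min=35.0 max=72.5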