Documentation
¶
Overview ¶
Package monitoring provides cluster monitoring and alert management.
Index ¶
- func GetSeverityColor(severity AlertSeverity) string
- func GetSeverityIcon(severity AlertSeverity) string
- type Alert
- type AlertFilter
- type AlertListener
- type AlertManager
- func NewAlertManager() *AlertManager
- func (am *AlertManager) AcknowledgeAlert(id, ackedBy string) error
- func (am *AlertManager) AddAlert(alert *Alert)
- func (am *AlertManager) AddListener(listener AlertListener)
- func (am *AlertManager) ClearResolvedAlerts(olderThan time.Duration) int
- func (am *AlertManager) GetActiveAlerts() []*Alert
- func (am *AlertManager) GetAlert(id string) *Alert
- func (am *AlertManager) GetAlerts(filter *AlertFilter) []*Alert
- func (am *AlertManager) GetCriticalAlerts() []*Alert
- func (am *AlertManager) GetStats() AlertStats
- func (am *AlertManager) RemoveListener(listener AlertListener)
- func (am *AlertManager) ResolveAlert(id string) error
- type AlertSeverity
- type AlertStats
- type AlertType
- type ClusterHealth
- type HealthCheck
- type HealthCheckFunc
- type HealthIssue
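- type HealthMonitor
- func NewHealthMonitor(client dao.SlurmClient, interval time.Duration) *HealthMonitor
- func (hm *HealthMonitor) GetAlertManager() *AlertManager
- func (hm *HealthMonitor) GetHealth() *ClusterHealth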
- type HealthStatus
- type HealthThreshold
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func GetSeverityColor ¶
func GetSeverityColor(severity AlertSeverity) string
GetSeverityColor returns the color for alert severity display
func GetSeverityIcon ¶
func GetSeverityIcon(severity AlertSeverity) string
GetSeverityIcon returns an icon for alert severity
Types ¶
type Alert ¶
type Alert struct {
ID string
Type AlertType
Severity AlertSeverity
Title string
Message string
Component string
Timestamp time.Time
Acknowledged bool
AckedBy string
AckedAt *time.Time
Resolved bool
ResolvedAt *time.Time
Count int
LastSeen time.Time
Metadata map[string]interface{}
}
Alert represents a cluster alert
type AlertFilter ¶
type AlertFilter struct {
Types []AlertType
Severities []AlertSeverity
Components []string
Acknowledged *bool
Resolved *bool
SinceTime *time.Time
UntilTime *time.Time
}
AlertFilter defines criteria for filtering alerts
type AlertListener ¶
AlertListener defines the interface for alert listeners
type AlertManager ¶
type AlertManager struct {
// contains filtered or unexported fields
}
AlertManager manages cluster alerts
func NewAlertManager ¶
func NewAlertManager() *AlertManager
NewAlertManager creates a new alert manager
func (*AlertManager) AcknowledgeAlert ¶
func (am *AlertManager) AcknowledgeAlert(id, ackedBy string) error
AcknowledgeAlert marks an alert as acknowledged
func (*AlertManager) AddAlert ¶
func (am *AlertManager) AddAlert(alert *Alert)
AddAlert adds a new alert or updates an existing one
func (*AlertManager) AddListener ¶
func (am *AlertManager) AddListener(listener AlertListener)
AddListener adds an alert listener
func (*AlertManager) ClearResolvedAlerts ¶
func (am *AlertManager) ClearResolvedAlerts(olderThan time.Duration) int
ClearResolvedAlerts removes all resolved alerts older than the specified duration
func (*AlertManager) GetActiveAlerts ¶
func (am *AlertManager) GetActiveAlerts() []*Alert
GetActiveAlerts returns all unresolved alerts
func (*AlertManager) GetAlert ¶
func (am *AlertManager) GetAlert(id string) *Alert
GetAlert retrieves an alert by ID
func (*AlertManager) GetAlerts ¶
func (am *AlertManager) GetAlerts(filter *AlertFilter) []*Alert
GetAlerts returns all alerts, optionally filtered by parameters
func (*AlertManager) GetCriticalAlerts ¶
func (am *AlertManager) GetCriticalAlerts() []*Alert
GetCriticalAlerts returns all critical alerts
func (*AlertManager) GetStats ¶
func (am *AlertManager) GetStats() AlertStats
GetStats returns statistics about alerts
func (*AlertManager) RemoveListener ¶
func (am *AlertManager) RemoveListener(listener AlertListener)
RemoveListener removes an alert listener
func (*AlertManager) ResolveAlert ¶
func (am *AlertManager) ResolveAlert(id string) error
ResolveAlert marks an alert as resolved
type AlertSeverity ¶
type AlertSeverity string
AlertSeverity represents the severity level of an alert
const (
// AlertSeverityInfo is the info severity level for alerts.
AlertSeverityInfo AlertSeverity = "info"
// AlertSeverityWarning is the warning severity level for alerts.
AlertSeverityWarning AlertSeverity = "warning"
// AlertSeverityCritical is the critical severity level for alerts.
AlertSeverityCritical AlertSeverity = "critical"
)
type AlertStats ¶
type AlertStats struct {
Total int
Critical int
Warning int
Info int
Acknowledged int
Unacknowledged int
Active int
Resolved int
}
AlertStats provides statistics about alerts
type AlertType ¶
type AlertType string
AlertType represents the type of alert
const (
// AlertTypeHealth is the alert type for health-related alerts.
AlertTypeHealth AlertType = "health"
// AlertTypePerformance is the alert type for performance-related alerts.
AlertTypePerformance AlertType = "performance"
// AlertTypeResource is the alert type for resource-related alerts.
AlertTypeResource AlertType = "resource"
// AlertTypeJob is the alert type for job-related alerts.
AlertTypeJob AlertType = "job"
// AlertTypeNode is the alert type for node-related alerts.
AlertTypeNode AlertType = "node"
// AlertTypeSystem is the alert type for system-related alerts.
AlertTypeSystem AlertType = "system"
)
type ClusterHealth ¶
type ClusterHealth struct {
OverallStatus HealthStatus
Checks map[string]*HealthCheck
Issues []HealthIssue
LastUpdated time.Time
// contains filtered or unexported fields
}
ClusterHealth represents the overall cluster health
type HealthCheck ¶
type HealthCheck struct {
Name string
Description string
Status HealthStatus
Message string
LastCheck time.Time
CheckCount int
Threshold HealthThreshold
}
HealthCheck represents a single health check
type HealthCheckFunc ¶
type HealthCheckFunc func(client dao.SlurmClient) *HealthCheck
HealthCheckFunc defines a function that performs a health check
type HealthIssue ¶
type HealthIssue struct {
ID string
Component string
Severity HealthStatus
Title string
Description string
FirstSeen time.Time
LastSeen time.Time
Count int
Resolved bool
}
HealthIssue represents a specific health issue
type HealthMonitor ¶
type HealthMonitor struct {
// contains filtered or unexported fields
}
HealthMonitor monitors cluster health and generates alerts
func NewHealthMonitor ¶
func NewHealthMonitor(client dao.SlurmClient, interval time.Duration) *HealthMonitor
NewHealthMonitor creates a new health monitor
func (*HealthMonitor) GetAlertManager ¶
func (hm *HealthMonitor) GetAlertManager() *AlertManager
GetAlertManager returns the alert manager
func (*HealthMonitor) GetHealth ¶
func (hm *HealthMonitor) GetHealth() *ClusterHealth
GetHealth returns the current cluster health
type HealthStatus ¶
type HealthStatus string
HealthStatus represents the health status of a cluster component
const (
// HealthStatusHealthy indicates the component is healthy.
HealthStatusHealthy HealthStatus = "healthy"
// HealthStatusWarning indicates the component has warning conditions.
HealthStatusWarning HealthStatus = "warning"
// HealthStatusCritical indicates the component is in critical condition.
HealthStatusCritical HealthStatus = "critical"
// HealthStatusUnknown indicates the health status is unknown.
HealthStatusUnknown HealthStatus = "unknown"
)