Documentation
¶
Overview ¶
Package alerts provides real-time alerting and notification capabilities for monitoring system metrics and resource utilization. It supports configurable alert rules, multiple severity levels, notification channels, and alert history tracking with automatic cleanup and retention management.
Index ¶
- Variables
- func ValidateRule(rule *Rule) error
- type Alert
- type AlertFilter
- type AlertState
- type AlertSubscription
- type Engine
- func (e *Engine) AddRule(rule *Rule)
- func (e *Engine) GetActiveAlerts() []Alert
- func (e *Engine) GetAlertHistory(limit int) []Alert
- func (e *Engine) GetRules() []Rule
- func (e *Engine) RemoveRule(name string)
- func (e *Engine) SetAlertCallback(fn func(*Alert))
- func (e *Engine) SetJobCollector(collector *models.JobMetricsCollector)
- func (e *Engine) SetNodeCollector(collector *models.NodeMetricsCollector)
- func (e *Engine) SetResolvedCallback(fn func(*Alert))
- func (e *Engine) Start(ctx context.Context) error
- func (e *Engine) Stop() error
- type InAppHandler
- func (h *InAppHandler) ClearNotifications()
- func (h *InAppHandler) GetID() string
- func (h *InAppHandler) GetName() string
- func (h *InAppHandler) GetNotifications(limit int) []*Notification
- func (h *InAppHandler) IsEnabled() bool
- func (h *InAppHandler) Send(_ context.Context, notification *Notification) error
- type LogHandler
- type Notification
- type NotificationEvent
- type NotificationHandler
- type NotificationManager
- func (nm *NotificationManager) GetHandlers() map[string]NotificationHandler
- func (nm *NotificationManager) GetHistory(limit int) []*NotificationEvent
- func (nm *NotificationManager) NotifyAlert(ctx context.Context, alert *Alert) error
- func (nm *NotificationManager) NotifyResolved(ctx context.Context, alert *Alert) error
- func (nm *NotificationManager) RegisterHandler(handler NotificationHandler) error
- func (nm *NotificationManager) Subscribe(subscription *AlertSubscription) error
- func (nm *NotificationManager) TestNotification(ctx context.Context, handlerID string) error
- func (nm *NotificationManager) UnregisterHandler(handlerID string)
- func (nm *NotificationManager) Unsubscribe(subscriptionID string)
- type NotificationType
- type RateLimiter
- type Rule
- type RuleBuilder
- func (rb *RuleBuilder) Build() (Rule, error)
- func (rb *RuleBuilder) WithAnnotation(key, value string) *RuleBuilder
- func (rb *RuleBuilder) WithDescription(description string) *RuleBuilder
- func (rb *RuleBuilder) WithDuration(duration time.Duration) *RuleBuilder
- func (rb *RuleBuilder) WithLabel(key, value string) *RuleBuilder
- func (rb *RuleBuilder) WithMessageTemplate(template string) *RuleBuilder
- func (rb *RuleBuilder) WithMetric(metric string) *RuleBuilder
- func (rb *RuleBuilder) WithQuery(query string) *RuleBuilder
- func (rb *RuleBuilder) WithSeverity(severity string) *RuleBuilder
- func (rb *RuleBuilder) WithTarget(target string) *RuleBuilder
- func (rb *RuleBuilder) WithThreshold(operator string, threshold float64) *RuleBuilder
- func (rb *RuleBuilder) WithType(ruleType RuleType) *RuleBuilder
- type RuleCondition
- type RuleType
Constants ¶
This section is empty.
Variables ¶
var PredefinedRules = []Rule{
	{
		Name:        "high_node_cpu_usage",
		Type:        RuleTypeThreshold,
		Target:      "node",
		Metric:      "cpu_usage",
		Operator:    ">",
		Threshold:   90.0,
		Duration:    5 * time.Minute,
		Severity:    "warning",
		Enabled:     true,
		Description: "Node CPU usage is above 90%",
	},
	{
		Name:        "critical_node_cpu_usage",
		Type:        RuleTypeThreshold,
		Target:      "node",
		Metric:      "cpu_usage",
		Operator:    ">",
		Threshold:   95.0,
		Duration:    5 * time.Minute,
		Severity:    "critical",
		Enabled:     true,
		Description: "Node CPU usage is critically high",
	},
	{
		Name:        "high_node_memory_usage",
		Type:        RuleTypeThreshold,
		Target:      "node",
		Metric:      "memory_usage",
		Operator:    ">",
		Threshold:   90.0,
		Duration:    5 * time.Minute,
		Severity:    "warning",
		Enabled:     true,
		Description: "Node memory usage is above 90%",
	},
	{
		Name:        "critical_node_memory_usage",
		Type:        RuleTypeThreshold,
		Target:      "node",
		Metric:      "memory_usage",
		Operator:    ">",
		Threshold:   95.0,
		Duration:    5 * time.Minute,
		Severity:    "critical",
		Enabled:     true,
		Description: "Node memory usage is critically high",
	},
	{
		Name:        "high_load_average",
		Type:        RuleTypeThreshold,
		Target:      "node",
		Metric:      "load_per_core",
		Operator:    ">",
		Threshold:   2.0,
		Duration:    10 * time.Minute,
		Severity:    "warning",
		Enabled:     true,
		Description: "Load average per core is high",
	},
	{
		Name:        "disk_io_saturation",
		Type:        RuleTypeThreshold,
		Target:      "node",
		Metric:      "disk_io_util",
		Operator:    ">",
		Threshold:   90.0,
		Duration:    5 * time.Minute,
		Severity:    "warning",
		Enabled:     true,
		Description: "Disk I/O utilization is saturated",
	},
	{
		Name:        "job_memory_limit",
		Type:        RuleTypeThreshold,
		Target:      "job",
		Metric:      "memory_usage_percent",
		Operator:    ">",
		Threshold:   95.0,
		Duration:    5 * time.Minute,
		Severity:    "critical",
		Enabled:     true,
		Description: "Job is approaching memory limit",
	},
	{
		Name:        "job_inefficient",
		Type:        RuleTypeThreshold,
		Target:      "job",
		Metric:      "overall_efficiency",
		Operator:    "<",
		Threshold:   20.0,
		Duration:    30 * time.Minute,
		Severity:    "warning",
		Enabled:     true,
		Description: "Job is using less than 20% of allocated resources",
	},
	{
		Name:        "job_cpu_throttled",
		Type:        RuleTypeQuery,
		Target:      "job",
		Query:       `rate(container_cpu_throttled_seconds_total[5m]) > 0.1`,
		Operator:    ">",
		Threshold:   0.1,
		Duration:    5 * time.Minute,
		Severity:    "warning",
		Enabled:     false,
		Description: "Job CPU is being throttled",
	},
	{
		Name:        "cluster_cpu_high",
		Type:        RuleTypeThreshold,
		Target:      "cluster",
		Metric:      "cpu_usage",
		Operator:    ">",
		Threshold:   80.0,
		Duration:    15 * time.Minute,
		Severity:    "warning",
		Enabled:     true,
		Description: "Cluster-wide CPU usage is high",
	},
	{
		Name:        "cluster_memory_high",
		Type:        RuleTypeThreshold,
		Target:      "cluster",
		Metric:      "memory_usage",
		Operator:    ">",
		Threshold:   85.0,
		Duration:    15 * time.Minute,
		Severity:    "warning",
		Enabled:     true,
		Description: "Cluster-wide memory usage is high",
	},
	{
		Name:        "nodes_down",
		Type:        RuleTypeQuery,
		Target:      "cluster",
		Query:       `count(up{job="node-exporter"} == 0) > 0`,
		Operator:    ">",
		Threshold:   0,
		Duration:    5 * time.Minute,
		Severity:    "critical",
		Enabled:     false,
		Description: "One or more nodes are down",
	},
}
PredefinedRules contains commonly used alert rules
var RuleTemplates = map[string]Rule{
	"node_metric_threshold": {
		Name:     "node_${metric}_threshold",
		Type:     RuleTypeThreshold,
		Target:   "node",
		Metric:   "${metric}",
		Operator: ">",
		Severity: "warning",
		Enabled:  true,
	},
	"job_metric_threshold": {
		Name:     "job_${metric}_threshold",
		Type:     RuleTypeThreshold,
		Target:   "job",
		Metric:   "${metric}",
		Operator: ">",
		Severity: "warning",
		Enabled:  true,
	},
	"custom_query": {
		Name:     "custom_query_alert",
		Type:     RuleTypeQuery,
		Target:   "cluster",
		Query:    "${query}",
		Operator: ">",
		Severity: "warning",
		Enabled:  true,
	},
}
RuleTemplates provides templates for creating custom rules
Functions ¶
Types ¶
type Alert ¶
type Alert struct {
ID string `json:"id"`
RuleName string `json:"rule_name"`
Severity string `json:"severity"`
State AlertState `json:"state"`
Message string `json:"message"`
Description string `json:"description"`
Labels map[string]string `json:"labels"`
Annotations map[string]string `json:"annotations"`
Value float64 `json:"value"`
Threshold float64 `json:"threshold"`
FirstSeen time.Time `json:"first_seen"`
LastSeen time.Time `json:"last_seen"`
ResolvedAt time.Time `json:"resolved_at,omitempty"`
Duration time.Duration `json:"duration"`
Source string `json:"source"`
}
Alert represents an active or historical alert
type AlertFilter ¶
type AlertFilter struct {
Severities []string
Sources []string
Rules []string
Labels map[string]string
}
AlertFilter defines criteria for filtering alerts
type AlertState ¶
type AlertState string
AlertState represents the state of an alert
const (
	// AlertStatePending is the pending state for alerts.
	AlertStatePending AlertState = "pending"
	// AlertStateFiring is the firing state for alerts.
	AlertStateFiring AlertState = "firing"
	// AlertStateResolved is the resolved state for alerts.
	AlertStateResolved AlertState = "resolved"
)
type AlertSubscription ¶
type AlertSubscription struct {
ID string
Filter AlertFilter
HandlerIDs []string
CreatedAt time.Time
LastNotified time.Time
}
AlertSubscription defines a subscription to alert notifications
type Engine ¶
type Engine struct {
// contains filtered or unexported fields
}
Engine manages alert evaluation and state
func NewEngine ¶
func NewEngine(config *config.AlertConfig, client *prometheus.CachedClient) *Engine
NewEngine creates a new alert engine
func (*Engine) GetActiveAlerts ¶
func (e *Engine) GetActiveAlerts() []Alert
GetActiveAlerts returns all active alerts.
func (*Engine) GetAlertHistory ¶
func (e *Engine) GetAlertHistory(limit int) []Alert
GetAlertHistory returns alert history.
func (*Engine) RemoveRule ¶
func (e *Engine) RemoveRule(name string)
RemoveRule removes an alert rule by name.
func (*Engine) SetAlertCallback ¶
func (e *Engine) SetAlertCallback(fn func(*Alert))
SetAlertCallback sets the callback for new alerts.
func (*Engine) SetJobCollector ¶
func (e *Engine) SetJobCollector(collector *models.JobMetricsCollector)
SetJobCollector updates the job metrics collector
func (*Engine) SetNodeCollector ¶
func (e *Engine) SetNodeCollector(collector *models.NodeMetricsCollector)
SetNodeCollector updates the node metrics collector
func (*Engine) SetResolvedCallback ¶
func (e *Engine) SetResolvedCallback(fn func(*Alert))
SetResolvedCallback sets the callback for resolved alerts.
type InAppHandler ¶
type InAppHandler struct {
// contains filtered or unexported fields
}
InAppHandler handles in-app notifications
func NewInAppHandler ¶
func NewInAppHandler() *InAppHandler
NewInAppHandler creates a new in-app notification handler
func (*InAppHandler) ClearNotifications ¶
func (h *InAppHandler) ClearNotifications()
ClearNotifications clears all in-app notifications
func (*InAppHandler) GetID ¶
func (h *InAppHandler) GetID() string
GetID returns the unique identifier of the in-app handler.
func (*InAppHandler) GetName ¶
func (h *InAppHandler) GetName() string
GetName returns the name of the in-app handler.
func (*InAppHandler) GetNotifications ¶
func (h *InAppHandler) GetNotifications(limit int) []*Notification
GetNotifications returns in-app notifications
func (*InAppHandler) IsEnabled ¶
func (h *InAppHandler) IsEnabled() bool
IsEnabled returns whether the in-app handler is enabled.
func (*InAppHandler) Send ¶
func (h *InAppHandler) Send(_ context.Context, notification *Notification) error
Send sends a notification to the in-app notification queue.
type LogHandler ¶
type LogHandler struct {
// contains filtered or unexported fields
}
LogHandler logs notifications
func NewLogHandler ¶
func NewLogHandler(logger func(string)) *LogHandler
NewLogHandler creates a new log handler
func (*LogHandler) GetID ¶
func (h *LogHandler) GetID() string
GetID returns the unique identifier of the log handler.
func (*LogHandler) GetName ¶
func (h *LogHandler) GetName() string
GetName returns the name of the log handler.
func (*LogHandler) IsEnabled ¶
func (h *LogHandler) IsEnabled() bool
IsEnabled returns whether the log handler is enabled.
func (*LogHandler) Send ¶
func (h *LogHandler) Send(_ context.Context, notification *Notification) error
Send sends a notification to the configured logger.
type Notification ¶
type Notification struct {
Alert *Alert `json:"alert"`
Type NotificationType `json:"type"`
Timestamp time.Time `json:"timestamp"`
Recipients []string `json:"recipients,omitempty"`
Metadata map[string]string `json:"metadata,omitempty"`
}
Notification represents an alert notification
type NotificationEvent ¶
type NotificationEvent struct {
ID string
Notification *Notification
HandlerID string
Status string
Error string
SentAt time.Time
}
NotificationEvent represents a notification event in history
type NotificationHandler ¶
type NotificationHandler interface {
GetID() string
GetName() string
Send(ctx context.Context, notification *Notification) error
IsEnabled() bool
}
NotificationHandler defines the interface for notification handlers
type NotificationManager ¶
type NotificationManager struct {
// contains filtered or unexported fields
}
NotificationManager handles alert notifications
func NewNotificationManager ¶
func NewNotificationManager() *NotificationManager
NewNotificationManager creates a new notification manager
func (*NotificationManager) GetHandlers ¶
func (nm *NotificationManager) GetHandlers() map[string]NotificationHandler
GetHandlers returns all registered handlers
func (*NotificationManager) GetHistory ¶
func (nm *NotificationManager) GetHistory(limit int) []*NotificationEvent
GetHistory returns notification history
func (*NotificationManager) NotifyAlert ¶
func (nm *NotificationManager) NotifyAlert(ctx context.Context, alert *Alert) error
NotifyAlert sends notifications for a new alert
func (*NotificationManager) NotifyResolved ¶
func (nm *NotificationManager) NotifyResolved(ctx context.Context, alert *Alert) error
NotifyResolved sends notifications for a resolved alert
func (*NotificationManager) RegisterHandler ¶
func (nm *NotificationManager) RegisterHandler(handler NotificationHandler) error
RegisterHandler registers a notification handler
func (*NotificationManager) Subscribe ¶
func (nm *NotificationManager) Subscribe(subscription *AlertSubscription) error
Subscribe creates a subscription for alert notifications
func (*NotificationManager) TestNotification ¶
func (nm *NotificationManager) TestNotification(ctx context.Context, handlerID string) error
TestNotification sends a test notification to a specific handler
func (*NotificationManager) UnregisterHandler ¶
func (nm *NotificationManager) UnregisterHandler(handlerID string)
UnregisterHandler unregisters a notification handler
func (*NotificationManager) Unsubscribe ¶
func (nm *NotificationManager) Unsubscribe(subscriptionID string)
Unsubscribe removes a subscription
type NotificationType ¶
type NotificationType string
NotificationType defines the type of notification
const (
	// NotificationTypeAlert is the notification type for new alerts.
	NotificationTypeAlert NotificationType = "alert"
	// NotificationTypeResolved is the notification type for resolved alerts.
	NotificationTypeResolved NotificationType = "resolved"
	// NotificationTypeTest is the notification type for test notifications.
	NotificationTypeTest NotificationType = "test"
)
type RateLimiter ¶
type RateLimiter struct {
// contains filtered or unexported fields
}
RateLimiter provides simple rate limiting
func NewRateLimiter ¶
func NewRateLimiter(rate int, period time.Duration) *RateLimiter
NewRateLimiter creates a new rate limiter
func (*RateLimiter) Allow ¶
func (rl *RateLimiter) Allow(key string) bool
Allow checks if a request is allowed
type Rule ¶
type Rule struct {
Name string `json:"name"`
Type RuleType `json:"type"`
Target string `json:"target"` // "node", "job", "cluster"
Metric string `json:"metric"`
Operator string `json:"operator"`
Threshold float64 `json:"threshold"`
Duration time.Duration `json:"duration"`
Query string `json:"query,omitempty"` // For query type rules
Conditions []RuleCondition `json:"conditions,omitempty"` // For composite rules
Severity string `json:"severity"`
Enabled bool `json:"enabled"`
Description string `json:"description"`
MessageTemplate string `json:"message_template,omitempty"`
Labels map[string]string `json:"labels,omitempty"`
Annotations map[string]string `json:"annotations,omitempty"`
}
Rule defines an alert rule
type RuleBuilder ¶
type RuleBuilder struct {
// contains filtered or unexported fields
}
RuleBuilder helps build alert rules programmatically
func NewRuleBuilder ¶
func NewRuleBuilder(name string) *RuleBuilder
NewRuleBuilder creates a new rule builder
func (*RuleBuilder) Build ¶
func (rb *RuleBuilder) Build() (Rule, error)
Build validates and returns the rule
func (*RuleBuilder) WithAnnotation ¶
func (rb *RuleBuilder) WithAnnotation(key, value string) *RuleBuilder
WithAnnotation adds an annotation
func (*RuleBuilder) WithDescription ¶
func (rb *RuleBuilder) WithDescription(description string) *RuleBuilder
WithDescription sets the description
func (*RuleBuilder) WithDuration ¶
func (rb *RuleBuilder) WithDuration(duration time.Duration) *RuleBuilder
WithDuration sets the duration
func (*RuleBuilder) WithLabel ¶
func (rb *RuleBuilder) WithLabel(key, value string) *RuleBuilder
WithLabel adds a label
func (*RuleBuilder) WithMessageTemplate ¶
func (rb *RuleBuilder) WithMessageTemplate(template string) *RuleBuilder
WithMessageTemplate sets the message template
func (*RuleBuilder) WithMetric ¶
func (rb *RuleBuilder) WithMetric(metric string) *RuleBuilder
WithMetric sets the metric
func (*RuleBuilder) WithQuery ¶
func (rb *RuleBuilder) WithQuery(query string) *RuleBuilder
WithQuery sets the PromQL query
func (*RuleBuilder) WithSeverity ¶
func (rb *RuleBuilder) WithSeverity(severity string) *RuleBuilder
WithSeverity sets the severity
func (*RuleBuilder) WithTarget ¶
func (rb *RuleBuilder) WithTarget(target string) *RuleBuilder
WithTarget sets the target
func (*RuleBuilder) WithThreshold ¶
func (rb *RuleBuilder) WithThreshold(operator string, threshold float64) *RuleBuilder
WithThreshold sets the threshold and operator
func (*RuleBuilder) WithType ¶
func (rb *RuleBuilder) WithType(ruleType RuleType) *RuleBuilder
WithType sets the rule type
type RuleCondition ¶
type RuleCondition struct {
Metric string `json:"metric"`
Operator string `json:"operator"`
Threshold float64 `json:"threshold"`
Weight float64 `json:"weight"` // For weighted conditions
}
RuleCondition defines a condition for composite rules