alerts

package
v0.4.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 8, 2026 License: MIT Imports: 8 Imported by: 0

Documentation

Overview

Package alerts provides real-time alerting and notification capabilities for monitoring system metrics and resource utilization. It supports configurable alert rules, multiple severity levels, notification channels, and alert history tracking with automatic cleanup and retention management.

Index

Constants

This section is empty.

Variables

View Source
// PredefinedRules contains commonly used alert rules, listed by Target:
// node rules first, then job rules, then cluster rules.
//
// Every RuleTypeThreshold rule below is defined with Enabled: true; the two
// RuleTypeQuery rules (job_cpu_throttled and nodes_down) are defined with
// Enabled: false.
//
// NOTE(review): Duration is presumably how long the condition must hold
// before the alert fires — confirm against the Engine implementation.
var PredefinedRules = []Rule{
	// Node rules (Target: "node").
	{
		Name:        "high_node_cpu_usage",
		Type:        RuleTypeThreshold,
		Target:      "node",
		Metric:      "cpu_usage",
		Operator:    ">",
		Threshold:   90.0,
		Duration:    5 * time.Minute,
		Severity:    "warning",
		Enabled:     true,
		Description: "Node CPU usage is above 90%",
	},
	// A value above 95 also satisfies high_node_cpu_usage (> 90).
	{
		Name:        "critical_node_cpu_usage",
		Type:        RuleTypeThreshold,
		Target:      "node",
		Metric:      "cpu_usage",
		Operator:    ">",
		Threshold:   95.0,
		Duration:    5 * time.Minute,
		Severity:    "critical",
		Enabled:     true,
		Description: "Node CPU usage is critically high",
	},
	{
		Name:        "high_node_memory_usage",
		Type:        RuleTypeThreshold,
		Target:      "node",
		Metric:      "memory_usage",
		Operator:    ">",
		Threshold:   90.0,
		Duration:    5 * time.Minute,
		Severity:    "warning",
		Enabled:     true,
		Description: "Node memory usage is above 90%",
	},
	// A value above 95 also satisfies high_node_memory_usage (> 90).
	{
		Name:        "critical_node_memory_usage",
		Type:        RuleTypeThreshold,
		Target:      "node",
		Metric:      "memory_usage",
		Operator:    ">",
		Threshold:   95.0,
		Duration:    5 * time.Minute,
		Severity:    "critical",
		Enabled:     true,
		Description: "Node memory usage is critically high",
	},
	{
		Name:        "high_load_average",
		Type:        RuleTypeThreshold,
		Target:      "node",
		Metric:      "load_per_core",
		Operator:    ">",
		Threshold:   2.0,
		Duration:    10 * time.Minute,
		Severity:    "warning",
		Enabled:     true,
		Description: "Load average per core is high",
	},
	{
		Name:        "disk_io_saturation",
		Type:        RuleTypeThreshold,
		Target:      "node",
		Metric:      "disk_io_util",
		Operator:    ">",
		Threshold:   90.0,
		Duration:    5 * time.Minute,
		Severity:    "warning",
		Enabled:     true,
		Description: "Disk I/O utilization is saturated",
	},
	// Job rules (Target: "job").
	{
		Name:        "job_memory_limit",
		Type:        RuleTypeThreshold,
		Target:      "job",
		Metric:      "memory_usage_percent",
		Operator:    ">",
		Threshold:   95.0,
		Duration:    5 * time.Minute,
		Severity:    "critical",
		Enabled:     true,
		Description: "Job is approaching memory limit",
	},
	// The only predefined rule that uses the "<" operator.
	{
		Name:        "job_inefficient",
		Type:        RuleTypeThreshold,
		Target:      "job",
		Metric:      "overall_efficiency",
		Operator:    "<",
		Threshold:   20.0,
		Duration:    30 * time.Minute,
		Severity:    "warning",
		Enabled:     true,
		Description: "Job is using less than 20% of allocated resources",
	},
	// Query rule, defined disabled.
	// NOTE(review): the query text already contains the "> 0.1" comparison that
	// Operator/Threshold repeat; confirm how the engine combines the two.
	{
		Name:        "job_cpu_throttled",
		Type:        RuleTypeQuery,
		Target:      "job",
		Query:       `rate(container_cpu_throttled_seconds_total[5m]) > 0.1`,
		Operator:    ">",
		Threshold:   0.1,
		Duration:    5 * time.Minute,
		Severity:    "warning",
		Enabled:     false,
		Description: "Job CPU is being throttled",
	},
	// Cluster rules (Target: "cluster").
	{
		Name:        "cluster_cpu_high",
		Type:        RuleTypeThreshold,
		Target:      "cluster",
		Metric:      "cpu_usage",
		Operator:    ">",
		Threshold:   80.0,
		Duration:    15 * time.Minute,
		Severity:    "warning",
		Enabled:     true,
		Description: "Cluster-wide CPU usage is high",
	},
	{
		Name:        "cluster_memory_high",
		Type:        RuleTypeThreshold,
		Target:      "cluster",
		Metric:      "memory_usage",
		Operator:    ">",
		Threshold:   85.0,
		Duration:    15 * time.Minute,
		Severity:    "warning",
		Enabled:     true,
		Description: "Cluster-wide memory usage is high",
	},
	// Query rule, defined disabled; the query selects series labelled
	// job="node-exporter".
	// NOTE(review): the query text already contains the "> 0" comparison that
	// Operator/Threshold repeat; confirm how the engine combines the two.
	{
		Name:        "nodes_down",
		Type:        RuleTypeQuery,
		Target:      "cluster",
		Query:       `count(up{job="node-exporter"} == 0) > 0`,
		Operator:    ">",
		Threshold:   0,
		Duration:    5 * time.Minute,
		Severity:    "critical",
		Enabled:     false,
		Description: "One or more nodes are down",
	},
}

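PredefinedRules contains commonly used alert rules for node, job, and cluster targets. All threshold rules are enabled; the two query-based rules (job_cpu_throttled and nodes_down) are defined but disabled.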

View Source
// RuleTemplates provides templates for creating custom rules, keyed by
// template name.
//
// The strings "${metric}" and "${query}" are stored literally in the Name,
// Metric and Query fields.
// NOTE(review): presumably the caller substitutes them before using the
// rule — confirm where (or whether) substitution happens.
//
// Each template sets Operator ">", Severity "warning" and Enabled true, and
// leaves Threshold, Duration and Description at their zero values.
var RuleTemplates = map[string]Rule{
	// Threshold rule on a node metric.
	"node_metric_threshold": {
		Name:     "node_${metric}_threshold",
		Type:     RuleTypeThreshold,
		Target:   "node",
		Metric:   "${metric}",
		Operator: ">",
		Severity: "warning",
		Enabled:  true,
	},
	// Threshold rule on a job metric.
	"job_metric_threshold": {
		Name:     "job_${metric}_threshold",
		Type:     RuleTypeThreshold,
		Target:   "job",
		Metric:   "${metric}",
		Operator: ">",
		Severity: "warning",
		Enabled:  true,
	},
	// Query rule with Target "cluster"; Metric is left empty.
	"custom_query": {
		Name:     "custom_query_alert",
		Type:     RuleTypeQuery,
		Target:   "cluster",
		Query:    "${query}",
		Operator: ">",
		Severity: "warning",
		Enabled:  true,
	},
}

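RuleTemplates provides templates for creating custom rules, keyed by template name. The templates contain the literal placeholders `${metric}` and `${query}` and leave Threshold, Duration, and Description unset.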

Functions

func ValidateRule

func ValidateRule(rule *Rule) error

ValidateRule validates an alert rule

Types

type Alert

// Alert represents an active or historical alert.
//
// State holds one of the AlertState values. FirstSeen, LastSeen and
// ResolvedAt are timestamps; Value and Threshold are the numeric pair the
// alert reports.
// NOTE(review): ResolvedAt is presumably left as the zero time until the
// alert is resolved — confirm against the Engine implementation.
//
// JSON: Duration is a time.Duration, which encoding/json writes as an integer
// number of nanoseconds unless a custom marshaler is defined elsewhere.
type Alert struct {
	ID          string            `json:"id"`
	RuleName    string            `json:"rule_name"`
	Severity    string            `json:"severity"`
	State       AlertState        `json:"state"`
	Message     string            `json:"message"`
	Description string            `json:"description"`
	Labels      map[string]string `json:"labels"`
	Annotations map[string]string `json:"annotations"`
	Value       float64           `json:"value"`
	Threshold   float64           `json:"threshold"`
	FirstSeen   time.Time         `json:"first_seen"`
	LastSeen    time.Time         `json:"last_seen"`
	// encoding/json never treats a struct value such as time.Time as "empty",
	// so omitempty alone still emitted "0001-01-01T00:00:00Z" here. omitzero
	// (Go 1.24+) drops the field when ResolvedAt is the zero time; older
	// toolchains ignore the unknown option and behave as before.
	ResolvedAt time.Time     `json:"resolved_at,omitempty,omitzero"`
	Duration   time.Duration `json:"duration"`
	Source     string        `json:"source"`
}

Alert represents an active or historical alert

type AlertFilter

// AlertFilter defines criteria for filtering alerts. It is the type of
// AlertSubscription.Filter.
//
// NOTE(review): Severities, Sources and Rules presumably match
// Alert.Severity, Alert.Source and Alert.RuleName, and Labels presumably
// matches Alert.Labels — confirm. How the fields combine, and whether an
// empty field matches every alert, is not visible from this declaration.
type AlertFilter struct {
	Severities []string
	Sources    []string
	Rules      []string
	Labels     map[string]string
}

AlertFilter defines criteria for filtering alerts

type AlertState

// AlertState represents the state of an alert. It is the type of Alert.State
// and serializes as a plain JSON string.
type AlertState string

AlertState represents the state of an alert

// Values an AlertState can take.
// NOTE(review): the transitions between these states are decided by the
// Engine and are not visible from this declaration.
const (
	// AlertStatePending is the pending state for alerts ("pending").
	AlertStatePending AlertState = "pending"
	// AlertStateFiring is the firing state for alerts ("firing").
	AlertStateFiring AlertState = "firing"
	// AlertStateResolved is the resolved state for alerts ("resolved").
	AlertStateResolved AlertState = "resolved"
)

type AlertSubscription

// AlertSubscription defines a subscription to alert notifications. It is
// passed to NotificationManager.Subscribe.
//
// Filter holds the AlertFilter criteria for the subscription.
// NOTE(review): ID is presumably the value NotificationManager.Unsubscribe
// takes, and HandlerIDs presumably hold NotificationHandler.GetID values —
// confirm. Who sets CreatedAt and LastNotified is not visible from this
// declaration.
type AlertSubscription struct {
	ID           string
	Filter       AlertFilter
	HandlerIDs   []string
	CreatedAt    time.Time
	LastNotified time.Time
}

AlertSubscription defines a subscription to alert notifications

type Engine

type Engine struct {
	// contains filtered or unexported fields
}

Engine manages alert evaluation and state

func NewEngine

func NewEngine(config *config.AlertConfig, client *prometheus.CachedClient) *Engine

NewEngine creates a new alert engine

func (*Engine) AddRule

func (e *Engine) AddRule(rule *Rule)

AddRule adds a new alert rule

func (*Engine) GetActiveAlerts

func (e *Engine) GetActiveAlerts() []Alert

GetActiveAlerts returns all active alerts

func (*Engine) GetAlertHistory

func (e *Engine) GetAlertHistory(limit int) []Alert

GetAlertHistory returns alert history

func (*Engine) GetRules

func (e *Engine) GetRules() []Rule

GetRules returns all configured rules

func (*Engine) RemoveRule

func (e *Engine) RemoveRule(name string)

RemoveRule removes an alert rule by name

func (*Engine) SetAlertCallback

func (e *Engine) SetAlertCallback(fn func(*Alert))

SetAlertCallback sets the callback for new alerts

func (*Engine) SetJobCollector

func (e *Engine) SetJobCollector(collector *models.JobMetricsCollector)

SetJobCollector updates the job metrics collector

func (*Engine) SetNodeCollector

func (e *Engine) SetNodeCollector(collector *models.NodeMetricsCollector)

SetNodeCollector updates the node metrics collector

func (*Engine) SetResolvedCallback

func (e *Engine) SetResolvedCallback(fn func(*Alert))

SetResolvedCallback sets the callback for resolved alerts

func (*Engine) Start

func (e *Engine) Start(ctx context.Context) error

Start starts the alert engine

func (*Engine) Stop

func (e *Engine) Stop() error

Stop stops the alert engine

type InAppHandler

type InAppHandler struct {
	// contains filtered or unexported fields
}

InAppHandler handles in-app notifications

func NewInAppHandler

func NewInAppHandler() *InAppHandler

NewInAppHandler creates a new in-app notification handler

func (*InAppHandler) ClearNotifications

func (h *InAppHandler) ClearNotifications()

ClearNotifications clears all in-app notifications

func (*InAppHandler) GetID

func (h *InAppHandler) GetID() string

GetID returns the unique identifier of the in-app handler.

func (*InAppHandler) GetName

func (h *InAppHandler) GetName() string

GetName returns the name of the in-app handler.

func (*InAppHandler) GetNotifications

func (h *InAppHandler) GetNotifications(limit int) []*Notification

GetNotifications returns in-app notifications

func (*InAppHandler) IsEnabled

func (h *InAppHandler) IsEnabled() bool

IsEnabled returns whether the in-app handler is enabled.

func (*InAppHandler) Send

func (h *InAppHandler) Send(_ context.Context, notification *Notification) error

Send sends a notification to the in-app notification queue.

type LogHandler

type LogHandler struct {
	// contains filtered or unexported fields
}

LogHandler logs notifications

func NewLogHandler

func NewLogHandler(logger func(string)) *LogHandler

NewLogHandler creates a new log handler

func (*LogHandler) GetID

func (h *LogHandler) GetID() string

GetID returns the unique identifier of the log handler.

func (*LogHandler) GetName

func (h *LogHandler) GetName() string

GetName returns the name of the log handler.

func (*LogHandler) IsEnabled

func (h *LogHandler) IsEnabled() bool

IsEnabled returns whether the log handler is enabled.

func (*LogHandler) Send

func (h *LogHandler) Send(_ context.Context, notification *Notification) error

Send sends a notification to the configured logger.

type Notification

// Notification represents an alert notification. It is the payload passed to
// NotificationHandler.Send.
//
// Alert points at the alert being reported and Type holds one of the
// NotificationType values. Recipients and Metadata are left out of the JSON
// encoding when empty (omitempty).
type Notification struct {
	Alert      *Alert            `json:"alert"`
	Type       NotificationType  `json:"type"`
	Timestamp  time.Time         `json:"timestamp"`
	Recipients []string          `json:"recipients,omitempty"`
	Metadata   map[string]string `json:"metadata,omitempty"`
}

Notification represents an alert notification

type NotificationEvent

// NotificationEvent represents a notification event in history. It is the
// element type returned by NotificationManager.GetHistory.
//
// NOTE(review): HandlerID presumably identifies the handler the notification
// was sent through, and Error presumably holds the text of a Send failure —
// confirm. The set of values Status can take is not visible from this
// declaration.
type NotificationEvent struct {
	ID           string
	Notification *Notification
	HandlerID    string
	Status       string
	Error        string
	SentAt       time.Time
}

NotificationEvent represents a notification event in history

type NotificationHandler

// NotificationHandler defines the interface for notification handlers.
// Handlers are registered with NotificationManager.RegisterHandler;
// InAppHandler and LogHandler in this package declare the same four methods.
//
// NOTE(review): GetID is presumably the key used by GetHandlers,
// UnregisterHandler and TestNotification — confirm. Whether the manager
// skips handlers whose IsEnabled returns false is not visible here.
type NotificationHandler interface {
	GetID() string
	GetName() string
	Send(ctx context.Context, notification *Notification) error
	IsEnabled() bool
}

NotificationHandler defines the interface for notification handlers

type NotificationManager

type NotificationManager struct {
	// contains filtered or unexported fields
}

NotificationManager handles alert notifications

func NewNotificationManager

func NewNotificationManager() *NotificationManager

NewNotificationManager creates a new notification manager

func (*NotificationManager) GetHandlers

func (nm *NotificationManager) GetHandlers() map[string]NotificationHandler

GetHandlers returns all registered handlers

func (*NotificationManager) GetHistory

func (nm *NotificationManager) GetHistory(limit int) []*NotificationEvent

GetHistory returns notification history

func (*NotificationManager) NotifyAlert

func (nm *NotificationManager) NotifyAlert(ctx context.Context, alert *Alert) error

NotifyAlert sends notifications for a new alert

func (*NotificationManager) NotifyResolved

func (nm *NotificationManager) NotifyResolved(ctx context.Context, alert *Alert) error

NotifyResolved sends notifications for a resolved alert

func (*NotificationManager) RegisterHandler

func (nm *NotificationManager) RegisterHandler(handler NotificationHandler) error

RegisterHandler registers a notification handler

func (*NotificationManager) Subscribe

func (nm *NotificationManager) Subscribe(subscription *AlertSubscription) error

Subscribe creates a subscription for alert notifications

func (*NotificationManager) TestNotification

func (nm *NotificationManager) TestNotification(ctx context.Context, handlerID string) error

TestNotification sends a test notification to a specific handler

func (*NotificationManager) UnregisterHandler

func (nm *NotificationManager) UnregisterHandler(handlerID string)

UnregisterHandler unregisters a notification handler

func (*NotificationManager) Unsubscribe

func (nm *NotificationManager) Unsubscribe(subscriptionID string)

Unsubscribe removes a subscription

type NotificationType

// NotificationType defines the type of notification. It is the type of
// Notification.Type and serializes as a plain JSON string.
type NotificationType string

NotificationType defines the type of notification

// Values a NotificationType can take.
// NOTE(review): these presumably correspond to NotifyAlert, NotifyResolved
// and TestNotification on NotificationManager — confirm.
const (
	// NotificationTypeAlert is the notification type for new alerts ("alert").
	NotificationTypeAlert NotificationType = "alert"
	// NotificationTypeResolved is the notification type for resolved alerts ("resolved").
	NotificationTypeResolved NotificationType = "resolved"
	// NotificationTypeTest is the notification type for test notifications ("test").
	NotificationTypeTest NotificationType = "test"
)

type RateLimiter

type RateLimiter struct {
	// contains filtered or unexported fields
}

RateLimiter provides simple rate limiting

func NewRateLimiter

func NewRateLimiter(rate int, period time.Duration) *RateLimiter

NewRateLimiter creates a new rate limiter

func (*RateLimiter) Allow

func (rl *RateLimiter) Allow(key string) bool

Allow checks if a request is allowed

type Rule

// Rule defines an alert rule.
//
// Type selects the kind of rule (see the RuleType constants): Query is
// documented as applying to query rules and Conditions to composite rules.
// Target is one of "node", "job" or "cluster". The predefined rules in this
// package use the operators ">" and "<" and the severities "warning" and
// "critical".
// NOTE(review): the full set of accepted operators and severities is
// enforced elsewhere (see ValidateRule) and is not visible here.
//
// JSON: Duration is a time.Duration, which encoding/json writes as an integer
// number of nanoseconds unless a custom marshaler is defined elsewhere.
// Query, Conditions, MessageTemplate, Labels and Annotations are omitted when
// empty.
type Rule struct {
	Name            string            `json:"name"`
	Type            RuleType          `json:"type"`
	Target          string            `json:"target"` // "node", "job", "cluster"
	Metric          string            `json:"metric"`
	Operator        string            `json:"operator"`
	Threshold       float64           `json:"threshold"`
	Duration        time.Duration     `json:"duration"`
	Query           string            `json:"query,omitempty"`      // For query type rules
	Conditions      []RuleCondition   `json:"conditions,omitempty"` // For composite rules
	Severity        string            `json:"severity"`
	Enabled         bool              `json:"enabled"`
	Description     string            `json:"description"`
	MessageTemplate string            `json:"message_template,omitempty"`
	Labels          map[string]string `json:"labels,omitempty"`
	Annotations     map[string]string `json:"annotations,omitempty"`
}

Rule defines an alert rule

type RuleBuilder

type RuleBuilder struct {
	// contains filtered or unexported fields
}

RuleBuilder helps build alert rules programmatically

func NewRuleBuilder

func NewRuleBuilder(name string) *RuleBuilder

NewRuleBuilder creates a new rule builder

func (*RuleBuilder) Build

func (rb *RuleBuilder) Build() (Rule, error)

Build validates and returns the rule

func (*RuleBuilder) WithAnnotation

func (rb *RuleBuilder) WithAnnotation(key, value string) *RuleBuilder

WithAnnotation adds an annotation

func (*RuleBuilder) WithDescription

func (rb *RuleBuilder) WithDescription(description string) *RuleBuilder

WithDescription sets the description

func (*RuleBuilder) WithDuration

func (rb *RuleBuilder) WithDuration(duration time.Duration) *RuleBuilder

WithDuration sets the duration

func (*RuleBuilder) WithLabel

func (rb *RuleBuilder) WithLabel(key, value string) *RuleBuilder

WithLabel adds a label

func (*RuleBuilder) WithMessageTemplate

func (rb *RuleBuilder) WithMessageTemplate(template string) *RuleBuilder

WithMessageTemplate sets the message template

func (*RuleBuilder) WithMetric

func (rb *RuleBuilder) WithMetric(metric string) *RuleBuilder

WithMetric sets the metric

func (*RuleBuilder) WithQuery

func (rb *RuleBuilder) WithQuery(query string) *RuleBuilder

WithQuery sets the PromQL query

func (*RuleBuilder) WithSeverity

func (rb *RuleBuilder) WithSeverity(severity string) *RuleBuilder

WithSeverity sets the severity

func (*RuleBuilder) WithTarget

func (rb *RuleBuilder) WithTarget(target string) *RuleBuilder

WithTarget sets the target

func (*RuleBuilder) WithThreshold

func (rb *RuleBuilder) WithThreshold(operator string, threshold float64) *RuleBuilder

WithThreshold sets the threshold and operator

func (*RuleBuilder) WithType

func (rb *RuleBuilder) WithType(ruleType RuleType) *RuleBuilder

WithType sets the rule type

type RuleCondition

// RuleCondition defines a condition for composite rules. It is the element
// type of Rule.Conditions.
//
// Metric, Operator and Threshold mirror the fields of the same name on Rule.
// NOTE(review): how Weight enters the composite evaluation is not visible
// from this declaration — confirm against the Engine implementation.
type RuleCondition struct {
	Metric    string  `json:"metric"`
	Operator  string  `json:"operator"`
	Threshold float64 `json:"threshold"`
	Weight    float64 `json:"weight"` // For weighted conditions
}

RuleCondition defines a condition for composite rules

type RuleType

// RuleType defines the type of alert rule. It is the type of Rule.Type and
// serializes as a plain JSON string.
type RuleType string

RuleType defines the type of alert rule

// Values a RuleType can take. PredefinedRules and RuleTemplates use only
// RuleTypeThreshold and RuleTypeQuery.
const (
	// RuleTypeThreshold is the threshold rule type for alerts ("threshold").
	RuleTypeThreshold RuleType = "threshold"
	// RuleTypeQuery is the query rule type for alerts ("query"); such rules
	// carry their expression in Rule.Query.
	RuleTypeQuery RuleType = "query"
	// RuleTypeComposite is the composite rule type for alerts ("composite");
	// Rule.Conditions is documented as applying to these rules.
	RuleTypeComposite RuleType = "composite"
)

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL