watcher

package

Version: v0.0.16 (latest) | Published: Apr 30, 2026 | License: MIT | Imports: 15 | Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/gaurangkudale/rca-operator

Links

Open Source Insights

Documentation ¶

Index ¶

func IsKnownEventType(name string) bool
type AttributesEvent
type BaseEvent
type ChannelEventEmitter
- func NewChannelEventEmitter(ch chan<- CorrelatorEvent, logger logr.Logger) *ChannelEventEmitter
- func NewChannelEventEmitterWithOptions(ch chan<- CorrelatorEvent, logger logr.Logger, opts ChannelEventEmitterOptions) *ChannelEventEmitter
- func (e *ChannelEventEmitter) Emit(event CorrelatorEvent)
type ChannelEventEmitterOptions
type CorrelatorEvent
type CrashLoopBackOffEvent
- func (e CrashLoopBackOffEvent) DedupKey() string
- func (e CrashLoopBackOffEvent) OccurredAt() time.Time
- func (e CrashLoopBackOffEvent) Type() EventType
type CronJobFailedEvent
- func (e CronJobFailedEvent) DedupKey() string
- func (e CronJobFailedEvent) OccurredAt() time.Time
- func (e CronJobFailedEvent) Type() EventType
type CronJobWatcher
- func NewCronJobWatcher(cache ctrlcache.Cache, emitter EventEmitter, logger logr.Logger, ...) *CronJobWatcher
- func (w *CronJobWatcher) Start(ctx context.Context) error
type CronJobWatcherConfig
type DaemonSetWatcher
- func NewDaemonSetWatcher(cache ctrlcache.Cache, emitter EventEmitter, logger logr.Logger, ...) *DaemonSetWatcher
- func (w *DaemonSetWatcher) Start(ctx context.Context) error
type DaemonSetWatcherConfig
type DeploymentWatcher
- func NewDeploymentWatcher(cache ctrlcache.Cache, emitter EventEmitter, logger logr.Logger, ...) *DeploymentWatcher
- func (w *DeploymentWatcher) Start(ctx context.Context) error
type DeploymentWatcherConfig
type EventEmitter
type EventType
type EventWatcher
- func NewEventWatcher(cache ctrlcache.Cache, emitter EventEmitter, logger logr.Logger, ...) *EventWatcher
- func (w *EventWatcher) Start(ctx context.Context) error
type EventWatcherConfig
type GracePeriodViolationEvent
- func (e GracePeriodViolationEvent) DedupKey() string
- func (e GracePeriodViolationEvent) OccurredAt() time.Time
- func (e GracePeriodViolationEvent) Type() EventType
type ImagePullBackOffEvent
- func (e ImagePullBackOffEvent) DedupKey() string
- func (e ImagePullBackOffEvent) OccurredAt() time.Time
- func (e ImagePullBackOffEvent) Type() EventType
type JobFailedEvent
- func (e JobFailedEvent) DedupKey() string
- func (e JobFailedEvent) OccurredAt() time.Time
- func (e JobFailedEvent) Type() EventType
type JobWatcher
- func NewJobWatcher(cache ctrlcache.Cache, emitter EventEmitter, logger logr.Logger, ...) *JobWatcher
- func (w *JobWatcher) Start(ctx context.Context) error
type JobWatcherConfig
type NodeNotReadyEvent
- func (e NodeNotReadyEvent) DedupKey() string
- func (e NodeNotReadyEvent) OccurredAt() time.Time
- func (e NodeNotReadyEvent) Type() EventType
type NodePressureEvent
- func (e NodePressureEvent) DedupKey() string
- func (e NodePressureEvent) OccurredAt() time.Time
- func (e NodePressureEvent) Type() EventType
type NodeWatcher
- func NewNodeWatcher(cache ctrlcache.Cache, emitter EventEmitter, logger logr.Logger, ...) *NodeWatcher
- func (w *NodeWatcher) Start(ctx context.Context) error
type NodeWatcherConfig
type OOMKilledEvent
- func (e OOMKilledEvent) DedupKey() string
- func (e OOMKilledEvent) OccurredAt() time.Time
- func (e OOMKilledEvent) Type() EventType
type OTelLogMatchEvent
- func (e OTelLogMatchEvent) Attributes() map[string]string
- func (e OTelLogMatchEvent) DedupKey() string
- func (e OTelLogMatchEvent) OccurredAt() time.Time
- func (e OTelLogMatchEvent) Type() EventType
type OTelSpanErrorEvent
- func (e OTelSpanErrorEvent) Attributes() map[string]string
- func (e OTelSpanErrorEvent) DedupKey() string
- func (e OTelSpanErrorEvent) OccurredAt() time.Time
- func (e OTelSpanErrorEvent) Type() EventType
type OTelSpanEventEvent
- func (e OTelSpanEventEvent) Attributes() map[string]string
- func (e OTelSpanEventEvent) DedupKey() string
- func (e OTelSpanEventEvent) OccurredAt() time.Time
- func (e OTelSpanEventEvent) Type() EventType
type OTelSpanLatencySpikeEvent
- func (e OTelSpanLatencySpikeEvent) Attributes() map[string]string
- func (e OTelSpanLatencySpikeEvent) DedupKey() string
- func (e OTelSpanLatencySpikeEvent) OccurredAt() time.Time
- func (e OTelSpanLatencySpikeEvent) Type() EventType
type PodDeletedEvent
- func (e PodDeletedEvent) DedupKey() string
- func (e PodDeletedEvent) OccurredAt() time.Time
- func (e PodDeletedEvent) Type() EventType
type PodEvictedEvent
- func (e PodEvictedEvent) DedupKey() string
- func (e PodEvictedEvent) OccurredAt() time.Time
- func (e PodEvictedEvent) Type() EventType
type PodHealthyEvent
- func (e PodHealthyEvent) DedupKey() string
- func (e PodHealthyEvent) OccurredAt() time.Time
- func (e PodHealthyEvent) Type() EventType
type PodPendingTooLongEvent
- func (e PodPendingTooLongEvent) DedupKey() string
- func (e PodPendingTooLongEvent) OccurredAt() time.Time
- func (e PodPendingTooLongEvent) Type() EventType
type PodWatcher
- func NewPodWatcher(cache ctrlcache.Cache, emitter EventEmitter, logger logr.Logger, ...) *PodWatcher
- func (w *PodWatcher) Start(ctx context.Context) error
type PodWatcherConfig
type ProbeFailureEvent
- func (e ProbeFailureEvent) DedupKey() string
- func (e ProbeFailureEvent) OccurredAt() time.Time
- func (e ProbeFailureEvent) Type() EventType
type StalledDaemonSetEvent
- func (e StalledDaemonSetEvent) DedupKey() string
- func (e StalledDaemonSetEvent) OccurredAt() time.Time
- func (e StalledDaemonSetEvent) Type() EventType
type StalledRolloutEvent
- func (e StalledRolloutEvent) DedupKey() string
- func (e StalledRolloutEvent) OccurredAt() time.Time
- func (e StalledRolloutEvent) Type() EventType
type StalledStatefulSetEvent
- func (e StalledStatefulSetEvent) DedupKey() string
- func (e StalledStatefulSetEvent) OccurredAt() time.Time
- func (e StalledStatefulSetEvent) Type() EventType
type StatefulSetWatcher
- func NewStatefulSetWatcher(cache ctrlcache.Cache, emitter EventEmitter, logger logr.Logger, ...) *StatefulSetWatcher
- func (w *StatefulSetWatcher) Start(ctx context.Context) error
type StatefulSetWatcherConfig

Constants ¶

This section is empty.

Variables ¶

This section is empty.

Functions ¶

func IsKnownEventType ¶ added in v0.0.16

func IsKnownEventType(name string) bool

IsKnownEventType reports whether name is one of the signal types emitted by the watcher or OTLP ingest pipeline.

Types ¶

type AttributesEvent ¶ added in v0.0.16

type AttributesEvent interface {
	Attributes() map[string]string
}

AttributesEvent is an optional interface implemented by events that carry key/value attributes (notably OTel span and log signals). The CRD rule engine uses this for attribute-level condition matching without needing type-switches.

Events that do not implement this interface are treated as having no attributes, preserving backward compatibility with existing K8s-event-only rules.

type BaseEvent ¶

type BaseEvent struct {
	At        time.Time
	AgentName string
	Namespace string
	PodName   string
	PodUID    string
	NodeName  string
}

BaseEvent carries fields common to all watcher-originated signals.

type ChannelEventEmitter ¶

type ChannelEventEmitter struct {
	// contains filtered or unexported fields
}

ChannelEventEmitter sends watcher events to a shared correlator channel.

func NewChannelEventEmitter ¶

func NewChannelEventEmitter(ch chan<- CorrelatorEvent, logger logr.Logger) *ChannelEventEmitter

NewChannelEventEmitter creates a non-blocking emitter backed by a channel.

func NewChannelEventEmitterWithOptions ¶ added in v0.0.16

func NewChannelEventEmitterWithOptions(ch chan<- CorrelatorEvent, logger logr.Logger, opts ChannelEventEmitterOptions) *ChannelEventEmitter

func (*ChannelEventEmitter) Emit ¶

func (e *ChannelEventEmitter) Emit(event CorrelatorEvent)

Emit attempts to send without blocking informer processing.

type ChannelEventEmitterOptions ¶ added in v0.0.16

type ChannelEventEmitterOptions struct {
	// DedupWindow coalesces repeated events with the same DedupKey before they
	// enter the shared channel. This protects the incident engine from OTLP log
	// storms while still allowing fresh signals through after the window.
	DedupWindow time.Duration
}

type CorrelatorEvent ¶

type CorrelatorEvent interface {
	Type() EventType
	OccurredAt() time.Time
	DedupKey() string
}

CorrelatorEvent is the shared typed event interface consumed by the correlator.

type CrashLoopBackOffEvent ¶

type CrashLoopBackOffEvent struct {
	BaseEvent
	ContainerName string
	RestartCount  int32
	Threshold     int32
	// Exit code info (optional) — captured from last container termination
	LastExitCode        int32  // 0 if not available
	ExitCodeCategory    string // e.g., "PermissionDenied", or empty if not available
	ExitCodeDescription string // human-readable description, or empty if not available
}

CrashLoopBackOffEvent is emitted when a pod container repeatedly restarts in CrashLoopBackOff. It may include the last exit code and classification to provide diagnostic context.

func (CrashLoopBackOffEvent) DedupKey ¶

func (e CrashLoopBackOffEvent) DedupKey() string

func (CrashLoopBackOffEvent) OccurredAt ¶

func (e CrashLoopBackOffEvent) OccurredAt() time.Time

func (CrashLoopBackOffEvent) Type ¶

func (e CrashLoopBackOffEvent) Type() EventType

type CronJobFailedEvent ¶ added in v0.0.15

type CronJobFailedEvent struct {
	BaseEvent
	CronJobName string
	LastJobName string
	Reason      string
	Message     string
}

CronJobFailedEvent is emitted when a batch/v1 CronJob's most recent child Job has failed, indicating a broken scheduled task.

func (CronJobFailedEvent) DedupKey ¶ added in v0.0.15

func (e CronJobFailedEvent) DedupKey() string

func (CronJobFailedEvent) OccurredAt ¶ added in v0.0.15

func (e CronJobFailedEvent) OccurredAt() time.Time

func (CronJobFailedEvent) Type ¶ added in v0.0.15

func (e CronJobFailedEvent) Type() EventType

type CronJobWatcher ¶ added in v0.0.15

type CronJobWatcher struct {
	// contains filtered or unexported fields
}

CronJobWatcher monitors batch/v1 CronJobs and emits a CronJobFailedEvent when the most recent child Job has failed, indicating a broken scheduled task.

At most one event is emitted per failed child Job (keyed by Job UID).

func NewCronJobWatcher ¶ added in v0.0.15

func NewCronJobWatcher(cache ctrlcache.Cache, emitter EventEmitter, logger logr.Logger, cfg CronJobWatcherConfig) *CronJobWatcher

NewCronJobWatcher creates a CronJobWatcher backed by a controller-runtime cache.

func (*CronJobWatcher) Start ¶ added in v0.0.15

func (w *CronJobWatcher) Start(ctx context.Context) error

Start registers a periodic scanner that checks for CronJobs whose most recent child Job has failed. Non-blocking; all goroutines are bounded by ctx.

Unlike other watchers, CronJobWatcher relies on scanning Jobs rather than watching CronJob objects directly, because the failure signal comes from the child Job's status, not from the CronJob spec.

type CronJobWatcherConfig ¶ added in v0.0.15

type CronJobWatcherConfig struct {
	AgentName       string
	WatchNamespaces []string
	ScanInterval    time.Duration
}

CronJobWatcherConfig controls the behaviour of the CronJob failure watcher.

type DaemonSetWatcher ¶ added in v0.0.15

type DaemonSetWatcher struct {
	// contains filtered or unexported fields
}

DaemonSetWatcher monitors apps/v1 DaemonSets and emits a StalledDaemonSetEvent when the number of ready pods is fewer than desired, indicating a stalled rollout or scheduling failure.

At most one event is emitted per (DaemonSet UID, observedGeneration) pair.

func NewDaemonSetWatcher ¶ added in v0.0.15

func NewDaemonSetWatcher(cache ctrlcache.Cache, emitter EventEmitter, logger logr.Logger, cfg DaemonSetWatcherConfig) *DaemonSetWatcher

NewDaemonSetWatcher creates a DaemonSetWatcher backed by a controller-runtime cache.

func (*DaemonSetWatcher) Start ¶ added in v0.0.15

func (w *DaemonSetWatcher) Start(ctx context.Context) error

Start registers informer handlers, runs a bootstrap scan, and launches the periodic fallback scanner. Non-blocking; all goroutines are bounded by ctx.

type DaemonSetWatcherConfig ¶ added in v0.0.15

type DaemonSetWatcherConfig struct {
	AgentName       string
	WatchNamespaces []string
	ScanInterval    time.Duration
}

DaemonSetWatcherConfig controls the behaviour of the DaemonSet watcher.

type DeploymentWatcher ¶

type DeploymentWatcher struct {
	// contains filtered or unexported fields
}

DeploymentWatcher monitors apps/v1 Deployments and emits a StalledRolloutEvent when a rollout fails to make forward progress within its configured progressDeadlineSeconds window (the Deployment controller sets Progressing=False with Reason=ProgressDeadlineExceeded on the Deployment's status conditions).

At most one StalledRolloutEvent is emitted per (deployment, observedGeneration) pair. When the deployment either completes its rollout or starts a new one, the in-memory gate is cleared so a subsequent stall can fire again.

It is intentionally read-only: it never writes to the Kubernetes API.

func NewDeploymentWatcher ¶

func NewDeploymentWatcher(cache ctrlcache.Cache, emitter EventEmitter, logger logr.Logger, cfg DeploymentWatcherConfig) *DeploymentWatcher

NewDeploymentWatcher creates a DeploymentWatcher backed by a controller-runtime cache. Config fields are defaulted when zero.

func (*DeploymentWatcher) Start ¶

func (w *DeploymentWatcher) Start(ctx context.Context) error

Start registers informer handlers, runs a bootstrap scan for pre-existing stalled rollouts, and launches the periodic fallback scanner. It is non-blocking; all goroutines are bounded by ctx.

type DeploymentWatcherConfig ¶

type DeploymentWatcherConfig struct {
	// AgentName is stamped on every emitted event for correlator routing.
	AgentName string

	// WatchNamespaces restricts observation to these namespaces.
	// An empty slice means watch all namespaces.
	WatchNamespaces []string

	// ScanInterval controls how often the periodic fallback scan runs.
	// Defaults to defaultDeploymentScanInterval.
	ScanInterval time.Duration
}

DeploymentWatcherConfig controls the behaviour of the deployment rollout watcher.

type EventEmitter ¶

type EventEmitter interface {
	Emit(event CorrelatorEvent)
}

EventEmitter abstracts how watcher events are delivered to downstream consumers.

type EventType ¶

type EventType string

EventType identifies the concrete watcher signal type sent to the correlator.

const (
	EventTypeCrashLoopBackOff     EventType = "CrashLoopBackOff"
	EventTypeOOMKilled            EventType = "OOMKilled"
	EventTypeImagePullBackOff     EventType = "ImagePullBackOff"
	EventTypePodPendingTooLong    EventType = "PodPendingTooLong"
	EventTypeGracePeriodViolation EventType = "GracePeriodViolation"
	EventTypePodHealthy           EventType = "PodHealthy"
	EventTypePodDeleted           EventType = "PodDeleted"

	// Event-stream-sourced signals (detected from core/v1 Event objects).
	EventTypeNodeNotReady EventType = "NodeNotReady"
	EventTypePodEvicted   EventType = "PodEvicted"
	EventTypeProbeFailure EventType = "ProbeFailure"

	// Deployment-sourced signals (detected from apps/v1 Deployment objects).
	EventTypeStalledRollout EventType = "StalledRollout"

	// Node-condition-sourced signals (detected from corev1.Node objects by node_watcher.go).
	// NodeNotReady is also captured via event_watcher.go; both paths feed the correlator
	// and the dedup key (namespace+nodeName) prevents duplicate incidents.
	EventTypeNodePressure EventType = "NodePressure"

	// StatefulSet-sourced signals (detected from apps/v1 StatefulSet objects).
	EventTypeStalledStatefulSet EventType = "StalledStatefulSet"

	// DaemonSet-sourced signals (detected from apps/v1 DaemonSet objects).
	EventTypeStalledDaemonSet EventType = "StalledDaemonSet"

	// Job-sourced signals (detected from batch/v1 Job objects).
	EventTypeJobFailed EventType = "JobFailed"

	// CronJob-sourced signals (detected by scanning the child batch/v1 Jobs of CronJobs).
	EventTypeCronJobFailed EventType = "CronJobFailed"

	// OTel-sourced signals (ingested via internal/otelingest from the cluster-wide
	// OTel Collector DaemonSet). These allow the correlator to reason about
	// user-workload traces and logs alongside Kubernetes control-plane events.
	EventTypeOTelSpanError        EventType = "OTelSpanError"
	EventTypeOTelSpanLatencySpike EventType = "OTelSpanLatencySpike"
	EventTypeOTelLogMatch         EventType = "OTelLogMatch"
	EventTypeOTelSpanEvent        EventType = "OTelSpanEvent"
)

type EventWatcher ¶

type EventWatcher struct {
	// contains filtered or unexported fields
}

EventWatcher watches the core/v1 Event stream and emits typed CorrelatorEvents for OOM kills, pod evictions, probe failures, and node NotReady transitions.

It is intentionally read-only: it never writes to the Kubernetes API.

func NewEventWatcher ¶

func NewEventWatcher(cache ctrlcache.Cache, emitter EventEmitter, logger logr.Logger, cfg EventWatcherConfig) *EventWatcher

NewEventWatcher creates an EventWatcher backed by a controller-runtime cache. Config fields are defaulted when zero.

func (*EventWatcher) Start ¶

func (w *EventWatcher) Start(ctx context.Context) error

Start registers informer handlers, runs a bootstrap replay scan, and launches the periodic dedup-map sweep. It is non-blocking; goroutines are bounded by ctx.

type EventWatcherConfig ¶

type EventWatcherConfig struct {
	// AgentName is stamped on every emitted event for correlator routing.
	AgentName string

	// WatchNamespaces restricts observation to these namespaces.
	// An empty slice means watch all namespaces.
	WatchNamespaces []string

	// DedupWindow is how long the same (namespace/objectUID/reason) key is
	// suppressed after the first emit. Defaults to defaultEventDedupWindow.
	DedupWindow time.Duration

	// DedupSweepInterval controls how often the dedup map is compacted.
	// Defaults to defaultEventDedupSweepInterval.
	DedupSweepInterval time.Duration
}

EventWatcherConfig controls the behaviour of the Kubernetes Event stream watcher.

type GracePeriodViolationEvent ¶

type GracePeriodViolationEvent struct {
	BaseEvent
	GracePeriodSeconds int64
	OverdueFor         time.Duration
}

GracePeriodViolationEvent is emitted when a pod that is being deleted exceeds its termination grace period while at least one container is still running.

func (GracePeriodViolationEvent) DedupKey ¶

func (e GracePeriodViolationEvent) DedupKey() string

func (GracePeriodViolationEvent) OccurredAt ¶

func (e GracePeriodViolationEvent) OccurredAt() time.Time

func (GracePeriodViolationEvent) Type ¶

func (e GracePeriodViolationEvent) Type() EventType

type ImagePullBackOffEvent ¶

type ImagePullBackOffEvent struct {
	BaseEvent
	ContainerName string
	Reason        string
	Message       string
}

ImagePullBackOffEvent is emitted when image pull for a container fails.

func (ImagePullBackOffEvent) DedupKey ¶

func (e ImagePullBackOffEvent) DedupKey() string

func (ImagePullBackOffEvent) OccurredAt ¶

func (e ImagePullBackOffEvent) OccurredAt() time.Time

func (ImagePullBackOffEvent) Type ¶

func (e ImagePullBackOffEvent) Type() EventType

type JobFailedEvent ¶ added in v0.0.15

type JobFailedEvent struct {
	BaseEvent
	JobName string
	Reason  string
	Message string
}

JobFailedEvent is emitted when a batch/v1 Job reaches a Failed condition, typically due to BackoffLimitExceeded or DeadlineExceeded.

func (JobFailedEvent) DedupKey ¶ added in v0.0.15

func (e JobFailedEvent) DedupKey() string

func (JobFailedEvent) OccurredAt ¶ added in v0.0.15

func (e JobFailedEvent) OccurredAt() time.Time

func (JobFailedEvent) Type ¶ added in v0.0.15

func (e JobFailedEvent) Type() EventType

type JobWatcher ¶ added in v0.0.15

type JobWatcher struct {
	// contains filtered or unexported fields
}

JobWatcher monitors batch/v1 Jobs and emits a JobFailedEvent when a Job reaches a Failed condition (typically BackoffLimitExceeded or DeadlineExceeded).

At most one event is emitted per (Job UID, observedGeneration) pair.

func NewJobWatcher ¶ added in v0.0.15

func NewJobWatcher(cache ctrlcache.Cache, emitter EventEmitter, logger logr.Logger, cfg JobWatcherConfig) *JobWatcher

NewJobWatcher creates a JobWatcher backed by a controller-runtime cache.

func (*JobWatcher) Start ¶ added in v0.0.15

func (w *JobWatcher) Start(ctx context.Context) error

Start registers informer handlers, runs a bootstrap scan, and launches the periodic fallback scanner. Non-blocking; all goroutines are bounded by ctx.

type JobWatcherConfig ¶ added in v0.0.15

type JobWatcherConfig struct {
	AgentName       string
	WatchNamespaces []string
	ScanInterval    time.Duration
}

JobWatcherConfig controls the behaviour of the Job failure watcher.

type NodeNotReadyEvent ¶

type NodeNotReadyEvent struct {
	BaseEvent
	// The name of the node that went NotReady is carried in the embedded
	// BaseEvent.NodeName (no separate NodeName field is declared here);
	// PodName is empty for node-level events.
	Reason  string
	Message string
}

NodeNotReadyEvent is emitted when a Kubernetes Node transitions to NotReady. Sourced from the core/v1 Event stream (reason: NodeNotReady or NodeConditionChanged).

func (NodeNotReadyEvent) DedupKey ¶

func (e NodeNotReadyEvent) DedupKey() string

func (NodeNotReadyEvent) OccurredAt ¶

func (e NodeNotReadyEvent) OccurredAt() time.Time

func (NodeNotReadyEvent) Type ¶

func (e NodeNotReadyEvent) Type() EventType

type NodePressureEvent ¶

type NodePressureEvent struct {
	BaseEvent
	// PressureType is one of: "DiskPressure", "MemoryPressure", "PIDPressure".
	PressureType string
	// Message is the kubelet-provided detail from the condition (may be empty).
	Message string
}

NodePressureEvent is emitted when a Node enters a resource-pressure condition: DiskPressure, MemoryPressure, or PIDPressure.

Sourced from corev1.Node status conditions by node_watcher.go (which watches the Node object directly, so it fires even when the kubelet does not produce a K8s Event).

PressureType contains the human-readable condition name — "DiskPressure", "MemoryPressure", or "PIDPressure" — matching corev1.NodeConditionType.

func (NodePressureEvent) DedupKey ¶

func (e NodePressureEvent) DedupKey() string

func (NodePressureEvent) OccurredAt ¶

func (e NodePressureEvent) OccurredAt() time.Time

func (NodePressureEvent) Type ¶

func (e NodePressureEvent) Type() EventType

type NodeWatcher ¶

type NodeWatcher struct {
	// contains filtered or unexported fields
}

NodeWatcher monitors corev1.Node objects and emits typed signals for:

- NotReady condition → NodeNotReadyEvent
- DiskPressure condition → NodePressureEvent{PressureType: "DiskPressure"}
- MemoryPressure condition → NodePressureEvent{PressureType: "MemoryPressure"}
- PIDPressure condition → NodePressureEvent{PressureType: "PIDPressure"}

It supplements event_watcher.go, which captures node signals from the K8s Event stream. Watching the Node object directly is more reliable because:

- Node conditions are always present on the Node status object.
- DiskPressure and MemoryPressure may not produce K8s Events in every cluster.
- K8s Events are rate-limited; conditions are authoritative and persistent.

The correlator's dedup key (namespace+nodeName+conditionType) prevents duplicate incidents when both event_watcher and node_watcher observe the same condition.

It is intentionally read-only: it never writes to the Kubernetes API.

func NewNodeWatcher ¶

func NewNodeWatcher(cache ctrlcache.Cache, emitter EventEmitter, logger logr.Logger, cfg NodeWatcherConfig) *NodeWatcher

NewNodeWatcher creates a NodeWatcher backed by a controller-runtime cache. Config fields are defaulted when zero.

func (*NodeWatcher) Start ¶

func (w *NodeWatcher) Start(ctx context.Context) error

Start registers informer handlers, runs a bootstrap scan for pre-existing node conditions, and launches the periodic fallback scanner. Non-blocking; all goroutines are bounded by ctx.

type NodeWatcherConfig ¶

type NodeWatcherConfig struct {
	// AgentName is stamped on every emitted event for correlator routing.
	AgentName string

	// IncidentNamespace is the namespace where node-level IncidentReport CRs are
	// stored.  Since Nodes are cluster-scoped, a target namespace must be provided
	// explicitly; the agent's own namespace is the natural choice.
	// Defaults to "default" when empty.
	IncidentNamespace string

	// ScanInterval controls how often the periodic fallback scan runs.
	// Defaults to defaultNodeScanInterval.
	ScanInterval time.Duration
}

NodeWatcherConfig controls Node condition monitoring behaviour.

type OOMKilledEvent ¶

type OOMKilledEvent struct {
	BaseEvent
	ContainerName string
	ExitCode      int32
	Reason        string
}

OOMKilledEvent is emitted when a container terminates with OOMKilled semantics.

func (OOMKilledEvent) DedupKey ¶

func (e OOMKilledEvent) DedupKey() string

func (OOMKilledEvent) OccurredAt ¶

func (e OOMKilledEvent) OccurredAt() time.Time

func (OOMKilledEvent) Type ¶

func (e OOMKilledEvent) Type() EventType

type OTelLogMatchEvent ¶ added in v0.0.16

type OTelLogMatchEvent struct {
	BaseEvent
	TraceID       string
	SpanID        string
	ServiceName   string
	Severity      string // OTel severity text: TRACE | DEBUG | INFO | WARN | ERROR | FATAL
	SeverityNum   int32  // OTel severity number (1-24)
	Body          string // Redacted body text (PII already stripped at ingest time).
	BodyHash      string // Hash of the redacted body; used for dedup.
	Attrs         map[string]string
	ResourceAttrs map[string]string
}

OTelLogMatchEvent is emitted when the operator ingests a log record whose severity is WARN or above (configurable via otelIngest.filters.logs.minSeverity).

BodyHash is a hash of the normalized log body used for dedup within the sliding buffer (identical log lines from the same service collapse to one entry).

func (OTelLogMatchEvent) Attributes ¶ added in v0.0.16

func (e OTelLogMatchEvent) Attributes() map[string]string

func (OTelLogMatchEvent) DedupKey ¶ added in v0.0.16

func (e OTelLogMatchEvent) DedupKey() string

func (OTelLogMatchEvent) OccurredAt ¶ added in v0.0.16

func (e OTelLogMatchEvent) OccurredAt() time.Time

func (OTelLogMatchEvent) Type ¶ added in v0.0.16

func (e OTelLogMatchEvent) Type() EventType

type OTelSpanErrorEvent ¶ added in v0.0.16

type OTelSpanErrorEvent struct {
	BaseEvent
	TraceID       string
	SpanID        string
	ParentSpanID  string
	ServiceName   string
	SpanName      string
	SpanKind      string // CLIENT | SERVER | INTERNAL | PRODUCER | CONSUMER
	StatusCode    string // STATUS_CODE_ERROR | STATUS_CODE_OK | STATUS_CODE_UNSET
	StatusMessage string
	DurationNanos int64
	StartTime     time.Time
	EndTime       time.Time
	Attrs         map[string]string
	ResourceAttrs map[string]string
}

OTelSpanErrorEvent is emitted when the operator ingests a span whose status is ERROR (or that carries a 5xx HTTP/gRPC status attribute). The span may originate from any auto-instrumented workload in the cluster.

BaseEvent fields are populated from k8s.* resource attributes applied by the OTel Collector's k8sattributes processor: Namespace = k8s.namespace.name, PodName = k8s.pod.name, NodeName = k8s.node.name.

func (OTelSpanErrorEvent) Attributes ¶ added in v0.0.16

func (e OTelSpanErrorEvent) Attributes() map[string]string

func (OTelSpanErrorEvent) DedupKey ¶ added in v0.0.16

func (e OTelSpanErrorEvent) DedupKey() string

func (OTelSpanErrorEvent) OccurredAt ¶ added in v0.0.16

func (e OTelSpanErrorEvent) OccurredAt() time.Time

func (OTelSpanErrorEvent) Type ¶ added in v0.0.16

func (e OTelSpanErrorEvent) Type() EventType

type OTelSpanEventEvent ¶ added in v0.0.16

type OTelSpanEventEvent struct {
	BaseEvent
	TraceID       string
	SpanID        string
	ServiceName   string
	EventName     string // e.g. "exception", "log"
	EventTime     time.Time
	Attrs         map[string]string
	ResourceAttrs map[string]string
}

OTelSpanEventEvent is emitted when a span carries a standalone OTel event record (e.g. exception.stacktrace, log.message). These are attached to spans but evaluated independently by the correlator because an exception event can be the strongest root-cause signal even on an otherwise-OK span.

func (OTelSpanEventEvent) Attributes ¶ added in v0.0.16

func (e OTelSpanEventEvent) Attributes() map[string]string

func (OTelSpanEventEvent) DedupKey ¶ added in v0.0.16

func (e OTelSpanEventEvent) DedupKey() string

func (OTelSpanEventEvent) OccurredAt ¶ added in v0.0.16

func (e OTelSpanEventEvent) OccurredAt() time.Time

func (OTelSpanEventEvent) Type ¶ added in v0.0.16

func (e OTelSpanEventEvent) Type() EventType

type OTelSpanLatencySpikeEvent ¶ added in v0.0.16

type OTelSpanLatencySpikeEvent struct {
	BaseEvent
	TraceID       string
	SpanID        string
	ParentSpanID  string
	ServiceName   string
	SpanName      string
	DurationNanos int64
	ThresholdNs   int64
	StartTime     time.Time
	EndTime       time.Time
	Attrs         map[string]string
	ResourceAttrs map[string]string
}

OTelSpanLatencySpikeEvent is emitted when a span's duration exceeds the configured latency threshold (see Helm otelIngest.filters.traces.latencyP99Ms).

func (OTelSpanLatencySpikeEvent) Attributes ¶ added in v0.0.16

func (e OTelSpanLatencySpikeEvent) Attributes() map[string]string

func (OTelSpanLatencySpikeEvent) DedupKey ¶ added in v0.0.16

func (e OTelSpanLatencySpikeEvent) DedupKey() string

func (OTelSpanLatencySpikeEvent) OccurredAt ¶ added in v0.0.16

func (e OTelSpanLatencySpikeEvent) OccurredAt() time.Time

func (OTelSpanLatencySpikeEvent) Type ¶ added in v0.0.16

func (e OTelSpanLatencySpikeEvent) Type() EventType

type PodDeletedEvent ¶

type PodDeletedEvent struct {
	BaseEvent
}

PodDeletedEvent is emitted when a watched pod is removed from the cluster. It triggers immediate resolution of any Active incidents referencing the pod.

func (PodDeletedEvent) DedupKey ¶

func (e PodDeletedEvent) DedupKey() string

func (PodDeletedEvent) OccurredAt ¶

func (e PodDeletedEvent) OccurredAt() time.Time

func (PodDeletedEvent) Type ¶

func (e PodDeletedEvent) Type() EventType

type PodEvictedEvent ¶

type PodEvictedEvent struct {
	BaseEvent
	Reason  string
	Message string
}

PodEvictedEvent is emitted when a pod is evicted from a node due to resource pressure. Sourced from the core/v1 Event stream (reason: Evicted).

func (PodEvictedEvent) DedupKey ¶

func (e PodEvictedEvent) DedupKey() string

func (PodEvictedEvent) OccurredAt ¶

func (e PodEvictedEvent) OccurredAt() time.Time

func (PodEvictedEvent) Type ¶

func (e PodEvictedEvent) Type() EventType

type PodHealthyEvent ¶

type PodHealthyEvent struct {
	BaseEvent
}

PodHealthyEvent is emitted when a pod transitions to Running and Ready.

func (PodHealthyEvent) DedupKey ¶

func (e PodHealthyEvent) DedupKey() string

func (PodHealthyEvent) OccurredAt ¶

func (e PodHealthyEvent) OccurredAt() time.Time

func (PodHealthyEvent) Type ¶

func (e PodHealthyEvent) Type() EventType

type PodPendingTooLongEvent ¶

type PodPendingTooLongEvent struct {
	BaseEvent
	PendingFor time.Duration
	Timeout    time.Duration
}

PodPendingTooLongEvent is emitted when a pod remains Pending beyond the configured timeout.

func (PodPendingTooLongEvent) DedupKey ¶

func (e PodPendingTooLongEvent) DedupKey() string

func (PodPendingTooLongEvent) OccurredAt ¶

func (e PodPendingTooLongEvent) OccurredAt() time.Time

func (PodPendingTooLongEvent) Type ¶

func (e PodPendingTooLongEvent) Type() EventType

type PodWatcher ¶

type PodWatcher struct {
	// contains filtered or unexported fields
}

PodWatcher monitors pods and emits typed watch events for correlator processing.

func NewPodWatcher ¶

func NewPodWatcher(cache ctrlcache.Cache, emitter EventEmitter, logger logr.Logger, cfg PodWatcherConfig) *PodWatcher

NewPodWatcher creates a pod watcher backed by controller-runtime informers.

func (*PodWatcher) Start ¶

func (w *PodWatcher) Start(ctx context.Context) error

Start registers informer handlers and launches periodic pending-pod scans.

type PodWatcherConfig ¶

type PodWatcherConfig struct {
	AgentName                 string
	CrashLoopRestartThreshold int32
	PendingTimeout            time.Duration
	PendingScanInterval       time.Duration
	ReadyStabilityWindow      time.Duration
	WatchNamespaces           []string
}

PodWatcherConfig controls pod failure detection thresholds.

type ProbeFailureEvent ¶

type ProbeFailureEvent struct {
	BaseEvent
	// ProbeType is one of "Liveness", "Readiness", or "Startup".
	ProbeType string
	Message   string
}

ProbeFailureEvent is emitted when a container's liveness, readiness, or startup probe fails. Sourced from the core/v1 Event stream (reason: Unhealthy).

func (ProbeFailureEvent) DedupKey ¶

func (e ProbeFailureEvent) DedupKey() string

func (ProbeFailureEvent) OccurredAt ¶

func (e ProbeFailureEvent) OccurredAt() time.Time

func (ProbeFailureEvent) Type ¶

func (e ProbeFailureEvent) Type() EventType

type StalledDaemonSetEvent ¶ added in v0.0.15

type StalledDaemonSetEvent struct {
	BaseEvent
	DaemonSetName          string
	Revision               int64
	DesiredNumberScheduled int32
	NumberReady            int32
	UpdatedNumberScheduled int32
	Reason                 string
	Message                string
}

StalledDaemonSetEvent is emitted when an apps/v1 DaemonSet has fewer ready pods than desired for an extended period, indicating a stalled rollout or scheduling failure.

func (StalledDaemonSetEvent) DedupKey ¶ added in v0.0.15

func (e StalledDaemonSetEvent) DedupKey() string

func (StalledDaemonSetEvent) OccurredAt ¶ added in v0.0.15

func (e StalledDaemonSetEvent) OccurredAt() time.Time

func (StalledDaemonSetEvent) Type ¶ added in v0.0.15

func (e StalledDaemonSetEvent) Type() EventType

type StalledRolloutEvent ¶

type StalledRolloutEvent struct {
	BaseEvent
	// DeploymentName is the name of the stalled Deployment.
	DeploymentName string
	// Revision is the status.observedGeneration at the time of detection.
	// It is included in the dedup key so that a new rollout attempt that
	// also stalls produces a fresh event.
	Revision int64
	// DesiredReplicas is the replica count requested in spec.replicas (default 1).
	DesiredReplicas int32
	// ReadyReplicas is the number of replicas that are currently Ready.
	ReadyReplicas int32
	// Reason is always "ProgressDeadlineExceeded" for Phase-1 detection.
	Reason string
	// Message is the human-readable detail from the Progressing condition.
	Message string
}

StalledRolloutEvent is emitted when an apps/v1 Deployment rollout fails to make forward progress within its configured progressDeadlineSeconds window.

The Deployment controller (part of kube-controller-manager, not the kubelet) marks this state by setting a Progressing condition with Status=False and Reason=ProgressDeadlineExceeded on the Deployment.

DeploymentName is also stored in BaseEvent.PodName so the correlator can use the same resource-key routing as other event types without a separate code path.

func (StalledRolloutEvent) DedupKey ¶

func (e StalledRolloutEvent) DedupKey() string

func (StalledRolloutEvent) OccurredAt ¶

func (e StalledRolloutEvent) OccurredAt() time.Time

func (StalledRolloutEvent) Type ¶

func (e StalledRolloutEvent) Type() EventType

type StalledStatefulSetEvent ¶ added in v0.0.15

type StalledStatefulSetEvent struct {
	BaseEvent
	StatefulSetName string
	Revision        int64
	DesiredReplicas int32
	ReadyReplicas   int32
	UpdatedReplicas int32
	Reason          string
	Message         string
}

StalledStatefulSetEvent is emitted when an apps/v1 StatefulSet's rolling update stalls — detected when UpdateRevision != CurrentRevision and no pods have been updated for longer than the scan interval.

func (StalledStatefulSetEvent) DedupKey ¶ added in v0.0.15

func (e StalledStatefulSetEvent) DedupKey() string

func (StalledStatefulSetEvent) OccurredAt ¶ added in v0.0.15

func (e StalledStatefulSetEvent) OccurredAt() time.Time

func (StalledStatefulSetEvent) Type ¶ added in v0.0.15

func (e StalledStatefulSetEvent) Type() EventType

type StatefulSetWatcher ¶ added in v0.0.15

type StatefulSetWatcher struct {
	// contains filtered or unexported fields
}

StatefulSetWatcher monitors apps/v1 StatefulSets and emits a StalledStatefulSetEvent when a rolling update stalls — detected when UpdateRevision != CurrentRevision and UpdatedReplicas < Replicas, indicating the rollout is not making forward progress.

At most one event is emitted per (StatefulSet UID, observedGeneration) pair.

func NewStatefulSetWatcher ¶ added in v0.0.15

func NewStatefulSetWatcher(cache ctrlcache.Cache, emitter EventEmitter, logger logr.Logger, cfg StatefulSetWatcherConfig) *StatefulSetWatcher

NewStatefulSetWatcher creates a StatefulSetWatcher backed by a controller-runtime cache.

func (*StatefulSetWatcher) Start ¶ added in v0.0.15

func (w *StatefulSetWatcher) Start(ctx context.Context) error

Start registers informer handlers, runs a bootstrap scan, and launches the periodic fallback scanner. Non-blocking; all goroutines are bounded by ctx.

type StatefulSetWatcherConfig ¶ added in v0.0.15

type StatefulSetWatcherConfig struct {
	AgentName       string
	WatchNamespaces []string
	ScanInterval    time.Duration
}

StatefulSetWatcherConfig controls the behaviour of the StatefulSet rollout watcher.

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL