metrics

package
v0.1.6 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Apr 8, 2026 License: Apache-2.0 Imports: 6 Imported by: 1

Documentation

Overview

Package metrics provides Prometheus metrics for BubuStack controllers.

This package defines and registers all Prometheus metrics used by the operator, including counters, histograms, and gauges for tracking controller operations, resource lifecycles, and transport activities.

Controller Metrics

StoryRun and StepRun lifecycle metrics:

metrics.RecordStoryRunMetrics("default", "my-story", "Succeeded", 300*time.Second)
metrics.RecordStepRunDuration("default", "my-story", "step-1", "Succeeded", 30*time.Second)

Controller reconcile metrics:

metrics.RecordControllerReconcile("StoryRun", duration, err)

CEL Metrics

CEL expression evaluation metrics:

metrics.RecordCELEvaluation("when", duration, nil)
metrics.RecordCELCacheHit("compilation")

Transport Metrics

Transport and binding metrics:

metrics.RecordTransportBindingOperation("steprun", "binding-1", true, duration, nil)
metrics.RecordTransportBindingSnapshot("grpc", 3, 5, 1, 1)

Cleanup Metrics

Resource cleanup tracking:

metrics.RecordStoryRunWindowSeconds("default", "my-story", metrics.WindowRetention, 3600)
metrics.RecordStoryRunDependentCleanup("default", "my-storyrun", "StepRun", 5)

Registration

Metrics are automatically registered with the controller-runtime metrics registry. Call Enable(true) at startup to activate metric recording; Enable(false) turns emission off again at runtime.

Index

Constants

View Source
const (
	// EmptyHashSummary is the label value recorded when there is no hash
	// summary to report.
	EmptyHashSummary = "(none)"

	// UnknownResourceValue stands in for a resource label that has no
	// specific name.
	UnknownResourceValue = "unknown"

	// UnknownLabelValue is the placeholder substituted for any label whose
	// value would otherwise be empty.
	UnknownLabelValue = "(unknown)"
)
View Source
// Window label values identify which cleanup window a gauge value tracks.
const (
	// WindowRetention is the label value for the retention window.
	WindowRetention = "retention"

	// WindowChildTTL is the label value for the child TTL window.
	WindowChildTTL = "child_ttl"
)

StoryRunWindow labels identify which cleanup window a gauge value tracks.

Variables

View Source
// Prometheus collectors exported by this package. Every collector is a
// labeled vector; the label keys listed on each declaration must be supplied,
// in the same order, when a series is recorded.
var (

	// ---- StoryRun metrics ----

	// StoryRunsTotal counts StoryRuns processed, labeled by namespace, story,
	// and phase.
	StoryRunsTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "bobrapet_storyruns_total",
			Help: "Total number of StoryRuns processed",
		},
		[]string{"namespace", "story", "phase"},
	)

	// StoryRunDuration observes StoryRun execution time in seconds, labeled by
	// namespace, story, and phase. Buckets range from 1s to 1h.
	StoryRunDuration = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "bobrapet_storyrun_duration_seconds",
			Help:    "Duration of StoryRun execution",
			Buckets: []float64{1, 5, 15, 30, 60, 300, 600, 1800, 3600},
		},
		[]string{"namespace", "story", "phase"},
	)

	// StoryRunStepsActive reports the number of currently active steps,
	// labeled by namespace and story.
	StoryRunStepsActive = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "bobrapet_storyrun_steps_active",
			Help: "Number of currently active steps in StoryRuns",
		},
		[]string{"namespace", "story"},
	)

	// StoryRunStepsCompleted reports the number of completed steps, labeled by
	// namespace and story.
	StoryRunStepsCompleted = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "bobrapet_storyrun_steps_completed",
			Help: "Number of completed steps in StoryRuns",
		},
		[]string{"namespace", "story"},
	)

	// StoryRunWindowSeconds reports the configured child TTL and retention
	// windows applied to StoryRuns, labeled by namespace, story, and window.
	// NOTE(review): the window label presumably takes the WindowChildTTL and
	// WindowRetention values — confirm against the recorder.
	StoryRunWindowSeconds = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "bobrapet_storyrun_window_seconds",
			Help: "Configured child TTL and retention windows applied to StoryRuns",
		},
		[]string{"namespace", "story", "window"},
	)

	// StoryRunQueueAgeSeconds observes how long StoryRuns spend queued, in
	// seconds, labeled by namespace and queue. Buckets range from 1s to 1h.
	StoryRunQueueAgeSeconds = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "bobrapet_storyrun_queue_age_seconds",
			Help:    "Time StoryRuns spend queued due to scheduling or priority ordering.",
			Buckets: []float64{1, 5, 15, 30, 60, 300, 600, 1800, 3600},
		},
		[]string{"namespace", "queue"},
	)

	// StoryRunQueueDepth reports the number of steps queued by scheduling
	// limits, labeled by namespace, storyrun, and queue.
	StoryRunQueueDepth = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "bobrapet_storyrun_queue_depth",
			Help: "Number of steps queued by scheduling limits per queue.",
		},
		[]string{"namespace", "storyrun", "queue"},
	)

	// ImpulseThrottledTriggersTotal reports the number of triggers delayed by
	// per-Impulse throttling, labeled by namespace and impulse.
	// NOTE(review): this is a gauge although the metric name ends in _total,
	// a suffix Prometheus naming guidance associates with counters. The help
	// text says the value is sourced from Impulse status, so the gauge type
	// looks intentional — confirm before renaming or changing the type.
	ImpulseThrottledTriggersTotal = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "bobrapet_impulse_throttled_triggers_total",
			Help: "Number of triggers delayed by per-Impulse throttling, sourced from Impulse status.",
		},
		[]string{"namespace", "impulse"},
	)

	// StoryRunDependentsDeletedTotal counts StoryRun dependents deleted during
	// cleanup, labeled by namespace, storyrun, and resource type.
	StoryRunDependentsDeletedTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "bobrapet_storyrun_dependents_deleted_total",
			Help: "Total number of StoryRun dependents deleted during cleanup, partitioned by resource type.",
		},
		[]string{"namespace", "storyrun", "resource"},
	)

	// StoryRunRBACOperationsTotal counts StoryRun RBAC reconciliation
	// operations, labeled by resource and operation.
	StoryRunRBACOperationsTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "bobrapet_storyrun_rbac_operations_total",
			Help: "Number of StoryRun RBAC reconciliation operations by resource and controller result",
		},
		[]string{"resource", "operation"},
	)

	// StoryDirtyMarksTotal counts dirty flags set for Stories, labeled by
	// namespace, story, and reason.
	StoryDirtyMarksTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "bobrapet_story_dirty_marks_total",
			Help: "Total number of dirty flags set for Stories",
		},
		[]string{"namespace", "story", "reason"},
	)

	// ---- StepRun metrics ----

	// StepRunsTotal counts StepRuns processed, labeled by namespace, engram,
	// and phase.
	StepRunsTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "bobrapet_stepruns_total",
			Help: "Total number of StepRuns processed",
		},
		[]string{"namespace", "engram", "phase"},
	)

	// StepRunDuration observes StepRun execution time in seconds, labeled by
	// namespace, engram, and phase. Buckets range from 100ms to 5m.
	StepRunDuration = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "bobrapet_steprun_duration_seconds",
			Help:    "Duration of StepRun execution",
			Buckets: []float64{0.1, 0.5, 1, 5, 15, 30, 60, 300},
		},
		[]string{"namespace", "engram", "phase"},
	)

	// StepRunRetries counts StepRun retries, labeled by namespace, engram, and
	// reason.
	StepRunRetries = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "bobrapet_steprun_retries_total",
			Help: "Total number of StepRun retries",
		},
		[]string{"namespace", "engram", "reason"},
	)

	// StepRunCacheLookupsTotal counts StepRun cache lookups, labeled by
	// namespace, engram, and hit/miss result.
	StepRunCacheLookupsTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "bobrapet_steprun_cache_lookups_total",
			Help: "Total number of StepRun cache lookups partitioned by hit/miss result.",
		},
		[]string{"namespace", "engram", "result"},
	)

	// StepRunChildrenCreatedTotal counts child StepRuns created, labeled by
	// namespace, storyrun, and engram.
	StepRunChildrenCreatedTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "bobrapet_child_stepruns_created_total",
			Help: "Total number of child StepRuns created per StoryRun/Engram.",
		},
		[]string{"namespace", "storyrun", "engram"},
	)

	// DownstreamTargetMutationsTotal counts downstream target mutations
	// applied to StepRuns, labeled by namespace, step, action, and hashes.
	DownstreamTargetMutationsTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "bobrapet_downstream_target_mutations_total",
			Help: "Total number of downstream target mutations applied to StepRuns",
		},
		[]string{"namespace", "step", "action", "hashes"},
	)
	// TransportBindingReadFallbacksTotal counts binding env read fallbacks
	// triggered while waiting for API readers to observe TransportBindings,
	// labeled by namespace and reader.
	TransportBindingReadFallbacksTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "bobrapet_transport_binding_read_fallbacks_total",
			Help: "Total number of binding env read fallbacks triggered while waiting for " +
				"API readers to observe TransportBindings",
		},
		[]string{"namespace", "reader"},
	)

	// ResolverStageTotal counts resolver chain stages executed, labeled by
	// layer, stage, mode, and outcome.
	ResolverStageTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "bobrapet_resolver_stage_total",
			Help: "Total number of resolver chain stages executed, labeled by layer, stage, mode, and outcome",
		},
		[]string{"layer", "stage", "mode", "outcome"},
	)

	// ResolverStageDuration observes resolver chain stage time in seconds,
	// labeled by layer and stage. Buckets range from 1ms to 2s.
	ResolverStageDuration = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "bobrapet_resolver_stage_duration_seconds",
			Help:    "Duration of resolver chain stages",
			Buckets: []float64{0.001, 0.01, 0.05, 0.1, 0.5, 1, 2},
		},
		[]string{"layer", "stage"},
	)

	// ResolverServiceAccountFallbacksTotal counts the times the config
	// resolver defaulted the StepRun ServiceAccount to the per-StoryRun
	// runner, labeled by storyrun.
	// NOTE(review): the metric name uses the singular "fallback_total" while
	// the variable name is plural.
	ResolverServiceAccountFallbacksTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "bobrapet_resolver_service_account_fallback_total",
			Help: "Number of times the config resolver defaulted the StepRun ServiceAccount to the per-StoryRun runner",
		},
		[]string{"storyrun"},
	)

	// ---- Controller metrics ----

	// ControllerReconcileTotal counts reconciles, labeled by controller and
	// result.
	ControllerReconcileTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "bobrapet_controller_reconcile_total",
			Help: "Total number of reconciles by controller",
		},
		[]string{"controller", "result"},
	)

	// ControllerReconcileDuration observes reconcile time in seconds, labeled
	// by controller. It uses the Prometheus client's default buckets.
	ControllerReconcileDuration = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "bobrapet_controller_reconcile_duration_seconds",
			Help:    "Duration of controller reconcile operations",
			Buckets: prometheus.DefBuckets,
		},
		[]string{"controller"},
	)

	// ControllerReconcileErrors counts reconcile errors, labeled by controller
	// and error_type.
	ControllerReconcileErrors = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "bobrapet_controller_reconcile_errors_total",
			Help: "Total number of reconcile errors",
		},
		[]string{"controller", "error_type"},
	)

	// ---- CEL evaluation metrics ----

	// CELEvaluationDuration observes CEL expression evaluation time in
	// seconds, labeled by expression_type. Buckets range from 1ms to 1s.
	CELEvaluationDuration = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "bobrapet_cel_evaluation_duration_seconds",
			Help:    "Duration of CEL expression evaluation",
			Buckets: []float64{0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1},
		},
		[]string{"expression_type"},
	)

	// CELEvaluationTotal counts CEL expression evaluations, labeled by
	// expression_type and result.
	CELEvaluationTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "bobrapet_cel_evaluation_total",
			Help: "Total number of CEL expression evaluations",
		},
		[]string{"expression_type", "result"},
	)

	// CELCacheHits counts CEL compilation cache hits, labeled by cache_type.
	CELCacheHits = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "bobrapet_cel_cache_hits_total",
			Help: "Total number of CEL compilation cache hits",
		},
		[]string{"cache_type"},
	)

	// ---- Resource cleanup metrics ----

	// ResourceCleanupTotal counts resource cleanup operations, labeled by
	// resource_type and result.
	ResourceCleanupTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "bobrapet_resource_cleanup_total",
			Help: "Total number of resource cleanup operations",
		},
		[]string{"resource_type", "result"},
	)

	// ResourceCleanupDuration observes resource cleanup time in seconds,
	// labeled by resource_type. Buckets range from 100ms to 30s.
	ResourceCleanupDuration = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "bobrapet_resource_cleanup_duration_seconds",
			Help:    "Duration of resource cleanup operations",
			Buckets: []float64{0.1, 0.5, 1, 5, 10, 30},
		},
		[]string{"resource_type"},
	)

	// ---- Job execution metrics ----

	// JobExecutionTotal counts Job executions, labeled by namespace, image,
	// and result.
	JobExecutionTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "bobrapet_job_execution_total",
			Help: "Total number of Job executions",
		},
		[]string{"namespace", "image", "result"},
	)

	// JobExecutionDuration observes Job execution time in seconds, labeled by
	// namespace and image. Buckets range from 1s to 30m.
	JobExecutionDuration = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "bobrapet_job_execution_duration_seconds",
			Help:    "Duration of Job execution",
			Buckets: []float64{1, 5, 15, 30, 60, 300, 600, 1800},
		},
		[]string{"namespace", "image"},
	)

	// ---- Quota violation metrics ----

	// QuotaViolationTotal counts quota violations, labeled by namespace,
	// resource_type, and violation_type.
	QuotaViolationTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "bobrapet_quota_violation_total",
			Help: "Total number of quota violations",
		},
		[]string{"namespace", "resource_type", "violation_type"},
	)

	// ---- Resource quota metrics ----

	// ResourceQuotaUsage reports current resource quota usage, labeled by
	// namespace and resource_type.
	ResourceQuotaUsage = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "bobrapet_resource_quota_usage",
			Help: "Current resource quota usage",
		},
		[]string{"namespace", "resource_type"},
	)

	// ResourceQuotaLimit reports resource quota limits, labeled by namespace
	// and resource_type.
	ResourceQuotaLimit = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "bobrapet_resource_quota_limit",
			Help: "Resource quota limits",
		},
		[]string{"namespace", "resource_type"},
	)

	// ---- gRPC transport (bobravoz-grpc) metrics ----
	// These series use the bobravoz_ name prefix rather than bobrapet_.

	// GRPCStreamRequests counts streaming RPC requests, labeled by method and
	// status code.
	GRPCStreamRequests = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "bobravoz_grpc_stream_requests_total",
			Help: "Total number of streaming RPC requests by status code",
		},
		[]string{"method", "code"},
	)

	// GRPCStreamDuration observes streaming RPC time in seconds, labeled by
	// method. Buckets range from 100ms to 5m.
	GRPCStreamDuration = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "bobravoz_grpc_stream_duration_seconds",
			Help:    "Duration of streaming RPCs in seconds",
			Buckets: []float64{.1, .5, 1, 5, 10, 30, 60, 300},
		},
		[]string{"method"},
	)

	// GRPCMessagesReceived counts messages received on streaming RPCs,
	// labeled by storyrun and step.
	GRPCMessagesReceived = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "bobravoz_grpc_messages_received_total",
			Help: "Total messages received on streaming RPCs",
		},
		[]string{"storyrun", "step"},
	)

	// GRPCMessagesSent counts messages sent on streaming RPCs, labeled by
	// storyrun and step.
	GRPCMessagesSent = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "bobravoz_grpc_messages_sent_total",
			Help: "Total messages sent on streaming RPCs",
		},
		[]string{"storyrun", "step"},
	)

	// GRPCMessagesDropped counts messages dropped due to buffer overflow or
	// errors, labeled by storyrun, step, and reason.
	GRPCMessagesDropped = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "bobravoz_grpc_messages_dropped_total",
			Help: "Total messages dropped due to buffer overflow or errors",
		},
		[]string{"storyrun", "step", "reason"},
	)

	// TransportBindingOperationsTotal counts TransportBinding operations
	// performed by a controller, labeled by controller, result, mutated, and
	// binding_alias.
	TransportBindingOperationsTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "bobrapet_transport_binding_operations_total",
			Help: "Total number of TransportBinding operations performed by a controller",
		},
		[]string{"controller", "result", "mutated", "binding_alias"},
	)

	// TransportBindingOperationDuration observes the time of TransportBinding
	// CreateOrUpdate calls in seconds, labeled by controller, mutated, and
	// binding_alias. Buckets range from 10ms to 5s.
	TransportBindingOperationDuration = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "bobrapet_transport_binding_operation_duration_seconds",
			Help:    "Duration of TransportBinding CreateOrUpdate calls",
			Buckets: []float64{0.01, 0.05, 0.1, 0.5, 1, 2, 5},
		},
		[]string{"controller", "mutated", "binding_alias"},
	)

	// TransportBindingAnnotationSanitizeFailuresTotal counts binding
	// annotation sanitization failures, labeled by namespace, story_run, and
	// step.
	// NOTE(review): the label key here is "story_run", whereas other
	// collectors in this block use "storyrun"; queries joining on that label
	// need to account for the difference.
	TransportBindingAnnotationSanitizeFailuresTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "bobrapet_transport_binding_annotation_sanitize_failures_total",
			Help: "Counts sanitized binding annotation failures keyed by namespace, StoryRun, and Step.",
		},
		[]string{"namespace", "story_run", "step"},
	)

	// ControllerMapperFailures counts controller watch mapper failures,
	// labeled by controller and mapper.
	ControllerMapperFailures = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "bobrapet_controller_mapper_failures_total",
			Help: "Total number of controller watch mapper failures",
		},
		[]string{"controller", "mapper"},
	)

	// ControllerIndexFallbacks counts the times a controller fell back to a
	// full namespace scan because a field index lookup failed, labeled by
	// controller and index.
	// NOTE(review): the metric name uses the singular "fallback_total" while
	// the variable name is plural.
	ControllerIndexFallbacks = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "bobrapet_controller_index_fallback_total",
			Help: "Total number of times controllers had to fall back to full namespace scans " +
				"because a field index lookup failed",
		},
		[]string{"controller", "index"},
	)

	// TriggerBackfillLoopsTotal counts bounded trigger backfill iterations
	// executed, labeled by controller, namespace, resource, and pending.
	TriggerBackfillLoopsTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "bobrapet_trigger_backfill_loops_total",
			Help: "Number of bounded trigger backfill iterations executed",
		},
		[]string{"controller", "namespace", "resource", "pending"},
	)

	// TriggerBackfillMarkedTotal counts child resources annotated during
	// trigger backfill, labeled by controller, namespace, and resource.
	TriggerBackfillMarkedTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "bobrapet_trigger_backfill_marked_total",
			Help: "Total number of child resources annotated during trigger backfill",
		},
		[]string{"controller", "namespace", "resource"},
	)

	// DAGIterationSteps observes ready/skipped step counts per DAG iteration,
	// labeled by controller and type. Buckets are step counts from 0 to 40,
	// not durations.
	// NOTE(review): the type label presumably distinguishes ready from
	// skipped counts — confirm against the recorder.
	DAGIterationSteps = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "bobrapet_dag_iteration_steps",
			Help:    "Ready/skipped step counts per DAG iteration",
			Buckets: []float64{0, 1, 2, 5, 10, 20, 40},
		},
		[]string{"controller", "type"},
	)

	// SubStoryRefreshTotal counts DAG sub-story refreshes triggered by
	// synchronous executeStory completions, labeled by controller.
	SubStoryRefreshTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "bobrapet_substory_refresh_total",
			Help: "Total number of DAG sub-story refreshes triggered by synchronous executeStory completions",
		},
		[]string{"controller"},
	)
)

Functions

func Enable added in v0.1.4

func Enable(enabled bool)

Enable toggles emission of controller metrics at runtime.

func RecordArtifactOperation

func RecordArtifactOperation(operation string, duration time.Duration, err error)

RecordArtifactOperation records artifact management operations

func RecordCELCacheHit

func RecordCELCacheHit(cacheType string)

RecordCELCacheHit records a CEL cache hit for the given cache type.

func RecordCELEvaluation

func RecordCELEvaluation(expressionType string, duration time.Duration, err error)

RecordCELEvaluation records CEL evaluation metrics

func RecordChildStepRunCreated added in v0.1.4

func RecordChildStepRunCreated(namespace, storyRun, engram string)

RecordChildStepRunCreated increments the counter tracking newly created child StepRuns.

func RecordCleanupOperation

func RecordCleanupOperation(resourceType, namespace string, deletedCount int, duration time.Duration, err error)

RecordCleanupOperation records cleanup operations

func RecordControllerIndexFallback added in v0.1.4

func RecordControllerIndexFallback(controller, index string)

RecordControllerIndexFallback records when a controller cannot use a configured field index.

func RecordControllerMapperFailure added in v0.1.4

func RecordControllerMapperFailure(controller, mapper string)

RecordControllerMapperFailure records mapper failures for watch handlers.

func RecordControllerReconcile

func RecordControllerReconcile(controller string, duration time.Duration, err error)

RecordControllerReconcile records controller reconcile metrics

func RecordDAGIteration added in v0.1.4

func RecordDAGIteration(controller string, readyCount, skippedCount int)

RecordDAGIteration captures ready/skipped counts for DAG loop iterations.

func RecordDownstreamTargetMutation added in v0.1.4

func RecordDownstreamTargetMutation(namespace, step, action, hashSummary string)

RecordDownstreamTargetMutation counts downstream target updates keyed by action and hash summary.

func RecordGRPCMessageDropped

func RecordGRPCMessageDropped(storyRun, step, reason string)

RecordGRPCMessageDropped records a dropped message with a reason

func RecordGRPCMessageReceived

func RecordGRPCMessageReceived(storyRun, step string)

RecordGRPCMessageReceived records a message received on a stream

func RecordGRPCMessageSent

func RecordGRPCMessageSent(storyRun, step string)

RecordGRPCMessageSent records a message sent on a stream

func RecordGRPCStreamDuration

func RecordGRPCStreamDuration(method string, durationSeconds float64)

RecordGRPCStreamDuration records the duration of a streaming RPC

func RecordGRPCStreamRequest

func RecordGRPCStreamRequest(method, code string)

RecordGRPCStreamRequest records a streaming RPC request with its result code

func RecordImpulseThrottledTriggers added in v0.1.4

func RecordImpulseThrottledTriggers(namespace, impulse string, count int64)

RecordImpulseThrottledTriggers records throttled trigger counts from Impulse status.

func RecordJobExecution

func RecordJobExecution(namespace, image, result string, duration time.Duration)

RecordJobExecution records Job execution metrics

func RecordQuotaViolation

func RecordQuotaViolation(namespace, resourceType, violationType string)

RecordQuotaViolation records quota violation metrics

func RecordResolverServiceAccountFallback added in v0.1.4

func RecordResolverServiceAccountFallback(storyRun string)

RecordResolverServiceAccountFallback counts resolver runs that default the ServiceAccount name.

func RecordResolverStage added in v0.1.4

func RecordResolverStage(layer, stage, mode, outcome string, duration time.Duration)

RecordResolverStage tracks resolver-chain stage execution telemetry.

func RecordResourceCleanup

func RecordResourceCleanup(resourceType string, duration time.Duration, err error)

RecordResourceCleanup records resource cleanup metrics

func RecordStepRunCacheLookup added in v0.1.4

func RecordStepRunCacheLookup(namespace, engram, result string)

RecordStepRunCacheLookup increments the StepRun cache lookup counter.

func RecordStepRunDuration

func RecordStepRunDuration(namespace, storyRunRef, stepName, phase string, duration time.Duration)

RecordStepRunDuration records duration metrics for StepRun operations (alias for consistency)

func RecordStepRunMetrics

func RecordStepRunMetrics(namespace, engram, phase string, duration time.Duration)

RecordStepRunMetrics records metrics for StepRun operations

func RecordStorageOperation

func RecordStorageOperation(provider, operation string, duration time.Duration, err error)

RecordStorageOperation records storage provider operations (placeholder for storage package)

func RecordStoryDirtyMark added in v0.1.4

func RecordStoryDirtyMark(namespace, story, reason string)

RecordStoryDirtyMark increments the metric tracking why a Story was marked dirty.

func RecordStoryRunDependentCleanup added in v0.1.4

func RecordStoryRunDependentCleanup(namespace, storyRun, resource string, count int)

RecordStoryRunDependentCleanup counts StoryRun dependent deletions keyed by resource type.

func RecordStoryRunMetrics

func RecordStoryRunMetrics(namespace, story, phase string, duration time.Duration)

RecordStoryRunMetrics records metrics for StoryRun operations

func RecordStoryRunQueueAge added in v0.1.4

func RecordStoryRunQueueAge(namespace, queue string, age time.Duration)

RecordStoryRunQueueAge observes how long a StoryRun has been queued.

func RecordStoryRunQueueDepth added in v0.1.4

func RecordStoryRunQueueDepth(namespace, storyRun, queue string, depth int)

RecordStoryRunQueueDepth records the number of steps queued for a queue.

func RecordStoryRunRBACOperation added in v0.1.4

func RecordStoryRunRBACOperation(resource, operation string)

RecordStoryRunRBACOperation counts RBAC reconciliation changes keyed by operation result.

func RecordStoryRunWindowSeconds added in v0.1.4

func RecordStoryRunWindowSeconds(namespace, story, window string, seconds float64)

RecordStoryRunWindowSeconds captures the configured TTL/retention windows for StoryRuns.

func RecordSubStoryRefresh added in v0.1.4

func RecordSubStoryRefresh(controller string)

RecordSubStoryRefresh increments the counter tracking DAG sub-story refresh passes.

func RecordTransportBindingAnnotationSanitizeFailure added in v0.1.4

func RecordTransportBindingAnnotationSanitizeFailure(namespace, storyRun, step string)

RecordTransportBindingAnnotationSanitizeFailure increments the counter when Engram binding annotations fail sanitization.

func RecordTransportBindingOperation added in v0.1.4

func RecordTransportBindingOperation(controller, bindingAlias string, mutated bool, duration time.Duration, err error)

RecordTransportBindingOperation records metrics for TransportBinding mutations.

func RecordTransportBindingReadFallback added in v0.1.4

func RecordTransportBindingReadFallback(namespace, reader string)

RecordTransportBindingReadFallback counts binding env read fallbacks keyed by namespace and reader type.

func RecordTransportBindingSnapshot added in v0.1.4

func RecordTransportBindingSnapshot(name string, ready, total, pending, failed int)

RecordTransportBindingSnapshot records the current ready/total/pending/failed binding counts for a Transport so controllers and background workers emit the same metric series.

func RecordTriggerBackfill added in v0.1.4

func RecordTriggerBackfill(controller, namespace, resource string, marked int, pending bool)

RecordTriggerBackfill captures bounded trigger backfill activity for controllers such as Engram and Story.

func UpdateResourceQuotaLimit

func UpdateResourceQuotaLimit(namespace, resourceType string, limit float64)

UpdateResourceQuotaLimit updates resource quota limits

func UpdateResourceQuotaUsage

func UpdateResourceQuotaUsage(namespace, resourceType string, usage float64)

UpdateResourceQuotaUsage updates current resource usage

func UpdateStoryRunStepsGauge

func UpdateStoryRunStepsGauge(namespace, story string, active, completed int)

UpdateStoryRunStepsGauge updates the active and completed step gauges for a Story.

Types

This section is empty.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL