Documentation
¶
Overview ¶
Package metrics provides Prometheus metrics for BubuStack controllers.
This package defines and registers all Prometheus metrics used by the operator, including counters, histograms, and gauges for tracking controller operations, resource lifecycles, and transport activities.
Controller Metrics ¶
StoryRun and StepRun lifecycle metrics:
metrics.RecordStoryRunMetrics("default", "my-story", "Succeeded", 300*time.Second)
metrics.RecordStepRunDuration("default", "my-storyrun", "step-1", "Succeeded", 30*time.Second)
Controller reconcile metrics:
metrics.RecordControllerReconcile("StoryRun", duration, err)
CEL Metrics ¶
CEL expression evaluation metrics:
metrics.RecordCELEvaluation("when", duration, nil)
metrics.RecordCELCacheHit("compilation")
Transport Metrics ¶
Transport and binding metrics:
metrics.RecordTransportBindingSnapshot("transport-1", 3, 4, 1, 0)
metrics.RecordGRPCMessageSent("my-storyrun", "step-1")
Cleanup Metrics ¶
Resource cleanup tracking:
metrics.RecordCleanupOperation("StoryRun", "default", 5, duration, nil)
metrics.RecordStoryRunWindowSeconds("default", "my-story", metrics.WindowRetention, 3600)
Registration ¶
Metrics are automatically registered with the controller-runtime metrics registry. Call Enable(true) at startup to activate metric recording.
Index ¶
- Constants
- Variables
- func Enable(enabled bool)
- func RecordArtifactOperation(operation string, duration time.Duration, err error)
- func RecordCELCacheHit(cacheType string)
- func RecordCELEvaluation(expressionType string, duration time.Duration, err error)
- func RecordChildStepRunCreated(namespace, storyRun, engram string)
- func RecordCleanupOperation(resourceType, namespace string, deletedCount int, duration time.Duration, ...)
- func RecordControllerIndexFallback(controller, index string)
- func RecordControllerMapperFailure(controller, mapper string)
- func RecordControllerReconcile(controller string, duration time.Duration, err error)
- func RecordDAGIteration(controller string, readyCount, skippedCount int)
- func RecordDownstreamTargetMutation(namespace, step, action, hashSummary string)
- func RecordGRPCMessageDropped(storyRun, step, reason string)
- func RecordGRPCMessageReceived(storyRun, step string)
- func RecordGRPCMessageSent(storyRun, step string)
- func RecordGRPCStreamDuration(method string, durationSeconds float64)
- func RecordGRPCStreamRequest(method, code string)
- func RecordImpulseThrottledTriggers(namespace, impulse string, count int64)
- func RecordJobExecution(namespace, image, result string, duration time.Duration)
- func RecordQuotaViolation(namespace, resourceType, violationType string)
- func RecordResolverServiceAccountFallback(storyRun string)
- func RecordResolverStage(layer, stage, mode, outcome string, duration time.Duration)
- func RecordResourceCleanup(resourceType string, duration time.Duration, err error)
- func RecordStepRunCacheLookup(namespace, engram, result string)
- func RecordStepRunDuration(namespace, storyRunRef, stepName, phase string, duration time.Duration)
- func RecordStepRunMetrics(namespace, engram, phase string, duration time.Duration)
- func RecordStorageOperation(provider, operation string, duration time.Duration, err error)
- func RecordStoryDirtyMark(namespace, story, reason string)
- func RecordStoryRunDependentCleanup(namespace, storyRun, resource string, count int)
- func RecordStoryRunMetrics(namespace, story, phase string, duration time.Duration)
- func RecordStoryRunQueueAge(namespace, queue string, age time.Duration)
- func RecordStoryRunQueueDepth(namespace, storyRun, queue string, depth int)
- func RecordStoryRunRBACOperation(resource, operation string)
- func RecordStoryRunWindowSeconds(namespace, story, window string, seconds float64)
- func RecordSubStoryRefresh(controller string)
- func RecordTransportBindingAnnotationSanitizeFailure(namespace, storyRun, step string)
- func RecordTransportBindingOperation(controller, bindingAlias string, mutated bool, duration time.Duration, ...)
- func RecordTransportBindingReadFallback(namespace, reader string)
- func RecordTransportBindingSnapshot(name string, ready, total, pending, failed int)
- func RecordTriggerBackfill(controller, namespace, resource string, marked int, pending bool)
- func UpdateResourceQuotaLimit(namespace, resourceType string, limit float64)
- func UpdateResourceQuotaUsage(namespace, resourceType string, usage float64)
- func UpdateStoryRunStepsGauge(namespace, story string, active, completed int)
Constants ¶
const ( // UnknownLabelValue is used whenever a label would otherwise be empty. UnknownLabelValue = "(unknown)" // UnknownResourceValue is used for resource labels without a specific name. UnknownResourceValue = "unknown" // EmptyHashSummary represents the absence of a hash summary in metrics. EmptyHashSummary = "(none)" )
const ( WindowChildTTL = "child_ttl" WindowRetention = "retention" )
StoryRunWindow labels identify which cleanup window a gauge value tracks.
Variables ¶
var ( // StoryRun metrics StoryRunsTotal = prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "bobrapet_storyruns_total", Help: "Total number of StoryRuns processed", }, []string{"namespace", "story", "phase"}, ) StoryRunDuration = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Name: "bobrapet_storyrun_duration_seconds", Help: "Duration of StoryRun execution", Buckets: []float64{1, 5, 15, 30, 60, 300, 600, 1800, 3600}, }, []string{"namespace", "story", "phase"}, ) StoryRunStepsActive = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Name: "bobrapet_storyrun_steps_active", Help: "Number of currently active steps in StoryRuns", }, []string{"namespace", "story"}, ) StoryRunStepsCompleted = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Name: "bobrapet_storyrun_steps_completed", Help: "Number of completed steps in StoryRuns", }, []string{"namespace", "story"}, ) StoryRunWindowSeconds = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Name: "bobrapet_storyrun_window_seconds", Help: "Configured child TTL and retention windows applied to StoryRuns", }, []string{"namespace", "story", "window"}, ) StoryRunQueueAgeSeconds = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Name: "bobrapet_storyrun_queue_age_seconds", Help: "Time StoryRuns spend queued due to scheduling or priority ordering.", Buckets: []float64{1, 5, 15, 30, 60, 300, 600, 1800, 3600}, }, []string{"namespace", "queue"}, ) StoryRunQueueDepth = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Name: "bobrapet_storyrun_queue_depth", Help: "Number of steps queued by scheduling limits per queue.", }, []string{"namespace", "storyrun", "queue"}, ) ImpulseThrottledTriggersTotal = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Name: "bobrapet_impulse_throttled_triggers_total", Help: "Number of triggers delayed by per-Impulse throttling, sourced from Impulse status.", }, []string{"namespace", "impulse"}, ) StoryRunDependentsDeletedTotal = prometheus.NewCounterVec( prometheus.CounterOpts{ Name: 
"bobrapet_storyrun_dependents_deleted_total", Help: "Total number of StoryRun dependents deleted during cleanup, partitioned by resource type.", }, []string{"namespace", "storyrun", "resource"}, ) StoryRunRBACOperationsTotal = prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "bobrapet_storyrun_rbac_operations_total", Help: "Number of StoryRun RBAC reconciliation operations by resource and controller result", }, []string{"resource", "operation"}, ) StoryDirtyMarksTotal = prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "bobrapet_story_dirty_marks_total", Help: "Total number of dirty flags set for Stories", }, []string{"namespace", "story", "reason"}, ) // StepRun metrics StepRunsTotal = prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "bobrapet_stepruns_total", Help: "Total number of StepRuns processed", }, []string{"namespace", "engram", "phase"}, ) StepRunDuration = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Name: "bobrapet_steprun_duration_seconds", Help: "Duration of StepRun execution", Buckets: []float64{0.1, 0.5, 1, 5, 15, 30, 60, 300}, }, []string{"namespace", "engram", "phase"}, ) StepRunRetries = prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "bobrapet_steprun_retries_total", Help: "Total number of StepRun retries", }, []string{"namespace", "engram", "reason"}, ) StepRunCacheLookupsTotal = prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "bobrapet_steprun_cache_lookups_total", Help: "Total number of StepRun cache lookups partitioned by hit/miss result.", }, []string{"namespace", "engram", "result"}, ) StepRunChildrenCreatedTotal = prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "bobrapet_child_stepruns_created_total", Help: "Total number of child StepRuns created per StoryRun/Engram.", }, []string{"namespace", "storyrun", "engram"}, ) DownstreamTargetMutationsTotal = prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "bobrapet_downstream_target_mutations_total", Help: "Total number of downstream 
target mutations applied to StepRuns", }, []string{"namespace", "step", "action", "hashes"}, ) TransportBindingReadFallbacksTotal = prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "bobrapet_transport_binding_read_fallbacks_total", Help: "Total number of binding env read fallbacks triggered while waiting for " + "API readers to observe TransportBindings", }, []string{"namespace", "reader"}, ) ResolverStageTotal = prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "bobrapet_resolver_stage_total", Help: "Total number of resolver chain stages executed, labeled by layer, stage, mode, and outcome", }, []string{"layer", "stage", "mode", "outcome"}, ) ResolverStageDuration = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Name: "bobrapet_resolver_stage_duration_seconds", Help: "Duration of resolver chain stages", Buckets: []float64{0.001, 0.01, 0.05, 0.1, 0.5, 1, 2}, }, []string{"layer", "stage"}, ) ResolverServiceAccountFallbacksTotal = prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "bobrapet_resolver_service_account_fallback_total", Help: "Number of times the config resolver defaulted the StepRun ServiceAccount to the per-StoryRun runner", }, []string{"storyrun"}, ) // Controller metrics ControllerReconcileTotal = prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "bobrapet_controller_reconcile_total", Help: "Total number of reconciles by controller", }, []string{"controller", "result"}, ) ControllerReconcileDuration = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Name: "bobrapet_controller_reconcile_duration_seconds", Help: "Duration of controller reconcile operations", Buckets: prometheus.DefBuckets, }, []string{"controller"}, ) ControllerReconcileErrors = prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "bobrapet_controller_reconcile_errors_total", Help: "Total number of reconcile errors", }, []string{"controller", "error_type"}, ) // CEL evaluation metrics CELEvaluationDuration = prometheus.NewHistogramVec( 
prometheus.HistogramOpts{ Name: "bobrapet_cel_evaluation_duration_seconds", Help: "Duration of CEL expression evaluation", Buckets: []float64{0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1}, }, []string{"expression_type"}, ) CELEvaluationTotal = prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "bobrapet_cel_evaluation_total", Help: "Total number of CEL expression evaluations", }, []string{"expression_type", "result"}, ) CELCacheHits = prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "bobrapet_cel_cache_hits_total", Help: "Total number of CEL compilation cache hits", }, []string{"cache_type"}, ) // Resource cleanup metrics ResourceCleanupTotal = prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "bobrapet_resource_cleanup_total", Help: "Total number of resource cleanup operations", }, []string{"resource_type", "result"}, ) ResourceCleanupDuration = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Name: "bobrapet_resource_cleanup_duration_seconds", Help: "Duration of resource cleanup operations", Buckets: []float64{0.1, 0.5, 1, 5, 10, 30}, }, []string{"resource_type"}, ) // Job execution metrics JobExecutionTotal = prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "bobrapet_job_execution_total", Help: "Total number of Job executions", }, []string{"namespace", "image", "result"}, ) JobExecutionDuration = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Name: "bobrapet_job_execution_duration_seconds", Help: "Duration of Job execution", Buckets: []float64{1, 5, 15, 30, 60, 300, 600, 1800}, }, []string{"namespace", "image"}, ) // Quota violation metrics QuotaViolationTotal = prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "bobrapet_quota_violation_total", Help: "Total number of quota violations", }, []string{"namespace", "resource_type", "violation_type"}, ) // Resource quota metrics ResourceQuotaUsage = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Name: "bobrapet_resource_quota_usage", Help: "Current resource quota usage", }, 
[]string{"namespace", "resource_type"}, ) ResourceQuotaLimit = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Name: "bobrapet_resource_quota_limit", Help: "Resource quota limits", }, []string{"namespace", "resource_type"}, ) // gRPC Transport (bobravoz-grpc) metrics GRPCStreamRequests = prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "bobravoz_grpc_stream_requests_total", Help: "Total number of streaming RPC requests by status code", }, []string{"method", "code"}, ) GRPCStreamDuration = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Name: "bobravoz_grpc_stream_duration_seconds", Help: "Duration of streaming RPCs in seconds", Buckets: []float64{.1, .5, 1, 5, 10, 30, 60, 300}, }, []string{"method"}, ) GRPCMessagesReceived = prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "bobravoz_grpc_messages_received_total", Help: "Total messages received on streaming RPCs", }, []string{"storyrun", "step"}, ) GRPCMessagesSent = prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "bobravoz_grpc_messages_sent_total", Help: "Total messages sent on streaming RPCs", }, []string{"storyrun", "step"}, ) GRPCMessagesDropped = prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "bobravoz_grpc_messages_dropped_total", Help: "Total messages dropped due to buffer overflow or errors", }, []string{"storyrun", "step", "reason"}, ) TransportBindingOperationsTotal = prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "bobrapet_transport_binding_operations_total", Help: "Total number of TransportBinding operations performed by a controller", }, []string{"controller", "result", "mutated", "binding_alias"}, ) TransportBindingOperationDuration = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Name: "bobrapet_transport_binding_operation_duration_seconds", Help: "Duration of TransportBinding CreateOrUpdate calls", Buckets: []float64{0.01, 0.05, 0.1, 0.5, 1, 2, 5}, }, []string{"controller", "mutated", "binding_alias"}, ) 
TransportBindingAnnotationSanitizeFailuresTotal = prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "bobrapet_transport_binding_annotation_sanitize_failures_total", Help: "Counts sanitized binding annotation failures keyed by namespace, StoryRun, and Step.", }, []string{"namespace", "story_run", "step"}, ) ControllerMapperFailures = prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "bobrapet_controller_mapper_failures_total", Help: "Total number of controller watch mapper failures", }, []string{"controller", "mapper"}, ) ControllerIndexFallbacks = prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "bobrapet_controller_index_fallback_total", Help: "Total number of times controllers had to fall back to full namespace scans " + "because a field index lookup failed", }, []string{"controller", "index"}, ) TriggerBackfillLoopsTotal = prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "bobrapet_trigger_backfill_loops_total", Help: "Number of bounded trigger backfill iterations executed", }, []string{"controller", "namespace", "resource", "pending"}, ) TriggerBackfillMarkedTotal = prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "bobrapet_trigger_backfill_marked_total", Help: "Total number of child resources annotated during trigger backfill", }, []string{"controller", "namespace", "resource"}, ) DAGIterationSteps = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Name: "bobrapet_dag_iteration_steps", Help: "Ready/skipped step counts per DAG iteration", Buckets: []float64{0, 1, 2, 5, 10, 20, 40}, }, []string{"controller", "type"}, ) SubStoryRefreshTotal = prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "bobrapet_substory_refresh_total", Help: "Total number of DAG sub-story refreshes triggered by synchronous executeStory completions", }, []string{"controller"}, ) )
Functions ¶
func Enable ¶ added in v0.1.4
func Enable(enabled bool)
Enable toggles emission of controller metrics at runtime.
func RecordArtifactOperation ¶
func RecordArtifactOperation(operation string, duration time.Duration, err error)
RecordArtifactOperation records artifact management operations.
func RecordCELCacheHit ¶
func RecordCELCacheHit(cacheType string)
RecordCELCacheHit records a CEL cache hit for the given cache type.
func RecordCELEvaluation ¶
func RecordCELEvaluation(expressionType string, duration time.Duration, err error)
RecordCELEvaluation records CEL evaluation metrics.
func RecordChildStepRunCreated ¶ added in v0.1.4
func RecordChildStepRunCreated(namespace, storyRun, engram string)
RecordChildStepRunCreated increments the counter tracking newly created child StepRuns.
func RecordCleanupOperation ¶
func RecordCleanupOperation(resourceType, namespace string, deletedCount int, duration time.Duration, err error)
RecordCleanupOperation records cleanup operations.
func RecordControllerIndexFallback ¶ added in v0.1.4
func RecordControllerIndexFallback(controller, index string)
RecordControllerIndexFallback records when a controller cannot use a configured field index.
func RecordControllerMapperFailure ¶ added in v0.1.4
func RecordControllerMapperFailure(controller, mapper string)
RecordControllerMapperFailure records mapper failures for watch handlers.
func RecordControllerReconcile ¶
func RecordControllerReconcile(controller string, duration time.Duration, err error)
RecordControllerReconcile records controller reconcile metrics.
func RecordDAGIteration ¶ added in v0.1.4
func RecordDAGIteration(controller string, readyCount, skippedCount int)
RecordDAGIteration captures ready/skipped counts for DAG loop iterations.
func RecordDownstreamTargetMutation ¶ added in v0.1.4
func RecordDownstreamTargetMutation(namespace, step, action, hashSummary string)
RecordDownstreamTargetMutation counts downstream target updates keyed by action and hash summary.
func RecordGRPCMessageDropped ¶
func RecordGRPCMessageDropped(storyRun, step, reason string)
RecordGRPCMessageDropped records a dropped message with a reason.
func RecordGRPCMessageReceived ¶
func RecordGRPCMessageReceived(storyRun, step string)
RecordGRPCMessageReceived records a message received on a stream.
func RecordGRPCMessageSent ¶
func RecordGRPCMessageSent(storyRun, step string)
RecordGRPCMessageSent records a message sent on a stream.
func RecordGRPCStreamDuration ¶
func RecordGRPCStreamDuration(method string, durationSeconds float64)
RecordGRPCStreamDuration records the duration of a streaming RPC, in seconds.
func RecordGRPCStreamRequest ¶
func RecordGRPCStreamRequest(method, code string)
RecordGRPCStreamRequest records a streaming RPC request with its result code.
func RecordImpulseThrottledTriggers ¶ added in v0.1.4
func RecordImpulseThrottledTriggers(namespace, impulse string, count int64)
RecordImpulseThrottledTriggers records throttled trigger counts from Impulse status.
func RecordJobExecution ¶
func RecordJobExecution(namespace, image, result string, duration time.Duration)
RecordJobExecution records Job execution metrics.
func RecordQuotaViolation ¶
func RecordQuotaViolation(namespace, resourceType, violationType string)
RecordQuotaViolation records quota violation metrics.
func RecordResolverServiceAccountFallback ¶ added in v0.1.4
func RecordResolverServiceAccountFallback(storyRun string)
RecordResolverServiceAccountFallback counts resolver runs that default the ServiceAccount name.
func RecordResolverStage ¶ added in v0.1.4
func RecordResolverStage(layer, stage, mode, outcome string, duration time.Duration)
RecordResolverStage tracks resolver-chain stage execution telemetry.
func RecordResourceCleanup ¶
func RecordResourceCleanup(resourceType string, duration time.Duration, err error)
RecordResourceCleanup records resource cleanup metrics.
func RecordStepRunCacheLookup ¶ added in v0.1.4
func RecordStepRunCacheLookup(namespace, engram, result string)
RecordStepRunCacheLookup increments the StepRun cache lookup counter.
func RecordStepRunDuration ¶
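func RecordStepRunDuration(namespace, storyRunRef, stepName, phase string, duration time.Duration)
RecordStepRunDuration records duration metrics for StepRun operations (alias for consistency).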
func RecordStepRunMetrics ¶
func RecordStepRunMetrics(namespace, engram, phase string, duration time.Duration)
RecordStepRunMetrics records metrics for StepRun operations.
func RecordStorageOperation ¶
func RecordStorageOperation(provider, operation string, duration time.Duration, err error)
RecordStorageOperation records storage provider operations (placeholder for storage package).
func RecordStoryDirtyMark ¶ added in v0.1.4
func RecordStoryDirtyMark(namespace, story, reason string)
RecordStoryDirtyMark increments the metric tracking why a Story was marked dirty.
func RecordStoryRunDependentCleanup ¶ added in v0.1.4
func RecordStoryRunDependentCleanup(namespace, storyRun, resource string, count int)
RecordStoryRunDependentCleanup counts StoryRun dependent deletions keyed by resource type.
func RecordStoryRunMetrics ¶
func RecordStoryRunMetrics(namespace, story, phase string, duration time.Duration)
RecordStoryRunMetrics records metrics for StoryRun operations.
func RecordStoryRunQueueAge ¶ added in v0.1.4
func RecordStoryRunQueueAge(namespace, queue string, age time.Duration)
RecordStoryRunQueueAge observes how long a StoryRun has been queued.
func RecordStoryRunQueueDepth ¶ added in v0.1.4
func RecordStoryRunQueueDepth(namespace, storyRun, queue string, depth int)
RecordStoryRunQueueDepth records the number of steps queued for a queue.
func RecordStoryRunRBACOperation ¶ added in v0.1.4
func RecordStoryRunRBACOperation(resource, operation string)
RecordStoryRunRBACOperation counts RBAC reconciliation changes keyed by operation result.
func RecordStoryRunWindowSeconds ¶ added in v0.1.4
func RecordStoryRunWindowSeconds(namespace, story, window string, seconds float64)
RecordStoryRunWindowSeconds captures the configured TTL/retention windows for StoryRuns.
func RecordSubStoryRefresh ¶ added in v0.1.4
func RecordSubStoryRefresh(controller string)
RecordSubStoryRefresh increments the counter tracking DAG sub-story refresh passes.
func RecordTransportBindingAnnotationSanitizeFailure ¶ added in v0.1.4
func RecordTransportBindingAnnotationSanitizeFailure(namespace, storyRun, step string)
RecordTransportBindingAnnotationSanitizeFailure increments the counter when Engram binding annotations fail sanitization.
func RecordTransportBindingOperation ¶ added in v0.1.4
func RecordTransportBindingOperation(controller, bindingAlias string, mutated bool, duration time.Duration, err error)
RecordTransportBindingOperation records metrics for TransportBinding mutations.
func RecordTransportBindingReadFallback ¶ added in v0.1.4
func RecordTransportBindingReadFallback(namespace, reader string)
RecordTransportBindingReadFallback counts binding env read fallbacks keyed by namespace and reader type.
func RecordTransportBindingSnapshot ¶ added in v0.1.4
func RecordTransportBindingSnapshot(name string, ready, total, pending, failed int)
RecordTransportBindingSnapshot records the current ready/total/pending/failed binding counts for a Transport so controllers and background workers emit the same metric series.
func RecordTriggerBackfill ¶ added in v0.1.4
func RecordTriggerBackfill(controller, namespace, resource string, marked int, pending bool)
RecordTriggerBackfill captures bounded trigger backfill activity for controllers such as Engram and Story.
func UpdateResourceQuotaLimit ¶
func UpdateResourceQuotaLimit(namespace, resourceType string, limit float64)
UpdateResourceQuotaLimit updates resource quota limits.
func UpdateResourceQuotaUsage ¶
func UpdateResourceQuotaUsage(namespace, resourceType string, usage float64)
UpdateResourceQuotaUsage updates the current resource quota usage.
func UpdateStoryRunStepsGauge ¶
func UpdateStoryRunStepsGauge(namespace, story string, active, completed int)
UpdateStoryRunStepsGauge updates the active/completed steps gauges.
Types ¶
This section is empty.