telemetry

package
v0.3.1 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jun 18, 2026 License: MIT Imports: 12 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func PrometheusHandler

func PrometheusHandler() http.Handler

Types

type HealthStats

// HealthStats is the JSON response for GET /api/health.
//
// Each field is serialized under the snake_case key given in its json tag.
// Units are encoded in the key suffixes: db_latency_p99_ms is milliseconds,
// heap_alloc_mb is megabytes, and uptime_seconds is seconds. How each value
// is sampled is not visible from this declaration.
// NOTE(review): the unit/window behind ingestion_rate is not stated here —
// confirm against GetHealthStats.
type HealthStats struct {
	IngestionRate  int64   `json:"ingestion_rate"`
	DLQSize        int64   `json:"dlq_size"`
	ActiveConns    int64   `json:"active_connections"`
	DBLatencyP99Ms float64 `json:"db_latency_p99_ms"`
	Goroutines     int     `json:"goroutines"`
	HeapAllocMB    float64 `json:"heap_alloc_mb"`
	UptimeSeconds  float64 `json:"uptime_seconds"`
}

HealthStats is the JSON response for GET /api/health.

type Metrics

// Metrics holds all internal Prometheus metrics for OtelContext self-monitoring.
//
// Construct it with New, which creates and registers the metrics. Fields are
// grouped by subsystem under the "--- ... ---" section markers. Fields typed
// *prometheus.CounterVec, *prometheus.GaugeVec or *prometheus.HistogramVec
// are labeled families; label names are given in a field's comment only
// where this declaration states them.
type Metrics struct {
	// --- Existing ---
	// NOTE(review): presumably written by the RecordIngestion,
	// Set/Increment/DecrementActiveConns, ObserveDBLatency and SetDLQSize
	// helpers on *Metrics — confirm against the method bodies.
	IngestionRate     prometheus.Counter
	ActiveConnections prometheus.Gauge
	DBLatency         prometheus.Histogram
	DLQSize           prometheus.Gauge

	// IngestDurationSeconds is the per-Export E2E latency observed inside
	// the OTLP servers (gRPC + HTTP), labeled by signal {traces,logs,metrics}.
	// Drives ingest SLOs: alert on p99 / error budget burn rather than on the
	// blunt OtelContext_grpc_request_duration_seconds aggregate.
	IngestDurationSeconds *prometheus.HistogramVec

	// --- gRPC ---
	GRPCRequestsTotal   *prometheus.CounterVec
	GRPCRequestDuration *prometheus.HistogramVec
	GRPCBatchSize       prometheus.Histogram

	// --- HTTP ---
	HTTPRequestsTotal   *prometheus.CounterVec
	HTTPRequestDuration *prometheus.HistogramVec

	// --- TSDB ---
	TSDBIngestTotal         prometheus.Counter
	TSDBFlushDuration       prometheus.Histogram
	TSDBBatchesDropped      prometheus.Counter
	TSDBCardinalityOverflow prometheus.Counter
	// TSDBCardinalityOverflowByTenant labels overflow events with the tenant ID
	// that triggered them, or the sentinel "__global__" when the global cap
	// (not a per-tenant cap) was the trigger. Use this to identify noisy
	// tenants: sum by (tenant_id) (rate(otelcontext_tsdb_cardinality_overflow_by_tenant_total[5m]))
	TSDBCardinalityOverflowByTenant *prometheus.CounterVec

	// --- WebSocket ---
	WSMessagesSent       *prometheus.CounterVec
	WSSlowClientsRemoved prometheus.Counter

	// --- DLQ ---
	DLQEnqueuedTotal prometheus.Counter
	DLQReplaySuccess prometheus.Counter
	DLQReplayFailure prometheus.Counter
	DLQDiskBytes     prometheus.Gauge

	// --- Storage ---
	HotDBSizeBytes prometheus.Gauge

	// --- Retention ---
	RetentionRowsPurgedTotal       *prometheus.CounterVec
	RetentionPurgeDurationSeconds  *prometheus.HistogramVec
	RetentionVacuumDurationSeconds *prometheus.HistogramVec
	RetentionRowsBehindGauge       *prometheus.GaugeVec

	// --- Postgres partitioning (DB_POSTGRES_PARTITIONING=daily) ---
	// PartitionsDropped counts daily logs partitions dropped during the
	// retention pass. Each drop is a near-instant DDL — alert when this
	// counter is flat for >1.5 retention periods (indicates a stuck loop).
	PartitionsDropped prometheus.Counter
	// PartitionsActive gauges the live partitions attached to logs.
	// Healthy steady-state ~ HOT_RETENTION_DAYS + DB_PARTITION_LOOKAHEAD_DAYS + 1.
	PartitionsActive prometheus.Gauge

	// --- Runtime ---
	// NOTE(review): presumably refreshed by StartRuntimeMetrics, which is
	// documented as sampling Go runtime stats every 15 seconds — confirm.
	GoGoroutines     prometheus.Gauge
	GoHeapAllocBytes prometheus.Gauge

	// --- Operational (Fix 6) ---
	PanicsRecoveredTotal          *prometheus.CounterVec
	MCPToolInvocationsTotal       *prometheus.CounterVec
	APIAuthFailuresTotal          *prometheus.CounterVec
	GraphRAGEventBufferDepth      prometheus.Gauge
	RetentionLastSuccessTimestamp *prometheus.GaugeVec
	RetentionConsecutiveFailures  *prometheus.GaugeVec
	DBUp                          *prometheus.GaugeVec

	// --- GraphRAG overflow ---
	GraphRAGEventsDroppedTotal *prometheus.CounterVec

	// GraphRAGTenantsEvictedTotal counts tenant store slices evicted after
	// exceeding GRAPHRAG_TENANT_IDLE_TTL. The default tenant is never
	// evicted; a steady non-zero rate on a single-tenant install means
	// rogue tenant IDs are reaching ingest.
	GraphRAGTenantsEvictedTotal prometheus.Counter

	// --- In-memory store census (OOM-survival work) ---
	// GraphRAGStoreEntities — live node counts per entity kind across tenants
	// (tenants|services|operations|traces|spans|log_clusters|metrics|anomalies).
	// GraphRAGStoreEdges — live edge counts per store (service|trace|signal|anomaly).
	// Together with the ring/drain gauges these attribute RSS growth to a
	// specific structure before a heap profile is needed.
	GraphRAGStoreEntities *prometheus.GaugeVec
	GraphRAGStoreEdges    *prometheus.GaugeVec
	TSDBRingSeriesActive  prometheus.Gauge
	// TSDBRingSeriesRejected — points refused a NEW ring series at the
	// tenant-scoped series cap (existing series keep recording).
	TSDBRingSeriesRejected prometheus.Counter
	DrainTemplatesActive   prometheus.Gauge

	// --- Async ingest pipeline (Phase 1 robustness work) ---
	// IngestPipelineQueueDepth — current queue depth, sampled on every Submit.
	// Labeled by signal so spikes can be attributed to traces vs logs.
	IngestPipelineQueueDepth *prometheus.GaugeVec
	// IngestPipelineQueueBytes — approximate bytes held by queued batches.
	// Reserved at Submit, released when a worker finishes the batch; the
	// byte cap (INGEST_PIPELINE_MAX_BYTES) rejects submissions above it.
	IngestPipelineQueueBytes prometheus.Gauge
	// IngestPipelineDroppedTotal — batches that did NOT reach the DB.
	// reason="soft_backpressure" — healthy batch dropped at >=90% fullness.
	// reason="queue_full"        — batch rejected at 100% capacity (client got 429/RESOURCE_EXHAUSTED).
	// reason="bytes_full"        — batch rejected at the byte cap (even priority batches).
	IngestPipelineDroppedTotal *prometheus.CounterVec

	// HTTPOTLPThrottledTotal — count of HTTP 429s issued by the OTLP HTTP
	// receiver when the async ingest pipeline is full. Mirrors the gRPC
	// RESOURCE_EXHAUSTED path so operators see a single throttling signal
	// across both transports. Label `signal` is one of traces|logs|metrics.
	HTTPOTLPThrottledTotal *prometheus.CounterVec

	// --- DB pool (sampled every 5s from sql.DB.Stats) ---
	// Written by SampleDBPoolStats. DBPoolWaitCount and DBPoolWaitDuration
	// mirror cumulative sql.DBStats totals, so although they are gauges they
	// never decrease — query them with rate() rather than reading raw values.
	DBPoolOpenConnections prometheus.Gauge
	DBPoolInUse           prometheus.Gauge
	DBPoolIdle            prometheus.Gauge
	DBPoolWaitCount       prometheus.Gauge
	DBPoolWaitDuration    prometheus.Gauge // cumulative seconds

	// --- DLQ eviction (Task 8) ---
	DLQEvictedTotal      prometheus.Counter
	DLQEvictedBytesTotal prometheus.Counter

	// --- Dashboard p99 (Task 10) ---
	DashboardP99RowCapHitsTotal prometheus.Counter
	// contains filtered or unexported fields
}

Metrics holds all internal Prometheus metrics for OtelContext self-monitoring.

func New

func New() *Metrics

New creates and registers all OtelContext internal metrics.

func (*Metrics) DecrementActiveConns

func (m *Metrics) DecrementActiveConns()

func (*Metrics) GetHealthStats

func (m *Metrics) GetHealthStats() HealthStats

func (*Metrics) HealthHandler

func (m *Metrics) HealthHandler() http.HandlerFunc

func (*Metrics) HealthWSHandler

func (m *Metrics) HealthWSHandler() http.HandlerFunc

HealthWSHandler returns an HTTP handler that upgrades to WebSocket and pushes HealthStats snapshots every 3 seconds. An immediate snapshot is sent on connection so the client never has to wait for the first tick.

func (*Metrics) IncrementActiveConns

func (m *Metrics) IncrementActiveConns()

func (*Metrics) ObserveDBLatency

func (m *Metrics) ObserveDBLatency(seconds float64)

func (*Metrics) ObserveIngestDuration

func (m *Metrics) ObserveIngestDuration(signal string, d time.Duration)

ObserveIngestDuration records an end-to-end OTLP Export latency for the given signal. Callers should pass time.Since(start) measured from the very start of the Export handler. Nil-safe so the OTLP servers can be wired without a Metrics instance during tests.

func (*Metrics) RecordIngestion

func (m *Metrics) RecordIngestion(count int)

func (*Metrics) SampleDBPoolStats

func (m *Metrics) SampleDBPoolStats(sqlDB *sql.DB)

SampleDBPoolStats writes the live pool stats into the DBPool* gauges. Safe to call from a ticker goroutine. A nil receiver or a nil *sql.DB is a no-op so callers don't need to guard at every call site.

WaitCount and WaitDuration from sql.DBStats are cumulative totals (monotonically non-decreasing), even though they are exported here as gauges — operators should compute rate() over them rather than read the raw values.

func (*Metrics) SetActiveConnections

func (m *Metrics) SetActiveConnections(n int)

func (*Metrics) SetDLQSize

func (m *Metrics) SetDLQSize(n int)

func (*Metrics) StartRuntimeMetrics

func (m *Metrics) StartRuntimeMetrics()

StartRuntimeMetrics samples Go runtime stats every 15 seconds.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL