package telemetry

v0.2.0-beta.3 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: May 3, 2026 License: MIT Imports: 12 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func PrometheusHandler

func PrometheusHandler() http.Handler

Types

type HealthStats

type HealthStats struct {
	IngestionRate  int64   `json:"ingestion_rate"`
	DLQSize        int64   `json:"dlq_size"`
	ActiveConns    int64   `json:"active_connections"`
	DBLatencyP99Ms float64 `json:"db_latency_p99_ms"`
	Goroutines     int     `json:"goroutines"`
	HeapAllocMB    float64 `json:"heap_alloc_mb"`
	UptimeSeconds  float64 `json:"uptime_seconds"`
}

HealthStats is the JSON response for GET /api/health.

type Metrics

type Metrics struct {
	// --- Existing ---
	IngestionRate     prometheus.Counter
	ActiveConnections prometheus.Gauge
	DBLatency         prometheus.Histogram
	DLQSize           prometheus.Gauge

	// IngestDurationSeconds is the per-Export E2E latency observed inside
	// the OTLP servers (gRPC + HTTP), labeled by signal {traces,logs,metrics}.
	// Drives ingest SLOs: alert on p99 / error budget burn rather than on the
	// blunt otelcontext_grpc_request_duration_seconds aggregate.
	IngestDurationSeconds *prometheus.HistogramVec

	// --- gRPC ---
	GRPCRequestsTotal   *prometheus.CounterVec
	GRPCRequestDuration *prometheus.HistogramVec
	GRPCBatchSize       prometheus.Histogram

	// --- HTTP ---
	HTTPRequestsTotal   *prometheus.CounterVec
	HTTPRequestDuration *prometheus.HistogramVec

	// --- TSDB ---
	TSDBIngestTotal         prometheus.Counter
	TSDBFlushDuration       prometheus.Histogram
	TSDBBatchesDropped      prometheus.Counter
	TSDBCardinalityOverflow prometheus.Counter
	// TSDBCardinalityOverflowByTenant labels overflow events with the tenant ID
	// that triggered them, or the sentinel "__global__" when the global cap
	// (not a per-tenant cap) was the trigger. Use this to identify noisy
	// tenants: sum by (tenant_id) (rate(otelcontext_tsdb_cardinality_overflow_by_tenant_total[5m]))
	TSDBCardinalityOverflowByTenant *prometheus.CounterVec

	// --- WebSocket ---
	WSMessagesSent       *prometheus.CounterVec
	WSSlowClientsRemoved prometheus.Counter

	// --- DLQ ---
	DLQEnqueuedTotal prometheus.Counter
	DLQReplaySuccess prometheus.Counter
	DLQReplayFailure prometheus.Counter
	DLQDiskBytes     prometheus.Gauge

	// --- Storage ---
	HotDBSizeBytes prometheus.Gauge

	// --- Retention ---
	RetentionRowsPurgedTotal       *prometheus.CounterVec
	RetentionPurgeDurationSeconds  *prometheus.HistogramVec
	RetentionVacuumDurationSeconds *prometheus.HistogramVec
	RetentionRowsBehindGauge       *prometheus.GaugeVec

	// --- Postgres partitioning (DB_POSTGRES_PARTITIONING=daily) ---
	// PartitionsDropped counts daily logs partitions dropped during the
	// retention pass. Each drop is a near-instant DDL — alert when this
	// counter is flat for >1.5 retention periods (indicates a stuck loop).
	PartitionsDropped prometheus.Counter
	// PartitionsActive gauges the live partitions attached to logs.
	// Healthy steady-state ~ HOT_RETENTION_DAYS + DB_PARTITION_LOOKAHEAD_DAYS + 1.
	PartitionsActive prometheus.Gauge

	// --- Runtime ---
	GoGoroutines     prometheus.Gauge
	GoHeapAllocBytes prometheus.Gauge

	// --- Operational (Fix 6) ---
	PanicsRecoveredTotal          *prometheus.CounterVec
	MCPToolInvocationsTotal       *prometheus.CounterVec
	APIAuthFailuresTotal          *prometheus.CounterVec
	GraphRAGEventBufferDepth      prometheus.Gauge
	RetentionLastSuccessTimestamp *prometheus.GaugeVec
	RetentionConsecutiveFailures  *prometheus.GaugeVec
	DBUp                          *prometheus.GaugeVec

	// --- GraphRAG overflow ---
	GraphRAGEventsDroppedTotal *prometheus.CounterVec

	// --- Async ingest pipeline (Phase 1 robustness work) ---
	// IngestPipelineQueueDepth — current queue depth, sampled on every Submit.
	// Labeled by signal so spikes can be attributed to traces vs logs.
	IngestPipelineQueueDepth *prometheus.GaugeVec
	// IngestPipelineDroppedTotal — batches that did NOT reach the DB.
	// reason="soft_backpressure" — healthy batch dropped at >=90% fullness.
	// reason="queue_full"        — batch rejected at 100% capacity (client got 429/RESOURCE_EXHAUSTED).
	IngestPipelineDroppedTotal *prometheus.CounterVec

	// HTTPOTLPThrottledTotal — count of HTTP 429s issued by the OTLP HTTP
	// receiver when the async ingest pipeline is full. Mirrors the gRPC
	// RESOURCE_EXHAUSTED path so operators see a single throttling signal
	// across both transports. Label `signal` is one of traces|logs|metrics.
	HTTPOTLPThrottledTotal *prometheus.CounterVec

	// --- DB pool (sampled every 5s from sql.DB.Stats) ---
	DBPoolOpenConnections prometheus.Gauge
	DBPoolInUse           prometheus.Gauge
	DBPoolIdle            prometheus.Gauge
	DBPoolWaitCount       prometheus.Gauge
	DBPoolWaitDuration    prometheus.Gauge // cumulative seconds

	// --- DLQ eviction (Task 8) ---
	DLQEvictedTotal      prometheus.Counter
	DLQEvictedBytesTotal prometheus.Counter

	// --- Dashboard p99 (Task 10) ---
	DashboardP99RowCapHitsTotal prometheus.Counter

	// --- Vectordb persistence ---
	// VectorSnapshotWritesTotal counts snapshot write attempts, labeled
	// {result=success|failure}. Alert on rate(failure[10m]) > 0.
	VectorSnapshotWritesTotal *prometheus.CounterVec
	// VectorSnapshotDurationSeconds is the WriteSnapshot wall-clock
	// duration. Histogram so operators can SLO p95 / p99.
	VectorSnapshotDurationSeconds prometheus.Histogram
	// VectorSnapshotSizeBytes gauges the on-disk size of the latest
	// successful snapshot. Sudden growth signals a maxSize bump or a
	// schema change worth investigating.
	VectorSnapshotSizeBytes prometheus.Gauge
	// VectorSnapshotLoadTotal counts startup snapshot loads, labeled
	// {result=success|missing|corrupt}. corrupt = magic/version/crc/decode
	// failure — caller falls back to a full DB rebuild.
	VectorSnapshotLoadTotal *prometheus.CounterVec
	// VectorReplayLogsTotal accumulates rows processed by ReplayFromDB
	// across the daemon's lifetime. The rate spikes only at startup
	// (catching the snapshot→now gap), then stays flat.
	VectorReplayLogsTotal prometheus.Counter
	// contains filtered or unexported fields
}

Metrics holds all internal Prometheus metrics for OtelContext self-monitoring.

func New

func New() *Metrics

New creates and registers all OtelContext internal metrics.

func (*Metrics) DecrementActiveConns

func (m *Metrics) DecrementActiveConns()

func (*Metrics) GetHealthStats

func (m *Metrics) GetHealthStats() HealthStats

func (*Metrics) HealthHandler

func (m *Metrics) HealthHandler() http.HandlerFunc

func (*Metrics) HealthWSHandler

func (m *Metrics) HealthWSHandler() http.HandlerFunc

HealthWSHandler returns an HTTP handler that upgrades to WebSocket and pushes HealthStats snapshots every 3 seconds. An immediate snapshot is sent on connection so the client never has to wait for the first tick.

func (*Metrics) IncrementActiveConns

func (m *Metrics) IncrementActiveConns()

func (*Metrics) ObserveDBLatency

func (m *Metrics) ObserveDBLatency(seconds float64)

func (*Metrics) ObserveIngestDuration

func (m *Metrics) ObserveIngestDuration(signal string, d time.Duration)

ObserveIngestDuration records an end-to-end OTLP Export latency for the given signal. Callers should pass time.Since(start) measured from the very start of the Export handler. Nil-safe so the OTLP servers can be wired without a Metrics instance during tests.

func (*Metrics) RecordIngestion

func (m *Metrics) RecordIngestion(count int)

func (*Metrics) RecordVectorReplayLogs

func (m *Metrics) RecordVectorReplayLogs(count int)

RecordVectorReplayLogs adds rows processed by ReplayFromDB to the lifetime counter. Called once after the startup tail-replay completes.

func (*Metrics) RecordVectorSnapshotLoad

func (m *Metrics) RecordVectorSnapshotLoad(result string)

RecordVectorSnapshotLoad is the observer hook for startup snapshot loads. result is "success", "missing" (first start, no prior file), or "corrupt" (any decode/CRC/version error → full rebuild fallback).

func (*Metrics) RecordVectorSnapshotWrite

func (m *Metrics) RecordVectorSnapshotWrite(result string, duration time.Duration, size int64)

RecordVectorSnapshotWrite is the observer hook the vectordb snapshot path calls after each WriteSnapshot attempt. result is "success" or "failure"; size is the on-disk byte count after a successful rename (zero on failure).

func (*Metrics) SampleDBPoolStats

func (m *Metrics) SampleDBPoolStats(sqlDB *sql.DB)

SampleDBPoolStats writes the live pool stats into the DBPool* gauges. Safe to call from a ticker goroutine. A nil receiver or a nil *sql.DB is a no-op so callers don't need to guard at every call site.

WaitCount and WaitDuration from sql.DBStats are cumulative values (always monotonically increasing) — operators should compute rate() over them.

func (*Metrics) SetActiveConnections

func (m *Metrics) SetActiveConnections(n int)

func (*Metrics) SetDLQSize

func (m *Metrics) SetDLQSize(n int)

func (*Metrics) StartRuntimeMetrics

func (m *Metrics) StartRuntimeMetrics()

StartRuntimeMetrics samples Go runtime stats every 15 seconds.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL