Documentation
¶
Overview ¶
Package metrics provides Prometheus metrics for PATH gateway observability. These metrics are designed based on how_metrics_should_work.md to provide domain-centric, actionable insights into gateway performance.
Index ¶
- Constants
- Variables
- func GetLatencySignal(latencyMs float64) string
- func GetLatencySignalWithThresholds(latencyMs float64, thresholds *LatencyThresholds) string
- func GetStatusCodeCategory(statusCode int) string
- func NormalizeRPCType(rpcType string) string
- func RecordBatchSize(rpcType, serviceID string, batchCount int, latencySeconds float64)
- func RecordHealthCheck(domain, rpcType, serviceID, healthCheckName, reputationSignal string)
- func RecordLatencyReputation(domain, rpcType, serviceID, latencySignal string)
- func RecordObservation(domain, rpcType, serviceID, networkType, method, reputationSignal string)
- func RecordProbationEvent(domain, rpcType, serviceID, event string)
- func RecordRPCTypeFallback(domain, supplier, serviceID, requestedRPCType, fallbackRPCType string)
- func RecordRelay(domain, rpcType, serviceID, statusCode, reputationSignal, relayType string, ...)
- func RecordRequest(domain, rpcType, serviceID, statusCode string, latencySeconds float64)
- func RecordRequestSize(rpcType, serviceID string, bytesReceived, bytesSent int64)
- func RecordRetryDistribution(domain, rpcType, serviceID, reason string)
- func RecordRetryResult(rpcType, serviceID, retryCount, result string, latencySeconds float64)
- func RecordSupplierBlacklist(domain, supplier, serviceID, reason string)
- func RecordSupplierNilPubkey(supplier string)
- func RecordSupplierPubkeyCacheInvalidated(supplier string)
- func RecordSupplierPubkeyRecovered(supplier string)
- func RecordWebsocketConnectionClosed(domain, serviceID string, durationSeconds float64)
- func RecordWebsocketConnectionEstablished(domain, serviceID string)
- func RecordWebsocketConnectionFailed(domain, serviceID string)
- func RecordWebsocketMessage(domain, serviceID, direction, reputationSignal string)
- func ServePprof(ctx context.Context, logger polylog.Logger, addr string)
- func SetMeanScore(domain, serviceID, rpcType string, score float64)
- type EndpointLeaderboardEntry
- type LatencyThresholds
- type LeaderboardDataProvider
- type LeaderboardPublisher
- type MeanScoreEntry
- type PrometheusMetricsReporter
Constants ¶
const ( // MetricPrefix prefix for all PATH metrics MetricPrefix = "path_" LabelDomain = "domain" LabelRPCType = "rpc_type" LabelServiceID = "service_id" LabelTierThreshold = "tier_threshold" LabelSessionStartHeight = "session_start_height" LabelHealthCheckName = "health_check_name" LabelReputationSignal = "reputation_signal" LabelNetworkType = "network_type" LabelMethod = "method" LabelLatencySignal = "latency_signal" LabelStatusCode = "status_code" LabelRetryReason = "reason" LabelRetryCount = "retry_count" LabelResult = "result" LabelBatchCount = "batch_count" LabelSupplier = "supplier" LatencySignalCheetah = "Cheetah" LatencySignalGazelle = "Gazelle" LatencySignalRabbit = "Rabbit" LatencySignalTurtle = "Turtle" LatencySignalSnail = "Snail" SignalOK = "ok" SignalSlow = "slow" SignalSlowASF = "slow_asf" SignalMinorError = "minor_error" SignalMajorError = "major_error" SignalCriticalError = "critical_error" SignalFatalError = "fatal_error" NetworkTypeEVM = "evm" NetworkTypeCosmos = "cosmos" NetworkTypeSolana = "solana" NetworkTypePassthrough = "passthrough" RetryReason5xx = "retry_on_5xx" RetryReasonTimeout = "retry_on_timeout" RetryReasonConnection = "retry_on_connection" RetryResultSuccess = "success" RetryResultFailure = "failure" LabelProbationEvent = "event" ProbationEventEntered = "entered" ProbationEventExited = "exited" ProbationEventRouted = "routed" )
const ( BlacklistReasonSignatureError = "signature_error" BlacklistReasonValidationError = "validation_error" BlacklistReasonUnmarshalError = "unmarshal_error" BlacklistReasonPubKeyError = "pubkey_error" BlacklistReasonNilPubKey = "nil_pubkey" )
Blacklist reason constants
const ( PubkeyCacheEventInvalidated = "invalidated" PubkeyCacheEventRecovered = "recovered" )
Pubkey cache event constants
const ( RelayTypeNormal = "normal" RelayTypeHealthCheck = "health_check" RelayTypeProbation = "probation" )
const ( WSEventEstablished = "established" WSEventClosed = "closed" WSEventFailed = "failed" WSDirectionClientToEndpoint = "client_to_endpoint" WSDirectionEndpointToClient = "endpoint_to_client" )
const ( // LeaderboardPublishInterval is how often the endpoint leaderboard is published LeaderboardPublishInterval = 10 * time.Second )
Variables ¶
var BatchItemsTotal = promauto.NewCounterVec( prometheus.CounterOpts{ Name: MetricPrefix + "batch_items_total", Help: "Total items across all batch requests by service.", }, []string{LabelServiceID}, )
BatchItemsTotal counts the total items across all batch requests per service. Used with BatchRequestsTotal to calculate average batch size: items/requests
var BatchRequestsTotal = promauto.NewCounterVec( prometheus.CounterOpts{ Name: MetricPrefix + "batch_requests_total", Help: "Total number of batch requests by service.", }, []string{LabelServiceID}, )
BatchRequestsTotal counts the number of batch requests per service. Used with BatchItemsTotal to calculate average batch size: items/requests
var BatchSizeLatency = promauto.NewHistogramVec( prometheus.HistogramOpts{ Name: MetricPrefix + "batch_size_latency_seconds", Help: "Batch request latency in seconds by rpc_type, service_id, and batch_count.", Buckets: []float64{0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60}, }, []string{LabelRPCType, LabelServiceID, LabelBatchCount}, )
var BatchSizeTotal = promauto.NewCounterVec( prometheus.CounterOpts{ Name: MetricPrefix + "batch_size_total", Help: "Batch requests by rpc_type, service_id, and batch_count.", }, []string{LabelRPCType, LabelServiceID, LabelBatchCount}, )
var HealthCheckStatus = promauto.NewCounterVec( prometheus.CounterOpts{ Name: MetricPrefix + "health_check_status_total", Help: "Health check results by domain, rpc_type, service_id, health_check_name, and reputation_signal.", }, []string{LabelDomain, LabelRPCType, LabelServiceID, LabelHealthCheckName, LabelReputationSignal}, )
var LatencyReputation = promauto.NewCounterVec( prometheus.CounterOpts{ Name: MetricPrefix + "latency_reputation_total", Help: "Latency categorization by domain, rpc_type, service_id, and latency_signal (Cheetah/Gazelle/Rabbit/Turtle/Snail).", }, []string{LabelDomain, LabelRPCType, LabelServiceID, LabelLatencySignal}, )
var ObservationPipeline = promauto.NewCounterVec( prometheus.CounterOpts{ Name: MetricPrefix + "observation_pipeline_total", Help: "Observation pipeline events by domain, rpc_type, service_id, network_type, method, and reputation_signal.", }, []string{LabelDomain, LabelRPCType, LabelServiceID, LabelNetworkType, LabelMethod, LabelReputationSignal}, )
var ProbationEventsTotal = promauto.NewCounterVec( prometheus.CounterOpts{ Name: MetricPrefix + "probation_events_total", Help: "Probation events by domain, rpc_type, service_id, and event type (entered/exited/routed).", }, []string{LabelDomain, LabelRPCType, LabelServiceID, LabelProbationEvent}, )
var RPCTypeFallbackTotal = promauto.NewCounterVec( prometheus.CounterOpts{ Name: MetricPrefix + "rpc_type_fallback_total", Help: "Count of RPC type fallbacks when supplier doesn't support requested RPC type.", }, []string{LabelDomain, LabelSupplier, LabelServiceID, "requested_rpc_type", "fallback_rpc_type"}, )
var RelayLatency = promauto.NewHistogramVec( prometheus.HistogramOpts{ Name: MetricPrefix + "relay_latency_seconds", Help: "Outgoing relay latency in seconds by domain, rpc_type, service_id, status_code, reputation_signal, and request_type.", Buckets: []float64{0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30}, }, []string{LabelDomain, LabelRPCType, LabelServiceID, LabelStatusCode, LabelReputationSignal, "request_type"}, )
var RelaysTotal = promauto.NewCounterVec( prometheus.CounterOpts{ Name: MetricPrefix + "relays_total", Help: "Total outgoing relays to suppliers by domain, rpc_type, service_id, status_code, reputation_signal, and request_type.", }, []string{LabelDomain, LabelRPCType, LabelServiceID, LabelStatusCode, LabelReputationSignal, "request_type"}, )
var ReputationEndpointLeaderboard = promauto.NewGaugeVec( prometheus.GaugeOpts{ Name: MetricPrefix + "reputation_endpoint_leaderboard", Help: "Number of endpoints grouped by domain, rpc_type, service_id, tier_threshold, and session_start_height. Published every 10s as a leaderboard snapshot.", }, []string{LabelDomain, LabelRPCType, LabelServiceID, LabelTierThreshold, LabelSessionStartHeight}, )
var ReputationMeanScore = promauto.NewGaugeVec( prometheus.GaugeOpts{ Name: MetricPrefix + "reputation_mean_score", Help: "Mean reputation score by domain, service_id, and rpc_type.", }, []string{LabelDomain, LabelServiceID, LabelRPCType}, )
var RequestBytesReceived = promauto.NewCounterVec( prometheus.CounterOpts{ Name: MetricPrefix + "request_bytes_received_total", Help: "Total bytes received in requests by rpc_type and service_id.", }, []string{LabelRPCType, LabelServiceID}, )
var RequestLatency = promauto.NewHistogramVec( prometheus.HistogramOpts{ Name: MetricPrefix + "request_latency_seconds", Help: "Request latency in seconds by domain, rpc_type, service_id, and status_code.", Buckets: []float64{0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10}, }, []string{LabelDomain, LabelRPCType, LabelServiceID, LabelStatusCode}, )
var RequestsTotal = promauto.NewCounterVec( prometheus.CounterOpts{ Name: MetricPrefix + "requests_total", Help: "Total requests by domain, rpc_type, service_id, and status_code.", }, []string{LabelDomain, LabelRPCType, LabelServiceID, LabelStatusCode}, )
var ResponseBytesSent = promauto.NewCounterVec( prometheus.CounterOpts{ Name: MetricPrefix + "response_bytes_sent_total", Help: "Total bytes sent in responses by rpc_type and service_id.", }, []string{LabelRPCType, LabelServiceID}, )
var RetriesDistribution = promauto.NewCounterVec( prometheus.CounterOpts{ Name: MetricPrefix + "retries_distribution_total", Help: "Retry events by domain, rpc_type, service_id, and reason (retry_on_5xx/retry_on_timeout/retry_on_connection).", }, []string{LabelDomain, LabelRPCType, LabelServiceID, LabelRetryReason}, )
var RetryResultsLatency = promauto.NewHistogramVec( prometheus.HistogramOpts{ Name: MetricPrefix + "retry_results_latency_seconds", Help: "Retry result latency in seconds by rpc_type, service_id, retry_count, and result.", Buckets: []float64{0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30}, }, []string{LabelRPCType, LabelServiceID, LabelRetryCount, LabelResult}, )
var RetryResultsTotal = promauto.NewCounterVec( prometheus.CounterOpts{ Name: MetricPrefix + "retry_results_total", Help: "Retry results by rpc_type, service_id, retry_count, and result (success/failure).", }, []string{LabelRPCType, LabelServiceID, LabelRetryCount, LabelResult}, )
var SupplierBlacklistTotal = promauto.NewCounterVec( prometheus.CounterOpts{ Name: MetricPrefix + "supplier_blacklist_total", Help: "Suppliers blacklisted by domain, supplier address, service_id, and reason.", }, []string{LabelDomain, LabelSupplier, LabelServiceID, "reason"}, )
var SupplierNilPubkeyTotal = promauto.NewCounterVec( prometheus.CounterOpts{ Name: MetricPrefix + "supplier_nil_pubkey_total", Help: "Count of times a supplier was found with nil public key (hasn't signed first tx).", }, []string{LabelSupplier}, )
var SupplierPubkeyCacheEvents = promauto.NewCounterVec( prometheus.CounterOpts{ Name: MetricPrefix + "supplier_pubkey_cache_events_total", Help: "Supplier pubkey cache events: invalidated (on signature failure), recovered (nil -> valid).", }, []string{LabelSupplier, "event"}, )
var WebsocketConnectionDuration = promauto.NewHistogramVec( prometheus.HistogramOpts{ Name: MetricPrefix + "websocket_connection_duration_seconds", Help: "WebSocket connection duration in seconds by domain and service_id.", Buckets: []float64{1, 5, 10, 30, 60, 300, 600, 1800, 3600}, }, []string{LabelDomain, LabelServiceID}, )
var WebsocketConnectionEventsTotal = promauto.NewCounterVec( prometheus.CounterOpts{ Name: MetricPrefix + "websocket_connection_events_total", Help: "WebSocket connection events by domain, service_id, and event type (established/closed/failed).", }, []string{LabelDomain, LabelServiceID, "event"}, )
var WebsocketConnectionsActive = promauto.NewGaugeVec( prometheus.GaugeOpts{ Name: MetricPrefix + "websocket_connections_active", Help: "Current active WebSocket connections by domain and service_id.", }, []string{LabelDomain, LabelServiceID}, )
var WebsocketMessagesTotal = promauto.NewCounterVec( prometheus.CounterOpts{ Name: MetricPrefix + "websocket_messages_total", Help: "WebSocket messages by domain, service_id, direction (client_to_endpoint/endpoint_to_client), and reputation_signal.", }, []string{LabelDomain, LabelServiceID, "direction", LabelReputationSignal}, )
Functions ¶
func GetLatencySignal ¶ added in v1.0.10
GetLatencySignal converts latency in milliseconds to a latency signal category. Uses default fixed thresholds. Use GetLatencySignalWithThresholds for per-service thresholds.
func GetLatencySignalWithThresholds ¶ added in v1.0.10
func GetLatencySignalWithThresholds(latencyMs float64, thresholds *LatencyThresholds) string
GetLatencySignalWithThresholds converts latency to a signal based on the provided thresholds. If thresholds is nil, the default fixed thresholds are used.
func GetStatusCodeCategory ¶ added in v1.0.10
GetStatusCodeCategory returns the status code as a string, grouping 4xx and 5xx
func NormalizeRPCType ¶ added in v1.0.10
NormalizeRPCType converts an RPC type string to lowercase format for consistent metric labels. Accepts both protobuf enum string format ("JSON_RPC") and snake_case format ("json_rpc"). Returns lowercase snake_case (e.g., "json_rpc", "rest", "comet_bft", "websocket").
func RecordBatchSize ¶ added in v1.0.10
RecordBatchSize records a batch request with latency
func RecordHealthCheck ¶ added in v1.0.10
func RecordHealthCheck(domain, rpcType, serviceID, healthCheckName, reputationSignal string)
RecordHealthCheck records a health check result
func RecordLatencyReputation ¶ added in v1.0.10
func RecordLatencyReputation(domain, rpcType, serviceID, latencySignal string)
RecordLatencyReputation records a latency categorization
func RecordObservation ¶ added in v1.0.10
func RecordObservation(domain, rpcType, serviceID, networkType, method, reputationSignal string)
RecordObservation records an observation pipeline event
func RecordProbationEvent ¶ added in v1.0.10
func RecordProbationEvent(domain, rpcType, serviceID, event string)
RecordProbationEvent records a probation event (entered, exited, or routed)
func RecordRPCTypeFallback ¶ added in v1.0.10
func RecordRPCTypeFallback(domain, supplier, serviceID, requestedRPCType, fallbackRPCType string)
RecordRPCTypeFallback records when a supplier doesn't support the requested RPC type and a fallback RPC type is used instead
func RecordRelay ¶ added in v1.0.10
func RecordRelay(domain, rpcType, serviceID, statusCode, reputationSignal, relayType string, latencySeconds float64)
RecordRelay records an outgoing relay to a supplier endpoint with latency. relayType should be one of: RelayTypeNormal, RelayTypeHealthCheck, RelayTypeProbation. statusCode should be the HTTP status code category (2xx, 4xx, 5xx, etc.). reputationSignal should be the signal recorded (ok, minor_error, major_error, etc.).
func RecordRequest ¶ added in v1.0.10
RecordRequest records a request with status code and latency
func RecordRequestSize ¶ added in v1.0.10
RecordRequestSize records request and response sizes
func RecordRetryDistribution ¶ added in v1.0.10
func RecordRetryDistribution(domain, rpcType, serviceID, reason string)
RecordRetryDistribution records why a retry happened
func RecordRetryResult ¶ added in v1.0.10
RecordRetryResult records a retry outcome with latency
func RecordSupplierBlacklist ¶ added in v1.0.10
func RecordSupplierBlacklist(domain, supplier, serviceID, reason string)
RecordSupplierBlacklist records a supplier being blacklisted with the specific reason. reason should be one of the BlacklistReason* constants.
func RecordSupplierNilPubkey ¶ added in v1.0.10
func RecordSupplierNilPubkey(supplier string)
RecordSupplierNilPubkey records when a supplier is found with a nil public key. This indicates the supplier account exists but hasn't signed its first transaction yet.
func RecordSupplierPubkeyCacheInvalidated ¶ added in v1.0.10
func RecordSupplierPubkeyCacheInvalidated(supplier string)
RecordSupplierPubkeyCacheInvalidated records when a supplier's pubkey cache entry is invalidated due to signature verification failure.
func RecordSupplierPubkeyRecovered ¶ added in v1.0.10
func RecordSupplierPubkeyRecovered(supplier string)
RecordSupplierPubkeyRecovered records when a supplier's pubkey transitions from nil to valid (they signed their first transaction).
func RecordWebsocketConnectionClosed ¶ added in v1.0.10
RecordWebsocketConnectionClosed records a WebSocket connection closure and its duration, and decrements the active connection count.
func RecordWebsocketConnectionEstablished ¶ added in v1.0.10
func RecordWebsocketConnectionEstablished(domain, serviceID string)
RecordWebsocketConnectionEstablished records a successful WebSocket connection establishment and increments the active connection count
func RecordWebsocketConnectionFailed ¶ added in v1.0.10
func RecordWebsocketConnectionFailed(domain, serviceID string)
RecordWebsocketConnectionFailed records a WebSocket connection failure
func RecordWebsocketMessage ¶ added in v1.0.10
func RecordWebsocketMessage(domain, serviceID, direction, reputationSignal string)
RecordWebsocketMessage records a WebSocket message. direction should be WSDirectionClientToEndpoint or WSDirectionEndpointToClient.
func ServePprof ¶
ServePprof starts a pprof server on the given address.
func SetMeanScore ¶ added in v1.0.10
SetMeanScore sets the mean reputation score for a domain/service/rpc_type combination
Types ¶
type EndpointLeaderboardEntry ¶ added in v1.0.10
type EndpointLeaderboardEntry struct {
Domain string
RPCType string
ServiceID string
TierThreshold int // The tier threshold (e.g., 70, 50, 30)
SessionStartHeight int64 // The session start height
EndpointCount int // Number of endpoints in this group
}
EndpointLeaderboardEntry represents a single entry in the leaderboard snapshot
type LatencyThresholds ¶ added in v1.0.10
type LatencyThresholds struct {
FastMs float64 // <= this = Cheetah
NormalMs float64 // <= this = Gazelle
SlowMs float64 // <= this = Rabbit
SevereMs float64 // <= this = Turtle
}
LatencyThresholds defines thresholds for latency signal categorization. These should be derived from per-service LatencyConfig.
func DefaultLatencyThresholds ¶ added in v1.0.10
func DefaultLatencyThresholds() *LatencyThresholds
DefaultLatencyThresholds returns default thresholds when no per-service config is available.
type LeaderboardDataProvider ¶ added in v1.0.10
type LeaderboardDataProvider interface {
// GetEndpointLeaderboardData returns all endpoint entries grouped by the required dimensions
GetEndpointLeaderboardData(ctx context.Context) ([]EndpointLeaderboardEntry, error)
// GetMeanScoreData returns mean reputation scores per domain/service/rpc_type
GetMeanScoreData(ctx context.Context) ([]MeanScoreEntry, error)
}
LeaderboardDataProvider is an interface for getting endpoint distribution data and mean reputation score data.
type LeaderboardPublisher ¶ added in v1.0.10
type LeaderboardPublisher struct {
// contains filtered or unexported fields
}
LeaderboardPublisher publishes endpoint leaderboard metrics every 10 seconds
func NewLeaderboardPublisher ¶ added in v1.0.10
func NewLeaderboardPublisher(logger polylog.Logger, provider LeaderboardDataProvider) *LeaderboardPublisher
NewLeaderboardPublisher creates a new leaderboard publisher
func (*LeaderboardPublisher) PublishOnce ¶ added in v1.0.10
func (lp *LeaderboardPublisher) PublishOnce(ctx context.Context)
PublishOnce can be called to manually trigger a leaderboard publish (for testing)
func (*LeaderboardPublisher) Start ¶ added in v1.0.10
func (lp *LeaderboardPublisher) Start(ctx context.Context) error
Start begins the periodic leaderboard publishing
func (*LeaderboardPublisher) Stop ¶ added in v1.0.10
func (lp *LeaderboardPublisher) Stop()
Stop stops the leaderboard publisher
type MeanScoreEntry ¶ added in v1.0.10
type MeanScoreEntry struct {
Domain string
ServiceID string
RPCType string
MeanScore float64 // Average score across all endpoints for this combination
}
MeanScoreEntry represents mean score for a domain/service/rpc_type combination
type PrometheusMetricsReporter ¶
PrometheusMetricsReporter provides the functionality required for exporting PATH metrics to Grafana.
func (*PrometheusMetricsReporter) Publish ¶
func (pmr *PrometheusMetricsReporter) Publish(observations *observation.RequestResponseObservations)
Publish exports service request and response metrics to Prometheus/Grafana. Implements the gateway.RequestResponseReporter interface. Records metrics as defined in how_metrics_should_work.md.
func (*PrometheusMetricsReporter) ServeMetrics ¶
func (pmr *PrometheusMetricsReporter) ServeMetrics(addr string) error
ServeMetrics starts a metrics server on the given address.