metrics

package
v1.0.14 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jan 22, 2026 License: MIT Imports: 16 Imported by: 0

Documentation

Overview

Package metrics provides Prometheus metrics for PATH gateway observability. These metrics are designed based on how_metrics_should_work.md to provide domain-centric, actionable insights into gateway performance.

Index

Constants

View Source
const (
	// MetricPrefix is the prefix prepended to the name of every PATH metric
	// declared in this package.
	MetricPrefix = "path_"

	// Label names shared by the metric vectors in this package.
	LabelDomain             = "domain"
	LabelRPCType            = "rpc_type"
	LabelServiceID          = "service_id"
	LabelTierThreshold      = "tier_threshold"
	LabelSessionStartHeight = "session_start_height"
	LabelHealthCheckName    = "health_check_name"
	LabelReputationSignal   = "reputation_signal"
	LabelNetworkType        = "network_type"
	LabelMethod             = "method"
	LabelLatencySignal      = "latency_signal"
	LabelStatusCode         = "status_code"
	LabelRetryReason        = "reason"
	LabelRetryCount         = "retry_count"
	LabelResult             = "result"
	LabelBatchCount         = "batch_count"
	LabelSupplier           = "supplier"

	// Values for the latency_signal label.
	LatencySignalCheetah = "Cheetah"
	LatencySignalGazelle = "Gazelle"
	LatencySignalRabbit  = "Rabbit"
	LatencySignalTurtle  = "Turtle"
	LatencySignalSnail   = "Snail"

	// Signal values; presumably the values carried by the reputation_signal
	// label — confirm against the reputation package.
	SignalOK            = "ok"
	SignalSlow          = "slow"
	SignalSlowASF       = "slow_asf"
	SignalMinorError    = "minor_error"
	SignalMajorError    = "major_error"
	SignalCriticalError = "critical_error"
	SignalFatalError    = "fatal_error"

	// Network type values; presumably the values carried by the network_type
	// label — confirm against callers.
	NetworkTypeEVM         = "evm"
	NetworkTypeCosmos      = "cosmos"
	NetworkTypeSolana      = "solana"
	NetworkTypePassthrough = "passthrough"

	// Values for the reason label on the retry distribution metric.
	RetryReason5xx        = "retry_on_5xx"
	RetryReasonTimeout    = "retry_on_timeout"
	RetryReasonConnection = "retry_on_connection"

	// Values for the result label on the retry result metrics.
	RetryResultSuccess = "success"
	RetryResultFailure = "failure"

	// LabelProbationEvent is the label name carrying the probation event type.
	LabelProbationEvent = "event"

	// Values for the probation event label.
	ProbationEventEntered = "entered"
	ProbationEventExited  = "exited"
	ProbationEventRouted  = "routed"
)
View Source
// Blacklist reason constants: the expected values for the reason argument of
// RecordSupplierBlacklist.
const (
	BlacklistReasonSignatureError  = "signature_error"
	BlacklistReasonValidationError = "validation_error"
	BlacklistReasonUnmarshalError  = "unmarshal_error"
	BlacklistReasonPubKeyError     = "pubkey_error"
	BlacklistReasonNilPubKey       = "nil_pubkey"
)

Blacklist reason constants

View Source
// Pubkey cache event constants: event types for the supplier pubkey cache
// events metric (invalidated on signature failure, recovered on nil -> valid).
const (
	PubkeyCacheEventInvalidated = "invalidated"
	PubkeyCacheEventRecovered   = "recovered"
)

Pubkey cache event constants

View Source
// Relay type constants: the expected values for the relayType argument of
// RecordRelay.
const (
	RelayTypeNormal      = "normal"
	RelayTypeHealthCheck = "health_check"
	RelayTypeProbation   = "probation"
)
View Source
// WebSocket metric constants.
const (
	// Event types for the WebSocket connection events metric.
	WSEventEstablished = "established"
	WSEventClosed      = "closed"
	WSEventFailed      = "failed"

	// Direction values for the WebSocket messages metric.
	WSDirectionClientToEndpoint = "client_to_endpoint"
	WSDirectionEndpointToClient = "endpoint_to_client"
)
View Source
const (
	// LeaderboardPublishInterval is how often the endpoint leaderboard is
	// published (every 10 seconds).
	LeaderboardPublishInterval = 10 * time.Second
)

Variables

View Source
// BatchItemsTotal counts the total items across all batch requests, labeled
// by service_id.
var BatchItemsTotal = promauto.NewCounterVec(
	prometheus.CounterOpts{
		Name: MetricPrefix + "batch_items_total",
		Help: "Total items across all batch requests by service.",
	},
	[]string{LabelServiceID},
)

BatchItemsTotal counts the total items across all batch requests per service. Used with BatchRequestsTotal to calculate average batch size: items/requests

View Source
// BatchRequestsTotal counts the number of batch requests, labeled by
// service_id.
var BatchRequestsTotal = promauto.NewCounterVec(
	prometheus.CounterOpts{
		Name: MetricPrefix + "batch_requests_total",
		Help: "Total number of batch requests by service.",
	},
	[]string{LabelServiceID},
)

BatchRequestsTotal counts the number of batch requests per service. Used with BatchItemsTotal to calculate average batch size: items/requests

View Source
// BatchSizeLatency is a histogram of batch request latency in seconds,
// labeled by rpc_type, service_id, and batch_count. Buckets span 0.1s to 60s.
// NOTE(review): batch_count is a label, so each distinct batch size creates
// its own series — confirm the value is bounded or bucketed by the caller.
var BatchSizeLatency = promauto.NewHistogramVec(
	prometheus.HistogramOpts{
		Name:    MetricPrefix + "batch_size_latency_seconds",
		Help:    "Batch request latency in seconds by rpc_type, service_id, and batch_count.",
		Buckets: []float64{0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60},
	},
	[]string{LabelRPCType, LabelServiceID, LabelBatchCount},
)
View Source
// BatchSizeTotal counts batch requests, labeled by rpc_type, service_id, and
// batch_count.
var BatchSizeTotal = promauto.NewCounterVec(
	prometheus.CounterOpts{
		Name: MetricPrefix + "batch_size_total",
		Help: "Batch requests by rpc_type, service_id, and batch_count.",
	},
	[]string{LabelRPCType, LabelServiceID, LabelBatchCount},
)
View Source
// HealthCheckStatus counts health check results, labeled by domain, rpc_type,
// service_id, health_check_name, and reputation_signal.
var HealthCheckStatus = promauto.NewCounterVec(
	prometheus.CounterOpts{
		Name: MetricPrefix + "health_check_status_total",
		Help: "Health check results by domain, rpc_type, service_id, health_check_name, and reputation_signal.",
	},
	[]string{LabelDomain, LabelRPCType, LabelServiceID, LabelHealthCheckName, LabelReputationSignal},
)
View Source
// LatencyReputation counts latency categorizations, labeled by domain,
// rpc_type, service_id, and latency_signal (one of the LatencySignal* values).
var LatencyReputation = promauto.NewCounterVec(
	prometheus.CounterOpts{
		Name: MetricPrefix + "latency_reputation_total",
		Help: "Latency categorization by domain, rpc_type, service_id, and latency_signal (Cheetah/Gazelle/Rabbit/Turtle/Snail).",
	},
	[]string{LabelDomain, LabelRPCType, LabelServiceID, LabelLatencySignal},
)
View Source
// ObservationPipeline counts observation pipeline events, labeled by domain,
// rpc_type, service_id, network_type, method, and reputation_signal.
// NOTE(review): method is a label — confirm its value set is bounded.
var ObservationPipeline = promauto.NewCounterVec(
	prometheus.CounterOpts{
		Name: MetricPrefix + "observation_pipeline_total",
		Help: "Observation pipeline events by domain, rpc_type, service_id, network_type, method, and reputation_signal.",
	},
	[]string{LabelDomain, LabelRPCType, LabelServiceID, LabelNetworkType, LabelMethod, LabelReputationSignal},
)
View Source
// ProbationEventsTotal counts probation events, labeled by domain, rpc_type,
// service_id, and event type (entered/exited/routed).
var ProbationEventsTotal = promauto.NewCounterVec(
	prometheus.CounterOpts{
		Name: MetricPrefix + "probation_events_total",
		Help: "Probation events by domain, rpc_type, service_id, and event type (entered/exited/routed).",
	},
	[]string{LabelDomain, LabelRPCType, LabelServiceID, LabelProbationEvent},
)
View Source
// RPCTypeFallbackTotal counts RPC type fallbacks that occur when a supplier
// doesn't support the requested RPC type. Labeled by domain, supplier,
// service_id, requested_rpc_type, and fallback_rpc_type.
var RPCTypeFallbackTotal = promauto.NewCounterVec(
	prometheus.CounterOpts{
		Name: MetricPrefix + "rpc_type_fallback_total",
		Help: "Count of RPC type fallbacks when supplier doesn't support requested RPC type.",
	},
	[]string{LabelDomain, LabelSupplier, LabelServiceID, "requested_rpc_type", "fallback_rpc_type"},
)
View Source
// RelayLatency is a histogram of outgoing relay latency in seconds, labeled
// by domain, rpc_type, service_id, status_code, reputation_signal, and
// request_type. Buckets span 0.05s to 30s.
// NOTE(review): the request_type label presumably carries the RelayType*
// values — confirm against RecordRelay.
var RelayLatency = promauto.NewHistogramVec(
	prometheus.HistogramOpts{
		Name:    MetricPrefix + "relay_latency_seconds",
		Help:    "Outgoing relay latency in seconds by domain, rpc_type, service_id, status_code, reputation_signal, and request_type.",
		Buckets: []float64{0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30},
	},
	[]string{LabelDomain, LabelRPCType, LabelServiceID, LabelStatusCode, LabelReputationSignal, "request_type"},
)
View Source
// RelaysTotal counts outgoing relays to suppliers, labeled by domain,
// rpc_type, service_id, status_code, reputation_signal, and request_type.
// NOTE(review): the request_type label presumably carries the RelayType*
// values — confirm against RecordRelay.
var RelaysTotal = promauto.NewCounterVec(
	prometheus.CounterOpts{
		Name: MetricPrefix + "relays_total",
		Help: "Total outgoing relays to suppliers by domain, rpc_type, service_id, status_code, reputation_signal, and request_type.",
	},
	[]string{LabelDomain, LabelRPCType, LabelServiceID, LabelStatusCode, LabelReputationSignal, "request_type"},
)
View Source
// ReputationEndpointLeaderboard is a gauge of the number of endpoints grouped
// by domain, rpc_type, service_id, tier_threshold, and session_start_height.
// NOTE(review): session_start_height is a label, so new series appear as
// sessions roll over — confirm stale series are reset or deleted by the
// publisher.
var ReputationEndpointLeaderboard = promauto.NewGaugeVec(
	prometheus.GaugeOpts{
		Name: MetricPrefix + "reputation_endpoint_leaderboard",
		Help: "Number of endpoints grouped by domain, rpc_type, service_id, tier_threshold, and session_start_height. Published every 10s as a leaderboard snapshot.",
	},
	[]string{LabelDomain, LabelRPCType, LabelServiceID, LabelTierThreshold, LabelSessionStartHeight},
)
View Source
// ReputationMeanScore is a gauge of the mean reputation score, labeled by
// domain, service_id, and rpc_type (in that order).
var ReputationMeanScore = promauto.NewGaugeVec(
	prometheus.GaugeOpts{
		Name: MetricPrefix + "reputation_mean_score",
		Help: "Mean reputation score by domain, service_id, and rpc_type.",
	},
	[]string{LabelDomain, LabelServiceID, LabelRPCType},
)
View Source
// RequestBytesReceived counts the total bytes received in requests, labeled
// by rpc_type and service_id.
var RequestBytesReceived = promauto.NewCounterVec(
	prometheus.CounterOpts{
		Name: MetricPrefix + "request_bytes_received_total",
		Help: "Total bytes received in requests by rpc_type and service_id.",
	},
	[]string{LabelRPCType, LabelServiceID},
)
View Source
// RequestLatency is a histogram of request latency in seconds, labeled by
// domain, rpc_type, service_id, and status_code. Buckets span 0.01s to 10s.
var RequestLatency = promauto.NewHistogramVec(
	prometheus.HistogramOpts{
		Name:    MetricPrefix + "request_latency_seconds",
		Help:    "Request latency in seconds by domain, rpc_type, service_id, and status_code.",
		Buckets: []float64{0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10},
	},
	[]string{LabelDomain, LabelRPCType, LabelServiceID, LabelStatusCode},
)
View Source
// RequestsTotal counts requests, labeled by domain, rpc_type, service_id, and
// status_code.
var RequestsTotal = promauto.NewCounterVec(
	prometheus.CounterOpts{
		Name: MetricPrefix + "requests_total",
		Help: "Total requests by domain, rpc_type, service_id, and status_code.",
	},
	[]string{LabelDomain, LabelRPCType, LabelServiceID, LabelStatusCode},
)
View Source
// ResponseBytesSent counts the total bytes sent in responses, labeled by
// rpc_type and service_id.
var ResponseBytesSent = promauto.NewCounterVec(
	prometheus.CounterOpts{
		Name: MetricPrefix + "response_bytes_sent_total",
		Help: "Total bytes sent in responses by rpc_type and service_id.",
	},
	[]string{LabelRPCType, LabelServiceID},
)
View Source
// RetriesDistribution counts retry events, labeled by domain, rpc_type,
// service_id, and reason (one of the RetryReason* values).
var RetriesDistribution = promauto.NewCounterVec(
	prometheus.CounterOpts{
		Name: MetricPrefix + "retries_distribution_total",
		Help: "Retry events by domain, rpc_type, service_id, and reason (retry_on_5xx/retry_on_timeout/retry_on_connection).",
	},
	[]string{LabelDomain, LabelRPCType, LabelServiceID, LabelRetryReason},
)
View Source
// RetryResultsLatency is a histogram of retry result latency in seconds,
// labeled by rpc_type, service_id, retry_count, and result. Buckets span 0.1s
// to 30s.
var RetryResultsLatency = promauto.NewHistogramVec(
	prometheus.HistogramOpts{
		Name:    MetricPrefix + "retry_results_latency_seconds",
		Help:    "Retry result latency in seconds by rpc_type, service_id, retry_count, and result.",
		Buckets: []float64{0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30},
	},
	[]string{LabelRPCType, LabelServiceID, LabelRetryCount, LabelResult},
)
View Source
// RetryResultsTotal counts retry results, labeled by rpc_type, service_id,
// retry_count, and result (success/failure).
var RetryResultsTotal = promauto.NewCounterVec(
	prometheus.CounterOpts{
		Name: MetricPrefix + "retry_results_total",
		Help: "Retry results by rpc_type, service_id, retry_count, and result (success/failure).",
	},
	[]string{LabelRPCType, LabelServiceID, LabelRetryCount, LabelResult},
)
View Source
// SupplierBlacklistTotal counts supplier blacklist events, labeled by domain,
// supplier address, service_id, and reason.
// NOTE(review): supplier is a per-address label — confirm the number of
// distinct suppliers keeps series cardinality acceptable.
var SupplierBlacklistTotal = promauto.NewCounterVec(
	prometheus.CounterOpts{
		Name: MetricPrefix + "supplier_blacklist_total",
		Help: "Suppliers blacklisted by domain, supplier address, service_id, and reason.",
	},
	[]string{LabelDomain, LabelSupplier, LabelServiceID, "reason"},
)
View Source
// SupplierNilPubkeyTotal counts the times a supplier was found with a nil
// public key (it hasn't signed its first tx), labeled by supplier.
var SupplierNilPubkeyTotal = promauto.NewCounterVec(
	prometheus.CounterOpts{
		Name: MetricPrefix + "supplier_nil_pubkey_total",
		Help: "Count of times a supplier was found with nil public key (hasn't signed first tx).",
	},
	[]string{LabelSupplier},
)
View Source
// SupplierPubkeyCacheEvents counts supplier pubkey cache events, labeled by
// supplier and event: invalidated (on signature failure) or recovered
// (nil -> valid).
var SupplierPubkeyCacheEvents = promauto.NewCounterVec(
	prometheus.CounterOpts{
		Name: MetricPrefix + "supplier_pubkey_cache_events_total",
		Help: "Supplier pubkey cache events: invalidated (on signature failure), recovered (nil -> valid).",
	},
	[]string{LabelSupplier, "event"},
)
View Source
// WebsocketConnectionDuration is a histogram of WebSocket connection duration
// in seconds, labeled by domain and service_id. Buckets span 1s to 3600s.
var WebsocketConnectionDuration = promauto.NewHistogramVec(
	prometheus.HistogramOpts{
		Name:    MetricPrefix + "websocket_connection_duration_seconds",
		Help:    "WebSocket connection duration in seconds by domain and service_id.",
		Buckets: []float64{1, 5, 10, 30, 60, 300, 600, 1800, 3600},
	},
	[]string{LabelDomain, LabelServiceID},
)
View Source
// WebsocketConnectionEventsTotal counts WebSocket connection events, labeled
// by domain, service_id, and event type (established/closed/failed).
var WebsocketConnectionEventsTotal = promauto.NewCounterVec(
	prometheus.CounterOpts{
		Name: MetricPrefix + "websocket_connection_events_total",
		Help: "WebSocket connection events by domain, service_id, and event type (established/closed/failed).",
	},
	[]string{LabelDomain, LabelServiceID, "event"},
)
View Source
// WebsocketConnectionsActive is a gauge of the current active WebSocket
// connections, labeled by domain and service_id.
var WebsocketConnectionsActive = promauto.NewGaugeVec(
	prometheus.GaugeOpts{
		Name: MetricPrefix + "websocket_connections_active",
		Help: "Current active WebSocket connections by domain and service_id.",
	},
	[]string{LabelDomain, LabelServiceID},
)
View Source
// WebsocketMessagesTotal counts WebSocket messages, labeled by domain,
// service_id, direction (client_to_endpoint/endpoint_to_client), and
// reputation_signal.
var WebsocketMessagesTotal = promauto.NewCounterVec(
	prometheus.CounterOpts{
		Name: MetricPrefix + "websocket_messages_total",
		Help: "WebSocket messages by domain, service_id, direction (client_to_endpoint/endpoint_to_client), and reputation_signal.",
	},
	[]string{LabelDomain, LabelServiceID, "direction", LabelReputationSignal},
)

Functions

func GetLatencySignal added in v1.0.10

func GetLatencySignal(latencyMs float64) string

GetLatencySignal converts latency in milliseconds to a latency signal category. Uses default fixed thresholds. Use GetLatencySignalWithThresholds for per-service thresholds.

func GetLatencySignalWithThresholds added in v1.0.10

func GetLatencySignalWithThresholds(latencyMs float64, thresholds *LatencyThresholds) string

GetLatencySignalWithThresholds converts latency to a signal based on the provided thresholds. If thresholds is nil, the default fixed thresholds are used.

func GetStatusCodeCategory added in v1.0.10

func GetStatusCodeCategory(statusCode int) string

GetStatusCodeCategory returns the status code as a string, grouping 4xx and 5xx codes into categories.

func NormalizeRPCType added in v1.0.10

func NormalizeRPCType(rpcType string) string

NormalizeRPCType converts an RPC type string to lowercase format for consistent metric labels. Accepts both protobuf enum string format ("JSON_RPC") and snake_case format ("json_rpc"). Returns lowercase snake_case (e.g., "json_rpc", "rest", "comet_bft", "websocket").

func RecordBatchSize added in v1.0.10

func RecordBatchSize(rpcType, serviceID string, batchCount int, latencySeconds float64)

RecordBatchSize records a batch request with latency

func RecordHealthCheck added in v1.0.10

func RecordHealthCheck(domain, rpcType, serviceID, healthCheckName, reputationSignal string)

RecordHealthCheck records a health check result

func RecordLatencyReputation added in v1.0.10

func RecordLatencyReputation(domain, rpcType, serviceID, latencySignal string)

RecordLatencyReputation records a latency categorization

func RecordObservation added in v1.0.10

func RecordObservation(domain, rpcType, serviceID, networkType, method, reputationSignal string)

RecordObservation records an observation pipeline event

func RecordProbationEvent added in v1.0.10

func RecordProbationEvent(domain, rpcType, serviceID, event string)

RecordProbationEvent records a probation event (entered, exited, or routed)

func RecordRPCTypeFallback added in v1.0.10

func RecordRPCTypeFallback(domain, supplier, serviceID, requestedRPCType, fallbackRPCType string)

RecordRPCTypeFallback records when a supplier doesn't support the requested RPC type and a fallback RPC type is used instead

func RecordRelay added in v1.0.10

func RecordRelay(domain, rpcType, serviceID, statusCode, reputationSignal, relayType string, latencySeconds float64)

RecordRelay records an outgoing relay to a supplier endpoint, together with its latency. relayType should be one of: RelayTypeNormal, RelayTypeHealthCheck, RelayTypeProbation. statusCode should be the HTTP status code category (2xx, 4xx, 5xx, etc.). reputationSignal should be the signal recorded (ok, minor_error, major_error, etc.).

func RecordRequest added in v1.0.10

func RecordRequest(domain, rpcType, serviceID, statusCode string, latencySeconds float64)

RecordRequest records a request with status code and latency

func RecordRequestSize added in v1.0.10

func RecordRequestSize(rpcType, serviceID string, bytesReceived, bytesSent int64)

RecordRequestSize records request and response sizes

func RecordRetryDistribution added in v1.0.10

func RecordRetryDistribution(domain, rpcType, serviceID, reason string)

RecordRetryDistribution records why a retry happened

func RecordRetryResult added in v1.0.10

func RecordRetryResult(rpcType, serviceID, retryCount, result string, latencySeconds float64)

RecordRetryResult records a retry outcome with latency

func RecordSupplierBlacklist added in v1.0.10

func RecordSupplierBlacklist(domain, supplier, serviceID, reason string)

RecordSupplierBlacklist records a supplier being blacklisted with the specific reason. reason should be one of the BlacklistReason* constants.

func RecordSupplierNilPubkey added in v1.0.10

func RecordSupplierNilPubkey(supplier string)

RecordSupplierNilPubkey records when a supplier is found with a nil public key. This indicates the supplier account exists but hasn't signed its first transaction yet.

func RecordSupplierPubkeyCacheInvalidated added in v1.0.10

func RecordSupplierPubkeyCacheInvalidated(supplier string)

RecordSupplierPubkeyCacheInvalidated records when a supplier's pubkey cache entry is invalidated due to signature verification failure.

func RecordSupplierPubkeyRecovered added in v1.0.10

func RecordSupplierPubkeyRecovered(supplier string)

RecordSupplierPubkeyRecovered records when a supplier's pubkey transitions from nil to valid (they signed their first transaction).

func RecordWebsocketConnectionClosed added in v1.0.10

func RecordWebsocketConnectionClosed(domain, serviceID string, durationSeconds float64)

RecordWebsocketConnectionClosed records a WebSocket connection closure and decrements the active connection count, recording the duration

func RecordWebsocketConnectionEstablished added in v1.0.10

func RecordWebsocketConnectionEstablished(domain, serviceID string)

RecordWebsocketConnectionEstablished records a successful WebSocket connection establishment and increments the active connection count

func RecordWebsocketConnectionFailed added in v1.0.10

func RecordWebsocketConnectionFailed(domain, serviceID string)

RecordWebsocketConnectionFailed records a WebSocket connection failure

func RecordWebsocketMessage added in v1.0.10

func RecordWebsocketMessage(domain, serviceID, direction, reputationSignal string)

RecordWebsocketMessage records a WebSocket message. direction should be WSDirectionClientToEndpoint or WSDirectionEndpointToClient.

func ServePprof

func ServePprof(ctx context.Context, logger polylog.Logger, addr string)

ServePprof starts a pprof server on the given address.

func SetMeanScore added in v1.0.10

func SetMeanScore(domain, serviceID, rpcType string, score float64)

SetMeanScore sets the mean reputation score for a domain/service/rpc_type combination

Types

type EndpointLeaderboardEntry added in v1.0.10

// EndpointLeaderboardEntry represents a single entry in the leaderboard
// snapshot: an endpoint count for one combination of domain, RPC type,
// service, tier threshold, and session start height.
type EndpointLeaderboardEntry struct {
	Domain             string
	RPCType            string
	ServiceID          string
	TierThreshold      int   // The tier threshold (e.g., 70, 50, 30)
	SessionStartHeight int64 // The session start height
	EndpointCount      int   // Number of endpoints in this group
}

EndpointLeaderboardEntry represents a single entry in the leaderboard snapshot

type LatencyThresholds added in v1.0.10

// LatencyThresholds defines the upper bounds, in milliseconds, used to
// categorize a latency into a latency signal.
// NOTE(review): latencies above SevereMs presumably map to Snail — confirm
// against GetLatencySignalWithThresholds.
type LatencyThresholds struct {
	FastMs   float64 // <= this = Cheetah
	NormalMs float64 // <= this = Gazelle
	SlowMs   float64 // <= this = Rabbit
	SevereMs float64 // <= this = Turtle
}

LatencyThresholds defines thresholds for latency signal categorization. These should be derived from per-service LatencyConfig.

func DefaultLatencyThresholds added in v1.0.10

func DefaultLatencyThresholds() *LatencyThresholds

DefaultLatencyThresholds returns default thresholds when no per-service config is available.

type LeaderboardDataProvider added in v1.0.10

// LeaderboardDataProvider is the data source for leaderboard publishing: it
// supplies endpoint distribution entries and mean reputation scores.
type LeaderboardDataProvider interface {
	// GetEndpointLeaderboardData returns all endpoint entries grouped by the required dimensions
	GetEndpointLeaderboardData(ctx context.Context) ([]EndpointLeaderboardEntry, error)
	// GetMeanScoreData returns mean reputation scores per domain/service/rpc_type
	GetMeanScoreData(ctx context.Context) ([]MeanScoreEntry, error)
}

LeaderboardDataProvider is an interface for getting endpoint distribution data

type LeaderboardPublisher added in v1.0.10

// LeaderboardPublisher publishes endpoint leaderboard metrics every 10
// seconds.
type LeaderboardPublisher struct {
	// contains filtered or unexported fields
}

LeaderboardPublisher publishes endpoint leaderboard metrics every 10 seconds

func NewLeaderboardPublisher added in v1.0.10

func NewLeaderboardPublisher(logger polylog.Logger, provider LeaderboardDataProvider) *LeaderboardPublisher

NewLeaderboardPublisher creates a new leaderboard publisher

func (*LeaderboardPublisher) PublishOnce added in v1.0.10

func (lp *LeaderboardPublisher) PublishOnce(ctx context.Context)

PublishOnce can be called to manually trigger a leaderboard publish (for testing)

func (*LeaderboardPublisher) Start added in v1.0.10

func (lp *LeaderboardPublisher) Start(ctx context.Context) error

Start begins the periodic leaderboard publishing

func (*LeaderboardPublisher) Stop added in v1.0.10

func (lp *LeaderboardPublisher) Stop()

Stop stops the leaderboard publisher

type MeanScoreEntry added in v1.0.10

// MeanScoreEntry represents the mean reputation score for one
// domain/service/rpc_type combination.
type MeanScoreEntry struct {
	Domain    string
	ServiceID string
	RPCType   string
	MeanScore float64 // Average score across all endpoints for this combination
}

MeanScoreEntry represents mean score for a domain/service/rpc_type combination

type PrometheusMetricsReporter

// PrometheusMetricsReporter provides the functionality required for exporting
// PATH metrics to Grafana.
type PrometheusMetricsReporter struct {
	Logger polylog.Logger
}

PrometheusMetricsReporter provides the functionality required for exporting PATH metrics to Grafana.

func (*PrometheusMetricsReporter) Publish

Publish exports service request and response metrics to Prometheus/Grafana. It implements the gateway.RequestResponseReporter interface and records metrics as defined in how_metrics_should_work.md.

func (*PrometheusMetricsReporter) ServeMetrics

func (pmr *PrometheusMetricsReporter) ServeMetrics(addr string) error

ServeMetrics starts a metrics server on the given address.

Directories

Path Synopsis
protocol

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL