metrics

package
v0.6.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 3, 2026 License: Apache-2.0 Imports: 30 Imported by: 0

Documentation

Index

Constants

View Source
const (
	// Gateway-level Prometheus metric names emitted by the request path.
	GatewayRequestTotal = "gateway_request_total"
	GatewayE2EDuration  = "gateway_e2e_duration_seconds"
	GatewayInFlight     = "gateway_in_flight_requests"

	// Count of streamed responses where first token delay > 1s
	GatewayFirstTokenDelayOver1sTotal = "gateway_first_token_delay_over_1s_total"

	// counter to track #success & #fail requests
	GatewayRequestSuccessTotal = "gateway_request_success_total"
	GatewayRequestFailTotal    = "gateway_request_fail_total"

	// counter to track #success & #fail requests for each model
	GatewayRequestModelSuccessTotal = "gateway_request_model_success_total"
	GatewayRequestModelFailTotal    = "gateway_request_model_fail_total"

	// counter to track #prompt & #completion tokens
	GatewayPromptTokenBucketTotal     = "gateway_prompt_token_bucket_total"
	GatewayCompletionTokenBucketTotal = "gateway_completion_token_bucket_total"

	// counter to track #success & #fail prefill requests
	GatewayPrefillRequestSuccessTotal = "gateway_prefill_request_success_total"
	GatewayPrefillRequestFailTotal    = "gateway_prefill_request_fail_total"

	// gauge to track #outstanding prefill & decode requests
	GatewayPrefillOutstandingRequests = "gateway_prefill_outstanding_requests"
	GatewayDecodeOutstandingRequests  = "gateway_decode_outstanding_requests"

	// counter to track #prefill & #decode pods selected by pd
	PDSelectedPrefillPodTotal = "pd_selected_prefill_pod_total"
	PDSelectedDecodePodTotal  = "pd_selected_decode_pod_total"

	// Duration bucket counters for timing breakdowns
	GatewayRoutingTimeBucketTotal    = "gateway_routing_time_bucket_total"
	GatewayPrefillTimeBucketTotal    = "gateway_prefill_time_bucket_total"
	GatewayKVTransferTimeBucketTotal = "gateway_kv_transfer_time_bucket_total"
	GatewayTTFTBucketTotal           = "gateway_ttft_bucket_total"
	GatewayTPOTBucketTotal           = "gateway_tpot_bucket_total"
	GatewayDecodeTimeBucketTotal     = "gateway_decode_time_bucket_total"
	GatewayTotalTimeBucketTotal      = "gateway_total_time_bucket_total"
)
View Source
const (
	// Engine- and pod-level metric names scraped from inference backends
	// (vLLM, SGLang, xLLM) or computed via Prometheus queries.
	NumRequestsRunning          = "num_requests_running"
	NumRequestsWaiting          = "num_requests_waiting"
	EngineSleepState            = "engine_sleep_state"
	HTTPRequestTotal            = "http_requests_total"
	NumPreemptionsTotal         = "num_preemptions_total"
	RequestSuccessTotal         = "request_success_total"
	NumPrefillPreallocQueueReqs = "num_prefill_prealloc_queue_reqs"
	NumDecodePreallocQueueReqs  = "num_decode_prealloc_queue_reqs"

	// Latency histograms (end-to-end and per-stage).
	E2ERequestLatencySeconds        = "e2e_request_latency_seconds"
	RequestQueueTimeSeconds         = "request_queue_time_seconds"
	RequestInferenceTimeSeconds     = "request_inference_time_seconds"
	PerStageReqLatencySeconds       = "per_stage_req_latency_seconds"
	HTTPRequestDurationSeconds      = "http_request_duration_seconds"
	HTTPRequestDurationHighRSeconds = "http_request_duration_highr_seconds"

	TimeToFirstTokenSeconds   = "time_to_first_token_seconds"
	RequestPrefillTimeSeconds = "request_prefill_time_seconds"
	PromptTokenTotal          = "prompt_tokens_total"
	RequestPromptTokens       = "request_prompt_tokens"

	// deprecated (time_per_output_token_seconds), use inter_token_latency_seconds instead
	TimePerOutputTokenSeconds        = "time_per_output_token_seconds"
	InterTokenLatencySeconds         = "inter_token_latency_seconds"
	RequestTimePerOutputTokenSeconds = "request_time_per_output_token_seconds"
	RequestDecodeTimeSeconds         = "request_decode_time_seconds"

	GenerationTokenTotal          = "generation_tokens_total"
	IterationTokensTotal          = "iteration_tokens_total"
	RequestGenerationTokens       = "request_generation_tokens"
	RequestMaxNumGenerationTokens = "request_max_num_generation_tokens"

	// Cache usage and prefix-cache hit/query counters.
	KVCacheUsagePerc                = "kv_cache_usage_perc"
	NixlNumFailedTransfers          = "nixl_num_failed_transfers_total"
	NixlNumFailedNotifications      = "nixl_num_failed_notifications_total"
	PrefixCacheHitTotal             = "prefix_cache_hits_total"
	PrefixCacheQueriesTotal         = "prefix_cache_queries_total"
	ExternalPrefixCacheHitsTotal    = "external_prefix_cache_hits_total"
	ExternalPrefixCacheQueriesTotal = "external_prefix_cache_queries_total"

	// NIXL KV-cache transfer metrics.
	NixlXferTimeSeconds  = "nixl_xfer_time_seconds"
	NixlPostTimeSeconds  = "nixl_post_time_seconds"
	NixlBytesTransferred = "nixl_bytes_transferred"
	NixlNumDescriptors   = "nixl_num_descriptors"

	DrainRate1m = "drain_rate_1m"

	// Derived (PromQL-backed) metric names.
	P95TTFT5m    = "p95_ttft_5m"
	P95TTFT5mPod = "p95_ttft_5m_pod"
	AvgTTFT5mPod = "avg_ttft_5m_pod"
	P95TPOT5mPod = "p95_tpot_5m_pod"
	// NOTE(review): value breaks the "<stat>_5m_pod" suffix pattern used by the
	// siblings above ("avg_tpot_pod_5m" vs "avg_tpot_5m_pod"); renaming the
	// string would break existing dashboards/queries, so it is left as-is —
	// confirm whether this is intentional.
	AvgTPOT5mPod                         = "avg_tpot_pod_5m"
	AvgPromptToksPerReq                  = "avg_prompt_toks_per_req"
	AvgGenerationToksPerReq              = "avg_generation_toks_per_req"
	GPUCacheUsagePerc                    = "gpu_cache_usage_perc"
	GPUBusyTimeRatio                     = "gpu_busy_time_ratio"
	CPUCacheUsagePerc                    = "cpu_cache_usage_perc"
	EngineUtilization                    = "engine_utilization"
	AvgE2ELatencyPod                     = "avg_e2e_latency_pod"
	AvgRequestsPerMinPod                 = "avg_requests_per_min_pod"
	AvgPromptThroughputToksPerMinPod     = "avg_prompt_throughput_toks_per_min_pod"
	AvgGenerationThroughputToksPerMinPod = "avg_generation_throughput_toks_per_min_pod"
	MaxLora                              = "max_lora"
	WaitingLoraAdapters                  = "waiting_lora_adapters"
	RunningLoraAdapters                  = "running_lora_adapters"
	VTCBucketSizeActive                  = "vtc_bucket_size_active"
	// Realtime metrics
	RealtimeNumRequestsRunning = "realtime_num_requests_running"
	RealtimeNormalizedPendings = "realtime_normalized_pendings"

	// error to read metrics from backend
	PrometheusQueryFail       = "prometheus_query_fail"
	LLMEngineMetricsQueryFail = "llm_engine_metrics_query_fail"

	// Deprecated metrics
	NumRequestsSwapped              = "num_requests_swapped"
	AvgPromptThroughputToksPerS     = "avg_prompt_throughput_toks_per_s"
	AvgGenerationThroughputToksPerS = "avg_generation_throughput_toks_per_s"
)

Variables

View Source
var (

	// SetGaugeMetricFnForTest and IncrementCounterMetricFnForTest are function
	// hooks that tests can override to intercept metric emission. They default
	// to the real implementations (defaultSetGaugeMetric /
	// defaultIncrementCounterMetric); production code calls through these
	// variables so tests can swap in fakes without touching a registry.
	SetGaugeMetricFnForTest         = defaultSetGaugeMetric
	IncrementCounterMetricFnForTest = defaultIncrementCounterMetric
)
View Source
var (
	// GatewayMetrics describes every gateway-side metric (scope, source, type,
	// and help text). Lookups such as GetMetricHelp read descriptions from
	// this map, so each metric name constant should have an entry here.
	GatewayMetrics = map[string]Metric{
		GatewayRequestTotal: {
			MetricScope:  PodMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Counter,
			},
			Description: "Total number of requests received by the gateway",
		},

		// GatewayRequestSuccessTotal / GatewayRequestFailTotal are declared in
		// the metric-name constants but previously had no metadata entry here,
		// unlike every sibling gateway metric; added so description lookups
		// succeed for them.
		GatewayRequestSuccessTotal: {
			MetricScope:  PodMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Counter,
			},
			Description: "Total number of successful requests received by the gateway",
		},
		GatewayRequestFailTotal: {
			MetricScope:  PodMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Counter,
			},
			Description: "Total number of failed requests received by the gateway",
		},

		GatewayRequestModelSuccessTotal: {
			MetricScope:  PodMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Counter,
			},
			Description: "Total number of successful requests received by the gateway for each model",
		},
		GatewayRequestModelFailTotal: {
			MetricScope:  PodMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Counter,
			},
			Description: "Total number of failed requests received by the gateway for each model",
		},

		GatewayPrefillRequestSuccessTotal: {
			MetricScope:  PodMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Counter,
			},
			Description: "Total number of successful prefill requests received by the gateway",
		},
		GatewayPrefillRequestFailTotal: {
			MetricScope:  PodMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Counter,
			},
			Description: "Total number of failed prefill requests received by the gateway",
		},

		GatewayPrefillOutstandingRequests: {
			MetricScope:  PodMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Gauge,
			},
			Description: "Total number of outstanding prefill requests received by the gateway",
		},
		GatewayDecodeOutstandingRequests: {
			MetricScope:  PodMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Gauge,
			},
			Description: "Total number of outstanding decode requests received by the gateway",
		},

		GatewayE2EDuration: {
			MetricScope:  PodMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Histogram,
			},
			Description: "End-to-end latency distribution of requests received by the gateway",
		},

		GatewayInFlight: {
			MetricScope:  PodMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Gauge,
			},
			Description: "Current number of requests in flight (i.e., being processed) by the gateway",
		},

		GatewayFirstTokenDelayOver1sTotal: {
			MetricScope:  PodMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Counter,
			},
			Description: "Count of streamed responses where first token delay > 1s",
		},

		GatewayPromptTokenBucketTotal: {
			MetricScope:  PodMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Counter,
			},
			Description: "Requests counted by prompt token bucket",
		},

		GatewayCompletionTokenBucketTotal: {
			MetricScope:  PodMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Counter,
			},
			Description: "Requests counted by completion token bucket",
		},
		PDSelectedPrefillPodTotal: {
			MetricScope:  PodMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Counter,
			},
			Description: "Total selections of prefill pods by the PD router",
		},
		PDSelectedDecodePodTotal: {
			MetricScope:  PodMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Counter,
			},
			Description: "Total selections of decode pods by the PD router",
		},

		// Duration-bucket counters backing the gateway's timing breakdowns.
		GatewayRoutingTimeBucketTotal: {
			MetricScope:  PodMetricScope,
			MetricSource: PodRawMetrics,
			MetricType:   MetricType{Raw: Counter},
			Description:  "Requests counted by routing time bucket",
		},
		GatewayPrefillTimeBucketTotal: {
			MetricScope:  PodMetricScope,
			MetricSource: PodRawMetrics,
			MetricType:   MetricType{Raw: Counter},
			Description:  "Requests counted by prefill time bucket",
		},
		GatewayKVTransferTimeBucketTotal: {
			MetricScope:  PodMetricScope,
			MetricSource: PodRawMetrics,
			MetricType:   MetricType{Raw: Counter},
			Description:  "Requests counted by KV transfer time bucket",
		},
		GatewayTTFTBucketTotal: {
			MetricScope:  PodMetricScope,
			MetricSource: PodRawMetrics,
			MetricType:   MetricType{Raw: Counter},
			Description:  "Requests counted by TTFT bucket",
		},
		GatewayTPOTBucketTotal: {
			MetricScope:  PodMetricScope,
			MetricSource: PodRawMetrics,
			MetricType:   MetricType{Raw: Counter},
			Description:  "Requests counted by TPOT bucket",
		},
		GatewayDecodeTimeBucketTotal: {
			MetricScope:  PodMetricScope,
			MetricSource: PodRawMetrics,
			MetricType:   MetricType{Raw: Counter},
			Description:  "Requests counted by decode time bucket",
		},
		GatewayTotalTimeBucketTotal: {
			MetricScope:  PodMetricScope,
			MetricSource: PodRawMetrics,
			MetricType:   MetricType{Raw: Counter},
			Description:  "Requests counted by total time bucket",
		},
	}
)
View Source
var (
	// Metrics defines all available metrics, including raw and query-based metrics.
	Metrics = map[string]Metric{
		NumRequestsRunning: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Gauge,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm":   "vllm:num_requests_running",
				"sglang": "sglang:num_running_reqs",
			},
			Description: "Number of running requests",
		},
		NumRequestsWaiting: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Gauge,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm":   "vllm:num_requests_waiting",
				"sglang": "sglang:num_queue_reqs",
			},
			Description: "Number of waiting requests",
		},
		NumRequestsSwapped: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Gauge,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm":   "vllm:num_requests_swapped",
				"sglang": "sglang:num_retracted_reqs",
			},
			Description: "Number of swapped requests",
		},
		EngineSleepState: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Gauge,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm": "vllm:engine_sleep_state",
			},
			Description: "Engine sleep state; awake = 0 means engine is sleeping; awake = 1 means engine is awake; weights_offloaded = 1 means sleep level 1; discard_all = 1 means sleep level 2.",
		},
		HTTPRequestTotal: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Counter,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm": "vllm:http_requests_total",
			},
			Description: "Total number of requests by method, status and handler.",
		},
		NumPreemptionsTotal: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Counter,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm": "vllm:num_preemptions_total",
			},
			Description: "Number of preemptions",
		},
		RequestSuccessTotal: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Counter,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm":   "vllm:num_requests_success_total",
				"sglang": "sglang:num_requests_total",
			},
			Description: "Number of successful requests",
		},
		NumPrefillPreallocQueueReqs: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Gauge,
			},
			EngineMetricsNameMapping: map[string]string{
				"sglang": "sglang:num_prefill_prealloc_queue_reqs",
			},
			Description: "Number of prefill preallocation queue requests",
		},
		NumDecodePreallocQueueReqs: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Gauge,
			},
			EngineMetricsNameMapping: map[string]string{
				"sglang": "sglang:num_decode_prealloc_queue_reqs",
			},
			Description: "Number of decode preallocation queue requests",
		},

		E2ERequestLatencySeconds: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Histogram,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm":   "vllm:e2e_request_latency_seconds",
				"sglang": "sglang:e2e_request_latency_seconds",
			},
			Description: "End-to-end request latency in seconds",
		},
		RequestQueueTimeSeconds: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Histogram,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm": "vllm:request_queue_time_seconds",
			},
			Description: "Request queue time in seconds",
		},
		RequestInferenceTimeSeconds: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Histogram,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm": "vllm:request_inference_time_seconds",
			},
			Description: "Request inference time in seconds",
		},
		PerStageReqLatencySeconds: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Histogram,
			},
			EngineMetricsNameMapping: map[string]string{
				"sglang": "sglang:per_stage_req_latency_seconds",
			},
			Description: "Per-stage request latency in seconds",
		},
		HTTPRequestDurationSeconds: {
			MetricScope:  PodMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Histogram,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm": "http_request_duration_seconds",
			},
			Description: "Histogram of request duration in seconds",
		},
		HTTPRequestDurationHighRSeconds: {
			MetricScope:  PodMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Histogram,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm": "http_request_duration_highr_seconds",
			},
			Description: "Histogram of request duration in seconds for high priority requests",
		},
		PromptTokenTotal: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Counter,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm": "vllm:prompt_tokens_total",
			},
			Description: "Total prompt tokens",
		},
		RequestPromptTokens: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Histogram,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm": "vllm:request_prompt_tokens",
			},
			Description: "Histogram of prompt tokens",
		},
		GenerationTokenTotal: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Counter,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm": "vllm:generation_tokens_total",
			},
			Description: "Total generation tokens",
		},
		RequestGenerationTokens: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Histogram,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm": "vllm:request_generation_tokens",
			},
			Description: "Histogram of generation tokens",
		},
		RequestMaxNumGenerationTokens: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Histogram,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm": "vllm:request_max_num_generation_tokens",
			},
			Description: "Histogram of max number of generation tokens",
		},
		IterationTokensTotal: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Histogram,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm": "vllm:iteration_tokens_total",
			},
			Description: "Total iteration tokens",
		},
		TimeToFirstTokenSeconds: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Histogram,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm":   "vllm:time_to_first_token_seconds",
				"sglang": "sglang:time_to_first_token_seconds",
			},
			Description: "Time to first token in seconds",
		},
		TimePerOutputTokenSeconds: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Histogram,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm":   "vllm:time_per_output_token_seconds",
				"sglang": "sglang:inter_token_latency_seconds",
			},
			Description: "Time per output token in seconds",
		},
		InterTokenLatencySeconds: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Histogram,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm":   "vllm:inter_token_latency_seconds",
				"sglang": "sglang:inter_token_latency_seconds",
			},
			Description: "Inter-token latency in seconds",
		},
		RequestDecodeTimeSeconds: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Histogram,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm": "vllm:request_decode_time_seconds",
			},
			Description: "Request decode time in seconds",
		},
		RequestPrefillTimeSeconds: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Histogram,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm": "vllm:request_prefill_time_seconds",
			},
			Description: "Request prefill time in seconds",
		},
		RequestTimePerOutputTokenSeconds: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Histogram,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm": "vllm:request_time_per_output_token_seconds",
			},
			Description: "Time per output token in seconds",
		},
		GPUCacheUsagePerc: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Gauge,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm":   "vllm:gpu_cache_usage_perc",
				"sglang": "sglang:token_usage",
				"xllm":   "kv_cache_utilization",
			},
			Description: "GPU cache usage percentage",
		},
		EngineUtilization: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Gauge,
			},
			EngineMetricsNameMapping: map[string]string{
				"xllm": "engine_utilization",
			},
			// NOTE(review): description reads "GPU busy time ratio", which
			// matches the separate GPUBusyTimeRatio constant — confirm this
			// wording is intended for engine_utilization.
			Description: "GPU busy time ratio",
		},
		CPUCacheUsagePerc: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Gauge,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm": "vllm:cpu_cache_usage_perc",
			},
			Description: "CPU cache usage percentage",
		},
		KVCacheUsagePerc: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Gauge,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm":   "vllm:kv_cache_usage_perc",
				"sglang": "sglang:token_usage",
				"xllm":   "kv_cache_utilization",
			},
			Description: "KV-cache usage. 1 means 100 percent usage.",
		},
		PrefixCacheQueriesTotal: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Counter,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm": "vllm:prefix_cache_queries_total",
			},
			Description: "Prefix cache queries, in terms of number of queried tokens.",
		},
		PrefixCacheHitTotal: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Counter,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm": "vllm:prefix_cache_hits_total",
			},
			Description: "Prefix cache hits, in terms of number of cached tokens.",
		},
		ExternalPrefixCacheQueriesTotal: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Counter,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm": "vllm:external_prefix_cache_queries_total",
			},
			Description: "External prefix cache queries from KV connector cross-instance cache sharing, in terms of number of queried tokens.",
		},
		ExternalPrefixCacheHitsTotal: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Counter,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm": "vllm:external_prefix_cache_hits_total",
			},
			Description: "External prefix cache hits from KV connector cross-instance cache sharing, in terms of number of cached tokens.",
		},

		NixlXferTimeSeconds: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Histogram,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm": "vllm:nixl_xfer_time_seconds",
			},
			Description: "transfer duration for NIXL KV Cache transfers",
		},
		NixlPostTimeSeconds: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Histogram,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm": "vllm:nixl_post_time_seconds",
			},
			Description: "transfer post time for NIXL KV Cache transfers",
		},
		NixlBytesTransferred: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Histogram,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm": "vllm:nixl_bytes_transferred",
			},
			Description: "number of bytes transferred per NIXL KV Cache transfer",
		},
		NixlNumDescriptors: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Histogram,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm": "vllm:nixl_num_descriptors",
			},
			Description: "number of descriptors per NIXL KV Cache transfers",
		},
		NixlNumFailedTransfers: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Counter,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm": "vllm:nixl_num_failed_transfers",
			},
			Description: "number of failed NIXL KV Cache transfers",
		},
		NixlNumFailedNotifications: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Counter,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm": "vllm:nixl_num_failed_notifications",
			},
			Description: "number of failed NIXL KV Cache notifications",
		},

		DrainRate1m: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PrometheusEndpoint,
			MetricType: MetricType{
				Query: PromQL,
			},

			PromQL: `
			clamp_min(
				rate(
					sglang:num_requests_total{
						instance="${instance}",
						model_name="${model_name}",
						job="pods"
					}[1m]
				),
				0.01
			)`,
			Description: "1-minute average rate of finished requests (Drains), clamped to avoid zero",
		},

		P95TTFT5m: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PrometheusEndpoint,
			MetricType: MetricType{
				Query: PromQL,
			},
			PromQL:      `histogram_quantile(0.95, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{instance="${instance}", model_name="${model_name}", job="pods"}[5m])))`,
			Description: "95th ttft in last 5 mins",
		},
		P95TTFT5mPod: {
			MetricScope:  PodMetricScope,
			MetricSource: PrometheusEndpoint,
			MetricType: MetricType{
				Query: PromQL,
			},
			PromQL:      `histogram_quantile(0.95, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{instance="${instance}", job="pods"}[5m])))`,
			Description: "95th ttft in last 5 mins",
		},
		AvgTTFT5mPod: {
			MetricScope:  PodMetricScope,
			MetricSource: PrometheusEndpoint,
			MetricType: MetricType{
				Query: PromQL,
			},
			PromQL:      `increase(vllm:time_to_first_token_seconds_sum{instance="${instance}", job="pods"}[5m]) / increase(vllm:time_to_first_token_seconds_count{instance="${instance}", job="pods"}[5m])`,
			Description: "Average ttft in last 5 mins",
		},
		P95TPOT5mPod: {
			MetricScope:  PodMetricScope,
			MetricSource: PrometheusEndpoint,
			MetricType: MetricType{
				Query: PromQL,
			},
			PromQL:      `histogram_quantile(0.95, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{instance="${instance}", job="pods"}[5m])))`,
			Description: "95th tpot in last 5 mins",
		},
		AvgTPOT5mPod: {
			MetricScope:  PodMetricScope,
			MetricSource: PrometheusEndpoint,
			MetricType: MetricType{
				Query: PromQL,
			},
			// BUG FIX: the denominator previously used the histogram's _sum
			// series again (sum / sum always evaluates to 1); the average is
			// _sum / _count, matching AvgTTFT5mPod above.
			PromQL:      `increase(vllm:time_per_output_token_seconds_sum{instance="${instance}", job="pods"}[5m]) / increase(vllm:time_per_output_token_seconds_count{instance="${instance}", job="pods"}[5m])`,
			Description: "Average tpot in last 5 mins",
		},
		AvgPromptToksPerReq: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PrometheusEndpoint,
			MetricType: MetricType{
				Query: PromQL,
			},
			PromQL:      `increase(vllm:request_prompt_tokens_sum{instance="${instance}", model_name="${model_name}", job="pods"}[1d]) / increase(vllm:request_prompt_tokens_count{instance="${instance}", model_name="${model_name}", job="pods"}[1d])`,
			Description: "Average prompt tokens per request in last day",
		},
		AvgGenerationToksPerReq: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PrometheusEndpoint,
			MetricType: MetricType{
				Query: PromQL,
			},
			PromQL:      `increase(vllm:request_generation_tokens_sum{instance="${instance}", model_name="${model_name}", job="pods"}[1d]) / increase(vllm:request_generation_tokens_count{instance="${instance}", model_name="${model_name}", job="pods"}[1d])`,
			Description: "Average generation tokens per request in last day",
		},
		AvgE2ELatencyPod: {
			MetricScope:  PodMetricScope,
			MetricSource: PrometheusEndpoint,
			MetricType: MetricType{
				Query: PromQL,
			},
			PromQL:      `increase(vllm:e2e_request_latency_seconds_sum{instance="${instance}", job="pods"}[5m]) / increase(vllm:e2e_request_latency_seconds_count{instance="${instance}", job="pods"}[5m])`,
			Description: "Average End-to-end latency in last 5 mins",
		},
		AvgRequestsPerMinPod: {
			MetricScope:  PodMetricScope,
			MetricSource: PrometheusEndpoint,
			MetricType: MetricType{
				Query: PromQL,
			},
			PromQL:      `increase(vllm:request_success_total{instance="${instance}", job="pods"}[5m]) / 5`,
			Description: "Average requests throughput per minute in last 5 mins",
		},
		AvgPromptThroughputToksPerS: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Gauge,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm": "vllm:avg_prompt_throughput_toks_per_s",
			},
			Description: "Average prompt throughput in tokens per second",
		},
		AvgGenerationThroughputToksPerS: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Gauge,
			},
			EngineMetricsNameMapping: map[string]string{
				"vllm":   "vllm:avg_generation_throughput_toks_per_s",
				"sglang": "sglang:gen_throughput",
			},
			Description: "Average generation throughput in tokens per second",
		},
		AvgPromptThroughputToksPerMinPod: {
			MetricScope:  PodMetricScope,
			MetricSource: PrometheusEndpoint,
			MetricType: MetricType{
				Query: PromQL,
			},
			PromQL:      `increase(vllm:prompt_tokens_total{instance="${instance}", job="pods"}[5m]) / 5`,
			Description: "Average prompt throughput in tokens per minute in last 5 mins",
		},
		AvgGenerationThroughputToksPerMinPod: {
			MetricScope:  PodMetricScope,
			MetricSource: PrometheusEndpoint,
			MetricType: MetricType{
				Query: PromQL,
			},
			PromQL:      `increase(vllm:generation_tokens_total{instance="${instance}", job="pods"}[5m]) / 5`,
			Description: "Average generation throughput in tokens per minute in last 5 mins",
		},
		MaxLora: {
			MetricScope:  PodMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Query: QueryLabel,
			},
			LabelKey: "max_lora",
			EngineMetricsNameMapping: map[string]string{
				"vllm": "vllm:lora_requests_info",
			},
			Description: "Max count of Lora Adapters",
		},
		RunningLoraAdapters: {
			MetricScope:  PodMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Query: QueryLabel,
			},
			LabelKey: "running_lora_adapters",
			EngineMetricsNameMapping: map[string]string{
				"vllm": "vllm:lora_requests_info",
			},
			Description: "Count of running Lora Adapters",
		},
		WaitingLoraAdapters: {
			MetricScope:  PodMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Query: QueryLabel,
			},
			LabelKey: "waiting_lora_adapters",
			EngineMetricsNameMapping: map[string]string{
				"vllm": "vllm:lora_requests_info",
			},
			Description: "Count of waiting Lora Adapters",
		},
		VTCBucketSizeActive: {
			MetricScope:  PodModelMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Gauge,
			},
			Description: "Current adaptive bucket size used by VTC algorithm for token normalization",
		},
		PrometheusQueryFail: {
			MetricScope:  PodMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Counter,
			},
			Description: "Total number of Prometheus query failures",
		},
		LLMEngineMetricsQueryFail: {
			MetricScope:  PodMetricScope,
			MetricSource: PodRawMetrics,
			MetricType: MetricType{
				Raw: Counter,
			},
			Description: "Total number of LLM engine metrics query failures",
		},
	}
)

Functions

func BuildQuery

func BuildQuery(queryTemplate string, queryLabels map[string]string) string

BuildQuery dynamically injects labels into a PromQL query template.

func EmitMetricToPrometheus added in v0.6.0

func EmitMetricToPrometheus(routingCtx *types.RoutingContext, pod *v1.Pod, metricName string, metricValue MetricValue, extra map[string]string)

func ExtractNumericFromPromResult added in v0.6.0

func ExtractNumericFromPromResult(r *model.Value) (float64, error)

func GetEngineType added in v0.5.0

func GetEngineType(pod v1.Pod) string

GetEngineType extracts the engine type from pod labels, defaulting to "vllm" for backward compatibility. This function is centralized to avoid duplication across packages.

func GetGaugeValueForTest

func GetGaugeValueForTest(name string, labelValues ...string) float64

func GetLabelValueForKey

func GetLabelValueForKey(metric *dto.Metric, key string) (string, error)

func GetMetricHelp

func GetMetricHelp(metricName string) string

func HttpFailureStatusCode added in v0.6.0

func HttpFailureStatusCode(ctx context.Context, err error, resp *http.Response) (string, string)

func IncrementCounterMetric

func IncrementCounterMetric(name string, help string, value float64, labelNames []string, labelValues ...string)

func InitializePrometheusAPI

func InitializePrometheusAPI(endpoint, username, password string) (prometheusv1.API, error)

InitializePrometheusAPI initializes the Prometheus API client.

func ParseMetricFromBody

func ParseMetricFromBody(body []byte, metricName string) (float64, error)

ParseMetricFromBody parses a simple metric from the Prometheus response body.

func ParseMetricsFromReader added in v0.5.0

func ParseMetricsFromReader(reader io.Reader) (map[string]*dto.MetricFamily, error)

ParseMetricsFromReader parses Prometheus metrics from an io.Reader (extracted for reuse)

func ParseMetricsURLWithContext added in v0.4.0

func ParseMetricsURLWithContext(ctx context.Context, url string) (map[string]*dto.MetricFamily, error)

func SetGaugeMetric

func SetGaugeMetric(name string, help string, value float64, labelNames []string, labelValues ...string)

func SetHistogramMetric added in v0.6.0

func SetHistogramMetric(name string, help string, value *HistogramMetricValue, labelNames []string, labelValues ...string)

func SetupCounterMetricsForTest

func SetupCounterMetricsForTest(metricName string, labelNames []string) (*prometheus.CounterVec, func())

func SetupMetricsForTest

func SetupMetricsForTest(metricName string, labelNames []string) (*prometheus.GaugeVec, func())

Types

type EngineMetricsFetcher added in v0.5.0

type EngineMetricsFetcher struct {
	// contains filtered or unexported fields
}

EngineMetricsFetcher provides a unified interface for fetching typed metrics from inference engine pods. It leverages the centralized metrics registry and type system in pkg/metrics.

func NewEngineMetricsFetcher added in v0.5.0

func NewEngineMetricsFetcher() *EngineMetricsFetcher

NewEngineMetricsFetcher creates a new engine metrics fetcher with default configuration

func NewEngineMetricsFetcherWithConfig added in v0.5.0

func NewEngineMetricsFetcherWithConfig(config EngineMetricsFetcherConfig) *EngineMetricsFetcher

NewEngineMetricsFetcherWithConfig creates a new engine metrics fetcher with custom configuration

func (*EngineMetricsFetcher) FetchAllTypedMetrics added in v0.5.0

func (ef *EngineMetricsFetcher) FetchAllTypedMetrics(ctx context.Context, endpoint, engineType, identifier string, requestedMetrics []string) (*EngineMetricsResult, error)

FetchAllTypedMetrics fetches all available typed metrics from an engine endpoint

func (*EngineMetricsFetcher) FetchTypedMetric added in v0.5.0

func (ef *EngineMetricsFetcher) FetchTypedMetric(ctx context.Context, endpoint, engineType, identifier, metricName string) (MetricValue, error)

FetchTypedMetric fetches a single typed metric from an engine endpoint. Note: if the client needs to fetch multiple metrics, it is better to use FetchAllTypedMetrics.

type EngineMetricsFetcherConfig added in v0.5.0

type EngineMetricsFetcherConfig struct {
	Timeout     time.Duration
	MaxRetries  int
	BaseDelay   time.Duration
	MaxDelay    time.Duration
	InsecureTLS bool
}

EngineMetricsFetcherConfig holds configuration for engine metrics fetching

func DefaultEngineMetricsFetcherConfig added in v0.5.0

func DefaultEngineMetricsFetcherConfig() EngineMetricsFetcherConfig

DefaultEngineMetricsFetcherConfig returns sensible defaults for engine metrics fetching

type EngineMetricsResult added in v0.5.0

type EngineMetricsResult struct {
	Identifier   string // Caller-provided identifier (e.g., pod name)
	Endpoint     string // The endpoint that was queried
	EngineType   string
	Metrics      map[string]MetricValue // Pod-scoped metrics
	ModelMetrics map[string]MetricValue // Pod+Model-scoped metrics (key format: "model/metric")
	Errors       []error                // Any errors encountered during fetching
}

EngineMetricsResult contains the result of fetching metrics from an engine endpoint

type HistogramMetricValue

type HistogramMetricValue struct {
	Sum     float64
	Count   float64
	Buckets map[string]float64 // e.g., {"0.1": 5, "0.5": 3, "1.0": 2}
	Labels  map[string]string  // Optional: Additional labels for the histogram.
}

HistogramMetricValue represents a detailed histogram metric.

func GetHistogramValue

func GetHistogramValue(metric *dto.Metric) (*HistogramMetricValue, error)

func ParseHistogramFromBody

func ParseHistogramFromBody(body []byte, metricName string) (*HistogramMetricValue, error)

ParseHistogramFromBody parses a histogram metric from the Prometheus response body.

func (*HistogramMetricValue) GetBucketValue

func (h *HistogramMetricValue) GetBucketValue(bucket string) (float64, bool)

GetBucketValue returns the count for a specific bucket.

func (*HistogramMetricValue) GetCount

func (h *HistogramMetricValue) GetCount() float64

GetCount returns the total count of values in the histogram.

func (*HistogramMetricValue) GetHistogramValue

func (h *HistogramMetricValue) GetHistogramValue() *HistogramMetricValue

func (*HistogramMetricValue) GetLabelValues added in v0.6.0

func (h *HistogramMetricValue) GetLabelValues() map[string]string

func (*HistogramMetricValue) GetMean

func (h *HistogramMetricValue) GetMean() float64

GetMean returns the mean value of the histogram (Sum / Count).

func (*HistogramMetricValue) GetPercentile

func (h *HistogramMetricValue) GetPercentile(percentile float64) (float64, error)

func (*HistogramMetricValue) GetPrometheusResult

func (h *HistogramMetricValue) GetPrometheusResult() *model.Value

func (*HistogramMetricValue) GetSimpleValue

func (h *HistogramMetricValue) GetSimpleValue() float64

func (*HistogramMetricValue) GetSum

func (h *HistogramMetricValue) GetSum() float64

GetSum returns the sum of the histogram values.

func (*HistogramMetricValue) GetValue

func (h *HistogramMetricValue) GetValue() interface{}

type LabelValueMetricValue

type LabelValueMetricValue struct {
	Value string
}

LabelValueMetricValue represents a metric value extracted from a metric label (used with QueryLabel-based metrics).

func (*LabelValueMetricValue) GetHistogramValue

func (l *LabelValueMetricValue) GetHistogramValue() *HistogramMetricValue

func (*LabelValueMetricValue) GetLabelValues added in v0.6.0

func (l *LabelValueMetricValue) GetLabelValues() map[string]string

func (*LabelValueMetricValue) GetPrometheusResult

func (l *LabelValueMetricValue) GetPrometheusResult() *model.Value

func (*LabelValueMetricValue) GetSimpleValue

func (l *LabelValueMetricValue) GetSimpleValue() float64

type Metric

type Metric struct {
	MetricSource             MetricSource
	MetricType               MetricType
	PromQL                   string            // Optional: Only applicable for PromQL-based metrics
	LabelKey                 string            // Optional: Only applicable for QueryLabel-based metrics
	EngineMetricsNameMapping map[string]string // Optional: Mapping from engine type to raw metric name.
	Description              string
	MetricScope              MetricScope
}

Metric defines a unique metric with metadata.

type MetricScope

type MetricScope string

MetricScope defines the scope of a metric (e.g., model or pod or podmodel).

const (
	ModelMetricScope    MetricScope = "Model"
	PodMetricScope      MetricScope = "Pod"
	PodModelMetricScope MetricScope = "PodModel" // model in pod
)

type MetricSource

type MetricSource string

MetricSource defines the metric source

const (
	// PrometheusEndpoint indicates metrics are queried from a remote Prometheus server.
	// This source allows querying both raw and aggregated metrics, leveraging PromQL for advanced analytics.
	PrometheusEndpoint MetricSource = "PrometheusEndpoint"
	// PodRawMetrics indicates metrics are collected directly from the metricPort of a Pod.
	PodRawMetrics MetricSource = "PodRawMetrics"
)

type MetricSubscriber

type MetricSubscriber interface {
	SubscribedMetrics() []string
}

type MetricType

type MetricType struct {
	Raw   RawMetricType // Optional: Represents the type of raw metric.
	Query QueryType     // Optional: Represents the query type for derived metrics.
}

MetricType defines the type of a metric, including raw metrics and queries.

func (MetricType) IsQuery

func (m MetricType) IsQuery() bool

func (MetricType) IsRawMetric

func (m MetricType) IsRawMetric() bool

type MetricValue

type MetricValue interface {
	GetSimpleValue() float64
	GetHistogramValue() *HistogramMetricValue
	GetPrometheusResult() *model.Value
	GetLabelValues() map[string]string
}

MetricValue is the interface for all metric values.

type PrometheusMetricValue

type PrometheusMetricValue struct {
	Result *model.Value
}

PrometheusMetricValue represents Prometheus query results.

func (*PrometheusMetricValue) GetHistogramValue

func (p *PrometheusMetricValue) GetHistogramValue() *HistogramMetricValue

func (*PrometheusMetricValue) GetLabelValues added in v0.6.0

func (s *PrometheusMetricValue) GetLabelValues() map[string]string

func (*PrometheusMetricValue) GetPrometheusResult

func (p *PrometheusMetricValue) GetPrometheusResult() *model.Value

func (*PrometheusMetricValue) GetSimpleValue

func (p *PrometheusMetricValue) GetSimpleValue() float64

type QueryType

type QueryType string

QueryType defines the type of metric query, such as PromQL.

const (
	PromQL     QueryType = "PromQL"     // PromQL represents a Prometheus query language expression.
	QueryLabel QueryType = "QueryLabel" // Query Label value from raw metrics.
)

type RawMetricType

type RawMetricType string

RawMetricType defines the type of raw metrics (e.g., collected directly from a source).

const (
	Gauge     RawMetricType = "Gauge"     // Gauge represents a snapshot value.
	Counter   RawMetricType = "Counter"   // Counter represents a cumulative value.
	Histogram RawMetricType = "Histogram" // Histogram represents a distribution of values.
)

type Server added in v0.4.0

type Server struct {
	// contains filtered or unexported fields
}

func NewServer added in v0.4.0

func NewServer(addr string) *Server

func (*Server) Start added in v0.4.0

func (s *Server) Start() error

func (*Server) Stop added in v0.4.0

func (s *Server) Stop() error

type SimpleMetricValue

type SimpleMetricValue struct {
	Value  float64
	Labels map[string]string // Optional: Additional labels for the metric.
}

SimpleMetricValue represents simple metrics (e.g., gauge or counter).

func GetCounterGaugeValue

func GetCounterGaugeValue(metric *dto.Metric, metricType dto.MetricType) (*SimpleMetricValue, error)

func (*SimpleMetricValue) GetHistogramValue

func (s *SimpleMetricValue) GetHistogramValue() *HistogramMetricValue

func (*SimpleMetricValue) GetLabelValues added in v0.6.0

func (s *SimpleMetricValue) GetLabelValues() map[string]string

func (*SimpleMetricValue) GetPrometheusResult

func (s *SimpleMetricValue) GetPrometheusResult() *model.Value

func (*SimpleMetricValue) GetSimpleValue

func (s *SimpleMetricValue) GetSimpleValue() float64

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL