Documentation
¶
Index ¶
- Constants
- Variables
- func BuildQuery(queryTemplate string, queryLabels map[string]string) string
- func EmitMetricToPrometheus(routingCtx *types.RoutingContext, pod *v1.Pod, metricName string, ...)
- func ExtractNumericFromPromResult(r *model.Value) (float64, error)
- func GetEngineType(pod v1.Pod) string
- func GetGaugeValueForTest(name string, labelValues ...string) float64
- func GetLabelValueForKey(metric *dto.Metric, key string) (string, error)
- func GetMetricHelp(metricName string) string
- func HttpFailureStatusCode(ctx context.Context, err error, resp *http.Response) (string, string)
- func IncrementCounterMetric(name string, help string, value float64, labelNames []string, ...)
- func InitializePrometheusAPI(endpoint, username, password string) (prometheusv1.API, error)
- func ParseMetricFromBody(body []byte, metricName string) (float64, error)
- func ParseMetricsFromReader(reader io.Reader) (map[string]*dto.MetricFamily, error)
- func ParseMetricsURLWithContext(ctx context.Context, url string) (map[string]*dto.MetricFamily, error)
- func SetGaugeMetric(name string, help string, value float64, labelNames []string, ...)
- func SetHistogramMetric(name string, help string, value *HistogramMetricValue, labelNames []string, ...)
- func SetupCounterMetricsForTest(metricName string, labelNames []string) (*prometheus.CounterVec, func())
- func SetupMetricsForTest(metricName string, labelNames []string) (*prometheus.GaugeVec, func())
- type EngineMetricsFetcher
- type EngineMetricsFetcherConfig
- type EngineMetricsResult
- type HistogramMetricValue
- func (h *HistogramMetricValue) GetBucketValue(bucket string) (float64, bool)
- func (h *HistogramMetricValue) GetCount() float64
- func (h *HistogramMetricValue) GetHistogramValue() *HistogramMetricValue
- func (h *HistogramMetricValue) GetLabelValues() map[string]string
- func (h *HistogramMetricValue) GetMean() float64
- func (h *HistogramMetricValue) GetPercentile(percentile float64) (float64, error)
- func (h *HistogramMetricValue) GetPrometheusResult() *model.Value
- func (h *HistogramMetricValue) GetSimpleValue() float64
- func (h *HistogramMetricValue) GetSum() float64
- func (h *HistogramMetricValue) GetValue() interface{}
- type LabelValueMetricValue
- type Metric
- type MetricScope
- type MetricSource
- type MetricSubscriber
- type MetricType
- type MetricValue
- type PrometheusMetricValue
- type QueryType
- type RawMetricType
- type Server
- type SimpleMetricValue
Constants ¶
const ( GatewayRequestTotal = "gateway_request_total" GatewayE2EDuration = "gateway_e2e_duration_seconds" GatewayInFlight = "gateway_in_flight_requests" // Count of streamed responses where first token delay > 1s GatewayFirstTokenDelayOver1sTotal = "gateway_first_token_delay_over_1s_total" // counter to track #success & #fail requests GatewayRequestSuccessTotal = "gateway_request_success_total" GatewayRequestFailTotal = "gateway_request_fail_total" // counter to track #success & #fail requests for each model GatewayRequestModelSuccessTotal = "gateway_request_model_success_total" GatewayRequestModelFailTotal = "gateway_request_model_fail_total" // counter to track #prompt & #completion tokens GatewayPromptTokenBucketTotal = "gateway_prompt_token_bucket_total" GatewayCompletionTokenBucketTotal = "gateway_completion_token_bucket_total" // counter to track #success & #fail prefill requests GatewayPrefillRequestSuccessTotal = "gateway_prefill_request_success_total" GatewayPrefillRequestFailTotal = "gateway_prefill_request_fail_total" // gauge to track #outstanding prefill & decode requests GatewayPrefillOutstandingRequests = "gateway_prefill_outstanding_requests" GatewayDecodeOutstandingRequests = "gateway_decode_outstanding_requests" // counter to track #prefill & #decode pods selected by pd PDSelectedPrefillPodTotal = "pd_selected_prefill_pod_total" PDSelectedDecodePodTotal = "pd_selected_decode_pod_total" // Duration bucket counters for timing breakdowns GatewayRoutingTimeBucketTotal = "gateway_routing_time_bucket_total" GatewayPrefillTimeBucketTotal = "gateway_prefill_time_bucket_total" GatewayKVTransferTimeBucketTotal = "gateway_kv_transfer_time_bucket_total" GatewayTTFTBucketTotal = "gateway_ttft_bucket_total" GatewayTPOTBucketTotal = "gateway_tpot_bucket_total" GatewayDecodeTimeBucketTotal = "gateway_decode_time_bucket_total" GatewayTotalTimeBucketTotal = "gateway_total_time_bucket_total" )
const ( NumRequestsRunning = "num_requests_running" NumRequestsWaiting = "num_requests_waiting" EngineSleepState = "engine_sleep_state" HTTPRequestTotal = "http_requests_total" NumPreemptionsTotal = "num_preemptions_total" RequestSuccessTotal = "request_success_total" NumPrefillPreallocQueueReqs = "num_prefill_prealloc_queue_reqs" NumDecodePreallocQueueReqs = "num_decode_prealloc_queue_reqs" E2ERequestLatencySeconds = "e2e_request_latency_seconds" RequestQueueTimeSeconds = "request_queue_time_seconds" RequestInferenceTimeSeconds = "request_inference_time_seconds" PerStageReqLatencySeconds = "per_stage_req_latency_seconds" HTTPRequestDurationSeconds = "http_request_duration_seconds" HTTPRequestDurationHighRSeconds = "http_request_duration_highr_seconds" TimeToFirstTokenSeconds = "time_to_first_token_seconds" RequestPrefillTimeSeconds = "request_prefill_time_seconds" PromptTokenTotal = "prompt_tokens_total" RequestPromptTokens = "request_prompt_tokens" // deprecated (time_per_output_token_seconds), use inter_token_latency_seconds instead TimePerOutputTokenSeconds = "time_per_output_token_seconds" InterTokenLatencySeconds = "inter_token_latency_seconds" RequestTimePerOutputTokenSeconds = "request_time_per_output_token_seconds" RequestDecodeTimeSeconds = "request_decode_time_seconds" GenerationTokenTotal = "generation_tokens_total" IterationTokensTotal = "iteration_tokens_total" RequestGenerationTokens = "request_generation_tokens" RequestMaxNumGenerationTokens = "request_max_num_generation_tokens" KVCacheUsagePerc = "kv_cache_usage_perc" NixlNumFailedTransfers = "nixl_num_failed_transfers_total" NixlNumFailedNotifications = "nixl_num_failed_notifications_total" PrefixCacheHitTotal = "prefix_cache_hits_total" PrefixCacheQueriesTotal = "prefix_cache_queries_total" ExternalPrefixCacheHitsTotal = "external_prefix_cache_hits_total" ExternalPrefixCacheQueriesTotal = "external_prefix_cache_queries_total" NixlXferTimeSeconds = "nixl_xfer_time_seconds" NixlPostTimeSeconds = 
"nixl_post_time_seconds" NixlBytesTransferred = "nixl_bytes_transferred" NixlNumDescriptors = "nixl_num_descriptors" DrainRate1m = "drain_rate_1m" P95TTFT5m = "p95_ttft_5m" P95TTFT5mPod = "p95_ttft_5m_pod" AvgTTFT5mPod = "avg_ttft_5m_pod" P95TPOT5mPod = "p95_tpot_5m_pod" AvgTPOT5mPod = "avg_tpot_pod_5m" AvgPromptToksPerReq = "avg_prompt_toks_per_req" AvgGenerationToksPerReq = "avg_generation_toks_per_req" GPUCacheUsagePerc = "gpu_cache_usage_perc" GPUBusyTimeRatio = "gpu_busy_time_ratio" CPUCacheUsagePerc = "cpu_cache_usage_perc" EngineUtilization = "engine_utilization" AvgE2ELatencyPod = "avg_e2e_latency_pod" AvgRequestsPerMinPod = "avg_requests_per_min_pod" AvgPromptThroughputToksPerMinPod = "avg_prompt_throughput_toks_per_min_pod" AvgGenerationThroughputToksPerMinPod = "avg_generation_throughput_toks_per_min_pod" MaxLora = "max_lora" WaitingLoraAdapters = "waiting_lora_adapters" RunningLoraAdapters = "running_lora_adapters" VTCBucketSizeActive = "vtc_bucket_size_active" // Realtime metrics RealtimeNumRequestsRunning = "realtime_num_requests_running" RealtimeNormalizedPendings = "realtime_normalized_pendings" // error to read metrics from backend PrometheusQueryFail = "prometheus_query_fail" LLMEngineMetricsQueryFail = "llm_engine_metrics_query_fail" // Deprecated metrics NumRequestsSwapped = "num_requests_swapped" AvgPromptThroughputToksPerS = "avg_prompt_throughput_toks_per_s" AvgGenerationThroughputToksPerS = "avg_generation_throughput_toks_per_s" )
Variables ¶
var ( // Function variables that can be overridden for testing SetGaugeMetricFnForTest = defaultSetGaugeMetric IncrementCounterMetricFnForTest = defaultIncrementCounterMetric )
var ( GatewayMetrics = map[string]Metric{ GatewayRequestTotal: { MetricScope: PodMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Counter, }, Description: "Total number of requests received by the gateway", }, GatewayRequestModelSuccessTotal: { MetricScope: PodMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Counter, }, Description: "Total number of successful requests received by the gateway for each model", }, GatewayRequestModelFailTotal: { MetricScope: PodMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Counter, }, Description: "Total number of failed requests received by the gateway for each model", }, GatewayPrefillRequestSuccessTotal: { MetricScope: PodMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Counter, }, Description: "Total number of successful prefill requests received by the gateway", }, GatewayPrefillRequestFailTotal: { MetricScope: PodMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Counter, }, Description: "Total number of failed prefill requests received by the gateway", }, GatewayPrefillOutstandingRequests: { MetricScope: PodMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Gauge, }, Description: "Total number of outstanding prefill requests received by the gateway", }, GatewayDecodeOutstandingRequests: { MetricScope: PodMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Gauge, }, Description: "Total number of outstanding decode requests received by the gateway", }, GatewayE2EDuration: { MetricScope: PodMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Histogram, }, Description: "End-to-end latency distribution of requests received by the gateway", }, GatewayInFlight: { MetricScope: PodMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Gauge, }, Description: "Current number of requests in flight (i.e., being processed) by the gateway", }, 
GatewayFirstTokenDelayOver1sTotal: { MetricScope: PodMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Counter, }, Description: "Count of streamed responses where first token delay > 1s", }, GatewayPromptTokenBucketTotal: { MetricScope: PodMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Counter, }, Description: "Requests counted by prompt token bucket", }, GatewayCompletionTokenBucketTotal: { MetricScope: PodMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Counter, }, Description: "Requests counted by completion token bucket", }, PDSelectedPrefillPodTotal: { MetricScope: PodMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Counter, }, Description: "Total selections of prefill pods by the PD router", }, PDSelectedDecodePodTotal: { MetricScope: PodMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Counter, }, Description: "Total selections of decode pods by the PD router", }, GatewayRoutingTimeBucketTotal: { MetricScope: PodMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{Raw: Counter}, Description: "Requests counted by routing time bucket", }, GatewayPrefillTimeBucketTotal: { MetricScope: PodMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{Raw: Counter}, Description: "Requests counted by prefill time bucket", }, GatewayKVTransferTimeBucketTotal: { MetricScope: PodMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{Raw: Counter}, Description: "Requests counted by KV transfer time bucket", }, GatewayTTFTBucketTotal: { MetricScope: PodMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{Raw: Counter}, Description: "Requests counted by TTFT bucket", }, GatewayTPOTBucketTotal: { MetricScope: PodMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{Raw: Counter}, Description: "Requests counted by TPOT bucket", }, GatewayDecodeTimeBucketTotal: { MetricScope: PodMetricScope, MetricSource: 
PodRawMetrics, MetricType: MetricType{Raw: Counter}, Description: "Requests counted by decode time bucket", }, GatewayTotalTimeBucketTotal: { MetricScope: PodMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{Raw: Counter}, Description: "Requests counted by total time bucket", }, } )
var ( // Metrics defines all available metrics, including raw and query-based metrics. Metrics = map[string]Metric{ NumRequestsRunning: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Gauge, }, EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:num_requests_running", "sglang": "sglang:num_running_reqs", }, Description: "Number of running requests", }, NumRequestsWaiting: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Gauge, }, EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:num_requests_waiting", "sglang": "sglang:num_queue_reqs", }, Description: "Number of waiting requests", }, NumRequestsSwapped: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Gauge, }, EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:num_requests_swapped", "sglang": "sglang:num_retracted_reqs", }, Description: "Number of swapped requests", }, EngineSleepState: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Gauge, }, EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:engine_sleep_state", }, Description: "Engine sleep state; awake = 0 means engine is sleeping; awake = 1 means engine is awake; weights_offloaded = 1 means sleep level 1; discard_all = 1 means sleep level 2.", }, HTTPRequestTotal: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Counter, }, EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:http_requests_total", }, Description: "Total number of requests by method, status and handler.", }, NumPreemptionsTotal: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Counter, }, EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:num_preemptions_total", }, Description: "Number of preemptions", }, RequestSuccessTotal: { MetricScope: PodModelMetricScope, MetricSource: 
PodRawMetrics, MetricType: MetricType{ Raw: Counter, }, EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:num_requests_success_total", "sglang": "sglang:num_requests_total", }, Description: "Number of successful requests", }, NumPrefillPreallocQueueReqs: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Gauge, }, EngineMetricsNameMapping: map[string]string{ "sglang": "sglang:num_prefill_prealloc_queue_reqs", }, Description: "Number of prefill preallocation queue requests", }, NumDecodePreallocQueueReqs: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Gauge, }, EngineMetricsNameMapping: map[string]string{ "sglang": "sglang:num_decode_prealloc_queue_reqs", }, Description: "Number of decode preallocation queue requests", }, E2ERequestLatencySeconds: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Histogram, }, EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:e2e_request_latency_seconds", "sglang": "sglang:e2e_request_latency_seconds", }, Description: "End-to-end request latency in seconds", }, RequestQueueTimeSeconds: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Histogram, }, EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:request_queue_time_seconds", }, Description: "Request queue time in seconds", }, RequestInferenceTimeSeconds: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Histogram, }, EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:request_inference_time_seconds", }, Description: "Request inference time in seconds", }, PerStageReqLatencySeconds: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Histogram, }, EngineMetricsNameMapping: map[string]string{ "sglang": "sglang:per_stage_req_latency_seconds", }, Description: "Per-stage request latency in 
seconds", }, HTTPRequestDurationSeconds: { MetricScope: PodMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Histogram, }, EngineMetricsNameMapping: map[string]string{ "vllm": "http_request_duration_seconds", }, Description: "Histogram of request duration in seconds", }, HTTPRequestDurationHighRSeconds: { MetricScope: PodMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Histogram, }, EngineMetricsNameMapping: map[string]string{ "vllm": "http_request_duration_highr_seconds", }, Description: "Histogram of request duration in seconds for high priority requests", }, PromptTokenTotal: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Counter, }, EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:prompt_tokens_total", }, Description: "Total prompt tokens", }, RequestPromptTokens: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Histogram, }, EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:request_prompt_tokens", }, Description: "Histogram of prompt tokens", }, GenerationTokenTotal: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Counter, }, EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:generation_tokens_total", }, Description: "Total generation tokens", }, RequestGenerationTokens: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Histogram, }, EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:request_generation_tokens", }, Description: "Histogram of generation tokens", }, RequestMaxNumGenerationTokens: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Histogram, }, EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:request_max_num_generation_tokens", }, Description: "Histogram of max number of generation tokens", }, IterationTokensTotal: { MetricScope: 
PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Histogram, }, EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:iteration_tokens_total", }, Description: "Total iteration tokens", }, TimeToFirstTokenSeconds: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Histogram, }, EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:time_to_first_token_seconds", "sglang": "sglang:time_to_first_token_seconds", }, Description: "Time to first token in seconds", }, TimePerOutputTokenSeconds: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Histogram, }, EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:time_per_output_token_seconds", "sglang": "sglang:inter_token_latency_seconds", }, Description: "Time per output token in seconds", }, InterTokenLatencySeconds: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Histogram, }, EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:inter_token_latency_seconds", "sglang": "sglang:inter_token_latency_seconds", }, Description: "Inter-token latency in seconds", }, RequestDecodeTimeSeconds: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Histogram, }, EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:request_decode_time_seconds", }, Description: "Request decode time in seconds", }, RequestPrefillTimeSeconds: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Histogram, }, EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:request_prefill_time_seconds", }, Description: "Request prefill time in seconds", }, RequestTimePerOutputTokenSeconds: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Histogram, }, EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:request_time_per_output_token_seconds", }, 
Description: "Time per output token in seconds", }, GPUCacheUsagePerc: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Gauge, }, EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:gpu_cache_usage_perc", "sglang": "sglang:token_usage", "xllm": "kv_cache_utilization", }, Description: "GPU cache usage percentage", }, EngineUtilization: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Gauge, }, EngineMetricsNameMapping: map[string]string{ "xllm": "engine_utilization", }, Description: "GPU busy time ratio", }, CPUCacheUsagePerc: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Gauge, }, EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:cpu_cache_usage_perc", }, Description: "CPU cache usage percentage", }, KVCacheUsagePerc: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Gauge, }, EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:kv_cache_usage_perc", "sglang": "sglang:token_usage", "xllm": "kv_cache_utilization", }, Description: "KV-cache usage. 
1 means 100 percent usage.", }, PrefixCacheQueriesTotal: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Counter, }, EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:prefix_cache_queries_total", }, Description: "Prefix cache queries, in terms of number of queried tokens..", }, PrefixCacheHitTotal: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Counter, }, EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:prefix_cache_hits_total", }, Description: "Prefix cache hits, in terms of number of cached tokens.", }, ExternalPrefixCacheQueriesTotal: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Counter, }, EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:external_prefix_cache_queries_total", }, Description: "External prefix cache queries from KV connector cross-instance cache sharing, in terms of number of queried tokens.", }, ExternalPrefixCacheHitsTotal: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Counter, }, EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:external_prefix_cache_hits_total", }, Description: "External prefix cache hits from KV connector cross-instance cache sharing, in terms of number of cached tokens.", }, NixlXferTimeSeconds: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Histogram, }, EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:nixl_xfer_time_seconds", }, Description: "transfer duration for NIXL KV Cache transfers", }, NixlPostTimeSeconds: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Histogram, }, EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:nixl_post_time_seconds", }, Description: "transfer post time for NIXL KV Cache transfers", }, NixlBytesTransferred: { MetricScope: PodModelMetricScope, MetricSource: 
PodRawMetrics, MetricType: MetricType{ Raw: Histogram, }, EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:nixl_bytes_transferred", }, Description: "number of bytes transferred per NIXL KV Cache transfer", }, NixlNumDescriptors: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Histogram, }, EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:nixl_num_descriptors", }, Description: "number of descriptors per NIXL KV Cache transfers", }, NixlNumFailedTransfers: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Counter, }, EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:nixl_num_failed_transfers", }, Description: "number of failed NIXL KV Cache transfers", }, NixlNumFailedNotifications: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Counter, }, EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:nixl_num_failed_notifications", }, Description: "number of failed NIXL KV Cache notifications", }, DrainRate1m: { MetricScope: PodModelMetricScope, MetricSource: PrometheusEndpoint, MetricType: MetricType{ Query: PromQL, }, PromQL: ` clamp_min( rate( sglang:num_requests_total{ instance="${instance}", model_name="${model_name}", job="pods" }[1m] ), 0.01 )`, Description: "1-minute average rate of finished requests (Drains), clamped to avoid zero", }, P95TTFT5m: { MetricScope: PodModelMetricScope, MetricSource: PrometheusEndpoint, MetricType: MetricType{ Query: PromQL, }, PromQL: `histogram_quantile(0.95, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{instance="${instance}", model_name="${model_name}", job="pods"}[5m])))`, Description: "95th ttft in last 5 mins", }, P95TTFT5mPod: { MetricScope: PodMetricScope, MetricSource: PrometheusEndpoint, MetricType: MetricType{ Query: PromQL, }, PromQL: `histogram_quantile(0.95, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{instance="${instance}", 
job="pods"}[5m])))`, Description: "95th ttft in last 5 mins", }, AvgTTFT5mPod: { MetricScope: PodMetricScope, MetricSource: PrometheusEndpoint, MetricType: MetricType{ Query: PromQL, }, PromQL: `increase(vllm:time_to_first_token_seconds_sum{instance="${instance}", job="pods"}[5m]) / increase(vllm:time_to_first_token_seconds_count{instance="${instance}", job="pods"}[5m])`, Description: "Average ttft in last 5 mins", }, P95TPOT5mPod: { MetricScope: PodMetricScope, MetricSource: PrometheusEndpoint, MetricType: MetricType{ Query: PromQL, }, PromQL: `histogram_quantile(0.95, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{instance="${instance}", job="pods"}[5m])))`, Description: "95th tpot in last 5 mins", }, AvgTPOT5mPod: { MetricScope: PodMetricScope, MetricSource: PrometheusEndpoint, MetricType: MetricType{ Query: PromQL, }, PromQL: `increase(vllm:time_per_output_token_seconds_sum{instance="${instance}", job="pods"}[5m]) / increase(vllm:time_per_output_token_seconds_count{instance="${instance}", job="pods"}[5m])`, Description: "Average tpot in last 5 mins", }, AvgPromptToksPerReq: { MetricScope: PodModelMetricScope, MetricSource: PrometheusEndpoint, MetricType: MetricType{ Query: PromQL, }, PromQL: `increase(vllm:request_prompt_tokens_sum{instance="${instance}", model_name="${model_name}", job="pods"}[1d]) / increase(vllm:request_prompt_tokens_count{instance="${instance}", model_name="${model_name}", job="pods"}[1d])`, Description: "Average prompt tokens per request in last day", }, AvgGenerationToksPerReq: { MetricScope: PodModelMetricScope, MetricSource: PrometheusEndpoint, MetricType: MetricType{ Query: PromQL, }, PromQL: `increase(vllm:request_generation_tokens_sum{instance="${instance}", model_name="${model_name}", job="pods"}[1d]) / increase(vllm:request_generation_tokens_count{instance="${instance}", model_name="${model_name}", job="pods"}[1d])`, Description: "Average generation tokens per request in last day", }, AvgE2ELatencyPod: { MetricScope: 
PodMetricScope, MetricSource: PrometheusEndpoint, MetricType: MetricType{ Query: PromQL, }, PromQL: `increase(vllm:e2e_request_latency_seconds_sum{instance="${instance}", job="pods"}[5m]) / increase(vllm:e2e_request_latency_seconds_count{instance="${instance}", job="pods"}[5m])`, Description: "Average End-to-end latency in last 5 mins", }, AvgRequestsPerMinPod: { MetricScope: PodMetricScope, MetricSource: PrometheusEndpoint, MetricType: MetricType{ Query: PromQL, }, PromQL: `increase(vllm:request_success_total{instance="${instance}", job="pods"}[5m]) / 5`, Description: "Average requests throughput per minute in last 5 mins", }, AvgPromptThroughputToksPerS: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Gauge, }, EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:avg_prompt_throughput_toks_per_s", }, Description: "Average prompt throughput in tokens per second", }, AvgGenerationThroughputToksPerS: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Gauge, }, EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:avg_generation_throughput_toks_per_s", "sglang": "sglang:gen_throughput", }, Description: "Average generation throughput in tokens per second", }, AvgPromptThroughputToksPerMinPod: { MetricScope: PodMetricScope, MetricSource: PrometheusEndpoint, MetricType: MetricType{ Query: PromQL, }, PromQL: `increase(vllm:prompt_tokens_total{instance="${instance}", job="pods"}[5m]) / 5`, Description: "Average prompt throughput in tokens per minute in last 5 mins", }, AvgGenerationThroughputToksPerMinPod: { MetricScope: PodMetricScope, MetricSource: PrometheusEndpoint, MetricType: MetricType{ Query: PromQL, }, PromQL: `increase(vllm:generation_tokens_total{instance="${instance}", job="pods"}[5m]) / 5`, Description: "Average generation throughput in tokens per minute in last 5 mins", }, MaxLora: { MetricScope: PodMetricScope, MetricSource: PodRawMetrics, MetricType: 
MetricType{ Query: QueryLabel, }, LabelKey: "max_lora", EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:lora_requests_info", }, Description: "Max count of Lora Adapters", }, RunningLoraAdapters: { MetricScope: PodMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Query: QueryLabel, }, LabelKey: "running_lora_adapters", EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:lora_requests_info", }, Description: "Count of running Lora Adapters", }, WaitingLoraAdapters: { MetricScope: PodMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Query: QueryLabel, }, LabelKey: "waiting_lora_adapters", EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:lora_requests_info", }, Description: "Count of waiting Lora Adapters", }, VTCBucketSizeActive: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Gauge, }, Description: "Current adaptive bucket size used by VTC algorithm for token normalization", }, PrometheusQueryFail: { MetricScope: PodMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Counter, }, Description: "Total number of Prometheus query failures", }, LLMEngineMetricsQueryFail: { MetricScope: PodMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Counter, }, Description: "Total number of LLM engine metrics query failures", }, } )
Functions ¶
func BuildQuery ¶
BuildQuery dynamically injects labels into a PromQL query template.
func EmitMetricToPrometheus ¶ added in v0.6.0
func EmitMetricToPrometheus(routingCtx *types.RoutingContext, pod *v1.Pod, metricName string, metricValue MetricValue, extra map[string]string)
func ExtractNumericFromPromResult ¶ added in v0.6.0
func GetEngineType ¶ added in v0.5.0
GetEngineType extracts the engine type from pod labels, defaults to "vllm" for backward compatibility This function is centralized to avoid duplication across packages
func GetGaugeValueForTest ¶
func GetMetricHelp ¶
func HttpFailureStatusCode ¶ added in v0.6.0
func IncrementCounterMetric ¶
func InitializePrometheusAPI ¶
func InitializePrometheusAPI(endpoint, username, password string) (prometheusv1.API, error)
InitializePrometheusAPI initializes the Prometheus API client.
func ParseMetricFromBody ¶
ParseMetricFromBody parses a simple metric from the Prometheus response body.
func ParseMetricsFromReader ¶ added in v0.5.0
ParseMetricsFromReader parses Prometheus metrics from an io.Reader (extracted for reuse)
func ParseMetricsURLWithContext ¶ added in v0.4.0
func SetGaugeMetric ¶
func SetHistogramMetric ¶ added in v0.6.0
func SetHistogramMetric(name string, help string, value *HistogramMetricValue, labelNames []string, labelValues ...string)
func SetupCounterMetricsForTest ¶
func SetupCounterMetricsForTest(metricName string, labelNames []string) (*prometheus.CounterVec, func())
func SetupMetricsForTest ¶
func SetupMetricsForTest(metricName string, labelNames []string) (*prometheus.GaugeVec, func())
Types ¶
type EngineMetricsFetcher ¶ added in v0.5.0
type EngineMetricsFetcher struct {
// contains filtered or unexported fields
}
EngineMetricsFetcher provides a unified interface for fetching typed metrics from inference engine pods. It leverages the centralized metrics registry and type system in pkg/metrics.
func NewEngineMetricsFetcher ¶ added in v0.5.0
func NewEngineMetricsFetcher() *EngineMetricsFetcher
NewEngineMetricsFetcher creates a new engine metrics fetcher with default configuration
func NewEngineMetricsFetcherWithConfig ¶ added in v0.5.0
func NewEngineMetricsFetcherWithConfig(config EngineMetricsFetcherConfig) *EngineMetricsFetcher
NewEngineMetricsFetcherWithConfig creates a new engine metrics fetcher with custom configuration
func (*EngineMetricsFetcher) FetchAllTypedMetrics ¶ added in v0.5.0
func (ef *EngineMetricsFetcher) FetchAllTypedMetrics(ctx context.Context, endpoint, engineType, identifier string, requestedMetrics []string) (*EngineMetricsResult, error)
FetchAllTypedMetrics fetches all available typed metrics from an engine endpoint
func (*EngineMetricsFetcher) FetchTypedMetric ¶ added in v0.5.0
func (ef *EngineMetricsFetcher) FetchTypedMetric(ctx context.Context, endpoint, engineType, identifier, metricName string) (MetricValue, error)
FetchTypedMetric fetches a single typed metric from an engine endpoint. Note: if the client needs to fetch multiple metrics, it is better to use FetchAllTypedMetrics.
type EngineMetricsFetcherConfig ¶ added in v0.5.0
type EngineMetricsFetcherConfig struct {
Timeout time.Duration
MaxRetries int
BaseDelay time.Duration
MaxDelay time.Duration
InsecureTLS bool
}
EngineMetricsFetcherConfig holds configuration for engine metrics fetching
func DefaultEngineMetricsFetcherConfig ¶ added in v0.5.0
func DefaultEngineMetricsFetcherConfig() EngineMetricsFetcherConfig
DefaultEngineMetricsFetcherConfig returns sensible defaults for engine metrics fetching
type EngineMetricsResult ¶ added in v0.5.0
type EngineMetricsResult struct {
Identifier string // Caller-provided identifier (e.g., pod name)
Endpoint string // The endpoint that was queried
EngineType string
Metrics map[string]MetricValue // Pod-scoped metrics
ModelMetrics map[string]MetricValue // Pod+Model-scoped metrics (key format: "model/metric")
Errors []error // Any errors encountered during fetching
}
EngineMetricsResult contains the result of fetching metrics from an engine endpoint
type HistogramMetricValue ¶
type HistogramMetricValue struct {
Sum float64
Count float64
Buckets map[string]float64 // e.g., {"0.1": 5, "0.5": 3, "1.0": 2}
Labels map[string]string // Optional: Additional labels for the histogram.
}
HistogramMetricValue represents a detailed histogram metric.
func GetHistogramValue ¶
func GetHistogramValue(metric *dto.Metric) (*HistogramMetricValue, error)
func ParseHistogramFromBody ¶
func ParseHistogramFromBody(body []byte, metricName string) (*HistogramMetricValue, error)
ParseHistogramFromBody parses a histogram metric from the Prometheus response body.
func (*HistogramMetricValue) GetBucketValue ¶
func (h *HistogramMetricValue) GetBucketValue(bucket string) (float64, bool)
GetBucketValue returns the count for a specific bucket.
func (*HistogramMetricValue) GetCount ¶
func (h *HistogramMetricValue) GetCount() float64
GetCount returns the total count of values in the histogram.
func (*HistogramMetricValue) GetHistogramValue ¶
func (h *HistogramMetricValue) GetHistogramValue() *HistogramMetricValue
func (*HistogramMetricValue) GetLabelValues ¶ added in v0.6.0
func (h *HistogramMetricValue) GetLabelValues() map[string]string
func (*HistogramMetricValue) GetMean ¶
func (h *HistogramMetricValue) GetMean() float64
GetMean returns the mean value of the histogram (Sum / Count).
func (*HistogramMetricValue) GetPercentile ¶
func (h *HistogramMetricValue) GetPercentile(percentile float64) (float64, error)
func (*HistogramMetricValue) GetPrometheusResult ¶
func (h *HistogramMetricValue) GetPrometheusResult() *model.Value
func (*HistogramMetricValue) GetSimpleValue ¶
func (h *HistogramMetricValue) GetSimpleValue() float64
func (*HistogramMetricValue) GetSum ¶
func (h *HistogramMetricValue) GetSum() float64
GetSum returns the sum of the histogram values.
func (*HistogramMetricValue) GetValue ¶
func (h *HistogramMetricValue) GetValue() interface{}
type LabelValueMetricValue ¶
type LabelValueMetricValue struct {
Value string
}
LabelValueMetricValue represents a metric value extracted from a metric's label (for QueryLabel-based metrics).
func (*LabelValueMetricValue) GetHistogramValue ¶
func (l *LabelValueMetricValue) GetHistogramValue() *HistogramMetricValue
func (*LabelValueMetricValue) GetLabelValues ¶ added in v0.6.0
func (l *LabelValueMetricValue) GetLabelValues() map[string]string
func (*LabelValueMetricValue) GetPrometheusResult ¶
func (l *LabelValueMetricValue) GetPrometheusResult() *model.Value
func (*LabelValueMetricValue) GetSimpleValue ¶
func (l *LabelValueMetricValue) GetSimpleValue() float64
type Metric ¶
type Metric struct {
MetricSource MetricSource
MetricType MetricType
PromQL string // Optional: Only applicable for PromQL-based metrics
LabelKey string // Optional: Only applicable for QueryLabel-based metrics
EngineMetricsNameMapping map[string]string // Optional: Mapping from engine type to raw metric name.
Description string
MetricScope MetricScope
}
Metric defines a unique metric with metadata.
type MetricScope ¶
type MetricScope string
MetricScope defines the scope of a metric (e.g., model or pod or podmodel).
const ( ModelMetricScope MetricScope = "Model" PodMetricScope MetricScope = "Pod" PodModelMetricScope MetricScope = "PodModel" // model in pod )
type MetricSource ¶
type MetricSource string
MetricSource defines the metric source
const ( // PrometheusEndpoint indicates metrics are queried from a remote Prometheus server. // This source allows querying both raw and aggregated metrics, leveraging PromQL for advanced analytics. PrometheusEndpoint MetricSource = "PrometheusEndpoint" // PodRawMetrics indicates metrics are collected directly from the metricPort of a Pod. PodRawMetrics MetricSource = "PodRawMetrics" )
type MetricSubscriber ¶
type MetricSubscriber interface {
SubscribedMetrics() []string
}
type MetricType ¶
type MetricType struct {
Raw RawMetricType // Optional: Represents the type of raw metric.
Query QueryType // Optional: Represents the query type for derived metrics.
}
MetricType defines the type of a metric, including raw metrics and queries.
func (MetricType) IsQuery ¶
func (m MetricType) IsQuery() bool
func (MetricType) IsRawMetric ¶
func (m MetricType) IsRawMetric() bool
type MetricValue ¶
type MetricValue interface {
GetSimpleValue() float64
GetHistogramValue() *HistogramMetricValue
GetPrometheusResult() *model.Value
GetLabelValues() map[string]string
}
MetricValue is the interface for all metric values.
type PrometheusMetricValue ¶
PrometheusMetricValue represents Prometheus query results.
func (*PrometheusMetricValue) GetHistogramValue ¶
func (p *PrometheusMetricValue) GetHistogramValue() *HistogramMetricValue
func (*PrometheusMetricValue) GetLabelValues ¶ added in v0.6.0
func (p *PrometheusMetricValue) GetLabelValues() map[string]string
func (*PrometheusMetricValue) GetPrometheusResult ¶
func (p *PrometheusMetricValue) GetPrometheusResult() *model.Value
func (*PrometheusMetricValue) GetSimpleValue ¶
func (p *PrometheusMetricValue) GetSimpleValue() float64
type RawMetricType ¶
type RawMetricType string
RawMetricType defines the type of raw metrics (e.g., collected directly from a source).
const ( Gauge RawMetricType = "Gauge" // Gauge represents a snapshot value. Counter RawMetricType = "Counter" // Counter represents a cumulative value. Histogram RawMetricType = "Histogram" // Histogram represents a distribution of values. )
type SimpleMetricValue ¶
type SimpleMetricValue struct {
Value float64
Labels map[string]string // Optional: Additional labels for the metric.
}
SimpleMetricValue represents simple metrics (e.g., gauge or counter).
func GetCounterGaugeValue ¶
func GetCounterGaugeValue(metric *dto.Metric, metricType dto.MetricType) (*SimpleMetricValue, error)
func (*SimpleMetricValue) GetHistogramValue ¶
func (s *SimpleMetricValue) GetHistogramValue() *HistogramMetricValue
func (*SimpleMetricValue) GetLabelValues ¶ added in v0.6.0
func (s *SimpleMetricValue) GetLabelValues() map[string]string
func (*SimpleMetricValue) GetPrometheusResult ¶
func (s *SimpleMetricValue) GetPrometheusResult() *model.Value
func (*SimpleMetricValue) GetSimpleValue ¶
func (s *SimpleMetricValue) GetSimpleValue() float64