metrics

package
v1.46.3 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Sep 29, 2025 License: Apache-2.0 Imports: 29 Imported by: 0

Documentation

Overview

NOTE: When adding a new field or tag to an existing metric, or adding a new metric, make sure it is also added to the SetupTables function for manual DB migration.

Index

Constants

View Source
const (
	// CREATE_TABLE_OPTION_TPL is a format template for the table-options
	// suffix of a CREATE TABLE statement. Its single %s verb is the table TTL;
	// the statements in TFVersionMigrationMap use the same suffix with '30d'.
	CREATE_TABLE_OPTION_TPL = "ENGINE=mito WITH( ttl='%s', merge_mode = 'last_non_null')"
)
View Source
// CurrentAppSQLVersion is the SQL schema version string of this application.
// NOTE(review): presumably matched against the Version entries of
// TFVersionMigrationMap, which also lists "1.1" — confirm which one is current.
const CurrentAppSQLVersion = "1.0"

Variables

View Source
// TFVersionMigrationMap lists, per schema version, the SQL statements that
// belong to that version. Version is the schema version string and AlterSQL
// holds that version's statements in the order they are listed.
//
// Version "1.0" carries the CREATE TABLE IF NOT EXISTS statements for the
// tf_* tables; version "1.1" currently has no statements.
var TFVersionMigrationMap = []struct {
	Version  string
	AlterSQL []string
}{

	{"1.0", []string{
		// NOTE(review): `ready` is created as String here while
		// WorkerResourceMetrics.Ready is a bool — confirm the mapping.
		"CREATE TABLE IF NOT EXISTS tf_worker_resources (\n    `worker` String NULL SKIPPING INDEX,\n    `workload` String NULL INVERTED INDEX,\n    `pool` String NULL INVERTED INDEX,\n    `namespace` String NULL INVERTED INDEX,\n    `qos` String NULL,\n    `tflops_request` Double NULL,\n    `tflops_limit` Double NULL,\n    `vram_bytes_request` Double NULL,\n    `vram_bytes_limit` Double NULL,\n    `gpu_count` BigInt NULL,\n    `raw_cost` Double NULL,\n    `ready` String NULL,\n    `ts` Timestamp_ms TIME INDEX,\n    PRIMARY KEY (`worker`, `workload`, `pool`, `namespace`))\n    ENGINE=mito WITH( ttl='30d', merge_mode = 'last_non_null')",

		// NOTE(review): NodeResourceMetrics also declares limited_tflops,
		// limited_vram_bytes, limited_tflops_percent_virtual and
		// limited_vram_percent_virtual columns, which this statement does not
		// create and version "1.1" does not add — confirm whether a migration
		// is required.
		"CREATE TABLE IF NOT EXISTS tf_node_metrics (\n    `node` String NULL INVERTED INDEX,\n    `pool` String NULL INVERTED INDEX,\n    `phase` String NULL INVERTED INDEX,\n    `allocated_tflops` Double NULL,\n    `allocated_tflops_percent` Double NULL,\n    `allocated_vram_bytes` Double NULL,\n    `allocated_vram_percent` Double NULL,\n    `allocated_tflops_percent_virtual` Double NULL,\n    `allocated_vram_percent_virtual` Double NULL,\n    `raw_cost` Double NULL,\n    `gpu_count` BigInt NULL,\n    `ts` Timestamp_ms TIME INDEX,\n    PRIMARY KEY (`node`, `pool`, `phase`))\n    ENGINE=mito WITH( ttl='30d', merge_mode = 'last_non_null')",

		"CREATE TABLE IF NOT EXISTS tf_system_metrics (\n    `pool` String NULL INVERTED INDEX,\n    `total_workers_cnt` BigInt NULL,\n    `total_nodes_cnt` BigInt NULL,\n    `total_allocation_fail_cnt` BigInt NULL,\n    `total_allocation_success_cnt` BigInt NULL,\n    `total_scale_up_cnt` BigInt NULL,\n    `total_scale_down_cnt` BigInt NULL,\n    `ts` Timestamp_ms TIME INDEX,\n    PRIMARY KEY (`pool`))\n    ENGINE=mito WITH( ttl='30d', merge_mode = 'last_non_null')",

		"CREATE TABLE IF NOT EXISTS tf_system_log (\n    `component` String NULL INVERTED INDEX,\n    `container` String NULL INVERTED INDEX,\n    `message` String NULL FULLTEXT INDEX WITH (analyzer = 'English' , case_sensitive = 'false'),\n    `namespace` String NULL INVERTED INDEX,\n    `pod` String NULL SKIPPING INDEX,\n    `stream` String NULL,\n    `timestamp` String NULL,\n    `greptime_timestamp` Timestamp_ms TIME INDEX,\n    PRIMARY KEY (`component`, `container`, `namespace`, `pod`))\n    ENGINE=mito WITH( ttl='30d', merge_mode = 'last_non_null')",

		"CREATE TABLE IF NOT EXISTS tf_worker_usage (\n    `workload` String NULL INVERTED INDEX,\n    `worker` String NULL SKIPPING INDEX,\n    `namespace` String NULL INVERTED INDEX,\n    `pool` String NULL INVERTED INDEX,\n    `node` String NULL INVERTED INDEX,\n    `uuid` String NULL INVERTED INDEX,\n    `compute_tflops` Double NULL,\n    `compute_percentage` Double NULL,\n    `memory_bytes` BigInt UNSIGNED NULL,\n    `memory_percentage` Double NULL,\n    `compute_throttled_cnt` BigInt NULL,\n    `vram_freezed_cnt` BigInt NULL,\n    `vram_resumed_cnt` BigInt NULL,\n    `ts` Timestamp_ms TIME INDEX,\n    PRIMARY KEY (`workload`, `worker`, `namespace`, `pool`, `node`, `uuid`))\n    ENGINE=mito WITH( ttl='30d', merge_mode = 'last_non_null')",

		"CREATE TABLE IF NOT EXISTS tf_gpu_usage (\n    `node` String NULL INVERTED INDEX,\n    `pool` String NULL INVERTED INDEX,\n    `uuid` String NULL INVERTED INDEX,\n    `compute_percentage` Double NULL,\n    `memory_percentage` Double NULL,\n    `memory_bytes` BigInt UNSIGNED NULL,\n    `compute_tflops` Double NULL,\n    `rx` Double NULL,\n    `tx` Double NULL,\n    `temperature` Double NULL,\n    `graphics_clock_mhz` Double NULL,\n    `sm_clock_mhz` Double NULL,\n    `memory_clock_mhz` Double NULL,\n    `video_clock_mhz` Double NULL,\n    `power_usage` Double NULL,\n    `nvlink_rx` Double NULL,\n    `nvlink_tx` Double NULL,\n    `ts` Timestamp_ms TIME INDEX,\n    PRIMARY KEY (`node`, `pool`, `uuid`))\n    ENGINE=mito WITH( ttl='30d', merge_mode = 'last_non_null')",
	}},

	// No statements are registered for this version yet.
	{"1.1", []string{}},
}

When upgrading the database, run the ALTER SQL statements in order for every version not lower than the current version, up to the version being upgraded to.

View Source
// TensorFusionSystemMetricsMap is a package-level map of string keys to
// *TensorFusionSystemMetrics, created empty at package initialization.
// NOTE(review): the key is presumably the pool name — confirm against callers.
var TensorFusionSystemMetricsMap = make(map[string]*TensorFusionSystemMetrics)

Functions

func GetInitTableSQL added in v1.42.0

func GetInitTableSQL(model schema.Tabler, ttl string) string

func InitPoolMetricsWhenNotExists added in v1.42.0

func InitPoolMetricsWhenNotExists(poolObj *tfv1.GPUPool)

func RemoveNodeMetrics added in v1.33.4

func RemoveNodeMetrics(nodeName string)

func RemovePoolMetrics added in v1.42.0

func RemovePoolMetrics(poolName string)

func RemoveWorkerMetrics added in v1.33.4

func RemoveWorkerMetrics(workerName string, deletionTime time.Time)

func SetAutoscalingMetrics added in v1.34.0

func SetAutoscalingMetrics(poolName string, isScaleUp bool)

TODO: record these metrics once the autoscaling feature has been added.

func SetNodeMetrics added in v1.33.4

func SetNodeMetrics(node *tfv1.GPUNode, poolObj *tfv1.GPUPool, gpuModels []string)

func SetPoolMetrics added in v1.42.0

func SetPoolMetrics(poolObj *tfv1.GPUPool)

func SetSchedulerMetrics added in v1.34.0

func SetSchedulerMetrics(poolName string, isSuccess bool)

func SetWorkerMetricsByWorkload added in v1.33.4

func SetWorkerMetricsByWorkload(pod *corev1.Pod)

Types

type ActiveNodeAndWorker added in v1.34.0

type ActiveNodeAndWorker struct {
	// contains filtered or unexported fields
}

type Encoder added in v1.37.0

// Encoder is the interface for writing metric points with tags, fields and a
// timestamp, and for retrieving the encoded bytes.
// NOTE(review): the intended call order (StartLine, AddTag/AddField, EndLine,
// then Bytes/Err) is inferred from the method names — confirm against the
// implementations.
type Encoder interface {
	// StartLine begins a new point for the given measurement.
	StartLine(measurement string)
	// AddTag adds a string tag to the point being built.
	AddTag(key, value string)
	// AddField adds a field of any type to the point being built.
	AddField(key string, value any)
	// EndLine finishes the point being built with the given timestamp.
	EndLine(timestamp time.Time)
	// Bytes returns the encoded output.
	Bytes() []byte
	// Err returns an error value; NOTE(review): presumably the first encoding
	// error encountered — confirm.
	Err() error
}

func NewEncoder added in v1.37.0

func NewEncoder(encoderType string) Encoder

type EncoderType added in v1.37.0

// EncoderType identifies an encoder kind as a uint8 enum; its values are the
// EncoderType* constants.
type EncoderType uint8

EncoderType represents the encoder type as an enum for better performance

// EncoderType values, assigned sequentially from 0 by iota.
const (
	// EncoderTypeInflux is 0.
	EncoderTypeInflux EncoderType = iota
	// EncoderTypeJson is 1.
	EncoderTypeJson
	// EncoderTypeOTel is 2.
	EncoderTypeOTel
)

type GreptimeDBConnection added in v1.34.0

// GreptimeDBConnection holds the connection parameters accepted by
// TimeSeriesDB.Setup. All fields, including Port, are plain strings.
type GreptimeDBConnection struct {
	Host     string
	Port     string
	User     string
	Password string
	Database string
}

type HypervisorGPUUsageMetrics added in v1.34.0

// HypervisorGPUUsageMetrics describes the usage of one GPU, identified by its
// node, pool and uuid columns. The gorm column names match the columns of the
// tf_gpu_usage table created in TFVersionMigrationMap.
// NOTE(review): units of the numeric fields are not stated in this file —
// confirm with the producer of these metrics.
type HypervisorGPUUsageMetrics struct {
	// Identifying columns, each declared with an INVERTED index.
	NodeName string `json:"nodeName" gorm:"column:node;index:,class:INVERTED"`
	PoolName string `json:"poolName" gorm:"column:pool;index:,class:INVERTED"`
	UUID     string `json:"uuid" gorm:"column:uuid;index:,class:INVERTED"`

	// Utilization percentages, stored as compute_percentage / memory_percentage.
	ComputePercent float64 `json:"computePercent" gorm:"column:compute_percentage"`
	VRAMPercent    float64 `json:"vramPercent" gorm:"column:memory_percentage"`

	// Absolute usage, stored as memory_bytes / compute_tflops.
	VRAMBytes     uint64  `json:"vramBytes" gorm:"column:memory_bytes"`
	ComputeTflops float64 `json:"computeTflops" gorm:"column:compute_tflops"`

	// PCIe traffic, stored in the rx / tx columns.
	// NOTE(review): presumably in KB, per the field names — confirm.
	PcieRxKB    float64 `json:"pcieRx" gorm:"column:rx"`
	PcieTxKB    float64 `json:"pcieTx" gorm:"column:tx"`
	Temperature float64 `json:"temperature" gorm:"column:temperature"`

	// Clock, power and NVLink readings.
	GraphicsClockMHz float64 `json:"graphicsClockMHz" gorm:"column:graphics_clock_mhz"`
	SMClockMHz       float64 `json:"smClockMHz" gorm:"column:sm_clock_mhz"`
	MemoryClockMHz   float64 `json:"memoryClockMHz" gorm:"column:memory_clock_mhz"`
	VideoClockMHz    float64 `json:"videoClockMHz" gorm:"column:video_clock_mhz"`
	PowerUsage       float64 `json:"powerUsage" gorm:"column:power_usage"`
	NvlinkRx         float64 `json:"nvlinkRx" gorm:"column:nvlink_rx"`
	NvlinkTx         float64 `json:"nvlinkTx" gorm:"column:nvlink_tx"`

	// Timestamp maps to the ts column, declared with a TIME index.
	Timestamp time.Time `json:"ts" gorm:"column:ts;index:,class:TIME"`
}

func (HypervisorGPUUsageMetrics) TableName added in v1.34.0

func (nu HypervisorGPUUsageMetrics) TableName() string

type HypervisorWorkerUsageMetrics added in v1.34.0

// HypervisorWorkerUsageMetrics describes the GPU usage of one worker. The gorm
// column names match the columns of the tf_worker_usage table created in
// TFVersionMigrationMap.
type HypervisorWorkerUsageMetrics struct {
	// Identifying columns; worker uses a SKIPPING index, the others INVERTED.
	WorkloadName string `json:"workloadName" gorm:"column:workload;index:,class:INVERTED"`
	WorkerName   string `json:"workerName" gorm:"column:worker;index:,class:SKIPPING"`
	Namespace    string `json:"namespace" gorm:"column:namespace;index:,class:INVERTED"`
	PoolName     string `json:"poolName" gorm:"column:pool;index:,class:INVERTED"`
	NodeName     string `json:"nodeName" gorm:"column:node;index:,class:INVERTED"`
	UUID         string `json:"uuid" gorm:"column:uuid;index:,class:INVERTED"`

	// Compute and VRAM usage, as absolute values and percentages.
	ComputeTflops  float64 `json:"computeTflops" gorm:"column:compute_tflops"`
	ComputePercent float64 `json:"computePercent" gorm:"column:compute_percentage"`
	VRAMBytes      uint64  `json:"vramBytes" gorm:"column:memory_bytes"`
	VRAMPercent    float64 `json:"vramPercent" gorm:"column:memory_percentage"`

	// Event counters, stored in the *_cnt columns.
	ComputeThrottledCount int64 `json:"computeThrottledCount" gorm:"column:compute_throttled_cnt"`
	VRAMFreezedCount      int64 `json:"vramFreezedCount" gorm:"column:vram_freezed_cnt"`
	VRAMResumedCount      int64 `json:"vramResumedCount" gorm:"column:vram_resumed_cnt"`

	// Timestamp maps to the ts column, declared with a TIME index.
	Timestamp time.Time `json:"ts" gorm:"column:ts;index:,class:TIME"`
}

func (HypervisorWorkerUsageMetrics) TableName added in v1.34.0

func (wu HypervisorWorkerUsageMetrics) TableName() string

type MetricsRecorder added in v1.33.4

// MetricsRecorder holds the metrics output path and the pricing maps used for
// raw billing.
// NOTE(review): how each map is consumed is not visible in this listing — see
// RecordMetrics and UpdateMetricsRecorder.
type MetricsRecorder struct {
	// MetricsOutputPath is the output location for recorded metrics.
	// NOTE(review): presumably a file path — confirm.
	MetricsOutputPath string

	// Raw billing result for node and workers
	HourlyUnitPriceMap map[string]float64

	// Pool level eviction protection price ratio map, key is pool name
	// (the ratio itself is kept as a string).
	PoolEvictionProtectionPriceRatioMap map[string]string

	// Worker level unit price map, key is pool name, second level key is QoS level
	WorkerUnitPriceMap map[string]map[string]RawBillingPricing
}

func (*MetricsRecorder) RecordMetrics added in v1.33.4

func (mr *MetricsRecorder) RecordMetrics(writer io.Writer)

func (*MetricsRecorder) Start added in v1.33.4

func (mr *MetricsRecorder) Start()

Start starts the metrics recorder. Only the leader container fills the metrics map, so followers have no metric points and metrics are therefore printed by only one controller instance. Known issue: the one-minute interval can cause some metrics to be missed or billing to be inaccurate.

func (*MetricsRecorder) UpdateMetricsRecorder added in v1.44.0

func (r *MetricsRecorder) UpdateMetricsRecorder(pool *tfv1.GPUPool, specChanged bool)

Update metrics recorder's raw billing map

type MultiProtocolEncoder added in v1.37.0

type MultiProtocolEncoder struct {
	// contains filtered or unexported fields
}

func (*MultiProtocolEncoder) AddField added in v1.37.0

func (m *MultiProtocolEncoder) AddField(key string, value any)

func (*MultiProtocolEncoder) AddTag added in v1.37.0

func (m *MultiProtocolEncoder) AddTag(key, value string)

func (*MultiProtocolEncoder) Bytes added in v1.37.0

func (m *MultiProtocolEncoder) Bytes() []byte

func (*MultiProtocolEncoder) EndLine added in v1.37.0

func (m *MultiProtocolEncoder) EndLine(timestamp time.Time)

func (*MultiProtocolEncoder) Err added in v1.37.0

func (m *MultiProtocolEncoder) Err() error

func (*MultiProtocolEncoder) StartLine added in v1.37.0

func (m *MultiProtocolEncoder) StartLine(measurement string)

type NodeResourceMetrics added in v1.34.0

// NodeResourceMetrics describes the allocated and limited GPU resources of
// one node. Most gorm column names match the tf_node_metrics table created in
// TFVersionMigrationMap.
// NOTE(review): the limited_* columns below are not part of that CREATE TABLE
// statement — confirm how they reach the database.
type NodeResourceMetrics struct {
	// Identifying columns, each declared with an INVERTED index.
	NodeName string `json:"nodeName" gorm:"column:node;index:,class:INVERTED"`
	PoolName string `json:"poolName" gorm:"column:pool;index:,class:INVERTED"`
	Phase    string `json:"phase" gorm:"column:phase;index:,class:INVERTED"`

	// Allocated compute and VRAM, as absolute values and percentages.
	AllocatedTflops        float64 `json:"allocatedTflops" gorm:"column:allocated_tflops"`
	AllocatedTflopsPercent float64 `json:"allocatedTflopsPercent" gorm:"column:allocated_tflops_percent"`
	AllocatedVramBytes     float64 `json:"allocatedVramBytes" gorm:"column:allocated_vram_bytes"`
	AllocatedVramPercent   float64 `json:"allocatedVramPercent" gorm:"column:allocated_vram_percent"`

	// Allocation percentages relative to the virtual capacity.
	AllocatedTflopsPercentToVirtualCap float64 `json:"allocatedTflopsPercentToVirtualCap" gorm:"column:allocated_tflops_percent_virtual"`
	AllocatedVramPercentToVirtualCap   float64 `json:"allocatedVramPercentToVirtualCap" gorm:"column:allocated_vram_percent_virtual"`

	// Limit-based counterparts of the allocation fields.
	LimitedTFlops                    float64 `json:"limitedTFlops" gorm:"column:limited_tflops"`
	LimitedVramBytes                 float64 `json:"limitedVramBytes" gorm:"column:limited_vram_bytes"`
	LimitedTFlopsPercentToVirtualCap float64 `json:"limitedTFlopsPercentToVirtualCap" gorm:"column:limited_tflops_percent_virtual"`
	LimitedVramPercentToVirtualCap   float64 `json:"limitedVramPercentToVirtualCap" gorm:"column:limited_vram_percent_virtual"`

	RawCost  float64 `json:"rawCost" gorm:"column:raw_cost"`
	GPUCount int     `json:"gpuCount" gorm:"column:gpu_count"`

	// LastRecordTime maps to the ts column, declared with a TIME index.
	LastRecordTime time.Time `json:"lastRecordTime" gorm:"column:ts;index:,class:TIME"`
	// contains filtered or unexported fields
}

func (*NodeResourceMetrics) SetGPUModelAndCount added in v1.34.0

func (nm *NodeResourceMetrics) SetGPUModelAndCount(gpuModels []string)

func (NodeResourceMetrics) TableName added in v1.34.0

func (nm NodeResourceMetrics) TableName() string

type PoolResourceMetrics added in v1.42.0

// PoolResourceMetrics describes the allocated and limited GPU resources of
// one pool.
// NOTE(review): TFVersionMigrationMap contains no CREATE TABLE statement with
// this set of columns — confirm which table TableName refers to.
type PoolResourceMetrics struct {
	PoolName string `json:"poolName" gorm:"column:pool;index:,class:INVERTED"`

	Phase string `json:"phase" gorm:"column:phase;index:,class:INVERTED"`

	// Allocated compute and VRAM: absolute values, percentages, and
	// percentages relative to the virtual capacity.
	AllocatedTflops                    float64 `json:"allocatedTflops" gorm:"column:allocated_tflops"`
	AllocatedTflopsPercent             float64 `json:"allocatedTflopsPercent" gorm:"column:allocated_tflops_percent"`
	AllocatedTflopsPercentToVirtualCap float64 `json:"allocatedTflopsPercentToVirtualCap" gorm:"column:allocated_tflops_percent_virtual"`
	AllocatedVramBytes                 float64 `json:"allocatedVramBytes" gorm:"column:allocated_vram_bytes"`
	AllocatedVramPercent               float64 `json:"allocatedVramPercent" gorm:"column:allocated_vram_percent"`
	AllocatedVramPercentToVirtualCap   float64 `json:"allocatedVramPercentToVirtualCap" gorm:"column:allocated_vram_percent_virtual"`

	// Limit-based fields. The JSON keys and column names omit the "Assigned"
	// prefix of the Go field names and equal those of NodeResourceMetrics.
	AssignedLimitedTFlops                    float64 `json:"limitedTFlops" gorm:"column:limited_tflops"`
	AssignedLimitedVramBytes                 float64 `json:"limitedVramBytes" gorm:"column:limited_vram_bytes"`
	AssignedLimitedTFlopsPercentToVirtualCap float64 `json:"limitedTFlopsPercentToVirtualCap" gorm:"column:limited_tflops_percent_virtual"`
	AssignedLimitedVramPercentToVirtualCap   float64 `json:"limitedVramPercentToVirtualCap" gorm:"column:limited_vram_percent_virtual"`

	GPUCount int `json:"gpuCount" gorm:"column:gpu_count"`

	// LastRecordTime maps to the ts column, declared with a TIME index.
	LastRecordTime time.Time `json:"lastRecordTime" gorm:"column:ts;index:,class:TIME"`
}

func (PoolResourceMetrics) TableName added in v1.42.0

func (pm PoolResourceMetrics) TableName() string

type RawBillingPricing added in v1.33.4

// RawBillingPricing holds per-second unit prices for compute (TFLOPS) and
// VRAM; it is the value type of MetricsRecorder.WorkerUnitPriceMap.
// NOTE(review): currency and the VRAM unit are not stated in this file —
// confirm.
type RawBillingPricing struct {
	// Base per-second prices.
	TflopsPerSecond float64
	VramPerSecond   float64

	// Per-second prices for usage over the request.
	// NOTE(review): presumably applied to the portion above the requested
	// amount — confirm.
	TflopsOverRequestPerSecond float64
	VramOverRequestPerSecond   float64
}

type TFSystemLog added in v1.34.0

// TFSystemLog is one log entry. Its gorm column names match the columns of
// the tf_system_log table created in TFVersionMigrationMap.
type TFSystemLog struct {
	Component string `json:"component" gorm:"column:component;index:,class:INVERTED"`
	Container string `json:"container" gorm:"column:container;index:,class:INVERTED"`
	// Message's struct tag is elided in this listing; the tf_system_log table
	// declares the message column with a FULLTEXT index.
	Message   string `` /* 126-byte string literal not displayed */
	Namespace string `json:"namespace" gorm:"column:namespace;index:,class:INVERTED"`
	Pod       string `json:"pod" gorm:"column:pod;index:,class:SKIPPING"`
	Stream    string `json:"stream" gorm:"column:stream"`
	// message written timestamp
	Timestamp string `json:"timestamp" gorm:"column:timestamp"`

	// GreptimeTimestamp maps to the greptime_timestamp column, declared with a
	// TIME index and millisecond precision.
	GreptimeTimestamp time.Time `json:"greptime_timestamp" gorm:"column:greptime_timestamp;index:,class:TIME;precision:ms"`
}

func (TFSystemLog) TableName added in v1.34.0

func (sl TFSystemLog) TableName() string

type TensorFusionSystemMetrics added in v1.34.0

// TensorFusionSystemMetrics holds the counters kept for one pool. Its gorm
// column names match the columns of the tf_system_metrics table created in
// TFVersionMigrationMap.
type TensorFusionSystemMetrics struct {
	PoolName string `json:"poolName" gorm:"column:pool;index:,class:INVERTED"`

	// Counters, stored in the total_*_cnt columns.
	TotalWorkerCount            int64 `json:"totalWorkerCount" gorm:"column:total_workers_cnt"`
	TotalNodeCount              int64 `json:"totalNodeCount" gorm:"column:total_nodes_cnt"`
	TotalAllocationFailCount    int64 `json:"totalAllocationFailCount" gorm:"column:total_allocation_fail_cnt"`
	TotalAllocationSuccessCount int64 `json:"totalAllocationSuccessCount" gorm:"column:total_allocation_success_cnt"`
	TotalScaleUpCount           int64 `json:"totalScaleUpCount" gorm:"column:total_scale_up_cnt"`
	TotalScaleDownCount         int64 `json:"totalScaleDownCount" gorm:"column:total_scale_down_cnt"`

	// Timestamp maps to the ts column, declared with a TIME index.
	Timestamp time.Time `json:"ts" gorm:"column:ts;index:,class:TIME"`
}

func (TensorFusionSystemMetrics) TableName added in v1.34.0

func (wm TensorFusionSystemMetrics) TableName() string

type TimeSeriesDB added in v1.34.0

// TimeSeriesDB wraps a *gorm.DB by embedding it, so all gorm.DB methods are
// available on a TimeSeriesDB value alongside its own methods.
type TimeSeriesDB struct {
	*gorm.DB
}

func (*TimeSeriesDB) FindRecentNodeMetrics added in v1.34.0

func (t *TimeSeriesDB) FindRecentNodeMetrics() ([]NodeResourceMetrics, error)

func (*TimeSeriesDB) SetTableTTL added in v1.34.0

func (t *TimeSeriesDB) SetTableTTL(ttl string) error

func (*TimeSeriesDB) Setup added in v1.34.0

func (m *TimeSeriesDB) Setup(connection GreptimeDBConnection) error

func (*TimeSeriesDB) SetupTables added in v1.34.0

func (t *TimeSeriesDB) SetupTables(client client.Client) error

Deprecated: no migration is needed for new tables; they are created dynamically during ingestion and indexed dynamically in the Greptime Cloud/Enterprise edition.

type WorkerResourceMetrics added in v1.34.0

// WorkerResourceMetrics describes the requested and limited GPU resources of
// one worker. Its gorm column names match the columns of the
// tf_worker_resources table created in TFVersionMigrationMap.
type WorkerResourceMetrics struct {
	// Identifying columns; worker uses a SKIPPING index, the others INVERTED.
	WorkerName   string `json:"workerName" gorm:"column:worker;index:,class:SKIPPING"`
	WorkloadName string `json:"workloadName" gorm:"column:workload;index:,class:INVERTED"`
	PoolName     string `json:"poolName" gorm:"column:pool;index:,class:INVERTED"`
	Namespace    string `json:"namespace" gorm:"column:namespace;index:,class:INVERTED"`
	QoS          string `json:"qos" gorm:"column:qos"`

	// Resource requests and limits, GPU count and cost.
	// NOTE(review): Ready is a bool here, but the ready column is created as
	// String in tf_worker_resources — confirm the mapping.
	TflopsRequest    float64 `json:"tflopsRequest" gorm:"column:tflops_request"`
	TflopsLimit      float64 `json:"tflopsLimit" gorm:"column:tflops_limit"`
	VramBytesRequest float64 `json:"vramBytesRequest" gorm:"column:vram_bytes_request"`
	VramBytesLimit   float64 `json:"vramBytesLimit" gorm:"column:vram_bytes_limit"`
	GPUCount         int     `json:"gpuCount" gorm:"column:gpu_count"`
	RawCost          float64 `json:"rawCost" gorm:"column:raw_cost"`
	Ready            bool    `json:"ready" gorm:"column:ready"`

	// LastRecordTime maps to the ts column, declared with a TIME index.
	LastRecordTime time.Time `json:"lastRecordTime" gorm:"column:ts;index:,class:TIME"`
	// contains filtered or unexported fields
}

Metrics are stored in a map whose key is the worker name and whose value is the metrics. By default, metrics are updated every minute.

func (WorkerResourceMetrics) TableName added in v1.34.0

func (wm WorkerResourceMetrics) TableName() string

Directories

Path Synopsis

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL