Documentation
¶
Overview ¶
NOTE: Any new field/tag added to an existing metric, and any new metric, must also be added to the SetupTables function for manual DB migration.
Index ¶
- Constants
- Variables
- func GetInitTableSQL(model schema.Tabler, ttl string) string
- func InitPoolMetricsWhenNotExists(poolObj *tfv1.GPUPool)
- func RemoveNodeMetrics(nodeName string)
- func RemovePoolMetrics(poolName string)
- func RemoveWorkerMetrics(workerName string, deletionTime time.Time)
- func SetAutoscalingMetrics(poolName string, isScaleUp bool)
- func SetNodeMetrics(node *tfv1.GPUNode, poolObj *tfv1.GPUPool, gpuModels []string)
- func SetPoolMetrics(poolObj *tfv1.GPUPool)
- func SetSchedulerMetrics(poolName string, isSuccess bool)
- func SetWorkerMetricsByWorkload(pod *corev1.Pod)
- type ActiveNodeAndWorker
- type Encoder
- type EncoderType
- type GreptimeDBConnection
- type HypervisorGPUUsageMetrics
- type HypervisorWorkerUsageMetrics
- type MetricsRecorder
- type MultiProtocolEncoder
- func (m *MultiProtocolEncoder) AddField(key string, value any)
- func (m *MultiProtocolEncoder) AddTag(key, value string)
- func (m *MultiProtocolEncoder) Bytes() []byte
- func (m *MultiProtocolEncoder) EndLine(timestamp time.Time)
- func (m *MultiProtocolEncoder) Err() error
- func (m *MultiProtocolEncoder) StartLine(measurement string)
- type NodeResourceMetrics
- type PoolResourceMetrics
- type RawBillingPricing
- type TFSystemLog
- type TensorFusionSystemMetrics
- type TimeSeriesDB
- type WorkerResourceMetrics
Constants ¶
const (
CREATE_TABLE_OPTION_TPL = "ENGINE=mito WITH( ttl='%s', merge_mode = 'last_non_null')"
)
const CurrentAppSQLVersion = "1.0"
Variables ¶
var TFVersionMigrationMap = []struct { Version string AlterSQL []string }{ {"1.0", []string{ "CREATE TABLE IF NOT EXISTS tf_worker_resources (\n `worker` String NULL SKIPPING INDEX,\n `workload` String NULL INVERTED INDEX,\n `pool` String NULL INVERTED INDEX,\n `namespace` String NULL INVERTED INDEX,\n `qos` String NULL,\n `tflops_request` Double NULL,\n `tflops_limit` Double NULL,\n `vram_bytes_request` Double NULL,\n `vram_bytes_limit` Double NULL,\n `gpu_count` BigInt NULL,\n `raw_cost` Double NULL,\n `ready` String NULL,\n `ts` Timestamp_ms TIME INDEX,\n PRIMARY KEY (`worker`, `workload`, `pool`, `namespace`))\n ENGINE=mito WITH( ttl='30d', merge_mode = 'last_non_null')", "CREATE TABLE IF NOT EXISTS tf_node_metrics (\n `node` String NULL INVERTED INDEX,\n `pool` String NULL INVERTED INDEX,\n `phase` String NULL INVERTED INDEX,\n `allocated_tflops` Double NULL,\n `allocated_tflops_percent` Double NULL,\n `allocated_vram_bytes` Double NULL,\n `allocated_vram_percent` Double NULL,\n `allocated_tflops_percent_virtual` Double NULL,\n `allocated_vram_percent_virtual` Double NULL,\n `raw_cost` Double NULL,\n `gpu_count` BigInt NULL,\n `ts` Timestamp_ms TIME INDEX,\n PRIMARY KEY (`node`, `pool`, `phase`))\n ENGINE=mito WITH( ttl='30d', merge_mode = 'last_non_null')", "CREATE TABLE IF NOT EXISTS tf_system_metrics (\n `pool` String NULL INVERTED INDEX,\n `total_workers_cnt` BigInt NULL,\n `total_nodes_cnt` BigInt NULL,\n `total_allocation_fail_cnt` BigInt NULL,\n `total_allocation_success_cnt` BigInt NULL,\n `total_scale_up_cnt` BigInt NULL,\n `total_scale_down_cnt` BigInt NULL,\n `ts` Timestamp_ms TIME INDEX,\n PRIMARY KEY (`pool`))\n ENGINE=mito WITH( ttl='30d', merge_mode = 'last_non_null')", "CREATE TABLE IF NOT EXISTS tf_system_log (\n `component` String NULL INVERTED INDEX,\n `container` String NULL INVERTED INDEX,\n `message` String NULL FULLTEXT INDEX WITH (analyzer = 'English' , case_sensitive = 'false'),\n `namespace` String NULL INVERTED INDEX,\n `pod` String 
NULL SKIPPING INDEX,\n `stream` String NULL,\n `timestamp` String NULL,\n `greptime_timestamp` Timestamp_ms TIME INDEX,\n PRIMARY KEY (`component`, `container`, `namespace`, `pod`))\n ENGINE=mito WITH( ttl='30d', merge_mode = 'last_non_null')", "CREATE TABLE IF NOT EXISTS tf_worker_usage (\n `workload` String NULL INVERTED INDEX,\n `worker` String NULL SKIPPING INDEX,\n `namespace` String NULL INVERTED INDEX,\n `pool` String NULL INVERTED INDEX,\n `node` String NULL INVERTED INDEX,\n `uuid` String NULL INVERTED INDEX,\n `compute_tflops` Double NULL,\n `compute_percentage` Double NULL,\n `memory_bytes` BigInt UNSIGNED NULL,\n `memory_percentage` Double NULL,\n `compute_throttled_cnt` BigInt NULL,\n `vram_freezed_cnt` BigInt NULL,\n `vram_resumed_cnt` BigInt NULL,\n `ts` Timestamp_ms TIME INDEX,\n PRIMARY KEY (`workload`, `worker`, `namespace`, `pool`, `node`, `uuid`))\n ENGINE=mito WITH( ttl='30d', merge_mode = 'last_non_null')", "CREATE TABLE IF NOT EXISTS tf_gpu_usage (\n `node` String NULL INVERTED INDEX,\n `pool` String NULL INVERTED INDEX,\n `uuid` String NULL INVERTED INDEX,\n `compute_percentage` Double NULL,\n `memory_percentage` Double NULL,\n `memory_bytes` BigInt UNSIGNED NULL,\n `compute_tflops` Double NULL,\n `rx` Double NULL,\n `tx` Double NULL,\n `temperature` Double NULL,\n `graphics_clock_mhz` Double NULL,\n `sm_clock_mhz` Double NULL,\n `memory_clock_mhz` Double NULL,\n `video_clock_mhz` Double NULL,\n `power_usage` Double NULL,\n `nvlink_rx` Double NULL,\n `nvlink_tx` Double NULL,\n `ts` Timestamp_ms TIME INDEX,\n PRIMARY KEY (`node`, `pool`, `uuid`))\n ENGINE=mito WITH( ttl='30d', merge_mode = 'last_non_null')", }}, {"1.1", []string{}}, }
When upgrading the database, run the ALTER SQL statements in order for every version not lower than the current version, up to the version being upgraded to.
var TensorFusionSystemMetricsMap = make(map[string]*TensorFusionSystemMetrics)
Functions ¶
func GetInitTableSQL ¶ added in v1.42.0
func InitPoolMetricsWhenNotExists ¶ added in v1.42.0
func RemoveNodeMetrics ¶ added in v1.33.4
func RemoveNodeMetrics(nodeName string)
func RemovePoolMetrics ¶ added in v1.42.0
func RemovePoolMetrics(poolName string)
func RemoveWorkerMetrics ¶ added in v1.33.4
func SetAutoscalingMetrics ¶ added in v1.34.0
TODO: record these metrics once the autoscaling feature is added.
func SetNodeMetrics ¶ added in v1.33.4
func SetPoolMetrics ¶ added in v1.42.0
func SetSchedulerMetrics ¶ added in v1.34.0
func SetWorkerMetricsByWorkload ¶ added in v1.33.4
Types ¶
type ActiveNodeAndWorker ¶ added in v1.34.0
type ActiveNodeAndWorker struct {
// contains filtered or unexported fields
}
type Encoder ¶ added in v1.37.0
type Encoder interface { StartLine(measurement string) AddTag(key, value string) AddField(key string, value any) EndLine(timestamp time.Time) Bytes() []byte Err() error }
func NewEncoder ¶ added in v1.37.0
type EncoderType ¶ added in v1.37.0
type EncoderType uint8
EncoderType represents the encoder type as an enum for better performance
const ( EncoderTypeInflux EncoderType = iota EncoderTypeJson EncoderTypeOTel )
type GreptimeDBConnection ¶ added in v1.34.0
type HypervisorGPUUsageMetrics ¶ added in v1.34.0
type HypervisorGPUUsageMetrics struct { NodeName string `json:"nodeName" gorm:"column:node;index:,class:INVERTED"` PoolName string `json:"poolName" gorm:"column:pool;index:,class:INVERTED"` UUID string `json:"uuid" gorm:"column:uuid;index:,class:INVERTED"` ComputePercent float64 `json:"computePercent" gorm:"column:compute_percentage"` VRAMPercent float64 `json:"vramPercent" gorm:"column:memory_percentage"` VRAMBytes uint64 `json:"vramBytes" gorm:"column:memory_bytes"` ComputeTflops float64 `json:"computeTflops" gorm:"column:compute_tflops"` PcieRxKB float64 `json:"pcieRx" gorm:"column:rx"` PcieTxKB float64 `json:"pcieTx" gorm:"column:tx"` Temperature float64 `json:"temperature" gorm:"column:temperature"` GraphicsClockMHz float64 `json:"graphicsClockMHz" gorm:"column:graphics_clock_mhz"` SMClockMHz float64 `json:"smClockMHz" gorm:"column:sm_clock_mhz"` MemoryClockMHz float64 `json:"memoryClockMHz" gorm:"column:memory_clock_mhz"` VideoClockMHz float64 `json:"videoClockMHz" gorm:"column:video_clock_mhz"` PowerUsage float64 `json:"powerUsage" gorm:"column:power_usage"` NvlinkRx float64 `json:"nvlinkRx" gorm:"column:nvlink_rx"` NvlinkTx float64 `json:"nvlinkTx" gorm:"column:nvlink_tx"` Timestamp time.Time `json:"ts" gorm:"column:ts;index:,class:TIME"` }
func (HypervisorGPUUsageMetrics) TableName ¶ added in v1.34.0
func (nu HypervisorGPUUsageMetrics) TableName() string
type HypervisorWorkerUsageMetrics ¶ added in v1.34.0
type HypervisorWorkerUsageMetrics struct { WorkloadName string `json:"workloadName" gorm:"column:workload;index:,class:INVERTED"` WorkerName string `json:"workerName" gorm:"column:worker;index:,class:SKIPPING"` Namespace string `json:"namespace" gorm:"column:namespace;index:,class:INVERTED"` PoolName string `json:"poolName" gorm:"column:pool;index:,class:INVERTED"` NodeName string `json:"nodeName" gorm:"column:node;index:,class:INVERTED"` UUID string `json:"uuid" gorm:"column:uuid;index:,class:INVERTED"` ComputeTflops float64 `json:"computeTflops" gorm:"column:compute_tflops"` ComputePercent float64 `json:"computePercent" gorm:"column:compute_percentage"` VRAMBytes uint64 `json:"vramBytes" gorm:"column:memory_bytes"` VRAMPercent float64 `json:"vramPercent" gorm:"column:memory_percentage"` ComputeThrottledCount int64 `json:"computeThrottledCount" gorm:"column:compute_throttled_cnt"` VRAMFreezedCount int64 `json:"vramFreezedCount" gorm:"column:vram_freezed_cnt"` VRAMResumedCount int64 `json:"vramResumedCount" gorm:"column:vram_resumed_cnt"` Timestamp time.Time `json:"ts" gorm:"column:ts;index:,class:TIME"` }
func (HypervisorWorkerUsageMetrics) TableName ¶ added in v1.34.0
func (wu HypervisorWorkerUsageMetrics) TableName() string
type MetricsRecorder ¶ added in v1.33.4
type MetricsRecorder struct { MetricsOutputPath string // Raw billing result for node and workers HourlyUnitPriceMap map[string]float64 // Pool level eviction protection price ratio map, key is pool name PoolEvictionProtectionPriceRatioMap map[string]string // Worker level unit price map, key is pool name, second level key is QoS level WorkerUnitPriceMap map[string]map[string]RawBillingPricing }
func (*MetricsRecorder) RecordMetrics ¶ added in v1.33.4
func (mr *MetricsRecorder) RecordMetrics(writer io.Writer)
func (*MetricsRecorder) Start ¶ added in v1.33.4
func (mr *MetricsRecorder) Start()
Start the metrics recorder. The leader container fills the metrics map, so followers have no metrics points; as a result, metrics are printed by only one controller instance. Known issue: the one-minute interval could cause some metrics to be ignored or billing to be inaccurate.
func (*MetricsRecorder) UpdateMetricsRecorder ¶ added in v1.44.0
func (r *MetricsRecorder) UpdateMetricsRecorder(pool *tfv1.GPUPool, specChanged bool)
Update metrics recorder's raw billing map
type MultiProtocolEncoder ¶ added in v1.37.0
type MultiProtocolEncoder struct {
// contains filtered or unexported fields
}
func (*MultiProtocolEncoder) AddField ¶ added in v1.37.0
func (m *MultiProtocolEncoder) AddField(key string, value any)
func (*MultiProtocolEncoder) AddTag ¶ added in v1.37.0
func (m *MultiProtocolEncoder) AddTag(key, value string)
func (*MultiProtocolEncoder) Bytes ¶ added in v1.37.0
func (m *MultiProtocolEncoder) Bytes() []byte
func (*MultiProtocolEncoder) EndLine ¶ added in v1.37.0
func (m *MultiProtocolEncoder) EndLine(timestamp time.Time)
func (*MultiProtocolEncoder) Err ¶ added in v1.37.0
func (m *MultiProtocolEncoder) Err() error
func (*MultiProtocolEncoder) StartLine ¶ added in v1.37.0
func (m *MultiProtocolEncoder) StartLine(measurement string)
type NodeResourceMetrics ¶ added in v1.34.0
type NodeResourceMetrics struct { NodeName string `json:"nodeName" gorm:"column:node;index:,class:INVERTED"` PoolName string `json:"poolName" gorm:"column:pool;index:,class:INVERTED"` Phase string `json:"phase" gorm:"column:phase;index:,class:INVERTED"` AllocatedTflops float64 `json:"allocatedTflops" gorm:"column:allocated_tflops"` AllocatedTflopsPercent float64 `json:"allocatedTflopsPercent" gorm:"column:allocated_tflops_percent"` AllocatedVramBytes float64 `json:"allocatedVramBytes" gorm:"column:allocated_vram_bytes"` AllocatedVramPercent float64 `json:"allocatedVramPercent" gorm:"column:allocated_vram_percent"` AllocatedTflopsPercentToVirtualCap float64 `json:"allocatedTflopsPercentToVirtualCap" gorm:"column:allocated_tflops_percent_virtual"` AllocatedVramPercentToVirtualCap float64 `json:"allocatedVramPercentToVirtualCap" gorm:"column:allocated_vram_percent_virtual"` LimitedTFlops float64 `json:"limitedTFlops" gorm:"column:limited_tflops"` LimitedVramBytes float64 `json:"limitedVramBytes" gorm:"column:limited_vram_bytes"` LimitedTFlopsPercentToVirtualCap float64 `json:"limitedTFlopsPercentToVirtualCap" gorm:"column:limited_tflops_percent_virtual"` LimitedVramPercentToVirtualCap float64 `json:"limitedVramPercentToVirtualCap" gorm:"column:limited_vram_percent_virtual"` RawCost float64 `json:"rawCost" gorm:"column:raw_cost"` GPUCount int `json:"gpuCount" gorm:"column:gpu_count"` LastRecordTime time.Time `json:"lastRecordTime" gorm:"column:ts;index:,class:TIME"` // contains filtered or unexported fields }
func (*NodeResourceMetrics) SetGPUModelAndCount ¶ added in v1.34.0
func (nm *NodeResourceMetrics) SetGPUModelAndCount(gpuModels []string)
func (NodeResourceMetrics) TableName ¶ added in v1.34.0
func (nm NodeResourceMetrics) TableName() string
type PoolResourceMetrics ¶ added in v1.42.0
type PoolResourceMetrics struct { PoolName string `json:"poolName" gorm:"column:pool;index:,class:INVERTED"` Phase string `json:"phase" gorm:"column:phase;index:,class:INVERTED"` AllocatedTflops float64 `json:"allocatedTflops" gorm:"column:allocated_tflops"` AllocatedTflopsPercent float64 `json:"allocatedTflopsPercent" gorm:"column:allocated_tflops_percent"` AllocatedTflopsPercentToVirtualCap float64 `json:"allocatedTflopsPercentToVirtualCap" gorm:"column:allocated_tflops_percent_virtual"` AllocatedVramBytes float64 `json:"allocatedVramBytes" gorm:"column:allocated_vram_bytes"` AllocatedVramPercent float64 `json:"allocatedVramPercent" gorm:"column:allocated_vram_percent"` AllocatedVramPercentToVirtualCap float64 `json:"allocatedVramPercentToVirtualCap" gorm:"column:allocated_vram_percent_virtual"` AssignedLimitedTFlops float64 `json:"limitedTFlops" gorm:"column:limited_tflops"` AssignedLimitedVramBytes float64 `json:"limitedVramBytes" gorm:"column:limited_vram_bytes"` AssignedLimitedTFlopsPercentToVirtualCap float64 `json:"limitedTFlopsPercentToVirtualCap" gorm:"column:limited_tflops_percent_virtual"` AssignedLimitedVramPercentToVirtualCap float64 `json:"limitedVramPercentToVirtualCap" gorm:"column:limited_vram_percent_virtual"` GPUCount int `json:"gpuCount" gorm:"column:gpu_count"` LastRecordTime time.Time `json:"lastRecordTime" gorm:"column:ts;index:,class:TIME"` }
func (PoolResourceMetrics) TableName ¶ added in v1.42.0
func (pm PoolResourceMetrics) TableName() string
type RawBillingPricing ¶ added in v1.33.4
type TFSystemLog ¶ added in v1.34.0
type TFSystemLog struct { Component string `json:"component" gorm:"column:component;index:,class:INVERTED"` Container string `json:"container" gorm:"column:container;index:,class:INVERTED"` Message string `` /* 126-byte string literal not displayed */ Namespace string `json:"namespace" gorm:"column:namespace;index:,class:INVERTED"` Pod string `json:"pod" gorm:"column:pod;index:,class:SKIPPING"` Stream string `json:"stream" gorm:"column:stream"` // message written timestamp Timestamp string `json:"timestamp" gorm:"column:timestamp"` GreptimeTimestamp time.Time `json:"greptime_timestamp" gorm:"column:greptime_timestamp;index:,class:TIME;precision:ms"` }
func (TFSystemLog) TableName ¶ added in v1.34.0
func (sl TFSystemLog) TableName() string
type TensorFusionSystemMetrics ¶ added in v1.34.0
type TensorFusionSystemMetrics struct { PoolName string `json:"poolName" gorm:"column:pool;index:,class:INVERTED"` TotalWorkerCount int64 `json:"totalWorkerCount" gorm:"column:total_workers_cnt"` TotalNodeCount int64 `json:"totalNodeCount" gorm:"column:total_nodes_cnt"` TotalAllocationFailCount int64 `json:"totalAllocationFailCount" gorm:"column:total_allocation_fail_cnt"` TotalAllocationSuccessCount int64 `json:"totalAllocationSuccessCount" gorm:"column:total_allocation_success_cnt"` TotalScaleUpCount int64 `json:"totalScaleUpCount" gorm:"column:total_scale_up_cnt"` TotalScaleDownCount int64 `json:"totalScaleDownCount" gorm:"column:total_scale_down_cnt"` Timestamp time.Time `json:"ts" gorm:"column:ts;index:,class:TIME"` }
func (TensorFusionSystemMetrics) TableName ¶ added in v1.34.0
func (wm TensorFusionSystemMetrics) TableName() string
type TimeSeriesDB ¶ added in v1.34.0
func (*TimeSeriesDB) FindRecentNodeMetrics ¶ added in v1.34.0
func (t *TimeSeriesDB) FindRecentNodeMetrics() ([]NodeResourceMetrics, error)
func (*TimeSeriesDB) SetTableTTL ¶ added in v1.34.0
func (t *TimeSeriesDB) SetTableTTL(ttl string) error
func (*TimeSeriesDB) Setup ¶ added in v1.34.0
func (m *TimeSeriesDB) Setup(connection GreptimeDBConnection) error
func (*TimeSeriesDB) SetupTables ¶ added in v1.34.0
func (t *TimeSeriesDB) SetupTables(client client.Client) error
Deprecated code. No migration is needed for new tables: they are dynamically created during ingestion and dynamically indexed in Greptime Cloud/Enterprise edition.
type WorkerResourceMetrics ¶ added in v1.34.0
type WorkerResourceMetrics struct { WorkerName string `json:"workerName" gorm:"column:worker;index:,class:SKIPPING"` WorkloadName string `json:"workloadName" gorm:"column:workload;index:,class:INVERTED"` PoolName string `json:"poolName" gorm:"column:pool;index:,class:INVERTED"` Namespace string `json:"namespace" gorm:"column:namespace;index:,class:INVERTED"` QoS string `json:"qos" gorm:"column:qos"` TflopsRequest float64 `json:"tflopsRequest" gorm:"column:tflops_request"` TflopsLimit float64 `json:"tflopsLimit" gorm:"column:tflops_limit"` VramBytesRequest float64 `json:"vramBytesRequest" gorm:"column:vram_bytes_request"` VramBytesLimit float64 `json:"vramBytesLimit" gorm:"column:vram_bytes_limit"` GPUCount int `json:"gpuCount" gorm:"column:gpu_count"` RawCost float64 `json:"rawCost" gorm:"column:raw_cost"` Ready bool `json:"ready" gorm:"column:ready"` LastRecordTime time.Time `json:"lastRecordTime" gorm:"column:ts;index:,class:TIME"` // contains filtered or unexported fields }
Metrics are stored in a map whose key is the worker name and whose value is the metrics. By default, metrics are updated every minute.
func (WorkerResourceMetrics) TableName ¶ added in v1.34.0
func (wm WorkerResourceMetrics) TableName() string