Documentation
¶
Overview ¶
NOTE: Any new field or tag added to an existing metric, and any newly added metric, must also be added to the SetupTables function for manual DB migration.
Index ¶
- Constants
- Variables
- func RemoveNodeMetrics(nodeName string)
- func RemoveWorkerMetrics(workerName string, deletionTime time.Time)
- func SetAutoscalingMetrics(poolName string, isScaleUp bool)
- func SetNodeMetrics(node *tfv1.GPUNode, poolObj *tfv1.GPUPool, gpuModels []string)
- func SetSchedulerMetrics(poolName string, isSuccess bool)
- func SetWorkerMetricsByWorkload(pod *corev1.Pod)
- type ActiveNodeAndWorker
- type Encoder
- type EncoderType
- type GreptimeDBConnection
- type HypervisorGPUUsageMetrics
- type HypervisorWorkerUsageMetrics
- type MetricsRecorder
- type MultiProtocolEncoder
- func (m *MultiProtocolEncoder) AddField(key string, value any)
- func (m *MultiProtocolEncoder) AddTag(key, value string)
- func (m *MultiProtocolEncoder) Bytes() []byte
- func (m *MultiProtocolEncoder) EndLine(timestamp time.Time)
- func (m *MultiProtocolEncoder) Err() error
- func (m *MultiProtocolEncoder) StartLine(measurement string)
- type NodeResourceMetrics
- type RawBillingPricing
- type TFSystemLog
- type TensorFusionSystemMetrics
- type TimeSeriesDB
- type WorkerResourceMetrics
Constants ¶
const (
CREATE_TABLE_OPTION_TPL = "ENGINE=mito WITH( ttl='%s', merge_mode = 'last_non_null')"
)
const CurrentAppSQLVersion = "1.0"
Variables ¶
var TFVersionMigrationMap = []struct { Version string AlterSQL []string }{ {"1.0", []string{ "CREATE TABLE IF NOT EXISTS tf_worker_resources (\n `worker` String NULL SKIPPING INDEX,\n `workload` String NULL INVERTED INDEX,\n `pool` String NULL INVERTED INDEX,\n `namespace` String NULL INVERTED INDEX,\n `qos` String NULL,\n `tflops_request` Double NULL,\n `tflops_limit` Double NULL,\n `vram_bytes_request` Double NULL,\n `vram_bytes_limit` Double NULL,\n `gpu_count` BigInt NULL,\n `raw_cost` Double NULL,\n `ts` Timestamp_ms TIME INDEX,\n PRIMARY KEY (`worker`, `workload`, `pool`, `namespace`))\n ENGINE=mito WITH( ttl='30d', merge_mode = 'last_non_null')", "CREATE TABLE IF NOT EXISTS tf_node_resources (\n `node` String NULL INVERTED INDEX,\n `pool` String NULL INVERTED INDEX,\n `allocated_tflops` Double NULL,\n `allocated_tflops_percent` Double NULL,\n `allocated_vram_bytes` Double NULL,\n `allocated_vram_percent` Double NULL,\n `allocated_tflops_percent_virtual` Double NULL,\n `allocated_vram_percent_virtual` Double NULL,\n `raw_cost` Double NULL,\n `gpu_count` BigInt NULL,\n `ts` Timestamp_ms TIME INDEX,\n PRIMARY KEY (`node`, `pool`))\n ENGINE=mito WITH( ttl='30d', merge_mode = 'last_non_null')", "CREATE TABLE IF NOT EXISTS tf_system_metrics (\n `pool` String NULL INVERTED INDEX,\n `total_workers_cnt` BigInt NULL,\n `total_nodes_cnt` BigInt NULL,\n `total_allocation_fail_cnt` BigInt NULL,\n `total_allocation_success_cnt` BigInt NULL,\n `total_scale_up_cnt` BigInt NULL,\n `total_scale_down_cnt` BigInt NULL,\n `ts` Timestamp_ms TIME INDEX,\n PRIMARY KEY (`pool`))\n ENGINE=mito WITH( ttl='30d', merge_mode = 'last_non_null')", "CREATE TABLE IF NOT EXISTS tf_system_log (\n `component` String NULL INVERTED INDEX,\n `container` String NULL INVERTED INDEX,\n `message` String NULL FULLTEXT INDEX WITH (analyzer = 'English' , case_sensitive = 'false'),\n `namespace` String NULL INVERTED INDEX,\n `pod` String NULL SKIPPING INDEX,\n `stream` String NULL,\n `timestamp` String 
NULL,\n `greptime_timestamp` Timestamp_ms TIME INDEX,\n PRIMARY KEY (`component`, `container`, `namespace`, `pod`))\n ENGINE=mito WITH( ttl='30d', merge_mode = 'last_non_null')", "CREATE TABLE IF NOT EXISTS tf_worker_usage (\n `workload` String NULL INVERTED INDEX,\n `worker` String NULL SKIPPING INDEX,\n `pool` String NULL INVERTED INDEX,\n `node` String NULL INVERTED INDEX,\n `uuid` String NULL INVERTED INDEX,\n `compute_percentage` Double NULL,\n `memory_bytes` BigInt UNSIGNED NULL,\n `compute_tflops` Double NULL,\n `compute_throttled_cnt` BigInt NULL,\n `vram_freezed_cnt` BigInt NULL,\n `vram_resumed_cnt` BigInt NULL,\n `ts` Timestamp_ms TIME INDEX,\n PRIMARY KEY (`workload`, `worker`, `pool`, `node`, `uuid`))\n ENGINE=mito WITH( ttl='30d', merge_mode = 'last_non_null')", "CREATE TABLE IF NOT EXISTS tf_gpu_usage (\n `node` String NULL INVERTED INDEX,\n `pool` String NULL INVERTED INDEX,\n `uuid` String NULL INVERTED INDEX,\n `compute_percentage` Double NULL,\n `memory_percentage` Double NULL,\n `memory_bytes` BigInt UNSIGNED NULL,\n `compute_tflops` Double NULL,\n `rx` Double NULL,\n `tx` Double NULL,\n `temperature` Double NULL,\n `ts` Timestamp_ms TIME INDEX,\n PRIMARY KEY (`node`, `pool`, `uuid`))\n ENGINE=mito WITH( ttl='30d', merge_mode = 'last_non_null')", }}, {"1.1", []string{}}, }
When upgrading the database, run the ALTER SQL statements in order for every version not lower than the current version, up to and including the target version.
var TensorFusionSystemMetricsMap = make(map[string]*TensorFusionSystemMetrics)
Functions ¶
func RemoveNodeMetrics ¶ added in v1.33.4
func RemoveNodeMetrics(nodeName string)
func RemoveWorkerMetrics ¶ added in v1.33.4
func SetAutoscalingMetrics ¶ added in v1.34.0
TODO: record metrics once the autoscaling feature is added.
func SetNodeMetrics ¶ added in v1.33.4
func SetSchedulerMetrics ¶ added in v1.34.0
func SetWorkerMetricsByWorkload ¶ added in v1.33.4
Types ¶
type ActiveNodeAndWorker ¶ added in v1.34.0
type ActiveNodeAndWorker struct {
// contains filtered or unexported fields
}
type Encoder ¶ added in v1.37.0
type Encoder interface { StartLine(measurement string) AddTag(key, value string) AddField(key string, value any) EndLine(timestamp time.Time) Bytes() []byte Err() error }
func NewEncoder ¶ added in v1.37.0
type EncoderType ¶ added in v1.37.0
type EncoderType uint8
EncoderType represents the encoder type as an enum for better performance
const ( EncoderTypeInflux EncoderType = iota EncoderTypeJson EncoderTypeOTel )
type GreptimeDBConnection ¶ added in v1.34.0
type HypervisorGPUUsageMetrics ¶ added in v1.34.0
type HypervisorGPUUsageMetrics struct { NodeName string `json:"nodeName" gorm:"column:node;index:,class:INVERTED"` PoolName string `json:"poolName" gorm:"column:pool;index:,class:INVERTED"` UUID string `json:"uuid" gorm:"column:uuid;index:,class:INVERTED"` ComputePercent float64 `json:"computePercent" gorm:"column:compute_percentage"` VRAMPercent float64 `json:"vramPercent" gorm:"column:memory_percentage"` VRAMBytes uint64 `json:"vramBytes" gorm:"column:memory_bytes"` ComputeTflops float64 `json:"computeTflops" gorm:"column:compute_tflops"` PcieRxKB float64 `json:"pcieRx" gorm:"column:rx"` PcieTxKB float64 `json:"pcieTx" gorm:"column:tx"` Temperature float64 `json:"temperature" gorm:"column:temperature"` Timestamp time.Time `json:"ts" gorm:"column:ts;index:,class:TIME"` }
func (HypervisorGPUUsageMetrics) TableName ¶ added in v1.34.0
func (nu HypervisorGPUUsageMetrics) TableName() string
type HypervisorWorkerUsageMetrics ¶ added in v1.34.0
type HypervisorWorkerUsageMetrics struct { WorkloadName string `json:"workloadName" gorm:"column:workload;index:,class:INVERTED"` WorkerName string `json:"workerName" gorm:"column:worker;index:,class:SKIPPING"` PoolName string `json:"poolName" gorm:"column:pool;index:,class:INVERTED"` NodeName string `json:"nodeName" gorm:"column:node;index:,class:INVERTED"` UUID string `json:"uuid" gorm:"column:uuid;index:,class:INVERTED"` ComputePercent float64 `json:"computePercent" gorm:"column:compute_percentage"` VRAMBytes uint64 `json:"vramBytes" gorm:"column:memory_bytes"` ComputeTflops float64 `json:"computeTflops" gorm:"column:compute_tflops"` ComputeThrottledCount int64 `json:"computeThrottledCount" gorm:"column:compute_throttled_cnt"` VRAMFreezedCount int64 `json:"vramFreezedCount" gorm:"column:vram_freezed_cnt"` VRAMResumedCount int64 `json:"vramResumedCount" gorm:"column:vram_resumed_cnt"` Timestamp time.Time `json:"ts" gorm:"column:ts;index:,class:TIME"` }
func (HypervisorWorkerUsageMetrics) TableName ¶ added in v1.34.0
func (wu HypervisorWorkerUsageMetrics) TableName() string
type MetricsRecorder ¶ added in v1.33.4
type MetricsRecorder struct { MetricsOutputPath string // Raw billing result for node and workers HourlyUnitPriceMap map[string]float64 // Worker level unit price map, key is pool name, second level key is QoS level WorkerUnitPriceMap map[string]map[string]RawBillingPricing }
func (*MetricsRecorder) RecordMetrics ¶ added in v1.33.4
func (mr *MetricsRecorder) RecordMetrics(writer io.Writer)
func (*MetricsRecorder) Start ¶ added in v1.33.4
func (mr *MetricsRecorder) Start()
Start starts the metrics recorder. The leader container fills the metrics map, so followers have no metric points; as a result, the metrics recorder only prints in one controller instance. Known issue: the one-minute interval could cause some metrics to be missed or billing to be inaccurate.
type MultiProtocolEncoder ¶ added in v1.37.0
type MultiProtocolEncoder struct {
// contains filtered or unexported fields
}
func (*MultiProtocolEncoder) AddField ¶ added in v1.37.0
func (m *MultiProtocolEncoder) AddField(key string, value any)
func (*MultiProtocolEncoder) AddTag ¶ added in v1.37.0
func (m *MultiProtocolEncoder) AddTag(key, value string)
func (*MultiProtocolEncoder) Bytes ¶ added in v1.37.0
func (m *MultiProtocolEncoder) Bytes() []byte
func (*MultiProtocolEncoder) EndLine ¶ added in v1.37.0
func (m *MultiProtocolEncoder) EndLine(timestamp time.Time)
func (*MultiProtocolEncoder) Err ¶ added in v1.37.0
func (m *MultiProtocolEncoder) Err() error
func (*MultiProtocolEncoder) StartLine ¶ added in v1.37.0
func (m *MultiProtocolEncoder) StartLine(measurement string)
type NodeResourceMetrics ¶ added in v1.34.0
type NodeResourceMetrics struct { NodeName string `json:"nodeName" gorm:"column:node;index:,class:INVERTED"` PoolName string `json:"poolName" gorm:"column:pool;index:,class:INVERTED"` AllocatedTflops float64 `json:"allocatedTflops" gorm:"column:allocated_tflops"` AllocatedTflopsPercent float64 `json:"allocatedTflopsPercent" gorm:"column:allocated_tflops_percent"` AllocatedVramBytes float64 `json:"allocatedVramBytes" gorm:"column:allocated_vram_bytes"` AllocatedVramPercent float64 `json:"allocatedVramPercent" gorm:"column:allocated_vram_percent"` AllocatedTflopsPercentToVirtualCap float64 `json:"allocatedTflopsPercentToVirtualCap" gorm:"column:allocated_tflops_percent_virtual"` AllocatedVramPercentToVirtualCap float64 `json:"allocatedVramPercentToVirtualCap" gorm:"column:allocated_vram_percent_virtual"` RawCost float64 `json:"rawCost" gorm:"column:raw_cost"` GPUCount int `json:"gpuCount" gorm:"column:gpu_count"` LastRecordTime time.Time `json:"lastRecordTime" gorm:"column:ts;index:,class:TIME"` // contains filtered or unexported fields }
func (*NodeResourceMetrics) SetGPUModelAndCount ¶ added in v1.34.0
func (nm *NodeResourceMetrics) SetGPUModelAndCount(gpuModels []string)
func (NodeResourceMetrics) TableName ¶ added in v1.34.0
func (nm NodeResourceMetrics) TableName() string
type RawBillingPricing ¶ added in v1.33.4
type TFSystemLog ¶ added in v1.34.0
type TFSystemLog struct { Component string `json:"component" gorm:"column:component;index:,class:INVERTED"` Container string `json:"container" gorm:"column:container;index:,class:INVERTED"` Message string `` /* 126-byte string literal not displayed */ Namespace string `json:"namespace" gorm:"column:namespace;index:,class:INVERTED"` Pod string `json:"pod" gorm:"column:pod;index:,class:SKIPPING"` Stream string `json:"stream" gorm:"column:stream"` // message written timestamp Timestamp string `json:"timestamp" gorm:"column:timestamp"` GreptimeTimestamp time.Time `json:"greptime_timestamp" gorm:"column:greptime_timestamp;index:,class:TIME;precision:ms"` }
func (TFSystemLog) TableName ¶ added in v1.34.0
func (sl TFSystemLog) TableName() string
type TensorFusionSystemMetrics ¶ added in v1.34.0
type TensorFusionSystemMetrics struct { PoolName string `json:"poolName" gorm:"column:pool;index:,class:INVERTED"` TotalWorkerCount int64 `json:"totalWorkerCount" gorm:"column:total_workers_cnt"` TotalNodeCount int64 `json:"totalNodeCount" gorm:"column:total_nodes_cnt"` TotalAllocationFailCount int64 `json:"totalAllocationFailCount" gorm:"column:total_allocation_fail_cnt"` TotalAllocationSuccessCount int64 `json:"totalAllocationSuccessCount" gorm:"column:total_allocation_success_cnt"` TotalScaleUpCount int64 `json:"totalScaleUpCount" gorm:"column:total_scale_up_cnt"` TotalScaleDownCount int64 `json:"totalScaleDownCount" gorm:"column:total_scale_down_cnt"` Timestamp time.Time `json:"ts" gorm:"column:ts;index:,class:TIME"` }
func (TensorFusionSystemMetrics) TableName ¶ added in v1.34.0
func (wm TensorFusionSystemMetrics) TableName() string
type TimeSeriesDB ¶ added in v1.34.0
func (*TimeSeriesDB) FindRecentNodeMetrics ¶ added in v1.34.0
func (t *TimeSeriesDB) FindRecentNodeMetrics() ([]NodeResourceMetrics, error)
func (*TimeSeriesDB) SetTableTTL ¶ added in v1.34.0
func (t *TimeSeriesDB) SetTableTTL(ttl string) error
func (*TimeSeriesDB) Setup ¶ added in v1.34.0
func (m *TimeSeriesDB) Setup(connection GreptimeDBConnection) error
func (*TimeSeriesDB) SetupTables ¶ added in v1.34.0
func (t *TimeSeriesDB) SetupTables(client client.Client) error
type WorkerResourceMetrics ¶ added in v1.34.0
type WorkerResourceMetrics struct { WorkerName string `json:"workerName" gorm:"column:worker;index:,class:SKIPPING"` WorkloadName string `json:"workloadName" gorm:"column:workload;index:,class:INVERTED"` PoolName string `json:"poolName" gorm:"column:pool;index:,class:INVERTED"` Namespace string `json:"namespace" gorm:"column:namespace;index:,class:INVERTED"` QoS string `json:"qos" gorm:"column:qos"` TflopsRequest float64 `json:"tflopsRequest" gorm:"column:tflops_request"` TflopsLimit float64 `json:"tflopsLimit" gorm:"column:tflops_limit"` VramBytesRequest float64 `json:"vramBytesRequest" gorm:"column:vram_bytes_request"` VramBytesLimit float64 `json:"vramBytesLimit" gorm:"column:vram_bytes_limit"` GPUCount int `json:"gpuCount" gorm:"column:gpu_count"` RawCost float64 `json:"rawCost" gorm:"column:raw_cost"` LastRecordTime time.Time `json:"lastRecordTime" gorm:"column:ts;index:,class:TIME"` // contains filtered or unexported fields }
Metrics are stored in a map whose key is the worker name and whose value is the metrics. By default, metrics are updated every minute.
func (WorkerResourceMetrics) TableName ¶ added in v1.34.0
func (wm WorkerResourceMetrics) TableName() string