constants

package
v1.46.3

This package is not in the latest version of its module.
Published: Sep 29, 2025 License: Apache-2.0 Imports: 2 Imported by: 0

Documentation

Index

Constants

View Source
const (
	// Domain is the domain prefix used for all tensor-fusion.ai related annotations and finalizers
	Domain = "tensor-fusion.ai"

	// Finalizer constants
	FinalizerSuffix = "finalizer"
	Finalizer       = Domain + "/" + FinalizerSuffix

	SchedulerName = "tensor-fusion-scheduler"

	LabelKeyOwner           = Domain + "/managed-by"
	LabelKeyClusterOwner    = Domain + "/cluster"
	LabelKeyNodeClass       = Domain + "/node-class"
	LabelKeyPodTemplateHash = Domain + "/pod-template-hash"
	LabelNodeSelectorHash   = Domain + "/node-selector-hash"
	LabelComponent          = Domain + "/component"
	// Used by the TensorFusion connection to match related connections when the worker Pod state changes
	LabelWorkerName = Domain + "/worker-name"

	ComponentClient        = "client"
	ComponentWorker        = "worker"
	ComponentHypervisor    = "hypervisor"
	ComponentNodeDiscovery = "node-discovery"
	ComponentOperator      = "operator"

	GPUNodePoolIdentifierLabelPrefix = Domain + "/pool-"
	GPUNodePoolIdentifierLabelFormat = Domain + "/pool-%s"
	NodeDeletionMark                 = Domain + "/should-delete"

	TensorFusionEnabledLabelKey = Domain + "/enabled"
	InitialGPUNodeSelector      = "nvidia.com/gpu.present=true"

	LastSyncTimeAnnotationKey = Domain + "/last-sync"
	WorkloadKey               = Domain + "/workload"

	GpuPoolKey = Domain + "/gpupool"

	// Annotation key constants
	GpuCountAnnotation             = Domain + "/gpu-count"
	TFLOPSRequestAnnotation        = Domain + "/tflops-request"
	VRAMRequestAnnotation          = Domain + "/vram-request"
	TFLOPSLimitAnnotation          = Domain + "/tflops-limit"
	VRAMLimitAnnotation            = Domain + "/vram-limit"
	WorkloadProfileAnnotation      = Domain + "/workload-profile"
	InjectContainerAnnotation      = Domain + "/inject-container"
	IsLocalGPUAnnotation           = Domain + "/is-local-gpu"
	QoSLevelAnnotation             = Domain + "/qos"
	EmbeddedWorkerAnnotation       = Domain + "/embedded-worker"
	DedicatedWorkerAnnotation      = Domain + "/dedicated-worker"
	StandaloneWorkerModeAnnotation = Domain + "/no-standalone-worker-mode"
	// GPUModelAnnotation specifies the required GPU model (e.g., "A100", "H100")
	GPUModelAnnotation = Domain + "/gpu-model"
	// The GPU ID list is assigned by the scheduler and should not be specified by the user
	GPUDeviceIDsAnnotation            = Domain + "/gpu-ids"
	DedicatedGPUAnnotation            = Domain + "/dedicated-gpu"
	SetPendingOwnedWorkloadAnnotation = Domain + "/pending-owned-workload"
	PricingAnnotation                 = Domain + "/hourly-pricing"
	// In remote vGPU mode, the selected workload is set by the user via the /workload annotation or generated by the system
	SelectedWorkloadAnnotation = Domain + "/selected-workload"

	WorkloadModeAnnotation = Domain + "/workload-mode"
	WorkloadModeDynamic    = "dynamic"
	WorkloadModeFixed      = "fixed"

	// Annotations for the kill switch to disable features:
	// ['gpu-opt', 'mem-manager', 'gpu-limiter']
	DisableFeaturesAnnotation = Domain + "/disable-features"
	BuiltInFeaturesGpuOpt     = "gpu-opt"
	BuiltInFeaturesGpuLimiter = "gpu-limiter"
	BuiltInFeaturesMemManager = "mem-manager"
	// For debugging remote vGPU only: disables worker auto-start so it can be started manually with an ad-hoc command inside the Pod
	BuiltInFeatureStartWorker = "start-worker"

	GenHostPortLabel        = Domain + "/host-port"
	GenHostPortLabelValue   = "auto"
	GenHostPortNameLabel    = Domain + "/port-name"
	GenPortNumberAnnotation = Domain + "/port-number"

	AutoScaleResourcesAnnotation      = Domain + "/auto-resources"
	AutoScaleReplicasAnnotation       = Domain + "/auto-replicas"
	AutoScaleTargetResourceAnnotation = Domain + "/auto-scale-target-resource"

	GpuReleasedAnnotation = Domain + "/gpu-released"

	TensorFusionPodCounterKeyAnnotation = Domain + "/pod-counter-key"
	TensorFusionPodCountAnnotation      = Domain + "/tf-pod-count"
	TensorFusionWorkerSuffix            = "-tf"

	// For grey (canary) release
	TensorFusionEnabledReplicasAnnotation = Domain + "/enabled-replicas"
	TensorFusionDefaultPoolKeyAnnotation  = Domain + "/is-default-pool"
	// Eviction protection annotation for controlling pod eviction timing
	EvictionProtectionAnnotation = Domain + "/eviction-protection"

	NamespaceDefaultVal = "tensor-fusion-sys"

	KubernetesHostNameLabel = "kubernetes.io/hostname"
	KarpenterExpansionLabel = Domain + "/expansion-source"

	HypervisorServiceAccountName = "tensor-fusion-hypervisor-sa"

	TSDBVersionConfigMap = "tensor-fusion-tsdb-version"

	QoSLevelLow      = "low"
	QoSLevelMedium   = "medium"
	QoSLevelHigh     = "high"
	QoSLevelCritical = "critical"
)
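
As a quick illustration of how these keys are meant to be used, the sketch below labels and annotates a Pod so TensorFusion can pick it up. This is a minimal sketch only: the import path, resource values, and QoS choice are assumptions, not taken from this package's documentation.

package example

import (
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	constants "github.com/NexusGPU/tensor-fusion/internal/constants" // assumed import path
)

// tensorFusionPod builds a Pod that opts into TensorFusion and requests GPU
// resources via annotations. All values are illustrative.
func tensorFusionPod() *corev1.Pod {
	return &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "demo",
			Namespace: "default",
			Labels: map[string]string{
				constants.TensorFusionEnabledLabelKey: constants.TrueStringValue,
			},
			Annotations: map[string]string{
				constants.GpuCountAnnotation:      "1",
				constants.TFLOPSRequestAnnotation: "10",
				constants.VRAMRequestAnnotation:   "8Gi",
				constants.TFLOPSLimitAnnotation:   "20",
				constants.VRAMLimitAnnotation:     "16Gi",
				constants.QoSLevelAnnotation:      constants.QoSLevelMedium,
			},
		},
	}
}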
View Source
const (
	TrueStringValue  = "true"
	FalseStringValue = "false"
)

To avoid Go lint issues

View Source
const (
	ConditionStatusTypeReady           = "Ready"
	ConditionStatusTypeGPUScheduled    = "GPUScheduled"
	ConditionStatusTypeConnectionReady = "ConnectionReady"
	ConditionStatusTypeNodeProvisioned = "NodeProvisioned"
	ConditionStatusTypePoolReady       = "PoolReady"

	ConditionStatusTypeGPUPool               = "GPUPoolReady"
	ConditionStatusTypeTimeSeriesDatabase    = "TimeSeriesDatabaseReady"
	ConditionStatusTypeCloudVendorConnection = "CloudVendorConnectionReady"

	ConditionStatusTypeRecommendationProvided = "RecommendationProvided"
)
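
These are condition types for Kubernetes status conditions. As a minimal sketch (assuming the usual apimachinery helpers and an assumed import path for this package), one of them could be recorded like this; the reason string is hypothetical:

package example

import (
	"k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	constants "github.com/NexusGPU/tensor-fusion/internal/constants" // assumed import path
)

// markReady sets the Ready condition on an object's status condition list.
func markReady(conditions *[]metav1.Condition) {
	meta.SetStatusCondition(conditions, metav1.Condition{
		Type:   constants.ConditionStatusTypeReady,
		Status: metav1.ConditionTrue,
		Reason: "AllComponentsHealthy", // hypothetical reason
	})
}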
View Source
const (
	PhaseUnknown    = "Unknown"
	PhasePending    = "Pending"
	PhaseUpdating   = "Updating"
	PhaseScheduling = "Scheduling"
	PhaseMigrating  = "Migrating"
	PhaseDestroying = "Destroying"

	PhaseRunning   = "Running"
	PhaseSucceeded = "Succeeded"
	PhaseFailed    = "Failed"
)
View Source
const (
	GPUNodeOSLinux   = "linux"
	GPUNodeOSWindows = "windows"
	GPUNodeOSMacOS   = "macos"
)
View Source
const (
	ProvisionerLabelKey        = Domain + "/node-provisioner"
	ProvisionerMissingLabel    = Domain + "/orphan"
	ProvisionerNamePlaceholder = "__GPU_NODE_RESOURCE_NAME__"
)

To match a GPUNode with a Kubernetes node when provisioning from a cloud vendor, this label must be set via cloud-init userdata
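
Presumably the cloud-init userdata registers the node with this label set to the GPUNode resource name (substituting ProvisionerNamePlaceholder), so the controller can later find the matching Kubernetes node by label. A minimal sketch of that lookup, assuming a controller-runtime client and an assumed import path for this package:

package example

import (
	"context"

	corev1 "k8s.io/api/core/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"

	constants "github.com/NexusGPU/tensor-fusion/internal/constants" // assumed import path
)

// findNodeForGPUNode returns the Kubernetes node carrying the provisioner label
// for the given GPUNode, or nil if it has not registered yet.
func findNodeForGPUNode(ctx context.Context, c client.Client, gpuNodeName string) (*corev1.Node, error) {
	var nodes corev1.NodeList
	if err := c.List(ctx, &nodes, client.MatchingLabels{
		constants.ProvisionerLabelKey: gpuNodeName,
	}); err != nil {
		return nil, err
	}
	if len(nodes.Items) == 0 {
		return nil, nil
	}
	return &nodes.Items[0], nil
}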

View Source
const (
	LeaderInfoConfigMapName        = "tensor-fusion-operator-leader-info"
	LeaderInfoConfigMapLeaderIPKey = "leader-ip"
)
View Source
const (
	LowFrequencyObjFailureInitialDelay        = 300 * time.Millisecond
	LowFrequencyObjFailureMaxDelay            = 1000 * time.Second
	LowFrequencyObjFailureMaxRPS              = 1
	LowFrequencyObjFailureMaxBurst            = 1
	LowFrequencyObjFailureConcurrentReconcile = 5
)
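
These look like controller rate-limiting parameters. Below is a minimal sketch of how they could be wired into controller options, assuming a recent controller-runtime and client-go (typed workqueue rate limiters) and an assumed import path for this package; this is not necessarily how the operator itself wires them:

package example

import (
	"golang.org/x/time/rate"
	"k8s.io/client-go/util/workqueue"
	"sigs.k8s.io/controller-runtime/pkg/controller"
	"sigs.k8s.io/controller-runtime/pkg/reconcile"

	constants "github.com/NexusGPU/tensor-fusion/internal/constants" // assumed import path
)

// lowFrequencyOptions builds controller options from the low-frequency
// failure backoff, rate, and concurrency constants.
func lowFrequencyOptions() controller.Options {
	return controller.Options{
		MaxConcurrentReconciles: constants.LowFrequencyObjFailureConcurrentReconcile,
		RateLimiter: workqueue.NewTypedMaxOfRateLimiter(
			workqueue.NewTypedItemExponentialFailureRateLimiter[reconcile.Request](
				constants.LowFrequencyObjFailureInitialDelay,
				constants.LowFrequencyObjFailureMaxDelay,
			),
			&workqueue.TypedBucketRateLimiter[reconcile.Request]{
				Limiter: rate.NewLimiter(
					rate.Limit(constants.LowFrequencyObjFailureMaxRPS),
					constants.LowFrequencyObjFailureMaxBurst,
				),
			},
		),
	}
}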
View Source
const (
	EnableWebhookEnv                  = "ENABLE_WEBHOOKS"
	EnableSchedulerEnv                = "ENABLE_SCHEDULER"
	EnableCustomResourceControllerEnv = "ENABLE_CR_CONTROLLER"

	// TensorFusion ControllerManager's HTTP endpoint verifies the Pod JWT signature.
	// If this env var is set, verification is disabled (it is enabled by default);
	// it should not be set to true in production environments.
	DisableConnectionAuthEnv = "DISABLE_CONNECTION_AUTH"

	NvidiaOperatorProgressiveMigrationEnv = "NVIDIA_OPERATOR_PROGRESSIVE_MIGRATION"
	RunHypervisorUtilGPUAllocatable       = "RUN_HYPERVISOR_UTIL_GPU_ALLOCATABLE"
)

System feature toggles
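
A minimal sketch of reading these toggles at startup. Whether a toggle defaults to on or off is an assumption here (except DISABLE_CONNECTION_AUTH, whose comment above states verification is enabled by default); the import path is assumed as well:

package example

import (
	"os"

	constants "github.com/NexusGPU/tensor-fusion/internal/constants" // assumed import path
)

// webhooksEnabled treats the webhook as enabled unless explicitly turned off.
// (The operator's actual default behavior may differ.)
func webhooksEnabled() bool {
	return os.Getenv(constants.EnableWebhookEnv) != constants.FalseStringValue
}

// connectionAuthDisabled: JWT verification is on by default and only an
// explicit "true" disables it, per the comment on DisableConnectionAuthEnv.
func connectionAuthDisabled() bool {
	return os.Getenv(constants.DisableConnectionAuthEnv) == constants.TrueStringValue
}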

View Source
const (
	NvidiaVisibleAllDeviceEnv   = "NVIDIA_VISIBLE_DEVICES"
	NvidiaVisibleAllDeviceValue = "all"

	TensorFusionGPUInfoConfigName       = "tensor-fusion-sys-public-gpu-info"
	TensorFusionGPUInfoConfigVolumeName = "gpu-info"
	TensorFusionGPUInfoConfigMountPath  = "/etc/tensor-fusion/gpu-info.yaml"
	TensorFusionGPUInfoConfigSubPath    = "gpu-info.yaml"
	TensorFusionGPUInfoEnvVar           = "TENSOR_FUSION_GPU_INFO_PATH"

	KubeletDevicePluginVolumeName = "device-plugin"
	KubeletDevicePluginPath       = "/var/lib/kubelet/device-plugins"

	KubeletPodResourcesVolumeName = "pod-resources"
	KubeletPodResourcesPath       = "/var/lib/kubelet/pod-resources"

	TensorFusionVectorConfigName       = "tensor-fusion-sys-vector-config"
	TensorFusionVectorConfigVolumeName = "vector-config"
	TensorFusionVectorConfigMountPath  = "/etc/vector/vector.yaml"
	TensorFusionVectorConfigSubPath    = "vector-hypervisor.yaml"

	LogsVolumeName           = "logs"
	KubernetesLogsVolumeName = "kubernetes-logs"
	KubernetesLogsPath       = "/var/log/pods"
	TensorFusionLogPath      = "/logs"

	DefaultHttpBindIP = "0.0.0.0"
)

General envs used in composed component manifests
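
For instance, the GPU info ConfigMap constants fit together as a volume, a mount, and an env var pointing a component at the mounted file. The sketch below is illustrative wiring, not the operator's actual manifest; the import path is assumed:

package example

import (
	corev1 "k8s.io/api/core/v1"

	constants "github.com/NexusGPU/tensor-fusion/internal/constants" // assumed import path
)

// gpuInfoVolume builds the volume, mount, and env var for the public GPU info file.
func gpuInfoVolume() (corev1.Volume, corev1.VolumeMount, corev1.EnvVar) {
	vol := corev1.Volume{
		Name: constants.TensorFusionGPUInfoConfigVolumeName,
		VolumeSource: corev1.VolumeSource{
			ConfigMap: &corev1.ConfigMapVolumeSource{
				LocalObjectReference: corev1.LocalObjectReference{
					Name: constants.TensorFusionGPUInfoConfigName,
				},
			},
		},
	}
	mount := corev1.VolumeMount{
		Name:      constants.TensorFusionGPUInfoConfigVolumeName,
		MountPath: constants.TensorFusionGPUInfoConfigMountPath,
		SubPath:   constants.TensorFusionGPUInfoConfigSubPath,
	}
	env := corev1.EnvVar{
		Name:  constants.TensorFusionGPUInfoEnvVar,
		Value: constants.TensorFusionGPUInfoConfigMountPath,
	}
	return vol, mount, env
}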

View Source
const (
	TFContainerNameClient        = "inject-lib"
	TFContainerNameWorker        = "tensorfusion-worker"
	TFContainerNameHypervisor    = "tensorfusion-hypervisor"
	TFContainerNameNodeDiscovery = "tensorfusion-node-discovery"
	TFContainerVector            = "vector"
)
View Source
const (
	GetConnectionURLEnv    = "TENSOR_FUSION_OPERATOR_GET_CONNECTION_URL"
	ConnectionNameEnv      = "TENSOR_FUSION_CONNECTION_NAME"
	ConnectionNamespaceEnv = "TENSOR_FUSION_CONNECTION_NAMESPACE"

	RealNvmlLibPathEnv   = "TF_NVML_LIB_PATH"
	RealCUDALibPathEnv   = "TF_CUDA_LIB_PATH"
	RealNvmlLibPathValue = "/lib/x86_64-linux-gnu/libnvidia-ml.so.1"
	RealCUDALibPathValue = "/lib/x86_64-linux-gnu/libcuda.so"

	PrependPathEnv          = "TF_PREPEND_PATH"
	PrependLDLibraryPathEnv = "TF_PREPEND_LD_LIBRARY_PATH"

	LdPreloadFileName = "ld.so.preload"
	LdPreloadFile     = "/etc/ld.so.preload"

	TFLibsVolumeName         = "tf-libs"
	TFLibsVolumeMountPath    = "/tensor-fusion"
	TFConnectionNamePrefix   = "-tf-vgpu-"
	TFConnectionNameNoPrefix = "tf-vgpu-"

	HostIPFieldRef       = "status.hostIP"
	NodeNameFieldRef     = "spec.nodeName"
	ResourceNameFieldRef = "metadata.name"
	NamespaceFieldRef    = "metadata.namespace"
)

TensorFusion client-related envs
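
The *FieldRef constants at the end of this group are Kubernetes Downward API field paths. A minimal sketch of turning them into env vars; the env var names here are hypothetical and the import path is assumed:

package example

import (
	corev1 "k8s.io/api/core/v1"

	constants "github.com/NexusGPU/tensor-fusion/internal/constants" // assumed import path
)

// downwardAPIEnv exposes the host IP and node name to a container via the
// Downward API field paths defined above.
func downwardAPIEnv() []corev1.EnvVar {
	return []corev1.EnvVar{
		{
			Name: "HOST_IP", // hypothetical env var name
			ValueFrom: &corev1.EnvVarSource{
				FieldRef: &corev1.ObjectFieldSelector{FieldPath: constants.HostIPFieldRef},
			},
		},
		{
			Name: "NODE_NAME", // hypothetical env var name
			ValueFrom: &corev1.EnvVarSource{
				FieldRef: &corev1.ObjectFieldSelector{FieldPath: constants.NodeNameFieldRef},
			},
		},
	}
}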

View Source
const (
	HypervisorIPEnv   = "HYPERVISOR_IP"
	HypervisorPortEnv = "HYPERVISOR_PORT"

	PodNamespaceEnv  = "POD_NAMESPACE"
	ContainerNameEnv = "CONTAINER_NAME"

	// the path of nGPU lib for limiter to load
	NGPUPathEnv   = "TENSOR_FUSION_NGPU_PATH"
	NGPUPathValue = TFLibsVolumeMountPath + "/libcuda.so"

	LdPreloadEnv     = "LD_PRELOAD"
	LdPreloadLimiter = "/home/app/libcuda_limiter.so"

	SharedMemMountSubPath = "/shm"

	// disable GPU limiter, for emergency use
	DisableGpuLimiterEnv = "DISABLE_GPU_LIMITER"
	// directly forward CUDA calls to GPU driver in nGPU mode, for emergency use
	DisableCudaOptimizationEnv = "TF_ENABLE_DISPATCH_FORWARD"
	// disable vram manager, for emergency use
	DisableVRAMManagerEnv      = "TF_DISABLE_MEMORY_MANAGER"
	DisableWorkerFeatureEnvVal = "1"

	TensorFusionRemoteWorkerPortNumber = 8000
	TensorFusionRemoteWorkerPortName   = "remote-vgpu"
)

TensorFusion worker-related envs

View Source
const (
	HypervisorPoolNameEnv           = "TENSOR_FUSION_POOL_NAME"
	PodNameEnv                      = "POD_NAME"
	VectorPodNodeNameEnv            = "NODE_NAME"
	HypervisorGPUNodeNameEnv        = "GPU_NODE_NAME"
	HypervisorSchedulingConfigEnv   = "TF_HYPERVISOR_SCHEDULING_CONFIG"
	HypervisorListenAddrEnv         = "API_LISTEN_ADDR"
	HypervisorMetricsFormatEnv      = "TF_HYPERVISOR_METRICS_FORMAT"
	HypervisorMetricsExtraLabelsEnv = "TF_HYPERVISOR_METRICS_EXTRA_LABELS"
	HypervisorDetectUsedGPUEnv      = "DETECT_IN_USED_GPUS"

	// Add the ptrace capability to the hypervisor container, to trace all host PIDs using the GPU
	SystemPtraceCapability = "SYS_PTRACE"

	HypervisorDefaultPortNumber int32  = 8000
	HypervisorPortName          string = "http"

	// For security enhancement, there are two types of endpoints to protect:
	// 1. the client calls the operator's /connection API to obtain the TensorFusion worker's URL
	// 2. the worker calls the hypervisor API to obtain the current worker's GPU quota info
	// If this env var is set on the operator and hypervisor, the JWT signature is verified for each call.
	// Not implemented yet: the JWT 'iss' is public in EKS and most Kubernetes distributions,
	// but k3s and some other distributions may not support this, so a way to get the SA token JWT public key is still needed.
	HypervisorVerifyServiceAccountEnabledEnvVar   = "SA_TOKEN_VERIFY_ENABLED"
	HypervisorVerifyServiceAccountPublicKeyEnvVar = "SA_TOKEN_VERIFY_PUBLIC_KEY"
)

TensorFusion hypervisor-related envs
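
A minimal sketch of how the SYS_PTRACE capability and default hypervisor port constants could appear on a container spec. The container is illustrative, not the operator's actual hypervisor manifest; the import path is assumed:

package example

import (
	corev1 "k8s.io/api/core/v1"

	constants "github.com/NexusGPU/tensor-fusion/internal/constants" // assumed import path
)

// hypervisorContainerSketch grants the ptrace capability and exposes the
// default hypervisor port on an illustrative container.
func hypervisorContainerSketch() corev1.Container {
	return corev1.Container{
		Name: constants.TFContainerNameHypervisor,
		SecurityContext: &corev1.SecurityContext{
			Capabilities: &corev1.Capabilities{
				Add: []corev1.Capability{constants.SystemPtraceCapability},
			},
		},
		Ports: []corev1.ContainerPort{{
			Name:          constants.HypervisorPortName,
			ContainerPort: constants.HypervisorDefaultPortNumber,
		}},
	}
}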

View Source
const (
	NodeDiscoveryReportGPUNodeEnvName = "NODE_DISCOVERY_REPORT_GPU_NODE"
	NodeDiscoveryHostNameEnv          = "HOSTNAME"
)

Node discovery related envs

View Source
const (
	KubeApiVersionMajorEnv = "KUBE_API_VERSION_MAJOR"
	KubeApiVersionMinorEnv = "KUBE_API_VERSION_MINOR"
)
View Source
const AuthorizationHeader = "Authorization"
View Source
const DataVolumeName = "tf-data"
View Source
const DefaultEvictionProtectionPriceRatio = 1.2
View Source
const ExtraVerificationInfoPodIDKey = "authentication.kubernetes.io/pod-uid"
View Source
const GiBToBytes = 1024 * 1024 * 1024
View Source
const KarpenterNodeClaimKind = "NodeClaim"
View Source
const MobileGpuClockSpeedMultiplier = 0.75
View Source
const NamespaceEnv = "OPERATOR_NAMESPACE"

Envs for the controller itself

View Source
const NodeCriticalPriorityClassName = "system-node-critical"
View Source
const (
	NvidiaGPUKey = "nvidia.com/gpu"
)
View Source
const SchedulerSimulationKey = "simulate-schedule"
View Source
const (
	// Do-not-disrupt label, similar to Karpenter's, to prevent a TFConnection/Worker/GPUNode from being moved to another node or the node from being destroyed.
	// Refer: https://karpenter.sh/docs/concepts/disruption/
	SchedulingDoNotDisruptLabel = Domain + "/do-not-disrupt"
)
View Source
const ShortUUIDAlphabet = "123456789abcdefghijkmnopqrstuvwxy"
View Source
const SpotInstanceAssumedDiscountRatio = 0.3
View Source
const TFDataPath = "/run/tensor-fusion"
View Source
const TFDataPathWorkerExpr = "shm/$(POD_NAMESPACE)/$(POD_NAME)"
View Source
const TensorFusionPoolManualCompaction = Domain + "/manual-compaction"
View Source
const TensorFusionSystemName = "tensor-fusion"

Variables

View Source
var (
	PendingRequeueDuration     = time.Second * 3
	StatusCheckInterval        = time.Second * 6
	GracefulPeriodSeconds      = ptr.To(int64(5))
	UnschedQueueBufferDuration = 10 * time.Second
)
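
A minimal sketch of how a requeue duration like PendingRequeueDuration is typically used in a controller-runtime reconcile loop; the reconciler and its pending check are hypothetical, and the import path is assumed:

package example

import (
	"context"

	ctrl "sigs.k8s.io/controller-runtime"

	constants "github.com/NexusGPU/tensor-fusion/internal/constants" // assumed import path
)

// reconcilePending requeues shortly when the object is not yet ready,
// instead of waiting for the next watch event.
func reconcilePending(ctx context.Context) (ctrl.Result, error) {
	stillPending := true // hypothetical condition
	if stillPending {
		return ctrl.Result{RequeueAfter: constants.PendingRequeueDuration}, nil
	}
	return ctrl.Result{}, nil
}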

Functions

This section is empty.

Types

This section is empty.
