Documentation
Index

Constants
const (
    // Domain is the domain prefix used for all tensor-fusion.ai related annotations and finalizers
    Domain = "tensor-fusion.ai"

    // Finalizer constants
    FinalizerSuffix = "finalizer"
    Finalizer       = Domain + "/" + FinalizerSuffix

    SchedulerName = "tensor-fusion-scheduler"

    LabelKeyOwner           = Domain + "/managed-by"
    LabelKeyClusterOwner    = Domain + "/cluster"
    LabelKeyNodeClass       = Domain + "/node-class"
    LabelKeyPodTemplateHash = Domain + "/pod-template-hash"
    LabelNodeSelectorHash   = Domain + "/node-selector-hash"
    LabelComponent          = Domain + "/component"
    // used by TF connection, for matching the related connections when the worker Pod state changes
    LabelWorkerName = Domain + "/worker-name"

    ComponentClient        = "client"
    ComponentWorker        = "worker"
    ComponentHypervisor    = "hypervisor"
    ComponentNodeDiscovery = "node-discovery"
    ComponentOperator      = "operator"

    GPUNodePoolIdentifierLabelPrefix = Domain + "/pool-"
    GPUNodePoolIdentifierLabelFormat = Domain + "/pool-%s"
    NodeDeletionMark                 = Domain + "/should-delete"

    TensorFusionEnabledLabelKey = Domain + "/enabled"
    InitialGPUNodeSelector      = "nvidia.com/gpu.present=true"

    LastSyncTimeAnnotationKey = Domain + "/last-sync"
    WorkloadKey               = Domain + "/workload"
    GpuPoolKey                = Domain + "/gpupool"

    // Annotation key constants
    GpuCountAnnotation = Domain + "/gpu-count"
    // Specify which GPU index to use when multiple GPUs are available; comma-separated list of GPU indices
    GpuIndicesAnnotation = Domain + "/gpu-indices"
    // Specify which GPU/NPU/XPU vendor to use; defaults to any vendor in the cluster
    GpuVendorAnnotation     = Domain + "/vendor"
    TFLOPSRequestAnnotation = Domain + "/tflops-request"
    VRAMRequestAnnotation   = Domain + "/vram-request"
    TFLOPSLimitAnnotation   = Domain + "/tflops-limit"
    VRAMLimitAnnotation     = Domain + "/vram-limit"
    // StreamMultiProcessor/AICore percentage, an alternative to TFLOPS request and limit; NOT the recommended way.
    // NOTE: using percent makes namespace-level quota checks impossible and bypasses all quota checks;
    // thus, percent should only be used when tenant quota is not needed and the cluster has only one type of GPU
    ComputeRequestAnnotation = Domain + "/compute-percent-request"
    ComputeLimitAnnotation   = Domain + "/compute-percent-limit"

    WorkloadProfileAnnotation = Domain + "/workload-profile"
    InjectContainerAnnotation = Domain + "/inject-container"
    IsLocalGPUAnnotation      = Domain + "/is-local-gpu"
    QoSLevelAnnotation        = Domain + "/qos"
    EmbeddedWorkerAnnotation  = Domain + "/embedded-worker"
    DedicatedWorkerAnnotation = Domain + "/dedicated-worker"
    SidecarWorkerAnnotation   = Domain + "/sidecar-worker"
    // How to isolate computing resources; defaults to `soft` mode, could be `shared` or `hard`
    ComputingIsolationModeAnnotation = Domain + "/compute-isolation"
    // GPUModelAnnotation specifies the required GPU model (e.g., "A100", "H100")
    GPUModelAnnotation = Domain + "/gpu-model"
    // The GPU ID list is assigned by the scheduler and should not be specified by the user
    GPUDeviceIDsAnnotation            = Domain + "/gpu-ids"
    DedicatedGPUAnnotation            = Domain + "/dedicated-gpu"
    SetPendingOwnedWorkloadAnnotation = Domain + "/pending-owned-workload"
    PricingAnnotation                 = Domain + "/hourly-pricing"
    // In remote vGPU mode, the selected workload is set by the user with the /workload annotation or generated by the system
    SelectedWorkloadAnnotation = Domain + "/selected-workload"
    // An additional worker pod template is set by the user with the /worker-pod-template annotation
    WorkerPodTemplateAnnotation = Domain + "/worker-pod-template"

    WorkloadModeAnnotation = Domain + "/workload-mode"
    WorkloadModeDynamic    = "dynamic"
    WorkloadModeFixed      = "fixed"

    // Pros: simple and stable, no performance overhead, maximizes GPU utilization when well-scheduled
    // Cons: cannot auto-scale or differentiate QoS levels; the TFLOPS limit does not take effect, which may cause resource contention
    ComputingIsolationModeShared = "shared"
    // default isolation mode; uses a Proportional-Integral-Derivative controller to isolate computing resources and assign time slices
    // Pros: can set QoS levels for different workloads; the TFLOPS limit is relatively accurate
    // Cons: ~1% performance overhead when burst credits are consumed
    ComputingIsolationModeSoft = "soft"
    // uses SM partitioning to isolate computing resources; each Pod gets dedicated SMs, depending on GPU driver support
    // Pros: better performance isolation, no performance overhead
    // Cons: cannot auto-scale dynamically, the percentage may not reach 1%/1 TFLOPS accuracy, coupled with the GPU vendor's SM partitioning implementation
    // NOTE: this can only be used in Remote or Local+SidecarWorker mode, not supported in LocalGPU mode (because there is no TensorFusion worker)
    ComputingIsolationModeHard = "hard"

    // Annotation for the kill switch: disable features
    // ['gpu-opt', 'mem-manager', 'gpu-limiter']
    DisableFeaturesAnnotation = Domain + "/disable-features"
    BuiltInFeaturesGpuOpt     = "gpu-opt"
    BuiltInFeaturesGpuLimiter = "gpu-limiter"
    BuiltInFeaturesMemManager = "mem-manager"
    // For debug purposes only in remote vGPU mode: disable starting the worker, so it can be started manually with an ad-hoc command inside the Pod
    BuiltInFeatureStartWorker = "start-worker"

    GenHostPortLabel        = Domain + "/host-port"
    GenHostPortLabelValue   = "auto"
    GenHostPortNameLabel    = Domain + "/port-name"
    GenPortNumberAnnotation = Domain + "/port-number"

    AutoScaleResourcesAnnotation      = Domain + "/auto-resources"
    AutoScaleReplicasAnnotation       = Domain + "/auto-replicas"
    AutoScaleTargetResourceAnnotation = Domain + "/auto-scale-target-resource"

    GpuReleasedAnnotation = Domain + "/gpu-released"

    TensorFusionPodCounterKeyAnnotation = Domain + "/pod-counter-key"
    TensorFusionPodCountAnnotation      = Domain + "/tf-pod-count"
    TensorFusionWorkerSuffix            = "-tf"

    // For grey release
    TensorFusionEnabledReplicasAnnotation = Domain + "/enabled-replicas"
    TensorFusionDefaultPoolKeyAnnotation  = Domain + "/is-default-pool"

    // Eviction protection annotation for controlling pod eviction timing
    EvictionProtectionAnnotation = Domain + "/eviction-protection"

    NamespaceDefaultVal = "tensor-fusion-sys"

    KubernetesHostNameLabel = "kubernetes.io/hostname"

    KarpenterExpansionLabel = Domain + "/expansion-source"

    HypervisorServiceAccountName = "tensor-fusion-hypervisor-sa"

    TSDBVersionConfigMap = "tensor-fusion-tsdb-version"

    QoSLevelLow      = "low"
    QoSLevelMedium   = "medium"
    QoSLevelHigh     = "high"
    QoSLevelCritical = "critical"
)
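For orientation, here is a minimal sketch of how a client Pod might opt in to TensorFusion using these label and annotation keys. The constants are assumed to be in scope (imported from this package), and the concrete values ("1", "10", "4Gi", ...) are illustrative placeholders, not recommended settings.

    package example

    import (
        corev1 "k8s.io/api/core/v1"
        metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    )

    // buildAnnotatedPod is illustrative only: it shows where the label and
    // annotation keys attach on a Pod, not what values to use.
    func buildAnnotatedPod() *corev1.Pod {
        return &corev1.Pod{
            ObjectMeta: metav1.ObjectMeta{
                Name:      "demo",
                Namespace: "default",
                Labels: map[string]string{
                    TensorFusionEnabledLabelKey: "true", // opt in to TensorFusion
                },
                Annotations: map[string]string{
                    GpuCountAnnotation:      "1",   // number of GPUs to allocate
                    TFLOPSRequestAnnotation: "10",  // compute request
                    TFLOPSLimitAnnotation:   "20",  // compute limit
                    VRAMRequestAnnotation:   "4Gi", // VRAM request
                    VRAMLimitAnnotation:     "8Gi", // VRAM limit
                    QoSLevelAnnotation:      QoSLevelMedium,
                },
            },
        }
    }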
const (
    TrueStringValue  = "true"
    FalseStringValue = "false"
)
Constants for "true"/"false" string values, to avoid golang lint issues
const (
    ConditionStatusTypeReady                  = "Ready"
    ConditionStatusTypeGPUScheduled           = "GPUScheduled"
    ConditionStatusTypeConnectionReady        = "ConnectionReady"
    ConditionStatusTypeNodeProvisioned        = "NodeProvisioned"
    ConditionStatusTypePoolReady              = "PoolReady"
    ConditionStatusTypeGPUPool                = "GPUPoolReady"
    ConditionStatusTypeTimeSeriesDatabase     = "TimeSeriesDatabaseReady"
    ConditionStatusTypeCloudVendorConnection  = "CloudVendorConnectionReady"
    ConditionStatusTypeRecommendationProvided = "RecommendationProvided"
)
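These condition types plug into the standard metav1.Condition machinery. A small sketch, assuming the resource status follows the usual []metav1.Condition convention (the Reason and Message strings here are hypothetical):

    package example

    import (
        "k8s.io/apimachinery/pkg/api/meta"
        metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    )

    // markReady upserts the Ready condition on a resource's status conditions.
    func markReady(conds *[]metav1.Condition, generation int64) {
        meta.SetStatusCondition(conds, metav1.Condition{
            Type:               ConditionStatusTypeReady,
            Status:             metav1.ConditionTrue,
            Reason:             "AllComponentsUp", // hypothetical reason
            Message:            "resource is ready",
            ObservedGeneration: generation,
        })
    }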
const (
    PhaseUnknown    = "Unknown"
    PhasePending    = "Pending"
    PhaseUpdating   = "Updating"
    PhaseScheduling = "Scheduling"
    PhaseMigrating  = "Migrating"
    PhaseDestroying = "Destroying"
    PhaseRunning    = "Running"
    PhaseSucceeded  = "Succeeded"
    PhaseFailed     = "Failed"
)
const (
    GPUNodeOSLinux   = "linux"
    GPUNodeOSWindows = "windows"
    GPUNodeOSMacOS   = "macos"
)
const (
    ProvisionerLabelKey        = Domain + "/node-provisioner"
    ProvisionerMissingLabel    = Domain + "/orphan"
    ProvisionerNamePlaceholder = "__GPU_NODE_RESOURCE_NAME__"
)
To match a GPUNode with a Kubernetes node when creating nodes from a cloud vendor, this label must be set via cloud-init user data. A sketch of the implied substitution step follows (the function name is hypothetical; how the provisioner actually renders user data is an assumption).
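    package example

    import "strings"

    // renderUserData replaces the placeholder in a cloud-init user data
    // template with the GPUNode resource name, so the booted node carries
    // the ProvisionerLabelKey value needed to match it back to its GPUNode.
    func renderUserData(userDataTemplate, gpuNodeName string) string {
        return strings.ReplaceAll(userDataTemplate, ProvisionerNamePlaceholder, gpuNodeName)
    }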
const (
    LeaderInfoConfigMapName        = "tensor-fusion-operator-leader-info"
    LeaderInfoConfigMapLeaderIPKey = "leader-ip"
)
const (
    LowFrequencyObjFailureInitialDelay        = 300 * time.Millisecond
    LowFrequencyObjFailureMaxDelay            = 1000 * time.Second
    LowFrequencyObjFailureMaxRPS              = 1
    LowFrequencyObjFailureMaxBurst            = 1
    LowFrequencyObjFailureConcurrentReconcile = 5
)
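These values follow the shape of a client-go workqueue rate limiter: per-item exponential backoff from 300 ms up to 1000 s, capped by an overall 1 request/s, burst-1 token bucket. A sketch of how they could be wired together (whether the controllers use exactly this composition is an assumption, and the exact types vary across controller-runtime versions); LowFrequencyObjFailureConcurrentReconcile would then feed MaxConcurrentReconciles in the controller options.

    package example

    import (
        "golang.org/x/time/rate"
        "k8s.io/client-go/util/workqueue"
    )

    // lowFrequencyRateLimiter combines per-item exponential backoff with an
    // overall token bucket, the common pattern for low-frequency objects.
    func lowFrequencyRateLimiter() workqueue.RateLimiter {
        return workqueue.NewMaxOfRateLimiter(
            workqueue.NewItemExponentialFailureRateLimiter(
                LowFrequencyObjFailureInitialDelay,
                LowFrequencyObjFailureMaxDelay,
            ),
            &workqueue.BucketRateLimiter{
                Limiter: rate.NewLimiter(
                    rate.Limit(LowFrequencyObjFailureMaxRPS),
                    LowFrequencyObjFailureMaxBurst,
                ),
            },
        )
    }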
const (
    EnableWebhookEnv                  = "ENABLE_WEBHOOKS"
    EnableSchedulerEnv                = "ENABLE_SCHEDULER"
    EnableCustomResourceControllerEnv = "ENABLE_CR_CONTROLLER"
    // The TensorFusion ControllerManager's HTTP endpoint verifies the Pod JWT signature.
    // If this env var is set, verification is disabled; it is enabled by default
    // and should not be set to true in production environments.
    DisableConnectionAuthEnv              = "DISABLE_CONNECTION_AUTH"
    NvidiaOperatorProgressiveMigrationEnv = "NVIDIA_OPERATOR_PROGRESSIVE_MIGRATION"
    RunHypervisorUtilGPUAllocatable       = "RUN_HYPERVISOR_UTIL_GPU_ALLOCATABLE"
    UsingCommercialComponentEnv           = "COMMERCIAL_PLAN"
)
System feature toggles
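Toggles of this kind are read straight from the process environment. For example, per the comment on DisableConnectionAuthEnv above, connection auth is on by default and the variable switches it off; a minimal sketch (the exact comparison the operator performs is an assumption):

    package example

    import "os"

    // connectionAuthDisabled reports whether Pod JWT verification is turned
    // off. Verification is enabled by default; setting
    // DISABLE_CONNECTION_AUTH=true disables it and should not be done in
    // production.
    func connectionAuthDisabled() bool {
        return os.Getenv(DisableConnectionAuthEnv) == TrueStringValue
    }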
const (
    NvidiaVisibleAllDeviceEnv   = "NVIDIA_VISIBLE_DEVICES"
    NvidiaVisibleAllDeviceValue = "all"

    TensorFusionGPUInfoConfigName       = "tensor-fusion-sys-public-gpu-info"
    TensorFusionGPUInfoConfigVolumeName = "gpu-info"
    TensorFusionGPUInfoConfigMountPath  = "/etc/tensor-fusion/gpu-info.yaml"
    TensorFusionGPUInfoConfigSubPath    = "gpu-info.yaml"
    TensorFusionGPUInfoEnvVar           = "TENSOR_FUSION_GPU_INFO_PATH"

    KubeletDevicePluginVolumeName = "device-plugin"
    KubeletDevicePluginPath       = "/var/lib/kubelet/device-plugins"
    KubeletPodResourcesVolumeName = "pod-resources"
    KubeletPodResourcesPath       = "/var/lib/kubelet/pod-resources"

    TensorFusionVectorConfigName       = "tensor-fusion-sys-vector-config"
    TensorFusionVectorConfigVolumeName = "vector-config"
    TensorFusionVectorConfigMountPath  = "/etc/vector/vector.yaml"
    TensorFusionVectorConfigSubPath    = "vector-hypervisor.yaml"

    LogsVolumeName           = "logs"
    KubernetesLogsVolumeName = "kubernetes-logs"
    KubernetesLogsPath       = "/var/log/pods"
    TensorFusionLogPath      = "/logs"

    DefaultHttpBindIP = "0.0.0.0"
)
General envs used when composing component manifests
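As one concrete case, the GPU info ConfigMap constants line up as a volume plus a single-file mount. A sketch of that wiring, assuming manifests are assembled with the corev1 API (how the operator actually composes them is an assumption):

    package example

    import corev1 "k8s.io/api/core/v1"

    // gpuInfoVolumeAndMount projects the public GPU info ConfigMap into a
    // container as a single file at /etc/tensor-fusion/gpu-info.yaml.
    func gpuInfoVolumeAndMount() (corev1.Volume, corev1.VolumeMount) {
        vol := corev1.Volume{
            Name: TensorFusionGPUInfoConfigVolumeName,
            VolumeSource: corev1.VolumeSource{
                ConfigMap: &corev1.ConfigMapVolumeSource{
                    LocalObjectReference: corev1.LocalObjectReference{
                        Name: TensorFusionGPUInfoConfigName,
                    },
                },
            },
        }
        mount := corev1.VolumeMount{
            Name:      TensorFusionGPUInfoConfigVolumeName,
            MountPath: TensorFusionGPUInfoConfigMountPath,
            SubPath:   TensorFusionGPUInfoConfigSubPath,
        }
        return vol, mount
    }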
const (
    TFContainerNameClient        = "inject-lib"
    TFContainerNameWorker        = "tensorfusion-worker"
    TFContainerNameHypervisor    = "tensorfusion-hypervisor"
    TFContainerNameNodeDiscovery = "tensorfusion-node-discovery"
    TFContainerVector            = "vector"
)
const (
    GetConnectionURLEnv    = "TENSOR_FUSION_OPERATOR_GET_CONNECTION_URL"
    ConnectionInfoEnv      = "TENSOR_FUSION_OPERATOR_CONNECTION_INFO"
    ConnectionNameEnv      = "TENSOR_FUSION_CONNECTION_NAME"
    ConnectionNamespaceEnv = "TENSOR_FUSION_CONNECTION_NAMESPACE"

    RealNvmlLibPathEnv   = "TF_NVML_LIB_PATH"
    RealCUDALibPathEnv   = "TF_CUDA_LIB_PATH"
    RealNvmlLibPathValue = "/lib/x86_64-linux-gnu/libnvidia-ml.so.1"
    RealCUDALibPathValue = "/lib/x86_64-linux-gnu/libcuda.so"

    PrependPathEnv    = "TF_PREPEND_PATH"
    PrependLibPathEnv = "TF_LD_LIBRARY_PATH"
    RunInsideGPUEnv   = "RUN_INSIDE_GPU_NODE"

    LdPreloadFileName     = "ld.so.preload"
    LdPreloadFile         = "/etc/ld.so.preload"
    LdLibraryPathFileName = "zz_tensor-fusion.conf"
    LdLibraryPathFile     = "/etc/ld.so.conf.d/zz_tensor-fusion.conf"

    TFLibsVolumeName      = "tf-libs"
    TFConfVolumeName      = "tf-conf-lib-paths"
    TFLibsVolumeMountPath = "/tensor-fusion"
    TFConfVolumeMountPath = "/tensor-fusion-conf"

    TFConnectionNamePrefix   = "-tf-vgpu-"
    TFConnectionNameNoPrefix = "tf-vgpu-"

    HostIPFieldRef       = "status.hostIP"
    NodeNameFieldRef     = "spec.nodeName"
    ResourceNameFieldRef = "metadata.name"
    NamespaceFieldRef    = "metadata.namespace"
)
TensorFusion client-related envs
const (
    HypervisorIPEnv   = "HYPERVISOR_IP"
    HypervisorPortEnv = "HYPERVISOR_PORT"
    PodNamespaceEnv   = "POD_NAMESPACE"
    ContainerNameEnv  = "CONTAINER_NAME"

    EnableWorkerLogEnv   = "TF_ENABLE_LOG"
    EnableWorkerLogValue = "1"

    // the path of the nGPU lib for the limiter to load
    NGPUPathEnv   = "TENSOR_FUSION_NGPU_PATH"
    NGPUPathValue = TFLibsVolumeMountPath + "/libcuda.so"

    LdPreloadEnv     = "LD_PRELOAD"
    LdPreloadLimiter = "/home/app/libcuda_limiter.so"

    // disable the GPU limiter, for emergency use
    DisableGpuLimiterEnv = "DISABLE_GPU_LIMITER"
    // directly forward CUDA calls to the GPU driver in nGPU mode, for emergency use
    DisableCudaOptimizationEnv = "TF_ENABLE_DISPATCH_FORWARD"
    // disable the VRAM manager, for emergency use
    DisableVRAMManagerEnv      = "TF_DISABLE_MEMORY_MANAGER"
    DisableWorkerFeatureEnvVal = "1"

    // hard limiter mode (not open sourced) in percent; only takes effect on the worker container yet
    HardSMLimiterEnv = "TF_CUDA_SM_PERCENT_LIMIT"
    // hard limiter (not open sourced) in megabytes; only takes effect on the worker container and when
    // the open source vgpu.rs gpu-limiter is disabled. In this mode, the memory request cannot autoscale dynamically
    HardMemLimiterEnv = "TF_CUDA_MEMORY_LIMIT"

    TensorFusionRemoteWorkerPortNumber = 8000
    TensorFusionRemoteWorkerPortName   = "remote-vgpu"
)
TensorFusion worker-related envs
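A sketch of the limiter-related environment a worker container might receive; which variables the operator actually injects, and when, is an assumption:

    package example

    import corev1 "k8s.io/api/core/v1"

    // workerLimiterEnv preloads the GPU limiter and points it at the nGPU
    // lib; worker logging is switched on via TF_ENABLE_LOG=1.
    func workerLimiterEnv() []corev1.EnvVar {
        return []corev1.EnvVar{
            {Name: LdPreloadEnv, Value: LdPreloadLimiter},
            {Name: NGPUPathEnv, Value: NGPUPathValue},
            {Name: EnableWorkerLogEnv, Value: EnableWorkerLogValue},
        }
    }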
const (
    HypervisorPoolNameEnv           = "TENSOR_FUSION_POOL_NAME"
    PodNameEnv                      = "POD_NAME"
    VectorPodNodeNameEnv            = "NODE_NAME"
    HypervisorGPUNodeNameEnv        = "GPU_NODE_NAME"
    HypervisorSchedulingConfigEnv   = "TF_HYPERVISOR_SCHEDULING_CONFIG"
    HypervisorListenAddrEnv         = "API_LISTEN_ADDR"
    HypervisorMetricsFormatEnv      = "TF_HYPERVISOR_METRICS_FORMAT"
    HypervisorMetricsExtraLabelsEnv = "TF_HYPERVISOR_METRICS_EXTRA_LABELS"
    HypervisorDetectUsedGPUEnv      = "DETECT_IN_USED_GPUS"

    // Add the ptrace capability to the hypervisor container, to trace all host PIDs using the GPU
    SystemPtraceCapability = "SYS_PTRACE"

    HypervisorDefaultPortNumber int32  = 8000
    HypervisorPortName          string = "http"

    // For security enhancement, there are 2 types of endpoints to protect:
    // 1. the client calls the operator's /connection API to obtain the TensorFusion worker's URL
    // 2. the worker calls the hypervisor API to obtain the current worker's GPU quota info
    // If this env var is set on the operator and hypervisor, the JWT signature is verified for each call.
    // Not implemented yet: `iss` is public in EKS and most K8S distributions,
    // but k3s and some other distributions may not support it; need to find a way to get the SA token JWT public key
    HypervisorVerifyServiceAccountEnabledEnvVar   = "SA_TOKEN_VERIFY_ENABLED"
    HypervisorVerifyServiceAccountPublicKeyEnvVar = "SA_TOKEN_VERIFY_PUBLIC_KEY"
)
TensorFusion hypervisor-related envs
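The SYS_PTRACE capability mentioned above maps to a standard SecurityContext entry; a minimal sketch:

    package example

    import corev1 "k8s.io/api/core/v1"

    // hypervisorSecurityContext grants SYS_PTRACE so the hypervisor can
    // trace host PIDs that are using the GPU.
    func hypervisorSecurityContext() *corev1.SecurityContext {
        return &corev1.SecurityContext{
            Capabilities: &corev1.Capabilities{
                Add: []corev1.Capability{SystemPtraceCapability},
            },
        }
    }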
const (
    NodeDiscoveryReportGPUNodeEnvName = "NODE_DISCOVERY_REPORT_GPU_NODE"
    NodeDiscoveryHostNameEnv          = "HOSTNAME"
)
Node discovery-related envs
const (
    KubeApiVersionMajorEnv = "KUBE_API_VERSION_MAJOR"
    KubeApiVersionMinorEnv = "KUBE_API_VERSION_MINOR"
)
const (
    // GPGPU vendors - Global
    AcceleratorVendorNvidia = "NVIDIA"
    AcceleratorVendorAMD    = "AMD"
    AcceleratorVendorIntel  = "Intel"

    // DSA vendors - Global
    AcceleratorVendorQualcomm  = "Qualcomm"
    AcceleratorVendorAWSNeuron = "AWS-Neuron"
    AcceleratorVendorGoogleTPU = "Google-TPU"
    AcceleratorVendorCerebras  = "Cerebras"

    // GPGPU vendors - CN
    AcceleratorVendorHygon        = "Hygon-DCU"
    AcceleratorVendorMetaX        = "Meta-X"
    AcceleratorVendorMThreads     = "MThreads"
    AcceleratorVendorBiren        = "BirenGPU"
    AcceleratorVendorAlibabaTHead = "THead-PPU"

    // DSA vendors - CN
    AcceleratorVendorHuaweiAscendNPU = "Ascend-NPU"
    AcceleratorVendorCambricon       = "Cambricon-MLU"
    AcceleratorVendorEnflame         = "Enflame-XPU"
    AcceleratorVendorKunlunX         = "KunlunXin-XPU"

    AcceleratorVendorUnknown = "Unknown"
)
const AuthorizationHeader = "Authorization"
const DataVolumeName = "tf-data"
const DefaultEvictionProtectionPriceRatio = 1.2
const ExtraVerificationInfoPodIDKey = "authentication.kubernetes.io/pod-uid"
const GiBToBytes = 1024 * 1024 * 1024
const KarpenterNodeClaimKind = "NodeClaim"
const KarpenterNodePoolKind = "NodePool"
const MobileGpuClockSpeedMultiplier = 0.75
const NamespaceEnv = "OPERATOR_NAMESPACE"
Envs for the controller itself
const NodeCriticalPriorityClassName = "system-node-critical"
const (
    NvidiaGPUKey = "nvidia.com/gpu"
)
const SchedulerSimulationKey = "simulate-schedule"
const (
    // No-disrupt label, similar to Karpenter's; prevents a TFConnection/Worker/GPUNode
    // from being moved to another node or the node from being destroyed.
    // Refer: https://karpenter.sh/docs/concepts/disruption/
    SchedulingDoNotDisruptLabel = Domain + "/do-not-disrupt"
)
const ShortUUIDAlphabet = "123456789abcdefghijkmnopqrstuvwxy"
const SpotInstanceAssumedDiscountRatio = 0.3
const TFDataPath = "/run/tensor-fusion"
const TFDataPathWorkerExpr = "shm/$(POD_NAMESPACE)/$(POD_NAME)"
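TFDataPathWorkerExpr relies on kubelet subPathExpr expansion: $(POD_NAMESPACE) and $(POD_NAME) are substituted from env vars defined on the same container via the downward API. A sketch of the pairing, reusing DataVolumeName and TFDataPath from this package (actual manifest assembly is an assumption):

    package example

    import corev1 "k8s.io/api/core/v1"

    // workerDataMount mounts tf-data at /run/tensor-fusion with a per-Pod
    // subpath; the $(...) variables in SubPathExpr are expanded by the
    // kubelet from the container's own environment.
    func workerDataMount() ([]corev1.EnvVar, corev1.VolumeMount) {
        envs := []corev1.EnvVar{
            {Name: PodNamespaceEnv, ValueFrom: &corev1.EnvVarSource{
                FieldRef: &corev1.ObjectFieldSelector{FieldPath: NamespaceFieldRef},
            }},
            {Name: PodNameEnv, ValueFrom: &corev1.EnvVarSource{
                FieldRef: &corev1.ObjectFieldSelector{FieldPath: ResourceNameFieldRef},
            }},
        }
        mount := corev1.VolumeMount{
            Name:        DataVolumeName,
            MountPath:   TFDataPath,
            SubPathExpr: TFDataPathWorkerExpr,
        }
        return envs, mount
    }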
const TensorFusionPoolManualCompaction = Domain + "/manual-compaction"
const TensorFusionSystemName = "tensor-fusion"
const TransportShmPath = "/dev/shm"
const TransportShmVolumeName = "tf-transport-shm"
Variables
var (
    PendingRequeueDuration     = time.Second * 3
    StatusCheckInterval        = time.Second * 6
    GracefulPeriodSeconds      = ptr.To(int64(5))
    UnschedQueueBufferDuration = 10 * time.Second
)
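These durations are the usual knobs of a reconcile loop; for instance, a pending resource would be re-checked after PendingRequeueDuration. A minimal sketch:

    package example

    import ctrl "sigs.k8s.io/controller-runtime"

    // requeuePending asks controller-runtime to reconcile again in 3
    // seconds, the typical use of PendingRequeueDuration.
    func requeuePending() (ctrl.Result, error) {
        return ctrl.Result{RequeueAfter: PendingRequeueDuration}, nil
    }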
var L1VirtualizationSupportedVendors = []map[string]bool{
    {
        AcceleratorVendorNvidia:          false,
        AcceleratorVendorHuaweiAscendNPU: false,
        AcceleratorVendorAMD:             false,
        AcceleratorVendorHygon:           false,
        AcceleratorVendorMetaX:           false,
        AcceleratorVendorMThreads:        false,
    },
}
L1 virtualization means simple device partitioning, such as dynamic MIG/VirtualizationTemplates. It cannot auto-scale dynamically and offers only a limited partition count with coarse-grained isolation.
var L2VirtualizationSupportedVendors = []map[string]bool{
    {
        AcceleratorVendorNvidia:          true,
        AcceleratorVendorAMD:             false,
        AcceleratorVendorHuaweiAscendNPU: false,
        AcceleratorVendorHygon:           false,
        AcceleratorVendorMetaX:           false,
    },
}
L2 virtualization means dynamic user-space soft isolation, with the best performance and scalability. It can auto-scale dynamically with accurate resource isolation, but cannot be used in untrusted environments.
var L3VirtualizationSupportedVendors = []map[string]bool{
    {
        AcceleratorVendorNvidia:          true,
        AcceleratorVendorAMD:             false,
        AcceleratorVendorHygon:           false,
        AcceleratorVendorMetaX:           false,
        AcceleratorVendorHuaweiAscendNPU: false,
    },
}
L3 virtualization means full-featured soft- and hard-isolated virtualization, including API remoting. It supports live migration and can be used in both VM and container environments for untrusted workloads.
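Given the declared shape (a slice holding a single vendor-to-supported map), a capability check reduces to a map lookup; a sketch (the helper name is hypothetical):

    package example

    // supportsL2Virtualization reports whether a vendor appears as supported
    // in any of the L2 capability maps; missing vendors default to false.
    func supportsL2Virtualization(vendor string) bool {
        for _, vendors := range L2VirtualizationSupportedVendors {
            if vendors[vendor] {
                return true
            }
        }
        return false
    }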
Functions
This section is empty.
Types
This section is empty.