Documentation
Index

Constants
const (
    // Domain is the domain prefix used for all tensor-fusion.ai related annotations and finalizers
    Domain = "tensor-fusion.ai"

    // Finalizer constants
    FinalizerSuffix = "finalizer"
    Finalizer       = Domain + "/" + FinalizerSuffix

    SchedulerName = "tensor-fusion-scheduler"

    LabelKeyOwner           = Domain + "/managed-by"
    LabelKeyClusterOwner    = Domain + "/cluster"
    LabelKeyNodeClass       = Domain + "/node-class"
    LabelKeyPodTemplateHash = Domain + "/pod-template-hash"
    LabelNodeSelectorHash   = Domain + "/node-selector-hash"
    LabelComponent          = Domain + "/component"
    // used by TF connection, for matching the related connections when the worker Pod state changes
    LabelWorkerName = Domain + "/worker-name"

    ComponentClient        = "client"
    ComponentWorker        = "worker"
    ComponentHypervisor    = "hypervisor"
    ComponentNodeDiscovery = "node-discovery"
    ComponentOperator      = "operator"

    GPUNodePoolIdentifierLabelPrefix = Domain + "/pool-"
    GPUNodePoolIdentifierLabelFormat = Domain + "/pool-%s"
    NodeDeletionMark                 = Domain + "/should-delete"

    TensorFusionEnabledLabelKey = Domain + "/enabled"
    InitialGPUNodeSelector      = "nvidia.com/gpu.present=true"

    LastSyncTimeAnnotationKey = Domain + "/last-sync"
    WorkloadKey               = Domain + "/workload"
    GpuPoolKey                = Domain + "/gpupool"

    // Annotation key constants
    GpuCountAnnotation = Domain + "/gpu-count"
    // Specify which GPU index to use when multiple GPUs are available; comma-separated list of GPU indices
    GpuIndicesAnnotation = Domain + "/gpu-indices"
    // Specify which GPU/NPU/XPU vendor to use; defaults to any vendor in the cluster
    GpuVendorAnnotation     = Domain + "/vendor"
    TFLOPSRequestAnnotation = Domain + "/tflops-request"
    VRAMRequestAnnotation   = Domain + "/vram-request"
    TFLOPSLimitAnnotation   = Domain + "/tflops-limit"
    VRAMLimitAnnotation     = Domain + "/vram-limit"
    // StreamMultiProcessor/AICore percentage, an alternative to TFLOPS request and limit; NOT the recommended way.
    // NOTE: using percent makes namespace-level quota checks impossible and bypasses all quota checks;
    // thus, percent should only be used when tenant quota is not needed and the cluster has only one type of GPU
    ComputeRequestAnnotation = Domain + "/compute-percent-request"
    ComputeLimitAnnotation   = Domain + "/compute-percent-limit"

    WorkloadProfileAnnotation = Domain + "/workload-profile"
    InjectContainerAnnotation = Domain + "/inject-container"
    IsLocalGPUAnnotation      = Domain + "/is-local-gpu"
    QoSLevelAnnotation        = Domain + "/qos"
    EmbeddedWorkerAnnotation  = Domain + "/embedded-worker"
    DedicatedWorkerAnnotation = Domain + "/dedicated-worker"
    SidecarWorkerAnnotation   = Domain + "/sidecar-worker"
    // How to isolate computing resources; defaults to `soft` mode, could be `shared` or `hard`
    ComputingIsolationModeAnnotation = Domain + "/compute-isolation"
    // GPUModelAnnotation specifies the required GPU model (e.g., "A100", "H100")
    GPUModelAnnotation = Domain + "/gpu-model"
    // The GPU ID list is assigned by the scheduler and should not be specified by the user
    GPUDeviceIDsAnnotation            = Domain + "/gpu-ids"
    DedicatedGPUAnnotation            = Domain + "/dedicated-gpu"
    SetPendingOwnedWorkloadAnnotation = Domain + "/pending-owned-workload"
    PricingAnnotation                 = Domain + "/hourly-pricing"
    // In remote vGPU mode, the selected workload is set by the user with the /workload annotation or generated by the system
    SelectedWorkloadAnnotation = Domain + "/selected-workload"
    // An additional worker pod template is set by the user with the /worker-pod-template annotation
    WorkerPodTemplateAnnotation = Domain + "/worker-pod-template"

    WorkloadModeAnnotation = Domain + "/workload-mode"
    WorkloadModeDynamic    = "dynamic"
    WorkloadModeFixed      = "fixed"

    // Pros: simple and stable, no performance overhead, maximizes GPU utilization when well-scheduled
    // Cons: cannot auto-scale or differentiate QoS levels; the TFLOPS limit does not take effect, which may cause resource contention
    ComputingIsolationModeShared = "shared"
    // default isolation mode; uses a Proportional-Integral-Derivative controller to isolate computing resources and assign time slices
    // Pros: can set QoS levels for different workloads; the TFLOPS limit is relatively accurate
    // Cons: ~1% performance overhead when burst credits are consumed
    ComputingIsolationModeSoft = "soft"
    // uses SM partitioning to isolate computing resources; each Pod gets dedicated SMs, depending on GPU driver support
    // Pros: better performance isolation, no performance overhead
    // Cons: cannot auto-scale dynamically, the percentage may not reach 1%/1 TFLOPS accuracy, coupled with the GPU vendor's SM partitioning implementation
    // NOTE: this can only be used in Remote or Local+SidecarWorker mode, not supported in LocalGPU mode (because there is no TensorFusion worker)
    ComputingIsolationModeHard = "hard"

    // Annotation for the kill switch: disable features
    // ['gpu-opt', 'mem-manager', 'gpu-limiter']
    DisableFeaturesAnnotation = Domain + "/disable-features"
    BuiltInFeaturesGpuOpt     = "gpu-opt"
    BuiltInFeaturesGpuLimiter = "gpu-limiter"
    BuiltInFeaturesMemManager = "mem-manager"
    // For debug purposes only in remote vGPU mode: disable starting the worker, so it can be started manually with an ad-hoc command inside the Pod
    BuiltInFeatureStartWorker = "start-worker"

    GenHostPortLabel        = Domain + "/host-port"
    GenHostPortLabelValue   = "auto"
    GenHostPortNameLabel    = Domain + "/port-name"
    GenPortNumberAnnotation = Domain + "/port-number"

    AutoScaleResourcesAnnotation      = Domain + "/auto-resources"
    AutoScaleReplicasAnnotation       = Domain + "/auto-replicas"
    AutoScaleTargetResourceAnnotation = Domain + "/auto-scale-target-resource"

    GpuReleasedAnnotation = Domain + "/gpu-released"

    TensorFusionPodCounterKeyAnnotation = Domain + "/pod-counter-key"
    TensorFusionPodCountAnnotation      = Domain + "/tf-pod-count"
    TensorFusionWorkerSuffix            = "-tf"

    // For grey release
    TensorFusionEnabledReplicasAnnotation = Domain + "/enabled-replicas"
    TensorFusionDefaultPoolKeyAnnotation  = Domain + "/is-default-pool"

    // Eviction protection annotation for controlling pod eviction timing
    EvictionProtectionAnnotation = Domain + "/eviction-protection"

    NamespaceDefaultVal = "tensor-fusion-sys"

    KubernetesHostNameLabel = "kubernetes.io/hostname"

    KarpenterExpansionLabel = Domain + "/expansion-source"

    HypervisorServiceAccountName = "tensor-fusion-hypervisor-sa"

    TSDBVersionConfigMap = "tensor-fusion-tsdb-version"

    QoSLevelLow      = "low"
    QoSLevelMedium   = "medium"
    QoSLevelHigh     = "high"
    QoSLevelCritical = "critical"
)
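For orientation, here is a minimal sketch of how a client Pod might opt in to TensorFusion using these label and annotation keys. The constants are assumed to be in scope (imported from this package), and the concrete values ("1", "10", "4Gi", ...) are illustrative placeholders, not recommended settings.

    package example

    import (
        corev1 "k8s.io/api/core/v1"
        metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    )

    // buildAnnotatedPod is illustrative only: it shows where the label and
    // annotation keys attach on a Pod, not what values to use.
    func buildAnnotatedPod() *corev1.Pod {
        return &corev1.Pod{
            ObjectMeta: metav1.ObjectMeta{
                Name:      "demo",
                Namespace: "default",
                Labels: map[string]string{
                    TensorFusionEnabledLabelKey: "true", // opt in to TensorFusion
                },
                Annotations: map[string]string{
                    GpuCountAnnotation:      "1",   // number of GPUs to allocate
                    TFLOPSRequestAnnotation: "10",  // compute request
                    TFLOPSLimitAnnotation:   "20",  // compute limit
                    VRAMRequestAnnotation:   "4Gi", // VRAM request
                    VRAMLimitAnnotation:     "8Gi", // VRAM limit
                    QoSLevelAnnotation:      QoSLevelMedium,
                },
            },
        }
    }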
const (
    TrueStringValue  = "true"
    FalseStringValue = "false"
)
Constants for "true"/"false" string values, to avoid golang lint issues
const (
    ConditionStatusTypeReady                  = "Ready"
    ConditionStatusTypeGPUScheduled           = "GPUScheduled"
    ConditionStatusTypeConnectionReady        = "ConnectionReady"
    ConditionStatusTypeNodeProvisioned        = "NodeProvisioned"
    ConditionStatusTypePoolReady              = "PoolReady"
    ConditionStatusTypeGPUPool                = "GPUPoolReady"
    ConditionStatusTypeTimeSeriesDatabase     = "TimeSeriesDatabaseReady"
    ConditionStatusTypeCloudVendorConnection  = "CloudVendorConnectionReady"
    ConditionStatusTypeRecommendationProvided = "RecommendationProvided"
)
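These condition types plug into the standard metav1.Condition machinery. A small sketch, assuming the resource status follows the usual []metav1.Condition convention (the Reason and Message strings here are hypothetical):

    package example

    import (
        "k8s.io/apimachinery/pkg/api/meta"
        metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    )

    // markReady upserts the Ready condition on a resource's status conditions.
    func markReady(conds *[]metav1.Condition, generation int64) {
        meta.SetStatusCondition(conds, metav1.Condition{
            Type:               ConditionStatusTypeReady,
            Status:             metav1.ConditionTrue,
            Reason:             "AllComponentsUp", // hypothetical reason
            Message:            "resource is ready",
            ObservedGeneration: generation,
        })
    }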
const (
    PhaseUnknown    = "Unknown"
    PhasePending    = "Pending"
    PhaseUpdating   = "Updating"
    PhaseScheduling = "Scheduling"
    PhaseMigrating  = "Migrating"
    PhaseDestroying = "Destroying"
    PhaseRunning    = "Running"
    PhaseSucceeded  = "Succeeded"
    PhaseFailed     = "Failed"
)
const (
    GPUNodeOSLinux   = "linux"
    GPUNodeOSWindows = "windows"
    GPUNodeOSMacOS   = "macos"
)
const (
    ProvisionerLabelKey        = Domain + "/node-provisioner"
    ProvisionerMissingLabel    = Domain + "/orphan"
    ProvisionerNamePlaceholder = "__GPU_NODE_RESOURCE_NAME__"
)
To match a GPUNode with a Kubernetes node when creating nodes from a cloud vendor, this label must be set via cloud-init user data. A sketch of the implied substitution step follows (the function name is hypothetical; how the provisioner actually renders user data is an assumption).
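    package example

    import "strings"

    // renderUserData replaces the placeholder in a cloud-init user data
    // template with the GPUNode resource name, so the booted node carries
    // the ProvisionerLabelKey value needed to match it back to its GPUNode.
    func renderUserData(userDataTemplate, gpuNodeName string) string {
        return strings.ReplaceAll(userDataTemplate, ProvisionerNamePlaceholder, gpuNodeName)
    }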
const (
    LeaderInfoConfigMapName        = "tensor-fusion-operator-leader-info"
    LeaderInfoConfigMapLeaderIPKey = "leader-ip"
)
const (
    LowFrequencyObjFailureInitialDelay        = 300 * time.Millisecond
    LowFrequencyObjFailureMaxDelay            = 1000 * time.Second
    LowFrequencyObjFailureMaxRPS              = 1
    LowFrequencyObjFailureMaxBurst            = 1
    LowFrequencyObjFailureConcurrentReconcile = 5
)
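These values follow the shape of a client-go workqueue rate limiter: per-item exponential backoff from 300 ms up to 1000 s, capped by an overall 1 request/s, burst-1 token bucket. A sketch of how they could be wired together (whether the controllers use exactly this composition is an assumption, and the exact types vary across controller-runtime versions); LowFrequencyObjFailureConcurrentReconcile would then feed MaxConcurrentReconciles in the controller options.

    package example

    import (
        "golang.org/x/time/rate"
        "k8s.io/client-go/util/workqueue"
    )

    // lowFrequencyRateLimiter combines per-item exponential backoff with an
    // overall token bucket, the common pattern for low-frequency objects.
    func lowFrequencyRateLimiter() workqueue.RateLimiter {
        return workqueue.NewMaxOfRateLimiter(
            workqueue.NewItemExponentialFailureRateLimiter(
                LowFrequencyObjFailureInitialDelay,
                LowFrequencyObjFailureMaxDelay,
            ),
            &workqueue.BucketRateLimiter{
                Limiter: rate.NewLimiter(
                    rate.Limit(LowFrequencyObjFailureMaxRPS),
                    LowFrequencyObjFailureMaxBurst,
                ),
            },
        )
    }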
const (
    EnableWebhookEnv                  = "ENABLE_WEBHOOKS"
    EnableSchedulerEnv                = "ENABLE_SCHEDULER"
    EnableCustomResourceControllerEnv = "ENABLE_CR_CONTROLLER"
    // The TensorFusion ControllerManager's HTTP endpoint verifies the Pod JWT signature.
    // If this env var is set, verification is disabled; it is enabled by default
    // and should not be set to true in production environments.
    DisableConnectionAuthEnv              = "DISABLE_CONNECTION_AUTH"
    NvidiaOperatorProgressiveMigrationEnv = "NVIDIA_OPERATOR_PROGRESSIVE_MIGRATION"
    RunHypervisorUtilGPUAllocatable       = "RUN_HYPERVISOR_UTIL_GPU_ALLOCATABLE"
    UsingCommercialComponentEnv           = "COMMERCIAL_PLAN"
)
System feature toggles
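Toggles of this kind are read straight from the process environment. For example, per the comment on DisableConnectionAuthEnv above, connection auth is on by default and the variable switches it off; a minimal sketch (the exact comparison the operator performs is an assumption):

    package example

    import "os"

    // connectionAuthDisabled reports whether Pod JWT verification is turned
    // off. Verification is enabled by default; setting
    // DISABLE_CONNECTION_AUTH=true disables it and should not be done in
    // production.
    func connectionAuthDisabled() bool {
        return os.Getenv(DisableConnectionAuthEnv) == TrueStringValue
    }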
const (
    NvidiaVisibleAllDeviceEnv   = "NVIDIA_VISIBLE_DEVICES"
    NvidiaVisibleAllDeviceValue = "all"

    TensorFusionGPUInfoConfigName       = "tensor-fusion-sys-public-gpu-info"
    TensorFusionGPUInfoConfigVolumeName = "gpu-info"
    TensorFusionGPUInfoConfigMountPath  = "/etc/tensor-fusion/gpu-info.yaml"
    TensorFusionGPUInfoConfigSubPath    = "gpu-info.yaml"
    TensorFusionGPUInfoEnvVar           = "TENSOR_FUSION_GPU_INFO_PATH"

    KubeletDevicePluginVolumeName = "device-plugin"
    KubeletDevicePluginPath       = "/var/lib/kubelet/device-plugins"
    KubeletPodResourcesVolumeName = "pod-resources"
    KubeletPodResourcesPath       = "/var/lib/kubelet/pod-resources"

    TensorFusionVectorConfigName       = "tensor-fusion-sys-vector-config"
    TensorFusionVectorConfigVolumeName = "vector-config"
    TensorFusionVectorConfigMountPath  = "/etc/vector/vector.yaml"
    TensorFusionVectorConfigSubPath    = "vector-hypervisor.yaml"

    LogsVolumeName           = "logs"
    KubernetesLogsVolumeName = "kubernetes-logs"
    KubernetesLogsPath       = "/var/log/pods"
    TensorFusionLogPath      = "/logs"

    DefaultHttpBindIP = "0.0.0.0"
)
General envs used when composing component manifests
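As one concrete case, the GPU info ConfigMap constants line up as a volume plus a single-file mount. A sketch of that wiring, assuming manifests are assembled with the corev1 API (how the operator actually composes them is an assumption):

    package example

    import corev1 "k8s.io/api/core/v1"

    // gpuInfoVolumeAndMount projects the public GPU info ConfigMap into a
    // container as a single file at /etc/tensor-fusion/gpu-info.yaml.
    func gpuInfoVolumeAndMount() (corev1.Volume, corev1.VolumeMount) {
        vol := corev1.Volume{
            Name: TensorFusionGPUInfoConfigVolumeName,
            VolumeSource: corev1.VolumeSource{
                ConfigMap: &corev1.ConfigMapVolumeSource{
                    LocalObjectReference: corev1.LocalObjectReference{
                        Name: TensorFusionGPUInfoConfigName,
                    },
                },
            },
        }
        mount := corev1.VolumeMount{
            Name:      TensorFusionGPUInfoConfigVolumeName,
            MountPath: TensorFusionGPUInfoConfigMountPath,
            SubPath:   TensorFusionGPUInfoConfigSubPath,
        }
        return vol, mount
    }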
const (
    TFContainerNameClient        = "inject-lib"
    TFContainerNameWorker        = "tensorfusion-worker"
    TFContainerNameHypervisor    = "tensorfusion-hypervisor"
    TFContainerNameNodeDiscovery = "tensorfusion-node-discovery"
    TFContainerVector            = "vector"
)
const (
    GetConnectionURLEnv    = "TENSOR_FUSION_OPERATOR_GET_CONNECTION_URL"
    ConnectionInfoEnv      = "TENSOR_FUSION_OPERATOR_CONNECTION_INFO"
    ConnectionNameEnv      = "TENSOR_FUSION_CONNECTION_NAME"
    ConnectionNamespaceEnv = "TENSOR_FUSION_CONNECTION_NAMESPACE"

    RealNvmlLibPathEnv   = "TF_NVML_LIB_PATH"
    RealCUDALibPathEnv   = "TF_CUDA_LIB_PATH"
    RealNvmlLibPathValue = "/lib/x86_64-linux-gnu/libnvidia-ml.so.1"
    RealCUDALibPathValue = "/lib/x86_64-linux-gnu/libcuda.so"

    PrependPathEnv    = "TF_PREPEND_PATH"
    PrependLibPathEnv = "TF_LD_LIBRARY_PATH"
    RunInsideGPUEnv   = "RUN_INSIDE_GPU_NODE"

    LdPreloadFileName     = "ld.so.preload"
    LdPreloadFile         = "/etc/ld.so.preload"
    LdLibraryPathFileName = "zz_tensor-fusion.conf"
    LdLibraryPathFile     = "/etc/ld.so.conf.d/zz_tensor-fusion.conf"

    TFLibsVolumeName      = "tf-libs"
    TFConfVolumeName      = "tf-conf-lib-paths"
    TFLibsVolumeMountPath = "/tensor-fusion"
    TFConfVolumeMountPath = "/tensor-fusion-conf"

    TFConnectionNamePrefix   = "-tf-vgpu-"
    TFConnectionNameNoPrefix = "tf-vgpu-"

    HostIPFieldRef       = "status.hostIP"
    NodeNameFieldRef     = "spec.nodeName"
    ResourceNameFieldRef = "metadata.name"
    NamespaceFieldRef    = "metadata.namespace"
)
TensorFusion client-related envs
const (
    HypervisorIPEnv   = "HYPERVISOR_IP"
    HypervisorPortEnv = "HYPERVISOR_PORT"
    PodNamespaceEnv   = "POD_NAMESPACE"
    ContainerNameEnv  = "CONTAINER_NAME"

    EnableWorkerLogEnv   = "TF_ENABLE_LOG"
    EnableWorkerLogValue = "1"

    // the path of the nGPU lib for the limiter to load
    NGPUPathEnv   = "TENSOR_FUSION_NGPU_PATH"
    NGPUPathValue = TFLibsVolumeMountPath + "/libcuda.so"

    LdPreloadEnv     = "LD_PRELOAD"
    LdPreloadLimiter = "/home/app/libcuda_limiter.so"

    // disable the GPU limiter, for emergency use
    DisableGpuLimiterEnv = "DISABLE_GPU_LIMITER"
    // directly forward CUDA calls to the GPU driver in nGPU mode, for emergency use
    DisableCudaOptimizationEnv = "TF_ENABLE_DISPATCH_FORWARD"
    // disable the VRAM manager, for emergency use
    DisableVRAMManagerEnv      = "TF_DISABLE_MEMORY_MANAGER"
    DisableWorkerFeatureEnvVal = "1"

    // hard limiter mode (not open sourced) in percent; only takes effect on the worker container yet
    HardSMLimiterEnv = "TF_CUDA_SM_PERCENT_LIMIT"
    // hard limiter (not open sourced) in megabytes; only takes effect on the worker container and when
    // the open source vgpu.rs gpu-limiter is disabled. In this mode, the memory request cannot autoscale dynamically
    HardMemLimiterEnv = "TF_CUDA_MEMORY_LIMIT"

    TensorFusionRemoteWorkerPortNumber = 8000
    TensorFusionRemoteWorkerPortName   = "remote-vgpu"
)
TensorFusion worker-related envs
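A sketch of the limiter-related environment a worker container might receive; which variables the operator actually injects, and when, is an assumption:

    package example

    import corev1 "k8s.io/api/core/v1"

    // workerLimiterEnv preloads the GPU limiter and points it at the nGPU
    // lib; worker logging is switched on via TF_ENABLE_LOG=1.
    func workerLimiterEnv() []corev1.EnvVar {
        return []corev1.EnvVar{
            {Name: LdPreloadEnv, Value: LdPreloadLimiter},
            {Name: NGPUPathEnv, Value: NGPUPathValue},
            {Name: EnableWorkerLogEnv, Value: EnableWorkerLogValue},
        }
    }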
const (
    HypervisorPoolNameEnv           = "TENSOR_FUSION_POOL_NAME"
    PodNameEnv                      = "POD_NAME"
    VectorPodNodeNameEnv            = "NODE_NAME"
    HypervisorGPUNodeNameEnv        = "GPU_NODE_NAME"
    HypervisorSchedulingConfigEnv   = "TF_HYPERVISOR_SCHEDULING_CONFIG"
    HypervisorListenAddrEnv         = "API_LISTEN_ADDR"
    HypervisorMetricsFormatEnv      = "TF_HYPERVISOR_METRICS_FORMAT"
    HypervisorMetricsExtraLabelsEnv = "TF_HYPERVISOR_METRICS_EXTRA_LABELS"
    HypervisorDetectUsedGPUEnv      = "DETECT_IN_USED_GPUS"

    // Add the ptrace capability to the hypervisor container, to trace all host PIDs using the GPU
    SystemPtraceCapability = "SYS_PTRACE"

    HypervisorDefaultPortNumber int32  = 8000
    HypervisorPortName          string = "http"

    // For security enhancement, there are 2 types of endpoints to protect:
    // 1. the client calls the operator's /connection API to obtain the TensorFusion worker's URL
    // 2. the worker calls the hypervisor API to obtain the current worker's GPU quota info
    // If this env var is set on the operator and hypervisor, the JWT signature is verified for each call.
    // Not implemented yet: `iss` is public in EKS and most K8S distributions,
    // but k3s and some other distributions may not support it; need to find a way to get the SA token JWT public key
    HypervisorVerifyServiceAccountEnabledEnvVar   = "SA_TOKEN_VERIFY_ENABLED"
    HypervisorVerifyServiceAccountPublicKeyEnvVar = "SA_TOKEN_VERIFY_PUBLIC_KEY"
)
TensorFusion hypervisor-related envs
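The SYS_PTRACE capability mentioned above maps to a standard SecurityContext entry; a minimal sketch:

    package example

    import corev1 "k8s.io/api/core/v1"

    // hypervisorSecurityContext grants SYS_PTRACE so the hypervisor can
    // trace host PIDs that are using the GPU.
    func hypervisorSecurityContext() *corev1.SecurityContext {
        return &corev1.SecurityContext{
            Capabilities: &corev1.Capabilities{
                Add: []corev1.Capability{SystemPtraceCapability},
            },
        }
    }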
const (
    NodeDiscoveryReportGPUNodeEnvName = "NODE_DISCOVERY_REPORT_GPU_NODE"
    NodeDiscoveryHostNameEnv          = "HOSTNAME"
)
Node discovery-related envs
const (
    KubeApiVersionMajorEnv = "KUBE_API_VERSION_MAJOR"
    KubeApiVersionMinorEnv = "KUBE_API_VERSION_MINOR"
)
const (
    // GPGPU vendors - Global
    AcceleratorVendorNvidia = "NVIDIA"
    AcceleratorVendorAMD    = "AMD"
    AcceleratorVendorIntel  = "Intel"

    // DSA vendors - Global
    AcceleratorVendorQualcomm  = "Qualcomm"
    AcceleratorVendorAWSNeuron = "AWS-Neuron"
    AcceleratorVendorGoogleTPU = "Google-TPU"
    AcceleratorVendorCerebras  = "Cerebras"

    // GPGPU vendors - CN
    AcceleratorVendorHygon        = "Hygon-DCU"
    AcceleratorVendorMetaX        = "Meta-X"
    AcceleratorVendorMThreads     = "MThreads"
    AcceleratorVendorBiren        = "BirenGPU"
    AcceleratorVendorAlibabaTHead = "THead-PPU"

    // DSA vendors - CN
    AcceleratorVendorHuaweiAscendNPU = "Ascend-NPU"
    AcceleratorVendorCambricon       = "Cambricon-MLU"
    AcceleratorVendorEnflame         = "Enflame-XPU"
    AcceleratorVendorKunlunX         = "KunlunXin-XPU"

    AcceleratorVendorUnknown = "Unknown"
)
const AuthorizationHeader = "Authorization"
const DataVolumeName = "tf-data"
const DefaultEvictionProtectionPriceRatio = 1.2
const ExtraVerificationInfoPodIDKey = "authentication.kubernetes.io/pod-uid"
const GiBToBytes = 1024 * 1024 * 1024
const KarpenterNodeClaimKind = "NodeClaim"
const KarpenterNodePoolKind = "NodePool"
const MobileGpuClockSpeedMultiplier = 0.75
const NamespaceEnv = "OPERATOR_NAMESPACE"
Envs for the controller itself
const NodeCriticalPriorityClassName = "system-node-critical"
const (
    NvidiaGPUKey = "nvidia.com/gpu"
)
const SchedulerSimulationKey = "simulate-schedule"
const (
    // No-disrupt label, similar to Karpenter's; prevents a TFConnection/Worker/GPUNode
    // from being moved to another node or the node from being destroyed.
    // Refer: https://karpenter.sh/docs/concepts/disruption/
    SchedulingDoNotDisruptLabel = Domain + "/do-not-disrupt"
)
const ShortUUIDAlphabet = "123456789abcdefghijkmnopqrstuvwxy"
const SpotInstanceAssumedDiscountRatio = 0.3
const TFDataPath = "/run/tensor-fusion"
const TFDataPathWorkerExpr = "shm/$(POD_NAMESPACE)/$(POD_NAME)"
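TFDataPathWorkerExpr relies on kubelet subPathExpr expansion: $(POD_NAMESPACE) and $(POD_NAME) are substituted from env vars defined on the same container via the downward API. A sketch of the pairing, reusing DataVolumeName and TFDataPath from this package (actual manifest assembly is an assumption):

    package example

    import corev1 "k8s.io/api/core/v1"

    // workerDataMount mounts tf-data at /run/tensor-fusion with a per-Pod
    // subpath; the $(...) variables in SubPathExpr are expanded by the
    // kubelet from the container's own environment.
    func workerDataMount() ([]corev1.EnvVar, corev1.VolumeMount) {
        envs := []corev1.EnvVar{
            {Name: PodNamespaceEnv, ValueFrom: &corev1.EnvVarSource{
                FieldRef: &corev1.ObjectFieldSelector{FieldPath: NamespaceFieldRef},
            }},
            {Name: PodNameEnv, ValueFrom: &corev1.EnvVarSource{
                FieldRef: &corev1.ObjectFieldSelector{FieldPath: ResourceNameFieldRef},
            }},
        }
        mount := corev1.VolumeMount{
            Name:        DataVolumeName,
            MountPath:   TFDataPath,
            SubPathExpr: TFDataPathWorkerExpr,
        }
        return envs, mount
    }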
const TensorFusionPoolManualCompaction = Domain + "/manual-compaction"
const TensorFusionSystemName = "tensor-fusion"
const TransportShmPath = "/dev/shm"
const TransportShmVolumeName = "tf-transport-shm"
Variables
var (
    PendingRequeueDuration     = time.Second * 3
    StatusCheckInterval        = time.Second * 6
    GracefulPeriodSeconds      = ptr.To(int64(5))
    UnschedQueueBufferDuration = 10 * time.Second
)
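These durations are the usual knobs of a reconcile loop; for instance, a pending resource would be re-checked after PendingRequeueDuration. A minimal sketch:

    package example

    import ctrl "sigs.k8s.io/controller-runtime"

    // requeuePending asks controller-runtime to reconcile again in 3
    // seconds, the typical use of PendingRequeueDuration.
    func requeuePending() (ctrl.Result, error) {
        return ctrl.Result{RequeueAfter: PendingRequeueDuration}, nil
    }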
var L1VirtualizationSupportedVendors = []map[string]bool{
    {
        AcceleratorVendorNvidia:          false,
        AcceleratorVendorHuaweiAscendNPU: false,
        AcceleratorVendorAMD:             false,
        AcceleratorVendorHygon:           false,
        AcceleratorVendorMetaX:           false,
        AcceleratorVendorMThreads:        false,
    },
}
L1 virtualization means simple device partitioning, such as dynamic MIG/VirtualizationTemplates. It cannot auto-scale dynamically and offers only a limited partition count with coarse-grained isolation.
var L2VirtualizationSupportedVendors = []map[string]bool{
    {
        AcceleratorVendorNvidia:          true,
        AcceleratorVendorAMD:             false,
        AcceleratorVendorHuaweiAscendNPU: false,
        AcceleratorVendorHygon:           false,
        AcceleratorVendorMetaX:           false,
    },
}
L2 virtualization means dynamic user-space soft isolation, with the best performance and scalability. It can auto-scale dynamically with accurate resource isolation, but cannot be used in untrusted environments.
var L3VirtualizationSupportedVendors = []map[string]bool{
    {
        AcceleratorVendorNvidia:          true,
        AcceleratorVendorAMD:             false,
        AcceleratorVendorHygon:           false,
        AcceleratorVendorMetaX:           false,
        AcceleratorVendorHuaweiAscendNPU: false,
    },
}
L3 virtualization means full-featured soft- and hard-isolated virtualization, including API remoting. It supports live migration and can be used in both VM and container environments for untrusted workloads.
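Given the declared shape (a slice holding a single vendor-to-supported map), a capability check reduces to a map lookup; a sketch (the helper name is hypothetical):

    package example

    // supportsL2Virtualization reports whether a vendor appears as supported
    // in any of the L2 capability maps; missing vendors default to false.
    func supportsL2Virtualization(vendor string) bool {
        for _, vendors := range L2VirtualizationSupportedVendors {
            if vendors[vendor] {
                return true
            }
        }
        return false
    }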
Functions
This section is empty.
Types
This section is empty.