Documentation
Index
Constants
const (
    // Domain is the domain prefix used for all tensor-fusion.ai related annotations and finalizers
    Domain = "tensor-fusion.ai"

    // Finalizer constants
    FinalizerSuffix = "finalizer"
    Finalizer = Domain + "/" + FinalizerSuffix

    SchedulerName = "tensor-fusion-scheduler"

    LabelKeyOwner = Domain + "/managed-by"
    LabelKeyClusterOwner = Domain + "/cluster"
    LabelKeyNodeClass = Domain + "/node-class"
    LabelKeyPodTemplateHash = Domain + "/pod-template-hash"
    LabelNodeSelectorHash = Domain + "/node-selector-hash"
    LabelComponent = Domain + "/component"
    // Used by TF connections to match the related connections when the worker Pod state changes
    LabelWorkerName = Domain + "/worker-name"

    ComponentClient = "client"
    ComponentWorker = "worker"
    ComponentHypervisor = "hypervisor"
    ComponentNodeDiscovery = "node-discovery"
    ComponentOperator = "operator"

    GPUNodePoolIdentifierLabelPrefix = Domain + "/pool-"
    GPUNodePoolIdentifierLabelFormat = Domain + "/pool-%s"
    NodeDeletionMark = Domain + "/should-delete"
    TensorFusionEnabledLabelKey = Domain + "/enabled"
    InitialGPUNodeSelector = "nvidia.com/gpu.present=true"
    LastSyncTimeAnnotationKey = Domain + "/last-sync"
    WorkloadKey = Domain + "/workload"
    GpuPoolKey = Domain + "/gpupool"

    // Annotation key constants
    GpuCountAnnotation = Domain + "/gpu-count"
    TFLOPSRequestAnnotation = Domain + "/tflops-request"
    VRAMRequestAnnotation = Domain + "/vram-request"
    TFLOPSLimitAnnotation = Domain + "/tflops-limit"
    VRAMLimitAnnotation = Domain + "/vram-limit"
    WorkloadProfileAnnotation = Domain + "/workload-profile"
    InjectContainerAnnotation = Domain + "/inject-container"
    IsLocalGPUAnnotation = Domain + "/is-local-gpu"
    QoSLevelAnnotation = Domain + "/qos"
    EmbeddedWorkerAnnotation = Domain + "/embedded-worker"
    DedicatedWorkerAnnotation = Domain + "/dedicated-worker"
    StandaloneWorkerModeAnnotation = Domain + "/no-standalone-worker-mode"
    // GPUModelAnnotation specifies the required GPU model (e.g., "A100", "H100")
    GPUModelAnnotation = Domain + "/gpu-model"
    // The GPU ID list is assigned by the scheduler and should not be specified by the user
    GPUDeviceIDsAnnotation = Domain + "/gpu-ids"
    DedicatedGPUAnnotation = Domain + "/dedicated-gpu"
    SetPendingOwnedWorkloadAnnotation = Domain + "/pending-owned-workload"
    PricingAnnotation = Domain + "/hourly-pricing"
    // In remote vGPU mode, the selected workload is set by the user with the /workload annotation or generated by the system
    SelectedWorkloadAnnotation = Domain + "/selected-workload"
    WorkloadModeAnnotation = Domain + "/workload-mode"
    WorkloadModeDynamic = "dynamic"
    WorkloadModeFixed = "fixed"

    // Annotations for the kill switch that disables built-in features:
    // ['gpu-opt', 'mem-manager', 'gpu-limiter']
    DisableFeaturesAnnotation = Domain + "/disable-features"
    BuiltInFeaturesGpuOpt = "gpu-opt"
    BuiltInFeaturesGpuLimiter = "gpu-limiter"
    BuiltInFeaturesMemManager = "mem-manager"
    // For remote vGPU debugging only: disable starting the worker so it can be started manually with an ad-hoc command inside the Pod
    BuiltInFeatureStartWorker = "start-worker"

    GenHostPortLabel = Domain + "/host-port"
    GenHostPortLabelValue = "auto"
    GenHostPortNameLabel = Domain + "/port-name"
    GenPortNumberAnnotation = Domain + "/port-number"

    AutoScaleResourcesAnnotation = Domain + "/auto-resources"
    AutoScaleReplicasAnnotation = Domain + "/auto-replicas"
    AutoScaleTargetResourceAnnotation = Domain + "/auto-scale-target-resource"

    GpuReleasedAnnotation = Domain + "/gpu-released"
    TensorFusionPodCounterKeyAnnotation = Domain + "/pod-counter-key"
    TensorFusionPodCountAnnotation = Domain + "/tf-pod-count"
    TensorFusionWorkerSuffix = "-tf"

    // For grey release
    TensorFusionEnabledReplicasAnnotation = Domain + "/enabled-replicas"
    TensorFusionDefaultPoolKeyAnnotation = Domain + "/is-default-pool"
    // Eviction protection annotation for controlling pod eviction timing
    EvictionProtectionAnnotation = Domain + "/eviction-protection"

    NamespaceDefaultVal = "tensor-fusion-sys"

    KubernetesHostNameLabel = "kubernetes.io/hostname"
    KarpenterExpansionLabel = Domain + "/expansion-source"

    HypervisorServiceAccountName = "tensor-fusion-hypervisor-sa"
    TSDBVersionConfigMap = "tensor-fusion-tsdb-version"

    QoSLevelLow = "low"
    QoSLevelMedium = "medium"
    QoSLevelHigh = "high"
    QoSLevelCritical = "critical"
)
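As a hedged illustration of how these keys are meant to be used (not code from the operator itself; the value formats such as "1", "100", and "16Gi" are assumptions), a Pod could be labeled and annotated like this:

import (
    corev1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// buildDemoPod sketches how the label and annotation keys above could be set
// on a Pod so the TensorFusion scheduler picks it up. Value formats are
// illustrative assumptions, not documented contracts.
func buildDemoPod() *corev1.Pod {
    return &corev1.Pod{
        ObjectMeta: metav1.ObjectMeta{
            Name:      "demo",
            Namespace: "default",
            Labels: map[string]string{
                TensorFusionEnabledLabelKey: TrueStringValue,
            },
            Annotations: map[string]string{
                GpuCountAnnotation:      "1",
                TFLOPSRequestAnnotation: "100",
                VRAMRequestAnnotation:   "16Gi",
                QoSLevelAnnotation:      QoSLevelMedium,
            },
        },
    }
}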
const (
    TrueStringValue = "true"
    FalseStringValue = "false"
)
To avoid Go lint issues
const (
    ConditionStatusTypeReady = "Ready"
    ConditionStatusTypeGPUScheduled = "GPUScheduled"
    ConditionStatusTypeConnectionReady = "ConnectionReady"
    ConditionStatusTypeNodeProvisioned = "NodeProvisioned"
    ConditionStatusTypePoolReady = "PoolReady"
    ConditionStatusTypeGPUPool = "GPUPoolReady"
    ConditionStatusTypeTimeSeriesDatabase = "TimeSeriesDatabaseReady"
    ConditionStatusTypeCloudVendorConnection = "CloudVendorConnectionReady"
    ConditionStatusTypeRecommendationProvided = "RecommendationProvided"
)
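Assuming the status conditions follow the standard metav1.Condition shape, a controller might record one of these types as sketched below; the Reason and Message strings are illustrative only:

import (
    "k8s.io/apimachinery/pkg/api/meta"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// setReadyCondition is a sketch of recording the Ready condition type on a
// resource's status; conditions is the object's .status.conditions slice.
func setReadyCondition(conditions *[]metav1.Condition, generation int64) {
    meta.SetStatusCondition(conditions, metav1.Condition{
        Type:               ConditionStatusTypeReady,
        Status:             metav1.ConditionTrue,
        ObservedGeneration: generation,
        Reason:             "Reconciled",           // illustrative reason
        Message:            "all components ready", // illustrative message
    })
}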
const (
    PhaseUnknown = "Unknown"
    PhasePending = "Pending"
    PhaseUpdating = "Updating"
    PhaseScheduling = "Scheduling"
    PhaseMigrating = "Migrating"
    PhaseDestroying = "Destroying"
    PhaseRunning = "Running"
    PhaseSucceeded = "Succeeded"
    PhaseFailed = "Failed"
)
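A minimal sketch of working with these phases; treating only Succeeded and Failed as terminal is an assumption made here for illustration, not something the package defines:

// isTerminalPhase reports whether a phase will no longer change.
// Counting only Succeeded and Failed as terminal is an assumption.
func isTerminalPhase(phase string) bool {
    return phase == PhaseSucceeded || phase == PhaseFailed
}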
const (
    GPUNodeOSLinux = "linux"
    GPUNodeOSWindows = "windows"
    GPUNodeOSMacOS = "macos"
)
const (
    ProvisionerLabelKey = Domain + "/node-provisioner"
    ProvisionerMissingLabel = Domain + "/orphan"
    ProvisionerNamePlaceholder = "__GPU_NODE_RESOURCE_NAME__"
)
To match a GPUNode with its Kubernetes node when the node is created from a cloud vendor, this label must be set via cloud-init userdata.
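For example, a reconciler could look up the provisioned Kubernetes node by that label. This is a sketch using the controller-runtime client; the assumption that the label value is the GPUNode resource name follows from ProvisionerNamePlaceholder but is not guaranteed here:

import (
    "context"

    corev1 "k8s.io/api/core/v1"
    "sigs.k8s.io/controller-runtime/pkg/client"
)

// findProvisionedNodes sketches listing K8s Nodes whose cloud-init userdata
// stamped them with the provisioner label for the given GPUNode resource name.
func findProvisionedNodes(ctx context.Context, c client.Client, gpuNodeName string) (*corev1.NodeList, error) {
    nodes := &corev1.NodeList{}
    err := c.List(ctx, nodes, client.MatchingLabels{
        ProvisionerLabelKey: gpuNodeName, // label value assumed to be the GPUNode name
    })
    return nodes, err
}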
const (
    LeaderInfoConfigMapName = "tensor-fusion-operator-leader-info"
    LeaderInfoConfigMapLeaderIPKey = "leader-ip"
)
const (
    LowFrequencyObjFailureInitialDelay = 300 * time.Millisecond
    LowFrequencyObjFailureMaxDelay = 1000 * time.Second
    LowFrequencyObjFailureMaxRPS = 1
    LowFrequencyObjFailureMaxBurst = 1
    LowFrequencyObjFailureConcurrentReconcile = 5
)
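These constants look like inputs for a controller work-queue rate limiter. A hedged sketch of one way to wire them together with client-go's workqueue helpers (the operator's actual wiring may differ):

import (
    "golang.org/x/time/rate"
    "k8s.io/client-go/util/workqueue"
)

// lowFrequencyRateLimiter sketches a rate limiter built from the constants
// above: exponential per-item backoff capped by an overall bucket limit.
func lowFrequencyRateLimiter() workqueue.RateLimiter {
    return workqueue.NewMaxOfRateLimiter(
        workqueue.NewItemExponentialFailureRateLimiter(
            LowFrequencyObjFailureInitialDelay,
            LowFrequencyObjFailureMaxDelay,
        ),
        &workqueue.BucketRateLimiter{
            Limiter: rate.NewLimiter(
                rate.Limit(LowFrequencyObjFailureMaxRPS),
                LowFrequencyObjFailureMaxBurst,
            ),
        },
    )
}

LowFrequencyObjFailureConcurrentReconcile would then presumably feed controller.Options.MaxConcurrentReconciles, though that too is an assumption.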
const (
    EnableWebhookEnv = "ENABLE_WEBHOOKS"
    EnableSchedulerEnv = "ENABLE_SCHEDULER"
    EnableCustomResourceControllerEnv = "ENABLE_CR_CONTROLLER"
    // The TensorFusion ControllerManager's HTTP endpoint verifies the Pod JWT signature.
    // If this env var is set, the verification is disabled; it is enabled by default.
    // Should not be set to true in a production environment.
    DisableConnectionAuthEnv = "DISABLE_CONNECTION_AUTH"
    NvidiaOperatorProgressiveMigrationEnv = "NVIDIA_OPERATOR_PROGRESSIVE_MIGRATION"
    RunHypervisorUtilGPUAllocatable = "RUN_HYPERVISOR_UTIL_GPU_ALLOCATABLE"
)
System feature toggles
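A minimal sketch of reading one of these toggles; treating any value other than "false" as enabled is an assumption here, not the operator's documented parsing rule:

import "os"

// webhooksEnabled sketches a feature-toggle check against the env var above.
func webhooksEnabled() bool {
    return os.Getenv(EnableWebhookEnv) != FalseStringValue
}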
const (
    NvidiaVisibleAllDeviceEnv = "NVIDIA_VISIBLE_DEVICES"
    NvidiaVisibleAllDeviceValue = "all"

    TensorFusionGPUInfoConfigName = "tensor-fusion-sys-public-gpu-info"
    TensorFusionGPUInfoConfigVolumeName = "gpu-info"
    TensorFusionGPUInfoConfigMountPath = "/etc/tensor-fusion/gpu-info.yaml"
    TensorFusionGPUInfoConfigSubPath = "gpu-info.yaml"
    TensorFusionGPUInfoEnvVar = "TENSOR_FUSION_GPU_INFO_PATH"

    KubeletDevicePluginVolumeName = "device-plugin"
    KubeletDevicePluginPath = "/var/lib/kubelet/device-plugins"
    KubeletPodResourcesVolumeName = "pod-resources"
    KubeletPodResourcesPath = "/var/lib/kubelet/pod-resources"

    TensorFusionVectorConfigName = "tensor-fusion-sys-vector-config"
    TensorFusionVectorConfigVolumeName = "vector-config"
    TensorFusionVectorConfigMountPath = "/etc/vector/vector.yaml"
    TensorFusionVectorConfigSubPath = "vector-hypervisor.yaml"

    LogsVolumeName = "logs"
    KubernetesLogsVolumeName = "kubernetes-logs"
    KubernetesLogsPath = "/var/log/pods"
    TensorFusionLogPath = "/logs"

    DefaultHttpBindIP = "0.0.0.0"
)
General environment variables, volume names, and paths used when composing component manifests
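As a hedged example of how these names fit together, the public GPU info ConfigMap could be mounted into a component Pod as follows; the surrounding container spec is assumed:

import corev1 "k8s.io/api/core/v1"

// gpuInfoVolumeAndMount sketches mounting the public GPU info ConfigMap at
// the path the components can read via TensorFusionGPUInfoEnvVar.
func gpuInfoVolumeAndMount() (corev1.Volume, corev1.VolumeMount) {
    vol := corev1.Volume{
        Name: TensorFusionGPUInfoConfigVolumeName,
        VolumeSource: corev1.VolumeSource{
            ConfigMap: &corev1.ConfigMapVolumeSource{
                LocalObjectReference: corev1.LocalObjectReference{
                    Name: TensorFusionGPUInfoConfigName,
                },
            },
        },
    }
    mount := corev1.VolumeMount{
        Name:      TensorFusionGPUInfoConfigVolumeName,
        MountPath: TensorFusionGPUInfoConfigMountPath, // "/etc/tensor-fusion/gpu-info.yaml"
        SubPath:   TensorFusionGPUInfoConfigSubPath,   // "gpu-info.yaml"
    }
    return vol, mount
}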
const (
    TFContainerNameClient = "inject-lib"
    TFContainerNameWorker = "tensorfusion-worker"
    TFContainerNameHypervisor = "tensorfusion-hypervisor"
    TFContainerNameNodeDiscovery = "tensorfusion-node-discovery"
    TFContainerVector = "vector"
)
const (
    GetConnectionURLEnv = "TENSOR_FUSION_OPERATOR_GET_CONNECTION_URL"
    ConnectionNameEnv = "TENSOR_FUSION_CONNECTION_NAME"
    ConnectionNamespaceEnv = "TENSOR_FUSION_CONNECTION_NAMESPACE"

    RealNvmlLibPathEnv = "TF_NVML_LIB_PATH"
    RealCUDALibPathEnv = "TF_CUDA_LIB_PATH"
    RealNvmlLibPathValue = "/lib/x86_64-linux-gnu/libnvidia-ml.so.1"
    RealCUDALibPathValue = "/lib/x86_64-linux-gnu/libcuda.so"

    PrependPathEnv = "TF_PREPEND_PATH"
    PrependLDLibraryPathEnv = "TF_PREPEND_LD_LIBRARY_PATH"

    LdPreloadFileName = "ld.so.preload"
    LdPreloadFile = "/etc/ld.so.preload"

    TFLibsVolumeName = "tf-libs"
    TFLibsVolumeMountPath = "/tensor-fusion"

    TFConnectionNamePrefix = "-tf-vgpu-"
    TFConnectionNameNoPrefix = "tf-vgpu-"

    HostIPFieldRef = "status.hostIP"
    NodeNameFieldRef = "spec.nodeName"
    ResourceNameFieldRef = "metadata.name"
    NamespaceFieldRef = "metadata.namespace"
)
TensorFusion client-related environment variables
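The *FieldRef constants are Kubernetes Downward API field paths. The sketch below shows how they could be injected as container env vars; which env name maps to which field path is an assumption for illustration:

import corev1 "k8s.io/api/core/v1"

// downwardAPIEnvs sketches exposing Pod metadata to the injected client via
// the Downward API field paths defined above.
func downwardAPIEnvs() []corev1.EnvVar {
    fieldEnv := func(name, fieldPath string) corev1.EnvVar {
        return corev1.EnvVar{
            Name: name,
            ValueFrom: &corev1.EnvVarSource{
                FieldRef: &corev1.ObjectFieldSelector{FieldPath: fieldPath},
            },
        }
    }
    return []corev1.EnvVar{
        fieldEnv(ConnectionNameEnv, ResourceNameFieldRef),   // assumed mapping
        fieldEnv(ConnectionNamespaceEnv, NamespaceFieldRef), // assumed mapping
    }
}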
const (
    HypervisorIPEnv = "HYPERVISOR_IP"
    HypervisorPortEnv = "HYPERVISOR_PORT"
    PodNamespaceEnv = "POD_NAMESPACE"
    ContainerNameEnv = "CONTAINER_NAME"

    // The path of the nGPU library for the limiter to load
    NGPUPathEnv = "TENSOR_FUSION_NGPU_PATH"
    NGPUPathValue = TFLibsVolumeMountPath + "/libcuda.so"

    LdPreloadEnv = "LD_PRELOAD"
    LdPreloadLimiter = "/home/app/libcuda_limiter.so"

    // Disable the GPU limiter, for emergency use
    DisableGpuLimiterEnv = "DISABLE_GPU_LIMITER"
    // Directly forward CUDA calls to the GPU driver in nGPU mode, for emergency use
    DisableCudaOptimizationEnv = "TF_ENABLE_DISPATCH_FORWARD"
    // Disable the VRAM manager, for emergency use
    DisableVRAMManagerEnv = "TF_DISABLE_MEMORY_MANAGER"
    DisableWorkerFeatureEnvVal = "1"

    TensorFusionRemoteWorkerPortNumber = 8000
    TensorFusionRemoteWorkerPortName = "remote-vgpu"
)
TensorFusion worker-related environment variables
const (
    HypervisorPoolNameEnv = "TENSOR_FUSION_POOL_NAME"
    PodNameEnv = "POD_NAME"
    VectorPodNodeNameEnv = "NODE_NAME"
    HypervisorGPUNodeNameEnv = "GPU_NODE_NAME"
    HypervisorSchedulingConfigEnv = "TF_HYPERVISOR_SCHEDULING_CONFIG"
    HypervisorListenAddrEnv = "API_LISTEN_ADDR"
    HypervisorMetricsFormatEnv = "TF_HYPERVISOR_METRICS_FORMAT"
    HypervisorMetricsExtraLabelsEnv = "TF_HYPERVISOR_METRICS_EXTRA_LABELS"
    HypervisorDetectUsedGPUEnv = "DETECT_IN_USED_GPUS"

    // Add the ptrace capability to the hypervisor container, to trace all host PIDs using the GPU
    SystemPtraceCapability = "SYS_PTRACE"

    HypervisorDefaultPortNumber int32 = 8000
    HypervisorPortName string = "http"

    // For security enhancement, there are two types of endpoints to protect:
    // 1. the client calls the operator's /connection API to obtain the TensorFusion worker's URL
    // 2. the worker calls the hypervisor API to obtain the current worker's GPU quota info
    // If this env var is set on the operator and hypervisor, the JWT signature will be verified for each call.
    // Not implemented yet: the issuer is public in EKS and most K8s distributions,
    // but k3s and some other distributions may not support it; a way to obtain the SA token JWT public key is still needed.
    HypervisorVerifyServiceAccountEnabledEnvVar = "SA_TOKEN_VERIFY_ENABLED"
    HypervisorVerifyServiceAccountPublicKeyEnvVar = "SA_TOKEN_VERIFY_PUBLIC_KEY"
)
TensorFusion hypervisor-related environment variables
const (
    NodeDiscoveryReportGPUNodeEnvName = "NODE_DISCOVERY_REPORT_GPU_NODE"
    NodeDiscoveryHostNameEnv = "HOSTNAME"
)
Node-discovery-related environment variables
const (
    KubeApiVersionMajorEnv = "KUBE_API_VERSION_MAJOR"
    KubeApiVersionMinorEnv = "KUBE_API_VERSION_MINOR"
)
const AuthorizationHeader = "Authorization"
const DataVolumeName = "tf-data"
const DefaultEvictionProtectionPriceRatio = 1.2
const ExtraVerificationInfoPodIDKey = "authentication.kubernetes.io/pod-uid"
const GiBToBytes = 1024 * 1024 * 1024
const KarpenterNodeClaimKind = "NodeClaim"
const MobileGpuClockSpeedMultiplier = 0.75
const NamespaceEnv = "OPERATOR_NAMESPACE"
Environment variables for the controller itself
const NodeCriticalPriorityClassName = "system-node-critical"
const (
    NvidiaGPUKey = "nvidia.com/gpu"
)
const SchedulerSimulationKey = "simulate-schedule"
const (
    // Do-not-disrupt label, similar to Karpenter's, prevents the TFConnection/Worker/GPUNode from being moved to another node or the node from being destroyed.
    // Refer: https://karpenter.sh/docs/concepts/disruption/
    SchedulingDoNotDisruptLabel = Domain + "/do-not-disrupt"
)
const ShortUUIDAlphabet = "123456789abcdefghijkmnopqrstuvwxy"
const SpotInstanceAssumedDiscountRatio = 0.3
const TFDataPath = "/run/tensor-fusion"
const TFDataPathWorkerExpr = "shm/$(POD_NAMESPACE)/$(POD_NAME)"
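TFDataPathWorkerExpr contains $(VAR) references, which Kubernetes only expands when the value is used as a volume mount's SubPathExpr and the referenced env vars (POD_NAMESPACE, POD_NAME) are defined on the container. A hedged sketch, with the volume and mount path pairing assumed:

import corev1 "k8s.io/api/core/v1"

// workerDataMount sketches mounting a per-Pod data directory: Kubernetes
// expands $(POD_NAMESPACE)/$(POD_NAME) from the container's env vars because
// SubPathExpr (not SubPath) is used.
func workerDataMount() corev1.VolumeMount {
    return corev1.VolumeMount{
        Name:        DataVolumeName,       // "tf-data" (assumed pairing)
        MountPath:   TFDataPath,           // "/run/tensor-fusion" (assumed pairing)
        SubPathExpr: TFDataPathWorkerExpr, // "shm/$(POD_NAMESPACE)/$(POD_NAME)"
    }
}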
const TensorFusionPoolManualCompaction = Domain + "/manual-compaction"
const TensorFusionSystemName = "tensor-fusion"
Variables
Functions
This section is empty.
Types
This section is empty.