 Documentation
      ¶
      Documentation
      ¶
    
    
  
    
  
    Index ¶
Constants ¶
      View Source
      
  
    const ( // Domain is the domain prefix used for all tensor-fusion.ai related annotations and finalizers Domain = "tensor-fusion.ai" // Finalizer constants FinalizerSuffix = "finalizer" Finalizer = Domain + "/" + FinalizerSuffix SchedulerName = "tensor-fusion-scheduler" LabelKeyOwner = Domain + "/managed-by" LabelKeyClusterOwner = Domain + "/cluster" LabelKeyNodeClass = Domain + "/node-class" LabelKeyPodTemplateHash = Domain + "/pod-template-hash" LabelComponent = Domain + "/component" // used by TF connection, for matching the related connections when worker Pod state changed LabelWorkerName = Domain + "/worker-name" ComponentClient = "client" ComponentWorker = "worker" ComponentHypervisor = "hypervisor" ComponentNodeDiscovery = "node-discovery" ComponentOperator = "operator" GPUNodePoolIdentifierLabelPrefix = Domain + "/pool-" GPUNodePoolIdentifierLabelFormat = Domain + "/pool-%s" NodeDeletionMark = Domain + "/should-delete" TensorFusionEnabledLabelKey = Domain + "/enabled" InitialGPUNodeSelector = "nvidia.com/gpu.present=true" GPULastReportTimeAnnotationKey = Domain + "/last-sync" WorkloadKey = Domain + "/workload" GpuPoolKey = Domain + "/gpupool" // Annotation key constants GpuCountAnnotation = Domain + "/gpu-count" TFLOPSRequestAnnotation = Domain + "/tflops-request" VRAMRequestAnnotation = Domain + "/vram-request" TFLOPSLimitAnnotation = Domain + "/tflops-limit" VRAMLimitAnnotation = Domain + "/vram-limit" WorkloadProfileAnnotation = Domain + "/workload-profile" InjectContainerAnnotation = Domain + "/inject-container" IsLocalGPUAnnotation = Domain + "/is-local-gpu" QoSLevelAnnotation = Domain + "/qos" EmbeddedWorkerAnnotation = Domain + "/embedded-worker" DedicatedWorkerAnnotation = Domain + "/dedicated-worker" StandaloneWorkerModeAnnotation = Domain + "/no-standalone-worker-mode" // GPUModelAnnotation specifies the required GPU model (e.g., "A100", "H100") GPUModelAnnotation = Domain + "/gpu-model" // GPU ID list is assigned by scheduler, should not 
be specified by user GPUDeviceIDsAnnotation = Domain + "/gpu-ids" SetPendingOwnedWorkloadAnnotation = Domain + "/pending-owned-workload" // Annotations for kill switch: disable features // ['gpu-opt', 'mem-manager', 'gpu-limiter'] DisableFeaturesAnnotation = Domain + "/disable-features" BuiltInFeaturesGpuOpt = "gpu-opt" BuiltInFeaturesGpuLimiter = "gpu-limiter" BuiltInFeaturesMemManager = "mem-manager" GenHostPortLabel = Domain + "/host-port" GenHostPortLabelValue = "auto" GenHostPortNameLabel = Domain + "/port-name" GenPortNumberAnnotation = Domain + "/port-number" AutoScaleLimitsAnnotation = Domain + "/auto-limits" AutoScaleRequestsAnnotation = Domain + "/auto-requests" AutoScaleReplicasAnnotation = Domain + "/auto-replicas" GpuReleasedAnnotation = Domain + "/gpu-released" TensorFusionPodCounterKeyAnnotation = Domain + "/pod-counter-key" TensorFusionPodCountAnnotation = Domain + "/tf-pod-count" TensorFusionWorkerSuffix = "-tf" // For grey release TensorFusionEnabledReplicasAnnotation = Domain + "/enabled-replicas" TensorFusionDefaultPoolKeyAnnotation = Domain + "/is-default-pool" NamespaceDefaultVal = "tensor-fusion-sys" KubernetesHostNameLabel = "kubernetes.io/hostname" HypervisorServiceAccountName = "tensor-fusion-hypervisor-sa" TSDBVersionConfigMap = "tensor-fusion-tsdb-version" QoSLevelLow = "low" QoSLevelMedium = "medium" QoSLevelHigh = "high" QoSLevelCritical = "critical" )
      View Source
      
  const ( TrueStringValue = "true" FalseStringValue = "false" )
Used to avoid golang lint issues.
      View Source
      
  
    const ( ConditionStatusTypeReady = "Ready" ConditionStatusTypeGPUScheduled = "GPUScheduled" ConditionStatusTypeConnectionReady = "ConnectionReady" ConditionStatusTypeNodeProvisioned = "NodeProvisioned" ConditionStatusTypePoolReady = "PoolReady" ConditionStatusTypeGPUPool = "GPUPoolReady" ConditionStatusTypeTimeSeriesDatabase = "TimeSeriesDatabaseReady" ConditionStatusTypeCloudVendorConnection = "CloudVendorConnectionReady" )
      View Source
      
  
    const ( PhaseUnknown = "Unknown" PhasePending = "Pending" PhaseUpdating = "Updating" PhaseScheduling = "Scheduling" PhaseMigrating = "Migrating" PhaseDestroying = "Destroying" PhaseRunning = "Running" PhaseSucceeded = "Succeeded" PhaseFailed = "Failed" )
      View Source
      
  
    const ( GPUNodeOSLinux = "linux" GPUNodeOSWindows = "windows" GPUNodeOSMacOS = "macos" )
      View Source
      
  const ( ProvisionerLabelKey = Domain + "/node-provisioner" ProvisionerNamePlaceholder = "__GPU_NODE_RESOURCE_NAME__" )
To match a GPUNode with its K8S node when the node is created from a cloud vendor, a label must be set from the cloud-init userdata.
      View Source
      
  
    const ( LeaderInfoConfigMapName = "tensor-fusion-operator-leader-info" LeaderInfoConfigMapLeaderIPKey = "leader-ip" )
      View Source
      
  
    const ( LowFrequencyObjFailureInitialDelay = 300 * time.Millisecond LowFrequencyObjFailureMaxDelay = 1000 * time.Second LowFrequencyObjFailureMaxRPS = 1 LowFrequencyObjFailureMaxBurst = 1 LowFrequencyObjFailureConcurrentReconcile = 5 )
      View Source
      
  const ( EnableWebhookEnv = "ENABLE_WEBHOOKS" EnableSchedulerEnv = "ENABLE_SCHEDULER" EnableCustomResourceControllerEnv = "ENABLE_CR_CONTROLLER" // TensorFusion ControllerManager's http endpoint will verify Pod JWT signature // if this env var is set, will disable the verification, it's enabled by default // should not set to true in production environment DisableConnectionAuthEnv = "DISABLE_CONNECTION_AUTH" NvidiaOperatorProgressiveMigrationEnv = "NVIDIA_OPERATOR_PROGRESSIVE_MIGRATION" )
System feature toggles
      View Source
      
  const ( NvidiaVisibleAllDeviceEnv = "NVIDIA_VISIBLE_DEVICES" NvidiaVisibleAllDeviceValue = "all" TensorFusionGPUInfoConfigName = "tensor-fusion-sys-public-gpu-info" TensorFusionGPUInfoConfigVolumeName = "gpu-info" TensorFusionGPUInfoConfigMountPath = "/etc/tensor-fusion/gpu-info.yaml" TensorFusionGPUInfoConfigSubPath = "gpu-info.yaml" TensorFusionGPUInfoEnvVar = "TENSOR_FUSION_GPU_INFO_PATH" KubeletDevicePluginVolumeName = "device-plugin" KubeletDevicePluginPath = "/var/lib/kubelet/device-plugins" TensorFusionVectorConfigName = "tensor-fusion-sys-vector-config" TensorFusionVectorConfigVolumeName = "vector-config" TensorFusionVectorConfigMountPath = "/etc/vector/vector.yaml" TensorFusionVectorConfigSubPath = "vector-hypervisor.yaml" LogsVolumeName = "logs" KubernetesLogsVolumeName = "kubernetes-logs" KubernetesLogsPath = "/var/log/pods" TensorFusionLogPath = "/logs" DefaultHttpBindIP = "0.0.0.0" )
General envs used when composing component manifests
      View Source
      
  
    const ( TFContainerNameClient = "inject-lib" TFContainerNameWorker = "tensorfusion-worker" TFContainerNameHypervisor = "tensorfusion-hypervisor" TFContainerNameNodeDiscovery = "tensorfusion-node-discovery" TFContainerVector = "vector" )
      View Source
      
  const ( GetConnectionURLEnv = "TENSOR_FUSION_OPERATOR_GET_CONNECTION_URL" ConnectionNameEnv = "TENSOR_FUSION_CONNECTION_NAME" ConnectionNamespaceEnv = "TENSOR_FUSION_CONNECTION_NAMESPACE" RealNvmlLibPathEnv = "TF_NVML_LIB_PATH" RealCUDALibPathEnv = "TF_CUDA_LIB_PATH" RealNvmlLibPathValue = "/lib/x86_64-linux-gnu/libnvidia-ml.so.1" RealCUDALibPathValue = "/lib/x86_64-linux-gnu/libcuda.so" PrependPathEnv = "TF_PREPEND_PATH" PrependLDLibraryPathEnv = "TF_PREPEND_LD_LIBRARY_PATH" LdPreloadFileName = "ld.so.preload" LdPreloadFile = "/etc/ld.so.preload" TFLibsVolumeName = "tf-libs" TFLibsVolumeMountPath = "/tensor-fusion" HostIPFieldRef = "status.hostIP" NodeNameFieldRef = "spec.nodeName" ResourceNameFieldRef = "metadata.name" NamespaceFieldRef = "metadata.namespace" )
TensorFusion client related envs
      View Source
      
  const ( HypervisorIPEnv = "HYPERVISOR_IP" HypervisorPortEnv = "HYPERVISOR_PORT" PodNamespaceEnv = "POD_NAMESPACE" ContainerNameEnv = "CONTAINER_NAME" // the path of nGPU lib for limiter to load NGPUPathEnv = "TENSOR_FUSION_NGPU_PATH" NGPUPathValue = TFLibsVolumeMountPath + "/libcuda.so" LdPreloadEnv = "LD_PRELOAD" LdPreloadLimiter = "/home/app/libcuda_limiter.so" // disable GPU limiter, for emergency use DisableGpuLimiterEnv = "DISABLE_GPU_LIMITER" // directly forward CUDA calls to GPU driver in nGPU mode, for emergency use DisableCudaOptimizationEnv = "TF_ENABLE_DISPATCH_FORWARD" // disable vram manager, for emergency use DisableVRAMManagerEnv = "TF_DISABLE_MEMORY_MANAGER" DisableWorkerFeatureEnvVal = "1" TensorFusionRemoteWorkerPortNumber = 8000 TensorFusionRemoteWorkerPortName = "remote-vgpu" )
TensorFusion worker related envs
      View Source
      
  const ( HypervisorPoolNameEnv = "TENSOR_FUSION_POOL_NAME" PodNameEnv = "POD_NAME" VectorPodNodeNameEnv = "NODE_NAME" HypervisorGPUNodeNameEnv = "GPU_NODE_NAME" HypervisorSchedulingConfigEnv = "TF_HYPERVISOR_SCHEDULING_CONFIG" HypervisorListenAddrEnv = "API_LISTEN_ADDR" HypervisorMetricsFormatEnv = "TF_HYPERVISOR_METRICS_FORMAT" HypervisorMetricsExtraLabelsEnv = "TF_HYPERVISOR_METRICS_EXTRA_LABELS" HypervisorDetectUsedGPUEnv = "DETECT_IN_USED_GPU" HypervisorDefaultPortNumber int32 = 8000 HypervisorPortName string = "http" // For security enhancement, there are 2 types of endpoints to protect // 1. client call operator /connection API, to obtain tensor fusion worker's URL // 2. worker call hypervisor API, to obtain current workers GPU quota info // if this env var is set on operator and hypervisor, will try to verify JWT signature for each call // not implemented yet, iss is public in EKS and most K8S distribution // but k3s and some K8S distribution may not support, need to find some way to get SA token JWT pub key HypervisorVerifyServiceAccountEnabledEnvVar = "SA_TOKEN_VERIFY_ENABLED" HypervisorVerifyServiceAccountPublicKeyEnvVar = "SA_TOKEN_VERIFY_PUBLIC_KEY" )
TensorFusion hypervisor related envs
      View Source
      
  const ( NodeDiscoveryReportGPUNodeEnvName = "NODE_DISCOVERY_REPORT_GPU_NODE" NodeDiscoveryHostNameEnv = "HOSTNAME" )
Node discovery related envs
      View Source
      
  
    const AlertJobName = "tensor-fusion"
    
      View Source
      
  
    const AuthorizationHeader = "Authorization"
    
      View Source
      
  
    const DataVolumeName = "tf-data"
    
      View Source
      
  
    const ExtraVerificationInfoPodIDKey = "authentication.kubernetes.io/pod-uid"
    
      View Source
      
  
    const GiBToBytes = 1024 * 1024 * 1024
    
      View Source
      
  const NamespaceEnv = "OPERATOR_NAMESPACE"
    Envs for the controller itself
      View Source
      
  
    const (
	NvidiaGPUKey = "nvidia.com/gpu"
)
    
      View Source
      
  
    const ( // No disrupt label, similar to Karpenter, avoid TFConnection/Worker/GPUNode to be moved to another node or destroying node. // Refer: https://karpenter.sh/docs/concepts/disruption/ SchedulingDoNotDisruptLabel = Domain + "/do-not-disrupt" )
      View Source
      
  
    const ShortUUIDAlphabet = "123456789abcdefghijkmnopqrstuvwxy"
    
      View Source
      
  
    const TFDataPath = "/tmp/tensor-fusion/data"
    
      View Source
      
  
const TensorFusionPoolManualCompaction = Domain + "/manual-compaction"
    Variables ¶
Functions ¶
This section is empty.
Types ¶
This section is empty.
 Click to show internal directories. 
   Click to hide internal directories.