Documentation
¶
Index ¶
Constants ¶
View Source
const ( // Domain is the domain prefix used for all tensor-fusion.ai related annotations and finalizers Domain = "tensor-fusion.ai" // Finalizer constants FinalizerSuffix = "finalizer" Finalizer = Domain + "/" + FinalizerSuffix SchedulerName = "tensor-fusion-scheduler" LabelKeyOwner = Domain + "/managed-by" LabelKeyClusterOwner = Domain + "/cluster" LabelKeyNodeClass = Domain + "/node-class" LabelKeyPodTemplateHash = Domain + "/pod-template-hash" LabelComponent = Domain + "/component" // used by TF connection, for matching the related connections when worker Pod state changed LabelWorkerName = Domain + "/worker-name" TrueStringValue = "true" FalseStringValue = "false" ComponentClient = "client" ComponentWorker = "worker" ComponentHypervisor = "hypervisor" ComponentNodeDiscovery = "node-discovery" ComponentOperator = "operator" GPUNodePoolIdentifierLabelPrefix = Domain + "/pool-" GPUNodePoolIdentifierLabelFormat = Domain + "/pool-%s" NodeDeletionMark = Domain + "/should-delete" TensorFusionEnabledLabelKey = Domain + "/enabled" InitialGPUNodeSelector = "nvidia.com/gpu.present=true" GPULastReportTimeAnnotationKey = Domain + "/last-sync" WorkloadKey = Domain + "/workload" GpuKey = Domain + "/gpu" GpuPoolKey = Domain + "/gpupool" // Annotation key constants GpuCountAnnotation = Domain + "/gpu-count" TFLOPSRequestAnnotation = Domain + "/tflops-request" VRAMRequestAnnotation = Domain + "/vram-request" TFLOPSLimitAnnotation = Domain + "/tflops-limit" VRAMLimitAnnotation = Domain + "/vram-limit" WorkloadProfileAnnotation = Domain + "/client-profile" InjectContainerAnnotation = Domain + "/inject-container" IsLocalGPUAnnotation = Domain + "/is-local-gpu" EmbeddedWorkerAnnotation = Domain + "/embedded-worker" DedicatedWorkerAnnotation = Domain + "/dedicated-worker" StandaloneWorkerModeAnnotation = Domain + "/no-standalone-worker-mode" // GPUModelAnnotation specifies the required GPU model (e.g., "A100", "H100") GPUModelAnnotation = Domain + "/gpu-model" // GPU ID list is assigned by scheduler, should not specified by user GPUDeviceIDsAnnotation = Domain + "/gpu-ids" SetPendingOwnedWorkloadAnnotation = Domain + "/pending-owned-workload" GenHostPortLabel = Domain + "/host-port" GenHostPortLabelValue = "auto" GenHostPortNameLabel = Domain + "/port-name" GenPortNumberAnnotation = Domain + "/port-number" TensorFusionWorkerPortNumber = 8000 AutoScaleLimitsAnnotation = Domain + "/auto-limits" AutoScaleRequestsAnnotation = Domain + "/auto-requests" AutoScaleReplicasAnnotation = Domain + "/auto-replicas" GpuReleasedAnnotation = Domain + "/gpu-released" TensorFusionPodCounterKeyAnnotation = Domain + "/pod-counter-key" TensorFusionPodCountAnnotation = Domain + "/tf-pod-count" TensorFusionWorkerSuffix = "-tf" // For grey release TensorFusionEnabledReplicasAnnotation = Domain + "/enabled-replicas" TensorFusionDefaultPoolKeyAnnotation = Domain + "/is-default-pool" GetConnectionURLEnv = "TENSOR_FUSION_OPERATOR_GET_CONNECTION_URL" ConnectionNameEnv = "TENSOR_FUSION_CONNECTION_NAME" ConnectionNamespaceEnv = "TENSOR_FUSION_CONNECTION_NAMESPACE" WorkerPortEnv = "TENSOR_FUSION_WORKER_PORT" WorkerCudaUpLimitTflopsEnv = "TENSOR_FUSION_CUDA_UP_LIMIT_TFLOPS" WorkerCudaUpLimitEnv = "TENSOR_FUSION_CUDA_UP_LIMIT" WorkerCudaMemLimitEnv = "TENSOR_FUSION_CUDA_MEM_LIMIT" WorkloadNameEnv = "TENSOR_FUSION_WORKLOAD_NAME" PoolNameEnv = "TENSOR_FUSION_POOL_NAME" PodNameEnv = "POD_NAME" GPUNodeNameEnv = "GPU_NODE_NAME" NamespaceEnv = "OPERATOR_NAMESPACE" NamespaceDefaultVal = "tensor-fusion-sys" KubernetesHostNameLabel = "kubernetes.io/hostname" GiBToBytes = 1024 * 1024 * 1024 HypervisorServiceAccountName = "tensor-fusion-hypervisor-sa" TSDBVersionConfigMap = "tensor-fusion-tsdb-version" QoSLevelLow = "low" QoSLevelMedium = "medium" QoSLevelHigh = "high" QoSLevelCritical = "critical" EnableWebhookEnv = "ENABLE_WEBHOOKS" EnableSchedulerEnv = "ENABLE_SCHEDULER" )
View Source
const ( ConditionStatusTypeReady = "Ready" ConditionStatusTypeGPUScheduled = "GPUScheduled" ConditionStatusTypeConnectionReady = "ConnectionReady" ConditionStatusTypeNodeProvisioned = "NodeProvisioned" ConditionStatusTypePoolReady = "PoolReady" ConditionStatusTypeGPUPool = "GPUPoolReady" ConditionStatusTypeTimeSeriesDatabase = "TimeSeriesDatabaseReady" ConditionStatusTypeCloudVendorConnection = "CloudVendorConnectionReady" )
View Source
const ( PhaseUnknown = "Unknown" PhasePending = "Pending" PhaseUpdating = "Updating" PhaseScheduling = "Scheduling" PhaseMigrating = "Migrating" PhaseDestroying = "Destroying" PhaseRunning = "Running" PhaseSucceeded = "Succeeded" PhaseFailed = "Failed" )
View Source
const ( GPUNodeOSLinux = "linux" GPUNodeOSWindows = "windows" GPUNodeOSMacOS = "macos" )
View Source
const ( ProvisionerLabelKey = Domain + "/node-provisioner" ProvisionerNamePlaceholder = "__GPU_NODE_RESOURCE_NAME__" )
To match GPUNode with K8S node, when creating from cloud vendor, must set a label from cloud-init userdata
View Source
const ( LeaderInfoConfigMapName = "tensor-fusion-operator-leader-info" LeaderInfoConfigMapLeaderIPKey = "leader-ip" )
View Source
const ( LowFrequencyObjFailureInitialDelay = 100 * time.Millisecond LowFrequencyObjFailureMaxDelay = 1000 * time.Second LowFrequencyObjFailureMaxRPS = 1 LowFrequencyObjFailureMaxBurst = 1 LowFrequencyObjFailureConcurrentReconcile = 5 )
View Source
const AlertJobName = "tensor-fusion"
View Source
const DataVolumeName = "tf-data"
View Source
const HypervisorSchedulingConfigEnv = "TF_HYPERVISOR_SCHEDULING_CONFIG"
View Source
const (
NodeDiscoveryReportGPUNodeEnvName = "NODE_DISCOVERY_REPORT_GPU_NODE"
)
View Source
const (
NvidiaGPUKey = "nvidia.com/gpu"
)
View Source
const NvidiaVisibleAllDeviceEnv = "NVIDIA_VISIBLE_DEVICES"
View Source
const NvidiaVisibleAllDeviceValue = "all"
View Source
const ( // No disrupt label, similar to Karpenter, avoid TFConnection/Worker/GPUNode to be moved to another node or destroying node. // Refer: https://karpenter.sh/docs/concepts/disruption/ SchedulingDoNotDisruptLabel = Domain + "/do-not-disrupt" )
View Source
const ShortUUIDAlphabet = "123456789abcdefghijkmnopqrstuvwxy"
View Source
const TFDataPath = "/tmp/tensor-fusion/data"
View Source
const TensorFusionPoolManualCompaction = Domain + "/manual-compaction"
Variables ¶
View Source
var ( PendingRequeueDuration = time.Second * 3 StatusCheckInterval = time.Second * 6 )
Functions ¶
This section is empty.
Types ¶
This section is empty.
Click to show internal directories.
Click to hide internal directories.