constants

package
v1.36.3 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jul 2, 2025 License: Apache-2.0 Imports: 1 Imported by: 0

Documentation

Index

Constants

View Source
// Shared string and numeric constants. Most label and annotation keys below
// are built by prefixing a path segment with Domain.
const (
	// Domain is the domain prefix used for all tensor-fusion.ai related annotations and finalizers
	Domain = "tensor-fusion.ai"

	// Finalizer constants. Finalizer evaluates to "tensor-fusion.ai/finalizer".
	FinalizerSuffix = "finalizer"
	Finalizer       = Domain + "/" + FinalizerSuffix

	// SchedulerName is the string "tensor-fusion-scheduler"; presumably the
	// scheduler name set on managed Pods — TODO confirm against callers.
	SchedulerName = "tensor-fusion-scheduler"

	// Label keys (per their names), each of the form Domain + "/<suffix>".
	LabelKeyOwner           = Domain + "/managed-by"
	LabelKeyClusterOwner    = Domain + "/cluster"
	LabelKeyNodeClass       = Domain + "/node-class"
	LabelKeyPodTemplateHash = Domain + "/pod-template-hash"
	LabelComponent          = Domain + "/component"
	// LabelWorkerName is used by TF connections to find the connections related
	// to a worker Pod when that Pod's state changes.
	LabelWorkerName  = Domain + "/worker-name"
	TrueStringValue  = "true"
	FalseStringValue = "false"

	// Component names; presumably the values stored under LabelComponent — TODO confirm.
	ComponentClient        = "client"
	ComponentWorker        = "worker"
	ComponentHypervisor    = "hypervisor"
	ComponentNodeDiscovery = "node-discovery"
	ComponentOperator      = "operator"

	// GPU node pool label pieces: the Format constant is the Prefix constant
	// followed by a single %s verb (presumably filled with the pool name).
	GPUNodePoolIdentifierLabelPrefix = Domain + "/pool-"
	GPUNodePoolIdentifierLabelFormat = Domain + "/pool-%s"
	NodeDeletionMark                 = Domain + "/should-delete"

	// InitialGPUNodeSelector is a "key=value" selector string on the
	// nvidia.com/gpu.present label.
	TensorFusionEnabledLabelKey = Domain + "/enabled"
	InitialGPUNodeSelector      = "nvidia.com/gpu.present=true"

	// Generic Domain-prefixed keys. Whether each is used as a label or an
	// annotation is not visible here, except where the name says so.
	GPULastReportTimeAnnotationKey = Domain + "/last-sync"
	WorkloadKey                    = Domain + "/workload"
	GpuKey                         = Domain + "/gpu"
	GpuPoolKey                     = Domain + "/gpupool"

	// Annotation key constants
	// NOTE(review): StandaloneWorkerModeAnnotation's value is
	// "/no-standalone-worker-mode", which reads as the opposite of its name —
	// confirm the intended polarity with callers before relying on it.
	GpuCountAnnotation             = Domain + "/gpu-count"
	TFLOPSRequestAnnotation        = Domain + "/tflops-request"
	VRAMRequestAnnotation          = Domain + "/vram-request"
	TFLOPSLimitAnnotation          = Domain + "/tflops-limit"
	VRAMLimitAnnotation            = Domain + "/vram-limit"
	WorkloadProfileAnnotation      = Domain + "/workload-profile"
	InjectContainerAnnotation      = Domain + "/inject-container"
	IsLocalGPUAnnotation           = Domain + "/is-local-gpu"
	EmbeddedWorkerAnnotation       = Domain + "/embedded-worker"
	DedicatedWorkerAnnotation      = Domain + "/dedicated-worker"
	StandaloneWorkerModeAnnotation = Domain + "/no-standalone-worker-mode"
	// GPUModelAnnotation specifies the required GPU model (e.g., "A100", "H100")
	GPUModelAnnotation = Domain + "/gpu-model"
	// The GPU ID list is assigned by the scheduler and should not be specified by the user
	GPUDeviceIDsAnnotation            = Domain + "/gpu-ids"
	SetPendingOwnedWorkloadAnnotation = Domain + "/pending-owned-workload"

	// Host-port generation keys and values. TensorFusionWorkerPortNumber is an
	// untyped integer constant (8000).
	GenHostPortLabel             = Domain + "/host-port"
	GenHostPortLabelValue        = "auto"
	GenHostPortNameLabel         = Domain + "/port-name"
	GenPortNumberAnnotation      = Domain + "/port-number"
	TensorFusionWorkerPortNumber = 8000

	// Auto-scaling annotation keys (limits, requests, replicas).
	AutoScaleLimitsAnnotation   = Domain + "/auto-limits"
	AutoScaleRequestsAnnotation = Domain + "/auto-requests"
	AutoScaleReplicasAnnotation = Domain + "/auto-replicas"

	// GpuReleasedAnnotation is the key "tensor-fusion.ai/gpu-released".
	GpuReleasedAnnotation = Domain + "/gpu-released"

	// Pod counter annotation keys, plus the "-tf" suffix constant
	// (presumably appended to worker names — TODO confirm).
	TensorFusionPodCounterKeyAnnotation = Domain + "/pod-counter-key"
	TensorFusionPodCountAnnotation      = Domain + "/tf-pod-count"
	TensorFusionWorkerSuffix            = "-tf"

	// For grey release
	TensorFusionEnabledReplicasAnnotation = Domain + "/enabled-replicas"
	TensorFusionDefaultPoolKeyAnnotation  = Domain + "/is-default-pool"

	// Environment variable names related to connections.
	GetConnectionURLEnv    = "TENSOR_FUSION_OPERATOR_GET_CONNECTION_URL"
	ConnectionNameEnv      = "TENSOR_FUSION_CONNECTION_NAME"
	ConnectionNamespaceEnv = "TENSOR_FUSION_CONNECTION_NAMESPACE"

	// Further environment variable names. NamespaceDefaultVal is a value, not a
	// variable name; presumably the fallback when NamespaceEnv is unset — TODO confirm.
	WorkerCudaUpLimitTflopsEnv = "TENSOR_FUSION_CUDA_UP_LIMIT_TFLOPS"
	WorkerCudaUpLimitEnv       = "TENSOR_FUSION_CUDA_UP_LIMIT"
	WorkerCudaMemLimitEnv      = "TENSOR_FUSION_CUDA_MEM_LIMIT"
	WorkloadNameEnv            = "TENSOR_FUSION_WORKLOAD_NAME"
	PoolNameEnv                = "TENSOR_FUSION_POOL_NAME"
	PodNameEnv                 = "POD_NAME"
	GPUNodeNameEnv             = "GPU_NODE_NAME"
	NamespaceEnv               = "OPERATOR_NAMESPACE"
	NamespaceDefaultVal        = "tensor-fusion-sys"

	// GiBToBytes is 1024^3 (1073741824), the number of bytes in one GiB.
	KubernetesHostNameLabel      = "kubernetes.io/hostname"
	GiBToBytes                   = 1024 * 1024 * 1024
	HypervisorServiceAccountName = "tensor-fusion-hypervisor-sa"

	// TSDBVersionConfigMap is the string "tensor-fusion-tsdb-version"
	// (a ConfigMap name, per the constant's name).
	TSDBVersionConfigMap = "tensor-fusion-tsdb-version"

	// QoS level names, listed from low to critical.
	QoSLevelLow      = "low"
	QoSLevelMedium   = "medium"
	QoSLevelHigh     = "high"
	QoSLevelCritical = "critical"

	// Environment variable names that, per their names, toggle the webhooks,
	// the scheduler and the custom resource controller.
	EnableWebhookEnv                  = "ENABLE_WEBHOOKS"
	EnableSchedulerEnv                = "ENABLE_SCHEDULER"
	EnableCustomResourceControllerEnv = "ENABLE_CR_CONTROLLER"
)
View Source
// Condition type names. Which resource kind reports each condition is not
// visible in this file.
const (
	ConditionStatusTypeReady           = "Ready"
	ConditionStatusTypeGPUScheduled    = "GPUScheduled"
	ConditionStatusTypeConnectionReady = "ConnectionReady"
	ConditionStatusTypeNodeProvisioned = "NodeProvisioned"
	ConditionStatusTypePoolReady       = "PoolReady"

	// Readiness condition names; each value carries a "Ready" suffix that the
	// constant name omits.
	ConditionStatusTypeGPUPool               = "GPUPoolReady"
	ConditionStatusTypeTimeSeriesDatabase    = "TimeSeriesDatabaseReady"
	ConditionStatusTypeCloudVendorConnection = "CloudVendorConnectionReady"
)
View Source
// Phase names as plain strings. Each value equals the constant name without
// its "Phase" prefix.
const (
	// Phases that, by their names, describe an unknown or in-progress state.
	PhaseUnknown    = "Unknown"
	PhasePending    = "Pending"
	PhaseUpdating   = "Updating"
	PhaseScheduling = "Scheduling"
	PhaseMigrating  = "Migrating"
	PhaseDestroying = "Destroying"

	// Running and outcome phases.
	PhaseRunning   = "Running"
	PhaseSucceeded = "Succeeded"
	PhaseFailed    = "Failed"
)
View Source
// Operating system identifiers for GPU nodes, as lowercase strings.
const (
	GPUNodeOSLinux   = "linux"
	GPUNodeOSWindows = "windows"
	GPUNodeOSMacOS   = "macos"
)
View Source
// Node provisioner label key and a placeholder token.
const (
	// ProvisionerLabelKey evaluates to Domain + "/node-provisioner".
	ProvisionerLabelKey = Domain + "/node-provisioner"
	// ProvisionerNamePlaceholder is a literal token; presumably it is replaced
	// with the GPU node resource name at provisioning time — TODO confirm.
	ProvisionerNamePlaceholder = "__GPU_NODE_RESOURCE_NAME__"
)

To match a GPUNode with its K8S node when the node is created by a cloud vendor, a label must be set from the cloud-init userdata.

View Source
// Leader-info ConfigMap name and the data key that, per its name, holds the
// leader's IP.
const (
	LeaderInfoConfigMapName        = "tensor-fusion-operator-leader-info"
	LeaderInfoConfigMapLeaderIPKey = "leader-ip"
)
View Source
// Retry and rate-limit parameters for "low frequency" objects. Presumably
// these feed a controller rate limiter and its concurrency option — TODO
// confirm against callers.
const (
	// Delay bounds: 300ms initial and 1000s (16m40s) maximum.
	// NOTE(review): 1000 seconds is an unusual ceiling — confirm it is intended.
	LowFrequencyObjFailureInitialDelay        = 300 * time.Millisecond
	LowFrequencyObjFailureMaxDelay            = 1000 * time.Second
	LowFrequencyObjFailureMaxRPS              = 1
	LowFrequencyObjFailureMaxBurst            = 1
	LowFrequencyObjFailureConcurrentReconcile = 5
)
View Source
// AlertJobName is the string "tensor-fusion"; presumably the job name used
// for alerting — TODO confirm.
const AlertJobName = "tensor-fusion"
View Source
// AuthorizationHeader is the HTTP header name "Authorization".
const AuthorizationHeader = "Authorization"
View Source
// DataVolumeName is the string "tf-data"; presumably the name of a Pod
// volume — TODO confirm.
const DataVolumeName = "tf-data"
View Source
// DisableConnectionAuthEnv is the environment variable name
// "DISABLE_CONNECTION_AUTH".
const DisableConnectionAuthEnv = "DISABLE_CONNECTION_AUTH"

The TensorFusion ControllerManager's HTTP endpoint verifies the Pod JWT signature by default. Setting this environment variable disables that verification; it should not be set to true in a production environment.

View Source
// ExtraVerificationInfoPodIDKey is the key "authentication.kubernetes.io/pod-uid".
// NOTE(review): presumably looked up in the extra info of a verified service
// account token to obtain the Pod UID — confirm against callers.
const ExtraVerificationInfoPodIDKey = "authentication.kubernetes.io/pod-uid"
View Source
// HypervisorSchedulingConfigEnv is the environment variable name
// "TF_HYPERVISOR_SCHEDULING_CONFIG".
const HypervisorSchedulingConfigEnv = "TF_HYPERVISOR_SCHEDULING_CONFIG"
View Source
// HypervisorVerifyServiceAccountEnabledEnvVar is the environment variable name
// "SA_TOKEN_VERIFY_ENABLED".
const HypervisorVerifyServiceAccountEnabledEnvVar = "SA_TOKEN_VERIFY_ENABLED"
View Source
// HypervisorVerifyServiceAccountPublicKeyEnvVar is the environment variable
// name "SA_TOKEN_VERIFY_PUBLIC_KEY".
const HypervisorVerifyServiceAccountPublicKeyEnvVar = "SA_TOKEN_VERIFY_PUBLIC_KEY"
View Source
const (
	// NodeDiscoveryReportGPUNodeEnvName is the environment variable name
	// "NODE_DISCOVERY_REPORT_GPU_NODE".
	NodeDiscoveryReportGPUNodeEnvName = "NODE_DISCOVERY_REPORT_GPU_NODE"
)
View Source
const (
	// NvidiaGPUKey is the string "nvidia.com/gpu"; presumably the extended
	// resource name for NVIDIA GPUs — TODO confirm usage.
	NvidiaGPUKey = "nvidia.com/gpu"
)
View Source
// NvidiaVisibleAllDeviceEnv is the environment variable name
// "NVIDIA_VISIBLE_DEVICES".
const NvidiaVisibleAllDeviceEnv = "NVIDIA_VISIBLE_DEVICES"
View Source
// NvidiaVisibleAllDeviceValue is the string "all"; presumably the value
// assigned to the NVIDIA visible-devices environment variable — TODO confirm.
const NvidiaVisibleAllDeviceValue = "all"
View Source
const (
	// SchedulingDoNotDisruptLabel is a do-not-disrupt label, similar to Karpenter's:
	// it prevents a TFConnection/Worker/GPUNode from being moved to another node
	// and prevents its node from being destroyed.
	// Refer: https://karpenter.sh/docs/concepts/disruption/
	SchedulingDoNotDisruptLabel = Domain + "/do-not-disrupt"
)
View Source
// ShortUUIDAlphabet is a 33-character alphabet: digits 1-9 followed by
// lowercase a-y without 'l'.
// NOTE(review): '0', 'l' and 'z' are absent. Dropping '0' and 'l' looks like
// deliberate avoidance of look-alike characters, but the missing 'z' may be
// unintentional — confirm before changing, since it alters generated IDs.
const ShortUUIDAlphabet = "123456789abcdefghijkmnopqrstuvwxy"
View Source
// TFDataPath is the filesystem path "/tmp/tensor-fusion/data".
const TFDataPath = "/tmp/tensor-fusion/data"
View Source
// TensorFusionPoolManualCompaction evaluates to Domain + "/manual-compaction";
// presumably a key set on a pool to request a compaction manually — TODO confirm.
const TensorFusionPoolManualCompaction = Domain + "/manual-compaction"

Variables

View Source
// Timing values declared as package-level variables rather than constants, so
// they can be reassigned at runtime.
var (
	// PendingRequeueDuration is 3 seconds.
	PendingRequeueDuration = time.Second * 3
	// StatusCheckInterval is 6 seconds.
	StatusCheckInterval = time.Second * 6
)

Functions

This section is empty.

Types

This section is empty.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL