constants

package

v1.36.3 Latest Latest Go to latest Published: Jul 2, 2025 License: Apache-2.0 Imports: 1 Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/NexusGPU/tensor-fusion

Links

Open Source Insights

Documentation ¶

Constants ¶

View Source

// Shared identifiers: label and annotation keys under the Domain prefix,
// environment variable names, component names and assorted literals.
const (
	// Domain is the domain prefix used for all tensor-fusion.ai related annotations and finalizers.
	Domain = "tensor-fusion.ai"

	// Finalizer name: Finalizer is Domain and FinalizerSuffix joined by "/".
	FinalizerSuffix = "finalizer"
	Finalizer       = Domain + "/" + FinalizerSuffix

	// SchedulerName is the literal name "tensor-fusion-scheduler".
	SchedulerName = "tensor-fusion-scheduler"

	// Label keys under the Domain prefix.
	LabelKeyOwner           = Domain + "/managed-by"
	LabelKeyClusterOwner    = Domain + "/cluster"
	LabelKeyNodeClass       = Domain + "/node-class"
	LabelKeyPodTemplateHash = Domain + "/pod-template-hash"
	LabelComponent          = Domain + "/component"
	// LabelWorkerName is used by TF connection, for matching the related
	// connections when the worker Pod state changed.
	LabelWorkerName = Domain + "/worker-name"

	// String spellings of the two boolean values.
	TrueStringValue  = "true"
	FalseStringValue = "false"

	// Component names (presumably the values stored under LabelComponent;
	// confirm with the callers).
	ComponentClient        = "client"
	ComponentWorker        = "worker"
	ComponentHypervisor    = "hypervisor"
	ComponentNodeDiscovery = "node-discovery"
	ComponentOperator      = "operator"

	// GPU node pool labels. GPUNodePoolIdentifierLabelFormat is the prefix
	// followed by a single %s verb, i.e. Domain + "/pool-%s".
	GPUNodePoolIdentifierLabelPrefix = Domain + "/pool-"
	GPUNodePoolIdentifierLabelFormat = GPUNodePoolIdentifierLabelPrefix + "%s"
	NodeDeletionMark                 = Domain + "/should-delete"

	// TensorFusionEnabledLabelKey is the Domain-prefixed "enabled" label key;
	// InitialGPUNodeSelector is the literal "key=value" expression
	// "nvidia.com/gpu.present=true".
	TensorFusionEnabledLabelKey = Domain + "/enabled"
	InitialGPUNodeSelector      = "nvidia.com/gpu.present=true"

	// Miscellaneous Domain-prefixed keys.
	GPULastReportTimeAnnotationKey = Domain + "/last-sync"
	WorkloadKey                    = Domain + "/workload"
	GpuKey                         = Domain + "/gpu"
	GpuPoolKey                     = Domain + "/gpupool"

	// Annotation key constants
	GpuCountAnnotation        = Domain + "/gpu-count"
	TFLOPSRequestAnnotation   = Domain + "/tflops-request"
	VRAMRequestAnnotation     = Domain + "/vram-request"
	TFLOPSLimitAnnotation     = Domain + "/tflops-limit"
	VRAMLimitAnnotation       = Domain + "/vram-limit"
	WorkloadProfileAnnotation = Domain + "/workload-profile"
	InjectContainerAnnotation = Domain + "/inject-container"
	IsLocalGPUAnnotation      = Domain + "/is-local-gpu"
	EmbeddedWorkerAnnotation  = Domain + "/embedded-worker"
	DedicatedWorkerAnnotation = Domain + "/dedicated-worker"
	// NOTE(review): the key reads "no-standalone-worker-mode" while the
	// constant name carries no negation; confirm the intended polarity with
	// the code that reads this annotation.
	StandaloneWorkerModeAnnotation = Domain + "/no-standalone-worker-mode"
	// GPUModelAnnotation specifies the required GPU model (e.g., "A100", "H100")
	GPUModelAnnotation = Domain + "/gpu-model"
	// GPU ID list is assigned by the scheduler and should not be specified by the user.
	GPUDeviceIDsAnnotation            = Domain + "/gpu-ids"
	SetPendingOwnedWorkloadAnnotation = Domain + "/pending-owned-workload"

	// Host-port generation keys and the worker port number (8000).
	GenHostPortLabel             = Domain + "/host-port"
	GenHostPortLabelValue        = "auto"
	GenHostPortNameLabel         = Domain + "/port-name"
	GenPortNumberAnnotation      = Domain + "/port-number"
	TensorFusionWorkerPortNumber = 8000

	// Auto-scale annotation keys.
	AutoScaleLimitsAnnotation   = Domain + "/auto-limits"
	AutoScaleRequestsAnnotation = Domain + "/auto-requests"
	AutoScaleReplicasAnnotation = Domain + "/auto-replicas"

	// GpuReleasedAnnotation is the Domain-prefixed "gpu-released" key.
	GpuReleasedAnnotation = Domain + "/gpu-released"

	// Pod counter keys and the "-tf" worker suffix.
	TensorFusionPodCounterKeyAnnotation = Domain + "/pod-counter-key"
	TensorFusionPodCountAnnotation      = Domain + "/tf-pod-count"
	TensorFusionWorkerSuffix            = "-tf"

	// For grey release
	TensorFusionEnabledReplicasAnnotation = Domain + "/enabled-replicas"
	TensorFusionDefaultPoolKeyAnnotation  = Domain + "/is-default-pool"

	// Connection-related environment variable names.
	GetConnectionURLEnv    = "TENSOR_FUSION_OPERATOR_GET_CONNECTION_URL"
	ConnectionNameEnv      = "TENSOR_FUSION_CONNECTION_NAME"
	ConnectionNamespaceEnv = "TENSOR_FUSION_CONNECTION_NAMESPACE"

	// Further environment variable names, plus NamespaceDefaultVal
	// ("tensor-fusion-sys"; presumably the fallback when NamespaceEnv is
	// unset, confirm with the reader of that variable).
	WorkerCudaUpLimitTflopsEnv = "TENSOR_FUSION_CUDA_UP_LIMIT_TFLOPS"
	WorkerCudaUpLimitEnv       = "TENSOR_FUSION_CUDA_UP_LIMIT"
	WorkerCudaMemLimitEnv      = "TENSOR_FUSION_CUDA_MEM_LIMIT"
	WorkloadNameEnv            = "TENSOR_FUSION_WORKLOAD_NAME"
	PoolNameEnv                = "TENSOR_FUSION_POOL_NAME"
	PodNameEnv                 = "POD_NAME"
	GPUNodeNameEnv             = "GPU_NODE_NAME"
	NamespaceEnv               = "OPERATOR_NAMESPACE"
	NamespaceDefaultVal        = "tensor-fusion-sys"

	// GiBToBytes is 1024^3 (1073741824), the number of bytes in one GiB.
	KubernetesHostNameLabel      = "kubernetes.io/hostname"
	GiBToBytes                   = 1024 * 1024 * 1024
	HypervisorServiceAccountName = "tensor-fusion-hypervisor-sa"

	// TSDBVersionConfigMap is the literal name "tensor-fusion-tsdb-version".
	TSDBVersionConfigMap = "tensor-fusion-tsdb-version"

	// QoS level names.
	QoSLevelLow      = "low"
	QoSLevelMedium   = "medium"
	QoSLevelHigh     = "high"
	QoSLevelCritical = "critical"

	// Names of the environment variables for the webhook, scheduler and
	// custom-resource-controller switches.
	EnableWebhookEnv                  = "ENABLE_WEBHOOKS"
	EnableSchedulerEnv                = "ENABLE_SCHEDULER"
	EnableCustomResourceControllerEnv = "ENABLE_CR_CONTROLLER"
)

View Source

// Condition type strings. In the first group each value equals the suffix of
// the constant name after "ConditionStatusType".
const (
	ConditionStatusTypeReady           = "Ready"
	ConditionStatusTypeGPUScheduled    = "GPUScheduled"
	ConditionStatusTypeConnectionReady = "ConnectionReady"
	ConditionStatusTypeNodeProvisioned = "NodeProvisioned"
	ConditionStatusTypePoolReady       = "PoolReady"

	// In this group the value carries a "Ready" suffix that the constant
	// name does not.
	ConditionStatusTypeGPUPool               = "GPUPoolReady"
	ConditionStatusTypeTimeSeriesDatabase    = "TimeSeriesDatabaseReady"
	ConditionStatusTypeCloudVendorConnection = "CloudVendorConnectionReady"
)

View Source

// Phase strings. Each value equals the suffix of the constant name after
// "Phase".
const (
	PhaseUnknown    = "Unknown"
	PhasePending    = "Pending"
	PhaseUpdating   = "Updating"
	PhaseScheduling = "Scheduling"
	PhaseMigrating  = "Migrating"
	PhaseDestroying = "Destroying"

	PhaseRunning   = "Running"
	PhaseSucceeded = "Succeeded"
	PhaseFailed    = "Failed"
)

View Source

// Lower-case operating system identifiers carried by the GPUNodeOS constants.
const (
	GPUNodeOSLinux   = "linux"
	GPUNodeOSWindows = "windows"
	GPUNodeOSMacOS   = "macos"
)

View Source

// To match a GPUNode with its K8S node when the node is created from a cloud
// vendor, a label must be set from the cloud-init userdata.
const (
	// ProvisionerLabelKey is the Domain-prefixed "node-provisioner" label key.
	ProvisionerLabelKey = Domain + "/node-provisioner"

	// ProvisionerNamePlaceholder is the literal token
	// "__GPU_NODE_RESOURCE_NAME__" (presumably replaced with the GPUNode
	// resource name before use; confirm with the provisioning code).
	ProvisionerNamePlaceholder = "__GPU_NODE_RESOURCE_NAME__"
)

To match a GPUNode with its K8S node when the node is created from a cloud vendor, a label must be set from the cloud-init userdata.

View Source

// Identifiers for the operator leader-info ConfigMap.
const (
	// LeaderInfoConfigMapName is the name "tensor-fusion-operator-leader-info".
	LeaderInfoConfigMapName = "tensor-fusion-operator-leader-info"

	// LeaderInfoConfigMapLeaderIPKey is the key "leader-ip" (presumably the
	// data key that holds the leader's IP inside that ConfigMap; confirm
	// with the code that reads or writes it).
	LeaderInfoConfigMapLeaderIPKey = "leader-ip"
)

View Source

// Tuning values sharing the LowFrequencyObjFailure prefix (presumably the
// failure back-off and rate-limit settings for controllers of rarely changing
// objects; confirm with the controller setup that consumes them).
const (
	// LowFrequencyObjFailureInitialDelay is 300 milliseconds.
	LowFrequencyObjFailureInitialDelay = 300 * time.Millisecond

	// LowFrequencyObjFailureMaxDelay is 1000 seconds (16m40s).
	LowFrequencyObjFailureMaxDelay = 1000 * time.Second

	// LowFrequencyObjFailureMaxRPS is 1.
	LowFrequencyObjFailureMaxRPS = 1

	// LowFrequencyObjFailureMaxBurst is 1.
	LowFrequencyObjFailureMaxBurst = 1

	// LowFrequencyObjFailureConcurrentReconcile is 5.
	LowFrequencyObjFailureConcurrentReconcile = 5
)

View Source

// AlertJobName is the literal job name "tensor-fusion" (presumably the job
// label used for alerting; confirm with the alert configuration code).
const AlertJobName = "tensor-fusion"

View Source

// AuthorizationHeader is the header name "Authorization".
const AuthorizationHeader = "Authorization"

View Source

// DataVolumeName is the volume name "tf-data".
const DataVolumeName = "tf-data"

View Source

// DisableConnectionAuthEnv names the environment variable that, when set,
// disables the Pod JWT signature verification performed by the TensorFusion
// ControllerManager's HTTP endpoint. Verification is enabled by default; the
// variable should not be set to true in a production environment.
const DisableConnectionAuthEnv = "DISABLE_CONNECTION_AUTH"

The TensorFusion ControllerManager's HTTP endpoint verifies the Pod JWT signature by default. Setting this env var disables that verification, so it should not be set to true in a production environment.

View Source

// ExtraVerificationInfoPodIDKey is the key
// "authentication.kubernetes.io/pod-uid" (presumably looked up in the extra
// info of a verified token to obtain the Pod UID; confirm with the auth code).
const ExtraVerificationInfoPodIDKey = "authentication.kubernetes.io/pod-uid"

View Source

// HypervisorSchedulingConfigEnv is the environment variable name
// "TF_HYPERVISOR_SCHEDULING_CONFIG".
const HypervisorSchedulingConfigEnv = "TF_HYPERVISOR_SCHEDULING_CONFIG"

View Source

// HypervisorVerifyServiceAccountEnabledEnvVar is the environment variable
// name "SA_TOKEN_VERIFY_ENABLED".
const HypervisorVerifyServiceAccountEnabledEnvVar = "SA_TOKEN_VERIFY_ENABLED"

View Source

// HypervisorVerifyServiceAccountPublicKeyEnvVar is the environment variable
// name "SA_TOKEN_VERIFY_PUBLIC_KEY".
const HypervisorVerifyServiceAccountPublicKeyEnvVar = "SA_TOKEN_VERIFY_PUBLIC_KEY"

View Source

// NodeDiscoveryReportGPUNodeEnvName is the environment variable name
// "NODE_DISCOVERY_REPORT_GPU_NODE".
const NodeDiscoveryReportGPUNodeEnvName = "NODE_DISCOVERY_REPORT_GPU_NODE"

View Source

// NvidiaGPUKey is the key "nvidia.com/gpu".
const NvidiaGPUKey = "nvidia.com/gpu"

View Source

// NvidiaVisibleAllDeviceEnv is the environment variable name
// "NVIDIA_VISIBLE_DEVICES".
const NvidiaVisibleAllDeviceEnv = "NVIDIA_VISIBLE_DEVICES"

View Source

// NvidiaVisibleAllDeviceValue is the literal "all" (presumably the value
// assigned to the NVIDIA_VISIBLE_DEVICES variable; confirm with the caller).
const NvidiaVisibleAllDeviceValue = "all"

View Source

// SchedulingDoNotDisruptLabel is the no-disrupt label key, similar to
// Karpenter's: it keeps a TFConnection/Worker/GPUNode from being moved to
// another node and keeps its node from being destroyed.
// Refer: https://karpenter.sh/docs/concepts/disruption/
const SchedulingDoNotDisruptLabel = Domain + "/do-not-disrupt"

View Source

// ShortUUIDAlphabet is a 33-character alphabet: the digits 1-9 followed by
// the lower-case letters a-y without 'l'.
// NOTE(review): '0', 'l' and 'z' are all absent. Dropping '0' and 'l' looks
// like a deliberate look-alike exclusion, but confirm that omitting 'z' is
// intended; changing the alphabet would change the IDs generated from it.
const ShortUUIDAlphabet = "123456789abcdefghijkmnopqrstuvwxy"

View Source

// TFDataPath is the path "/tmp/tensor-fusion/data".
const TFDataPath = "/tmp/tensor-fusion/data"

View Source

// TensorFusionPoolManualCompaction is the Domain-prefixed "manual-compaction"
// key.
const TensorFusionPoolManualCompaction = Domain + "/manual-compaction"

Variables ¶

View Source

// Package-level durations. They are variables, not constants, so they can be
// reassigned at run time.
var (
	// PendingRequeueDuration is initialised to 3 seconds.
	PendingRequeueDuration = 3 * time.Second

	// StatusCheckInterval is initialised to 6 seconds.
	StatusCheckInterval = 6 * time.Second
)

Functions ¶

This section is empty.

Types ¶

This section is empty.

Source Files ¶

View all Source files

constants.go

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL