Documentation
¶
Overview ¶
Package v1 contains API Schema definitions for the tensor-fusion.ai v1 API group. +kubebuilder:object:generate=true +groupName=tensor-fusion.ai
Index ¶
- Constants
- Variables
- type AdjustRequest
- type AllocRequest
- type AllocatedPartition
- type AuthTypeEnum
- type AutoFreeze
- type AutoFreezeAndResume
- type AutoScalingConfig
- type AutoSetResources
- type BudgetExceedStrategy
- type CapacityConfig
- type CapacityTypeEnum
- type ClientConfig
- type ComponentConfig
- type ComputingVendorConfig
- type ComputingVendorName
- type ComputingVendorParams
- type CronScalingRule
- type DevicePluginDetectionConfig
- type ElasticRateLimitParameters
- type ExternalScalerConfig
- type ExternalScalerRequest
- type ExternalScalerResponse
- type GPU
- type GPUAllocationInfo
- type GPUFilter
- type GPUList
- type GPUNode
- type GPUNodeClaim
- type GPUNodeClaimList
- type GPUNodeClaimPhase
- type GPUNodeClaimSpec
- type GPUNodeClaimStatus
- type GPUNodeClass
- type GPUNodeClassList
- type GPUNodeClassSpec
- type GPUNodeClassStatus
- type GPUNodeInfo
- type GPUNodeList
- type GPUNodeManageMode
- type GPUNodeSpec
- type GPUNodeStatus
- type GPUOrCPUResourceUnit
- type GPUPool
- type GPUPoolDefinition
- type GPUPoolList
- type GPUPoolSpec
- type GPUPoolStatus
- type GPUResourceAvailablePercent
- type GPUResourcePricingUnit
- type GPUResourceQuota
- type GPUResourceQuotaConditionType
- type GPUResourceQuotaList
- type GPUResourceQuotaSingle
- type GPUResourceQuotaSpec
- type GPUResourceQuotaStatus
- type GPUResourceQuotaTotal
- type GPUResourceUnit
- type GPUResourceUsage
- type GPUStatus
- type GangSchedulingConfig
- type GroupKindName
- type HardwareModelInfo
- type HypervisorConfig
- type HypervisorScheduling
- type IsolationModeType
- type MaintenanceWindow
- type MultiProcessQueuingParameters
- type NameNamespace
- type NodeClassBlockDeviceMappings
- type NodeClassBlockDeviceSettings
- type NodeClassItemSelectorTerms
- type NodeClassMetadataOptions
- type NodeCompaction
- type NodeHypervisorStatus
- type NodeManagerConfig
- type NodeProvisioner
- type NodeRequirementKey
- type NodeRollingUpdatePolicy
- type OSImageTypeEnum
- type Oversubscription
- type PartitionTemplate
- type PeriodicalBudget
- type PlacementConfig
- type PlacementMode
- type PodGPUInfo
- type PoolComponentStatus
- type PoolProvisioningStatus
- type ProviderConfig
- type ProviderConfigList
- type ProviderConfigSpec
- type ProviderConfigStatus
- type ProviderHypervisorConfig
- type ProviderHypervisorHostPathMount
- type ProviderImages
- type ProvisioningMode
- type ProvisioningPhase
- type QoSLevel
- type QosConfig
- type QosDefinition
- type QosPricing
- type ReBalanceThreshold
- type ReBalancerConfig
- type Requirement
- type Resource
- type ResourceName
- type Resources
- type RunningAppDetail
- type ScalingTargetResource
- type SchedulingConfigTemplate
- type SchedulingConfigTemplateList
- type SchedulingConfigTemplateSpec
- type SchedulingConfigTemplateStatus
- type SmartSchedulerModelInput
- type Taint
- type TensorFusionCluster
- func (in *TensorFusionCluster) DeepCopy() *TensorFusionCluster
- func (in *TensorFusionCluster) DeepCopyInto(out *TensorFusionCluster)
- func (in *TensorFusionCluster) DeepCopyObject() runtime.Object
- func (tfc *TensorFusionCluster) RefreshStatus(ownedPools []GPUPool)
- func (tfc *TensorFusionCluster) SetAsPending()
- func (tfc *TensorFusionCluster) SetAsReady(conditions ...metav1.Condition) bool
- func (tfc *TensorFusionCluster) SetAsUnknown(err error) bool
- func (tfc *TensorFusionCluster) SetAsUpdating(conditions ...metav1.Condition) bool
- type TensorFusionClusterList
- type TensorFusionClusterPhase
- type TensorFusionClusterSpec
- type TensorFusionClusterStatus
- type TensorFusionConnection
- type TensorFusionConnectionList
- type TensorFusionConnectionSpec
- type TensorFusionConnectionStatus
- type TensorFusionGPUNodePhase
- type TensorFusionGPUPhase
- type TensorFusionPoolPhase
- type TensorFusionWorkload
- type TensorFusionWorkloadList
- type TensorFusionWorkloadPhase
- type TensorFusionWorkloadStatus
- type UsedBySystem
- type VerticalScalingRule
- type VirtualizationTemplate
- type WorkerConfig
- type WorkerPhase
- type WorkerStatus
- type WorkloadProfile
- type WorkloadProfileList
- type WorkloadProfileSpec
- type WorkloadProfileStatus
Constants ¶
const ( // Domain is the default domain for tensor-fusion.ai API group. Domain = "tensor-fusion.ai" // DomainPrefix is the prefix of the domain for tensor-fusion.ai API group. DomainPrefix = "tensor-fusion" )
const ( PhaseUnknown = "Unknown" PhasePending = "Pending" PhaseUpdating = "Updating" PhaseScheduling = "Scheduling" PhaseMigrating = "Migrating" PhaseDestroying = "Destroying" PhaseRunning = "Running" PhaseSucceeded = "Succeeded" PhaseFailed = "Failed" )
Phase constants for resource lifecycle states.
const ( TensorFusionPoolPhasePending = TensorFusionPoolPhase(PhasePending) TensorFusionPoolPhaseRunning = TensorFusionPoolPhase(PhaseRunning) TensorFusionPoolPhaseUpdating = TensorFusionPoolPhase(PhaseUpdating) TensorFusionPoolPhaseUnknown = TensorFusionPoolPhase(PhaseUnknown) TensorFusionPoolPhaseDestroying = TensorFusionPoolPhase(PhaseDestroying) )
const ( // None means not in provisioning mode ProvisioningPhaseNone = ProvisioningPhase("None") // When NodeClaim created and pending GPUNodeClaim not empty, it's provisioning state, // check until all GPUNodeClaims are bound, unless next scale up should not happen ProvisioningPhaseProvisioning = ProvisioningPhase("Provisioning") // When all GPUNodeClaims are bound, set to Completed ProvisioningPhaseCompleted = ProvisioningPhase("Completed") )
const ( TensorFusionClusterPending = TensorFusionClusterPhase(PhasePending) TensorFusionClusterRunning = TensorFusionClusterPhase(PhaseRunning) TensorFusionClusterUpdating = TensorFusionClusterPhase(PhaseUpdating) TensorFusionClusterDestroying = TensorFusionClusterPhase(PhaseDestroying) TensorFusionClusterUnknown = TensorFusionClusterPhase(PhaseUnknown) )
const ( // Pros: simple and stable, no performance overhead, maximize GPU utilization when well-scheduled // Cons: can not auto-scale and differentiate QoS levels, TFLOPs limit does not take effect, may cause resource contention IsolationModeShared = "shared" // default isolation mode, use Proportional-Integral-Derivative controller to isolate computing resources and assign time slices // Pros: can set QoS levels for different workloads, TFLOPs limit is relatively accurate // Cons: ~1% performance overhead, resource contention may occur when burst credits are consumed IsolationModeSoft = "soft" // use dedicated SMs to isolate computing resources // Pros: better performance isolation, no performance overhead, oversubscription is possible // Cons: can not auto-scale dynamically, percent may not reach 1%/1TFLOPs accuracy, coupled with GPU vendor's SM partitioning implementation // NOTE: this can only be used in Remote or Local+SidecarWorker mode, not supported in LocalGPU mode (because no TensorFusion Worker) IsolationModeHard = "hard" // use GPU driver level partitioning to isolate resources, need hardware support // Pros: no performance overhead, no resource contention, fully-isolated // Cons: not supported by all GPUs/XPUs, oversubscription is not possible IsolationModePartitioned = "partitioned" )
const (
ConditionStatusTypeReady = "Ready"
)
Condition status type constants.
const GPUNodeClaimKind = "GPUNodeClaim"
const (
GPUNodeClassKind = "GPUNodeClass"
)
Variables ¶
var ( // GroupVersion is group version used to register these objects. GroupVersion = schema.GroupVersion{Group: "tensor-fusion.ai", Version: "v1"} // SchemeBuilder is used to add go types to the GroupVersionKind scheme. SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion} // AddToScheme adds the types in this group-version to the given scheme. AddToScheme = SchemeBuilder.AddToScheme )
Functions ¶
This section is empty.
Types ¶
type AdjustRequest ¶
func (*AdjustRequest) Clone ¶
func (ar *AdjustRequest) Clone() fwk.StateData
func (*AdjustRequest) DeepCopy ¶
func (in *AdjustRequest) DeepCopy() *AdjustRequest
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AdjustRequest.
func (*AdjustRequest) DeepCopyInto ¶
func (in *AdjustRequest) DeepCopyInto(out *AdjustRequest)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type AllocRequest ¶
type AllocRequest struct {
// Name of the GPU pool to allocate from
PoolName string
// Namespace information for the workload
WorkloadNameNamespace NameNamespace
// Resource requirements for the allocation
Request Resource
Limit Resource
// Specific GPU indices to allocate, empty slice means any index
GPUIndices []int32
// Number of GPUs to allocate
Count uint
// Specific GPU model to allocate, empty string means any model
GPUModel string
// Specific GPU vendor to allocate, default to any if empty
GPUVendor string
// Node affinity requirements
NodeAffinity *v1.NodeAffinity
// final scheduled GPU IDs for this allocation request
// This field is set by GPUAllocator; users should not choose specific GPUs
GPUNames []string
// record the pod meta for quota check
PodMeta metav1.ObjectMeta
QoS QoSLevel
Isolation IsolationModeType
// PartitionTemplateID is the template ID used for partitioned mode allocation
// This is set by the scheduler when a partition is matched, or read from pod annotation
PartitionTemplateID string
}
func (*AllocRequest) Clone ¶
func (p *AllocRequest) Clone() fwk.StateData
func (*AllocRequest) DeepCopy ¶
func (in *AllocRequest) DeepCopy() *AllocRequest
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AllocRequest.
func (*AllocRequest) DeepCopyInto ¶
func (in *AllocRequest) DeepCopyInto(out *AllocRequest)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type AllocatedPartition ¶ added in v0.1.1
type AllocatedPartition struct {
// TemplateID is the template used to create this partition
TemplateID string `json:"templateId"`
// PodUID is the UID of the pod using this partition (used as map key)
PodUID string `json:"podUid"`
// PodName is the name of the pod using this partition
PodName string `json:"podName"`
// Namespace is the namespace of the pod using this partition
Namespace string `json:"namespace"`
// AllocatedAt is when this partition was allocated
AllocatedAt metav1.Time `json:"allocatedAt"`
// AllocatedSlotStart is the starting slot position where this partition is allocated
// This is the actual hardware slot position (0-based index)
// For NVIDIA MIG: physical slot position (0-7)
AllocatedSlotStart *uint32 `json:"allocatedSlotStart,omitempty"`
// AllocatedSlotEnd is the ending slot position (exclusive) where this partition is allocated
// The partition occupies slots [AllocatedSlotStart, AllocatedSlotEnd)
AllocatedSlotEnd *uint32 `json:"allocatedSlotEnd,omitempty"`
// IsolationGroupID is the isolation group where this partition is allocated
// For Ascend NPU: vGroup ID (0-3)
// For NVIDIA MIG: this is derived from slot position, not used separately
// +optional
IsolationGroupID *uint32 `json:"isolationGroupId,omitempty"`
}
AllocatedPartition represents an allocated partition on a GPU. The key in the AllocatedPartitions map is the pod UID.
func (*AllocatedPartition) DeepCopy ¶ added in v0.1.1
func (in *AllocatedPartition) DeepCopy() *AllocatedPartition
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AllocatedPartition.
func (*AllocatedPartition) DeepCopyInto ¶ added in v0.1.1
func (in *AllocatedPartition) DeepCopyInto(out *AllocatedPartition)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type AuthTypeEnum ¶
type AuthTypeEnum string
+kubebuilder:validation:Enum=accessKey;serviceAccountRole
const ( AuthTypeAccessKey AuthTypeEnum = "accessKey" AuthTypeServiceAccountRole AuthTypeEnum = "serviceAccountRole" )
type AutoFreeze ¶
type AutoFreeze struct {
Qos QoSLevel `json:"qos,omitempty"`
FreezeToMemTTL string `json:"freezeToMemTTL,omitempty"`
FreezeToDiskTTL string `json:"freezeToDiskTTL,omitempty"`
Enable *bool `json:"enable,omitempty"`
}
func (*AutoFreeze) DeepCopy ¶
func (in *AutoFreeze) DeepCopy() *AutoFreeze
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AutoFreeze.
func (*AutoFreeze) DeepCopyInto ¶
func (in *AutoFreeze) DeepCopyInto(out *AutoFreeze)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type AutoFreezeAndResume ¶
type AutoFreezeAndResume struct {
AutoFreeze []AutoFreeze `json:"autoFreeze,omitempty"`
IntelligenceWarmup SmartSchedulerModelInput `json:"intelligenceWarmup,omitempty"`
}
func (*AutoFreezeAndResume) DeepCopy ¶
func (in *AutoFreezeAndResume) DeepCopy() *AutoFreezeAndResume
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AutoFreezeAndResume.
func (*AutoFreezeAndResume) DeepCopyInto ¶
func (in *AutoFreezeAndResume) DeepCopyInto(out *AutoFreezeAndResume)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type AutoScalingConfig ¶
type AutoScalingConfig struct {
// Adjust baseline requests and limits to match the actual usage using recent metrics
AutoSetResources *AutoSetResources `json:"autoSetResources,omitempty"`
// CronScalingRules defines a list of CronScaling rules used to schedule scaling actions based on cron expressions.
CronScalingRules []CronScalingRule `json:"cronScalingRules,omitempty"`
ExternalScaler *ExternalScalerConfig `json:"externalScaler,omitempty"`
}
func (*AutoScalingConfig) DeepCopy ¶
func (in *AutoScalingConfig) DeepCopy() *AutoScalingConfig
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AutoScalingConfig.
func (*AutoScalingConfig) DeepCopyInto ¶
func (in *AutoScalingConfig) DeepCopyInto(out *AutoScalingConfig)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type AutoSetResources ¶
type AutoSetResources struct {
Enable bool `json:"enable,omitempty"`
// Target resource to scale, such as "compute", "vram", or "all" by default
TargetResource ScalingTargetResource `json:"targetResource,omitempty"`
// Tflops usage percentile that will be used as a base for tflops target recommendation. Default: 0.9
TargetComputePercentile string `json:"targetComputePercentile,omitempty"`
// Tflops usage percentile that will be used for the lower bound on tflops recommendation. Default: 0.5
LowerBoundComputePercentile string `json:"lowerBoundComputePercentile,omitempty"`
// Tflops usage percentile that will be used for the upper bound on tflops recommendation. Default: 0.95
UpperBoundComputePercentile string `json:"upperBoundComputePercentile,omitempty"`
// Vram usage percentile that will be used as a base for vram target recommendation. Default: 0.9
TargetVRAMPercentile string `json:"targetVRAMPercentile,omitempty"`
// Vram usage percentile that will be used for the lower bound on vram recommendation. Default: 0.5
LowerBoundVRAMPercentile string `json:"lowerBoundVRAMPercentile,omitempty"`
// Vram usage percentile that will be used for the upper bound on vram recommendation. Default: 0.95
UpperBoundVRAMPercentile string `json:"upperBoundVRAMPercentile,omitempty"`
// Fraction of usage added as the safety margin to the recommended request. Default: 0.15
MarginFraction string `json:"marginFraction,omitempty"`
// Only when the difference between the recommended request and the current request is greater than this threshold, the request will be updated. Default: 0.1
// This value can't be greater than MarginFraction, otherwise no update will ever be made, since the recommendation always stays inside the threshold after multiplying by MarginFraction.
UpdateThreshold string `json:"updateThreshold,omitempty"`
// How much time back TSDB have to be queried to get historical metrics. Default: 2h
HistoryDataPeriod string `json:"historyDataPeriod,omitempty"`
// Min scaling ratio to original resources, e.g. request 10Gi, ratio 0.5, scale down limit to 5Gi, default: 0.2
MinVRAMResourcesRatio string `json:"minVRAMResourcesRatio,omitempty"`
// Max scaling ratio to original resources, e.g. request 10Gi, ratio 2.0, scale up limit to 20Gi, default: 5.0
MaxVRAMResourcesRatio string `json:"maxVRAMResourcesRatio,omitempty"`
// Min scaling ratio to original resources, e.g. request 10Gi, ratio 0.5, scale down limit to 5Gi, default: 0.1
// This ratio only apply to tflops/compute request rather than limit, to avoid performance downgrade when not used for a long time
MinComputeResourcesRatio string `json:"minComputeResourcesRatio,omitempty"`
// Max scaling ratio to original resources, e.g. request 10Gi, ratio 2.0, scale up limit to 20Gi, default: 10.0
MaxComputeResourcesRatio string `json:"maxComputeResourcesRatio,omitempty"`
// When workload is created, wait for this period to collect enough metrics before scaling, default: 30m
InitialDelayPeriod string `json:"initialDelayPeriod,omitempty"`
// How often to evaluate the scaling operation, default: same as global config's auto scaling interval
Interval string `json:"interval,omitempty"`
}
func (*AutoSetResources) DeepCopy ¶
func (in *AutoSetResources) DeepCopy() *AutoSetResources
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AutoSetResources.
func (*AutoSetResources) DeepCopyInto ¶
func (in *AutoSetResources) DeepCopyInto(out *AutoSetResources)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type BudgetExceedStrategy ¶
type BudgetExceedStrategy string
+kubebuilder:validation:Enum=AlertOnly;AlertAndTerminateVM
const ( BudgetExceedStrategyAlertOnly BudgetExceedStrategy = "AlertOnly" BudgetExceedStrategyAlertAndTerminateVM BudgetExceedStrategy = "AlertAndTerminateVM" )
type CapacityConfig ¶
type CapacityConfig struct {
// +optional
MinResources *GPUOrCPUResourceUnit `json:"minResources,omitempty"`
// +optional
MaxResources *GPUOrCPUResourceUnit `json:"maxResources,omitempty"`
// +optional
WarmResources *GPUOrCPUResourceUnit `json:"warmResources,omitempty"`
// +optional
Oversubscription *Oversubscription `json:"oversubscription,omitempty"`
}
func (*CapacityConfig) DeepCopy ¶
func (in *CapacityConfig) DeepCopy() *CapacityConfig
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CapacityConfig.
func (*CapacityConfig) DeepCopyInto ¶
func (in *CapacityConfig) DeepCopyInto(out *CapacityConfig)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type CapacityTypeEnum ¶
type CapacityTypeEnum string
const ( CapacityTypeOnDemand CapacityTypeEnum = "OnDemand" CapacityTypeReserved CapacityTypeEnum = "Reserved" // Spot and Preemptive are aliases of each other, used by different providers CapacityTypeSpot CapacityTypeEnum = "Spot" )
type ClientConfig ¶
type ClientConfig struct {
// Image is the default client image, used when no ProviderConfig is found for the vendor
// When ProviderConfig exists, remoteClient image from ProviderConfig takes precedence
Image string `json:"image,omitempty"`
OperatorEndpoint string `json:"operatorEndpoint,omitempty"`
// +optional
PatchToPod *runtime.RawExtension `json:"patchToPod,omitempty"`
// +optional
PatchToContainer *runtime.RawExtension `json:"patchToContainer,omitempty"`
}
func (*ClientConfig) DeepCopy ¶
func (in *ClientConfig) DeepCopy() *ClientConfig
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ClientConfig.
func (*ClientConfig) DeepCopyInto ¶
func (in *ClientConfig) DeepCopyInto(out *ClientConfig)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type ComponentConfig ¶
type ComponentConfig struct {
// +optional
Worker *WorkerConfig `json:"worker,omitempty"`
// +optional
Hypervisor *HypervisorConfig `json:"hypervisor,omitempty"`
// +optional
Client *ClientConfig `json:"client,omitempty"`
}
Customize system components for seamless onboarding.
func (*ComponentConfig) DeepCopy ¶
func (in *ComponentConfig) DeepCopy() *ComponentConfig
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ComponentConfig.
func (*ComponentConfig) DeepCopyInto ¶
func (in *ComponentConfig) DeepCopyInto(out *ComponentConfig)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type ComputingVendorConfig ¶
type ComputingVendorConfig struct {
Name string `json:"name,omitempty"`
// support popular cloud providers
Type ComputingVendorName `json:"type,omitempty"`
AuthType AuthTypeEnum `json:"authType,omitempty"` // Authentication type (e.g., accessKey, serviceAccount).
// +optional
// +kubebuilder:default=true
Enable *bool `json:"enable,omitempty"` // Enable or disable the computing vendor.
Params ComputingVendorParams `json:"params,omitempty"`
}
ComputingVendorConfig defines the Cloud vendor connection such as AWS, GCP, Azure etc.
func (*ComputingVendorConfig) DeepCopy ¶
func (in *ComputingVendorConfig) DeepCopy() *ComputingVendorConfig
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ComputingVendorConfig.
func (*ComputingVendorConfig) DeepCopyInto ¶
func (in *ComputingVendorConfig) DeepCopyInto(out *ComputingVendorConfig)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type ComputingVendorName ¶
type ComputingVendorName string
+kubebuilder:validation:Enum=aws;lambda-labs;gcp;azure;oracle-oci;ibm;openshift;vultr;together-ai;alibaba;nvidia;tencent;runpod;karpenter;mock
const ( ComputingVendorAWS ComputingVendorName = "aws" ComputingVendorGCP ComputingVendorName = "gcp" ComputingVendorAzure ComputingVendorName = "azure" ComputingVendorOracle ComputingVendorName = "oracle-oci" ComputingVendorIBM ComputingVendorName = "ibm" ComputingVendorOpenShift ComputingVendorName = "openshift" ComputingVendorVultr ComputingVendorName = "vultr" ComputingVendorTogetherAI ComputingVendorName = "together-ai" ComputingVendorLambdaLabs ComputingVendorName = "lambda-labs" ComputingVendorAlibaba ComputingVendorName = "alibaba" ComputingVendorNvidia ComputingVendorName = "nvidia" ComputingVendorTencent ComputingVendorName = "tencent" ComputingVendorRunPod ComputingVendorName = "runpod" ComputingVendorKarpenter ComputingVendorName = "karpenter" // This is for unit/integration testing only, no cloud provider is involved ComputingVendorMock ComputingVendorName = "mock" )
type ComputingVendorParams ¶
type ComputingVendorParams struct {
// +optional
DefaultRegion string `json:"defaultRegion,omitempty"` // Region for the computing vendor.
// the secret of access key and secret key or config file, must be mounted as file path
// +optional
AccessKeyPath string `json:"accessKeyPath,omitempty"`
// +optional
SecretKeyPath string `json:"secretKeyPath,omitempty"`
// preferred IAM role since it's more secure
// +optional
IAMRole string `json:"iamRole,omitempty"`
// +optional
ConfigFile string `json:"configFile,omitempty"`
// +optional
// User can set extra cloud vendor params, eg.
// in ali cloud: spotPriceLimit, spotDuration, spotInterruptionBehavior, systemDiskCategory, systemDiskSize, dataDiskPerformanceLevel
// in aws cloud: TODO
ExtraParams map[string]string `json:"extraParams,omitempty"`
}
func (*ComputingVendorParams) DeepCopy ¶
func (in *ComputingVendorParams) DeepCopy() *ComputingVendorParams
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ComputingVendorParams.
func (*ComputingVendorParams) DeepCopyInto ¶
func (in *ComputingVendorParams) DeepCopyInto(out *ComputingVendorParams)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type CronScalingRule ¶
type CronScalingRule struct {
// Enable specifies whether the cron scaler is enabled.
Enable bool `json:"enable,omitempty"`
// Name is the identifier for the cron scaler.
Name string `json:"name,omitempty"`
// Start is the start time for the scaling schedule, in cron format.
Start string `json:"start,omitempty"`
// End is the end time for the scaling schedule, in cron format.
End string `json:"end,omitempty"`
// DesiredResources specifies the target resources to scale to during the schedule.
DesiredResources Resources `json:"desiredResources,omitempty"`
}
CronScalingRule defines the rule for scaling resources based on a cron schedule. It allows enabling/disabling the scaler, specifying the time window for scaling, and configuring the desired resources and replicas during the scheduled period.
func (*CronScalingRule) DeepCopy ¶
func (in *CronScalingRule) DeepCopy() *CronScalingRule
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CronScalingRule.
func (*CronScalingRule) DeepCopyInto ¶
func (in *CronScalingRule) DeepCopyInto(out *CronScalingRule)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type DevicePluginDetectionConfig ¶ added in v0.1.1
type DevicePluginDetectionConfig struct {
// ResourceNamePrefixes are prefixes used to identify this vendor's device plugin resources
// e.g., ["nvidia.com/gpu", "nvidia.com/mig"] for NVIDIA
// +optional
ResourceNamePrefixes []string `json:"resourceNamePrefixes,omitempty"`
// UsedBySystemName is the identifier for workloads using this vendor's native device plugin
// +optional
UsedBySystemName string `json:"usedBySystemName,omitempty"`
}
DevicePluginDetectionConfig contains settings for detecting existing device plugins
func (*DevicePluginDetectionConfig) DeepCopy ¶ added in v0.1.1
func (in *DevicePluginDetectionConfig) DeepCopy() *DevicePluginDetectionConfig
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DevicePluginDetectionConfig.
func (*DevicePluginDetectionConfig) DeepCopyInto ¶ added in v0.1.1
func (in *DevicePluginDetectionConfig) DeepCopyInto(out *DevicePluginDetectionConfig)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type ElasticRateLimitParameters ¶
type ElasticRateLimitParameters struct {
// Refill rate is controlled by PID controller, adjusted by current utilization
MaxRefillRate string `json:"maxRefillRate,omitempty"`
MinRefillRate string `json:"minRefillRate,omitempty"`
// Filter ineffective requests from rate limit, 0.0 to 1.0
FilterAlpha string `json:"filterAlpha,omitempty"`
// PID controller parameters
Ki string `json:"ki,omitempty"`
Kd string `json:"kd,omitempty"`
Kp string `json:"kp,omitempty"`
// Burst window to control token bucket Min/Max (currentCapacity = burstWindow x currentRefillRate)
BurstWindow string `json:"burstWindow,omitempty"`
// token bucket min and max
CapacityMin string `json:"capacityMin,omitempty"`
CapacityMax string `json:"capacityMax,omitempty"`
// Decay factor for integral term in PID controller, to avoid integral windup
IntegralDecayFactor string `json:"integralDecayFactor,omitempty"`
}
func (*ElasticRateLimitParameters) DeepCopy ¶
func (in *ElasticRateLimitParameters) DeepCopy() *ElasticRateLimitParameters
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ElasticRateLimitParameters.
func (*ElasticRateLimitParameters) DeepCopyInto ¶
func (in *ElasticRateLimitParameters) DeepCopyInto(out *ElasticRateLimitParameters)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type ExternalScalerConfig ¶
type ExternalScalerConfig struct {
Enable bool `json:"enable,omitempty"`
URL string `json:"url,omitempty"`
// API key will be set into the request header as "Authorization: Bearer <api key>"
APIKeySecretRef *v1.SecretReference `json:"apiKeySecretRef,omitempty"`
InitialDelayPeriod string `json:"initialDelayPeriod,omitempty"`
// How often to evaluate the scaling operation, default: same as global config's auto scaling interval
Interval string `json:"interval,omitempty"`
}
func (*ExternalScalerConfig) DeepCopy ¶
func (in *ExternalScalerConfig) DeepCopy() *ExternalScalerConfig
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ExternalScalerConfig.
func (*ExternalScalerConfig) DeepCopyInto ¶
func (in *ExternalScalerConfig) DeepCopyInto(out *ExternalScalerConfig)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type ExternalScalerRequest ¶
type ExternalScalerRequest struct {
WorkloadName string `json:"workloadName,omitempty"`
Namespace string `json:"namespace,omitempty"`
CurrentResources Resources `json:"currentResources,omitempty"`
}
func (*ExternalScalerRequest) DeepCopy ¶
func (in *ExternalScalerRequest) DeepCopy() *ExternalScalerRequest
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ExternalScalerRequest.
func (*ExternalScalerRequest) DeepCopyInto ¶
func (in *ExternalScalerRequest) DeepCopyInto(out *ExternalScalerRequest)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type ExternalScalerResponse ¶
type ExternalScalerResponse struct {
NeedScaleUp bool `json:"needScaleUp,omitempty"`
NeedScaleDown bool `json:"needScaleDown,omitempty"`
// Explain why the scaling operation is needed or not needed, recorded to event and workload status
Reason string `json:"reason,omitempty"`
// If no scaling operation needed, this could be zero value
RecommendedResources Resources `json:"recommendedResources,omitempty"`
}
func (*ExternalScalerResponse) DeepCopy ¶
func (in *ExternalScalerResponse) DeepCopy() *ExternalScalerResponse
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ExternalScalerResponse.
func (*ExternalScalerResponse) DeepCopyInto ¶
func (in *ExternalScalerResponse) DeepCopyInto(out *ExternalScalerResponse)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type GPU ¶
type GPU struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata,omitempty"`
Status GPUStatus `json:"status,omitempty"`
}
+kubebuilder:object:root=true +kubebuilder:subresource:status +kubebuilder:resource:scope=Cluster +kubebuilder:printcolumn:name="GPU Model",type="string",JSONPath=".status.gpuModel" +kubebuilder:printcolumn:name="Phase",type="string",JSONPath=".status.phase" +kubebuilder:printcolumn:name="Total TFlops",type="string",JSONPath=".status.capacity.tflops" +kubebuilder:printcolumn:name="Total VRAM",type="string",JSONPath=".status.capacity.vram" +kubebuilder:printcolumn:name="Available TFlops",type="string",JSONPath=".status.available.tflops" +kubebuilder:printcolumn:name="Available VRAM",type="string",JSONPath=".status.available.vram" +kubebuilder:printcolumn:name="Device UUID",type="string",JSONPath=".status.uuid" +kubebuilder:printcolumn:name="Used By",type="string",JSONPath=".status.usedBy" +kubebuilder:printcolumn:name="Node",type="string",JSONPath=".status.nodeSelector" GPU is the Schema for the gpus API.
func (*GPU) DeepCopy ¶
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPU.
func (*GPU) DeepCopyInto ¶
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (*GPU) DeepCopyObject ¶
DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
type GPUAllocationInfo ¶
type GPUAllocationInfo struct {
Request Resource `json:"request,omitempty"`
Limit Resource `json:"limit,omitempty"`
PodName string `json:"podName,omitempty"`
PodUID string `json:"podUID,omitempty"`
Namespace string `json:"namespace,omitempty"`
}
func (*GPUAllocationInfo) DeepCopy ¶
func (in *GPUAllocationInfo) DeepCopy() *GPUAllocationInfo
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUAllocationInfo.
func (*GPUAllocationInfo) DeepCopyInto ¶
func (in *GPUAllocationInfo) DeepCopyInto(out *GPUAllocationInfo)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type GPUFilter ¶
type GPUFilter struct {
Type string `json:"type,omitempty"`
Params runtime.RawExtension `json:"params,omitempty"`
}
GPUFilter selects eligible GPUs for scheduling.
example:
```yaml
- type: avoidTooMuchConnectionsOnSameGPU
  params:
    connectionNum: 150
- type: avoidDifferentZone
  params:
    # by default, the GPU worker will be scheduled into the same zone as the CPU client Pod, to align AZ and improve performance
    topologyKey: topology.kubernetes.io/zone
```
func (*GPUFilter) DeepCopy ¶
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUFilter.
func (*GPUFilter) DeepCopyInto ¶
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type GPUList ¶
type GPUList struct {
metav1.TypeMeta `json:",inline"`
metav1.ListMeta `json:"metadata,omitempty"`
Items []GPU `json:"items"`
}
GPUList contains a list of GPU.
func (*GPUList) DeepCopy ¶
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUList.
func (*GPUList) DeepCopyInto ¶
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (*GPUList) DeepCopyObject ¶
DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
type GPUNode ¶
type GPUNode struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata,omitempty"`
Spec GPUNodeSpec `json:"spec,omitempty"`
Status GPUNodeStatus `json:"status,omitempty"`
}
+kubebuilder:object:root=true +kubebuilder:subresource:status +kubebuilder:resource:scope=Cluster +kubebuilder:printcolumn:name="Phase",type="string",JSONPath=".status.phase" +kubebuilder:printcolumn:name="Total TFlops",type="string",JSONPath=".status.totalTFlops" +kubebuilder:printcolumn:name="Total VRAM",type="string",JSONPath=".status.totalVRAM" +kubebuilder:printcolumn:name="Virtual TFlops",type="string",JSONPath=".status.virtualTFlops" +kubebuilder:printcolumn:name="Virtual VRAM",type="string",JSONPath=".status.virtualVRAM" +kubebuilder:printcolumn:name="Available TFlops",type="string",JSONPath=".status.availableTFlops" +kubebuilder:printcolumn:name="Available VRAM",type="string",JSONPath=".status.availableVRAM" +kubebuilder:printcolumn:name="GPU Count",type="integer",JSONPath=".status.totalGPUs" GPUNode is the Schema for the gpunodes API.
func (*GPUNode) DeepCopy ¶
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNode.
func (*GPUNode) DeepCopyInto ¶
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (*GPUNode) DeepCopyObject ¶
DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
type GPUNodeClaim ¶
type GPUNodeClaim struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata"`
Spec GPUNodeClaimSpec `json:"spec,omitempty"`
Status GPUNodeClaimStatus `json:"status,omitempty"`
}
GPUNodeClaim is the Schema for the gpunodeclaims API.
func (*GPUNodeClaim) DeepCopy ¶
func (in *GPUNodeClaim) DeepCopy() *GPUNodeClaim
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNodeClaim.
func (*GPUNodeClaim) DeepCopyInto ¶
func (in *GPUNodeClaim) DeepCopyInto(out *GPUNodeClaim)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (*GPUNodeClaim) DeepCopyObject ¶
func (in *GPUNodeClaim) DeepCopyObject() runtime.Object
DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
type GPUNodeClaimList ¶
type GPUNodeClaimList struct {
metav1.TypeMeta `json:",inline"`
metav1.ListMeta `json:"metadata"`
Items []GPUNodeClaim `json:"items"`
}
GPUNodeClaimList contains a list of GPUNodeClaim.
func (*GPUNodeClaimList) DeepCopy ¶
func (in *GPUNodeClaimList) DeepCopy() *GPUNodeClaimList
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNodeClaimList.
func (*GPUNodeClaimList) DeepCopyInto ¶
func (in *GPUNodeClaimList) DeepCopyInto(out *GPUNodeClaimList)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (*GPUNodeClaimList) DeepCopyObject ¶
func (in *GPUNodeClaimList) DeepCopyObject() runtime.Object
DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
type GPUNodeClaimPhase ¶
type GPUNodeClaimPhase string
const ( GPUNodeClaimPending GPUNodeClaimPhase = "Pending" GPUNodeClaimCreating GPUNodeClaimPhase = "Creating" GPUNodeClaimBound GPUNodeClaimPhase = "Bound" )
type GPUNodeClaimSpec ¶
type GPUNodeClaimSpec struct {
NodeName string `json:"nodeName,omitempty"`
Region string `json:"region,omitempty"`
Zone string `json:"zone,omitempty"`
InstanceType string `json:"instanceType,omitempty"`
NodeClassRef GroupKindName `json:"nodeClassRef,omitempty"`
CapacityType CapacityTypeEnum `json:"capacityType,omitempty"`
TFlopsOffered resource.Quantity `json:"tflopsOffered"`
VRAMOffered resource.Quantity `json:"vramOffered"`
GPUDeviceOffered int32 `json:"gpuDeviceOffered"`
ExtraParams map[string]string `json:"extraParams,omitempty"`
}
GPUNodeClaimSpec defines the desired state of GPUNodeClaim.
func (*GPUNodeClaimSpec) DeepCopy ¶
func (in *GPUNodeClaimSpec) DeepCopy() *GPUNodeClaimSpec
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNodeClaimSpec.
func (*GPUNodeClaimSpec) DeepCopyInto ¶
func (in *GPUNodeClaimSpec) DeepCopyInto(out *GPUNodeClaimSpec)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type GPUNodeClaimStatus ¶
type GPUNodeClaimStatus struct {
// +kubebuilder:default=Pending
Phase GPUNodeClaimPhase `json:"phase"`
InstanceID string `json:"instanceID,omitempty"`
}
GPUNodeClaimStatus defines the observed state of GPUNodeClaim.
func (*GPUNodeClaimStatus) DeepCopy ¶
func (in *GPUNodeClaimStatus) DeepCopy() *GPUNodeClaimStatus
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNodeClaimStatus.
func (*GPUNodeClaimStatus) DeepCopyInto ¶
func (in *GPUNodeClaimStatus) DeepCopyInto(out *GPUNodeClaimStatus)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type GPUNodeClass ¶
type GPUNodeClass struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata,omitempty"`
Spec GPUNodeClassSpec `json:"spec,omitempty"`
Status GPUNodeClassStatus `json:"status,omitempty"`
}
+kubebuilder:object:root=true +kubebuilder:subresource:status +kubebuilder:resource:scope=Cluster GPUNodeClass is the Schema for the gpunodeclasses API.
func (*GPUNodeClass) DeepCopy ¶
func (in *GPUNodeClass) DeepCopy() *GPUNodeClass
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNodeClass.
func (*GPUNodeClass) DeepCopyInto ¶
func (in *GPUNodeClass) DeepCopyInto(out *GPUNodeClass)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (*GPUNodeClass) DeepCopyObject ¶
func (in *GPUNodeClass) DeepCopyObject() runtime.Object
DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
type GPUNodeClassList ¶
type GPUNodeClassList struct {
metav1.TypeMeta `json:",inline"`
metav1.ListMeta `json:"metadata,omitempty"`
Items []GPUNodeClass `json:"items"`
}
GPUNodeClassList contains a list of GPUNodeClass.
func (*GPUNodeClassList) DeepCopy ¶
func (in *GPUNodeClassList) DeepCopy() *GPUNodeClassList
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNodeClassList.
func (*GPUNodeClassList) DeepCopyInto ¶
func (in *GPUNodeClassList) DeepCopyInto(out *GPUNodeClassList)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (*GPUNodeClassList) DeepCopyObject ¶
func (in *GPUNodeClassList) DeepCopyObject() runtime.Object
DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
type GPUNodeClassSpec ¶
type GPUNodeClassSpec struct {
// +optional
// The launch template to use for VM instances; if set, all other fields may be skipped
LaunchTemplate NodeClassItemSelectorTerms `json:"launchTemplate"`
// +optional
// Can be Private or Public (varies by cloud vendor); defines where to query the OS image ID
// +kubebuilder:default="Private"
OSImageType OSImageTypeEnum `json:"osImageType,omitempty"`
// The OS image identifier string; the first term is used by default, falling back to the others if it is not found
OSImageSelectorTerms []NodeClassItemSelectorTerms `json:"osImageSelectorTerms,omitempty"`
// +optional
// The instance profile to use, assign IAM role and permissions for EC2 instances
InstanceProfile string `json:"instanceProfile,omitempty"`
// +optional
// for AWS only, IMDSv2 metadata service options
MetadataOptions *NodeClassMetadataOptions `json:"metadataOptions,omitempty"`
// +optional
SecurityGroupSelectorTerms []NodeClassItemSelectorTerms `json:"securityGroupSelectorTerms,omitempty"`
// +optional
SubnetSelectorTerms []NodeClassItemSelectorTerms `json:"subnetSelectorTerms,omitempty"` // Terms to select subnets
// +optional
BlockDeviceMappings []NodeClassBlockDeviceMappings `json:"blockDeviceMappings,omitempty"` // Block device mappings for the instance
// +optional
Tags map[string]string `json:"tags,omitempty"` // Tags associated with the resource
// +optional
UserData string `json:"userData,omitempty"` // User data script for the instance
}
GPUNodeClassSpec defines the desired state of GPUNodeClass.
func (*GPUNodeClassSpec) DeepCopy ¶
func (in *GPUNodeClassSpec) DeepCopy() *GPUNodeClassSpec
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNodeClassSpec.
func (*GPUNodeClassSpec) DeepCopyInto ¶
func (in *GPUNodeClassSpec) DeepCopyInto(out *GPUNodeClassSpec)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type GPUNodeClassStatus ¶
type GPUNodeClassStatus struct {
}
GPUNodeClassStatus defines the observed state of GPUNodeClass.
func (*GPUNodeClassStatus) DeepCopy ¶
func (in *GPUNodeClassStatus) DeepCopy() *GPUNodeClassStatus
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNodeClassStatus.
func (*GPUNodeClassStatus) DeepCopyInto ¶
func (in *GPUNodeClassStatus) DeepCopyInto(out *GPUNodeClassStatus)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type GPUNodeInfo ¶
type GPUNodeInfo struct {
// Additional space for L1/L2 VRAM buffer
RAMSize resource.Quantity `json:"ramSize,omitempty"`
DataDiskSize resource.Quantity `json:"dataDiskSize,omitempty"`
}
func (*GPUNodeInfo) DeepCopy ¶
func (in *GPUNodeInfo) DeepCopy() *GPUNodeInfo
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNodeInfo.
func (*GPUNodeInfo) DeepCopyInto ¶
func (in *GPUNodeInfo) DeepCopyInto(out *GPUNodeInfo)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type GPUNodeList ¶
type GPUNodeList struct {
metav1.TypeMeta `json:",inline"`
metav1.ListMeta `json:"metadata,omitempty"`
Items []GPUNode `json:"items"`
}
GPUNodeList contains a list of GPUNode.
func (*GPUNodeList) DeepCopy ¶
func (in *GPUNodeList) DeepCopy() *GPUNodeList
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNodeList.
func (*GPUNodeList) DeepCopyInto ¶
func (in *GPUNodeList) DeepCopyInto(out *GPUNodeList)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (*GPUNodeList) DeepCopyObject ¶
func (in *GPUNodeList) DeepCopyObject() runtime.Object
DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
type GPUNodeManageMode ¶
type GPUNodeManageMode string
+kubebuilder:validation:Enum=Manual;AutoSelect;Provisioned
const ( GPUNodeManageModeManual GPUNodeManageMode = "Manual" GPUNodeManageModeAutoSelect GPUNodeManageMode = "AutoSelect" GPUNodeManageModeProvisioned GPUNodeManageMode = "Provisioned" )
type GPUNodeSpec ¶
type GPUNodeSpec struct {
// +kubebuilder:default=AutoSelect
ManageMode GPUNodeManageMode `json:"manageMode,omitempty"`
// +optional
CostPerHour string `json:"costPerHour,omitempty"`
// If not all GPU cards should be used, specify the GPU card indices;
// defaults to empty, which onboards all GPU cards to the pool
// +optional
GPUCardIndices []int `json:"gpuCardIndices,omitempty"`
// +optional
CloudVendorParam string `json:"cloudVendorParam,omitempty"`
}
GPUNodeSpec defines the desired state of GPUNode.
func (*GPUNodeSpec) DeepCopy ¶
func (in *GPUNodeSpec) DeepCopy() *GPUNodeSpec
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNodeSpec.
func (*GPUNodeSpec) DeepCopyInto ¶
func (in *GPUNodeSpec) DeepCopyInto(out *GPUNodeSpec)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type GPUNodeStatus ¶
type GPUNodeStatus struct {
// +kubebuilder:default=Pending
Phase TensorFusionGPUNodePhase `json:"phase"`
// +optional
Conditions []metav1.Condition `json:"conditions,omitempty"`
TotalTFlops resource.Quantity `json:"totalTFlops"`
TotalVRAM resource.Quantity `json:"totalVRAM"`
// +optional
VirtualTFlops resource.Quantity `json:"virtualTFlops,omitempty"`
// +optional
VirtualVRAM resource.Quantity `json:"virtualVRAM,omitempty"`
// +optional
AvailableTFlops resource.Quantity `json:"availableTFlops,omitempty"`
// +optional
AvailableVRAM resource.Quantity `json:"availableVRAM,omitempty"`
// +optional
VirtualAvailableTFlops *resource.Quantity `json:"virtualAvailableTFlops,omitempty"`
// +optional
VirtualAvailableVRAM *resource.Quantity `json:"virtualAvailableVRAM,omitempty"`
// +optional
HypervisorStatus NodeHypervisorStatus `json:"hypervisorStatus,omitempty"`
// +optional
NodeInfo GPUNodeInfo `json:"nodeInfo,omitempty"`
// +optional
LoadedModels *[]string `json:"loadedModels,omitempty"`
TotalGPUs int32 `json:"totalGPUs"`
ManagedGPUs int32 `json:"managedGPUs"`
// +optional
ManagedGPUDeviceIDs []string `json:"managedGPUDeviceIDs,omitempty"`
ObservedGeneration int64 `json:"observedGeneration,omitempty"`
// +optional
TotalGPUPods int32 `json:"totalGPUPods,omitempty"`
// +optional
AllocatedPods map[string][]*PodGPUInfo `json:"allocatedPods,omitempty"`
}
GPUNodeStatus defines the observed state of GPUNode.
func (*GPUNodeStatus) DeepCopy ¶
func (in *GPUNodeStatus) DeepCopy() *GPUNodeStatus
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNodeStatus.
func (*GPUNodeStatus) DeepCopyInto ¶
func (in *GPUNodeStatus) DeepCopyInto(out *GPUNodeStatus)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type GPUOrCPUResourceUnit ¶
type GPUOrCPUResourceUnit struct {
TFlops resource.Quantity `json:"tflops,omitempty"`
VRAM resource.Quantity `json:"vram,omitempty"`
// CPU/Memory is only available when CloudVendor connection is enabled
// +optional
CPU resource.Quantity `json:"cpu,omitempty"`
// +optional
Memory resource.Quantity `json:"memory,omitempty"`
}
func (*GPUOrCPUResourceUnit) DeepCopy ¶
func (in *GPUOrCPUResourceUnit) DeepCopy() *GPUOrCPUResourceUnit
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUOrCPUResourceUnit.
func (*GPUOrCPUResourceUnit) DeepCopyInto ¶
func (in *GPUOrCPUResourceUnit) DeepCopyInto(out *GPUOrCPUResourceUnit)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type GPUPool ¶
type GPUPool struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata,omitempty"`
Spec GPUPoolSpec `json:"spec,omitempty"`
Status GPUPoolStatus `json:"status,omitempty"`
}
+kubebuilder:printcolumn:name="Phase",type="string",JSONPath=".status.phase" +kubebuilder:printcolumn:name="TFlops Oversubscription",type="string",JSONPath=".spec.capacityConfig.oversubscription.tflopsOversellRatio" +kubebuilder:printcolumn:name="Mode",type="string",JSONPath=".status.mode" +kubebuilder:printcolumn:name="Default Scheduling Strategy",type="string",JSONPath=".spec.schedulingConfigTemplate" +kubebuilder:printcolumn:name="Total Nodes",type="string",JSONPath=".status.totalNodes" +kubebuilder:printcolumn:name="Total GPU",type="string",JSONPath=".status.totalGPUs" +kubebuilder:printcolumn:name="Total Tflops",type="string",JSONPath=".status.totalTFlops" +kubebuilder:printcolumn:name="Total VRAM",type="string",JSONPath=".status.totalVRAM" +kubebuilder:printcolumn:name="Available Tflops",type="string",JSONPath=".status.availableTFlops" +kubebuilder:printcolumn:name="Available VRAM",type="string",JSONPath=".status.availableVRAM"
func (*GPUPool) DeepCopy ¶
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUPool.
func (*GPUPool) DeepCopyInto ¶
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (*GPUPool) DeepCopyObject ¶
DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
type GPUPoolDefinition ¶
type GPUPoolDefinition struct {
Name string `json:"name,omitempty"` // Name of the GPU pool.
IsDefault bool `json:"isDefault,omitempty"`
SpecTemplate GPUPoolSpec `json:"specTemplate"`
}
GPUPoolDefinition defines how to create a GPU pool; it can be specified via URL or inline.
func (*GPUPoolDefinition) DeepCopy ¶
func (in *GPUPoolDefinition) DeepCopy() *GPUPoolDefinition
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUPoolDefinition.
func (*GPUPoolDefinition) DeepCopyInto ¶
func (in *GPUPoolDefinition) DeepCopyInto(out *GPUPoolDefinition)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type GPUPoolList ¶
type GPUPoolList struct {
metav1.TypeMeta `json:",inline"`
metav1.ListMeta `json:"metadata,omitempty"`
Items []GPUPool `json:"items"`
}
GPUPoolList contains a list of GPUPool.
func (*GPUPoolList) DeepCopy ¶
func (in *GPUPoolList) DeepCopy() *GPUPoolList
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUPoolList.
func (*GPUPoolList) DeepCopyInto ¶
func (in *GPUPoolList) DeepCopyInto(out *GPUPoolList)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (*GPUPoolList) DeepCopyObject ¶
func (in *GPUPoolList) DeepCopyObject() runtime.Object
DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
type GPUPoolSpec ¶
type GPUPoolSpec struct {
// +optional
DefaultUsingLocalGPU *bool `json:"defaultUsingLocalGPU,omitempty"`
CapacityConfig *CapacityConfig `json:"capacityConfig,omitempty"`
// +required
NodeManagerConfig *NodeManagerConfig `json:"nodeManagerConfig,omitempty"`
// +optional
QosConfig *QosConfig `json:"qosConfig,omitempty"`
// +optional
ComponentConfig *ComponentConfig `json:"componentConfig,omitempty"`
// +optional
SchedulingConfigTemplate *string `json:"schedulingConfigTemplate,omitempty"`
}
GPUPoolSpec defines the desired state of GPUPool.
func (*GPUPoolSpec) DeepCopy ¶
func (in *GPUPoolSpec) DeepCopy() *GPUPoolSpec
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUPoolSpec.
func (*GPUPoolSpec) DeepCopyInto ¶
func (in *GPUPoolSpec) DeepCopyInto(out *GPUPoolSpec)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type GPUPoolStatus ¶
type GPUPoolStatus struct {
Cluster string `json:"cluster,omitempty"`
// +kubebuilder:default=Pending
Phase TensorFusionPoolPhase `json:"phase"`
Conditions []metav1.Condition `json:"conditions,omitempty"`
TotalNodes int32 `json:"totalNodes,omitempty"`
TotalGPUs int32 `json:"totalGPUs,omitempty"`
ReadyNodes int32 `json:"readyNodes"`
NotReadyNodes int32 `json:"notReadyNodes"`
TotalTFlops resource.Quantity `json:"totalTFlops"`
TotalVRAM resource.Quantity `json:"totalVRAM"`
VirtualTFlops resource.Quantity `json:"virtualTFlops"`
VirtualVRAM resource.Quantity `json:"virtualVRAM"`
AvailableTFlops resource.Quantity `json:"availableTFlops"`
AvailableVRAM resource.Quantity `json:"availableVRAM"`
RunningAppsCnt int32 `json:"runningAppsCnt,omitempty"`
// +optional
VirtualAvailableTFlops *resource.Quantity `json:"virtualAvailableTFlops,omitempty"`
// +optional
VirtualAvailableVRAM *resource.Quantity `json:"virtualAvailableVRAM,omitempty"`
// when updating any component version or config, pool controller will perform rolling update.
// the status will be updated periodically, default to 5s, progress will be 0-100.
// when the progress is 100, the component version or config is fully updated.
ComponentStatus PoolComponentStatus `json:"componentStatus"`
// TODO: aggregated with interval
SavedCostsPerMonth string `json:"savedCostsPerMonth,omitempty"`
PotentialSavingsPerMonth string `json:"potentialSavingsPerMonth,omitempty"`
// +kubebuilder:default=""
// If the budget is exceeded, this is set to a comma-separated string indicating which period(s) caused the exceeding.
// If this field is not empty, scheduler will not schedule new AI workloads and stop scaling-up check.
// TODO not implemented yet
BudgetExceeded string `json:"budgetExceeded,omitempty"`
// +optional
// +kubebuilder:default="None"
ProvisioningPhase ProvisioningPhase `json:"provisioningPhase,omitempty"`
// +optional
LastCompactionTime *metav1.Time `json:"lastCompactionTime,omitempty"`
}
GPUPoolStatus defines the observed state of GPUPool.
func (*GPUPoolStatus) DeepCopy ¶
func (in *GPUPoolStatus) DeepCopy() *GPUPoolStatus
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUPoolStatus.
func (*GPUPoolStatus) DeepCopyInto ¶
func (in *GPUPoolStatus) DeepCopyInto(out *GPUPoolStatus)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type GPUResourceAvailablePercent ¶
type GPUResourceAvailablePercent struct {
// +optional
RequestsTFlops string `json:"requests.tflops,omitempty"`
// +optional
RequestsVRAM string `json:"requests.vram,omitempty"`
// +optional
LimitsTFlops string `json:"limits.tflops,omitempty"`
// +optional
LimitsVRAM string `json:"limits.vram,omitempty"`
// +optional
Workers string `json:"workers,omitempty"`
}
GPUResourceAvailablePercent defines the available percentage for each resource. String fields hold floats rounded to two decimals, because Kubernetes resource fields cannot store float values.
func (*GPUResourceAvailablePercent) DeepCopy ¶
func (in *GPUResourceAvailablePercent) DeepCopy() *GPUResourceAvailablePercent
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUResourceAvailablePercent.
func (*GPUResourceAvailablePercent) DeepCopyInto ¶
func (in *GPUResourceAvailablePercent) DeepCopyInto(out *GPUResourceAvailablePercent)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type GPUResourcePricingUnit ¶
type GPUResourcePricingUnit struct {
// +kubebuilder:default="$0.0069228"
PerFP16TFlopsPerHour string `json:"perFP16TFlopsPerHour,omitempty"`
// +kubebuilder:default="$0.01548"
PerGBOfVRAMPerHour string `json:"perGBOfVRAMPerHour,omitempty"`
}
The default pricing is based on the per-second pricing from https://modal.com/pricing, with a Tensor/CUDA Core : HBM cost ratio of 2:1.
func (*GPUResourcePricingUnit) DeepCopy ¶
func (in *GPUResourcePricingUnit) DeepCopy() *GPUResourcePricingUnit
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUResourcePricingUnit.
func (*GPUResourcePricingUnit) DeepCopyInto ¶
func (in *GPUResourcePricingUnit) DeepCopyInto(out *GPUResourcePricingUnit)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type GPUResourceQuota ¶
type GPUResourceQuota struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata,omitempty"`
Spec GPUResourceQuotaSpec `json:"spec,omitempty"`
Status GPUResourceQuotaStatus `json:"status,omitempty"`
}
GPUResourceQuota is the Schema for the gpuresourcequotas API
func (*GPUResourceQuota) DeepCopy ¶
func (in *GPUResourceQuota) DeepCopy() *GPUResourceQuota
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUResourceQuota.
func (*GPUResourceQuota) DeepCopyInto ¶
func (in *GPUResourceQuota) DeepCopyInto(out *GPUResourceQuota)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (*GPUResourceQuota) DeepCopyObject ¶
func (in *GPUResourceQuota) DeepCopyObject() runtime.Object
DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
type GPUResourceQuotaConditionType ¶
type GPUResourceQuotaConditionType string
GPUResourceQuotaConditionType defines the condition types for GPUResourceQuota
const ( // GPUResourceQuotaConditionReady indicates the quota is ready and functioning GPUResourceQuotaConditionReady GPUResourceQuotaConditionType = "Ready" // GPUResourceQuotaConditionAlertThresholdReached indicates the alert threshold has been reached GPUResourceQuotaConditionAlertThresholdReached GPUResourceQuotaConditionType = "AlertThresholdReached" )
type GPUResourceQuotaList ¶
type GPUResourceQuotaList struct {
metav1.TypeMeta `json:",inline"`
metav1.ListMeta `json:"metadata,omitempty"`
Items []GPUResourceQuota `json:"items"`
}
GPUResourceQuotaList contains a list of GPUResourceQuota
func (*GPUResourceQuotaList) DeepCopy ¶
func (in *GPUResourceQuotaList) DeepCopy() *GPUResourceQuotaList
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUResourceQuotaList.
func (*GPUResourceQuotaList) DeepCopyInto ¶
func (in *GPUResourceQuotaList) DeepCopyInto(out *GPUResourceQuotaList)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (*GPUResourceQuotaList) DeepCopyObject ¶
func (in *GPUResourceQuotaList) DeepCopyObject() runtime.Object
DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
type GPUResourceQuotaSingle ¶
type GPUResourceQuotaSingle struct {
// Maximum resources per workload
// +optional
MaxRequests *Resource `json:"maxRequests,omitempty"`
// +optional
MaxLimits *Resource `json:"maxLimits,omitempty"`
// +optional
MaxGPUCount *int32 `json:"maxGPUCount,omitempty"`
// Default requests applied to workloads without explicit requests
// +optional
DefaultRequests *Resource `json:"defaultRequests,omitempty"`
// Default limits applied to workloads without explicit limits
// +optional
DefaultLimits *Resource `json:"defaultLimits,omitempty"`
}
GPUResourceQuotaSingle defines per-workload limits
func (*GPUResourceQuotaSingle) DeepCopy ¶
func (in *GPUResourceQuotaSingle) DeepCopy() *GPUResourceQuotaSingle
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUResourceQuotaSingle.
func (*GPUResourceQuotaSingle) DeepCopyInto ¶
func (in *GPUResourceQuotaSingle) DeepCopyInto(out *GPUResourceQuotaSingle)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type GPUResourceQuotaSpec ¶
type GPUResourceQuotaSpec struct {
// Total namespace limits (similar to ResourceQuotas)
Total GPUResourceQuotaTotal `json:"total,omitempty"`
// Per-workload limits (similar to LimitRanges)
Single GPUResourceQuotaSingle `json:"single,omitempty"`
}
GPUResourceQuotaSpec defines the desired state of GPUResourceQuota
func (*GPUResourceQuotaSpec) DeepCopy ¶
func (in *GPUResourceQuotaSpec) DeepCopy() *GPUResourceQuotaSpec
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUResourceQuotaSpec.
func (*GPUResourceQuotaSpec) DeepCopyInto ¶
func (in *GPUResourceQuotaSpec) DeepCopyInto(out *GPUResourceQuotaSpec)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type GPUResourceQuotaStatus ¶
type GPUResourceQuotaStatus struct {
// Current resource usage in the namespace
Used GPUResourceUsage `json:"used,omitempty"`
// Available percentage for each resource type
AvailablePercent GPUResourceAvailablePercent `json:"availablePercent,omitempty"`
// Conditions represent the latest available observations of the quota's state
// +optional
Conditions []metav1.Condition `json:"conditions,omitempty"`
// LastUpdateTime is the last time the status was updated
// +optional
LastUpdateTime *metav1.Time `json:"lastUpdateTime,omitempty"`
}
GPUResourceQuotaStatus defines the observed state of GPUResourceQuota
func (*GPUResourceQuotaStatus) DeepCopy ¶
func (in *GPUResourceQuotaStatus) DeepCopy() *GPUResourceQuotaStatus
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUResourceQuotaStatus.
func (*GPUResourceQuotaStatus) DeepCopyInto ¶
func (in *GPUResourceQuotaStatus) DeepCopyInto(out *GPUResourceQuotaStatus)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type GPUResourceQuotaTotal ¶
type GPUResourceQuotaTotal struct {
// Total requests limits for the namespace
// +optional
Requests *Resource `json:"requests,omitempty"`
// Total limits for the namespace
// +optional
Limits *Resource `json:"limits,omitempty"`
// Maximum number of workers in the namespace
// +optional
// +kubebuilder:default=32768
MaxWorkers *int32 `json:"maxWorkers,omitempty"`
// Alert threshold percentage (0-100)
// When usage exceeds this percentage, an alert event will be triggered
// +kubebuilder:validation:Minimum=0
// +kubebuilder:validation:Maximum=100
// +kubebuilder:default=95
// +optional
AlertThresholdPercent *int32 `json:"alertThresholdPercent,omitempty"`
}
GPUResourceQuotaTotal defines total namespace limits
func (*GPUResourceQuotaTotal) DeepCopy ¶
func (in *GPUResourceQuotaTotal) DeepCopy() *GPUResourceQuotaTotal
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUResourceQuotaTotal.
func (*GPUResourceQuotaTotal) DeepCopyInto ¶
func (in *GPUResourceQuotaTotal) DeepCopyInto(out *GPUResourceQuotaTotal)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type GPUResourceUnit ¶
type GPUResourceUnit struct {
// Tera floating point operations per second
TFlops resource.Quantity `json:"tflops,omitempty"`
// VRAM is short for Video memory, namely GPU RAM
VRAM resource.Quantity `json:"vram,omitempty"`
}
func (*GPUResourceUnit) DeepCopy ¶
func (in *GPUResourceUnit) DeepCopy() *GPUResourceUnit
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUResourceUnit.
func (*GPUResourceUnit) DeepCopyInto ¶
func (in *GPUResourceUnit) DeepCopyInto(out *GPUResourceUnit)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type GPUResourceUsage ¶
// GPUResourceUsage defines current resource usage observed against a quota.
type GPUResourceUsage struct {
	// Current requests usage
	// +optional
	Requests Resource `json:"requests,omitempty"`
	// Current limits usage
	// +optional
	Limits Resource `json:"limits,omitempty"`
	// Current number of workers
	// +optional
	Workers int32 `json:"workers,omitempty"`
}
GPUResourceUsage defines current resource usage
func (*GPUResourceUsage) DeepCopy ¶
func (in *GPUResourceUsage) DeepCopy() *GPUResourceUsage
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUResourceUsage.
func (*GPUResourceUsage) DeepCopyInto ¶
func (in *GPUResourceUsage) DeepCopyInto(out *GPUResourceUsage)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type GPUStatus ¶
// GPUStatus defines the observed state of GPU.
// NOTE: When new fields are added, remember to update
// syncGPUMetadataAndStatusFromCluster accordingly.
type GPUStatus struct {
	// Lifecycle phase of this GPU device.
	// +kubebuilder:default=Pending
	Phase TensorFusionGPUPhase `json:"phase"`
	// Hardware vendor of this GPU (e.g. "NVIDIA").
	// +kubebuilder:default="NVIDIA"
	Vendor string `json:"vendor"`
	// Total resources of the device.
	Capacity *Resource `json:"capacity"`
	// Resources still available for allocation.
	Available *Resource `json:"available"`
	// Unique identifier of the GPU device.
	UUID string `json:"uuid"`
	// Isolation mode applied to workloads sharing this GPU.
	// +optional
	// +kubebuilder:default=soft
	IsolationMode IsolationModeType `json:"isolationMode,omitempty"`
	// Device index; presumably the ordinal on the host — TODO confirm.
	// +optional
	Index *int32 `json:"index,omitempty"`
	// When it's -1, it means the GPU is not assigned to any NUMA node
	// +optional
	NUMANode *int32 `json:"numaNode,omitempty"`
	// The host match selector to schedule worker pods
	NodeSelector map[string]string `json:"nodeSelector"`
	// Short GPU model identifier.
	GPUModel string `json:"gpuModel"`
	// GPU is used by tensor-fusion or nvidia-operator.
	// This is the key to be compatible with nvidia-device-plugin to avoid resource overlap.
	// Hypervisor will watch kubelet device plugin to report all GPUs already used by nvidia-device-plugin.
	// GPUs will be grouped by usedBy to be used by different Pods,
	// tensor-fusion annotation or nvidia-device-plugin resource block.
	// +optional
	UsedBy UsedBySystem `json:"usedBy,omitempty"`
	// Human-readable status message.
	Message string `json:"message"`
	// Details of apps currently running on this GPU.
	// +optional
	RunningApps []*RunningAppDetail `json:"runningApps,omitempty"`
	// +optional
	// AllocatedPartitions tracks allocated partitions on this GPU.
	// Key is partitionUUID, value contains template info and allocated resources.
	AllocatedPartitions map[string]AllocatedPartition `json:"allocatedPartitions,omitempty"`
}
GPUStatus defines the observed state of GPU. NOTE: When new fields added, remember to update syncGPUMetadataAndStatusFromCluster
func (*GPUStatus) DeepCopy ¶
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUStatus.
func (*GPUStatus) DeepCopyInto ¶
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type GangSchedulingConfig ¶ added in v0.1.1
// GangSchedulingConfig defines gang scheduling configuration for a workload.
type GangSchedulingConfig struct {
	// MinMembers specifies the minimum number of pods that must be scheduled together.
	// When > 0, gang scheduling is enabled for this workload.
	// The gang group name is automatically derived from the workload name (namespace/workload-name).
	// +kubebuilder:validation:Minimum=0
	MinMembers int32 `json:"minMembers,omitempty"`
	// Timeout specifies how long to wait for all gang members to be schedulable.
	// If not set or set to "0s", wait indefinitely until resources are available.
	// Example values: "5m", "10m", "1h"
	// +optional
	Timeout *metav1.Duration `json:"timeout,omitempty"`
}
GangSchedulingConfig defines gang scheduling configuration
func (*GangSchedulingConfig) DeepCopy ¶ added in v0.1.1
func (in *GangSchedulingConfig) DeepCopy() *GangSchedulingConfig
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GangSchedulingConfig.
func (*GangSchedulingConfig) DeepCopyInto ¶ added in v0.1.1
func (in *GangSchedulingConfig) DeepCopyInto(out *GangSchedulingConfig)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type GroupKindName ¶
// GroupKindName identifies a Kubernetes object by its API group, version,
// kind and name (similar to an object reference without a namespace).
type GroupKindName struct {
	Group   string `json:"group"`
	Kind    string `json:"kind"`
	Version string `json:"version"`
	Name    string `json:"name"`
}
func (*GroupKindName) DeepCopy ¶
func (in *GroupKindName) DeepCopy() *GroupKindName
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GroupKindName.
func (*GroupKindName) DeepCopyInto ¶
func (in *GroupKindName) DeepCopyInto(out *GroupKindName)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type HardwareModelInfo ¶ added in v0.1.1
// HardwareModelInfo contains information about a specific GPU/accelerator model.
type HardwareModelInfo struct {
	// Model is the short model identifier (e.g., "A100_SXM_80G", "RTX4090")
	// +kubebuilder:validation:Required
	Model string `json:"model"`
	// FullModelName is the full human-readable name (e.g., "NVIDIA A100-SXM4-80GB")
	// +kubebuilder:validation:Required
	FullModelName string `json:"fullModelName"`
	// CostPerHour is the average cost per hour for this GPU model
	// +optional
	CostPerHour string `json:"costPerHour,omitempty"`
	// Fp16TFlops is the FP16 performance in TFlops
	// +kubebuilder:validation:Required
	Fp16TFlops resource.Quantity `json:"fp16TFlops"`
	// MaxPartitions is the maximum number of partitions this GPU can support
	// +optional
	MaxPartitions uint32 `json:"maxPartitions,omitempty"`
	// MaxPlacementSlots is the maximum number of placement slots (e.g., 8 for NVIDIA MIG)
	// +optional
	MaxPlacementSlots uint32 `json:"maxPlacementSlots,omitempty"`
	// MaxIsolationGroups is the maximum number of isolation groups (e.g., 4 for Ascend vGroups)
	// +optional
	MaxIsolationGroups uint32 `json:"maxIsolationGroups,omitempty"`
	// TotalExtendedResources defines the total capacity of extended resources.
	// For Ascend NPU: {"AICORE": 8, "AICPU": 7, "VPC": 12, ...}
	// +optional
	TotalExtendedResources map[string]uint32 `json:"totalExtendedResources,omitempty"`
	// PartitionTemplateRefs contains references to virtualization templates.
	// Use template IDs defined in VirtualizationTemplates.
	// +optional
	PartitionTemplateRefs []string `json:"partitionTemplateRefs,omitempty"`
}
HardwareModelInfo contains information about a specific GPU/accelerator model
func (*HardwareModelInfo) DeepCopy ¶ added in v0.1.1
func (in *HardwareModelInfo) DeepCopy() *HardwareModelInfo
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new HardwareModelInfo.
func (*HardwareModelInfo) DeepCopyInto ¶ added in v0.1.1
func (in *HardwareModelInfo) DeepCopyInto(out *HardwareModelInfo)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type HypervisorConfig ¶
// HypervisorConfig configures the per-node hypervisor component of a GPU pool.
type HypervisorConfig struct {
	// Image is the default hypervisor image, used when no ProviderConfig is found for the vendor.
	// When ProviderConfig exists, middleware image from ProviderConfig takes precedence.
	Image string `json:"image,omitempty"`
	// Image for the Vector sidecar; presumably a log/metrics shipper — TODO confirm.
	VectorImage string `json:"vectorImage,omitempty"`
	// Port the hypervisor listens on.
	// +kubebuilder:default=8000
	// +kubebuilder:validation:Minimum=0
	// +kubebuilder:validation:Maximum=65535
	// +optional
	PortNumber *int32 `json:"portNumber,omitempty"`
	// Raw pod template overlay applied to the hypervisor pod.
	// +optional
	PodTemplate *runtime.RawExtension `json:"podTemplate,omitempty"`
	// Whether to deploy the Vector sidecar alongside the hypervisor.
	// +optional
	EnableVector bool `json:"enableVector,omitempty"`
}
func (*HypervisorConfig) DeepCopy ¶
func (in *HypervisorConfig) DeepCopy() *HypervisorConfig
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new HypervisorConfig.
func (*HypervisorConfig) DeepCopyInto ¶
func (in *HypervisorConfig) DeepCopyInto(out *HypervisorConfig)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type HypervisorScheduling ¶
// HypervisorScheduling tunes how the hypervisor schedules and throttles
// workloads that share a GPU.
type HypervisorScheduling struct {
	// Additional layer to save VRAM: auto-freeze memory and cool down to RAM and Disk.
	// Hypervisor will monitor and trigger freeze of inactive workers; Operator should
	// mark them as scaled-to-zero and release the GPU pool resources. It does not scale
	// down the CPU client part, so workers can continue to serve traffic or be scaled
	// down by other auto-scaling solutions like KEDA/KNative.
	AutoFreezeAndResume AutoFreezeAndResume `json:"autoFreezeAndResume,omitempty"`
	// Hypervisor will move low priority jobs to pending queue if GPU is full.
	// This config can adjust hypervisor's queueing behavior to balance the co-scheduling CUDA calls.
	ElasticRateLimitParameters ElasticRateLimitParameters `json:"elasticRateLimitParameters,omitempty"`
	// +optional
	// For differentiating QoS levels, ensure critical and high QoS workloads on the
	// same GPU card get more computing resources.
	MultiProcessQueuingParameters MultiProcessQueuingParameters `json:"multiProcessQueuingParameters,omitempty"`
}
func (*HypervisorScheduling) DeepCopy ¶
func (in *HypervisorScheduling) DeepCopy() *HypervisorScheduling
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new HypervisorScheduling.
func (*HypervisorScheduling) DeepCopyInto ¶
func (in *HypervisorScheduling) DeepCopyInto(out *HypervisorScheduling)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type IsolationModeType ¶
// IsolationModeType selects how workloads sharing a GPU are isolated from each other.
// +kubebuilder:validation:Enum=shared;soft;hard;partitioned
type IsolationModeType string
type MaintenanceWindow ¶
// MaintenanceWindow defines the allowed time windows for disruptive operations
// (referenced by NodeRollingUpdatePolicy).
type MaintenanceWindow struct {
	// Included time windows, each expressed in crontab syntax.
	Includes []string `json:"includes,omitempty"`
}
func (*MaintenanceWindow) DeepCopy ¶
func (in *MaintenanceWindow) DeepCopy() *MaintenanceWindow
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MaintenanceWindow.
func (*MaintenanceWindow) DeepCopyInto ¶
func (in *MaintenanceWindow) DeepCopyInto(out *MaintenanceWindow)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type MultiProcessQueuingParameters ¶
// MultiProcessQueuingParameters tunes per-QoS throttling of processes that
// share a GPU: scale down under contention, scale back up when utilization drops.
type MultiProcessQueuingParameters struct {
	// Condition for triggering scale down when usage is above ComputingThresholdForPreempt
	ComputingThresholdForPreempt string `json:"computingThresholdForPreempt,omitempty"`
	// How long the preempt condition must hold before scale down triggers.
	TriggerPreemptDuration string `json:"triggerPreemptDuration,omitempty"`
	// Condition for triggering scale up when usage is below ComputingThresholdForResume
	ComputingThresholdForResume string `json:"computingThresholdForResume,omitempty"`
	// How long the resume condition must hold before scale up triggers.
	TriggerResumeDuration string `json:"triggerResumeDuration,omitempty"`
	// Coefficients for scale down when resource contention happens, per QoS level.
	CoefficientLow string `json:"coefficientLow,omitempty"`
	CoefficientMedium string `json:"coefficientMedium,omitempty"`
	CoefficientHigh string `json:"coefficientHigh,omitempty"`
	// When avg utilization < ComputingThresholdForResume and lasts for more than TriggerResumeDuration,
	// use the following formula to scale up:
	// Case #1 If all processes have the same QoS level, and cur_limit <= limit, fast resume to limit
	// Case #2 Else, Max(limit, Min(cur_limit * 1/Coefficient * SlowStartRatio, cur_limit * 1.2))
	SlowStartRatio string `json:"slowStartRatio,omitempty"`
}
func (*MultiProcessQueuingParameters) DeepCopy ¶
func (in *MultiProcessQueuingParameters) DeepCopy() *MultiProcessQueuingParameters
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MultiProcessQueuingParameters.
func (*MultiProcessQueuingParameters) DeepCopyInto ¶
func (in *MultiProcessQueuingParameters) DeepCopyInto(out *MultiProcessQueuingParameters)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type NameNamespace ¶
// NameNamespace identifies a namespaced Kubernetes object by name and namespace.
type NameNamespace struct {
	Name      string `json:"name,omitempty"`
	Namespace string `json:"namespace,omitempty"`
}
func (*NameNamespace) DeepCopy ¶
func (in *NameNamespace) DeepCopy() *NameNamespace
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NameNamespace.
func (*NameNamespace) DeepCopyInto ¶
func (in *NameNamespace) DeepCopyInto(out *NameNamespace)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (NameNamespace) String ¶
func (n NameNamespace) String() string
type NodeClassBlockDeviceMappings ¶
// NodeClassBlockDeviceMappings maps a device name to its block device settings
// for provisioned nodes.
type NodeClassBlockDeviceMappings struct {
	// The device name for the block device
	// +optional
	DeviceName string `json:"deviceName,omitempty"`
	// Volume settings for this device (EBS on AWS; vendor equivalent elsewhere).
	EBS NodeClassBlockDeviceSettings `json:"ebs,omitempty"`
}
func (*NodeClassBlockDeviceMappings) DeepCopy ¶
func (in *NodeClassBlockDeviceMappings) DeepCopy() *NodeClassBlockDeviceMappings
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeClassBlockDeviceMappings.
func (*NodeClassBlockDeviceMappings) DeepCopyInto ¶
func (in *NodeClassBlockDeviceMappings) DeepCopyInto(out *NodeClassBlockDeviceMappings)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type NodeClassBlockDeviceSettings ¶
// NodeClassBlockDeviceSettings configures a single block device volume for
// provisioned nodes.
// NOTE(review): the bool fields below combine `omitempty` with
// `+kubebuilder:default=true`, so an explicit `false` is dropped on
// serialization and defaulted back to true — confirm this is intended.
type NodeClassBlockDeviceSettings struct {
	// Size of the volume (e.g. "100Gi").
	VolumeSize string `json:"volumeSize,omitempty"`
	// +optional
	// Default value varies based on the cloud vendor.
	// For AWS it's gp3, for Alicloud it's cloud_essd.
	VolumeType string `json:"volumeType,omitempty"`
	// Whether to delete the EBS volume on termination
	// +optional
	// +kubebuilder:default=true
	DeleteOnTermination bool `json:"deleteOnTermination,omitempty"`
	// Whether the EBS volume is encrypted
	// +optional
	// +kubebuilder:default=true
	Encrypted bool `json:"encrypted,omitempty"`
}
func (*NodeClassBlockDeviceSettings) DeepCopy ¶
func (in *NodeClassBlockDeviceSettings) DeepCopy() *NodeClassBlockDeviceSettings
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeClassBlockDeviceSettings.
func (*NodeClassBlockDeviceSettings) DeepCopyInto ¶
func (in *NodeClassBlockDeviceSettings) DeepCopyInto(out *NodeClassBlockDeviceSettings)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type NodeClassItemSelectorTerms ¶
// NodeClassItemSelectorTerms selects a cloud resource (image, subnet, etc.)
// by ID, by name, or by tags.
type NodeClassItemSelectorTerms struct {
	// The item ID
	// +optional
	ID string `json:"id,omitempty"`
	// The item name
	// +optional
	Name string `json:"name,omitempty"`
	// Query by tags
	// +optional
	Tags map[string]string `json:"tags,omitempty"`
}
func (*NodeClassItemSelectorTerms) DeepCopy ¶
func (in *NodeClassItemSelectorTerms) DeepCopy() *NodeClassItemSelectorTerms
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeClassItemSelectorTerms.
func (*NodeClassItemSelectorTerms) DeepCopyInto ¶
func (in *NodeClassItemSelectorTerms) DeepCopyInto(out *NodeClassItemSelectorTerms)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type NodeClassMetadataOptions ¶
// NodeClassMetadataOptions configures the AWS IMDSv2 metadata service for
// provisioned nodes.
// NOTE(review): HttpEndpoint combines `omitempty` with
// `+kubebuilder:default=true`, so an explicit `false` is dropped on
// serialization and defaulted back to true — confirm this is intended.
type NodeClassMetadataOptions struct {
	// Whether the instance metadata endpoint is enabled.
	// +optional
	// +kubebuilder:default=true
	HttpEndpoint bool `json:"httpEndpoint,omitempty"`
	// Whether the IPv6 metadata endpoint is enabled.
	// +optional
	// +kubebuilder:default=false
	HttpProtocolIPv6 bool `json:"httpProtocolIPv6,omitempty"`
	// PUT response hop limit; 1 restricts metadata access to the instance itself.
	// +optional
	// +kubebuilder:default=1
	HttpPutResponseHopLimit int `json:"httpPutResponseHopLimit,omitempty"`
	// "required" enforces IMDSv2 session tokens.
	// +optional
	// +kubebuilder:default="required"
	HttpTokens string `json:"httpTokens,omitempty"`
}
AWS IMDSv2 metadata service options
func (*NodeClassMetadataOptions) DeepCopy ¶
func (in *NodeClassMetadataOptions) DeepCopy() *NodeClassMetadataOptions
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeClassMetadataOptions.
func (*NodeClassMetadataOptions) DeepCopyInto ¶
func (in *NodeClassMetadataOptions) DeepCopyInto(out *NodeClassMetadataOptions)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type NodeCompaction ¶
// NodeCompaction configures periodic node compaction for a GPU pool.
type NodeCompaction struct {
	// Interval between compaction runs (duration string).
	// +kubebuilder:default="5m"
	Period string `json:"period,omitempty"`
}
func (*NodeCompaction) DeepCopy ¶
func (in *NodeCompaction) DeepCopy() *NodeCompaction
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeCompaction.
func (*NodeCompaction) DeepCopyInto ¶
func (in *NodeCompaction) DeepCopyInto(out *NodeCompaction)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type NodeHypervisorStatus ¶
// NodeHypervisorStatus reports the observed state of the hypervisor running on a node.
type NodeHypervisorStatus struct {
	// Current state of the hypervisor process.
	HypervisorState string `json:"hypervisorState,omitempty"`
	// Version of the hypervisor currently running.
	HypervisorVersion string `json:"hypervisorVersion,omitempty"`
	// Time of the last heartbeat received from the hypervisor.
	LastHeartbeatTime metav1.Time `json:"lastHeartbeatTime,omitempty"`
}
func (*NodeHypervisorStatus) DeepCopy ¶
func (in *NodeHypervisorStatus) DeepCopy() *NodeHypervisorStatus
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeHypervisorStatus.
func (*NodeHypervisorStatus) DeepCopyInto ¶
func (in *NodeHypervisorStatus) DeepCopyInto(out *NodeHypervisorStatus)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type NodeManagerConfig ¶
// NodeManagerConfig controls how GPU nodes join a pool: either selected from
// existing cluster nodes or provisioned on demand.
type NodeManagerConfig struct {
	// How nodes are brought into the pool.
	// +kubebuilder:default="AutoSelect"
	ProvisioningMode ProvisioningMode `json:"provisioningMode,omitempty"`
	// +optional
	// +kubebuilder:default=NVIDIA
	// In single AI accelerator hardware vendor mode, when default vendor set,
	// all nodes provisioned by NodeProvisioner or selected by NodeSelector will be set with vendor label.
	DefaultVendor string `json:"defaultVendor,omitempty"`
	// Karpenter-like automatic node provisioning; exclusive with NodeSelector.
	// +optional
	NodeProvisioner *NodeProvisioner `json:"nodeProvisioner,omitempty"`
	// Selects existing GPU nodes; exclusive with NodeProvisioner.
	// +optional
	NodeSelector *corev1.NodeSelector `json:"nodeSelector,omitempty"`
	// +optional
	// When this field set, the GPU pool will be in multi AI accelerator vendor mode;
	// each GPU node's vendor name is set to map key, e.g. { AMD: { nodeSelectorTerms }}
	MultiVendorNodeSelector map[string]*corev1.NodeSelector `json:"multiVendorNodeSelector,omitempty"`
	// Periodic node compaction settings.
	// +optional
	NodeCompaction *NodeCompaction `json:"nodeCompaction,omitempty"`
	// Rolling update policy for pool components on nodes.
	// +optional
	NodePoolRollingUpdatePolicy *NodeRollingUpdatePolicy `json:"nodePoolRollingUpdatePolicy,omitempty"`
}
func (*NodeManagerConfig) DeepCopy ¶
func (in *NodeManagerConfig) DeepCopy() *NodeManagerConfig
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeManagerConfig.
func (*NodeManagerConfig) DeepCopyInto ¶
func (in *NodeManagerConfig) DeepCopyInto(out *NodeManagerConfig)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type NodeProvisioner ¶
// NodeProvisioner configures Karpenter-like automatic node management.
// NodeProvisioner and NodeSelector are mutually exclusive: NodeSelector is for
// existing GPUs, NodeProvisioner provisions nodes on demand.
type NodeProvisioner struct {
	// TensorFusion GPUNodeClass name
	NodeClass string `json:"nodeClass,omitempty"`
	// Karpenter NodeClass name
	// +optional
	KarpenterNodeClassRef *GroupKindName `json:"karpenterNodeClassRef,omitempty"`
	// Scheduling requirements applied to provisioned GPU nodes.
	// +optional
	GPURequirements []Requirement `json:"gpuRequirements,omitempty"`
	// Taints applied to provisioned GPU nodes.
	// +optional
	GPUTaints []Taint `json:"gpuTaints,omitempty"`
	// Labels applied to provisioned GPU nodes.
	// +optional
	GPULabels map[string]string `json:"gpuNodeLabels,omitempty"`
	// Scheduling requirements applied to provisioned CPU nodes.
	// +optional
	CPURequirements []Requirement `json:"cpuRequirements,omitempty"`
	// Taints applied to provisioned CPU nodes.
	// +optional
	CPUTaints []Taint `json:"cpuTaints,omitempty"`
	// Labels applied to provisioned CPU nodes.
	// +optional
	CPULabels map[string]string `json:"cpuNodeLabels,omitempty"`
	// Annotations applied to provisioned GPU nodes.
	// +optional
	GPUAnnotation map[string]string `json:"gpuNodeAnnotations,omitempty"`
	// +optional
	// NodeProvisioner will start a virtual billing based on public pricing or
	// customized pricing. If the VM's costs exceed any budget constraints, the
	// new VM will not be created and alerts will be generated.
	Budget *PeriodicalBudget `json:"budget,omitempty"`
}
NodeProvisioner and NodeSelector are mutually exclusive. NodeSelector selects existing GPU nodes; NodeProvisioner provides Karpenter-like automatic node management.
func (*NodeProvisioner) DeepCopy ¶
func (in *NodeProvisioner) DeepCopy() *NodeProvisioner
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeProvisioner.
func (*NodeProvisioner) DeepCopyInto ¶
func (in *NodeProvisioner) DeepCopyInto(out *NodeProvisioner)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type NodeRequirementKey ¶
// NodeRequirementKey is a well-known node label key allowed in provisioning requirements.
// +kubebuilder:validation:Enum=node.kubernetes.io/instance-type;kubernetes.io/arch;kubernetes.io/os;topology.kubernetes.io/region;topology.kubernetes.io/zone;karpenter.sh/capacity-type;tensor-fusion.ai/gpu-vendor;tensor-fusion.ai/gpu-instance-family;tensor-fusion.ai/gpu-instance-size
type NodeRequirementKey string

const (
	NodeRequirementKeyInstanceType NodeRequirementKey = "node.kubernetes.io/instance-type"
	NodeRequirementKeyArchitecture NodeRequirementKey = "kubernetes.io/arch"
	NodeRequirementKeyGPUVendor    NodeRequirementKey = "tensor-fusion.ai/gpu-vendor"
	NodeRequirementKeyOS           NodeRequirementKey = "kubernetes.io/os"
	NodeRequirementKeyRegion       NodeRequirementKey = "topology.kubernetes.io/region"
	NodeRequirementKeyZone         NodeRequirementKey = "topology.kubernetes.io/zone"
	// capacity-type is charging method, can be spot/preemptive or on-demand
	NodeRequirementKeyCapacityType   NodeRequirementKey = "karpenter.sh/capacity-type"
	NodeRequirementKeyInstanceFamily NodeRequirementKey = "tensor-fusion.ai/gpu-instance-family"
	NodeRequirementKeyInstanceSize   NodeRequirementKey = "tensor-fusion.ai/gpu-instance-size"
)
type NodeRollingUpdatePolicy ¶
// NodeRollingUpdatePolicy controls batched rolling updates of pool components
// (hypervisor, worker, client) across nodes.
type NodeRollingUpdatePolicy struct {
	// Automatically update the hypervisor component.
	// +optional
	AutoUpdateHypervisor bool `json:"autoUpdateHypervisor,omitempty"`
	// Automatically update the worker component.
	// +optional
	AutoUpdateWorker bool `json:"autoUpdateWorker,omitempty"`
	// Automatically update the client component.
	// +optional
	AutoUpdateClient bool `json:"autoUpdateClient,omitempty"`
	// Percentage of nodes updated per batch.
	// +kubebuilder:default=100
	// +kubebuilder:validation:Minimum=0
	// +kubebuilder:validation:Maximum=100
	BatchPercentage int32 `json:"batchPercentage,omitempty"`
	// Wait time between batches (duration string).
	// +kubebuilder:default="1s"
	BatchInterval string `json:"batchInterval,omitempty"`
	// Upper bound on the whole rolling update (duration string).
	// +optional
	// +kubebuilder:default="10m"
	MaxDuration string `json:"maxDuration,omitempty"`
	// Time windows in which updates are allowed to run.
	// +optional
	MaintenanceWindow MaintenanceWindow `json:"maintenanceWindow,omitempty"`
}
func (*NodeRollingUpdatePolicy) DeepCopy ¶
func (in *NodeRollingUpdatePolicy) DeepCopy() *NodeRollingUpdatePolicy
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeRollingUpdatePolicy.
func (*NodeRollingUpdatePolicy) DeepCopyInto ¶
func (in *NodeRollingUpdatePolicy) DeepCopyInto(out *NodeRollingUpdatePolicy)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type OSImageTypeEnum ¶
// OSImageTypeEnum specifies the ownership category of an OS image.
// +kubebuilder:validation:Enum=Private;Public;System
type OSImageTypeEnum string

const (
	OSImageTypePrivate OSImageTypeEnum = "Private"
	OSImageTypePublic  OSImageTypeEnum = "Public"
	OSImageTypeSystem  OSImageTypeEnum = "System"
)
type Oversubscription ¶
// Oversubscription configures how far GPU resources may be oversold beyond
// physical capacity (VRAM spillover to host RAM/disk, TFlops overselling).
type Oversubscription struct {
	// The percentage of Host RAM appending to GPU VRAM, default to 50%
	// +optional
	// +kubebuilder:default=50
	// +kubebuilder:validation:Minimum=0
	// +kubebuilder:validation:Maximum=100
	VRAMExpandToHostMem int32 `json:"vramExpandToHostMem,omitempty"`
	// The percentage of Host Disk appending to GPU VRAM, default to 70%
	// +optional
	// +kubebuilder:default=70
	// +kubebuilder:validation:Minimum=0
	// +kubebuilder:validation:Maximum=100
	VRAMExpandToHostDisk int32 `json:"vramExpandToHostDisk,omitempty"`
	// The multiple of TFlops to oversell, default to 500%, indicates 5 times oversell
	// +optional
	// +kubebuilder:default=500
	// +kubebuilder:validation:Minimum=100
	// +kubebuilder:validation:Maximum=100000
	TFlopsOversellRatio int32 `json:"tflopsOversellRatio,omitempty"`
}
func (*Oversubscription) DeepCopy ¶
func (in *Oversubscription) DeepCopy() *Oversubscription
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Oversubscription.
func (*Oversubscription) DeepCopyInto ¶
func (in *Oversubscription) DeepCopyInto(out *Oversubscription)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type PartitionTemplate ¶ added in v0.1.1
// PartitionTemplate represents a hardware partition template (e.g., MIG profile).
// Only the template ID and name are stored in GPU status; detailed resource
// information lives in the public GPU info config.
type PartitionTemplate struct {
	// TemplateID is the unique identifier for this partition template (e.g., "1g.24gb", "4g.94gb")
	TemplateID string `json:"templateId"`
	// Name is a human-readable name for this template
	Name string `json:"name"`
}
PartitionTemplate represents a hardware partition template (e.g., MIG profile) Only stores template ID and name in GPU status. Detailed resource information is stored in public GPU info config.
func (*PartitionTemplate) DeepCopy ¶ added in v0.1.1
func (in *PartitionTemplate) DeepCopy() *PartitionTemplate
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PartitionTemplate.
func (*PartitionTemplate) DeepCopyInto ¶ added in v0.1.1
func (in *PartitionTemplate) DeepCopyInto(out *PartitionTemplate)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type PeriodicalBudget ¶
// PeriodicalBudget defines budget constraints in dollars over daily, monthly
// and quarterly periods, plus the behavior when a budget is exceeded.
type PeriodicalBudget struct {
	// Maximum spend per day, in dollars.
	// +kubebuilder:default="100"
	BudgetPerDay string `json:"budgetPerDay,omitempty"`
	// Maximum spend per month, in dollars.
	// +kubebuilder:default="1000"
	BudgetPerMonth string `json:"budgetPerMonth,omitempty"`
	// Maximum spend per quarter, in dollars.
	// +kubebuilder:default="3000"
	BudgetPerQuarter string `json:"budgetPerQuarter,omitempty"`
	// What to do when a budget is exceeded.
	// +kubebuilder:default=AlertOnly
	BudgetExceedStrategy BudgetExceedStrategy `json:"budgetExceedStrategy,omitempty"`
}
The budget constraints in dollars
func (*PeriodicalBudget) DeepCopy ¶
func (in *PeriodicalBudget) DeepCopy() *PeriodicalBudget
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PeriodicalBudget.
func (*PeriodicalBudget) DeepCopyInto ¶
func (in *PeriodicalBudget) DeepCopyInto(out *PeriodicalBudget)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type PlacementConfig ¶
// PlacementConfig controls how workers are placed onto GPUs within a pool.
type PlacementConfig struct {
	// Placement strategy; see PlacementMode constants.
	// +kubebuilder:default=NodeCompactGPULowLoad
	Mode PlacementMode `json:"mode"`
	// If false, workloads will not be scheduled directly to GPU nodes with 'localGPU: true'.
	// +kubebuilder:default=true
	// +optional
	AllowUsingLocalGPU *bool `json:"allowUsingLocalGPU,omitempty"`
	// Additional filters narrowing down candidate GPUs.
	// +optional
	GPUFilters []GPUFilter `json:"gpuFilters,omitempty"`
}
func (*PlacementConfig) DeepCopy ¶
func (in *PlacementConfig) DeepCopy() *PlacementConfig
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PlacementConfig.
func (*PlacementConfig) DeepCopyInto ¶
func (in *PlacementConfig) DeepCopyInto(out *PlacementConfig)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type PlacementMode ¶
// PlacementMode selects the strategy used to place workers onto GPUs.
// +kubebuilder:validation:Enum=CompactFirst;LowLoadFirst;NodeCompactGPULowLoad
type PlacementMode string

const (
	// default to compactFirst for cost saving and energy saving
	PlacementModeCompactFirst PlacementMode = "CompactFirst"
	// in some cases, use lowLoadFirst for balance and fairness
	PlacementModeLowLoadFirst PlacementMode = "LowLoadFirst"
	// in some cases, use nodeCompactGPULowLoad for balance and fairness
	PlacementModeNodeCompactGPULowLoad PlacementMode = "NodeCompactGPULowLoad"
)
type PodGPUInfo ¶
// PodGPUInfo describes a pod's GPU allocation: identity, requested/limited
// resources, QoS level, and whether the allocation came from an external device plugin.
type PodGPUInfo struct {
	// Pod name.
	Name string `json:"name,omitempty"`
	// Pod namespace.
	Namespace string `json:"namespace,omitempty"`
	// Pod UID.
	UID string `json:"uid,omitempty"`
	// GPU resources requested by the pod.
	Requests Resource `json:"requests,omitempty"`
	// GPU resource limits of the pod.
	Limits Resource `json:"limits,omitempty"`
	// QoS level assigned to the pod.
	QoS QoSLevel `json:"qos,omitempty"`
	// IsExternal indicates if this allocation is from an external device plugin
	// (e.g., nvidia-device-plugin) rather than TensorFusion scheduler
	// +optional
	IsExternal bool `json:"isExternal,omitempty"`
}
func (*PodGPUInfo) DeepCopy ¶
func (in *PodGPUInfo) DeepCopy() *PodGPUInfo
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodGPUInfo.
func (*PodGPUInfo) DeepCopyInto ¶
func (in *PodGPUInfo) DeepCopyInto(out *PodGPUInfo)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type PoolComponentStatus ¶
// PoolComponentStatus tracks version, config-sync state and update progress
// for the three pool components: worker, hypervisor and client.
// NOTE(review): `HyperVisorUpdateProgress` breaks the `Hypervisor` casing used
// by the sibling fields; renaming would break the public API, so it is left as-is.
type PoolComponentStatus struct {
	// Current worker component version.
	WorkerVersion string `json:"worker,omitempty"`
	// Whether the worker config has been synced.
	WorkerConfigSynced bool `json:"workerConfigSynced,omitempty"`
	// Worker rolling-update progress (percentage).
	WorkerUpdateProgress int32 `json:"workerUpdateProgress,omitempty"`
	// Current hypervisor component version.
	HypervisorVersion string `json:"hypervisor,omitempty"`
	// Whether the hypervisor config has been synced.
	HypervisorConfigSynced bool `json:"hypervisorConfigSynced,omitempty"`
	// Hypervisor rolling-update progress (percentage).
	HyperVisorUpdateProgress int32 `json:"hypervisorUpdateProgress,omitempty"`
	// Current client component version.
	ClientVersion string `json:"client,omitempty"`
	// Whether the client config has been synced.
	ClientConfigSynced bool `json:"clientConfigSynced,omitempty"`
	// Client rolling-update progress (percentage).
	ClientUpdateProgress int32 `json:"clientUpdateProgress,omitempty"`
}
func (*PoolComponentStatus) DeepCopy ¶
func (in *PoolComponentStatus) DeepCopy() *PoolComponentStatus
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PoolComponentStatus.
func (*PoolComponentStatus) DeepCopyInto ¶
func (in *PoolComponentStatus) DeepCopyInto(out *PoolComponentStatus)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type PoolProvisioningStatus ¶
// PoolProvisioningStatus summarizes node counts by provisioning state for a pool.
type PoolProvisioningStatus struct {
	// Nodes still initializing.
	InitializingNodes int32 `json:"initializingNodes,omitempty"`
	// Nodes being terminated.
	TerminatingNodes int32 `json:"terminatingNodes,omitempty"`
	// Nodes ready to serve workloads.
	AvailableNodes int32 `json:"availableNodes,omitempty"`
}
func (*PoolProvisioningStatus) DeepCopy ¶
func (in *PoolProvisioningStatus) DeepCopy() *PoolProvisioningStatus
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PoolProvisioningStatus.
func (*PoolProvisioningStatus) DeepCopyInto ¶
func (in *PoolProvisioningStatus) DeepCopyInto(out *PoolProvisioningStatus)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type ProviderConfig ¶ added in v0.1.1
// ProviderConfig is the Schema for managing hardware vendor specific configurations.
// Each ProviderConfig represents a single vendor (NVIDIA, AMD, Ascend, etc.).
type ProviderConfig struct {
	metav1.TypeMeta   `json:",inline"`
	metav1.ObjectMeta `json:"metadata,omitempty"`

	Spec   ProviderConfigSpec   `json:"spec,omitempty"`
	Status ProviderConfigStatus `json:"status,omitempty"`
}
ProviderConfig is the Schema for managing hardware vendor specific configurations Each ProviderConfig represents a single vendor (NVIDIA, AMD, Ascend, etc.)
func (*ProviderConfig) DeepCopy ¶ added in v0.1.1
func (in *ProviderConfig) DeepCopy() *ProviderConfig
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ProviderConfig.
func (*ProviderConfig) DeepCopyInto ¶ added in v0.1.1
func (in *ProviderConfig) DeepCopyInto(out *ProviderConfig)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (*ProviderConfig) DeepCopyObject ¶ added in v0.1.1
func (in *ProviderConfig) DeepCopyObject() runtime.Object
DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
type ProviderConfigList ¶ added in v0.1.1
// ProviderConfigList contains a list of ProviderConfig.
type ProviderConfigList struct {
	metav1.TypeMeta `json:",inline"`
	metav1.ListMeta `json:"metadata,omitempty"`
	Items           []ProviderConfig `json:"items"`
}
ProviderConfigList contains a list of ProviderConfig
func (*ProviderConfigList) DeepCopy ¶ added in v0.1.1
func (in *ProviderConfigList) DeepCopy() *ProviderConfigList
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ProviderConfigList.
func (*ProviderConfigList) DeepCopyInto ¶ added in v0.1.1
func (in *ProviderConfigList) DeepCopyInto(out *ProviderConfigList)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (*ProviderConfigList) DeepCopyObject ¶ added in v0.1.1
func (in *ProviderConfigList) DeepCopyObject() runtime.Object
DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
type ProviderConfigSpec ¶ added in v0.1.1
// ProviderConfigSpec defines the desired state of ProviderConfig.
// ProviderConfig manages hardware vendor specific configurations, including
// images, hardware metadata, virtualization templates and resource names.
type ProviderConfigSpec struct {
	// Vendor is the hardware vendor name (e.g., NVIDIA, AMD, Ascend, Apple)
	// +kubebuilder:validation:Required
	Vendor string `json:"vendor"`
	// Images contains container images for different components
	Images ProviderImages `json:"images"`
	// Hypervisor contains vendor-specific hypervisor settings
	// +optional
	Hypervisor *ProviderHypervisorConfig `json:"hypervisor,omitempty"`
	// HardwareMetadata contains GPU/accelerator model information
	// +optional
	HardwareMetadata []HardwareModelInfo `json:"hardwareMetadata,omitempty"`
	// VirtualizationTemplates contains partition/slice templates that can be referenced by hardware metadata.
	// This avoids duplicating template info across multiple GPU models.
	// +optional
	VirtualizationTemplates []VirtualizationTemplate `json:"virtualizationTemplates,omitempty"`
	// InUseResourceNames contains resource names that should be removed from pods
	// when TensorFusion takes over GPU management (e.g., "nvidia.com/gpu", "amd.com/gpu")
	// +optional
	InUseResourceNames []string `json:"inUseResourceNames,omitempty"`
	// DevicePluginDetection contains settings for detecting existing device plugins
	// +optional
	DevicePluginDetection *DevicePluginDetectionConfig `json:"devicePluginDetection,omitempty"`
}
ProviderConfigSpec defines the desired state of ProviderConfig. ProviderConfig is used to manage hardware-vendor-specific configurations, including images, hardware metadata, virtualization templates and resource names.
func (*ProviderConfigSpec) DeepCopy ¶ added in v0.1.1
func (in *ProviderConfigSpec) DeepCopy() *ProviderConfigSpec
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ProviderConfigSpec.
func (*ProviderConfigSpec) DeepCopyInto ¶ added in v0.1.1
func (in *ProviderConfigSpec) DeepCopyInto(out *ProviderConfigSpec)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type ProviderConfigStatus ¶ added in v0.1.1
type ProviderConfigStatus struct{}
ProviderConfigStatus defines the observed state of ProviderConfig
func (*ProviderConfigStatus) DeepCopy ¶ added in v0.1.1
func (in *ProviderConfigStatus) DeepCopy() *ProviderConfigStatus
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ProviderConfigStatus.
func (*ProviderConfigStatus) DeepCopyInto ¶ added in v0.1.1
func (in *ProviderConfigStatus) DeepCopyInto(out *ProviderConfigStatus)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type ProviderHypervisorConfig ¶ added in v0.1.2
type ProviderHypervisorConfig struct {
// PrivilegedHypervisor indicates the hypervisor container should run in privileged mode
// +optional
PrivilegedHypervisor bool `json:"privilegedHypervisor,omitempty"`
// LDLibraryPath appends entries to LD_LIBRARY_PATH for the hypervisor container
// +optional
LDLibraryPath string `json:"ldLibraryPath,omitempty"`
// HostPathMounts adds host path mounts to the hypervisor pod
// +optional
HostPathMounts []ProviderHypervisorHostPathMount `json:"hostPathMounts,omitempty"`
}
ProviderHypervisorConfig contains vendor-specific hypervisor configuration
func (*ProviderHypervisorConfig) DeepCopy ¶ added in v0.1.2
func (in *ProviderHypervisorConfig) DeepCopy() *ProviderHypervisorConfig
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ProviderHypervisorConfig.
func (*ProviderHypervisorConfig) DeepCopyInto ¶ added in v0.1.2
func (in *ProviderHypervisorConfig) DeepCopyInto(out *ProviderHypervisorConfig)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type ProviderHypervisorHostPathMount ¶ added in v0.1.2
type ProviderHypervisorHostPathMount struct {
// Name is the volume name
// +kubebuilder:validation:Required
Name string `json:"name"`
// HostPath is the path on the host to mount
// +kubebuilder:validation:Required
HostPath string `json:"hostPath"`
// MountPath is the path inside the container
// +kubebuilder:validation:Required
MountPath string `json:"mountPath"`
// ReadOnly indicates if the mount should be read-only
// +optional
ReadOnly bool `json:"readOnly,omitempty"`
}
ProviderHypervisorHostPathMount defines a hostPath mount for hypervisor
func (*ProviderHypervisorHostPathMount) DeepCopy ¶ added in v0.1.2
func (in *ProviderHypervisorHostPathMount) DeepCopy() *ProviderHypervisorHostPathMount
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ProviderHypervisorHostPathMount.
func (*ProviderHypervisorHostPathMount) DeepCopyInto ¶ added in v0.1.2
func (in *ProviderHypervisorHostPathMount) DeepCopyInto(out *ProviderHypervisorHostPathMount)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type ProviderImages ¶ added in v0.1.1
type ProviderImages struct {
// Middleware is the hypervisor/middleware image that runs on GPU nodes
// This image contains vendor-specific GPU drivers and runtime
// +kubebuilder:validation:Required
Middleware string `json:"middleware"`
// RemoteClient is the client library image injected into user pods
// +optional
RemoteClient string `json:"remoteClient,omitempty"`
// RemoteWorker is the worker process image for remote GPU access
// +optional
RemoteWorker string `json:"remoteWorker,omitempty"`
}
ProviderImages contains container images for TensorFusion components
func (*ProviderImages) DeepCopy ¶ added in v0.1.1
func (in *ProviderImages) DeepCopy() *ProviderImages
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ProviderImages.
func (*ProviderImages) DeepCopyInto ¶ added in v0.1.1
func (in *ProviderImages) DeepCopyInto(out *ProviderImages)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type ProvisioningMode ¶
type ProvisioningMode string
+kubebuilder:validation:Enum=Provisioned;AutoSelect;Karpenter
const ( ProvisioningModeProvisioned ProvisioningMode = "Provisioned" ProvisioningModeAutoSelect ProvisioningMode = "AutoSelect" ProvisioningModeKarpenter ProvisioningMode = "Karpenter" )
type ProvisioningPhase ¶
type ProvisioningPhase string
+kubebuilder:validation:Enum=None;Initializing;Provisioning;Completed
type QosConfig ¶
type QosConfig struct {
Definitions []QosDefinition `json:"definitions,omitempty"`
DefaultQoS QoSLevel `json:"defaultQoS,omitempty"`
Pricing []QosPricing `json:"pricing,omitempty"`
// Eviction protection price ratio applied to cost calculation during protection period
// This multiplier increases pricing for protected workloads to discourage preemption
// +optional
// +kubebuilder:default="1.2"
EvictionProtectionPriceRatio string `json:"evictionProtectionPriceRatio,omitempty"`
}
Defines the different QoS levels and their prices.
func (*QosConfig) DeepCopy ¶
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new QosConfig.
func (*QosConfig) DeepCopyInto ¶
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type QosDefinition ¶
type QosDefinition struct {
Name QoSLevel `json:"name,omitempty"`
Description string `json:"description,omitempty"`
Priority int `json:"priority,omitempty"` // Range from 1-100, reflects the scheduling priority when GPU is full and tasks are in the queue.
}
func (*QosDefinition) DeepCopy ¶
func (in *QosDefinition) DeepCopy() *QosDefinition
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new QosDefinition.
func (*QosDefinition) DeepCopyInto ¶
func (in *QosDefinition) DeepCopyInto(out *QosDefinition)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type QosPricing ¶
type QosPricing struct {
Qos QoSLevel `json:"qos,omitempty"`
Requests GPUResourcePricingUnit `json:"requests,omitempty"`
// By default, requests and limitsOverRequests are the same, indicating normal on-demand serverless GPU usage. In hands-on-lab low-QoS cases, limitsOverRequests should be lower, so that users can get burstable GPU resources at very low cost.
// +kubebuilder:default="1"
LimitsOverRequestsChargingRatio string `json:"limitsOverRequests,omitempty"`
}
func (*QosPricing) DeepCopy ¶
func (in *QosPricing) DeepCopy() *QosPricing
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new QosPricing.
func (*QosPricing) DeepCopyInto ¶
func (in *QosPricing) DeepCopyInto(out *QosPricing)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type ReBalanceThreshold ¶
type ReBalanceThreshold struct {
MatchAny runtime.RawExtension `json:"matchAny,omitempty"`
}
func (*ReBalanceThreshold) DeepCopy ¶
func (in *ReBalanceThreshold) DeepCopy() *ReBalanceThreshold
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ReBalanceThreshold.
func (*ReBalanceThreshold) DeepCopyInto ¶
func (in *ReBalanceThreshold) DeepCopyInto(out *ReBalanceThreshold)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type ReBalancerConfig ¶
type ReBalancerConfig struct {
Enable *bool `json:"enable,omitempty"`
Interval string `json:"interval,omitempty"`
ReBalanceCoolDownTime string `json:"reBalanceCoolDownTime,omitempty"`
Threshold ReBalanceThreshold `json:"threshold,omitempty"`
}
Avoid hot GPU devices and continuously balance the workload; implemented by triggering a simulated scheduling run and advising better GPU nodes to the scheduler.
func (*ReBalancerConfig) DeepCopy ¶
func (in *ReBalancerConfig) DeepCopy() *ReBalancerConfig
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ReBalancerConfig.
func (*ReBalancerConfig) DeepCopyInto ¶
func (in *ReBalancerConfig) DeepCopyInto(out *ReBalancerConfig)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type Requirement ¶
type Requirement struct {
Key NodeRequirementKey `json:"key,omitempty"`
// +kubebuilder:default="In"
// +kubebuilder:validation:Enum=In;Exists;DoesNotExist;Gt;Lt
Operator corev1.NodeSelectorOperator `json:"operator,omitempty"`
Values []string `json:"values,omitempty"`
}
func (*Requirement) DeepCopy ¶
func (in *Requirement) DeepCopy() *Requirement
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Requirement.
func (*Requirement) DeepCopyInto ¶
func (in *Requirement) DeepCopyInto(out *Requirement)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type Resource ¶
type Resource struct {
Tflops resource.Quantity `json:"tflops"`
// +optional
// 0-100 percentage, mutually exclusive with TFLOPs
ComputePercent resource.Quantity `json:"compute,omitempty"`
Vram resource.Quantity `json:"vram"`
}
func (*Resource) DeepCopy ¶
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Resource.
func (*Resource) DeepCopyInto ¶
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type ResourceName ¶
type ResourceName string
const ( ResourceTflops ResourceName = "tflops" ResourceVram ResourceName = "vram" )
type Resources ¶
func (*Resources) DeepCopy ¶
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Resources.
func (*Resources) DeepCopyInto ¶
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type RunningAppDetail ¶
type RunningAppDetail struct {
// Workload name namespace
Name string `json:"name,omitempty"`
Namespace string `json:"namespace,omitempty"`
// Worker count
Count int `json:"count"`
// Pod names that are running this workload
// +optional
Pods []*PodGPUInfo `json:"pods,omitempty"`
}
func (*RunningAppDetail) DeepCopy ¶
func (in *RunningAppDetail) DeepCopy() *RunningAppDetail
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RunningAppDetail.
func (*RunningAppDetail) DeepCopyInto ¶
func (in *RunningAppDetail) DeepCopyInto(out *RunningAppDetail)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type ScalingTargetResource ¶
type ScalingTargetResource string
const ( ScalingTargetResourceCompute ScalingTargetResource = "compute" ScalingTargetResourceVRAM ScalingTargetResource = "vram" ScalingTargetResourceAll ScalingTargetResource = "all" )
type SchedulingConfigTemplate ¶
type SchedulingConfigTemplate struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata,omitempty"`
Spec SchedulingConfigTemplateSpec `json:"spec,omitempty"`
Status SchedulingConfigTemplateStatus `json:"status,omitempty"`
}
+kubebuilder:object:root=true +kubebuilder:subresource:status +kubebuilder:resource:scope=Cluster +kubebuilder:printcolumn:name="Mode",type="string",JSONPath=".spec.placement.mode" +kubebuilder:printcolumn:name="Allow Local GPU",type="string",JSONPath=".spec.placement.allowLocalGPU" +kubebuilder:printcolumn:name="AutoFreeze",type="string",JSONPath=".spec.hypervisor.autoFreezeAndResume.autoFreeze.enable" SchedulingConfigTemplate is the Schema for the schedulingconfigtemplates API.
func (*SchedulingConfigTemplate) DeepCopy ¶
func (in *SchedulingConfigTemplate) DeepCopy() *SchedulingConfigTemplate
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SchedulingConfigTemplate.
func (*SchedulingConfigTemplate) DeepCopyInto ¶
func (in *SchedulingConfigTemplate) DeepCopyInto(out *SchedulingConfigTemplate)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (*SchedulingConfigTemplate) DeepCopyObject ¶
func (in *SchedulingConfigTemplate) DeepCopyObject() runtime.Object
DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
type SchedulingConfigTemplateList ¶
type SchedulingConfigTemplateList struct {
metav1.TypeMeta `json:",inline"`
metav1.ListMeta `json:"metadata,omitempty"`
Items []SchedulingConfigTemplate `json:"items"`
}
SchedulingConfigTemplateList contains a list of SchedulingConfigTemplate.
func (*SchedulingConfigTemplateList) DeepCopy ¶
func (in *SchedulingConfigTemplateList) DeepCopy() *SchedulingConfigTemplateList
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SchedulingConfigTemplateList.
func (*SchedulingConfigTemplateList) DeepCopyInto ¶
func (in *SchedulingConfigTemplateList) DeepCopyInto(out *SchedulingConfigTemplateList)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (*SchedulingConfigTemplateList) DeepCopyObject ¶
func (in *SchedulingConfigTemplateList) DeepCopyObject() runtime.Object
DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
type SchedulingConfigTemplateSpec ¶
type SchedulingConfigTemplateSpec struct {
// place the client or worker to best matched nodes
Placement PlacementConfig `json:"placement"`
// scale the workload based on the usage and traffic
// +optional
VerticalScalingRules []VerticalScalingRule `json:"verticalScalingRules,omitempty"`
// avoid hot GPU devices and continuously balance the workload
// implemented by mark GPU as hot and trigger evict for re-scheduling
// The hot GPUs will get lower priority for scheduling
// Future: implement rebalancer
// +optional
ReBalancer *ReBalancerConfig `json:"reBalancer,omitempty"`
// single GPU device multi-process queuing and fair scheduling with QoS constraint
// +optional
Hypervisor *HypervisorScheduling `json:"hypervisor,omitempty"`
}
Places the workload on the best-matched nodes and scales it intelligently.
func (*SchedulingConfigTemplateSpec) DeepCopy ¶
func (in *SchedulingConfigTemplateSpec) DeepCopy() *SchedulingConfigTemplateSpec
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SchedulingConfigTemplateSpec.
func (*SchedulingConfigTemplateSpec) DeepCopyInto ¶
func (in *SchedulingConfigTemplateSpec) DeepCopyInto(out *SchedulingConfigTemplateSpec)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type SchedulingConfigTemplateStatus ¶
type SchedulingConfigTemplateStatus struct {
}
SchedulingConfigTemplateStatus defines the observed state of SchedulingConfigTemplate.
func (*SchedulingConfigTemplateStatus) DeepCopy ¶
func (in *SchedulingConfigTemplateStatus) DeepCopy() *SchedulingConfigTemplateStatus
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SchedulingConfigTemplateStatus.
func (*SchedulingConfigTemplateStatus) DeepCopyInto ¶
func (in *SchedulingConfigTemplateStatus) DeepCopyInto(out *SchedulingConfigTemplateStatus)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type SmartSchedulerModelInput ¶
type SmartSchedulerModelInput struct {
Enable *bool `json:"enable,omitempty"`
Model string `json:"model,omitempty"`
HistoryDataPeriod string `json:"historyDataPeriod,omitempty"`
PredictionPeriod string `json:"predictionPeriod,omitempty"`
}
func (*SmartSchedulerModelInput) DeepCopy ¶
func (in *SmartSchedulerModelInput) DeepCopy() *SmartSchedulerModelInput
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SmartSchedulerModelInput.
func (*SmartSchedulerModelInput) DeepCopyInto ¶
func (in *SmartSchedulerModelInput) DeepCopyInto(out *SmartSchedulerModelInput)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type Taint ¶
type Taint struct {
// +kubebuilder:default=NoSchedule
// +kubebuilder:validation:Enum=NoSchedule;NoExecute;PreferNoSchedule
Effect corev1.TaintEffect `json:"effect,omitempty"`
Key string `json:"key,omitempty"`
Value string `json:"value,omitempty"`
}
func (*Taint) DeepCopy ¶
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Taint.
func (*Taint) DeepCopyInto ¶
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type TensorFusionCluster ¶
type TensorFusionCluster struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata,omitempty"`
Spec TensorFusionClusterSpec `json:"spec,omitempty"`
Status TensorFusionClusterStatus `json:"status,omitempty"`
}
+kubebuilder:printcolumn:name="Total Tflops",type="string",JSONPath=".status.totalTFlops" +kubebuilder:printcolumn:name="Total VRAM",type="string",JSONPath=".status.totalVRAM" +kubebuilder:printcolumn:name="Available Tflops",type="string",JSONPath=".status.availableTFlops" +kubebuilder:printcolumn:name="Available VRAM",type="string",JSONPath=".status.availableVRAM" TensorFusionCluster is the Schema for the tensorfusionclusters API.
func (*TensorFusionCluster) DeepCopy ¶
func (in *TensorFusionCluster) DeepCopy() *TensorFusionCluster
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TensorFusionCluster.
func (*TensorFusionCluster) DeepCopyInto ¶
func (in *TensorFusionCluster) DeepCopyInto(out *TensorFusionCluster)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (*TensorFusionCluster) DeepCopyObject ¶
func (in *TensorFusionCluster) DeepCopyObject() runtime.Object
DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
func (*TensorFusionCluster) RefreshStatus ¶
func (tfc *TensorFusionCluster) RefreshStatus(ownedPools []GPUPool)
func (*TensorFusionCluster) SetAsPending ¶
func (tfc *TensorFusionCluster) SetAsPending()
func (*TensorFusionCluster) SetAsReady ¶
func (tfc *TensorFusionCluster) SetAsReady(conditions ...metav1.Condition) bool
func (*TensorFusionCluster) SetAsUnknown ¶
func (tfc *TensorFusionCluster) SetAsUnknown(err error) bool
func (*TensorFusionCluster) SetAsUpdating ¶
func (tfc *TensorFusionCluster) SetAsUpdating(conditions ...metav1.Condition) bool
type TensorFusionClusterList ¶
type TensorFusionClusterList struct {
metav1.TypeMeta `json:",inline"`
metav1.ListMeta `json:"metadata,omitempty"`
Items []TensorFusionCluster `json:"items"`
}
TensorFusionClusterList contains a list of TensorFusionCluster.
func (*TensorFusionClusterList) DeepCopy ¶
func (in *TensorFusionClusterList) DeepCopy() *TensorFusionClusterList
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TensorFusionClusterList.
func (*TensorFusionClusterList) DeepCopyInto ¶
func (in *TensorFusionClusterList) DeepCopyInto(out *TensorFusionClusterList)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (*TensorFusionClusterList) DeepCopyObject ¶
func (in *TensorFusionClusterList) DeepCopyObject() runtime.Object
DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
type TensorFusionClusterPhase ¶
type TensorFusionClusterPhase string
+kubebuilder:validation:Enum=Pending;Running;Updating;Destroying;Unknown TensorFusionClusterPhase represents the phase of the TensorFusionCluster resource.
type TensorFusionClusterSpec ¶
type TensorFusionClusterSpec struct {
GPUPools []GPUPoolDefinition `json:"gpuPools,omitempty"`
// +optional
ComputingVendor *ComputingVendorConfig `json:"computingVendor,omitempty"`
}
TensorFusionClusterSpec defines the desired state of TensorFusionCluster.
func (*TensorFusionClusterSpec) DeepCopy ¶
func (in *TensorFusionClusterSpec) DeepCopy() *TensorFusionClusterSpec
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TensorFusionClusterSpec.
func (*TensorFusionClusterSpec) DeepCopyInto ¶
func (in *TensorFusionClusterSpec) DeepCopyInto(out *TensorFusionClusterSpec)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type TensorFusionClusterStatus ¶
type TensorFusionClusterStatus struct {
// +kubebuilder:default=Pending
Phase TensorFusionClusterPhase `json:"phase,omitempty"`
Conditions []metav1.Condition `json:"conditions,omitempty"`
TotalPools int32 `json:"totalPools"`
TotalNodes int32 `json:"totalNodes"`
TotalGPUs int32 `json:"totalGPUs"`
TotalTFlops resource.Quantity `json:"totalTFlops"`
TotalVRAM resource.Quantity `json:"totalVRAM"`
VirtualTFlops resource.Quantity `json:"virtualTFlops"`
VirtualVRAM resource.Quantity `json:"virtualVRAM"`
AvailableTFlops resource.Quantity `json:"availableTFlops"`
AvailableVRAM resource.Quantity `json:"availableVRAM"`
// +optional
VirtualAvailableTFlops *resource.Quantity `json:"virtualAvailableTFlops,omitempty"`
// +optional
VirtualAvailableVRAM *resource.Quantity `json:"virtualAvailableVRAM,omitempty"`
// +optional
ReadyGPUPools []string `json:"readyGPUPools"`
// +optional
NotReadyGPUPools []string `json:"notReadyGPUPools"`
// +kubebuilder:default=0
//
RetryCount int64 `json:"retryCount"`
// TODO: calculated every 1h/1d/1w average
UtilizedTFlopsPercent string `json:"utilizedTFlopsPercent,omitempty"`
UtilizedVRAMPercent string `json:"utilizedVRAMPercent,omitempty"`
// TODO: updated with interval
AllocatedTFlopsPercent string `json:"allocatedTFlopsPercent,omitempty"`
AllocatedVRAMPercent string `json:"allocatedVRAMPercent,omitempty"`
// TODO: aggregated with interval
SavedCostsPerMonth string `json:"savedCostsPerMonth,omitempty"`
PotentialSavingsPerMonth string `json:"potentialSavingsPerMonth,omitempty"`
CloudVendorConfigHash string `json:"cloudVendorConfigHash,omitempty"`
}
TensorFusionClusterStatus defines the observed state of TensorFusionCluster.
func (*TensorFusionClusterStatus) DeepCopy ¶
func (in *TensorFusionClusterStatus) DeepCopy() *TensorFusionClusterStatus
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TensorFusionClusterStatus.
func (*TensorFusionClusterStatus) DeepCopyInto ¶
func (in *TensorFusionClusterStatus) DeepCopyInto(out *TensorFusionClusterStatus)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type TensorFusionConnection ¶
type TensorFusionConnection struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata,omitempty"`
Spec TensorFusionConnectionSpec `json:"spec,omitempty"`
Status TensorFusionConnectionStatus `json:"status,omitempty"`
}
TensorFusionConnection is the Schema for the tensorfusionconnections API.
func (*TensorFusionConnection) DeepCopy ¶
func (in *TensorFusionConnection) DeepCopy() *TensorFusionConnection
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TensorFusionConnection.
func (*TensorFusionConnection) DeepCopyInto ¶
func (in *TensorFusionConnection) DeepCopyInto(out *TensorFusionConnection)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (*TensorFusionConnection) DeepCopyObject ¶
func (in *TensorFusionConnection) DeepCopyObject() runtime.Object
DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
type TensorFusionConnectionList ¶
type TensorFusionConnectionList struct {
metav1.TypeMeta `json:",inline"`
metav1.ListMeta `json:"metadata,omitempty"`
Items []TensorFusionConnection `json:"items"`
}
TensorFusionConnectionList contains a list of TensorFusionConnection.
func (*TensorFusionConnectionList) DeepCopy ¶
func (in *TensorFusionConnectionList) DeepCopy() *TensorFusionConnectionList
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TensorFusionConnectionList.
func (*TensorFusionConnectionList) DeepCopyInto ¶
func (in *TensorFusionConnectionList) DeepCopyInto(out *TensorFusionConnectionList)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (*TensorFusionConnectionList) DeepCopyObject ¶
func (in *TensorFusionConnectionList) DeepCopyObject() runtime.Object
DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
type TensorFusionConnectionSpec ¶
type TensorFusionConnectionSpec struct {
WorkloadName string `json:"workloadName"`
ClientPod string `json:"clientPod"`
}
TensorFusionConnectionSpec defines the desired state of TensorFusionConnection.
func (*TensorFusionConnectionSpec) DeepCopy ¶
func (in *TensorFusionConnectionSpec) DeepCopy() *TensorFusionConnectionSpec
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TensorFusionConnectionSpec.
func (*TensorFusionConnectionSpec) DeepCopyInto ¶
func (in *TensorFusionConnectionSpec) DeepCopyInto(out *TensorFusionConnectionSpec)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type TensorFusionConnectionStatus ¶
type TensorFusionConnectionStatus struct {
Phase WorkerPhase `json:"phase"`
ConnectionURL string `json:"connectionURL"`
WorkerName string `json:"workerName"`
}
TensorFusionConnectionStatus defines the observed state of TensorFusionConnection.
func (*TensorFusionConnectionStatus) DeepCopy ¶
func (in *TensorFusionConnectionStatus) DeepCopy() *TensorFusionConnectionStatus
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TensorFusionConnectionStatus.
func (*TensorFusionConnectionStatus) DeepCopyInto ¶
func (in *TensorFusionConnectionStatus) DeepCopyInto(out *TensorFusionConnectionStatus)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type TensorFusionGPUNodePhase ¶
type TensorFusionGPUNodePhase string
+kubebuilder:validation:Enum=Pending;Provisioning;Migrating;Running;Succeeded;Failed;Unknown;Destroying
const ( TensorFusionGPUNodePhasePending TensorFusionGPUNodePhase = PhasePending TensorFusionGPUNodePhaseMigrating TensorFusionGPUNodePhase = PhaseMigrating TensorFusionGPUNodePhaseRunning TensorFusionGPUNodePhase = PhaseRunning TensorFusionGPUNodePhaseSucceeded TensorFusionGPUNodePhase = PhaseSucceeded TensorFusionGPUNodePhaseFailed TensorFusionGPUNodePhase = PhaseFailed TensorFusionGPUNodePhaseUnknown TensorFusionGPUNodePhase = PhaseUnknown TensorFusionGPUNodePhaseDestroying TensorFusionGPUNodePhase = PhaseDestroying )
type TensorFusionGPUPhase ¶
type TensorFusionGPUPhase string
+kubebuilder:validation:Enum=Pending;Provisioning;Running;Unknown;Destroying;Migrating
const ( TensorFusionGPUPhasePending TensorFusionGPUPhase = PhasePending TensorFusionGPUPhaseUpdating TensorFusionGPUPhase = PhaseUpdating TensorFusionGPUPhaseRunning TensorFusionGPUPhase = PhaseRunning TensorFusionGPUPhaseUnknown TensorFusionGPUPhase = PhaseUnknown TensorFusionGPUPhaseDestroying TensorFusionGPUPhase = PhaseDestroying TensorFusionGPUPhaseMigrating TensorFusionGPUPhase = PhaseMigrating )
type TensorFusionPoolPhase ¶
type TensorFusionPoolPhase string
+kubebuilder:validation:Enum=Pending;Running;Updating;Destroying;Unknown
type TensorFusionWorkload ¶
type TensorFusionWorkload struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata,omitempty"`
Spec WorkloadProfileSpec `json:"spec,omitempty"`
Status TensorFusionWorkloadStatus `json:"status,omitempty"`
}
TensorFusionWorkload is the Schema for the tensorfusionworkloads API.
func (*TensorFusionWorkload) DeepCopy ¶
func (in *TensorFusionWorkload) DeepCopy() *TensorFusionWorkload
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TensorFusionWorkload.
func (*TensorFusionWorkload) DeepCopyInto ¶
func (in *TensorFusionWorkload) DeepCopyInto(out *TensorFusionWorkload)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (*TensorFusionWorkload) DeepCopyObject ¶
func (in *TensorFusionWorkload) DeepCopyObject() runtime.Object
DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
type TensorFusionWorkloadList ¶
type TensorFusionWorkloadList struct {
metav1.TypeMeta `json:",inline"`
metav1.ListMeta `json:"metadata,omitempty"`
Items []TensorFusionWorkload `json:"items"`
}
TensorFusionWorkloadList contains a list of TensorFusionWorkload.
func (*TensorFusionWorkloadList) DeepCopy ¶
func (in *TensorFusionWorkloadList) DeepCopy() *TensorFusionWorkloadList
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TensorFusionWorkloadList.
func (*TensorFusionWorkloadList) DeepCopyInto ¶
func (in *TensorFusionWorkloadList) DeepCopyInto(out *TensorFusionWorkloadList)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (*TensorFusionWorkloadList) DeepCopyObject ¶
func (in *TensorFusionWorkloadList) DeepCopyObject() runtime.Object
DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
type TensorFusionWorkloadPhase ¶
type TensorFusionWorkloadPhase string
+kubebuilder:validation:Enum=Pending;Running;Failed;Unknown
const ( TensorFusionWorkloadPhasePending TensorFusionWorkloadPhase = "Pending" TensorFusionWorkloadPhaseRunning TensorFusionWorkloadPhase = "Running" TensorFusionWorkloadPhaseFailed TensorFusionWorkloadPhase = "Failed" )
type TensorFusionWorkloadStatus ¶
type TensorFusionWorkloadStatus struct {
// +kubebuilder:default=Pending
Phase TensorFusionWorkloadPhase `json:"phase,omitempty"`
// Represents the latest available observations of the workload's current state.
// +optional
Conditions []metav1.Condition `json:"conditions,omitempty"`
// workerCount is the number of vGPU workers
WorkerCount int32 `json:"workerCount"`
// readyWorkers is the number of vGPU workers ready
ReadyWorkers int32 `json:"readyWorkers,omitempty"`
// Hash of the pod template used to create worker pods
PodTemplateHash string `json:"podTemplateHash,omitempty"`
// The most recently GPU resources recommended by the autoscaler
// +optional
Recommendation *Resources `json:"recommendation,omitempty"`
// The number of replicas currently applied based on the latest recommendation
// +optional
AppliedRecommendedReplicas int32 `json:"appliedRecommendedReplicas,omitempty"`
// The currently active cron scaling rule
// +optional
ActiveCronScalingRule *CronScalingRule `json:"activeCronScalingRule,omitempty"`
}
TensorFusionWorkloadStatus defines the observed state of TensorFusionWorkload.
func (*TensorFusionWorkloadStatus) DeepCopy ¶
func (in *TensorFusionWorkloadStatus) DeepCopy() *TensorFusionWorkloadStatus
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TensorFusionWorkloadStatus.
func (*TensorFusionWorkloadStatus) DeepCopyInto ¶
func (in *TensorFusionWorkloadStatus) DeepCopyInto(out *TensorFusionWorkloadStatus)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type UsedBySystem ¶
type UsedBySystem string
+default="tensor-fusion"
var (
UsedByTensorFusion UsedBySystem = UsedBySystem(DomainPrefix)
)
type VerticalScalingRule ¶
type VerticalScalingRule struct {
Name string `json:"name,omitempty"`
// Rule is automatically applied in the webhook; when a pod matches the selector,
// the rule will be added into the workload profile's autoScalingConfig and annotation
Selector metav1.LabelSelector `json:"selector,omitempty"`
Rule *AutoScalingConfig `json:"autoScaling,omitempty"`
}
func (*VerticalScalingRule) DeepCopy ¶
func (in *VerticalScalingRule) DeepCopy() *VerticalScalingRule
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VerticalScalingRule.
func (*VerticalScalingRule) DeepCopyInto ¶
func (in *VerticalScalingRule) DeepCopyInto(out *VerticalScalingRule)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type VirtualizationTemplate ¶ added in v0.1.1
type VirtualizationTemplate struct {
// ID is the unique identifier for this template (e.g., "mig-1g-10gb", "vir01")
// +kubebuilder:validation:Required
ID string `json:"id"`
// Name is the vendor-specific name (e.g., "1g.10gb", "vir01")
// +kubebuilder:validation:Required
Name string `json:"name"`
// MemoryGigabytes is the memory allocated to this partition
// +kubebuilder:validation:Required
MemoryGigabytes uint64 `json:"memoryGigabytes"`
// ComputePercent is the percentage of compute allocated (0-100), serialized as string
// +kubebuilder:validation:Required
ComputePercent string `json:"computePercent"`
// Description provides additional information about this template
// +optional
Description string `json:"description,omitempty"`
// MaxInstances is the maximum number of instances of this template per GPU
// +optional
MaxInstances uint32 `json:"maxInstances,omitempty"`
// PlacementLimit defines valid placement positions using a bitmask
// For NVIDIA MIG: defines which slots can host this partition
// +optional
PlacementLimit []uint32 `json:"placementLimit,omitempty"`
// PlacementOffset defines the slot offset for this template
// +optional
PlacementOffset uint32 `json:"placementOffset,omitempty"`
// ExtendedResources contains additional resource dimensions
// For Ascend NPU: {"AICORE": 1, "AICPU": 1, "VPC": 1, ...}
// +optional
ExtendedResources map[string]uint32 `json:"extendedResources,omitempty"`
// IsolationGroupSharing defines how isolation groups are handled
// "exclusive" - each partition requires its own isolation group
// "shared" - multiple partitions can share an isolation group (time-sharing)
// +optional
// +kubebuilder:default="exclusive"
// +kubebuilder:validation:Enum=exclusive;shared
IsolationGroupSharing string `json:"isolationGroupSharing,omitempty"`
// MaxPartitionsPerIsolationGroup limits partitions sharing one isolation group
// Only applicable when IsolationGroupSharing is "shared"
// +optional
MaxPartitionsPerIsolationGroup uint32 `json:"maxPartitionsPerIsolationGroup,omitempty"`
// IsolationGroupSlots defines minimum slots required by this template's isolation group
// +optional
IsolationGroupSlots uint32 `json:"isolationGroupSlots,omitempty"`
}
VirtualizationTemplate defines a partition/slice template for GPU virtualization
func (*VirtualizationTemplate) DeepCopy ¶ added in v0.1.1
func (in *VirtualizationTemplate) DeepCopy() *VirtualizationTemplate
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VirtualizationTemplate.
func (*VirtualizationTemplate) DeepCopyInto ¶ added in v0.1.1
func (in *VirtualizationTemplate) DeepCopyInto(out *VirtualizationTemplate)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type WorkerConfig ¶
type WorkerConfig struct {
// Image is the default worker image, used when no ProviderConfig is found for the vendor
// When ProviderConfig exists, remoteWorker image from ProviderConfig takes precedence
Image string `json:"image,omitempty"`
// +optional
PodTemplate *runtime.RawExtension `json:"podTemplate,omitempty"`
}
func (*WorkerConfig) DeepCopy ¶
func (in *WorkerConfig) DeepCopy() *WorkerConfig
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WorkerConfig.
func (*WorkerConfig) DeepCopyInto ¶
func (in *WorkerConfig) DeepCopyInto(out *WorkerConfig)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type WorkerPhase ¶
type WorkerPhase string
const ( WorkerPending WorkerPhase = "Pending" WorkerRunning WorkerPhase = "Running" WorkerFailed WorkerPhase = "Failed" )
type WorkerStatus ¶
type WorkerStatus struct {
WorkerPhase WorkerPhase `json:"workerPhase"`
WorkerName string `json:"workerName"`
NodeSelector map[string]string `json:"nodeSelector,omitempty"`
// +optional
WorkerIp string `json:"workerIp,omitempty"`
// +optional
ResourceVersion string `json:"resourceVersion,omitempty"`
}
func (*WorkerStatus) DeepCopy ¶
func (in *WorkerStatus) DeepCopy() *WorkerStatus
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WorkerStatus.
func (*WorkerStatus) DeepCopyInto ¶
func (in *WorkerStatus) DeepCopyInto(out *WorkerStatus)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type WorkloadProfile ¶
type WorkloadProfile struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata,omitempty"`
Spec WorkloadProfileSpec `json:"spec,omitempty"`
Status WorkloadProfileStatus `json:"status,omitempty"`
}
WorkloadProfile is the Schema for the workloadprofiles API.
func (*WorkloadProfile) DeepCopy ¶
func (in *WorkloadProfile) DeepCopy() *WorkloadProfile
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WorkloadProfile.
func (*WorkloadProfile) DeepCopyInto ¶
func (in *WorkloadProfile) DeepCopyInto(out *WorkloadProfile)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (*WorkloadProfile) DeepCopyObject ¶
func (in *WorkloadProfile) DeepCopyObject() runtime.Object
DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
type WorkloadProfileList ¶
type WorkloadProfileList struct {
metav1.TypeMeta `json:",inline"`
metav1.ListMeta `json:"metadata,omitempty"`
Items []WorkloadProfile `json:"items"`
}
WorkloadProfileList contains a list of WorkloadProfile.
func (*WorkloadProfileList) DeepCopy ¶
func (in *WorkloadProfileList) DeepCopy() *WorkloadProfileList
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WorkloadProfileList.
func (*WorkloadProfileList) DeepCopyInto ¶
func (in *WorkloadProfileList) DeepCopyInto(out *WorkloadProfileList)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (*WorkloadProfileList) DeepCopyObject ¶
func (in *WorkloadProfileList) DeepCopyObject() runtime.Object
DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
type WorkloadProfileSpec ¶
type WorkloadProfileSpec struct {
// +optional
// If replicas is not set, it will be dynamic based on pending Pods
// If isLocalGPU is set to true, replicas must be dynamic, and this field will be ignored
Replicas *int32 `json:"replicas,omitempty"`
// +optional
PoolName string `json:"poolName,omitempty"`
// +optional
Resources Resources `json:"resources"`
// +optional
// Qos defines the quality of service level for the client.
Qos QoSLevel `json:"qos,omitempty"`
// +optional
// Schedule the workload to the same GPU server that runs the vGPU worker for best performance, default to false
IsLocalGPU bool `json:"isLocalGPU,omitempty"`
// +optional
// When set to sidecar worker mode, it is always Local GPU mode and hard-isolated with shared memory;
// default to false, which indicates the workload's embedded worker is the same process, soft-isolated
SidecarWorker bool `json:"sidecarWorker,omitempty"`
// +optional
// +kubebuilder:default=soft
// How to isolate resources, could be `shared` or `soft` or `hard` or `partitioned`
Isolation IsolationModeType `json:"isolation,omitempty"`
// +optional
// PartitionTemplateID specifies the partition template ID for partitioned isolation mode
// This is read from pod annotation tensor-fusion.ai/partition if specified
PartitionTemplateID string `json:"partitionTemplateId,omitempty"`
// +optional
// GPUModel specifies the required GPU model (e.g., "A100", "H100")
GPUModel string `json:"gpuModel,omitempty"`
// The number of GPUs to be used by the workload, default to 1
GPUCount uint32 `json:"gpuCount,omitempty"`
// Specify GPU indices for precise control of scheduling
GPUIndices []int32 `json:"gpuIndices,omitempty"`
// Specify GPU vendor for precise control of scheduling
GPUVendor string `json:"vendor,omitempty"`
// +optional
// AutoScalingConfig configured here will override Pool's schedulingConfig
// This field cannot be fully supported in annotations; if users want to enable auto-scaling via annotation,
// they can set tensor-fusion.ai/autoscale: 'true'
AutoScalingConfig AutoScalingConfig `json:"autoScalingConfig,omitempty"`
// +optional
// NodeAffinity specifies the node affinity requirements for the workload
NodeAffinity *v1.NodeAffinity `json:"nodeAffinity,omitempty"`
// +optional
// WorkerPodTemplate is the template for the worker pod; it only takes effect in remote vGPU mode
WorkerPodTemplate *runtime.RawExtension `json:"workerPodTemplate,omitempty"`
// +optional
// GangScheduling configuration for scheduling multiple pods together
GangScheduling *GangSchedulingConfig `json:"gangScheduling,omitempty"`
}
WorkloadProfileSpec defines the desired state of WorkloadProfile.
func (*WorkloadProfileSpec) DeepCopy ¶
func (in *WorkloadProfileSpec) DeepCopy() *WorkloadProfileSpec
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WorkloadProfileSpec.
func (*WorkloadProfileSpec) DeepCopyInto ¶
func (in *WorkloadProfileSpec) DeepCopyInto(out *WorkloadProfileSpec)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (WorkloadProfileSpec) IsDynamicReplica ¶
func (t WorkloadProfileSpec) IsDynamicReplica() bool
type WorkloadProfileStatus ¶
type WorkloadProfileStatus struct {
}
WorkloadProfileStatus defines the observed state of WorkloadProfile.
func (*WorkloadProfileStatus) DeepCopy ¶
func (in *WorkloadProfileStatus) DeepCopy() *WorkloadProfileStatus
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WorkloadProfileStatus.
func (*WorkloadProfileStatus) DeepCopyInto ¶
func (in *WorkloadProfileStatus) DeepCopyInto(out *WorkloadProfileStatus)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
Source Files
¶
- base_types.go
- constants.go
- gpu_types.go
- gpunode_funcs.go
- gpunode_types.go
- gpunodeclaim_types.go
- gpunodeclass_types.go
- gpupool_types.go
- gpuresourcequota_types.go
- groupversion_info.go
- providerconfig_types.go
- schedulingconfigtemplate_types.go
- tensorfusioncluster_funcs.go
- tensorfusioncluster_types.go
- tensorfusionconnection_types.go
- tensorfusionworkload_types.go
- workloadprofile_types.go
- zz_generated.deepcopy.go