Documentation ¶
Overview ¶
Package gpuallocator handles GPU allocation.
Index ¶
- Constants
- Variables
- func CalculateAscendExtendedResourceUsage(gpuConfig *config.GpuInfo, templateID string) (map[string]uint32, error)
- func CalculatePartitionResourceUsage(capacityTflops resource.Quantity, gpuModel, templateID string) (tflops resource.Quantity, vram resource.Quantity, err error)
- func CheckPartitionAvailability(gpu *tfv1.GPU, templateID string) error
- func IsScalingQuotaExceededError(err error) bool
- func LoadPartitionTemplatesFromConfig(gpuInfos []config.GpuInfo)
- func RefreshGPUNodeCapacity(ctx context.Context, k8sClient client.Client, node *tfv1.GPUNode, ...) ([]string, error)
- type AscendPartitionStrategy
- func (s *AscendPartitionStrategy) AllocateSlot(gpu *tfv1.GPU, templateInfo config.PartitionTemplateInfo, ...) (*uint32, *uint32, *uint32, error)
- func (s *AscendPartitionStrategy) CheckAvailability(gpu *tfv1.GPU, templateInfo config.PartitionTemplateInfo, ...) error
- func (s *AscendPartitionStrategy) Name() string
- type CompactFirst
- type GpuAllocator
- func (s *GpuAllocator) AdjustAllocation(ctx context.Context, adjustRequest tfv1.AdjustRequest, dryRun bool) (tfv1.Resource, tfv1.Resource, error)
- func (s *GpuAllocator) Alloc(req *tfv1.AllocRequest) ([]*tfv1.GPU, error)
- func (s *GpuAllocator) Bind(gpuNames []string, req *tfv1.AllocRequest) ([]*tfv1.GPU, error)
- func (s *GpuAllocator) CheckQuotaAndFilter(ctx context.Context, req *tfv1.AllocRequest, isSimulateSchedule bool) ([]*tfv1.GPU, []filter.FilterDetail, error)
- func (s *GpuAllocator) CheckQuotaAndFilterSingleNodePreempt(nodeName string, allocReq *tfv1.AllocRequest, ...) error
- func (s *GpuAllocator) Dealloc(workloadNameNamespace tfv1.NameNamespace, gpus []string, ...)
- func (s *GpuAllocator) DeallocAsync(workloadNameNamespace tfv1.NameNamespace, gpus []string, ...)
- func (s *GpuAllocator) DeallocByPodIdentifier(ctx context.Context, podIdentifier types.NamespacedName)
- func (s *GpuAllocator) Filter(req *tfv1.AllocRequest, toFilterGPUs []*tfv1.GPU, isSimulateSchedule bool) ([]*tfv1.GPU, []filter.FilterDetail, error)
- func (s *GpuAllocator) FilterWithPreempt(req *tfv1.AllocRequest, preemptAllocRequests []*tfv1.AllocRequest) ([]*tfv1.GPU, []filter.FilterDetail, error)
- func (s *GpuAllocator) GetAllocationInfo() (gpuStore map[types.NamespacedName]*tfv1.GPU, ...)
- func (s *GpuAllocator) GetAllocationReqByNodeName(nodeName string) []*tfv1.AllocRequest
- func (s *GpuAllocator) GetMatchedPartition(req *tfv1.AllocRequest, filteredGPUs []*tfv1.GPU) (*tfv1.GPU, *PartitionMatchResult, error)
- func (s *GpuAllocator) GetNodeGpuStore() map[string]map[string]*tfv1.GPU
- func (s *GpuAllocator) GetQuotaStore() *quota.QuotaStore
- func (s *GpuAllocator) GetScoringStrategy(cfg *config.GPUFitConfig, req *tfv1.AllocRequest) Strategy
- func (s *GpuAllocator) InitGPUAndQuotaStore() error
- func (s *GpuAllocator) ListNonUsingNodes() sets.Set[string]
- func (s *GpuAllocator) ReconcileAllocationState()
- func (s *GpuAllocator) ReconcileAllocationStateForTesting()
- func (s *GpuAllocator) RegisterBindHandler(handler func(req *tfv1.AllocRequest))
- func (s *GpuAllocator) Score(ctx context.Context, strategy Strategy, req *tfv1.AllocRequest, ...) map[string]map[string]int
- func (s *GpuAllocator) Select(req *tfv1.AllocRequest, filteredGPUs []*tfv1.GPU) ([]*tfv1.GPU, error)
- func (s *GpuAllocator) SetAllocatorReady()
- func (s *GpuAllocator) SetMaxWorkerPerNode(maxWorkerPerNode int)
- func (s *GpuAllocator) SetupWithManager(ctx context.Context, mgr manager.Manager) error
- func (s *GpuAllocator) StartInformerForGPU(ctx context.Context, mgr manager.Manager) error
- func (s *GpuAllocator) Stop()
- func (s *GpuAllocator) SyncGPUsToK8s()
- type LowLoadFirst
- type NVIDIAMIGStrategy
- type NodeCompactGPULowLoad
- type NotSupportedPartitionStrategy
- func (s *NotSupportedPartitionStrategy) AllocateSlot(gpu *tfv1.GPU, templateInfo config.PartitionTemplateInfo, ...) (*uint32, *uint32, *uint32, error)
- func (s *NotSupportedPartitionStrategy) CheckAvailability(gpu *tfv1.GPU, templateInfo config.PartitionTemplateInfo, ...) error
- func (s *NotSupportedPartitionStrategy) Name() string
- type PartitionMatchResult
- type PartitionStrategy
- type SimulateSchedulingFilterDetail
- type Strategy
Constants ¶
const CleanUpCheckInterval = 3 * time.Minute
const DefaultMaxPartitionNum = 32
const PartitionMatchingComputingWeight = 0.6
const PartitionMatchingVRAMWeight = 0.4
Variables ¶
var DefaultGPUSelector = func(
	strategy Strategy,
	nodeGPUMap map[string]map[string]*tfv1.GPU,
	validGPUs []*tfv1.GPU,
	count uint,
) ([]*tfv1.GPU, error) {
	if len(validGPUs) == 0 {
		return nil, fmt.Errorf("no GPUs available")
	}
	gpuMap := lo.GroupBy(validGPUs, func(gpu *tfv1.GPU) string {
		return gpu.Status.NodeSelector[constants.KubernetesHostNameLabel]
	})
	nodeScores := make([]struct {
		node  string
		score int
	}, 0, len(gpuMap))
	for node := range gpuMap {
		score := 0
		allGPUs := nodeGPUMap[node]
		for _, gpu := range allGPUs {
			if gpu == nil || gpu.Status.Available == nil || gpu.Status.Capacity == nil {
				return nil, fmt.Errorf("gpu %s has no available or capacity, unexpected state", gpu.Name)
			}
			score += strategy.Score(gpu, true)
		}
		nodeScores = append(nodeScores, struct {
			node  string
			score int
		}{node, score})
	}
	slices.SortFunc(nodeScores, func(a, b struct {
		node  string
		score int
	}) int {
		return b.score - a.score
	})
	for _, nodeScore := range nodeScores {
		gpus := gpuMap[nodeScore.node]
		if len(gpus) >= int(count) {
			slices.SortFunc(gpus, func(a, b *tfv1.GPU) int {
				return strategy.Score(b, false) - strategy.Score(a, false)
			})
			return gpus[:count], nil
		}
	}
	return nil, fmt.Errorf("not enough gpus in scoring stage")
}
DefaultGPUSelector is not used by the Kubernetes scheduler framework; as of now it is only used for allocator testing. The framework composes similar logic itself.
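For illustration only, a minimal sketch of calling DefaultGPUSelector the way an allocator test might; the import paths are placeholders and the empty stores are assumptions, not part of this package's API.

// Hypothetical test-style driver for DefaultGPUSelector.
// Import paths are placeholders for the real module.
package gpuallocator_test

import (
	"testing"

	tfv1 "example.com/tensor-fusion/api/v1"
	"example.com/tensor-fusion/internal/gpuallocator"
)

func TestDefaultGPUSelectorEmpty(t *testing.T) {
	nodeGPUMap := map[string]map[string]*tfv1.GPU{} // node name -> GPU name -> GPU
	var validGPUs []*tfv1.GPU                       // normally the output of Filter

	// Zero-value CompactFirst is used purely for illustration.
	_, err := gpuallocator.DefaultGPUSelector(gpuallocator.CompactFirst{}, nodeGPUMap, validGPUs, 2)
	if err == nil {
		t.Fatal("expected an error when no GPUs are available")
	}
}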
var GPUCapacityMap = map[string]tfv1.Resource{}
var MaxIsolationGroupsMap = map[string]uint32{}
MaxIsolationGroupsMap stores max isolation groups by GPU model. Key: GPU model; Value: max isolation groups (e.g., 4 for Ascend vGroups).
var MaxPartitionsMap = map[string]uint32{}
MaxPartitionsMap stores max partitions by GPU model. Key: GPU model; Value: max partitions (e.g., 7 for MIG).
var MaxPlacementSlotsMap = map[string]uint32{}
MaxPlacementSlotsMap stores max placement slots by GPU model. Key: GPU model; Value: max placement slots (e.g., 8 for MIG).
var PartitionTemplateMap = map[string]map[string]config.PartitionTemplateInfo{}
PartitionTemplateMap stores partition template info by GPU model. Key: GPU model (e.g., "A100_SXM_80G"); Value: map of templateID -> template info.
var ScalingQuotaExceededError = goerrors.New("scaling quota exceeded")
var TotalExtendedResourcesMap = map[string]map[string]uint32{}
TotalExtendedResourcesMap stores total extended resources by GPU model. Key: GPU model; Value: map of resource name -> total capacity. For Ascend NPU: {"AICORE": 8, "AICPU": 7, "VPC": 12, ...}.
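To make the shapes of these maps concrete, a hedged sketch that reads them after LoadPartitionTemplatesFromConfig has run; the model key reuses the example from the comments above, and the import path is a placeholder.

// Illustrative read of the partition lookup maps; assumes
// LoadPartitionTemplatesFromConfig was already called with the GPU info config.
package main

import (
	"fmt"

	"example.com/tensor-fusion/internal/gpuallocator" // placeholder import path
)

func describeModel(model string) { // e.g. "A100_SXM_80G"
	templates := gpuallocator.PartitionTemplateMap[model] // templateID -> template info
	fmt.Printf("model %s: %d templates, max %d partitions, max %d isolation groups\n",
		model, len(templates),
		gpuallocator.MaxPartitionsMap[model],      // e.g. 7 for MIG
		gpuallocator.MaxIsolationGroupsMap[model]) // e.g. 4 for Ascend vGroups
	for name, total := range gpuallocator.TotalExtendedResourcesMap[model] {
		fmt.Printf("  extended resource %s: %d\n", name, total) // e.g. AICORE: 8
	}
}

func main() { describeModel("A100_SXM_80G") }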
Functions ¶
func CalculateAscendExtendedResourceUsage ¶ added in v1.55.0
func CalculateAscendExtendedResourceUsage(gpuConfig *config.GpuInfo, templateID string) (map[string]uint32, error)
CalculateAscendExtendedResourceUsage calculates the total extended resource usage for a template.
func CalculatePartitionResourceUsage ¶ added in v1.55.0
func CalculatePartitionResourceUsage(capacityTflops resource.Quantity, gpuModel, templateID string) (tflops resource.Quantity, vram resource.Quantity, err error)
CalculatePartitionResourceUsage calculates the resource usage for a partition template. Gets template info from config.
func CheckPartitionAvailability ¶ added in v1.55.0
func CheckPartitionAvailability(gpu *tfv1.GPU, templateID string) error
CheckPartitionAvailability checks if a GPU has enough resources to allocate a partition. It gets template info from config and uses the vendor-specific strategy for slot/group checking.
func IsScalingQuotaExceededError ¶ added in v1.35.0
func IsScalingQuotaExceededError(err error) bool
func LoadPartitionTemplatesFromConfig ¶ added in v1.55.0
func LoadPartitionTemplatesFromConfig(gpuInfos []config.GpuInfo)
LoadPartitionTemplatesFromConfig loads partition templates and max partitions from the GPU info config. It should be called when the GPU info config is loaded or updated.
Types ¶
type AscendPartitionStrategy ¶ added in v1.55.0
type AscendPartitionStrategy struct{}
AscendPartitionStrategy implements PartitionStrategy for Huawei Ascend NPU.

Key concepts:
  - vNPU: virtual NPU created from a template (e.g., vir01, vir04)
  - vGroup: isolation group (0-3), provides hardware isolation between groups
  - Templates sharing the same vGroup use time-sharing (soft isolation)
  - Different vGroups provide hard isolation
func (*AscendPartitionStrategy) AllocateSlot ¶ added in v1.55.0
func (s *AscendPartitionStrategy) AllocateSlot(gpu *tfv1.GPU, templateInfo config.PartitionTemplateInfo, gpuConfig *config.GpuInfo) (*uint32, *uint32, *uint32, error)
func (*AscendPartitionStrategy) CheckAvailability ¶ added in v1.55.0
func (s *AscendPartitionStrategy) CheckAvailability(gpu *tfv1.GPU, templateInfo config.PartitionTemplateInfo, gpuConfig *config.GpuInfo) error
func (*AscendPartitionStrategy) Name ¶ added in v1.55.0
func (s *AscendPartitionStrategy) Name() string
type CompactFirst ¶
type CompactFirst struct {
// contains filtered or unexported fields
}
CompactFirst selects GPU with minimum available resources (most utilized) to efficiently pack workloads and maximize GPU utilization
func (CompactFirst) Score ¶ added in v1.35.0
func (c CompactFirst) Score(gpu *tfv1.GPU, _ bool) int
The Score function is used by the Kubernetes scheduler framework.
func (CompactFirst) SelectGPUs ¶
func (c CompactFirst) SelectGPUs(gpus []*tfv1.GPU, count uint) ([]*tfv1.GPU, error)
SelectGPUs selects multiple GPUs from the same node with the least available resources (most packed).
type GpuAllocator ¶
type GpuAllocator struct {
	// contains filtered or unexported fields
}
func NewGpuAllocator ¶
func NewGpuAllocator(
	ctx context.Context,
	indexAllocator *indexallocator.IndexAllocator,
	client client.Client,
	syncInterval time.Duration,
) *GpuAllocator
func (*GpuAllocator) AdjustAllocation ¶ added in v1.35.0
func (s *GpuAllocator) AdjustAllocation(ctx context.Context, adjustRequest tfv1.AdjustRequest, dryRun bool) (tfv1.Resource, tfv1.Resource, error)
AdjustAllocation is used for scaling decisions. For a scale-down decision the auto-scaler can call it directly. For scale-up it must call AdjustAllocation with dryRun=true to pre-check capacity and determine whether the allocation is valid; if the returned error is ScalingQuotaExceededError, the allocation is invalid, and the auto-scaler should scale up with another AdjustRequest that does not exceed quota (the maximum new requests/limits on the existing GPU are returned in the first result), retrying until AdjustAllocation returns a nil error, at most a pre-configured maxRetry times.

Returns the remaining resource, the delta resource, and an error.
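The scale-up protocol above might look like the following hedged sketch; shrinkToFit is a hypothetical helper, and the import paths are placeholders.

// Hedged sketch of the documented dry-run retry protocol; not the
// auto-scaler's actual code.
package scaler

import (
	"context"
	"fmt"

	tfv1 "example.com/tensor-fusion/api/v1"
	"example.com/tensor-fusion/internal/gpuallocator"
)

func scaleUp(ctx context.Context, alloc *gpuallocator.GpuAllocator, req tfv1.AdjustRequest, maxRetry int) error {
	for i := 0; i < maxRetry; i++ {
		remaining, _, err := alloc.AdjustAllocation(ctx, req, true) // dryRun pre-check
		if err == nil {
			_, _, err = alloc.AdjustAllocation(ctx, req, false) // commit for real
			return err
		}
		if !gpuallocator.IsScalingQuotaExceededError(err) {
			return err // a real failure, not a quota rejection
		}
		req = shrinkToFit(req, remaining) // retry within the reported remaining capacity
	}
	return fmt.Errorf("scale-up still exceeds quota after %d retries", maxRetry)
}

// shrinkToFit is hypothetical: it would cap the request at the remaining capacity.
func shrinkToFit(req tfv1.AdjustRequest, remaining tfv1.Resource) tfv1.AdjustRequest {
	return req
}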
func (*GpuAllocator) Alloc ¶
func (s *GpuAllocator) Alloc(req *tfv1.AllocRequest) ([]*tfv1.GPU, error)
Alloc allocates a request to a GPU, or to multiple GPUs from the same node. It is now implemented as a combination of Filter and Bind for backward compatibility.
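A hedged sketch of the one-shot path; the AllocRequest fields live in the tfv1 package and are elided here, and the import paths are placeholders.

// Illustrative one-shot allocation via Alloc.
package example

import (
	"log"

	tfv1 "example.com/tensor-fusion/api/v1"
	"example.com/tensor-fusion/internal/gpuallocator"
)

func allocateOnce(alloc *gpuallocator.GpuAllocator, req *tfv1.AllocRequest) error {
	gpus, err := alloc.Alloc(req) // internally: Filter, then Bind
	if err != nil {
		return err
	}
	for _, gpu := range gpus { // all selected GPUs come from the same node
		log.Printf("allocated on GPU %s", gpu.Name)
	}
	return nil
}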
func (*GpuAllocator) Bind ¶ added in v1.35.0
func (s *GpuAllocator) Bind(
	gpuNames []string,
	req *tfv1.AllocRequest,
) ([]*tfv1.GPU, error)
Bind allocates resources on the provided GPUs for the given request. It updates the in-memory store and marks the GPUs as dirty for syncing.
func (*GpuAllocator) CheckQuotaAndFilter ¶ added in v1.35.0
func (s *GpuAllocator) CheckQuotaAndFilter(ctx context.Context, req *tfv1.AllocRequest, isSimulateSchedule bool) ([]*tfv1.GPU, []filter.FilterDetail, error)
func (*GpuAllocator) CheckQuotaAndFilterSingleNodePreempt ¶ added in v1.44.0
func (s *GpuAllocator) CheckQuotaAndFilterSingleNodePreempt(
	nodeName string,
	allocReq *tfv1.AllocRequest,
	toPreemptPods sets.Set[types.NamespacedName],
) error
func (*GpuAllocator) Dealloc ¶
func (s *GpuAllocator) Dealloc(
	workloadNameNamespace tfv1.NameNamespace,
	gpus []string,
	podMeta metav1.ObjectMeta,
)
Dealloc removes a request from the given GPUs to release the resources allocated on them.
func (*GpuAllocator) DeallocAsync ¶ added in v1.41.0
func (s *GpuAllocator) DeallocAsync(
	workloadNameNamespace tfv1.NameNamespace,
	gpus []string,
	podMeta metav1.ObjectMeta,
)
func (*GpuAllocator) DeallocByPodIdentifier ¶ added in v1.41.5
func (s *GpuAllocator) DeallocByPodIdentifier(ctx context.Context, podIdentifier types.NamespacedName)
func (*GpuAllocator) Filter ¶ added in v1.35.0
func (s *GpuAllocator) Filter(
	req *tfv1.AllocRequest,
	toFilterGPUs []*tfv1.GPU,
	isSimulateSchedule bool,
) ([]*tfv1.GPU, []filter.FilterDetail, error)
Filter applies filters to a pool of GPUs based on the provided request and returns selected GPUs. It does not modify the GPU resources, only filters and selects them.
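To show the two-phase flow that Alloc wraps, a hedged sketch of Filter followed by Bind; a real caller would typically run Select or Score between the two, and the import paths are placeholders.

// Illustrative explicit Filter -> Bind flow.
package example

import (
	"log"

	tfv1 "example.com/tensor-fusion/api/v1"
	"example.com/tensor-fusion/internal/gpuallocator"
)

func filterThenBind(alloc *gpuallocator.GpuAllocator, req *tfv1.AllocRequest, pool []*tfv1.GPU) error {
	candidates, details, err := alloc.Filter(req, pool, false) // isSimulateSchedule=false
	if err != nil {
		for _, d := range details {
			log.Printf("filter detail: %+v", d) // per-stage rejection info
		}
		return err
	}
	names := make([]string, 0, len(candidates))
	for _, gpu := range candidates {
		names = append(names, gpu.Name)
	}
	_, err = alloc.Bind(names, req) // commits: updates the in-memory store, marks GPUs dirty
	return err
}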
func (*GpuAllocator) FilterWithPreempt ¶ added in v1.44.0
func (s *GpuAllocator) FilterWithPreempt(
	req *tfv1.AllocRequest,
	preemptAllocRequests []*tfv1.AllocRequest,
) ([]*tfv1.GPU, []filter.FilterDetail, error)
func (*GpuAllocator) GetAllocationInfo ¶ added in v1.39.1
func (s *GpuAllocator) GetAllocationInfo() (
	gpuStore map[types.NamespacedName]*tfv1.GPU,
	nodeWorkerStore map[string]map[types.NamespacedName]struct{},
	uniqueAllocation map[string]*tfv1.AllocRequest,
)
func (*GpuAllocator) GetAllocationReqByNodeName ¶ added in v1.43.8
func (s *GpuAllocator) GetAllocationReqByNodeName(nodeName string) []*tfv1.AllocRequest
func (*GpuAllocator) GetMatchedPartition ¶ added in v1.55.0
func (s *GpuAllocator) GetMatchedPartition( req *tfv1.AllocRequest, filteredGPUs []*tfv1.GPU, ) (*tfv1.GPU, *PartitionMatchResult, error)
GetMatchedPartition finds the best matching partition template for a request in partitioned mode. Returns the GPU, matched partition template, and partition UUID if a match is found. In partitioned mode, GPUs must have partition templates available, and we select the smallest template that can satisfy the request to minimize resource waste.
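A hedged usage sketch built only from the PartitionMatchResult fields documented below; the import paths are placeholders.

// Illustrative partitioned-mode matching.
package example

import (
	"fmt"
	"log"

	tfv1 "example.com/tensor-fusion/api/v1"
	"example.com/tensor-fusion/internal/gpuallocator"
)

func pickPartition(alloc *gpuallocator.GpuAllocator, req *tfv1.AllocRequest, filtered []*tfv1.GPU) error {
	gpu, match, err := alloc.GetMatchedPartition(req, filtered)
	if err != nil {
		return err
	}
	if !match.CanAllocate {
		return fmt.Errorf("no partition template fits: %s", match.Reason)
	}
	// Lower Score means a tighter fit, i.e. less wasted capacity.
	log.Printf("GPU %s matched template %s (score %.2f)", gpu.Name, match.TemplateID, match.Score)
	return nil
}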
func (*GpuAllocator) GetNodeGpuStore ¶ added in v1.42.0
func (s *GpuAllocator) GetNodeGpuStore() map[string]map[string]*tfv1.GPU
func (*GpuAllocator) GetQuotaStore ¶ added in v1.35.0
func (s *GpuAllocator) GetQuotaStore() *quota.QuotaStore
func (*GpuAllocator) GetScoringStrategy ¶ added in v1.46.5
func (s *GpuAllocator) GetScoringStrategy(cfg *config.GPUFitConfig, req *tfv1.AllocRequest) Strategy
func (*GpuAllocator) InitGPUAndQuotaStore ¶ added in v1.35.0
func (s *GpuAllocator) InitGPUAndQuotaStore() error
InitGPUAndQuotaStore initializes both GPU store and quota store from Kubernetes
func (*GpuAllocator) ListNonUsingNodes ¶ added in v1.41.1
func (s *GpuAllocator) ListNonUsingNodes() sets.Set[string]
func (*GpuAllocator) ReconcileAllocationState ¶ added in v1.35.0
func (s *GpuAllocator) ReconcileAllocationState()
When this instance is the leader, it should reconcile state based on existing workers. This function runs inside the storeMutex lock.
func (*GpuAllocator) ReconcileAllocationStateForTesting ¶ added in v1.44.0
func (s *GpuAllocator) ReconcileAllocationStateForTesting()
func (*GpuAllocator) RegisterBindHandler ¶ added in v1.46.0
func (s *GpuAllocator) RegisterBindHandler(handler func(req *tfv1.AllocRequest))
func (*GpuAllocator) Score ¶ added in v1.35.0
func (s *GpuAllocator) Score(
	ctx context.Context,
	strategy Strategy,
	req *tfv1.AllocRequest,
	nodeGPUs map[string][]*tfv1.GPU,
) map[string]map[string]int
The first map level is the Kubernetes node name, the second level is the GPU name, and the value is the score.
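A hedged sketch of consuming the nested score map; the import paths are placeholders.

// Illustrative iteration over Score's node -> GPU -> score map.
package example

import (
	"context"
	"log"

	tfv1 "example.com/tensor-fusion/api/v1"
	"example.com/tensor-fusion/internal/gpuallocator"
)

func logScores(ctx context.Context, alloc *gpuallocator.GpuAllocator, strategy gpuallocator.Strategy,
	req *tfv1.AllocRequest, nodeGPUs map[string][]*tfv1.GPU) {
	scores := alloc.Score(ctx, strategy, req, nodeGPUs)
	for nodeName, gpuScores := range scores { // first level: k8s node name
		for gpuName, score := range gpuScores { // second level: GPU name
			log.Printf("node=%s gpu=%s score=%d", nodeName, gpuName, score)
		}
	}
}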
func (*GpuAllocator) Select ¶ added in v1.35.0
func (s *GpuAllocator) Select(req *tfv1.AllocRequest, filteredGPUs []*tfv1.GPU) ([]*tfv1.GPU, error)
func (*GpuAllocator) SetAllocatorReady ¶ added in v1.39.1
func (s *GpuAllocator) SetAllocatorReady()
func (*GpuAllocator) SetMaxWorkerPerNode ¶ added in v1.35.0
func (s *GpuAllocator) SetMaxWorkerPerNode(maxWorkerPerNode int)
func (*GpuAllocator) SetupWithManager ¶
func (s *GpuAllocator) SetupWithManager(ctx context.Context, mgr manager.Manager) error
SetupWithManager sets up the GpuAllocator with the Manager.
func (*GpuAllocator) StartInformerForGPU ¶ added in v1.35.0
func (s *GpuAllocator) StartInformerForGPU(ctx context.Context, mgr manager.Manager) error
func (*GpuAllocator) SyncGPUsToK8s ¶ added in v1.35.0
func (s *GpuAllocator) SyncGPUsToK8s()
SyncGPUsToK8s syncs GPU status to Kubernetes
type LowLoadFirst ¶
type LowLoadFirst struct {
// contains filtered or unexported fields
}
LowLoadFirst selects GPU with maximum available resources (least utilized) to distribute workloads more evenly across GPUs
func (LowLoadFirst) Score ¶ added in v1.35.0
func (l LowLoadFirst) Score(gpu *tfv1.GPU, _ bool) int
The Score function is used by the Kubernetes scheduler framework.
func (LowLoadFirst) SelectGPUs ¶
func (l LowLoadFirst) SelectGPUs(gpus []*tfv1.GPU, count uint) ([]*tfv1.GPU, error)
SelectGPUs selects multiple GPUs from the same node with the most available resources (least load).
type NVIDIAMIGStrategy ¶ added in v1.55.0
type NVIDIAMIGStrategy struct{}
NVIDIAMIGStrategy implements PartitionStrategy for NVIDIA MIG
func (*NVIDIAMIGStrategy) AllocateSlot ¶ added in v1.55.0
func (s *NVIDIAMIGStrategy) AllocateSlot(gpu *tfv1.GPU, templateInfo config.PartitionTemplateInfo, gpuConfig *config.GpuInfo) (*uint32, *uint32, *uint32, error)
func (*NVIDIAMIGStrategy) CheckAvailability ¶ added in v1.55.0
func (s *NVIDIAMIGStrategy) CheckAvailability(gpu *tfv1.GPU, templateInfo config.PartitionTemplateInfo, gpuConfig *config.GpuInfo) error
func (*NVIDIAMIGStrategy) Name ¶ added in v1.55.0
func (s *NVIDIAMIGStrategy) Name() string
type NodeCompactGPULowLoad ¶ added in v1.46.5
type NodeCompactGPULowLoad struct {
// contains filtered or unexported fields
}
NodeCompactGPULowLoad selects the GPU with maximum available resources (least utilized) to distribute workloads more evenly across GPUs. This is the default mode since it balances cost and stability: workloads are packed onto a single node but scattered across that node's multiple GPUs.
func (NodeCompactGPULowLoad) Score ¶ added in v1.46.5
func (l NodeCompactGPULowLoad) Score(gpu *tfv1.GPU, isForNode bool) int
The Score function is used by the Kubernetes scheduler framework.
func (NodeCompactGPULowLoad) SelectGPUs ¶ added in v1.46.5
func (l NodeCompactGPULowLoad) SelectGPUs(gpus []*tfv1.GPU, count uint) ([]*tfv1.GPU, error)
SelectGPUs selects multiple GPUs from the same node with the most available resources (least loaded).
type NotSupportedPartitionStrategy ¶ added in v1.55.0
type NotSupportedPartitionStrategy struct{}
NotSupportedPartitionStrategy implements PartitionStrategy for unknown vendors
func (*NotSupportedPartitionStrategy) AllocateSlot ¶ added in v1.55.0
func (s *NotSupportedPartitionStrategy) AllocateSlot(gpu *tfv1.GPU, templateInfo config.PartitionTemplateInfo, gpuConfig *config.GpuInfo) (*uint32, *uint32, *uint32, error)
func (*NotSupportedPartitionStrategy) CheckAvailability ¶ added in v1.55.0
func (s *NotSupportedPartitionStrategy) CheckAvailability(gpu *tfv1.GPU, templateInfo config.PartitionTemplateInfo, gpuConfig *config.GpuInfo) error
func (*NotSupportedPartitionStrategy) Name ¶ added in v1.55.0
func (s *NotSupportedPartitionStrategy) Name() string
type PartitionMatchResult ¶ added in v1.55.0
type PartitionMatchResult struct {
Template *config.PartitionTemplateInfo // Template info from config
TemplateID string // Template ID
Score float64 // Lower score means better match (less waste)
CanAllocate bool
Reason string
}
PartitionMatchResult represents the result of matching a partition template to a request
func MatchPartitionTemplate ¶ added in v1.55.0
func MatchPartitionTemplate(gpuStatus tfv1.GPUStatus, req *tfv1.AllocRequest) (*PartitionMatchResult, error)
MatchPartitionTemplate matches a partition template to an allocation request. Gets template info from config (PartitionTemplateMap) based on GPU model. In partitioned mode, we find the smallest template that can satisfy the request.
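A hedged sketch of calling MatchPartitionTemplate directly with a GPU's status; the import paths are placeholders.

// Illustrative direct template matching for one GPU.
package example

import (
	"log"

	tfv1 "example.com/tensor-fusion/api/v1"
	"example.com/tensor-fusion/internal/gpuallocator"
)

func matchOne(gpu *tfv1.GPU, req *tfv1.AllocRequest) {
	result, err := gpuallocator.MatchPartitionTemplate(gpu.Status, req)
	if err != nil {
		log.Printf("matching failed: %v", err)
		return
	}
	if result.CanAllocate {
		// The smallest template that satisfies the request wins.
		log.Printf("template %s fits (waste score %.2f)", result.TemplateID, result.Score)
	} else {
		log.Printf("cannot allocate: %s", result.Reason)
	}
}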
type PartitionStrategy ¶ added in v1.55.0
type PartitionStrategy interface {
// Name returns the strategy name (e.g., "nvidia-mig", "ascend-vnpu")
Name() string
// CheckAvailability verifies if a partition can be allocated on the given GPU
// Returns nil if allocation is possible, error otherwise
CheckAvailability(gpu *tfv1.GPU, templateInfo config.PartitionTemplateInfo, gpuConfig *config.GpuInfo) error
// AllocateSlot finds and returns allocation position information
// Returns (isolationGroupID, slotStart, slotEnd, error)
// - For NVIDIA MIG: slotStart/slotEnd are physical slot positions
// - For Ascend: isolationGroupID is the vGroup (0-3), slots may be nil
AllocateSlot(gpu *tfv1.GPU, templateInfo config.PartitionTemplateInfo, gpuConfig *config.GpuInfo) (*uint32, *uint32, *uint32, error)
}
PartitionStrategy defines the interface for vendor-specific partition allocation strategies
func GetPartitionStrategy ¶ added in v1.55.0
func GetPartitionStrategy(vendor string) PartitionStrategy
GetPartitionStrategy returns the appropriate partition strategy based on GPU vendor
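A hedged sketch combining GetPartitionStrategy with the PartitionStrategy interface above; the vendor string values are assumptions, and the import paths are placeholders.

// Illustrative vendor-dispatched partition allocation.
package example

import (
	tfv1 "example.com/tensor-fusion/api/v1"
	"example.com/tensor-fusion/internal/config"       // placeholder path
	"example.com/tensor-fusion/internal/gpuallocator" // placeholder path
)

func allocatePartition(vendor string, gpu *tfv1.GPU,
	tpl config.PartitionTemplateInfo, gpuConfig *config.GpuInfo) error {
	strategy := gpuallocator.GetPartitionStrategy(vendor) // exact vendor values are assumptions
	if err := strategy.CheckAvailability(gpu, tpl, gpuConfig); err != nil {
		return err
	}
	groupID, slotStart, slotEnd, err := strategy.AllocateSlot(gpu, tpl, gpuConfig)
	if err != nil {
		return err
	}
	// NVIDIA MIG: slotStart/slotEnd are physical slot positions.
	// Ascend: groupID is the vGroup (0-3); the slots may be nil.
	_, _, _ = groupID, slotStart, slotEnd
	return nil
}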
type SimulateSchedulingFilterDetail ¶ added in v1.41.0
type SimulateSchedulingFilterDetail struct {
FilterStageDetails []filter.FilterDetail
}
When /api/simulate-schedule is called with a Pod YAML as the body, detailed filter results are returned.
func (*SimulateSchedulingFilterDetail) Clone ¶ added in v1.41.0
func (p *SimulateSchedulingFilterDetail) Clone() fwk.StateData
type Strategy ¶
type Strategy interface {
// When isForNode = true, indicates each GPU's node level score
// otherwise it's single GPU score inside one node
Score(gpu *tfv1.GPU, isForNode bool) int
SelectGPUs(gpus []*tfv1.GPU, count uint) ([]*tfv1.GPU, error)
}
func NewStrategy ¶
func NewStrategy(placementMode tfv1.PlacementMode, cfg *config.GPUFitConfig, nodeGpuStore map[string]map[string]*tfv1.GPU) Strategy
NewStrategy creates a strategy based on the placement mode
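A hedged sketch of building a Strategy and selecting GPUs through the interface above; the placement mode is left as a parameter because its constants live in the tfv1 package, and the import paths are placeholders.

// Illustrative strategy construction and selection.
package example

import (
	"log"

	tfv1 "example.com/tensor-fusion/api/v1"
	"example.com/tensor-fusion/internal/config"       // placeholder path
	"example.com/tensor-fusion/internal/gpuallocator" // placeholder path
)

func selectTwo(mode tfv1.PlacementMode, cfg *config.GPUFitConfig,
	nodeGpuStore map[string]map[string]*tfv1.GPU, gpus []*tfv1.GPU) {
	strategy := gpuallocator.NewStrategy(mode, cfg, nodeGpuStore)
	selected, err := strategy.SelectGPUs(gpus, 2)
	if err != nil {
		log.Printf("selection failed: %v", err)
		return
	}
	log.Printf("selected %d GPUs", len(selected))
}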