Documentation
¶
Overview ¶
Package gpuallocator handles GPU allocation.
Index ¶
- Constants
- Variables
- func IsScalingQuotaExceededError(err error) bool
- func RefreshGPUNodeCapacity(ctx context.Context, k8sClient client.Client, node *tfv1.GPUNode, ...) ([]string, error)
- type CompactFirst
- type GpuAllocator
- func (s *GpuAllocator) AdjustAllocation(ctx context.Context, adjustRequest tfv1.AdjustRequest, dryRun bool) (tfv1.Resource, error)
- func (s *GpuAllocator) Alloc(req *tfv1.AllocRequest) ([]*tfv1.GPU, error)
- func (s *GpuAllocator) Bind(gpuNames []string, req *tfv1.AllocRequest) ([]*tfv1.GPU, error)
- func (s *GpuAllocator) CheckQuotaAndFilter(ctx context.Context, req *tfv1.AllocRequest, isSimulateSchedule bool) ([]*tfv1.GPU, []filter.FilterDetail, error)
- func (s *GpuAllocator) CheckQuotaAndFilterSingleNodePreempt(nodeName string, allocReq *tfv1.AllocRequest, ...) error
- func (s *GpuAllocator) ComposeAllocationRequest(pod *v1.Pod) (*tfv1.AllocRequest, string, error)
- func (s *GpuAllocator) Dealloc(workloadNameNamespace tfv1.NameNamespace, gpus []string, ...)
- func (s *GpuAllocator) DeallocAsync(workloadNameNamespace tfv1.NameNamespace, gpus []string, ...)
- func (s *GpuAllocator) DeallocByPodIdentifier(ctx context.Context, podIdentifier types.NamespacedName)
- func (s *GpuAllocator) Filter(req *tfv1.AllocRequest, toFilterGPUs []*tfv1.GPU, isSimulateSchedule bool) ([]*tfv1.GPU, []filter.FilterDetail, error)
- func (s *GpuAllocator) FilterWithPreempt(req *tfv1.AllocRequest, preemptAllocRequests []*tfv1.AllocRequest) ([]*tfv1.GPU, []filter.FilterDetail, error)
- func (s *GpuAllocator) GetAllocationInfo() (gpuStore map[types.NamespacedName]*tfv1.GPU, ...)
- func (s *GpuAllocator) GetAllocationReqByNodeName(nodeName string) []*tfv1.AllocRequest
- func (s *GpuAllocator) GetNodeGpuStore() map[string]map[string]*tfv1.GPU
- func (s *GpuAllocator) GetQuotaStore() *quota.QuotaStore
- func (s *GpuAllocator) GetScoringStrategy(cfg *config.GPUFitConfig, req *tfv1.AllocRequest) Strategy
- func (s *GpuAllocator) InitGPUAndQuotaStore() error
- func (s *GpuAllocator) ListNonUsingNodes() sets.Set[string]
- func (s *GpuAllocator) ReconcileAllocationState()
- func (s *GpuAllocator) ReconcileAllocationStateForTesting()
- func (s *GpuAllocator) RegisterBindHandler(handler func(req *tfv1.AllocRequest))
- func (s *GpuAllocator) Score(ctx context.Context, strategy Strategy, req *tfv1.AllocRequest, ...) map[string]map[string]int
- func (s *GpuAllocator) Select(req *tfv1.AllocRequest, filteredGPUs []*tfv1.GPU) ([]*tfv1.GPU, error)
- func (s *GpuAllocator) SetAllocatorReady()
- func (s *GpuAllocator) SetMaxWorkerPerNode(maxWorkerPerNode int)
- func (s *GpuAllocator) SetupWithManager(ctx context.Context, mgr manager.Manager) error
- func (s *GpuAllocator) StartInformerForGPU(ctx context.Context, mgr manager.Manager) error
- func (s *GpuAllocator) Stop()
- func (s *GpuAllocator) SyncGPUsToK8s()
- type LowLoadFirst
- type NodeCompactGPULowLoad
- type SimulateSchedulingFilterDetail
- type Strategy
Constants ¶
const CleanUpCheckInterval = 3 * time.Minute
const MaxGPUCounterPerAllocation = 128
Variables ¶
var DefaultGPUSelector = func(
    strategy Strategy,
    nodeGPUMap map[string]map[string]*tfv1.GPU,
    validGPUs []*tfv1.GPU,
    count uint,
) ([]*tfv1.GPU, error) {
    if len(validGPUs) == 0 {
        return nil, fmt.Errorf("no GPUs available")
    }
    gpuMap := lo.GroupBy(validGPUs, func(gpu *tfv1.GPU) string {
        return gpu.Status.NodeSelector[constants.KubernetesHostNameLabel]
    })
    nodeScores := make([]struct {
        node  string
        score int
    }, 0, len(gpuMap))
    for node := range gpuMap {
        score := 0
        allGPUs := nodeGPUMap[node]
        for _, gpu := range allGPUs {
            if gpu == nil || gpu.Status.Available == nil || gpu.Status.Capacity == nil {
                return nil, fmt.Errorf("gpu %s has no available or capacity, unexpected state", gpu.Name)
            }
            score += strategy.Score(gpu, true)
        }
        nodeScores = append(nodeScores, struct {
            node  string
            score int
        }{node, score})
    }
    slices.SortFunc(nodeScores, func(a, b struct {
        node  string
        score int
    }) int {
        return b.score - a.score
    })
    for _, nodeScore := range nodeScores {
        gpus := gpuMap[nodeScore.node]
        if len(gpus) >= int(count) {
            slices.SortFunc(gpus, func(a, b *tfv1.GPU) int {
                return strategy.Score(b, false) - strategy.Score(a, false)
            })
            return gpus[:count], nil
        }
    }
    return nil, fmt.Errorf("not enough gpus in scoring stage")
}
DefaultGPUSelector is not used by the Kubernetes scheduler framework; as of now it is only used for allocator testing, and the framework composes similar logic itself.
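For example, a test might pick the two best GPUs on the highest-scoring node. A minimal sketch, assuming placementMode, cfg, nodeGPUMap, and validGPUs have been prepared by the test:

strategy := NewStrategy(placementMode, cfg, nodeGPUMap)
selected, err := DefaultGPUSelector(strategy, nodeGPUMap, validGPUs, 2)
if err != nil {
    return err // fewer than 2 suitable GPUs on any single node
}
// Both selected GPUs are on the same, best-scoring node.
_ = selected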
var GPUCapacityMap = map[string]tfv1.Resource{}
var ScalingQuotaExceededError = goerrors.New("scaling quota exceeded")
Functions ¶
func IsScalingQuotaExceededError ¶ added in v1.35.0
func IsScalingQuotaExceededError(err error) bool
Types ¶
type CompactFirst ¶
type CompactFirst struct {
// contains filtered or unexported fields
}
CompactFirst selects the GPUs with the minimum available resources (most utilized) to pack workloads efficiently and maximize GPU utilization.
func (CompactFirst) Score ¶ added in v1.35.0
func (c CompactFirst) Score(gpu *tfv1.GPU, _ bool) int
Score is used by the Kubernetes scheduler framework.
func (CompactFirst) SelectGPUs ¶
func (c CompactFirst) SelectGPUs(gpus []*tfv1.GPU, count uint) ([]*tfv1.GPU, error)
SelectGPUs selects multiple GPUs from the same node with the least available resources (most packed).
type GpuAllocator ¶
type GpuAllocator struct {
    // contains filtered or unexported fields
}
func NewGpuAllocator ¶
func (*GpuAllocator) AdjustAllocation ¶ added in v1.35.0
func (s *GpuAllocator) AdjustAllocation(ctx context.Context, adjustRequest tfv1.AdjustRequest, dryRun bool) (tfv1.Resource, error)
AdjustAllocation is used for scale-up and scale-down decisions. With dryRun=true it pre-checks capacity to determine whether the allocation would be valid when scaling up, returning an error along with the maximum new requests/limits on the existing GPU. The auto scaler can call AdjustAllocation directly for scale-down decisions; when scaling up it must first call AdjustAllocation with dryRun=true. If the returned error is ScalingQuotaExceededError, the allocation is invalid, and the scaler should retry with another AdjustRequest that does not exceed quota, bounded by the maximum returned in the first result, until AdjustAllocation returns a nil error, at most the pre-configured maxRetry times.
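A minimal sketch of that scale-up flow, assuming maxRetry is the pre-configured retry limit and deriveReducedRequest is a hypothetical helper that builds a smaller AdjustRequest from the returned maximum requests/limits:

func scaleUp(ctx context.Context, s *GpuAllocator, req tfv1.AdjustRequest) error {
    for i := 0; i < maxRetry; i++ {
        // Pre-check capacity with a dry run.
        maxNew, err := s.AdjustAllocation(ctx, req, true)
        if IsScalingQuotaExceededError(err) {
            // Invalid allocation: retry with a request bounded by the
            // maximum new requests/limits on the existing GPU.
            req = deriveReducedRequest(req, maxNew) // hypothetical helper
            continue
        }
        if err != nil {
            return err
        }
        // Dry run passed; apply the adjustment for real.
        _, err = s.AdjustAllocation(ctx, req, false)
        return err
    }
    return fmt.Errorf("scale-up still exceeds quota after %d retries", maxRetry)
}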
func (*GpuAllocator) Alloc ¶
func (s *GpuAllocator) Alloc(req *tfv1.AllocRequest) ([]*tfv1.GPU, error)
Alloc allocates a request to one GPU or multiple GPUs from the same node. It is now implemented as a combination of Filter and Bind, kept for backward compatibility.
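Roughly, Alloc behaves like the following manual Filter/Select/Bind sequence. A sketch only; passing bare GPU object names to Bind is an assumption based on the gpuNames parameter:

func allocManually(s *GpuAllocator, req *tfv1.AllocRequest, pool []*tfv1.GPU) ([]*tfv1.GPU, error) {
    // Narrow the pool without modifying any GPU resources.
    candidates, _, err := s.Filter(req, pool, false)
    if err != nil {
        return nil, err
    }
    // Pick the GPUs to use from the filtered candidates.
    selected, err := s.Select(req, candidates)
    if err != nil {
        return nil, err
    }
    names := make([]string, 0, len(selected))
    for _, gpu := range selected {
        names = append(names, gpu.Name)
    }
    // Bind updates the in-memory store and marks the GPUs dirty for syncing.
    return s.Bind(names, req)
}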
func (*GpuAllocator) Bind ¶ added in v1.35.0
func (s *GpuAllocator) Bind(
    gpuNames []string,
    req *tfv1.AllocRequest,
) ([]*tfv1.GPU, error)
Bind allocates resources on the provided GPUs for the given request. It updates the in-memory store and marks the GPUs as dirty for syncing.
func (*GpuAllocator) CheckQuotaAndFilter ¶ added in v1.35.0
func (s *GpuAllocator) CheckQuotaAndFilter(ctx context.Context, req *tfv1.AllocRequest, isSimulateSchedule bool) ([]*tfv1.GPU, []filter.FilterDetail, error)
func (*GpuAllocator) CheckQuotaAndFilterSingleNodePreempt ¶ added in v1.44.0
func (s *GpuAllocator) CheckQuotaAndFilterSingleNodePreempt(
    nodeName string,
    allocReq *tfv1.AllocRequest,
    toPreemptPods sets.Set[types.NamespacedName],
) error
func (*GpuAllocator) ComposeAllocationRequest ¶ added in v1.35.0
func (s *GpuAllocator) ComposeAllocationRequest(pod *v1.Pod) (*tfv1.AllocRequest, string, error)
func (*GpuAllocator) Dealloc ¶
func (s *GpuAllocator) Dealloc(
    workloadNameNamespace tfv1.NameNamespace,
    gpus []string,
    podMeta metav1.ObjectMeta,
)
Dealloc releases a request's allocated resources on the given GPUs, making them available again.
func (*GpuAllocator) DeallocAsync ¶ added in v1.41.0
func (s *GpuAllocator) DeallocAsync(
    workloadNameNamespace tfv1.NameNamespace,
    gpus []string,
    podMeta metav1.ObjectMeta,
)
func (*GpuAllocator) DeallocByPodIdentifier ¶ added in v1.41.5
func (s *GpuAllocator) DeallocByPodIdentifier(ctx context.Context, podIdentifier types.NamespacedName)
func (*GpuAllocator) Filter ¶ added in v1.35.0
func (s *GpuAllocator) Filter(
    req *tfv1.AllocRequest,
    toFilterGPUs []*tfv1.GPU,
    isSimulateSchedule bool,
) ([]*tfv1.GPU, []filter.FilterDetail, error)
Filter applies filters to a pool of GPUs based on the provided request and returns selected GPUs. It does not modify the GPU resources, only filters and selects them.
func (*GpuAllocator) FilterWithPreempt ¶ added in v1.44.0
func (s *GpuAllocator) FilterWithPreempt(
    req *tfv1.AllocRequest,
    preemptAllocRequests []*tfv1.AllocRequest,
) ([]*tfv1.GPU, []filter.FilterDetail, error)
func (*GpuAllocator) GetAllocationInfo ¶ added in v1.39.1
func (s *GpuAllocator) GetAllocationInfo() (
    gpuStore map[types.NamespacedName]*tfv1.GPU,
    nodeWorkerStore map[string]map[types.NamespacedName]struct{},
    uniqueAllocation map[string]*tfv1.AllocRequest,
)
func (*GpuAllocator) GetAllocationReqByNodeName ¶ added in v1.44.0
func (s *GpuAllocator) GetAllocationReqByNodeName(nodeName string) []*tfv1.AllocRequest
func (*GpuAllocator) GetNodeGpuStore ¶ added in v1.42.0
func (s *GpuAllocator) GetNodeGpuStore() map[string]map[string]*tfv1.GPU
func (*GpuAllocator) GetQuotaStore ¶ added in v1.35.0
func (s *GpuAllocator) GetQuotaStore() *quota.QuotaStore
func (*GpuAllocator) GetScoringStrategy ¶ added in v1.47.2
func (s *GpuAllocator) GetScoringStrategy(cfg *config.GPUFitConfig, req *tfv1.AllocRequest) Strategy
func (*GpuAllocator) InitGPUAndQuotaStore ¶ added in v1.35.0
func (s *GpuAllocator) InitGPUAndQuotaStore() error
InitGPUAndQuotaStore initializes both the GPU store and the quota store from Kubernetes.
func (*GpuAllocator) ListNonUsingNodes ¶ added in v1.41.1
func (s *GpuAllocator) ListNonUsingNodes() sets.Set[string]
func (*GpuAllocator) ReconcileAllocationState ¶ added in v1.35.0
func (s *GpuAllocator) ReconcileAllocationState()
When this instance is the leader, it reconciles allocation state based on existing workers. This function runs inside the storeMutex lock.
func (*GpuAllocator) ReconcileAllocationStateForTesting ¶ added in v1.44.0
func (s *GpuAllocator) ReconcileAllocationStateForTesting()
func (*GpuAllocator) RegisterBindHandler ¶ added in v1.46.3
func (s *GpuAllocator) RegisterBindHandler(handler func(req *tfv1.AllocRequest))
func (*GpuAllocator) Score ¶ added in v1.35.0
func (s *GpuAllocator) Score(
    ctx context.Context,
    strategy Strategy,
    req *tfv1.AllocRequest,
    nodeGPUs map[string][]*tfv1.GPU,
) map[string]map[string]int
In the returned map, the first-level key is the Kubernetes node name, the second-level key is the GPU name, and the value is the score.
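A short sketch of reading the result; the strategy is assumed to come from GetScoringStrategy and nodeGPUs from the node GPU store:

scores := s.Score(ctx, strategy, req, nodeGPUs)
for nodeName, gpuScores := range scores {
    for gpuName, score := range gpuScores {
        fmt.Printf("node=%s gpu=%s score=%d\n", nodeName, gpuName, score)
    }
}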
func (*GpuAllocator) Select ¶ added in v1.35.0
func (s *GpuAllocator) Select(req *tfv1.AllocRequest, filteredGPUs []*tfv1.GPU) ([]*tfv1.GPU, error)
func (*GpuAllocator) SetAllocatorReady ¶ added in v1.39.1
func (s *GpuAllocator) SetAllocatorReady()
func (*GpuAllocator) SetMaxWorkerPerNode ¶ added in v1.35.0
func (s *GpuAllocator) SetMaxWorkerPerNode(maxWorkerPerNode int)
SetMaxWorkerPerNode sets the maximum number of workers allowed per node.
func (*GpuAllocator) SetupWithManager ¶
func (s *GpuAllocator) SetupWithManager(ctx context.Context, mgr manager.Manager) error
SetupWithManager sets up the GpuAllocator with the Manager.
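A typical controller-runtime wiring sketch; how the allocator itself is constructed (e.g. via NewGpuAllocator, whose signature is not shown on this page) is assumed:

mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{})
if err != nil {
    return err
}
allocator := newAllocatorSomehow() // hypothetical; see NewGpuAllocator
if err := allocator.SetupWithManager(ctx, mgr); err != nil {
    return err
}
return mgr.Start(ctx)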
func (*GpuAllocator) StartInformerForGPU ¶ added in v1.35.0
func (s *GpuAllocator) StartInformerForGPU(ctx context.Context, mgr manager.Manager) error
func (*GpuAllocator) SyncGPUsToK8s ¶ added in v1.35.0
func (s *GpuAllocator) SyncGPUsToK8s()
SyncGPUsToK8s syncs GPU status to Kubernetes.
type LowLoadFirst ¶
type LowLoadFirst struct {
// contains filtered or unexported fields
}
LowLoadFirst selects the GPUs with the maximum available resources (least utilized) to distribute workloads more evenly across GPUs.
func (LowLoadFirst) Score ¶ added in v1.35.0
func (l LowLoadFirst) Score(gpu *tfv1.GPU, _ bool) int
Score is used by the Kubernetes scheduler framework.
func (LowLoadFirst) SelectGPUs ¶
func (l LowLoadFirst) SelectGPUs(gpus []*tfv1.GPU, count uint) ([]*tfv1.GPU, error)
SelectGPUs selects multiple GPUs from the same node with the most available resources (least loaded).
type NodeCompactGPULowLoad ¶ added in v1.47.2
type NodeCompactGPULowLoad struct {
// contains filtered or unexported fields
}
NodeCompactGPULowLoad selects the GPUs with the maximum available resources (least utilized) to distribute workloads more evenly across a node's GPUs, scattering a workload across multiple GPUs on a single node. This is the default mode since it balances cost and stability.
func (NodeCompactGPULowLoad) Score ¶ added in v1.47.2
func (l NodeCompactGPULowLoad) Score(gpu *tfv1.GPU, isForNode bool) int
Score is used by the Kubernetes scheduler framework.
func (NodeCompactGPULowLoad) SelectGPUs ¶ added in v1.47.2
func (l NodeCompactGPULowLoad) SelectGPUs(gpus []*tfv1.GPU, count uint) ([]*tfv1.GPU, error)
SelectGPUs selects multiple GPUs from the same node with the most available resources (least loaded).
type SimulateSchedulingFilterDetail ¶ added in v1.41.0
type SimulateSchedulingFilterDetail struct {
FilterStageDetails []filter.FilterDetail
}
When /api/simulate-schedule is called with a Pod YAML as the request body, detailed filter results are returned.
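A sketch of calling that endpoint; the server address and the content type are assumptions:

podYAML, err := os.ReadFile("pod.yaml")
if err != nil {
    return err
}
resp, err := http.Post("http://localhost:8080/api/simulate-schedule",
    "application/yaml", bytes.NewReader(podYAML))
if err != nil {
    return err
}
defer resp.Body.Close()
// The response carries the filter details for each filter stage.
io.Copy(os.Stdout, resp.Body)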
func (*SimulateSchedulingFilterDetail) Clone ¶ added in v1.41.0
func (p *SimulateSchedulingFilterDetail) Clone() fwk.StateData
type Strategy ¶
type Strategy interface {
    // Score returns the GPU's node-level score when isForNode is true;
    // otherwise it returns the single-GPU score within one node.
    Score(gpu *tfv1.GPU, isForNode bool) int
    SelectGPUs(gpus []*tfv1.GPU, count uint) ([]*tfv1.GPU, error)
}
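Any placement policy can be plugged in by satisfying this interface. A minimal illustrative implementation (not part of this package) that scores all GPUs equally and takes the first count candidates:

// FirstFit scores every GPU equally and simply takes the first count GPUs.
type FirstFit struct{}

func (FirstFit) Score(gpu *tfv1.GPU, isForNode bool) int { return 0 }

func (FirstFit) SelectGPUs(gpus []*tfv1.GPU, count uint) ([]*tfv1.GPU, error) {
    if uint(len(gpus)) < count {
        return nil, fmt.Errorf("need %d GPUs, got %d", count, len(gpus))
    }
    return gpus[:count], nil
}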
func NewStrategy ¶
func NewStrategy(placementMode tfv1.PlacementMode, cfg *config.GPUFitConfig, nodeGpuStore map[string]map[string]*tfv1.GPU) Strategy
NewStrategy creates a strategy based on the placement mode.