Documentation ¶
Overview ¶
Package gpuallocator handles GPU allocation.
Index ¶
- Constants
- Variables
- func CalculateAscendExtendedResourceUsage(gpuConfig *config.GpuInfo, templateID string) (map[string]uint32, error)
- func CalculatePartitionResourceUsage(capacityTflops resource.Quantity, gpuModel, templateID string) (tflops resource.Quantity, vram resource.Quantity, err error)
- func CheckPartitionAvailability(gpu *tfv1.GPU, templateID string) error
- func IsScalingQuotaExceededError(err error) bool
- func LoadPartitionTemplatesFromConfig(gpuInfos []config.GpuInfo)
- func RefreshGPUNodeCapacity(ctx context.Context, k8sClient client.Client, node *tfv1.GPUNode, ...) ([]string, error)
- type AscendPartitionStrategy
- func (s *AscendPartitionStrategy) AllocateSlot(gpu *tfv1.GPU, templateInfo config.PartitionTemplateInfo, ...) (*uint32, *uint32, *uint32, error)
- func (s *AscendPartitionStrategy) CheckAvailability(gpu *tfv1.GPU, templateInfo config.PartitionTemplateInfo, ...) error
- func (s *AscendPartitionStrategy) Name() string
- type CompactFirst
- type GpuAllocator
- func (s *GpuAllocator) AdjustAllocation(ctx context.Context, adjustRequest tfv1.AdjustRequest, dryRun bool) (tfv1.Resource, tfv1.Resource, error)
- func (s *GpuAllocator) Alloc(req *tfv1.AllocRequest) ([]*tfv1.GPU, error)
- func (s *GpuAllocator) Bind(gpuNames []string, req *tfv1.AllocRequest) ([]*tfv1.GPU, error)
- func (s *GpuAllocator) CheckQuotaAndFilter(ctx context.Context, req *tfv1.AllocRequest, isSimulateSchedule bool) ([]*tfv1.GPU, []filter.FilterDetail, error)
- func (s *GpuAllocator) CheckQuotaAndFilterSingleNodePreempt(nodeName string, allocReq *tfv1.AllocRequest, ...) error
- func (s *GpuAllocator) Dealloc(workloadNameNamespace tfv1.NameNamespace, gpus []string, ...)
- func (s *GpuAllocator) DeallocAsync(workloadNameNamespace tfv1.NameNamespace, gpus []string, ...)
- func (s *GpuAllocator) DeallocByPodIdentifier(ctx context.Context, podIdentifier types.NamespacedName)
- func (s *GpuAllocator) Filter(req *tfv1.AllocRequest, toFilterGPUs []*tfv1.GPU, isSimulateSchedule bool) ([]*tfv1.GPU, []filter.FilterDetail, error)
- func (s *GpuAllocator) FilterWithPreempt(req *tfv1.AllocRequest, preemptAllocRequests []*tfv1.AllocRequest) ([]*tfv1.GPU, []filter.FilterDetail, error)
- func (s *GpuAllocator) GetAllocationInfo() (gpuStore map[types.NamespacedName]*tfv1.GPU, ...)
- func (s *GpuAllocator) GetAllocationReqByNodeName(nodeName string) []*tfv1.AllocRequest
- func (s *GpuAllocator) GetMatchedPartition(req *tfv1.AllocRequest, filteredGPUs []*tfv1.GPU) (*tfv1.GPU, *PartitionMatchResult, error)
- func (s *GpuAllocator) GetNodeGpuStore() map[string]map[string]*tfv1.GPU
- func (s *GpuAllocator) GetQuotaStore() *quota.QuotaStore
- func (s *GpuAllocator) GetScoringStrategy(cfg *config.GPUFitConfig, req *tfv1.AllocRequest) Strategy
- func (s *GpuAllocator) InitGPUAndQuotaStore() error
- func (s *GpuAllocator) ListNonUsingNodes() sets.Set[string]
- func (s *GpuAllocator) ReconcileAllocationState()
- func (s *GpuAllocator) ReconcileAllocationStateForTesting()
- func (s *GpuAllocator) RegisterBindHandler(handler func(req *tfv1.AllocRequest))
- func (s *GpuAllocator) Score(ctx context.Context, strategy Strategy, req *tfv1.AllocRequest, ...) map[string]map[string]int
- func (s *GpuAllocator) Select(req *tfv1.AllocRequest, filteredGPUs []*tfv1.GPU) ([]*tfv1.GPU, error)
- func (s *GpuAllocator) SetAllocatorReady()
- func (s *GpuAllocator) SetMaxWorkerPerNode(maxWorkerPerNode int)
- func (s *GpuAllocator) SetupWithManager(ctx context.Context, mgr manager.Manager) error
- func (s *GpuAllocator) StartInformerForGPU(ctx context.Context, mgr manager.Manager) error
- func (s *GpuAllocator) Stop()
- func (s *GpuAllocator) SyncGPUsToK8s()
- type LowLoadFirst
- type NVIDIAMIGStrategy
- type NodeCompactGPULowLoad
- type NotSupportedPartitionStrategy
- func (s *NotSupportedPartitionStrategy) AllocateSlot(gpu *tfv1.GPU, templateInfo config.PartitionTemplateInfo, ...) (*uint32, *uint32, *uint32, error)
- func (s *NotSupportedPartitionStrategy) CheckAvailability(gpu *tfv1.GPU, templateInfo config.PartitionTemplateInfo, ...) error
- func (s *NotSupportedPartitionStrategy) Name() string
- type PartitionMatchResult
- type PartitionStrategy
- type SimulateSchedulingFilterDetail
- type Strategy
Constants ¶
const CleanUpCheckInterval = 3 * time.Minute
const DefaultMaxPartitionNum = 32
const PartitionMatchingComputingWeight = 0.6
const PartitionMatchingVRAMWeight = 0.4
Variables ¶
var DefaultGPUSelector = func(
	strategy Strategy,
	nodeGPUMap map[string]map[string]*tfv1.GPU,
	validGPUs []*tfv1.GPU,
	count uint,
) ([]*tfv1.GPU, error) {
	if len(validGPUs) == 0 {
		return nil, fmt.Errorf("no GPUs available")
	}
	gpuMap := lo.GroupBy(validGPUs, func(gpu *tfv1.GPU) string {
		return gpu.Status.NodeSelector[constants.KubernetesHostNameLabel]
	})
	nodeScores := make([]struct {
		node  string
		score int
	}, 0, len(gpuMap))
	for node := range gpuMap {
		score := 0
		allGPUs := nodeGPUMap[node]
		for _, gpu := range allGPUs {
			if gpu == nil || gpu.Status.Available == nil || gpu.Status.Capacity == nil {
				return nil, fmt.Errorf("gpu %s has no available or capacity, unexpected state", gpu.Name)
			}
			score += strategy.Score(gpu, true)
		}
		nodeScores = append(nodeScores, struct {
			node  string
			score int
		}{node, score})
	}
	slices.SortFunc(nodeScores, func(a, b struct {
		node  string
		score int
	}) int {
		return b.score - a.score
	})
	for _, nodeScore := range nodeScores {
		gpus := gpuMap[nodeScore.node]
		if len(gpus) >= int(count) {
			slices.SortFunc(gpus, func(a, b *tfv1.GPU) int {
				return strategy.Score(b, false) - strategy.Score(a, false)
			})
			return gpus[:count], nil
		}
	}
	return nil, fmt.Errorf("not enough gpus in scoring stage")
}
DefaultGPUSelector is not used by the Kubernetes scheduler framework; as of now it is only used for allocator testing. The framework composes similar logic itself.
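For illustration only, a minimal sketch of calling DefaultGPUSelector the way an allocator test might; the import paths are placeholders and the empty stores are assumptions, not part of this package's API.

// Hypothetical test-style driver for DefaultGPUSelector.
// Import paths are placeholders for the real module.
package gpuallocator_test

import (
	"testing"

	tfv1 "example.com/tensor-fusion/api/v1"
	"example.com/tensor-fusion/internal/gpuallocator"
)

func TestDefaultGPUSelectorEmpty(t *testing.T) {
	nodeGPUMap := map[string]map[string]*tfv1.GPU{} // node name -> GPU name -> GPU
	var validGPUs []*tfv1.GPU                       // normally the output of Filter

	// Zero-value CompactFirst is used purely for illustration.
	_, err := gpuallocator.DefaultGPUSelector(gpuallocator.CompactFirst{}, nodeGPUMap, validGPUs, 2)
	if err == nil {
		t.Fatal("expected an error when no GPUs are available")
	}
}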
var GPUCapacityMap = map[string]tfv1.Resource{}
var MaxIsolationGroupsMap = map[string]uint32{}
MaxIsolationGroupsMap stores max isolation groups by GPU model. Key: GPU model; Value: max isolation groups (e.g., 4 for Ascend vGroups).
var MaxPartitionsMap = map[string]uint32{}
MaxPartitionsMap stores max partitions by GPU model. Key: GPU model; Value: max partitions (e.g., 7 for MIG).
var MaxPlacementSlotsMap = map[string]uint32{}
MaxPlacementSlotsMap stores max placement slots by GPU model. Key: GPU model; Value: max placement slots (e.g., 8 for MIG).
var PartitionTemplateMap = map[string]map[string]config.PartitionTemplateInfo{}
PartitionTemplateMap stores partition template info by GPU model. Key: GPU model (e.g., "A100_SXM_80G"); Value: map of templateID -> template info.
var ScalingQuotaExceededError = goerrors.New("scaling quota exceeded")
var TotalExtendedResourcesMap = map[string]map[string]uint32{}
TotalExtendedResourcesMap stores total extended resources by GPU model. Key: GPU model; Value: map of resource name -> total capacity. For Ascend NPU: {"AICORE": 8, "AICPU": 7, "VPC": 12, ...}.
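To make the shapes of these maps concrete, a hedged sketch that reads them after LoadPartitionTemplatesFromConfig has run; the model key reuses the example from the comments above, and the import path is a placeholder.

// Illustrative read of the partition lookup maps; assumes
// LoadPartitionTemplatesFromConfig was already called with the GPU info config.
package main

import (
	"fmt"

	"example.com/tensor-fusion/internal/gpuallocator" // placeholder import path
)

func describeModel(model string) { // e.g. "A100_SXM_80G"
	templates := gpuallocator.PartitionTemplateMap[model] // templateID -> template info
	fmt.Printf("model %s: %d templates, max %d partitions, max %d isolation groups\n",
		model, len(templates),
		gpuallocator.MaxPartitionsMap[model],      // e.g. 7 for MIG
		gpuallocator.MaxIsolationGroupsMap[model]) // e.g. 4 for Ascend vGroups
	for name, total := range gpuallocator.TotalExtendedResourcesMap[model] {
		fmt.Printf("  extended resource %s: %d\n", name, total) // e.g. AICORE: 8
	}
}

func main() { describeModel("A100_SXM_80G") }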
Functions ¶
func CalculateAscendExtendedResourceUsage ¶ added in v1.55.0
func CalculateAscendExtendedResourceUsage(gpuConfig *config.GpuInfo, templateID string) (map[string]uint32, error)
CalculateAscendExtendedResourceUsage calculates the total extended resource usage for a template.
func CalculatePartitionResourceUsage ¶ added in v1.55.0
func CalculatePartitionResourceUsage(capacityTflops resource.Quantity, gpuModel, templateID string) (tflops resource.Quantity, vram resource.Quantity, err error)
CalculatePartitionResourceUsage calculates the resource usage for a partition template. Gets template info from config.
func CheckPartitionAvailability ¶ added in v1.55.0
func CheckPartitionAvailability(gpu *tfv1.GPU, templateID string) error
CheckPartitionAvailability checks if a GPU has enough resources to allocate a partition. It gets template info from config and uses the vendor-specific strategy for slot/group checking.
func IsScalingQuotaExceededError ¶ added in v1.35.0
func IsScalingQuotaExceededError(err error) bool
func LoadPartitionTemplatesFromConfig ¶ added in v1.55.0
func LoadPartitionTemplatesFromConfig(gpuInfos []config.GpuInfo)
LoadPartitionTemplatesFromConfig loads partition templates and max partitions from the GPU info config. It should be called when the GPU info config is loaded or updated.
Types ¶
type AscendPartitionStrategy ¶ added in v1.55.0
type AscendPartitionStrategy struct{}
AscendPartitionStrategy implements PartitionStrategy for Huawei Ascend NPU.

Key concepts:
  - vNPU: virtual NPU created from a template (e.g., vir01, vir04)
  - vGroup: isolation group (0-3), provides hardware isolation between groups
  - Templates sharing the same vGroup use time-sharing (soft isolation)
  - Different vGroups provide hard isolation
func (*AscendPartitionStrategy) AllocateSlot ¶ added in v1.55.0
func (s *AscendPartitionStrategy) AllocateSlot(gpu *tfv1.GPU, templateInfo config.PartitionTemplateInfo, gpuConfig *config.GpuInfo) (*uint32, *uint32, *uint32, error)
func (*AscendPartitionStrategy) CheckAvailability ¶ added in v1.55.0
func (s *AscendPartitionStrategy) CheckAvailability(gpu *tfv1.GPU, templateInfo config.PartitionTemplateInfo, gpuConfig *config.GpuInfo) error
func (*AscendPartitionStrategy) Name ¶ added in v1.55.0
func (s *AscendPartitionStrategy) Name() string
type CompactFirst ¶
type CompactFirst struct {
// contains filtered or unexported fields
}
CompactFirst selects GPU with minimum available resources (most utilized) to efficiently pack workloads and maximize GPU utilization
func (CompactFirst) Score ¶ added in v1.35.0
func (c CompactFirst) Score(gpu *tfv1.GPU, _ bool) int
The Score function is used by the Kubernetes scheduler framework.
func (CompactFirst) SelectGPUs ¶
func (c CompactFirst) SelectGPUs(gpus []*tfv1.GPU, count uint) ([]*tfv1.GPU, error)
SelectGPUs selects multiple GPUs from the same node with the least available resources (most packed).
type GpuAllocator ¶
type GpuAllocator struct {
	// contains filtered or unexported fields
}
func NewGpuAllocator ¶
func NewGpuAllocator(
	ctx context.Context,
	indexAllocator *indexallocator.IndexAllocator,
	client client.Client,
	syncInterval time.Duration,
) *GpuAllocator
func (*GpuAllocator) AdjustAllocation ¶ added in v1.35.0
func (s *GpuAllocator) AdjustAllocation(ctx context.Context, adjustRequest tfv1.AdjustRequest, dryRun bool) (tfv1.Resource, tfv1.Resource, error)
AdjustAllocation is used for scaling decisions. For a scale-down decision the auto-scaler can call it directly. For scale-up it must call AdjustAllocation with dryRun=true to pre-check capacity and determine whether the allocation is valid; if the returned error is ScalingQuotaExceededError, the allocation is invalid, and the auto-scaler should scale up with another AdjustRequest that does not exceed quota (the maximum new requests/limits on the existing GPU are returned in the first result), retrying until AdjustAllocation returns a nil error, at most a pre-configured maxRetry times.

Returns the remaining resource, the delta resource, and an error.
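The scale-up protocol above might look like the following hedged sketch; shrinkToFit is a hypothetical helper, and the import paths are placeholders.

// Hedged sketch of the documented dry-run retry protocol; not the
// auto-scaler's actual code.
package scaler

import (
	"context"
	"fmt"

	tfv1 "example.com/tensor-fusion/api/v1"
	"example.com/tensor-fusion/internal/gpuallocator"
)

func scaleUp(ctx context.Context, alloc *gpuallocator.GpuAllocator, req tfv1.AdjustRequest, maxRetry int) error {
	for i := 0; i < maxRetry; i++ {
		remaining, _, err := alloc.AdjustAllocation(ctx, req, true) // dryRun pre-check
		if err == nil {
			_, _, err = alloc.AdjustAllocation(ctx, req, false) // commit for real
			return err
		}
		if !gpuallocator.IsScalingQuotaExceededError(err) {
			return err // a real failure, not a quota rejection
		}
		req = shrinkToFit(req, remaining) // retry within the reported remaining capacity
	}
	return fmt.Errorf("scale-up still exceeds quota after %d retries", maxRetry)
}

// shrinkToFit is hypothetical: it would cap the request at the remaining capacity.
func shrinkToFit(req tfv1.AdjustRequest, remaining tfv1.Resource) tfv1.AdjustRequest {
	return req
}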
func (*GpuAllocator) Alloc ¶
func (s *GpuAllocator) Alloc(req *tfv1.AllocRequest) ([]*tfv1.GPU, error)
Alloc allocates a request to a GPU, or to multiple GPUs from the same node. It is now implemented as a combination of Filter and Bind for backward compatibility.
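A hedged sketch of the one-shot path; the AllocRequest fields live in the tfv1 package and are elided here, and the import paths are placeholders.

// Illustrative one-shot allocation via Alloc.
package example

import (
	"log"

	tfv1 "example.com/tensor-fusion/api/v1"
	"example.com/tensor-fusion/internal/gpuallocator"
)

func allocateOnce(alloc *gpuallocator.GpuAllocator, req *tfv1.AllocRequest) error {
	gpus, err := alloc.Alloc(req) // internally: Filter, then Bind
	if err != nil {
		return err
	}
	for _, gpu := range gpus { // all selected GPUs come from the same node
		log.Printf("allocated on GPU %s", gpu.Name)
	}
	return nil
}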
func (*GpuAllocator) Bind ¶ added in v1.35.0
func (s *GpuAllocator) Bind(
	gpuNames []string,
	req *tfv1.AllocRequest,
) ([]*tfv1.GPU, error)
Bind allocates resources on the provided GPUs for the given request. It updates the in-memory store and marks the GPUs as dirty for syncing.
func (*GpuAllocator) CheckQuotaAndFilter ¶ added in v1.35.0
func (s *GpuAllocator) CheckQuotaAndFilter(ctx context.Context, req *tfv1.AllocRequest, isSimulateSchedule bool) ([]*tfv1.GPU, []filter.FilterDetail, error)
func (*GpuAllocator) CheckQuotaAndFilterSingleNodePreempt ¶ added in v1.44.0
func (s *GpuAllocator) CheckQuotaAndFilterSingleNodePreempt(
	nodeName string,
	allocReq *tfv1.AllocRequest,
	toPreemptPods sets.Set[types.NamespacedName],
) error
func (*GpuAllocator) Dealloc ¶
func (s *GpuAllocator) Dealloc(
	workloadNameNamespace tfv1.NameNamespace,
	gpus []string,
	podMeta metav1.ObjectMeta,
)
Dealloc removes a request from the given GPUs to release the resources allocated on them.
func (*GpuAllocator) DeallocAsync ¶ added in v1.41.0
func (s *GpuAllocator) DeallocAsync(
	workloadNameNamespace tfv1.NameNamespace,
	gpus []string,
	podMeta metav1.ObjectMeta,
)
func (*GpuAllocator) DeallocByPodIdentifier ¶ added in v1.41.5
func (s *GpuAllocator) DeallocByPodIdentifier(ctx context.Context, podIdentifier types.NamespacedName)
func (*GpuAllocator) Filter ¶ added in v1.35.0
func (s *GpuAllocator) Filter(
	req *tfv1.AllocRequest,
	toFilterGPUs []*tfv1.GPU,
	isSimulateSchedule bool,
) ([]*tfv1.GPU, []filter.FilterDetail, error)
Filter applies filters to a pool of GPUs based on the provided request and returns selected GPUs. It does not modify the GPU resources, only filters and selects them.
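To show the two-phase flow that Alloc wraps, a hedged sketch of Filter followed by Bind; a real caller would typically run Select or Score between the two, and the import paths are placeholders.

// Illustrative explicit Filter -> Bind flow.
package example

import (
	"log"

	tfv1 "example.com/tensor-fusion/api/v1"
	"example.com/tensor-fusion/internal/gpuallocator"
)

func filterThenBind(alloc *gpuallocator.GpuAllocator, req *tfv1.AllocRequest, pool []*tfv1.GPU) error {
	candidates, details, err := alloc.Filter(req, pool, false) // isSimulateSchedule=false
	if err != nil {
		for _, d := range details {
			log.Printf("filter detail: %+v", d) // per-stage rejection info
		}
		return err
	}
	names := make([]string, 0, len(candidates))
	for _, gpu := range candidates {
		names = append(names, gpu.Name)
	}
	_, err = alloc.Bind(names, req) // commits: updates the in-memory store, marks GPUs dirty
	return err
}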
func (*GpuAllocator) FilterWithPreempt ¶ added in v1.44.0
func (s *GpuAllocator) FilterWithPreempt(
	req *tfv1.AllocRequest,
	preemptAllocRequests []*tfv1.AllocRequest,
) ([]*tfv1.GPU, []filter.FilterDetail, error)
func (*GpuAllocator) GetAllocationInfo ¶ added in v1.39.1
func (s *GpuAllocator) GetAllocationInfo() (
	gpuStore map[types.NamespacedName]*tfv1.GPU,
	nodeWorkerStore map[string]map[types.NamespacedName]struct{},
	uniqueAllocation map[string]*tfv1.AllocRequest,
)
func (*GpuAllocator) GetAllocationReqByNodeName ¶ added in v1.43.8
func (s *GpuAllocator) GetAllocationReqByNodeName(nodeName string) []*tfv1.AllocRequest
func (*GpuAllocator) GetMatchedPartition ¶ added in v1.55.0
func (s *GpuAllocator) GetMatchedPartition( req *tfv1.AllocRequest, filteredGPUs []*tfv1.GPU, ) (*tfv1.GPU, *PartitionMatchResult, error)
GetMatchedPartition finds the best matching partition template for a request in partitioned mode. Returns the GPU, matched partition template, and partition UUID if a match is found. In partitioned mode, GPUs must have partition templates available, and we select the smallest template that can satisfy the request to minimize resource waste.
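A hedged usage sketch built only from the PartitionMatchResult fields documented below; the import paths are placeholders.

// Illustrative partitioned-mode matching.
package example

import (
	"fmt"
	"log"

	tfv1 "example.com/tensor-fusion/api/v1"
	"example.com/tensor-fusion/internal/gpuallocator"
)

func pickPartition(alloc *gpuallocator.GpuAllocator, req *tfv1.AllocRequest, filtered []*tfv1.GPU) error {
	gpu, match, err := alloc.GetMatchedPartition(req, filtered)
	if err != nil {
		return err
	}
	if !match.CanAllocate {
		return fmt.Errorf("no partition template fits: %s", match.Reason)
	}
	// Lower Score means a tighter fit, i.e. less wasted capacity.
	log.Printf("GPU %s matched template %s (score %.2f)", gpu.Name, match.TemplateID, match.Score)
	return nil
}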
func (*GpuAllocator) GetNodeGpuStore ¶ added in v1.42.0
func (s *GpuAllocator) GetNodeGpuStore() map[string]map[string]*tfv1.GPU
func (*GpuAllocator) GetQuotaStore ¶ added in v1.35.0
func (s *GpuAllocator) GetQuotaStore() *quota.QuotaStore
func (*GpuAllocator) GetScoringStrategy ¶ added in v1.46.5
func (s *GpuAllocator) GetScoringStrategy(cfg *config.GPUFitConfig, req *tfv1.AllocRequest) Strategy
func (*GpuAllocator) InitGPUAndQuotaStore ¶ added in v1.35.0
func (s *GpuAllocator) InitGPUAndQuotaStore() error
InitGPUAndQuotaStore initializes both GPU store and quota store from Kubernetes
func (*GpuAllocator) ListNonUsingNodes ¶ added in v1.41.1
func (s *GpuAllocator) ListNonUsingNodes() sets.Set[string]
func (*GpuAllocator) ReconcileAllocationState ¶ added in v1.35.0
func (s *GpuAllocator) ReconcileAllocationState()
When this instance is the leader, it should reconcile state based on existing workers. This function runs inside the storeMutex lock.
func (*GpuAllocator) ReconcileAllocationStateForTesting ¶ added in v1.44.0
func (s *GpuAllocator) ReconcileAllocationStateForTesting()
func (*GpuAllocator) RegisterBindHandler ¶ added in v1.46.0
func (s *GpuAllocator) RegisterBindHandler(handler func(req *tfv1.AllocRequest))
func (*GpuAllocator) Score ¶ added in v1.35.0
func (s *GpuAllocator) Score(
	ctx context.Context,
	strategy Strategy,
	req *tfv1.AllocRequest,
	nodeGPUs map[string][]*tfv1.GPU,
) map[string]map[string]int
The first map level is the Kubernetes node name, the second level is the GPU name, and the value is the score.
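A hedged sketch of consuming the nested score map; the import paths are placeholders.

// Illustrative iteration over Score's node -> GPU -> score map.
package example

import (
	"context"
	"log"

	tfv1 "example.com/tensor-fusion/api/v1"
	"example.com/tensor-fusion/internal/gpuallocator"
)

func logScores(ctx context.Context, alloc *gpuallocator.GpuAllocator, strategy gpuallocator.Strategy,
	req *tfv1.AllocRequest, nodeGPUs map[string][]*tfv1.GPU) {
	scores := alloc.Score(ctx, strategy, req, nodeGPUs)
	for nodeName, gpuScores := range scores { // first level: k8s node name
		for gpuName, score := range gpuScores { // second level: GPU name
			log.Printf("node=%s gpu=%s score=%d", nodeName, gpuName, score)
		}
	}
}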
func (*GpuAllocator) Select ¶ added in v1.35.0
func (s *GpuAllocator) Select(req *tfv1.AllocRequest, filteredGPUs []*tfv1.GPU) ([]*tfv1.GPU, error)
func (*GpuAllocator) SetAllocatorReady ¶ added in v1.39.1
func (s *GpuAllocator) SetAllocatorReady()
func (*GpuAllocator) SetMaxWorkerPerNode ¶ added in v1.35.0
func (s *GpuAllocator) SetMaxWorkerPerNode(maxWorkerPerNode int)
func (*GpuAllocator) SetupWithManager ¶
func (s *GpuAllocator) SetupWithManager(ctx context.Context, mgr manager.Manager) error
SetupWithManager sets up the GpuAllocator with the Manager.
func (*GpuAllocator) StartInformerForGPU ¶ added in v1.35.0
func (s *GpuAllocator) StartInformerForGPU(ctx context.Context, mgr manager.Manager) error
func (*GpuAllocator) SyncGPUsToK8s ¶ added in v1.35.0
func (s *GpuAllocator) SyncGPUsToK8s()
SyncGPUsToK8s syncs GPU status to Kubernetes
type LowLoadFirst ¶
type LowLoadFirst struct {
// contains filtered or unexported fields
}
LowLoadFirst selects GPU with maximum available resources (least utilized) to distribute workloads more evenly across GPUs
func (LowLoadFirst) Score ¶ added in v1.35.0
func (l LowLoadFirst) Score(gpu *tfv1.GPU, _ bool) int
The Score function is used by the Kubernetes scheduler framework.
func (LowLoadFirst) SelectGPUs ¶
func (l LowLoadFirst) SelectGPUs(gpus []*tfv1.GPU, count uint) ([]*tfv1.GPU, error)
SelectGPUs selects multiple GPUs from the same node with the most available resources (least load).
type NVIDIAMIGStrategy ¶ added in v1.55.0
type NVIDIAMIGStrategy struct{}
NVIDIAMIGStrategy implements PartitionStrategy for NVIDIA MIG
func (*NVIDIAMIGStrategy) AllocateSlot ¶ added in v1.55.0
func (s *NVIDIAMIGStrategy) AllocateSlot(gpu *tfv1.GPU, templateInfo config.PartitionTemplateInfo, gpuConfig *config.GpuInfo) (*uint32, *uint32, *uint32, error)
func (*NVIDIAMIGStrategy) CheckAvailability ¶ added in v1.55.0
func (s *NVIDIAMIGStrategy) CheckAvailability(gpu *tfv1.GPU, templateInfo config.PartitionTemplateInfo, gpuConfig *config.GpuInfo) error
func (*NVIDIAMIGStrategy) Name ¶ added in v1.55.0
func (s *NVIDIAMIGStrategy) Name() string
type NodeCompactGPULowLoad ¶ added in v1.46.5
type NodeCompactGPULowLoad struct {
// contains filtered or unexported fields
}
NodeCompactGPULowLoad selects the GPU with maximum available resources (least utilized) to distribute workloads more evenly across GPUs. This is the default mode since it balances cost and stability: workloads are packed onto a single node but scattered across that node's multiple GPUs.
func (NodeCompactGPULowLoad) Score ¶ added in v1.46.5
func (l NodeCompactGPULowLoad) Score(gpu *tfv1.GPU, isForNode bool) int
The Score function is used by the Kubernetes scheduler framework.
func (NodeCompactGPULowLoad) SelectGPUs ¶ added in v1.46.5
func (l NodeCompactGPULowLoad) SelectGPUs(gpus []*tfv1.GPU, count uint) ([]*tfv1.GPU, error)
SelectGPUs selects multiple GPUs from the same node with the most available resources (least loaded).
type NotSupportedPartitionStrategy ¶ added in v1.55.0
type NotSupportedPartitionStrategy struct{}
NotSupportedPartitionStrategy implements PartitionStrategy for unknown vendors
func (*NotSupportedPartitionStrategy) AllocateSlot ¶ added in v1.55.0
func (s *NotSupportedPartitionStrategy) AllocateSlot(gpu *tfv1.GPU, templateInfo config.PartitionTemplateInfo, gpuConfig *config.GpuInfo) (*uint32, *uint32, *uint32, error)
func (*NotSupportedPartitionStrategy) CheckAvailability ¶ added in v1.55.0
func (s *NotSupportedPartitionStrategy) CheckAvailability(gpu *tfv1.GPU, templateInfo config.PartitionTemplateInfo, gpuConfig *config.GpuInfo) error
func (*NotSupportedPartitionStrategy) Name ¶ added in v1.55.0
func (s *NotSupportedPartitionStrategy) Name() string
type PartitionMatchResult ¶ added in v1.55.0
type PartitionMatchResult struct {
Template *config.PartitionTemplateInfo // Template info from config
TemplateID string // Template ID
Score float64 // Lower score means better match (less waste)
CanAllocate bool
Reason string
}
PartitionMatchResult represents the result of matching a partition template to a request
func MatchPartitionTemplate ¶ added in v1.55.0
func MatchPartitionTemplate(gpuStatus tfv1.GPUStatus, req *tfv1.AllocRequest) (*PartitionMatchResult, error)
MatchPartitionTemplate matches a partition template to an allocation request. Gets template info from config (PartitionTemplateMap) based on GPU model. In partitioned mode, we find the smallest template that can satisfy the request.
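A hedged sketch of calling MatchPartitionTemplate directly with a GPU's status; the import paths are placeholders.

// Illustrative direct template matching for one GPU.
package example

import (
	"log"

	tfv1 "example.com/tensor-fusion/api/v1"
	"example.com/tensor-fusion/internal/gpuallocator"
)

func matchOne(gpu *tfv1.GPU, req *tfv1.AllocRequest) {
	result, err := gpuallocator.MatchPartitionTemplate(gpu.Status, req)
	if err != nil {
		log.Printf("matching failed: %v", err)
		return
	}
	if result.CanAllocate {
		// The smallest template that satisfies the request wins.
		log.Printf("template %s fits (waste score %.2f)", result.TemplateID, result.Score)
	} else {
		log.Printf("cannot allocate: %s", result.Reason)
	}
}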
type PartitionStrategy ¶ added in v1.55.0
type PartitionStrategy interface {
// Name returns the strategy name (e.g., "nvidia-mig", "ascend-vnpu")
Name() string
// CheckAvailability verifies if a partition can be allocated on the given GPU
// Returns nil if allocation is possible, error otherwise
CheckAvailability(gpu *tfv1.GPU, templateInfo config.PartitionTemplateInfo, gpuConfig *config.GpuInfo) error
// AllocateSlot finds and returns allocation position information
// Returns (isolationGroupID, slotStart, slotEnd, error)
// - For NVIDIA MIG: slotStart/slotEnd are physical slot positions
// - For Ascend: isolationGroupID is the vGroup (0-3), slots may be nil
AllocateSlot(gpu *tfv1.GPU, templateInfo config.PartitionTemplateInfo, gpuConfig *config.GpuInfo) (*uint32, *uint32, *uint32, error)
}
PartitionStrategy defines the interface for vendor-specific partition allocation strategies
func GetPartitionStrategy ¶ added in v1.55.0
func GetPartitionStrategy(vendor string) PartitionStrategy
GetPartitionStrategy returns the appropriate partition strategy based on GPU vendor
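A hedged sketch combining GetPartitionStrategy with the PartitionStrategy interface above; the vendor string values are assumptions, and the import paths are placeholders.

// Illustrative vendor-dispatched partition allocation.
package example

import (
	tfv1 "example.com/tensor-fusion/api/v1"
	"example.com/tensor-fusion/internal/config"       // placeholder path
	"example.com/tensor-fusion/internal/gpuallocator" // placeholder path
)

func allocatePartition(vendor string, gpu *tfv1.GPU,
	tpl config.PartitionTemplateInfo, gpuConfig *config.GpuInfo) error {
	strategy := gpuallocator.GetPartitionStrategy(vendor) // exact vendor values are assumptions
	if err := strategy.CheckAvailability(gpu, tpl, gpuConfig); err != nil {
		return err
	}
	groupID, slotStart, slotEnd, err := strategy.AllocateSlot(gpu, tpl, gpuConfig)
	if err != nil {
		return err
	}
	// NVIDIA MIG: slotStart/slotEnd are physical slot positions.
	// Ascend: groupID is the vGroup (0-3); the slots may be nil.
	_, _, _ = groupID, slotStart, slotEnd
	return nil
}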
type SimulateSchedulingFilterDetail ¶ added in v1.41.0
type SimulateSchedulingFilterDetail struct {
FilterStageDetails []filter.FilterDetail
}
When /api/simulate-schedule is called with a Pod YAML as the body, detailed filter results are returned.
func (*SimulateSchedulingFilterDetail) Clone ¶ added in v1.41.0
func (p *SimulateSchedulingFilterDetail) Clone() fwk.StateData
type Strategy ¶
type Strategy interface {
// When isForNode = true, indicates each GPU's node level score
// otherwise it's single GPU score inside one node
Score(gpu *tfv1.GPU, isForNode bool) int
SelectGPUs(gpus []*tfv1.GPU, count uint) ([]*tfv1.GPU, error)
}
func NewStrategy ¶
func NewStrategy(placementMode tfv1.PlacementMode, cfg *config.GPUFitConfig, nodeGpuStore map[string]map[string]*tfv1.GPU) Strategy
NewStrategy creates a strategy based on the placement mode
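A hedged sketch of building a Strategy and selecting GPUs through the interface above; the placement mode is left as a parameter because its constants live in the tfv1 package, and the import paths are placeholders.

// Illustrative strategy construction and selection.
package example

import (
	"log"

	tfv1 "example.com/tensor-fusion/api/v1"
	"example.com/tensor-fusion/internal/config"       // placeholder path
	"example.com/tensor-fusion/internal/gpuallocator" // placeholder path
)

func selectTwo(mode tfv1.PlacementMode, cfg *config.GPUFitConfig,
	nodeGpuStore map[string]map[string]*tfv1.GPU, gpus []*tfv1.GPU) {
	strategy := gpuallocator.NewStrategy(mode, cfg, nodeGpuStore)
	selected, err := strategy.SelectGPUs(gpus, 2)
	if err != nil {
		log.Printf("selection failed: %v", err)
		return
	}
	log.Printf("selected %d GPUs", len(selected))
}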