util

package
v0.4.1 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Apr 21, 2025 License: Apache-2.0 Imports: 18 Imported by: 0

Documentation

Index

Constants

View Source
// Well-known filesystem paths and the recognized CGroupDriver values.
const (
	// KubeletConfigPath is the path of the kubelet configuration file.
	KubeletConfigPath = "/var/lib/kubelet/config.yaml"

	// CGroupBasePath is the base directory that the cgroup paths below are
	// built from (presumably the cgroup filesystem mount point — confirm).
	CGroupBasePath   = "/sys/fs/cgroup"
	// CGroupDevicePath is the "devices" directory directly under CGroupBasePath.
	CGroupDevicePath = CGroupBasePath + "/devices"

	// SYSTEMD is the CGroupDriver value "systemd".
	SYSTEMD  CGroupDriver = "systemd"
	// CGROUPFS is the CGroupDriver value "cgroupfs".
	CGROUPFS CGroupDriver = "cgroupfs"
)
View Source
// Annotation keys, label keys, resource names, limits and messages.
// Every key and resource name in this block is built from DomainPrefix.
const (
	// DomainPrefix is the common prefix of every annotation, label and
	// resource name declared in this block.
	DomainPrefix = "nvidia.com"

	// VGPUComputePolicyAnnotation none / balance / fixed(default)
	VGPUComputePolicyAnnotation = DomainPrefix + "/vgpu-compute-policy"

	// NodeNvidiaDriverVersionLabel and NodeNvidiaCudaVersionLabel are label keys.
	// NOTE(review): the names suggest they carry the node's NVIDIA driver and
	// CUDA versions — confirm against the code that sets them.
	NodeNvidiaDriverVersionLabel = DomainPrefix + "/node-driver-version"
	NodeNvidiaCudaVersionLabel   = DomainPrefix + "/node-cuda-version"

	// Resource names for the vGPU number, vGPU memory and vGPU cores.
	// NOTE(review): units are not visible here — confirm before relying on them.
	VGPUNumberResourceName = DomainPrefix + "/vgpu-number"
	VGPUMemoryResourceName = DomainPrefix + "/vgpu-memory"
	VGPUCoreResourceName   = DomainPrefix + "/vgpu-cores"

	// NodeDeviceHeartbeatAnnotation Node device heartbeat time
	NodeDeviceHeartbeatAnnotation = DomainPrefix + "/node-device-heartbeat"
	// NodeDeviceRegisterAnnotation, NodeDeviceTopologyAnnotation and
	// DeviceMemoryFactorAnnotation are annotation keys; their value formats are
	// not visible here (presumably device registration data, device topology
	// data and a memory scaling factor — TODO confirm).
	NodeDeviceRegisterAnnotation  = DomainPrefix + "/node-device-register"
	NodeDeviceTopologyAnnotation  = DomainPrefix + "/node-device-topology"
	DeviceMemoryFactorAnnotation  = DomainPrefix + "/device-memory-factor"

	// PodIncludeGpuTypeAnnotation Specify the GPU type to be used
	PodIncludeGpuTypeAnnotation = DomainPrefix + "/include-gpu-type"
	// PodExcludeGpuTypeAnnotation Specify the GPU type to exclude
	PodExcludeGpuTypeAnnotation = DomainPrefix + "/exclude-gpu-type"

	// Scheduling strategies at the node and device levels
	NodeSchedulerPolicyAnnotation   = DomainPrefix + "/node-scheduler-policy"
	DeviceSchedulerPolicyAnnotation = DomainPrefix + "/device-scheduler-policy"

	// DeviceTopologyModeAnnotation Specify device topology mode
	DeviceTopologyModeAnnotation = DomainPrefix + "/device-topology-mode"

	// PodIncludeGPUUUIDAnnotation Specify the GPU UUID to be used
	PodIncludeGPUUUIDAnnotation = DomainPrefix + "/include-gpu-uuid"
	// PodExcludeGPUUUIDAnnotation Specify the GPU UUID to be excluded
	PodExcludeGPUUUIDAnnotation = DomainPrefix + "/exclude-gpu-uuid"

	// PodPredicateNodeAnnotation and PodPredicateTimeAnnotation are annotation
	// keys, and PodAssignedPhaseLabel is a label key.
	// NOTE(review): presumably they record the node and time chosen at the
	// scheduler's predicate step and the pod's AssignedPhase — confirm.
	PodPredicateNodeAnnotation = DomainPrefix + "/predicate-node"
	PodPredicateTimeAnnotation = DomainPrefix + "/predicate-time"
	PodAssignedPhaseLabel      = DomainPrefix + "/assigned-phase"

	// PodVGPUPreAllocAnnotation Pre allocated device information by the scheduler
	PodVGPUPreAllocAnnotation = DomainPrefix + "/pre-allocated"
	// PodVGPURealAllocAnnotation Real device information allocated by device plugins
	PodVGPURealAllocAnnotation = DomainPrefix + "/real-allocated"

	// HundredCore is the constant 100 (presumably 100% of one device's cores —
	// TODO confirm).
	HundredCore     = 100
	// MaxDeviceNumber is the constant 16 (presumably an upper bound on the
	// number of devices handled — TODO confirm the scope of the limit).
	MaxDeviceNumber = 16

	// MaxContainerLimit max container num
	MaxContainerLimit = 300000
	// PodAnnotationMaxLength pod annotation max data length 1MB
	// (1024 * 1024; presumably counted in bytes).
	PodAnnotationMaxLength = 1024 * 1024

	// Message and type strings for failed checks.
	// NOTE(review): presumably reported when the Allocate / PreStartContainer
	// checks fail — the reporting sites are not visible here.
	AllocateCheckErrMsg           = "Allocate check failed"
	PreStartContainerCheckErrMsg  = "PreStartContainer check failed"
	PreStartContainerCheckErrType = "PreStartContainerCheckErr"
)
View Source
// Names of environment variables. Comments of the form NAME_<index> indicate
// that the variable name is suffixed with an index.
const (
	// CUDA_MEM_LIMIT_<index> gpu memory limit
	CudaMemoryLimitEnv = "CUDA_MEM_LIMIT"
	// CUDA_MEM_RATIO_<index> gpu memory ratio
	CudaMemoryRatioEnv = "CUDA_MEM_RATIO"
	// CUDA_CORE_LIMIT_<index> gpu core limit
	CudaCoreLimitEnv = "CUDA_CORE_LIMIT"
	// CUDA_CORE_SOFT_LIMIT_<index> gpu core soft limit
	CudaSoftCoreLimitEnv = "CUDA_CORE_SOFT_LIMIT"
	// CUDA_MEM_OVERSOLD gpu memory oversold switch
	// NOTE(review): whether an _<index> suffix applies to this variable is not
	// visible here — confirm against the code that sets it.
	CudaMemoryOversoldEnv = "CUDA_MEM_OVERSOLD"
	// GPU_DEVICES_UUID gpu uuid list
	GPUDevicesUuidEnv = "GPU_DEVICES_UUID"
	// CompatibilityModeEnv Indicate the compatibility mode of the environment
	CompatibilityModeEnv = "ENV_COMPATIBILITY_MODE"

	// Environment variable names for the pod name, pod namespace, pod UID and
	// container name.
	PodNameEnv      = "VGPU_POD_NAME"
	PodNamespaceEnv = "VGPU_POD_NAMESPACE"
	PodUIDEnv       = "VGPU_POD_UID"
	ContNameEnv     = "VGPU_CONTAINER_NAME"
)
View Source
// Constants to represent the various device list strategies.
// NOTE(review): the names suggest the device list is passed via an environment
// variable, volume mounts, CDI annotations or CDI through the CRI — the
// consuming code is not visible here, confirm before relying on this.
const (
	DeviceListStrategyEnvvar         = "envvar"
	DeviceListStrategyVolumeMounts   = "volume-mounts"
	DeviceListStrategyCDIAnnotations = "cdi-annotations"
	DeviceListStrategyCDICRI         = "cdi-cri"
)

Constants to represent the various device list strategies

Variables

This section is empty.

Functions

func CheckDeviceType

func CheckDeviceType(annotations map[string]string, deviceType string) bool

CheckDeviceType checks whether the device type meets expectations.

func CheckDeviceUuid

func CheckDeviceUuid(annotations map[string]string, deviceUUID string) bool

CheckDeviceUuid checks whether the device UUID meets expectations.

func FilterAllocatingPods

func FilterAllocatingPods(activePods []corev1.Pod) []corev1.Pod

FilterAllocatingPods filters out the list of pods to be allocated.

func GetAllocatableOfNode

func GetAllocatableOfNode(node *corev1.Node, resourceName string) int

GetAllocatableOfNode Return the number of resources that can be allocated to the node.

func GetCapacityOfNode

func GetCapacityOfNode(node *corev1.Node, resourceName string) int

GetCapacityOfNode Return the capacity of node resources.

func GetContainerRuntime

func GetContainerRuntime(pod *corev1.Pod, containerName string) (runtimeName string, containerId string)

func GetContainerStatus

func GetContainerStatus(pod *corev1.Pod, containerName string) (*corev1.ContainerStatus, bool)

func GetCurrentPodByAllocatingPods

func GetCurrentPodByAllocatingPods(allocatingPods []corev1.Pod) (*corev1.Pod, error)

GetCurrentPodByAllocatingPods finds the oldest Pod among the allocating Pods and returns it as the current Pod to be allocated.

func GetK8sPodCGroupFullPath

func GetK8sPodCGroupFullPath(podCGroupPath string) string

GetK8sPodCGroupFullPath obtains the full cgroup v2 path of the pod.

func GetK8sPodCGroupPath

func GetK8sPodCGroupPath(pod *corev1.Pod) (string, error)

GetK8sPodCGroupPath Obtain the relative path of pod cgroup for k8s.

func GetK8sPodContainerCGroupFullPath

func GetK8sPodContainerCGroupFullPath(pod *corev1.Pod, containerName string,
	getFullPath func(string) string) (string, error)

func GetK8sPodDeviceCGroupFullPath

func GetK8sPodDeviceCGroupFullPath(podCGroupPath string) string

GetK8sPodDeviceCGroupFullPath Obtain the full path of the cgroup device subsystem of the pod.

func GetNumaInformation

func GetNumaInformation(idx int) (int, error)

func GetPredicateTimeOfPod

func GetPredicateTimeOfPod(pod corev1.Pod) uint64

func GetResourceOfContainer

func GetResourceOfContainer(container *corev1.Container, resourceName corev1.ResourceName) int

GetResourceOfContainer Return the number of resource limit.

func GetResourceOfPod

func GetResourceOfPod(pod *corev1.Pod, resourceName corev1.ResourceName) int

GetResourceOfPod Return the number of resource limit for all containers of Pod.

func HasAnnotation

func HasAnnotation(obj metav1.Object, anno string) (val string, ok bool)

func HasLabel

func HasLabel(obj metav1.Object, label string) (val string, ok bool)

func InitializeCGroupDriver

func InitializeCGroupDriver(cgroupDriver string)

func InsertAnnotation

func InsertAnnotation(obj metav1.Object, k, v string)

func IsShouldDeletePod

func IsShouldDeletePod(pod *corev1.Pod) bool

IsShouldDeletePod Determine whether the pod has been deleted or needs to be deleted.

func IsVGPUEnabledNode

func IsVGPUEnabledNode(node *corev1.Node) bool

IsVGPUEnabledNode Determine whether there are VGPU devices on the node.

func IsVGPURequiredContainer

func IsVGPURequiredContainer(c *corev1.Container) bool

IsVGPURequiredContainer tells whether the container is a vGPU request container.

func IsVGPUResourcePod

func IsVGPUResourcePod(pod *corev1.Pod) bool

IsVGPUResourcePod Determine if a pod has vGPU resource request.

func ParseContainerRuntime

func ParseContainerRuntime(podContainerId string) (runtimeName string, containerId string)

func PathIsNotExist

func PathIsNotExist(fullPath string) bool

func PodIsTerminated

func PodIsTerminated(pod *corev1.Pod) bool

func ShouldRetry

func ShouldRetry(err error) bool

ShouldRetry determines whether an apiserver error is of a type that should be retried.

func SplitK8sCGroupBasePath

func SplitK8sCGroupBasePath(cgroupFullPath string) string

func SystemdPathPrefixOfRuntime

func SystemdPathPrefixOfRuntime(runtimeName string) string

Types

type AssignedPhase

// AssignedPhase is a string-typed phase with the values declared below.
// NOTE(review): presumably the value stored under PodAssignedPhaseLabel —
// confirm against the code that sets the label.
type AssignedPhase string
const (
	// AssignPhaseSucceed is the "succeed" phase.
	AssignPhaseSucceed    AssignedPhase = "succeed"
	// AssignPhaseAllocating is the "allocating" phase.
	AssignPhaseAllocating AssignedPhase = "allocating"
	// AssignPhaseFailed is the "failed" phase.
	AssignPhaseFailed     AssignedPhase = "failed"
)

type CGroupDriver

// CGroupDriver names a cgroup driver; the declared values are SYSTEMD
// ("systemd") and CGROUPFS ("cgroupfs").
type CGroupDriver string

type CgroupName

// CgroupName is a cgroup name held as a list of components, for example
// {"kubepods", "burstable", "pod1234-abcd-5678-efgh"}. Its ToCgroupfs and
// ToSystemd methods return it as a string.
type CgroupName []string

func NewPodCgroupName

func NewPodCgroupName(pod *corev1.Pod) CgroupName

func (CgroupName) ToCgroupfs

func (cgroupName CgroupName) ToCgroupfs() string

func (CgroupName) ToSystemd

func (cgroupName CgroupName) ToSystemd() string

cgroupName.ToSystemd converts the internal cgroup name to a systemd name. For example, the name {"kubepods", "burstable", "pod1234-abcd-5678-efgh"} becomes "/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-pod1234_abcd_5678_efgh.slice". This function always expands the systemd name into the cgroupfs form. If only the last part is needed, use path.Base(...) on it to discard the rest.

type ComputePolicy

// ComputePolicy names a vGPU compute policy: "none", "balance" or "fixed".
// The values match those listed for VGPUComputePolicyAnnotation.
type ComputePolicy string
const (
	// NoneComputePolicy There are no computing power limitations, tasks compete for GPUs on their own.
	NoneComputePolicy ComputePolicy = "none"
	// BalanceComputePolicy Automatically balance GPU load, maximize GPU utilization,
	// allocate idle computing power to ongoing tasks,
	// and roll back excess computing power when new tasks require it.
	BalanceComputePolicy ComputePolicy = "balance"
	// FixedComputePolicy Run tasks with a fixed computing power quota,
	// and the utilization rate will not exceed the quota.
	// Default strategy
	FixedComputePolicy ComputePolicy = "fixed" // default
)

type PodsOrderedByPredicateTime

// PodsOrderedByPredicateTime is a slice of pods with Len, Less and Swap
// methods, so it can be passed to the standard sort package.
// NOTE(review): the name suggests Less compares pods by predicate time (see
// GetPredicateTimeOfPod) — the method body is not visible here, confirm.
type PodsOrderedByPredicateTime []corev1.Pod

func (PodsOrderedByPredicateTime) Len

func (pods PodsOrderedByPredicateTime) Len() int

func (PodsOrderedByPredicateTime) Less

func (pods PodsOrderedByPredicateTime) Less(i, j int) bool

func (PodsOrderedByPredicateTime) Swap

func (pods PodsOrderedByPredicateTime) Swap(i, j int)

type SchedulerPolicy

// SchedulerPolicy names a scheduling policy: "none", "binpack" or "spread".
// NOTE(review): presumably the value of NodeSchedulerPolicyAnnotation and
// DeviceSchedulerPolicyAnnotation — confirm against the scheduler code.
type SchedulerPolicy string
const (
	// NonePolicy is the "none" policy (presumably no placement preference —
	// TODO confirm).
	NonePolicy SchedulerPolicy = "none"
	// BinpackPolicy means the lower device memory remained after this allocation, the better
	BinpackPolicy SchedulerPolicy = "binpack"
	// SpreadPolicy means better put this task into an idle GPU card than a shared GPU card
	SpreadPolicy SchedulerPolicy = "spread"
)

type TopologyMode added in v0.3.0

// TopologyMode names a device topology mode: "none", "numa" or "link".
// NOTE(review): presumably the value of DeviceTopologyModeAnnotation — confirm.
type TopologyMode string
const (
	// NoneTopology Do not use any topology mode to allocate devices.
	NoneTopology TopologyMode = "none"
	// NUMATopology aligns the allocated devices according to numa nodes.
	NUMATopology TopologyMode = "numa"
	// LinkTopology find the best device set based on link topology.
	LinkTopology TopologyMode = "link"
)

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL