util

package

Version: v0.4.1 (latest) · Published: Apr 21, 2025 · License: Apache-2.0 · Imports: 18 · Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/coldzerofear/vgpu-manager

Links

Open Source Insights

Documentation ¶

Index ¶

Constants
func CheckDeviceType(annotations map[string]string, deviceType string) bool
func CheckDeviceUuid(annotations map[string]string, deviceUUID string) bool
func FilterAllocatingPods(activePods []corev1.Pod) []corev1.Pod
func GetAllocatableOfNode(node *corev1.Node, resourceName string) int
func GetCapacityOfNode(node *corev1.Node, resourceName string) int
func GetContainerRuntime(pod *corev1.Pod, containerName string) (runtimeName string, containerId string)
func GetContainerStatus(pod *corev1.Pod, containerName string) (*corev1.ContainerStatus, bool)
func GetCurrentPodByAllocatingPods(allocatingPods []corev1.Pod) (*corev1.Pod, error)
func GetK8sPodCGroupFullPath(podCGroupPath string) string
func GetK8sPodCGroupPath(pod *corev1.Pod) (string, error)
func GetK8sPodContainerCGroupFullPath(pod *corev1.Pod, containerName string, getFullPath func(string) string) (string, error)
func GetK8sPodDeviceCGroupFullPath(podCGroupPath string) string
func GetNumaInformation(idx int) (int, error)
func GetPredicateTimeOfPod(pod corev1.Pod) uint64
func GetResourceOfContainer(container *corev1.Container, resourceName corev1.ResourceName) int
func GetResourceOfPod(pod *corev1.Pod, resourceName corev1.ResourceName) int
func HasAnnotation(obj metav1.Object, anno string) (val string, ok bool)
func HasLabel(obj metav1.Object, label string) (val string, ok bool)
func InitializeCGroupDriver(cgroupDriver string)
func InsertAnnotation(obj metav1.Object, k, v string)
func IsShouldDeletePod(pod *corev1.Pod) bool
func IsVGPUEnabledNode(node *corev1.Node) bool
func IsVGPURequiredContainer(c *corev1.Container) bool
func IsVGPUResourcePod(pod *corev1.Pod) bool
func ParseContainerRuntime(podContainerId string) (runtimeName string, containerId string)
func PathIsNotExist(fullPath string) bool
func PodIsTerminated(pod *corev1.Pod) bool
func ShouldRetry(err error) bool
func SplitK8sCGroupBasePath(cgroupFullPath string) string
func SystemdPathPrefixOfRuntime(runtimeName string) string
type AssignedPhase
type CGroupDriver
type CgroupName
- func NewPodCgroupName(pod *corev1.Pod) CgroupName
- func (cgroupName CgroupName) ToCgroupfs() string
- func (cgroupName CgroupName) ToSystemd() string
type ComputePolicy
type PodsOrderedByPredicateTime
- func (pods PodsOrderedByPredicateTime) Len() int
- func (pods PodsOrderedByPredicateTime) Less(i, j int) bool
- func (pods PodsOrderedByPredicateTime) Swap(i, j int)
type SchedulerPolicy
type TopologyMode

Constants ¶

View Source

// Kubelet and cgroup filesystem locations, plus the declared cgroup driver names.
const (
	// KubeletConfigPath is the file path "/var/lib/kubelet/config.yaml".
	// NOTE(review): presumably read to discover kubelet settings — confirm against callers.
	KubeletConfigPath = "/var/lib/kubelet/config.yaml"

	// CGroupBasePath is the "/sys/fs/cgroup" directory; CGroupDevicePath is
	// the "devices" directory directly beneath it.
	CGroupBasePath   = "/sys/fs/cgroup"
	CGroupDevicePath = CGroupBasePath + "/devices"

	// SYSTEMD and CGROUPFS are the CGroupDriver values declared by this package.
	SYSTEMD  CGroupDriver = "systemd"
	CGROUPFS CGroupDriver = "cgroupfs"
)

View Source

// Annotation, label and resource names under the "nvidia.com" domain, plus
// numeric limits and error strings.
const (
	// DomainPrefix is the prefix shared by every annotation, label and
	// resource name declared in this block.
	DomainPrefix = "nvidia.com"

	// VGPUComputePolicyAnnotation selects the compute policy: none / balance / fixed (default).
	VGPUComputePolicyAnnotation = DomainPrefix + "/vgpu-compute-policy"

	// Node label keys for the NVIDIA driver version and the CUDA version.
	NodeNvidiaDriverVersionLabel = DomainPrefix + "/node-driver-version"
	NodeNvidiaCudaVersionLabel   = DomainPrefix + "/node-cuda-version"

	// Resource names for the vGPU number, vGPU memory and vGPU cores.
	// NOTE(review): the units of the memory and cores resources are not visible here — confirm.
	VGPUNumberResourceName = DomainPrefix + "/vgpu-number"
	VGPUMemoryResourceName = DomainPrefix + "/vgpu-memory"
	VGPUCoreResourceName   = DomainPrefix + "/vgpu-cores"

	// NodeDeviceHeartbeatAnnotation holds the node device heartbeat time.
	// The other three keys in this group are for node device registration,
	// node device topology and the device memory factor; their value formats
	// are not visible here.
	NodeDeviceHeartbeatAnnotation = DomainPrefix + "/node-device-heartbeat"
	NodeDeviceRegisterAnnotation  = DomainPrefix + "/node-device-register"
	NodeDeviceTopologyAnnotation  = DomainPrefix + "/node-device-topology"
	DeviceMemoryFactorAnnotation  = DomainPrefix + "/device-memory-factor"

	// PodIncludeGpuTypeAnnotation specifies the GPU type to be used.
	PodIncludeGpuTypeAnnotation = DomainPrefix + "/include-gpu-type"
	// PodExcludeGpuTypeAnnotation specifies the GPU type to exclude.
	PodExcludeGpuTypeAnnotation = DomainPrefix + "/exclude-gpu-type"

	// Scheduling strategies at the node and device levels.
	// NOTE(review): values are presumably SchedulerPolicy strings — confirm.
	NodeSchedulerPolicyAnnotation   = DomainPrefix + "/node-scheduler-policy"
	DeviceSchedulerPolicyAnnotation = DomainPrefix + "/device-scheduler-policy"

	// DeviceTopologyModeAnnotation specifies the device topology mode.
	DeviceTopologyModeAnnotation = DomainPrefix + "/device-topology-mode"

	// PodIncludeGPUUUIDAnnotation specifies the GPU UUID to be used.
	PodIncludeGPUUUIDAnnotation = DomainPrefix + "/include-gpu-uuid"
	// PodExcludeGPUUUIDAnnotation specifies the GPU UUID to be excluded.
	PodExcludeGPUUUIDAnnotation = DomainPrefix + "/exclude-gpu-uuid"

	// Pod annotation keys for the predicate node and predicate time, and the
	// pod label key for the assigned phase.
	// NOTE(review): presumably written during scheduling — confirm against callers.
	PodPredicateNodeAnnotation = DomainPrefix + "/predicate-node"
	PodPredicateTimeAnnotation = DomainPrefix + "/predicate-time"
	PodAssignedPhaseLabel      = DomainPrefix + "/assigned-phase"

	// PodVGPUPreAllocAnnotation holds the device information pre-allocated by the scheduler.
	PodVGPUPreAllocAnnotation = DomainPrefix + "/pre-allocated"
	// PodVGPURealAllocAnnotation holds the real device information allocated by device plugins.
	PodVGPURealAllocAnnotation = DomainPrefix + "/real-allocated"

	// HundredCore is the constant 100 and MaxDeviceNumber is the constant 16.
	// NOTE(review): presumably the value meaning a whole GPU's cores and an
	// upper bound on device count — confirm against callers.
	HundredCore     = 100
	MaxDeviceNumber = 16

	// MaxContainerLimit is the maximum number of containers.
	MaxContainerLimit = 300000
	// PodAnnotationMaxLength is the maximum pod annotation data length, 1024 * 1024 (1 MiB).
	PodAnnotationMaxLength = 1024 * 1024

	// Error message and error type strings for the Allocate and
	// PreStartContainer checks.
	AllocateCheckErrMsg           = "Allocate check failed"
	PreStartContainerCheckErrMsg  = "PreStartContainer check failed"
	PreStartContainerCheckErrType = "PreStartContainerCheckErr"
)

View Source

// Environment variable names. Comments written with an _<index> suffix show
// the per-index form of the variable name.
const (
	// CUDA_MEM_LIMIT_<index> gpu memory limit
	CudaMemoryLimitEnv = "CUDA_MEM_LIMIT"
	// CUDA_MEM_RATIO_<index> gpu memory ratio
	CudaMemoryRatioEnv = "CUDA_MEM_RATIO"
	// CUDA_CORE_LIMIT_<index> gpu core limit
	CudaCoreLimitEnv = "CUDA_CORE_LIMIT"
	// CUDA_CORE_SOFT_LIMIT_<index> gpu core soft limit
	CudaSoftCoreLimitEnv = "CUDA_CORE_SOFT_LIMIT"
	// CUDA_MEM_OVERSOLD gpu memory oversold switch
	// NOTE(review): confirm whether an _<index> suffix applies to this variable.
	CudaMemoryOversoldEnv = "CUDA_MEM_OVERSOLD"
	// GPU_DEVICES_UUID gpu uuid list
	GPUDevicesUuidEnv = "GPU_DEVICES_UUID"
	// CompatibilityModeEnv Indicate the compatibility mode of the environment
	CompatibilityModeEnv = "ENV_COMPATIBILITY_MODE"

	// Environment variable names for the pod name, pod namespace, pod UID
	// and container name.
	PodNameEnv      = "VGPU_POD_NAME"
	PodNamespaceEnv = "VGPU_POD_NAMESPACE"
	PodUIDEnv       = "VGPU_POD_UID"
	ContNameEnv     = "VGPU_CONTAINER_NAME"
)

View Source

const (
	// Each constant holds the literal name of one device list strategy.
	// NOTE(review): the behavior each strategy selects is not visible here.
	DeviceListStrategyEnvvar         = "envvar"
	DeviceListStrategyVolumeMounts   = "volume-mounts"
	DeviceListStrategyCDIAnnotations = "cdi-annotations"
	DeviceListStrategyCDICRI         = "cdi-cri"
)

Constants to represent the various device list strategies

Variables ¶

This section is empty.

Functions ¶

func CheckDeviceType ¶

func CheckDeviceType(annotations map[string]string, deviceType string) bool

CheckDeviceType checks whether the device type meets expectations.

func CheckDeviceUuid ¶

func CheckDeviceUuid(annotations map[string]string, deviceUUID string) bool

CheckDeviceUuid checks whether the device UUID meets expectations.

func FilterAllocatingPods ¶

func FilterAllocatingPods(activePods []corev1.Pod) []corev1.Pod

FilterAllocatingPods filters out the list of pods to be allocated.

func GetAllocatableOfNode ¶

func GetAllocatableOfNode(node *corev1.Node, resourceName string) int

GetAllocatableOfNode returns the amount of the named resource that can be allocated on the node.

func GetCapacityOfNode ¶

func GetCapacityOfNode(node *corev1.Node, resourceName string) int

GetCapacityOfNode returns the node's capacity for the named resource.

func GetContainerRuntime ¶

func GetContainerRuntime(pod *corev1.Pod, containerName string) (runtimeName string, containerId string)

func GetContainerStatus ¶

func GetContainerStatus(pod *corev1.Pod, containerName string) (*corev1.ContainerStatus, bool)

func GetCurrentPodByAllocatingPods ¶

func GetCurrentPodByAllocatingPods(allocatingPods []corev1.Pod) (*corev1.Pod, error)

GetCurrentPodByAllocatingPods finds the oldest Pod among the allocating Pods and returns it as the current Pod to be allocated.

func GetK8sPodCGroupFullPath ¶

func GetK8sPodCGroupFullPath(podCGroupPath string) string

GetK8sPodCGroupFullPath returns the full cgroup v2 path of the pod.

func GetK8sPodCGroupPath ¶

func GetK8sPodCGroupPath(pod *corev1.Pod) (string, error)

GetK8sPodCGroupPath returns the relative path of the pod's cgroup for k8s.

func GetK8sPodContainerCGroupFullPath ¶

func GetK8sPodContainerCGroupFullPath(pod *corev1.Pod, containerName string,
	getFullPath func(string) string) (string, error)

func GetK8sPodDeviceCGroupFullPath ¶

func GetK8sPodDeviceCGroupFullPath(podCGroupPath string) string

GetK8sPodDeviceCGroupFullPath returns the full path of the pod in the cgroup device subsystem.

func GetNumaInformation ¶

func GetNumaInformation(idx int) (int, error)

func GetPredicateTimeOfPod ¶

func GetPredicateTimeOfPod(pod corev1.Pod) uint64

func GetResourceOfContainer ¶

func GetResourceOfContainer(container *corev1.Container, resourceName corev1.ResourceName) int

GetResourceOfContainer returns the container's limit for the named resource.

func GetResourceOfPod ¶

func GetResourceOfPod(pod *corev1.Pod, resourceName corev1.ResourceName) int

GetResourceOfPod returns the limit of the named resource for all containers of the Pod.

func HasAnnotation ¶

func HasAnnotation(obj metav1.Object, anno string) (val string, ok bool)

func HasLabel ¶

func HasLabel(obj metav1.Object, label string) (val string, ok bool)

func InitializeCGroupDriver ¶

func InitializeCGroupDriver(cgroupDriver string)

func InsertAnnotation ¶

func InsertAnnotation(obj metav1.Object, k, v string)

func IsShouldDeletePod ¶

func IsShouldDeletePod(pod *corev1.Pod) bool

IsShouldDeletePod determines whether the pod has been deleted or needs to be deleted.

func IsVGPUEnabledNode ¶

func IsVGPUEnabledNode(node *corev1.Node) bool

IsVGPUEnabledNode determines whether there are vGPU devices on the node.

func IsVGPURequiredContainer ¶

func IsVGPURequiredContainer(c *corev1.Container) bool

IsVGPURequiredContainer tells whether the container is a vGPU-requesting container.

func IsVGPUResourcePod ¶

func IsVGPUResourcePod(pod *corev1.Pod) bool

IsVGPUResourcePod determines whether the pod has a vGPU resource request.

func ParseContainerRuntime ¶

func ParseContainerRuntime(podContainerId string) (runtimeName string, containerId string)

func PathIsNotExist ¶

func PathIsNotExist(fullPath string) bool

func PodIsTerminated ¶

func PodIsTerminated(pod *corev1.Pod) bool

func ShouldRetry ¶

func ShouldRetry(err error) bool

ShouldRetry determines whether an apiserver error is of a type that should be retried.

func SplitK8sCGroupBasePath ¶

func SplitK8sCGroupBasePath(cgroupFullPath string) string

func SystemdPathPrefixOfRuntime ¶

func SystemdPathPrefixOfRuntime(runtimeName string) string

Types ¶

type AssignedPhase ¶

// AssignedPhase is a string-typed phase value.
// NOTE(review): presumably the value stored under PodAssignedPhaseLabel — confirm against callers.
type AssignedPhase string

// The AssignedPhase values declared by this package: succeed, allocating and failed.
const (
	AssignPhaseSucceed    AssignedPhase = "succeed"
	AssignPhaseAllocating AssignedPhase = "allocating"
	AssignPhaseFailed     AssignedPhase = "failed"
)

type CGroupDriver ¶

// CGroupDriver is the string name of a cgroup driver; SYSTEMD ("systemd")
// and CGROUPFS ("cgroupfs") are the values declared by this package.
type CGroupDriver string

type CgroupName ¶

// CgroupName is a cgroup name held as a list of components, for example
// {"kubepods", "burstable", "pod1234-abcd-5678-efgh"}. ToCgroupfs and
// ToSystemd render it as a string.
type CgroupName []string

func NewPodCgroupName ¶

func NewPodCgroupName(pod *corev1.Pod) CgroupName

func (CgroupName) ToCgroupfs ¶

func (cgroupName CgroupName) ToCgroupfs() string

func (CgroupName) ToSystemd ¶

func (cgroupName CgroupName) ToSystemd() string

cgroupName.ToSystemd converts the internal cgroup name to a systemd name. For example, the name {"kubepods", "burstable", "pod1234-abcd-5678-efgh"} becomes "/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-pod1234_abcd_5678_efgh.slice". This function always expands the systemd name into the cgroupfs form. If only the last part is needed, use path.Base(...) on it to discard the rest.

type ComputePolicy ¶

// ComputePolicy is the string name of a GPU compute policy; the declared
// values are "none", "balance" and "fixed" (the default).
type ComputePolicy string

const (
	// NoneComputePolicy There are no computing power limitations, tasks compete for GPUs on their own.
	NoneComputePolicy ComputePolicy = "none"
	// BalanceComputePolicy Automatically balance GPU load, maximize GPU utilization,
	// allocate idle computing power to ongoing tasks,
	// and roll back excess computing power when new tasks require it.
	BalanceComputePolicy ComputePolicy = "balance"
	// FixedComputePolicy Run tasks with a fixed computing power quota,
	// and the utilization rate will not exceed the quota.
	// Default strategy
	FixedComputePolicy ComputePolicy = "fixed" // default
)

type PodsOrderedByPredicateTime ¶

// PodsOrderedByPredicateTime is a slice of pods with Len, Less and Swap
// methods, so it can be passed to sort.Sort.
// NOTE(review): Less presumably compares the pods' predicate times — confirm.
type PodsOrderedByPredicateTime []corev1.Pod

func (PodsOrderedByPredicateTime) Len ¶

func (pods PodsOrderedByPredicateTime) Len() int

func (PodsOrderedByPredicateTime) Less ¶

func (pods PodsOrderedByPredicateTime) Less(i, j int) bool

func (PodsOrderedByPredicateTime) Swap ¶

func (pods PodsOrderedByPredicateTime) Swap(i, j int)

type SchedulerPolicy ¶

// SchedulerPolicy is the string name of a scheduling policy; the declared
// values are "none", "binpack" and "spread".
type SchedulerPolicy string

const (
	// NonePolicy is the "none" value.
	// NOTE(review): presumably means no binpack/spread preference — confirm.
	NonePolicy SchedulerPolicy = "none"
	// BinpackPolicy means the lower device memory remained after this allocation, the better
	BinpackPolicy SchedulerPolicy = "binpack"
	// SpreadPolicy means better put this task into an idle GPU card than a shared GPU card
	SpreadPolicy SchedulerPolicy = "spread"
)

type TopologyMode ¶ added in v0.3.0

// TopologyMode is the string name of a device topology mode; the declared
// values are "none", "numa" and "link".
type TopologyMode string

const (
	// NoneTopology Do not use any topology mode to allocate devices.
	NoneTopology TopologyMode = "none"
	// NUMATopology aligns the allocated devices according to numa nodes.
	NUMATopology TopologyMode = "numa"
	// LinkTopology find the best device set based on link topology.
	LinkTopology TopologyMode = "link"
)

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL