controllerutils

package
v0.0.8-internal Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 17, 2025 License: Apache-2.0 Imports: 21 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
// DefaultGpuResourceKey is the package-level default GPU resource key. It is
// initialized once, at package initialization, from baseutils.GetEnv called
// with the environment variable name "DEFAULT_GPU_RESOURCE_KEY" and the value
// "amd.com/gpu".
// NOTE(review): "amd.com/gpu" is presumably the fallback used when the
// variable is unset — confirm against baseutils.GetEnv.
var DefaultGpuResourceKey = baseutils.GetEnv("DEFAULT_GPU_RESOURCE_KEY", "amd.com/gpu")
View Source
// DefaultPodTemplateSpec defines a reusable Pod template with security and
// resource settings.
//
// The template:
//   - sets the pod security context to user, group and fsGroup ID 1000;
//   - never restarts the pod (RestartPolicyNever);
//   - declares a single container named "workload" that uses
//     baseutils.DefaultRayImage with an always-pull policy, sets HF_HOME to
//     "/workload/.cache/huggingface", and requests and limits 16 GiB of
//     memory and 2 CPUs;
//   - mounts the "main-storage" volume at /workload and the "dshm" volume at
//     /dev/shm.
//
// The template itself declares no Volumes, so the two mounted volumes must be
// supplied before the pod spec is usable.
// NOTE(review): presumably UpdatePodSpecStorage adds them — confirm.
//
// This is a mutable package-level value whose pointer and slice fields are
// shared by plain struct copies; use DeepCopy before modifying a copy.
var DefaultPodTemplateSpec = corev1.PodTemplateSpec{
	Spec: corev1.PodSpec{
		SecurityContext: &corev1.PodSecurityContext{
			RunAsUser:  Int64Ptr(1000),
			RunAsGroup: Int64Ptr(1000),
			// Use the package helper like the two fields above instead of
			// an ad-hoc inline closure.
			FSGroup: Int64Ptr(1000),
		},
		RestartPolicy: corev1.RestartPolicyNever,
		Containers: []corev1.Container{
			{
				Name:            "workload",
				Image:           baseutils.DefaultRayImage,
				ImagePullPolicy: corev1.PullAlways,
				Env: []corev1.EnvVar{
					{Name: "HF_HOME", Value: "/workload/.cache/huggingface"},
				},
				Resources: corev1.ResourceRequirements{
					// Requests and limits are deliberately identical.
					Requests: corev1.ResourceList{
						corev1.ResourceMemory: *resource.NewQuantity(16*1024*1024*1024, resource.BinarySI),
						corev1.ResourceCPU:    *resource.NewQuantity(2, resource.DecimalSI),
					},
					Limits: corev1.ResourceList{
						corev1.ResourceMemory: *resource.NewQuantity(16*1024*1024*1024, resource.BinarySI),
						corev1.ResourceCPU:    *resource.NewQuantity(2, resource.DecimalSI),
					},
				},
				VolumeMounts: []corev1.VolumeMount{
					{Name: "main-storage", MountPath: "/workload"},
					{Name: "dshm", MountPath: "/dev/shm"},
				},
			},
		},
	},
}

DefaultPodTemplateSpec defines a reusable Pod template with security and resource settings.

Functions

func AddEntrypoint

func AddEntrypoint(ctx context.Context, entrypoint string, podTemplateSpec *corev1.PodTemplateSpec) error

AddEntrypoint updates the entrypoint command in the PodTemplateSpec.

func AddEnvVars

func AddEnvVars(ctx context.Context, UserEnvVars []corev1.EnvVar, podTemplateSpec *corev1.PodTemplateSpec) error

func AdjustResourceRequestsAndLimits

func AdjustResourceRequestsAndLimits(ctx context.Context, gpuVendor string, gpuCount int, replicas int, gpusPerReplica int, podTemplateSpec *corev1.PodTemplateSpec) error

func BoolPtr

func BoolPtr(b bool) *bool

func CalculateNumberOfReplicas

func CalculateNumberOfReplicas(ctx context.Context, k8sClient client.Client, gpuVendor string, totalGpus int, userReplicas int, userGpusPerReplica int, useAvailability bool) (int, int, error)

CalculateNumberOfReplicas determines the number of replicas and GPUs per replica based on node labels and optionally available GPU capacity.

func ConvertKaiwoToKueueClusterQueue

func ConvertKaiwoToKueueClusterQueue(kaiwoQueue kaiwov1alpha1.ClusterQueue) kueuev1beta1.ClusterQueue

func ConvertKaiwoToKueueResourceFlavor

func ConvertKaiwoToKueueResourceFlavor(kaiwoFlavor kaiwov1alpha1.ResourceFlavorSpec) kueuev1beta1.ResourceFlavor

func ConvertKaiwoToKueueResourceFlavors

func ConvertKaiwoToKueueResourceFlavors(kaiwoFlavors []kaiwov1alpha1.ResourceFlavorSpec) []kueuev1beta1.ResourceFlavor

func CreateClusterQueue

func CreateClusterQueue(nodePoolResources map[string]kueuev1beta1.FlavorQuotas, name string) kaiwov1alpha1.ClusterQueue

func CreateLocalQueue

func CreateLocalQueue(ctx context.Context, c client.Client, name string, namespace string) error

CreateLocalQueue creates a LocalQueue in the given namespace.

func GetGpuResourceKey

func GetGpuResourceKey(vendor string) string

func Int32Ptr

func Int32Ptr(i int32) *int32

func Int64Ptr

func Int64Ptr(i int64) *int64

func LabelNode

func LabelNode(ctx context.Context, c client.Client, nodeName, key, value string) error

func MapGPUDeviceIDToName

func MapGPUDeviceIDToName(gpuID string, vendor string) string

func ReconcileDownloadJob

func ReconcileDownloadJob(r client.Client, s *runtime.Scheme, ctx context.Context, owner client.Object, spec *kaiwov1alpha1.StorageSpec) (*ctrl.Result, error)

ReconcileDownloadJob ensures that, if there is data to download to the PVC(s) before the main workload runs, the download job is scheduled and has completed successfully before the main workload is scheduled.

func ReconcileStorage

func ReconcileStorage(r client.Client, s *runtime.Scheme, ctx context.Context, owner client.Object, spec *v1alpha1.StorageSpec) error

ReconcileStorage ensures that any requested PVCs are available and linked to the owner.

func UpdatePodSpecStorage

func UpdatePodSpecStorage(ctx context.Context, podSpec *corev1.PodSpec, storageSpec v1alpha1.StorageSpec, ownerName string) error

Types

type NodeResourceInfo

// NodeResourceInfo describes a single node: its name, CPU and memory figures,
// and its labels. GetNodeResources returns a slice of these values.
type NodeResourceInfo struct {
	// Name is the node's name.
	Name   string
	// CPU is the node's CPU figure.
	// NOTE(review): unit (cores vs. millicores) and whether this is capacity
	// or allocatable are not visible here — confirm.
	CPU    int
	// Memory is the node's memory figure.
	// NOTE(review): unit (bytes vs. Mi/Gi) is not visible here — confirm.
	Memory int
	// Labels holds the node's labels as key/value pairs.
	Labels map[string]string
}

func GetNodeResources

func GetNodeResources(ctx context.Context, c client.Client) []NodeResourceInfo

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL