Documentation
¶
Index ¶
- Variables
- func AddEntrypoint(ctx context.Context, entrypoint string, ...) error
- func AddEnvVars(ctx context.Context, UserEnvVars []corev1.EnvVar, ...) error
- func AdjustResourceRequestsAndLimits(ctx context.Context, gpuVendor string, gpuCount int, replicas int, ...) error
- func BoolPtr(b bool) *bool
- func CalculateNumberOfReplicas(ctx context.Context, k8sClient client.Client, gpuVendor string, totalGpus int, ...) (int, int, error)
- func ConvertKaiwoToKueueClusterQueue(kaiwoQueue kaiwov1alpha1.ClusterQueue) kueuev1beta1.ClusterQueue
- func ConvertKaiwoToKueueResourceFlavor(kaiwoFlavor kaiwov1alpha1.ResourceFlavorSpec) kueuev1beta1.ResourceFlavor
- func ConvertKaiwoToKueueResourceFlavors(kaiwoFlavors []kaiwov1alpha1.ResourceFlavorSpec) []kueuev1beta1.ResourceFlavor
- func CreateClusterQueue(nodePoolResources map[string]kueuev1beta1.FlavorQuotas, name string) kaiwov1alpha1.ClusterQueue
- func CreateDefaultResourceFlavors(ctx context.Context, c client.Client) ([]kaiwov1alpha1.ResourceFlavorSpec, map[string]kueuev1beta1.FlavorQuotas, ...)
- func CreateLocalQueue(ctx context.Context, c client.Client, name string, namespace string) error
- func GetGpuResourceKey(vendor string) string
- func Int32Ptr(i int32) *int32
- func Int64Ptr(i int64) *int64
- func LabelNode(ctx context.Context, c client.Client, nodeName, key, value string) error
- func MapGPUDeviceIDToName(gpuID string, vendor string) string
- func ReconcileDownloadJob(r client.Client, s *runtime.Scheme, ctx context.Context, owner client.Object, ...) (*ctrl.Result, error)
- func ReconcileStorage(r client.Client, s *runtime.Scheme, ctx context.Context, owner client.Object, ...) error
- func RemoveDuplicateResourceFlavors(flavors []kaiwov1alpha1.ResourceFlavorSpec) []kaiwov1alpha1.ResourceFlavorSpec
- func UpdatePodSpecStorage(ctx context.Context, podSpec *corev1.PodSpec, storageSpec v1alpha1.StorageSpec, ...) error
- type NodeResourceInfo
Constants ¶
This section is empty.
Variables ¶
View Source
var DefaultGpuResourceKey = baseutils.GetEnv("DEFAULT_GPU_RESOURCE_KEY", "amd.com/gpu")
View Source
var DefaultPodTemplateSpec = corev1.PodTemplateSpec{ Spec: corev1.PodSpec{ SecurityContext: &corev1.PodSecurityContext{ RunAsUser: Int64Ptr(1000), RunAsGroup: Int64Ptr(1000), FSGroup: func(i int64) *int64 { return &i }(1000), }, RestartPolicy: corev1.RestartPolicyNever, Containers: []corev1.Container{ { Name: "workload", Image: baseutils.DefaultRayImage, ImagePullPolicy: corev1.PullAlways, Env: []corev1.EnvVar{ {Name: "HF_HOME", Value: "/workload/.cache/huggingface"}, }, Resources: corev1.ResourceRequirements{ Requests: corev1.ResourceList{ corev1.ResourceMemory: *resource.NewQuantity(16*1024*1024*1024, resource.BinarySI), corev1.ResourceCPU: *resource.NewQuantity(2, resource.DecimalSI), }, Limits: corev1.ResourceList{ corev1.ResourceMemory: *resource.NewQuantity(16*1024*1024*1024, resource.BinarySI), corev1.ResourceCPU: *resource.NewQuantity(2, resource.DecimalSI), }, }, VolumeMounts: []corev1.VolumeMount{ {Name: "main-storage", MountPath: "/workload"}, {Name: "dshm", MountPath: "/dev/shm"}, }, }, }, }, }
DefaultPodTemplateSpec defines a reusable Pod template with security and resource settings.
Functions ¶
func AddEntrypoint ¶
func AddEntrypoint(ctx context.Context, entrypoint string, podTemplateSpec *corev1.PodTemplateSpec) error
AddEntrypoint updates the entrypoint command in the PodTemplateSpec.
func AddEnvVars ¶
func CalculateNumberOfReplicas ¶
func CalculateNumberOfReplicas(ctx context.Context, k8sClient client.Client, gpuVendor string, totalGpus int, userReplicas int, userGpusPerReplica int, useAvailability bool) (int, int, error)
CalculateNumberOfReplicas determines the number of replicas and GPUs per replica based on node labels and, optionally, available GPU capacity.
func ConvertKaiwoToKueueClusterQueue ¶
func ConvertKaiwoToKueueClusterQueue(kaiwoQueue kaiwov1alpha1.ClusterQueue) kueuev1beta1.ClusterQueue
func ConvertKaiwoToKueueResourceFlavor ¶
func ConvertKaiwoToKueueResourceFlavor(kaiwoFlavor kaiwov1alpha1.ResourceFlavorSpec) kueuev1beta1.ResourceFlavor
func ConvertKaiwoToKueueResourceFlavors ¶
func ConvertKaiwoToKueueResourceFlavors(kaiwoFlavors []kaiwov1alpha1.ResourceFlavorSpec) []kueuev1beta1.ResourceFlavor
func CreateClusterQueue ¶
func CreateClusterQueue(nodePoolResources map[string]kueuev1beta1.FlavorQuotas, name string) kaiwov1alpha1.ClusterQueue
func CreateDefaultResourceFlavors ¶
func CreateDefaultResourceFlavors(ctx context.Context, c client.Client) ([]kaiwov1alpha1.ResourceFlavorSpec, map[string]kueuev1beta1.FlavorQuotas, error)
func CreateLocalQueue ¶
CreateLocalQueue creates a LocalQueue in the given namespace.
func GetGpuResourceKey ¶
func MapGPUDeviceIDToName ¶
func ReconcileDownloadJob ¶
func ReconcileDownloadJob(r client.Client, s *runtime.Scheme, ctx context.Context, owner client.Object, spec *kaiwov1alpha1.StorageSpec) (*ctrl.Result, error)
ReconcileDownloadJob ensures that, if there is data to download to the PVC(s) before the main workload runs, the download job is scheduled and has completed successfully before the main workload is scheduled.
func ReconcileStorage ¶
func ReconcileStorage(r client.Client, s *runtime.Scheme, ctx context.Context, owner client.Object, spec *v1alpha1.StorageSpec) error
ReconcileStorage ensures that any requested PVCs are available and linked to the owner.
func RemoveDuplicateResourceFlavors ¶
func RemoveDuplicateResourceFlavors(flavors []kaiwov1alpha1.ResourceFlavorSpec) []kaiwov1alpha1.ResourceFlavorSpec
func UpdatePodSpecStorage ¶
Types ¶
type NodeResourceInfo ¶
func GetNodeResources ¶
func GetNodeResources(ctx context.Context, c client.Client) []NodeResourceInfo
Click to show internal directories.
Click to hide internal directories.