helpers

package
v0.0.0-...-a846cef Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jan 26, 2026 License: Apache-2.0 Imports: 24 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func CreateSimpleGPUPod

func CreateSimpleGPUPod(name, namespace string, gpuLimit int) *corev1.Pod

Types

type ChartOptions

type ChartOptions struct {
	CleanupOnFail bool
	GenerateName  bool
	ReleaseName   string
	Timeout       time.Duration
	Wait          bool
}

type ClusterPolicyClient

type ClusterPolicyClient struct {
	// contains filtered or unexported fields
}

func NewClusterPolicyClient

func NewClusterPolicyClient(client gpuclientset.Interface) *ClusterPolicyClient

func (*ClusterPolicyClient) DisableDCGM

func (h *ClusterPolicyClient) DisableDCGM(ctx context.Context, name string) error

func (*ClusterPolicyClient) DisableDCGMExporter

func (h *ClusterPolicyClient) DisableDCGMExporter(ctx context.Context, name string) error

func (*ClusterPolicyClient) DisableGFD

func (h *ClusterPolicyClient) DisableGFD(ctx context.Context, name string) error

func (*ClusterPolicyClient) EnableDCGM

func (h *ClusterPolicyClient) EnableDCGM(ctx context.Context, name string) error

func (*ClusterPolicyClient) EnableDCGMExporter

func (h *ClusterPolicyClient) EnableDCGMExporter(ctx context.Context, name string) error

func (*ClusterPolicyClient) EnableGFD

func (h *ClusterPolicyClient) EnableGFD(ctx context.Context, name string) error

func (*ClusterPolicyClient) Get

func (*ClusterPolicyClient) SetMIGStrategy

func (h *ClusterPolicyClient) SetMIGStrategy(ctx context.Context, name, strategy string) error

func (*ClusterPolicyClient) Update

func (*ClusterPolicyClient) UpdateDriverVersion

func (h *ClusterPolicyClient) UpdateDriverVersion(ctx context.Context, name, version string) error

func (*ClusterPolicyClient) WaitForReady

func (h *ClusterPolicyClient) WaitForReady(ctx context.Context, name string, timeout time.Duration) error

type DaemonSetClient

type DaemonSetClient struct {
	// contains filtered or unexported fields
}

func NewDaemonSetClient

func NewDaemonSetClient(client kubernetes.Interface) *DaemonSetClient

func (*DaemonSetClient) CheckNoRestarts

func (h *DaemonSetClient) CheckNoRestarts(ctx context.Context, namespace, name string) error

func (*DaemonSetClient) Get

func (h *DaemonSetClient) Get(ctx context.Context, namespace, name string) (*appsv1.DaemonSet, error)

func (*DaemonSetClient) GetByLabel

func (h *DaemonSetClient) GetByLabel(ctx context.Context, namespace, labelKey, labelValue string) (*appsv1.DaemonSet, error)

func (*DaemonSetClient) GetImage

func (h *DaemonSetClient) GetImage(ctx context.Context, namespace, name string) (string, error)

func (*DaemonSetClient) IsReady

func (h *DaemonSetClient) IsReady(ctx context.Context, namespace, name string) (bool, error)

func (*DaemonSetClient) WaitForReady

func (h *DaemonSetClient) WaitForReady(ctx context.Context, namespace, name string, timeout time.Duration) error

type NodeClient

type NodeClient struct {
	// contains filtered or unexported fields
}

func NewNodeClient

func NewNodeClient(client kubernetes.Interface) *NodeClient

func (*NodeClient) GetNodesByLabel

func (h *NodeClient) GetNodesByLabel(ctx context.Context, labelKey, labelValue string) ([]corev1.Node, error)

func (*NodeClient) LabelAllNodes

func (h *NodeClient) LabelAllNodes(ctx context.Context, key, value string) error

func (*NodeClient) LabelNode

func (h *NodeClient) LabelNode(ctx context.Context, nodeName, key, value string) error

func (*NodeClient) ListNodes

func (h *NodeClient) ListNodes(ctx context.Context) ([]corev1.Node, error)

func (*NodeClient) UnlabelAllNodes

func (h *NodeClient) UnlabelAllNodes(ctx context.Context, key string) error

func (*NodeClient) UnlabelNode

func (h *NodeClient) UnlabelNode(ctx context.Context, nodeName, key string) error

type NvidiaDriverClient

type NvidiaDriverClient struct {
	// contains filtered or unexported fields
}

func NewNvidiaDriverClient

func NewNvidiaDriverClient(client gpuclientset.Interface, k8sClient kubernetes.Interface) *NvidiaDriverClient

func (*NvidiaDriverClient) Create

func (*NvidiaDriverClient) Delete

func (h *NvidiaDriverClient) Delete(ctx context.Context, name string) error

func (*NvidiaDriverClient) Get

func (*NvidiaDriverClient) List

func (*NvidiaDriverClient) Update

func (*NvidiaDriverClient) UpdateDriverVersion

func (h *NvidiaDriverClient) UpdateDriverVersion(ctx context.Context, name, version string) error

func (*NvidiaDriverClient) WaitForPodsReady

func (h *NvidiaDriverClient) WaitForPodsReady(ctx context.Context, namespace string, timeout time.Duration) error

WaitForPodsReady waits for the NVIDIA driver pods to be ready and not terminating. It checks actual pod readiness, similar to check_nvidia_driver_pods_ready() in the bash tests.

func (*NvidiaDriverClient) WaitForPodsUpgradeDone

func (h *NvidiaDriverClient) WaitForPodsUpgradeDone(ctx context.Context, name string, timeout time.Duration) error

WaitForPodsUpgradeDone waits for the driver upgrade to complete on all GPU nodes.

type OperatorClient

type OperatorClient struct {
	// contains filtered or unexported fields
}

func NewOperatorClient

func NewOperatorClient(opts ...OperatorClientOption) (*OperatorClient, error)

func (*OperatorClient) Install

func (op *OperatorClient) Install(ctx context.Context, params []string, chartOpts ChartOptions) (string, error)

func (*OperatorClient) Uninstall

func (op *OperatorClient) Uninstall(releaseName string) error

type OperatorClientOption

type OperatorClientOption func(client *OperatorClient)

func WithChart

func WithChart(chart string) OperatorClientOption

func WithKubeConfig

func WithKubeConfig(kubeconfig string) OperatorClientOption

func WithNamespace

func WithNamespace(namespace string) OperatorClientOption

type PodClient

type PodClient struct {
	// contains filtered or unexported fields
}

func NewPodClient

func NewPodClient(k8sClient corev1client.CoreV1Interface) *PodClient

func (*PodClient) CreateNamespace

func (c *PodClient) CreateNamespace(ctx context.Context, namespaceName string, labels map[string]string) (*corev1.Namespace, error)

func (*PodClient) DeleteNamespace

func (c *PodClient) DeleteNamespace(ctx context.Context, namespaceName string) error

func (*PodClient) EnsureNoPodRestarts

func (c *PodClient) EnsureNoPodRestarts(ctx context.Context, podName, namespace string) (bool, error)

func (*PodClient) GetPodLogs

func (c *PodClient) GetPodLogs(ctx context.Context, pod corev1.Pod) (string, error)

func (*PodClient) GetPodsByLabel

func (c *PodClient) GetPodsByLabel(ctx context.Context, namespace string, labelMap map[string]string) ([]corev1.Pod, error)

func (*PodClient) IsPodReady

func (c *PodClient) IsPodReady(ctx context.Context, podName, namespace string) (bool, error)

type WorkloadClient

type WorkloadClient struct {
	// contains filtered or unexported fields
}

func NewWorkloadClient

func NewWorkloadClient(client kubernetes.Interface) *WorkloadClient

func (*WorkloadClient) Delete

func (h *WorkloadClient) Delete(ctx context.Context, namespace, name string) error

func (*WorkloadClient) DeployPod

func (h *WorkloadClient) DeployPod(ctx context.Context, namespace string, podSpec *corev1.Pod) (*corev1.Pod, error)

func (*WorkloadClient) GetLogs

func (h *WorkloadClient) GetLogs(ctx context.Context, namespace, name string) (string, error)

func (*WorkloadClient) VerifyGPUAccess

func (h *WorkloadClient) VerifyGPUAccess(ctx context.Context, namespace, name string) error

VerifyGPUAccess checks pod logs for evidence of GPU access. TODO: Improve this by exec'ing into the container and invoking nvidia-smi directly.

func (*WorkloadClient) WaitForCompletion

func (h *WorkloadClient) WaitForCompletion(ctx context.Context, namespace, name string, timeout time.Duration) error

func (*WorkloadClient) WaitForRunning

func (h *WorkloadClient) WaitForRunning(ctx context.Context, namespace, name string, timeout time.Duration) error

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL