Documentation
¶
Index ¶
- func CreateSimpleGPUPod(name, namespace string, gpuLimit int) *corev1.Pod
- type ChartOptions
- type ClusterPolicyClient
- func (h *ClusterPolicyClient) DisableDCGM(ctx context.Context, name string) error
- func (h *ClusterPolicyClient) DisableDCGMExporter(ctx context.Context, name string) error
- func (h *ClusterPolicyClient) DisableGFD(ctx context.Context, name string) error
- func (h *ClusterPolicyClient) EnableDCGM(ctx context.Context, name string) error
- func (h *ClusterPolicyClient) EnableDCGMExporter(ctx context.Context, name string) error
- func (h *ClusterPolicyClient) EnableGFD(ctx context.Context, name string) error
- func (h *ClusterPolicyClient) Get(ctx context.Context, name string) (*nvidiav1.ClusterPolicy, error)
- func (h *ClusterPolicyClient) SetMIGStrategy(ctx context.Context, name, strategy string) error
- func (h *ClusterPolicyClient) Update(ctx context.Context, cp *nvidiav1.ClusterPolicy) (*nvidiav1.ClusterPolicy, error)
- func (h *ClusterPolicyClient) UpdateDriverVersion(ctx context.Context, name, version string) error
- func (h *ClusterPolicyClient) WaitForReady(ctx context.Context, name string, timeout time.Duration) error
- type DaemonSetClient
- func (h *DaemonSetClient) CheckNoRestarts(ctx context.Context, namespace, name string) error
- func (h *DaemonSetClient) Get(ctx context.Context, namespace, name string) (*appsv1.DaemonSet, error)
- func (h *DaemonSetClient) GetByLabel(ctx context.Context, namespace, labelKey, labelValue string) (*appsv1.DaemonSet, error)
- func (h *DaemonSetClient) GetImage(ctx context.Context, namespace, name string) (string, error)
- func (h *DaemonSetClient) IsReady(ctx context.Context, namespace, name string) (bool, error)
- func (h *DaemonSetClient) WaitForReady(ctx context.Context, namespace, name string, timeout time.Duration) error
- type NodeClient
- func (h *NodeClient) GetNodesByLabel(ctx context.Context, labelKey, labelValue string) ([]corev1.Node, error)
- func (h *NodeClient) LabelAllNodes(ctx context.Context, key, value string) error
- func (h *NodeClient) LabelNode(ctx context.Context, nodeName, key, value string) error
- func (h *NodeClient) ListNodes(ctx context.Context) ([]corev1.Node, error)
- func (h *NodeClient) UnlabelAllNodes(ctx context.Context, key string) error
- func (h *NodeClient) UnlabelNode(ctx context.Context, nodeName, key string) error
- type NvidiaDriverClient
- func (h *NvidiaDriverClient) Create(ctx context.Context, driver *nvidiav1alpha1.NVIDIADriver) (*nvidiav1alpha1.NVIDIADriver, error)
- func (h *NvidiaDriverClient) Delete(ctx context.Context, name string) error
- func (h *NvidiaDriverClient) Get(ctx context.Context, name string) (*nvidiav1alpha1.NVIDIADriver, error)
- func (h *NvidiaDriverClient) List(ctx context.Context) (*nvidiav1alpha1.NVIDIADriverList, error)
- func (h *NvidiaDriverClient) Update(ctx context.Context, driver *nvidiav1alpha1.NVIDIADriver) (*nvidiav1alpha1.NVIDIADriver, error)
- func (h *NvidiaDriverClient) UpdateDriverVersion(ctx context.Context, name, version string) error
- func (h *NvidiaDriverClient) WaitForPodsReady(ctx context.Context, namespace string, timeout time.Duration) error
- func (h *NvidiaDriverClient) WaitForPodsUpgradeDone(ctx context.Context, name string, timeout time.Duration) error
- type OperatorClient
- type OperatorClientOption
- type PodClient
- func (c *PodClient) CreateNamespace(ctx context.Context, namespaceName string, labels map[string]string) (*corev1.Namespace, error)
- func (c *PodClient) DeleteNamespace(ctx context.Context, namespaceName string) error
- func (c *PodClient) EnsureNoPodRestarts(ctx context.Context, podName, namespace string) (bool, error)
- func (c *PodClient) GetPodLogs(ctx context.Context, pod corev1.Pod) (string, error)
- func (c *PodClient) GetPodsByLabel(ctx context.Context, namespace string, labelMap map[string]string) ([]corev1.Pod, error)
- func (c *PodClient) IsPodReady(ctx context.Context, podName, namespace string) (bool, error)
- type WorkloadClient
- func (h *WorkloadClient) Delete(ctx context.Context, namespace, name string) error
- func (h *WorkloadClient) DeployPod(ctx context.Context, namespace string, podSpec *corev1.Pod) (*corev1.Pod, error)
- func (h *WorkloadClient) GetLogs(ctx context.Context, namespace, name string) (string, error)
- func (h *WorkloadClient) VerifyGPUAccess(ctx context.Context, namespace, name string) error
- func (h *WorkloadClient) WaitForCompletion(ctx context.Context, namespace, name string, timeout time.Duration) error
- func (h *WorkloadClient) WaitForRunning(ctx context.Context, namespace, name string, timeout time.Duration) error
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
Types ¶
type ChartOptions ¶
type ClusterPolicyClient ¶
type ClusterPolicyClient struct {
// contains filtered or unexported fields
}
func NewClusterPolicyClient ¶
func NewClusterPolicyClient(client gpuclientset.Interface) *ClusterPolicyClient
func (*ClusterPolicyClient) DisableDCGM ¶
func (h *ClusterPolicyClient) DisableDCGM(ctx context.Context, name string) error
func (*ClusterPolicyClient) DisableDCGMExporter ¶
func (h *ClusterPolicyClient) DisableDCGMExporter(ctx context.Context, name string) error
func (*ClusterPolicyClient) DisableGFD ¶
func (h *ClusterPolicyClient) DisableGFD(ctx context.Context, name string) error
func (*ClusterPolicyClient) EnableDCGM ¶
func (h *ClusterPolicyClient) EnableDCGM(ctx context.Context, name string) error
func (*ClusterPolicyClient) EnableDCGMExporter ¶
func (h *ClusterPolicyClient) EnableDCGMExporter(ctx context.Context, name string) error
func (*ClusterPolicyClient) EnableGFD ¶
func (h *ClusterPolicyClient) EnableGFD(ctx context.Context, name string) error
func (*ClusterPolicyClient) Get ¶
func (h *ClusterPolicyClient) Get(ctx context.Context, name string) (*nvidiav1.ClusterPolicy, error)
func (*ClusterPolicyClient) SetMIGStrategy ¶
func (h *ClusterPolicyClient) SetMIGStrategy(ctx context.Context, name, strategy string) error
func (*ClusterPolicyClient) Update ¶
func (h *ClusterPolicyClient) Update(ctx context.Context, cp *nvidiav1.ClusterPolicy) (*nvidiav1.ClusterPolicy, error)
func (*ClusterPolicyClient) UpdateDriverVersion ¶
func (h *ClusterPolicyClient) UpdateDriverVersion(ctx context.Context, name, version string) error
func (*ClusterPolicyClient) WaitForReady ¶
type DaemonSetClient ¶
type DaemonSetClient struct {
// contains filtered or unexported fields
}
func NewDaemonSetClient ¶
func NewDaemonSetClient(client kubernetes.Interface) *DaemonSetClient
func (*DaemonSetClient) CheckNoRestarts ¶
func (h *DaemonSetClient) CheckNoRestarts(ctx context.Context, namespace, name string) error
func (*DaemonSetClient) GetByLabel ¶
func (*DaemonSetClient) WaitForReady ¶
type NodeClient ¶
type NodeClient struct {
// contains filtered or unexported fields
}
func NewNodeClient ¶
func NewNodeClient(client kubernetes.Interface) *NodeClient
func (*NodeClient) GetNodesByLabel ¶
func (*NodeClient) LabelAllNodes ¶
func (h *NodeClient) LabelAllNodes(ctx context.Context, key, value string) error
func (*NodeClient) LabelNode ¶
func (h *NodeClient) LabelNode(ctx context.Context, nodeName, key, value string) error
func (*NodeClient) UnlabelAllNodes ¶
func (h *NodeClient) UnlabelAllNodes(ctx context.Context, key string) error
func (*NodeClient) UnlabelNode ¶
func (h *NodeClient) UnlabelNode(ctx context.Context, nodeName, key string) error
type NvidiaDriverClient ¶
type NvidiaDriverClient struct {
// contains filtered or unexported fields
}
func NewNvidiaDriverClient ¶
func NewNvidiaDriverClient(client gpuclientset.Interface, k8sClient kubernetes.Interface) *NvidiaDriverClient
func (*NvidiaDriverClient) Create ¶
func (h *NvidiaDriverClient) Create(ctx context.Context, driver *nvidiav1alpha1.NVIDIADriver) (*nvidiav1alpha1.NVIDIADriver, error)
func (*NvidiaDriverClient) Delete ¶
func (h *NvidiaDriverClient) Delete(ctx context.Context, name string) error
func (*NvidiaDriverClient) Get ¶
func (h *NvidiaDriverClient) Get(ctx context.Context, name string) (*nvidiav1alpha1.NVIDIADriver, error)
func (*NvidiaDriverClient) List ¶
func (h *NvidiaDriverClient) List(ctx context.Context) (*nvidiav1alpha1.NVIDIADriverList, error)
func (*NvidiaDriverClient) Update ¶
func (h *NvidiaDriverClient) Update(ctx context.Context, driver *nvidiav1alpha1.NVIDIADriver) (*nvidiav1alpha1.NVIDIADriver, error)
func (*NvidiaDriverClient) UpdateDriverVersion ¶
func (h *NvidiaDriverClient) UpdateDriverVersion(ctx context.Context, name, version string) error
func (*NvidiaDriverClient) WaitForPodsReady ¶
func (h *NvidiaDriverClient) WaitForPodsReady(ctx context.Context, namespace string, timeout time.Duration) error
WaitForPodsReady waits for the NVIDIA driver pods to be ready and not terminating. This checks actual pod readiness, similar to check_nvidia_driver_pods_ready() in the bash tests.
func (*NvidiaDriverClient) WaitForPodsUpgradeDone ¶
func (h *NvidiaDriverClient) WaitForPodsUpgradeDone(ctx context.Context, name string, timeout time.Duration) error
WaitForPodsUpgradeDone waits for the driver upgrade to complete on all GPU nodes.
type OperatorClient ¶
type OperatorClient struct {
// contains filtered or unexported fields
}
func NewOperatorClient ¶
func NewOperatorClient(opts ...OperatorClientOption) (*OperatorClient, error)
func (*OperatorClient) Install ¶
func (op *OperatorClient) Install(ctx context.Context, params []string, chartOpts ChartOptions) (string, error)
func (*OperatorClient) Uninstall ¶
func (op *OperatorClient) Uninstall(releaseName string) error
type OperatorClientOption ¶
type OperatorClientOption func(client *OperatorClient)
func WithChart ¶
func WithChart(chart string) OperatorClientOption
func WithKubeConfig ¶
func WithKubeConfig(kubeconfig string) OperatorClientOption
func WithNamespace ¶
func WithNamespace(namespace string) OperatorClientOption
type PodClient ¶
type PodClient struct {
// contains filtered or unexported fields
}
func NewPodClient ¶
func NewPodClient(k8sClient corev1client.CoreV1Interface) *PodClient
func (*PodClient) CreateNamespace ¶
func (*PodClient) DeleteNamespace ¶
func (*PodClient) EnsureNoPodRestarts ¶
func (*PodClient) GetPodLogs ¶
func (*PodClient) GetPodsByLabel ¶
type WorkloadClient ¶
type WorkloadClient struct {
// contains filtered or unexported fields
}
func NewWorkloadClient ¶
func NewWorkloadClient(client kubernetes.Interface) *WorkloadClient
func (*WorkloadClient) Delete ¶
func (h *WorkloadClient) Delete(ctx context.Context, namespace, name string) error
func (*WorkloadClient) VerifyGPUAccess ¶
func (h *WorkloadClient) VerifyGPUAccess(ctx context.Context, namespace, name string) error
VerifyGPUAccess checks pod logs for evidence of GPU access. TODO: Improve this by exec'ing into the container and invoking nvidia-smi directly.
func (*WorkloadClient) WaitForCompletion ¶
func (*WorkloadClient) WaitForRunning ¶
Click to show internal directories.
Click to hide internal directories.