Documentation ¶
Index ¶
- Constants
- func AddNodeLabel(cl *kubernetes.Clientset, nodeName string, key string, value string) error
- func CheckDeploymentWithStandardKMMNFD(cl *kubernetes.Clientset, create bool) error
- func CheckGpuLabel(rl v1.ResourceList, label string) bool
- func CheckHelmDeployment(cl *kubernetes.Clientset, ns string, create bool) error
- func CheckHelmOCDeployment(cl *kubernetes.Clientset, create bool) error
- func CheckOCDeploymentWithStandardKMMNFD(cl *kubernetes.Clientset, create bool) error
- func CreateConfigMap(ctx context.Context, cl *kubernetes.Clientset, ns string, cmName string, ...) error
- func CreateDaemonset(cl *kubernetes.Clientset, ns string, name string, image string, ...) error
- func CreateDaemonsetVerify(ctx context.Context, cl *kubernetes.Clientset, ns string, name string, ...) error
- func CreateMinioService(ctx context.Context, cl *kubernetes.Clientset, ns, hostName string) error
- func CreateOpaqueSecret(ctx context.Context, cl *kubernetes.Clientset, name, ns string, ...) error
- func CreatePod(ctx context.Context, cl *kubernetes.Clientset, ns string, name string, ...) error
- func CreateTLSSecret(ctx context.Context, cl *kubernetes.Clientset, name, ns string, ...) error
- func CreateTempFile(fileName string, data []byte) (*os.File, error)
- func CurlMetrics(endpointIPs []string, token string, port int, secure bool, caCertPath string, ...) error
- func DRADriverName(cfgName string) string
- func DelDaemonset(cl *kubernetes.Clientset, ns string, name string) error
- func DelRocmPods(ctx context.Context, cl *kubernetes.Clientset) error
- func DelRocmPodsByNodeNames(ctx context.Context, cl *kubernetes.Clientset, workerNodeNames []string) error
- func DeleteConfigMap(ctx context.Context, cl *kubernetes.Clientset, ns string, cmName string) error
- func DeleteMinioService(ctx context.Context, cl *kubernetes.Clientset, ns string)
- func DeleteNodeAppDaemonSet(cl *kubernetes.Clientset) error
- func DeleteNodeLabel(cl *kubernetes.Clientset, nodeName string, key string) error
- func DeleteOpaqueSecret(ctx context.Context, cl *kubernetes.Clientset, name, ns string)
- func DeletePod(ctx context.Context, cl *kubernetes.Clientset, ns string, name string) error
- func DeleteRebootPod(ctx context.Context, cl *kubernetes.Clientset, nodeName string, force bool)
- func DeleteTLSSecret(ctx context.Context, cl *kubernetes.Clientset, name, ns string) error
- func DeleteTempFile(file *os.File) error
- func DeployNodeAppDaemonSet(cl *kubernetes.Clientset) error
- func DeployResourcesFromFile(pathOrURL string, cl *kubernetes.Clientset, apiCl *apiextClient.Clientset, ...) error
- func DeployRocmPods(ctx context.Context, cl *kubernetes.Clientset, res *v1.ResourceRequirements) error
- func DeployRocmPodsByNodeNames(ctx context.Context, cl *kubernetes.Clientset, workerNodeNames []string) error
- func DeployRocmPytorchPods(ctx context.Context, cl *kubernetes.Clientset, res *v1.ResourceRequirements) error
- func DevicePluginName(cfgName string) string
- func ExecPodCmd(command string, ns string, name string, container string) (string, error)
- func GenerateServiceAccountToken(clientset *kubernetes.Clientset, serviceAccountName, namespace string) (string, error)
- func GetAMDGPUCount(ctx context.Context, cl *kubernetes.Clientset, resourceType string) (map[string]int, error)
- func GetAMDGpuWorker(cl *kubernetes.Clientset, isOpenshift bool) []v1.Node
- func GetClusterIP(clientset *kubernetes.Clientset, serviceName, namespace string) (string, error)
- func GetClusterType(cfg *rest.Config) string
- func GetGpuDriverVersion(name string) (string, error)
- func GetJobLogs(clientset *kubernetes.Clientset, job *batchv1.Job) ([]string, error)
- func GetNodeIP(ctx context.Context, cl *kubernetes.Clientset, nodeName string) (string, error)
- func GetNodeIPs(clientset *kubernetes.Clientset) ([]string, error)
- func GetNodeIPsForDaemonSet(clientset *kubernetes.Clientset, daemonSetName, namespace string) ([]string, error)
- func GetNonAMDGpuWorker(cl *kubernetes.Clientset) []v1.Node
- func GetPodNamesFromJob(clientset *kubernetes.Clientset, job *batchv1.Job) ([]string, error)
- func GetRebootPod(nodeName string) *v1.Pod
- func GetRocmInfo(name string) (string, error)
- func GetServiceEndpoints(clientset *kubernetes.Clientset, serviceName, namespace string) ([]string, error)
- func GetWorkerNodes(cl *kubernetes.Clientset) []*v1.Node
- func HandleNodesReboot(ctx context.Context, cl *kubernetes.Clientset, nodes []v1.Node) error
- func IsJSONParsable(s string) bool
- func ListGpuDrivers(name string) (string, error)
- func ListRocmPods(ctx context.Context, cl *kubernetes.Clientset) ([]string, error)
- func ListRocmPodsByNodeNames(ctx context.Context, workerNodeNames []string) []string
- func NFDWorkerName(isOpenshift bool) string
- func NodeLabellerName(cfgName string) string
- func NodeTaint(cl *kubernetes.Clientset, nodeName string) error
- func PatchKMMDeploymentWithCIENVFlag(cl *kubernetes.Clientset) error
- func PatchOperatorControllerDeploymentWithCIENVFlag(cl *kubernetes.Clientset) error
- func RemoveMinioServiceAccount(ns, pod, container, accessKey string)
- func Retry(f func() error, timeout time.Duration, period time.Duration) error
- func RunCommand(command string)
- func RunCommandOnNode(ctx context.Context, cl *kubernetes.Clientset, nodeName, command string) (string, error)
- func SetGPUHealthOnNode(cl *kubernetes.Clientset, ns, gpuid, health, nodeName string) error
- func SetupAccessKeysOnMinioServer(ns, pod, container, accessKey, secretKey string)
- func SplitYAML(data []byte) [][]byte
- func VerifyROCMPODResourceCount(ctx context.Context, cl *kubernetes.Clientset, gpuReqCount int, ...) error
- type Option
- func WithDynamicClient(dc dynamic.Interface) Option
- func WithLogCollection() Option
- func WithNodeDiagnostics() Option
- func WithNodeDiagnosticsImage(image string) Option
- func WithNodeDiagnosticsSelector(selector string) Option
- func WithSnapshotInterval(d time.Duration) Option
- func WithSnapshots() Option
- type TestMonitor
- type UserRequest
Constants ¶
const ClusterTypeK8s = "kubernetes"
const ClusterTypeOpenShift = "openshift"
const HttpServerPort = "8084"
Variables ¶
This section is empty.
Functions ¶
func AddNodeLabel ¶ added in v1.2.2
func CheckDeploymentWithStandardKMMNFD ¶
func CheckDeploymentWithStandardKMMNFD(cl *kubernetes.Clientset, create bool) error
func CheckGpuLabel ¶
func CheckGpuLabel(rl v1.ResourceList, label string) bool
func CheckHelmDeployment ¶
func CheckHelmDeployment(cl *kubernetes.Clientset, ns string, create bool) error
func CheckHelmOCDeployment ¶
func CheckHelmOCDeployment(cl *kubernetes.Clientset, create bool) error
func CheckOCDeploymentWithStandardKMMNFD ¶
func CheckOCDeploymentWithStandardKMMNFD(cl *kubernetes.Clientset, create bool) error
func CreateConfigMap ¶ added in v1.4.0
func CreateDaemonset ¶
func CreateDaemonset(cl *kubernetes.Clientset, ns string, name string, image string, matchLabels map[string]string, res *v1.ResourceRequirements) error
func CreateDaemonsetVerify ¶
func CreateMinioService ¶ added in v1.2.2
func CreateOpaqueSecret ¶ added in v1.2.2
func CreateTLSSecret ¶
func CurlMetrics ¶
func DRADriverName ¶ added in v1.5.0
func DelDaemonset ¶
func DelDaemonset(cl *kubernetes.Clientset, ns string, name string) error
func DelRocmPods ¶
func DelRocmPods(ctx context.Context, cl *kubernetes.Clientset) error
func DelRocmPodsByNodeNames ¶
func DeleteConfigMap ¶ added in v1.4.0
func DeleteMinioService ¶ added in v1.2.2
func DeleteMinioService(ctx context.Context, cl *kubernetes.Clientset, ns string)
func DeleteNodeAppDaemonSet ¶
func DeleteNodeAppDaemonSet(cl *kubernetes.Clientset) error
func DeleteNodeLabel ¶ added in v1.2.2
func DeleteNodeLabel(cl *kubernetes.Clientset, nodeName string, key string) error
func DeleteOpaqueSecret ¶ added in v1.2.2
func DeleteOpaqueSecret(ctx context.Context, cl *kubernetes.Clientset, name, ns string)
func DeleteRebootPod ¶
func DeleteTLSSecret ¶
func DeleteTempFile ¶
func DeployNodeAppDaemonSet ¶
func DeployNodeAppDaemonSet(cl *kubernetes.Clientset) error
func DeployResourcesFromFile ¶
func DeployResourcesFromFile(pathOrURL string, cl *kubernetes.Clientset, apiCl *apiextClient.Clientset, create bool) error
func DeployRocmPods ¶
func DeployRocmPods(ctx context.Context, cl *kubernetes.Clientset, res *v1.ResourceRequirements) error
func DeployRocmPytorchPods ¶ added in v1.4.0
func DeployRocmPytorchPods(ctx context.Context, cl *kubernetes.Clientset, res *v1.ResourceRequirements) error
func DevicePluginName ¶
func ExecPodCmd ¶
func GenerateServiceAccountToken ¶
func GenerateServiceAccountToken(clientset *kubernetes.Clientset, serviceAccountName, namespace string) (string, error)
func GetAMDGPUCount ¶
func GetAMDGpuWorker ¶
func GetAMDGpuWorker(cl *kubernetes.Clientset, isOpenshift bool) []v1.Node
func GetClusterIP ¶
func GetClusterIP(clientset *kubernetes.Clientset, serviceName, namespace string) (string, error)
func GetClusterType ¶
func GetGpuDriverVersion ¶
func GetJobLogs ¶
func GetNodeIPs ¶
func GetNodeIPs(clientset *kubernetes.Clientset) ([]string, error)
func GetNodeIPsForDaemonSet ¶
func GetNodeIPsForDaemonSet(clientset *kubernetes.Clientset, daemonSetName, namespace string) ([]string, error)
func GetNonAMDGpuWorker ¶
func GetNonAMDGpuWorker(cl *kubernetes.Clientset) []v1.Node
func GetPodNamesFromJob ¶
func GetRebootPod ¶
func GetRocmInfo ¶
func GetServiceEndpoints ¶
func GetServiceEndpoints(clientset *kubernetes.Clientset, serviceName, namespace string) ([]string, error)
func GetWorkerNodes ¶
func GetWorkerNodes(cl *kubernetes.Clientset) []*v1.Node
func HandleNodesReboot ¶
func IsJSONParsable ¶
func ListGpuDrivers ¶
func ListRocmPods ¶
func ListRocmPodsByNodeNames ¶
func NFDWorkerName ¶
func NodeLabellerName ¶
func PatchKMMDeploymentWithCIENVFlag ¶
func PatchKMMDeploymentWithCIENVFlag(cl *kubernetes.Clientset) error
func PatchOperatorControllerDeploymentWithCIENVFlag ¶
func PatchOperatorControllerDeploymentWithCIENVFlag(cl *kubernetes.Clientset) error
func RemoveMinioServiceAccount ¶ added in v1.4.0
func RemoveMinioServiceAccount(ns, pod, container, accessKey string)
func RunCommand ¶
func RunCommand(command string)
func RunCommandOnNode ¶
func SetGPUHealthOnNode ¶
func SetGPUHealthOnNode(cl *kubernetes.Clientset, ns, gpuid, health, nodeName string) error
func SetupAccessKeysOnMinioServer ¶ added in v1.2.2
func SetupAccessKeysOnMinioServer(ns, pod, container, accessKey, secretKey string)
Types ¶
type Option ¶ added in v1.5.0
type Option func(*TestMonitor)
Option configures a TestMonitor.
func WithDynamicClient ¶ added in v1.5.0
func WithDynamicClient(dc dynamic.Interface) Option
WithDynamicClient sets a dynamic Kubernetes client on the TestMonitor, enabling snapshots of custom resources (e.g. DeviceConfig CRs).
func WithLogCollection ¶ added in v1.5.0
func WithLogCollection() Option
WithLogCollection enables the log collection module. Pod logs (init + regular containers) are streamed to files under <testDir>/logs/.
func WithNodeDiagnostics ¶ added in v1.5.0
func WithNodeDiagnostics() Option
WithNodeDiagnostics enables collection of dmesg and lsmod from GPU worker nodes (those labelled feature.node.kubernetes.io/amd-gpu=true) at the end of each test. Diagnostics are saved under <testDir>/node-diagnostics/<nodeName>/. The container image defaults to the E2E_NODE_DIAG_IMAGE env var (set via dev.env / Makefile), falling back to busybox:1.36.
func WithNodeDiagnosticsImage ¶ added in v1.5.0
func WithNodeDiagnosticsImage(image string) Option
WithNodeDiagnosticsImage enables node diagnostics with a custom container image. The image must have nsenter available.
func WithNodeDiagnosticsSelector ¶ added in v1.5.0
func WithNodeDiagnosticsSelector(selector string) Option
WithNodeDiagnosticsSelector overrides the default node label selector used to pick which nodes to collect diagnostics from. Default: "feature.node.kubernetes.io/amd-gpu=true".
func WithSnapshotInterval ¶ added in v1.5.0
func WithSnapshotInterval(d time.Duration) Option
WithSnapshotInterval enables snapshots and sets a custom interval.
func WithSnapshots ¶ added in v1.5.0
func WithSnapshots() Option
WithSnapshots enables the periodic resource snapshot module. Resource state is dumped every snapshotInterval (default 30s) to <testDir>/snapshots/.
type TestMonitor ¶ added in v1.5.0
type TestMonitor struct {
// contains filtered or unexported fields
}
TestMonitor observes cluster state during an e2e test. It supports independent modules that can be enabled/disabled via functional options:
- Log collection: lists existing pods then watches for new ones, streaming all container logs (init + regular) to a single chronological file with pod/container prefixes. Uses SinceTime to scope logs from long-running pods to the current test. Handles container restarts by re-following when a stream ends.
- Snapshots: periodically dumps resource state (pods, daemonsets, deployments, events) to timestamped files.
Create one TestMonitor per namespace. If you need to watch multiple namespaces, create multiple instances.
Usage:
    // Both modules:
    mon := NewTestMonitor(cs, "kube-amd-gpu", "e2e-artifacts",
        WithLogCollection(),
        WithSnapshots(),
    )
    // Only snapshots:
    mon := NewTestMonitor(cs, "kube-amd-gpu", "e2e-artifacts",
        WithSnapshots(),
    )
    mon.Start("E2ESuite.TestDeployment")
    // ... test runs ...
    mon.Stop()
func NewTestMonitor ¶ added in v1.5.0
func NewTestMonitor(clientSet kubernetes.Interface, namespace string, baseDir string, opts ...Option) *TestMonitor
NewTestMonitor creates a new TestMonitor for a single namespace. Pass one or more Option values to enable modules. If no options are passed, nothing is collected (the monitor is inert).
func (*TestMonitor) Start ¶ added in v1.5.0
func (tm *TestMonitor) Start(testName string)
Start begins observation for a test. It records the current time so that logs from long-running pods (like the operator controller) are only collected from this point forward.
func (*TestMonitor) Stop ¶ added in v1.5.0
func (tm *TestMonitor) Stop()
Stop halts all observation goroutines, waits for them to finish, and takes a final resource snapshot (if snapshots are enabled).
type UserRequest ¶
type UserRequest struct {
Command string `json:"command"`
}