utils

package
v1.5.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: May 6, 2026 | License: Apache-2.0 | Imports: 45 | Imported by: 0

Documentation

Index

Constants

View Source
const ClusterTypeK8s = "kubernetes"
View Source
const ClusterTypeOpenShift = "openshift"
View Source
const HttpServerPort = "8084"

Variables

This section is empty.

Functions

func AddNodeLabel added in v1.2.2

func AddNodeLabel(cl *kubernetes.Clientset, nodeName string, key string, value string) error

func CheckDeploymentWithStandardKMMNFD

func CheckDeploymentWithStandardKMMNFD(cl *kubernetes.Clientset, create bool) error

func CheckGpuLabel

func CheckGpuLabel(rl v1.ResourceList, label string) bool

func CheckHelmDeployment

func CheckHelmDeployment(cl *kubernetes.Clientset, ns string, create bool) error

func CheckHelmOCDeployment

func CheckHelmOCDeployment(cl *kubernetes.Clientset, create bool) error

func CheckOCDeploymentWithStandardKMMNFD

func CheckOCDeploymentWithStandardKMMNFD(cl *kubernetes.Clientset, create bool) error

func CreateConfigMap added in v1.4.0

func CreateConfigMap(ctx context.Context, cl *kubernetes.Clientset, ns string, cmName string, data map[string]string) error

func CreateDaemonset

func CreateDaemonset(cl *kubernetes.Clientset, ns string, name string, image string, matchLabels map[string]string, res *v1.ResourceRequirements) error

func CreateDaemonsetVerify

func CreateDaemonsetVerify(ctx context.Context, cl *kubernetes.Clientset, ns string,
	name string, image string, matchLabels map[string]string,
	res *v1.ResourceRequirements) error

func CreateMinioService added in v1.2.2

func CreateMinioService(ctx context.Context, cl *kubernetes.Clientset, ns, hostName string) error

func CreateOpaqueSecret added in v1.2.2

func CreateOpaqueSecret(ctx context.Context, cl *kubernetes.Clientset, name, ns string, keys map[string]string) error

func CreatePod

func CreatePod(ctx context.Context, cl *kubernetes.Clientset, ns string,
	name string, image string, workerNodeName string) error

func CreateTLSSecret

func CreateTLSSecret(ctx context.Context, cl *kubernetes.Clientset, name, ns string, crt, key []byte) error

func CreateTempFile

func CreateTempFile(fileName string, data []byte) (*os.File, error)

func CurlMetrics

func CurlMetrics(
	endpointIPs []string,
	token string,
	port int,
	secure bool,
	caCertPath string,
	clientCertPath string,
	clientKeyPath string,
) error

func DRADriverName added in v1.5.0

func DRADriverName(cfgName string) string

func DelDaemonset

func DelDaemonset(cl *kubernetes.Clientset, ns string, name string) error

func DelRocmPods

func DelRocmPods(ctx context.Context, cl *kubernetes.Clientset) error

func DelRocmPodsByNodeNames

func DelRocmPodsByNodeNames(ctx context.Context, cl *kubernetes.Clientset,
	workerNodeNames []string) error

func DeleteConfigMap added in v1.4.0

func DeleteConfigMap(ctx context.Context, cl *kubernetes.Clientset, ns string, cmName string) error

func DeleteMinioService added in v1.2.2

func DeleteMinioService(ctx context.Context, cl *kubernetes.Clientset, ns string)

func DeleteNodeAppDaemonSet

func DeleteNodeAppDaemonSet(cl *kubernetes.Clientset) error

func DeleteNodeLabel added in v1.2.2

func DeleteNodeLabel(cl *kubernetes.Clientset, nodeName string, key string) error

func DeleteOpaqueSecret added in v1.2.2

func DeleteOpaqueSecret(ctx context.Context, cl *kubernetes.Clientset, name, ns string)

func DeletePod

func DeletePod(ctx context.Context, cl *kubernetes.Clientset, ns string,
	name string) error

func DeleteRebootPod

func DeleteRebootPod(ctx context.Context, cl *kubernetes.Clientset, nodeName string, force bool)

func DeleteTLSSecret

func DeleteTLSSecret(ctx context.Context, cl *kubernetes.Clientset, name, ns string) error

func DeleteTempFile

func DeleteTempFile(file *os.File) error

func DeployNodeAppDaemonSet

func DeployNodeAppDaemonSet(cl *kubernetes.Clientset) error

func DeployResourcesFromFile

func DeployResourcesFromFile(pathOrURL string, cl *kubernetes.Clientset, apiCl *apiextClient.Clientset, create bool) error

func DeployRocmPods

func DeployRocmPods(ctx context.Context, cl *kubernetes.Clientset,
	res *v1.ResourceRequirements) error

func DeployRocmPodsByNodeNames

func DeployRocmPodsByNodeNames(ctx context.Context, cl *kubernetes.Clientset,
	workerNodeNames []string) error

func DeployRocmPytorchPods added in v1.4.0

func DeployRocmPytorchPods(ctx context.Context, cl *kubernetes.Clientset,
	res *v1.ResourceRequirements) error

func DevicePluginName

func DevicePluginName(cfgName string) string

func ExecPodCmd

func ExecPodCmd(command string, ns string, name string, container string) (string, error)

func GenerateServiceAccountToken

func GenerateServiceAccountToken(clientset *kubernetes.Clientset, serviceAccountName, namespace string) (string, error)

func GetAMDGPUCount

func GetAMDGPUCount(ctx context.Context, cl *kubernetes.Clientset, resourceType string) (map[string]int, error)

func GetAMDGpuWorker

func GetAMDGpuWorker(cl *kubernetes.Clientset, isOpenshift bool) []v1.Node

func GetClusterIP

func GetClusterIP(clientset *kubernetes.Clientset, serviceName, namespace string) (string, error)

func GetClusterType

func GetClusterType(cfg *rest.Config) string

func GetGpuDriverVersion

func GetGpuDriverVersion(name string) (string, error)

func GetJobLogs

func GetJobLogs(clientset *kubernetes.Clientset, job *batchv1.Job) ([]string, error)

func GetNodeIP

func GetNodeIP(ctx context.Context, cl *kubernetes.Clientset,
	nodeName string) (string, error)

func GetNodeIPs

func GetNodeIPs(clientset *kubernetes.Clientset) ([]string, error)

func GetNodeIPsForDaemonSet

func GetNodeIPsForDaemonSet(clientset *kubernetes.Clientset, daemonSetName, namespace string) ([]string, error)

func GetNonAMDGpuWorker

func GetNonAMDGpuWorker(cl *kubernetes.Clientset) []v1.Node

func GetPodNamesFromJob

func GetPodNamesFromJob(clientset *kubernetes.Clientset, job *batchv1.Job) ([]string, error)

func GetRebootPod

func GetRebootPod(nodeName string) *v1.Pod

func GetRocmInfo

func GetRocmInfo(name string) (string, error)

func GetServiceEndpoints

func GetServiceEndpoints(clientset *kubernetes.Clientset, serviceName, namespace string) ([]string, error)

func GetWorkerNodes

func GetWorkerNodes(cl *kubernetes.Clientset) []*v1.Node

func HandleNodesReboot

func HandleNodesReboot(ctx context.Context, cl *kubernetes.Clientset, nodes []v1.Node) error

func IsJSONParsable

func IsJSONParsable(s string) bool

func ListGpuDrivers

func ListGpuDrivers(name string) (string, error)

func ListRocmPods

func ListRocmPods(ctx context.Context, cl *kubernetes.Clientset) ([]string, error)

func ListRocmPodsByNodeNames

func ListRocmPodsByNodeNames(ctx context.Context,
	workerNodeNames []string) []string

func NFDWorkerName

func NFDWorkerName(isOpenshift bool) string

func NodeLabellerName

func NodeLabellerName(cfgName string) string

func NodeTaint added in v1.2.2

func NodeTaint(cl *kubernetes.Clientset, nodeName string) error

func PatchKMMDeploymentWithCIENVFlag

func PatchKMMDeploymentWithCIENVFlag(cl *kubernetes.Clientset) error

func PatchOperatorControllerDeploymentWithCIENVFlag

func PatchOperatorControllerDeploymentWithCIENVFlag(cl *kubernetes.Clientset) error

func RemoveMinioServiceAccount added in v1.4.0

func RemoveMinioServiceAccount(ns, pod, container, accessKey string)

func Retry

func Retry(f func() error, timeout time.Duration, period time.Duration) error

func RunCommand

func RunCommand(command string)

func RunCommandOnNode

func RunCommandOnNode(ctx context.Context, cl *kubernetes.Clientset, nodeName, command string) (string, error)

func SetGPUHealthOnNode

func SetGPUHealthOnNode(cl *kubernetes.Clientset, ns, gpuid, health, nodeName string) error

func SetupAccessKeysOnMinioServer added in v1.2.2

func SetupAccessKeysOnMinioServer(ns, pod, container, accessKey, secretKey string)

func SplitYAML

func SplitYAML(data []byte) [][]byte

func VerifyROCMPODResourceCount

func VerifyROCMPODResourceCount(ctx context.Context, cl *kubernetes.Clientset,
	gpuReqCount int, resourceType string) error

Types

type Option added in v1.5.0

type Option func(*TestMonitor)

Option configures a TestMonitor.

func WithDynamicClient added in v1.5.0

func WithDynamicClient(dc dynamic.Interface) Option

WithDynamicClient sets a dynamic Kubernetes client on the TestMonitor, enabling snapshots of custom resources (e.g. DeviceConfig CRs).

func WithLogCollection added in v1.5.0

func WithLogCollection() Option

WithLogCollection enables the log collection module. Pod logs (init + regular containers) are streamed to files under <testDir>/logs/.

func WithNodeDiagnostics added in v1.5.0

func WithNodeDiagnostics() Option

WithNodeDiagnostics enables collection of dmesg and lsmod from GPU worker nodes (those labelled feature.node.kubernetes.io/amd-gpu=true) at the end of each test. Diagnostics are saved under <testDir>/node-diagnostics/<nodeName>/. The container image defaults to the E2E_NODE_DIAG_IMAGE env var (set via dev.env / Makefile), falling back to busybox:1.36.

func WithNodeDiagnosticsImage added in v1.5.0

func WithNodeDiagnosticsImage(image string) Option

WithNodeDiagnosticsImage enables node diagnostics with a custom container image (must have nsenter available).

func WithNodeDiagnosticsSelector added in v1.5.0

func WithNodeDiagnosticsSelector(selector string) Option

WithNodeDiagnosticsSelector overrides the default node label selector used to pick which nodes to collect diagnostics from. Default: "feature.node.kubernetes.io/amd-gpu=true".

func WithSnapshotInterval added in v1.5.0

func WithSnapshotInterval(d time.Duration) Option

WithSnapshotInterval enables snapshots and sets a custom interval.

func WithSnapshots added in v1.5.0

func WithSnapshots() Option

WithSnapshots enables the periodic resource snapshot module. Resource state is dumped every snapshotInterval (default 30s) to <testDir>/snapshots/.

type TestMonitor added in v1.5.0

type TestMonitor struct {
	// contains filtered or unexported fields
}

TestMonitor observes cluster state during an e2e test. It supports independent modules that can be enabled/disabled via functional options:

  • Log collection: lists existing pods then watches for new ones, streaming all container logs (init + regular) to a single chronological file with pod/container prefixes. Uses SinceTime to scope logs from long-running pods to the current test. Handles container restarts by re-following when a stream ends.
  • Snapshots: periodically dumps resource state (pods, daemonsets, deployments, events) to timestamped files.

Create one TestMonitor per namespace. If you need to watch multiple namespaces, create multiple instances.

Usage:

// Both modules:
mon := NewTestMonitor(cs, "kube-amd-gpu", "e2e-artifacts",
    WithLogCollection(),
    WithSnapshots(),
)

// Only snapshots:
mon := NewTestMonitor(cs, "kube-amd-gpu", "e2e-artifacts",
    WithSnapshots(),
)

mon.Start("E2ESuite.TestDeployment")
// ... test runs ...
mon.Stop()

func NewTestMonitor added in v1.5.0

func NewTestMonitor(clientSet kubernetes.Interface, namespace string, baseDir string, opts ...Option) *TestMonitor

NewTestMonitor creates a new TestMonitor for a single namespace. Pass one or more Option values to enable modules. If no options are passed, nothing is collected (the monitor is inert).

func (*TestMonitor) Start added in v1.5.0

func (tm *TestMonitor) Start(testName string)

Start begins observation for a test. It records the current time so that logs from long-running pods (like the operator controller) are only collected from this point forward.

func (*TestMonitor) Stop added in v1.5.0

func (tm *TestMonitor) Stop()

Stop halts all observation goroutines, waits for them to finish, and takes a final resource snapshot (if snapshots are enabled).

type UserRequest

type UserRequest struct {
	Command string `json:"command"`
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL