Documentation ¶
Overview ¶
Package setup provides internal testing utilities for configuring and managing Grove operator installations during e2e tests. This package is not intended for production use and its API may change without notice.
Functions are exported to allow access from e2e test packages (e.g., operator/e2e/tests) which need to modify Grove configuration during test scenarios.
Index ¶
- Constants
- func CreateDefaultKaiQueues(ctx context.Context, config *HelmInstallConfig) error
- func GetBlockForNodeIndex(idx int) string
- func GetGroveChartDir() (string, error)
- func GetKubeconfig(ctx context.Context, clusterName string) (*clientcmdapi.Config, error)
- func GetRackForNodeIndex(idx int) string
- func GetWorkerNodeLabelSelector() string
- func GetZoneForNodeIndex(idx int) string
- func InstallCoreComponents(ctx context.Context, restConfig *rest.Config, kaiConfig *HelmInstallConfig, ...) error
- func InstallHelmChart(config *HelmInstallConfig) (*release.Release, error)
- func InstallWithSkaffold(ctx context.Context, config *SkaffoldInstallConfig) (func(), error)
- func SetupCompleteK3DCluster(ctx context.Context, cfg ClusterConfig, skaffoldYAMLPath string, ...) (*rest.Config, func(), error)
- func SetupK3DCluster(ctx context.Context, cfg ClusterConfig, logger *utils.Logger) (*rest.Config, func(), error)
- func SetupRegistryTestImages(registryPort string, images []string) error
- func StartNodeMonitoring(ctx context.Context, clientset *kubernetes.Clientset, logger *utils.Logger) func()
- func UninstallHelmChart(config *HelmInstallConfig) error
- func UpdateGroveConfiguration(ctx context.Context, restConfig *rest.Config, chartDir string, ...) error
- func UpgradeHelmChart(config *HelmInstallConfig) (*release.Release, error)
- func WaitForKaiCRDs(ctx context.Context, config *HelmInstallConfig) error
- type ClusterConfig
- type GroveConfig
- type HelmInstallConfig
- type NodeLabel
- type NodeTaint
- type SharedClusterManager
- func (scm *SharedClusterManager) CleanupWorkloads(ctx context.Context) error
- func (scm *SharedClusterManager) GetCleanupError() string
- func (scm *SharedClusterManager) GetClients() (*kubernetes.Clientset, *rest.Config, dynamic.Interface)
- func (scm *SharedClusterManager) GetRegistryPort() string
- func (scm *SharedClusterManager) GetWorkerNodes() []string
- func (scm *SharedClusterManager) HasCleanupFailed() bool
- func (scm *SharedClusterManager) IsSetup() bool
- func (scm *SharedClusterManager) MarkCleanupFailed(err error)
- func (scm *SharedClusterManager) PrepareForTest(ctx context.Context, requiredWorkerNodes int) error
- func (scm *SharedClusterManager) Setup(ctx context.Context, testImages []string) error
- func (scm *SharedClusterManager) Teardown()
- type SkaffoldInstallConfig
- type WebhooksConfig
Constants ¶
const (
	// OperatorNamespace is the namespace where the Grove operator is deployed for E2E tests.
	// This is used during installation (via Skaffold) and for finding operator pods during diagnostics.
	OperatorNamespace = "grove-system"

	// OperatorDeploymentName is the name of the operator deployment (also the Helm release name).
	// This is used to find operator pods for log collection during test failures.
	OperatorDeploymentName = "grove-operator"

	// DefaultWebhookPort is the default port for the webhook server.
	// NOTE: If you change this, also update config.server.webhooks.port in operator/charts/values.yaml
	DefaultWebhookPort = 9443

	// DefaultWebhookServerCertDir is the default directory for webhook certificates.
	// NOTE: If you change this, also update config.server.webhooks.serverCertDir in operator/charts/values.yaml
	DefaultWebhookServerCertDir = "/etc/grove-operator/webhook-certs"
)
const (
	// WorkerNodeLabelKey is the label key used to identify worker nodes in e2e tests.
	// This can be changed if infrastructure changes.
	WorkerNodeLabelKey = "node_role.e2e.grove.nvidia.com"

	// WorkerNodeLabelValue is the label value for worker node identification in e2e tests.
	WorkerNodeLabelValue = "agent"

	// TopologyLabelZone is the Kubernetes label key for the zone topology domain.
	TopologyLabelZone = "kubernetes.io/zone"

	// TopologyLabelBlock is the Kubernetes label key for the block topology domain.
	TopologyLabelBlock = "kubernetes.io/block"

	// TopologyLabelRack is the Kubernetes label key for the rack topology domain.
	TopologyLabelRack = "kubernetes.io/rack"

	// TopologyLabelHostname is the Kubernetes label key for the hostname topology domain.
	TopologyLabelHostname = "kubernetes.io/hostname"

	// NodesPerZone is the number of nodes per zone.
	NodesPerZone = 28

	// NodesPerBlock is the number of nodes per block (28 nodes / 2 blocks).
	NodesPerBlock = 14

	// NodesPerRack is the number of nodes per rack (28 nodes / 4 racks).
	NodesPerRack = 7
)
Variables ¶
This section is empty.
Functions ¶
func CreateDefaultKaiQueues ¶
func CreateDefaultKaiQueues(ctx context.Context, config *HelmInstallConfig) error
CreateDefaultKaiQueues creates the default Kai queues by applying their YAML manifests through the Kubernetes client.
func GetBlockForNodeIndex ¶
func GetBlockForNodeIndex(idx int) string
GetBlockForNodeIndex returns the block label for a given node index. Both the index parameter and the returned block number are 0-based; e.g., nodes 0-13 → block-0, nodes 14-27 → block-1.
func GetGroveChartDir ¶
func GetGroveChartDir() (string, error)
GetGroveChartDir returns the absolute path to the Grove Helm chart directory. It uses runtime.Caller to find the path relative to this source file. This function is exported for use by callers of UpdateGroveConfiguration.
func GetKubeconfig ¶
func GetKubeconfig(ctx context.Context, clusterName string) (*clientcmdapi.Config, error)
GetKubeconfig fetches and returns the kubeconfig for a k3d cluster.
func GetRackForNodeIndex ¶
func GetRackForNodeIndex(idx int) string
GetRackForNodeIndex returns the rack label for a given node index. Both the index parameter and the returned rack number are 0-based; e.g., nodes 0-6 → rack-0, nodes 7-13 → rack-1, etc.
func GetWorkerNodeLabelSelector ¶
func GetWorkerNodeLabelSelector() string
GetWorkerNodeLabelSelector returns the label selector for worker nodes in e2e tests. Returns a formatted string "key=value" for use with Kubernetes label selectors.
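For illustration, a minimal sketch of listing the e2e worker nodes with this selector; the package name and setup import path are assumptions, and the clientset is obtained elsewhere (e.g., from SharedClusterManager.GetClients):

package e2etests // illustrative package name

import (
	"context"
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"

	"github.com/NVIDIA/grove/operator/e2e/setup" // assumed import path
)

// listWorkerNodes prints the names of all nodes carrying the e2e worker label.
func listWorkerNodes(ctx context.Context, clientset *kubernetes.Clientset) error {
	nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{
		// Resolves to "node_role.e2e.grove.nvidia.com=agent".
		LabelSelector: setup.GetWorkerNodeLabelSelector(),
	})
	if err != nil {
		return fmt.Errorf("listing worker nodes: %w", err)
	}
	for _, node := range nodes.Items {
		fmt.Println(node.Name)
	}
	return nil
}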
func GetZoneForNodeIndex ¶
func GetZoneForNodeIndex(idx int) string
GetZoneForNodeIndex returns the zone label for a given node index. Both the index parameter and the returned zone number are 0-based; e.g., nodes 0-27 → zone-0, nodes 28-55 → zone-1, etc.
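A sketch of combining the three topology helpers with the topology label constants; the package name and import path are assumptions, and the label values shown follow the zone-N/block-N/rack-N form used in the doc comments above:

package e2etests // illustrative package name

import (
	"fmt"

	"github.com/NVIDIA/grove/operator/e2e/setup" // assumed import path
)

// printTopologyForNode prints the zone/block/rack labels a worker node at the
// given index would receive (28 nodes per zone, 14 per block, 7 per rack).
func printTopologyForNode(idx int) {
	fmt.Printf("node %d: %s=%s %s=%s %s=%s\n", idx,
		setup.TopologyLabelZone, setup.GetZoneForNodeIndex(idx),
		setup.TopologyLabelBlock, setup.GetBlockForNodeIndex(idx),
		setup.TopologyLabelRack, setup.GetRackForNodeIndex(idx),
	)
	// For idx 20 this yields zone-0, block-1, rack-2 (all 0-based),
	// since 20/28 = 0, 20/14 = 1, and 20/7 = 2.
}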
func InstallCoreComponents ¶
func InstallCoreComponents(ctx context.Context, restConfig *rest.Config, kaiConfig *HelmInstallConfig, skaffoldYAMLPath string, registryPort string, logger *utils.Logger) error
InstallCoreComponents installs the core components (Grove via Skaffold and Kai Scheduler via Helm)
func InstallHelmChart ¶
func InstallHelmChart(config *HelmInstallConfig) (*release.Release, error)
InstallHelmChart installs a Helm chart with the given configuration.
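A hedged sketch of a call site; the package name, import path, chart reference, version, repository URL, and values are placeholders, not what the e2e suite actually uses:

package e2etests // illustrative package name

import (
	"time"

	"k8s.io/client-go/rest"

	"github.com/NVIDIA/grove/operator/e2e/setup" // assumed import path
)

// installSchedulerChart installs a chart and waits for its resources to be ready.
func installSchedulerChart(restConfig *rest.Config) error {
	cfg := &setup.HelmInstallConfig{
		RestConfig:      restConfig,
		ReleaseName:     "kai-scheduler",
		ChartRef:        "kai-scheduler",              // placeholder chart reference
		ChartVersion:    "0.0.0",                      // placeholder version
		RepoURL:         "https://example.com/charts", // placeholder repository URL
		Namespace:       "kai-scheduler",
		CreateNamespace: true,
		Wait:            true,
		Timeout:         10 * time.Minute,
		Values: map[string]interface{}{
			"someKey": "someValue", // placeholder values
		},
	}
	_, err := setup.InstallHelmChart(cfg)
	return err
}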
func InstallWithSkaffold ¶
func InstallWithSkaffold(ctx context.Context, config *SkaffoldInstallConfig) (func(), error)
InstallWithSkaffold builds and deploys using the Skaffold CLI. It runs in two phases to account for the push registry being different from the pull registry for k3d. Note: we use the Skaffold CLI instead of the Go libraries because there were dependency conflicts between Grove and Skaffold at the time of implementation.
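A minimal sketch of a call site; the package name, import path, skaffold.yaml path, registry endpoints, and environment values are placeholders:

package e2etests // illustrative package name

import (
	"context"

	"k8s.io/client-go/rest"

	"github.com/NVIDIA/grove/operator/e2e/setup" // assumed import path
)

// deployGroveWithSkaffold builds and deploys Grove via the Skaffold CLI.
// The returned function tears the deployment down and should usually be deferred.
func deployGroveWithSkaffold(ctx context.Context, restConfig *rest.Config) (func(), error) {
	cfg := &setup.SkaffoldInstallConfig{
		SkaffoldYAMLPath: "../../skaffold.yaml",   // placeholder path
		RestConfig:       restConfig,
		PushRepo:         "localhost:5001",        // host-side registry endpoint
		PullRepo:         "registry:5001",         // in-cluster registry endpoint
		Namespace:        setup.OperatorNamespace, // "grove-system"
		Env: map[string]string{
			"VERSION": "e2e", // placeholder; skaffold.yaml defines what it needs
		},
	}
	return setup.InstallWithSkaffold(ctx, cfg)
}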
func SetupCompleteK3DCluster ¶
func SetupCompleteK3DCluster(ctx context.Context, cfg ClusterConfig, skaffoldYAMLPath string, logger *utils.Logger) (*rest.Config, func(), error)
SetupCompleteK3DCluster creates a complete k3d cluster with Grove, Kai Scheduler, and NVIDIA GPU Operator
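A sketch of end-to-end usage; the package name, import paths, and skaffold.yaml path are assumptions:

package e2etests // illustrative package name

import (
	"context"

	"k8s.io/client-go/rest"

	"github.com/NVIDIA/grove/operator/e2e/setup" // assumed import path
	"github.com/NVIDIA/grove/operator/e2e/utils" // assumed import path
)

// runOnCompleteCluster provisions a complete cluster (Grove, Kai Scheduler, and
// NVIDIA GPU Operator) with the default e2e configuration, runs fn against it,
// and tears the cluster down afterwards.
func runOnCompleteCluster(ctx context.Context, logger *utils.Logger, fn func(restConfig *rest.Config) error) error {
	skaffoldYAML := "../../skaffold.yaml" // placeholder path to skaffold.yaml
	restConfig, cleanup, err := setup.SetupCompleteK3DCluster(ctx, setup.DefaultClusterConfig(), skaffoldYAML, logger)
	if err != nil {
		return err
	}
	defer cleanup()
	return fn(restConfig)
}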
func SetupK3DCluster ¶
func SetupK3DCluster(ctx context.Context, cfg ClusterConfig, logger *utils.Logger) (*rest.Config, func(), error)
SetupK3DCluster creates a k3d cluster and returns a REST config
func SetupRegistryTestImages ¶
func SetupRegistryTestImages(registryPort string, images []string) error
SetupRegistryTestImages pulls images and pushes them to the local k3d registry.
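A short sketch; the package name, import path, and image names are placeholders:

package e2etests // illustrative package name

import (
	"github.com/NVIDIA/grove/operator/e2e/setup" // assumed import path
)

// pushTestImages mirrors the listed images into the local k3d registry so the
// cluster can pull them without reaching external registries.
func pushTestImages(registryPort string) error {
	return setup.SetupRegistryTestImages(registryPort, []string{
		"nginx:alpine",   // placeholder image
		"busybox:latest", // placeholder image
	})
}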
func StartNodeMonitoring ¶
func StartNodeMonitoring(ctx context.Context, clientset *kubernetes.Clientset, logger *utils.Logger) func()
StartNodeMonitoring starts a goroutine that monitors k3d cluster nodes for not ready status and automatically replaces them by deleting the node and restarting the corresponding Docker container. Returns a cleanup function that should be deferred to stop the monitoring.
Background: There is an intermittent issue where nodes go NotReady, which occasionally causes tests to fail. This is an issue with either k3d or Docker on macOS. Restarting the affected node is a simple workaround that has proven reliable.
The monitoring process:

1. Checks for nodes that are not in Ready status every 5 seconds.
2. Skips cordoned nodes (intentionally unschedulable for maintenance).
3. Deletes the not-ready node from Kubernetes.
4. Finds and restarts the corresponding Docker container (node names match container names exactly).
5. The restarted container will rejoin the cluster as a new node.
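A usage sketch; the package name and import paths are assumptions, and the returned stop function is deferred as the doc comment recommends:

package e2etests // illustrative package name

import (
	"context"

	"k8s.io/client-go/kubernetes"

	"github.com/NVIDIA/grove/operator/e2e/setup" // assumed import path
	"github.com/NVIDIA/grove/operator/e2e/utils" // assumed import path
)

// withNodeMonitoring runs fn while the not-ready-node watchdog is active and
// stops the watchdog once fn returns.
func withNodeMonitoring(ctx context.Context, clientset *kubernetes.Clientset, logger *utils.Logger, fn func() error) error {
	stop := setup.StartNodeMonitoring(ctx, clientset, logger)
	defer stop() // stop the monitoring goroutine
	return fn()
}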
func UninstallHelmChart ¶
func UninstallHelmChart(config *HelmInstallConfig) error
UninstallHelmChart uninstalls a Helm release.
func UpdateGroveConfiguration ¶
func UpdateGroveConfiguration(ctx context.Context, restConfig *rest.Config, chartDir string, config *GroveConfig, logger *utils.Logger) error
UpdateGroveConfiguration updates the Grove operator configuration.
This uses Helm upgrade (rather than Skaffold) because:

1. Grove is initially installed via Skaffold, which uses Helm under the hood.
2. For config-only changes (like switching cert modes), rebuilding images is unnecessary.
3. Helm upgrade with ReuseValues preserves the image configuration that Skaffold set.
The chartDir parameter should be the path to the Grove Helm chart directory. Use GetGroveChartDir() to obtain the default chart directory path.
This approach avoids wasteful rebuilds while staying compatible with the Skaffold installation.
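As a sketch, switching the webhooks to manually provisioned certificates could look like the following; the package name, import paths, secret name, and annotation value are assumptions:

package e2etests // illustrative package name

import (
	"context"

	"k8s.io/client-go/rest"

	configv1alpha1 "github.com/NVIDIA/grove/operator/api/config/v1alpha1" // assumed import path
	"github.com/NVIDIA/grove/operator/e2e/setup"                          // assumed import path
	"github.com/NVIDIA/grove/operator/e2e/utils"                          // assumed import path
)

// switchToManualCerts flips the webhook cert mode to manual (e.g., cert-manager
// managed) via a Helm upgrade of the already-installed Grove release.
func switchToManualCerts(ctx context.Context, restConfig *rest.Config, logger *utils.Logger) error {
	chartDir, err := setup.GetGroveChartDir()
	if err != nil {
		return err
	}
	cfg := &setup.GroveConfig{
		InstallCRDs: true,
		Webhooks: setup.WebhooksConfig{
			CertProvisionMode: configv1alpha1.CertProvisionModeManual,
			SecretName:        "grove-webhook-tls", // placeholder secret name
			Annotations: map[string]string{
				"cert-manager.io/inject-ca-from": "grove-system/grove-webhook-cert", // placeholder value
			},
		},
	}
	return setup.UpdateGroveConfiguration(ctx, restConfig, chartDir, cfg, logger)
}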
func UpgradeHelmChart ¶
func UpgradeHelmChart(config *HelmInstallConfig) (*release.Release, error)
UpgradeHelmChart upgrades a Helm chart with the given configuration.
func WaitForKaiCRDs ¶
func WaitForKaiCRDs(ctx context.Context, config *HelmInstallConfig) error
WaitForKaiCRDs waits for the Queue CRD from scheduling.run.ai/v2 to be available
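A sketch of the usual ordering after installing the Kai Scheduler chart: wait for the Queue CRD, then create the default queues. The package name and import path are assumptions:

package e2etests // illustrative package name

import (
	"context"

	"github.com/NVIDIA/grove/operator/e2e/setup" // assumed import path
)

// bootstrapKaiQueues waits for the Kai Queue CRD to be served and then creates
// the default queues. kaiConfig is the same HelmInstallConfig used to install
// the Kai Scheduler chart.
func bootstrapKaiQueues(ctx context.Context, kaiConfig *setup.HelmInstallConfig) error {
	if err := setup.WaitForKaiCRDs(ctx, kaiConfig); err != nil {
		return err
	}
	return setup.CreateDefaultKaiQueues(ctx, kaiConfig)
}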
Types ¶
type ClusterConfig ¶
type ClusterConfig struct {
Name string // Name of the k3d cluster
ControlPlaneNodes int // Number of control plane nodes (k3s calls these server nodes)
WorkerNodes int // Number of worker nodes (called agents in k3s terminology)
Image string // Docker image to use for k3d cluster (e.g., "rancher/k3s:v1.28.8-k3s1")
HostPort string // Port on host to expose Kubernetes API (e.g., "6550")
LoadBalancerPort string // Load balancer port mapping in format "hostPort:containerPort" (e.g., "8080:80")
NodeLabels []NodeLabel // Kubernetes labels to apply with specific node filters
WorkerNodeTaints []NodeTaint // Taints to apply to worker nodes
WorkerMemory string // Memory allocation for worker/agent nodes (e.g., "150m")
EnableRegistry bool // Enable built-in Docker registry
RegistryPort string // Port for the Docker registry (e.g., "5001")
}
ClusterConfig holds configuration for creating a k3d cluster
func DefaultClusterConfig ¶
func DefaultClusterConfig() ClusterConfig
DefaultClusterConfig returns the default cluster configuration used by e2e tests. This includes all the node labels and taints required for Grove e2e testing. The setup-debug-cluster tool and SharedClusterManager both use this as their base config.
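A sketch of customizing the default config before creating a bare cluster; the package name, import paths, and overrides are placeholders:

package e2etests // illustrative package name

import (
	"context"

	"k8s.io/client-go/rest"

	"github.com/NVIDIA/grove/operator/e2e/setup" // assumed import path
	"github.com/NVIDIA/grove/operator/e2e/utils" // assumed import path
)

// createDebugCluster spins up a k3d cluster from the default e2e configuration
// with a couple of overrides, returning the REST config and a cleanup function.
func createDebugCluster(ctx context.Context, logger *utils.Logger) (*rest.Config, func(), error) {
	cfg := setup.DefaultClusterConfig()
	cfg.Name = "grove-debug" // placeholder cluster name
	cfg.WorkerNodes = 4      // placeholder worker count

	return setup.SetupK3DCluster(ctx, cfg, logger)
}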
type GroveConfig ¶
type GroveConfig struct {
// InstallCRDs controls whether CRDs should be installed/updated.
InstallCRDs bool
// Webhooks contains webhook-specific configuration.
Webhooks WebhooksConfig
}
GroveConfig holds typed configuration options for updating the Grove operator. This struct provides a user-friendly interface that gets translated to Helm values internally.
type HelmInstallConfig ¶
type HelmInstallConfig struct {
// RestConfig is the Kubernetes REST configuration. If nil, uses default kubeconfig.
RestConfig *rest.Config
// ReleaseName is the name of the Helm release. Required unless GenerateName is true.
ReleaseName string
// ChartRef is the chart reference (path, URL, or chart name). Required.
ChartRef string
// ChartVersion is the version of the chart to install. Required.
ChartVersion string
// Namespace is the Kubernetes namespace to install into. Required.
Namespace string
// CreateNamespace creates the namespace if it doesn't exist.
CreateNamespace bool
// Wait blocks until all resources are ready.
Wait bool
// GenerateName generates a random release name with ReleaseName as prefix.
GenerateName bool
// Values are the chart values to use for the installation.
Values map[string]interface{}
// HelmLoggerFunc is called for Helm operation logging.
HelmLoggerFunc func(format string, v ...interface{})
// Logger is the full logger for component operations.
Logger *utils.Logger
// RepoURL is the base URL of the Helm repository (optional, for direct chart downloads).
RepoURL string
// ReuseValues reuses the last release's values and merges in the new values.
ReuseValues bool
// Timeout is the time to wait for Kubernetes operations (default: 5 minutes).
Timeout time.Duration
}
HelmInstallConfig holds configuration for Helm chart installations.
func (*HelmInstallConfig) Validate ¶
func (c *HelmInstallConfig) Validate() error
Validate validates and sets defaults for the configuration.
type NodeLabel ¶
type NodeLabel struct {
Key string // Label key
Value string // Label value
// k3s refers to worker nodes as agent nodes
NodeFilters []string // Node filters (e.g., "server:*", "agent:*", "server:0", "agent:1")
}
NodeLabel represents a Kubernetes node label with its target node filters
type SharedClusterManager ¶
type SharedClusterManager struct {
// contains filtered or unexported fields
}
SharedClusterManager manages a shared (singleton) k3d cluster for E2E tests
func SharedCluster ¶
func SharedCluster(logger *utils.Logger) *SharedClusterManager
SharedCluster returns the singleton shared cluster manager
func (*SharedClusterManager) CleanupWorkloads ¶
func (scm *SharedClusterManager) CleanupWorkloads(ctx context.Context) error
CleanupWorkloads removes all test workloads from the cluster
func (*SharedClusterManager) GetCleanupError ¶
func (scm *SharedClusterManager) GetCleanupError() string
GetCleanupError returns the error message from the failed cleanup, or empty string if no failure.
func (*SharedClusterManager) GetClients ¶
func (scm *SharedClusterManager) GetClients() (*kubernetes.Clientset, *rest.Config, dynamic.Interface)
GetClients returns the kubernetes clients for tests to use
func (*SharedClusterManager) GetRegistryPort ¶
func (scm *SharedClusterManager) GetRegistryPort() string
GetRegistryPort returns the registry port for test image setup
func (*SharedClusterManager) GetWorkerNodes ¶
func (scm *SharedClusterManager) GetWorkerNodes() []string
GetWorkerNodes returns the list of worker node names
func (*SharedClusterManager) HasCleanupFailed ¶
func (scm *SharedClusterManager) HasCleanupFailed() bool
HasCleanupFailed returns true if a previous cleanup operation failed.
func (*SharedClusterManager) IsSetup ¶
func (scm *SharedClusterManager) IsSetup() bool
IsSetup returns whether the shared cluster has been set up
func (*SharedClusterManager) MarkCleanupFailed ¶
func (scm *SharedClusterManager) MarkCleanupFailed(err error)
MarkCleanupFailed marks that a cleanup operation has failed. This causes all subsequent tests to fail immediately when they try to prepare the cluster.
func (*SharedClusterManager) PrepareForTest ¶
func (scm *SharedClusterManager) PrepareForTest(ctx context.Context, requiredWorkerNodes int) error
PrepareForTest prepares the cluster for a specific test by cordoning the appropriate nodes. It ensures exactly `requiredWorkerNodes` nodes are schedulable by cordoning excess nodes. Returns an error if a previous cleanup operation failed, preventing potentially corrupted test state.
func (*SharedClusterManager) Setup ¶
func (scm *SharedClusterManager) Setup(ctx context.Context, testImages []string) error
Setup initializes the shared cluster with maximum required resources
func (*SharedClusterManager) Teardown ¶
func (scm *SharedClusterManager) Teardown()
Teardown cleans up the shared cluster
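A sketch of the overall SharedClusterManager lifecycle; the package name, import paths, image list, and worker-node count are placeholders:

package e2etests // illustrative package name

import (
	"context"

	"github.com/NVIDIA/grove/operator/e2e/setup" // assumed import path
	"github.com/NVIDIA/grove/operator/e2e/utils" // assumed import path
)

// sharedClusterLifecycle sets up the singleton cluster once, prepares and
// cleans it around a single test, and tears it down at the end.
func sharedClusterLifecycle(ctx context.Context, logger *utils.Logger, runTest func() error) error {
	scm := setup.SharedCluster(logger)
	if err := scm.Setup(ctx, []string{"nginx:alpine"}); err != nil { // placeholder test images
		return err
	}
	defer scm.Teardown()

	// Per test: ensure exactly N worker nodes are schedulable, run the test,
	// then remove its workloads; a failed cleanup poisons subsequent tests.
	if err := scm.PrepareForTest(ctx, 3); err != nil { // placeholder node count
		return err
	}
	if err := runTest(); err != nil {
		return err
	}
	if err := scm.CleanupWorkloads(ctx); err != nil {
		scm.MarkCleanupFailed(err)
		return err
	}
	return nil
}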
type SkaffoldInstallConfig ¶
type SkaffoldInstallConfig struct {
// SkaffoldYAMLPath is the path to the skaffold.yaml file. Required.
SkaffoldYAMLPath string
// RestConfig is the Kubernetes REST configuration. Required.
RestConfig *rest.Config
// Profiles are the Skaffold profiles to activate (optional).
Profiles []string
// PushRepo is the repository to push images to (e.g., "localhost:5001"). Required.
PushRepo string
// PullRepo is the repository to pull images from (e.g., "registry:5001"). Required.
PullRepo string
// Namespace is the target namespace for deployment (optional, defaults to "default").
Namespace string
// Env are environment variables required by skaffold.yaml (e.g., VERSION, LD_FLAGS).
Env map[string]string
// Logger is the logger for operations (optional, will use default if nil).
Logger *utils.Logger
}
SkaffoldInstallConfig holds configuration for Skaffold installations.
func (*SkaffoldInstallConfig) Validate ¶
func (c *SkaffoldInstallConfig) Validate() error
Validate validates the configuration.
type WebhooksConfig ¶
type WebhooksConfig struct {
// CertProvisionMode controls how webhook certificates are provisioned.
// Use configv1alpha1.CertProvisionModeAuto for automatic provisioning,
// configv1alpha1.CertProvisionModeManual for external cert management.
CertProvisionMode configv1alpha1.CertProvisionMode
// SecretName is the name of the Kubernetes secret containing TLS certificates.
SecretName string
// Annotations to apply to webhook configurations (e.g., for cert-manager CA injection).
// These annotations are applied to all webhook configurations.
Annotations map[string]string
}
WebhooksConfig holds configuration for Grove's webhook server.