Documentation
¶
Overview ¶
Package cluster provides types and operations for sind cluster management.
Index ¶
- Constants
- func BuildRunArgs(cfg RunConfig) []string
- func BuildSSHArgs(sshContainer docker.ContainerName, node, cluster string, isTTY bool, ...) []string
- func ComposeProject(realm, clusterName string) string
- func ContainerLogArgs(realm, node, cluster string, follow bool) []string
- func ContainerName(realm, cluster, shortName string) docker.ContainerName
- func ContainerPrefix(realm, cluster string) string
- func CreateClusterNetwork(ctx context.Context, client *docker.Client, realm, clusterName string) error
- func CreateClusterNodes(ctx context.Context, client *docker.Client, meshMgr *mesh.Manager, ...) error
- func CreateClusterVolumes(ctx context.Context, client *docker.Client, realm, clusterName string) error
- func CreateNode(ctx context.Context, client *docker.Client, meshMgr *mesh.Manager, ...) (docker.ContainerID, error)
- func DNSName(shortName, cluster string) string
- func DNSSearchDomain(cluster string) string
- func Delete(ctx context.Context, client *docker.Client, meshMgr *mesh.Manager, ...) error
- func DeleteContainers(ctx context.Context, client *docker.Client, ...) error
- func DeleteNetwork(ctx context.Context, client *docker.Client, name docker.NetworkName) error
- func DeleteVolumes(ctx context.Context, client *docker.Client, volumes []docker.VolumeName) error
- func DeregisterMesh(ctx context.Context, meshMgr *mesh.Manager, clusterName string, ...) error
- func EnableSlurmServices(ctx context.Context, client *docker.Client, configs []RunConfig) error
- func EnterTarget(ctx context.Context, client *docker.Client, realm, clusterName string) (string, error)
- func ExecArgs(ctx context.Context, client *docker.Client, realm string, ...) ([]string, error)
- func GetMungeKey(ctx context.Context, client *docker.Client, realm, clusterName string) ([]byte, error)
- func HasOtherClusters(ctx context.Context, client *docker.Client, realm, clusterName string) (bool, error)
- func NetworkName(realm, cluster string) docker.NetworkName
- func NextComputeIndex(ctx context.Context, client *docker.Client, realm, clusterName string) (int, error)
- func NodeLabels(realm, clusterName, role, slurmVersion string, containerNumber int) map[string]string
- func NodeShortNames(nodes []config.Node) []string
- func PowerCut(ctx context.Context, client *docker.Client, realm, clusterName string, ...) error
- func PowerCycle(ctx context.Context, client *docker.Client, realm, clusterName string, ...) error
- func PowerFreeze(ctx context.Context, client *docker.Client, realm, clusterName string, ...) error
- func PowerOn(ctx context.Context, client *docker.Client, realm, clusterName string, ...) error
- func PowerReboot(ctx context.Context, client *docker.Client, realm, clusterName string, ...) error
- func PowerShutdown(ctx context.Context, client *docker.Client, realm, clusterName string, ...) error
- func PowerUnfreeze(ctx context.Context, client *docker.Client, realm, clusterName string, ...) error
- func PreflightCheck(ctx context.Context, client *docker.Client, realm string, cfg *config.Cluster) error
- func ServiceLogArgs(realm, node, cluster, service string, follow bool) []string
- func ValidateWorkerAdd(ctx context.Context, client *docker.Client, realm string, ...) error
- func VolumeName(realm, cluster, volumeType string) docker.VolumeName
- func WorkerRemove(ctx context.Context, client *docker.Client, meshMgr *mesh.Manager, ...) error
- func WriteClusterConfig(ctx context.Context, client *docker.Client, realm string, cfg *config.Cluster, ...) error
- func WriteMungeKey(ctx context.Context, client *docker.Client, realm, clusterName string, ...) error
- type Cluster
- type ClusterStatus
- type ClusterSummary
- type NetworkHealth
- type NetworkSummary
- type Node
- type NodeHealth
- type NodeStatus
- type NodeSummary
- type Resources
- type RunConfig
- type Status
- type VolumeHealth
- type VolumeSummary
- type WorkerAddOptions
Constants ¶
const ( LabelRealm = "sind.realm" LabelCluster = "sind.cluster" LabelRole = "sind.role" LabelSlurmVersion = "sind.slurm.version" )
Label keys used on sind containers.
const ( LabelComposeProject = "com.docker.compose.project" LabelComposeService = "com.docker.compose.service" LabelComposeContainerNumber = "com.docker.compose.container-number" LabelComposeOneoff = "com.docker.compose.oneoff" LabelComposeConfigHash = "com.docker.compose.config-hash" LabelComposeConfigFiles = "com.docker.compose.project.config_files" LabelComposeNetwork = "com.docker.compose.network" LabelComposeVolume = "com.docker.compose.volume" )
Docker Compose compatibility labels.
const DNSSuffix = "sind.local"
DNSSuffix is the base domain for all sind DNS names.
Variables ¶
This section is empty.
Functions ¶
func BuildRunArgs ¶
BuildRunArgs returns the docker arguments for creating a node container. The returned slice does not include "create" or "run -d" — the caller passes these args to Client.CreateContainer or Client.RunContainer.
func BuildSSHArgs ¶
func BuildSSHArgs(sshContainer docker.ContainerName, node, cluster string, isTTY bool, sshOptions, command []string) []string
BuildSSHArgs builds the docker CLI arguments for running SSH through the sind-ssh relay container. The returned args are suitable for passing to docker directly (e.g. "docker exec -i -t sind-ssh ssh ...").
node is the short name (e.g. "worker-0"), cluster is the cluster name, isTTY controls whether -t is added to docker exec, sshOptions are passed through to SSH before the target, and command is the optional remote command.
func ComposeProject ¶
ComposeProject returns the Docker Compose project name for a cluster.
func ContainerLogArgs ¶
ContainerLogArgs builds docker CLI arguments for streaming container logs. node is the short name (e.g. "controller", "worker-0"), cluster is the cluster name, and follow controls whether --follow is added.
func ContainerName ¶
func ContainerName(realm, cluster, shortName string) docker.ContainerName
ContainerName returns the Docker container name for a node. shortName is the node's hostname, e.g. "controller", "submitter", "worker-0".
func ContainerPrefix ¶
ContainerPrefix returns the container name prefix for a cluster, used to extract short names from full container names.
func CreateClusterNetwork ¶
func CreateClusterNetwork(ctx context.Context, client *docker.Client, realm, clusterName string) error
CreateClusterNetwork creates the cluster-specific Docker bridge network.
func CreateClusterNodes ¶
func CreateClusterNodes(ctx context.Context, client *docker.Client, meshMgr *mesh.Manager, configs []RunConfig) error
CreateClusterNodes creates all node containers for the cluster. Each node is created, connected to the mesh network, and started.
func CreateClusterVolumes ¶
func CreateClusterVolumes(ctx context.Context, client *docker.Client, realm, clusterName string) error
CreateClusterVolumes creates the config, munge, and data volumes for a cluster.
func CreateNode ¶
func CreateNode(ctx context.Context, client *docker.Client, meshMgr *mesh.Manager, cfg RunConfig) (docker.ContainerID, error)
CreateNode creates a node container, connects it to the mesh network, and starts it. Returns the container ID.
func DNSName ¶
DNSName returns the fully qualified DNS name for a node. shortName is the node's hostname, e.g. "controller", "worker-0".
func DNSSearchDomain ¶
DNSSearchDomain returns the DNS search domain for a cluster.
func Delete ¶
func Delete(ctx context.Context, client *docker.Client, meshMgr *mesh.Manager, clusterName string) error
Delete orchestrates the full cluster deletion flow.
Deleting a non-existent cluster is not an error. The function handles partial clusters (e.g., from a failed creation) by removing whatever resources exist.
    ListClusterResources
    │
    DeregisterMesh        DNS + known_hosts per node
    │
    DeleteContainers      stop + rm per container
    │
    DeleteNetwork         rm cluster network
    │
    DeleteVolumes         rm cluster volumes
    │
    HasOtherClusters?
      yes → done
      no  → CleanupMesh
func DeleteContainers ¶
func DeleteContainers(ctx context.Context, client *docker.Client, containers []docker.ContainerListEntry) error
DeleteContainers stops and removes the given containers. Stop errors are ignored (the container may already be stopped), but remove errors are fatal.
func DeleteNetwork ¶
DeleteNetwork removes the cluster network.
func DeleteVolumes ¶
DeleteVolumes removes the given cluster volumes.
func DeregisterMesh ¶
func DeregisterMesh(ctx context.Context, meshMgr *mesh.Manager, clusterName string, containers []docker.ContainerListEntry) error
DeregisterMesh removes DNS records and known_hosts entries for each container in the cluster. This is the inverse of registerMesh during cluster creation.
func EnableSlurmServices ¶
EnableSlurmServices enables the role-appropriate Slurm daemon on each node. Controller nodes get slurmctld; managed worker nodes get slurmd. Submitter and unmanaged worker nodes are skipped.
func EnterTarget ¶
func EnterTarget(ctx context.Context, client *docker.Client, realm, clusterName string) (string, error)
EnterTarget determines the target node for an interactive shell. Returns "submitter" if present in the cluster, otherwise "controller".
func ExecArgs ¶
func ExecArgs(ctx context.Context, client *docker.Client, realm string, sshContainer docker.ContainerName, clusterName string, command []string) ([]string, error)
ExecArgs builds docker CLI arguments for a one-shot command execution on the cluster's submitter (or controller). The returned args are suitable for passing to docker directly.
func GetMungeKey ¶
func GetMungeKey(ctx context.Context, client *docker.Client, realm, clusterName string) ([]byte, error)
GetMungeKey reads the munge key from one of the cluster's node containers. Any container in the cluster can be used since all mount the same munge volume.
func HasOtherClusters ¶
func HasOtherClusters(ctx context.Context, client *docker.Client, realm, clusterName string) (bool, error)
HasOtherClusters checks whether any sind cluster containers exist other than those belonging to the named cluster. This is used to decide whether to clean up mesh infrastructure after deleting a cluster.
func NetworkName ¶
func NetworkName(realm, cluster string) docker.NetworkName
NetworkName returns the Docker network name for a cluster.
func NextComputeIndex ¶
func NextComputeIndex(ctx context.Context, client *docker.Client, realm, clusterName string) (int, error)
NextComputeIndex determines the next worker node index by examining existing containers in the cluster. Returns max(existing indices) + 1, or 0 if no worker containers exist.
func NodeLabels ¶
func NodeLabels(realm, clusterName, role, slurmVersion string, containerNumber int) map[string]string
NodeLabels returns the standard labels for a node container. containerNumber is the 1-based instance number for compose compatibility. The slurm version label is omitted when slurmVersion is empty.
func NodeShortNames ¶
NodeShortNames returns the short hostname for each node defined in the config. Worker nodes are indexed sequentially across all worker groups, matching the indexing used in slurm.GenerateNodesConf.
func PowerCut ¶
func PowerCut(ctx context.Context, client *docker.Client, realm, clusterName string, shortNames []string) error
PowerCut immediately kills the specified nodes (docker kill).
func PowerCycle ¶
func PowerCycle(ctx context.Context, client *docker.Client, realm, clusterName string, shortNames []string) error
PowerCycle hard-restarts the specified nodes (docker kill + start).
func PowerFreeze ¶
func PowerFreeze(ctx context.Context, client *docker.Client, realm, clusterName string, shortNames []string) error
PowerFreeze suspends all processes in the specified nodes (docker pause). The containers remain running but are completely unresponsive.
func PowerOn ¶
func PowerOn(ctx context.Context, client *docker.Client, realm, clusterName string, shortNames []string) error
PowerOn starts the specified stopped nodes (docker start).
func PowerReboot ¶
func PowerReboot(ctx context.Context, client *docker.Client, realm, clusterName string, shortNames []string) error
PowerReboot gracefully restarts the specified nodes (docker stop + start).
func PowerShutdown ¶
func PowerShutdown(ctx context.Context, client *docker.Client, realm, clusterName string, shortNames []string) error
PowerShutdown gracefully stops the specified nodes (docker stop).
func PowerUnfreeze ¶
func PowerUnfreeze(ctx context.Context, client *docker.Client, realm, clusterName string, shortNames []string) error
PowerUnfreeze resumes the specified frozen nodes (docker unpause).
func PreflightCheck ¶
func PreflightCheck(ctx context.Context, client *docker.Client, realm string, cfg *config.Cluster) error
PreflightCheck verifies that no Docker resources conflict with the cluster that would be created from the given configuration. It checks for existing networks, volumes, and containers with matching names.
func ServiceLogArgs ¶
ServiceLogArgs builds docker CLI arguments for streaming service journal logs. node is the short name, cluster is the cluster name, service is the systemd unit name (e.g. "slurmctld", "slurmd"), and follow controls whether --follow is added.
func ValidateWorkerAdd ¶
func ValidateWorkerAdd(ctx context.Context, client *docker.Client, realm string, opts WorkerAddOptions) error
ValidateWorkerAdd checks prerequisites for adding workers to a cluster. For managed workers, it verifies that sind-nodes.conf exists on the controller (indicating sind-generated Slurm configuration is in use). Unmanaged workers bypass the sind-nodes.conf check.
func VolumeName ¶
func VolumeName(realm, cluster, volumeType string) docker.VolumeName
VolumeName returns the Docker volume name for a cluster resource. volumeType is one of: "config", "munge", "data".
func WorkerRemove ¶
func WorkerRemove(ctx context.Context, client *docker.Client, meshMgr *mesh.Manager, clusterName string, shortNames []string) error
WorkerRemove removes worker nodes from a cluster.
For managed nodes (those present in sind-nodes.conf), the flow is:
1. Update sind-nodes.conf to remove the node definitions
2. Reconfigure slurmctld
3. Deregister DNS + known_hosts
4. Stop + remove containers
For unmanaged nodes, only steps 3–4 are performed.
func WriteClusterConfig ¶
func WriteClusterConfig(ctx context.Context, client *docker.Client, realm string, cfg *config.Cluster, image string) error
WriteClusterConfig generates and writes slurm.conf, sind-nodes.conf, and cgroup.conf to the config volume. Uses a temporary container to access the volume.
Types ¶
type Cluster ¶
Cluster represents a live sind cluster as it exists in Docker. This is distinct from config.Cluster, which represents the configuration input.
func Create ¶
func Create(ctx context.Context, client *docker.Client, meshMgr *mesh.Manager, cfg *config.Cluster, readinessInterval time.Duration) (*Cluster, error)
Create orchestrates the full cluster creation flow.
The caller must ensure mesh infrastructure exists (via mesh.Manager.EnsureMesh) before calling Create. The context deadline controls the overall timeout; readinessInterval controls the polling interval for readiness probes.
    PreflightCheck
    │
    resolveInfra          DNS IP ║ SSH key ║ Slurm version
    │
    createResources       network ║ volumes → config ║ munge
    │
    createAllNodes        node₁ ║ node₂ ║ ... ║ nodeₙ
    │
    setupNodes            (wait + SSH + hostkey) per node
    │
    registerMesh          DNS records + known_hosts (serial)
    │
    enableSlurm           (enable + probe) per eligible node
    │
    *Cluster
type ClusterStatus ¶
type ClusterStatus struct {
Name string
Status Status
Nodes []*NodeStatus
Network *NetworkHealth
Volumes *VolumeHealth
}
ClusterStatus holds the full status of a sind cluster.
type ClusterSummary ¶
type ClusterSummary struct {
Name string
SlurmVersion string
Status Status
NodeCount int
Submitters int
Controllers int
Workers int
}
ClusterSummary holds summary information about a sind cluster.
func GetClusters ¶
func GetClusters(ctx context.Context, client *docker.Client, realm string) ([]*ClusterSummary, error)
GetClusters lists all sind clusters by querying Docker for containers with the sind.cluster label. Containers are grouped by cluster name.
type NetworkHealth ¶
type NetworkHealth struct {
Mesh bool // sind-mesh network exists
DNS bool // sind-dns container exists
Cluster bool // cluster network exists
MeshSubnet string // mesh network subnet
MeshGateway string // mesh network gateway
ClusterSubnet string // cluster network subnet
ClusterGateway string // cluster network gateway
}
NetworkHealth holds the health and IPAM details of cluster networking.
func GetNetworkHealth ¶
func GetNetworkHealth(ctx context.Context, client *docker.Client, realm, clusterName string) (*NetworkHealth, error)
GetNetworkHealth checks the health of mesh, DNS, and cluster networking.
type NetworkSummary ¶
NetworkSummary holds summary information about a sind network.
func GetNetworks ¶
func GetNetworks(ctx context.Context, client *docker.Client, realm string) ([]*NetworkSummary, error)
GetNetworks lists all sind-related Docker networks with IPAM details. This includes per-cluster networks (sind-<cluster>-net) and the mesh network (sind-mesh).
type Node ¶
type Node struct {
Name string // short name: "controller", "worker-0"
Role string // "controller", "submitter", "worker"
ContainerID docker.ContainerID // Docker container ID
IP string // container IP address
Status Status
}
Node represents a running node in a sind cluster.
func WorkerAdd ¶
func WorkerAdd(ctx context.Context, client *docker.Client, meshMgr *mesh.Manager, opts WorkerAddOptions, readinessInterval time.Duration) ([]*Node, error)
WorkerAdd adds worker nodes to an existing cluster.
For managed workers (default), the flow is:
1. Validate: controller exists, sind-nodes.conf present
2. Create worker container(s)
3. Wait for readiness, inject SSH keys, collect host keys
4. Register DNS + known_hosts
5. Update sind-nodes.conf with new node definitions
6. Reconfigure slurmctld
7. Enable slurmd on new nodes
For unmanaged workers (Unmanaged=true), steps 5–7 are skipped.
type NodeHealth ¶
type NodeHealth struct {
Container string // container state: "running", "exited", etc.
IP string // container IP address
Munge bool // munge service healthy
SSHD bool // sshd accepting connections
Services map[string]bool // role-specific services (e.g., "slurmctld", "slurmd")
}
NodeHealth holds the health status of a single node.
func GetNodeHealth ¶
func GetNodeHealth(ctx context.Context, client *docker.Client, containerName, role, realm, clusterName string) (*NodeHealth, error)
GetNodeHealth checks the health of a single node container. If the container is not running, the remaining checks are skipped and their results default to false. The role determines which Slurm services are checked. clusterName is used to select the cluster network IP.
type NodeStatus ¶
type NodeStatus struct {
Name string // DNS-style name: "controller.dev"
Role string // "controller", "submitter", "worker"
Health *NodeHealth
}
NodeStatus combines node identity with health information.
type NodeSummary ¶
type NodeSummary struct {
Name string // short name: "controller", "worker-0"
Role string // "controller", "submitter", "worker"
Status Status
}
NodeSummary holds summary information about a node in a sind cluster.
type Resources ¶
type Resources struct {
Containers []docker.ContainerListEntry
Network docker.NetworkName
NetworkExists bool
Volumes []docker.VolumeName
}
Resources holds the Docker resources belonging to a cluster.
func ListClusterResources ¶
func ListClusterResources(ctx context.Context, client *docker.Client, realm, clusterName string) (*Resources, error)
ListClusterResources discovers all Docker resources belonging to the named cluster. Containers are found by label filter; network and volumes are checked by name convention.
type RunConfig ¶
type RunConfig struct {
Realm string // realm name (e.g. "sind")
ClusterName string // cluster name
ShortName string // node hostname: "controller", "worker-0"
Role string // "controller", "submitter", "worker"
Image string // container image
CPUs int // CPU limit
Memory string // memory limit (e.g. "2g")
TmpSize string // /tmp tmpfs size (e.g. "1g")
SlurmVersion string // slurm version for labels (optional)
DNSIP string // mesh DNS container IP (optional)
DataHostPath string // host path for data volume (empty = use docker volume)
DataMountPath string // mount point for data (default: /data)
Managed bool // start slurmd and add to slurm.conf (worker only)
ContainerNumber int // 1-based compose container instance number
}
RunConfig holds the parameters needed to build docker run arguments for creating a node container.
type VolumeHealth ¶
type VolumeHealth struct {
Config bool // sind-<cluster>-config volume exists
Munge bool // sind-<cluster>-munge volume exists
Data bool // sind-<cluster>-data volume exists
}
VolumeHealth holds the existence status of cluster volumes.
func GetVolumeHealth ¶
func GetVolumeHealth(ctx context.Context, client *docker.Client, realm, clusterName string) (*VolumeHealth, error)
GetVolumeHealth checks whether the cluster's config, munge, and data volumes exist.
type VolumeSummary ¶
VolumeSummary holds summary information about a sind volume.
func GetVolumes ¶
GetVolumes lists all sind-related Docker volumes.