Documentation
¶
Overview ¶
Package cluster provides types and operations for sind cluster management.
Index ¶
- Constants
- func BuildContainerExecArgs(container docker.ContainerName, isTTY bool, command []string) []string
- func BuildRunArgs(cfg RunConfig) []string
- func BuildSSHArgs(sshContainer docker.ContainerName, node, cluster, realm string, isTTY bool, ...) []string
- func ComposeProject(realm, clusterName string) string
- func ContainerLogArgs(realm, node, cluster string, follow bool) []string
- func ContainerName(realm, cluster, shortName string) docker.ContainerName
- func ContainerPrefix(realm, cluster string) string
- func CreateClusterNetwork(ctx context.Context, client *docker.Client, realm, clusterName string) error
- func CreateClusterNodes(ctx context.Context, client *docker.Client, meshMgr *mesh.Manager, ...) error
- func CreateClusterVolumes(ctx context.Context, client *docker.Client, realm, clusterName string, ...) error
- func CreateNode(ctx context.Context, client *docker.Client, meshMgr *mesh.Manager, ...) (docker.ContainerID, error)
- func DNSName(shortName, cluster, realm string) string
- func DNSSearchDomain(cluster, realm string) string
- func Delete(ctx context.Context, client *docker.Client, meshMgr *mesh.Manager, ...) error
- func DeleteContainers(ctx context.Context, client *docker.Client, ...) error
- func DeleteNetwork(ctx context.Context, client *docker.Client, name docker.NetworkName) error
- func DeleteVolumes(ctx context.Context, client *docker.Client, volumes []docker.VolumeName) error
- func DeregisterMesh(ctx context.Context, meshMgr *mesh.Manager, clusterName string, ...) error
- func DiscoverClusterNames(ctx context.Context, client *docker.Client, realm string) ([]string, error)
- func EnableSlurmServices(ctx context.Context, client *docker.Client, configs []RunConfig) error
- func EnterTarget(ctx context.Context, client *docker.Client, realm, clusterName string) (string, error)
- func GetMungeKey(ctx context.Context, client *docker.Client, realm, clusterName string) ([]byte, error)
- func HasOtherClusters(ctx context.Context, client *docker.Client, realm, clusterName string) (bool, error)
- func NetworkName(realm, cluster string) docker.NetworkName
- func NextComputeIndex(ctx context.Context, client *docker.Client, realm, clusterName string) (int, error)
- func NodeLabels(realm, clusterName, role, slurmVersion, dataHostPath string, ...) map[string]string
- func NodeShortNames(nodes []config.Node) []string
- func PowerCut(ctx context.Context, client *docker.Client, realm, clusterName string, ...) error
- func PowerCycle(ctx context.Context, client *docker.Client, realm, clusterName string, ...) error
- func PowerFreeze(ctx context.Context, client *docker.Client, realm, clusterName string, ...) error
- func PowerOn(ctx context.Context, client *docker.Client, realm, clusterName string, ...) error
- func PowerReboot(ctx context.Context, client *docker.Client, realm, clusterName string, ...) error
- func PowerShutdown(ctx context.Context, client *docker.Client, realm, clusterName string, ...) error
- func PowerUnfreeze(ctx context.Context, client *docker.Client, realm, clusterName string, ...) error
- func PreflightCheck(ctx context.Context, client *docker.Client, realm string, cfg *config.Cluster) error
- func ServiceLogArgs(realm, node, cluster, service string, follow bool) []string
- func ValidateWorkerAdd(ctx context.Context, client *docker.Client, realm string, ...) error
- func VolumeName(realm, cluster, volumeType string) docker.VolumeName
- func WorkerRemove(ctx context.Context, client *docker.Client, meshMgr *mesh.Manager, ...) error
- func WriteClusterConfig(ctx context.Context, client *docker.Client, realm string, cfg *config.Cluster, ...) error
- func WriteMungeKey(ctx context.Context, client *docker.Client, realm, clusterName string, ...) error
- type Cluster
- type MountPoint
- type NetworkHealth
- type NetworkSummary
- type Node
- type NodeHealth
- type NodeStatus
- type NodeSummary
- type Resources
- type RunConfig
- type State
- type Status
- type Summary
- type VolumeSummary
- type WorkerAddOptions
Constants ¶
const (
	LabelRealm        = "sind.realm"
	LabelCluster      = "sind.cluster"
	LabelRole         = "sind.role"
	LabelSlurmVersion = "sind.slurm.version"
	LabelDataHostPath = "sind.data.hostpath"
)
Label keys used on sind containers.
const (
	LabelComposeProject         = "com.docker.compose.project"
	LabelComposeService         = "com.docker.compose.service"
	LabelComposeContainerNumber = "com.docker.compose.container-number"
	LabelComposeOneoff          = "com.docker.compose.oneoff"
	LabelComposeConfigHash      = "com.docker.compose.config-hash"
	LabelComposeConfigFiles     = "com.docker.compose.project.config_files"
	LabelComposeNetwork         = "com.docker.compose.network"
	LabelComposeVolume          = "com.docker.compose.volume"
)
Docker Compose compatibility labels.
const DNSSuffix = "sind"
DNSSuffix is the base domain for all sind DNS names.
Variables ¶
This section is empty.
Functions ¶
func BuildContainerExecArgs ¶ added in v0.3.0
func BuildContainerExecArgs(container docker.ContainerName, isTTY bool, command []string) []string
BuildContainerExecArgs builds docker CLI arguments for running a command directly inside a cluster container via docker exec. The working directory is set to /data (the shared data mount). When command is nil, an interactive login shell is started.
func BuildRunArgs ¶
BuildRunArgs returns the docker arguments for creating a node container. The returned slice does not include "create" or "run -d" — the caller passes these args to Client.CreateContainer or Client.RunContainer.
func BuildSSHArgs ¶
func BuildSSHArgs(sshContainer docker.ContainerName, node, cluster, realm string, isTTY bool, sshOptions, command []string) []string
BuildSSHArgs builds the docker CLI arguments for running SSH through the sind-ssh relay container. The returned args are suitable for passing to docker directly (e.g. "docker exec -i -t sind-ssh ssh ...").
node is the short name (e.g. "worker-0"), cluster is the cluster name, isTTY controls whether -t is added to docker exec, sshOptions are passed through to SSH before the target, and command is the optional remote command.
func ComposeProject ¶
ComposeProject returns the Docker Compose project name for a cluster.
func ContainerLogArgs ¶
ContainerLogArgs builds docker CLI arguments for streaming container logs. node is the short name (e.g. "controller", "worker-0"), cluster is the cluster name, and follow controls whether --follow is added.
func ContainerName ¶
func ContainerName(realm, cluster, shortName string) docker.ContainerName
ContainerName returns the Docker container name for a node. shortName is the node's hostname, e.g. "controller", "submitter", "worker-0".
func ContainerPrefix ¶
ContainerPrefix returns the container name prefix for a cluster, used to extract short names from full container names.
func CreateClusterNetwork ¶
func CreateClusterNetwork(ctx context.Context, client *docker.Client, realm, clusterName string) error
CreateClusterNetwork creates the cluster-specific Docker bridge network.
func CreateClusterNodes ¶
func CreateClusterNodes(ctx context.Context, client *docker.Client, meshMgr *mesh.Manager, configs []RunConfig) error
CreateClusterNodes creates all node containers for the cluster. Each node is created, connected to the mesh network, and started.
func CreateClusterVolumes ¶
func CreateClusterVolumes(ctx context.Context, client *docker.Client, realm, clusterName string, useDataVolume bool) error
CreateClusterVolumes creates the config and munge volumes for a cluster. When useDataVolume is true, a data volume is also created; otherwise the caller is expected to use a host-path bind mount for /data.
func CreateNode ¶
func CreateNode(ctx context.Context, client *docker.Client, meshMgr *mesh.Manager, cfg RunConfig) (docker.ContainerID, error)
CreateNode creates a node container, connects it to the mesh network, and starts it. Returns the container ID.
func DNSName ¶
DNSName returns the fully qualified DNS name for a node. shortName is the node's hostname, e.g. "controller", "worker-0".
func DNSSearchDomain ¶
DNSSearchDomain returns the DNS search domain for a cluster.
func Delete ¶
func Delete(ctx context.Context, client *docker.Client, meshMgr *mesh.Manager, clusterName string) error
Delete orchestrates the full cluster deletion flow.
Deleting a non-existent cluster is not an error. The function handles partial clusters (e.g., from a failed creation) by removing whatever resources exist.
deleteClusterResources
        │
HasOtherClusters?
    yes → done
    no  → CleanupMesh
func DeleteContainers ¶
func DeleteContainers(ctx context.Context, client *docker.Client, containers []docker.ContainerListEntry) error
DeleteContainers force-removes the given containers (docker rm -f).
func DeleteNetwork ¶
DeleteNetwork removes the cluster network.
func DeleteVolumes ¶
DeleteVolumes removes the given cluster volumes.
func DeregisterMesh ¶
func DeregisterMesh(ctx context.Context, meshMgr *mesh.Manager, clusterName string, containers []docker.ContainerListEntry) error
DeregisterMesh removes DNS records and known_hosts entries for each container in the cluster. This is the inverse of registerMesh during cluster creation.
func DiscoverClusterNames ¶ added in v0.3.0
func DiscoverClusterNames(ctx context.Context, client *docker.Client, realm string) ([]string, error)
DiscoverClusterNames finds cluster names from orphaned networks and volumes that may not have containers. This supplements GetClusters (which only finds clusters with running containers) for cleanup operations.
func EnableSlurmServices ¶
EnableSlurmServices enables the role-appropriate Slurm daemon on each node. Controller nodes get slurmctld; managed worker nodes get slurmd. Submitter and unmanaged worker nodes are skipped.
func EnterTarget ¶
func EnterTarget(ctx context.Context, client *docker.Client, realm, clusterName string) (string, error)
EnterTarget determines the target node for an interactive shell. Returns "submitter" if present in the cluster, otherwise "controller".
func GetMungeKey ¶
func GetMungeKey(ctx context.Context, client *docker.Client, realm, clusterName string) ([]byte, error)
GetMungeKey reads the munge key from a cluster's node container. Any container in the cluster can be used since all mount the same munge volume.
func HasOtherClusters ¶
func HasOtherClusters(ctx context.Context, client *docker.Client, realm, clusterName string) (bool, error)
HasOtherClusters checks whether any sind cluster containers exist besides the named cluster. This is used to decide whether to clean up mesh infrastructure after deleting a cluster.
func NetworkName ¶
func NetworkName(realm, cluster string) docker.NetworkName
NetworkName returns the Docker network name for a cluster.
func NextComputeIndex ¶
func NextComputeIndex(ctx context.Context, client *docker.Client, realm, clusterName string) (int, error)
NextComputeIndex determines the next worker node index by examining existing containers in the cluster. Returns max(existing indices) + 1, or 0 if no worker containers exist.
func NodeLabels ¶
func NodeLabels(realm, clusterName, role, slurmVersion, dataHostPath string, containerNumber int) map[string]string
NodeLabels returns the standard labels for a node container. containerNumber is the 1-based instance number for compose compatibility. The slurm version label is omitted when slurmVersion is empty. The data host path label is omitted when dataHostPath is empty (Docker volume mode).
func NodeShortNames ¶
NodeShortNames returns the short hostname for each node defined in the config. Worker nodes are indexed sequentially across all worker groups, matching the indexing used in slurm.GenerateNodesConf.
func PowerCut ¶
func PowerCut(ctx context.Context, client *docker.Client, realm, clusterName string, shortNames []string) error
PowerCut immediately kills the specified nodes (docker kill).
func PowerCycle ¶
func PowerCycle(ctx context.Context, client *docker.Client, realm, clusterName string, shortNames []string) error
PowerCycle hard-restarts the specified nodes (docker kill + start).
func PowerFreeze ¶
func PowerFreeze(ctx context.Context, client *docker.Client, realm, clusterName string, shortNames []string) error
PowerFreeze suspends all processes in the specified nodes (docker pause). The containers remain running but are completely unresponsive.
func PowerOn ¶
func PowerOn(ctx context.Context, client *docker.Client, realm, clusterName string, shortNames []string) error
PowerOn starts the specified stopped nodes (docker start).
func PowerReboot ¶
func PowerReboot(ctx context.Context, client *docker.Client, realm, clusterName string, shortNames []string) error
PowerReboot gracefully restarts the specified nodes (docker stop + start).
func PowerShutdown ¶
func PowerShutdown(ctx context.Context, client *docker.Client, realm, clusterName string, shortNames []string) error
PowerShutdown gracefully stops the specified nodes (docker stop).
func PowerUnfreeze ¶
func PowerUnfreeze(ctx context.Context, client *docker.Client, realm, clusterName string, shortNames []string) error
PowerUnfreeze resumes the specified frozen nodes (docker unpause).
func PreflightCheck ¶
func PreflightCheck(ctx context.Context, client *docker.Client, realm string, cfg *config.Cluster) error
PreflightCheck verifies that no Docker resources conflict with the cluster that would be created from the given configuration. It checks for existing networks, volumes, and containers with matching names.
func ServiceLogArgs ¶
ServiceLogArgs builds docker CLI arguments for streaming service journal logs. node is the short name, cluster is the cluster name, service is the systemd unit name (e.g. "slurmctld", "slurmd"), and follow controls whether --follow is added.
func ValidateWorkerAdd ¶
func ValidateWorkerAdd(ctx context.Context, client *docker.Client, realm string, opts WorkerAddOptions) error
ValidateWorkerAdd checks prerequisites for adding workers to a cluster. For managed workers, it verifies that sind-nodes.conf exists on the controller (indicating sind-generated Slurm configuration is in use). Unmanaged workers bypass the sind-nodes.conf check.
func VolumeName ¶
func VolumeName(realm, cluster, volumeType string) docker.VolumeName
VolumeName returns the Docker volume name for a cluster resource. volumeType is one of: "config", "munge", "data".
func WorkerRemove ¶
func WorkerRemove(ctx context.Context, client *docker.Client, meshMgr *mesh.Manager, clusterName string, shortNames []string) error
WorkerRemove removes worker nodes from a cluster.
For managed nodes (those present in sind-nodes.conf), the flow is:
1. Update sind-nodes.conf to remove the node definitions
2. Reconfigure slurmctld
3. Deregister DNS + known_hosts
4. Stop + remove containers
For unmanaged nodes, only steps 3–4 are performed.
func WriteClusterConfig ¶
func WriteClusterConfig(ctx context.Context, client *docker.Client, realm string, cfg *config.Cluster, image string, pull bool) error
WriteClusterConfig generates and writes slurm.conf, sind-nodes.conf, and cgroup.conf to the config volume. Uses a temporary container to access the volume.
Types ¶
type Cluster ¶
Cluster represents a live sind cluster as it exists in Docker. This is distinct from config.Cluster, which represents the configuration input.
func Create ¶
func Create(ctx context.Context, client *docker.Client, meshMgr *mesh.Manager, cfg *config.Cluster, readinessInterval time.Duration) (result *Cluster, retErr error)
Create orchestrates the full cluster creation flow.
The caller must ensure mesh infrastructure exists (via mesh.Manager.EnsureMesh) before calling Create. The context deadline controls the overall timeout; readinessInterval controls the polling interval for readiness probes.
PreflightCheck
      │
resolveInfra       DNS IP ║ SSH key ║ Slurm version
      │
createResources    network ║ volumes → config ║ munge
      │
createAllNodes     node₁ ║ node₂ ║ ... ║ nodeₙ
      │
setupNodes         (wait + SSH + hostkey) per node
      │
registerMesh       DNS records + known_hosts (serial)
      │
enableSlurm        (enable + probe) per eligible node
      │
  *Cluster
type MountPoint ¶ added in v0.3.0
type MountPoint struct {
Path string // mount path inside the container (e.g. "/etc/slurm")
Source string // volume name or host path
Type string // "volume" or "hostPath"
OK bool // true if the Docker volume exists (always true for hostPath)
}
MountPoint describes a volume or bind mount on cluster containers.
func GetMountPoints ¶ added in v0.3.0
func GetMountPoints(ctx context.Context, client *docker.Client, realm, clusterName string, containers []docker.ContainerListEntry) ([]MountPoint, error)
GetMountPoints returns the mount points for a cluster, checking volume existence for Docker volumes. The data mount source is determined from the sind.data.hostpath label on cluster containers: when present it is a host-path bind mount, otherwise it is a Docker volume.
type NetworkHealth ¶
type NetworkHealth struct {
Mesh bool // sind-mesh network exists
MeshName string // mesh network name (e.g. "sind-mesh")
MeshDriver string // mesh network driver (e.g. "bridge")
MeshSubnet string // mesh network subnet
MeshGateway string // mesh network gateway
DNS bool // sind-dns container exists
DNSName string // DNS container name (e.g. "sind-dns")
Cluster bool // cluster network exists
ClusterName string // cluster network name (e.g. "sind-dev-net")
ClusterDriver string // cluster network driver (e.g. "bridge")
ClusterSubnet string // cluster network subnet
ClusterGateway string // cluster network gateway
}
NetworkHealth holds the health and IPAM details of cluster networking.
func GetNetworkHealth ¶
func GetNetworkHealth(ctx context.Context, client *docker.Client, realm, clusterName string) (*NetworkHealth, error)
GetNetworkHealth checks the health of mesh, DNS, and cluster networking.
type NetworkSummary ¶
NetworkSummary holds summary information about a sind network.
func GetNetworks ¶
func GetNetworks(ctx context.Context, client *docker.Client, realm string) ([]*NetworkSummary, error)
GetNetworks lists all sind-related Docker networks with IPAM details. This includes per-cluster networks (sind-<cluster>-net) and the mesh network (sind-mesh).
type Node ¶
type Node struct {
Name string // short name: "controller", "worker-0"
Role string // "controller", "submitter", "worker"
ContainerID docker.ContainerID // Docker container ID
IP string // container IP address
State State
}
Node represents a running node in a sind cluster.
func WorkerAdd ¶
func WorkerAdd(ctx context.Context, client *docker.Client, meshMgr *mesh.Manager, opts WorkerAddOptions, readinessInterval time.Duration) (result []*Node, retErr error)
WorkerAdd adds worker nodes to an existing cluster.
For managed workers (default), the flow is:
1. Validate: controller exists, sind-nodes.conf present
2. Create worker container(s)
3. Wait for readiness, inject SSH keys, collect host keys
4. Register DNS + known_hosts
5. Update sind-nodes.conf with new node definitions
6. Reconfigure slurmctld
7. Enable slurmd on new nodes
For unmanaged workers (Unmanaged=true), steps 5–7 are skipped.
type NodeHealth ¶
type NodeHealth struct {
Container string // container state: "running", "exited", etc.
IP string // container IP address
Munge bool // munge service healthy
SSHD bool // sshd accepting connections
Services map[string]bool // role-specific services (e.g., "slurmctld", "slurmd")
}
NodeHealth holds the health status of a single node.
func GetNodeHealth ¶
func GetNodeHealth(ctx context.Context, client *docker.Client, containerName, role, realm, clusterName string) (*NodeHealth, error)
GetNodeHealth checks the health of a single node container. If the container is not running, remaining checks are skipped and default to false. The role determines which Slurm services are checked. clusterName is used to select the cluster network IP.
type NodeStatus ¶
type NodeStatus struct {
Name string // DNS-style name: "controller.dev"
Role string // "controller", "submitter", "worker"
Health *NodeHealth
}
NodeStatus combines node identity with health information.
type NodeSummary ¶
type NodeSummary struct {
Name string // short name: "controller", "worker-0"
Role string // "controller", "submitter", "worker"
State State
}
NodeSummary holds summary information about a node in a sind cluster.
type Resources ¶
type Resources struct {
Containers []docker.ContainerListEntry
Network docker.NetworkName
NetworkExists bool
Volumes []docker.VolumeName
}
Resources holds the Docker resources belonging to a cluster.
func ListClusterResources ¶
func ListClusterResources(ctx context.Context, client *docker.Client, realm, clusterName string) (*Resources, error)
ListClusterResources discovers all Docker resources belonging to the named cluster. Containers are found by label filter; network and volumes are checked by name convention.
type RunConfig ¶
type RunConfig struct {
Realm string // realm name (e.g. "sind")
ClusterName string // cluster name
ShortName string // node hostname: "controller", "worker-0"
Role string // "controller", "submitter", "worker"
Image string // container image
CPUs int // CPU limit
Memory string // memory limit (e.g. "2g")
TmpSize string // /tmp tmpfs size (e.g. "1g")
SlurmVersion string // slurm version for labels (optional)
DNSIP string // mesh DNS container IP (optional)
DataHostPath string // host path for data volume (empty = use docker volume)
DataMountPath string // mount point for data (default: /data)
Managed bool // start slurmd and add to slurm.conf (worker only)
ContainerNumber int // 1-based compose container instance number
Pull bool // force fresh image pull (--pull always)
}
RunConfig holds the parameters needed to build docker run arguments for creating a node container.
type State ¶ added in v0.2.0
type State string
State represents the state of a cluster or node.
const (
	StateRunning State = "running"
	StateStopped State = "stopped"
	StatePaused  State = "paused"
	StateMixed   State = "mixed"   // cluster: nodes in different states
	StateEmpty   State = "empty"   // cluster: no nodes exist
	StateUnknown State = "unknown" // node: unrecognised container state
)
Possible cluster/node states.
type Status ¶
type Status struct {
Name string
State State
Nodes []*NodeStatus
Network *NetworkHealth
Mounts []MountPoint
}
Status holds the full status of a sind cluster.
type Summary ¶ added in v0.2.0
type Summary struct {
Name string
SlurmVersion string
State State
NodeCount int
Submitters int
Controllers int
Workers int
}
Summary holds summary information about a sind cluster.
type VolumeSummary ¶
VolumeSummary holds summary information about a sind volume.
func GetVolumes ¶
GetVolumes lists all sind-related Docker volumes.