Documentation
¶
Overview ¶
Package checkpoint provides CRIU checkpoint (dump) operations.
config.go defines the static checkpoint spec loaded from ConfigMap YAML.
constants.go defines shared constants used across checkpoint and restore packages.
criu provides CRIU-specific configuration and utilities for checkpoint operations.
filesystem.go provides container rootfs introspection, filesystem config/metadata types, and rootfs diff capture for CRIU checkpoint.
k8s contains containerd discovery and Kubernetes path classification helpers.
mounts parses runtime mount state from /proc.
namespaces provides Linux namespace introspection for CRIU checkpoint.
Package checkpoint provides CRIU checkpoint (dump) operations.
storage.go provides checkpoint storage I/O: write/read manifests, listing, deletion.
Index ¶
- Constants
- func BuildCRIUDumpOptions(settings *CRIUSettings, pid int, imageDirFD int32, rootFS string, ...) (*criurpc.CriuOpts, error)
- func CaptureDeletedFiles(upperDir, checkpointDir string) (bool, error)
- func CaptureDevShm(pid int, checkpointDir string, log *logrus.Entry) error
- func CaptureRootfsDiff(upperDir, checkpointDir string, exclusions *FilesystemConfig, ...) (string, error)
- func CaptureRootfsState(upperDir, checkpointDir string, data *CheckpointManifest, log *logrus.Entry)
- func DeleteCheckpoint(baseDir, checkpointID string) error
- func ExecuteCRIUDump(criuOpts *criurpc.CriuOpts, checkpointDir string, log *logrus.Entry) (time.Duration, error)
- func FindWhiteoutFiles(upperDir string) ([]string, error)
- func GetAllNamespaces(pid int) (map[NamespaceType]*NamespaceInfo, error)
- func GetNamespaceInode(pid int, nsType NamespaceType) (uint64, error)
- func GetOverlayUpperDir(pid int) (string, error)
- func GetRootFS(pid int) (string, error)
- func ListCheckpoints(baseDir string) ([]string, error)
- func LoadDescriptors(checkpointDir string) ([]string, error)
- func SaveDescriptors(checkpointDir string, descriptors []string) error
- func WriteCheckpointManifest(checkpointDir string, data *CheckpointManifest) error
- type CRIUDumpManifest
- type CRIUSettings
- type CheckpointManifest
- type CheckpointOutcome
- type CheckpointRequest
- type CheckpointSpec
- type Checkpointer
- type ConfigError
- type ContainerInfoSnapshot
- type DiscoveryClient
- type ExternalMountManifestEntry
- type FilesystemConfig
- type FilesystemManifest
- type MountInfo
- type MountPolicy
- type NamespaceInfo
- type NamespaceManifestEntry
- type NamespaceType
- type SourcePodManifest
Constants ¶
const (
	// HostProcPath is the mount point for the host's /proc in DaemonSet pods.
	HostProcPath = "/host/proc"

	// DevShmDirName is the directory name for captured /dev/shm contents.
	DevShmDirName = "dev-shm"

	// KubeLabelCheckpointSource is the pod label that triggers automatic checkpointing.
	// Set by the operator on checkpoint-eligible pods.
	KubeLabelCheckpointSource = "nvidia.com/checkpoint-source"

	// KubeLabelCheckpointHash is the pod label specifying the checkpoint identity hash.
	// Set by the operator on checkpoint-eligible pods.
	KubeLabelCheckpointHash = "nvidia.com/checkpoint-hash"

	// DumpLogFilename is the CRIU dump (checkpoint) log filename.
	DumpLogFilename = "dump.log"

	// CheckpointCRIUConfFilename is the CRIU config file written at checkpoint time.
	CheckpointCRIUConfFilename = "criu.conf"

	// CheckpointDoneFilename is the marker file written to the checkpoint directory
	// after all checkpoint artifacts are complete. Used to detect checkpoint readiness.
	// Also hard-coded in vLLM for early-exit when checkpoint already exists.
	CheckpointDoneFilename = "checkpoint.done"

	// CheckpointManifestFilename is the name of the manifest file in checkpoint directories.
	CheckpointManifestFilename = "manifest.yaml"

	// DescriptorsFilename is the name of the file descriptors file.
	DescriptorsFilename = "descriptors.yaml"

	// RootfsDiffFilename is the name of the rootfs diff tar in checkpoint directories.
	RootfsDiffFilename = "rootfs-diff.tar"

	// DeletedFilesFilename is the name of the deleted files JSON in checkpoint directories.
	DeletedFilesFilename = "deleted-files.json"
)
const (
	// K8sNamespace is the containerd namespace used by Kubernetes.
	K8sNamespace = "k8s.io"

	// ContainerdSocket is the default containerd socket path.
	ContainerdSocket = "/run/containerd/containerd.sock"
)
Variables ¶
This section is empty.
Functions ¶
func BuildCRIUDumpOptions ¶
func BuildCRIUDumpOptions( settings *CRIUSettings, pid int, imageDirFD int32, rootFS string, mountInfo []MountInfo, ociSpec *specs.Spec, namespaces map[NamespaceType]*NamespaceInfo, ) (*criurpc.CriuOpts, error)
BuildCRIUDumpOptions creates CRIU options directly from spec settings and runtime state.
func CaptureDeletedFiles ¶
CaptureDeletedFiles finds whiteout files and saves them to a JSON file. Returns true if deleted files were found and saved.
func CaptureDevShm ¶
CaptureDevShm captures files from /dev/shm to the checkpoint directory. This is needed because /dev/shm is a tmpfs mount that is not part of the container's overlay filesystem, so rootfs diff doesn't capture it.
Semaphores (sem.* files) are included so that sem_unlink() calls succeed after restore. The semaphore kernel state won't be perfectly restored, but the files will exist for cleanup operations.
The files are saved to <checkpointDir>/dev-shm/ and can be restored using RestoreDevShm before CRIU restore.
func CaptureRootfsDiff ¶
func CaptureRootfsDiff(upperDir, checkpointDir string, exclusions *FilesystemConfig, bindMountDests []string) (string, error)
CaptureRootfsDiff captures the overlay upperdir to a tar file. The upperdir contains all filesystem modifications made by the container. Excludes bind mount destinations and configured directories to avoid conflicts during restore. Returns the path to the tar file or empty string if capture failed.
func CaptureRootfsState ¶
func CaptureRootfsState(upperDir, checkpointDir string, data *CheckpointManifest, log *logrus.Entry)
CaptureRootfsState captures the overlay upperdir and deleted files after CRIU dump. Updates the checkpoint manifest with rootfs diff information and saves it.
func DeleteCheckpoint ¶
DeleteCheckpoint removes a checkpoint directory.
func ExecuteCRIUDump ¶
func ExecuteCRIUDump(criuOpts *criurpc.CriuOpts, checkpointDir string, log *logrus.Entry) (time.Duration, error)
ExecuteCRIUDump runs the CRIU dump and logs timing plus dump-log location on failure.
func FindWhiteoutFiles ¶
FindWhiteoutFiles finds overlay whiteout files in the upperdir. Overlay filesystems use .wh.<filename> to mark deleted files. Returns a list of paths that were deleted in the container.
func GetAllNamespaces ¶
func GetAllNamespaces(pid int) (map[NamespaceType]*NamespaceInfo, error)
GetAllNamespaces returns information about all namespaces for a process.
func GetNamespaceInode ¶
func GetNamespaceInode(pid int, nsType NamespaceType) (uint64, error)
GetNamespaceInode returns the inode number for a namespace.
func GetOverlayUpperDir ¶
GetOverlayUpperDir extracts the overlay upperdir from mountinfo. This is the writable layer of the container's filesystem.
func ListCheckpoints ¶
ListCheckpoints returns all checkpoint IDs in the base directory.
func LoadDescriptors ¶
LoadDescriptors reads file descriptor information from checkpoint directory.
func SaveDescriptors ¶
SaveDescriptors writes file descriptor information to the checkpoint directory.
func WriteCheckpointManifest ¶
func WriteCheckpointManifest(checkpointDir string, data *CheckpointManifest) error
WriteCheckpointManifest writes a checkpoint manifest file in the checkpoint directory.
Types ¶
type CRIUDumpManifest ¶
type CRIUDumpManifest struct {
CRIU CRIUSettings `yaml:"criu"`
ExtMnt []ExternalMountManifestEntry `yaml:"extMnt,omitempty"`
External []string `yaml:"external,omitempty"`
SkipMnt []string `yaml:"skipMnt,omitempty"`
}
CRIUDumpManifest stores the resolved dump-time CRIU mount plan used for restore.
func NewCRIUDumpManifest ¶
func NewCRIUDumpManifest(criuOpts *criurpc.CriuOpts, settings CRIUSettings) CRIUDumpManifest
NewCRIUDumpManifest serializes resolved dump options for restore.
type CRIUSettings ¶
type CRIUSettings struct {
// GhostLimit is the maximum ghost file size in bytes.
// Ghost files are deleted-but-open files that CRIU needs to checkpoint.
// 512MB is recommended for GPU workloads with large memory allocations.
GhostLimit uint32 `yaml:"ghostLimit"`
// Timeout is the CRIU operation timeout in seconds.
// 6 hours (21600s) is recommended for large GPU model checkpoints.
Timeout uint32 `yaml:"timeout"`
// LogLevel is the CRIU logging verbosity (0-4).
LogLevel int32 `yaml:"logLevel"`
// WorkDir is the CRIU work directory for temporary files.
WorkDir string `yaml:"workDir"`
// AutoDedup enables auto-deduplication of memory pages.
AutoDedup bool `yaml:"autoDedup"`
// LazyPages enables lazy page migration (experimental).
LazyPages bool `yaml:"lazyPages"`
// LeaveRunning keeps the process running after checkpoint (dump only).
LeaveRunning bool `yaml:"leaveRunning"`
// ShellJob allows checkpointing session leaders (containers are often session leaders).
ShellJob bool `yaml:"shellJob"`
// TcpClose closes TCP connections instead of preserving them (pod IPs change on restore).
TcpClose bool `yaml:"tcpClose"`
// FileLocks allows checkpointing processes with file locks.
FileLocks bool `yaml:"fileLocks"`
// OrphanPtsMaster allows checkpointing containers with TTYs.
OrphanPtsMaster bool `yaml:"orphanPtsMaster"`
// ExtUnixSk allows external Unix sockets.
ExtUnixSk bool `yaml:"extUnixSk"`
// LinkRemap handles deleted-but-open files.
LinkRemap bool `yaml:"linkRemap"`
// ExtMasters allows external bind mount masters.
ExtMasters bool `yaml:"extMasters"`
// ManageCgroupsMode controls cgroup handling: "ignore" lets K8s manage cgroups.
ManageCgroupsMode string `yaml:"manageCgroupsMode"`
// LibDir is the path to CRIU plugin directory (e.g., /usr/local/lib/criu).
// Required for CUDA checkpoint/restore.
LibDir string `yaml:"libDir"`
// AllowUprobes allows user-space probes (required for CUDA checkpoints).
AllowUprobes bool `yaml:"allowUprobes"`
// SkipInFlight skips in-flight TCP connections during checkpoint/restore.
SkipInFlight bool `yaml:"skipInFlight"`
}
CRIUSettings holds CRIU-specific configuration options. Options are categorized by how they are passed to CRIU:
- RPC options: Passed via go-criu CriuOpts protobuf
- CRIU conf file options: Written to criu.conf (NOT available via RPC)
func (*CRIUSettings) GenerateCRIUConfContent ¶
func (c *CRIUSettings) GenerateCRIUConfContent() string
GenerateCRIUConfContent generates the criu.conf file content for options that cannot be passed via RPC.
type CheckpointManifest ¶
type CheckpointManifest struct {
CheckpointID string `yaml:"checkpointId"`
CreatedAt time.Time `yaml:"createdAt"`
CRIUDump CRIUDumpManifest `yaml:"criuDump"`
K8s SourcePodManifest `yaml:"k8s"`
Filesystem FilesystemManifest `yaml:"filesystem"`
Namespaces []NamespaceManifestEntry `yaml:"namespaces"`
}
CheckpointManifest is saved as manifest.yaml at checkpoint time and loaded at restore.
func NewCheckpointManifest ¶
func NewCheckpointManifest( checkpointID string, criuDump CRIUDumpManifest, k8s SourcePodManifest, filesystem FilesystemManifest, namespaces []NamespaceManifestEntry, ) *CheckpointManifest
NewCheckpointManifest assembles a CheckpointManifest from per-module builders.
func ReadCheckpointManifest ¶
func ReadCheckpointManifest(checkpointDir string) (*CheckpointManifest, error)
ReadCheckpointManifest reads checkpoint manifest from a checkpoint directory.
type CheckpointOutcome ¶
type CheckpointOutcome struct {
CheckpointID string
CheckpointDir string
Data *CheckpointManifest
}
CheckpointOutcome contains the result of a checkpoint operation.
type CheckpointRequest ¶
type CheckpointRequest struct {
ContainerID string
ContainerName string // K8s container name (for K8s API volume type lookup)
CheckpointID string
CheckpointDir string
NodeName string
PodName string
PodNamespace string
}
CheckpointRequest holds per-checkpoint identifiers for a checkpoint operation.
type CheckpointSpec ¶
type CheckpointSpec struct {
// BasePath is the base directory for checkpoint storage (PVC mount point).
BasePath string `yaml:"basePath"`
// CRIU options for dump operations
CRIU CRIUSettings `yaml:"criu"`
// RootfsExclusions defines paths to exclude from rootfs diff capture
RootfsExclusions FilesystemConfig `yaml:"rootfsExclusions"`
}
CheckpointSpec is the static checkpoint spec loaded from ConfigMap YAML.
func (*CheckpointSpec) Validate ¶
func (c *CheckpointSpec) Validate() error
Validate checks that the CheckpointSpec has valid values.
type Checkpointer ¶
type Checkpointer struct {
// contains filtered or unexported fields
}
Checkpointer performs CRIU checkpoint operations.
func NewCheckpointer ¶
func NewCheckpointer(discoveryClient *DiscoveryClient) *Checkpointer
NewCheckpointer creates a new Checkpointer.
func (*Checkpointer) Checkpoint ¶
func (c *Checkpointer) Checkpoint(ctx context.Context, req CheckpointRequest, spec *CheckpointSpec) (*CheckpointOutcome, error)
Checkpoint performs a CRIU dump of a container. The operation has three phases: introspect, configure, capture.
type ConfigError ¶
ConfigError represents a configuration validation error.
func (*ConfigError) Error ¶
func (e *ConfigError) Error() string
type ContainerInfoSnapshot ¶
type ContainerInfoSnapshot struct {
PID int
RootFS string
UpperDir string
OCISpec *specs.Spec
MountInfo []MountInfo
Namespaces map[NamespaceType]*NamespaceInfo
}
ContainerInfoSnapshot holds runtime/container info needed for checkpointing.
type DiscoveryClient ¶
type DiscoveryClient struct {
// contains filtered or unexported fields
}
func NewDiscoveryClient ¶
func NewDiscoveryClient() (*DiscoveryClient, error)
func (*DiscoveryClient) Close ¶
func (c *DiscoveryClient) Close() error
func (*DiscoveryClient) ResolveContainer ¶
type ExternalMountManifestEntry ¶
ExternalMountManifestEntry is a serializable CRIU ext-mount entry in checkpoint manifests.
type FilesystemConfig ¶
type FilesystemConfig struct {
// SystemDirs are system directories that should be excluded from rootfs diff.
// These directories are typically injected/bind-mounted by NVIDIA GPU Operator
// at container start time, so they already exist in the restore target.
// Excluding them prevents conflicts (especially socket files which cannot be overwritten).
// Default: ["./usr", "./etc", "./opt", "./var", "./run"]
SystemDirs []string `yaml:"systemDirs"`
// CacheDirs are cache directories that can safely be excluded to reduce checkpoint size.
// Model weights and other cached data are typically re-downloaded if needed.
// Default: ["./.cache/huggingface", "./.cache/torch"]
CacheDirs []string `yaml:"cacheDirs"`
// AdditionalExclusions are custom paths to exclude from the rootfs diff.
// Use this for application-specific exclusions.
// Paths should be relative with "./" prefix (e.g., "./data/temp").
AdditionalExclusions []string `yaml:"additionalExclusions"`
}
FilesystemConfig is the static config for rootfs exclusions (from values.yaml).
func (*FilesystemConfig) GetAllExclusions ¶
func (c *FilesystemConfig) GetAllExclusions() []string
GetAllExclusions returns all exclusion paths combined. This is used when building tar arguments for rootfs diff capture.
func (*FilesystemConfig) Validate ¶
func (c *FilesystemConfig) Validate() error
Validate checks that the FilesystemConfig has valid values.
type FilesystemManifest ¶
type FilesystemManifest struct {
Exclusions FilesystemConfig `yaml:"exclusions"`
UpperDir string `yaml:"upperDir,omitempty"`
ExternalPaths []string `yaml:"externalPaths,omitempty"`
BindMountDests []string `yaml:"bindMountDests,omitempty"`
HasRootfsDiff bool `yaml:"hasRootfsDiff"`
HasDeletedFiles bool `yaml:"hasDeletedFiles"`
}
FilesystemManifest holds runtime filesystem state captured at checkpoint time.
func NewFilesystemManifest ¶
func NewFilesystemManifest(exclusions FilesystemConfig, upperDir string, ociSpec *specs.Spec) FilesystemManifest
NewFilesystemManifest constructs FilesystemManifest from config, overlay state, and OCI spec.
type MountInfo ¶
type MountInfo struct {
MountID string
ParentID string
MountPoint string
Root string
FSType string
Source string
Options string
SuperOptions string
}
type MountPolicy ¶
MountPolicy is the classified mount plan for CRIU dump options.
func BuildMountPolicy ¶
func BuildMountPolicy(mountInfo []MountInfo, ociSpec *specs.Spec, rootFS string) *MountPolicy
BuildMountPolicy classifies mounts into CRIU extMnt and skipMnt lists.
Rule order and precedence (top to bottom):
- Skip non-OCI proc/sys submounts and non-OCI runtime /run submounts. These mounts are typically node/kernel/runtime specific and are the highest-risk source of cross-node restore failures, so skip wins.
- Externalize mounts owned by runtime/OCI:
  - "/" (rootfs is recreated by runtime in OCI restore path)
  - OCI mount destinations
  - OCI masked/readonly paths
- Externalize non-OCI bind-like mounts (mount root is not "/" or "."). This captures runtime-injected file mounts (for example driver files) so CRIU does not try to recreate them from checkpoint data.
- Anything else is left unflagged and handled by CRIU default behavior.
Precedence: skip > externalize. If a path is classified as skipped, it is removed from the externalized set.
type NamespaceInfo ¶
type NamespaceInfo struct {
Type NamespaceType
Inode uint64
IsExternal bool // Whether NS is external (shared with pause container)
}
NamespaceInfo holds namespace identification information.
func GetNamespaceInfo ¶
func GetNamespaceInfo(pid int, nsType NamespaceType) (*NamespaceInfo, error)
GetNamespaceInfo returns detailed namespace information.
type NamespaceManifestEntry ¶
type NamespaceManifestEntry struct {
Type string `yaml:"type"` // net, pid, mnt, etc.
Inode uint64 `yaml:"inode"` // Namespace inode
IsExternal bool `yaml:"isExternal"` // Whether namespace is external (shared)
}
NamespaceManifestEntry stores namespace information saved in checkpoint manifests.
func NewNamespaceManifestEntries ¶
func NewNamespaceManifestEntries(namespaces map[NamespaceType]*NamespaceInfo) []NamespaceManifestEntry
NewNamespaceManifestEntries constructs namespace manifest entries from introspected namespaces.
type NamespaceType ¶
type NamespaceType string
NamespaceType represents a Linux namespace type.
const (
	NamespaceNet    NamespaceType = "net"
	NamespacePID    NamespaceType = "pid"
	NamespaceMnt    NamespaceType = "mnt"
	NamespaceUTS    NamespaceType = "uts"
	NamespaceIPC    NamespaceType = "ipc"
	NamespaceUser   NamespaceType = "user"
	NamespaceCgroup NamespaceType = "cgroup"
)
type SourcePodManifest ¶
type SourcePodManifest struct {
ContainerID string `yaml:"containerId"`
PID int `yaml:"pid"`
SourceNode string `yaml:"sourceNode"`
PodName string `yaml:"podName"`
PodNamespace string `yaml:"podNamespace"`
}
func NewSourcePodManifest ¶
func NewSourcePodManifest(params CheckpointRequest, pid int) SourcePodManifest