container

package module
v0.0.0-...-7a1f928 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 5, 2026 License: Apache-2.0 Imports: 26 Imported by: 0

README

= container

A pure Go implementation of a container runtime that can run processes from container images in a sandbox.

== example

[source,go]
----
package main

import (
	"log/slog"
	"os"
	"syscall"

	"github.com/greatliontech/container"
	"github.com/greatliontech/ocifs"
)

// main runs an interactive busybox shell inside a sandboxed container and
// exits with a non-zero status when setup or startup fails.
func main() {
	os.Exit(run())
}

// run performs the actual work and returns the process exit code.
//
// It is separate from main so that deferred cleanup (unmounting the image and
// removing the temporary root) still executes on failure paths: os.Exit
// terminates the program immediately without running deferred functions.
func run() int {
	contID := "example-container"

	ofs, err := ocifs.New(ocifs.WithExtraDirs([]string{
		"/proc", "/sys",
	}))
	if err != nil {
		slog.Error("failed to create ocifs", "msg", err)
		return 1
	}

	// Temporary directory that becomes the container's root filesystem.
	trgtroot, err := os.MkdirTemp(os.TempDir(), "trgt")
	if err != nil {
		slog.Error("failed to create trgt temp dir", "msg", err)
		return 1
	}
	defer os.RemoveAll(trgtroot)

	im, err := ofs.Mount("docker.io/busybox:latest", ocifs.MountWithID(contID))
	if err != nil {
		slog.Error("failed to mount", "msg", err)
		return 1
	}
	// Registered after the RemoveAll above, so the image is unmounted before
	// the temporary root is removed. Log the error Unmount itself returned.
	defer func() {
		if err := im.Unmount(); err != nil {
			slog.Error("failed to unmount", "msg", err)
		}
	}()

	cfg := container.Config{
		Root:     trgtroot,
		Hostname: "test",
		Namespaces: container.Namespaces{
			NewIPC:  true,
			NewMnt:  true,
			NewNet:  true,
			NewPID:  true,
			NewUTS:  true,
			NewUser: true,
		},
		Mounts: []container.Mount{
			{
				Source: im.MountPoint(),
				Target: trgtroot,
				Type:   "auto",
				Flags:  syscall.MS_BIND | syscall.MS_RDONLY,
			},
			{
				Source: "none",
				Target: trgtroot + "/proc",
				Type:   "proc",
			},
			{
				Source: "none",
				Target: trgtroot + "/sys",
				Type:   "sysfs",
			},
		},
		// Map the invoking user and group to ID 0 inside the container.
		UidMappings: []syscall.SysProcIDMap{
			{
				ContainerID: 0,
				HostID:      syscall.Getuid(),
				Size:        1,
			},
		},
		GidMappings: []syscall.SysProcIDMap{
			{
				ContainerID: 0,
				HostID:      syscall.Getgid(),
				Size:        1,
			},
		},
	}

	cont, err := container.New("/tmp/contstate", contID, cfg)
	if err != nil {
		slog.Error("failed to create container", "msg", err)
		return 1
	}

	p := &container.Process{
		Cmd:    "/bin/sh",
		Stdin:  os.Stdin,
		Stdout: os.Stdout,
		Stderr: os.Stderr,
	}

	if err := cont.Run(p); err != nil {
		slog.Error("failed to run", "msg", err)
		return 1
	}

	// A failed wait is reported but does not change the exit status.
	if err := cont.Wait(); err != nil {
		slog.Error("failed to wait", "msg", err)
	}

	return 0
}
----

Documentation

Index

Constants

View Source
const DefaultBridgeGateway = "10.88.0.1/16"

DefaultBridgeGateway is the default gateway for the container bridge

View Source
const DefaultBridgeName = "container0"

DefaultBridgeName is the default bridge name for container networking

View Source
const DefaultBridgeSubnet = "10.88.0.0/16"

DefaultBridgeSubnet is the default subnet for the container bridge

Variables

View Source
var (
	ErrCgroupV2NotMounted   = errors.New("cgroup v2 not mounted")
	ErrCgroupNotFound       = errors.New("cgroup not found")
	ErrControllerNotEnabled = errors.New("controller not enabled")
)
View Source
var MountFlags = struct {
	Bind         uintptr
	BindReadOnly uintptr
	Proc         uintptr
	Sysfs        uintptr
	Tmpfs        uintptr
	Devpts       uintptr
	Private      uintptr
	Slave        uintptr
	Shared       uintptr
}{
	Bind:         unix.MS_BIND,
	BindReadOnly: unix.MS_BIND | unix.MS_RDONLY,
	Proc:         unix.MS_NOSUID | unix.MS_NODEV | unix.MS_NOEXEC,
	Sysfs:        unix.MS_NOSUID | unix.MS_NODEV | unix.MS_NOEXEC | unix.MS_RDONLY,
	Tmpfs:        unix.MS_NOSUID | unix.MS_NODEV,
	Devpts:       unix.MS_NOSUID | unix.MS_NOEXEC,
	Private:      unix.MS_PRIVATE,
	Slave:        unix.MS_SLAVE,
	Shared:       unix.MS_SHARED,
}

MountFlags provides common mount flag combinations

Functions

func AllocateIP

func AllocateIP(subnet string, used []string) (string, error)

AllocateIP allocates the next available IP from a subnet. This is a simple implementation; for production, use a proper IPAM.

func DefaultCapabilities

func DefaultCapabilities() []cap.Value

DefaultCapabilities returns a minimal set of capabilities for containers. This matches Docker's default capability set.

func EnsureBridge

func EnsureBridge(name string) (netlink.Link, error)

EnsureBridge creates the container bridge if it doesn't exist

func EnsureForwardingEnabled

func EnsureForwardingEnabled() error

EnsureForwardingEnabled ensures IP forwarding is enabled

func ExecNoUserNs

func ExecNoUserNs(pid int, config ExecConfig) error

ExecNoUserNs executes in container namespaces except user namespace.

WARNING: This function has significant limitations due to Go's runtime being multithreaded. The setns() syscall for mount namespace requires a single-threaded process, which Go cannot guarantee. This will likely fail with EINVAL.

For reliable exec into containers, use ExecWithNsenter instead, which delegates to the nsenter(1) utility (a single-threaded C program).

This function is kept for cases where you only need to enter non-mount namespaces or when called very early before Go's runtime spawns additional threads.

func ExecWithNsenter

func ExecWithNsenter(pid int, config ExecConfig) (*exec.Cmd, error)

ExecWithNsenter uses the nsenter(1) utility to properly enter all namespaces. This is the safest approach, as nsenter is a single-threaded C program.

func GetNamespacePaths

func GetNamespacePaths(pid int) map[string]string

GetNamespacePaths returns paths to all namespace files for a process

func JoinNetworkNamespace

func JoinNetworkNamespace(pid int) error

JoinNetworkNamespace joins the network namespace of another process. This is safe to call from Go, as the network namespace doesn't have the same threading issues as the user namespace.

func RunHooks

func RunHooks(hooks *Hooks, hookType HookType, state *ContainerState) error

RunHooks executes all hooks of the specified type

func SetupNATForBridge

func SetupNATForBridge(bridge string, subnet string) error

SetupNATForBridge sets up NAT for a bridge network using nftables

func WriteHosts

func WriteHosts(rootfs, hostname, ipAddress string) error

WriteHosts writes /etc/hosts in the container rootfs

func WriteResolvConf

func WriteResolvConf(rootfs string, dns []string) error

WriteResolvConf writes /etc/resolv.conf in the container rootfs

Types

type CPUResources

type CPUResources struct {
	// Quota and Period together form the CPU bandwidth limit, expressed in
	// cpu.max as "quota period".
	// Quota is in microseconds; Period is typically 100000 (100ms).
	// e.g., Quota 50000 with Period 100000 limits to 50% of one CPU.
	// NOTE(review): cpu.max uses "max 100000" for unlimited; which Quota
	// value selects that is not shown here — confirm against Cgroup.Apply.
	Quota  int64
	Period uint64

	// Weight is the CPU weight for fair scheduling (cpu.weight)
	// Range: 1-10000, default 100
	Weight uint64

	// Cpus is the set of CPUs the container can use (cpuset.cpus)
	// e.g., "0-3" or "0,2,4"
	Cpus string

	// Mems is the set of memory nodes the container can use (cpuset.mems)
	// e.g., "0-1" or "0"
	Mems string
}

CPUResources defines CPU limits

type CPUStats

type CPUStats struct {
	UsageUsec uint64 // Total CPU time consumed in microseconds
}

type Capabilities

type Capabilities struct {
	// Bounding set - upper limit on capabilities that can be gained
	Bounding []cap.Value
	// Effective set - capabilities used for permission checks
	Effective []cap.Value
	// Permitted set - capabilities that can be assumed
	Permitted []cap.Value
	// Inheritable set - capabilities preserved across execve
	Inheritable []cap.Value
	// Ambient set - capabilities inherited by non-privileged programs
	Ambient []cap.Value
}

Capabilities configures the capability sets for the container

func DefaultCapabilitiesConfig

func DefaultCapabilitiesConfig() *Capabilities

DefaultCapabilitiesConfig returns a capabilities config with safe defaults

type Cgroup

type Cgroup struct {
	// contains filtered or unexported fields
}

Cgroup represents a cgroup v2 control group

func LoadCgroup

func LoadCgroup(name string) (*Cgroup, error)

LoadCgroup loads an existing cgroup

func NewCgroup

func NewCgroup(name string) (*Cgroup, error)

NewCgroup creates a new cgroup for the container

func (*Cgroup) AddProcess

func (c *Cgroup) AddProcess(pid int) error

AddProcess adds a process to the cgroup

func (*Cgroup) Apply

func (c *Cgroup) Apply(resources *Resources) error

Apply applies the resource limits to the cgroup

func (*Cgroup) Delete

func (c *Cgroup) Delete() error

Delete removes the cgroup. All processes must be moved out first.

func (*Cgroup) Path

func (c *Cgroup) Path() string

Path returns the cgroup path

func (*Cgroup) Processes

func (c *Cgroup) Processes() ([]int, error)

Processes returns the list of PIDs in the cgroup

func (*Cgroup) Stats

func (c *Cgroup) Stats() (*CgroupStats, error)

Stats returns current resource usage

type CgroupStats

type CgroupStats struct {
	Memory *MemoryStats
	CPU    *CPUStats
	Pids   *PidsStats
}

CgroupStats holds current resource usage statistics

type Config

type Config struct {
	Root        string
	Namespaces  Namespaces
	Hostname    string
	Mounts      []Mount
	UidMappings []syscall.SysProcIDMap
	GidMappings []syscall.SysProcIDMap

	// Security options (Phase 1)
	// UsePivotRoot uses pivot_root instead of chroot for better isolation
	UsePivotRoot bool
	// Capabilities configures Linux capabilities for the container
	Capabilities *Capabilities
	// Seccomp configures the seccomp profile for syscall filtering
	Seccomp *SeccompProfile
	// Devices specifies device nodes to create in /dev
	Devices []Device
	// SetupDev creates a minimal /dev with standard devices
	SetupDev bool
	// NoNewPrivileges sets the no_new_privs flag
	NoNewPrivileges bool

	// Resource limits (Phase 2)
	// Resources configures cgroups v2 resource limits
	Resources *Resources

	// Networking (Phase 3)
	// Network configures container networking
	Network *NetworkConfig

	// Lifecycle (Phase 4)
	// Hooks configures lifecycle hooks
	Hooks *Hooks
}

func DefaultConfig

func DefaultConfig() Config

DefaultConfig returns a Config with secure defaults. When NewUser is enabled (the default), UID/GID mappings are set to map the current user to root inside the container.

type Container

type Container struct {
	// contains filtered or unexported fields
}

func New

func New(statedir, id string, cfg Config) (*Container, error)

func (*Container) Destroy

func (c *Container) Destroy() error

Destroy cleans up container resources including cgroup and network

func (*Container) Exec

func (c *Container) Exec(config ExecConfig) (*exec.Cmd, error)

Exec executes a command in the container's namespaces. Note: Due to Go's multithreading model, we cannot safely join user namespaces. This uses nsenter(1) as a workaround for proper namespace joining.

func (*Container) ExitCode

func (c *Container) ExitCode() int

ExitCode returns the container's exit code (0 if still running)

func (*Container) Pid

func (c *Container) Pid() int

Pid returns the container's main process ID

func (*Container) Run

func (c *Container) Run(p *Process) error

func (*Container) Signal

func (c *Container) Signal(sig syscall.Signal) error

Signal sends a signal to the container process

func (*Container) State

func (c *Container) State() State

State returns the current container state

func (*Container) Stats

func (c *Container) Stats() (*CgroupStats, error)

Stats returns current resource usage statistics

func (*Container) StderrPipe

func (c *Container) StderrPipe() (io.ReadCloser, error)

func (*Container) StdinPipe

func (c *Container) StdinPipe() (io.WriteCloser, error)

func (*Container) StdoutPipe

func (c *Container) StdoutPipe() (io.ReadCloser, error)

func (*Container) Stop

func (c *Container) Stop(config SignalConfig) error

Stop sends stop signal and waits, then kills if necessary

func (*Container) Wait

func (c *Container) Wait() error

type ContainerState

type ContainerState struct {
	ID        string    `json:"id"`
	State     State     `json:"state"`
	Pid       int       `json:"pid"`
	ExitCode  int       `json:"exit_code"`
	CreatedAt time.Time `json:"created_at"`
	StartedAt time.Time `json:"started_at,omitempty"`
	StoppedAt time.Time `json:"stopped_at,omitempty"`
}

ContainerState holds the complete state of a container

type Device

type Device struct {
	Path  string // Path inside container (e.g., "/dev/null")
	Type  uint32 // S_IFCHR (character) or S_IFBLK (block)
	Major uint32 // Major device number
	Minor uint32 // Minor device number
	Mode  uint32 // File permissions (e.g., 0666)
	Uid   uint32 // Owner UID
	Gid   uint32 // Owner GID
}

Device represents a device node to create in the container

func DefaultDevices

func DefaultDevices() []Device

DefaultDevices returns the minimal set of devices for a container

type ExecConfig

type ExecConfig struct {
	// Cmd is the command to execute
	Cmd string
	// Args are the command arguments
	Args []string
	// Env are environment variables
	Env []string
	// WorkDir is the working directory
	WorkDir string
	// Root is the root filesystem path (for chroot-based containers)
	Root string
	// Stdin is the stdin reader
	Stdin io.Reader
	// Stdout is the stdout writer
	Stdout io.Writer
	// Stderr is the stderr writer
	Stderr io.Writer
}

ExecConfig configures how to exec into a running container

type Hook

type Hook struct {
	// Path is the command to execute
	Path string `json:"path"`
	// Args are the command arguments
	Args []string `json:"args,omitempty"`
	// Env are environment variables
	Env []string `json:"env,omitempty"`
	// Timeout is the maximum time to wait for the hook
	Timeout time.Duration `json:"timeout,omitempty"`
}

Hook defines a lifecycle hook command

type HookType

type HookType string

HookType defines when a hook should be executed

const (
	HookPrestart        HookType = "prestart"
	HookCreateRuntime   HookType = "createRuntime"
	HookCreateContainer HookType = "createContainer"
	HookStartContainer  HookType = "startContainer"
	HookPoststart       HookType = "poststart"
	HookPoststop        HookType = "poststop"
)

type Hooks

type Hooks struct {
	Prestart        []Hook `json:"prestart,omitempty"`
	CreateRuntime   []Hook `json:"createRuntime,omitempty"`
	CreateContainer []Hook `json:"createContainer,omitempty"`
	StartContainer  []Hook `json:"startContainer,omitempty"`
	Poststart       []Hook `json:"poststart,omitempty"`
	Poststop        []Hook `json:"poststop,omitempty"`
}

Hooks defines all lifecycle hooks

type IOResources

type IOResources struct {
	// Weight is the I/O weight for fair scheduling (io.weight)
	// Range: 1-10000, default 100
	Weight uint64

	// Max specifies per-device I/O limits (io.max)
	// Key is "major:minor", value is the limit string
	// e.g., "8:0": "rbps=1048576 wbps=1048576 riops=1000 wiops=1000"
	Max map[string]string
}

IOResources defines I/O limits

type MemoryResources

type MemoryResources struct {
	// Max is the hard memory limit in bytes (memory.max)
	// Use -1 for unlimited
	Max int64

	// High is the memory throttling threshold in bytes (memory.high)
	// When exceeded, processes are throttled and put under heavy reclaim pressure
	// Use -1 for unlimited
	High int64

	// SwapMax is the swap limit in bytes (memory.swap.max)
	// Use -1 for unlimited, 0 to disable swap
	SwapMax int64
}

MemoryResources defines memory limits

type MemoryStats

type MemoryStats struct {
	Current uint64 // Current memory usage in bytes
	Peak    uint64 // Peak memory usage in bytes
}

type Mount

type Mount struct {
	Source string
	Target string
	Type   string
	Flags  uintptr
	Data   string
}

type Namespaces

type Namespaces struct {
	NewIPC  bool
	NewMnt  bool
	NewNet  bool
	NewPID  bool
	NewUTS  bool
	NewUser bool
}

func (Namespaces) CloneFlags

func (n Namespaces) CloneFlags() uintptr

type Network

type Network struct {
	// contains filtered or unexported fields
}

Network manages container networking

func SetupContainerNetwork

func SetupContainerNetwork(pid int, config NetworkConfig) (*Network, error)

SetupContainerNetwork sets up networking for a container. pid is the container's init process PID.

func (*Network) Cleanup

func (n *Network) Cleanup() error

Cleanup removes the network resources

type NetworkConfig

type NetworkConfig struct {
	// Mode is the networking mode
	Mode NetworkMode

	// Bridge is the bridge name to connect to (for bridge mode)
	// If empty, defaults to "container0"
	Bridge string

	// IPAddress is the container's IP address with CIDR (e.g., "10.0.0.2/24")
	// If empty in bridge mode, will need manual configuration
	IPAddress string

	// Gateway is the default gateway IP
	Gateway string

	// DNS is a list of DNS server IPs
	DNS []string

	// Hostname is the container hostname (also set via Config.Hostname)
	Hostname string

	// PortMappings defines port forwarding rules
	PortMappings []PortMapping
}

NetworkConfig defines the network configuration for a container

type NetworkMode

type NetworkMode string

NetworkMode defines the container networking mode

const (
	// NetworkModeNone - container has no network connectivity
	NetworkModeNone NetworkMode = "none"
	// NetworkModeHost - container shares host network namespace
	NetworkModeHost NetworkMode = "host"
	// NetworkModeBridge - container connects to a bridge network
	NetworkModeBridge NetworkMode = "bridge"
)

type PidsResources

type PidsResources struct {
	// Max is the maximum number of processes (pids.max)
	// Use -1 for unlimited
	Max int64
}

PidsResources defines process limits

type PidsStats

type PidsStats struct {
	Current uint64 // Current number of processes
}

type PortForwarder

type PortForwarder struct {
	// contains filtered or unexported fields
}

PortForwarder manages port forwarding rules using nftables

func NewPortForwarder

func NewPortForwarder(containerIP string, bridge string) (*PortForwarder, error)

NewPortForwarder creates a new port forwarder for a container

func (*PortForwarder) AddMapping

func (pf *PortForwarder) AddMapping(mapping PortMapping) error

AddMapping adds a port mapping

func (*PortForwarder) Cleanup

func (pf *PortForwarder) Cleanup() error

Cleanup removes all port forwarding rules for this container

func (*PortForwarder) RemoveMapping

func (pf *PortForwarder) RemoveMapping(mapping PortMapping) error

RemoveMapping removes a port mapping

type PortMapping

type PortMapping struct {
	// HostIP is the host IP to bind to (empty for all interfaces)
	HostIP string
	// HostPort is the port on the host
	HostPort uint16
	// ContainerPort is the port in the container
	ContainerPort uint16
	// Protocol is "tcp" or "udp"
	Protocol string
}

PortMapping defines a port forwarding rule

func ParsePortMapping

func ParsePortMapping(s string) (PortMapping, error)

ParsePortMapping parses a port mapping string like "8080:80" or "8080:80/tcp"

func PortMappingsFromStrings

func PortMappingsFromStrings(ss []string) ([]PortMapping, error)

PortMappingsFromStrings parses multiple port mapping strings

type Process

type Process struct {
	Cmd        string
	Args       []string
	WorkDir    string
	Env        []string
	InheritEnv bool
	Init       bool
	Credential *syscall.Credential
	Stdin      io.Reader `json:"-"`
	Stdout     io.Writer `json:"-"`
	Stderr     io.Writer `json:"-"`
	StdinPipe  bool      `json:"-"`
	StdoutPipe bool      `json:"-"`
	StderrPipe bool      `json:"-"`
}

type Resources

type Resources struct {
	// Memory limits
	Memory *MemoryResources

	// CPU limits
	CPU *CPUResources

	// Process limits
	Pids *PidsResources

	// I/O limits
	IO *IOResources
}

Resources defines resource limits for a container using cgroups v2

func DefaultResources

func DefaultResources() *Resources

DefaultResources returns a reasonable default resource configuration

type SeccompProfile

type SeccompProfile struct {
	// DefaultAction is the action for syscalls not in the rules
	DefaultAction seccomp.Action
	// Syscalls defines the syscall filtering rules
	Syscalls []seccomp.SyscallGroup
}

SeccompProfile defines the seccomp filtering configuration

func DefaultSeccompProfile

func DefaultSeccompProfile() *SeccompProfile

DefaultSeccompProfile returns a restrictive seccomp profile suitable for containers. It blocks dangerous syscalls while allowing normal container operation.

func (*SeccompProfile) MarshalJSON

func (p *SeccompProfile) MarshalJSON() ([]byte, error)

MarshalJSON implements json.Marshaler for SeccompProfile

func (*SeccompProfile) UnmarshalJSON

func (p *SeccompProfile) UnmarshalJSON(data []byte) error

UnmarshalJSON implements json.Unmarshaler for SeccompProfile

type SignalConfig

type SignalConfig struct {
	// StopSignal is the signal to send for graceful stop (default SIGTERM)
	StopSignal syscall.Signal
	// StopTimeout is how long to wait before sending SIGKILL
	StopTimeout time.Duration
	// ForwardSignals lists signals to forward to the container
	ForwardSignals []syscall.Signal
}

SignalConfig configures signal handling

func DefaultSignalConfig

func DefaultSignalConfig() SignalConfig

DefaultSignalConfig returns the default signal configuration

type SignalForwarder

type SignalForwarder struct {
	// contains filtered or unexported fields
}

SignalForwarder forwards signals to a process

func NewSignalForwarder

func NewSignalForwarder(pid int, signals []syscall.Signal) *SignalForwarder

NewSignalForwarder creates a new signal forwarder

func (*SignalForwarder) ForwardSignal

func (sf *SignalForwarder) ForwardSignal(sig syscall.Signal) error

ForwardSignal sends a signal to the process

func (*SignalForwarder) Start

func (sf *SignalForwarder) Start()

Start begins forwarding signals

func (*SignalForwarder) Stop

func (sf *SignalForwarder) Stop()

Stop stops forwarding signals

type State

type State string

State represents the container's current state

const (
	StateCreated State = "created"
	StateRunning State = "running"
	StateStopped State = "stopped"
)

type StateManager

type StateManager struct {
	// contains filtered or unexported fields
}

StateManager manages container state persistence

func NewStateManager

func NewStateManager(stateDir string) *StateManager

NewStateManager creates a new state manager

func (*StateManager) DeleteState

func (sm *StateManager) DeleteState(id string) error

DeleteState removes container state from disk

func (*StateManager) ListStates

func (sm *StateManager) ListStates() ([]*ContainerState, error)

ListStates returns all container states

func (*StateManager) LoadState

func (sm *StateManager) LoadState(id string) (*ContainerState, error)

LoadState loads container state from disk

func (*StateManager) SaveState

func (sm *StateManager) SaveState(state *ContainerState) error

SaveState saves container state to disk

Directories

Path Synopsis

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL