Complete infrastructure platform CLI: - Container runtime (systemd-nspawn) - VoltVisor VMs (Neutron Stardust / QEMU) - Stellarium CAS (content-addressed storage) - ORAS Registry - GitOps integration - Landlock LSM security - Compose orchestration - Mesh networking Copyright (c) Armored Gates LLC. All rights reserved. Licensed under AGPSL v5.0
367 lines
13 KiB
Go
367 lines
13 KiB
Go
/*
Hybrid Isolation - Security and resource isolation for Volt hybrid-native containers.

Configures:
  - Landlock LSM policy generation (NEVER AppArmor)
  - Seccomp profile selection (strict/default/unconfined)
  - Cgroups v2 resource limits (memory, CPU, I/O, PIDs)
  - Network namespace setup (private network stack)

Copyright (c) Armored Gates LLC. All rights reserved.
*/
|
|
package hybrid
|
|
|
|
import (
|
|
"fmt"
|
|
"path/filepath"
|
|
"strings"
|
|
)
|
|
|
|
// ── Seccomp Profiles ─────────────────────────────────────────────────────────
|
|
|
|
// SeccompProfile names the syscall-filtering level requested for a container.
type SeccompProfile string

// Supported seccomp profiles.
const (
	// SeccompStrict is the tightest level, meant for untrusted workloads:
	// dangerous syscalls are blocked and the container is limited to a safe
	// subset.
	SeccompStrict = SeccompProfile("strict")

	// SeccompDefault relies on the filter systemd-nspawn installs by default,
	// which blocks mount, reboot, kexec, and other admin syscalls.
	SeccompDefault = SeccompProfile("default")

	// SeccompUnconfined turns seccomp filtering off entirely. Reserve it for
	// trusted workloads that need full syscall access (e.g. nested
	// containers).
	SeccompUnconfined = SeccompProfile("unconfined")
)
|
|
|
|
// ── Landlock Policy ──────────────────────────────────────────────────────────
|
|
|
|
// LandlockAccess is a bitmask of permitted filesystem operations. Each bit
// mirrors the LANDLOCK_ACCESS_FS_* constant of the same name in the kernel ABI.
type LandlockAccess uint64

// Individual access rights. Declaration order fixes the bit position of each
// right (bit 0 through bit 14), so entries must not be reordered or removed.
const (
	LandlockAccessFSExecute    LandlockAccess = 1 << iota // bit 0
	LandlockAccessFSWriteFile                             // bit 1
	LandlockAccessFSReadFile                              // bit 2
	LandlockAccessFSReadDir                               // bit 3
	LandlockAccessFSRemoveDir                             // bit 4
	LandlockAccessFSRemoveFile                            // bit 5
	LandlockAccessFSMakeChar                              // bit 6
	LandlockAccessFSMakeDir                               // bit 7
	LandlockAccessFSMakeReg                               // bit 8
	LandlockAccessFSMakeSock                              // bit 9
	LandlockAccessFSMakeFifo                              // bit 10
	LandlockAccessFSMakeBlock                             // bit 11
	LandlockAccessFSMakeSym                               // bit 12
	LandlockAccessFSRefer                                 // bit 13
	LandlockAccessFSTruncate                              // bit 14
)

// Convenience combinations.
const (
	// LandlockReadOnly permits reading files and listing directories.
	LandlockReadOnly = LandlockAccessFSReadFile | LandlockAccessFSReadDir

	// LandlockReadWrite extends LandlockReadOnly with writing and truncating
	// files and with creating and removing regular files and directories.
	// It does not include the MakeChar, MakeSock, MakeFifo, MakeBlock,
	// MakeSym, or Refer rights.
	LandlockReadWrite = LandlockReadOnly |
		LandlockAccessFSWriteFile | LandlockAccessFSTruncate |
		LandlockAccessFSMakeReg | LandlockAccessFSMakeDir |
		LandlockAccessFSRemoveFile | LandlockAccessFSRemoveDir

	// LandlockReadExec extends LandlockReadOnly with execute permission.
	LandlockReadExec = LandlockReadOnly | LandlockAccessFSExecute
)
|
|
|
|
// LandlockRule maps a filesystem path to the permitted access mask.
|
|
type LandlockRule struct {
|
|
Path string
|
|
Access LandlockAccess
|
|
}
|
|
|
|
// LandlockPolicy is an ordered set of Landlock rules for a container.
|
|
type LandlockPolicy struct {
|
|
Rules []LandlockRule
|
|
}
|
|
|
|
// ServerPolicy returns a Landlock policy for server/service workloads.
|
|
// Allows execution from /usr and /lib, read-write to /app, /tmp, /var.
|
|
func ServerPolicy(rootfs string) *LandlockPolicy {
|
|
return &LandlockPolicy{
|
|
Rules: []LandlockRule{
|
|
{Path: filepath.Join(rootfs, "usr"), Access: LandlockReadExec},
|
|
{Path: filepath.Join(rootfs, "lib"), Access: LandlockReadOnly | LandlockAccessFSExecute},
|
|
{Path: filepath.Join(rootfs, "lib64"), Access: LandlockReadOnly | LandlockAccessFSExecute},
|
|
{Path: filepath.Join(rootfs, "bin"), Access: LandlockReadExec},
|
|
{Path: filepath.Join(rootfs, "sbin"), Access: LandlockReadExec},
|
|
{Path: filepath.Join(rootfs, "etc"), Access: LandlockReadOnly},
|
|
{Path: filepath.Join(rootfs, "app"), Access: LandlockReadWrite},
|
|
{Path: filepath.Join(rootfs, "tmp"), Access: LandlockReadWrite},
|
|
{Path: filepath.Join(rootfs, "var"), Access: LandlockReadWrite},
|
|
{Path: filepath.Join(rootfs, "run"), Access: LandlockReadWrite},
|
|
},
|
|
}
|
|
}
|
|
|
|
// DesktopPolicy returns a Landlock policy for desktop/interactive workloads.
|
|
// More permissive than ServerPolicy: full home access, /var write access.
|
|
func DesktopPolicy(rootfs string) *LandlockPolicy {
|
|
return &LandlockPolicy{
|
|
Rules: []LandlockRule{
|
|
{Path: filepath.Join(rootfs, "usr"), Access: LandlockReadExec},
|
|
{Path: filepath.Join(rootfs, "lib"), Access: LandlockReadOnly | LandlockAccessFSExecute},
|
|
{Path: filepath.Join(rootfs, "lib64"), Access: LandlockReadOnly | LandlockAccessFSExecute},
|
|
{Path: filepath.Join(rootfs, "bin"), Access: LandlockReadExec},
|
|
{Path: filepath.Join(rootfs, "sbin"), Access: LandlockReadExec},
|
|
{Path: filepath.Join(rootfs, "etc"), Access: LandlockReadWrite},
|
|
{Path: filepath.Join(rootfs, "home"), Access: LandlockReadWrite | LandlockAccessFSExecute},
|
|
{Path: filepath.Join(rootfs, "tmp"), Access: LandlockReadWrite},
|
|
{Path: filepath.Join(rootfs, "var"), Access: LandlockReadWrite},
|
|
{Path: filepath.Join(rootfs, "run"), Access: LandlockReadWrite},
|
|
{Path: filepath.Join(rootfs, "opt"), Access: LandlockReadExec},
|
|
},
|
|
}
|
|
}
|
|
|
|
// ── Cgroups v2 Resource Limits ───────────────────────────────────────────────
|
|
|
|
// ResourceLimits describes the cgroups v2 resource constraints applied to a
// container.
type ResourceLimits struct {
	// MemoryHard is the memory.max hard limit (e.g. "512M", "2G"); usage
	// above it is OOM-killed. Empty means unlimited.
	MemoryHard string
	// MemorySoft is the memory.high threshold (e.g. "512M", "2G"); usage
	// above it is throttled (soft pressure). Empty means unlimited.
	MemorySoft string

	// CPUWeight is the cpu.weight proportional share (1-10000, default 100).
	CPUWeight int
	// CPUSet is the cpuset.cpus list (e.g. "0-3", "0,2") used to pin the
	// container to specific cores.
	CPUSet string

	// IOWeight is the io.weight proportional share (1-10000, default 100).
	IOWeight int

	// PIDsMax is the pids.max cap on the number of processes. 0 means
	// unlimited.
	PIDsMax int
}
|
|
|
|
// DefaultResourceLimits returns conservative defaults suitable for most workloads.
|
|
func DefaultResourceLimits() *ResourceLimits {
|
|
return &ResourceLimits{
|
|
MemoryHard: "2G",
|
|
MemorySoft: "1G",
|
|
CPUWeight: 100,
|
|
CPUSet: "", // no pinning
|
|
IOWeight: 100,
|
|
PIDsMax: 4096,
|
|
}
|
|
}
|
|
|
|
// SystemdProperties converts ResourceLimits into systemd unit properties
|
|
// suitable for passing to systemd-run or systemd-nspawn via --property=.
|
|
func (r *ResourceLimits) SystemdProperties() []string {
|
|
var props []string
|
|
|
|
// Cgroups v2 delegation is always enabled for hybrid containers.
|
|
props = append(props, "Delegate=yes")
|
|
|
|
if r.MemoryHard != "" {
|
|
props = append(props, fmt.Sprintf("MemoryMax=%s", r.MemoryHard))
|
|
}
|
|
if r.MemorySoft != "" {
|
|
props = append(props, fmt.Sprintf("MemoryHigh=%s", r.MemorySoft))
|
|
}
|
|
if r.CPUWeight > 0 {
|
|
props = append(props, fmt.Sprintf("CPUWeight=%d", r.CPUWeight))
|
|
}
|
|
if r.CPUSet != "" {
|
|
props = append(props, fmt.Sprintf("AllowedCPUs=%s", r.CPUSet))
|
|
}
|
|
if r.IOWeight > 0 {
|
|
props = append(props, fmt.Sprintf("IOWeight=%d", r.IOWeight))
|
|
}
|
|
if r.PIDsMax > 0 {
|
|
props = append(props, fmt.Sprintf("TasksMax=%d", r.PIDsMax))
|
|
}
|
|
|
|
return props
|
|
}
|
|
|
|
// ── Network Isolation ────────────────────────────────────────────────────────
|
|
|
|
// NetworkMode names the network configuration applied to a container.
type NetworkMode string

// Supported network modes.
const (
	// NetworkPrivate gives the container a fully isolated network namespace
	// whose veth pair is attached to the host bridge (voltbr0). The container
	// has its own IP stack, routing table, and firewall rules.
	NetworkPrivate = NetworkMode("private")

	// NetworkHost shares the host network namespace, so the container sees
	// every host interface and port. Use only for trusted system services.
	NetworkHost = NetworkMode("host")

	// NetworkNone gives the container an isolated network namespace without
	// external connectivity. Loopback only.
	NetworkNone = NetworkMode("none")
)
|
|
|
|
// NetworkConfig holds the network isolation settings for a container.
|
|
type NetworkConfig struct {
|
|
Mode NetworkMode
|
|
Bridge string // bridge name for private mode (default: "voltbr0")
|
|
|
|
// PortForwards maps host ports to container ports when Mode is NetworkPrivate.
|
|
PortForwards []PortForward
|
|
|
|
// DNS servers to inject into the container's resolv.conf.
|
|
DNS []string
|
|
}
|
|
|
|
// PortForward describes one host-port to container-port mapping.
type PortForward struct {
	// HostPort is the port exposed on the host.
	HostPort int
	// ContainerPort is the port inside the container that receives traffic.
	ContainerPort int
	// Protocol is "tcp" or "udp".
	Protocol string
}
|
|
|
|
// DefaultNetworkConfig returns a private-network configuration with the
|
|
// standard Volt bridge.
|
|
func DefaultNetworkConfig() *NetworkConfig {
|
|
return &NetworkConfig{
|
|
Mode: NetworkPrivate,
|
|
Bridge: "voltbr0",
|
|
DNS: []string{"1.1.1.1", "1.0.0.1"},
|
|
}
|
|
}
|
|
|
|
// NspawnNetworkArgs returns the systemd-nspawn arguments for this network
|
|
// configuration.
|
|
func (n *NetworkConfig) NspawnNetworkArgs() []string {
|
|
switch n.Mode {
|
|
case NetworkPrivate:
|
|
args := []string{"--network-bridge=" + n.Bridge}
|
|
for _, pf := range n.PortForwards {
|
|
proto := pf.Protocol
|
|
if proto == "" {
|
|
proto = "tcp"
|
|
}
|
|
args = append(args, fmt.Sprintf("--port=%s:%d:%d", proto, pf.HostPort, pf.ContainerPort))
|
|
}
|
|
return args
|
|
case NetworkHost:
|
|
return nil // no network flags = share host namespace
|
|
case NetworkNone:
|
|
return []string{"--private-network"}
|
|
default:
|
|
return []string{"--network-bridge=voltbr0"}
|
|
}
|
|
}
|
|
|
|
// ── Isolation Profile ────────────────────────────────────────────────────────
|
|
|
|
// IsolationConfig combines all isolation settings for a hybrid container.
|
|
type IsolationConfig struct {
|
|
Landlock *LandlockPolicy
|
|
Seccomp SeccompProfile
|
|
Resources *ResourceLimits
|
|
Network *NetworkConfig
|
|
|
|
// PrivateUsers enables user namespace isolation (--private-users).
|
|
PrivateUsers bool
|
|
|
|
// ReadOnlyFS mounts the rootfs as read-only (--read-only).
|
|
ReadOnlyFS bool
|
|
}
|
|
|
|
// DefaultIsolation returns a security-first isolation configuration suitable
|
|
// for production workloads.
|
|
func DefaultIsolation(rootfs string) *IsolationConfig {
|
|
return &IsolationConfig{
|
|
Landlock: ServerPolicy(rootfs),
|
|
Seccomp: SeccompDefault,
|
|
Resources: DefaultResourceLimits(),
|
|
Network: DefaultNetworkConfig(),
|
|
PrivateUsers: true,
|
|
ReadOnlyFS: false,
|
|
}
|
|
}
|
|
|
|
// NspawnArgs returns the complete set of systemd-nspawn arguments for this
|
|
// isolation configuration. These are appended to the base nspawn command.
|
|
func (iso *IsolationConfig) NspawnArgs() []string {
|
|
var args []string
|
|
|
|
// Resource limits and cgroup delegation via --property.
|
|
for _, prop := range iso.Resources.SystemdProperties() {
|
|
args = append(args, "--property="+prop)
|
|
}
|
|
|
|
// Seccomp profile.
|
|
switch iso.Seccomp {
|
|
case SeccompStrict:
|
|
// systemd-nspawn applies its default filter automatically.
|
|
// For strict mode we add --capability=drop-all to further limit.
|
|
args = append(args, "--drop-capability=all")
|
|
case SeccompDefault:
|
|
// Use nspawn's built-in seccomp filter — no extra flags needed.
|
|
case SeccompUnconfined:
|
|
// Disable the built-in seccomp filter for trusted workloads.
|
|
args = append(args, "--system-call-filter=~")
|
|
}
|
|
|
|
// Network isolation.
|
|
args = append(args, iso.Network.NspawnNetworkArgs()...)
|
|
|
|
// User namespace isolation.
|
|
if iso.PrivateUsers {
|
|
args = append(args, "--private-users=pick")
|
|
}
|
|
|
|
// Read-only rootfs.
|
|
if iso.ReadOnlyFS {
|
|
args = append(args, "--read-only")
|
|
}
|
|
|
|
return args
|
|
}
|
|
|
|
// NspawnConfigBlock returns the .nspawn file content sections for this
|
|
// isolation configuration. Written to /etc/systemd/nspawn/<name>.nspawn.
|
|
func (iso *IsolationConfig) NspawnConfigBlock(name string) string {
|
|
var b strings.Builder
|
|
|
|
// [Exec] section
|
|
b.WriteString("[Exec]\n")
|
|
b.WriteString("Boot=yes\n")
|
|
b.WriteString("PrivateUsers=")
|
|
if iso.PrivateUsers {
|
|
b.WriteString("pick\n")
|
|
} else {
|
|
b.WriteString("no\n")
|
|
}
|
|
|
|
// Environment setup.
|
|
b.WriteString(fmt.Sprintf("Environment=VOLT_CONTAINER=%s\n", name))
|
|
b.WriteString("Environment=VOLT_RUNTIME=hybrid\n")
|
|
|
|
b.WriteString("\n")
|
|
|
|
// [Network] section
|
|
b.WriteString("[Network]\n")
|
|
switch iso.Network.Mode {
|
|
case NetworkPrivate:
|
|
b.WriteString(fmt.Sprintf("Bridge=%s\n", iso.Network.Bridge))
|
|
case NetworkNone:
|
|
b.WriteString("Private=yes\n")
|
|
case NetworkHost:
|
|
// No network section needed for host mode.
|
|
}
|
|
|
|
b.WriteString("\n")
|
|
|
|
// [ResourceControl] section (selected limits for the .nspawn file).
|
|
b.WriteString("[ResourceControl]\n")
|
|
if iso.Resources.MemoryHard != "" {
|
|
b.WriteString(fmt.Sprintf("MemoryMax=%s\n", iso.Resources.MemoryHard))
|
|
}
|
|
if iso.Resources.PIDsMax > 0 {
|
|
b.WriteString(fmt.Sprintf("TasksMax=%d\n", iso.Resources.PIDsMax))
|
|
}
|
|
|
|
return b.String()
|
|
}
|