/*
Hybrid Isolation - Security and resource isolation for Volt hybrid-native containers.

Configures:
  - Landlock LSM policy generation (NEVER AppArmor)
  - Seccomp profile selection (strict/default/unconfined)
  - Cgroups v2 resource limits (memory, CPU, I/O, PIDs)
  - Network namespace setup (private network stack)

Copyright (c) Armored Gates LLC. All rights reserved.
*/
package hybrid

import (
	"fmt"
	"path/filepath"
	"strings"
)

// ── Seccomp Profiles ─────────────────────────────────────────────────────────

// SeccompProfile selects the syscall filtering level for a container.
// The value is a plain string so it can be compared against the constants
// below; IsolationConfig.NspawnArgs switches on it to pick nspawn flags.
type SeccompProfile string

const (
	// SeccompStrict blocks dangerous syscalls and limits the container to a
	// safe subset. Suitable for untrusted workloads.
	//
	// NOTE(review): in IsolationConfig.NspawnArgs this level currently only
	// adds "--drop-capability=all" on top of nspawn's built-in filter; no
	// dedicated syscall allow-list is installed by this file.
	SeccompStrict SeccompProfile = "strict"

	// SeccompDefault applies the systemd-nspawn default seccomp filter which
	// blocks mount, reboot, kexec, and other admin syscalls. No extra nspawn
	// flags are emitted for this level.
	SeccompDefault SeccompProfile = "default"

	// SeccompUnconfined disables seccomp filtering entirely. Use only for
	// trusted workloads that need full syscall access (e.g. nested containers).
	SeccompUnconfined SeccompProfile = "unconfined"
)

// ── Landlock Policy ──────────────────────────────────────────────────────────

// LandlockAccess defines the bitfield of allowed filesystem operations.
// These mirror the LANDLOCK_ACCESS_FS_* constants from the kernel ABI.
type LandlockAccess uint64

// Individual filesystem access rights, one bit each.
//
// The bit position of every constant is its index in this block (Execute is
// bit 0, Truncate is bit 14), so entries must never be reordered or removed.
const (
	LandlockAccessFSExecute LandlockAccess = 1 << iota
	LandlockAccessFSWriteFile
	LandlockAccessFSReadFile
	LandlockAccessFSReadDir
	LandlockAccessFSRemoveDir
	LandlockAccessFSRemoveFile
	LandlockAccessFSMakeChar
	LandlockAccessFSMakeDir
	LandlockAccessFSMakeReg
	LandlockAccessFSMakeSock
	LandlockAccessFSMakeFifo
	LandlockAccessFSMakeBlock
	LandlockAccessFSMakeSym
	LandlockAccessFSRefer
	LandlockAccessFSTruncate
)

// Convenience combinations of the individual access bits.
const (
	// LandlockReadOnly permits reading files and listing directories.
	LandlockReadOnly = LandlockAccessFSReadFile | LandlockAccessFSReadDir

	// LandlockReadWrite extends LandlockReadOnly with writing and truncating
	// files, creating regular files and directories, and removing files and
	// directories. It does not include execute, device/socket/fifo/symlink
	// creation, or refer.
	LandlockReadWrite = LandlockReadOnly |
		LandlockAccessFSWriteFile |
		LandlockAccessFSMakeReg |
		LandlockAccessFSMakeDir |
		LandlockAccessFSRemoveFile |
		LandlockAccessFSRemoveDir |
		LandlockAccessFSTruncate

	// LandlockReadExec extends LandlockReadOnly with execute permission.
	LandlockReadExec = LandlockReadOnly | LandlockAccessFSExecute
)

// LandlockRule grants the Access mask on the filesystem hierarchy at Path.
type LandlockRule struct {
	Path   string
	Access LandlockAccess
}

// LandlockPolicy is the ordered list of Landlock rules applied to a container.
type LandlockPolicy struct {
	Rules []LandlockRule
}

// ServerPolicy returns a Landlock policy for server/service workloads.
// Under rootfs it grants read+execute on usr, lib, lib64, bin and sbin,
// read-only on etc, and read-write on app, tmp, var and run.
func ServerPolicy(rootfs string) *LandlockPolicy {
	// grant builds one rule for a directory located directly under rootfs.
	grant := func(dir string, access LandlockAccess) LandlockRule {
		return LandlockRule{Path: filepath.Join(rootfs, dir), Access: access}
	}

	policy := new(LandlockPolicy)
	policy.Rules = []LandlockRule{
		// System binaries and libraries: readable and executable, never writable.
		grant("usr", LandlockReadExec),
		grant("lib", LandlockReadExec),
		grant("lib64", LandlockReadExec),
		grant("bin", LandlockReadExec),
		grant("sbin", LandlockReadExec),
		// Configuration is read-only for services.
		grant("etc", LandlockReadOnly),
		// Application and runtime state: read-write, no execute.
		grant("app", LandlockReadWrite),
		grant("tmp", LandlockReadWrite),
		grant("var", LandlockReadWrite),
		grant("run", LandlockReadWrite),
	}
	return policy
}

// DesktopPolicy returns a Landlock policy for desktop/interactive workloads.
// Compared with ServerPolicy it makes etc read-write, grants read-write plus
// execute on home, adds read+execute on opt, and has no rule for app.
func DesktopPolicy(rootfs string) *LandlockPolicy {
	// grant builds one rule for a directory located directly under rootfs.
	grant := func(dir string, access LandlockAccess) LandlockRule {
		return LandlockRule{Path: filepath.Join(rootfs, dir), Access: access}
	}

	policy := new(LandlockPolicy)
	policy.Rules = []LandlockRule{
		// System binaries and libraries: readable and executable, never writable.
		grant("usr", LandlockReadExec),
		grant("lib", LandlockReadExec),
		grant("lib64", LandlockReadExec),
		grant("bin", LandlockReadExec),
		grant("sbin", LandlockReadExec),
		// Interactive users may edit configuration.
		grant("etc", LandlockReadWrite),
		// Home directories are the only writable location that is also executable.
		grant("home", LandlockReadWrite|LandlockAccessFSExecute),
		grant("tmp", LandlockReadWrite),
		grant("var", LandlockReadWrite),
		grant("run", LandlockReadWrite),
		// Optional software: readable and executable.
		grant("opt", LandlockReadExec),
	}
	return policy
}

// ── Cgroups v2 Resource Limits ───────────────────────────────────────────────

// ResourceLimits configures cgroups v2 resource constraints for
// a container.
//
// Zero values mean "do not emit the corresponding systemd property"; see
// SystemdProperties for the exact mapping.
type ResourceLimits struct {
	// MemoryHard is the hard memory limit (memory.max), e.g. "512M" or "2G".
	// The container is OOM-killed above this. Empty omits MemoryMax=.
	MemoryHard string

	// MemorySoft is the throttling threshold (memory.high). Empty omits
	// MemoryHigh=.
	MemorySoft string

	// CPUWeight is the proportional CPU share (cpu.weight, 1-10000,
	// default 100). Values <= 0 omit CPUWeight=.
	CPUWeight int

	// CPUSet pins the container to specific cores (cpuset.cpus), e.g. "0-3"
	// or "0,2". Empty means no pinning.
	CPUSet string

	// IOWeight is the proportional I/O share (io.weight, 1-10000,
	// default 100). Values <= 0 omit IOWeight=.
	IOWeight int

	// PIDsMax is the maximum number of processes (pids.max). Values <= 0
	// omit TasksMax=, leaving whatever default systemd applies.
	// NOTE(review): that default is not necessarily "unlimited" — confirm
	// whether an explicit TasksMax=infinity is wanted for 0.
	PIDsMax int
}

// DefaultResourceLimits returns conservative defaults suitable for most
// workloads: 2G hard / 1G soft memory, default CPU and I/O weights, no CPU
// pinning, and at most 4096 tasks.
func DefaultResourceLimits() *ResourceLimits {
	return &ResourceLimits{
		MemoryHard: "2G",
		MemorySoft: "1G",
		CPUWeight:  100,
		CPUSet:     "", // no pinning
		IOWeight:   100,
		PIDsMax:    4096,
	}
}

// SystemdProperties converts ResourceLimits into systemd unit properties
// suitable for passing to systemd-run or systemd-nspawn via --property=.
//
// The result always starts with "Delegate=yes"; every other property is
// emitted only when its field is set (non-empty string or positive number).
// A nil receiver is treated as "no limits configured" and yields only the
// delegation property instead of panicking.
func (r *ResourceLimits) SystemdProperties() []string {
	// Cgroups v2 delegation is always enabled for hybrid containers.
	props := []string{"Delegate=yes"}
	if r == nil {
		return props
	}

	if r.MemoryHard != "" {
		props = append(props, fmt.Sprintf("MemoryMax=%s", r.MemoryHard))
	}
	if r.MemorySoft != "" {
		props = append(props, fmt.Sprintf("MemoryHigh=%s", r.MemorySoft))
	}
	if r.CPUWeight > 0 {
		props = append(props, fmt.Sprintf("CPUWeight=%d", r.CPUWeight))
	}
	if r.CPUSet != "" {
		props = append(props, fmt.Sprintf("AllowedCPUs=%s", r.CPUSet))
	}
	if r.IOWeight > 0 {
		props = append(props, fmt.Sprintf("IOWeight=%d", r.IOWeight))
	}
	if r.PIDsMax > 0 {
		props = append(props, fmt.Sprintf("TasksMax=%d", r.PIDsMax))
	}
	return props
}

// ── Network Isolation ────────────────────────────────────────────────────────

// NetworkMode selects the container network configuration.
type NetworkMode string

const (
	// NetworkPrivate creates a fully isolated network namespace with a veth
	// pair connected to the host bridge (voltbr0). The container gets its own
	// IP stack, routing table, and firewall rules.
	NetworkPrivate NetworkMode = "private"

	// NetworkHost shares the host network namespace. The container sees all
	// host interfaces and ports. Use only for trusted system services.
	NetworkHost NetworkMode = "host"

	// NetworkNone creates an isolated network namespace with no external
	// connectivity. Loopback only.
	NetworkNone NetworkMode = "none"
)

// NetworkConfig holds the network isolation settings for a container.
type NetworkConfig struct {
	// Mode selects the network setup. Unrecognised values (including the
	// empty string) are handled by NspawnNetworkArgs as a private network on
	// the default bridge.
	Mode NetworkMode

	// Bridge is the bridge name for private mode. Empty means the default,
	// "voltbr0".
	Bridge string

	// PortForwards maps host ports to container ports when Mode is
	// NetworkPrivate. It is ignored in every other mode.
	PortForwards []PortForward

	// DNS servers to inject into the container's resolv.conf.
	DNS []string
}

// PortForward maps a single host port to a container port.
type PortForward struct {
	HostPort      int
	ContainerPort int
	Protocol      string // "tcp" or "udp"; empty is treated as "tcp"
}

// DefaultNetworkConfig returns a private-network configuration with the
// standard Volt bridge and Cloudflare DNS resolvers.
func DefaultNetworkConfig() *NetworkConfig {
	return &NetworkConfig{
		Mode:   NetworkPrivate,
		Bridge: "voltbr0",
		DNS:    []string{"1.1.1.1", "1.0.0.1"},
	}
}

// NspawnNetworkArgs returns the systemd-nspawn arguments for this network
// configuration.
//
//   - NetworkPrivate: "--network-bridge=<bridge>" followed by one
//     "--port=<proto>:<host>:<container>" per port forward. An empty Bridge
//     falls back to "voltbr0" and an empty Protocol to "tcp".
//   - NetworkHost: nil, because the absence of network flags shares the host
//     namespace.
//   - NetworkNone: "--private-network".
//   - Any other mode, or a nil receiver: a private network on "voltbr0"
//     without port forwards, so a misconfiguration never silently exposes
//     the host network.
func (n *NetworkConfig) NspawnNetworkArgs() []string {
	const fallbackBridge = "voltbr0"

	if n == nil {
		return []string{"--network-bridge=" + fallbackBridge}
	}

	switch n.Mode {
	case NetworkPrivate:
		// An empty bridge name would produce the invalid flag
		// "--network-bridge=", so apply the documented default.
		bridge := n.Bridge
		if bridge == "" {
			bridge = fallbackBridge
		}

		args := make([]string, 0, 1+len(n.PortForwards))
		args = append(args, "--network-bridge="+bridge)
		for _, pf := range n.PortForwards {
			proto := pf.Protocol
			if proto == "" {
				proto = "tcp"
			}
			args = append(args, fmt.Sprintf("--port=%s:%d:%d", proto, pf.HostPort, pf.ContainerPort))
		}
		return args
	case NetworkHost:
		return nil // no network flags = share host namespace
	case NetworkNone:
		return []string{"--private-network"}
	default:
		return []string{"--network-bridge=" + fallbackBridge}
	}
}

// ── Isolation Profile ────────────────────────────────────────────────────────

// IsolationConfig combines all isolation settings for a hybrid container.
type IsolationConfig struct { Landlock *LandlockPolicy Seccomp SeccompProfile Resources *ResourceLimits Network *NetworkConfig // PrivateUsers enables user namespace isolation (--private-users). PrivateUsers bool // ReadOnlyFS mounts the rootfs as read-only (--read-only). ReadOnlyFS bool } // DefaultIsolation returns a security-first isolation configuration suitable // for production workloads. func DefaultIsolation(rootfs string) *IsolationConfig { return &IsolationConfig{ Landlock: ServerPolicy(rootfs), Seccomp: SeccompDefault, Resources: DefaultResourceLimits(), Network: DefaultNetworkConfig(), PrivateUsers: true, ReadOnlyFS: false, } } // NspawnArgs returns the complete set of systemd-nspawn arguments for this // isolation configuration. These are appended to the base nspawn command. func (iso *IsolationConfig) NspawnArgs() []string { var args []string // Resource limits and cgroup delegation via --property. for _, prop := range iso.Resources.SystemdProperties() { args = append(args, "--property="+prop) } // Seccomp profile. switch iso.Seccomp { case SeccompStrict: // systemd-nspawn applies its default filter automatically. // For strict mode we add --capability=drop-all to further limit. args = append(args, "--drop-capability=all") case SeccompDefault: // Use nspawn's built-in seccomp filter — no extra flags needed. case SeccompUnconfined: // Disable the built-in seccomp filter for trusted workloads. args = append(args, "--system-call-filter=~") } // Network isolation. args = append(args, iso.Network.NspawnNetworkArgs()...) // User namespace isolation. if iso.PrivateUsers { args = append(args, "--private-users=pick") } // Read-only rootfs. if iso.ReadOnlyFS { args = append(args, "--read-only") } return args } // NspawnConfigBlock returns the .nspawn file content sections for this // isolation configuration. Written to /etc/systemd/nspawn/.nspawn. 
func (iso *IsolationConfig) NspawnConfigBlock(name string) string { var b strings.Builder // [Exec] section b.WriteString("[Exec]\n") b.WriteString("Boot=yes\n") b.WriteString("PrivateUsers=") if iso.PrivateUsers { b.WriteString("pick\n") } else { b.WriteString("no\n") } // Environment setup. b.WriteString(fmt.Sprintf("Environment=VOLT_CONTAINER=%s\n", name)) b.WriteString("Environment=VOLT_RUNTIME=hybrid\n") b.WriteString("\n") // [Network] section b.WriteString("[Network]\n") switch iso.Network.Mode { case NetworkPrivate: b.WriteString(fmt.Sprintf("Bridge=%s\n", iso.Network.Bridge)) case NetworkNone: b.WriteString("Private=yes\n") case NetworkHost: // No network section needed for host mode. } b.WriteString("\n") // [ResourceControl] section (selected limits for the .nspawn file). b.WriteString("[ResourceControl]\n") if iso.Resources.MemoryHard != "" { b.WriteString(fmt.Sprintf("MemoryMax=%s\n", iso.Resources.MemoryHard)) } if iso.Resources.PIDsMax > 0 { b.WriteString(fmt.Sprintf("TasksMax=%d\n", iso.Resources.PIDsMax)) } return b.String() }