Volt CLI: source-available under AGPSL v5.0
Complete infrastructure platform CLI: - Container runtime (systemd-nspawn) - VoltVisor VMs (Neutron Stardust / QEMU) - Stellarium CAS (content-addressed storage) - ORAS Registry - GitOps integration - Landlock LSM security - Compose orchestration - Mesh networking Copyright (c) Armored Gates LLC. All rights reserved. Licensed under AGPSL v5.0
This commit is contained in:
366
pkg/backend/hybrid/isolation.go
Normal file
366
pkg/backend/hybrid/isolation.go
Normal file
@@ -0,0 +1,366 @@
|
||||
/*
Hybrid Isolation - Security and resource isolation for Volt hybrid-native containers.

Configures:
  - Landlock LSM policy generation (NEVER AppArmor)
  - Seccomp profile selection (strict/default/unconfined)
  - Cgroups v2 resource limits (memory, CPU, I/O, PIDs)
  - Network namespace setup (private network stack)

Copyright (c) Armored Gates LLC. All rights reserved.
*/
|
||||
package hybrid
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// ── Seccomp Profiles ─────────────────────────────────────────────────────────
|
||||
|
||||
// SeccompProfile selects the syscall filtering level for a container.
type SeccompProfile string

// Supported seccomp profiles, listed from most to least restrictive.
const (
	// SeccompStrict blocks dangerous syscalls and limits the container to a
	// safe subset. Suitable for untrusted workloads.
	SeccompStrict SeccompProfile = "strict"

	// SeccompDefault applies the systemd-nspawn default seccomp filter which
	// blocks mount, reboot, kexec, and other admin syscalls.
	SeccompDefault SeccompProfile = "default"

	// SeccompUnconfined disables seccomp filtering entirely. Use only for
	// trusted workloads that need full syscall access (e.g. nested containers).
	SeccompUnconfined SeccompProfile = "unconfined"
)
|
||||
|
||||
// ── Landlock Policy ──────────────────────────────────────────────────────────
|
||||
|
||||
// LandlockAccess defines the bitfield of allowed filesystem operations.
// These mirror the LANDLOCK_ACCESS_FS_* constants from the kernel ABI, so the
// bit positions below must not be renumbered.
type LandlockAccess uint64

const (
	LandlockAccessFSExecute    LandlockAccess = 1 << 0  // execute a file
	LandlockAccessFSWriteFile  LandlockAccess = 1 << 1  // open a file for writing
	LandlockAccessFSReadFile   LandlockAccess = 1 << 2  // open a file for reading
	LandlockAccessFSReadDir    LandlockAccess = 1 << 3  // list a directory
	LandlockAccessFSRemoveDir  LandlockAccess = 1 << 4  // remove a directory
	LandlockAccessFSRemoveFile LandlockAccess = 1 << 5  // unlink a file
	LandlockAccessFSMakeChar   LandlockAccess = 1 << 6  // create a character device
	LandlockAccessFSMakeDir    LandlockAccess = 1 << 7  // create a directory
	LandlockAccessFSMakeReg    LandlockAccess = 1 << 8  // create a regular file
	LandlockAccessFSMakeSock   LandlockAccess = 1 << 9  // create a UNIX socket
	LandlockAccessFSMakeFifo   LandlockAccess = 1 << 10 // create a named pipe
	LandlockAccessFSMakeBlock  LandlockAccess = 1 << 11 // create a block device
	LandlockAccessFSMakeSym    LandlockAccess = 1 << 12 // create a symbolic link
	LandlockAccessFSRefer      LandlockAccess = 1 << 13 // link/rename across directories
	LandlockAccessFSTruncate   LandlockAccess = 1 << 14 // truncate a file

	// Convenience combinations.

	// LandlockReadOnly permits reading files and listing directories.
	LandlockReadOnly = LandlockAccessFSReadFile | LandlockAccessFSReadDir

	// LandlockReadWrite extends LandlockReadOnly with writing, truncating,
	// and creating/removing regular files and directories. It does not
	// include creating sockets, FIFOs, symlinks, or device nodes.
	LandlockReadWrite = LandlockReadOnly | LandlockAccessFSWriteFile |
		LandlockAccessFSMakeReg | LandlockAccessFSMakeDir |
		LandlockAccessFSRemoveFile | LandlockAccessFSRemoveDir |
		LandlockAccessFSTruncate

	// LandlockReadExec extends LandlockReadOnly with file execution.
	LandlockReadExec = LandlockReadOnly | LandlockAccessFSExecute
)
|
||||
|
||||
// LandlockRule maps a filesystem path to the permitted access mask.
|
||||
type LandlockRule struct {
|
||||
Path string
|
||||
Access LandlockAccess
|
||||
}
|
||||
|
||||
// LandlockPolicy is an ordered set of Landlock rules for a container.
|
||||
type LandlockPolicy struct {
|
||||
Rules []LandlockRule
|
||||
}
|
||||
|
||||
// ServerPolicy returns a Landlock policy for server/service workloads.
|
||||
// Allows execution from /usr and /lib, read-write to /app, /tmp, /var.
|
||||
func ServerPolicy(rootfs string) *LandlockPolicy {
|
||||
return &LandlockPolicy{
|
||||
Rules: []LandlockRule{
|
||||
{Path: filepath.Join(rootfs, "usr"), Access: LandlockReadExec},
|
||||
{Path: filepath.Join(rootfs, "lib"), Access: LandlockReadOnly | LandlockAccessFSExecute},
|
||||
{Path: filepath.Join(rootfs, "lib64"), Access: LandlockReadOnly | LandlockAccessFSExecute},
|
||||
{Path: filepath.Join(rootfs, "bin"), Access: LandlockReadExec},
|
||||
{Path: filepath.Join(rootfs, "sbin"), Access: LandlockReadExec},
|
||||
{Path: filepath.Join(rootfs, "etc"), Access: LandlockReadOnly},
|
||||
{Path: filepath.Join(rootfs, "app"), Access: LandlockReadWrite},
|
||||
{Path: filepath.Join(rootfs, "tmp"), Access: LandlockReadWrite},
|
||||
{Path: filepath.Join(rootfs, "var"), Access: LandlockReadWrite},
|
||||
{Path: filepath.Join(rootfs, "run"), Access: LandlockReadWrite},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// DesktopPolicy returns a Landlock policy for desktop/interactive workloads.
|
||||
// More permissive than ServerPolicy: full home access, /var write access.
|
||||
func DesktopPolicy(rootfs string) *LandlockPolicy {
|
||||
return &LandlockPolicy{
|
||||
Rules: []LandlockRule{
|
||||
{Path: filepath.Join(rootfs, "usr"), Access: LandlockReadExec},
|
||||
{Path: filepath.Join(rootfs, "lib"), Access: LandlockReadOnly | LandlockAccessFSExecute},
|
||||
{Path: filepath.Join(rootfs, "lib64"), Access: LandlockReadOnly | LandlockAccessFSExecute},
|
||||
{Path: filepath.Join(rootfs, "bin"), Access: LandlockReadExec},
|
||||
{Path: filepath.Join(rootfs, "sbin"), Access: LandlockReadExec},
|
||||
{Path: filepath.Join(rootfs, "etc"), Access: LandlockReadWrite},
|
||||
{Path: filepath.Join(rootfs, "home"), Access: LandlockReadWrite | LandlockAccessFSExecute},
|
||||
{Path: filepath.Join(rootfs, "tmp"), Access: LandlockReadWrite},
|
||||
{Path: filepath.Join(rootfs, "var"), Access: LandlockReadWrite},
|
||||
{Path: filepath.Join(rootfs, "run"), Access: LandlockReadWrite},
|
||||
{Path: filepath.Join(rootfs, "opt"), Access: LandlockReadExec},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// ── Cgroups v2 Resource Limits ───────────────────────────────────────────────
|
||||
|
||||
// ResourceLimits configures cgroups v2 resource constraints for a container.
type ResourceLimits struct {
	// Memory limits (e.g. "512M", "2G"). Empty means unlimited.
	MemoryHard string // memory.max — hard limit, OOM kill above this
	MemorySoft string // memory.high — throttle above this (soft pressure)

	// CPU limits.
	CPUWeight int    // cpu.weight (1-10000, default 100). Proportional share.
	CPUSet    string // cpuset.cpus (e.g. "0-3", "0,2"). Pin to specific cores.

	// I/O limits.
	IOWeight int // io.weight (1-10000, default 100). Proportional share.

	// PID limit.
	PIDsMax int // pids.max — maximum number of processes. 0 means unlimited.
}
|
||||
|
||||
// DefaultResourceLimits returns conservative defaults suitable for most workloads.
|
||||
func DefaultResourceLimits() *ResourceLimits {
|
||||
return &ResourceLimits{
|
||||
MemoryHard: "2G",
|
||||
MemorySoft: "1G",
|
||||
CPUWeight: 100,
|
||||
CPUSet: "", // no pinning
|
||||
IOWeight: 100,
|
||||
PIDsMax: 4096,
|
||||
}
|
||||
}
|
||||
|
||||
// SystemdProperties converts ResourceLimits into systemd unit properties
|
||||
// suitable for passing to systemd-run or systemd-nspawn via --property=.
|
||||
func (r *ResourceLimits) SystemdProperties() []string {
|
||||
var props []string
|
||||
|
||||
// Cgroups v2 delegation is always enabled for hybrid containers.
|
||||
props = append(props, "Delegate=yes")
|
||||
|
||||
if r.MemoryHard != "" {
|
||||
props = append(props, fmt.Sprintf("MemoryMax=%s", r.MemoryHard))
|
||||
}
|
||||
if r.MemorySoft != "" {
|
||||
props = append(props, fmt.Sprintf("MemoryHigh=%s", r.MemorySoft))
|
||||
}
|
||||
if r.CPUWeight > 0 {
|
||||
props = append(props, fmt.Sprintf("CPUWeight=%d", r.CPUWeight))
|
||||
}
|
||||
if r.CPUSet != "" {
|
||||
props = append(props, fmt.Sprintf("AllowedCPUs=%s", r.CPUSet))
|
||||
}
|
||||
if r.IOWeight > 0 {
|
||||
props = append(props, fmt.Sprintf("IOWeight=%d", r.IOWeight))
|
||||
}
|
||||
if r.PIDsMax > 0 {
|
||||
props = append(props, fmt.Sprintf("TasksMax=%d", r.PIDsMax))
|
||||
}
|
||||
|
||||
return props
|
||||
}
|
||||
|
||||
// ── Network Isolation ────────────────────────────────────────────────────────
|
||||
|
||||
// NetworkMode selects the container network configuration.
type NetworkMode string

// Supported network modes.
const (
	// NetworkPrivate creates a fully isolated network namespace with a veth
	// pair connected to the host bridge (voltbr0). The container gets its own
	// IP stack, routing table, and firewall rules.
	NetworkPrivate NetworkMode = "private"

	// NetworkHost shares the host network namespace. The container sees all
	// host interfaces and ports. Use only for trusted system services.
	NetworkHost NetworkMode = "host"

	// NetworkNone creates an isolated network namespace with no external
	// connectivity. Loopback only.
	NetworkNone NetworkMode = "none"
)
|
||||
|
||||
// NetworkConfig holds the network isolation settings for a container.
|
||||
type NetworkConfig struct {
|
||||
Mode NetworkMode
|
||||
Bridge string // bridge name for private mode (default: "voltbr0")
|
||||
|
||||
// PortForwards maps host ports to container ports when Mode is NetworkPrivate.
|
||||
PortForwards []PortForward
|
||||
|
||||
// DNS servers to inject into the container's resolv.conf.
|
||||
DNS []string
|
||||
}
|
||||
|
||||
// PortForward maps a single host port to a container port.
type PortForward struct {
	// HostPort is the port on the host side of the mapping.
	HostPort int
	// ContainerPort is the port inside the container.
	ContainerPort int
	// Protocol is "tcp" or "udp".
	Protocol string
}
|
||||
|
||||
// DefaultNetworkConfig returns a private-network configuration with the
|
||||
// standard Volt bridge.
|
||||
func DefaultNetworkConfig() *NetworkConfig {
|
||||
return &NetworkConfig{
|
||||
Mode: NetworkPrivate,
|
||||
Bridge: "voltbr0",
|
||||
DNS: []string{"1.1.1.1", "1.0.0.1"},
|
||||
}
|
||||
}
|
||||
|
||||
// NspawnNetworkArgs returns the systemd-nspawn arguments for this network
|
||||
// configuration.
|
||||
func (n *NetworkConfig) NspawnNetworkArgs() []string {
|
||||
switch n.Mode {
|
||||
case NetworkPrivate:
|
||||
args := []string{"--network-bridge=" + n.Bridge}
|
||||
for _, pf := range n.PortForwards {
|
||||
proto := pf.Protocol
|
||||
if proto == "" {
|
||||
proto = "tcp"
|
||||
}
|
||||
args = append(args, fmt.Sprintf("--port=%s:%d:%d", proto, pf.HostPort, pf.ContainerPort))
|
||||
}
|
||||
return args
|
||||
case NetworkHost:
|
||||
return nil // no network flags = share host namespace
|
||||
case NetworkNone:
|
||||
return []string{"--private-network"}
|
||||
default:
|
||||
return []string{"--network-bridge=voltbr0"}
|
||||
}
|
||||
}
|
||||
|
||||
// ── Isolation Profile ────────────────────────────────────────────────────────
|
||||
|
||||
// IsolationConfig combines all isolation settings for a hybrid container.
|
||||
type IsolationConfig struct {
|
||||
Landlock *LandlockPolicy
|
||||
Seccomp SeccompProfile
|
||||
Resources *ResourceLimits
|
||||
Network *NetworkConfig
|
||||
|
||||
// PrivateUsers enables user namespace isolation (--private-users).
|
||||
PrivateUsers bool
|
||||
|
||||
// ReadOnlyFS mounts the rootfs as read-only (--read-only).
|
||||
ReadOnlyFS bool
|
||||
}
|
||||
|
||||
// DefaultIsolation returns a security-first isolation configuration suitable
|
||||
// for production workloads.
|
||||
func DefaultIsolation(rootfs string) *IsolationConfig {
|
||||
return &IsolationConfig{
|
||||
Landlock: ServerPolicy(rootfs),
|
||||
Seccomp: SeccompDefault,
|
||||
Resources: DefaultResourceLimits(),
|
||||
Network: DefaultNetworkConfig(),
|
||||
PrivateUsers: true,
|
||||
ReadOnlyFS: false,
|
||||
}
|
||||
}
|
||||
|
||||
// NspawnArgs returns the complete set of systemd-nspawn arguments for this
|
||||
// isolation configuration. These are appended to the base nspawn command.
|
||||
func (iso *IsolationConfig) NspawnArgs() []string {
|
||||
var args []string
|
||||
|
||||
// Resource limits and cgroup delegation via --property.
|
||||
for _, prop := range iso.Resources.SystemdProperties() {
|
||||
args = append(args, "--property="+prop)
|
||||
}
|
||||
|
||||
// Seccomp profile.
|
||||
switch iso.Seccomp {
|
||||
case SeccompStrict:
|
||||
// systemd-nspawn applies its default filter automatically.
|
||||
// For strict mode we add --capability=drop-all to further limit.
|
||||
args = append(args, "--drop-capability=all")
|
||||
case SeccompDefault:
|
||||
// Use nspawn's built-in seccomp filter — no extra flags needed.
|
||||
case SeccompUnconfined:
|
||||
// Disable the built-in seccomp filter for trusted workloads.
|
||||
args = append(args, "--system-call-filter=~")
|
||||
}
|
||||
|
||||
// Network isolation.
|
||||
args = append(args, iso.Network.NspawnNetworkArgs()...)
|
||||
|
||||
// User namespace isolation.
|
||||
if iso.PrivateUsers {
|
||||
args = append(args, "--private-users=pick")
|
||||
}
|
||||
|
||||
// Read-only rootfs.
|
||||
if iso.ReadOnlyFS {
|
||||
args = append(args, "--read-only")
|
||||
}
|
||||
|
||||
return args
|
||||
}
|
||||
|
||||
// NspawnConfigBlock returns the .nspawn file content sections for this
|
||||
// isolation configuration. Written to /etc/systemd/nspawn/<name>.nspawn.
|
||||
func (iso *IsolationConfig) NspawnConfigBlock(name string) string {
|
||||
var b strings.Builder
|
||||
|
||||
// [Exec] section
|
||||
b.WriteString("[Exec]\n")
|
||||
b.WriteString("Boot=yes\n")
|
||||
b.WriteString("PrivateUsers=")
|
||||
if iso.PrivateUsers {
|
||||
b.WriteString("pick\n")
|
||||
} else {
|
||||
b.WriteString("no\n")
|
||||
}
|
||||
|
||||
// Environment setup.
|
||||
b.WriteString(fmt.Sprintf("Environment=VOLT_CONTAINER=%s\n", name))
|
||||
b.WriteString("Environment=VOLT_RUNTIME=hybrid\n")
|
||||
|
||||
b.WriteString("\n")
|
||||
|
||||
// [Network] section
|
||||
b.WriteString("[Network]\n")
|
||||
switch iso.Network.Mode {
|
||||
case NetworkPrivate:
|
||||
b.WriteString(fmt.Sprintf("Bridge=%s\n", iso.Network.Bridge))
|
||||
case NetworkNone:
|
||||
b.WriteString("Private=yes\n")
|
||||
case NetworkHost:
|
||||
// No network section needed for host mode.
|
||||
}
|
||||
|
||||
b.WriteString("\n")
|
||||
|
||||
// [ResourceControl] section (selected limits for the .nspawn file).
|
||||
b.WriteString("[ResourceControl]\n")
|
||||
if iso.Resources.MemoryHard != "" {
|
||||
b.WriteString(fmt.Sprintf("MemoryMax=%s\n", iso.Resources.MemoryHard))
|
||||
}
|
||||
if iso.Resources.PIDsMax > 0 {
|
||||
b.WriteString(fmt.Sprintf("TasksMax=%d\n", iso.Resources.PIDsMax))
|
||||
}
|
||||
|
||||
return b.String()
|
||||
}
|
||||
Reference in New Issue
Block a user