#!/bin/bash
# ══════════════════════════════════════════════════════════════════════════════
# Volt Hybrid Integration Tests — Isolation Verification
#
# Verifies security isolation boundaries for hybrid-native workloads:
#   1. Process isolation          — can't see host processes
#   2. Network namespace isolation — different IP / interfaces
#   3. Mount namespace isolation   — different /proc/mounts
#   4. Cgroup isolation           — resource limits enforced
#   5. OOM stress test            — memory over-allocation kills inside,
#                                    host unaffected
#
# All isolation is via Linux kernel primitives:
#   Namespaces (PID, NET, MNT, UTS, IPC), cgroups v2, Landlock, Seccomp
#   NO Docker. NO AppArmor. Landlock only.
#
# Requires: root, systemd-nspawn, base image
# ══════════════════════════════════════════════════════════════════════════════

# NOTE: deliberately no `set -e` — individual check failures are reported via
# the pass/fail helpers and must not abort the whole suite.
set -uo pipefail

# Provides: require_*, test_name, create_container, start_workload,
# wait_running, get_leader_pid, section, pass/fail/skip, cleanup_all,
# destroy_workload, print_results, CLEANUP_WORKLOADS.
source "$(dirname "$0")/test_helpers.sh"

# ── Prerequisites ─────────────────────────────────────────────────────────────
require_root
require_volt
require_nspawn

readonly BASE_IMAGE="/var/lib/volt/images/ubuntu_24.04"
if ! require_image "$BASE_IMAGE"; then
  echo "SKIP: No base image."
  exit 0
fi

# cleanup_all (from test_helpers.sh) tears down anything still listed in
# CLEANUP_WORKLOADS on every exit path.
trap cleanup_all EXIT

echo "⚡ Volt Hybrid Integration Tests — Isolation Verification"
echo "════════════════════════════════════════════════════════════════"

ISO_WL=$(test_name "isolation")

# Create and start the hybrid workload.
# BUG FIX: the original `2>&1 >/dev/null` sends stderr to the terminal and only
# stdout to /dev/null; `>/dev/null 2>&1` silences both streams as intended.
create_container "$ISO_WL" "$BASE_IMAGE" >/dev/null 2>&1
start_workload "$ISO_WL" >/dev/null 2>&1

if ! wait_running "$ISO_WL" 30; then
  echo "FATAL: Could not start workload for isolation tests"
  exit 1
fi

LEADER_PID=$(get_leader_pid "$ISO_WL")
if [[ -z "$LEADER_PID" || "$LEADER_PID" == "0" ]]; then
  echo "FATAL: No leader PID for workload"
  exit 1
fi

# ── 1. Process Isolation ────────────────────────────────────────────────────
section "🔒 1. Process Isolation (PID Namespace)"
# Container should NOT see host processes.
# We look for a host-only process that the container shouldn't see.
# Get the container's view of its process list.
container_pids=$(sudo nsenter -t "$LEADER_PID" -p -m sh -c \
  "ls -d /proc/[0-9]* 2>/dev/null | wc -l" 2>/dev/null || echo "0")
host_pids=$(ls -d /proc/[0-9]* 2>/dev/null | wc -l)

if [[ "$container_pids" -gt 0 ]]; then
  pass "Container can see $container_pids processes"
else
  fail "Container can see processes" "got 0"
fi

if [[ "$container_pids" -lt "$host_pids" ]]; then
  pass "Container sees fewer processes ($container_pids) than host ($host_pids)"
else
  fail "Container sees fewer processes than host" \
    "container=$container_pids, host=$host_pids"
fi

# Check if the container can see OUR test script PID.
our_pid=$$
can_see_us=$(sudo nsenter -t "$LEADER_PID" -p -m sh -c \
  "test -d /proc/$our_pid && echo 'yes' || echo 'no'" 2>/dev/null || echo "unknown")
if [[ "$can_see_us" == "no" ]]; then
  pass "Container cannot see host test script PID ($our_pid)"
elif [[ "$can_see_us" == "yes" ]]; then
  fail "Container should NOT see host PID $our_pid" "but it can"
else
  skip "Host PID visibility check" "could not determine"
fi

# Verify the PID namespace inode differs from the host's.
host_pidns_inode=$(stat -L -c '%i' /proc/self/ns/pid 2>/dev/null || echo "0")
container_pidns_inode=$(sudo nsenter -t "$LEADER_PID" -p -m \
  stat -L -c '%i' /proc/self/ns/pid 2>/dev/null || echo "0")
if [[ "$host_pidns_inode" != "$container_pidns_inode" && "$container_pidns_inode" != "0" ]]; then
  pass "PID namespace inode differs (host=$host_pidns_inode, container=$container_pidns_inode)"
else
  skip "PID namespace inode check" "host=$host_pidns_inode, container=$container_pidns_inode"
fi

# Verify PID 1 inside is the container's own init, not the host's PID 1.
# (Removed an unused host_pid1_name capture that was never compared.)
container_pid1_name=$(sudo nsenter -t "$LEADER_PID" -p -m cat /proc/1/comm 2>/dev/null || echo "")
if [[ -n "$container_pid1_name" ]]; then
  pass "Container PID 1 process: $container_pid1_name"
  # In boot mode, PID 1 should be systemd; verify it's the container's own init.
  if [[ "$container_pid1_name" =~ systemd|init ]]; then
    pass "Container PID 1 is its own init system"
  else
    skip "Container PID 1 identity" "unexpected: $container_pid1_name"
  fi
fi

# ── 2. Network Namespace Isolation ──────────────────────────────────────────
section "🌐 2. Network Namespace Isolation"

# Verify the container has a different network namespace.
host_netns_inode=$(stat -L -c '%i' /proc/self/ns/net 2>/dev/null || echo "0")
container_netns_inode=$(sudo nsenter -t "$LEADER_PID" -n \
  stat -L -c '%i' /proc/self/ns/net 2>/dev/null || echo "0")
if [[ "$host_netns_inode" != "$container_netns_inode" && "$container_netns_inode" != "0" ]]; then
  pass "Network namespace inode differs (host=$host_netns_inode, container=$container_netns_inode)"
else
  fail "Network namespace inode differs" "host=$host_netns_inode, container=$container_netns_inode"
fi

# Get the container's IP address — should differ from host.
host_ip=$(ip -4 -o addr show scope global 2>/dev/null | awk '{print $4}' | head -1 | cut -d/ -f1)
container_ip=$(sudo nsenter -t "$LEADER_PID" -n ip -4 -o addr show scope global 2>/dev/null \
  | awk '{print $4}' | head -1 | cut -d/ -f1)
if [[ -n "$container_ip" && -n "$host_ip" && "$container_ip" != "$host_ip" ]]; then
  pass "Container IP ($container_ip) differs from host IP ($host_ip)"
elif [[ -z "$container_ip" ]]; then
  # Container may only have loopback (NetworkNone mode or bridge not set up).
  skip "Container IP comparison" "container has no global IP (bridge may not be configured)"
else
  fail "Container IP should differ from host" "both are $host_ip"
fi

# Verify container has its own interfaces (not sharing host interfaces).
# BUG FIX: the original `grep -c … || echo "0"` printed a SECOND "0" whenever
# grep matched nothing (grep -c already emits "0" but exits non-zero), leaving
# "0<newline>0" in the variable and breaking the numeric comparisons below.
host_ifaces=$(ip link show 2>/dev/null | grep -c "^[0-9]")
container_ifaces=$(sudo nsenter -t "$LEADER_PID" -n ip link show 2>/dev/null | grep -c "^[0-9]")
if [[ "$container_ifaces" -gt 0 ]]; then
  pass "Container has $container_ifaces network interfaces"
  if [[ "$container_ifaces" -lt "$host_ifaces" ]]; then
    pass "Container has fewer interfaces ($container_ifaces) than host ($host_ifaces)"
  else
    skip "Interface count comparison" "container=$container_ifaces, host=$host_ifaces"
  fi
else
  fail "Container should have at least loopback interface"
fi

# Verify loopback is present and up inside.
if sudo nsenter -t "$LEADER_PID" -n ip link show lo 2>/dev/null | grep -q "UP"; then
  pass "Container loopback (lo) is UP"
else
  skip "Container loopback check" "lo may not be UP yet"
fi

# ── 3. Mount Namespace Isolation ────────────────────────────────────────────
section "📁 3. Mount Namespace Isolation"

# The container should have its own mount namespace with different mounts.
host_mntns_inode=$(stat -L -c '%i' /proc/self/ns/mnt 2>/dev/null || echo "0")
container_mntns_inode=$(sudo nsenter -t "$LEADER_PID" -m \
  stat -L -c '%i' /proc/self/ns/mnt 2>/dev/null || echo "0")
if [[ "$host_mntns_inode" != "$container_mntns_inode" && "$container_mntns_inode" != "0" ]]; then
  pass "Mount namespace inode differs (host=$host_mntns_inode, container=$container_mntns_inode)"
else
  fail "Mount namespace inode differs" "host=$host_mntns_inode, container=$container_mntns_inode"
fi

# Compare /proc/mounts content — the root mounts should differ.
host_root_mount=$(grep "^[^ ]* / " /proc/mounts 2>/dev/null | head -1)
container_root_mount=$(sudo nsenter -t "$LEADER_PID" -m cat /proc/mounts 2>/dev/null \
  | grep "^[^ ]* / " | head -1)
if [[ -n "$container_root_mount" && "$container_root_mount" != "$host_root_mount" ]]; then
  pass "Container root mount differs from host"
elif [[ -z "$container_root_mount" ]]; then
  skip "Container root mount check" "could not read container /proc/mounts"
else
  fail "Container root mount should differ" "same as host"
fi

# Verify host's /home is not visible inside (private rootfs).
# NOTE(review): /home/karl is a host-specific path — TODO parameterize.
# BUG FIX: stdout is now redirected too, so a directory listing does not
# pollute the test output when the path is (undesirably) visible.
if sudo nsenter -t "$LEADER_PID" -m ls /home/karl >/dev/null 2>&1; then
  fail "Host /home/karl should NOT be visible inside container"
else
  pass "Host /home/karl is NOT visible inside container"
fi

# Verify /proc inside is a fresh procfs mount.
# BUG FIX: anchored pattern with a trailing space and added `head -1` so
# submounts like "proc /proc/sys/…" cannot yield multi-line output that
# breaks the equality check below.
container_proc_type=$(sudo nsenter -t "$LEADER_PID" -m \
  grep "^proc /proc " /proc/mounts 2>/dev/null | awk '{print $3}' | head -1)
if [[ "$container_proc_type" == "proc" ]]; then
  pass "Container has its own /proc (type=proc)"
else
  skip "Container /proc type check" "got: $container_proc_type"
fi

# ── 4. Cgroup Isolation ─────────────────────────────────────────────────────
section "⚙️ 4. Cgroup Isolation (Resource Limits)"

# Find the cgroup for this container — try the known unit locations first.
cgroup_path=""
for candidate in \
  "/sys/fs/cgroup/machine.slice/volt-hybrid@${ISO_WL}.service" \
  "/sys/fs/cgroup/machine.slice/machine-${ISO_WL}.scope" \
  "/sys/fs/cgroup/machine.slice/systemd-nspawn@${ISO_WL}.service"; do
  if [[ -d "$candidate" ]]; then
    cgroup_path="$candidate"
    break
  fi
done
if [[ -z "$cgroup_path" ]]; then
  # Fall back to a broader search.
  cgroup_path=$(find /sys/fs/cgroup -maxdepth 5 -name "*${ISO_WL}*" -type d 2>/dev/null | head -1)
fi

if [[ -n "$cgroup_path" && -d "$cgroup_path" ]]; then
  pass "Cgroup found: $cgroup_path"

  # Memory limit (cgroup v2 memory.max; literal "max" means unlimited).
  if [[ -f "$cgroup_path/memory.max" ]]; then
    mem_max=$(cat "$cgroup_path/memory.max" 2>/dev/null)
    if [[ "$mem_max" != "max" && -n "$mem_max" ]]; then
      pass "Memory limit set: $mem_max bytes"
    else
      skip "Memory limit" "set to 'max' (unlimited)"
    fi
  else
    skip "Memory limit check" "memory.max not found"
  fi

  # Memory current usage.
  if [[ -f "$cgroup_path/memory.current" ]]; then
    mem_cur=$(cat "$cgroup_path/memory.current" 2>/dev/null)
    if [[ -n "$mem_cur" && "$mem_cur" != "0" ]]; then
      pass "Memory usage tracked: $mem_cur bytes"
    else
      skip "Memory usage" "current=0"
    fi
  fi

  # PIDs limit.
  if [[ -f "$cgroup_path/pids.max" ]]; then
    pids_max=$(cat "$cgroup_path/pids.max" 2>/dev/null)
    if [[ "$pids_max" != "max" && -n "$pids_max" ]]; then
      pass "PIDs limit set: $pids_max"
    else
      skip "PIDs limit" "set to 'max' (unlimited)"
    fi
  fi

  # PIDs current.
  if [[ -f "$cgroup_path/pids.current" ]]; then
    pids_cur=$(cat "$cgroup_path/pids.current" 2>/dev/null)
    pass "PIDs current: $pids_cur"
  fi

  # CPU weight/shares.
  if [[ -f "$cgroup_path/cpu.weight" ]]; then
    cpu_weight=$(cat "$cgroup_path/cpu.weight" 2>/dev/null)
    pass "CPU weight set: $cpu_weight"
  fi

  # Verify cgroup controllers are enabled for the container.
  if [[ -f "$cgroup_path/cgroup.controllers" ]]; then
    controllers=$(cat "$cgroup_path/cgroup.controllers" 2>/dev/null)
    pass "Available controllers: $controllers"
  fi
else
  skip "Cgroup isolation checks" "could not find cgroup for $ISO_WL"
fi

# ── 5. OOM Stress Test ──────────────────────────────────────────────────────
section "💥 5. OOM Stress Test (Memory Overallocation)"

# This test creates a SEPARATE workload with a tight memory limit,
# then attempts to allocate more than the limit inside.
# Expected: the process inside gets OOM-killed, host is unaffected.
OOM_WL=$(test_name "oom-test")
# BUG FIX: `2>&1 >/dev/null` left stderr on the terminal; `>/dev/null 2>&1`
# silences both streams as intended.
create_container "$OOM_WL" "$BASE_IMAGE" >/dev/null 2>&1
start_workload "$OOM_WL" >/dev/null 2>&1

if ! wait_running "$OOM_WL" 30; then
  skip "OOM test" "could not start OOM test workload"
else
  OOM_PID=$(get_leader_pid "$OOM_WL")

  # Locate the cgroup for the OOM workload (same search as section 4).
  oom_cgroup=""
  for candidate in \
    "/sys/fs/cgroup/machine.slice/volt-hybrid@${OOM_WL}.service" \
    "/sys/fs/cgroup/machine.slice/machine-${OOM_WL}.scope" \
    "/sys/fs/cgroup/machine.slice/systemd-nspawn@${OOM_WL}.service"; do
    if [[ -d "$candidate" ]]; then
      oom_cgroup="$candidate"
      break
    fi
  done
  if [[ -z "$oom_cgroup" ]]; then
    oom_cgroup=$(find /sys/fs/cgroup -maxdepth 5 -name "*${OOM_WL}*" -type d 2>/dev/null | head -1)
  fi

  if [[ -n "$oom_cgroup" && -f "$oom_cgroup/memory.max" ]]; then
    # Set hard limit to 128MB (134217728 bytes).
    echo "134217728" | sudo tee "$oom_cgroup/memory.max" >/dev/null 2>&1
    current_limit=$(cat "$oom_cgroup/memory.max" 2>/dev/null)
    pass "OOM test: memory limit set to $current_limit bytes"

    # Record host memory before stress ($7 of `free -m` Mem: row = "available").
    host_mem_before=$(free -m 2>/dev/null | awk '/^Mem:/{print $7}')
    pass "Host available memory before stress: ${host_mem_before}MB"

    # Push 256MB through the container (2× the limit). dd to /dev/null streams
    # rather than accumulating memory, so a real allocator fallback follows
    # below. (The original captured this output into an unused variable.)
    sudo nsenter -t "$OOM_PID" -p -m -n sh -c \
      "dd if=/dev/zero of=/dev/null bs=1M count=256" >/dev/null 2>&1 || true

    # Check for OOM events in the cgroup.
    if [[ -f "$oom_cgroup/memory.events" ]]; then
      oom_count=$(awk '/^oom /{print $2}' "$oom_cgroup/memory.events" 2>/dev/null)
      oom_kill_count=$(awk '/^oom_kill /{print $2}' "$oom_cgroup/memory.events" 2>/dev/null)
      if [[ "${oom_count:-0}" -gt 0 || "${oom_kill_count:-0}" -gt 0 ]]; then
        pass "OOM events triggered (oom=$oom_count, oom_kill=$oom_kill_count)"
      else
        # dd of=/dev/null doesn't actually allocate memory; write a 200MB file
        # into the container's /tmp instead (assumes /tmp is tmpfs-backed in
        # the container so the pages are charged to its cgroup — TODO confirm).
        sudo nsenter -t "$OOM_PID" -p -m -n sh -c \
          "head -c 200M /dev/zero > /tmp/oom-alloc 2>/dev/null" || true
        sleep 2
        oom_count=$(awk '/^oom /{print $2}' "$oom_cgroup/memory.events" 2>/dev/null)
        oom_kill_count=$(awk '/^oom_kill /{print $2}' "$oom_cgroup/memory.events" 2>/dev/null)
        if [[ "${oom_count:-0}" -gt 0 || "${oom_kill_count:-0}" -gt 0 ]]; then
          pass "OOM events triggered after file allocation (oom=$oom_count, oom_kill=$oom_kill_count)"
        else
          skip "OOM events" "no oom events detected (oom=$oom_count, oom_kill=$oom_kill_count)"
        fi
      fi
    else
      skip "OOM events check" "memory.events not found"
    fi

    # Verify host is still healthy.
    host_mem_after=$(free -m 2>/dev/null | awk '/^Mem:/{print $7}')
    pass "Host available memory after stress: ${host_mem_after}MB"

    # Host should still be responsive (if we got here, it is).
    if uptime &>/dev/null; then
      pass "Host is still responsive after OOM test"
    else
      fail "Host responsiveness check"
    fi
  else
    skip "OOM stress test" "could not find cgroup or memory.max for OOM workload"
  fi
fi

# Cleanup OOM workload and drop it from the trap's cleanup list.
destroy_workload "$OOM_WL"
# BUG FIX: `${CLEANUP_WORKLOADS[@]/$OOM_WL/}` only blanks the matching entry,
# leaving an empty "" element in the array; rebuild the array without it.
# The `${arr[@]+…}` guards keep empty arrays safe under `set -u`.
oom_kept=()
for wl in ${CLEANUP_WORKLOADS[@]+"${CLEANUP_WORKLOADS[@]}"}; do
  [[ "$wl" == "$OOM_WL" ]] || oom_kept+=("$wl")
done
CLEANUP_WORKLOADS=(${oom_kept[@]+"${oom_kept[@]}"})

# ── Cleanup main isolation workload ─────────────────────────────────────────
stop_workload "$ISO_WL" &>/dev/null
destroy_workload "$ISO_WL"
# Drop the isolation workload from the trap's cleanup list.
# BUG FIX: the original `${CLEANUP_WORKLOADS[@]/$ISO_WL/}` pattern substitution
# only blanks the matching entry (leaving an empty "" element for cleanup_all
# to trip over) instead of removing it; rebuild the array without the entry.
# The `${arr[@]+…}` guards keep empty arrays safe under `set -u`.
iso_kept=()
for wl in ${CLEANUP_WORKLOADS[@]+"${CLEANUP_WORKLOADS[@]}"}; do
  [[ "$wl" == "$ISO_WL" ]] || iso_kept+=("$wl")
done
CLEANUP_WORKLOADS=(${iso_kept[@]+"${iso_kept[@]}"})

# ── Results ──────────────────────────────────────────────────────────────────
# print_results prints the pass/fail summary; propagate its exit status so the
# CI job fails when any check failed.
print_results "Isolation Verification"
exit $?