// KVM-based microVMM for the Volt platform:
// - Sub-second VM boot times
// - Minimal memory footprint
// - Landlock LSM + seccomp security
// - Virtio device support
// - Custom kernel management
//
// Copyright (c) Armored Gates LLC. All rights reserved.
// Licensed under AGPSL v5.0

//! PVH Boot Protocol Implementation
//!
//! PVH (Para-Virtualized Hardware) is a boot protocol that allows direct kernel
//! entry without BIOS/UEFI firmware. This is the fastest path to boot a Linux VM.
//!
//! # Overview
//!
//! The PVH boot protocol:
//! 1. Skips BIOS POST and firmware initialization
//! 2. Loads kernel directly into memory
//! 3. Sets up minimal boot structures (E820 map, start_info)
//! 4. Jumps directly to kernel 64-bit entry point
//!
//! # Boot Time Comparison
//!
//! | Method | Boot Time |
//! |--------|-----------|
//! | BIOS   | 1-3s      |
//! | UEFI   | 0.5-1s    |
//! | PVH    | <50ms     |
//!
//! # Memory Requirements
//!
//! The PVH start_info structure must be placed in guest memory and
//! its address passed to the kernel via RBX register.

use super::{layout, BootError, GuestMemory, Result};
/// Maximum number of E820 entries.
///
/// `PvhBootSetup::setup` reserves room for this many [`E820Entry`] records at
/// `layout::E820_MAP_ADDR` and places the module list directly after that area.
pub const MAX_E820_ENTRIES: usize = 128;

/// E820 memory type values (matching Linux kernel definitions).
///
/// The discriminants are the raw `u32` codes stored in an E820 entry, so a
/// variant can be converted with `as u32` and parsed back with
/// `E820Type::from`.
#[repr(u32)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum E820Type {
    /// Usable RAM
    Ram = 1,
    /// Reserved by system
    Reserved = 2,
    /// ACPI reclaimable
    Acpi = 3,
    /// ACPI NVS (Non-Volatile Storage)
    Nvs = 4,
    /// Unusable memory
    Unusable = 5,
    /// Disabled memory (EFI)
    Disabled = 6,
    /// Persistent memory
    Pmem = 7,
    /// Undefined/other
    Undefined = 0,
}

impl From<u32> for E820Type {
    /// Maps a raw E820 type code to its variant.
    ///
    /// Codes 1 through 7 map to the matching variant; every other value
    /// (including 0) yields [`E820Type::Undefined`], so the conversion never
    /// fails.
    fn from(val: u32) -> Self {
        match val {
            1 => E820Type::Ram,
            2 => E820Type::Reserved,
            3 => E820Type::Acpi,
            4 => E820Type::Nvs,
            5 => E820Type::Unusable,
            6 => E820Type::Disabled,
            7 => E820Type::Pmem,
            _ => E820Type::Undefined,
        }
    }
}

/// E820 memory map entry.
///
/// Matches the Linux kernel's e820entry structure for compatibility: the
/// struct is `repr(C, packed)`, so it occupies exactly 20 bytes
/// (8 + 8 + 4) with no padding.
///
/// Because the struct is packed, its fields may be unaligned. Copy a field to
/// a local before borrowing it (for example before passing it to
/// `assert_eq!`).
///
/// NOTE(review): the Xen PVH ABI describes memory map entries
/// (`hvm_memmap_table_entry`) as 24 bytes, with a trailing reserved `u32`.
/// Confirm which layout the consumer of `StartInfo::memmap_paddr` expects.
#[repr(C, packed)]
#[derive(Debug, Clone, Copy, Default)]
pub struct E820Entry {
    /// Start address of memory region
    pub addr: u64,
    /// Size of memory region in bytes
    pub size: u64,
    /// Type of memory region (raw `E820Type` code)
    pub entry_type: u32,
}

impl E820Entry {
|
||
/// Create a new E820 entry
|
||
pub fn new(addr: u64, size: u64, entry_type: E820Type) -> Self {
|
||
Self {
|
||
addr,
|
||
size,
|
||
entry_type: entry_type as u32,
|
||
}
|
||
}
|
||
|
||
/// Create a RAM entry
|
||
pub fn ram(addr: u64, size: u64) -> Self {
|
||
Self::new(addr, size, E820Type::Ram)
|
||
}
|
||
|
||
/// Create a reserved entry
|
||
pub fn reserved(addr: u64, size: u64) -> Self {
|
||
Self::new(addr, size, E820Type::Reserved)
|
||
}
|
||
}
|
||
|
||
/// PVH start_info structure.
///
/// This is a simplified version compatible with the Xen PVH ABI.
/// The structure is placed in guest memory and its address is passed
/// to the kernel in RBX.
///
/// # Memory Layout
///
/// The structure must be at a known location (typically 0x7000) and
/// contain pointers to other boot structures.
///
/// With `repr(C)` the fields occupy 56 bytes with no padding: four `u32`
/// (16 bytes), four `u64` (32 bytes), then two `u32` (8 bytes).
/// [`StartInfo::as_bytes`] relies on this.
#[repr(C)]
#[derive(Debug, Clone, Default)]
pub struct StartInfo {
    /// Magic number (XEN_HVM_START_MAGIC_VALUE or custom)
    pub magic: u32,
    /// Version of the start_info structure
    pub version: u32,
    /// Flags (reserved, should be 0)
    pub flags: u32,
    /// Number of modules (initrd counts as 1)
    pub nr_modules: u32,
    /// Physical address of module list
    pub modlist_paddr: u64,
    /// Physical address of command line string
    pub cmdline_paddr: u64,
    /// Physical address of RSDP (ACPI, 0 if none)
    pub rsdp_paddr: u64,
    /// Physical address of E820 memory map
    pub memmap_paddr: u64,
    /// Number of entries in memory map
    pub memmap_entries: u32,
    /// Reserved/padding
    pub reserved: u32,
}

/// XEN HVM start magic value
pub const XEN_HVM_START_MAGIC: u32 = 0x336ec578;

/// Volt custom magic (for identification).
///
/// Read as big-endian bytes the value spells the ASCII string "NOVA".
pub const VOLT_MAGIC: u32 = 0x4e4f5641; // "NOVA"

impl StartInfo {
    /// Create a new StartInfo with default values.
    ///
    /// Sets `magic` to [`XEN_HVM_START_MAGIC`], `version` to 1 and `flags`
    /// to 0; every other field is zero. Note that this differs from
    /// `StartInfo::default()`, which leaves `magic` and `version` at 0.
    pub fn new() -> Self {
        Self {
            magic: XEN_HVM_START_MAGIC,
            version: 1,
            flags: 0,
            ..Default::default()
        }
    }

    /// Set command line address.
    pub fn with_cmdline(mut self, addr: u64) -> Self {
        self.cmdline_paddr = addr;
        self
    }

    /// Set memory map address and entry count.
    pub fn with_memmap(mut self, addr: u64, entries: u32) -> Self {
        self.memmap_paddr = addr;
        self.memmap_entries = entries;
        self
    }

    /// Set module (initrd) information.
    ///
    /// Records a single module: `nr_modules` becomes 1 and `modlist_paddr`
    /// points at the module list.
    pub fn with_module(mut self, modlist_addr: u64) -> Self {
        self.nr_modules = 1;
        self.modlist_paddr = modlist_addr;
        self
    }

    /// Convert to bytes for writing to guest memory.
    ///
    /// The returned slice borrows `self` and is `size_of::<StartInfo>()`
    /// bytes long, in the host's native byte order.
    pub fn as_bytes(&self) -> &[u8] {
        // SAFETY: `self` is a valid, initialized `StartInfo` for the lifetime
        // of the returned borrow. The type is `repr(C)` and made only of
        // integer fields laid out without padding, so every byte in the
        // `size_of::<Self>()` range is initialized. `u8` has alignment 1.
        unsafe {
            std::slice::from_raw_parts(
                self as *const Self as *const u8,
                std::mem::size_of::<Self>(),
            )
        }
    }
}

/// Module (initrd) entry for PVH.
///
/// Four `u64` fields under `repr(C)`: 32 bytes, no padding.
#[repr(C)]
#[derive(Debug, Clone, Copy, Default)]
pub struct HvmModlistEntry {
    /// Physical address of module
    pub paddr: u64,
    /// Size of module in bytes
    pub size: u64,
    /// Physical address of command line for module (0 if none)
    pub cmdline_paddr: u64,
    /// Reserved
    pub reserved: u64,
}

impl HvmModlistEntry {
    /// Create entry for initrd.
    ///
    /// The module gets no command line (`cmdline_paddr` is 0) and the
    /// reserved field is zeroed.
    pub fn new(paddr: u64, size: u64) -> Self {
        Self {
            paddr,
            size,
            cmdline_paddr: 0,
            reserved: 0,
        }
    }

    /// Convert to bytes.
    ///
    /// The returned slice borrows `self` and is `size_of::<HvmModlistEntry>()`
    /// bytes long, in the host's native byte order.
    pub fn as_bytes(&self) -> &[u8] {
        // SAFETY: `self` is a valid, initialized value for the lifetime of the
        // returned borrow. The type is `repr(C)` with four `u64` fields, so
        // there is no padding and every byte in the range is initialized.
        // `u8` has alignment 1.
        unsafe {
            std::slice::from_raw_parts(
                self as *const Self as *const u8,
                std::mem::size_of::<Self>(),
            )
        }
    }
}

/// PVH configuration for boot setup.
///
/// Input to `PvhBootSetup::setup`. The initrd is described by two separate
/// options; `setup` only writes a module list when **both** `initrd_addr` and
/// `initrd_size` are `Some`.
#[derive(Debug, Clone)]
pub struct PvhConfig {
    /// Total memory size in bytes
    pub memory_size: u64,
    /// Number of vCPUs
    pub vcpu_count: u32,
    /// Physical address of command line
    pub cmdline_addr: u64,
    /// Physical address of initrd (if any)
    pub initrd_addr: Option<u64>,
    /// Size of initrd (if any)
    pub initrd_size: Option<u64>,
}

/// PVH boot setup implementation
|
||
pub struct PvhBootSetup;
|
||
|
||
impl PvhBootSetup {
|
||
/// Set up PVH boot structures in guest memory
|
||
///
|
||
/// Creates and writes:
|
||
/// 1. E820 memory map
|
||
/// 2. start_info structure
|
||
/// 3. Module list (for initrd)
|
||
pub fn setup<M: GuestMemory>(config: &PvhConfig, guest_mem: &mut M) -> Result<()> {
|
||
// Build E820 memory map
|
||
let e820_entries = Self::build_e820_map(config.memory_size)?;
|
||
let e820_count = e820_entries.len() as u32;
|
||
|
||
// Write E820 map to guest memory
|
||
Self::write_e820_map(&e820_entries, guest_mem)?;
|
||
|
||
// Write module list if initrd is present
|
||
let modlist_addr = if let (Some(addr), Some(size)) = (config.initrd_addr, config.initrd_size) {
|
||
let modlist_addr = layout::E820_MAP_ADDR +
|
||
(MAX_E820_ENTRIES * std::mem::size_of::<E820Entry>()) as u64;
|
||
|
||
let entry = HvmModlistEntry::new(addr, size);
|
||
guest_mem.write_bytes(modlist_addr, entry.as_bytes())?;
|
||
|
||
Some(modlist_addr)
|
||
} else {
|
||
None
|
||
};
|
||
|
||
// Build and write start_info structure
|
||
let mut start_info = StartInfo::new()
|
||
.with_cmdline(config.cmdline_addr)
|
||
.with_memmap(layout::E820_MAP_ADDR, e820_count);
|
||
|
||
if let Some(addr) = modlist_addr {
|
||
start_info = start_info.with_module(addr);
|
||
}
|
||
|
||
guest_mem.write_bytes(layout::PVH_START_INFO_ADDR, start_info.as_bytes())?;
|
||
|
||
Ok(())
|
||
}
|
||
|
||
/// Build E820 memory map for the VM
|
||
///
|
||
/// Creates a standard x86_64 memory layout:
|
||
/// - Low memory (0-640KB): RAM
|
||
/// - Legacy hole (640KB-1MB): Reserved
|
||
/// - High memory (1MB+): RAM
|
||
fn build_e820_map(memory_size: u64) -> Result<Vec<E820Entry>> {
|
||
let mut entries = Vec::with_capacity(4);
|
||
|
||
// Validate minimum memory
|
||
if memory_size < layout::HIGH_MEMORY_START {
|
||
return Err(BootError::MemoryLayout(format!(
|
||
"Memory size {} is less than minimum required {}",
|
||
memory_size,
|
||
layout::HIGH_MEMORY_START
|
||
)));
|
||
}
|
||
|
||
// Low memory: 0 to 640KB (0x0 - 0x9FFFF)
|
||
// We reserve the first page for real-mode IVT
|
||
entries.push(E820Entry::ram(0, layout::LOW_MEMORY_END));
|
||
|
||
// Legacy video/ROM hole: 640KB to 1MB (0xA0000 - 0xFFFFF)
|
||
// This is reserved for VGA memory, option ROMs, etc.
|
||
let legacy_hole_size = layout::HIGH_MEMORY_START - layout::LOW_MEMORY_END;
|
||
entries.push(E820Entry::reserved(layout::LOW_MEMORY_END, legacy_hole_size));
|
||
|
||
// High memory: 1MB to RAM size
|
||
let high_memory_size = memory_size - layout::HIGH_MEMORY_START;
|
||
if high_memory_size > 0 {
|
||
entries.push(E820Entry::ram(layout::HIGH_MEMORY_START, high_memory_size));
|
||
}
|
||
|
||
// If memory > 4GB, we might need to handle the MMIO hole
|
||
// For now, we assume memory <= 4GB for simplicity
|
||
// Production systems should handle:
|
||
// - PCI MMIO hole (typically 0xE0000000 - 0xFFFFFFFF)
|
||
// - Memory above 4GB remapped
|
||
|
||
Ok(entries)
|
||
}
|
||
|
||
/// Write E820 map entries to guest memory
|
||
fn write_e820_map<M: GuestMemory>(entries: &[E820Entry], guest_mem: &mut M) -> Result<()> {
|
||
let entry_size = std::mem::size_of::<E820Entry>();
|
||
|
||
for (i, entry) in entries.iter().enumerate() {
|
||
let addr = layout::E820_MAP_ADDR + (i * entry_size) as u64;
|
||
let bytes = unsafe {
|
||
std::slice::from_raw_parts(entry as *const E820Entry as *const u8, entry_size)
|
||
};
|
||
guest_mem.write_bytes(addr, bytes)?;
|
||
}
|
||
|
||
Ok(())
|
||
}
|
||
|
||
/// Get initial CPU register state for PVH boot
|
||
///
|
||
/// Returns the register values needed to start the vCPU in 64-bit mode
|
||
/// with PVH boot protocol.
|
||
pub fn get_initial_regs(entry_point: u64) -> PvhRegs {
|
||
PvhRegs {
|
||
// Instruction pointer - kernel entry
|
||
rip: entry_point,
|
||
|
||
// RBX contains pointer to start_info (Xen PVH convention)
|
||
rbx: layout::PVH_START_INFO_ADDR,
|
||
|
||
// RSI also contains start_info pointer (Linux boot convention)
|
||
rsi: layout::PVH_START_INFO_ADDR,
|
||
|
||
// Stack pointer
|
||
rsp: layout::BOOT_STACK_POINTER,
|
||
|
||
// Clear other general-purpose registers
|
||
rax: 0,
|
||
rcx: 0,
|
||
rdx: 0,
|
||
rdi: 0,
|
||
rbp: 0,
|
||
r8: 0,
|
||
r9: 0,
|
||
r10: 0,
|
||
r11: 0,
|
||
r12: 0,
|
||
r13: 0,
|
||
r14: 0,
|
||
r15: 0,
|
||
|
||
// Flags - interrupts disabled
|
||
rflags: 0x2,
|
||
|
||
// Segment selectors for 64-bit mode
|
||
cs: 0x10, // Code segment, ring 0
|
||
ds: 0x18, // Data segment
|
||
es: 0x18,
|
||
fs: 0x18,
|
||
gs: 0x18,
|
||
ss: 0x18,
|
||
|
||
// CR registers for 64-bit mode
|
||
cr0: CR0_PE | CR0_ET | CR0_PG,
|
||
cr3: 0, // Page table base - set by kernel setup
|
||
cr4: CR4_PAE,
|
||
|
||
// EFER for long mode
|
||
efer: EFER_LME | EFER_LMA,
|
||
}
|
||
}
|
||
}
|
||
|
||
/// Control Register 0: Protection Enable (bit 0).
const CR0_PE: u64 = 1 << 0;
/// Control Register 0: Extension Type, 387 present (bit 4).
const CR0_ET: u64 = 1 << 4;
/// Control Register 0: Paging Enable (bit 31).
const CR0_PG: u64 = 1 << 31;

/// Control Register 4: Physical Address Extension (bit 5).
const CR4_PAE: u64 = 1 << 5;

/// EFER (Extended Feature Enable Register): Long Mode Enable (bit 8).
const EFER_LME: u64 = 1 << 8;
/// EFER (Extended Feature Enable Register): Long Mode Active (bit 10).
const EFER_LMA: u64 = 1 << 10;

/// CPU register state for PVH boot.
///
/// Plain data holder; `Default` yields every register set to zero.
#[derive(Debug, Clone, Default)]
pub struct PvhRegs {
    // General purpose registers
    pub rax: u64,
    pub rbx: u64,
    pub rcx: u64,
    pub rdx: u64,
    pub rsi: u64,
    pub rdi: u64,
    pub rsp: u64,
    pub rbp: u64,
    pub r8: u64,
    pub r9: u64,
    pub r10: u64,
    pub r11: u64,
    pub r12: u64,
    pub r13: u64,
    pub r14: u64,
    pub r15: u64,

    // Instruction pointer
    pub rip: u64,

    // Flags
    pub rflags: u64,

    // Segment selectors
    pub cs: u16,
    pub ds: u16,
    pub es: u16,
    pub fs: u16,
    pub gs: u16,
    pub ss: u16,

    // Control registers
    pub cr0: u64,
    pub cr3: u64,
    pub cr4: u64,

    // Model-specific registers
    pub efer: u64,
}

/// GDT entries for 64-bit mode boot.
///
/// This provides a minimal GDT for transitioning to 64-bit mode.
/// The kernel will set up its own GDT later.
///
/// The table holds three 8-byte descriptors, in order: null, 64-bit code,
/// data.
pub struct BootGdt;

impl BootGdt {
    /// Null descriptor (required as GDT[0])
    pub const NULL: u64 = 0;

    /// 64-bit code segment (CS)
    /// Base: 0, Limit: 0xFFFFF (ignored in 64-bit mode)
    /// Type: Code, Execute/Read, Present, DPL=0
    pub const CODE64: u64 = 0x00af_9b00_0000_ffff;

    /// 64-bit data segment (DS, ES, SS, FS, GS)
    /// Base: 0, Limit: 0xFFFFF
    /// Type: Data, Read/Write, Present, DPL=0
    pub const DATA64: u64 = 0x00cf_9300_0000_ffff;

    /// Build GDT table as bytes.
    ///
    /// Each descriptor is serialized little-endian into its own 8-byte slot,
    /// giving a 24-byte table: `NULL`, `CODE64`, `DATA64`.
    pub fn as_bytes() -> [u8; 24] {
        let descriptors = [Self::NULL, Self::CODE64, Self::DATA64];
        let mut gdt = [0u8; 24];

        for (slot, descriptor) in gdt.chunks_exact_mut(8).zip(descriptors.iter()) {
            slot.copy_from_slice(&descriptor.to_le_bytes());
        }

        gdt
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// In-memory stand-in for guest RAM, backed by a zero-initialized buffer.
    struct MockMemory {
        size: u64,
        data: Vec<u8>,
    }

    impl MockMemory {
        /// Allocate `size` bytes of zeroed mock guest memory.
        fn new(size: u64) -> Self {
            Self {
                size,
                data: vec![0; size as usize],
            }
        }
    }

    impl GuestMemory for MockMemory {
        /// Copy `data` into the buffer at `addr`, rejecting any write that
        /// does not fit entirely inside the buffer.
        fn write_bytes(&mut self, addr: u64, data: &[u8]) -> Result<()> {
            let start = addr as usize;
            // `checked_add` keeps an address near the top of the address
            // space from overflowing (and panicking) instead of reporting an
            // out-of-range write.
            let end = match start.checked_add(data.len()) {
                Some(end) if end <= self.data.len() => end,
                _ => {
                    return Err(BootError::GuestMemoryWrite(format!(
                        "Write at {:#x} exceeds memory size",
                        addr
                    )));
                }
            };
            self.data[start..end].copy_from_slice(data);
            Ok(())
        }

        fn size(&self) -> u64 {
            self.size
        }
    }

    #[test]
    fn test_e820_entry_size() {
        // E820 entry must be exactly 20 bytes for Linux kernel compatibility
        assert_eq!(std::mem::size_of::<E820Entry>(), 20);
    }

    #[test]
    fn test_build_e820_map() {
        let memory_size = 128 * 1024 * 1024; // 128MB
        let entries = PvhBootSetup::build_e820_map(memory_size).unwrap();

        // Should have at least 3 entries
        assert!(entries.len() >= 3);

        // Fields are copied out of the packed struct before comparing,
        // because `assert_eq!` borrows its operands.

        // First entry should be low memory RAM
        let e0_addr = entries[0].addr;
        let e0_type = entries[0].entry_type;
        assert_eq!(e0_addr, 0);
        assert_eq!(e0_type, E820Type::Ram as u32);

        // Second entry should be legacy hole (reserved)
        let e1_addr = entries[1].addr;
        let e1_type = entries[1].entry_type;
        assert_eq!(e1_addr, layout::LOW_MEMORY_END);
        assert_eq!(e1_type, E820Type::Reserved as u32);

        // Third entry should be high memory RAM
        let e2_addr = entries[2].addr;
        let e2_type = entries[2].entry_type;
        assert_eq!(e2_addr, layout::HIGH_MEMORY_START);
        assert_eq!(e2_type, E820Type::Ram as u32);
    }

    #[test]
    fn test_start_info_size() {
        // StartInfo should be reasonable size (under 4KB page)
        let size = std::mem::size_of::<StartInfo>();
        assert!(size < 4096);
        assert!(size >= 48); // Minimum expected fields
    }

    #[test]
    fn test_pvh_setup() {
        let mut mem = MockMemory::new(128 * 1024 * 1024);
        let config = PvhConfig {
            memory_size: 128 * 1024 * 1024,
            vcpu_count: 2,
            cmdline_addr: layout::CMDLINE_ADDR,
            initrd_addr: Some(100 * 1024 * 1024),
            initrd_size: Some(10 * 1024 * 1024),
        };

        let result = PvhBootSetup::setup(&config, &mut mem);
        assert!(result.is_ok());

        // Verify magic was written to start_info location
        let base = layout::PVH_START_INFO_ADDR as usize;
        let mut raw = [0u8; 4];
        raw.copy_from_slice(&mem.data[base..base + 4]);
        assert_eq!(u32::from_le_bytes(raw), XEN_HVM_START_MAGIC);
    }

    #[test]
    fn test_pvh_regs() {
        let entry_point = 0x100200;
        let regs = PvhBootSetup::get_initial_regs(entry_point);

        // Verify entry point
        assert_eq!(regs.rip, entry_point);

        // Verify start_info pointer in rbx
        assert_eq!(regs.rbx, layout::PVH_START_INFO_ADDR);

        // Verify 64-bit mode flags
        assert!(regs.cr0 & CR0_PE != 0); // Protection enabled
        assert!(regs.cr0 & CR0_PG != 0); // Paging enabled
        assert!(regs.cr4 & CR4_PAE != 0); // PAE enabled
        assert!(regs.efer & EFER_LME != 0); // Long mode enabled
    }

    #[test]
    fn test_gdt_layout() {
        let gdt = BootGdt::as_bytes();
        assert_eq!(gdt.len(), 24); // 3 entries x 8 bytes

        // First entry should be null
        assert_eq!(&gdt[0..8], &[0u8; 8]);
    }
}