Files
volt-vmm/vmm/src/boot/pvh.rs
Karl Clinger 40ed108dd5 Volt VMM (Neutron Stardust): source-available under AGPSL v5.0
KVM-based microVMM for the Volt platform:
- Sub-second VM boot times
- Minimal memory footprint
- Landlock LSM + seccomp security
- Virtio device support
- Custom kernel management

Copyright (c) Armored Gates LLC. All rights reserved.
Licensed under AGPSL v5.0
2026-03-21 01:04:35 -05:00

609 lines
17 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//! PVH Boot Protocol Implementation
//!
//! PVH (Para-Virtualized Hardware) is a boot protocol that allows direct kernel
//! entry without BIOS/UEFI firmware. This is the fastest path to boot a Linux VM.
//!
//! # Overview
//!
//! The PVH boot protocol:
//! 1. Skips BIOS POST and firmware initialization
//! 2. Loads kernel directly into memory
//! 3. Sets up minimal boot structures (E820 map, start_info)
//! 4. Jumps directly to kernel 64-bit entry point
//!
//! # Boot Time Comparison
//!
//! | Method | Boot Time |
//! |--------|-----------|
//! | BIOS | 1-3s |
//! | UEFI | 0.5-1s |
//! | PVH | <50ms |
//!
//! # Memory Requirements
//!
//! The PVH start_info structure must be placed in guest memory and
//! its address passed to the kernel via RBX register.
use super::{layout, BootError, GuestMemory, Result};
/// Maximum number of E820 entries.
///
/// In this file the value sizes the guest-memory area set aside for the
/// memory map at `layout::E820_MAP_ADDR`: the initrd module list is written
/// `MAX_E820_ENTRIES * size_of::<E820Entry>()` bytes past that address.
pub const MAX_E820_ENTRIES: usize = 128;
/// Classification of a memory region in the E820 map.
///
/// Each variant's discriminant is the raw `u32` code stored in a map entry
/// (matching Linux kernel definitions).
#[repr(u32)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum E820Type {
    /// Usable RAM
    Ram = 1,
    /// Reserved by system
    Reserved = 2,
    /// ACPI reclaimable
    Acpi = 3,
    /// ACPI NVS (Non-Volatile Storage)
    Nvs = 4,
    /// Unusable memory
    Unusable = 5,
    /// Disabled memory (EFI)
    Disabled = 6,
    /// Persistent memory
    Pmem = 7,
    /// Undefined/other
    Undefined = 0,
}

impl From<u32> for E820Type {
    /// Decode a raw type code.
    ///
    /// Codes 1 through 7 map to their named variant; every other value,
    /// including 0, yields [`E820Type::Undefined`].
    fn from(val: u32) -> Self {
        // Every variant that has a dedicated non-zero code.
        const NAMED: [E820Type; 7] = [
            E820Type::Ram,
            E820Type::Reserved,
            E820Type::Acpi,
            E820Type::Nvs,
            E820Type::Unusable,
            E820Type::Disabled,
            E820Type::Pmem,
        ];

        NAMED
            .iter()
            .copied()
            .find(|kind| *kind as u32 == val)
            .unwrap_or(E820Type::Undefined)
    }
}
/// A single record of the E820 memory map.
///
/// Declared `repr(C, packed)` so the three fields occupy 20 contiguous bytes
/// with no padding (matches the Linux kernel's e820entry structure for
/// compatibility). Because the struct is packed, copy fields into locals
/// instead of taking references to them.
#[repr(C, packed)]
#[derive(Debug, Clone, Copy, Default)]
pub struct E820Entry {
    /// Start address of memory region
    pub addr: u64,
    /// Size of memory region in bytes
    pub size: u64,
    /// Raw type code of the region (an [`E820Type`] discriminant)
    pub entry_type: u32,
}

impl E820Entry {
    /// Build an entry covering `size` bytes starting at `addr`, tagged with
    /// the raw code of `entry_type`.
    pub fn new(addr: u64, size: u64, entry_type: E820Type) -> Self {
        let entry_type = entry_type as u32;
        E820Entry {
            addr,
            size,
            entry_type,
        }
    }

    /// Build an entry describing usable RAM.
    pub fn ram(addr: u64, size: u64) -> Self {
        E820Entry {
            addr,
            size,
            entry_type: E820Type::Ram as u32,
        }
    }

    /// Build an entry describing a reserved region.
    pub fn reserved(addr: u64, size: u64) -> Self {
        E820Entry {
            addr,
            size,
            entry_type: E820Type::Reserved as u32,
        }
    }
}
/// PVH `start_info` block handed to the guest kernel.
///
/// This is a simplified version compatible with the Xen PVH ABI. The
/// structure is written into guest memory and its guest-physical address is
/// passed to the kernel in RBX.
///
/// # Memory Layout
///
/// `repr(C)` with four `u32`, four `u64` and two `u32` fields: 56 bytes and
/// no padding. The structure must live at a known location (typically
/// 0x7000) and holds guest-physical pointers to the other boot structures.
#[repr(C)]
#[derive(Debug, Clone, Default)]
pub struct StartInfo {
    /// Magic number (XEN_HVM_START_MAGIC_VALUE or custom)
    pub magic: u32,
    /// Version of the start_info structure
    pub version: u32,
    /// Flags (reserved, should be 0)
    pub flags: u32,
    /// Number of modules (initrd counts as 1)
    pub nr_modules: u32,
    /// Physical address of module list
    pub modlist_paddr: u64,
    /// Physical address of command line string
    pub cmdline_paddr: u64,
    /// Physical address of RSDP (ACPI, 0 if none)
    pub rsdp_paddr: u64,
    /// Physical address of E820 memory map
    pub memmap_paddr: u64,
    /// Number of entries in memory map
    pub memmap_entries: u32,
    /// Reserved/padding
    pub reserved: u32,
}

/// XEN HVM start magic value
pub const XEN_HVM_START_MAGIC: u32 = 0x336ec578;

/// Volt custom magic (for identification).
///
/// The bytes 0x4e 0x4f 0x56 0x41 spell "NOVA" in ASCII.
pub const VOLT_MAGIC: u32 = 0x4e4f5641; // "NOVA"

impl StartInfo {
    /// Create a `StartInfo` carrying the Xen HVM magic and version 1; every
    /// other field is zero.
    pub fn new() -> Self {
        StartInfo {
            magic: XEN_HVM_START_MAGIC,
            version: 1,
            flags: 0,
            nr_modules: 0,
            modlist_paddr: 0,
            cmdline_paddr: 0,
            rsdp_paddr: 0,
            memmap_paddr: 0,
            memmap_entries: 0,
            reserved: 0,
        }
    }

    /// Return `self` with the command-line address replaced by `addr`.
    pub fn with_cmdline(self, addr: u64) -> Self {
        StartInfo {
            cmdline_paddr: addr,
            ..self
        }
    }

    /// Return `self` pointing at a memory map of `entries` records at `addr`.
    pub fn with_memmap(self, addr: u64, entries: u32) -> Self {
        StartInfo {
            memmap_paddr: addr,
            memmap_entries: entries,
            ..self
        }
    }

    /// Return `self` advertising exactly one module (the initrd) whose module
    /// list lives at `modlist_addr`.
    pub fn with_module(self, modlist_addr: u64) -> Self {
        StartInfo {
            nr_modules: 1,
            modlist_paddr: modlist_addr,
            ..self
        }
    }

    /// View the structure as its raw in-memory bytes (host byte order) for
    /// writing to guest memory.
    pub fn as_bytes(&self) -> &[u8] {
        let base = (self as *const Self).cast::<u8>();
        let len = std::mem::size_of::<Self>();
        // SAFETY: `base` comes from a live `&self`, so `len` bytes are
        // readable for the lifetime of the returned borrow. The struct is
        // `repr(C)` made only of integers with no padding, so every byte is
        // initialised.
        unsafe { std::slice::from_raw_parts(base, len) }
    }
}
/// One record of the PVH module list; this file uses it to describe the
/// initrd.
///
/// `repr(C)` with four `u64` fields: 32 bytes, no padding.
#[repr(C)]
#[derive(Debug, Clone, Copy, Default)]
pub struct HvmModlistEntry {
    /// Physical address of module
    pub paddr: u64,
    /// Size of module in bytes
    pub size: u64,
    /// Physical address of command line for module (0 if none)
    pub cmdline_paddr: u64,
    /// Reserved
    pub reserved: u64,
}

impl HvmModlistEntry {
    /// Describe a module of `size` bytes located at `paddr`, with no module
    /// command line.
    pub fn new(paddr: u64, size: u64) -> Self {
        HvmModlistEntry {
            paddr,
            size,
            ..HvmModlistEntry::default()
        }
    }

    /// View the record as its raw in-memory bytes (host byte order).
    pub fn as_bytes(&self) -> &[u8] {
        let base = (self as *const Self).cast::<u8>();
        let len = std::mem::size_of::<Self>();
        // SAFETY: `base` comes from a live `&self`, so `len` bytes are
        // readable for the lifetime of the returned borrow. The struct is
        // `repr(C)` with four `u64` fields and therefore has no
        // uninitialised padding.
        unsafe { std::slice::from_raw_parts(base, len) }
    }
}
/// Inputs consumed by [`PvhBootSetup::setup`] when laying out the PVH boot
/// structures.
#[derive(Debug, Clone)]
pub struct PvhConfig {
    /// Total guest memory size in bytes; drives the generated memory map.
    pub memory_size: u64,

    /// Number of vCPUs. Not read by the setup code in this file.
    pub vcpu_count: u32,

    /// Guest-physical address of the kernel command line, copied into
    /// `start_info`.
    pub cmdline_addr: u64,

    /// Guest-physical address of the initrd (if any). A module is only
    /// advertised when `initrd_size` is set as well.
    pub initrd_addr: Option<u64>,

    /// Size of the initrd in bytes (if any). A module is only advertised
    /// when `initrd_addr` is set as well.
    pub initrd_size: Option<u64>,
}
/// Size in bytes of one PVH memory-map record as read by the guest
/// (`hvm_memmap_table_entry`): `addr: u64`, `size: u64`, `type: u32`,
/// `reserved: u32`.
const HVM_MEMMAP_ENTRY_SIZE: usize = 24;

/// Bytes set aside for the memory map at `layout::E820_MAP_ADDR`. The module
/// list is placed directly after this area, so the map must never grow past it.
const MEMMAP_AREA_SIZE: usize = MAX_E820_ENTRIES * std::mem::size_of::<E820Entry>();

/// PVH boot setup implementation.
///
/// A stateless namespace: every function is associated, none takes `self`.
pub struct PvhBootSetup;

impl PvhBootSetup {
    /// Set up PVH boot structures in guest memory.
    ///
    /// Creates and writes, in this order:
    /// 1. The memory map at `layout::E820_MAP_ADDR`
    /// 2. The module list (only when both `initrd_addr` and `initrd_size`
    ///    are set), placed right after the area reserved for the memory map
    /// 3. The `start_info` structure at `layout::PVH_START_INFO_ADDR`
    ///
    /// If only one of `initrd_addr` / `initrd_size` is set, no module is
    /// advertised and no error is raised.
    ///
    /// # Errors
    ///
    /// Returns `BootError::MemoryLayout` when `config.memory_size` is below
    /// `layout::HIGH_MEMORY_START` or the memory map does not fit its reserved
    /// area, and propagates any error from `guest_mem.write_bytes`.
    pub fn setup<M: GuestMemory>(config: &PvhConfig, guest_mem: &mut M) -> Result<()> {
        // Build the memory map and write it to guest memory.
        let e820_entries = Self::build_e820_map(config.memory_size)?;
        Self::write_e820_map(&e820_entries, guest_mem)?;
        // `write_e820_map` rejects tables larger than the reserved area, so
        // the count is small and this cast cannot truncate.
        let e820_count = e820_entries.len() as u32;

        // Write module list if initrd is present.
        let modlist_addr = match (config.initrd_addr, config.initrd_size) {
            (Some(addr), Some(size)) => {
                let modlist_addr = layout::E820_MAP_ADDR + MEMMAP_AREA_SIZE as u64;
                let entry = HvmModlistEntry::new(addr, size);
                guest_mem.write_bytes(modlist_addr, entry.as_bytes())?;
                Some(modlist_addr)
            }
            _ => None,
        };

        // Build and write the start_info structure last, once every address
        // it points to has been populated.
        let mut start_info = StartInfo::new()
            .with_cmdline(config.cmdline_addr)
            .with_memmap(layout::E820_MAP_ADDR, e820_count);
        if let Some(addr) = modlist_addr {
            start_info = start_info.with_module(addr);
        }
        guest_mem.write_bytes(layout::PVH_START_INFO_ADDR, start_info.as_bytes())?;

        Ok(())
    }

    /// Build the E820 memory map for the VM.
    ///
    /// Creates a standard x86_64 memory layout:
    /// - Low memory (0 to `layout::LOW_MEMORY_END`): RAM
    /// - Legacy hole (`layout::LOW_MEMORY_END` to `layout::HIGH_MEMORY_START`): Reserved
    /// - High memory (`layout::HIGH_MEMORY_START` up to `memory_size`): RAM,
    ///   omitted when it would be empty
    ///
    /// # Errors
    ///
    /// Returns `BootError::MemoryLayout` when `memory_size` is smaller than
    /// `layout::HIGH_MEMORY_START`.
    fn build_e820_map(memory_size: u64) -> Result<Vec<E820Entry>> {
        // Validate minimum memory before allocating anything.
        if memory_size < layout::HIGH_MEMORY_START {
            return Err(BootError::MemoryLayout(format!(
                "Memory size {} is less than minimum required {}",
                memory_size,
                layout::HIGH_MEMORY_START
            )));
        }

        let mut entries = Vec::with_capacity(3);

        // Low memory, starting at address 0. Note that the whole range,
        // including the first page, is reported as RAM.
        entries.push(E820Entry::ram(0, layout::LOW_MEMORY_END));

        // Legacy video/ROM hole between low and high memory.
        // This is reserved for VGA memory, option ROMs, etc.
        let legacy_hole_size = layout::HIGH_MEMORY_START - layout::LOW_MEMORY_END;
        entries.push(E820Entry::reserved(layout::LOW_MEMORY_END, legacy_hole_size));

        // High memory: everything from HIGH_MEMORY_START up to the RAM size.
        // The subtraction cannot underflow thanks to the check above.
        let high_memory_size = memory_size - layout::HIGH_MEMORY_START;
        if high_memory_size > 0 {
            entries.push(E820Entry::ram(layout::HIGH_MEMORY_START, high_memory_size));
        }

        // If memory > 4GB, we might need to handle the MMIO hole.
        // For now, we assume memory <= 4GB for simplicity.
        // Production systems should handle:
        // - PCI MMIO hole (typically 0xE0000000 - 0xFFFFFFFF)
        // - Memory above 4GB remapped
        Ok(entries)
    }

    /// Write the memory map to guest memory at `layout::E820_MAP_ADDR`.
    ///
    /// `start_info.memmap_paddr` is defined by the Xen PVH ABI to point at an
    /// array of `hvm_memmap_table_entry` records, which are 24 bytes each
    /// (the 20 bytes of an [`E820Entry`] followed by a 32-bit reserved word).
    /// Each entry is therefore serialised field by field, little-endian, at a
    /// 24-byte stride rather than copied as a raw packed `E820Entry`.
    ///
    /// # Errors
    ///
    /// Returns `BootError::MemoryLayout` when the table would not fit in the
    /// area reserved ahead of the module list, and propagates any error from
    /// `guest_mem.write_bytes`.
    fn write_e820_map<M: GuestMemory>(entries: &[E820Entry], guest_mem: &mut M) -> Result<()> {
        let capacity = MEMMAP_AREA_SIZE / HVM_MEMMAP_ENTRY_SIZE;
        if entries.len() > capacity {
            return Err(BootError::MemoryLayout(format!(
                "Memory map has {} entries but the reserved area holds at most {}",
                entries.len(),
                capacity
            )));
        }

        for (i, entry) in entries.iter().enumerate() {
            // Copy the fields out first: `E820Entry` is packed, so references
            // to its fields must be avoided.
            let (addr, size, entry_type) = (entry.addr, entry.size, entry.entry_type);

            let mut raw = [0u8; HVM_MEMMAP_ENTRY_SIZE];
            raw[0..8].copy_from_slice(&addr.to_le_bytes());
            raw[8..16].copy_from_slice(&size.to_le_bytes());
            raw[16..20].copy_from_slice(&entry_type.to_le_bytes());
            // raw[20..24] is the reserved word and stays zero.

            let dest = layout::E820_MAP_ADDR + (i * HVM_MEMMAP_ENTRY_SIZE) as u64;
            guest_mem.write_bytes(dest, &raw)?;
        }

        Ok(())
    }

    /// Get initial CPU register state for PVH boot.
    ///
    /// Returns the register values used to start the vCPU at `entry_point`
    /// with paging, PAE and long mode flagged as enabled and both RBX and RSI
    /// pointing at `layout::PVH_START_INFO_ADDR`.
    ///
    /// NOTE(review): `cr3` is returned as 0 while `CR0_PG` is set; presumably
    /// the caller installs page tables and overrides `cr3` before running the
    /// vCPU — confirm.
    ///
    /// NOTE(review): the selectors below are 0x10 (code) and 0x18 (data),
    /// whereas `BootGdt` in this file holds CODE64 at offset 0x08 and DATA64
    /// at offset 0x10 and has no entry at 0x18 — confirm which GDT the vCPU is
    /// actually loaded with.
    ///
    /// NOTE(review): the Xen PVH ABI documents a 32-bit protected-mode entry
    /// with paging disabled; the state built here is 64-bit. Confirm that
    /// `entry_point` is a 64-bit entry that tolerates this.
    pub fn get_initial_regs(entry_point: u64) -> PvhRegs {
        PvhRegs {
            // Instruction pointer - kernel entry
            rip: entry_point,

            // RBX contains pointer to start_info (Xen PVH convention)
            rbx: layout::PVH_START_INFO_ADDR,

            // RSI is loaded with the same start_info pointer
            rsi: layout::PVH_START_INFO_ADDR,

            // Stack pointer
            rsp: layout::BOOT_STACK_POINTER,

            // Clear other general-purpose registers
            rax: 0,
            rcx: 0,
            rdx: 0,
            rdi: 0,
            rbp: 0,
            r8: 0,
            r9: 0,
            r10: 0,
            r11: 0,
            r12: 0,
            r13: 0,
            r14: 0,
            r15: 0,

            // Flags - interrupts disabled; only the always-one bit 1 is set
            rflags: 0x2,

            // Segment selectors for 64-bit mode
            cs: 0x10, // Code segment, ring 0
            ds: 0x18, // Data segment
            es: 0x18,
            fs: 0x18,
            gs: 0x18,
            ss: 0x18,

            // CR registers for 64-bit mode
            cr0: CR0_PE | CR0_ET | CR0_PG,
            cr3: 0, // Page table base - set by kernel setup
            cr4: CR4_PAE,

            // EFER for long mode
            efer: EFER_LME | EFER_LMA,
        }
    }
}
// Control Register 0 bits

/// CR0 bit 0: Protection Enable.
const CR0_PE: u64 = 1 << 0;
/// CR0 bit 4: Extension Type (387 present).
const CR0_ET: u64 = 1 << 4;
/// CR0 bit 31: Paging Enable.
const CR0_PG: u64 = 1 << 31;

// Control Register 4 bits

/// CR4 bit 5: Physical Address Extension.
const CR4_PAE: u64 = 1 << 5;

// EFER (Extended Feature Enable Register) bits

/// EFER bit 8: Long Mode Enable.
const EFER_LME: u64 = 1 << 8;
/// EFER bit 10: Long Mode Active.
const EFER_LMA: u64 = 1 << 10;
/// Snapshot of the vCPU register state used to enter the guest for PVH boot.
///
/// Plain data: `Default` yields every register zeroed.
#[derive(Debug, Clone, Default)]
pub struct PvhRegs {
    // ---- General purpose registers ----
    /// RAX
    pub rax: u64,
    /// RBX
    pub rbx: u64,
    /// RCX
    pub rcx: u64,
    /// RDX
    pub rdx: u64,
    /// RSI
    pub rsi: u64,
    /// RDI
    pub rdi: u64,
    /// RSP (stack pointer)
    pub rsp: u64,
    /// RBP
    pub rbp: u64,
    /// R8
    pub r8: u64,
    /// R9
    pub r9: u64,
    /// R10
    pub r10: u64,
    /// R11
    pub r11: u64,
    /// R12
    pub r12: u64,
    /// R13
    pub r13: u64,
    /// R14
    pub r14: u64,
    /// R15
    pub r15: u64,

    // ---- Instruction pointer ----
    /// RIP
    pub rip: u64,

    // ---- Flags ----
    /// RFLAGS
    pub rflags: u64,

    // ---- Segment selectors ----
    /// CS selector
    pub cs: u16,
    /// DS selector
    pub ds: u16,
    /// ES selector
    pub es: u16,
    /// FS selector
    pub fs: u16,
    /// GS selector
    pub gs: u16,
    /// SS selector
    pub ss: u16,

    // ---- Control registers ----
    /// CR0
    pub cr0: u64,
    /// CR3
    pub cr3: u64,
    /// CR4
    pub cr4: u64,

    // ---- Model-specific registers ----
    /// EFER
    pub efer: u64,
}
/// Minimal GDT used to bring the vCPU up in 64-bit mode.
///
/// Holds three 8-byte descriptors in order — null, 64-bit code, data — so the
/// code descriptor sits at byte offset 0x08 and the data descriptor at 0x10.
/// The kernel will set up its own GDT later.
pub struct BootGdt;

impl BootGdt {
    /// Null descriptor (required as GDT[0])
    pub const NULL: u64 = 0;

    /// 64-bit code segment (CS)
    /// Base: 0, Limit: 0xFFFFF (ignored in 64-bit mode)
    /// Type: Code, Execute/Read, Present, DPL=0
    pub const CODE64: u64 = 0x00af_9b00_0000_ffff;

    /// 64-bit data segment (DS, ES, SS, FS, GS)
    /// Base: 0, Limit: 0xFFFFF
    /// Type: Data, Read/Write, Present, DPL=0
    pub const DATA64: u64 = 0x00cf_9300_0000_ffff;

    /// Serialise the table as 24 bytes: each descriptor little-endian, in the
    /// order `NULL`, `CODE64`, `DATA64`.
    pub fn as_bytes() -> [u8; 24] {
        let descriptors = [Self::NULL, Self::CODE64, Self::DATA64];
        let mut table = [0u8; 24];
        for (slot, descriptor) in table.chunks_exact_mut(8).zip(descriptors.iter()) {
            slot.copy_from_slice(&descriptor.to_le_bytes());
        }
        table
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Flat, zero-initialised byte buffer standing in for guest RAM.
    struct MockMemory {
        size: u64,
        data: Vec<u8>,
    }

    impl MockMemory {
        /// Allocate `size` bytes of zeroed mock guest memory.
        fn new(size: u64) -> Self {
            Self {
                size,
                data: vec![0; size as usize],
            }
        }

        /// Read a little-endian `u32` at guest address `addr`.
        fn read_u32(&self, addr: u64) -> u32 {
            let start = addr as usize;
            let mut raw = [0u8; 4];
            raw.copy_from_slice(&self.data[start..start + 4]);
            u32::from_le_bytes(raw)
        }

        /// Read a little-endian `u64` at guest address `addr`.
        fn read_u64(&self, addr: u64) -> u64 {
            let start = addr as usize;
            let mut raw = [0u8; 8];
            raw.copy_from_slice(&self.data[start..start + 8]);
            u64::from_le_bytes(raw)
        }
    }

    impl GuestMemory for MockMemory {
        fn write_bytes(&mut self, addr: u64, data: &[u8]) -> Result<()> {
            // `checked_add` keeps an address near the top of the address
            // space from overflowing the bounds computation; such a write is
            // reported as out of range instead of panicking.
            let end = (addr as usize)
                .checked_add(data.len())
                .filter(|end| *end <= self.data.len());
            let end = match end {
                Some(end) => end,
                None => {
                    return Err(BootError::GuestMemoryWrite(format!(
                        "Write at {:#x} exceeds memory size",
                        addr
                    )))
                }
            };
            self.data[addr as usize..end].copy_from_slice(data);
            Ok(())
        }

        fn size(&self) -> u64 {
            self.size
        }
    }

    #[test]
    fn test_e820_entry_size() {
        // E820 entry must be exactly 20 bytes for Linux kernel compatibility
        assert_eq!(std::mem::size_of::<E820Entry>(), 20);
    }

    #[test]
    fn test_build_e820_map() {
        let memory_size = 128 * 1024 * 1024; // 128MB
        let entries = PvhBootSetup::build_e820_map(memory_size).unwrap();

        // Should have at least 3 entries
        assert!(entries.len() >= 3);

        // First entry should be low memory RAM — copy from packed struct
        let e0_addr = entries[0].addr;
        let e0_type = entries[0].entry_type;
        assert_eq!(e0_addr, 0);
        assert_eq!(e0_type, E820Type::Ram as u32);

        // Second entry should be legacy hole (reserved)
        let e1_addr = entries[1].addr;
        let e1_type = entries[1].entry_type;
        assert_eq!(e1_addr, layout::LOW_MEMORY_END);
        assert_eq!(e1_type, E820Type::Reserved as u32);

        // Third entry should be high memory RAM
        let e2_addr = entries[2].addr;
        let e2_type = entries[2].entry_type;
        assert_eq!(e2_addr, layout::HIGH_MEMORY_START);
        assert_eq!(e2_type, E820Type::Ram as u32);
    }

    #[test]
    fn test_start_info_size() {
        // StartInfo should be reasonable size (under 4KB page)
        let size = std::mem::size_of::<StartInfo>();
        assert!(size < 4096);
        assert!(size >= 48); // Minimum expected fields
    }

    #[test]
    fn test_pvh_setup() {
        let mut mem = MockMemory::new(128 * 1024 * 1024);
        let config = PvhConfig {
            memory_size: 128 * 1024 * 1024,
            vcpu_count: 2,
            cmdline_addr: layout::CMDLINE_ADDR,
            initrd_addr: Some(100 * 1024 * 1024),
            initrd_size: Some(10 * 1024 * 1024),
        };
        let result = PvhBootSetup::setup(&config, &mut mem);
        assert!(result.is_ok());

        // Verify the start_info header and pointers. Offsets follow the
        // `repr(C)` field order of `StartInfo`.
        let base = layout::PVH_START_INFO_ADDR;
        assert_eq!(mem.read_u32(base), XEN_HVM_START_MAGIC);
        assert_eq!(mem.read_u32(base + 4), 1); // version
        assert_eq!(mem.read_u32(base + 12), 1); // nr_modules (initrd)
        assert_eq!(mem.read_u64(base + 24), layout::CMDLINE_ADDR);
        assert_eq!(mem.read_u64(base + 40), layout::E820_MAP_ADDR);
        assert_eq!(mem.read_u32(base + 48), 3); // memmap_entries

        // The first memory-map record describes low RAM starting at 0.
        let map = layout::E820_MAP_ADDR;
        assert_eq!(mem.read_u64(map), 0);
        assert_eq!(mem.read_u64(map + 8), layout::LOW_MEMORY_END);
        assert_eq!(mem.read_u32(map + 16), E820Type::Ram as u32);
    }

    #[test]
    fn test_mock_memory_rejects_out_of_range_write() {
        let mut mem = MockMemory::new(16);
        assert!(mem.write_bytes(8, &[0u8; 8]).is_ok());
        assert!(mem.write_bytes(9, &[0u8; 8]).is_err());
        assert!(mem.write_bytes(u64::MAX, &[0u8; 8]).is_err());
    }

    #[test]
    fn test_pvh_regs() {
        let entry_point = 0x100200;
        let regs = PvhBootSetup::get_initial_regs(entry_point);

        // Verify entry point
        assert_eq!(regs.rip, entry_point);

        // Verify start_info pointer in rbx
        assert_eq!(regs.rbx, layout::PVH_START_INFO_ADDR);

        // Verify 64-bit mode flags
        assert!(regs.cr0 & CR0_PE != 0); // Protection enabled
        assert!(regs.cr0 & CR0_PG != 0); // Paging enabled
        assert!(regs.cr4 & CR4_PAE != 0); // PAE enabled
        assert!(regs.efer & EFER_LME != 0); // Long mode enabled
    }

    #[test]
    fn test_gdt_layout() {
        let gdt = BootGdt::as_bytes();
        assert_eq!(gdt.len(), 24); // 3 entries × 8 bytes

        // First entry should be null
        assert_eq!(&gdt[0..8], &[0u8; 8]);
    }
}