Volt VMM (Neutron Stardust): source-available under AGPSL v5.0

KVM-based microVMM for the Volt platform:
- Sub-second VM boot times
- Minimal memory footprint
- Landlock LSM + seccomp security
- Virtio device support
- Custom kernel management

Copyright (c) Armored Gates LLC. All rights reserved.
Licensed under AGPSL v5.0
This commit is contained in:
Karl Clinger
2026-03-21 01:04:35 -05:00
commit 40ed108dd5
143 changed files with 50300 additions and 0 deletions

115
vmm/src/boot/gdt.rs Normal file
View File

@@ -0,0 +1,115 @@
//! GDT (Global Descriptor Table) Setup for 64-bit Boot
//!
//! Sets up a minimal GDT for 64-bit kernel boot. The kernel will set up
//! its own GDT later, so this is just for the initial transition.
use super::{GuestMemory, Result};
#[cfg(test)]
use super::BootError;
/// GDT address in guest memory
pub const GDT_ADDR: u64 = 0x500;
/// GDT size (3 entries × 8 bytes = 24 bytes, but we add a few more for safety)
pub const GDT_SIZE: usize = 0x30;
/// GDT entry indices (matches Firecracker layout)
///
/// Each selector value is the byte offset of its descriptor within the GDT
/// (entry index << 3), as loaded into the segment registers.
#[allow(dead_code)] // GDT selector constants — part of x86 boot protocol
pub mod selectors {
/// Null segment (required)
pub const NULL: u16 = 0x00;
/// 64-bit code segment (at index 1, selector 0x08)
pub const CODE64: u16 = 0x08;
/// 64-bit data segment (at index 2, selector 0x10)
pub const DATA64: u16 = 0x10;
}
/// Builds the transitional GDT used for the initial 64-bit jump.
pub struct GdtSetup;

impl GdtSetup {
    /// Write a minimal three-entry GDT into guest memory at [`GDT_ADDR`].
    ///
    /// Layout (matches Firecracker):
    /// - 0x00: mandatory null descriptor (left zero)
    /// - 0x08: 64-bit code segment (present, ring 0, execute/read, long mode)
    /// - 0x10: 64-bit data segment (present, ring 0, read/write)
    ///
    /// # Errors
    /// Propagates any guest-memory write failure.
    pub fn setup<M: GuestMemory>(guest_mem: &mut M) -> Result<()> {
        // Clear the whole GDT region first; entry 0 (the required null
        // descriptor) simply stays zeroed.
        guest_mem.write_bytes(GDT_ADDR, &[0u8; GDT_SIZE])?;
        // (offset, descriptor) pairs for the two non-null entries. Base and
        // limit are ignored in long mode; only the access/flag bits matter.
        let descriptors: [(u64, u64); 2] = [
            (0x08, 0x00AF_9B00_0000_FFFF), // 64-bit code segment
            (0x10, 0x00CF_9300_0000_FFFF), // 64-bit data segment
        ];
        for (offset, descriptor) in descriptors {
            guest_mem.write_bytes(GDT_ADDR + offset, &descriptor.to_le_bytes())?;
        }
        tracing::debug!("GDT set up at 0x{:x}", GDT_ADDR);
        Ok(())
    }
}
#[cfg(test)]
mod tests {
use super::*;
// Simple Vec-backed stand-in for guest RAM.
struct MockMemory {
data: Vec<u8>,
}
impl MockMemory {
// Create a zero-filled mock memory of `size` bytes.
fn new(size: usize) -> Self {
Self {
data: vec![0; size],
}
}
// Read one little-endian u64 (a single GDT descriptor) at `addr`.
fn read_u64(&self, addr: u64) -> u64 {
let bytes = &self.data[addr as usize..addr as usize + 8];
u64::from_le_bytes(bytes.try_into().unwrap())
}
}
impl GuestMemory for MockMemory {
// Bounds-checked write into the backing Vec.
fn write_bytes(&mut self, addr: u64, data: &[u8]) -> Result<()> {
let end = addr as usize + data.len();
if end > self.data.len() {
return Err(BootError::GuestMemoryWrite("overflow".into()));
}
self.data[addr as usize..end].copy_from_slice(data);
Ok(())
}
fn size(&self) -> u64 {
self.data.len() as u64
}
}
// All three descriptors must land at their protocol-mandated offsets
// with the exact Firecracker-compatible bit patterns.
#[test]
fn test_gdt_setup() {
let mut mem = MockMemory::new(0x1000);
GdtSetup::setup(&mut mem).unwrap();
// Check null descriptor
assert_eq!(mem.read_u64(GDT_ADDR), 0);
// Check code segment (entry 1, offset 0x08)
let code = mem.read_u64(GDT_ADDR + 0x08);
assert_eq!(code, 0x00AF_9B00_0000_FFFF);
// Check data segment (entry 2, offset 0x10)
let data = mem.read_u64(GDT_ADDR + 0x10);
assert_eq!(data, 0x00CF_9300_0000_FFFF);
}
}

398
vmm/src/boot/initrd.rs Normal file
View File

@@ -0,0 +1,398 @@
//! Initrd/Initramfs Loader
//!
//! Handles loading of initial ramdisk images into guest memory.
//! The initrd is placed in high memory to avoid conflicts with the kernel.
//!
//! # Memory Placement Strategy
//!
//! The initrd is placed as high as possible in guest memory while:
//! 1. Staying below the 4GB boundary (for 32-bit kernel compatibility)
//! 2. Being page-aligned
//! 3. Not overlapping with the kernel
//!
//! This matches the behavior of QEMU and other hypervisors.
use super::{BootError, GuestMemory, Result};
use std::fs::File;
use std::io::Read;
use std::path::Path;
/// Page size for alignment
const PAGE_SIZE: u64 = 4096;
/// Maximum address for initrd (4GB - 1, for 32-bit compatibility)
const MAX_INITRD_ADDR: u64 = 0xFFFF_FFFF;
/// Minimum gap between kernel and initrd
const MIN_KERNEL_INITRD_GAP: u64 = PAGE_SIZE;
/// Initrd loader configuration
#[derive(Debug, Clone)]
pub struct InitrdConfig {
/// Path to initrd/initramfs image
pub path: String,
/// Total guest memory size in bytes (upper bound for placement)
pub memory_size: u64,
/// End address of kernel (guest-physical; placement stays above this)
pub kernel_end: u64,
}
/// Result of initrd loading
#[derive(Debug, Clone)]
pub struct InitrdLoadResult {
/// Guest-physical address where the initrd was loaded (page-aligned)
pub load_addr: u64,
/// Size of loaded initrd in bytes (unpadded file size)
pub size: u64,
}
/// Initrd loader implementation
pub struct InitrdLoader;

impl InitrdLoader {
    /// Load the initrd into guest memory.
    ///
    /// Places the initrd as high as possible while respecting page
    /// alignment, the 4GB boundary, and a safety gap above the kernel.
    ///
    /// # Errors
    /// - [`BootError::InitrdRead`] if the file is missing, unreadable, or empty
    /// - [`BootError::InitrdTooLarge`] if it cannot fit in usable memory
    /// - any guest-memory write error
    pub fn load<M: GuestMemory>(
        config: &InitrdConfig,
        guest_mem: &mut M,
    ) -> Result<InitrdLoadResult> {
        let initrd_data = Self::read_initrd_file(&config.path)?;
        let initrd_size = initrd_data.len() as u64;
        if initrd_size == 0 {
            return Err(BootError::InitrdRead(std::io::Error::new(
                std::io::ErrorKind::InvalidData,
                "Initrd file is empty",
            )));
        }
        // Pick the highest page-aligned address satisfying all constraints.
        let load_addr = Self::calculate_load_address(
            initrd_size,
            config.memory_size,
            config.kernel_end,
            guest_mem.size(),
        )?;
        guest_mem.write_bytes(load_addr, &initrd_data)?;
        Ok(InitrdLoadResult {
            load_addr,
            size: initrd_size,
        })
    }

    /// Read the initrd file fully into memory.
    fn read_initrd_file(path: &str) -> Result<Vec<u8>> {
        let path = Path::new(path);
        // Pre-check only to produce a clearer error message; it is racy vs.
        // concurrent deletion, but File::open below still fails safely then.
        if !path.exists() {
            return Err(BootError::InitrdRead(std::io::Error::new(
                std::io::ErrorKind::NotFound,
                format!("Initrd not found: {}", path.display()),
            )));
        }
        let mut file = File::open(path).map_err(BootError::InitrdRead)?;
        let mut data = Vec::new();
        file.read_to_end(&mut data).map_err(BootError::InitrdRead)?;
        Ok(data)
    }

    /// Calculate the optimal load address for the initrd.
    ///
    /// Strategy:
    /// 1. Place as high as possible, below min(guest memory, configured
    ///    memory, 4GB).
    /// 2. Page-align the start address downward so the end stays in bounds.
    /// 3. Keep at least [`MIN_KERNEL_INITRD_GAP`] above the kernel.
    fn calculate_load_address(
        initrd_size: u64,
        memory_size: u64,
        kernel_end: u64,
        guest_mem_size: u64,
    ) -> Result<u64> {
        // Highest usable address for the end of the initrd.
        let max_addr = guest_mem_size.min(memory_size).min(MAX_INITRD_ADDR);
        // Reserve whole pages for the initrd.
        let aligned_size = Self::align_up(initrd_size, PAGE_SIZE);
        if max_addr < aligned_size {
            return Err(BootError::InitrdTooLarge {
                size: initrd_size,
                available: max_addr,
            });
        }
        // Highest page-aligned start that keeps the initrd below max_addr.
        let ideal_addr = Self::align_down(max_addr - aligned_size, PAGE_SIZE);
        // Lowest acceptable start: one gap above the kernel, rounded up to a
        // page. saturating_add guards against a pathological kernel_end.
        let min_addr = kernel_end.saturating_add(MIN_KERNEL_INITRD_GAP);
        let min_addr_aligned = Self::align_up(min_addr, PAGE_SIZE);
        if ideal_addr < min_addr_aligned {
            // Not enough room between the kernel and the top of usable
            // memory. saturating_sub: min_addr_aligned can exceed max_addr
            // when the kernel already reaches the memory limit; the plain
            // subtraction would underflow (panic in debug builds).
            return Err(BootError::InitrdTooLarge {
                size: initrd_size,
                available: max_addr.saturating_sub(min_addr_aligned),
            });
        }
        Ok(ideal_addr)
    }

    /// Round `value` up to the next multiple of `alignment` (power of two).
    #[inline]
    fn align_up(value: u64, alignment: u64) -> u64 {
        (value + alignment - 1) & !(alignment - 1)
    }

    /// Round `value` down to a multiple of `alignment` (power of two).
    #[inline]
    fn align_down(value: u64, alignment: u64) -> u64 {
        value & !(alignment - 1)
    }
}
// --------------------------------------------------------------------------
// Initrd format detection — planned feature, not yet wired up
// --------------------------------------------------------------------------
/// Helper trait for initrd format detection
#[allow(dead_code)]
pub trait InitrdFormat {
    /// Check if data is a valid initrd format
    fn is_valid(data: &[u8]) -> bool;
    /// Get format name
    fn name() -> &'static str;
}

/// CPIO archive format (traditional initrd)
#[allow(dead_code)]
pub struct CpioFormat;

impl InitrdFormat for CpioFormat {
    fn is_valid(data: &[u8]) -> bool {
        // Every variant check below needs at least the 6-byte ASCII header.
        let Some(head) = data.get(0..6) else {
            return false;
        };
        // ASCII variants: "070701"/"070702" (newc/crc) and "070707" (odc).
        if head == b"070701" || head == b"070702" || head == b"070707" {
            return true;
        }
        // Old binary variant: magic 0x71c7, accepted in either byte order.
        let magic = u16::from_le_bytes([head[0], head[1]]);
        magic == 0x71c7 || magic == 0xc771
    }
    fn name() -> &'static str {
        "CPIO"
    }
}

/// Gzip compressed format
#[allow(dead_code)]
pub struct GzipFormat;

impl InitrdFormat for GzipFormat {
    fn is_valid(data: &[u8]) -> bool {
        // Gzip magic: 0x1f 0x8b
        data.starts_with(&[0x1f, 0x8b])
    }
    fn name() -> &'static str {
        "Gzip"
    }
}

/// XZ compressed format
#[allow(dead_code)]
pub struct XzFormat;

impl InitrdFormat for XzFormat {
    fn is_valid(data: &[u8]) -> bool {
        // XZ magic: 0xfd "7zXZ" 0x00
        data.starts_with(&[0xfd, b'7', b'z', b'X', b'Z', 0x00])
    }
    fn name() -> &'static str {
        "XZ"
    }
}

/// Zstd compressed format
#[allow(dead_code)]
pub struct ZstdFormat;

impl InitrdFormat for ZstdFormat {
    fn is_valid(data: &[u8]) -> bool {
        // Zstd frame magic: 0x28 0xb5 0x2f 0xfd
        data.starts_with(&[0x28, 0xb5, 0x2f, 0xfd])
    }
    fn name() -> &'static str {
        "Zstd"
    }
}

/// LZ4 compressed format
#[allow(dead_code)]
pub struct Lz4Format;

impl InitrdFormat for Lz4Format {
    fn is_valid(data: &[u8]) -> bool {
        // LZ4 frame magic: 0x04 0x22 0x4d 0x18
        data.starts_with(&[0x04, 0x22, 0x4d, 0x18])
    }
    fn name() -> &'static str {
        "LZ4"
    }
}

/// Detect initrd format from data.
///
/// Compression wrappers are tried before CPIO so a compressed archive
/// reports its wrapper rather than the payload.
#[allow(dead_code)]
pub fn detect_initrd_format(data: &[u8]) -> Option<&'static str> {
    if GzipFormat::is_valid(data) {
        Some(GzipFormat::name())
    } else if XzFormat::is_valid(data) {
        Some(XzFormat::name())
    } else if ZstdFormat::is_valid(data) {
        Some(ZstdFormat::name())
    } else if Lz4Format::is_valid(data) {
        Some(Lz4Format::name())
    } else if CpioFormat::is_valid(data) {
        Some(CpioFormat::name())
    } else {
        None
    }
}
#[cfg(test)]
mod tests {
use super::*;
// align_up rounds to the next multiple; exact multiples are unchanged.
#[test]
fn test_align_up() {
assert_eq!(InitrdLoader::align_up(0, 4096), 0);
assert_eq!(InitrdLoader::align_up(1, 4096), 4096);
assert_eq!(InitrdLoader::align_up(4095, 4096), 4096);
assert_eq!(InitrdLoader::align_up(4096, 4096), 4096);
assert_eq!(InitrdLoader::align_up(4097, 4096), 8192);
}
// align_down truncates to the previous multiple; exact multiples unchanged.
#[test]
fn test_align_down() {
assert_eq!(InitrdLoader::align_down(0, 4096), 0);
assert_eq!(InitrdLoader::align_down(4095, 4096), 0);
assert_eq!(InitrdLoader::align_down(4096, 4096), 4096);
assert_eq!(InitrdLoader::align_down(4097, 4096), 4096);
assert_eq!(InitrdLoader::align_down(8191, 4096), 4096);
}
// Happy path: the initrd fits between kernel end and the top of memory.
#[test]
fn test_calculate_load_address() {
// 128MB memory, 4MB kernel ending at 5MB
let memory_size = 128 * 1024 * 1024;
let kernel_end = 5 * 1024 * 1024;
let initrd_size = 10 * 1024 * 1024; // 10MB initrd
let result = InitrdLoader::calculate_load_address(
initrd_size,
memory_size,
kernel_end,
memory_size,
);
assert!(result.is_ok());
let addr = result.unwrap();
// Should be page-aligned
assert_eq!(addr % PAGE_SIZE, 0);
// Should be above kernel
assert!(addr > kernel_end);
// Should fit within memory
assert!(addr + initrd_size <= memory_size as u64);
}
// Error path: initrd larger than all usable memory.
#[test]
fn test_initrd_too_large() {
let memory_size = 16 * 1024 * 1024; // 16MB
let kernel_end = 8 * 1024 * 1024; // Kernel ends at 8MB
let initrd_size = 32 * 1024 * 1024; // 32MB initrd (too large!)
let result = InitrdLoader::calculate_load_address(
initrd_size,
memory_size,
kernel_end,
memory_size,
);
assert!(matches!(result, Err(BootError::InitrdTooLarge { .. })));
}
// Magic-byte detection for each supported compression wrapper.
#[test]
fn test_detect_gzip() {
let data = [0x1f, 0x8b, 0x08, 0x00];
assert!(GzipFormat::is_valid(&data));
assert_eq!(detect_initrd_format(&data), Some("Gzip"));
}
#[test]
fn test_detect_xz() {
let data = [0xfd, b'7', b'z', b'X', b'Z', 0x00];
assert!(XzFormat::is_valid(&data));
assert_eq!(detect_initrd_format(&data), Some("XZ"));
}
#[test]
fn test_detect_zstd() {
let data = [0x28, 0xb5, 0x2f, 0xfd];
assert!(ZstdFormat::is_valid(&data));
assert_eq!(detect_initrd_format(&data), Some("Zstd"));
}
// ASCII "newc" CPIO archive is recognized by its 070701 prefix.
#[test]
fn test_detect_cpio_newc() {
let data = b"070701001234";
assert!(CpioFormat::is_valid(data));
}
}

465
vmm/src/boot/linux.rs Normal file
View File

@@ -0,0 +1,465 @@
//! Linux Boot Protocol Implementation
//!
//! Implements the Linux x86 boot protocol for 64-bit kernels.
//! This sets up the boot_params structure (zero page) that Linux expects
//! when booting in 64-bit mode.
//!
//! # References
//! - Linux kernel: arch/x86/include/uapi/asm/bootparam.h
//! - Linux kernel: Documentation/x86/boot.rst
use super::{layout, BootError, GuestMemory, Result};
/// Boot params address (zero page)
/// Must not overlap with page tables (0x1000-0x10FFF zeroed area) or GDT (0x500-0x52F)
pub const BOOT_PARAMS_ADDR: u64 = 0x20000;

/// Size of boot_params structure (4KB)
pub const BOOT_PARAMS_SIZE: usize = 4096;

/// One entry of the E820 memory map embedded in boot_params.
///
/// `#[repr(C, packed)]` pins the 20-byte in-guest layout exactly
/// (u64 addr, u64 size, u32 type).
#[repr(C, packed)]
#[derive(Debug, Clone, Copy, Default)]
pub struct E820Entry {
    pub addr: u64,
    pub size: u64,
    pub entry_type: u32,
}

/// E820 memory types
#[repr(u32)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(dead_code)] // E820 spec types — kept for completeness
pub enum E820Type {
    Ram = 1,
    Reserved = 2,
    Acpi = 3,
    Nvs = 4,
    Unusable = 5,
}

impl E820Entry {
    /// Convenience constructor for a usable-RAM region.
    pub fn ram(addr: u64, size: u64) -> Self {
        Self::tagged(addr, size, E820Type::Ram)
    }
    /// Convenience constructor for a reserved region.
    pub fn reserved(addr: u64, size: u64) -> Self {
        Self::tagged(addr, size, E820Type::Reserved)
    }
    /// Shared builder: describe a region with the given E820 type.
    fn tagged(addr: u64, size: u64, kind: E820Type) -> Self {
        Self {
            addr,
            size,
            entry_type: kind as u32,
        }
    }
}
/// setup_header structure (at offset 0x1F1 in boot sector, or 0x1F1 in boot_params)
/// We only define the fields we actually use
///
/// Mirrors `struct setup_header` from the Linux boot protocol
/// (arch/x86/include/uapi/asm/bootparam.h). The trailing comment on each
/// field is its byte offset within boot_params.
#[repr(C, packed)]
#[derive(Debug, Clone, Copy)]
pub struct SetupHeader {
pub setup_sects: u8, // 0x1F1
pub root_flags: u16, // 0x1F2
pub syssize: u32, // 0x1F4
pub ram_size: u16, // 0x1F8 (obsolete)
pub vid_mode: u16, // 0x1FA
pub root_dev: u16, // 0x1FC
pub boot_flag: u16, // 0x1FE - should be 0xAA55
pub jump: u16, // 0x200
pub header: u32, // 0x202 - "HdrS" magic
pub version: u16, // 0x206
pub realmode_swtch: u32, // 0x208
pub start_sys_seg: u16, // 0x20C (obsolete)
pub kernel_version: u16, // 0x20E
pub type_of_loader: u8, // 0x210
pub loadflags: u8, // 0x211
pub setup_move_size: u16, // 0x212
pub code32_start: u32, // 0x214
pub ramdisk_image: u32, // 0x218
pub ramdisk_size: u32, // 0x21C
pub bootsect_kludge: u32, // 0x220
pub heap_end_ptr: u16, // 0x224
pub ext_loader_ver: u8, // 0x226
pub ext_loader_type: u8, // 0x227
pub cmd_line_ptr: u32, // 0x228
pub initrd_addr_max: u32, // 0x22C
pub kernel_alignment: u32, // 0x230
pub relocatable_kernel: u8, // 0x234
pub min_alignment: u8, // 0x235
pub xloadflags: u16, // 0x236
pub cmdline_size: u32, // 0x238
pub hardware_subarch: u32, // 0x23C
pub hardware_subarch_data: u64, // 0x240
pub payload_offset: u32, // 0x248
pub payload_length: u32, // 0x24C
pub setup_data: u64, // 0x250
pub pref_address: u64, // 0x258
pub init_size: u32, // 0x260
pub handover_offset: u32, // 0x264
pub kernel_info_offset: u32, // 0x268
}
impl Default for SetupHeader {
/// Defaults chosen for direct boot of a modern 64-bit kernel; callers
/// override the boot-specific fields (cmd_line_ptr, ramdisk_*) afterwards.
fn default() -> Self {
Self {
setup_sects: 0,
root_flags: 0,
syssize: 0,
ram_size: 0,
vid_mode: 0xFFFF, // VGA normal
root_dev: 0,
boot_flag: 0xAA55, // boot-sector magic
jump: 0,
header: 0x53726448, // "HdrS"
version: 0x020F, // Protocol version 2.15
realmode_swtch: 0,
start_sys_seg: 0,
kernel_version: 0,
type_of_loader: 0xFF, // Undefined loader
loadflags: LOADFLAG_LOADED_HIGH | LOADFLAG_CAN_USE_HEAP,
setup_move_size: 0,
code32_start: 0x100000, // 1MB
ramdisk_image: 0,
ramdisk_size: 0,
bootsect_kludge: 0,
heap_end_ptr: 0, // NOTE(review): CAN_USE_HEAP is set but heap_end_ptr stays 0 — presumably ignored on the 64-bit boot path; confirm
ext_loader_ver: 0,
ext_loader_type: 0,
cmd_line_ptr: 0, // filled in by LinuxBootSetup::setup
initrd_addr_max: 0x7FFFFFFF,
kernel_alignment: 0x200000, // 2MB
relocatable_kernel: 1,
min_alignment: 21, // 2^21 = 2MB
xloadflags: XLF_KERNEL_64 | XLF_CAN_BE_LOADED_ABOVE_4G,
cmdline_size: 4096,
hardware_subarch: 0, // PC
hardware_subarch_data: 0,
payload_offset: 0,
payload_length: 0,
setup_data: 0,
pref_address: 0x1000000, // 16MB
init_size: 0,
handover_offset: 0,
kernel_info_offset: 0,
}
}
}
// Linux boot protocol constants — kept for completeness
// Loadflags: bit values for the byte at offset 0x211 in setup_header
#[allow(dead_code)]
pub const LOADFLAG_LOADED_HIGH: u8 = 0x01; // Kernel loaded high (at 0x100000)
#[allow(dead_code)]
pub const LOADFLAG_KASLR_FLAG: u8 = 0x02; // KASLR enabled
#[allow(dead_code)]
pub const LOADFLAG_QUIET_FLAG: u8 = 0x20; // Quiet boot
#[allow(dead_code)]
pub const LOADFLAG_KEEP_SEGMENTS: u8 = 0x40; // Don't reload segments
#[allow(dead_code)]
pub const LOADFLAG_CAN_USE_HEAP: u8 = 0x80; // Heap available
/// XLoadflags bits (u16 at offset 0x236 in setup_header)
#[allow(dead_code)]
pub const XLF_KERNEL_64: u16 = 0x0001; // 64-bit kernel
#[allow(dead_code)]
pub const XLF_CAN_BE_LOADED_ABOVE_4G: u16 = 0x0002; // Can load above 4GB
#[allow(dead_code)]
pub const XLF_EFI_HANDOVER_32: u16 = 0x0004; // EFI handover 32-bit
#[allow(dead_code)]
pub const XLF_EFI_HANDOVER_64: u16 = 0x0008; // EFI handover 64-bit
#[allow(dead_code)]
pub const XLF_EFI_KEXEC: u16 = 0x0010; // EFI kexec
/// Maximum E820 entries in boot_params
#[allow(dead_code)]
pub const E820_MAX_ENTRIES: usize = 128;
/// Offsets within boot_params structure
#[allow(dead_code)] // Linux boot protocol offsets — kept for reference
pub mod offsets {
/// setup_header starts at 0x1F1
pub const SETUP_HEADER: usize = 0x1F1;
/// E820 entry count (a single byte) at 0x1E8
pub const E820_ENTRIES: usize = 0x1E8;
/// E820 table starts at 0x2D0
pub const E820_TABLE: usize = 0x2D0;
/// Size of one serialized E820 entry in bytes (u64 + u64 + u32)
pub const E820_ENTRY_SIZE: usize = 20;
}
/// Configuration for Linux boot setup
#[derive(Debug, Clone)]
pub struct LinuxBootConfig {
/// Total memory size in bytes
pub memory_size: u64,
/// Physical address of command line string (stored as u32 in setup_header,
/// so it must be below 4GB)
pub cmdline_addr: u64,
/// Physical address of initrd (if any); only takes effect when
/// `initrd_size` is also set
pub initrd_addr: Option<u64>,
/// Size of initrd (if any); only takes effect when `initrd_addr` is also set
pub initrd_size: Option<u64>,
}
/// Linux boot setup implementation
pub struct LinuxBootSetup;
impl LinuxBootSetup {
/// Set up Linux boot_params structure in guest memory
///
/// This creates the "zero page" that Linux expects when booting in 64-bit mode.
/// The boot_params address should be passed to the kernel via RSI register.
pub fn setup<M: GuestMemory>(config: &LinuxBootConfig, guest_mem: &mut M) -> Result<u64> {
// Allocate and zero the boot_params structure (4KB)
let boot_params = vec![0u8; BOOT_PARAMS_SIZE];
guest_mem.write_bytes(BOOT_PARAMS_ADDR, &boot_params)?;
// Build E820 memory map
let e820_entries = Self::build_e820_map(config.memory_size)?;
// Write E820 entry count
let e820_count = e820_entries.len() as u8;
guest_mem.write_bytes(
BOOT_PARAMS_ADDR + offsets::E820_ENTRIES as u64,
&[e820_count],
)?;
// Write E820 entries
for (i, entry) in e820_entries.iter().enumerate() {
let offset = BOOT_PARAMS_ADDR + offsets::E820_TABLE as u64
+ (i * offsets::E820_ENTRY_SIZE) as u64;
let bytes = unsafe {
std::slice::from_raw_parts(
entry as *const E820Entry as *const u8,
offsets::E820_ENTRY_SIZE,
)
};
guest_mem.write_bytes(offset, bytes)?;
}
// Build and write setup_header
let mut header = SetupHeader::default();
header.cmd_line_ptr = config.cmdline_addr as u32;
if let (Some(addr), Some(size)) = (config.initrd_addr, config.initrd_size) {
header.ramdisk_image = addr as u32;
header.ramdisk_size = size as u32;
}
// Write setup_header to boot_params
Self::write_setup_header(guest_mem, &header)?;
tracing::debug!(
"Linux boot_params setup at 0x{:x}: {} E820 entries, cmdline=0x{:x}",
BOOT_PARAMS_ADDR,
e820_count,
config.cmdline_addr
);
Ok(BOOT_PARAMS_ADDR)
}
/// Build E820 memory map for the VM
/// Layout matches Firecracker's working E820 configuration
fn build_e820_map(memory_size: u64) -> Result<Vec<E820Entry>> {
let mut entries = Vec::with_capacity(5);
if memory_size < layout::HIGH_MEMORY_START {
return Err(BootError::MemoryLayout(format!(
"Memory size {} is less than minimum required {}",
memory_size,
layout::HIGH_MEMORY_START
)));
}
// EBDA (Extended BIOS Data Area) boundary - Firecracker uses 0x9FC00
const EBDA_START: u64 = 0x9FC00;
// Low memory: 0 to EBDA (usable RAM) - matches Firecracker
entries.push(E820Entry::ram(0, EBDA_START));
// EBDA: Reserved area just below 640KB
entries.push(E820Entry::reserved(EBDA_START, layout::LOW_MEMORY_END - EBDA_START));
// Legacy hole: 640KB to 1MB (reserved for VGA/ROMs)
let legacy_hole_size = layout::HIGH_MEMORY_START - layout::LOW_MEMORY_END;
entries.push(E820Entry::reserved(layout::LOW_MEMORY_END, legacy_hole_size));
// High memory: 1MB to end of RAM
let high_memory_size = memory_size - layout::HIGH_MEMORY_START;
if high_memory_size > 0 {
entries.push(E820Entry::ram(layout::HIGH_MEMORY_START, high_memory_size));
}
Ok(entries)
}
/// Write setup_header to boot_params
fn write_setup_header<M: GuestMemory>(guest_mem: &mut M, header: &SetupHeader) -> Result<()> {
// The setup_header structure is written at offset 0x1F1 within boot_params
// We need to write individual fields at their correct offsets
let base = BOOT_PARAMS_ADDR;
// 0x1F1: setup_sects
guest_mem.write_bytes(base + 0x1F1, &[header.setup_sects])?;
// 0x1F2: root_flags
guest_mem.write_bytes(base + 0x1F2, &header.root_flags.to_le_bytes())?;
// 0x1F4: syssize
guest_mem.write_bytes(base + 0x1F4, &header.syssize.to_le_bytes())?;
// 0x1FE: boot_flag
guest_mem.write_bytes(base + 0x1FE, &header.boot_flag.to_le_bytes())?;
// 0x202: header magic
guest_mem.write_bytes(base + 0x202, &header.header.to_le_bytes())?;
// 0x206: version
guest_mem.write_bytes(base + 0x206, &header.version.to_le_bytes())?;
// 0x210: type_of_loader
guest_mem.write_bytes(base + 0x210, &[header.type_of_loader])?;
// 0x211: loadflags
guest_mem.write_bytes(base + 0x211, &[header.loadflags])?;
// 0x214: code32_start
guest_mem.write_bytes(base + 0x214, &header.code32_start.to_le_bytes())?;
// 0x218: ramdisk_image
guest_mem.write_bytes(base + 0x218, &header.ramdisk_image.to_le_bytes())?;
// 0x21C: ramdisk_size
guest_mem.write_bytes(base + 0x21C, &header.ramdisk_size.to_le_bytes())?;
// 0x224: heap_end_ptr
guest_mem.write_bytes(base + 0x224, &header.heap_end_ptr.to_le_bytes())?;
// 0x228: cmd_line_ptr
guest_mem.write_bytes(base + 0x228, &header.cmd_line_ptr.to_le_bytes())?;
// 0x22C: initrd_addr_max
guest_mem.write_bytes(base + 0x22C, &header.initrd_addr_max.to_le_bytes())?;
// 0x230: kernel_alignment
guest_mem.write_bytes(base + 0x230, &header.kernel_alignment.to_le_bytes())?;
// 0x234: relocatable_kernel
guest_mem.write_bytes(base + 0x234, &[header.relocatable_kernel])?;
// 0x236: xloadflags
guest_mem.write_bytes(base + 0x236, &header.xloadflags.to_le_bytes())?;
// 0x238: cmdline_size
guest_mem.write_bytes(base + 0x238, &header.cmdline_size.to_le_bytes())?;
// 0x23C: hardware_subarch
guest_mem.write_bytes(base + 0x23C, &header.hardware_subarch.to_le_bytes())?;
// 0x258: pref_address
guest_mem.write_bytes(base + 0x258, &header.pref_address.to_le_bytes())?;
Ok(())
}
}
#[cfg(test)]
mod tests {
use super::*;
// Vec-backed guest memory mock. `size` is reported separately from the
// backing buffer, but the tests keep the two equal.
struct MockMemory {
size: u64,
data: Vec<u8>,
}
impl MockMemory {
fn new(size: u64) -> Self {
Self {
size,
data: vec![0; size as usize],
}
}
// Borrow `len` bytes starting at `addr` for inspection.
fn read_bytes(&self, addr: u64, len: usize) -> &[u8] {
&self.data[addr as usize..addr as usize + len]
}
}
impl GuestMemory for MockMemory {
// Bounds-checked write into the backing Vec.
fn write_bytes(&mut self, addr: u64, data: &[u8]) -> Result<()> {
let end = addr as usize + data.len();
if end > self.data.len() {
return Err(BootError::GuestMemoryWrite(format!(
"Write at {:#x} exceeds memory",
addr
)));
}
self.data[addr as usize..end].copy_from_slice(data);
Ok(())
}
fn size(&self) -> u64 {
self.size
}
}
// The packed struct must serialize to exactly 20 bytes per the protocol.
#[test]
fn test_e820_entry_size() {
assert_eq!(std::mem::size_of::<E820Entry>(), 20);
}
// End-to-end: setup() writes boot_flag, "HdrS" magic, and an E820 map.
#[test]
fn test_linux_boot_setup() {
let mut mem = MockMemory::new(128 * 1024 * 1024);
let config = LinuxBootConfig {
memory_size: 128 * 1024 * 1024,
cmdline_addr: layout::CMDLINE_ADDR,
initrd_addr: None,
initrd_size: None,
};
let result = LinuxBootSetup::setup(&config, &mut mem);
assert!(result.is_ok());
assert_eq!(result.unwrap(), BOOT_PARAMS_ADDR);
// Verify boot_flag
let boot_flag = u16::from_le_bytes([
mem.data[BOOT_PARAMS_ADDR as usize + 0x1FE],
mem.data[BOOT_PARAMS_ADDR as usize + 0x1FF],
]);
assert_eq!(boot_flag, 0xAA55);
// Verify header magic
let magic = u32::from_le_bytes([
mem.data[BOOT_PARAMS_ADDR as usize + 0x202],
mem.data[BOOT_PARAMS_ADDR as usize + 0x203],
mem.data[BOOT_PARAMS_ADDR as usize + 0x204],
mem.data[BOOT_PARAMS_ADDR as usize + 0x205],
]);
assert_eq!(magic, 0x53726448); // "HdrS"
// Verify E820 entry count > 0
let e820_count = mem.data[BOOT_PARAMS_ADDR as usize + offsets::E820_ENTRIES];
assert!(e820_count >= 3);
}
// Checks the four-region layout and type tags of the generated map.
#[test]
fn test_e820_map() {
let memory_size = 256 * 1024 * 1024; // 256MB
let entries = LinuxBootSetup::build_e820_map(memory_size).unwrap();
// 4 entries: low RAM (0..EBDA), EBDA reserved, legacy hole (640K-1M), high RAM
assert_eq!(entries.len(), 4);
// Low memory (0 to EBDA) — copy fields from packed struct to avoid unaligned references
let e0_addr = entries[0].addr;
let e0_type = entries[0].entry_type;
assert_eq!(e0_addr, 0);
assert_eq!(e0_type, E820Type::Ram as u32);
// EBDA reserved region
let e1_addr = entries[1].addr;
let e1_type = entries[1].entry_type;
assert_eq!(e1_addr, 0x9FC00); // EBDA_START
assert_eq!(e1_type, E820Type::Reserved as u32);
// Legacy hole (640KB to 1MB)
let e2_addr = entries[2].addr;
let e2_type = entries[2].entry_type;
assert_eq!(e2_addr, layout::LOW_MEMORY_END);
assert_eq!(e2_type, E820Type::Reserved as u32);
// High memory (1MB+)
let e3_addr = entries[3].addr;
let e3_type = entries[3].entry_type;
assert_eq!(e3_addr, layout::HIGH_MEMORY_START);
assert_eq!(e3_type, E820Type::Ram as u32);
}
}

576
vmm/src/boot/loader.rs Normal file
View File

@@ -0,0 +1,576 @@
//! Kernel Loader
//!
//! Loads Linux kernels in ELF64 or bzImage format directly into guest memory.
//! Supports PVH boot protocol for fastest possible boot times.
//!
//! # Kernel Formats
//!
//! ## ELF64 (vmlinux)
//! - Uncompressed kernel with ELF headers
//! - Direct load to specified address
//! - Entry point from ELF header
//!
//! ## bzImage
//! - Compressed kernel with setup header
//! - Requires parsing setup header for entry point
//! - Kernel loaded after setup sectors
use super::{layout, BootError, GuestMemory, Result};
use std::fs::File;
use std::io::Read;
use std::path::Path;
/// ELF magic number
const ELF_MAGIC: [u8; 4] = [0x7f, b'E', b'L', b'F'];
/// bzImage magic number at offset 0x202
const BZIMAGE_MAGIC: u32 = 0x53726448; // "HdrS"
/// Minimum boot protocol version for PVH
// NOTE(review): not referenced in the code visible here — confirm it is
// enforced elsewhere in this module before relying on it.
const MIN_BOOT_PROTOCOL_VERSION: u16 = 0x0200;
/// bzImage header offsets
///
/// Byte offsets into the bzImage setup header, per the Linux boot protocol
/// (Documentation/x86/boot.rst).
#[allow(dead_code)] // Linux bzImage protocol constants — kept for completeness
mod bzimage {
/// Magic number offset
pub const HEADER_MAGIC_OFFSET: usize = 0x202;
/// Boot protocol version offset
pub const VERSION_OFFSET: usize = 0x206;
/// Kernel version string pointer offset
pub const KERNEL_VERSION_OFFSET: usize = 0x20e;
/// Setup sectors count offset (at 0x1f1)
pub const SETUP_SECTS_OFFSET: usize = 0x1f1;
/// Setup header size (minimum)
pub const SETUP_HEADER_SIZE: usize = 0x0202;
/// Sector size
pub const SECTOR_SIZE: usize = 512;
/// Default setup sectors if field is 0
pub const DEFAULT_SETUP_SECTS: u8 = 4;
/// Boot flag offset
pub const BOOT_FLAG_OFFSET: usize = 0x1fe;
/// Expected boot flag value
pub const BOOT_FLAG_VALUE: u16 = 0xaa55;
/// Real mode kernel header size
pub const REAL_MODE_HEADER_SIZE: usize = 0x8000;
/// Loadflags offset
pub const LOADFLAGS_OFFSET: usize = 0x211;
/// Loadflag: kernel is loaded high (at 0x100000)
pub const LOADFLAG_LOADED_HIGH: u8 = 0x01;
/// Loadflag: can use heap
pub const LOADFLAG_CAN_USE_HEAP: u8 = 0x80;
/// Code32 start offset
pub const CODE32_START_OFFSET: usize = 0x214;
/// Kernel alignment offset
pub const KERNEL_ALIGNMENT_OFFSET: usize = 0x230;
/// Pref address offset (64-bit)
pub const PREF_ADDRESS_OFFSET: usize = 0x258;
/// XLoadflags offset
pub const XLOADFLAGS_OFFSET: usize = 0x236;
/// XLoadflag: kernel has EFI handover
pub const XLF_KERNEL_64: u16 = 0x0001;
/// XLoadflag: can be loaded above 4GB
pub const XLF_CAN_BE_LOADED_ABOVE_4G: u16 = 0x0002;
}
/// Kernel type detection result
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum KernelType {
/// ELF64 format (vmlinux) — uncompressed image with ELF headers
Elf64,
/// bzImage format (compressed, with a Linux setup header)
BzImage,
}
/// Kernel loader configuration
#[derive(Debug, Clone)]
pub struct KernelConfig {
/// Path to kernel image
pub path: String,
/// Guest-physical address to load the kernel at (typically 1MB)
pub load_addr: u64,
}
/// Result of kernel loading
#[derive(Debug, Clone)]
#[allow(dead_code)]
pub struct KernelLoadResult {
/// Address where kernel was loaded
pub load_addr: u64,
/// Total size of loaded kernel in bytes
pub size: u64,
/// Entry point address
pub entry_point: u64,
/// Detected kernel type
pub kernel_type: KernelType,
}
/// Kernel loader implementation
///
/// Stateless namespace type: all loading operations are associated functions.
pub struct KernelLoader;
impl KernelLoader {
/// Load a kernel image into guest memory.
///
/// The image format is detected from its magic bytes (ELF64 vmlinux or
/// bzImage) and dispatched to the matching loader for PVH boot.
pub fn load<M: GuestMemory>(config: &KernelConfig, guest_mem: &mut M) -> Result<KernelLoadResult> {
    let image = Self::read_kernel_file(&config.path)?;
    match Self::detect_kernel_type(&image)? {
        KernelType::Elf64 => Self::load_elf64(&image, config.load_addr, guest_mem),
        KernelType::BzImage => Self::load_bzimage(&image, config.load_addr, guest_mem),
    }
}
/// Read the kernel image file fully into memory.
///
/// The buffer is pre-sized from file metadata so the read never
/// reallocates; for a 21MB kernel this saves ~2ms of Vec growth.
fn read_kernel_file(path: &str) -> Result<Vec<u8>> {
    let mut file = File::open(Path::new(path)).map_err(BootError::KernelRead)?;
    let metadata = file.metadata().map_err(BootError::KernelRead)?;
    let file_size = metadata.len() as usize;
    if file_size == 0 {
        return Err(BootError::InvalidKernel("Kernel file is empty".into()));
    }
    let mut data = Vec::with_capacity(file_size);
    file.read_to_end(&mut data).map_err(BootError::KernelRead)?;
    Ok(data)
}
/// Determine the kernel image format from its magic numbers.
///
/// # Errors
/// Returns `InvalidKernel`/`InvalidElf` when the image is too small,
/// not ELF64, or carries no recognizable magic.
fn detect_kernel_type(data: &[u8]) -> Result<KernelType> {
    if data.len() < 4 {
        return Err(BootError::InvalidKernel("Kernel image too small".into()));
    }
    // ELF images: "\x7fELF" magic, then EI_CLASS must be 2 (ELFCLASS64).
    if data.starts_with(&ELF_MAGIC) {
        return match data.get(4) {
            Some(&2) => Ok(KernelType::Elf64),
            _ => Err(BootError::InvalidElf(
                "Only ELF64 kernels are supported".into(),
            )),
        };
    }
    // bzImage: "HdrS" magic at offset 0x202 in the setup header.
    if let Some(raw) = data.get(bzimage::HEADER_MAGIC_OFFSET..bzimage::HEADER_MAGIC_OFFSET + 4) {
        let magic = u32::from_le_bytes([raw[0], raw[1], raw[2], raw[3]]);
        // NOTE(review): the half-magic fallback accepts any value whose low
        // 16 bits are "Hd" — presumably intended for older headers; confirm
        // before tightening.
        if magic == BZIMAGE_MAGIC || (magic & 0xffff) == (BZIMAGE_MAGIC & 0xffff) {
            return Ok(KernelType::BzImage);
        }
    }
    Err(BootError::InvalidKernel(
        "Unknown kernel format (expected ELF64 or bzImage)".into(),
    ))
}
/// Load ELF64 kernel (vmlinux)
///
/// # Warning: vmlinux Direct Boot Limitations
///
/// Loading vmlinux ELF directly has a fundamental limitation: the kernel's
/// `__startup_64()` function builds its own page tables that ONLY map the
/// kernel text region. After the CR3 switch, low memory (0-16MB) is unmapped,
/// causing faults when accessing boot_params or any low memory address.
///
/// **Recommended**: Use bzImage format instead, which includes a decompressor
/// that properly sets up full identity mapping for all memory.
///
/// See `docs/kernel-pagetable-analysis.md` for detailed analysis.
fn load_elf64<M: GuestMemory>(
data: &[u8],
load_addr: u64,
guest_mem: &mut M,
) -> Result<KernelLoadResult> {
// CRITICAL WARNING: vmlinux direct boot may fail
tracing::warn!(
"Loading vmlinux ELF directly. This may fail due to kernel page table setup. \
The kernel's __startup_64() builds its own page tables that don't map low memory. \
Consider using bzImage format for reliable boot."
);
// Parse ELF header
let elf = Elf64Header::parse(data)?;
// Validate it's an executable
if elf.e_type != 2 {
// ET_EXEC
return Err(BootError::InvalidElf("Not an executable ELF".into()));
}
// Validate machine type (x86_64 = 62)
if elf.e_machine != 62 {
return Err(BootError::InvalidElf(format!(
"Unsupported machine type: {} (expected x86_64)",
elf.e_machine
)));
}
let mut kernel_end = load_addr;
// Load program headers
for i in 0..elf.e_phnum {
let ph_offset = elf.e_phoff as usize + (i as usize * elf.e_phentsize as usize);
let ph = Elf64ProgramHeader::parse(&data[ph_offset..])?;
// Only load PT_LOAD segments
if ph.p_type != 1 {
continue;
}
// Calculate destination address
// For PVH, we load at the physical address specified in the ELF
// or offset from our load address
let dest_addr = if ph.p_paddr >= layout::HIGH_MEMORY_START {
ph.p_paddr
} else {
load_addr + ph.p_paddr
};
// Validate we have space
if dest_addr + ph.p_memsz > guest_mem.size() {
return Err(BootError::KernelTooLarge {
size: dest_addr + ph.p_memsz,
available: guest_mem.size(),
});
}
// Load file contents
let file_start = ph.p_offset as usize;
let file_end = file_start + ph.p_filesz as usize;
if file_end > data.len() {
return Err(BootError::InvalidElf("Program header exceeds file size".into()));
}
guest_mem.write_bytes(dest_addr, &data[file_start..file_end])?;
// Zero BSS (memsz > filesz)
if ph.p_memsz > ph.p_filesz {
let bss_start = dest_addr + ph.p_filesz;
let bss_size = (ph.p_memsz - ph.p_filesz) as usize;
let zeros = vec![0u8; bss_size];
guest_mem.write_bytes(bss_start, &zeros)?;
}
kernel_end = kernel_end.max(dest_addr + ph.p_memsz);
tracing::debug!(
"Loaded ELF segment: dest=0x{:x}, filesz=0x{:x}, memsz=0x{:x}",
dest_addr,
ph.p_filesz,
ph.p_memsz
);
}
tracing::debug!(
"ELF kernel loaded: entry=0x{:x}, kernel_end=0x{:x}",
elf.e_entry,
kernel_end
);
// For vmlinux ELF, the e_entry is the physical entry point.
// But the kernel code is compiled for the virtual address.
// We map both identity (physical) and high-kernel (virtual) addresses,
// but it's better to use the physical entry for startup_64 which is
// designed to run with identity mapping first.
//
// However, if the kernel immediately triple-faults at the physical address,
// we can try the virtual address instead.
// Virtual address = 0xFFFFFFFF80000000 + (physical - 0x1000000) + offset_within_text
// For entry at physical 0x1000000, virtual would be 0xFFFFFFFF81000000
let virtual_entry = 0xFFFFFFFF81000000u64 + (elf.e_entry - 0x1000000);
tracing::debug!(
"Entry points: physical=0x{:x}, virtual=0x{:x}",
elf.e_entry, virtual_entry
);
Ok(KernelLoadResult {
load_addr,
size: kernel_end - load_addr,
// Use PHYSICAL entry point - kernel's startup_64 expects identity mapping
entry_point: elf.e_entry,
kernel_type: KernelType::Elf64,
})
}
/// Load bzImage kernel
fn load_bzimage<M: GuestMemory>(
data: &[u8],
load_addr: u64,
guest_mem: &mut M,
) -> Result<KernelLoadResult> {
// Validate minimum size
if data.len() < bzimage::SETUP_HEADER_SIZE + bzimage::SECTOR_SIZE {
return Err(BootError::InvalidBzImage("Image too small".into()));
}
// Check boot flag
let boot_flag = u16::from_le_bytes([
data[bzimage::BOOT_FLAG_OFFSET],
data[bzimage::BOOT_FLAG_OFFSET + 1],
]);
if boot_flag != bzimage::BOOT_FLAG_VALUE {
return Err(BootError::InvalidBzImage(format!(
"Invalid boot flag: {:#x}",
boot_flag
)));
}
// Get boot protocol version
let version = u16::from_le_bytes([
data[bzimage::VERSION_OFFSET],
data[bzimage::VERSION_OFFSET + 1],
]);
if version < MIN_BOOT_PROTOCOL_VERSION {
return Err(BootError::UnsupportedVersion(format!(
"Boot protocol {}.{} is too old (minimum 2.0)",
version >> 8,
version & 0xff
)));
}
// Get setup sectors count
let mut setup_sects = data[bzimage::SETUP_SECTS_OFFSET];
if setup_sects == 0 {
setup_sects = bzimage::DEFAULT_SETUP_SECTS;
}
// Calculate kernel offset (setup sectors + boot sector)
let setup_size = (setup_sects as usize + 1) * bzimage::SECTOR_SIZE;
if setup_size >= data.len() {
return Err(BootError::InvalidBzImage(
"Setup size exceeds image size".into(),
));
}
// Get loadflags
let loadflags = data[bzimage::LOADFLAGS_OFFSET];
let loaded_high = (loadflags & bzimage::LOADFLAG_LOADED_HIGH) != 0;
// For modern kernels (protocol >= 2.0), get code32 entry point
let code32_start = if version >= 0x0200 {
u32::from_le_bytes([
data[bzimage::CODE32_START_OFFSET],
data[bzimage::CODE32_START_OFFSET + 1],
data[bzimage::CODE32_START_OFFSET + 2],
data[bzimage::CODE32_START_OFFSET + 3],
])
} else {
0x100000 // Default high load address
};
// Check for 64-bit support (protocol >= 2.11)
let supports_64bit = if version >= 0x020b {
let xloadflags = u16::from_le_bytes([
data[bzimage::XLOADFLAGS_OFFSET],
data[bzimage::XLOADFLAGS_OFFSET + 1],
]);
(xloadflags & bzimage::XLF_KERNEL_64) != 0
} else {
false
};
// Get preferred load address (protocol >= 2.10)
let pref_address = if version >= 0x020a && data.len() > bzimage::PREF_ADDRESS_OFFSET + 8 {
u64::from_le_bytes([
data[bzimage::PREF_ADDRESS_OFFSET],
data[bzimage::PREF_ADDRESS_OFFSET + 1],
data[bzimage::PREF_ADDRESS_OFFSET + 2],
data[bzimage::PREF_ADDRESS_OFFSET + 3],
data[bzimage::PREF_ADDRESS_OFFSET + 4],
data[bzimage::PREF_ADDRESS_OFFSET + 5],
data[bzimage::PREF_ADDRESS_OFFSET + 6],
data[bzimage::PREF_ADDRESS_OFFSET + 7],
])
} else {
layout::KERNEL_LOAD_ADDR
};
// Determine actual load address
let actual_load_addr = if loaded_high {
if pref_address != 0 {
pref_address
} else {
load_addr
}
} else {
load_addr
};
// Extract protected mode kernel
let kernel_data = &data[setup_size..];
let kernel_size = kernel_data.len() as u64;
// Validate size
if actual_load_addr + kernel_size > guest_mem.size() {
return Err(BootError::KernelTooLarge {
size: kernel_size,
available: guest_mem.size() - actual_load_addr,
});
}
// Write kernel to guest memory
guest_mem.write_bytes(actual_load_addr, kernel_data)?;
// Determine entry point
// For PVH boot, we enter at the 64-bit entry point
// which is typically at load_addr + 0x200 for modern kernels
let entry_point = if supports_64bit {
// 64-bit entry point offset in newer kernels
actual_load_addr + 0x200
} else {
code32_start as u64
};
Ok(KernelLoadResult {
load_addr: actual_load_addr,
size: kernel_size,
entry_point,
kernel_type: KernelType::BzImage,
})
}
}
/// ELF64 header structure
///
/// Minimal view of the ELF64 file header: only the fields this loader
/// consumes for validation (type/machine), the entry point, and the
/// program-header table geometry.
#[derive(Debug, Default)]
struct Elf64Header {
    /// Object file type (2 = ET_EXEC, the only type the loader accepts)
    e_type: u16,
    /// Target machine (62 = EM_X86_64)
    e_machine: u16,
    /// Physical entry point address
    e_entry: u64,
    /// File offset of the program-header table
    e_phoff: u64,
    /// Number of program headers
    e_phnum: u16,
    /// Size in bytes of one program-header entry
    e_phentsize: u16,
}
impl Elf64Header {
    /// Parse the fixed 64-byte ELF64 header.
    ///
    /// Validates the magic, class (must be ELF64), and data encoding (must
    /// be little-endian) before decoding the fields the loader needs.
    fn parse(data: &[u8]) -> Result<Self> {
        if data.len() < 64 {
            return Err(BootError::InvalidElf("ELF header too small".into()));
        }
        if data[0..4] != ELF_MAGIC {
            return Err(BootError::InvalidElf("Invalid ELF magic".into()));
        }
        if data[4] != 2 {
            return Err(BootError::InvalidElf("Not ELF64".into()));
        }
        if data[5] != 1 {
            return Err(BootError::InvalidElf("Not little-endian".into()));
        }
        // All field offsets below are < 64, so the length check above makes
        // these slice-to-array conversions infallible.
        let read_u16 =
            |off: usize| u16::from_le_bytes(data[off..off + 2].try_into().expect("2-byte slice"));
        let read_u64 =
            |off: usize| u64::from_le_bytes(data[off..off + 8].try_into().expect("8-byte slice"));
        Ok(Self {
            e_type: read_u16(16),
            e_machine: read_u16(18),
            e_entry: read_u64(24),
            e_phoff: read_u64(32),
            e_phentsize: read_u16(54),
            e_phnum: read_u16(56),
        })
    }
}
/// ELF64 program header structure
///
/// Minimal subset of an ELF64 program header: segment type, file range
/// (offset/filesz), target physical address, and in-memory size.
#[derive(Debug, Default)]
struct Elf64ProgramHeader {
    /// Segment type (1 = PT_LOAD; everything else is skipped by the loader)
    p_type: u32,
    /// File offset of the segment data
    p_offset: u64,
    /// Physical load address
    p_paddr: u64,
    /// Number of bytes present in the file
    p_filesz: u64,
    /// Number of bytes in memory (excess over p_filesz is BSS, zero-filled)
    p_memsz: u64,
}
impl Elf64ProgramHeader {
    /// Decode one 56-byte ELF64 program-header entry from the start of
    /// `data`, keeping only the fields the loader uses.
    fn parse(data: &[u8]) -> Result<Self> {
        if data.len() < 56 {
            return Err(BootError::InvalidElf("Program header too small".into()));
        }
        // Offsets below are < 56, so these conversions cannot fail after the
        // length check above.
        let read_u64 =
            |off: usize| u64::from_le_bytes(data[off..off + 8].try_into().expect("8-byte slice"));
        Ok(Self {
            p_type: u32::from_le_bytes(data[0..4].try_into().expect("4-byte slice")),
            p_offset: read_u64(8),
            p_paddr: read_u64(24),
            p_filesz: read_u64(32),
            p_memsz: read_u64(40),
        })
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    // Kernel-type detection: an ELF magic plus class byte 2 (ELF64) must be
    // recognized as an ELF64 image.
    #[test]
    fn test_detect_elf_magic() {
        let mut elf_data = vec![0u8; 64];
        elf_data[0..4].copy_from_slice(&ELF_MAGIC);
        elf_data[4] = 2; // ELF64
        let result = KernelLoader::detect_kernel_type(&elf_data);
        assert!(matches!(result, Ok(KernelType::Elf64)));
    }
    // A minimal bzImage header (boot flag + "HdrS" magic) must be detected
    // as a bzImage.
    #[test]
    fn test_detect_bzimage_magic() {
        let mut bzimage_data = vec![0u8; 0x210];
        // Set boot flag
        bzimage_data[bzimage::BOOT_FLAG_OFFSET] = 0x55;
        bzimage_data[bzimage::BOOT_FLAG_OFFSET + 1] = 0xaa;
        // Set HdrS magic
        bzimage_data[bzimage::HEADER_MAGIC_OFFSET] = 0x48; // 'H'
        bzimage_data[bzimage::HEADER_MAGIC_OFFSET + 1] = 0x64; // 'd'
        bzimage_data[bzimage::HEADER_MAGIC_OFFSET + 2] = 0x72; // 'r'
        bzimage_data[bzimage::HEADER_MAGIC_OFFSET + 3] = 0x53; // 'S'
        let result = KernelLoader::detect_kernel_type(&bzimage_data);
        assert!(matches!(result, Ok(KernelType::BzImage)));
    }
    // A buffer with neither magic must be rejected with InvalidKernel.
    #[test]
    fn test_invalid_kernel() {
        let data = vec![0u8; 100];
        let result = KernelLoader::detect_kernel_type(&data);
        assert!(matches!(result, Err(BootError::InvalidKernel(_))));
    }
}

378
vmm/src/boot/mod.rs Normal file
View File

@@ -0,0 +1,378 @@
//! Volt Boot Loader Module
//!
//! Implements PVH direct kernel boot for sub-50ms cold boot times.
//! Skips BIOS/UEFI entirely by directly loading the kernel into guest memory
//! and setting up the boot parameters.
//!
//! # Boot Protocol
//!
//! Volt uses the PVH boot protocol (Xen-compatible) which allows direct
//! kernel entry without firmware. This is significantly faster than:
//! - Traditional BIOS boot (seconds)
//! - Linux boot protocol via SeaBIOS (hundreds of ms)
//! - UEFI boot (hundreds of ms)
//!
//! # Supported Kernel Formats
//!
//! - ELF64 (vmlinux) - Direct kernel image
//! - bzImage - Compressed Linux kernel with setup header
//!
//! # Memory Layout (typical)
//!
//! ```text
//! 0x0000_0000 - 0x0000_1000 : Reserved (real mode IVT, BDA)
//! 0x0000_7000 - 0x0000_8000 : PVH start_info structure
//! 0x0000_8000 - 0x0000_9000 : Boot command line
//! 0x0001_0000 - 0x0009_0000 : E820 map / boot params
//! 0x0010_0000 - ... : Kernel load address (1MB)
//! ... - RAM_END : Initrd (loaded at high memory)
//! ```
mod gdt;
mod initrd;
mod linux;
mod loader;
pub mod mptable;
mod pagetable;
#[allow(dead_code)] // PVH boot protocol — planned feature, not yet wired up
mod pvh;
pub use gdt::GdtSetup;
pub use initrd::{InitrdConfig, InitrdLoader};
pub use linux::LinuxBootSetup;
pub use loader::{KernelConfig, KernelLoader};
pub use mptable::setup_mptable;
pub use pagetable::PageTableSetup;
use std::io;
use thiserror::Error;
/// Boot loader errors
///
/// Each variant identifies which boot stage failed (kernel read/parse,
/// initrd load, memory layout, guest-memory writes) and carries enough
/// context to diagnose the failure.
#[derive(Error, Debug)]
pub enum BootError {
    #[error("Failed to read kernel image: {0}")]
    KernelRead(#[source] io::Error),
    #[error("Failed to read initrd: {0}")]
    InitrdRead(#[source] io::Error),
    #[error("Invalid kernel format: {0}")]
    InvalidKernel(String),
    #[error("Invalid bzImage: {0}")]
    InvalidBzImage(String),
    #[error("Invalid ELF kernel: {0}")]
    InvalidElf(String),
    /// Kernel (or one of its segments) does not fit in guest memory.
    #[error("Kernel too large: {size} bytes exceeds available memory {available}")]
    KernelTooLarge { size: u64, available: u64 },
    #[error("Initrd too large: {size} bytes exceeds available memory {available}")]
    InitrdTooLarge { size: u64, available: u64 },
    /// Command line (plus its null terminator) exceeds the reserved area.
    #[error("Command line too long: {len} bytes exceeds maximum {max}")]
    CommandLineTooLong { len: usize, max: usize },
    #[error("Memory layout error: {0}")]
    MemoryLayout(String),
    #[error("Failed to write to guest memory: {0}")]
    GuestMemoryWrite(String),
    #[error("PVH setup failed: {0}")]
    #[allow(dead_code)] // PVH boot path planned
    PvhSetup(String),
    #[error("Unsupported kernel version: {0}")]
    UnsupportedVersion(String),
}
/// Convenience alias: all boot operations fail with [`BootError`].
pub type Result<T> = std::result::Result<T, BootError>;
/// Memory addresses for boot components (x86_64)
///
/// # Memory Layout (designed to avoid page table overlaps)
///
/// For VMs with up to 4GB RAM, page tables can use addresses 0x1000-0xA000.
/// All boot structures are placed above 0x10000 to ensure no overlaps.
///
/// ```text
/// 0x0000 - 0x04FF  : Reserved (IVT, BDA)
/// 0x0500 - 0x052F  : GDT (3 entries)
/// 0x1000 - 0x1FFF  : PML4
/// 0x2000 - 0x2FFF  : PDPT_LOW (identity mapping)
/// 0x3000 - 0x3FFF  : PDPT_HIGH (kernel high-half mapping)
/// 0x4000 - 0x7FFF  : PD tables for identity mapping (up to 4 for 4GB)
/// 0x8000 - 0x9FFF  : PD tables for high-half kernel mapping
/// 0xA000 - 0x1FFFF : Reserved / available
/// 0x20000          : boot_params (Linux zero page) - 4KB
/// 0x21000          : PVH start_info - 4KB
/// 0x22000          : E820 memory map - 4KB
/// 0x30000          : Boot command line - 4KB
/// 0x31000 - 0xFFFFF: Stack and scratch space
/// 0x100000         : Kernel load address (1MB)
/// ```
#[allow(dead_code)] // Memory layout constants — reference for boot protocol
pub mod layout {
    /// Start of reserved low memory
    pub const LOW_MEMORY_START: u64 = 0x0;
    /// Page table area starts here (PML4)
    pub const PAGE_TABLE_START: u64 = 0x1000;
    /// End of page table reserved area (enough for 4GB + high-half mapping)
    pub const PAGE_TABLE_END: u64 = 0xA000;
    /// PVH start_info structure location
    /// MOVED from 0x7000 to 0x21000 to avoid page table overlap with large VMs
    pub const PVH_START_INFO_ADDR: u64 = 0x21000;
    /// Boot command line location (after boot_params at 0x20000)
    pub const CMDLINE_ADDR: u64 = 0x30000;
    /// Maximum command line length (including null terminator)
    pub const CMDLINE_MAX_SIZE: usize = 4096;
    /// E820 memory map location
    /// MOVED from 0x9000 to 0x22000 to avoid page table overlap with large VMs
    pub const E820_MAP_ADDR: u64 = 0x22000;
    /// Default kernel load address (1MB, standard for x86_64)
    pub const KERNEL_LOAD_ADDR: u64 = 0x100000;
    /// Minimum gap between kernel and initrd
    pub const KERNEL_INITRD_GAP: u64 = 0x1000;
    /// EBDA (Extended BIOS Data Area) size to reserve
    pub const EBDA_SIZE: u64 = 0x1000;
    /// End of low memory (640KB boundary)
    pub const LOW_MEMORY_END: u64 = 0xA0000;
    /// Start of high memory (1MB)
    pub const HIGH_MEMORY_START: u64 = 0x100000;
    /// Initial stack pointer for boot
    /// Placed in safe area above page tables but below boot structures
    /// (16-byte aligned, just under the boot_params page at 0x20000)
    pub const BOOT_STACK_POINTER: u64 = 0x1FFF0;
    /// PVH entry point - RIP value when starting the VM
    /// This should point to the kernel entry point
    pub const PVH_ENTRY_POINT: u64 = KERNEL_LOAD_ADDR;
}
/// Boot configuration combining kernel, initrd, and PVH setup
///
/// Consumed by [`BootLoader::setup`]. Defaults (serial console command
/// line, 128MB RAM, 1 vCPU, no initrd) come from the `Default` impl below.
#[derive(Debug, Clone)]
#[allow(dead_code)] // Fields set by config but not all read yet
pub struct BootConfig {
    /// Path to kernel image
    pub kernel_path: String,
    /// Optional path to initrd/initramfs
    pub initrd_path: Option<String>,
    /// Kernel command line
    pub cmdline: String,
    /// Total guest memory size in bytes
    pub memory_size: u64,
    /// Number of vCPUs
    pub vcpu_count: u32,
}
impl Default for BootConfig {
fn default() -> Self {
Self {
kernel_path: String::new(),
initrd_path: None,
cmdline: String::from("console=ttyS0 reboot=k panic=1 pci=off"),
memory_size: 128 * 1024 * 1024, // 128MB default
vcpu_count: 1,
}
}
}
/// Result of boot setup - contains entry point and register state
///
/// The caller feeds these values into the vCPU's initial register state
/// (RIP, RSP, RSI, CR3) before starting the guest.
#[derive(Debug, Clone)]
#[allow(dead_code)] // All fields are part of the boot result, may not all be read yet
pub struct BootSetupResult {
    /// Kernel entry point (RIP)
    pub entry_point: u64,
    /// Initial stack pointer (RSP)
    pub stack_pointer: u64,
    /// Address of boot_params structure (RSI for Linux boot protocol)
    pub start_info_addr: u64,
    /// CR3 value (page table base address)
    pub cr3: u64,
    /// Address where kernel was loaded
    pub kernel_load_addr: u64,
    /// Size of loaded kernel
    pub kernel_size: u64,
    /// Address where initrd was loaded (if any)
    pub initrd_addr: Option<u64>,
    /// Size of initrd (if any)
    pub initrd_size: Option<u64>,
}
/// Trait for guest memory access during boot
///
/// Abstracts the guest physical address space so boot setup can be unit
/// tested against an in-memory mock (see the tests in this module).
pub trait GuestMemory {
    /// Write bytes to guest memory at the given address
    fn write_bytes(&mut self, addr: u64, data: &[u8]) -> Result<()>;
    /// Write a value to guest memory
    ///
    /// Serializes `val` by viewing its in-memory representation as raw bytes.
    /// NOTE(review): if `T` contains padding bytes, reading them through this
    /// byte view is undefined behavior — confirm every `T` used here is a
    /// padding-free `#[repr(C)]`/`#[repr(packed)]` type.
    #[allow(dead_code)]
    fn write_obj<T: Copy>(&mut self, addr: u64, val: &T) -> Result<()> {
        // SAFETY: `val` is a valid, aligned reference for the duration of
        // this borrow, and the slice length is exactly `size_of::<T>()`, so
        // the raw view stays within a single live allocation.
        let bytes = unsafe {
            std::slice::from_raw_parts(val as *const T as *const u8, std::mem::size_of::<T>())
        };
        self.write_bytes(addr, bytes)
    }
    /// Get the total size of guest memory
    fn size(&self) -> u64;
}
/// Complete boot loader that orchestrates kernel, initrd, and PVH setup
pub struct BootLoader;
impl BootLoader {
    /// Load kernel and initrd, set up Linux boot protocol
    ///
    /// This is the main entry point for boot setup. It:
    /// 1. Loads the kernel image (ELF or bzImage)
    /// 2. Loads the initrd if specified
    /// 3. Sets up the Linux boot_params structure (zero page)
    /// 4. Writes the command line
    /// 5. Returns the boot parameters for vCPU initialization
    ///
    /// # Errors
    /// Propagates failures from each stage (kernel/initrd load, GDT, page
    /// tables, boot_params, guest-memory writes); rejects a command line
    /// that cannot fit in its reserved area together with a null terminator.
    pub fn setup<M: GuestMemory>(
        config: &BootConfig,
        guest_mem: &mut M,
    ) -> Result<BootSetupResult> {
        // Validate command line length.
        // `>=` because one extra byte is needed for the null terminator
        // written below.
        if config.cmdline.len() >= layout::CMDLINE_MAX_SIZE {
            return Err(BootError::CommandLineTooLong {
                len: config.cmdline.len(),
                max: layout::CMDLINE_MAX_SIZE - 1,
            });
        }
        // Load kernel
        let kernel_config = KernelConfig {
            path: config.kernel_path.clone(),
            load_addr: layout::KERNEL_LOAD_ADDR,
        };
        let kernel_result = KernelLoader::load(&kernel_config, guest_mem)?;
        // Calculate initrd placement (high memory, after kernel)
        let initrd_result = if let Some(ref initrd_path) = config.initrd_path {
            let initrd_config = InitrdConfig {
                path: initrd_path.clone(),
                memory_size: config.memory_size,
                kernel_end: kernel_result.load_addr + kernel_result.size,
            };
            Some(InitrdLoader::load(&initrd_config, guest_mem)?)
        } else {
            None
        };
        // Write command line to guest memory
        let cmdline_bytes = config.cmdline.as_bytes();
        guest_mem.write_bytes(layout::CMDLINE_ADDR, cmdline_bytes)?;
        // Null terminator
        guest_mem.write_bytes(layout::CMDLINE_ADDR + cmdline_bytes.len() as u64, &[0])?;
        // Set up GDT for 64-bit mode
        GdtSetup::setup(guest_mem)?;
        // Set up identity-mapped page tables for 64-bit mode
        let cr3 = PageTableSetup::setup(guest_mem, config.memory_size)?;
        // Set up Linux boot_params structure (zero page)
        let linux_config = linux::LinuxBootConfig {
            memory_size: config.memory_size,
            cmdline_addr: layout::CMDLINE_ADDR,
            initrd_addr: initrd_result.as_ref().map(|r| r.load_addr),
            initrd_size: initrd_result.as_ref().map(|r| r.size),
        };
        let boot_params_addr = LinuxBootSetup::setup(&linux_config, guest_mem)?;
        Ok(BootSetupResult {
            entry_point: kernel_result.entry_point,
            stack_pointer: layout::BOOT_STACK_POINTER,
            start_info_addr: boot_params_addr,
            cr3,
            kernel_load_addr: kernel_result.load_addr,
            kernel_size: kernel_result.size,
            initrd_addr: initrd_result.as_ref().map(|r| r.load_addr),
            initrd_size: initrd_result.as_ref().map(|r| r.size),
        })
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    /// In-memory stand-in for guest physical memory.
    struct MockMemory {
        size: u64,
        data: Vec<u8>,
    }
    impl MockMemory {
        fn new(size: u64) -> Self {
            Self {
                size,
                data: vec![0; size as usize],
            }
        }
    }
    impl GuestMemory for MockMemory {
        // Bounds-checked write into the backing Vec, mirroring real guest
        // memory semantics (out-of-range writes fail, they don't panic).
        fn write_bytes(&mut self, addr: u64, data: &[u8]) -> Result<()> {
            let end = addr as usize + data.len();
            if end > self.data.len() {
                return Err(BootError::GuestMemoryWrite(format!(
                    "Write at {:#x} with len {} exceeds memory size {}",
                    addr,
                    data.len(),
                    self.size
                )));
            }
            self.data[addr as usize..end].copy_from_slice(data);
            Ok(())
        }
        fn size(&self) -> u64 {
            self.size
        }
    }
    // Defaults must include the serial console and a single vCPU.
    #[test]
    fn test_boot_config_default() {
        let config = BootConfig::default();
        assert!(config.cmdline.contains("console=ttyS0"));
        assert_eq!(config.vcpu_count, 1);
    }
    // A command line one byte over the maximum must be rejected before any
    // load work happens.
    #[test]
    fn test_cmdline_too_long() {
        let mut mem = MockMemory::new(1024 * 1024);
        let config = BootConfig {
            kernel_path: "/boot/vmlinux".into(),
            cmdline: "x".repeat(layout::CMDLINE_MAX_SIZE + 1),
            ..Default::default()
        };
        let result = BootLoader::setup(&config, &mut mem);
        assert!(matches!(result, Err(BootError::CommandLineTooLong { .. })));
    }
}

611
vmm/src/boot/mptable.rs Normal file
View File

@@ -0,0 +1,611 @@
//! Intel MultiProcessor Specification (MPS) Table Construction
//!
//! Implements MP Floating Pointer and MP Configuration Table structures
//! to advertise SMP topology to the guest kernel. This allows Linux to
//! discover and boot Application Processors (APs) beyond the Bootstrap
//! Processor (BSP).
//!
//! # Table Layout (placed at 0x9FC00, just below EBDA)
//!
//! ```text
//! 0x9FC00: MP Floating Pointer Structure (16 bytes)
//! 0x9FC10: MP Configuration Table Header (44 bytes)
//! 0x9FC3C: Processor Entry 0 (BSP, APIC ID 0) — 20 bytes
//! 0x9FC50: Processor Entry 1 (AP, APIC ID 1) — 20 bytes
//! ...
//! Bus Entry (ISA, 8 bytes)
//! I/O APIC Entry (8 bytes)
//! I/O Interrupt Entries (IRQ 0-15, 8 bytes each)
//! ```
//!
//! # References
//! - Intel MultiProcessor Specification v1.4 (May 1997)
//! - Firecracker's mpspec implementation (src/vmm/src/arch/x86_64/mptable.rs)
//! - Linux kernel: arch/x86/kernel/mpparse.c
use super::{BootError, GuestMemory, Result};
/// Base address for MP tables — just below EBDA at 640KB boundary.
/// This address (0x9FC00) is a conventional location that Linux scans.
pub const MP_TABLE_START: u64 = 0x9FC00;
/// Maximum number of vCPUs expressible in the MP table.
/// Each processor entry is 20 bytes. Between 0x9FC00 and 0xA0000 we have
/// 1024 bytes. After headers (60 bytes), bus (8), IOAPIC (8), and 16 IRQ
/// entries (128 bytes), ~830 bytes remain = 41 processor entries fit in
/// the reserved window. The type cap here is 255 (max APIC IDs);
/// `setup_mptable` verifies the actual fit at runtime and errors otherwise.
pub const MAX_CPUS: u8 = 255;
// ============================================================================
// MP Floating Pointer Structure (16 bytes)
// Intel MPS Table 4-1
// ============================================================================
/// MP Floating Pointer signature: "_MP_"
const MP_FP_SIGNATURE: [u8; 4] = [b'_', b'M', b'P', b'_'];
/// MP Configuration Table signature: "PCMP"
const MP_CT_SIGNATURE: [u8; 4] = [b'P', b'C', b'M', b'P'];
/// MP spec revision 1.4
const MP_SPEC_REVISION: u8 = 4;
/// MP Floating Pointer feature byte 2, bit 7: IMCR register present
/// (PIC mode available). Note: this goes in feature byte 2 — feature
/// byte 1 holds the default-configuration number (0 = config table present),
/// as set where the floating pointer is built in `setup_mptable`.
const MP_FEATURE_IMCRP: u8 = 0x80;
// ============================================================================
// MP Table Entry Types
// ============================================================================
const MP_ENTRY_PROCESSOR: u8 = 0;
const MP_ENTRY_BUS: u8 = 1;
const MP_ENTRY_IOAPIC: u8 = 2;
const MP_ENTRY_IO_INTERRUPT: u8 = 3;
#[allow(dead_code)]
const MP_ENTRY_LOCAL_INTERRUPT: u8 = 4;
// Processor entry flags
const CPU_FLAG_ENABLED: u8 = 0x01;
const CPU_FLAG_BSP: u8 = 0x02;
// Interrupt types
const INT_TYPE_INT: u8 = 0; // Vectored interrupt
#[allow(dead_code)]
const INT_TYPE_NMI: u8 = 1;
#[allow(dead_code)]
const INT_TYPE_SMI: u8 = 2;
const INT_TYPE_EXTINT: u8 = 3; // ExtINT (from 8259)
// Interrupt polarity/trigger flags
const INT_FLAG_DEFAULT: u16 = 0x0000; // Conforms to bus spec
// I/O APIC default address
const IOAPIC_DEFAULT_ADDR: u32 = 0xFEC0_0000;
/// ISA bus type string
const BUS_TYPE_ISA: [u8; 6] = [b'I', b'S', b'A', b' ', b' ', b' '];
// ============================================================================
// MP Table Builder
// ============================================================================
/// Write MP tables to guest memory for SMP discovery.
///
/// Builds the MP Floating Pointer Structure and the MP Configuration Table
/// (processor, bus, I/O APIC, and I/O interrupt entries) and writes them at
/// [`MP_TABLE_START`], just below the EBDA.
///
/// # Arguments
/// * `guest_mem` — Guest memory to write the tables into
/// * `num_cpus` — Number of vCPUs (1-255)
///
/// # Returns
/// The guest physical address where the MP Floating Pointer was written.
///
/// # Errors
/// Returns `BootError::MemoryLayout` when `num_cpus` is 0, or when the
/// tables would not fit below 0xA0000 or inside guest memory.
pub fn setup_mptable<M: GuestMemory>(guest_mem: &mut M, num_cpus: u8) -> Result<u64> {
    if num_cpus == 0 {
        return Err(BootError::MemoryLayout(
            "MP table requires at least 1 CPU".to_string(),
        ));
    }
    // NOTE: `num_cpus` is a u8, so it can never exceed MAX_CPUS (255); the
    // space check below is what actually bounds how many entries fit.
    // Calculate sizes and offsets
    let fp_size: u64 = 16; // MP Floating Pointer
    let header_size: u64 = 44; // MP Config Table Header
    let processor_entry_size: u64 = 20;
    let bus_entry_size: u64 = 8;
    let ioapic_entry_size: u64 = 8;
    let io_int_entry_size: u64 = 8;
    // Number of IO interrupt entries: IRQ 0-15 = 16 entries
    let num_irqs: u64 = 16;
    let config_table_addr = MP_TABLE_START + fp_size;
    let _entries_start = config_table_addr + header_size;
    // Calculate total config table size (header + all entries)
    let total_entries_size = (num_cpus as u64) * processor_entry_size
        + bus_entry_size
        + ioapic_entry_size
        + num_irqs * io_int_entry_size;
    let config_table_size = header_size + total_entries_size;
    // Verify we fit in the available space (between 0x9FC00 and 0xA0000)
    let total_size = fp_size + config_table_size;
    if MP_TABLE_START + total_size > 0xA0000 {
        return Err(BootError::MemoryLayout(format!(
            "MP tables ({} bytes) exceed available space (0x9FC00-0xA0000)",
            total_size
        )));
    }
    // Verify we have enough guest memory
    if MP_TABLE_START + total_size > guest_mem.size() {
        return Err(BootError::MemoryLayout(format!(
            "MP tables at 0x{:x} exceed guest memory size 0x{:x}",
            MP_TABLE_START + total_size,
            guest_mem.size()
        )));
    }
    // Build the MP Configuration Table body (entries)
    let mut table_buf = Vec::with_capacity(config_table_size as usize);
    // Leave space for the header (we'll fill it after computing checksum)
    table_buf.resize(header_size as usize, 0);
    // ---- Processor Entries ----
    let mut entry_count: u16 = 0;
    for cpu_id in 0..num_cpus {
        // CPU 0 is the Bootstrap Processor; all others are enabled APs.
        let flags = if cpu_id == 0 {
            CPU_FLAG_ENABLED | CPU_FLAG_BSP
        } else {
            CPU_FLAG_ENABLED
        };
        // CPU signature: Family 6, Model 15 (Core 2 / Merom-class)
        // This is a safe generic modern x86_64 signature
        let cpu_signature: u32 = (6 << 8) | (15 << 4) | 1; // Family=6, Model=F, Stepping=1
        let feature_flags: u32 = 0x0781_FBFF; // Common feature flags (FPU, SSE, SSE2, etc.)
        write_processor_entry(
            &mut table_buf,
            cpu_id, // Local APIC ID
            0x14,   // Local APIC version (integrated APIC)
            flags,
            cpu_signature,
            feature_flags,
        );
        entry_count += 1;
    }
    // ---- Bus Entry (ISA) ----
    write_bus_entry(&mut table_buf, 0, &BUS_TYPE_ISA);
    entry_count += 1;
    // ---- I/O APIC Entry ----
    // I/O APIC ID = num_cpus (first ID after all processors)
    let ioapic_id = num_cpus;
    write_ioapic_entry(&mut table_buf, ioapic_id, 0x11, IOAPIC_DEFAULT_ADDR);
    entry_count += 1;
    // ---- I/O Interrupt Assignment Entries ----
    // Map ISA IRQs 0-15 to IOAPIC pins 0-15
    // IRQ 0: ExtINT (8259 cascade through IOAPIC pin 0)
    write_io_interrupt_entry(
        &mut table_buf,
        INT_TYPE_EXTINT,
        INT_FLAG_DEFAULT,
        0, // source bus = ISA
        0, // source bus IRQ = 0
        ioapic_id,
        0, // IOAPIC pin 0
    );
    entry_count += 1;
    // IRQs 1-15: Standard vectored interrupts
    for irq in 1..16u8 {
        // IRQ 2 is the PIC cascade — skip it (Linux doesn't use it in APIC mode)
        // But we still report it for completeness
        write_io_interrupt_entry(
            &mut table_buf,
            INT_TYPE_INT,
            INT_FLAG_DEFAULT,
            0,   // source bus = ISA
            irq, // source bus IRQ
            ioapic_id,
            irq, // IOAPIC pin = same as IRQ number
        );
        entry_count += 1;
    }
    // ---- Fill in the Configuration Table Header ----
    // Build header at the start of table_buf
    {
        // Compute length before taking mutable borrow of the header slice
        let table_len = table_buf.len() as u16;
        let header = &mut table_buf[0..header_size as usize];
        // Signature: "PCMP"
        header[0..4].copy_from_slice(&MP_CT_SIGNATURE);
        // Base table length (u16 LE) — entire config table including header
        header[4..6].copy_from_slice(&table_len.to_le_bytes());
        // Spec revision
        header[6] = MP_SPEC_REVISION;
        // Checksum — will be filled below
        header[7] = 0;
        // OEM ID (8 bytes, space-padded)
        header[8..16].copy_from_slice(b"NOVAFLAR");
        // Product ID — MUST be exactly 12 bytes, space-padded:
        // `copy_from_slice` panics on a length mismatch, so the literal is
        // padded to the full field width.
        header[16..28].copy_from_slice(b"VOLT VM     ");
        // OEM table pointer (0 = none)
        header[28..32].copy_from_slice(&0u32.to_le_bytes());
        // OEM table size
        header[32..34].copy_from_slice(&0u16.to_le_bytes());
        // Entry count
        header[34..36].copy_from_slice(&entry_count.to_le_bytes());
        // Local APIC address
        header[36..40].copy_from_slice(&0xFEE0_0000u32.to_le_bytes());
        // Extended table length
        header[40..42].copy_from_slice(&0u16.to_le_bytes());
        // Extended table checksum
        header[42] = 0;
        // Reserved
        header[43] = 0;
        // Compute and set checksum (computed with the checksum slot zeroed,
        // so the final byte sum of the table is 0 mod 256)
        let checksum = compute_checksum(&table_buf);
        table_buf[7] = checksum;
    }
    // ---- Build the MP Floating Pointer Structure ----
    let mut fp_buf = [0u8; 16];
    // Signature: "_MP_"
    fp_buf[0..4].copy_from_slice(&MP_FP_SIGNATURE);
    // Physical address pointer to MP Config Table (u32 LE)
    fp_buf[4..8].copy_from_slice(&(config_table_addr as u32).to_le_bytes());
    // Length in 16-byte paragraphs (1 = 16 bytes)
    fp_buf[8] = 1;
    // Spec revision
    fp_buf[9] = MP_SPEC_REVISION;
    // Checksum — filled below
    fp_buf[10] = 0;
    // Feature byte 1: 0 = MP Config Table present (not default config)
    fp_buf[11] = 0;
    // Feature byte 2: bit 7 = IMCR present (PIC mode available)
    fp_buf[12] = MP_FEATURE_IMCRP;
    // Feature bytes 3-5: reserved
    fp_buf[13] = 0;
    fp_buf[14] = 0;
    fp_buf[15] = 0;
    // Compute floating pointer checksum
    let fp_checksum = compute_checksum(&fp_buf);
    fp_buf[10] = fp_checksum;
    // ---- Write everything to guest memory ----
    guest_mem.write_bytes(MP_TABLE_START, &fp_buf)?;
    guest_mem.write_bytes(config_table_addr, &table_buf)?;
    tracing::info!(
        "MP table written at 0x{:x}: {} CPUs, {} entries, {} bytes total\n\
         Layout: FP=0x{:x}, Config=0x{:x}, IOAPIC ID={}, IOAPIC addr=0x{:x}",
        MP_TABLE_START,
        num_cpus,
        entry_count,
        total_size,
        MP_TABLE_START,
        config_table_addr,
        ioapic_id,
        IOAPIC_DEFAULT_ADDR,
    );
    Ok(MP_TABLE_START)
}
/// Write a Processor Entry (20 bytes) to the table buffer.
///
/// Format (Intel MPS Table 4-4):
/// ```text
/// Offset  Size  Field
/// 0       1     Entry type (0 = processor)
/// 1       1     Local APIC ID
/// 2       1     Local APIC version
/// 3       1     CPU flags (bit 0=EN, bit 1=BP)
/// 4       4     CPU signature (stepping, model, family)
/// 8       4     Feature flags (from CPUID leaf 1 EDX)
/// 12      8     Reserved
/// ```
fn write_processor_entry(
    buf: &mut Vec<u8>,
    apic_id: u8,
    apic_version: u8,
    flags: u8,
    cpu_signature: u32,
    feature_flags: u32,
) {
    // Assemble the fixed-size entry first, then append it in one shot.
    let mut entry = [0u8; 20];
    entry[0] = MP_ENTRY_PROCESSOR;
    entry[1] = apic_id;
    entry[2] = apic_version;
    entry[3] = flags;
    entry[4..8].copy_from_slice(&cpu_signature.to_le_bytes());
    entry[8..12].copy_from_slice(&feature_flags.to_le_bytes());
    // Bytes 12..20 remain zero (reserved).
    buf.extend_from_slice(&entry);
}
/// Write a Bus Entry (8 bytes) to the table buffer.
///
/// Format (Intel MPS Table 4-5):
/// ```text
/// Offset  Size  Field
/// 0       1     Entry type (1 = bus)
/// 1       1     Bus ID
/// 2       6     Bus type string (space-padded)
/// ```
fn write_bus_entry(buf: &mut Vec<u8>, bus_id: u8, bus_type: &[u8; 6]) {
    // Assemble the fixed-size entry first, then append it in one shot.
    let mut entry = [0u8; 8];
    entry[0] = MP_ENTRY_BUS;
    entry[1] = bus_id;
    entry[2..8].copy_from_slice(bus_type);
    buf.extend_from_slice(&entry);
}
/// Write an I/O APIC Entry (8 bytes) to the table buffer.
///
/// Format (Intel MPS Table 4-6):
/// ```text
/// Offset  Size  Field
/// 0       1     Entry type (2 = I/O APIC)
/// 1       1     I/O APIC ID
/// 2       1     I/O APIC version
/// 3       1     I/O APIC flags (bit 0 = EN)
/// 4       4     I/O APIC address
/// ```
fn write_ioapic_entry(buf: &mut Vec<u8>, id: u8, version: u8, addr: u32) {
    let addr_bytes = addr.to_le_bytes();
    buf.extend_from_slice(&[
        MP_ENTRY_IOAPIC,
        id,
        version,
        0x01, // flags: enabled
        addr_bytes[0],
        addr_bytes[1],
        addr_bytes[2],
        addr_bytes[3],
    ]);
}
/// Write an I/O Interrupt Assignment Entry (8 bytes) to the table buffer.
///
/// Format (Intel MPS Table 4-7):
/// ```text
/// Offset  Size  Field
/// 0       1     Entry type (3 = I/O interrupt)
/// 1       1     Interrupt type (0=INT, 1=NMI, 2=SMI, 3=ExtINT)
/// 2       2     Flags (polarity/trigger)
/// 4       1     Source bus ID
/// 5       1     Source bus IRQ
/// 6       1     Destination I/O APIC ID
/// 7       1     Destination I/O APIC pin (INTIN#)
/// ```
fn write_io_interrupt_entry(
    buf: &mut Vec<u8>,
    int_type: u8,
    flags: u16,
    src_bus_id: u8,
    src_bus_irq: u8,
    dst_ioapic_id: u8,
    dst_ioapic_pin: u8,
) {
    let flag_bytes = flags.to_le_bytes();
    buf.extend_from_slice(&[
        MP_ENTRY_IO_INTERRUPT,
        int_type,
        flag_bytes[0],
        flag_bytes[1],
        src_bus_id,
        src_bus_irq,
        dst_ioapic_id,
        dst_ioapic_pin,
    ]);
}
/// Compute the two's-complement checksum byte for an MP structure.
///
/// The MP spec requires that all bytes of a structure — including the
/// checksum byte itself — sum to 0 modulo 256, so the checksum is the
/// wrapping negation of the byte sum.
fn compute_checksum(data: &[u8]) -> u8 {
    let mut sum = 0u8;
    for &byte in data {
        sum = sum.wrapping_add(byte);
    }
    sum.wrapping_neg()
}
// ============================================================================
// Tests
// ============================================================================
#[cfg(test)]
mod tests {
    use super::*;

    /// Simple `Vec`-backed stand-in for guest RAM used by the MP-table tests.
    struct MockMemory {
        size: u64,
        data: Vec<u8>,
    }
    impl MockMemory {
        fn new(size: u64) -> Self {
            Self {
                size,
                data: vec![0; size as usize],
            }
        }
        /// Read `len` bytes starting at guest address `addr`.
        fn read_bytes(&self, addr: u64, len: usize) -> &[u8] {
            &self.data[addr as usize..(addr as usize + len)]
        }
    }
    impl GuestMemory for MockMemory {
        fn write_bytes(&mut self, addr: u64, data: &[u8]) -> Result<()> {
            let end = addr as usize + data.len();
            if end > self.data.len() {
                return Err(BootError::GuestMemoryWrite(format!(
                    "Write at {:#x} exceeds memory",
                    addr
                )));
            }
            self.data[addr as usize..end].copy_from_slice(data);
            Ok(())
        }
        fn size(&self) -> u64 {
            self.size
        }
    }

    #[test]
    fn test_checksum() {
        // The checksum byte is defined so that adding it to the byte sum of
        // the rest of the structure yields 0 (mod 256).
        let data = vec![1, 2, 3, 4];
        let cs = compute_checksum(&data);
        let sum_without: u8 = data.iter().fold(0u8, |a, b| a.wrapping_add(*b));
        assert_eq!(sum_without.wrapping_add(cs), 0);
    }

    #[test]
    fn test_mp_floating_pointer_signature() {
        let mut mem = MockMemory::new(1024 * 1024);
        let result = setup_mptable(&mut mem, 1);
        assert!(result.is_ok());
        // The MP Floating Pointer structure must start with the "_MP_" tag.
        let fp_addr = result.unwrap() as usize;
        assert_eq!(&mem.data[fp_addr..fp_addr + 4], b"_MP_");
    }

    #[test]
    fn test_mp_floating_pointer_checksum() {
        let mut mem = MockMemory::new(1024 * 1024);
        setup_mptable(&mut mem, 2).unwrap();
        // MP Floating Pointer is 16 bytes at MP_TABLE_START; bytes must sum to 0.
        let fp = mem.read_bytes(MP_TABLE_START, 16);
        let sum: u8 = fp.iter().fold(0u8, |a, &b| a.wrapping_add(b));
        assert_eq!(sum, 0, "MP Floating Pointer checksum mismatch");
    }

    #[test]
    fn test_mp_config_table_checksum() {
        let mut mem = MockMemory::new(1024 * 1024);
        setup_mptable(&mut mem, 2).unwrap();
        // Config table starts at MP_TABLE_START + 16
        let config_addr = (MP_TABLE_START + 16) as usize;
        // Read table length from header bytes 4-5
        let table_len = u16::from_le_bytes([
            mem.data[config_addr + 4],
            mem.data[config_addr + 5],
        ]) as usize;
        let table = &mem.data[config_addr..config_addr + table_len];
        let sum: u8 = table.iter().fold(0u8, |a, &b| a.wrapping_add(b));
        assert_eq!(sum, 0, "MP Config Table checksum mismatch");
    }

    #[test]
    fn test_mp_config_table_signature() {
        let mut mem = MockMemory::new(1024 * 1024);
        setup_mptable(&mut mem, 1).unwrap();
        // The configuration table header starts with the "PCMP" tag.
        let config_addr = (MP_TABLE_START + 16) as usize;
        assert_eq!(&mem.data[config_addr..config_addr + 4], b"PCMP");
    }

    #[test]
    fn test_mp_table_1_cpu() {
        let mut mem = MockMemory::new(1024 * 1024);
        setup_mptable(&mut mem, 1).unwrap();
        let config_addr = (MP_TABLE_START + 16) as usize;
        // Entry count at offset 34 in header
        let entry_count = u16::from_le_bytes([
            mem.data[config_addr + 34],
            mem.data[config_addr + 35],
        ]);
        // 1 CPU + 1 bus + 1 IOAPIC + 16 IRQs = 19 entries
        assert_eq!(entry_count, 19);
    }

    #[test]
    fn test_mp_table_4_cpus() {
        let mut mem = MockMemory::new(1024 * 1024);
        setup_mptable(&mut mem, 4).unwrap();
        let config_addr = (MP_TABLE_START + 16) as usize;
        let entry_count = u16::from_le_bytes([
            mem.data[config_addr + 34],
            mem.data[config_addr + 35],
        ]);
        // 4 CPUs + 1 bus + 1 IOAPIC + 16 IRQs = 22 entries
        assert_eq!(entry_count, 22);
    }

    #[test]
    fn test_mp_table_bsp_flag() {
        let mut mem = MockMemory::new(1024 * 1024);
        setup_mptable(&mut mem, 4).unwrap();
        // First processor entry starts at config_addr + 44 (header size)
        let proc0_offset = (MP_TABLE_START + 16 + 44) as usize;
        assert_eq!(mem.data[proc0_offset], 0); // Entry type = processor
        assert_eq!(mem.data[proc0_offset + 1], 0); // APIC ID = 0
        assert_eq!(mem.data[proc0_offset + 3], CPU_FLAG_ENABLED | CPU_FLAG_BSP); // BSP + EN
        // Second processor (processor entries are 20 bytes each)
        let proc1_offset = proc0_offset + 20;
        assert_eq!(mem.data[proc1_offset + 1], 1); // APIC ID = 1
        assert_eq!(mem.data[proc1_offset + 3], CPU_FLAG_ENABLED); // EN only (no BSP)
    }

    #[test]
    fn test_mp_table_ioapic() {
        let mut mem = MockMemory::new(1024 * 1024);
        let num_cpus: u8 = 2;
        setup_mptable(&mut mem, num_cpus).unwrap();
        // IOAPIC entry follows: processors (2*20) + bus (8) = 48 bytes after entries start
        let entries_start = (MP_TABLE_START + 16 + 44) as usize;
        let ioapic_offset = entries_start + (num_cpus as usize * 20) + 8;
        assert_eq!(mem.data[ioapic_offset], MP_ENTRY_IOAPIC); // Entry type
        assert_eq!(mem.data[ioapic_offset + 1], num_cpus); // IOAPIC ID = num_cpus
        assert_eq!(mem.data[ioapic_offset + 3], 0x01); // Enabled
        // IOAPIC address
        let addr = u32::from_le_bytes([
            mem.data[ioapic_offset + 4],
            mem.data[ioapic_offset + 5],
            mem.data[ioapic_offset + 6],
            mem.data[ioapic_offset + 7],
        ]);
        assert_eq!(addr, IOAPIC_DEFAULT_ADDR);
    }

    #[test]
    fn test_mp_table_zero_cpus_error() {
        // Zero vCPUs is invalid and must be rejected.
        let mut mem = MockMemory::new(1024 * 1024);
        let result = setup_mptable(&mut mem, 0);
        assert!(result.is_err());
    }

    #[test]
    fn test_mp_table_local_apic_addr() {
        let mut mem = MockMemory::new(1024 * 1024);
        setup_mptable(&mut mem, 2).unwrap();
        let config_addr = (MP_TABLE_START + 16) as usize;
        // Local APIC address at offset 36 in header
        let lapic_addr = u32::from_le_bytes([
            mem.data[config_addr + 36],
            mem.data[config_addr + 37],
            mem.data[config_addr + 38],
            mem.data[config_addr + 39],
        ]);
        assert_eq!(lapic_addr, 0xFEE0_0000);
    }
}

291
vmm/src/boot/pagetable.rs Normal file
View File

@@ -0,0 +1,291 @@
//! Page Table Setup for 64-bit Boot
//!
//! Sets up identity-mapped page tables for Linux 64-bit kernel boot.
//! The kernel expects to be running with paging enabled and needs:
//! - Identity mapping for low memory (0-4GB physical = 0-4GB virtual)
//! - High kernel mapping (0xffffffff80000000+ = physical addresses)
//!
//! # Page Table Layout
//!
//! We use 2MB huge pages for simplicity and performance:
//! - PML4 (Page Map Level 4) at 0x1000
//! - PDPT for low memory (identity) at 0x2000
//! - PDPT for high memory (kernel) at 0x3000
//! - PD tables at 0x4000+
//!
//! Each PD entry maps 2MB of physical memory using huge pages.
use super::{GuestMemory, Result};
#[cfg(test)]
use super::BootError;
/// PML4 (Page Map Level 4) table address — this value is also used as CR3.
pub const PML4_ADDR: u64 = 0x1000;
/// PDPT (Page Directory Pointer Table) for identity mapping (low memory)
pub const PDPT_LOW_ADDR: u64 = 0x2000;
/// PDPT for kernel high memory mapping (0xffffffff80000000+)
pub const PDPT_HIGH_ADDR: u64 = 0x3000;
/// First PD (Page Directory) address; additional PD tables are laid out
/// contiguously after this one in 4KB steps.
pub const PD_ADDR: u64 = 0x4000;
/// Size of one page table (4KB = 512 entries × 8 bytes)
pub const PAGE_TABLE_SIZE: u64 = 0x1000;
/// Page table entry flags (bit positions per the x86-64 paging format)
#[allow(dead_code)] // x86 page table flags — kept for completeness
mod flags {
    /// Present bit
    pub const PRESENT: u64 = 1 << 0;
    /// Read/Write bit
    pub const WRITABLE: u64 = 1 << 1;
    /// User/Supervisor bit (0 = supervisor only)
    pub const USER: u64 = 1 << 2;
    /// Page Size bit (1 = 2MB/1GB huge page; only valid in PD/PDPT entries)
    pub const PAGE_SIZE: u64 = 1 << 7;
}
/// Page table setup implementation
pub struct PageTableSetup;
impl PageTableSetup {
    /// Set up page tables for 64-bit Linux kernel boot
    ///
    /// Creates:
    /// 1. Identity mapping for first 4GB (virtual 0-4GB -> physical 0-4GB)
    /// 2. High kernel mapping (virtual 0xffffffff80000000+ -> physical 0+)
    ///
    /// This allows the kernel to execute at its linked address while also
    /// having access to physical memory via identity mapping.
    ///
    /// All mappings use 2MB huge pages (PD entries with the PS bit set),
    /// so no PT (level-1) tables are ever written.
    ///
    /// Returns the CR3 value (PML4 physical address).
    ///
    /// # Errors
    ///
    /// Propagates any `GuestMemory::write_bytes` failure (e.g. if the page
    /// table area does not fit in guest memory).
    pub fn setup<M: GuestMemory>(guest_mem: &mut M, memory_size: u64) -> Result<u64> {
        // Zero out the page table area first (16 pages should be plenty)
        // Covers 0x1000..0x11000: PML4, both PDPTs, and up to 13 PD tables.
        let zeros = vec![0u8; PAGE_TABLE_SIZE as usize * 16];
        guest_mem.write_bytes(PML4_ADDR, &zeros)?;
        // Calculate how much memory to map (up to 4GB, or actual memory size)
        let map_size = memory_size.min(4 * 1024 * 1024 * 1024);
        // Number of 2MB pages needed (round up to whole 2MB units)
        let num_2mb_pages = (map_size + 0x1FFFFF) / 0x200000;
        // Number of PD tables needed (each PD has 512 entries, each entry maps 2MB)
        let num_pd_tables = ((num_2mb_pages + 511) / 512).max(1) as usize;
        // ============================================================
        // Set up PML4 entries
        // ============================================================
        // Entry 0: Points to low PDPT for identity mapping (0x0 - 512GB)
        let pml4_entry_0 = PDPT_LOW_ADDR | flags::PRESENT | flags::WRITABLE;
        guest_mem.write_bytes(PML4_ADDR, &pml4_entry_0.to_le_bytes())?;
        // Entry 511: Points to high PDPT for kernel mapping (0xFFFFFF8000000000+)
        // PML4[511] maps addresses 0xFFFFFF8000000000 - 0xFFFFFFFFFFFFFFFF
        let pml4_entry_511 = PDPT_HIGH_ADDR | flags::PRESENT | flags::WRITABLE;
        guest_mem.write_bytes(PML4_ADDR + 511 * 8, &pml4_entry_511.to_le_bytes())?;
        // ============================================================
        // Set up PDPT for low memory (identity mapping)
        // At most 4 PD tables are linked here, so the identity map is
        // capped at 4GB regardless of guest memory size.
        // ============================================================
        for i in 0..num_pd_tables.min(4) {
            let pd_addr = PD_ADDR + (i as u64 * PAGE_TABLE_SIZE);
            let pdpt_entry = pd_addr | flags::PRESENT | flags::WRITABLE;
            let pdpt_offset = PDPT_LOW_ADDR + (i as u64 * 8); // entries are 8 bytes
            guest_mem.write_bytes(pdpt_offset, &pdpt_entry.to_le_bytes())?;
        }
        // ============================================================
        // Set up PDPT for high memory (kernel mapping)
        // Kernel virtual: 0xffffffff80000000 -> physical 0x0
        // This is PDPT entry 510 (for 0xffffffff80000000-0xffffffffbfffffff)
        // And PDPT entry 511 (for 0xffffffffc0000000-0xffffffffffffffff)
        // ============================================================
        // We need PD tables for the high mapping too
        // Use PD tables starting after the low-memory ones
        let high_pd_base = PD_ADDR + (num_pd_tables.min(4) as u64 * PAGE_TABLE_SIZE);
        // PDPT[510] maps 0xffffffff80000000-0xffffffffbfffffff to physical 0x0
        // (This covers the typical kernel text segment)
        let pdpt_entry_510 = high_pd_base | flags::PRESENT | flags::WRITABLE;
        guest_mem.write_bytes(PDPT_HIGH_ADDR + 510 * 8, &pdpt_entry_510.to_le_bytes())?;
        // PDPT[511] maps 0xffffffffc0000000-0xffffffffffffffff
        let pdpt_entry_511 = (high_pd_base + PAGE_TABLE_SIZE) | flags::PRESENT | flags::WRITABLE;
        guest_mem.write_bytes(PDPT_HIGH_ADDR + 511 * 8, &pdpt_entry_511.to_le_bytes())?;
        // ============================================================
        // Set up PD entries for identity mapping (2MB huge pages)
        // ============================================================
        for i in 0..num_2mb_pages {
            let pd_table_index = (i / 512) as usize;
            let pd_entry_index = i % 512;
            if pd_table_index >= 4 {
                break; // Only support first 4GB for now
            }
            let pd_table_addr = PD_ADDR + (pd_table_index as u64 * PAGE_TABLE_SIZE);
            let pd_entry_offset = pd_table_addr + (pd_entry_index * 8);
            // Physical address this entry maps (2MB aligned)
            let phys_addr = i * 0x200000;
            // PD entry with PAGE_SIZE flag for 2MB huge page
            let pd_entry = phys_addr | flags::PRESENT | flags::WRITABLE | flags::PAGE_SIZE;
            guest_mem.write_bytes(pd_entry_offset, &pd_entry.to_le_bytes())?;
        }
        // ============================================================
        // Set up PD entries for high kernel mapping
        // 0xffffffff80000000 + offset -> physical offset
        // ============================================================
        // Map first 1GB of physical memory to the high kernel address space
        for i in 0..512 {
            let phys_addr = i * 0x200000;
            if phys_addr >= map_size {
                break; // don't map beyond actual guest memory
            }
            // PD for PDPT[510] (0xffffffff80000000-0xffffffffbfffffff)
            let pd_entry = phys_addr | flags::PRESENT | flags::WRITABLE | flags::PAGE_SIZE;
            let pd_offset = high_pd_base + (i * 8);
            guest_mem.write_bytes(pd_offset, &pd_entry.to_le_bytes())?;
        }
        // Map second 1GB for PDPT[511]
        for i in 0..512 {
            let phys_addr = (512 + i) * 0x200000;
            if phys_addr >= map_size {
                break;
            }
            let pd_entry = phys_addr | flags::PRESENT | flags::WRITABLE | flags::PAGE_SIZE;
            let pd_offset = high_pd_base + PAGE_TABLE_SIZE + (i * 8);
            guest_mem.write_bytes(pd_offset, &pd_entry.to_le_bytes())?;
        }
        // Debug: dump page table structure for verification
        tracing::info!(
            "Page tables configured at CR3=0x{:x}:\n\
             PML4[0] = 0x{:016x} -> PDPT_LOW at 0x{:x}\n\
             PML4[511] = 0x{:016x} -> PDPT_HIGH at 0x{:x}\n\
             PDPT_LOW[0] = 0x{:016x} -> PD at 0x{:x}\n\
             {} PD entries (2MB huge pages) covering {} MB",
            PML4_ADDR,
            pml4_entry_0, PDPT_LOW_ADDR,
            pml4_entry_511, PDPT_HIGH_ADDR,
            PDPT_LOW_ADDR | flags::PRESENT | flags::WRITABLE, PD_ADDR,
            num_2mb_pages,
            map_size / (1024 * 1024)
        );
        // Log the PD entry that maps the kernel (typically at 16MB = 0x1000000)
        // 0x1000000 / 2MB = 8, so PD[8] maps the kernel
        // NOTE(review): this log recomputes the expected PD[8] value rather
        // than reading back what was written above — it assumes the kernel
        // load address is 16MB; confirm against the loader.
        let kernel_pd_entry = 8u64 * 0x200000 | flags::PRESENT | flags::WRITABLE | flags::PAGE_SIZE;
        tracing::info!(
            "Identity mapping for kernel at 0x1000000:\n\
             PD[8] = 0x{:016x} -> maps physical 0x1000000-0x11FFFFF",
            kernel_pd_entry
        );
        Ok(PML4_ADDR)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Simple `Vec`-backed guest memory for exercising the page-table writer.
    struct MockMemory {
        size: u64,
        data: Vec<u8>,
    }

    impl MockMemory {
        fn new(size: u64) -> Self {
            let data = vec![0u8; size as usize];
            Self { size, data }
        }

        /// Read a little-endian u64 from guest memory at `addr`.
        fn read_u64(&self, addr: u64) -> u64 {
            let start = addr as usize;
            let mut raw = [0u8; 8];
            raw.copy_from_slice(&self.data[start..start + 8]);
            u64::from_le_bytes(raw)
        }
    }

    impl GuestMemory for MockMemory {
        fn write_bytes(&mut self, addr: u64, data: &[u8]) -> Result<()> {
            let start = addr as usize;
            let end = start + data.len();
            if end > self.data.len() {
                return Err(BootError::GuestMemoryWrite(format!(
                    "Write at {:#x} exceeds memory",
                    addr
                )));
            }
            self.data[start..end].copy_from_slice(data);
            Ok(())
        }

        fn size(&self) -> u64 {
            self.size
        }
    }

    #[test]
    fn test_page_table_setup() {
        let mut mem = MockMemory::new(128 * 1024 * 1024);
        let result = PageTableSetup::setup(&mut mem, 128 * 1024 * 1024);
        assert!(result.is_ok());
        assert_eq!(result.unwrap(), PML4_ADDR);

        // PML4[0] must point at the low PDPT (identity mapping), present + writable.
        let pml4_entry_0 = mem.read_u64(PML4_ADDR);
        assert_eq!(pml4_entry_0 & !0xFFF, PDPT_LOW_ADDR);
        assert_ne!(pml4_entry_0 & flags::PRESENT, 0);
        assert_ne!(pml4_entry_0 & flags::WRITABLE, 0);

        // PML4[511] must point at the high PDPT (kernel mapping).
        let pml4_entry_511 = mem.read_u64(PML4_ADDR + 511 * 8);
        assert_eq!(pml4_entry_511 & !0xFFF, PDPT_HIGH_ADDR);
        assert_ne!(pml4_entry_511 & flags::PRESENT, 0);

        // PDPT_LOW[0] must point at the first PD.
        let pdpt_entry = mem.read_u64(PDPT_LOW_ADDR);
        assert_eq!(pdpt_entry & !0xFFF, PD_ADDR);
        assert_ne!(pdpt_entry & flags::PRESENT, 0);

        // PD[0] must map physical address 0 as a 2MB huge page.
        let pd_entry = mem.read_u64(PD_ADDR);
        assert_eq!(pd_entry & !0x1FFFFF, 0);
        assert_ne!(pd_entry & flags::PRESENT, 0);
        assert_ne!(pd_entry & flags::PAGE_SIZE, 0); // 2MB page
    }

    #[test]
    fn test_identity_mapping() {
        let mut mem = MockMemory::new(256 * 1024 * 1024);
        PageTableSetup::setup(&mut mem, 256 * 1024 * 1024).unwrap();

        // Every 2MB region in the first 256MB must map to itself.
        for entry in 0..128u64 {
            let expected_phys = entry * 0x200000;
            let table_addr = PD_ADDR + (entry / 512) * PAGE_TABLE_SIZE;
            let pd_entry = mem.read_u64(table_addr + (entry % 512) * 8);
            let mapped = pd_entry & !0x1FFFFF;
            assert_eq!(mapped, expected_phys, "Mismatch at entry {}", entry);
        }
    }
}

608
vmm/src/boot/pvh.rs Normal file
View File

@@ -0,0 +1,608 @@
//! PVH Boot Protocol Implementation
//!
//! PVH (Para-Virtualized Hardware) is a boot protocol that allows direct kernel
//! entry without BIOS/UEFI firmware. This is the fastest path to boot a Linux VM.
//!
//! # Overview
//!
//! The PVH boot protocol:
//! 1. Skips BIOS POST and firmware initialization
//! 2. Loads kernel directly into memory
//! 3. Sets up minimal boot structures (E820 map, start_info)
//! 4. Jumps directly to kernel 64-bit entry point
//!
//! # Boot Time Comparison
//!
//! | Method | Boot Time |
//! |--------|-----------|
//! | BIOS | 1-3s |
//! | UEFI | 0.5-1s |
//! | PVH | <50ms |
//!
//! # Memory Requirements
//!
//! The PVH start_info structure must be placed in guest memory and
//! its address passed to the kernel via RBX register.
use super::{layout, BootError, GuestMemory, Result};
/// Maximum number of E820 entries. This bounds the map area reserved in
/// guest memory; the module list is placed immediately after it.
pub const MAX_E820_ENTRIES: usize = 128;
/// E820 memory type values (matching Linux kernel definitions)
#[repr(u32)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum E820Type {
    /// Usable RAM
    Ram = 1,
    /// Reserved by system
    Reserved = 2,
    /// ACPI reclaimable
    Acpi = 3,
    /// ACPI NVS (Non-Volatile Storage)
    Nvs = 4,
    /// Unusable memory
    Unusable = 5,
    /// Disabled memory (EFI)
    Disabled = 6,
    /// Persistent memory
    Pmem = 7,
    /// Undefined/other
    Undefined = 0,
}
impl From<u32> for E820Type {
    /// Map a raw E820 type code to its enum variant; unknown codes become
    /// `Undefined`.
    fn from(val: u32) -> Self {
        match val {
            1 => Self::Ram,
            2 => Self::Reserved,
            3 => Self::Acpi,
            4 => Self::Nvs,
            5 => Self::Unusable,
            6 => Self::Disabled,
            7 => Self::Pmem,
            _ => Self::Undefined,
        }
    }
}
/// E820 memory map entry
///
/// Matches the Linux kernel's e820entry structure for compatibility;
/// `repr(C, packed)` keeps it at exactly 20 bytes.
#[repr(C, packed)]
#[derive(Debug, Clone, Copy, Default)]
pub struct E820Entry {
    /// Start address of memory region
    pub addr: u64,
    /// Size of memory region in bytes
    pub size: u64,
    /// Type of memory region
    pub entry_type: u32,
}
impl E820Entry {
    /// Create a new E820 entry of the given type.
    pub fn new(addr: u64, size: u64, entry_type: E820Type) -> Self {
        let entry_type = entry_type as u32;
        Self { addr, size, entry_type }
    }
    /// Convenience constructor for a usable-RAM entry.
    pub fn ram(addr: u64, size: u64) -> Self {
        Self::new(addr, size, E820Type::Ram)
    }
    /// Convenience constructor for a reserved entry.
    pub fn reserved(addr: u64, size: u64) -> Self {
        Self::new(addr, size, E820Type::Reserved)
    }
}
/// PVH start_info structure
///
/// This is a simplified version compatible with the Xen PVH ABI.
/// The structure is placed in guest memory and its address is passed
/// to the kernel in RBX.
///
/// # Memory Layout
///
/// The structure must be at a known location (typically 0x7000) and
/// contain pointers to other boot structures.
#[repr(C)]
#[derive(Debug, Clone, Default)]
pub struct StartInfo {
    /// Magic number (XEN_HVM_START_MAGIC_VALUE or custom)
    pub magic: u32,
    /// Version of the start_info structure
    pub version: u32,
    /// Flags (reserved, should be 0)
    pub flags: u32,
    /// Number of modules (initrd counts as 1)
    pub nr_modules: u32,
    /// Physical address of module list
    pub modlist_paddr: u64,
    /// Physical address of command line string
    pub cmdline_paddr: u64,
    /// Physical address of RSDP (ACPI, 0 if none)
    pub rsdp_paddr: u64,
    /// Physical address of E820 memory map
    pub memmap_paddr: u64,
    /// Number of entries in memory map
    pub memmap_entries: u32,
    /// Reserved/padding
    pub reserved: u32,
}
/// XEN HVM start magic value
pub const XEN_HVM_START_MAGIC: u32 = 0x336ec578;
/// Volt custom magic (for identification)
pub const VOLT_MAGIC: u32 = 0x4e4f5641; // "NOVA"
impl StartInfo {
    /// Build a StartInfo carrying the Xen magic and version 1; every other
    /// field starts at zero.
    pub fn new() -> Self {
        StartInfo {
            magic: XEN_HVM_START_MAGIC,
            version: 1,
            ..StartInfo::default()
        }
    }
    /// Builder: record the guest-physical address of the kernel command line.
    pub fn with_cmdline(mut self, addr: u64) -> Self {
        self.cmdline_paddr = addr;
        self
    }
    /// Builder: record where the E820 map lives and how many entries it has.
    pub fn with_memmap(mut self, addr: u64, entries: u32) -> Self {
        self.memmap_paddr = addr;
        self.memmap_entries = entries;
        self
    }
    /// Builder: register a single module (the initrd) at `modlist_addr`.
    pub fn with_module(mut self, modlist_addr: u64) -> Self {
        self.nr_modules = 1;
        self.modlist_paddr = modlist_addr;
        self
    }
    /// View the structure as raw bytes for writing into guest memory.
    pub fn as_bytes(&self) -> &[u8] {
        let len = std::mem::size_of::<Self>();
        // SAFETY: `self` is a fully-initialized StartInfo, so reading its
        // memory as `size_of::<Self>()` bytes is sound; the returned slice
        // borrows `self` and cannot outlive it.
        unsafe { std::slice::from_raw_parts(self as *const Self as *const u8, len) }
    }
}
/// Module (initrd) entry for PVH
#[repr(C)]
#[derive(Debug, Clone, Copy, Default)]
pub struct HvmModlistEntry {
    /// Physical address of module
    pub paddr: u64,
    /// Size of module in bytes
    pub size: u64,
    /// Physical address of command line for module (0 if none)
    pub cmdline_paddr: u64,
    /// Reserved
    pub reserved: u64,
}
impl HvmModlistEntry {
    /// Build an entry describing the initrd; no per-module command line.
    pub fn new(paddr: u64, size: u64) -> Self {
        Self {
            paddr,
            size,
            ..Self::default()
        }
    }
    /// View the entry as raw bytes for writing into guest memory.
    pub fn as_bytes(&self) -> &[u8] {
        let len = std::mem::size_of::<Self>();
        // SAFETY: `self` is a fully-initialized, plain-old-data struct; the
        // slice covers exactly its bytes and borrows `self`.
        unsafe { std::slice::from_raw_parts(self as *const Self as *const u8, len) }
    }
}
/// PVH configuration for boot setup
///
/// Bundles the guest parameters that `PvhBootSetup::setup` reads when
/// building the E820 map, start_info, and optional module list.
#[derive(Debug, Clone)]
pub struct PvhConfig {
    /// Total memory size in bytes
    pub memory_size: u64,
    /// Number of vCPUs
    pub vcpu_count: u32,
    /// Physical address of command line (recorded in start_info)
    pub cmdline_addr: u64,
    /// Physical address of initrd (if any; both initrd fields must be Some
    /// for a module-list entry to be emitted)
    pub initrd_addr: Option<u64>,
    /// Size of initrd (if any)
    pub initrd_size: Option<u64>,
}
/// PVH boot setup implementation
pub struct PvhBootSetup;
impl PvhBootSetup {
    /// Set up PVH boot structures in guest memory
    ///
    /// Creates and writes:
    /// 1. E820 memory map
    /// 2. start_info structure
    /// 3. Module list (for initrd)
    ///
    /// # Errors
    ///
    /// Returns `BootError::MemoryLayout` if `config.memory_size` is below
    /// the minimum layout requirement, or `BootError::GuestMemoryWrite` if
    /// any structure does not fit in guest memory.
    pub fn setup<M: GuestMemory>(config: &PvhConfig, guest_mem: &mut M) -> Result<()> {
        // Build E820 memory map
        let e820_entries = Self::build_e820_map(config.memory_size)?;
        let e820_count = e820_entries.len() as u32;
        // Write E820 map to guest memory
        Self::write_e820_map(&e820_entries, guest_mem)?;
        // Write module list if initrd is present. The module list is placed
        // directly after the maximum-sized E820 map area.
        let modlist_addr = if let (Some(addr), Some(size)) = (config.initrd_addr, config.initrd_size) {
            let modlist_addr = layout::E820_MAP_ADDR +
                (MAX_E820_ENTRIES * std::mem::size_of::<E820Entry>()) as u64;
            let entry = HvmModlistEntry::new(addr, size);
            guest_mem.write_bytes(modlist_addr, entry.as_bytes())?;
            Some(modlist_addr)
        } else {
            None
        };
        // Build and write start_info structure (address handed to the kernel
        // in RBX by `get_initial_regs`).
        let mut start_info = StartInfo::new()
            .with_cmdline(config.cmdline_addr)
            .with_memmap(layout::E820_MAP_ADDR, e820_count);
        if let Some(addr) = modlist_addr {
            start_info = start_info.with_module(addr);
        }
        guest_mem.write_bytes(layout::PVH_START_INFO_ADDR, start_info.as_bytes())?;
        Ok(())
    }
    /// Build E820 memory map for the VM
    ///
    /// Creates a standard x86_64 memory layout:
    /// - Low memory (0-640KB): RAM
    /// - Legacy hole (640KB-1MB): Reserved
    /// - High memory (1MB+): RAM
    ///
    /// # Errors
    ///
    /// Returns `BootError::MemoryLayout` if `memory_size` is smaller than
    /// `layout::HIGH_MEMORY_START` (i.e. less than 1MB of guest RAM).
    fn build_e820_map(memory_size: u64) -> Result<Vec<E820Entry>> {
        let mut entries = Vec::with_capacity(4);
        // Validate minimum memory
        if memory_size < layout::HIGH_MEMORY_START {
            return Err(BootError::MemoryLayout(format!(
                "Memory size {} is less than minimum required {}",
                memory_size,
                layout::HIGH_MEMORY_START
            )));
        }
        // Low memory: 0 to 640KB (0x0 - 0x9FFFF), reported as usable RAM.
        // NOTE(review): the whole range — including page 0, which holds the
        // real-mode IVT — is reported as RAM here; confirm the kernel
        // tolerates that for this boot path.
        entries.push(E820Entry::ram(0, layout::LOW_MEMORY_END));
        // Legacy video/ROM hole: 640KB to 1MB (0xA0000 - 0xFFFFF)
        // This is reserved for VGA memory, option ROMs, etc.
        let legacy_hole_size = layout::HIGH_MEMORY_START - layout::LOW_MEMORY_END;
        entries.push(E820Entry::reserved(layout::LOW_MEMORY_END, legacy_hole_size));
        // High memory: 1MB to RAM size
        let high_memory_size = memory_size - layout::HIGH_MEMORY_START;
        if high_memory_size > 0 {
            entries.push(E820Entry::ram(layout::HIGH_MEMORY_START, high_memory_size));
        }
        // If memory > 4GB, we might need to handle the MMIO hole
        // For now, we assume memory <= 4GB for simplicity
        // Production systems should handle:
        // - PCI MMIO hole (typically 0xE0000000 - 0xFFFFFFFF)
        // - Memory above 4GB remapped
        Ok(entries)
    }
    /// Write E820 map entries to guest memory, packed back-to-back starting
    /// at `layout::E820_MAP_ADDR`.
    fn write_e820_map<M: GuestMemory>(entries: &[E820Entry], guest_mem: &mut M) -> Result<()> {
        let entry_size = std::mem::size_of::<E820Entry>();
        for (i, entry) in entries.iter().enumerate() {
            let addr = layout::E820_MAP_ADDR + (i * entry_size) as u64;
            // SAFETY: E820Entry is repr(C, packed) plain-old-data; viewing it
            // as `entry_size` bytes is sound and the slice borrows `entry`.
            let bytes = unsafe {
                std::slice::from_raw_parts(entry as *const E820Entry as *const u8, entry_size)
            };
            guest_mem.write_bytes(addr, bytes)?;
        }
        Ok(())
    }
    /// Get initial CPU register state for PVH boot
    ///
    /// Returns the register values needed to start the vCPU in 64-bit mode
    /// with PVH boot protocol.
    pub fn get_initial_regs(entry_point: u64) -> PvhRegs {
        PvhRegs {
            // Instruction pointer - kernel entry
            rip: entry_point,
            // RBX contains pointer to start_info (Xen PVH convention)
            rbx: layout::PVH_START_INFO_ADDR,
            // RSI also contains start_info pointer (Linux boot convention)
            rsi: layout::PVH_START_INFO_ADDR,
            // Stack pointer
            rsp: layout::BOOT_STACK_POINTER,
            // Clear other general-purpose registers
            rax: 0,
            rcx: 0,
            rdx: 0,
            rdi: 0,
            rbp: 0,
            r8: 0,
            r9: 0,
            r10: 0,
            r11: 0,
            r12: 0,
            r13: 0,
            r14: 0,
            r15: 0,
            // Flags - interrupts disabled (bit 1 is the always-set reserved bit)
            rflags: 0x2,
            // Segment selectors matching the boot GDT laid out by `BootGdt`
            // (GDT[0] = null, GDT[1] = code at 0x08, GDT[2] = data at 0x10).
            // Previously cs=0x10/data=0x18, which selected the data descriptor
            // as CS and pointed past the end of the 3-entry table.
            cs: 0x08, // 64-bit code segment
            ds: 0x10, // Data segment
            es: 0x10,
            fs: 0x10,
            gs: 0x10,
            ss: 0x10,
            // CR registers for 64-bit mode
            cr0: CR0_PE | CR0_ET | CR0_PG,
            cr3: 0, // Page table base - set by kernel setup
            cr4: CR4_PAE,
            // EFER for long mode
            efer: EFER_LME | EFER_LMA,
        }
    }
}
/// Control Register 0 bits
const CR0_PE: u64 = 1 << 0; // Protection Enable (protected mode)
const CR0_ET: u64 = 1 << 4; // Extension Type (387 present)
const CR0_PG: u64 = 1 << 31; // Paging Enable (requires PE)
/// Control Register 4 bits
const CR4_PAE: u64 = 1 << 5; // Physical Address Extension (required for long mode)
/// EFER (Extended Feature Enable Register) bits
const EFER_LME: u64 = 1 << 8; // Long Mode Enable
const EFER_LMA: u64 = 1 << 10; // Long Mode Active
/// CPU register state for PVH boot
///
/// Full initial register file for a vCPU started via the PVH path; values
/// are produced by `PvhBootSetup::get_initial_regs`.
#[derive(Debug, Clone, Default)]
pub struct PvhRegs {
    // General purpose registers
    pub rax: u64,
    pub rbx: u64, // carries the start_info pointer (Xen PVH convention)
    pub rcx: u64,
    pub rdx: u64,
    pub rsi: u64, // also carries the start_info pointer (Linux convention)
    pub rdi: u64,
    pub rsp: u64, // boot stack pointer
    pub rbp: u64,
    pub r8: u64,
    pub r9: u64,
    pub r10: u64,
    pub r11: u64,
    pub r12: u64,
    pub r13: u64,
    pub r14: u64,
    pub r15: u64,
    // Instruction pointer (kernel entry point)
    pub rip: u64,
    // Flags
    pub rflags: u64,
    // Segment selectors
    pub cs: u16,
    pub ds: u16,
    pub es: u16,
    pub fs: u16,
    pub gs: u16,
    pub ss: u16,
    // Control registers
    pub cr0: u64,
    pub cr3: u64, // page table base (PML4 physical address)
    pub cr4: u64,
    // Model-specific registers
    pub efer: u64,
}
/// GDT entries for 64-bit mode boot
///
/// Minimal three-entry GDT used only for the initial transition into long
/// mode; the kernel installs its own GDT once running.
pub struct BootGdt;
impl BootGdt {
    /// Null descriptor (required as GDT[0])
    pub const NULL: u64 = 0;
    /// 64-bit code segment (CS)
    /// Base: 0, Limit: 0xFFFFF (ignored in 64-bit mode)
    /// Type: Code, Execute/Read, Present, DPL=0
    pub const CODE64: u64 = 0x00af_9b00_0000_ffff;
    /// 64-bit data segment (DS, ES, SS, FS, GS)
    /// Base: 0, Limit: 0xFFFFF
    /// Type: Data, Read/Write, Present, DPL=0
    pub const DATA64: u64 = 0x00cf_9300_0000_ffff;
    /// Serialize the three descriptors into a 24-byte table, each entry
    /// little-endian at an 8-byte slot.
    pub fn as_bytes() -> [u8; 24] {
        let mut table = [0u8; 24];
        for (slot, descriptor) in [Self::NULL, Self::CODE64, Self::DATA64].iter().enumerate() {
            table[slot * 8..slot * 8 + 8].copy_from_slice(&descriptor.to_le_bytes());
        }
        table
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Simple `Vec`-backed stand-in for guest RAM.
    struct MockMemory {
        size: u64,
        data: Vec<u8>,
    }
    impl MockMemory {
        fn new(size: u64) -> Self {
            Self {
                size,
                data: vec![0; size as usize],
            }
        }
    }
    impl GuestMemory for MockMemory {
        fn write_bytes(&mut self, addr: u64, data: &[u8]) -> Result<()> {
            let end = addr as usize + data.len();
            if end > self.data.len() {
                return Err(BootError::GuestMemoryWrite(format!(
                    "Write at {:#x} exceeds memory size",
                    addr
                )));
            }
            self.data[addr as usize..end].copy_from_slice(data);
            Ok(())
        }
        fn size(&self) -> u64 {
            self.size
        }
    }
    #[test]
    fn test_e820_entry_size() {
        // E820 entry must be exactly 20 bytes for Linux kernel compatibility
        // (8 + 8 + 4, no padding thanks to repr(C, packed)).
        assert_eq!(std::mem::size_of::<E820Entry>(), 20);
    }
    #[test]
    fn test_build_e820_map() {
        let memory_size = 128 * 1024 * 1024; // 128MB
        let entries = PvhBootSetup::build_e820_map(memory_size).unwrap();
        // Should have at least 3 entries
        assert!(entries.len() >= 3);
        // First entry should be low memory RAM — copy from packed struct
        // (direct references to packed fields are rejected by the compiler).
        let e0_addr = entries[0].addr;
        let e0_type = entries[0].entry_type;
        assert_eq!(e0_addr, 0);
        assert_eq!(e0_type, E820Type::Ram as u32);
        // Second entry should be legacy hole (reserved)
        let e1_addr = entries[1].addr;
        let e1_type = entries[1].entry_type;
        assert_eq!(e1_addr, layout::LOW_MEMORY_END);
        assert_eq!(e1_type, E820Type::Reserved as u32);
        // Third entry should be high memory RAM
        let e2_addr = entries[2].addr;
        let e2_type = entries[2].entry_type;
        assert_eq!(e2_addr, layout::HIGH_MEMORY_START);
        assert_eq!(e2_type, E820Type::Ram as u32);
    }
    #[test]
    fn test_start_info_size() {
        // StartInfo should be reasonable size (under 4KB page)
        let size = std::mem::size_of::<StartInfo>();
        assert!(size < 4096);
        assert!(size >= 48); // Minimum expected fields
    }
    #[test]
    fn test_pvh_setup() {
        let mut mem = MockMemory::new(128 * 1024 * 1024);
        let config = PvhConfig {
            memory_size: 128 * 1024 * 1024,
            vcpu_count: 2,
            cmdline_addr: layout::CMDLINE_ADDR,
            initrd_addr: Some(100 * 1024 * 1024),
            initrd_size: Some(10 * 1024 * 1024),
        };
        let result = PvhBootSetup::setup(&config, &mut mem);
        assert!(result.is_ok());
        // Verify magic was written to start_info location
        // (magic is the first u32 field of the repr(C) StartInfo).
        let magic = u32::from_le_bytes([
            mem.data[layout::PVH_START_INFO_ADDR as usize],
            mem.data[layout::PVH_START_INFO_ADDR as usize + 1],
            mem.data[layout::PVH_START_INFO_ADDR as usize + 2],
            mem.data[layout::PVH_START_INFO_ADDR as usize + 3],
        ]);
        assert_eq!(magic, XEN_HVM_START_MAGIC);
    }
    #[test]
    fn test_pvh_regs() {
        let entry_point = 0x100200;
        let regs = PvhBootSetup::get_initial_regs(entry_point);
        // Verify entry point
        assert_eq!(regs.rip, entry_point);
        // Verify start_info pointer in rbx
        assert_eq!(regs.rbx, layout::PVH_START_INFO_ADDR);
        // Verify 64-bit mode flags
        assert!(regs.cr0 & CR0_PE != 0); // Protection enabled
        assert!(regs.cr0 & CR0_PG != 0); // Paging enabled
        assert!(regs.cr4 & CR4_PAE != 0); // PAE enabled
        assert!(regs.efer & EFER_LME != 0); // Long mode enabled
    }
    #[test]
    fn test_gdt_layout() {
        let gdt = BootGdt::as_bytes();
        assert_eq!(gdt.len(), 24); // 3 entries × 8 bytes
        // First entry should be null
        assert_eq!(&gdt[0..8], &[0u8; 8]);
    }
}