Volt VMM (Neutron Stardust): source-available under AGPSL v5.0
KVM-based microVMM for the Volt platform:
- Sub-second VM boot times
- Minimal memory footprint
- Landlock LSM + seccomp security
- Virtio device support
- Custom kernel management

Copyright (c) Armored Gates LLC. All rights reserved.
Licensed under AGPSL v5.0
This commit is contained in:
115
vmm/src/boot/gdt.rs
Normal file
115
vmm/src/boot/gdt.rs
Normal file
@@ -0,0 +1,115 @@
|
||||
//! GDT (Global Descriptor Table) Setup for 64-bit Boot
|
||||
//!
|
||||
//! Sets up a minimal GDT for 64-bit kernel boot. The kernel will set up
|
||||
//! its own GDT later, so this is just for the initial transition.
|
||||
|
||||
use super::{GuestMemory, Result};
|
||||
#[cfg(test)]
|
||||
use super::BootError;
|
||||
|
||||
/// Guest-physical address at which the boot GDT is written.
pub const GDT_ADDR: u64 = 0x500;

/// Number of bytes reserved for the GDT.
///
/// Three 8-byte descriptors (24 bytes) are actually populated; the area is
/// sized for six descriptors to leave some slack.
pub const GDT_SIZE: usize = 6 * 8;
|
||||
|
||||
/// Segment selectors for the boot GDT (matches Firecracker layout).
///
/// Each selector is the descriptor index shifted left by three, i.e. the byte
/// offset of the descriptor inside the table.
#[allow(dead_code)] // GDT selector constants — part of x86 boot protocol
pub mod selectors {
    /// Selector of the mandatory null descriptor (index 0).
    pub const NULL: u16 = 0;
    /// Selector of the 64-bit code segment (index 1).
    pub const CODE64: u16 = 1 << 3;
    /// Selector of the 64-bit data segment (index 2).
    pub const DATA64: u16 = 2 << 3;
}
|
||||
|
||||
/// GDT setup implementation
|
||||
pub struct GdtSetup;
|
||||
|
||||
impl GdtSetup {
|
||||
/// Set up GDT in guest memory
|
||||
///
|
||||
/// Creates a minimal GDT matching Firecracker's layout:
|
||||
/// - Entry 0 (0x00): Null descriptor (required)
|
||||
/// - Entry 1 (0x08): 64-bit code segment
|
||||
/// - Entry 2 (0x10): 64-bit data segment
|
||||
pub fn setup<M: GuestMemory>(guest_mem: &mut M) -> Result<()> {
|
||||
// Zero out the GDT area first
|
||||
let zeros = vec![0u8; GDT_SIZE];
|
||||
guest_mem.write_bytes(GDT_ADDR, &zeros)?;
|
||||
|
||||
// Entry 0: Null descriptor (required, all zeros)
|
||||
// Already zeroed
|
||||
|
||||
// Entry 1 (0x08): 64-bit code segment
|
||||
// Base: 0, Limit: 0xFFFFF (ignored in 64-bit mode)
|
||||
// Flags: Present, Ring 0, Code, Execute/Read, Long mode
|
||||
let code64: u64 = 0x00AF_9B00_0000_FFFF;
|
||||
guest_mem.write_bytes(GDT_ADDR + 0x08, &code64.to_le_bytes())?;
|
||||
|
||||
// Entry 2 (0x10): 64-bit data segment
|
||||
// Base: 0, Limit: 0xFFFFF
|
||||
// Flags: Present, Ring 0, Data, Read/Write
|
||||
let data64: u64 = 0x00CF_9300_0000_FFFF;
|
||||
guest_mem.write_bytes(GDT_ADDR + 0x10, &data64.to_le_bytes())?;
|
||||
|
||||
tracing::debug!("GDT set up at 0x{:x}", GDT_ADDR);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Flat byte buffer standing in for guest memory.
    struct MockMemory {
        data: Vec<u8>,
    }

    impl MockMemory {
        /// Create a zero-filled buffer of `size` bytes.
        fn new(size: usize) -> Self {
            Self { data: vec![0; size] }
        }

        /// Read a little-endian `u64` starting at `addr`.
        fn read_u64(&self, addr: u64) -> u64 {
            let start = addr as usize;
            let mut raw = [0u8; 8];
            raw.copy_from_slice(&self.data[start..start + 8]);
            u64::from_le_bytes(raw)
        }
    }

    impl GuestMemory for MockMemory {
        fn write_bytes(&mut self, addr: u64, data: &[u8]) -> Result<()> {
            let start = addr as usize;
            let end = start + data.len();
            if end > self.data.len() {
                return Err(BootError::GuestMemoryWrite("overflow".into()));
            }
            self.data[start..end].copy_from_slice(data);
            Ok(())
        }

        fn size(&self) -> u64 {
            self.data.len() as u64
        }
    }

    #[test]
    fn test_gdt_setup() {
        let mut mem = MockMemory::new(0x1000);
        GdtSetup::setup(&mut mem).unwrap();

        // (offset inside the GDT, expected raw descriptor)
        let expected = [
            (0x00u64, 0u64),               // null descriptor
            (0x08, 0x00AF_9B00_0000_FFFF), // code segment (entry 1)
            (0x10, 0x00CF_9300_0000_FFFF), // data segment (entry 2)
        ];

        for &(offset, descriptor) in &expected {
            assert_eq!(mem.read_u64(GDT_ADDR + offset), descriptor);
        }
    }
}
|
||||
398
vmm/src/boot/initrd.rs
Normal file
398
vmm/src/boot/initrd.rs
Normal file
@@ -0,0 +1,398 @@
|
||||
//! Initrd/Initramfs Loader
|
||||
//!
|
||||
//! Handles loading of initial ramdisk images into guest memory.
|
||||
//! The initrd is placed in high memory to avoid conflicts with the kernel.
|
||||
//!
|
||||
//! # Memory Placement Strategy
|
||||
//!
|
||||
//! The initrd is placed as high as possible in guest memory while:
|
||||
//! 1. Staying below the 4GB boundary (for 32-bit kernel compatibility)
|
||||
//! 2. Being page-aligned
|
||||
//! 3. Not overlapping with the kernel
|
||||
//!
|
||||
//! This matches the behavior of QEMU and other hypervisors.
|
||||
|
||||
use super::{BootError, GuestMemory, Result};
|
||||
use std::fs::File;
|
||||
use std::io::Read;
|
||||
use std::path::Path;
|
||||
|
||||
/// Alignment granule for the initrd load address (one 4 KiB page).
const PAGE_SIZE: u64 = 1 << 12;

/// Highest address considered for initrd placement (4GB - 1, for 32-bit compatibility).
const MAX_INITRD_ADDR: u64 = u32::MAX as u64;

/// Smallest distance kept between the end of the kernel and the initrd.
const MIN_KERNEL_INITRD_GAP: u64 = PAGE_SIZE;
|
||||
|
||||
/// Inputs needed to load an initrd/initramfs image.
#[derive(Debug, Clone)]
pub struct InitrdConfig {
    /// Host path of the initrd/initramfs image.
    pub path: String,

    /// Total size of guest memory, in bytes.
    pub memory_size: u64,

    /// Address at which the kernel ends; the initrd is placed above it.
    pub kernel_end: u64,
}
|
||||
|
||||
/// Outcome of a successful initrd load.
#[derive(Debug, Clone)]
pub struct InitrdLoadResult {
    /// Guest address at which the initrd was written.
    pub load_addr: u64,

    /// Number of initrd bytes written.
    pub size: u64,
}
|
||||
|
||||
/// Initrd loader implementation
|
||||
pub struct InitrdLoader;
|
||||
|
||||
impl InitrdLoader {
|
||||
/// Load initrd into guest memory
|
||||
///
|
||||
/// Places the initrd as high as possible in guest memory while respecting
|
||||
/// alignment and boundary constraints.
|
||||
pub fn load<M: GuestMemory>(
|
||||
config: &InitrdConfig,
|
||||
guest_mem: &mut M,
|
||||
) -> Result<InitrdLoadResult> {
|
||||
let initrd_data = Self::read_initrd_file(&config.path)?;
|
||||
let initrd_size = initrd_data.len() as u64;
|
||||
|
||||
if initrd_size == 0 {
|
||||
return Err(BootError::InitrdRead(std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidData,
|
||||
"Initrd file is empty",
|
||||
)));
|
||||
}
|
||||
|
||||
// Calculate optimal placement address
|
||||
let load_addr = Self::calculate_load_address(
|
||||
initrd_size,
|
||||
config.memory_size,
|
||||
config.kernel_end,
|
||||
guest_mem.size(),
|
||||
)?;
|
||||
|
||||
// Write initrd to guest memory
|
||||
guest_mem.write_bytes(load_addr, &initrd_data)?;
|
||||
|
||||
Ok(InitrdLoadResult {
|
||||
load_addr,
|
||||
size: initrd_size,
|
||||
})
|
||||
}
|
||||
|
||||
/// Read initrd file into memory
|
||||
fn read_initrd_file(path: &str) -> Result<Vec<u8>> {
|
||||
let path = Path::new(path);
|
||||
|
||||
if !path.exists() {
|
||||
return Err(BootError::InitrdRead(std::io::Error::new(
|
||||
std::io::ErrorKind::NotFound,
|
||||
format!("Initrd not found: {}", path.display()),
|
||||
)));
|
||||
}
|
||||
|
||||
let mut file = File::open(path).map_err(BootError::InitrdRead)?;
|
||||
|
||||
let mut data = Vec::new();
|
||||
file.read_to_end(&mut data).map_err(BootError::InitrdRead)?;
|
||||
|
||||
Ok(data)
|
||||
}
|
||||
|
||||
/// Calculate the optimal load address for initrd
|
||||
///
|
||||
/// Strategy:
|
||||
/// 1. Try to place at high memory (below 4GB for compatibility)
|
||||
/// 2. Page-align the address
|
||||
/// 3. Ensure no overlap with kernel
|
||||
fn calculate_load_address(
|
||||
initrd_size: u64,
|
||||
memory_size: u64,
|
||||
kernel_end: u64,
|
||||
guest_mem_size: u64,
|
||||
) -> Result<u64> {
|
||||
// Determine the highest usable address
|
||||
let max_addr = guest_mem_size.min(memory_size).min(MAX_INITRD_ADDR);
|
||||
|
||||
// Calculate page-aligned initrd size
|
||||
let aligned_size = Self::align_up(initrd_size, PAGE_SIZE);
|
||||
|
||||
// Try to place at high memory (just below max_addr)
|
||||
if max_addr < aligned_size {
|
||||
return Err(BootError::InitrdTooLarge {
|
||||
size: initrd_size,
|
||||
available: max_addr,
|
||||
});
|
||||
}
|
||||
|
||||
// Calculate load address (page-aligned, as high as possible)
|
||||
let ideal_addr = Self::align_down(max_addr - aligned_size, PAGE_SIZE);
|
||||
|
||||
// Check for kernel overlap
|
||||
let min_addr = kernel_end + MIN_KERNEL_INITRD_GAP;
|
||||
let min_addr_aligned = Self::align_up(min_addr, PAGE_SIZE);
|
||||
|
||||
if ideal_addr < min_addr_aligned {
|
||||
// Not enough space between kernel and max memory
|
||||
return Err(BootError::InitrdTooLarge {
|
||||
size: initrd_size,
|
||||
available: max_addr - min_addr_aligned,
|
||||
});
|
||||
}
|
||||
|
||||
Ok(ideal_addr)
|
||||
}
|
||||
|
||||
/// Align value up to the given alignment
|
||||
#[inline]
|
||||
fn align_up(value: u64, alignment: u64) -> u64 {
|
||||
(value + alignment - 1) & !(alignment - 1)
|
||||
}
|
||||
|
||||
/// Align value down to the given alignment
|
||||
#[inline]
|
||||
fn align_down(value: u64, alignment: u64) -> u64 {
|
||||
value & !(alignment - 1)
|
||||
}
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
// Initrd format detection — planned feature, not yet wired up
|
||||
// --------------------------------------------------------------------------
|
||||
|
||||
/// Recognizer for a single initrd container format.
#[allow(dead_code)] // planned feature, not yet wired up
pub trait InitrdFormat {
    /// Return `true` when `data` is recognised as this format.
    fn is_valid(data: &[u8]) -> bool;

    /// Human-readable name of the format.
    fn name() -> &'static str;
}
|
||||
|
||||
/// CPIO archive format (traditional initrd)
|
||||
#[allow(dead_code)]
|
||||
pub struct CpioFormat;
|
||||
|
||||
impl InitrdFormat for CpioFormat {
|
||||
fn is_valid(data: &[u8]) -> bool {
|
||||
if data.len() < 6 {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check for CPIO magic numbers
|
||||
// "070701" or "070702" (newc format)
|
||||
// "070707" (odc format)
|
||||
// 0x71c7 or 0xc771 (binary format)
|
||||
if &data[0..6] == b"070701" || &data[0..6] == b"070702" || &data[0..6] == b"070707" {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Binary CPIO
|
||||
if data.len() >= 2 {
|
||||
let magic = u16::from_le_bytes([data[0], data[1]]);
|
||||
if magic == 0x71c7 || magic == 0xc771 {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
|
||||
fn name() -> &'static str {
|
||||
"CPIO"
|
||||
}
|
||||
}
|
||||
|
||||
/// Gzip compressed format
|
||||
#[allow(dead_code)]
|
||||
pub struct GzipFormat;
|
||||
|
||||
impl InitrdFormat for GzipFormat {
|
||||
fn is_valid(data: &[u8]) -> bool {
|
||||
// Gzip magic: 0x1f 0x8b
|
||||
data.len() >= 2 && data[0] == 0x1f && data[1] == 0x8b
|
||||
}
|
||||
|
||||
fn name() -> &'static str {
|
||||
"Gzip"
|
||||
}
|
||||
}
|
||||
|
||||
/// XZ compressed format
|
||||
#[allow(dead_code)]
|
||||
pub struct XzFormat;
|
||||
|
||||
impl InitrdFormat for XzFormat {
|
||||
fn is_valid(data: &[u8]) -> bool {
|
||||
// XZ magic: 0xfd "7zXZ" 0x00
|
||||
data.len() >= 6
|
||||
&& data[0] == 0xfd
|
||||
&& &data[1..5] == b"7zXZ"
|
||||
&& data[5] == 0x00
|
||||
}
|
||||
|
||||
fn name() -> &'static str {
|
||||
"XZ"
|
||||
}
|
||||
}
|
||||
|
||||
/// Zstd compressed format
|
||||
#[allow(dead_code)]
|
||||
pub struct ZstdFormat;
|
||||
|
||||
impl InitrdFormat for ZstdFormat {
|
||||
fn is_valid(data: &[u8]) -> bool {
|
||||
// Zstd magic: 0x28 0xb5 0x2f 0xfd
|
||||
data.len() >= 4
|
||||
&& data[0] == 0x28
|
||||
&& data[1] == 0xb5
|
||||
&& data[2] == 0x2f
|
||||
&& data[3] == 0xfd
|
||||
}
|
||||
|
||||
fn name() -> &'static str {
|
||||
"Zstd"
|
||||
}
|
||||
}
|
||||
|
||||
/// LZ4 compressed format
|
||||
#[allow(dead_code)]
|
||||
pub struct Lz4Format;
|
||||
|
||||
impl InitrdFormat for Lz4Format {
|
||||
fn is_valid(data: &[u8]) -> bool {
|
||||
// LZ4 frame magic: 0x04 0x22 0x4d 0x18
|
||||
data.len() >= 4
|
||||
&& data[0] == 0x04
|
||||
&& data[1] == 0x22
|
||||
&& data[2] == 0x4d
|
||||
&& data[3] == 0x18
|
||||
}
|
||||
|
||||
fn name() -> &'static str {
|
||||
"LZ4"
|
||||
}
|
||||
}
|
||||
|
||||
/// Detect initrd format from data
|
||||
#[allow(dead_code)]
|
||||
pub fn detect_initrd_format(data: &[u8]) -> Option<&'static str> {
|
||||
if GzipFormat::is_valid(data) {
|
||||
return Some(GzipFormat::name());
|
||||
}
|
||||
if XzFormat::is_valid(data) {
|
||||
return Some(XzFormat::name());
|
||||
}
|
||||
if ZstdFormat::is_valid(data) {
|
||||
return Some(ZstdFormat::name());
|
||||
}
|
||||
if Lz4Format::is_valid(data) {
|
||||
return Some(Lz4Format::name());
|
||||
}
|
||||
if CpioFormat::is_valid(data) {
|
||||
return Some(CpioFormat::name());
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    const MIB: u64 = 1024 * 1024;

    #[test]
    fn test_align_up() {
        // (input, expected) for a 4096-byte alignment
        let cases = [(0u64, 0u64), (1, 4096), (4095, 4096), (4096, 4096), (4097, 8192)];
        for &(value, expected) in &cases {
            assert_eq!(InitrdLoader::align_up(value, 4096), expected);
        }
    }

    #[test]
    fn test_align_down() {
        // (input, expected) for a 4096-byte alignment
        let cases = [(0u64, 0u64), (4095, 0), (4096, 4096), (4097, 4096), (8191, 4096)];
        for &(value, expected) in &cases {
            assert_eq!(InitrdLoader::align_down(value, 4096), expected);
        }
    }

    #[test]
    fn test_calculate_load_address() {
        // 128MB memory, 4MB kernel ending at 5MB, 10MB initrd
        let memory_size = 128 * MIB;
        let kernel_end = 5 * MIB;
        let initrd_size = 10 * MIB;

        let addr = InitrdLoader::calculate_load_address(
            initrd_size,
            memory_size,
            kernel_end,
            memory_size,
        )
        .expect("initrd should fit");

        // Page-aligned, above the kernel, and entirely inside memory.
        assert_eq!(addr % PAGE_SIZE, 0);
        assert!(addr > kernel_end);
        assert!(addr + initrd_size <= memory_size);
    }

    #[test]
    fn test_initrd_too_large() {
        // 32MB initrd cannot fit in 16MB of memory with the kernel ending at 8MB.
        let memory_size = 16 * MIB;
        let kernel_end = 8 * MIB;
        let initrd_size = 32 * MIB;

        let outcome = InitrdLoader::calculate_load_address(
            initrd_size,
            memory_size,
            kernel_end,
            memory_size,
        );

        assert!(matches!(outcome, Err(BootError::InitrdTooLarge { .. })));
    }

    #[test]
    fn test_detect_gzip() {
        let magic = [0x1f, 0x8b, 0x08, 0x00];
        assert!(GzipFormat::is_valid(&magic));
        assert_eq!(detect_initrd_format(&magic), Some("Gzip"));
    }

    #[test]
    fn test_detect_xz() {
        let magic = [0xfd, b'7', b'z', b'X', b'Z', 0x00];
        assert!(XzFormat::is_valid(&magic));
        assert_eq!(detect_initrd_format(&magic), Some("XZ"));
    }

    #[test]
    fn test_detect_zstd() {
        let magic = [0x28, 0xb5, 0x2f, 0xfd];
        assert!(ZstdFormat::is_valid(&magic));
        assert_eq!(detect_initrd_format(&magic), Some("Zstd"));
    }

    #[test]
    fn test_detect_cpio_newc() {
        let header = b"070701001234";
        assert!(CpioFormat::is_valid(header));
    }
}
|
||||
465
vmm/src/boot/linux.rs
Normal file
465
vmm/src/boot/linux.rs
Normal file
@@ -0,0 +1,465 @@
|
||||
//! Linux Boot Protocol Implementation
|
||||
//!
|
||||
//! Implements the Linux x86 boot protocol for 64-bit kernels.
|
||||
//! This sets up the boot_params structure (zero page) that Linux expects
|
||||
//! when booting in 64-bit mode.
|
||||
//!
|
||||
//! # References
|
||||
//! - Linux kernel: arch/x86/include/uapi/asm/bootparam.h
|
||||
//! - Linux kernel: Documentation/x86/boot.rst
|
||||
|
||||
use super::{layout, BootError, GuestMemory, Result};
|
||||
|
||||
/// Guest-physical address of the boot_params structure (zero page).
///
/// Must not overlap with page tables (0x1000-0x10FFF zeroed area) or GDT (0x500-0x52F).
pub const BOOT_PARAMS_ADDR: u64 = 0x2_0000;

/// Size in bytes of the boot_params structure (one 4KB page).
pub const BOOT_PARAMS_SIZE: usize = 0x1000;
|
||||
|
||||
/// One E820 memory-map entry as stored inside boot_params.
///
/// The struct is packed, so it occupies exactly 20 bytes (8 + 8 + 4). Copy
/// fields into locals before borrowing them; references to packed fields may
/// be unaligned.
#[repr(C, packed)]
#[derive(Debug, Clone, Copy, Default)]
pub struct E820Entry {
    /// Start address of the region.
    pub addr: u64,
    /// Length of the region in bytes.
    pub size: u64,
    /// Region type, one of the [`E820Type`] discriminants.
    pub entry_type: u32,
}

/// E820 memory types
#[repr(u32)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(dead_code)] // E820 spec types — kept for completeness
pub enum E820Type {
    Ram = 1,
    Reserved = 2,
    Acpi = 3,
    Nvs = 4,
    Unusable = 5,
}

impl E820Entry {
    /// Build an entry describing usable RAM.
    pub fn ram(addr: u64, size: u64) -> Self {
        Self::with_type(addr, size, E820Type::Ram)
    }

    /// Build an entry describing a reserved region.
    pub fn reserved(addr: u64, size: u64) -> Self {
        Self::with_type(addr, size, E820Type::Reserved)
    }

    /// Shared constructor: store `kind` as its raw `u32` discriminant.
    fn with_type(addr: u64, size: u64, kind: E820Type) -> Self {
        Self {
            addr,
            size,
            entry_type: kind as u32,
        }
    }
}
|
||||
|
||||
/// Linux `setup_header` (at offset 0x1F1 in the boot sector, or 0x1F1 in boot_params).
///
/// Only the fields this loader uses are defined. The struct is packed, so
/// every field sits at the offset given in its doc comment.
#[repr(C, packed)]
#[derive(Debug, Clone, Copy)]
pub struct SetupHeader {
    /// Offset 0x1F1.
    pub setup_sects: u8,
    /// Offset 0x1F2.
    pub root_flags: u16,
    /// Offset 0x1F4.
    pub syssize: u32,
    /// Offset 0x1F8 (obsolete).
    pub ram_size: u16,
    /// Offset 0x1FA.
    pub vid_mode: u16,
    /// Offset 0x1FC.
    pub root_dev: u16,
    /// Offset 0x1FE; should be 0xAA55.
    pub boot_flag: u16,
    /// Offset 0x200.
    pub jump: u16,
    /// Offset 0x202; "HdrS" magic.
    pub header: u32,
    /// Offset 0x206.
    pub version: u16,
    /// Offset 0x208.
    pub realmode_swtch: u32,
    /// Offset 0x20C (obsolete).
    pub start_sys_seg: u16,
    /// Offset 0x20E.
    pub kernel_version: u16,
    /// Offset 0x210.
    pub type_of_loader: u8,
    /// Offset 0x211.
    pub loadflags: u8,
    /// Offset 0x212.
    pub setup_move_size: u16,
    /// Offset 0x214.
    pub code32_start: u32,
    /// Offset 0x218.
    pub ramdisk_image: u32,
    /// Offset 0x21C.
    pub ramdisk_size: u32,
    /// Offset 0x220.
    pub bootsect_kludge: u32,
    /// Offset 0x224.
    pub heap_end_ptr: u16,
    /// Offset 0x226.
    pub ext_loader_ver: u8,
    /// Offset 0x227.
    pub ext_loader_type: u8,
    /// Offset 0x228.
    pub cmd_line_ptr: u32,
    /// Offset 0x22C.
    pub initrd_addr_max: u32,
    /// Offset 0x230.
    pub kernel_alignment: u32,
    /// Offset 0x234.
    pub relocatable_kernel: u8,
    /// Offset 0x235.
    pub min_alignment: u8,
    /// Offset 0x236.
    pub xloadflags: u16,
    /// Offset 0x238.
    pub cmdline_size: u32,
    /// Offset 0x23C.
    pub hardware_subarch: u32,
    /// Offset 0x240.
    pub hardware_subarch_data: u64,
    /// Offset 0x248.
    pub payload_offset: u32,
    /// Offset 0x24C.
    pub payload_length: u32,
    /// Offset 0x250.
    pub setup_data: u64,
    /// Offset 0x258.
    pub pref_address: u64,
    /// Offset 0x260.
    pub init_size: u32,
    /// Offset 0x264.
    pub handover_offset: u32,
    /// Offset 0x268.
    pub kernel_info_offset: u32,
}
|
||||
|
||||
impl Default for SetupHeader {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
setup_sects: 0,
|
||||
root_flags: 0,
|
||||
syssize: 0,
|
||||
ram_size: 0,
|
||||
vid_mode: 0xFFFF, // VGA normal
|
||||
root_dev: 0,
|
||||
boot_flag: 0xAA55,
|
||||
jump: 0,
|
||||
header: 0x53726448, // "HdrS"
|
||||
version: 0x020F, // Protocol version 2.15
|
||||
realmode_swtch: 0,
|
||||
start_sys_seg: 0,
|
||||
kernel_version: 0,
|
||||
type_of_loader: 0xFF, // Undefined loader
|
||||
loadflags: LOADFLAG_LOADED_HIGH | LOADFLAG_CAN_USE_HEAP,
|
||||
setup_move_size: 0,
|
||||
code32_start: 0x100000, // 1MB
|
||||
ramdisk_image: 0,
|
||||
ramdisk_size: 0,
|
||||
bootsect_kludge: 0,
|
||||
heap_end_ptr: 0,
|
||||
ext_loader_ver: 0,
|
||||
ext_loader_type: 0,
|
||||
cmd_line_ptr: 0,
|
||||
initrd_addr_max: 0x7FFFFFFF,
|
||||
kernel_alignment: 0x200000, // 2MB
|
||||
relocatable_kernel: 1,
|
||||
min_alignment: 21, // 2^21 = 2MB
|
||||
xloadflags: XLF_KERNEL_64 | XLF_CAN_BE_LOADED_ABOVE_4G,
|
||||
cmdline_size: 4096,
|
||||
hardware_subarch: 0, // PC
|
||||
hardware_subarch_data: 0,
|
||||
payload_offset: 0,
|
||||
payload_length: 0,
|
||||
setup_data: 0,
|
||||
pref_address: 0x1000000, // 16MB
|
||||
init_size: 0,
|
||||
handover_offset: 0,
|
||||
kernel_info_offset: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Linux boot protocol constants — kept for completeness

/// `loadflags` bit 0: kernel loaded high (at 0x100000).
#[allow(dead_code)]
pub const LOADFLAG_LOADED_HIGH: u8 = 1 << 0;
/// `loadflags` bit 1: KASLR enabled.
#[allow(dead_code)]
pub const LOADFLAG_KASLR_FLAG: u8 = 1 << 1;
/// `loadflags` bit 5: quiet boot.
#[allow(dead_code)]
pub const LOADFLAG_QUIET_FLAG: u8 = 1 << 5;
/// `loadflags` bit 6: don't reload segments.
#[allow(dead_code)]
pub const LOADFLAG_KEEP_SEGMENTS: u8 = 1 << 6;
/// `loadflags` bit 7: heap available.
#[allow(dead_code)]
pub const LOADFLAG_CAN_USE_HEAP: u8 = 1 << 7;

/// `xloadflags` bit 0: 64-bit kernel.
#[allow(dead_code)]
pub const XLF_KERNEL_64: u16 = 1 << 0;
/// `xloadflags` bit 1: can load above 4GB.
#[allow(dead_code)]
pub const XLF_CAN_BE_LOADED_ABOVE_4G: u16 = 1 << 1;
/// `xloadflags` bit 2: EFI handover 32-bit.
#[allow(dead_code)]
pub const XLF_EFI_HANDOVER_32: u16 = 1 << 2;
/// `xloadflags` bit 3: EFI handover 64-bit.
#[allow(dead_code)]
pub const XLF_EFI_HANDOVER_64: u16 = 1 << 3;
/// `xloadflags` bit 4: EFI kexec.
#[allow(dead_code)]
pub const XLF_EFI_KEXEC: u16 = 1 << 4;

/// Maximum E820 entries in boot_params
#[allow(dead_code)]
pub const E820_MAX_ENTRIES: usize = 128;
|
||||
|
||||
/// Byte offsets (and one size) of fields inside the boot_params structure.
#[allow(dead_code)] // Linux boot protocol offsets — kept for reference
pub mod offsets {
    /// Offset of the one-byte E820 entry count.
    pub const E820_ENTRIES: usize = 0x1E8;

    /// Offset at which `setup_header` starts.
    pub const SETUP_HEADER: usize = 0x1F1;

    /// Offset at which the E820 table starts.
    pub const E820_TABLE: usize = 0x2D0;

    /// Size of one E820 entry: u64 address + u64 size + u32 type.
    pub const E820_ENTRY_SIZE: usize = 8 + 8 + 4;
}
|
||||
|
||||
/// Inputs for building the Linux boot_params structure.
#[derive(Debug, Clone)]
pub struct LinuxBootConfig {
    /// Total guest memory size, in bytes.
    pub memory_size: u64,
    /// Physical address at which the command line string is stored.
    pub cmdline_addr: u64,
    /// Physical address of the initrd, when one is loaded.
    pub initrd_addr: Option<u64>,
    /// Size of the initrd in bytes, when one is loaded.
    pub initrd_size: Option<u64>,
}
|
||||
|
||||
/// Linux boot setup implementation
|
||||
pub struct LinuxBootSetup;
|
||||
|
||||
impl LinuxBootSetup {
|
||||
/// Set up Linux boot_params structure in guest memory
|
||||
///
|
||||
/// This creates the "zero page" that Linux expects when booting in 64-bit mode.
|
||||
/// The boot_params address should be passed to the kernel via RSI register.
|
||||
pub fn setup<M: GuestMemory>(config: &LinuxBootConfig, guest_mem: &mut M) -> Result<u64> {
|
||||
// Allocate and zero the boot_params structure (4KB)
|
||||
let boot_params = vec![0u8; BOOT_PARAMS_SIZE];
|
||||
guest_mem.write_bytes(BOOT_PARAMS_ADDR, &boot_params)?;
|
||||
|
||||
// Build E820 memory map
|
||||
let e820_entries = Self::build_e820_map(config.memory_size)?;
|
||||
|
||||
// Write E820 entry count
|
||||
let e820_count = e820_entries.len() as u8;
|
||||
guest_mem.write_bytes(
|
||||
BOOT_PARAMS_ADDR + offsets::E820_ENTRIES as u64,
|
||||
&[e820_count],
|
||||
)?;
|
||||
|
||||
// Write E820 entries
|
||||
for (i, entry) in e820_entries.iter().enumerate() {
|
||||
let offset = BOOT_PARAMS_ADDR + offsets::E820_TABLE as u64
|
||||
+ (i * offsets::E820_ENTRY_SIZE) as u64;
|
||||
let bytes = unsafe {
|
||||
std::slice::from_raw_parts(
|
||||
entry as *const E820Entry as *const u8,
|
||||
offsets::E820_ENTRY_SIZE,
|
||||
)
|
||||
};
|
||||
guest_mem.write_bytes(offset, bytes)?;
|
||||
}
|
||||
|
||||
// Build and write setup_header
|
||||
let mut header = SetupHeader::default();
|
||||
header.cmd_line_ptr = config.cmdline_addr as u32;
|
||||
|
||||
if let (Some(addr), Some(size)) = (config.initrd_addr, config.initrd_size) {
|
||||
header.ramdisk_image = addr as u32;
|
||||
header.ramdisk_size = size as u32;
|
||||
}
|
||||
|
||||
// Write setup_header to boot_params
|
||||
Self::write_setup_header(guest_mem, &header)?;
|
||||
|
||||
tracing::debug!(
|
||||
"Linux boot_params setup at 0x{:x}: {} E820 entries, cmdline=0x{:x}",
|
||||
BOOT_PARAMS_ADDR,
|
||||
e820_count,
|
||||
config.cmdline_addr
|
||||
);
|
||||
|
||||
Ok(BOOT_PARAMS_ADDR)
|
||||
}
|
||||
|
||||
/// Build E820 memory map for the VM
|
||||
/// Layout matches Firecracker's working E820 configuration
|
||||
fn build_e820_map(memory_size: u64) -> Result<Vec<E820Entry>> {
|
||||
let mut entries = Vec::with_capacity(5);
|
||||
|
||||
if memory_size < layout::HIGH_MEMORY_START {
|
||||
return Err(BootError::MemoryLayout(format!(
|
||||
"Memory size {} is less than minimum required {}",
|
||||
memory_size,
|
||||
layout::HIGH_MEMORY_START
|
||||
)));
|
||||
}
|
||||
|
||||
// EBDA (Extended BIOS Data Area) boundary - Firecracker uses 0x9FC00
|
||||
const EBDA_START: u64 = 0x9FC00;
|
||||
|
||||
// Low memory: 0 to EBDA (usable RAM) - matches Firecracker
|
||||
entries.push(E820Entry::ram(0, EBDA_START));
|
||||
|
||||
// EBDA: Reserved area just below 640KB
|
||||
entries.push(E820Entry::reserved(EBDA_START, layout::LOW_MEMORY_END - EBDA_START));
|
||||
|
||||
// Legacy hole: 640KB to 1MB (reserved for VGA/ROMs)
|
||||
let legacy_hole_size = layout::HIGH_MEMORY_START - layout::LOW_MEMORY_END;
|
||||
entries.push(E820Entry::reserved(layout::LOW_MEMORY_END, legacy_hole_size));
|
||||
|
||||
// High memory: 1MB to end of RAM
|
||||
let high_memory_size = memory_size - layout::HIGH_MEMORY_START;
|
||||
if high_memory_size > 0 {
|
||||
entries.push(E820Entry::ram(layout::HIGH_MEMORY_START, high_memory_size));
|
||||
}
|
||||
|
||||
Ok(entries)
|
||||
}
|
||||
|
||||
/// Write setup_header to boot_params
|
||||
fn write_setup_header<M: GuestMemory>(guest_mem: &mut M, header: &SetupHeader) -> Result<()> {
|
||||
// The setup_header structure is written at offset 0x1F1 within boot_params
|
||||
// We need to write individual fields at their correct offsets
|
||||
|
||||
let base = BOOT_PARAMS_ADDR;
|
||||
|
||||
// 0x1F1: setup_sects
|
||||
guest_mem.write_bytes(base + 0x1F1, &[header.setup_sects])?;
|
||||
// 0x1F2: root_flags
|
||||
guest_mem.write_bytes(base + 0x1F2, &header.root_flags.to_le_bytes())?;
|
||||
// 0x1F4: syssize
|
||||
guest_mem.write_bytes(base + 0x1F4, &header.syssize.to_le_bytes())?;
|
||||
// 0x1FE: boot_flag
|
||||
guest_mem.write_bytes(base + 0x1FE, &header.boot_flag.to_le_bytes())?;
|
||||
// 0x202: header magic
|
||||
guest_mem.write_bytes(base + 0x202, &header.header.to_le_bytes())?;
|
||||
// 0x206: version
|
||||
guest_mem.write_bytes(base + 0x206, &header.version.to_le_bytes())?;
|
||||
// 0x210: type_of_loader
|
||||
guest_mem.write_bytes(base + 0x210, &[header.type_of_loader])?;
|
||||
// 0x211: loadflags
|
||||
guest_mem.write_bytes(base + 0x211, &[header.loadflags])?;
|
||||
// 0x214: code32_start
|
||||
guest_mem.write_bytes(base + 0x214, &header.code32_start.to_le_bytes())?;
|
||||
// 0x218: ramdisk_image
|
||||
guest_mem.write_bytes(base + 0x218, &header.ramdisk_image.to_le_bytes())?;
|
||||
// 0x21C: ramdisk_size
|
||||
guest_mem.write_bytes(base + 0x21C, &header.ramdisk_size.to_le_bytes())?;
|
||||
// 0x224: heap_end_ptr
|
||||
guest_mem.write_bytes(base + 0x224, &header.heap_end_ptr.to_le_bytes())?;
|
||||
// 0x228: cmd_line_ptr
|
||||
guest_mem.write_bytes(base + 0x228, &header.cmd_line_ptr.to_le_bytes())?;
|
||||
// 0x22C: initrd_addr_max
|
||||
guest_mem.write_bytes(base + 0x22C, &header.initrd_addr_max.to_le_bytes())?;
|
||||
// 0x230: kernel_alignment
|
||||
guest_mem.write_bytes(base + 0x230, &header.kernel_alignment.to_le_bytes())?;
|
||||
// 0x234: relocatable_kernel
|
||||
guest_mem.write_bytes(base + 0x234, &[header.relocatable_kernel])?;
|
||||
// 0x236: xloadflags
|
||||
guest_mem.write_bytes(base + 0x236, &header.xloadflags.to_le_bytes())?;
|
||||
// 0x238: cmdline_size
|
||||
guest_mem.write_bytes(base + 0x238, &header.cmdline_size.to_le_bytes())?;
|
||||
// 0x23C: hardware_subarch
|
||||
guest_mem.write_bytes(base + 0x23C, &header.hardware_subarch.to_le_bytes())?;
|
||||
// 0x258: pref_address
|
||||
guest_mem.write_bytes(base + 0x258, &header.pref_address.to_le_bytes())?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    const MIB: u64 = 1024 * 1024;

    /// Flat byte buffer standing in for guest memory.
    struct MockMemory {
        size: u64,
        data: Vec<u8>,
    }

    impl MockMemory {
        /// Create a zero-filled buffer of `size` bytes.
        fn new(size: u64) -> Self {
            Self {
                size,
                data: vec![0; size as usize],
            }
        }

        /// Borrow `len` bytes starting at `addr`.
        fn read_bytes(&self, addr: u64, len: usize) -> &[u8] {
            let start = addr as usize;
            &self.data[start..start + len]
        }
    }

    impl GuestMemory for MockMemory {
        fn write_bytes(&mut self, addr: u64, data: &[u8]) -> Result<()> {
            let start = addr as usize;
            let end = start + data.len();
            if end > self.data.len() {
                return Err(BootError::GuestMemoryWrite(format!(
                    "Write at {:#x} exceeds memory",
                    addr
                )));
            }
            self.data[start..end].copy_from_slice(data);
            Ok(())
        }

        fn size(&self) -> u64 {
            self.size
        }
    }

    #[test]
    fn test_e820_entry_size() {
        assert_eq!(std::mem::size_of::<E820Entry>(), 20);
    }

    #[test]
    fn test_linux_boot_setup() {
        let mut mem = MockMemory::new(128 * MIB);
        let config = LinuxBootConfig {
            memory_size: 128 * MIB,
            cmdline_addr: layout::CMDLINE_ADDR,
            initrd_addr: None,
            initrd_size: None,
        };

        let result = LinuxBootSetup::setup(&config, &mut mem);
        assert!(result.is_ok());
        assert_eq!(result.unwrap(), BOOT_PARAMS_ADDR);

        // boot_flag lives at 0x1FE.
        let raw = mem.read_bytes(BOOT_PARAMS_ADDR + 0x1FE, 2);
        assert_eq!(u16::from_le_bytes([raw[0], raw[1]]), 0xAA55);

        // Header magic "HdrS" lives at 0x202.
        let raw = mem.read_bytes(BOOT_PARAMS_ADDR + 0x202, 4);
        assert_eq!(
            u32::from_le_bytes([raw[0], raw[1], raw[2], raw[3]]),
            0x53726448
        );

        // At least three E820 entries must have been recorded.
        let e820_count = mem.read_bytes(BOOT_PARAMS_ADDR + offsets::E820_ENTRIES as u64, 1)[0];
        assert!(e820_count >= 3);
    }

    #[test]
    fn test_e820_map() {
        let entries = LinuxBootSetup::build_e820_map(256 * MIB).unwrap();

        // Expected (start address, type) per entry: low RAM (0..EBDA), EBDA
        // reserved, legacy hole (640K-1M), high RAM.
        let expected = [
            (0u64, E820Type::Ram),
            (0x9FC00, E820Type::Reserved), // EBDA_START
            (layout::LOW_MEMORY_END, E820Type::Reserved),
            (layout::HIGH_MEMORY_START, E820Type::Ram),
        ];
        assert_eq!(entries.len(), expected.len());

        for (entry, &(want_addr, want_type)) in entries.iter().zip(expected.iter()) {
            // Copy fields out of the packed struct to avoid unaligned references.
            let addr = entry.addr;
            let entry_type = entry.entry_type;
            assert_eq!(addr, want_addr);
            assert_eq!(entry_type, want_type as u32);
        }
    }
}
|
||||
576
vmm/src/boot/loader.rs
Normal file
576
vmm/src/boot/loader.rs
Normal file
@@ -0,0 +1,576 @@
|
||||
//! Kernel Loader
|
||||
//!
|
||||
//! Loads Linux kernels in ELF64 or bzImage format directly into guest memory.
|
||||
//! Supports PVH boot protocol for fastest possible boot times.
|
||||
//!
|
||||
//! # Kernel Formats
|
||||
//!
|
||||
//! ## ELF64 (vmlinux)
|
||||
//! - Uncompressed kernel with ELF headers
|
||||
//! - Direct load to specified address
|
||||
//! - Entry point from ELF header
|
||||
//!
|
||||
//! ## bzImage
|
||||
//! - Compressed kernel with setup header
|
||||
//! - Requires parsing setup header for entry point
|
||||
//! - Kernel loaded after setup sectors
|
||||
|
||||
use super::{layout, BootError, GuestMemory, Result};
|
||||
use std::fs::File;
|
||||
use std::io::Read;
|
||||
use std::path::Path;
|
||||
|
||||
/// ELF magic number
|
||||
const ELF_MAGIC: [u8; 4] = [0x7f, b'E', b'L', b'F'];
|
||||
|
||||
/// bzImage magic number at offset 0x202
|
||||
const BZIMAGE_MAGIC: u32 = 0x53726448; // "HdrS"
|
||||
|
||||
/// Minimum boot protocol version for PVH
|
||||
const MIN_BOOT_PROTOCOL_VERSION: u16 = 0x0200;
|
||||
|
||||
/// Byte offsets and flag values of the bzImage setup header.
///
/// All `*_OFFSET` values are offsets from the start of the image file.
#[allow(dead_code)] // Linux bzImage protocol constants — kept for completeness
mod bzimage {
    /// Offset of the 4-byte header magic ("HdrS")
    pub const HEADER_MAGIC_OFFSET: usize = 0x202;
    /// Offset of the 2-byte boot protocol version
    pub const VERSION_OFFSET: usize = 0x206;
    /// Offset of the kernel version string pointer
    pub const KERNEL_VERSION_OFFSET: usize = 0x20e;
    /// Offset of the 1-byte setup sector count
    pub const SETUP_SECTS_OFFSET: usize = 0x1f1;
    /// Setup header size (minimum)
    pub const SETUP_HEADER_SIZE: usize = 0x0202;
    /// Size of one setup sector in bytes
    pub const SECTOR_SIZE: usize = 512;
    /// Sector count to assume when the `setup_sects` field is 0
    pub const DEFAULT_SETUP_SECTS: u8 = 4;
    /// Offset of the 2-byte boot flag
    pub const BOOT_FLAG_OFFSET: usize = 0x1fe;
    /// Value the boot flag must hold
    pub const BOOT_FLAG_VALUE: u16 = 0xaa55;
    /// Real mode kernel header size
    pub const REAL_MODE_HEADER_SIZE: usize = 0x8000;
    /// Offset of the 1-byte `loadflags` field
    pub const LOADFLAGS_OFFSET: usize = 0x211;
    /// Loadflag: kernel is loaded high (at 0x100000)
    pub const LOADFLAG_LOADED_HIGH: u8 = 0x01;
    /// Loadflag: can use heap
    pub const LOADFLAG_CAN_USE_HEAP: u8 = 0x80;
    /// Offset of the 4-byte `code32_start` entry address
    pub const CODE32_START_OFFSET: usize = 0x214;
    /// Offset of the kernel alignment field
    pub const KERNEL_ALIGNMENT_OFFSET: usize = 0x230;
    /// Offset of the 8-byte preferred load address
    pub const PREF_ADDRESS_OFFSET: usize = 0x258;
    /// Offset of the 2-byte `xloadflags` field
    pub const XLOADFLAGS_OFFSET: usize = 0x236;
    /// XLoadflag: kernel has the 64-bit entry point at offset 0x200
    /// (this is bit 0; it is not the EFI handover flag)
    pub const XLF_KERNEL_64: u16 = 0x0001;
    /// XLoadflag: can be loaded above 4GB
    pub const XLF_CAN_BE_LOADED_ABOVE_4G: u16 = 0x0002;
}
|
||||
|
||||
/// Image format reported by kernel type detection.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum KernelType {
    /// Uncompressed ELF64 image (vmlinux)
    Elf64,
    /// Compressed bzImage with a setup header
    BzImage,
}
|
||||
|
||||
/// Inputs for loading a kernel image.
#[derive(Debug, Clone)]
pub struct KernelConfig {
    /// Filesystem path of the kernel image to read
    pub path: String,
    /// Guest address to load the kernel at (typically 1MB)
    pub load_addr: u64,
}
|
||||
|
||||
/// Result of kernel loading
|
||||
#[derive(Debug, Clone)]
|
||||
#[allow(dead_code)]
|
||||
pub struct KernelLoadResult {
|
||||
/// Address where kernel was loaded
|
||||
pub load_addr: u64,
|
||||
/// Total size of loaded kernel
|
||||
pub size: u64,
|
||||
/// Entry point address
|
||||
pub entry_point: u64,
|
||||
/// Detected kernel type
|
||||
pub kernel_type: KernelType,
|
||||
}
|
||||
|
||||
/// Stateless namespace for the kernel loading routines.
pub struct KernelLoader;
|
||||
|
||||
impl KernelLoader {
|
||||
/// Load a kernel image into guest memory
|
||||
///
|
||||
/// Automatically detects kernel format (ELF64 or bzImage) and loads
|
||||
/// appropriately for PVH boot.
|
||||
pub fn load<M: GuestMemory>(config: &KernelConfig, guest_mem: &mut M) -> Result<KernelLoadResult> {
|
||||
let kernel_data = Self::read_kernel_file(&config.path)?;
|
||||
|
||||
// Detect kernel type
|
||||
let kernel_type = Self::detect_kernel_type(&kernel_data)?;
|
||||
|
||||
match kernel_type {
|
||||
KernelType::Elf64 => Self::load_elf64(&kernel_data, config.load_addr, guest_mem),
|
||||
KernelType::BzImage => Self::load_bzimage(&kernel_data, config.load_addr, guest_mem),
|
||||
}
|
||||
}
|
||||
|
||||
/// Read kernel file into memory
|
||||
///
|
||||
/// Pre-allocates the buffer to the file size to avoid reallocation
|
||||
/// during read. For a 21MB kernel this saves ~2ms of Vec growth.
|
||||
fn read_kernel_file(path: &str) -> Result<Vec<u8>> {
|
||||
let path = Path::new(path);
|
||||
let mut file = File::open(path).map_err(BootError::KernelRead)?;
|
||||
|
||||
let file_size = file.metadata()
|
||||
.map_err(BootError::KernelRead)?
|
||||
.len() as usize;
|
||||
|
||||
if file_size == 0 {
|
||||
return Err(BootError::InvalidKernel("Kernel file is empty".into()));
|
||||
}
|
||||
|
||||
let mut data = Vec::with_capacity(file_size);
|
||||
file.read_to_end(&mut data).map_err(BootError::KernelRead)?;
|
||||
|
||||
Ok(data)
|
||||
}
|
||||
|
||||
/// Detect kernel type from magic numbers
|
||||
fn detect_kernel_type(data: &[u8]) -> Result<KernelType> {
|
||||
if data.len() < 4 {
|
||||
return Err(BootError::InvalidKernel("Kernel image too small".into()));
|
||||
}
|
||||
|
||||
// Check for ELF magic
|
||||
if data[0..4] == ELF_MAGIC {
|
||||
// Verify it's ELF64
|
||||
if data.len() < 5 || data[4] != 2 {
|
||||
return Err(BootError::InvalidElf(
|
||||
"Only ELF64 kernels are supported".into(),
|
||||
));
|
||||
}
|
||||
return Ok(KernelType::Elf64);
|
||||
}
|
||||
|
||||
// Check for bzImage magic
|
||||
if data.len() >= bzimage::HEADER_MAGIC_OFFSET + 4 {
|
||||
let magic = u32::from_le_bytes([
|
||||
data[bzimage::HEADER_MAGIC_OFFSET],
|
||||
data[bzimage::HEADER_MAGIC_OFFSET + 1],
|
||||
data[bzimage::HEADER_MAGIC_OFFSET + 2],
|
||||
data[bzimage::HEADER_MAGIC_OFFSET + 3],
|
||||
]);
|
||||
|
||||
if magic == BZIMAGE_MAGIC || (magic & 0xffff) == (BZIMAGE_MAGIC & 0xffff) {
|
||||
return Ok(KernelType::BzImage);
|
||||
}
|
||||
}
|
||||
|
||||
Err(BootError::InvalidKernel(
|
||||
"Unknown kernel format (expected ELF64 or bzImage)".into(),
|
||||
))
|
||||
}
|
||||
|
||||
/// Load ELF64 kernel (vmlinux)
|
||||
///
|
||||
/// # Warning: vmlinux Direct Boot Limitations
|
||||
///
|
||||
/// Loading vmlinux ELF directly has a fundamental limitation: the kernel's
|
||||
/// `__startup_64()` function builds its own page tables that ONLY map the
|
||||
/// kernel text region. After the CR3 switch, low memory (0-16MB) is unmapped,
|
||||
/// causing faults when accessing boot_params or any low memory address.
|
||||
///
|
||||
/// **Recommended**: Use bzImage format instead, which includes a decompressor
|
||||
/// that properly sets up full identity mapping for all memory.
|
||||
///
|
||||
/// See `docs/kernel-pagetable-analysis.md` for detailed analysis.
|
||||
fn load_elf64<M: GuestMemory>(
|
||||
data: &[u8],
|
||||
load_addr: u64,
|
||||
guest_mem: &mut M,
|
||||
) -> Result<KernelLoadResult> {
|
||||
// CRITICAL WARNING: vmlinux direct boot may fail
|
||||
tracing::warn!(
|
||||
"Loading vmlinux ELF directly. This may fail due to kernel page table setup. \
|
||||
The kernel's __startup_64() builds its own page tables that don't map low memory. \
|
||||
Consider using bzImage format for reliable boot."
|
||||
);
|
||||
|
||||
// Parse ELF header
|
||||
let elf = Elf64Header::parse(data)?;
|
||||
|
||||
// Validate it's an executable
|
||||
if elf.e_type != 2 {
|
||||
// ET_EXEC
|
||||
return Err(BootError::InvalidElf("Not an executable ELF".into()));
|
||||
}
|
||||
|
||||
// Validate machine type (x86_64 = 62)
|
||||
if elf.e_machine != 62 {
|
||||
return Err(BootError::InvalidElf(format!(
|
||||
"Unsupported machine type: {} (expected x86_64)",
|
||||
elf.e_machine
|
||||
)));
|
||||
}
|
||||
|
||||
let mut kernel_end = load_addr;
|
||||
|
||||
// Load program headers
|
||||
for i in 0..elf.e_phnum {
|
||||
let ph_offset = elf.e_phoff as usize + (i as usize * elf.e_phentsize as usize);
|
||||
let ph = Elf64ProgramHeader::parse(&data[ph_offset..])?;
|
||||
|
||||
// Only load PT_LOAD segments
|
||||
if ph.p_type != 1 {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Calculate destination address
|
||||
// For PVH, we load at the physical address specified in the ELF
|
||||
// or offset from our load address
|
||||
let dest_addr = if ph.p_paddr >= layout::HIGH_MEMORY_START {
|
||||
ph.p_paddr
|
||||
} else {
|
||||
load_addr + ph.p_paddr
|
||||
};
|
||||
|
||||
// Validate we have space
|
||||
if dest_addr + ph.p_memsz > guest_mem.size() {
|
||||
return Err(BootError::KernelTooLarge {
|
||||
size: dest_addr + ph.p_memsz,
|
||||
available: guest_mem.size(),
|
||||
});
|
||||
}
|
||||
|
||||
// Load file contents
|
||||
let file_start = ph.p_offset as usize;
|
||||
let file_end = file_start + ph.p_filesz as usize;
|
||||
if file_end > data.len() {
|
||||
return Err(BootError::InvalidElf("Program header exceeds file size".into()));
|
||||
}
|
||||
|
||||
guest_mem.write_bytes(dest_addr, &data[file_start..file_end])?;
|
||||
|
||||
// Zero BSS (memsz > filesz)
|
||||
if ph.p_memsz > ph.p_filesz {
|
||||
let bss_start = dest_addr + ph.p_filesz;
|
||||
let bss_size = (ph.p_memsz - ph.p_filesz) as usize;
|
||||
let zeros = vec![0u8; bss_size];
|
||||
guest_mem.write_bytes(bss_start, &zeros)?;
|
||||
}
|
||||
|
||||
kernel_end = kernel_end.max(dest_addr + ph.p_memsz);
|
||||
|
||||
tracing::debug!(
|
||||
"Loaded ELF segment: dest=0x{:x}, filesz=0x{:x}, memsz=0x{:x}",
|
||||
dest_addr,
|
||||
ph.p_filesz,
|
||||
ph.p_memsz
|
||||
);
|
||||
}
|
||||
|
||||
tracing::debug!(
|
||||
"ELF kernel loaded: entry=0x{:x}, kernel_end=0x{:x}",
|
||||
elf.e_entry,
|
||||
kernel_end
|
||||
);
|
||||
|
||||
// For vmlinux ELF, the e_entry is the physical entry point.
|
||||
// But the kernel code is compiled for the virtual address.
|
||||
// We map both identity (physical) and high-kernel (virtual) addresses,
|
||||
// but it's better to use the physical entry for startup_64 which is
|
||||
// designed to run with identity mapping first.
|
||||
//
|
||||
// However, if the kernel immediately triple-faults at the physical address,
|
||||
// we can try the virtual address instead.
|
||||
// Virtual address = 0xFFFFFFFF80000000 + (physical - 0x1000000) + offset_within_text
|
||||
// For entry at physical 0x1000000, virtual would be 0xFFFFFFFF81000000
|
||||
let virtual_entry = 0xFFFFFFFF81000000u64 + (elf.e_entry - 0x1000000);
|
||||
|
||||
tracing::debug!(
|
||||
"Entry points: physical=0x{:x}, virtual=0x{:x}",
|
||||
elf.e_entry, virtual_entry
|
||||
);
|
||||
|
||||
Ok(KernelLoadResult {
|
||||
load_addr,
|
||||
size: kernel_end - load_addr,
|
||||
// Use PHYSICAL entry point - kernel's startup_64 expects identity mapping
|
||||
entry_point: elf.e_entry,
|
||||
kernel_type: KernelType::Elf64,
|
||||
})
|
||||
}
|
||||
|
||||
/// Load bzImage kernel
|
||||
fn load_bzimage<M: GuestMemory>(
|
||||
data: &[u8],
|
||||
load_addr: u64,
|
||||
guest_mem: &mut M,
|
||||
) -> Result<KernelLoadResult> {
|
||||
// Validate minimum size
|
||||
if data.len() < bzimage::SETUP_HEADER_SIZE + bzimage::SECTOR_SIZE {
|
||||
return Err(BootError::InvalidBzImage("Image too small".into()));
|
||||
}
|
||||
|
||||
// Check boot flag
|
||||
let boot_flag = u16::from_le_bytes([
|
||||
data[bzimage::BOOT_FLAG_OFFSET],
|
||||
data[bzimage::BOOT_FLAG_OFFSET + 1],
|
||||
]);
|
||||
if boot_flag != bzimage::BOOT_FLAG_VALUE {
|
||||
return Err(BootError::InvalidBzImage(format!(
|
||||
"Invalid boot flag: {:#x}",
|
||||
boot_flag
|
||||
)));
|
||||
}
|
||||
|
||||
// Get boot protocol version
|
||||
let version = u16::from_le_bytes([
|
||||
data[bzimage::VERSION_OFFSET],
|
||||
data[bzimage::VERSION_OFFSET + 1],
|
||||
]);
|
||||
if version < MIN_BOOT_PROTOCOL_VERSION {
|
||||
return Err(BootError::UnsupportedVersion(format!(
|
||||
"Boot protocol {}.{} is too old (minimum 2.0)",
|
||||
version >> 8,
|
||||
version & 0xff
|
||||
)));
|
||||
}
|
||||
|
||||
// Get setup sectors count
|
||||
let mut setup_sects = data[bzimage::SETUP_SECTS_OFFSET];
|
||||
if setup_sects == 0 {
|
||||
setup_sects = bzimage::DEFAULT_SETUP_SECTS;
|
||||
}
|
||||
|
||||
// Calculate kernel offset (setup sectors + boot sector)
|
||||
let setup_size = (setup_sects as usize + 1) * bzimage::SECTOR_SIZE;
|
||||
if setup_size >= data.len() {
|
||||
return Err(BootError::InvalidBzImage(
|
||||
"Setup size exceeds image size".into(),
|
||||
));
|
||||
}
|
||||
|
||||
// Get loadflags
|
||||
let loadflags = data[bzimage::LOADFLAGS_OFFSET];
|
||||
let loaded_high = (loadflags & bzimage::LOADFLAG_LOADED_HIGH) != 0;
|
||||
|
||||
// For modern kernels (protocol >= 2.0), get code32 entry point
|
||||
let code32_start = if version >= 0x0200 {
|
||||
u32::from_le_bytes([
|
||||
data[bzimage::CODE32_START_OFFSET],
|
||||
data[bzimage::CODE32_START_OFFSET + 1],
|
||||
data[bzimage::CODE32_START_OFFSET + 2],
|
||||
data[bzimage::CODE32_START_OFFSET + 3],
|
||||
])
|
||||
} else {
|
||||
0x100000 // Default high load address
|
||||
};
|
||||
|
||||
// Check for 64-bit support (protocol >= 2.11)
|
||||
let supports_64bit = if version >= 0x020b {
|
||||
let xloadflags = u16::from_le_bytes([
|
||||
data[bzimage::XLOADFLAGS_OFFSET],
|
||||
data[bzimage::XLOADFLAGS_OFFSET + 1],
|
||||
]);
|
||||
(xloadflags & bzimage::XLF_KERNEL_64) != 0
|
||||
} else {
|
||||
false
|
||||
};
|
||||
|
||||
// Get preferred load address (protocol >= 2.10)
|
||||
let pref_address = if version >= 0x020a && data.len() > bzimage::PREF_ADDRESS_OFFSET + 8 {
|
||||
u64::from_le_bytes([
|
||||
data[bzimage::PREF_ADDRESS_OFFSET],
|
||||
data[bzimage::PREF_ADDRESS_OFFSET + 1],
|
||||
data[bzimage::PREF_ADDRESS_OFFSET + 2],
|
||||
data[bzimage::PREF_ADDRESS_OFFSET + 3],
|
||||
data[bzimage::PREF_ADDRESS_OFFSET + 4],
|
||||
data[bzimage::PREF_ADDRESS_OFFSET + 5],
|
||||
data[bzimage::PREF_ADDRESS_OFFSET + 6],
|
||||
data[bzimage::PREF_ADDRESS_OFFSET + 7],
|
||||
])
|
||||
} else {
|
||||
layout::KERNEL_LOAD_ADDR
|
||||
};
|
||||
|
||||
// Determine actual load address
|
||||
let actual_load_addr = if loaded_high {
|
||||
if pref_address != 0 {
|
||||
pref_address
|
||||
} else {
|
||||
load_addr
|
||||
}
|
||||
} else {
|
||||
load_addr
|
||||
};
|
||||
|
||||
// Extract protected mode kernel
|
||||
let kernel_data = &data[setup_size..];
|
||||
let kernel_size = kernel_data.len() as u64;
|
||||
|
||||
// Validate size
|
||||
if actual_load_addr + kernel_size > guest_mem.size() {
|
||||
return Err(BootError::KernelTooLarge {
|
||||
size: kernel_size,
|
||||
available: guest_mem.size() - actual_load_addr,
|
||||
});
|
||||
}
|
||||
|
||||
// Write kernel to guest memory
|
||||
guest_mem.write_bytes(actual_load_addr, kernel_data)?;
|
||||
|
||||
// Determine entry point
|
||||
// For PVH boot, we enter at the 64-bit entry point
|
||||
// which is typically at load_addr + 0x200 for modern kernels
|
||||
let entry_point = if supports_64bit {
|
||||
// 64-bit entry point offset in newer kernels
|
||||
actual_load_addr + 0x200
|
||||
} else {
|
||||
code32_start as u64
|
||||
};
|
||||
|
||||
Ok(KernelLoadResult {
|
||||
load_addr: actual_load_addr,
|
||||
size: kernel_size,
|
||||
entry_point,
|
||||
kernel_type: KernelType::BzImage,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// ELF64 header structure
|
||||
#[derive(Debug, Default)]
|
||||
struct Elf64Header {
|
||||
e_type: u16,
|
||||
e_machine: u16,
|
||||
e_entry: u64,
|
||||
e_phoff: u64,
|
||||
e_phnum: u16,
|
||||
e_phentsize: u16,
|
||||
}
|
||||
|
||||
impl Elf64Header {
|
||||
fn parse(data: &[u8]) -> Result<Self> {
|
||||
if data.len() < 64 {
|
||||
return Err(BootError::InvalidElf("ELF header too small".into()));
|
||||
}
|
||||
|
||||
// Verify ELF magic
|
||||
if &data[0..4] != &ELF_MAGIC {
|
||||
return Err(BootError::InvalidElf("Invalid ELF magic".into()));
|
||||
}
|
||||
|
||||
// Verify 64-bit
|
||||
if data[4] != 2 {
|
||||
return Err(BootError::InvalidElf("Not ELF64".into()));
|
||||
}
|
||||
|
||||
// Verify little-endian
|
||||
if data[5] != 1 {
|
||||
return Err(BootError::InvalidElf("Not little-endian".into()));
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
e_type: u16::from_le_bytes([data[16], data[17]]),
|
||||
e_machine: u16::from_le_bytes([data[18], data[19]]),
|
||||
e_entry: u64::from_le_bytes([
|
||||
data[24], data[25], data[26], data[27],
|
||||
data[28], data[29], data[30], data[31],
|
||||
]),
|
||||
e_phoff: u64::from_le_bytes([
|
||||
data[32], data[33], data[34], data[35],
|
||||
data[36], data[37], data[38], data[39],
|
||||
]),
|
||||
e_phentsize: u16::from_le_bytes([data[54], data[55]]),
|
||||
e_phnum: u16::from_le_bytes([data[56], data[57]]),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// ELF64 program header structure
|
||||
#[derive(Debug, Default)]
|
||||
struct Elf64ProgramHeader {
|
||||
p_type: u32,
|
||||
p_offset: u64,
|
||||
p_paddr: u64,
|
||||
p_filesz: u64,
|
||||
p_memsz: u64,
|
||||
}
|
||||
|
||||
impl Elf64ProgramHeader {
|
||||
fn parse(data: &[u8]) -> Result<Self> {
|
||||
if data.len() < 56 {
|
||||
return Err(BootError::InvalidElf("Program header too small".into()));
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
p_type: u32::from_le_bytes([data[0], data[1], data[2], data[3]]),
|
||||
p_offset: u64::from_le_bytes([
|
||||
data[8], data[9], data[10], data[11],
|
||||
data[12], data[13], data[14], data[15],
|
||||
]),
|
||||
p_paddr: u64::from_le_bytes([
|
||||
data[24], data[25], data[26], data[27],
|
||||
data[28], data[29], data[30], data[31],
|
||||
]),
|
||||
p_filesz: u64::from_le_bytes([
|
||||
data[32], data[33], data[34], data[35],
|
||||
data[36], data[37], data[38], data[39],
|
||||
]),
|
||||
p_memsz: u64::from_le_bytes([
|
||||
data[40], data[41], data[42], data[43],
|
||||
data[44], data[45], data[46], data[47],
|
||||
]),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// ELF magic followed by the ELF64 class byte is detected as `Elf64`.
    #[test]
    fn test_detect_elf_magic() {
        let mut image = vec![0u8; 64];
        image[..4].copy_from_slice(&ELF_MAGIC);
        image[4] = 2; // ELF64

        let detected = KernelLoader::detect_kernel_type(&image);
        assert!(matches!(detected, Ok(KernelType::Elf64)));
    }

    /// A buffer carrying the boot flag and the "HdrS" magic is detected as `BzImage`.
    #[test]
    fn test_detect_bzimage_magic() {
        let mut image = vec![0u8; 0x210];

        // Boot flag, little-endian: 0x55 0xaa
        let flag_at = bzimage::BOOT_FLAG_OFFSET;
        image[flag_at..flag_at + 2].copy_from_slice(&bzimage::BOOT_FLAG_VALUE.to_le_bytes());

        // Header magic
        let magic_at = bzimage::HEADER_MAGIC_OFFSET;
        image[magic_at..magic_at + 4].copy_from_slice(b"HdrS");

        let detected = KernelLoader::detect_kernel_type(&image);
        assert!(matches!(detected, Ok(KernelType::BzImage)));
    }

    /// A buffer with neither magic is rejected as an invalid kernel.
    #[test]
    fn test_invalid_kernel() {
        let image = vec![0u8; 100];
        let detected = KernelLoader::detect_kernel_type(&image);
        assert!(matches!(detected, Err(BootError::InvalidKernel(_))));
    }
}
|
||||
378
vmm/src/boot/mod.rs
Normal file
378
vmm/src/boot/mod.rs
Normal file
@@ -0,0 +1,378 @@
|
||||
//! Volt Boot Loader Module
|
||||
//!
|
||||
//! Implements PVH direct kernel boot for sub-50ms cold boot times.
|
||||
//! Skips BIOS/UEFI entirely by directly loading the kernel into guest memory
|
||||
//! and setting up the boot parameters.
|
||||
//!
|
||||
//! # Boot Protocol
|
||||
//!
|
||||
//! Volt uses the PVH boot protocol (Xen-compatible) which allows direct
|
||||
//! kernel entry without firmware. This is significantly faster than:
|
||||
//! - Traditional BIOS boot (seconds)
|
||||
//! - Linux boot protocol via SeaBIOS (hundreds of ms)
|
||||
//! - UEFI boot (hundreds of ms)
|
||||
//!
|
||||
//! # Supported Kernel Formats
|
||||
//!
|
||||
//! - ELF64 (vmlinux) - Direct kernel image
|
||||
//! - bzImage - Compressed Linux kernel with setup header
|
||||
//!
|
||||
//! # Memory Layout (typical)
|
||||
//!
|
||||
//! ```text
|
||||
//! 0x0000_0000 - 0x0000_04FF : Reserved (real mode IVT, BDA)
//! 0x0000_0500 - 0x0000_052F : GDT
//! 0x0000_1000 - 0x0000_9FFF : Page tables
//! 0x0002_0000               : boot_params (Linux zero page)
//! 0x0002_1000               : PVH start_info structure
//! 0x0002_2000               : E820 memory map
//! 0x0003_0000               : Boot command line
//! 0x0010_0000 - ...         : Kernel load address (1MB)
//! ...         - RAM_END     : Initrd (loaded at high memory)
|
||||
//! ```
|
||||
|
||||
mod gdt;
|
||||
mod initrd;
|
||||
mod linux;
|
||||
mod loader;
|
||||
pub mod mptable;
|
||||
mod pagetable;
|
||||
#[allow(dead_code)] // PVH boot protocol — planned feature, not yet wired up
|
||||
mod pvh;
|
||||
|
||||
pub use gdt::GdtSetup;
|
||||
pub use initrd::{InitrdConfig, InitrdLoader};
|
||||
pub use linux::LinuxBootSetup;
|
||||
pub use loader::{KernelConfig, KernelLoader};
|
||||
pub use mptable::setup_mptable;
|
||||
pub use pagetable::PageTableSetup;
|
||||
|
||||
use std::io;
|
||||
use thiserror::Error;
|
||||
|
||||
/// Boot loader errors
|
||||
#[derive(Error, Debug)]
|
||||
pub enum BootError {
|
||||
#[error("Failed to read kernel image: {0}")]
|
||||
KernelRead(#[source] io::Error),
|
||||
|
||||
#[error("Failed to read initrd: {0}")]
|
||||
InitrdRead(#[source] io::Error),
|
||||
|
||||
#[error("Invalid kernel format: {0}")]
|
||||
InvalidKernel(String),
|
||||
|
||||
#[error("Invalid bzImage: {0}")]
|
||||
InvalidBzImage(String),
|
||||
|
||||
#[error("Invalid ELF kernel: {0}")]
|
||||
InvalidElf(String),
|
||||
|
||||
#[error("Kernel too large: {size} bytes exceeds available memory {available}")]
|
||||
KernelTooLarge { size: u64, available: u64 },
|
||||
|
||||
#[error("Initrd too large: {size} bytes exceeds available memory {available}")]
|
||||
InitrdTooLarge { size: u64, available: u64 },
|
||||
|
||||
#[error("Command line too long: {len} bytes exceeds maximum {max}")]
|
||||
CommandLineTooLong { len: usize, max: usize },
|
||||
|
||||
#[error("Memory layout error: {0}")]
|
||||
MemoryLayout(String),
|
||||
|
||||
#[error("Failed to write to guest memory: {0}")]
|
||||
GuestMemoryWrite(String),
|
||||
|
||||
#[error("PVH setup failed: {0}")]
|
||||
#[allow(dead_code)] // PVH boot path planned
|
||||
PvhSetup(String),
|
||||
|
||||
#[error("Unsupported kernel version: {0}")]
|
||||
UnsupportedVersion(String),
|
||||
}
|
||||
|
||||
pub type Result<T> = std::result::Result<T, BootError>;
|
||||
|
||||
/// Guest-physical addresses of the boot components (x86_64)
///
/// # Memory Layout (designed to avoid page table overlaps)
///
/// For VMs with up to 4GB RAM, page tables can use addresses 0x1000-0xA000.
/// boot_params, start_info, the E820 map and the command line all live at
/// 0x20000 or above so they cannot collide with them.
///
/// ```text
/// 0x0000 - 0x04FF   : Reserved (IVT, BDA)
/// 0x0500 - 0x052F   : GDT (3 entries)
/// 0x1000 - 0x1FFF   : PML4
/// 0x2000 - 0x2FFF   : PDPT_LOW (identity mapping)
/// 0x3000 - 0x3FFF   : PDPT_HIGH (kernel high-half mapping)
/// 0x4000 - 0x7FFF   : PD tables for identity mapping (up to 4 for 4GB)
/// 0x8000 - 0x9FFF   : PD tables for high-half kernel mapping
/// 0xA000 - 0x1FFFF  : Reserved / available
/// 0x20000           : boot_params (Linux zero page) - 4KB
/// 0x21000           : PVH start_info - 4KB
/// 0x22000           : E820 memory map - 4KB
/// 0x30000           : Boot command line - 4KB
/// 0x31000 - 0xFFFFF : Stack and scratch space
/// 0x100000          : Kernel load address (1MB)
/// ```
#[allow(dead_code)] // Memory layout constants — reference for boot protocol
pub mod layout {
    /// First address of the reserved low-memory region
    pub const LOW_MEMORY_START: u64 = 0x0;

    /// First address of the page table area (PML4)
    pub const PAGE_TABLE_START: u64 = 0x1000;

    /// End of the page table area (enough for 4GB + high-half mapping)
    pub const PAGE_TABLE_END: u64 = 0xA000;

    /// Address of the PVH start_info structure
    /// (moved from 0x7000 to 0x21000 to avoid page table overlap with large VMs)
    pub const PVH_START_INFO_ADDR: u64 = 0x21000;

    /// Address of the boot command line (after boot_params at 0x20000)
    pub const CMDLINE_ADDR: u64 = 0x30000;

    /// Maximum command line length, null terminator included
    pub const CMDLINE_MAX_SIZE: usize = 4096;

    /// Address of the E820 memory map
    /// (moved from 0x9000 to 0x22000 to avoid page table overlap with large VMs)
    pub const E820_MAP_ADDR: u64 = 0x22000;

    /// Default kernel load address (1MB, standard for x86_64)
    pub const KERNEL_LOAD_ADDR: u64 = 0x100000;

    /// Minimum gap to keep between kernel and initrd
    pub const KERNEL_INITRD_GAP: u64 = 0x1000;

    /// Size of the EBDA (Extended BIOS Data Area) to reserve
    pub const EBDA_SIZE: u64 = 0x1000;

    /// End of low memory (640KB boundary)
    pub const LOW_MEMORY_END: u64 = 0xA0000;

    /// Start of high memory (1MB)
    pub const HIGH_MEMORY_START: u64 = 0x100000;

    /// Initial stack pointer for boot; sits above the page tables and below
    /// the boot structures
    pub const BOOT_STACK_POINTER: u64 = 0x1FFF0;

    /// PVH entry point - RIP value when starting the VM.
    /// This should point to the kernel entry point.
    pub const PVH_ENTRY_POINT: u64 = KERNEL_LOAD_ADDR;
}
|
||||
|
||||
/// Everything the boot loader needs to stage a guest: kernel, optional
/// initrd, command line and machine size.
#[derive(Debug, Clone)]
#[allow(dead_code)] // Fields set by config but not all read yet
pub struct BootConfig {
    /// Filesystem path of the kernel image
    pub kernel_path: String,

    /// Filesystem path of the initrd/initramfs, if one is used
    pub initrd_path: Option<String>,

    /// Kernel command line
    pub cmdline: String,

    /// Total guest memory size in bytes
    pub memory_size: u64,

    /// Number of vCPUs
    pub vcpu_count: u32,
}

impl Default for BootConfig {
    /// An empty kernel path, no initrd, a serial-console command line,
    /// 128MB of memory and a single vCPU.
    fn default() -> Self {
        const DEFAULT_MEMORY_BYTES: u64 = 128 * 1024 * 1024; // 128MB default

        BootConfig {
            kernel_path: String::new(),
            initrd_path: None,
            cmdline: "console=ttyS0 reboot=k panic=1 pci=off".to_string(),
            memory_size: DEFAULT_MEMORY_BYTES,
            vcpu_count: 1,
        }
    }
}
|
||||
|
||||
/// What boot setup produced: the entry point and initial register values,
/// plus where the kernel and initrd ended up.
#[derive(Debug, Clone)]
#[allow(dead_code)] // All fields are part of the boot result, may not all be read yet
pub struct BootSetupResult {
    /// Kernel entry point (RIP)
    pub entry_point: u64,

    /// Initial stack pointer (RSP)
    pub stack_pointer: u64,

    /// Address of the boot_params structure (RSI for Linux boot protocol)
    pub start_info_addr: u64,

    /// Page table base address (CR3)
    pub cr3: u64,

    /// Guest address the kernel was loaded at
    pub kernel_load_addr: u64,

    /// Size of the loaded kernel
    pub kernel_size: u64,

    /// Guest address the initrd was loaded at, if any
    pub initrd_addr: Option<u64>,

    /// Size of the initrd, if any
    pub initrd_size: Option<u64>,
}
|
||||
|
||||
/// Trait for guest memory access during boot
|
||||
pub trait GuestMemory {
|
||||
/// Write bytes to guest memory at the given address
|
||||
fn write_bytes(&mut self, addr: u64, data: &[u8]) -> Result<()>;
|
||||
|
||||
/// Write a value to guest memory
|
||||
#[allow(dead_code)]
|
||||
fn write_obj<T: Copy>(&mut self, addr: u64, val: &T) -> Result<()> {
|
||||
let bytes = unsafe {
|
||||
std::slice::from_raw_parts(val as *const T as *const u8, std::mem::size_of::<T>())
|
||||
};
|
||||
self.write_bytes(addr, bytes)
|
||||
}
|
||||
|
||||
/// Get the total size of guest memory
|
||||
fn size(&self) -> u64;
|
||||
}
|
||||
|
||||
/// Complete boot loader that orchestrates kernel, initrd, and PVH setup
|
||||
pub struct BootLoader;
|
||||
|
||||
impl BootLoader {
|
||||
/// Load kernel and initrd, set up Linux boot protocol
|
||||
///
|
||||
/// This is the main entry point for boot setup. It:
|
||||
/// 1. Loads the kernel image (ELF or bzImage)
|
||||
/// 2. Loads the initrd if specified
|
||||
/// 3. Sets up the Linux boot_params structure (zero page)
|
||||
/// 4. Writes the command line
|
||||
/// 5. Returns the boot parameters for vCPU initialization
|
||||
pub fn setup<M: GuestMemory>(
|
||||
config: &BootConfig,
|
||||
guest_mem: &mut M,
|
||||
) -> Result<BootSetupResult> {
|
||||
// Validate command line length
|
||||
if config.cmdline.len() >= layout::CMDLINE_MAX_SIZE {
|
||||
return Err(BootError::CommandLineTooLong {
|
||||
len: config.cmdline.len(),
|
||||
max: layout::CMDLINE_MAX_SIZE - 1,
|
||||
});
|
||||
}
|
||||
|
||||
// Load kernel
|
||||
let kernel_config = KernelConfig {
|
||||
path: config.kernel_path.clone(),
|
||||
load_addr: layout::KERNEL_LOAD_ADDR,
|
||||
};
|
||||
let kernel_result = KernelLoader::load(&kernel_config, guest_mem)?;
|
||||
|
||||
// Calculate initrd placement (high memory, after kernel)
|
||||
let initrd_result = if let Some(ref initrd_path) = config.initrd_path {
|
||||
let initrd_config = InitrdConfig {
|
||||
path: initrd_path.clone(),
|
||||
memory_size: config.memory_size,
|
||||
kernel_end: kernel_result.load_addr + kernel_result.size,
|
||||
};
|
||||
Some(InitrdLoader::load(&initrd_config, guest_mem)?)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// Write command line to guest memory
|
||||
let cmdline_bytes = config.cmdline.as_bytes();
|
||||
guest_mem.write_bytes(layout::CMDLINE_ADDR, cmdline_bytes)?;
|
||||
// Null terminator
|
||||
guest_mem.write_bytes(layout::CMDLINE_ADDR + cmdline_bytes.len() as u64, &[0])?;
|
||||
|
||||
// Set up GDT for 64-bit mode
|
||||
GdtSetup::setup(guest_mem)?;
|
||||
|
||||
// Set up identity-mapped page tables for 64-bit mode
|
||||
let cr3 = PageTableSetup::setup(guest_mem, config.memory_size)?;
|
||||
|
||||
// Set up Linux boot_params structure (zero page)
|
||||
let linux_config = linux::LinuxBootConfig {
|
||||
memory_size: config.memory_size,
|
||||
cmdline_addr: layout::CMDLINE_ADDR,
|
||||
initrd_addr: initrd_result.as_ref().map(|r| r.load_addr),
|
||||
initrd_size: initrd_result.as_ref().map(|r| r.size),
|
||||
};
|
||||
let boot_params_addr = LinuxBootSetup::setup(&linux_config, guest_mem)?;
|
||||
|
||||
Ok(BootSetupResult {
|
||||
entry_point: kernel_result.entry_point,
|
||||
stack_pointer: layout::BOOT_STACK_POINTER,
|
||||
start_info_addr: boot_params_addr,
|
||||
cr3,
|
||||
kernel_load_addr: kernel_result.load_addr,
|
||||
kernel_size: kernel_result.size,
|
||||
initrd_addr: initrd_result.as_ref().map(|r| r.load_addr),
|
||||
initrd_size: initrd_result.as_ref().map(|r| r.size),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Vec-backed guest memory for tests.
    struct MockMemory {
        size: u64,
        data: Vec<u8>,
    }

    impl MockMemory {
        /// Create `size` bytes of zeroed guest memory.
        fn new(size: u64) -> Self {
            let data = vec![0; size as usize];
            Self { size, data }
        }
    }

    impl GuestMemory for MockMemory {
        /// Copy `data` to `addr`, failing if the write would run past the end.
        fn write_bytes(&mut self, addr: u64, data: &[u8]) -> Result<()> {
            let start = addr as usize;
            let end = start + data.len();
            match self.data.get_mut(start..end) {
                Some(window) => {
                    window.copy_from_slice(data);
                    Ok(())
                }
                None => Err(BootError::GuestMemoryWrite(format!(
                    "Write at {:#x} with len {} exceeds memory size {}",
                    addr,
                    data.len(),
                    self.size
                ))),
            }
        }

        fn size(&self) -> u64 {
            self.size
        }
    }

    /// The default config uses the serial console and one vCPU.
    #[test]
    fn test_boot_config_default() {
        let defaults = BootConfig::default();
        assert!(defaults.cmdline.contains("console=ttyS0"));
        assert_eq!(defaults.vcpu_count, 1);
    }

    /// An oversized command line is rejected with `CommandLineTooLong`.
    #[test]
    fn test_cmdline_too_long() {
        let mut mem = MockMemory::new(1024 * 1024);
        let oversized = "x".repeat(layout::CMDLINE_MAX_SIZE + 1);
        let config = BootConfig {
            kernel_path: "/boot/vmlinux".into(),
            cmdline: oversized,
            ..Default::default()
        };

        let outcome = BootLoader::setup(&config, &mut mem);
        assert!(matches!(outcome, Err(BootError::CommandLineTooLong { .. })));
    }
}
|
||||
611
vmm/src/boot/mptable.rs
Normal file
611
vmm/src/boot/mptable.rs
Normal file
@@ -0,0 +1,611 @@
|
||||
//! Intel MultiProcessor Specification (MPS) Table Construction
|
||||
//!
|
||||
//! Implements MP Floating Pointer and MP Configuration Table structures
|
||||
//! to advertise SMP topology to the guest kernel. This allows Linux to
|
||||
//! discover and boot Application Processors (APs) beyond the Bootstrap
|
||||
//! Processor (BSP).
|
||||
//!
|
||||
//! # Table Layout (placed at 0x9FC00, just below EBDA)
|
||||
//!
|
||||
//! ```text
|
||||
//! 0x9FC00: MP Floating Pointer Structure (16 bytes)
|
||||
//! 0x9FC10: MP Configuration Table Header (44 bytes)
|
||||
//! 0x9FC3C: Processor Entry 0 (BSP, APIC ID 0) — 20 bytes
|
||||
//! 0x9FC50: Processor Entry 1 (AP, APIC ID 1) — 20 bytes
|
||||
//! ...
|
||||
//! Bus Entry (ISA, 8 bytes)
|
||||
//! I/O APIC Entry (8 bytes)
|
||||
//! I/O Interrupt Entries (IRQ 0-15, 8 bytes each)
|
||||
//! ```
|
||||
//!
|
||||
//! # References
|
||||
//! - Intel MultiProcessor Specification v1.4 (May 1997)
|
||||
//! - Firecracker's mpspec implementation (src/vmm/src/arch/x86_64/mptable.rs)
|
||||
//! - Linux kernel: arch/x86/kernel/mpparse.c
|
||||
|
||||
use super::{BootError, GuestMemory, Result};
|
||||
|
||||
/// Base address for MP tables: the last KiB below the 640KB boundary
/// (0x9FC00-0xA0000). This is a conventional location that Linux scans.
pub const MP_TABLE_START: u64 = 0x9_FC00;

/// Upper bound accepted for the vCPU count (the largest value a `u8` holds).
///
/// This is only a nominal cap. The 1024 bytes between 0x9FC00 and 0xA0000
/// must hold the floating pointer (16), the config table header (44), the
/// bus entry (8), the IOAPIC entry (8) and 16 IRQ entries (128), which leaves
/// 820 bytes = 41 processor entries of 20 bytes each. `setup_mptable`
/// enforces that tighter limit through its own size check.
pub const MAX_CPUS: u8 = u8::MAX;
|
||||
|
||||
// ============================================================================
|
||||
// MP Floating Pointer Structure (16 bytes)
|
||||
// Intel MPS Table 4-1
|
||||
// ============================================================================
|
||||
|
||||
/// Signature that opens the MP Floating Pointer Structure.
const MP_FP_SIGNATURE: [u8; 4] = *b"_MP_";

/// Signature that opens the MP Configuration Table header.
const MP_CT_SIGNATURE: [u8; 4] = *b"PCMP";

/// Revision byte for MP spec 1.4.
const MP_SPEC_REVISION: u8 = 0x04;

/// MP Floating Pointer feature byte 2, bit 7 (IMCRP): IMCR present,
/// i.e. PIC mode is available.
const MP_FEATURE_IMCRP: u8 = 1 << 7;

// ============================================================================
// MP Table Entry Types
// ============================================================================

const MP_ENTRY_PROCESSOR: u8 = 0x00;
const MP_ENTRY_BUS: u8 = 0x01;
const MP_ENTRY_IOAPIC: u8 = 0x02;
const MP_ENTRY_IO_INTERRUPT: u8 = 0x03;
#[allow(dead_code)]
const MP_ENTRY_LOCAL_INTERRUPT: u8 = 0x04;

// Processor entry flags (bit 0 = enabled, bit 1 = bootstrap processor)
const CPU_FLAG_ENABLED: u8 = 1 << 0;
const CPU_FLAG_BSP: u8 = 1 << 1;

// Interrupt types
/// Vectored interrupt
const INT_TYPE_INT: u8 = 0x00;
#[allow(dead_code)]
const INT_TYPE_NMI: u8 = 0x01;
#[allow(dead_code)]
const INT_TYPE_SMI: u8 = 0x02;
/// ExtINT (from 8259)
const INT_TYPE_EXTINT: u8 = 0x03;

// Interrupt polarity/trigger flags
/// Polarity and trigger mode conform to the bus specification.
const INT_FLAG_DEFAULT: u16 = 0;

// I/O APIC default address
const IOAPIC_DEFAULT_ADDR: u32 = 0xFEC0_0000;

/// ISA bus type string, space-padded to the 6-byte field width.
const BUS_TYPE_ISA: [u8; 6] = *b"ISA   ";
|
||||
|
||||
// ============================================================================
|
||||
// MP Table Builder
|
||||
// ============================================================================
|
||||
|
||||
/// Write MP tables to guest memory for SMP discovery.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `guest_mem` — Guest memory to write the tables into
|
||||
/// * `num_cpus` — Number of vCPUs (1-255)
|
||||
///
|
||||
/// # Returns
|
||||
/// The guest physical address where the MP Floating Pointer was written.
|
||||
pub fn setup_mptable<M: GuestMemory>(guest_mem: &mut M, num_cpus: u8) -> Result<u64> {
|
||||
if num_cpus == 0 {
|
||||
return Err(BootError::MemoryLayout(
|
||||
"MP table requires at least 1 CPU".to_string(),
|
||||
));
|
||||
}
|
||||
if num_cpus > MAX_CPUS {
|
||||
return Err(BootError::MemoryLayout(format!(
|
||||
"MP table supports at most {} CPUs, got {}",
|
||||
MAX_CPUS, num_cpus
|
||||
)));
|
||||
}
|
||||
|
||||
// Calculate sizes and offsets
|
||||
let fp_size: u64 = 16; // MP Floating Pointer
|
||||
let header_size: u64 = 44; // MP Config Table Header
|
||||
let processor_entry_size: u64 = 20;
|
||||
let bus_entry_size: u64 = 8;
|
||||
let ioapic_entry_size: u64 = 8;
|
||||
let io_int_entry_size: u64 = 8;
|
||||
|
||||
// Number of IO interrupt entries: IRQ 0-15 = 16 entries
|
||||
let num_irqs: u64 = 16;
|
||||
|
||||
let config_table_addr = MP_TABLE_START + fp_size;
|
||||
let _entries_start = config_table_addr + header_size;
|
||||
|
||||
// Calculate total config table size (header + all entries)
|
||||
let total_entries_size = (num_cpus as u64) * processor_entry_size
|
||||
+ bus_entry_size
|
||||
+ ioapic_entry_size
|
||||
+ num_irqs * io_int_entry_size;
|
||||
let config_table_size = header_size + total_entries_size;
|
||||
|
||||
// Verify we fit in the available space (between 0x9FC00 and 0xA0000)
|
||||
let total_size = fp_size + config_table_size;
|
||||
if MP_TABLE_START + total_size > 0xA0000 {
|
||||
return Err(BootError::MemoryLayout(format!(
|
||||
"MP tables ({} bytes) exceed available space (0x9FC00-0xA0000)",
|
||||
total_size
|
||||
)));
|
||||
}
|
||||
|
||||
// Verify we have enough guest memory
|
||||
if MP_TABLE_START + total_size > guest_mem.size() {
|
||||
return Err(BootError::MemoryLayout(format!(
|
||||
"MP tables at 0x{:x} exceed guest memory size 0x{:x}",
|
||||
MP_TABLE_START + total_size,
|
||||
guest_mem.size()
|
||||
)));
|
||||
}
|
||||
|
||||
// Build the MP Configuration Table body (entries)
|
||||
let mut table_buf = Vec::with_capacity(config_table_size as usize);
|
||||
|
||||
// Leave space for the header (we'll fill it after computing checksum)
|
||||
table_buf.resize(header_size as usize, 0);
|
||||
|
||||
// ---- Processor Entries ----
|
||||
let mut entry_count: u16 = 0;
|
||||
|
||||
for cpu_id in 0..num_cpus {
|
||||
let flags = if cpu_id == 0 {
|
||||
CPU_FLAG_ENABLED | CPU_FLAG_BSP
|
||||
} else {
|
||||
CPU_FLAG_ENABLED
|
||||
};
|
||||
|
||||
// CPU signature: Family 6, Model 15 (Core 2 / Merom-class)
|
||||
// This is a safe generic modern x86_64 signature
|
||||
let cpu_signature: u32 = (6 << 8) | (15 << 4) | 1; // Family=6, Model=F, Stepping=1
|
||||
let feature_flags: u32 = 0x0781_FBFF; // Common feature flags (FPU, SSE, SSE2, etc.)
|
||||
|
||||
write_processor_entry(
|
||||
&mut table_buf,
|
||||
cpu_id, // Local APIC ID
|
||||
0x14, // Local APIC version (integrated APIC)
|
||||
flags,
|
||||
cpu_signature,
|
||||
feature_flags,
|
||||
);
|
||||
entry_count += 1;
|
||||
}
|
||||
|
||||
// ---- Bus Entry (ISA) ----
|
||||
write_bus_entry(&mut table_buf, 0, &BUS_TYPE_ISA);
|
||||
entry_count += 1;
|
||||
|
||||
// ---- I/O APIC Entry ----
|
||||
// I/O APIC ID = num_cpus (first ID after all processors)
|
||||
let ioapic_id = num_cpus;
|
||||
write_ioapic_entry(&mut table_buf, ioapic_id, 0x11, IOAPIC_DEFAULT_ADDR);
|
||||
entry_count += 1;
|
||||
|
||||
// ---- I/O Interrupt Assignment Entries ----
|
||||
// Map ISA IRQs 0-15 to IOAPIC pins 0-15
|
||||
|
||||
// IRQ 0: ExtINT (8259 cascade through IOAPIC pin 0)
|
||||
write_io_interrupt_entry(
|
||||
&mut table_buf,
|
||||
INT_TYPE_EXTINT,
|
||||
INT_FLAG_DEFAULT,
|
||||
0, // source bus = ISA
|
||||
0, // source bus IRQ = 0
|
||||
ioapic_id,
|
||||
0, // IOAPIC pin 0
|
||||
);
|
||||
entry_count += 1;
|
||||
|
||||
// IRQs 1-15: Standard vectored interrupts
|
||||
for irq in 1..16u8 {
|
||||
// IRQ 2 is the PIC cascade — skip it (Linux doesn't use it in APIC mode)
|
||||
// But we still report it for completeness
|
||||
write_io_interrupt_entry(
|
||||
&mut table_buf,
|
||||
INT_TYPE_INT,
|
||||
INT_FLAG_DEFAULT,
|
||||
0, // source bus = ISA
|
||||
irq, // source bus IRQ
|
||||
ioapic_id,
|
||||
irq, // IOAPIC pin = same as IRQ number
|
||||
);
|
||||
entry_count += 1;
|
||||
}
|
||||
|
||||
// ---- Fill in the Configuration Table Header ----
|
||||
// Build header at the start of table_buf
|
||||
{
|
||||
// Compute length before taking mutable borrow of the header slice
|
||||
let table_len = table_buf.len() as u16;
|
||||
let header = &mut table_buf[0..header_size as usize];
|
||||
|
||||
// Signature: "PCMP"
|
||||
header[0..4].copy_from_slice(&MP_CT_SIGNATURE);
|
||||
// Base table length (u16 LE) — entire config table including header
|
||||
header[4..6].copy_from_slice(&table_len.to_le_bytes());
|
||||
// Spec revision
|
||||
header[6] = MP_SPEC_REVISION;
|
||||
// Checksum — will be filled below
|
||||
header[7] = 0;
|
||||
// OEM ID (8 bytes, space-padded)
|
||||
header[8..16].copy_from_slice(b"NOVAFLAR");
|
||||
// Product ID (12 bytes, space-padded)
|
||||
header[16..28].copy_from_slice(b"VOLT VM");
|
||||
// OEM table pointer (0 = none)
|
||||
header[28..32].copy_from_slice(&0u32.to_le_bytes());
|
||||
// OEM table size
|
||||
header[32..34].copy_from_slice(&0u16.to_le_bytes());
|
||||
// Entry count
|
||||
header[34..36].copy_from_slice(&entry_count.to_le_bytes());
|
||||
// Local APIC address
|
||||
header[36..40].copy_from_slice(&0xFEE0_0000u32.to_le_bytes());
|
||||
// Extended table length
|
||||
header[40..42].copy_from_slice(&0u16.to_le_bytes());
|
||||
// Extended table checksum
|
||||
header[42] = 0;
|
||||
// Reserved
|
||||
header[43] = 0;
|
||||
|
||||
// Compute and set checksum
|
||||
let checksum = compute_checksum(&table_buf);
|
||||
table_buf[7] = checksum;
|
||||
}
|
||||
|
||||
// ---- Build the MP Floating Pointer Structure ----
|
||||
let mut fp_buf = [0u8; 16];
|
||||
|
||||
// Signature: "_MP_"
|
||||
fp_buf[0..4].copy_from_slice(&MP_FP_SIGNATURE);
|
||||
// Physical address pointer to MP Config Table (u32 LE)
|
||||
fp_buf[4..8].copy_from_slice(&(config_table_addr as u32).to_le_bytes());
|
||||
// Length in 16-byte paragraphs (1 = 16 bytes)
|
||||
fp_buf[8] = 1;
|
||||
// Spec revision
|
||||
fp_buf[9] = MP_SPEC_REVISION;
|
||||
// Checksum — filled below
|
||||
fp_buf[10] = 0;
|
||||
// Feature byte 1: 0 = MP Config Table present (not default config)
|
||||
fp_buf[11] = 0;
|
||||
// Feature byte 2: bit 7 = IMCR present (PIC mode available)
|
||||
fp_buf[12] = MP_FEATURE_IMCRP;
|
||||
// Feature bytes 3-5: reserved
|
||||
fp_buf[13] = 0;
|
||||
fp_buf[14] = 0;
|
||||
fp_buf[15] = 0;
|
||||
|
||||
// Compute floating pointer checksum
|
||||
let fp_checksum = compute_checksum(&fp_buf);
|
||||
fp_buf[10] = fp_checksum;
|
||||
|
||||
// ---- Write everything to guest memory ----
|
||||
guest_mem.write_bytes(MP_TABLE_START, &fp_buf)?;
|
||||
guest_mem.write_bytes(config_table_addr, &table_buf)?;
|
||||
|
||||
tracing::info!(
|
||||
"MP table written at 0x{:x}: {} CPUs, {} entries, {} bytes total\n\
|
||||
Layout: FP=0x{:x}, Config=0x{:x}, IOAPIC ID={}, IOAPIC addr=0x{:x}",
|
||||
MP_TABLE_START,
|
||||
num_cpus,
|
||||
entry_count,
|
||||
total_size,
|
||||
MP_TABLE_START,
|
||||
config_table_addr,
|
||||
ioapic_id,
|
||||
IOAPIC_DEFAULT_ADDR,
|
||||
);
|
||||
|
||||
Ok(MP_TABLE_START)
|
||||
}
|
||||
|
||||
/// Write a Processor Entry (20 bytes) to the table buffer.
|
||||
///
|
||||
/// Format (Intel MPS Table 4-4):
|
||||
/// ```text
|
||||
/// Offset Size Field
|
||||
/// 0 1 Entry type (0 = processor)
|
||||
/// 1 1 Local APIC ID
|
||||
/// 2 1 Local APIC version
|
||||
/// 3 1 CPU flags (bit 0=EN, bit 1=BP)
|
||||
/// 4 4 CPU signature (stepping, model, family)
|
||||
/// 8 4 Feature flags (from CPUID leaf 1 EDX)
|
||||
/// 12 8 Reserved
|
||||
/// ```
|
||||
fn write_processor_entry(
|
||||
buf: &mut Vec<u8>,
|
||||
apic_id: u8,
|
||||
apic_version: u8,
|
||||
flags: u8,
|
||||
cpu_signature: u32,
|
||||
feature_flags: u32,
|
||||
) {
|
||||
buf.push(MP_ENTRY_PROCESSOR); // Entry type
|
||||
buf.push(apic_id); // Local APIC ID
|
||||
buf.push(apic_version); // Local APIC version
|
||||
buf.push(flags); // CPU flags
|
||||
buf.extend_from_slice(&cpu_signature.to_le_bytes()); // CPU signature
|
||||
buf.extend_from_slice(&feature_flags.to_le_bytes()); // Feature flags
|
||||
buf.extend_from_slice(&[0u8; 8]); // Reserved
|
||||
}
|
||||
|
||||
/// Write a Bus Entry (8 bytes) to the table buffer.
|
||||
///
|
||||
/// Format (Intel MPS Table 4-5):
|
||||
/// ```text
|
||||
/// Offset Size Field
|
||||
/// 0 1 Entry type (1 = bus)
|
||||
/// 1 1 Bus ID
|
||||
/// 2 6 Bus type string (space-padded)
|
||||
/// ```
|
||||
fn write_bus_entry(buf: &mut Vec<u8>, bus_id: u8, bus_type: &[u8; 6]) {
|
||||
buf.push(MP_ENTRY_BUS);
|
||||
buf.push(bus_id);
|
||||
buf.extend_from_slice(bus_type);
|
||||
}
|
||||
|
||||
/// Write an I/O APIC Entry (8 bytes) to the table buffer.
|
||||
///
|
||||
/// Format (Intel MPS Table 4-6):
|
||||
/// ```text
|
||||
/// Offset Size Field
|
||||
/// 0 1 Entry type (2 = I/O APIC)
|
||||
/// 1 1 I/O APIC ID
|
||||
/// 2 1 I/O APIC version
|
||||
/// 3 1 I/O APIC flags (bit 0 = EN)
|
||||
/// 4 4 I/O APIC address
|
||||
/// ```
|
||||
fn write_ioapic_entry(buf: &mut Vec<u8>, id: u8, version: u8, addr: u32) {
|
||||
buf.push(MP_ENTRY_IOAPIC);
|
||||
buf.push(id);
|
||||
buf.push(version);
|
||||
buf.push(0x01); // flags: enabled
|
||||
buf.extend_from_slice(&addr.to_le_bytes());
|
||||
}
|
||||
|
||||
/// Write an I/O Interrupt Assignment Entry (8 bytes) to the table buffer.
|
||||
///
|
||||
/// Format (Intel MPS Table 4-7):
|
||||
/// ```text
|
||||
/// Offset Size Field
|
||||
/// 0 1 Entry type (3 = I/O interrupt)
|
||||
/// 1 1 Interrupt type (0=INT, 1=NMI, 2=SMI, 3=ExtINT)
|
||||
/// 2 2 Flags (polarity/trigger)
|
||||
/// 4 1 Source bus ID
|
||||
/// 5 1 Source bus IRQ
|
||||
/// 6 1 Destination I/O APIC ID
|
||||
/// 7 1 Destination I/O APIC pin (INTIN#)
|
||||
/// ```
|
||||
fn write_io_interrupt_entry(
|
||||
buf: &mut Vec<u8>,
|
||||
int_type: u8,
|
||||
flags: u16,
|
||||
src_bus_id: u8,
|
||||
src_bus_irq: u8,
|
||||
dst_ioapic_id: u8,
|
||||
dst_ioapic_pin: u8,
|
||||
) {
|
||||
buf.push(MP_ENTRY_IO_INTERRUPT);
|
||||
buf.push(int_type);
|
||||
buf.extend_from_slice(&flags.to_le_bytes());
|
||||
buf.push(src_bus_id);
|
||||
buf.push(src_bus_irq);
|
||||
buf.push(dst_ioapic_id);
|
||||
buf.push(dst_ioapic_pin);
|
||||
}
|
||||
|
||||
/// Return the byte that makes an MP structure sum to zero.
///
/// All bytes of `data` are added modulo 256 and the result is negated
/// (two's complement), so `sum(data) + checksum == 0 (mod 256)`.
fn compute_checksum(data: &[u8]) -> u8 {
    data.iter()
        .copied()
        .fold(0u8, u8::wrapping_add)
        .wrapping_neg()
}
|
||||
|
||||
// ============================================================================
|
||||
// Tests
|
||||
// ============================================================================
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Size of the MP Floating Pointer Structure written by `setup_mptable`.
    const FP_SIZE: u64 = 16;
    /// Size of the MP Configuration Table header.
    const HEADER_SIZE: u64 = 44;
    /// Size of one processor entry.
    const PROCESSOR_ENTRY_SIZE: usize = 20;
    /// Size of one bus entry.
    const BUS_ENTRY_SIZE: usize = 8;

    /// Flat, zero-initialised byte buffer standing in for guest RAM.
    struct MockMemory {
        size: u64,
        data: Vec<u8>,
    }

    impl MockMemory {
        fn new(size: u64) -> Self {
            Self {
                size,
                data: vec![0; size as usize],
            }
        }

        /// Borrow `len` bytes starting at guest address `addr`.
        fn read_bytes(&self, addr: u64, len: usize) -> &[u8] {
            &self.data[addr as usize..(addr as usize + len)]
        }

        /// Read a little-endian `u16` at guest address `addr`.
        fn read_u16(&self, addr: u64) -> u16 {
            let bytes = self.read_bytes(addr, 2);
            u16::from_le_bytes([bytes[0], bytes[1]])
        }

        /// Read a little-endian `u32` at guest address `addr`.
        fn read_u32(&self, addr: u64) -> u32 {
            let bytes = self.read_bytes(addr, 4);
            u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]])
        }
    }

    impl GuestMemory for MockMemory {
        fn write_bytes(&mut self, addr: u64, data: &[u8]) -> Result<()> {
            let end = addr as usize + data.len();
            if end > self.data.len() {
                return Err(BootError::GuestMemoryWrite(format!(
                    "Write at {:#x} exceeds memory",
                    addr
                )));
            }
            self.data[addr as usize..end].copy_from_slice(data);
            Ok(())
        }

        fn size(&self) -> u64 {
            self.size
        }
    }

    /// Wrapping byte sum, as used by the MP checksum rule.
    fn byte_sum(bytes: &[u8]) -> u8 {
        bytes.iter().fold(0u8, |acc, &b| acc.wrapping_add(b))
    }

    #[test]
    fn test_checksum() {
        // Data plus its checksum byte must sum to 0 (mod 256).
        let data = [1u8, 2, 3, 4];
        let cs = compute_checksum(&data);
        assert_eq!(byte_sum(&data).wrapping_add(cs), 0);
    }

    #[test]
    fn test_mp_floating_pointer_signature() {
        let mut mem = MockMemory::new(1024 * 1024);
        let result = setup_mptable(&mut mem, 1);
        assert!(result.is_ok());

        let fp_addr = result.unwrap();
        assert_eq!(mem.read_bytes(fp_addr, 4), b"_MP_");
    }

    #[test]
    fn test_mp_floating_pointer_checksum() {
        let mut mem = MockMemory::new(1024 * 1024);
        setup_mptable(&mut mem, 2).unwrap();

        // MP Floating Pointer is 16 bytes at MP_TABLE_START
        let fp = mem.read_bytes(MP_TABLE_START, FP_SIZE as usize);
        assert_eq!(byte_sum(fp), 0, "MP Floating Pointer checksum mismatch");
    }

    #[test]
    fn test_mp_config_table_checksum() {
        let mut mem = MockMemory::new(1024 * 1024);
        setup_mptable(&mut mem, 2).unwrap();

        // Config table starts right after the floating pointer; its total
        // length is stored in header bytes 4-5.
        let config_addr = MP_TABLE_START + FP_SIZE;
        let table_len = mem.read_u16(config_addr + 4) as usize;

        let table = mem.read_bytes(config_addr, table_len);
        assert_eq!(byte_sum(table), 0, "MP Config Table checksum mismatch");
    }

    #[test]
    fn test_mp_config_table_signature() {
        let mut mem = MockMemory::new(1024 * 1024);
        setup_mptable(&mut mem, 1).unwrap();

        let config_addr = MP_TABLE_START + FP_SIZE;
        assert_eq!(mem.read_bytes(config_addr, 4), b"PCMP");
    }

    #[test]
    fn test_mp_table_1_cpu() {
        let mut mem = MockMemory::new(1024 * 1024);
        setup_mptable(&mut mem, 1).unwrap();

        // Entry count lives at offset 34 in the header.
        let entry_count = mem.read_u16(MP_TABLE_START + FP_SIZE + 34);
        // 1 CPU + 1 bus + 1 IOAPIC + 16 IRQs = 19 entries
        assert_eq!(entry_count, 19);
    }

    #[test]
    fn test_mp_table_4_cpus() {
        let mut mem = MockMemory::new(1024 * 1024);
        setup_mptable(&mut mem, 4).unwrap();

        let entry_count = mem.read_u16(MP_TABLE_START + FP_SIZE + 34);
        // 4 CPUs + 1 bus + 1 IOAPIC + 16 IRQs = 22 entries
        assert_eq!(entry_count, 22);
    }

    #[test]
    fn test_mp_table_bsp_flag() {
        let mut mem = MockMemory::new(1024 * 1024);
        setup_mptable(&mut mem, 4).unwrap();

        // First processor entry starts right after the header.
        let proc0_offset = (MP_TABLE_START + FP_SIZE + HEADER_SIZE) as usize;
        assert_eq!(mem.data[proc0_offset], 0); // Entry type = processor
        assert_eq!(mem.data[proc0_offset + 1], 0); // APIC ID = 0
        assert_eq!(mem.data[proc0_offset + 3], CPU_FLAG_ENABLED | CPU_FLAG_BSP); // BSP + EN

        // Second processor
        let proc1_offset = proc0_offset + PROCESSOR_ENTRY_SIZE;
        assert_eq!(mem.data[proc1_offset + 1], 1); // APIC ID = 1
        assert_eq!(mem.data[proc1_offset + 3], CPU_FLAG_ENABLED); // EN only (no BSP)
    }

    #[test]
    fn test_mp_table_ioapic() {
        let mut mem = MockMemory::new(1024 * 1024);
        let num_cpus: u8 = 2;
        setup_mptable(&mut mem, num_cpus).unwrap();

        // IOAPIC entry follows the processor entries and the bus entry.
        let entries_start = (MP_TABLE_START + FP_SIZE + HEADER_SIZE) as usize;
        let ioapic_offset =
            entries_start + (num_cpus as usize * PROCESSOR_ENTRY_SIZE) + BUS_ENTRY_SIZE;

        assert_eq!(mem.data[ioapic_offset], MP_ENTRY_IOAPIC); // Entry type
        assert_eq!(mem.data[ioapic_offset + 1], num_cpus); // IOAPIC ID = num_cpus
        assert_eq!(mem.data[ioapic_offset + 3], 0x01); // Enabled

        // IOAPIC address
        let addr = mem.read_u32(ioapic_offset as u64 + 4);
        assert_eq!(addr, IOAPIC_DEFAULT_ADDR);
    }

    #[test]
    fn test_mp_table_zero_cpus_error() {
        let mut mem = MockMemory::new(1024 * 1024);
        let result = setup_mptable(&mut mem, 0);
        assert!(result.is_err());
    }

    #[test]
    fn test_mp_table_too_many_cpus_error() {
        // 16 (FP) + 44 (header) + 42 * 20 + 8 (bus) + 8 (IOAPIC) + 128 (IRQs)
        // = 1044 bytes, which does not fit in the 1024-byte table area.
        let mut mem = MockMemory::new(1024 * 1024);
        let result = setup_mptable(&mut mem, 42);
        assert!(matches!(result, Err(BootError::MemoryLayout(_))));
    }

    #[test]
    fn test_mp_table_local_apic_addr() {
        let mut mem = MockMemory::new(1024 * 1024);
        setup_mptable(&mut mem, 2).unwrap();

        // Local APIC address at offset 36 in the header
        let lapic_addr = mem.read_u32(MP_TABLE_START + FP_SIZE + 36);
        assert_eq!(lapic_addr, 0xFEE0_0000);
    }
}
|
||||
291
vmm/src/boot/pagetable.rs
Normal file
291
vmm/src/boot/pagetable.rs
Normal file
@@ -0,0 +1,291 @@
|
||||
//! Page Table Setup for 64-bit Boot
|
||||
//!
|
||||
//! Sets up identity-mapped page tables for Linux 64-bit kernel boot.
|
||||
//! The kernel expects to be running with paging enabled and needs:
|
||||
//! - Identity mapping for low memory (0-4GB physical = 0-4GB virtual)
|
||||
//! - High kernel mapping (0xffffffff80000000+ = physical addresses)
|
||||
//!
|
||||
//! # Page Table Layout
|
||||
//!
|
||||
//! We use 2MB huge pages for simplicity and performance:
|
||||
//! - PML4 (Page Map Level 4) at 0x1000
|
||||
//! - PDPT for low memory (identity) at 0x2000
|
||||
//! - PDPT for high memory (kernel) at 0x3000
|
||||
//! - PD tables at 0x4000+
|
||||
//!
|
||||
//! Each PD entry maps 2MB of physical memory using huge pages.
|
||||
|
||||
use super::{GuestMemory, Result};
|
||||
#[cfg(test)]
|
||||
use super::BootError;
|
||||
|
||||
/// Guest physical address of the PML4 (top-level) table.
pub const PML4_ADDR: u64 = 0x1000;

/// Guest physical address of the PDPT used for the low-memory identity
/// mapping; it occupies the page after the PML4.
pub const PDPT_LOW_ADDR: u64 = PML4_ADDR + PAGE_TABLE_SIZE;

/// Guest physical address of the PDPT used for the high kernel mapping;
/// it occupies the page after the low PDPT.
pub const PDPT_HIGH_ADDR: u64 = PDPT_LOW_ADDR + PAGE_TABLE_SIZE;

/// Guest physical address of the first PD (Page Directory); further PD
/// tables follow it contiguously.
pub const PD_ADDR: u64 = PDPT_HIGH_ADDR + PAGE_TABLE_SIZE;

/// Size in bytes of one page table (4KB).
pub const PAGE_TABLE_SIZE: u64 = 4096;
|
||||
|
||||
/// Bit masks for x86-64 page table entries.
#[allow(dead_code)] // x86 page table flags — kept for completeness
mod flags {
    /// Bit 0: entry is present.
    pub const PRESENT: u64 = 0x001;
    /// Bit 1: mapping is writable.
    pub const WRITABLE: u64 = 0x002;
    /// Bit 2: user-accessible (0 = supervisor only).
    pub const USER: u64 = 0x004;
    /// Bit 7: entry maps a huge page (2MB/1GB) instead of pointing to a table.
    pub const PAGE_SIZE: u64 = 0x080;
}
|
||||
|
||||
/// Page table setup implementation
|
||||
pub struct PageTableSetup;
|
||||
|
||||
impl PageTableSetup {
|
||||
/// Set up page tables for 64-bit Linux kernel boot
|
||||
///
|
||||
/// Creates:
|
||||
/// 1. Identity mapping for first 4GB (virtual 0-4GB -> physical 0-4GB)
|
||||
/// 2. High kernel mapping (virtual 0xffffffff80000000+ -> physical 0+)
|
||||
///
|
||||
/// This allows the kernel to execute at its linked address while also
|
||||
/// having access to physical memory via identity mapping.
|
||||
///
|
||||
/// Returns the CR3 value (PML4 physical address).
|
||||
pub fn setup<M: GuestMemory>(guest_mem: &mut M, memory_size: u64) -> Result<u64> {
|
||||
// Zero out the page table area first (16 pages should be plenty)
|
||||
let zeros = vec![0u8; PAGE_TABLE_SIZE as usize * 16];
|
||||
guest_mem.write_bytes(PML4_ADDR, &zeros)?;
|
||||
|
||||
// Calculate how much memory to map (up to 4GB, or actual memory size)
|
||||
let map_size = memory_size.min(4 * 1024 * 1024 * 1024);
|
||||
|
||||
// Number of 2MB pages needed
|
||||
let num_2mb_pages = (map_size + 0x1FFFFF) / 0x200000;
|
||||
|
||||
// Number of PD tables needed (each PD has 512 entries, each entry maps 2MB)
|
||||
let num_pd_tables = ((num_2mb_pages + 511) / 512).max(1) as usize;
|
||||
|
||||
// ============================================================
|
||||
// Set up PML4 entries
|
||||
// ============================================================
|
||||
|
||||
// Entry 0: Points to low PDPT for identity mapping (0x0 - 512GB)
|
||||
let pml4_entry_0 = PDPT_LOW_ADDR | flags::PRESENT | flags::WRITABLE;
|
||||
guest_mem.write_bytes(PML4_ADDR, &pml4_entry_0.to_le_bytes())?;
|
||||
|
||||
// Entry 511: Points to high PDPT for kernel mapping (0xFFFFFF8000000000+)
|
||||
// PML4[511] maps addresses 0xFFFFFF8000000000 - 0xFFFFFFFFFFFFFFFF
|
||||
let pml4_entry_511 = PDPT_HIGH_ADDR | flags::PRESENT | flags::WRITABLE;
|
||||
guest_mem.write_bytes(PML4_ADDR + 511 * 8, &pml4_entry_511.to_le_bytes())?;
|
||||
|
||||
// ============================================================
|
||||
// Set up PDPT for low memory (identity mapping)
|
||||
// ============================================================
|
||||
for i in 0..num_pd_tables.min(4) {
|
||||
let pd_addr = PD_ADDR + (i as u64 * PAGE_TABLE_SIZE);
|
||||
let pdpt_entry = pd_addr | flags::PRESENT | flags::WRITABLE;
|
||||
let pdpt_offset = PDPT_LOW_ADDR + (i as u64 * 8);
|
||||
guest_mem.write_bytes(pdpt_offset, &pdpt_entry.to_le_bytes())?;
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// Set up PDPT for high memory (kernel mapping)
|
||||
// Kernel virtual: 0xffffffff80000000 -> physical 0x0
|
||||
// This is PDPT entry 510 (for 0xffffffff80000000-0xffffffffbfffffff)
|
||||
// And PDPT entry 511 (for 0xffffffffc0000000-0xffffffffffffffff)
|
||||
// ============================================================
|
||||
|
||||
// We need PD tables for the high mapping too
|
||||
// Use PD tables starting after the low-memory ones
|
||||
let high_pd_base = PD_ADDR + (num_pd_tables.min(4) as u64 * PAGE_TABLE_SIZE);
|
||||
|
||||
// PDPT[510] maps 0xffffffff80000000-0xffffffffbfffffff to physical 0x0
|
||||
// (This covers the typical kernel text segment)
|
||||
let pdpt_entry_510 = high_pd_base | flags::PRESENT | flags::WRITABLE;
|
||||
guest_mem.write_bytes(PDPT_HIGH_ADDR + 510 * 8, &pdpt_entry_510.to_le_bytes())?;
|
||||
|
||||
// PDPT[511] maps 0xffffffffc0000000-0xffffffffffffffff
|
||||
let pdpt_entry_511 = (high_pd_base + PAGE_TABLE_SIZE) | flags::PRESENT | flags::WRITABLE;
|
||||
guest_mem.write_bytes(PDPT_HIGH_ADDR + 511 * 8, &pdpt_entry_511.to_le_bytes())?;
|
||||
|
||||
// ============================================================
|
||||
// Set up PD entries for identity mapping (2MB huge pages)
|
||||
// ============================================================
|
||||
for i in 0..num_2mb_pages {
|
||||
let pd_table_index = (i / 512) as usize;
|
||||
let pd_entry_index = i % 512;
|
||||
|
||||
if pd_table_index >= 4 {
|
||||
break; // Only support first 4GB for now
|
||||
}
|
||||
|
||||
let pd_table_addr = PD_ADDR + (pd_table_index as u64 * PAGE_TABLE_SIZE);
|
||||
let pd_entry_offset = pd_table_addr + (pd_entry_index * 8);
|
||||
|
||||
// Physical address this entry maps (2MB aligned)
|
||||
let phys_addr = i * 0x200000;
|
||||
|
||||
// PD entry with PAGE_SIZE flag for 2MB huge page
|
||||
let pd_entry = phys_addr | flags::PRESENT | flags::WRITABLE | flags::PAGE_SIZE;
|
||||
guest_mem.write_bytes(pd_entry_offset, &pd_entry.to_le_bytes())?;
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// Set up PD entries for high kernel mapping
|
||||
// 0xffffffff80000000 + offset -> physical offset
|
||||
// ============================================================
|
||||
// Map first 1GB of physical memory to the high kernel address space
|
||||
for i in 0..512 {
|
||||
let phys_addr = i * 0x200000;
|
||||
if phys_addr >= map_size {
|
||||
break;
|
||||
}
|
||||
|
||||
// PD for PDPT[510] (0xffffffff80000000-0xffffffffbfffffff)
|
||||
let pd_entry = phys_addr | flags::PRESENT | flags::WRITABLE | flags::PAGE_SIZE;
|
||||
let pd_offset = high_pd_base + (i * 8);
|
||||
guest_mem.write_bytes(pd_offset, &pd_entry.to_le_bytes())?;
|
||||
}
|
||||
|
||||
// Map second 1GB for PDPT[511]
|
||||
for i in 0..512 {
|
||||
let phys_addr = (512 + i) * 0x200000;
|
||||
if phys_addr >= map_size {
|
||||
break;
|
||||
}
|
||||
|
||||
let pd_entry = phys_addr | flags::PRESENT | flags::WRITABLE | flags::PAGE_SIZE;
|
||||
let pd_offset = high_pd_base + PAGE_TABLE_SIZE + (i * 8);
|
||||
guest_mem.write_bytes(pd_offset, &pd_entry.to_le_bytes())?;
|
||||
}
|
||||
|
||||
// Debug: dump page table structure for verification
|
||||
tracing::info!(
|
||||
"Page tables configured at CR3=0x{:x}:\n\
|
||||
PML4[0] = 0x{:016x} -> PDPT_LOW at 0x{:x}\n\
|
||||
PML4[511] = 0x{:016x} -> PDPT_HIGH at 0x{:x}\n\
|
||||
PDPT_LOW[0] = 0x{:016x} -> PD at 0x{:x}\n\
|
||||
{} PD entries (2MB huge pages) covering {} MB",
|
||||
PML4_ADDR,
|
||||
pml4_entry_0, PDPT_LOW_ADDR,
|
||||
pml4_entry_511, PDPT_HIGH_ADDR,
|
||||
PDPT_LOW_ADDR | flags::PRESENT | flags::WRITABLE, PD_ADDR,
|
||||
num_2mb_pages,
|
||||
map_size / (1024 * 1024)
|
||||
);
|
||||
|
||||
// Log the PD entry that maps the kernel (typically at 16MB = 0x1000000)
|
||||
// 0x1000000 / 2MB = 8, so PD[8] maps the kernel
|
||||
let kernel_pd_entry = 8u64 * 0x200000 | flags::PRESENT | flags::WRITABLE | flags::PAGE_SIZE;
|
||||
tracing::info!(
|
||||
"Identity mapping for kernel at 0x1000000:\n\
|
||||
PD[8] = 0x{:016x} -> maps physical 0x1000000-0x11FFFFF",
|
||||
kernel_pd_entry
|
||||
);
|
||||
|
||||
Ok(PML4_ADDR)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Zero-initialised byte buffer that stands in for guest RAM in the tests below.
    struct MockMemory {
        size: u64,
        data: Vec<u8>,
    }

    impl MockMemory {
        /// Allocate `size` bytes of zeroed mock guest memory.
        fn new(size: u64) -> Self {
            let data = vec![0; size as usize];
            Self { size, data }
        }

        /// Read a little-endian `u64` at guest address `addr` (panics when out of range).
        fn read_u64(&self, addr: u64) -> u64 {
            let start = addr as usize;
            let mut raw = [0u8; 8];
            raw.copy_from_slice(&self.data[start..start + 8]);
            u64::from_le_bytes(raw)
        }
    }

    impl GuestMemory for MockMemory {
        fn write_bytes(&mut self, addr: u64, data: &[u8]) -> Result<()> {
            let start = addr as usize;
            let end = start + data.len();
            if end > self.data.len() {
                return Err(BootError::GuestMemoryWrite(format!(
                    "Write at {:#x} exceeds memory",
                    addr
                )));
            }
            self.data[start..end].copy_from_slice(data);
            Ok(())
        }

        fn size(&self) -> u64 {
            self.size
        }
    }

    #[test]
    fn test_page_table_setup() {
        let mut mem = MockMemory::new(128 * 1024 * 1024);
        let result = PageTableSetup::setup(&mut mem, 128 * 1024 * 1024);

        assert!(result.is_ok());
        assert_eq!(result.unwrap(), PML4_ADDR);

        // PML4[0] must reference the low PDPT (identity mapping) and be present + writable.
        let pml4_low = mem.read_u64(PML4_ADDR);
        assert_eq!(pml4_low & !0xFFF, PDPT_LOW_ADDR);
        assert_ne!(pml4_low & flags::PRESENT, 0);
        assert_ne!(pml4_low & flags::WRITABLE, 0);

        // PML4[511] must reference the high PDPT (kernel mapping).
        let pml4_high = mem.read_u64(PML4_ADDR + 511 * 8);
        assert_eq!(pml4_high & !0xFFF, PDPT_HIGH_ADDR);
        assert_ne!(pml4_high & flags::PRESENT, 0);

        // The first low PDPT slot must reference the first page directory.
        let pdpt_first = mem.read_u64(PDPT_LOW_ADDR);
        assert_eq!(pdpt_first & !0xFFF, PD_ADDR);
        assert_ne!(pdpt_first & flags::PRESENT, 0);

        // The first PD slot must be a present 2MB page mapping physical address 0.
        let pd_first = mem.read_u64(PD_ADDR);
        assert_eq!(pd_first & !0x1FFFFF, 0);
        assert_ne!(pd_first & flags::PRESENT, 0);
        assert_ne!(pd_first & flags::PAGE_SIZE, 0); // 2MB page
    }

    #[test]
    fn test_identity_mapping() {
        let mut mem = MockMemory::new(256 * 1024 * 1024);
        PageTableSetup::setup(&mut mem, 256 * 1024 * 1024).unwrap();

        // Every 2MB-aligned address in the first 256MB must map to itself.
        for i in 0..128u64 {
            let expected_phys = i * 0x200000;

            // Locate the page directory holding entry `i` and the slot inside it.
            let (table, slot) = (i / 512, i % 512);
            let entry_addr = PD_ADDR + table * PAGE_TABLE_SIZE + slot * 8;

            let mapped_addr = mem.read_u64(entry_addr) & !0x1FFFFF;
            assert_eq!(mapped_addr, expected_phys, "Mismatch at entry {}", i);
        }
    }
}
|
||||
608
vmm/src/boot/pvh.rs
Normal file
608
vmm/src/boot/pvh.rs
Normal file
@@ -0,0 +1,608 @@
|
||||
//! PVH Boot Protocol Implementation
//!
//! PVH (Para-Virtualized Hardware) is a boot protocol that allows direct kernel
//! entry without BIOS/UEFI firmware. This is the fastest path to boot a Linux VM.
//!
//! # Overview
//!
//! The PVH boot protocol:
//! 1. Skips BIOS POST and firmware initialization
//! 2. Loads the kernel directly into memory
//! 3. Sets up minimal boot structures (E820 map, start_info)
//! 4. Jumps directly to the kernel's 64-bit entry point
//!
//! # Boot Time Comparison
//!
//! | Method | Boot Time |
//! |--------|-----------|
//! | BIOS   | 1-3s      |
//! | UEFI   | 0.5-1s    |
//! | PVH    | <50ms     |
//!
//! # Memory Requirements
//!
//! The PVH start_info structure must be placed in guest memory and
//! its address passed to the kernel via the RBX register.
|
||||
|
||||
use super::{layout, BootError, GuestMemory, Result};
|
||||
|
||||
/// Upper bound on the number of entries in the E820 memory map (128).
pub const MAX_E820_ENTRIES: usize = 0x80;
|
||||
|
||||
/// Memory region kinds used in the E820 map; the discriminants are the raw
/// `u32` values stored in [`E820Entry`]-style records (Linux kernel numbering).
#[repr(u32)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum E820Type {
    /// Undefined/other (also the fallback for unrecognised raw values)
    Undefined = 0,
    /// Usable RAM
    Ram = 1,
    /// Reserved by system
    Reserved = 2,
    /// ACPI reclaimable
    Acpi = 3,
    /// ACPI NVS (Non-Volatile Storage)
    Nvs = 4,
    /// Unusable memory
    Unusable = 5,
    /// Disabled memory (EFI)
    Disabled = 6,
    /// Persistent memory
    Pmem = 7,
}

impl From<u32> for E820Type {
    /// Decode a raw type value; anything outside `1..=7` becomes
    /// [`E820Type::Undefined`].
    fn from(val: u32) -> Self {
        // Every variant with a dedicated raw value; `Undefined` is the fallback.
        const KNOWN: [E820Type; 7] = [
            E820Type::Ram,
            E820Type::Reserved,
            E820Type::Acpi,
            E820Type::Nvs,
            E820Type::Unusable,
            E820Type::Disabled,
            E820Type::Pmem,
        ];

        KNOWN
            .iter()
            .copied()
            .find(|kind| *kind as u32 == val)
            .unwrap_or(E820Type::Undefined)
    }
}
|
||||
|
||||
/// E820 memory map entry
|
||||
///
|
||||
/// Matches the Linux kernel's e820entry structure for compatibility.
|
||||
#[repr(C, packed)]
|
||||
#[derive(Debug, Clone, Copy, Default)]
|
||||
pub struct E820Entry {
|
||||
/// Start address of memory region
|
||||
pub addr: u64,
|
||||
/// Size of memory region in bytes
|
||||
pub size: u64,
|
||||
/// Type of memory region
|
||||
pub entry_type: u32,
|
||||
}
|
||||
|
||||
impl E820Entry {
|
||||
/// Create a new E820 entry
|
||||
pub fn new(addr: u64, size: u64, entry_type: E820Type) -> Self {
|
||||
Self {
|
||||
addr,
|
||||
size,
|
||||
entry_type: entry_type as u32,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a RAM entry
|
||||
pub fn ram(addr: u64, size: u64) -> Self {
|
||||
Self::new(addr, size, E820Type::Ram)
|
||||
}
|
||||
|
||||
/// Create a reserved entry
|
||||
pub fn reserved(addr: u64, size: u64) -> Self {
|
||||
Self::new(addr, size, E820Type::Reserved)
|
||||
}
|
||||
}
|
||||
|
||||
/// PVH start_info structure
///
/// A simplified structure compatible with the Xen PVH ABI. It is written
/// into guest memory and its guest-physical address is handed to the kernel
/// in RBX.
///
/// # Memory Layout
///
/// The structure must be at a known location (typically 0x7000) and
/// contain pointers to other boot structures.
#[derive(Debug, Clone, Default)]
#[repr(C)]
pub struct StartInfo {
    /// Magic number (XEN_HVM_START_MAGIC_VALUE or custom)
    pub magic: u32,
    /// Version of the start_info structure
    pub version: u32,
    /// Flags (reserved, should be 0)
    pub flags: u32,
    /// Number of modules (initrd counts as 1)
    pub nr_modules: u32,
    /// Physical address of module list
    pub modlist_paddr: u64,
    /// Physical address of command line string
    pub cmdline_paddr: u64,
    /// Physical address of RSDP (ACPI, 0 if none)
    pub rsdp_paddr: u64,
    /// Physical address of E820 memory map
    pub memmap_paddr: u64,
    /// Number of entries in memory map
    pub memmap_entries: u32,
    /// Reserved/padding
    pub reserved: u32,
}

/// XEN HVM start magic value
pub const XEN_HVM_START_MAGIC: u32 = 0x336ec578;

/// Volt custom magic (for identification)
pub const VOLT_MAGIC: u32 = 0x4e4f5641; // "NOVA"

impl StartInfo {
    /// Create a `StartInfo` carrying the Xen magic and version 1; every
    /// other field starts out as zero.
    pub fn new() -> Self {
        Self {
            magic: XEN_HVM_START_MAGIC,
            version: 1,
            flags: 0,
            nr_modules: 0,
            modlist_paddr: 0,
            cmdline_paddr: 0,
            rsdp_paddr: 0,
            memmap_paddr: 0,
            memmap_entries: 0,
            reserved: 0,
        }
    }

    /// Builder step: record the guest-physical address of the command line.
    pub fn with_cmdline(self, addr: u64) -> Self {
        Self {
            cmdline_paddr: addr,
            ..self
        }
    }

    /// Builder step: record the memory map address and its entry count.
    pub fn with_memmap(self, addr: u64, entries: u32) -> Self {
        Self {
            memmap_paddr: addr,
            memmap_entries: entries,
            ..self
        }
    }

    /// Builder step: record a single module (the initrd) whose module-list
    /// entry lives at `modlist_addr`.
    pub fn with_module(self, modlist_addr: u64) -> Self {
        Self {
            nr_modules: 1,
            modlist_paddr: modlist_addr,
            ..self
        }
    }

    /// View the structure as its raw in-memory bytes for writing to guest memory.
    pub fn as_bytes(&self) -> &[u8] {
        let start = (self as *const Self).cast::<u8>();
        let len = std::mem::size_of::<Self>();
        // SAFETY: `start` is derived from `&self`, so it is valid for reads of
        // `size_of::<Self>()` bytes for the lifetime of the returned borrow.
        // The `#[repr(C)]` field sequence (4 x u32, 4 x u64, 2 x u32) leaves
        // no padding, so every byte in the range is initialised.
        unsafe { std::slice::from_raw_parts(start, len) }
    }
}
|
||||
|
||||
/// Module-list record describing one boot module (the initrd) for PVH.
#[derive(Debug, Clone, Copy, Default)]
#[repr(C)]
pub struct HvmModlistEntry {
    /// Physical address of module
    pub paddr: u64,
    /// Size of module in bytes
    pub size: u64,
    /// Physical address of command line for module (0 if none)
    pub cmdline_paddr: u64,
    /// Reserved
    pub reserved: u64,
}

impl HvmModlistEntry {
    /// Describe a module of `size` bytes at `paddr`, with no module command
    /// line and the reserved field zeroed.
    pub fn new(paddr: u64, size: u64) -> Self {
        let mut entry = Self::default();
        entry.paddr = paddr;
        entry.size = size;
        entry
    }

    /// View the record as its raw in-memory bytes.
    pub fn as_bytes(&self) -> &[u8] {
        let start = (self as *const Self).cast::<u8>();
        let len = std::mem::size_of::<Self>();
        // SAFETY: `start` comes from `&self` and is valid for `len` bytes for
        // the lifetime of the returned borrow; four `u64` fields under
        // `#[repr(C)]` contain no padding, so all bytes are initialised.
        unsafe { std::slice::from_raw_parts(start, len) }
    }
}
|
||||
|
||||
/// Inputs consumed when laying out the PVH boot structures.
///
/// An initrd is only described to the guest when both `initrd_addr` and
/// `initrd_size` are `Some`.
#[derive(Clone, Debug)]
pub struct PvhConfig {
    /// Total guest memory size, in bytes
    pub memory_size: u64,
    /// Number of vCPUs
    pub vcpu_count: u32,
    /// Guest-physical address of the kernel command line
    pub cmdline_addr: u64,
    /// Guest-physical address of the initrd, when one is loaded
    pub initrd_addr: Option<u64>,
    /// Size of the initrd in bytes, when one is loaded
    pub initrd_size: Option<u64>,
}
|
||||
|
||||
/// PVH boot setup implementation
|
||||
pub struct PvhBootSetup;
|
||||
|
||||
impl PvhBootSetup {
|
||||
/// Set up PVH boot structures in guest memory
|
||||
///
|
||||
/// Creates and writes:
|
||||
/// 1. E820 memory map
|
||||
/// 2. start_info structure
|
||||
/// 3. Module list (for initrd)
|
||||
pub fn setup<M: GuestMemory>(config: &PvhConfig, guest_mem: &mut M) -> Result<()> {
|
||||
// Build E820 memory map
|
||||
let e820_entries = Self::build_e820_map(config.memory_size)?;
|
||||
let e820_count = e820_entries.len() as u32;
|
||||
|
||||
// Write E820 map to guest memory
|
||||
Self::write_e820_map(&e820_entries, guest_mem)?;
|
||||
|
||||
// Write module list if initrd is present
|
||||
let modlist_addr = if let (Some(addr), Some(size)) = (config.initrd_addr, config.initrd_size) {
|
||||
let modlist_addr = layout::E820_MAP_ADDR +
|
||||
(MAX_E820_ENTRIES * std::mem::size_of::<E820Entry>()) as u64;
|
||||
|
||||
let entry = HvmModlistEntry::new(addr, size);
|
||||
guest_mem.write_bytes(modlist_addr, entry.as_bytes())?;
|
||||
|
||||
Some(modlist_addr)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// Build and write start_info structure
|
||||
let mut start_info = StartInfo::new()
|
||||
.with_cmdline(config.cmdline_addr)
|
||||
.with_memmap(layout::E820_MAP_ADDR, e820_count);
|
||||
|
||||
if let Some(addr) = modlist_addr {
|
||||
start_info = start_info.with_module(addr);
|
||||
}
|
||||
|
||||
guest_mem.write_bytes(layout::PVH_START_INFO_ADDR, start_info.as_bytes())?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Build E820 memory map for the VM
|
||||
///
|
||||
/// Creates a standard x86_64 memory layout:
|
||||
/// - Low memory (0-640KB): RAM
|
||||
/// - Legacy hole (640KB-1MB): Reserved
|
||||
/// - High memory (1MB+): RAM
|
||||
fn build_e820_map(memory_size: u64) -> Result<Vec<E820Entry>> {
|
||||
let mut entries = Vec::with_capacity(4);
|
||||
|
||||
// Validate minimum memory
|
||||
if memory_size < layout::HIGH_MEMORY_START {
|
||||
return Err(BootError::MemoryLayout(format!(
|
||||
"Memory size {} is less than minimum required {}",
|
||||
memory_size,
|
||||
layout::HIGH_MEMORY_START
|
||||
)));
|
||||
}
|
||||
|
||||
// Low memory: 0 to 640KB (0x0 - 0x9FFFF)
|
||||
// We reserve the first page for real-mode IVT
|
||||
entries.push(E820Entry::ram(0, layout::LOW_MEMORY_END));
|
||||
|
||||
// Legacy video/ROM hole: 640KB to 1MB (0xA0000 - 0xFFFFF)
|
||||
// This is reserved for VGA memory, option ROMs, etc.
|
||||
let legacy_hole_size = layout::HIGH_MEMORY_START - layout::LOW_MEMORY_END;
|
||||
entries.push(E820Entry::reserved(layout::LOW_MEMORY_END, legacy_hole_size));
|
||||
|
||||
// High memory: 1MB to RAM size
|
||||
let high_memory_size = memory_size - layout::HIGH_MEMORY_START;
|
||||
if high_memory_size > 0 {
|
||||
entries.push(E820Entry::ram(layout::HIGH_MEMORY_START, high_memory_size));
|
||||
}
|
||||
|
||||
// If memory > 4GB, we might need to handle the MMIO hole
|
||||
// For now, we assume memory <= 4GB for simplicity
|
||||
// Production systems should handle:
|
||||
// - PCI MMIO hole (typically 0xE0000000 - 0xFFFFFFFF)
|
||||
// - Memory above 4GB remapped
|
||||
|
||||
Ok(entries)
|
||||
}
|
||||
|
||||
/// Write E820 map entries to guest memory
|
||||
fn write_e820_map<M: GuestMemory>(entries: &[E820Entry], guest_mem: &mut M) -> Result<()> {
|
||||
let entry_size = std::mem::size_of::<E820Entry>();
|
||||
|
||||
for (i, entry) in entries.iter().enumerate() {
|
||||
let addr = layout::E820_MAP_ADDR + (i * entry_size) as u64;
|
||||
let bytes = unsafe {
|
||||
std::slice::from_raw_parts(entry as *const E820Entry as *const u8, entry_size)
|
||||
};
|
||||
guest_mem.write_bytes(addr, bytes)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get initial CPU register state for PVH boot
|
||||
///
|
||||
/// Returns the register values needed to start the vCPU in 64-bit mode
|
||||
/// with PVH boot protocol.
|
||||
pub fn get_initial_regs(entry_point: u64) -> PvhRegs {
|
||||
PvhRegs {
|
||||
// Instruction pointer - kernel entry
|
||||
rip: entry_point,
|
||||
|
||||
// RBX contains pointer to start_info (Xen PVH convention)
|
||||
rbx: layout::PVH_START_INFO_ADDR,
|
||||
|
||||
// RSI also contains start_info pointer (Linux boot convention)
|
||||
rsi: layout::PVH_START_INFO_ADDR,
|
||||
|
||||
// Stack pointer
|
||||
rsp: layout::BOOT_STACK_POINTER,
|
||||
|
||||
// Clear other general-purpose registers
|
||||
rax: 0,
|
||||
rcx: 0,
|
||||
rdx: 0,
|
||||
rdi: 0,
|
||||
rbp: 0,
|
||||
r8: 0,
|
||||
r9: 0,
|
||||
r10: 0,
|
||||
r11: 0,
|
||||
r12: 0,
|
||||
r13: 0,
|
||||
r14: 0,
|
||||
r15: 0,
|
||||
|
||||
// Flags - interrupts disabled
|
||||
rflags: 0x2,
|
||||
|
||||
// Segment selectors for 64-bit mode
|
||||
cs: 0x10, // Code segment, ring 0
|
||||
ds: 0x18, // Data segment
|
||||
es: 0x18,
|
||||
fs: 0x18,
|
||||
gs: 0x18,
|
||||
ss: 0x18,
|
||||
|
||||
// CR registers for 64-bit mode
|
||||
cr0: CR0_PE | CR0_ET | CR0_PG,
|
||||
cr3: 0, // Page table base - set by kernel setup
|
||||
cr4: CR4_PAE,
|
||||
|
||||
// EFER for long mode
|
||||
efer: EFER_LME | EFER_LMA,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Control Register 0 bits
const CR0_PE: u64 = 0x0000_0001; // bit 0: Protection Enable
const CR0_ET: u64 = 0x0000_0010; // bit 4: Extension Type (387 present)
const CR0_PG: u64 = 0x8000_0000; // bit 31: Paging Enable

/// Control Register 4 bits
const CR4_PAE: u64 = 0x0000_0020; // bit 5: Physical Address Extension

/// EFER (Extended Feature Enable Register) bits
const EFER_LME: u64 = 0x0000_0100; // bit 8: Long Mode Enable
const EFER_LMA: u64 = 0x0000_0400; // bit 10: Long Mode Active
|
||||
|
||||
/// Snapshot of the vCPU registers to load before entering the guest via PVH.
///
/// `Default` yields an all-zero register file.
#[derive(Clone, Debug, Default)]
pub struct PvhRegs {
    // ---- General purpose registers ----
    pub rax: u64,
    pub rbx: u64,
    pub rcx: u64,
    pub rdx: u64,
    pub rsi: u64,
    pub rdi: u64,
    pub rsp: u64,
    pub rbp: u64,
    pub r8: u64,
    pub r9: u64,
    pub r10: u64,
    pub r11: u64,
    pub r12: u64,
    pub r13: u64,
    pub r14: u64,
    pub r15: u64,

    // ---- Instruction pointer ----
    pub rip: u64,

    // ---- Flags ----
    pub rflags: u64,

    // ---- Segment selectors ----
    pub cs: u16,
    pub ds: u16,
    pub es: u16,
    pub fs: u16,
    pub gs: u16,
    pub ss: u16,

    // ---- Control registers ----
    pub cr0: u64,
    pub cr3: u64,
    pub cr4: u64,

    // ---- Model-specific registers ----
    pub efer: u64,
}
|
||||
|
||||
/// Descriptor values for the minimal boot-time GDT.
///
/// Only what is needed to enter 64-bit mode is provided; the kernel will
/// set up its own GDT later.
pub struct BootGdt;

impl BootGdt {
    /// Null descriptor (required as GDT[0])
    pub const NULL: u64 = 0;

    /// 64-bit code segment (CS)
    /// Base: 0, Limit: 0xFFFFF (ignored in 64-bit mode)
    /// Type: Code, Execute/Read, Present, DPL=0
    pub const CODE64: u64 = 0x00af_9b00_0000_ffff;

    /// 64-bit data segment (DS, ES, SS, FS, GS)
    /// Base: 0, Limit: 0xFFFFF
    /// Type: Data, Read/Write, Present, DPL=0
    pub const DATA64: u64 = 0x00cf_9300_0000_ffff;

    /// Serialise the table as 24 bytes: null, code and data descriptors in
    /// that order, each as a little-endian 8-byte value.
    pub fn as_bytes() -> [u8; 24] {
        let descriptors = [Self::NULL, Self::CODE64, Self::DATA64];

        let mut table = [0u8; 24];
        for (slot, descriptor) in table.chunks_exact_mut(8).zip(descriptors.iter()) {
            slot.copy_from_slice(&descriptor.to_le_bytes());
        }
        table
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Zero-initialised byte buffer that stands in for guest RAM.
    struct MockMemory {
        size: u64,
        data: Vec<u8>,
    }

    impl MockMemory {
        /// Allocate `size` bytes of zeroed mock guest memory.
        fn new(size: u64) -> Self {
            let data = vec![0; size as usize];
            Self { size, data }
        }
    }

    impl GuestMemory for MockMemory {
        fn write_bytes(&mut self, addr: u64, data: &[u8]) -> Result<()> {
            let start = addr as usize;
            let end = start + data.len();
            if end > self.data.len() {
                return Err(BootError::GuestMemoryWrite(format!(
                    "Write at {:#x} exceeds memory size",
                    addr
                )));
            }
            self.data[start..end].copy_from_slice(data);
            Ok(())
        }

        fn size(&self) -> u64 {
            self.size
        }
    }

    #[test]
    fn test_e820_entry_size() {
        // E820 entry must be exactly 20 bytes for Linux kernel compatibility
        assert_eq!(std::mem::size_of::<E820Entry>(), 20);
    }

    #[test]
    fn test_build_e820_map() {
        let memory_size = 128 * 1024 * 1024; // 128MB
        let entries = PvhBootSetup::build_e820_map(memory_size).unwrap();

        // Should have at least 3 entries
        assert!(entries.len() >= 3);

        // Expected (start address, type) of the first three regions:
        // low RAM, the reserved legacy hole, then high RAM.
        let expected = [
            (0, E820Type::Ram),
            (layout::LOW_MEMORY_END, E820Type::Reserved),
            (layout::HIGH_MEMORY_START, E820Type::Ram),
        ];

        for (entry, (want_addr, want_type)) in entries.iter().zip(expected.iter()) {
            // Copy the fields out first: references into a packed struct are not allowed.
            let (addr, raw_type) = (entry.addr, entry.entry_type);
            assert_eq!(addr, *want_addr);
            assert_eq!(raw_type, *want_type as u32);
        }
    }

    #[test]
    fn test_start_info_size() {
        // StartInfo should be reasonable size (under 4KB page)
        let size = std::mem::size_of::<StartInfo>();
        assert!(size < 4096);
        assert!(size >= 48); // Minimum expected fields
    }

    #[test]
    fn test_pvh_setup() {
        let mut mem = MockMemory::new(128 * 1024 * 1024);
        let config = PvhConfig {
            memory_size: 128 * 1024 * 1024,
            vcpu_count: 2,
            cmdline_addr: layout::CMDLINE_ADDR,
            initrd_addr: Some(100 * 1024 * 1024),
            initrd_size: Some(10 * 1024 * 1024),
        };

        let result = PvhBootSetup::setup(&config, &mut mem);
        assert!(result.is_ok());

        // Verify magic was written to start_info location
        let base = layout::PVH_START_INFO_ADDR as usize;
        let mut raw = [0u8; 4];
        raw.copy_from_slice(&mem.data[base..base + 4]);
        assert_eq!(u32::from_le_bytes(raw), XEN_HVM_START_MAGIC);
    }

    #[test]
    fn test_pvh_regs() {
        let entry_point = 0x100200;
        let regs = PvhBootSetup::get_initial_regs(entry_point);

        // Verify entry point
        assert_eq!(regs.rip, entry_point);

        // Verify start_info pointer in rbx
        assert_eq!(regs.rbx, layout::PVH_START_INFO_ADDR);

        // Verify 64-bit mode flags
        assert_ne!(regs.cr0 & CR0_PE, 0); // Protection enabled
        assert_ne!(regs.cr0 & CR0_PG, 0); // Paging enabled
        assert_ne!(regs.cr4 & CR4_PAE, 0); // PAE enabled
        assert_ne!(regs.efer & EFER_LME, 0); // Long mode enabled
    }

    #[test]
    fn test_gdt_layout() {
        let gdt = BootGdt::as_bytes();
        assert_eq!(gdt.len(), 24); // 3 entries × 8 bytes

        // First entry should be null
        assert_eq!(&gdt[0..8], &[0u8; 8]);
    }
}
|
||||
Reference in New Issue
Block a user