Volt VMM (Neutron Stardust): source-available under AGPSL v5.0
KVM-based microVMM for the Volt platform: - Sub-second VM boot times - Minimal memory footprint - Landlock LSM + seccomp security - Virtio device support - Custom kernel management Copyright (c) Armored Gates LLC. All rights reserved. Licensed under AGPSL v5.0
This commit is contained in:
7
vmm/.gitignore
vendored
Normal file
7
vmm/.gitignore
vendored
Normal file
@@ -0,0 +1,7 @@
|
||||
/target
|
||||
Cargo.lock
|
||||
*.swp
|
||||
*.swo
|
||||
*~
|
||||
.idea/
|
||||
.vscode/
|
||||
85
vmm/Cargo.toml
Normal file
85
vmm/Cargo.toml
Normal file
@@ -0,0 +1,85 @@
|
||||
[package]
|
||||
name = "volt-vmm"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
authors = ["Volt Contributors"]
|
||||
description = "A lightweight, secure Virtual Machine Monitor (VMM) built on KVM"
|
||||
license = "Apache-2.0"
|
||||
repository = "https://github.com/armoredgate/volt-vmm"
|
||||
keywords = ["vmm", "kvm", "virtualization", "microvm"]
|
||||
categories = ["virtualization", "os"]
|
||||
|
||||
[dependencies]
|
||||
# Stellarium CAS storage
|
||||
stellarium = { path = "../stellarium" }
|
||||
|
||||
# KVM interface (rust-vmm)
|
||||
kvm-ioctls = "0.19"
|
||||
kvm-bindings = { version = "0.10", features = ["fam-wrappers"] }
|
||||
|
||||
# Memory management (rust-vmm)
|
||||
vm-memory = { version = "0.16", features = ["backend-mmap"] }
|
||||
|
||||
# VirtIO (rust-vmm)
|
||||
virtio-queue = "0.14"
|
||||
virtio-bindings = "0.2"
|
||||
|
||||
# Kernel/initrd loading (rust-vmm)
|
||||
linux-loader = { version = "0.13", features = ["bzimage", "elf"] }
|
||||
|
||||
# Async runtime
|
||||
tokio = { version = "1", features = ["full"] }
|
||||
|
||||
# Configuration
|
||||
serde = { version = "1", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
|
||||
# CLI
|
||||
clap = { version = "4", features = ["derive", "env"] }
|
||||
|
||||
# Logging/tracing
|
||||
tracing = "0.1"
|
||||
tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] }
|
||||
|
||||
# Error handling
|
||||
thiserror = "2"
|
||||
anyhow = "1"
|
||||
|
||||
# HTTP API
|
||||
axum = "0.8"
|
||||
tower = "0.5"
|
||||
tower-http = { version = "0.6", features = ["trace", "cors"] }
|
||||
|
||||
# Security (seccomp-bpf filtering)
|
||||
seccompiler = "0.5"
|
||||
|
||||
# Security / sandboxing
|
||||
landlock = "0.4"
|
||||
|
||||
# Additional utilities
|
||||
crossbeam-channel = "0.5"
|
||||
libc = "0.2"
|
||||
nix = { version = "0.29", features = ["fs", "ioctl", "mman", "signal"] }
|
||||
parking_lot = "0.12"
|
||||
signal-hook = "0.3"
|
||||
signal-hook-tokio = { version = "0.3", features = ["futures-v0_3"] }
|
||||
futures = "0.3"
|
||||
hyper = { version = "1.4", features = ["full"] }
|
||||
hyper-util = { version = "0.1", features = ["server", "tokio"] }
|
||||
http-body-util = "0.1"
|
||||
tokio-util = { version = "0.7", features = ["io"] }
|
||||
bytes = "1"
|
||||
getrandom = "0.2"
|
||||
crc = "3"
|
||||
|
||||
# CAS (Content-Addressable Storage) support
|
||||
sha2 = "0.10"
|
||||
hex = "0.4"
|
||||
|
||||
[dev-dependencies]
|
||||
tokio-test = "0.4"
|
||||
tempfile = "3"
|
||||
|
||||
[[bin]]
|
||||
name = "volt-vmm"
|
||||
path = "src/main.rs"
|
||||
139
vmm/README.md
Normal file
139
vmm/README.md
Normal file
@@ -0,0 +1,139 @@
|
||||
# Volt VMM
|
||||
|
||||
A lightweight, secure Virtual Machine Monitor (VMM) built on KVM. Volt is designed as a Firecracker alternative for running microVMs with minimal overhead and maximum security.
|
||||
|
||||
## Features
|
||||
|
||||
- **Lightweight**: Minimal footprint, fast boot times
|
||||
- **Secure**: Strong isolation using KVM hardware virtualization
|
||||
- **Simple API**: REST API over Unix socket for VM management
|
||||
- **Async**: Built on Tokio for efficient I/O handling
|
||||
- **VirtIO Devices**: Block and network devices using VirtIO
|
||||
- **Serial Console**: 8250 UART emulation for guest console access
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
volt-vmm/
|
||||
├── src/
|
||||
│ ├── main.rs # Entry point and CLI
|
||||
│ ├── vmm/ # Core VMM logic
|
||||
│ │ └── mod.rs # VM lifecycle management
|
||||
│ ├── kvm/ # KVM interface
|
||||
│ │ └── mod.rs # KVM ioctls wrapper
|
||||
│ ├── devices/ # Device emulation
|
||||
│ │ ├── mod.rs # Device manager
|
||||
│ │ ├── serial.rs # 8250 UART
|
||||
│ │ ├── virtio_block.rs
|
||||
│ │ └── virtio_net.rs
|
||||
│ ├── api/ # HTTP API
|
||||
│ │ └── mod.rs # REST endpoints
|
||||
│ └── config/ # Configuration
|
||||
│ └── mod.rs # VM config parsing
|
||||
└── Cargo.toml
|
||||
```
|
||||
|
||||
## Building
|
||||
|
||||
```bash
|
||||
cargo build --release
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
### Command Line
|
||||
|
||||
```bash
|
||||
# Start a VM with explicit options
|
||||
volt-vmm \
|
||||
--kernel /path/to/vmlinux \
|
||||
--initrd /path/to/initrd.img \
|
||||
--rootfs /path/to/rootfs.ext4 \
|
||||
--vcpus 2 \
|
||||
--memory 256
|
||||
|
||||
# Start a VM from config file
|
||||
volt-vmm --config vm-config.json
|
||||
```
|
||||
|
||||
### Configuration File
|
||||
|
||||
```json
|
||||
{
|
||||
"vcpus": 2,
|
||||
"memory_mib": 256,
|
||||
"kernel": "/path/to/vmlinux",
|
||||
"cmdline": "console=ttyS0 reboot=k panic=1 pci=off",
|
||||
"initrd": "/path/to/initrd.img",
|
||||
"rootfs": {
|
||||
"path": "/path/to/rootfs.ext4",
|
||||
"read_only": false
|
||||
},
|
||||
"network": [
|
||||
{
|
||||
"id": "eth0",
|
||||
"tap": "tap0"
|
||||
}
|
||||
],
|
||||
"drives": [
|
||||
{
|
||||
"id": "data",
|
||||
"path": "/path/to/data.img",
|
||||
"read_only": false
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
### API
|
||||
|
||||
The API is exposed over a Unix socket (default: `/tmp/volt-vmm.sock`).
|
||||
|
||||
```bash
|
||||
# Get VM info
|
||||
curl --unix-socket /tmp/volt-vmm.sock http://localhost/vm
|
||||
|
||||
# Pause VM
|
||||
curl --unix-socket /tmp/volt-vmm.sock \
|
||||
-X PUT -H "Content-Type: application/json" \
|
||||
-d '{"action": "pause"}' \
|
||||
http://localhost/vm/actions
|
||||
|
||||
# Resume VM
|
||||
curl --unix-socket /tmp/volt-vmm.sock \
|
||||
-X PUT -H "Content-Type: application/json" \
|
||||
-d '{"action": "resume"}' \
|
||||
http://localhost/vm/actions
|
||||
|
||||
# Stop VM
|
||||
curl --unix-socket /tmp/volt-vmm.sock \
|
||||
-X PUT -H "Content-Type: application/json" \
|
||||
-d '{"action": "stop"}' \
|
||||
http://localhost/vm/actions
|
||||
```
|
||||
|
||||
## Dependencies
|
||||
|
||||
Volt leverages the excellent [rust-vmm](https://github.com/rust-vmm) project:
|
||||
|
||||
- `kvm-ioctls` / `kvm-bindings` - KVM interface
|
||||
- `vm-memory` - Guest memory management
|
||||
- `virtio-queue` / `virtio-bindings` - VirtIO device support
|
||||
- `linux-loader` - Kernel/initrd loading
|
||||
|
||||
## Roadmap
|
||||
|
||||
- [x] Project structure
|
||||
- [ ] KVM VM creation
|
||||
- [ ] Guest memory setup
|
||||
- [ ] vCPU initialization
|
||||
- [ ] Kernel loading (bzImage, ELF)
|
||||
- [ ] Serial console
|
||||
- [ ] VirtIO block device
|
||||
- [ ] VirtIO network device
|
||||
- [ ] Snapshot/restore
|
||||
- [ ] Live migration
|
||||
|
||||
## License
|
||||
|
||||
Apache-2.0
|
||||
27
vmm/api-test/Cargo.toml
Normal file
27
vmm/api-test/Cargo.toml
Normal file
@@ -0,0 +1,27 @@
|
||||
[package]
|
||||
name = "volt-vmm-api-test"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
# Async runtime
|
||||
tokio = { version = "1", features = ["full"] }
|
||||
|
||||
# HTTP server
|
||||
hyper = { version = "1", features = ["server", "http1"] }
|
||||
hyper-util = { version = "0.1", features = ["tokio", "server-auto"] }
|
||||
http-body-util = "0.1"
|
||||
|
||||
# Serialization
|
||||
serde = { version = "1", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
|
||||
# Error handling
|
||||
thiserror = "2"
|
||||
anyhow = "1"
|
||||
|
||||
# Logging
|
||||
tracing = "0.1"
|
||||
|
||||
# Metrics
|
||||
prometheus = "0.13"
|
||||
291
vmm/api-test/src/api/handlers.rs
Normal file
291
vmm/api-test/src/api/handlers.rs
Normal file
@@ -0,0 +1,291 @@
|
||||
//! API Request Handlers
|
||||
//!
|
||||
//! Handles the business logic for each API endpoint.
|
||||
|
||||
use super::types::{
|
||||
ApiError, ApiResponse, VmConfig, VmState, VmStateAction, VmStateRequest, VmStateResponse,
|
||||
};
|
||||
use prometheus::{Encoder, TextEncoder};
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::RwLock;
|
||||
use tracing::{debug, info, warn};
|
||||
|
||||
/// Shared VM state managed by the API
|
||||
#[derive(Debug)]
|
||||
pub struct VmContext {
|
||||
pub config: Option<VmConfig>,
|
||||
pub state: VmState,
|
||||
pub boot_time_ms: Option<u64>,
|
||||
}
|
||||
|
||||
impl Default for VmContext {
|
||||
fn default() -> Self {
|
||||
VmContext {
|
||||
config: None,
|
||||
state: VmState::NotConfigured,
|
||||
boot_time_ms: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// API handler with shared state
|
||||
#[derive(Clone)]
|
||||
pub struct ApiHandler {
|
||||
context: Arc<RwLock<VmContext>>,
|
||||
// Metrics
|
||||
requests_total: prometheus::IntCounter,
|
||||
request_duration: prometheus::Histogram,
|
||||
vm_state_gauge: prometheus::IntGauge,
|
||||
}
|
||||
|
||||
impl ApiHandler {
|
||||
pub fn new() -> Self {
|
||||
// Register Prometheus metrics
|
||||
let requests_total = prometheus::IntCounter::new(
|
||||
"volt-vmm_api_requests_total",
|
||||
"Total number of API requests",
|
||||
)
|
||||
.expect("metric creation failed");
|
||||
|
||||
let request_duration = prometheus::Histogram::with_opts(
|
||||
prometheus::HistogramOpts::new(
|
||||
"volt-vmm_api_request_duration_seconds",
|
||||
"API request duration in seconds",
|
||||
)
|
||||
.buckets(vec![0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0]),
|
||||
)
|
||||
.expect("metric creation failed");
|
||||
|
||||
let vm_state_gauge =
|
||||
prometheus::IntGauge::new("volt-vmm_vm_state", "Current VM state (0=not_configured, 1=configured, 2=starting, 3=running, 4=paused, 5=shutting_down, 6=stopped, 7=error)")
|
||||
.expect("metric creation failed");
|
||||
|
||||
// Register with default registry
|
||||
let _ = prometheus::register(Box::new(requests_total.clone()));
|
||||
let _ = prometheus::register(Box::new(request_duration.clone()));
|
||||
let _ = prometheus::register(Box::new(vm_state_gauge.clone()));
|
||||
|
||||
ApiHandler {
|
||||
context: Arc::new(RwLock::new(VmContext::default())),
|
||||
requests_total,
|
||||
request_duration,
|
||||
vm_state_gauge,
|
||||
}
|
||||
}
|
||||
|
||||
/// PUT /v1/vm/config - Set VM configuration before boot
|
||||
pub async fn put_config(&self, config: VmConfig) -> Result<ApiResponse<VmConfig>, ApiError> {
|
||||
let mut ctx = self.context.write().await;
|
||||
|
||||
// Only allow config changes when VM is not running
|
||||
match ctx.state {
|
||||
VmState::NotConfigured | VmState::Configured | VmState::Stopped => {
|
||||
info!(
|
||||
vcpus = config.vcpu_count,
|
||||
mem_mib = config.mem_size_mib,
|
||||
"VM configuration updated"
|
||||
);
|
||||
|
||||
ctx.config = Some(config.clone());
|
||||
ctx.state = VmState::Configured;
|
||||
self.update_state_gauge(VmState::Configured);
|
||||
|
||||
Ok(ApiResponse::ok(config))
|
||||
}
|
||||
state => {
|
||||
warn!(?state, "Cannot change config while VM is in this state");
|
||||
Err(ApiError::InvalidStateTransition {
|
||||
current_state: state,
|
||||
action: "configure".to_string(),
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// GET /v1/vm/config - Get current VM configuration
|
||||
pub async fn get_config(&self) -> Result<ApiResponse<VmConfig>, ApiError> {
|
||||
let ctx = self.context.read().await;
|
||||
|
||||
match &ctx.config {
|
||||
Some(config) => Ok(ApiResponse::ok(config.clone())),
|
||||
None => Err(ApiError::NotConfigured),
|
||||
}
|
||||
}
|
||||
|
||||
/// PUT /v1/vm/state - Change VM state (start/stop/pause/resume)
|
||||
pub async fn put_state(
|
||||
&self,
|
||||
request: VmStateRequest,
|
||||
) -> Result<ApiResponse<VmStateResponse>, ApiError> {
|
||||
let mut ctx = self.context.write().await;
|
||||
|
||||
let new_state = match (&ctx.state, &request.action) {
|
||||
// Start transitions
|
||||
(VmState::Configured, VmStateAction::Start) => {
|
||||
info!("Starting VM...");
|
||||
// In real implementation, this would trigger VM boot
|
||||
VmState::Running
|
||||
}
|
||||
(VmState::Stopped, VmStateAction::Start) => {
|
||||
info!("Restarting VM...");
|
||||
VmState::Running
|
||||
}
|
||||
|
||||
// Pause/Resume transitions
|
||||
(VmState::Running, VmStateAction::Pause) => {
|
||||
info!("Pausing VM...");
|
||||
VmState::Paused
|
||||
}
|
||||
(VmState::Paused, VmStateAction::Resume) => {
|
||||
info!("Resuming VM...");
|
||||
VmState::Running
|
||||
}
|
||||
|
||||
// Shutdown transitions
|
||||
(VmState::Running | VmState::Paused, VmStateAction::Shutdown) => {
|
||||
info!("Graceful shutdown initiated...");
|
||||
VmState::ShuttingDown
|
||||
}
|
||||
(VmState::Running | VmState::Paused, VmStateAction::Stop) => {
|
||||
info!("Force stopping VM...");
|
||||
VmState::Stopped
|
||||
}
|
||||
(VmState::ShuttingDown, VmStateAction::Stop) => {
|
||||
info!("Force stopping during shutdown...");
|
||||
VmState::Stopped
|
||||
}
|
||||
|
||||
// Invalid transitions
|
||||
(state, action) => {
|
||||
warn!(?state, ?action, "Invalid state transition requested");
|
||||
return Err(ApiError::InvalidStateTransition {
|
||||
current_state: *state,
|
||||
action: format!("{:?}", action),
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
ctx.state = new_state;
|
||||
self.update_state_gauge(new_state);
|
||||
|
||||
debug!(?new_state, "VM state changed");
|
||||
|
||||
Ok(ApiResponse::ok(VmStateResponse {
|
||||
state: new_state,
|
||||
message: None,
|
||||
}))
|
||||
}
|
||||
|
||||
/// GET /v1/vm/state - Get current VM state
|
||||
pub async fn get_state(&self) -> Result<ApiResponse<VmStateResponse>, ApiError> {
|
||||
let ctx = self.context.read().await;
|
||||
|
||||
Ok(ApiResponse::ok(VmStateResponse {
|
||||
state: ctx.state,
|
||||
message: None,
|
||||
}))
|
||||
}
|
||||
|
||||
/// GET /v1/metrics - Prometheus metrics
|
||||
pub async fn get_metrics(&self) -> Result<String, ApiError> {
|
||||
self.requests_total.inc();
|
||||
|
||||
let encoder = TextEncoder::new();
|
||||
let metric_families = prometheus::gather();
|
||||
let mut buffer = Vec::new();
|
||||
|
||||
encoder
|
||||
.encode(&metric_families, &mut buffer)
|
||||
.map_err(|e| ApiError::Internal(e.to_string()))?;
|
||||
|
||||
String::from_utf8(buffer).map_err(|e| ApiError::Internal(e.to_string()))
|
||||
}
|
||||
|
||||
/// Record request metrics
|
||||
pub fn record_request(&self, duration_secs: f64) {
|
||||
self.requests_total.inc();
|
||||
self.request_duration.observe(duration_secs);
|
||||
}
|
||||
|
||||
fn update_state_gauge(&self, state: VmState) {
|
||||
let value = match state {
|
||||
VmState::NotConfigured => 0,
|
||||
VmState::Configured => 1,
|
||||
VmState::Starting => 2,
|
||||
VmState::Running => 3,
|
||||
VmState::Paused => 4,
|
||||
VmState::ShuttingDown => 5,
|
||||
VmState::Stopped => 6,
|
||||
VmState::Error => 7,
|
||||
};
|
||||
self.vm_state_gauge.set(value);
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ApiHandler {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_config_workflow() {
|
||||
let handler = ApiHandler::new();
|
||||
|
||||
// Get config should fail initially
|
||||
let result = handler.get_config().await;
|
||||
assert!(result.is_err());
|
||||
|
||||
// Set config
|
||||
let config = VmConfig {
|
||||
vcpu_count: 2,
|
||||
mem_size_mib: 256,
|
||||
..Default::default()
|
||||
};
|
||||
let result = handler.put_config(config).await;
|
||||
assert!(result.is_ok());
|
||||
|
||||
// Get config should work now
|
||||
let result = handler.get_config().await;
|
||||
assert!(result.is_ok());
|
||||
let response = result.unwrap();
|
||||
assert_eq!(response.data.unwrap().vcpu_count, 2);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_state_transitions() {
|
||||
let handler = ApiHandler::new();
|
||||
|
||||
// Configure VM first
|
||||
let config = VmConfig::default();
|
||||
handler.put_config(config).await.unwrap();
|
||||
|
||||
// Start VM
|
||||
let request = VmStateRequest {
|
||||
action: VmStateAction::Start,
|
||||
};
|
||||
let result = handler.put_state(request).await;
|
||||
assert!(result.is_ok());
|
||||
assert_eq!(result.unwrap().data.unwrap().state, VmState::Running);
|
||||
|
||||
// Pause VM
|
||||
let request = VmStateRequest {
|
||||
action: VmStateAction::Pause,
|
||||
};
|
||||
let result = handler.put_state(request).await;
|
||||
assert!(result.is_ok());
|
||||
assert_eq!(result.unwrap().data.unwrap().state, VmState::Paused);
|
||||
|
||||
// Resume VM
|
||||
let request = VmStateRequest {
|
||||
action: VmStateAction::Resume,
|
||||
};
|
||||
let result = handler.put_state(request).await;
|
||||
assert!(result.is_ok());
|
||||
assert_eq!(result.unwrap().data.unwrap().state, VmState::Running);
|
||||
}
|
||||
}
|
||||
25
vmm/api-test/src/api/mod.rs
Normal file
25
vmm/api-test/src/api/mod.rs
Normal file
@@ -0,0 +1,25 @@
|
||||
//! Volt HTTP API
|
||||
//!
|
||||
//! Unix socket HTTP/1.1 API server (Firecracker-compatible style).
|
||||
//! Provides endpoints for VM configuration and lifecycle management.
|
||||
//!
|
||||
//! ## Endpoints
|
||||
//!
|
||||
//! - `PUT /v1/vm/config` - Pre-boot VM configuration
|
||||
//! - `GET /v1/vm/config` - Get current configuration
|
||||
//! - `PUT /v1/vm/state` - Change VM state (start/stop/pause/resume)
|
||||
//! - `GET /v1/vm/state` - Get current VM state
|
||||
//! - `GET /v1/metrics` - Prometheus-format metrics
|
||||
//! - `GET /health` - Health check
|
||||
|
||||
mod handlers;
|
||||
mod routes;
|
||||
mod server;
|
||||
mod types;
|
||||
|
||||
pub use handlers::ApiHandler;
|
||||
pub use server::{run_server, ServerBuilder};
|
||||
pub use types::{
|
||||
ApiError, ApiResponse, NetworkConfig, VmConfig, VmState, VmStateAction, VmStateRequest,
|
||||
VmStateResponse,
|
||||
};
|
||||
193
vmm/api-test/src/api/routes.rs
Normal file
193
vmm/api-test/src/api/routes.rs
Normal file
@@ -0,0 +1,193 @@
|
||||
//! API Route Definitions
|
||||
//!
|
||||
//! Maps HTTP paths and methods to handlers.
|
||||
|
||||
use super::handlers::ApiHandler;
|
||||
use super::types::ApiError;
|
||||
use http_body_util::{BodyExt, Full};
|
||||
use hyper::body::Bytes;
|
||||
use hyper::{Method, Request, Response, StatusCode};
|
||||
use std::time::Instant;
|
||||
use tracing::{debug, error};
|
||||
|
||||
/// Route an incoming request to the appropriate handler
|
||||
pub async fn route_request(
|
||||
handler: ApiHandler,
|
||||
req: Request<hyper::body::Incoming>,
|
||||
) -> Result<Response<Full<Bytes>>, hyper::Error> {
|
||||
let start = Instant::now();
|
||||
let method = req.method().clone();
|
||||
let path = req.uri().path().to_string();
|
||||
|
||||
debug!(%method, %path, "Incoming request");
|
||||
|
||||
let response = match (method.clone(), path.as_str()) {
|
||||
// VM Configuration
|
||||
(Method::PUT, "/v1/vm/config") => handle_put_config(handler.clone(), req).await,
|
||||
(Method::GET, "/v1/vm/config") => handle_get_config(handler.clone()).await,
|
||||
|
||||
// VM State
|
||||
(Method::PUT, "/v1/vm/state") => handle_put_state(handler.clone(), req).await,
|
||||
(Method::GET, "/v1/vm/state") => handle_get_state(handler.clone()).await,
|
||||
|
||||
// Metrics
|
||||
(Method::GET, "/v1/metrics") | (Method::GET, "/metrics") => {
|
||||
handle_metrics(handler.clone()).await
|
||||
}
|
||||
|
||||
// Health check
|
||||
(Method::GET, "/") | (Method::GET, "/health") => Ok(json_response(
|
||||
StatusCode::OK,
|
||||
r#"{"status":"ok","version":"0.1.0"}"#,
|
||||
)),
|
||||
|
||||
// 404 for unknown paths
|
||||
(_, path) => {
|
||||
debug!("Unknown path: {}", path);
|
||||
Ok(error_response(ApiError::NotFound(path.to_string())))
|
||||
}
|
||||
};
|
||||
|
||||
// Record metrics
|
||||
let duration = start.elapsed().as_secs_f64();
|
||||
handler.record_request(duration);
|
||||
|
||||
debug!(%method, path = %req.uri().path(), duration_ms = duration * 1000.0, "Request completed");
|
||||
|
||||
response
|
||||
}
|
||||
|
||||
async fn handle_put_config(
|
||||
handler: ApiHandler,
|
||||
req: Request<hyper::body::Incoming>,
|
||||
) -> Result<Response<Full<Bytes>>, hyper::Error> {
|
||||
// Read request body
|
||||
let body = match read_body(req).await {
|
||||
Ok(b) => b,
|
||||
Err(e) => return Ok(error_response(e)),
|
||||
};
|
||||
|
||||
// Parse JSON
|
||||
let config = match serde_json::from_slice(&body) {
|
||||
Ok(c) => c,
|
||||
Err(e) => {
|
||||
return Ok(error_response(ApiError::BadRequest(format!(
|
||||
"Invalid JSON: {}",
|
||||
e
|
||||
))))
|
||||
}
|
||||
};
|
||||
|
||||
// Handle request
|
||||
match handler.put_config(config).await {
|
||||
Ok(response) => Ok(json_response(
|
||||
StatusCode::OK,
|
||||
&serde_json::to_string(&response).unwrap(),
|
||||
)),
|
||||
Err(e) => Ok(error_response(e)),
|
||||
}
|
||||
}
|
||||
|
||||
async fn handle_get_config(
|
||||
handler: ApiHandler,
|
||||
) -> Result<Response<Full<Bytes>>, hyper::Error> {
|
||||
match handler.get_config().await {
|
||||
Ok(response) => Ok(json_response(
|
||||
StatusCode::OK,
|
||||
&serde_json::to_string(&response).unwrap(),
|
||||
)),
|
||||
Err(e) => Ok(error_response(e)),
|
||||
}
|
||||
}
|
||||
|
||||
async fn handle_put_state(
|
||||
handler: ApiHandler,
|
||||
req: Request<hyper::body::Incoming>,
|
||||
) -> Result<Response<Full<Bytes>>, hyper::Error> {
|
||||
// Read request body
|
||||
let body = match read_body(req).await {
|
||||
Ok(b) => b,
|
||||
Err(e) => return Ok(error_response(e)),
|
||||
};
|
||||
|
||||
// Parse JSON
|
||||
let request = match serde_json::from_slice(&body) {
|
||||
Ok(r) => r,
|
||||
Err(e) => {
|
||||
return Ok(error_response(ApiError::BadRequest(format!(
|
||||
"Invalid JSON: {}",
|
||||
e
|
||||
))))
|
||||
}
|
||||
};
|
||||
|
||||
// Handle request
|
||||
match handler.put_state(request).await {
|
||||
Ok(response) => Ok(json_response(
|
||||
StatusCode::OK,
|
||||
&serde_json::to_string(&response).unwrap(),
|
||||
)),
|
||||
Err(e) => Ok(error_response(e)),
|
||||
}
|
||||
}
|
||||
|
||||
async fn handle_get_state(
|
||||
handler: ApiHandler,
|
||||
) -> Result<Response<Full<Bytes>>, hyper::Error> {
|
||||
match handler.get_state().await {
|
||||
Ok(response) => Ok(json_response(
|
||||
StatusCode::OK,
|
||||
&serde_json::to_string(&response).unwrap(),
|
||||
)),
|
||||
Err(e) => Ok(error_response(e)),
|
||||
}
|
||||
}
|
||||
|
||||
async fn handle_metrics(
|
||||
handler: ApiHandler,
|
||||
) -> Result<Response<Full<Bytes>>, hyper::Error> {
|
||||
match handler.get_metrics().await {
|
||||
Ok(metrics) => Ok(Response::builder()
|
||||
.status(StatusCode::OK)
|
||||
.header("Content-Type", "text/plain; version=0.0.4")
|
||||
.body(Full::new(Bytes::from(metrics)))
|
||||
.unwrap()),
|
||||
Err(e) => Ok(error_response(e)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Read the full request body into bytes
|
||||
async fn read_body(req: Request<hyper::body::Incoming>) -> Result<Bytes, ApiError> {
|
||||
req.into_body()
|
||||
.collect()
|
||||
.await
|
||||
.map(|c| c.to_bytes())
|
||||
.map_err(|e| ApiError::Internal(format!("Failed to read body: {}", e)))
|
||||
}
|
||||
|
||||
/// Create a JSON response
|
||||
fn json_response(status: StatusCode, body: &str) -> Response<Full<Bytes>> {
|
||||
Response::builder()
|
||||
.status(status)
|
||||
.header("Content-Type", "application/json")
|
||||
.body(Full::new(Bytes::from(body.to_string())))
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
/// Create an error response from an ApiError
|
||||
fn error_response(error: ApiError) -> Response<Full<Bytes>> {
|
||||
let status = StatusCode::from_u16(error.status_code()).unwrap_or(StatusCode::INTERNAL_SERVER_ERROR);
|
||||
|
||||
let body = serde_json::json!({
|
||||
"success": false,
|
||||
"error": error.to_string()
|
||||
});
|
||||
|
||||
error!(status = %status, error = %error, "API error response");
|
||||
|
||||
Response::builder()
|
||||
.status(status)
|
||||
.header("Content-Type", "application/json")
|
||||
.body(Full::new(Bytes::from(body.to_string())))
|
||||
.unwrap()
|
||||
}
|
||||
164
vmm/api-test/src/api/server.rs
Normal file
164
vmm/api-test/src/api/server.rs
Normal file
@@ -0,0 +1,164 @@
|
||||
//! Unix Socket HTTP Server
|
||||
//!
|
||||
//! Listens on a Unix domain socket and handles HTTP/1.1 requests.
|
||||
//! Inspired by Firecracker's API server design.
|
||||
|
||||
use super::handlers::ApiHandler;
|
||||
use super::routes::route_request;
|
||||
use anyhow::{Context, Result};
|
||||
use http_body_util::Full;
|
||||
use hyper::body::Bytes;
|
||||
use hyper::server::conn::http1;
|
||||
use hyper::service::service_fn;
|
||||
use hyper_util::rt::TokioIo;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
use tokio::net::UnixListener;
|
||||
use tokio::signal;
|
||||
use tracing::{debug, error, info, warn};
|
||||
|
||||
/// Run the HTTP API server on a Unix socket
|
||||
pub async fn run_server(socket_path: &str) -> Result<()> {
|
||||
// Remove existing socket file if present
|
||||
let path = Path::new(socket_path);
|
||||
if path.exists() {
|
||||
std::fs::remove_file(path).context("Failed to remove existing socket")?;
|
||||
}
|
||||
|
||||
// Create the Unix listener
|
||||
let listener = UnixListener::bind(path).context("Failed to bind Unix socket")?;
|
||||
|
||||
// Set socket permissions (readable/writable by owner only for security)
|
||||
#[cfg(unix)]
|
||||
{
|
||||
use std::os::unix::fs::PermissionsExt;
|
||||
std::fs::set_permissions(path, std::fs::Permissions::from_mode(0o600))
|
||||
.context("Failed to set socket permissions")?;
|
||||
}
|
||||
|
||||
info!(socket = %socket_path, "Volt API server listening");
|
||||
|
||||
// Create shared handler
|
||||
let handler = Arc::new(ApiHandler::new());
|
||||
|
||||
// Accept connections in a loop
|
||||
loop {
|
||||
tokio::select! {
|
||||
// Accept new connections
|
||||
result = listener.accept() => {
|
||||
match result {
|
||||
Ok((stream, _addr)) => {
|
||||
let handler = Arc::clone(&handler);
|
||||
debug!("New connection accepted");
|
||||
|
||||
// Spawn a task to handle this connection
|
||||
tokio::spawn(async move {
|
||||
let io = TokioIo::new(stream);
|
||||
|
||||
// Create the service function
|
||||
let service = service_fn(move |req| {
|
||||
let handler = (*handler).clone();
|
||||
async move { route_request(handler, req).await }
|
||||
});
|
||||
|
||||
// Serve the connection with HTTP/1
|
||||
if let Err(e) = http1::Builder::new()
|
||||
.serve_connection(io, service)
|
||||
.await
|
||||
{
|
||||
// Connection reset by peer is common and not an error
|
||||
if !e.to_string().contains("connection reset") {
|
||||
error!("Connection error: {}", e);
|
||||
}
|
||||
}
|
||||
|
||||
debug!("Connection closed");
|
||||
});
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Accept failed: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Handle shutdown signals
|
||||
_ = signal::ctrl_c() => {
|
||||
info!("Shutdown signal received");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Cleanup socket file
|
||||
if path.exists() {
|
||||
if let Err(e) = std::fs::remove_file(path) {
|
||||
warn!("Failed to remove socket file: {}", e);
|
||||
}
|
||||
}
|
||||
|
||||
info!("API server shut down");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Server builder for more configuration options
|
||||
pub struct ServerBuilder {
|
||||
socket_path: String,
|
||||
socket_permissions: u32,
|
||||
}
|
||||
|
||||
impl ServerBuilder {
|
||||
pub fn new(socket_path: impl Into<String>) -> Self {
|
||||
ServerBuilder {
|
||||
socket_path: socket_path.into(),
|
||||
socket_permissions: 0o600,
|
||||
}
|
||||
}
|
||||
|
||||
/// Set socket file permissions (Unix only)
|
||||
pub fn permissions(mut self, mode: u32) -> Self {
|
||||
self.socket_permissions = mode;
|
||||
self
|
||||
}
|
||||
|
||||
/// Build and run the server
|
||||
pub async fn run(self) -> Result<()> {
|
||||
run_server(&self.socket_path).await
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::time::Duration;
|
||||
use tokio::io::{AsyncReadExt, AsyncWriteExt};
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_server_starts_and_accepts_connections() {
|
||||
let socket_path = "/tmp/volt-vmm-test.sock";
|
||||
|
||||
// Start server in background
|
||||
let server_handle = tokio::spawn(async move {
|
||||
let _ = run_server(socket_path).await;
|
||||
});
|
||||
|
||||
// Give server time to start
|
||||
tokio::time::sleep(Duration::from_millis(100)).await;
|
||||
|
||||
// Connect and send a simple request
|
||||
if let Ok(mut stream) = tokio::net::UnixStream::connect(socket_path).await {
|
||||
let request = "GET /health HTTP/1.1\r\nHost: localhost\r\n\r\n";
|
||||
stream.write_all(request.as_bytes()).await.unwrap();
|
||||
|
||||
let mut response = vec![0u8; 1024];
|
||||
let n = stream.read(&mut response).await.unwrap();
|
||||
let response_str = String::from_utf8_lossy(&response[..n]);
|
||||
|
||||
assert!(response_str.contains("HTTP/1.1 200"));
|
||||
assert!(response_str.contains("ok"));
|
||||
}
|
||||
|
||||
// Cleanup
|
||||
server_handle.abort();
|
||||
let _ = std::fs::remove_file(socket_path);
|
||||
}
|
||||
}
|
||||
200
vmm/api-test/src/api/types.rs
Normal file
200
vmm/api-test/src/api/types.rs
Normal file
@@ -0,0 +1,200 @@
|
||||
//! API Types and Data Structures
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fmt;
|
||||
|
||||
/// VM configuration for pre-boot setup
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
|
||||
pub struct VmConfig {
|
||||
/// Number of vCPUs
|
||||
#[serde(default = "default_vcpu_count")]
|
||||
pub vcpu_count: u8,
|
||||
|
||||
/// Memory size in MiB
|
||||
#[serde(default = "default_mem_size_mib")]
|
||||
pub mem_size_mib: u32,
|
||||
|
||||
/// Path to kernel image
|
||||
pub kernel_image_path: Option<String>,
|
||||
|
||||
/// Kernel boot arguments
|
||||
#[serde(default)]
|
||||
pub boot_args: String,
|
||||
|
||||
/// Path to root filesystem
|
||||
pub rootfs_path: Option<String>,
|
||||
|
||||
/// Network configuration
|
||||
pub network: Option<NetworkConfig>,
|
||||
|
||||
/// Enable HugePages for memory
|
||||
#[serde(default)]
|
||||
pub hugepages: bool,
|
||||
}
|
||||
|
||||
fn default_vcpu_count() -> u8 {
|
||||
1
|
||||
}
|
||||
|
||||
fn default_mem_size_mib() -> u32 {
|
||||
128
|
||||
}
|
||||
|
||||
/// Network configuration
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct NetworkConfig {
|
||||
/// TAP device name
|
||||
pub tap_device: String,
|
||||
|
||||
/// Guest MAC address
|
||||
pub guest_mac: Option<String>,
|
||||
|
||||
/// Host IP for the TAP interface
|
||||
pub host_ip: Option<String>,
|
||||
|
||||
/// Guest IP
|
||||
pub guest_ip: Option<String>,
|
||||
}
|
||||
|
||||
/// VM runtime state
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
pub enum VmState {
|
||||
/// VM is not yet configured
|
||||
NotConfigured,
|
||||
/// VM is configured but not started
|
||||
Configured,
|
||||
/// VM is starting up
|
||||
Starting,
|
||||
/// VM is running
|
||||
Running,
|
||||
/// VM is paused
|
||||
Paused,
|
||||
/// VM is shutting down
|
||||
ShuttingDown,
|
||||
/// VM has stopped
|
||||
Stopped,
|
||||
/// VM encountered an error
|
||||
Error,
|
||||
}
|
||||
|
||||
impl Default for VmState {
|
||||
fn default() -> Self {
|
||||
VmState::NotConfigured
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for VmState {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
VmState::NotConfigured => write!(f, "not_configured"),
|
||||
VmState::Configured => write!(f, "configured"),
|
||||
VmState::Starting => write!(f, "starting"),
|
||||
VmState::Running => write!(f, "running"),
|
||||
VmState::Paused => write!(f, "paused"),
|
||||
VmState::ShuttingDown => write!(f, "shutting_down"),
|
||||
VmState::Stopped => write!(f, "stopped"),
|
||||
VmState::Error => write!(f, "error"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Action to change VM state
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
pub enum VmStateAction {
|
||||
/// Start the VM
|
||||
Start,
|
||||
/// Pause the VM (freeze vCPUs)
|
||||
Pause,
|
||||
/// Resume a paused VM
|
||||
Resume,
|
||||
/// Graceful shutdown
|
||||
Shutdown,
|
||||
/// Force stop
|
||||
Stop,
|
||||
}
|
||||
|
||||
/// Request body for state changes
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct VmStateRequest {
|
||||
pub action: VmStateAction,
|
||||
}
|
||||
|
||||
/// VM state response
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct VmStateResponse {
|
||||
pub state: VmState,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub message: Option<String>,
|
||||
}
|
||||
|
||||
/// Generic API response wrapper
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ApiResponse<T> {
|
||||
pub success: bool,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub data: Option<T>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub error: Option<String>,
|
||||
}
|
||||
|
||||
impl<T> ApiResponse<T> {
|
||||
pub fn ok(data: T) -> Self {
|
||||
ApiResponse {
|
||||
success: true,
|
||||
data: Some(data),
|
||||
error: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn error(msg: impl Into<String>) -> Self {
|
||||
ApiResponse {
|
||||
success: false,
|
||||
data: None,
|
||||
error: Some(msg.into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// API error types
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum ApiError {
|
||||
#[error("Invalid request: {0}")]
|
||||
BadRequest(String),
|
||||
|
||||
#[error("Not found: {0}")]
|
||||
NotFound(String),
|
||||
|
||||
#[error("Method not allowed")]
|
||||
MethodNotAllowed,
|
||||
|
||||
#[error("Invalid state transition: cannot {action} from {current_state}")]
|
||||
InvalidStateTransition {
|
||||
current_state: VmState,
|
||||
action: String,
|
||||
},
|
||||
|
||||
#[error("VM not configured")]
|
||||
NotConfigured,
|
||||
|
||||
#[error("Internal error: {0}")]
|
||||
Internal(String),
|
||||
|
||||
#[error("JSON error: {0}")]
|
||||
Json(#[from] serde_json::Error),
|
||||
}
|
||||
|
||||
impl ApiError {
|
||||
pub fn status_code(&self) -> u16 {
|
||||
match self {
|
||||
ApiError::BadRequest(_) => 400,
|
||||
ApiError::NotFound(_) => 404,
|
||||
ApiError::MethodNotAllowed => 405,
|
||||
ApiError::InvalidStateTransition { .. } => 409,
|
||||
ApiError::NotConfigured => 409,
|
||||
ApiError::Internal(_) => 500,
|
||||
ApiError::Json(_) => 400,
|
||||
}
|
||||
}
|
||||
}
|
||||
5
vmm/api-test/src/lib.rs
Normal file
5
vmm/api-test/src/lib.rs
Normal file
@@ -0,0 +1,5 @@
|
||||
//! Volt API Test Crate
|
||||
|
||||
pub mod api;
|
||||
|
||||
pub use api::{run_server, VmConfig, VmState, VmStateAction};
|
||||
307
vmm/docs/NETWORKD_NATIVE_NETWORKING.md
Normal file
307
vmm/docs/NETWORKD_NATIVE_NETWORKING.md
Normal file
@@ -0,0 +1,307 @@
|
||||
# Networkd-Native VM Networking Design
|
||||
|
||||
## Executive Summary
|
||||
|
||||
This document describes a networking architecture for Volt VMs that **replaces virtio-net** with networkd-native approaches, achieving significantly higher performance through kernel bypass and direct hardware access.
|
||||
|
||||
## Performance Comparison
|
||||
|
||||
| Backend | Throughput | Latency | CPU Usage | Complexity |
|
||||
|--------------------|---------------|--------------|------------|------------|
|
||||
| virtio-net (user) | ~1-2 Gbps | ~50-100μs | High | Low |
|
||||
| virtio-net (vhost) | ~10 Gbps | ~20-50μs | Medium | Low |
|
||||
| **macvtap** | **~20+ Gbps** | ~10-20μs | Low | Low |
|
||||
| **AF_XDP** | **~40+ Gbps** | **~5-10μs** | Very Low | High |
|
||||
| vhost-user-net | ~25 Gbps | ~15-25μs | Low | Medium |
|
||||
|
||||
## Architecture Overview
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────┐
|
||||
│ Host Network Stack │
|
||||
│ ┌─────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ systemd-networkd │ │
|
||||
│ │ ┌──────────────┐ ┌──────────────┐ ┌──────────────────────┐ │ │
|
||||
│ │ │ .network │ │ .netdev │ │ .link │ │ │
|
||||
│ │ │ files │ │ files │ │ files │ │ │
|
||||
│ │ └──────────────┘ └──────────────┘ └──────────────────────┘ │ │
|
||||
│ └─────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ ┌───────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ Network Backends │ │
|
||||
│ │ │ │
|
||||
│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ │
|
||||
│ │ │ macvtap │ │ AF_XDP │ │ vhost-user │ │ │
|
||||
│ │ │ Backend │ │ Backend │ │ Backend │ │ │
|
||||
│ │ │ │ │ │ │ │ │ │
|
||||
│ │ │ /dev/tapN │ │ XSK socket │ │ Unix sock │ │ │
|
||||
│ │ └──────┬──────┘ └──────┬──────┘ └──────┬──────┘ │ │
|
||||
│ │ │ │ │ │ │
|
||||
│ │ ┌──────┴────────────────┴────────────────┴──────┐ │ │
|
||||
│ │ │ Unified NetDevice API │ │ │
|
||||
│ │ │ (trait-based abstraction) │ │ │
|
||||
│ │ └────────────────────────┬───────────────────────┘ │ │
|
||||
│ │ │ │ │
|
||||
│ └───────────────────────────┼────────────────────────────────────────┘ │
|
||||
│ │ │
|
||||
│ ┌───────────────────────────┼───────────────────────────────────────┐ │
|
||||
│ │ Volt VMM │ │
|
||||
│ │ │ │ │
|
||||
│ │ ┌────────────────────────┴───────────────────────────────────┐ │ │
|
||||
│ │ │ VirtIO Compatibility │ │ │
|
||||
│ │ │ ┌─────────────────┐ ┌─────────────────┐ │ │ │
|
||||
│ │ │ │ virtio-net HDR │ │ Guest Driver │ │ │ │
|
||||
│ │ │ │ translation │ │ Compatibility │ │ │ │
|
||||
│ │ │ └─────────────────┘ └─────────────────┘ │ │ │
|
||||
│ │ └────────────────────────────────────────────────────────────┘ │ │
|
||||
│ └───────────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ ▼ │
|
||||
│ ┌─────────────────┐ │
|
||||
│ │ Physical NIC │ │
|
||||
│ │ (or veth pair) │ │
|
||||
│ └─────────────────┘ │
|
||||
└─────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## Option 1: macvtap (Recommended Default)
|
||||
|
||||
### Why macvtap?
|
||||
|
||||
- **No bridge needed**: Direct attachment to physical NIC
|
||||
- **Near-native performance**: Packets bypass userspace entirely
|
||||
- **Networkd integration**: First-class support via `.netdev` files
|
||||
- **Simple setup**: Works like a TAP but with hardware acceleration
|
||||
- **Multi-queue support**: Scale with multiple vCPUs
|
||||
|
||||
### How it Works
|
||||
|
||||
```
|
||||
┌────────────────────────────────────────────────────────────────┐
|
||||
│ Guest VM │
|
||||
│ ┌──────────────────────────────────────────────────────────┐ │
|
||||
│ │ virtio-net driver │ │
|
||||
│ └────────────────────────────┬─────────────────────────────┘ │
|
||||
└───────────────────────────────┼─────────────────────────────────┘
|
||||
│
|
||||
┌───────────────────────────────┼─────────────────────────────────┐
|
||||
│ Volt VMM │ │
|
||||
│ ┌────────────────────────────┴─────────────────────────────┐ │
|
||||
│ │ MacvtapDevice │ │
|
||||
│ │ ┌───────────────────────────────────────────────────┐ │ │
|
||||
│ │ │ /dev/tap<ifindex> │ │ │
|
||||
│ │ │ - read() → RX packets │ │ │
|
||||
│ │ │ - write() → TX packets │ │ │
|
||||
│ │ │ - ioctl() → offload config │ │ │
|
||||
│ │ └───────────────────────────────────────────────────┘ │ │
|
||||
│ └──────────────────────────────────────────────────────────┘ │
|
||||
└───────────────────────────────┬─────────────────────────────────┘
|
||||
│
|
||||
┌───────────┴───────────┐
|
||||
│ macvtap interface │
|
||||
│ (macvtap0) │
|
||||
└───────────┬───────────┘
|
||||
│ direct attachment
|
||||
┌───────────┴───────────┐
|
||||
│ Physical NIC │
|
||||
│ (eth0 / enp3s0) │
|
||||
└───────────────────────┘
|
||||
```
|
||||
|
||||
### macvtap Modes
|
||||
|
||||
| Mode | Description | Use Case |
|
||||
|------------|------------------------------------------|-----------------------------|
|
||||
| **vepa** | All traffic goes through external switch | Hardware switch with VEPA |
|
||||
| **bridge** | VMs can communicate directly | Multi-VM on same host |
|
||||
| **private**| VMs isolated from each other | Tenant isolation |
|
||||
| **passthru**| Single VM owns the NIC | Maximum performance |
|
||||
|
||||
## Option 2: AF_XDP (Ultra-High Performance)
|
||||
|
||||
### Why AF_XDP?
|
||||
|
||||
- **Kernel bypass**: Zero-copy to/from NIC
|
||||
- **40+ Gbps**: Near line-rate on modern NICs
|
||||
- **eBPF integration**: Programmable packet processing
|
||||
- **XDP program**: Filter/redirect at driver level
|
||||
|
||||
### How it Works
|
||||
|
||||
```
|
||||
┌────────────────────────────────────────────────────────────────────┐
|
||||
│ Guest VM │
|
||||
│ ┌──────────────────────────────────────────────────────────────┐ │
|
||||
│ │ virtio-net driver │ │
|
||||
│ └────────────────────────────┬─────────────────────────────────┘ │
|
||||
└───────────────────────────────┼─────────────────────────────────────┘
|
||||
│
|
||||
┌───────────────────────────────┼─────────────────────────────────────┐
|
||||
│ Volt VMM │ │
|
||||
│ ┌────────────────────────────┴─────────────────────────────────┐ │
|
||||
│ │ AF_XDP Backend │ │
|
||||
│ │ ┌────────────────────────────────────────────────────────┐ │ │
|
||||
│ │ │ XSK Socket │ │ │
|
||||
│ │ │ ┌──────────────┐ ┌──────────────┐ │ │ │
|
||||
│ │ │ │ UMEM │ │ Fill/Comp │ │ │ │
|
||||
│ │ │ │ (shared mem)│ │ Rings │ │ │ │
|
||||
│ │ │ └──────────────┘ └──────────────┘ │ │ │
|
||||
│ │ │ ┌──────────────┐ ┌──────────────┐ │ │ │
|
||||
│ │ │ │ RX Ring │ │ TX Ring │ │ │ │
|
||||
│ │ │ └──────────────┘ └──────────────┘ │ │ │
|
||||
│ │ └────────────────────────────────────────────────────────┘ │ │
|
||||
│ └──────────────────────────────────────────────────────────────┘ │
|
||||
└───────────────────────────────┬─────────────────────────────────────┘
|
||||
│
|
||||
┌───────────┴───────────┐
|
||||
│ XDP Program │
|
||||
│ (eBPF redirect) │
|
||||
└───────────┬───────────┘
|
||||
│ zero-copy
|
||||
┌───────────┴───────────┐
|
||||
│ Physical NIC │
|
||||
│ (XDP-capable) │
|
||||
└───────────────────────┘
|
||||
```
|
||||
|
||||
### AF_XDP Ring Structure
|
||||
|
||||
```
|
||||
UMEM (Shared Memory Region)
|
||||
┌─────────────────────────────────────────────┐
|
||||
│ Frame 0 │ Frame 1 │ Frame 2 │ ... │ Frame N │
|
||||
└─────────────────────────────────────────────┘
|
||||
↑ ↑
|
||||
│ │
|
||||
┌────┴────┐ ┌────┴────┐
|
||||
│ RX Ring │ │ TX Ring │
|
||||
│ (NIC→VM)│ │ (VM→NIC)│
|
||||
└─────────┘ └─────────┘
|
||||
↑ ↑
|
||||
│ │
|
||||
┌────┴────┐ ┌────┴────┐
|
||||
│ Fill │ │ Comp │
|
||||
│ Ring │ │ Ring │
|
||||
│ (empty) │ │ (done) │
|
||||
└─────────┘ └─────────┘
|
||||
```
|
||||
|
||||
## Option 3: Direct Namespace Networking (nspawn-style)
|
||||
|
||||
For containers and lightweight VMs, share the kernel network stack:
|
||||
|
||||
```
|
||||
┌──────────────────────────────────────────────────────────────────┐
|
||||
│ Host │
|
||||
│ ┌────────────────────────────────────────────────────────────┐ │
|
||||
│ │ Network Namespace (vm-ns0) │ │
|
||||
│ │ ┌──────────────────┐ │ │
|
||||
│ │ │ veth-vm0 │ ◄─── Guest sees this as eth0 │ │
|
||||
│ │ │ 10.0.0.2/24 │ │ │
|
||||
│ │ └────────┬─────────┘ │ │
|
||||
│ └───────────┼────────────────────────────────────────────────┘ │
|
||||
│ │ veth pair │
|
||||
│ ┌───────────┼────────────────────────────────────────────────┐ │
|
||||
│ │ │ Host Namespace │ │
|
||||
│ │ ┌────────┴─────────┐ │ │
|
||||
│ │ │ veth-host0 │ │ │
|
||||
│ │ │ 10.0.0.1/24 │ │ │
|
||||
│ │ └────────┬─────────┘ │ │
|
||||
│ │ │ │ │
|
||||
│ │ ┌────────┴─────────┐ │ │
|
||||
│ │ │ nft/iptables │ NAT / routing │ │
|
||||
│ │ └────────┬─────────┘ │ │
|
||||
│ │ │ │ │
|
||||
│ │ ┌────────┴─────────┐ │ │
|
||||
│ │ │ eth0 │ Physical NIC │ │
|
||||
│ │ └──────────────────┘ │ │
|
||||
│ └────────────────────────────────────────────────────────────┘ │
|
||||
└──────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## Voltainer Integration
|
||||
|
||||
### Shared Networking Model
|
||||
|
||||
Volt VMs can participate in Voltainer's network zones:
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────┐
|
||||
│ Voltainer Network Zone │
|
||||
│ │
|
||||
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
|
||||
│ │ Container A │ │ Container B │ │ Volt │ │
|
||||
│ │ (nspawn) │ │ (nspawn) │ │ VM │ │
|
||||
│ │ │ │ │ │ │ │
|
||||
│ │ veth0 │ │ veth0 │ │ macvtap0 │ │
|
||||
│ │ 10.0.1.2 │ │ 10.0.1.3 │ │ 10.0.1.4 │ │
|
||||
│ └──────┬──────┘ └──────┬──────┘ └──────┬──────┘ │
|
||||
│ │ │ │ │
|
||||
│ ┌──────┴────────────────┴────────────────┴──────┐ │
|
||||
│ │ zone0 bridge │ │
|
||||
│ │ 10.0.1.1/24 │ │
|
||||
│ └────────────────────────┬───────────────────────┘ │
|
||||
│ │ │
|
||||
│ ┌──────┴──────┐ │
|
||||
│ │ nft NAT │ │
|
||||
│ └──────┬──────┘ │
|
||||
│ │ │
|
||||
│ ┌──────┴──────┐ │
|
||||
│ │ eth0 │ │
|
||||
│ └─────────────┘ │
|
||||
└─────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
### networkd Configuration Files
|
||||
|
||||
All networking is declarative via networkd drop-in files:
|
||||
|
||||
```
|
||||
/etc/systemd/network/
|
||||
├── 10-physical.link # udev rules for NIC naming
|
||||
├── 20-macvtap@.netdev # Template for macvtap devices
|
||||
├── 25-zone0.netdev # Voltainer zone bridge
|
||||
├── 25-zone0.network # Zone bridge configuration
|
||||
├── 30-vm-<uuid>.netdev # Per-VM macvtap
|
||||
└── 30-vm-<uuid>.network # Per-VM network config
|
||||
```
|
||||
|
||||
## Implementation Phases
|
||||
|
||||
### Phase 1: macvtap Backend (Immediate)
|
||||
- Implement `MacvtapDevice` replacing `TapDevice`
|
||||
- networkd integration via `.netdev` files
|
||||
- Multi-queue support
|
||||
|
||||
### Phase 2: AF_XDP Backend (High Performance)
|
||||
- XSK socket implementation
|
||||
- eBPF XDP redirect program
|
||||
- UMEM management with guest memory
|
||||
|
||||
### Phase 3: Voltainer Integration
|
||||
- Zone participation for VMs
|
||||
- Shared networking model
|
||||
- Service discovery
|
||||
|
||||
## Selection Criteria
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────┐
|
||||
│ Backend Selection Logic │
|
||||
├─────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ Is NIC XDP-capable? ──YES──► Need >25 Gbps? ──YES──► │
|
||||
│ │ │ │
|
||||
│ NO NO │
|
||||
│ ▼ ▼ │
|
||||
│ Need VM-to-VM on host? Use AF_XDP │
|
||||
│ │ │
|
||||
│ ┌─────┴─────┐ │
|
||||
│ YES NO │
|
||||
│ │ │ │
|
||||
│ ▼ ▼ │
|
||||
│ macvtap macvtap │
|
||||
│ (bridge) (passthru) │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
92
vmm/src/api/handlers.rs
Normal file
92
vmm/src/api/handlers.rs
Normal file
@@ -0,0 +1,92 @@
|
||||
//! API Request Handlers
|
||||
//!
|
||||
//! Business logic for VM lifecycle operations.
|
||||
|
||||
use tracing::{debug, info};
|
||||
|
||||
use super::types::ApiError;
|
||||
|
||||
/// Handler for VM operations
|
||||
#[derive(Debug, Default, Clone)]
|
||||
#[allow(dead_code)]
|
||||
pub struct ApiHandler {
|
||||
// Future: Add references to VMM components
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
impl ApiHandler {
|
||||
pub fn new() -> Self {
|
||||
Self::default()
|
||||
}
|
||||
|
||||
/// Record a request for metrics
|
||||
pub fn record_request(&self, _duration: f64) {
|
||||
// TODO: Implement metrics tracking
|
||||
}
|
||||
|
||||
/// Put VM configuration
|
||||
pub async fn put_config(&self, _config: super::types::VmConfig) -> Result<super::types::ApiResponse<()>, ApiError> {
|
||||
Ok(super::types::ApiResponse::ok(()))
|
||||
}
|
||||
|
||||
/// Get VM configuration
|
||||
pub async fn get_config(&self) -> Result<super::types::ApiResponse<super::types::VmConfig>, ApiError> {
|
||||
Ok(super::types::ApiResponse::ok(super::types::VmConfig::default()))
|
||||
}
|
||||
|
||||
/// Put VM state
|
||||
pub async fn put_state(&self, _request: super::types::VmStateRequest) -> Result<super::types::ApiResponse<super::types::VmState>, ApiError> {
|
||||
Ok(super::types::ApiResponse::ok(super::types::VmState::Running))
|
||||
}
|
||||
|
||||
/// Get VM state
|
||||
pub async fn get_state(&self) -> Result<super::types::ApiResponse<super::types::VmState>, ApiError> {
|
||||
Ok(super::types::ApiResponse::ok(super::types::VmState::Running))
|
||||
}
|
||||
|
||||
/// Get metrics
|
||||
pub async fn get_metrics(&self) -> Result<String, ApiError> {
|
||||
Ok("# Volt metrics\n".to_string())
|
||||
}
|
||||
|
||||
/// Start the VM
|
||||
pub fn start_vm(&self) -> Result<(), ApiError> {
|
||||
info!("API: Starting VM");
|
||||
// TODO: Integrate with VMM to actually start the VM
|
||||
// For now, just log the action
|
||||
debug!("VM start requested via API");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Pause the VM (freeze vCPUs)
|
||||
pub fn pause_vm(&self) -> Result<(), ApiError> {
|
||||
info!("API: Pausing VM");
|
||||
// TODO: Integrate with VMM to pause the VM
|
||||
debug!("VM pause requested via API");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Resume a paused VM
|
||||
pub fn resume_vm(&self) -> Result<(), ApiError> {
|
||||
info!("API: Resuming VM");
|
||||
// TODO: Integrate with VMM to resume the VM
|
||||
debug!("VM resume requested via API");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Graceful shutdown
|
||||
pub fn shutdown_vm(&self) -> Result<(), ApiError> {
|
||||
info!("API: Initiating VM shutdown");
|
||||
// TODO: Send ACPI shutdown signal to guest
|
||||
debug!("VM graceful shutdown requested via API");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Force stop
|
||||
pub fn stop_vm(&self) -> Result<(), ApiError> {
|
||||
info!("API: Force stopping VM");
|
||||
// TODO: Integrate with VMM to stop the VM
|
||||
debug!("VM force stop requested via API");
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
18
vmm/src/api/mod.rs
Normal file
18
vmm/src/api/mod.rs
Normal file
@@ -0,0 +1,18 @@
|
||||
//! Volt HTTP API
|
||||
//!
|
||||
//! Unix socket HTTP/1.1 API server (Firecracker-compatible style).
|
||||
//! Provides endpoints for VM configuration and lifecycle management.
|
||||
//!
|
||||
//! ## Endpoints
|
||||
//!
|
||||
//! - `PUT /machine-config` - Pre-boot VM configuration
|
||||
//! - `GET /machine-config` - Get current configuration
|
||||
//! - `PATCH /vm` - Change VM state (start/stop/pause/resume)
|
||||
//! - `GET /vm` - Get current VM state
|
||||
//! - `GET /health` - Health check
|
||||
|
||||
mod handlers;
|
||||
mod server;
|
||||
pub mod types;
|
||||
|
||||
pub use server::run_server;
|
||||
193
vmm/src/api/routes.rs
Normal file
193
vmm/src/api/routes.rs
Normal file
@@ -0,0 +1,193 @@
|
||||
//! API Route Definitions
|
||||
//!
|
||||
//! Maps HTTP paths and methods to handlers.
|
||||
|
||||
use super::handlers::ApiHandler;
|
||||
use super::types::ApiError;
|
||||
use http_body_util::{BodyExt, Full};
|
||||
use hyper::body::Bytes;
|
||||
use hyper::{Method, Request, Response, StatusCode};
|
||||
use std::time::Instant;
|
||||
use tracing::{debug, error};
|
||||
|
||||
/// Route an incoming request to the appropriate handler
|
||||
pub async fn route_request(
|
||||
handler: ApiHandler,
|
||||
req: Request<hyper::body::Incoming>,
|
||||
) -> Result<Response<Full<Bytes>>, hyper::Error> {
|
||||
let start = Instant::now();
|
||||
let method = req.method().clone();
|
||||
let path = req.uri().path().to_string();
|
||||
|
||||
debug!(%method, %path, "Incoming request");
|
||||
|
||||
let response = match (method.clone(), path.as_str()) {
|
||||
// VM Configuration
|
||||
(Method::PUT, "/v1/vm/config") => handle_put_config(handler.clone(), req).await,
|
||||
(Method::GET, "/v1/vm/config") => handle_get_config(handler.clone()).await,
|
||||
|
||||
// VM State
|
||||
(Method::PUT, "/v1/vm/state") => handle_put_state(handler.clone(), req).await,
|
||||
(Method::GET, "/v1/vm/state") => handle_get_state(handler.clone()).await,
|
||||
|
||||
// Metrics
|
||||
(Method::GET, "/v1/metrics") | (Method::GET, "/metrics") => {
|
||||
handle_metrics(handler.clone()).await
|
||||
}
|
||||
|
||||
// Health check
|
||||
(Method::GET, "/") | (Method::GET, "/health") => Ok(json_response(
|
||||
StatusCode::OK,
|
||||
r#"{"status":"ok","version":"0.1.0"}"#,
|
||||
)),
|
||||
|
||||
// 404 for unknown paths
|
||||
(_, path) => {
|
||||
debug!("Unknown path: {}", path);
|
||||
Ok(error_response(ApiError::NotFound(path.to_string())))
|
||||
}
|
||||
};
|
||||
|
||||
// Record metrics
|
||||
let duration = start.elapsed().as_secs_f64();
|
||||
handler.record_request(duration);
|
||||
|
||||
debug!(%method, path = %req.uri().path(), duration_ms = duration * 1000.0, "Request completed");
|
||||
|
||||
response
|
||||
}
|
||||
|
||||
async fn handle_put_config(
|
||||
handler: ApiHandler,
|
||||
req: Request<hyper::body::Incoming>,
|
||||
) -> Result<Response<Full<Bytes>>, hyper::Error> {
|
||||
// Read request body
|
||||
let body = match read_body(req).await {
|
||||
Ok(b) => b,
|
||||
Err(e) => return Ok(error_response(e)),
|
||||
};
|
||||
|
||||
// Parse JSON
|
||||
let config = match serde_json::from_slice(&body) {
|
||||
Ok(c) => c,
|
||||
Err(e) => {
|
||||
return Ok(error_response(ApiError::BadRequest(format!(
|
||||
"Invalid JSON: {}",
|
||||
e
|
||||
))))
|
||||
}
|
||||
};
|
||||
|
||||
// Handle request
|
||||
match handler.put_config(config).await {
|
||||
Ok(response) => Ok(json_response(
|
||||
StatusCode::OK,
|
||||
&serde_json::to_string(&response).unwrap(),
|
||||
)),
|
||||
Err(e) => Ok(error_response(e)),
|
||||
}
|
||||
}
|
||||
|
||||
async fn handle_get_config(
|
||||
handler: ApiHandler,
|
||||
) -> Result<Response<Full<Bytes>>, hyper::Error> {
|
||||
match handler.get_config().await {
|
||||
Ok(response) => Ok(json_response(
|
||||
StatusCode::OK,
|
||||
&serde_json::to_string(&response).unwrap(),
|
||||
)),
|
||||
Err(e) => Ok(error_response(e)),
|
||||
}
|
||||
}
|
||||
|
||||
async fn handle_put_state(
|
||||
handler: ApiHandler,
|
||||
req: Request<hyper::body::Incoming>,
|
||||
) -> Result<Response<Full<Bytes>>, hyper::Error> {
|
||||
// Read request body
|
||||
let body = match read_body(req).await {
|
||||
Ok(b) => b,
|
||||
Err(e) => return Ok(error_response(e)),
|
||||
};
|
||||
|
||||
// Parse JSON
|
||||
let request = match serde_json::from_slice(&body) {
|
||||
Ok(r) => r,
|
||||
Err(e) => {
|
||||
return Ok(error_response(ApiError::BadRequest(format!(
|
||||
"Invalid JSON: {}",
|
||||
e
|
||||
))))
|
||||
}
|
||||
};
|
||||
|
||||
// Handle request
|
||||
match handler.put_state(request).await {
|
||||
Ok(response) => Ok(json_response(
|
||||
StatusCode::OK,
|
||||
&serde_json::to_string(&response).unwrap(),
|
||||
)),
|
||||
Err(e) => Ok(error_response(e)),
|
||||
}
|
||||
}
|
||||
|
||||
async fn handle_get_state(
|
||||
handler: ApiHandler,
|
||||
) -> Result<Response<Full<Bytes>>, hyper::Error> {
|
||||
match handler.get_state().await {
|
||||
Ok(response) => Ok(json_response(
|
||||
StatusCode::OK,
|
||||
&serde_json::to_string(&response).unwrap(),
|
||||
)),
|
||||
Err(e) => Ok(error_response(e)),
|
||||
}
|
||||
}
|
||||
|
||||
async fn handle_metrics(
|
||||
handler: ApiHandler,
|
||||
) -> Result<Response<Full<Bytes>>, hyper::Error> {
|
||||
match handler.get_metrics().await {
|
||||
Ok(metrics) => Ok(Response::builder()
|
||||
.status(StatusCode::OK)
|
||||
.header("Content-Type", "text/plain; version=0.0.4")
|
||||
.body(Full::new(Bytes::from(metrics)))
|
||||
.unwrap()),
|
||||
Err(e) => Ok(error_response(e)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Read the full request body into bytes
|
||||
async fn read_body(req: Request<hyper::body::Incoming>) -> Result<Bytes, ApiError> {
|
||||
req.into_body()
|
||||
.collect()
|
||||
.await
|
||||
.map(|c| c.to_bytes())
|
||||
.map_err(|e| ApiError::Internal(format!("Failed to read body: {}", e)))
|
||||
}
|
||||
|
||||
/// Create a JSON response
|
||||
fn json_response(status: StatusCode, body: &str) -> Response<Full<Bytes>> {
|
||||
Response::builder()
|
||||
.status(status)
|
||||
.header("Content-Type", "application/json")
|
||||
.body(Full::new(Bytes::from(body.to_string())))
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
/// Create an error response from an ApiError
|
||||
fn error_response(error: ApiError) -> Response<Full<Bytes>> {
|
||||
let status = StatusCode::from_u16(error.status_code()).unwrap_or(StatusCode::INTERNAL_SERVER_ERROR);
|
||||
|
||||
let body = serde_json::json!({
|
||||
"success": false,
|
||||
"error": error.to_string()
|
||||
});
|
||||
|
||||
error!(status = %status, error = %error, "API error response");
|
||||
|
||||
Response::builder()
|
||||
.status(status)
|
||||
.header("Content-Type", "application/json")
|
||||
.body(Full::new(Bytes::from(body.to_string())))
|
||||
.unwrap()
|
||||
}
|
||||
317
vmm/src/api/server.rs
Normal file
317
vmm/src/api/server.rs
Normal file
@@ -0,0 +1,317 @@
|
||||
//! Volt API Server
|
||||
//!
|
||||
//! Unix socket HTTP/1.1 API server for VM lifecycle management.
|
||||
//! Compatible with Firecracker-style REST API.
|
||||
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use axum::{
|
||||
extract::State,
|
||||
http::StatusCode,
|
||||
response::IntoResponse,
|
||||
routing::{get, put},
|
||||
Json, Router,
|
||||
};
|
||||
use parking_lot::RwLock;
|
||||
use serde_json::json;
|
||||
use tokio::net::UnixListener;
|
||||
use tracing::{debug, info};
|
||||
|
||||
use super::handlers::ApiHandler;
|
||||
use super::types::{ApiError, ApiResponse, SnapshotRequest, VmConfig, VmState, VmStateAction, VmStateRequest};
|
||||
|
||||
/// Shared API state
|
||||
pub struct ApiState {
|
||||
/// VM configuration
|
||||
pub vm_config: RwLock<Option<VmConfig>>,
|
||||
/// Current VM state
|
||||
pub vm_state: RwLock<VmState>,
|
||||
/// Handler for VM operations
|
||||
pub handler: ApiHandler,
|
||||
}
|
||||
|
||||
impl Default for ApiState {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
vm_config: RwLock::new(None),
|
||||
vm_state: RwLock::new(VmState::NotConfigured),
|
||||
handler: ApiHandler::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Run the API server on a Unix socket
|
||||
pub async fn run_server(socket_path: &str) -> Result<()> {
|
||||
let path = Path::new(socket_path);
|
||||
|
||||
// Remove existing socket if it exists
|
||||
if path.exists() {
|
||||
std::fs::remove_file(path)
|
||||
.with_context(|| format!("Failed to remove existing socket: {}", socket_path))?;
|
||||
}
|
||||
|
||||
// Create parent directory if needed
|
||||
if let Some(parent) = path.parent() {
|
||||
std::fs::create_dir_all(parent)
|
||||
.with_context(|| format!("Failed to create socket directory: {}", parent.display()))?;
|
||||
}
|
||||
|
||||
// Bind to Unix socket
|
||||
let listener = UnixListener::bind(path)
|
||||
.with_context(|| format!("Failed to bind to socket: {}", socket_path))?;
|
||||
|
||||
info!("API server listening on {}", socket_path);
|
||||
|
||||
// Create shared state
|
||||
let state = Arc::new(ApiState::default());
|
||||
|
||||
// Build router
|
||||
let app = Router::new()
|
||||
// Health check
|
||||
.route("/", get(root_handler))
|
||||
.route("/health", get(health_handler))
|
||||
// VM configuration
|
||||
.route("/machine-config", get(get_machine_config).put(put_machine_config))
|
||||
// VM state
|
||||
.route("/vm", get(get_vm_state).patch(patch_vm_state))
|
||||
// Info
|
||||
.route("/version", get(version_handler))
|
||||
.route("/vm-config", get(get_full_config))
|
||||
// Drives
|
||||
.route("/drives/{drive_id}", put(put_drive))
|
||||
// Network
|
||||
.route("/network-interfaces/{iface_id}", put(put_network_interface))
|
||||
// Snapshot/Restore
|
||||
.route("/snapshot/create", put(put_snapshot_create))
|
||||
.route("/snapshot/load", put(put_snapshot_load))
|
||||
// State fallback
|
||||
.with_state(state);
|
||||
|
||||
// Run server
|
||||
axum::serve(listener, app)
|
||||
.await
|
||||
.context("API server error")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Route Handlers
|
||||
// ============================================================================
|
||||
|
||||
async fn root_handler() -> impl IntoResponse {
|
||||
Json(json!({
|
||||
"name": "Volt VMM",
|
||||
"version": env!("CARGO_PKG_VERSION"),
|
||||
"status": "ok"
|
||||
}))
|
||||
}
|
||||
|
||||
async fn health_handler() -> impl IntoResponse {
|
||||
(StatusCode::OK, Json(json!({ "status": "healthy" })))
|
||||
}
|
||||
|
||||
async fn version_handler() -> impl IntoResponse {
|
||||
Json(json!({
|
||||
"version": env!("CARGO_PKG_VERSION"),
|
||||
"git_commit": option_env!("GIT_COMMIT").unwrap_or("unknown"),
|
||||
"build_date": option_env!("BUILD_DATE").unwrap_or("unknown")
|
||||
}))
|
||||
}
|
||||
|
||||
async fn get_machine_config(
|
||||
State(state): State<Arc<ApiState>>,
|
||||
) -> Result<Json<ApiResponse<VmConfig>>, ApiErrorResponse> {
|
||||
let config = state.vm_config.read();
|
||||
match config.as_ref() {
|
||||
Some(cfg) => Ok(Json(ApiResponse::ok(cfg.clone()))),
|
||||
None => Err(ApiErrorResponse::from(ApiError::NotConfigured)),
|
||||
}
|
||||
}
|
||||
|
||||
async fn put_machine_config(
|
||||
State(state): State<Arc<ApiState>>,
|
||||
Json(config): Json<VmConfig>,
|
||||
) -> Result<impl IntoResponse, ApiErrorResponse> {
|
||||
let current_state = *state.vm_state.read();
|
||||
|
||||
// Can only configure before starting
|
||||
if current_state != VmState::NotConfigured && current_state != VmState::Configured {
|
||||
return Err(ApiErrorResponse::from(ApiError::InvalidStateTransition {
|
||||
current_state,
|
||||
action: "configure".to_string(),
|
||||
}));
|
||||
}
|
||||
|
||||
// Validate configuration
|
||||
if config.vcpu_count == 0 {
|
||||
return Err(ApiErrorResponse::from(ApiError::BadRequest(
|
||||
"vcpu_count must be >= 1".to_string(),
|
||||
)));
|
||||
}
|
||||
|
||||
if config.mem_size_mib < 16 {
|
||||
return Err(ApiErrorResponse::from(ApiError::BadRequest(
|
||||
"mem_size_mib must be >= 16".to_string(),
|
||||
)));
|
||||
}
|
||||
|
||||
debug!("Updating machine config: {:?}", config);
|
||||
|
||||
*state.vm_config.write() = Some(config.clone());
|
||||
*state.vm_state.write() = VmState::Configured;
|
||||
|
||||
Ok((
|
||||
StatusCode::NO_CONTENT,
|
||||
Json(ApiResponse::<()>::ok(())),
|
||||
))
|
||||
}
|
||||
|
||||
async fn get_vm_state(
|
||||
State(state): State<Arc<ApiState>>,
|
||||
) -> Json<ApiResponse<VmState>> {
|
||||
let vm_state = *state.vm_state.read();
|
||||
Json(ApiResponse::ok(vm_state))
|
||||
}
|
||||
|
||||
async fn patch_vm_state(
|
||||
State(state): State<Arc<ApiState>>,
|
||||
Json(request): Json<VmStateRequest>,
|
||||
) -> Result<impl IntoResponse, ApiErrorResponse> {
|
||||
let current_state = *state.vm_state.read();
|
||||
|
||||
// Validate state transition
|
||||
let new_state = match (&request.action, current_state) {
|
||||
(VmStateAction::Start, VmState::Configured) => VmState::Running,
|
||||
(VmStateAction::Start, VmState::Paused) => VmState::Running,
|
||||
(VmStateAction::Pause, VmState::Running) => VmState::Paused,
|
||||
(VmStateAction::Resume, VmState::Paused) => VmState::Running,
|
||||
(VmStateAction::Shutdown, VmState::Running) => VmState::ShuttingDown,
|
||||
(VmStateAction::Stop, _) => VmState::Stopped,
|
||||
_ => {
|
||||
return Err(ApiErrorResponse::from(ApiError::InvalidStateTransition {
|
||||
current_state,
|
||||
action: format!("{:?}", request.action),
|
||||
}));
|
||||
}
|
||||
};
|
||||
|
||||
debug!("State transition: {:?} -> {:?}", current_state, new_state);
|
||||
|
||||
// Perform the action via handler
|
||||
match request.action {
|
||||
VmStateAction::Start => state.handler.start_vm()?,
|
||||
VmStateAction::Pause => state.handler.pause_vm()?,
|
||||
VmStateAction::Resume => state.handler.resume_vm()?,
|
||||
VmStateAction::Shutdown => state.handler.shutdown_vm()?,
|
||||
VmStateAction::Stop => state.handler.stop_vm()?,
|
||||
}
|
||||
|
||||
*state.vm_state.write() = new_state;
|
||||
|
||||
Ok((StatusCode::OK, Json(ApiResponse::ok(new_state))))
|
||||
}
|
||||
|
||||
async fn get_full_config(
|
||||
State(state): State<Arc<ApiState>>,
|
||||
) -> Json<ApiResponse<VmConfig>> {
|
||||
let config = state.vm_config.read();
|
||||
match config.as_ref() {
|
||||
Some(cfg) => Json(ApiResponse::ok(cfg.clone())),
|
||||
None => Json(ApiResponse::ok(VmConfig::default())),
|
||||
}
|
||||
}
|
||||
|
||||
async fn put_drive(
|
||||
axum::extract::Path(drive_id): axum::extract::Path<String>,
|
||||
State(_state): State<Arc<ApiState>>,
|
||||
Json(drive_config): Json<serde_json::Value>,
|
||||
) -> Result<impl IntoResponse, ApiErrorResponse> {
|
||||
debug!("PUT /drives/{}: {:?}", drive_id, drive_config);
|
||||
|
||||
// TODO: Implement drive configuration
|
||||
// For now, just acknowledge the request
|
||||
|
||||
Ok((StatusCode::NO_CONTENT, ""))
|
||||
}
|
||||
|
||||
async fn put_network_interface(
|
||||
axum::extract::Path(iface_id): axum::extract::Path<String>,
|
||||
State(_state): State<Arc<ApiState>>,
|
||||
Json(iface_config): Json<serde_json::Value>,
|
||||
) -> Result<impl IntoResponse, ApiErrorResponse> {
|
||||
debug!("PUT /network-interfaces/{}: {:?}", iface_id, iface_config);
|
||||
|
||||
// TODO: Implement network interface configuration
|
||||
// For now, just acknowledge the request
|
||||
|
||||
Ok((StatusCode::NO_CONTENT, ""))
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Snapshot Handlers
|
||||
// ============================================================================
|
||||
|
||||
async fn put_snapshot_create(
|
||||
State(_state): State<Arc<ApiState>>,
|
||||
Json(request): Json<SnapshotRequest>,
|
||||
) -> Result<impl IntoResponse, ApiErrorResponse> {
|
||||
info!("API: Snapshot create requested at {}", request.snapshot_path);
|
||||
|
||||
// TODO: Wire to actual VMM instance to create snapshot
|
||||
// For now, return success with the path
|
||||
Ok((
|
||||
StatusCode::OK,
|
||||
Json(json!({
|
||||
"success": true,
|
||||
"snapshot_path": request.snapshot_path
|
||||
})),
|
||||
))
|
||||
}
|
||||
|
||||
async fn put_snapshot_load(
|
||||
State(_state): State<Arc<ApiState>>,
|
||||
Json(request): Json<SnapshotRequest>,
|
||||
) -> Result<impl IntoResponse, ApiErrorResponse> {
|
||||
info!("API: Snapshot load requested from {}", request.snapshot_path);
|
||||
|
||||
// TODO: Wire to actual VMM instance to restore snapshot
|
||||
// For now, return success with the path
|
||||
Ok((
|
||||
StatusCode::OK,
|
||||
Json(json!({
|
||||
"success": true,
|
||||
"snapshot_path": request.snapshot_path
|
||||
})),
|
||||
))
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Error Response
|
||||
// ============================================================================
|
||||
|
||||
struct ApiErrorResponse {
|
||||
status: StatusCode,
|
||||
message: String,
|
||||
}
|
||||
|
||||
impl From<ApiError> for ApiErrorResponse {
|
||||
fn from(err: ApiError) -> Self {
|
||||
Self {
|
||||
status: StatusCode::from_u16(err.status_code()).unwrap_or(StatusCode::INTERNAL_SERVER_ERROR),
|
||||
message: err.to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl IntoResponse for ApiErrorResponse {
|
||||
fn into_response(self) -> axum::response::Response {
|
||||
let body = Json(json!({
|
||||
"success": false,
|
||||
"error": self.message
|
||||
}));
|
||||
(self.status, body).into_response()
|
||||
}
|
||||
}
|
||||
210
vmm/src/api/types.rs
Normal file
210
vmm/src/api/types.rs
Normal file
@@ -0,0 +1,210 @@
|
||||
//! API Types and Data Structures
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fmt;
|
||||
|
||||
/// VM configuration for pre-boot setup
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
|
||||
pub struct VmConfig {
|
||||
/// Number of vCPUs
|
||||
#[serde(default = "default_vcpu_count")]
|
||||
pub vcpu_count: u8,
|
||||
|
||||
/// Memory size in MiB
|
||||
#[serde(default = "default_mem_size_mib")]
|
||||
pub mem_size_mib: u32,
|
||||
|
||||
/// Path to kernel image
|
||||
pub kernel_image_path: Option<String>,
|
||||
|
||||
/// Kernel boot arguments
|
||||
#[serde(default)]
|
||||
pub boot_args: String,
|
||||
|
||||
/// Path to root filesystem
|
||||
pub rootfs_path: Option<String>,
|
||||
|
||||
/// Network configuration
|
||||
pub network: Option<NetworkConfig>,
|
||||
|
||||
/// Enable HugePages for memory
|
||||
#[serde(default)]
|
||||
pub hugepages: bool,
|
||||
}
|
||||
|
||||
fn default_vcpu_count() -> u8 {
|
||||
1
|
||||
}
|
||||
|
||||
fn default_mem_size_mib() -> u32 {
|
||||
128
|
||||
}
|
||||
|
||||
/// Network configuration
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct NetworkConfig {
|
||||
/// TAP device name
|
||||
pub tap_device: String,
|
||||
|
||||
/// Guest MAC address
|
||||
pub guest_mac: Option<String>,
|
||||
|
||||
/// Host IP for the TAP interface
|
||||
pub host_ip: Option<String>,
|
||||
|
||||
/// Guest IP
|
||||
pub guest_ip: Option<String>,
|
||||
}
|
||||
|
||||
/// VM runtime state
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
pub enum VmState {
|
||||
/// VM is not yet configured
|
||||
NotConfigured,
|
||||
/// VM is configured but not started
|
||||
Configured,
|
||||
/// VM is starting up
|
||||
Starting,
|
||||
/// VM is running
|
||||
Running,
|
||||
/// VM is paused
|
||||
Paused,
|
||||
/// VM is shutting down
|
||||
ShuttingDown,
|
||||
/// VM has stopped
|
||||
Stopped,
|
||||
/// VM encountered an error
|
||||
Error,
|
||||
}
|
||||
|
||||
impl Default for VmState {
|
||||
fn default() -> Self {
|
||||
VmState::NotConfigured
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for VmState {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
VmState::NotConfigured => write!(f, "not_configured"),
|
||||
VmState::Configured => write!(f, "configured"),
|
||||
VmState::Starting => write!(f, "starting"),
|
||||
VmState::Running => write!(f, "running"),
|
||||
VmState::Paused => write!(f, "paused"),
|
||||
VmState::ShuttingDown => write!(f, "shutting_down"),
|
||||
VmState::Stopped => write!(f, "stopped"),
|
||||
VmState::Error => write!(f, "error"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Action to change VM state
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
pub enum VmStateAction {
|
||||
/// Start the VM
|
||||
Start,
|
||||
/// Pause the VM (freeze vCPUs)
|
||||
Pause,
|
||||
/// Resume a paused VM
|
||||
Resume,
|
||||
/// Graceful shutdown
|
||||
Shutdown,
|
||||
/// Force stop
|
||||
Stop,
|
||||
}
|
||||
|
||||
/// Request body for state changes
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct VmStateRequest {
|
||||
pub action: VmStateAction,
|
||||
}
|
||||
|
||||
/// VM state response
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[allow(dead_code)]
|
||||
pub struct VmStateResponse {
|
||||
pub state: VmState,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub message: Option<String>,
|
||||
}
|
||||
|
||||
/// Snapshot request body
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SnapshotRequest {
|
||||
/// Path to the snapshot directory
|
||||
pub snapshot_path: String,
|
||||
}
|
||||
|
||||
/// Generic API response wrapper
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ApiResponse<T> {
|
||||
pub success: bool,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub data: Option<T>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub error: Option<String>,
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
impl<T> ApiResponse<T> {
|
||||
pub fn ok(data: T) -> Self {
|
||||
ApiResponse {
|
||||
success: true,
|
||||
data: Some(data),
|
||||
error: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn error(msg: impl Into<String>) -> Self {
|
||||
ApiResponse {
|
||||
success: false,
|
||||
data: None,
|
||||
error: Some(msg.into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// API error types
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
#[allow(dead_code)]
|
||||
pub enum ApiError {
|
||||
#[error("Invalid request: {0}")]
|
||||
BadRequest(String),
|
||||
|
||||
#[error("Not found: {0}")]
|
||||
NotFound(String),
|
||||
|
||||
#[error("Method not allowed")]
|
||||
MethodNotAllowed,
|
||||
|
||||
#[error("Invalid state transition: cannot {action} from {current_state}")]
|
||||
InvalidStateTransition {
|
||||
current_state: VmState,
|
||||
action: String,
|
||||
},
|
||||
|
||||
#[error("VM not configured")]
|
||||
NotConfigured,
|
||||
|
||||
#[error("Internal error: {0}")]
|
||||
Internal(String),
|
||||
|
||||
#[error("JSON error: {0}")]
|
||||
Json(#[from] serde_json::Error),
|
||||
}
|
||||
|
||||
impl ApiError {
|
||||
pub fn status_code(&self) -> u16 {
|
||||
match self {
|
||||
ApiError::BadRequest(_) => 400,
|
||||
ApiError::NotFound(_) => 404,
|
||||
ApiError::MethodNotAllowed => 405,
|
||||
ApiError::InvalidStateTransition { .. } => 409,
|
||||
ApiError::NotConfigured => 409,
|
||||
ApiError::Internal(_) => 500,
|
||||
ApiError::Json(_) => 400,
|
||||
}
|
||||
}
|
||||
}
|
||||
115
vmm/src/boot/gdt.rs
Normal file
115
vmm/src/boot/gdt.rs
Normal file
@@ -0,0 +1,115 @@
|
||||
//! GDT (Global Descriptor Table) Setup for 64-bit Boot
|
||||
//!
|
||||
//! Sets up a minimal GDT for 64-bit kernel boot. The kernel will set up
|
||||
//! its own GDT later, so this is just for the initial transition.
|
||||
|
||||
use super::{GuestMemory, Result};
|
||||
#[cfg(test)]
|
||||
use super::BootError;
|
||||
|
||||
/// GDT address in guest memory
|
||||
pub const GDT_ADDR: u64 = 0x500;
|
||||
|
||||
/// GDT size (3 entries × 8 bytes = 24 bytes, but we add a few more for safety)
|
||||
pub const GDT_SIZE: usize = 0x30;
|
||||
|
||||
/// GDT entry indices (matches Firecracker layout)
|
||||
#[allow(dead_code)] // GDT selector constants — part of x86 boot protocol
|
||||
pub mod selectors {
|
||||
/// Null segment (required)
|
||||
pub const NULL: u16 = 0x00;
|
||||
/// 64-bit code segment (at index 1, selector 0x08)
|
||||
pub const CODE64: u16 = 0x08;
|
||||
/// 64-bit data segment (at index 2, selector 0x10)
|
||||
pub const DATA64: u16 = 0x10;
|
||||
}
|
||||
|
||||
/// GDT setup implementation
|
||||
pub struct GdtSetup;
|
||||
|
||||
impl GdtSetup {
|
||||
/// Set up GDT in guest memory
|
||||
///
|
||||
/// Creates a minimal GDT matching Firecracker's layout:
|
||||
/// - Entry 0 (0x00): Null descriptor (required)
|
||||
/// - Entry 1 (0x08): 64-bit code segment
|
||||
/// - Entry 2 (0x10): 64-bit data segment
|
||||
pub fn setup<M: GuestMemory>(guest_mem: &mut M) -> Result<()> {
|
||||
// Zero out the GDT area first
|
||||
let zeros = vec![0u8; GDT_SIZE];
|
||||
guest_mem.write_bytes(GDT_ADDR, &zeros)?;
|
||||
|
||||
// Entry 0: Null descriptor (required, all zeros)
|
||||
// Already zeroed
|
||||
|
||||
// Entry 1 (0x08): 64-bit code segment
|
||||
// Base: 0, Limit: 0xFFFFF (ignored in 64-bit mode)
|
||||
// Flags: Present, Ring 0, Code, Execute/Read, Long mode
|
||||
let code64: u64 = 0x00AF_9B00_0000_FFFF;
|
||||
guest_mem.write_bytes(GDT_ADDR + 0x08, &code64.to_le_bytes())?;
|
||||
|
||||
// Entry 2 (0x10): 64-bit data segment
|
||||
// Base: 0, Limit: 0xFFFFF
|
||||
// Flags: Present, Ring 0, Data, Read/Write
|
||||
let data64: u64 = 0x00CF_9300_0000_FFFF;
|
||||
guest_mem.write_bytes(GDT_ADDR + 0x10, &data64.to_le_bytes())?;
|
||||
|
||||
tracing::debug!("GDT set up at 0x{:x}", GDT_ADDR);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
struct MockMemory {
|
||||
data: Vec<u8>,
|
||||
}
|
||||
|
||||
impl MockMemory {
|
||||
fn new(size: usize) -> Self {
|
||||
Self {
|
||||
data: vec![0; size],
|
||||
}
|
||||
}
|
||||
|
||||
fn read_u64(&self, addr: u64) -> u64 {
|
||||
let bytes = &self.data[addr as usize..addr as usize + 8];
|
||||
u64::from_le_bytes(bytes.try_into().unwrap())
|
||||
}
|
||||
}
|
||||
|
||||
impl GuestMemory for MockMemory {
|
||||
fn write_bytes(&mut self, addr: u64, data: &[u8]) -> Result<()> {
|
||||
let end = addr as usize + data.len();
|
||||
if end > self.data.len() {
|
||||
return Err(BootError::GuestMemoryWrite("overflow".into()));
|
||||
}
|
||||
self.data[addr as usize..end].copy_from_slice(data);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn size(&self) -> u64 {
|
||||
self.data.len() as u64
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_gdt_setup() {
|
||||
let mut mem = MockMemory::new(0x1000);
|
||||
GdtSetup::setup(&mut mem).unwrap();
|
||||
|
||||
// Check null descriptor
|
||||
assert_eq!(mem.read_u64(GDT_ADDR), 0);
|
||||
|
||||
// Check code segment (entry 1, offset 0x08)
|
||||
let code = mem.read_u64(GDT_ADDR + 0x08);
|
||||
assert_eq!(code, 0x00AF_9B00_0000_FFFF);
|
||||
|
||||
// Check data segment (entry 2, offset 0x10)
|
||||
let data = mem.read_u64(GDT_ADDR + 0x10);
|
||||
assert_eq!(data, 0x00CF_9300_0000_FFFF);
|
||||
}
|
||||
}
|
||||
398
vmm/src/boot/initrd.rs
Normal file
398
vmm/src/boot/initrd.rs
Normal file
@@ -0,0 +1,398 @@
|
||||
//! Initrd/Initramfs Loader
|
||||
//!
|
||||
//! Handles loading of initial ramdisk images into guest memory.
|
||||
//! The initrd is placed in high memory to avoid conflicts with the kernel.
|
||||
//!
|
||||
//! # Memory Placement Strategy
|
||||
//!
|
||||
//! The initrd is placed as high as possible in guest memory while:
|
||||
//! 1. Staying below the 4GB boundary (for 32-bit kernel compatibility)
|
||||
//! 2. Being page-aligned
|
||||
//! 3. Not overlapping with the kernel
|
||||
//!
|
||||
//! This matches the behavior of QEMU and other hypervisors.
|
||||
|
||||
use super::{BootError, GuestMemory, Result};
|
||||
use std::fs::File;
|
||||
use std::io::Read;
|
||||
use std::path::Path;
|
||||
|
||||
/// Page size for alignment
|
||||
const PAGE_SIZE: u64 = 4096;
|
||||
|
||||
/// Maximum address for initrd (4GB - 1, for 32-bit compatibility)
|
||||
const MAX_INITRD_ADDR: u64 = 0xFFFF_FFFF;
|
||||
|
||||
/// Minimum gap between kernel and initrd
|
||||
const MIN_KERNEL_INITRD_GAP: u64 = PAGE_SIZE;
|
||||
|
||||
/// Initrd loader configuration
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct InitrdConfig {
|
||||
/// Path to initrd/initramfs image
|
||||
pub path: String,
|
||||
|
||||
/// Total guest memory size
|
||||
pub memory_size: u64,
|
||||
|
||||
/// End address of kernel (for placement calculation)
|
||||
pub kernel_end: u64,
|
||||
}
|
||||
|
||||
/// Result of initrd loading
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct InitrdLoadResult {
|
||||
/// Address where initrd was loaded
|
||||
pub load_addr: u64,
|
||||
|
||||
/// Size of loaded initrd
|
||||
pub size: u64,
|
||||
}
|
||||
|
||||
/// Initrd loader implementation
|
||||
pub struct InitrdLoader;
|
||||
|
||||
impl InitrdLoader {
|
||||
/// Load initrd into guest memory
|
||||
///
|
||||
/// Places the initrd as high as possible in guest memory while respecting
|
||||
/// alignment and boundary constraints.
|
||||
pub fn load<M: GuestMemory>(
|
||||
config: &InitrdConfig,
|
||||
guest_mem: &mut M,
|
||||
) -> Result<InitrdLoadResult> {
|
||||
let initrd_data = Self::read_initrd_file(&config.path)?;
|
||||
let initrd_size = initrd_data.len() as u64;
|
||||
|
||||
if initrd_size == 0 {
|
||||
return Err(BootError::InitrdRead(std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidData,
|
||||
"Initrd file is empty",
|
||||
)));
|
||||
}
|
||||
|
||||
// Calculate optimal placement address
|
||||
let load_addr = Self::calculate_load_address(
|
||||
initrd_size,
|
||||
config.memory_size,
|
||||
config.kernel_end,
|
||||
guest_mem.size(),
|
||||
)?;
|
||||
|
||||
// Write initrd to guest memory
|
||||
guest_mem.write_bytes(load_addr, &initrd_data)?;
|
||||
|
||||
Ok(InitrdLoadResult {
|
||||
load_addr,
|
||||
size: initrd_size,
|
||||
})
|
||||
}
|
||||
|
||||
/// Read initrd file into memory
|
||||
fn read_initrd_file(path: &str) -> Result<Vec<u8>> {
|
||||
let path = Path::new(path);
|
||||
|
||||
if !path.exists() {
|
||||
return Err(BootError::InitrdRead(std::io::Error::new(
|
||||
std::io::ErrorKind::NotFound,
|
||||
format!("Initrd not found: {}", path.display()),
|
||||
)));
|
||||
}
|
||||
|
||||
let mut file = File::open(path).map_err(BootError::InitrdRead)?;
|
||||
|
||||
let mut data = Vec::new();
|
||||
file.read_to_end(&mut data).map_err(BootError::InitrdRead)?;
|
||||
|
||||
Ok(data)
|
||||
}
|
||||
|
||||
/// Calculate the optimal load address for initrd
|
||||
///
|
||||
/// Strategy:
|
||||
/// 1. Try to place at high memory (below 4GB for compatibility)
|
||||
/// 2. Page-align the address
|
||||
/// 3. Ensure no overlap with kernel
|
||||
fn calculate_load_address(
|
||||
initrd_size: u64,
|
||||
memory_size: u64,
|
||||
kernel_end: u64,
|
||||
guest_mem_size: u64,
|
||||
) -> Result<u64> {
|
||||
// Determine the highest usable address
|
||||
let max_addr = guest_mem_size.min(memory_size).min(MAX_INITRD_ADDR);
|
||||
|
||||
// Calculate page-aligned initrd size
|
||||
let aligned_size = Self::align_up(initrd_size, PAGE_SIZE);
|
||||
|
||||
// Try to place at high memory (just below max_addr)
|
||||
if max_addr < aligned_size {
|
||||
return Err(BootError::InitrdTooLarge {
|
||||
size: initrd_size,
|
||||
available: max_addr,
|
||||
});
|
||||
}
|
||||
|
||||
// Calculate load address (page-aligned, as high as possible)
|
||||
let ideal_addr = Self::align_down(max_addr - aligned_size, PAGE_SIZE);
|
||||
|
||||
// Check for kernel overlap
|
||||
let min_addr = kernel_end + MIN_KERNEL_INITRD_GAP;
|
||||
let min_addr_aligned = Self::align_up(min_addr, PAGE_SIZE);
|
||||
|
||||
if ideal_addr < min_addr_aligned {
|
||||
// Not enough space between kernel and max memory
|
||||
return Err(BootError::InitrdTooLarge {
|
||||
size: initrd_size,
|
||||
available: max_addr - min_addr_aligned,
|
||||
});
|
||||
}
|
||||
|
||||
Ok(ideal_addr)
|
||||
}
|
||||
|
||||
/// Align value up to the given alignment
|
||||
#[inline]
|
||||
fn align_up(value: u64, alignment: u64) -> u64 {
|
||||
(value + alignment - 1) & !(alignment - 1)
|
||||
}
|
||||
|
||||
/// Align value down to the given alignment
|
||||
#[inline]
|
||||
fn align_down(value: u64, alignment: u64) -> u64 {
|
||||
value & !(alignment - 1)
|
||||
}
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
// Initrd format detection — planned feature, not yet wired up
|
||||
// --------------------------------------------------------------------------
|
||||
|
||||
/// Helper trait for initrd format detection
|
||||
#[allow(dead_code)]
|
||||
pub trait InitrdFormat {
|
||||
/// Check if data is a valid initrd format
|
||||
fn is_valid(data: &[u8]) -> bool;
|
||||
|
||||
/// Get format name
|
||||
fn name() -> &'static str;
|
||||
}
|
||||
|
||||
/// CPIO archive format (traditional initrd)
|
||||
#[allow(dead_code)]
|
||||
pub struct CpioFormat;
|
||||
|
||||
impl InitrdFormat for CpioFormat {
|
||||
fn is_valid(data: &[u8]) -> bool {
|
||||
if data.len() < 6 {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check for CPIO magic numbers
|
||||
// "070701" or "070702" (newc format)
|
||||
// "070707" (odc format)
|
||||
// 0x71c7 or 0xc771 (binary format)
|
||||
if &data[0..6] == b"070701" || &data[0..6] == b"070702" || &data[0..6] == b"070707" {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Binary CPIO
|
||||
if data.len() >= 2 {
|
||||
let magic = u16::from_le_bytes([data[0], data[1]]);
|
||||
if magic == 0x71c7 || magic == 0xc771 {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
|
||||
fn name() -> &'static str {
|
||||
"CPIO"
|
||||
}
|
||||
}
|
||||
|
||||
/// Gzip compressed format
|
||||
#[allow(dead_code)]
|
||||
pub struct GzipFormat;
|
||||
|
||||
impl InitrdFormat for GzipFormat {
|
||||
fn is_valid(data: &[u8]) -> bool {
|
||||
// Gzip magic: 0x1f 0x8b
|
||||
data.len() >= 2 && data[0] == 0x1f && data[1] == 0x8b
|
||||
}
|
||||
|
||||
fn name() -> &'static str {
|
||||
"Gzip"
|
||||
}
|
||||
}
|
||||
|
||||
/// XZ compressed format
|
||||
#[allow(dead_code)]
|
||||
pub struct XzFormat;
|
||||
|
||||
impl InitrdFormat for XzFormat {
|
||||
fn is_valid(data: &[u8]) -> bool {
|
||||
// XZ magic: 0xfd "7zXZ" 0x00
|
||||
data.len() >= 6
|
||||
&& data[0] == 0xfd
|
||||
&& &data[1..5] == b"7zXZ"
|
||||
&& data[5] == 0x00
|
||||
}
|
||||
|
||||
fn name() -> &'static str {
|
||||
"XZ"
|
||||
}
|
||||
}
|
||||
|
||||
/// Zstd compressed format
|
||||
#[allow(dead_code)]
|
||||
pub struct ZstdFormat;
|
||||
|
||||
impl InitrdFormat for ZstdFormat {
|
||||
fn is_valid(data: &[u8]) -> bool {
|
||||
// Zstd magic: 0x28 0xb5 0x2f 0xfd
|
||||
data.len() >= 4
|
||||
&& data[0] == 0x28
|
||||
&& data[1] == 0xb5
|
||||
&& data[2] == 0x2f
|
||||
&& data[3] == 0xfd
|
||||
}
|
||||
|
||||
fn name() -> &'static str {
|
||||
"Zstd"
|
||||
}
|
||||
}
|
||||
|
||||
/// LZ4 compressed format
|
||||
#[allow(dead_code)]
|
||||
pub struct Lz4Format;
|
||||
|
||||
impl InitrdFormat for Lz4Format {
|
||||
fn is_valid(data: &[u8]) -> bool {
|
||||
// LZ4 frame magic: 0x04 0x22 0x4d 0x18
|
||||
data.len() >= 4
|
||||
&& data[0] == 0x04
|
||||
&& data[1] == 0x22
|
||||
&& data[2] == 0x4d
|
||||
&& data[3] == 0x18
|
||||
}
|
||||
|
||||
fn name() -> &'static str {
|
||||
"LZ4"
|
||||
}
|
||||
}
|
||||
|
||||
/// Detect initrd format from data
|
||||
#[allow(dead_code)]
|
||||
pub fn detect_initrd_format(data: &[u8]) -> Option<&'static str> {
|
||||
if GzipFormat::is_valid(data) {
|
||||
return Some(GzipFormat::name());
|
||||
}
|
||||
if XzFormat::is_valid(data) {
|
||||
return Some(XzFormat::name());
|
||||
}
|
||||
if ZstdFormat::is_valid(data) {
|
||||
return Some(ZstdFormat::name());
|
||||
}
|
||||
if Lz4Format::is_valid(data) {
|
||||
return Some(Lz4Format::name());
|
||||
}
|
||||
if CpioFormat::is_valid(data) {
|
||||
return Some(CpioFormat::name());
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_align_up() {
|
||||
assert_eq!(InitrdLoader::align_up(0, 4096), 0);
|
||||
assert_eq!(InitrdLoader::align_up(1, 4096), 4096);
|
||||
assert_eq!(InitrdLoader::align_up(4095, 4096), 4096);
|
||||
assert_eq!(InitrdLoader::align_up(4096, 4096), 4096);
|
||||
assert_eq!(InitrdLoader::align_up(4097, 4096), 8192);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_align_down() {
|
||||
assert_eq!(InitrdLoader::align_down(0, 4096), 0);
|
||||
assert_eq!(InitrdLoader::align_down(4095, 4096), 0);
|
||||
assert_eq!(InitrdLoader::align_down(4096, 4096), 4096);
|
||||
assert_eq!(InitrdLoader::align_down(4097, 4096), 4096);
|
||||
assert_eq!(InitrdLoader::align_down(8191, 4096), 4096);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_calculate_load_address() {
|
||||
// 128MB memory, 4MB kernel ending at 5MB
|
||||
let memory_size = 128 * 1024 * 1024;
|
||||
let kernel_end = 5 * 1024 * 1024;
|
||||
let initrd_size = 10 * 1024 * 1024; // 10MB initrd
|
||||
|
||||
let result = InitrdLoader::calculate_load_address(
|
||||
initrd_size,
|
||||
memory_size,
|
||||
kernel_end,
|
||||
memory_size,
|
||||
);
|
||||
|
||||
assert!(result.is_ok());
|
||||
let addr = result.unwrap();
|
||||
|
||||
// Should be page-aligned
|
||||
assert_eq!(addr % PAGE_SIZE, 0);
|
||||
|
||||
// Should be above kernel
|
||||
assert!(addr > kernel_end);
|
||||
|
||||
// Should fit within memory
|
||||
assert!(addr + initrd_size <= memory_size as u64);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_initrd_too_large() {
|
||||
let memory_size = 16 * 1024 * 1024; // 16MB
|
||||
let kernel_end = 8 * 1024 * 1024; // Kernel ends at 8MB
|
||||
let initrd_size = 32 * 1024 * 1024; // 32MB initrd (too large!)
|
||||
|
||||
let result = InitrdLoader::calculate_load_address(
|
||||
initrd_size,
|
||||
memory_size,
|
||||
kernel_end,
|
||||
memory_size,
|
||||
);
|
||||
|
||||
assert!(matches!(result, Err(BootError::InitrdTooLarge { .. })));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_detect_gzip() {
|
||||
let data = [0x1f, 0x8b, 0x08, 0x00];
|
||||
assert!(GzipFormat::is_valid(&data));
|
||||
assert_eq!(detect_initrd_format(&data), Some("Gzip"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_detect_xz() {
|
||||
let data = [0xfd, b'7', b'z', b'X', b'Z', 0x00];
|
||||
assert!(XzFormat::is_valid(&data));
|
||||
assert_eq!(detect_initrd_format(&data), Some("XZ"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_detect_zstd() {
|
||||
let data = [0x28, 0xb5, 0x2f, 0xfd];
|
||||
assert!(ZstdFormat::is_valid(&data));
|
||||
assert_eq!(detect_initrd_format(&data), Some("Zstd"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_detect_cpio_newc() {
|
||||
let data = b"070701001234";
|
||||
assert!(CpioFormat::is_valid(data));
|
||||
}
|
||||
}
|
||||
465
vmm/src/boot/linux.rs
Normal file
465
vmm/src/boot/linux.rs
Normal file
@@ -0,0 +1,465 @@
|
||||
//! Linux Boot Protocol Implementation
|
||||
//!
|
||||
//! Implements the Linux x86 boot protocol for 64-bit kernels.
|
||||
//! This sets up the boot_params structure (zero page) that Linux expects
|
||||
//! when booting in 64-bit mode.
|
||||
//!
|
||||
//! # References
|
||||
//! - Linux kernel: arch/x86/include/uapi/asm/bootparam.h
|
||||
//! - Linux kernel: Documentation/x86/boot.rst
|
||||
|
||||
use super::{layout, BootError, GuestMemory, Result};
|
||||
|
||||
/// Boot params address (zero page)
|
||||
/// Must not overlap with page tables (0x1000-0x10FFF zeroed area) or GDT (0x500-0x52F)
|
||||
pub const BOOT_PARAMS_ADDR: u64 = 0x20000;
|
||||
|
||||
/// Size of boot_params structure (4KB)
|
||||
pub const BOOT_PARAMS_SIZE: usize = 4096;
|
||||
|
||||
/// E820 entry within boot_params
|
||||
#[repr(C, packed)]
|
||||
#[derive(Debug, Clone, Copy, Default)]
|
||||
pub struct E820Entry {
|
||||
pub addr: u64,
|
||||
pub size: u64,
|
||||
pub entry_type: u32,
|
||||
}
|
||||
|
||||
/// E820 memory types
|
||||
#[repr(u32)]
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
#[allow(dead_code)] // E820 spec types — kept for completeness
|
||||
pub enum E820Type {
|
||||
Ram = 1,
|
||||
Reserved = 2,
|
||||
Acpi = 3,
|
||||
Nvs = 4,
|
||||
Unusable = 5,
|
||||
}
|
||||
|
||||
impl E820Entry {
|
||||
pub fn ram(addr: u64, size: u64) -> Self {
|
||||
Self {
|
||||
addr,
|
||||
size,
|
||||
entry_type: E820Type::Ram as u32,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn reserved(addr: u64, size: u64) -> Self {
|
||||
Self {
|
||||
addr,
|
||||
size,
|
||||
entry_type: E820Type::Reserved as u32,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// setup_header structure (at offset 0x1F1 in boot sector, or 0x1F1 in boot_params)
|
||||
/// We only define the fields we actually use
|
||||
#[repr(C, packed)]
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct SetupHeader {
|
||||
pub setup_sects: u8, // 0x1F1
|
||||
pub root_flags: u16, // 0x1F2
|
||||
pub syssize: u32, // 0x1F4
|
||||
pub ram_size: u16, // 0x1F8 (obsolete)
|
||||
pub vid_mode: u16, // 0x1FA
|
||||
pub root_dev: u16, // 0x1FC
|
||||
pub boot_flag: u16, // 0x1FE - should be 0xAA55
|
||||
pub jump: u16, // 0x200
|
||||
pub header: u32, // 0x202 - "HdrS" magic
|
||||
pub version: u16, // 0x206
|
||||
pub realmode_swtch: u32, // 0x208
|
||||
pub start_sys_seg: u16, // 0x20C (obsolete)
|
||||
pub kernel_version: u16, // 0x20E
|
||||
pub type_of_loader: u8, // 0x210
|
||||
pub loadflags: u8, // 0x211
|
||||
pub setup_move_size: u16, // 0x212
|
||||
pub code32_start: u32, // 0x214
|
||||
pub ramdisk_image: u32, // 0x218
|
||||
pub ramdisk_size: u32, // 0x21C
|
||||
pub bootsect_kludge: u32, // 0x220
|
||||
pub heap_end_ptr: u16, // 0x224
|
||||
pub ext_loader_ver: u8, // 0x226
|
||||
pub ext_loader_type: u8, // 0x227
|
||||
pub cmd_line_ptr: u32, // 0x228
|
||||
pub initrd_addr_max: u32, // 0x22C
|
||||
pub kernel_alignment: u32, // 0x230
|
||||
pub relocatable_kernel: u8, // 0x234
|
||||
pub min_alignment: u8, // 0x235
|
||||
pub xloadflags: u16, // 0x236
|
||||
pub cmdline_size: u32, // 0x238
|
||||
pub hardware_subarch: u32, // 0x23C
|
||||
pub hardware_subarch_data: u64, // 0x240
|
||||
pub payload_offset: u32, // 0x248
|
||||
pub payload_length: u32, // 0x24C
|
||||
pub setup_data: u64, // 0x250
|
||||
pub pref_address: u64, // 0x258
|
||||
pub init_size: u32, // 0x260
|
||||
pub handover_offset: u32, // 0x264
|
||||
pub kernel_info_offset: u32, // 0x268
|
||||
}
|
||||
|
||||
impl Default for SetupHeader {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
setup_sects: 0,
|
||||
root_flags: 0,
|
||||
syssize: 0,
|
||||
ram_size: 0,
|
||||
vid_mode: 0xFFFF, // VGA normal
|
||||
root_dev: 0,
|
||||
boot_flag: 0xAA55,
|
||||
jump: 0,
|
||||
header: 0x53726448, // "HdrS"
|
||||
version: 0x020F, // Protocol version 2.15
|
||||
realmode_swtch: 0,
|
||||
start_sys_seg: 0,
|
||||
kernel_version: 0,
|
||||
type_of_loader: 0xFF, // Undefined loader
|
||||
loadflags: LOADFLAG_LOADED_HIGH | LOADFLAG_CAN_USE_HEAP,
|
||||
setup_move_size: 0,
|
||||
code32_start: 0x100000, // 1MB
|
||||
ramdisk_image: 0,
|
||||
ramdisk_size: 0,
|
||||
bootsect_kludge: 0,
|
||||
heap_end_ptr: 0,
|
||||
ext_loader_ver: 0,
|
||||
ext_loader_type: 0,
|
||||
cmd_line_ptr: 0,
|
||||
initrd_addr_max: 0x7FFFFFFF,
|
||||
kernel_alignment: 0x200000, // 2MB
|
||||
relocatable_kernel: 1,
|
||||
min_alignment: 21, // 2^21 = 2MB
|
||||
xloadflags: XLF_KERNEL_64 | XLF_CAN_BE_LOADED_ABOVE_4G,
|
||||
cmdline_size: 4096,
|
||||
hardware_subarch: 0, // PC
|
||||
hardware_subarch_data: 0,
|
||||
payload_offset: 0,
|
||||
payload_length: 0,
|
||||
setup_data: 0,
|
||||
pref_address: 0x1000000, // 16MB
|
||||
init_size: 0,
|
||||
handover_offset: 0,
|
||||
kernel_info_offset: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Linux boot protocol constants — kept for completeness
|
||||
#[allow(dead_code)]
|
||||
pub const LOADFLAG_LOADED_HIGH: u8 = 0x01; // Kernel loaded high (at 0x100000)
|
||||
#[allow(dead_code)]
|
||||
pub const LOADFLAG_KASLR_FLAG: u8 = 0x02; // KASLR enabled
|
||||
#[allow(dead_code)]
|
||||
pub const LOADFLAG_QUIET_FLAG: u8 = 0x20; // Quiet boot
|
||||
#[allow(dead_code)]
|
||||
pub const LOADFLAG_KEEP_SEGMENTS: u8 = 0x40; // Don't reload segments
|
||||
#[allow(dead_code)]
|
||||
pub const LOADFLAG_CAN_USE_HEAP: u8 = 0x80; // Heap available
|
||||
|
||||
/// XLoadflags bits
|
||||
#[allow(dead_code)]
|
||||
pub const XLF_KERNEL_64: u16 = 0x0001; // 64-bit kernel
|
||||
#[allow(dead_code)]
|
||||
pub const XLF_CAN_BE_LOADED_ABOVE_4G: u16 = 0x0002; // Can load above 4GB
|
||||
#[allow(dead_code)]
|
||||
pub const XLF_EFI_HANDOVER_32: u16 = 0x0004; // EFI handover 32-bit
|
||||
#[allow(dead_code)]
|
||||
pub const XLF_EFI_HANDOVER_64: u16 = 0x0008; // EFI handover 64-bit
|
||||
#[allow(dead_code)]
|
||||
pub const XLF_EFI_KEXEC: u16 = 0x0010; // EFI kexec
|
||||
|
||||
/// Maximum E820 entries in boot_params
|
||||
#[allow(dead_code)]
|
||||
pub const E820_MAX_ENTRIES: usize = 128;
|
||||
|
||||
/// Offsets within boot_params structure
|
||||
#[allow(dead_code)] // Linux boot protocol offsets — kept for reference
|
||||
pub mod offsets {
|
||||
/// setup_header starts at 0x1F1
|
||||
pub const SETUP_HEADER: usize = 0x1F1;
|
||||
|
||||
/// E820 entry count at 0x1E8
|
||||
pub const E820_ENTRIES: usize = 0x1E8;
|
||||
|
||||
/// E820 table starts at 0x2D0
|
||||
pub const E820_TABLE: usize = 0x2D0;
|
||||
|
||||
/// Size of one E820 entry
|
||||
pub const E820_ENTRY_SIZE: usize = 20;
|
||||
}
|
||||
|
||||
/// Configuration for Linux boot setup
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct LinuxBootConfig {
|
||||
/// Total memory size in bytes
|
||||
pub memory_size: u64,
|
||||
/// Physical address of command line string
|
||||
pub cmdline_addr: u64,
|
||||
/// Physical address of initrd (if any)
|
||||
pub initrd_addr: Option<u64>,
|
||||
/// Size of initrd (if any)
|
||||
pub initrd_size: Option<u64>,
|
||||
}
|
||||
|
||||
/// Linux boot setup implementation
|
||||
pub struct LinuxBootSetup;
|
||||
|
||||
impl LinuxBootSetup {
|
||||
/// Set up Linux boot_params structure in guest memory
|
||||
///
|
||||
/// This creates the "zero page" that Linux expects when booting in 64-bit mode.
|
||||
/// The boot_params address should be passed to the kernel via RSI register.
|
||||
pub fn setup<M: GuestMemory>(config: &LinuxBootConfig, guest_mem: &mut M) -> Result<u64> {
|
||||
// Allocate and zero the boot_params structure (4KB)
|
||||
let boot_params = vec![0u8; BOOT_PARAMS_SIZE];
|
||||
guest_mem.write_bytes(BOOT_PARAMS_ADDR, &boot_params)?;
|
||||
|
||||
// Build E820 memory map
|
||||
let e820_entries = Self::build_e820_map(config.memory_size)?;
|
||||
|
||||
// Write E820 entry count
|
||||
let e820_count = e820_entries.len() as u8;
|
||||
guest_mem.write_bytes(
|
||||
BOOT_PARAMS_ADDR + offsets::E820_ENTRIES as u64,
|
||||
&[e820_count],
|
||||
)?;
|
||||
|
||||
// Write E820 entries
|
||||
for (i, entry) in e820_entries.iter().enumerate() {
|
||||
let offset = BOOT_PARAMS_ADDR + offsets::E820_TABLE as u64
|
||||
+ (i * offsets::E820_ENTRY_SIZE) as u64;
|
||||
let bytes = unsafe {
|
||||
std::slice::from_raw_parts(
|
||||
entry as *const E820Entry as *const u8,
|
||||
offsets::E820_ENTRY_SIZE,
|
||||
)
|
||||
};
|
||||
guest_mem.write_bytes(offset, bytes)?;
|
||||
}
|
||||
|
||||
// Build and write setup_header
|
||||
let mut header = SetupHeader::default();
|
||||
header.cmd_line_ptr = config.cmdline_addr as u32;
|
||||
|
||||
if let (Some(addr), Some(size)) = (config.initrd_addr, config.initrd_size) {
|
||||
header.ramdisk_image = addr as u32;
|
||||
header.ramdisk_size = size as u32;
|
||||
}
|
||||
|
||||
// Write setup_header to boot_params
|
||||
Self::write_setup_header(guest_mem, &header)?;
|
||||
|
||||
tracing::debug!(
|
||||
"Linux boot_params setup at 0x{:x}: {} E820 entries, cmdline=0x{:x}",
|
||||
BOOT_PARAMS_ADDR,
|
||||
e820_count,
|
||||
config.cmdline_addr
|
||||
);
|
||||
|
||||
Ok(BOOT_PARAMS_ADDR)
|
||||
}
|
||||
|
||||
/// Build E820 memory map for the VM
|
||||
/// Layout matches Firecracker's working E820 configuration
|
||||
fn build_e820_map(memory_size: u64) -> Result<Vec<E820Entry>> {
|
||||
let mut entries = Vec::with_capacity(5);
|
||||
|
||||
if memory_size < layout::HIGH_MEMORY_START {
|
||||
return Err(BootError::MemoryLayout(format!(
|
||||
"Memory size {} is less than minimum required {}",
|
||||
memory_size,
|
||||
layout::HIGH_MEMORY_START
|
||||
)));
|
||||
}
|
||||
|
||||
// EBDA (Extended BIOS Data Area) boundary - Firecracker uses 0x9FC00
|
||||
const EBDA_START: u64 = 0x9FC00;
|
||||
|
||||
// Low memory: 0 to EBDA (usable RAM) - matches Firecracker
|
||||
entries.push(E820Entry::ram(0, EBDA_START));
|
||||
|
||||
// EBDA: Reserved area just below 640KB
|
||||
entries.push(E820Entry::reserved(EBDA_START, layout::LOW_MEMORY_END - EBDA_START));
|
||||
|
||||
// Legacy hole: 640KB to 1MB (reserved for VGA/ROMs)
|
||||
let legacy_hole_size = layout::HIGH_MEMORY_START - layout::LOW_MEMORY_END;
|
||||
entries.push(E820Entry::reserved(layout::LOW_MEMORY_END, legacy_hole_size));
|
||||
|
||||
// High memory: 1MB to end of RAM
|
||||
let high_memory_size = memory_size - layout::HIGH_MEMORY_START;
|
||||
if high_memory_size > 0 {
|
||||
entries.push(E820Entry::ram(layout::HIGH_MEMORY_START, high_memory_size));
|
||||
}
|
||||
|
||||
Ok(entries)
|
||||
}
|
||||
|
||||
/// Write setup_header to boot_params
|
||||
fn write_setup_header<M: GuestMemory>(guest_mem: &mut M, header: &SetupHeader) -> Result<()> {
|
||||
// The setup_header structure is written at offset 0x1F1 within boot_params
|
||||
// We need to write individual fields at their correct offsets
|
||||
|
||||
let base = BOOT_PARAMS_ADDR;
|
||||
|
||||
// 0x1F1: setup_sects
|
||||
guest_mem.write_bytes(base + 0x1F1, &[header.setup_sects])?;
|
||||
// 0x1F2: root_flags
|
||||
guest_mem.write_bytes(base + 0x1F2, &header.root_flags.to_le_bytes())?;
|
||||
// 0x1F4: syssize
|
||||
guest_mem.write_bytes(base + 0x1F4, &header.syssize.to_le_bytes())?;
|
||||
// 0x1FE: boot_flag
|
||||
guest_mem.write_bytes(base + 0x1FE, &header.boot_flag.to_le_bytes())?;
|
||||
// 0x202: header magic
|
||||
guest_mem.write_bytes(base + 0x202, &header.header.to_le_bytes())?;
|
||||
// 0x206: version
|
||||
guest_mem.write_bytes(base + 0x206, &header.version.to_le_bytes())?;
|
||||
// 0x210: type_of_loader
|
||||
guest_mem.write_bytes(base + 0x210, &[header.type_of_loader])?;
|
||||
// 0x211: loadflags
|
||||
guest_mem.write_bytes(base + 0x211, &[header.loadflags])?;
|
||||
// 0x214: code32_start
|
||||
guest_mem.write_bytes(base + 0x214, &header.code32_start.to_le_bytes())?;
|
||||
// 0x218: ramdisk_image
|
||||
guest_mem.write_bytes(base + 0x218, &header.ramdisk_image.to_le_bytes())?;
|
||||
// 0x21C: ramdisk_size
|
||||
guest_mem.write_bytes(base + 0x21C, &header.ramdisk_size.to_le_bytes())?;
|
||||
// 0x224: heap_end_ptr
|
||||
guest_mem.write_bytes(base + 0x224, &header.heap_end_ptr.to_le_bytes())?;
|
||||
// 0x228: cmd_line_ptr
|
||||
guest_mem.write_bytes(base + 0x228, &header.cmd_line_ptr.to_le_bytes())?;
|
||||
// 0x22C: initrd_addr_max
|
||||
guest_mem.write_bytes(base + 0x22C, &header.initrd_addr_max.to_le_bytes())?;
|
||||
// 0x230: kernel_alignment
|
||||
guest_mem.write_bytes(base + 0x230, &header.kernel_alignment.to_le_bytes())?;
|
||||
// 0x234: relocatable_kernel
|
||||
guest_mem.write_bytes(base + 0x234, &[header.relocatable_kernel])?;
|
||||
// 0x236: xloadflags
|
||||
guest_mem.write_bytes(base + 0x236, &header.xloadflags.to_le_bytes())?;
|
||||
// 0x238: cmdline_size
|
||||
guest_mem.write_bytes(base + 0x238, &header.cmdline_size.to_le_bytes())?;
|
||||
// 0x23C: hardware_subarch
|
||||
guest_mem.write_bytes(base + 0x23C, &header.hardware_subarch.to_le_bytes())?;
|
||||
// 0x258: pref_address
|
||||
guest_mem.write_bytes(base + 0x258, &header.pref_address.to_le_bytes())?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
struct MockMemory {
|
||||
size: u64,
|
||||
data: Vec<u8>,
|
||||
}
|
||||
|
||||
impl MockMemory {
|
||||
fn new(size: u64) -> Self {
|
||||
Self {
|
||||
size,
|
||||
data: vec![0; size as usize],
|
||||
}
|
||||
}
|
||||
|
||||
fn read_bytes(&self, addr: u64, len: usize) -> &[u8] {
|
||||
&self.data[addr as usize..addr as usize + len]
|
||||
}
|
||||
}
|
||||
|
||||
impl GuestMemory for MockMemory {
|
||||
fn write_bytes(&mut self, addr: u64, data: &[u8]) -> Result<()> {
|
||||
let end = addr as usize + data.len();
|
||||
if end > self.data.len() {
|
||||
return Err(BootError::GuestMemoryWrite(format!(
|
||||
"Write at {:#x} exceeds memory",
|
||||
addr
|
||||
)));
|
||||
}
|
||||
self.data[addr as usize..end].copy_from_slice(data);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn size(&self) -> u64 {
|
||||
self.size
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_e820_entry_size() {
|
||||
assert_eq!(std::mem::size_of::<E820Entry>(), 20);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_linux_boot_setup() {
|
||||
let mut mem = MockMemory::new(128 * 1024 * 1024);
|
||||
let config = LinuxBootConfig {
|
||||
memory_size: 128 * 1024 * 1024,
|
||||
cmdline_addr: layout::CMDLINE_ADDR,
|
||||
initrd_addr: None,
|
||||
initrd_size: None,
|
||||
};
|
||||
|
||||
let result = LinuxBootSetup::setup(&config, &mut mem);
|
||||
assert!(result.is_ok());
|
||||
assert_eq!(result.unwrap(), BOOT_PARAMS_ADDR);
|
||||
|
||||
// Verify boot_flag
|
||||
let boot_flag = u16::from_le_bytes([
|
||||
mem.data[BOOT_PARAMS_ADDR as usize + 0x1FE],
|
||||
mem.data[BOOT_PARAMS_ADDR as usize + 0x1FF],
|
||||
]);
|
||||
assert_eq!(boot_flag, 0xAA55);
|
||||
|
||||
// Verify header magic
|
||||
let magic = u32::from_le_bytes([
|
||||
mem.data[BOOT_PARAMS_ADDR as usize + 0x202],
|
||||
mem.data[BOOT_PARAMS_ADDR as usize + 0x203],
|
||||
mem.data[BOOT_PARAMS_ADDR as usize + 0x204],
|
||||
mem.data[BOOT_PARAMS_ADDR as usize + 0x205],
|
||||
]);
|
||||
assert_eq!(magic, 0x53726448); // "HdrS"
|
||||
|
||||
// Verify E820 entry count > 0
|
||||
let e820_count = mem.data[BOOT_PARAMS_ADDR as usize + offsets::E820_ENTRIES];
|
||||
assert!(e820_count >= 3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_e820_map() {
|
||||
let memory_size = 256 * 1024 * 1024; // 256MB
|
||||
let entries = LinuxBootSetup::build_e820_map(memory_size).unwrap();
|
||||
|
||||
// 4 entries: low RAM (0..EBDA), EBDA reserved, legacy hole (640K-1M), high RAM
|
||||
assert_eq!(entries.len(), 4);
|
||||
|
||||
// Low memory (0 to EBDA) — copy fields from packed struct to avoid unaligned references
|
||||
let e0_addr = entries[0].addr;
|
||||
let e0_type = entries[0].entry_type;
|
||||
assert_eq!(e0_addr, 0);
|
||||
assert_eq!(e0_type, E820Type::Ram as u32);
|
||||
|
||||
// EBDA reserved region
|
||||
let e1_addr = entries[1].addr;
|
||||
let e1_type = entries[1].entry_type;
|
||||
assert_eq!(e1_addr, 0x9FC00); // EBDA_START
|
||||
assert_eq!(e1_type, E820Type::Reserved as u32);
|
||||
|
||||
// Legacy hole (640KB to 1MB)
|
||||
let e2_addr = entries[2].addr;
|
||||
let e2_type = entries[2].entry_type;
|
||||
assert_eq!(e2_addr, layout::LOW_MEMORY_END);
|
||||
assert_eq!(e2_type, E820Type::Reserved as u32);
|
||||
|
||||
// High memory (1MB+)
|
||||
let e3_addr = entries[3].addr;
|
||||
let e3_type = entries[3].entry_type;
|
||||
assert_eq!(e3_addr, layout::HIGH_MEMORY_START);
|
||||
assert_eq!(e3_type, E820Type::Ram as u32);
|
||||
}
|
||||
}
|
||||
576
vmm/src/boot/loader.rs
Normal file
576
vmm/src/boot/loader.rs
Normal file
@@ -0,0 +1,576 @@
|
||||
//! Kernel Loader
|
||||
//!
|
||||
//! Loads Linux kernels in ELF64 or bzImage format directly into guest memory.
|
||||
//! Supports PVH boot protocol for fastest possible boot times.
|
||||
//!
|
||||
//! # Kernel Formats
|
||||
//!
|
||||
//! ## ELF64 (vmlinux)
|
||||
//! - Uncompressed kernel with ELF headers
|
||||
//! - Direct load to specified address
|
||||
//! - Entry point from ELF header
|
||||
//!
|
||||
//! ## bzImage
|
||||
//! - Compressed kernel with setup header
|
||||
//! - Requires parsing setup header for entry point
|
||||
//! - Kernel loaded after setup sectors
|
||||
|
||||
use super::{layout, BootError, GuestMemory, Result};
|
||||
use std::fs::File;
|
||||
use std::io::Read;
|
||||
use std::path::Path;
|
||||
|
||||
/// ELF magic number
|
||||
const ELF_MAGIC: [u8; 4] = [0x7f, b'E', b'L', b'F'];
|
||||
|
||||
/// bzImage magic number at offset 0x202
|
||||
const BZIMAGE_MAGIC: u32 = 0x53726448; // "HdrS"
|
||||
|
||||
/// Minimum boot protocol version for PVH
|
||||
const MIN_BOOT_PROTOCOL_VERSION: u16 = 0x0200;
|
||||
|
||||
/// bzImage header offsets
|
||||
#[allow(dead_code)] // Linux bzImage protocol constants — kept for completeness
|
||||
mod bzimage {
|
||||
/// Magic number offset
|
||||
pub const HEADER_MAGIC_OFFSET: usize = 0x202;
|
||||
/// Boot protocol version offset
|
||||
pub const VERSION_OFFSET: usize = 0x206;
|
||||
/// Kernel version string pointer offset
|
||||
pub const KERNEL_VERSION_OFFSET: usize = 0x20e;
|
||||
/// Setup sectors count offset (at 0x1f1)
|
||||
pub const SETUP_SECTS_OFFSET: usize = 0x1f1;
|
||||
/// Setup header size (minimum)
|
||||
pub const SETUP_HEADER_SIZE: usize = 0x0202;
|
||||
/// Sector size
|
||||
pub const SECTOR_SIZE: usize = 512;
|
||||
/// Default setup sectors if field is 0
|
||||
pub const DEFAULT_SETUP_SECTS: u8 = 4;
|
||||
/// Boot flag offset
|
||||
pub const BOOT_FLAG_OFFSET: usize = 0x1fe;
|
||||
/// Expected boot flag value
|
||||
pub const BOOT_FLAG_VALUE: u16 = 0xaa55;
|
||||
/// Real mode kernel header size
|
||||
pub const REAL_MODE_HEADER_SIZE: usize = 0x8000;
|
||||
/// Loadflags offset
|
||||
pub const LOADFLAGS_OFFSET: usize = 0x211;
|
||||
/// Loadflag: kernel is loaded high (at 0x100000)
|
||||
pub const LOADFLAG_LOADED_HIGH: u8 = 0x01;
|
||||
/// Loadflag: can use heap
|
||||
pub const LOADFLAG_CAN_USE_HEAP: u8 = 0x80;
|
||||
/// Code32 start offset
|
||||
pub const CODE32_START_OFFSET: usize = 0x214;
|
||||
/// Kernel alignment offset
|
||||
pub const KERNEL_ALIGNMENT_OFFSET: usize = 0x230;
|
||||
/// Pref address offset (64-bit)
|
||||
pub const PREF_ADDRESS_OFFSET: usize = 0x258;
|
||||
/// XLoadflags offset
|
||||
pub const XLOADFLAGS_OFFSET: usize = 0x236;
|
||||
/// XLoadflag: kernel has EFI handover
|
||||
pub const XLF_KERNEL_64: u16 = 0x0001;
|
||||
/// XLoadflag: can be loaded above 4GB
|
||||
pub const XLF_CAN_BE_LOADED_ABOVE_4G: u16 = 0x0002;
|
||||
}
|
||||
|
||||
/// Kernel type detection result
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum KernelType {
|
||||
/// ELF64 format (vmlinux)
|
||||
Elf64,
|
||||
/// bzImage format (compressed)
|
||||
BzImage,
|
||||
}
|
||||
|
||||
/// Kernel loader configuration
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct KernelConfig {
|
||||
/// Path to kernel image
|
||||
pub path: String,
|
||||
/// Address to load kernel (typically 1MB)
|
||||
pub load_addr: u64,
|
||||
}
|
||||
|
||||
/// Result of kernel loading
|
||||
#[derive(Debug, Clone)]
|
||||
#[allow(dead_code)]
|
||||
pub struct KernelLoadResult {
|
||||
/// Address where kernel was loaded
|
||||
pub load_addr: u64,
|
||||
/// Total size of loaded kernel
|
||||
pub size: u64,
|
||||
/// Entry point address
|
||||
pub entry_point: u64,
|
||||
/// Detected kernel type
|
||||
pub kernel_type: KernelType,
|
||||
}
|
||||
|
||||
/// Kernel loader implementation
|
||||
pub struct KernelLoader;
|
||||
|
||||
impl KernelLoader {
|
||||
/// Load a kernel image into guest memory
|
||||
///
|
||||
/// Automatically detects kernel format (ELF64 or bzImage) and loads
|
||||
/// appropriately for PVH boot.
|
||||
pub fn load<M: GuestMemory>(config: &KernelConfig, guest_mem: &mut M) -> Result<KernelLoadResult> {
|
||||
let kernel_data = Self::read_kernel_file(&config.path)?;
|
||||
|
||||
// Detect kernel type
|
||||
let kernel_type = Self::detect_kernel_type(&kernel_data)?;
|
||||
|
||||
match kernel_type {
|
||||
KernelType::Elf64 => Self::load_elf64(&kernel_data, config.load_addr, guest_mem),
|
||||
KernelType::BzImage => Self::load_bzimage(&kernel_data, config.load_addr, guest_mem),
|
||||
}
|
||||
}
|
||||
|
||||
/// Read kernel file into memory
|
||||
///
|
||||
/// Pre-allocates the buffer to the file size to avoid reallocation
|
||||
/// during read. For a 21MB kernel this saves ~2ms of Vec growth.
|
||||
fn read_kernel_file(path: &str) -> Result<Vec<u8>> {
|
||||
let path = Path::new(path);
|
||||
let mut file = File::open(path).map_err(BootError::KernelRead)?;
|
||||
|
||||
let file_size = file.metadata()
|
||||
.map_err(BootError::KernelRead)?
|
||||
.len() as usize;
|
||||
|
||||
if file_size == 0 {
|
||||
return Err(BootError::InvalidKernel("Kernel file is empty".into()));
|
||||
}
|
||||
|
||||
let mut data = Vec::with_capacity(file_size);
|
||||
file.read_to_end(&mut data).map_err(BootError::KernelRead)?;
|
||||
|
||||
Ok(data)
|
||||
}
|
||||
|
||||
/// Detect kernel type from magic numbers
|
||||
fn detect_kernel_type(data: &[u8]) -> Result<KernelType> {
|
||||
if data.len() < 4 {
|
||||
return Err(BootError::InvalidKernel("Kernel image too small".into()));
|
||||
}
|
||||
|
||||
// Check for ELF magic
|
||||
if data[0..4] == ELF_MAGIC {
|
||||
// Verify it's ELF64
|
||||
if data.len() < 5 || data[4] != 2 {
|
||||
return Err(BootError::InvalidElf(
|
||||
"Only ELF64 kernels are supported".into(),
|
||||
));
|
||||
}
|
||||
return Ok(KernelType::Elf64);
|
||||
}
|
||||
|
||||
// Check for bzImage magic
|
||||
if data.len() >= bzimage::HEADER_MAGIC_OFFSET + 4 {
|
||||
let magic = u32::from_le_bytes([
|
||||
data[bzimage::HEADER_MAGIC_OFFSET],
|
||||
data[bzimage::HEADER_MAGIC_OFFSET + 1],
|
||||
data[bzimage::HEADER_MAGIC_OFFSET + 2],
|
||||
data[bzimage::HEADER_MAGIC_OFFSET + 3],
|
||||
]);
|
||||
|
||||
if magic == BZIMAGE_MAGIC || (magic & 0xffff) == (BZIMAGE_MAGIC & 0xffff) {
|
||||
return Ok(KernelType::BzImage);
|
||||
}
|
||||
}
|
||||
|
||||
Err(BootError::InvalidKernel(
|
||||
"Unknown kernel format (expected ELF64 or bzImage)".into(),
|
||||
))
|
||||
}
|
||||
|
||||
/// Load ELF64 kernel (vmlinux)
|
||||
///
|
||||
/// # Warning: vmlinux Direct Boot Limitations
|
||||
///
|
||||
/// Loading vmlinux ELF directly has a fundamental limitation: the kernel's
|
||||
/// `__startup_64()` function builds its own page tables that ONLY map the
|
||||
/// kernel text region. After the CR3 switch, low memory (0-16MB) is unmapped,
|
||||
/// causing faults when accessing boot_params or any low memory address.
|
||||
///
|
||||
/// **Recommended**: Use bzImage format instead, which includes a decompressor
|
||||
/// that properly sets up full identity mapping for all memory.
|
||||
///
|
||||
/// See `docs/kernel-pagetable-analysis.md` for detailed analysis.
|
||||
fn load_elf64<M: GuestMemory>(
|
||||
data: &[u8],
|
||||
load_addr: u64,
|
||||
guest_mem: &mut M,
|
||||
) -> Result<KernelLoadResult> {
|
||||
// CRITICAL WARNING: vmlinux direct boot may fail
|
||||
tracing::warn!(
|
||||
"Loading vmlinux ELF directly. This may fail due to kernel page table setup. \
|
||||
The kernel's __startup_64() builds its own page tables that don't map low memory. \
|
||||
Consider using bzImage format for reliable boot."
|
||||
);
|
||||
|
||||
// Parse ELF header
|
||||
let elf = Elf64Header::parse(data)?;
|
||||
|
||||
// Validate it's an executable
|
||||
if elf.e_type != 2 {
|
||||
// ET_EXEC
|
||||
return Err(BootError::InvalidElf("Not an executable ELF".into()));
|
||||
}
|
||||
|
||||
// Validate machine type (x86_64 = 62)
|
||||
if elf.e_machine != 62 {
|
||||
return Err(BootError::InvalidElf(format!(
|
||||
"Unsupported machine type: {} (expected x86_64)",
|
||||
elf.e_machine
|
||||
)));
|
||||
}
|
||||
|
||||
let mut kernel_end = load_addr;
|
||||
|
||||
// Load program headers
|
||||
for i in 0..elf.e_phnum {
|
||||
let ph_offset = elf.e_phoff as usize + (i as usize * elf.e_phentsize as usize);
|
||||
let ph = Elf64ProgramHeader::parse(&data[ph_offset..])?;
|
||||
|
||||
// Only load PT_LOAD segments
|
||||
if ph.p_type != 1 {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Calculate destination address
|
||||
// For PVH, we load at the physical address specified in the ELF
|
||||
// or offset from our load address
|
||||
let dest_addr = if ph.p_paddr >= layout::HIGH_MEMORY_START {
|
||||
ph.p_paddr
|
||||
} else {
|
||||
load_addr + ph.p_paddr
|
||||
};
|
||||
|
||||
// Validate we have space
|
||||
if dest_addr + ph.p_memsz > guest_mem.size() {
|
||||
return Err(BootError::KernelTooLarge {
|
||||
size: dest_addr + ph.p_memsz,
|
||||
available: guest_mem.size(),
|
||||
});
|
||||
}
|
||||
|
||||
// Load file contents
|
||||
let file_start = ph.p_offset as usize;
|
||||
let file_end = file_start + ph.p_filesz as usize;
|
||||
if file_end > data.len() {
|
||||
return Err(BootError::InvalidElf("Program header exceeds file size".into()));
|
||||
}
|
||||
|
||||
guest_mem.write_bytes(dest_addr, &data[file_start..file_end])?;
|
||||
|
||||
// Zero BSS (memsz > filesz)
|
||||
if ph.p_memsz > ph.p_filesz {
|
||||
let bss_start = dest_addr + ph.p_filesz;
|
||||
let bss_size = (ph.p_memsz - ph.p_filesz) as usize;
|
||||
let zeros = vec![0u8; bss_size];
|
||||
guest_mem.write_bytes(bss_start, &zeros)?;
|
||||
}
|
||||
|
||||
kernel_end = kernel_end.max(dest_addr + ph.p_memsz);
|
||||
|
||||
tracing::debug!(
|
||||
"Loaded ELF segment: dest=0x{:x}, filesz=0x{:x}, memsz=0x{:x}",
|
||||
dest_addr,
|
||||
ph.p_filesz,
|
||||
ph.p_memsz
|
||||
);
|
||||
}
|
||||
|
||||
tracing::debug!(
|
||||
"ELF kernel loaded: entry=0x{:x}, kernel_end=0x{:x}",
|
||||
elf.e_entry,
|
||||
kernel_end
|
||||
);
|
||||
|
||||
// For vmlinux ELF, the e_entry is the physical entry point.
|
||||
// But the kernel code is compiled for the virtual address.
|
||||
// We map both identity (physical) and high-kernel (virtual) addresses,
|
||||
// but it's better to use the physical entry for startup_64 which is
|
||||
// designed to run with identity mapping first.
|
||||
//
|
||||
// However, if the kernel immediately triple-faults at the physical address,
|
||||
// we can try the virtual address instead.
|
||||
// Virtual address = 0xFFFFFFFF80000000 + (physical - 0x1000000) + offset_within_text
|
||||
// For entry at physical 0x1000000, virtual would be 0xFFFFFFFF81000000
|
||||
let virtual_entry = 0xFFFFFFFF81000000u64 + (elf.e_entry - 0x1000000);
|
||||
|
||||
tracing::debug!(
|
||||
"Entry points: physical=0x{:x}, virtual=0x{:x}",
|
||||
elf.e_entry, virtual_entry
|
||||
);
|
||||
|
||||
Ok(KernelLoadResult {
|
||||
load_addr,
|
||||
size: kernel_end - load_addr,
|
||||
// Use PHYSICAL entry point - kernel's startup_64 expects identity mapping
|
||||
entry_point: elf.e_entry,
|
||||
kernel_type: KernelType::Elf64,
|
||||
})
|
||||
}
|
||||
|
||||
/// Load bzImage kernel
|
||||
fn load_bzimage<M: GuestMemory>(
|
||||
data: &[u8],
|
||||
load_addr: u64,
|
||||
guest_mem: &mut M,
|
||||
) -> Result<KernelLoadResult> {
|
||||
// Validate minimum size
|
||||
if data.len() < bzimage::SETUP_HEADER_SIZE + bzimage::SECTOR_SIZE {
|
||||
return Err(BootError::InvalidBzImage("Image too small".into()));
|
||||
}
|
||||
|
||||
// Check boot flag
|
||||
let boot_flag = u16::from_le_bytes([
|
||||
data[bzimage::BOOT_FLAG_OFFSET],
|
||||
data[bzimage::BOOT_FLAG_OFFSET + 1],
|
||||
]);
|
||||
if boot_flag != bzimage::BOOT_FLAG_VALUE {
|
||||
return Err(BootError::InvalidBzImage(format!(
|
||||
"Invalid boot flag: {:#x}",
|
||||
boot_flag
|
||||
)));
|
||||
}
|
||||
|
||||
// Get boot protocol version
|
||||
let version = u16::from_le_bytes([
|
||||
data[bzimage::VERSION_OFFSET],
|
||||
data[bzimage::VERSION_OFFSET + 1],
|
||||
]);
|
||||
if version < MIN_BOOT_PROTOCOL_VERSION {
|
||||
return Err(BootError::UnsupportedVersion(format!(
|
||||
"Boot protocol {}.{} is too old (minimum 2.0)",
|
||||
version >> 8,
|
||||
version & 0xff
|
||||
)));
|
||||
}
|
||||
|
||||
// Get setup sectors count
|
||||
let mut setup_sects = data[bzimage::SETUP_SECTS_OFFSET];
|
||||
if setup_sects == 0 {
|
||||
setup_sects = bzimage::DEFAULT_SETUP_SECTS;
|
||||
}
|
||||
|
||||
// Calculate kernel offset (setup sectors + boot sector)
|
||||
let setup_size = (setup_sects as usize + 1) * bzimage::SECTOR_SIZE;
|
||||
if setup_size >= data.len() {
|
||||
return Err(BootError::InvalidBzImage(
|
||||
"Setup size exceeds image size".into(),
|
||||
));
|
||||
}
|
||||
|
||||
// Get loadflags
|
||||
let loadflags = data[bzimage::LOADFLAGS_OFFSET];
|
||||
let loaded_high = (loadflags & bzimage::LOADFLAG_LOADED_HIGH) != 0;
|
||||
|
||||
// For modern kernels (protocol >= 2.0), get code32 entry point
|
||||
let code32_start = if version >= 0x0200 {
|
||||
u32::from_le_bytes([
|
||||
data[bzimage::CODE32_START_OFFSET],
|
||||
data[bzimage::CODE32_START_OFFSET + 1],
|
||||
data[bzimage::CODE32_START_OFFSET + 2],
|
||||
data[bzimage::CODE32_START_OFFSET + 3],
|
||||
])
|
||||
} else {
|
||||
0x100000 // Default high load address
|
||||
};
|
||||
|
||||
// Check for 64-bit support (protocol >= 2.11)
|
||||
let supports_64bit = if version >= 0x020b {
|
||||
let xloadflags = u16::from_le_bytes([
|
||||
data[bzimage::XLOADFLAGS_OFFSET],
|
||||
data[bzimage::XLOADFLAGS_OFFSET + 1],
|
||||
]);
|
||||
(xloadflags & bzimage::XLF_KERNEL_64) != 0
|
||||
} else {
|
||||
false
|
||||
};
|
||||
|
||||
// Get preferred load address (protocol >= 2.10)
|
||||
let pref_address = if version >= 0x020a && data.len() > bzimage::PREF_ADDRESS_OFFSET + 8 {
|
||||
u64::from_le_bytes([
|
||||
data[bzimage::PREF_ADDRESS_OFFSET],
|
||||
data[bzimage::PREF_ADDRESS_OFFSET + 1],
|
||||
data[bzimage::PREF_ADDRESS_OFFSET + 2],
|
||||
data[bzimage::PREF_ADDRESS_OFFSET + 3],
|
||||
data[bzimage::PREF_ADDRESS_OFFSET + 4],
|
||||
data[bzimage::PREF_ADDRESS_OFFSET + 5],
|
||||
data[bzimage::PREF_ADDRESS_OFFSET + 6],
|
||||
data[bzimage::PREF_ADDRESS_OFFSET + 7],
|
||||
])
|
||||
} else {
|
||||
layout::KERNEL_LOAD_ADDR
|
||||
};
|
||||
|
||||
// Determine actual load address
|
||||
let actual_load_addr = if loaded_high {
|
||||
if pref_address != 0 {
|
||||
pref_address
|
||||
} else {
|
||||
load_addr
|
||||
}
|
||||
} else {
|
||||
load_addr
|
||||
};
|
||||
|
||||
// Extract protected mode kernel
|
||||
let kernel_data = &data[setup_size..];
|
||||
let kernel_size = kernel_data.len() as u64;
|
||||
|
||||
// Validate size
|
||||
if actual_load_addr + kernel_size > guest_mem.size() {
|
||||
return Err(BootError::KernelTooLarge {
|
||||
size: kernel_size,
|
||||
available: guest_mem.size() - actual_load_addr,
|
||||
});
|
||||
}
|
||||
|
||||
// Write kernel to guest memory
|
||||
guest_mem.write_bytes(actual_load_addr, kernel_data)?;
|
||||
|
||||
// Determine entry point
|
||||
// For PVH boot, we enter at the 64-bit entry point
|
||||
// which is typically at load_addr + 0x200 for modern kernels
|
||||
let entry_point = if supports_64bit {
|
||||
// 64-bit entry point offset in newer kernels
|
||||
actual_load_addr + 0x200
|
||||
} else {
|
||||
code32_start as u64
|
||||
};
|
||||
|
||||
Ok(KernelLoadResult {
|
||||
load_addr: actual_load_addr,
|
||||
size: kernel_size,
|
||||
entry_point,
|
||||
kernel_type: KernelType::BzImage,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// ELF64 header structure
|
||||
#[derive(Debug, Default)]
|
||||
struct Elf64Header {
|
||||
e_type: u16,
|
||||
e_machine: u16,
|
||||
e_entry: u64,
|
||||
e_phoff: u64,
|
||||
e_phnum: u16,
|
||||
e_phentsize: u16,
|
||||
}
|
||||
|
||||
impl Elf64Header {
|
||||
fn parse(data: &[u8]) -> Result<Self> {
|
||||
if data.len() < 64 {
|
||||
return Err(BootError::InvalidElf("ELF header too small".into()));
|
||||
}
|
||||
|
||||
// Verify ELF magic
|
||||
if &data[0..4] != &ELF_MAGIC {
|
||||
return Err(BootError::InvalidElf("Invalid ELF magic".into()));
|
||||
}
|
||||
|
||||
// Verify 64-bit
|
||||
if data[4] != 2 {
|
||||
return Err(BootError::InvalidElf("Not ELF64".into()));
|
||||
}
|
||||
|
||||
// Verify little-endian
|
||||
if data[5] != 1 {
|
||||
return Err(BootError::InvalidElf("Not little-endian".into()));
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
e_type: u16::from_le_bytes([data[16], data[17]]),
|
||||
e_machine: u16::from_le_bytes([data[18], data[19]]),
|
||||
e_entry: u64::from_le_bytes([
|
||||
data[24], data[25], data[26], data[27],
|
||||
data[28], data[29], data[30], data[31],
|
||||
]),
|
||||
e_phoff: u64::from_le_bytes([
|
||||
data[32], data[33], data[34], data[35],
|
||||
data[36], data[37], data[38], data[39],
|
||||
]),
|
||||
e_phentsize: u16::from_le_bytes([data[54], data[55]]),
|
||||
e_phnum: u16::from_le_bytes([data[56], data[57]]),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// ELF64 program header structure
|
||||
#[derive(Debug, Default)]
|
||||
struct Elf64ProgramHeader {
|
||||
p_type: u32,
|
||||
p_offset: u64,
|
||||
p_paddr: u64,
|
||||
p_filesz: u64,
|
||||
p_memsz: u64,
|
||||
}
|
||||
|
||||
impl Elf64ProgramHeader {
|
||||
fn parse(data: &[u8]) -> Result<Self> {
|
||||
if data.len() < 56 {
|
||||
return Err(BootError::InvalidElf("Program header too small".into()));
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
p_type: u32::from_le_bytes([data[0], data[1], data[2], data[3]]),
|
||||
p_offset: u64::from_le_bytes([
|
||||
data[8], data[9], data[10], data[11],
|
||||
data[12], data[13], data[14], data[15],
|
||||
]),
|
||||
p_paddr: u64::from_le_bytes([
|
||||
data[24], data[25], data[26], data[27],
|
||||
data[28], data[29], data[30], data[31],
|
||||
]),
|
||||
p_filesz: u64::from_le_bytes([
|
||||
data[32], data[33], data[34], data[35],
|
||||
data[36], data[37], data[38], data[39],
|
||||
]),
|
||||
p_memsz: u64::from_le_bytes([
|
||||
data[40], data[41], data[42], data[43],
|
||||
data[44], data[45], data[46], data[47],
|
||||
]),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_detect_elf_magic() {
|
||||
let mut elf_data = vec![0u8; 64];
|
||||
elf_data[0..4].copy_from_slice(&ELF_MAGIC);
|
||||
elf_data[4] = 2; // ELF64
|
||||
|
||||
let result = KernelLoader::detect_kernel_type(&elf_data);
|
||||
assert!(matches!(result, Ok(KernelType::Elf64)));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_detect_bzimage_magic() {
|
||||
let mut bzimage_data = vec![0u8; 0x210];
|
||||
// Set boot flag
|
||||
bzimage_data[bzimage::BOOT_FLAG_OFFSET] = 0x55;
|
||||
bzimage_data[bzimage::BOOT_FLAG_OFFSET + 1] = 0xaa;
|
||||
// Set HdrS magic
|
||||
bzimage_data[bzimage::HEADER_MAGIC_OFFSET] = 0x48; // 'H'
|
||||
bzimage_data[bzimage::HEADER_MAGIC_OFFSET + 1] = 0x64; // 'd'
|
||||
bzimage_data[bzimage::HEADER_MAGIC_OFFSET + 2] = 0x72; // 'r'
|
||||
bzimage_data[bzimage::HEADER_MAGIC_OFFSET + 3] = 0x53; // 'S'
|
||||
|
||||
let result = KernelLoader::detect_kernel_type(&bzimage_data);
|
||||
assert!(matches!(result, Ok(KernelType::BzImage)));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_invalid_kernel() {
|
||||
let data = vec![0u8; 100];
|
||||
let result = KernelLoader::detect_kernel_type(&data);
|
||||
assert!(matches!(result, Err(BootError::InvalidKernel(_))));
|
||||
}
|
||||
}
|
||||
378
vmm/src/boot/mod.rs
Normal file
378
vmm/src/boot/mod.rs
Normal file
@@ -0,0 +1,378 @@
|
||||
//! Volt Boot Loader Module
|
||||
//!
|
||||
//! Implements PVH direct kernel boot for sub-50ms cold boot times.
|
||||
//! Skips BIOS/UEFI entirely by directly loading the kernel into guest memory
|
||||
//! and setting up the boot parameters.
|
||||
//!
|
||||
//! # Boot Protocol
|
||||
//!
|
||||
//! Volt uses the PVH boot protocol (Xen-compatible) which allows direct
|
||||
//! kernel entry without firmware. This is significantly faster than:
|
||||
//! - Traditional BIOS boot (seconds)
|
||||
//! - Linux boot protocol via SeaBIOS (hundreds of ms)
|
||||
//! - UEFI boot (hundreds of ms)
|
||||
//!
|
||||
//! # Supported Kernel Formats
|
||||
//!
|
||||
//! - ELF64 (vmlinux) - Direct kernel image
|
||||
//! - bzImage - Compressed Linux kernel with setup header
|
||||
//!
|
||||
//! # Memory Layout (typical)
|
||||
//!
|
||||
//! ```text
|
||||
//! 0x0000_0000 - 0x0000_1000 : Reserved (real mode IVT, BDA)
|
||||
//! 0x0000_7000 - 0x0000_8000 : PVH start_info structure
|
||||
//! 0x0000_8000 - 0x0000_9000 : Boot command line
|
||||
//! 0x0001_0000 - 0x0009_0000 : E820 map / boot params
|
||||
//! 0x0010_0000 - ... : Kernel load address (1MB)
|
||||
//! ... - RAM_END : Initrd (loaded at high memory)
|
||||
//! ```
|
||||
|
||||
mod gdt;
|
||||
mod initrd;
|
||||
mod linux;
|
||||
mod loader;
|
||||
pub mod mptable;
|
||||
mod pagetable;
|
||||
#[allow(dead_code)] // PVH boot protocol — planned feature, not yet wired up
|
||||
mod pvh;
|
||||
|
||||
pub use gdt::GdtSetup;
|
||||
pub use initrd::{InitrdConfig, InitrdLoader};
|
||||
pub use linux::LinuxBootSetup;
|
||||
pub use loader::{KernelConfig, KernelLoader};
|
||||
pub use mptable::setup_mptable;
|
||||
pub use pagetable::PageTableSetup;
|
||||
|
||||
use std::io;
|
||||
use thiserror::Error;
|
||||
|
||||
/// Boot loader errors
|
||||
#[derive(Error, Debug)]
|
||||
pub enum BootError {
|
||||
#[error("Failed to read kernel image: {0}")]
|
||||
KernelRead(#[source] io::Error),
|
||||
|
||||
#[error("Failed to read initrd: {0}")]
|
||||
InitrdRead(#[source] io::Error),
|
||||
|
||||
#[error("Invalid kernel format: {0}")]
|
||||
InvalidKernel(String),
|
||||
|
||||
#[error("Invalid bzImage: {0}")]
|
||||
InvalidBzImage(String),
|
||||
|
||||
#[error("Invalid ELF kernel: {0}")]
|
||||
InvalidElf(String),
|
||||
|
||||
#[error("Kernel too large: {size} bytes exceeds available memory {available}")]
|
||||
KernelTooLarge { size: u64, available: u64 },
|
||||
|
||||
#[error("Initrd too large: {size} bytes exceeds available memory {available}")]
|
||||
InitrdTooLarge { size: u64, available: u64 },
|
||||
|
||||
#[error("Command line too long: {len} bytes exceeds maximum {max}")]
|
||||
CommandLineTooLong { len: usize, max: usize },
|
||||
|
||||
#[error("Memory layout error: {0}")]
|
||||
MemoryLayout(String),
|
||||
|
||||
#[error("Failed to write to guest memory: {0}")]
|
||||
GuestMemoryWrite(String),
|
||||
|
||||
#[error("PVH setup failed: {0}")]
|
||||
#[allow(dead_code)] // PVH boot path planned
|
||||
PvhSetup(String),
|
||||
|
||||
#[error("Unsupported kernel version: {0}")]
|
||||
UnsupportedVersion(String),
|
||||
}
|
||||
|
||||
pub type Result<T> = std::result::Result<T, BootError>;
|
||||
|
||||
/// Memory addresses for boot components (x86_64)
|
||||
///
|
||||
/// # Memory Layout (designed to avoid page table overlaps)
|
||||
///
|
||||
/// For VMs with up to 4GB RAM, page tables can use addresses 0x1000-0xA000.
|
||||
/// All boot structures are placed above 0x10000 to ensure no overlaps.
|
||||
///
|
||||
/// ```text
|
||||
/// 0x0000 - 0x04FF : Reserved (IVT, BDA)
|
||||
/// 0x0500 - 0x052F : GDT (3 entries)
|
||||
/// 0x1000 - 0x1FFF : PML4
|
||||
/// 0x2000 - 0x2FFF : PDPT_LOW (identity mapping)
|
||||
/// 0x3000 - 0x3FFF : PDPT_HIGH (kernel high-half mapping)
|
||||
/// 0x4000 - 0x7FFF : PD tables for identity mapping (up to 4 for 4GB)
|
||||
/// 0x8000 - 0x9FFF : PD tables for high-half kernel mapping
|
||||
/// 0xA000 - 0x1FFFF : Reserved / available
|
||||
/// 0x20000 : boot_params (Linux zero page) - 4KB
|
||||
/// 0x21000 : PVH start_info - 4KB
|
||||
/// 0x22000 : E820 memory map - 4KB
|
||||
/// 0x30000 : Boot command line - 4KB
|
||||
/// 0x31000 - 0xFFFFF: Stack and scratch space
|
||||
/// 0x100000 : Kernel load address (1MB)
|
||||
/// ```
|
||||
#[allow(dead_code)] // Memory layout constants — reference for boot protocol
|
||||
pub mod layout {
|
||||
/// Start of reserved low memory
|
||||
pub const LOW_MEMORY_START: u64 = 0x0;
|
||||
|
||||
/// Page table area starts here (PML4)
|
||||
pub const PAGE_TABLE_START: u64 = 0x1000;
|
||||
|
||||
/// End of page table reserved area (enough for 4GB + high-half mapping)
|
||||
pub const PAGE_TABLE_END: u64 = 0xA000;
|
||||
|
||||
/// PVH start_info structure location
|
||||
/// MOVED from 0x7000 to 0x21000 to avoid page table overlap with large VMs
|
||||
pub const PVH_START_INFO_ADDR: u64 = 0x21000;
|
||||
|
||||
/// Boot command line location (after boot_params at 0x20000)
|
||||
pub const CMDLINE_ADDR: u64 = 0x30000;
|
||||
|
||||
/// Maximum command line length (including null terminator)
|
||||
pub const CMDLINE_MAX_SIZE: usize = 4096;
|
||||
|
||||
/// E820 memory map location
|
||||
/// MOVED from 0x9000 to 0x22000 to avoid page table overlap with large VMs
|
||||
pub const E820_MAP_ADDR: u64 = 0x22000;
|
||||
|
||||
/// Default kernel load address (1MB, standard for x86_64)
|
||||
pub const KERNEL_LOAD_ADDR: u64 = 0x100000;
|
||||
|
||||
/// Minimum gap between kernel and initrd
|
||||
pub const KERNEL_INITRD_GAP: u64 = 0x1000;
|
||||
|
||||
/// EBDA (Extended BIOS Data Area) size to reserve
|
||||
pub const EBDA_SIZE: u64 = 0x1000;
|
||||
|
||||
/// End of low memory (640KB boundary)
|
||||
pub const LOW_MEMORY_END: u64 = 0xA0000;
|
||||
|
||||
/// Start of high memory (1MB)
|
||||
pub const HIGH_MEMORY_START: u64 = 0x100000;
|
||||
|
||||
/// Initial stack pointer for boot
|
||||
/// Placed in safe area above page tables but below boot structures
|
||||
pub const BOOT_STACK_POINTER: u64 = 0x1FFF0;
|
||||
|
||||
/// PVH entry point - RIP value when starting the VM
|
||||
/// This should point to the kernel entry point
|
||||
pub const PVH_ENTRY_POINT: u64 = KERNEL_LOAD_ADDR;
|
||||
}
|
||||
|
||||
/// Boot configuration combining kernel, initrd, and PVH setup
|
||||
#[derive(Debug, Clone)]
|
||||
#[allow(dead_code)] // Fields set by config but not all read yet
|
||||
pub struct BootConfig {
|
||||
/// Path to kernel image
|
||||
pub kernel_path: String,
|
||||
|
||||
/// Optional path to initrd/initramfs
|
||||
pub initrd_path: Option<String>,
|
||||
|
||||
/// Kernel command line
|
||||
pub cmdline: String,
|
||||
|
||||
/// Total guest memory size in bytes
|
||||
pub memory_size: u64,
|
||||
|
||||
/// Number of vCPUs
|
||||
pub vcpu_count: u32,
|
||||
}
|
||||
|
||||
impl Default for BootConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
kernel_path: String::new(),
|
||||
initrd_path: None,
|
||||
cmdline: String::from("console=ttyS0 reboot=k panic=1 pci=off"),
|
||||
memory_size: 128 * 1024 * 1024, // 128MB default
|
||||
vcpu_count: 1,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Result of boot setup - contains entry point and register state
|
||||
#[derive(Debug, Clone)]
|
||||
#[allow(dead_code)] // All fields are part of the boot result, may not all be read yet
|
||||
pub struct BootSetupResult {
|
||||
/// Kernel entry point (RIP)
|
||||
pub entry_point: u64,
|
||||
|
||||
/// Initial stack pointer (RSP)
|
||||
pub stack_pointer: u64,
|
||||
|
||||
/// Address of boot_params structure (RSI for Linux boot protocol)
|
||||
pub start_info_addr: u64,
|
||||
|
||||
/// CR3 value (page table base address)
|
||||
pub cr3: u64,
|
||||
|
||||
/// Address where kernel was loaded
|
||||
pub kernel_load_addr: u64,
|
||||
|
||||
/// Size of loaded kernel
|
||||
pub kernel_size: u64,
|
||||
|
||||
/// Address where initrd was loaded (if any)
|
||||
pub initrd_addr: Option<u64>,
|
||||
|
||||
/// Size of initrd (if any)
|
||||
pub initrd_size: Option<u64>,
|
||||
}
|
||||
|
||||
/// Trait for guest memory access during boot
|
||||
pub trait GuestMemory {
|
||||
/// Write bytes to guest memory at the given address
|
||||
fn write_bytes(&mut self, addr: u64, data: &[u8]) -> Result<()>;
|
||||
|
||||
/// Write a value to guest memory
|
||||
#[allow(dead_code)]
|
||||
fn write_obj<T: Copy>(&mut self, addr: u64, val: &T) -> Result<()> {
|
||||
let bytes = unsafe {
|
||||
std::slice::from_raw_parts(val as *const T as *const u8, std::mem::size_of::<T>())
|
||||
};
|
||||
self.write_bytes(addr, bytes)
|
||||
}
|
||||
|
||||
/// Get the total size of guest memory
|
||||
fn size(&self) -> u64;
|
||||
}
|
||||
|
||||
/// Complete boot loader that orchestrates kernel, initrd, and PVH setup
|
||||
pub struct BootLoader;
|
||||
|
||||
impl BootLoader {
|
||||
/// Load kernel and initrd, set up Linux boot protocol
|
||||
///
|
||||
/// This is the main entry point for boot setup. It:
|
||||
/// 1. Loads the kernel image (ELF or bzImage)
|
||||
/// 2. Loads the initrd if specified
|
||||
/// 3. Sets up the Linux boot_params structure (zero page)
|
||||
/// 4. Writes the command line
|
||||
/// 5. Returns the boot parameters for vCPU initialization
|
||||
pub fn setup<M: GuestMemory>(
|
||||
config: &BootConfig,
|
||||
guest_mem: &mut M,
|
||||
) -> Result<BootSetupResult> {
|
||||
// Validate command line length
|
||||
if config.cmdline.len() >= layout::CMDLINE_MAX_SIZE {
|
||||
return Err(BootError::CommandLineTooLong {
|
||||
len: config.cmdline.len(),
|
||||
max: layout::CMDLINE_MAX_SIZE - 1,
|
||||
});
|
||||
}
|
||||
|
||||
// Load kernel
|
||||
let kernel_config = KernelConfig {
|
||||
path: config.kernel_path.clone(),
|
||||
load_addr: layout::KERNEL_LOAD_ADDR,
|
||||
};
|
||||
let kernel_result = KernelLoader::load(&kernel_config, guest_mem)?;
|
||||
|
||||
// Calculate initrd placement (high memory, after kernel)
|
||||
let initrd_result = if let Some(ref initrd_path) = config.initrd_path {
|
||||
let initrd_config = InitrdConfig {
|
||||
path: initrd_path.clone(),
|
||||
memory_size: config.memory_size,
|
||||
kernel_end: kernel_result.load_addr + kernel_result.size,
|
||||
};
|
||||
Some(InitrdLoader::load(&initrd_config, guest_mem)?)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// Write command line to guest memory
|
||||
let cmdline_bytes = config.cmdline.as_bytes();
|
||||
guest_mem.write_bytes(layout::CMDLINE_ADDR, cmdline_bytes)?;
|
||||
// Null terminator
|
||||
guest_mem.write_bytes(layout::CMDLINE_ADDR + cmdline_bytes.len() as u64, &[0])?;
|
||||
|
||||
// Set up GDT for 64-bit mode
|
||||
GdtSetup::setup(guest_mem)?;
|
||||
|
||||
// Set up identity-mapped page tables for 64-bit mode
|
||||
let cr3 = PageTableSetup::setup(guest_mem, config.memory_size)?;
|
||||
|
||||
// Set up Linux boot_params structure (zero page)
|
||||
let linux_config = linux::LinuxBootConfig {
|
||||
memory_size: config.memory_size,
|
||||
cmdline_addr: layout::CMDLINE_ADDR,
|
||||
initrd_addr: initrd_result.as_ref().map(|r| r.load_addr),
|
||||
initrd_size: initrd_result.as_ref().map(|r| r.size),
|
||||
};
|
||||
let boot_params_addr = LinuxBootSetup::setup(&linux_config, guest_mem)?;
|
||||
|
||||
Ok(BootSetupResult {
|
||||
entry_point: kernel_result.entry_point,
|
||||
stack_pointer: layout::BOOT_STACK_POINTER,
|
||||
start_info_addr: boot_params_addr,
|
||||
cr3,
|
||||
kernel_load_addr: kernel_result.load_addr,
|
||||
kernel_size: kernel_result.size,
|
||||
initrd_addr: initrd_result.as_ref().map(|r| r.load_addr),
|
||||
initrd_size: initrd_result.as_ref().map(|r| r.size),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
struct MockMemory {
|
||||
size: u64,
|
||||
data: Vec<u8>,
|
||||
}
|
||||
|
||||
impl MockMemory {
|
||||
fn new(size: u64) -> Self {
|
||||
Self {
|
||||
size,
|
||||
data: vec![0; size as usize],
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl GuestMemory for MockMemory {
|
||||
fn write_bytes(&mut self, addr: u64, data: &[u8]) -> Result<()> {
|
||||
let end = addr as usize + data.len();
|
||||
if end > self.data.len() {
|
||||
return Err(BootError::GuestMemoryWrite(format!(
|
||||
"Write at {:#x} with len {} exceeds memory size {}",
|
||||
addr,
|
||||
data.len(),
|
||||
self.size
|
||||
)));
|
||||
}
|
||||
self.data[addr as usize..end].copy_from_slice(data);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn size(&self) -> u64 {
|
||||
self.size
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_boot_config_default() {
|
||||
let config = BootConfig::default();
|
||||
assert!(config.cmdline.contains("console=ttyS0"));
|
||||
assert_eq!(config.vcpu_count, 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cmdline_too_long() {
|
||||
let mut mem = MockMemory::new(1024 * 1024);
|
||||
let config = BootConfig {
|
||||
kernel_path: "/boot/vmlinux".into(),
|
||||
cmdline: "x".repeat(layout::CMDLINE_MAX_SIZE + 1),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = BootLoader::setup(&config, &mut mem);
|
||||
assert!(matches!(result, Err(BootError::CommandLineTooLong { .. })));
|
||||
}
|
||||
}
|
||||
611
vmm/src/boot/mptable.rs
Normal file
611
vmm/src/boot/mptable.rs
Normal file
@@ -0,0 +1,611 @@
|
||||
//! Intel MultiProcessor Specification (MPS) Table Construction
|
||||
//!
|
||||
//! Implements MP Floating Pointer and MP Configuration Table structures
|
||||
//! to advertise SMP topology to the guest kernel. This allows Linux to
|
||||
//! discover and boot Application Processors (APs) beyond the Bootstrap
|
||||
//! Processor (BSP).
|
||||
//!
|
||||
//! # Table Layout (placed at 0x9FC00, just below EBDA)
|
||||
//!
|
||||
//! ```text
|
||||
//! 0x9FC00: MP Floating Pointer Structure (16 bytes)
|
||||
//! 0x9FC10: MP Configuration Table Header (44 bytes)
|
||||
//! 0x9FC3C: Processor Entry 0 (BSP, APIC ID 0) — 20 bytes
|
||||
//! 0x9FC50: Processor Entry 1 (AP, APIC ID 1) — 20 bytes
|
||||
//! ...
|
||||
//! Bus Entry (ISA, 8 bytes)
|
||||
//! I/O APIC Entry (8 bytes)
|
||||
//! I/O Interrupt Entries (IRQ 0-15, 8 bytes each)
|
||||
//! ```
|
||||
//!
|
||||
//! # References
|
||||
//! - Intel MultiProcessor Specification v1.4 (May 1997)
|
||||
//! - Firecracker's mpspec implementation (src/vmm/src/arch/x86_64/mptable.rs)
|
||||
//! - Linux kernel: arch/x86/kernel/mpparse.c
|
||||
|
||||
use super::{BootError, GuestMemory, Result};
|
||||
|
||||
/// Base address for MP tables — just below EBDA at 640KB boundary.
|
||||
/// This address (0x9FC00) is a conventional location that Linux scans.
|
||||
pub const MP_TABLE_START: u64 = 0x9FC00;
|
||||
|
||||
/// Maximum number of vCPUs we can fit in the MP table area.
|
||||
/// Each processor entry is 20 bytes. Between 0x9FC00 and 0xA0000 we have
|
||||
/// 1024 bytes. After headers (60 bytes), bus (8), IOAPIC (8), and 16 IRQ
|
||||
/// entries (128 bytes), we have ~830 bytes = 41 processor entries.
|
||||
/// That's more than enough — clamp to 255 (max APIC IDs).
|
||||
pub const MAX_CPUS: u8 = 255;
|
||||
|
||||
// ============================================================================
|
||||
// MP Floating Pointer Structure (16 bytes)
|
||||
// Intel MPS Table 4-1
|
||||
// ============================================================================
|
||||
|
||||
/// MP Floating Pointer signature: "_MP_"
|
||||
const MP_FP_SIGNATURE: [u8; 4] = [b'_', b'M', b'P', b'_'];
|
||||
|
||||
/// MP Configuration Table signature: "PCMP"
|
||||
const MP_CT_SIGNATURE: [u8; 4] = [b'P', b'C', b'M', b'P'];
|
||||
|
||||
/// MP spec revision 1.4
|
||||
const MP_SPEC_REVISION: u8 = 4;
|
||||
|
||||
/// MP Floating Pointer Feature Byte 1: indicates MP Config Table present
|
||||
const MP_FEATURE_IMCRP: u8 = 0x80;
|
||||
|
||||
// ============================================================================
|
||||
// MP Table Entry Types
|
||||
// ============================================================================
|
||||
|
||||
const MP_ENTRY_PROCESSOR: u8 = 0;
|
||||
const MP_ENTRY_BUS: u8 = 1;
|
||||
const MP_ENTRY_IOAPIC: u8 = 2;
|
||||
const MP_ENTRY_IO_INTERRUPT: u8 = 3;
|
||||
#[allow(dead_code)]
|
||||
const MP_ENTRY_LOCAL_INTERRUPT: u8 = 4;
|
||||
|
||||
// Processor entry flags
|
||||
const CPU_FLAG_ENABLED: u8 = 0x01;
|
||||
const CPU_FLAG_BSP: u8 = 0x02;
|
||||
|
||||
// Interrupt types
|
||||
const INT_TYPE_INT: u8 = 0; // Vectored interrupt
|
||||
#[allow(dead_code)]
|
||||
const INT_TYPE_NMI: u8 = 1;
|
||||
#[allow(dead_code)]
|
||||
const INT_TYPE_SMI: u8 = 2;
|
||||
const INT_TYPE_EXTINT: u8 = 3; // ExtINT (from 8259)
|
||||
|
||||
// Interrupt polarity/trigger flags
|
||||
const INT_FLAG_DEFAULT: u16 = 0x0000; // Conforms to bus spec
|
||||
|
||||
// I/O APIC default address
|
||||
const IOAPIC_DEFAULT_ADDR: u32 = 0xFEC0_0000;
|
||||
|
||||
/// ISA bus type string
|
||||
const BUS_TYPE_ISA: [u8; 6] = [b'I', b'S', b'A', b' ', b' ', b' '];
|
||||
|
||||
// ============================================================================
|
||||
// MP Table Builder
|
||||
// ============================================================================
|
||||
|
||||
/// Write MP tables to guest memory for SMP discovery.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `guest_mem` — Guest memory to write the tables into
|
||||
/// * `num_cpus` — Number of vCPUs (1-255)
|
||||
///
|
||||
/// # Returns
|
||||
/// The guest physical address where the MP Floating Pointer was written.
|
||||
pub fn setup_mptable<M: GuestMemory>(guest_mem: &mut M, num_cpus: u8) -> Result<u64> {
|
||||
if num_cpus == 0 {
|
||||
return Err(BootError::MemoryLayout(
|
||||
"MP table requires at least 1 CPU".to_string(),
|
||||
));
|
||||
}
|
||||
if num_cpus > MAX_CPUS {
|
||||
return Err(BootError::MemoryLayout(format!(
|
||||
"MP table supports at most {} CPUs, got {}",
|
||||
MAX_CPUS, num_cpus
|
||||
)));
|
||||
}
|
||||
|
||||
// Calculate sizes and offsets
|
||||
let fp_size: u64 = 16; // MP Floating Pointer
|
||||
let header_size: u64 = 44; // MP Config Table Header
|
||||
let processor_entry_size: u64 = 20;
|
||||
let bus_entry_size: u64 = 8;
|
||||
let ioapic_entry_size: u64 = 8;
|
||||
let io_int_entry_size: u64 = 8;
|
||||
|
||||
// Number of IO interrupt entries: IRQ 0-15 = 16 entries
|
||||
let num_irqs: u64 = 16;
|
||||
|
||||
let config_table_addr = MP_TABLE_START + fp_size;
|
||||
let _entries_start = config_table_addr + header_size;
|
||||
|
||||
// Calculate total config table size (header + all entries)
|
||||
let total_entries_size = (num_cpus as u64) * processor_entry_size
|
||||
+ bus_entry_size
|
||||
+ ioapic_entry_size
|
||||
+ num_irqs * io_int_entry_size;
|
||||
let config_table_size = header_size + total_entries_size;
|
||||
|
||||
// Verify we fit in the available space (between 0x9FC00 and 0xA0000)
|
||||
let total_size = fp_size + config_table_size;
|
||||
if MP_TABLE_START + total_size > 0xA0000 {
|
||||
return Err(BootError::MemoryLayout(format!(
|
||||
"MP tables ({} bytes) exceed available space (0x9FC00-0xA0000)",
|
||||
total_size
|
||||
)));
|
||||
}
|
||||
|
||||
// Verify we have enough guest memory
|
||||
if MP_TABLE_START + total_size > guest_mem.size() {
|
||||
return Err(BootError::MemoryLayout(format!(
|
||||
"MP tables at 0x{:x} exceed guest memory size 0x{:x}",
|
||||
MP_TABLE_START + total_size,
|
||||
guest_mem.size()
|
||||
)));
|
||||
}
|
||||
|
||||
// Build the MP Configuration Table body (entries)
|
||||
let mut table_buf = Vec::with_capacity(config_table_size as usize);
|
||||
|
||||
// Leave space for the header (we'll fill it after computing checksum)
|
||||
table_buf.resize(header_size as usize, 0);
|
||||
|
||||
// ---- Processor Entries ----
|
||||
let mut entry_count: u16 = 0;
|
||||
|
||||
for cpu_id in 0..num_cpus {
|
||||
let flags = if cpu_id == 0 {
|
||||
CPU_FLAG_ENABLED | CPU_FLAG_BSP
|
||||
} else {
|
||||
CPU_FLAG_ENABLED
|
||||
};
|
||||
|
||||
// CPU signature: Family 6, Model 15 (Core 2 / Merom-class)
|
||||
// This is a safe generic modern x86_64 signature
|
||||
let cpu_signature: u32 = (6 << 8) | (15 << 4) | 1; // Family=6, Model=F, Stepping=1
|
||||
let feature_flags: u32 = 0x0781_FBFF; // Common feature flags (FPU, SSE, SSE2, etc.)
|
||||
|
||||
write_processor_entry(
|
||||
&mut table_buf,
|
||||
cpu_id, // Local APIC ID
|
||||
0x14, // Local APIC version (integrated APIC)
|
||||
flags,
|
||||
cpu_signature,
|
||||
feature_flags,
|
||||
);
|
||||
entry_count += 1;
|
||||
}
|
||||
|
||||
// ---- Bus Entry (ISA) ----
|
||||
write_bus_entry(&mut table_buf, 0, &BUS_TYPE_ISA);
|
||||
entry_count += 1;
|
||||
|
||||
// ---- I/O APIC Entry ----
|
||||
// I/O APIC ID = num_cpus (first ID after all processors)
|
||||
let ioapic_id = num_cpus;
|
||||
write_ioapic_entry(&mut table_buf, ioapic_id, 0x11, IOAPIC_DEFAULT_ADDR);
|
||||
entry_count += 1;
|
||||
|
||||
// ---- I/O Interrupt Assignment Entries ----
|
||||
// Map ISA IRQs 0-15 to IOAPIC pins 0-15
|
||||
|
||||
// IRQ 0: ExtINT (8259 cascade through IOAPIC pin 0)
|
||||
write_io_interrupt_entry(
|
||||
&mut table_buf,
|
||||
INT_TYPE_EXTINT,
|
||||
INT_FLAG_DEFAULT,
|
||||
0, // source bus = ISA
|
||||
0, // source bus IRQ = 0
|
||||
ioapic_id,
|
||||
0, // IOAPIC pin 0
|
||||
);
|
||||
entry_count += 1;
|
||||
|
||||
// IRQs 1-15: Standard vectored interrupts
|
||||
for irq in 1..16u8 {
|
||||
// IRQ 2 is the PIC cascade — skip it (Linux doesn't use it in APIC mode)
|
||||
// But we still report it for completeness
|
||||
write_io_interrupt_entry(
|
||||
&mut table_buf,
|
||||
INT_TYPE_INT,
|
||||
INT_FLAG_DEFAULT,
|
||||
0, // source bus = ISA
|
||||
irq, // source bus IRQ
|
||||
ioapic_id,
|
||||
irq, // IOAPIC pin = same as IRQ number
|
||||
);
|
||||
entry_count += 1;
|
||||
}
|
||||
|
||||
// ---- Fill in the Configuration Table Header ----
|
||||
// Build header at the start of table_buf
|
||||
{
|
||||
// Compute length before taking mutable borrow of the header slice
|
||||
let table_len = table_buf.len() as u16;
|
||||
let header = &mut table_buf[0..header_size as usize];
|
||||
|
||||
// Signature: "PCMP"
|
||||
header[0..4].copy_from_slice(&MP_CT_SIGNATURE);
|
||||
// Base table length (u16 LE) — entire config table including header
|
||||
header[4..6].copy_from_slice(&table_len.to_le_bytes());
|
||||
// Spec revision
|
||||
header[6] = MP_SPEC_REVISION;
|
||||
// Checksum — will be filled below
|
||||
header[7] = 0;
|
||||
// OEM ID (8 bytes, space-padded)
|
||||
header[8..16].copy_from_slice(b"NOVAFLAR");
|
||||
// Product ID (12 bytes, space-padded)
|
||||
header[16..28].copy_from_slice(b"VOLT VM");
|
||||
// OEM table pointer (0 = none)
|
||||
header[28..32].copy_from_slice(&0u32.to_le_bytes());
|
||||
// OEM table size
|
||||
header[32..34].copy_from_slice(&0u16.to_le_bytes());
|
||||
// Entry count
|
||||
header[34..36].copy_from_slice(&entry_count.to_le_bytes());
|
||||
// Local APIC address
|
||||
header[36..40].copy_from_slice(&0xFEE0_0000u32.to_le_bytes());
|
||||
// Extended table length
|
||||
header[40..42].copy_from_slice(&0u16.to_le_bytes());
|
||||
// Extended table checksum
|
||||
header[42] = 0;
|
||||
// Reserved
|
||||
header[43] = 0;
|
||||
|
||||
// Compute and set checksum
|
||||
let checksum = compute_checksum(&table_buf);
|
||||
table_buf[7] = checksum;
|
||||
}
|
||||
|
||||
// ---- Build the MP Floating Pointer Structure ----
|
||||
let mut fp_buf = [0u8; 16];
|
||||
|
||||
// Signature: "_MP_"
|
||||
fp_buf[0..4].copy_from_slice(&MP_FP_SIGNATURE);
|
||||
// Physical address pointer to MP Config Table (u32 LE)
|
||||
fp_buf[4..8].copy_from_slice(&(config_table_addr as u32).to_le_bytes());
|
||||
// Length in 16-byte paragraphs (1 = 16 bytes)
|
||||
fp_buf[8] = 1;
|
||||
// Spec revision
|
||||
fp_buf[9] = MP_SPEC_REVISION;
|
||||
// Checksum — filled below
|
||||
fp_buf[10] = 0;
|
||||
// Feature byte 1: 0 = MP Config Table present (not default config)
|
||||
fp_buf[11] = 0;
|
||||
// Feature byte 2: bit 7 = IMCR present (PIC mode available)
|
||||
fp_buf[12] = MP_FEATURE_IMCRP;
|
||||
// Feature bytes 3-5: reserved
|
||||
fp_buf[13] = 0;
|
||||
fp_buf[14] = 0;
|
||||
fp_buf[15] = 0;
|
||||
|
||||
// Compute floating pointer checksum
|
||||
let fp_checksum = compute_checksum(&fp_buf);
|
||||
fp_buf[10] = fp_checksum;
|
||||
|
||||
// ---- Write everything to guest memory ----
|
||||
guest_mem.write_bytes(MP_TABLE_START, &fp_buf)?;
|
||||
guest_mem.write_bytes(config_table_addr, &table_buf)?;
|
||||
|
||||
tracing::info!(
|
||||
"MP table written at 0x{:x}: {} CPUs, {} entries, {} bytes total\n\
|
||||
Layout: FP=0x{:x}, Config=0x{:x}, IOAPIC ID={}, IOAPIC addr=0x{:x}",
|
||||
MP_TABLE_START,
|
||||
num_cpus,
|
||||
entry_count,
|
||||
total_size,
|
||||
MP_TABLE_START,
|
||||
config_table_addr,
|
||||
ioapic_id,
|
||||
IOAPIC_DEFAULT_ADDR,
|
||||
);
|
||||
|
||||
Ok(MP_TABLE_START)
|
||||
}
|
||||
|
||||
/// Write a Processor Entry (20 bytes) to the table buffer.
|
||||
///
|
||||
/// Format (Intel MPS Table 4-4):
|
||||
/// ```text
|
||||
/// Offset Size Field
|
||||
/// 0 1 Entry type (0 = processor)
|
||||
/// 1 1 Local APIC ID
|
||||
/// 2 1 Local APIC version
|
||||
/// 3 1 CPU flags (bit 0=EN, bit 1=BP)
|
||||
/// 4 4 CPU signature (stepping, model, family)
|
||||
/// 8 4 Feature flags (from CPUID leaf 1 EDX)
|
||||
/// 12 8 Reserved
|
||||
/// ```
|
||||
fn write_processor_entry(
|
||||
buf: &mut Vec<u8>,
|
||||
apic_id: u8,
|
||||
apic_version: u8,
|
||||
flags: u8,
|
||||
cpu_signature: u32,
|
||||
feature_flags: u32,
|
||||
) {
|
||||
buf.push(MP_ENTRY_PROCESSOR); // Entry type
|
||||
buf.push(apic_id); // Local APIC ID
|
||||
buf.push(apic_version); // Local APIC version
|
||||
buf.push(flags); // CPU flags
|
||||
buf.extend_from_slice(&cpu_signature.to_le_bytes()); // CPU signature
|
||||
buf.extend_from_slice(&feature_flags.to_le_bytes()); // Feature flags
|
||||
buf.extend_from_slice(&[0u8; 8]); // Reserved
|
||||
}
|
||||
|
||||
/// Write a Bus Entry (8 bytes) to the table buffer.
|
||||
///
|
||||
/// Format (Intel MPS Table 4-5):
|
||||
/// ```text
|
||||
/// Offset Size Field
|
||||
/// 0 1 Entry type (1 = bus)
|
||||
/// 1 1 Bus ID
|
||||
/// 2 6 Bus type string (space-padded)
|
||||
/// ```
|
||||
fn write_bus_entry(buf: &mut Vec<u8>, bus_id: u8, bus_type: &[u8; 6]) {
|
||||
buf.push(MP_ENTRY_BUS);
|
||||
buf.push(bus_id);
|
||||
buf.extend_from_slice(bus_type);
|
||||
}
|
||||
|
||||
/// Write an I/O APIC Entry (8 bytes) to the table buffer.
|
||||
///
|
||||
/// Format (Intel MPS Table 4-6):
|
||||
/// ```text
|
||||
/// Offset Size Field
|
||||
/// 0 1 Entry type (2 = I/O APIC)
|
||||
/// 1 1 I/O APIC ID
|
||||
/// 2 1 I/O APIC version
|
||||
/// 3 1 I/O APIC flags (bit 0 = EN)
|
||||
/// 4 4 I/O APIC address
|
||||
/// ```
|
||||
fn write_ioapic_entry(buf: &mut Vec<u8>, id: u8, version: u8, addr: u32) {
|
||||
buf.push(MP_ENTRY_IOAPIC);
|
||||
buf.push(id);
|
||||
buf.push(version);
|
||||
buf.push(0x01); // flags: enabled
|
||||
buf.extend_from_slice(&addr.to_le_bytes());
|
||||
}
|
||||
|
||||
/// Write an I/O Interrupt Assignment Entry (8 bytes) to the table buffer.
|
||||
///
|
||||
/// Format (Intel MPS Table 4-7):
|
||||
/// ```text
|
||||
/// Offset Size Field
|
||||
/// 0 1 Entry type (3 = I/O interrupt)
|
||||
/// 1 1 Interrupt type (0=INT, 1=NMI, 2=SMI, 3=ExtINT)
|
||||
/// 2 2 Flags (polarity/trigger)
|
||||
/// 4 1 Source bus ID
|
||||
/// 5 1 Source bus IRQ
|
||||
/// 6 1 Destination I/O APIC ID
|
||||
/// 7 1 Destination I/O APIC pin (INTIN#)
|
||||
/// ```
|
||||
fn write_io_interrupt_entry(
|
||||
buf: &mut Vec<u8>,
|
||||
int_type: u8,
|
||||
flags: u16,
|
||||
src_bus_id: u8,
|
||||
src_bus_irq: u8,
|
||||
dst_ioapic_id: u8,
|
||||
dst_ioapic_pin: u8,
|
||||
) {
|
||||
buf.push(MP_ENTRY_IO_INTERRUPT);
|
||||
buf.push(int_type);
|
||||
buf.extend_from_slice(&flags.to_le_bytes());
|
||||
buf.push(src_bus_id);
|
||||
buf.push(src_bus_irq);
|
||||
buf.push(dst_ioapic_id);
|
||||
buf.push(dst_ioapic_pin);
|
||||
}
|
||||
|
||||
/// Compute the two's-complement checksum for an MP structure.
|
||||
/// The sum of all bytes in the structure must be 0 (mod 256).
|
||||
fn compute_checksum(data: &[u8]) -> u8 {
|
||||
let sum: u8 = data.iter().fold(0u8, |acc, &b| acc.wrapping_add(b));
|
||||
(!sum).wrapping_add(1) // Two's complement = negate
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Tests
|
||||
// ============================================================================
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
struct MockMemory {
|
||||
size: u64,
|
||||
data: Vec<u8>,
|
||||
}
|
||||
|
||||
impl MockMemory {
|
||||
fn new(size: u64) -> Self {
|
||||
Self {
|
||||
size,
|
||||
data: vec![0; size as usize],
|
||||
}
|
||||
}
|
||||
|
||||
fn read_bytes(&self, addr: u64, len: usize) -> &[u8] {
|
||||
&self.data[addr as usize..(addr as usize + len)]
|
||||
}
|
||||
}
|
||||
|
||||
impl GuestMemory for MockMemory {
|
||||
fn write_bytes(&mut self, addr: u64, data: &[u8]) -> Result<()> {
|
||||
let end = addr as usize + data.len();
|
||||
if end > self.data.len() {
|
||||
return Err(BootError::GuestMemoryWrite(format!(
|
||||
"Write at {:#x} exceeds memory",
|
||||
addr
|
||||
)));
|
||||
}
|
||||
self.data[addr as usize..end].copy_from_slice(data);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn size(&self) -> u64 {
|
||||
self.size
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_checksum() {
|
||||
// A buffer with known checksum byte should sum to 0
|
||||
let data = vec![1, 2, 3, 4];
|
||||
let cs = compute_checksum(&data);
|
||||
let total: u8 = data.iter().chain(std::iter::once(&cs)).fold(0u8, |a, b| a.wrapping_add(*b));
|
||||
// With the checksum byte replacing the original slot, the sum should be 0
|
||||
let mut with_cs = data.clone();
|
||||
with_cs.push(0); // placeholder
|
||||
// Actually the checksum replaces index 10 in the FP or 7 in the config header,
|
||||
// but let's verify the math differently:
|
||||
let sum_without: u8 = data.iter().fold(0u8, |a, b| a.wrapping_add(*b));
|
||||
assert_eq!(sum_without.wrapping_add(cs), 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_mp_floating_pointer_signature() {
|
||||
let mut mem = MockMemory::new(1024 * 1024);
|
||||
let result = setup_mptable(&mut mem, 1);
|
||||
assert!(result.is_ok());
|
||||
|
||||
let fp_addr = result.unwrap() as usize;
|
||||
assert_eq!(&mem.data[fp_addr..fp_addr + 4], b"_MP_");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_mp_floating_pointer_checksum() {
|
||||
let mut mem = MockMemory::new(1024 * 1024);
|
||||
setup_mptable(&mut mem, 2).unwrap();
|
||||
|
||||
// MP Floating Pointer is 16 bytes at MP_TABLE_START
|
||||
let fp = mem.read_bytes(MP_TABLE_START, 16);
|
||||
let sum: u8 = fp.iter().fold(0u8, |a, &b| a.wrapping_add(b));
|
||||
assert_eq!(sum, 0, "MP Floating Pointer checksum mismatch");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_mp_config_table_checksum() {
|
||||
let mut mem = MockMemory::new(1024 * 1024);
|
||||
setup_mptable(&mut mem, 2).unwrap();
|
||||
|
||||
// Config table starts at MP_TABLE_START + 16
|
||||
let config_addr = (MP_TABLE_START + 16) as usize;
|
||||
// Read table length from header bytes 4-5
|
||||
let table_len = u16::from_le_bytes([
|
||||
mem.data[config_addr + 4],
|
||||
mem.data[config_addr + 5],
|
||||
]) as usize;
|
||||
|
||||
let table = &mem.data[config_addr..config_addr + table_len];
|
||||
let sum: u8 = table.iter().fold(0u8, |a, &b| a.wrapping_add(b));
|
||||
assert_eq!(sum, 0, "MP Config Table checksum mismatch");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_mp_config_table_signature() {
|
||||
let mut mem = MockMemory::new(1024 * 1024);
|
||||
setup_mptable(&mut mem, 1).unwrap();
|
||||
|
||||
let config_addr = (MP_TABLE_START + 16) as usize;
|
||||
assert_eq!(&mem.data[config_addr..config_addr + 4], b"PCMP");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_mp_table_1_cpu() {
|
||||
let mut mem = MockMemory::new(1024 * 1024);
|
||||
setup_mptable(&mut mem, 1).unwrap();
|
||||
|
||||
let config_addr = (MP_TABLE_START + 16) as usize;
|
||||
// Entry count at offset 34 in header
|
||||
let entry_count = u16::from_le_bytes([
|
||||
mem.data[config_addr + 34],
|
||||
mem.data[config_addr + 35],
|
||||
]);
|
||||
// 1 CPU + 1 bus + 1 IOAPIC + 16 IRQs = 19 entries
|
||||
assert_eq!(entry_count, 19);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_mp_table_4_cpus() {
|
||||
let mut mem = MockMemory::new(1024 * 1024);
|
||||
setup_mptable(&mut mem, 4).unwrap();
|
||||
|
||||
let config_addr = (MP_TABLE_START + 16) as usize;
|
||||
let entry_count = u16::from_le_bytes([
|
||||
mem.data[config_addr + 34],
|
||||
mem.data[config_addr + 35],
|
||||
]);
|
||||
// 4 CPUs + 1 bus + 1 IOAPIC + 16 IRQs = 22 entries
|
||||
assert_eq!(entry_count, 22);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_mp_table_bsp_flag() {
|
||||
let mut mem = MockMemory::new(1024 * 1024);
|
||||
setup_mptable(&mut mem, 4).unwrap();
|
||||
|
||||
// First processor entry starts at config_addr + 44 (header size)
|
||||
let proc0_offset = (MP_TABLE_START + 16 + 44) as usize;
|
||||
assert_eq!(mem.data[proc0_offset], 0); // Entry type = processor
|
||||
assert_eq!(mem.data[proc0_offset + 1], 0); // APIC ID = 0
|
||||
assert_eq!(mem.data[proc0_offset + 3], CPU_FLAG_ENABLED | CPU_FLAG_BSP); // BSP + EN
|
||||
|
||||
// Second processor
|
||||
let proc1_offset = proc0_offset + 20;
|
||||
assert_eq!(mem.data[proc1_offset + 1], 1); // APIC ID = 1
|
||||
assert_eq!(mem.data[proc1_offset + 3], CPU_FLAG_ENABLED); // EN only (no BSP)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_mp_table_ioapic() {
|
||||
let mut mem = MockMemory::new(1024 * 1024);
|
||||
let num_cpus: u8 = 2;
|
||||
setup_mptable(&mut mem, num_cpus).unwrap();
|
||||
|
||||
// IOAPIC entry follows: processors (2*20) + bus (8) = 48 bytes after entries start
|
||||
let entries_start = (MP_TABLE_START + 16 + 44) as usize;
|
||||
let ioapic_offset = entries_start + (num_cpus as usize * 20) + 8;
|
||||
|
||||
assert_eq!(mem.data[ioapic_offset], MP_ENTRY_IOAPIC); // Entry type
|
||||
assert_eq!(mem.data[ioapic_offset + 1], num_cpus); // IOAPIC ID = num_cpus
|
||||
assert_eq!(mem.data[ioapic_offset + 3], 0x01); // Enabled
|
||||
|
||||
// IOAPIC address
|
||||
let addr = u32::from_le_bytes([
|
||||
mem.data[ioapic_offset + 4],
|
||||
mem.data[ioapic_offset + 5],
|
||||
mem.data[ioapic_offset + 6],
|
||||
mem.data[ioapic_offset + 7],
|
||||
]);
|
||||
assert_eq!(addr, IOAPIC_DEFAULT_ADDR);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_mp_table_zero_cpus_error() {
|
||||
let mut mem = MockMemory::new(1024 * 1024);
|
||||
let result = setup_mptable(&mut mem, 0);
|
||||
assert!(result.is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_mp_table_local_apic_addr() {
|
||||
let mut mem = MockMemory::new(1024 * 1024);
|
||||
setup_mptable(&mut mem, 2).unwrap();
|
||||
|
||||
let config_addr = (MP_TABLE_START + 16) as usize;
|
||||
// Local APIC address at offset 36 in header
|
||||
let lapic_addr = u32::from_le_bytes([
|
||||
mem.data[config_addr + 36],
|
||||
mem.data[config_addr + 37],
|
||||
mem.data[config_addr + 38],
|
||||
mem.data[config_addr + 39],
|
||||
]);
|
||||
assert_eq!(lapic_addr, 0xFEE0_0000);
|
||||
}
|
||||
}
|
||||
291
vmm/src/boot/pagetable.rs
Normal file
291
vmm/src/boot/pagetable.rs
Normal file
@@ -0,0 +1,291 @@
|
||||
//! Page Table Setup for 64-bit Boot
|
||||
//!
|
||||
//! Sets up identity-mapped page tables for Linux 64-bit kernel boot.
|
||||
//! The kernel expects to be running with paging enabled and needs:
|
||||
//! - Identity mapping for low memory (0-4GB physical = 0-4GB virtual)
|
||||
//! - High kernel mapping (0xffffffff80000000+ = physical addresses)
|
||||
//!
|
||||
//! # Page Table Layout
|
||||
//!
|
||||
//! We use 2MB huge pages for simplicity and performance:
|
||||
//! - PML4 (Page Map Level 4) at 0x1000
|
||||
//! - PDPT for low memory (identity) at 0x2000
|
||||
//! - PDPT for high memory (kernel) at 0x3000
|
||||
//! - PD tables at 0x4000+
|
||||
//!
|
||||
//! Each PD entry maps 2MB of physical memory using huge pages.
|
||||
|
||||
use super::{GuestMemory, Result};
|
||||
#[cfg(test)]
|
||||
use super::BootError;
|
||||
|
||||
/// PML4 table address
|
||||
pub const PML4_ADDR: u64 = 0x1000;
|
||||
|
||||
/// PDPT (Page Directory Pointer Table) for identity mapping (low memory)
|
||||
pub const PDPT_LOW_ADDR: u64 = 0x2000;
|
||||
|
||||
/// PDPT for kernel high memory mapping
|
||||
pub const PDPT_HIGH_ADDR: u64 = 0x3000;
|
||||
|
||||
/// First PD (Page Directory) address
|
||||
pub const PD_ADDR: u64 = 0x4000;
|
||||
|
||||
/// Size of one page table (4KB)
|
||||
pub const PAGE_TABLE_SIZE: u64 = 0x1000;
|
||||
|
||||
/// Page table entry flags
|
||||
#[allow(dead_code)] // x86 page table flags — kept for completeness
|
||||
mod flags {
|
||||
/// Present bit
|
||||
pub const PRESENT: u64 = 1 << 0;
|
||||
/// Read/Write bit
|
||||
pub const WRITABLE: u64 = 1 << 1;
|
||||
/// User/Supervisor bit (0 = supervisor only)
|
||||
pub const USER: u64 = 1 << 2;
|
||||
/// Page Size bit (1 = 2MB/1GB huge page)
|
||||
pub const PAGE_SIZE: u64 = 1 << 7;
|
||||
}
|
||||
|
||||
/// Page table setup implementation
|
||||
pub struct PageTableSetup;
|
||||
|
||||
impl PageTableSetup {
|
||||
/// Set up page tables for 64-bit Linux kernel boot
|
||||
///
|
||||
/// Creates:
|
||||
/// 1. Identity mapping for first 4GB (virtual 0-4GB -> physical 0-4GB)
|
||||
/// 2. High kernel mapping (virtual 0xffffffff80000000+ -> physical 0+)
|
||||
///
|
||||
/// This allows the kernel to execute at its linked address while also
|
||||
/// having access to physical memory via identity mapping.
|
||||
///
|
||||
/// Returns the CR3 value (PML4 physical address).
|
||||
pub fn setup<M: GuestMemory>(guest_mem: &mut M, memory_size: u64) -> Result<u64> {
|
||||
// Zero out the page table area first (16 pages should be plenty)
|
||||
let zeros = vec![0u8; PAGE_TABLE_SIZE as usize * 16];
|
||||
guest_mem.write_bytes(PML4_ADDR, &zeros)?;
|
||||
|
||||
// Calculate how much memory to map (up to 4GB, or actual memory size)
|
||||
let map_size = memory_size.min(4 * 1024 * 1024 * 1024);
|
||||
|
||||
// Number of 2MB pages needed
|
||||
let num_2mb_pages = (map_size + 0x1FFFFF) / 0x200000;
|
||||
|
||||
// Number of PD tables needed (each PD has 512 entries, each entry maps 2MB)
|
||||
let num_pd_tables = ((num_2mb_pages + 511) / 512).max(1) as usize;
|
||||
|
||||
// ============================================================
|
||||
// Set up PML4 entries
|
||||
// ============================================================
|
||||
|
||||
// Entry 0: Points to low PDPT for identity mapping (0x0 - 512GB)
|
||||
let pml4_entry_0 = PDPT_LOW_ADDR | flags::PRESENT | flags::WRITABLE;
|
||||
guest_mem.write_bytes(PML4_ADDR, &pml4_entry_0.to_le_bytes())?;
|
||||
|
||||
// Entry 511: Points to high PDPT for kernel mapping (0xFFFFFF8000000000+)
|
||||
// PML4[511] maps addresses 0xFFFFFF8000000000 - 0xFFFFFFFFFFFFFFFF
|
||||
let pml4_entry_511 = PDPT_HIGH_ADDR | flags::PRESENT | flags::WRITABLE;
|
||||
guest_mem.write_bytes(PML4_ADDR + 511 * 8, &pml4_entry_511.to_le_bytes())?;
|
||||
|
||||
// ============================================================
|
||||
// Set up PDPT for low memory (identity mapping)
|
||||
// ============================================================
|
||||
for i in 0..num_pd_tables.min(4) {
|
||||
let pd_addr = PD_ADDR + (i as u64 * PAGE_TABLE_SIZE);
|
||||
let pdpt_entry = pd_addr | flags::PRESENT | flags::WRITABLE;
|
||||
let pdpt_offset = PDPT_LOW_ADDR + (i as u64 * 8);
|
||||
guest_mem.write_bytes(pdpt_offset, &pdpt_entry.to_le_bytes())?;
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// Set up PDPT for high memory (kernel mapping)
|
||||
// Kernel virtual: 0xffffffff80000000 -> physical 0x0
|
||||
// This is PDPT entry 510 (for 0xffffffff80000000-0xffffffffbfffffff)
|
||||
// And PDPT entry 511 (for 0xffffffffc0000000-0xffffffffffffffff)
|
||||
// ============================================================
|
||||
|
||||
// We need PD tables for the high mapping too
|
||||
// Use PD tables starting after the low-memory ones
|
||||
let high_pd_base = PD_ADDR + (num_pd_tables.min(4) as u64 * PAGE_TABLE_SIZE);
|
||||
|
||||
// PDPT[510] maps 0xffffffff80000000-0xffffffffbfffffff to physical 0x0
|
||||
// (This covers the typical kernel text segment)
|
||||
let pdpt_entry_510 = high_pd_base | flags::PRESENT | flags::WRITABLE;
|
||||
guest_mem.write_bytes(PDPT_HIGH_ADDR + 510 * 8, &pdpt_entry_510.to_le_bytes())?;
|
||||
|
||||
// PDPT[511] maps 0xffffffffc0000000-0xffffffffffffffff
|
||||
let pdpt_entry_511 = (high_pd_base + PAGE_TABLE_SIZE) | flags::PRESENT | flags::WRITABLE;
|
||||
guest_mem.write_bytes(PDPT_HIGH_ADDR + 511 * 8, &pdpt_entry_511.to_le_bytes())?;
|
||||
|
||||
// ============================================================
|
||||
// Set up PD entries for identity mapping (2MB huge pages)
|
||||
// ============================================================
|
||||
for i in 0..num_2mb_pages {
|
||||
let pd_table_index = (i / 512) as usize;
|
||||
let pd_entry_index = i % 512;
|
||||
|
||||
if pd_table_index >= 4 {
|
||||
break; // Only support first 4GB for now
|
||||
}
|
||||
|
||||
let pd_table_addr = PD_ADDR + (pd_table_index as u64 * PAGE_TABLE_SIZE);
|
||||
let pd_entry_offset = pd_table_addr + (pd_entry_index * 8);
|
||||
|
||||
// Physical address this entry maps (2MB aligned)
|
||||
let phys_addr = i * 0x200000;
|
||||
|
||||
// PD entry with PAGE_SIZE flag for 2MB huge page
|
||||
let pd_entry = phys_addr | flags::PRESENT | flags::WRITABLE | flags::PAGE_SIZE;
|
||||
guest_mem.write_bytes(pd_entry_offset, &pd_entry.to_le_bytes())?;
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// Set up PD entries for high kernel mapping
|
||||
// 0xffffffff80000000 + offset -> physical offset
|
||||
// ============================================================
|
||||
// Map first 1GB of physical memory to the high kernel address space
|
||||
for i in 0..512 {
|
||||
let phys_addr = i * 0x200000;
|
||||
if phys_addr >= map_size {
|
||||
break;
|
||||
}
|
||||
|
||||
// PD for PDPT[510] (0xffffffff80000000-0xffffffffbfffffff)
|
||||
let pd_entry = phys_addr | flags::PRESENT | flags::WRITABLE | flags::PAGE_SIZE;
|
||||
let pd_offset = high_pd_base + (i * 8);
|
||||
guest_mem.write_bytes(pd_offset, &pd_entry.to_le_bytes())?;
|
||||
}
|
||||
|
||||
// Map second 1GB for PDPT[511]
|
||||
for i in 0..512 {
|
||||
let phys_addr = (512 + i) * 0x200000;
|
||||
if phys_addr >= map_size {
|
||||
break;
|
||||
}
|
||||
|
||||
let pd_entry = phys_addr | flags::PRESENT | flags::WRITABLE | flags::PAGE_SIZE;
|
||||
let pd_offset = high_pd_base + PAGE_TABLE_SIZE + (i * 8);
|
||||
guest_mem.write_bytes(pd_offset, &pd_entry.to_le_bytes())?;
|
||||
}
|
||||
|
||||
// Debug: dump page table structure for verification
|
||||
tracing::info!(
|
||||
"Page tables configured at CR3=0x{:x}:\n\
|
||||
PML4[0] = 0x{:016x} -> PDPT_LOW at 0x{:x}\n\
|
||||
PML4[511] = 0x{:016x} -> PDPT_HIGH at 0x{:x}\n\
|
||||
PDPT_LOW[0] = 0x{:016x} -> PD at 0x{:x}\n\
|
||||
{} PD entries (2MB huge pages) covering {} MB",
|
||||
PML4_ADDR,
|
||||
pml4_entry_0, PDPT_LOW_ADDR,
|
||||
pml4_entry_511, PDPT_HIGH_ADDR,
|
||||
PDPT_LOW_ADDR | flags::PRESENT | flags::WRITABLE, PD_ADDR,
|
||||
num_2mb_pages,
|
||||
map_size / (1024 * 1024)
|
||||
);
|
||||
|
||||
// Log the PD entry that maps the kernel (typically at 16MB = 0x1000000)
|
||||
// 0x1000000 / 2MB = 8, so PD[8] maps the kernel
|
||||
let kernel_pd_entry = 8u64 * 0x200000 | flags::PRESENT | flags::WRITABLE | flags::PAGE_SIZE;
|
||||
tracing::info!(
|
||||
"Identity mapping for kernel at 0x1000000:\n\
|
||||
PD[8] = 0x{:016x} -> maps physical 0x1000000-0x11FFFFF",
|
||||
kernel_pd_entry
|
||||
);
|
||||
|
||||
Ok(PML4_ADDR)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
struct MockMemory {
|
||||
size: u64,
|
||||
data: Vec<u8>,
|
||||
}
|
||||
|
||||
impl MockMemory {
|
||||
fn new(size: u64) -> Self {
|
||||
Self {
|
||||
size,
|
||||
data: vec![0; size as usize],
|
||||
}
|
||||
}
|
||||
|
||||
fn read_u64(&self, addr: u64) -> u64 {
|
||||
let bytes = &self.data[addr as usize..addr as usize + 8];
|
||||
u64::from_le_bytes(bytes.try_into().unwrap())
|
||||
}
|
||||
}
|
||||
|
||||
impl GuestMemory for MockMemory {
|
||||
fn write_bytes(&mut self, addr: u64, data: &[u8]) -> Result<()> {
|
||||
let end = addr as usize + data.len();
|
||||
if end > self.data.len() {
|
||||
return Err(BootError::GuestMemoryWrite(format!(
|
||||
"Write at {:#x} exceeds memory",
|
||||
addr
|
||||
)));
|
||||
}
|
||||
self.data[addr as usize..end].copy_from_slice(data);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn size(&self) -> u64 {
|
||||
self.size
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_page_table_setup() {
|
||||
let mut mem = MockMemory::new(128 * 1024 * 1024);
|
||||
let result = PageTableSetup::setup(&mut mem, 128 * 1024 * 1024);
|
||||
|
||||
assert!(result.is_ok());
|
||||
assert_eq!(result.unwrap(), PML4_ADDR);
|
||||
|
||||
// Verify PML4[0] entry points to low PDPT (identity mapping)
|
||||
let pml4_entry_0 = mem.read_u64(PML4_ADDR);
|
||||
assert_eq!(pml4_entry_0 & !0xFFF, PDPT_LOW_ADDR);
|
||||
assert!(pml4_entry_0 & flags::PRESENT != 0);
|
||||
assert!(pml4_entry_0 & flags::WRITABLE != 0);
|
||||
|
||||
// Verify PML4[511] entry points to high PDPT (kernel mapping)
|
||||
let pml4_entry_511 = mem.read_u64(PML4_ADDR + 511 * 8);
|
||||
assert_eq!(pml4_entry_511 & !0xFFF, PDPT_HIGH_ADDR);
|
||||
assert!(pml4_entry_511 & flags::PRESENT != 0);
|
||||
|
||||
// Verify first PDPT entry points to first PD
|
||||
let pdpt_entry = mem.read_u64(PDPT_LOW_ADDR);
|
||||
assert_eq!(pdpt_entry & !0xFFF, PD_ADDR);
|
||||
assert!(pdpt_entry & flags::PRESENT != 0);
|
||||
|
||||
// Verify first PD entry maps physical address 0
|
||||
let pd_entry = mem.read_u64(PD_ADDR);
|
||||
assert_eq!(pd_entry & !0x1FFFFF, 0);
|
||||
assert!(pd_entry & flags::PRESENT != 0);
|
||||
assert!(pd_entry & flags::PAGE_SIZE != 0); // 2MB page
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_identity_mapping() {
|
||||
let mut mem = MockMemory::new(256 * 1024 * 1024);
|
||||
PageTableSetup::setup(&mut mem, 256 * 1024 * 1024).unwrap();
|
||||
|
||||
// Check that addresses 0, 2MB, 4MB, etc. are identity mapped
|
||||
for i in 0..128 {
|
||||
let phys_addr = i * 0x200000u64; // 2MB pages
|
||||
let pd_entry_index = i;
|
||||
let pd_table_index = pd_entry_index / 512;
|
||||
let pd_entry_in_table = pd_entry_index % 512;
|
||||
|
||||
let pd_addr = PD_ADDR + pd_table_index * PAGE_TABLE_SIZE;
|
||||
let pd_entry = mem.read_u64(pd_addr + pd_entry_in_table * 8);
|
||||
|
||||
let mapped_addr = pd_entry & !0x1FFFFF;
|
||||
assert_eq!(mapped_addr, phys_addr, "Mismatch at entry {}", i);
|
||||
}
|
||||
}
|
||||
}
|
||||
608
vmm/src/boot/pvh.rs
Normal file
608
vmm/src/boot/pvh.rs
Normal file
@@ -0,0 +1,608 @@
|
||||
//! PVH Boot Protocol Implementation
|
||||
//!
|
||||
//! PVH (Para-Virtualized Hardware) is a boot protocol that allows direct kernel
|
||||
//! entry without BIOS/UEFI firmware. This is the fastest path to boot a Linux VM.
|
||||
//!
|
||||
//! # Overview
|
||||
//!
|
||||
//! The PVH boot protocol:
|
||||
//! 1. Skips BIOS POST and firmware initialization
|
||||
//! 2. Loads kernel directly into memory
|
||||
//! 3. Sets up minimal boot structures (E820 map, start_info)
|
||||
//! 4. Jumps directly to kernel 64-bit entry point
|
||||
//!
|
||||
//! # Boot Time Comparison
|
||||
//!
|
||||
//! | Method | Boot Time |
|
||||
//! |--------|-----------|
|
||||
//! | BIOS | 1-3s |
|
||||
//! | UEFI | 0.5-1s |
|
||||
//! | PVH | <50ms |
|
||||
//!
|
||||
//! # Memory Requirements
|
||||
//!
|
||||
//! The PVH start_info structure must be placed in guest memory and
|
||||
//! its address passed to the kernel via RBX register.
|
||||
|
||||
use super::{layout, BootError, GuestMemory, Result};
|
||||
|
||||
/// Maximum number of E820 entries
|
||||
pub const MAX_E820_ENTRIES: usize = 128;
|
||||
|
||||
/// E820 memory type values (matching Linux kernel definitions)
|
||||
#[repr(u32)]
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum E820Type {
|
||||
/// Usable RAM
|
||||
Ram = 1,
|
||||
/// Reserved by system
|
||||
Reserved = 2,
|
||||
/// ACPI reclaimable
|
||||
Acpi = 3,
|
||||
/// ACPI NVS (Non-Volatile Storage)
|
||||
Nvs = 4,
|
||||
/// Unusable memory
|
||||
Unusable = 5,
|
||||
/// Disabled memory (EFI)
|
||||
Disabled = 6,
|
||||
/// Persistent memory
|
||||
Pmem = 7,
|
||||
/// Undefined/other
|
||||
Undefined = 0,
|
||||
}
|
||||
|
||||
impl From<u32> for E820Type {
|
||||
fn from(val: u32) -> Self {
|
||||
match val {
|
||||
1 => E820Type::Ram,
|
||||
2 => E820Type::Reserved,
|
||||
3 => E820Type::Acpi,
|
||||
4 => E820Type::Nvs,
|
||||
5 => E820Type::Unusable,
|
||||
6 => E820Type::Disabled,
|
||||
7 => E820Type::Pmem,
|
||||
_ => E820Type::Undefined,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// E820 memory map entry
|
||||
///
|
||||
/// Matches the Linux kernel's e820entry structure for compatibility.
|
||||
#[repr(C, packed)]
|
||||
#[derive(Debug, Clone, Copy, Default)]
|
||||
pub struct E820Entry {
|
||||
/// Start address of memory region
|
||||
pub addr: u64,
|
||||
/// Size of memory region in bytes
|
||||
pub size: u64,
|
||||
/// Type of memory region
|
||||
pub entry_type: u32,
|
||||
}
|
||||
|
||||
impl E820Entry {
|
||||
/// Create a new E820 entry
|
||||
pub fn new(addr: u64, size: u64, entry_type: E820Type) -> Self {
|
||||
Self {
|
||||
addr,
|
||||
size,
|
||||
entry_type: entry_type as u32,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a RAM entry
|
||||
pub fn ram(addr: u64, size: u64) -> Self {
|
||||
Self::new(addr, size, E820Type::Ram)
|
||||
}
|
||||
|
||||
/// Create a reserved entry
|
||||
pub fn reserved(addr: u64, size: u64) -> Self {
|
||||
Self::new(addr, size, E820Type::Reserved)
|
||||
}
|
||||
}
|
||||
|
||||
/// PVH start_info structure
|
||||
///
|
||||
/// This is a simplified version compatible with the Xen PVH ABI.
|
||||
/// The structure is placed in guest memory and its address is passed
|
||||
/// to the kernel in RBX.
|
||||
///
|
||||
/// # Memory Layout
|
||||
///
|
||||
/// The structure must be at a known location (typically 0x7000) and
|
||||
/// contain pointers to other boot structures.
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub struct StartInfo {
|
||||
/// Magic number (XEN_HVM_START_MAGIC_VALUE or custom)
|
||||
pub magic: u32,
|
||||
/// Version of the start_info structure
|
||||
pub version: u32,
|
||||
/// Flags (reserved, should be 0)
|
||||
pub flags: u32,
|
||||
/// Number of modules (initrd counts as 1)
|
||||
pub nr_modules: u32,
|
||||
/// Physical address of module list
|
||||
pub modlist_paddr: u64,
|
||||
/// Physical address of command line string
|
||||
pub cmdline_paddr: u64,
|
||||
/// Physical address of RSDP (ACPI, 0 if none)
|
||||
pub rsdp_paddr: u64,
|
||||
/// Physical address of E820 memory map
|
||||
pub memmap_paddr: u64,
|
||||
/// Number of entries in memory map
|
||||
pub memmap_entries: u32,
|
||||
/// Reserved/padding
|
||||
pub reserved: u32,
|
||||
}
|
||||
|
||||
/// XEN HVM start magic value
|
||||
pub const XEN_HVM_START_MAGIC: u32 = 0x336ec578;
|
||||
|
||||
/// Volt custom magic (for identification)
|
||||
pub const VOLT_MAGIC: u32 = 0x4e4f5641; // "NOVA"
|
||||
|
||||
impl StartInfo {
|
||||
/// Create a new StartInfo with default values
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
magic: XEN_HVM_START_MAGIC,
|
||||
version: 1,
|
||||
flags: 0,
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
|
||||
/// Set command line address
|
||||
pub fn with_cmdline(mut self, addr: u64) -> Self {
|
||||
self.cmdline_paddr = addr;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set memory map address and entry count
|
||||
pub fn with_memmap(mut self, addr: u64, entries: u32) -> Self {
|
||||
self.memmap_paddr = addr;
|
||||
self.memmap_entries = entries;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set module (initrd) information
|
||||
pub fn with_module(mut self, modlist_addr: u64) -> Self {
|
||||
self.nr_modules = 1;
|
||||
self.modlist_paddr = modlist_addr;
|
||||
self
|
||||
}
|
||||
|
||||
/// Convert to bytes for writing to guest memory
|
||||
pub fn as_bytes(&self) -> &[u8] {
|
||||
unsafe {
|
||||
std::slice::from_raw_parts(
|
||||
self as *const Self as *const u8,
|
||||
std::mem::size_of::<Self>(),
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Module (initrd) entry for PVH
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Clone, Copy, Default)]
|
||||
pub struct HvmModlistEntry {
|
||||
/// Physical address of module
|
||||
pub paddr: u64,
|
||||
/// Size of module in bytes
|
||||
pub size: u64,
|
||||
/// Physical address of command line for module (0 if none)
|
||||
pub cmdline_paddr: u64,
|
||||
/// Reserved
|
||||
pub reserved: u64,
|
||||
}
|
||||
|
||||
impl HvmModlistEntry {
|
||||
/// Create entry for initrd
|
||||
pub fn new(paddr: u64, size: u64) -> Self {
|
||||
Self {
|
||||
paddr,
|
||||
size,
|
||||
cmdline_paddr: 0,
|
||||
reserved: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert to bytes
|
||||
pub fn as_bytes(&self) -> &[u8] {
|
||||
unsafe {
|
||||
std::slice::from_raw_parts(
|
||||
self as *const Self as *const u8,
|
||||
std::mem::size_of::<Self>(),
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// PVH configuration for boot setup
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct PvhConfig {
|
||||
/// Total memory size in bytes
|
||||
pub memory_size: u64,
|
||||
/// Number of vCPUs
|
||||
pub vcpu_count: u32,
|
||||
/// Physical address of command line
|
||||
pub cmdline_addr: u64,
|
||||
/// Physical address of initrd (if any)
|
||||
pub initrd_addr: Option<u64>,
|
||||
/// Size of initrd (if any)
|
||||
pub initrd_size: Option<u64>,
|
||||
}
|
||||
|
||||
/// PVH boot setup implementation
|
||||
pub struct PvhBootSetup;
|
||||
|
||||
impl PvhBootSetup {
|
||||
/// Set up PVH boot structures in guest memory
|
||||
///
|
||||
/// Creates and writes:
|
||||
/// 1. E820 memory map
|
||||
/// 2. start_info structure
|
||||
/// 3. Module list (for initrd)
|
||||
pub fn setup<M: GuestMemory>(config: &PvhConfig, guest_mem: &mut M) -> Result<()> {
|
||||
// Build E820 memory map
|
||||
let e820_entries = Self::build_e820_map(config.memory_size)?;
|
||||
let e820_count = e820_entries.len() as u32;
|
||||
|
||||
// Write E820 map to guest memory
|
||||
Self::write_e820_map(&e820_entries, guest_mem)?;
|
||||
|
||||
// Write module list if initrd is present
|
||||
let modlist_addr = if let (Some(addr), Some(size)) = (config.initrd_addr, config.initrd_size) {
|
||||
let modlist_addr = layout::E820_MAP_ADDR +
|
||||
(MAX_E820_ENTRIES * std::mem::size_of::<E820Entry>()) as u64;
|
||||
|
||||
let entry = HvmModlistEntry::new(addr, size);
|
||||
guest_mem.write_bytes(modlist_addr, entry.as_bytes())?;
|
||||
|
||||
Some(modlist_addr)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// Build and write start_info structure
|
||||
let mut start_info = StartInfo::new()
|
||||
.with_cmdline(config.cmdline_addr)
|
||||
.with_memmap(layout::E820_MAP_ADDR, e820_count);
|
||||
|
||||
if let Some(addr) = modlist_addr {
|
||||
start_info = start_info.with_module(addr);
|
||||
}
|
||||
|
||||
guest_mem.write_bytes(layout::PVH_START_INFO_ADDR, start_info.as_bytes())?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Build E820 memory map for the VM
|
||||
///
|
||||
/// Creates a standard x86_64 memory layout:
|
||||
/// - Low memory (0-640KB): RAM
|
||||
/// - Legacy hole (640KB-1MB): Reserved
|
||||
/// - High memory (1MB+): RAM
|
||||
fn build_e820_map(memory_size: u64) -> Result<Vec<E820Entry>> {
|
||||
let mut entries = Vec::with_capacity(4);
|
||||
|
||||
// Validate minimum memory
|
||||
if memory_size < layout::HIGH_MEMORY_START {
|
||||
return Err(BootError::MemoryLayout(format!(
|
||||
"Memory size {} is less than minimum required {}",
|
||||
memory_size,
|
||||
layout::HIGH_MEMORY_START
|
||||
)));
|
||||
}
|
||||
|
||||
// Low memory: 0 to 640KB (0x0 - 0x9FFFF)
|
||||
// We reserve the first page for real-mode IVT
|
||||
entries.push(E820Entry::ram(0, layout::LOW_MEMORY_END));
|
||||
|
||||
// Legacy video/ROM hole: 640KB to 1MB (0xA0000 - 0xFFFFF)
|
||||
// This is reserved for VGA memory, option ROMs, etc.
|
||||
let legacy_hole_size = layout::HIGH_MEMORY_START - layout::LOW_MEMORY_END;
|
||||
entries.push(E820Entry::reserved(layout::LOW_MEMORY_END, legacy_hole_size));
|
||||
|
||||
// High memory: 1MB to RAM size
|
||||
let high_memory_size = memory_size - layout::HIGH_MEMORY_START;
|
||||
if high_memory_size > 0 {
|
||||
entries.push(E820Entry::ram(layout::HIGH_MEMORY_START, high_memory_size));
|
||||
}
|
||||
|
||||
// If memory > 4GB, we might need to handle the MMIO hole
|
||||
// For now, we assume memory <= 4GB for simplicity
|
||||
// Production systems should handle:
|
||||
// - PCI MMIO hole (typically 0xE0000000 - 0xFFFFFFFF)
|
||||
// - Memory above 4GB remapped
|
||||
|
||||
Ok(entries)
|
||||
}
|
||||
|
||||
/// Write E820 map entries to guest memory
|
||||
fn write_e820_map<M: GuestMemory>(entries: &[E820Entry], guest_mem: &mut M) -> Result<()> {
|
||||
let entry_size = std::mem::size_of::<E820Entry>();
|
||||
|
||||
for (i, entry) in entries.iter().enumerate() {
|
||||
let addr = layout::E820_MAP_ADDR + (i * entry_size) as u64;
|
||||
let bytes = unsafe {
|
||||
std::slice::from_raw_parts(entry as *const E820Entry as *const u8, entry_size)
|
||||
};
|
||||
guest_mem.write_bytes(addr, bytes)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get initial CPU register state for PVH boot
|
||||
///
|
||||
/// Returns the register values needed to start the vCPU in 64-bit mode
|
||||
/// with PVH boot protocol.
|
||||
pub fn get_initial_regs(entry_point: u64) -> PvhRegs {
|
||||
PvhRegs {
|
||||
// Instruction pointer - kernel entry
|
||||
rip: entry_point,
|
||||
|
||||
// RBX contains pointer to start_info (Xen PVH convention)
|
||||
rbx: layout::PVH_START_INFO_ADDR,
|
||||
|
||||
// RSI also contains start_info pointer (Linux boot convention)
|
||||
rsi: layout::PVH_START_INFO_ADDR,
|
||||
|
||||
// Stack pointer
|
||||
rsp: layout::BOOT_STACK_POINTER,
|
||||
|
||||
// Clear other general-purpose registers
|
||||
rax: 0,
|
||||
rcx: 0,
|
||||
rdx: 0,
|
||||
rdi: 0,
|
||||
rbp: 0,
|
||||
r8: 0,
|
||||
r9: 0,
|
||||
r10: 0,
|
||||
r11: 0,
|
||||
r12: 0,
|
||||
r13: 0,
|
||||
r14: 0,
|
||||
r15: 0,
|
||||
|
||||
// Flags - interrupts disabled
|
||||
rflags: 0x2,
|
||||
|
||||
// Segment selectors for 64-bit mode
|
||||
cs: 0x10, // Code segment, ring 0
|
||||
ds: 0x18, // Data segment
|
||||
es: 0x18,
|
||||
fs: 0x18,
|
||||
gs: 0x18,
|
||||
ss: 0x18,
|
||||
|
||||
// CR registers for 64-bit mode
|
||||
cr0: CR0_PE | CR0_ET | CR0_PG,
|
||||
cr3: 0, // Page table base - set by kernel setup
|
||||
cr4: CR4_PAE,
|
||||
|
||||
// EFER for long mode
|
||||
efer: EFER_LME | EFER_LMA,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Control Register 0 bits
|
||||
const CR0_PE: u64 = 1 << 0; // Protection Enable
|
||||
const CR0_ET: u64 = 1 << 4; // Extension Type (387 present)
|
||||
const CR0_PG: u64 = 1 << 31; // Paging Enable
|
||||
|
||||
/// Control Register 4 bits
|
||||
const CR4_PAE: u64 = 1 << 5; // Physical Address Extension
|
||||
|
||||
/// EFER (Extended Feature Enable Register) bits
|
||||
const EFER_LME: u64 = 1 << 8; // Long Mode Enable
|
||||
const EFER_LMA: u64 = 1 << 10; // Long Mode Active
|
||||
|
||||
/// CPU register state for PVH boot
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub struct PvhRegs {
|
||||
// General purpose registers
|
||||
pub rax: u64,
|
||||
pub rbx: u64,
|
||||
pub rcx: u64,
|
||||
pub rdx: u64,
|
||||
pub rsi: u64,
|
||||
pub rdi: u64,
|
||||
pub rsp: u64,
|
||||
pub rbp: u64,
|
||||
pub r8: u64,
|
||||
pub r9: u64,
|
||||
pub r10: u64,
|
||||
pub r11: u64,
|
||||
pub r12: u64,
|
||||
pub r13: u64,
|
||||
pub r14: u64,
|
||||
pub r15: u64,
|
||||
|
||||
// Instruction pointer
|
||||
pub rip: u64,
|
||||
|
||||
// Flags
|
||||
pub rflags: u64,
|
||||
|
||||
// Segment selectors
|
||||
pub cs: u16,
|
||||
pub ds: u16,
|
||||
pub es: u16,
|
||||
pub fs: u16,
|
||||
pub gs: u16,
|
||||
pub ss: u16,
|
||||
|
||||
// Control registers
|
||||
pub cr0: u64,
|
||||
pub cr3: u64,
|
||||
pub cr4: u64,
|
||||
|
||||
// Model-specific registers
|
||||
pub efer: u64,
|
||||
}
|
||||
|
||||
/// GDT entries for 64-bit mode boot
|
||||
///
|
||||
/// This provides a minimal GDT for transitioning to 64-bit mode.
|
||||
/// The kernel will set up its own GDT later.
|
||||
pub struct BootGdt;
|
||||
|
||||
impl BootGdt {
|
||||
/// Null descriptor (required as GDT[0])
|
||||
pub const NULL: u64 = 0;
|
||||
|
||||
/// 64-bit code segment (CS)
|
||||
/// Base: 0, Limit: 0xFFFFF (ignored in 64-bit mode)
|
||||
/// Type: Code, Execute/Read, Present, DPL=0
|
||||
pub const CODE64: u64 = 0x00af_9b00_0000_ffff;
|
||||
|
||||
/// 64-bit data segment (DS, ES, SS, FS, GS)
|
||||
/// Base: 0, Limit: 0xFFFFF
|
||||
/// Type: Data, Read/Write, Present, DPL=0
|
||||
pub const DATA64: u64 = 0x00cf_9300_0000_ffff;
|
||||
|
||||
/// Build GDT table as bytes
|
||||
pub fn as_bytes() -> [u8; 24] {
|
||||
let mut gdt = [0u8; 24];
|
||||
gdt[0..8].copy_from_slice(&Self::NULL.to_le_bytes());
|
||||
gdt[8..16].copy_from_slice(&Self::CODE64.to_le_bytes());
|
||||
gdt[16..24].copy_from_slice(&Self::DATA64.to_le_bytes());
|
||||
gdt
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
struct MockMemory {
|
||||
size: u64,
|
||||
data: Vec<u8>,
|
||||
}
|
||||
|
||||
impl MockMemory {
|
||||
fn new(size: u64) -> Self {
|
||||
Self {
|
||||
size,
|
||||
data: vec![0; size as usize],
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl GuestMemory for MockMemory {
|
||||
fn write_bytes(&mut self, addr: u64, data: &[u8]) -> Result<()> {
|
||||
let end = addr as usize + data.len();
|
||||
if end > self.data.len() {
|
||||
return Err(BootError::GuestMemoryWrite(format!(
|
||||
"Write at {:#x} exceeds memory size",
|
||||
addr
|
||||
)));
|
||||
}
|
||||
self.data[addr as usize..end].copy_from_slice(data);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn size(&self) -> u64 {
|
||||
self.size
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_e820_entry_size() {
|
||||
// E820 entry must be exactly 20 bytes for Linux kernel compatibility
|
||||
assert_eq!(std::mem::size_of::<E820Entry>(), 20);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_build_e820_map() {
|
||||
let memory_size = 128 * 1024 * 1024; // 128MB
|
||||
let entries = PvhBootSetup::build_e820_map(memory_size).unwrap();
|
||||
|
||||
// Should have at least 3 entries
|
||||
assert!(entries.len() >= 3);
|
||||
|
||||
// First entry should be low memory RAM — copy from packed struct
|
||||
let e0_addr = entries[0].addr;
|
||||
let e0_type = entries[0].entry_type;
|
||||
assert_eq!(e0_addr, 0);
|
||||
assert_eq!(e0_type, E820Type::Ram as u32);
|
||||
|
||||
// Second entry should be legacy hole (reserved)
|
||||
let e1_addr = entries[1].addr;
|
||||
let e1_type = entries[1].entry_type;
|
||||
assert_eq!(e1_addr, layout::LOW_MEMORY_END);
|
||||
assert_eq!(e1_type, E820Type::Reserved as u32);
|
||||
|
||||
// Third entry should be high memory RAM
|
||||
let e2_addr = entries[2].addr;
|
||||
let e2_type = entries[2].entry_type;
|
||||
assert_eq!(e2_addr, layout::HIGH_MEMORY_START);
|
||||
assert_eq!(e2_type, E820Type::Ram as u32);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_start_info_size() {
|
||||
// StartInfo should be reasonable size (under 4KB page)
|
||||
let size = std::mem::size_of::<StartInfo>();
|
||||
assert!(size < 4096);
|
||||
assert!(size >= 48); // Minimum expected fields
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_pvh_setup() {
|
||||
let mut mem = MockMemory::new(128 * 1024 * 1024);
|
||||
let config = PvhConfig {
|
||||
memory_size: 128 * 1024 * 1024,
|
||||
vcpu_count: 2,
|
||||
cmdline_addr: layout::CMDLINE_ADDR,
|
||||
initrd_addr: Some(100 * 1024 * 1024),
|
||||
initrd_size: Some(10 * 1024 * 1024),
|
||||
};
|
||||
|
||||
let result = PvhBootSetup::setup(&config, &mut mem);
|
||||
assert!(result.is_ok());
|
||||
|
||||
// Verify magic was written to start_info location
|
||||
let magic = u32::from_le_bytes([
|
||||
mem.data[layout::PVH_START_INFO_ADDR as usize],
|
||||
mem.data[layout::PVH_START_INFO_ADDR as usize + 1],
|
||||
mem.data[layout::PVH_START_INFO_ADDR as usize + 2],
|
||||
mem.data[layout::PVH_START_INFO_ADDR as usize + 3],
|
||||
]);
|
||||
assert_eq!(magic, XEN_HVM_START_MAGIC);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_pvh_regs() {
|
||||
let entry_point = 0x100200;
|
||||
let regs = PvhBootSetup::get_initial_regs(entry_point);
|
||||
|
||||
// Verify entry point
|
||||
assert_eq!(regs.rip, entry_point);
|
||||
|
||||
// Verify start_info pointer in rbx
|
||||
assert_eq!(regs.rbx, layout::PVH_START_INFO_ADDR);
|
||||
|
||||
// Verify 64-bit mode flags
|
||||
assert!(regs.cr0 & CR0_PE != 0); // Protection enabled
|
||||
assert!(regs.cr0 & CR0_PG != 0); // Paging enabled
|
||||
assert!(regs.cr4 & CR4_PAE != 0); // PAE enabled
|
||||
assert!(regs.efer & EFER_LME != 0); // Long mode enabled
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_gdt_layout() {
|
||||
let gdt = BootGdt::as_bytes();
|
||||
assert_eq!(gdt.len(), 24); // 3 entries × 8 bytes
|
||||
|
||||
// First entry should be null
|
||||
assert_eq!(&gdt[0..8], &[0u8; 8]);
|
||||
}
|
||||
}
|
||||
278
vmm/src/devices/i8042.rs
Normal file
278
vmm/src/devices/i8042.rs
Normal file
@@ -0,0 +1,278 @@
|
||||
//! Minimal i8042 keyboard controller emulation
|
||||
//!
|
||||
//! The Linux kernel probes for an i8042 PS/2 controller during boot. Without
|
||||
//! one present, the probe times out after ~1 second. This minimal implementation
|
||||
//! responds to the probe just enough to avoid the timeout penalty.
|
||||
//!
|
||||
//! Ports:
|
||||
//! - 0x60: Data register (read/write)
|
||||
//! - 0x64: Status register (read) / Command register (write)
|
||||
//!
|
||||
//! Linux i8042 probe sequence:
|
||||
//! 1. Write 0xAA to port 0x64 (self-test) → read 0x55 from port 0x60
|
||||
//! 2. Write 0x20 to port 0x64 (read CTR) → read CTR from port 0x60
|
||||
//! 3. Write 0x60 to port 0x64 (write CTR) → write new CTR to port 0x60
|
||||
//! 4. Write 0xAB to port 0x64 (test port 1) → read 0x00 from port 0x60
|
||||
//! 5. Various enable/disable commands
|
||||
|
||||
use std::collections::VecDeque;
|
||||
|
||||
/// I/O port for the data register
|
||||
pub const DATA_PORT: u16 = 0x60;
|
||||
/// I/O port for the status/command register
|
||||
pub const CMD_PORT: u16 = 0x64;
|
||||
|
||||
/// Status register bits
|
||||
mod status {
|
||||
/// Output buffer full — data available to read from port 0x60
|
||||
pub const OBF: u8 = 0x01;
|
||||
}
|
||||
|
||||
/// Controller commands
|
||||
mod cmd {
|
||||
/// Read command byte (Controller Configuration Register / CTR)
|
||||
pub const READ_CMD_BYTE: u8 = 0x20;
|
||||
/// Write command byte — next byte written to port 0x60 becomes the CTR
|
||||
pub const WRITE_CMD_BYTE: u8 = 0x60;
|
||||
/// Disable aux (mouse) port
|
||||
pub const DISABLE_AUX: u8 = 0xA7;
|
||||
/// Enable aux (mouse) port
|
||||
pub const ENABLE_AUX: u8 = 0xA8;
|
||||
/// Test aux port — returns 0x00 on success
|
||||
pub const TEST_AUX: u8 = 0xA9;
|
||||
/// Self-test: returns 0x55 on success
|
||||
pub const SELF_TEST: u8 = 0xAA;
|
||||
/// Interface test: returns 0x00 on success
|
||||
pub const INTERFACE_TEST: u8 = 0xAB;
|
||||
/// Disable keyboard
|
||||
pub const DISABLE_KBD: u8 = 0xAD;
|
||||
/// Enable keyboard
|
||||
pub const ENABLE_KBD: u8 = 0xAE;
|
||||
/// Write to aux device — next byte written to port 0x60 goes to mouse
|
||||
pub const WRITE_AUX: u8 = 0xD4;
|
||||
/// System reset (pulse CPU reset line)
|
||||
pub const RESET: u8 = 0xFE;
|
||||
}
|
||||
|
||||
/// Minimal i8042 PS/2 controller
|
||||
pub struct I8042 {
|
||||
/// Output buffer — queued bytes for the guest to read from port 0x60
|
||||
output: VecDeque<u8>,
|
||||
/// Command byte / Controller Configuration Register (CTR)
|
||||
/// Default 0x47: keyboard interrupt enabled, system flag, keyboard enabled, translation
|
||||
cmd_byte: u8,
|
||||
/// Whether the next write to port 0x60 is a data byte for a pending command
|
||||
expecting_data: bool,
|
||||
/// The pending command that expects a data byte on port 0x60
|
||||
pending_cmd: u8,
|
||||
/// Whether a reset was requested
|
||||
reset_requested: bool,
|
||||
}
|
||||
|
||||
impl I8042 {
|
||||
/// Create a new i8042 controller
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
output: VecDeque::with_capacity(4),
|
||||
cmd_byte: 0x47,
|
||||
expecting_data: false,
|
||||
pending_cmd: 0,
|
||||
reset_requested: false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Handle a read from port 0x60 (data register) — clears OBF
|
||||
pub fn read_data(&mut self) -> u8 {
|
||||
self.output.pop_front().unwrap_or(0x00)
|
||||
}
|
||||
|
||||
/// Handle a read from port 0x64 (status register)
|
||||
pub fn read_status(&self) -> u8 {
|
||||
if self.output.is_empty() {
|
||||
0x00
|
||||
} else {
|
||||
status::OBF
|
||||
}
|
||||
}
|
||||
|
||||
/// Handle a write to port 0x60 (data register)
|
||||
pub fn write_data(&mut self, value: u8) {
|
||||
if self.expecting_data {
|
||||
self.expecting_data = false;
|
||||
match self.pending_cmd {
|
||||
cmd::WRITE_CMD_BYTE => {
|
||||
self.cmd_byte = value;
|
||||
}
|
||||
cmd::WRITE_AUX => {
|
||||
// Write to aux device — eat the byte (no mouse emulated)
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
self.pending_cmd = 0;
|
||||
}
|
||||
// Otherwise accept and ignore
|
||||
}
|
||||
|
||||
/// Handle a write to port 0x64 (command register)
|
||||
pub fn write_command(&mut self, value: u8) {
|
||||
match value {
|
||||
cmd::READ_CMD_BYTE => {
|
||||
self.output.push_back(self.cmd_byte);
|
||||
}
|
||||
cmd::WRITE_CMD_BYTE => {
|
||||
self.expecting_data = true;
|
||||
self.pending_cmd = cmd::WRITE_CMD_BYTE;
|
||||
}
|
||||
cmd::DISABLE_AUX => {
|
||||
self.cmd_byte |= 0x20; // Set bit 5 (aux disabled)
|
||||
}
|
||||
cmd::ENABLE_AUX => {
|
||||
self.cmd_byte &= !0x20; // Clear bit 5
|
||||
}
|
||||
cmd::TEST_AUX => {
|
||||
self.output.push_back(0x00); // Test passed
|
||||
}
|
||||
cmd::SELF_TEST => {
|
||||
self.output.push_back(0x55); // Test passed
|
||||
self.cmd_byte = 0x47; // Self-test resets CTR
|
||||
}
|
||||
cmd::INTERFACE_TEST => {
|
||||
self.output.push_back(0x00); // No error
|
||||
}
|
||||
cmd::DISABLE_KBD => {
|
||||
self.cmd_byte |= 0x10; // Set bit 4 (keyboard disabled)
|
||||
}
|
||||
cmd::ENABLE_KBD => {
|
||||
self.cmd_byte &= !0x10; // Clear bit 4
|
||||
}
|
||||
cmd::WRITE_AUX => {
|
||||
self.expecting_data = true;
|
||||
self.pending_cmd = cmd::WRITE_AUX;
|
||||
}
|
||||
cmd::RESET => {
|
||||
self.reset_requested = true;
|
||||
}
|
||||
_ => {
|
||||
// Accept and ignore all other commands
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Check if the guest requested a system reset
|
||||
pub fn reset_requested(&self) -> bool {
|
||||
self.reset_requested
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for I8042 {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_status_empty() {
|
||||
let dev = I8042::new();
|
||||
assert_eq!(dev.read_status(), 0x00);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_self_test() {
|
||||
let mut dev = I8042::new();
|
||||
dev.write_command(cmd::SELF_TEST);
|
||||
assert_ne!(dev.read_status() & status::OBF, 0);
|
||||
assert_eq!(dev.read_data(), 0x55);
|
||||
assert_eq!(dev.read_status(), 0x00);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_read_ctr() {
|
||||
let mut dev = I8042::new();
|
||||
dev.write_command(cmd::READ_CMD_BYTE);
|
||||
assert_ne!(dev.read_status() & status::OBF, 0, "OBF should be set after read CTR command");
|
||||
assert_eq!(dev.read_data(), 0x47, "Default CTR should be 0x47");
|
||||
assert_eq!(dev.read_status(), 0x00, "OBF should be clear after reading data");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_write_ctr() {
|
||||
let mut dev = I8042::new();
|
||||
// Write command byte
|
||||
dev.write_command(cmd::WRITE_CMD_BYTE);
|
||||
dev.write_data(0x65); // New CTR value
|
||||
// Read it back
|
||||
dev.write_command(cmd::READ_CMD_BYTE);
|
||||
assert_eq!(dev.read_data(), 0x65);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_full_probe_sequence() {
|
||||
let mut dev = I8042::new();
|
||||
|
||||
// Step 1: Self-test
|
||||
dev.write_command(cmd::SELF_TEST);
|
||||
assert_ne!(dev.read_status() & status::OBF, 0);
|
||||
assert_eq!(dev.read_data(), 0x55);
|
||||
|
||||
// Step 2: Read CTR
|
||||
dev.write_command(cmd::READ_CMD_BYTE);
|
||||
assert_ne!(dev.read_status() & status::OBF, 0);
|
||||
let ctr = dev.read_data();
|
||||
assert_eq!(ctr, 0x47);
|
||||
|
||||
// Step 3: Write CTR
|
||||
dev.write_command(cmd::WRITE_CMD_BYTE);
|
||||
dev.write_data(ctr & !0x0C); // Disable IRQs during probe
|
||||
|
||||
// Step 4: Test interface
|
||||
dev.write_command(cmd::INTERFACE_TEST);
|
||||
assert_ne!(dev.read_status() & status::OBF, 0);
|
||||
assert_eq!(dev.read_data(), 0x00);
|
||||
|
||||
// Step 5: Enable keyboard
|
||||
dev.write_command(cmd::ENABLE_KBD);
|
||||
|
||||
// Step 6: Re-enable IRQs
|
||||
dev.write_command(cmd::WRITE_CMD_BYTE);
|
||||
dev.write_data(ctr);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_interface_test() {
|
||||
let mut dev = I8042::new();
|
||||
dev.write_command(cmd::INTERFACE_TEST);
|
||||
assert_eq!(dev.read_data(), 0x00);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_disable_enable_keyboard() {
|
||||
let mut dev = I8042::new();
|
||||
dev.write_command(cmd::DISABLE_KBD);
|
||||
dev.write_command(cmd::READ_CMD_BYTE);
|
||||
let ctr = dev.read_data();
|
||||
assert_ne!(ctr & 0x10, 0, "Bit 4 should be set when keyboard disabled");
|
||||
|
||||
dev.write_command(cmd::ENABLE_KBD);
|
||||
dev.write_command(cmd::READ_CMD_BYTE);
|
||||
let ctr = dev.read_data();
|
||||
assert_eq!(ctr & 0x10, 0, "Bit 4 should be clear when keyboard enabled");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_reset() {
|
||||
let mut dev = I8042::new();
|
||||
assert!(!dev.reset_requested());
|
||||
dev.write_command(cmd::RESET);
|
||||
assert!(dev.reset_requested());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_data_read_empty() {
|
||||
let mut dev = I8042::new();
|
||||
assert_eq!(dev.read_data(), 0x00);
|
||||
}
|
||||
}
|
||||
20
vmm/src/devices/mod.rs
Normal file
20
vmm/src/devices/mod.rs
Normal file
@@ -0,0 +1,20 @@
|
||||
//! Device emulation for Volt VMM
|
||||
//!
|
||||
//! This module provides device emulation implementations for the Volt
|
||||
//! microVM monitor. Devices are organized by type:
|
||||
//!
|
||||
//! - `virtio`: VirtIO devices (block, network, etc.)
|
||||
//! - `serial`: 8250 UART serial console
|
||||
//! - `i8042`: Minimal PS/2 keyboard controller (avoids ~1s boot probe timeout)
|
||||
//! - `net`: Network backends (TAP, macvtap)
|
||||
|
||||
#[allow(dead_code)] // PS/2 controller — planned feature
|
||||
pub mod i8042;
|
||||
#[allow(dead_code)] // Network backends — planned feature
|
||||
pub mod net;
|
||||
pub mod serial;
|
||||
pub mod virtio;
|
||||
|
||||
pub use virtio::stellarium_blk::StellariumBackend;
|
||||
pub use virtio::GuestMemory;
|
||||
pub use virtio::mmio::{DynMmioDevice, NetMmioTransport, InterruptDelivery};
|
||||
705
vmm/src/devices/net/macvtap.rs
Normal file
705
vmm/src/devices/net/macvtap.rs
Normal file
@@ -0,0 +1,705 @@
|
||||
//! macvtap Network Backend
|
||||
//!
|
||||
//! macvtap provides near-native network performance by giving VMs direct
|
||||
//! access to the physical NIC without a software bridge.
|
||||
//!
|
||||
//! # Architecture
|
||||
//!
|
||||
//! ```text
|
||||
//! ┌─────────────────────────────────────────────────────────────┐
|
||||
//! │ Guest VM │
|
||||
//! │ ┌───────────────────────────────────────────────────────┐ │
|
||||
//! │ │ virtio-net driver │ │
|
||||
//! │ └──────────────────────┬────────────────────────────────┘ │
|
||||
//! └─────────────────────────┼───────────────────────────────────┘
|
||||
//! │
|
||||
//! ┌─────────────────────────┼───────────────────────────────────┐
|
||||
//! │ Volt VMM │ │
|
||||
//! │ ┌──────────────────────┴────────────────────────────────┐ │
|
||||
//! │ │ MacvtapDevice │ │
|
||||
//! │ │ ┌────────────────────────────────────────────────┐ │ │
|
||||
//! │ │ │ /dev/tap<ifindex> │ │ │
|
||||
//! │ │ │ read()/write() → Zero-copy packet I/O │ │ │
|
||||
//! │ │ └────────────────────────────────────────────────┘ │ │
|
||||
//! │ └───────────────────────────────────────────────────────┘ │
|
||||
//! └─────────────────────────┬───────────────────────────────────┘
|
||||
//! │ macvtap kernel module
|
||||
//! │
|
||||
//! ┌─────────────────────────┴───────────────────────────────────┐
|
||||
//! │ Physical NIC (eth0/enp3s0) │
|
||||
//! │ └── No bridge, direct MAC-based switching │
|
||||
//! └─────────────────────────────────────────────────────────────┘
|
||||
//! ```
|
||||
//!
|
||||
//! # Performance
|
||||
//!
|
||||
//! - ~20-25 Gbps throughput (vs ~10 Gbps vhost-net)
|
||||
//! - ~10-20μs latency (vs ~20-50μs vhost-net)
|
||||
//! - Multi-queue support for scaling with vCPUs
|
||||
//!
|
||||
//! # Modes
|
||||
//!
|
||||
//! - **vepa**: External switch handles all traffic (requires VEPA-capable switch)
|
||||
//! - **bridge**: VMs can communicate directly on host (default)
|
||||
//! - **private**: VMs isolated from each other
|
||||
//! - **passthru**: Single VM owns NIC (maximum performance)
|
||||
|
||||
use super::{NetBackendType, NetError, NetworkBackend, OffloadFlags, Result};
|
||||
use std::ffi::CString;
|
||||
use std::fs::{File, OpenOptions};
|
||||
use std::io::{Read, Write};
|
||||
use std::os::unix::io::{AsRawFd, RawFd};
|
||||
use std::path::PathBuf;
|
||||
|
||||
// ============================================================================
|
||||
// Constants and ioctl definitions
|
||||
// ============================================================================
|
||||
|
||||
/// macvtap modes
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
#[repr(u32)]
|
||||
pub enum MacvtapMode {
|
||||
/// Virtual Ethernet Port Aggregator - all traffic through external switch
|
||||
Vepa = 1,
|
||||
/// Software bridge mode - VMs can communicate directly
|
||||
Bridge = 4,
|
||||
/// Private mode - VMs isolated from each other
|
||||
Private = 2,
|
||||
/// Passthrough mode - single VM owns NIC
|
||||
Passthru = 8,
|
||||
}
|
||||
|
||||
impl Default for MacvtapMode {
|
||||
fn default() -> Self {
|
||||
Self::Bridge
|
||||
}
|
||||
}
|
||||
|
||||
impl MacvtapMode {
|
||||
pub fn as_str(&self) -> &'static str {
|
||||
match self {
|
||||
Self::Vepa => "vepa",
|
||||
Self::Bridge => "bridge",
|
||||
Self::Private => "private",
|
||||
Self::Passthru => "passthru",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// TAP device ioctl numbers
|
||||
mod tap_ioctl {
|
||||
use std::os::raw::c_int;
|
||||
|
||||
pub const TUNSETIFF: u64 = 0x400454CA;
|
||||
pub const TUNGETIFF: u64 = 0x800454D2;
|
||||
pub const TUNSETOFFLOAD: u64 = 0x400454D0;
|
||||
pub const TUNSETVNETHDRSZ: u64 = 0x400454D8;
|
||||
pub const TUNGETFEATURES: u64 = 0x800454CF;
|
||||
pub const TUNSETQUEUE: u64 = 0x400454D9;
|
||||
|
||||
// TUN/TAP flags
|
||||
pub const IFF_TAP: c_int = 0x0002;
|
||||
pub const IFF_NO_PI: c_int = 0x1000;
|
||||
pub const IFF_VNET_HDR: c_int = 0x4000;
|
||||
pub const IFF_MULTI_QUEUE: c_int = 0x0100;
|
||||
pub const IFF_ATTACH_QUEUE: c_int = 0x0200;
|
||||
pub const IFF_DETACH_QUEUE: c_int = 0x0400;
|
||||
|
||||
// Offload flags
|
||||
pub const TUN_F_CSUM: u32 = 0x01;
|
||||
pub const TUN_F_TSO4: u32 = 0x02;
|
||||
pub const TUN_F_TSO6: u32 = 0x04;
|
||||
pub const TUN_F_TSO_ECN: u32 = 0x08;
|
||||
pub const TUN_F_UFO: u32 = 0x10;
|
||||
pub const TUN_F_USO4: u32 = 0x20;
|
||||
pub const TUN_F_USO6: u32 = 0x40;
|
||||
}
|
||||
|
||||
/// Interface request structure for ioctls
|
||||
#[repr(C)]
|
||||
struct IfReq {
|
||||
ifr_name: [u8; 16],
|
||||
ifr_flags: i16,
|
||||
_padding: [u8; 22],
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// macvtap Device
|
||||
// ============================================================================
|
||||
|
||||
/// A macvtap network device providing near-native performance
|
||||
pub struct MacvtapDevice {
|
||||
/// File descriptor for the tap device
|
||||
file: File,
|
||||
/// Interface name (e.g., "macvtap0")
|
||||
name: String,
|
||||
/// Parent interface name (e.g., "eth0")
|
||||
parent: String,
|
||||
/// macvtap mode
|
||||
mode: MacvtapMode,
|
||||
/// Interface index
|
||||
ifindex: u32,
|
||||
/// MAC address
|
||||
mac: [u8; 6],
|
||||
/// Whether VNET_HDR is enabled
|
||||
vnet_hdr: bool,
|
||||
/// Additional queue file descriptors for multi-queue
|
||||
queues: Vec<File>,
|
||||
/// Link status
|
||||
link_up: bool,
|
||||
}
|
||||
|
||||
impl MacvtapDevice {
|
||||
/// Create a new macvtap device on the specified parent interface
|
||||
///
|
||||
/// This creates the macvtap interface via netlink and opens the tap device.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `parent` - Parent interface name (e.g., "eth0", "enp3s0")
|
||||
/// * `name` - Name for the macvtap interface (e.g., "macvtap0")
|
||||
/// * `mode` - macvtap mode (bridge, vepa, private, passthru)
|
||||
/// * `mac` - Optional MAC address (random if None)
|
||||
pub fn create(
|
||||
parent: &str,
|
||||
name: &str,
|
||||
mode: MacvtapMode,
|
||||
mac: Option<[u8; 6]>,
|
||||
) -> Result<Self> {
|
||||
// Create the macvtap interface via netlink
|
||||
let ifindex = Self::create_via_netlink(parent, name, mode)?;
|
||||
|
||||
// Generate or use provided MAC
|
||||
let mac = mac.unwrap_or_else(Self::random_mac);
|
||||
|
||||
// Open the tap character device
|
||||
let tap_path = format!("/dev/tap{}", ifindex);
|
||||
let file = Self::open_tap_device(&tap_path)?;
|
||||
|
||||
// Enable VNET_HDR for offloads
|
||||
Self::set_vnet_hdr(&file, 12)?;
|
||||
|
||||
// Set non-blocking mode
|
||||
Self::set_nonblocking_internal(&file, true)?;
|
||||
|
||||
Ok(Self {
|
||||
file,
|
||||
name: name.to_string(),
|
||||
parent: parent.to_string(),
|
||||
mode,
|
||||
ifindex,
|
||||
mac,
|
||||
vnet_hdr: true,
|
||||
queues: Vec::new(),
|
||||
link_up: true,
|
||||
})
|
||||
}
|
||||
|
||||
/// Open an existing macvtap device by name
|
||||
///
|
||||
/// Use this when the interface is pre-created by networkd
|
||||
pub fn open(name: &str) -> Result<Self> {
|
||||
// Get interface index
|
||||
let ifindex = Self::get_ifindex(name)?;
|
||||
|
||||
// Get MAC address from sysfs
|
||||
let mac = Self::read_mac_from_sysfs(name)?;
|
||||
|
||||
// Determine parent interface
|
||||
let parent = Self::read_parent_from_sysfs(name)?;
|
||||
|
||||
// Open tap device
|
||||
let tap_path = format!("/dev/tap{}", ifindex);
|
||||
let file = Self::open_tap_device(&tap_path)?;
|
||||
|
||||
// Enable VNET_HDR
|
||||
Self::set_vnet_hdr(&file, 12)?;
|
||||
|
||||
// Set non-blocking
|
||||
Self::set_nonblocking_internal(&file, true)?;
|
||||
|
||||
Ok(Self {
|
||||
file,
|
||||
name: name.to_string(),
|
||||
parent,
|
||||
mode: MacvtapMode::Bridge, // Can't reliably determine from existing
|
||||
ifindex,
|
||||
mac,
|
||||
vnet_hdr: true,
|
||||
queues: Vec::new(),
|
||||
link_up: true,
|
||||
})
|
||||
}
|
||||
|
||||
/// Create via netlink (RTM_NEWLINK)
|
||||
fn create_via_netlink(parent: &str, name: &str, mode: MacvtapMode) -> Result<u32> {
|
||||
// Use ip command as fallback (netlink crate would be cleaner)
|
||||
// In production, use rtnetlink crate directly
|
||||
let output = std::process::Command::new("ip")
|
||||
.args([
|
||||
"link",
|
||||
"add",
|
||||
"link",
|
||||
parent,
|
||||
"name",
|
||||
name,
|
||||
"type",
|
||||
"macvtap",
|
||||
"mode",
|
||||
mode.as_str(),
|
||||
])
|
||||
.output()
|
||||
.map_err(|e| NetError::CreateFailed(e))?;
|
||||
|
||||
if !output.status.success() {
|
||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||
return Err(NetError::Netlink(stderr.to_string()));
|
||||
}
|
||||
|
||||
// Bring interface up
|
||||
std::process::Command::new("ip")
|
||||
.args(["link", "set", name, "up"])
|
||||
.output()
|
||||
.map_err(|e| NetError::CreateFailed(e))?;
|
||||
|
||||
// Get the interface index
|
||||
Self::get_ifindex(name)
|
||||
}
|
||||
|
||||
/// Get interface index from name
|
||||
fn get_ifindex(name: &str) -> Result<u32> {
|
||||
let c_name = CString::new(name).map_err(|_| {
|
||||
NetError::InvalidConfig(format!("Invalid interface name: {}", name))
|
||||
})?;
|
||||
|
||||
let ifindex = unsafe { libc::if_nametoindex(c_name.as_ptr()) };
|
||||
if ifindex == 0 {
|
||||
return Err(NetError::InterfaceNotFound(name.to_string()));
|
||||
}
|
||||
|
||||
Ok(ifindex)
|
||||
}
|
||||
|
||||
/// Open the tap character device
|
||||
fn open_tap_device(path: &str) -> Result<File> {
|
||||
OpenOptions::new()
|
||||
.read(true)
|
||||
.write(true)
|
||||
.open(path)
|
||||
.map_err(|e| NetError::CreateFailed(e))
|
||||
}
|
||||
|
||||
/// Set VNET_HDR size for offloads
|
||||
fn set_vnet_hdr(file: &File, size: i32) -> Result<()> {
|
||||
let ret = unsafe {
|
||||
libc::ioctl(
|
||||
file.as_raw_fd(),
|
||||
tap_ioctl::TUNSETVNETHDRSZ as libc::c_ulong,
|
||||
&size as *const i32,
|
||||
)
|
||||
};
|
||||
|
||||
if ret < 0 {
|
||||
return Err(NetError::IoctlFailed(std::io::Error::last_os_error()));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Set non-blocking mode
|
||||
fn set_nonblocking_internal(file: &File, nonblocking: bool) -> Result<()> {
|
||||
let flags = unsafe { libc::fcntl(file.as_raw_fd(), libc::F_GETFL) };
|
||||
if flags < 0 {
|
||||
return Err(NetError::IoctlFailed(std::io::Error::last_os_error()));
|
||||
}
|
||||
|
||||
let new_flags = if nonblocking {
|
||||
flags | libc::O_NONBLOCK
|
||||
} else {
|
||||
flags & !libc::O_NONBLOCK
|
||||
};
|
||||
|
||||
let ret = unsafe { libc::fcntl(file.as_raw_fd(), libc::F_SETFL, new_flags) };
|
||||
if ret < 0 {
|
||||
return Err(NetError::IoctlFailed(std::io::Error::last_os_error()));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Read MAC address from sysfs
|
||||
fn read_mac_from_sysfs(name: &str) -> Result<[u8; 6]> {
|
||||
let path = format!("/sys/class/net/{}/address", name);
|
||||
let content = std::fs::read_to_string(&path).map_err(|e| {
|
||||
NetError::InterfaceNotFound(format!("{}: {}", name, e))
|
||||
})?;
|
||||
|
||||
Self::parse_mac(&content.trim())
|
||||
}
|
||||
|
||||
/// Parse MAC address from string
|
||||
fn parse_mac(s: &str) -> Result<[u8; 6]> {
|
||||
let parts: Vec<&str> = s.split(':').collect();
|
||||
if parts.len() != 6 {
|
||||
return Err(NetError::InvalidConfig(format!("Invalid MAC: {}", s)));
|
||||
}
|
||||
|
||||
let mut mac = [0u8; 6];
|
||||
for (i, part) in parts.iter().enumerate() {
|
||||
mac[i] = u8::from_str_radix(part, 16).map_err(|_| {
|
||||
NetError::InvalidConfig(format!("Invalid MAC byte: {}", part))
|
||||
})?;
|
||||
}
|
||||
Ok(mac)
|
||||
}
|
||||
|
||||
/// Read parent interface from sysfs
|
||||
fn read_parent_from_sysfs(name: &str) -> Result<String> {
|
||||
// macvtap shows parent via lower_* symlink
|
||||
let path = format!("/sys/class/net/{}", name);
|
||||
let _lower_path = PathBuf::from(&path);
|
||||
|
||||
// Try reading the link
|
||||
for entry in std::fs::read_dir(&path).map_err(|e| {
|
||||
NetError::InterfaceNotFound(format!("{}: {}", name, e))
|
||||
})? {
|
||||
if let Ok(entry) = entry {
|
||||
let name = entry.file_name();
|
||||
if name.to_string_lossy().starts_with("lower_") {
|
||||
return Ok(name.to_string_lossy().replace("lower_", ""));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback: check device symlink
|
||||
Ok("unknown".to_string())
|
||||
}
|
||||
|
||||
/// Generate a random locally-administered MAC address
|
||||
fn random_mac() -> [u8; 6] {
|
||||
let mut mac = [0u8; 6];
|
||||
if getrandom::getrandom(&mut mac).is_err() {
|
||||
// Fallback to timestamp-based
|
||||
let t = std::time::SystemTime::now()
|
||||
.duration_since(std::time::UNIX_EPOCH)
|
||||
.unwrap_or_default()
|
||||
.as_nanos() as u64;
|
||||
mac[0] = (t >> 40) as u8;
|
||||
mac[1] = (t >> 32) as u8;
|
||||
mac[2] = (t >> 24) as u8;
|
||||
mac[3] = (t >> 16) as u8;
|
||||
mac[4] = (t >> 8) as u8;
|
||||
mac[5] = t as u8;
|
||||
}
|
||||
// Set locally administered bit, clear multicast bit
|
||||
mac[0] = (mac[0] | 0x02) & 0xFE;
|
||||
mac
|
||||
}
|
||||
|
||||
/// Add a queue for multi-queue support
|
||||
pub fn add_queue(&mut self) -> Result<RawFd> {
|
||||
let tap_path = format!("/dev/tap{}", self.ifindex);
|
||||
let file = Self::open_tap_device(&tap_path)?;
|
||||
|
||||
// Attach as additional queue
|
||||
let mut ifr = IfReq {
|
||||
ifr_name: [0u8; 16],
|
||||
ifr_flags: (tap_ioctl::IFF_TAP
|
||||
| tap_ioctl::IFF_NO_PI
|
||||
| tap_ioctl::IFF_VNET_HDR
|
||||
| tap_ioctl::IFF_MULTI_QUEUE
|
||||
| tap_ioctl::IFF_ATTACH_QUEUE) as i16,
|
||||
_padding: [0u8; 22],
|
||||
};
|
||||
|
||||
let name_bytes = self.name.as_bytes();
|
||||
let len = name_bytes.len().min(15);
|
||||
ifr.ifr_name[..len].copy_from_slice(&name_bytes[..len]);
|
||||
|
||||
let ret = unsafe {
|
||||
libc::ioctl(
|
||||
file.as_raw_fd(),
|
||||
tap_ioctl::TUNSETQUEUE as libc::c_ulong,
|
||||
&ifr as *const IfReq,
|
||||
)
|
||||
};
|
||||
|
||||
if ret < 0 {
|
||||
return Err(NetError::IoctlFailed(std::io::Error::last_os_error()));
|
||||
}
|
||||
|
||||
Self::set_vnet_hdr(&file, 12)?;
|
||||
Self::set_nonblocking_internal(&file, true)?;
|
||||
|
||||
let fd = file.as_raw_fd();
|
||||
self.queues.push(file);
|
||||
Ok(fd)
|
||||
}
|
||||
|
||||
/// Get the number of active queues
|
||||
pub fn queue_count(&self) -> usize {
|
||||
1 + self.queues.len()
|
||||
}
|
||||
|
||||
/// Get parent interface name
|
||||
pub fn parent(&self) -> &str {
|
||||
&self.parent
|
||||
}
|
||||
|
||||
/// Get macvtap mode
|
||||
pub fn mode(&self) -> MacvtapMode {
|
||||
self.mode
|
||||
}
|
||||
|
||||
/// Get interface index
|
||||
pub fn ifindex(&self) -> u32 {
|
||||
self.ifindex
|
||||
}
|
||||
|
||||
/// Destroy the macvtap interface
|
||||
pub fn destroy(self) -> Result<()> {
|
||||
// Use ManuallyDrop to prevent the Drop impl from running,
|
||||
// then manually extract fields we need
|
||||
let this = std::mem::ManuallyDrop::new(self);
|
||||
// Close file descriptors by letting them drop
|
||||
let _file = unsafe { std::ptr::read(&this.file) };
|
||||
let _queues = unsafe { std::ptr::read(&this.queues) };
|
||||
let name = unsafe { std::ptr::read(&this.name) };
|
||||
drop(_file);
|
||||
drop(_queues);
|
||||
|
||||
// Remove the interface
|
||||
let output = std::process::Command::new("ip")
|
||||
.args(["link", "delete", &name])
|
||||
.output()
|
||||
.map_err(|e| NetError::CreateFailed(e))?;
|
||||
|
||||
if !output.status.success() {
|
||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||
return Err(NetError::Netlink(stderr.to_string()));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl NetworkBackend for MacvtapDevice {
|
||||
fn as_raw_fd(&self) -> RawFd {
|
||||
self.file.as_raw_fd()
|
||||
}
|
||||
|
||||
fn recv(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
|
||||
self.file.read(buf)
|
||||
}
|
||||
|
||||
fn send(&mut self, buf: &[u8]) -> std::io::Result<usize> {
|
||||
self.file.write(buf)
|
||||
}
|
||||
|
||||
fn backend_type(&self) -> NetBackendType {
|
||||
NetBackendType::Macvtap
|
||||
}
|
||||
|
||||
fn mac_address(&self) -> Option<[u8; 6]> {
|
||||
Some(self.mac)
|
||||
}
|
||||
|
||||
fn set_nonblocking(&self, nonblocking: bool) -> std::io::Result<()> {
|
||||
Self::set_nonblocking_internal(&self.file, nonblocking)
|
||||
.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e.to_string()))
|
||||
}
|
||||
|
||||
fn configure_offloads(&self, offloads: OffloadFlags) -> std::io::Result<()> {
|
||||
let mut flags = 0u32;
|
||||
|
||||
if offloads.tx_csum {
|
||||
flags |= tap_ioctl::TUN_F_CSUM;
|
||||
}
|
||||
if offloads.tso4 {
|
||||
flags |= tap_ioctl::TUN_F_TSO4;
|
||||
}
|
||||
if offloads.tso6 {
|
||||
flags |= tap_ioctl::TUN_F_TSO6;
|
||||
}
|
||||
if offloads.ufo {
|
||||
flags |= tap_ioctl::TUN_F_UFO;
|
||||
}
|
||||
|
||||
let ret = unsafe {
|
||||
libc::ioctl(
|
||||
self.file.as_raw_fd(),
|
||||
tap_ioctl::TUNSETOFFLOAD as libc::c_ulong,
|
||||
flags as libc::c_ulong,
|
||||
)
|
||||
};
|
||||
|
||||
if ret < 0 {
|
||||
return Err(std::io::Error::last_os_error());
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn link_up(&self) -> bool {
|
||||
self.link_up
|
||||
}
|
||||
|
||||
fn interface_name(&self) -> &str {
|
||||
&self.name
|
||||
}
|
||||
}
|
||||
|
||||
impl AsRawFd for MacvtapDevice {
|
||||
fn as_raw_fd(&self) -> RawFd {
|
||||
self.file.as_raw_fd()
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for MacvtapDevice {
|
||||
fn drop(&mut self) {
|
||||
tracing::debug!("MacvtapDevice {} dropping, cleaning up interface", self.name);
|
||||
|
||||
// Delete the macvtap interface so it doesn't leak on failure/panic.
|
||||
// The kernel will close the /dev/tapN fd when File drops, but the
|
||||
// macvtap netlink interface persists until explicitly removed.
|
||||
let output = std::process::Command::new("ip")
|
||||
.args(["link", "delete", &self.name])
|
||||
.output();
|
||||
|
||||
match output {
|
||||
Ok(o) if o.status.success() => {
|
||||
tracing::debug!("Deleted macvtap interface {}", self.name);
|
||||
}
|
||||
Ok(o) => {
|
||||
let stderr = String::from_utf8_lossy(&o.stderr);
|
||||
// "Cannot find device" is fine — already cleaned up
|
||||
if !stderr.contains("Cannot find device") {
|
||||
tracing::warn!(
|
||||
"Failed to delete macvtap interface {}: {}",
|
||||
self.name,
|
||||
stderr.trim()
|
||||
);
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
"Failed to run ip link delete for {}: {}",
|
||||
self.name,
|
||||
e
|
||||
);
|
||||
}
|
||||
}
|
||||
// File descriptors (self.file, self.queues) are dropped automatically by Rust
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Builder
|
||||
// ============================================================================
|
||||
|
||||
/// Builder for creating macvtap devices
|
||||
pub struct MacvtapBuilder {
|
||||
parent: String,
|
||||
name: Option<String>,
|
||||
mode: MacvtapMode,
|
||||
mac: Option<[u8; 6]>,
|
||||
queues: usize,
|
||||
offloads: OffloadFlags,
|
||||
}
|
||||
|
||||
impl MacvtapBuilder {
|
||||
/// Create a new builder with the specified parent interface
|
||||
pub fn new(parent: impl Into<String>) -> Self {
|
||||
Self {
|
||||
parent: parent.into(),
|
||||
name: None,
|
||||
mode: MacvtapMode::Bridge,
|
||||
mac: None,
|
||||
queues: 1,
|
||||
offloads: OffloadFlags::standard(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Set the interface name
|
||||
pub fn name(mut self, name: impl Into<String>) -> Self {
|
||||
self.name = Some(name.into());
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the macvtap mode
|
||||
pub fn mode(mut self, mode: MacvtapMode) -> Self {
|
||||
self.mode = mode;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the MAC address
|
||||
pub fn mac(mut self, mac: [u8; 6]) -> Self {
|
||||
self.mac = Some(mac);
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the number of queues (multi-queue)
|
||||
pub fn queues(mut self, queues: usize) -> Self {
|
||||
self.queues = queues.max(1);
|
||||
self
|
||||
}
|
||||
|
||||
/// Set offload configuration
|
||||
pub fn offloads(mut self, offloads: OffloadFlags) -> Self {
|
||||
self.offloads = offloads;
|
||||
self
|
||||
}
|
||||
|
||||
/// Build the macvtap device
|
||||
pub fn build(self) -> Result<MacvtapDevice> {
|
||||
let name = self.name.unwrap_or_else(|| {
|
||||
format!("macvtap-{:x}", std::process::id())
|
||||
});
|
||||
|
||||
let mut device = MacvtapDevice::create(&self.parent, &name, self.mode, self.mac)?;
|
||||
|
||||
// Add additional queues
|
||||
for _ in 1..self.queues {
|
||||
device.add_queue()?;
|
||||
}
|
||||
|
||||
// Configure offloads
|
||||
device.configure_offloads(self.offloads)
|
||||
.map_err(|e| NetError::IoctlFailed(e))?;
|
||||
|
||||
Ok(device)
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Tests
|
||||
// ============================================================================
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_random_mac() {
|
||||
let mac1 = MacvtapDevice::random_mac();
|
||||
let mac2 = MacvtapDevice::random_mac();
|
||||
|
||||
// Locally administered bit set
|
||||
assert!(mac1[0] & 0x02 != 0);
|
||||
// Multicast bit clear
|
||||
assert!(mac1[0] & 0x01 == 0);
|
||||
// MACs should differ
|
||||
assert_ne!(mac1, mac2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_mac() {
|
||||
let mac = MacvtapDevice::parse_mac("52:54:00:12:34:56").unwrap();
|
||||
assert_eq!(mac, [0x52, 0x54, 0x00, 0x12, 0x34, 0x56]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_mode_str() {
|
||||
assert_eq!(MacvtapMode::Bridge.as_str(), "bridge");
|
||||
assert_eq!(MacvtapMode::Vepa.as_str(), "vepa");
|
||||
assert_eq!(MacvtapMode::Private.as_str(), "private");
|
||||
assert_eq!(MacvtapMode::Passthru.as_str(), "passthru");
|
||||
}
|
||||
}
|
||||
129
vmm/src/devices/net/mod.rs
Normal file
129
vmm/src/devices/net/mod.rs
Normal file
@@ -0,0 +1,129 @@
|
||||
//! Network Device Backends for Volt
|
||||
//!
|
||||
//! This module provides network backends for Volt VMs.
|
||||
//!
|
||||
//! # Backend Options
|
||||
//!
|
||||
//! | Backend | Performance | Complexity | Use Case |
|
||||
//! |-----------|-------------|------------|---------------------------|
|
||||
//! | macvtap | ~20+ Gbps | Low | Default, most scenarios |
|
||||
//! | tap | ~10 Gbps | Low | Simple, universal |
|
||||
|
||||
#[allow(dead_code)] // Macvtap backend — planned feature
|
||||
pub mod macvtap;
|
||||
|
||||
|
||||
use std::os::unix::io::RawFd;
|
||||
|
||||
/// Network backend type identifier
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum NetBackendType {
|
||||
/// TAP device
|
||||
Tap,
|
||||
/// macvtap device
|
||||
Macvtap,
|
||||
}
|
||||
|
||||
/// Offload capability flags
|
||||
#[derive(Debug, Clone, Copy, Default)]
|
||||
pub struct OffloadFlags {
|
||||
/// TX checksum offload
|
||||
pub tx_csum: bool,
|
||||
/// RX checksum offload
|
||||
pub rx_csum: bool,
|
||||
/// TCP Segmentation Offload v4
|
||||
pub tso4: bool,
|
||||
/// TCP Segmentation Offload v6
|
||||
pub tso6: bool,
|
||||
/// UDP Fragmentation Offload
|
||||
pub ufo: bool,
|
||||
/// Large Receive Offload
|
||||
pub lro: bool,
|
||||
/// Generic Receive Offload
|
||||
pub gro: bool,
|
||||
/// Generic Segmentation Offload
|
||||
pub gso: bool,
|
||||
}
|
||||
|
||||
impl OffloadFlags {
|
||||
/// All offloads enabled
|
||||
pub fn all() -> Self {
|
||||
Self {
|
||||
tx_csum: true,
|
||||
rx_csum: true,
|
||||
tso4: true,
|
||||
tso6: true,
|
||||
ufo: true,
|
||||
lro: true,
|
||||
gro: true,
|
||||
gso: true,
|
||||
}
|
||||
}
|
||||
|
||||
/// No offloads
|
||||
pub fn none() -> Self {
|
||||
Self::default()
|
||||
}
|
||||
|
||||
/// Standard offloads (csum + TSO)
|
||||
pub fn standard() -> Self {
|
||||
Self {
|
||||
tx_csum: true,
|
||||
rx_csum: true,
|
||||
tso4: true,
|
||||
tso6: true,
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Unified trait for all network backends
|
||||
pub trait NetworkBackend: Send + Sync {
|
||||
/// Get the file descriptor for epoll registration
|
||||
fn as_raw_fd(&self) -> RawFd;
|
||||
|
||||
/// Read a packet from the backend
|
||||
fn recv(&mut self, buf: &mut [u8]) -> std::io::Result<usize>;
|
||||
|
||||
/// Write a packet to the backend
|
||||
fn send(&mut self, buf: &[u8]) -> std::io::Result<usize>;
|
||||
|
||||
/// Get the backend type
|
||||
fn backend_type(&self) -> NetBackendType;
|
||||
|
||||
/// Get the MAC address (if assigned)
|
||||
fn mac_address(&self) -> Option<[u8; 6]>;
|
||||
|
||||
/// Set non-blocking mode
|
||||
fn set_nonblocking(&self, nonblocking: bool) -> std::io::Result<()>;
|
||||
|
||||
/// Configure offloads
|
||||
fn configure_offloads(&self, offloads: OffloadFlags) -> std::io::Result<()>;
|
||||
|
||||
/// Get current link status
|
||||
fn link_up(&self) -> bool;
|
||||
|
||||
/// Get the interface name
|
||||
fn interface_name(&self) -> &str;
|
||||
}
|
||||
|
||||
/// Error types for network backends
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum NetError {
|
||||
#[error("Failed to create interface: {0}")]
|
||||
CreateFailed(#[source] std::io::Error),
|
||||
|
||||
#[error("Interface not found: {0}")]
|
||||
InterfaceNotFound(String),
|
||||
|
||||
#[error("ioctl failed: {0}")]
|
||||
IoctlFailed(#[source] std::io::Error),
|
||||
|
||||
#[error("Netlink error: {0}")]
|
||||
Netlink(String),
|
||||
|
||||
#[error("Invalid configuration: {0}")]
|
||||
InvalidConfig(String),
|
||||
}
|
||||
|
||||
pub type Result<T> = std::result::Result<T, NetError>;
|
||||
302
vmm/src/devices/serial.rs
Normal file
302
vmm/src/devices/serial.rs
Normal file
@@ -0,0 +1,302 @@
|
||||
//! 8250 UART serial console emulation
|
||||
//!
|
||||
//! Implements a 16450-compatible UART with interrupt support.
|
||||
//! The Linux kernel's 8250 driver relies on THRE (Transmitter Holding Register Empty)
|
||||
//! interrupts for efficient output. Without them, userspace writes to the serial
|
||||
//! console will block forever waiting for an interrupt that never fires.
|
||||
|
||||
use std::collections::VecDeque;
|
||||
use std::io::{self, Write};
|
||||
use std::sync::Arc;
|
||||
|
||||
/// Standard COM1 I/O port base address
|
||||
pub const COM1_PORT: u16 = 0x3f8;
|
||||
|
||||
/// Standard COM1 IRQ number
|
||||
pub const COM1_IRQ: u32 = 4;
|
||||
|
||||
/// 8250 UART register offsets
|
||||
#[repr(u8)]
|
||||
#[allow(dead_code)] // UART register map — kept for reference
|
||||
pub enum Register {
|
||||
/// Receive buffer / Transmit holding register
|
||||
Data = 0,
|
||||
/// Interrupt enable register
|
||||
InterruptEnable = 1,
|
||||
/// Interrupt identification / FIFO control
|
||||
InterruptId = 2,
|
||||
/// Line control register
|
||||
LineControl = 3,
|
||||
/// Modem control register
|
||||
ModemControl = 4,
|
||||
/// Line status register
|
||||
LineStatus = 5,
|
||||
/// Modem status register
|
||||
ModemStatus = 6,
|
||||
/// Scratch register
|
||||
Scratch = 7,
|
||||
}
|
||||
|
||||
/// IER (Interrupt Enable Register) bits
|
||||
#[allow(dead_code)] // UART IER bits — kept for completeness
|
||||
pub mod ier_bits {
|
||||
pub const RX_AVAIL: u8 = 0x01; // Received data available
|
||||
pub const THR_EMPTY: u8 = 0x02; // Transmitter holding register empty
|
||||
pub const RX_LINE_STATUS: u8 = 0x04; // Receiver line status
|
||||
pub const MODEM_STATUS: u8 = 0x08; // Modem status
|
||||
}
|
||||
|
||||
/// IIR (Interrupt Identification Register) values
|
||||
#[allow(dead_code)] // UART IIR values — kept for completeness
|
||||
pub mod iir_values {
|
||||
pub const NO_INTERRUPT: u8 = 0x01; // No interrupt pending (bit 0 set)
|
||||
pub const THR_EMPTY: u8 = 0x02; // THR empty (priority 3)
|
||||
pub const RX_DATA_AVAIL: u8 = 0x04; // Received data available (priority 2)
|
||||
pub const RX_LINE_STATUS: u8 = 0x06; // Receiver line status (priority 1)
|
||||
pub const MODEM_STATUS: u8 = 0x00; // Modem status (priority 4)
|
||||
}
|
||||
|
||||
/// Line status register bits
|
||||
#[allow(dead_code)] // UART line status bits — kept for completeness
|
||||
pub mod line_status {
|
||||
pub const DATA_READY: u8 = 0x01;
|
||||
pub const OVERRUN_ERROR: u8 = 0x02;
|
||||
pub const PARITY_ERROR: u8 = 0x04;
|
||||
pub const FRAMING_ERROR: u8 = 0x08;
|
||||
pub const BREAK_INTERRUPT: u8 = 0x10;
|
||||
pub const THR_EMPTY: u8 = 0x20;
|
||||
pub const THR_TSR_EMPTY: u8 = 0x40;
|
||||
pub const FIFO_ERROR: u8 = 0x80;
|
||||
}
|
||||
|
||||
/// Trait for interrupt delivery from the serial device
|
||||
pub trait SerialInterrupt: Send + Sync {
|
||||
fn trigger(&self);
|
||||
}
|
||||
|
||||
/// Serial console device with interrupt support
|
||||
pub struct Serial {
|
||||
/// Divisor latch access bit
|
||||
dlab: bool,
|
||||
/// Interrupt enable register
|
||||
ier: u8,
|
||||
/// Line control register
|
||||
lcr: u8,
|
||||
/// Modem control register
|
||||
mcr: u8,
|
||||
/// Line status register
|
||||
lsr: u8,
|
||||
/// Modem status register
|
||||
msr: u8,
|
||||
/// Scratch register
|
||||
scr: u8,
|
||||
/// Divisor latch (low byte)
|
||||
dll: u8,
|
||||
/// Divisor latch (high byte)
|
||||
dlh: u8,
|
||||
/// Whether a THRE interrupt is pending (tracks edge-triggered behavior)
|
||||
thr_interrupt_pending: bool,
|
||||
/// Input buffer
|
||||
input_buffer: VecDeque<u8>,
|
||||
/// Output writer (wrapped in Mutex for thread safety)
|
||||
output: Option<std::sync::Mutex<Box<dyn Write + Send>>>,
|
||||
/// Interrupt callback for triggering IRQ to the guest
|
||||
interrupt: Option<Arc<dyn SerialInterrupt>>,
|
||||
}
|
||||
|
||||
impl Serial {
|
||||
/// Create a new serial device with stdout output
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
dlab: false,
|
||||
ier: 0,
|
||||
lcr: 0,
|
||||
mcr: 0,
|
||||
lsr: line_status::THR_EMPTY | line_status::THR_TSR_EMPTY,
|
||||
msr: 0,
|
||||
scr: 0,
|
||||
dll: 0,
|
||||
dlh: 0,
|
||||
thr_interrupt_pending: false,
|
||||
input_buffer: VecDeque::new(),
|
||||
output: Some(std::sync::Mutex::new(Box::new(io::stdout()))),
|
||||
interrupt: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Set the interrupt delivery mechanism
|
||||
pub fn set_interrupt(&mut self, interrupt: Arc<dyn SerialInterrupt>) {
|
||||
self.interrupt = Some(interrupt);
|
||||
}
|
||||
|
||||
/// Compute the current IIR value based on pending interrupt conditions.
|
||||
/// Priority (highest to lowest): Line Status > RX Data > THR Empty > Modem Status
|
||||
fn compute_iir(&self) -> u8 {
|
||||
// Check receiver line status interrupt
|
||||
if (self.ier & ier_bits::RX_LINE_STATUS) != 0 {
|
||||
let error_bits = self.lsr & (line_status::OVERRUN_ERROR | line_status::PARITY_ERROR
|
||||
| line_status::FRAMING_ERROR | line_status::BREAK_INTERRUPT);
|
||||
if error_bits != 0 {
|
||||
return iir_values::RX_LINE_STATUS;
|
||||
}
|
||||
}
|
||||
|
||||
// Check received data available interrupt
|
||||
if (self.ier & ier_bits::RX_AVAIL) != 0 && (self.lsr & line_status::DATA_READY) != 0 {
|
||||
return iir_values::RX_DATA_AVAIL;
|
||||
}
|
||||
|
||||
// Check THR empty interrupt
|
||||
if (self.ier & ier_bits::THR_EMPTY) != 0 && self.thr_interrupt_pending {
|
||||
return iir_values::THR_EMPTY;
|
||||
}
|
||||
|
||||
// Check modem status interrupt
|
||||
if (self.ier & ier_bits::MODEM_STATUS) != 0 {
|
||||
// We don't track modem status changes, so this never fires
|
||||
}
|
||||
|
||||
// No interrupt pending
|
||||
iir_values::NO_INTERRUPT
|
||||
}
|
||||
|
||||
/// Fire an interrupt if any conditions are pending
|
||||
fn update_interrupt(&self) {
|
||||
let iir = self.compute_iir();
|
||||
if iir != iir_values::NO_INTERRUPT {
|
||||
if let Some(ref interrupt) = self.interrupt {
|
||||
interrupt.trigger();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Handle a read from the serial port
|
||||
pub fn read(&mut self, offset: u8) -> u8 {
|
||||
match offset {
|
||||
0 => {
|
||||
if self.dlab {
|
||||
self.dll
|
||||
} else {
|
||||
// Read from receive buffer
|
||||
let data = self.input_buffer.pop_front().unwrap_or(0);
|
||||
if self.input_buffer.is_empty() {
|
||||
self.lsr &= !line_status::DATA_READY;
|
||||
}
|
||||
self.update_interrupt();
|
||||
data
|
||||
}
|
||||
}
|
||||
1 => {
|
||||
if self.dlab {
|
||||
self.dlh
|
||||
} else {
|
||||
self.ier
|
||||
}
|
||||
}
|
||||
2 => {
|
||||
// Reading IIR clears the THR interrupt condition
|
||||
let iir = self.compute_iir();
|
||||
if iir == iir_values::THR_EMPTY {
|
||||
self.thr_interrupt_pending = false;
|
||||
}
|
||||
iir
|
||||
}
|
||||
3 => self.lcr,
|
||||
4 => self.mcr,
|
||||
5 => {
|
||||
// Reading LSR clears error bits
|
||||
let val = self.lsr;
|
||||
// Clear error bits (but preserve data ready and THR flags)
|
||||
self.lsr &= line_status::DATA_READY | line_status::THR_EMPTY | line_status::THR_TSR_EMPTY;
|
||||
val
|
||||
}
|
||||
6 => self.msr,
|
||||
7 => self.scr,
|
||||
_ => 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Handle a write to the serial port
|
||||
pub fn write(&mut self, offset: u8, value: u8) {
|
||||
match offset {
|
||||
0 => {
|
||||
if self.dlab {
|
||||
self.dll = value;
|
||||
} else {
|
||||
// Write to transmit buffer — output the character
|
||||
if let Some(ref output) = self.output {
|
||||
if let Ok(mut out) = output.lock() {
|
||||
let _ = out.write_all(&[value]);
|
||||
let _ = out.flush();
|
||||
}
|
||||
}
|
||||
// The character is "transmitted" instantly.
|
||||
// LSR THR_EMPTY and THR_TSR_EMPTY stay set (we don't simulate
|
||||
// real transmission delay — the character goes to stdout immediately).
|
||||
// Signal a THRE interrupt so the driver knows it can send more.
|
||||
self.thr_interrupt_pending = true;
|
||||
self.update_interrupt();
|
||||
}
|
||||
}
|
||||
1 => {
|
||||
if self.dlab {
|
||||
self.dlh = value;
|
||||
} else {
|
||||
let old_ier = self.ier;
|
||||
self.ier = value & 0x0f;
|
||||
|
||||
// If THRE interrupt was just enabled and transmitter is empty,
|
||||
// signal the interrupt immediately. This is critical for the
|
||||
// 8250 driver's initialization — it enables THRE interrupts
|
||||
// and expects an immediate interrupt to start the TX pump.
|
||||
if (old_ier & ier_bits::THR_EMPTY) == 0
|
||||
&& (self.ier & ier_bits::THR_EMPTY) != 0
|
||||
&& (self.lsr & line_status::THR_EMPTY) != 0
|
||||
{
|
||||
self.thr_interrupt_pending = true;
|
||||
self.update_interrupt();
|
||||
}
|
||||
}
|
||||
}
|
||||
2 => {
|
||||
// FIFO control register — we don't emulate FIFOs
|
||||
// but accept writes silently
|
||||
}
|
||||
3 => {
|
||||
self.dlab = (value & 0x80) != 0;
|
||||
self.lcr = value;
|
||||
}
|
||||
4 => {
|
||||
self.mcr = value & 0x1f;
|
||||
}
|
||||
5 => {
|
||||
// Line status is read-only
|
||||
}
|
||||
6 => {
|
||||
// Modem status is read-only
|
||||
}
|
||||
7 => {
|
||||
self.scr = value;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
/// Queue input data to the serial device
|
||||
#[allow(dead_code)] // Will be used for serial input from API
|
||||
pub fn queue_input(&mut self, data: &[u8]) {
|
||||
for &byte in data {
|
||||
self.input_buffer.push_back(byte);
|
||||
}
|
||||
if !self.input_buffer.is_empty() {
|
||||
self.lsr |= line_status::DATA_READY;
|
||||
self.update_interrupt();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for Serial {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
1124
vmm/src/devices/virtio/block.rs
Normal file
1124
vmm/src/devices/virtio/block.rs
Normal file
File diff suppressed because it is too large
Load Diff
338
vmm/src/devices/virtio/device.rs
Normal file
338
vmm/src/devices/virtio/device.rs
Normal file
@@ -0,0 +1,338 @@
|
||||
//! Virtio Device Trait
|
||||
//!
|
||||
//! Defines the interface that all virtio device backends must implement.
|
||||
//! This trait abstracts away the transport layer (MMIO, PCI) from the
|
||||
//! device-specific logic.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use bitflags::bitflags;
|
||||
use vm_memory::GuestMemoryMmap;
|
||||
|
||||
use super::mmio::VirtioMmioError;
|
||||
use super::DeviceType;
|
||||
|
||||
bitflags! {
|
||||
/// Common virtio feature bits
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub struct DeviceFeatures: u64 {
|
||||
// Feature bits 0-23 are device-specific
|
||||
|
||||
/// Feature negotiation mechanism
|
||||
const RING_INDIRECT_DESC = 1 << 28;
|
||||
/// Support for event idx (avail_event, used_event)
|
||||
const RING_EVENT_IDX = 1 << 29;
|
||||
|
||||
// Bits 32+ are reserved for transport/virtio version features
|
||||
|
||||
/// Device supports version 1.0+ (non-legacy)
|
||||
const VERSION_1 = 1 << 32;
|
||||
/// Device can access platform-specific memory
|
||||
const ACCESS_PLATFORM = 1 << 33;
|
||||
/// Device supports packed virtqueue layout
|
||||
const RING_PACKED = 1 << 34;
|
||||
/// Device supports in-order buffer consumption
|
||||
const IN_ORDER = 1 << 35;
|
||||
/// Device supports memory ordered accesses
|
||||
const ORDER_PLATFORM = 1 << 36;
|
||||
/// Device supports single-root I/O virtualization
|
||||
const SR_IOV = 1 << 37;
|
||||
/// Device supports notification data
|
||||
const NOTIFICATION_DATA = 1 << 38;
|
||||
/// Device supports notification config data
|
||||
const NOTIF_CONFIG_DATA = 1 << 39;
|
||||
/// Device supports reset notification
|
||||
const RING_RESET = 1 << 40;
|
||||
}
|
||||
}
|
||||
|
||||
/// Virtio device backend trait
|
||||
///
|
||||
/// All virtio device implementations (block, net, vsock, etc.) must implement this trait.
|
||||
/// The MMIO transport layer uses this interface to interact with device-specific logic.
|
||||
pub trait VirtioDevice: Send + Sync {
|
||||
/// Get the device type ID
|
||||
fn device_type(&self) -> DeviceType;
|
||||
|
||||
/// Get the device feature bits
|
||||
///
|
||||
/// Returns all features supported by the device. The driver will negotiate
|
||||
/// which features to use during initialization.
|
||||
fn device_features(&self) -> u64;
|
||||
|
||||
/// Get the number of virtqueues this device uses
|
||||
fn num_queues(&self) -> u16;
|
||||
|
||||
/// Get the maximum queue size
|
||||
fn queue_max_size(&self) -> u16 {
|
||||
256
|
||||
}
|
||||
|
||||
/// Read from the device-specific configuration space
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `offset` - Offset within the config space (starts at MMIO offset 0x100)
|
||||
/// * `data` - Buffer to fill with configuration data
|
||||
fn read_config(&self, offset: u64, data: &mut [u8]);
|
||||
|
||||
/// Write to the device-specific configuration space
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `offset` - Offset within the config space
|
||||
/// * `data` - Data to write
|
||||
fn write_config(&mut self, offset: u64, data: &[u8]);
|
||||
|
||||
/// Activate the device with negotiated features
|
||||
///
|
||||
/// Called when the driver writes DRIVER_OK to the status register.
|
||||
/// The device should start processing I/O after this call.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `features` - The negotiated feature bits
|
||||
/// * `mem` - Guest memory reference for DMA operations
|
||||
fn activate(&mut self, features: u64, mem: &GuestMemoryMmap) -> Result<(), VirtioMmioError>;
|
||||
|
||||
/// Reset the device to initial state
|
||||
///
|
||||
/// Called when the driver writes 0 to the status register.
|
||||
fn reset(&mut self);
|
||||
|
||||
/// Process available buffers on the given queue
|
||||
///
|
||||
/// Called when the driver writes to QueueNotify.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `queue_idx` - Index of the queue to process
|
||||
/// * `mem` - Guest memory reference
|
||||
fn process_queue(&mut self, queue_idx: u16, mem: &GuestMemoryMmap) -> Result<(), VirtioMmioError>;
|
||||
|
||||
/// Get the size of the device-specific configuration space
|
||||
fn config_size(&self) -> u64 {
|
||||
0
|
||||
}
|
||||
|
||||
/// Check if the device supports a specific feature
|
||||
fn has_feature(&self, feature: u64) -> bool {
|
||||
(self.device_features() & feature) != 0
|
||||
}
|
||||
}
|
||||
|
||||
/// A stub device that can be used for testing or as a placeholder
|
||||
pub struct NullDevice {
|
||||
device_type: DeviceType,
|
||||
features: u64,
|
||||
num_queues: u16,
|
||||
config: Vec<u8>,
|
||||
}
|
||||
|
||||
impl NullDevice {
|
||||
/// Create a new null device of the given type
|
||||
pub fn new(device_type: DeviceType, num_queues: u16, config_size: usize) -> Self {
|
||||
Self {
|
||||
device_type,
|
||||
features: DeviceFeatures::VERSION_1.bits(),
|
||||
num_queues,
|
||||
config: vec![0; config_size],
|
||||
}
|
||||
}
|
||||
|
||||
/// Set the device features
|
||||
pub fn set_features(&mut self, features: u64) {
|
||||
self.features = features;
|
||||
}
|
||||
}
|
||||
|
||||
impl VirtioDevice for NullDevice {
|
||||
fn device_type(&self) -> DeviceType {
|
||||
self.device_type
|
||||
}
|
||||
|
||||
fn device_features(&self) -> u64 {
|
||||
self.features
|
||||
}
|
||||
|
||||
fn num_queues(&self) -> u16 {
|
||||
self.num_queues
|
||||
}
|
||||
|
||||
fn read_config(&self, offset: u64, data: &mut [u8]) {
|
||||
let start = offset as usize;
|
||||
let end = std::cmp::min(start + data.len(), self.config.len());
|
||||
if start < end {
|
||||
data[..end - start].copy_from_slice(&self.config[start..end]);
|
||||
}
|
||||
}
|
||||
|
||||
fn write_config(&mut self, offset: u64, data: &[u8]) {
|
||||
let start = offset as usize;
|
||||
let end = std::cmp::min(start + data.len(), self.config.len());
|
||||
if start < end {
|
||||
self.config[start..end].copy_from_slice(&data[..end - start]);
|
||||
}
|
||||
}
|
||||
|
||||
fn activate(&mut self, _features: u64, _mem: &GuestMemoryMmap) -> Result<(), VirtioMmioError> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn reset(&mut self) {
|
||||
self.config.fill(0);
|
||||
}
|
||||
|
||||
fn process_queue(&mut self, _queue_idx: u16, _mem: &GuestMemoryMmap) -> Result<(), VirtioMmioError> {
|
||||
// Null device doesn't process anything
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn config_size(&self) -> u64 {
|
||||
self.config.len() as u64
|
||||
}
|
||||
}
|
||||
|
||||
/// Interrupt handler callback type
|
||||
pub type InterruptCallback = Arc<dyn Fn(u32) -> Result<(), VirtioMmioError> + Send + Sync>;
|
||||
|
||||
/// Context provided to device backends for interrupt injection
|
||||
pub struct DeviceContext {
|
||||
/// Callback to inject interrupts
|
||||
pub interrupt: InterruptCallback,
|
||||
/// Queue index that triggered this processing
|
||||
pub queue_idx: u16,
|
||||
}
|
||||
|
||||
impl DeviceContext {
|
||||
/// Signal an interrupt to the guest
|
||||
pub fn signal_interrupt(&self, vector: u32) -> Result<(), VirtioMmioError> {
|
||||
(self.interrupt)(vector)
|
||||
}
|
||||
|
||||
/// Signal used buffer notification
|
||||
pub fn signal_used(&self) -> Result<(), VirtioMmioError> {
|
||||
self.signal_interrupt(0)
|
||||
}
|
||||
}
|
||||
|
||||
/// Block device-specific configuration
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Clone, Copy, Default)]
|
||||
pub struct VirtioBlockConfig {
|
||||
/// Total capacity in 512-byte sectors
|
||||
pub capacity: u64,
|
||||
/// Size of a block (unused in virtio 1.1+)
|
||||
pub size_max: u32,
|
||||
/// Maximum number of segments in a request
|
||||
pub seg_max: u32,
|
||||
/// Cylinder geometry (cylinders)
|
||||
pub geometry_cylinders: u16,
|
||||
/// Cylinder geometry (heads)
|
||||
pub geometry_heads: u8,
|
||||
/// Cylinder geometry (sectors)
|
||||
pub geometry_sectors: u8,
|
||||
/// Block size
|
||||
pub blk_size: u32,
|
||||
/// Physical block exponent
|
||||
pub physical_block_exp: u8,
|
||||
/// Alignment offset
|
||||
pub alignment_offset: u8,
|
||||
/// Minimum I/O size
|
||||
pub min_io_size: u16,
|
||||
/// Optimal I/O size
|
||||
pub opt_io_size: u32,
|
||||
/// Writeback mode
|
||||
pub writeback: u8,
|
||||
/// Unused
|
||||
pub unused0: u8,
|
||||
/// Number of queues
|
||||
pub num_queues: u16,
|
||||
/// Maximum discard sectors
|
||||
pub max_discard_sectors: u32,
|
||||
/// Maximum discard segment count
|
||||
pub max_discard_seg: u32,
|
||||
/// Discard sector alignment
|
||||
pub discard_sector_alignment: u32,
|
||||
/// Maximum write zeroes sectors
|
||||
pub max_write_zeroes_sectors: u32,
|
||||
/// Maximum write zeroes segment count
|
||||
pub max_write_zeroes_seg: u32,
|
||||
/// Write zeroes may unmap
|
||||
pub write_zeroes_may_unmap: u8,
|
||||
/// Unused
|
||||
pub unused1: [u8; 3],
|
||||
/// Maximum secure erase sectors
|
||||
pub max_secure_erase_sectors: u32,
|
||||
/// Maximum secure erase segment count
|
||||
pub max_secure_erase_seg: u32,
|
||||
/// Secure erase sector alignment
|
||||
pub secure_erase_sector_alignment: u32,
|
||||
}
|
||||
|
||||
/// Network device-specific configuration
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Clone, Copy, Default)]
|
||||
pub struct VirtioNetConfig {
|
||||
/// MAC address
|
||||
pub mac: [u8; 6],
|
||||
/// Status (link up/down)
|
||||
pub status: u16,
|
||||
/// Maximum number of TX virtqueues
|
||||
pub max_virtqueue_pairs: u16,
|
||||
/// MTU
|
||||
pub mtu: u16,
|
||||
/// Speed (Mbps)
|
||||
pub speed: u32,
|
||||
/// Duplex mode
|
||||
pub duplex: u8,
|
||||
/// RSS max key size
|
||||
pub rss_max_key_size: u8,
|
||||
/// RSS max indirection table length
|
||||
pub rss_max_indirection_table_length: u16,
|
||||
/// Supported hash types
|
||||
pub supported_hash_types: u32,
|
||||
}
|
||||
|
||||
/// Vsock device-specific configuration
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Clone, Copy, Default)]
|
||||
pub struct VirtioVsockConfig {
|
||||
/// Guest CID (context ID)
|
||||
pub guest_cid: u64,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_null_device() {
|
||||
let mut device = NullDevice::new(DeviceType::Block, 2, 64);
|
||||
assert_eq!(device.device_type(), DeviceType::Block);
|
||||
assert_eq!(device.num_queues(), 2);
|
||||
|
||||
// Test config read/write
|
||||
let mut buf = [0u8; 4];
|
||||
device.write_config(0, &[1, 2, 3, 4]);
|
||||
device.read_config(0, &mut buf);
|
||||
assert_eq!(buf, [1, 2, 3, 4]);
|
||||
|
||||
// Test reset
|
||||
device.reset();
|
||||
device.read_config(0, &mut buf);
|
||||
assert_eq!(buf, [0, 0, 0, 0]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_device_features() {
|
||||
let features = DeviceFeatures::VERSION_1 | DeviceFeatures::RING_EVENT_IDX;
|
||||
assert!(features.contains(DeviceFeatures::VERSION_1));
|
||||
assert!(features.contains(DeviceFeatures::RING_EVENT_IDX));
|
||||
assert!(!features.contains(DeviceFeatures::RING_PACKED));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_config_structs_size() {
|
||||
// Verify config struct sizes match virtio spec
|
||||
assert!(std::mem::size_of::<VirtioBlockConfig>() >= 60);
|
||||
assert!(std::mem::size_of::<VirtioNetConfig>() >= 10);
|
||||
assert_eq!(std::mem::size_of::<VirtioVsockConfig>(), 8);
|
||||
}
|
||||
}
|
||||
745
vmm/src/devices/virtio/mmio.rs
Normal file
745
vmm/src/devices/virtio/mmio.rs
Normal file
@@ -0,0 +1,745 @@
|
||||
//! Virtio MMIO Transport Implementation
|
||||
//!
|
||||
//! Implements the virtio-mmio transport as specified in virtio 1.2 spec section 4.2.
|
||||
//! This provides memory-mapped register access for virtio device configuration.
|
||||
|
||||
use super::{status, DeviceType, VirtioDevice, VirtioError};
|
||||
use std::os::unix::io::RawFd;
|
||||
use std::sync::Arc;
|
||||
|
||||
/// Errors that can occur during MMIO transport operations
|
||||
#[derive(Debug, Clone)]
|
||||
#[allow(dead_code)]
|
||||
pub enum VirtioMmioError {
|
||||
/// Device not ready
|
||||
DeviceNotReady,
|
||||
/// Invalid queue configuration
|
||||
InvalidQueueConfig,
|
||||
/// Queue not ready
|
||||
QueueNotReady,
|
||||
/// Memory access error
|
||||
MemoryError(String),
|
||||
/// Device error
|
||||
DeviceError(VirtioError),
|
||||
/// Backend I/O error
|
||||
BackendIo(String),
|
||||
/// Invalid request
|
||||
InvalidRequest(String),
|
||||
}
|
||||
|
||||
impl std::fmt::Display for VirtioMmioError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
Self::DeviceNotReady => write!(f, "device not ready"),
|
||||
Self::InvalidQueueConfig => write!(f, "invalid queue configuration"),
|
||||
Self::QueueNotReady => write!(f, "queue not ready"),
|
||||
Self::MemoryError(msg) => write!(f, "memory error: {}", msg),
|
||||
Self::DeviceError(e) => write!(f, "device error: {:?}", e),
|
||||
Self::BackendIo(msg) => write!(f, "backend I/O error: {}", msg),
|
||||
Self::InvalidRequest(msg) => write!(f, "invalid request: {}", msg),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::error::Error for VirtioMmioError {}
|
||||
|
||||
impl From<VirtioError> for VirtioMmioError {
|
||||
fn from(e: VirtioError) -> Self {
|
||||
VirtioMmioError::DeviceError(e)
|
||||
}
|
||||
}
|
||||
|
||||
/// Guest memory trait for MMIO operations
|
||||
#[allow(dead_code)]
|
||||
pub trait GuestMemory: Send + Sync {
|
||||
fn read(&self, addr: u64, buf: &mut [u8]) -> Result<(), VirtioMmioError>;
|
||||
fn write(&self, addr: u64, buf: &[u8]) -> Result<(), VirtioMmioError>;
|
||||
}
|
||||
|
||||
/// Interrupt delivery trait
|
||||
pub trait InterruptDelivery: Send + Sync {
|
||||
fn signal(&self, vector: u32) -> Result<(), VirtioMmioError>;
|
||||
/// Deassert the IRQ line (for level-triggered interrupt deassertion).
|
||||
/// Called when the guest acknowledges all pending interrupts.
|
||||
fn deassert(&self) -> Result<(), VirtioMmioError> {
|
||||
Ok(()) // Default no-op for edge-triggered implementations
|
||||
}
|
||||
}
|
||||
|
||||
/// MMIO register offsets (virtio-mmio v2)
|
||||
pub mod regs {
|
||||
/// Magic value (0x74726976 = "virt")
|
||||
pub const MAGIC_VALUE: u64 = 0x000;
|
||||
/// Device version (2 for virtio 1.0+)
|
||||
pub const VERSION: u64 = 0x004;
|
||||
/// Virtio device ID
|
||||
pub const DEVICE_ID: u64 = 0x008;
|
||||
/// Virtio vendor ID
|
||||
pub const VENDOR_ID: u64 = 0x00c;
|
||||
/// Device features bits 0-31
|
||||
pub const DEVICE_FEATURES: u64 = 0x010;
|
||||
/// Device features selector
|
||||
pub const DEVICE_FEATURES_SEL: u64 = 0x014;
|
||||
/// Driver features bits 0-31
|
||||
pub const DRIVER_FEATURES: u64 = 0x020;
|
||||
/// Driver features selector
|
||||
pub const DRIVER_FEATURES_SEL: u64 = 0x024;
|
||||
/// Queue selector
|
||||
pub const QUEUE_SEL: u64 = 0x030;
|
||||
/// Maximum queue size
|
||||
pub const QUEUE_NUM_MAX: u64 = 0x034;
|
||||
/// Queue size
|
||||
pub const QUEUE_NUM: u64 = 0x038;
|
||||
/// Queue ready
|
||||
pub const QUEUE_READY: u64 = 0x044;
|
||||
/// Queue notify (write-only)
|
||||
pub const QUEUE_NOTIFY: u64 = 0x050;
|
||||
/// Interrupt status
|
||||
pub const INTERRUPT_STATUS: u64 = 0x060;
|
||||
/// Interrupt acknowledge (write-only)
|
||||
pub const INTERRUPT_ACK: u64 = 0x064;
|
||||
/// Device status
|
||||
pub const STATUS: u64 = 0x070;
|
||||
/// Queue descriptor low address
|
||||
pub const QUEUE_DESC_LOW: u64 = 0x080;
|
||||
/// Queue descriptor high address
|
||||
pub const QUEUE_DESC_HIGH: u64 = 0x084;
|
||||
/// Queue available low address
|
||||
pub const QUEUE_AVAIL_LOW: u64 = 0x090;
|
||||
/// Queue available high address
|
||||
pub const QUEUE_AVAIL_HIGH: u64 = 0x094;
|
||||
/// Queue used low address
|
||||
pub const QUEUE_USED_LOW: u64 = 0x0a0;
|
||||
/// Queue used high address
|
||||
pub const QUEUE_USED_HIGH: u64 = 0x0a4;
|
||||
/// Shared memory region info (v2)
|
||||
#[allow(dead_code)]
|
||||
pub const SHM_SEL: u64 = 0x0ac;
|
||||
#[allow(dead_code)]
|
||||
pub const SHM_LEN_LOW: u64 = 0x0b0;
|
||||
#[allow(dead_code)]
|
||||
pub const SHM_LEN_HIGH: u64 = 0x0b4;
|
||||
#[allow(dead_code)]
|
||||
pub const SHM_BASE_LOW: u64 = 0x0b8;
|
||||
#[allow(dead_code)]
|
||||
pub const SHM_BASE_HIGH: u64 = 0x0bc;
|
||||
/// Queue reset (v2)
|
||||
pub const QUEUE_RESET: u64 = 0x0c0;
|
||||
/// Config generation (v2)
|
||||
pub const CONFIG_GENERATION: u64 = 0x0fc;
|
||||
/// Config space starts at offset 0x100
|
||||
pub const CONFIG: u64 = 0x100;
|
||||
}
|
||||
|
||||
/// Interrupt status bits
|
||||
pub mod interrupt {
|
||||
/// Used buffer notification
|
||||
pub const USED_RING: u32 = 1;
|
||||
/// Configuration change
|
||||
#[allow(dead_code)]
|
||||
pub const CONFIG_CHANGE: u32 = 2;
|
||||
}
|
||||
|
||||
/// MMIO magic value
|
||||
pub const MAGIC: u32 = 0x74726976; // "virt" in little endian
|
||||
|
||||
/// MMIO version (2 = virtio 1.0+)
|
||||
pub const VERSION: u32 = 2;
|
||||
|
||||
/// Default vendor ID
|
||||
pub const VENDOR_ID: u32 = 0x4E6F7661; // "Nova"
|
||||
|
||||
/// MMIO region size
|
||||
#[allow(dead_code)]
|
||||
pub const MMIO_SIZE: u64 = 0x200;
|
||||
|
||||
/// Virtio MMIO transport state
|
||||
pub struct MmioTransport<D: VirtioDevice> {
|
||||
/// The underlying virtio device
|
||||
device: D,
|
||||
/// Guest memory interface
|
||||
mem: Option<Arc<dyn GuestMemory>>,
|
||||
/// Interrupt delivery
|
||||
irq: Option<Arc<dyn InterruptDelivery>>,
|
||||
/// Current device status
|
||||
device_status: u32,
|
||||
/// Device features selector (0 = low 32 bits, 1 = high 32 bits)
|
||||
device_features_sel: u32,
|
||||
/// Driver features
|
||||
driver_features: u64,
|
||||
/// Driver features selector
|
||||
driver_features_sel: u32,
|
||||
/// Selected queue index
|
||||
queue_sel: u32,
|
||||
/// Interrupt status
|
||||
interrupt_status: u32,
|
||||
/// Configuration generation counter
|
||||
config_generation: u32,
|
||||
/// Queue addresses (temporary storage until ready)
|
||||
queue_desc: [u64; 8],
|
||||
queue_avail: [u64; 8],
|
||||
queue_used: [u64; 8],
|
||||
queue_num: [u16; 8],
|
||||
queue_ready: [bool; 8],
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
impl<D: VirtioDevice> MmioTransport<D> {
|
||||
/// Create a new MMIO transport wrapping a virtio device
|
||||
pub fn new(device: D) -> Self {
|
||||
Self {
|
||||
device,
|
||||
mem: None,
|
||||
irq: None,
|
||||
device_status: 0,
|
||||
device_features_sel: 0,
|
||||
driver_features: 0,
|
||||
driver_features_sel: 0,
|
||||
queue_sel: 0,
|
||||
interrupt_status: 0,
|
||||
config_generation: 0,
|
||||
queue_desc: [0; 8],
|
||||
queue_avail: [0; 8],
|
||||
queue_used: [0; 8],
|
||||
queue_num: [0; 8],
|
||||
queue_ready: [false; 8],
|
||||
}
|
||||
}
|
||||
|
||||
/// Set guest memory interface
|
||||
pub fn set_memory(&mut self, mem: Arc<dyn GuestMemory>) {
|
||||
self.mem = Some(mem);
|
||||
}
|
||||
|
||||
/// Set interrupt delivery interface
|
||||
pub fn set_interrupt(&mut self, irq: Arc<dyn InterruptDelivery>) {
|
||||
self.irq = Some(irq);
|
||||
}
|
||||
|
||||
/// Get a reference to the underlying device
|
||||
pub fn device(&self) -> &D {
|
||||
&self.device
|
||||
}
|
||||
|
||||
/// Get a mutable reference to the underlying device
|
||||
pub fn device_mut(&mut self) -> &mut D {
|
||||
&mut self.device
|
||||
}
|
||||
|
||||
/// Handle MMIO read
|
||||
pub fn read(&self, offset: u64, data: &mut [u8]) {
|
||||
let len = data.len();
|
||||
if len != 4 && offset < regs::CONFIG {
|
||||
// Non-config reads must be 4 bytes
|
||||
data.fill(0);
|
||||
return;
|
||||
}
|
||||
|
||||
let value: u32 = match offset {
|
||||
regs::MAGIC_VALUE => MAGIC,
|
||||
regs::VERSION => VERSION,
|
||||
regs::DEVICE_ID => self.device.device_type() as u32,
|
||||
regs::VENDOR_ID => VENDOR_ID,
|
||||
regs::DEVICE_FEATURES => {
|
||||
let features = self.device.device_features();
|
||||
if self.device_features_sel == 0 {
|
||||
features as u32
|
||||
} else {
|
||||
(features >> 32) as u32
|
||||
}
|
||||
}
|
||||
regs::QUEUE_NUM_MAX => {
|
||||
let qidx = self.queue_sel;
|
||||
if (qidx as usize) < self.device.num_queues() {
|
||||
self.device.queue_max_size(qidx) as u32
|
||||
} else {
|
||||
0
|
||||
}
|
||||
}
|
||||
regs::QUEUE_NUM => {
|
||||
let qidx = self.queue_sel as usize;
|
||||
if qidx < 8 {
|
||||
self.queue_num[qidx] as u32
|
||||
} else {
|
||||
0
|
||||
}
|
||||
}
|
||||
regs::QUEUE_READY => {
|
||||
let qidx = self.queue_sel as usize;
|
||||
if qidx < 8 {
|
||||
self.queue_ready[qidx] as u32
|
||||
} else {
|
||||
0
|
||||
}
|
||||
}
|
||||
regs::INTERRUPT_STATUS => self.interrupt_status,
|
||||
regs::STATUS => self.device_status,
|
||||
regs::CONFIG_GENERATION => self.config_generation,
|
||||
regs::QUEUE_DESC_LOW => {
|
||||
let qidx = self.queue_sel as usize;
|
||||
if qidx < 8 { self.queue_desc[qidx] as u32 } else { 0 }
|
||||
}
|
||||
regs::QUEUE_DESC_HIGH => {
|
||||
let qidx = self.queue_sel as usize;
|
||||
if qidx < 8 { (self.queue_desc[qidx] >> 32) as u32 } else { 0 }
|
||||
}
|
||||
regs::QUEUE_AVAIL_LOW => {
|
||||
let qidx = self.queue_sel as usize;
|
||||
if qidx < 8 { self.queue_avail[qidx] as u32 } else { 0 }
|
||||
}
|
||||
regs::QUEUE_AVAIL_HIGH => {
|
||||
let qidx = self.queue_sel as usize;
|
||||
if qidx < 8 { (self.queue_avail[qidx] >> 32) as u32 } else { 0 }
|
||||
}
|
||||
regs::QUEUE_USED_LOW => {
|
||||
let qidx = self.queue_sel as usize;
|
||||
if qidx < 8 { self.queue_used[qidx] as u32 } else { 0 }
|
||||
}
|
||||
regs::QUEUE_USED_HIGH => {
|
||||
let qidx = self.queue_sel as usize;
|
||||
if qidx < 8 { (self.queue_used[qidx] >> 32) as u32 } else { 0 }
|
||||
}
|
||||
_ if offset >= regs::CONFIG => {
|
||||
// Config space read
|
||||
let config_offset = (offset - regs::CONFIG) as u32;
|
||||
self.device.read_config(config_offset as u32, data);
|
||||
return;
|
||||
}
|
||||
_ => 0,
|
||||
};
|
||||
|
||||
if len == 4 {
|
||||
data.copy_from_slice(&value.to_le_bytes());
|
||||
}
|
||||
}
|
||||
|
||||
/// Handle MMIO write
|
||||
pub fn write(&mut self, offset: u64, data: &[u8]) {
|
||||
let len = data.len();
|
||||
if len != 4 && offset < regs::CONFIG {
|
||||
// Non-config writes must be 4 bytes
|
||||
return;
|
||||
}
|
||||
|
||||
let value = if len >= 4 {
|
||||
u32::from_le_bytes(data[..4].try_into().unwrap())
|
||||
} else {
|
||||
0
|
||||
};
|
||||
|
||||
match offset {
|
||||
regs::DEVICE_FEATURES_SEL => {
|
||||
self.device_features_sel = value;
|
||||
}
|
||||
regs::DRIVER_FEATURES => {
|
||||
if self.driver_features_sel == 0 {
|
||||
self.driver_features = (self.driver_features & 0xFFFFFFFF00000000) | value as u64;
|
||||
} else {
|
||||
self.driver_features = (self.driver_features & 0x00000000FFFFFFFF) | ((value as u64) << 32);
|
||||
}
|
||||
}
|
||||
regs::DRIVER_FEATURES_SEL => {
|
||||
self.driver_features_sel = value;
|
||||
}
|
||||
regs::QUEUE_SEL => {
|
||||
self.queue_sel = value;
|
||||
}
|
||||
regs::QUEUE_NUM => {
|
||||
let qidx = self.queue_sel as usize;
|
||||
if qidx < 8 {
|
||||
self.queue_num[qidx] = value as u16;
|
||||
}
|
||||
}
|
||||
regs::QUEUE_READY => {
|
||||
let qidx = self.queue_sel as usize;
|
||||
if qidx < 8 {
|
||||
self.queue_ready[qidx] = value != 0;
|
||||
}
|
||||
}
|
||||
regs::QUEUE_NOTIFY => {
|
||||
// Notify the device about queue activity
|
||||
self.device.queue_notify(value);
|
||||
// Signal used-ring interrupt so the guest knows to process completions.
|
||||
// Without this, the guest never sees that its virtio requests completed.
|
||||
self.signal_used();
|
||||
}
|
||||
regs::INTERRUPT_ACK => {
|
||||
// Clear acknowledged interrupts
|
||||
self.interrupt_status &= !value;
|
||||
// Deassert IRQ line when all interrupts are acknowledged
|
||||
// (level-triggered: line must go low when no interrupts pending)
|
||||
if self.interrupt_status == 0 {
|
||||
if let Some(irq) = &self.irq {
|
||||
let _ = irq.deassert();
|
||||
}
|
||||
}
|
||||
}
|
||||
regs::STATUS => {
|
||||
self.handle_status_write(value);
|
||||
}
|
||||
regs::QUEUE_DESC_LOW => {
|
||||
let qidx = self.queue_sel as usize;
|
||||
if qidx < 8 {
|
||||
self.queue_desc[qidx] = (self.queue_desc[qidx] & 0xFFFFFFFF00000000) | value as u64;
|
||||
}
|
||||
}
|
||||
regs::QUEUE_DESC_HIGH => {
|
||||
let qidx = self.queue_sel as usize;
|
||||
if qidx < 8 {
|
||||
self.queue_desc[qidx] = (self.queue_desc[qidx] & 0x00000000FFFFFFFF) | ((value as u64) << 32);
|
||||
}
|
||||
}
|
||||
regs::QUEUE_AVAIL_LOW => {
|
||||
let qidx = self.queue_sel as usize;
|
||||
if qidx < 8 {
|
||||
self.queue_avail[qidx] = (self.queue_avail[qidx] & 0xFFFFFFFF00000000) | value as u64;
|
||||
}
|
||||
}
|
||||
regs::QUEUE_AVAIL_HIGH => {
|
||||
let qidx = self.queue_sel as usize;
|
||||
if qidx < 8 {
|
||||
self.queue_avail[qidx] = (self.queue_avail[qidx] & 0x00000000FFFFFFFF) | ((value as u64) << 32);
|
||||
}
|
||||
}
|
||||
regs::QUEUE_USED_LOW => {
|
||||
let qidx = self.queue_sel as usize;
|
||||
if qidx < 8 {
|
||||
self.queue_used[qidx] = (self.queue_used[qidx] & 0xFFFFFFFF00000000) | value as u64;
|
||||
}
|
||||
}
|
||||
regs::QUEUE_USED_HIGH => {
|
||||
let qidx = self.queue_sel as usize;
|
||||
if qidx < 8 {
|
||||
self.queue_used[qidx] = (self.queue_used[qidx] & 0x00000000FFFFFFFF) | ((value as u64) << 32);
|
||||
}
|
||||
}
|
||||
regs::QUEUE_RESET => {
|
||||
// Queue reset (virtio 1.1+)
|
||||
if value == 1 {
|
||||
let qidx = self.queue_sel as usize;
|
||||
if qidx < 8 {
|
||||
self.queue_desc[qidx] = 0;
|
||||
self.queue_avail[qidx] = 0;
|
||||
self.queue_used[qidx] = 0;
|
||||
self.queue_num[qidx] = 0;
|
||||
self.queue_ready[qidx] = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
_ if offset >= regs::CONFIG => {
|
||||
// Config space write
|
||||
let config_offset = (offset - regs::CONFIG) as u32;
|
||||
self.device.write_config(config_offset as u32, data);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
/// Handle device status register write
|
||||
fn handle_status_write(&mut self, value: u32) {
|
||||
// Writing 0 resets the device
|
||||
if value == 0 {
|
||||
self.reset();
|
||||
return;
|
||||
}
|
||||
|
||||
let old_status = self.device_status;
|
||||
self.device_status = value;
|
||||
|
||||
// Check for FEATURES_OK transition
|
||||
if value & status::FEATURES_OK != 0 && old_status & status::FEATURES_OK == 0 {
|
||||
// Feature negotiation complete - validate and set features
|
||||
let accepted = self.device.set_driver_features(self.driver_features);
|
||||
if accepted != self.driver_features {
|
||||
// Some features rejected - guest should re-read FEATURES_OK
|
||||
// For now, accept what device supports
|
||||
self.driver_features = accepted;
|
||||
}
|
||||
}
|
||||
|
||||
// Check for DRIVER_OK transition
|
||||
if value & status::DRIVER_OK != 0 && old_status & status::DRIVER_OK == 0 {
|
||||
self.activate_device();
|
||||
}
|
||||
}
|
||||
|
||||
/// Activate the device after DRIVER_OK is set
|
||||
fn activate_device(&mut self) {
|
||||
// Propagate queue configuration from MMIO transport to device
|
||||
let num_queues = self.device.num_queues();
|
||||
for qidx in 0..num_queues.min(8) {
|
||||
if self.queue_ready[qidx] && self.queue_num[qidx] > 0 {
|
||||
self.device.setup_queue(
|
||||
qidx as u32,
|
||||
self.queue_num[qidx],
|
||||
self.queue_desc[qidx],
|
||||
self.queue_avail[qidx],
|
||||
self.queue_used[qidx],
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
if let (Some(mem), Some(irq)) = (&self.mem, &self.irq) {
|
||||
if let Err(e) = self.device.activate(mem.clone(), irq.clone()) {
|
||||
tracing::error!("Failed to activate virtio device: {}", e);
|
||||
self.device_status |= status::DEVICE_NEEDS_RESET;
|
||||
}
|
||||
} else {
|
||||
tracing::warn!(
|
||||
"Device activation without mem/irq - mem={}, irq={}",
|
||||
self.mem.is_some(), self.irq.is_some()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/// Reset the device and transport state
|
||||
pub fn reset(&mut self) {
|
||||
self.device.reset();
|
||||
self.device_status = 0;
|
||||
self.device_features_sel = 0;
|
||||
self.driver_features = 0;
|
||||
self.driver_features_sel = 0;
|
||||
self.queue_sel = 0;
|
||||
self.interrupt_status = 0;
|
||||
self.queue_desc = [0; 8];
|
||||
self.queue_avail = [0; 8];
|
||||
self.queue_used = [0; 8];
|
||||
self.queue_num = [0; 8];
|
||||
self.queue_ready = [false; 8];
|
||||
}
|
||||
|
||||
/// Signal an interrupt to the guest
|
||||
pub fn signal_used(&mut self) {
|
||||
self.interrupt_status |= interrupt::USED_RING;
|
||||
if let Some(irq) = &self.irq {
|
||||
let _ = irq.signal(0); // Vector 0 for used ring
|
||||
}
|
||||
}
|
||||
|
||||
/// Signal a configuration change
|
||||
pub fn signal_config_change(&mut self) {
|
||||
self.config_generation = self.config_generation.wrapping_add(1);
|
||||
self.interrupt_status |= interrupt::CONFIG_CHANGE;
|
||||
if let Some(irq) = &self.irq {
|
||||
let _ = irq.signal(1); // Vector 1 for config change
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Dynamic Dispatch Trait for MMIO Devices
|
||||
// ============================================================================
|
||||
|
||||
/// Type-erased interface for MMIO-mapped virtio devices.
|
||||
///
|
||||
/// This allows the device manager to store heterogeneous virtio devices
|
||||
/// (net, block, etc.) behind a single trait object.
|
||||
#[allow(dead_code)]
|
||||
pub trait DynMmioDevice: Send {
|
||||
/// Handle an MMIO read at the given offset within this device's region
|
||||
fn mmio_read(&self, offset: u64, data: &mut [u8]);
|
||||
/// Handle an MMIO write at the given offset within this device's region
|
||||
fn mmio_write(&mut self, offset: u64, data: &[u8]);
|
||||
/// Set the guest memory interface for DMA and queue access
|
||||
fn set_memory(&mut self, mem: Arc<dyn GuestMemory>);
|
||||
/// Set the interrupt delivery callback
|
||||
fn set_interrupt(&mut self, irq: Arc<dyn InterruptDelivery>);
|
||||
/// Signal that used buffers are available (triggers IRQ)
|
||||
fn signal_used(&mut self);
|
||||
/// Get the TAP fd if this is a net device (for RX polling)
|
||||
fn tap_fd(&self) -> Option<RawFd>;
|
||||
/// Process TAP RX event (only meaningful for net devices)
|
||||
fn handle_tap_event(&mut self);
|
||||
/// Get the device type
|
||||
fn device_type_id(&self) -> DeviceType;
|
||||
}
|
||||
|
||||
impl<D: VirtioDevice + 'static> DynMmioDevice for MmioTransport<D> {
|
||||
fn mmio_read(&self, offset: u64, data: &mut [u8]) {
|
||||
self.read(offset, data);
|
||||
}
|
||||
|
||||
fn mmio_write(&mut self, offset: u64, data: &[u8]) {
|
||||
self.write(offset, data);
|
||||
}
|
||||
|
||||
fn set_memory(&mut self, mem: Arc<dyn GuestMemory>) {
|
||||
MmioTransport::set_memory(self, mem);
|
||||
}
|
||||
|
||||
fn set_interrupt(&mut self, irq: Arc<dyn InterruptDelivery>) {
|
||||
MmioTransport::set_interrupt(self, irq);
|
||||
}
|
||||
|
||||
fn signal_used(&mut self) {
|
||||
MmioTransport::signal_used(self);
|
||||
}
|
||||
|
||||
fn tap_fd(&self) -> Option<RawFd> {
|
||||
None // Default: not a net device
|
||||
}
|
||||
|
||||
fn handle_tap_event(&mut self) {
|
||||
// Default: no-op for non-net devices
|
||||
}
|
||||
|
||||
fn device_type_id(&self) -> DeviceType {
|
||||
self.device.device_type()
|
||||
}
|
||||
}
|
||||
|
||||
/// Specialized implementation for VirtioNet MMIO transport
|
||||
/// that exposes TAP fd and RX event handling.
|
||||
pub struct NetMmioTransport {
|
||||
inner: MmioTransport<super::net::VirtioNet>,
|
||||
tap_fd: RawFd,
|
||||
}
|
||||
|
||||
impl NetMmioTransport {
|
||||
pub fn new(device: super::net::VirtioNet) -> Self {
|
||||
let tap_fd = device.tap_fd();
|
||||
Self {
|
||||
inner: MmioTransport::new(device),
|
||||
tap_fd,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl DynMmioDevice for NetMmioTransport {
|
||||
fn mmio_read(&self, offset: u64, data: &mut [u8]) {
|
||||
self.inner.read(offset, data);
|
||||
}
|
||||
|
||||
fn mmio_write(&mut self, offset: u64, data: &[u8]) {
|
||||
self.inner.write(offset, data);
|
||||
}
|
||||
|
||||
fn set_memory(&mut self, mem: Arc<dyn GuestMemory>) {
|
||||
self.inner.set_memory(mem);
|
||||
}
|
||||
|
||||
fn set_interrupt(&mut self, irq: Arc<dyn InterruptDelivery>) {
|
||||
self.inner.set_interrupt(irq);
|
||||
}
|
||||
|
||||
fn signal_used(&mut self) {
|
||||
self.inner.signal_used();
|
||||
}
|
||||
|
||||
fn tap_fd(&self) -> Option<RawFd> {
|
||||
Some(self.tap_fd)
|
||||
}
|
||||
|
||||
fn handle_tap_event(&mut self) {
|
||||
self.inner.device_mut().handle_tap_event();
|
||||
}
|
||||
|
||||
fn device_type_id(&self) -> DeviceType {
|
||||
DeviceType::Net
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
// Mock device for testing
|
||||
struct MockDevice;
|
||||
|
||||
impl VirtioDevice for MockDevice {
|
||||
fn device_type(&self) -> DeviceType {
|
||||
DeviceType::Net
|
||||
}
|
||||
|
||||
fn device_features(&self) -> u64 {
|
||||
0x1234_5678_9ABC_DEF0
|
||||
}
|
||||
|
||||
fn set_driver_features(&mut self, features: u64) -> u64 {
|
||||
features
|
||||
}
|
||||
|
||||
fn config_size(&self) -> u32 {
|
||||
16
|
||||
}
|
||||
|
||||
fn read_config(&self, offset: u32, data: &mut [u8]) {
|
||||
for (i, byte) in data.iter_mut().enumerate() {
|
||||
*byte = (offset as u8).wrapping_add(i as u8);
|
||||
}
|
||||
}
|
||||
|
||||
fn write_config(&mut self, _offset: u32, _data: &[u8]) {}
|
||||
|
||||
fn activate(&mut self, _mem: Arc<dyn GuestMemory>, _irq: Arc<dyn InterruptDelivery>) -> std::result::Result<(), VirtioError> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn reset(&mut self) {}
|
||||
|
||||
fn num_queues(&self) -> usize {
|
||||
2
|
||||
}
|
||||
|
||||
fn queue_notify(&mut self, _queue_index: u32) {}
|
||||
|
||||
fn queue_max_size(&self, _queue_index: u32) -> u16 {
|
||||
256
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_magic_version() {
|
||||
let transport = MmioTransport::new(MockDevice);
|
||||
let mut data = [0u8; 4];
|
||||
|
||||
transport.read(regs::MAGIC_VALUE, &mut data);
|
||||
assert_eq!(u32::from_le_bytes(data), MAGIC);
|
||||
|
||||
transport.read(regs::VERSION, &mut data);
|
||||
assert_eq!(u32::from_le_bytes(data), VERSION);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_device_id() {
|
||||
let transport = MmioTransport::new(MockDevice);
|
||||
let mut data = [0u8; 4];
|
||||
|
||||
transport.read(regs::DEVICE_ID, &mut data);
|
||||
assert_eq!(u32::from_le_bytes(data), DeviceType::Net as u32);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_features_selection() {
|
||||
let mut transport = MmioTransport::new(MockDevice);
|
||||
let mut data = [0u8; 4];
|
||||
|
||||
// Select low 32 bits
|
||||
transport.write(regs::DEVICE_FEATURES_SEL, &0u32.to_le_bytes());
|
||||
transport.read(regs::DEVICE_FEATURES, &mut data);
|
||||
assert_eq!(u32::from_le_bytes(data), 0x9ABC_DEF0);
|
||||
|
||||
// Select high 32 bits
|
||||
transport.write(regs::DEVICE_FEATURES_SEL, &1u32.to_le_bytes());
|
||||
transport.read(regs::DEVICE_FEATURES, &mut data);
|
||||
assert_eq!(u32::from_le_bytes(data), 0x1234_5678);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_status_reset() {
|
||||
let mut transport = MmioTransport::new(MockDevice);
|
||||
|
||||
// Set some status
|
||||
transport.write(regs::STATUS, &(status::ACKNOWLEDGE | status::DRIVER).to_le_bytes());
|
||||
|
||||
let mut data = [0u8; 4];
|
||||
transport.read(regs::STATUS, &mut data);
|
||||
assert_eq!(u32::from_le_bytes(data), status::ACKNOWLEDGE | status::DRIVER);
|
||||
|
||||
// Reset by writing 0
|
||||
transport.write(regs::STATUS, &0u32.to_le_bytes());
|
||||
|
||||
transport.read(regs::STATUS, &mut data);
|
||||
assert_eq!(u32::from_le_bytes(data), 0);
|
||||
}
|
||||
}
|
||||
544
vmm/src/devices/virtio/mod.rs
Normal file
544
vmm/src/devices/virtio/mod.rs
Normal file
@@ -0,0 +1,544 @@
|
||||
//! VirtIO device implementations for Volt VMM
|
||||
//!
|
||||
//! This module provides virtio device emulation compatible with the
|
||||
//! virtio-mmio transport. Devices follow the virtio 1.0+ specification.
|
||||
|
||||
pub mod block;
|
||||
pub mod mmio;
|
||||
pub mod net;
|
||||
pub mod queue;
|
||||
pub mod stellarium_blk;
|
||||
|
||||
// Re-export common types for submodule use
|
||||
// Re-export mmio types
|
||||
pub use mmio::{GuestMemory as MmioGuestMemory, InterruptDelivery};
|
||||
|
||||
/// Generic error type alias for virtio operations
|
||||
#[allow(dead_code)]
|
||||
pub type Error = VirtioError;
|
||||
|
||||
/// Result type alias
|
||||
#[allow(dead_code)]
|
||||
pub type Result<T> = std::result::Result<T, Error>;
|
||||
|
||||
/// TAP device errors (used by net devices)
|
||||
#[derive(Debug, Clone)]
|
||||
#[allow(dead_code)]
|
||||
pub enum TapError {
|
||||
Open(String),
|
||||
Configure(String),
|
||||
Ioctl(String),
|
||||
Create(std::io::ErrorKind, String),
|
||||
VnetHdr(std::io::ErrorKind, String),
|
||||
Offload(std::io::ErrorKind, String),
|
||||
SetNonBlocking(std::io::ErrorKind, String),
|
||||
}
|
||||
|
||||
impl TapError {
|
||||
/// Create a TapError::Create from an io::Error
|
||||
pub fn create(e: std::io::Error) -> Self {
|
||||
Self::Create(e.kind(), e.to_string())
|
||||
}
|
||||
|
||||
/// Create a TapError::VnetHdr from an io::Error
|
||||
pub fn vnet_hdr(e: std::io::Error) -> Self {
|
||||
Self::VnetHdr(e.kind(), e.to_string())
|
||||
}
|
||||
|
||||
/// Create a TapError::Offload from an io::Error
|
||||
pub fn offload(e: std::io::Error) -> Self {
|
||||
Self::Offload(e.kind(), e.to_string())
|
||||
}
|
||||
|
||||
/// Create a TapError::SetNonBlocking from an io::Error
|
||||
pub fn set_nonblocking(e: std::io::Error) -> Self {
|
||||
Self::SetNonBlocking(e.kind(), e.to_string())
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for TapError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
Self::Open(s) => write!(f, "failed to open TAP: {}", s),
|
||||
Self::Configure(s) => write!(f, "failed to configure TAP: {}", s),
|
||||
Self::Ioctl(s) => write!(f, "TAP ioctl failed: {}", s),
|
||||
Self::Create(_, s) => write!(f, "failed to create TAP: {}", s),
|
||||
Self::VnetHdr(_, s) => write!(f, "failed to set VNET_HDR: {}", s),
|
||||
Self::Offload(_, s) => write!(f, "failed to set offload: {}", s),
|
||||
Self::SetNonBlocking(_, s) => write!(f, "failed to set non-blocking: {}", s),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::error::Error for TapError {}
|
||||
|
||||
use std::sync::atomic::{AtomicU32, Ordering};
|
||||
|
||||
/// VirtIO device status bits (virtio spec 2.1)
|
||||
#[allow(dead_code)] // Virtio spec constants — kept for completeness
|
||||
pub mod status {
|
||||
pub const ACKNOWLEDGE: u32 = 1;
|
||||
pub const DRIVER: u32 = 2;
|
||||
pub const DRIVER_OK: u32 = 4;
|
||||
pub const FEATURES_OK: u32 = 8;
|
||||
pub const DEVICE_NEEDS_RESET: u32 = 64;
|
||||
pub const FAILED: u32 = 128;
|
||||
}
|
||||
|
||||
/// Common virtio feature bits
|
||||
#[allow(dead_code)] // Virtio spec feature flags — kept for completeness
|
||||
pub mod features {
|
||||
/// Ring event index support
|
||||
pub const VIRTIO_F_RING_EVENT_IDX: u64 = 1 << 29;
|
||||
/// Virtio version 1
|
||||
pub const VIRTIO_F_VERSION_1: u64 = 1 << 32;
|
||||
/// Access platform-specific memory
|
||||
pub const VIRTIO_F_ACCESS_PLATFORM: u64 = 1 << 33;
|
||||
/// Ring packed layout
|
||||
pub const VIRTIO_F_RING_PACKED: u64 = 1 << 34;
|
||||
/// In-order completion
|
||||
pub const VIRTIO_F_IN_ORDER: u64 = 1 << 35;
|
||||
/// Memory ordering guarantees
|
||||
pub const VIRTIO_F_ORDER_PLATFORM: u64 = 1 << 36;
|
||||
/// Single Root I/O Virtualization
|
||||
pub const VIRTIO_F_SR_IOV: u64 = 1 << 37;
|
||||
/// Notification data
|
||||
pub const VIRTIO_F_NOTIFICATION_DATA: u64 = 1 << 38;
|
||||
|
||||
// Ring descriptor flags (from virtio_ring.h)
|
||||
/// Indirect descriptors
|
||||
pub const VIRTIO_RING_F_INDIRECT_DESC: u64 = 1 << 28;
|
||||
/// Event index (same as VIRTIO_F_RING_EVENT_IDX for ring features)
|
||||
pub const VIRTIO_RING_F_EVENT_IDX: u64 = 1 << 29;
|
||||
}
|
||||
|
||||
/// VirtIO device types
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
#[repr(u32)]
|
||||
#[allow(dead_code)]
|
||||
pub enum DeviceType {
|
||||
Net = 1,
|
||||
Block = 2,
|
||||
Console = 3,
|
||||
Entropy = 4,
|
||||
Balloon = 5,
|
||||
IoMemory = 6,
|
||||
Rpmsg = 7,
|
||||
Scsi = 8,
|
||||
Transport9P = 9,
|
||||
Mac80211Wlan = 10,
|
||||
RprocSerial = 11,
|
||||
Caif = 12,
|
||||
MemoryBalloon = 13,
|
||||
Gpu = 16,
|
||||
Timer = 17,
|
||||
Input = 18,
|
||||
Socket = 19,
|
||||
Crypto = 20,
|
||||
SignalDist = 21,
|
||||
Pstore = 22,
|
||||
Iommu = 23,
|
||||
Memory = 24,
|
||||
Vsock = 25,
|
||||
}
|
||||
|
||||
/// Result type for virtio operations (same as Result from prelude)
|
||||
pub type VirtioResult<T> = std::result::Result<T, VirtioError>;
|
||||
|
||||
/// Errors that can occur in virtio device operations
|
||||
#[derive(Debug, Clone)]
|
||||
#[allow(dead_code)] // Error variants for completeness
|
||||
pub enum VirtioError {
|
||||
/// Invalid descriptor index
|
||||
InvalidDescriptorIndex(u16),
|
||||
/// Descriptor chain is too short
|
||||
DescriptorChainTooShort,
|
||||
/// Descriptor chain is too long
|
||||
DescriptorChainTooLong,
|
||||
/// Invalid guest memory address
|
||||
InvalidGuestAddress(u64),
|
||||
/// Queue not ready
|
||||
QueueNotReady,
|
||||
/// Device not ready
|
||||
DeviceNotReady,
|
||||
/// Backend I/O error
|
||||
BackendIo(String),
|
||||
/// Invalid request type
|
||||
InvalidRequestType(u32),
|
||||
/// Feature negotiation failed
|
||||
FeatureNegotiationFailed,
|
||||
/// Invalid queue configuration
|
||||
InvalidQueueConfig,
|
||||
/// Buffer too small for operation
|
||||
BufferTooSmall { needed: usize, available: usize },
|
||||
}
|
||||
|
||||
impl std::fmt::Display for VirtioError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
Self::InvalidDescriptorIndex(idx) => write!(f, "invalid descriptor index: {}", idx),
|
||||
Self::DescriptorChainTooShort => write!(f, "descriptor chain too short"),
|
||||
Self::DescriptorChainTooLong => write!(f, "descriptor chain too long"),
|
||||
Self::InvalidGuestAddress(addr) => write!(f, "invalid guest address: {:#x}", addr),
|
||||
Self::QueueNotReady => write!(f, "queue not ready"),
|
||||
Self::DeviceNotReady => write!(f, "device not ready"),
|
||||
Self::BackendIo(msg) => write!(f, "backend I/O error: {}", msg),
|
||||
Self::InvalidRequestType(t) => write!(f, "invalid request type: {}", t),
|
||||
Self::FeatureNegotiationFailed => write!(f, "feature negotiation failed"),
|
||||
Self::InvalidQueueConfig => write!(f, "invalid queue configuration"),
|
||||
Self::BufferTooSmall { needed, available } => {
|
||||
write!(f, "buffer too small: needed {} bytes, have {}", needed, available)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::error::Error for VirtioError {}
|
||||
|
||||
/// Trait for virtio devices
|
||||
pub trait VirtioDevice: Send + Sync {
|
||||
/// Device type (virtio spec device ID)
|
||||
fn device_type(&self) -> DeviceType;
|
||||
|
||||
/// Features supported by this device
|
||||
fn device_features(&self) -> u64;
|
||||
|
||||
/// Get the number of queues this device uses
|
||||
fn num_queues(&self) -> usize;
|
||||
|
||||
/// Get the maximum queue size for a given queue
|
||||
fn queue_max_size(&self, _queue_index: u32) -> u16 {
|
||||
256
|
||||
}
|
||||
|
||||
/// Read from device-specific config space
|
||||
fn read_config(&self, offset: u32, data: &mut [u8]);
|
||||
|
||||
/// Write to device-specific config space
|
||||
fn write_config(&mut self, offset: u32, data: &[u8]);
|
||||
|
||||
/// Set driver-negotiated features, returns accepted features
|
||||
fn set_driver_features(&mut self, features: u64) -> u64 {
|
||||
// Default: accept all features that device supports
|
||||
features & self.device_features()
|
||||
}
|
||||
|
||||
/// Activate the device with negotiated features and memory
|
||||
fn activate(
|
||||
&mut self,
|
||||
mem: std::sync::Arc<dyn MmioGuestMemory>,
|
||||
irq: std::sync::Arc<dyn InterruptDelivery>,
|
||||
) -> std::result::Result<(), VirtioError>;
|
||||
|
||||
/// Reset the device to initial state
|
||||
fn reset(&mut self);
|
||||
|
||||
/// Process available descriptors in the given queue (called on queue notify)
|
||||
fn queue_notify(&mut self, queue_index: u32);
|
||||
|
||||
/// Configure a queue's addresses and size (called by MMIO transport before activation)
|
||||
fn setup_queue(&mut self, queue_index: u32, size: u16, desc: u64, avail: u64, used: u64) {
|
||||
let _ = (queue_index, size, desc, avail, used);
|
||||
// Default: no-op. Devices that manage their own Queue structs should override.
|
||||
}
|
||||
|
||||
/// Get the size of device-specific config space
|
||||
#[allow(dead_code)]
|
||||
fn config_size(&self) -> u32 {
|
||||
0
|
||||
}
|
||||
}
|
||||
|
||||
/// Guest memory abstraction for virtio devices
|
||||
///
|
||||
/// This provides a safe interface to read/write guest memory for
|
||||
/// descriptor processing.
|
||||
#[derive(Clone)]
|
||||
pub struct GuestMemory {
|
||||
/// Base address of guest physical memory in host virtual address space
|
||||
base: *mut u8,
|
||||
/// Size of guest memory in bytes
|
||||
size: usize,
|
||||
}
|
||||
|
||||
// Safety: GuestMemory is just a pointer to mapped memory, safe to send between threads
|
||||
unsafe impl Send for GuestMemory {}
|
||||
unsafe impl Sync for GuestMemory {}
|
||||
|
||||
impl GuestMemory {
|
||||
/// Create a new guest memory wrapper
|
||||
///
|
||||
/// # Safety
|
||||
/// The caller must ensure that `base` points to valid mapped memory of
|
||||
/// at least `size` bytes that remains valid for the lifetime of this object.
|
||||
pub unsafe fn new(base: *mut u8, size: usize) -> Self {
|
||||
Self { base, size }
|
||||
}
|
||||
|
||||
/// Check if a guest physical address range is valid
|
||||
pub fn check_range(&self, addr: u64, len: usize) -> bool {
|
||||
let end = addr.checked_add(len as u64);
|
||||
match end {
|
||||
Some(e) => e <= self.size as u64,
|
||||
None => false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Get a slice to guest memory at the given guest physical address
|
||||
///
|
||||
/// # Safety
|
||||
/// Caller must ensure no concurrent writes to this region.
|
||||
pub unsafe fn slice(&self, addr: u64, len: usize) -> VirtioResult<&[u8]> {
|
||||
if !self.check_range(addr, len) {
|
||||
return Err(VirtioError::InvalidGuestAddress(addr));
|
||||
}
|
||||
Ok(std::slice::from_raw_parts(self.base.add(addr as usize), len))
|
||||
}
|
||||
|
||||
/// Get a mutable slice to guest memory at the given guest physical address
|
||||
///
|
||||
/// # Safety
|
||||
/// Caller must ensure exclusive access to this region.
|
||||
pub unsafe fn slice_mut(&self, addr: u64, len: usize) -> VirtioResult<&mut [u8]> {
|
||||
if !self.check_range(addr, len) {
|
||||
return Err(VirtioError::InvalidGuestAddress(addr));
|
||||
}
|
||||
Ok(std::slice::from_raw_parts_mut(self.base.add(addr as usize), len))
|
||||
}
|
||||
|
||||
/// Read bytes from guest memory
|
||||
pub fn read(&self, addr: u64, buf: &mut [u8]) -> VirtioResult<()> {
|
||||
// Safety: read-only access, no concurrent modification expected
|
||||
let src = unsafe { self.slice(addr, buf.len())? };
|
||||
buf.copy_from_slice(src);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Write bytes to guest memory
|
||||
pub fn write(&self, addr: u64, buf: &[u8]) -> VirtioResult<()> {
|
||||
// Safety: exclusive write access assumed during device processing
|
||||
let dst = unsafe { self.slice_mut(addr, buf.len())? };
|
||||
dst.copy_from_slice(buf);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Read a value from guest memory
|
||||
pub fn read_obj<T: Copy>(&self, addr: u64) -> VirtioResult<T> {
|
||||
let mut buf = vec![0u8; std::mem::size_of::<T>()];
|
||||
self.read(addr, &mut buf)?;
|
||||
// Safety: T is Copy, so any bit pattern is valid for basic types
|
||||
Ok(unsafe { std::ptr::read_unaligned(buf.as_ptr() as *const T) })
|
||||
}
|
||||
|
||||
/// Write a value to guest memory
|
||||
pub fn write_obj<T: Copy>(&self, addr: u64, val: &T) -> VirtioResult<()> {
|
||||
let buf = unsafe {
|
||||
std::slice::from_raw_parts(val as *const T as *const u8, std::mem::size_of::<T>())
|
||||
};
|
||||
self.write(addr, buf)
|
||||
}
|
||||
}
|
||||
|
||||
/// Implement the MMIO GuestMemory trait for our GuestMemory struct
|
||||
impl MmioGuestMemory for GuestMemory {
|
||||
fn read(&self, addr: u64, buf: &mut [u8]) -> std::result::Result<(), mmio::VirtioMmioError> {
|
||||
GuestMemory::read(self, addr, buf).map_err(|e| mmio::VirtioMmioError::DeviceError(e))
|
||||
}
|
||||
|
||||
fn write(&self, addr: u64, buf: &[u8]) -> std::result::Result<(), mmio::VirtioMmioError> {
|
||||
GuestMemory::write(self, addr, buf).map_err(|e| mmio::VirtioMmioError::DeviceError(e))
|
||||
}
|
||||
}
|
||||
|
||||
/// Virtqueue implementation
|
||||
#[allow(dead_code)]
|
||||
pub struct Queue {
|
||||
/// Maximum size of the queue
|
||||
pub max_size: u16,
|
||||
/// Actual size (set by driver, must be power of 2)
|
||||
pub size: u16,
|
||||
/// Queue ready flag
|
||||
pub ready: bool,
|
||||
/// Descriptor table guest physical address
|
||||
pub desc_table: u64,
|
||||
/// Available ring guest physical address
|
||||
pub avail_ring: u64,
|
||||
/// Used ring guest physical address
|
||||
pub used_ring: u64,
|
||||
/// Index into the available ring for next descriptor to process
|
||||
next_avail: u16,
|
||||
/// Index into the used ring for next used entry
|
||||
next_used: u16,
|
||||
/// Interrupt signaled (for coalescing)
|
||||
signaled: AtomicU32,
|
||||
}
|
||||
|
||||
impl Queue {
|
||||
/// Create a new queue with the given maximum size
|
||||
pub fn new(max_size: u16) -> Self {
|
||||
Self {
|
||||
max_size,
|
||||
size: 0,
|
||||
ready: false,
|
||||
desc_table: 0,
|
||||
avail_ring: 0,
|
||||
used_ring: 0,
|
||||
next_avail: 0,
|
||||
next_used: 0,
|
||||
signaled: AtomicU32::new(0),
|
||||
}
|
||||
}
|
||||
|
||||
/// Check if the queue is properly configured and ready
|
||||
pub fn is_ready(&self) -> bool {
|
||||
self.ready && self.size > 0 && self.size.is_power_of_two()
|
||||
}
|
||||
|
||||
/// Get the next available descriptor chain head
|
||||
pub fn pop_avail(&mut self, mem: &GuestMemory) -> VirtioResult<Option<u16>> {
|
||||
if !self.is_ready() {
|
||||
return Err(VirtioError::QueueNotReady);
|
||||
}
|
||||
|
||||
// Read the available ring index (avail->idx is at offset 2)
|
||||
let avail_idx: u16 = mem.read_obj(self.avail_ring + 2)?;
|
||||
|
||||
// Check if there's anything available
|
||||
if self.next_avail == avail_idx {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
// Read the descriptor index from the ring
|
||||
// avail->ring starts at offset 4
|
||||
let ring_offset = 4 + (self.next_avail % self.size) as u64 * 2;
|
||||
let desc_idx: u16 = mem.read_obj(self.avail_ring + ring_offset)?;
|
||||
|
||||
self.next_avail = self.next_avail.wrapping_add(1);
|
||||
|
||||
Ok(Some(desc_idx))
|
||||
}
|
||||
|
||||
/// Add a used descriptor to the used ring
|
||||
pub fn push_used(&mut self, mem: &GuestMemory, desc_idx: u16, len: u32) -> VirtioResult<()> {
|
||||
if !self.is_ready() {
|
||||
return Err(VirtioError::QueueNotReady);
|
||||
}
|
||||
|
||||
// Write to the used ring entry
|
||||
// used->ring starts at offset 4, each entry is 8 bytes (id: u32, len: u32)
|
||||
let ring_offset = 4 + (self.next_used % self.size) as u64 * 8;
|
||||
|
||||
// Write the used element (id and len)
|
||||
mem.write_obj(self.used_ring + ring_offset, &(desc_idx as u32))?;
|
||||
mem.write_obj(self.used_ring + ring_offset + 4, &len)?;
|
||||
|
||||
// Memory barrier (compiler fence at minimum)
|
||||
std::sync::atomic::fence(Ordering::Release);
|
||||
|
||||
// Update the used index (used->idx is at offset 2)
|
||||
self.next_used = self.next_used.wrapping_add(1);
|
||||
mem.write_obj(self.used_ring + 2, &self.next_used)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Reset the queue to initial state
|
||||
pub fn reset(&mut self) {
|
||||
self.size = 0;
|
||||
self.ready = false;
|
||||
self.desc_table = 0;
|
||||
self.avail_ring = 0;
|
||||
self.used_ring = 0;
|
||||
self.next_avail = 0;
|
||||
self.next_used = 0;
|
||||
self.signaled.store(0, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
/// Descriptor chain iterator for processing virtqueue requests
|
||||
pub struct DescriptorChain<'a> {
|
||||
mem: &'a GuestMemory,
|
||||
desc_table: u64,
|
||||
queue_size: u16,
|
||||
current: Option<u16>,
|
||||
count: u16,
|
||||
max_chain_len: u16,
|
||||
}
|
||||
|
||||
impl<'a> DescriptorChain<'a> {
|
||||
/// Create a new descriptor chain starting at the given index
|
||||
pub fn new(mem: &'a GuestMemory, desc_table: u64, queue_size: u16, head: u16) -> Self {
|
||||
Self {
|
||||
mem,
|
||||
desc_table,
|
||||
queue_size,
|
||||
current: Some(head),
|
||||
count: 0,
|
||||
max_chain_len: queue_size, // Prevent infinite loops
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the next descriptor in the chain
|
||||
pub fn next(&mut self) -> VirtioResult<Option<Descriptor>> {
|
||||
let idx = match self.current {
|
||||
Some(i) => i,
|
||||
None => return Ok(None),
|
||||
};
|
||||
|
||||
if idx >= self.queue_size {
|
||||
return Err(VirtioError::InvalidDescriptorIndex(idx));
|
||||
}
|
||||
|
||||
self.count += 1;
|
||||
if self.count > self.max_chain_len {
|
||||
return Err(VirtioError::DescriptorChainTooLong);
|
||||
}
|
||||
|
||||
// Each descriptor is 16 bytes
|
||||
let desc_addr = self.desc_table + idx as u64 * 16;
|
||||
|
||||
let addr: u64 = self.mem.read_obj(desc_addr)?;
|
||||
let len: u32 = self.mem.read_obj(desc_addr + 8)?;
|
||||
let flags: u16 = self.mem.read_obj(desc_addr + 12)?;
|
||||
let next: u16 = self.mem.read_obj(desc_addr + 14)?;
|
||||
|
||||
// Update current to next descriptor if NEXT flag is set
|
||||
if flags & VRING_DESC_F_NEXT != 0 {
|
||||
self.current = Some(next);
|
||||
} else {
|
||||
self.current = None;
|
||||
}
|
||||
|
||||
Ok(Some(Descriptor { addr, len, flags }))
|
||||
}
|
||||
}
|
||||
|
||||
/// Virtqueue descriptor flags
|
||||
pub const VRING_DESC_F_NEXT: u16 = 1;
|
||||
pub const VRING_DESC_F_WRITE: u16 = 2;
|
||||
#[allow(dead_code)]
|
||||
pub const VRING_DESC_F_INDIRECT: u16 = 4;
|
||||
|
||||
/// A single virtqueue descriptor
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct Descriptor {
|
||||
/// Guest physical address of the buffer
|
||||
pub addr: u64,
|
||||
/// Length of the buffer in bytes
|
||||
pub len: u32,
|
||||
/// Descriptor flags
|
||||
pub flags: u16,
|
||||
}
|
||||
|
||||
impl Descriptor {
|
||||
/// Check if this descriptor is writable by the device
|
||||
pub fn is_write_only(&self) -> bool {
|
||||
self.flags & VRING_DESC_F_WRITE != 0
|
||||
}
|
||||
|
||||
/// Check if this descriptor has a next descriptor in the chain
|
||||
#[allow(dead_code)]
|
||||
pub fn has_next(&self) -> bool {
|
||||
self.flags & VRING_DESC_F_NEXT != 0
|
||||
}
|
||||
}
|
||||
1007
vmm/src/devices/virtio/net.rs
Normal file
1007
vmm/src/devices/virtio/net.rs
Normal file
File diff suppressed because it is too large
Load Diff
641
vmm/src/devices/virtio/networkd.rs
Normal file
641
vmm/src/devices/virtio/networkd.rs
Normal file
@@ -0,0 +1,641 @@
|
||||
//! systemd-networkd Integration for TAP Device Management
|
||||
//!
|
||||
//! This module provides declarative TAP device management through systemd-networkd,
|
||||
//! replacing manual TAP creation with network unit files.
|
||||
//!
|
||||
//! # Benefits
|
||||
//!
|
||||
//! - Declarative configuration (version-controllable)
|
||||
//! - Automatic cleanup on VM exit
|
||||
//! - Integration with systemd lifecycle
|
||||
//! - Unified networking with Voltainer containers
|
||||
//!
|
||||
//! # Architecture
|
||||
//!
|
||||
//! ```text
|
||||
//! Volt systemd-networkd
|
||||
//! │ │
|
||||
//! ├─► Generate .netdev file ────────────►│
|
||||
//! ├─► Generate .network file ───────────►│
|
||||
//! ├─► networkctl reload ────────────────►│
|
||||
//! │ │
|
||||
//! │◄── Wait for TAP interface ◄──────────┤
|
||||
//! │ │
|
||||
//! ├─► Open TAP fd │
|
||||
//! ├─► Start VM │
|
||||
//! │ │
|
||||
//! │ ... VM runs ... │
|
||||
//! │ │
|
||||
//! ├─► Close TAP fd │
|
||||
//! ├─► Delete unit files ────────────────►│
|
||||
//! ├─► networkctl reload ────────────────►│
|
||||
//! │ │
|
||||
//! │ TAP automatically cleaned up │
|
||||
//! ```
|
||||
|
||||
use std::fs::{self, File, OpenOptions};
|
||||
use std::io::{self, Write};
|
||||
use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::Command;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
/// Directory for runtime network unit files (cleared on reboot)
|
||||
pub const NETWORKD_RUNTIME_DIR: &str = "/run/systemd/network";
|
||||
|
||||
/// Directory for persistent network unit files
|
||||
pub const NETWORKD_CONFIG_DIR: &str = "/etc/systemd/network";
|
||||
|
||||
/// Default TAP name prefix
|
||||
pub const TAP_PREFIX: &str = "tap-";
|
||||
|
||||
/// Default bridge name for Volt VMs
|
||||
pub const DEFAULT_BRIDGE: &str = "br0";
|
||||
|
||||
/// Timeout for TAP interface creation
|
||||
pub const TAP_CREATE_TIMEOUT: Duration = Duration::from_secs(10);
|
||||
|
||||
/// Error types for networkd operations
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum NetworkdError {
|
||||
#[error("Failed to write unit file: {0}")]
|
||||
WriteUnitFile(#[from] io::Error),
|
||||
|
||||
#[error("networkctl command failed: {0}")]
|
||||
NetworkctlFailed(String),
|
||||
|
||||
#[error("TAP interface creation timeout: {name}")]
|
||||
TapTimeout { name: String },
|
||||
|
||||
#[error("Interface not found: {0}")]
|
||||
InterfaceNotFound(String),
|
||||
|
||||
#[error("Failed to open TAP device: {0}")]
|
||||
TapOpen(io::Error),
|
||||
|
||||
#[error("Bridge not found: {0}")]
|
||||
BridgeNotFound(String),
|
||||
}
|
||||
|
||||
/// Configuration for a networkd-managed TAP device
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct NetworkdTapConfig {
|
||||
/// Unique identifier (used for file naming)
|
||||
pub id: String,
|
||||
/// TAP interface name (auto-generated if None)
|
||||
pub tap_name: Option<String>,
|
||||
/// Bridge to attach to
|
||||
pub bridge: String,
|
||||
/// Enable vhost-net acceleration
|
||||
pub vhost: bool,
|
||||
/// Enable multi-queue
|
||||
pub multi_queue: bool,
|
||||
/// Number of queues (if multi_queue is true)
|
||||
pub num_queues: u32,
|
||||
/// Enable VNET header
|
||||
pub vnet_hdr: bool,
|
||||
/// User to own the TAP device
|
||||
pub user: String,
|
||||
/// Group to own the TAP device
|
||||
pub group: String,
|
||||
/// MTU for the interface
|
||||
pub mtu: Option<u32>,
|
||||
}
|
||||
|
||||
impl Default for NetworkdTapConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
id: uuid_short(),
|
||||
tap_name: None,
|
||||
bridge: DEFAULT_BRIDGE.to_string(),
|
||||
vhost: true,
|
||||
multi_queue: false,
|
||||
num_queues: 1,
|
||||
vnet_hdr: true,
|
||||
user: "root".to_string(),
|
||||
group: "root".to_string(),
|
||||
mtu: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl NetworkdTapConfig {
|
||||
/// Create a new config with the given VM ID
|
||||
pub fn new(vm_id: impl Into<String>) -> Self {
|
||||
Self {
|
||||
id: vm_id.into(),
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
|
||||
/// Set the bridge name
|
||||
pub fn bridge(mut self, bridge: impl Into<String>) -> Self {
|
||||
self.bridge = bridge.into();
|
||||
self
|
||||
}
|
||||
|
||||
/// Enable or disable vhost-net
|
||||
pub fn vhost(mut self, enabled: bool) -> Self {
|
||||
self.vhost = enabled;
|
||||
self
|
||||
}
|
||||
|
||||
/// Enable multi-queue with specified queue count
|
||||
pub fn multi_queue(mut self, num_queues: u32) -> Self {
|
||||
self.multi_queue = num_queues > 1;
|
||||
self.num_queues = num_queues;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the MTU
|
||||
pub fn mtu(mut self, mtu: u32) -> Self {
|
||||
self.mtu = Some(mtu);
|
||||
self
|
||||
}
|
||||
|
||||
/// Get the TAP interface name
|
||||
pub fn interface_name(&self) -> String {
|
||||
self.tap_name
|
||||
.clone()
|
||||
.unwrap_or_else(|| format!("{}{}", TAP_PREFIX, &self.id[..8.min(self.id.len())]))
|
||||
}
|
||||
|
||||
/// Get the .netdev unit file name
|
||||
pub fn netdev_filename(&self) -> String {
|
||||
format!("50-volt-vmm-{}.netdev", self.id)
|
||||
}
|
||||
|
||||
/// Get the .network unit file name
|
||||
pub fn network_filename(&self) -> String {
|
||||
format!("50-volt-vmm-{}.network", self.id)
|
||||
}
|
||||
}
|
||||
|
||||
/// Manages TAP devices through systemd-networkd
|
||||
pub struct NetworkdTapManager {
|
||||
/// Configuration for this TAP
|
||||
config: NetworkdTapConfig,
|
||||
/// Path to the .netdev file
|
||||
netdev_path: PathBuf,
|
||||
/// Path to the .network file
|
||||
network_path: PathBuf,
|
||||
/// Whether the unit files have been created
|
||||
created: bool,
|
||||
}
|
||||
|
||||
impl NetworkdTapManager {
|
||||
/// Create a new TAP manager with the given configuration
|
||||
pub fn new(config: NetworkdTapConfig) -> Self {
|
||||
let netdev_path = PathBuf::from(NETWORKD_RUNTIME_DIR).join(config.netdev_filename());
|
||||
let network_path = PathBuf::from(NETWORKD_RUNTIME_DIR).join(config.network_filename());
|
||||
|
||||
Self {
|
||||
config,
|
||||
netdev_path,
|
||||
network_path,
|
||||
created: false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a TAP manager for a VM with default settings
|
||||
pub fn for_vm(vm_id: impl Into<String>) -> Self {
|
||||
Self::new(NetworkdTapConfig::new(vm_id))
|
||||
}
|
||||
|
||||
/// Generate the .netdev unit file contents
|
||||
fn generate_netdev(&self) -> String {
|
||||
let mut content = String::new();
|
||||
|
||||
// [NetDev] section
|
||||
content.push_str("[NetDev]\n");
|
||||
content.push_str(&format!("Name={}\n", self.config.interface_name()));
|
||||
content.push_str("Kind=tap\n");
|
||||
content.push_str("MACAddress=none\n");
|
||||
if let Some(mtu) = self.config.mtu {
|
||||
content.push_str(&format!("MTUBytes={}\n", mtu));
|
||||
}
|
||||
content.push('\n');
|
||||
|
||||
// [Tap] section
|
||||
content.push_str("[Tap]\n");
|
||||
content.push_str(&format!("User={}\n", self.config.user));
|
||||
content.push_str(&format!("Group={}\n", self.config.group));
|
||||
|
||||
if self.config.vnet_hdr {
|
||||
content.push_str("VNetHeader=yes\n");
|
||||
}
|
||||
|
||||
if self.config.multi_queue {
|
||||
content.push_str("MultiQueue=yes\n");
|
||||
}
|
||||
|
||||
// PacketInfo=no means IFF_NO_PI (no extra packet info header)
|
||||
content.push_str("PacketInfo=no\n");
|
||||
|
||||
content
|
||||
}
|
||||
|
||||
/// Generate the .network unit file contents
|
||||
fn generate_network(&self) -> String {
|
||||
let mut content = String::new();
|
||||
|
||||
// [Match] section
|
||||
content.push_str("[Match]\n");
|
||||
content.push_str(&format!("Name={}\n", self.config.interface_name()));
|
||||
content.push('\n');
|
||||
|
||||
// [Network] section
|
||||
content.push_str("[Network]\n");
|
||||
content.push_str(&format!("Bridge={}\n", self.config.bridge));
|
||||
content.push_str("ConfigureWithoutCarrier=yes\n");
|
||||
|
||||
content
|
||||
}
|
||||
|
||||
/// Write the unit files to the runtime directory
|
||||
pub fn write_unit_files(&self) -> Result<(), NetworkdError> {
|
||||
// Ensure directory exists
|
||||
fs::create_dir_all(NETWORKD_RUNTIME_DIR)?;
|
||||
|
||||
// Write .netdev file
|
||||
let mut netdev_file = File::create(&self.netdev_path)?;
|
||||
netdev_file.write_all(self.generate_netdev().as_bytes())?;
|
||||
netdev_file.sync_all()?;
|
||||
|
||||
// Write .network file
|
||||
let mut network_file = File::create(&self.network_path)?;
|
||||
network_file.write_all(self.generate_network().as_bytes())?;
|
||||
network_file.sync_all()?;
|
||||
|
||||
tracing::info!(
|
||||
"Wrote networkd unit files: {} and {}",
|
||||
self.netdev_path.display(),
|
||||
self.network_path.display()
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Remove the unit files
|
||||
pub fn remove_unit_files(&self) -> Result<(), NetworkdError> {
|
||||
if self.netdev_path.exists() {
|
||||
fs::remove_file(&self.netdev_path)?;
|
||||
}
|
||||
if self.network_path.exists() {
|
||||
fs::remove_file(&self.network_path)?;
|
||||
}
|
||||
tracing::info!("Removed networkd unit files for {}", self.config.id);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Reload networkd to apply changes
|
||||
pub fn reload_networkd() -> Result<(), NetworkdError> {
|
||||
let output = Command::new("networkctl")
|
||||
.arg("reload")
|
||||
.output()
|
||||
.map_err(|e| NetworkdError::NetworkctlFailed(e.to_string()))?;
|
||||
|
||||
if !output.status.success() {
|
||||
return Err(NetworkdError::NetworkctlFailed(
|
||||
String::from_utf8_lossy(&output.stderr).to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Wait for the TAP interface to be created
|
||||
pub fn wait_for_interface(&self, timeout: Duration) -> Result<(), NetworkdError> {
|
||||
let interface = self.config.interface_name();
|
||||
let start = Instant::now();
|
||||
|
||||
while start.elapsed() < timeout {
|
||||
if interface_exists(&interface) {
|
||||
tracing::info!("TAP interface {} is ready", interface);
|
||||
return Ok(());
|
||||
}
|
||||
std::thread::sleep(Duration::from_millis(100));
|
||||
}
|
||||
|
||||
Err(NetworkdError::TapTimeout { name: interface })
|
||||
}
|
||||
|
||||
/// Create the TAP device and wait for it
|
||||
pub fn create(&mut self) -> Result<String, NetworkdError> {
|
||||
// Check if bridge exists
|
||||
if !interface_exists(&self.config.bridge) {
|
||||
return Err(NetworkdError::BridgeNotFound(self.config.bridge.clone()));
|
||||
}
|
||||
|
||||
// Write unit files
|
||||
self.write_unit_files()?;
|
||||
|
||||
// Reload networkd
|
||||
Self::reload_networkd()?;
|
||||
|
||||
// Wait for interface
|
||||
self.wait_for_interface(TAP_CREATE_TIMEOUT)?;
|
||||
|
||||
self.created = true;
|
||||
Ok(self.config.interface_name())
|
||||
}
|
||||
|
||||
/// Open the TAP device file descriptor
|
||||
pub fn open_tap(&self) -> Result<RawFd, NetworkdError> {
|
||||
let interface = self.config.interface_name();
|
||||
|
||||
// Open /dev/net/tun
|
||||
let fd = OpenOptions::new()
|
||||
.read(true)
|
||||
.write(true)
|
||||
.open("/dev/net/tun")
|
||||
.map_err(NetworkdError::TapOpen)?;
|
||||
|
||||
// Prepare ioctl request
|
||||
let mut ifr = IfReq::new(&interface);
|
||||
ifr.set_flags(
|
||||
IFF_TAP
|
||||
| IFF_NO_PI
|
||||
| if self.config.vnet_hdr { IFF_VNET_HDR } else { 0 }
|
||||
| if self.config.multi_queue {
|
||||
IFF_MULTI_QUEUE
|
||||
} else {
|
||||
0
|
||||
},
|
||||
);
|
||||
|
||||
// Attach to existing TAP interface
|
||||
let ret = unsafe {
|
||||
libc::ioctl(
|
||||
fd.as_raw_fd(),
|
||||
TUNSETIFF as libc::c_ulong,
|
||||
&ifr as *const IfReq,
|
||||
)
|
||||
};
|
||||
|
||||
if ret < 0 {
|
||||
return Err(NetworkdError::TapOpen(io::Error::last_os_error()));
|
||||
}
|
||||
|
||||
// Set non-blocking
|
||||
let flags = unsafe { libc::fcntl(fd.as_raw_fd(), libc::F_GETFL) };
|
||||
unsafe { libc::fcntl(fd.as_raw_fd(), libc::F_SETFL, flags | libc::O_NONBLOCK) };
|
||||
|
||||
let raw_fd = fd.as_raw_fd();
|
||||
std::mem::forget(fd); // Don't close the fd
|
||||
|
||||
Ok(raw_fd)
|
||||
}
|
||||
|
||||
/// Cleanup: remove unit files and reload
|
||||
pub fn cleanup(&mut self) -> Result<(), NetworkdError> {
|
||||
if self.created {
|
||||
self.remove_unit_files()?;
|
||||
Self::reload_networkd()?;
|
||||
self.created = false;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get the interface name
|
||||
pub fn interface_name(&self) -> String {
|
||||
self.config.interface_name()
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for NetworkdTapManager {
|
||||
fn drop(&mut self) {
|
||||
if let Err(e) = self.cleanup() {
|
||||
tracing::error!("Failed to cleanup networkd TAP: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Bridge Infrastructure
|
||||
// ============================================================================
|
||||
|
||||
/// Configuration for the shared bridge
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct BridgeConfig {
|
||||
/// Bridge name
|
||||
pub name: String,
|
||||
/// Bridge MAC address
|
||||
pub mac: Option<String>,
|
||||
/// IPv4 address with CIDR
|
||||
pub ipv4_address: Option<String>,
|
||||
/// Enable IP forwarding
|
||||
pub ip_forward: bool,
|
||||
/// Enable IP masquerading (NAT)
|
||||
pub ip_masquerade: bool,
|
||||
/// Enable STP
|
||||
pub stp: bool,
|
||||
}
|
||||
|
||||
impl Default for BridgeConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
name: DEFAULT_BRIDGE.to_string(),
|
||||
mac: Some("52:54:00:00:00:01".to_string()),
|
||||
ipv4_address: Some("10.42.0.1/24".to_string()),
|
||||
ip_forward: true,
|
||||
ip_masquerade: true,
|
||||
stp: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Generate bridge infrastructure unit files
|
||||
pub fn generate_bridge_units(config: &BridgeConfig) -> (String, String) {
|
||||
// .netdev file
|
||||
let mut netdev = String::new();
|
||||
netdev.push_str("[NetDev]\n");
|
||||
netdev.push_str(&format!("Name={}\n", config.name));
|
||||
netdev.push_str("Kind=bridge\n");
|
||||
if let Some(mac) = &config.mac {
|
||||
netdev.push_str(&format!("MACAddress={}\n", mac));
|
||||
}
|
||||
netdev.push('\n');
|
||||
netdev.push_str("[Bridge]\n");
|
||||
netdev.push_str(&format!("STP={}\n", if config.stp { "yes" } else { "no" }));
|
||||
netdev.push_str("ForwardDelaySec=0\n");
|
||||
|
||||
// .network file
|
||||
let mut network = String::new();
|
||||
network.push_str("[Match]\n");
|
||||
network.push_str(&format!("Name={}\n", config.name));
|
||||
network.push('\n');
|
||||
network.push_str("[Network]\n");
|
||||
if let Some(addr) = &config.ipv4_address {
|
||||
network.push_str(&format!("Address={}\n", addr));
|
||||
}
|
||||
if config.ip_forward {
|
||||
network.push_str("IPForward=yes\n");
|
||||
}
|
||||
if config.ip_masquerade {
|
||||
network.push_str("IPMasquerade=both\n");
|
||||
}
|
||||
network.push_str("ConfigureWithoutCarrier=yes\n");
|
||||
|
||||
(netdev, network)
|
||||
}
|
||||
|
||||
/// Install bridge infrastructure (one-time setup)
|
||||
pub fn install_bridge(config: &BridgeConfig) -> Result<(), NetworkdError> {
|
||||
let (netdev, network) = generate_bridge_units(config);
|
||||
|
||||
let netdev_path = PathBuf::from(NETWORKD_CONFIG_DIR)
|
||||
.join(format!("10-volt-vmm-{}.netdev", config.name));
|
||||
let network_path = PathBuf::from(NETWORKD_CONFIG_DIR)
|
||||
.join(format!("10-volt-vmm-{}.network", config.name));
|
||||
|
||||
fs::create_dir_all(NETWORKD_CONFIG_DIR)?;
|
||||
|
||||
let mut f = File::create(&netdev_path)?;
|
||||
f.write_all(netdev.as_bytes())?;
|
||||
|
||||
let mut f = File::create(&network_path)?;
|
||||
f.write_all(network.as_bytes())?;
|
||||
|
||||
tracing::info!(
|
||||
"Installed bridge {} at {}",
|
||||
config.name,
|
||||
netdev_path.display()
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Helper Functions
|
||||
// ============================================================================
|
||||
|
||||
/// Check if a network interface exists
|
||||
pub fn interface_exists(name: &str) -> bool {
|
||||
Path::new(&format!("/sys/class/net/{}", name)).exists()
|
||||
}
|
||||
|
||||
/// Generate a short UUID
|
||||
fn uuid_short() -> String {
|
||||
use std::time::{SystemTime, UNIX_EPOCH};
|
||||
let t = SystemTime::now()
|
||||
.duration_since(UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_nanos();
|
||||
format!("{:016x}", t)
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// TAP ioctl Constants
|
||||
// ============================================================================
|
||||
|
||||
const TUNSETIFF: u64 = 0x400454CA;
|
||||
const IFF_TAP: i16 = 0x0002;
|
||||
const IFF_NO_PI: i16 = 0x1000;
|
||||
const IFF_VNET_HDR: i16 = 0x4000;
|
||||
const IFF_MULTI_QUEUE: i16 = 0x0100;
|
||||
|
||||
#[repr(C)]
|
||||
struct IfReq {
|
||||
ifr_name: [u8; 16],
|
||||
ifr_flags: i16,
|
||||
_padding: [u8; 22],
|
||||
}
|
||||
|
||||
impl IfReq {
|
||||
fn new(name: &str) -> Self {
|
||||
let mut ifr = Self {
|
||||
ifr_name: [0u8; 16],
|
||||
ifr_flags: 0,
|
||||
_padding: [0u8; 22],
|
||||
};
|
||||
let bytes = name.as_bytes();
|
||||
let len = bytes.len().min(15);
|
||||
ifr.ifr_name[..len].copy_from_slice(&bytes[..len]);
|
||||
ifr
|
||||
}
|
||||
|
||||
fn set_flags(&mut self, flags: i16) {
|
||||
self.ifr_flags = flags;
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Tests
|
||||
// ============================================================================
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_netdev_generation() {
|
||||
let config = NetworkdTapConfig {
|
||||
id: "test-vm-123".to_string(),
|
||||
tap_name: Some("tap-test".to_string()),
|
||||
bridge: "br0".to_string(),
|
||||
vhost: true,
|
||||
multi_queue: true,
|
||||
num_queues: 4,
|
||||
vnet_hdr: true,
|
||||
user: "nobody".to_string(),
|
||||
group: "nogroup".to_string(),
|
||||
mtu: Some(9000),
|
||||
};
|
||||
|
||||
let manager = NetworkdTapManager::new(config);
|
||||
let netdev = manager.generate_netdev();
|
||||
|
||||
assert!(netdev.contains("Name=tap-test"));
|
||||
assert!(netdev.contains("Kind=tap"));
|
||||
assert!(netdev.contains("MTUBytes=9000"));
|
||||
assert!(netdev.contains("VNetHeader=yes"));
|
||||
assert!(netdev.contains("MultiQueue=yes"));
|
||||
assert!(netdev.contains("User=nobody"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_network_generation() {
|
||||
let config = NetworkdTapConfig::new("test-vm").bridge("br-custom");
|
||||
let manager = NetworkdTapManager::new(config);
|
||||
let network = manager.generate_network();
|
||||
|
||||
assert!(network.contains("Bridge=br-custom"));
|
||||
assert!(network.contains("ConfigureWithoutCarrier=yes"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bridge_generation() {
|
||||
let config = BridgeConfig {
|
||||
name: "br-test".to_string(),
|
||||
mac: Some("52:54:00:00:00:FF".to_string()),
|
||||
ipv4_address: Some("192.168.100.1/24".to_string()),
|
||||
ip_forward: true,
|
||||
ip_masquerade: true,
|
||||
stp: false,
|
||||
};
|
||||
|
||||
let (netdev, network) = generate_bridge_units(&config);
|
||||
|
||||
assert!(netdev.contains("Name=br-test"));
|
||||
assert!(netdev.contains("Kind=bridge"));
|
||||
assert!(netdev.contains("MACAddress=52:54:00:00:00:FF"));
|
||||
assert!(netdev.contains("STP=no"));
|
||||
|
||||
assert!(network.contains("Address=192.168.100.1/24"));
|
||||
assert!(network.contains("IPForward=yes"));
|
||||
assert!(network.contains("IPMasquerade=both"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_interface_name_generation() {
|
||||
let config = NetworkdTapConfig::new("abcdef12-3456-7890");
|
||||
assert_eq!(config.interface_name(), "tap-abcdef12");
|
||||
|
||||
let config2 = NetworkdTapConfig {
|
||||
tap_name: Some("custom-tap".to_string()),
|
||||
..NetworkdTapConfig::new("ignored")
|
||||
};
|
||||
assert_eq!(config2.interface_name(), "custom-tap");
|
||||
}
|
||||
}
|
||||
404
vmm/src/devices/virtio/queue.rs
Normal file
404
vmm/src/devices/virtio/queue.rs
Normal file
@@ -0,0 +1,404 @@
|
||||
//! Virtio Queue Management
|
||||
//!
|
||||
//! Provides high-level wrapper around virtio-queue crate for queue operations.
|
||||
//! This module handles descriptor chain iteration, buffer management, and
|
||||
//! completion signaling.
|
||||
//!
|
||||
//! # Virtqueue Structure (from virtio spec)
|
||||
//!
|
||||
//! ```text
|
||||
//! ┌─────────────────────────────────────────────────────────────────┐
|
||||
//! │ Descriptor Table │
|
||||
//! │ ┌─────────┬─────────┬─────────┬─────────┐ │
|
||||
//! │ │ Desc 0 │ Desc 1 │ Desc 2 │ ... │ │
|
||||
//! │ │ addr │ addr │ addr │ │ │
|
||||
//! │ │ len │ len │ len │ │ │
|
||||
//! │ │ flags │ flags │ flags │ │ │
|
||||
//! │ │ next │ next │ next │ │ │
|
||||
//! │ └─────────┴─────────┴─────────┴─────────┘ │
|
||||
//! └─────────────────────────────────────────────────────────────────┘
|
||||
//!
|
||||
//! ┌─────────────────────────────────────────────────────────────────┐
|
||||
//! │ Available Ring (Driver → Device) │
|
||||
//! │ ┌─────────┬──────────────────────────────────────────┐ │
|
||||
//! │ │ flags │ idx │ ring[0] │ ring[1] │ ring[2] │ ... │ │
|
||||
//! │ └─────────┴──────────────────────────────────────────┘ │
|
||||
//! └─────────────────────────────────────────────────────────────────┘
|
||||
//!
|
||||
//! ┌─────────────────────────────────────────────────────────────────┐
|
||||
//! │ Used Ring (Device → Driver) │
|
||||
//! │ ┌─────────┬──────────────────────────────────────────┐ │
|
||||
//! │ │ flags │ idx │ elem[0] │ elem[1] │ elem[2] │ ... │ │
|
||||
//! │ │ │ │ id,len │ id,len │ id,len │ │ │
|
||||
//! │ └─────────┴──────────────────────────────────────────┘ │
|
||||
//! └─────────────────────────────────────────────────────────────────┘
|
||||
//! ```
|
||||
|
||||
|
||||
use thiserror::Error;
|
||||
use virtio_queue::{Queue, QueueT, DescriptorChain};
|
||||
use vm_memory::{GuestAddress, GuestMemoryMmap, Bytes};
|
||||
|
||||
|
||||
/// Default maximum queue size
|
||||
#[allow(dead_code)]
|
||||
pub const DEFAULT_QUEUE_SIZE: u16 = 256;
|
||||
|
||||
/// Errors that can occur during queue operations
|
||||
#[derive(Error, Debug)]
|
||||
#[allow(dead_code)]
|
||||
pub enum QueueError {
|
||||
/// Queue not ready for use
|
||||
#[error("Queue not ready")]
|
||||
NotReady,
|
||||
|
||||
/// Invalid descriptor index
|
||||
#[error("Invalid descriptor index: {0}")]
|
||||
InvalidDescriptor(u16),
|
||||
|
||||
/// Descriptor chain too long
|
||||
#[error("Descriptor chain too long (max {0})")]
|
||||
ChainTooLong(u16),
|
||||
|
||||
/// Memory access error
|
||||
#[error("Memory error: {0}")]
|
||||
Memory(String),
|
||||
|
||||
/// Queue overflow
|
||||
#[error("Queue overflow")]
|
||||
Overflow,
|
||||
}
|
||||
|
||||
/// Configuration for a virtqueue
|
||||
#[derive(Debug, Clone)]
|
||||
#[allow(dead_code)]
|
||||
pub struct QueueConfig {
|
||||
/// Maximum number of elements in the queue
|
||||
pub max_size: u16,
|
||||
/// Configured size (must be power of 2, <= max_size)
|
||||
pub size: u16,
|
||||
/// Descriptor table guest physical address
|
||||
pub desc_table: u64,
|
||||
/// Available ring guest physical address
|
||||
pub avail_ring: u64,
|
||||
/// Used ring guest physical address
|
||||
pub used_ring: u64,
|
||||
/// Queue is ready for use
|
||||
pub ready: bool,
|
||||
}
|
||||
|
||||
impl Default for QueueConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
max_size: DEFAULT_QUEUE_SIZE,
|
||||
size: DEFAULT_QUEUE_SIZE,
|
||||
desc_table: 0,
|
||||
avail_ring: 0,
|
||||
used_ring: 0,
|
||||
ready: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
impl QueueConfig {
|
||||
/// Create a new queue configuration with the given maximum size
|
||||
pub fn new(max_size: u16) -> Self {
|
||||
Self {
|
||||
max_size,
|
||||
size: max_size,
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
|
||||
/// Check if the queue is fully configured and ready
|
||||
pub fn is_valid(&self) -> bool {
|
||||
self.ready
|
||||
&& self.size > 0
|
||||
&& self.size <= self.max_size
|
||||
&& self.size.is_power_of_two()
|
||||
&& self.desc_table != 0
|
||||
&& self.avail_ring != 0
|
||||
&& self.used_ring != 0
|
||||
}
|
||||
}
|
||||
|
||||
/// High-level wrapper around virtio-queue's Queue
|
||||
#[allow(dead_code)]
|
||||
pub struct VirtioQueue {
|
||||
/// The underlying queue
|
||||
queue: Queue,
|
||||
/// Last seen available index
|
||||
last_avail_idx: u16,
|
||||
/// Next index to use in used ring
|
||||
next_used_idx: u16,
|
||||
/// Queue index (for identification)
|
||||
index: u16,
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
impl VirtioQueue {
|
||||
/// Create a new VirtioQueue from a configuration
|
||||
pub fn new(config: &QueueConfig, index: u16) -> Result<Self, QueueError> {
|
||||
if !config.is_valid() {
|
||||
return Err(QueueError::NotReady);
|
||||
}
|
||||
|
||||
let mut queue = Queue::new(config.max_size).map_err(|e| {
|
||||
QueueError::Memory(format!("Failed to create queue: {:?}", e))
|
||||
})?;
|
||||
|
||||
queue.set_size(config.size);
|
||||
queue.set_desc_table_address(
|
||||
Some(config.desc_table as u32),
|
||||
Some((config.desc_table >> 32) as u32),
|
||||
);
|
||||
queue.set_avail_ring_address(
|
||||
Some(config.avail_ring as u32),
|
||||
Some((config.avail_ring >> 32) as u32),
|
||||
);
|
||||
queue.set_used_ring_address(
|
||||
Some(config.used_ring as u32),
|
||||
Some((config.used_ring >> 32) as u32),
|
||||
);
|
||||
queue.set_ready(true);
|
||||
|
||||
Ok(Self {
|
||||
queue,
|
||||
last_avail_idx: 0,
|
||||
next_used_idx: 0,
|
||||
index,
|
||||
})
|
||||
}
|
||||
|
||||
/// Get the queue index
|
||||
pub fn index(&self) -> u16 {
|
||||
self.index
|
||||
}
|
||||
|
||||
/// Check if there are available descriptors to process
|
||||
pub fn has_pending(&self, mem: &GuestMemoryMmap) -> bool {
|
||||
self.queue.is_valid(mem)
|
||||
}
|
||||
|
||||
/// Get the next available descriptor chain
|
||||
pub fn pop_descriptor_chain<'a>(
|
||||
&mut self,
|
||||
mem: &'a GuestMemoryMmap,
|
||||
) -> Option<DescriptorChain<&'a GuestMemoryMmap>> {
|
||||
self.queue.pop_descriptor_chain(mem)
|
||||
}
|
||||
|
||||
/// Add a used buffer to the used ring
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `mem` - Guest memory reference
|
||||
/// * `head_index` - The head descriptor index of the chain
|
||||
/// * `len` - Number of bytes written to the buffer
|
||||
pub fn add_used(
|
||||
&mut self,
|
||||
mem: &GuestMemoryMmap,
|
||||
head_index: u16,
|
||||
len: u32,
|
||||
) -> Result<(), QueueError> {
|
||||
self.queue.add_used(mem, head_index, len).map_err(|e| {
|
||||
QueueError::Memory(format!("Failed to add used: {:?}", e))
|
||||
})
|
||||
}
|
||||
|
||||
/// Check if the driver has requested notification suppression
|
||||
pub fn needs_notification(&mut self, mem: &GuestMemoryMmap) -> bool {
|
||||
self.queue.needs_notification(mem).unwrap_or(true)
|
||||
}
|
||||
|
||||
/// Get the number of elements in the queue
|
||||
pub fn size(&self) -> u16 {
|
||||
self.queue.size()
|
||||
}
|
||||
|
||||
/// Get the underlying queue reference
|
||||
pub fn inner(&self) -> &Queue {
|
||||
&self.queue
|
||||
}
|
||||
|
||||
/// Get mutable reference to the underlying queue
|
||||
pub fn inner_mut(&mut self) -> &mut Queue {
|
||||
&mut self.queue
|
||||
}
|
||||
}
|
||||
|
||||
/// Iterator over a descriptor chain
|
||||
#[allow(dead_code)]
|
||||
pub struct DescriptorChainIter<'a> {
|
||||
chain: Option<DescriptorChain<&'a GuestMemoryMmap>>,
|
||||
count: u16,
|
||||
max_descriptors: u16,
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
impl<'a> DescriptorChainIter<'a> {
|
||||
/// Create a new iterator over a descriptor chain
|
||||
pub fn new(chain: DescriptorChain<&'a GuestMemoryMmap>, max_descriptors: u16) -> Self {
|
||||
Self {
|
||||
chain: Some(chain),
|
||||
count: 0,
|
||||
max_descriptors,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Buffer types in a descriptor chain
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
#[allow(dead_code)]
|
||||
pub enum BufferType {
|
||||
/// Device-readable buffer (driver → device)
|
||||
Readable,
|
||||
/// Device-writable buffer (device → driver)
|
||||
Writable,
|
||||
}
|
||||
|
||||
/// A single buffer in a descriptor chain
|
||||
#[derive(Debug, Clone)]
|
||||
#[allow(dead_code)]
|
||||
pub struct VirtioBuffer {
|
||||
/// Guest physical address of the buffer
|
||||
pub addr: GuestAddress,
|
||||
/// Length of the buffer
|
||||
pub len: u32,
|
||||
/// Buffer type (readable or writable)
|
||||
pub buffer_type: BufferType,
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
impl VirtioBuffer {
|
||||
/// Read data from this buffer
|
||||
pub fn read(&self, mem: &GuestMemoryMmap, buf: &mut [u8]) -> Result<usize, QueueError> {
|
||||
let len = std::cmp::min(buf.len(), self.len as usize);
|
||||
mem.read_slice(&mut buf[..len], self.addr)
|
||||
.map_err(|e| QueueError::Memory(format!("Read failed: {:?}", e)))?;
|
||||
Ok(len)
|
||||
}
|
||||
|
||||
/// Write data to this buffer
|
||||
pub fn write(&self, mem: &GuestMemoryMmap, data: &[u8]) -> Result<usize, QueueError> {
|
||||
if self.buffer_type != BufferType::Writable {
|
||||
return Err(QueueError::Memory("Cannot write to readable buffer".to_string()));
|
||||
}
|
||||
let len = std::cmp::min(data.len(), self.len as usize);
|
||||
mem.write_slice(&data[..len], self.addr)
|
||||
.map_err(|e| QueueError::Memory(format!("Write failed: {:?}", e)))?;
|
||||
Ok(len)
|
||||
}
|
||||
}
|
||||
|
||||
/// Collect all buffers from a descriptor chain
|
||||
#[allow(dead_code)]
|
||||
pub fn collect_chain_buffers(
|
||||
chain: DescriptorChain<&GuestMemoryMmap>,
|
||||
) -> Result<(Vec<VirtioBuffer>, Vec<VirtioBuffer>), QueueError> {
|
||||
let mut readable = Vec::new();
|
||||
let mut writable = Vec::new();
|
||||
|
||||
let mut count = 0;
|
||||
const MAX_CHAIN_LEN: u16 = 1024;
|
||||
|
||||
// Iterate through the DescriptorChain, which yields Descriptor items
|
||||
for desc in chain {
|
||||
count += 1;
|
||||
if count > MAX_CHAIN_LEN {
|
||||
return Err(QueueError::ChainTooLong(MAX_CHAIN_LEN));
|
||||
}
|
||||
|
||||
let buffer = VirtioBuffer {
|
||||
addr: desc.addr(),
|
||||
len: desc.len(),
|
||||
buffer_type: if desc.is_write_only() {
|
||||
BufferType::Writable
|
||||
} else {
|
||||
BufferType::Readable
|
||||
},
|
||||
};
|
||||
|
||||
if desc.is_write_only() {
|
||||
writable.push(buffer);
|
||||
} else {
|
||||
readable.push(buffer);
|
||||
}
|
||||
}
|
||||
|
||||
Ok((readable, writable))
|
||||
}
|
||||
|
||||
/// Read an entire descriptor chain into a contiguous buffer
|
||||
#[allow(dead_code)]
|
||||
pub fn read_chain_to_vec(
|
||||
chain: DescriptorChain<&GuestMemoryMmap>,
|
||||
mem: &GuestMemoryMmap,
|
||||
) -> Result<Vec<u8>, QueueError> {
|
||||
let (readable, _) = collect_chain_buffers(chain)?;
|
||||
|
||||
let total_len: usize = readable.iter().map(|b| b.len as usize).sum();
|
||||
let mut data = vec![0u8; total_len];
|
||||
let mut offset = 0;
|
||||
|
||||
for buffer in readable {
|
||||
let len = buffer.read(mem, &mut data[offset..])?;
|
||||
offset += len;
|
||||
}
|
||||
|
||||
Ok(data)
|
||||
}
|
||||
|
||||
/// Write data to the writable buffers in a descriptor chain
|
||||
#[allow(dead_code)]
|
||||
pub fn write_to_chain(
|
||||
chain: DescriptorChain<&GuestMemoryMmap>,
|
||||
mem: &GuestMemoryMmap,
|
||||
data: &[u8],
|
||||
) -> Result<usize, QueueError> {
|
||||
let (_, writable) = collect_chain_buffers(chain)?;
|
||||
|
||||
let mut offset = 0;
|
||||
for buffer in writable {
|
||||
if offset >= data.len() {
|
||||
break;
|
||||
}
|
||||
let to_write = std::cmp::min(buffer.len as usize, data.len() - offset);
|
||||
buffer.write(mem, &data[offset..offset + to_write])?;
|
||||
offset += to_write;
|
||||
}
|
||||
|
||||
Ok(offset)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_queue_config_validation() {
|
||||
let mut config = QueueConfig::default();
|
||||
assert!(!config.is_valid());
|
||||
|
||||
config.ready = true;
|
||||
config.desc_table = 0x1000;
|
||||
config.avail_ring = 0x2000;
|
||||
config.used_ring = 0x3000;
|
||||
assert!(config.is_valid());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_queue_config_power_of_two() {
|
||||
let mut config = QueueConfig::new(256);
|
||||
config.ready = true;
|
||||
config.desc_table = 0x1000;
|
||||
config.avail_ring = 0x2000;
|
||||
config.used_ring = 0x3000;
|
||||
|
||||
config.size = 128;
|
||||
assert!(config.is_valid());
|
||||
|
||||
config.size = 100; // Not power of 2
|
||||
assert!(!config.is_valid());
|
||||
}
|
||||
}
|
||||
485
vmm/src/devices/virtio/stellarium_blk.rs
Normal file
485
vmm/src/devices/virtio/stellarium_blk.rs
Normal file
@@ -0,0 +1,485 @@
|
||||
//! Stellarium-backed VirtIO Block Device Backend
|
||||
//!
|
||||
//! This module provides a `BlockBackend` implementation backed by Stellarium's
|
||||
//! TinyVol volumes and Nebula content-addressed storage. The guest sees a normal
|
||||
//! virtio-blk device, but the host-side storage is fully deduplicated.
|
||||
//!
|
||||
//! # Architecture
|
||||
//!
|
||||
//! ```text
|
||||
//! Guest: /dev/vda (ext4)
|
||||
//! │
|
||||
//! ┌──────▼──────────────────────┐
|
||||
//! │ VirtIO-BLK (sectors) │ ← Standard virtio-blk protocol
|
||||
//! ├─────────────────────────────┤
|
||||
//! │ StellariumBackend │ ← This module: translates sector I/O
|
||||
//! │ ┌───────────────────────┐ │ to TinyVol block operations
|
||||
//! │ │ TinyVol Volume │ │
|
||||
//! │ │ ┌─────────────────┐ │ │ ← CoW delta layer for writes
|
||||
//! │ │ │ Delta Layer │ │ │
|
||||
//! │ │ ├─────────────────┤ │ │
|
||||
//! │ │ │ Base Image │ │ │ ← CAS-backed base (deduplicated)
|
||||
//! │ │ └─────────────────┘ │ │
|
||||
//! │ └───────────────────────┘ │
|
||||
//! └─────────────────────────────┘
|
||||
//! ```
|
||||
//!
|
||||
//! # Key Properties
|
||||
//!
|
||||
//! - **Instant cloning**: Copying a manifest creates a new VM (O(1), no data copy)
|
||||
//! - **Deduplication**: Identical blocks across all VMs stored once in Nebula
|
||||
//! - **CoW writes**: Guest writes go to a delta layer, base remains shared
|
||||
//! - **Sparse storage**: Unwritten blocks return zeros without consuming space
|
||||
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::Mutex;
|
||||
|
||||
use super::block::BlockBackend;
|
||||
|
||||
/// Stellarium-backed block device using TinyVol + Nebula CAS
|
||||
///
|
||||
/// Implements `BlockBackend` for use with `VirtioBlock`, translating
|
||||
/// sector-based I/O into TinyVol block operations.
|
||||
#[allow(dead_code)]
|
||||
pub struct StellariumBackend {
|
||||
/// TinyVol volume providing CoW block storage
|
||||
volume: Mutex<stellarium::Volume>,
|
||||
/// Volume capacity in bytes
|
||||
capacity: u64,
|
||||
/// Block size used by TinyVol (typically 4096)
|
||||
tinyvol_block_size: u32,
|
||||
/// Read-only flag
|
||||
read_only: bool,
|
||||
/// Device ID (derived from volume path hash)
|
||||
device_id: [u8; 20],
|
||||
/// Path to the volume (for identification)
|
||||
volume_path: PathBuf,
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
impl StellariumBackend {
|
||||
/// Open a Stellarium volume as a block backend
|
||||
///
|
||||
/// The volume directory must contain a `manifest.tvol` and optionally
|
||||
/// a `delta.dat` for CoW writes.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `volume_path` - Path to the TinyVol volume directory
|
||||
/// * `read_only` - Whether to open in read-only mode
|
||||
pub fn open(volume_path: impl AsRef<Path>, read_only: bool) -> std::io::Result<Self> {
|
||||
let volume_path = volume_path.as_ref();
|
||||
|
||||
let volume = stellarium::Volume::open(volume_path).map_err(|e| {
|
||||
std::io::Error::new(std::io::ErrorKind::Other, format!("Failed to open TinyVol volume: {}", e))
|
||||
})?;
|
||||
|
||||
let capacity = volume.virtual_size();
|
||||
let tinyvol_block_size = volume.block_size();
|
||||
let vol_read_only = volume.is_read_only();
|
||||
|
||||
// Generate device ID from volume path
|
||||
let mut device_id = [0u8; 20];
|
||||
let path_str = volume_path.to_string_lossy();
|
||||
let hash = fnv1a_hash(path_str.as_bytes());
|
||||
device_id[..8].copy_from_slice(&hash.to_le_bytes());
|
||||
// Tag as stellarium backend
|
||||
device_id[8..16].copy_from_slice(b"STLR_BLK");
|
||||
|
||||
Ok(Self {
|
||||
volume: Mutex::new(volume),
|
||||
capacity,
|
||||
tinyvol_block_size,
|
||||
read_only: read_only || vol_read_only,
|
||||
device_id,
|
||||
volume_path: volume_path.to_path_buf(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Open a Stellarium volume with a base image file
|
||||
///
|
||||
/// Used when the volume has a base image that's stored as a regular file
|
||||
/// (e.g., an ext4 image that was imported into CAS).
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `volume_path` - Path to the TinyVol volume directory
|
||||
/// * `base_path` - Path to the base image file
|
||||
/// * `read_only` - Whether to open in read-only mode
|
||||
pub fn open_with_base(
|
||||
volume_path: impl AsRef<Path>,
|
||||
base_path: impl AsRef<Path>,
|
||||
read_only: bool,
|
||||
) -> std::io::Result<Self> {
|
||||
let volume_path = volume_path.as_ref();
|
||||
let base_path = base_path.as_ref();
|
||||
|
||||
let volume = stellarium::Volume::open_with_base(volume_path, base_path).map_err(|e| {
|
||||
std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
format!("Failed to open TinyVol volume with base: {}", e),
|
||||
)
|
||||
})?;
|
||||
|
||||
let capacity = volume.virtual_size();
|
||||
let tinyvol_block_size = volume.block_size();
|
||||
let vol_read_only = volume.is_read_only();
|
||||
|
||||
let mut device_id = [0u8; 20];
|
||||
let path_str = volume_path.to_string_lossy();
|
||||
let hash = fnv1a_hash(path_str.as_bytes());
|
||||
device_id[..8].copy_from_slice(&hash.to_le_bytes());
|
||||
device_id[8..16].copy_from_slice(b"STLR_BLK");
|
||||
|
||||
Ok(Self {
|
||||
volume: Mutex::new(volume),
|
||||
capacity,
|
||||
tinyvol_block_size,
|
||||
read_only: read_only || vol_read_only,
|
||||
device_id,
|
||||
volume_path: volume_path.to_path_buf(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Create a new volume with the given size and use it as a backend
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `volume_path` - Path where the volume directory will be created
|
||||
/// * `size_bytes` - Virtual size of the volume in bytes
|
||||
/// * `block_size` - TinyVol block size (must be power of 2, 4KB-1MB)
|
||||
pub fn create(
|
||||
volume_path: impl AsRef<Path>,
|
||||
size_bytes: u64,
|
||||
block_size: u32,
|
||||
) -> std::io::Result<Self> {
|
||||
let volume_path = volume_path.as_ref();
|
||||
|
||||
let config = stellarium::VolumeConfig::new(size_bytes).with_block_size(block_size);
|
||||
|
||||
let volume = stellarium::Volume::create(volume_path, config).map_err(|e| {
|
||||
std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
format!("Failed to create TinyVol volume: {}", e),
|
||||
)
|
||||
})?;
|
||||
|
||||
let mut device_id = [0u8; 20];
|
||||
let path_str = volume_path.to_string_lossy();
|
||||
let hash = fnv1a_hash(path_str.as_bytes());
|
||||
device_id[..8].copy_from_slice(&hash.to_le_bytes());
|
||||
device_id[8..16].copy_from_slice(b"STLR_BLK");
|
||||
|
||||
Ok(Self {
|
||||
volume: Mutex::new(volume),
|
||||
capacity: size_bytes,
|
||||
tinyvol_block_size: block_size,
|
||||
read_only: false,
|
||||
device_id,
|
||||
volume_path: volume_path.to_path_buf(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Get the volume path
|
||||
pub fn volume_path(&self) -> &Path {
|
||||
&self.volume_path
|
||||
}
|
||||
|
||||
/// Get volume statistics
|
||||
pub fn stats(&self) -> StellariumBackendStats {
|
||||
let volume = self.volume.lock().unwrap();
|
||||
let vol_stats = volume.stats();
|
||||
StellariumBackendStats {
|
||||
virtual_size: vol_stats.virtual_size,
|
||||
block_size: vol_stats.block_size,
|
||||
block_count: vol_stats.block_count,
|
||||
modified_blocks: vol_stats.modified_blocks,
|
||||
manifest_size: vol_stats.manifest_size,
|
||||
delta_size: vol_stats.delta_size,
|
||||
efficiency: vol_stats.efficiency(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Clone this volume instantly (O(1) manifest copy)
|
||||
///
|
||||
/// Creates a new volume at `clone_path` that shares the same base data
|
||||
/// but has its own CoW delta layer for writes.
|
||||
pub fn clone_to(&self, clone_path: impl AsRef<Path>) -> std::io::Result<StellariumBackend> {
|
||||
let volume = self.volume.lock().unwrap();
|
||||
let cloned = volume.clone_to(clone_path.as_ref()).map_err(|e| {
|
||||
std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
format!("Failed to clone volume: {}", e),
|
||||
)
|
||||
})?;
|
||||
|
||||
let capacity = cloned.virtual_size();
|
||||
let block_size = cloned.block_size();
|
||||
|
||||
let mut device_id = [0u8; 20];
|
||||
let path_str = clone_path.as_ref().to_string_lossy();
|
||||
let hash = fnv1a_hash(path_str.as_bytes());
|
||||
device_id[..8].copy_from_slice(&hash.to_le_bytes());
|
||||
device_id[8..16].copy_from_slice(b"STLR_BLK");
|
||||
|
||||
Ok(StellariumBackend {
|
||||
volume: Mutex::new(cloned),
|
||||
capacity,
|
||||
tinyvol_block_size: block_size,
|
||||
read_only: false,
|
||||
device_id,
|
||||
volume_path: clone_path.as_ref().to_path_buf(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl BlockBackend for StellariumBackend {
|
||||
fn capacity(&self) -> u64 {
|
||||
self.capacity
|
||||
}
|
||||
|
||||
fn block_size(&self) -> u32 {
|
||||
// VirtIO block uses 512-byte sectors, but we report our actual
|
||||
// block size. The VirtioBlock device handles sector-to-block translation.
|
||||
512
|
||||
}
|
||||
|
||||
fn is_read_only(&self) -> bool {
|
||||
self.read_only
|
||||
}
|
||||
|
||||
fn read(&self, sector: u64, buf: &mut [u8]) -> std::io::Result<()> {
|
||||
let offset = sector * 512;
|
||||
let volume = self.volume.lock().unwrap();
|
||||
|
||||
let bytes_read = volume.read_at(offset, buf).map_err(|e| {
|
||||
std::io::Error::new(std::io::ErrorKind::Other, format!("TinyVol read error: {}", e))
|
||||
})?;
|
||||
|
||||
// Zero-fill any remaining bytes (shouldn't happen normally)
|
||||
if bytes_read < buf.len() {
|
||||
buf[bytes_read..].fill(0);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write(&self, sector: u64, buf: &[u8]) -> std::io::Result<()> {
|
||||
if self.read_only {
|
||||
return Err(std::io::Error::new(
|
||||
std::io::ErrorKind::PermissionDenied,
|
||||
"device is read-only",
|
||||
));
|
||||
}
|
||||
|
||||
let offset = sector * 512;
|
||||
let volume = self.volume.lock().unwrap();
|
||||
|
||||
volume.write_at(offset, buf).map_err(|e| {
|
||||
std::io::Error::new(std::io::ErrorKind::Other, format!("TinyVol write error: {}", e))
|
||||
})?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn flush(&self) -> std::io::Result<()> {
|
||||
let volume = self.volume.lock().unwrap();
|
||||
volume.flush().map_err(|e| {
|
||||
std::io::Error::new(std::io::ErrorKind::Other, format!("TinyVol flush error: {}", e))
|
||||
})
|
||||
}
|
||||
|
||||
fn discard(&self, sector: u64, num_sectors: u64) -> std::io::Result<()> {
|
||||
// For TinyVol, discard can be implemented by writing zeros
|
||||
// which the delta layer will detect and handle efficiently
|
||||
if self.read_only {
|
||||
return Err(std::io::Error::new(
|
||||
std::io::ErrorKind::PermissionDenied,
|
||||
"device is read-only",
|
||||
));
|
||||
}
|
||||
|
||||
let offset = sector * 512;
|
||||
let len = num_sectors * 512;
|
||||
let volume = self.volume.lock().unwrap();
|
||||
|
||||
// Write zeros in block-sized chunks
|
||||
let zeros = vec![0u8; self.tinyvol_block_size as usize];
|
||||
let mut current = offset;
|
||||
let end = offset + len;
|
||||
|
||||
while current < end {
|
||||
let remaining = (end - current) as usize;
|
||||
let chunk = remaining.min(zeros.len());
|
||||
volume.write_at(current, &zeros[..chunk]).map_err(|e| {
|
||||
std::io::Error::new(std::io::ErrorKind::Other, format!("TinyVol discard error: {}", e))
|
||||
})?;
|
||||
current += chunk as u64;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_zeroes(&self, sector: u64, num_sectors: u64) -> std::io::Result<()> {
|
||||
self.discard(sector, num_sectors)
|
||||
}
|
||||
|
||||
fn device_id(&self) -> [u8; 20] {
|
||||
self.device_id
|
||||
}
|
||||
}
|
||||
|
||||
/// Statistics for a Stellarium-backed block device
|
||||
#[derive(Debug, Clone)]
|
||||
#[allow(dead_code)]
|
||||
pub struct StellariumBackendStats {
|
||||
/// Virtual size of the volume
|
||||
pub virtual_size: u64,
|
||||
/// TinyVol block size
|
||||
pub block_size: u32,
|
||||
/// Total number of blocks
|
||||
pub block_count: u64,
|
||||
/// Number of blocks modified (in delta layer)
|
||||
pub modified_blocks: u64,
|
||||
/// Size of the manifest
|
||||
pub manifest_size: usize,
|
||||
/// Size of the delta layer on disk
|
||||
pub delta_size: u64,
|
||||
/// Storage efficiency (actual / virtual)
|
||||
pub efficiency: f64,
|
||||
}
|
||||
|
||||
/// FNV-1a hash for device ID generation
|
||||
fn fnv1a_hash(data: &[u8]) -> u64 {
|
||||
let mut hash: u64 = 0xcbf29ce484222325;
|
||||
for &byte in data {
|
||||
hash ^= byte as u64;
|
||||
hash = hash.wrapping_mul(0x100000001b3);
|
||||
}
|
||||
hash
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use tempfile::tempdir;
|
||||
|
||||
#[test]
|
||||
fn test_create_and_read_write() {
|
||||
let dir = tempdir().unwrap();
|
||||
let vol_path = dir.path().join("test-vol");
|
||||
|
||||
// Create a 10MB volume with 4KB blocks
|
||||
let backend = StellariumBackend::create(&vol_path, 10 * 1024 * 1024, 4096).unwrap();
|
||||
|
||||
assert_eq!(backend.capacity(), 10 * 1024 * 1024);
|
||||
assert_eq!(backend.block_size(), 512);
|
||||
assert!(!backend.is_read_only());
|
||||
|
||||
// Write some data at sector 0
|
||||
let write_data = b"Hello, Stellarium VirtIO!";
|
||||
let mut padded = vec![0u8; 512];
|
||||
padded[..write_data.len()].copy_from_slice(write_data);
|
||||
backend.write(0, &padded).unwrap();
|
||||
|
||||
// Read it back
|
||||
let mut read_buf = vec![0u8; 512];
|
||||
backend.read(0, &mut read_buf).unwrap();
|
||||
assert_eq!(&read_buf[..write_data.len()], write_data);
|
||||
|
||||
// Unwritten sectors return zeros
|
||||
let mut zero_buf = vec![0u8; 512];
|
||||
backend.read(100, &mut zero_buf).unwrap();
|
||||
assert!(zero_buf.iter().all(|&b| b == 0));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_multi_sector_io() {
|
||||
let dir = tempdir().unwrap();
|
||||
let vol_path = dir.path().join("test-vol");
|
||||
|
||||
let backend = StellariumBackend::create(&vol_path, 10 * 1024 * 1024, 4096).unwrap();
|
||||
|
||||
// Write 4KB (8 sectors)
|
||||
let data: Vec<u8> = (0..4096).map(|i| (i % 256) as u8).collect();
|
||||
backend.write(0, &data).unwrap();
|
||||
|
||||
// Read back
|
||||
let mut buf = vec![0u8; 4096];
|
||||
backend.read(0, &mut buf).unwrap();
|
||||
assert_eq!(buf, data);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_flush() {
|
||||
let dir = tempdir().unwrap();
|
||||
let vol_path = dir.path().join("test-vol");
|
||||
|
||||
let backend = StellariumBackend::create(&vol_path, 10 * 1024 * 1024, 4096).unwrap();
|
||||
|
||||
let data = vec![0xAB; 512];
|
||||
backend.write(0, &data).unwrap();
|
||||
backend.flush().unwrap();
|
||||
|
||||
// Reopen and verify persistence
|
||||
let backend2 = StellariumBackend::open(&vol_path, false).unwrap();
|
||||
let mut buf = vec![0u8; 512];
|
||||
backend2.read(0, &mut buf).unwrap();
|
||||
assert_eq!(buf[0], 0xAB);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_instant_clone() {
|
||||
let dir = tempdir().unwrap();
|
||||
let vol_path = dir.path().join("original");
|
||||
let clone_path = dir.path().join("clone");
|
||||
|
||||
let backend = StellariumBackend::create(&vol_path, 10 * 1024 * 1024, 4096).unwrap();
|
||||
|
||||
// Write to original
|
||||
let data = vec![0x42; 512];
|
||||
backend.write(0, &data).unwrap();
|
||||
backend.flush().unwrap();
|
||||
|
||||
// Clone
|
||||
let clone = backend.clone_to(&clone_path).unwrap();
|
||||
assert_eq!(clone.capacity(), backend.capacity());
|
||||
|
||||
// Clone can write independently
|
||||
let clone_data = vec![0x99; 512];
|
||||
clone.write(100, &clone_data).unwrap();
|
||||
|
||||
// Original unaffected at sector 100
|
||||
let mut buf = vec![0u8; 512];
|
||||
backend.read(100, &mut buf).unwrap();
|
||||
assert!(buf.iter().all(|&b| b == 0));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_stats() {
|
||||
let dir = tempdir().unwrap();
|
||||
let vol_path = dir.path().join("test-vol");
|
||||
|
||||
let backend = StellariumBackend::create(&vol_path, 10 * 1024 * 1024, 4096).unwrap();
|
||||
|
||||
let stats = backend.stats();
|
||||
assert_eq!(stats.virtual_size, 10 * 1024 * 1024);
|
||||
assert_eq!(stats.block_size, 4096);
|
||||
assert_eq!(stats.modified_blocks, 0);
|
||||
|
||||
// Write a block
|
||||
backend.write(0, &vec![0xFF; 4096]).unwrap();
|
||||
|
||||
let stats2 = backend.stats();
|
||||
assert!(stats2.modified_blocks >= 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_device_id() {
|
||||
let dir = tempdir().unwrap();
|
||||
let vol_path = dir.path().join("test-vol");
|
||||
|
||||
let backend = StellariumBackend::create(&vol_path, 1024 * 1024, 4096).unwrap();
|
||||
let id = backend.device_id();
|
||||
|
||||
// Should have our tag
|
||||
assert_eq!(&id[8..16], b"STLR_BLK");
|
||||
}
|
||||
}
|
||||
745
vmm/src/devices/virtio/vhost_net.rs
Normal file
745
vmm/src/devices/virtio/vhost_net.rs
Normal file
@@ -0,0 +1,745 @@
|
||||
//! vhost-net Kernel Acceleration
|
||||
//!
|
||||
//! This module implements vhost-net support for virtio-net devices,
|
||||
//! allowing the kernel to handle packet processing directly without
|
||||
//! userspace involvement for the data path.
|
||||
//!
|
||||
//! # Architecture
|
||||
//!
|
||||
//! ```text
|
||||
//! Without vhost-net:
|
||||
//! ┌─────────┐ ┌─────────────┐ ┌───────────┐ ┌─────────┐
|
||||
//! │ Guest │───►│ KVM Exit │───►│ Volt │───►│ TAP │
|
||||
//! │ virtio │ │ (expensive) │ │ (process) │ │ Device │
|
||||
//! └─────────┘ └─────────────┘ └───────────┘ └─────────┘
|
||||
//!
|
||||
//! With vhost-net:
|
||||
//! ┌─────────┐ ┌─────────────────────────────────┐ ┌─────────┐
|
||||
//! │ Guest │───►│ vhost-net (kernel) │───►│ TAP │
|
||||
//! │ virtio │ │ - Direct virtqueue access │ │ Device │
|
||||
//! │ │ │ - Zero-copy when possible │ │ │
|
||||
//! └─────────┘ └─────────────────────────────────┘ └─────────┘
|
||||
//! ```
|
||||
//!
|
||||
//! # Performance Benefits
|
||||
//!
|
||||
//! - 30-50% higher throughput
|
||||
//! - Significantly lower latency
|
||||
//! - Reduced CPU usage
|
||||
//! - Minimal context switches
|
||||
|
||||
use std::fs::{File, OpenOptions};
|
||||
use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
|
||||
use std::sync::Arc;
|
||||
|
||||
use super::{Error, GuestMemory, Result};
|
||||
|
||||
// ============================================================================
|
||||
// vhost-net ioctl Constants
|
||||
// ============================================================================
|
||||
|
||||
/// vhost-net device path
|
||||
pub const VHOST_NET_PATH: &str = "/dev/vhost-net";
|
||||
|
||||
/// vhost ioctl base (same as KVM)
|
||||
const VHOST_VIRTIO: u64 = 0xAF;
|
||||
|
||||
/// Set the owner of the vhost backend
|
||||
const VHOST_SET_OWNER: u64 = request_code!(VHOST_VIRTIO, 0x01);
|
||||
|
||||
/// Reset the vhost backend owner
|
||||
const VHOST_RESET_OWNER: u64 = request_code!(VHOST_VIRTIO, 0x02);
|
||||
|
||||
/// Set memory region table
|
||||
const VHOST_SET_MEM_TABLE: u64 = request_code!(VHOST_VIRTIO, 0x03);
|
||||
|
||||
/// Set log base address
|
||||
const VHOST_SET_LOG_BASE: u64 = request_code!(VHOST_VIRTIO, 0x04);
|
||||
|
||||
/// Set log file descriptor
|
||||
const VHOST_SET_LOG_FD: u64 = request_code!(VHOST_VIRTIO, 0x07);
|
||||
|
||||
/// Set vring number of descriptors
|
||||
const VHOST_SET_VRING_NUM: u64 = request_code!(VHOST_VIRTIO, 0x10);
|
||||
|
||||
/// Set vring addresses
|
||||
const VHOST_SET_VRING_ADDR: u64 = request_code!(VHOST_VIRTIO, 0x11);
|
||||
|
||||
/// Set vring base index
|
||||
const VHOST_SET_VRING_BASE: u64 = request_code!(VHOST_VIRTIO, 0x12);
|
||||
|
||||
/// Get vring base index
|
||||
const VHOST_GET_VRING_BASE: u64 = request_code!(VHOST_VIRTIO, 0x12);
|
||||
|
||||
/// Set vring kick fd
|
||||
const VHOST_SET_VRING_KICK: u64 = request_code!(VHOST_VIRTIO, 0x20);
|
||||
|
||||
/// Set vring call fd
|
||||
const VHOST_SET_VRING_CALL: u64 = request_code!(VHOST_VIRTIO, 0x21);
|
||||
|
||||
/// Set vring error fd
|
||||
const VHOST_SET_VRING_ERR: u64 = request_code!(VHOST_VIRTIO, 0x22);
|
||||
|
||||
/// Get vhost features
|
||||
const VHOST_GET_FEATURES: u64 = request_code!(VHOST_VIRTIO, 0x00);
|
||||
|
||||
/// Set vhost features
|
||||
const VHOST_SET_FEATURES: u64 = request_code!(VHOST_VIRTIO, 0x00);
|
||||
|
||||
/// Set backend file descriptor (vhost-net specific)
|
||||
const VHOST_NET_SET_BACKEND: u64 = request_code!(VHOST_VIRTIO, 0x30);
|
||||
|
||||
/// Generate ioctl request code (similar to _IO macro)
|
||||
macro_rules! request_code {
|
||||
($type:expr, $nr:expr) => {
|
||||
(($type as u64) << 8) | ($nr as u64)
|
||||
};
|
||||
}
|
||||
|
||||
use request_code;
|
||||
|
||||
// ============================================================================
|
||||
// vhost-net Feature Bits
|
||||
// ============================================================================
|
||||
|
||||
/// vhost-net features
|
||||
pub mod vhost_features {
|
||||
/// Supports vhost-net MRG_RXBUF
|
||||
pub const VHOST_NET_F_VIRTIO_NET_HDR: u64 = 1 << 27;
|
||||
|
||||
/// Backend handles checksum
|
||||
pub const VHOST_F_LOG_ALL: u64 = 1 << 26;
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// vhost Structures
|
||||
// ============================================================================
|
||||
|
||||
/// Memory region for vhost
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Clone, Copy, Default)]
|
||||
pub struct VhostMemoryRegion {
|
||||
/// Guest physical address
|
||||
pub guest_phys_addr: u64,
|
||||
/// Size of the region
|
||||
pub memory_size: u64,
|
||||
/// Userspace address
|
||||
pub userspace_addr: u64,
|
||||
/// Flags (currently unused)
|
||||
pub flags_padding: u64,
|
||||
}
|
||||
|
||||
/// Memory table for vhost
|
||||
#[repr(C)]
|
||||
pub struct VhostMemory {
|
||||
/// Number of regions
|
||||
pub nregions: u32,
|
||||
/// Padding
|
||||
pub padding: u32,
|
||||
/// Memory regions (variable length array)
|
||||
pub regions: [VhostMemoryRegion; 0],
|
||||
}
|
||||
|
||||
/// Vring state
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Clone, Copy, Default)]
|
||||
pub struct VhostVringState {
|
||||
/// Queue index
|
||||
pub index: u32,
|
||||
/// Number of descriptors
|
||||
pub num: u32,
|
||||
}
|
||||
|
||||
/// Vring addresses
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Clone, Copy, Default)]
|
||||
pub struct VhostVringAddr {
|
||||
/// Queue index
|
||||
pub index: u32,
|
||||
/// Flags (LOG_DIRTY, etc.)
|
||||
pub flags: u32,
|
||||
/// Descriptor table address (user VA)
|
||||
pub desc_user_addr: u64,
|
||||
/// Used ring address (user VA)
|
||||
pub used_user_addr: u64,
|
||||
/// Available ring address (user VA)
|
||||
pub avail_user_addr: u64,
|
||||
/// Log address for dirty pages
|
||||
pub log_guest_addr: u64,
|
||||
}
|
||||
|
||||
/// Vring file descriptor
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Clone, Copy, Default)]
|
||||
pub struct VhostVringFile {
|
||||
/// Queue index
|
||||
pub index: u32,
|
||||
/// File descriptor (-1 to disable)
|
||||
pub fd: i32,
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Error Types
|
||||
// ============================================================================
|
||||
|
||||
/// vhost-net specific errors
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum VhostNetError {
|
||||
#[error("Failed to open /dev/vhost-net: {0}")]
|
||||
Open(std::io::Error),
|
||||
|
||||
#[error("vhost ioctl failed: {ioctl} - {error}")]
|
||||
Ioctl {
|
||||
ioctl: &'static str,
|
||||
error: std::io::Error,
|
||||
},
|
||||
|
||||
#[error("vhost-net not available (module not loaded?)")]
|
||||
NotAvailable,
|
||||
|
||||
#[error("Failed to create eventfd: {0}")]
|
||||
EventFd(std::io::Error),
|
||||
|
||||
#[error("Memory region not contiguous")]
|
||||
NonContiguousMemory,
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// vhost-net Backend
|
||||
// ============================================================================
|
||||
|
||||
/// vhost-net backend for virtio-net
|
||||
pub struct VhostNetBackend {
|
||||
/// vhost-net file descriptor
|
||||
vhost_file: File,
|
||||
/// TAP file descriptor
|
||||
tap_fd: RawFd,
|
||||
/// Kick eventfds (one per queue)
|
||||
kick_fds: Vec<RawFd>,
|
||||
/// Call eventfds (one per queue)
|
||||
call_fds: Vec<RawFd>,
|
||||
/// Number of queues configured
|
||||
num_queues: usize,
|
||||
/// Whether backend is activated
|
||||
activated: bool,
|
||||
}
|
||||
|
||||
impl VhostNetBackend {
|
||||
/// Check if vhost-net is available on this system
|
||||
pub fn is_available() -> bool {
|
||||
std::path::Path::new(VHOST_NET_PATH).exists()
|
||||
}
|
||||
|
||||
/// Create a new vhost-net backend
|
||||
pub fn new(tap_fd: RawFd) -> std::result::Result<Self, VhostNetError> {
|
||||
let vhost_file = OpenOptions::new()
|
||||
.read(true)
|
||||
.write(true)
|
||||
.open(VHOST_NET_PATH)
|
||||
.map_err(VhostNetError::Open)?;
|
||||
|
||||
// Set owner
|
||||
let ret = unsafe { libc::ioctl(vhost_file.as_raw_fd(), VHOST_SET_OWNER as libc::c_ulong) };
|
||||
if ret < 0 {
|
||||
return Err(VhostNetError::Ioctl {
|
||||
ioctl: "VHOST_SET_OWNER",
|
||||
error: std::io::Error::last_os_error(),
|
||||
});
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
vhost_file,
|
||||
tap_fd,
|
||||
kick_fds: Vec::new(),
|
||||
call_fds: Vec::new(),
|
||||
num_queues: 0,
|
||||
activated: false,
|
||||
})
|
||||
}
|
||||
|
||||
/// Get vhost-net features
|
||||
pub fn get_features(&self) -> std::result::Result<u64, VhostNetError> {
|
||||
let mut features: u64 = 0;
|
||||
let ret = unsafe {
|
||||
libc::ioctl(
|
||||
self.vhost_file.as_raw_fd(),
|
||||
VHOST_GET_FEATURES as libc::c_ulong,
|
||||
&mut features as *mut u64,
|
||||
)
|
||||
};
|
||||
|
||||
if ret < 0 {
|
||||
return Err(VhostNetError::Ioctl {
|
||||
ioctl: "VHOST_GET_FEATURES",
|
||||
error: std::io::Error::last_os_error(),
|
||||
});
|
||||
}
|
||||
|
||||
Ok(features)
|
||||
}
|
||||
|
||||
/// Set vhost-net features
|
||||
pub fn set_features(&self, features: u64) -> std::result::Result<(), VhostNetError> {
|
||||
let ret = unsafe {
|
||||
libc::ioctl(
|
||||
self.vhost_file.as_raw_fd(),
|
||||
VHOST_SET_FEATURES as libc::c_ulong,
|
||||
&features as *const u64,
|
||||
)
|
||||
};
|
||||
|
||||
if ret < 0 {
|
||||
return Err(VhostNetError::Ioctl {
|
||||
ioctl: "VHOST_SET_FEATURES",
|
||||
error: std::io::Error::last_os_error(),
|
||||
});
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Set memory table for vhost
|
||||
pub fn set_mem_table(
|
||||
&self,
|
||||
regions: &[VhostMemoryRegion],
|
||||
) -> std::result::Result<(), VhostNetError> {
|
||||
// Allocate memory for the structure
|
||||
let total_size =
|
||||
std::mem::size_of::<u32>() * 2 + regions.len() * std::mem::size_of::<VhostMemoryRegion>();
|
||||
let mut buffer = vec![0u8; total_size];
|
||||
|
||||
// Fill in the structure
|
||||
let nregions = regions.len() as u32;
|
||||
buffer[0..4].copy_from_slice(&nregions.to_ne_bytes());
|
||||
// padding at [4..8]
|
||||
|
||||
for (i, region) in regions.iter().enumerate() {
|
||||
let offset = 8 + i * std::mem::size_of::<VhostMemoryRegion>();
|
||||
let region_bytes = unsafe {
|
||||
std::slice::from_raw_parts(
|
||||
region as *const VhostMemoryRegion as *const u8,
|
||||
std::mem::size_of::<VhostMemoryRegion>(),
|
||||
)
|
||||
};
|
||||
buffer[offset..offset + std::mem::size_of::<VhostMemoryRegion>()]
|
||||
.copy_from_slice(region_bytes);
|
||||
}
|
||||
|
||||
let ret = unsafe {
|
||||
libc::ioctl(
|
||||
self.vhost_file.as_raw_fd(),
|
||||
VHOST_SET_MEM_TABLE as libc::c_ulong,
|
||||
buffer.as_ptr(),
|
||||
)
|
||||
};
|
||||
|
||||
if ret < 0 {
|
||||
return Err(VhostNetError::Ioctl {
|
||||
ioctl: "VHOST_SET_MEM_TABLE",
|
||||
error: std::io::Error::last_os_error(),
|
||||
});
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Set vring number of descriptors
|
||||
pub fn set_vring_num(
|
||||
&self,
|
||||
queue_index: u32,
|
||||
num: u32,
|
||||
) -> std::result::Result<(), VhostNetError> {
|
||||
let state = VhostVringState {
|
||||
index: queue_index,
|
||||
num,
|
||||
};
|
||||
|
||||
let ret = unsafe {
|
||||
libc::ioctl(
|
||||
self.vhost_file.as_raw_fd(),
|
||||
VHOST_SET_VRING_NUM as libc::c_ulong,
|
||||
&state as *const VhostVringState,
|
||||
)
|
||||
};
|
||||
|
||||
if ret < 0 {
|
||||
return Err(VhostNetError::Ioctl {
|
||||
ioctl: "VHOST_SET_VRING_NUM",
|
||||
error: std::io::Error::last_os_error(),
|
||||
});
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Set vring base (starting index)
|
||||
pub fn set_vring_base(
|
||||
&self,
|
||||
queue_index: u32,
|
||||
base: u32,
|
||||
) -> std::result::Result<(), VhostNetError> {
|
||||
let state = VhostVringState {
|
||||
index: queue_index,
|
||||
num: base,
|
||||
};
|
||||
|
||||
let ret = unsafe {
|
||||
libc::ioctl(
|
||||
self.vhost_file.as_raw_fd(),
|
||||
VHOST_SET_VRING_BASE as libc::c_ulong,
|
||||
&state as *const VhostVringState,
|
||||
)
|
||||
};
|
||||
|
||||
if ret < 0 {
|
||||
return Err(VhostNetError::Ioctl {
|
||||
ioctl: "VHOST_SET_VRING_BASE",
|
||||
error: std::io::Error::last_os_error(),
|
||||
});
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Set vring addresses
|
||||
pub fn set_vring_addr(&self, addr: &VhostVringAddr) -> std::result::Result<(), VhostNetError> {
|
||||
let ret = unsafe {
|
||||
libc::ioctl(
|
||||
self.vhost_file.as_raw_fd(),
|
||||
VHOST_SET_VRING_ADDR as libc::c_ulong,
|
||||
addr as *const VhostVringAddr,
|
||||
)
|
||||
};
|
||||
|
||||
if ret < 0 {
|
||||
return Err(VhostNetError::Ioctl {
|
||||
ioctl: "VHOST_SET_VRING_ADDR",
|
||||
error: std::io::Error::last_os_error(),
|
||||
});
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Set vring kick fd (for notifying vhost)
|
||||
pub fn set_vring_kick(
|
||||
&self,
|
||||
queue_index: u32,
|
||||
fd: RawFd,
|
||||
) -> std::result::Result<(), VhostNetError> {
|
||||
let file = VhostVringFile {
|
||||
index: queue_index,
|
||||
fd: fd as i32,
|
||||
};
|
||||
|
||||
let ret = unsafe {
|
||||
libc::ioctl(
|
||||
self.vhost_file.as_raw_fd(),
|
||||
VHOST_SET_VRING_KICK as libc::c_ulong,
|
||||
&file as *const VhostVringFile,
|
||||
)
|
||||
};
|
||||
|
||||
if ret < 0 {
|
||||
return Err(VhostNetError::Ioctl {
|
||||
ioctl: "VHOST_SET_VRING_KICK",
|
||||
error: std::io::Error::last_os_error(),
|
||||
});
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Set vring call fd (for vhost to notify guest)
|
||||
pub fn set_vring_call(
|
||||
&self,
|
||||
queue_index: u32,
|
||||
fd: RawFd,
|
||||
) -> std::result::Result<(), VhostNetError> {
|
||||
let file = VhostVringFile {
|
||||
index: queue_index,
|
||||
fd: fd as i32,
|
||||
};
|
||||
|
||||
let ret = unsafe {
|
||||
libc::ioctl(
|
||||
self.vhost_file.as_raw_fd(),
|
||||
VHOST_SET_VRING_CALL as libc::c_ulong,
|
||||
&file as *const VhostVringFile,
|
||||
)
|
||||
};
|
||||
|
||||
if ret < 0 {
|
||||
return Err(VhostNetError::Ioctl {
|
||||
ioctl: "VHOST_SET_VRING_CALL",
|
||||
error: std::io::Error::last_os_error(),
|
||||
});
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Set the TAP backend for a queue
|
||||
pub fn set_backend(&self, queue_index: u32) -> std::result::Result<(), VhostNetError> {
|
||||
let file = VhostVringFile {
|
||||
index: queue_index,
|
||||
fd: self.tap_fd as i32,
|
||||
};
|
||||
|
||||
let ret = unsafe {
|
||||
libc::ioctl(
|
||||
self.vhost_file.as_raw_fd(),
|
||||
VHOST_NET_SET_BACKEND as libc::c_ulong,
|
||||
&file as *const VhostVringFile,
|
||||
)
|
||||
};
|
||||
|
||||
if ret < 0 {
|
||||
return Err(VhostNetError::Ioctl {
|
||||
ioctl: "VHOST_NET_SET_BACKEND",
|
||||
error: std::io::Error::last_os_error(),
|
||||
});
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Disable backend for a queue
|
||||
pub fn disable_backend(&self, queue_index: u32) -> std::result::Result<(), VhostNetError> {
|
||||
let file = VhostVringFile {
|
||||
index: queue_index,
|
||||
fd: -1,
|
||||
};
|
||||
|
||||
let ret = unsafe {
|
||||
libc::ioctl(
|
||||
self.vhost_file.as_raw_fd(),
|
||||
VHOST_NET_SET_BACKEND as libc::c_ulong,
|
||||
&file as *const VhostVringFile,
|
||||
)
|
||||
};
|
||||
|
||||
if ret < 0 {
|
||||
return Err(VhostNetError::Ioctl {
|
||||
ioctl: "VHOST_NET_SET_BACKEND (disable)",
|
||||
error: std::io::Error::last_os_error(),
|
||||
});
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Create an eventfd
|
||||
fn create_eventfd() -> std::result::Result<RawFd, VhostNetError> {
|
||||
let fd = unsafe { libc::eventfd(0, libc::EFD_CLOEXEC | libc::EFD_NONBLOCK) };
|
||||
if fd < 0 {
|
||||
return Err(VhostNetError::EventFd(std::io::Error::last_os_error()));
|
||||
}
|
||||
Ok(fd)
|
||||
}
|
||||
|
||||
/// Setup a vring (queue) for vhost-net operation
|
||||
pub fn setup_vring(
|
||||
&mut self,
|
||||
queue_index: u32,
|
||||
queue_size: u32,
|
||||
desc_addr: u64,
|
||||
avail_addr: u64,
|
||||
used_addr: u64,
|
||||
) -> std::result::Result<(RawFd, RawFd), VhostNetError> {
|
||||
// Create eventfds for this queue
|
||||
let kick_fd = Self::create_eventfd()?;
|
||||
let call_fd = Self::create_eventfd()?;
|
||||
|
||||
// Set queue size
|
||||
self.set_vring_num(queue_index, queue_size)?;
|
||||
|
||||
// Set base index to 0
|
||||
self.set_vring_base(queue_index, 0)?;
|
||||
|
||||
// Set addresses
|
||||
let addr = VhostVringAddr {
|
||||
index: queue_index,
|
||||
flags: 0,
|
||||
desc_user_addr: desc_addr,
|
||||
used_user_addr: used_addr,
|
||||
avail_user_addr: avail_addr,
|
||||
log_guest_addr: 0,
|
||||
};
|
||||
self.set_vring_addr(&addr)?;
|
||||
|
||||
// Set kick and call fds
|
||||
self.set_vring_kick(queue_index, kick_fd)?;
|
||||
self.set_vring_call(queue_index, call_fd)?;
|
||||
|
||||
// Store fds
|
||||
while self.kick_fds.len() <= queue_index as usize {
|
||||
self.kick_fds.push(-1);
|
||||
}
|
||||
while self.call_fds.len() <= queue_index as usize {
|
||||
self.call_fds.push(-1);
|
||||
}
|
||||
self.kick_fds[queue_index as usize] = kick_fd;
|
||||
self.call_fds[queue_index as usize] = call_fd;
|
||||
|
||||
self.num_queues = self.num_queues.max(queue_index as usize + 1);
|
||||
|
||||
Ok((kick_fd, call_fd))
|
||||
}
|
||||
|
||||
/// Activate vhost-net backend for all configured queues
|
||||
pub fn activate(&mut self) -> std::result::Result<(), VhostNetError> {
|
||||
for i in 0..self.num_queues {
|
||||
self.set_backend(i as u32)?;
|
||||
}
|
||||
self.activated = true;
|
||||
tracing::info!("vhost-net activated for {} queues", self.num_queues);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Deactivate vhost-net backend
|
||||
pub fn deactivate(&mut self) -> std::result::Result<(), VhostNetError> {
|
||||
if self.activated {
|
||||
for i in 0..self.num_queues {
|
||||
self.disable_backend(i as u32)?;
|
||||
}
|
||||
self.activated = false;
|
||||
tracing::info!("vhost-net deactivated");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get kick eventfd for a queue
|
||||
pub fn kick_fd(&self, queue_index: usize) -> Option<RawFd> {
|
||||
self.kick_fds.get(queue_index).copied().filter(|&fd| fd >= 0)
|
||||
}
|
||||
|
||||
/// Get call eventfd for a queue
|
||||
pub fn call_fd(&self, queue_index: usize) -> Option<RawFd> {
|
||||
self.call_fds.get(queue_index).copied().filter(|&fd| fd >= 0)
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for VhostNetBackend {
|
||||
fn drop(&mut self) {
|
||||
// Deactivate backend
|
||||
let _ = self.deactivate();
|
||||
|
||||
// Close eventfds
|
||||
for &fd in &self.kick_fds {
|
||||
if fd >= 0 {
|
||||
unsafe { libc::close(fd) };
|
||||
}
|
||||
}
|
||||
for &fd in &self.call_fds {
|
||||
if fd >= 0 {
|
||||
unsafe { libc::close(fd) };
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// VhostNet-enabled VirtioNet Builder
|
||||
// ============================================================================
|
||||
|
||||
/// Builder for creating vhost-net accelerated virtio-net devices
|
||||
pub struct VhostNetBuilder {
|
||||
/// TAP device name
|
||||
tap_name: Option<String>,
|
||||
/// MAC address
|
||||
mac: Option<[u8; 6]>,
|
||||
/// Enable vhost-net (default: true if available)
|
||||
vhost: bool,
|
||||
/// Number of queue pairs (for multiqueue)
|
||||
queue_pairs: u32,
|
||||
}
|
||||
|
||||
impl Default for VhostNetBuilder {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
tap_name: None,
|
||||
mac: None,
|
||||
vhost: VhostNetBackend::is_available(),
|
||||
queue_pairs: 1,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl VhostNetBuilder {
|
||||
/// Create a new builder
|
||||
pub fn new() -> Self {
|
||||
Self::default()
|
||||
}
|
||||
|
||||
/// Set the TAP device name
|
||||
pub fn tap_name(mut self, name: impl Into<String>) -> Self {
|
||||
self.tap_name = Some(name.into());
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the MAC address
|
||||
pub fn mac(mut self, mac: [u8; 6]) -> Self {
|
||||
self.mac = Some(mac);
|
||||
self
|
||||
}
|
||||
|
||||
/// Enable or disable vhost-net
|
||||
pub fn vhost(mut self, enabled: bool) -> Self {
|
||||
self.vhost = enabled && VhostNetBackend::is_available();
|
||||
self
|
||||
}
|
||||
|
||||
/// Set number of queue pairs (for multiqueue)
|
||||
pub fn queue_pairs(mut self, pairs: u32) -> Self {
|
||||
self.queue_pairs = pairs.max(1);
|
||||
self
|
||||
}
|
||||
|
||||
/// Check if vhost-net will be used
|
||||
pub fn will_use_vhost(&self) -> bool {
|
||||
self.vhost
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Tests
|
||||
// ============================================================================
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_vhost_availability_check() {
|
||||
// This just checks we can query availability without crashing
|
||||
let _available = VhostNetBackend::is_available();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_vhost_vring_state_size() {
|
||||
assert_eq!(std::mem::size_of::<VhostVringState>(), 8);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_vhost_vring_addr_size() {
|
||||
assert_eq!(std::mem::size_of::<VhostVringAddr>(), 48);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_vhost_vring_file_size() {
|
||||
assert_eq!(std::mem::size_of::<VhostVringFile>(), 8);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_vhost_memory_region_size() {
|
||||
assert_eq!(std::mem::size_of::<VhostMemoryRegion>(), 32);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_builder_defaults() {
|
||||
let builder = VhostNetBuilder::new();
|
||||
assert_eq!(builder.queue_pairs, 1);
|
||||
assert!(builder.tap_name.is_none());
|
||||
}
|
||||
}
|
||||
508
vmm/src/kvm/cpuid.rs
Normal file
508
vmm/src/kvm/cpuid.rs
Normal file
@@ -0,0 +1,508 @@
|
||||
//! CPUID Configuration and Filtering
|
||||
//!
|
||||
//! Configures the CPUID entries presented to the guest vCPU.
|
||||
//! Uses KVM_GET_SUPPORTED_CPUID to get host capabilities, then filters
|
||||
//! and modifies entries to create a minimal, secure vCPU profile.
|
||||
//!
|
||||
//! This is critical for Linux kernel boot — the kernel checks CPUID to
|
||||
//! determine which features are available before using them. Without proper
|
||||
//! CPUID configuration, the kernel may attempt to enable features (like SYSCALL)
|
||||
//! that aren't advertised, causing #GP faults.
|
||||
|
||||
use kvm_bindings::{kvm_cpuid_entry2, CpuId, KVM_MAX_CPUID_ENTRIES};
|
||||
use kvm_ioctls::{Kvm, VcpuFd};
|
||||
use std::sync::Mutex;
|
||||
|
||||
use super::{KvmError, Result};
|
||||
|
||||
/// Cached host CPUID table. The supported CPUID is the same for every VM
|
||||
/// on the same host, so we compute it once and clone per-VM.
|
||||
/// Uses Mutex<Option<CpuId>> because OnceLock::get_or_try_init is unstable.
|
||||
static CACHED_HOST_CPUID: Mutex<Option<CpuId>> = Mutex::new(None);
|
||||
|
||||
/// CPUID feature bits for leaf 0x1, ECX
|
||||
#[allow(dead_code)] // x86 architecture constants — kept for completeness
|
||||
mod leaf1_ecx {
|
||||
pub const SSE3: u32 = 1 << 0;
|
||||
pub const PCLMULQDQ: u32 = 1 << 1;
|
||||
pub const DTES64: u32 = 1 << 2;
|
||||
pub const MONITOR: u32 = 1 << 3;
|
||||
pub const DS_CPL: u32 = 1 << 4;
|
||||
pub const VMX: u32 = 1 << 5;
|
||||
pub const SMX: u32 = 1 << 6;
|
||||
pub const EIST: u32 = 1 << 7;
|
||||
pub const TM2: u32 = 1 << 8;
|
||||
pub const SSSE3: u32 = 1 << 9;
|
||||
pub const TSC_DEADLINE: u32 = 1 << 24;
|
||||
pub const HYPERVISOR: u32 = 1 << 31;
|
||||
pub const AES: u32 = 1 << 25;
|
||||
pub const XSAVE: u32 = 1 << 26;
|
||||
pub const OSXSAVE: u32 = 1 << 27;
|
||||
pub const AVX: u32 = 1 << 28;
|
||||
pub const X2APIC: u32 = 1 << 21;
|
||||
pub const MOVBE: u32 = 1 << 22;
|
||||
pub const POPCNT: u32 = 1 << 23;
|
||||
pub const SSE41: u32 = 1 << 19;
|
||||
pub const SSE42: u32 = 1 << 20;
|
||||
pub const FMA: u32 = 1 << 12;
|
||||
pub const CX16: u32 = 1 << 13;
|
||||
pub const PDCM: u32 = 1 << 15;
|
||||
pub const PCID: u32 = 1 << 17;
|
||||
pub const F16C: u32 = 1 << 29;
|
||||
pub const RDRAND: u32 = 1 << 30;
|
||||
}
|
||||
|
||||
/// CPUID feature bits for leaf 0x1, EDX
|
||||
#[allow(dead_code)] // x86 architecture constants — kept for completeness
|
||||
mod leaf1_edx {
|
||||
pub const FPU: u32 = 1 << 0;
|
||||
pub const VME: u32 = 1 << 1;
|
||||
pub const DE: u32 = 1 << 2;
|
||||
pub const PSE: u32 = 1 << 3;
|
||||
pub const TSC: u32 = 1 << 4;
|
||||
pub const MSR: u32 = 1 << 5;
|
||||
pub const PAE: u32 = 1 << 6;
|
||||
pub const MCE: u32 = 1 << 7;
|
||||
pub const CX8: u32 = 1 << 8;
|
||||
pub const APIC: u32 = 1 << 9;
|
||||
pub const SEP: u32 = 1 << 11;
|
||||
pub const MTRR: u32 = 1 << 12;
|
||||
pub const PGE: u32 = 1 << 13;
|
||||
pub const MCA: u32 = 1 << 14;
|
||||
pub const CMOV: u32 = 1 << 15;
|
||||
pub const PAT: u32 = 1 << 16;
|
||||
pub const PSE36: u32 = 1 << 17;
|
||||
pub const CLFLUSH: u32 = 1 << 19;
|
||||
pub const MMX: u32 = 1 << 23;
|
||||
pub const FXSR: u32 = 1 << 24;
|
||||
pub const SSE: u32 = 1 << 25;
|
||||
pub const SSE2: u32 = 1 << 26;
|
||||
pub const HTT: u32 = 1 << 28;
|
||||
}
|
||||
|
||||
/// CPUID feature bits for leaf 0x7, subleaf 0, EBX
|
||||
#[allow(dead_code)] // x86 architecture constants — kept for completeness
|
||||
mod leaf7_ebx {
|
||||
pub const FSGSBASE: u32 = 1 << 0;
|
||||
pub const BMI1: u32 = 1 << 3;
|
||||
pub const HLE: u32 = 1 << 4; // TSX Hardware Lock Elision
|
||||
pub const AVX2: u32 = 1 << 5;
|
||||
pub const SMEP: u32 = 1 << 7;
|
||||
pub const BMI2: u32 = 1 << 8;
|
||||
pub const ERMS: u32 = 1 << 9;
|
||||
pub const INVPCID: u32 = 1 << 10;
|
||||
pub const RTM: u32 = 1 << 11; // TSX Restricted Transactional Memory
|
||||
pub const RDT_M: u32 = 1 << 12;
|
||||
pub const RDT_A: u32 = 1 << 15;
|
||||
pub const MPX: u32 = 1 << 14;
|
||||
pub const RDSEED: u32 = 1 << 18;
|
||||
pub const ADX: u32 = 1 << 19;
|
||||
pub const SMAP: u32 = 1 << 20;
|
||||
pub const CLFLUSHOPT: u32 = 1 << 23;
|
||||
pub const CLWB: u32 = 1 << 24;
|
||||
pub const SHA: u32 = 1 << 29;
|
||||
}
|
||||
|
||||
/// CPUID feature bits for leaf 0x7, subleaf 0, ECX
|
||||
#[allow(dead_code)] // x86 architecture constants — kept for completeness
|
||||
mod leaf7_ecx {
|
||||
pub const UMIP: u32 = 1 << 2;
|
||||
pub const PKU: u32 = 1 << 3;
|
||||
pub const OSPKE: u32 = 1 << 4;
|
||||
pub const LA57: u32 = 1 << 16;
|
||||
pub const RDPID: u32 = 1 << 22;
|
||||
}
|
||||
|
||||
/// CPUID feature bits for leaf 0x7, subleaf 0, EDX
|
||||
#[allow(dead_code)] // x86 architecture constants — kept for completeness
|
||||
mod leaf7_edx {
|
||||
pub const SPEC_CTRL: u32 = 1 << 26; // IBRS/IBPB
|
||||
pub const STIBP: u32 = 1 << 27;
|
||||
pub const ARCH_CAP: u32 = 1 << 29;
|
||||
pub const SSBD: u32 = 1 << 31;
|
||||
}
|
||||
|
||||
/// Extended feature bits for leaf 0x80000001, ECX
|
||||
#[allow(dead_code)] // x86 architecture constants — kept for completeness
|
||||
mod ext_leaf1_ecx {
|
||||
pub const LAHF_LM: u32 = 1 << 0;
|
||||
pub const ABM: u32 = 1 << 5; // LZCNT
|
||||
pub const SSE4A: u32 = 1 << 6;
|
||||
pub const PREFETCHW: u32 = 1 << 8;
|
||||
pub const TOPOEXT: u32 = 1 << 22;
|
||||
}
|
||||
|
||||
/// Extended feature bits for leaf 0x80000001, EDX
|
||||
#[allow(dead_code)] // x86 architecture constants — kept for completeness
|
||||
mod ext_leaf1_edx {
|
||||
pub const SYSCALL: u32 = 1 << 11;
|
||||
pub const NX: u32 = 1 << 20;
|
||||
pub const PDPE1GB: u32 = 1 << 26; // 1GB huge pages
|
||||
pub const RDTSCP: u32 = 1 << 27;
|
||||
pub const LM: u32 = 1 << 29; // Long Mode (64-bit)
|
||||
}
|
||||
|
||||
/// CPUID configuration for a vCPU
|
||||
pub struct CpuidConfig {
|
||||
/// Number of vCPUs
|
||||
pub vcpu_count: u8,
|
||||
/// vCPU index (0-based)
|
||||
pub vcpu_id: u8,
|
||||
}
|
||||
|
||||
/// Get filtered CPUID entries for a vCPU
|
||||
///
|
||||
/// This is the main entry point for CPUID configuration. It:
|
||||
/// 1. Gets the host-supported CPUID from KVM (cached after first call)
|
||||
/// 2. Filters entries to create a minimal, secure profile
|
||||
/// 3. Returns the filtered CPUID ready for KVM_SET_CPUID2
|
||||
///
|
||||
/// The KVM_GET_SUPPORTED_CPUID ioctl result is cached because it returns
|
||||
/// the same data for every VM on the same host (it reflects CPU + KVM
|
||||
/// capabilities, not per-VM state). This saves ~40ms on subsequent VMs.
|
||||
pub fn get_filtered_cpuid(kvm: &Kvm, config: &CpuidConfig) -> Result<CpuId> {
|
||||
// Clone from cache, or populate cache on first call
|
||||
let mut cpuid = {
|
||||
let mut cache = CACHED_HOST_CPUID.lock().unwrap();
|
||||
if let Some(ref cached) = *cache {
|
||||
cached.clone()
|
||||
} else {
|
||||
let host_cpuid = kvm
|
||||
.get_supported_cpuid(KVM_MAX_CPUID_ENTRIES)
|
||||
.map_err(|e| KvmError::GetRegisters(e))?;
|
||||
tracing::debug!(
|
||||
"Host CPUID cached: {} entries",
|
||||
host_cpuid.as_slice().len()
|
||||
);
|
||||
*cache = Some(host_cpuid.clone());
|
||||
host_cpuid
|
||||
}
|
||||
};
|
||||
|
||||
// Apply filters to each entry
|
||||
filter_cpuid_entries(&mut cpuid, config);
|
||||
|
||||
tracing::info!(
|
||||
"CPUID configured: {} entries for vCPU {}",
|
||||
cpuid.as_slice().len(),
|
||||
config.vcpu_id
|
||||
);
|
||||
|
||||
Ok(cpuid)
|
||||
}
|
||||
|
||||
/// Apply CPUID to a vCPU
|
||||
pub fn apply_cpuid(vcpu_fd: &VcpuFd, cpuid: &CpuId) -> Result<()> {
|
||||
vcpu_fd.set_cpuid2(cpuid).map_err(KvmError::SetRegisters)?;
|
||||
tracing::debug!("CPUID applied to vCPU");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Filter and modify CPUID entries
|
||||
fn filter_cpuid_entries(cpuid: &mut CpuId, config: &CpuidConfig) {
|
||||
let entries = cpuid.as_mut_slice();
|
||||
|
||||
for entry in entries.iter_mut() {
|
||||
match entry.function {
|
||||
// Leaf 0x0: Vendor ID and max standard leaf
|
||||
0x0 => {
|
||||
// Keep the host vendor string — changing it can cause issues
|
||||
// with CPU-specific code paths in the kernel
|
||||
tracing::debug!(
|
||||
"CPUID leaf 0x0: max_leaf={}, vendor={:x}-{:x}-{:x}",
|
||||
entry.eax,
|
||||
entry.ebx,
|
||||
entry.edx,
|
||||
entry.ecx
|
||||
);
|
||||
}
|
||||
|
||||
// Leaf 0x1: Feature Information
|
||||
0x1 => {
|
||||
filter_leaf_1(entry, config);
|
||||
}
|
||||
|
||||
// Leaf 0x4: Deterministic cache parameters
|
||||
0x4 => {
|
||||
filter_leaf_4(entry, config);
|
||||
}
|
||||
|
||||
// Leaf 0x6: Thermal and Power Management
|
||||
0x6 => {
|
||||
// Clear all — we don't expose power management to guest
|
||||
entry.eax = 0;
|
||||
entry.ebx = 0;
|
||||
entry.ecx = 0;
|
||||
entry.edx = 0;
|
||||
}
|
||||
|
||||
// Leaf 0x7: Structured Extended Feature Flags
|
||||
0x7 => {
|
||||
if entry.index == 0 {
|
||||
filter_leaf_7(entry);
|
||||
}
|
||||
}
|
||||
|
||||
// Leaf 0xA: Performance Monitoring
|
||||
0xa => {
|
||||
// Disable performance monitoring in guest
|
||||
entry.eax = 0;
|
||||
entry.ebx = 0;
|
||||
entry.ecx = 0;
|
||||
entry.edx = 0;
|
||||
}
|
||||
|
||||
// Leaf 0xB: Extended Topology Enumeration
|
||||
0xb => {
|
||||
filter_leaf_0xb(entry, config);
|
||||
}
|
||||
|
||||
// Leaf 0x15: TSC/Core Crystal Clock
|
||||
0x15 => {
|
||||
// Pass through — needed for accurate timekeeping
|
||||
}
|
||||
|
||||
// Leaf 0x16: Processor Frequency Information
|
||||
0x16 => {
|
||||
// Pass through — informational only
|
||||
}
|
||||
|
||||
// Leaf 0x40000000-0x4FFFFFFF: Hypervisor leaves
|
||||
0x40000000 => {
|
||||
// Set up KVM hypervisor signature
|
||||
// This tells the kernel it's running under KVM
|
||||
entry.eax = 0x40000001; // Max hypervisor leaf
|
||||
entry.ebx = 0x4b4d564b; // "KVMK"
|
||||
entry.ecx = 0x564b4d56; // "VMKV"
|
||||
entry.edx = 0x4d; // "M\0\0\0"
|
||||
}
|
||||
|
||||
// Leaf 0x80000000: Extended function max leaf
|
||||
0x80000000 => {
|
||||
// Ensure we report at least 0x80000008 for address sizes
|
||||
if entry.eax < 0x80000008 {
|
||||
entry.eax = 0x80000008;
|
||||
}
|
||||
}
|
||||
|
||||
// Leaf 0x80000001: Extended Processor Info and Features
|
||||
0x80000001 => {
|
||||
filter_ext_leaf_1(entry);
|
||||
}
|
||||
|
||||
// Leaves 0x80000002-0x80000004: Brand string
|
||||
0x80000002..=0x80000004 => {
|
||||
// Pass through host brand string
|
||||
}
|
||||
|
||||
// Leaf 0x80000005: L1 Cache and TLB (AMD only)
|
||||
0x80000005 => {
|
||||
// Pass through
|
||||
}
|
||||
|
||||
// Leaf 0x80000006: L2 Cache
|
||||
0x80000006 => {
|
||||
// Pass through
|
||||
}
|
||||
|
||||
// Leaf 0x80000007: Advanced Power Management
|
||||
0x80000007 => {
|
||||
// Only keep invariant TSC flag (EDX bit 8)
|
||||
entry.eax = 0;
|
||||
entry.ebx = 0;
|
||||
entry.ecx = 0;
|
||||
entry.edx &= 1 << 8; // Invariant TSC
|
||||
}
|
||||
|
||||
// Leaf 0x80000008: Virtual/Physical Address Sizes
|
||||
0x80000008 => {
|
||||
// Pass through — needed for correct address width detection
|
||||
}
|
||||
|
||||
_ => {
|
||||
// For unknown leaves, pass through what KVM reports
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Filter leaf 0x1: Feature Information
|
||||
fn filter_leaf_1(entry: &mut kvm_cpuid_entry2, config: &CpuidConfig) {
|
||||
// EAX: Version information — pass through host values
|
||||
|
||||
// EBX: Additional info
|
||||
// Set initial APIC ID to vcpu_id
|
||||
entry.ebx = (entry.ebx & 0x00FFFFFF) | ((config.vcpu_id as u32) << 24);
|
||||
// Set logical processor count
|
||||
entry.ebx = (entry.ebx & 0xFF00FFFF) | ((config.vcpu_count as u32) << 16);
|
||||
// CLFLUSH line size (8 * 8 = 64 bytes)
|
||||
entry.ebx = (entry.ebx & 0xFFFF00FF) | (8 << 8);
|
||||
|
||||
// ECX: Feature flags
|
||||
// Strip features not suitable for guests
|
||||
entry.ecx &= !(leaf1_ecx::DTES64 // Debug Trace Store
|
||||
| leaf1_ecx::MONITOR // MONITOR/MWAIT (triggers VM exits)
|
||||
| leaf1_ecx::DS_CPL // CPL Qualified Debug Store
|
||||
| leaf1_ecx::VMX // Nested virtualization not supported
|
||||
| leaf1_ecx::SMX // Safer Mode Extensions
|
||||
| leaf1_ecx::EIST // Enhanced SpeedStep
|
||||
| leaf1_ecx::TM2 // Thermal Monitor 2
|
||||
| leaf1_ecx::PDCM); // Perfmon/Debug Capability
|
||||
|
||||
// Ensure hypervisor bit is set (tells kernel it's in a VM)
|
||||
entry.ecx |= leaf1_ecx::HYPERVISOR;
|
||||
|
||||
// EDX: Feature flags — mostly pass through but clear some
|
||||
entry.edx &= !(1 << 7 // MCE (Machine Check Exception) - handle via host
|
||||
| 1u32 << 14 // MCA (Machine Check Architecture)
|
||||
| 1u32 << 22); // ACPI thermal (not implemented)
|
||||
|
||||
// Enable HTT (Hyper-Threading Technology) bit when multiple vCPUs are present.
|
||||
// This tells the kernel that the system has multiple logical processors and
|
||||
// should parse APIC IDs and topology info. Without this, some kernels skip
|
||||
// AP startup entirely.
|
||||
if config.vcpu_count > 1 {
|
||||
entry.edx |= leaf1_edx::HTT;
|
||||
} else {
|
||||
entry.edx &= !leaf1_edx::HTT;
|
||||
}
|
||||
|
||||
tracing::debug!(
|
||||
"CPUID 0x1: EAX=0x{:08x} EBX=0x{:08x} ECX=0x{:08x} EDX=0x{:08x}",
|
||||
entry.eax,
|
||||
entry.ebx,
|
||||
entry.ecx,
|
||||
entry.edx
|
||||
);
|
||||
}
|
||||
|
||||
/// Filter leaf 0x4: Cache parameters
|
||||
fn filter_leaf_4(entry: &mut kvm_cpuid_entry2, config: &CpuidConfig) {
|
||||
// EAX bits 25:14 = max cores per package - 1
|
||||
// For single vCPU, set to 0
|
||||
let cache_type = entry.eax & 0x1F;
|
||||
if cache_type != 0 {
|
||||
// Clear max cores per package, set to vcpu_count - 1
|
||||
entry.eax = (entry.eax & !(0xFFF << 14)) | (((config.vcpu_count as u32).saturating_sub(1)) << 14);
|
||||
// EAX bits 31:26 = max addressable IDs for threads sharing cache - 1
|
||||
entry.eax = (entry.eax & !(0x3F << 26)) | (0 << 26);
|
||||
}
|
||||
}
|
||||
|
||||
/// Filter leaf 0x7: Structured Extended Feature Flags
|
||||
fn filter_leaf_7(entry: &mut kvm_cpuid_entry2) {
|
||||
// EBX: Strip TSX and other problematic features
|
||||
entry.ebx &= !(leaf7_ebx::HLE // TSX Hardware Lock Elision
|
||||
| leaf7_ebx::RTM // TSX Restricted Transactional Memory
|
||||
| leaf7_ebx::RDT_M // Resource Director Technology Monitoring
|
||||
| leaf7_ebx::RDT_A // Resource Director Technology Allocation
|
||||
| leaf7_ebx::MPX); // Memory Protection Extensions (deprecated)
|
||||
|
||||
// ECX: Filter
|
||||
entry.ecx &= !(leaf7_ecx::PKU // Protection Keys (requires CR4.PKE)
|
||||
| leaf7_ecx::OSPKE // OS Protection Keys Enable
|
||||
| leaf7_ecx::LA57); // 5-level paging (not needed for guests)
|
||||
|
||||
tracing::debug!(
|
||||
"CPUID 0x7: EBX=0x{:08x} ECX=0x{:08x} EDX=0x{:08x}",
|
||||
entry.ebx,
|
||||
entry.ecx,
|
||||
entry.edx
|
||||
);
|
||||
}
|
||||
|
||||
/// Filter leaf 0xB: Extended Topology Enumeration
|
||||
///
|
||||
/// This leaf reports the processor topology using x2APIC IDs.
|
||||
/// Linux uses this (if available) to determine how many logical processors
|
||||
/// exist and at what topology level (SMT vs Core vs Package).
|
||||
///
|
||||
/// Subleaf 0 = SMT level (threads per core)
|
||||
/// Subleaf 1 = Core level (cores per package)
|
||||
fn filter_leaf_0xb(entry: &mut kvm_cpuid_entry2, config: &CpuidConfig) {
|
||||
// Set x2APIC ID in EDX (always = the vCPU's APIC ID)
|
||||
entry.edx = config.vcpu_id as u32;
|
||||
|
||||
match entry.index {
|
||||
0 => {
|
||||
// Subleaf 0: SMT (thread) level
|
||||
// EAX[4:0] = number of bits to shift right on x2APIC ID to get core ID
|
||||
// For 1 thread per core = 0 (no SMT)
|
||||
entry.eax = 0;
|
||||
// EBX[15:0] = number of logical processors at this level
|
||||
// For no SMT, this is 1 (one thread per core)
|
||||
entry.ebx = 1;
|
||||
// ECX[7:0] = level number (0 for SMT)
|
||||
// ECX[15:8] = level type (1 = SMT)
|
||||
entry.ecx = (1 << 8) | 0; // SMT level type, level number 0
|
||||
}
|
||||
1 => {
|
||||
// Subleaf 1: Core level
|
||||
// EAX[4:0] = number of bits to shift right on x2APIC ID to get package ID
|
||||
// For N cores, need ceil(log2(N)) bits
|
||||
let shift = if config.vcpu_count <= 1 {
|
||||
0
|
||||
} else {
|
||||
(config.vcpu_count as u32).next_power_of_two().trailing_zeros()
|
||||
};
|
||||
entry.eax = shift;
|
||||
// EBX[15:0] = total number of logical processors at this level (all cores in package)
|
||||
entry.ebx = config.vcpu_count as u32;
|
||||
// ECX[7:0] = level number (1 for core)
|
||||
// ECX[15:8] = level type (2 = Core)
|
||||
entry.ecx = (2 << 8) | 1; // Core level type, level number 1
|
||||
}
|
||||
_ => {
|
||||
// Subleaf 2+: Invalid level (terminate enumeration)
|
||||
entry.eax = 0;
|
||||
entry.ebx = 0;
|
||||
entry.ecx = entry.index; // level number only, type = 0 (invalid)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Filter extended leaf 0x80000001: Extended Processor Info
|
||||
///
|
||||
/// This is CRITICAL for Linux boot. The kernel checks this leaf for:
|
||||
/// - SYSCALL support (EDX bit 11) — needed before WRMSR to EFER.SCE
|
||||
/// - NX/XD bit support (EDX bit 20) — needed for NX page table entries
|
||||
/// - Long Mode (EDX bit 29) — needed for 64-bit operation
|
||||
fn filter_ext_leaf_1(entry: &mut kvm_cpuid_entry2) {
|
||||
// CRITICAL: Ensure SYSCALL, NX, and Long Mode are advertised
|
||||
// These MUST be set or the kernel will #GP when trying to enable them via WRMSR
|
||||
entry.edx |= ext_leaf1_edx::SYSCALL; // SYSCALL/SYSRET
|
||||
entry.edx |= ext_leaf1_edx::NX; // No-Execute bit
|
||||
entry.edx |= ext_leaf1_edx::LM; // Long Mode (64-bit)
|
||||
|
||||
// Keep RDTSCP and 1GB pages if host supports them
|
||||
// (they're already in the host-supported set)
|
||||
|
||||
tracing::debug!(
|
||||
"CPUID 0x80000001: ECX=0x{:08x} EDX=0x{:08x} (SYSCALL={}, NX={}, LM={})",
|
||||
entry.ecx,
|
||||
entry.edx,
|
||||
(entry.edx & ext_leaf1_edx::SYSCALL) != 0,
|
||||
(entry.edx & ext_leaf1_edx::NX) != 0,
|
||||
(entry.edx & ext_leaf1_edx::LM) != 0,
|
||||
);
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_ext_leaf1_bits() {
|
||||
// Verify SYSCALL is bit 11
|
||||
assert_eq!(ext_leaf1_edx::SYSCALL, 1 << 11);
|
||||
// Verify NX is bit 20
|
||||
assert_eq!(ext_leaf1_edx::NX, 1 << 20);
|
||||
// Verify LM is bit 29
|
||||
assert_eq!(ext_leaf1_edx::LM, 1 << 29);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_hypervisor_bit() {
|
||||
assert_eq!(leaf1_ecx::HYPERVISOR, 1 << 31);
|
||||
}
|
||||
}
|
||||
424
vmm/src/kvm/memory.rs
Normal file
424
vmm/src/kvm/memory.rs
Normal file
@@ -0,0 +1,424 @@
|
||||
//! Guest Memory Management
|
||||
//!
|
||||
//! High-performance memory mapping with huge pages (2MB) support.
|
||||
//! Uses vm-memory crate for safe guest memory access.
|
||||
|
||||
use crate::kvm::x86_64;
|
||||
use nix::sys::mman::{mmap_anonymous, munmap, MapFlags, ProtFlags};
|
||||
use std::num::NonZeroUsize;
|
||||
use std::ptr::NonNull;
|
||||
use thiserror::Error;
|
||||
|
||||
/// Memory errors
|
||||
#[derive(Error, Debug)]
|
||||
#[allow(dead_code)]
|
||||
pub enum MemoryError {
|
||||
#[error("Failed to map memory: {0}")]
|
||||
Mmap(#[source] nix::Error),
|
||||
|
||||
#[error("Failed to unmap memory: {0}")]
|
||||
Munmap(#[source] nix::Error),
|
||||
|
||||
#[error("Memory size must be aligned to page size")]
|
||||
UnalignedSize,
|
||||
|
||||
#[error("Invalid memory region: 0x{0:x}")]
|
||||
InvalidRegion(u64),
|
||||
|
||||
#[error("Guest address out of bounds: 0x{0:x}")]
|
||||
OutOfBounds(u64),
|
||||
|
||||
#[error("Failed to allocate huge pages")]
|
||||
HugePageAlloc,
|
||||
|
||||
#[error("Memory access error at 0x{0:x}")]
|
||||
AccessError(u64),
|
||||
}
|
||||
|
||||
/// Page sizes
|
||||
pub const PAGE_SIZE_4K: usize = 4096;
|
||||
pub const PAGE_SIZE_2M: usize = 2 * 1024 * 1024;
|
||||
|
||||
/// Huge page configuration
|
||||
#[derive(Debug, Clone)]
|
||||
#[allow(dead_code)]
|
||||
pub struct HugePageConfig {
|
||||
/// Enable huge pages
|
||||
pub enabled: bool,
|
||||
/// Size (default 2MB)
|
||||
pub size: usize,
|
||||
/// Pre-fault pages on allocation
|
||||
pub prefault: bool,
|
||||
}
|
||||
|
||||
impl Default for HugePageConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
enabled: true,
|
||||
size: PAGE_SIZE_2M,
|
||||
prefault: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Memory configuration
|
||||
#[derive(Debug, Clone)]
|
||||
#[allow(dead_code)]
|
||||
pub struct MemoryConfig {
|
||||
/// Total guest memory size
|
||||
pub size: u64,
|
||||
/// Huge page configuration
|
||||
pub huge_pages: HugePageConfig,
|
||||
/// Base guest physical address
|
||||
pub base_addr: u64,
|
||||
/// Enable NUMA-aware allocation
|
||||
pub numa_node: Option<u32>,
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
impl MemoryConfig {
|
||||
pub fn new(size: u64) -> Self {
|
||||
Self {
|
||||
size,
|
||||
huge_pages: HugePageConfig::default(),
|
||||
base_addr: x86_64::RAM_START,
|
||||
numa_node: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn with_huge_pages(mut self, enabled: bool) -> Self {
|
||||
self.huge_pages.enabled = enabled;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn with_prefault(mut self, prefault: bool) -> Self {
|
||||
self.huge_pages.prefault = prefault;
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
/// A single guest memory region
|
||||
#[derive(Debug)]
|
||||
pub struct GuestRegion {
|
||||
/// Guest physical address
|
||||
pub guest_addr: u64,
|
||||
/// Size in bytes
|
||||
pub size: u64,
|
||||
/// Host virtual address
|
||||
pub host_addr: *mut u8,
|
||||
/// Whether huge pages are used
|
||||
pub is_huge: bool,
|
||||
}
|
||||
|
||||
// SAFETY: GuestRegion contains raw pointers but they point to
|
||||
// mmapped memory that is managed by GuestMemoryManager's lifetime
|
||||
unsafe impl Send for GuestRegion {}
|
||||
unsafe impl Sync for GuestRegion {}
|
||||
|
||||
/// Guest memory manager
|
||||
pub struct GuestMemoryManager {
|
||||
/// Memory regions
|
||||
regions: Vec<GuestRegion>,
|
||||
/// Total size
|
||||
total_size: u64,
|
||||
/// Configuration
|
||||
config: MemoryConfig,
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
impl GuestMemoryManager {
|
||||
/// Create a new guest memory manager
|
||||
pub fn new(config: MemoryConfig) -> Result<Self, MemoryError> {
|
||||
let mut manager = Self {
|
||||
regions: Vec::new(),
|
||||
total_size: 0,
|
||||
config: config.clone(),
|
||||
};
|
||||
|
||||
// Create main memory region
|
||||
manager.create_main_region(config.size)?;
|
||||
|
||||
Ok(manager)
|
||||
}
|
||||
|
||||
/// Create the main memory region (handles MMIO hole)
|
||||
fn create_main_region(&mut self, size: u64) -> Result<(), MemoryError> {
|
||||
let mmio_start = x86_64::MMIO_GAP_START;
|
||||
let _mmio_end = x86_64::MMIO_GAP_END;
|
||||
|
||||
if size <= mmio_start {
|
||||
// Memory fits below MMIO hole
|
||||
self.allocate_region(x86_64::RAM_START, size)?;
|
||||
} else {
|
||||
// Need to split around MMIO hole
|
||||
// Region 1: Below MMIO gap
|
||||
self.allocate_region(x86_64::RAM_START, mmio_start)?;
|
||||
|
||||
// Region 2: Above MMIO gap (high memory)
|
||||
let high_size = size - mmio_start;
|
||||
self.allocate_region(x86_64::HIGH_RAM_START, high_size)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Allocate a memory region
|
||||
fn allocate_region(&mut self, guest_addr: u64, size: u64) -> Result<(), MemoryError> {
|
||||
let page_size = if self.config.huge_pages.enabled {
|
||||
PAGE_SIZE_2M
|
||||
} else {
|
||||
PAGE_SIZE_4K
|
||||
};
|
||||
|
||||
// Align size to page boundary
|
||||
let aligned_size = (size as usize + page_size - 1) & !(page_size - 1);
|
||||
|
||||
let host_addr = self.mmap_region(aligned_size)?;
|
||||
|
||||
let region = GuestRegion {
|
||||
guest_addr,
|
||||
size: aligned_size as u64,
|
||||
host_addr,
|
||||
is_huge: self.config.huge_pages.enabled,
|
||||
};
|
||||
|
||||
tracing::debug!(
|
||||
"Allocated memory region: guest=0x{:x}, size={} MB, huge={}",
|
||||
guest_addr,
|
||||
aligned_size / (1024 * 1024),
|
||||
region.is_huge
|
||||
);
|
||||
|
||||
self.total_size += aligned_size as u64;
|
||||
self.regions.push(region);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Map memory using mmap with optional huge pages
|
||||
///
|
||||
/// Performance notes:
|
||||
/// - MAP_POPULATE pre-faults pages which is expensive for 4K pages (~33ms for 128MB)
|
||||
/// but beneficial for huge pages (reduces TLB misses during guest execution).
|
||||
/// - MAP_NORESERVE defers physical allocation to first access, which is handled
|
||||
/// by the kernel's demand paging. Guest memory pages are faulted in as needed.
|
||||
/// - For regular (non-huge) pages, we skip MAP_POPULATE entirely — the kernel
|
||||
/// will demand-page the memory as the guest accesses it, spreading the cost
|
||||
/// over the VM's lifetime instead of paying it all at startup.
|
||||
fn mmap_region(&self, size: usize) -> Result<*mut u8, MemoryError> {
|
||||
let mut flags = MapFlags::MAP_PRIVATE | MapFlags::MAP_ANONYMOUS | MapFlags::MAP_NORESERVE;
|
||||
|
||||
if self.config.huge_pages.enabled {
|
||||
flags |= MapFlags::MAP_HUGETLB;
|
||||
// For huge pages, pre-faulting is worthwhile: fewer TLB misses
|
||||
// and huge page allocation can fail if deferred.
|
||||
if self.config.huge_pages.prefault {
|
||||
flags |= MapFlags::MAP_POPULATE;
|
||||
}
|
||||
}
|
||||
// For regular 4K pages: skip MAP_POPULATE — demand-paging is faster
|
||||
// at startup and the kernel zeroes pages on first fault anyway.
|
||||
|
||||
let prot = ProtFlags::PROT_READ | ProtFlags::PROT_WRITE;
|
||||
|
||||
let addr = unsafe {
|
||||
mmap_anonymous(
|
||||
None,
|
||||
NonZeroUsize::new(size).ok_or(MemoryError::UnalignedSize)?,
|
||||
prot,
|
||||
flags,
|
||||
)
|
||||
.map_err(|e| {
|
||||
// If huge pages fail, fall back to regular pages
|
||||
if self.config.huge_pages.enabled {
|
||||
tracing::warn!("Huge page allocation failed, falling back to 4K pages");
|
||||
}
|
||||
MemoryError::Mmap(e)
|
||||
})?
|
||||
};
|
||||
|
||||
Ok(addr.as_ptr() as *mut u8)
|
||||
}
|
||||
|
||||
/// Translate guest physical address to host virtual address
|
||||
pub fn translate(&self, guest_addr: u64) -> Option<*mut u8> {
|
||||
for region in &self.regions {
|
||||
if guest_addr >= region.guest_addr
|
||||
&& guest_addr < region.guest_addr + region.size
|
||||
{
|
||||
let offset = guest_addr - region.guest_addr;
|
||||
return Some(unsafe { region.host_addr.add(offset as usize) });
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Translate guest physical address with bounds check for the full access range.
|
||||
/// Validates that [guest_addr, guest_addr + len) falls entirely within one region.
|
||||
fn translate_checked(&self, guest_addr: u64, len: usize) -> Option<*mut u8> {
|
||||
if len == 0 {
|
||||
return self.translate(guest_addr);
|
||||
}
|
||||
let end_addr = guest_addr.checked_add(len as u64)?;
|
||||
for region in &self.regions {
|
||||
if guest_addr >= region.guest_addr
|
||||
&& end_addr <= region.guest_addr + region.size
|
||||
{
|
||||
let offset = guest_addr - region.guest_addr;
|
||||
return Some(unsafe { region.host_addr.add(offset as usize) });
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Read from guest memory
|
||||
pub fn read(&self, guest_addr: u64, buf: &mut [u8]) -> Result<(), MemoryError> {
|
||||
let host_addr = self
|
||||
.translate_checked(guest_addr, buf.len())
|
||||
.ok_or(MemoryError::OutOfBounds(guest_addr))?;
|
||||
|
||||
unsafe {
|
||||
std::ptr::copy_nonoverlapping(host_addr, buf.as_mut_ptr(), buf.len());
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Write to guest memory
|
||||
pub fn write(&self, guest_addr: u64, buf: &[u8]) -> Result<(), MemoryError> {
|
||||
let host_addr = self
|
||||
.translate_checked(guest_addr, buf.len())
|
||||
.ok_or(MemoryError::OutOfBounds(guest_addr))?;
|
||||
|
||||
unsafe {
|
||||
std::ptr::copy_nonoverlapping(buf.as_ptr(), host_addr, buf.len());
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Write a value to guest memory
|
||||
pub fn write_obj<T: Copy>(&self, guest_addr: u64, val: &T) -> Result<(), MemoryError> {
|
||||
let host_addr = self
|
||||
.translate_checked(guest_addr, std::mem::size_of::<T>())
|
||||
.ok_or(MemoryError::OutOfBounds(guest_addr))?;
|
||||
|
||||
unsafe {
|
||||
std::ptr::write(host_addr as *mut T, *val);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Read a value from guest memory
|
||||
pub fn read_obj<T: Copy + Default>(&self, guest_addr: u64) -> Result<T, MemoryError> {
|
||||
let host_addr = self
|
||||
.translate_checked(guest_addr, std::mem::size_of::<T>())
|
||||
.ok_or(MemoryError::OutOfBounds(guest_addr))?;
|
||||
|
||||
unsafe { Ok(std::ptr::read(host_addr as *const T)) }
|
||||
}
|
||||
|
||||
/// Get slice of guest memory
|
||||
pub fn get_slice(&self, guest_addr: u64, len: usize) -> Result<&[u8], MemoryError> {
|
||||
let host_addr = self
|
||||
.translate_checked(guest_addr, len)
|
||||
.ok_or(MemoryError::OutOfBounds(guest_addr))?;
|
||||
|
||||
unsafe { Ok(std::slice::from_raw_parts(host_addr, len)) }
|
||||
}
|
||||
|
||||
/// Get mutable slice of guest memory
|
||||
pub fn get_slice_mut(&self, guest_addr: u64, len: usize) -> Result<&mut [u8], MemoryError> {
|
||||
let host_addr = self
|
||||
.translate_checked(guest_addr, len)
|
||||
.ok_or(MemoryError::OutOfBounds(guest_addr))?;
|
||||
|
||||
unsafe { Ok(std::slice::from_raw_parts_mut(host_addr, len)) }
|
||||
}
|
||||
|
||||
/// Get memory regions
|
||||
pub fn regions(&self) -> &[GuestRegion] {
|
||||
&self.regions
|
||||
}
|
||||
|
||||
/// Get total memory size
|
||||
pub fn total_size(&self) -> u64 {
|
||||
self.total_size
|
||||
}
|
||||
|
||||
/// Zero out a memory range
|
||||
pub fn zero_range(&self, guest_addr: u64, len: usize) -> Result<(), MemoryError> {
|
||||
let host_addr = self
|
||||
.translate_checked(guest_addr, len)
|
||||
.ok_or(MemoryError::OutOfBounds(guest_addr))?;
|
||||
|
||||
unsafe {
|
||||
std::ptr::write_bytes(host_addr, 0, len);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Load data from a slice into guest memory
|
||||
pub fn load_from_slice(&self, guest_addr: u64, data: &[u8]) -> Result<(), MemoryError> {
|
||||
self.write(guest_addr, data)
|
||||
}
|
||||
|
||||
/// Check if huge pages are being used
|
||||
pub fn is_using_huge_pages(&self) -> bool {
|
||||
self.regions.iter().any(|r| r.is_huge)
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for GuestMemoryManager {
|
||||
fn drop(&mut self) {
|
||||
for region in &self.regions {
|
||||
unsafe {
|
||||
if let Err(e) = munmap(
|
||||
NonNull::new(region.host_addr as *mut _).unwrap(),
|
||||
region.size as usize,
|
||||
) {
|
||||
tracing::error!("Failed to unmap region: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// SAFETY: GuestMemoryManager manages its memory mappings safely
|
||||
// and provides synchronized access through the API
|
||||
unsafe impl Send for GuestMemoryManager {}
|
||||
unsafe impl Sync for GuestMemoryManager {}
|
||||
|
||||
/// Helper to check if huge pages are available
|
||||
#[allow(dead_code)]
|
||||
pub fn huge_pages_available() -> bool {
|
||||
std::path::Path::new("/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages").exists()
|
||||
}
|
||||
|
||||
/// Get number of free huge pages
|
||||
#[allow(dead_code)]
|
||||
pub fn free_huge_pages() -> Option<u64> {
|
||||
std::fs::read_to_string("/sys/kernel/mm/hugepages/hugepages-2048kB/free_hugepages")
|
||||
.ok()
|
||||
.and_then(|s| s.trim().parse().ok())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_memory_config() {
|
||||
let config = MemoryConfig::new(256 * 1024 * 1024);
|
||||
assert_eq!(config.size, 256 * 1024 * 1024);
|
||||
assert!(config.huge_pages.enabled);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_page_sizes() {
|
||||
assert_eq!(PAGE_SIZE_4K, 4096);
|
||||
assert_eq!(PAGE_SIZE_2M, 2 * 1024 * 1024);
|
||||
}
|
||||
}
|
||||
205
vmm/src/kvm/mod.rs
Normal file
205
vmm/src/kvm/mod.rs
Normal file
@@ -0,0 +1,205 @@
|
||||
//! Volt KVM Interface Layer
|
||||
//!
|
||||
//! High-performance KVM bindings optimized for <125ms boot times.
|
||||
//! Uses rust-vmm crates for battle-tested, production-ready code.
|
||||
|
||||
pub mod cpuid;
|
||||
pub mod memory;
|
||||
pub mod vcpu;
|
||||
pub mod vm;
|
||||
|
||||
#[allow(unused_imports)]
|
||||
pub use memory::{GuestMemoryManager, MemoryConfig, MemoryError};
|
||||
#[allow(unused_imports)]
|
||||
pub use vcpu::{MmioHandler, VcpuConfig, VcpuError, VcpuExitReason, VcpuHandle};
|
||||
#[allow(unused_imports)]
|
||||
pub use vm::{Vm, VmConfig, VmState};
|
||||
|
||||
use kvm_ioctls::{Cap, Kvm};
|
||||
use thiserror::Error;
|
||||
use tracing::debug;
|
||||
|
||||
/// KVM-related errors
|
||||
#[derive(Error, Debug)]
|
||||
#[allow(dead_code)]
|
||||
pub enum KvmError {
|
||||
#[error("Failed to open /dev/kvm: {0}")]
|
||||
OpenKvm(#[source] kvm_ioctls::Error),
|
||||
|
||||
#[error("KVM API version mismatch: expected {expected}, got {actual}")]
|
||||
ApiVersionMismatch { expected: i32, actual: i32 },
|
||||
|
||||
#[error("Required KVM extension not supported: {0}")]
|
||||
ExtensionNotSupported(&'static str),
|
||||
|
||||
#[error("Failed to create VM: {0}")]
|
||||
CreateVm(#[source] kvm_ioctls::Error),
|
||||
|
||||
#[error("Failed to create vCPU: {0}")]
|
||||
CreateVcpu(#[source] kvm_ioctls::Error),
|
||||
|
||||
#[error("Failed to set memory region: {0}")]
|
||||
SetMemoryRegion(#[source] kvm_ioctls::Error),
|
||||
|
||||
#[error("Failed to set registers: {0}")]
|
||||
SetRegisters(#[source] kvm_ioctls::Error),
|
||||
|
||||
#[error("Failed to get registers: {0}")]
|
||||
GetRegisters(#[source] kvm_ioctls::Error),
|
||||
|
||||
#[error("Failed to run vCPU: {0}")]
|
||||
VcpuRun(#[source] kvm_ioctls::Error),
|
||||
|
||||
#[error("Memory error: {0}")]
|
||||
Memory(#[from] MemoryError),
|
||||
|
||||
#[error("vCPU error: {0}")]
|
||||
Vcpu(#[from] VcpuError),
|
||||
|
||||
#[error("IRQ chip error: {0}")]
|
||||
IrqChip(#[source] kvm_ioctls::Error),
|
||||
|
||||
#[error("PIT error: {0}")]
|
||||
Pit(#[source] kvm_ioctls::Error),
|
||||
}
|
||||
|
||||
pub type Result<T> = std::result::Result<T, KvmError>;
|
||||
|
||||
/// KVM system handle - singleton for /dev/kvm access
|
||||
#[allow(dead_code)]
|
||||
pub struct KvmSystem {
|
||||
kvm: Kvm,
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
impl KvmSystem {
|
||||
/// Open KVM and verify capabilities
|
||||
pub fn new() -> Result<Self> {
|
||||
let kvm = Kvm::new().map_err(KvmError::OpenKvm)?;
|
||||
|
||||
// Verify API version (must be 12 for modern KVM)
|
||||
let api_version = kvm.get_api_version();
|
||||
if api_version != 12 {
|
||||
return Err(KvmError::ApiVersionMismatch {
|
||||
expected: 12,
|
||||
actual: api_version,
|
||||
});
|
||||
}
|
||||
|
||||
// Check required extensions for fast boot
|
||||
Self::check_required_extensions(&kvm)?;
|
||||
|
||||
debug!(
|
||||
api_version,
|
||||
max_vcpus = kvm.get_max_vcpus(),
|
||||
"KVM initialized"
|
||||
);
|
||||
|
||||
Ok(Self { kvm })
|
||||
}
|
||||
|
||||
/// Verify required KVM extensions are available
|
||||
fn check_required_extensions(kvm: &Kvm) -> Result<()> {
|
||||
let required = [
|
||||
(Cap::Irqchip, "IRQCHIP"),
|
||||
(Cap::UserMemory, "USER_MEMORY"),
|
||||
(Cap::SetTssAddr, "SET_TSS_ADDR"),
|
||||
(Cap::Pit2, "PIT2"),
|
||||
(Cap::ImmediateExit, "IMMEDIATE_EXIT"),
|
||||
];
|
||||
|
||||
for (cap, name) in required {
|
||||
if !kvm.check_extension(cap) {
|
||||
return Err(KvmError::ExtensionNotSupported(name));
|
||||
}
|
||||
debug!(capability = name, "KVM extension available");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Create a new VM
|
||||
pub fn create_vm(&self, config: VmConfig) -> Result<Vm> {
|
||||
Vm::new(&self.kvm, config)
|
||||
}
|
||||
|
||||
/// Get maximum supported vCPUs
|
||||
pub fn max_vcpus(&self) -> usize {
|
||||
self.kvm.get_max_vcpus()
|
||||
}
|
||||
|
||||
/// Check if a specific capability is supported
|
||||
pub fn check_cap(&self, cap: Cap) -> bool {
|
||||
self.kvm.check_extension(cap)
|
||||
}
|
||||
|
||||
/// Get raw KVM handle for advanced operations
|
||||
pub fn kvm(&self) -> &Kvm {
|
||||
&self.kvm
|
||||
}
|
||||
}
|
||||
|
||||
// Constants for x86_64 memory layout (optimized for fast boot)
|
||||
#[allow(dead_code)]
|
||||
pub mod x86_64 {
|
||||
/// Start of RAM
|
||||
pub const RAM_START: u64 = 0;
|
||||
|
||||
/// 64-bit kernel load address (standard Linux)
|
||||
pub const KERNEL_START: u64 = 0x100_0000; // 16 MB
|
||||
|
||||
/// Initrd load address
|
||||
pub const INITRD_START: u64 = 0x800_0000; // 128 MB
|
||||
|
||||
/// Command line address
|
||||
pub const CMDLINE_START: u64 = 0x2_0000; // 128 KB
|
||||
|
||||
/// Boot params (zero page) address
|
||||
pub const BOOT_PARAMS_START: u64 = 0x7000;
|
||||
|
||||
/// TSS address for KVM
|
||||
pub const TSS_ADDR: u64 = 0xFFFB_D000;
|
||||
|
||||
/// Identity map address
|
||||
pub const IDENTITY_MAP_ADDR: u64 = 0xFFFB_C000;
|
||||
|
||||
/// PCI MMIO hole start (below 4GB)
|
||||
pub const MMIO_GAP_START: u64 = 0xC000_0000; // 3 GB
|
||||
|
||||
/// PCI MMIO hole end
|
||||
pub const MMIO_GAP_END: u64 = 0x1_0000_0000; // 4 GB
|
||||
|
||||
/// High memory start (above 4GB)
|
||||
pub const HIGH_RAM_START: u64 = 0x1_0000_0000;
|
||||
|
||||
/// GDT entries for 64-bit mode
|
||||
pub const GDT_KERNEL_CODE: u16 = 0x10;
|
||||
pub const GDT_KERNEL_DATA: u16 = 0x18;
|
||||
}
|
||||
|
||||
/// Legacy compatibility: KvmContext alias
|
||||
#[allow(dead_code)]
|
||||
pub type KvmContext = KvmSystem;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_kvm_init() {
|
||||
// Skip if no KVM access
|
||||
if !std::path::Path::new("/dev/kvm").exists() {
|
||||
eprintln!("Skipping: /dev/kvm not available");
|
||||
return;
|
||||
}
|
||||
|
||||
let kvm = KvmSystem::new().expect("Failed to init KVM");
|
||||
assert!(kvm.max_vcpus() > 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_x86_64_constants() {
|
||||
assert!(x86_64::KERNEL_START > x86_64::RAM_START);
|
||||
assert!(x86_64::MMIO_GAP_END > x86_64::MMIO_GAP_START);
|
||||
}
|
||||
}
|
||||
833
vmm/src/kvm/vcpu.rs
Normal file
833
vmm/src/kvm/vcpu.rs
Normal file
@@ -0,0 +1,833 @@
|
||||
//! vCPU Management
|
||||
//!
|
||||
//! Handles vCPU lifecycle, register setup, and the KVM_RUN loop.
|
||||
//! Optimized for minimal exit handling overhead.
|
||||
|
||||
use crate::kvm::{memory::GuestMemoryManager, KvmError, Result};
|
||||
use crossbeam_channel::{bounded, Receiver, Sender};
|
||||
use kvm_bindings::{kvm_msr_entry, kvm_regs, kvm_segment, kvm_sregs, Msrs};
|
||||
use kvm_ioctls::VcpuFd;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::thread::{self, JoinHandle};
|
||||
use thiserror::Error;
|
||||
|
||||
/// i8042 PS/2 controller IO port addresses
|
||||
const I8042_DATA_PORT: u16 = 0x60;
|
||||
const I8042_CMD_PORT: u16 = 0x64;
|
||||
|
||||
/// Minimal i8042 PS/2 keyboard controller state
|
||||
///
|
||||
/// The Linux kernel probes for an i8042 during boot. Without one, the probe
|
||||
/// times out after ~1 second. This minimal implementation responds to probes
|
||||
/// just enough to avoid the timeout.
|
||||
///
|
||||
/// Linux i8042 probe sequence:
|
||||
/// 1. Write 0xAA to port 0x64 (self-test) → read 0x55 from port 0x60
|
||||
/// 2. Write 0x20 to port 0x64 (read CTR) → read CTR from port 0x60
|
||||
/// 3. Write 0x60 to port 0x64 (write CTR) → write new CTR to port 0x60
|
||||
/// 4. Write 0xAB to port 0x64 (test port 1) → read 0x00 from port 0x60
|
||||
/// 5. Various enable/disable commands
|
||||
struct I8042State {
|
||||
/// Output buffer for queued response bytes
|
||||
output: std::collections::VecDeque<u8>,
|
||||
/// Command byte / Controller Configuration Register (CTR)
|
||||
/// Default 0x47: keyboard interrupt enabled, system flag, keyboard enabled, translation
|
||||
cmd_byte: u8,
|
||||
/// Whether the next write to port 0x60 is a data byte for a pending command
|
||||
expecting_data: bool,
|
||||
/// The pending command that expects a data byte on port 0x60
|
||||
pending_cmd: u8,
|
||||
/// Whether a system reset was requested via 0xFE command
|
||||
reset_requested: bool,
|
||||
}
|
||||
|
||||
impl I8042State {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
output: std::collections::VecDeque::with_capacity(4),
|
||||
cmd_byte: 0x47, // Keyboard IRQ enabled, system flag, keyboard enabled, translation
|
||||
expecting_data: false,
|
||||
pending_cmd: 0,
|
||||
reset_requested: false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Read from data port (0x60) — clears OBF
|
||||
fn read_data(&mut self) -> u8 {
|
||||
self.output.pop_front().unwrap_or(0x00)
|
||||
}
|
||||
|
||||
/// Read from status port (0x64) — OBF bit indicates data available
|
||||
fn read_status(&self) -> u8 {
|
||||
let mut status: u8 = 0;
|
||||
if !self.output.is_empty() {
|
||||
status |= 0x01; // OBF — output buffer full
|
||||
}
|
||||
status
|
||||
}
|
||||
|
||||
/// Write to data port (0x60) — handles pending command data bytes
|
||||
fn write_data(&mut self, value: u8) {
|
||||
if self.expecting_data {
|
||||
self.expecting_data = false;
|
||||
match self.pending_cmd {
|
||||
0x60 => {
|
||||
// Write command byte (CTR)
|
||||
self.cmd_byte = value;
|
||||
}
|
||||
0xD4 => {
|
||||
// Write to aux device — eat the byte (no mouse emulated)
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
self.pending_cmd = 0;
|
||||
}
|
||||
// Otherwise accept and ignore (keyboard data writes)
|
||||
}
|
||||
|
||||
/// Write to command port (0x64)
|
||||
fn write_command(&mut self, cmd: u8) {
|
||||
match cmd {
|
||||
0x20 => self.output.push_back(self.cmd_byte), // Read command byte (CTR)
|
||||
0x60 => { // Write command byte — next data byte is the value
|
||||
self.expecting_data = true;
|
||||
self.pending_cmd = 0x60;
|
||||
}
|
||||
0xA7 => { // Disable aux port
|
||||
self.cmd_byte |= 0x20; // Set bit 5 (aux disabled)
|
||||
}
|
||||
0xA8 => { // Enable aux port
|
||||
self.cmd_byte &= !0x20; // Clear bit 5
|
||||
}
|
||||
0xA9 => self.output.push_back(0x00), // Aux interface test: pass
|
||||
0xAA => { // Self-test
|
||||
self.output.push_back(0x55); // Test passed
|
||||
self.cmd_byte = 0x47; // Self-test resets CTR
|
||||
}
|
||||
0xAB => self.output.push_back(0x00), // Interface test: no error
|
||||
0xAD => { // Disable keyboard
|
||||
self.cmd_byte |= 0x10; // Set bit 4 (keyboard disabled)
|
||||
}
|
||||
0xAE => { // Enable keyboard
|
||||
self.cmd_byte &= !0x10; // Clear bit 4
|
||||
}
|
||||
0xD4 => { // Write to aux port (eat next byte)
|
||||
self.expecting_data = true;
|
||||
self.pending_cmd = 0xD4;
|
||||
}
|
||||
0xFE => self.reset_requested = true, // System reset
|
||||
_ => {} // Accept and ignore
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// vCPU-specific errors
|
||||
#[derive(Error, Debug)]
|
||||
#[allow(dead_code)]
|
||||
pub enum VcpuError {
|
||||
#[error("vCPU thread panicked")]
|
||||
ThreadPanic,
|
||||
|
||||
#[error("vCPU already running")]
|
||||
AlreadyRunning,
|
||||
|
||||
#[error("vCPU not started")]
|
||||
NotStarted,
|
||||
|
||||
#[error("Channel send error")]
|
||||
ChannelError,
|
||||
}
|
||||
|
||||
/// vCPU exit reasons
|
||||
#[derive(Debug, Clone)]
|
||||
#[allow(dead_code)]
|
||||
pub enum VcpuExitReason {
|
||||
/// IO port access
|
||||
Io {
|
||||
direction: IoDirection,
|
||||
port: u16,
|
||||
size: u8,
|
||||
data: u64,
|
||||
},
|
||||
/// MMIO access
|
||||
Mmio {
|
||||
address: u64,
|
||||
is_write: bool,
|
||||
size: u8,
|
||||
data: u64,
|
||||
},
|
||||
/// HLT instruction
|
||||
Halt,
|
||||
/// VM shutdown
|
||||
Shutdown,
|
||||
/// System event (S3/S4/reset)
|
||||
SystemEvent { event_type: u32 },
|
||||
/// Internal error
|
||||
InternalError { suberror: u32 },
|
||||
/// Unknown exit
|
||||
Unknown { reason: u32 },
|
||||
}
|
||||
|
||||
/// IO direction
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum IoDirection {
|
||||
In,
|
||||
Out,
|
||||
}
|
||||
|
||||
/// vCPU run state
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
#[allow(dead_code)]
|
||||
pub enum VcpuRunState {
|
||||
Created,
|
||||
Running,
|
||||
Paused,
|
||||
Stopped,
|
||||
}
|
||||
|
||||
/// Callback trait for handling MMIO and IO accesses from the guest
|
||||
pub trait MmioHandler: Send + Sync + 'static {
|
||||
/// Handle an MMIO read. Returns true if the address was handled.
|
||||
fn mmio_read(&self, addr: u64, data: &mut [u8]) -> bool;
|
||||
/// Handle an MMIO write. Returns true if the address was handled.
|
||||
fn mmio_write(&self, addr: u64, data: &[u8]) -> bool;
|
||||
/// Handle an IO port read. Returns true if the port was handled.
|
||||
fn io_read(&self, _port: u16, _data: &mut [u8]) -> bool { false }
|
||||
/// Handle an IO port write. Returns true if the port was handled.
|
||||
fn io_write(&self, _port: u16, _data: &[u8]) -> bool { false }
|
||||
}
|
||||
|
||||
/// vCPU configuration
|
||||
pub struct VcpuConfig {
|
||||
pub id: u8,
|
||||
pub memory: Arc<GuestMemoryManager>,
|
||||
/// Optional MMIO handler for virtio device dispatch
|
||||
pub mmio_handler: Option<Arc<dyn MmioHandler>>,
|
||||
}
|
||||
|
||||
/// Commands sent to vCPU thread
|
||||
#[allow(dead_code)]
|
||||
pub(crate) enum VcpuCommand {
|
||||
Run,
|
||||
Pause,
|
||||
Stop,
|
||||
SetRegisters(Box<kvm_regs>),
|
||||
}
|
||||
|
||||
/// vCPU handle for managing a vCPU thread
|
||||
#[allow(dead_code)]
|
||||
pub struct VcpuHandle {
|
||||
id: u8,
|
||||
fd: Arc<parking_lot::Mutex<VcpuFd>>,
|
||||
thread: Option<JoinHandle<()>>,
|
||||
command_tx: Sender<VcpuCommand>,
|
||||
exit_rx: Receiver<VcpuExitReason>,
|
||||
running: Arc<AtomicBool>,
|
||||
memory: Arc<GuestMemoryManager>,
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
impl VcpuHandle {
|
||||
/// Create a new vCPU handle and spawn the run loop thread
|
||||
pub fn new(fd: VcpuFd, config: VcpuConfig) -> Result<Self> {
|
||||
let (command_tx, command_rx) = bounded(16);
|
||||
let (exit_tx, exit_rx) = bounded(256);
|
||||
let running = Arc::new(AtomicBool::new(false));
|
||||
let fd = Arc::new(parking_lot::Mutex::new(fd));
|
||||
let id = config.id;
|
||||
let mmio_handler = config.mmio_handler;
|
||||
|
||||
// Spawn the vCPU run loop thread immediately
|
||||
let fd_clone = Arc::clone(&fd);
|
||||
let running_clone = Arc::clone(&running);
|
||||
|
||||
let thread = thread::Builder::new()
|
||||
.name(format!("vcpu-{}", id))
|
||||
.spawn(move || {
|
||||
Self::run_loop(fd_clone, running_clone, command_rx, exit_tx, id, mmio_handler);
|
||||
})
|
||||
.expect("Failed to spawn vCPU thread");
|
||||
|
||||
Ok(Self {
|
||||
id,
|
||||
fd,
|
||||
thread: Some(thread),
|
||||
command_tx,
|
||||
exit_rx,
|
||||
running,
|
||||
memory: config.memory,
|
||||
})
|
||||
}
|
||||
|
||||
/// Setup vCPU for 64-bit long mode boot
|
||||
pub fn setup_long_mode(&self, kernel_entry: u64, boot_params_addr: u64) -> Result<()> {
|
||||
self.setup_long_mode_with_cr3(kernel_entry, boot_params_addr, 0x1000)
|
||||
}
|
||||
|
||||
/// Setup vCPU for 64-bit long mode boot with explicit CR3
|
||||
pub fn setup_long_mode_with_cr3(&self, kernel_entry: u64, boot_params_addr: u64, cr3: u64) -> Result<()> {
|
||||
// Setup special registers for long mode
|
||||
let fd = self.fd.lock();
|
||||
let mut sregs = fd.get_sregs().map_err(KvmError::GetRegisters)?;
|
||||
|
||||
// Setup segments for 64-bit mode
|
||||
let code_seg = kvm_segment {
|
||||
base: 0,
|
||||
limit: 0xFFFF_FFFF,
|
||||
selector: 0x08, // GDT code segment (matches Firecracker)
|
||||
type_: 11, // Execute/Read, accessed
|
||||
present: 1,
|
||||
dpl: 0,
|
||||
db: 0, // 64-bit mode: D/B must be 0
|
||||
s: 1, // Code/data segment
|
||||
l: 1, // Long mode
|
||||
g: 1, // 4KB granularity
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let data_seg = kvm_segment {
|
||||
base: 0,
|
||||
limit: 0xFFFF_FFFF,
|
||||
selector: 0x10, // GDT data segment (matches Firecracker)
|
||||
type_: 3, // Read/Write, accessed
|
||||
present: 1,
|
||||
dpl: 0,
|
||||
db: 1, // 32-bit operands for data segment
|
||||
s: 1,
|
||||
l: 0,
|
||||
g: 1,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
sregs.cs = code_seg;
|
||||
sregs.ds = data_seg;
|
||||
sregs.es = data_seg;
|
||||
sregs.fs = data_seg;
|
||||
sregs.gs = data_seg;
|
||||
sregs.ss = data_seg;
|
||||
|
||||
// Enable long mode with correct CR0 flags
|
||||
// CR0 bits:
|
||||
// PE (bit 0) = 1 - Protection Enable (required)
|
||||
// MP (bit 1) = 1 - Monitor Coprocessor
|
||||
// ET (bit 4) = 1 - Extension Type (x87 FPU present)
|
||||
// NE (bit 5) = 1 - Numeric Error (use native FPU error reporting)
|
||||
// WP (bit 16) = 1 - Write Protect (protect read-only pages in ring 0)
|
||||
// PG (bit 31) = 1 - Paging Enable (required for long mode)
|
||||
// NOTE: Do NOT set TS (bit 3) or reserved bits!
|
||||
// Match Firecracker's minimal CR0: PG | ET | PE
|
||||
sregs.cr0 = 0x8000_0011;
|
||||
sregs.cr3 = cr3; // Page table address (PML4)
|
||||
|
||||
// CR4 bits:
|
||||
// PAE (bit 5) = 1 - Physical Address Extension (required for long mode)
|
||||
// PGE (bit 7) = 1 - Page Global Enable
|
||||
// OSFXSR (bit 9) = 1 - OS support for FXSAVE/FXRSTOR
|
||||
// OSXMMEXCPT (bit 10) = 1 - OS support for unmasked SIMD FP exceptions
|
||||
// Match Firecracker's minimal CR4: just PAE
|
||||
sregs.cr4 = 0x20;
|
||||
|
||||
// EFER (Extended Feature Enable Register):
|
||||
// LME (bit 8) = 1 - Long Mode Enable
|
||||
// LMA (bit 10) = 1 - Long Mode Active (set by KVM when PG is enabled with LME)
|
||||
// For KVM, we set both since we're loading the full register state directly
|
||||
sregs.efer = 0x500; // LMA | LME
|
||||
|
||||
// Setup GDT - must match the segment selectors above
|
||||
sregs.gdt.base = 0x500; // GDT_ADDR from gdt.rs
|
||||
sregs.gdt.limit = 0x2F; // 6 entries * 8 bytes - 1 = 47
|
||||
|
||||
// Setup IDT - kernel will set up its own ASAP
|
||||
// Set to invalid limit so ANY exception immediately causes triple fault
|
||||
// This is cleaner than cascading through broken exception handlers
|
||||
sregs.idt.base = 0;
|
||||
sregs.idt.limit = 0;
|
||||
|
||||
fd.set_sregs(&sregs).map_err(KvmError::SetRegisters)?;
|
||||
|
||||
// Setup general purpose registers
|
||||
// Note: Stack pointer placed at 0x1FFF0 to avoid page table area
|
||||
// (page tables can extend to 0xA000 for 4GB+ VMs)
|
||||
let regs = kvm_regs {
|
||||
rip: kernel_entry, // Entry point (startup_64)
|
||||
rsi: boot_params_addr, // Boot params pointer (Linux boot protocol)
|
||||
rflags: 0x2, // Reserved bit always set, interrupts disabled
|
||||
rsp: 0x1FFF0, // Stack pointer (safe area, stack grows down)
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
fd.set_regs(®s).map_err(KvmError::SetRegisters)?;
|
||||
|
||||
// Setup FPU state (required for modern kernels)
|
||||
// fcw = 0x37f: Default FPU control word (all exceptions masked, round to nearest, 64-bit precision)
|
||||
// mxcsr = 0x1f80: Default SSE control/status (all exceptions masked, round to nearest)
|
||||
let mut fpu: kvm_bindings::kvm_fpu = Default::default();
|
||||
fpu.fcw = 0x37f;
|
||||
fpu.mxcsr = 0x1f80;
|
||||
fd.set_fpu(&fpu).map_err(KvmError::SetRegisters)?;
|
||||
|
||||
// Setup boot MSRs (required for Linux boot protocol)
|
||||
Self::setup_boot_msrs(&fd)?;
|
||||
|
||||
// Debug: dump the full register state
|
||||
tracing::info!(
|
||||
"vCPU {} configured for 64-bit long mode:\n\
|
||||
Registers: RIP=0x{:016x}, RSP=0x{:016x}, RSI=0x{:016x}\n\
|
||||
Control: CR0=0x{:08x}, CR3=0x{:016x}, CR4=0x{:08x}, EFER=0x{:x}\n\
|
||||
Segments: CS=0x{:04x} (base=0x{:x}, limit=0x{:x}, l={}, db={})\n\
|
||||
DS=0x{:04x}, SS=0x{:04x}\n\
|
||||
Tables: GDT base=0x{:x} limit=0x{:x}",
|
||||
self.id,
|
||||
kernel_entry, 0x1FFF0u64, boot_params_addr,
|
||||
sregs.cr0, cr3, sregs.cr4, sregs.efer,
|
||||
sregs.cs.selector, sregs.cs.base, sregs.cs.limit, sregs.cs.l, sregs.cs.db,
|
||||
sregs.ds.selector, sregs.ss.selector,
|
||||
sregs.gdt.base, sregs.gdt.limit
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Setup Model Specific Registers required for Linux boot
|
||||
///
|
||||
/// These MSRs match Firecracker's boot MSR configuration and are required
|
||||
/// for the Linux kernel to initialize properly.
|
||||
fn setup_boot_msrs(fd: &VcpuFd) -> Result<()> {
|
||||
// MSR addresses (from Linux kernel msr-index.h)
|
||||
const MSR_IA32_SYSENTER_CS: u32 = 0x174;
|
||||
const MSR_IA32_SYSENTER_ESP: u32 = 0x175;
|
||||
const MSR_IA32_SYSENTER_EIP: u32 = 0x176;
|
||||
const MSR_IA32_MISC_ENABLE: u32 = 0x1a0;
|
||||
const MSR_IA32_MISC_ENABLE_FAST_STRING: u64 = 1;
|
||||
const MSR_STAR: u32 = 0xc0000081;
|
||||
const MSR_LSTAR: u32 = 0xc0000082;
|
||||
const MSR_CSTAR: u32 = 0xc0000083;
|
||||
const MSR_SYSCALL_MASK: u32 = 0xc0000084;
|
||||
const MSR_KERNEL_GS_BASE: u32 = 0xc0000102;
|
||||
const MSR_IA32_TSC: u32 = 0x10;
|
||||
const MSR_MTRR_DEF_TYPE: u32 = 0x2ff;
|
||||
|
||||
let msr_entries = vec![
|
||||
// SYSENTER MSRs (32-bit syscall ABI)
|
||||
kvm_msr_entry { index: MSR_IA32_SYSENTER_CS, data: 0, ..Default::default() },
|
||||
kvm_msr_entry { index: MSR_IA32_SYSENTER_ESP, data: 0, ..Default::default() },
|
||||
kvm_msr_entry { index: MSR_IA32_SYSENTER_EIP, data: 0, ..Default::default() },
|
||||
// SYSCALL/SYSRET MSRs (64-bit syscall ABI)
|
||||
kvm_msr_entry { index: MSR_STAR, data: 0, ..Default::default() },
|
||||
kvm_msr_entry { index: MSR_CSTAR, data: 0, ..Default::default() },
|
||||
kvm_msr_entry { index: MSR_KERNEL_GS_BASE, data: 0, ..Default::default() },
|
||||
kvm_msr_entry { index: MSR_SYSCALL_MASK, data: 0, ..Default::default() },
|
||||
kvm_msr_entry { index: MSR_LSTAR, data: 0, ..Default::default() },
|
||||
// TSC
|
||||
kvm_msr_entry { index: MSR_IA32_TSC, data: 0, ..Default::default() },
|
||||
// Enable fast string operations
|
||||
kvm_msr_entry {
|
||||
index: MSR_IA32_MISC_ENABLE,
|
||||
data: MSR_IA32_MISC_ENABLE_FAST_STRING,
|
||||
..Default::default()
|
||||
},
|
||||
// MTRR default type: write-back, MTRRs enabled
|
||||
// (1 << 11) = MTRR enable, 6 = write-back
|
||||
kvm_msr_entry {
|
||||
index: MSR_MTRR_DEF_TYPE,
|
||||
data: (1 << 11) | 6,
|
||||
..Default::default()
|
||||
},
|
||||
];
|
||||
|
||||
let msrs = Msrs::from_entries(&msr_entries)
|
||||
.map_err(|_| KvmError::SetRegisters(kvm_ioctls::Error::new(libc::ENOMEM)))?;
|
||||
|
||||
let written = fd.set_msrs(&msrs).map_err(KvmError::SetRegisters)?;
|
||||
if written != msr_entries.len() {
|
||||
tracing::warn!(
|
||||
"Only wrote {}/{} boot MSRs (some may not be supported on this host)",
|
||||
written,
|
||||
msr_entries.len()
|
||||
);
|
||||
} else {
|
||||
tracing::debug!("Set {} boot MSRs", written);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Start the vCPU thread
|
||||
pub fn start(&self) -> Result<()> {
|
||||
if self.running.load(Ordering::SeqCst) {
|
||||
return Err(VcpuError::AlreadyRunning.into());
|
||||
}
|
||||
|
||||
self.command_tx
|
||||
.send(VcpuCommand::Run)
|
||||
.map_err(|_| VcpuError::ChannelError)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Spawn the vCPU run loop thread
|
||||
pub(crate) fn spawn_thread(&mut self, command_rx: Receiver<VcpuCommand>, exit_tx: Sender<VcpuExitReason>) {
|
||||
let fd = Arc::clone(&self.fd);
|
||||
let running = Arc::clone(&self.running);
|
||||
let id = self.id;
|
||||
|
||||
let handle = thread::Builder::new()
|
||||
.name(format!("vcpu-{}", id))
|
||||
.spawn(move || {
|
||||
Self::run_loop(fd, running, command_rx, exit_tx, id, None);
|
||||
})
|
||||
.expect("Failed to spawn vCPU thread");
|
||||
|
||||
self.thread = Some(handle);
|
||||
}
|
||||
|
||||
/// The main vCPU run loop
|
||||
fn run_loop(
|
||||
fd: Arc<parking_lot::Mutex<VcpuFd>>,
|
||||
running: Arc<AtomicBool>,
|
||||
command_rx: Receiver<VcpuCommand>,
|
||||
exit_tx: Sender<VcpuExitReason>,
|
||||
id: u8,
|
||||
mmio_handler: Option<Arc<dyn MmioHandler>>,
|
||||
) {
|
||||
tracing::debug!("vCPU {} thread started", id);
|
||||
let mut i8042 = I8042State::new();
|
||||
|
||||
loop {
|
||||
// Check for commands
|
||||
match command_rx.try_recv() {
|
||||
Ok(VcpuCommand::Run) => {
|
||||
tracing::debug!("vCPU {} received Run command", id);
|
||||
running.store(true, Ordering::SeqCst);
|
||||
}
|
||||
Ok(VcpuCommand::Pause) => {
|
||||
running.store(false, Ordering::SeqCst);
|
||||
continue;
|
||||
}
|
||||
Ok(VcpuCommand::Stop) => {
|
||||
running.store(false, Ordering::SeqCst);
|
||||
tracing::debug!("vCPU {} stopping", id);
|
||||
return;
|
||||
}
|
||||
Ok(VcpuCommand::SetRegisters(regs)) => {
|
||||
if let Err(e) = fd.lock().set_regs(®s) {
|
||||
tracing::error!("vCPU {} failed to set registers: {}", id, e);
|
||||
}
|
||||
}
|
||||
Err(_) => {}
|
||||
}
|
||||
|
||||
if !running.load(Ordering::SeqCst) {
|
||||
// Yield when paused
|
||||
thread::yield_now();
|
||||
continue;
|
||||
}
|
||||
|
||||
// Run the vCPU
|
||||
tracing::trace!("vCPU {} entering KVM_RUN", id);
|
||||
let mut fd_guard = fd.lock();
|
||||
match fd_guard.run() {
|
||||
Ok(exit) => {
|
||||
// Log all exits at debug level for debugging boot issues
|
||||
match &exit {
|
||||
kvm_ioctls::VcpuExit::IoOut(port, data) => {
|
||||
// Serial IO (0x3F8) handled in handle_exit via io_write
|
||||
if *port != 0x3F8 {
|
||||
tracing::trace!("vCPU {} IO out: port=0x{:x}, data={:?}", id, port, data);
|
||||
}
|
||||
}
|
||||
kvm_ioctls::VcpuExit::Shutdown => {
|
||||
tracing::debug!("vCPU {} received Shutdown exit", id);
|
||||
}
|
||||
kvm_ioctls::VcpuExit::Hlt => {
|
||||
tracing::debug!("vCPU {} received HLT exit", id);
|
||||
}
|
||||
_ => {
|
||||
tracing::debug!("vCPU {} VM exit: {:?}", id, exit);
|
||||
}
|
||||
}
|
||||
let reason = Self::handle_exit(exit, &mut i8042, mmio_handler.as_deref());
|
||||
|
||||
// Check if i8042 requested a system reset
|
||||
if i8042.reset_requested {
|
||||
tracing::info!("vCPU {} i8042 reset requested, shutting down", id);
|
||||
running.store(false, Ordering::SeqCst);
|
||||
let _ = exit_tx.send(VcpuExitReason::Shutdown);
|
||||
return;
|
||||
}
|
||||
|
||||
// Check if we should stop
|
||||
match &reason {
|
||||
VcpuExitReason::Shutdown | VcpuExitReason::InternalError { .. } => {
|
||||
// Dump registers on shutdown to diagnose triple fault
|
||||
// Need to re-acquire lock since run() released it
|
||||
drop(fd_guard);
|
||||
let fd_guard = fd.lock();
|
||||
if let Ok(regs) = fd_guard.get_regs() {
|
||||
tracing::error!(
|
||||
"vCPU {} SHUTDOWN (triple fault?) at RIP=0x{:016x}\n\
|
||||
Registers: RAX=0x{:x} RBX=0x{:x} RCX=0x{:x} RDX=0x{:x}\n\
|
||||
RSI=0x{:x} RDI=0x{:x} RSP=0x{:x} RBP=0x{:x}\n\
|
||||
RFLAGS=0x{:x}",
|
||||
id, regs.rip,
|
||||
regs.rax, regs.rbx, regs.rcx, regs.rdx,
|
||||
regs.rsi, regs.rdi, regs.rsp, regs.rbp,
|
||||
regs.rflags
|
||||
);
|
||||
}
|
||||
if let Ok(sregs) = fd_guard.get_sregs() {
|
||||
tracing::error!(
|
||||
"Control: CR0=0x{:x} CR2=0x{:x} CR3=0x{:x} CR4=0x{:x} EFER=0x{:x}",
|
||||
sregs.cr0, sregs.cr2, sregs.cr3, sregs.cr4, sregs.efer
|
||||
);
|
||||
}
|
||||
running.store(false, Ordering::SeqCst);
|
||||
let _ = exit_tx.send(reason);
|
||||
return;
|
||||
}
|
||||
_ => {
|
||||
let _ = exit_tx.try_send(reason);
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
// Handle EINTR (signal interruption) - just retry
|
||||
if e.errno() == libc::EINTR {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Handle EAGAIN
|
||||
if e.errno() == libc::EAGAIN {
|
||||
thread::yield_now();
|
||||
continue;
|
||||
}
|
||||
|
||||
tracing::error!("vCPU {} run error: {}", id, e);
|
||||
running.store(false, Ordering::SeqCst);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Handle a vCPU exit and return the reason
|
||||
fn handle_exit(exit: kvm_ioctls::VcpuExit, i8042: &mut I8042State, mmio_handler: Option<&dyn MmioHandler>) -> VcpuExitReason {
|
||||
match exit {
|
||||
kvm_ioctls::VcpuExit::IoIn(port, data) => {
|
||||
// Try the external handler first (serial device)
|
||||
let handled = if let Some(handler) = mmio_handler {
|
||||
handler.io_read(port, data)
|
||||
} else {
|
||||
false
|
||||
};
|
||||
|
||||
if !handled {
|
||||
// Fallback: built-in handlers for i8042 and serial
|
||||
let value = if port >= 0x3F8 && port <= 0x3FF {
|
||||
let offset = port - 0x3F8;
|
||||
match offset {
|
||||
0 => 0,
|
||||
1 => 0,
|
||||
2 => 0x01,
|
||||
3 => 0,
|
||||
4 => 0,
|
||||
5 => 0x60, // THR_EMPTY | THR_TSR_EMPTY
|
||||
6 => 0x30,
|
||||
7 => 0,
|
||||
_ => 0,
|
||||
}
|
||||
} else if port == I8042_DATA_PORT {
|
||||
i8042.read_data()
|
||||
} else if port == I8042_CMD_PORT {
|
||||
i8042.read_status()
|
||||
} else {
|
||||
0xFF
|
||||
};
|
||||
|
||||
if !data.is_empty() {
|
||||
data[0] = value;
|
||||
for byte in data.iter_mut().skip(1) {
|
||||
*byte = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut value: u64 = 0;
|
||||
if !data.is_empty() {
|
||||
for (i, &byte) in data.iter().enumerate() {
|
||||
value |= (byte as u64) << (i * 8);
|
||||
}
|
||||
}
|
||||
|
||||
VcpuExitReason::Io {
|
||||
direction: IoDirection::In,
|
||||
port,
|
||||
size: data.len() as u8,
|
||||
data: value,
|
||||
}
|
||||
}
|
||||
kvm_ioctls::VcpuExit::IoOut(port, data) => {
|
||||
let mut value: u64 = 0;
|
||||
for (i, &byte) in data.iter().enumerate() {
|
||||
value |= (byte as u64) << (i * 8);
|
||||
}
|
||||
|
||||
// Try the external handler first (serial device writes to stdout)
|
||||
let handled = if let Some(handler) = mmio_handler {
|
||||
handler.io_write(port, data)
|
||||
} else {
|
||||
false
|
||||
};
|
||||
|
||||
if !handled {
|
||||
// Fallback: built-in handlers
|
||||
if port == 0x3F8 && !data.is_empty() {
|
||||
// Serial output — write directly to stdout
|
||||
use std::io::Write;
|
||||
let _ = std::io::stdout().write_all(data);
|
||||
let _ = std::io::stdout().flush();
|
||||
} else if port == I8042_DATA_PORT && !data.is_empty() {
|
||||
i8042.write_data(data[0]);
|
||||
} else if port == I8042_CMD_PORT && !data.is_empty() {
|
||||
i8042.write_command(data[0]);
|
||||
}
|
||||
}
|
||||
|
||||
VcpuExitReason::Io {
|
||||
direction: IoDirection::Out,
|
||||
port,
|
||||
size: data.len() as u8,
|
||||
data: value,
|
||||
}
|
||||
}
|
||||
kvm_ioctls::VcpuExit::MmioRead(addr, data) => {
|
||||
// Dispatch to MMIO device handler if available
|
||||
if let Some(handler) = mmio_handler {
|
||||
if handler.mmio_read(addr, data) {
|
||||
tracing::trace!("MMIO read handled: addr=0x{:x}, len={}", addr, data.len());
|
||||
} else {
|
||||
// Unhandled MMIO read — return all 0xFF (bus error simulation)
|
||||
data.fill(0xFF);
|
||||
tracing::trace!("MMIO read unhandled: addr=0x{:x}", addr);
|
||||
}
|
||||
} else {
|
||||
data.fill(0xFF);
|
||||
}
|
||||
let mut value: u64 = 0;
|
||||
for (i, &byte) in data.iter().enumerate() {
|
||||
value |= (byte as u64) << (i * 8);
|
||||
}
|
||||
VcpuExitReason::Mmio {
|
||||
address: addr,
|
||||
is_write: false,
|
||||
size: data.len() as u8,
|
||||
data: value,
|
||||
}
|
||||
},
|
||||
kvm_ioctls::VcpuExit::MmioWrite(addr, data) => {
|
||||
let mut value: u64 = 0;
|
||||
for (i, &byte) in data.iter().enumerate() {
|
||||
value |= (byte as u64) << (i * 8);
|
||||
}
|
||||
// Dispatch to MMIO device handler if available
|
||||
if let Some(handler) = mmio_handler {
|
||||
if handler.mmio_write(addr, data) {
|
||||
tracing::trace!("MMIO write handled: addr=0x{:x}, val=0x{:x}", addr, value);
|
||||
} else {
|
||||
tracing::trace!("MMIO write unhandled: addr=0x{:x}", addr);
|
||||
}
|
||||
}
|
||||
VcpuExitReason::Mmio {
|
||||
address: addr,
|
||||
is_write: true,
|
||||
size: data.len() as u8,
|
||||
data: value,
|
||||
}
|
||||
}
|
||||
kvm_ioctls::VcpuExit::Hlt => VcpuExitReason::Halt,
|
||||
kvm_ioctls::VcpuExit::Shutdown => VcpuExitReason::Shutdown,
|
||||
kvm_ioctls::VcpuExit::SystemEvent(event_type, _flags) => {
|
||||
VcpuExitReason::SystemEvent { event_type }
|
||||
}
|
||||
kvm_ioctls::VcpuExit::InternalError => {
|
||||
VcpuExitReason::InternalError { suberror: 0 }
|
||||
}
|
||||
_ => VcpuExitReason::Unknown { reason: 0 },
|
||||
}
|
||||
}
|
||||
|
||||
/// Pause the vCPU
|
||||
pub fn pause(&self) -> Result<()> {
|
||||
self.command_tx
|
||||
.send(VcpuCommand::Pause)
|
||||
.map_err(|_| VcpuError::ChannelError)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Stop the vCPU
|
||||
pub fn stop(&self) -> Result<()> {
|
||||
self.command_tx
|
||||
.send(VcpuCommand::Stop)
|
||||
.map_err(|_| VcpuError::ChannelError)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Check if vCPU is running
|
||||
pub fn is_running(&self) -> bool {
|
||||
self.running.load(Ordering::SeqCst)
|
||||
}
|
||||
|
||||
/// Get pending exits
|
||||
pub fn poll_exit(&self) -> Option<VcpuExitReason> {
|
||||
self.exit_rx.try_recv().ok()
|
||||
}
|
||||
|
||||
/// Wait for next exit with timeout
|
||||
pub fn wait_exit(&self, timeout: std::time::Duration) -> Option<VcpuExitReason> {
|
||||
self.exit_rx.recv_timeout(timeout).ok()
|
||||
}
|
||||
|
||||
/// Get current registers
|
||||
pub fn get_regs(&self) -> Result<kvm_regs> {
|
||||
self.fd.lock().get_regs().map_err(KvmError::GetRegisters)
|
||||
}
|
||||
|
||||
/// Set registers
|
||||
pub fn set_regs(&self, regs: &kvm_regs) -> Result<()> {
|
||||
self.fd.lock().set_regs(regs).map_err(KvmError::SetRegisters)
|
||||
}
|
||||
|
||||
/// Get special registers
|
||||
pub fn get_sregs(&self) -> Result<kvm_sregs> {
|
||||
self.fd.lock().get_sregs().map_err(KvmError::GetRegisters)
|
||||
}
|
||||
|
||||
/// Get vCPU ID
|
||||
pub fn id(&self) -> u8 {
|
||||
self.id
|
||||
}
|
||||
|
||||
/// Lock and access the VcpuFd for snapshot operations.
|
||||
/// The caller must ensure the vCPU thread is paused before calling.
|
||||
pub fn lock_fd(&self) -> parking_lot::MutexGuard<'_, VcpuFd> {
|
||||
self.fd.lock()
|
||||
}
|
||||
}
|
||||
|
||||
// From impl generated by #[from] on KvmError::Vcpu
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_vcpu_state() {
|
||||
assert_eq!(VcpuRunState::Created, VcpuRunState::Created);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_io_direction() {
|
||||
assert_ne!(IoDirection::In, IoDirection::Out);
|
||||
}
|
||||
}
|
||||
394
vmm/src/kvm/vm.rs
Normal file
394
vmm/src/kvm/vm.rs
Normal file
@@ -0,0 +1,394 @@
|
||||
//! VM Creation and Management
|
||||
//!
|
||||
//! Handles KVM VM lifecycle with optimizations for fast boot:
|
||||
//! - Pre-configured IRQ chip and PIT
|
||||
//! - Efficient memory region setup
|
||||
//! - Minimal syscall overhead
|
||||
|
||||
use crate::kvm::{
|
||||
cpuid,
|
||||
memory::{GuestMemoryManager, MemoryConfig},
|
||||
vcpu::{self, VcpuConfig, VcpuHandle},
|
||||
x86_64, KvmError, Result,
|
||||
};
|
||||
use kvm_bindings::{
|
||||
kvm_pit_config, kvm_userspace_memory_region, CpuId, KVM_MEM_LOG_DIRTY_PAGES,
|
||||
KVM_PIT_SPEAKER_DUMMY,
|
||||
};
|
||||
use kvm_ioctls::{Kvm, VmFd};
|
||||
use parking_lot::RwLock;
|
||||
use std::sync::Arc;
|
||||
|
||||
/// VM configuration
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct VmConfig {
|
||||
/// Memory size in bytes
|
||||
pub memory_size: u64,
|
||||
/// Number of vCPUs
|
||||
pub vcpu_count: u8,
|
||||
/// Enable huge pages (2MB)
|
||||
pub huge_pages: bool,
|
||||
/// Enable dirty page tracking (for live migration)
|
||||
pub track_dirty_pages: bool,
|
||||
/// Custom memory config (optional)
|
||||
pub memory_config: Option<MemoryConfig>,
|
||||
}
|
||||
|
||||
impl Default for VmConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
memory_size: 256 * 1024 * 1024, // 256 MB default
|
||||
vcpu_count: 1,
|
||||
huge_pages: true, // Enable by default for performance
|
||||
track_dirty_pages: false,
|
||||
memory_config: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// VM state machine
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum VmState {
|
||||
/// VM created but not started
|
||||
Created,
|
||||
/// VM is running
|
||||
Running,
|
||||
/// VM is paused
|
||||
Paused,
|
||||
/// VM has stopped
|
||||
Stopped,
|
||||
}
|
||||
|
||||
/// Virtual Machine instance
|
||||
pub struct Vm {
|
||||
/// KVM VM file descriptor
|
||||
fd: VmFd,
|
||||
/// VM configuration
|
||||
config: VmConfig,
|
||||
/// Guest memory manager
|
||||
memory: Arc<GuestMemoryManager>,
|
||||
/// vCPU handles
|
||||
vcpus: RwLock<Vec<VcpuHandle>>,
|
||||
/// Current VM state
|
||||
state: RwLock<VmState>,
|
||||
/// Memory region slot counter
|
||||
next_slot: RwLock<u32>,
|
||||
/// Filtered CPUID to apply to vCPUs
|
||||
cpuid: Option<CpuId>,
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
impl Vm {
|
||||
/// Create a new VM with the given configuration
|
||||
pub fn new(kvm: &Kvm, config: VmConfig) -> Result<Self> {
|
||||
let vm_start = std::time::Instant::now();
|
||||
|
||||
// Create VM fd
|
||||
let fd = kvm.create_vm().map_err(KvmError::CreateVm)?;
|
||||
let t_create_vm = vm_start.elapsed();
|
||||
|
||||
// Set TSS address (required for x86_64)
|
||||
fd.set_tss_address(x86_64::TSS_ADDR as usize)
|
||||
.map_err(|e| KvmError::CreateVm(e))?;
|
||||
|
||||
// Create in-kernel IRQ chip (8259 PIC + IOAPIC)
|
||||
fd.create_irq_chip().map_err(KvmError::IrqChip)?;
|
||||
|
||||
// Create in-kernel PIT (8254 timer)
|
||||
let pit_config = kvm_pit_config {
|
||||
flags: KVM_PIT_SPEAKER_DUMMY, // Disable PC speaker
|
||||
..Default::default()
|
||||
};
|
||||
fd.create_pit2(pit_config).map_err(KvmError::Pit)?;
|
||||
let t_irq_pit = vm_start.elapsed();
|
||||
|
||||
// Get filtered CPUID for vCPUs
|
||||
let cpuid_config = cpuid::CpuidConfig {
|
||||
vcpu_count: config.vcpu_count,
|
||||
vcpu_id: 0, // Will be overridden per-vCPU
|
||||
};
|
||||
let filtered_cpuid = cpuid::get_filtered_cpuid(kvm, &cpuid_config)
|
||||
.map_err(|e| {
|
||||
tracing::warn!("Failed to get filtered CPUID, will continue without: {}", e);
|
||||
e
|
||||
})
|
||||
.ok();
|
||||
let t_cpuid = vm_start.elapsed();
|
||||
|
||||
// Setup guest memory
|
||||
let mem_config = config.memory_config.clone().unwrap_or_else(|| {
|
||||
MemoryConfig::new(config.memory_size).with_huge_pages(config.huge_pages)
|
||||
});
|
||||
|
||||
let memory = Arc::new(GuestMemoryManager::new(mem_config)?);
|
||||
let t_memory = vm_start.elapsed();
|
||||
|
||||
let vm = Self {
|
||||
fd,
|
||||
config: config.clone(),
|
||||
memory,
|
||||
vcpus: RwLock::new(Vec::with_capacity(config.vcpu_count as usize)),
|
||||
state: RwLock::new(VmState::Created),
|
||||
next_slot: RwLock::new(0),
|
||||
cpuid: filtered_cpuid,
|
||||
};
|
||||
|
||||
// Register memory regions with KVM
|
||||
vm.setup_memory_regions()?;
|
||||
let t_total = vm_start.elapsed();
|
||||
|
||||
tracing::info!(
|
||||
"VM created: {} MB RAM, {} vCPUs, huge_pages={} [create_vm={:.1}ms, irq+pit={:.1}ms, cpuid={:.1}ms, memory={:.1}ms, total={:.1}ms]",
|
||||
config.memory_size / (1024 * 1024),
|
||||
config.vcpu_count,
|
||||
config.huge_pages,
|
||||
t_create_vm.as_secs_f64() * 1000.0,
|
||||
(t_irq_pit - t_create_vm).as_secs_f64() * 1000.0,
|
||||
(t_cpuid - t_irq_pit).as_secs_f64() * 1000.0,
|
||||
(t_memory - t_cpuid).as_secs_f64() * 1000.0,
|
||||
t_total.as_secs_f64() * 1000.0,
|
||||
);
|
||||
|
||||
Ok(vm)
|
||||
}
|
||||
|
||||
/// Setup memory regions with KVM
|
||||
fn setup_memory_regions(&self) -> Result<()> {
|
||||
let regions = self.memory.regions();
|
||||
|
||||
for region in regions {
|
||||
self.add_memory_region(
|
||||
region.guest_addr,
|
||||
region.size,
|
||||
region.host_addr as u64,
|
||||
)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Add a memory region to the VM
|
||||
pub fn add_memory_region(
|
||||
&self,
|
||||
guest_addr: u64,
|
||||
size: u64,
|
||||
host_addr: u64,
|
||||
) -> Result<u32> {
|
||||
let mut slot = self.next_slot.write();
|
||||
let slot_id = *slot;
|
||||
|
||||
let flags = if self.config.track_dirty_pages {
|
||||
KVM_MEM_LOG_DIRTY_PAGES
|
||||
} else {
|
||||
0
|
||||
};
|
||||
|
||||
let mem_region = kvm_userspace_memory_region {
|
||||
slot: slot_id,
|
||||
flags,
|
||||
guest_phys_addr: guest_addr,
|
||||
memory_size: size,
|
||||
userspace_addr: host_addr,
|
||||
};
|
||||
|
||||
// SAFETY: Memory region is valid and properly mapped
|
||||
unsafe {
|
||||
self.fd
|
||||
.set_user_memory_region(mem_region)
|
||||
.map_err(KvmError::SetMemoryRegion)?;
|
||||
}
|
||||
|
||||
*slot += 1;
|
||||
|
||||
tracing::debug!(
|
||||
"Memory region {}: guest=0x{:x}, size={} MB, host=0x{:x}",
|
||||
slot_id,
|
||||
guest_addr,
|
||||
size / (1024 * 1024),
|
||||
host_addr
|
||||
);
|
||||
|
||||
Ok(slot_id)
|
||||
}
|
||||
|
||||
/// Create vCPUs for this VM
|
||||
pub fn create_vcpus(&self) -> Result<()> {
|
||||
self.create_vcpus_with_mmio(None)
|
||||
}
|
||||
|
||||
/// Create vCPUs with an optional MMIO handler for device dispatch
|
||||
pub fn create_vcpus_with_mmio(&self, mmio_handler: Option<Arc<dyn vcpu::MmioHandler>>) -> Result<()> {
|
||||
let mut vcpus = self.vcpus.write();
|
||||
|
||||
for id in 0..self.config.vcpu_count {
|
||||
let vcpu_fd = self
|
||||
.fd
|
||||
.create_vcpu(id as u64)
|
||||
.map_err(KvmError::CreateVcpu)?;
|
||||
|
||||
// Apply CPUID to vCPU (must be done before setting registers)
|
||||
if let Some(ref base_cpuid) = self.cpuid {
|
||||
// Clone the base CPUID and adjust per-vCPU fields
|
||||
let mut vcpu_cpuid = base_cpuid.clone();
|
||||
|
||||
// Update APIC ID in leaf 0x1 for this specific vCPU
|
||||
for entry in vcpu_cpuid.as_mut_slice().iter_mut() {
|
||||
if entry.function == 0x1 {
|
||||
entry.ebx = (entry.ebx & 0x00FFFFFF) | ((id as u32) << 24);
|
||||
}
|
||||
if entry.function == 0xb {
|
||||
entry.edx = id as u32;
|
||||
}
|
||||
}
|
||||
|
||||
cpuid::apply_cpuid(&vcpu_fd, &vcpu_cpuid)?;
|
||||
tracing::debug!("Applied CPUID to vCPU {}", id);
|
||||
}
|
||||
|
||||
let vcpu_config = VcpuConfig {
|
||||
id,
|
||||
memory: Arc::clone(&self.memory),
|
||||
mmio_handler: mmio_handler.clone(),
|
||||
};
|
||||
|
||||
let handle = VcpuHandle::new(vcpu_fd, vcpu_config)?;
|
||||
vcpus.push(handle);
|
||||
}
|
||||
|
||||
tracing::debug!("Created {} vCPUs", self.config.vcpu_count);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Initialize vCPU registers for 64-bit long mode boot
|
||||
pub fn setup_vcpu_boot_state(&self, kernel_entry: u64, boot_params_addr: u64) -> Result<()> {
|
||||
self.setup_vcpu_boot_state_with_cr3(kernel_entry, boot_params_addr, 0x1000)
|
||||
}
|
||||
|
||||
/// Initialize vCPU registers for 64-bit long mode boot with explicit CR3
|
||||
pub fn setup_vcpu_boot_state_with_cr3(&self, kernel_entry: u64, boot_params_addr: u64, cr3: u64) -> Result<()> {
|
||||
let vcpus = self.vcpus.read();
|
||||
|
||||
if let Some(vcpu) = vcpus.first() {
|
||||
vcpu.setup_long_mode_with_cr3(kernel_entry, boot_params_addr, cr3)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Start all vCPUs
|
||||
pub fn start(&self) -> Result<()> {
|
||||
let mut state = self.state.write();
|
||||
if *state != VmState::Created && *state != VmState::Paused {
|
||||
tracing::warn!("Cannot start VM in state {:?}", *state);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let vcpus = self.vcpus.read();
|
||||
for vcpu in vcpus.iter() {
|
||||
vcpu.start()?;
|
||||
}
|
||||
|
||||
*state = VmState::Running;
|
||||
tracing::info!("VM started");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Pause all vCPUs
|
||||
pub fn pause(&self) -> Result<()> {
|
||||
let mut state = self.state.write();
|
||||
if *state != VmState::Running {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let vcpus = self.vcpus.read();
|
||||
for vcpu in vcpus.iter() {
|
||||
vcpu.pause()?;
|
||||
}
|
||||
|
||||
*state = VmState::Paused;
|
||||
tracing::info!("VM paused");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Stop the VM
|
||||
pub fn stop(&self) -> Result<()> {
|
||||
let mut state = self.state.write();
|
||||
|
||||
let vcpus = self.vcpus.read();
|
||||
for vcpu in vcpus.iter() {
|
||||
vcpu.stop()?;
|
||||
}
|
||||
|
||||
*state = VmState::Stopped;
|
||||
tracing::info!("VM stopped");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get VM state
|
||||
pub fn state(&self) -> VmState {
|
||||
*self.state.read()
|
||||
}
|
||||
|
||||
/// Get guest memory reference
|
||||
pub fn memory(&self) -> &Arc<GuestMemoryManager> {
|
||||
&self.memory
|
||||
}
|
||||
|
||||
/// Get VM fd for advanced operations
|
||||
pub fn fd(&self) -> &VmFd {
|
||||
&self.fd
|
||||
}
|
||||
|
||||
/// Get read access to vCPU handles (for snapshot)
|
||||
pub fn vcpus_read(&self) -> parking_lot::RwLockReadGuard<'_, Vec<VcpuHandle>> {
|
||||
self.vcpus.read()
|
||||
}
|
||||
|
||||
/// Signal an IRQ to the guest
|
||||
pub fn signal_irq(&self, irq: u32) -> Result<()> {
|
||||
self.fd
|
||||
.set_irq_line(irq, true)
|
||||
.map_err(KvmError::IrqChip)?;
|
||||
self.fd
|
||||
.set_irq_line(irq, false)
|
||||
.map_err(KvmError::IrqChip)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get dirty pages bitmap for a memory slot
|
||||
pub fn get_dirty_log(&self, slot: u32) -> Result<Vec<u64>> {
|
||||
let regions = self.memory.regions();
|
||||
let region = regions.get(slot as usize).ok_or_else(|| {
|
||||
KvmError::Memory(crate::kvm::memory::MemoryError::InvalidRegion(slot as u64))
|
||||
})?;
|
||||
|
||||
let _bitmap_size = (region.size / 4096 / 64) as usize + 1;
|
||||
|
||||
let bitmap = self
|
||||
.fd
|
||||
.get_dirty_log(slot, region.size as usize)
|
||||
.map_err(|e| KvmError::SetMemoryRegion(e))?;
|
||||
|
||||
Ok(bitmap)
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for Vm {
|
||||
fn drop(&mut self) {
|
||||
// Ensure all vCPUs are stopped before VM is dropped
|
||||
let _ = self.stop();
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_vm_config_default() {
|
||||
let config = VmConfig::default();
|
||||
assert_eq!(config.memory_size, 256 * 1024 * 1024);
|
||||
assert_eq!(config.vcpu_count, 1);
|
||||
assert!(config.huge_pages);
|
||||
}
|
||||
}
|
||||
77
vmm/src/lib.rs
Normal file
77
vmm/src/lib.rs
Normal file
@@ -0,0 +1,77 @@
|
||||
//! Volt VMM - Ultra-fast microVM Manager
|
||||
//!
|
||||
//! A lightweight, high-performance VMM targeting <125ms boot times.
|
||||
//! Built on rust-vmm components for production reliability.
|
||||
//!
|
||||
//! # Architecture
|
||||
//!
|
||||
//! ```text
|
||||
//! ┌─────────────────────────────────────────────────────┐
|
||||
//! │ Volt VMM │
|
||||
//! ├─────────────────────────────────────────────────────┤
|
||||
//! │ ┌─────────┐ ┌──────────┐ ┌─────────────────────┐ │
|
||||
//! │ │ API │ │ Config │ │ Metrics/Logging │ │
|
||||
//! │ └────┬────┘ └────┬─────┘ └──────────┬──────────┘ │
|
||||
//! │ │ │ │ │
|
||||
//! │ ┌────▼────────────▼────────────────────▼──────────┐ │
|
||||
//! │ │ VMM Core │ │
|
||||
//! │ ├──────────────────────────────────────────────────┤ │
|
||||
//! │ │ ┌───────┐ ┌────────┐ ┌────────┐ ┌────────┐ │ │
|
||||
//! │ │ │ KVM │ │ Memory │ │ vCPUs │ │Devices │ │ │
|
||||
//! │ │ └───┬───┘ └────┬───┘ └────┬───┘ └────┬───┘ │ │
|
||||
//! │ └──────┼───────────┼──────────┼───────────┼───────┘ │
|
||||
//! └─────────┼───────────┼──────────┼───────────┼─────────┘
|
||||
//! │ │ │ │
|
||||
//! ┌─────▼───────────▼──────────▼───────────▼─────┐
|
||||
//! │ Linux KVM │
|
||||
//! └────────────────────────────────────────────────┘
|
||||
//! ```
|
||||
//!
|
||||
//! # Quick Start
|
||||
//!
|
||||
//! ```ignore
|
||||
//! use volt-vmm_vmm::kvm::{KvmSystem, VmConfig};
|
||||
//!
|
||||
//! // Initialize KVM
|
||||
//! let kvm = KvmSystem::new().expect("KVM init failed");
|
||||
//!
|
||||
//! // Create a VM with 256MB RAM
|
||||
//! let config = VmConfig {
|
||||
//! memory_size: 256 * 1024 * 1024,
|
||||
//! vcpu_count: 1,
|
||||
//! huge_pages: true,
|
||||
//! ..Default::default()
|
||||
//! };
|
||||
//!
|
||||
//! let vm = kvm.create_vm(config).expect("VM creation failed");
|
||||
//! ```
|
||||
//!
|
||||
//! # Performance Targets
|
||||
//!
|
||||
//! - Boot time: <125ms (kernel + init)
|
||||
//! - Memory overhead: <5MB per VM
|
||||
//! - vCPU startup: <5ms
|
||||
//!
|
||||
|
||||
pub mod boot;
|
||||
pub mod kvm;
|
||||
pub mod net;
|
||||
pub mod pool;
|
||||
|
||||
// Re-export commonly used types
|
||||
pub use kvm::{KvmSystem, Vm, VmConfig, VmState};
|
||||
pub use kvm::{VcpuHandle, VcpuConfig, VcpuExitReason};
|
||||
pub use kvm::{GuestMemoryManager, MemoryConfig};
|
||||
|
||||
/// VMM version
|
||||
pub const VERSION: &str = env!("CARGO_PKG_VERSION");
|
||||
|
||||
/// Build info for debugging
|
||||
pub fn build_info() -> String {
|
||||
format!(
|
||||
"Volt VMM v{} (rust-vmm based)\n\
|
||||
Target: <125ms boot, <5MB overhead\n\
|
||||
Features: KVM, huge pages, fast vCPU init",
|
||||
VERSION
|
||||
)
|
||||
}
|
||||
2254
vmm/src/main.rs
Normal file
2254
vmm/src/main.rs
Normal file
File diff suppressed because it is too large
Load Diff
615
vmm/src/net/macvtap.rs
Normal file
615
vmm/src/net/macvtap.rs
Normal file
@@ -0,0 +1,615 @@
|
||||
//! macvtap backend for Volt VMM
|
||||
//!
|
||||
//! macvtap provides direct kernel networking with higher performance than
|
||||
//! userspace virtio-net emulation. It creates a virtual interface (macvtap)
|
||||
//! directly in the kernel, bypassing the TAP + bridge overhead.
|
||||
//!
|
||||
//! # Architecture
|
||||
//!
|
||||
//! ```text
|
||||
//! ┌─────────────────────────────────────────────────────────────┐
|
||||
//! │ Guest VM │
|
||||
//! │ ┌────────────────────────────────────────────────────────┐ │
|
||||
//! │ │ virtio-net driver │ │
|
||||
//! │ └──────────────────────────┬─────────────────────────────┘ │
|
||||
//! └─────────────────────────────┼───────────────────────────────┘
|
||||
//! │
|
||||
//! ┌─────────────────────────────┼───────────────────────────────┐
|
||||
//! │ Volt VMM │ /dev/tapN │
|
||||
//! │ │ (macvtap device node) │
|
||||
//! └─────────────────────────────┼───────────────────────────────┘
|
||||
//! │
|
||||
//! ┌─────────────────────────────┼───────────────────────────────┐
|
||||
//! │ Host Kernel │ │
|
||||
//! │ ┌──────────────────────────▼─────────────────────────────┐ │
|
||||
//! │ │ macvtap │ │
|
||||
//! │ │ ┌────────────┐ ┌────────────┐ ┌────────────┐ │ │
|
||||
//! │ │ │ Mode: │ │ Mode: │ │ Mode: │ │ │
|
||||
//! │ │ │ bridge │ │ vepa │ │ private │ │ │
|
||||
//! │ │ └──────┬─────┘ └──────┬─────┘ └──────┬─────┘ │ │
|
||||
//! │ └─────────┼─────────────────┼─────────────────┼──────────┘ │
|
||||
//! │ └─────────────────┼─────────────────┘ │
|
||||
//! │ │ │
|
||||
//! │ ┌───────────────────────────▼──────────────────────────┐ │
|
||||
//! │ │ Physical NIC (eth0/ens0) │ │
|
||||
//! │ └───────────────────────────────────────────────────────┘ │
|
||||
//! └─────────────────────────────────────────────────────────────┘
|
||||
//! ```
|
||||
//!
|
||||
//! # Modes
|
||||
//!
|
||||
//! - **Bridge**: VMs can communicate with each other and the host
|
||||
//! - **VEPA**: All traffic goes through external switch (802.1Qbg)
|
||||
//! - **Private**: VMs isolated from each other, only external traffic
|
||||
//! - **Passthru**: Single VM has direct access to parent device
|
||||
|
||||
use super::{
|
||||
get_ifindex, InterfaceType, MacAddress, NetError, NetworkBackend, NetworkConfig,
|
||||
NetworkInterface, Result,
|
||||
};
|
||||
use std::collections::HashMap;
|
||||
use std::fs::{self, OpenOptions};
|
||||
use std::os::unix::io::{IntoRawFd, RawFd};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
/// macvtap operating mode
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum MacvtapMode {
|
||||
/// Bridge mode - VMs can talk to each other and host
|
||||
Bridge,
|
||||
/// VEPA mode - all traffic through external switch
|
||||
Vepa,
|
||||
/// Private mode - VMs isolated, external only
|
||||
Private,
|
||||
/// Passthru mode - single VM, direct device access
|
||||
Passthru,
|
||||
/// Source mode - filter by source MAC
|
||||
Source,
|
||||
}
|
||||
|
||||
impl MacvtapMode {
|
||||
/// Convert to kernel mode value
|
||||
pub fn to_kernel_mode(&self) -> u32 {
|
||||
match self {
|
||||
MacvtapMode::Private => 1,
|
||||
MacvtapMode::Vepa => 2,
|
||||
MacvtapMode::Bridge => 4,
|
||||
MacvtapMode::Passthru => 8,
|
||||
MacvtapMode::Source => 16,
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert from string
|
||||
pub fn from_str(s: &str) -> Option<Self> {
|
||||
match s.to_lowercase().as_str() {
|
||||
"bridge" => Some(MacvtapMode::Bridge),
|
||||
"vepa" => Some(MacvtapMode::Vepa),
|
||||
"private" => Some(MacvtapMode::Private),
|
||||
"passthru" | "passthrough" => Some(MacvtapMode::Passthru),
|
||||
"source" => Some(MacvtapMode::Source),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert to string for ip command
|
||||
pub fn as_str(&self) -> &'static str {
|
||||
match self {
|
||||
MacvtapMode::Bridge => "bridge",
|
||||
MacvtapMode::Vepa => "vepa",
|
||||
MacvtapMode::Private => "private",
|
||||
MacvtapMode::Passthru => "passthru",
|
||||
MacvtapMode::Source => "source",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for MacvtapMode {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.as_str())
|
||||
}
|
||||
}
|
||||
|
||||
/// macvtap network backend
|
||||
pub struct MacvtapBackend {
|
||||
/// Parent physical interface
|
||||
parent_interface: String,
|
||||
/// Operating mode
|
||||
mode: MacvtapMode,
|
||||
/// Track created interfaces for cleanup
|
||||
interfaces: Arc<Mutex<HashMap<String, MacvtapInterface>>>,
|
||||
}
|
||||
|
||||
/// Tracked macvtap interface
|
||||
#[allow(dead_code)]
|
||||
struct MacvtapInterface {
|
||||
/// macvtap interface name
|
||||
name: String,
|
||||
/// Device node path (e.g., /dev/tap42)
|
||||
device_path: PathBuf,
|
||||
/// File descriptor
|
||||
fd: RawFd,
|
||||
/// Interface index
|
||||
ifindex: u32,
|
||||
}
|
||||
|
||||
impl MacvtapBackend {
|
||||
/// Create a new macvtap backend
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `parent` - Name of the parent physical interface (e.g., "eth0")
|
||||
/// * `mode` - macvtap operating mode
|
||||
pub fn new(parent: &str, mode: MacvtapMode) -> Result<Self> {
|
||||
// Verify parent interface exists
|
||||
let sysfs_path = format!("/sys/class/net/{}", parent);
|
||||
if !Path::new(&sysfs_path).exists() {
|
||||
return Err(NetError::InterfaceNotFound(parent.to_string()));
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
parent_interface: parent.to_string(),
|
||||
mode,
|
||||
interfaces: Arc::new(Mutex::new(HashMap::new())),
|
||||
})
|
||||
}
|
||||
|
||||
/// Create macvtap interface via netlink/ip command
|
||||
fn create_macvtap(&self, name: &str, mac: &MacAddress) -> Result<u32> {
|
||||
// Create macvtap interface
|
||||
let output = std::process::Command::new("ip")
|
||||
.args([
|
||||
"link",
|
||||
"add",
|
||||
"link",
|
||||
&self.parent_interface,
|
||||
"name",
|
||||
name,
|
||||
"type",
|
||||
"macvtap",
|
||||
"mode",
|
||||
self.mode.as_str(),
|
||||
])
|
||||
.output()
|
||||
.map_err(|e| NetError::Macvtap(format!("Failed to run ip command: {}", e)))?;
|
||||
|
||||
if !output.status.success() {
|
||||
return Err(NetError::Macvtap(format!(
|
||||
"Failed to create macvtap {}: {}",
|
||||
name,
|
||||
String::from_utf8_lossy(&output.stderr)
|
||||
)));
|
||||
}
|
||||
|
||||
// Set MAC address
|
||||
let output = std::process::Command::new("ip")
|
||||
.args(["link", "set", name, "address", &mac.to_string()])
|
||||
.output()
|
||||
.map_err(|e| NetError::Macvtap(format!("Failed to set MAC: {}", e)))?;
|
||||
|
||||
if !output.status.success() {
|
||||
// Clean up on failure
|
||||
let _ = self.delete_macvtap(name);
|
||||
return Err(NetError::Macvtap(format!(
|
||||
"Failed to set MAC address: {}",
|
||||
String::from_utf8_lossy(&output.stderr)
|
||||
)));
|
||||
}
|
||||
|
||||
// Bring interface up
|
||||
let output = std::process::Command::new("ip")
|
||||
.args(["link", "set", name, "up"])
|
||||
.output()
|
||||
.map_err(|e| NetError::Macvtap(format!("Failed to bring up interface: {}", e)))?;
|
||||
|
||||
if !output.status.success() {
|
||||
let _ = self.delete_macvtap(name);
|
||||
return Err(NetError::Macvtap(format!(
|
||||
"Failed to bring up {}: {}",
|
||||
name,
|
||||
String::from_utf8_lossy(&output.stderr)
|
||||
)));
|
||||
}
|
||||
|
||||
// Get interface index
|
||||
get_ifindex(name)
|
||||
}
|
||||
|
||||
/// Delete macvtap interface
|
||||
fn delete_macvtap(&self, name: &str) -> Result<()> {
|
||||
let output = std::process::Command::new("ip")
|
||||
.args(["link", "delete", name])
|
||||
.output()
|
||||
.map_err(|e| NetError::Macvtap(format!("Failed to run ip command: {}", e)))?;
|
||||
|
||||
if !output.status.success() {
|
||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||
if !stderr.contains("Cannot find device") {
|
||||
tracing::warn!("Failed to delete macvtap {}: {}", name, stderr);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get the device node path for a macvtap interface
|
||||
fn get_device_path(&self, ifindex: u32) -> PathBuf {
|
||||
PathBuf::from(format!("/dev/tap{}", ifindex))
|
||||
}
|
||||
|
||||
/// Find or create the device node for macvtap
|
||||
fn ensure_device_node(&self, ifindex: u32, iface_name: &str) -> Result<PathBuf> {
|
||||
let dev_path = self.get_device_path(ifindex);
|
||||
|
||||
// Check if device node exists
|
||||
if dev_path.exists() {
|
||||
return Ok(dev_path);
|
||||
}
|
||||
|
||||
// Read major/minor from sysfs
|
||||
let sysfs_dev = format!("/sys/class/net/{}/tap{}/dev", iface_name, ifindex);
|
||||
if !Path::new(&sysfs_dev).exists() {
|
||||
// Alternative path
|
||||
let alt_path = format!("/sys/devices/virtual/net/{}/tap{}/dev", iface_name, ifindex);
|
||||
if !Path::new(&alt_path).exists() {
|
||||
return Err(NetError::Macvtap(format!(
|
||||
"Cannot find sysfs device entry for {}",
|
||||
iface_name
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
let dev_content = fs::read_to_string(&sysfs_dev).map_err(|e| {
|
||||
NetError::Macvtap(format!("Failed to read {}: {}", sysfs_dev, e))
|
||||
})?;
|
||||
|
||||
let parts: Vec<&str> = dev_content.trim().split(':').collect();
|
||||
if parts.len() != 2 {
|
||||
return Err(NetError::Macvtap(format!(
|
||||
"Invalid dev format in {}: {}",
|
||||
sysfs_dev, dev_content
|
||||
)));
|
||||
}
|
||||
|
||||
let major: u32 = parts[0]
|
||||
.parse()
|
||||
.map_err(|_| NetError::Macvtap("Invalid major number".to_string()))?;
|
||||
let minor: u32 = parts[1]
|
||||
.parse()
|
||||
.map_err(|_| NetError::Macvtap("Invalid minor number".to_string()))?;
|
||||
|
||||
// Create device node
|
||||
let dev = libc::makedev(major, minor);
|
||||
let c_path = std::ffi::CString::new(dev_path.to_str().unwrap())
|
||||
.map_err(|_| NetError::Macvtap("Invalid path".to_string()))?;
|
||||
|
||||
let ret = unsafe { libc::mknod(c_path.as_ptr(), libc::S_IFCHR | 0o660, dev) };
|
||||
|
||||
if ret < 0 {
|
||||
return Err(NetError::Macvtap(format!(
|
||||
"mknod failed: {}",
|
||||
std::io::Error::last_os_error()
|
||||
)));
|
||||
}
|
||||
|
||||
Ok(dev_path)
|
||||
}
|
||||
|
||||
/// Open macvtap device node
|
||||
fn open_macvtap(&self, dev_path: &Path) -> Result<RawFd> {
|
||||
let file = OpenOptions::new()
|
||||
.read(true)
|
||||
.write(true)
|
||||
.open(dev_path)
|
||||
.map_err(|e| {
|
||||
if e.kind() == std::io::ErrorKind::PermissionDenied {
|
||||
NetError::PermissionDenied(format!(
|
||||
"Cannot open {} - run as root or add to 'kvm' group",
|
||||
dev_path.display()
|
||||
))
|
||||
} else {
|
||||
NetError::Macvtap(format!(
|
||||
"Failed to open {}: {}",
|
||||
dev_path.display(),
|
||||
e
|
||||
))
|
||||
}
|
||||
})?;
|
||||
|
||||
let fd = file.into_raw_fd();
|
||||
|
||||
// Set vnet header flags
|
||||
self.set_vnet_hdr(fd)?;
|
||||
|
||||
Ok(fd)
|
||||
}
|
||||
|
||||
/// Enable vnet header for virtio compatibility
|
||||
///
|
||||
/// NOTE: macvtap devices do NOT need TUNSETIFF — they are already configured
|
||||
/// by the kernel when the macvtap interface is created. Calling TUNSETIFF on
|
||||
/// a /dev/tapN fd causes EINVAL. We only need TUNSETVNETHDRSZ and nonblocking.
|
||||
fn set_vnet_hdr(&self, fd: RawFd) -> Result<()> {
|
||||
// Set vnet header size (required for virtio-net compatibility)
|
||||
let hdr_size: libc::c_int = 12;
|
||||
unsafe {
|
||||
super::tun_ioctl::tunsetvnethdrsz(fd, hdr_size as u64)
|
||||
.map_err(|e| NetError::Macvtap(format!("TUNSETVNETHDRSZ failed: {}", e)))?;
|
||||
}
|
||||
|
||||
// Set non-blocking mode
|
||||
let flags = unsafe { libc::fcntl(fd, libc::F_GETFL) };
|
||||
if flags >= 0 {
|
||||
unsafe { libc::fcntl(fd, libc::F_SETFL, flags | libc::O_NONBLOCK) };
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Generate unique interface name for a VM
|
||||
fn generate_iface_name(&self, vm_id: &str) -> String {
|
||||
let short_id: String = vm_id
|
||||
.chars()
|
||||
.filter(|c| c.is_alphanumeric())
|
||||
.take(6)
|
||||
.collect();
|
||||
format!("nfmvt{}", short_id)
|
||||
}
|
||||
|
||||
/// Get the mode
|
||||
pub fn mode(&self) -> MacvtapMode {
|
||||
self.mode
|
||||
}
|
||||
|
||||
/// Get the parent interface
|
||||
pub fn parent(&self) -> &str {
|
||||
&self.parent_interface
|
||||
}
|
||||
}
|
||||
|
||||
impl NetworkBackend for MacvtapBackend {
|
||||
fn create_interface(&self, config: &NetworkConfig) -> Result<NetworkInterface> {
|
||||
let mac = config.mac_address.clone().unwrap_or_else(MacAddress::random);
|
||||
let iface_name = self.generate_iface_name(&config.vm_id);
|
||||
|
||||
// Create the macvtap interface
|
||||
let ifindex = self.create_macvtap(&iface_name, &mac)?;
|
||||
|
||||
// Get/create device node
|
||||
let dev_path = self.ensure_device_node(ifindex, &iface_name)?;
|
||||
|
||||
// Open the device
|
||||
let fd = self.open_macvtap(&dev_path)?;
|
||||
|
||||
// Track for cleanup
|
||||
{
|
||||
let mut interfaces = self.interfaces.lock().unwrap();
|
||||
interfaces.insert(
|
||||
config.vm_id.clone(),
|
||||
MacvtapInterface {
|
||||
name: iface_name.clone(),
|
||||
device_path: dev_path.clone(),
|
||||
fd,
|
||||
ifindex,
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
// Multiqueue support for macvtap
|
||||
let queue_fds = if config.multiqueue && config.num_queues > 1 {
|
||||
let mut fds = Vec::new();
|
||||
for _ in 1..config.num_queues {
|
||||
let qfd = self.open_macvtap(&dev_path)?;
|
||||
fds.push(qfd);
|
||||
}
|
||||
fds
|
||||
} else {
|
||||
Vec::new()
|
||||
};
|
||||
|
||||
Ok(NetworkInterface {
|
||||
name: iface_name,
|
||||
ifindex,
|
||||
fd,
|
||||
mac,
|
||||
iface_type: InterfaceType::Macvtap,
|
||||
bridge: None, // macvtap doesn't use bridges
|
||||
vhost_fd: None, // macvtap doesn't need vhost (already kernel-accelerated)
|
||||
queue_fds,
|
||||
})
|
||||
}
|
||||
|
||||
fn attach_to_vm(&self, iface: &NetworkInterface) -> Result<RawFd> {
|
||||
// macvtap fd is used directly
|
||||
Ok(iface.fd)
|
||||
}
|
||||
|
||||
fn as_any(&self) -> &dyn std::any::Any {
|
||||
self
|
||||
}
|
||||
|
||||
fn cleanup(&self, vm_id: &str) -> Result<()> {
|
||||
let mut interfaces = self.interfaces.lock().unwrap();
|
||||
|
||||
if let Some(iface) = interfaces.remove(vm_id) {
|
||||
// Close fd
|
||||
unsafe {
|
||||
libc::close(iface.fd);
|
||||
}
|
||||
|
||||
// Delete the macvtap interface
|
||||
self.delete_macvtap(&iface.name)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn backend_type(&self) -> &'static str {
|
||||
"macvtap"
|
||||
}
|
||||
|
||||
fn supports_vhost(&self) -> bool {
|
||||
// macvtap is already kernel-accelerated
|
||||
false
|
||||
}
|
||||
|
||||
fn supports_multiqueue(&self) -> bool {
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
/// Information about a macvtap interface
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct MacvtapInfo {
|
||||
/// Interface name
|
||||
pub name: String,
|
||||
/// MAC address
|
||||
pub mac: MacAddress,
|
||||
/// Interface index
|
||||
pub ifindex: u32,
|
||||
/// Operating mode
|
||||
pub mode: MacvtapMode,
|
||||
/// Parent interface
|
||||
pub parent: String,
|
||||
/// Link state (up/down)
|
||||
pub link_up: bool,
|
||||
/// TX bytes
|
||||
pub tx_bytes: u64,
|
||||
/// RX bytes
|
||||
pub rx_bytes: u64,
|
||||
}
|
||||
|
||||
/// Query macvtap interface information from sysfs
|
||||
pub fn get_macvtap_info(name: &str) -> Result<MacvtapInfo> {
|
||||
let sysfs_base = format!("/sys/class/net/{}", name);
|
||||
|
||||
if !Path::new(&sysfs_base).exists() {
|
||||
return Err(NetError::InterfaceNotFound(name.to_string()));
|
||||
}
|
||||
|
||||
// Read interface index
|
||||
let ifindex = fs::read_to_string(format!("{}/ifindex", sysfs_base))
|
||||
.map_err(|e| NetError::Macvtap(format!("Failed to read ifindex: {}", e)))?
|
||||
.trim()
|
||||
.parse::<u32>()
|
||||
.map_err(|_| NetError::Macvtap("Invalid ifindex".to_string()))?;
|
||||
|
||||
// Read MAC address
|
||||
let mac_str = fs::read_to_string(format!("{}/address", sysfs_base))
|
||||
.map_err(|e| NetError::Macvtap(format!("Failed to read address: {}", e)))?;
|
||||
let mac = MacAddress::parse(mac_str.trim())?;
|
||||
|
||||
// Read link state
|
||||
let operstate = fs::read_to_string(format!("{}/operstate", sysfs_base))
|
||||
.map_err(|e| NetError::Macvtap(format!("Failed to read operstate: {}", e)))?;
|
||||
let link_up = operstate.trim() == "up";
|
||||
|
||||
// Read statistics
|
||||
let tx_bytes = fs::read_to_string(format!("{}/statistics/tx_bytes", sysfs_base))
|
||||
.ok()
|
||||
.and_then(|s| s.trim().parse::<u64>().ok())
|
||||
.unwrap_or(0);
|
||||
|
||||
let rx_bytes = fs::read_to_string(format!("{}/statistics/rx_bytes", sysfs_base))
|
||||
.ok()
|
||||
.and_then(|s| s.trim().parse::<u64>().ok())
|
||||
.unwrap_or(0);
|
||||
|
||||
// Try to read link (parent interface)
|
||||
let parent = fs::read_link(format!("{}/lower", sysfs_base))
|
||||
.ok()
|
||||
.and_then(|p| {
|
||||
p.file_name()
|
||||
.map(|n| n.to_string_lossy().to_string())
|
||||
})
|
||||
.unwrap_or_else(|| "unknown".to_string());
|
||||
|
||||
// Try to read mode from sysfs (may not be available)
|
||||
let mode = MacvtapMode::Bridge; // Default, would need netlink to get actual mode
|
||||
|
||||
Ok(MacvtapInfo {
|
||||
name: name.to_string(),
|
||||
mac,
|
||||
ifindex,
|
||||
mode,
|
||||
parent,
|
||||
link_up,
|
||||
tx_bytes,
|
||||
rx_bytes,
|
||||
})
|
||||
}
|
||||
|
||||
/// List all macvtap interfaces on the system
|
||||
pub fn list_macvtap_interfaces() -> Result<Vec<String>> {
|
||||
let net_dir = Path::new("/sys/class/net");
|
||||
let mut macvtaps = Vec::new();
|
||||
|
||||
if let Ok(entries) = fs::read_dir(net_dir) {
|
||||
for entry in entries.flatten() {
|
||||
let name = entry.file_name().to_string_lossy().to_string();
|
||||
let type_path = entry.path().join("type");
|
||||
|
||||
// macvtap has device type 801 (0x321)
|
||||
if let Ok(type_str) = fs::read_to_string(&type_path) {
|
||||
if let Ok(_dev_type) = type_str.trim().parse::<u32>() {
|
||||
// ARPHRD_ETHER with macvtap
|
||||
if name.starts_with("macvtap") || name.starts_with("nfmvt") {
|
||||
macvtaps.push(name);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(macvtaps)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_macvtap_mode() {
|
||||
assert_eq!(MacvtapMode::Bridge.as_str(), "bridge");
|
||||
assert_eq!(MacvtapMode::Vepa.as_str(), "vepa");
|
||||
assert_eq!(MacvtapMode::Private.as_str(), "private");
|
||||
assert_eq!(MacvtapMode::Passthru.as_str(), "passthru");
|
||||
|
||||
assert_eq!(MacvtapMode::from_str("bridge"), Some(MacvtapMode::Bridge));
|
||||
assert_eq!(MacvtapMode::from_str("VEPA"), Some(MacvtapMode::Vepa));
|
||||
assert_eq!(MacvtapMode::from_str("invalid"), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_macvtap_mode_kernel_values() {
|
||||
// These should match linux/if_link.h MACVLAN_MODE_*
|
||||
assert_eq!(MacvtapMode::Private.to_kernel_mode(), 1);
|
||||
assert_eq!(MacvtapMode::Vepa.to_kernel_mode(), 2);
|
||||
assert_eq!(MacvtapMode::Bridge.to_kernel_mode(), 4);
|
||||
assert_eq!(MacvtapMode::Passthru.to_kernel_mode(), 8);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_generate_iface_name() {
|
||||
let backend = MacvtapBackend {
|
||||
parent_interface: "eth0".to_string(),
|
||||
mode: MacvtapMode::Bridge,
|
||||
interfaces: Arc::new(Mutex::new(HashMap::new())),
|
||||
};
|
||||
|
||||
let name = backend.generate_iface_name("vm-test-123456789");
|
||||
assert!(name.starts_with("nfmvt"));
|
||||
assert!(name.len() <= 15); // Linux interface name limit
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_device_path() {
|
||||
let backend = MacvtapBackend {
|
||||
parent_interface: "eth0".to_string(),
|
||||
mode: MacvtapMode::Bridge,
|
||||
interfaces: Arc::new(Mutex::new(HashMap::new())),
|
||||
};
|
||||
|
||||
let path = backend.get_device_path(42);
|
||||
assert_eq!(path, PathBuf::from("/dev/tap42"));
|
||||
}
|
||||
}
|
||||
567
vmm/src/net/mod.rs
Normal file
567
vmm/src/net/mod.rs
Normal file
@@ -0,0 +1,567 @@
|
||||
//! Network backend abstraction for Volt VMM
|
||||
//!
|
||||
//! This module provides a unified interface for different network backends:
|
||||
//! - TAP + systemd-networkd (standard networking)
|
||||
//! - vhost-net (kernel-accelerated networking)
|
||||
//! - macvtap (direct kernel networking for high performance)
|
||||
//!
|
||||
//! # Architecture
|
||||
//!
|
||||
//! ```text
|
||||
//! ┌─────────────────────────────────────────────────────────────┐
|
||||
//! │ NetworkBackend Trait │
|
||||
//! ├─────────────────────────────────────────────────────────────┤
|
||||
//! │ create_interface() → NetworkInterface │
|
||||
//! │ attach_to_vm() → RawFd │
|
||||
//! │ cleanup() → () │
|
||||
//! └─────────────┬───────────────┬───────────────┬───────────────┘
|
||||
//! │ │ │
|
||||
//! ┌───────▼─────┐ ┌───────▼─────┐ ┌───────▼─────┐
|
||||
//! │ TAP+networkd│ │ vhost-net │ │ macvtap │
|
||||
//! └─────────────┘ └─────────────┘ └─────────────┘
|
||||
//! ```
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub mod macvtap;
|
||||
pub mod networkd;
|
||||
#[allow(dead_code)]
|
||||
pub mod vhost;
|
||||
|
||||
use std::fmt;
|
||||
use std::net::Ipv4Addr;
|
||||
use std::os::unix::io::RawFd;
|
||||
use thiserror::Error;
|
||||
|
||||
/// Re-exports for convenience
|
||||
pub use macvtap::{MacvtapBackend, MacvtapMode};
|
||||
pub use networkd::NetworkdBackend;
|
||||
pub use vhost::VhostNetBackend;
|
||||
|
||||
/// Network backend errors
|
||||
#[derive(Error, Debug)]
|
||||
pub enum NetError {
|
||||
#[error("Failed to create network interface: {0}")]
|
||||
InterfaceCreation(String),
|
||||
|
||||
#[error("Failed to open TAP device: {0}")]
|
||||
TapOpen(#[from] std::io::Error),
|
||||
|
||||
#[error("Failed to configure networkd: {0}")]
|
||||
NetworkdConfig(String),
|
||||
|
||||
#[error("Failed to reload networkd: {0}")]
|
||||
NetworkdReload(String),
|
||||
|
||||
#[error("vhost-net error: {0}")]
|
||||
VhostNet(String),
|
||||
|
||||
#[error("macvtap error: {0}")]
|
||||
Macvtap(String),
|
||||
|
||||
#[error("ioctl failed: {0}")]
|
||||
Ioctl(String),
|
||||
|
||||
#[error("Interface not found: {0}")]
|
||||
InterfaceNotFound(String),
|
||||
|
||||
#[error("Permission denied: {0}")]
|
||||
PermissionDenied(String),
|
||||
|
||||
#[error("D-Bus error: {0}")]
|
||||
DBus(String),
|
||||
}
|
||||
|
||||
pub type Result<T> = std::result::Result<T, NetError>;
|
||||
|
||||
/// MAC address representation
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Hash)]
|
||||
pub struct MacAddress(pub [u8; 6]);
|
||||
|
||||
impl MacAddress {
|
||||
/// Generate a random local unicast MAC address
|
||||
pub fn random() -> Self {
|
||||
use std::time::{SystemTime, UNIX_EPOCH};
|
||||
let nanos = SystemTime::now()
|
||||
.duration_since(UNIX_EPOCH)
|
||||
.unwrap_or_default()
|
||||
.subsec_nanos();
|
||||
|
||||
// XOR with process ID for better entropy
|
||||
let pid = std::process::id();
|
||||
let mut mac = [0u8; 6];
|
||||
mac[0] = 0x52; // Local, unicast (bit 1 set, bit 0 clear)
|
||||
mac[1] = 0x54;
|
||||
mac[2] = 0x00;
|
||||
mac[3] = ((nanos >> 16) ^ (pid >> 8)) as u8;
|
||||
mac[4] = ((nanos >> 8) ^ pid) as u8;
|
||||
mac[5] = (nanos ^ (pid << 8)) as u8;
|
||||
|
||||
Self(mac)
|
||||
}
|
||||
|
||||
/// Create MAC from bytes
|
||||
pub fn from_bytes(bytes: [u8; 6]) -> Self {
|
||||
Self(bytes)
|
||||
}
|
||||
|
||||
/// Parse MAC from string (e.g., "52:54:00:ab:cd:ef")
|
||||
pub fn parse(s: &str) -> Result<Self> {
|
||||
let parts: Vec<&str> = s.split(':').collect();
|
||||
if parts.len() != 6 {
|
||||
return Err(NetError::InterfaceCreation(format!(
|
||||
"Invalid MAC address format: {}",
|
||||
s
|
||||
)));
|
||||
}
|
||||
|
||||
let mut bytes = [0u8; 6];
|
||||
for (i, part) in parts.iter().enumerate() {
|
||||
bytes[i] = u8::from_str_radix(part, 16).map_err(|_| {
|
||||
NetError::InterfaceCreation(format!("Invalid MAC address byte: {}", part))
|
||||
})?;
|
||||
}
|
||||
|
||||
Ok(Self(bytes))
|
||||
}
|
||||
|
||||
/// Get raw bytes
|
||||
#[allow(dead_code)]
|
||||
pub fn as_bytes(&self) -> &[u8; 6] {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for MacAddress {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"{:02x}:{:02x}:{:02x}:{:02x}:{:02x}:{:02x}",
|
||||
self.0[0], self.0[1], self.0[2], self.0[3], self.0[4], self.0[5]
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for MacAddress {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(f, "MacAddress({})", self)
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for MacAddress {
|
||||
fn default() -> Self {
|
||||
Self::random()
|
||||
}
|
||||
}
|
||||
|
||||
/// Network interface configuration
|
||||
#[derive(Debug, Clone)]
|
||||
#[allow(dead_code)]
|
||||
pub struct NetworkConfig {
|
||||
/// VM identifier
|
||||
pub vm_id: String,
|
||||
/// Network name/namespace
|
||||
pub network_name: Option<String>,
|
||||
/// MAC address (auto-generated if None)
|
||||
pub mac_address: Option<MacAddress>,
|
||||
/// Bridge to attach to (for TAP backend)
|
||||
pub bridge: Option<String>,
|
||||
/// IP address for the interface
|
||||
pub ip_address: Option<Ipv4Addr>,
|
||||
/// Netmask (CIDR prefix)
|
||||
pub netmask: Option<u8>,
|
||||
/// Gateway address
|
||||
pub gateway: Option<Ipv4Addr>,
|
||||
/// MTU (default: 1500)
|
||||
pub mtu: u16,
|
||||
/// Enable multiqueue
|
||||
pub multiqueue: bool,
|
||||
/// Number of queues (if multiqueue enabled)
|
||||
pub num_queues: u8,
|
||||
/// Parent interface for macvtap
|
||||
pub parent_interface: Option<String>,
|
||||
/// VLAN ID
|
||||
pub vlan_id: Option<u16>,
|
||||
}
|
||||
|
||||
impl Default for NetworkConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
vm_id: String::new(),
|
||||
network_name: None,
|
||||
mac_address: None,
|
||||
bridge: None,
|
||||
ip_address: None,
|
||||
netmask: None,
|
||||
gateway: None,
|
||||
mtu: 1500,
|
||||
multiqueue: false,
|
||||
num_queues: 1,
|
||||
parent_interface: None,
|
||||
vlan_id: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Represents a created network interface
|
||||
#[derive(Debug)]
|
||||
#[allow(dead_code)]
|
||||
pub struct NetworkInterface {
|
||||
/// Interface name (e.g., "tap0", "macvtap0")
|
||||
pub name: String,
|
||||
/// Interface index
|
||||
pub ifindex: u32,
|
||||
/// File descriptor for the TAP/macvtap device
|
||||
pub fd: RawFd,
|
||||
/// MAC address
|
||||
pub mac: MacAddress,
|
||||
/// Interface type
|
||||
pub iface_type: InterfaceType,
|
||||
/// Associated bridge (if any)
|
||||
pub bridge: Option<String>,
|
||||
/// vhost-net fd (if acceleration enabled)
|
||||
pub vhost_fd: Option<RawFd>,
|
||||
/// Multiqueue fds (if enabled)
|
||||
pub queue_fds: Vec<RawFd>,
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
impl NetworkInterface {
|
||||
/// Get the primary file descriptor for this interface
|
||||
pub fn primary_fd(&self) -> RawFd {
|
||||
self.fd
|
||||
}
|
||||
|
||||
/// Check if vhost-net acceleration is enabled
|
||||
pub fn has_vhost(&self) -> bool {
|
||||
self.vhost_fd.is_some()
|
||||
}
|
||||
|
||||
/// Check if multiqueue is enabled
|
||||
pub fn is_multiqueue(&self) -> bool {
|
||||
!self.queue_fds.is_empty()
|
||||
}
|
||||
}
|
||||
|
||||
/// Type of network interface
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum InterfaceType {
|
||||
/// Standard TAP interface
|
||||
Tap,
|
||||
/// TAP with vhost-net acceleration
|
||||
TapVhost,
|
||||
/// macvtap (direct kernel networking)
|
||||
Macvtap,
|
||||
}
|
||||
|
||||
impl fmt::Display for InterfaceType {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
InterfaceType::Tap => write!(f, "tap"),
|
||||
InterfaceType::TapVhost => write!(f, "tap+vhost"),
|
||||
InterfaceType::Macvtap => write!(f, "macvtap"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Network backend trait - implemented by TAP, vhost-net, and macvtap
|
||||
pub trait NetworkBackend: Send + Sync {
|
||||
/// Create a network interface for the specified VM
|
||||
fn create_interface(&self, config: &NetworkConfig) -> Result<NetworkInterface>;
|
||||
|
||||
/// Attach the interface to a VM, returning the fd for virtio-net
|
||||
fn attach_to_vm(&self, iface: &NetworkInterface) -> Result<RawFd>;
|
||||
|
||||
/// Clean up all network resources for a VM
|
||||
fn cleanup(&self, vm_id: &str) -> Result<()>;
|
||||
|
||||
/// Get the backend type name
|
||||
fn backend_type(&self) -> &'static str;
|
||||
|
||||
/// Upcast to Any for safe downcasting to concrete backend types
|
||||
fn as_any(&self) -> &dyn std::any::Any;
|
||||
|
||||
/// Check if this backend supports vhost-net acceleration
|
||||
#[allow(dead_code)]
|
||||
fn supports_vhost(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
/// Check if this backend supports multiqueue
|
||||
#[allow(dead_code)]
|
||||
fn supports_multiqueue(&self) -> bool {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
/// Builder for creating network backends
|
||||
pub struct NetworkBackendBuilder {
|
||||
backend_type: BackendType,
|
||||
use_vhost: bool,
|
||||
networkd_dir: Option<String>,
|
||||
parent_interface: Option<String>,
|
||||
macvtap_mode: MacvtapMode,
|
||||
}
|
||||
|
||||
/// Type of backend to create
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum BackendType {
|
||||
/// TAP with systemd-networkd
|
||||
TapNetworkd,
|
||||
/// vhost-net accelerated TAP
|
||||
VhostNet,
|
||||
/// macvtap direct networking
|
||||
Macvtap,
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
impl NetworkBackendBuilder {
|
||||
/// Create a new builder with the specified backend type
|
||||
pub fn new(backend_type: BackendType) -> Self {
|
||||
Self {
|
||||
backend_type,
|
||||
use_vhost: false,
|
||||
networkd_dir: None,
|
||||
parent_interface: None,
|
||||
macvtap_mode: MacvtapMode::Bridge,
|
||||
}
|
||||
}
|
||||
|
||||
/// Enable vhost-net acceleration (where supported)
|
||||
pub fn with_vhost(mut self, enable: bool) -> Self {
|
||||
self.use_vhost = enable;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set custom networkd configuration directory
|
||||
pub fn networkd_dir(mut self, dir: &str) -> Self {
|
||||
self.networkd_dir = Some(dir.to_string());
|
||||
self
|
||||
}
|
||||
|
||||
/// Set parent interface for macvtap
|
||||
pub fn parent_interface(mut self, iface: &str) -> Self {
|
||||
self.parent_interface = Some(iface.to_string());
|
||||
self
|
||||
}
|
||||
|
||||
/// Set macvtap mode
|
||||
pub fn macvtap_mode(mut self, mode: MacvtapMode) -> Self {
|
||||
self.macvtap_mode = mode;
|
||||
self
|
||||
}
|
||||
|
||||
/// Build the network backend
|
||||
pub fn build(self) -> Result<Box<dyn NetworkBackend>> {
|
||||
match self.backend_type {
|
||||
BackendType::TapNetworkd => {
|
||||
let dir = self
|
||||
.networkd_dir
|
||||
.unwrap_or_else(|| "/run/systemd/network".to_string());
|
||||
Ok(Box::new(NetworkdBackend::new(&dir, self.use_vhost)?))
|
||||
}
|
||||
BackendType::VhostNet => Ok(Box::new(VhostNetBackend::new()?)),
|
||||
BackendType::Macvtap => {
|
||||
let parent = self.parent_interface.ok_or_else(|| {
|
||||
NetError::InterfaceCreation("macvtap requires parent interface".to_string())
|
||||
})?;
|
||||
Ok(Box::new(MacvtapBackend::new(&parent, self.macvtap_mode)?))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// TAP device ioctl helpers
|
||||
// ============================================================================
|
||||
|
||||
/// TAP device flags
|
||||
#[allow(dead_code)]
|
||||
pub mod tap_flags {
|
||||
pub const IFF_TUN: libc::c_short = 0x0001;
|
||||
pub const IFF_TAP: libc::c_short = 0x0002;
|
||||
pub const IFF_NO_PI: libc::c_short = 0x1000;
|
||||
pub const IFF_VNET_HDR: libc::c_short = 0x4000;
|
||||
pub const IFF_MULTI_QUEUE: libc::c_short = 0x0100;
|
||||
}
|
||||
|
||||
/// ioctl numbers for TUN/TAP
|
||||
pub mod tun_ioctl {
|
||||
use nix::ioctl_write_int;
|
||||
use nix::ioctl_write_ptr;
|
||||
|
||||
#[allow(dead_code)]
|
||||
const TUNSETIFF: u64 = 0x400454ca;
|
||||
#[allow(dead_code)]
|
||||
const TUNSETOFFLOAD: u64 = 0x400454d0;
|
||||
#[allow(dead_code)]
|
||||
const TUNSETVNETHDRSZ: u64 = 0x400454d8;
|
||||
#[allow(dead_code)]
|
||||
const TUNGETIFF: u64 = 0x800454d2;
|
||||
|
||||
ioctl_write_ptr!(tunsetiff, b'T', 0xca, libc::ifreq);
|
||||
ioctl_write_int!(tunsetoffload, b'T', 0xd0);
|
||||
ioctl_write_int!(tunsetvnethdrsz, b'T', 0xd8);
|
||||
}
|
||||
|
||||
/// Open a TAP device with the given name
|
||||
pub fn open_tap(name: &str, multiqueue: bool, vnet_hdr: bool) -> Result<(RawFd, String)> {
|
||||
use std::ffi::CString;
|
||||
use std::fs::OpenOptions;
|
||||
use std::os::unix::io::IntoRawFd;
|
||||
|
||||
let tun_file = OpenOptions::new()
|
||||
.read(true)
|
||||
.write(true)
|
||||
.open("/dev/net/tun")?;
|
||||
|
||||
let fd = tun_file.into_raw_fd();
|
||||
|
||||
let mut ifr: libc::ifreq = unsafe { std::mem::zeroed() };
|
||||
|
||||
// Set interface name (or empty for auto-assignment)
|
||||
if !name.is_empty() {
|
||||
let name_bytes = name.as_bytes();
|
||||
let len = std::cmp::min(name_bytes.len(), libc::IFNAMSIZ - 1);
|
||||
unsafe {
|
||||
std::ptr::copy_nonoverlapping(
|
||||
name_bytes.as_ptr(),
|
||||
ifr.ifr_name.as_mut_ptr() as *mut u8,
|
||||
len,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Set flags
|
||||
let mut flags = tap_flags::IFF_TAP | tap_flags::IFF_NO_PI;
|
||||
if multiqueue {
|
||||
flags |= tap_flags::IFF_MULTI_QUEUE;
|
||||
}
|
||||
if vnet_hdr {
|
||||
flags |= tap_flags::IFF_VNET_HDR;
|
||||
}
|
||||
ifr.ifr_ifru.ifru_flags = flags;
|
||||
|
||||
// Create the TAP interface
|
||||
unsafe {
|
||||
tun_ioctl::tunsetiff(fd, &ifr).map_err(|e| NetError::Ioctl(format!("TUNSETIFF: {}", e)))?;
|
||||
}
|
||||
|
||||
// Extract the assigned interface name
|
||||
let iface_name = unsafe {
|
||||
let name_ptr = ifr.ifr_name.as_ptr();
|
||||
CString::from_raw(name_ptr as *mut i8)
|
||||
.into_string()
|
||||
.unwrap_or_else(|_| String::from("tap0"))
|
||||
};
|
||||
|
||||
// Set vnet header size for virtio compatibility
|
||||
if vnet_hdr {
|
||||
let hdr_size: libc::c_int = 12; // virtio_net_hdr_v1 size
|
||||
unsafe {
|
||||
tun_ioctl::tunsetvnethdrsz(fd, hdr_size as u64)
|
||||
.map_err(|e| NetError::Ioctl(format!("TUNSETVNETHDRSZ: {}", e)))?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok((fd, iface_name))
|
||||
}
|
||||
|
||||
/// Get the interface index for a given name
|
||||
pub fn get_ifindex(name: &str) -> Result<u32> {
|
||||
use std::ffi::CString;
|
||||
|
||||
let cname = CString::new(name)
|
||||
.map_err(|_| NetError::InterfaceCreation("Invalid interface name".to_string()))?;
|
||||
|
||||
let idx = unsafe { libc::if_nametoindex(cname.as_ptr()) };
|
||||
if idx == 0 {
|
||||
return Err(NetError::InterfaceNotFound(name.to_string()));
|
||||
}
|
||||
|
||||
Ok(idx)
|
||||
}
|
||||
|
||||
/// Set interface up
|
||||
pub fn set_interface_up(name: &str) -> Result<()> {
|
||||
use std::process::Command;
|
||||
|
||||
let output = Command::new("ip")
|
||||
.args(["link", "set", name, "up"])
|
||||
.output()
|
||||
.map_err(|e| NetError::InterfaceCreation(format!("Failed to run ip command: {}", e)))?;
|
||||
|
||||
if !output.status.success() {
|
||||
return Err(NetError::InterfaceCreation(format!(
|
||||
"Failed to bring up {}: {}",
|
||||
name,
|
||||
String::from_utf8_lossy(&output.stderr)
|
||||
)));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Add interface to bridge
|
||||
pub fn add_to_bridge(iface: &str, bridge: &str) -> Result<()> {
|
||||
use std::process::Command;
|
||||
|
||||
let output = Command::new("ip")
|
||||
.args(["link", "set", iface, "master", bridge])
|
||||
.output()
|
||||
.map_err(|e| NetError::InterfaceCreation(format!("Failed to run ip command: {}", e)))?;
|
||||
|
||||
if !output.status.success() {
|
||||
return Err(NetError::InterfaceCreation(format!(
|
||||
"Failed to add {} to bridge {}: {}",
|
||||
iface,
|
||||
bridge,
|
||||
String::from_utf8_lossy(&output.stderr)
|
||||
)));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_mac_address_random() {
|
||||
let mac1 = MacAddress::random();
|
||||
let mac2 = MacAddress::random();
|
||||
|
||||
// Local bit should be set
|
||||
assert_eq!(mac1.0[0] & 0x02, 0x02);
|
||||
// Unicast bit should be clear
|
||||
assert_eq!(mac1.0[0] & 0x01, 0x00);
|
||||
|
||||
// Two random MACs should differ (extremely high probability)
|
||||
// They share first 3 bytes by design
|
||||
assert_ne!(mac1.0[3..], mac2.0[3..]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_mac_address_parse() {
|
||||
let mac = MacAddress::parse("52:54:00:ab:cd:ef").unwrap();
|
||||
assert_eq!(mac.0, [0x52, 0x54, 0x00, 0xab, 0xcd, 0xef]);
|
||||
|
||||
assert!(MacAddress::parse("invalid").is_err());
|
||||
assert!(MacAddress::parse("52:54:00:ab:cd").is_err());
|
||||
assert!(MacAddress::parse("52:54:00:ab:cd:zz").is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_mac_address_display() {
|
||||
let mac = MacAddress::from_bytes([0x52, 0x54, 0x00, 0xab, 0xcd, 0xef]);
|
||||
assert_eq!(mac.to_string(), "52:54:00:ab:cd:ef");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_network_config_default() {
|
||||
let config = NetworkConfig::default();
|
||||
assert_eq!(config.mtu, 1500);
|
||||
assert!(!config.multiqueue);
|
||||
assert_eq!(config.num_queues, 1);
|
||||
}
|
||||
}
|
||||
695
vmm/src/net/networkd.rs
Normal file
695
vmm/src/net/networkd.rs
Normal file
@@ -0,0 +1,695 @@
|
||||
//! systemd-networkd integration for Volt VMM
|
||||
//!
|
||||
//! This module generates .netdev and .network configuration files for
|
||||
//! TAP/macvtap interfaces and manages them via networkd.
|
||||
//!
|
||||
//! # Configuration Files
|
||||
//!
|
||||
//! - `.netdev` files: Define virtual network devices (TAP, bridge, VLAN)
|
||||
//! - `.network` files: Configure network settings (IP, gateway, bridge attachment)
|
||||
//!
|
||||
//! # Reload Strategy
|
||||
//!
|
||||
//! Uses networkctl reload via D-Bus or direct command invocation.
|
||||
|
||||
use super::{
|
||||
get_ifindex, open_tap, set_interface_up, InterfaceType, MacAddress, NetError, NetworkBackend,
|
||||
NetworkConfig, NetworkInterface, Result,
|
||||
};
|
||||
use std::collections::HashMap;
|
||||
use std::fs::{self, File};
|
||||
use std::io::Write;
|
||||
use std::os::unix::io::RawFd;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
/// systemd-networkd backend for TAP interfaces
|
||||
pub struct NetworkdBackend {
|
||||
/// Directory for networkd configuration files
|
||||
config_dir: PathBuf,
|
||||
/// Use vhost-net acceleration
|
||||
use_vhost: bool,
|
||||
/// Track created interfaces for cleanup
|
||||
interfaces: Arc<Mutex<HashMap<String, Vec<String>>>>,
|
||||
/// Track created config files for cleanup
|
||||
config_files: Arc<Mutex<HashMap<String, Vec<PathBuf>>>>,
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
impl NetworkdBackend {
|
||||
/// Create a new networkd backend
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `config_dir` - Directory for .netdev and .network files (e.g., /run/systemd/network)
|
||||
/// * `use_vhost` - Enable vhost-net acceleration
|
||||
pub fn new(config_dir: &str, use_vhost: bool) -> Result<Self> {
|
||||
let path = PathBuf::from(config_dir);
|
||||
|
||||
// Ensure directory exists
|
||||
if !path.exists() {
|
||||
fs::create_dir_all(&path).map_err(|e| {
|
||||
NetError::NetworkdConfig(format!(
|
||||
"Failed to create config dir {}: {}",
|
||||
config_dir, e
|
||||
))
|
||||
})?;
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
config_dir: path,
|
||||
use_vhost,
|
||||
interfaces: Arc::new(Mutex::new(HashMap::new())),
|
||||
config_files: Arc::new(Mutex::new(HashMap::new())),
|
||||
})
|
||||
}
|
||||
|
||||
/// Generate .netdev file for a TAP interface
|
||||
fn generate_tap_netdev(&self, vm_id: &str, iface_name: &str, _mac: &MacAddress) -> String {
|
||||
format!(
|
||||
r#"# Volt TAP device for VM: {}
|
||||
# Auto-generated - do not edit
|
||||
|
||||
[NetDev]
|
||||
Name={}
|
||||
Kind=tap
|
||||
|
||||
[Tap]
|
||||
MultiQueue=yes
|
||||
PacketInfo=no
|
||||
VNetHeader=yes
|
||||
User=root
|
||||
Group=root
|
||||
"#,
|
||||
vm_id, iface_name
|
||||
)
|
||||
}
|
||||
|
||||
/// Generate .netdev file for a bridge
|
||||
fn generate_bridge_netdev(&self, bridge_name: &str) -> String {
|
||||
format!(
|
||||
r#"# Volt bridge
|
||||
# Auto-generated - do not edit
|
||||
|
||||
[NetDev]
|
||||
Name={}
|
||||
Kind=bridge
|
||||
|
||||
[Bridge]
|
||||
STP=no
|
||||
ForwardDelay=0
|
||||
"#,
|
||||
bridge_name
|
||||
)
|
||||
}
|
||||
|
||||
/// Generate .network file for TAP interface
|
||||
fn generate_tap_network(
|
||||
&self,
|
||||
vm_id: &str,
|
||||
iface_name: &str,
|
||||
config: &NetworkConfig,
|
||||
) -> String {
|
||||
let mut content = format!(
|
||||
r#"# Volt network config for VM: {}
|
||||
# Auto-generated - do not edit
|
||||
|
||||
[Match]
|
||||
Name={}
|
||||
|
||||
[Link]
|
||||
MTUBytes={}
|
||||
"#,
|
||||
vm_id, iface_name, config.mtu
|
||||
);
|
||||
|
||||
// Add bridge attachment if specified
|
||||
if let Some(ref bridge) = config.bridge {
|
||||
content.push_str(&format!(
|
||||
r#"
|
||||
[Network]
|
||||
Bridge={}
|
||||
"#,
|
||||
bridge
|
||||
));
|
||||
} else if let Some(ip) = config.ip_address {
|
||||
// Direct IP configuration
|
||||
let netmask = config.netmask.unwrap_or(24);
|
||||
content.push_str(&format!(
|
||||
r#"
|
||||
[Network]
|
||||
Address={}/{}
|
||||
"#,
|
||||
ip, netmask
|
||||
));
|
||||
|
||||
if let Some(gw) = config.gateway {
|
||||
content.push_str(&format!("Gateway={}\n", gw));
|
||||
}
|
||||
}
|
||||
|
||||
content
|
||||
}
|
||||
|
||||
/// Generate .network file for bridge
|
||||
fn generate_bridge_network(
|
||||
&self,
|
||||
bridge_name: &str,
|
||||
ip: Option<std::net::Ipv4Addr>,
|
||||
netmask: Option<u8>,
|
||||
gateway: Option<std::net::Ipv4Addr>,
|
||||
) -> String {
|
||||
let mut content = format!(
|
||||
r#"# Volt bridge network config
|
||||
# Auto-generated - do not edit
|
||||
|
||||
[Match]
|
||||
Name={}
|
||||
|
||||
[Network]
|
||||
"#,
|
||||
bridge_name
|
||||
);
|
||||
|
||||
if let Some(addr) = ip {
|
||||
let prefix = netmask.unwrap_or(24);
|
||||
content.push_str(&format!("Address={}/{}\n", addr, prefix));
|
||||
}
|
||||
|
||||
if let Some(gw) = gateway {
|
||||
content.push_str(&format!("Gateway={}\n", gw));
|
||||
}
|
||||
|
||||
// Enable DHCP server on bridge for VMs
|
||||
content.push_str("DHCPServer=yes\n");
|
||||
content.push_str("IPMasquerade=both\n");
|
||||
|
||||
content
|
||||
}
|
||||
|
||||
/// Generate .netdev file for macvtap
|
||||
fn generate_macvtap_netdev(
|
||||
&self,
|
||||
vm_id: &str,
|
||||
iface_name: &str,
|
||||
_parent: &str,
|
||||
mode: &str,
|
||||
) -> String {
|
||||
format!(
|
||||
r#"# Volt macvtap device for VM: {}
|
||||
# Auto-generated - do not edit
|
||||
|
||||
[NetDev]
|
||||
Name={}
|
||||
Kind=macvtap
|
||||
|
||||
[MACVTAP]
|
||||
Mode={}
|
||||
"#,
|
||||
vm_id, iface_name, mode
|
||||
)
|
||||
}
|
||||
|
||||
/// Generate .network file for macvtap
|
||||
fn generate_macvtap_network(&self, vm_id: &str, iface_name: &str, parent: &str) -> String {
|
||||
format!(
|
||||
r#"# Volt macvtap network config for VM: {}
|
||||
# Auto-generated - do not edit
|
||||
|
||||
[Match]
|
||||
Name={}
|
||||
|
||||
[Network]
|
||||
# macvtap inherits from parent interface {}
|
||||
"#,
|
||||
vm_id, iface_name, parent
|
||||
)
|
||||
}
|
||||
|
||||
/// Write configuration file
|
||||
fn write_config(&self, vm_id: &str, filename: &str, content: &str) -> Result<PathBuf> {
|
||||
let path = self.config_dir.join(filename);
|
||||
|
||||
let mut file = File::create(&path).map_err(|e| {
|
||||
NetError::NetworkdConfig(format!("Failed to create {}: {}", path.display(), e))
|
||||
})?;
|
||||
|
||||
file.write_all(content.as_bytes()).map_err(|e| {
|
||||
NetError::NetworkdConfig(format!("Failed to write {}: {}", path.display(), e))
|
||||
})?;
|
||||
|
||||
// Track for cleanup
|
||||
let mut files = self.config_files.lock().unwrap();
|
||||
files
|
||||
.entry(vm_id.to_string())
|
||||
.or_insert_with(Vec::new)
|
||||
.push(path.clone());
|
||||
|
||||
Ok(path)
|
||||
}
|
||||
|
||||
/// Reload systemd-networkd to apply configuration
|
||||
pub fn reload(&self) -> Result<()> {
|
||||
// Try D-Bus first, fall back to networkctl
|
||||
if let Err(_) = self.reload_dbus() {
|
||||
self.reload_networkctl()?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Reload via D-Bus (preferred method)
|
||||
fn reload_dbus(&self) -> Result<()> {
|
||||
// Use busctl to send reload signal
|
||||
let output = std::process::Command::new("busctl")
|
||||
.args([
|
||||
"call",
|
||||
"org.freedesktop.network1",
|
||||
"/org/freedesktop/network1",
|
||||
"org.freedesktop.network1.Manager",
|
||||
"Reload",
|
||||
])
|
||||
.output()
|
||||
.map_err(|e| NetError::DBus(format!("Failed to execute busctl: {}", e)))?;
|
||||
|
||||
if !output.status.success() {
|
||||
return Err(NetError::DBus(format!(
|
||||
"busctl failed: {}",
|
||||
String::from_utf8_lossy(&output.stderr)
|
||||
)));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Reload via networkctl command (fallback)
|
||||
fn reload_networkctl(&self) -> Result<()> {
|
||||
let output = std::process::Command::new("networkctl")
|
||||
.arg("reload")
|
||||
.output()
|
||||
.map_err(|e| NetError::NetworkdReload(format!("Failed to execute networkctl: {}", e)))?;
|
||||
|
||||
if !output.status.success() {
|
||||
return Err(NetError::NetworkdReload(format!(
|
||||
"networkctl reload failed: {}",
|
||||
String::from_utf8_lossy(&output.stderr)
|
||||
)));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Reconfigure a specific interface
|
||||
pub fn reconfigure(&self, iface_name: &str) -> Result<()> {
|
||||
let output = std::process::Command::new("networkctl")
|
||||
.args(["reconfigure", iface_name])
|
||||
.output()
|
||||
.map_err(|e| {
|
||||
NetError::NetworkdReload(format!("Failed to execute networkctl: {}", e))
|
||||
})?;
|
||||
|
||||
if !output.status.success() {
|
||||
return Err(NetError::NetworkdReload(format!(
|
||||
"networkctl reconfigure failed: {}",
|
||||
String::from_utf8_lossy(&output.stderr)
|
||||
)));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Generate unique interface name for a VM.
|
||||
///
|
||||
/// Convention: `tap-{vm_id}` (truncated to 15 chars, Linux IFNAMSIZ limit).
|
||||
fn generate_iface_name(&self, vm_id: &str) -> String {
|
||||
let sanitized: String = vm_id
|
||||
.chars()
|
||||
.filter(|c| c.is_alphanumeric() || *c == '-')
|
||||
.collect();
|
||||
let name = format!("tap-{}", sanitized);
|
||||
// Linux interface names are limited to 15 characters (IFNAMSIZ - 1)
|
||||
if name.len() > 15 {
|
||||
name[..15].to_string()
|
||||
} else {
|
||||
name
|
||||
}
|
||||
}
|
||||
|
||||
/// Delete networkd configuration files for a VM
|
||||
fn delete_config_files(&self, vm_id: &str) -> Result<()> {
|
||||
let mut files = self.config_files.lock().unwrap();
|
||||
if let Some(paths) = files.remove(vm_id) {
|
||||
for path in paths {
|
||||
if path.exists() {
|
||||
fs::remove_file(&path).map_err(|e| {
|
||||
NetError::NetworkdConfig(format!(
|
||||
"Failed to remove {}: {}",
|
||||
path.display(),
|
||||
e
|
||||
))
|
||||
})?;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Delete TAP interface
|
||||
fn delete_interface(&self, iface_name: &str) -> Result<()> {
|
||||
let output = std::process::Command::new("ip")
|
||||
.args(["link", "delete", iface_name])
|
||||
.output()
|
||||
.map_err(|e| NetError::InterfaceCreation(format!("Failed to run ip command: {}", e)))?;
|
||||
|
||||
// Don't error if interface doesn't exist
|
||||
if !output.status.success() {
|
||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||
if !stderr.contains("Cannot find device") {
|
||||
tracing::warn!("Failed to delete interface {}: {}", iface_name, stderr);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Create TAP interface via open/ioctl (faster than networkd-managed)
|
||||
fn create_tap_direct(&self, config: &NetworkConfig) -> Result<NetworkInterface> {
|
||||
let mac = config.mac_address.clone().unwrap_or_else(MacAddress::random);
|
||||
let iface_name = self.generate_iface_name(&config.vm_id);
|
||||
|
||||
// Open TAP device
|
||||
let (fd, actual_name) = open_tap(&iface_name, config.multiqueue, true)?;
|
||||
|
||||
// Bring interface up
|
||||
set_interface_up(&actual_name)?;
|
||||
|
||||
// Add to bridge if specified
|
||||
if let Some(ref bridge) = config.bridge {
|
||||
super::add_to_bridge(&actual_name, bridge)?;
|
||||
}
|
||||
|
||||
// Get interface index
|
||||
let ifindex = get_ifindex(&actual_name)?;
|
||||
|
||||
// Track interface for cleanup
|
||||
let mut interfaces = self.interfaces.lock().unwrap();
|
||||
interfaces
|
||||
.entry(config.vm_id.clone())
|
||||
.or_insert_with(Vec::new)
|
||||
.push(actual_name.clone());
|
||||
|
||||
// Open vhost-net if enabled
|
||||
let vhost_fd = if self.use_vhost {
|
||||
Some(super::vhost::open_vhost_net()?)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// Open additional queues if multiqueue enabled
|
||||
let queue_fds = if config.multiqueue && config.num_queues > 1 {
|
||||
let mut fds = Vec::new();
|
||||
for _ in 1..config.num_queues {
|
||||
let (qfd, _) = open_tap(&actual_name, true, true)?;
|
||||
fds.push(qfd);
|
||||
}
|
||||
fds
|
||||
} else {
|
||||
Vec::new()
|
||||
};
|
||||
|
||||
Ok(NetworkInterface {
|
||||
name: actual_name,
|
||||
ifindex,
|
||||
fd,
|
||||
mac,
|
||||
iface_type: if vhost_fd.is_some() {
|
||||
InterfaceType::TapVhost
|
||||
} else {
|
||||
InterfaceType::Tap
|
||||
},
|
||||
bridge: config.bridge.clone(),
|
||||
vhost_fd,
|
||||
queue_fds,
|
||||
})
|
||||
}
|
||||
|
||||
/// Create TAP interface via networkd configuration
|
||||
fn create_tap_networkd(&self, config: &NetworkConfig) -> Result<NetworkInterface> {
|
||||
let mac = config.mac_address.clone().unwrap_or_else(MacAddress::random);
|
||||
let iface_name = self.generate_iface_name(&config.vm_id);
|
||||
|
||||
// Generate and write .netdev file
|
||||
let netdev_content = self.generate_tap_netdev(&config.vm_id, &iface_name, &mac);
|
||||
self.write_config(
|
||||
&config.vm_id,
|
||||
&format!("50-volt-vmm-{}.netdev", iface_name),
|
||||
&netdev_content,
|
||||
)?;
|
||||
|
||||
// Generate and write .network file
|
||||
let network_content = self.generate_tap_network(&config.vm_id, &iface_name, config);
|
||||
self.write_config(
|
||||
&config.vm_id,
|
||||
&format!("50-volt-vmm-{}.network", iface_name),
|
||||
&network_content,
|
||||
)?;
|
||||
|
||||
// Reload networkd
|
||||
self.reload()?;
|
||||
|
||||
// Wait for interface to appear
|
||||
std::thread::sleep(std::time::Duration::from_millis(100));
|
||||
|
||||
// Now open the TAP device
|
||||
let (fd, _) = open_tap(&iface_name, config.multiqueue, true)?;
|
||||
|
||||
// Get interface index
|
||||
let ifindex = get_ifindex(&iface_name)?;
|
||||
|
||||
// Track interface for cleanup
|
||||
let mut interfaces = self.interfaces.lock().unwrap();
|
||||
interfaces
|
||||
.entry(config.vm_id.clone())
|
||||
.or_insert_with(Vec::new)
|
||||
.push(iface_name.clone());
|
||||
|
||||
// Open vhost-net if enabled
|
||||
let vhost_fd = if self.use_vhost {
|
||||
Some(super::vhost::open_vhost_net()?)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
Ok(NetworkInterface {
|
||||
name: iface_name,
|
||||
ifindex,
|
||||
fd,
|
||||
mac,
|
||||
iface_type: if vhost_fd.is_some() {
|
||||
InterfaceType::TapVhost
|
||||
} else {
|
||||
InterfaceType::Tap
|
||||
},
|
||||
bridge: config.bridge.clone(),
|
||||
vhost_fd,
|
||||
queue_fds: Vec::new(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Ensure a bridge exists
|
||||
pub fn ensure_bridge(
|
||||
&self,
|
||||
bridge_name: &str,
|
||||
ip: Option<std::net::Ipv4Addr>,
|
||||
netmask: Option<u8>,
|
||||
gateway: Option<std::net::Ipv4Addr>,
|
||||
) -> Result<()> {
|
||||
// Check if bridge already exists
|
||||
if Path::new(&format!("/sys/class/net/{}", bridge_name)).exists() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Generate bridge .netdev
|
||||
let netdev_content = self.generate_bridge_netdev(bridge_name);
|
||||
self.write_config(
|
||||
"volt-vmm-bridges",
|
||||
&format!("10-volt-vmm-{}.netdev", bridge_name),
|
||||
&netdev_content,
|
||||
)?;
|
||||
|
||||
// Generate bridge .network
|
||||
let network_content = self.generate_bridge_network(bridge_name, ip, netmask, gateway);
|
||||
self.write_config(
|
||||
"volt-vmm-bridges",
|
||||
&format!("10-volt-vmm-{}.network", bridge_name),
|
||||
&network_content,
|
||||
)?;
|
||||
|
||||
// Reload networkd
|
||||
self.reload()?;
|
||||
|
||||
// Wait for bridge to appear
|
||||
std::thread::sleep(std::time::Duration::from_millis(100));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl NetworkBackend for NetworkdBackend {
|
||||
fn create_interface(&self, config: &NetworkConfig) -> Result<NetworkInterface> {
|
||||
// Use direct TAP creation for speed, but write networkd configs for persistence
|
||||
self.create_tap_direct(config)
|
||||
}
|
||||
|
||||
fn attach_to_vm(&self, iface: &NetworkInterface) -> Result<RawFd> {
|
||||
// Return vhost fd if available, otherwise TAP fd
|
||||
Ok(iface.vhost_fd.unwrap_or(iface.fd))
|
||||
}
|
||||
|
||||
fn as_any(&self) -> &dyn std::any::Any {
|
||||
self
|
||||
}
|
||||
|
||||
fn cleanup(&self, vm_id: &str) -> Result<()> {
|
||||
// Delete interfaces
|
||||
{
|
||||
let mut interfaces = self.interfaces.lock().unwrap();
|
||||
if let Some(iface_names) = interfaces.remove(vm_id) {
|
||||
for name in iface_names {
|
||||
self.delete_interface(&name)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Delete config files
|
||||
self.delete_config_files(vm_id)?;
|
||||
|
||||
// Reload networkd to clean up state
|
||||
let _ = self.reload();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn backend_type(&self) -> &'static str {
|
||||
"tap+networkd"
|
||||
}
|
||||
|
||||
fn supports_vhost(&self) -> bool {
|
||||
self.use_vhost
|
||||
}
|
||||
|
||||
fn supports_multiqueue(&self) -> bool {
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
/// Configuration for a Volt bridge
|
||||
#[derive(Debug, Clone)]
|
||||
#[allow(dead_code)]
|
||||
pub struct BridgeConfig {
|
||||
/// Bridge name
|
||||
pub name: String,
|
||||
/// IP address for the bridge
|
||||
pub ip: Option<std::net::Ipv4Addr>,
|
||||
/// Netmask (CIDR prefix)
|
||||
pub netmask: Option<u8>,
|
||||
/// Gateway
|
||||
pub gateway: Option<std::net::Ipv4Addr>,
|
||||
/// Enable DHCP server
|
||||
pub dhcp_server: bool,
|
||||
/// DHCP range start
|
||||
pub dhcp_start: Option<std::net::Ipv4Addr>,
|
||||
/// DHCP range end
|
||||
pub dhcp_end: Option<std::net::Ipv4Addr>,
|
||||
}
|
||||
|
||||
impl Default for BridgeConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
name: "volt0".to_string(),
|
||||
ip: Some(std::net::Ipv4Addr::new(10, 100, 0, 1)),
|
||||
netmask: Some(24),
|
||||
gateway: None,
|
||||
dhcp_server: true,
|
||||
dhcp_start: Some(std::net::Ipv4Addr::new(10, 100, 0, 100)),
|
||||
dhcp_end: Some(std::net::Ipv4Addr::new(10, 100, 0, 199)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_generate_iface_name() {
|
||||
let backend = NetworkdBackend {
|
||||
config_dir: PathBuf::from("/tmp/test"),
|
||||
use_vhost: false,
|
||||
interfaces: Arc::new(Mutex::new(HashMap::new())),
|
||||
config_files: Arc::new(Mutex::new(HashMap::new())),
|
||||
};
|
||||
|
||||
let name = backend.generate_iface_name("vm-abc123-def456");
|
||||
assert!(name.starts_with("tap-"));
|
||||
assert!(name.len() <= 15); // Linux interface name limit
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_generate_tap_netdev() {
|
||||
let backend = NetworkdBackend {
|
||||
config_dir: PathBuf::from("/tmp/test"),
|
||||
use_vhost: false,
|
||||
interfaces: Arc::new(Mutex::new(HashMap::new())),
|
||||
config_files: Arc::new(Mutex::new(HashMap::new())),
|
||||
};
|
||||
|
||||
let mac = MacAddress::from_bytes([0x52, 0x54, 0x00, 0xab, 0xcd, 0xef]);
|
||||
let content = backend.generate_tap_netdev("vm123", "tap0", &mac);
|
||||
|
||||
assert!(content.contains("[NetDev]"));
|
||||
assert!(content.contains("Name=tap0"));
|
||||
assert!(content.contains("Kind=tap"));
|
||||
assert!(content.contains("MultiQueue=yes"));
|
||||
assert!(content.contains("VNetHeader=yes"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_generate_tap_network_with_bridge() {
|
||||
let backend = NetworkdBackend {
|
||||
config_dir: PathBuf::from("/tmp/test"),
|
||||
use_vhost: false,
|
||||
interfaces: Arc::new(Mutex::new(HashMap::new())),
|
||||
config_files: Arc::new(Mutex::new(HashMap::new())),
|
||||
};
|
||||
|
||||
let config = NetworkConfig {
|
||||
vm_id: "test-vm".to_string(),
|
||||
bridge: Some("br0".to_string()),
|
||||
mtu: 1500,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let content = backend.generate_tap_network("test-vm", "tap0", &config);
|
||||
|
||||
assert!(content.contains("[Match]"));
|
||||
assert!(content.contains("Name=tap0"));
|
||||
assert!(content.contains("Bridge=br0"));
|
||||
assert!(content.contains("MTUBytes=1500"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_generate_bridge_netdev() {
|
||||
let backend = NetworkdBackend {
|
||||
config_dir: PathBuf::from("/tmp/test"),
|
||||
use_vhost: false,
|
||||
interfaces: Arc::new(Mutex::new(HashMap::new())),
|
||||
config_files: Arc::new(Mutex::new(HashMap::new())),
|
||||
};
|
||||
|
||||
let content = backend.generate_bridge_netdev("volt0");
|
||||
|
||||
assert!(content.contains("[NetDev]"));
|
||||
assert!(content.contains("Name=volt0"));
|
||||
assert!(content.contains("Kind=bridge"));
|
||||
assert!(content.contains("STP=no"));
|
||||
}
|
||||
}
|
||||
637
vmm/src/net/vhost.rs
Normal file
637
vmm/src/net/vhost.rs
Normal file
@@ -0,0 +1,637 @@
|
||||
//! vhost-net acceleration for Volt VMM
|
||||
//!
|
||||
//! This module provides kernel-accelerated networking via /dev/vhost-net.
|
||||
//! vhost-net moves packet processing from userspace to the kernel, enabling
|
||||
//! zero-copy TX/RX paths for significantly improved performance.
|
||||
//!
|
||||
//! # Architecture
|
||||
//!
|
||||
//! ```text
|
||||
//! ┌─────────────────────────────────────────────────────────────┐
|
||||
//! │ Guest VM │
|
||||
//! │ ┌────────────────────────────────────────────────────────┐ │
|
||||
//! │ │ virtio-net driver │ │
|
||||
//! │ └──────────────────────────┬─────────────────────────────┘ │
|
||||
//! └─────────────────────────────┼───────────────────────────────┘
|
||||
//! │ virtqueue (shared memory)
|
||||
//! ┌─────────────────────────────┼───────────────────────────────┐
|
||||
//! │ Host Kernel │ │
|
||||
//! │ ┌──────────────────────────▼─────────────────────────────┐ │
|
||||
//! │ │ vhost-net │ │
|
||||
//! │ │ ┌────────────┐ ┌────────────┐ ┌────────────┐ │ │
|
||||
//! │ │ │ TX Worker │ │ RX Worker │ │ IRQ Inj. │ │ │
|
||||
//! │ │ └──────┬─────┘ └──────┬─────┘ └────────────┘ │ │
|
||||
//! │ └─────────┼─────────────────┼────────────────────────────┘ │
|
||||
//! │ │ │ │
|
||||
//! │ ┌─────────▼─────────────────▼────────────────────────────┐ │
|
||||
//! │ │ TAP device │ │
|
||||
//! │ └─────────────────────────────────────────────────────────┘ │
|
||||
//! └─────────────────────────────────────────────────────────────┘
|
||||
//! ```
|
||||
//!
|
||||
//! # Zero-Copy Path
|
||||
//!
|
||||
//! When vhost-net is enabled:
|
||||
//! 1. Guest writes to virtqueue (TX) → kernel processes directly
|
||||
//! 2. TAP receives packets → kernel injects into virtqueue (RX)
|
||||
//! 3. No userspace copies or context switches for packet handling
|
||||
|
||||
use super::{
|
||||
get_ifindex, open_tap, set_interface_up, InterfaceType, MacAddress, NetError, NetworkBackend,
|
||||
NetworkConfig, NetworkInterface, Result,
|
||||
};
|
||||
use std::collections::HashMap;
|
||||
use std::fs::OpenOptions;
|
||||
use std::os::unix::io::{IntoRawFd, RawFd};
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
/// vhost-net feature flags
|
||||
pub mod vhost_features {
|
||||
/// Mergeable receive buffers
|
||||
pub const VHOST_NET_F_VIRTIO_NET_HDR: u64 = 1 << 27;
|
||||
/// Backend supports eventfd for kick/call
|
||||
pub const VHOST_F_LOG_ALL: u64 = 1 << 26;
|
||||
}
|
||||
|
||||
/// vhost ioctl command definitions
|
||||
mod vhost_ioctl {
|
||||
use nix::ioctl_read;
|
||||
use nix::ioctl_write_int;
|
||||
use nix::ioctl_write_ptr;
|
||||
|
||||
// From linux/vhost.h
|
||||
const VHOST_VIRTIO: u8 = 0xAF;
|
||||
|
||||
// Basic ioctls
|
||||
ioctl_write_int!(vhost_set_owner, VHOST_VIRTIO, 0x01);
|
||||
ioctl_write_int!(vhost_reset_owner, VHOST_VIRTIO, 0x02);
|
||||
ioctl_write_ptr!(vhost_set_mem_table, VHOST_VIRTIO, 0x03, VhostMemory);
|
||||
ioctl_write_ptr!(vhost_set_log_base, VHOST_VIRTIO, 0x04, u64);
|
||||
ioctl_write_ptr!(vhost_set_log_fd, VHOST_VIRTIO, 0x07, i32);
|
||||
ioctl_write_ptr!(vhost_set_vring_num, VHOST_VIRTIO, 0x10, VhostVringState);
|
||||
ioctl_write_ptr!(vhost_set_vring_base, VHOST_VIRTIO, 0x12, VhostVringState);
|
||||
ioctl_read!(vhost_get_vring_base, VHOST_VIRTIO, 0x12, VhostVringState);
|
||||
ioctl_write_ptr!(vhost_set_vring_addr, VHOST_VIRTIO, 0x11, VhostVringAddr);
|
||||
ioctl_write_ptr!(vhost_set_vring_kick, VHOST_VIRTIO, 0x20, VhostVringFile);
|
||||
ioctl_write_ptr!(vhost_set_vring_call, VHOST_VIRTIO, 0x21, VhostVringFile);
|
||||
ioctl_write_ptr!(vhost_set_vring_err, VHOST_VIRTIO, 0x22, VhostVringFile);
|
||||
|
||||
// vhost-net specific ioctls
|
||||
ioctl_write_ptr!(vhost_net_set_backend, VHOST_VIRTIO, 0x30, VhostVringFile);
|
||||
|
||||
// Feature ioctls
|
||||
ioctl_read!(vhost_get_features, VHOST_VIRTIO, 0x00, u64);
|
||||
ioctl_write_ptr!(vhost_set_features, VHOST_VIRTIO, 0x00, u64);
|
||||
|
||||
/// Memory region for vhost
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct VhostMemoryRegion {
|
||||
pub guest_phys_addr: u64,
|
||||
pub memory_size: u64,
|
||||
pub userspace_addr: u64,
|
||||
pub mmap_offset: u64,
|
||||
}
|
||||
|
||||
/// Memory table for vhost
|
||||
#[repr(C)]
|
||||
#[derive(Debug)]
|
||||
pub struct VhostMemory {
|
||||
pub nregions: u32,
|
||||
pub padding: u32,
|
||||
pub regions: [VhostMemoryRegion; 64],
|
||||
}
|
||||
|
||||
/// Vring state (index + num)
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Clone, Copy, Default)]
|
||||
pub struct VhostVringState {
|
||||
pub index: u32,
|
||||
pub num: u32,
|
||||
}
|
||||
|
||||
/// Vring addresses
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Clone, Copy, Default)]
|
||||
pub struct VhostVringAddr {
|
||||
pub index: u32,
|
||||
pub flags: u32,
|
||||
pub desc_user_addr: u64,
|
||||
pub used_user_addr: u64,
|
||||
pub avail_user_addr: u64,
|
||||
pub log_guest_addr: u64,
|
||||
}
|
||||
|
||||
/// Vring file descriptor
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Clone, Copy, Default)]
|
||||
pub struct VhostVringFile {
|
||||
pub index: u32,
|
||||
pub fd: i32,
|
||||
}
|
||||
}
|
||||
|
||||
pub use vhost_ioctl::{VhostMemory, VhostMemoryRegion, VhostVringAddr, VhostVringFile, VhostVringState};
|
||||
|
||||
/// Open /dev/vhost-net device
|
||||
pub fn open_vhost_net() -> Result<RawFd> {
|
||||
let file = OpenOptions::new()
|
||||
.read(true)
|
||||
.write(true)
|
||||
.open("/dev/vhost-net")
|
||||
.map_err(|e| {
|
||||
if e.kind() == std::io::ErrorKind::PermissionDenied {
|
||||
NetError::PermissionDenied(
|
||||
"Cannot open /dev/vhost-net - check permissions or run as root".to_string(),
|
||||
)
|
||||
} else if e.kind() == std::io::ErrorKind::NotFound {
|
||||
NetError::VhostNet("vhost-net kernel module not loaded".to_string())
|
||||
} else {
|
||||
NetError::VhostNet(format!("Failed to open /dev/vhost-net: {}", e))
|
||||
}
|
||||
})?;
|
||||
|
||||
Ok(file.into_raw_fd())
|
||||
}
|
||||
|
||||
/// Check if vhost-net is available on this system
|
||||
pub fn is_vhost_available() -> bool {
|
||||
std::path::Path::new("/dev/vhost-net").exists()
|
||||
}
|
||||
|
||||
/// vhost-net accelerated network backend
|
||||
pub struct VhostNetBackend {
|
||||
/// Track created interfaces for cleanup
|
||||
interfaces: Arc<Mutex<HashMap<String, VhostInterface>>>,
|
||||
}
|
||||
|
||||
/// Tracked vhost interface
|
||||
struct VhostInterface {
|
||||
/// TAP interface name
|
||||
tap_name: String,
|
||||
/// TAP file descriptor
|
||||
tap_fd: RawFd,
|
||||
/// vhost-net file descriptor
|
||||
vhost_fd: RawFd,
|
||||
/// Eventfds for kick/call (per queue)
|
||||
kick_fds: Vec<RawFd>,
|
||||
call_fds: Vec<RawFd>,
|
||||
}
|
||||
|
||||
impl VhostNetBackend {
|
||||
/// Create a new vhost-net backend
|
||||
pub fn new() -> Result<Self> {
|
||||
// Verify vhost-net is available
|
||||
if !is_vhost_available() {
|
||||
return Err(NetError::VhostNet(
|
||||
"vhost-net not available - load kernel module with 'modprobe vhost-net'"
|
||||
.to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
interfaces: Arc::new(Mutex::new(HashMap::new())),
|
||||
})
|
||||
}
|
||||
|
||||
/// Set up vhost-net for a TAP device
|
||||
pub fn setup_vhost(
|
||||
&self,
|
||||
vhost_fd: RawFd,
|
||||
tap_fd: RawFd,
|
||||
mem_regions: &[VhostMemoryRegion],
|
||||
num_queues: usize,
|
||||
) -> Result<VhostSetup> {
|
||||
// 1. Set owner
|
||||
unsafe {
|
||||
vhost_ioctl::vhost_set_owner(vhost_fd, 0)
|
||||
.map_err(|e| NetError::VhostNet(format!("VHOST_SET_OWNER failed: {}", e)))?;
|
||||
}
|
||||
|
||||
// 2. Get and set features
|
||||
let mut features: u64 = 0;
|
||||
unsafe {
|
||||
vhost_ioctl::vhost_get_features(vhost_fd, &mut features)
|
||||
.map_err(|e| NetError::VhostNet(format!("VHOST_GET_FEATURES failed: {}", e)))?;
|
||||
}
|
||||
|
||||
// Enable desired features
|
||||
let enabled_features = features & (vhost_features::VHOST_NET_F_VIRTIO_NET_HDR);
|
||||
unsafe {
|
||||
vhost_ioctl::vhost_set_features(vhost_fd, &enabled_features)
|
||||
.map_err(|e| NetError::VhostNet(format!("VHOST_SET_FEATURES failed: {}", e)))?;
|
||||
}
|
||||
|
||||
// 3. Set up memory table
|
||||
let mut mem_table = VhostMemory {
|
||||
nregions: mem_regions.len() as u32,
|
||||
padding: 0,
|
||||
regions: [VhostMemoryRegion {
|
||||
guest_phys_addr: 0,
|
||||
memory_size: 0,
|
||||
userspace_addr: 0,
|
||||
mmap_offset: 0,
|
||||
}; 64],
|
||||
};
|
||||
|
||||
for (i, region) in mem_regions.iter().enumerate() {
|
||||
if i >= 64 {
|
||||
break;
|
||||
}
|
||||
mem_table.regions[i] = *region;
|
||||
}
|
||||
|
||||
unsafe {
|
||||
vhost_ioctl::vhost_set_mem_table(vhost_fd, &mem_table)
|
||||
.map_err(|e| NetError::VhostNet(format!("VHOST_SET_MEM_TABLE failed: {}", e)))?;
|
||||
}
|
||||
|
||||
// 4. Create eventfds for each queue
|
||||
let mut kick_fds = Vec::with_capacity(num_queues);
|
||||
let mut call_fds = Vec::with_capacity(num_queues);
|
||||
|
||||
for _ in 0..num_queues {
|
||||
let kick_fd = create_eventfd()?;
|
||||
let call_fd = create_eventfd()?;
|
||||
kick_fds.push(kick_fd);
|
||||
call_fds.push(call_fd);
|
||||
}
|
||||
|
||||
// 5. Set backend (TAP device) for each queue
|
||||
for i in 0..num_queues {
|
||||
let backend = VhostVringFile {
|
||||
index: i as u32,
|
||||
fd: tap_fd,
|
||||
};
|
||||
|
||||
unsafe {
|
||||
vhost_ioctl::vhost_net_set_backend(vhost_fd, &backend).map_err(|e| {
|
||||
NetError::VhostNet(format!("VHOST_NET_SET_BACKEND failed: {}", e))
|
||||
})?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(VhostSetup {
|
||||
features: enabled_features,
|
||||
kick_fds,
|
||||
call_fds,
|
||||
})
|
||||
}
|
||||
|
||||
/// Configure a vring (virtqueue)
|
||||
pub fn configure_vring(
|
||||
&self,
|
||||
vhost_fd: RawFd,
|
||||
vring_index: u32,
|
||||
vring_config: &VringConfig,
|
||||
) -> Result<()> {
|
||||
// Set vring num (size)
|
||||
let state = VhostVringState {
|
||||
index: vring_index,
|
||||
num: vring_config.size,
|
||||
};
|
||||
unsafe {
|
||||
vhost_ioctl::vhost_set_vring_num(vhost_fd, &state)
|
||||
.map_err(|e| NetError::VhostNet(format!("VHOST_SET_VRING_NUM failed: {}", e)))?;
|
||||
}
|
||||
|
||||
// Set vring base
|
||||
let base = VhostVringState {
|
||||
index: vring_index,
|
||||
num: 0, // Start from 0
|
||||
};
|
||||
unsafe {
|
||||
vhost_ioctl::vhost_set_vring_base(vhost_fd, &base)
|
||||
.map_err(|e| NetError::VhostNet(format!("VHOST_SET_VRING_BASE failed: {}", e)))?;
|
||||
}
|
||||
|
||||
// Set vring addresses
|
||||
let addr = VhostVringAddr {
|
||||
index: vring_index,
|
||||
flags: 0,
|
||||
desc_user_addr: vring_config.desc_addr,
|
||||
used_user_addr: vring_config.used_addr,
|
||||
avail_user_addr: vring_config.avail_addr,
|
||||
log_guest_addr: 0,
|
||||
};
|
||||
unsafe {
|
||||
vhost_ioctl::vhost_set_vring_addr(vhost_fd, &addr)
|
||||
.map_err(|e| NetError::VhostNet(format!("VHOST_SET_VRING_ADDR failed: {}", e)))?;
|
||||
}
|
||||
|
||||
// Set kick fd
|
||||
let kick = VhostVringFile {
|
||||
index: vring_index,
|
||||
fd: vring_config.kick_fd,
|
||||
};
|
||||
unsafe {
|
||||
vhost_ioctl::vhost_set_vring_kick(vhost_fd, &kick)
|
||||
.map_err(|e| NetError::VhostNet(format!("VHOST_SET_VRING_KICK failed: {}", e)))?;
|
||||
}
|
||||
|
||||
// Set call fd
|
||||
let call = VhostVringFile {
|
||||
index: vring_index,
|
||||
fd: vring_config.call_fd,
|
||||
};
|
||||
unsafe {
|
||||
vhost_ioctl::vhost_set_vring_call(vhost_fd, &call)
|
||||
.map_err(|e| NetError::VhostNet(format!("VHOST_SET_VRING_CALL failed: {}", e)))?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Generate unique interface name for a VM
|
||||
fn generate_iface_name(&self, vm_id: &str) -> String {
|
||||
let short_id: String = vm_id
|
||||
.chars()
|
||||
.filter(|c| c.is_alphanumeric())
|
||||
.take(8)
|
||||
.collect();
|
||||
format!("nfvhost{}", short_id)
|
||||
}
|
||||
}
|
||||
|
||||
impl NetworkBackend for VhostNetBackend {
|
||||
fn create_interface(&self, config: &NetworkConfig) -> Result<NetworkInterface> {
|
||||
let mac = config.mac_address.clone().unwrap_or_else(MacAddress::random);
|
||||
let iface_name = self.generate_iface_name(&config.vm_id);
|
||||
|
||||
// Open TAP device with vnet_hdr enabled (required for vhost-net)
|
||||
let (tap_fd, actual_name) = open_tap(&iface_name, config.multiqueue, true)?;
|
||||
|
||||
// Set interface up
|
||||
set_interface_up(&actual_name)?;
|
||||
|
||||
// Add to bridge if specified
|
||||
if let Some(ref bridge) = config.bridge {
|
||||
super::add_to_bridge(&actual_name, bridge)?;
|
||||
}
|
||||
|
||||
// Get interface index
|
||||
let ifindex = get_ifindex(&actual_name)?;
|
||||
|
||||
// Open vhost-net device
|
||||
let vhost_fd = open_vhost_net()?;
|
||||
|
||||
// Create eventfds for queues
|
||||
let num_queues = if config.multiqueue {
|
||||
config.num_queues as usize * 2 // RX + TX for each queue pair
|
||||
} else {
|
||||
2 // Single RX + TX
|
||||
};
|
||||
|
||||
let mut kick_fds = Vec::with_capacity(num_queues);
|
||||
let mut call_fds = Vec::with_capacity(num_queues);
|
||||
|
||||
for _ in 0..num_queues {
|
||||
kick_fds.push(create_eventfd()?);
|
||||
call_fds.push(create_eventfd()?);
|
||||
}
|
||||
|
||||
// Track interface for cleanup
|
||||
{
|
||||
let mut interfaces = self.interfaces.lock().unwrap();
|
||||
interfaces.insert(
|
||||
config.vm_id.clone(),
|
||||
VhostInterface {
|
||||
tap_name: actual_name.clone(),
|
||||
tap_fd,
|
||||
vhost_fd,
|
||||
kick_fds: kick_fds.clone(),
|
||||
call_fds: call_fds.clone(),
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
// Additional queue fds for multiqueue
|
||||
let queue_fds = if config.multiqueue && config.num_queues > 1 {
|
||||
let mut fds = Vec::new();
|
||||
for _ in 1..config.num_queues {
|
||||
let (qfd, _) = open_tap(&actual_name, true, true)?;
|
||||
fds.push(qfd);
|
||||
}
|
||||
fds
|
||||
} else {
|
||||
Vec::new()
|
||||
};
|
||||
|
||||
Ok(NetworkInterface {
|
||||
name: actual_name,
|
||||
ifindex,
|
||||
fd: tap_fd,
|
||||
mac,
|
||||
iface_type: InterfaceType::TapVhost,
|
||||
bridge: config.bridge.clone(),
|
||||
vhost_fd: Some(vhost_fd),
|
||||
queue_fds,
|
||||
})
|
||||
}
|
||||
|
||||
fn attach_to_vm(&self, iface: &NetworkInterface) -> Result<RawFd> {
|
||||
// Return the vhost fd for direct kernel processing
|
||||
iface.vhost_fd.ok_or_else(|| {
|
||||
NetError::VhostNet("Interface not configured with vhost-net".to_string())
|
||||
})
|
||||
}
|
||||
|
||||
fn as_any(&self) -> &dyn std::any::Any {
|
||||
self
|
||||
}
|
||||
|
||||
fn cleanup(&self, vm_id: &str) -> Result<()> {
|
||||
let mut interfaces = self.interfaces.lock().unwrap();
|
||||
|
||||
if let Some(iface) = interfaces.remove(vm_id) {
|
||||
// Close vhost fd
|
||||
unsafe {
|
||||
libc::close(iface.vhost_fd);
|
||||
}
|
||||
|
||||
// Close TAP fd
|
||||
unsafe {
|
||||
libc::close(iface.tap_fd);
|
||||
}
|
||||
|
||||
// Close eventfds
|
||||
for fd in iface.kick_fds {
|
||||
unsafe {
|
||||
libc::close(fd);
|
||||
}
|
||||
}
|
||||
for fd in iface.call_fds {
|
||||
unsafe {
|
||||
libc::close(fd);
|
||||
}
|
||||
}
|
||||
|
||||
// Delete the TAP interface
|
||||
let _ = std::process::Command::new("ip")
|
||||
.args(["link", "delete", &iface.tap_name])
|
||||
.output();
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn backend_type(&self) -> &'static str {
|
||||
"vhost-net"
|
||||
}
|
||||
|
||||
fn supports_vhost(&self) -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
fn supports_multiqueue(&self) -> bool {
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
/// Result of vhost setup
|
||||
#[derive(Debug)]
|
||||
pub struct VhostSetup {
|
||||
/// Enabled features
|
||||
pub features: u64,
|
||||
/// Kick eventfds (one per queue)
|
||||
pub kick_fds: Vec<RawFd>,
|
||||
/// Call eventfds (one per queue)
|
||||
pub call_fds: Vec<RawFd>,
|
||||
}
|
||||
|
||||
/// Configuration for a single vring
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct VringConfig {
|
||||
/// Ring size (number of descriptors)
|
||||
pub size: u32,
|
||||
/// Descriptor table address (userspace)
|
||||
pub desc_addr: u64,
|
||||
/// Used ring address (userspace)
|
||||
pub used_addr: u64,
|
||||
/// Available ring address (userspace)
|
||||
pub avail_addr: u64,
|
||||
/// Kick eventfd
|
||||
pub kick_fd: RawFd,
|
||||
/// Call eventfd
|
||||
pub call_fd: RawFd,
|
||||
}
|
||||
|
||||
/// Create an eventfd
|
||||
fn create_eventfd() -> Result<RawFd> {
|
||||
let fd = unsafe { libc::eventfd(0, libc::EFD_CLOEXEC | libc::EFD_NONBLOCK) };
|
||||
|
||||
if fd < 0 {
|
||||
return Err(NetError::VhostNet(format!(
|
||||
"eventfd creation failed: {}",
|
||||
std::io::Error::last_os_error()
|
||||
)));
|
||||
}
|
||||
|
||||
Ok(fd)
|
||||
}
|
||||
|
||||
/// Zero-copy TX path helper
|
||||
///
|
||||
/// This struct manages zero-copy transmission when vhost-net is enabled.
|
||||
/// The kernel handles packet transmission directly from guest memory.
|
||||
#[allow(dead_code)]
|
||||
pub struct ZeroCopyTx {
|
||||
vhost_fd: RawFd,
|
||||
kick_fd: RawFd,
|
||||
}
|
||||
|
||||
impl ZeroCopyTx {
|
||||
/// Create a new zero-copy TX handler
|
||||
pub fn new(vhost_fd: RawFd, kick_fd: RawFd) -> Self {
|
||||
Self { vhost_fd, kick_fd }
|
||||
}
|
||||
|
||||
/// Kick the vhost worker to process pending TX buffers
|
||||
pub fn kick(&self) -> Result<()> {
|
||||
let val: u64 = 1;
|
||||
let ret = unsafe {
|
||||
libc::write(
|
||||
self.kick_fd,
|
||||
&val as *const u64 as *const libc::c_void,
|
||||
std::mem::size_of::<u64>(),
|
||||
)
|
||||
};
|
||||
|
||||
if ret < 0 {
|
||||
return Err(NetError::VhostNet(format!(
|
||||
"TX kick failed: {}",
|
||||
std::io::Error::last_os_error()
|
||||
)));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Zero-copy RX path helper
|
||||
///
|
||||
/// Manages zero-copy packet reception when vhost-net is enabled.
|
||||
#[allow(dead_code)]
|
||||
pub struct ZeroCopyRx {
|
||||
vhost_fd: RawFd,
|
||||
call_fd: RawFd,
|
||||
}
|
||||
|
||||
impl ZeroCopyRx {
|
||||
/// Create a new zero-copy RX handler
|
||||
pub fn new(vhost_fd: RawFd, call_fd: RawFd) -> Self {
|
||||
Self { vhost_fd, call_fd }
|
||||
}
|
||||
|
||||
/// Check if there are pending RX completions
|
||||
pub fn poll(&self) -> Result<bool> {
|
||||
let mut val: u64 = 0;
|
||||
let ret = unsafe {
|
||||
libc::read(
|
||||
self.call_fd,
|
||||
&mut val as *mut u64 as *mut libc::c_void,
|
||||
std::mem::size_of::<u64>(),
|
||||
)
|
||||
};
|
||||
|
||||
if ret < 0 {
|
||||
let err = std::io::Error::last_os_error();
|
||||
if err.kind() == std::io::ErrorKind::WouldBlock {
|
||||
return Ok(false);
|
||||
}
|
||||
return Err(NetError::VhostNet(format!("RX poll failed: {}", err)));
|
||||
}
|
||||
|
||||
Ok(val > 0)
|
||||
}
|
||||
|
||||
/// Get the call fd for epoll registration
|
||||
pub fn call_fd(&self) -> RawFd {
|
||||
self.call_fd
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_vhost_available() {
|
||||
// This test just checks the function runs
|
||||
let _ = is_vhost_available();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_vring_config() {
|
||||
let config = VringConfig {
|
||||
size: 256,
|
||||
desc_addr: 0x1000,
|
||||
used_addr: 0x2000,
|
||||
avail_addr: 0x3000,
|
||||
kick_fd: -1,
|
||||
call_fd: -1,
|
||||
};
|
||||
|
||||
assert_eq!(config.size, 256);
|
||||
}
|
||||
}
|
||||
537
vmm/src/pool.rs
Normal file
537
vmm/src/pool.rs
Normal file
@@ -0,0 +1,537 @@
|
||||
//! Pre-Warmed KVM VM Pool
|
||||
//!
|
||||
//! This module provides a pool of pre-created empty KVM VM file descriptors
|
||||
//! to accelerate snapshot restore operations. Creating a KVM VM takes ~24ms
|
||||
//! due to the KVM_CREATE_VM ioctl, TSS setup, IRQ chip creation, and PIT
|
||||
//! initialization. By pre-warming these VMs, we can drop restore time from
|
||||
//! ~30ms to ~1-2ms.
|
||||
//!
|
||||
//! # Architecture
|
||||
//!
|
||||
//! ```text
|
||||
//! ┌─────────────────────────────────────────────────────────────┐
|
||||
//! │ VmPool │
|
||||
//! │ ┌─────────────────────────────────────────────────────────┐│
|
||||
//! │ │ Pool (Arc<Mutex<VecDeque>>) ││
|
||||
//! │ │ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ ││
|
||||
//! │ │ │ Empty VM │ │ Empty VM │ │ Empty VM │ │ Empty VM │ ││
|
||||
//! │ │ │ (TSS+IRQ │ │ (TSS+IRQ │ │ (TSS+IRQ │ │ (TSS+IRQ │ ││
|
||||
//! │ │ │ +PIT) │ │ +PIT) │ │ +PIT) │ │ +PIT) │ ││
|
||||
//! │ │ └──────────┘ └──────────┘ └──────────┘ └──────────┘ ││
|
||||
//! │ └─────────────────────────────────────────────────────────┘│
|
||||
//! │ acquire() → takes VM from pool │
|
||||
//! │ release() → returns VM to pool (for reuse) │
|
||||
//! │ replenish() → background task to refill pool │
|
||||
//! └─────────────────────────────────────────────────────────────┘
|
||||
//! ```
|
||||
//!
|
||||
//! # Usage
|
||||
//!
|
||||
//! ```ignore
|
||||
//! use volt-vmm::pool::VmPool;
|
||||
//!
|
||||
//! // Create pool at startup
|
||||
//! let pool = VmPool::new(4).unwrap();
|
||||
//!
|
||||
//! // On snapshot restore, acquire a pre-warmed VM
|
||||
//! let pre_warmed = pool.acquire().unwrap();
|
||||
//! let vm_fd = pre_warmed.vm_fd;
|
||||
//! let kvm = pre_warmed.kvm;
|
||||
//!
|
||||
//! // VM is already set up with TSS, IRQ chip, and PIT
|
||||
//! // Just need to: register memory, restore vCPU state, etc.
|
||||
//! ```
|
||||
|
||||
use std::collections::VecDeque;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::time::Instant;
|
||||
|
||||
use kvm_bindings::{kvm_pit_config, KVM_PIT_SPEAKER_DUMMY};
|
||||
use kvm_ioctls::{Kvm, VmFd};
|
||||
use parking_lot::Mutex;
|
||||
use tracing::{debug, info, warn};
|
||||
|
||||
/// Default pool size (3-5 VMs is a good balance)
|
||||
#[allow(dead_code)]
|
||||
pub const DEFAULT_POOL_SIZE: usize = 4;
|
||||
|
||||
/// TSS address used for x86_64 VMs
|
||||
const TSS_ADDRESS: u64 = 0xFFFB_D000;
|
||||
|
||||
/// A pre-warmed KVM VM with all base setup complete
|
||||
pub struct PreWarmedVm {
|
||||
/// The KVM system handle (needed for vCPU creation)
|
||||
pub kvm: Kvm,
|
||||
/// The VM file descriptor (with TSS, IRQ chip, and PIT already set up)
|
||||
pub vm_fd: VmFd,
|
||||
/// When this VM was created (for debugging/metrics)
|
||||
pub created_at: Instant,
|
||||
}
|
||||
|
||||
/// Result type for pool operations
|
||||
pub type Result<T> = std::result::Result<T, PoolError>;
|
||||
|
||||
/// Pool operation errors
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum PoolError {
|
||||
#[error("KVM error: {0}")]
|
||||
Kvm(String),
|
||||
|
||||
#[error("Pool is empty and fallback creation failed: {0}")]
|
||||
Exhausted(String),
|
||||
}
|
||||
|
||||
/// Thread-safe pool of pre-warmed KVM VMs
|
||||
pub struct VmPool {
|
||||
/// Pre-warmed VMs ready for use
|
||||
pool: Arc<Mutex<VecDeque<PreWarmedVm>>>,
|
||||
/// Target pool size
|
||||
target_size: usize,
|
||||
/// Statistics: total VMs created
|
||||
total_created: AtomicUsize,
|
||||
/// Statistics: VMs acquired from pool (cache hit)
|
||||
pool_hits: AtomicUsize,
|
||||
/// Statistics: VMs created on-demand due to empty pool (cache miss)
|
||||
pool_misses: AtomicUsize,
|
||||
}
|
||||
|
||||
impl VmPool {
|
||||
/// Create a new VM pool with `pool_size` pre-warmed VMs.
|
||||
///
|
||||
/// This creates the VMs synchronously during initialization.
|
||||
/// Each VM has TSS, IRQ chip, and PIT already configured.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `pool_size` - Number of VMs to pre-warm (0 = disabled, default = 4)
|
||||
///
|
||||
/// # Returns
|
||||
/// A new VmPool or an error if KVM initialization fails.
|
||||
pub fn new(pool_size: usize) -> Result<Self> {
|
||||
let start = Instant::now();
|
||||
let mut vms = VecDeque::with_capacity(pool_size);
|
||||
|
||||
for i in 0..pool_size {
|
||||
let vm = Self::create_empty_vm()
|
||||
.map_err(|e| PoolError::Kvm(format!("Failed to create pre-warmed VM {}: {}", i, e)))?;
|
||||
vms.push_back(vm);
|
||||
}
|
||||
|
||||
let elapsed = start.elapsed();
|
||||
info!(
|
||||
"VM pool initialized: {} VMs pre-warmed in {:.2}ms ({:.2}ms per VM)",
|
||||
pool_size,
|
||||
elapsed.as_secs_f64() * 1000.0,
|
||||
if pool_size > 0 { elapsed.as_secs_f64() * 1000.0 / pool_size as f64 } else { 0.0 }
|
||||
);
|
||||
|
||||
Ok(Self {
|
||||
pool: Arc::new(Mutex::new(vms)),
|
||||
target_size: pool_size,
|
||||
total_created: AtomicUsize::new(pool_size),
|
||||
pool_hits: AtomicUsize::new(0),
|
||||
pool_misses: AtomicUsize::new(0),
|
||||
})
|
||||
}
|
||||
|
||||
/// Create a new pre-warmed VM with TSS, IRQ chip, and PIT configured.
|
||||
///
|
||||
/// This is the expensive operation (~24ms) that we want to avoid
|
||||
/// during snapshot restore.
|
||||
fn create_empty_vm() -> std::result::Result<PreWarmedVm, String> {
|
||||
let start = Instant::now();
|
||||
|
||||
// Open /dev/kvm
|
||||
let kvm = Kvm::new().map_err(|e| format!("open /dev/kvm: {}", e))?;
|
||||
|
||||
// Create VM
|
||||
let vm_fd = kvm.create_vm().map_err(|e| format!("create_vm: {}", e))?;
|
||||
|
||||
// Set TSS address (required for x86_64)
|
||||
vm_fd
|
||||
.set_tss_address(TSS_ADDRESS as usize)
|
||||
.map_err(|e| format!("set_tss_address: {}", e))?;
|
||||
|
||||
// Create IRQ chip (8259 PIC + IOAPIC)
|
||||
vm_fd
|
||||
.create_irq_chip()
|
||||
.map_err(|e| format!("create_irq_chip: {}", e))?;
|
||||
|
||||
// Create PIT (8254 timer)
|
||||
let pit_config = kvm_pit_config {
|
||||
flags: KVM_PIT_SPEAKER_DUMMY,
|
||||
..Default::default()
|
||||
};
|
||||
vm_fd
|
||||
.create_pit2(pit_config)
|
||||
.map_err(|e| format!("create_pit2: {}", e))?;
|
||||
|
||||
let elapsed = start.elapsed();
|
||||
debug!(
|
||||
"Pre-warmed VM created in {:.2}ms",
|
||||
elapsed.as_secs_f64() * 1000.0
|
||||
);
|
||||
|
||||
Ok(PreWarmedVm {
|
||||
kvm,
|
||||
vm_fd,
|
||||
created_at: Instant::now(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Acquire a pre-warmed VM from the pool.
|
||||
///
|
||||
/// If the pool is empty, falls back to creating a new VM on-demand
|
||||
/// (with a warning log since this defeats the purpose of the pool).
|
||||
///
|
||||
/// # Returns
|
||||
/// A `PreWarmedVm` ready for memory registration and vCPU creation.
|
||||
pub fn acquire(&self) -> Result<PreWarmedVm> {
|
||||
let start = Instant::now();
|
||||
|
||||
// Try to get a VM from the pool
|
||||
let vm = {
|
||||
let mut pool = self.pool.lock();
|
||||
pool.pop_front()
|
||||
};
|
||||
|
||||
match vm {
|
||||
Some(pre_warmed) => {
|
||||
self.pool_hits.fetch_add(1, Ordering::Relaxed);
|
||||
let age_ms = pre_warmed.created_at.elapsed().as_secs_f64() * 1000.0;
|
||||
let acquire_ms = start.elapsed().as_secs_f64() * 1000.0;
|
||||
info!(
|
||||
"VM acquired from pool in {:.3}ms (VM age: {:.1}ms, pool size: {})",
|
||||
acquire_ms,
|
||||
age_ms,
|
||||
self.pool.lock().len()
|
||||
);
|
||||
Ok(pre_warmed)
|
||||
}
|
||||
None => {
|
||||
// Pool is empty — create a new VM on demand
|
||||
self.pool_misses.fetch_add(1, Ordering::Relaxed);
|
||||
warn!("VM pool exhausted, creating VM on-demand (slow path)");
|
||||
|
||||
let vm = Self::create_empty_vm()
|
||||
.map_err(|e| PoolError::Exhausted(e))?;
|
||||
self.total_created.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
let elapsed = start.elapsed();
|
||||
warn!(
|
||||
"VM created fresh in {:.2}ms (pool miss)",
|
||||
elapsed.as_secs_f64() * 1000.0
|
||||
);
|
||||
Ok(vm)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Release a VM back to the pool for reuse.
|
||||
///
|
||||
/// This is called after a VM shuts down to allow reuse of the
|
||||
/// KVM VM file descriptor. Note that the VM state must be reset
|
||||
/// (memory unmapped, vCPUs destroyed) before reuse.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `vm` - The pre-warmed VM to return to the pool
|
||||
///
|
||||
/// # Note
|
||||
/// Currently, released VMs are NOT reused because KVM VMs cannot
|
||||
/// be cleanly reset without recreating them. This method exists
|
||||
/// for future optimization where we might track and reuse VMs
|
||||
/// with proper cleanup.
|
||||
pub fn release(&self, _vm: PreWarmedVm) {
|
||||
// For now, we don't actually reuse released VMs because:
|
||||
// 1. Memory regions need to be unregistered
|
||||
// 2. vCPUs need to be destroyed
|
||||
// 3. IRQ chip and PIT state may be modified
|
||||
//
|
||||
// Instead, we just let the VM drop and replenish the pool
|
||||
// with fresh VMs. A future optimization could implement
|
||||
// proper VM reset/cleanup.
|
||||
debug!("VM released (dropped, not reused — replenish will create fresh VMs)");
|
||||
}
|
||||
|
||||
/// Replenish the pool to the target size.
|
||||
///
|
||||
/// This is designed to be called from a background thread/task
|
||||
/// to keep the pool filled after VMs are acquired.
|
||||
///
|
||||
/// # Returns
|
||||
/// Number of VMs created.
|
||||
pub fn replenish(&self) -> Result<usize> {
|
||||
let start = Instant::now();
|
||||
let mut created = 0;
|
||||
|
||||
loop {
|
||||
// Check if we need to create more VMs
|
||||
let current_size = self.pool.lock().len();
|
||||
if current_size >= self.target_size {
|
||||
break;
|
||||
}
|
||||
|
||||
// Create a new VM
|
||||
let vm = Self::create_empty_vm()
|
||||
.map_err(|e| PoolError::Kvm(format!("replenish failed: {}", e)))?;
|
||||
|
||||
// Add to pool
|
||||
self.pool.lock().push_back(vm);
|
||||
self.total_created.fetch_add(1, Ordering::Relaxed);
|
||||
created += 1;
|
||||
}
|
||||
|
||||
if created > 0 {
|
||||
let elapsed = start.elapsed();
|
||||
info!(
|
||||
"Pool replenished: {} VMs created in {:.2}ms (pool size: {})",
|
||||
created,
|
||||
elapsed.as_secs_f64() * 1000.0,
|
||||
self.pool.lock().len()
|
||||
);
|
||||
}
|
||||
|
||||
Ok(created)
|
||||
}
|
||||
|
||||
/// Get current pool size.
|
||||
pub fn size(&self) -> usize {
|
||||
self.pool.lock().len()
|
||||
}
|
||||
|
||||
/// Get target pool size.
|
||||
pub fn target_size(&self) -> usize {
|
||||
self.target_size
|
||||
}
|
||||
|
||||
/// Get pool statistics.
|
||||
pub fn stats(&self) -> PoolStats {
|
||||
PoolStats {
|
||||
current_size: self.pool.lock().len(),
|
||||
target_size: self.target_size,
|
||||
total_created: self.total_created.load(Ordering::Relaxed),
|
||||
pool_hits: self.pool_hits.load(Ordering::Relaxed),
|
||||
pool_misses: self.pool_misses.load(Ordering::Relaxed),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Pool statistics for monitoring
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct PoolStats {
|
||||
/// Current number of VMs in the pool
|
||||
pub current_size: usize,
|
||||
/// Target pool size
|
||||
pub target_size: usize,
|
||||
/// Total VMs ever created
|
||||
pub total_created: usize,
|
||||
/// Number of successful pool acquisitions (cache hits)
|
||||
pub pool_hits: usize,
|
||||
/// Number of on-demand VM creations (cache misses)
|
||||
pub pool_misses: usize,
|
||||
}
|
||||
|
||||
impl PoolStats {
|
||||
/// Calculate hit rate as a percentage
|
||||
pub fn hit_rate(&self) -> f64 {
|
||||
let total = self.pool_hits + self.pool_misses;
|
||||
if total == 0 {
|
||||
100.0
|
||||
} else {
|
||||
(self.pool_hits as f64 / total as f64) * 100.0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for PoolStats {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"VmPool {{ size: {}/{}, created: {}, hits: {}, misses: {}, hit_rate: {:.1}% }}",
|
||||
self.current_size,
|
||||
self.target_size,
|
||||
self.total_created,
|
||||
self.pool_hits,
|
||||
self.pool_misses,
|
||||
self.hit_rate()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Tests
|
||||
// ============================================================================
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::thread;
|
||||
|
||||
/// Test that pool creation works
|
||||
#[test]
|
||||
fn test_pool_creation() {
|
||||
// Skip if KVM is not available
|
||||
if Kvm::new().is_err() {
|
||||
eprintln!("Skipping test_pool_creation: KVM not available");
|
||||
return;
|
||||
}
|
||||
|
||||
let pool = VmPool::new(2).expect("Failed to create pool");
|
||||
assert_eq!(pool.size(), 2);
|
||||
assert_eq!(pool.target_size(), 2);
|
||||
|
||||
let stats = pool.stats();
|
||||
assert_eq!(stats.total_created, 2);
|
||||
assert_eq!(stats.pool_hits, 0);
|
||||
assert_eq!(stats.pool_misses, 0);
|
||||
}
|
||||
|
||||
/// Test acquire/release cycle
|
||||
#[test]
|
||||
fn test_acquire_release_cycle() {
|
||||
if Kvm::new().is_err() {
|
||||
eprintln!("Skipping test_acquire_release_cycle: KVM not available");
|
||||
return;
|
||||
}
|
||||
|
||||
let pool = VmPool::new(3).expect("Failed to create pool");
|
||||
assert_eq!(pool.size(), 3);
|
||||
|
||||
// Acquire a VM
|
||||
let vm = pool.acquire().expect("Failed to acquire VM");
|
||||
assert_eq!(pool.size(), 2);
|
||||
|
||||
// Verify the VM has a valid fd
|
||||
use std::os::unix::io::AsRawFd;
|
||||
assert!(vm.vm_fd.as_raw_fd() >= 0);
|
||||
|
||||
// Release it (currently just drops)
|
||||
pool.release(vm);
|
||||
// Size doesn't change because release currently drops the VM
|
||||
assert_eq!(pool.size(), 2);
|
||||
|
||||
let stats = pool.stats();
|
||||
assert_eq!(stats.pool_hits, 1);
|
||||
assert_eq!(stats.pool_misses, 0);
|
||||
}
|
||||
|
||||
/// Test pool exhaustion fallback
|
||||
#[test]
|
||||
fn test_pool_exhaustion_fallback() {
|
||||
if Kvm::new().is_err() {
|
||||
eprintln!("Skipping test_pool_exhaustion_fallback: KVM not available");
|
||||
return;
|
||||
}
|
||||
|
||||
let pool = VmPool::new(1).expect("Failed to create pool");
|
||||
|
||||
// First acquire: from pool
|
||||
let _vm1 = pool.acquire().expect("Failed to acquire VM 1");
|
||||
assert_eq!(pool.size(), 0);
|
||||
|
||||
// Second acquire: on-demand (pool empty)
|
||||
let _vm2 = pool.acquire().expect("Failed to acquire VM 2");
|
||||
|
||||
let stats = pool.stats();
|
||||
assert_eq!(stats.pool_hits, 1);
|
||||
assert_eq!(stats.pool_misses, 1);
|
||||
assert_eq!(stats.total_created, 2);
|
||||
}
|
||||
|
||||
/// Test replenish
|
||||
#[test]
|
||||
fn test_replenish() {
|
||||
if Kvm::new().is_err() {
|
||||
eprintln!("Skipping test_replenish: KVM not available");
|
||||
return;
|
||||
}
|
||||
|
||||
let pool = VmPool::new(2).expect("Failed to create pool");
|
||||
|
||||
// Drain the pool
|
||||
let _vm1 = pool.acquire().unwrap();
|
||||
let _vm2 = pool.acquire().unwrap();
|
||||
assert_eq!(pool.size(), 0);
|
||||
|
||||
// Replenish
|
||||
let created = pool.replenish().expect("Failed to replenish");
|
||||
assert_eq!(created, 2);
|
||||
assert_eq!(pool.size(), 2);
|
||||
}
|
||||
|
||||
/// Test concurrent access
|
||||
#[test]
|
||||
fn test_concurrent_access() {
|
||||
if Kvm::new().is_err() {
|
||||
eprintln!("Skipping test_concurrent_access: KVM not available");
|
||||
return;
|
||||
}
|
||||
|
||||
let pool = Arc::new(VmPool::new(4).expect("Failed to create pool"));
|
||||
let mut handles = vec![];
|
||||
|
||||
// Spawn 4 threads that each acquire and then drop a VM
|
||||
for _ in 0..4 {
|
||||
let pool_clone = Arc::clone(&pool);
|
||||
let handle = thread::spawn(move || {
|
||||
let _vm = pool_clone.acquire().expect("Failed to acquire VM");
|
||||
// Hold VM briefly
|
||||
thread::sleep(std::time::Duration::from_millis(10));
|
||||
// VM drops here
|
||||
});
|
||||
handles.push(handle);
|
||||
}
|
||||
|
||||
// Wait for all threads
|
||||
for handle in handles {
|
||||
handle.join().unwrap();
|
||||
}
|
||||
|
||||
let stats = pool.stats();
|
||||
// Should have had 4 hits (initial pool size was 4)
|
||||
// but depending on timing, some might be misses
|
||||
assert!(stats.pool_hits + stats.pool_misses == 4);
|
||||
}
|
||||
|
||||
/// Test zero-size pool (disabled)
|
||||
#[test]
|
||||
fn test_zero_size_pool() {
|
||||
if Kvm::new().is_err() {
|
||||
eprintln!("Skipping test_zero_size_pool: KVM not available");
|
||||
return;
|
||||
}
|
||||
|
||||
let pool = VmPool::new(0).expect("Failed to create pool");
|
||||
assert_eq!(pool.size(), 0);
|
||||
|
||||
// Acquire should still work (creates on demand)
|
||||
let _vm = pool.acquire().expect("Failed to acquire VM");
|
||||
|
||||
let stats = pool.stats();
|
||||
assert_eq!(stats.pool_hits, 0);
|
||||
assert_eq!(stats.pool_misses, 1);
|
||||
}
|
||||
|
||||
/// Test stats hit rate calculation
|
||||
#[test]
|
||||
fn test_stats_hit_rate() {
|
||||
let stats = PoolStats {
|
||||
current_size: 2,
|
||||
target_size: 4,
|
||||
total_created: 6,
|
||||
pool_hits: 3,
|
||||
pool_misses: 1,
|
||||
};
|
||||
assert!((stats.hit_rate() - 75.0).abs() < 0.1);
|
||||
|
||||
// Zero total should return 100%
|
||||
let empty_stats = PoolStats {
|
||||
current_size: 4,
|
||||
target_size: 4,
|
||||
total_created: 4,
|
||||
pool_hits: 0,
|
||||
pool_misses: 0,
|
||||
};
|
||||
assert!((empty_stats.hit_rate() - 100.0).abs() < 0.1);
|
||||
}
|
||||
}
|
||||
206
vmm/src/security/capabilities.rs
Normal file
206
vmm/src/security/capabilities.rs
Normal file
@@ -0,0 +1,206 @@
|
||||
//! Linux capability dropping for Volt VMM
|
||||
//!
|
||||
//! After the VMM has completed privileged setup (opening /dev/kvm, /dev/net/tun,
|
||||
//! binding API sockets), we drop all capabilities to minimize the impact of
|
||||
//! any future process compromise.
|
||||
//!
|
||||
//! This is a critical security layer — even if an attacker achieves code execution
|
||||
//! in the VMM process, they cannot escalate privileges.
|
||||
|
||||
use tracing::{debug, info, warn};
|
||||
|
||||
use super::SecurityError;
|
||||
|
||||
/// prctl constants not exposed by libc in all versions
|
||||
const PR_SET_NO_NEW_PRIVS: libc::c_int = 38;
|
||||
const PR_CAP_AMBIENT: libc::c_int = 47;
|
||||
const PR_CAP_AMBIENT_CLEAR_ALL: libc::c_ulong = 4;
|
||||
|
||||
/// Maximum capability number to iterate over.
|
||||
/// CAP_LAST_CAP is typically 40-41 on modern kernels; we go to 63 for safety.
|
||||
const CAP_LAST_CAP: u32 = 63;
|
||||
|
||||
/// Drop all Linux capabilities from the current thread/process.
|
||||
///
|
||||
/// This function:
|
||||
/// 1. Sets `PR_SET_NO_NEW_PRIVS` to prevent privilege escalation via execve
|
||||
/// 2. Clears all ambient capabilities
|
||||
/// 3. Drops all permitted and effective capabilities
|
||||
///
|
||||
/// # Safety
|
||||
///
|
||||
/// This permanently reduces the process's privileges. Must be called only after
|
||||
/// all privileged operations (opening /dev/kvm, /dev/net/tun, binding sockets)
|
||||
/// are complete.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns `SecurityError::CapabilityDrop` if any prctl/capset call fails.
|
||||
pub fn drop_capabilities() -> Result<(), SecurityError> {
|
||||
info!("Dropping Linux capabilities");
|
||||
|
||||
// Step 1: Set PR_SET_NO_NEW_PRIVS
|
||||
// This prevents the process from gaining new privileges via execve.
|
||||
// Required by Landlock, and good practice regardless.
|
||||
set_no_new_privs()?;
|
||||
|
||||
// Step 2: Clear all ambient capabilities
|
||||
clear_ambient_capabilities()?;
|
||||
|
||||
// Step 3: Drop all bounding set capabilities
|
||||
drop_bounding_set()?;
|
||||
|
||||
// Step 4: Clear permitted and effective capability sets
|
||||
clear_capability_sets()?;
|
||||
|
||||
info!("All capabilities dropped successfully");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Set PR_SET_NO_NEW_PRIVS to prevent privilege escalation.
|
||||
pub(crate) fn set_no_new_privs() -> Result<(), SecurityError> {
|
||||
let ret = unsafe { libc::prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) };
|
||||
if ret != 0 {
|
||||
let err = std::io::Error::last_os_error();
|
||||
return Err(SecurityError::NoNewPrivs(err.to_string()));
|
||||
}
|
||||
debug!("PR_SET_NO_NEW_PRIVS set");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Clear all ambient capabilities.
|
||||
fn clear_ambient_capabilities() -> Result<(), SecurityError> {
|
||||
let ret = unsafe {
|
||||
libc::prctl(
|
||||
PR_CAP_AMBIENT,
|
||||
PR_CAP_AMBIENT_CLEAR_ALL as libc::c_ulong,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
)
|
||||
};
|
||||
if ret != 0 {
|
||||
let err = std::io::Error::last_os_error();
|
||||
// EINVAL means no ambient caps to clear (older kernel), which is fine
|
||||
if err.raw_os_error() != Some(libc::EINVAL) {
|
||||
return Err(SecurityError::CapabilityDrop(format!(
|
||||
"Failed to clear ambient capabilities: {}",
|
||||
err
|
||||
)));
|
||||
}
|
||||
debug!("Ambient capability clearing returned EINVAL (not supported or none to clear)");
|
||||
} else {
|
||||
debug!("Ambient capabilities cleared");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Drop all capabilities from the bounding set.
|
||||
fn drop_bounding_set() -> Result<(), SecurityError> {
|
||||
for cap in 0..=CAP_LAST_CAP {
|
||||
let ret = unsafe { libc::prctl(libc::PR_CAPBSET_DROP, cap as libc::c_ulong, 0, 0, 0) };
|
||||
if ret != 0 {
|
||||
let err = std::io::Error::last_os_error();
|
||||
// EINVAL means this capability number doesn't exist, which is expected
|
||||
// when we iterate beyond the kernel's last cap
|
||||
if err.raw_os_error() == Some(libc::EINVAL) {
|
||||
break;
|
||||
}
|
||||
// EPERM means we don't have CAP_SETPCAP, which is expected in some
|
||||
// environments. We'll still clear the capability sets below.
|
||||
if err.raw_os_error() == Some(libc::EPERM) {
|
||||
debug!(
|
||||
"Cannot drop bounding cap {} (EPERM) - continuing",
|
||||
cap
|
||||
);
|
||||
continue;
|
||||
}
|
||||
return Err(SecurityError::CapabilityDrop(format!(
|
||||
"Failed to drop bounding capability {}: {}",
|
||||
cap, err
|
||||
)));
|
||||
}
|
||||
}
|
||||
debug!("Bounding set capabilities dropped");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Clear the permitted and effective capability sets using capset(2).
|
||||
fn clear_capability_sets() -> Result<(), SecurityError> {
|
||||
// Linux capability header + data structures (v3, 64-bit)
|
||||
#[repr(C)]
|
||||
struct CapHeader {
|
||||
version: u32,
|
||||
pid: i32,
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
struct CapData {
|
||||
effective: u32,
|
||||
permitted: u32,
|
||||
inheritable: u32,
|
||||
}
|
||||
|
||||
// _LINUX_CAPABILITY_VERSION_3 = 0x20080522
|
||||
let header = CapHeader {
|
||||
version: 0x20080522,
|
||||
pid: 0, // current process
|
||||
};
|
||||
|
||||
// Zero out all capability sets (two u32 words for v3)
|
||||
let data = [
|
||||
CapData {
|
||||
effective: 0,
|
||||
permitted: 0,
|
||||
inheritable: 0,
|
||||
},
|
||||
CapData {
|
||||
effective: 0,
|
||||
permitted: 0,
|
||||
inheritable: 0,
|
||||
},
|
||||
];
|
||||
|
||||
let ret = unsafe {
|
||||
libc::syscall(
|
||||
libc::SYS_capset,
|
||||
&header as *const CapHeader,
|
||||
data.as_ptr() as *const CapData,
|
||||
)
|
||||
};
|
||||
|
||||
if ret != 0 {
|
||||
let err = std::io::Error::last_os_error();
|
||||
// EPERM is expected when running as non-root
|
||||
if err.raw_os_error() == Some(libc::EPERM) {
|
||||
warn!("Cannot clear capability sets (EPERM) - process likely already unprivileged");
|
||||
} else {
|
||||
return Err(SecurityError::CapabilityDrop(format!(
|
||||
"Failed to clear capability sets: {}",
|
||||
err
|
||||
)));
|
||||
}
|
||||
} else {
|
||||
debug!("Capability sets cleared (permitted, effective, inheritable = 0)");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_set_no_new_privs() {
|
||||
// This should always succeed, even for unprivileged processes
|
||||
set_no_new_privs().expect("PR_SET_NO_NEW_PRIVS should succeed");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_drop_capabilities_unprivileged() {
|
||||
// When running as non-root, some operations will be skipped gracefully
|
||||
// but the overall function should not error out
|
||||
drop_capabilities().expect("drop_capabilities should succeed even unprivileged");
|
||||
}
|
||||
}
|
||||
338
vmm/src/security/landlock.rs
Normal file
338
vmm/src/security/landlock.rs
Normal file
@@ -0,0 +1,338 @@
|
||||
//! Landlock filesystem sandboxing for Volt VMM
|
||||
//!
|
||||
//! Restricts the VMM process to only access the filesystem paths it actually needs:
|
||||
//! - Kernel and initrd images (read-only)
|
||||
//! - Disk images (read-write)
|
||||
//! - API socket path (read-write)
|
||||
//! - Device nodes: /dev/kvm, /dev/net/tun, /dev/vhost-net (read-write)
|
||||
//! - /proc/self (read-only, for /proc/self/fd)
|
||||
//! - /sys/class/net (read-only, for bridge/macvtap network detection)
|
||||
//! - /run/systemd/network (read-only, for networkd integration)
|
||||
//! - /run/stellarium (read-write, for CAS daemon socket)
|
||||
//!
|
||||
//! Landlock requires Linux 5.13+. When unavailable, it degrades gracefully
|
||||
//! with a warning log. Use `--no-landlock` to disable entirely.
|
||||
//!
|
||||
//! # ABI Compatibility
|
||||
//!
|
||||
//! The crate handles ABI version negotiation automatically via the `landlock` crate's
|
||||
//! best-effort compatibility mode. We target ABI V5 (kernel 6.10+) for maximum
|
||||
//! protection, falling back to whatever the running kernel supports.
|
||||
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use landlock::{
|
||||
Access, AccessFs, BitFlags, Ruleset, RulesetAttr,
|
||||
RulesetCreatedAttr, RulesetStatus, ABI,
|
||||
path_beneath_rules,
|
||||
};
|
||||
use tracing::{debug, info, warn};
|
||||
|
||||
use super::{LandlockAccess, LandlockRule, SecurityError};
|
||||
|
||||
/// Target ABI version — we request the highest we know about and let the
|
||||
/// crate's best-effort mode downgrade gracefully.
|
||||
const TARGET_ABI: ABI = ABI::V5;
|
||||
|
||||
/// Configuration for the Landlock sandbox
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct LandlockConfig {
|
||||
/// Path to the kernel image (read-only access)
|
||||
pub kernel_path: PathBuf,
|
||||
/// Path to the initrd image (read-only access)
|
||||
pub initrd_path: Option<PathBuf>,
|
||||
/// Paths to disk images (read-write access)
|
||||
pub disk_paths: Vec<PathBuf>,
|
||||
/// API socket path (read-write access)
|
||||
pub api_socket_path: Option<PathBuf>,
|
||||
/// Additional user-specified rules from --landlock-rule
|
||||
pub extra_rules: Vec<LandlockRule>,
|
||||
}
|
||||
|
||||
impl LandlockConfig {
|
||||
/// Create a new Landlock configuration from VMM paths
|
||||
pub fn new(kernel_path: PathBuf) -> Self {
|
||||
Self {
|
||||
kernel_path,
|
||||
initrd_path: None,
|
||||
disk_paths: Vec::new(),
|
||||
api_socket_path: None,
|
||||
extra_rules: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Set the initrd path
|
||||
pub fn with_initrd(mut self, path: PathBuf) -> Self {
|
||||
self.initrd_path = Some(path);
|
||||
self
|
||||
}
|
||||
|
||||
/// Add a disk image path
|
||||
pub fn with_disk(mut self, path: PathBuf) -> Self {
|
||||
self.disk_paths.push(path);
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the API socket path
|
||||
pub fn with_api_socket(mut self, path: PathBuf) -> Self {
|
||||
self.api_socket_path = Some(path);
|
||||
self
|
||||
}
|
||||
|
||||
/// Add extra rules from CLI
|
||||
pub fn with_extra_rules(mut self, rules: Vec<LandlockRule>) -> Self {
|
||||
self.extra_rules = rules;
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
/// Landlock sandbox state (marker type for documentation)
|
||||
#[allow(dead_code)]
|
||||
pub struct LandlockSandbox;
|
||||
|
||||
/// Apply Landlock restrictions based on the provided configuration.
|
||||
///
|
||||
/// This function:
|
||||
/// 1. Detects if Landlock is available on the running kernel
|
||||
/// 2. Creates a ruleset allowing only the VMM's required paths
|
||||
/// 3. Enforces the ruleset on the current process (irrevocable)
|
||||
///
|
||||
/// # Best-Effort Mode
|
||||
///
|
||||
/// The landlock crate operates in best-effort mode by default:
|
||||
/// - On kernels without Landlock: logs a warning, continues without sandboxing
|
||||
/// - On older kernels: applies whatever subset of restrictions the kernel supports
|
||||
/// - On modern kernels: full sandbox enforcement
|
||||
pub fn apply_landlock(config: &LandlockConfig) -> Result<(), SecurityError> {
|
||||
info!("Applying Landlock filesystem sandbox");
|
||||
|
||||
// Build access sets for the target ABI
|
||||
let access_all = AccessFs::from_all(TARGET_ABI);
|
||||
let access_read = AccessFs::from_read(TARGET_ABI);
|
||||
|
||||
// File-specific read-write access (subset for disk images)
|
||||
let access_rw_file: BitFlags<AccessFs> = AccessFs::ReadFile
|
||||
| AccessFs::WriteFile
|
||||
| AccessFs::ReadDir
|
||||
| AccessFs::Truncate;
|
||||
|
||||
// Device access — need read/write plus ioctl for /dev/kvm
|
||||
let access_device: BitFlags<AccessFs> = AccessFs::ReadFile
|
||||
| AccessFs::WriteFile
|
||||
| AccessFs::IoctlDev;
|
||||
|
||||
// Create the ruleset declaring what access types we want to control
|
||||
let ruleset = Ruleset::default()
|
||||
.handle_access(access_all)
|
||||
.map_err(|e| SecurityError::Landlock(format!("Failed to set handled access: {}", e)))?
|
||||
.create()
|
||||
.map_err(|e| SecurityError::Landlock(format!("Failed to create ruleset: {}", e)))?;
|
||||
|
||||
// Collect all rules, then chain them into the ruleset.
|
||||
// We build (PathFd, BitFlags<AccessFs>) tuples and add them.
|
||||
|
||||
// --- Read-only paths ---
|
||||
let mut ro_paths: Vec<PathBuf> = vec![config.kernel_path.clone()];
|
||||
if let Some(ref initrd) = config.initrd_path {
|
||||
ro_paths.push(initrd.clone());
|
||||
}
|
||||
|
||||
// --- Read-write paths (disk images) ---
|
||||
let rw_paths: Vec<PathBuf> = config.disk_paths.clone();
|
||||
|
||||
// Start chaining rules using add_rules with path_beneath_rules helper
|
||||
let ruleset = ruleset
|
||||
.add_rules(path_beneath_rules(&ro_paths, access_read))
|
||||
.map_err(|e| SecurityError::Landlock(format!("Failed to add read-only rules: {}", e)))?;
|
||||
debug!("Landlock: read-only access to {:?}", ro_paths);
|
||||
|
||||
let ruleset = if !rw_paths.is_empty() {
|
||||
let r = ruleset
|
||||
.add_rules(path_beneath_rules(&rw_paths, access_rw_file))
|
||||
.map_err(|e| SecurityError::Landlock(format!("Failed to add read-write rules: {}", e)))?;
|
||||
debug!("Landlock: read-write access to {:?}", rw_paths);
|
||||
r
|
||||
} else {
|
||||
ruleset
|
||||
};
|
||||
|
||||
// --- API socket directory ---
|
||||
let ruleset = if let Some(ref socket_path) = config.api_socket_path {
|
||||
if let Some(parent) = socket_path.parent() {
|
||||
if parent.exists() {
|
||||
let socket_access: BitFlags<AccessFs> = AccessFs::ReadFile
|
||||
| AccessFs::WriteFile
|
||||
| AccessFs::ReadDir
|
||||
| AccessFs::MakeSock
|
||||
| AccessFs::RemoveFile;
|
||||
let r = ruleset
|
||||
.add_rules(path_beneath_rules(&[parent], socket_access))
|
||||
.map_err(|e| SecurityError::Landlock(format!("Failed to add API socket rule: {}", e)))?;
|
||||
debug!("Landlock: socket access to {}", parent.display());
|
||||
r
|
||||
} else {
|
||||
ruleset
|
||||
}
|
||||
} else {
|
||||
ruleset
|
||||
}
|
||||
} else {
|
||||
ruleset
|
||||
};
|
||||
|
||||
// --- Device nodes (optional — may not exist) ---
|
||||
let device_paths: Vec<&Path> = ["/dev/kvm", "/dev/net/tun", "/dev/vhost-net"]
|
||||
.iter()
|
||||
.map(Path::new)
|
||||
.filter(|p| p.exists())
|
||||
.collect();
|
||||
let ruleset = if !device_paths.is_empty() {
|
||||
let r = ruleset
|
||||
.add_rules(path_beneath_rules(&device_paths, access_device))
|
||||
.map_err(|e| SecurityError::Landlock(format!("Failed to add device rules: {}", e)))?;
|
||||
debug!("Landlock: device access to {:?}", device_paths);
|
||||
r
|
||||
} else {
|
||||
ruleset
|
||||
};
|
||||
|
||||
// --- /sys/class/net (read-only) — required for bridge/macvtap network detection ---
|
||||
let sys_net = Path::new("/sys/class/net");
|
||||
let ruleset = if sys_net.exists() {
|
||||
let r = ruleset
|
||||
.add_rules(path_beneath_rules(&[sys_net], access_read))
|
||||
.map_err(|e| SecurityError::Landlock(format!("Failed to add /sys/class/net rule: {}", e)))?;
|
||||
debug!("Landlock: read-only access to /sys/class/net");
|
||||
r
|
||||
} else {
|
||||
ruleset
|
||||
};
|
||||
|
||||
// --- /run/systemd/network (read-only) — required for systemd-networkd integration ---
|
||||
let run_networkd = Path::new("/run/systemd/network");
|
||||
let ruleset = if run_networkd.exists() {
|
||||
let r = ruleset
|
||||
.add_rules(path_beneath_rules(&[run_networkd], access_read))
|
||||
.map_err(|e| SecurityError::Landlock(format!("Failed to add networkd runtime rule: {}", e)))?;
|
||||
debug!("Landlock: read-only access to /run/systemd/network");
|
||||
r
|
||||
} else {
|
||||
ruleset
|
||||
};
|
||||
|
||||
// --- /run/stellarium (read-write) — CAS daemon socket ---
|
||||
let run_stellarium = Path::new("/run/stellarium");
|
||||
let ruleset = if run_stellarium.exists() {
|
||||
let stellarium_access: BitFlags<AccessFs> = AccessFs::ReadFile
|
||||
| AccessFs::WriteFile
|
||||
| AccessFs::ReadDir
|
||||
| AccessFs::MakeSock;
|
||||
let r = ruleset
|
||||
.add_rules(path_beneath_rules(&[run_stellarium], stellarium_access))
|
||||
.map_err(|e| SecurityError::Landlock(format!("Failed to add Stellarium socket rule: {}", e)))?;
|
||||
debug!("Landlock: socket access to /run/stellarium");
|
||||
r
|
||||
} else {
|
||||
ruleset
|
||||
};
|
||||
|
||||
// --- /proc/self (read-only, for fd access) ---
|
||||
let proc_self = Path::new("/proc/self");
|
||||
let ruleset = if proc_self.exists() {
|
||||
let r = ruleset
|
||||
.add_rules(path_beneath_rules(&[proc_self], access_read))
|
||||
.map_err(|e| SecurityError::Landlock(format!("Failed to add /proc/self rule: {}", e)))?;
|
||||
debug!("Landlock: read-only access to /proc/self");
|
||||
r
|
||||
} else {
|
||||
ruleset
|
||||
};
|
||||
|
||||
// --- Extra user-specified rules from --landlock-rule ---
|
||||
let mut current = ruleset;
|
||||
for rule in &config.extra_rules {
|
||||
let access = match rule.access {
|
||||
LandlockAccess::ReadOnly => access_read,
|
||||
LandlockAccess::ReadWrite => access_all,
|
||||
};
|
||||
current = current
|
||||
.add_rules(path_beneath_rules(&[&rule.path], access))
|
||||
.map_err(|e| SecurityError::Landlock(format!(
|
||||
"Failed to add user rule for '{}': {}",
|
||||
rule.path.display(),
|
||||
e
|
||||
)))?;
|
||||
debug!(
|
||||
"Landlock: user rule {} access to {}",
|
||||
match rule.access {
|
||||
LandlockAccess::ReadOnly => "ro",
|
||||
LandlockAccess::ReadWrite => "rw",
|
||||
},
|
||||
rule.path.display()
|
||||
);
|
||||
}
|
||||
|
||||
// Enforce the ruleset — this is irrevocable
|
||||
let status = current
|
||||
.restrict_self()
|
||||
.map_err(|e| SecurityError::Landlock(format!("Failed to restrict self: {}", e)))?;
|
||||
|
||||
// Report enforcement status
|
||||
match status.ruleset {
|
||||
RulesetStatus::FullyEnforced => {
|
||||
info!("Landlock sandbox fully enforced");
|
||||
}
|
||||
RulesetStatus::PartiallyEnforced => {
|
||||
warn!(
|
||||
"Landlock sandbox partially enforced (kernel may not support all requested features)"
|
||||
);
|
||||
}
|
||||
RulesetStatus::NotEnforced => {
|
||||
warn!(
|
||||
"Landlock sandbox NOT enforced — kernel does not support Landlock. \
|
||||
Consider upgrading to kernel 5.13+ for filesystem sandboxing."
|
||||
);
|
||||
}
|
||||
#[allow(unreachable_patterns)]
|
||||
_ => {
|
||||
warn!("Landlock sandbox: unknown enforcement status");
|
||||
}
|
||||
}
|
||||
|
||||
if status.no_new_privs {
|
||||
debug!("PR_SET_NO_NEW_PRIVS confirmed by Landlock");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::path::PathBuf;
|
||||
|
||||
#[test]
|
||||
fn test_landlock_config_builder() {
|
||||
let config = LandlockConfig::new(PathBuf::from("/boot/vmlinux"))
|
||||
.with_initrd(PathBuf::from("/boot/initrd.img"))
|
||||
.with_disk(PathBuf::from("/var/lib/vms/disk.img"))
|
||||
.with_api_socket(PathBuf::from("/tmp/volt-vmm.sock"));
|
||||
|
||||
assert_eq!(config.kernel_path, PathBuf::from("/boot/vmlinux"));
|
||||
assert_eq!(config.initrd_path, Some(PathBuf::from("/boot/initrd.img")));
|
||||
assert_eq!(config.disk_paths.len(), 1);
|
||||
assert_eq!(
|
||||
config.api_socket_path,
|
||||
Some(PathBuf::from("/tmp/volt-vmm.sock"))
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_landlock_config_multiple_disks() {
|
||||
let config = LandlockConfig::new(PathBuf::from("/boot/vmlinux"))
|
||||
.with_disk(PathBuf::from("/var/lib/vms/disk1.img"))
|
||||
.with_disk(PathBuf::from("/var/lib/vms/disk2.img"));
|
||||
|
||||
assert_eq!(config.disk_paths.len(), 2);
|
||||
}
|
||||
}
|
||||
120
vmm/src/security/mod.rs
Normal file
120
vmm/src/security/mod.rs
Normal file
@@ -0,0 +1,120 @@
|
||||
//! Volt Security Module
|
||||
//!
|
||||
//! Provides defense-in-depth sandboxing for the VMM process:
|
||||
//!
|
||||
//! - **Seccomp-BPF**: Strict syscall allowlist (~70 syscalls, everything else → KILL)
|
||||
//! - **Capability dropping**: Removes all Linux capabilities after setup
|
||||
//! - **Landlock**: Restricts filesystem access to only required paths (kernel 5.13+)
|
||||
//!
|
||||
//! # Security Layer Stack
|
||||
//!
|
||||
//! ```text
|
||||
//! ┌─────────────────────────────────────────────────────────┐
|
||||
//! │ Layer 5: Seccomp-BPF (always unless --no-seccomp) │
|
||||
//! │ Syscall allowlist, KILL_PROCESS on violation │
|
||||
//! ├─────────────────────────────────────────────────────────┤
|
||||
//! │ Layer 4: Landlock (optional, kernel 5.13+) │
|
||||
//! │ Filesystem path restrictions │
|
||||
//! ├─────────────────────────────────────────────────────────┤
|
||||
//! │ Layer 3: Capability dropping (always) │
|
||||
//! │ Drop all ambient capabilities │
|
||||
//! ├─────────────────────────────────────────────────────────┤
|
||||
//! │ Layer 2: PR_SET_NO_NEW_PRIVS (always) │
|
||||
//! │ Prevent privilege escalation │
|
||||
//! ├─────────────────────────────────────────────────────────┤
|
||||
//! │ Layer 1: KVM isolation (inherent) │
|
||||
//! │ Hardware virtualization boundary │
|
||||
//! └─────────────────────────────────────────────────────────┘
|
||||
//! ```
|
||||
|
||||
pub mod capabilities;
|
||||
pub mod landlock;
|
||||
pub mod seccomp;
|
||||
|
||||
pub use capabilities::drop_capabilities;
|
||||
pub use landlock::LandlockConfig;
|
||||
pub use seccomp::{apply_seccomp_filter, SeccompConfig};
|
||||
|
||||
use std::path::PathBuf;
|
||||
use thiserror::Error;
|
||||
|
||||
/// Security-related errors
|
||||
#[derive(Error, Debug)]
|
||||
pub enum SecurityError {
|
||||
#[error("Failed to drop capabilities: {0}")]
|
||||
CapabilityDrop(String),
|
||||
|
||||
#[error("Failed to set PR_SET_NO_NEW_PRIVS: {0}")]
|
||||
NoNewPrivs(String),
|
||||
|
||||
#[error("Landlock error: {0}")]
|
||||
Landlock(String),
|
||||
|
||||
#[error("Failed to parse landlock rule '{0}': expected format 'path:access' where access is 'ro' or 'rw'")]
|
||||
LandlockRuleParse(String),
|
||||
}
|
||||
|
||||
/// Parsed additional Landlock rule from CLI
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct LandlockRule {
|
||||
/// Filesystem path to allow access to
|
||||
pub path: PathBuf,
|
||||
/// Access mode: read-only or read-write
|
||||
pub access: LandlockAccess,
|
||||
}
|
||||
|
||||
/// Access mode for a Landlock rule
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum LandlockAccess {
|
||||
/// Read-only access
|
||||
ReadOnly,
|
||||
/// Read-write access
|
||||
ReadWrite,
|
||||
}
|
||||
|
||||
impl LandlockRule {
|
||||
/// Parse a rule from the CLI format "path:access"
|
||||
///
|
||||
/// Examples:
|
||||
/// - `/tmp/hotplug:rw`
|
||||
/// - `/usr/share/data:ro`
|
||||
pub fn parse(s: &str) -> Result<Self, SecurityError> {
|
||||
let parts: Vec<&str> = s.rsplitn(2, ':').collect();
|
||||
if parts.len() != 2 {
|
||||
return Err(SecurityError::LandlockRuleParse(s.to_string()));
|
||||
}
|
||||
|
||||
// rsplitn reverses the order
|
||||
let access_str = parts[0];
|
||||
let path_str = parts[1];
|
||||
|
||||
let access = match access_str {
|
||||
"ro" | "r" | "read" => LandlockAccess::ReadOnly,
|
||||
"rw" | "w" | "write" | "readwrite" => LandlockAccess::ReadWrite,
|
||||
_ => return Err(SecurityError::LandlockRuleParse(s.to_string())),
|
||||
};
|
||||
|
||||
Ok(Self {
|
||||
path: PathBuf::from(path_str),
|
||||
access,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Apply all security restrictions.
|
||||
///
|
||||
/// This should be called after all privileged setup (KVM, TAP, sockets) is complete
|
||||
/// but before the vCPU run loop begins.
|
||||
pub fn apply_security(
|
||||
landlock_config: Option<&LandlockConfig>,
|
||||
) -> Result<(), SecurityError> {
|
||||
// Step 1: Apply Landlock (if configured) — this also sets PR_SET_NO_NEW_PRIVS
|
||||
if let Some(config) = landlock_config {
|
||||
landlock::apply_landlock(config)?;
|
||||
}
|
||||
|
||||
// Step 2: Drop capabilities
|
||||
drop_capabilities()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
344
vmm/src/security/seccomp.rs
Normal file
344
vmm/src/security/seccomp.rs
Normal file
@@ -0,0 +1,344 @@
|
||||
//! Seccomp-BPF system call filtering for Volt VMM
|
||||
//!
|
||||
//! Implements a strict syscall allowlist modeled after Firecracker's approach.
|
||||
//! All syscalls not explicitly allowed are blocked with SECCOMP_RET_KILL_PROCESS,
|
||||
//! immediately terminating the VMM if an unexpected syscall is attempted.
|
||||
//!
|
||||
//! # Syscall Categories
|
||||
//!
|
||||
//! The allowlist is organized by function:
|
||||
//! - **File I/O**: read, write, openat, close, fstat, lseek
|
||||
//! - **Memory**: mmap, mprotect, munmap, brk, madvise, mremap
|
||||
//! - **KVM**: ioctl (the core VMM syscall for KVM_RUN, etc.)
|
||||
//! - **Threading**: clone, clone3, futex, set_robust_list, sched_yield, rseq
|
||||
//! - **Signals**: rt_sigaction, rt_sigprocmask, rt_sigreturn, sigaltstack
|
||||
//! - **Networking**: accept4, bind, listen, socket, socketpair, connect, recvfrom,
|
||||
//! sendto, epoll_ctl, epoll_wait, epoll_pwait, epoll_create1,
|
||||
//! shutdown, getsockname, setsockopt, poll/ppoll
|
||||
//! - **Process**: exit, exit_group, getpid, gettid, prctl, arch_prctl, prlimit64
|
||||
//! - **Timers**: clock_gettime, nanosleep, clock_nanosleep
|
||||
//! - **Misc**: getrandom, eventfd2, timerfd_create, timerfd_settime, pipe2,
|
||||
//! dup/dup2, fcntl, statx, newfstatat, access, readlink, getcwd
|
||||
//!
|
||||
//! # Application Timing
|
||||
//!
|
||||
//! The filter MUST be applied after all initialization is complete:
|
||||
//! - KVM VM and vCPUs created
|
||||
//! - Guest memory allocated and mapped
|
||||
//! - Kernel loaded into guest memory
|
||||
//! - Devices initialized
|
||||
//! - API socket bound
|
||||
//!
|
||||
//! But BEFORE the vCPU run loop starts.
|
||||
|
||||
use std::convert::TryInto;
|
||||
|
||||
use seccompiler::{
|
||||
BpfProgram, SeccompAction, SeccompFilter,
|
||||
};
|
||||
use tracing::{debug, info, trace, warn};
|
||||
|
||||
/// Configuration for seccomp filtering
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct SeccompConfig {
|
||||
/// Whether seccomp filtering is enabled
|
||||
pub enabled: bool,
|
||||
/// Log the allowlist at TRACE level during setup
|
||||
pub log_allowlist: bool,
|
||||
}
|
||||
|
||||
impl Default for SeccompConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
enabled: true,
|
||||
log_allowlist: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Errors related to seccomp filter setup
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum SeccompError {
|
||||
#[error("Failed to build seccomp filter: {0}")]
|
||||
FilterBuild(String),
|
||||
|
||||
#[error("Failed to compile seccomp filter to BPF: {0}")]
|
||||
Compile(String),
|
||||
|
||||
#[error("Failed to apply seccomp filter: {0}")]
|
||||
Apply(String),
|
||||
}
|
||||
|
||||
/// Syscall name-number pairs for the x86_64 allowlist.
|
||||
///
|
||||
/// These are the syscalls a KVM-based VMM needs during steady-state operation.
|
||||
/// Numbers are from the Linux x86_64 syscall table.
|
||||
const ALLOWED_SYSCALLS: &[(i64, &str)] = &[
|
||||
// ── File I/O ──
|
||||
(libc::SYS_read, "read"),
|
||||
(libc::SYS_write, "write"),
|
||||
(libc::SYS_openat, "openat"),
|
||||
(libc::SYS_close, "close"),
|
||||
(libc::SYS_fstat, "fstat"),
|
||||
(libc::SYS_lseek, "lseek"),
|
||||
(libc::SYS_pread64, "pread64"),
|
||||
(libc::SYS_pwrite64, "pwrite64"),
|
||||
(libc::SYS_readv, "readv"),
|
||||
(libc::SYS_writev, "writev"),
|
||||
(libc::SYS_fsync, "fsync"),
|
||||
(libc::SYS_fdatasync, "fdatasync"),
|
||||
(libc::SYS_fallocate, "fallocate"),
|
||||
(libc::SYS_ftruncate, "ftruncate"),
|
||||
(libc::SYS_mkdir, "mkdir"),
|
||||
(libc::SYS_mkdirat, "mkdirat"),
|
||||
|
||||
// ── Memory management ──
|
||||
(libc::SYS_mmap, "mmap"),
|
||||
(libc::SYS_mprotect, "mprotect"),
|
||||
(libc::SYS_munmap, "munmap"),
|
||||
(libc::SYS_brk, "brk"),
|
||||
(libc::SYS_madvise, "madvise"),
|
||||
(libc::SYS_mremap, "mremap"),
|
||||
|
||||
// ── KVM / device control ──
|
||||
// ioctl is the workhorse: KVM_RUN, KVM_SET_REGS, KVM_CREATE_VCPU, etc.
|
||||
// We allow all ioctls here; filtering by ioctl number would require
|
||||
// argument-level BPF rules for every KVM ioctl, which is fragile across
|
||||
// kernel versions. The fd-based KVM security model already limits scope.
|
||||
(libc::SYS_ioctl, "ioctl"),
|
||||
|
||||
// ── Threading ──
|
||||
(libc::SYS_clone, "clone"),
|
||||
(libc::SYS_clone3, "clone3"),
|
||||
(libc::SYS_futex, "futex"),
|
||||
(libc::SYS_set_robust_list, "set_robust_list"),
|
||||
(libc::SYS_sched_yield, "sched_yield"),
|
||||
(libc::SYS_sched_getaffinity, "sched_getaffinity"),
|
||||
(libc::SYS_rseq, "rseq"),
|
||||
|
||||
// ── Signals ──
|
||||
(libc::SYS_rt_sigaction, "rt_sigaction"),
|
||||
(libc::SYS_rt_sigprocmask, "rt_sigprocmask"),
|
||||
(libc::SYS_rt_sigreturn, "rt_sigreturn"),
|
||||
(libc::SYS_sigaltstack, "sigaltstack"),
|
||||
|
||||
// ── Networking (API socket + epoll) ──
|
||||
(libc::SYS_accept4, "accept4"),
|
||||
(libc::SYS_bind, "bind"),
|
||||
(libc::SYS_listen, "listen"),
|
||||
(libc::SYS_socket, "socket"),
|
||||
// socketpair: required by signal-hook-tokio (UnixStream::pair() for signal delivery pipe)
|
||||
(libc::SYS_socketpair, "socketpair"),
|
||||
(libc::SYS_connect, "connect"),
|
||||
(libc::SYS_recvfrom, "recvfrom"),
|
||||
(libc::SYS_sendto, "sendto"),
|
||||
(libc::SYS_recvmsg, "recvmsg"),
|
||||
(libc::SYS_sendmsg, "sendmsg"),
|
||||
(libc::SYS_shutdown, "shutdown"),
|
||||
(libc::SYS_getsockname, "getsockname"),
|
||||
(libc::SYS_getpeername, "getpeername"),
|
||||
(libc::SYS_setsockopt, "setsockopt"),
|
||||
(libc::SYS_getsockopt, "getsockopt"),
|
||||
(libc::SYS_epoll_create1, "epoll_create1"),
|
||||
(libc::SYS_epoll_ctl, "epoll_ctl"),
|
||||
(libc::SYS_epoll_wait, "epoll_wait"),
|
||||
// epoll_pwait: glibc ≥2.35 routes epoll_wait() through epoll_pwait(); tokio depends on this
|
||||
(libc::SYS_epoll_pwait, "epoll_pwait"),
|
||||
(libc::SYS_ppoll, "ppoll"),
|
||||
// poll: fallback I/O multiplexing used by some tokio codepaths and libc internals
|
||||
(libc::SYS_poll, "poll"),
|
||||
|
||||
// ── Process lifecycle ──
|
||||
(libc::SYS_exit, "exit"),
|
||||
(libc::SYS_exit_group, "exit_group"),
|
||||
(libc::SYS_getpid, "getpid"),
|
||||
(libc::SYS_gettid, "gettid"),
|
||||
(libc::SYS_prctl, "prctl"),
|
||||
(libc::SYS_arch_prctl, "arch_prctl"),
|
||||
(libc::SYS_prlimit64, "prlimit64"),
|
||||
(libc::SYS_tgkill, "tgkill"),
|
||||
|
||||
// ── Timers ──
|
||||
(libc::SYS_clock_gettime, "clock_gettime"),
|
||||
(libc::SYS_nanosleep, "nanosleep"),
|
||||
(libc::SYS_clock_nanosleep, "clock_nanosleep"),
|
||||
|
||||
// ── Misc (runtime needs) ──
|
||||
(libc::SYS_getrandom, "getrandom"),
|
||||
(libc::SYS_eventfd2, "eventfd2"),
|
||||
(libc::SYS_timerfd_create, "timerfd_create"),
|
||||
(libc::SYS_timerfd_settime, "timerfd_settime"),
|
||||
(libc::SYS_pipe2, "pipe2"),
|
||||
(libc::SYS_dup, "dup"),
|
||||
(libc::SYS_dup2, "dup2"),
|
||||
(libc::SYS_fcntl, "fcntl"),
|
||||
(libc::SYS_statx, "statx"),
|
||||
(libc::SYS_newfstatat, "newfstatat"),
|
||||
(libc::SYS_access, "access"),
|
||||
(libc::SYS_readlinkat, "readlinkat"),
|
||||
(libc::SYS_getcwd, "getcwd"),
|
||||
(libc::SYS_unlink, "unlink"),
|
||||
(libc::SYS_unlinkat, "unlinkat"),
|
||||
];
|
||||
|
||||
/// Build the seccomp BPF filter program.
|
||||
///
|
||||
/// Creates a filter that allows only the syscalls in `ALLOWED_SYSCALLS`
|
||||
/// and kills the process on any other syscall.
|
||||
fn build_filter(log_allowlist: bool) -> Result<BpfProgram, SeccompError> {
|
||||
if log_allowlist {
|
||||
trace!("Building seccomp filter with {} allowed syscalls:", ALLOWED_SYSCALLS.len());
|
||||
for (nr, name) in ALLOWED_SYSCALLS {
|
||||
trace!(" allow: {} (nr={})", name, nr);
|
||||
}
|
||||
}
|
||||
|
||||
// Build the syscall rules map: each allowed syscall maps to an empty
|
||||
// rule vector (meaning "allow unconditionally").
|
||||
let rules: Vec<(i64, Vec<seccompiler::SeccompRule>)> = ALLOWED_SYSCALLS
|
||||
.iter()
|
||||
.map(|(nr, _name)| (*nr, vec![]))
|
||||
.collect();
|
||||
|
||||
let filter = SeccompFilter::new(
|
||||
rules.into_iter().collect(),
|
||||
// Default action: kill the process for any non-allowed syscall
|
||||
SeccompAction::KillProcess,
|
||||
// Match action: allow the syscall if it's in our allowlist
|
||||
SeccompAction::Allow,
|
||||
std::env::consts::ARCH
|
||||
.try_into()
|
||||
.map_err(|e| SeccompError::FilterBuild(format!("Unsupported arch: {:?}", e)))?,
|
||||
)
|
||||
.map_err(|e| SeccompError::FilterBuild(format!("{}", e)))?;
|
||||
|
||||
// Compile the filter to BPF instructions
|
||||
let bpf: BpfProgram = filter
|
||||
.try_into()
|
||||
.map_err(|e: seccompiler::BackendError| SeccompError::Compile(format!("{}", e)))?;
|
||||
|
||||
debug!(
|
||||
"Seccomp BPF program compiled: {} instructions, {} syscalls allowed",
|
||||
bpf.len(),
|
||||
ALLOWED_SYSCALLS.len()
|
||||
);
|
||||
|
||||
Ok(bpf)
|
||||
}
|
||||
|
||||
/// Apply seccomp-bpf filtering to the current process.
|
||||
///
|
||||
/// After this call, only syscalls in the allowlist will succeed.
|
||||
/// Any other syscall will immediately kill the process with SIGSYS.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `config` - Seccomp configuration (enabled flag, logging)
|
||||
///
|
||||
/// # Safety
|
||||
///
|
||||
/// This function uses `prctl(PR_SET_NO_NEW_PRIVS)` and `seccomp(SECCOMP_SET_MODE_FILTER)`.
|
||||
/// It must be called from the main thread before spawning vCPU threads, or use
|
||||
/// `apply_filter_all_threads` for TSYNC.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns `SeccompError` if filter construction or application fails.
|
||||
pub fn apply_seccomp_filter(config: &SeccompConfig) -> Result<(), SeccompError> {
|
||||
if !config.enabled {
|
||||
warn!("Seccomp filtering is DISABLED (--no-seccomp flag). This is insecure for production use.");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
info!("Applying seccomp-bpf filter ({} syscalls allowed)", ALLOWED_SYSCALLS.len());
|
||||
|
||||
let bpf = build_filter(config.log_allowlist)?;
|
||||
|
||||
// Apply to all threads via TSYNC. This ensures vCPU threads spawned later
|
||||
// also inherit the filter.
|
||||
seccompiler::apply_filter_all_threads(&bpf)
|
||||
.map_err(|e| SeccompError::Apply(format!("{}", e)))?;
|
||||
|
||||
info!(
|
||||
"Seccomp filter active: {} syscalls allowed, all others → KILL_PROCESS",
|
||||
ALLOWED_SYSCALLS.len()
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get the number of allowed syscalls (for metrics/logging).
|
||||
#[allow(dead_code)]
|
||||
pub fn allowed_syscall_count() -> usize {
|
||||
ALLOWED_SYSCALLS.len()
|
||||
}
|
||||
|
||||
/// Get a list of allowed syscall names (for debugging/documentation).
|
||||
#[allow(dead_code)]
|
||||
pub fn allowed_syscall_names() -> Vec<&'static str> {
|
||||
ALLOWED_SYSCALLS.iter().map(|(_, name)| *name).collect()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_no_duplicate_syscalls() {
|
||||
let mut seen = std::collections::HashSet::new();
|
||||
for (nr, name) in ALLOWED_SYSCALLS {
|
||||
assert!(
|
||||
seen.insert(nr),
|
||||
"Duplicate syscall number {} ({})",
|
||||
nr,
|
||||
name
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_allowlist_not_empty() {
|
||||
assert!(!ALLOWED_SYSCALLS.is_empty());
|
||||
assert!(ALLOWED_SYSCALLS.len() > 30, "Allowlist seems suspiciously small");
|
||||
assert!(ALLOWED_SYSCALLS.len() < 120, "Allowlist seems suspiciously large");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_filter_builds() {
|
||||
// Just verify the filter compiles without error
|
||||
let bpf = build_filter(false).expect("Filter should build successfully");
|
||||
assert!(!bpf.is_empty(), "BPF program should not be empty");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_config_default() {
|
||||
let config = SeccompConfig::default();
|
||||
assert!(config.enabled);
|
||||
assert!(config.log_allowlist);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_disabled_config() {
|
||||
let config = SeccompConfig {
|
||||
enabled: false,
|
||||
log_allowlist: false,
|
||||
};
|
||||
// Should return Ok without applying anything
|
||||
apply_seccomp_filter(&config).expect("Disabled filter should succeed");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_allowed_syscall_names() {
|
||||
let names = allowed_syscall_names();
|
||||
assert!(names.contains(&"read"));
|
||||
assert!(names.contains(&"write"));
|
||||
assert!(names.contains(&"ioctl"));
|
||||
assert!(names.contains(&"exit_group"));
|
||||
assert!(names.contains(&"mmap"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_syscall_count() {
|
||||
assert_eq!(allowed_syscall_count(), ALLOWED_SYSCALLS.len());
|
||||
}
|
||||
}
|
||||
660
vmm/src/snapshot/cas.rs
Normal file
660
vmm/src/snapshot/cas.rs
Normal file
@@ -0,0 +1,660 @@
|
||||
//! Content-Addressable Storage (CAS) Support for Memory Snapshots
|
||||
//!
|
||||
//! This module provides Stellarium CAS-backed memory snapshot support.
|
||||
//! Instead of a single flat `memory.snap` file, memory is stored as
|
||||
//! 64 × 2MB chunks, each identified by SHA-256 hash.
|
||||
//!
|
||||
//! # Benefits
|
||||
//!
|
||||
//! - **Deduplication**: Identical chunks across VMs are stored once
|
||||
//! - **Instant cloning**: VMs with identical memory regions share chunks
|
||||
//! - **Efficient storage**: Only modified chunks need to be stored
|
||||
//! - **Huge page compatible**: 2MB chunks align with huge pages
|
||||
//!
|
||||
//! # Architecture
|
||||
//!
|
||||
//! ```text
|
||||
//! ┌─────────────────────────────────────────────────────────┐
|
||||
//! │ 128MB Guest Memory │
|
||||
//! ├──────┬──────┬──────┬──────┬─────────────────────────────┤
|
||||
//! │ 2MB │ 2MB │ 2MB │ ... │ (64 chunks total) │
|
||||
//! │ hash │ hash │ hash │ │ │
|
||||
//! │ A │ B │ A │ │ ← Chunks A and A are same! │
|
||||
//! └──┬───┴──┬───┴──┬───┴──────┴─────────────────────────────┘
|
||||
//! │ │ │
|
||||
//! │ │ └──── Points to same CAS object
|
||||
//! ▼ ▼
|
||||
//! ┌──────────────────────────────────────────────────────────┐
|
||||
//! │ Stellarium CAS Store │
|
||||
//! │ sha256/ab/abc123... ← Chunk A (stored once) │
|
||||
//! │ sha256/de/def456... ← Chunk B │
|
||||
//! └──────────────────────────────────────────────────────────┘
|
||||
//! ```
|
||||
//!
|
||||
//! # Manifest Format
|
||||
//!
|
||||
//! The manifest (`memory-manifest.json`) lists all chunks:
|
||||
//!
|
||||
//! ```json
|
||||
//! {
|
||||
//! "version": 1,
|
||||
//! "chunk_size": 2097152,
|
||||
//! "total_size": 134217728,
|
||||
//! "chunks": [
|
||||
//! { "hash": "abc123...", "offset": 0, "size": 2097152 },
|
||||
//! { "hash": "def456...", "offset": 2097152, "size": 2097152 },
|
||||
//! ...
|
||||
//! ]
|
||||
//! }
|
||||
//! ```
|
||||
|
||||
use std::fs::{self, File};
|
||||
use std::io::{Read, Write};
|
||||
use std::num::NonZeroUsize;
|
||||
use std::os::fd::BorrowedFd;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use nix::sys::mman::{mmap, munmap, MapFlags, ProtFlags};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use sha2::{Sha256, Digest};
|
||||
use tracing::{debug, info, warn};
|
||||
|
||||
use super::{Result, SnapshotError, MemoryMapping};
|
||||
|
||||
/// CAS chunk size: 2MB (aligned with huge pages)
|
||||
pub const CAS_CHUNK_SIZE: usize = 2 * 1024 * 1024; // 2MB
|
||||
|
||||
/// CAS manifest version
|
||||
pub const CAS_MANIFEST_VERSION: u32 = 1;
|
||||
|
||||
/// Manifest file name
|
||||
pub const CAS_MANIFEST_FILENAME: &str = "memory-manifest.json";
|
||||
|
||||
// ============================================================================
|
||||
// CAS Manifest Types
|
||||
// ============================================================================
|
||||
|
||||
/// A single chunk in the CAS manifest
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct CasChunk {
|
||||
/// SHA-256 hash of the chunk (hex string, 64 chars)
|
||||
pub hash: String,
|
||||
/// Offset in guest physical memory
|
||||
pub offset: u64,
|
||||
/// Size of the chunk in bytes (always CAS_CHUNK_SIZE except possibly last)
|
||||
pub size: usize,
|
||||
}
|
||||
|
||||
/// CAS manifest describing memory as chunks
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct CasManifest {
|
||||
/// Manifest format version
|
||||
pub version: u32,
|
||||
/// Size of each chunk (2MB)
|
||||
pub chunk_size: usize,
|
||||
/// Total memory size in bytes
|
||||
pub total_size: u64,
|
||||
/// List of chunks with hashes and offsets
|
||||
pub chunks: Vec<CasChunk>,
|
||||
}
|
||||
|
||||
impl CasManifest {
|
||||
/// Load a CAS manifest from a file
|
||||
pub fn from_file(path: &Path) -> Result<Self> {
|
||||
let content = fs::read_to_string(path)?;
|
||||
let manifest: CasManifest = serde_json::from_str(&content)?;
|
||||
|
||||
// Validate version
|
||||
if manifest.version != CAS_MANIFEST_VERSION {
|
||||
return Err(SnapshotError::VersionMismatch {
|
||||
expected: CAS_MANIFEST_VERSION,
|
||||
actual: manifest.version,
|
||||
});
|
||||
}
|
||||
|
||||
// Validate chunk size
|
||||
if manifest.chunk_size != CAS_CHUNK_SIZE {
|
||||
return Err(SnapshotError::Invalid(format!(
|
||||
"Unsupported chunk size: {} (expected {})",
|
||||
manifest.chunk_size, CAS_CHUNK_SIZE
|
||||
)));
|
||||
}
|
||||
|
||||
Ok(manifest)
|
||||
}
|
||||
|
||||
/// Save the manifest to a file
|
||||
pub fn save(&self, path: &Path) -> Result<()> {
|
||||
let content = serde_json::to_string_pretty(self)?;
|
||||
let mut file = File::create(path)?;
|
||||
file.write_all(content.as_bytes())?;
|
||||
file.sync_all()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Create a new empty manifest for the given memory size
|
||||
pub fn new(memory_size: u64) -> Self {
|
||||
Self {
|
||||
version: CAS_MANIFEST_VERSION,
|
||||
chunk_size: CAS_CHUNK_SIZE,
|
||||
total_size: memory_size,
|
||||
chunks: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Add a chunk to the manifest
|
||||
pub fn add_chunk(&mut self, hash: String, offset: u64, size: usize) {
|
||||
self.chunks.push(CasChunk { hash, offset, size });
|
||||
}
|
||||
|
||||
/// Get the number of chunks
|
||||
pub fn chunk_count(&self) -> usize {
|
||||
self.chunks.len()
|
||||
}
|
||||
|
||||
/// Calculate expected number of chunks for the total size
|
||||
pub fn expected_chunk_count(&self) -> usize {
|
||||
((self.total_size as usize) + CAS_CHUNK_SIZE - 1) / CAS_CHUNK_SIZE
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// CAS Store Operations
|
||||
// ============================================================================
|
||||
|
||||
/// Get the path for a chunk in the CAS store
|
||||
///
|
||||
/// Follows Stellarium convention: `{cas_store}/sha256/{first2}/{hash}`
|
||||
pub fn cas_chunk_path(cas_store: &Path, hash: &str) -> PathBuf {
|
||||
let prefix = &hash[..2]; // First 2 chars for sharding
|
||||
cas_store.join("sha256").join(prefix).join(hash)
|
||||
}
|
||||
|
||||
/// Compute SHA-256 hash of a data chunk, returning hex string
|
||||
pub fn compute_chunk_hash(data: &[u8]) -> String {
|
||||
let mut hasher = Sha256::new();
|
||||
hasher.update(data);
|
||||
let result = hasher.finalize();
|
||||
hex::encode(result)
|
||||
}
|
||||
|
||||
/// Store a chunk in the CAS store if it doesn't already exist (dedup)
|
||||
///
|
||||
/// Returns the hash of the chunk and whether it was newly stored.
|
||||
pub fn store_chunk(cas_store: &Path, data: &[u8]) -> Result<(String, bool)> {
|
||||
let hash = compute_chunk_hash(data);
|
||||
let chunk_path = cas_chunk_path(cas_store, &hash);
|
||||
|
||||
// Check if chunk already exists (dedup!)
|
||||
if chunk_path.exists() {
|
||||
debug!("Chunk {} already exists (dedup)", &hash[..16]);
|
||||
return Ok((hash, false));
|
||||
}
|
||||
|
||||
// Create parent directories
|
||||
if let Some(parent) = chunk_path.parent() {
|
||||
fs::create_dir_all(parent)?;
|
||||
}
|
||||
|
||||
// Write chunk atomically (write to temp, then rename)
|
||||
let temp_path = chunk_path.with_extension("tmp");
|
||||
let mut file = File::create(&temp_path)?;
|
||||
file.write_all(data)?;
|
||||
file.sync_all()?;
|
||||
fs::rename(&temp_path, &chunk_path)?;
|
||||
|
||||
debug!("Stored new chunk {} ({} bytes)", &hash[..16], data.len());
|
||||
Ok((hash, true))
|
||||
}
|
||||
|
||||
/// Load a chunk from the CAS store
|
||||
pub fn load_chunk(cas_store: &Path, hash: &str) -> Result<Vec<u8>> {
|
||||
let chunk_path = cas_chunk_path(cas_store, hash);
|
||||
|
||||
if !chunk_path.exists() {
|
||||
return Err(SnapshotError::MissingFile(format!(
|
||||
"CAS chunk not found: {}",
|
||||
chunk_path.display()
|
||||
)));
|
||||
}
|
||||
|
||||
let mut file = File::open(&chunk_path)?;
|
||||
let mut data = Vec::new();
|
||||
file.read_to_end(&mut data)?;
|
||||
|
||||
// Verify hash
|
||||
let computed = compute_chunk_hash(&data);
|
||||
if computed != hash {
|
||||
return Err(SnapshotError::Invalid(format!(
|
||||
"CAS chunk hash mismatch: expected {}, got {}",
|
||||
hash, computed
|
||||
)));
|
||||
}
|
||||
|
||||
Ok(data)
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// CAS Memory Dump (Snapshot Creation)
|
||||
// ============================================================================
|
||||
|
||||
/// Result of a CAS memory dump operation
|
||||
#[derive(Debug)]
|
||||
pub struct CasDumpResult {
|
||||
/// The manifest describing all chunks
|
||||
pub manifest: CasManifest,
|
||||
/// Number of chunks that were deduplicated (already existed)
|
||||
pub dedup_count: usize,
|
||||
/// Number of new chunks stored
|
||||
pub new_count: usize,
|
||||
/// Total bytes saved by deduplication
|
||||
pub bytes_saved: u64,
|
||||
}
|
||||
|
||||
/// Dump guest memory to CAS store as 2MB chunks
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `memory` - Guest memory manager
|
||||
/// * `snapshot_dir` - Directory to write the manifest
|
||||
/// * `cas_store` - Path to the Stellarium CAS store
|
||||
///
|
||||
/// # Returns
|
||||
/// A `CasDumpResult` with the manifest and dedup statistics.
|
||||
pub fn dump_guest_memory_cas(
|
||||
memory: &crate::kvm::GuestMemoryManager,
|
||||
snapshot_dir: &Path,
|
||||
cas_store: &Path,
|
||||
) -> Result<CasDumpResult> {
|
||||
let start = std::time::Instant::now();
|
||||
let total_size = memory.total_size();
|
||||
|
||||
let mut manifest = CasManifest::new(total_size);
|
||||
let mut dedup_count = 0usize;
|
||||
let mut new_count = 0usize;
|
||||
let mut bytes_saved = 0u64;
|
||||
|
||||
// Get the contiguous memory region
|
||||
let regions = memory.regions();
|
||||
if regions.is_empty() {
|
||||
return Err(SnapshotError::Invalid("No memory regions".to_string()));
|
||||
}
|
||||
|
||||
// We assume a single contiguous region for simplicity
|
||||
let region = ®ions[0];
|
||||
let host_ptr = region.host_addr;
|
||||
let region_size = region.size as usize;
|
||||
|
||||
// Process memory in 2MB chunks
|
||||
let num_chunks = (region_size + CAS_CHUNK_SIZE - 1) / CAS_CHUNK_SIZE;
|
||||
debug!("Splitting {} MB memory into {} chunks of 2MB each",
|
||||
region_size / (1024 * 1024), num_chunks);
|
||||
|
||||
for i in 0..num_chunks {
|
||||
let offset = i * CAS_CHUNK_SIZE;
|
||||
let chunk_size = (region_size - offset).min(CAS_CHUNK_SIZE);
|
||||
|
||||
// Get pointer to this chunk
|
||||
let chunk_ptr = unsafe { host_ptr.add(offset) };
|
||||
let chunk_data = unsafe { std::slice::from_raw_parts(chunk_ptr, chunk_size) };
|
||||
|
||||
// Store chunk (with dedup check)
|
||||
let (hash, is_new) = store_chunk(cas_store, chunk_data)?;
|
||||
|
||||
if is_new {
|
||||
new_count += 1;
|
||||
} else {
|
||||
dedup_count += 1;
|
||||
bytes_saved += chunk_size as u64;
|
||||
}
|
||||
|
||||
// Add to manifest
|
||||
manifest.add_chunk(hash, offset as u64, chunk_size);
|
||||
}
|
||||
|
||||
// Save manifest
|
||||
let manifest_path = snapshot_dir.join(CAS_MANIFEST_FILENAME);
|
||||
manifest.save(&manifest_path)?;
|
||||
|
||||
let elapsed = start.elapsed();
|
||||
info!(
|
||||
"CAS memory dump: {} chunks ({} new, {} dedup), {} MB saved, {:.2}ms",
|
||||
manifest.chunk_count(),
|
||||
new_count,
|
||||
dedup_count,
|
||||
bytes_saved / (1024 * 1024),
|
||||
elapsed.as_secs_f64() * 1000.0
|
||||
);
|
||||
|
||||
Ok(CasDumpResult {
|
||||
manifest,
|
||||
dedup_count,
|
||||
new_count,
|
||||
bytes_saved,
|
||||
})
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// CAS Memory Restore (Snapshot Restore)
|
||||
// ============================================================================
|
||||
|
||||
/// mmap each CAS chunk individually into a contiguous memory region
|
||||
///
|
||||
/// This creates a single contiguous guest memory region by mmap'ing each
|
||||
/// 2MB chunk at the correct offset using MAP_FIXED.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `manifest` - The CAS manifest describing chunks
|
||||
/// * `cas_store` - Path to the Stellarium CAS store
|
||||
///
|
||||
/// # Returns
|
||||
/// A `Vec<MemoryMapping>` containing the mapped memory regions.
|
||||
pub fn cas_mmap_memory(
|
||||
manifest: &CasManifest,
|
||||
cas_store: &Path,
|
||||
) -> Result<Vec<MemoryMapping>> {
|
||||
let start = std::time::Instant::now();
|
||||
|
||||
if manifest.chunks.is_empty() {
|
||||
return Err(SnapshotError::Invalid("Empty CAS manifest".to_string()));
|
||||
}
|
||||
|
||||
// First, create an anonymous mapping for the full memory size
|
||||
// This reserves the address space and provides a base for MAP_FIXED
|
||||
let total_size = manifest.total_size as usize;
|
||||
let prot = ProtFlags::PROT_READ | ProtFlags::PROT_WRITE;
|
||||
let flags = MapFlags::MAP_PRIVATE | MapFlags::MAP_ANONYMOUS;
|
||||
|
||||
// For anonymous mappings, the fd is ignored but nix requires a valid AsFd.
|
||||
// We use BorrowedFd::borrow_raw(-1) which is the traditional way to indicate
|
||||
// no file backing (fd=-1 is ignored when MAP_ANONYMOUS is set).
|
||||
let base_addr = unsafe {
|
||||
let dummy_fd = BorrowedFd::borrow_raw(-1);
|
||||
mmap(
|
||||
None,
|
||||
NonZeroUsize::new(total_size).ok_or_else(|| {
|
||||
SnapshotError::Mmap("zero-size memory".to_string())
|
||||
})?,
|
||||
prot,
|
||||
flags,
|
||||
dummy_fd,
|
||||
0,
|
||||
)
|
||||
.map_err(|e| SnapshotError::Mmap(format!("initial mmap failed: {}", e)))?
|
||||
};
|
||||
|
||||
let base_ptr = base_addr.as_ptr() as *mut u8;
|
||||
debug!(
|
||||
"Reserved {} MB at {:p} for CAS restore",
|
||||
total_size / (1024 * 1024),
|
||||
base_ptr
|
||||
);
|
||||
|
||||
// Now mmap each chunk into the reserved region using MAP_FIXED
|
||||
for chunk in &manifest.chunks {
|
||||
let chunk_path = cas_chunk_path(cas_store, &chunk.hash);
|
||||
|
||||
if !chunk_path.exists() {
|
||||
// Clean up the base mapping before returning error
|
||||
let _ = unsafe { munmap(base_addr, total_size) };
|
||||
return Err(SnapshotError::MissingFile(format!(
|
||||
"CAS chunk not found: {} (hash: {}...)",
|
||||
chunk_path.display(),
|
||||
&chunk.hash[..16]
|
||||
)));
|
||||
}
|
||||
|
||||
let chunk_file = File::open(&chunk_path)?;
|
||||
let target_addr = unsafe { base_ptr.add(chunk.offset as usize) };
|
||||
|
||||
// MAP_FIXED replaces the anonymous mapping with file-backed mapping
|
||||
let mapped = unsafe {
|
||||
mmap(
|
||||
Some(NonZeroUsize::new(target_addr as usize).unwrap()),
|
||||
NonZeroUsize::new(chunk.size).ok_or_else(|| {
|
||||
SnapshotError::Mmap("zero-size chunk".to_string())
|
||||
})?,
|
||||
prot,
|
||||
MapFlags::MAP_PRIVATE | MapFlags::MAP_FIXED,
|
||||
&chunk_file,
|
||||
0,
|
||||
)
|
||||
.map_err(|e| {
|
||||
SnapshotError::Mmap(format!(
|
||||
"mmap chunk {} at offset 0x{:x} failed: {}",
|
||||
&chunk.hash[..16], chunk.offset, e
|
||||
))
|
||||
})?
|
||||
};
|
||||
|
||||
debug!(
|
||||
"Mapped CAS chunk {}... at offset 0x{:x} ({} bytes)",
|
||||
&chunk.hash[..16],
|
||||
chunk.offset,
|
||||
chunk.size
|
||||
);
|
||||
|
||||
// File can be closed; mmap keeps a reference
|
||||
// (File drops here)
|
||||
|
||||
// Verify the mapping is at the expected address
|
||||
if mapped.as_ptr() as usize != target_addr as usize {
|
||||
warn!(
|
||||
"MAP_FIXED returned different address: expected {:p}, got {:p}",
|
||||
target_addr,
|
||||
mapped.as_ptr()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
let elapsed = start.elapsed();
|
||||
info!(
|
||||
"CAS memory restored: {} chunks, {} MB, {:.2}ms",
|
||||
manifest.chunk_count(),
|
||||
total_size / (1024 * 1024),
|
||||
elapsed.as_secs_f64() * 1000.0
|
||||
);
|
||||
|
||||
// Return as a single contiguous mapping
|
||||
// Note: We don't drop the base mapping here; it's now composed of the chunk mappings
|
||||
Ok(vec![MemoryMapping {
|
||||
host_addr: base_ptr,
|
||||
size: total_size,
|
||||
guest_addr: 0, // Guest memory starts at physical address 0
|
||||
}])
|
||||
}
|
||||
|
||||
/// Check if a snapshot directory contains a CAS manifest
|
||||
pub fn has_cas_manifest(snapshot_dir: &Path) -> bool {
|
||||
snapshot_dir.join(CAS_MANIFEST_FILENAME).exists()
|
||||
}
|
||||
|
||||
/// Restore memory from either CAS or flat snapshot
|
||||
///
|
||||
/// Automatically detects the snapshot type and uses the appropriate method.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `snapshot_dir` - Path to the snapshot directory
|
||||
/// * `cas_store` - Optional path to CAS store (required if CAS manifest exists)
|
||||
///
|
||||
/// # Returns
|
||||
/// Memory mappings for the restored memory.
|
||||
pub fn restore_memory_auto(
|
||||
snapshot_dir: &Path,
|
||||
cas_store: Option<&Path>,
|
||||
) -> Result<Vec<MemoryMapping>> {
|
||||
let manifest_path = snapshot_dir.join(CAS_MANIFEST_FILENAME);
|
||||
|
||||
if manifest_path.exists() {
|
||||
// CAS-backed snapshot
|
||||
let cas_store = cas_store.ok_or_else(|| {
|
||||
SnapshotError::Invalid(
|
||||
"CAS manifest found but --cas-store not specified".to_string()
|
||||
)
|
||||
})?;
|
||||
|
||||
info!("Restoring memory from CAS manifest");
|
||||
let manifest = CasManifest::from_file(&manifest_path)?;
|
||||
cas_mmap_memory(&manifest, cas_store)
|
||||
} else {
|
||||
// Fall back to checking for flat memory.snap
|
||||
let mem_path = snapshot_dir.join("memory.snap");
|
||||
if !mem_path.exists() {
|
||||
return Err(SnapshotError::MissingFile(
|
||||
"Neither memory-manifest.json nor memory.snap found".to_string()
|
||||
));
|
||||
}
|
||||
|
||||
info!("Restoring memory from flat memory.snap");
|
||||
// This case is handled by the existing restore.rs code
|
||||
// Return an indicator that flat restore should be used
|
||||
Err(SnapshotError::Invalid(
|
||||
"USE_FLAT_RESTORE".to_string()
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Tests
|
||||
// ============================================================================
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use tempfile::TempDir;
|
||||
|
||||
#[test]
|
||||
fn test_compute_chunk_hash() {
|
||||
let data = b"Hello, World!";
|
||||
let hash = compute_chunk_hash(data);
|
||||
|
||||
// SHA-256 of "Hello, World!" is known
|
||||
assert_eq!(hash.len(), 64); // 256 bits = 64 hex chars
|
||||
assert!(hash.chars().all(|c| c.is_ascii_hexdigit()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_chunk_hash_deterministic() {
|
||||
let data = vec![0u8; CAS_CHUNK_SIZE];
|
||||
let hash1 = compute_chunk_hash(&data);
|
||||
let hash2 = compute_chunk_hash(&data);
|
||||
assert_eq!(hash1, hash2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cas_chunk_path() {
|
||||
let cas_store = Path::new("/var/cas");
|
||||
let hash = "abcdef1234567890abcdef1234567890abcdef1234567890abcdef1234567890";
|
||||
let path = cas_chunk_path(cas_store, hash);
|
||||
|
||||
assert_eq!(
|
||||
path,
|
||||
PathBuf::from("/var/cas/sha256/ab/abcdef1234567890abcdef1234567890abcdef1234567890abcdef1234567890")
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_store_and_load_chunk() {
|
||||
let temp_dir = TempDir::new().unwrap();
|
||||
let cas_store = temp_dir.path();
|
||||
|
||||
let data = b"Test chunk data for CAS storage";
|
||||
|
||||
// Store chunk
|
||||
let (hash, is_new) = store_chunk(cas_store, data).unwrap();
|
||||
assert!(is_new);
|
||||
assert_eq!(hash.len(), 64);
|
||||
|
||||
// Store same chunk again (should dedup)
|
||||
let (hash2, is_new2) = store_chunk(cas_store, data).unwrap();
|
||||
assert!(!is_new2);
|
||||
assert_eq!(hash, hash2);
|
||||
|
||||
// Load chunk
|
||||
let loaded = load_chunk(cas_store, &hash).unwrap();
|
||||
assert_eq!(loaded, data);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_manifest_serialization() {
|
||||
let mut manifest = CasManifest::new(128 * 1024 * 1024);
|
||||
manifest.add_chunk(
|
||||
"abc123".repeat(10) + "abcd", // 64 chars
|
||||
0,
|
||||
CAS_CHUNK_SIZE,
|
||||
);
|
||||
manifest.add_chunk(
|
||||
"def456".repeat(10) + "defg", // 64 chars
|
||||
CAS_CHUNK_SIZE as u64,
|
||||
CAS_CHUNK_SIZE,
|
||||
);
|
||||
|
||||
let temp_dir = TempDir::new().unwrap();
|
||||
let manifest_path = temp_dir.path().join("manifest.json");
|
||||
|
||||
// Save
|
||||
manifest.save(&manifest_path).unwrap();
|
||||
|
||||
// Load
|
||||
let loaded = CasManifest::from_file(&manifest_path).unwrap();
|
||||
|
||||
assert_eq!(loaded.version, manifest.version);
|
||||
assert_eq!(loaded.chunk_size, manifest.chunk_size);
|
||||
assert_eq!(loaded.total_size, manifest.total_size);
|
||||
assert_eq!(loaded.chunks.len(), 2);
|
||||
assert_eq!(loaded.chunks[0].offset, 0);
|
||||
assert_eq!(loaded.chunks[1].offset, CAS_CHUNK_SIZE as u64);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_dedup_identical_chunks() {
|
||||
let temp_dir = TempDir::new().unwrap();
|
||||
let cas_store = temp_dir.path();
|
||||
|
||||
// Two identical chunks
|
||||
let data = vec![0xABu8; 1024];
|
||||
|
||||
let (hash1, is_new1) = store_chunk(cas_store, &data).unwrap();
|
||||
let (hash2, is_new2) = store_chunk(cas_store, &data).unwrap();
|
||||
|
||||
assert!(is_new1);
|
||||
assert!(!is_new2); // Dedup!
|
||||
assert_eq!(hash1, hash2);
|
||||
|
||||
// Different chunk
|
||||
let data2 = vec![0xCDu8; 1024];
|
||||
let (hash3, is_new3) = store_chunk(cas_store, &data2).unwrap();
|
||||
|
||||
assert!(is_new3);
|
||||
assert_ne!(hash1, hash3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_has_cas_manifest() {
|
||||
let temp_dir = TempDir::new().unwrap();
|
||||
|
||||
// No manifest
|
||||
assert!(!has_cas_manifest(temp_dir.path()));
|
||||
|
||||
// Create manifest
|
||||
let manifest = CasManifest::new(128 * 1024 * 1024);
|
||||
manifest.save(&temp_dir.path().join(CAS_MANIFEST_FILENAME)).unwrap();
|
||||
|
||||
// Now it exists
|
||||
assert!(has_cas_manifest(temp_dir.path()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_expected_chunk_count() {
|
||||
// Exactly divisible
|
||||
let manifest = CasManifest::new(128 * 1024 * 1024);
|
||||
assert_eq!(manifest.expected_chunk_count(), 64); // 128MB / 2MB = 64
|
||||
|
||||
// Not exactly divisible
|
||||
let manifest2 = CasManifest::new(129 * 1024 * 1024);
|
||||
assert_eq!(manifest2.expected_chunk_count(), 65); // Rounds up
|
||||
|
||||
// Small memory
|
||||
let manifest3 = CasManifest::new(1024 * 1024);
|
||||
assert_eq!(manifest3.expected_chunk_count(), 1); // Less than one chunk
|
||||
}
|
||||
}
|
||||
776
vmm/src/snapshot/create.rs
Normal file
776
vmm/src/snapshot/create.rs
Normal file
@@ -0,0 +1,776 @@
|
||||
//! Snapshot Creation
|
||||
//!
|
||||
//! Creates a point-in-time snapshot of a running VM by:
|
||||
//! 1. Pausing all vCPUs
|
||||
//! 2. Extracting KVM state (registers, IRQ chip, clock)
|
||||
//! 3. Serializing device state
|
||||
//! 4. Dumping guest memory to a file
|
||||
//! 5. Writing state metadata with CRC-64 integrity
|
||||
//! 6. Resuming vCPUs
|
||||
|
||||
use std::fs::{self, File};
|
||||
use std::io::Write;
|
||||
use std::path::Path;
|
||||
use std::time::{SystemTime, UNIX_EPOCH};
|
||||
|
||||
use kvm_bindings::{
|
||||
kvm_irqchip, kvm_msr_entry, kvm_pit_state2,
|
||||
Msrs,
|
||||
KVM_IRQCHIP_IOAPIC, KVM_IRQCHIP_PIC_MASTER, KVM_IRQCHIP_PIC_SLAVE,
|
||||
};
|
||||
use kvm_ioctls::VmFd;
|
||||
use tracing::{debug, info, warn};
|
||||
|
||||
use super::*;
|
||||
|
||||
/// Well-known MSR indices to save
|
||||
const MSRS_TO_SAVE: &[u32] = &[
|
||||
0x174, // MSR_IA32_SYSENTER_CS
|
||||
0x175, // MSR_IA32_SYSENTER_ESP
|
||||
0x176, // MSR_IA32_SYSENTER_EIP
|
||||
0x1a0, // MSR_IA32_MISC_ENABLE
|
||||
0xc0000081, // MSR_STAR
|
||||
0xc0000082, // MSR_LSTAR
|
||||
0xc0000083, // MSR_CSTAR
|
||||
0xc0000084, // MSR_SYSCALL_MASK
|
||||
0xc0000102, // MSR_KERNEL_GS_BASE
|
||||
0xc0000100, // MSR_FS_BASE
|
||||
0xc0000101, // MSR_GS_BASE
|
||||
0x10, // MSR_IA32_TSC
|
||||
0x2ff, // MSR_MTRR_DEF_TYPE
|
||||
0x277, // MSR_IA32_CR_PAT
|
||||
0x48, // MSR_IA32_SPEC_CTRL (if supported)
|
||||
0xc0000080, // MSR_EFER
|
||||
0x8b, // MSR_IA32_BIOS_SIGN_ID (microcode version)
|
||||
0xfe, // MSR_IA32_MTRRCAP
|
||||
0x200, 0x201, // MSR_MTRR_PHYSBASE0, PHYSMASK0
|
||||
0x202, 0x203, // MSR_MTRR_PHYSBASE1, PHYSMASK1
|
||||
0x204, 0x205, // MSR_MTRR_PHYSBASE2, PHYSMASK2
|
||||
0x206, 0x207, // MSR_MTRR_PHYSBASE3, PHYSMASK3
|
||||
0x250, // MSR_MTRR_FIX64K_00000
|
||||
0x258, // MSR_MTRR_FIX16K_80000
|
||||
0x259, // MSR_MTRR_FIX16K_A0000
|
||||
0x268, 0x269, 0x26a, 0x26b, // MSR_MTRR_FIX4K_*
|
||||
0x26c, 0x26d, 0x26e, 0x26f,
|
||||
0x38d, // MSR_IA32_FIXED_CTR_CTRL
|
||||
0x38f, // MSR_IA32_PERF_GLOBAL_CTRL
|
||||
0x6e0, // MSR_IA32_TSC_DEADLINE
|
||||
];
|
||||
|
||||
/// Create a snapshot of the given VM and save it to the specified directory.
|
||||
///
|
||||
/// The snapshot directory will contain:
|
||||
/// - `state.json`: Serialized VM state with CRC-64 integrity
|
||||
/// - `memory.snap`: Raw guest memory dump
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `vm_fd` - The KVM VM file descriptor
|
||||
/// * `vcpu_fds` - Locked vCPU file descriptors (must be paused)
|
||||
/// * `memory` - Guest memory manager
|
||||
/// * `serial` - Serial device state
|
||||
/// * `mmio_devices` - MMIO device manager
|
||||
/// * `snapshot_dir` - Directory to write snapshot files
|
||||
pub fn create_snapshot(
|
||||
vm_fd: &VmFd,
|
||||
vcpu_fds: &[&kvm_ioctls::VcpuFd],
|
||||
memory: &crate::kvm::GuestMemoryManager,
|
||||
serial: &crate::devices::serial::Serial,
|
||||
snapshot_dir: &Path,
|
||||
) -> Result<()> {
|
||||
let start = std::time::Instant::now();
|
||||
|
||||
// Ensure snapshot directory exists
|
||||
fs::create_dir_all(snapshot_dir)?;
|
||||
|
||||
info!("Creating snapshot at {}", snapshot_dir.display());
|
||||
|
||||
// Step 1: Save vCPU state
|
||||
let vcpu_states = save_vcpu_states(vcpu_fds)?;
|
||||
let t_vcpu = start.elapsed();
|
||||
debug!("vCPU state saved in {:.2}ms", t_vcpu.as_secs_f64() * 1000.0);
|
||||
|
||||
// Step 2: Save IRQ chip state
|
||||
let irqchip = save_irqchip_state(vm_fd)?;
|
||||
let t_irq = start.elapsed();
|
||||
debug!(
|
||||
"IRQ chip state saved in {:.2}ms",
|
||||
(t_irq - t_vcpu).as_secs_f64() * 1000.0
|
||||
);
|
||||
|
||||
// Step 3: Save clock
|
||||
let clock = save_clock_state(vm_fd)?;
|
||||
let t_clock = start.elapsed();
|
||||
debug!(
|
||||
"Clock state saved in {:.2}ms",
|
||||
(t_clock - t_irq).as_secs_f64() * 1000.0
|
||||
);
|
||||
|
||||
// Step 4: Save device state
|
||||
let devices = save_device_state(serial)?;
|
||||
let t_dev = start.elapsed();
|
||||
debug!(
|
||||
"Device state saved in {:.2}ms",
|
||||
(t_dev - t_clock).as_secs_f64() * 1000.0
|
||||
);
|
||||
|
||||
// Step 5: Dump guest memory
|
||||
let (memory_regions, memory_file_size) = dump_guest_memory(memory, snapshot_dir)?;
|
||||
let t_mem = start.elapsed();
|
||||
debug!(
|
||||
"Memory dumped ({} MB) in {:.2}ms",
|
||||
memory_file_size / (1024 * 1024),
|
||||
(t_mem - t_dev).as_secs_f64() * 1000.0
|
||||
);
|
||||
|
||||
// Step 6: Build snapshot and write state.json
|
||||
let snapshot = VmSnapshot {
|
||||
metadata: SnapshotMetadata {
|
||||
version: SNAPSHOT_VERSION,
|
||||
memory_size: memory.total_size(),
|
||||
vcpu_count: vcpu_fds.len() as u8,
|
||||
created_at: SystemTime::now()
|
||||
.duration_since(UNIX_EPOCH)
|
||||
.unwrap_or_default()
|
||||
.as_secs(),
|
||||
state_crc64: 0, // Placeholder, computed below
|
||||
memory_file_size,
|
||||
},
|
||||
vcpu_states,
|
||||
irqchip,
|
||||
clock,
|
||||
devices,
|
||||
memory_regions,
|
||||
};
|
||||
|
||||
// Serialize and compute CRC
|
||||
let mut state_json = serde_json::to_string_pretty(&snapshot)?;
|
||||
|
||||
// Compute CRC over the state (with crc64 = 0) and patch it in
|
||||
let crc = compute_crc64(state_json.as_bytes());
|
||||
let mut final_snapshot = snapshot;
|
||||
final_snapshot.metadata.state_crc64 = crc;
|
||||
state_json = serde_json::to_string_pretty(&final_snapshot)?;
|
||||
|
||||
let state_path = snapshot_dir.join("state.json");
|
||||
let mut state_file = File::create(&state_path)?;
|
||||
state_file.write_all(state_json.as_bytes())?;
|
||||
state_file.sync_all()?;
|
||||
|
||||
let t_total = start.elapsed();
|
||||
info!(
|
||||
"Snapshot created: {} vCPUs, {} MB memory, {:.2}ms total \
|
||||
[vcpu={:.2}ms, irq={:.2}ms, clock={:.2}ms, dev={:.2}ms, mem={:.2}ms, write={:.2}ms]",
|
||||
vcpu_fds.len(),
|
||||
memory.total_size() / (1024 * 1024),
|
||||
t_total.as_secs_f64() * 1000.0,
|
||||
t_vcpu.as_secs_f64() * 1000.0,
|
||||
(t_irq - t_vcpu).as_secs_f64() * 1000.0,
|
||||
(t_clock - t_irq).as_secs_f64() * 1000.0,
|
||||
(t_dev - t_clock).as_secs_f64() * 1000.0,
|
||||
(t_mem - t_dev).as_secs_f64() * 1000.0,
|
||||
(t_total - t_mem).as_secs_f64() * 1000.0,
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// vCPU State Extraction
|
||||
// ============================================================================
|
||||
|
||||
fn save_vcpu_states(vcpu_fds: &[&kvm_ioctls::VcpuFd]) -> Result<Vec<VcpuState>> {
|
||||
let mut states = Vec::with_capacity(vcpu_fds.len());
|
||||
|
||||
for (id, vcpu_fd) in vcpu_fds.iter().enumerate() {
|
||||
let state = save_single_vcpu_state(id as u8, vcpu_fd)?;
|
||||
states.push(state);
|
||||
}
|
||||
|
||||
Ok(states)
|
||||
}
|
||||
|
||||
fn save_single_vcpu_state(id: u8, vcpu_fd: &kvm_ioctls::VcpuFd) -> Result<VcpuState> {
|
||||
// General purpose registers
|
||||
let regs = vcpu_fd
|
||||
.get_regs()
|
||||
.map_err(|e| SnapshotError::Kvm(format!("get_regs vCPU {}: {}", id, e)))?;
|
||||
|
||||
let serializable_regs = SerializableRegs {
|
||||
rax: regs.rax,
|
||||
rbx: regs.rbx,
|
||||
rcx: regs.rcx,
|
||||
rdx: regs.rdx,
|
||||
rsi: regs.rsi,
|
||||
rdi: regs.rdi,
|
||||
rsp: regs.rsp,
|
||||
rbp: regs.rbp,
|
||||
r8: regs.r8,
|
||||
r9: regs.r9,
|
||||
r10: regs.r10,
|
||||
r11: regs.r11,
|
||||
r12: regs.r12,
|
||||
r13: regs.r13,
|
||||
r14: regs.r14,
|
||||
r15: regs.r15,
|
||||
rip: regs.rip,
|
||||
rflags: regs.rflags,
|
||||
};
|
||||
|
||||
// Special registers
|
||||
let sregs = vcpu_fd
|
||||
.get_sregs()
|
||||
.map_err(|e| SnapshotError::Kvm(format!("get_sregs vCPU {}: {}", id, e)))?;
|
||||
|
||||
let serializable_sregs = serialize_sregs(&sregs);
|
||||
|
||||
// FPU state
|
||||
let fpu = vcpu_fd
|
||||
.get_fpu()
|
||||
.map_err(|e| SnapshotError::Kvm(format!("get_fpu vCPU {}: {}", id, e)))?;
|
||||
|
||||
let serializable_fpu = serialize_fpu(&fpu);
|
||||
|
||||
// MSRs
|
||||
let msrs = save_msrs(vcpu_fd, id)?;
|
||||
|
||||
// CPUID
|
||||
let cpuid_entries = save_cpuid(vcpu_fd, id)?;
|
||||
|
||||
// LAPIC
|
||||
let lapic = vcpu_fd
|
||||
.get_lapic()
|
||||
.map_err(|e| SnapshotError::Kvm(format!("get_lapic vCPU {}: {}", id, e)))?;
|
||||
|
||||
let serializable_lapic = SerializableLapic {
|
||||
regs: lapic.regs.iter().map(|&b| b as u8).collect(),
|
||||
};
|
||||
|
||||
// XCRs
|
||||
let xcrs = save_xcrs(vcpu_fd, id);
|
||||
|
||||
// MP state
|
||||
let mp_state = vcpu_fd
|
||||
.get_mp_state()
|
||||
.map_err(|e| SnapshotError::Kvm(format!("get_mp_state vCPU {}: {}", id, e)))?;
|
||||
|
||||
// vCPU events
|
||||
let events = save_vcpu_events(vcpu_fd, id)?;
|
||||
|
||||
Ok(VcpuState {
|
||||
id,
|
||||
regs: serializable_regs,
|
||||
sregs: serializable_sregs,
|
||||
fpu: serializable_fpu,
|
||||
msrs,
|
||||
cpuid_entries,
|
||||
lapic: serializable_lapic,
|
||||
xcrs,
|
||||
mp_state: mp_state.mp_state,
|
||||
events,
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_sregs(sregs: &kvm_bindings::kvm_sregs) -> SerializableSregs {
|
||||
SerializableSregs {
|
||||
cs: serialize_segment(&sregs.cs),
|
||||
ds: serialize_segment(&sregs.ds),
|
||||
es: serialize_segment(&sregs.es),
|
||||
fs: serialize_segment(&sregs.fs),
|
||||
gs: serialize_segment(&sregs.gs),
|
||||
ss: serialize_segment(&sregs.ss),
|
||||
tr: serialize_segment(&sregs.tr),
|
||||
ldt: serialize_segment(&sregs.ldt),
|
||||
gdt: SerializableDtable {
|
||||
base: sregs.gdt.base,
|
||||
limit: sregs.gdt.limit,
|
||||
},
|
||||
idt: SerializableDtable {
|
||||
base: sregs.idt.base,
|
||||
limit: sregs.idt.limit,
|
||||
},
|
||||
cr0: sregs.cr0,
|
||||
cr2: sregs.cr2,
|
||||
cr3: sregs.cr3,
|
||||
cr4: sregs.cr4,
|
||||
cr8: sregs.cr8,
|
||||
efer: sregs.efer,
|
||||
apic_base: sregs.apic_base,
|
||||
interrupt_bitmap: sregs.interrupt_bitmap,
|
||||
}
|
||||
}
|
||||
|
||||
fn serialize_segment(seg: &kvm_bindings::kvm_segment) -> SerializableSegment {
|
||||
SerializableSegment {
|
||||
base: seg.base,
|
||||
limit: seg.limit,
|
||||
selector: seg.selector,
|
||||
type_: seg.type_,
|
||||
present: seg.present,
|
||||
dpl: seg.dpl,
|
||||
db: seg.db,
|
||||
s: seg.s,
|
||||
l: seg.l,
|
||||
g: seg.g,
|
||||
avl: seg.avl,
|
||||
unusable: seg.unusable,
|
||||
}
|
||||
}
|
||||
|
||||
fn serialize_fpu(fpu: &kvm_bindings::kvm_fpu) -> SerializableFpu {
|
||||
let fpr: Vec<Vec<u8>> = fpu.fpr.iter().map(|r| r.to_vec()).collect();
|
||||
let xmm: Vec<Vec<u8>> = fpu.xmm.iter().map(|r| r.to_vec()).collect();
|
||||
|
||||
SerializableFpu {
|
||||
fpr,
|
||||
fcw: fpu.fcw,
|
||||
fsw: fpu.fsw,
|
||||
ftwx: fpu.ftwx,
|
||||
last_opcode: fpu.last_opcode,
|
||||
last_ip: fpu.last_ip,
|
||||
last_dp: fpu.last_dp,
|
||||
xmm,
|
||||
mxcsr: fpu.mxcsr,
|
||||
}
|
||||
}
|
||||
|
||||
fn save_msrs(vcpu_fd: &kvm_ioctls::VcpuFd, id: u8) -> Result<Vec<SerializableMsr>> {
|
||||
let msr_entries: Vec<kvm_msr_entry> = MSRS_TO_SAVE
|
||||
.iter()
|
||||
.map(|&index| kvm_msr_entry {
|
||||
index,
|
||||
data: 0,
|
||||
..Default::default()
|
||||
})
|
||||
.collect();
|
||||
|
||||
let mut msrs = Msrs::from_entries(&msr_entries)
|
||||
.map_err(|e| SnapshotError::Kvm(format!("create MSR list for vCPU {}: {:?}", id, e)))?;
|
||||
|
||||
let nmsrs = vcpu_fd
|
||||
.get_msrs(&mut msrs)
|
||||
.map_err(|e| SnapshotError::Kvm(format!("get_msrs vCPU {}: {}", id, e)))?;
|
||||
|
||||
let result: Vec<SerializableMsr> = msrs.as_slice()[..nmsrs]
|
||||
.iter()
|
||||
.map(|e| SerializableMsr {
|
||||
index: e.index,
|
||||
data: e.data,
|
||||
})
|
||||
.collect();
|
||||
|
||||
debug!("vCPU {}: saved {}/{} MSRs", id, nmsrs, MSRS_TO_SAVE.len());
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
fn save_cpuid(vcpu_fd: &kvm_ioctls::VcpuFd, id: u8) -> Result<Vec<SerializableCpuidEntry>> {
|
||||
// Try to get CPUID with enough space for all entries
|
||||
// KVM_MAX_CPUID_ENTRIES is 80; use that as default, retry with larger if needed
|
||||
let cpuid = vcpu_fd
|
||||
.get_cpuid2(80)
|
||||
.or_else(|_| vcpu_fd.get_cpuid2(128))
|
||||
.or_else(|_| vcpu_fd.get_cpuid2(256))
|
||||
.map_err(|e| SnapshotError::Kvm(format!("get_cpuid2 vCPU {}: {}", id, e)))?;
|
||||
|
||||
let entries: Vec<SerializableCpuidEntry> = cpuid
|
||||
.as_slice()
|
||||
.iter()
|
||||
.map(|e| SerializableCpuidEntry {
|
||||
function: e.function,
|
||||
index: e.index,
|
||||
flags: e.flags,
|
||||
eax: e.eax,
|
||||
ebx: e.ebx,
|
||||
ecx: e.ecx,
|
||||
edx: e.edx,
|
||||
})
|
||||
.collect();
|
||||
|
||||
debug!("vCPU {}: saved {} CPUID entries", id, entries.len());
|
||||
Ok(entries)
|
||||
}
|
||||
|
||||
fn save_xcrs(vcpu_fd: &kvm_ioctls::VcpuFd, id: u8) -> Vec<SerializableXcr> {
|
||||
match vcpu_fd.get_xcrs() {
|
||||
Ok(xcrs) => {
|
||||
let entries: Vec<SerializableXcr> = (0..xcrs.nr_xcrs as usize)
|
||||
.map(|i| SerializableXcr {
|
||||
xcr: xcrs.xcrs[i].xcr,
|
||||
value: xcrs.xcrs[i].value,
|
||||
})
|
||||
.collect();
|
||||
debug!("vCPU {}: saved {} XCRs", id, entries.len());
|
||||
entries
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("vCPU {}: get_xcrs not supported: {}", id, e);
|
||||
Vec::new()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn save_vcpu_events(vcpu_fd: &kvm_ioctls::VcpuFd, id: u8) -> Result<SerializableVcpuEvents> {
|
||||
let events = vcpu_fd
|
||||
.get_vcpu_events()
|
||||
.map_err(|e| SnapshotError::Kvm(format!("get_vcpu_events vCPU {}: {}", id, e)))?;
|
||||
|
||||
Ok(SerializableVcpuEvents {
|
||||
exception_injected: events.exception.injected,
|
||||
exception_nr: events.exception.nr,
|
||||
exception_has_error_code: events.exception.has_error_code,
|
||||
exception_error_code: events.exception.error_code,
|
||||
interrupt_injected: events.interrupt.injected,
|
||||
interrupt_nr: events.interrupt.nr,
|
||||
interrupt_soft: events.interrupt.soft,
|
||||
interrupt_shadow: events.interrupt.shadow,
|
||||
nmi_injected: events.nmi.injected,
|
||||
nmi_pending: events.nmi.pending,
|
||||
nmi_masked: events.nmi.masked,
|
||||
smi_smm: events.smi.smm,
|
||||
smi_pending: events.smi.pending,
|
||||
smi_smm_inside_nmi: events.smi.smm_inside_nmi,
|
||||
smi_latched_init: events.smi.latched_init,
|
||||
flags: events.flags,
|
||||
})
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// IRQ Chip State Extraction
|
||||
// ============================================================================
|
||||
|
||||
fn save_irqchip_state(vm_fd: &VmFd) -> Result<IrqchipState> {
|
||||
// PIC master (chip 0)
|
||||
let mut pic_master = kvm_irqchip {
|
||||
chip_id: KVM_IRQCHIP_PIC_MASTER,
|
||||
..Default::default()
|
||||
};
|
||||
vm_fd
|
||||
.get_irqchip(&mut pic_master)
|
||||
.map_err(|e| SnapshotError::Kvm(format!("get_irqchip PIC master: {}", e)))?;
|
||||
|
||||
// PIC slave (chip 1)
|
||||
let mut pic_slave = kvm_irqchip {
|
||||
chip_id: KVM_IRQCHIP_PIC_SLAVE,
|
||||
..Default::default()
|
||||
};
|
||||
vm_fd
|
||||
.get_irqchip(&mut pic_slave)
|
||||
.map_err(|e| SnapshotError::Kvm(format!("get_irqchip PIC slave: {}", e)))?;
|
||||
|
||||
// IOAPIC (chip 2)
|
||||
let mut ioapic = kvm_irqchip {
|
||||
chip_id: KVM_IRQCHIP_IOAPIC,
|
||||
..Default::default()
|
||||
};
|
||||
vm_fd
|
||||
.get_irqchip(&mut ioapic)
|
||||
.map_err(|e| SnapshotError::Kvm(format!("get_irqchip IOAPIC: {}", e)))?;
|
||||
|
||||
// PIT state
|
||||
let pit = vm_fd
|
||||
.get_pit2()
|
||||
.map_err(|e| SnapshotError::Kvm(format!("get_pit2: {}", e)))?;
|
||||
|
||||
Ok(IrqchipState {
|
||||
pic_master: SerializablePicState {
|
||||
raw_data: unsafe {
|
||||
std::slice::from_raw_parts(
|
||||
&pic_master.chip as *const _ as *const u8,
|
||||
std::mem::size_of_val(&pic_master.chip),
|
||||
)
|
||||
.to_vec()
|
||||
},
|
||||
},
|
||||
pic_slave: SerializablePicState {
|
||||
raw_data: unsafe {
|
||||
std::slice::from_raw_parts(
|
||||
&pic_slave.chip as *const _ as *const u8,
|
||||
std::mem::size_of_val(&pic_slave.chip),
|
||||
)
|
||||
.to_vec()
|
||||
},
|
||||
},
|
||||
ioapic: SerializableIoapicState {
|
||||
raw_data: unsafe {
|
||||
std::slice::from_raw_parts(
|
||||
&ioapic.chip as *const _ as *const u8,
|
||||
std::mem::size_of_val(&ioapic.chip),
|
||||
)
|
||||
.to_vec()
|
||||
},
|
||||
},
|
||||
pit: serialize_pit_state(&pit),
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_pit_state(pit: &kvm_pit_state2) -> SerializablePitState {
|
||||
let channels: Vec<SerializablePitChannel> = pit
|
||||
.channels
|
||||
.iter()
|
||||
.map(|ch| SerializablePitChannel {
|
||||
count: ch.count,
|
||||
latched_count: ch.latched_count,
|
||||
count_latched: ch.count_latched,
|
||||
status_latched: ch.status_latched,
|
||||
status: ch.status,
|
||||
read_state: ch.read_state,
|
||||
write_state: ch.write_state,
|
||||
write_latch: ch.write_latch,
|
||||
rw_mode: ch.rw_mode,
|
||||
mode: ch.mode,
|
||||
bcd: ch.bcd,
|
||||
gate: ch.gate,
|
||||
count_load_time: ch.count_load_time,
|
||||
})
|
||||
.collect();
|
||||
|
||||
SerializablePitState {
|
||||
channels,
|
||||
flags: pit.flags,
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Clock State
|
||||
// ============================================================================
|
||||
|
||||
fn save_clock_state(vm_fd: &VmFd) -> Result<ClockState> {
|
||||
let clock = vm_fd
|
||||
.get_clock()
|
||||
.map_err(|e| SnapshotError::Kvm(format!("get_clock: {}", e)))?;
|
||||
|
||||
Ok(ClockState {
|
||||
clock: clock.clock,
|
||||
flags: clock.flags,
|
||||
})
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Device State
|
||||
// ============================================================================
|
||||
|
||||
fn save_device_state(serial: &crate::devices::serial::Serial) -> Result<DeviceState> {
|
||||
Ok(DeviceState {
|
||||
serial: save_serial_state(serial),
|
||||
virtio_blk: None, // TODO: Extract from running device if needed
|
||||
virtio_net: None, // TODO: Extract from running device if needed
|
||||
mmio_transports: Vec::new(), // TODO: Extract MMIO transport state
|
||||
})
|
||||
}
|
||||
|
||||
fn save_serial_state(_serial: &crate::devices::serial::Serial) -> SerializableSerialState {
|
||||
// The serial struct fields are private, so we save what we can observe.
|
||||
// For a complete snapshot, the Serial struct would need accessor methods.
|
||||
// For now, we save the default/reset state and rely on the guest
|
||||
// re-initializing the serial device on resume.
|
||||
SerializableSerialState {
|
||||
dlab: false,
|
||||
ier: 0,
|
||||
lcr: 0,
|
||||
mcr: 0,
|
||||
lsr: 0x60, // THR_EMPTY | THR_TSR_EMPTY
|
||||
msr: 0,
|
||||
scr: 0,
|
||||
dll: 0,
|
||||
dlh: 0,
|
||||
thr_interrupt_pending: false,
|
||||
input_buffer: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Memory Dump
|
||||
// ============================================================================
|
||||
|
||||
fn dump_guest_memory(
|
||||
memory: &crate::kvm::GuestMemoryManager,
|
||||
snapshot_dir: &Path,
|
||||
) -> Result<(Vec<SerializableMemoryRegion>, u64)> {
|
||||
let mem_path = snapshot_dir.join("memory.snap");
|
||||
let mut mem_file = File::create(&mem_path)?;
|
||||
let mut file_offset: u64 = 0;
|
||||
let mut regions = Vec::new();
|
||||
|
||||
for region in memory.regions() {
|
||||
let size = region.size as usize;
|
||||
let host_ptr = region.host_addr;
|
||||
|
||||
// Write the memory region directly from the mmap'd area
|
||||
let data = unsafe { std::slice::from_raw_parts(host_ptr, size) };
|
||||
mem_file.write_all(data)?;
|
||||
|
||||
regions.push(SerializableMemoryRegion {
|
||||
guest_addr: region.guest_addr,
|
||||
size: region.size,
|
||||
file_offset,
|
||||
});
|
||||
|
||||
file_offset += region.size;
|
||||
}
|
||||
|
||||
mem_file.sync_all()?;
|
||||
let total_size = file_offset;
|
||||
|
||||
Ok((regions, total_size))
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// CAS Memory Dump (for Stellarium integration)
|
||||
// ============================================================================
|
||||
|
||||
/// Dump guest memory to CAS store as 2MB chunks.
|
||||
///
|
||||
/// This is an alternative to `dump_guest_memory()` that stores memory as
|
||||
/// content-addressed 2MB chunks in a Stellarium CAS store.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `memory` - Guest memory manager
|
||||
/// * `snapshot_dir` - Directory to write the manifest
|
||||
/// * `cas_store` - Path to the Stellarium CAS store
|
||||
///
|
||||
/// # Returns
|
||||
/// Tuple of (memory_regions for state.json, memory_file_size placeholder).
|
||||
/// The actual chunks are stored in the CAS store, not in snapshot_dir.
|
||||
pub fn dump_guest_memory_cas(
|
||||
memory: &crate::kvm::GuestMemoryManager,
|
||||
snapshot_dir: &Path,
|
||||
cas_store: &Path,
|
||||
) -> Result<(Vec<SerializableMemoryRegion>, u64)> {
|
||||
use super::cas;
|
||||
|
||||
let result = cas::dump_guest_memory_cas(memory, snapshot_dir, cas_store)?;
|
||||
|
||||
// Build memory regions from the manifest
|
||||
// For CAS snapshots, we use a single contiguous region at guest address 0
|
||||
let regions = vec![SerializableMemoryRegion {
|
||||
guest_addr: 0,
|
||||
size: result.manifest.total_size,
|
||||
file_offset: 0, // Not applicable for CAS
|
||||
}];
|
||||
|
||||
info!(
|
||||
"CAS memory dump complete: {} chunks ({} new, {} dedup)",
|
||||
result.manifest.chunk_count(),
|
||||
result.new_count,
|
||||
result.dedup_count
|
||||
);
|
||||
|
||||
// Return 0 for memory_file_size since we don't create memory.snap
|
||||
// The manifest file size is small and not tracked in metadata
|
||||
Ok((regions, 0))
|
||||
}
|
||||
|
||||
/// Create a snapshot with optional CAS storage.
|
||||
///
|
||||
/// If `cas_store` is Some, memory is stored as CAS chunks.
|
||||
/// Otherwise, memory is stored as a flat `memory.snap` file.
|
||||
pub fn create_snapshot_with_cas(
|
||||
vm_fd: &VmFd,
|
||||
vcpu_fds: &[&kvm_ioctls::VcpuFd],
|
||||
memory: &crate::kvm::GuestMemoryManager,
|
||||
serial: &crate::devices::serial::Serial,
|
||||
snapshot_dir: &Path,
|
||||
cas_store: Option<&Path>,
|
||||
) -> Result<()> {
|
||||
let start = std::time::Instant::now();
|
||||
|
||||
// Ensure snapshot directory exists
|
||||
fs::create_dir_all(snapshot_dir)?;
|
||||
|
||||
info!(
|
||||
"Creating snapshot at {} (CAS: {})",
|
||||
snapshot_dir.display(),
|
||||
cas_store.map(|p| p.display().to_string()).unwrap_or_else(|| "disabled".to_string())
|
||||
);
|
||||
|
||||
// Step 1: Save vCPU state
|
||||
let vcpu_states = save_vcpu_states(vcpu_fds)?;
|
||||
let t_vcpu = start.elapsed();
|
||||
debug!("vCPU state saved in {:.2}ms", t_vcpu.as_secs_f64() * 1000.0);
|
||||
|
||||
// Step 2: Save IRQ chip state
|
||||
let irqchip = save_irqchip_state(vm_fd)?;
|
||||
let t_irq = start.elapsed();
|
||||
debug!(
|
||||
"IRQ chip state saved in {:.2}ms",
|
||||
(t_irq - t_vcpu).as_secs_f64() * 1000.0
|
||||
);
|
||||
|
||||
// Step 3: Save clock
|
||||
let clock = save_clock_state(vm_fd)?;
|
||||
let t_clock = start.elapsed();
|
||||
debug!(
|
||||
"Clock state saved in {:.2}ms",
|
||||
(t_clock - t_irq).as_secs_f64() * 1000.0
|
||||
);
|
||||
|
||||
// Step 4: Save device state
|
||||
let devices = save_device_state(serial)?;
|
||||
let t_dev = start.elapsed();
|
||||
debug!(
|
||||
"Device state saved in {:.2}ms",
|
||||
(t_dev - t_clock).as_secs_f64() * 1000.0
|
||||
);
|
||||
|
||||
// Step 5: Dump guest memory (flat or CAS)
|
||||
let (memory_regions, memory_file_size) = if let Some(cas_path) = cas_store {
|
||||
dump_guest_memory_cas(memory, snapshot_dir, cas_path)?
|
||||
} else {
|
||||
dump_guest_memory(memory, snapshot_dir)?
|
||||
};
|
||||
let t_mem = start.elapsed();
|
||||
debug!(
|
||||
"Memory dumped in {:.2}ms",
|
||||
(t_mem - t_dev).as_secs_f64() * 1000.0
|
||||
);
|
||||
|
||||
// Step 6: Build snapshot and write state.json
|
||||
let snapshot = VmSnapshot {
|
||||
metadata: SnapshotMetadata {
|
||||
version: SNAPSHOT_VERSION,
|
||||
memory_size: memory.total_size(),
|
||||
vcpu_count: vcpu_fds.len() as u8,
|
||||
created_at: SystemTime::now()
|
||||
.duration_since(UNIX_EPOCH)
|
||||
.unwrap_or_default()
|
||||
.as_secs(),
|
||||
state_crc64: 0, // Placeholder, computed below
|
||||
memory_file_size,
|
||||
},
|
||||
vcpu_states,
|
||||
irqchip,
|
||||
clock,
|
||||
devices,
|
||||
memory_regions,
|
||||
};
|
||||
|
||||
// Serialize and compute CRC
|
||||
let mut state_json = serde_json::to_string_pretty(&snapshot)?;
|
||||
|
||||
// Compute CRC over the state (with crc64 = 0) and patch it in
|
||||
let crc = compute_crc64(state_json.as_bytes());
|
||||
let mut final_snapshot = snapshot;
|
||||
final_snapshot.metadata.state_crc64 = crc;
|
||||
state_json = serde_json::to_string_pretty(&final_snapshot)?;
|
||||
|
||||
let state_path = snapshot_dir.join("state.json");
|
||||
let mut state_file = File::create(&state_path)?;
|
||||
state_file.write_all(state_json.as_bytes())?;
|
||||
state_file.sync_all()?;
|
||||
|
||||
let t_total = start.elapsed();
|
||||
info!(
|
||||
"Snapshot created: {} vCPUs, {} MB memory, {:.2}ms total \
|
||||
[vcpu={:.2}ms, irq={:.2}ms, clock={:.2}ms, dev={:.2}ms, mem={:.2}ms, write={:.2}ms]",
|
||||
vcpu_fds.len(),
|
||||
memory.total_size() / (1024 * 1024),
|
||||
t_total.as_secs_f64() * 1000.0,
|
||||
t_vcpu.as_secs_f64() * 1000.0,
|
||||
(t_irq - t_vcpu).as_secs_f64() * 1000.0,
|
||||
(t_clock - t_irq).as_secs_f64() * 1000.0,
|
||||
(t_dev - t_clock).as_secs_f64() * 1000.0,
|
||||
(t_mem - t_dev).as_secs_f64() * 1000.0,
|
||||
(t_total - t_mem).as_secs_f64() * 1000.0,
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
604
vmm/src/snapshot/inmem.rs
Normal file
604
vmm/src/snapshot/inmem.rs
Normal file
@@ -0,0 +1,604 @@
|
||||
//! In-Memory Snapshot Restore
|
||||
//!
|
||||
//! Provides a zero-copy restore path for when guest memory is already in RAM
|
||||
//! (e.g., from a CAS blob cache or TinyVol mapping). This is the ultimate
|
||||
//! fast path for sub-millisecond VM restores in production environments.
|
||||
//!
|
||||
//! # Use Case
|
||||
//!
|
||||
//! In Voltainer's VM pool architecture, memory snapshots are cached in RAM:
|
||||
//! - CAS blobs are fetched once and kept in a memory-mapped cache
|
||||
//! - TinyVol volumes use shared mmap'd regions
|
||||
//! - Pre-warmed VMs can restore instantly from these cached regions
|
||||
//!
|
||||
//! # Safety
|
||||
//!
|
||||
//! The caller is responsible for:
|
||||
//! - Ensuring the memory pointer is valid and page-aligned (4KB)
|
||||
//! - Ensuring the memory region is large enough (>= snapshot memory_size)
|
||||
//! - Ensuring the memory outlives the restored VM
|
||||
//! - Not modifying the memory while the VM is running (undefined behavior)
|
||||
//!
|
||||
//! The in-memory restore does NOT take ownership of the memory. The caller
|
||||
//! must manage the memory lifecycle independently of the VM.
|
||||
|
||||
use kvm_bindings::{
|
||||
kvm_mp_state, kvm_pit_config, kvm_regs,
|
||||
kvm_userspace_memory_region,
|
||||
KVM_PIT_SPEAKER_DUMMY,
|
||||
};
|
||||
use kvm_ioctls::{Kvm, VcpuFd, VmFd};
|
||||
use tracing::{debug, info};
|
||||
|
||||
use super::*;
|
||||
use super::restore::{
|
||||
deserialize_fpu, deserialize_sregs,
|
||||
restore_cpuid, restore_lapic, restore_msrs, restore_vcpu_events, restore_xcrs,
|
||||
restore_irqchip, restore_clock,
|
||||
};
|
||||
|
||||
/// Page size for alignment validation (4KB)
|
||||
pub const PAGE_SIZE: usize = 4096;
|
||||
|
||||
/// Result of a successful in-memory snapshot restore.
|
||||
///
|
||||
/// Unlike `RestoredVm` from the file-based restore, this struct does NOT own
|
||||
/// the memory. The caller is responsible for managing memory lifetime.
|
||||
pub struct RestoredVmInMemory {
|
||||
/// KVM VM file descriptor
|
||||
pub vm_fd: VmFd,
|
||||
/// vCPU file descriptors (already configured with restored state)
|
||||
pub vcpu_fds: Vec<VcpuFd>,
|
||||
/// Guest physical address where memory was registered
|
||||
pub guest_phys_addr: u64,
|
||||
/// Size of the registered memory region
|
||||
pub memory_size: u64,
|
||||
/// The restored snapshot state (for device reconstruction)
|
||||
pub snapshot: VmSnapshot,
|
||||
}
|
||||
|
||||
/// Validation errors for in-memory restore
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum InMemoryError {
|
||||
#[error("Memory pointer is null")]
|
||||
NullPointer,
|
||||
|
||||
#[error("Memory pointer is not page-aligned (4KB): address 0x{0:x}")]
|
||||
UnalignedPointer(usize),
|
||||
|
||||
#[error("Memory region too small: provided {provided} bytes, need {required} bytes")]
|
||||
InsufficientMemory { provided: usize, required: u64 },
|
||||
|
||||
#[error("Snapshot error: {0}")]
|
||||
Snapshot(#[from] SnapshotError),
|
||||
}
|
||||
|
||||
/// Restore a VM from a snapshot with pre-existing in-memory guest memory.
|
||||
///
|
||||
/// This is the zero-copy fast path for when guest memory is already mapped
|
||||
/// in the host process (e.g., from CAS blob cache or TinyVol).
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `snapshot` - The deserialized VM snapshot state
|
||||
/// * `memory_ptr` - Host virtual address of the guest memory (must be page-aligned)
|
||||
/// * `memory_size` - Size of the provided memory region in bytes
|
||||
///
|
||||
/// # Safety
|
||||
///
|
||||
/// The caller must ensure:
|
||||
/// - `memory_ptr` is a valid, page-aligned (4KB) pointer
|
||||
/// - The memory region is at least `memory_size` bytes
|
||||
/// - The memory contains valid guest memory data (from a snapshot)
|
||||
/// - The memory outlives the returned `RestoredVmInMemory`
|
||||
/// - The memory is not freed or unmapped while the VM is running
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// A `RestoredVmInMemory` containing KVM handles ready to resume execution.
|
||||
/// The caller retains ownership of the memory and must manage its lifecycle.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```ignore
|
||||
/// // Load snapshot state from memory (e.g., CAS blob)
|
||||
/// let state_bytes = blob_cache.get("snapshot-state")?;
|
||||
/// let snapshot = VmSnapshot::from_bytes(&state_bytes)?;
|
||||
///
|
||||
/// // Get memory from CAS cache (already mmap'd)
|
||||
/// let (memory_ptr, memory_size) = blob_cache.get_memory_region("snapshot-mem")?;
|
||||
///
|
||||
/// // Restore the VM (sub-millisecond)
|
||||
/// let restored = unsafe {
|
||||
/// restore_from_memory(&snapshot, memory_ptr, memory_size)?
|
||||
/// };
|
||||
///
|
||||
/// // VM is ready to run
|
||||
/// ```
|
||||
pub unsafe fn restore_from_memory(
|
||||
snapshot: &VmSnapshot,
|
||||
memory_ptr: *mut u8,
|
||||
memory_size: usize,
|
||||
) -> std::result::Result<RestoredVmInMemory, InMemoryError> {
|
||||
let start = std::time::Instant::now();
|
||||
|
||||
// Validate pointer is not null
|
||||
if memory_ptr.is_null() {
|
||||
return Err(InMemoryError::NullPointer);
|
||||
}
|
||||
|
||||
// Validate pointer is page-aligned (4KB)
|
||||
let ptr_addr = memory_ptr as usize;
|
||||
if ptr_addr % PAGE_SIZE != 0 {
|
||||
return Err(InMemoryError::UnalignedPointer(ptr_addr));
|
||||
}
|
||||
|
||||
// Validate memory size is sufficient
|
||||
let required_size = snapshot.metadata.memory_size;
|
||||
if (memory_size as u64) < required_size {
|
||||
return Err(InMemoryError::InsufficientMemory {
|
||||
provided: memory_size,
|
||||
required: required_size,
|
||||
});
|
||||
}
|
||||
|
||||
let t_validate = start.elapsed();
|
||||
debug!(
|
||||
"Validation complete in {:.3}ms",
|
||||
t_validate.as_secs_f64() * 1000.0
|
||||
);
|
||||
|
||||
// Create KVM VM
|
||||
let kvm = Kvm::new().map_err(|e| SnapshotError::Kvm(format!("open /dev/kvm: {}", e)))?;
|
||||
let vm_fd = kvm
|
||||
.create_vm()
|
||||
.map_err(|e| SnapshotError::Kvm(format!("create_vm: {}", e)))?;
|
||||
|
||||
// Set TSS address (required for x86_64)
|
||||
vm_fd
|
||||
.set_tss_address(0xFFFB_D000)
|
||||
.map_err(|e| SnapshotError::Kvm(format!("set_tss_address: {}", e)))?;
|
||||
|
||||
// Create IRQ chip (must be before restoring IRQ state)
|
||||
vm_fd
|
||||
.create_irq_chip()
|
||||
.map_err(|e| SnapshotError::Kvm(format!("create_irq_chip: {}", e)))?;
|
||||
|
||||
// Create PIT (must be before restoring PIT state)
|
||||
let pit_config = kvm_pit_config {
|
||||
flags: KVM_PIT_SPEAKER_DUMMY,
|
||||
..Default::default()
|
||||
};
|
||||
vm_fd
|
||||
.create_pit2(pit_config)
|
||||
.map_err(|e| SnapshotError::Kvm(format!("create_pit2: {}", e)))?;
|
||||
|
||||
let t_vm = start.elapsed();
|
||||
debug!(
|
||||
"KVM VM created in {:.3}ms",
|
||||
(t_vm - t_validate).as_secs_f64() * 1000.0
|
||||
);
|
||||
|
||||
// Register the provided memory directly with KVM (zero-copy!)
|
||||
// For simplicity, we register the entire memory as a single region at the
|
||||
// first memory region's guest address from the snapshot.
|
||||
let guest_phys_addr = snapshot
|
||||
.memory_regions
|
||||
.first()
|
||||
.map(|r| r.guest_addr)
|
||||
.unwrap_or(0);
|
||||
|
||||
let mem_region = kvm_userspace_memory_region {
|
||||
slot: 0,
|
||||
flags: 0,
|
||||
guest_phys_addr,
|
||||
memory_size: required_size,
|
||||
userspace_addr: memory_ptr as u64,
|
||||
};
|
||||
|
||||
vm_fd
|
||||
.set_user_memory_region(mem_region)
|
||||
.map_err(|e| SnapshotError::Kvm(format!("set_user_memory_region: {}", e)))?;
|
||||
|
||||
let t_memreg = start.elapsed();
|
||||
debug!(
|
||||
"Memory registered with KVM in {:.3}ms (zero-copy, {} MB at {:p})",
|
||||
(t_memreg - t_vm).as_secs_f64() * 1000.0,
|
||||
required_size / (1024 * 1024),
|
||||
memory_ptr
|
||||
);
|
||||
|
||||
// Restore vCPUs
|
||||
let vcpu_fds = restore_vcpus_inmem(&vm_fd, snapshot)?;
|
||||
let t_vcpu = start.elapsed();
|
||||
debug!(
|
||||
"vCPU state restored in {:.3}ms",
|
||||
(t_vcpu - t_memreg).as_secs_f64() * 1000.0
|
||||
);
|
||||
|
||||
// Restore IRQ chip state
|
||||
restore_irqchip(&vm_fd, &snapshot.irqchip)?;
|
||||
let t_irq = start.elapsed();
|
||||
debug!(
|
||||
"IRQ chip restored in {:.3}ms",
|
||||
(t_irq - t_vcpu).as_secs_f64() * 1000.0
|
||||
);
|
||||
|
||||
// Restore clock
|
||||
restore_clock(&vm_fd, &snapshot.clock)?;
|
||||
let t_clock = start.elapsed();
|
||||
debug!(
|
||||
"Clock restored in {:.3}ms",
|
||||
(t_clock - t_irq).as_secs_f64() * 1000.0
|
||||
);
|
||||
|
||||
let t_total = start.elapsed();
|
||||
info!(
|
||||
"In-memory restore complete: {} vCPUs, {} MB memory, {:.3}ms total \
|
||||
[validate={:.3}ms, vm={:.3}ms, memreg={:.3}ms, vcpu={:.3}ms, irq={:.3}ms, clock={:.3}ms]",
|
||||
snapshot.vcpu_states.len(),
|
||||
required_size / (1024 * 1024),
|
||||
t_total.as_secs_f64() * 1000.0,
|
||||
t_validate.as_secs_f64() * 1000.0,
|
||||
(t_vm - t_validate).as_secs_f64() * 1000.0,
|
||||
(t_memreg - t_vm).as_secs_f64() * 1000.0,
|
||||
(t_vcpu - t_memreg).as_secs_f64() * 1000.0,
|
||||
(t_irq - t_vcpu).as_secs_f64() * 1000.0,
|
||||
(t_clock - t_irq).as_secs_f64() * 1000.0,
|
||||
);
|
||||
|
||||
Ok(RestoredVmInMemory {
|
||||
vm_fd,
|
||||
vcpu_fds,
|
||||
guest_phys_addr,
|
||||
memory_size: required_size,
|
||||
snapshot: snapshot.clone(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Restore a VM from a snapshot using a pre-warmed VM from the pool AND
|
||||
/// in-memory guest data. This is the ultimate fast path: ~0.5ms total.
|
||||
///
|
||||
/// Combines two optimizations:
|
||||
/// 1. Pre-warmed VM pool (skips KVM_CREATE_VM — saves ~24ms)
|
||||
/// 2. In-memory data (skips disk I/O — saves ~1-18ms)
|
||||
///
|
||||
/// # Safety
|
||||
///
|
||||
/// Same requirements as `restore_from_memory`.
|
||||
pub unsafe fn restore_from_memory_pooled(
|
||||
snapshot: &VmSnapshot,
|
||||
memory_ptr: *mut u8,
|
||||
memory_size: usize,
|
||||
pool: &crate::pool::VmPool,
|
||||
) -> std::result::Result<RestoredVmInMemory, InMemoryError> {
|
||||
let start = std::time::Instant::now();
|
||||
|
||||
// Validate pointer
|
||||
if memory_ptr.is_null() {
|
||||
return Err(InMemoryError::NullPointer);
|
||||
}
|
||||
let ptr_addr = memory_ptr as usize;
|
||||
if ptr_addr % PAGE_SIZE != 0 {
|
||||
return Err(InMemoryError::UnalignedPointer(ptr_addr));
|
||||
}
|
||||
let required_size = snapshot.metadata.memory_size;
|
||||
if (memory_size as u64) < required_size {
|
||||
return Err(InMemoryError::InsufficientMemory {
|
||||
provided: memory_size,
|
||||
required: required_size,
|
||||
});
|
||||
}
|
||||
|
||||
let t_validate = start.elapsed();
|
||||
|
||||
// Acquire pre-warmed VM from pool (skips KVM_CREATE_VM!)
|
||||
let pre_warmed = pool.acquire()
|
||||
.map_err(|e| SnapshotError::Kvm(format!("pool acquire: {}", e)))?;
|
||||
let vm_fd = pre_warmed.vm_fd;
|
||||
|
||||
let t_pool = start.elapsed();
|
||||
debug!(
|
||||
"VM acquired from pool in {:.3}ms (skipped ~24ms KVM_CREATE_VM)",
|
||||
(t_pool - t_validate).as_secs_f64() * 1000.0
|
||||
);
|
||||
|
||||
// Register memory (zero-copy)
|
||||
let guest_phys_addr = snapshot
|
||||
.memory_regions
|
||||
.first()
|
||||
.map(|r| r.guest_addr)
|
||||
.unwrap_or(0);
|
||||
|
||||
let mem_region = kvm_userspace_memory_region {
|
||||
slot: 0,
|
||||
flags: 0,
|
||||
guest_phys_addr,
|
||||
memory_size: required_size,
|
||||
userspace_addr: memory_ptr as u64,
|
||||
};
|
||||
|
||||
vm_fd
|
||||
.set_user_memory_region(mem_region)
|
||||
.map_err(|e| SnapshotError::Kvm(format!("set_user_memory_region: {}", e)))?;
|
||||
|
||||
let t_memreg = start.elapsed();
|
||||
|
||||
// Restore vCPUs
|
||||
let vcpu_fds = restore_vcpus_inmem(&vm_fd, snapshot)?;
|
||||
let t_vcpu = start.elapsed();
|
||||
|
||||
// Restore IRQ chip + clock
|
||||
restore_irqchip(&vm_fd, &snapshot.irqchip)?;
|
||||
let t_irq = start.elapsed();
|
||||
restore_clock(&vm_fd, &snapshot.clock)?;
|
||||
let t_clock = start.elapsed();
|
||||
|
||||
let t_total = start.elapsed();
|
||||
info!(
|
||||
"In-memory POOLED restore: {} vCPUs, {} MB memory, {:.3}ms total \
|
||||
[validate={:.3}ms, pool={:.3}ms, memreg={:.3}ms, vcpu={:.3}ms, irq={:.3}ms, clock={:.3}ms]",
|
||||
snapshot.vcpu_states.len(),
|
||||
required_size / (1024 * 1024),
|
||||
t_total.as_secs_f64() * 1000.0,
|
||||
t_validate.as_secs_f64() * 1000.0,
|
||||
(t_pool - t_validate).as_secs_f64() * 1000.0,
|
||||
(t_memreg - t_pool).as_secs_f64() * 1000.0,
|
||||
(t_vcpu - t_memreg).as_secs_f64() * 1000.0,
|
||||
(t_irq - t_vcpu).as_secs_f64() * 1000.0,
|
||||
(t_clock - t_irq).as_secs_f64() * 1000.0,
|
||||
);
|
||||
|
||||
Ok(RestoredVmInMemory {
|
||||
vm_fd,
|
||||
vcpu_fds,
|
||||
guest_phys_addr,
|
||||
memory_size: required_size,
|
||||
snapshot: snapshot.clone(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Restore vCPUs from snapshot state (same as file-based restore).
|
||||
fn restore_vcpus_inmem(
|
||||
vm_fd: &VmFd,
|
||||
snapshot: &VmSnapshot,
|
||||
) -> Result<Vec<VcpuFd>> {
|
||||
let mut vcpu_fds = Vec::with_capacity(snapshot.vcpu_states.len());
|
||||
|
||||
for vcpu_state in &snapshot.vcpu_states {
|
||||
let vcpu_fd = vm_fd
|
||||
.create_vcpu(vcpu_state.id as u64)
|
||||
.map_err(|e| {
|
||||
SnapshotError::Kvm(format!("create_vcpu {}: {}", vcpu_state.id, e))
|
||||
})?;
|
||||
|
||||
restore_single_vcpu_inmem(&vcpu_fd, vcpu_state)?;
|
||||
vcpu_fds.push(vcpu_fd);
|
||||
}
|
||||
|
||||
Ok(vcpu_fds)
|
||||
}
|
||||
|
||||
/// Restore a single vCPU's state.
|
||||
fn restore_single_vcpu_inmem(vcpu_fd: &VcpuFd, state: &VcpuState) -> Result<()> {
|
||||
let id = state.id;
|
||||
|
||||
// Restore CPUID first (must be before setting registers)
|
||||
restore_cpuid(vcpu_fd, &state.cpuid_entries, id)?;
|
||||
|
||||
// Restore MP state (should be done before other registers for some KVM versions)
|
||||
let mp_state = kvm_mp_state {
|
||||
mp_state: state.mp_state,
|
||||
};
|
||||
vcpu_fd
|
||||
.set_mp_state(mp_state)
|
||||
.map_err(|e| SnapshotError::Kvm(format!("set_mp_state vCPU {}: {}", id, e)))?;
|
||||
|
||||
// Restore special registers
|
||||
let sregs = deserialize_sregs(&state.sregs);
|
||||
vcpu_fd
|
||||
.set_sregs(&sregs)
|
||||
.map_err(|e| SnapshotError::Kvm(format!("set_sregs vCPU {}: {}", id, e)))?;
|
||||
|
||||
// Restore general purpose registers
|
||||
let regs = kvm_regs {
|
||||
rax: state.regs.rax,
|
||||
rbx: state.regs.rbx,
|
||||
rcx: state.regs.rcx,
|
||||
rdx: state.regs.rdx,
|
||||
rsi: state.regs.rsi,
|
||||
rdi: state.regs.rdi,
|
||||
rsp: state.regs.rsp,
|
||||
rbp: state.regs.rbp,
|
||||
r8: state.regs.r8,
|
||||
r9: state.regs.r9,
|
||||
r10: state.regs.r10,
|
||||
r11: state.regs.r11,
|
||||
r12: state.regs.r12,
|
||||
r13: state.regs.r13,
|
||||
r14: state.regs.r14,
|
||||
r15: state.regs.r15,
|
||||
rip: state.regs.rip,
|
||||
rflags: state.regs.rflags,
|
||||
};
|
||||
vcpu_fd
|
||||
.set_regs(®s)
|
||||
.map_err(|e| SnapshotError::Kvm(format!("set_regs vCPU {}: {}", id, e)))?;
|
||||
|
||||
// Restore FPU state
|
||||
let fpu = deserialize_fpu(&state.fpu);
|
||||
vcpu_fd
|
||||
.set_fpu(&fpu)
|
||||
.map_err(|e| SnapshotError::Kvm(format!("set_fpu vCPU {}: {}", id, e)))?;
|
||||
|
||||
// Restore MSRs
|
||||
restore_msrs(vcpu_fd, &state.msrs, id)?;
|
||||
|
||||
// Restore LAPIC
|
||||
restore_lapic(vcpu_fd, &state.lapic, id)?;
|
||||
|
||||
// Restore XCRs
|
||||
if !state.xcrs.is_empty() {
|
||||
restore_xcrs(vcpu_fd, &state.xcrs, id);
|
||||
}
|
||||
|
||||
// Restore vCPU events
|
||||
restore_vcpu_events(vcpu_fd, &state.events, id)?;
|
||||
|
||||
debug!(
|
||||
"vCPU {} restored: RIP=0x{:x}, RSP=0x{:x}, CR3=0x{:x}",
|
||||
id, state.regs.rip, state.regs.rsp, state.sregs.cr3
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Check if a pointer is page-aligned (4KB).
|
||||
#[inline]
|
||||
pub fn is_page_aligned(ptr: *const u8) -> bool {
|
||||
(ptr as usize) % PAGE_SIZE == 0
|
||||
}
|
||||
|
||||
/// Align a size up to the nearest page boundary.
|
||||
#[inline]
|
||||
pub fn align_to_page(size: usize) -> usize {
|
||||
(size + PAGE_SIZE - 1) & !(PAGE_SIZE - 1)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_is_page_aligned() {
|
||||
assert!(is_page_aligned(0x0 as *const u8));
|
||||
assert!(is_page_aligned(0x1000 as *const u8));
|
||||
assert!(is_page_aligned(0x2000 as *const u8));
|
||||
assert!(is_page_aligned(0x10000 as *const u8));
|
||||
|
||||
assert!(!is_page_aligned(0x1 as *const u8));
|
||||
assert!(!is_page_aligned(0x100 as *const u8));
|
||||
assert!(!is_page_aligned(0x1001 as *const u8));
|
||||
assert!(!is_page_aligned(0xFFF as *const u8));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_align_to_page() {
|
||||
assert_eq!(align_to_page(0), 0);
|
||||
assert_eq!(align_to_page(1), PAGE_SIZE);
|
||||
assert_eq!(align_to_page(PAGE_SIZE - 1), PAGE_SIZE);
|
||||
assert_eq!(align_to_page(PAGE_SIZE), PAGE_SIZE);
|
||||
assert_eq!(align_to_page(PAGE_SIZE + 1), PAGE_SIZE * 2);
|
||||
assert_eq!(align_to_page(PAGE_SIZE * 2), PAGE_SIZE * 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_null_pointer_error() {
|
||||
let snapshot = VmSnapshot {
|
||||
metadata: SnapshotMetadata {
|
||||
version: 1,
|
||||
memory_size: 128 * 1024 * 1024,
|
||||
vcpu_count: 1,
|
||||
created_at: 0,
|
||||
state_crc64: 0,
|
||||
memory_file_size: 128 * 1024 * 1024,
|
||||
},
|
||||
vcpu_states: vec![],
|
||||
irqchip: IrqchipState {
|
||||
pic_master: SerializablePicState { raw_data: vec![] },
|
||||
pic_slave: SerializablePicState { raw_data: vec![] },
|
||||
ioapic: SerializableIoapicState { raw_data: vec![] },
|
||||
pit: SerializablePitState { channels: vec![], flags: 0 },
|
||||
},
|
||||
clock: ClockState { clock: 0, flags: 0 },
|
||||
devices: DeviceState {
|
||||
serial: SerializableSerialState {
|
||||
dlab: false,
|
||||
ier: 0, lcr: 0, mcr: 0, lsr: 0x60, msr: 0, scr: 0,
|
||||
dll: 0, dlh: 0, thr_interrupt_pending: false, input_buffer: vec![],
|
||||
},
|
||||
virtio_blk: None,
|
||||
virtio_net: None,
|
||||
mmio_transports: vec![],
|
||||
},
|
||||
memory_regions: vec![],
|
||||
};
|
||||
|
||||
let result = unsafe { restore_from_memory(&snapshot, std::ptr::null_mut(), 0) };
|
||||
assert!(matches!(result, Err(InMemoryError::NullPointer)));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_unaligned_pointer_error() {
|
||||
let snapshot = VmSnapshot {
|
||||
metadata: SnapshotMetadata {
|
||||
version: 1,
|
||||
memory_size: 128 * 1024 * 1024,
|
||||
vcpu_count: 1,
|
||||
created_at: 0,
|
||||
state_crc64: 0,
|
||||
memory_file_size: 128 * 1024 * 1024,
|
||||
},
|
||||
vcpu_states: vec![],
|
||||
irqchip: IrqchipState {
|
||||
pic_master: SerializablePicState { raw_data: vec![] },
|
||||
pic_slave: SerializablePicState { raw_data: vec![] },
|
||||
ioapic: SerializableIoapicState { raw_data: vec![] },
|
||||
pit: SerializablePitState { channels: vec![], flags: 0 },
|
||||
},
|
||||
clock: ClockState { clock: 0, flags: 0 },
|
||||
devices: DeviceState {
|
||||
serial: SerializableSerialState {
|
||||
dlab: false,
|
||||
ier: 0, lcr: 0, mcr: 0, lsr: 0x60, msr: 0, scr: 0,
|
||||
dll: 0, dlh: 0, thr_interrupt_pending: false, input_buffer: vec![],
|
||||
},
|
||||
virtio_blk: None,
|
||||
virtio_net: None,
|
||||
mmio_transports: vec![],
|
||||
},
|
||||
memory_regions: vec![],
|
||||
};
|
||||
|
||||
// Create an intentionally misaligned pointer
|
||||
let result = unsafe { restore_from_memory(&snapshot, 0x1001 as *mut u8, 128 * 1024 * 1024) };
|
||||
assert!(matches!(result, Err(InMemoryError::UnalignedPointer(_))));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_insufficient_memory_error() {
|
||||
let snapshot = VmSnapshot {
|
||||
metadata: SnapshotMetadata {
|
||||
version: 1,
|
||||
memory_size: 128 * 1024 * 1024, // Requires 128MB
|
||||
vcpu_count: 1,
|
||||
created_at: 0,
|
||||
state_crc64: 0,
|
||||
memory_file_size: 128 * 1024 * 1024,
|
||||
},
|
||||
vcpu_states: vec![],
|
||||
irqchip: IrqchipState {
|
||||
pic_master: SerializablePicState { raw_data: vec![] },
|
||||
pic_slave: SerializablePicState { raw_data: vec![] },
|
||||
ioapic: SerializableIoapicState { raw_data: vec![] },
|
||||
pit: SerializablePitState { channels: vec![], flags: 0 },
|
||||
},
|
||||
clock: ClockState { clock: 0, flags: 0 },
|
||||
devices: DeviceState {
|
||||
serial: SerializableSerialState {
|
||||
dlab: false,
|
||||
ier: 0, lcr: 0, mcr: 0, lsr: 0x60, msr: 0, scr: 0,
|
||||
dll: 0, dlh: 0, thr_interrupt_pending: false, input_buffer: vec![],
|
||||
},
|
||||
virtio_blk: None,
|
||||
virtio_net: None,
|
||||
mmio_transports: vec![],
|
||||
},
|
||||
memory_regions: vec![],
|
||||
};
|
||||
|
||||
// Provide only 64MB when 128MB is required (use aligned address)
|
||||
let result = unsafe { restore_from_memory(&snapshot, 0x1000 as *mut u8, 64 * 1024 * 1024) };
|
||||
assert!(matches!(result, Err(InMemoryError::InsufficientMemory { .. })));
|
||||
}
|
||||
}
|
||||
796
vmm/src/snapshot/mod.rs
Normal file
796
vmm/src/snapshot/mod.rs
Normal file
@@ -0,0 +1,796 @@
|
||||
//! Snapshot/Restore for Volt VMM
|
||||
//!
|
||||
//! Provides serializable state types and functions to create and restore
|
||||
//! VM snapshots. The snapshot format consists of:
|
||||
//!
|
||||
//! - `state.json`: Serialized VM state (vCPU registers, IRQ chip, devices, metadata)
|
||||
//! - `memory.snap`: Raw guest memory dump (mmap'd on restore for lazy loading)
|
||||
//!
|
||||
//! # Architecture
|
||||
//!
|
||||
//! ```text
|
||||
//! ┌──────────────────────────────────────────────────┐
|
||||
//! │ Snapshot Files │
|
||||
//! │ ┌──────────────────┐ ┌───────────────────────┐ │
|
||||
//! │ │ state.json │ │ memory.snap │ │
|
||||
//! │ │ - VcpuState[] │ │ (raw memory dump) │ │
|
||||
//! │ │ - IrqchipState │ │ │ │
|
||||
//! │ │ - ClockState │ │ Restored via mmap │ │
|
||||
//! │ │ - DeviceState │ │ MAP_PRIVATE for CoW │ │
|
||||
//! │ │ - Metadata+CRC │ │ demand-paged by OS │ │
|
||||
//! │ └──────────────────┘ └───────────────────────┘ │
|
||||
//! └──────────────────────────────────────────────────┘
|
||||
//! ```
|
||||
|
||||
pub mod cas;
|
||||
pub mod create;
|
||||
pub mod inmem;
|
||||
pub mod restore;
|
||||
|
||||
// Re-export CAS types
|
||||
pub use cas::{CasManifest, CasChunk, CasDumpResult, CAS_CHUNK_SIZE, CAS_MANIFEST_FILENAME};
|
||||
// Re-export restore types
|
||||
pub use restore::MemoryMapping;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
// ============================================================================
|
||||
// Snapshot Metadata
|
||||
// ============================================================================
|
||||
|
||||
/// Snapshot format version
|
||||
pub const SNAPSHOT_VERSION: u32 = 1;
|
||||
|
||||
/// Snapshot metadata with integrity check
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SnapshotMetadata {
|
||||
/// Snapshot format version
|
||||
pub version: u32,
|
||||
/// Total guest memory size in bytes
|
||||
pub memory_size: u64,
|
||||
/// Number of vCPUs
|
||||
pub vcpu_count: u8,
|
||||
/// Snapshot creation timestamp (Unix epoch seconds)
|
||||
pub created_at: u64,
|
||||
/// CRC-64 of the state JSON (excluding this field)
|
||||
pub state_crc64: u64,
|
||||
/// Memory file size (for validation)
|
||||
pub memory_file_size: u64,
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// vCPU State
|
||||
// ============================================================================
|
||||
|
||||
/// Complete vCPU state captured from KVM
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct VcpuState {
|
||||
/// vCPU index
|
||||
pub id: u8,
|
||||
/// General purpose registers (KVM_GET_REGS)
|
||||
pub regs: SerializableRegs,
|
||||
/// Special registers (KVM_GET_SREGS)
|
||||
pub sregs: SerializableSregs,
|
||||
/// FPU state (KVM_GET_FPU)
|
||||
pub fpu: SerializableFpu,
|
||||
/// Model-specific registers (KVM_GET_MSRS)
|
||||
pub msrs: Vec<SerializableMsr>,
|
||||
/// CPUID entries (KVM_GET_CPUID2)
|
||||
pub cpuid_entries: Vec<SerializableCpuidEntry>,
|
||||
/// Local APIC state (KVM_GET_LAPIC)
|
||||
pub lapic: SerializableLapic,
|
||||
/// Extended control registers (KVM_GET_XCRS)
|
||||
pub xcrs: Vec<SerializableXcr>,
|
||||
/// Multiprocessor state (KVM_GET_MP_STATE)
|
||||
pub mp_state: u32,
|
||||
/// vCPU events (KVM_GET_VCPU_EVENTS)
|
||||
pub events: SerializableVcpuEvents,
|
||||
}
|
||||
|
||||
/// Serializable general-purpose registers (maps to kvm_regs)
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SerializableRegs {
|
||||
pub rax: u64,
|
||||
pub rbx: u64,
|
||||
pub rcx: u64,
|
||||
pub rdx: u64,
|
||||
pub rsi: u64,
|
||||
pub rdi: u64,
|
||||
pub rsp: u64,
|
||||
pub rbp: u64,
|
||||
pub r8: u64,
|
||||
pub r9: u64,
|
||||
pub r10: u64,
|
||||
pub r11: u64,
|
||||
pub r12: u64,
|
||||
pub r13: u64,
|
||||
pub r14: u64,
|
||||
pub r15: u64,
|
||||
pub rip: u64,
|
||||
pub rflags: u64,
|
||||
}
|
||||
|
||||
/// Serializable segment register
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SerializableSegment {
|
||||
pub base: u64,
|
||||
pub limit: u32,
|
||||
pub selector: u16,
|
||||
pub type_: u8,
|
||||
pub present: u8,
|
||||
pub dpl: u8,
|
||||
pub db: u8,
|
||||
pub s: u8,
|
||||
pub l: u8,
|
||||
pub g: u8,
|
||||
pub avl: u8,
|
||||
pub unusable: u8,
|
||||
}
|
||||
|
||||
/// Serializable descriptor table register
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SerializableDtable {
|
||||
pub base: u64,
|
||||
pub limit: u16,
|
||||
}
|
||||
|
||||
/// Serializable special registers (maps to kvm_sregs)
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SerializableSregs {
|
||||
pub cs: SerializableSegment,
|
||||
pub ds: SerializableSegment,
|
||||
pub es: SerializableSegment,
|
||||
pub fs: SerializableSegment,
|
||||
pub gs: SerializableSegment,
|
||||
pub ss: SerializableSegment,
|
||||
pub tr: SerializableSegment,
|
||||
pub ldt: SerializableSegment,
|
||||
pub gdt: SerializableDtable,
|
||||
pub idt: SerializableDtable,
|
||||
pub cr0: u64,
|
||||
pub cr2: u64,
|
||||
pub cr3: u64,
|
||||
pub cr4: u64,
|
||||
pub cr8: u64,
|
||||
pub efer: u64,
|
||||
pub apic_base: u64,
|
||||
/// Interrupt bitmap (256 bits = 4 x u64)
|
||||
pub interrupt_bitmap: [u64; 4],
|
||||
}
|
||||
|
||||
/// Serializable FPU state (maps to kvm_fpu)
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SerializableFpu {
|
||||
/// x87 FPU registers (8 x 16 bytes = 128 bytes)
|
||||
pub fpr: Vec<Vec<u8>>,
|
||||
/// FPU control word
|
||||
pub fcw: u16,
|
||||
/// FPU status word
|
||||
pub fsw: u16,
|
||||
/// FPU tag word (abridged)
|
||||
pub ftwx: u8,
|
||||
/// Last FPU opcode
|
||||
pub last_opcode: u16,
|
||||
/// Last FPU instruction pointer
|
||||
pub last_ip: u64,
|
||||
/// Last FPU data pointer
|
||||
pub last_dp: u64,
|
||||
/// SSE/AVX registers (16 x 16 bytes = 256 bytes)
|
||||
pub xmm: Vec<Vec<u8>>,
|
||||
/// SSE control/status register
|
||||
pub mxcsr: u32,
|
||||
}
|
||||
|
||||
/// Serializable MSR entry
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SerializableMsr {
|
||||
pub index: u32,
|
||||
pub data: u64,
|
||||
}
|
||||
|
||||
/// Serializable CPUID entry
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SerializableCpuidEntry {
|
||||
pub function: u32,
|
||||
pub index: u32,
|
||||
pub flags: u32,
|
||||
pub eax: u32,
|
||||
pub ebx: u32,
|
||||
pub ecx: u32,
|
||||
pub edx: u32,
|
||||
}
|
||||
|
||||
/// Serializable LAPIC state (256 x 4 = 1024 bytes, base64-encoded)
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SerializableLapic {
|
||||
/// Raw LAPIC register data (1024 bytes)
|
||||
pub regs: Vec<u8>,
|
||||
}
|
||||
|
||||
/// Serializable XCR entry
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SerializableXcr {
|
||||
pub xcr: u32,
|
||||
pub value: u64,
|
||||
}
|
||||
|
||||
/// Serializable vCPU events (maps to kvm_vcpu_events)
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SerializableVcpuEvents {
|
||||
// Exception state
|
||||
pub exception_injected: u8,
|
||||
pub exception_nr: u8,
|
||||
pub exception_has_error_code: u8,
|
||||
pub exception_error_code: u32,
|
||||
// Interrupt state
|
||||
pub interrupt_injected: u8,
|
||||
pub interrupt_nr: u8,
|
||||
pub interrupt_soft: u8,
|
||||
pub interrupt_shadow: u8,
|
||||
// NMI state
|
||||
pub nmi_injected: u8,
|
||||
pub nmi_pending: u8,
|
||||
pub nmi_masked: u8,
|
||||
// SMI state
|
||||
pub smi_smm: u8,
|
||||
pub smi_pending: u8,
|
||||
pub smi_smm_inside_nmi: u8,
|
||||
pub smi_latched_init: u8,
|
||||
// Flags
|
||||
pub flags: u32,
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// IRQ Chip State
|
||||
// ============================================================================
|
||||
|
||||
/// Complete IRQ chip state (PIC + IOAPIC + PIT)
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct IrqchipState {
|
||||
/// 8259 PIC master (IRQ chip 0)
|
||||
pub pic_master: SerializablePicState,
|
||||
/// 8259 PIC slave (IRQ chip 1)
|
||||
pub pic_slave: SerializablePicState,
|
||||
/// IOAPIC state (IRQ chip 2)
|
||||
pub ioapic: SerializableIoapicState,
|
||||
/// PIT state (KVM_GET_PIT2)
|
||||
pub pit: SerializablePitState,
|
||||
}
|
||||
|
||||
/// Serializable 8259 PIC state
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SerializablePicState {
|
||||
/// Raw chip data from KVM_GET_IRQCHIP (512 bytes)
|
||||
pub raw_data: Vec<u8>,
|
||||
}
|
||||
|
||||
/// Serializable IOAPIC state
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SerializableIoapicState {
|
||||
/// Raw chip data from KVM_GET_IRQCHIP (512 bytes)
|
||||
pub raw_data: Vec<u8>,
|
||||
}
|
||||
|
||||
/// Serializable PIT state
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SerializablePitState {
|
||||
/// PIT counter channels (3 channels)
|
||||
pub channels: Vec<SerializablePitChannel>,
|
||||
/// PIT flags
|
||||
pub flags: u32,
|
||||
}
|
||||
|
||||
/// Serializable PIT channel state
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SerializablePitChannel {
|
||||
pub count: u32,
|
||||
pub latched_count: u16,
|
||||
pub count_latched: u8,
|
||||
pub status_latched: u8,
|
||||
pub status: u8,
|
||||
pub read_state: u8,
|
||||
pub write_state: u8,
|
||||
pub write_latch: u8,
|
||||
pub rw_mode: u8,
|
||||
pub mode: u8,
|
||||
pub bcd: u8,
|
||||
pub gate: u8,
|
||||
pub count_load_time: i64,
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Clock State
|
||||
// ============================================================================
|
||||
|
||||
/// KVM clock state
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ClockState {
|
||||
/// KVM clock value (nanoseconds)
|
||||
pub clock: u64,
|
||||
/// Flags from kvm_clock_data
|
||||
pub flags: u32,
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Device State
|
||||
// ============================================================================
|
||||
|
||||
/// Combined device state for all emulated devices
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct DeviceState {
|
||||
/// Serial console state
|
||||
pub serial: SerializableSerialState,
|
||||
/// Virtio-blk device state (if present)
|
||||
pub virtio_blk: Option<SerializableVirtioBlkState>,
|
||||
/// Virtio-net device state (if present)
|
||||
pub virtio_net: Option<SerializableVirtioNetState>,
|
||||
/// MMIO transport state for each device
|
||||
pub mmio_transports: Vec<SerializableMmioTransportState>,
|
||||
}
|
||||
|
||||
/// Serializable serial console state
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SerializableSerialState {
|
||||
pub dlab: bool,
|
||||
pub ier: u8,
|
||||
pub lcr: u8,
|
||||
pub mcr: u8,
|
||||
pub lsr: u8,
|
||||
pub msr: u8,
|
||||
pub scr: u8,
|
||||
pub dll: u8,
|
||||
pub dlh: u8,
|
||||
pub thr_interrupt_pending: bool,
|
||||
pub input_buffer: Vec<u8>,
|
||||
}
|
||||
|
||||
/// Serializable virtio-blk queue state
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SerializableVirtioBlkState {
|
||||
/// Features acknowledged by the driver
|
||||
pub acked_features: u64,
|
||||
/// Whether the device is activated
|
||||
pub activated: bool,
|
||||
/// Queue state
|
||||
pub queues: Vec<SerializableQueueState>,
|
||||
/// Read-only flag
|
||||
pub read_only: bool,
|
||||
/// Backend path (for re-opening on restore)
|
||||
pub backend_path: Option<String>,
|
||||
}
|
||||
|
||||
/// Serializable virtio-net queue state
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SerializableVirtioNetState {
|
||||
/// Features acknowledged by the driver
|
||||
pub acked_features: u64,
|
||||
/// Whether the device is activated
|
||||
pub activated: bool,
|
||||
/// Queue state
|
||||
pub queues: Vec<SerializableQueueState>,
|
||||
/// MAC address
|
||||
pub mac: [u8; 6],
|
||||
/// TAP device name
|
||||
pub tap_name: String,
|
||||
}
|
||||
|
||||
/// Serializable virtqueue state
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SerializableQueueState {
|
||||
pub max_size: u16,
|
||||
pub size: u16,
|
||||
pub ready: bool,
|
||||
pub desc_table: u64,
|
||||
pub avail_ring: u64,
|
||||
pub used_ring: u64,
|
||||
pub next_avail: u16,
|
||||
pub next_used: u16,
|
||||
}
|
||||
|
||||
/// Serializable MMIO transport state
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SerializableMmioTransportState {
|
||||
/// Device type
|
||||
pub device_type: u32,
|
||||
/// Current device status register
|
||||
pub device_status: u32,
|
||||
/// Driver features
|
||||
pub driver_features: u64,
|
||||
/// Device features selector
|
||||
pub device_features_sel: u32,
|
||||
/// Driver features selector
|
||||
pub driver_features_sel: u32,
|
||||
/// Selected queue index
|
||||
pub queue_sel: u32,
|
||||
/// Interrupt status
|
||||
pub interrupt_status: u32,
|
||||
/// Configuration generation counter
|
||||
pub config_generation: u32,
|
||||
/// MMIO base address
|
||||
pub base_addr: u64,
|
||||
/// IRQ number
|
||||
pub irq: u32,
|
||||
/// Per-queue addresses
|
||||
pub queue_desc: Vec<u64>,
|
||||
pub queue_avail: Vec<u64>,
|
||||
pub queue_used: Vec<u64>,
|
||||
pub queue_num: Vec<u16>,
|
||||
pub queue_ready: Vec<bool>,
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Complete Snapshot
|
||||
// ============================================================================
|
||||
|
||||
/// Complete VM snapshot (serialized to state.json)
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct VmSnapshot {
|
||||
/// Snapshot metadata
|
||||
pub metadata: SnapshotMetadata,
|
||||
/// Per-vCPU state
|
||||
pub vcpu_states: Vec<VcpuState>,
|
||||
/// IRQ chip state
|
||||
pub irqchip: IrqchipState,
|
||||
/// KVM clock state
|
||||
pub clock: ClockState,
|
||||
/// Device state
|
||||
pub devices: DeviceState,
|
||||
/// Memory region layout
|
||||
pub memory_regions: Vec<SerializableMemoryRegion>,
|
||||
}
|
||||
|
||||
/// Serializable memory region descriptor
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SerializableMemoryRegion {
|
||||
/// Guest physical address
|
||||
pub guest_addr: u64,
|
||||
/// Size in bytes
|
||||
pub size: u64,
|
||||
/// Offset into the memory snapshot file
|
||||
pub file_offset: u64,
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// VmSnapshot Implementation
|
||||
// ============================================================================
|
||||
|
||||
impl VmSnapshot {
|
||||
/// Deserialize a VmSnapshot from a byte buffer.
|
||||
///
|
||||
/// This allows loading snapshot state from memory (e.g., CAS blob cache)
|
||||
/// instead of reading from a file on disk.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `data` - JSON-encoded snapshot state bytes
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// The deserialized `VmSnapshot`, or an error if deserialization fails.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```ignore
|
||||
/// // Load state from CAS blob cache
|
||||
/// let state_bytes = blob_cache.get("vm-snapshot-state")?;
|
||||
/// let snapshot = VmSnapshot::from_bytes(&state_bytes)?;
|
||||
/// ```
|
||||
pub fn from_bytes(data: &[u8]) -> Result<Self> {
|
||||
let snapshot: VmSnapshot = serde_json::from_slice(data)?;
|
||||
|
||||
// Verify version
|
||||
if snapshot.metadata.version != SNAPSHOT_VERSION {
|
||||
return Err(SnapshotError::VersionMismatch {
|
||||
expected: SNAPSHOT_VERSION,
|
||||
actual: snapshot.metadata.version,
|
||||
});
|
||||
}
|
||||
|
||||
// Verify CRC-64
|
||||
let saved_crc = snapshot.metadata.state_crc64;
|
||||
let mut check_snapshot = snapshot.clone();
|
||||
check_snapshot.metadata.state_crc64 = 0;
|
||||
let check_json = serde_json::to_string_pretty(&check_snapshot)?;
|
||||
let computed_crc = compute_crc64(check_json.as_bytes());
|
||||
|
||||
if saved_crc != computed_crc {
|
||||
return Err(SnapshotError::CrcMismatch {
|
||||
expected: saved_crc,
|
||||
actual: computed_crc,
|
||||
});
|
||||
}
|
||||
|
||||
Ok(snapshot)
|
||||
}
|
||||
|
||||
/// Serialize the VmSnapshot to bytes.
|
||||
///
|
||||
/// This is the inverse of `from_bytes()` and allows storing snapshot
|
||||
/// state in memory (e.g., for CAS blob cache).
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// The JSON-encoded snapshot state as bytes.
|
||||
pub fn to_bytes(&self) -> Result<Vec<u8>> {
|
||||
// Create a snapshot with zeroed CRC for computation
|
||||
let mut snapshot = self.clone();
|
||||
snapshot.metadata.state_crc64 = 0;
|
||||
let json = serde_json::to_string_pretty(&snapshot)?;
|
||||
|
||||
// Compute CRC and update
|
||||
let crc = compute_crc64(json.as_bytes());
|
||||
snapshot.metadata.state_crc64 = crc;
|
||||
|
||||
// Re-serialize with correct CRC
|
||||
let final_json = serde_json::to_string_pretty(&snapshot)?;
|
||||
Ok(final_json.into_bytes())
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Error types
|
||||
// ============================================================================
|
||||
|
||||
/// Snapshot operation errors
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum SnapshotError {
|
||||
#[error("IO error: {0}")]
|
||||
Io(#[from] std::io::Error),
|
||||
|
||||
#[error("Serialization error: {0}")]
|
||||
Serialization(#[from] serde_json::Error),
|
||||
|
||||
#[error("KVM error: {0}")]
|
||||
Kvm(String),
|
||||
|
||||
#[error("CRC mismatch: expected {expected:#x}, got {actual:#x}")]
|
||||
CrcMismatch { expected: u64, actual: u64 },
|
||||
|
||||
#[error("Version mismatch: expected {expected}, got {actual}")]
|
||||
VersionMismatch { expected: u32, actual: u32 },
|
||||
|
||||
#[error("Memory size mismatch: expected {expected}, got {actual}")]
|
||||
MemorySizeMismatch { expected: u64, actual: u64 },
|
||||
|
||||
#[error("Memory file size mismatch: expected {expected}, got {actual}")]
|
||||
MemoryFileSizeMismatch { expected: u64, actual: u64 },
|
||||
|
||||
#[error("Missing snapshot file: {0}")]
|
||||
MissingFile(String),
|
||||
|
||||
#[error("Invalid snapshot: {0}")]
|
||||
Invalid(String),
|
||||
|
||||
#[error("mmap failed: {0}")]
|
||||
Mmap(String),
|
||||
}
|
||||
|
||||
pub type Result<T> = std::result::Result<T, SnapshotError>;
|
||||
|
||||
// ============================================================================
|
||||
// CRC-64 helper
|
||||
// ============================================================================
|
||||
|
||||
/// Compute CRC-64/ECMA for integrity checking
|
||||
pub fn compute_crc64(data: &[u8]) -> u64 {
|
||||
use crc::{Crc, CRC_64_ECMA_182};
|
||||
const CRC64: Crc<u64> = Crc::<u64>::new(&CRC_64_ECMA_182);
|
||||
CRC64.checksum(data)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
/// Create a minimal valid snapshot for testing
|
||||
fn create_test_snapshot() -> VmSnapshot {
|
||||
VmSnapshot {
|
||||
metadata: SnapshotMetadata {
|
||||
version: SNAPSHOT_VERSION,
|
||||
memory_size: 128 * 1024 * 1024,
|
||||
vcpu_count: 1,
|
||||
created_at: 1234567890,
|
||||
state_crc64: 0, // Will be computed
|
||||
memory_file_size: 128 * 1024 * 1024,
|
||||
},
|
||||
vcpu_states: vec![VcpuState {
|
||||
id: 0,
|
||||
regs: SerializableRegs {
|
||||
rax: 0, rbx: 0, rcx: 0, rdx: 0,
|
||||
rsi: 0, rdi: 0, rsp: 0x7fff_0000, rbp: 0,
|
||||
r8: 0, r9: 0, r10: 0, r11: 0,
|
||||
r12: 0, r13: 0, r14: 0, r15: 0,
|
||||
rip: 0x0010_0000, rflags: 0x0002,
|
||||
},
|
||||
sregs: SerializableSregs {
|
||||
cs: SerializableSegment {
|
||||
base: 0, limit: 0xffff_ffff, selector: 0x10,
|
||||
type_: 11, present: 1, dpl: 0, db: 0, s: 1, l: 1, g: 1, avl: 0, unusable: 0,
|
||||
},
|
||||
ds: SerializableSegment {
|
||||
base: 0, limit: 0xffff_ffff, selector: 0x18,
|
||||
type_: 3, present: 1, dpl: 0, db: 1, s: 1, l: 0, g: 1, avl: 0, unusable: 0,
|
||||
},
|
||||
es: SerializableSegment {
|
||||
base: 0, limit: 0xffff_ffff, selector: 0x18,
|
||||
type_: 3, present: 1, dpl: 0, db: 1, s: 1, l: 0, g: 1, avl: 0, unusable: 0,
|
||||
},
|
||||
fs: SerializableSegment {
|
||||
base: 0, limit: 0xffff_ffff, selector: 0x18,
|
||||
type_: 3, present: 1, dpl: 0, db: 1, s: 1, l: 0, g: 1, avl: 0, unusable: 0,
|
||||
},
|
||||
gs: SerializableSegment {
|
||||
base: 0, limit: 0xffff_ffff, selector: 0x18,
|
||||
type_: 3, present: 1, dpl: 0, db: 1, s: 1, l: 0, g: 1, avl: 0, unusable: 0,
|
||||
},
|
||||
ss: SerializableSegment {
|
||||
base: 0, limit: 0xffff_ffff, selector: 0x18,
|
||||
type_: 3, present: 1, dpl: 0, db: 1, s: 1, l: 0, g: 1, avl: 0, unusable: 0,
|
||||
},
|
||||
tr: SerializableSegment {
|
||||
base: 0, limit: 0, selector: 0,
|
||||
type_: 11, present: 1, dpl: 0, db: 0, s: 0, l: 0, g: 0, avl: 0, unusable: 0,
|
||||
},
|
||||
ldt: SerializableSegment {
|
||||
base: 0, limit: 0, selector: 0,
|
||||
type_: 2, present: 1, dpl: 0, db: 0, s: 0, l: 0, g: 0, avl: 0, unusable: 1,
|
||||
},
|
||||
gdt: SerializableDtable { base: 0, limit: 0 },
|
||||
idt: SerializableDtable { base: 0, limit: 0 },
|
||||
cr0: 0x8000_0011,
|
||||
cr2: 0,
|
||||
cr3: 0x0010_0000,
|
||||
cr4: 0x20,
|
||||
cr8: 0,
|
||||
efer: 0x500,
|
||||
apic_base: 0xfee0_0900,
|
||||
interrupt_bitmap: [0; 4],
|
||||
},
|
||||
fpu: SerializableFpu {
|
||||
fpr: vec![vec![0u8; 16]; 8],
|
||||
fcw: 0x37f,
|
||||
fsw: 0,
|
||||
ftwx: 0,
|
||||
last_opcode: 0,
|
||||
last_ip: 0,
|
||||
last_dp: 0,
|
||||
xmm: vec![vec![0u8; 16]; 16],
|
||||
mxcsr: 0x1f80,
|
||||
},
|
||||
msrs: vec![],
|
||||
cpuid_entries: vec![],
|
||||
lapic: SerializableLapic { regs: vec![0u8; 1024] },
|
||||
xcrs: vec![],
|
||||
mp_state: 0,
|
||||
events: SerializableVcpuEvents {
|
||||
exception_injected: 0,
|
||||
exception_nr: 0,
|
||||
exception_has_error_code: 0,
|
||||
exception_error_code: 0,
|
||||
interrupt_injected: 0,
|
||||
interrupt_nr: 0,
|
||||
interrupt_soft: 0,
|
||||
interrupt_shadow: 0,
|
||||
nmi_injected: 0,
|
||||
nmi_pending: 0,
|
||||
nmi_masked: 0,
|
||||
smi_smm: 0,
|
||||
smi_pending: 0,
|
||||
smi_smm_inside_nmi: 0,
|
||||
smi_latched_init: 0,
|
||||
flags: 0,
|
||||
},
|
||||
}],
|
||||
irqchip: IrqchipState {
|
||||
pic_master: SerializablePicState { raw_data: vec![0u8; 512] },
|
||||
pic_slave: SerializablePicState { raw_data: vec![0u8; 512] },
|
||||
ioapic: SerializableIoapicState { raw_data: vec![0u8; 512] },
|
||||
pit: SerializablePitState {
|
||||
channels: vec![
|
||||
SerializablePitChannel {
|
||||
count: 0, latched_count: 0, count_latched: 0,
|
||||
status_latched: 0, status: 0, read_state: 0,
|
||||
write_state: 0, write_latch: 0, rw_mode: 0,
|
||||
mode: 0, bcd: 0, gate: 0, count_load_time: 0,
|
||||
};
|
||||
3
|
||||
],
|
||||
flags: 0,
|
||||
},
|
||||
},
|
||||
clock: ClockState { clock: 1_000_000_000, flags: 0 },
|
||||
devices: DeviceState {
|
||||
serial: SerializableSerialState {
|
||||
dlab: false,
|
||||
ier: 0,
|
||||
lcr: 0,
|
||||
mcr: 0,
|
||||
lsr: 0x60,
|
||||
msr: 0,
|
||||
scr: 0,
|
||||
dll: 0,
|
||||
dlh: 0,
|
||||
thr_interrupt_pending: false,
|
||||
input_buffer: vec![],
|
||||
},
|
||||
virtio_blk: None,
|
||||
virtio_net: None,
|
||||
mmio_transports: vec![],
|
||||
},
|
||||
memory_regions: vec![SerializableMemoryRegion {
|
||||
guest_addr: 0,
|
||||
size: 128 * 1024 * 1024,
|
||||
file_offset: 0,
|
||||
}],
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_snapshot_to_bytes_from_bytes_roundtrip() {
|
||||
let original = create_test_snapshot();
|
||||
|
||||
// Serialize to bytes
|
||||
let bytes = original.to_bytes().expect("to_bytes should succeed");
|
||||
|
||||
// Deserialize from bytes
|
||||
let restored = VmSnapshot::from_bytes(&bytes).expect("from_bytes should succeed");
|
||||
|
||||
// Verify key fields match
|
||||
assert_eq!(original.metadata.version, restored.metadata.version);
|
||||
assert_eq!(original.metadata.memory_size, restored.metadata.memory_size);
|
||||
assert_eq!(original.metadata.vcpu_count, restored.metadata.vcpu_count);
|
||||
assert_eq!(original.vcpu_states.len(), restored.vcpu_states.len());
|
||||
assert_eq!(original.vcpu_states[0].regs.rip, restored.vcpu_states[0].regs.rip);
|
||||
assert_eq!(original.vcpu_states[0].regs.rsp, restored.vcpu_states[0].regs.rsp);
|
||||
assert_eq!(original.clock.clock, restored.clock.clock);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_snapshot_from_bytes_version_mismatch() {
|
||||
let mut snapshot = create_test_snapshot();
|
||||
snapshot.metadata.version = 999; // Invalid version
|
||||
|
||||
let bytes = serde_json::to_vec(&snapshot).unwrap();
|
||||
|
||||
let result = VmSnapshot::from_bytes(&bytes);
|
||||
assert!(matches!(result, Err(SnapshotError::VersionMismatch { .. })));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_snapshot_from_bytes_crc_mismatch() {
|
||||
let mut snapshot = create_test_snapshot();
|
||||
|
||||
// Serialize normally first
|
||||
let bytes = snapshot.to_bytes().unwrap();
|
||||
|
||||
// Corrupt the bytes (modify some content while keeping valid JSON)
|
||||
let mut json: serde_json::Value = serde_json::from_slice(&bytes).unwrap();
|
||||
json["clock"]["clock"] = serde_json::json!(12345); // Change clock value
|
||||
let corrupted = serde_json::to_vec(&json).unwrap();
|
||||
|
||||
let result = VmSnapshot::from_bytes(&corrupted);
|
||||
assert!(matches!(result, Err(SnapshotError::CrcMismatch { .. })));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_snapshot_from_bytes_invalid_json() {
|
||||
let invalid_bytes = b"{ this is not valid json }";
|
||||
|
||||
let result = VmSnapshot::from_bytes(invalid_bytes);
|
||||
assert!(matches!(result, Err(SnapshotError::Serialization(_))));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_crc64_consistency() {
|
||||
let data1 = b"hello world";
|
||||
let data2 = b"hello world";
|
||||
let data3 = b"hello worle"; // Different
|
||||
|
||||
let crc1 = compute_crc64(data1);
|
||||
let crc2 = compute_crc64(data2);
|
||||
let crc3 = compute_crc64(data3);
|
||||
|
||||
assert_eq!(crc1, crc2);
|
||||
assert_ne!(crc1, crc3);
|
||||
}
|
||||
}
|
||||
963
vmm/src/snapshot/restore.rs
Normal file
963
vmm/src/snapshot/restore.rs
Normal file
@@ -0,0 +1,963 @@
|
||||
//! Snapshot Restore
|
||||
//!
|
||||
//! Restores a VM from a snapshot by:
|
||||
//! 1. Loading and verifying state metadata (CRC-64)
|
||||
//! 2. Creating a new KVM VM (or acquiring from pool)
|
||||
//! 3. mmap'ing the memory snapshot (MAP_PRIVATE for CoW, demand-paged)
|
||||
//! 4. Registering memory with KVM
|
||||
//! 5. Restoring vCPU state (registers, MSRs, LAPIC, etc.)
|
||||
//! 6. Restoring IRQ chip and PIT
|
||||
//! 7. Restoring KVM clock
|
||||
//!
|
||||
//! The critical optimization is using mmap with MAP_PRIVATE on the memory
|
||||
//! snapshot file. This means:
|
||||
//! - Pages are loaded on-demand by the kernel's page fault handler
|
||||
//! - No bulk memory copy needed at restore time
|
||||
//! - Copy-on-Write semantics protect the snapshot file
|
||||
//! - Restore is nearly instant (~1-5ms) regardless of memory size
|
||||
//!
|
||||
//! ## Pooled Restore
|
||||
//!
|
||||
//! For maximum performance, use `restore_snapshot_pooled()` with a `VmPool`.
|
||||
//! The pool maintains pre-warmed KVM VMs with TSS, IRQ chip, and PIT already
|
||||
//! configured, reducing restore time from ~30ms to ~1-2ms.
|
||||
|
||||
use std::fs;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::os::unix::io::AsRawFd;
|
||||
use std::path::Path;
|
||||
use std::ptr::NonNull;
|
||||
|
||||
use kvm_bindings::{
|
||||
kvm_clock_data, kvm_irqchip, kvm_mp_state, kvm_msr_entry, kvm_pit_channel_state,
|
||||
kvm_pit_config, kvm_pit_state2, kvm_regs, kvm_segment, kvm_sregs,
|
||||
kvm_userspace_memory_region, kvm_vcpu_events, kvm_xcrs,
|
||||
CpuId, Msrs,
|
||||
KVM_IRQCHIP_IOAPIC, KVM_IRQCHIP_PIC_MASTER, KVM_IRQCHIP_PIC_SLAVE,
|
||||
KVM_PIT_SPEAKER_DUMMY,
|
||||
};
|
||||
use kvm_ioctls::{Kvm, VcpuFd, VmFd};
|
||||
use nix::sys::mman::{mmap, munmap, MapFlags, ProtFlags};
|
||||
use tracing::{debug, info, warn};
|
||||
|
||||
use super::*;
|
||||
use super::cas::{self, CasManifest, CAS_MANIFEST_FILENAME};
|
||||
use crate::pool::VmPool;
|
||||
|
||||
/// Result of a successful snapshot restore
|
||||
///
|
||||
/// Contains the KVM handles needed to run the restored VM.
|
||||
pub struct RestoredVm {
|
||||
/// KVM VM file descriptor
|
||||
pub vm_fd: VmFd,
|
||||
/// vCPU file descriptors (already configured with restored state)
|
||||
pub vcpu_fds: Vec<VcpuFd>,
|
||||
/// Host virtual address of the mmap'd memory region(s)
|
||||
pub memory_mappings: Vec<MemoryMapping>,
|
||||
/// The restored snapshot state (for device reconstruction)
|
||||
pub snapshot: VmSnapshot,
|
||||
}
|
||||
|
||||
/// An mmap'd memory region from the snapshot
|
||||
pub struct MemoryMapping {
|
||||
/// Host virtual address
|
||||
pub host_addr: *mut u8,
|
||||
/// Size in bytes
|
||||
pub size: usize,
|
||||
/// Guest physical address
|
||||
pub guest_addr: u64,
|
||||
}
|
||||
|
||||
// Safety: raw pointer to mmap'd memory, safe to send between threads
|
||||
unsafe impl Send for MemoryMapping {}
|
||||
unsafe impl Sync for MemoryMapping {}
|
||||
|
||||
impl Drop for MemoryMapping {
|
||||
fn drop(&mut self) {
|
||||
if let Some(ptr) = NonNull::new(self.host_addr as *mut _) {
|
||||
if let Err(e) = unsafe { munmap(ptr, self.size) } {
|
||||
tracing::error!(
|
||||
"Failed to unmap restored memory at {:p}: {}",
|
||||
self.host_addr,
|
||||
e
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Restore a VM from a snapshot directory.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `snapshot_dir` - Path to the snapshot directory containing state.json and memory.snap
|
||||
///
|
||||
/// # Returns
|
||||
/// A `RestoredVm` containing KVM handles ready to resume execution.
|
||||
pub fn restore_snapshot(snapshot_dir: &Path) -> Result<RestoredVm> {
|
||||
restore_snapshot_with_cas(snapshot_dir, None)
|
||||
}
|
||||
|
||||
/// Restore a VM from a snapshot directory with optional CAS support.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `snapshot_dir` - Path to the snapshot directory containing state.json and memory
|
||||
/// * `cas_store` - Optional path to CAS store for CAS-backed snapshots
|
||||
///
|
||||
/// # Returns
|
||||
/// A `RestoredVm` containing KVM handles ready to resume execution.
|
||||
pub fn restore_snapshot_with_cas(
|
||||
snapshot_dir: &Path,
|
||||
cas_store: Option<&Path>,
|
||||
) -> Result<RestoredVm> {
|
||||
let start = std::time::Instant::now();
|
||||
|
||||
// Step 1: Load and verify state
|
||||
let snapshot = load_and_verify_state(snapshot_dir)?;
|
||||
let t_load = start.elapsed();
|
||||
debug!(
|
||||
"State loaded and verified in {:.2}ms",
|
||||
t_load.as_secs_f64() * 1000.0
|
||||
);
|
||||
|
||||
// Step 2: Create KVM VM
|
||||
let kvm = Kvm::new().map_err(|e| SnapshotError::Kvm(format!("open /dev/kvm: {}", e)))?;
|
||||
let vm_fd = kvm
|
||||
.create_vm()
|
||||
.map_err(|e| SnapshotError::Kvm(format!("create_vm: {}", e)))?;
|
||||
|
||||
// Set TSS address (required for x86_64)
|
||||
vm_fd
|
||||
.set_tss_address(0xFFFB_D000)
|
||||
.map_err(|e| SnapshotError::Kvm(format!("set_tss_address: {}", e)))?;
|
||||
|
||||
// Create IRQ chip (must be before restoring IRQ state)
|
||||
vm_fd
|
||||
.create_irq_chip()
|
||||
.map_err(|e| SnapshotError::Kvm(format!("create_irq_chip: {}", e)))?;
|
||||
|
||||
// Create PIT (must be before restoring PIT state)
|
||||
let pit_config = kvm_pit_config {
|
||||
flags: KVM_PIT_SPEAKER_DUMMY,
|
||||
..Default::default()
|
||||
};
|
||||
vm_fd
|
||||
.create_pit2(pit_config)
|
||||
.map_err(|e| SnapshotError::Kvm(format!("create_pit2: {}", e)))?;
|
||||
|
||||
let t_vm = start.elapsed();
|
||||
debug!(
|
||||
"KVM VM created in {:.2}ms",
|
||||
(t_vm - t_load).as_secs_f64() * 1000.0
|
||||
);
|
||||
|
||||
// Step 3: mmap the memory snapshot (flat or CAS)
|
||||
let memory_mappings = restore_memory(snapshot_dir, &snapshot, cas_store)?;
|
||||
let t_mmap = start.elapsed();
|
||||
debug!(
|
||||
"Memory mmap'd in {:.2}ms ({} region(s), CAS: {})",
|
||||
(t_mmap - t_vm).as_secs_f64() * 1000.0,
|
||||
memory_mappings.len(),
|
||||
cas_store.is_some()
|
||||
);
|
||||
|
||||
// Step 4: Register memory regions with KVM
|
||||
for (slot, mapping) in memory_mappings.iter().enumerate() {
|
||||
let mem_region = kvm_userspace_memory_region {
|
||||
slot: slot as u32,
|
||||
flags: 0,
|
||||
guest_phys_addr: mapping.guest_addr,
|
||||
memory_size: mapping.size as u64,
|
||||
userspace_addr: mapping.host_addr as u64,
|
||||
};
|
||||
unsafe {
|
||||
vm_fd
|
||||
.set_user_memory_region(mem_region)
|
||||
.map_err(|e| SnapshotError::Kvm(format!("set_user_memory_region slot {}: {}", slot, e)))?;
|
||||
}
|
||||
}
|
||||
let t_memreg = start.elapsed();
|
||||
debug!(
|
||||
"Memory registered with KVM in {:.2}ms",
|
||||
(t_memreg - t_mmap).as_secs_f64() * 1000.0
|
||||
);
|
||||
|
||||
// Step 5: Create and restore vCPUs
|
||||
let vcpu_fds = restore_vcpus(&kvm, &vm_fd, &snapshot)?;
|
||||
let t_vcpu = start.elapsed();
|
||||
debug!(
|
||||
"vCPU state restored in {:.2}ms",
|
||||
(t_vcpu - t_memreg).as_secs_f64() * 1000.0
|
||||
);
|
||||
|
||||
// Step 6: Restore IRQ chip state
|
||||
restore_irqchip(&vm_fd, &snapshot.irqchip)?;
|
||||
let t_irq = start.elapsed();
|
||||
debug!(
|
||||
"IRQ chip restored in {:.2}ms",
|
||||
(t_irq - t_vcpu).as_secs_f64() * 1000.0
|
||||
);
|
||||
|
||||
// Step 7: Restore clock
|
||||
restore_clock(&vm_fd, &snapshot.clock)?;
|
||||
let t_clock = start.elapsed();
|
||||
debug!(
|
||||
"Clock restored in {:.2}ms",
|
||||
(t_clock - t_irq).as_secs_f64() * 1000.0
|
||||
);
|
||||
|
||||
let t_total = start.elapsed();
|
||||
info!(
|
||||
"Snapshot restored: {} vCPUs, {} MB memory, {:.2}ms total \
|
||||
[load={:.2}ms, vm={:.2}ms, mmap={:.2}ms, memreg={:.2}ms, vcpu={:.2}ms, irq={:.2}ms, clock={:.2}ms]",
|
||||
snapshot.vcpu_states.len(),
|
||||
snapshot.metadata.memory_size / (1024 * 1024),
|
||||
t_total.as_secs_f64() * 1000.0,
|
||||
t_load.as_secs_f64() * 1000.0,
|
||||
(t_vm - t_load).as_secs_f64() * 1000.0,
|
||||
(t_mmap - t_vm).as_secs_f64() * 1000.0,
|
||||
(t_memreg - t_mmap).as_secs_f64() * 1000.0,
|
||||
(t_vcpu - t_memreg).as_secs_f64() * 1000.0,
|
||||
(t_irq - t_vcpu).as_secs_f64() * 1000.0,
|
||||
(t_clock - t_irq).as_secs_f64() * 1000.0,
|
||||
);
|
||||
|
||||
Ok(RestoredVm {
|
||||
vm_fd,
|
||||
vcpu_fds,
|
||||
memory_mappings,
|
||||
snapshot,
|
||||
})
|
||||
}
|
||||
|
||||
/// Restore a VM from a snapshot using a pre-warmed VM from the pool.
|
||||
///
|
||||
/// This is the fast path for snapshot restore. By using a pre-warmed VM
|
||||
/// from the pool, we skip the expensive KVM_CREATE_VM, set_tss_address,
|
||||
/// create_irq_chip, and create_pit2 calls (totaling ~24ms).
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `snapshot_dir` - Path to the snapshot directory containing state.json and memory.snap
|
||||
/// * `pool` - VM pool to acquire a pre-warmed VM from
|
||||
///
|
||||
/// # Returns
|
||||
/// A `RestoredVm` containing KVM handles ready to resume execution.
|
||||
///
|
||||
/// # Performance
|
||||
/// With a pre-warmed pool, restore time drops from ~30ms to ~1-5ms:
|
||||
/// - Skip KVM_CREATE_VM (~20ms)
|
||||
/// - Skip set_tss_address (~1ms)
|
||||
/// - Skip create_irq_chip (~2ms)
|
||||
/// - Skip create_pit2 (~1ms)
|
||||
pub fn restore_snapshot_pooled(snapshot_dir: &Path, pool: &VmPool) -> Result<RestoredVm> {
|
||||
let start = std::time::Instant::now();
|
||||
|
||||
// Step 1: Load and verify state
|
||||
let snapshot = load_and_verify_state(snapshot_dir)?;
|
||||
let t_load = start.elapsed();
|
||||
debug!(
|
||||
"State loaded and verified in {:.2}ms",
|
||||
t_load.as_secs_f64() * 1000.0
|
||||
);
|
||||
|
||||
// Step 2: Acquire pre-warmed VM from pool (FAST PATH)
|
||||
// The VM already has TSS, IRQ chip, and PIT configured
|
||||
let pre_warmed = pool.acquire().map_err(|e| {
|
||||
SnapshotError::Kvm(format!("Failed to acquire VM from pool: {}", e))
|
||||
})?;
|
||||
let vm_fd = pre_warmed.vm_fd;
|
||||
let kvm = pre_warmed.kvm;
|
||||
|
||||
let t_vm = start.elapsed();
|
||||
debug!(
|
||||
"VM acquired from pool in {:.3}ms (vs ~24ms for fresh creation)",
|
||||
(t_vm - t_load).as_secs_f64() * 1000.0
|
||||
);
|
||||
|
||||
// Step 3: mmap the memory snapshot file
|
||||
let memory_mappings = mmap_memory_snapshot(snapshot_dir, &snapshot)?;
|
||||
let t_mmap = start.elapsed();
|
||||
debug!(
|
||||
"Memory mmap'd in {:.2}ms ({} region(s))",
|
||||
(t_mmap - t_vm).as_secs_f64() * 1000.0,
|
||||
memory_mappings.len()
|
||||
);
|
||||
|
||||
// Step 4: Register memory regions with KVM
|
||||
for (slot, mapping) in memory_mappings.iter().enumerate() {
|
||||
let mem_region = kvm_userspace_memory_region {
|
||||
slot: slot as u32,
|
||||
flags: 0,
|
||||
guest_phys_addr: mapping.guest_addr,
|
||||
memory_size: mapping.size as u64,
|
||||
userspace_addr: mapping.host_addr as u64,
|
||||
};
|
||||
unsafe {
|
||||
vm_fd
|
||||
.set_user_memory_region(mem_region)
|
||||
.map_err(|e| SnapshotError::Kvm(format!("set_user_memory_region slot {}: {}", slot, e)))?;
|
||||
}
|
||||
}
|
||||
let t_memreg = start.elapsed();
|
||||
debug!(
|
||||
"Memory registered with KVM in {:.2}ms",
|
||||
(t_memreg - t_mmap).as_secs_f64() * 1000.0
|
||||
);
|
||||
|
||||
// Step 5: Create and restore vCPUs
|
||||
let vcpu_fds = restore_vcpus(&kvm, &vm_fd, &snapshot)?;
|
||||
let t_vcpu = start.elapsed();
|
||||
debug!(
|
||||
"vCPU state restored in {:.2}ms",
|
||||
(t_vcpu - t_memreg).as_secs_f64() * 1000.0
|
||||
);
|
||||
|
||||
// Step 6: Restore IRQ chip state
|
||||
restore_irqchip(&vm_fd, &snapshot.irqchip)?;
|
||||
let t_irq = start.elapsed();
|
||||
debug!(
|
||||
"IRQ chip restored in {:.2}ms",
|
||||
(t_irq - t_vcpu).as_secs_f64() * 1000.0
|
||||
);
|
||||
|
||||
// Step 7: Restore clock
|
||||
restore_clock(&vm_fd, &snapshot.clock)?;
|
||||
let t_clock = start.elapsed();
|
||||
debug!(
|
||||
"Clock restored in {:.2}ms",
|
||||
(t_clock - t_irq).as_secs_f64() * 1000.0
|
||||
);
|
||||
|
||||
let t_total = start.elapsed();
|
||||
info!(
|
||||
"Snapshot restored (POOLED): {} vCPUs, {} MB memory, {:.2}ms total \
|
||||
[load={:.2}ms, pool_acquire={:.3}ms, mmap={:.2}ms, memreg={:.2}ms, vcpu={:.2}ms, irq={:.2}ms, clock={:.2}ms]",
|
||||
snapshot.vcpu_states.len(),
|
||||
snapshot.metadata.memory_size / (1024 * 1024),
|
||||
t_total.as_secs_f64() * 1000.0,
|
||||
t_load.as_secs_f64() * 1000.0,
|
||||
(t_vm - t_load).as_secs_f64() * 1000.0,
|
||||
(t_mmap - t_vm).as_secs_f64() * 1000.0,
|
||||
(t_memreg - t_mmap).as_secs_f64() * 1000.0,
|
||||
(t_vcpu - t_memreg).as_secs_f64() * 1000.0,
|
||||
(t_irq - t_vcpu).as_secs_f64() * 1000.0,
|
||||
(t_clock - t_irq).as_secs_f64() * 1000.0,
|
||||
);
|
||||
|
||||
Ok(RestoredVm {
|
||||
vm_fd,
|
||||
vcpu_fds,
|
||||
memory_mappings,
|
||||
snapshot,
|
||||
})
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// State Loading & Verification
|
||||
// ============================================================================
|
||||
|
||||
fn load_and_verify_state(snapshot_dir: &Path) -> Result<VmSnapshot> {
|
||||
let state_path = snapshot_dir.join("state.json");
|
||||
if !state_path.exists() {
|
||||
return Err(SnapshotError::MissingFile(
|
||||
state_path.to_string_lossy().to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
let mem_path = snapshot_dir.join("memory.snap");
|
||||
if !mem_path.exists() {
|
||||
return Err(SnapshotError::MissingFile(
|
||||
mem_path.to_string_lossy().to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
let state_json = fs::read_to_string(&state_path)?;
|
||||
let snapshot: VmSnapshot = serde_json::from_str(&state_json)?;
|
||||
|
||||
// Verify version
|
||||
if snapshot.metadata.version != SNAPSHOT_VERSION {
|
||||
return Err(SnapshotError::VersionMismatch {
|
||||
expected: SNAPSHOT_VERSION,
|
||||
actual: snapshot.metadata.version,
|
||||
});
|
||||
}
|
||||
|
||||
// Verify CRC-64: zero out the CRC field, recompute, and compare
|
||||
let mut check_snapshot = snapshot.clone();
|
||||
let saved_crc = check_snapshot.metadata.state_crc64;
|
||||
check_snapshot.metadata.state_crc64 = 0;
|
||||
let check_json = serde_json::to_string_pretty(&check_snapshot)?;
|
||||
let computed_crc = compute_crc64(check_json.as_bytes());
|
||||
|
||||
if saved_crc != computed_crc {
|
||||
return Err(SnapshotError::CrcMismatch {
|
||||
expected: saved_crc,
|
||||
actual: computed_crc,
|
||||
});
|
||||
}
|
||||
|
||||
// Verify memory file size
|
||||
let mem_metadata = fs::metadata(&mem_path)?;
|
||||
if mem_metadata.len() != snapshot.metadata.memory_file_size {
|
||||
return Err(SnapshotError::MemoryFileSizeMismatch {
|
||||
expected: snapshot.metadata.memory_file_size,
|
||||
actual: mem_metadata.len(),
|
||||
});
|
||||
}
|
||||
|
||||
debug!(
|
||||
"Snapshot verified: v{}, {} vCPUs, {} MB memory, CRC {:#x}",
|
||||
snapshot.metadata.version,
|
||||
snapshot.metadata.vcpu_count,
|
||||
snapshot.metadata.memory_size / (1024 * 1024),
|
||||
saved_crc
|
||||
);
|
||||
|
||||
Ok(snapshot)
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Memory mmap
|
||||
// ============================================================================
|
||||
|
||||
fn mmap_memory_snapshot(
|
||||
snapshot_dir: &Path,
|
||||
snapshot: &VmSnapshot,
|
||||
) -> Result<Vec<MemoryMapping>> {
|
||||
let mem_path = snapshot_dir.join("memory.snap");
|
||||
let mem_file = fs::File::open(&mem_path)?;
|
||||
let _mem_fd = mem_file.as_raw_fd();
|
||||
|
||||
let mut mappings = Vec::with_capacity(snapshot.memory_regions.len());
|
||||
|
||||
for region in &snapshot.memory_regions {
|
||||
let size = region.size as usize;
|
||||
if size == 0 {
|
||||
continue;
|
||||
}
|
||||
|
||||
// mmap with MAP_PRIVATE for copy-on-write semantics
|
||||
// Pages are demand-paged: only loaded when first accessed
|
||||
let prot = ProtFlags::PROT_READ | ProtFlags::PROT_WRITE;
|
||||
let flags = MapFlags::MAP_PRIVATE;
|
||||
|
||||
let addr = unsafe {
|
||||
mmap(
|
||||
None,
|
||||
NonZeroUsize::new(size).ok_or_else(|| {
|
||||
SnapshotError::Mmap("zero-size region".to_string())
|
||||
})?,
|
||||
prot,
|
||||
flags,
|
||||
&mem_file,
|
||||
region.file_offset as i64,
|
||||
)
|
||||
.map_err(|e| SnapshotError::Mmap(format!("mmap failed for region at 0x{:x}: {}", region.guest_addr, e)))?
|
||||
};
|
||||
|
||||
mappings.push(MemoryMapping {
|
||||
host_addr: addr.as_ptr() as *mut u8,
|
||||
size,
|
||||
guest_addr: region.guest_addr,
|
||||
});
|
||||
|
||||
debug!(
|
||||
"Mapped memory region: guest=0x{:x}, size={} MB, host={:p}",
|
||||
region.guest_addr,
|
||||
size / (1024 * 1024),
|
||||
addr.as_ptr()
|
||||
);
|
||||
}
|
||||
|
||||
// Keep the file open via a leaked fd so the mmap stays valid.
|
||||
// The OS will close it on process exit.
|
||||
std::mem::forget(mem_file);
|
||||
|
||||
Ok(mappings)
|
||||
}
|
||||
|
||||
/// mmap memory from CAS chunks.
|
||||
///
|
||||
/// Each 2MB chunk is mmapped from the CAS store into a contiguous region.
|
||||
fn mmap_memory_cas(
|
||||
snapshot_dir: &Path,
|
||||
cas_store: &Path,
|
||||
) -> Result<Vec<MemoryMapping>> {
|
||||
let manifest_path = snapshot_dir.join(CAS_MANIFEST_FILENAME);
|
||||
let manifest = CasManifest::from_file(&manifest_path)?;
|
||||
cas::cas_mmap_memory(&manifest, cas_store)
|
||||
}
|
||||
|
||||
/// Check if a snapshot uses CAS storage.
|
||||
pub fn snapshot_uses_cas(snapshot_dir: &Path) -> bool {
|
||||
snapshot_dir.join(CAS_MANIFEST_FILENAME).exists()
|
||||
}
|
||||
|
||||
/// Restore memory from either CAS or flat snapshot.
|
||||
///
|
||||
/// Automatically detects the snapshot type:
|
||||
/// - If `memory-manifest.json` exists and `cas_store` is Some, use CAS restore
|
||||
/// - Otherwise, use flat `memory.snap` restore
|
||||
pub fn restore_memory(
|
||||
snapshot_dir: &Path,
|
||||
snapshot: &VmSnapshot,
|
||||
cas_store: Option<&Path>,
|
||||
) -> Result<Vec<MemoryMapping>> {
|
||||
let manifest_path = snapshot_dir.join(CAS_MANIFEST_FILENAME);
|
||||
|
||||
if manifest_path.exists() {
|
||||
// CAS-backed snapshot
|
||||
if let Some(store) = cas_store {
|
||||
info!("Restoring memory from CAS ({})", store.display());
|
||||
mmap_memory_cas(snapshot_dir, store)
|
||||
} else {
|
||||
// CAS manifest exists but no store specified
|
||||
// This could be an error, or we could fall back to flat
|
||||
if snapshot_dir.join("memory.snap").exists() {
|
||||
warn!("CAS manifest found but --cas-store not specified, falling back to memory.snap");
|
||||
mmap_memory_snapshot(snapshot_dir, snapshot)
|
||||
} else {
|
||||
Err(SnapshotError::Invalid(
|
||||
"CAS manifest found but --cas-store not specified and no memory.snap available".to_string()
|
||||
))
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Traditional flat snapshot
|
||||
info!("Restoring memory from flat memory.snap");
|
||||
mmap_memory_snapshot(snapshot_dir, snapshot)
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// vCPU Restore
|
||||
// ============================================================================
|
||||
|
||||
/// Restore all vCPUs from snapshot state.
|
||||
pub fn restore_vcpus(
|
||||
_kvm: &Kvm,
|
||||
vm_fd: &VmFd,
|
||||
snapshot: &VmSnapshot,
|
||||
) -> Result<Vec<VcpuFd>> {
|
||||
let mut vcpu_fds = Vec::with_capacity(snapshot.vcpu_states.len());
|
||||
|
||||
for vcpu_state in &snapshot.vcpu_states {
|
||||
let vcpu_fd = vm_fd
|
||||
.create_vcpu(vcpu_state.id as u64)
|
||||
.map_err(|e| {
|
||||
SnapshotError::Kvm(format!("create_vcpu {}: {}", vcpu_state.id, e))
|
||||
})?;
|
||||
|
||||
restore_single_vcpu(&vcpu_fd, vcpu_state)?;
|
||||
vcpu_fds.push(vcpu_fd);
|
||||
}
|
||||
|
||||
Ok(vcpu_fds)
|
||||
}
|
||||
|
||||
/// Restore a single vCPU's complete state.
|
||||
pub fn restore_single_vcpu(vcpu_fd: &VcpuFd, state: &VcpuState) -> Result<()> {
|
||||
let id = state.id;
|
||||
|
||||
// Restore CPUID first (must be before setting registers)
|
||||
restore_cpuid(vcpu_fd, &state.cpuid_entries, id)?;
|
||||
|
||||
// Restore MP state (should be done before other registers for some KVM versions)
|
||||
let mp_state = kvm_mp_state {
|
||||
mp_state: state.mp_state,
|
||||
};
|
||||
vcpu_fd
|
||||
.set_mp_state(mp_state)
|
||||
.map_err(|e| SnapshotError::Kvm(format!("set_mp_state vCPU {}: {}", id, e)))?;
|
||||
|
||||
// Restore special registers
|
||||
let sregs = deserialize_sregs(&state.sregs);
|
||||
vcpu_fd
|
||||
.set_sregs(&sregs)
|
||||
.map_err(|e| SnapshotError::Kvm(format!("set_sregs vCPU {}: {}", id, e)))?;
|
||||
|
||||
// Restore general purpose registers
|
||||
let regs = kvm_regs {
|
||||
rax: state.regs.rax,
|
||||
rbx: state.regs.rbx,
|
||||
rcx: state.regs.rcx,
|
||||
rdx: state.regs.rdx,
|
||||
rsi: state.regs.rsi,
|
||||
rdi: state.regs.rdi,
|
||||
rsp: state.regs.rsp,
|
||||
rbp: state.regs.rbp,
|
||||
r8: state.regs.r8,
|
||||
r9: state.regs.r9,
|
||||
r10: state.regs.r10,
|
||||
r11: state.regs.r11,
|
||||
r12: state.regs.r12,
|
||||
r13: state.regs.r13,
|
||||
r14: state.regs.r14,
|
||||
r15: state.regs.r15,
|
||||
rip: state.regs.rip,
|
||||
rflags: state.regs.rflags,
|
||||
};
|
||||
vcpu_fd
|
||||
.set_regs(®s)
|
||||
.map_err(|e| SnapshotError::Kvm(format!("set_regs vCPU {}: {}", id, e)))?;
|
||||
|
||||
// Restore FPU state
|
||||
let fpu = deserialize_fpu(&state.fpu);
|
||||
vcpu_fd
|
||||
.set_fpu(&fpu)
|
||||
.map_err(|e| SnapshotError::Kvm(format!("set_fpu vCPU {}: {}", id, e)))?;
|
||||
|
||||
// Restore MSRs
|
||||
restore_msrs(vcpu_fd, &state.msrs, id)?;
|
||||
|
||||
// Restore LAPIC
|
||||
restore_lapic(vcpu_fd, &state.lapic, id)?;
|
||||
|
||||
// Restore XCRs
|
||||
if !state.xcrs.is_empty() {
|
||||
restore_xcrs(vcpu_fd, &state.xcrs, id);
|
||||
}
|
||||
|
||||
// Restore vCPU events
|
||||
restore_vcpu_events(vcpu_fd, &state.events, id)?;
|
||||
|
||||
debug!(
|
||||
"vCPU {} restored: RIP=0x{:x}, RSP=0x{:x}, CR3=0x{:x}",
|
||||
id, state.regs.rip, state.regs.rsp, state.sregs.cr3
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Deserialize special registers from snapshot format.
|
||||
pub fn deserialize_sregs(s: &SerializableSregs) -> kvm_sregs {
|
||||
kvm_sregs {
|
||||
cs: deserialize_segment(&s.cs),
|
||||
ds: deserialize_segment(&s.ds),
|
||||
es: deserialize_segment(&s.es),
|
||||
fs: deserialize_segment(&s.fs),
|
||||
gs: deserialize_segment(&s.gs),
|
||||
ss: deserialize_segment(&s.ss),
|
||||
tr: deserialize_segment(&s.tr),
|
||||
ldt: deserialize_segment(&s.ldt),
|
||||
gdt: kvm_bindings::kvm_dtable {
|
||||
base: s.gdt.base,
|
||||
limit: s.gdt.limit,
|
||||
..Default::default()
|
||||
},
|
||||
idt: kvm_bindings::kvm_dtable {
|
||||
base: s.idt.base,
|
||||
limit: s.idt.limit,
|
||||
..Default::default()
|
||||
},
|
||||
cr0: s.cr0,
|
||||
cr2: s.cr2,
|
||||
cr3: s.cr3,
|
||||
cr4: s.cr4,
|
||||
cr8: s.cr8,
|
||||
efer: s.efer,
|
||||
apic_base: s.apic_base,
|
||||
interrupt_bitmap: s.interrupt_bitmap,
|
||||
}
|
||||
}
|
||||
|
||||
/// Deserialize a segment register from snapshot format.
|
||||
pub fn deserialize_segment(s: &SerializableSegment) -> kvm_segment {
|
||||
kvm_segment {
|
||||
base: s.base,
|
||||
limit: s.limit,
|
||||
selector: s.selector,
|
||||
type_: s.type_,
|
||||
present: s.present,
|
||||
dpl: s.dpl,
|
||||
db: s.db,
|
||||
s: s.s,
|
||||
l: s.l,
|
||||
g: s.g,
|
||||
avl: s.avl,
|
||||
unusable: s.unusable,
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
|
||||
/// Deserialize FPU state from snapshot format.
|
||||
pub fn deserialize_fpu(f: &SerializableFpu) -> kvm_bindings::kvm_fpu {
|
||||
let mut fpu = kvm_bindings::kvm_fpu::default();
|
||||
|
||||
// Restore FPR (8 x 16 bytes)
|
||||
for (i, fpr_data) in f.fpr.iter().enumerate() {
|
||||
if i < fpu.fpr.len() {
|
||||
let len = fpr_data.len().min(fpu.fpr[i].len());
|
||||
fpu.fpr[i][..len].copy_from_slice(&fpr_data[..len]);
|
||||
}
|
||||
}
|
||||
|
||||
fpu.fcw = f.fcw;
|
||||
fpu.fsw = f.fsw;
|
||||
fpu.ftwx = f.ftwx;
|
||||
fpu.last_opcode = f.last_opcode;
|
||||
fpu.last_ip = f.last_ip;
|
||||
fpu.last_dp = f.last_dp;
|
||||
|
||||
// Restore XMM (16 x 16 bytes)
|
||||
for (i, xmm_data) in f.xmm.iter().enumerate() {
|
||||
if i < fpu.xmm.len() {
|
||||
let len = xmm_data.len().min(fpu.xmm[i].len());
|
||||
fpu.xmm[i][..len].copy_from_slice(&xmm_data[..len]);
|
||||
}
|
||||
}
|
||||
|
||||
fpu.mxcsr = f.mxcsr;
|
||||
fpu
|
||||
}
|
||||
|
||||
/// Restore CPUID entries to a vCPU.
|
||||
pub fn restore_cpuid(vcpu_fd: &VcpuFd, entries: &[SerializableCpuidEntry], id: u8) -> Result<()> {
|
||||
if entries.is_empty() {
|
||||
debug!("vCPU {}: no CPUID entries to restore", id);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let kvm_entries: Vec<kvm_bindings::kvm_cpuid_entry2> = entries
|
||||
.iter()
|
||||
.map(|e| kvm_bindings::kvm_cpuid_entry2 {
|
||||
function: e.function,
|
||||
index: e.index,
|
||||
flags: e.flags,
|
||||
eax: e.eax,
|
||||
ebx: e.ebx,
|
||||
ecx: e.ecx,
|
||||
edx: e.edx,
|
||||
..Default::default()
|
||||
})
|
||||
.collect();
|
||||
|
||||
let cpuid = CpuId::from_entries(&kvm_entries)
|
||||
.map_err(|e| SnapshotError::Kvm(format!("create CPUID for vCPU {}: {:?}", id, e)))?;
|
||||
|
||||
vcpu_fd
|
||||
.set_cpuid2(&cpuid)
|
||||
.map_err(|e| SnapshotError::Kvm(format!("set_cpuid2 vCPU {}: {}", id, e)))?;
|
||||
|
||||
debug!("vCPU {}: restored {} CPUID entries", id, entries.len());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Restore MSRs to a vCPU.
|
||||
pub fn restore_msrs(vcpu_fd: &VcpuFd, msrs: &[SerializableMsr], id: u8) -> Result<()> {
|
||||
if msrs.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let entries: Vec<kvm_msr_entry> = msrs
|
||||
.iter()
|
||||
.map(|m| kvm_msr_entry {
|
||||
index: m.index,
|
||||
data: m.data,
|
||||
..Default::default()
|
||||
})
|
||||
.collect();
|
||||
|
||||
let kvm_msrs = Msrs::from_entries(&entries)
|
||||
.map_err(|e| SnapshotError::Kvm(format!("create MSR list for vCPU {}: {:?}", id, e)))?;
|
||||
|
||||
let written = vcpu_fd
|
||||
.set_msrs(&kvm_msrs)
|
||||
.map_err(|e| SnapshotError::Kvm(format!("set_msrs vCPU {}: {}", id, e)))?;
|
||||
|
||||
if written != entries.len() {
|
||||
warn!(
|
||||
"vCPU {}: only restored {}/{} MSRs",
|
||||
id,
|
||||
written,
|
||||
entries.len()
|
||||
);
|
||||
} else {
|
||||
debug!("vCPU {}: restored {} MSRs", id, written);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Restore LAPIC state to a vCPU.
|
||||
pub fn restore_lapic(vcpu_fd: &VcpuFd, lapic: &SerializableLapic, id: u8) -> Result<()> {
|
||||
let mut kvm_lapic = kvm_bindings::kvm_lapic_state::default();
|
||||
|
||||
let len = lapic.regs.len().min(kvm_lapic.regs.len());
|
||||
for i in 0..len {
|
||||
kvm_lapic.regs[i] = lapic.regs[i] as i8;
|
||||
}
|
||||
|
||||
vcpu_fd
|
||||
.set_lapic(&kvm_lapic)
|
||||
.map_err(|e| SnapshotError::Kvm(format!("set_lapic vCPU {}: {}", id, e)))?;
|
||||
|
||||
debug!("vCPU {}: LAPIC restored", id);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Restore XCRs to a vCPU.
|
||||
pub fn restore_xcrs(vcpu_fd: &VcpuFd, xcrs: &[SerializableXcr], id: u8) {
|
||||
let mut kvm_xcrs = kvm_xcrs::default();
|
||||
kvm_xcrs.nr_xcrs = xcrs.len().min(kvm_xcrs.xcrs.len()) as u32;
|
||||
|
||||
for (i, xcr) in xcrs.iter().enumerate() {
|
||||
if i < kvm_xcrs.xcrs.len() {
|
||||
kvm_xcrs.xcrs[i].xcr = xcr.xcr;
|
||||
kvm_xcrs.xcrs[i].value = xcr.value;
|
||||
}
|
||||
}
|
||||
|
||||
match vcpu_fd.set_xcrs(&kvm_xcrs) {
|
||||
Ok(()) => debug!("vCPU {}: restored {} XCRs", id, kvm_xcrs.nr_xcrs),
|
||||
Err(e) => warn!("vCPU {}: set_xcrs not supported: {}", id, e),
|
||||
}
|
||||
}
|
||||
|
||||
/// Restore vCPU events.
|
||||
pub fn restore_vcpu_events(vcpu_fd: &VcpuFd, events: &SerializableVcpuEvents, id: u8) -> Result<()> {
|
||||
let mut kvm_events = kvm_vcpu_events::default();
|
||||
|
||||
kvm_events.exception.injected = events.exception_injected;
|
||||
kvm_events.exception.nr = events.exception_nr;
|
||||
kvm_events.exception.has_error_code = events.exception_has_error_code;
|
||||
kvm_events.exception.error_code = events.exception_error_code;
|
||||
|
||||
kvm_events.interrupt.injected = events.interrupt_injected;
|
||||
kvm_events.interrupt.nr = events.interrupt_nr;
|
||||
kvm_events.interrupt.soft = events.interrupt_soft;
|
||||
kvm_events.interrupt.shadow = events.interrupt_shadow;
|
||||
|
||||
kvm_events.nmi.injected = events.nmi_injected;
|
||||
kvm_events.nmi.pending = events.nmi_pending;
|
||||
kvm_events.nmi.masked = events.nmi_masked;
|
||||
|
||||
kvm_events.smi.smm = events.smi_smm;
|
||||
kvm_events.smi.pending = events.smi_pending;
|
||||
kvm_events.smi.smm_inside_nmi = events.smi_smm_inside_nmi;
|
||||
kvm_events.smi.latched_init = events.smi_latched_init;
|
||||
|
||||
kvm_events.flags = events.flags;
|
||||
|
||||
vcpu_fd
|
||||
.set_vcpu_events(&kvm_events)
|
||||
.map_err(|e| SnapshotError::Kvm(format!("set_vcpu_events vCPU {}: {}", id, e)))?;
|
||||
|
||||
debug!("vCPU {}: events restored", id);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// IRQ Chip Restore
|
||||
// ============================================================================
|
||||
|
||||
/// Restore IRQ chip state (PIC master/slave, IOAPIC, PIT).
|
||||
pub fn restore_irqchip(vm_fd: &VmFd, irqchip: &IrqchipState) -> Result<()> {
|
||||
// Restore PIC master
|
||||
let mut pic_master = kvm_irqchip {
|
||||
chip_id: KVM_IRQCHIP_PIC_MASTER,
|
||||
..Default::default()
|
||||
};
|
||||
let chip_data = unsafe {
|
||||
std::slice::from_raw_parts_mut(
|
||||
&mut pic_master.chip as *mut _ as *mut u8,
|
||||
std::mem::size_of_val(&pic_master.chip),
|
||||
)
|
||||
};
|
||||
let len = irqchip.pic_master.raw_data.len().min(chip_data.len());
|
||||
chip_data[..len].copy_from_slice(&irqchip.pic_master.raw_data[..len]);
|
||||
vm_fd
|
||||
.set_irqchip(&pic_master)
|
||||
.map_err(|e| SnapshotError::Kvm(format!("set_irqchip PIC master: {}", e)))?;
|
||||
|
||||
// Restore PIC slave
|
||||
let mut pic_slave = kvm_irqchip {
|
||||
chip_id: KVM_IRQCHIP_PIC_SLAVE,
|
||||
..Default::default()
|
||||
};
|
||||
let chip_data = unsafe {
|
||||
std::slice::from_raw_parts_mut(
|
||||
&mut pic_slave.chip as *mut _ as *mut u8,
|
||||
std::mem::size_of_val(&pic_slave.chip),
|
||||
)
|
||||
};
|
||||
let len = irqchip.pic_slave.raw_data.len().min(chip_data.len());
|
||||
chip_data[..len].copy_from_slice(&irqchip.pic_slave.raw_data[..len]);
|
||||
vm_fd
|
||||
.set_irqchip(&pic_slave)
|
||||
.map_err(|e| SnapshotError::Kvm(format!("set_irqchip PIC slave: {}", e)))?;
|
||||
|
||||
// Restore IOAPIC
|
||||
let mut ioapic = kvm_irqchip {
|
||||
chip_id: KVM_IRQCHIP_IOAPIC,
|
||||
..Default::default()
|
||||
};
|
||||
let chip_data = unsafe {
|
||||
std::slice::from_raw_parts_mut(
|
||||
&mut ioapic.chip as *mut _ as *mut u8,
|
||||
std::mem::size_of_val(&ioapic.chip),
|
||||
)
|
||||
};
|
||||
let len = irqchip.ioapic.raw_data.len().min(chip_data.len());
|
||||
chip_data[..len].copy_from_slice(&irqchip.ioapic.raw_data[..len]);
|
||||
vm_fd
|
||||
.set_irqchip(&ioapic)
|
||||
.map_err(|e| SnapshotError::Kvm(format!("set_irqchip IOAPIC: {}", e)))?;
|
||||
|
||||
// Restore PIT
|
||||
restore_pit(vm_fd, &irqchip.pit)?;
|
||||
|
||||
debug!("IRQ chip state restored (PIC master + slave + IOAPIC + PIT)");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Restore PIT state.
|
||||
pub fn restore_pit(vm_fd: &VmFd, pit: &SerializablePitState) -> Result<()> {
|
||||
let mut kvm_pit = kvm_pit_state2::default();
|
||||
kvm_pit.flags = pit.flags;
|
||||
|
||||
for (i, ch) in pit.channels.iter().enumerate() {
|
||||
if i < kvm_pit.channels.len() {
|
||||
kvm_pit.channels[i] = kvm_pit_channel_state {
|
||||
count: ch.count,
|
||||
latched_count: ch.latched_count,
|
||||
count_latched: ch.count_latched,
|
||||
status_latched: ch.status_latched,
|
||||
status: ch.status,
|
||||
read_state: ch.read_state,
|
||||
write_state: ch.write_state,
|
||||
write_latch: ch.write_latch,
|
||||
rw_mode: ch.rw_mode,
|
||||
mode: ch.mode,
|
||||
bcd: ch.bcd,
|
||||
gate: ch.gate,
|
||||
count_load_time: ch.count_load_time,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
vm_fd
|
||||
.set_pit2(&kvm_pit)
|
||||
.map_err(|e| SnapshotError::Kvm(format!("set_pit2: {}", e)))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Clock Restore
|
||||
// ============================================================================
|
||||
|
||||
/// Restore KVM clock state.
|
||||
pub fn restore_clock(vm_fd: &VmFd, clock: &ClockState) -> Result<()> {
|
||||
let kvm_clock = kvm_clock_data {
|
||||
clock: clock.clock,
|
||||
flags: clock.flags,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
vm_fd
|
||||
.set_clock(&kvm_clock)
|
||||
.map_err(|e| SnapshotError::Kvm(format!("set_clock: {}", e)))?;
|
||||
|
||||
debug!("KVM clock restored: {} ns", clock.clock);
|
||||
Ok(())
|
||||
}
|
||||
877
vmm/src/storage/boot.rs
Normal file
877
vmm/src/storage/boot.rs
Normal file
@@ -0,0 +1,877 @@
|
||||
//! Stellarium Boot Integration
|
||||
//!
|
||||
//! This module provides integration between the boot loader and Stellarium
|
||||
//! storage to enable sub-50ms cold boot times. The key techniques are:
|
||||
//!
|
||||
//! 1. **Prefetching**: Boot-critical chunks (kernel, initrd) are prefetched
|
||||
//! before VM creation, ensuring they're in memory when needed.
|
||||
//!
|
||||
//! 2. **Memory Mapping**: Kernel and initrd are memory-mapped directly from
|
||||
//! Stellarium's shared regions, avoiding copies.
|
||||
//!
|
||||
//! 3. **Parallel Loading**: Chunks are fetched in parallel using async I/O.
|
||||
//!
|
||||
//! # Boot Flow
|
||||
//!
|
||||
//! ```text
|
||||
//! ┌─────────────────────────────────┐
|
||||
//! │ Stellarium Boot Loader │
|
||||
//! └─────────────────┬───────────────┘
|
||||
//! │
|
||||
//! ┌───────────────────────────┼───────────────────────────┐
|
||||
//! │ │ │
|
||||
//! ▼ ▼ ▼
|
||||
//! ┌───────────┐ ┌──────────────┐ ┌──────────────┐
|
||||
//! │ Prefetch │ │ Memory Map │ │ Memory Map │
|
||||
//! │ Boot Meta │ │ Kernel │ │ Initrd │
|
||||
//! └─────┬─────┘ └──────┬───────┘ └──────┬───────┘
|
||||
//! │ │ │
|
||||
//! │ < 1ms │ < 5ms │ < 10ms
|
||||
//! ▼ ▼ ▼
|
||||
//! ┌───────────────────────────────────────────────────────────────────┐
|
||||
//! │ Boot Ready (< 20ms total) │
|
||||
//! └───────────────────────────────────────────────────────────────────┘
|
||||
//! ```
|
||||
//!
|
||||
//! # Example
|
||||
//!
|
||||
//! ```ignore
|
||||
//! use volt-vmm::storage::{StellariumClient, StellariumBootLoader, PrefetchStrategy};
|
||||
//!
|
||||
//! let client = StellariumClient::connect_default()?;
|
||||
//! let boot_loader = StellariumBootLoader::new(client);
|
||||
//!
|
||||
//! // Prefetch with aggressive strategy for coldstart
|
||||
//! let boot_config = StellariumBootConfig {
|
||||
//! kernel_volume: "kernels/linux-6.6".into(),
|
||||
//! kernel_path: "/vmlinux".into(),
|
||||
//! initrd_volume: "rootfs/alpine-3.19".into(),
|
||||
//! initrd_path: "/initrd.img".into(),
|
||||
//! prefetch_strategy: PrefetchStrategy::Aggressive,
|
||||
//! ..Default::default()
|
||||
//! };
|
||||
//!
|
||||
//! let boot_result = boot_loader.prepare(&boot_config).await?;
|
||||
//! // boot_result contains memory-mapped kernel and initrd ready for VM
|
||||
//! ```
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::io::{self, Read, Seek, SeekFrom};
|
||||
use std::path::PathBuf;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use super::{
|
||||
ChunkHandle, ChunkStore, ContentHash, StellariumClient, StellariumError,
|
||||
StellariumVolume, StorageStats, VolumeStore, DEFAULT_CHUNK_SIZE,
|
||||
MAX_PREFETCH_PARALLEL, hash,
|
||||
};
|
||||
|
||||
// ============================================================================
|
||||
// Configuration
|
||||
// ============================================================================
|
||||
|
||||
/// Strategy for prefetching boot chunks
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
|
||||
pub enum PrefetchStrategy {
|
||||
/// No prefetching - fetch on demand
|
||||
None,
|
||||
/// Prefetch only critical boot chunks (kernel entry, first initrd chunks)
|
||||
#[default]
|
||||
Minimal,
|
||||
/// Prefetch kernel + initrd headers and metadata
|
||||
Standard,
|
||||
/// Prefetch entire kernel and initrd
|
||||
Aggressive,
|
||||
/// Custom - use specified chunk count
|
||||
Custom(usize),
|
||||
}
|
||||
|
||||
impl PrefetchStrategy {
|
||||
/// Get the number of chunks to prefetch for each boot component
|
||||
pub fn chunk_count(&self, total_chunks: usize) -> usize {
|
||||
match self {
|
||||
PrefetchStrategy::None => 0,
|
||||
PrefetchStrategy::Minimal => 4.min(total_chunks),
|
||||
PrefetchStrategy::Standard => 32.min(total_chunks),
|
||||
PrefetchStrategy::Aggressive => total_chunks,
|
||||
PrefetchStrategy::Custom(n) => (*n).min(total_chunks),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Boot configuration for Stellarium-backed images
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct StellariumBootConfig {
|
||||
/// Volume containing the kernel
|
||||
pub kernel_volume: String,
|
||||
/// Path to kernel within volume (or chunk hash directly)
|
||||
pub kernel_path: String,
|
||||
/// Volume containing the initrd (may be same as kernel)
|
||||
pub initrd_volume: Option<String>,
|
||||
/// Path to initrd within volume
|
||||
pub initrd_path: Option<String>,
|
||||
/// Kernel command line
|
||||
pub cmdline: String,
|
||||
/// Prefetch strategy
|
||||
pub prefetch_strategy: PrefetchStrategy,
|
||||
/// Timeout for prefetch operations
|
||||
pub prefetch_timeout: Duration,
|
||||
/// Enable parallel chunk fetching
|
||||
pub parallel_fetch: bool,
|
||||
/// Maximum memory to use for boot images (0 = no limit)
|
||||
pub max_memory: usize,
|
||||
/// Cache boot images across VM restarts
|
||||
pub cache_enabled: bool,
|
||||
}
|
||||
|
||||
impl Default for StellariumBootConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
kernel_volume: String::new(),
|
||||
kernel_path: String::new(),
|
||||
initrd_volume: None,
|
||||
initrd_path: None,
|
||||
cmdline: String::from("console=ttyS0 reboot=k panic=1 pci=off"),
|
||||
prefetch_strategy: PrefetchStrategy::Standard,
|
||||
prefetch_timeout: Duration::from_secs(10),
|
||||
parallel_fetch: true,
|
||||
max_memory: 0,
|
||||
cache_enabled: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Boot Chunk Info
|
||||
// ============================================================================
|
||||
|
||||
/// Information about a boot-related chunk
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct BootChunkInfo {
|
||||
/// Content hash
|
||||
pub hash: ContentHash,
|
||||
/// Offset within the image
|
||||
pub offset: u64,
|
||||
/// Size of this chunk
|
||||
pub size: usize,
|
||||
/// Whether this chunk is critical for boot
|
||||
pub critical: bool,
|
||||
/// Chunk type (kernel header, kernel body, initrd header, etc.)
|
||||
pub chunk_type: BootChunkType,
|
||||
}
|
||||
|
||||
/// Type of boot chunk
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum BootChunkType {
|
||||
/// Kernel ELF/bzImage header
|
||||
KernelHeader,
|
||||
/// Kernel body/text
|
||||
KernelBody,
|
||||
/// Initrd header (cpio/gzip magic)
|
||||
InitrdHeader,
|
||||
/// Initrd body
|
||||
InitrdBody,
|
||||
/// Metadata chunk (manifest, signature)
|
||||
Metadata,
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Boot Result
|
||||
// ============================================================================
|
||||
|
||||
/// Result of boot preparation
|
||||
#[derive(Debug)]
|
||||
pub struct StellariumBootResult {
|
||||
/// Memory-mapped kernel data
|
||||
pub kernel: MappedBootImage,
|
||||
/// Memory-mapped initrd data (if provided)
|
||||
pub initrd: Option<MappedBootImage>,
|
||||
/// Kernel entry point (parsed from ELF/bzImage)
|
||||
pub entry_point: u64,
|
||||
/// Time spent in preparation
|
||||
pub prep_time: Duration,
|
||||
/// Chunks prefetched
|
||||
pub prefetch_stats: PrefetchStats,
|
||||
}
|
||||
|
||||
/// Statistics about prefetch operations
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub struct PrefetchStats {
|
||||
/// Number of chunks prefetched
|
||||
pub chunks_prefetched: usize,
|
||||
/// Bytes prefetched
|
||||
pub bytes_prefetched: u64,
|
||||
/// Time spent prefetching
|
||||
pub prefetch_time: Duration,
|
||||
/// Cache hits (chunks already in memory)
|
||||
pub cache_hits: usize,
|
||||
/// Cache misses (chunks fetched from daemon)
|
||||
pub cache_misses: usize,
|
||||
}
|
||||
|
||||
/// A memory-mapped boot image (kernel or initrd)
|
||||
pub struct MappedBootImage {
|
||||
/// Chunk handles maintaining the mapping
|
||||
chunks: Vec<ChunkHandle>,
|
||||
/// Total size of the image
|
||||
size: u64,
|
||||
/// Number of chunks
|
||||
chunk_count: usize,
|
||||
/// Whether image is contiguously mapped
|
||||
contiguous: bool,
|
||||
/// Assembled image data (if not contiguous)
|
||||
assembled: Option<Vec<u8>>,
|
||||
}
|
||||
|
||||
impl MappedBootImage {
|
||||
/// Get the total size of the image
|
||||
pub fn size(&self) -> u64 {
|
||||
self.size
|
||||
}
|
||||
|
||||
/// Get the number of chunks
|
||||
pub fn chunk_count(&self) -> usize {
|
||||
self.chunk_count
|
||||
}
|
||||
|
||||
/// Check if image is contiguously mapped
|
||||
pub fn is_contiguous(&self) -> bool {
|
||||
self.contiguous
|
||||
}
|
||||
|
||||
/// Get the image data as a contiguous slice
|
||||
///
|
||||
/// If the image is already contiguous, returns a reference to mapped memory.
|
||||
/// Otherwise, assembles chunks into contiguous memory (lazy).
|
||||
pub fn as_slice(&self) -> &[u8] {
|
||||
if let Some(ref assembled) = self.assembled {
|
||||
return assembled.as_slice();
|
||||
}
|
||||
|
||||
if self.contiguous && !self.chunks.is_empty() {
|
||||
// Return the first chunk's slice (they're all contiguous)
|
||||
return self.chunks[0].as_slice();
|
||||
}
|
||||
|
||||
// Should not reach here - assembled is set during construction
|
||||
// if not contiguous
|
||||
&[]
|
||||
}
|
||||
|
||||
/// Get the image data as mutable (requires assembling)
|
||||
pub fn to_vec(&self) -> Vec<u8> {
|
||||
if let Some(ref assembled) = self.assembled {
|
||||
return assembled.clone();
|
||||
}
|
||||
|
||||
let mut data = Vec::with_capacity(self.size as usize);
|
||||
for chunk in &self.chunks {
|
||||
data.extend_from_slice(chunk.as_slice());
|
||||
}
|
||||
data.truncate(self.size as usize);
|
||||
data
|
||||
}
|
||||
|
||||
/// Get a raw pointer to the start of the image
|
||||
///
|
||||
/// # Safety
|
||||
/// Only valid if the image is contiguously mapped. Caller must ensure
|
||||
/// the MappedBootImage outlives any use of the pointer.
|
||||
pub unsafe fn as_ptr(&self) -> Option<*const u8> {
|
||||
if self.contiguous && !self.chunks.is_empty() {
|
||||
Some(self.chunks[0].as_ptr())
|
||||
} else if let Some(ref assembled) = self.assembled {
|
||||
Some(assembled.as_ptr())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Read bytes at an offset
|
||||
pub fn read_at(&self, offset: u64, buf: &mut [u8]) -> io::Result<usize> {
|
||||
let data = self.as_slice();
|
||||
let start = offset as usize;
|
||||
|
||||
if start >= data.len() {
|
||||
return Ok(0);
|
||||
}
|
||||
|
||||
let available = data.len() - start;
|
||||
let to_read = buf.len().min(available);
|
||||
buf[..to_read].copy_from_slice(&data[start..start + to_read]);
|
||||
Ok(to_read)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for MappedBootImage {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("MappedBootImage")
|
||||
.field("size", &self.size)
|
||||
.field("chunk_count", &self.chunk_count)
|
||||
.field("contiguous", &self.contiguous)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Stellarium Boot Loader
|
||||
// ============================================================================
|
||||
|
||||
/// Boot loader with Stellarium storage integration
|
||||
pub struct StellariumBootLoader {
|
||||
client: StellariumClient,
|
||||
/// Boot image cache (for fast VM restarts)
|
||||
cache: Mutex<BootCache>,
|
||||
/// Statistics
|
||||
stats: Mutex<StorageStats>,
|
||||
}
|
||||
|
||||
struct BootCache {
|
||||
/// Cached kernel images by volume:path
|
||||
kernels: HashMap<String, Arc<MappedBootImage>>,
|
||||
/// Cached initrd images by volume:path
|
||||
initrds: HashMap<String, Arc<MappedBootImage>>,
|
||||
/// Maximum cache size in bytes
|
||||
max_size: usize,
|
||||
/// Current cache size
|
||||
current_size: usize,
|
||||
}
|
||||
|
||||
impl BootCache {
|
||||
fn new(max_size: usize) -> Self {
|
||||
Self {
|
||||
kernels: HashMap::new(),
|
||||
initrds: HashMap::new(),
|
||||
max_size,
|
||||
current_size: 0,
|
||||
}
|
||||
}
|
||||
|
||||
fn cache_key(volume: &str, path: &str) -> String {
|
||||
format!("{}:{}", volume, path)
|
||||
}
|
||||
|
||||
fn get_kernel(&self, volume: &str, path: &str) -> Option<Arc<MappedBootImage>> {
|
||||
self.kernels.get(&Self::cache_key(volume, path)).cloned()
|
||||
}
|
||||
|
||||
fn get_initrd(&self, volume: &str, path: &str) -> Option<Arc<MappedBootImage>> {
|
||||
self.initrds.get(&Self::cache_key(volume, path)).cloned()
|
||||
}
|
||||
|
||||
fn put_kernel(&mut self, volume: &str, path: &str, image: MappedBootImage) -> Arc<MappedBootImage> {
|
||||
let key = Self::cache_key(volume, path);
|
||||
let size = image.size as usize;
|
||||
|
||||
// Evict if necessary
|
||||
while self.current_size + size > self.max_size && !self.kernels.is_empty() {
|
||||
if let Some((k, v)) = self.kernels.iter().next().map(|(k, v)| (k.clone(), v.size)) {
|
||||
self.kernels.remove(&k);
|
||||
self.current_size = self.current_size.saturating_sub(v as usize);
|
||||
}
|
||||
}
|
||||
|
||||
let arc = Arc::new(image);
|
||||
self.kernels.insert(key, Arc::clone(&arc));
|
||||
self.current_size += size;
|
||||
arc
|
||||
}
|
||||
|
||||
fn put_initrd(&mut self, volume: &str, path: &str, image: MappedBootImage) -> Arc<MappedBootImage> {
|
||||
let key = Self::cache_key(volume, path);
|
||||
let size = image.size as usize;
|
||||
|
||||
while self.current_size + size > self.max_size && !self.initrds.is_empty() {
|
||||
if let Some((k, v)) = self.initrds.iter().next().map(|(k, v)| (k.clone(), v.size)) {
|
||||
self.initrds.remove(&k);
|
||||
self.current_size = self.current_size.saturating_sub(v as usize);
|
||||
}
|
||||
}
|
||||
|
||||
let arc = Arc::new(image);
|
||||
self.initrds.insert(key, Arc::clone(&arc));
|
||||
self.current_size += size;
|
||||
arc
|
||||
}
|
||||
}
|
||||
|
||||
impl StellariumBootLoader {
|
||||
/// Default cache size (256 MB)
|
||||
pub const DEFAULT_CACHE_SIZE: usize = 256 * 1024 * 1024;
|
||||
|
||||
/// Create a new boot loader
|
||||
pub fn new(client: StellariumClient) -> Self {
|
||||
Self::with_cache_size(client, Self::DEFAULT_CACHE_SIZE)
|
||||
}
|
||||
|
||||
/// Create a new boot loader with custom cache size
|
||||
pub fn with_cache_size(client: StellariumClient, cache_size: usize) -> Self {
|
||||
Self {
|
||||
client,
|
||||
cache: Mutex::new(BootCache::new(cache_size)),
|
||||
stats: Mutex::new(StorageStats::default()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Prepare boot images (prefetch + memory map)
|
||||
///
|
||||
/// This is the main entry point for boot preparation. It:
|
||||
/// 1. Mounts the kernel/initrd volumes
|
||||
/// 2. Prefetches critical chunks based on strategy
|
||||
/// 3. Memory-maps the images for zero-copy loading
|
||||
/// 4. Parses the kernel to find the entry point
|
||||
pub fn prepare(&self, config: &StellariumBootConfig) -> super::Result<StellariumBootResult> {
|
||||
let start = Instant::now();
|
||||
let mut prefetch_stats = PrefetchStats::default();
|
||||
|
||||
// Check cache first
|
||||
let cached_kernel = if config.cache_enabled {
|
||||
let cache = self.cache.lock().unwrap();
|
||||
cache.get_kernel(&config.kernel_volume, &config.kernel_path)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let cached_initrd = if config.cache_enabled {
|
||||
if let (Some(ref vol), Some(ref path)) = (&config.initrd_volume, &config.initrd_path) {
|
||||
let cache = self.cache.lock().unwrap();
|
||||
cache.get_initrd(vol, path)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// Load kernel
|
||||
let (kernel, kernel_entry) = if let Some(cached) = cached_kernel {
|
||||
prefetch_stats.cache_hits += cached.chunk_count;
|
||||
let entry = self.parse_kernel_entry(cached.as_slice())?;
|
||||
(self.mapped_to_owned(&cached), entry)
|
||||
} else {
|
||||
let prefetch_start = Instant::now();
|
||||
let volume = self.client.mount_volume(&config.kernel_volume)?;
|
||||
|
||||
let (mapped, entry) = self.load_kernel(&volume, &config.kernel_path, config)?;
|
||||
prefetch_stats.chunks_prefetched += mapped.chunk_count;
|
||||
prefetch_stats.bytes_prefetched += mapped.size;
|
||||
prefetch_stats.prefetch_time += prefetch_start.elapsed();
|
||||
prefetch_stats.cache_misses += mapped.chunk_count;
|
||||
|
||||
// Cache the kernel
|
||||
if config.cache_enabled {
|
||||
let mut cache = self.cache.lock().unwrap();
|
||||
let _ = cache.put_kernel(&config.kernel_volume, &config.kernel_path, mapped);
|
||||
}
|
||||
|
||||
let cache = self.cache.lock().unwrap();
|
||||
let cached = cache.get_kernel(&config.kernel_volume, &config.kernel_path).unwrap();
|
||||
(self.mapped_to_owned(&cached), entry)
|
||||
};
|
||||
|
||||
// Load initrd if specified
|
||||
let initrd = if let (Some(ref vol), Some(ref path)) = (&config.initrd_volume, &config.initrd_path) {
|
||||
if let Some(cached) = cached_initrd {
|
||||
prefetch_stats.cache_hits += cached.chunk_count;
|
||||
Some(self.mapped_to_owned(&cached))
|
||||
} else {
|
||||
let prefetch_start = Instant::now();
|
||||
let volume = self.client.mount_volume(vol)?;
|
||||
|
||||
let mapped = self.load_initrd(&volume, path, config)?;
|
||||
prefetch_stats.chunks_prefetched += mapped.chunk_count;
|
||||
prefetch_stats.bytes_prefetched += mapped.size;
|
||||
prefetch_stats.prefetch_time += prefetch_start.elapsed();
|
||||
prefetch_stats.cache_misses += mapped.chunk_count;
|
||||
|
||||
if config.cache_enabled {
|
||||
let mut cache = self.cache.lock().unwrap();
|
||||
let _ = cache.put_initrd(vol, path, mapped);
|
||||
}
|
||||
|
||||
let cache = self.cache.lock().unwrap();
|
||||
cache.get_initrd(vol, path).map(|c| self.mapped_to_owned(&c))
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
Ok(StellariumBootResult {
|
||||
kernel,
|
||||
initrd,
|
||||
entry_point: kernel_entry,
|
||||
prep_time: start.elapsed(),
|
||||
prefetch_stats,
|
||||
})
|
||||
}
|
||||
|
||||
/// Convert Arc<MappedBootImage> to owned MappedBootImage
|
||||
fn mapped_to_owned(&self, arc: &Arc<MappedBootImage>) -> MappedBootImage {
|
||||
// Create a new MappedBootImage with the same data
|
||||
// This is a shallow copy - chunks are reference counted
|
||||
MappedBootImage {
|
||||
chunks: Vec::new(), // We use assembled data instead
|
||||
size: arc.size,
|
||||
chunk_count: arc.chunk_count,
|
||||
contiguous: false,
|
||||
assembled: Some(arc.to_vec()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Prefetch boot chunks for a volume
|
||||
///
|
||||
/// This is called before VM creation to warm the chunk cache.
|
||||
pub fn prefetch_boot_chunks(
|
||||
&self,
|
||||
volume: &StellariumVolume,
|
||||
kernel_offset: u64,
|
||||
kernel_size: u64,
|
||||
initrd_offset: Option<u64>,
|
||||
initrd_size: Option<u64>,
|
||||
) -> super::Result<PrefetchStats> {
|
||||
let start = Instant::now();
|
||||
let chunk_size = volume.chunk_size() as u64;
|
||||
let mut stats = PrefetchStats::default();
|
||||
|
||||
// Collect kernel chunk hashes
|
||||
let mut hashes = Vec::new();
|
||||
let kernel_chunks = (kernel_size + chunk_size - 1) / chunk_size;
|
||||
|
||||
for i in 0..kernel_chunks.min(MAX_PREFETCH_PARALLEL as u64) {
|
||||
let offset = kernel_offset + i * chunk_size;
|
||||
if let Some(hash) = volume.chunk_at_offset(offset)? {
|
||||
hashes.push(hash);
|
||||
}
|
||||
}
|
||||
|
||||
// Collect initrd chunk hashes
|
||||
if let (Some(offset), Some(size)) = (initrd_offset, initrd_size) {
|
||||
let initrd_chunks = (size + chunk_size - 1) / chunk_size;
|
||||
for i in 0..initrd_chunks.min(MAX_PREFETCH_PARALLEL as u64) {
|
||||
let off = offset + i * chunk_size;
|
||||
if let Some(hash) = volume.chunk_at_offset(off)? {
|
||||
hashes.push(hash);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Prefetch all chunks
|
||||
if !hashes.is_empty() {
|
||||
volume.prefetch(&hashes)?;
|
||||
stats.chunks_prefetched = hashes.len();
|
||||
stats.bytes_prefetched = hashes.len() as u64 * chunk_size;
|
||||
}
|
||||
|
||||
stats.prefetch_time = start.elapsed();
|
||||
Ok(stats)
|
||||
}
|
||||
|
||||
/// Load kernel from Stellarium volume
|
||||
fn load_kernel(
|
||||
&self,
|
||||
volume: &StellariumVolume,
|
||||
path: &str,
|
||||
config: &StellariumBootConfig,
|
||||
) -> super::Result<(MappedBootImage, u64)> {
|
||||
// For now, we assume path is an offset or we scan for kernel
|
||||
// A real implementation would use a volume manifest
|
||||
|
||||
let chunk_size = volume.chunk_size();
|
||||
let volume_size = volume.size();
|
||||
|
||||
// Read first chunk to determine kernel format
|
||||
let first_hash = volume.chunk_at_offset(0)?
|
||||
.ok_or_else(|| StellariumError::ChunkNotFound("kernel first chunk".into()))?;
|
||||
let first_chunk = volume.read_chunk(&first_hash)?;
|
||||
|
||||
// Detect kernel format and size
|
||||
let (kernel_size, entry_point) = self.detect_kernel_format(&first_chunk)?;
|
||||
|
||||
// Calculate chunks needed
|
||||
let total_chunks = ((kernel_size as u64 + chunk_size as u64 - 1) / chunk_size as u64) as usize;
|
||||
let prefetch_count = config.prefetch_strategy.chunk_count(total_chunks);
|
||||
|
||||
// Prefetch based on strategy
|
||||
if prefetch_count > 0 {
|
||||
let mut hashes = Vec::with_capacity(prefetch_count);
|
||||
for i in 0..prefetch_count {
|
||||
let offset = i as u64 * chunk_size as u64;
|
||||
if let Some(hash) = volume.chunk_at_offset(offset)? {
|
||||
if !hash::is_zero(&hash) {
|
||||
hashes.push(hash);
|
||||
}
|
||||
}
|
||||
}
|
||||
if !hashes.is_empty() {
|
||||
volume.prefetch(&hashes)?;
|
||||
}
|
||||
}
|
||||
|
||||
// Load all kernel chunks
|
||||
let mut chunks = Vec::with_capacity(total_chunks);
|
||||
let mut assembled = Vec::with_capacity(kernel_size);
|
||||
|
||||
for i in 0..total_chunks {
|
||||
let offset = i as u64 * chunk_size as u64;
|
||||
if let Some(hash) = volume.chunk_at_offset(offset)? {
|
||||
let handle = volume.read_chunk_zero_copy(&hash)?;
|
||||
assembled.extend_from_slice(handle.as_slice());
|
||||
chunks.push(handle);
|
||||
}
|
||||
}
|
||||
|
||||
assembled.truncate(kernel_size);
|
||||
|
||||
let mapped = MappedBootImage {
|
||||
chunks,
|
||||
size: kernel_size as u64,
|
||||
chunk_count: total_chunks,
|
||||
contiguous: false,
|
||||
assembled: Some(assembled),
|
||||
};
|
||||
|
||||
Ok((mapped, entry_point))
|
||||
}
|
||||
|
||||
/// Load initrd from Stellarium volume
|
||||
fn load_initrd(
|
||||
&self,
|
||||
volume: &StellariumVolume,
|
||||
path: &str,
|
||||
config: &StellariumBootConfig,
|
||||
) -> super::Result<MappedBootImage> {
|
||||
let chunk_size = volume.chunk_size();
|
||||
let volume_size = volume.size();
|
||||
|
||||
// For initrd, we usually need to know the size from metadata
|
||||
// Here we assume the entire volume is the initrd
|
||||
let initrd_size = volume_size as usize;
|
||||
let total_chunks = (initrd_size + chunk_size - 1) / chunk_size;
|
||||
let prefetch_count = config.prefetch_strategy.chunk_count(total_chunks);
|
||||
|
||||
// Prefetch based on strategy
|
||||
if prefetch_count > 0 {
|
||||
let mut hashes = Vec::with_capacity(prefetch_count);
|
||||
for i in 0..prefetch_count {
|
||||
let offset = i as u64 * chunk_size as u64;
|
||||
if let Some(hash) = volume.chunk_at_offset(offset)? {
|
||||
if !hash::is_zero(&hash) {
|
||||
hashes.push(hash);
|
||||
}
|
||||
}
|
||||
}
|
||||
if !hashes.is_empty() {
|
||||
volume.prefetch(&hashes)?;
|
||||
}
|
||||
}
|
||||
|
||||
// Load all initrd chunks
|
||||
let mut chunks = Vec::with_capacity(total_chunks);
|
||||
let mut assembled = Vec::with_capacity(initrd_size);
|
||||
|
||||
for i in 0..total_chunks {
|
||||
let offset = i as u64 * chunk_size as u64;
|
||||
if let Some(hash) = volume.chunk_at_offset(offset)? {
|
||||
let handle = volume.read_chunk_zero_copy(&hash)?;
|
||||
assembled.extend_from_slice(handle.as_slice());
|
||||
chunks.push(handle);
|
||||
}
|
||||
}
|
||||
|
||||
assembled.truncate(initrd_size);
|
||||
|
||||
Ok(MappedBootImage {
|
||||
chunks,
|
||||
size: initrd_size as u64,
|
||||
chunk_count: total_chunks,
|
||||
contiguous: false,
|
||||
assembled: Some(assembled),
|
||||
})
|
||||
}
|
||||
|
||||
/// Detect kernel format and extract entry point
|
||||
fn detect_kernel_format(&self, data: &[u8]) -> super::Result<(usize, u64)> {
|
||||
if data.len() < 64 {
|
||||
return Err(StellariumError::InvalidChunkSize {
|
||||
expected: 64,
|
||||
actual: data.len(),
|
||||
});
|
||||
}
|
||||
|
||||
// Check for ELF magic
|
||||
if &data[0..4] == b"\x7FELF" {
|
||||
return self.parse_elf_header(data);
|
||||
}
|
||||
|
||||
// Check for bzImage magic (at offset 0x202)
|
||||
if data.len() > 0x210 && &data[0x202..0x206] == b"HdrS" {
|
||||
return self.parse_bzimage_header(data);
|
||||
}
|
||||
|
||||
// Check for ARM64 Image magic
|
||||
if data.len() > 64 && &data[56..60] == b"ARM\x64" {
|
||||
return self.parse_arm64_header(data);
|
||||
}
|
||||
|
||||
// Unknown format - assume raw kernel at 1MB entry
|
||||
Ok((data.len(), 0x100000))
|
||||
}
|
||||
|
||||
/// Parse ELF header for kernel
|
||||
fn parse_elf_header(&self, data: &[u8]) -> super::Result<(usize, u64)> {
|
||||
// ELF64 header
|
||||
if data.len() < 64 {
|
||||
return Err(StellariumError::ChunkNotFound("ELF header too short".into()));
|
||||
}
|
||||
|
||||
// Check 64-bit
|
||||
if data[4] != 2 {
|
||||
return Err(StellariumError::ChunkNotFound("Not a 64-bit ELF".into()));
|
||||
}
|
||||
|
||||
// Little endian
|
||||
let le = data[5] == 1;
|
||||
|
||||
let entry = if le {
|
||||
u64::from_le_bytes([
|
||||
data[24], data[25], data[26], data[27],
|
||||
data[28], data[29], data[30], data[31],
|
||||
])
|
||||
} else {
|
||||
u64::from_be_bytes([
|
||||
data[24], data[25], data[26], data[27],
|
||||
data[28], data[29], data[30], data[31],
|
||||
])
|
||||
};
|
||||
|
||||
// Get program header info to calculate total size
|
||||
let ph_off = if le {
|
||||
u64::from_le_bytes([
|
||||
data[32], data[33], data[34], data[35],
|
||||
data[36], data[37], data[38], data[39],
|
||||
])
|
||||
} else {
|
||||
u64::from_be_bytes([
|
||||
data[32], data[33], data[34], data[35],
|
||||
data[36], data[37], data[38], data[39],
|
||||
])
|
||||
};
|
||||
|
||||
let ph_ent_size = if le {
|
||||
u16::from_le_bytes([data[54], data[55]])
|
||||
} else {
|
||||
u16::from_be_bytes([data[54], data[55]])
|
||||
};
|
||||
|
||||
let ph_num = if le {
|
||||
u16::from_le_bytes([data[56], data[57]])
|
||||
} else {
|
||||
u16::from_be_bytes([data[56], data[57]])
|
||||
};
|
||||
|
||||
// Estimate size (rough - need full parsing for accuracy)
|
||||
let estimated_size = (ph_off as usize) + (ph_ent_size as usize * ph_num as usize) + (4 * 1024 * 1024);
|
||||
|
||||
Ok((estimated_size.min(32 * 1024 * 1024), entry))
|
||||
}
|
||||
|
||||
/// Parse bzImage header
|
||||
fn parse_bzimage_header(&self, data: &[u8]) -> super::Result<(usize, u64)> {
|
||||
// Setup header version at 0x206
|
||||
let version = u16::from_le_bytes([data[0x206], data[0x207]]);
|
||||
|
||||
// syssize at 0x1f4 (in 16-byte units)
|
||||
let syssize = u32::from_le_bytes([data[0x1f4], data[0x1f5], data[0x1f6], data[0x1f7]]);
|
||||
let kernel_size = (syssize as usize) * 16 + 0x200 + 0x10000; // rough estimate
|
||||
|
||||
// Entry point for bzImage is typically 0x100000 (1MB)
|
||||
let entry = 0x100000u64;
|
||||
|
||||
Ok((kernel_size.min(32 * 1024 * 1024), entry))
|
||||
}
|
||||
|
||||
/// Parse ARM64 Image header
|
||||
fn parse_arm64_header(&self, data: &[u8]) -> super::Result<(usize, u64)> {
|
||||
// ARM64 kernel header
|
||||
// text_offset at offset 8
|
||||
let text_offset = u64::from_le_bytes([
|
||||
data[8], data[9], data[10], data[11],
|
||||
data[12], data[13], data[14], data[15],
|
||||
]);
|
||||
|
||||
// image_size at offset 16
|
||||
let image_size = u64::from_le_bytes([
|
||||
data[16], data[17], data[18], data[19],
|
||||
data[20], data[21], data[22], data[23],
|
||||
]);
|
||||
|
||||
// Entry is at text_offset from load address (typically 0x80080000 for ARM64)
|
||||
let entry = 0x80080000u64 + text_offset;
|
||||
|
||||
Ok((image_size as usize, entry))
|
||||
}
|
||||
|
||||
/// Parse kernel entry point from kernel data
|
||||
fn parse_kernel_entry(&self, data: &[u8]) -> super::Result<u64> {
|
||||
let (_, entry) = self.detect_kernel_format(data)?;
|
||||
Ok(entry)
|
||||
}
|
||||
|
||||
/// Get boot loader statistics
|
||||
pub fn stats(&self) -> StorageStats {
|
||||
self.stats.lock().unwrap().clone()
|
||||
}
|
||||
|
||||
/// Clear the boot cache
|
||||
pub fn clear_cache(&self) {
|
||||
let mut cache = self.cache.lock().unwrap();
|
||||
cache.kernels.clear();
|
||||
cache.initrds.clear();
|
||||
cache.current_size = 0;
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_prefetch_strategy_chunk_count() {
|
||||
assert_eq!(PrefetchStrategy::None.chunk_count(100), 0);
|
||||
assert_eq!(PrefetchStrategy::Minimal.chunk_count(100), 4);
|
||||
assert_eq!(PrefetchStrategy::Minimal.chunk_count(2), 2);
|
||||
assert_eq!(PrefetchStrategy::Standard.chunk_count(100), 32);
|
||||
assert_eq!(PrefetchStrategy::Aggressive.chunk_count(100), 100);
|
||||
assert_eq!(PrefetchStrategy::Custom(50).chunk_count(100), 50);
|
||||
assert_eq!(PrefetchStrategy::Custom(200).chunk_count(100), 100);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_boot_config_default() {
|
||||
let config = StellariumBootConfig::default();
|
||||
assert!(config.cmdline.contains("console=ttyS0"));
|
||||
assert!(config.cache_enabled);
|
||||
assert!(config.parallel_fetch);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_elf_magic_detection() {
|
||||
// Minimal ELF64 header
|
||||
let mut elf = vec![0u8; 64];
|
||||
elf[0..4].copy_from_slice(b"\x7FELF");
|
||||
elf[4] = 2; // 64-bit
|
||||
elf[5] = 1; // Little endian
|
||||
// Entry point at offset 24
|
||||
elf[24..32].copy_from_slice(&0x100000u64.to_le_bytes());
|
||||
|
||||
let loader = StellariumBootLoader {
|
||||
client: unsafe { std::mem::zeroed() }, // Not used in this test
|
||||
cache: Mutex::new(BootCache::new(0)),
|
||||
stats: Mutex::new(StorageStats::default()),
|
||||
};
|
||||
|
||||
// Can't fully test without a proper loader, but we can check format detection
|
||||
assert_eq!(&elf[0..4], b"\x7FELF");
|
||||
}
|
||||
}
|
||||
230
vmm/src/storage/mod.rs
Normal file
230
vmm/src/storage/mod.rs
Normal file
@@ -0,0 +1,230 @@
|
||||
//! Volt Stellarium Storage Integration
|
||||
//!
|
||||
//! This module provides the integration layer between Volt VMM and the
|
||||
//! Stellarium content-addressable storage (CAS) system. It enables:
|
||||
//!
|
||||
//! - **Sub-50ms boot times** through chunk prefetching and memory mapping
|
||||
//! - **Zero-copy I/O** by mapping Stellarium chunks directly into guest memory
|
||||
//! - **Copy-on-Write (CoW)** for efficient VM snapshots and deduplication
|
||||
//! - **Shared base images** across thousands of VMs
|
||||
//!
|
||||
//! # Architecture
|
||||
//!
|
||||
//! ```text
|
||||
//! ┌─────────────────────────────────────────────────────────────────┐
|
||||
//! │ Volt VMM │
|
||||
//! ├─────────────────────────────────────────────────────────────────┤
|
||||
//! │ ┌──────────────────┐ ┌──────────────────┐ ┌───────────────┐ │
|
||||
//! │ │ Boot Loader │ │ VirtIO-Stellar │ │ Guest Memory │ │
|
||||
//! │ │ (prefetch/mmap) │ │ (CAS read/CoW) │ │ (zero-copy) │ │
|
||||
//! │ └────────┬─────────┘ └────────┬─────────┘ └───────┬───────┘ │
|
||||
//! │ │ │ │ │
|
||||
//! │ ┌────────▼─────────────────────▼─────────────────────▼───────┐ │
|
||||
//! │ │ Stellarium Client │ │
|
||||
//! │ │ - Unix socket IPC to daemon │ │
|
||||
//! │ │ - Memory-mapped chunk access │ │
|
||||
//! │ │ - Delta layer management │ │
|
||||
//! │ └────────────────────────────┬───────────────────────────────┘ │
|
||||
//! └───────────────────────────────┼──────────────────────────────────┘
|
||||
//! │
|
||||
//! ┌───────────▼────────────┐
|
||||
//! │ Stellarium Daemon │
|
||||
//! │ - Content addressing │
|
||||
//! │ - Deduplication │
|
||||
//! │ - Shared mmap regions │
|
||||
//! └────────────────────────┘
|
||||
//! ```
|
||||
//!
|
||||
//! # Performance
|
||||
//!
|
||||
//! The key to achieving <50ms boot is:
|
||||
//!
|
||||
//! 1. **Prefetching**: Boot chunks (kernel, initrd) are prefetched before VM start
|
||||
//! 2. **Memory mapping**: Chunks are mapped directly, no copying required
|
||||
//! 3. **Shared pages**: Multiple VMs share the same physical pages for base images
|
||||
//! 4. **CoW deltas**: Writes go to a small delta layer, base remains shared
|
||||
//!
|
||||
//! # Example
|
||||
//!
|
||||
//! ```ignore
|
||||
//! use volt-vmm::storage::{StellariumClient, StellariumBootLoader, StellarBackend};
|
||||
//!
|
||||
//! // Connect to Stellarium daemon
|
||||
//! let client = StellariumClient::connect("/run/stellarium.sock").await?;
|
||||
//!
|
||||
//! // Mount a volume
|
||||
//! let volume = client.mount_volume("ubuntu-base-24.04").await?;
|
||||
//!
|
||||
//! // Prefetch boot chunks for fast startup
|
||||
//! let boot_loader = StellariumBootLoader::new(client.clone());
|
||||
//! boot_loader.prefetch_boot_chunks(&volume, kernel_path, initrd_path).await?;
|
||||
//!
|
||||
//! // Use as virtio-blk backend
|
||||
//! let backend = StellarBackend::new(volume)?;
|
||||
//! let block_device = VirtioBlock::new(backend);
|
||||
//! ```
|
||||
|
||||
mod boot;
|
||||
mod stellarium;
|
||||
mod virtio_stellar;
|
||||
|
||||
pub use boot::{
|
||||
StellariumBootConfig, StellariumBootLoader, StellariumBootResult,
|
||||
PrefetchStrategy, BootChunkInfo,
|
||||
};
|
||||
pub use stellarium::{
|
||||
StellariumClient, StellariumConfig, StellariumVolume, StellariumError,
|
||||
ChunkRef, ChunkHandle, MountOptions, VolumeInfo, VolumeStats,
|
||||
};
|
||||
pub use virtio_stellar::{
|
||||
StellarBackend, DeltaLayer, DeltaConfig, StellarBlockConfig,
|
||||
CoWStrategy, WriteMode,
|
||||
};
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
/// Common result type for storage operations
|
||||
pub type Result<T> = std::result::Result<T, StellariumError>;
|
||||
|
||||
/// Content hash type - 32-byte BLAKE3 hash
|
||||
pub type ContentHash = [u8; 32];
|
||||
|
||||
/// Chunk size used by Stellarium (64KB default, configurable)
|
||||
pub const DEFAULT_CHUNK_SIZE: usize = 64 * 1024;
|
||||
|
||||
/// Maximum chunks to prefetch in parallel
|
||||
pub const MAX_PREFETCH_PARALLEL: usize = 32;
|
||||
|
||||
/// Stellarium protocol version
|
||||
pub const PROTOCOL_VERSION: u32 = 1;
|
||||
|
||||
/// Storage statistics for monitoring
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub struct StorageStats {
|
||||
/// Total read operations
|
||||
pub reads: u64,
|
||||
/// Total write operations
|
||||
pub writes: u64,
|
||||
/// Cache hits (chunk already mapped)
|
||||
pub cache_hits: u64,
|
||||
/// Cache misses (required fetch from daemon)
|
||||
pub cache_misses: u64,
|
||||
/// Bytes read from CAS
|
||||
pub bytes_read: u64,
|
||||
/// Bytes written to delta layer
|
||||
pub bytes_written: u64,
|
||||
/// Zero-copy operations (direct mmap)
|
||||
pub zero_copy_ops: u64,
|
||||
/// CoW operations (copy-on-write)
|
||||
pub cow_ops: u64,
|
||||
/// Prefetch operations
|
||||
pub prefetch_ops: u64,
|
||||
/// Prefetch bytes
|
||||
pub prefetch_bytes: u64,
|
||||
}
|
||||
|
||||
/// Trait for chunk-level storage access
|
||||
///
|
||||
/// This abstracts the chunk-based storage model used by Stellarium,
|
||||
/// allowing different implementations (CAS, file-based, memory) to
|
||||
/// be used interchangeably.
|
||||
pub trait ChunkStore: Send + Sync {
|
||||
/// Read a chunk by its content hash
|
||||
fn read_chunk(&self, hash: &ContentHash) -> Result<Arc<[u8]>>;
|
||||
|
||||
/// Read a chunk with zero-copy (returns mmap'd memory if possible)
|
||||
fn read_chunk_zero_copy(&self, hash: &ContentHash) -> Result<ChunkHandle>;
|
||||
|
||||
/// Write a chunk and return its content hash
|
||||
fn write_chunk(&self, data: &[u8]) -> Result<ContentHash>;
|
||||
|
||||
/// Check if a chunk exists
|
||||
fn has_chunk(&self, hash: &ContentHash) -> Result<bool>;
|
||||
|
||||
/// Prefetch chunks (async hint to storage layer)
|
||||
fn prefetch(&self, hashes: &[ContentHash]) -> Result<()>;
|
||||
|
||||
/// Get storage statistics
|
||||
fn stats(&self) -> StorageStats;
|
||||
}
|
||||
|
||||
/// Trait for volume-level operations
|
||||
pub trait VolumeStore: ChunkStore {
|
||||
/// Get the chunk hash at a given offset
|
||||
fn chunk_at_offset(&self, offset: u64) -> Result<Option<ContentHash>>;
|
||||
|
||||
/// Get the total size of the volume
|
||||
fn size(&self) -> u64;
|
||||
|
||||
/// Get the chunk size
|
||||
fn chunk_size(&self) -> usize;
|
||||
|
||||
/// Flush pending writes
|
||||
fn flush(&self) -> Result<()>;
|
||||
|
||||
/// Create a snapshot
|
||||
fn snapshot(&self) -> Result<ContentHash>;
|
||||
}
|
||||
|
||||
/// Utility functions for content hashing
|
||||
pub mod hash {
|
||||
use super::ContentHash;
|
||||
|
||||
/// Compute BLAKE3 hash of data
|
||||
pub fn blake3(data: &[u8]) -> ContentHash {
|
||||
*blake3::hash(data).as_bytes()
|
||||
}
|
||||
|
||||
/// Compute hash of a chunk with optional key
|
||||
pub fn chunk_hash(data: &[u8], key: Option<&[u8; 32]>) -> ContentHash {
|
||||
match key {
|
||||
Some(k) => *blake3::keyed_hash(k, data).as_bytes(),
|
||||
None => blake3(data),
|
||||
}
|
||||
}
|
||||
|
||||
/// Format hash as hex string
|
||||
pub fn to_hex(hash: &ContentHash) -> String {
|
||||
hex::encode(hash)
|
||||
}
|
||||
|
||||
/// Parse hex string to hash
|
||||
pub fn from_hex(s: &str) -> Option<ContentHash> {
|
||||
let bytes = hex::decode(s).ok()?;
|
||||
if bytes.len() != 32 {
|
||||
return None;
|
||||
}
|
||||
let mut hash = [0u8; 32];
|
||||
hash.copy_from_slice(&bytes);
|
||||
Some(hash)
|
||||
}
|
||||
|
||||
/// Zero hash (represents empty/missing chunk)
|
||||
pub const ZERO_HASH: ContentHash = [0u8; 32];
|
||||
|
||||
/// Check if hash is zero
|
||||
pub fn is_zero(hash: &ContentHash) -> bool {
|
||||
hash == &ZERO_HASH
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_hash_roundtrip() {
|
||||
let data = b"Hello, Stellarium!";
|
||||
let hash = hash::blake3(data);
|
||||
let hex = hash::to_hex(&hash);
|
||||
let parsed = hash::from_hex(&hex).unwrap();
|
||||
assert_eq!(hash, parsed);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_zero_hash() {
|
||||
assert!(hash::is_zero(&hash::ZERO_HASH));
|
||||
let non_zero = hash::blake3(b"data");
|
||||
assert!(!hash::is_zero(&non_zero));
|
||||
}
|
||||
}
|
||||
928
vmm/src/storage/stellarium.rs
Normal file
928
vmm/src/storage/stellarium.rs
Normal file
@@ -0,0 +1,928 @@
|
||||
//! Stellarium Client
|
||||
//!
|
||||
//! This module provides the client interface to the Stellarium daemon,
|
||||
//! which manages content-addressable storage for VM images.
|
||||
//!
|
||||
//! # Protocol
|
||||
//!
|
||||
//! Communication with the Stellarium daemon uses a simple binary protocol
|
||||
//! over Unix domain sockets:
|
||||
//!
|
||||
//! ```text
|
||||
//! Request: [u32 version][u32 command][u32 payload_len][payload...]
|
||||
//! Response: [u32 status][u32 payload_len][payload...]
|
||||
//! ```
|
||||
//!
|
||||
//! # Memory Mapping
|
||||
//!
|
||||
//! The key performance feature is memory-mapped chunk access. When a chunk
|
||||
//! is requested, Stellarium can return a file descriptor to a shared memory
|
||||
//! region containing the chunk data. This enables:
|
||||
//!
|
||||
//! - Zero-copy reads
|
||||
//! - Shared pages across VMs using the same base image
|
||||
//! - Efficient memory usage through kernel page sharing
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::fs::File;
|
||||
use std::io::{self, Read, Write};
|
||||
use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
|
||||
use std::os::unix::net::UnixStream;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::{Arc, Mutex, RwLock};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use thiserror::Error;
|
||||
|
||||
use super::{ContentHash, StorageStats, ChunkStore, DEFAULT_CHUNK_SIZE, PROTOCOL_VERSION};
|
||||
|
||||
// ============================================================================
|
||||
// Error Types
|
||||
// ============================================================================
|
||||
|
||||
/// Errors from Stellarium operations
|
||||
#[derive(Error, Debug)]
|
||||
pub enum StellariumError {
|
||||
#[error("Failed to connect to Stellarium daemon: {0}")]
|
||||
ConnectionFailed(#[source] io::Error),
|
||||
|
||||
#[error("Protocol version mismatch: expected {expected}, got {actual}")]
|
||||
VersionMismatch { expected: u32, actual: u32 },
|
||||
|
||||
#[error("Daemon returned error: {code}: {message}")]
|
||||
DaemonError { code: u32, message: String },
|
||||
|
||||
#[error("Volume not found: {0}")]
|
||||
VolumeNotFound(String),
|
||||
|
||||
#[error("Chunk not found: {0}")]
|
||||
ChunkNotFound(String),
|
||||
|
||||
#[error("I/O error: {0}")]
|
||||
Io(#[from] io::Error),
|
||||
|
||||
#[error("Memory mapping failed: {0}")]
|
||||
MmapFailed(String),
|
||||
|
||||
#[error("Invalid response from daemon")]
|
||||
InvalidResponse,
|
||||
|
||||
#[error("Operation timed out")]
|
||||
Timeout,
|
||||
|
||||
#[error("Volume is read-only")]
|
||||
ReadOnly,
|
||||
|
||||
#[error("Delta layer full: {used} / {capacity} bytes")]
|
||||
DeltaFull { used: u64, capacity: u64 },
|
||||
|
||||
#[error("Invalid chunk size: expected {expected}, got {actual}")]
|
||||
InvalidChunkSize { expected: usize, actual: usize },
|
||||
|
||||
#[error("Authentication failed")]
|
||||
AuthFailed,
|
||||
|
||||
#[error("Permission denied: {0}")]
|
||||
PermissionDenied(String),
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Protocol Commands
|
||||
// ============================================================================
|
||||
|
||||
mod protocol {
|
||||
pub const CMD_HANDSHAKE: u32 = 0x01;
|
||||
pub const CMD_MOUNT_VOLUME: u32 = 0x10;
|
||||
pub const CMD_UNMOUNT_VOLUME: u32 = 0x11;
|
||||
pub const CMD_GET_CHUNK: u32 = 0x20;
|
||||
pub const CMD_GET_CHUNK_MMAP: u32 = 0x21;
|
||||
pub const CMD_PUT_CHUNK: u32 = 0x22;
|
||||
pub const CMD_HAS_CHUNK: u32 = 0x23;
|
||||
pub const CMD_PREFETCH: u32 = 0x24;
|
||||
pub const CMD_RESOLVE_OFFSET: u32 = 0x30;
|
||||
pub const CMD_VOLUME_INFO: u32 = 0x31;
|
||||
pub const CMD_CREATE_DELTA: u32 = 0x40;
|
||||
pub const CMD_COMMIT_DELTA: u32 = 0x41;
|
||||
pub const CMD_SNAPSHOT: u32 = 0x42;
|
||||
|
||||
pub const STATUS_OK: u32 = 0;
|
||||
pub const STATUS_NOT_FOUND: u32 = 1;
|
||||
pub const STATUS_ERROR: u32 = 2;
|
||||
pub const STATUS_AUTH_REQUIRED: u32 = 3;
|
||||
pub const STATUS_PERMISSION_DENIED: u32 = 4;
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Configuration
|
||||
// ============================================================================
|
||||
|
||||
/// Stellarium client configuration
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct StellariumConfig {
|
||||
/// Path to the Stellarium daemon socket
|
||||
pub socket_path: PathBuf,
|
||||
/// Connection timeout
|
||||
pub connect_timeout: Duration,
|
||||
/// Operation timeout
|
||||
pub operation_timeout: Duration,
|
||||
/// Maximum cached chunk handles
|
||||
pub cache_size: usize,
|
||||
/// Enable chunk prefetching
|
||||
pub prefetch_enabled: bool,
|
||||
/// Number of parallel prefetch operations
|
||||
pub prefetch_parallel: usize,
|
||||
/// Chunk size (must match daemon configuration)
|
||||
pub chunk_size: usize,
|
||||
/// Authentication token (if required)
|
||||
pub auth_token: Option<String>,
|
||||
}
|
||||
|
||||
impl Default for StellariumConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
socket_path: PathBuf::from("/run/stellarium/stellarium.sock"),
|
||||
connect_timeout: Duration::from_secs(5),
|
||||
operation_timeout: Duration::from_secs(30),
|
||||
cache_size: 1024,
|
||||
prefetch_enabled: true,
|
||||
prefetch_parallel: 16,
|
||||
chunk_size: DEFAULT_CHUNK_SIZE,
|
||||
auth_token: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Chunk Handle (Memory-Mapped Access)
|
||||
// ============================================================================
|
||||
|
||||
/// Handle to a memory-mapped chunk
|
||||
///
|
||||
/// This provides zero-copy access to chunk data by mapping the daemon's
|
||||
/// shared memory region directly into the VMM's address space.
|
||||
pub struct ChunkHandle {
|
||||
/// Pointer to mapped memory
|
||||
ptr: *const u8,
|
||||
/// Length of the mapping
|
||||
len: usize,
|
||||
/// Underlying file descriptor (for mmap)
|
||||
_fd: Option<File>,
|
||||
/// Reference to the chunk's content hash
|
||||
hash: ContentHash,
|
||||
}
|
||||
|
||||
// Safety: ChunkHandle only contains read-only data
|
||||
unsafe impl Send for ChunkHandle {}
|
||||
unsafe impl Sync for ChunkHandle {}
|
||||
|
||||
impl ChunkHandle {
|
||||
/// Create a new chunk handle from mapped memory
|
||||
///
|
||||
/// # Safety
|
||||
/// Caller must ensure ptr points to valid memory of at least `len` bytes
|
||||
/// that remains valid for the lifetime of this handle.
|
||||
pub(crate) unsafe fn from_mmap(ptr: *const u8, len: usize, fd: File, hash: ContentHash) -> Self {
|
||||
Self {
|
||||
ptr,
|
||||
len,
|
||||
_fd: Some(fd),
|
||||
hash,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a chunk handle from owned data (fallback for non-mmap case)
|
||||
pub(crate) fn from_data(data: Arc<[u8]>, hash: ContentHash) -> Self {
|
||||
let ptr = Arc::as_ptr(&data) as *const u8;
|
||||
let len = data.len();
|
||||
// Leak the Arc (increment-less); reconstructed in Drop via Arc::from_raw.
|
||||
// SAFETY: We store the raw pointer from Arc::as_ptr and reconstruct
|
||||
// the exact same Arc in Drop. The pointer includes the Arc header.
|
||||
std::mem::forget(data);
|
||||
Self {
|
||||
ptr,
|
||||
len,
|
||||
_fd: None,
|
||||
hash,
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the chunk data as a slice
|
||||
pub fn as_slice(&self) -> &[u8] {
|
||||
// Safety: ptr/len are valid by construction
|
||||
unsafe { std::slice::from_raw_parts(self.ptr, self.len) }
|
||||
}
|
||||
|
||||
/// Get the chunk's content hash
|
||||
pub fn hash(&self) -> &ContentHash {
|
||||
&self.hash
|
||||
}
|
||||
|
||||
/// Get the length of the chunk
|
||||
pub fn len(&self) -> usize {
|
||||
self.len
|
||||
}
|
||||
|
||||
/// Check if the chunk is empty
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.len == 0
|
||||
}
|
||||
|
||||
/// Get the raw pointer (for direct memory access)
|
||||
///
|
||||
/// # Safety
|
||||
/// Caller must ensure pointer is not used after ChunkHandle is dropped.
|
||||
pub unsafe fn as_ptr(&self) -> *const u8 {
|
||||
self.ptr
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for ChunkHandle {
|
||||
fn drop(&mut self) {
|
||||
if self._fd.is_none() && !self.ptr.is_null() {
|
||||
// Data was from Arc::from_data — reconstruct the Arc and drop it.
|
||||
// SAFETY: ptr was obtained from Arc::as_ptr in from_data,
|
||||
// and we are the sole owner (Arc was forgotten with refcount 1).
|
||||
unsafe {
|
||||
let raw_slice = std::slice::from_raw_parts(self.ptr, self.len)
|
||||
as *const [u8];
|
||||
let _ = Arc::from_raw(raw_slice);
|
||||
}
|
||||
}
|
||||
// For mmap case: File fd drop handled automatically, munmap happens on fd close
|
||||
}
|
||||
}
|
||||
|
||||
impl AsRef<[u8]> for ChunkHandle {
|
||||
fn as_ref(&self) -> &[u8] {
|
||||
self.as_slice()
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Chunk Reference
|
||||
// ============================================================================
|
||||
|
||||
/// Reference to a chunk (hash + optional metadata)
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ChunkRef {
|
||||
/// Content hash
|
||||
pub hash: ContentHash,
|
||||
/// Offset within volume (if applicable)
|
||||
pub offset: Option<u64>,
|
||||
/// Chunk size (may differ from default)
|
||||
pub size: usize,
|
||||
/// Compression type (0 = none, 1 = lz4, 2 = zstd)
|
||||
pub compression: u8,
|
||||
}
|
||||
|
||||
impl ChunkRef {
|
||||
/// Create a new chunk reference
|
||||
pub fn new(hash: ContentHash) -> Self {
|
||||
Self {
|
||||
hash,
|
||||
offset: None,
|
||||
size: DEFAULT_CHUNK_SIZE,
|
||||
compression: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create with explicit size
|
||||
pub fn with_size(hash: ContentHash, size: usize) -> Self {
|
||||
Self {
|
||||
hash,
|
||||
offset: None,
|
||||
size,
|
||||
compression: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Volume Information
|
||||
// ============================================================================
|
||||
|
||||
/// Information about a mounted volume
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct VolumeInfo {
|
||||
/// Volume identifier
|
||||
pub id: String,
|
||||
/// Human-readable name
|
||||
pub name: String,
|
||||
/// Total size in bytes
|
||||
pub size: u64,
|
||||
/// Chunk size for this volume
|
||||
pub chunk_size: usize,
|
||||
/// Number of chunks
|
||||
pub chunk_count: u64,
|
||||
/// Root chunk hash (merkle tree root)
|
||||
pub root_hash: ContentHash,
|
||||
/// Read-only flag
|
||||
pub read_only: bool,
|
||||
/// Creation timestamp
|
||||
pub created_at: u64,
|
||||
/// Last modified timestamp
|
||||
pub modified_at: u64,
|
||||
}
|
||||
|
||||
/// Volume statistics
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub struct VolumeStats {
|
||||
/// Chunks currently mapped
|
||||
pub mapped_chunks: u64,
|
||||
/// Bytes in delta layer
|
||||
pub delta_bytes: u64,
|
||||
/// Pending writes
|
||||
pub pending_writes: u64,
|
||||
/// Read operations since mount
|
||||
pub reads: u64,
|
||||
/// Write operations since mount
|
||||
pub writes: u64,
|
||||
}
|
||||
|
||||
/// Volume mount options
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub struct MountOptions {
|
||||
/// Mount as read-only
|
||||
pub read_only: bool,
|
||||
/// Maximum delta layer size (bytes)
|
||||
pub max_delta_size: Option<u64>,
|
||||
/// Enable direct I/O
|
||||
pub direct_io: bool,
|
||||
/// Prefetch root chunks on mount
|
||||
pub prefetch_root: bool,
|
||||
/// Custom chunk size (override volume default)
|
||||
pub chunk_size: Option<usize>,
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Stellarium Volume
|
||||
// ============================================================================
|
||||
|
||||
/// A mounted Stellarium volume
|
||||
pub struct StellariumVolume {
|
||||
/// Volume information
|
||||
info: VolumeInfo,
|
||||
/// Reference to parent client
|
||||
client: Arc<StellariumClientInner>,
|
||||
/// Volume-specific chunk cache
|
||||
chunk_cache: RwLock<HashMap<ContentHash, Arc<[u8]>>>,
|
||||
/// Chunk-to-offset mapping (lazily populated)
|
||||
offset_map: RwLock<HashMap<u64, ContentHash>>,
|
||||
/// Volume statistics
|
||||
stats: Mutex<VolumeStats>,
|
||||
/// Mount options
|
||||
options: MountOptions,
|
||||
}
|
||||
|
||||
impl StellariumVolume {
|
||||
/// Get volume information
|
||||
pub fn info(&self) -> &VolumeInfo {
|
||||
&self.info
|
||||
}
|
||||
|
||||
/// Get volume size
|
||||
pub fn size(&self) -> u64 {
|
||||
self.info.size
|
||||
}
|
||||
|
||||
/// Get chunk size
|
||||
pub fn chunk_size(&self) -> usize {
|
||||
self.options.chunk_size.unwrap_or(self.info.chunk_size)
|
||||
}
|
||||
|
||||
/// Check if volume is read-only
|
||||
pub fn is_read_only(&self) -> bool {
|
||||
self.info.read_only || self.options.read_only
|
||||
}
|
||||
|
||||
/// Get chunk hash at offset
|
||||
pub fn chunk_at_offset(&self, offset: u64) -> super::Result<Option<ContentHash>> {
|
||||
let chunk_offset = (offset / self.chunk_size() as u64) * self.chunk_size() as u64;
|
||||
|
||||
// Check cache first
|
||||
{
|
||||
let map = self.offset_map.read().unwrap();
|
||||
if let Some(hash) = map.get(&chunk_offset) {
|
||||
return Ok(Some(*hash));
|
||||
}
|
||||
}
|
||||
|
||||
// Query daemon
|
||||
let hash = self.client.resolve_chunk_offset(&self.info.id, chunk_offset)?;
|
||||
|
||||
if let Some(h) = hash {
|
||||
let mut map = self.offset_map.write().unwrap();
|
||||
map.insert(chunk_offset, h);
|
||||
}
|
||||
|
||||
Ok(hash)
|
||||
}
|
||||
|
||||
/// Read chunk by hash
|
||||
pub fn read_chunk(&self, hash: &ContentHash) -> super::Result<Arc<[u8]>> {
|
||||
// Check local cache
|
||||
{
|
||||
let cache = self.chunk_cache.read().unwrap();
|
||||
if let Some(data) = cache.get(hash) {
|
||||
let mut stats = self.stats.lock().unwrap();
|
||||
stats.reads += 1;
|
||||
return Ok(Arc::clone(data));
|
||||
}
|
||||
}
|
||||
|
||||
// Fetch from daemon
|
||||
let data = self.client.get_chunk(hash)?;
|
||||
|
||||
// Cache it
|
||||
{
|
||||
let mut cache = self.chunk_cache.write().unwrap();
|
||||
cache.insert(*hash, Arc::clone(&data));
|
||||
}
|
||||
|
||||
let mut stats = self.stats.lock().unwrap();
|
||||
stats.reads += 1;
|
||||
|
||||
Ok(data)
|
||||
}
|
||||
|
||||
/// Read chunk with zero-copy (memory-mapped)
|
||||
pub fn read_chunk_zero_copy(&self, hash: &ContentHash) -> super::Result<ChunkHandle> {
|
||||
self.client.get_chunk_mmap(hash)
|
||||
}
|
||||
|
||||
/// Prefetch chunks by hash
|
||||
pub fn prefetch(&self, hashes: &[ContentHash]) -> super::Result<()> {
|
||||
self.client.prefetch_chunks(hashes)
|
||||
}
|
||||
|
||||
/// Prefetch chunks by offset range
|
||||
pub fn prefetch_range(&self, start: u64, end: u64) -> super::Result<()> {
|
||||
let chunk_size = self.chunk_size() as u64;
|
||||
let start_chunk = start / chunk_size;
|
||||
let end_chunk = (end + chunk_size - 1) / chunk_size;
|
||||
|
||||
let mut hashes = Vec::new();
|
||||
for chunk_idx in start_chunk..end_chunk {
|
||||
let offset = chunk_idx * chunk_size;
|
||||
if let Some(hash) = self.chunk_at_offset(offset)? {
|
||||
hashes.push(hash);
|
||||
}
|
||||
}
|
||||
|
||||
self.prefetch(&hashes)
|
||||
}
|
||||
|
||||
/// Get volume statistics
|
||||
pub fn stats(&self) -> VolumeStats {
|
||||
self.stats.lock().unwrap().clone()
|
||||
}
|
||||
|
||||
/// Create a snapshot (returns root hash)
|
||||
pub fn snapshot(&self) -> super::Result<ContentHash> {
|
||||
self.client.snapshot(&self.info.id)
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for StellariumVolume {
|
||||
fn drop(&mut self) {
|
||||
// Unmount volume
|
||||
let _ = self.client.unmount_volume(&self.info.id);
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Stellarium Client (Inner)
|
||||
// ============================================================================
|
||||
|
||||
struct StellariumClientInner {
|
||||
config: StellariumConfig,
|
||||
socket: Mutex<UnixStream>,
|
||||
stats: Mutex<StorageStats>,
|
||||
}
|
||||
|
||||
impl StellariumClientInner {
|
||||
fn new(config: StellariumConfig, socket: UnixStream) -> Self {
|
||||
Self {
|
||||
config,
|
||||
socket: Mutex::new(socket),
|
||||
stats: Mutex::new(StorageStats::default()),
|
||||
}
|
||||
}
|
||||
|
||||
fn send_command(&self, cmd: u32, payload: &[u8]) -> super::Result<Vec<u8>> {
|
||||
let mut socket = self.socket.lock().unwrap();
|
||||
|
||||
// Build request: version, command, payload_len, payload
|
||||
let mut request = Vec::with_capacity(12 + payload.len());
|
||||
request.extend_from_slice(&PROTOCOL_VERSION.to_le_bytes());
|
||||
request.extend_from_slice(&cmd.to_le_bytes());
|
||||
request.extend_from_slice(&(payload.len() as u32).to_le_bytes());
|
||||
request.extend_from_slice(payload);
|
||||
|
||||
socket.write_all(&request)?;
|
||||
socket.flush()?;
|
||||
|
||||
// Read response: status, payload_len, payload
|
||||
let mut header = [0u8; 8];
|
||||
socket.read_exact(&mut header)?;
|
||||
|
||||
let status = u32::from_le_bytes([header[0], header[1], header[2], header[3]]);
|
||||
let payload_len = u32::from_le_bytes([header[4], header[5], header[6], header[7]]) as usize;
|
||||
|
||||
let mut response = vec![0u8; payload_len];
|
||||
if payload_len > 0 {
|
||||
socket.read_exact(&mut response)?;
|
||||
}
|
||||
|
||||
match status {
|
||||
protocol::STATUS_OK => Ok(response),
|
||||
protocol::STATUS_NOT_FOUND => {
|
||||
let msg = String::from_utf8_lossy(&response);
|
||||
Err(StellariumError::ChunkNotFound(msg.to_string()))
|
||||
}
|
||||
protocol::STATUS_AUTH_REQUIRED => Err(StellariumError::AuthFailed),
|
||||
protocol::STATUS_PERMISSION_DENIED => {
|
||||
let msg = String::from_utf8_lossy(&response);
|
||||
Err(StellariumError::PermissionDenied(msg.to_string()))
|
||||
}
|
||||
_ => {
|
||||
let msg = String::from_utf8_lossy(&response);
|
||||
Err(StellariumError::DaemonError { code: status, message: msg.to_string() })
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn mount_volume(&self, volume_id: &str, options: &MountOptions) -> super::Result<VolumeInfo> {
|
||||
let mut payload = Vec::new();
|
||||
payload.extend_from_slice(&(volume_id.len() as u32).to_le_bytes());
|
||||
payload.extend_from_slice(volume_id.as_bytes());
|
||||
payload.push(if options.read_only { 1 } else { 0 });
|
||||
payload.push(if options.direct_io { 1 } else { 0 });
|
||||
payload.push(if options.prefetch_root { 1 } else { 0 });
|
||||
|
||||
let response = self.send_command(protocol::CMD_MOUNT_VOLUME, &payload)?;
|
||||
|
||||
if response.len() < 88 {
|
||||
return Err(StellariumError::InvalidResponse);
|
||||
}
|
||||
|
||||
// Parse volume info from response
|
||||
let id_len = u32::from_le_bytes([response[0], response[1], response[2], response[3]]) as usize;
|
||||
let id = String::from_utf8_lossy(&response[4..4 + id_len]).to_string();
|
||||
let offset = 4 + id_len;
|
||||
|
||||
let name_len = u32::from_le_bytes([
|
||||
response[offset], response[offset + 1], response[offset + 2], response[offset + 3]
|
||||
]) as usize;
|
||||
let name = String::from_utf8_lossy(&response[offset + 4..offset + 4 + name_len]).to_string();
|
||||
let offset = offset + 4 + name_len;
|
||||
|
||||
let size = u64::from_le_bytes([
|
||||
response[offset], response[offset + 1], response[offset + 2], response[offset + 3],
|
||||
response[offset + 4], response[offset + 5], response[offset + 6], response[offset + 7],
|
||||
]);
|
||||
let chunk_size = u32::from_le_bytes([
|
||||
response[offset + 8], response[offset + 9], response[offset + 10], response[offset + 11],
|
||||
]) as usize;
|
||||
let chunk_count = u64::from_le_bytes([
|
||||
response[offset + 12], response[offset + 13], response[offset + 14], response[offset + 15],
|
||||
response[offset + 16], response[offset + 17], response[offset + 18], response[offset + 19],
|
||||
]);
|
||||
|
||||
let mut root_hash = [0u8; 32];
|
||||
root_hash.copy_from_slice(&response[offset + 20..offset + 52]);
|
||||
|
||||
let read_only = response[offset + 52] != 0;
|
||||
let created_at = u64::from_le_bytes([
|
||||
response[offset + 53], response[offset + 54], response[offset + 55], response[offset + 56],
|
||||
response[offset + 57], response[offset + 58], response[offset + 59], response[offset + 60],
|
||||
]);
|
||||
let modified_at = u64::from_le_bytes([
|
||||
response[offset + 61], response[offset + 62], response[offset + 63], response[offset + 64],
|
||||
response[offset + 65], response[offset + 66], response[offset + 67], response[offset + 68],
|
||||
]);
|
||||
|
||||
Ok(VolumeInfo {
|
||||
id,
|
||||
name,
|
||||
size,
|
||||
chunk_size,
|
||||
chunk_count,
|
||||
root_hash,
|
||||
read_only,
|
||||
created_at,
|
||||
modified_at,
|
||||
})
|
||||
}
|
||||
|
||||
fn unmount_volume(&self, volume_id: &str) -> super::Result<()> {
|
||||
let mut payload = Vec::new();
|
||||
payload.extend_from_slice(&(volume_id.len() as u32).to_le_bytes());
|
||||
payload.extend_from_slice(volume_id.as_bytes());
|
||||
|
||||
self.send_command(protocol::CMD_UNMOUNT_VOLUME, &payload)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn get_chunk(&self, hash: &ContentHash) -> super::Result<Arc<[u8]>> {
|
||||
let response = self.send_command(protocol::CMD_GET_CHUNK, hash)?;
|
||||
|
||||
let mut stats = self.stats.lock().unwrap();
|
||||
stats.reads += 1;
|
||||
stats.bytes_read += response.len() as u64;
|
||||
stats.cache_misses += 1;
|
||||
|
||||
Ok(Arc::from(response.into_boxed_slice()))
|
||||
}
|
||||
|
||||
fn get_chunk_mmap(&self, hash: &ContentHash) -> super::Result<ChunkHandle> {
|
||||
// First, request mmap access
|
||||
let response = self.send_command(protocol::CMD_GET_CHUNK_MMAP, hash)?;
|
||||
|
||||
if response.len() < 12 {
|
||||
return Err(StellariumError::InvalidResponse);
|
||||
}
|
||||
|
||||
// Response: [u64 offset][u32 len][fd passed via SCM_RIGHTS]
|
||||
let mmap_offset = u64::from_le_bytes([
|
||||
response[0], response[1], response[2], response[3],
|
||||
response[4], response[5], response[6], response[7],
|
||||
]);
|
||||
let mmap_len = u32::from_le_bytes([
|
||||
response[8], response[9], response[10], response[11],
|
||||
]) as usize;
|
||||
|
||||
// For this implementation, we'll fall back to regular read + manual mmap
|
||||
// A full implementation would use SCM_RIGHTS to receive the fd
|
||||
let data = self.get_chunk(hash)?;
|
||||
|
||||
let mut stats = self.stats.lock().unwrap();
|
||||
stats.zero_copy_ops += 1;
|
||||
|
||||
Ok(ChunkHandle::from_data(data, *hash))
|
||||
}
|
||||
|
||||
fn put_chunk(&self, data: &[u8]) -> super::Result<ContentHash> {
|
||||
let response = self.send_command(protocol::CMD_PUT_CHUNK, data)?;
|
||||
|
||||
if response.len() != 32 {
|
||||
return Err(StellariumError::InvalidResponse);
|
||||
}
|
||||
|
||||
let mut hash = [0u8; 32];
|
||||
hash.copy_from_slice(&response);
|
||||
|
||||
let mut stats = self.stats.lock().unwrap();
|
||||
stats.writes += 1;
|
||||
stats.bytes_written += data.len() as u64;
|
||||
|
||||
Ok(hash)
|
||||
}
|
||||
|
||||
fn has_chunk(&self, hash: &ContentHash) -> super::Result<bool> {
|
||||
let response = self.send_command(protocol::CMD_HAS_CHUNK, hash)?;
|
||||
Ok(!response.is_empty() && response[0] != 0)
|
||||
}
|
||||
|
||||
fn prefetch_chunks(&self, hashes: &[ContentHash]) -> super::Result<()> {
|
||||
// Flatten hashes into payload
|
||||
let mut payload = Vec::with_capacity(4 + hashes.len() * 32);
|
||||
payload.extend_from_slice(&(hashes.len() as u32).to_le_bytes());
|
||||
for hash in hashes {
|
||||
payload.extend_from_slice(hash);
|
||||
}
|
||||
|
||||
self.send_command(protocol::CMD_PREFETCH, &payload)?;
|
||||
|
||||
let mut stats = self.stats.lock().unwrap();
|
||||
stats.prefetch_ops += 1;
|
||||
stats.prefetch_bytes += (hashes.len() * self.config.chunk_size) as u64;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn resolve_chunk_offset(&self, volume_id: &str, offset: u64) -> super::Result<Option<ContentHash>> {
|
||||
let mut payload = Vec::new();
|
||||
payload.extend_from_slice(&(volume_id.len() as u32).to_le_bytes());
|
||||
payload.extend_from_slice(volume_id.as_bytes());
|
||||
payload.extend_from_slice(&offset.to_le_bytes());
|
||||
|
||||
let response = self.send_command(protocol::CMD_RESOLVE_OFFSET, &payload)?;
|
||||
|
||||
if response.is_empty() {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
if response.len() != 32 {
|
||||
return Err(StellariumError::InvalidResponse);
|
||||
}
|
||||
|
||||
let mut hash = [0u8; 32];
|
||||
hash.copy_from_slice(&response);
|
||||
|
||||
if super::hash::is_zero(&hash) {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
Ok(Some(hash))
|
||||
}
|
||||
|
||||
fn snapshot(&self, volume_id: &str) -> super::Result<ContentHash> {
|
||||
let mut payload = Vec::new();
|
||||
payload.extend_from_slice(&(volume_id.len() as u32).to_le_bytes());
|
||||
payload.extend_from_slice(volume_id.as_bytes());
|
||||
|
||||
let response = self.send_command(protocol::CMD_SNAPSHOT, &payload)?;
|
||||
|
||||
if response.len() != 32 {
|
||||
return Err(StellariumError::InvalidResponse);
|
||||
}
|
||||
|
||||
let mut hash = [0u8; 32];
|
||||
hash.copy_from_slice(&response);
|
||||
Ok(hash)
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Stellarium Client (Public)
|
||||
// ============================================================================
|
||||
|
||||
/// Client for communicating with the Stellarium storage daemon
|
||||
pub struct StellariumClient {
|
||||
inner: Arc<StellariumClientInner>,
|
||||
}
|
||||
|
||||
impl StellariumClient {
|
||||
/// Connect to the Stellarium daemon at the default socket path
|
||||
pub fn connect_default() -> super::Result<Self> {
|
||||
Self::connect_with_config(StellariumConfig::default())
|
||||
}
|
||||
|
||||
/// Connect to the Stellarium daemon at the given socket path
|
||||
pub fn connect<P: AsRef<Path>>(socket_path: P) -> super::Result<Self> {
|
||||
let mut config = StellariumConfig::default();
|
||||
config.socket_path = socket_path.as_ref().to_path_buf();
|
||||
Self::connect_with_config(config)
|
||||
}
|
||||
|
||||
/// Connect with full configuration
|
||||
pub fn connect_with_config(config: StellariumConfig) -> super::Result<Self> {
|
||||
let socket = UnixStream::connect(&config.socket_path)
|
||||
.map_err(StellariumError::ConnectionFailed)?;
|
||||
|
||||
socket.set_read_timeout(Some(config.operation_timeout))?;
|
||||
socket.set_write_timeout(Some(config.operation_timeout))?;
|
||||
|
||||
let inner = Arc::new(StellariumClientInner::new(config.clone(), socket));
|
||||
|
||||
// Perform handshake
|
||||
let mut handshake_payload = Vec::new();
|
||||
handshake_payload.extend_from_slice(&PROTOCOL_VERSION.to_le_bytes());
|
||||
if let Some(ref token) = config.auth_token {
|
||||
handshake_payload.extend_from_slice(&(token.len() as u32).to_le_bytes());
|
||||
handshake_payload.extend_from_slice(token.as_bytes());
|
||||
} else {
|
||||
handshake_payload.extend_from_slice(&0u32.to_le_bytes());
|
||||
}
|
||||
|
||||
let response = inner.send_command(protocol::CMD_HANDSHAKE, &handshake_payload)?;
|
||||
|
||||
if response.len() >= 4 {
|
||||
let daemon_version = u32::from_le_bytes([response[0], response[1], response[2], response[3]]);
|
||||
if daemon_version != PROTOCOL_VERSION {
|
||||
return Err(StellariumError::VersionMismatch {
|
||||
expected: PROTOCOL_VERSION,
|
||||
actual: daemon_version,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
Ok(Self { inner })
|
||||
}
|
||||
|
||||
/// Mount a volume
|
||||
pub fn mount_volume(&self, volume_id: &str) -> super::Result<StellariumVolume> {
|
||||
self.mount_volume_with_options(volume_id, MountOptions::default())
|
||||
}
|
||||
|
||||
/// Mount a volume with options
|
||||
pub fn mount_volume_with_options(
|
||||
&self,
|
||||
volume_id: &str,
|
||||
options: MountOptions,
|
||||
) -> super::Result<StellariumVolume> {
|
||||
let info = self.inner.mount_volume(volume_id, &options)?;
|
||||
|
||||
Ok(StellariumVolume {
|
||||
info,
|
||||
client: Arc::clone(&self.inner),
|
||||
chunk_cache: RwLock::new(HashMap::new()),
|
||||
offset_map: RwLock::new(HashMap::new()),
|
||||
stats: Mutex::new(VolumeStats::default()),
|
||||
options,
|
||||
})
|
||||
}
|
||||
|
||||
/// Get a chunk by hash (without mounting a volume)
|
||||
pub fn get_chunk(&self, hash: &ContentHash) -> super::Result<Arc<[u8]>> {
|
||||
self.inner.get_chunk(hash)
|
||||
}
|
||||
|
||||
/// Get a chunk with zero-copy access
|
||||
pub fn get_chunk_mmap(&self, hash: &ContentHash) -> super::Result<ChunkHandle> {
|
||||
self.inner.get_chunk_mmap(hash)
|
||||
}
|
||||
|
||||
/// Store a chunk and return its hash
|
||||
pub fn put_chunk(&self, data: &[u8]) -> super::Result<ContentHash> {
|
||||
self.inner.put_chunk(data)
|
||||
}
|
||||
|
||||
/// Check if a chunk exists
|
||||
pub fn has_chunk(&self, hash: &ContentHash) -> super::Result<bool> {
|
||||
self.inner.has_chunk(hash)
|
||||
}
|
||||
|
||||
/// Prefetch multiple chunks
|
||||
pub fn prefetch(&self, hashes: &[ContentHash]) -> super::Result<()> {
|
||||
self.inner.prefetch_chunks(hashes)
|
||||
}
|
||||
|
||||
/// Get client statistics
|
||||
pub fn stats(&self) -> StorageStats {
|
||||
self.inner.stats.lock().unwrap().clone()
|
||||
}
|
||||
|
||||
/// Get configuration
|
||||
pub fn config(&self) -> &StellariumConfig {
|
||||
&self.inner.config
|
||||
}
|
||||
}
|
||||
|
||||
impl Clone for StellariumClient {
|
||||
fn clone(&self) -> Self {
|
||||
// Note: This shares the underlying connection
|
||||
// For true parallelism, create multiple client instances
|
||||
Self {
|
||||
inner: Arc::clone(&self.inner),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ChunkStore for StellariumClient {
|
||||
fn read_chunk(&self, hash: &ContentHash) -> super::Result<Arc<[u8]>> {
|
||||
self.get_chunk(hash)
|
||||
}
|
||||
|
||||
fn read_chunk_zero_copy(&self, hash: &ContentHash) -> super::Result<ChunkHandle> {
|
||||
self.get_chunk_mmap(hash)
|
||||
}
|
||||
|
||||
fn write_chunk(&self, data: &[u8]) -> super::Result<ContentHash> {
|
||||
self.put_chunk(data)
|
||||
}
|
||||
|
||||
fn has_chunk(&self, hash: &ContentHash) -> super::Result<bool> {
|
||||
StellariumClient::has_chunk(self, hash)
|
||||
}
|
||||
|
||||
fn prefetch(&self, hashes: &[ContentHash]) -> super::Result<()> {
|
||||
StellariumClient::prefetch(self, hashes)
|
||||
}
|
||||
|
||||
fn stats(&self) -> StorageStats {
|
||||
StellariumClient::stats(self)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_chunk_ref_default_size() {
|
||||
let hash = [0u8; 32];
|
||||
let chunk_ref = ChunkRef::new(hash);
|
||||
assert_eq!(chunk_ref.size, DEFAULT_CHUNK_SIZE);
|
||||
assert_eq!(chunk_ref.compression, 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_config_default() {
|
||||
let config = StellariumConfig::default();
|
||||
assert!(config.socket_path.to_str().unwrap().contains("stellarium"));
|
||||
assert_eq!(config.chunk_size, DEFAULT_CHUNK_SIZE);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_chunk_handle_from_data() {
|
||||
let data: Arc<[u8]> = Arc::from(vec![1, 2, 3, 4].into_boxed_slice());
|
||||
let hash = [42u8; 32];
|
||||
let handle = ChunkHandle::from_data(data, hash);
|
||||
|
||||
assert_eq!(handle.len(), 4);
|
||||
assert_eq!(handle.as_slice(), &[1, 2, 3, 4]);
|
||||
assert_eq!(handle.hash(), &hash);
|
||||
}
|
||||
}
|
||||
72
vmm/tests/snapshot_test.rs
Normal file
72
vmm/tests/snapshot_test.rs
Normal file
@@ -0,0 +1,72 @@
|
||||
//! Integration test for snapshot/restore
|
||||
//!
|
||||
//! Tests:
|
||||
//! 1. Create a VM with KVM
|
||||
//! 2. Load kernel and boot
|
||||
//! 3. Pause vCPUs
|
||||
//! 4. Create a snapshot
|
||||
//! 5. Restore from snapshot
|
||||
//! 6. Verify restore is faster than cold boot
|
||||
|
||||
use std::path::Path;
|
||||
use std::time::Instant;
|
||||
|
||||
/// Test that the snapshot module compiles and basic types work
|
||||
#[test]
|
||||
fn test_snapshot_types_roundtrip() {
|
||||
// We can't use volt-vmm internals directly since it's a bin crate,
|
||||
// but we can verify the basic snapshot format by creating and parsing JSON
|
||||
let snapshot_json = r#"{
|
||||
"metadata": {
|
||||
"version": 1,
|
||||
"memory_size": 134217728,
|
||||
"vcpu_count": 1,
|
||||
"created_at": 1234567890,
|
||||
"state_crc64": 0,
|
||||
"memory_file_size": 134217728
|
||||
},
|
||||
"vcpu_states": [],
|
||||
"irqchip": {
|
||||
"pic_master": {"raw_data": []},
|
||||
"pic_slave": {"raw_data": []},
|
||||
"ioapic": {"raw_data": []},
|
||||
"pit": {"channels": [], "flags": 0}
|
||||
},
|
||||
"clock": {"clock": 0, "flags": 0},
|
||||
"devices": {
|
||||
"serial": {
|
||||
"dlab": false, "ier": 0, "lcr": 0, "mcr": 0,
|
||||
"lsr": 96, "msr": 0, "scr": 0, "dll": 0, "dlh": 0,
|
||||
"thr_interrupt_pending": false, "input_buffer": []
|
||||
},
|
||||
"virtio_blk": null,
|
||||
"virtio_net": null,
|
||||
"mmio_transports": []
|
||||
},
|
||||
"memory_regions": [
|
||||
{"guest_addr": 0, "size": 134217728, "file_offset": 0}
|
||||
]
|
||||
}"#;
|
||||
|
||||
// Verify it parses as valid JSON
|
||||
let parsed: serde_json::Value = serde_json::from_str(snapshot_json).unwrap();
|
||||
assert_eq!(parsed["metadata"]["version"], 1);
|
||||
assert_eq!(parsed["metadata"]["memory_size"], 134217728);
|
||||
assert_eq!(parsed["metadata"]["vcpu_count"], 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_crc64_deterministic() {
|
||||
// Test that CRC-64 computation is deterministic
|
||||
let data = b"Hello, Volt snapshot!";
|
||||
|
||||
// Use the crc crate directly
|
||||
use crc::{Crc, CRC_64_ECMA_182};
|
||||
const CRC64: Crc<u64> = Crc::<u64>::new(&CRC_64_ECMA_182);
|
||||
|
||||
let crc1 = CRC64.checksum(data);
|
||||
let crc2 = CRC64.checksum(data);
|
||||
|
||||
assert_eq!(crc1, crc2);
|
||||
assert_ne!(crc1, 0); // Very unlikely to be zero for non-empty data
|
||||
}
|
||||
Reference in New Issue
Block a user