From 8aaf3734aadbf550ad7df06328f5c7964fc38220 Mon Sep 17 00:00:00 2001 From: Bo Chen Date: Sat, 21 Feb 2026 01:32:02 +0000 Subject: [PATCH 001/742] build: Temporarily remove the vfio CI worker This runner machine is out for maintenance. Signed-off-by: Bo Chen --- .github/workflows/integration-vfio.yaml | 33 ------------------------- 1 file changed, 33 deletions(-) delete mode 100644 .github/workflows/integration-vfio.yaml diff --git a/.github/workflows/integration-vfio.yaml b/.github/workflows/integration-vfio.yaml deleted file mode 100644 index 218e897270..0000000000 --- a/.github/workflows/integration-vfio.yaml +++ /dev/null @@ -1,33 +0,0 @@ -name: Cloud Hypervisor Tests (VFIO) -on: [merge_group, pull_request] -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event_name }} - cancel-in-progress: true - -jobs: - build: - name: Tests (VFIO) - runs-on: ${{ github.event_name == 'pull_request' && 'ubuntu-latest' || 'vfio-nvidia' }} - env: - AUTH_DOWNLOAD_TOKEN: ${{ secrets.AUTH_DOWNLOAD_TOKEN }} - steps: - - name: Fix workspace permissions - if: ${{ github.event_name != 'pull_request' }} - run: sudo chown -R runner:runner ${GITHUB_WORKSPACE} - - name: Code checkout - if: ${{ github.event_name != 'pull_request' }} - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - name: Run VFIO integration tests - if: ${{ github.event_name != 'pull_request' }} - timeout-minutes: 15 - run: scripts/dev_cli.sh tests --integration-vfio - # Most tests are failing with musl see #6790 - # - name: Run VFIO integration tests for musl - # if: ${{ github.event_name != 'pull_request' }} - # timeout-minutes: 15 - # run: scripts/dev_cli.sh tests --integration-vfio --libc musl - - name: Skipping build for PR - if: ${{ github.event_name == 'pull_request' }} - run: echo "Skipping build for PR" From ec5374cd9988beb6e21d913efc64d1e4f90c5a3e Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Sun, 22 Feb 2026 00:22:49 +0100 Subject: [PATCH 
002/742] performance-metrics: Set image_type for backing file tests Add explicit image_type=qcow2 along backing_files=on for the relevant QCOW2 perf tests. Signed-off-by: Anatol Belski --- performance-metrics/src/performance_tests.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/performance-metrics/src/performance_tests.rs b/performance-metrics/src/performance_tests.rs index b959fce5c3..7eb07f368a 100644 --- a/performance-metrics/src/performance_tests.rs +++ b/performance-metrics/src/performance_tests.rs @@ -441,7 +441,7 @@ pub fn performance_block_io(control: &PerformanceTestControl) -> f64 { let mut test_disk_arg = format!("path={test_file},queue_size={queue_size},num_queues={num_queues}"); if test_file == OVERLAY_WITH_QCOW2_BACKING || test_file == OVERLAY_WITH_RAW_BACKING { - test_disk_arg.push_str(",backing_files=on"); + test_disk_arg.push_str(",image_type=qcow2,backing_files=on"); } let mut child = GuestCommand::new(&guest) From 600e74f0af86a2897c397fea537fb20424dc51fe Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Mon, 23 Feb 2026 12:06:31 +0100 Subject: [PATCH 003/742] scripts: dev_cli.sh: Allow passing commands to shell subcommand Accept arguments after -- in 'dev_cli.sh shell' and forward them to 'bash -c' inside the container. When no arguments are given, an interactive shell is started as before. This enables running one-off commands in the CI container without an interactive session, for example: ./scripts/dev_cli.sh shell -- rustup toolchain install nightly \&\& cargo +nightly fmt --all -- --check Signed-off-by: Anatol Belski --- scripts/dev_cli.sh | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/scripts/dev_cli.sh b/scripts/dev_cli.sh index e537c499c2..0e190a0072 100755 --- a/scripts/dev_cli.sh +++ b/scripts/dev_cli.sh @@ -706,10 +706,21 @@ cmd_shell() { ensure_build_dir ensure_latest_ctr process_volumes_args - say_warn "Starting a privileged shell prompt as root ..." 
- say_warn "WARNING: Your $CLH_ROOT_DIR folder will be bind-mounted in the container under $CTR_CLH_ROOT_DIR" + + # Remaining args after -- are passed as a command to bash -c. + # With no args, an interactive shell is started. + tty_args="-ti" + shell_args=() + if [ $# -gt 0 ]; then + tty_args="" + shell_args+=("-c" "$*") + else + say_warn "Starting a privileged shell prompt as root ..." + say_warn "WARNING: Your $CLH_ROOT_DIR folder will be bind-mounted in the container under $CTR_CLH_ROOT_DIR" + fi + $DOCKER_RUNTIME run \ - -ti \ + $tty_args \ --workdir "$CTR_CLH_ROOT_DIR" \ --rm \ --privileged \ @@ -723,7 +734,8 @@ cmd_shell() { --volume "$CLH_INTEGRATION_WORKLOADS:$CTR_CLH_INTEGRATION_WORKLOADS" \ --env USER="root" \ --entrypoint bash \ - "$CTR_IMAGE" + "$CTR_IMAGE" \ + "${shell_args[@]}" fix_dir_perms $? } From 184a229ca4a50c8292863fbe70e1b776fbd04772 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Mon, 23 Feb 2026 18:02:01 +0100 Subject: [PATCH 004/742] tests: vhdx: Enable VHDX fstrim integration test Remove the #[ignore] attribute from test_virtio_block_fstrim_unsupported_vhdx. Signed-off-by: Anatol Belski --- cloud-hypervisor/tests/integration.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 1aad1a372c..e9d2e66c1b 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -7793,11 +7793,7 @@ mod common_parallel { _test_virtio_block_fstrim("vhd", "vpc", &["-o", "subformat=fixed"], false, false); } - // VHDX backend has a multiqueue bug causing filesystem corruption. - // The _test_virtio_block_fstrim helper uses num_queues>1 which triggers the bug. 
- // Ref: #7665 #[test] - #[ignore] fn test_virtio_block_fstrim_unsupported_vhdx() { _test_virtio_block_fstrim("vhdx", "vhdx", &[], false, false); } From 6f19d0071db4d447058d05a5f64a6426ea6d0117 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Mon, 23 Feb 2026 18:03:08 +0100 Subject: [PATCH 005/742] block: vhdx: Fix multiqueue data corruption Wrap the Vhdx instance in Arc> so that all queues share a single mutex-protected backend, matching the approach already used for QCOW2. Vhdx::clone() uses dup() which shares the kernel file description including the file offset. With multiple queues performing concurrent seek+read/write on the shared offset, I/O operations race and corrupt data. Fixes: #7665 Signed-off-by: Anatol Belski --- block/src/vhdx_sync.rs | 46 +++++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/block/src/vhdx_sync.rs b/block/src/vhdx_sync.rs index fc236c15df..0a0dc47bc2 100644 --- a/block/src/vhdx_sync.rs +++ b/block/src/vhdx_sync.rs @@ -5,6 +5,7 @@ use std::collections::VecDeque; use std::fs::File; use std::os::fd::AsRawFd; +use std::sync::{Arc, Mutex}; use vmm_sys_util::eventfd::EventFd; @@ -15,24 +16,32 @@ use crate::vhdx::{Result as VhdxResult, Vhdx}; use crate::{AsyncAdaptor, BlockBackend, Error}; pub struct VhdxDiskSync { - vhdx_file: Vhdx, + // FIXME: The Mutex serializes all VHDX I/O operations across queues, which + // is necessary for correctness but eliminates any parallelism benefit from + // multiqueue. Vhdx::clone() shares the underlying file description across + // threads, so concurrent I/O from multiple queues races on the file offset + // causing data corruption. + // + // A proper fix would require restructuring the VHDX I/O path so that data + // operations can proceed in parallel with independent file descriptors. 
+ vhdx_file: Arc>, } impl VhdxDiskSync { pub fn new(f: File) -> VhdxResult { Ok(VhdxDiskSync { - vhdx_file: Vhdx::new(f)?, + vhdx_file: Arc::new(Mutex::new(Vhdx::new(f)?)), }) } } impl DiskFile for VhdxDiskSync { fn logical_size(&mut self) -> DiskFileResult { - Ok(self.vhdx_file.virtual_disk_size()) + Ok(self.vhdx_file.lock().unwrap().virtual_disk_size()) } fn physical_size(&mut self) -> DiskFileResult { - self.vhdx_file.physical_size().map_err(|e| { + self.vhdx_file.lock().unwrap().physical_size().map_err(|e| { let io_inner = match e { Error::GetFileMetadata(e) => e, _ => unreachable!(), @@ -42,30 +51,28 @@ impl DiskFile for VhdxDiskSync { } fn new_async_io(&self, _ring_depth: u32) -> DiskFileResult> { - Ok( - Box::new(VhdxSync::new(self.vhdx_file.clone()).map_err(DiskFileError::NewAsyncIo)?) - as Box, - ) + Ok(Box::new(VhdxSync::new(Arc::clone(&self.vhdx_file))) as Box) } fn fd(&mut self) -> BorrowedDiskFd<'_> { - BorrowedDiskFd::new(self.vhdx_file.as_raw_fd()) + BorrowedDiskFd::new(self.vhdx_file.lock().unwrap().as_raw_fd()) } } pub struct VhdxSync { - vhdx_file: Vhdx, + vhdx_file: Arc>, eventfd: EventFd, completion_list: VecDeque<(u64, i32)>, } impl VhdxSync { - pub fn new(vhdx_file: Vhdx) -> std::io::Result { - Ok(VhdxSync { + pub fn new(vhdx_file: Arc>) -> Self { + VhdxSync { vhdx_file, - eventfd: EventFd::new(libc::EFD_NONBLOCK)?, + eventfd: EventFd::new(libc::EFD_NONBLOCK) + .expect("Failed creating EventFd for VhdxSync"), completion_list: VecDeque::new(), - }) + } } } @@ -82,7 +89,7 @@ impl AsyncIo for VhdxSync { iovecs: &[libc::iovec], user_data: u64, ) -> AsyncIoResult<()> { - self.vhdx_file.read_vectored_sync( + self.vhdx_file.lock().unwrap().read_vectored_sync( offset, iovecs, user_data, @@ -97,7 +104,7 @@ impl AsyncIo for VhdxSync { iovecs: &[libc::iovec], user_data: u64, ) -> AsyncIoResult<()> { - self.vhdx_file.write_vectored_sync( + self.vhdx_file.lock().unwrap().write_vectored_sync( offset, iovecs, user_data, @@ -107,8 +114,11 @@ impl AsyncIo for 
VhdxSync { } fn fsync(&mut self, user_data: Option) -> AsyncIoResult<()> { - self.vhdx_file - .fsync_sync(user_data, &self.eventfd, &mut self.completion_list) + self.vhdx_file.lock().unwrap().fsync_sync( + user_data, + &self.eventfd, + &mut self.completion_list, + ) } fn next_completed_request(&mut self) -> Option<(u64, i32)> { From 8a09b3870c911682ff5b68bb647a52725a7a843a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 24 Feb 2026 00:49:51 +0000 Subject: [PATCH 006/742] build: Bump the non-rust-vmm group across 2 directories with 12 updates Bumps the non-rust-vmm group with 7 updates in the / directory: | Package | From | To | | --- | --- | --- | | [anyhow](https://github.com/dtolnay/anyhow) | `1.0.101` | `1.0.102` | | [clap](https://github.com/clap-rs/clap) | `4.5.59` | `4.5.60` | | [zbus](https://github.com/z-galaxy/zbus) | `5.13.2` | `5.14.0` | | [bumpalo](https://github.com/fitzgen/bumpalo) | `3.19.1` | `3.20.2` | | [jiff](https://github.com/BurntSushi/jiff) | `0.2.20` | `0.2.21` | | [rustix](https://github.com/bytecodealliance/rustix) | `1.1.3` | `1.1.4` | | [syn](https://github.com/dtolnay/syn) | `2.0.116` | `2.0.117` | Bumps the non-rust-vmm group with 4 updates in the /fuzz directory: [anyhow](https://github.com/dtolnay/anyhow), [clap](https://github.com/clap-rs/clap), [bumpalo](https://github.com/fitzgen/bumpalo) and [syn](https://github.com/dtolnay/syn). 
Updates `anyhow` from 1.0.101 to 1.0.102 - [Release notes](https://github.com/dtolnay/anyhow/releases) - [Commits](https://github.com/dtolnay/anyhow/compare/1.0.101...1.0.102) Updates `clap` from 4.5.59 to 4.5.60 - [Release notes](https://github.com/clap-rs/clap/releases) - [Changelog](https://github.com/clap-rs/clap/blob/master/CHANGELOG.md) - [Commits](https://github.com/clap-rs/clap/compare/clap_complete-v4.5.59...clap_complete-v4.5.60) Updates `zbus` from 5.13.2 to 5.14.0 - [Release notes](https://github.com/z-galaxy/zbus/releases) - [Changelog](https://github.com/z-galaxy/zbus/blob/main/release-plz.toml) - [Commits](https://github.com/z-galaxy/zbus/compare/zbus-5.13.2...zbus-5.14.0) Updates `bumpalo` from 3.19.1 to 3.20.2 - [Changelog](https://github.com/fitzgen/bumpalo/blob/main/CHANGELOG.md) - [Commits](https://github.com/fitzgen/bumpalo/compare/v3.19.1...v3.20.2) Updates `clap_builder` from 4.5.59 to 4.5.60 - [Release notes](https://github.com/clap-rs/clap/releases) - [Changelog](https://github.com/clap-rs/clap/blob/master/CHANGELOG.md) - [Commits](https://github.com/clap-rs/clap/compare/v4.5.59...v4.5.60) Updates `jiff` from 0.2.20 to 0.2.21 - [Release notes](https://github.com/BurntSushi/jiff/releases) - [Changelog](https://github.com/BurntSushi/jiff/blob/master/CHANGELOG.md) - [Commits](https://github.com/BurntSushi/jiff/compare/jiff-static-0.2.20...jiff-static-0.2.21) Updates `jiff-static` from 0.2.20 to 0.2.21 - [Release notes](https://github.com/BurntSushi/jiff/releases) - [Changelog](https://github.com/BurntSushi/jiff/blob/master/CHANGELOG.md) - [Commits](https://github.com/BurntSushi/jiff/compare/jiff-static-0.2.20...jiff-static-0.2.21) Updates `rustix` from 1.1.3 to 1.1.4 - [Release notes](https://github.com/bytecodealliance/rustix/releases) - [Changelog](https://github.com/bytecodealliance/rustix/blob/main/CHANGES.md) - [Commits](https://github.com/bytecodealliance/rustix/compare/v1.1.3...v1.1.4) Updates `syn` from 2.0.116 to 2.0.117 - [Release 
notes](https://github.com/dtolnay/syn/releases) - [Commits](https://github.com/dtolnay/syn/compare/2.0.116...2.0.117) Updates `zbus_macros` from 5.13.2 to 5.14.0 - [Release notes](https://github.com/z-galaxy/zbus/releases) - [Changelog](https://github.com/z-galaxy/zbus/blob/main/release-plz.toml) - [Commits](https://github.com/z-galaxy/zbus/compare/zbus_macros-5.13.2...zbus_macros-5.14.0) Updates `zvariant` from 5.9.2 to 5.10.0 - [Release notes](https://github.com/z-galaxy/zbus/releases) - [Changelog](https://github.com/z-galaxy/zbus/blob/main/release-plz.toml) - [Commits](https://github.com/z-galaxy/zbus/compare/zvariant-5.9.2...zvariant-5.10.0) Updates `zvariant_derive` from 5.9.2 to 5.10.0 - [Release notes](https://github.com/z-galaxy/zbus/releases) - [Changelog](https://github.com/z-galaxy/zbus/blob/main/release-plz.toml) - [Commits](https://github.com/z-galaxy/zbus/compare/zvariant_derive-5.9.2...zvariant_derive-5.10.0) Updates `anyhow` from 1.0.101 to 1.0.102 - [Release notes](https://github.com/dtolnay/anyhow/releases) - [Commits](https://github.com/dtolnay/anyhow/compare/1.0.101...1.0.102) Updates `clap` from 4.5.59 to 4.5.60 - [Release notes](https://github.com/clap-rs/clap/releases) - [Changelog](https://github.com/clap-rs/clap/blob/master/CHANGELOG.md) - [Commits](https://github.com/clap-rs/clap/compare/clap_complete-v4.5.59...clap_complete-v4.5.60) Updates `bumpalo` from 3.19.1 to 3.20.2 - [Changelog](https://github.com/fitzgen/bumpalo/blob/main/CHANGELOG.md) - [Commits](https://github.com/fitzgen/bumpalo/compare/v3.19.1...v3.20.2) Updates `clap_builder` from 4.5.59 to 4.5.60 - [Release notes](https://github.com/clap-rs/clap/releases) - [Changelog](https://github.com/clap-rs/clap/blob/master/CHANGELOG.md) - [Commits](https://github.com/clap-rs/clap/compare/v4.5.59...v4.5.60) Updates `syn` from 2.0.116 to 2.0.117 - [Release notes](https://github.com/dtolnay/syn/releases) - [Commits](https://github.com/dtolnay/syn/compare/2.0.116...2.0.117) --- 
updated-dependencies: - dependency-name: anyhow dependency-version: 1.0.102 dependency-type: direct:production update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: clap dependency-version: 4.5.60 dependency-type: direct:production update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: zbus dependency-version: 5.14.0 dependency-type: direct:production update-type: version-update:semver-minor dependency-group: non-rust-vmm - dependency-name: bumpalo dependency-version: 3.20.2 dependency-type: indirect update-type: version-update:semver-minor dependency-group: non-rust-vmm - dependency-name: clap_builder dependency-version: 4.5.60 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: jiff dependency-version: 0.2.21 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: jiff-static dependency-version: 0.2.21 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: rustix dependency-version: 1.1.4 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: syn dependency-version: 2.0.117 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: zbus_macros dependency-version: 5.14.0 dependency-type: indirect update-type: version-update:semver-minor dependency-group: non-rust-vmm - dependency-name: zvariant dependency-version: 5.10.0 dependency-type: indirect update-type: version-update:semver-minor dependency-group: non-rust-vmm - dependency-name: zvariant_derive dependency-version: 5.10.0 dependency-type: indirect update-type: version-update:semver-minor dependency-group: non-rust-vmm - dependency-name: anyhow dependency-version: 1.0.102 dependency-type: direct:production update-type: 
version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: clap dependency-version: 4.5.60 dependency-type: direct:production update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: bumpalo dependency-version: 3.20.2 dependency-type: indirect update-type: version-update:semver-minor dependency-group: non-rust-vmm - dependency-name: clap_builder dependency-version: 4.5.60 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: syn dependency-version: 2.0.117 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm ... Signed-off-by: dependabot[bot] --- Cargo.lock | 64 ++++++++++++++++++------------------- Cargo.toml | 4 +-- cloud-hypervisor/Cargo.toml | 2 +- fuzz/Cargo.lock | 20 ++++++------ vmm/Cargo.toml | 2 +- 5 files changed, 46 insertions(+), 46 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 12304950bd..ea880ebc97 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -71,7 +71,7 @@ version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" dependencies = [ - "windows-sys 0.61.0", + "windows-sys 0.60.2", ] [[package]] @@ -82,14 +82,14 @@ checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys 0.61.0", + "windows-sys 0.60.2", ] [[package]] name = "anyhow" -version = "1.0.101" +version = "1.0.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f0e0fee31ef5ed1ba1316088939cea399010ed7731dba877ed44aeb407a75ea" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" [[package]] name = "api_client" @@ -358,9 +358,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.19.1" +version = "3.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"5dd9dc738b7a8311c7ade152424974d8115f2cdad61e8dab8dac9f2362298510" +checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" [[package]] name = "byteorder" @@ -399,18 +399,18 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.59" +version = "4.5.60" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5caf74d17c3aec5495110c34cc3f78644bfa89af6c8993ed4de2790e49b6499" +checksum = "2797f34da339ce31042b27d23607e051786132987f595b02ba4f6a6dffb7030a" dependencies = [ "clap_builder", ] [[package]] name = "clap_builder" -version = "4.5.59" +version = "4.5.60" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "370daa45065b80218950227371916a1633217ae42b2715b2287b606dcd618e24" +checksum = "24a241312cea5059b13574bb9b3861cabf758b879c15190b37b6d6fd63ab6876" dependencies = [ "anstream", "anstyle", @@ -618,7 +618,7 @@ dependencies = [ "libc", "option-ext", "redox_users", - "windows-sys 0.61.0", + "windows-sys 0.60.2", ] [[package]] @@ -700,7 +700,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.61.0", + "windows-sys 0.60.2", ] [[package]] @@ -1136,9 +1136,9 @@ checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" [[package]] name = "jiff" -version = "0.2.20" +version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c867c356cc096b33f4981825ab281ecba3db0acefe60329f044c1789d94c6543" +checksum = "b3e3d65f018c6ae946ab16e80944b97096ed73c35b221d1c478a6c81d8f57940" dependencies = [ "jiff-static", "log", @@ -1149,9 +1149,9 @@ dependencies = [ [[package]] name = "jiff-static" -version = "0.2.20" +version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7946b4325269738f270bb55b3c19ab5c5040525f83fd625259422a9d25d9be5" +checksum = 
"a17c2b211d863c7fde02cbea8a3c1a439b98e109286554f2860bdded7ff83818" dependencies = [ "proc-macro2", "quote", @@ -1277,9 +1277,9 @@ dependencies = [ [[package]] name = "linux-raw-sys" -version = "0.11.0" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039" +checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" [[package]] name = "lock_api" @@ -1920,15 +1920,15 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" [[package]] name = "rustix" -version = "1.1.3" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "146c9e247ccc180c1f61615433868c99f3de3ae256a30a43b49f67c2d9171f34" +checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" dependencies = [ "bitflags 2.11.0", "errno", "libc", "linux-raw-sys", - "windows-sys 0.61.0", + "windows-sys 0.60.2", ] [[package]] @@ -2117,9 +2117,9 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] name = "syn" -version = "2.0.116" +version = "2.0.117" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3df424c70518695237746f84cede799c9c58fcb37450d7b23716568cc8bc69cb" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" dependencies = [ "proc-macro2", "quote", @@ -2136,7 +2136,7 @@ dependencies = [ "getrandom 0.4.1", "once_cell", "rustix", - "windows-sys 0.61.0", + "windows-sys 0.60.2", ] [[package]] @@ -2966,9 +2966,9 @@ dependencies = [ [[package]] name = "zbus" -version = "5.13.2" +version = "5.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bfeff997a0aaa3eb20c4652baf788d2dfa6d2839a0ead0b3ff69ce2f9c4bdd1" +checksum = "ca82f95dbd3943a40a53cfded6c2d0a2ca26192011846a1810c4256ef92c60bc" dependencies = [ "async-broadcast", "async-executor", @@ -3001,9 +3001,9 @@ dependencies = [ 
[[package]] name = "zbus_macros" -version = "5.13.2" +version = "5.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0bbd5a90dbe8feee5b13def448427ae314ccd26a49cac47905cafefb9ff846f1" +checksum = "897e79616e84aac4b2c46e9132a4f63b93105d54fe8c0e8f6bffc21fa8d49222" dependencies = [ "proc-macro-crate", "proc-macro2", @@ -3081,9 +3081,9 @@ dependencies = [ [[package]] name = "zvariant" -version = "5.9.2" +version = "5.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68b64ef4f40c7951337ddc7023dd03528a57a3ce3408ee9da5e948bd29b232c4" +checksum = "5708299b21903bbe348e94729f22c49c55d04720a004aa350f1f9c122fd2540b" dependencies = [ "endi", "enumflags2", @@ -3095,9 +3095,9 @@ dependencies = [ [[package]] name = "zvariant_derive" -version = "5.9.2" +version = "5.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "484d5d975eb7afb52cc6b929c13d3719a20ad650fea4120e6310de3fc55e415c" +checksum = "5b59b012ebe9c46656f9cc08d8da8b4c726510aef12559da3e5f1bf72780752c" dependencies = [ "proc-macro-crate", "proc-macro2", diff --git a/Cargo.toml b/Cargo.toml index 53537895d7..e34941f358 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -73,11 +73,11 @@ serde_json = "1.0.149" serde_with = { version = "3.16.1", default-features = false } # other crates -anyhow = "1.0.101" +anyhow = "1.0.102" bitflags = "2.11.0" byteorder = "1.5.0" cfg-if = "1.0.4" -clap = "4.5.59" +clap = "4.5.60" dhat = "0.3.3" dirs = "6.0.0" env_logger = "0.11.8" diff --git a/cloud-hypervisor/Cargo.toml b/cloud-hypervisor/Cargo.toml index 542d1859d8..d69773e743 100644 --- a/cloud-hypervisor/Cargo.toml +++ b/cloud-hypervisor/Cargo.toml @@ -38,7 +38,7 @@ tracer = { path = "../tracer" } vm-memory = { workspace = true } vmm = { path = "../vmm" } vmm-sys-util = { workspace = true } -zbus = { version = "5.13.2", optional = true } +zbus = { version = "5.14.0", optional = true } [dev-dependencies] block = { path = "../block" } diff --git 
a/fuzz/Cargo.lock b/fuzz/Cargo.lock index 37e0f2aa2d..52cf0e2286 100644 --- a/fuzz/Cargo.lock +++ b/fuzz/Cargo.lock @@ -69,9 +69,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.101" +version = "1.0.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f0e0fee31ef5ed1ba1316088939cea399010ed7731dba877ed44aeb407a75ea" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" [[package]] name = "arbitrary" @@ -161,9 +161,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.19.1" +version = "3.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5dd9dc738b7a8311c7ade152424974d8115f2cdad61e8dab8dac9f2362298510" +checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" [[package]] name = "byteorder" @@ -191,18 +191,18 @@ checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" [[package]] name = "clap" -version = "4.5.59" +version = "4.5.60" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5caf74d17c3aec5495110c34cc3f78644bfa89af6c8993ed4de2790e49b6499" +checksum = "2797f34da339ce31042b27d23607e051786132987f595b02ba4f6a6dffb7030a" dependencies = [ "clap_builder", ] [[package]] name = "clap_builder" -version = "4.5.59" +version = "4.5.60" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "370daa45065b80218950227371916a1633217ae42b2715b2287b606dcd618e24" +checksum = "24a241312cea5059b13574bb9b3861cabf758b879c15190b37b6d6fd63ab6876" dependencies = [ "anstream", "anstyle", @@ -1170,9 +1170,9 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] name = "syn" -version = "2.0.116" +version = "2.0.117" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3df424c70518695237746f84cede799c9c58fcb37450d7b23716568cc8bc69cb" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" dependencies 
= [ "proc-macro2", "quote", diff --git a/vmm/Cargo.toml b/vmm/Cargo.toml index 4cfa4ed3a1..43b1de14b4 100644 --- a/vmm/Cargo.toml +++ b/vmm/Cargo.toml @@ -91,7 +91,7 @@ vm-memory = { workspace = true, features = [ vm-migration = { path = "../vm-migration" } vm-virtio = { path = "../vm-virtio" } vmm-sys-util = { workspace = true, features = ["with-serde"] } -zbus = { version = "5.13.2", optional = true } +zbus = { version = "5.14.0", optional = true } zerocopy = { workspace = true, features = ["alloc", "derive"] } [lints] From 8c618ff5e018e8c19b9c1cc6fb465b1e765feea8 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Wed, 4 Feb 2026 09:24:03 -0500 Subject: [PATCH 007/742] virtio-devices: generic-vhost-user: implement device This implements a generic vhost-user device. All information about this device must be provided to Cloud Hypervisor via the command-line or API. The main use-case is types of vhost-user devices Cloud Hypervisor doesn't know about, but it can also be used for types it does know about. The generic device delegates all configuration space handling to the backend. This means that the vhost-user backend must support configuration space access. It also means that the backend has control of configuration space. For instance, this means that setting the tag of a virtio-fs device on the virtiofsd command line works as expected. If the VM is snapshotted or migrated, the backend must write the configuration space to a separate save file or migration stream. Similarly, if the VM is restored or migrated, the backend must read the configuration space from a separate save file or migration stream. 
Signed-off-by: Demi Marie Obenour --- virtio-devices/src/seccomp_filters.rs | 16 + .../src/vhost_user/generic_vhost_user.rs | 437 ++++++++++++++++++ virtio-devices/src/vhost_user/mod.rs | 2 + 3 files changed, 455 insertions(+) create mode 100644 virtio-devices/src/vhost_user/generic_vhost_user.rs diff --git a/virtio-devices/src/seccomp_filters.rs b/virtio-devices/src/seccomp_filters.rs index 5afd056a6b..63d01a5d8d 100644 --- a/virtio-devices/src/seccomp_filters.rs +++ b/virtio-devices/src/seccomp_filters.rs @@ -24,6 +24,7 @@ pub enum Thread { VirtioRng, VirtioVhostBlock, VirtioVhostFs, + VirtioGenericVhostUser, VirtioVhostNet, VirtioVhostNetCtl, VirtioVsock, @@ -192,6 +193,20 @@ fn virtio_vhost_fs_thread_rules() -> Vec<(i64, Vec)> { ] } +fn virtio_generic_vhost_user_thread_rules() -> Vec<(i64, Vec)> { + vec![ + (libc::SYS_clock_nanosleep, vec![]), + (libc::SYS_connect, vec![]), + (libc::SYS_nanosleep, vec![]), + (libc::SYS_pread64, vec![]), + (libc::SYS_pwrite64, vec![]), + (libc::SYS_recvmsg, vec![]), + (libc::SYS_sendmsg, vec![]), + (libc::SYS_sendto, vec![]), + (libc::SYS_socket, vec![]), + ] +} + fn virtio_vhost_net_ctl_thread_rules() -> Vec<(i64, Vec)> { vec![] } @@ -271,6 +286,7 @@ fn get_seccomp_rules(thread_type: Thread) -> Vec<(i64, Vec)> { Thread::VirtioRng => virtio_rng_thread_rules(), Thread::VirtioVhostBlock => virtio_vhost_block_thread_rules(), Thread::VirtioVhostFs => virtio_vhost_fs_thread_rules(), + Thread::VirtioGenericVhostUser => virtio_generic_vhost_user_thread_rules(), Thread::VirtioVhostNet => virtio_vhost_net_thread_rules(), Thread::VirtioVhostNetCtl => virtio_vhost_net_ctl_thread_rules(), Thread::VirtioVsock => virtio_vsock_thread_rules(), diff --git a/virtio-devices/src/vhost_user/generic_vhost_user.rs b/virtio-devices/src/vhost_user/generic_vhost_user.rs new file mode 100644 index 0000000000..5774af928a --- /dev/null +++ b/virtio-devices/src/vhost_user/generic_vhost_user.rs @@ -0,0 +1,437 @@ +// Copyright 2019 Intel Corporation. 
All Rights Reserved. +// Copyright 2025 Demi Marie Obenour. +// SPDX-License-Identifier: Apache-2.0 + +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::{Arc, Barrier, Mutex}; +use std::{result, thread}; + +use event_monitor::event; +use log::{error, info, warn}; +use seccompiler::SeccompAction; +use serde::{Deserialize, Serialize}; +use vhost::vhost_user::message::{ + VhostUserConfigFlags, VhostUserProtocolFeatures, VhostUserVirtioFeatures, +}; +use vhost::vhost_user::{FrontendReqHandler, VhostUserFrontend, VhostUserFrontendReqHandler}; +use virtio_queue::Queue; +use vm_device::UserspaceMapping; +use vm_memory::GuestMemoryAtomic; +use vm_migration::protocol::MemoryRangeTable; +use vm_migration::{Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable}; +use vmm_sys_util::eventfd::EventFd; + +use super::vu_common_ctrl::VhostUserHandle; +use super::{Error, Result}; +use crate::seccomp_filters::Thread; +use crate::thread_helper::spawn_virtio_thread; +use crate::vhost_user::VhostUserCommon; +use crate::{ + ActivateResult, GuestMemoryMmap, GuestRegionMmap, MmapRegion, VIRTIO_F_IOMMU_PLATFORM, + VirtioCommon, VirtioDevice, VirtioInterrupt, VirtioSharedMemoryList, +}; + +#[derive(Serialize, Deserialize)] +pub struct State { + pub avail_features: u64, + pub acked_features: u64, + pub acked_protocol_features: u64, + pub vu_num_queues: usize, + pub backend_req_support: bool, +} + +struct BackendReqHandler {} +impl VhostUserFrontendReqHandler for BackendReqHandler {} +pub struct GenericVhostUser { + common: VirtioCommon, + vu_common: VhostUserCommon, + id: String, + // Hold ownership of the memory that is allocated for the device + // which will be automatically dropped when the device is dropped + cache: Option<(VirtioSharedMemoryList, MmapRegion)>, + seccomp_action: SeccompAction, + guest_memory: Option>, + epoll_thread: Option>, + exit_evt: EventFd, + iommu: bool, + cfg_warning: AtomicBool, +} + +impl GenericVhostUser { + /// Create a 
new generic vhost-user device. + #[allow(clippy::too_many_arguments)] + pub fn new( + id: String, + path: &str, + request_queue_sizes: Vec, + device_type: u32, + cache: Option<(VirtioSharedMemoryList, MmapRegion)>, + seccomp_action: SeccompAction, + exit_evt: EventFd, + iommu: bool, + state: Option, + ) -> Result { + // Calculate the actual number of queues needed. + let num_queues = request_queue_sizes.len(); + + // Connect to the vhost-user socket. + let mut vu = VhostUserHandle::connect_vhost_user(false, path, num_queues as u64, false)?; + + let (avail_features, acked_features, acked_protocol_features, vu_num_queues, paused) = + if let Some(state) = state { + info!("Restoring generic vhost-user {id}"); + vu.set_protocol_features_vhost_user( + state.acked_features, + state.acked_protocol_features, + )?; + + ( + state.avail_features, + state.acked_features, + state.acked_protocol_features, + state.vu_num_queues, + true, + ) + } else { + let avail_protocol_features = VhostUserProtocolFeatures::CONFIG + | VhostUserProtocolFeatures::MQ + | VhostUserProtocolFeatures::CONFIGURE_MEM_SLOTS + | VhostUserProtocolFeatures::REPLY_ACK + | VhostUserProtocolFeatures::INFLIGHT_SHMFD + | VhostUserProtocolFeatures::LOG_SHMFD; + + let avail_features = super::DEFAULT_VIRTIO_FEATURES; + + let (acked_features, acked_protocol_features) = + vu.negotiate_features_vhost_user(avail_features, avail_protocol_features)?; + + let backend_num_queues = + if acked_protocol_features & VhostUserProtocolFeatures::MQ.bits() != 0 { + vu.socket_handle() + .get_queue_num() + .map_err(Error::VhostUserGetQueueMaxNum)? + as usize + } else { + num_queues + }; + + if num_queues > backend_num_queues { + error!( + "generic vhost-user requested too many queues ({num_queues}) \ +since the backend only supports {backend_num_queues}\n", + ); + return Err(Error::BadQueueNum); + } + // Create virtio-vhost-user device configuration. 
+ ( + acked_features, + // If part of the available features that have been acked, the + // PROTOCOL_FEATURES bit must be already set through the VIRTIO + // acked features as we know the guest would never ack it, thus + // the feature would be lost. + acked_features & VhostUserVirtioFeatures::PROTOCOL_FEATURES.bits(), + acked_protocol_features, + num_queues, + false, + ) + }; + + Ok(GenericVhostUser { + common: VirtioCommon { + device_type, + avail_features, + acked_features, + queue_sizes: request_queue_sizes, + paused_sync: Some(Arc::new(Barrier::new(2))), + min_queues: 1, + paused: Arc::new(AtomicBool::new(paused)), + ..Default::default() + }, + vu_common: VhostUserCommon { + vu: Some(Arc::new(Mutex::new(vu))), + acked_protocol_features, + socket_path: path.to_string(), + vu_num_queues, + ..Default::default() + }, + id, + cache, + seccomp_action, + guest_memory: None, + epoll_thread: None, + exit_evt, + iommu, + cfg_warning: AtomicBool::new(false), + }) + } + + fn state(&self) -> State { + State { + avail_features: self.common.avail_features, + acked_features: self.common.acked_features, + acked_protocol_features: self.vu_common.acked_protocol_features, + vu_num_queues: self.vu_common.vu_num_queues, + backend_req_support: false, + } + } + + #[cold] + #[inline(never)] + fn warn_no_config_access(&self) { + if self + .cfg_warning + .compare_exchange(false, true, Ordering::SeqCst, Ordering::SeqCst) + .is_ok() + { + warn!( + "Attempt to read config space, but backend does not support config \ +space access. Reads will return 0xFF and writes will be ignored." + ); + } + } +} + +impl Drop for GenericVhostUser { + fn drop(&mut self) { + if let Some(kill_evt) = self.common.kill_evt.take() { + // Ignore the result because there is nothing we can do about it. 
+ let _ = kill_evt.write(1); + } + self.common.wait_for_epoll_threads(); + if let Some(thread) = self.epoll_thread.take() + && let Err(e) = thread.join() + { + error!("Error joining thread: {e:?}"); + } + } +} + +impl VirtioDevice for GenericVhostUser { + fn device_type(&self) -> u32 { + self.common.device_type + } + + fn queue_max_sizes(&self) -> &[u16] { + &self.common.queue_sizes + } + + fn features(&self) -> u64 { + let mut features = self.common.avail_features; + if self.iommu { + features |= 1u64 << VIRTIO_F_IOMMU_PLATFORM; + } + features + } + + fn ack_features(&mut self, value: u64) { + self.common.ack_features(value); + } + + fn read_config(&self, offset: u64, data: &mut [u8]) { + if (VhostUserProtocolFeatures::CONFIG.bits() & self.state().acked_protocol_features) == 0 { + self.warn_no_config_access(); + + data.fill(0xFF); + return; + } + if let Err(e) = self + .vu_common + .vu + .as_ref() + .unwrap() + .lock() + .unwrap() + .socket_handle() + .get_config( + offset.try_into().unwrap(), + data.len().try_into().unwrap(), + VhostUserConfigFlags::empty(), + data, + ) + .map(|(_, config)| data.copy_from_slice(&config)) + { + panic!("Failed getting generic vhost-user configuration: {e}"); + } + } + + fn write_config(&mut self, offset: u64, data: &[u8]) { + if (VhostUserProtocolFeatures::CONFIG.bits() & self.state().acked_protocol_features) == 0 { + self.warn_no_config_access(); + return; + } + if let Err(e) = self + .vu_common + .vu + .as_ref() + .unwrap() + .lock() + .unwrap() + .socket_handle() + .set_config( + offset.try_into().unwrap(), + VhostUserConfigFlags::WRITABLE, + data, + ) + { + panic!("Failed setting generic vhost-user configuration: {e}"); + } + } + + fn activate( + &mut self, + mem: GuestMemoryAtomic, + interrupt_cb: Arc, + queues: Vec<(usize, Queue, EventFd)>, + ) -> ActivateResult { + self.common.activate(&queues, interrupt_cb.clone())?; + self.guest_memory = Some(mem.clone()); + + let backend_req_handler: Option> = None; + // Run a dedicated 
thread for handling potential reconnections with + // the backend. + let (kill_evt, pause_evt) = self.common.dup_eventfds(); + + let mut handler = self.vu_common.activate( + mem, + &queues, + interrupt_cb, + self.common.acked_features, + backend_req_handler, + kill_evt, + pause_evt, + )?; + + let paused = self.common.paused.clone(); + let paused_sync = self.common.paused_sync.clone(); + + let mut epoll_threads = Vec::new(); + spawn_virtio_thread( + &self.id, + &self.seccomp_action, + Thread::VirtioGenericVhostUser, + &mut epoll_threads, + &self.exit_evt, + move || handler.run(&paused, paused_sync.as_ref().unwrap()), + )?; + self.epoll_thread = Some(epoll_threads.remove(0)); + + event!("virtio-device", "activated", "id", &self.id); + Ok(()) + } + + fn reset(&mut self) -> Option> { + // We first must resume the virtio thread if it was paused. + if self.common.pause_evt.take().is_some() { + self.common.resume().ok()?; + } + + if let Some(vu) = &self.vu_common.vu + && let Err(e) = vu.lock().unwrap().reset_vhost_user() + { + error!("Failed to reset vhost-user daemon: {e:?}"); + return None; + } + + if let Some(kill_evt) = self.common.kill_evt.take() { + // Ignore the result because there is nothing we can do about it. 
+ let _ = kill_evt.write(1); + } + + event!("virtio-device", "reset", "id", &self.id); + + // Return the interrupt + Some(self.common.interrupt_cb.take().unwrap()) + } + + fn shutdown(&mut self) { + self.vu_common.shutdown(); + } + + fn get_shm_regions(&self) -> Option { + self.cache.as_ref().map(|cache| cache.0.clone()) + } + + fn set_shm_regions( + &mut self, + shm_regions: VirtioSharedMemoryList, + ) -> std::result::Result<(), crate::Error> { + if let Some(cache) = self.cache.as_mut() { + cache.0 = shm_regions; + Ok(()) + } else { + Err(crate::Error::SetShmRegionsNotSupported) + } + } + + fn add_memory_region( + &mut self, + region: &Arc, + ) -> std::result::Result<(), crate::Error> { + self.vu_common.add_memory_region(&self.guest_memory, region) + } + + fn userspace_mappings(&self) -> Vec { + let mut mappings = Vec::new(); + if let Some(cache) = self.cache.as_ref() { + mappings.push(UserspaceMapping { + mem_slot: cache.0.mem_slot, + addr: cache.0.addr, + mapping: cache.0.mapping.clone(), + mergeable: false, + }); + } + + mappings + } +} + +impl Pausable for GenericVhostUser { + fn pause(&mut self) -> result::Result<(), MigratableError> { + self.vu_common.pause()?; + self.common.pause() + } + + fn resume(&mut self) -> result::Result<(), MigratableError> { + self.common.resume()?; + + if let Some(epoll_thread) = &self.epoll_thread { + epoll_thread.thread().unpark(); + } + + self.vu_common.resume() + } +} + +impl Snapshottable for GenericVhostUser { + fn id(&self) -> String { + self.id.clone() + } + + fn snapshot(&mut self) -> std::result::Result { + self.vu_common.snapshot(&self.state()) + } +} +impl Transportable for GenericVhostUser {} + +impl Migratable for GenericVhostUser { + fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> { + self.vu_common.start_dirty_log(&self.guest_memory) + } + + fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> { + self.vu_common.stop_dirty_log() + } + + fn dirty_log(&mut self) -> 
std::result::Result { + self.vu_common.dirty_log(&self.guest_memory) + } + + fn start_migration(&mut self) -> std::result::Result<(), MigratableError> { + self.vu_common.start_migration() + } + + fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> { + self.vu_common + .complete_migration(self.common.kill_evt.take()) + } +} diff --git a/virtio-devices/src/vhost_user/mod.rs b/virtio-devices/src/vhost_user/mod.rs index 7e2c162cb9..158da3d800 100644 --- a/virtio-devices/src/vhost_user/mod.rs +++ b/virtio-devices/src/vhost_user/mod.rs @@ -33,11 +33,13 @@ use crate::{ pub mod blk; pub mod fs; +pub mod generic_vhost_user; pub mod net; pub mod vu_common_ctrl; pub use self::blk::Blk; pub use self::fs::*; +pub use self::generic_vhost_user::GenericVhostUser; pub use self::net::Net; pub use self::vu_common_ctrl::VhostUserConfig; From 085a7a49fab7783ee760a746830a506409a28919 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Wed, 4 Feb 2026 09:52:42 -0500 Subject: [PATCH 008/742] vmm: generic vhost-user: add support Add VMM support for generic vhost-user devices. 
Signed-off-by: Demi Marie Obenour --- cloud-hypervisor/src/main.rs | 1 + option_parser/src/lib.rs | 16 +++ vmm/src/api/mod.rs | 50 +++++++- vmm/src/config.rs | 217 +++++++++++++++++++++++++++++++++++ vmm/src/device_manager.rs | 92 ++++++++++++++- vmm/src/lib.rs | 91 ++++++++++++++- vmm/src/vm.rs | 31 ++++- vmm/src/vm_config.rs | 25 ++++ 8 files changed, 515 insertions(+), 8 deletions(-) diff --git a/cloud-hypervisor/src/main.rs b/cloud-hypervisor/src/main.rs index 1d78b400bc..351dd73b78 100644 --- a/cloud-hypervisor/src/main.rs +++ b/cloud-hypervisor/src/main.rs @@ -998,6 +998,7 @@ mod unit_tests { }, balloon: None, fs: None, + generic_vhost_user: None, pmem: None, serial: ConsoleConfig { file: None, diff --git a/option_parser/src/lib.rs b/option_parser/src/lib.rs index 1722da39f2..699a26252c 100644 --- a/option_parser/src/lib.rs +++ b/option_parser/src/lib.rs @@ -4,6 +4,7 @@ // use std::collections::HashMap; +use std::fmt::{Display, Write}; use std::num::ParseIntError; use std::str::FromStr; @@ -240,6 +241,21 @@ impl FromStr for ByteSized { pub struct IntegerList(pub Vec); +impl Display for IntegerList { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_char('[')?; + let mut iter = self.0.iter(); + if let Some(first) = iter.next() { + first.fmt(f)?; + for i in iter { + f.write_char(',')?; + i.fmt(f)?; + } + } + f.write_char(']') + } +} + #[derive(Error, Debug)] pub enum IntegerListParseError { #[error("invalid value: {0}")] diff --git a/vmm/src/api/mod.rs b/vmm/src/api/mod.rs index 12ca6b9877..17ba0011b0 100644 --- a/vmm/src/api/mod.rs +++ b/vmm/src/api/mod.rs @@ -51,8 +51,8 @@ use crate::config::RestoreConfig; use crate::device_tree::DeviceTree; use crate::vm::{Error as VmError, VmState}; use crate::vm_config::{ - DeviceConfig, DiskConfig, FsConfig, NetConfig, PmemConfig, UserDeviceConfig, VdpaConfig, - VmConfig, VsockConfig, + DeviceConfig, DiskConfig, FsConfig, GenericVhostUserConfig, NetConfig, PmemConfig, + UserDeviceConfig, 
VdpaConfig, VmConfig, VsockConfig, }; /// API errors are sent back from the VMM API server through the ApiResponse. @@ -170,6 +170,10 @@ pub enum ApiError { #[error("The fs could not be added to the VM")] VmAddFs(#[source] VmError), + /// The generic vhost-user device could not be added to the VM. + #[error("The generic vhost-user device could not be added to the VM")] + VmAddGenericVhostUser(#[source] VmError), + /// The pmem device could not be added to the VM. #[error("The pmem device could not be added to the VM")] VmAddPmem(#[source] VmError), @@ -340,6 +344,11 @@ pub trait RequestHandler { fn vm_add_fs(&mut self, fs_cfg: FsConfig) -> Result>, VmError>; + fn vm_add_generic_vhost_user( + &mut self, + fs_cfg: GenericVhostUserConfig, + ) -> Result>, VmError>; + fn vm_add_pmem(&mut self, pmem_cfg: PmemConfig) -> Result>, VmError>; fn vm_add_net(&mut self, net_cfg: NetConfig) -> Result>, VmError>; @@ -539,6 +548,43 @@ impl ApiAction for VmAddFs { } } +pub struct VmAddGenericVhostUser; + +impl ApiAction for VmAddGenericVhostUser { + type RequestBody = GenericVhostUserConfig; + type ResponseBody = Option; + + fn request( + &self, + config: Self::RequestBody, + response_sender: Sender, + ) -> ApiRequest { + Box::new(move |vmm| { + info!("API request event: VmAddGenericVhostUser {config:?}"); + + let response = vmm + .vm_add_generic_vhost_user(config) + .map_err(ApiError::VmAddGenericVhostUser) + .map(ApiResponsePayload::VmAction); + + response_sender + .send(response) + .map_err(VmmError::ApiResponseSend)?; + + Ok(false) + }) + } + + fn send( + &self, + api_evt: EventFd, + api_sender: Sender, + data: Self::RequestBody, + ) -> ApiResult { + get_response_body(self, api_evt, api_sender, data) + } +} + pub struct VmAddPmem; impl ApiAction for VmAddPmem { diff --git a/vmm/src/config.rs b/vmm/src/config.rs index a4339c27b8..25c0d96ae6 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -46,6 +46,24 @@ pub enum Error { /// Filesystem socket is missing #[error("Error 
parsing --fs: socket missing")] ParseFsSockMissing, + /// Generic vhost-user socket is missing + #[error("Error parsing --generic-vhost-user: socket missing")] + ParseGenericVhostUserSockMissing, + /// Generic vhost-user number of queues is missing + #[error("Error parsing --generic-vhost-user: number of queues missing")] + ParseGenericVhostUserNumResponseQueuesMissing, + /// Generic vhost-user virtio ID is missing + #[error("Error parsing --generic-vhost-user: virtio ID missing")] + ParseGenericVhostUserVirtioIdMissing, + /// Generic vhost-user available features is missing + #[error("Error parsing --generic-vhost-user: available features missing")] + ParseGenericVhostUserAvailFeaturesMissing, + /// Generic vhost-user queue size is too large + #[error("Error parsing --generic-vhost-user: queue size {0} is {1}, but limit is 65535")] + ParseGenericVhostUserQueueSizeTooLarge(usize, u64), + /// Generic vhost-user queue size missing + #[error("Error parsing --generic-vhost-user: queue size missing")] + ParseGenericVhostUserQueueSizeMissing, /// Missing persistent memory file parameter. 
#[error("Error parsing --pmem: file missing")] ParsePmemFileMissing, @@ -94,6 +112,9 @@ pub enum Error { /// Error parsing persistent memory parameters #[error("Error parsing --pmem")] ParsePersistentMemory(#[source] OptionParserError), + /// Error parsing generic vhost-user parameters + #[error("Error parsing --generic-vhost-user")] + ParseGenericVhostUser(#[source] OptionParserError), /// Failed parsing console #[error("Error parsing --console")] ParseConsole(#[source] OptionParserError), @@ -394,6 +415,7 @@ pub struct VmParams<'a> { pub rng: &'a str, pub balloon: Option<&'a str>, pub fs: Option>, + pub generic_vhost_user: Option>, pub pmem: Option>, pub serial: &'a str, pub console: &'a str, @@ -455,6 +477,9 @@ impl<'a> VmParams<'a> { let fs: Option> = args .get_many::("fs") .map(|x| x.map(|y| y as &str).collect()); + let generic_vhost_user: Option> = args + .get_many::("generic-vhost-user") + .map(|x| x.map(|y| y as &str).collect()); let pmem: Option> = args .get_many::("pmem") .map(|x| x.map(|y| y as &str).collect()); @@ -509,6 +534,7 @@ impl<'a> VmParams<'a> { rng, balloon, fs, + generic_vhost_user, pmem, serial, console, @@ -1642,6 +1668,82 @@ impl BalloonConfig { } } +impl GenericVhostUserConfig { + pub const SYNTAX: &'static str = "generic vhost-user parameters \ + \"virtio_id=,\ + socket=,\ + queue_sizes=,\ + id=,pci_segment=\""; + + pub fn parse(vhost_user: &str) -> Result { + let mut parser = OptionParser::new(); + parser + .add("virtio_id") + .add("queue_sizes") + .add("socket") + .add("id") + .add("pci_segment"); + parser + .parse(vhost_user) + .map_err(Error::ParseGenericVhostUser)?; + + let socket = parser + .get("socket") + .ok_or(Error::ParseGenericVhostUserSockMissing)?; + + let IntegerList(queue_sizes) = parser + .convert("queue_sizes") + .map_err(Error::ParseGenericVhostUser)? + .ok_or(Error::ParseGenericVhostUserQueueSizeMissing)?; + let device_type = parser + .convert("virtio_id") + .map_err(Error::ParseGenericVhostUser)? 
+ .ok_or(Error::ParseGenericVhostUserVirtioIdMissing)?; + let id = parser.get("id"); + let pci_segment = parser + .convert("pci_segment") + .map_err(Error::ParseGenericVhostUser)? + .unwrap_or_default(); + let mut converted_queue_sizes: Vec = Vec::new(); + for (offset, &queue_size) in queue_sizes.iter().enumerate() { + match queue_size.try_into() { + Err(_) => { + return Err(Error::ParseGenericVhostUserQueueSizeTooLarge( + offset, queue_size, + )); + } + Ok(queue_size) => converted_queue_sizes.push(queue_size), + } + } + + Ok(GenericVhostUserConfig { + socket: socket.into(), + device_type, + id, + pci_segment, + queue_sizes: converted_queue_sizes, + }) + } + + pub fn validate(&self, vm_config: &VmConfig) -> ValidationResult<()> { + if let Some(platform_config) = vm_config.platform.as_ref() { + if self.pci_segment >= platform_config.num_pci_segments { + return Err(ValidationError::InvalidPciSegment(self.pci_segment)); + } + + if let Some(iommu_segments) = platform_config.iommu_segments.as_ref() + && iommu_segments.contains(&self.pci_segment) + { + return Err(ValidationError::IommuNotSupportedOnSegment( + self.pci_segment, + )); + } + } + + Ok(()) + } +} + impl FsConfig { pub const SYNTAX: &'static str = "virtio-fs parameters \ \"tag=,socket=,num_queues=,\ @@ -2738,6 +2840,17 @@ impl VmConfig { } } + if let Some(generic_vhost_user_devices) = &self.generic_vhost_user { + if !generic_vhost_user_devices.is_empty() && !self.backed_by_shared_memory() { + return Err(ValidationError::VhostUserRequiresSharedMemory); + } + for generic_vhost_user_device in generic_vhost_user_devices { + generic_vhost_user_device.validate(self)?; + + Self::validate_identifier(&mut id_list, &generic_vhost_user_device.id)?; + } + } + if let Some(pmems) = &self.pmem { for pmem in pmems { pmem.validate(self)?; @@ -2991,6 +3104,15 @@ impl VmConfig { fs = Some(fs_config_list); } + let mut generic_vhost_user: Option> = None; + if let Some(generic_vhost_user_list) = &vm_params.generic_vhost_user { + 
let mut generic_vhost_user_config_list = Vec::new(); + for item in generic_vhost_user_list.iter() { + generic_vhost_user_config_list.push(GenericVhostUserConfig::parse(item)?); + } + generic_vhost_user = Some(generic_vhost_user_config_list); + } + let mut pmem: Option> = None; if let Some(pmem_list) = &vm_params.pmem { let mut pmem_config_list = Vec::new(); @@ -3126,6 +3248,7 @@ impl VmConfig { net, rng, balloon, + generic_vhost_user, fs, pmem, serial, @@ -3188,6 +3311,13 @@ impl VmConfig { removed |= fs.len() != len; } + // Remove if generic vhost-user device + if let Some(generic_vhost_user) = self.generic_vhost_user.as_mut() { + let len = generic_vhost_user.len(); + generic_vhost_user.retain(|dev| dev.id.as_ref().map(|id| id.as_ref()) != Some(id)); + removed |= generic_vhost_user.len() != len; + } + // Remove if net device if let Some(net) = self.net.as_mut() { let len = net.len(); @@ -3260,6 +3390,7 @@ impl Clone for VmConfig { #[cfg(feature = "pvmemcontrol")] pvmemcontrol: self.pvmemcontrol.clone(), fs: self.fs.clone(), + generic_vhost_user: self.generic_vhost_user.clone(), pmem: self.pmem.clone(), serial: self.serial.clone(), console: self.console.clone(), @@ -3784,6 +3915,90 @@ mod unit_tests { Ok(()) } + #[track_caller] + #[allow(clippy::too_many_arguments)] + fn make_vhost_user_config( + socket: &str, + virtio_id: u64, + id: &str, + pci_segment: u64, + queue_sizes: &IntegerList, + ) { + assert!(!socket.contains(",[]\n\r\0\"")); + assert!(!id.contains(",[]\n\r\0\"")); + let config = GenericVhostUserConfig::parse(&format!( + "virtio_id={virtio_id},socket=\"{socket}\",\ +id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" + )); + if pci_segment <= u16::MAX.into() + && virtio_id <= u32::MAX.into() + && queue_sizes.0.iter().all(|&f| f <= u16::MAX.into()) + { + assert_eq!( + config.unwrap(), + GenericVhostUserConfig { + socket: socket.into(), + id: Some(id.to_owned()), + device_type: u32::try_from(virtio_id).unwrap(), + pci_segment: 
u16::try_from(pci_segment).unwrap(), + queue_sizes: queue_sizes + .0 + .iter() + .map(|&f| u16::try_from(f).unwrap()) + .collect(), + } + ); + } else { + config.unwrap_err(); + } + } + + #[test] + fn test_parse_vhost_user() -> Result<()> { + // all parameters must be supplied, except pci_segment + GenericVhostUserConfig::parse("").unwrap_err(); + GenericVhostUserConfig::parse("virtio_id=1").unwrap_err(); + GenericVhostUserConfig::parse("queue_size=1").unwrap_err(); + GenericVhostUserConfig::parse("socket=/tmp/sock").unwrap_err(); + GenericVhostUserConfig::parse("id=1").unwrap_err(); + make_vhost_user_config( + "/dev/null/doesnotexist", + 100, + "Something", + 10, + &IntegerList(vec![u16::MAX.into(), 20u16.into()]), + ); + make_vhost_user_config( + "/dev/null/doesnotexist", + 100, + "Something", + 10, + &IntegerList(vec![u16::MAX.into()]), + ); + make_vhost_user_config( + "/dev/null/doesnotexist", + u64::from(u32::MAX) + 1, + "Something", + 10, + &IntegerList(vec![20u64]), + ); + make_vhost_user_config( + "/dev/null/doesnotexist", + u64::from(u32::MAX) + 1, + "Something", + 10, + &IntegerList(vec![20u64]), + ); + make_vhost_user_config( + "/dev/null/doesnotexist", + u64::from(u32::MAX) + 1, + "Something", + 10, + &IntegerList(vec![20u64]), + ); + Ok(()) + } + fn pmem_fixture() -> PmemConfig { PmemConfig { file: PathBuf::from("/tmp/pmem"), @@ -4168,6 +4383,7 @@ mod unit_tests { rate_limit_groups: None, disks: None, rng: RngConfig::default(), + generic_vhost_user: None, balloon: None, fs: None, pmem: None, @@ -4373,6 +4589,7 @@ mod unit_tests { }, balloon: None, fs: None, + generic_vhost_user: None, pmem: None, serial: ConsoleConfig { file: None, diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index a7d3254c3f..f22696c7fe 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -127,8 +127,8 @@ use crate::serial_manager::{Error as SerialManagerError, SerialManager}; use crate::vm_config::IvshmemConfig; use crate::vm_config::{ 
ConsoleOutputMode, DEFAULT_IOMMU_ADDRESS_WIDTH_BITS, DEFAULT_PCI_SEGMENT_APERTURE_WEIGHT, - DeviceConfig, DiskConfig, FsConfig, NetConfig, PmemConfig, UserDeviceConfig, VdpaConfig, - VhostMode, VmConfig, VsockConfig, + DeviceConfig, DiskConfig, FsConfig, GenericVhostUserConfig, NetConfig, PmemConfig, + UserDeviceConfig, VdpaConfig, VhostMode, VmConfig, VsockConfig, }; use crate::{DEVICE_MANAGER_SNAPSHOT_ID, GuestRegionMmap, PciDeviceInfo, device_node}; @@ -158,6 +158,7 @@ const IVSHMEM_DEVICE_NAME: &str = "__ivshmem"; const DISK_DEVICE_NAME_PREFIX: &str = "_disk"; const FS_DEVICE_NAME_PREFIX: &str = "_fs"; const NET_DEVICE_NAME_PREFIX: &str = "_net"; +const GENERIC_VHOST_USER_DEVICE_NAME_PREFIX: &str = "_generic_vhost_user"; const PMEM_DEVICE_NAME_PREFIX: &str = "_pmem"; const VDPA_DEVICE_NAME_PREFIX: &str = "_vdpa"; const VSOCK_DEVICE_NAME_PREFIX: &str = "_vsock"; @@ -197,6 +198,10 @@ pub enum DeviceManagerError { #[error("Cannot create virtio-rng device")] CreateVirtioRng(#[source] io::Error), + /// Cannot create generic vhost-user device + #[error("Cannot create generic vhost-user device")] + CreateGenericVhostUser(#[source] virtio_devices::vhost_user::Error), + /// Cannot create virtio-fs device #[error("Cannot create virtio-fs device")] CreateVirtioFs(#[source] virtio_devices::vhost_user::Error), @@ -205,6 +210,10 @@ pub enum DeviceManagerError { #[error("Virtio-fs device was created without a socket")] NoVirtioFsSock, + /// Generic vhost-user device was created without a socket. 
+ #[error("Generic vhost-user device was created without a socket")] + NoGenericVhostUserSock, + /// Cannot create vhost-user-blk device #[error("Cannot create vhost-user-blk device")] CreateVhostUserBlk(#[source] virtio_devices::vhost_user::Error), @@ -2554,6 +2563,9 @@ impl DeviceManager { self.make_virtio_net_devices()?; self.make_virtio_rng_devices()?; + // Add generic vhost-user if required + self.make_generic_vhost_user_devices()?; + // Add virtio-fs if required self.make_virtio_fs_devices()?; @@ -3122,6 +3134,72 @@ impl DeviceManager { Ok(()) } + fn make_generic_vhost_user_device( + &mut self, + generic_vhost_user_cfg: &mut GenericVhostUserConfig, + ) -> DeviceManagerResult { + let id = if let Some(id) = &generic_vhost_user_cfg.id { + id.clone() + } else { + let id = self.next_device_name(GENERIC_VHOST_USER_DEVICE_NAME_PREFIX)?; + generic_vhost_user_cfg.id = Some(id.clone()); + id + }; + + info!("Creating generic vhost-user device: {generic_vhost_user_cfg:?}"); + + let mut node = device_node!(id); + + if let Some(generic_vhost_user_socket) = generic_vhost_user_cfg.socket.to_str() { + let generic_vhost_user_device = Arc::new(Mutex::new( + virtio_devices::vhost_user::GenericVhostUser::new( + id.clone(), + generic_vhost_user_socket, + generic_vhost_user_cfg.queue_sizes.clone(), + generic_vhost_user_cfg.device_type, + None, + self.seccomp_action.clone(), + self.exit_evt + .try_clone() + .map_err(DeviceManagerError::EventFd)?, + self.force_iommu, + state_from_id(self.snapshot.as_ref(), id.as_str()) + .map_err(DeviceManagerError::RestoreGetState)?, + ) + .map_err(DeviceManagerError::CreateGenericVhostUser)?, + )); + + // Update the device tree with the migratable device. 
+ node.migratable = + Some(Arc::clone(&generic_vhost_user_device) as Arc>); + self.device_tree.lock().unwrap().insert(id.clone(), node); + + Ok(MetaVirtioDevice { + virtio_device: Arc::clone(&generic_vhost_user_device) + as Arc>, + iommu: false, + id, + pci_segment: generic_vhost_user_cfg.pci_segment, + dma_handler: None, + }) + } else { + Err(DeviceManagerError::NoGenericVhostUserSock) + } + } + + fn make_generic_vhost_user_devices(&mut self) -> DeviceManagerResult<()> { + let mut generic_vhost_user_devices = self.config.lock().unwrap().generic_vhost_user.clone(); + if let Some(generic_vhost_user_list_cfg) = &mut generic_vhost_user_devices { + for generic_vhost_user_cfg in generic_vhost_user_list_cfg.iter_mut() { + let device = self.make_generic_vhost_user_device(generic_vhost_user_cfg)?; + self.virtio_devices.push(device); + } + } + self.config.lock().unwrap().generic_vhost_user = generic_vhost_user_devices; + + Ok(()) + } + fn make_virtio_fs_device( &mut self, fs_cfg: &mut FsConfig, @@ -4918,6 +4996,16 @@ impl DeviceManager { self.hotplug_virtio_pci_device(device) } + pub fn add_generic_vhost_user( + &mut self, + generic_vhost_user_cfg: &mut GenericVhostUserConfig, + ) -> DeviceManagerResult { + self.validate_identifier(&generic_vhost_user_cfg.id)?; + + let device = self.make_generic_vhost_user_device(generic_vhost_user_cfg)?; + self.hotplug_virtio_pci_device(device) + } + pub fn add_pmem(&mut self, pmem_cfg: &mut PmemConfig) -> DeviceManagerResult { self.validate_identifier(&pmem_cfg.id)?; diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index dcf6614b24..627b13d5d7 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -59,8 +59,8 @@ use crate::migration::{recv_vm_config, recv_vm_state}; use crate::seccomp_filters::{Thread, get_seccomp_filter}; use crate::vm::{Error as VmError, Vm, VmState}; use crate::vm_config::{ - DeviceConfig, DiskConfig, FsConfig, NetConfig, PmemConfig, UserDeviceConfig, VdpaConfig, - VmConfig, VsockConfig, + DeviceConfig, DiskConfig, FsConfig, 
GenericVhostUserConfig, NetConfig, PmemConfig, + UserDeviceConfig, VdpaConfig, VmConfig, VsockConfig, }; mod acpi; @@ -2125,6 +2125,39 @@ impl RequestHandler for Vmm { } } + fn vm_add_generic_vhost_user( + &mut self, + generic_vhost_user_cfg: GenericVhostUserConfig, + ) -> result::Result>, VmError> { + self.vm_config.as_ref().ok_or(VmError::VmNotCreated)?; + + { + // Validate the configuration change in a cloned configuration + let mut config = self.vm_config.as_ref().unwrap().lock().unwrap().clone(); + add_to_config( + &mut config.generic_vhost_user, + generic_vhost_user_cfg.clone(), + ); + config.validate().map_err(VmError::ConfigValidation)?; + } + + if let Some(ref mut vm) = self.vm { + let info = vm + .add_generic_vhost_user(generic_vhost_user_cfg) + .inspect_err(|e| { + error!("Error when adding new generic vhost-user device to the VM: {e:?}"); + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } else { + // Update VmConfig by adding the new device. 
+ let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + add_to_config(&mut config.generic_vhost_user, generic_vhost_user_cfg); + Ok(None) + } + } + fn vm_add_pmem(&mut self, pmem_cfg: PmemConfig) -> result::Result>, VmError> { self.vm_config.as_ref().ok_or(VmError::VmNotCreated)?; @@ -2443,6 +2476,7 @@ mod unit_tests { }, balloon: None, fs: None, + generic_vhost_user: None, pmem: None, serial: ConsoleConfig { file: None, @@ -2679,6 +2713,59 @@ mod unit_tests { ); } + #[test] + fn test_vmm_vm_cold_add_generic_vhost_user() { + let mut vmm = create_dummy_vmm(); + let generic_vhost_user_config = + GenericVhostUserConfig::parse("virtio_id=26,socket=/tmp/sock,queue_sizes=[1024]") + .unwrap(); + + assert!(matches!( + vmm.vm_add_generic_vhost_user(generic_vhost_user_config.clone()), + Err(VmError::VmNotCreated) + )); + + let _ = vmm.vm_create(create_dummy_vm_config()); + assert!( + vmm.vm_config + .as_ref() + .unwrap() + .lock() + .unwrap() + .generic_vhost_user + .is_none() + ); + + assert!( + vmm.vm_add_generic_vhost_user(generic_vhost_user_config.clone()) + .unwrap() + .is_none() + ); + assert_eq!( + vmm.vm_config + .as_ref() + .unwrap() + .lock() + .unwrap() + .generic_vhost_user + .clone() + .unwrap() + .len(), + 1 + ); + assert_eq!( + vmm.vm_config + .as_ref() + .unwrap() + .lock() + .unwrap() + .generic_vhost_user + .clone() + .unwrap()[0], + generic_vhost_user_config + ); + } + #[test] fn test_vmm_vm_cold_add_pmem() { let mut vmm = create_dummy_vmm(); diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index db29072bed..9c37457c19 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -100,8 +100,8 @@ use crate::migration::{SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE, url_to_path}; #[cfg(feature = "fw_cfg")] use crate::vm_config::FwCfgConfig; use crate::vm_config::{ - DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, NumaConfig, PayloadConfig, - PmemConfig, UserDeviceConfig, VdpaConfig, VmConfig, VsockConfig, + DeviceConfig, DiskConfig, FsConfig, 
GenericVhostUserConfig, HotplugMethod, NetConfig, + NumaConfig, PayloadConfig, PmemConfig, UserDeviceConfig, VdpaConfig, VmConfig, VsockConfig, }; use crate::{ CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, GuestMemoryMmap, @@ -2136,6 +2136,33 @@ impl Vm { Ok(pci_device_info) } + pub fn add_generic_vhost_user( + &mut self, + mut generic_vhost_user_cfg: GenericVhostUserConfig, + ) -> Result { + let pci_device_info = self + .device_manager + .lock() + .unwrap() + .add_generic_vhost_user(&mut generic_vhost_user_cfg) + .map_err(Error::DeviceManager)?; + + // Update VmConfig by adding the new device. This is important to + // ensure the device would be created in case of a reboot. + { + let mut config = self.config.lock().unwrap(); + add_to_config(&mut config.generic_vhost_user, generic_vhost_user_cfg); + } + + self.device_manager + .lock() + .unwrap() + .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) + .map_err(Error::DeviceManager)?; + + Ok(pci_device_info) + } + pub fn add_pmem(&mut self, mut pmem_cfg: PmemConfig) -> Result { let pci_device_info = self .device_manager diff --git a/vmm/src/vm_config.rs b/vmm/src/vm_config.rs index 5de8c31452..b9e67f7bba 100644 --- a/vmm/src/vm_config.rs +++ b/vmm/src/vm_config.rs @@ -472,6 +472,24 @@ impl ApplyLandlock for FsConfig { } } +#[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] +pub struct GenericVhostUserConfig { + pub socket: PathBuf, + pub queue_sizes: Vec, + #[serde(default)] + pub id: Option, + #[serde(default)] + pub pci_segment: u16, + pub device_type: u32, +} + +impl ApplyLandlock for GenericVhostUserConfig { + fn apply_landlock(&self, landlock: &mut Landlock) -> LandlockResult<()> { + landlock.add_rule_with_access(&self.socket, "rw")?; + Ok(()) + } +} + #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] pub struct PmemConfig { pub file: PathBuf, @@ -924,6 +942,7 @@ pub struct VmConfig { #[serde(default)] pub rng: RngConfig, pub balloon: Option, + pub generic_vhost_user: 
Option>, pub fs: Option>, pub pmem: Option>, #[serde(default = "default_serial")] @@ -1000,6 +1019,12 @@ impl VmConfig { } } + if let Some(generic_vhost_user_configs) = &self.generic_vhost_user { + for generic_vhost_user_config in generic_vhost_user_configs.iter() { + generic_vhost_user_config.apply_landlock(&mut landlock)?; + } + } + if let Some(pmem_configs) = &self.pmem { for pmem_config in pmem_configs.iter() { pmem_config.apply_landlock(&mut landlock)?; From d510f11a5000d86a956b457381c50ece9e4b32dd Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Wed, 4 Feb 2026 09:29:34 -0500 Subject: [PATCH 009/742] misc: generic-vhost-user: wire up to command line Support adding generic vhost-user devices via the Cloud Hypervisor command line. Signed-off-by: Demi Marie Obenour --- cloud-hypervisor/src/main.rs | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/cloud-hypervisor/src/main.rs b/cloud-hypervisor/src/main.rs index 351dd73b78..b2b184248b 100644 --- a/cloud-hypervisor/src/main.rs +++ b/cloud-hypervisor/src/main.rs @@ -32,9 +32,9 @@ use vmm::vm_config::FwCfgConfig; #[cfg(feature = "ivshmem")] use vmm::vm_config::IvshmemConfig; use vmm::vm_config::{ - BalloonConfig, DeviceConfig, DiskConfig, FsConfig, LandlockConfig, NetConfig, NumaConfig, - PciSegmentConfig, PmemConfig, RateLimiterGroupConfig, TpmConfig, UserDeviceConfig, VdpaConfig, - VmConfig, VsockConfig, + BalloonConfig, DeviceConfig, DiskConfig, FsConfig, GenericVhostUserConfig, LandlockConfig, + NetConfig, NumaConfig, PciSegmentConfig, PmemConfig, RateLimiterGroupConfig, TpmConfig, + UserDeviceConfig, VdpaConfig, VmConfig, VsockConfig, }; use vmm_sys_util::eventfd::EventFd; use vmm_sys_util::signal::block_signal; @@ -280,6 +280,11 @@ fn get_cli_options_sorted( .help("GDB socket (UNIX domain socket): path=") .num_args(1) .group("vmm-config"), + Arg::new("generic-vhost-user") + .long("generic-vhost-user") + .help(GenericVhostUserConfig::SYNTAX) + .num_args(1..) 
+ .group("vm-config"), #[cfg(feature = "igvm")] Arg::new("igvm") .long("igvm") From df86b2864bac621575f97ed547fbcda2f2761833 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Wed, 4 Feb 2026 09:34:50 -0500 Subject: [PATCH 010/742] vmm: add HTTP API endpoints for generic vhost-user This includes OpenAPI schemas. Signed-off-by: Demi Marie Obenour --- fuzz/fuzz_targets/http_api.rs | 8 ++++ vmm/src/api/http/http_endpoint.rs | 10 +++-- vmm/src/api/http/mod.rs | 12 ++++-- vmm/src/api/openapi/cloud-hypervisor.yaml | 46 +++++++++++++++++++++++ 4 files changed, 68 insertions(+), 8 deletions(-) diff --git a/fuzz/fuzz_targets/http_api.rs b/fuzz/fuzz_targets/http_api.rs index ee9dd62f18..b7f38994fb 100644 --- a/fuzz/fuzz_targets/http_api.rs +++ b/fuzz/fuzz_targets/http_api.rs @@ -168,6 +168,7 @@ impl RequestHandler for StubApiRequestHandler { }, balloon: None, fs: None, + generic_vhost_user: None, pmem: None, serial: ConsoleConfig { file: None, @@ -254,6 +255,13 @@ impl RequestHandler for StubApiRequestHandler { Ok(None) } + fn vm_add_generic_vhost_user( + &mut self, + _: GenericVhostUserConfig, + ) -> Result>, VmError> { + Ok(None) + } + fn vm_add_pmem(&mut self, _: PmemConfig) -> Result>, VmError> { Ok(None) } diff --git a/vmm/src/api/http/http_endpoint.rs b/vmm/src/api/http/http_endpoint.rs index e463a20819..15ebfd2f9e 100644 --- a/vmm/src/api/http/http_endpoint.rs +++ b/vmm/src/api/http/http_endpoint.rs @@ -45,10 +45,11 @@ use crate::api::VmCoredump; use crate::api::http::http_endpoint::fds_helper::{attach_fds_to_cfg, attach_fds_to_cfgs}; use crate::api::http::{EndpointHandler, HttpError, error_response}; use crate::api::{ - AddDisk, ApiAction, ApiError, ApiRequest, NetConfig, VmAddDevice, VmAddFs, VmAddNet, VmAddPmem, - VmAddUserDevice, VmAddVdpa, VmAddVsock, VmBoot, VmConfig, VmCounters, VmDelete, VmNmi, VmPause, - VmPowerButton, VmReboot, VmReceiveMigration, VmRemoveDevice, VmResize, VmResizeDisk, - VmResizeZone, VmRestore, VmResume, VmSendMigration, VmShutdown, 
VmSnapshot, + AddDisk, ApiAction, ApiError, ApiRequest, NetConfig, VmAddDevice, VmAddFs, + VmAddGenericVhostUser, VmAddNet, VmAddPmem, VmAddUserDevice, VmAddVdpa, VmAddVsock, VmBoot, + VmConfig, VmCounters, VmDelete, VmNmi, VmPause, VmPowerButton, VmReboot, VmReceiveMigration, + VmRemoveDevice, VmResize, VmResizeDisk, VmResizeZone, VmRestore, VmResume, VmSendMigration, + VmShutdown, VmSnapshot, }; use crate::config::RestoreConfig; use crate::cpu::Error as CpuError; @@ -419,6 +420,7 @@ vm_action_put_handler!(VmNmi); vm_action_put_handler_body!(VmAddDevice); vm_action_put_handler_body!(AddDisk); vm_action_put_handler_body!(VmAddFs); +vm_action_put_handler_body!(VmAddGenericVhostUser); vm_action_put_handler_body!(VmAddPmem); vm_action_put_handler_body!(VmAddVdpa); vm_action_put_handler_body!(VmAddVsock); diff --git a/vmm/src/api/http/mod.rs b/vmm/src/api/http/mod.rs index 2aa52e8e37..3461c08af9 100644 --- a/vmm/src/api/http/mod.rs +++ b/vmm/src/api/http/mod.rs @@ -28,10 +28,10 @@ use self::http_endpoint::{VmActionHandler, VmCreate, VmInfo, VmmPing, VmmShutdow #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] use crate::api::VmCoredump; use crate::api::{ - AddDisk, ApiError, ApiRequest, VmAddDevice, VmAddFs, VmAddNet, VmAddPmem, VmAddUserDevice, - VmAddVdpa, VmAddVsock, VmBoot, VmCounters, VmDelete, VmNmi, VmPause, VmPowerButton, VmReboot, - VmReceiveMigration, VmRemoveDevice, VmResize, VmResizeDisk, VmResizeZone, VmRestore, VmResume, - VmSendMigration, VmShutdown, VmSnapshot, + AddDisk, ApiError, ApiRequest, VmAddDevice, VmAddFs, VmAddGenericVhostUser, VmAddNet, + VmAddPmem, VmAddUserDevice, VmAddVdpa, VmAddVsock, VmBoot, VmCounters, VmDelete, VmNmi, + VmPause, VmPowerButton, VmReboot, VmReceiveMigration, VmRemoveDevice, VmResize, VmResizeDisk, + VmResizeZone, VmRestore, VmResume, VmSendMigration, VmShutdown, VmSnapshot, }; use crate::landlock::Landlock; use crate::seccomp_filters::{Thread, get_seccomp_filter}; @@ -196,6 +196,10 @@ pub static HTTP_ROUTES: 
LazyLock = LazyLock::new(|| { endpoint!("/vm.add-fs"), Box::new(VmActionHandler::new(&VmAddFs)), ); + r.routes.insert( + endpoint!("/vm.add-generic-vhost-user"), + Box::new(VmActionHandler::new(&VmAddGenericVhostUser)), + ); r.routes.insert( endpoint!("/vm.add-net"), Box::new(VmActionHandler::new(&VmAddNet)), diff --git a/vmm/src/api/openapi/cloud-hypervisor.yaml b/vmm/src/api/openapi/cloud-hypervisor.yaml index 629a6800d1..499218c7a5 100644 --- a/vmm/src/api/openapi/cloud-hypervisor.yaml +++ b/vmm/src/api/openapi/cloud-hypervisor.yaml @@ -277,6 +277,28 @@ paths: 500: description: The new device could not be added to the VM instance. + /vm.add-generic-vhost-user: + put: + summary: Add a new generic vhost-user device to the VM + requestBody: + description: The details of the new generic vhost-user device + content: + application/json: + schema: + $ref: "#/components/schemas/GenericVhostUserConfig" + required: true + responses: + 200: + description: The new device was successfully added to the VM instance. + content: + application/json: + schema: + $ref: "#/components/schemas/PciDeviceInfo" + 204: + description: The new device was successfully (cold) added to the VM instance. + 500: + description: The new device could not be added to the VM instance. 
+ /vm.add-pmem: put: summary: Add a new pmem device to the VM @@ -603,6 +625,10 @@ components: type: array items: $ref: "#/components/schemas/FsConfig" + generic_vhost_user: + type: array + items: + $ref: "#/components/schemas/GenericVhostUserConfig" pmem: type: array items: @@ -1057,6 +1083,27 @@ components: id: type: string + GenericVhostUserConfig: + required: + - queue_sizes + - socket + - virtio_id + type: object + properties: + socket: + type: string + queue_sizes: + type: array + items: + type: integer + format: int32 + pci_segment: + type: integer + format: int16 + virtio_id: + type: integer + format: int64 + PmemConfig: required: - file From cfb69c68d2b00ab781fffb1002c3198b28c49546 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Wed, 4 Feb 2026 09:45:36 -0500 Subject: [PATCH 011/742] virtio-devices: generic vhost-user: add D-Bus API Allow adding and removing generic vhost-user devices via D-Bus. Signed-off-by: Demi Marie Obenour --- vmm/src/api/dbus/mod.rs | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/vmm/src/api/dbus/mod.rs b/vmm/src/api/dbus/mod.rs index 6f75fb5cda..ae39feb7d7 100644 --- a/vmm/src/api/dbus/mod.rs +++ b/vmm/src/api/dbus/mod.rs @@ -22,10 +22,10 @@ use super::{ApiAction, ApiRequest}; #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] use crate::api::VmCoredump; use crate::api::{ - AddDisk, Body, VmAddDevice, VmAddFs, VmAddNet, VmAddPmem, VmAddUserDevice, VmAddVdpa, - VmAddVsock, VmBoot, VmCounters, VmCreate, VmDelete, VmInfo, VmPause, VmPowerButton, VmReboot, - VmReceiveMigration, VmRemoveDevice, VmResize, VmResizeZone, VmRestore, VmResume, - VmSendMigration, VmShutdown, VmSnapshot, VmmPing, VmmShutdown, + AddDisk, Body, VmAddDevice, VmAddFs, VmAddGenericVhostUser, VmAddNet, VmAddPmem, + VmAddUserDevice, VmAddVdpa, VmAddVsock, VmBoot, VmCounters, VmCreate, VmDelete, VmInfo, + VmPause, VmPowerButton, VmReboot, VmReceiveMigration, VmRemoveDevice, VmResize, VmResizeZone, + VmRestore, VmResume, VmSendMigration, 
VmShutdown, VmSnapshot, VmmPing, VmmShutdown, }; use crate::seccomp_filters::{Thread, get_seccomp_filter}; use crate::{Error as VmmError, NetConfig, Result as VmmResult, VmConfig}; @@ -144,6 +144,16 @@ impl DBusApi { self.vm_action(&VmAddFs, fs_config).await } + async fn vm_add_generic_vhost_user( + &self, + generic_vhost_user_config: String, + ) -> Result> { + let generic_vhost_user_config = + serde_json::from_str(&generic_vhost_user_config).map_err(api_error)?; + self.vm_action(&VmAddGenericVhostUser, generic_vhost_user_config) + .await + } + async fn vm_add_net(&self, net_config: String) -> Result> { let mut net_config: NetConfig = serde_json::from_str(&net_config).map_err(api_error)?; if net_config.fds.is_some() { From 36371283c672a22240a623cef78541d94ee81b51 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Wed, 4 Feb 2026 09:43:23 -0500 Subject: [PATCH 012/742] ch-remote: add generic vhost-user support Support adding generic vhost-user devices via the ch-remote CLI. Signed-off-by: Demi Marie Obenour --- cloud-hypervisor/src/bin/ch-remote.rs | 55 ++++++++++++++++++++++++++- 1 file changed, 53 insertions(+), 2 deletions(-) diff --git a/cloud-hypervisor/src/bin/ch-remote.rs b/cloud-hypervisor/src/bin/ch-remote.rs index fd48ffab1f..6939eaa385 100644 --- a/cloud-hypervisor/src/bin/ch-remote.rs +++ b/cloud-hypervisor/src/bin/ch-remote.rs @@ -22,8 +22,8 @@ use option_parser::{ByteSized, ByteSizedParseError}; use thiserror::Error; use vmm::config::RestoreConfig; use vmm::vm_config::{ - DeviceConfig, DiskConfig, FsConfig, NetConfig, PmemConfig, UserDeviceConfig, VdpaConfig, - VsockConfig, + DeviceConfig, DiskConfig, FsConfig, GenericVhostUserConfig, NetConfig, PmemConfig, + UserDeviceConfig, VdpaConfig, VsockConfig, }; #[cfg(feature = "dbus_api")] use zbus::{proxy, zvariant::Optional}; @@ -49,6 +49,8 @@ enum Error { AddDiskConfig(#[source] vmm::config::Error), #[error("Error parsing filesystem syntax")] AddFsConfig(#[source] vmm::config::Error), + #[error("Error 
parsing generic vhost-user syntax")] + AddGenericVhostUserConfig(#[source] vmm::config::Error), #[error("Error parsing persistent memory syntax")] AddPmemConfig(#[source] vmm::config::Error), #[error("Error parsing network syntax")] @@ -83,6 +85,10 @@ trait DBusApi1 { fn vm_add_device(&self, device_config: &str) -> zbus::Result>; fn vm_add_disk(&self, disk_config: &str) -> zbus::Result>; fn vm_add_fs(&self, fs_config: &str) -> zbus::Result>; + fn vm_add_generic_vhost_user( + &self, + generic_vhost_user_config: &str, + ) -> zbus::Result>; fn vm_add_net(&self, net_config: &str) -> zbus::Result>; fn vm_add_pmem(&self, pmem_config: &str) -> zbus::Result>; fn vm_add_user_device(&self, vm_add_user_device: &str) -> zbus::Result>; @@ -155,6 +161,10 @@ impl<'a> DBusApi1ProxyBlocking<'a> { self.print_response(self.vm_add_fs(fs_config)) } + fn api_vm_add_generic_vhost_user(&self, generic_vhost_user_config: &str) -> ApiResult { + self.print_response(self.vm_add_generic_vhost_user(generic_vhost_user_config)) + } + fn api_vm_add_net(&self, net_config: &str) -> ApiResult { self.print_response(self.vm_add_net(net_config)) } @@ -398,6 +408,22 @@ fn rest_api_do_command(matches: &ArgMatches, socket: &mut UnixStream) -> ApiResu simple_api_command(socket, "PUT", "add-fs", Some(&fs_config)) .map_err(Error::HttpApiClient) } + Some("add-generic-vhost-user") => { + let device_config = add_generic_vhost_user_config( + matches + .subcommand_matches("add-generic-vhost-user") + .unwrap() + .get_one::("generic_vhost_user_config") + .unwrap(), + )?; + simple_api_command( + socket, + "PUT", + "add-generic-vhost-user", + Some(&device_config), + ) + .map_err(Error::HttpApiClient) + } Some("add-pmem") => { let pmem_config = add_pmem_config( matches @@ -620,6 +646,16 @@ fn dbus_api_do_command(matches: &ArgMatches, proxy: &DBusApi1ProxyBlocking<'_>) )?; proxy.api_vm_add_fs(&fs_config) } + Some("add-generic-vhost-user") => { + let generic_vhost_user_config = add_generic_vhost_user_config( + matches + 
.subcommand_matches("add-generic-vhost-user") + .unwrap() + .get_one::("generic_vhost_user_config") + .unwrap(), + )?; + proxy.api_vm_add_generic_vhost_user(&generic_vhost_user_config) + } Some("add-pmem") => { let pmem_config = add_pmem_config( matches @@ -835,6 +871,14 @@ fn add_fs_config(config: &str) -> Result { Ok(fs_config) } +fn add_generic_vhost_user_config(config: &str) -> Result { + let generic_vhost_user_config = + GenericVhostUserConfig::parse(config).map_err(Error::AddGenericVhostUserConfig)?; + let generic_vhost_user_config = serde_json::to_string(&generic_vhost_user_config).unwrap(); + + Ok(generic_vhost_user_config) +} + fn add_pmem_config(config: &str) -> Result { let pmem_config = PmemConfig::parse(config).map_err(Error::AddPmemConfig)?; let pmem_config = serde_json::to_string(&pmem_config).unwrap(); @@ -981,6 +1025,13 @@ fn get_cli_commands_sorted() -> Box<[Command]> { .index(1) .help(vmm::vm_config::FsConfig::SYNTAX), ), + Command::new("add-generic-vhost-user") + .about("Add generic vhost-user device") + .arg( + Arg::new("generic_vhost_user_config") + .index(1) + .help(vmm::vm_config::GenericVhostUserConfig::SYNTAX), + ), Command::new("add-net") .about("Add network device") .arg(Arg::new("net_config").index(1).help(NetConfig::SYNTAX)), From d6b80d9845a9e94087d0a4d3283fa1284511fc88 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Wed, 4 Feb 2026 09:44:04 -0500 Subject: [PATCH 013/742] tests: generic vhost-user: add support Include integration tests for generic vhost-user devices. 
Signed-off-by: Demi Marie Obenour --- cloud-hypervisor/tests/integration.rs | 62 +++++++++++++++++++++++---- 1 file changed, 53 insertions(+), 9 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index e9d2e66c1b..221aadfbaf 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -470,6 +470,7 @@ fn prepare_virtiofsd(tmp_dir: &TempDir, shared_dir: &str) -> (std::process::Chil .args(["--shared-dir", shared_dir]) .args(["--socket-path", virtiofsd_socket_path.as_str()]) .args(["--cache", "never"]) + .args(["--tag", "myfs"]) .spawn() .unwrap(); @@ -1621,6 +1622,7 @@ fn test_boot_from_vhost_user_blk( fn _test_virtio_fs( prepare_daemon: &dyn Fn(&TempDir, &str) -> (std::process::Child, String), hotplug: bool, + use_generic_vhost_user: bool, pci_segment: Option, ) { #[cfg(target_arch = "aarch64")] @@ -1670,8 +1672,13 @@ fn _test_virtio_fs( } let fs_params = format!( - "id=myfs0,tag=myfs,socket={},num_queues=1,queue_size=1024{}", + "socket={},id=myfs0,{}{}", virtiofsd_socket_path, + if use_generic_vhost_user { + "queue_sizes=[1024,1024],virtio_id=26" + } else { + "tag=myfs,num_queues=1,queue_size=1024" + }, if let Some(pci_segment) = pci_segment { format!(",pci_segment={pci_segment}") } else { @@ -1680,10 +1687,22 @@ fn _test_virtio_fs( ); if !hotplug { - guest_command.args(["--fs", fs_params.as_str()]); + guest_command.args([ + if use_generic_vhost_user { + "--generic-vhost-user" + } else { + "--fs" + }, + fs_params.as_str(), + ]); } let mut child = guest_command.capture_output().spawn().unwrap(); + let add_arg = if use_generic_vhost_user { + "add-generic-vhost-user" + } else { + "add-fs" + }; let r = std::panic::catch_unwind(|| { guest.wait_vm_boot().unwrap(); @@ -1691,7 +1710,7 @@ fn _test_virtio_fs( if hotplug { // Add fs to the VM let (cmd_success, cmd_output) = - remote_command_w_output(&api_socket, "add-fs", Some(&fs_params)); + remote_command_w_output(&api_socket, add_arg, 
Some(&fs_params)); assert!(cmd_success); if let Some(pci_segment) = pci_segment { @@ -1764,8 +1783,13 @@ fn _test_virtio_fs( let r = std::panic::catch_unwind(|| { thread::sleep(std::time::Duration::new(10, 0)); let fs_params = format!( - "id=myfs0,tag=myfs,socket={},num_queues=1,queue_size=1024{}", + "id=myfs0,socket={},{}{}", virtiofsd_socket_path, + if use_generic_vhost_user { + "queue_sizes=[1024,1024],virtio_id=26" + } else { + "tag=myfs,num_queues=1,queue_size=1024" + }, if let Some(pci_segment) = pci_segment { format!(",pci_segment={pci_segment}") } else { @@ -1775,7 +1799,7 @@ fn _test_virtio_fs( // Add back and check it works let (cmd_success, cmd_output) = - remote_command_w_output(&api_socket, "add-fs", Some(&fs_params)); + remote_command_w_output(&api_socket, add_arg, Some(&fs_params)); assert!(cmd_success); if let Some(pci_segment) = pci_segment { assert!(String::from_utf8_lossy(&cmd_output).contains(&format!( @@ -5152,22 +5176,42 @@ mod common_parallel { #[test] fn test_virtio_fs() { - _test_virtio_fs(&prepare_virtiofsd, false, None); + _test_virtio_fs(&prepare_virtiofsd, false, false, None); } #[test] fn test_virtio_fs_hotplug() { - _test_virtio_fs(&prepare_virtiofsd, true, None); + _test_virtio_fs(&prepare_virtiofsd, true, false, None); } #[test] fn test_virtio_fs_multi_segment_hotplug() { - _test_virtio_fs(&prepare_virtiofsd, true, Some(15)); + _test_virtio_fs(&prepare_virtiofsd, true, false, Some(15)); } #[test] fn test_virtio_fs_multi_segment() { - _test_virtio_fs(&prepare_virtiofsd, false, Some(15)); + _test_virtio_fs(&prepare_virtiofsd, false, false, Some(15)); + } + + #[test] + fn test_generic_vhost_user() { + _test_virtio_fs(&prepare_virtiofsd, false, true, None); + } + + #[test] + fn test_generic_vhost_user_hotplug() { + _test_virtio_fs(&prepare_virtiofsd, true, true, None); + } + + #[test] + fn test_generic_vhost_user_multi_segment_hotplug() { + _test_virtio_fs(&prepare_virtiofsd, true, true, Some(15)); + } + + #[test] + fn 
test_generic_vhost_user_multi_segment() { + _test_virtio_fs(&prepare_virtiofsd, false, true, Some(15)); } #[test] From 042d1abd6734652cf5ab3591d8c1a9b400e713bb Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Wed, 4 Feb 2026 09:47:38 -0500 Subject: [PATCH 014/742] docs: generic vhost-user: document Include documentation for the generic vhost-user device. Signed-off-by: Demi Marie Obenour --- docs/api.md | 61 +++++++++++++++--------------- docs/device_model.md | 18 +++++++++ docs/generic-vhost-user.md | 76 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 125 insertions(+), 30 deletions(-) create mode 100644 docs/generic-vhost-user.md diff --git a/docs/api.md b/docs/api.md index 8f7a10642d..cea3f31812 100644 --- a/docs/api.md +++ b/docs/api.md @@ -72,36 +72,37 @@ The Cloud Hypervisor API exposes the following actions through its endpoints: ##### Virtual Machine (VM) Actions | Action | Endpoint | Request Body | Response Body | Prerequisites | -| ---------------------------------- | ----------------------- | ------------------------------- | ------------------------ | ------------------------------------------------------ | -| Create the VM | `/vm.create` | `/schemas/VmConfig` | N/A | The VM is not created yet | -| Delete the VM | `/vm.delete` | N/A | N/A | N/A | -| Boot the VM | `/vm.boot` | N/A | N/A | The VM is created but not booted | -| Shut the VM down | `/vm.shutdown` | N/A | N/A | The VM is booted | -| Reboot the VM | `/vm.reboot` | N/A | N/A | The VM is booted | -| Trigger power button of the VM | `/vm.power-button` | N/A | N/A | The VM is booted | -| Pause the VM | `/vm.pause` | N/A | N/A | The VM is booted | -| Resume the VM | `/vm.resume` | N/A | N/A | The VM is paused | -| Take a snapshot of the VM | `/vm.snapshot` | `/schemas/VmSnapshotConfig` | N/A | The VM is paused | -| Perform a coredump of the VM* | `/vm.coredump` | `/schemas/VmCoredumpData` | N/A | The VM is paused | -| Restore the VM from a snapshot | `/vm.restore` | 
`/schemas/RestoreConfig` | N/A | The VM is created but not booted | -| Add/remove CPUs to/from the VM | `/vm.resize` | `/schemas/VmResize` | N/A | The VM is booted | -| Add/remove memory from the VM | `/vm.resize` | `/schemas/VmResize` | N/A | The VM is booted | -| Resize a disk attached to the VM | `/vm.resize-disk` | `/schemas/VmResizeDisk` | N/A | The VM is created | -| Add/remove memory from a zone | `/vm.resize-zone` | `/schemas/VmResizeZone` | N/A | The VM is booted | -| Dump the VM information | `/vm.info` | N/A | `/schemas/VmInfo` | The VM is created | -| Add VFIO PCI device to the VM | `/vm.add-device` | `/schemas/VmAddDevice` | `/schemas/PciDeviceInfo` | The VM is booted | -| Add disk device to the VM | `/vm.add-disk` | `/schemas/DiskConfig` | `/schemas/PciDeviceInfo` | The VM is booted | -| Add fs device to the VM | `/vm.add-fs` | `/schemas/FsConfig` | `/schemas/PciDeviceInfo` | The VM is booted | -| Add pmem device to the VM | `/vm.add-pmem` | `/schemas/PmemConfig` | `/schemas/PciDeviceInfo` | The VM is booted | -| Add network device to the VM | `/vm.add-net` | `/schemas/NetConfig` | `/schemas/PciDeviceInfo` | The VM is booted | -| Add userspace PCI device to the VM | `/vm.add-user-device` | `/schemas/VmAddUserDevice` | `/schemas/PciDeviceInfo` | The VM is booted | -| Add vdpa device to the VM | `/vm.add-vdpa` | `/schemas/VdpaConfig` | `/schemas/PciDeviceInfo` | The VM is booted | -| Add vsock device to the VM | `/vm.add-vsock` | `/schemas/VsockConfig` | `/schemas/PciDeviceInfo` | The VM is booted | -| Remove device from the VM | `/vm.remove-device` | `/schemas/VmRemoveDevice` | N/A | The VM is booted | -| Dump the VM counters | `/vm.counters` | N/A | `/schemas/VmCounters` | The VM is booted | -| Inject an NMI | `/vm.nmi` | N/A | N/A | The VM is booted | -| Prepare to receive a migration | `/vm.receive-migration` | `/schemas/ReceiveMigrationData` | N/A | N/A | -| Start to send migration to target | `/vm.send-migration` | `/schemas/SendMigrationData` | 
N/A | The VM is booted and (shared mem or hugepages enabled) | +| --------------------------------------- | ---------------------------- | --------------------------------- | ------------------------ | ------------------------------------------------------ | +| Create the VM | `/vm.create` | `/schemas/VmConfig` | N/A | The VM is not created yet | +| Delete the VM | `/vm.delete` | N/A | N/A | N/A | +| Boot the VM | `/vm.boot` | N/A | N/A | The VM is created but not booted | +| Shut the VM down | `/vm.shutdown` | N/A | N/A | The VM is booted | +| Reboot the VM | `/vm.reboot` | N/A | N/A | The VM is booted | +| Trigger power button of the VM | `/vm.power-button` | N/A | N/A | The VM is booted | +| Pause the VM | `/vm.pause` | N/A | N/A | The VM is booted | +| Resume the VM | `/vm.resume` | N/A | N/A | The VM is paused | +| Take a snapshot of the VM | `/vm.snapshot` | `/schemas/VmSnapshotConfig` | N/A | The VM is paused | +| Perform a coredump of the VM* | `/vm.coredump` | `/schemas/VmCoredumpData` | N/A | The VM is paused | +| Restore the VM from a snapshot | `/vm.restore` | `/schemas/RestoreConfig` | N/A | The VM is created but not booted | +| Add/remove CPUs to/from the VM | `/vm.resize` | `/schemas/VmResize` | N/A | The VM is booted | +| Add/remove memory from the VM | `/vm.resize` | `/schemas/VmResize` | N/A | The VM is booted | +| Resize a disk attached to the VM | `/vm.resize-disk` | `/schemas/VmResizeDisk` | N/A | The VM is created | +| Add/remove memory from a zone | `/vm.resize-zone` | `/schemas/VmResizeZone` | N/A | The VM is booted | +| Dump the VM information | `/vm.info` | N/A | `/schemas/VmInfo` | The VM is created | +| Add VFIO PCI device to the VM | `/vm.add-device` | `/schemas/VmAddDevice` | `/schemas/PciDeviceInfo` | The VM is booted | +| Add disk device to the VM | `/vm.add-disk` | `/schemas/DiskConfig` | `/schemas/PciDeviceInfo` | The VM is booted | +| Add fs device to the VM | `/vm.add-fs` | `/schemas/FsConfig` | `/schemas/PciDeviceInfo` | The VM 
is booted | +| Add generic vhost-user device to the VM | `/vm.add-generic-vhost-user` | `/schemas/GenericVhostUserConfig` | `/schemas/PciDeviceInfo` | The VM is booted | +| Add pmem device to the VM | `/vm.add-pmem` | `/schemas/PmemConfig` | `/schemas/PciDeviceInfo` | The VM is booted | +| Add network device to the VM | `/vm.add-net` | `/schemas/NetConfig` | `/schemas/PciDeviceInfo` | The VM is booted | +| Add userspace PCI device to the VM | `/vm.add-user-device` | `/schemas/VmAddUserDevice` | `/schemas/PciDeviceInfo` | The VM is booted | +| Add vdpa device to the VM | `/vm.add-vdpa` | `/schemas/VdpaConfig` | `/schemas/PciDeviceInfo` | The VM is booted | +| Add vsock device to the VM | `/vm.add-vsock` | `/schemas/VsockConfig` | `/schemas/PciDeviceInfo` | The VM is booted | +| Remove device from the VM | `/vm.remove-device` | `/schemas/VmRemoveDevice` | N/A | The VM is booted | +| Dump the VM counters | `/vm.counters` | N/A | `/schemas/VmCounters` | The VM is booted | +| Inject an NMI | `/vm.nmi` | N/A | N/A | The VM is booted | +| Prepare to receive a migration | `/vm.receive-migration` | `/schemas/ReceiveMigrationData` | N/A | N/A | +| Start to send migration to target | `/vm.send-migration` | `/schemas/SendMigrationData` | N/A | The VM is booted and (shared mem or hugepages enabled) | * The `vmcoredump` action is available exclusively for the `x86_64` architecture and can be executed only when the `guest_debug` feature is diff --git a/docs/device_model.md b/docs/device_model.md index c072dc2eb6..ed4577a2cd 100644 --- a/docs/device_model.md +++ b/docs/device_model.md @@ -201,6 +201,24 @@ networking device (e.g. DPDK) into the VMM as their virtio network backend. This device is always built-in, and it is enabled when `vhost_user=true` and `socket` are provided to the `--net` parameter. +### vhost-user-generic + +This is a generic vhost-user device. The main use case is to provide a +vhost-user device that Cloud Hypervisor doesn't support natively. 
However, +there is nothing preventing its use for devices that Cloud Hypervisor does +support. For instance, the tag of a virtio-fs device can be set on the +virtiofsd command line, whereas the built-in virtio-fs support +requires the tag to be set in Cloud Hypervisor's command line. + +If the backend negotiates the `VHOST_USER_PROTOCOL_F_CONFIG` feature, +all configuration space access will be handled by it. Otherwise, +writes will be ignored and reads will return 0xFF. Cloud Hypervisor +warns if this happens. + +This device is always built-in, and it is enabled when the +`--generic-vhost-user` flag is passed. +See [the generic vhost-user documentation](generic-vhost-user.md) for more details. + ## VFIO VFIO (Virtual Function I/O) is a kernel framework that exposes direct device diff --git a/docs/generic-vhost-user.md b/docs/generic-vhost-user.md new file mode 100644 index 0000000000..6af813e28c --- /dev/null +++ b/docs/generic-vhost-user.md @@ -0,0 +1,76 @@ +# How to use generic vhost-user devices + +## What is a generic vhost-user device? + +Cloud Hypervisor deliberately does not have support for all types of virtio devices. +For instance, it does not natively support sound or media. + +However, the vhost-user protocol does not require the frontend to have separate +code for each type of vhost-user device. This allows writing a *generic* frontend +that supports almost all of them. + +Any vhost-user device that only uses supported protocol messages is +expected to work. It can (and often will) be of a type that Cloud +Hypervisor does not know about. It can even be of a type that is +not standardized. + +Virtio-GPU is known to *not* work. The version implemented in QEMU +requires `VHOST_USER_GPU_SET_SOCKET`, which is standard but will +never be implemented by Cloud Hypervisor. Other versions require +messages that have not been standardized. In the future, these +versions might be supported. 
+ +## Examples + +virtiofsd meets these requirements if the `--tag` argument is passed. +Therefore, generic vhost-user can be used as an alternative to the built-in +virtio-fs support. See [fs.md](fs.md) for how to build the virtiofs daemon. + +To use generic vhost-user with virtiofsd, use a command line argument +similar to this: + +```bash +/path/to/virtiofsd \ + --tag=myfs \ + --log-level=debug \ + "--socket-path=$path_to_virtiofsd_socket" \ + "--shared-dir=$path_to_shared_directory" \ + "${other_virtiofsd_options[@]}" & + +/path/to/cloud-hypervisor \ + --cpus boot=1 \ + --memory size=1G,shared=on \ + --disk path=your-linux-image.iso \ + --kernel vmlinux \ + --cmdline "console=hvc0 root=/dev/vda1 rw" \ + --generic-vhost-user "socket=\"${path_to_virtiofsd_socket//\"/\"\"}\",virtio_id=26,queue_sizes=[512,512]" \ + "${other_cloud_hypervisor_options[@]}" +``` + +26 is the ID for a virtio-fs device. The IDs for other devices are defined +by the VIRTIO specification. The odd-looking variable expansion escapes +any double quotes in the socket path. It is also possible to provide +the name that is defined by the virtio specification, so `virtio_id=fs` +will also work. + +Inside the guest, you can mount the virtio-fs device with + +```bash +mkdir mount_dir +mount -t virtiofs -- myfs mount_dir/ +``` + +## Limitations + +Cloud Hypervisor does not save, restore, or migrate the PCI configuration +space of a generic vhost-user device. The backend can do it itself, but if +it does not these features will not work. + +Cloud Hypervisor cannot validate the number or size of the queues. Some +guest drivers do not validate these and will crash if they are wrong. +Notably, at least some versions of Linux will crash if one creates a +virtio-fs device (id 26) with only one queue. + +If any access to configuration space fails, Cloud Hypervisor will panic +instead of injecting an exception into the guest. It is unclear what +correct behavior is in this case. 
From 7ea1fa07a2b1aa9de5b16fdf5a0a257bf81f0a91 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Fri, 13 Feb 2026 16:44:14 -0500 Subject: [PATCH 015/742] misc: generic vhost-user: support human-readable device ID This avoids having to pass the numeric device ID, which is not very meaningful to humans. Signed-off-by: Demi Marie Obenour --- vmm/src/config.rs | 66 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 63 insertions(+), 3 deletions(-) diff --git a/vmm/src/config.rs b/vmm/src/config.rs index 25c0d96ae6..8c8b5cbda9 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -19,6 +19,7 @@ use option_parser::{ use serde::{Deserialize, Serialize}; use thiserror::Error; use virtio_bindings::virtio_blk::VIRTIO_BLK_ID_BYTES; +use virtio_bindings::virtio_ids::*; use virtio_devices::block::MINIMUM_BLOCK_QUEUE_SIZE; use virtio_devices::vhost_user::VIRTIO_FS_TAG_LEN; use virtio_devices::{RateLimiterConfig, TokenBucketConfig}; @@ -46,6 +47,11 @@ pub enum Error { /// Filesystem socket is missing #[error("Error parsing --fs: socket missing")] ParseFsSockMissing, + /// Generic vhost-user virtio ID is invalid + #[error( + "Error parsing --generic-vhost-user: virtio ID {0:?} invalid (leading zeros or unknown string)" + )] + ParseGenericVhostUserVirtioIdInvalid(String), /// Generic vhost-user socket is missing #[error("Error parsing --generic-vhost-user: socket missing")] ParseGenericVhostUserSockMissing, @@ -1670,7 +1676,7 @@ impl BalloonConfig { impl GenericVhostUserConfig { pub const SYNTAX: &'static str = "generic vhost-user parameters \ - \"virtio_id=,\ + \"virtio_id=,\ socket=,\ queue_sizes=,\ id=,pci_segment=\""; @@ -1695,10 +1701,64 @@ impl GenericVhostUserConfig { .convert("queue_sizes") .map_err(Error::ParseGenericVhostUser)? .ok_or(Error::ParseGenericVhostUserQueueSizeMissing)?; - let device_type = parser - .convert("virtio_id") + let device_type_str = parser + .convert::("virtio_id") .map_err(Error::ParseGenericVhostUser)? 
.ok_or(Error::ParseGenericVhostUserVirtioIdMissing)?; + let device_type = match device_type_str.as_bytes() { + b"net" => VIRTIO_ID_NET, + b"block" => VIRTIO_ID_BLOCK, + b"console" => VIRTIO_ID_CONSOLE, + b"rng" => VIRTIO_ID_RNG, + b"balloon" => VIRTIO_ID_BALLOON, + b"iomem" => VIRTIO_ID_IOMEM, + b"rpmsg" => VIRTIO_ID_RPMSG, + b"scsi" => VIRTIO_ID_SCSI, + b"9p" => VIRTIO_ID_9P, + b"mac80211_wlan" => VIRTIO_ID_MAC80211_WLAN, + b"rproc_serial" => VIRTIO_ID_RPROC_SERIAL, + b"caif" => VIRTIO_ID_CAIF, + b"memory_balloon" => VIRTIO_ID_MEMORY_BALLOON, + b"gpu" => VIRTIO_ID_GPU, + b"clock" => VIRTIO_ID_CLOCK, + b"input" => VIRTIO_ID_INPUT, + b"vsock" => VIRTIO_ID_VSOCK, + b"crypto" => VIRTIO_ID_CRYPTO, + b"signal_dist" => VIRTIO_ID_SIGNAL_DIST, + b"pstore" => VIRTIO_ID_PSTORE, + b"iommu" => VIRTIO_ID_IOMMU, + b"mem" => VIRTIO_ID_MEM, + b"sound" => VIRTIO_ID_SOUND, + b"fs" => VIRTIO_ID_FS, + b"pmem" => VIRTIO_ID_PMEM, + b"rpmb" => VIRTIO_ID_RPMB, + b"mac80211_hwsim" => VIRTIO_ID_MAC80211_HWSIM, + b"video_encoder" => VIRTIO_ID_VIDEO_ENCODER, + b"video_decoder" => VIRTIO_ID_VIDEO_DECODER, + b"scmi" => VIRTIO_ID_SCMI, + b"nitro_sec_mod" => VIRTIO_ID_NITRO_SEC_MOD, + b"i2c" => VIRTIO_ID_I2C_ADAPTER, + b"watchdog" => VIRTIO_ID_WATCHDOG, + b"can" => VIRTIO_ID_CAN, + b"dmabuf" => VIRTIO_ID_DMABUF, + b"param_serv" => VIRTIO_ID_PARAM_SERV, + b"audio_policy" => VIRTIO_ID_AUDIO_POLICY, + b"bt" => VIRTIO_ID_BT, + b"gpio" => VIRTIO_ID_GPIO, + b"rdma" => 42, + b"camera" => 43, + b"ism" => 44, + b"spi" => 45, + b"tee" => 46, + b"cpu_balloon" => 47, + b"media" => 48, + b"usb" => 49, + [b'1'..=b'9', ..] 
=> match device_type_str.parse() { + Ok(id) => id, + Err(_) => return Err(Error::ParseGenericVhostUserVirtioIdInvalid(device_type_str)), + }, + _ => return Err(Error::ParseGenericVhostUserVirtioIdInvalid(device_type_str)), + }; let id = parser.get("id"); let pci_segment = parser .convert("pci_segment") From 989f3a323317878dd98c6cb97ddd02a7cb73a238 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Fri, 13 Feb 2026 16:45:10 -0500 Subject: [PATCH 016/742] misc: generic vhost-user: reject virtio device IDs that cannot work Some virtio devices cannot be implemented via vhost-user because they require tight integration with the VMM. This includes the IOMMU and watchdog devices. An attempt to create a generic vhost-user device with one of these IDs is always either a bug or human error. To aid debugging, return a helpful error message rather than silently continuing. Signed-off-by: Demi Marie Obenour --- vmm/src/config.rs | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/vmm/src/config.rs b/vmm/src/config.rs index 8c8b5cbda9..3e65f6a6d9 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -52,6 +52,11 @@ pub enum Error { "Error parsing --generic-vhost-user: virtio ID {0:?} invalid (leading zeros or unknown string)" )] ParseGenericVhostUserVirtioIdInvalid(String), + /// Generic vhost-user virtio ID is unsupported + #[error( + "Error parsing --generic-vhost-user: device with virtio ID {0:?} cannot be implemented via vhost-user" + )] + ParseGenericVhostUserVirtioIdUnsupported(String), /// Generic vhost-user socket is missing #[error("Error parsing --generic-vhost-user: socket missing")] ParseGenericVhostUserSockMissing, @@ -1759,6 +1764,18 @@ impl GenericVhostUserConfig { }, _ => return Err(Error::ParseGenericVhostUserVirtioIdInvalid(device_type_str)), }; + match device_type { + // vhost-user devices of these types definitely cannot work. 
+ // Cloud Hypervisor needs to know if an IOMMU exists so that it + // can perform address translation, and a vhost-user device has + // no supported way to reset the guest. + VIRTIO_ID_WATCHDOG | VIRTIO_ID_IOMMU => { + return Err(Error::ParseGenericVhostUserVirtioIdUnsupported( + device_type_str, + )); + } + _ => {} + } let id = parser.get("id"); let pci_segment = parser .convert("pci_segment") @@ -3992,6 +4009,9 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" )); if pci_segment <= u16::MAX.into() && virtio_id <= u32::MAX.into() + && virtio_id != u64::from(VIRTIO_ID_BALLOON) + && virtio_id != u64::from(VIRTIO_ID_WATCHDOG) + && virtio_id != u64::from(VIRTIO_ID_IOMMU) && queue_sizes.0.iter().all(|&f| f <= u16::MAX.into()) { assert_eq!( From 25a63c8b879ea8311a2c47026abc11b909decefb Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 25 Feb 2026 13:57:09 +0100 Subject: [PATCH 017/742] tests: block: Add trailing_args parameter to run_qemu_img() Extend run_qemu_img() with an optional trailing_args parameter for arguments that follow the image path, such as the size in 'qemu-img create -f raw 128M'. 
Signed-off-by: Anatol Belski --- cloud-hypervisor/tests/integration.rs | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 221aadfbaf..5ac8d91597 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -3570,17 +3570,23 @@ mod common_parallel { _test_virtio_block(FOCAL_IMAGE_NAME, true, true, false, false, ImageType::Raw); } - fn run_qemu_img(path: &std::path::Path, args: &[&str]) -> std::process::Output { - std::process::Command::new("qemu-img") - .arg(args[0]) + fn run_qemu_img( + path: &std::path::Path, + args: &[&str], + trailing_args: Option<&[&str]>, + ) -> std::process::Output { + let mut cmd = std::process::Command::new("qemu-img"); + cmd.arg(args[0]) .args(&args[1..]) - .arg(path.to_str().unwrap()) - .output() - .unwrap() + .arg(path.to_str().unwrap()); + if let Some(extra) = trailing_args { + cmd.args(extra); + } + cmd.output().unwrap() } fn get_image_info(path: &std::path::Path) -> Option { - let output = run_qemu_img(path, &["info", "-U", "--output=json"]); + let output = run_qemu_img(path, &["info", "-U", "--output=json"], None); output.status.success().then(|| ())?; serde_json::from_slice(&output.stdout).ok() @@ -3759,7 +3765,7 @@ mod common_parallel { initial_backing_checksum: Option<(std::path::PathBuf, String, u32)>, ) { let path = resolve_disk_path(path_or_image_name); - let output = run_qemu_img(&path, &["check"]); + let output = run_qemu_img(&path, &["check"], None); assert!( output.status.success(), @@ -3777,7 +3783,7 @@ mod common_parallel { if let Some((backing_path, format, initial_checksum)) = initial_backing_checksum { if format.parse::().ok() != Some(block::qcow::ImageType::Raw) { - let output = run_qemu_img(&backing_path, &["check"]); + let output = run_qemu_img(&backing_path, &["check"], None); assert!( output.status.success(), From bc374c537c3d25e90eac663d3c15ffa92e27db58 
Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 25 Feb 2026 14:29:12 +0100 Subject: [PATCH 018/742] tests: block: Add test for block device discard on loop device Verify that a loopback block device advertises VIRTIO_BLK_F_DISCARD to the guest and that blkdiscard succeeds. Signed-off-by: Anatol Belski --- cloud-hypervisor/tests/integration.rs | 103 ++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 5ac8d91597..bbff720057 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -7646,6 +7646,109 @@ mod common_parallel { _test_virtio_block_discard("vhdx", "vhdx", &[], false, false); } + #[test] + fn test_virtio_block_discard_loop_device() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)); + let kernel_path = direct_kernel_boot_path(); + + let test_disk_path = guest.tmp_dir.as_path().join("loop_discard_test.raw"); + let res = run_qemu_img(&test_disk_path, &["create", "-f", "raw"], Some(&["128M"])); + assert!( + res.status.success(), + "Failed to create raw backing image: {}", + String::from_utf8_lossy(&res.stderr) + ); + + let loop_dev = create_loop_device(test_disk_path.to_str().unwrap(), 4096, 5); + + let mut child = GuestCommand::new(&guest) + .args(["--cpus", "boot=1"]) + .args(["--memory", "size=512M"]) + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .args([ + "--disk", + format!( + "path={}", + guest.disk_config.disk(DiskType::OperatingSystem).unwrap() + ) + .as_str(), + format!( + "path={}", + guest.disk_config.disk(DiskType::CloudInit).unwrap() + ) + .as_str(), + format!("path={},image_type=raw", &loop_dev).as_str(), + ]) + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + assert_eq!( + guest 
+ .ssh_command("lsblk | grep -c vdc") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 1 + ); + + assert_eq!( + guest + .ssh_command("lsblk -t | grep vdc | awk '{print $6}'") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 4096 + ); + + let discard_max = guest + .ssh_command("cat /sys/block/vdc/queue/discard_max_bytes") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(); + assert!( + discard_max > 0, + "discard_max_bytes={discard_max}, VIRTIO_BLK_F_DISCARD not negotiated" + ); + + guest + .ssh_command("sudo dd if=/dev/urandom of=/dev/vdc bs=4096 count=1024 oflag=direct") + .unwrap(); + guest.ssh_command("sync").unwrap(); + + let result = guest + .ssh_command("sudo blkdiscard -v -o 0 -l 4194304 /dev/vdc 2>&1 || true") + .unwrap(); + assert!( + !result.contains("Operation not supported") + && !result.contains("BLKDISCARD ioctl failed"), + "blkdiscard failed on loop device: {result}" + ); + + guest.ssh_command("sync").unwrap(); + + assert_guest_disk_region_is_zero(&guest, "/dev/vdc", 0, 4194304); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + handle_child_output(r, &output); + + Command::new("losetup") + .args(["-d", &loop_dev]) + .output() + .expect("loop device not found"); + } + fn _test_virtio_block_fstrim( format_name: &str, qemu_img_format: &str, From 8a165aef3b6995a5f9db393e75eb4aa52352b585 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 25 Feb 2026 15:04:14 +0100 Subject: [PATCH 019/742] tests: block: Add integration test for DM snapshot discard failure Verify that the guest remains stable when BLKDISCARD fails on the host backend. DM snapshot targets do not support discard, so the VMM returns VIRTIO_BLK_S_IOERR. The test retries blkdiscard several times, checking guest responsiveness after each attempt, then confirms normal I/O still works. The DM topology follows the same pattern used by WindowsDiskConfig. 
Signed-off-by: Anatol Belski --- cloud-hypervisor/tests/integration.rs | 183 ++++++++++++++++++++++++++ 1 file changed, 183 insertions(+) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index bbff720057..2037ad0083 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -7749,6 +7749,189 @@ mod common_parallel { .expect("loop device not found"); } + #[test] + fn test_virtio_block_discard_dm_snapshot() { + // Verify that the guest remains stable when BLKDISCARD fails on the + // host backend. DM snapshot targets do not support discard, so the + // VMM returns VIRTIO_BLK_S_IOERR. The guest must handle this + // gracefully even under repeated attempts. + // + // DM topology follows the same pattern used by WindowsDiskConfig. + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)); + let kernel_path = direct_kernel_boot_path(); + + let origin_path = guest.tmp_dir.as_path().join("dm_origin.raw"); + let cow_path = guest.tmp_dir.as_path().join("dm_cow.raw"); + + let res = run_qemu_img(&origin_path, &["create", "-f", "raw"], Some(&["128M"])); + assert!( + res.status.success(), + "Failed to create origin image: {}", + String::from_utf8_lossy(&res.stderr) + ); + + let cow_size: u64 = 128 << 20; + let cow_sectors = cow_size / 512; + let cow_file = File::create(&cow_path).expect("Expect creating COW image to succeed"); + cow_file + .set_len(cow_size) + .expect("Expect truncating COW image to succeed"); + + let origin_sectors: u64 = 128 * 1024 * 1024 / 512; + let origin_loop = create_loop_device(origin_path.to_str().unwrap(), 4096, 5); + let cow_loop = create_loop_device(cow_path.to_str().unwrap(), 512, 5); + + let unique = format!( + "ch-test-{}", + guest + .tmp_dir + .as_path() + .file_name() + .unwrap() + .to_str() + .unwrap() + ); + let cow_dm_name = format!("{unique}-cow"); + let snap_dm_name = format!("{unique}-snap"); + + 
let output = Command::new("dmsetup") + .args([ + "create", + &cow_dm_name, + "--table", + &format!("0 {cow_sectors} linear {cow_loop} 0"), + ]) + .output() + .expect("Failed to run dmsetup"); + assert!( + output.status.success(), + "dmsetup create (cow linear) failed: {}", + String::from_utf8_lossy(&output.stderr) + ); + + Command::new("dmsetup") + .arg("mknodes") + .output() + .expect("dmsetup mknodes failed"); + + // dm-snapshot: origin + COW, non-persistent, chunk size 8 sectors. + let output = Command::new("dmsetup") + .args([ + "create", + &snap_dm_name, + "--table", + &format!("0 {origin_sectors} snapshot {origin_loop} /dev/mapper/{cow_dm_name} N 8"), + ]) + .output() + .expect("Failed to run dmsetup"); + assert!( + output.status.success(), + "dmsetup create (snapshot) failed: {}", + String::from_utf8_lossy(&output.stderr) + ); + + Command::new("dmsetup") + .arg("mknodes") + .output() + .expect("dmsetup mknodes failed"); + + let dm_dev = format!("/dev/mapper/{snap_dm_name}"); + + let mut child = GuestCommand::new(&guest) + .args(["--cpus", "boot=1"]) + .args(["--memory", "size=512M"]) + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .args([ + "--disk", + format!( + "path={}", + guest.disk_config.disk(DiskType::OperatingSystem).unwrap() + ) + .as_str(), + format!( + "path={}", + guest.disk_config.disk(DiskType::CloudInit).unwrap() + ) + .as_str(), + format!("path={},image_type=raw", &dm_dev).as_str(), + ]) + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + assert_eq!( + guest + .ssh_command("lsblk | grep -c vdc") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 1 + ); + + let discard_max = guest + .ssh_command("cat /sys/block/vdc/queue/discard_max_bytes") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(); + assert!( + discard_max > 0, + "discard_max_bytes={discard_max}, VIRTIO_BLK_F_DISCARD not 
negotiated" + ); + + guest + .ssh_command("sudo dd if=/dev/urandom of=/dev/vdc bs=4096 count=1024 oflag=direct") + .unwrap(); + guest.ssh_command("sync").unwrap(); + + // Discard is expected to fail on DM snapshot because the + // snapshot target does not support BLKDISCARD. + for attempt in 1..=3 { + let result = guest + .ssh_command("sudo blkdiscard -o 0 -l 4194304 /dev/vdc 2>&1; echo rc=$?") + .unwrap(); + println!("blkdiscard attempt {attempt}: {result}"); + + let uptime = guest.ssh_command("uptime").unwrap(); + assert!( + !uptime.is_empty(), + "Guest unresponsive after blkdiscard attempt {attempt}" + ); + } + + guest + .ssh_command("sudo dd if=/dev/urandom of=/dev/vdc bs=4096 count=256 oflag=direct") + .unwrap(); + let readback = guest + .ssh_command("sudo dd if=/dev/vdc bs=4096 count=1 iflag=direct 2>/dev/null | od -A n -t x1 | head -1") + .unwrap(); + assert!( + !readback.trim().is_empty(), + "Failed to read back from device after discard errors" + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + handle_child_output(r, &output); + + let _ = Command::new("dmsetup") + .args(["remove", &snap_dm_name]) + .output(); + let _ = Command::new("dmsetup") + .args(["remove", &cow_dm_name]) + .output(); + let _ = Command::new("losetup").args(["-d", &origin_loop]).output(); + let _ = Command::new("losetup").args(["-d", &cow_loop]).output(); + } + fn _test_virtio_block_fstrim( format_name: &str, qemu_img_format: &str, From e067c768802bb7ba4f40154176ab88f1e4ba9868 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 25 Feb 2026 15:18:43 +0100 Subject: [PATCH 020/742] tests: windows: Disable sparse for DM snapshot OS disks The Windows tests use a DM snapshot device for the OS disk. DM snapshot targets do not support BLKDISCARD, so the VMM returns IOERR for every TRIM attempt. viostor.sys may BSOD when the host returns an error for negotiated discard/write-zeroes operations. 
Add a default_disks_sparse_off() helper to GuestCommand and use it in all Windows tests. Signed-off-by: Anatol Belski --- cloud-hypervisor/tests/integration.rs | 18 +++++------ test_infra/src/lib.rs | 46 ++++++++++++--------------- 2 files changed, 30 insertions(+), 34 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 2037ad0083..99a22f2730 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -11537,7 +11537,7 @@ mod windows { .args(["--kernel", edk2_path().to_str().unwrap()]) .args(["--serial", "tty"]) .args(["--console", "off"]) - .default_disks() + .default_disks_sparse_off() .default_net() .capture_output() .spawn() @@ -11586,7 +11586,7 @@ mod windows { .args([ "--disk", format!( - "path={},num_queues=4", + "path={},num_queues=4,sparse=off", windows_guest .guest() .disk_config @@ -11654,7 +11654,7 @@ mod windows { .args(["--kernel", ovmf_path.to_str().unwrap()]) .args(["--serial", "tty"]) .args(["--console", "off"]) - .default_disks() + .default_disks_sparse_off() .default_net() .capture_output() .spawn() @@ -11743,7 +11743,7 @@ mod windows { .args(["--kernel", ovmf_path.to_str().unwrap()]) .args(["--serial", "tty"]) .args(["--console", "off"]) - .default_disks() + .default_disks_sparse_off() .default_net() .capture_output() .spawn() @@ -11818,7 +11818,7 @@ mod windows { .args(["--kernel", ovmf_path.to_str().unwrap()]) .args(["--serial", "tty"]) .args(["--console", "off"]) - .default_disks() + .default_disks_sparse_off() .default_net() .capture_output() .spawn() @@ -11892,7 +11892,7 @@ mod windows { .args(["--kernel", ovmf_path.to_str().unwrap()]) .args(["--serial", "tty"]) .args(["--console", "off"]) - .default_disks() + .default_disks_sparse_off() .default_net() .capture_output() .spawn() @@ -11966,7 +11966,7 @@ mod windows { .args(["--kernel", ovmf_path.to_str().unwrap()]) .args(["--serial", "tty"]) .args(["--console", "off"]) - .default_disks() + 
.default_disks_sparse_off() .default_net() .capture_output() .spawn() @@ -12062,7 +12062,7 @@ mod windows { .args(["--kernel", ovmf_path.to_str().unwrap()]) .args(["--serial", "tty"]) .args(["--console", "off"]) - .default_disks() + .default_disks_sparse_off() .default_net() .capture_output() .spawn() @@ -12194,7 +12194,7 @@ mod windows { .args(["--kernel", ovmf_path.to_str().unwrap()]) .args(["--serial", "tty"]) .args(["--console", "off"]) - .default_disks() + .default_disks_sparse_off() // The multi net dev config is borrowed from test_multiple_network_interfaces .args([ "--net", diff --git a/test_infra/src/lib.rs b/test_infra/src/lib.rs index 3ab550870e..2848a130e5 100644 --- a/test_infra/src/lib.rs +++ b/test_infra/src/lib.rs @@ -1448,35 +1448,31 @@ impl<'a> GuestCommand<'a> { } pub fn default_disks(&mut self) -> &mut Self { - if self.guest.disk_config.disk(DiskType::CloudInit).is_some() { + self.default_disks_inner(true) + } + + pub fn default_disks_sparse_off(&mut self) -> &mut Self { + self.default_disks_inner(false) + } + + fn default_disks_inner(&mut self, sparse: bool) -> &mut Self { + let sparse_opt = if sparse { "" } else { ",sparse=off" }; + let os_disk = format!( + "path={}{}", + self.guest + .disk_config + .disk(DiskType::OperatingSystem) + .unwrap(), + sparse_opt + ); + if let Some(cloud_init) = self.guest.disk_config.disk(DiskType::CloudInit) { self.args([ "--disk", - format!( - "path={}", - self.guest - .disk_config - .disk(DiskType::OperatingSystem) - .unwrap() - ) - .as_str(), - format!( - "path={}", - self.guest.disk_config.disk(DiskType::CloudInit).unwrap() - ) - .as_str(), + os_disk.as_str(), + format!("path={cloud_init}").as_str(), ]) } else { - self.args([ - "--disk", - format!( - "path={}", - self.guest - .disk_config - .disk(DiskType::OperatingSystem) - .unwrap() - ) - .as_str(), - ]) + self.args(["--disk", os_disk.as_str()]) } } From 6404d2d513e4799ac485168bce2a801ecfd69d1d Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Tue, 24 Feb 
2026 22:13:46 +0100 Subject: [PATCH 021/742] block: Assume sparse support for block devices There is no non destructive readonly ioctl to query block device discard or write zeroes capabilities. BLKZEROOUT is guaranteed to succeed via kernel software fallback. BLKDISCARD may fail at runtime with EOPNOTSUPP on devices that lack trim support, but the error propagates to the guest as VIRTIO_BLK_S_IOERR and well behaved guests handle it gracefully. Signed-off-by: Anatol Belski --- block/src/lib.rs | 41 +++++++++++++---------------------------- 1 file changed, 13 insertions(+), 28 deletions(-) diff --git a/block/src/lib.rs b/block/src/lib.rs index 9f78cefd9e..3d45473a9a 100644 --- a/block/src/lib.rs +++ b/block/src/lib.rs @@ -846,34 +846,19 @@ fn probe_file_sparse_support(fd: libc::c_int) -> bool { supported } -/// Probe sparse support for a block device using ioctls. -fn probe_block_device_sparse_support(fd: libc::c_int) -> bool { - ioctl_io_nr!(BLKDISCARD, 0x12, 119); - ioctl_io_nr!(BLKZEROOUT, 0x12, 127); - - let range: [u64; 2] = [0, 0]; - - // SAFETY: FFI call with valid fd and valid range buffer - let punch_hole = unsafe { ioctl(fd, BLKDISCARD() as _, &range) } == 0; - - if !punch_hole { - let err = io::Error::last_os_error(); - debug!("Block device BLKDISCARD probe returned: {err}"); - } - - // SAFETY: FFI call with valid fd and valid range buffer - let zero_range = unsafe { ioctl(fd, BLKZEROOUT() as _, &range) } == 0; - - if !zero_range { - let err = io::Error::last_os_error(); - debug!("Block device BLKZEROOUT probe returned: {err}"); - } - - let supported = punch_hole || zero_range; - info!( - "Probed block device sparse support: punch_hole={punch_hole}, zero_range={zero_range} => {supported}" - ); - supported +/// Probe sparse support for a block device. +/// +/// Block devices always report sparse support. 
`BLKZEROOUT` is guaranteed to +/// succeed as the kernel provides a software fallback writing explicit zeros +/// when the hardware lacks a native write zeroes command. `BLKDISCARD` may fail +/// at runtime with `EOPNOTSUPP` on devices without trim or discard support, but +/// Linux guests handle this gracefully by ceasing discard requests. +/// +/// There is no non destructive read only ioctl to query block device discard +/// or write zeroes capabilities. +fn probe_block_device_sparse_support(_fd: libc::c_int) -> bool { + info!("Block device: assuming sparse support"); + true } /// Preallocate disk space for a disk image file. From c9cf3294ea83ab15cd3790ea71f1702e7bf85922 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Tue, 24 Feb 2026 15:10:59 -0800 Subject: [PATCH 022/742] vmm: remove duplicate vm config parameter Vm::create_device_manager accepted both config and _vm_config, but both represented the same VM configuration source. Remove _vm_config from the function signature and from its call site, and use config for the TDX dynamic check. This is a cleanup-only refactor with no intended functional change. 
Signed-off-by: Muminul Islam --- vmm/src/vm.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index 9c37457c19..3f793cd807 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -612,7 +612,6 @@ impl Vm { #[cfg(not(target_arch = "riscv64"))] timestamp, snapshot, - &config, )?; // Perform hypervisor-specific initialization @@ -798,10 +797,9 @@ impl Vm { boot_id_list: BTreeSet, #[cfg(not(target_arch = "riscv64"))] timestamp: Instant, snapshot: Option<&Snapshot>, - _vm_config: &Arc>, ) -> Result>> { #[cfg(feature = "tdx")] - let dynamic = !_vm_config.lock().unwrap().is_tdx_enabled(); + let dynamic = !config.lock().unwrap().is_tdx_enabled(); #[cfg(not(feature = "tdx"))] let dynamic = true; From 15d1f1d7fdd7b0698ace412c2398fbc3d515bcba Mon Sep 17 00:00:00 2001 From: Pascal Scholz Date: Tue, 24 Feb 2026 14:39:45 +0100 Subject: [PATCH 023/742] vmm: Refactor locking in `AddressManager::move_bar` The current implementation performs multiple operations on allocators in a row, with the single goal of updating the allocator. For each of these operations, the `Mutex` guarding the respective allocator is locked anew which introduces room for race conditions. Instead of locking the mutex multiple times, we should lock it once to perform the whole move. 
Signed-off-by: Pascal Scholz On-behalf-of: SAP pascal.scholz@sap.com --- vmm/src/device_manager.rs | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index f22696c7fe..52e4cddfa0 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -743,15 +743,10 @@ impl DeviceRelocation for AddressManager { ) -> std::result::Result<(), std::io::Error> { match region_type { PciBarRegionType::IoRegion => { + let mut sys_allocator = self.allocator.lock().unwrap(); // Update system allocator - self.allocator - .lock() - .unwrap() - .free_io_addresses(GuestAddress(old_base), len as GuestUsize); - - self.allocator - .lock() - .unwrap() + sys_allocator.free_io_addresses(GuestAddress(old_base), len as GuestUsize); + sys_allocator .allocate_io_addresses(Some(GuestAddress(new_base)), len as GuestUsize, None) .ok_or_else(|| io::Error::other("failed allocating new IO range"))?; @@ -761,26 +756,22 @@ impl DeviceRelocation for AddressManager { .map_err(io::Error::other)?; } PciBarRegionType::Memory32BitRegion | PciBarRegionType::Memory64BitRegion => { - let allocators = if region_type == PciBarRegionType::Memory32BitRegion { + let pci_mmio_allocators = if region_type == PciBarRegionType::Memory32BitRegion { &self.pci_mmio32_allocators } else { &self.pci_mmio64_allocators }; - // Find the specific allocator that this BAR was allocated from and use it for new one - for allocator in allocators { - let allocator_base = allocator.lock().unwrap().base(); - let allocator_end = allocator.lock().unwrap().end(); + // Find the specific allocator that this BAR was allocated from and use it for a new one + for pci_mmio_allocator_mutex in pci_mmio_allocators { + let mut pci_mmio_allocator = pci_mmio_allocator_mutex.lock().unwrap(); - if old_base >= allocator_base.0 && old_base <= allocator_end.0 { - allocator - .lock() - .unwrap() - .free(GuestAddress(old_base), len as GuestUsize); + 
 if old_base >= pci_mmio_allocator.base().0 + && old_base <= pci_mmio_allocator.end().0 + { + pci_mmio_allocator.free(GuestAddress(old_base), len as GuestUsize); - allocator - .lock() - .unwrap() + pci_mmio_allocator .allocate(Some(GuestAddress(new_base)), len as GuestUsize, Some(len)) .ok_or_else(|| io::Error::other("failed allocating new MMIO range"))?; From 3f800d2bb41f2ee92cda53eccb1588feb6930be3 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Mon, 23 Feb 2026 15:31:54 -0800 Subject: [PATCH 024/742] vmm: Add core scheduling support for vCPU threads Add a core_scheduling option to --cpus with three modes of operation. This feature takes advantage of a kernel feature that restricts scheduling of processes on the SMT threads on the same core. This is useful for mitigating certain classes of side-channel attacks and has better performance than disabling SMT on the CPU. - vm (default): All vCPU threads share one core scheduling cookie. They may be co-scheduled on SMT siblings while host threads are excluded - this has minimal performance impact and can even potentially improve performance from co-location. - vcpu: Each vCPU gets a unique cookie preventing any two vCPUs from sharing SMT siblings. This has the strongest isolation but at some compromise of performance. - off: No core scheduling applied (old behaviour). This isolation is done by the kernel maintaining a "cookie" - threads with the same cookie can share the same core. In vCPU mode, the cookie is created when each vCPU thread starts, and each thread gets a unique cookie. For VM mode the first vCPU thread (the leader) will create the cookie. All other vCPU threads started (via hotplug or during boot) will have that cookie shared to it. EINVAL/ENODEV from prctl is silently ignored so this works transparently on kernels older than 5.14 that lack PR_SCHED_CORE or when SMT is disabled.
Full details of this kernel feature can be found at: https://docs.kernel.org/admin-guide/hw-vuln/core-scheduling.html This implementation was inspired by crosvm's implementation - in particular the enable_core_scheduling() function. This is challenging to test via integration testing but the logging of the received cookie shows it working: VM case: cloud-hypervisor: 0.243102s: INFO:vmm/src/cpu.rs:1247 -- vCPU 1: core scheduling cookie = 0x33e4c167 cloud-hypervisor: 0.243102s: INFO:vmm/src/cpu.rs:1247 -- vCPU 0: core scheduling cookie = 0x33e4c167 vCPU case: cloud-hypervisor: 0.089356s: INFO:vmm/src/cpu.rs:1247 -- vCPU 0: core scheduling cookie = 0x13993ad6 cloud-hypervisor: 0.089380s: INFO:vmm/src/cpu.rs:1247 -- vCPU 1: core scheduling cookie = 0xd48e86e Signed-off-by: Rob Bradford --- cloud-hypervisor/src/main.rs | 7 +- docs/cpu.md | 34 +++++- fuzz/fuzz_targets/http_api.rs | 1 + vmm/src/api/openapi/cloud-hypervisor.yaml | 4 + vmm/src/config.rs | 62 +++++++++- vmm/src/cpu.rs | 139 +++++++++++++++++++++- vmm/src/lib.rs | 5 +- vmm/src/vm_config.rs | 11 ++ 8 files changed, 254 insertions(+), 9 deletions(-) diff --git a/cloud-hypervisor/src/main.rs b/cloud-hypervisor/src/main.rs index b2b184248b..b4d2bdf534 100644 --- a/cloud-hypervisor/src/main.rs +++ b/cloud-hypervisor/src/main.rs @@ -214,7 +214,7 @@ fn get_cli_options_sorted( kvm_hyperv=on|off,max_phys_bits=,\ affinity=,\ features=,\ - nested=on|off", + nested=on|off,core_scheduling=vm|vcpu|off", ) .default_value(default_vcpus) .group("vm-config"), @@ -916,8 +916,8 @@ mod unit_tests { #[cfg(target_arch = "x86_64")] use vmm::vm_config::DebugConsoleConfig; use vmm::vm_config::{ - ConsoleConfig, ConsoleOutputMode, CpuFeatures, CpusConfig, HotplugMethod, MemoryConfig, - PayloadConfig, RngConfig, VmConfig, + ConsoleConfig, ConsoleOutputMode, CoreScheduling, CpuFeatures, CpusConfig, HotplugMethod, + MemoryConfig, PayloadConfig, RngConfig, VmConfig, }; use crate::test_util::assert_args_sorted; @@ -968,6 +968,7 @@ mod 
unit_tests { affinity: None, features: CpuFeatures::default(), nested: true, + core_scheduling: CoreScheduling::Vm, }, memory: MemoryConfig { size: 536_870_912, diff --git a/docs/cpu.md b/docs/cpu.md index 6a55942691..8ed247c909 100644 --- a/docs/cpu.md +++ b/docs/cpu.md @@ -19,11 +19,12 @@ struct CpusConfig { affinity: Option>, features: CpuFeatures, nested: bool, + core_scheduling: CoreScheduling, } ``` ``` ---cpus boot=,max=,topology=:::,kvm_hyperv=on|off,max_phys_bits=,affinity=,features=,nested=on|off +--cpus boot=,max=,topology=:::,kvm_hyperv=on|off,max_phys_bits=,affinity=,features=,nested=on|off,core_scheduling=vm|vcpu|off ``` ### `boot` @@ -221,3 +222,34 @@ _Example_ ``` --cpus nested=on ``` + +### `core_scheduling` + +Core scheduling mode for vCPU threads. + +This option controls Linux core scheduling (`PR_SCHED_CORE`) for vCPU threads, +which prevents untrusted tasks from sharing SMT siblings. This mitigates +side-channel attacks (e.g. MDS, L1TF) between vCPU threads. + +Three modes are available: + +- `vm` (default): All vCPU threads share a single core scheduling cookie. + vCPUs may be co-scheduled on SMT siblings of the same core, providing + better performance while still isolating VM threads from host tasks. +- `vcpu`: Each vCPU thread gets its own unique cookie. No two vCPUs can + share SMT siblings, providing the strongest isolation between vCPUs at + the cost of performance. +- `off`: No core scheduling is applied. + +On kernels older than 5.14 (which lack `PR_SCHED_CORE` support), the +option silently has no effect. + +_Example_ + +``` +--cpus boot=2,core_scheduling=vm +``` + +In this example, both vCPUs will share the same core scheduling cookie, +allowing them to be co-scheduled on SMT siblings while preventing host +threads from sharing those siblings. 
diff --git a/fuzz/fuzz_targets/http_api.rs b/fuzz/fuzz_targets/http_api.rs index b7f38994fb..b7128a1678 100644 --- a/fuzz/fuzz_targets/http_api.rs +++ b/fuzz/fuzz_targets/http_api.rs @@ -137,6 +137,7 @@ impl RequestHandler for StubApiRequestHandler { affinity: None, features: CpuFeatures::default(), nested: true, + core_scheduling: CoreScheduling::default(), }, memory: MemoryConfig { size: 536_870_912, diff --git a/vmm/src/api/openapi/cloud-hypervisor.yaml b/vmm/src/api/openapi/cloud-hypervisor.yaml index 499218c7a5..c4f4b6acf0 100644 --- a/vmm/src/api/openapi/cloud-hypervisor.yaml +++ b/vmm/src/api/openapi/cloud-hypervisor.yaml @@ -738,6 +738,10 @@ components: $ref: "#/components/schemas/CpuAffinity" features: $ref: "#/components/schemas/CpuFeatures" + core_scheduling: + type: string + enum: ["Vm", "Vcpu", "Off"] + default: "Vm" PciSegmentConfig: required: diff --git a/vmm/src/config.rs b/vmm/src/config.rs index 3e65f6a6d9..3b32463674 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -596,6 +596,23 @@ impl FromStr for HotplugMethod { } } +pub enum ParseCoreSchedulingError { + InvalidValue(String), +} + +impl FromStr for CoreScheduling { + type Err = ParseCoreSchedulingError; + + fn from_str(s: &str) -> std::result::Result { + match s.to_lowercase().as_str() { + "vm" => Ok(CoreScheduling::Vm), + "vcpu" => Ok(CoreScheduling::Vcpu), + "off" => Ok(CoreScheduling::Off), + _ => Err(ParseCoreSchedulingError::InvalidValue(s.to_owned())), + } + } +} + pub enum CpuTopologyParseError { InvalidValue(String), } @@ -640,7 +657,8 @@ impl CpusConfig { .add("max_phys_bits") .add("affinity") .add("features") - .add("nested"); + .add("nested") + .add("core_scheduling"); parser.parse(cpus).map_err(Error::ParseCpus)?; let boot_vcpus: u32 = parser @@ -707,6 +725,11 @@ impl CpusConfig { "nested=off is not supported on aarch64 and riscv64 architectures".to_string(), ))); } + let core_scheduling = parser + .convert("core_scheduling") + .map_err(Error::ParseCpus)? 
+ .unwrap_or(CoreScheduling::Vm); + Ok(CpusConfig { boot_vcpus, max_vcpus, @@ -716,6 +739,7 @@ impl CpusConfig { affinity, features, nested, + core_scheduling, }) } } @@ -3579,6 +3603,42 @@ mod unit_tests { }, ); + // Test core_scheduling parsing + assert_eq!( + CpusConfig::parse("boot=1,core_scheduling=vm")?, + CpusConfig { + boot_vcpus: 1, + max_vcpus: 1, + core_scheduling: CoreScheduling::Vm, + ..Default::default() + } + ); + assert_eq!( + CpusConfig::parse("boot=1,core_scheduling=vcpu")?, + CpusConfig { + boot_vcpus: 1, + max_vcpus: 1, + core_scheduling: CoreScheduling::Vcpu, + ..Default::default() + } + ); + assert_eq!( + CpusConfig::parse("boot=1,core_scheduling=off")?, + CpusConfig { + boot_vcpus: 1, + max_vcpus: 1, + core_scheduling: CoreScheduling::Off, + ..Default::default() + } + ); + // Default (no core_scheduling specified) should be Vm + assert_eq!( + CpusConfig::parse("boot=1")?.core_scheduling, + CoreScheduling::Vm + ); + // Invalid value should error + CpusConfig::parse("boot=1,core_scheduling=invalid").unwrap_err(); + Ok(()) } diff --git a/vmm/src/cpu.rs b/vmm/src/cpu.rs index bba78e642c..78149c4b55 100644 --- a/vmm/src/cpu.rs +++ b/vmm/src/cpu.rs @@ -17,7 +17,7 @@ use std::io::Write; #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] use std::mem::size_of; use std::os::unix::thread::JoinHandleExt; -use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicI32, Ordering}; use std::sync::{Arc, Barrier, Mutex}; use std::{cmp, io, result, thread}; @@ -92,7 +92,7 @@ use crate::gdb::{Debuggable, DebuggableError, get_raw_tid}; use crate::seccomp_filters::{Thread, get_seccomp_filter}; #[cfg(target_arch = "x86_64")] use crate::vm::physical_bits; -use crate::vm_config::CpusConfig; +use crate::vm_config::{CoreScheduling, CpusConfig}; use crate::{CPU_MANAGER_SNAPSHOT_ID, GuestMemoryMmap}; #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] @@ -220,9 +220,79 @@ pub enum Error { #[cfg(feature = "mshv")] 
#[error("Failed to set partition property")] SetPartitionProperty(#[source] anyhow::Error), + + #[error("Error enabling core scheduling")] + CoreScheduling(#[source] io::Error), } pub type Result = result::Result; +const PR_SCHED_CORE: libc::c_int = 62; +const PR_SCHED_CORE_GET: libc::c_int = 0; +const PR_SCHED_CORE_CREATE: libc::c_int = 1; +const PR_SCHED_CORE_SHARE_FROM: libc::c_int = 3; +const PIDTYPE_PID: libc::c_int = 0; + +/// Create a new unique core scheduling cookie for the current thread. +/// Silently succeeds on kernels that don't support PR_SCHED_CORE. +fn core_scheduling_create() -> Result<()> { + // SAFETY: prctl with PR_SCHED_CORE_CREATE on the current thread (pid=0). + // All arguments are valid constants. We check the return value. + let ret = unsafe { libc::prctl(PR_SCHED_CORE, PR_SCHED_CORE_CREATE, 0, PIDTYPE_PID, 0) }; + if ret == -1 { + let err = io::Error::last_os_error(); + // EINVAL: kernel < 5.14 where PR_SCHED_CORE is unknown. + // ENODEV: CONFIG_SCHED_CORE is enabled but SMT is not present/enabled, + // so core scheduling is not applicable. + // Both mean core scheduling is unavailable; silently ignore. + match err.raw_os_error() { + Some(libc::EINVAL) => { + warn!("Kernel lacks CONFIG_SCHED_CORE support - no SMT isolation"); + } + Some(libc::ENODEV) => {} + _ => return Err(Error::CoreScheduling(err)), + } + } + Ok(()) +} + +/// Copy the core scheduling cookie from the thread identified by `tid` +/// to the current thread, placing both in the same scheduling group. +/// Silently succeeds on kernels that don't support PR_SCHED_CORE. +fn core_scheduling_share_from(tid: i32) -> Result<()> { + // SAFETY: prctl with PR_SCHED_CORE_SHARE_FROM targeting tid. + // All arguments are valid. We check the return value. 
+ let ret = unsafe { libc::prctl(PR_SCHED_CORE, PR_SCHED_CORE_SHARE_FROM, tid, PIDTYPE_PID, 0) }; + if ret == -1 { + let err = io::Error::last_os_error(); + match err.raw_os_error() { + Some(libc::EINVAL) | Some(libc::ENODEV) => {} + _ => return Err(Error::CoreScheduling(err)), + } + } + Ok(()) +} + +/// Read the core scheduling cookie of the current thread. +/// Returns 0 if no cookie is set or the kernel doesn't support PR_SCHED_CORE. +fn core_scheduling_cookie() -> u64 { + let mut cookie: u64 = 0; + // SAFETY: PR_SCHED_CORE_GET with pid=0 reads the current thread's cookie + // into the provided pointer. We pass a valid mutable reference. + let ret = unsafe { + libc::prctl( + PR_SCHED_CORE, + PR_SCHED_CORE_GET, + 0, + PIDTYPE_PID, + &mut cookie as *mut u64, + ) + }; + if ret == -1 { + return 0; + } + cookie +} + #[cfg(target_arch = "x86_64")] #[allow(dead_code)] #[repr(C, packed)] @@ -609,6 +679,9 @@ pub struct CpuManager { hypervisor: Arc, #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, + // TID of the first vCPU thread that created a core scheduling cookie (VM mode). + // 0 = no leader yet, -1 = leader creating cookie, >0 = leader TID (cookie ready). + core_scheduling_group_leader: Arc, } const CPU_ENABLE_FLAG: usize = 0; @@ -851,6 +924,7 @@ impl CpuManager { hypervisor, #[cfg(feature = "sev_snp")] sev_snp_enabled, + core_scheduling_group_leader: Arc::new(AtomicI32::new(0)), }))) } @@ -1079,6 +1153,9 @@ impl CpuManager { cpuset }); + let core_scheduling = self.config.core_scheduling; + let core_scheduling_group_leader = self.core_scheduling_group_leader.clone(); + // Retrieve seccomp filter for vcpu thread let vcpu_seccomp_filter = get_seccomp_filter( &self.seccomp_action, @@ -1117,6 +1194,64 @@ impl CpuManager { } } + // Set up core scheduling before seccomp locks down prctl. 
+ match core_scheduling { + CoreScheduling::Vcpu => { + // Each vCPU gets its own unique cookie + if let Err(e) = core_scheduling_create() { + error!( + "Failed to enable core scheduling for vCPU {vcpu_id}: {e:?}" + ); + return; + } + } + CoreScheduling::Vm => { + // First vCPU creates a cookie; all others share from it. + // SAFETY: gettid() is always safe to call. + let my_tid = unsafe { libc::gettid() }; + if core_scheduling_group_leader + .compare_exchange(0, -1, Ordering::AcqRel, Ordering::Acquire) + .is_ok() + { + // We are the group leader — create the cookie + if let Err(e) = core_scheduling_create() { + error!( + "Failed to create core scheduling cookie: {e:?}" + ); + return; + } + // Signal that the cookie is ready by storing real TID + core_scheduling_group_leader + .store(my_tid, Ordering::Release); + } else { + // Wait for the leader to finish creating the cookie + let mut leader_tid = + core_scheduling_group_leader.load(Ordering::Acquire); + while leader_tid <= 0 { + std::hint::spin_loop(); + leader_tid = + core_scheduling_group_leader.load(Ordering::Acquire); + } + // Copy the leader's cookie to this thread + if let Err(e) = core_scheduling_share_from(leader_tid) { + error!( + "Failed to share core scheduling cookie \ + to vCPU {vcpu_id}: {e:?}" + ); + return; + } + } + } + CoreScheduling::Off => {} + } + + if core_scheduling != CoreScheduling::Off { + info!( + "vCPU {vcpu_id}: core scheduling cookie = {:#x}", + core_scheduling_cookie() + ); + } + // Apply seccomp filter for vcpu thread. 
if !vcpu_seccomp_filter.is_empty() && let Err(e) = apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 627b13d5d7..9ffd7fc0bc 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -2411,8 +2411,8 @@ mod unit_tests { #[cfg(target_arch = "x86_64")] use crate::vm_config::DebugConsoleConfig; use crate::vm_config::{ - ConsoleConfig, ConsoleOutputMode, CpuFeatures, CpusConfig, HotplugMethod, MemoryConfig, - PayloadConfig, RngConfig, + ConsoleConfig, ConsoleOutputMode, CoreScheduling, CpuFeatures, CpusConfig, HotplugMethod, + MemoryConfig, PayloadConfig, RngConfig, }; fn create_dummy_vmm() -> Vmm { @@ -2441,6 +2441,7 @@ mod unit_tests { affinity: None, features: CpuFeatures::default(), nested: true, + core_scheduling: CoreScheduling::default(), }, memory: MemoryConfig { size: 536_870_912, diff --git a/vmm/src/vm_config.rs b/vmm/src/vm_config.rs index b9e67f7bba..33c2b23acd 100644 --- a/vmm/src/vm_config.rs +++ b/vmm/src/vm_config.rs @@ -39,6 +39,14 @@ pub struct CpuFeatures { pub amx: bool, } +#[derive(Clone, Copy, Debug, PartialEq, Eq, Deserialize, Serialize, Default)] +pub enum CoreScheduling { + #[default] + Vm, // All vCPUs have the same cookie so can share a core + Vcpu, // Each vCPU has a unique cookie so can't share a core + Off, +} + #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] pub struct CpuTopology { pub threads_per_core: u16, @@ -72,6 +80,8 @@ pub struct CpusConfig { pub features: CpuFeatures, #[serde(default = "default_cpusconfig_nested")] pub nested: bool, + #[serde(default)] + pub core_scheduling: CoreScheduling, } pub const DEFAULT_VCPUS: u32 = 1; @@ -87,6 +97,7 @@ impl Default for CpusConfig { affinity: None, features: CpuFeatures::default(), nested: true, + core_scheduling: CoreScheduling::default(), } } } From 272fa624ef827e31c7fb0d373801a3f213f61089 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Thu, 26 Feb 2026 04:00:59 -0800 Subject: [PATCH 025/742] ci: Disable 
RISC-V workflows temporarily The workflows are very flaky and have been failing the majority of the time recently. Fixes: #7758 Signed-off-by: Rob Bradford --- .github/workflows/preview-riscv64-build.yaml | 30 -------------- .../workflows/preview-riscv64-modules.yaml | 39 ------------------- 2 files changed, 69 deletions(-) delete mode 100644 .github/workflows/preview-riscv64-build.yaml delete mode 100644 .github/workflows/preview-riscv64-modules.yaml diff --git a/.github/workflows/preview-riscv64-build.yaml b/.github/workflows/preview-riscv64-build.yaml deleted file mode 100644 index ad87232d3d..0000000000 --- a/.github/workflows/preview-riscv64-build.yaml +++ /dev/null @@ -1,30 +0,0 @@ -name: Cloud Hypervisor RISC-V 64-bit kvm build Preview -on: [pull_request, merge_group] -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event_name }} - cancel-in-progress: true - -jobs: - build: - name: Cargo - runs-on: riscv64-qemu-host - strategy: - fail-fast: false - - steps: - - name: Code checkout - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - - name: Install Rust toolchain - run: /opt/scripts/exec-in-qemu.sh rustup default 1.89.0 - - - name: Build test (kvm) - run: /opt/scripts/exec-in-qemu.sh cargo build --locked --no-default-features --features "kvm" -p cloud-hypervisor - - - name: Clippy test (kvm) - run: /opt/scripts/exec-in-qemu.sh cargo clippy --locked --no-default-features --features "kvm" -p cloud-hypervisor - - - name: Check no files were modified - run: test -z "$(git status --porcelain)" diff --git a/.github/workflows/preview-riscv64-modules.yaml b/.github/workflows/preview-riscv64-modules.yaml deleted file mode 100644 index 1b7ac6ed16..0000000000 --- a/.github/workflows/preview-riscv64-modules.yaml +++ /dev/null @@ -1,39 +0,0 @@ -name: Cloud Hypervisor RISC-V 64-bit Preview -on: [pull_request, merge_group] -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || 
github.ref }}-${{ github.event_name }} - cancel-in-progress: true - -jobs: - build: - name: Cargo - runs-on: riscv64-qemu-host - strategy: - fail-fast: false - matrix: - module: - - hypervisor - - arch - - vm-allocator - - devices - - steps: - - name: Code checkout - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - - name: Install Rust toolchain - run: /opt/scripts/exec-in-qemu.sh rustup default 1.89.0 - - - name: Build ${{ matrix.module }} Module (kvm) - run: /opt/scripts/exec-in-qemu.sh cargo build --locked -p ${{ matrix.module }} --no-default-features --features "kvm" - - - name: Clippy ${{ matrix.module }} Module (kvm) - run: /opt/scripts/exec-in-qemu.sh cargo clippy --locked -p ${{ matrix.module }} --no-default-features --features "kvm" -- -D warnings - - - name: Test ${{ matrix.module }} Module (kvm) - run: /opt/scripts/exec-in-qemu.sh cargo test --locked -p ${{ matrix.module }} --no-default-features --features "kvm" - - - name: Check no files were modified - run: test -z "$(git status --porcelain)" From 00c05f4761ae106f94269b5b6255dbd6ca734f9a Mon Sep 17 00:00:00 2001 From: Saravanan D Date: Wed, 18 Feb 2026 14:23:03 -0800 Subject: [PATCH 026/742] block: Use logical block size for alignment O_DIRECT requires buffer addresses to be aligned to the backend device's logical block size. The existing bounce buffer logic in execute_async() hardcodes SECTOR_SIZE (512) for the alignment check and bounce buffer allocation. This is insufficient for devices with a 4096-byte logical block size, where misaligned buffers cause -EINVAL from the host kernel. Add an alignment() method to the AsyncIo trait that returns the backend's logical block size, defaulting to SECTOR_SIZE. The three raw I/O backends (io_uring, AIO, synchronous) probe the device topology via DiskTopology::probe() at creation time and return the actual logical block size. All image format backends would simply use the default value of 512 bytes since their underlying are not block devices. 
execute_async() now queries disk_image.alignment() instead of using the hardcoded SECTOR_SIZE Fixes: #7720 Signed-off-by: Saravanan D --- block/src/async_io.rs | 5 ++++- block/src/lib.rs | 14 ++++++++------ block/src/raw_async.rs | 18 +++++++++++++----- block/src/raw_async_aio.rs | 24 ++++++++++++++++++------ block/src/raw_sync.rs | 14 ++++++++++++-- 5 files changed, 55 insertions(+), 20 deletions(-) diff --git a/block/src/async_io.rs b/block/src/async_io.rs index a1e8fa3e46..fe3349e6a3 100644 --- a/block/src/async_io.rs +++ b/block/src/async_io.rs @@ -8,7 +8,7 @@ use std::os::fd::{AsRawFd, OwnedFd, RawFd}; use thiserror::Error; use vmm_sys_util::eventfd::EventFd; -use crate::{BatchRequest, DiskTopology}; +use crate::{BatchRequest, DiskTopology, SECTOR_SIZE}; #[derive(Error, Debug)] pub enum DiskFileError { @@ -145,4 +145,7 @@ pub trait AsyncIo: Send { fn submit_batch_requests(&mut self, _batch_request: &[BatchRequest]) -> AsyncIoResult<()> { Ok(()) } + fn alignment(&self) -> u64 { + SECTOR_SIZE + } } diff --git a/block/src/lib.rs b/block/src/lib.rs index 3d45473a9a..3ab8de9e41 100644 --- a/block/src/lib.rs +++ b/block/src/lib.rs @@ -436,6 +436,7 @@ impl Request { let sector = self.sector; let request_type = self.request_type; let offset = (sector << SECTOR_SHIFT) as libc::off_t; + let alignment = disk_image.alignment(); let mut iovecs: SmallVec<[libc::iovec; DEFAULT_DESCRIPTOR_VEC_SIZE]> = SmallVec::with_capacity(self.data_descriptors.len()); @@ -466,14 +467,15 @@ impl Request { assert!(origin_ptr.len() >= data_len); let origin_ptr = origin_ptr.ptr_guard(); - // Verify the buffer alignment. - // In case it's not properly aligned, an intermediate buffer is - // created with the correct alignment, and a copy from/to the - // origin buffer is performed, depending on the type of operation. - let iov_base = if (origin_ptr.as_ptr() as u64).is_multiple_of(SECTOR_SIZE) { + // O_DIRECT requires buffer addresses to be aligned to the + // backend device's logical block size. 
In case it's not properly + // aligned, an intermediate buffer is created with the correct + // alignment, and a copy from/to the origin buffer is performed, + // depending on the type of operation. + let iov_base = if (origin_ptr.as_ptr() as u64).is_multiple_of(alignment) { origin_ptr.as_ptr() as *mut libc::c_void } else { - let layout = Layout::from_size_align(data_len, SECTOR_SIZE as usize).unwrap(); + let layout = Layout::from_size_align(data_len, alignment as usize).unwrap(); // SAFETY: layout has non-zero size let aligned_ptr = unsafe { alloc_zeroed(layout) }; if aligned_ptr.is_null() { diff --git a/block/src/raw_async.rs b/block/src/raw_async.rs index 539aaa9095..3a890d716f 100644 --- a/block/src/raw_async.rs +++ b/block/src/raw_async.rs @@ -13,7 +13,7 @@ use vmm_sys_util::eventfd::EventFd; use crate::async_io::{ AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFile, DiskFileError, DiskFileResult, }; -use crate::{BatchRequest, DiskTopology, RequestType, probe_sparse_support}; +use crate::{BatchRequest, DiskTopology, RequestType, SECTOR_SIZE, probe_sparse_support}; pub struct RawFileDisk { file: File, @@ -40,10 +40,12 @@ impl DiskFile for RawFileDisk { } fn new_async_io(&self, ring_depth: u32) -> DiskFileResult> { - Ok(Box::new( - RawFileAsync::new(self.file.as_raw_fd(), ring_depth) - .map_err(DiskFileError::NewAsyncIo)?, - ) as Box) + let mut raw = RawFileAsync::new(self.file.as_raw_fd(), ring_depth) + .map_err(DiskFileError::NewAsyncIo)?; + raw.alignment = DiskTopology::probe(&self.file) + .map(|t| t.logical_block_size) + .unwrap_or(SECTOR_SIZE); + Ok(Box::new(raw) as Box) } fn topology(&mut self) -> DiskTopology { @@ -72,6 +74,7 @@ pub struct RawFileAsync { fd: RawFd, io_uring: IoUring, eventfd: EventFd, + alignment: u64, } impl RawFileAsync { @@ -87,6 +90,7 @@ impl RawFileAsync { fd, io_uring, eventfd, + alignment: SECTOR_SIZE, }) } } @@ -96,6 +100,10 @@ impl AsyncIo for RawFileAsync { &self.eventfd } + fn alignment(&self) -> u64 { + 
self.alignment + } + fn read_vectored( &mut self, offset: libc::off_t, diff --git a/block/src/raw_async_aio.rs b/block/src/raw_async_aio.rs index 6447a727d8..7266a3633a 100644 --- a/block/src/raw_async_aio.rs +++ b/block/src/raw_async_aio.rs @@ -16,7 +16,7 @@ use vmm_sys_util::eventfd::EventFd; use crate::async_io::{ AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFile, DiskFileError, DiskFileResult, }; -use crate::{DiskTopology, probe_sparse_support}; +use crate::{DiskTopology, SECTOR_SIZE, probe_sparse_support}; pub struct RawFileDiskAio { file: File, @@ -43,10 +43,12 @@ impl DiskFile for RawFileDiskAio { } fn new_async_io(&self, ring_depth: u32) -> DiskFileResult> { - Ok(Box::new( - RawFileAsyncAio::new(self.file.as_raw_fd(), ring_depth) - .map_err(DiskFileError::NewAsyncIo)?, - ) as Box) + let mut raw = RawFileAsyncAio::new(self.file.as_raw_fd(), ring_depth) + .map_err(DiskFileError::NewAsyncIo)?; + raw.alignment = DiskTopology::probe(&self.file) + .map(|t| t.logical_block_size) + .unwrap_or(SECTOR_SIZE); + Ok(Box::new(raw) as Box) } fn topology(&mut self) -> DiskTopology { @@ -71,6 +73,7 @@ pub struct RawFileAsyncAio { fd: RawFd, ctx: aio::IoContext, eventfd: EventFd, + alignment: u64, } impl RawFileAsyncAio { @@ -78,7 +81,12 @@ impl RawFileAsyncAio { let eventfd = EventFd::new(libc::EFD_NONBLOCK)?; let ctx = aio::IoContext::new(queue_depth)?; - Ok(RawFileAsyncAio { fd, ctx, eventfd }) + Ok(RawFileAsyncAio { + fd, + ctx, + eventfd, + alignment: SECTOR_SIZE, + }) } } @@ -87,6 +95,10 @@ impl AsyncIo for RawFileAsyncAio { &self.eventfd } + fn alignment(&self) -> u64 { + self.alignment + } + fn read_vectored( &mut self, offset: libc::off_t, diff --git a/block/src/raw_sync.rs b/block/src/raw_sync.rs index e1a5433b89..9c96863b69 100644 --- a/block/src/raw_sync.rs +++ b/block/src/raw_sync.rs @@ -13,7 +13,7 @@ use vmm_sys_util::eventfd::EventFd; use crate::async_io::{ AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFile, DiskFileError, 
DiskFileResult, }; -use crate::{DiskTopology, probe_sparse_support}; +use crate::{DiskTopology, SECTOR_SIZE, probe_sparse_support}; pub struct RawFileDiskSync { file: File, @@ -40,7 +40,11 @@ impl DiskFile for RawFileDiskSync { } fn new_async_io(&self, _ring_depth: u32) -> DiskFileResult> { - Ok(Box::new(RawFileSync::new(self.file.as_raw_fd())) as Box) + let mut raw = RawFileSync::new(self.file.as_raw_fd()); + raw.alignment = DiskTopology::probe(&self.file) + .map(|t| t.logical_block_size) + .unwrap_or(SECTOR_SIZE); + Ok(Box::new(raw) as Box) } fn topology(&mut self) -> DiskTopology { @@ -65,6 +69,7 @@ pub struct RawFileSync { fd: RawFd, eventfd: EventFd, completion_list: VecDeque<(u64, i32)>, + alignment: u64, } impl RawFileSync { @@ -73,6 +78,7 @@ impl RawFileSync { fd, eventfd: EventFd::new(libc::EFD_NONBLOCK).expect("Failed creating EventFd for RawFile"), completion_list: VecDeque::new(), + alignment: SECTOR_SIZE, } } } @@ -82,6 +88,10 @@ impl AsyncIo for RawFileSync { &self.eventfd } + fn alignment(&self) -> u64 { + self.alignment + } + fn read_vectored( &mut self, offset: libc::off_t, From 110192087ec5989d97f57ce3232366c135289ea2 Mon Sep 17 00:00:00 2001 From: Victor Vieux Date: Thu, 26 Feb 2026 11:53:49 -0800 Subject: [PATCH 027/742] vmm: config: Fix missing comma in NetConfig help text The SYNTAX help string for --net was missing a comma between pci_segment and offload_tso parameters, making the help output show them as a single run-on token. 
Signed-off-by: Victor Vieux --- vmm/src/config.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vmm/src/config.rs b/vmm/src/config.rs index 3b32463674..46f3443136 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -1414,7 +1414,7 @@ impl NetConfig { num_queues=,queue_size=,id=,\ vhost_user=,socket=,vhost_mode=client|server,\ bw_size=,bw_one_time_burst=,bw_refill_time=,\ - ops_size=,ops_one_time_burst=,ops_refill_time=,pci_segment=\ + ops_size=,ops_one_time_burst=,ops_refill_time=,pci_segment=,\ offload_tso=on|off,offload_ufo=on|off,offload_csum=on|off\""; pub fn parse(net: &str) -> Result { From 9fd9c244194f73644a590dc42160da121155bb3a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 26 Feb 2026 23:54:04 +0000 Subject: [PATCH 028/742] build: Bump actions/upload-artifact from 6 to 7 Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 6 to 7. - [Release notes](https://github.com/actions/upload-artifact/releases) - [Commits](https://github.com/actions/upload-artifact/compare/v6...v7) --- updated-dependencies: - dependency-name: actions/upload-artifact dependency-version: '7' dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] --- .github/workflows/release.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 6a96de491b..a6e8defba4 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -54,7 +54,7 @@ jobs: cp target/${{ matrix.platform.target }}/release/ch-remote ./${{ matrix.platform.name_ch_remote }} - name: Upload Release Artifacts if: github.event_name == 'create' && github.event.ref_type == 'tag' - uses: actions/upload-artifact@v6 + uses: actions/upload-artifact@v7 with: name: Artifacts for ${{ matrix.platform.target }} path: | @@ -80,7 +80,7 @@ jobs: github.event_name == 'create' && github.event.ref_type == 'tag' && matrix.platform.target == 'x86_64-unknown-linux-gnu' id: upload-release-cloud-hypervisor-vendored-sources - uses: actions/upload-artifact@v6 + uses: actions/upload-artifact@v7 with: path: cloud-hypervisor-${{ github.event.ref }}.tar.xz name: cloud-hypervisor-${{ github.event.ref }}.tar.xz From fdc51d923f3a389202a625a288bccbb7cd3ded9b Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sun, 22 Feb 2026 22:06:51 -0800 Subject: [PATCH 029/742] tests: refactor event sequencing expectations for simple launch Move MetaEvent from the integration test into shared test infrastructure and expose it for reuse. Add a Guest helper that returns the expected sequential events for simple launch, and update the integration test to consume this helper instead of maintaining a local event list. Adjust expected behavior for confidential VMs by omitting the disk reset event, which is not guaranteed to be emitted in that mode. Preserve the existing expected sequence for non-confidential VMs. 
Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 50 ++----------------------- test_infra/src/lib.rs | 53 +++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 46 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 99a22f2730..c836f631d4 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -889,28 +889,6 @@ fn fw_path(_fw_type: FwType) -> String { fw_path.to_str().unwrap().to_string() } -#[derive(Debug)] -struct MetaEvent { - event: String, - device_id: Option, -} - -impl MetaEvent { - pub fn match_with_json_event(&self, v: &serde_json::Value) -> bool { - let mut matched = false; - if v["event"].as_str().unwrap() == self.event { - if let Some(device_id) = &self.device_id { - if v["properties"]["id"].as_str().unwrap() == device_id { - matched = true; - } - } else { - matched = true; - } - } - matched - } -} - // Parse the event_monitor file based on the format that each event // is followed by a double newline fn parse_event_file(event_file: &str) -> Vec { @@ -2584,31 +2562,11 @@ fn _test_simple_launch(guest: &Guest) { assert_eq!(guest.get_cpu_count().unwrap_or_default(), 1); assert!(guest.get_total_memory().unwrap_or_default() > 480_000); assert_eq!(guest.get_pci_bridge_class().unwrap_or_default(), "0x060000"); - - let expected_sequential_events = [ - &MetaEvent { - event: "starting".to_string(), - device_id: None, - }, - &MetaEvent { - event: "booting".to_string(), - device_id: None, - }, - &MetaEvent { - event: "booted".to_string(), - device_id: None, - }, - &MetaEvent { - event: "activated".to_string(), - device_id: Some("_disk0".to_string()), - }, - &MetaEvent { - event: "reset".to_string(), - device_id: Some("_disk0".to_string()), - }, - ]; assert!(check_sequential_events( - &expected_sequential_events, + &guest + .get_expected_seq_events_for_simple_launch() + .iter() + .collect::>(), &event_path )); diff --git 
a/test_infra/src/lib.rs b/test_infra/src/lib.rs index 2848a130e5..cab4000828 100644 --- a/test_infra/src/lib.rs +++ b/test_infra/src/lib.rs @@ -883,6 +883,28 @@ pub fn kill_child(child: &mut Child) { } } +#[derive(Debug)] +pub struct MetaEvent { + pub event: String, + pub device_id: Option, +} + +impl MetaEvent { + pub fn match_with_json_event(&self, v: &serde_json::Value) -> bool { + let mut matched = false; + if v["event"].as_str().unwrap() == self.event { + if let Some(device_id) = &self.device_id { + if v["properties"]["id"].as_str().unwrap() == device_id { + matched = true; + } + } else { + matched = true; + } + } + matched + } +} + pub const PIPE_SIZE: i32 = 32 << 20; pub struct Guest { @@ -1320,6 +1342,37 @@ impl Guest { assert_eq!(self.ssh_command("sudo umount /mnt").unwrap(), ""); } } + + pub fn get_expected_seq_events_for_simple_launch(&self) -> Vec { + let mut out_evt = vec![ + MetaEvent { + event: "starting".to_string(), + device_id: None, + }, + MetaEvent { + event: "booting".to_string(), + device_id: None, + }, + MetaEvent { + event: "booted".to_string(), + device_id: None, + }, + MetaEvent { + event: "activated".to_string(), + device_id: Some("_disk0".to_string()), + }, + ]; + // For confidential VM, reset of the device does not trigger a VMM exit, or + // It is handled in the PSP + // so we won't receive the "reset" event for disk0. + if self.vm_type != GuestVmType::Confidential { + out_evt.push(MetaEvent { + event: "reset".to_string(), + device_id: Some("_disk0".to_string()), + }); + } + out_evt + } } #[derive(Default)] From 60c6242bde142ae2c8e88bfa9f14ebdbff9e78c4 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sun, 22 Feb 2026 22:13:57 -0800 Subject: [PATCH 030/742] tests: centralize default CPU arguments in test infrastructure Replace the hard-coded .args(["--cpus", "boot=1"]) in the simple launch integration test with a shared helper (default_cpus) from test infrastructure. 
Extend Guest with explicit CPU-related defaults (num_cpu, nested) and add default_cpus_string() so CPU configuration is derived from guest state instead of being duplicated at call sites. This refactor improves consistency and makes CPU defaults easier to maintain across integration tests. Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 2 +- test_infra/src/lib.rs | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index c836f631d4..9838e14972 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -2545,7 +2545,7 @@ fn _test_simple_launch(guest: &Guest) { let event_path = temp_event_monitor_path(&guest.tmp_dir); let mut child = GuestCommand::new(guest) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", "size=512M"]) .default_kernel_cmdline() .default_disks() diff --git a/test_infra/src/lib.rs b/test_infra/src/lib.rs index cab4000828..f45ed3e700 100644 --- a/test_infra/src/lib.rs +++ b/test_infra/src/lib.rs @@ -916,6 +916,8 @@ pub struct Guest { pub kernel_path: Option, pub kernel_cmdline: Option, pub console_type: Option, + pub num_cpu: u32, + pub nested: bool, } // Return the next id that can be used for this guest. 
This is stored in a @@ -985,6 +987,8 @@ impl Guest { kernel_path: None, kernel_cmdline: None, console_type: None, + num_cpu: 1u32, + nested: true, } } @@ -1373,6 +1377,14 @@ impl Guest { } out_evt } + + pub fn default_cpus_string(&self) -> String { + format!( + "boot={}{}", + self.num_cpu, + if self.nested { "" } else { ",nested=off" } + ) + } } #[derive(Default)] @@ -1555,6 +1567,10 @@ impl<'a> GuestCommand<'a> { self } + + pub fn default_cpus(&mut self) -> &mut Self { + self.args(["--cpus", self.guest.default_cpus_string().as_str()]) + } } /// Returns the absolute path into the workspaces target directory to locate the desired From 45a5c7a04e38891211f3fd9467d888e0d42dc499 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sun, 22 Feb 2026 22:17:48 -0800 Subject: [PATCH 031/742] tests: validate CPU count in the test infra Instead of validating number of CPU in the test case itself, moving the checking of the CPU count to Guest struct with a new function as The Guest already has the Default CPU number. 
Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 2 +- test_infra/src/lib.rs | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 9838e14972..6b18591730 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -2559,7 +2559,7 @@ fn _test_simple_launch(guest: &Guest) { let r = std::panic::catch_unwind(|| { guest.wait_vm_boot().unwrap(); - assert_eq!(guest.get_cpu_count().unwrap_or_default(), 1); + guest.validate_cpu_count(None); assert!(guest.get_total_memory().unwrap_or_default() > 480_000); assert_eq!(guest.get_pci_bridge_class().unwrap_or_default(), "0x060000"); assert!(check_sequential_events( diff --git a/test_infra/src/lib.rs b/test_infra/src/lib.rs index f45ed3e700..d5349b43e0 100644 --- a/test_infra/src/lib.rs +++ b/test_infra/src/lib.rs @@ -1385,6 +1385,14 @@ impl Guest { if self.nested { "" } else { ",nested=off" } ) } + + pub fn validate_cpu_count(&self, expected_cpu_count: Option) { + let cpu = match expected_cpu_count { + Some(count) => count, + None => self.num_cpu, + }; + assert_eq!(self.get_cpu_count().unwrap_or_default(), cpu); + } } #[derive(Default)] From 05aeef06e504dbc3fc063c5abc652329c194f014 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sun, 22 Feb 2026 22:25:32 -0800 Subject: [PATCH 032/742] tests: centralize default memory args in helpers Replace hard-coded --memory args in simple launch tests with GuestCommand defaults driven by Guest state. Add Guest.mem_size_str with a default of 512M and introduce default_memory_string() and GuestCommand::default_memory(). 
Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 2 +- test_infra/src/lib.rs | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 6b18591730..7250e8d9f1 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -2546,7 +2546,7 @@ fn _test_simple_launch(guest: &Guest) { let mut child = GuestCommand::new(guest) .default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .default_kernel_cmdline() .default_disks() .default_net() diff --git a/test_infra/src/lib.rs b/test_infra/src/lib.rs index d5349b43e0..0293249ff1 100644 --- a/test_infra/src/lib.rs +++ b/test_infra/src/lib.rs @@ -918,6 +918,7 @@ pub struct Guest { pub console_type: Option, pub num_cpu: u32, pub nested: bool, + pub mem_size_str: String, } // Return the next id that can be used for this guest. This is stored in a @@ -989,6 +990,7 @@ impl Guest { console_type: None, num_cpu: 1u32, nested: true, + mem_size_str: "512M".to_string(), } } @@ -1386,6 +1388,10 @@ impl Guest { ) } + pub fn default_memory_string(&self) -> String { + format!("size={}", self.mem_size_str) + } + pub fn validate_cpu_count(&self, expected_cpu_count: Option) { let cpu = match expected_cpu_count { Some(count) => count, @@ -1579,6 +1585,10 @@ impl<'a> GuestCommand<'a> { pub fn default_cpus(&mut self) -> &mut Self { self.args(["--cpus", self.guest.default_cpus_string().as_str()]) } + + pub fn default_memory(&mut self) -> &mut Self { + self.args(["--memory", self.guest.default_memory_string().as_str()]) + } } /// Returns the absolute path into the workspaces target directory to locate the desired From 32edcf39a648636ebec20dea82d347c05dbc4054 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sun, 22 Feb 2026 22:39:10 -0800 Subject: [PATCH 033/742] tests: centralize memory validation in test helpers Replace the hard-coded memory threshold check in the simple 
launch integration test with Guest::validate_memory(None). Add Guest::get_expected_memory() to derive thresholds from mem_size_str and vm_type, and reuse this through validate_memory(). Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 2 +- test_infra/src/lib.rs | 34 +++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 7250e8d9f1..804784507b 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -2560,7 +2560,7 @@ fn _test_simple_launch(guest: &Guest) { guest.wait_vm_boot().unwrap(); guest.validate_cpu_count(None); - assert!(guest.get_total_memory().unwrap_or_default() > 480_000); + guest.validate_memory(None); assert_eq!(guest.get_pci_bridge_class().unwrap_or_default(), "0x060000"); assert!(check_sequential_events( &guest diff --git a/test_infra/src/lib.rs b/test_infra/src/lib.rs index 0293249ff1..60bf4ece03 100644 --- a/test_infra/src/lib.rs +++ b/test_infra/src/lib.rs @@ -1399,6 +1399,40 @@ impl Guest { }; assert_eq!(self.get_cpu_count().unwrap_or_default(), cpu); } + + fn get_expected_memory(&self) -> Option { + // For confidential VMs, the memory available to the guest is less than + // the memory assigned to the VM, as some of it is reserved for the PSP + // and bounce buffers. + // So we return the expected available memory for confidential VMs here. 
+ let memory = match self.mem_size_str.as_str() { + "512M" => { + if self.vm_type == GuestVmType::Confidential { + 407_000 + } else { + 480_000 + } + } + "1G" => { + if self.vm_type == GuestVmType::Confidential { + 920_000 + } else { + 960_000 + } + } + // More to be added if more memory sizes are used in the tests + _ => panic!("Unsupported memory size: {}", self.mem_size_str), + }; + Some(memory) + } + + pub fn validate_memory(&self, expected_memory: Option) { + let memory = expected_memory + .or_else(|| self.get_expected_memory()) + .unwrap_or_default(); + + assert!(self.get_total_memory().unwrap_or_default() > memory); + } } #[derive(Default)] From af764235a093ac237ced21be74807a36829095d3 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sun, 22 Feb 2026 22:40:48 -0800 Subject: [PATCH 034/742] tests: nested not supported for CVM on MSHV Nested on MSHV confident VM not supported yet. Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 804784507b..c07ae25faa 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -14441,6 +14441,7 @@ mod common_cvm { let mut guest = Guest::new(Box::new(disk_config)); guest.vm_type = GuestVmType::Confidential; guest.boot_timeout = DEFAULT_CVM_TCP_LISTENER_TIMEOUT; + guest.nested = false; _test_simple_launch(&guest) } } From c35bfbd79f5740a9d8f842633c67b931e61d6a84 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sun, 22 Feb 2026 23:14:28 -0800 Subject: [PATCH 035/742] tests: use default_cpus() across integration tests Replace hard-coded --cpus boot= arguments in integration tests with GuestCommand::default_cpus() for shared, centralized defaults. This removes duplicated CLI fragments and keeps CPU setup consistent. 
Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 112 +++++++++++++------------- 1 file changed, 56 insertions(+), 56 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index c07ae25faa..e5349f3440 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -1174,7 +1174,7 @@ fn _test_power_button(acpi: bool) { direct_kernel_boot_path() }; - cmd.args(["--cpus", "boot=1"]) + cmd.default_cpus() .args(["--memory", "size=512M"]) .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -1635,7 +1635,7 @@ fn _test_virtio_fs( let mut guest_command = GuestCommand::new(&guest); guest_command - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", "size=512M,hotplug_size=2048M,shared=on"]) .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -1837,7 +1837,7 @@ fn test_virtio_pmem(discard_writes: bool, specify_size: bool) { .expect("Expect creating disk image to succeed"); let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", "size=512M"]) .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -1918,7 +1918,7 @@ fn _test_virtio_vsock(hotplug: bool) { let mut cmd = GuestCommand::new(&guest); cmd.args(["--api-socket", &api_socket]); - cmd.args(["--cpus", "boot=1"]); + cmd.default_cpus(); cmd.args(["--memory", "size=512M"]); cmd.args(["--kernel", kernel_path.to_str().unwrap()]); cmd.args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]); @@ -1992,7 +1992,7 @@ fn test_memory_mergeable(mergeable: bool) { let disk_config1 = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); let guest1 = Guest::new(Box::new(disk_config1)); let mut child1 = GuestCommand::new(&guest1) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", 
format!("size=512M,{memory_param}").as_str()]) .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -2018,7 +2018,7 @@ fn test_memory_mergeable(mergeable: bool) { let disk_config2 = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); let guest2 = Guest::new(Box::new(disk_config2)); let mut child2 = GuestCommand::new(&guest2) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", format!("size=512M,{memory_param}").as_str()]) .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -2281,7 +2281,7 @@ fn _test_virtio_iommu(acpi: bool) { }; let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", "size=512M"]) .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -2865,7 +2865,7 @@ mod common_parallel { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); let guest = Guest::new(Box::new(disk_config)); let mut cmd = GuestCommand::new(&guest); - cmd.args(["--cpus", "boot=1"]) + cmd.default_cpus() .args(["--memory", "size=128G"]) .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -2902,7 +2902,7 @@ mod common_parallel { let kernel_path = direct_kernel_boot_path(); let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", "size=0,hotplug_method=virtio-mem"]) .args([ "--memory-zone", @@ -2990,7 +2990,7 @@ mod common_parallel { let api_socket = temp_api_path(&guest.tmp_dir); let mut cmd = GuestCommand::new(&guest); - cmd.args(["--cpus", "boot=1"]) + cmd.default_cpus() .args(["--api-socket", &api_socket]) .args(["--memory", "size=512M"]) .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) @@ -3051,7 +3051,7 @@ mod common_parallel { let disk_config = 
UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); let guest = Guest::new(Box::new(disk_config)); let mut cmd = GuestCommand::new(&guest); - cmd.args(["--cpus", "boot=1"]) + cmd.default_cpus() .args(["--memory", "size=512M"]) .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -3088,7 +3088,7 @@ mod common_parallel { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); let guest = Guest::new(Box::new(disk_config)); let mut cmd = GuestCommand::new(&guest); - cmd.args(["--cpus", "boot=1"]) + cmd.default_cpus() .args(["--memory", "size=512M"]) .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -3150,7 +3150,7 @@ mod common_parallel { assert!(exec_host_command_status(format!("mkfs.ext4 {test_disk_path}").as_str()).success()); let mut cmd = GuestCommand::new(&guest); - cmd.args(["--cpus", "boot=1"]) + cmd.default_cpus() .args(["--memory", "size=512M"]) .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -3321,7 +3321,7 @@ mod common_parallel { let kernel_path = direct_kernel_boot_path(); let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", "size=512M"]) .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -3367,7 +3367,7 @@ mod common_parallel { kernel_path.push("bzImage-x86_64"); let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", "size=512M"]) .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -4456,7 +4456,7 @@ mod common_parallel { ); let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", "size=512M"]) .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", 
DIRECT_KERNEL_BOOT_CMDLINE]) @@ -4519,7 +4519,7 @@ mod common_parallel { ); let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", "size=512M"]) .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -4586,7 +4586,7 @@ mod common_parallel { ); let child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", "size=512M"]) .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -4636,7 +4636,7 @@ mod common_parallel { ); let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", "size=512M"]) .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -4779,7 +4779,7 @@ mod common_parallel { assert_eq!(vhdx_image_size(vhdx_path), EMPTY_VHDX_FILE_SIZE); let mut cloud_child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", "size=512M"]) .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -4860,7 +4860,7 @@ mod common_parallel { .expect("copying of OS disk failed"); let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", "size=512M"]) .args(["--kernel", fw_path(FwType::RustHypervisorFirmware).as_str()]) .args([ @@ -4970,7 +4970,7 @@ mod common_parallel { let guest = Guest::new(Box::new(disk_config)); let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", "size=512M"]) .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -5016,7 +5016,7 @@ mod common_parallel { let guest = Guest::new(Box::new(disk_config)); let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", "size=512M"]) 
.args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -5052,7 +5052,7 @@ mod common_parallel { let guest = Guest::new(Box::new(disk_config)); let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", "size=512M"]) .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -5093,7 +5093,7 @@ mod common_parallel { let oem_strings = format!("oem_strings=[{s1},{s2}]"); let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", "size=512M"]) .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -5196,7 +5196,7 @@ mod common_parallel { let kernel_path = direct_kernel_boot_path(); let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", "size=512M"]) .args(["--kernel", kernel_path.to_str().unwrap()]) .args([ @@ -5251,7 +5251,7 @@ mod common_parallel { let kernel_path = direct_kernel_boot_path(); let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", "size=512M"]) .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -5296,7 +5296,7 @@ mod common_parallel { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); let guest = Guest::new(Box::new(disk_config)); let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", "size=512M"]) .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -5332,7 +5332,7 @@ mod common_parallel { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); let guest = Guest::new(Box::new(disk_config)); let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) + 
.default_cpus() .args(["--memory", "size=512M"]) .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -5374,7 +5374,7 @@ mod common_parallel { #[cfg(target_arch = "aarch64")] let console_str: &str = "console=ttyAMA0"; - cmd.args(["--cpus", "boot=1"]) + cmd.default_cpus() .args(["--memory", "size=512M"]) .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args([ @@ -5430,7 +5430,7 @@ mod common_parallel { let console_str: &str = "console=ttyAMA0"; let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", "size=512M"]) .args(["--kernel", kernel_path.to_str().unwrap()]) .args([ @@ -5488,7 +5488,7 @@ mod common_parallel { let console_str: &str = "console=ttyAMA0"; let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", "size=512M"]) .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args([ @@ -5557,7 +5557,7 @@ mod common_parallel { let cmdline = DIRECT_KERNEL_BOOT_CMDLINE.to_owned() + serial_option; let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", "size=512M"]) .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", &cmdline]) @@ -5604,7 +5604,7 @@ mod common_parallel { let cmdline = DIRECT_KERNEL_BOOT_CMDLINE.to_owned() + serial_option; let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", "size=512M"]) .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", &cmdline]) @@ -5668,7 +5668,7 @@ mod common_parallel { let kernel_path = direct_kernel_boot_path(); let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", "size=512M"]) .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -5713,7 +5713,7 @@ mod 
common_parallel { let console_path = guest.tmp_dir.as_path().join("console-output"); let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", "size=512M"]) .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -6010,7 +6010,7 @@ mod common_parallel { let kernel_path = direct_kernel_boot_path(); let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", "size=512M"]) .args(["--kernel", kernel_path.to_str().unwrap()]) .args([ @@ -6107,7 +6107,7 @@ mod common_parallel { let kernel_path = edk2_path(); let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", "size=512M"]) .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -6519,7 +6519,7 @@ mod common_parallel { let guest_memory_size_kb = 512 * 1024; let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", format!("size={guest_memory_size_kb}K").as_str()]) .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -6561,7 +6561,7 @@ mod common_parallel { let mut child = GuestCommand::new(&guest) .args(["--api-socket", &api_socket]) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", "size=512M"]) .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -6635,7 +6635,7 @@ mod common_parallel { } cmd.args(["--api-socket", &api_socket]) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", "size=512M"]) .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -6827,7 +6827,7 @@ mod common_parallel { let mut cmd = GuestCommand::new(&guest); cmd.args(["--api-socket", &api_socket]) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", 
"size=512M"]) .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -6942,7 +6942,7 @@ mod common_parallel { let mut cmd = GuestCommand::new(&guest); cmd.args(["--api-socket", &api_socket]) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", "size=512M"]) .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -7185,7 +7185,7 @@ mod common_parallel { let loop_dev = create_loop_device(test_disk_path.to_str().unwrap(), 4096, 5); let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", "size=512M"]) .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -8126,7 +8126,7 @@ mod common_parallel { ); let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", "size=512M"]) .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -8342,7 +8342,7 @@ mod common_parallel { //Let's start a 4G guest with balloon occupied 2G memory let mut child = GuestCommand::new(&guest) .args(["--api-socket", &api_socket]) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", "size=4G"]) .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -8396,7 +8396,7 @@ mod common_parallel { //Let's start a 4G guest with balloon occupied 2G memory let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", "size=4G"]) .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -8480,7 +8480,7 @@ mod common_parallel { let mut cmd = GuestCommand::new(&guest); cmd.args(["--api-socket", &api_socket]) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", "size=512M"]) .args(["--kernel", kernel_path.to_str().unwrap()]) 
.args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -8622,7 +8622,7 @@ mod common_parallel { let mut cmd = GuestCommand::new(&guest); cmd.args(["--api-socket", &api_socket]) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", "size=512M"]) .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -8840,7 +8840,7 @@ mod common_parallel { let api_socket = temp_api_path(&guest.tmp_dir); let mut cmd = GuestCommand::new(&guest); - cmd.args(["--cpus", "boot=1"]) + cmd.default_cpus() .args(["--memory", "size=512M"]) .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -8966,7 +8966,7 @@ mod common_parallel { let event_path = temp_event_monitor_path(&guest.tmp_dir); let mut cmd = GuestCommand::new(&guest); - cmd.args(["--cpus", "boot=1"]) + cmd.default_cpus() .args(["--memory", "size=512M"]) .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -9049,7 +9049,7 @@ mod common_parallel { let kernel_path = direct_kernel_boot_path(); let mut cmd = GuestCommand::new(&guest); - cmd.args(["--cpus", "boot=1"]) + cmd.default_cpus() .args(["--memory", "size=512M"]) .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -9522,7 +9522,7 @@ mod common_parallel { let api_socket = temp_api_path(&guest.tmp_dir); let mut child = GuestCommand::new(&guest) .args(["--api-socket", &api_socket]) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", "size=1G,shared=on,hugepages=on"]) .args(["--kernel", fw_path(FwType::RustHypervisorFirmware).as_str()]) .args(["--serial", "tty", "--console", "off"]) @@ -9816,7 +9816,7 @@ mod common_parallel { let mut guest_cmd = GuestCommand::new(&guest); guest_cmd - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", "size=1G"]) .args(["--kernel", fw_path(FwType::RustHypervisorFirmware).as_str()]) .args(["--tpm", 
&format!("socket={swtpm_socket_path}")]) @@ -9865,7 +9865,7 @@ mod common_parallel { let kernel_path = direct_kernel_boot_path(); - cmd.args(["--cpus", "boot=1"]) + cmd.default_cpus() .args(["--memory", "size=512M"]) .args(["--kernel", kernel_path.to_str().unwrap()]) .args([ @@ -14020,7 +14020,7 @@ mod aarch64_acpi { let guest = Guest::new(disk_config); let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", "size=512M"]) .args(["--kernel", edk2_path().to_str().unwrap()]) .default_disks() From a216cf164f845485a67d1c5a549dde3650f79ddd Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sun, 22 Feb 2026 23:15:43 -0800 Subject: [PATCH 036/742] tests: use default memory helper in integration tests Replace hard-coded --memory size=512M args with default_memory() across integration tests to centralize default memory settings. This reduces duplicated CLI fragments and keeps behavior consistent. Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 120 +++++++++++++------------- 1 file changed, 60 insertions(+), 60 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index e5349f3440..0b5b743714 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -995,7 +995,7 @@ fn test_cpu_topology(threads_per_core: u8, cores_per_package: u8, packages: u8, "boot={total_vcpus},topology={threads_per_core}:{cores_per_package}:1:{packages}" ), ]) - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", kernel_path]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .default_disks() @@ -1175,7 +1175,7 @@ fn _test_power_button(acpi: bool) { }; cmd.default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .capture_output() @@ -1838,7 +1838,7 @@ fn test_virtio_pmem(discard_writes: bool, specify_size: 
bool) { let mut child = GuestCommand::new(&guest) .default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .default_disks() @@ -1919,7 +1919,7 @@ fn _test_virtio_vsock(hotplug: bool) { let mut cmd = GuestCommand::new(&guest); cmd.args(["--api-socket", &api_socket]); cmd.default_cpus(); - cmd.args(["--memory", "size=512M"]); + cmd.default_memory(); cmd.args(["--kernel", kernel_path.to_str().unwrap()]); cmd.args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]); cmd.default_disks(); @@ -2282,7 +2282,7 @@ fn _test_virtio_iommu(acpi: bool) { let mut child = GuestCommand::new(&guest) .default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .args([ @@ -2639,7 +2639,7 @@ mod common_parallel { let mut cmd = GuestCommand::new(&guest); cmd.args(["--cpus", "boot=2,max=4"]) - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .capture_output() @@ -2694,7 +2694,7 @@ mod common_parallel { let max_phys_bits: u8 = 36; let mut child = GuestCommand::new(&guest) .args(["--cpus", &format!("max_phys_bits={max_phys_bits}")]) - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .default_disks() @@ -2740,7 +2740,7 @@ mod common_parallel { let mut child = GuestCommand::new(&guest) .args(["--cpus", "boot=2,affinity=[0@[0,2],1@[1,3]]"]) - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .default_disks() @@ -2781,7 +2781,7 @@ mod common_parallel { let mut child = GuestCommand::new(&guest) .args(["--cpus", "boot=4"]) - 
.args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .args([ @@ -2992,7 +2992,7 @@ mod common_parallel { cmd.default_cpus() .args(["--api-socket", &api_socket]) - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .args([ @@ -3052,7 +3052,7 @@ mod common_parallel { let guest = Guest::new(Box::new(disk_config)); let mut cmd = GuestCommand::new(&guest); cmd.default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .capture_output() @@ -3089,7 +3089,7 @@ mod common_parallel { let guest = Guest::new(Box::new(disk_config)); let mut cmd = GuestCommand::new(&guest); cmd.default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .args(["--net", guest.default_net_string_w_mtu(3000).as_str()]) @@ -3151,7 +3151,7 @@ mod common_parallel { let mut cmd = GuestCommand::new(&guest); cmd.default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .args([ @@ -3322,7 +3322,7 @@ mod common_parallel { let mut child = GuestCommand::new(&guest) .default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .default_disks() @@ -3368,7 +3368,7 @@ mod common_parallel { let mut child = GuestCommand::new(&guest) .default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) 
.default_disks() @@ -4457,7 +4457,7 @@ mod common_parallel { let mut child = GuestCommand::new(&guest) .default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .args([ @@ -4520,7 +4520,7 @@ mod common_parallel { let mut child = GuestCommand::new(&guest) .default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .args([ @@ -4587,7 +4587,7 @@ mod common_parallel { let child = GuestCommand::new(&guest) .default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .args([ @@ -4637,7 +4637,7 @@ mod common_parallel { let mut child = GuestCommand::new(&guest) .default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .args([ @@ -4780,7 +4780,7 @@ mod common_parallel { let mut cloud_child = GuestCommand::new(&guest) .default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .args([ @@ -4861,7 +4861,7 @@ mod common_parallel { let mut child = GuestCommand::new(&guest) .default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", fw_path(FwType::RustHypervisorFirmware).as_str()]) .args([ "--disk", @@ -4971,7 +4971,7 @@ mod common_parallel { let mut child = GuestCommand::new(&guest) .default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .default_disks() @@ -5017,7 +5017,7 @@ mod common_parallel { let mut child = GuestCommand::new(&guest) .default_cpus() - .args(["--memory", 
"size=512M"]) + .default_memory() .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .args(["--platform", "serial_number=a=b;c=d"]) @@ -5053,7 +5053,7 @@ mod common_parallel { let mut child = GuestCommand::new(&guest) .default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .args(["--platform", "uuid=1e8aa28a-435d-4027-87f4-40dceff1fa0a"]) @@ -5094,7 +5094,7 @@ mod common_parallel { let mut child = GuestCommand::new(&guest) .default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .args(["--platform", &oem_strings]) @@ -5197,7 +5197,7 @@ mod common_parallel { let mut child = GuestCommand::new(&guest) .default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args([ "--disk", @@ -5252,7 +5252,7 @@ mod common_parallel { let mut child = GuestCommand::new(&guest) .default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .default_disks() @@ -5297,7 +5297,7 @@ mod common_parallel { let guest = Guest::new(Box::new(disk_config)); let mut child = GuestCommand::new(&guest) .default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .default_disks() @@ -5333,7 +5333,7 @@ mod common_parallel { let guest = Guest::new(Box::new(disk_config)); let mut child = GuestCommand::new(&guest) .default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) 
.default_disks() @@ -5375,7 +5375,7 @@ mod common_parallel { let console_str: &str = "console=ttyAMA0"; cmd.default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args([ "--cmdline", @@ -5431,7 +5431,7 @@ mod common_parallel { let mut child = GuestCommand::new(&guest) .default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args([ "--cmdline", @@ -5489,7 +5489,7 @@ mod common_parallel { let mut child = GuestCommand::new(&guest) .default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args([ "--cmdline", @@ -5558,7 +5558,7 @@ mod common_parallel { let mut child = GuestCommand::new(&guest) .default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", &cmdline]) .default_disks() @@ -5605,7 +5605,7 @@ mod common_parallel { let mut child = GuestCommand::new(&guest) .default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", &cmdline]) .default_disks() @@ -5669,7 +5669,7 @@ mod common_parallel { let mut child = GuestCommand::new(&guest) .default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .default_disks() @@ -5714,7 +5714,7 @@ mod common_parallel { let console_path = guest.tmp_dir.as_path().join("console-output"); let mut child = GuestCommand::new(&guest) .default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .default_disks() @@ -6011,7 +6011,7 @@ mod common_parallel { let mut child = GuestCommand::new(&guest) 
.default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args([ "--cmdline", @@ -6108,7 +6108,7 @@ mod common_parallel { let mut child = GuestCommand::new(&guest) .default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .default_disks() @@ -6209,7 +6209,7 @@ mod common_parallel { let mut child = GuestCommand::new(&guest) .args(["--cpus", "boot=2,max=4"]) - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args([ "--cmdline", @@ -6562,7 +6562,7 @@ mod common_parallel { let mut child = GuestCommand::new(&guest) .args(["--api-socket", &api_socket]) .default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .args(["--landlock"]) @@ -6636,7 +6636,7 @@ mod common_parallel { cmd.args(["--api-socket", &api_socket]) .default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .default_disks() @@ -6828,7 +6828,7 @@ mod common_parallel { cmd.args(["--api-socket", &api_socket]) .default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .default_disks() @@ -6943,7 +6943,7 @@ mod common_parallel { cmd.args(["--api-socket", &api_socket]) .default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .default_disks() @@ -7186,7 +7186,7 @@ mod common_parallel { let mut child = GuestCommand::new(&guest) .default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", 
kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .args([ @@ -7398,7 +7398,7 @@ mod common_parallel { let mut child = GuestCommand::new(&guest) .args(["--cpus", "boot=4"]) - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .args([ @@ -7925,7 +7925,7 @@ mod common_parallel { let mut child = GuestCommand::new(&guest) .args(["--cpus", "boot=4"]) - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .args([ @@ -8127,7 +8127,7 @@ mod common_parallel { let mut child = GuestCommand::new(&guest) .default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .args([ @@ -8243,7 +8243,7 @@ mod common_parallel { let mut child = GuestCommand::new(&guest) .args(["--cpus", "boot=4"]) - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .args([ @@ -8481,7 +8481,7 @@ mod common_parallel { cmd.args(["--api-socket", &api_socket]) .default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .default_disks() @@ -8623,7 +8623,7 @@ mod common_parallel { cmd.args(["--api-socket", &api_socket]) .default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .default_net() @@ -8841,7 +8841,7 @@ mod common_parallel { let mut cmd = GuestCommand::new(&guest); cmd.default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) 
.default_disks() @@ -8967,7 +8967,7 @@ mod common_parallel { let mut cmd = GuestCommand::new(&guest); cmd.default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .default_disks() @@ -9050,7 +9050,7 @@ mod common_parallel { let mut cmd = GuestCommand::new(&guest); cmd.default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .default_disks() @@ -9112,7 +9112,7 @@ mod common_parallel { let mut child = GuestCommand::new(&guest) .args(["--cpus", &format!("boot={num_queue_pairs}")]) - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .default_disks() @@ -9237,7 +9237,7 @@ mod common_parallel { let mut guest_command = GuestCommand::new(&guest); guest_command .args(["--cpus", "boot=2"]) - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .default_disks() @@ -9866,7 +9866,7 @@ mod common_parallel { let kernel_path = direct_kernel_boot_path(); cmd.default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args([ "--cmdline", @@ -9915,7 +9915,7 @@ mod common_parallel { let mut cmd = GuestCommand::new(&guest); cmd.args(["--cpus", "boot=4"]) - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", cmd_line.as_str()]) .default_disks() @@ -10291,7 +10291,7 @@ mod ivshmem { let mut child = GuestCommand::new(&guest) .args(["--cpus", "boot=2"]) - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .default_disks() @@ -14021,7 
+14021,7 @@ mod aarch64_acpi { let mut child = GuestCommand::new(&guest) .default_cpus() - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", edk2_path().to_str().unwrap()]) .default_disks() .default_net() @@ -14395,7 +14395,7 @@ mod fw_cfg { std::fs::write(&test_file, "test-file-content").unwrap(); cmd.args(["--cpus", "boot=4"]) - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", cmd_line]) .default_disks() From b41927dbcc190ee731e302d59f884884f4bdcb35 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sun, 22 Feb 2026 23:23:53 -0800 Subject: [PATCH 037/742] tests: cvm: remove unused GuestAddress import Drop an unused vm_memory::GuestAddress import from common_cvm in integration tests to keep the module clean. Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 0b5b743714..5afc6da8e9 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -14432,9 +14432,8 @@ mod fw_cfg { #[cfg(all(feature = "sev_snp", target_arch = "x86_64"))] mod common_cvm { - use vm_memory::GuestAddress; - use crate::*; + #[test] fn test_focal_simple_launch() { let disk_config = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); From 5b5e5cb99912c46c5831facac37a279597a987fa Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Fri, 27 Feb 2026 01:32:32 -0800 Subject: [PATCH 038/742] vmm: cpu: Avoid potential infinite loop during core scheduling setup Avoid a potential infinite loop where if the leader fails to create a cookie due to an unexpected error (not one of the SMT/no kernel support errors) then the other vcpu threads will continue around their spinloops. This change also clarifies the state machine for the leader election with an explicit enum. 
Signed-off-by: Rob Bradford --- vmm/src/cpu.rs | 58 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 46 insertions(+), 12 deletions(-) diff --git a/vmm/src/cpu.rs b/vmm/src/cpu.rs index 78149c4b55..852850b9fa 100644 --- a/vmm/src/cpu.rs +++ b/vmm/src/cpu.rs @@ -679,8 +679,7 @@ pub struct CpuManager { hypervisor: Arc, #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, - // TID of the first vCPU thread that created a core scheduling cookie (VM mode). - // 0 = no leader yet, -1 = leader creating cookie, >0 = leader TID (cookie ready). + // State of the core scheduling group leader election (VM mode). core_scheduling_group_leader: Arc, } @@ -692,6 +691,36 @@ const CPU_EJECT_FLAG: usize = 3; const CPU_STATUS_OFFSET: u64 = 4; const CPU_SELECTION_OFFSET: u64 = 0; +/// State of the core scheduling group leader election for VM-wide cookie +/// sharing. +/// +/// The value will be in an `AtomicI32`. Positive values represent a leader +/// TID (cookie ready). +#[repr(i32)] +enum CoreSchedulingLeader { + /// No leader elected yet. + Initial = 0, + /// A leader has been elected and is creating the cookie. + Elected = -1, + /// The leader failed to create the cookie. + Error = -2, +} + +impl TryFrom for CoreSchedulingLeader { + type Error = (); + /// Convert from the raw `i32` (from the `AtomicI32`) value. + /// Quirky: Returns `Ok(state)` for known sentinel values, or `Err(())` for + /// a positive TID (cookie ready). + fn try_from(value: i32) -> result::Result { + match value { + 0 => Ok(Self::Initial), + -1 => Ok(Self::Elected), + -2 => Ok(Self::Error), + _ => Err(()), + } + } +} + impl BusDevice for CpuManager { fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { // The Linux kernel, quite reasonably, doesn't zero the memory it gives us. 
@@ -924,7 +953,9 @@ impl CpuManager { hypervisor, #[cfg(feature = "sev_snp")] sev_snp_enabled, - core_scheduling_group_leader: Arc::new(AtomicI32::new(0)), + core_scheduling_group_leader: Arc::new(AtomicI32::new( + CoreSchedulingLeader::Initial as i32, + )), }))) } @@ -1210,7 +1241,7 @@ impl CpuManager { // SAFETY: gettid() is always safe to call. let my_tid = unsafe { libc::gettid() }; if core_scheduling_group_leader - .compare_exchange(0, -1, Ordering::AcqRel, Ordering::Acquire) + .compare_exchange(CoreSchedulingLeader::Initial as i32, CoreSchedulingLeader::Elected as i32, Ordering::AcqRel, Ordering::Acquire) .is_ok() { // We are the group leader — create the cookie @@ -1218,6 +1249,8 @@ impl CpuManager { error!( "Failed to create core scheduling cookie: {e:?}" ); + // This will force the loop in the other threads to break out + core_scheduling_group_leader.store(CoreSchedulingLeader::Error as i32, Ordering::Release); return; } // Signal that the cookie is ready by storing real TID @@ -1225,14 +1258,15 @@ impl CpuManager { .store(my_tid, Ordering::Release); } else { // Wait for the leader to finish creating the cookie - let mut leader_tid = - core_scheduling_group_leader.load(Ordering::Acquire); - while leader_tid <= 0 { - std::hint::spin_loop(); - leader_tid = - core_scheduling_group_leader.load(Ordering::Acquire); - } - // Copy the leader's cookie to this thread + let leader_tid = loop { + let v = core_scheduling_group_leader.load(Ordering::Acquire); + match CoreSchedulingLeader::try_from(v) { + Ok(CoreSchedulingLeader::Error) => return, + Ok(CoreSchedulingLeader::Initial | + CoreSchedulingLeader::Elected) => std::hint::spin_loop(), + Err(()) => break v, + } + }; if let Err(e) = core_scheduling_share_from(leader_tid) { error!( "Failed to share core scheduling cookie \ From d1c89a271d47f6ac5862071cc450ba34cb168c9a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 27 Feb 2026 23:53:55 +0000 Subject: 
[PATCH 039/742] build: Bump crate-ci/typos from 1.43.5 to 1.44.0 Bumps [crate-ci/typos](https://github.com/crate-ci/typos) from 1.43.5 to 1.44.0. - [Release notes](https://github.com/crate-ci/typos/releases) - [Changelog](https://github.com/crate-ci/typos/blob/master/CHANGELOG.md) - [Commits](https://github.com/crate-ci/typos/compare/v1.43.5...v1.44.0) --- updated-dependencies: - dependency-name: crate-ci/typos dependency-version: 1.44.0 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- .github/workflows/quality.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/quality.yaml b/.github/workflows/quality.yaml index 0427708458..81baf1e3af 100644 --- a/.github/workflows/quality.yaml +++ b/.github/workflows/quality.yaml @@ -167,4 +167,4 @@ jobs: steps: - uses: actions/checkout@v6 # Executes "typos ." - - uses: crate-ci/typos@v1.43.5 + - uses: crate-ci/typos@v1.44.0 From a94fa554c32867dab78c00ce0faa75bf4c7cbe94 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Sat, 28 Feb 2026 23:51:13 +0100 Subject: [PATCH 040/742] hypervisor: kvm: Fix nightly rustfmt import ordering Nightly rustfmt now prefers `self` re-exports inline rather than a separate 'pub use {kvm_bindings, kvm_ioctls}' line. 
Signed-off-by: Anatol Belski --- hypervisor/src/kvm/mod.rs | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/hypervisor/src/kvm/mod.rs b/hypervisor/src/kvm/mod.rs index 259009151e..75073ec8d5 100644 --- a/hypervisor/src/kvm/mod.rs +++ b/hypervisor/src/kvm/mod.rs @@ -90,11 +90,11 @@ pub use kvm_bindings::kvm_vcpu_events as VcpuEvents; #[cfg(target_arch = "x86_64")] use kvm_bindings::nested::KvmNestedStateBuffer; pub use kvm_bindings::{ - KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, KVM_IRQ_ROUTING_IRQCHIP, KVM_IRQ_ROUTING_MSI, - KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID, kvm_clock_data, - kvm_create_device, kvm_create_device as CreateDevice, kvm_device_attr as DeviceAttr, - kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_guest_debug, kvm_irq_routing, kvm_irq_routing_entry, - kvm_mp_state, kvm_run, kvm_userspace_memory_region, + self, KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, KVM_IRQ_ROUTING_IRQCHIP, + KVM_IRQ_ROUTING_MSI, KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID, + kvm_clock_data, kvm_create_device, kvm_create_device as CreateDevice, + kvm_device_attr as DeviceAttr, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_guest_debug, + kvm_irq_routing, kvm_irq_routing_entry, kvm_mp_state, kvm_run, kvm_userspace_memory_region, }; #[cfg(target_arch = "aarch64")] use kvm_bindings::{ @@ -109,14 +109,13 @@ use kvm_bindings::{KVM_REG_RISCV_CORE, kvm_riscv_core}; use kvm_bindings::{KVM_X86_DEFAULT_VM, KVM_X86_SW_PROTECTED_VM, KVMIO, kvm_run__bindgen_ty_1}; #[cfg(target_arch = "x86_64")] use kvm_bindings::{Xsave as xsave2, kvm_xsave2}; -pub use kvm_ioctls::{Cap, Kvm, VcpuExit}; +pub use kvm_ioctls::{self, Cap, Kvm, VcpuExit}; use thiserror::Error; use vfio_ioctls::VfioDeviceFd; #[cfg(target_arch = "x86_64")] use vmm_sys_util::{fam::FamStruct, ioctl_io_nr}; #[cfg(feature = "tdx")] use vmm_sys_util::{ioctl::ioctl_with_val, ioctl_iowr_nr}; -pub use {kvm_bindings, kvm_ioctls}; #[cfg(any(target_arch = "aarch64", target_arch = 
"riscv64"))] use crate::RegList; From 15ce890dd35fafe8154a5549a60c812a495d5c73 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 27 Feb 2026 11:58:32 +0100 Subject: [PATCH 041/742] block: Query actual DIO alignment for file backed images DiskTopology::probe() returned a hardcoded 512 for regular files, causing O_DIRECT failures on volumes with larger block sizes (e.g. 4K). Use statx(STATX_DIOALIGN) (Linux >= 6.1) to query the real per file DIO memory and offset alignment. Unlike fstatvfs().f_bsize, which only returns the filesystem preferred I/O block size, STATX_DIOALIGN reports the true DIO constraints accounting for the filesystem, underlying block device, and any stacking (loop, dm, etc.). Signed-off-by: Anatol Belski --- block/src/lib.rs | 67 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 66 insertions(+), 1 deletion(-) diff --git a/block/src/lib.rs b/block/src/lib.rs index 3ab8de9e41..3cbe9fe52c 100644 --- a/block/src/lib.rs +++ b/block/src/lib.rs @@ -38,7 +38,7 @@ use std::os::unix::io::AsRawFd; use std::path::Path; use std::str::FromStr; use std::time::Instant; -use std::{cmp, result}; +use std::{cmp, mem, result}; #[cfg(feature = "io_uring")] use io_uring::{IoUring, Probe, opcode}; @@ -1161,8 +1161,73 @@ impl DiskTopology { Ok(block_size) } + /// Query the O_DIRECT alignment requirement for a regular file. + /// + /// Uses `statx(STATX_DIOALIGN)` (Linux >= 6.1) to obtain the exact + /// memory and offset alignment the kernel requires for direct I/O on + /// this specific file. Unlike `fstatvfs().f_bsize`, which only returns + /// the filesystem's preferred I/O block size, `STATX_DIOALIGN` reports + /// the true per-file DIO constraints accounting for the filesystem, + /// underlying block device, and any stacking (loop, dm, etc.). + fn query_file_alignment(f: &File) -> u64 { + // The libc crate does not expose statx / STATX_DIOALIGN on all + // targets (e.g. 
musl), so define the constant and a minimal repr(C) + // struct locally and invoke the syscall directly. + const STATX_DIOALIGN: u32 = 0x2000; + + // Minimal statx layout, only the needed fields, + // everything else is padding. + #[repr(C)] + struct Statx { + stx_mask: u32, + _pad: [u8; 148], + stx_dio_mem_align: u32, + stx_dio_offset_align: u32, + _pad2: [u8; 96], + } + + let mut stx = mem::MaybeUninit::::zeroed(); + // SAFETY: FFI syscall with valid fd and correctly sized buffer. + let ret = unsafe { + libc::syscall( + libc::SYS_statx, + f.as_raw_fd(), + c"".as_ptr(), + libc::AT_EMPTY_PATH, + STATX_DIOALIGN, + stx.as_mut_ptr(), + ) + }; + if ret == 0 { + // SAFETY: statx succeeded, the struct is fully initialized. + let stx = unsafe { stx.assume_init() }; + if stx.stx_mask & STATX_DIOALIGN != 0 && stx.stx_dio_mem_align > 0 { + let align = cmp::max(stx.stx_dio_mem_align, stx.stx_dio_offset_align) as u64; + debug!("statx(STATX_DIOALIGN) returned alignment {align}"); + return align; + } + } + + debug!("O_DIRECT alignment query failed, falling back to default {SECTOR_SIZE}"); + SECTOR_SIZE + } + pub fn probe(f: &File) -> std::io::Result { if !Self::is_block_device(f)? { + // For regular files opened with O_DIRECT, the logical block size + // must reflect the filesystem DIO alignment so the guest issues + // correctly sized I/O. + // SAFETY: fcntl(F_GETFL) is always safe on a valid fd. 
+ let flags = unsafe { libc::fcntl(f.as_raw_fd(), libc::F_GETFL) }; + if flags >= 0 && (flags & libc::O_DIRECT) != 0 { + let alignment = Self::query_file_alignment(f); + return Ok(DiskTopology { + logical_block_size: alignment, + physical_block_size: alignment, + minimum_io_size: alignment, + optimal_io_size: 0, + }); + } return Ok(DiskTopology::default()); } From 496c89c2892105fdc0ebf6c0cac8f9ccf9d632f6 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 27 Feb 2026 12:20:15 +0100 Subject: [PATCH 042/742] block: Add unit tests for DiskTopology file alignment probing Test valid power of two alignment, layout compatibility, direct helper coverage, and O_DIRECT write/read roundtrip. Signed-off-by: Anatol Belski --- block/src/lib.rs | 126 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 126 insertions(+) diff --git a/block/src/lib.rs b/block/src/lib.rs index 3cbe9fe52c..f8d56cf102 100644 --- a/block/src/lib.rs +++ b/block/src/lib.rs @@ -1239,3 +1239,129 @@ impl DiskTopology { }) } } + +#[cfg(test)] +mod unit_tests { + use std::alloc::{Layout, alloc_zeroed, dealloc}; + use std::fs::OpenOptions; + use std::io::Write; + use std::os::unix::fs::OpenOptionsExt; + use std::{ptr, slice}; + + use vmm_sys_util::tempfile::TempFile; + + use super::*; + + #[test] + fn test_probe_regular_file_returns_valid_alignment() { + let temp_file = TempFile::new().unwrap(); + let mut f = temp_file.into_file(); + f.write_all(&[0u8; 4096]).unwrap(); + f.sync_all().unwrap(); + + let topo = DiskTopology::probe(&f).unwrap(); + + assert_eq!( + topo.logical_block_size, SECTOR_SIZE, + "probe() should return {SECTOR_SIZE} for regular files without O_DIRECT, got {}", + topo.logical_block_size + ); + } + + #[test] + fn test_probe_regular_file_with_direct_returns_dio_alignment() { + let temp_file = TempFile::new().unwrap(); + let path = temp_file.as_path().to_owned(); + { + let f = temp_file.as_file(); + f.set_len(1 << 20).unwrap(); // 1 MiB + f.sync_all().unwrap(); + } + + let f = 
OpenOptions::new() + .read(true) + .write(true) + .custom_flags(libc::O_DIRECT) + .open(&path) + .unwrap(); + let topo = DiskTopology::probe(&f).unwrap(); + + assert!( + topo.logical_block_size.is_power_of_two(), + "logical_block_size {} is not a power of two", + topo.logical_block_size + ); + assert!( + topo.logical_block_size >= SECTOR_SIZE, + "logical_block_size {} is less than SECTOR_SIZE ({SECTOR_SIZE})", + topo.logical_block_size + ); + + let alignment = topo.logical_block_size as usize; + let layout = Layout::from_size_align(4096, alignment); + assert!( + layout.is_ok(), + "Layout::from_size_align(4096, {alignment}) failed: {:?}", + layout.err() + ); + } + + #[test] + fn test_dio_write_read_with_probed_alignment() { + let temp_file = TempFile::new().unwrap(); + let path = temp_file.as_path().to_owned(); + { + let f = temp_file.as_file(); + f.set_len(1 << 20).unwrap(); // 1 MiB + f.sync_all().unwrap(); + } + + let f = OpenOptions::new() + .read(true) + .write(true) + .custom_flags(libc::O_DIRECT) + .open(&path) + .unwrap(); + let topo = DiskTopology::probe(&f).unwrap(); + let alignment = topo.logical_block_size as usize; + + let layout = Layout::from_size_align(alignment, alignment).unwrap(); + // SAFETY: layout is valid (non-zero, power-of-two alignment). + let buf = unsafe { alloc_zeroed(layout) }; + assert!(!buf.is_null()); + + // SAFETY: buf is valid for `alignment` bytes. + unsafe { ptr::write_bytes(buf, 0xAB, alignment) }; + + // SAFETY: buf is aligned and sized for O_DIRECT; fd is valid. + let written = + unsafe { libc::pwrite(f.as_raw_fd(), buf as *const libc::c_void, alignment, 0) }; + assert_eq!( + written as usize, + alignment, + "O_DIRECT pwrite failed: {}", + io::Error::last_os_error() + ); + + // SAFETY: buf is valid for `alignment` bytes. + unsafe { ptr::write_bytes(buf, 0x00, alignment) }; + // SAFETY: buf is aligned and sized for O_DIRECT; fd is valid. 
+ let read = unsafe { libc::pread(f.as_raw_fd(), buf as *mut libc::c_void, alignment, 0) }; + assert_eq!( + read as usize, + alignment, + "O_DIRECT pread failed: {}", + io::Error::last_os_error() + ); + + // SAFETY: buf is valid for `alignment` bytes after successful pread. + let slice = unsafe { slice::from_raw_parts(buf, alignment) }; + assert!( + slice.iter().all(|&b| b == 0xAB), + "Data mismatch after O_DIRECT roundtrip" + ); + + // SAFETY: buf was allocated with this layout via alloc_zeroed. + unsafe { dealloc(buf, layout) }; + } +} From ba889a6ec26bf88d73b4dcc52f8f909f06ab7bed Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 27 Feb 2026 23:29:22 +0100 Subject: [PATCH 043/742] tests: Fix loop device race in create_loop_device Move LOOP_CTL_GET_FREE + open + LOOP_CONFIGURE into the retry loop so each attempt requests a fresh free device number. Previously, a parallel test could claim the same device between GET_FREE and CONFIGURE, and retrying the same stale number would always fail with EBUSY. 
Signed-off-by: Anatol Belski --- cloud-hypervisor/tests/integration.rs | 95 ++++++++++++++------------- 1 file changed, 51 insertions(+), 44 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 5afc6da8e9..05b25d3e9b 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -20,7 +20,7 @@ use std::process::{Child, Command, Stdio}; use std::string::String; use std::sync::mpsc::Receiver; use std::sync::{Mutex, mpsc}; -use std::time::Duration; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; use std::{fs, io, thread}; use net_util::MacAddr; @@ -7095,24 +7095,6 @@ mod common_parallel { .open(LOOP_CTL_PATH) .unwrap(); - // Request a free loop device - let loop_device_number = - unsafe { libc::ioctl(loop_ctl_file.as_raw_fd(), LOOP_CTL_GET_FREE as _) }; - - if loop_device_number < 0 { - panic!("Couldn't find a free loop device"); - } - - // Create loop device path - let loop_device_path = format!("{LOOP_DEVICE_PREFIX}{loop_device_number}"); - - // Open loop device - let loop_device_file = OpenOptions::new() - .read(true) - .write(true) - .open(&loop_device_path) - .unwrap(); - // Open backing file let backing_file = OpenOptions::new() .read(true) @@ -7120,13 +7102,34 @@ mod common_parallel { .open(backing_file_path) .unwrap(); - let loop_config = LoopConfig { - fd: backing_file.as_raw_fd() as u32, - block_size, - ..Default::default() - }; - + // Retry the whole get free -> open -> configure sequence so that a + // race with another parallel test claiming the same loop device + // is resolved by requesting a new free device on each attempt. 
+ let mut loop_device_path = String::new(); for i in 0..num_retries { + // Request a free loop device + let loop_device_number = + unsafe { libc::ioctl(loop_ctl_file.as_raw_fd(), LOOP_CTL_GET_FREE as _) }; + + if loop_device_number < 0 { + panic!("Couldn't find a free loop device"); + } + + loop_device_path = format!("{LOOP_DEVICE_PREFIX}{loop_device_number}"); + + // Open loop device + let loop_device_file = OpenOptions::new() + .read(true) + .write(true) + .open(&loop_device_path) + .unwrap(); + + let loop_config = LoopConfig { + fd: backing_file.as_raw_fd() as u32, + block_size, + ..Default::default() + }; + let ret = unsafe { libc::ioctl( loop_device_file.as_raw_fd(), @@ -7134,28 +7137,32 @@ mod common_parallel { &loop_config, ) }; - if ret != 0 { - if i < num_retries - 1 { - println!( - "Iteration {}: Failed to configure the loop device {}: {}", - i, - loop_device_path, - std::io::Error::last_os_error() - ); - } else { - panic!( - "Failed {} times trying to configure the loop device {}: {}", - num_retries, - loop_device_path, - std::io::Error::last_os_error() - ); - } - } else { + if ret == 0 { break; } - // Wait for a bit before retrying - thread::sleep(std::time::Duration::new(5, 0)); + if i < num_retries - 1 { + println!( + "Iteration {}: Failed to configure loop device {}: {}", + i, + loop_device_path, + io::Error::last_os_error() + ); + let jitter_ms = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .subsec_nanos() + % 500 + + 100; + thread::sleep(Duration::from_millis(jitter_ms as u64)); + } else { + panic!( + "Failed {} times trying to configure the loop device {}: {}", + num_retries, + loop_device_path, + io::Error::last_os_error() + ); + } } loop_device_path From 6fe3f63d270a542e0adc3fa75b93f50574816676 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 27 Feb 2026 23:37:23 +0100 Subject: [PATCH 044/742] tests: Add direct I/O file backed alignment test Verify that DiskTopology::probe() returns the correct DIO alignment for a regular 
file on a 4k sector filesystem. The test creates a loop device with --sector-size 4096, formats ext4, places a raw disk image on it, and boots a VM with direct=on. Asserts that the guest sees a 4096 byte logical sector and that a DIO write/read roundtrip succeeds. Signed-off-by: Anatol Belski --- cloud-hypervisor/tests/integration.rs | 120 ++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 05b25d3e9b..988f9df566 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -7261,6 +7261,126 @@ mod common_parallel { .expect("loop device not found"); } + #[test] + fn test_virtio_block_direct_io_file_backed_alignment_4k() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)); + let kernel_path = direct_kernel_boot_path(); + + let mut workloads_path = dirs::home_dir().unwrap(); + workloads_path.push("workloads"); + let img_dir = TempDir::new_in(workloads_path.as_path()).unwrap(); + let fs_img_path = img_dir.as_path().join("fs_4ksec.img"); + + assert!( + exec_host_command_output(&format!( + "truncate -s 512M {}", + fs_img_path.to_str().unwrap() + )) + .status + .success(), + "truncate failed" + ); + + let loop_dev = exec_host_command_output(&format!( + "losetup --find --show --sector-size 4096 {}", + fs_img_path.to_str().unwrap() + )); + assert!(loop_dev.status.success(), "losetup failed"); + let loop_dev_path = String::from_utf8_lossy(&loop_dev.stdout).trim().to_string(); + + assert!( + exec_host_command_output(&format!("mkfs.ext4 -q {loop_dev_path}")) + .status + .success(), + "mkfs.ext4 failed" + ); + + let mnt_dir = img_dir.as_path().join("mnt"); + fs::create_dir_all(&mnt_dir).unwrap(); + assert!( + exec_host_command_output(&format!( + "mount {} {}", + &loop_dev_path, + mnt_dir.to_str().unwrap() + )) + .status + .success(), + "mount failed" + ); + + let 
test_disk_path = mnt_dir.join("dio_file_test.raw"); + assert!( + exec_host_command_output(&format!( + "truncate -s 64M {}", + test_disk_path.to_str().unwrap() + )) + .status + .success(), + "truncate test disk failed" + ); + + let mut child = GuestCommand::new(&guest) + .args(["--cpus", "boot=1"]) + .args(["--memory", "size=512M"]) + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .args([ + "--disk", + format!( + "path={}", + guest.disk_config.disk(DiskType::OperatingSystem).unwrap() + ) + .as_str(), + format!( + "path={}", + guest.disk_config.disk(DiskType::CloudInit).unwrap() + ) + .as_str(), + format!( + "path={},direct=on,image_type=raw", + test_disk_path.to_str().unwrap() + ) + .as_str(), + ]) + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + let log_sec: u32 = guest + .ssh_command("lsblk -t | grep vdc | awk '{print $6}'") + .unwrap() + .trim() + .parse() + .unwrap_or_default(); + assert_eq!( + log_sec, 4096, + "expected 4096-byte logical sector for file on 4k-sector fs, got {log_sec}" + ); + + guest + .ssh_command( + "sudo dd if=/dev/urandom of=/tmp/pattern bs=4096 count=8 && \ + sudo dd if=/tmp/pattern of=/dev/vdc bs=4096 count=8 seek=1 oflag=direct && \ + sudo dd if=/dev/vdc of=/tmp/readback bs=4096 count=8 skip=1 iflag=direct && \ + cmp /tmp/pattern /tmp/readback", + ) + .unwrap(); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); + + let _ = exec_host_command_output(&format!("umount {}", mnt_dir.to_str().unwrap())); + let _ = exec_host_command_output(&format!("losetup -d {loop_dev_path}")); + } + // Helper function to verify sparse file fn verify_sparse_file(test_disk_path: &str, expected_ratio: f64) { let res = exec_host_command_output(&format!("ls -s --block-size=1 {}", test_disk_path)); From aae7594a671af171f9963c2d9c98e5d9dd756155 Mon Sep 
17 00:00:00 2001 From: Anatol Belski Date: Fri, 27 Feb 2026 23:38:08 +0100 Subject: [PATCH 045/742] tests: Add direct I/O block device alignment test Boot a VM with a 4k sector loop device passed with direct=on and image_type=raw. Assert that the guest sees a 4096 byte logical sector and that a DIO write/read roundtrip at 4096 byte alignment succeeds. Signed-off-by: Anatol Belski --- cloud-hypervisor/tests/integration.rs | 86 +++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 988f9df566..3cdf1ba8ea 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -7261,6 +7261,92 @@ mod common_parallel { .expect("loop device not found"); } + #[test] + fn test_virtio_block_direct_io_block_device_alignment_4k() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)); + let kernel_path = direct_kernel_boot_path(); + + // The backing file for the loop device must live on a filesystem that + // supports O_DIRECT (e.g. ext4). guest.tmp_dir is on tmpfs inside + // Docker, and the loop driver forwards I/O to the backing file. + let mut workloads_path = dirs::home_dir().unwrap(); + workloads_path.push("workloads"); + let img_dir = TempDir::new_in(workloads_path.as_path()).unwrap(); + let test_disk_path = img_dir.as_path().join("directio_test.img"); + // Preallocate the backing file -- a sparse file can deadlock when + // O_DIRECT writes through a loop device trigger block allocation + // in the backing filesystem. 
+ assert!( + exec_host_command_output(&format!( + "fallocate -l 64M {}", + test_disk_path.to_str().unwrap() + )) + .status + .success(), + "fallocate failed" + ); + + let loop_dev = create_loop_device(test_disk_path.to_str().unwrap(), 4096, 5); + + let mut child = GuestCommand::new(&guest) + .args(["--cpus", "boot=1"]) + .args(["--memory", "size=512M"]) + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .args([ + "--disk", + format!( + "path={}", + guest.disk_config.disk(DiskType::OperatingSystem).unwrap() + ) + .as_str(), + format!( + "path={}", + guest.disk_config.disk(DiskType::CloudInit).unwrap() + ) + .as_str(), + format!("path={},direct=on,image_type=raw", &loop_dev).as_str(), + ]) + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + assert_eq!( + guest + .ssh_command("lsblk -t | grep vdc | awk '{print $6}'") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 4096 + ); + + guest + .ssh_command( + "sudo dd if=/dev/urandom of=/tmp/pattern bs=4096 count=1 && \ + sudo dd if=/tmp/pattern of=/dev/vdc bs=4096 count=1 seek=1 oflag=direct && \ + sudo dd if=/dev/vdc of=/tmp/readback bs=4096 count=1 skip=1 iflag=direct && \ + cmp /tmp/pattern /tmp/readback", + ) + .unwrap(); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); + + Command::new("losetup") + .args(["-d", &loop_dev]) + .output() + .expect("loop device cleanup failed"); + } + #[test] fn test_virtio_block_direct_io_file_backed_alignment_4k() { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); From 511e682909cf29ac8d4b390822b0ecafd0aa80d8 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 3 Mar 2026 01:12:32 +0000 Subject: [PATCH 046/742] build: Bump the non-rust-vmm group across 2 directories with 12 updates 
Bumps the non-rust-vmm group with 9 updates in the / directory: | Package | From | To | | --- | --- | --- | | [serde_with](https://github.com/jonasbb/serde_with) | `3.16.1` | `3.17.0` | | [zerocopy](https://github.com/google/zerocopy) | `0.8.39` | `0.8.40` | | [jiff](https://github.com/BurntSushi/jiff) | `0.2.21` | `0.2.22` | | libredox | `0.1.12` | `0.1.14` | | [libz-sys](https://github.com/rust-lang/libz-sys) | `1.1.23` | `1.1.24` | | [pin-project-lite](https://github.com/taiki-e/pin-project-lite) | `0.2.16` | `0.2.17` | | [piper](https://github.com/smol-rs/piper) | `0.2.4` | `0.2.5` | | [regex-syntax](https://github.com/rust-lang/regex) | `0.8.9` | `0.8.10` | | [tempfile](https://github.com/Stebalien/tempfile) | `3.25.0` | `3.26.0` | Bumps the non-rust-vmm group with 2 updates in the /fuzz directory: [serde_with](https://github.com/jonasbb/serde_with) and [zerocopy](https://github.com/google/zerocopy). Updates `serde_with` from 3.16.1 to 3.17.0 - [Release notes](https://github.com/jonasbb/serde_with/releases) - [Commits](https://github.com/jonasbb/serde_with/compare/v3.16.1...v3.17.0) Updates `zerocopy` from 0.8.39 to 0.8.40 - [Release notes](https://github.com/google/zerocopy/releases) - [Changelog](https://github.com/google/zerocopy/blob/main/CHANGELOG.md) - [Commits](https://github.com/google/zerocopy/compare/v0.8.39...v0.8.40) Updates `jiff` from 0.2.21 to 0.2.22 - [Release notes](https://github.com/BurntSushi/jiff/releases) - [Changelog](https://github.com/BurntSushi/jiff/blob/master/CHANGELOG.md) - [Commits](https://github.com/BurntSushi/jiff/compare/jiff-static-0.2.21...jiff-static-0.2.22) Updates `jiff-static` from 0.2.21 to 0.2.22 - [Release notes](https://github.com/BurntSushi/jiff/releases) - [Changelog](https://github.com/BurntSushi/jiff/blob/master/CHANGELOG.md) - [Commits](https://github.com/BurntSushi/jiff/compare/jiff-static-0.2.21...jiff-static-0.2.22) Updates `libredox` from 0.1.12 to 0.1.14 Updates `libz-sys` from 1.1.23 to 1.1.24 - [Release 
notes](https://github.com/rust-lang/libz-sys/releases) - [Commits](https://github.com/rust-lang/libz-sys/compare/1.1.23...1.1.24) Updates `pin-project-lite` from 0.2.16 to 0.2.17 - [Release notes](https://github.com/taiki-e/pin-project-lite/releases) - [Changelog](https://github.com/taiki-e/pin-project-lite/blob/main/CHANGELOG.md) - [Commits](https://github.com/taiki-e/pin-project-lite/compare/v0.2.16...v0.2.17) Updates `piper` from 0.2.4 to 0.2.5 - [Release notes](https://github.com/smol-rs/piper/releases) - [Changelog](https://github.com/smol-rs/piper/blob/main/CHANGELOG.md) - [Commits](https://github.com/smol-rs/piper/compare/v0.2.4...v0.2.5) Updates `regex-syntax` from 0.8.9 to 0.8.10 - [Release notes](https://github.com/rust-lang/regex/releases) - [Changelog](https://github.com/rust-lang/regex/blob/master/CHANGELOG.md) - [Commits](https://github.com/rust-lang/regex/compare/regex-syntax-0.8.9...regex-syntax-0.8.10) Updates `serde_with_macros` from 3.16.1 to 3.17.0 - [Release notes](https://github.com/jonasbb/serde_with/releases) - [Commits](https://github.com/jonasbb/serde_with/compare/v3.16.1...v3.17.0) Updates `tempfile` from 3.25.0 to 3.26.0 - [Changelog](https://github.com/Stebalien/tempfile/blob/master/CHANGELOG.md) - [Commits](https://github.com/Stebalien/tempfile/commits/v3.26.0) Updates `zerocopy-derive` from 0.8.39 to 0.8.40 - [Release notes](https://github.com/google/zerocopy/releases) - [Changelog](https://github.com/google/zerocopy/blob/main/CHANGELOG.md) - [Commits](https://github.com/google/zerocopy/compare/v0.8.39...v0.8.40) Updates `serde_with` from 3.16.1 to 3.17.0 - [Release notes](https://github.com/jonasbb/serde_with/releases) - [Commits](https://github.com/jonasbb/serde_with/compare/v3.16.1...v3.17.0) Updates `zerocopy` from 0.8.39 to 0.8.40 - [Release notes](https://github.com/google/zerocopy/releases) - [Changelog](https://github.com/google/zerocopy/blob/main/CHANGELOG.md) - 
[Commits](https://github.com/google/zerocopy/compare/v0.8.39...v0.8.40) Updates `serde_with_macros` from 3.16.1 to 3.17.0 - [Release notes](https://github.com/jonasbb/serde_with/releases) - [Commits](https://github.com/jonasbb/serde_with/compare/v3.16.1...v3.17.0) Updates `zerocopy-derive` from 0.8.39 to 0.8.40 - [Release notes](https://github.com/google/zerocopy/releases) - [Changelog](https://github.com/google/zerocopy/blob/main/CHANGELOG.md) - [Commits](https://github.com/google/zerocopy/compare/v0.8.39...v0.8.40) --- updated-dependencies: - dependency-name: serde_with dependency-version: 3.17.0 dependency-type: direct:production update-type: version-update:semver-minor dependency-group: non-rust-vmm - dependency-name: zerocopy dependency-version: 0.8.40 dependency-type: direct:production update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: jiff dependency-version: 0.2.22 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: jiff-static dependency-version: 0.2.22 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: libredox dependency-version: 0.1.14 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: libz-sys dependency-version: 1.1.24 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: pin-project-lite dependency-version: 0.2.17 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: piper dependency-version: 0.2.5 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: regex-syntax dependency-version: 0.8.10 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: serde_with_macros 
dependency-version: 3.17.0 dependency-type: indirect update-type: version-update:semver-minor dependency-group: non-rust-vmm - dependency-name: tempfile dependency-version: 3.26.0 dependency-type: indirect update-type: version-update:semver-minor dependency-group: non-rust-vmm - dependency-name: zerocopy-derive dependency-version: 0.8.40 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: serde_with dependency-version: 3.17.0 dependency-type: direct:production update-type: version-update:semver-minor dependency-group: non-rust-vmm - dependency-name: zerocopy dependency-version: 0.8.40 dependency-type: direct:production update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: serde_with_macros dependency-version: 3.17.0 dependency-type: indirect update-type: version-update:semver-minor dependency-group: non-rust-vmm - dependency-name: zerocopy-derive dependency-version: 0.8.40 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm ... 
Signed-off-by: dependabot[bot] --- Cargo.lock | 51 +++++++++++++++++++++++----------------------- Cargo.toml | 4 ++-- devices/Cargo.toml | 2 +- fuzz/Cargo.lock | 16 +++++++-------- 4 files changed, 36 insertions(+), 37 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ea880ebc97..83ef1dc8c8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1136,9 +1136,9 @@ checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" [[package]] name = "jiff" -version = "0.2.21" +version = "0.2.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3e3d65f018c6ae946ab16e80944b97096ed73c35b221d1c478a6c81d8f57940" +checksum = "819b44bc7c87d9117eb522f14d46e918add69ff12713c475946b0a29363ed1c2" dependencies = [ "jiff-static", "log", @@ -1149,9 +1149,9 @@ dependencies = [ [[package]] name = "jiff-static" -version = "0.2.21" +version = "0.2.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a17c2b211d863c7fde02cbea8a3c1a439b98e109286554f2860bdded7ff83818" +checksum = "470252db18ecc35fd766c0891b1e3ec6cbbcd62507e85276c01bf75d8e94d4a1" dependencies = [ "proc-macro2", "quote", @@ -1232,11 +1232,10 @@ checksum = "6800badb6cb2082ffd7b6a67e6125bb39f18782f793520caee8cb8846be06112" [[package]] name = "libredox" -version = "0.1.12" +version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d0b95e02c851351f877147b7deea7b1afb1df71b63aa5f8270716e0c5720616" +checksum = "1744e39d1d6a9948f4f388969627434e31128196de472883b39f148769bfe30a" dependencies = [ - "bitflags 2.11.0", "libc", ] @@ -1256,9 +1255,9 @@ dependencies = [ [[package]] name = "libz-sys" -version = "1.1.23" +version = "1.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15d118bbf3771060e7311cc7bb0545b01d08a8b4a7de949198dec1fa0ca1c0f7" +checksum = "4735e9cbde5aac84a5ce588f6b23a90b9b0b528f6c5a8db8a4aff300463a0839" dependencies = [ "cc", "libc", @@ -1590,15 +1589,15 @@ dependencies = [ 
[[package]] name = "pin-project-lite" -version = "0.2.16" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" +checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" [[package]] name = "piper" -version = "0.2.4" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96c8c490f422ef9a4efd2cb5b42b76c8613d7e7dfc1caf667b8a3350a5acc066" +checksum = "c835479a4443ded371d6c535cbfd8d31ad92c5d23ae9770a61bc155e4992a3c1" dependencies = [ "atomic-waker", "fastrand", @@ -1891,9 +1890,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.8.9" +version = "0.8.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a96887878f22d7bad8a3b6dc5b7440e0ada9a245242924394987b21cf2210a4c" +checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" [[package]] name = "remain" @@ -2014,9 +2013,9 @@ dependencies = [ [[package]] name = "serde_with" -version = "3.16.1" +version = "3.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fa237f2807440d238e0364a218270b98f767a00d3dada77b1c53ae88940e2e7" +checksum = "381b283ce7bc6b476d903296fb59d0d36633652b633b27f64db4fb46dcbfc3b9" dependencies = [ "serde_core", "serde_with_macros", @@ -2024,9 +2023,9 @@ dependencies = [ [[package]] name = "serde_with_macros" -version = "3.16.1" +version = "3.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52a8e3ca0ca629121f70ab50f95249e5a6f925cc0f6ffe8256c45b728875706c" +checksum = "a6d4e30573c8cb306ed6ab1dca8423eec9a463ea0e155f45399455e0368b27e0" dependencies = [ "darling", "proc-macro2", @@ -2128,12 +2127,12 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.25.0" +version = "3.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"0136791f7c95b1f6dd99f9cc786b91bb81c3800b639b3478e561ddb7be95e5f1" +checksum = "82a72c767771b47409d2345987fda8628641887d5466101319899796367354a0" dependencies = [ "fastrand", - "getrandom 0.4.1", + "getrandom 0.3.4", "once_cell", "rustix", "windows-sys 0.60.2", @@ -3027,18 +3026,18 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.8.39" +version = "0.8.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db6d35d663eadb6c932438e763b262fe1a70987f9ae936e60158176d710cae4a" +checksum = "a789c6e490b576db9f7e6b6d661bcc9799f7c0ac8352f56ea20193b2681532e5" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.39" +version = "0.8.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4122cd3169e94605190e77839c9a40d40ed048d305bfdc146e7df40ab0f3e517" +checksum = "f65c489a7071a749c849713807783f70672b28094011623e200cb86dcb835953" dependencies = [ "proc-macro2", "quote", diff --git a/Cargo.toml b/Cargo.toml index e34941f358..a3e76a797e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -70,7 +70,7 @@ igvm_defs = "0.4.0" # serde crates serde = "1.0.228" serde_json = "1.0.149" -serde_with = { version = "3.16.1", default-features = false } +serde_with = { version = "3.17.0", default-features = false } # other crates anyhow = "1.0.102" @@ -90,7 +90,7 @@ signal-hook = "0.4.3" thiserror = "2.0.18" uuid = { version = "1.21.0" } wait-timeout = "0.2.1" -zerocopy = { version = "0.8.39", default-features = false } +zerocopy = { version = "0.8.40", default-features = false } [workspace.lints.clippy] # Any clippy lint (group) in alphabetical order: diff --git a/devices/Cargo.toml b/devices/Cargo.toml index 0414f13d2a..d9ce839882 100644 --- a/devices/Cargo.toml +++ b/devices/Cargo.toml @@ -34,7 +34,7 @@ vm-memory = { workspace = true, features = [ ] } vm-migration = { path = "../vm-migration" } vmm-sys-util = { workspace = true } -zerocopy = { version = "0.8.39", features = [ +zerocopy = { 
version = "0.8.40", features = [ "alloc", "derive", ], optional = true } diff --git a/fuzz/Cargo.lock b/fuzz/Cargo.lock index 52cf0e2286..6d98900f31 100644 --- a/fuzz/Cargo.lock +++ b/fuzz/Cargo.lock @@ -1091,9 +1091,9 @@ dependencies = [ [[package]] name = "serde_with" -version = "3.16.1" +version = "3.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fa237f2807440d238e0364a218270b98f767a00d3dada77b1c53ae88940e2e7" +checksum = "381b283ce7bc6b476d903296fb59d0d36633652b633b27f64db4fb46dcbfc3b9" dependencies = [ "serde_core", "serde_with_macros", @@ -1101,9 +1101,9 @@ dependencies = [ [[package]] name = "serde_with_macros" -version = "3.16.1" +version = "3.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52a8e3ca0ca629121f70ab50f95249e5a6f925cc0f6ffe8256c45b728875706c" +checksum = "a6d4e30573c8cb306ed6ab1dca8423eec9a463ea0e155f45399455e0368b27e0" dependencies = [ "darling", "proc-macro2", @@ -1778,18 +1778,18 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.8.39" +version = "0.8.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db6d35d663eadb6c932438e763b262fe1a70987f9ae936e60158176d710cae4a" +checksum = "a789c6e490b576db9f7e6b6d661bcc9799f7c0ac8352f56ea20193b2681532e5" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.39" +version = "0.8.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4122cd3169e94605190e77839c9a40d40ed048d305bfdc146e7df40ab0f3e517" +checksum = "f65c489a7071a749c849713807783f70672b28094011623e200cb86dcb835953" dependencies = [ "proc-macro2", "quote", From 838a4f86c465b733eeb48a24f4d52251f7bc8ef2 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Fri, 27 Feb 2026 14:14:13 -0800 Subject: [PATCH 047/742] tests: move shared constants and helpers to test_infra Move test constants (MAX_NUM_PCI_SEGMENTS, DIRECT_KERNEL_BOOT_CMDLINE, CONSOLE_TEST_STRING), arch-specific 
image name modules (x86_64, aarch64), and helper functions (direct_kernel_boot_path, edk2_path) from integration.rs to test_infra/src/lib.rs. This centralizes shared test definitions so they can be reused across multiple test crates instead of being confined to integration.rs. Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 88 --------------------------- test_infra/src/lib.rs | 88 +++++++++++++++++++++++++++ 2 files changed, 88 insertions(+), 88 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 3cdf1ba8ea..0866249e07 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -29,69 +29,6 @@ use vmm_sys_util::tempdir::TempDir; use vmm_sys_util::tempfile::TempFile; use wait_timeout::ChildExt; -// Constant taken from the VMM crate. -const MAX_NUM_PCI_SEGMENTS: u16 = 96; - -#[cfg(target_arch = "x86_64")] -mod x86_64 { - pub const FOCAL_IMAGE_NAME: &str = "focal-server-cloudimg-amd64-custom-20210609-0.raw"; - pub const JAMMY_VFIO_IMAGE_NAME: &str = - "jammy-server-cloudimg-amd64-custom-vfio-20241012-0.raw"; - pub const FOCAL_IMAGE_NAME_VHD: &str = "focal-server-cloudimg-amd64-custom-20210609-0.vhd"; - pub const FOCAL_IMAGE_NAME_VHDX: &str = "focal-server-cloudimg-amd64-custom-20210609-0.vhdx"; - pub const JAMMY_IMAGE_NAME: &str = "jammy-server-cloudimg-amd64-custom-20241017-0.raw"; - pub const JAMMY_IMAGE_NAME_QCOW2: &str = "jammy-server-cloudimg-amd64-custom-20241017-0.qcow2"; - pub const JAMMY_IMAGE_NAME_QCOW2_ZLIB: &str = - "jammy-server-cloudimg-amd64-custom-20241017-0-zlib.qcow2"; - pub const JAMMY_IMAGE_NAME_QCOW2_ZSTD: &str = - "jammy-server-cloudimg-amd64-custom-20241017-0-zstd.qcow2"; - pub const JAMMY_IMAGE_NAME_QCOW2_BACKING_ZSTD_FILE: &str = - "jammy-server-cloudimg-amd64-custom-20241017-0-backing-zstd.qcow2"; - pub const JAMMY_IMAGE_NAME_QCOW2_BACKING_UNCOMPRESSED_FILE: &str = - 
"jammy-server-cloudimg-amd64-custom-20241017-0-backing-uncompressed.qcow2"; - pub const JAMMY_IMAGE_NAME_QCOW2_BACKING_RAW_FILE: &str = - "jammy-server-cloudimg-amd64-custom-20241017-0-backing-raw.qcow2"; - pub const WINDOWS_IMAGE_NAME: &str = "windows-server-2022-amd64-2.raw"; - pub const OVMF_NAME: &str = "CLOUDHV.fd"; - pub const GREP_SERIAL_IRQ_CMD: &str = "grep -c 'IO-APIC.*ttyS0' /proc/interrupts || true"; -} - -#[cfg(target_arch = "x86_64")] -use x86_64::*; - -#[cfg(target_arch = "aarch64")] -mod aarch64 { - pub const FOCAL_IMAGE_NAME: &str = "focal-server-cloudimg-arm64-custom-20210929-0.raw"; - pub const FOCAL_IMAGE_UPDATE_KERNEL_NAME: &str = - "focal-server-cloudimg-arm64-custom-20210929-0-update-kernel.raw"; - pub const FOCAL_IMAGE_NAME_VHD: &str = "focal-server-cloudimg-arm64-custom-20210929-0.vhd"; - pub const FOCAL_IMAGE_NAME_VHDX: &str = "focal-server-cloudimg-arm64-custom-20210929-0.vhdx"; - pub const JAMMY_IMAGE_NAME: &str = "jammy-server-cloudimg-arm64-custom-20220329-0.raw"; - pub const JAMMY_IMAGE_NAME_QCOW2: &str = "jammy-server-cloudimg-arm64-custom-20220329-0.qcow2"; - pub const JAMMY_IMAGE_NAME_QCOW2_ZLIB: &str = - "jammy-server-cloudimg-arm64-custom-20220329-0-zlib.qcow2"; - pub const JAMMY_IMAGE_NAME_QCOW2_ZSTD: &str = - "jammy-server-cloudimg-arm64-custom-20220329-0-zstd.qcow2"; - pub const JAMMY_IMAGE_NAME_QCOW2_BACKING_ZSTD_FILE: &str = - "jammy-server-cloudimg-arm64-custom-20220329-0-backing-zstd.qcow2"; - pub const JAMMY_IMAGE_NAME_QCOW2_BACKING_UNCOMPRESSED_FILE: &str = - "jammy-server-cloudimg-arm64-custom-20220329-0-backing-uncompressed.qcow2"; - pub const JAMMY_IMAGE_NAME_QCOW2_BACKING_RAW_FILE: &str = - "jammy-server-cloudimg-arm64-custom-20220329-0-backing-raw.qcow2"; - pub const WINDOWS_IMAGE_NAME: &str = "windows-11-iot-enterprise-aarch64.raw"; - pub const OVMF_NAME: &str = "CLOUDHV_EFI.fd"; - pub const GREP_SERIAL_IRQ_CMD: &str = "grep -c 'GICv3.*uart-pl011' /proc/interrupts || true"; - pub const GREP_PMU_IRQ_CMD: &str = 
"grep -c 'GICv3.*arm-pmu' /proc/interrupts || true"; -} - -#[cfg(target_arch = "aarch64")] -use aarch64::*; - -const DIRECT_KERNEL_BOOT_CMDLINE: &str = - "root=/dev/vda1 console=hvc0 rw systemd.journald.forward_to_console=1"; - -const CONSOLE_TEST_STRING: &str = "Started OpenBSD Secure Shell server"; - // This enum exists to make it more convenient to // implement test for both D-Bus and REST APIs. enum TargetApi { @@ -541,31 +478,6 @@ fn temp_vmcore_file_path(tmp_dir: &TempDir) -> String { String::from(tmp_dir.as_path().join("vmcore").to_str().unwrap()) } -// Creates the path for direct kernel boot and return the path. -// For x86_64, this function returns the vmlinux kernel path. -// For AArch64, this function returns the PE kernel path. -fn direct_kernel_boot_path() -> PathBuf { - let mut workload_path = dirs::home_dir().unwrap(); - workload_path.push("workloads"); - - let mut kernel_path = workload_path; - #[cfg(target_arch = "x86_64")] - kernel_path.push("vmlinux-x86_64"); - #[cfg(target_arch = "aarch64")] - kernel_path.push("Image-arm64"); - - kernel_path -} - -fn edk2_path() -> PathBuf { - let mut workload_path = dirs::home_dir().unwrap(); - workload_path.push("workloads"); - let mut edk2_path = workload_path; - edk2_path.push(OVMF_NAME); - - edk2_path -} - fn cloud_hypervisor_release_path() -> String { let mut workload_path = dirs::home_dir().unwrap(); workload_path.push("workloads"); diff --git a/test_infra/src/lib.rs b/test_infra/src/lib.rs index 60bf4ece03..f2b7bb9fbb 100644 --- a/test_infra/src/lib.rs +++ b/test_infra/src/lib.rs @@ -2050,3 +2050,91 @@ fn generate_host_data() -> String { rand::rng().fill_bytes(&mut bytes); bytes.iter().map(|b| format!("{b:02x}")).collect() } + +// Creates the path for direct kernel boot and return the path. +// For x86_64, this function returns the vmlinux kernel path. +// For AArch64, this function returns the PE kernel path. 
+pub fn direct_kernel_boot_path() -> PathBuf { + let mut workload_path = dirs::home_dir().unwrap(); + workload_path.push("workloads"); + + let mut kernel_path = workload_path; + #[cfg(target_arch = "x86_64")] + kernel_path.push("vmlinux-x86_64"); + #[cfg(target_arch = "aarch64")] + kernel_path.push("Image-arm64"); + + kernel_path +} + +pub fn edk2_path() -> PathBuf { + let mut workload_path = dirs::home_dir().unwrap(); + workload_path.push("workloads"); + let mut edk2_path = workload_path; + edk2_path.push(OVMF_NAME); + + edk2_path +} + +pub const DIRECT_KERNEL_BOOT_CMDLINE: &str = + "root=/dev/vda1 console=hvc0 rw systemd.journald.forward_to_console=1"; + +pub const CONSOLE_TEST_STRING: &str = "Started OpenBSD Secure Shell server"; + +// Constant taken from the VMM crate. +pub const MAX_NUM_PCI_SEGMENTS: u16 = 96; + +#[cfg(target_arch = "x86_64")] +pub mod x86_64 { + pub const FOCAL_IMAGE_NAME: &str = "focal-server-cloudimg-amd64-custom-20210609-0.raw"; + pub const JAMMY_VFIO_IMAGE_NAME: &str = + "jammy-server-cloudimg-amd64-custom-vfio-20241012-0.raw"; + pub const FOCAL_IMAGE_NAME_VHD: &str = "focal-server-cloudimg-amd64-custom-20210609-0.vhd"; + pub const FOCAL_IMAGE_NAME_VHDX: &str = "focal-server-cloudimg-amd64-custom-20210609-0.vhdx"; + pub const JAMMY_IMAGE_NAME: &str = "jammy-server-cloudimg-amd64-custom-20241017-0.raw"; + pub const JAMMY_IMAGE_NAME_QCOW2: &str = "jammy-server-cloudimg-amd64-custom-20241017-0.qcow2"; + pub const JAMMY_IMAGE_NAME_QCOW2_ZLIB: &str = + "jammy-server-cloudimg-amd64-custom-20241017-0-zlib.qcow2"; + pub const JAMMY_IMAGE_NAME_QCOW2_ZSTD: &str = + "jammy-server-cloudimg-amd64-custom-20241017-0-zstd.qcow2"; + pub const JAMMY_IMAGE_NAME_QCOW2_BACKING_ZSTD_FILE: &str = + "jammy-server-cloudimg-amd64-custom-20241017-0-backing-zstd.qcow2"; + pub const JAMMY_IMAGE_NAME_QCOW2_BACKING_UNCOMPRESSED_FILE: &str = + "jammy-server-cloudimg-amd64-custom-20241017-0-backing-uncompressed.qcow2"; + pub const JAMMY_IMAGE_NAME_QCOW2_BACKING_RAW_FILE: 
&str = + "jammy-server-cloudimg-amd64-custom-20241017-0-backing-raw.qcow2"; + pub const WINDOWS_IMAGE_NAME: &str = "windows-server-2022-amd64-2.raw"; + pub const OVMF_NAME: &str = "CLOUDHV.fd"; + pub const GREP_SERIAL_IRQ_CMD: &str = "grep -c 'IO-APIC.*ttyS0' /proc/interrupts || true"; +} + +#[cfg(target_arch = "x86_64")] +pub use x86_64::*; + +#[cfg(target_arch = "aarch64")] +pub mod aarch64 { + pub const FOCAL_IMAGE_NAME: &str = "focal-server-cloudimg-arm64-custom-20210929-0.raw"; + pub const FOCAL_IMAGE_UPDATE_KERNEL_NAME: &str = + "focal-server-cloudimg-arm64-custom-20210929-0-update-kernel.raw"; + pub const FOCAL_IMAGE_NAME_VHD: &str = "focal-server-cloudimg-arm64-custom-20210929-0.vhd"; + pub const FOCAL_IMAGE_NAME_VHDX: &str = "focal-server-cloudimg-arm64-custom-20210929-0.vhdx"; + pub const JAMMY_IMAGE_NAME: &str = "jammy-server-cloudimg-arm64-custom-20220329-0.raw"; + pub const JAMMY_IMAGE_NAME_QCOW2: &str = "jammy-server-cloudimg-arm64-custom-20220329-0.qcow2"; + pub const JAMMY_IMAGE_NAME_QCOW2_ZLIB: &str = + "jammy-server-cloudimg-arm64-custom-20220329-0-zlib.qcow2"; + pub const JAMMY_IMAGE_NAME_QCOW2_ZSTD: &str = + "jammy-server-cloudimg-arm64-custom-20220329-0-zstd.qcow2"; + pub const JAMMY_IMAGE_NAME_QCOW2_BACKING_ZSTD_FILE: &str = + "jammy-server-cloudimg-arm64-custom-20220329-0-backing-zstd.qcow2"; + pub const JAMMY_IMAGE_NAME_QCOW2_BACKING_UNCOMPRESSED_FILE: &str = + "jammy-server-cloudimg-arm64-custom-20220329-0-backing-uncompressed.qcow2"; + pub const JAMMY_IMAGE_NAME_QCOW2_BACKING_RAW_FILE: &str = + "jammy-server-cloudimg-arm64-custom-20220329-0-backing-raw.qcow2"; + pub const WINDOWS_IMAGE_NAME: &str = "windows-11-iot-enterprise-aarch64.raw"; + pub const OVMF_NAME: &str = "CLOUDHV_EFI.fd"; + pub const GREP_SERIAL_IRQ_CMD: &str = "grep -c 'GICv3.*uart-pl011' /proc/interrupts || true"; + pub const GREP_PMU_IRQ_CMD: &str = "grep -c 'GICv3.*arm-pmu' /proc/interrupts || true"; +} + +#[cfg(target_arch = "aarch64")] +pub use aarch64::*; From 
f2d3c17e1fa960e934692273fec1954f376b526c Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Fri, 27 Feb 2026 14:33:05 -0800 Subject: [PATCH 048/742] test_infra: set default kernel path and cmdline in Guest Initialize kernel_path and kernel_cmdline with standard direct boot defaults in the Guest constructor instead of None. This removes boilerplate from individual tests that use the common direct kernel boot configuration. Signed-off-by: Muminul Islam --- test_infra/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_infra/src/lib.rs b/test_infra/src/lib.rs index f2b7bb9fbb..345aa3c250 100644 --- a/test_infra/src/lib.rs +++ b/test_infra/src/lib.rs @@ -985,8 +985,8 @@ impl Guest { network, vm_type: GuestVmType::Regular, boot_timeout: DEFAULT_TCP_LISTENER_TIMEOUT, - kernel_path: None, - kernel_cmdline: None, + kernel_path: direct_kernel_boot_path().to_str().map(String::from), + kernel_cmdline: Some(DIRECT_KERNEL_BOOT_CMDLINE.to_string()), console_type: None, num_cpu: 1u32, nested: true, From 85159c9255b35c990fb636e68cd4b7d4ca953b84 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Fri, 27 Feb 2026 18:17:40 -0800 Subject: [PATCH 049/742] tests: Add factory pattern to create guest instances Introduce GuestFactory struct that encapsulates GuestVmType, boot timeout, and nested virtualization defaults. This avoids repeatedly specifying VM type and associated defaults when constructing Guest instances in integration tests. Two factory constructors are provided: - new_regular_guest_factory: default timeout, nested enabled - new_confidential_guest_factory: CVM timeout, nested disabled Multiple create_guest variants allow customizing CPU count, memory size, and nested virtualization while inheriting the factory's VM type and timeout settings. 
Signed-off-by: Muminul Islam --- test_infra/src/lib.rs | 54 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/test_infra/src/lib.rs b/test_infra/src/lib.rs index 345aa3c250..1ba5ceccc0 100644 --- a/test_infra/src/lib.rs +++ b/test_infra/src/lib.rs @@ -998,6 +998,21 @@ impl Guest { Self::new_from_ip_range(disk_config, "192.168", next_guest_id()) } + pub fn with_cpu(mut self, count: u32) -> Self { + self.num_cpu = count; + self + } + + pub fn with_memory(mut self, mem_size: &str) -> Self { + self.mem_size_str = mem_size.to_string(); + self + } + + pub fn with_nested(mut self, nested: bool) -> Self { + self.nested = nested; + self + } + pub fn default_net_string(&self) -> String { format!( "tap=,mac={},ip={},mask=255.255.255.128", @@ -1435,6 +1450,45 @@ impl Guest { } } +// A factory for creating guests with different configurations. The factory is initialized +// with a GuestVmType, and created guests will have the same GuestVmType as the factory. +// This allows creation of guests with different configurations (e.g. regular vs confidential) +// without specifying the GuestVmType each time. +// Based on the VmType, the default timeout for waiting for the VM to boot is also set, +// which is used in the wait_vm_boot() method of the Guest struct. Additionally, nested +// virtualization is disabled by default for confidential VMs, as it is not supported. 
+pub struct GuestFactory { + vm_type: GuestVmType, + boot_timeout: u32, + nested: bool, +} + +impl GuestFactory { + pub fn new_regular_guest_factory() -> Self { + Self { + vm_type: GuestVmType::Regular, + boot_timeout: DEFAULT_TCP_LISTENER_TIMEOUT, + nested: true, + } + } + + pub fn new_confidential_guest_factory() -> Self { + Self { + vm_type: GuestVmType::Confidential, + boot_timeout: DEFAULT_CVM_TCP_LISTENER_TIMEOUT, + nested: false, + } + } + + pub fn create_guest(&self, disk_config: Box) -> Guest { + let mut guest = Guest::new(disk_config); + guest.vm_type = self.vm_type; + guest.boot_timeout = self.boot_timeout; + guest.nested = self.nested; + guest + } +} + #[derive(Default)] pub enum VerbosityLevel { #[default] From f443bae9d9a12c2cc7903c9c295ecf1bd721e3b7 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Fri, 27 Feb 2026 16:02:59 -0800 Subject: [PATCH 050/742] tests: Support CVM in api_create_body() Refactor api_create_body() to branch based on GuestVmType. For confidential VMs, the JSON body now includes: - platform config with sev_snp enabled - IGVM payload with host_data instead of kernel - nested virtualization disabled in CPU config Replace the monolithic format! macro with incremental push_str calls for clearer JSON construction. Also improve error handling in GuestCommand by replacing an unwrap() with a descriptive expect() on the IGVM path. Signed-off-by: Muminul Islam --- test_infra/src/lib.rs | 57 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 47 insertions(+), 10 deletions(-) diff --git a/test_infra/src/lib.rs b/test_infra/src/lib.rs index 1ba5ceccc0..27b608c987 100644 --- a/test_infra/src/lib.rs +++ b/test_infra/src/lib.rs @@ -1084,16 +1084,50 @@ impl Guest { } pub fn api_create_body(&self, cpu_count: u8, kernel_path: &str, kernel_cmd: &str) -> String { - format! 
{"{{\"cpus\":{{\"boot_vcpus\":{},\"max_vcpus\":{}}},\"payload\":{{\"kernel\":\"{}\",\"cmdline\": \"{}\"}},\"net\":[{{\"ip\":\"{}\", \"mask\":\"255.255.255.0\", \"mac\":\"{}\"}}], \"disks\":[{{\"path\":\"{}\"}}, {{\"path\":\"{}\"}}]}}", - cpu_count, - cpu_count, - kernel_path, - kernel_cmd, - self.network.host_ip0, - self.network.guest_mac0, - self.disk_config.disk(DiskType::OperatingSystem).unwrap().as_str(), - self.disk_config.disk(DiskType::CloudInit).unwrap().as_str(), + let mut body = serde_json::json!({ + "cpus": { + "boot_vcpus": cpu_count, + "max_vcpus": cpu_count, + }, + "net": [ + { + "ip": self.network.host_ip0, + "mask": "255.255.255.0", + "mac": self.network.guest_mac0, + } + ], + "disks": [ + { + "path": self.disk_config.disk(DiskType::OperatingSystem).unwrap(), + }, + { + "path": self.disk_config.disk(DiskType::CloudInit).unwrap(), + } + ] + }); + + if !self.nested { + body["cpus"]["nested"] = serde_json::json!(false); } + + if self.vm_type == GuestVmType::Confidential { + body["platform"] = serde_json::json!({"sev_snp": true}); + body["payload"] = serde_json::json!({ + "igvm": direct_igvm_boot_path(Some("hvc0")) + .unwrap() + .to_str() + .unwrap(), + "cmdline": kernel_cmd, + "host_data": generate_host_data(), + }); + } else { + body["payload"] = serde_json::json!({ + "kernel": kernel_path, + "cmdline": kernel_cmd, + }); + } + + body.to_string() } pub fn get_cpu_count(&self) -> Result { @@ -1656,7 +1690,10 @@ impl<'a> GuestCommand<'a> { }; let igvm = direct_igvm_boot_path(Some(console_str)) .expect("IGVM boot file not found for console type: {console_str}"); - self.command.args(["--igvm", igvm.to_str().unwrap()]); + self.command.args([ + "--igvm", + igvm.to_str().expect("IGVM path is not valid UTF-8"), + ]); self.command .args(["--host-data", generate_host_data().as_str()]); self.command.args(["--platform", "sev_snp=on"]); From 7a0019514f8d0d92178bc85a40faa22c969ea2ae Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Fri, 27 Feb 2026 19:14:19 -0800 
Subject: [PATCH 051/742] tests: Make api_create_body() parameterless Refactor api_create_body() to read cpu_count, kernel_path, and kernel_cmdline from Guest fields instead of taking them as parameters. This makes Guest the single source of truth for VM configuration. Update all call sites in HTTP and DBus API tests to use the new parameterless signature. Switch guest creation to use GuestFactory for consistent 4-CPU configuration. Replace manual CPU and memory assertions with validate_cpu_count() and validate_memory() helpers. Replace thread::sleep with wait_vm_boot() in _test_api_create_boot for proper boot synchronization. Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 105 ++++++++++++-------------- test_infra/src/lib.rs | 12 +-- 2 files changed, 55 insertions(+), 62 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 0866249e07..b3613067e2 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -109,12 +109,7 @@ fn _test_api_create_boot(target_api: &TargetApi, guest: &Guest) { assert!(target_api.remote_command("ping", None)); // Create the VM first - let cpu_count: u8 = 4; - let request_body = guest.api_create_body( - cpu_count, - direct_kernel_boot_path().to_str().unwrap(), - DIRECT_KERNEL_BOOT_CMDLINE, - ); + let request_body = guest.api_create_body(); let temp_config_path = guest.tmp_dir.as_path().join("config"); std::fs::write(&temp_config_path, request_body).unwrap(); @@ -124,12 +119,12 @@ fn _test_api_create_boot(target_api: &TargetApi, guest: &Guest) { // Then boot it assert!(target_api.remote_command("boot", None)); - thread::sleep(std::time::Duration::new(20, 0)); let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); // Check that the VM booted as expected - assert_eq!(guest.get_cpu_count().unwrap_or_default() as u8, cpu_count); - assert!(guest.get_total_memory().unwrap_or_default() > 480_000); + 
guest.validate_cpu_count(None); + guest.validate_memory(None); }); kill_child(&mut child); @@ -154,12 +149,7 @@ fn _test_api_shutdown(target_api: &TargetApi, guest: &Guest) { assert!(target_api.remote_command("ping", None)); // Create the VM first - let cpu_count: u8 = 4; - let request_body = guest.api_create_body( - cpu_count, - direct_kernel_boot_path().to_str().unwrap(), - DIRECT_KERNEL_BOOT_CMDLINE, - ); + let request_body = guest.api_create_body(); let temp_config_path = guest.tmp_dir.as_path().join("config"); std::fs::write(&temp_config_path, request_body).unwrap(); @@ -174,8 +164,8 @@ fn _test_api_shutdown(target_api: &TargetApi, guest: &Guest) { guest.wait_vm_boot().unwrap(); // Check that the VM booted as expected - assert_eq!(guest.get_cpu_count().unwrap_or_default() as u8, cpu_count); - assert!(guest.get_total_memory().unwrap_or_default() > 480_000); + guest.validate_cpu_count(None); + guest.validate_memory(None); // Sync and shutdown without powering off to prevent filesystem // corruption. 
@@ -194,8 +184,8 @@ fn _test_api_shutdown(target_api: &TargetApi, guest: &Guest) { guest.wait_vm_boot().unwrap(); // Check that the VM booted as expected - assert_eq!(guest.get_cpu_count().unwrap_or_default() as u8, cpu_count); - assert!(guest.get_total_memory().unwrap_or_default() > 480_000); + guest.validate_cpu_count(None); + guest.validate_memory(None); }); kill_child(&mut child); @@ -220,12 +210,8 @@ fn _test_api_delete(target_api: &TargetApi, guest: &Guest) { assert!(target_api.remote_command("ping", None)); // Create the VM first - let cpu_count: u8 = 4; - let request_body = guest.api_create_body( - cpu_count, - direct_kernel_boot_path().to_str().unwrap(), - DIRECT_KERNEL_BOOT_CMDLINE, - ); + let request_body = guest.api_create_body(); + let temp_config_path = guest.tmp_dir.as_path().join("config"); std::fs::write(&temp_config_path, request_body).unwrap(); let create_config = temp_config_path.as_os_str().to_str().unwrap(); @@ -239,8 +225,8 @@ fn _test_api_delete(target_api: &TargetApi, guest: &Guest) { guest.wait_vm_boot().unwrap(); // Check that the VM booted as expected - assert_eq!(guest.get_cpu_count().unwrap_or_default() as u8, cpu_count); - assert!(guest.get_total_memory().unwrap_or_default() > 480_000); + guest.validate_cpu_count(None); + guest.validate_memory(None); // Sync and shutdown without powering off to prevent filesystem // corruption. 
@@ -261,8 +247,8 @@ fn _test_api_delete(target_api: &TargetApi, guest: &Guest) { guest.wait_vm_boot().unwrap(); // Check that the VM booted as expected - assert_eq!(guest.get_cpu_count().unwrap_or_default() as u8, cpu_count); - assert!(guest.get_total_memory().unwrap_or_default() > 480_000); + guest.validate_cpu_count(None); + guest.validate_memory(None); }); kill_child(&mut child); @@ -288,12 +274,7 @@ fn _test_api_pause_resume(target_api: &TargetApi, guest: &Guest) { assert!(target_api.remote_command("ping", None)); // Create the VM first - let cpu_count: u8 = 4; - let request_body = guest.api_create_body( - cpu_count, - direct_kernel_boot_path().to_str().unwrap(), - DIRECT_KERNEL_BOOT_CMDLINE, - ); + let request_body = guest.api_create_body(); let temp_config_path = guest.tmp_dir.as_path().join("config"); std::fs::write(&temp_config_path, request_body).unwrap(); @@ -307,8 +288,8 @@ fn _test_api_pause_resume(target_api: &TargetApi, guest: &Guest) { let r = std::panic::catch_unwind(|| { // Check that the VM booted as expected - assert_eq!(guest.get_cpu_count().unwrap_or_default() as u8, cpu_count); - assert!(guest.get_total_memory().unwrap_or_default() > 480_000); + guest.validate_cpu_count(None); + guest.validate_memory(None); // We now pause the VM assert!(target_api.remote_command("pause", None)); @@ -336,7 +317,7 @@ fn _test_api_pause_resume(target_api: &TargetApi, guest: &Guest) { thread::sleep(std::time::Duration::new(2, 0)); // Now we should be able to SSH back in and get the right number of CPUs - assert_eq!(guest.get_cpu_count().unwrap_or_default() as u8, cpu_count); + guest.validate_cpu_count(None); }); kill_child(&mut child); @@ -2522,6 +2503,7 @@ mod common_parallel { use std::process::Command; use block::ImageType; + use test_infra::GuestFactory; use crate::*; @@ -5961,7 +5943,9 @@ mod common_parallel { #[test] fn test_api_http_shutdown() { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = 
Guest::new(Box::new(disk_config)); + let guest = GuestFactory::new_regular_guest_factory() + .create_guest(Box::new(disk_config)) + .with_cpu(4); let target_api = TargetApi::new_http_api(&guest.tmp_dir); _test_api_shutdown(&target_api, &guest); @@ -5970,7 +5954,9 @@ mod common_parallel { #[test] fn test_api_http_delete() { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); + let guest = GuestFactory::new_regular_guest_factory() + .create_guest(Box::new(disk_config)) + .with_cpu(4); let target_api = TargetApi::new_http_api(&guest.tmp_dir); _test_api_delete(&target_api, &guest); @@ -5979,7 +5965,9 @@ mod common_parallel { #[test] fn test_api_http_pause_resume() { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); + let guest = GuestFactory::new_regular_guest_factory() + .create_guest(Box::new(disk_config)) + .with_cpu(4); let target_api = TargetApi::new_http_api(&guest.tmp_dir); _test_api_pause_resume(&target_api, &guest); @@ -5988,7 +5976,9 @@ mod common_parallel { #[test] fn test_api_http_create_boot() { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); + let guest = GuestFactory::new_regular_guest_factory() + .create_guest(Box::new(disk_config)) + .with_cpu(4); let target_api = TargetApi::new_http_api(&guest.tmp_dir); _test_api_create_boot(&target_api, &guest); @@ -10105,12 +10095,7 @@ mod dbus_api { assert!(http_api.remote_command("ping", None)); // Create the VM first - let cpu_count: u8 = 4; - let request_body = guest.api_create_body( - cpu_count, - direct_kernel_boot_path().to_str().unwrap(), - DIRECT_KERNEL_BOOT_CMDLINE, - ); + let request_body = guest.api_create_body(); let temp_config_path = guest.tmp_dir.as_path().join("config"); std::fs::write(&temp_config_path, request_body).unwrap(); @@ -10125,8 +10110,8 @@ mod dbus_api { 
guest.wait_vm_boot().unwrap(); // Check that the VM booted as expected - assert_eq!(guest.get_cpu_count().unwrap_or_default() as u8, cpu_count); - assert!(guest.get_total_memory().unwrap_or_default() > 480_000); + guest.validate_cpu_count(None); + guest.validate_memory(None); // Sync and shutdown without powering off to prevent filesystem // corruption. @@ -10144,8 +10129,8 @@ mod dbus_api { guest.wait_vm_boot().unwrap(); // Check that the VM booted as expected - assert_eq!(guest.get_cpu_count().unwrap_or_default() as u8, cpu_count); - assert!(guest.get_total_memory().unwrap_or_default() > 480_000); + guest.validate_cpu_count(None); + guest.validate_memory(None); }); kill_child(&mut child); @@ -10157,7 +10142,9 @@ mod dbus_api { #[test] fn test_api_dbus_create_boot() { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); + let guest = GuestFactory::new_regular_guest_factory() + .create_guest(Box::new(disk_config)) + .with_cpu(4); let target_api = TargetApi::new_dbus_api(&guest.tmp_dir); _test_api_create_boot(&target_api, &guest); @@ -10166,7 +10153,9 @@ mod dbus_api { #[test] fn test_api_dbus_shutdown() { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); + let guest = GuestFactory::new_regular_guest_factory() + .create_guest(Box::new(disk_config)) + .with_cpu(4); let target_api = TargetApi::new_dbus_api(&guest.tmp_dir); _test_api_shutdown(&target_api, &guest); @@ -10175,7 +10164,9 @@ mod dbus_api { #[test] fn test_api_dbus_delete() { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); + let guest = GuestFactory::new_regular_guest_factory() + .create_guest(Box::new(disk_config)) + .with_cpu(4); let target_api = TargetApi::new_dbus_api(&guest.tmp_dir); _test_api_delete(&target_api, &guest); @@ -10184,7 +10175,9 @@ mod dbus_api { #[test] fn 
test_api_dbus_pause_resume() { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); + let guest = GuestFactory::new_regular_guest_factory() + .create_guest(Box::new(disk_config)) + .with_cpu(4); let target_api = TargetApi::new_dbus_api(&guest.tmp_dir); _test_api_pause_resume(&target_api, &guest); diff --git a/test_infra/src/lib.rs b/test_infra/src/lib.rs index 27b608c987..fba95ed977 100644 --- a/test_infra/src/lib.rs +++ b/test_infra/src/lib.rs @@ -1083,11 +1083,11 @@ impl Guest { ) } - pub fn api_create_body(&self, cpu_count: u8, kernel_path: &str, kernel_cmd: &str) -> String { + pub fn api_create_body(&self) -> String { let mut body = serde_json::json!({ "cpus": { - "boot_vcpus": cpu_count, - "max_vcpus": cpu_count, + "boot_vcpus": self.num_cpu, + "max_vcpus": self.num_cpu, }, "net": [ { @@ -1117,13 +1117,13 @@ impl Guest { .unwrap() .to_str() .unwrap(), - "cmdline": kernel_cmd, + "cmdline": self.kernel_cmdline.as_deref().unwrap(), "host_data": generate_host_data(), }); } else { body["payload"] = serde_json::json!({ - "kernel": kernel_path, - "cmdline": kernel_cmd, + "kernel": self.kernel_path.as_deref().unwrap(), + "cmdline": self.kernel_cmdline.as_deref().unwrap(), }); } From b8a6afd332d32f6fa00f9461401b8bdf0853e249 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Fri, 27 Feb 2026 18:19:47 -0800 Subject: [PATCH 052/742] tests: Add CVM HTTP API create/boot test Add test_api_http_create_boot to the common_cvm module using GuestFactory::new_confidential_guest_factory() with 4 CPUs. This extends API create/boot coverage to confidential VMs. 
Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index b3613067e2..4c00f63d5a 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -14555,10 +14555,20 @@ mod common_cvm { #[test] fn test_focal_simple_launch() { let disk_config = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); - let mut guest = Guest::new(Box::new(disk_config)); + let guest = Guest::new(Box::new(disk_config)); guest.vm_type = GuestVmType::Confidential; guest.boot_timeout = DEFAULT_CVM_TCP_LISTENER_TIMEOUT; guest.nested = false; _test_simple_launch(&guest) } + + #[test] + fn test_api_http_create_boot() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = GuestFactory::new_confidential_guest_factory() + .create_guest(Box::new(disk_config)) + .with_cpu(4); + let target_api = TargetApi::new_http_api(&guest.tmp_dir); + _test_api_create_boot(&target_api, &guest); + } } From f6829561e59858c3a9c1415a62298cc8b48a952d Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 28 Feb 2026 16:19:10 -0800 Subject: [PATCH 053/742] tests: Use GuestFactory in CVM simple launch test Replace manual Guest field assignments with GuestFactory::new_confidential_guest_factory() in test_focal_simple_launch for consistent CVM guest creation. 
Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 4c00f63d5a..01bd9a4ddf 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -14555,10 +14555,9 @@ mod common_cvm { #[test] fn test_focal_simple_launch() { let disk_config = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - guest.vm_type = GuestVmType::Confidential; - guest.boot_timeout = DEFAULT_CVM_TCP_LISTENER_TIMEOUT; - guest.nested = false; + let guest = + GuestFactory::new_confidential_guest_factory().create_guest(Box::new(disk_config)); + _test_simple_launch(&guest) } From 7c48aafb655bd3c44d5fbd863ae2076c87a28fbc Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 28 Feb 2026 16:46:18 -0800 Subject: [PATCH 054/742] tests: Add CVM HTTP API shutdown and delete tests Add test_api_http_shutdown and test_api_http_delete to the common_cvm module using GuestFactory with 4 CPUs. Both tests reuse existing _test_api_shutdown and _test_api_delete helpers to extend API coverage to confidential VMs. 
Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 01bd9a4ddf..d9874d2e76 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -14570,4 +14570,26 @@ mod common_cvm { let target_api = TargetApi::new_http_api(&guest.tmp_dir); _test_api_create_boot(&target_api, &guest); } + + #[test] + fn test_api_http_shutdown() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = GuestFactory::new_confidential_guest_factory() + .create_guest(Box::new(disk_config)) + .with_cpu(4); + + let target_api = TargetApi::new_http_api(&guest.tmp_dir); + _test_api_shutdown(&target_api, &guest); + } + + #[test] + fn test_api_http_delete() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = GuestFactory::new_confidential_guest_factory() + .create_guest(Box::new(disk_config)) + .with_cpu(4); + + let target_api = TargetApi::new_http_api(&guest.tmp_dir); + _test_api_delete(&target_api, &guest); + } } From f57b7c5b86fa0764947d020284c0afd710ce5e81 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Wed, 4 Mar 2026 02:04:46 -0800 Subject: [PATCH 055/742] arch: x86_64: Correctly disable nested virtualization on AMD The loop that is for programming the APIC ID and disabling nested virtualization was prematurely breaking out on AMD platforms as the 0x1 leaf is also valid on AMD. This lead to the code attempting to disable SVM in the 0x8000_0001 leaf never being reached. Now only break out early if the CPU vendor is Intel. 
Signed-off-by: Rob Bradford --- arch/src/x86_64/mod.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/src/x86_64/mod.rs b/arch/src/x86_64/mod.rs index 979fd52a9b..8bc5ec19c7 100644 --- a/arch/src/x86_64/mod.rs +++ b/arch/src/x86_64/mod.rs @@ -841,11 +841,13 @@ pub fn configure_vcpu( entry.ebx &= 0xffffff; entry.ebx |= x2apic_id << 24; apic_id_patched = true; - if !nested { - // Disable nested virtualization for Intel - entry.ecx &= !(1 << VMX_ECX_BIT); + if matches!(cpu_vendor, CpuVendor::Intel) { + if !nested { + // Disable nested virtualization for Intel + entry.ecx &= !(1 << VMX_ECX_BIT); + } + break; } - break; } if entry.function == 0x8000_0001 { if !nested { From 67945b31e5fff04571c2d961c39d2bfefef952b8 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Wed, 4 Mar 2026 02:22:14 -0800 Subject: [PATCH 056/742] tests: Add integration test for nested virtualization Since we run integration tests on Intel & AMD this should test the behaviour of `--cpus nested={on|off}` correctly. 
Signed-off-by: Rob Bradford --- cloud-hypervisor/tests/integration.rs | 45 +++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index d9874d2e76..96963bc4ed 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -2616,6 +2616,51 @@ mod common_parallel { handle_child_output(r, &output); } + fn _test_nested_virtualization(nested: bool) { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)).with_nested(nested); + let mut child = GuestCommand::new(&guest) + .default_cpus() + .default_memory() + .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .default_disks() + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + let expected = if nested { "yes" } else { "no" }; + assert_eq!( + guest + .ssh_command("test -c /dev/kvm && echo yes || echo no") + .unwrap() + .trim(), + expected + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); + } + + #[test] + #[cfg(target_arch = "x86_64")] + fn test_nested_virtualization_on() { + _test_nested_virtualization(true); + } + + #[test] + #[cfg(target_arch = "x86_64")] + fn test_nested_virtualization_off() { + _test_nested_virtualization(false); + } + #[test] fn test_cpu_affinity() { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); From a758f8bd8285d6e5b91c728597c35f8c8e3b2bdd Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 4 Mar 2026 23:54:11 +0000 Subject: [PATCH 057/742] build: Bump docker/login-action from 3 to 4 Bumps [docker/login-action](https://github.com/docker/login-action) from 3 to 4. 
- [Release notes](https://github.com/docker/login-action/releases) - [Commits](https://github.com/docker/login-action/compare/v3...v4) --- updated-dependencies: - dependency-name: docker/login-action dependency-version: '4' dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- .github/workflows/docker-image.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker-image.yaml b/.github/workflows/docker-image.yaml index 6a52f1edf0..1dc0df47b7 100644 --- a/.github/workflows/docker-image.yaml +++ b/.github/workflows/docker-image.yaml @@ -27,7 +27,7 @@ jobs: uses: docker/setup-buildx-action@v3 - name: Login to ghcr - uses: docker/login-action@v3 + uses: docker/login-action@v4 with: registry: ${{ env.REGISTRY }} username: ${{ github.actor }} From 1f93fef66f874ede0ca4665dce75f13b0d45455e Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 5 Mar 2026 16:13:48 +0100 Subject: [PATCH 058/742] vmm: Remove dead QcowDeviceCreate error variant The variant has been unused since commit 12e20effd which replaced direct QcowFile creation with QcowDiskSync. 
Signed-off-by: Anatol Belski --- vmm/src/device_manager.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 52e4cddfa0..d1a0f62b9b 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -267,10 +267,6 @@ pub enum DeviceManagerError { #[error("Failed to parse disk image format")] DetectImageType(#[source] io::Error), - /// Cannot open qcow disk path - #[error("Cannot open qcow disk path")] - QcowDeviceCreate(#[source] qcow::Error), - /// Cannot create serial manager #[error("Cannot create serial manager")] CreateSerialManager(#[source] SerialManagerError), From 2495fdc0e7634401cb9c85334a0b1b9532fe8012 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Thu, 5 Mar 2026 21:27:50 +0000 Subject: [PATCH 059/742] vmm: Print out the version information at launch It is useful to see this information in the log while debugging issues. Signed-off-by: Wei Liu --- cloud-hypervisor/src/main.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cloud-hypervisor/src/main.rs b/cloud-hypervisor/src/main.rs index b4d2bdf534..80cd502914 100644 --- a/cloud-hypervisor/src/main.rs +++ b/cloud-hypervisor/src/main.rs @@ -595,6 +595,8 @@ fn start_vmm(cmd_arguments: &ArgMatches) -> Result, Error> { } } + info!("{} starting", env!("BUILD_VERSION")); + let hypervisor = hypervisor::new().map_err(Error::CreateHypervisor)?; #[cfg(feature = "guest_debug")] From 5925a013af84c2f6ef6e856bffce96cec4cfa17a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 5 Mar 2026 23:54:08 +0000 Subject: [PATCH 060/742] build: Bump docker/setup-buildx-action from 3 to 4 Bumps [docker/setup-buildx-action](https://github.com/docker/setup-buildx-action) from 3 to 4. 
- [Release notes](https://github.com/docker/setup-buildx-action/releases) - [Commits](https://github.com/docker/setup-buildx-action/compare/v3...v4) --- updated-dependencies: - dependency-name: docker/setup-buildx-action dependency-version: '4' dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- .github/workflows/docker-image.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker-image.yaml b/.github/workflows/docker-image.yaml index 1dc0df47b7..059aa19672 100644 --- a/.github/workflows/docker-image.yaml +++ b/.github/workflows/docker-image.yaml @@ -24,7 +24,7 @@ jobs: uses: docker/setup-qemu-action@v3 - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 + uses: docker/setup-buildx-action@v4 - name: Login to ghcr uses: docker/login-action@v4 From b7b38df99c3a0522723f4f1897e1e8e5117cf0ff Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 5 Mar 2026 22:59:03 +0100 Subject: [PATCH 061/742] block: raw: Use map_or instead of map().unwrap_or() Do the necessary replacements to satisfy clippy::map_unwrap_or. 
Signed-off-by: Anatol Belski --- block/src/raw_async.rs | 5 ++--- block/src/raw_async_aio.rs | 5 ++--- block/src/raw_sync.rs | 5 ++--- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/block/src/raw_async.rs b/block/src/raw_async.rs index 3a890d716f..152e5fa3ba 100644 --- a/block/src/raw_async.rs +++ b/block/src/raw_async.rs @@ -42,9 +42,8 @@ impl DiskFile for RawFileDisk { fn new_async_io(&self, ring_depth: u32) -> DiskFileResult> { let mut raw = RawFileAsync::new(self.file.as_raw_fd(), ring_depth) .map_err(DiskFileError::NewAsyncIo)?; - raw.alignment = DiskTopology::probe(&self.file) - .map(|t| t.logical_block_size) - .unwrap_or(SECTOR_SIZE); + raw.alignment = + DiskTopology::probe(&self.file).map_or(SECTOR_SIZE, |t| t.logical_block_size); Ok(Box::new(raw) as Box) } diff --git a/block/src/raw_async_aio.rs b/block/src/raw_async_aio.rs index 7266a3633a..fe7196ebba 100644 --- a/block/src/raw_async_aio.rs +++ b/block/src/raw_async_aio.rs @@ -45,9 +45,8 @@ impl DiskFile for RawFileDiskAio { fn new_async_io(&self, ring_depth: u32) -> DiskFileResult> { let mut raw = RawFileAsyncAio::new(self.file.as_raw_fd(), ring_depth) .map_err(DiskFileError::NewAsyncIo)?; - raw.alignment = DiskTopology::probe(&self.file) - .map(|t| t.logical_block_size) - .unwrap_or(SECTOR_SIZE); + raw.alignment = + DiskTopology::probe(&self.file).map_or(SECTOR_SIZE, |t| t.logical_block_size); Ok(Box::new(raw) as Box) } diff --git a/block/src/raw_sync.rs b/block/src/raw_sync.rs index 9c96863b69..9c2d6b7893 100644 --- a/block/src/raw_sync.rs +++ b/block/src/raw_sync.rs @@ -41,9 +41,8 @@ impl DiskFile for RawFileDiskSync { fn new_async_io(&self, _ring_depth: u32) -> DiskFileResult> { let mut raw = RawFileSync::new(self.file.as_raw_fd()); - raw.alignment = DiskTopology::probe(&self.file) - .map(|t| t.logical_block_size) - .unwrap_or(SECTOR_SIZE); + raw.alignment = + DiskTopology::probe(&self.file).map_or(SECTOR_SIZE, |t| t.logical_block_size); Ok(Box::new(raw) as Box) } From 
ff39c35ac21fb17c7a96622d9d13f605dbe86b1d Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 5 Mar 2026 23:12:18 +0100 Subject: [PATCH 062/742] arch: x86_64: Collapse nested if into match arm guards Do the necessary replacements to satisfy clippy::collapsible_match. Signed-off-by: Anatol Belski --- arch/src/x86_64/mod.rs | 96 ++++++++++++++++++++---------------------- 1 file changed, 46 insertions(+), 50 deletions(-) diff --git a/arch/src/x86_64/mod.rs b/arch/src/x86_64/mod.rs index 8bc5ec19c7..2b9ce38122 100644 --- a/arch/src/x86_64/mod.rs +++ b/arch/src/x86_64/mod.rs @@ -643,15 +643,13 @@ pub fn generate_common_cpuid( for entry in cpuid.as_mut_slice().iter_mut() { match entry.function { // Clear AMX related bits if the AMX feature is not enabled - 0x7 => { - if !config.amx { - if entry.index == 0 { - entry.edx &= !((1 << AMX_BF16) | (1 << AMX_TILE) | (1 << AMX_INT8)); - } - if entry.index == 1 { - entry.eax &= !(1 << AMX_FP16); - entry.edx &= !(1 << AMX_COMPLEX); - } + 0x7 if !config.amx => { + if entry.index == 0 { + entry.edx &= !((1 << AMX_BF16) | (1 << AMX_TILE) | (1 << AMX_INT8)); + } + if entry.index == 1 { + entry.eax &= !(1 << AMX_FP16); + entry.edx &= !(1 << AMX_COMPLEX); } } 0xd => @@ -673,55 +671,53 @@ pub fn generate_common_cpuid( } } } - 0x1d => { - // Tile Information (purely AMX related). - if !config.amx { - entry.eax = 0; - entry.ebx = 0; - entry.ecx = 0; - entry.edx = 0; - } + // Tile Information (purely AMX related). 
+ 0x1d if !config.amx => { + entry.eax = 0; + entry.ebx = 0; + entry.ecx = 0; + entry.edx = 0; } - 0x1e => { - // TMUL information (purely AMX related) - if !config.amx { - entry.eax = 0; - entry.ebx = 0; - entry.ecx = 0; - entry.edx = 0; - } + // TMUL information (purely AMX related) + 0x1e if !config.amx => { + entry.eax = 0; + entry.ebx = 0; + entry.ecx = 0; + entry.edx = 0; } // Copy host L1 cache details if not populated by KVM - 0x8000_0005 => { - if entry.eax == 0 && entry.ebx == 0 && entry.ecx == 0 && entry.edx == 0 { - #[allow(unused_unsafe)] + 0x8000_0005 + if entry.eax == 0 + && entry.ebx == 0 + && entry.ecx == 0 + && entry.edx == 0 // SAFETY: cpuid called with valid leaves - if unsafe { std::arch::x86_64::__cpuid(0x8000_0000).eax } >= 0x8000_0005 { - // SAFETY: cpuid called with valid leaves - let leaf = unsafe { std::arch::x86_64::__cpuid(0x8000_0005) }; - entry.eax = leaf.eax; - entry.ebx = leaf.ebx; - entry.ecx = leaf.ecx; - entry.edx = leaf.edx; - } - } + && unsafe { std::arch::x86_64::__cpuid(0x8000_0000).eax } >= 0x8000_0005 => + { + // SAFETY: cpuid called with valid leaves + let leaf = unsafe { std::arch::x86_64::__cpuid(0x8000_0005) }; + entry.eax = leaf.eax; + entry.ebx = leaf.ebx; + entry.ecx = leaf.ecx; + entry.edx = leaf.edx; } // Copy host L2 cache details if not populated by KVM - 0x8000_0006 => { - if entry.eax == 0 && entry.ebx == 0 && entry.ecx == 0 && entry.edx == 0 { - #[allow(unused_unsafe)] + 0x8000_0006 + if entry.eax == 0 + && entry.ebx == 0 + && entry.ecx == 0 + && entry.edx == 0 // SAFETY: cpuid called with valid leaves - if unsafe { std::arch::x86_64::__cpuid(0x8000_0000).eax } >= 0x8000_0006 { - #[allow(unused_unsafe)] - // SAFETY: cpuid called with valid leaves - let leaf = unsafe { std::arch::x86_64::__cpuid(0x8000_0006) }; - entry.eax = leaf.eax; - entry.ebx = leaf.ebx; - entry.ecx = leaf.ecx; - entry.edx = leaf.edx; - } - } + && unsafe { std::arch::x86_64::__cpuid(0x8000_0000).eax } >= 0x8000_0006 => + { + 
#[allow(unused_unsafe)] + // SAFETY: cpuid called with valid leaves + let leaf = unsafe { std::arch::x86_64::__cpuid(0x8000_0006) }; + entry.eax = leaf.eax; + entry.ebx = leaf.ebx; + entry.ecx = leaf.ecx; + entry.edx = leaf.edx; } // Set CPU physical bits 0x8000_0008 => { From 89107d2db4b2e018cd497707cc18179fe3f5ef2b Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 5 Mar 2026 23:27:53 +0100 Subject: [PATCH 063/742] arch: x86_64: Allow unused_unsafe on cpuid match arms The nightly compiler used by cargo fuzz no longer requires unsafe for __cpuid intrinsics, but stable still does. Signed-off-by: Anatol Belski --- arch/src/x86_64/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/src/x86_64/mod.rs b/arch/src/x86_64/mod.rs index 2b9ce38122..d35a878e61 100644 --- a/arch/src/x86_64/mod.rs +++ b/arch/src/x86_64/mod.rs @@ -641,6 +641,7 @@ pub fn generate_common_cpuid( // Update some existing CPUID for entry in cpuid.as_mut_slice().iter_mut() { + #[allow(unused_unsafe)] match entry.function { // Clear AMX related bits if the AMX feature is not enabled 0x7 if !config.amx => { @@ -711,7 +712,6 @@ pub fn generate_common_cpuid( // SAFETY: cpuid called with valid leaves && unsafe { std::arch::x86_64::__cpuid(0x8000_0000).eax } >= 0x8000_0006 => { - #[allow(unused_unsafe)] // SAFETY: cpuid called with valid leaves let leaf = unsafe { std::arch::x86_64::__cpuid(0x8000_0006) }; entry.eax = leaf.eax; From f1c33afc8e5fcc94be14cea01a1b58a9259349bc Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 5 Mar 2026 23:38:13 +0100 Subject: [PATCH 064/742] virtio-devices: vsock: Simplify discarded accept result Do the necessary replacements to satisfy clippy::map_unwrap_or. 
Signed-off-by: Anatol Belski --- virtio-devices/src/vsock/unix/muxer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtio-devices/src/vsock/unix/muxer.rs b/virtio-devices/src/vsock/unix/muxer.rs index edce5b1e03..1a8570b75d 100644 --- a/virtio-devices/src/vsock/unix/muxer.rs +++ b/virtio-devices/src/vsock/unix/muxer.rs @@ -409,7 +409,7 @@ impl VsockMuxer { // If we're already maxed-out on connections, we'll just accept and // immediately discard this potentially new one. warn!("vsock: connection limit reached; refusing new host connection"); - self.host_sock.accept().map(|_| 0).unwrap_or(0); + let _ = self.host_sock.accept(); return; } self.host_sock From 3c62fabfc3c8676272e27b36a671b5a88d290312 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 5 Mar 2026 23:43:57 +0100 Subject: [PATCH 065/742] vmm: Collapse nested if into match arm guards Do the necessary replacements to satisfy clippy::collapsible_match. Signed-off-by: Anatol Belski --- vmm/src/lib.rs | 1 + vmm/src/serial_manager.rs | 49 ++++++++++++++++++++------------------- 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 9ffd7fc0bc..5c75a2db0e 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -735,6 +735,7 @@ impl Vmm { for signal in signals.forever() { match signal { + #[allow(clippy::collapsible_match)] SIGTERM | SIGINT => { if exit_evt.write(1).is_err() { // Resetting the terminal is usually done as the VMM exits diff --git a/vmm/src/serial_manager.rs b/vmm/src/serial_manager.rs index 5f8de1874a..8a0d391d67 100644 --- a/vmm/src/serial_manager.rs +++ b/vmm/src/serial_manager.rs @@ -132,33 +132,34 @@ impl SerialManager { let in_fd = match output { ConsoleOutput::Pty(ref fd) => fd.as_raw_fd(), - ConsoleOutput::Tty(_) => { + ConsoleOutput::Tty(_) // If running on an interactive TTY then accept input // SAFETY: trivially safe - if unsafe { libc::isatty(libc::STDIN_FILENO) == 1 } { - // SAFETY: STDIN_FILENO is a valid fd - let fd 
= unsafe { libc::dup(libc::STDIN_FILENO) }; - if fd == -1 { - return Err(Error::DupFd(std::io::Error::last_os_error())); - } - // SAFETY: fd is valid and owned by us - let stdin_clone = unsafe { File::from_raw_fd(fd) }; - // SAFETY: FFI calls with correct arguments - let ret = unsafe { - let mut flags = libc::fcntl(stdin_clone.as_raw_fd(), libc::F_GETFL); - flags |= libc::O_NONBLOCK; - libc::fcntl(stdin_clone.as_raw_fd(), libc::F_SETFL, flags) - }; - - if ret < 0 { - return Err(Error::SetNonBlocking(std::io::Error::last_os_error())); - } - - output = ConsoleOutput::Tty(Arc::new(stdin_clone)); - fd - } else { - return Ok(None); + if unsafe { libc::isatty(libc::STDIN_FILENO) == 1 } => + { + // SAFETY: STDIN_FILENO is a valid fd + let fd = unsafe { libc::dup(libc::STDIN_FILENO) }; + if fd == -1 { + return Err(Error::DupFd(std::io::Error::last_os_error())); } + // SAFETY: fd is valid and owned by us + let stdin_clone = unsafe { File::from_raw_fd(fd) }; + // SAFETY: FFI calls with correct arguments + let ret = unsafe { + let mut flags = libc::fcntl(stdin_clone.as_raw_fd(), libc::F_GETFL); + flags |= libc::O_NONBLOCK; + libc::fcntl(stdin_clone.as_raw_fd(), libc::F_SETFL, flags) + }; + + if ret < 0 { + return Err(Error::SetNonBlocking(std::io::Error::last_os_error())); + } + + output = ConsoleOutput::Tty(Arc::new(stdin_clone)); + fd + } + ConsoleOutput::Tty(_) => { + return Ok(None); } ConsoleOutput::Socket(ref fd) => { if let Some(path_in_socket) = socket { From e61349c10e83c9bdf8d53b5d514db955290d0db8 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 6 Mar 2026 00:31:21 +0100 Subject: [PATCH 066/742] vmm: gdb: Use map_or instead of map().unwrap_or() Do the necessary replacement to satisfy clippy::map_unwrap_or. 
Signed-off-by: Anatol Belski --- vmm/src/gdb.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vmm/src/gdb.rs b/vmm/src/gdb.rs index fc24767d9c..82a5d63a9d 100644 --- a/vmm/src/gdb.rs +++ b/vmm/src/gdb.rs @@ -484,7 +484,7 @@ impl run_blocking::BlockingEventLoop for GdbEventLoop { } } - if conn.peek().map(|b| b.is_some()).unwrap_or(true) { + if conn.peek().map_or(true, |b| b.is_some()) { let byte = conn .read() .map_err(run_blocking::WaitForStopReasonError::Connection)?; From 4ebbbe2294ce4c9f6d49a7fa7de7fe461bce4ab8 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 6 Mar 2026 00:39:57 +0100 Subject: [PATCH 067/742] vmm: igvm: Use sort_by_key instead of sort_by Do the necessary replacement to satisfy clippy::unnecessary_sort_by. Signed-off-by: Anatol Belski --- vmm/src/igvm/igvm_loader.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vmm/src/igvm/igvm_loader.rs b/vmm/src/igvm/igvm_loader.rs index 4d454f8223..6e256c1ecb 100644 --- a/vmm/src/igvm/igvm_loader.rs +++ b/vmm/src/igvm/igvm_loader.rs @@ -433,7 +433,7 @@ pub fn load_igvm( let mut now = Instant::now(); // Sort the gpas to group them by the page type - gpas.sort_by(|a, b| a.gpa.cmp(&b.gpa)); + gpas.sort_by_key(|a| a.gpa); let gpas_grouped = gpas .iter() From 5e2539ae1659ec97157f79510ea53fafdc2be68d Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Fri, 6 Mar 2026 08:22:05 -0800 Subject: [PATCH 068/742] vmm: serial_manager: Use more appropriately named variable This isn't an fd - rather it's the UnixListener struct. 
Signed-off-by: Rob Bradford --- vmm/src/serial_manager.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vmm/src/serial_manager.rs b/vmm/src/serial_manager.rs index 8a0d391d67..45edec95eb 100644 --- a/vmm/src/serial_manager.rs +++ b/vmm/src/serial_manager.rs @@ -161,11 +161,11 @@ impl SerialManager { ConsoleOutput::Tty(_) => { return Ok(None); } - ConsoleOutput::Socket(ref fd) => { + ConsoleOutput::Socket(ref listener) => { if let Some(path_in_socket) = socket { socket_path = Some(path_in_socket.clone()); } - fd.as_raw_fd() + listener.as_raw_fd() } _ => return Ok(None), }; From 48dcfc5fd04a4de91718d2975d67a85a46b4916b Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Fri, 6 Mar 2026 08:33:53 -0800 Subject: [PATCH 069/742] vmm: Rename ConsoleOutput to ConsoleTransport This is not just used for determine the output but also the input to the console where this can be bidirectional. Signed-off-by: Rob Bradford --- vmm/src/console_devices.rs | 40 +++++++++++++++---------------- vmm/src/device_manager.rs | 49 ++++++++++++++++++++------------------ vmm/src/serial_manager.rs | 37 ++++++++++++++-------------- 3 files changed, 64 insertions(+), 62 deletions(-) diff --git a/vmm/src/console_devices.rs b/vmm/src/console_devices.rs index 76655d6c16..066f649afa 100644 --- a/vmm/src/console_devices.rs +++ b/vmm/src/console_devices.rs @@ -56,7 +56,7 @@ pub enum ConsoleDeviceError { type ConsoleDeviceResult = result::Result; #[derive(Clone)] -pub enum ConsoleOutput { +pub enum ConsoleTransport { File(Arc), Pty(Arc), Tty(Arc), @@ -67,10 +67,10 @@ pub enum ConsoleOutput { #[derive(Clone)] pub struct ConsoleInfo { - pub console_main_fd: ConsoleOutput, - pub serial_main_fd: ConsoleOutput, + pub console_main_fd: ConsoleTransport, + pub serial_main_fd: ConsoleTransport, #[cfg(target_arch = "x86_64")] - pub debug_main_fd: ConsoleOutput, + pub debug_main_fd: ConsoleTransport, } fn modify_mode( @@ -185,7 +185,7 @@ pub(crate) fn pre_create_console_devices(vmm: &mut Vmm) 
-> ConsoleDeviceResult { let file = File::create(vmconfig.console.file.as_ref().unwrap()) .map_err(ConsoleDeviceError::CreateConsoleDevice)?; - ConsoleOutput::File(Arc::new(file)) + ConsoleTransport::File(Arc::new(file)) } ConsoleOutputMode::Pty => { let (main_fd, sub_fd, path) = @@ -200,7 +200,7 @@ pub(crate) fn pre_create_console_devices(vmm: &mut Vmm) -> ConsoleDeviceResult { // Duplicating the file descriptors like this is needed as otherwise @@ -222,26 +222,26 @@ pub(crate) fn pre_create_console_devices(vmm: &mut Vmm) -> ConsoleDeviceResult { return Err(ConsoleDeviceError::NoSocketOptionSupportForConsoleDevice); } - ConsoleOutputMode::Null => ConsoleOutput::Null, - ConsoleOutputMode::Off => ConsoleOutput::Off, + ConsoleOutputMode::Null => ConsoleTransport::Null, + ConsoleOutputMode::Off => ConsoleTransport::Off, }, serial_main_fd: match vmconfig.serial.mode { ConsoleOutputMode::File => { let file = File::create(vmconfig.serial.file.as_ref().unwrap()) .map_err(ConsoleDeviceError::CreateConsoleDevice)?; - ConsoleOutput::File(Arc::new(file)) + ConsoleTransport::File(Arc::new(file)) } ConsoleOutputMode::Pty => { let (main_fd, sub_fd, path) = create_pty().map_err(ConsoleDeviceError::CreateConsoleDevice)?; set_raw_mode(&sub_fd.as_raw_fd(), &mut original_termios_opt)?; vmconfig.serial.file = Some(path.clone()); - ConsoleOutput::Pty(Arc::new(main_fd)) + ConsoleTransport::Pty(Arc::new(main_fd)) } ConsoleOutputMode::Tty => { // During vm_shutdown, when serial device is closed, FD#2(STDOUT) @@ -257,41 +257,41 @@ pub(crate) fn pre_create_console_devices(vmm: &mut Vmm) -> ConsoleDeviceResult { let listener = UnixListener::bind(vmconfig.serial.socket.as_ref().unwrap()) .map_err(ConsoleDeviceError::CreateConsoleDevice)?; - ConsoleOutput::Socket(Arc::new(listener)) + ConsoleTransport::Socket(Arc::new(listener)) } - ConsoleOutputMode::Null => ConsoleOutput::Null, - ConsoleOutputMode::Off => ConsoleOutput::Off, + ConsoleOutputMode::Null => ConsoleTransport::Null, + 
ConsoleOutputMode::Off => ConsoleTransport::Off, }, #[cfg(target_arch = "x86_64")] debug_main_fd: match vmconfig.debug_console.mode { ConsoleOutputMode::File => { let file = File::create(vmconfig.debug_console.file.as_ref().unwrap()) .map_err(ConsoleDeviceError::CreateConsoleDevice)?; - ConsoleOutput::File(Arc::new(file)) + ConsoleTransport::File(Arc::new(file)) } ConsoleOutputMode::Pty => { let (main_fd, sub_fd, path) = create_pty().map_err(ConsoleDeviceError::CreateConsoleDevice)?; set_raw_mode(&sub_fd.as_raw_fd(), &mut original_termios_opt)?; vmconfig.debug_console.file = Some(path.clone()); - ConsoleOutput::Pty(Arc::new(main_fd)) + ConsoleTransport::Pty(Arc::new(main_fd)) } ConsoleOutputMode::Tty => { let out = dup_stdout().map_err(|e| ConsoleDeviceError::CreateConsoleDevice(e.into()))?; set_raw_mode(&out, &mut original_termios_opt)?; - ConsoleOutput::Tty(Arc::new(out)) + ConsoleTransport::Tty(Arc::new(out)) } ConsoleOutputMode::Socket => { return Err(ConsoleDeviceError::NoSocketOptionSupportForConsoleDevice); } - ConsoleOutputMode::Null => ConsoleOutput::Null, - ConsoleOutputMode::Off => ConsoleOutput::Off, + ConsoleOutputMode::Null => ConsoleTransport::Null, + ConsoleOutputMode::Off => ConsoleTransport::Off, }, }; diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index d1a0f62b9b..014de4c0da 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -116,7 +116,7 @@ use vm_migration::{ use vm_virtio::{AccessPlatform, VirtioDeviceType}; use vmm_sys_util::eventfd::EventFd; -use crate::console_devices::{ConsoleDeviceError, ConsoleInfo, ConsoleOutput}; +use crate::console_devices::{ConsoleDeviceError, ConsoleInfo, ConsoleTransport}; use crate::cpu::{CPU_MANAGER_ACPI_SIZE, CpuManager}; use crate::device_tree::{DeviceNode, DeviceTree}; use crate::interrupt::{LegacyUserspaceInterruptManager, MsiInterruptManager}; @@ -2334,17 +2334,17 @@ impl DeviceManager { fn add_virtio_console_device( &mut self, - console_fd: ConsoleOutput, + 
console_fd: ConsoleTransport, resize_pipe: Option>, ) -> DeviceManagerResult>> { let console_config = self.config.lock().unwrap().console.clone(); let endpoint = match console_fd { - ConsoleOutput::File(file) => Endpoint::File(file), - ConsoleOutput::Pty(file) => { + ConsoleTransport::File(file) => Endpoint::File(file), + ConsoleTransport::Pty(file) => { self.console_resize_pipe = resize_pipe; Endpoint::PtyPair(Arc::new(file.try_clone().unwrap()), file) } - ConsoleOutput::Tty(stdout) => { + ConsoleTransport::Tty(stdout) => { if stdout.is_terminal() { self.console_resize_pipe = resize_pipe; } @@ -2365,11 +2365,11 @@ impl DeviceManager { Endpoint::File(stdout) } } - ConsoleOutput::Socket(_) => { + ConsoleTransport::Socket(_) => { return Err(DeviceManagerError::NoSocketOptionSupportForConsoleDevice); } - ConsoleOutput::Null => Endpoint::Null, - ConsoleOutput::Off => return Ok(None), + ConsoleTransport::Null => Endpoint::Null, + ConsoleTransport::Off => return Ok(None), }; let id = String::from(CONSOLE_DEVICE_NAME); @@ -2434,19 +2434,21 @@ impl DeviceManager { let console_info = console_info.unwrap(); let serial_writer: Option> = match console_info.serial_main_fd { - ConsoleOutput::File(ref file) | ConsoleOutput::Tty(ref file) => { + ConsoleTransport::File(ref file) | ConsoleTransport::Tty(ref file) => { Some(Box::new(Arc::clone(file))) } - ConsoleOutput::Off - | ConsoleOutput::Null - | ConsoleOutput::Pty(_) - | ConsoleOutput::Socket(_) => None, + ConsoleTransport::Off + | ConsoleTransport::Null + | ConsoleTransport::Pty(_) + | ConsoleTransport::Socket(_) => None, }; - if !matches!(console_info.serial_main_fd, ConsoleOutput::Off) { + if !matches!(console_info.serial_main_fd, ConsoleTransport::Off) { let serial = self.add_serial_device(interrupt_manager, serial_writer)?; self.serial_manager = match console_info.serial_main_fd { - ConsoleOutput::Pty(_) | ConsoleOutput::Tty(_) | ConsoleOutput::Socket(_) => { + ConsoleTransport::Pty(_) + | ConsoleTransport::Tty(_) + | 
ConsoleTransport::Socket(_) => { let serial_manager = SerialManager::new( serial, console_info.serial_main_fd, @@ -2472,14 +2474,15 @@ impl DeviceManager { #[cfg(target_arch = "x86_64")] { - let debug_console_writer: Option> = - match console_info.debug_main_fd { - ConsoleOutput::File(file) | ConsoleOutput::Tty(file) => Some(Box::new(file)), - ConsoleOutput::Off - | ConsoleOutput::Null - | ConsoleOutput::Pty(_) - | ConsoleOutput::Socket(_) => None, - }; + let debug_console_writer: Option> = match console_info + .debug_main_fd + { + ConsoleTransport::File(file) | ConsoleTransport::Tty(file) => Some(Box::new(file)), + ConsoleTransport::Off + | ConsoleTransport::Null + | ConsoleTransport::Pty(_) + | ConsoleTransport::Socket(_) => None, + }; if let Some(writer) = debug_console_writer { let _ = self.add_debug_console_device(writer)?; } diff --git a/vmm/src/serial_manager.rs b/vmm/src/serial_manager.rs index 45edec95eb..e5d5ad949a 100644 --- a/vmm/src/serial_manager.rs +++ b/vmm/src/serial_manager.rs @@ -24,7 +24,7 @@ use serial_buffer::SerialBuffer; use thiserror::Error; use vmm_sys_util::eventfd::EventFd; -use crate::console_devices::ConsoleOutput; +use crate::console_devices::ConsoleTransport; #[derive(Debug, Error)] pub enum Error { @@ -114,7 +114,7 @@ pub struct SerialManager { #[cfg(target_arch = "aarch64")] serial: Arc>, epoll_file: File, - in_file: ConsoleOutput, + in_file: ConsoleTransport, kill_evt: EventFd, handle: Option>, pty_write_out: Option>, @@ -125,14 +125,14 @@ impl SerialManager { pub fn new( #[cfg(any(target_arch = "x86_64", target_arch = "riscv64"))] serial: Arc>, #[cfg(target_arch = "aarch64")] serial: Arc>, - mut output: ConsoleOutput, + mut output: ConsoleTransport, socket: Option, ) -> Result> { let mut socket_path: Option = None; let in_fd = match output { - ConsoleOutput::Pty(ref fd) => fd.as_raw_fd(), - ConsoleOutput::Tty(_) + ConsoleTransport::Pty(ref fd) => fd.as_raw_fd(), + ConsoleTransport::Tty(_) // If running on an interactive TTY then 
accept input // SAFETY: trivially safe if unsafe { libc::isatty(libc::STDIN_FILENO) == 1 } => @@ -155,13 +155,13 @@ impl SerialManager { return Err(Error::SetNonBlocking(std::io::Error::last_os_error())); } - output = ConsoleOutput::Tty(Arc::new(stdin_clone)); + output = ConsoleTransport::Tty(Arc::new(stdin_clone)); fd } - ConsoleOutput::Tty(_) => { + ConsoleTransport::Tty(_) => { return Ok(None); } - ConsoleOutput::Socket(ref listener) => { + ConsoleTransport::Socket(ref listener) => { if let Some(path_in_socket) = socket { socket_path = Some(path_in_socket.clone()); } @@ -181,7 +181,7 @@ impl SerialManager { ) .map_err(Error::Epoll)?; - let epoll_fd_data = if let ConsoleOutput::Socket(_) = output { + let epoll_fd_data = if let ConsoleTransport::Socket(_) = output { EpollDispatch::Socket } else { EpollDispatch::File @@ -196,7 +196,7 @@ impl SerialManager { .map_err(Error::Epoll)?; let mut pty_write_out = None; - if let ConsoleOutput::Pty(ref file) = output { + if let ConsoleTransport::Pty(ref file) = output { let write_out = Arc::new(AtomicBool::new(false)); pty_write_out = Some(write_out.clone()); let writer = file.try_clone().map_err(Error::FileClone)?; @@ -295,7 +295,7 @@ impl SerialManager { } }; - if matches!(in_file, ConsoleOutput::Pty(_)) && num_events == 0 { + if matches!(in_file, ConsoleTransport::Pty(_)) && num_events == 0 { // This very specific case happens when the serial is connected // to a PTY. We know EPOLLHUP is always present when there's nothing // connected at the other end of the PTY. 
That's why getting no event @@ -320,7 +320,7 @@ impl SerialManager { .map_err(Error::AcceptConnection)?; } - let ConsoleOutput::Socket(ref listener) = in_file else { + let ConsoleTransport::Socket(ref listener) = in_file else { unreachable!(); }; @@ -349,7 +349,7 @@ impl SerialManager { if event.events & libc::EPOLLIN as u32 != 0 { let mut input = [0u8; 64]; let count = match &in_file { - ConsoleOutput::Socket(_) => { + ConsoleTransport::Socket(_) => { if let Some(mut serial_reader) = reader.as_ref() { let count = serial_reader .read(&mut input) @@ -371,11 +371,10 @@ impl SerialManager { 0 } } - ConsoleOutput::Pty(file) | ConsoleOutput::Tty(file) => { - (&**file) - .read(&mut input) - .map_err(Error::ReadInput)? - } + ConsoleTransport::Pty(file) + | ConsoleTransport::Tty(file) => (&**file) + .read(&mut input) + .map_err(Error::ReadInput)?, _ => unreachable!(), }; @@ -432,7 +431,7 @@ impl Drop for SerialManager { if let Some(handle) = self.handle.take() { handle.join().ok(); } - if let ConsoleOutput::Socket(_) = self.in_file + if let ConsoleTransport::Socket(_) = self.in_file && let Some(socket_path) = self.socket_path.as_ref() { std::fs::remove_file(socket_path.as_os_str()) From f3d2d6c6692e75fcc8a2cf69f9b7f18a0101b418 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Fri, 6 Mar 2026 08:41:18 -0800 Subject: [PATCH 070/742] vmm: serial_manager: Be consistent with ConsoleTransport variable Ensure that the same variable name is used for the transport. 
Signed-off-by: Rob Bradford --- vmm/src/serial_manager.rs | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/vmm/src/serial_manager.rs b/vmm/src/serial_manager.rs index e5d5ad949a..5a61b39e58 100644 --- a/vmm/src/serial_manager.rs +++ b/vmm/src/serial_manager.rs @@ -114,7 +114,7 @@ pub struct SerialManager { #[cfg(target_arch = "aarch64")] serial: Arc>, epoll_file: File, - in_file: ConsoleTransport, + transport: ConsoleTransport, kill_evt: EventFd, handle: Option>, pty_write_out: Option>, @@ -125,12 +125,12 @@ impl SerialManager { pub fn new( #[cfg(any(target_arch = "x86_64", target_arch = "riscv64"))] serial: Arc>, #[cfg(target_arch = "aarch64")] serial: Arc>, - mut output: ConsoleTransport, + mut transport: ConsoleTransport, socket: Option, ) -> Result> { let mut socket_path: Option = None; - let in_fd = match output { + let in_fd = match transport { ConsoleTransport::Pty(ref fd) => fd.as_raw_fd(), ConsoleTransport::Tty(_) // If running on an interactive TTY then accept input @@ -155,7 +155,7 @@ impl SerialManager { return Err(Error::SetNonBlocking(std::io::Error::last_os_error())); } - output = ConsoleTransport::Tty(Arc::new(stdin_clone)); + transport = ConsoleTransport::Tty(Arc::new(stdin_clone)); fd } ConsoleTransport::Tty(_) => { @@ -181,7 +181,7 @@ impl SerialManager { ) .map_err(Error::Epoll)?; - let epoll_fd_data = if let ConsoleTransport::Socket(_) = output { + let epoll_fd_data = if let ConsoleTransport::Socket(_) = transport { EpollDispatch::Socket } else { EpollDispatch::File @@ -196,7 +196,7 @@ impl SerialManager { .map_err(Error::Epoll)?; let mut pty_write_out = None; - if let ConsoleTransport::Pty(ref file) = output { + if let ConsoleTransport::Pty(ref file) = transport { let write_out = Arc::new(AtomicBool::new(false)); pty_write_out = Some(write_out.clone()); let writer = file.try_clone().map_err(Error::FileClone)?; @@ -215,7 +215,7 @@ impl SerialManager { Ok(Some(SerialManager { serial, epoll_file, - 
in_file: output, + transport, kill_evt, handle: None, pty_write_out, @@ -257,7 +257,7 @@ impl SerialManager { } let epoll_fd = self.epoll_file.as_raw_fd(); - let in_file = self.in_file.clone(); + let transport = self.transport.clone(); let serial = self.serial.clone(); let pty_write_out = self.pty_write_out.clone(); let mut reader: Option = None; @@ -295,7 +295,7 @@ impl SerialManager { } }; - if matches!(in_file, ConsoleTransport::Pty(_)) && num_events == 0 { + if matches!(transport, ConsoleTransport::Pty(_)) && num_events == 0 { // This very specific case happens when the serial is connected // to a PTY. We know EPOLLHUP is always present when there's nothing // connected at the other end of the PTY. That's why getting no event @@ -320,7 +320,7 @@ impl SerialManager { .map_err(Error::AcceptConnection)?; } - let ConsoleTransport::Socket(ref listener) = in_file else { + let ConsoleTransport::Socket(ref listener) = transport else { unreachable!(); }; @@ -348,7 +348,7 @@ impl SerialManager { EpollDispatch::File => { if event.events & libc::EPOLLIN as u32 != 0 { let mut input = [0u8; 64]; - let count = match &in_file { + let count = match &transport { ConsoleTransport::Socket(_) => { if let Some(mut serial_reader) = reader.as_ref() { let count = serial_reader @@ -431,7 +431,7 @@ impl Drop for SerialManager { if let Some(handle) = self.handle.take() { handle.join().ok(); } - if let ConsoleTransport::Socket(_) = self.in_file + if let ConsoleTransport::Socket(_) = self.transport && let Some(socket_path) = self.socket_path.as_ref() { std::fs::remove_file(socket_path.as_os_str()) From 9cb49a244e3fd1b6fa364a11c0796cc23f83bf01 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Fri, 6 Mar 2026 08:57:03 -0800 Subject: [PATCH 071/742] vmm: device_manager: Be consistent with transport variable name Signed-off-by: Rob Bradford --- vmm/src/device_manager.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs 
index 014de4c0da..4dc965379b 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -2334,11 +2334,11 @@ impl DeviceManager { fn add_virtio_console_device( &mut self, - console_fd: ConsoleTransport, + transport: ConsoleTransport, resize_pipe: Option>, ) -> DeviceManagerResult>> { let console_config = self.config.lock().unwrap().console.clone(); - let endpoint = match console_fd { + let endpoint = match transport { ConsoleTransport::File(file) => Endpoint::File(file), ConsoleTransport::Pty(file) => { self.console_resize_pipe = resize_pipe; From 9d712a10a67d339b5f7c6bf0eb06892e64017502 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Fri, 6 Mar 2026 09:05:30 -0800 Subject: [PATCH 072/742] vmm: Improve naming of ConsoleInfo struct members These aren't FDs and we don't need to know that they are the main ones. Signed-off-by: Rob Bradford --- vmm/src/console_devices.rs | 12 ++++++------ vmm/src/device_manager.rs | 21 ++++++++------------- 2 files changed, 14 insertions(+), 19 deletions(-) diff --git a/vmm/src/console_devices.rs b/vmm/src/console_devices.rs index 066f649afa..32cba7b780 100644 --- a/vmm/src/console_devices.rs +++ b/vmm/src/console_devices.rs @@ -67,10 +67,10 @@ pub enum ConsoleTransport { #[derive(Clone)] pub struct ConsoleInfo { - pub console_main_fd: ConsoleTransport, - pub serial_main_fd: ConsoleTransport, + pub console: ConsoleTransport, + pub serial: ConsoleTransport, #[cfg(target_arch = "x86_64")] - pub debug_main_fd: ConsoleTransport, + pub debug: ConsoleTransport, } fn modify_mode( @@ -181,7 +181,7 @@ pub(crate) fn pre_create_console_devices(vmm: &mut Vmm) -> ConsoleDeviceResult { let file = File::create(vmconfig.console.file.as_ref().unwrap()) .map_err(ConsoleDeviceError::CreateConsoleDevice)?; @@ -230,7 +230,7 @@ pub(crate) fn pre_create_console_devices(vmm: &mut Vmm) -> ConsoleDeviceResult ConsoleTransport::Null, ConsoleOutputMode::Off => ConsoleTransport::Off, }, - serial_main_fd: match vmconfig.serial.mode { + serial: 
match vmconfig.serial.mode { ConsoleOutputMode::File => { let file = File::create(vmconfig.serial.file.as_ref().unwrap()) .map_err(ConsoleDeviceError::CreateConsoleDevice)?; @@ -268,7 +268,7 @@ pub(crate) fn pre_create_console_devices(vmm: &mut Vmm) -> ConsoleDeviceResult ConsoleTransport::Off, }, #[cfg(target_arch = "x86_64")] - debug_main_fd: match vmconfig.debug_console.mode { + debug: match vmconfig.debug_console.mode { ConsoleOutputMode::File => { let file = File::create(vmconfig.debug_console.file.as_ref().unwrap()) .map_err(ConsoleDeviceError::CreateConsoleDevice)?; diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 4dc965379b..1ac9afe30e 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -2433,7 +2433,7 @@ impl DeviceManager { // SAFETY: console_info is Some, so it's safe to unwrap. let console_info = console_info.unwrap(); - let serial_writer: Option> = match console_info.serial_main_fd { + let serial_writer: Option> = match console_info.serial { ConsoleTransport::File(ref file) | ConsoleTransport::Tty(ref file) => { Some(Box::new(Arc::clone(file))) } @@ -2443,18 +2443,15 @@ impl DeviceManager { | ConsoleTransport::Socket(_) => None, }; - if !matches!(console_info.serial_main_fd, ConsoleTransport::Off) { + if !matches!(console_info.serial, ConsoleTransport::Off) { let serial = self.add_serial_device(interrupt_manager, serial_writer)?; - self.serial_manager = match console_info.serial_main_fd { + self.serial_manager = match console_info.serial { ConsoleTransport::Pty(_) | ConsoleTransport::Tty(_) | ConsoleTransport::Socket(_) => { - let serial_manager = SerialManager::new( - serial, - console_info.serial_main_fd, - serial_config.socket, - ) - .map_err(DeviceManagerError::CreateSerialManager)?; + let serial_manager = + SerialManager::new(serial, console_info.serial, serial_config.socket) + .map_err(DeviceManagerError::CreateSerialManager)?; if let Some(mut serial_manager) = serial_manager { serial_manager 
.start_thread( @@ -2474,9 +2471,7 @@ impl DeviceManager { #[cfg(target_arch = "x86_64")] { - let debug_console_writer: Option> = match console_info - .debug_main_fd - { + let debug_console_writer: Option> = match console_info.debug { ConsoleTransport::File(file) | ConsoleTransport::Tty(file) => Some(Box::new(file)), ConsoleTransport::Off | ConsoleTransport::Null @@ -2489,7 +2484,7 @@ impl DeviceManager { } let console_resizer = - self.add_virtio_console_device(console_info.console_main_fd, console_resize_pipe)?; + self.add_virtio_console_device(console_info.console, console_resize_pipe)?; Ok(Arc::new(Console { console_resizer })) } From 4c2b2110c9cc64cb568c3b7a11697f3bfd5bece2 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Fri, 6 Mar 2026 09:10:21 -0800 Subject: [PATCH 073/742] vmm: serial_manager: Reorder epoll event setup It makes most sense to create the epoll FD and add the kill event before identifying the transport specific fd. Signed-off-by: Rob Bradford --- vmm/src/serial_manager.rs | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/vmm/src/serial_manager.rs b/vmm/src/serial_manager.rs index 5a61b39e58..f664768317 100644 --- a/vmm/src/serial_manager.rs +++ b/vmm/src/serial_manager.rs @@ -128,6 +128,17 @@ impl SerialManager { mut transport: ConsoleTransport, socket: Option, ) -> Result> { + let epoll_fd = epoll::create(true).map_err(Error::Epoll)?; + let kill_evt = EventFd::new(EFD_NONBLOCK).map_err(Error::EventFd)?; + + epoll::ctl( + epoll_fd, + epoll::ControlOptions::EPOLL_CTL_ADD, + kill_evt.as_raw_fd(), + epoll::Event::new(epoll::Events::EPOLLIN, EpollDispatch::Kill as u64), + ) + .map_err(Error::Epoll)?; + let mut socket_path: Option = None; let in_fd = match transport { @@ -170,17 +181,6 @@ impl SerialManager { _ => return Ok(None), }; - let epoll_fd = epoll::create(true).map_err(Error::Epoll)?; - let kill_evt = EventFd::new(EFD_NONBLOCK).map_err(Error::EventFd)?; - - epoll::ctl( - epoll_fd, - 
epoll::ControlOptions::EPOLL_CTL_ADD, - kill_evt.as_raw_fd(), - epoll::Event::new(epoll::Events::EPOLLIN, EpollDispatch::Kill as u64), - ) - .map_err(Error::Epoll)?; - let epoll_fd_data = if let ConsoleTransport::Socket(_) = transport { EpollDispatch::Socket } else { From e7e5fefb29225aceb0b0d1b614b3438ee6ddb0fd Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Fri, 6 Mar 2026 09:13:41 -0800 Subject: [PATCH 074/742] vmm: serial_manager: Improve event variable name Name the variable after what the intention on the caller side not the callee name. Signed-off-by: Rob Bradford --- vmm/src/serial_manager.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vmm/src/serial_manager.rs b/vmm/src/serial_manager.rs index f664768317..6ba2b95377 100644 --- a/vmm/src/serial_manager.rs +++ b/vmm/src/serial_manager.rs @@ -181,7 +181,7 @@ impl SerialManager { _ => return Ok(None), }; - let epoll_fd_data = if let ConsoleTransport::Socket(_) = transport { + let in_event = if let ConsoleTransport::Socket(_) = transport { EpollDispatch::Socket } else { EpollDispatch::File @@ -191,7 +191,7 @@ impl SerialManager { epoll_fd, epoll::ControlOptions::EPOLL_CTL_ADD, in_fd, - epoll::Event::new(epoll::Events::EPOLLIN, epoll_fd_data as u64), + epoll::Event::new(epoll::Events::EPOLLIN, in_event as u64), ) .map_err(Error::Epoll)?; From 072e980e0830665a469e3869b831c50f09d19298 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Fri, 6 Mar 2026 09:23:11 -0800 Subject: [PATCH 075/742] vmm: serial_manager: Use OwnedFd for the epoll FD Replace the use of a conventional File for managing the lifetime. 
Signed-off-by: Rob Bradford --- vmm/src/serial_manager.rs | 44 ++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/vmm/src/serial_manager.rs b/vmm/src/serial_manager.rs index 6ba2b95377..27f359bec6 100644 --- a/vmm/src/serial_manager.rs +++ b/vmm/src/serial_manager.rs @@ -6,6 +6,7 @@ use std::fs::File; use std::io::Read; use std::net::Shutdown; +use std::os::fd::OwnedFd; use std::os::unix::io::{AsRawFd, FromRawFd}; use std::os::unix::net::UnixStream; use std::panic::AssertUnwindSafe; @@ -113,7 +114,7 @@ pub struct SerialManager { serial: Arc>, #[cfg(target_arch = "aarch64")] serial: Arc>, - epoll_file: File, + epoll_fd: OwnedFd, transport: ConsoleTransport, kill_evt: EventFd, handle: Option>, @@ -208,13 +209,13 @@ impl SerialManager { .set_out(Some(Box::new(buffer))); } - // Use 'File' to enforce closing on 'epoll_fd' + // Use 'OwnedFd' to manage lifetime // SAFETY: epoll_fd is valid - let epoll_file = unsafe { File::from_raw_fd(epoll_fd) }; + let epoll_fd = unsafe { OwnedFd::from_raw_fd(epoll_fd) }; Ok(Some(SerialManager { serial, - epoll_file, + epoll_fd, transport, kill_evt, handle: None, @@ -256,7 +257,7 @@ impl SerialManager { return Ok(()); } - let epoll_fd = self.epoll_file.as_raw_fd(); + let epoll_fd = self.epoll_fd.try_clone().map_err(Error::Epoll)?; let transport = self.transport.clone(); let serial = self.serial.clone(); let pty_write_out = self.pty_write_out.clone(); @@ -278,22 +279,23 @@ impl SerialManager { [epoll::Event::new(epoll::Events::empty(), 0); EPOLL_EVENTS_LEN]; loop { - let num_events = match epoll::wait(epoll_fd, timeout, &mut events[..]) { - Ok(res) => res, - Err(e) => { - if e.kind() == io::ErrorKind::Interrupted { - // It's well defined from the epoll_wait() syscall - // documentation that the epoll loop can be interrupted - // before any of the requested events occurred or the - // timeout expired. 
In both those cases, epoll_wait() - // returns an error of type EINTR, but this should not - // be considered as a regular error. Instead it is more - // appropriate to retry, by calling into epoll_wait(). - continue; + let num_events = + match epoll::wait(epoll_fd.as_raw_fd(), timeout, &mut events[..]) { + Ok(res) => res, + Err(e) => { + if e.kind() == io::ErrorKind::Interrupted { + // It's well defined from the epoll_wait() syscall + // documentation that the epoll loop can be interrupted + // before any of the requested events occurred or the + // timeout expired. In both those cases, epoll_wait() + // returns an error of type EINTR, but this should not + // be considered as a regular error. Instead it is more + // appropriate to retry, by calling into epoll_wait(). + continue; + } + return Err(Error::Epoll(e)); } - return Err(Error::Epoll(e)); - } - }; + }; if matches!(transport, ConsoleTransport::Pty(_)) && num_events == 0 { // This very specific case happens when the serial is connected @@ -332,7 +334,7 @@ impl SerialManager { unix_stream.try_clone().map_err(Error::CloneUnixStream)?; epoll::ctl( - epoll_fd, + epoll_fd.as_raw_fd(), epoll::ControlOptions::EPOLL_CTL_ADD, unix_stream.as_raw_fd(), epoll::Event::new( From e5be2196f5fd96b84d0f80179549654c78a32875 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 6 Mar 2026 23:53:48 +0000 Subject: [PATCH 076/742] build: Bump docker/build-push-action from 6 to 7 Bumps [docker/build-push-action](https://github.com/docker/build-push-action) from 6 to 7. - [Release notes](https://github.com/docker/build-push-action/releases) - [Commits](https://github.com/docker/build-push-action/compare/v6...v7) --- updated-dependencies: - dependency-name: docker/build-push-action dependency-version: '7' dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] --- .github/workflows/docker-image.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docker-image.yaml b/.github/workflows/docker-image.yaml index 059aa19672..0f6a51f2d4 100644 --- a/.github/workflows/docker-image.yaml +++ b/.github/workflows/docker-image.yaml @@ -46,7 +46,7 @@ jobs: - name: Build and push if: ${{ github.event_name == 'push' }} - uses: docker/build-push-action@v6 + uses: docker/build-push-action@v7 with: file: ./resources/Dockerfile platforms: linux/amd64,linux/arm64 @@ -55,7 +55,7 @@ jobs: - name: Build only if: ${{ github.event_name == 'pull_request' }} - uses: docker/build-push-action@v6 + uses: docker/build-push-action@v7 with: file: ./resources/Dockerfile platforms: linux/amd64,linux/arm64 From 355cbad09a7b1aaf27a016701a77bf34c67a85ce Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Sat, 7 Mar 2026 11:17:23 +0100 Subject: [PATCH 077/742] virtio-devices: iommu: Fix VirtioIommuConfig reserved field size Fix the _reserved field in VirtioIommuConfig from [u8; 7] to [u8; 3], correcting the struct size from 44 bytes to the spec-mandated 40 bytes. The virtio specification v1.2, Section 5.13.4 defines struct virtio_iommu_config as 40 bytes total. The kernel UAPI header linux/virtio_iommu.h matches this layout with __u8 reserved[3] since kernel 5.17. Prior to that, the struct was 36 bytes with no bypass field at all. The incorrect [u8; 7] made the packed struct 44 bytes. Since the struct is exposed to the guest, the guest saw a 44 byte device specific configuration region instead of 40 bytes. While well behaved guest drivers only access fields at known offsets and would not observe data corruption from the extra 4 zero bytes at the tail, the oversized config region is a spec violation. 
The write_config path is not affected because it validates the exact offset of the bypass field before allowing writes, and the bypass field sits at offset 36 regardless of the trailing reserved size. Signed-off-by: Anatol Belski --- virtio-devices/src/iommu.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtio-devices/src/iommu.rs b/virtio-devices/src/iommu.rs index f4812b04fb..1097b6582e 100644 --- a/virtio-devices/src/iommu.rs +++ b/virtio-devices/src/iommu.rs @@ -98,7 +98,7 @@ struct VirtioIommuConfig { domain_range: VirtioIommuRange32, probe_size: u32, bypass: u8, - _reserved: [u8; 7], + _reserved: [u8; 3], } /// Virtio IOMMU request type From 313d980538a4a61a77af2d131c29baa8077a16ed Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Sat, 7 Mar 2026 11:25:31 +0100 Subject: [PATCH 078/742] vm-virtio: Remove stale comment from watchdog device ID Device ID 35 for virtio watchdog was officially allocated and merged into the Linux kernel UAPI headers as VIRTIO_ID_WATCHDOG in kernel 5.15. The virtio specification v1.2 also lists device ID 35 for the watchdog device type. Leaving the comment is therefore misleading. Signed-off-by: Anatol Belski --- vm-virtio/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vm-virtio/src/lib.rs b/vm-virtio/src/lib.rs index b7f5370702..c560e5c86e 100644 --- a/vm-virtio/src/lib.rs +++ b/vm-virtio/src/lib.rs @@ -39,7 +39,7 @@ pub enum VirtioDeviceType { Mem = 24, Fs = 26, Pmem = 27, - Watchdog = 35, // Temporary until official number allocated + Watchdog = 35, Unknown = 0xFF, } From da0d0a20901ff3c2ae8256462ee21b16eb1882fe Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Sat, 7 Mar 2026 12:01:25 +0100 Subject: [PATCH 079/742] virtio-devices: Rename VIRTIO_F_IOMMU_PLATFORM Rename the transport feature bit constant from VIRTIO_F_IOMMU_PLATFORM to VIRTIO_F_ACCESS_PLATFORM across the entire virtio-devices crate. The virtio specification as of v1.1 carries bit 33 as VIRTIO_F_ACCESS_PLATFORM. 
The Linux kernel UAPI header carries VIRTIO_F_IOMMU_PLATFORM only as a backward-compatible alias. This is a pure rename with no functional or behavioral change. Signed-off-by: Anatol Belski --- virtio-devices/src/block.rs | 2 +- virtio-devices/src/console.rs | 4 ++-- virtio-devices/src/lib.rs | 2 +- virtio-devices/src/net.rs | 2 +- virtio-devices/src/pmem.rs | 4 ++-- virtio-devices/src/rng.rs | 4 ++-- virtio-devices/src/vdpa.rs | 4 ++-- virtio-devices/src/vhost_user/blk.rs | 4 ++-- virtio-devices/src/vhost_user/fs.rs | 4 ++-- virtio-devices/src/vhost_user/generic_vhost_user.rs | 4 ++-- virtio-devices/src/vhost_user/net.rs | 6 +++--- virtio-devices/src/vsock/device.rs | 4 ++-- 12 files changed, 22 insertions(+), 22 deletions(-) diff --git a/virtio-devices/src/block.rs b/virtio-devices/src/block.rs index 0d2b1fb271..9d09ab91a6 100644 --- a/virtio-devices/src/block.rs +++ b/virtio-devices/src/block.rs @@ -737,7 +737,7 @@ impl Block { } if iommu { - avail_features |= 1u64 << VIRTIO_F_IOMMU_PLATFORM; + avail_features |= 1u64 << VIRTIO_F_ACCESS_PLATFORM; } if read_only { diff --git a/virtio-devices/src/console.rs b/virtio-devices/src/console.rs index c8a9f08a02..7896fde9f1 100644 --- a/virtio-devices/src/console.rs +++ b/virtio-devices/src/console.rs @@ -25,7 +25,7 @@ use vmm_sys_util::eventfd::EventFd; use super::{ ActivateResult, EPOLL_HELPER_EVENT_LAST, EpollHelper, EpollHelperError, EpollHelperHandler, - Error as DeviceError, VIRTIO_F_IOMMU_PLATFORM, VIRTIO_F_VERSION_1, VirtioCommon, VirtioDevice, + Error as DeviceError, VIRTIO_F_ACCESS_PLATFORM, VIRTIO_F_VERSION_1, VirtioCommon, VirtioDevice, VirtioDeviceType, VirtioInterruptType, }; use crate::seccomp_filters::Thread; @@ -609,7 +609,7 @@ impl Console { } else { let mut avail_features = (1u64 << VIRTIO_F_VERSION_1) | (1u64 << VIRTIO_CONSOLE_F_SIZE); if iommu { - avail_features |= 1u64 << VIRTIO_F_IOMMU_PLATFORM; + avail_features |= 1u64 << VIRTIO_F_ACCESS_PLATFORM; } ( diff --git a/virtio-devices/src/lib.rs 
b/virtio-devices/src/lib.rs index da4f1c91be..111c9007e1 100644 --- a/virtio-devices/src/lib.rs +++ b/virtio-devices/src/lib.rs @@ -71,7 +71,7 @@ const DEVICE_FAILED: u32 = 0x80; const VIRTIO_F_RING_INDIRECT_DESC: u32 = 28; const VIRTIO_F_RING_EVENT_IDX: u32 = 29; const VIRTIO_F_VERSION_1: u32 = 32; -const VIRTIO_F_IOMMU_PLATFORM: u32 = 33; +const VIRTIO_F_ACCESS_PLATFORM: u32 = 33; const VIRTIO_F_IN_ORDER: u32 = 35; const VIRTIO_F_ORDER_PLATFORM: u32 = 36; #[allow(dead_code)] diff --git a/virtio-devices/src/net.rs b/virtio-devices/src/net.rs index 8eee661341..63b9de4116 100644 --- a/virtio-devices/src/net.rs +++ b/virtio-devices/src/net.rs @@ -463,7 +463,7 @@ impl Net { | (1 << VIRTIO_F_VERSION_1); if iommu { - avail_features |= 1u64 << VIRTIO_F_IOMMU_PLATFORM; + avail_features |= 1u64 << VIRTIO_F_ACCESS_PLATFORM; } // Configure TSO/UFO features when hardware checksum offload is enabled. diff --git a/virtio-devices/src/pmem.rs b/virtio-devices/src/pmem.rs index 549b62fd96..b7be248fa4 100644 --- a/virtio-devices/src/pmem.rs +++ b/virtio-devices/src/pmem.rs @@ -31,7 +31,7 @@ use vmm_sys_util::eventfd::EventFd; use super::{ ActivateError, ActivateResult, EPOLL_HELPER_EVENT_LAST, EpollHelper, EpollHelperError, - EpollHelperHandler, Error as DeviceError, VIRTIO_F_IOMMU_PLATFORM, VIRTIO_F_VERSION_1, + EpollHelperHandler, Error as DeviceError, VIRTIO_F_ACCESS_PLATFORM, VIRTIO_F_VERSION_1, VirtioCommon, VirtioDevice, VirtioDeviceType, }; use crate::seccomp_filters::Thread; @@ -307,7 +307,7 @@ impl Pmem { let mut avail_features = 1u64 << VIRTIO_F_VERSION_1; if iommu { - avail_features |= 1u64 << VIRTIO_F_IOMMU_PLATFORM; + avail_features |= 1u64 << VIRTIO_F_ACCESS_PLATFORM; } (avail_features, 0, config, false) }; diff --git a/virtio-devices/src/rng.rs b/virtio-devices/src/rng.rs index 2f980d4d8b..2409e272c1 100644 --- a/virtio-devices/src/rng.rs +++ b/virtio-devices/src/rng.rs @@ -24,7 +24,7 @@ use vmm_sys_util::eventfd::EventFd; use super::{ ActivateError, ActivateResult, 
EPOLL_HELPER_EVENT_LAST, EpollHelper, EpollHelperError, - EpollHelperHandler, Error as DeviceError, VIRTIO_F_IOMMU_PLATFORM, VIRTIO_F_VERSION_1, + EpollHelperHandler, Error as DeviceError, VIRTIO_F_ACCESS_PLATFORM, VIRTIO_F_VERSION_1, VirtioCommon, VirtioDevice, VirtioDeviceType, }; use crate::seccomp_filters::Thread; @@ -180,7 +180,7 @@ impl Rng { let mut avail_features = 1u64 << VIRTIO_F_VERSION_1; if iommu { - avail_features |= 1u64 << VIRTIO_F_IOMMU_PLATFORM; + avail_features |= 1u64 << VIRTIO_F_ACCESS_PLATFORM; } (avail_features, 0, false) diff --git a/virtio-devices/src/vdpa.rs b/virtio-devices/src/vdpa.rs index 725f215c77..9d20aac92e 100644 --- a/virtio-devices/src/vdpa.rs +++ b/virtio-devices/src/vdpa.rs @@ -28,7 +28,7 @@ use vmm_sys_util::eventfd::EventFd; use crate::{ ActivateError, ActivateResult, DEVICE_ACKNOWLEDGE, DEVICE_DRIVER, DEVICE_DRIVER_OK, - DEVICE_FEATURES_OK, GuestMemoryMmap, VIRTIO_F_IOMMU_PLATFORM, VirtioCommon, VirtioDevice, + DEVICE_FEATURES_OK, GuestMemoryMmap, VIRTIO_F_ACCESS_PLATFORM, VirtioCommon, VirtioDevice, VirtioInterrupt, VirtioInterruptType, get_host_address_range, }; @@ -164,7 +164,7 @@ impl Vdpa { let iova_range = vhost.get_iova_range().map_err(Error::GetIovaRange)?; - if avail_features & (1u64 << VIRTIO_F_IOMMU_PLATFORM) == 0 { + if avail_features & (1u64 << VIRTIO_F_ACCESS_PLATFORM) == 0 { return Err(Error::MissingAccessPlatformVirtioFeature); } diff --git a/virtio-devices/src/vhost_user/blk.rs b/virtio-devices/src/vhost_user/blk.rs index d26350c91a..22896ba7c6 100644 --- a/virtio-devices/src/vhost_user/blk.rs +++ b/virtio-devices/src/vhost_user/blk.rs @@ -31,7 +31,7 @@ use super::{DEFAULT_VIRTIO_FEATURES, Error, Result}; use crate::seccomp_filters::Thread; use crate::thread_helper::spawn_virtio_thread; use crate::vhost_user::VhostUserCommon; -use crate::{GuestMemoryMmap, GuestRegionMmap, VIRTIO_F_IOMMU_PLATFORM, VirtioInterrupt}; +use crate::{GuestMemoryMmap, GuestRegionMmap, VIRTIO_F_ACCESS_PLATFORM, VirtioInterrupt}; 
const DEFAULT_QUEUE_NUMBER: usize = 1; @@ -239,7 +239,7 @@ impl VirtioDevice for Blk { fn features(&self) -> u64 { let mut features = self.common.avail_features; if self.iommu { - features |= 1u64 << VIRTIO_F_IOMMU_PLATFORM; + features |= 1u64 << VIRTIO_F_ACCESS_PLATFORM; } features } diff --git a/virtio-devices/src/vhost_user/fs.rs b/virtio-devices/src/vhost_user/fs.rs index d0005af90f..434454fcef 100644 --- a/virtio-devices/src/vhost_user/fs.rs +++ b/virtio-devices/src/vhost_user/fs.rs @@ -25,7 +25,7 @@ use crate::seccomp_filters::Thread; use crate::thread_helper::spawn_virtio_thread; use crate::vhost_user::VhostUserCommon; use crate::{ - ActivateResult, GuestMemoryMmap, GuestRegionMmap, MmapRegion, VIRTIO_F_IOMMU_PLATFORM, + ActivateResult, GuestMemoryMmap, GuestRegionMmap, MmapRegion, VIRTIO_F_ACCESS_PLATFORM, VirtioCommon, VirtioDevice, VirtioDeviceType, VirtioInterrupt, VirtioSharedMemoryList, }; @@ -248,7 +248,7 @@ impl VirtioDevice for Fs { fn features(&self) -> u64 { let mut features = self.common.avail_features; if self.iommu { - features |= 1u64 << VIRTIO_F_IOMMU_PLATFORM; + features |= 1u64 << VIRTIO_F_ACCESS_PLATFORM; } features } diff --git a/virtio-devices/src/vhost_user/generic_vhost_user.rs b/virtio-devices/src/vhost_user/generic_vhost_user.rs index 5774af928a..d38eee3a92 100644 --- a/virtio-devices/src/vhost_user/generic_vhost_user.rs +++ b/virtio-devices/src/vhost_user/generic_vhost_user.rs @@ -27,7 +27,7 @@ use crate::seccomp_filters::Thread; use crate::thread_helper::spawn_virtio_thread; use crate::vhost_user::VhostUserCommon; use crate::{ - ActivateResult, GuestMemoryMmap, GuestRegionMmap, MmapRegion, VIRTIO_F_IOMMU_PLATFORM, + ActivateResult, GuestMemoryMmap, GuestRegionMmap, MmapRegion, VIRTIO_F_ACCESS_PLATFORM, VirtioCommon, VirtioDevice, VirtioInterrupt, VirtioSharedMemoryList, }; @@ -218,7 +218,7 @@ impl VirtioDevice for GenericVhostUser { fn features(&self) -> u64 { let mut features = self.common.avail_features; if self.iommu { - 
features |= 1u64 << VIRTIO_F_IOMMU_PLATFORM; + features |= 1u64 << VIRTIO_F_ACCESS_PLATFORM; } features } diff --git a/virtio-devices/src/vhost_user/net.rs b/virtio-devices/src/vhost_user/net.rs index e0a71c7342..187d710e39 100644 --- a/virtio-devices/src/vhost_user/net.rs +++ b/virtio-devices/src/vhost_user/net.rs @@ -30,8 +30,8 @@ use crate::thread_helper::spawn_virtio_thread; use crate::vhost_user::vu_common_ctrl::{VhostUserConfig, VhostUserHandle}; use crate::vhost_user::{DEFAULT_VIRTIO_FEATURES, Error, Result, VhostUserCommon}; use crate::{ - ActivateResult, GuestMemoryMmap, GuestRegionMmap, NetCtrlEpollHandler, VIRTIO_F_IOMMU_PLATFORM, - VirtioCommon, VirtioDevice, VirtioDeviceType, VirtioInterrupt, + ActivateResult, GuestMemoryMmap, GuestRegionMmap, NetCtrlEpollHandler, + VIRTIO_F_ACCESS_PLATFORM, VirtioCommon, VirtioDevice, VirtioDeviceType, VirtioInterrupt, }; const DEFAULT_QUEUE_NUMBER: usize = 2; @@ -275,7 +275,7 @@ impl VirtioDevice for Net { fn features(&self) -> u64 { let mut features = self.common.avail_features; if self.iommu { - features |= 1u64 << VIRTIO_F_IOMMU_PLATFORM; + features |= 1u64 << VIRTIO_F_ACCESS_PLATFORM; } features } diff --git a/virtio-devices/src/vsock/device.rs b/virtio-devices/src/vsock/device.rs index 27a0af1ff2..aa86aa0c95 100644 --- a/virtio-devices/src/vsock/device.rs +++ b/virtio-devices/src/vsock/device.rs @@ -50,7 +50,7 @@ use crate::seccomp_filters::Thread; use crate::thread_helper::spawn_virtio_thread; use crate::{ ActivateResult, EPOLL_HELPER_EVENT_LAST, EpollHelper, EpollHelperError, EpollHelperHandler, - Error as DeviceError, GuestMemoryMmap, VIRTIO_F_IN_ORDER, VIRTIO_F_IOMMU_PLATFORM, + Error as DeviceError, GuestMemoryMmap, VIRTIO_F_ACCESS_PLATFORM, VIRTIO_F_IN_ORDER, VIRTIO_F_VERSION_1, VirtioCommon, VirtioDevice, VirtioDeviceType, VirtioInterrupt, VirtioInterruptType, }; @@ -349,7 +349,7 @@ where let mut avail_features = (1u64 << VIRTIO_F_VERSION_1) | (1u64 << VIRTIO_F_IN_ORDER); if iommu { - avail_features |= 
1u64 << VIRTIO_F_IOMMU_PLATFORM; + avail_features |= 1u64 << VIRTIO_F_ACCESS_PLATFORM; } (avail_features, 0, false) }; From 7c690ffec020a56acea587f52518ce9bc7fb3533 Mon Sep 17 00:00:00 2001 From: Victor Vieux Date: Fri, 6 Mar 2026 12:09:09 -0800 Subject: [PATCH 080/742] vmm: config: Expose disk lock granularity option Add a per-disk lock_granularity parameter that lets users choose between byte-range OFD locks and whole-file OFD locks: --disk path=/foo.img,lock_granularity=byte-range --disk path=/bar.img,lock_granularity=full Byte-range is the default and matches QEMU behavior, working best with storage backends where whole-file OFD locks are treated as mandatory. The full option restores the original whole-file locking for environments that depend on it. The LockGranularityChoice enum and its FromStr impl live in the block crate alongside the existing LockGranularity type. The Block device converts the user-facing choice to the internal LockGranularity at lock time, keeping device_manager.rs simple. Closes: #7553 Signed-off-by: Victor Vieux --- block/src/fcntl.rs | 33 ++++++++++++++++++ fuzz/fuzz_targets/block.rs | 2 ++ virtio-devices/src/block.rs | 41 +++++++++++++---------- vmm/src/api/openapi/cloud-hypervisor.yaml | 5 ++- vmm/src/config.rs | 26 ++++++++++++-- vmm/src/device_manager.rs | 1 + vmm/src/vm_config.rs | 3 ++ 7 files changed, 91 insertions(+), 20 deletions(-) diff --git a/block/src/fcntl.rs b/block/src/fcntl.rs index a2a684f322..23c6f9f167 100644 --- a/block/src/fcntl.rs +++ b/block/src/fcntl.rs @@ -16,6 +16,7 @@ use std::fmt::Debug; use std::io; use std::os::fd::{AsRawFd, RawFd}; +use std::str::FromStr; use thiserror::Error; @@ -140,6 +141,38 @@ impl LockGranularity { } } +/// User-facing choice for the lock granularity. +/// +/// This allows external management software to create snapshots of the disk +/// image. Without a byte-range lock, some NFS implementations may treat the +/// entire file as exclusively locked and prevent such operations (e.g. 
NetApp). +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, serde::Deserialize, serde::Serialize)] +#[serde(rename_all = "kebab-case")] +pub enum LockGranularityChoice { + /// Byte-range lock covering [0, size). + #[default] + ByteRange, + /// Whole-file lock (l_start=0, l_len=0) - original OFD whole-file lock behavior. + Full, +} + +/// Error returned when parsing a [`LockGranularityChoice`] from a string. +#[derive(Error, Debug)] +#[error("Invalid lock granularity value: {0}, expected 'byte-range' or 'full'")] +pub struct LockGranularityParseError(String); + +impl FromStr for LockGranularityChoice { + type Err = LockGranularityParseError; + + fn from_str(s: &str) -> Result { + match s { + "byte-range" => Ok(LockGranularityChoice::ByteRange), + "full" => Ok(LockGranularityChoice::Full), + _ => Err(LockGranularityParseError(s.to_owned())), + } + } +} + /// Returns a [`struct@libc::flock`] structure for the whole file. const fn get_flock(lock_type: LockType, granularity: LockGranularity) -> libc::flock { libc::flock { diff --git a/fuzz/fuzz_targets/block.rs b/fuzz/fuzz_targets/block.rs index 7d1fbdf38f..0ad9193fdb 100644 --- a/fuzz/fuzz_targets/block.rs +++ b/fuzz/fuzz_targets/block.rs @@ -16,6 +16,7 @@ use std::sync::Arc; use std::{ffi, io}; use block::async_io::DiskFile; +use block::fcntl::LockGranularityChoice; use block::raw_sync::RawFileDiskSync; use libfuzzer_sys::{fuzz_target, Corpus}; use seccompiler::SeccompAction; @@ -69,6 +70,7 @@ fuzz_target!(|bytes: &[u8]| -> Corpus { queue_affinity, true, false, + LockGranularityChoice::default(), ) .unwrap(); diff --git a/virtio-devices/src/block.rs b/virtio-devices/src/block.rs index 9d09ab91a6..9bb97d31c3 100644 --- a/virtio-devices/src/block.rs +++ b/virtio-devices/src/block.rs @@ -19,7 +19,7 @@ use std::{io, result}; use anyhow::anyhow; use block::async_io::{AsyncIo, AsyncIoError, DiskFile, DiskFileError}; -use block::fcntl::{LockError, LockGranularity, LockType, get_lock_state}; +use block::fcntl::{LockError, 
LockGranularity, LockGranularityChoice, LockType, get_lock_state}; use block::{ ExecuteAsync, ExecuteError, Request, RequestType, VirtioBlockConfig, build_serial, fcntl, }; @@ -662,6 +662,7 @@ pub struct Block { serial: Vec, queue_affinity: BTreeMap>, disable_sector0_writes: bool, + lock_granularity_choice: LockGranularityChoice, } #[derive(Serialize, Deserialize)] @@ -692,6 +693,7 @@ impl Block { queue_affinity: BTreeMap>, sparse: bool, disable_sector0_writes: bool, + lock_granularity: LockGranularityChoice, ) -> io::Result { let (disk_nsectors, avail_features, acked_features, config, paused) = if let Some(state) = state { @@ -807,6 +809,7 @@ impl Block { serial, queue_affinity, disable_sector0_writes, + lock_granularity_choice: lock_granularity, }) } @@ -815,23 +818,27 @@ impl Block { } /// Returns the granularity for the advisory lock for this disk. - // TODO In future, we could add a `lock_granularity=` configuration to the CLI. - // For now, we stick to QEMU behavior. fn lock_granularity(&mut self) -> LockGranularity { - self.disk_image.physical_size().map_or_else( - // use a safe fallback - |e| { - let fallback = LockGranularity::WholeFile; - warn!( - "Can't get disk size for id={},path={}, falling back to {:?}: error: {e}", - self.id, - self.disk_path.display(), - fallback - ); - fallback - }, - |size| LockGranularity::ByteRange(0, size), - ) + match self.lock_granularity_choice { + LockGranularityChoice::Full => LockGranularity::WholeFile, + LockGranularityChoice::ByteRange => { + // Byte-range lock covering [0, size) + self.disk_image.physical_size().map_or_else( + // use a safe fallback + |e| { + let fallback = LockGranularity::WholeFile; + warn!( + "Can't get disk size for id={},path={}, falling back to {:?}: error: {e}", + self.id, + self.disk_path.display(), + fallback + ); + fallback + }, + |size| LockGranularity::ByteRange(0, size), + ) + } + } } /// Tries to set an advisory lock for the corresponding disk image. 
diff --git a/vmm/src/api/openapi/cloud-hypervisor.yaml b/vmm/src/api/openapi/cloud-hypervisor.yaml index c4f4b6acf0..77b16e97ff 100644 --- a/vmm/src/api/openapi/cloud-hypervisor.yaml +++ b/vmm/src/api/openapi/cloud-hypervisor.yaml @@ -980,7 +980,10 @@ components: image_type: type: string enum: [FixedVhd, Qcow2, Raw, Vhdx, Unknown] - + lock_granularity: + type: string + enum: [byte-range, full] + default: byte-range NetConfig: type: object diff --git a/vmm/src/config.rs b/vmm/src/config.rs index 46f3443136..b4c04570f5 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -1159,7 +1159,7 @@ impl DiskConfig { id=,pci_segment=,rate_limit_group=,\ queue_affinity=,\ serial=,backing_files=on|off,sparse=on|off,\ - image_type="; + image_type=,lock_granularity=byte-range|full"; pub fn parse(disk: &str) -> Result { let mut parser = OptionParser::new(); @@ -1187,7 +1187,8 @@ impl DiskConfig { .add("queue_affinity") .add("backing_files") .add("sparse") - .add("image_type"); + .add("image_type") + .add("lock_granularity"); parser.parse(disk).map_err(Error::ParseDisk)?; @@ -1289,6 +1290,11 @@ impl DiskConfig { ImageType::Unknown }; + let lock_granularity = parser + .convert::("lock_granularity") + .map_err(Error::ParseDisk)? 
+ .unwrap_or_default(); + let bw_tb_config = if bw_size != 0 && bw_refill_time != 0 { Some(TokenBucketConfig { size: bw_size, @@ -1341,6 +1347,7 @@ impl DiskConfig { backing_files, sparse, image_type, + lock_granularity, }) } @@ -3800,6 +3807,7 @@ mod unit_tests { backing_files: false, sparse: true, image_type: ImageType::Unknown, + lock_granularity: LockGranularityChoice::default(), } } @@ -3871,6 +3879,20 @@ mod unit_tests { ..disk_fixture() } ); + assert_eq!( + DiskConfig::parse("path=/path/to_file,lock_granularity=full")?, + DiskConfig { + lock_granularity: LockGranularityChoice::Full, + ..disk_fixture() + } + ); + assert_eq!( + DiskConfig::parse("path=/path/to_file,lock_granularity=byte-range")?, + DiskConfig { + lock_granularity: LockGranularityChoice::ByteRange, + ..disk_fixture() + } + ); assert_eq!( DiskConfig::parse("path=/path/to_file,queue_affinity=[0@[1],1@[2],2@[3,4],3@[5-8]]")?, DiskConfig { diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 1ac9afe30e..c3c5618bda 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -2853,6 +2853,7 @@ impl DeviceManager { queue_affinity, disk_cfg.sparse, disable_sector0_writes, + disk_cfg.lock_granularity, ) .map_err(DeviceManagerError::CreateVirtioBlock)?; diff --git a/vmm/src/vm_config.rs b/vmm/src/vm_config.rs index 33c2b23acd..d453ead2d9 100644 --- a/vmm/src/vm_config.rs +++ b/vmm/src/vm_config.rs @@ -9,6 +9,7 @@ use std::str::FromStr; use std::{fs, result}; use block::ImageType; +pub use block::fcntl::LockGranularityChoice; use log::{debug, warn}; use net_util::MacAddr; use serde::{Deserialize, Serialize}; @@ -302,6 +303,8 @@ pub struct DiskConfig { pub sparse: bool, #[serde(default)] pub image_type: ImageType, + #[serde(default)] + pub lock_granularity: LockGranularityChoice, } impl ApplyLandlock for DiskConfig { From 01e4053bef19f7dc0166b177415bd70bfeb8b583 Mon Sep 17 00:00:00 2001 From: Victor Vieux Date: Fri, 6 Mar 2026 12:09:16 -0800 Subject: [PATCH 081/742] docs: 
Add disk locking documentation Add docs/disk_locking.md explaining advisory OFD locking, the lock_granularity parameter, byte-range vs whole-file semantics, and fallback behavior. Signed-off-by: Victor Vieux --- docs/disk_locking.md | 61 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 docs/disk_locking.md diff --git a/docs/disk_locking.md b/docs/disk_locking.md new file mode 100644 index 0000000000..adb8956a08 --- /dev/null +++ b/docs/disk_locking.md @@ -0,0 +1,61 @@ +# Disk Image Locking + +Cloud Hypervisor places an advisory lock on each disk image opened via +`--disk` to prevent multiple instances from concurrently accessing the +same file. This avoids potential data corruption from overlapping writes. +Locks are advisory and require cooperating processes; a non-cooperating +process can still open and write to a locked file. Locking is host-local +and does not enforce coordination across multiple hosts. + +If the backing file resides on network storage, the storage system must +correctly translate or propagate OFD (Open File Description) locks across +the network to ensure that advisory locking semantics are preserved in a +multi-host environment. In the case of Linux, OFD locks are translated +into NFS locks by the NFS driver. + +The implementation uses Open File Description (OFD) locks (`F_OFD_SETLK`) +rather than traditional POSIX locks (`F_SETLK`). OFD locks are only +released when the last file descriptor referencing the open file +description is closed, preventing accidental early release. + +## Lock Granularity + +The `lock_granularity` parameter controls how the lock is placed on the +disk image: + +``` +--disk path=/foo.img,lock_granularity=byte-range +--disk path=/bar.img,lock_granularity=full +``` + +### `byte-range` (default) + +Locks the byte range `[0, physical_file_size)`. 
The physical file size +is evaluated once at startup; if the file grows after the lock is +acquired, the newly appended region is not covered by the lock. + +The file is protected against concurrent access by other instances of +Cloud Hypervisor. That's the only thing we can guarantee. + +#### Fallback to full + +One caveat is that if the physical size of the disk image cannot be +determined at startup (e.g. with certain vhost-user backends), Cloud +Hypervisor falls back to a whole-file lock regardless of the +`lock_granularity` setting, as a byte-range lock cannot be safely +computed without knowing the physical file size. + +### `full` + +Locks the entire file using the OFD whole-file semantic (`l_start=0`, +`l_len=0`). This may be needed in environments that depend on whole-file +lock semantics. Note that on some network storage backends, whole-file +OFD locks may be treated as mandatory rather than advisory, which can +cause external tools to fail when accessing the disk image. Lock +behavior may also vary across network filesystem implementations. + +## Disk Resizing + +Cloud Hypervisor supports live disk resizing. Currently, byte-range +locks are not updated. However, as a part of the file is still locked, +no new Cloud Hypervisor instance can open the disk image. From 9f569c7b8db6d6722df1cd26441eefc7002e76ae Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 5 Mar 2026 23:01:11 +0000 Subject: [PATCH 082/742] virtio-devices: console: Fix atomic corruption in ConsoleResizer The update_console_size method was using fetch_and on the acked_features atomic, which modified the atomic and cleared other feature bits. Changed it to use a non-destructive load and bitwise AND. 
Signed-off-by: Andrei Vagin --- virtio-devices/src/console.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/virtio-devices/src/console.rs b/virtio-devices/src/console.rs index 7896fde9f1..fc715eaac4 100644 --- a/virtio-devices/src/console.rs +++ b/virtio-devices/src/console.rs @@ -526,11 +526,7 @@ impl ConsoleResizer { if let Some(tty) = self.tty.as_ref() { let (cols, rows) = get_win_size(tty); self.config.lock().unwrap().update_console_size(cols, rows); - if self - .acked_features - .fetch_and(1u64 << VIRTIO_CONSOLE_F_SIZE, Ordering::AcqRel) - != 0 - { + if self.acked_features.load(Ordering::Acquire) & (1u64 << VIRTIO_CONSOLE_F_SIZE) != 0 { // Send the interrupt to the driver let _ = self.config_evt.write(1); } From 49156c720ce11a2db3c334bc5e63bf3202ef23b9 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sat, 7 Mar 2026 01:24:32 +0000 Subject: [PATCH 083/742] virtio-devices: console: Use 0 as the 'len' for the transmit queue For a transmit queue (guest to host), the host only reads from the guest-provided buffers and does not write to them. According to the virtio specification (e.g., Section 2.6.8 in Virtio 1.1), the 'len' field in the used ring has to be set to the number of bytes written to the buffers. Therefore, it should be 0 for the console transmit queue. 
Signed-off-by: Andrei Vagin --- virtio-devices/src/console.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtio-devices/src/console.rs b/virtio-devices/src/console.rs index fc715eaac4..5430673e74 100644 --- a/virtio-devices/src/console.rs +++ b/virtio-devices/src/console.rs @@ -265,7 +265,7 @@ impl ConsoleEpollHandler { out.flush().map_err(Error::OutputFlush)?; } trans_queue - .add_used(desc_chain.memory(), desc_chain.head_index(), desc.len()) + .add_used(desc_chain.memory(), desc_chain.head_index(), 0) .map_err(Error::QueueAddUsed)?; used_descs = true; } From 2698cfed9824d804fd2b7e43172d917ef9854547 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sat, 7 Mar 2026 01:25:04 +0000 Subject: [PATCH 084/742] virtio-devices: console: Fix descriptor chain processing Both process_input_queue and process_output_queue were only processing the first descriptor of a chain, leading to data loss if the driver used chained descriptors. This change iterates through all descriptors in a chain. It also moves the flush call out of the descriptor loops to improve performance. 
Signed-off-by: Andrei Vagin --- virtio-devices/src/console.rs | 71 ++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 30 deletions(-) diff --git a/virtio-devices/src/console.rs b/virtio-devices/src/console.rs index 5430673e74..ab907db20f 100644 --- a/virtio-devices/src/console.rs +++ b/virtio-devices/src/console.rs @@ -51,8 +51,6 @@ const VIRTIO_CONSOLE_F_SIZE: u64 = 0; #[derive(Error, Debug)] enum Error { - #[error("Descriptor chain too short")] - DescriptorChainTooShort, #[error("Failed to read from guest memory")] GuestMemoryRead(#[source] vm_memory::guest_memory::Error), #[error("Failed to write to guest memory")] @@ -210,21 +208,28 @@ impl ConsoleEpollHandler { } while let Some(mut desc_chain) = recv_queue.pop_descriptor_chain(self.mem.memory()) { - let desc = desc_chain.next().ok_or(Error::DescriptorChainTooShort)?; - let len = cmp::min(desc.len(), in_buffer.len() as u32); - let source_slice = in_buffer.drain(..len as usize).collect::>(); - - desc_chain - .memory() - .write_slice( - &source_slice[..], - desc.addr() - .translate_gva(self.access_platform.as_deref(), desc.len() as usize), - ) - .map_err(Error::GuestMemoryWrite)?; + let mut total_len = 0; + while let Some(desc) = desc_chain.next() { + if in_buffer.is_empty() { + break; + } + let len = cmp::min(desc.len(), in_buffer.len() as u32); + let source_slice = in_buffer.drain(..len as usize).collect::>(); + + desc_chain + .memory() + .write_slice( + &source_slice[..], + desc.addr() + .translate_gva(self.access_platform.as_deref(), desc.len() as usize), + ) + .map_err(Error::GuestMemoryWrite)?; + + total_len += len; + } recv_queue - .add_used(desc_chain.memory(), desc_chain.head_index(), len) + .add_used(desc_chain.memory(), desc_chain.head_index(), total_len) .map_err(Error::QueueAddUsed)?; used_descs = true; @@ -248,21 +253,23 @@ impl ConsoleEpollHandler { let mut used_descs = false; while let Some(mut desc_chain) = trans_queue.pop_descriptor_chain(self.mem.memory()) { - let desc = 
desc_chain.next().ok_or(Error::DescriptorChainTooShort)?; - if let Some(out) = &mut self.out { - let mut buf: Vec = Vec::new(); - desc_chain - .memory() - .write_volatile_to( - desc.addr() - .translate_gva(self.access_platform.as_deref(), desc.len() as usize), - &mut buf, - desc.len() as usize, - ) - .map_err(Error::GuestMemoryRead)?; - - out.write_all(&buf).map_err(Error::OutputWriteAll)?; - out.flush().map_err(Error::OutputFlush)?; + while let Some(desc) = desc_chain.next() { + if let Some(out) = &mut self.out { + let mut buf: Vec = Vec::new(); + desc_chain + .memory() + .write_volatile_to( + desc.addr().translate_gva( + self.access_platform.as_deref(), + desc.len() as usize, + ), + &mut buf, + desc.len() as usize, + ) + .map_err(Error::GuestMemoryRead)?; + + out.write_all(&buf).map_err(Error::OutputWriteAll)?; + } } trans_queue .add_used(desc_chain.memory(), desc_chain.head_index(), 0) @@ -270,6 +277,10 @@ impl ConsoleEpollHandler { used_descs = true; } + if used_descs && let Some(out) = &mut self.out { + out.flush().map_err(Error::OutputFlush)?; + } + Ok(used_descs) } From 210514cbf38c0652eaebac0087671b395e0eb21e Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Sun, 22 Feb 2026 14:33:44 +0100 Subject: [PATCH 085/742] block: qcow: Extract QcowHeader and related types into header.rs Move QcowHeader, associated types, constants and helper functions into a new header.rs submodule. Public types are re-exported from mod.rs. No functional changes. Signed-off-by: Anatol Belski --- block/src/qcow/header.rs | 601 +++++++++++++++++++++++++++++++++++++++ block/src/qcow/mod.rs | 595 ++------------------------------------ 2 files changed, 618 insertions(+), 578 deletions(-) create mode 100644 block/src/qcow/header.rs diff --git a/block/src/qcow/header.rs b/block/src/qcow/header.rs new file mode 100644 index 0000000000..4549668418 --- /dev/null +++ b/block/src/qcow/header.rs @@ -0,0 +1,601 @@ +// Copyright 2018 The Chromium OS Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. +// +// Copyright 2026 The Cloud Hypervisor Authors. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +//! QCOW2 header parsing, validation, and creation. + +use std::fmt::{Display, Formatter, Result as FmtResult}; +use std::io::{Read, Seek, SeekFrom, Write}; +use std::mem::size_of; +use std::str::FromStr; + +use bitflags::bitflags; +use vmm_sys_util::file_traits::FileSync; + +use super::decoder::{Decoder, ZlibDecoder, ZstdDecoder}; +use super::qcow_raw_file::BeUint; +use super::raw_file::RawFile; +use super::{Error, Result, div_round_up_u32, div_round_up_u64}; + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub enum ImageType { + Raw, + Qcow2, +} + +impl Display for ImageType { + fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { + match self { + ImageType::Raw => write!(f, "raw"), + ImageType::Qcow2 => write!(f, "qcow2"), + } + } +} + +impl FromStr for ImageType { + type Err = Error; + + fn from_str(s: &str) -> Result { + match s { + "raw" => Ok(ImageType::Raw), + "qcow2" => Ok(ImageType::Qcow2), + _ => Err(Error::UnsupportedBackingFileFormat(s.to_string())), + } + } +} + +#[derive(Clone, Debug)] +pub enum CompressionType { + Zlib, + Zstd, +} + +#[derive(Debug, Clone)] +pub struct BackingFileConfig { + pub path: String, + // If this is None, we will autodetect it. + pub format: Option, +} + +// Maximum data size supported. +pub(super) const MAX_QCOW_FILE_SIZE: u64 = 0x01 << 44; // 16 TB. + +// QCOW magic constant that starts the header. +pub(super) const QCOW_MAGIC: u32 = 0x5146_49fb; +// Default to a cluster size of 2^DEFAULT_CLUSTER_BITS +pub(super) const DEFAULT_CLUSTER_BITS: u32 = 16; +// Limit clusters to reasonable sizes. Choose the same limits as qemu. Making the clusters smaller +// increases the amount of overhead for book keeping. 
+pub(super) const MIN_CLUSTER_BITS: u32 = 9; +pub(super) const MAX_CLUSTER_BITS: u32 = 21; +// The L1 and RefCount table are kept in RAM, only handle files that require less than 35M entries. +// This easily covers 1 TB files. When support for bigger files is needed the assumptions made to +// keep these tables in RAM needs to be thrown out. +pub(super) const MAX_RAM_POINTER_TABLE_SIZE: u64 = 35_000_000; +// 16-bit refcounts. +pub(super) const DEFAULT_REFCOUNT_ORDER: u32 = 4; + +pub(super) const V2_BARE_HEADER_SIZE: u32 = 72; +pub(super) const V3_BARE_HEADER_SIZE: u32 = 104; +pub(super) const AUTOCLEAR_FEATURES_OFFSET: u64 = 88; + +pub(super) const COMPATIBLE_FEATURES_LAZY_REFCOUNTS: u64 = 1; + +// Compression types as defined in https://www.qemu.org/docs/master/interop/qcow2.html +const COMPRESSION_TYPE_ZLIB: u64 = 0; // zlib/deflate +const COMPRESSION_TYPE_ZSTD: u64 = 1; // zstd + +// Header extension types +pub(super) const HEADER_EXT_END: u32 = 0x00000000; +// Backing file format name (raw, qcow2) +pub(super) const HEADER_EXT_BACKING_FORMAT: u32 = 0xe2792aca; +// Feature name table +const HEADER_EXT_FEATURE_NAME_TABLE: u32 = 0x6803f857; + +// Feature name table entry type incompatible +const FEAT_TYPE_INCOMPATIBLE: u8 = 0; + +bitflags! { + #[derive(Debug, Clone, Copy, PartialEq, Eq)] + pub struct IncompatFeatures: u64 { + const DIRTY = 1 << 0; + const CORRUPT = 1 << 1; + const DATA_FILE = 1 << 2; + const COMPRESSION = 1 << 3; + const EXTENDED_L2 = 1 << 4; + } +} + +impl IncompatFeatures { + /// Features supported by this implementation. + pub(super) const SUPPORTED: IncompatFeatures = IncompatFeatures::DIRTY + .union(IncompatFeatures::CORRUPT) + .union(IncompatFeatures::COMPRESSION); + + /// Get the fallback name for a known feature bit. 
+ fn flag_name(bit: u8) -> Option<&'static str> { + Some(match Self::from_bits_truncate(1u64 << bit) { + Self::DIRTY => "dirty bit", + Self::CORRUPT => "corrupt bit", + Self::DATA_FILE => "external data file", + Self::EXTENDED_L2 => "extended L2 entries", + _ => return None, + }) + } +} + +/// Error type for unsupported incompatible features. +#[derive(Debug, Clone, thiserror::Error)] +pub struct MissingFeatureError { + /// Unsupported feature bits. + features: IncompatFeatures, + /// Feature name table from the qcow2 image. + feature_names: Vec<(u8, String)>, +} + +impl MissingFeatureError { + pub(super) fn new(features: IncompatFeatures, feature_names: Vec<(u8, String)>) -> Self { + Self { + features, + feature_names, + } + } +} + +impl Display for MissingFeatureError { + fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { + let names: Vec = (0u8..64) + .filter(|&bit| self.features.bits() & (1u64 << bit) != 0) + .map(|bit| { + // First try the image's feature name table + self.feature_names + .iter() + .find(|(b, _)| *b == bit) + .map(|(_, name)| name.clone()) + // Then try hardcoded fallback names + .or_else(|| IncompatFeatures::flag_name(bit).map(|s| s.to_string())) + // Finally, use generic description + .unwrap_or_else(|| format!("unknown feature bit {bit}")) + }) + .collect(); + write!(f, "Missing features: {}", names.join(", ")) + } +} + +// The format supports a "header extension area", that crosvm does not use. +const QCOW_EMPTY_HEADER_EXTENSION_SIZE: u32 = 8; + +// Defined by the specification +const MAX_BACKING_FILE_SIZE: u32 = 1023; + +/// Contains the information from the header of a qcow file. 
+#[derive(Clone, Debug)] +pub struct QcowHeader { + pub magic: u32, + pub version: u32, + + pub backing_file_offset: u64, + pub backing_file_size: u32, + + pub cluster_bits: u32, + pub size: u64, + pub crypt_method: u32, + + pub l1_size: u32, + pub l1_table_offset: u64, + + pub refcount_table_offset: u64, + pub refcount_table_clusters: u32, + + pub nb_snapshots: u32, + pub snapshots_offset: u64, + + // v3 entries + pub incompatible_features: u64, + pub compatible_features: u64, + pub autoclear_features: u64, + pub refcount_order: u32, + pub header_size: u32, + pub compression_type: CompressionType, + + // Post-header entries + pub backing_file: Option, +} + +impl QcowHeader { + /// Read header extensions, optionally collecting feature names for error reporting. + pub(super) fn read_header_extensions( + f: &mut RawFile, + header: &mut QcowHeader, + mut feature_table: Option<&mut Vec<(u8, String)>>, + ) -> Result<()> { + // Extensions start directly after the header + f.seek(SeekFrom::Start(header.header_size as u64)) + .map_err(Error::ReadingHeader)?; + + loop { + let ext_type = u32::read_be(f).map_err(Error::ReadingHeader)?; + if ext_type == HEADER_EXT_END { + break; + } + + let ext_length = u32::read_be(f).map_err(Error::ReadingHeader)?; + + match ext_type { + HEADER_EXT_BACKING_FORMAT => { + let mut format_bytes = vec![0u8; ext_length as usize]; + f.read_exact(&mut format_bytes) + .map_err(Error::ReadingHeader)?; + let format_str = String::from_utf8(format_bytes) + .map_err(|err| Error::InvalidBackingFileName(err.utf8_error()))?; + if let Some(backing_file) = &mut header.backing_file { + backing_file.format = Some(format_str.parse()?); + } + } + HEADER_EXT_FEATURE_NAME_TABLE if feature_table.is_some() => { + const FEATURE_NAME_ENTRY_SIZE: usize = 1 + 1 + 46; // type + bit + name + let mut data = vec![0u8; ext_length as usize]; + f.read_exact(&mut data).map_err(Error::ReadingHeader)?; + let table = feature_table.as_mut().unwrap(); + for entry in 
data.chunks_exact(FEATURE_NAME_ENTRY_SIZE) { + if entry[0] == FEAT_TYPE_INCOMPATIBLE { + let bit_number = entry[1]; + let name_bytes = &entry[2..]; + let name_len = name_bytes.iter().position(|&b| b == 0).unwrap_or(46); + let name = String::from_utf8_lossy(&name_bytes[..name_len]).to_string(); + table.push((bit_number, name)); + } + } + } + _ => { + // Skip unknown extension + f.seek(SeekFrom::Current(ext_length as i64)) + .map_err(Error::ReadingHeader)?; + } + } + + // Skip to the next 8 byte boundary + let padding = (8 - (ext_length % 8)) % 8; + f.seek(SeekFrom::Current(padding as i64)) + .map_err(Error::ReadingHeader)?; + } + + Ok(()) + } + + /// Creates a QcowHeader from a reference to a file. + pub fn new(f: &mut RawFile) -> Result { + f.rewind().map_err(Error::ReadingHeader)?; + let magic = u32::read_be(f).map_err(Error::ReadingHeader)?; + if magic != QCOW_MAGIC { + return Err(Error::InvalidMagic); + } + + // Reads the next u32 from the file. + fn read_u32_be(f: &mut RawFile) -> Result { + u32::read_be(f).map_err(Error::ReadingHeader) + } + + // Reads the next u64 from the file. + fn read_u64_be(f: &mut RawFile) -> Result { + u64::read_be(f).map_err(Error::ReadingHeader) + } + + let version = read_u32_be(f)?; + + let mut header = QcowHeader { + magic, + version, + backing_file_offset: read_u64_be(f)?, + backing_file_size: read_u32_be(f)?, + cluster_bits: read_u32_be(f)?, + size: read_u64_be(f)?, + crypt_method: read_u32_be(f)?, + l1_size: read_u32_be(f)?, + l1_table_offset: read_u64_be(f)?, + refcount_table_offset: read_u64_be(f)?, + refcount_table_clusters: read_u32_be(f)?, + nb_snapshots: read_u32_be(f)?, + snapshots_offset: read_u64_be(f)?, + incompatible_features: if version == 2 { 0 } else { read_u64_be(f)? }, + compatible_features: if version == 2 { 0 } else { read_u64_be(f)? }, + autoclear_features: if version == 2 { 0 } else { read_u64_be(f)? }, + refcount_order: if version == 2 { + DEFAULT_REFCOUNT_ORDER + } else { + read_u32_be(f)? 
+ }, + header_size: if version == 2 { + V2_BARE_HEADER_SIZE + } else { + read_u32_be(f)? + }, + compression_type: CompressionType::Zlib, + backing_file: None, + }; + if version == 3 && header.header_size > V3_BARE_HEADER_SIZE { + let raw_compression_type = read_u64_be(f)? >> (64 - 8); + header.compression_type = if raw_compression_type == COMPRESSION_TYPE_ZLIB { + Ok(CompressionType::Zlib) + } else if raw_compression_type == COMPRESSION_TYPE_ZSTD { + Ok(CompressionType::Zstd) + } else { + Err(Error::UnsupportedCompressionType) + }?; + } + if header.backing_file_size > MAX_BACKING_FILE_SIZE { + return Err(Error::BackingFileTooLong(header.backing_file_size as usize)); + } + if header.backing_file_offset != 0 { + f.seek(SeekFrom::Start(header.backing_file_offset)) + .map_err(Error::ReadingHeader)?; + let mut backing_file_name_bytes = vec![0u8; header.backing_file_size as usize]; + f.read_exact(&mut backing_file_name_bytes) + .map_err(Error::ReadingHeader)?; + let path = String::from_utf8(backing_file_name_bytes) + .map_err(|err| Error::InvalidBackingFileName(err.utf8_error()))?; + header.backing_file = Some(BackingFileConfig { path, format: None }); + } + + if version == 3 { + // Check for unsupported incompatible features first + let features = IncompatFeatures::from_bits_retain(header.incompatible_features); + let unsupported = features - IncompatFeatures::SUPPORTED; + if !unsupported.is_empty() { + // Read extensions only to get feature names for error reporting + let mut feature_table = Vec::new(); + if header.header_size > V3_BARE_HEADER_SIZE { + let _ = Self::read_header_extensions(f, &mut header, Some(&mut feature_table)); + } + return Err(Error::UnsupportedFeature(MissingFeatureError::new( + unsupported, + feature_table, + ))); + } + + // Features OK, now read extensions normally + if header.header_size > V3_BARE_HEADER_SIZE { + Self::read_header_extensions(f, &mut header, None)?; + } + } + + Ok(header) + } + + pub fn get_decoder(&self) -> Box { + match 
self.compression_type { + CompressionType::Zlib => Box::new(ZlibDecoder {}), + CompressionType::Zstd => Box::new(ZstdDecoder {}), + } + } + + pub fn create_for_size_and_path( + version: u32, + size: u64, + backing_file: Option<&str>, + ) -> Result { + let header_size = if version == 2 { + V2_BARE_HEADER_SIZE + } else { + V3_BARE_HEADER_SIZE + QCOW_EMPTY_HEADER_EXTENSION_SIZE + }; + let cluster_bits: u32 = DEFAULT_CLUSTER_BITS; + let cluster_size: u32 = 0x01 << cluster_bits; + let max_length: usize = (cluster_size - header_size) as usize; + if let Some(path) = backing_file + && path.len() > max_length + { + return Err(Error::BackingFileTooLong(path.len() - max_length)); + } + + // L2 blocks are always one cluster long. They contain cluster_size/sizeof(u64) addresses. + let entries_per_cluster: u32 = cluster_size / size_of::() as u32; + let num_clusters: u32 = div_round_up_u64(size, u64::from(cluster_size)) as u32; + let num_l2_clusters: u32 = div_round_up_u32(num_clusters, entries_per_cluster); + let l1_clusters: u32 = div_round_up_u32(num_l2_clusters, entries_per_cluster); + let header_clusters = div_round_up_u32(size_of::() as u32, cluster_size); + Ok(QcowHeader { + magic: QCOW_MAGIC, + version, + backing_file_offset: backing_file.map_or(0, |_| { + header_size + + if version == 3 { + QCOW_EMPTY_HEADER_EXTENSION_SIZE + } else { + 0 + } + }) as u64, + backing_file_size: backing_file.map_or(0, |x| x.len()) as u32, + cluster_bits: DEFAULT_CLUSTER_BITS, + size, + crypt_method: 0, + l1_size: num_l2_clusters, + l1_table_offset: u64::from(cluster_size), + // The refcount table is after l1 + header. + refcount_table_offset: u64::from(cluster_size * (l1_clusters + 1)), + refcount_table_clusters: { + // Pre-allocate enough clusters for the entire refcount table as it must be + // continuous in the file. Allocate enough space to refcount all clusters, including + // the refcount clusters. 
+ let max_refcount_clusters = max_refcount_clusters( + DEFAULT_REFCOUNT_ORDER, + cluster_size, + num_clusters + l1_clusters + num_l2_clusters + header_clusters, + ) as u32; + // The refcount table needs to store the offset of each refcount cluster. + div_round_up_u32( + max_refcount_clusters * size_of::() as u32, + cluster_size, + ) + }, + nb_snapshots: 0, + snapshots_offset: 0, + incompatible_features: 0, + compatible_features: 0, + autoclear_features: 0, + refcount_order: DEFAULT_REFCOUNT_ORDER, + header_size, + compression_type: CompressionType::Zlib, + backing_file: backing_file.map(|path| BackingFileConfig { + path: String::from(path), + format: None, + }), + }) + } + + /// Write the header to `file`. + pub fn write_to(&self, file: &mut F) -> Result<()> { + // Writes the next u32 to the file. + fn write_u32_be(f: &mut F, value: u32) -> Result<()> { + u32::write_be(f, value).map_err(Error::WritingHeader) + } + + // Writes the next u64 to the file. + fn write_u64_be(f: &mut F, value: u64) -> Result<()> { + u64::write_be(f, value).map_err(Error::WritingHeader) + } + + write_u32_be(file, self.magic)?; + write_u32_be(file, self.version)?; + write_u64_be(file, self.backing_file_offset)?; + write_u32_be(file, self.backing_file_size)?; + write_u32_be(file, self.cluster_bits)?; + write_u64_be(file, self.size)?; + write_u32_be(file, self.crypt_method)?; + write_u32_be(file, self.l1_size)?; + write_u64_be(file, self.l1_table_offset)?; + write_u64_be(file, self.refcount_table_offset)?; + write_u32_be(file, self.refcount_table_clusters)?; + write_u32_be(file, self.nb_snapshots)?; + write_u64_be(file, self.snapshots_offset)?; + + if self.version == 3 { + write_u64_be(file, self.incompatible_features)?; + write_u64_be(file, self.compatible_features)?; + write_u64_be(file, self.autoclear_features)?; + write_u32_be(file, self.refcount_order)?; + write_u32_be(file, self.header_size)?; + + if self.header_size > V3_BARE_HEADER_SIZE { + write_u64_be(file, 0)?; // no compression + 
} + + write_u32_be(file, 0)?; // header extension type: end of header extension area + write_u32_be(file, 0)?; // length of header extension data: 0 + } + + if let Some(backing_file_path) = self.backing_file.as_ref().map(|bf| &bf.path) { + if self.backing_file_offset > 0 { + file.seek(SeekFrom::Start(self.backing_file_offset)) + .map_err(Error::WritingHeader)?; + } + write!(file, "{backing_file_path}").map_err(Error::WritingHeader)?; + } + + // Set the file length by seeking and writing a zero to the last byte. This avoids needing + // a `File` instead of anything that implements seek as the `file` argument. + // Zeros out the l1 and refcount table clusters. + let cluster_size = 0x01u64 << self.cluster_bits; + let refcount_blocks_size = u64::from(self.refcount_table_clusters) * cluster_size; + file.seek(SeekFrom::Start( + self.refcount_table_offset + refcount_blocks_size - 2, + )) + .map_err(Error::WritingHeader)?; + file.write(&[0u8]).map_err(Error::WritingHeader)?; + + Ok(()) + } + + /// Write only the incompatible_features field to the file at its fixed offset. + fn write_incompatible_features(&self, file: &mut F) -> Result<()> { + if self.version != 3 { + return Ok(()); + } + file.seek(SeekFrom::Start(V2_BARE_HEADER_SIZE as u64)) + .map_err(Error::WritingHeader)?; + u64::write_be(file, self.incompatible_features).map_err(Error::WritingHeader)?; + Ok(()) + } + + /// Set or clear the dirty bit for QCOW2 v3 images. + /// + /// When `dirty` is true, sets the bit to indicate the image is in use. + /// When `dirty` is false, clears the bit to indicate a clean shutdown. 
+ pub fn set_dirty_bit( + &mut self, + file: &mut F, + dirty: bool, + ) -> Result<()> { + if self.version == 3 { + if dirty { + self.incompatible_features |= IncompatFeatures::DIRTY.bits(); + } else { + self.incompatible_features &= !IncompatFeatures::DIRTY.bits(); + } + self.write_incompatible_features(file)?; + file.fsync().map_err(Error::SyncingHeader)?; + } + Ok(()) + } + + /// Set the corrupt bit for QCOW2 v3 images. + /// + /// This marks the image as corrupted. Once set, the image can only be + /// opened read-only until repaired. + pub fn set_corrupt_bit(&mut self, file: &mut F) -> Result<()> { + if self.version == 3 { + self.incompatible_features |= IncompatFeatures::CORRUPT.bits(); + self.write_incompatible_features(file)?; + file.fsync().map_err(Error::SyncingHeader)?; + } + Ok(()) + } + + pub fn is_corrupt(&self) -> bool { + IncompatFeatures::from_bits_truncate(self.incompatible_features) + .contains(IncompatFeatures::CORRUPT) + } + + /// Clear all autoclear feature bits for QCOW2 v3 images. + /// + /// These bits indicate features that can be safely disabled when modified + /// by software that doesn't understand them. + pub fn clear_autoclear_features( + &mut self, + file: &mut F, + ) -> Result<()> { + if self.version == 3 && self.autoclear_features != 0 { + self.autoclear_features = 0; + file.seek(SeekFrom::Start(AUTOCLEAR_FEATURES_OFFSET)) + .map_err(Error::WritingHeader)?; + u64::write_be(file, 0).map_err(Error::WritingHeader)?; + file.fsync().map_err(Error::SyncingHeader)?; + } + Ok(()) + } +} + +pub(super) fn max_refcount_clusters( + refcount_order: u32, + cluster_size: u32, + num_clusters: u32, +) -> u64 { + // Use u64 as the product of the u32 inputs can overflow. 
+ let refcount_bits = 0x01u64 << u64::from(refcount_order); + let cluster_bits = u64::from(cluster_size) * 8; + let for_data = div_round_up_u64(u64::from(num_clusters) * refcount_bits, cluster_bits); + let for_refcounts = div_round_up_u64(for_data * refcount_bits, cluster_bits); + for_data + for_refcounts +} + +/// Returns an Error if the given offset doesn't align to a cluster boundary. +pub(super) fn offset_is_cluster_boundary(offset: u64, cluster_bits: u32) -> Result<()> { + if offset & ((0x01 << cluster_bits) - 1) != 0 { + return Err(Error::InvalidOffset(offset)); + } + Ok(()) +} diff --git a/block/src/qcow/mod.rs b/block/src/qcow/mod.rs index fd932406c0..dd0f154f61 100644 --- a/block/src/qcow/mod.rs +++ b/block/src/qcow/mod.rs @@ -5,20 +5,34 @@ // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause mod decoder; +mod header; mod qcow_raw_file; mod raw_file; mod refcount; mod vec_cache; use std::cmp::{max, min}; -use std::fmt::{Debug, Display, Formatter, Result as FmtResult}; +use std::fmt::{Debug, Formatter, Result as FmtResult}; use std::fs::{OpenOptions, read_link}; use std::io::{self, Read, Seek, SeekFrom, Write}; use std::mem::size_of; use std::os::fd::{AsRawFd, RawFd}; -use std::str::{self, FromStr}; +use std::str; -use bitflags::bitflags; +#[cfg(test)] +use header::{ + AUTOCLEAR_FEATURES_OFFSET, DEFAULT_REFCOUNT_ORDER, HEADER_EXT_BACKING_FORMAT, HEADER_EXT_END, + V2_BARE_HEADER_SIZE, V3_BARE_HEADER_SIZE, +}; +pub use header::{ + BackingFileConfig, CompressionType, ImageType, IncompatFeatures, MissingFeatureError, + QcowHeader, +}; +use header::{ + COMPATIBLE_FEATURES_LAZY_REFCOUNTS, MAX_CLUSTER_BITS, MAX_QCOW_FILE_SIZE, + MAX_RAM_POINTER_TABLE_SIZE, MIN_CLUSTER_BITS, QCOW_MAGIC, max_refcount_clusters, + offset_is_cluster_boundary, +}; use libc::{EINVAL, EIO, ENOSPC}; use log::{error, warn}; use remain::sorted; @@ -28,7 +42,6 @@ use vmm_sys_util::seek_hole::SeekHole; use vmm_sys_util::write_zeroes::{PunchHole, WriteZeroesAt}; use crate::BlockBackend; -use 
crate::qcow::decoder::{Decoder, ZlibDecoder, ZstdDecoder}; use crate::qcow::qcow_raw_file::{BeUint, QcowRawFile}; pub use crate::qcow::raw_file::RawFile; use crate::qcow::refcount::RefCount; @@ -144,68 +157,6 @@ pub enum Error { pub type Result = std::result::Result; -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -pub enum ImageType { - Raw, - Qcow2, -} - -impl Display for ImageType { - fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - match self { - ImageType::Raw => write!(f, "raw"), - ImageType::Qcow2 => write!(f, "qcow2"), - } - } -} - -impl FromStr for ImageType { - type Err = Error; - - fn from_str(s: &str) -> Result { - match s { - "raw" => Ok(ImageType::Raw), - "qcow2" => Ok(ImageType::Qcow2), - _ => Err(Error::UnsupportedBackingFileFormat(s.to_string())), - } - } -} - -#[derive(Clone, Debug)] -pub enum CompressionType { - Zlib, - Zstd, -} - -#[derive(Debug, Clone)] -pub struct BackingFileConfig { - pub path: String, - // If this is None, we will autodetect it. - pub format: Option, -} - -// Maximum data size supported. -const MAX_QCOW_FILE_SIZE: u64 = 0x01 << 44; // 16 TB. - -// QCOW magic constant that starts the header. -const QCOW_MAGIC: u32 = 0x5146_49fb; -// Default to a cluster size of 2^DEFAULT_CLUSTER_BITS -const DEFAULT_CLUSTER_BITS: u32 = 16; -// Limit clusters to reasonable sizes. Choose the same limits as qemu. Making the clusters smaller -// increases the amount of overhead for book keeping. -const MIN_CLUSTER_BITS: u32 = 9; -const MAX_CLUSTER_BITS: u32 = 21; -// The L1 and RefCount table are kept in RAM, only handle files that require less than 35M entries. -// This easily covers 1 TB files. When support for bigger files is needed the assumptions made to -// keep these tables in RAM needs to be thrown out. -const MAX_RAM_POINTER_TABLE_SIZE: u64 = 35_000_000; -// 16-bit refcounts. 
-const DEFAULT_REFCOUNT_ORDER: u32 = 4; - -const V2_BARE_HEADER_SIZE: u32 = 72; -const V3_BARE_HEADER_SIZE: u32 = 104; -const AUTOCLEAR_FEATURES_OFFSET: u64 = 88; - // bits 0-8 and 56-63 are reserved. const L1_TABLE_OFFSET_MASK: u64 = 0x00ff_ffff_ffff_fe00; const L2_TABLE_OFFSET_MASK: u64 = 0x00ff_ffff_ffff_fe00; @@ -214,94 +165,6 @@ const ZERO_FLAG: u64 = 1 << 0; const COMPRESSED_FLAG: u64 = 1 << 62; const COMPRESSED_SECTOR_SIZE: u64 = 512; const CLUSTER_USED_FLAG: u64 = 1 << 63; -const COMPATIBLE_FEATURES_LAZY_REFCOUNTS: u64 = 1; - -// Compression types as defined in https://www.qemu.org/docs/master/interop/qcow2.html -const COMPRESSION_TYPE_ZLIB: u64 = 0; // zlib/deflate -const COMPRESSION_TYPE_ZSTD: u64 = 1; // zstd - -// Header extension types -const HEADER_EXT_END: u32 = 0x00000000; -// Backing file format name (raw, qcow2) -const HEADER_EXT_BACKING_FORMAT: u32 = 0xe2792aca; -// Feature name table -const HEADER_EXT_FEATURE_NAME_TABLE: u32 = 0x6803f857; - -// Feature name table entry type incompatible -const FEAT_TYPE_INCOMPATIBLE: u8 = 0; - -bitflags! { - #[derive(Debug, Clone, Copy, PartialEq, Eq)] - pub struct IncompatFeatures: u64 { - const DIRTY = 1 << 0; - const CORRUPT = 1 << 1; - const DATA_FILE = 1 << 2; - const COMPRESSION = 1 << 3; - const EXTENDED_L2 = 1 << 4; - } -} - -impl IncompatFeatures { - /// Features supported by this implementation. - const SUPPORTED: IncompatFeatures = IncompatFeatures::DIRTY - .union(IncompatFeatures::CORRUPT) - .union(IncompatFeatures::COMPRESSION); - - /// Get the fallback name for a known feature bit. - fn flag_name(bit: u8) -> Option<&'static str> { - Some(match Self::from_bits_truncate(1u64 << bit) { - Self::DIRTY => "dirty bit", - Self::CORRUPT => "corrupt bit", - Self::DATA_FILE => "external data file", - Self::EXTENDED_L2 => "extended L2 entries", - _ => return None, - }) - } -} - -/// Error type for unsupported incompatible features. 
-#[derive(Debug, Clone, Error)] -pub struct MissingFeatureError { - /// Unsupported feature bits. - features: IncompatFeatures, - /// Feature name table from the qcow2 image. - feature_names: Vec<(u8, String)>, -} - -impl MissingFeatureError { - fn new(features: IncompatFeatures, feature_names: Vec<(u8, String)>) -> Self { - Self { - features, - feature_names, - } - } -} - -impl Display for MissingFeatureError { - fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - let names: Vec = (0u8..64) - .filter(|&bit| self.features.bits() & (1u64 << bit) != 0) - .map(|bit| { - // First try the image's feature name table - self.feature_names - .iter() - .find(|(b, _)| *b == bit) - .map(|(_, name)| name.clone()) - // Then try hardcoded fallback names - .or_else(|| IncompatFeatures::flag_name(bit).map(|s| s.to_string())) - // Finally, use generic description - .unwrap_or_else(|| format!("unknown feature bit {bit}")) - }) - .collect(); - write!(f, "Missing features: {}", names.join(", ")) - } -} - -// The format supports a "header extension area", that crosvm does not use. -const QCOW_EMPTY_HEADER_EXTENSION_SIZE: u32 = 8; - -// Defined by the specification -const MAX_BACKING_FILE_SIZE: u32 = 1023; fn l2_entry_is_empty(l2_entry: u64) -> bool { l2_entry == 0 @@ -348,422 +211,6 @@ fn l1_entry_make(cluster_addr: u64, refcount_is_one: bool) -> u64 { (cluster_addr & L1_TABLE_OFFSET_MASK) | (refcount_is_one as u64 * CLUSTER_USED_FLAG) } -/// Contains the information from the header of a qcow file. 
-#[derive(Clone, Debug)] -pub struct QcowHeader { - pub magic: u32, - pub version: u32, - - pub backing_file_offset: u64, - pub backing_file_size: u32, - - pub cluster_bits: u32, - pub size: u64, - pub crypt_method: u32, - - pub l1_size: u32, - pub l1_table_offset: u64, - - pub refcount_table_offset: u64, - pub refcount_table_clusters: u32, - - pub nb_snapshots: u32, - pub snapshots_offset: u64, - - // v3 entries - pub incompatible_features: u64, - pub compatible_features: u64, - pub autoclear_features: u64, - pub refcount_order: u32, - pub header_size: u32, - pub compression_type: CompressionType, - - // Post-header entries - pub backing_file: Option, -} - -impl QcowHeader { - /// Read header extensions, optionally collecting feature names for error reporting. - fn read_header_extensions( - f: &mut RawFile, - header: &mut QcowHeader, - mut feature_table: Option<&mut Vec<(u8, String)>>, - ) -> Result<()> { - // Extensions start directly after the header - f.seek(SeekFrom::Start(header.header_size as u64)) - .map_err(Error::ReadingHeader)?; - - loop { - let ext_type = u32::read_be(f).map_err(Error::ReadingHeader)?; - if ext_type == HEADER_EXT_END { - break; - } - - let ext_length = u32::read_be(f).map_err(Error::ReadingHeader)?; - - match ext_type { - HEADER_EXT_BACKING_FORMAT => { - let mut format_bytes = vec![0u8; ext_length as usize]; - f.read_exact(&mut format_bytes) - .map_err(Error::ReadingHeader)?; - let format_str = String::from_utf8(format_bytes) - .map_err(|err| Error::InvalidBackingFileName(err.utf8_error()))?; - if let Some(backing_file) = &mut header.backing_file { - backing_file.format = Some(format_str.parse()?); - } - } - HEADER_EXT_FEATURE_NAME_TABLE if feature_table.is_some() => { - const FEATURE_NAME_ENTRY_SIZE: usize = 1 + 1 + 46; // type + bit + name - let mut data = vec![0u8; ext_length as usize]; - f.read_exact(&mut data).map_err(Error::ReadingHeader)?; - let table = feature_table.as_mut().unwrap(); - for entry in 
data.chunks_exact(FEATURE_NAME_ENTRY_SIZE) { - if entry[0] == FEAT_TYPE_INCOMPATIBLE { - let bit_number = entry[1]; - let name_bytes = &entry[2..]; - let name_len = name_bytes.iter().position(|&b| b == 0).unwrap_or(46); - let name = String::from_utf8_lossy(&name_bytes[..name_len]).to_string(); - table.push((bit_number, name)); - } - } - } - _ => { - // Skip unknown extension - f.seek(SeekFrom::Current(ext_length as i64)) - .map_err(Error::ReadingHeader)?; - } - } - - // Skip to the next 8 byte boundary - let padding = (8 - (ext_length % 8)) % 8; - f.seek(SeekFrom::Current(padding as i64)) - .map_err(Error::ReadingHeader)?; - } - - Ok(()) - } - - /// Creates a QcowHeader from a reference to a file. - pub fn new(f: &mut RawFile) -> Result { - f.rewind().map_err(Error::ReadingHeader)?; - let magic = u32::read_be(f).map_err(Error::ReadingHeader)?; - if magic != QCOW_MAGIC { - return Err(Error::InvalidMagic); - } - - // Reads the next u32 from the file. - fn read_u32_be(f: &mut RawFile) -> Result { - u32::read_be(f).map_err(Error::ReadingHeader) - } - - // Reads the next u64 from the file. - fn read_u64_be(f: &mut RawFile) -> Result { - u64::read_be(f).map_err(Error::ReadingHeader) - } - - let version = read_u32_be(f)?; - - let mut header = QcowHeader { - magic, - version, - backing_file_offset: read_u64_be(f)?, - backing_file_size: read_u32_be(f)?, - cluster_bits: read_u32_be(f)?, - size: read_u64_be(f)?, - crypt_method: read_u32_be(f)?, - l1_size: read_u32_be(f)?, - l1_table_offset: read_u64_be(f)?, - refcount_table_offset: read_u64_be(f)?, - refcount_table_clusters: read_u32_be(f)?, - nb_snapshots: read_u32_be(f)?, - snapshots_offset: read_u64_be(f)?, - incompatible_features: if version == 2 { 0 } else { read_u64_be(f)? }, - compatible_features: if version == 2 { 0 } else { read_u64_be(f)? }, - autoclear_features: if version == 2 { 0 } else { read_u64_be(f)? }, - refcount_order: if version == 2 { - DEFAULT_REFCOUNT_ORDER - } else { - read_u32_be(f)? 
- }, - header_size: if version == 2 { - V2_BARE_HEADER_SIZE - } else { - read_u32_be(f)? - }, - compression_type: CompressionType::Zlib, - backing_file: None, - }; - if version == 3 && header.header_size > V3_BARE_HEADER_SIZE { - let raw_compression_type = read_u64_be(f)? >> (64 - 8); - header.compression_type = if raw_compression_type == COMPRESSION_TYPE_ZLIB { - Ok(CompressionType::Zlib) - } else if raw_compression_type == COMPRESSION_TYPE_ZSTD { - Ok(CompressionType::Zstd) - } else { - Err(Error::UnsupportedCompressionType) - }?; - } - if header.backing_file_size > MAX_BACKING_FILE_SIZE { - return Err(Error::BackingFileTooLong(header.backing_file_size as usize)); - } - if header.backing_file_offset != 0 { - f.seek(SeekFrom::Start(header.backing_file_offset)) - .map_err(Error::ReadingHeader)?; - let mut backing_file_name_bytes = vec![0u8; header.backing_file_size as usize]; - f.read_exact(&mut backing_file_name_bytes) - .map_err(Error::ReadingHeader)?; - let path = String::from_utf8(backing_file_name_bytes) - .map_err(|err| Error::InvalidBackingFileName(err.utf8_error()))?; - header.backing_file = Some(BackingFileConfig { path, format: None }); - } - - if version == 3 { - // Check for unsupported incompatible features first - let features = IncompatFeatures::from_bits_retain(header.incompatible_features); - let unsupported = features - IncompatFeatures::SUPPORTED; - if !unsupported.is_empty() { - // Read extensions only to get feature names for error reporting - let mut feature_table = Vec::new(); - if header.header_size > V3_BARE_HEADER_SIZE { - let _ = Self::read_header_extensions(f, &mut header, Some(&mut feature_table)); - } - return Err(Error::UnsupportedFeature(MissingFeatureError::new( - unsupported, - feature_table, - ))); - } - - // Features OK, now read extensions normally - if header.header_size > V3_BARE_HEADER_SIZE { - Self::read_header_extensions(f, &mut header, None)?; - } - } - - Ok(header) - } - - pub fn get_decoder(&self) -> Box { - match 
self.compression_type { - CompressionType::Zlib => Box::new(ZlibDecoder {}), - CompressionType::Zstd => Box::new(ZstdDecoder {}), - } - } - - pub fn create_for_size_and_path( - version: u32, - size: u64, - backing_file: Option<&str>, - ) -> Result { - let header_size = if version == 2 { - V2_BARE_HEADER_SIZE - } else { - V3_BARE_HEADER_SIZE + QCOW_EMPTY_HEADER_EXTENSION_SIZE - }; - let cluster_bits: u32 = DEFAULT_CLUSTER_BITS; - let cluster_size: u32 = 0x01 << cluster_bits; - let max_length: usize = (cluster_size - header_size) as usize; - if let Some(path) = backing_file - && path.len() > max_length - { - return Err(Error::BackingFileTooLong(path.len() - max_length)); - } - - // L2 blocks are always one cluster long. They contain cluster_size/sizeof(u64) addresses. - let entries_per_cluster: u32 = cluster_size / size_of::() as u32; - let num_clusters: u32 = div_round_up_u64(size, u64::from(cluster_size)) as u32; - let num_l2_clusters: u32 = div_round_up_u32(num_clusters, entries_per_cluster); - let l1_clusters: u32 = div_round_up_u32(num_l2_clusters, entries_per_cluster); - let header_clusters = div_round_up_u32(size_of::() as u32, cluster_size); - Ok(QcowHeader { - magic: QCOW_MAGIC, - version, - backing_file_offset: backing_file.map_or(0, |_| { - header_size - + if version == 3 { - QCOW_EMPTY_HEADER_EXTENSION_SIZE - } else { - 0 - } - }) as u64, - backing_file_size: backing_file.map_or(0, |x| x.len()) as u32, - cluster_bits: DEFAULT_CLUSTER_BITS, - size, - crypt_method: 0, - l1_size: num_l2_clusters, - l1_table_offset: u64::from(cluster_size), - // The refcount table is after l1 + header. - refcount_table_offset: u64::from(cluster_size * (l1_clusters + 1)), - refcount_table_clusters: { - // Pre-allocate enough clusters for the entire refcount table as it must be - // continuous in the file. Allocate enough space to refcount all clusters, including - // the refcount clusters. 
- let max_refcount_clusters = max_refcount_clusters( - DEFAULT_REFCOUNT_ORDER, - cluster_size, - num_clusters + l1_clusters + num_l2_clusters + header_clusters, - ) as u32; - // The refcount table needs to store the offset of each refcount cluster. - div_round_up_u32( - max_refcount_clusters * size_of::() as u32, - cluster_size, - ) - }, - nb_snapshots: 0, - snapshots_offset: 0, - incompatible_features: 0, - compatible_features: 0, - autoclear_features: 0, - refcount_order: DEFAULT_REFCOUNT_ORDER, - header_size, - compression_type: CompressionType::Zlib, - backing_file: backing_file.map(|path| BackingFileConfig { - path: String::from(path), - format: None, - }), - }) - } - - /// Write the header to `file`. - pub fn write_to(&self, file: &mut F) -> Result<()> { - // Writes the next u32 to the file. - fn write_u32_be(f: &mut F, value: u32) -> Result<()> { - u32::write_be(f, value).map_err(Error::WritingHeader) - } - - // Writes the next u64 to the file. - fn write_u64_be(f: &mut F, value: u64) -> Result<()> { - u64::write_be(f, value).map_err(Error::WritingHeader) - } - - write_u32_be(file, self.magic)?; - write_u32_be(file, self.version)?; - write_u64_be(file, self.backing_file_offset)?; - write_u32_be(file, self.backing_file_size)?; - write_u32_be(file, self.cluster_bits)?; - write_u64_be(file, self.size)?; - write_u32_be(file, self.crypt_method)?; - write_u32_be(file, self.l1_size)?; - write_u64_be(file, self.l1_table_offset)?; - write_u64_be(file, self.refcount_table_offset)?; - write_u32_be(file, self.refcount_table_clusters)?; - write_u32_be(file, self.nb_snapshots)?; - write_u64_be(file, self.snapshots_offset)?; - - if self.version == 3 { - write_u64_be(file, self.incompatible_features)?; - write_u64_be(file, self.compatible_features)?; - write_u64_be(file, self.autoclear_features)?; - write_u32_be(file, self.refcount_order)?; - write_u32_be(file, self.header_size)?; - - if self.header_size > V3_BARE_HEADER_SIZE { - write_u64_be(file, 0)?; // no compression - 
} - - write_u32_be(file, 0)?; // header extension type: end of header extension area - write_u32_be(file, 0)?; // length of header extension data: 0 - } - - if let Some(backing_file_path) = self.backing_file.as_ref().map(|bf| &bf.path) { - if self.backing_file_offset > 0 { - file.seek(SeekFrom::Start(self.backing_file_offset)) - .map_err(Error::WritingHeader)?; - } - write!(file, "{backing_file_path}").map_err(Error::WritingHeader)?; - } - - // Set the file length by seeking and writing a zero to the last byte. This avoids needing - // a `File` instead of anything that implements seek as the `file` argument. - // Zeros out the l1 and refcount table clusters. - let cluster_size = 0x01u64 << self.cluster_bits; - let refcount_blocks_size = u64::from(self.refcount_table_clusters) * cluster_size; - file.seek(SeekFrom::Start( - self.refcount_table_offset + refcount_blocks_size - 2, - )) - .map_err(Error::WritingHeader)?; - file.write(&[0u8]).map_err(Error::WritingHeader)?; - - Ok(()) - } - - /// Write only the incompatible_features field to the file at its fixed offset. - fn write_incompatible_features(&self, file: &mut F) -> Result<()> { - if self.version != 3 { - return Ok(()); - } - file.seek(SeekFrom::Start(V2_BARE_HEADER_SIZE as u64)) - .map_err(Error::WritingHeader)?; - u64::write_be(file, self.incompatible_features).map_err(Error::WritingHeader)?; - Ok(()) - } - - /// Set or clear the dirty bit for QCOW2 v3 images. - /// - /// When `dirty` is true, sets the bit to indicate the image is in use. - /// When `dirty` is false, clears the bit to indicate a clean shutdown. 
- pub fn set_dirty_bit( - &mut self, - file: &mut F, - dirty: bool, - ) -> Result<()> { - if self.version == 3 { - if dirty { - self.incompatible_features |= IncompatFeatures::DIRTY.bits(); - } else { - self.incompatible_features &= !IncompatFeatures::DIRTY.bits(); - } - self.write_incompatible_features(file)?; - file.fsync().map_err(Error::SyncingHeader)?; - } - Ok(()) - } - - /// Set the corrupt bit for QCOW2 v3 images. - /// - /// This marks the image as corrupted. Once set, the image can only be - /// opened read-only until repaired. - pub fn set_corrupt_bit(&mut self, file: &mut F) -> Result<()> { - if self.version == 3 { - self.incompatible_features |= IncompatFeatures::CORRUPT.bits(); - self.write_incompatible_features(file)?; - file.fsync().map_err(Error::SyncingHeader)?; - } - Ok(()) - } - - pub fn is_corrupt(&self) -> bool { - IncompatFeatures::from_bits_truncate(self.incompatible_features) - .contains(IncompatFeatures::CORRUPT) - } - - /// Clear all autoclear feature bits for QCOW2 v3 images. - /// - /// These bits indicate features that can be safely disabled when modified - /// by software that doesn't understand them. - pub fn clear_autoclear_features( - &mut self, - file: &mut F, - ) -> Result<()> { - if self.version == 3 && self.autoclear_features != 0 { - self.autoclear_features = 0; - file.seek(SeekFrom::Start(AUTOCLEAR_FEATURES_OFFSET)) - .map_err(Error::WritingHeader)?; - u64::write_be(file, 0).map_err(Error::WritingHeader)?; - file.fsync().map_err(Error::SyncingHeader)?; - } - Ok(()) - } -} - -fn max_refcount_clusters(refcount_order: u32, cluster_size: u32, num_clusters: u32) -> u64 { - // Use u64 as the product of the u32 inputs can overflow. 
- let refcount_bits = 0x01u64 << u64::from(refcount_order); - let cluster_bits = u64::from(cluster_size) * 8; - let for_data = div_round_up_u64(u64::from(num_clusters) * refcount_bits, cluster_bits); - let for_refcounts = div_round_up_u64(for_data * refcount_bits, cluster_bits); - for_data + for_refcounts -} - trait BackingFileOps: Send + Seek + Read { fn read_at(&mut self, address: u64, buf: &mut [u8]) -> std::io::Result<()> { self.seek(SeekFrom::Start(address))?; @@ -2561,14 +2008,6 @@ impl BlockBackend for QcowFile { } } -// Returns an Error if the given offset doesn't align to a cluster boundary. -fn offset_is_cluster_boundary(offset: u64, cluster_bits: u32) -> Result<()> { - if offset & ((0x01 << cluster_bits) - 1) != 0 { - return Err(Error::InvalidOffset(offset)); - } - Ok(()) -} - // Ceiling of the division of `dividend`/`divisor`. fn div_round_up_u64(dividend: u64, divisor: u64) -> u64 { dividend / divisor + u64::from(!dividend.is_multiple_of(divisor)) From 63db385c3c93c7a1ed44e5405315cc769396f5fc Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Sun, 22 Feb 2026 13:07:13 +0100 Subject: [PATCH 086/742] block: qcow: Extract utility functions into util.rs Move L1 and L2 table entry helpers, division utilities and related constants from mod.rs into a dedicated util.rs submodule. Both mod.rs and metadata.rs import from util. No functional changes. 
Signed-off-by: Anatol Belski --- block/src/qcow/mod.rs | 75 +++++---------------------------------- block/src/qcow/util.rs | 79 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 67 deletions(-) create mode 100644 block/src/qcow/util.rs diff --git a/block/src/qcow/mod.rs b/block/src/qcow/mod.rs index dd0f154f61..83eb5a4bbc 100644 --- a/block/src/qcow/mod.rs +++ b/block/src/qcow/mod.rs @@ -9,6 +9,7 @@ mod header; mod qcow_raw_file; mod raw_file; mod refcount; +mod util; mod vec_cache; use std::cmp::{max, min}; @@ -37,6 +38,12 @@ use libc::{EINVAL, EIO, ENOSPC}; use log::{error, warn}; use remain::sorted; use thiserror::Error; +pub(crate) use util::MAX_NESTING_DEPTH; +use util::{ + L1_TABLE_OFFSET_MASK, L2_TABLE_OFFSET_MASK, div_round_up_u32, div_round_up_u64, l1_entry_make, + l2_entry_compressed_cluster_layout, l2_entry_is_compressed, l2_entry_is_empty, + l2_entry_is_zero, l2_entry_make_std, l2_entry_make_zero, l2_entry_std_cluster_addr, +}; use vmm_sys_util::file_traits::{FileSetLen, FileSync}; use vmm_sys_util::seek_hole::SeekHole; use vmm_sys_util::write_zeroes::{PunchHole, WriteZeroesAt}; @@ -47,9 +54,6 @@ pub use crate::qcow::raw_file::RawFile; use crate::qcow::refcount::RefCount; use crate::qcow::vec_cache::{CacheMap, Cacheable, VecCache}; -/// Nesting depth limit for disk formats that can open other disk files. -pub(super) const MAX_NESTING_DEPTH: u32 = 10; - #[sorted] #[derive(Debug, Error)] pub enum Error { @@ -157,60 +161,6 @@ pub enum Error { pub type Result = std::result::Result; -// bits 0-8 and 56-63 are reserved. -const L1_TABLE_OFFSET_MASK: u64 = 0x00ff_ffff_ffff_fe00; -const L2_TABLE_OFFSET_MASK: u64 = 0x00ff_ffff_ffff_fe00; -// Flags -const ZERO_FLAG: u64 = 1 << 0; -const COMPRESSED_FLAG: u64 = 1 << 62; -const COMPRESSED_SECTOR_SIZE: u64 = 512; -const CLUSTER_USED_FLAG: u64 = 1 << 63; - -fn l2_entry_is_empty(l2_entry: u64) -> bool { - l2_entry == 0 -} - -// Check bit 0 - only valid for standard clusters. 
-fn l2_entry_is_zero(l2_entry: u64) -> bool { - l2_entry & ZERO_FLAG != 0 -} - -fn l2_entry_is_compressed(l2_entry: u64) -> bool { - l2_entry & COMPRESSED_FLAG != 0 -} - -// Get file offset and size of compressed cluster data -fn l2_entry_compressed_cluster_layout(l2_entry: u64, cluster_bits: u32) -> (u64, usize) { - let compressed_size_shift = 62 - (cluster_bits - 8); - let compressed_size_mask = (1 << (cluster_bits - 8)) - 1; - let compressed_cluster_addr = l2_entry & ((1 << compressed_size_shift) - 1); - let nsectors = (l2_entry >> compressed_size_shift & compressed_size_mask) + 1; - let compressed_cluster_size = ((nsectors * COMPRESSED_SECTOR_SIZE) - - (compressed_cluster_addr & (COMPRESSED_SECTOR_SIZE - 1))) - as usize; - (compressed_cluster_addr, compressed_cluster_size) -} - -// Get file offset of standard (non-compressed) cluster -fn l2_entry_std_cluster_addr(l2_entry: u64) -> u64 { - l2_entry & L2_TABLE_OFFSET_MASK -} - -// Make L2 entry for standard (non-compressed) cluster -fn l2_entry_make_std(cluster_addr: u64) -> u64 { - (cluster_addr & L2_TABLE_OFFSET_MASK) | CLUSTER_USED_FLAG -} - -// Make L2 entry for preallocated zero cluster -fn l2_entry_make_zero(cluster_addr: u64) -> u64 { - (cluster_addr & L2_TABLE_OFFSET_MASK) | CLUSTER_USED_FLAG | ZERO_FLAG -} - -// Make L1 entry with optional flags -fn l1_entry_make(cluster_addr: u64, refcount_is_one: bool) -> u64 { - (cluster_addr & L1_TABLE_OFFSET_MASK) | (refcount_is_one as u64 * CLUSTER_USED_FLAG) -} - trait BackingFileOps: Send + Seek + Read { fn read_at(&mut self, address: u64, buf: &mut [u8]) -> std::io::Result<()> { self.seek(SeekFrom::Start(address))?; @@ -2008,16 +1958,6 @@ impl BlockBackend for QcowFile { } } -// Ceiling of the division of `dividend`/`divisor`. -fn div_round_up_u64(dividend: u64, divisor: u64) -> u64 { - dividend / divisor + u64::from(!dividend.is_multiple_of(divisor)) -} - -// Ceiling of the division of `dividend`/`divisor`. 
-fn div_round_up_u32(dividend: u32, divisor: u32) -> u32 { - dividend / divisor + u32::from(!dividend.is_multiple_of(divisor)) -} - fn convert_copy(reader: &mut R, writer: &mut W, offset: u64, size: u64) -> Result<()> where R: Read + Seek, @@ -2153,6 +2093,7 @@ mod unit_tests { use vmm_sys_util::tempfile::TempFile; use vmm_sys_util::write_zeroes::WriteZeroes; + use super::util::{COMPRESSED_FLAG, ZERO_FLAG}; use super::*; fn valid_header_v3() -> Vec { diff --git a/block/src/qcow/util.rs b/block/src/qcow/util.rs new file mode 100644 index 0000000000..bc8d017725 --- /dev/null +++ b/block/src/qcow/util.rs @@ -0,0 +1,79 @@ +// Copyright 2018 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. +// +// Copyright 2026 The Cloud Hypervisor Authors. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +//! Pure helper functions and constants for QCOW2 L1/L2 table entry +//! manipulation and integer arithmetic. Shared across the `qcow` submodules. + +/// Nesting depth limit for disk formats that can open other disk files. +pub(crate) const MAX_NESTING_DEPTH: u32 = 10; + +// bits 0-8 and 56-63 are reserved. +pub(super) const L1_TABLE_OFFSET_MASK: u64 = 0x00ff_ffff_ffff_fe00; +pub(super) const L2_TABLE_OFFSET_MASK: u64 = 0x00ff_ffff_ffff_fe00; +// Flags +pub(super) const ZERO_FLAG: u64 = 1 << 0; +pub(super) const COMPRESSED_FLAG: u64 = 1 << 62; +pub(super) const COMPRESSED_SECTOR_SIZE: u64 = 512; +pub(super) const CLUSTER_USED_FLAG: u64 = 1 << 63; + +/// Check if L2 entry is empty (unallocated). +pub(super) fn l2_entry_is_empty(l2_entry: u64) -> bool { + l2_entry == 0 +} + +/// Check bit 0 - only valid for standard clusters. +pub(super) fn l2_entry_is_zero(l2_entry: u64) -> bool { + l2_entry & ZERO_FLAG != 0 +} + +/// Check if L2 entry refers to a compressed cluster. 
+pub(super) fn l2_entry_is_compressed(l2_entry: u64) -> bool { + l2_entry & COMPRESSED_FLAG != 0 +} + +/// Get file offset and size of compressed cluster data. +pub(super) fn l2_entry_compressed_cluster_layout(l2_entry: u64, cluster_bits: u32) -> (u64, usize) { + let compressed_size_shift = 62 - (cluster_bits - 8); + let compressed_size_mask = (1 << (cluster_bits - 8)) - 1; + let compressed_cluster_addr = l2_entry & ((1 << compressed_size_shift) - 1); + let nsectors = (l2_entry >> compressed_size_shift & compressed_size_mask) + 1; + let compressed_cluster_size = ((nsectors * COMPRESSED_SECTOR_SIZE) + - (compressed_cluster_addr & (COMPRESSED_SECTOR_SIZE - 1))) + as usize; + (compressed_cluster_addr, compressed_cluster_size) +} + +/// Get file offset of standard (non-compressed) cluster. +pub(super) fn l2_entry_std_cluster_addr(l2_entry: u64) -> u64 { + l2_entry & L2_TABLE_OFFSET_MASK +} + +/// Make L2 entry for standard (non-compressed) cluster. +pub(super) fn l2_entry_make_std(cluster_addr: u64) -> u64 { + (cluster_addr & L2_TABLE_OFFSET_MASK) | CLUSTER_USED_FLAG +} + +/// Make L2 entry for preallocated zero cluster. +pub(super) fn l2_entry_make_zero(cluster_addr: u64) -> u64 { + (cluster_addr & L2_TABLE_OFFSET_MASK) | CLUSTER_USED_FLAG | ZERO_FLAG +} + +/// Make L1 entry with optional flags. +pub(super) fn l1_entry_make(cluster_addr: u64, refcount_is_one: bool) -> u64 { + (cluster_addr & L1_TABLE_OFFSET_MASK) | (refcount_is_one as u64 * CLUSTER_USED_FLAG) +} + +/// Ceiling of the division of `dividend`/`divisor`. +pub(super) fn div_round_up_u32(dividend: u32, divisor: u32) -> u32 { + dividend / divisor + u32::from(!dividend.is_multiple_of(divisor)) +} + +/// Ceiling of the division of `dividend`/`divisor`. 
+pub(super) fn div_round_up_u64(dividend: u64, divisor: u64) -> u64 { + dividend / divisor + u64::from(!dividend.is_multiple_of(divisor)) +} From 9d686b08667e7410aba0a9259f329e8e759e5bda Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Sat, 21 Feb 2026 16:18:10 +0100 Subject: [PATCH 087/742] block: qcow: Add QcowMetadata with RwLock Introduce QcowMetadata, a thread safe wrapper around QCOW2 metadata tables and caches using RwLock. Provides cluster resolution for reads and writes, and deallocate operations for discard. Extract parse_qcow() from QcowFile so both QcowFile and QcowDiskSync can share the parsing and validation logic. Signed-off-by: Anatol Belski --- block/src/qcow/metadata.rs | 924 +++++++++++++++++++++++++++++++++++++ block/src/qcow/mod.rs | 425 +++++++++-------- 2 files changed, 1146 insertions(+), 203 deletions(-) create mode 100644 block/src/qcow/metadata.rs diff --git a/block/src/qcow/metadata.rs b/block/src/qcow/metadata.rs new file mode 100644 index 0000000000..88077236ca --- /dev/null +++ b/block/src/qcow/metadata.rs @@ -0,0 +1,924 @@ +// Copyright 2018 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. +// +// Copyright 2026 The Cloud Hypervisor Authors. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +//! QCOW2 metadata with lock based synchronization. +//! +//! QcowMetadata wraps the in memory QCOW2 metadata tables behind a single +//! coarse RwLock. This separates metadata lookup from data I/O, allowing +//! data reads and writes to proceed without holding the metadata lock. +//! +//! On L2 cache hit, map_clusters_for_read only needs a read lock with +//! pure shared reference access on the cache. Cache misses and all write +//! operations upgrade to a write lock. 
+ +use std::cmp::min; +use std::io::{self, Seek}; +use std::mem; +use std::sync::RwLock; + +use libc::{EINVAL, EIO}; + +use super::qcow_raw_file::QcowRawFile; +use super::refcount::RefCount; +use super::util::{ + l2_entry_compressed_cluster_layout, l2_entry_is_compressed, l2_entry_is_empty, + l2_entry_is_zero, l2_entry_make_std, l2_entry_make_zero, l2_entry_std_cluster_addr, +}; +use super::vec_cache::{CacheMap, Cacheable, VecCache}; +use super::{QcowHeader, refcount}; + +/// Describes how to satisfy a guest read for a single cluster region. +/// +/// Returned by QcowMetadata::map_clusters_for_read. The caller performs +/// the actual data I/O using its own per queue file descriptor without +/// holding the metadata lock. +#[derive(Debug)] +pub enum ClusterReadMapping { + /// The cluster is not allocated and the guest should see zeros. + /// This covers both truly unallocated clusters where the L1 or L2 + /// entry is zero and clusters with the ZERO flag set. + Zero { length: u64 }, + + /// The cluster is allocated at the given host file offset. + /// The offset is the exact byte position combining cluster base and + /// intra cluster offset. The length is the number of bytes to read, + /// bounded by cluster boundary and guest request. + Allocated { offset: u64, length: u64 }, + + /// The cluster is compressed. The decompressed data is returned inline + /// because decompression is a CPU only operation that was done under the + /// write lock to access the raw compressed bytes from disk. + /// + /// The data field contains exactly the bytes the guest requested, already + /// sliced from the decompressed cluster. + Compressed { data: Vec }, + + /// The cluster is not allocated in this layer but may exist in a backing + /// file. The caller should delegate to the backing file at the given + /// guest offset for the specified length in bytes. + Backing { offset: u64, length: u64 }, +} + +/// Describes how to satisfy a guest write for a single cluster region. 
+/// +/// Returned by QcowMetadata::map_cluster_for_write. The caller performs +/// the actual data I/O using its own per queue file descriptor without +/// holding the metadata lock. +#[derive(Debug)] +pub enum ClusterWriteMapping { + /// The write target is at the given host file offset. + /// This covers both already allocated clusters and freshly allocated ones. + /// The offset is the exact byte position combining cluster base and + /// intra cluster offset. + Allocated { offset: u64 }, +} + +/// Trait for reading from a backing file in a thread safe manner. +/// +/// Used by QcowMetadata::deallocate_bytes so it can read COW data +/// from the backing file without knowing the concrete backing type. +pub(crate) trait BackingRead: Send + Sync { + fn read_at(&self, address: u64, buf: &mut [u8]) -> io::Result<()>; +} + +/// Action that the caller must perform after deallocate_bytes. +#[derive(Debug)] +pub enum DeallocAction { + /// Punch a hole at the given host file offset for a full cluster. + PunchHole { host_offset: u64, length: u64 }, + /// Write zeros at the given host file offset for a partial cluster. + WriteZeroes { host_offset: u64, length: usize }, +} + +/// Shared QCOW2 metadata protected by a coarse RwLock. +/// +/// Holds the L1 table, L2 cache and refcount state in memory. L2 table +/// entries and refcount blocks are read from disk on cache miss and +/// written back on eviction or when dirty. +/// +/// One instance is shared via Arc across all virtio blk queues. Each +/// queue holds its own QcowRawFile clone for data I/O. +/// +/// Steady state guest I/O is read dominant at the metadata level. Every +/// read and every write to an already allocated cluster only needs an +/// L1 to L2 lookup, which completes under a shared read lock. Only +/// cluster allocation, L2 cache eviction and resize take the exclusive +/// write lock, so contention stays low and queues scale. 
+pub struct QcowMetadata { + inner: RwLock, +} + +/// The actual metadata state, accessible only through the RwLock. +pub(crate) struct QcowState { + pub(crate) header: QcowHeader, + pub(crate) l1_table: VecCache, + pub(crate) l2_entries: u64, + pub(crate) l2_cache: CacheMap>, + pub(crate) refcounts: RefCount, + pub(crate) avail_clusters: Vec, + pub(crate) unref_clusters: Vec, + /// Dedicated file descriptor for metadata I/O covering L2 table reads, + /// refcount block reads and dirty eviction writes. This is a dup clone + /// of the original fd, separate from the per queue data I/O fds. + pub(crate) raw_file: QcowRawFile, +} + +impl QcowMetadata { + pub(super) fn new(inner: QcowState) -> Self { + QcowMetadata { + inner: RwLock::new(inner), + } + } + + /// Maps a multicluster guest read range to a list of read mappings. + /// + /// This walks the range in cluster sized steps under a single lock + /// acquisition, reducing lock roundtrips for large reads. The returned + /// mappings are ordered by guest address and ready for io_uring + /// submission. The caller can coalesce adjacent allocated entries into + /// fewer submissions. + /// + /// On the read lock fast path, if all L2 tables are cached, the lookup + /// is pure memory access with no I/O and concurrent readers are allowed. + /// + /// On the write lock slow path, if an L2 cache miss occurs, the L2 + /// table is read from disk via the metadata fd, the cache is populated + /// and the mapping is returned. + /// + /// The has_backing_file flag indicates whether a backing file exists, + /// needed to distinguish zero versus backing for unallocated clusters. 
+ pub fn map_clusters_for_read( + &self, + address: u64, + total_length: usize, + has_backing_file: bool, + ) -> io::Result> { + let inner = self.inner.read().unwrap(); + let cluster_size = inner.raw_file.cluster_size(); + let mut mappings = Vec::new(); + let mut mapped = 0usize; + let mut need_write_lock = false; + + // Fast path, try all chunks under read lock + while mapped < total_length { + let curr_addr = address + mapped as u64; + let offset_in_cluster = inner.raw_file.cluster_offset(curr_addr) as usize; + let count = min( + total_length - mapped, + cluster_size as usize - offset_in_cluster, + ); + + match inner.try_map_read(curr_addr, count, has_backing_file)? { + Some(mapping) => mappings.push(mapping), + None => { + need_write_lock = true; + break; + } + } + mapped += count; + } + + if !need_write_lock { + return Ok(mappings); + } + + // Slow path, drop read lock, take write lock, redo from where we stopped + drop(inner); + let mut inner = self.inner.write().unwrap(); + + // Remap everything under write lock for consistency since the L2 cache + // may have been evicted between the read to write lock transition. + mappings.clear(); + mapped = 0; + + while mapped < total_length { + let curr_addr = address + mapped as u64; + let offset_in_cluster = inner.raw_file.cluster_offset(curr_addr) as usize; + let count = min( + total_length - mapped, + cluster_size as usize - offset_in_cluster, + ); + + mappings.push(inner.map_read_with_populate(curr_addr, count, has_backing_file)?); + mapped += count; + } + + Ok(mappings) + } + + /// Maps a guest write address to a write mapping. + /// + /// Always takes a write lock since writes may need to allocate clusters, + /// update L2 entries and update refcounts. + /// + /// The backing_data parameter is the COW source. If the cluster is + /// unallocated and a backing file exists, the caller should have already + /// read the backing cluster data and pass it here. If None, the new + /// cluster is zeroed. 
+ pub fn map_cluster_for_write( + &self, + address: u64, + backing_data: Option>, + ) -> io::Result { + let mut inner = self.inner.write().unwrap(); + inner.map_write(address, backing_data) + } + + pub fn flush(&self) -> io::Result<()> { + let mut inner = self.inner.write().unwrap(); + inner.sync_caches()?; + let mut unref = mem::take(&mut inner.unref_clusters); + inner.avail_clusters.append(&mut unref); + Ok(()) + } + + /// Deallocates a range of bytes. Full clusters are deallocated via metadata. + /// Partial clusters need the caller to write zeros. This method returns a + /// list of actions the caller should take. + pub(crate) fn deallocate_bytes( + &self, + address: u64, + length: usize, + sparse: bool, + virtual_size: u64, + cluster_size: u64, + backing_file: Option<&dyn BackingRead>, + ) -> io::Result> { + let mut inner = self.inner.write().unwrap(); + let mut actions = Vec::new(); + + let file_end = virtual_size; + let remaining_in_file = file_end.saturating_sub(address); + let write_count = min(length as u64, remaining_in_file) as usize; + + let mut nwritten = 0usize; + while nwritten < write_count { + let curr_addr = address + nwritten as u64; + let offset_in_cluster = inner.raw_file.cluster_offset(curr_addr) as usize; + let count = min( + write_count - nwritten, + cluster_size as usize - offset_in_cluster, + ); + + if count == cluster_size as usize { + let punch_offset = inner.deallocate_cluster(curr_addr, sparse)?; + if let Some(host_offset) = punch_offset { + actions.push(DeallocAction::PunchHole { + host_offset, + length: cluster_size, + }); + } + } else { + // Partial cluster - COW from backing to preserve non zeroed bytes, + // then the caller writes zeros to the partial range. 
+ let backing_data = if let Some(backing) = backing_file { + let cluster_begin = curr_addr - offset_in_cluster as u64; + let mut data = vec![0u8; cluster_size as usize]; + backing.read_at(cluster_begin, &mut data)?; + Some(data) + } else { + None + }; + let mapping = inner.map_write(curr_addr, backing_data)?; + let ClusterWriteMapping::Allocated { offset } = mapping; + actions.push(DeallocAction::WriteZeroes { + host_offset: offset, + length: count, + }); + } + + nwritten += count; + } + Ok(actions) + } + + pub fn virtual_size(&self) -> u64 { + self.inner.read().unwrap().header.size + } + + pub fn cluster_size(&self) -> u64 { + self.inner.read().unwrap().raw_file.cluster_size() + } + + /// Returns the intra cluster byte offset for a given guest address. + pub fn cluster_offset(&self, address: u64) -> u64 { + self.inner.read().unwrap().raw_file.cluster_offset(address) + } +} + +impl QcowState { + /// Fast path read mapping under read lock only. Returns None on cache + /// miss. + /// + /// All access here is through shared reference. CacheMap::get, + /// VecCache::get and index operations are all shared reference compatible. 
+ fn try_map_read( + &self, + address: u64, + count: usize, + has_backing_file: bool, + ) -> io::Result> { + if address >= self.header.size { + return Err(io::Error::from_raw_os_error(EINVAL)); + } + + let l1_index = self.l1_table_index(address) as usize; + let l2_addr_disk = match self.l1_table.get(l1_index) { + Some(&addr) => addr, + None => return Err(io::Error::from_raw_os_error(EINVAL)), + }; + + if l2_addr_disk == 0 { + return Ok(Some(self.unallocated_read_mapping( + address, + count, + has_backing_file, + ))); + } + + let l2_table = match self.l2_cache.get(l1_index) { + Some(table) => table, + None => return Ok(None), // cache miss, need write lock + }; + + let l2_index = self.l2_table_index(address) as usize; + let l2_entry = l2_table[l2_index]; + + // Compressed entries require disk I/O for decompression - can't do + // that under a read lock. Fall through to the write lock path. + if l2_entry_is_compressed(l2_entry) { + return Ok(None); + } + + if l2_entry_is_empty(l2_entry) { + Ok(Some(self.unallocated_read_mapping( + address, + count, + has_backing_file, + ))) + } else if l2_entry_is_zero(l2_entry) { + // Match original QcowFile::file_read semantics where zero flagged + // entries fall through to backing file when one exists or return + // zeros otherwise. + Ok(Some(self.unallocated_read_mapping( + address, + count, + has_backing_file, + ))) + } else { + let cluster_addr = l2_entry_std_cluster_addr(l2_entry); + let cluster_size = self.raw_file.cluster_size(); + if cluster_addr & (cluster_size - 1) != 0 { + // Fall through to write lock path which sets the corrupt bit + return Ok(None); + } + let intra_offset = self.raw_file.cluster_offset(address); + Ok(Some(ClusterReadMapping::Allocated { + offset: cluster_addr + intra_offset, + length: count as u64, + })) + } + } + + /// Slow path read mapping. Requires exclusive access to populate cache. 
+ fn map_read_with_populate( + &mut self, + address: u64, + count: usize, + has_backing_file: bool, + ) -> io::Result { + if address >= self.header.size { + return Err(io::Error::from_raw_os_error(EINVAL)); + } + + let l1_index = self.l1_table_index(address) as usize; + let l2_addr_disk = match self.l1_table.get(l1_index) { + Some(&addr) => addr, + None => return Err(io::Error::from_raw_os_error(EINVAL)), + }; + + if l2_addr_disk == 0 { + return Ok(self.unallocated_read_mapping(address, count, has_backing_file)); + } + + // Populate cache if needed as this does I/O via the metadata raw file + self.cache_l2_cluster(l1_index, l2_addr_disk)?; + + let l2_index = self.l2_table_index(address) as usize; + let l2_entry = self.l2_cache.get(l1_index).unwrap()[l2_index]; + + if l2_entry_is_empty(l2_entry) { + Ok(self.unallocated_read_mapping(address, count, has_backing_file)) + } else if l2_entry_is_compressed(l2_entry) { + // Under write lock we can do I/O for decompression + let decompressed = self.decompress_l2_cluster(l2_entry)?; + let start = self.raw_file.cluster_offset(address) as usize; + let end = start + .checked_add(count) + .ok_or_else(|| io::Error::from_raw_os_error(EINVAL))?; + if end > decompressed.len() { + return Err(io::Error::from_raw_os_error(EINVAL)); + } + Ok(ClusterReadMapping::Compressed { + data: decompressed[start..end].to_vec(), + }) + } else if l2_entry_is_zero(l2_entry) { + // Match original QcowFile::file_read semantics where zero flagged + // entries fall through to backing file when one exists or return + // zeros otherwise. 
+ Ok(self.unallocated_read_mapping(address, count, has_backing_file)) + } else { + let cluster_addr = l2_entry_std_cluster_addr(l2_entry); + let cluster_size = self.raw_file.cluster_size(); + if cluster_addr & (cluster_size - 1) != 0 { + self.set_corrupt_bit_best_effort(); + return Err(io::Error::from_raw_os_error(EIO)); + } + let intra_offset = self.raw_file.cluster_offset(address); + Ok(ClusterReadMapping::Allocated { + offset: cluster_addr + intra_offset, + length: count as u64, + }) + } + } + + fn unallocated_read_mapping( + &self, + address: u64, + count: usize, + has_backing_file: bool, + ) -> ClusterReadMapping { + if has_backing_file { + ClusterReadMapping::Backing { + offset: address, + length: count as u64, + } + } else { + ClusterReadMapping::Zero { + length: count as u64, + } + } + } + + /// Write path mapping. Always called under write lock. + fn map_write( + &mut self, + address: u64, + backing_data: Option>, + ) -> io::Result { + if address >= self.header.size { + return Err(io::Error::from_raw_os_error(EINVAL)); + } + + let l1_index = self.l1_table_index(address) as usize; + let l2_addr_disk = match self.l1_table.get(l1_index) { + Some(&addr) => addr, + None => return Err(io::Error::from_raw_os_error(EINVAL)), + }; + let l2_index = self.l2_table_index(address) as usize; + + let mut set_refcounts = Vec::new(); + + if let Some(new_addr) = self.cache_l2_cluster_alloc(l1_index, l2_addr_disk)? 
{ + set_refcounts.push((new_addr, 1)); + } + + let l2_entry = self.l2_cache.get(l1_index).unwrap()[l2_index]; + let cluster_addr = if l2_entry_is_compressed(l2_entry) { + let decompressed_cluster = self.decompress_l2_cluster(l2_entry)?; + let cluster_addr = self.append_data_cluster(None)?; + self.update_cluster_addr(l1_index, l2_index, cluster_addr, &mut set_refcounts)?; + self.raw_file + .file_mut() + .seek(io::SeekFrom::Start(cluster_addr))?; + let nwritten = io::Write::write(self.raw_file.file_mut(), &decompressed_cluster)?; + if nwritten != decompressed_cluster.len() { + self.set_corrupt_bit_best_effort(); + return Err(io::Error::from_raw_os_error(EIO)); + } + self.deallocate_compressed_cluster(l2_entry)?; + cluster_addr + } else if l2_entry_is_empty(l2_entry) || l2_entry_is_zero(l2_entry) { + let cluster_addr = self.append_data_cluster(backing_data)?; + self.update_cluster_addr(l1_index, l2_index, cluster_addr, &mut set_refcounts)?; + cluster_addr + } else { + // Already allocated - validate alignment + let cluster_addr = l2_entry_std_cluster_addr(l2_entry); + if cluster_addr & (self.raw_file.cluster_size() - 1) != 0 { + self.set_corrupt_bit_best_effort(); + return Err(io::Error::from_raw_os_error(EIO)); + } + cluster_addr + }; + + // Apply deferred refcount updates + for (addr, refcount) in set_refcounts { + self.set_cluster_refcount_track_freed(addr, refcount)?; + } + + let intra_offset = self.raw_file.cluster_offset(address); + Ok(ClusterWriteMapping::Allocated { + offset: cluster_addr + intra_offset, + }) + } + + // -- Address computation helpers -- + + fn l1_table_index(&self, address: u64) -> u64 { + (address / self.raw_file.cluster_size()) / self.l2_entries + } + + fn l2_table_index(&self, address: u64) -> u64 { + (address / self.raw_file.cluster_size()) % self.l2_entries + } + + // -- Cache and allocation operations requiring exclusive access -- + + /// Populates the L2 cache for read operations without allocation. 
+ fn cache_l2_cluster(&mut self, l1_index: usize, l2_addr_disk: u64) -> io::Result<()> { + if !self.l2_cache.contains_key(l1_index) { + let cluster_size = self.raw_file.cluster_size(); + if l2_addr_disk & (cluster_size - 1) != 0 { + self.set_corrupt_bit_best_effort(); + return Err(io::Error::from_raw_os_error(EIO)); + } + let l2_table = + VecCache::from_vec(self.raw_file.read_pointer_cluster(l2_addr_disk, None)?); + let l1_table = &self.l1_table; + let raw_file = &mut self.raw_file; + self.l2_cache.insert(l1_index, l2_table, |index, evicted| { + raw_file.write_pointer_table_direct(l1_table[index], evicted.iter()) + })?; + } + Ok(()) + } + + /// Populates the L2 cache for write operations and may allocate a new + /// L2 table. Returns the address of the newly allocated cluster if any. + fn cache_l2_cluster_alloc( + &mut self, + l1_index: usize, + l2_addr_disk: u64, + ) -> io::Result> { + let mut new_cluster: Option = None; + if !self.l2_cache.contains_key(l1_index) { + let l2_table = if l2_addr_disk == 0 { + // Allocate a new cluster to store the L2 table + let new_addr = self.get_new_cluster(None)?; + new_cluster = Some(new_addr); + self.l1_table[l1_index] = new_addr; + VecCache::new(self.l2_entries as usize) + } else { + let cluster_size = self.raw_file.cluster_size(); + if l2_addr_disk & (cluster_size - 1) != 0 { + self.set_corrupt_bit_best_effort(); + return Err(io::Error::from_raw_os_error(EIO)); + } + VecCache::from_vec(self.raw_file.read_pointer_cluster(l2_addr_disk, None)?) + }; + let l1_table = &self.l1_table; + let raw_file = &mut self.raw_file; + self.l2_cache.insert(l1_index, l2_table, |index, evicted| { + raw_file.write_pointer_table_direct(l1_table[index], evicted.iter()) + })?; + } + Ok(new_cluster) + } + + /// Allocates a new cluster from the free list or by extending the file. 
+ fn get_new_cluster(&mut self, initial_data: Option>) -> io::Result { + if let Some(free_cluster) = self.avail_clusters.pop() { + if free_cluster == 0 { + self.set_corrupt_bit_best_effort(); + return Err(io::Error::from_raw_os_error(EIO)); + } + if let Some(initial_data) = initial_data { + self.raw_file.write_cluster(free_cluster, &initial_data)?; + } else { + self.raw_file.zero_cluster(free_cluster)?; + } + return Ok(free_cluster); + } + + let max_valid = self.refcounts.max_valid_cluster_offset(); + if let Some(new_cluster) = self.raw_file.add_cluster_end(max_valid)? { + if new_cluster == 0 { + self.set_corrupt_bit_best_effort(); + return Err(io::Error::from_raw_os_error(EIO)); + } + if let Some(initial_data) = initial_data { + self.raw_file.write_cluster(new_cluster, &initial_data)?; + } + Ok(new_cluster) + } else { + log::error!("No free clusters in get_new_cluster()"); + Err(io::Error::from_raw_os_error(libc::ENOSPC)) + } + } + + /// Allocates a data cluster and sets its refcount to 1. + fn append_data_cluster(&mut self, initial_data: Option>) -> io::Result { + let new_addr = self.get_new_cluster(initial_data)?; + self.set_cluster_refcount_track_freed(new_addr, 1)?; + Ok(new_addr) + } + + /// Updates the L1 and L2 tables to point to a new cluster address. + fn update_cluster_addr( + &mut self, + l1_index: usize, + l2_index: usize, + cluster_addr: u64, + set_refcounts: &mut Vec<(u64, u64)>, + ) -> io::Result<()> { + if !self.l2_cache.get(l1_index).unwrap().dirty() { + // Free the previously used cluster if one exists. Modified tables are always + // written to new clusters so the L1 table can be committed to disk after they + // are and L1 never points at an invalid table. + let addr = self.l1_table[l1_index]; + if addr != 0 { + self.unref_clusters.push(addr); + set_refcounts.push((addr, 0)); + } + + // Allocate a new cluster to store the L2 table and update the L1 table to point + // to the new table. The cluster will be written when the cache is flushed. 
+ let new_addr = self.get_new_cluster(None)?; + set_refcounts.push((new_addr, 1)); + self.l1_table[l1_index] = new_addr; // marks l1_table dirty via IndexMut + } + // Write the L2 entry - IndexMut marks the L2 table dirty automatically. + self.l2_cache.get_mut(l1_index).unwrap()[l2_index] = l2_entry_make_std(cluster_addr); + Ok(()) + } + + /// Deallocates a cluster at the given guest address. + /// + /// If sparse is true, fully deallocates and returns the host offset if + /// the underlying storage should be punched after the refcount dropped + /// to zero. If sparse is false, uses the zero flag optimization when + /// possible. + /// + /// Returns None if no host punch_hole is needed. + pub(super) fn deallocate_cluster( + &mut self, + address: u64, + sparse: bool, + ) -> io::Result> { + if address >= self.header.size { + return Err(io::Error::from_raw_os_error(EINVAL)); + } + + let l1_index = self.l1_table_index(address) as usize; + let l2_addr_disk = match self.l1_table.get(l1_index) { + Some(&addr) => addr, + None => return Err(io::Error::from_raw_os_error(EINVAL)), + }; + let l2_index = self.l2_table_index(address) as usize; + + if l2_addr_disk == 0 { + return Ok(None); + } + + self.cache_l2_cluster(l1_index, l2_addr_disk)?; + + let l2_entry = self.l2_cache.get(l1_index).unwrap()[l2_index]; + if l2_entry_is_empty(l2_entry) || l2_entry_is_zero(l2_entry) { + return Ok(None); + } + + if l2_entry_is_compressed(l2_entry) { + self.deallocate_compressed_cluster(l2_entry)?; + self.l2_cache.get_mut(l1_index).unwrap()[l2_index] = 0; + return Ok(None); + } + + let cluster_addr = l2_entry_std_cluster_addr(l2_entry); + let refcount = self + .refcounts + .get_cluster_refcount(&mut self.raw_file, cluster_addr) + .map_err(|e| { + if matches!(e, refcount::Error::RefblockUnaligned(_)) { + self.set_corrupt_bit_best_effort(); + } + io::Error::new( + io::ErrorKind::InvalidData, + format!("failed to get cluster refcount: {e}"), + ) + })?; + if refcount == 0 { + return 
Err(io::Error::from_raw_os_error(EINVAL)); + } + + if sparse { + let new_refcount = refcount - 1; + self.set_cluster_refcount_track_freed(cluster_addr, new_refcount)?; + self.l2_cache.get_mut(l1_index).unwrap()[l2_index] = 0; + if new_refcount == 0 { + self.unref_clusters.push(cluster_addr); + return Ok(Some(cluster_addr)); + } + } else if refcount == 1 { + self.l2_cache.get_mut(l1_index).unwrap()[l2_index] = l2_entry_make_zero(cluster_addr); + } else { + self.set_cluster_refcount_track_freed(cluster_addr, refcount - 1)?; + self.l2_cache.get_mut(l1_index).unwrap()[l2_index] = 0; + } + Ok(None) + } + + /// Sets refcount for a cluster, tracking any newly freed clusters. + fn set_cluster_refcount_track_freed(&mut self, address: u64, refcount: u64) -> io::Result<()> { + let mut newly_unref = self.set_cluster_refcount(address, refcount)?; + self.unref_clusters.append(&mut newly_unref); + Ok(()) + } + + /// Sets the refcount for a cluster. Returns freed cluster addresses. + fn set_cluster_refcount(&mut self, address: u64, refcount: u64) -> io::Result> { + let mut added_clusters = Vec::new(); + let mut unref_clusters = Vec::new(); + let mut refcount_set = false; + let mut new_cluster = None; + + while !refcount_set { + match self.refcounts.set_cluster_refcount( + &mut self.raw_file, + address, + refcount, + new_cluster.take(), + ) { + Ok(None) => { + refcount_set = true; + } + Ok(Some(freed_cluster)) => { + let mut freed = self.set_cluster_refcount(freed_cluster, 0)?; + unref_clusters.append(&mut freed); + refcount_set = true; + } + Err(refcount::Error::EvictingRefCounts(e)) => { + return Err(e); + } + Err(refcount::Error::InvalidIndex) => { + self.set_corrupt_bit_best_effort(); + return Err(io::Error::from_raw_os_error(EINVAL)); + } + Err(refcount::Error::NeedCluster(addr)) => { + new_cluster = Some(( + addr, + VecCache::from_vec(self.raw_file.read_refcount_block(addr)?), + )); + } + Err(refcount::Error::NeedNewCluster) => { + let addr = self.get_new_cluster(None)?; + 
added_clusters.push(addr); + new_cluster = Some(( + addr, + VecCache::new(self.refcounts.refcounts_per_block() as usize), + )); + } + Err(refcount::Error::ReadingRefCounts(e)) => { + return Err(e); + } + Err(refcount::Error::RefcountOverflow { .. }) => { + return Err(io::Error::from_raw_os_error(EINVAL)); + } + Err(refcount::Error::RefblockUnaligned(_)) => { + self.set_corrupt_bit_best_effort(); + return Err(io::Error::from_raw_os_error(EIO)); + } + } + } + + for addr in added_clusters { + self.set_cluster_refcount(addr, 1)?; + } + Ok(unref_clusters) + } + + /// Flushes all dirty metadata to disk. + pub(super) fn sync_caches(&mut self) -> io::Result<()> { + use super::l1_entry_make; + + // Write out all dirty L2 tables. + for (l1_index, l2_table) in self.l2_cache.iter_mut().filter(|(_k, v)| v.dirty()) { + let addr = self.l1_table[*l1_index]; + if addr != 0 { + self.raw_file + .write_pointer_table_direct(addr, l2_table.iter())?; + } else { + self.set_corrupt_bit_best_effort(); + return Err(io::Error::from_raw_os_error(EINVAL)); + } + l2_table.mark_clean(); + } + // Write the modified refcount blocks. + self.refcounts.flush_blocks(&mut self.raw_file)?; + // Sync metadata and data clusters. + self.raw_file.file_mut().sync_all()?; + + // Push L1 table and refcount table last. 
+ let mut sync_required = if self.l1_table.dirty() { + let refcounts = &mut self.refcounts; + self.raw_file.write_pointer_table( + self.header.l1_table_offset, + self.l1_table.iter(), + |raw_file, l2_addr| { + if l2_addr == 0 { + Ok(0) + } else { + let refcount = refcounts + .get_cluster_refcount(raw_file, l2_addr) + .map_err(|e| io::Error::other(super::Error::GettingRefcount(e)))?; + Ok(l1_entry_make(l2_addr, refcount == 1)) + } + }, + )?; + self.l1_table.mark_clean(); + true + } else { + false + }; + sync_required |= self.refcounts.flush_table(&mut self.raw_file)?; + if sync_required { + self.raw_file.file_mut().sync_data()?; + } + + Ok(()) + } + + /// Decompresses a compressed cluster, returning the raw decompressed bytes. + fn decompress_l2_cluster(&mut self, l2_entry: u64) -> io::Result> { + let (compressed_addr, compressed_size) = + l2_entry_compressed_cluster_layout(l2_entry, self.header.cluster_bits); + self.raw_file + .file_mut() + .seek(io::SeekFrom::Start(compressed_addr))?; + let mut compressed = vec![0u8; compressed_size]; + io::Read::read_exact(self.raw_file.file_mut(), &mut compressed)?; + let decoder = self.header.get_decoder(); + let cluster_size = self.raw_file.cluster_size() as usize; + let mut decompressed = vec![0u8; cluster_size]; + let decompressed_size = decoder + .decode(&compressed, &mut decompressed) + .map_err(|_| { + self.set_corrupt_bit_best_effort(); + io::Error::from_raw_os_error(EIO) + })?; + if decompressed_size as u64 != self.raw_file.cluster_size() { + self.set_corrupt_bit_best_effort(); + return Err(io::Error::from_raw_os_error(EIO)); + } + Ok(decompressed) + } + + /// Deallocates the clusters spanned by a compressed L2 entry. 
+ fn deallocate_compressed_cluster(&mut self, l2_entry: u64) -> io::Result<()> { + let (compressed_addr, compressed_size) = + l2_entry_compressed_cluster_layout(l2_entry, self.header.cluster_bits); + let cluster_size = self.raw_file.cluster_size(); + + // Calculate the end of the compressed data region + let compressed_clusters_end = self.raw_file.cluster_address( + compressed_addr // Start of compressed data + + compressed_size as u64 // Add size to get end address + + cluster_size + - 1, // Catch possibly partially used last cluster + ); + + // Decrement refcount for each cluster spanned by the compressed data + let mut addr = self.raw_file.cluster_address(compressed_addr); + while addr < compressed_clusters_end { + let refcount = self + .refcounts + .get_cluster_refcount(&mut self.raw_file, addr) + .map_err(|e| { + if matches!(e, refcount::Error::RefblockUnaligned(_)) { + self.set_corrupt_bit_best_effort(); + } + io::Error::new( + io::ErrorKind::InvalidData, + format!("failed to get cluster refcount: {e}"), + ) + })?; + if refcount > 0 { + self.set_cluster_refcount_track_freed(addr, refcount - 1)?; + } + addr += cluster_size; + } + Ok(()) + } + + /// Best effort attempt to mark the image corrupt. + fn set_corrupt_bit_best_effort(&mut self) { + if let Err(e) = self.header.set_corrupt_bit(self.raw_file.file_mut()) { + log::warn!("Failed to persist corrupt bit: {e}"); + } + } +} diff --git a/block/src/qcow/mod.rs b/block/src/qcow/mod.rs index 83eb5a4bbc..06dce2c4ce 100644 --- a/block/src/qcow/mod.rs +++ b/block/src/qcow/mod.rs @@ -6,6 +6,7 @@ mod decoder; mod header; +pub(crate) mod metadata; mod qcow_raw_file; mod raw_file; mod refcount; @@ -182,7 +183,7 @@ impl BackingFileOps for RawFile { } /// Backing file wrapper -struct BackingFile { +pub(crate) struct BackingFile { inner: Box, virtual_size: u64, } @@ -276,6 +277,209 @@ impl Debug for BackingFile { } } +/// Parses and validates a QCOW2 image file, returning the metadata, backing +/// file and sparse flag. 
+/// +/// This shared constructor is used by both QcowFile for sequential I/O +/// and QcowDiskSync for lock based parallel I/O. +pub(crate) fn parse_qcow( + mut file: RawFile, + max_nesting_depth: u32, + sparse: bool, +) -> Result<(metadata::QcowState, Option, bool)> { + let mut header = QcowHeader::new(&mut file)?; + + // Only v2 and v3 files are supported. + if header.version != 2 && header.version != 3 { + return Err(Error::UnsupportedVersion(header.version)); + } + + // Make sure that the L1 table fits in RAM. + if u64::from(header.l1_size) > MAX_RAM_POINTER_TABLE_SIZE { + return Err(Error::InvalidL1TableSize(header.l1_size)); + } + + let cluster_bits: u32 = header.cluster_bits; + if !(MIN_CLUSTER_BITS..=MAX_CLUSTER_BITS).contains(&cluster_bits) { + return Err(Error::InvalidClusterSize); + } + let cluster_size = 0x01u64 << cluster_bits; + + // Limit the total size of the disk. + if header.size > MAX_QCOW_FILE_SIZE { + return Err(Error::FileTooBig(header.size)); + } + + let direct_io = file.is_direct(); + + let backing_file = BackingFile::new( + header.backing_file.as_ref(), + direct_io, + max_nesting_depth, + sparse, + )?; + + // Validate refcount order to be 0..6 + let refcount_bits: u64 = 0x01u64 + .checked_shl(header.refcount_order) + .ok_or(Error::UnsupportedRefcountOrder)?; + if refcount_bits > 64 { + return Err(Error::UnsupportedRefcountOrder); + } + + // Need at least one refcount cluster + if header.refcount_table_clusters == 0 { + return Err(Error::NoRefcountClusters); + } + offset_is_cluster_boundary(header.l1_table_offset, header.cluster_bits)?; + offset_is_cluster_boundary(header.snapshots_offset, header.cluster_bits)?; + // refcount table must be a cluster boundary, and within the file's virtual or actual size. 
+ offset_is_cluster_boundary(header.refcount_table_offset, header.cluster_bits)?; + let file_size = file.metadata().map_err(Error::GettingFileSize)?.len(); + if header.refcount_table_offset > max(file_size, header.size) { + return Err(Error::RefcountTableOffEnd); + } + + // The first cluster should always have a non-zero refcount, so if it is 0, + // this is an old file with broken refcounts, which requires a rebuild. + let mut refcount_rebuild_required = true; + file.seek(SeekFrom::Start(header.refcount_table_offset)) + .map_err(Error::SeekingFile)?; + let first_refblock_addr = u64::read_be(&mut file).map_err(Error::ReadingHeader)?; + if first_refblock_addr != 0 { + file.seek(SeekFrom::Start(first_refblock_addr)) + .map_err(Error::SeekingFile)?; + let first_cluster_refcount = u16::read_be(&mut file).map_err(Error::ReadingHeader)?; + if first_cluster_refcount != 0 { + refcount_rebuild_required = false; + } + } + + if (header.compatible_features & COMPATIBLE_FEATURES_LAZY_REFCOUNTS) != 0 { + refcount_rebuild_required = true; + } + + let mut raw_file = + QcowRawFile::from(file, cluster_size, refcount_bits).ok_or(Error::InvalidClusterSize)?; + let is_writable = raw_file.file().is_writable(); + + if header.is_corrupt() { + if is_writable { + return Err(Error::CorruptImage); + } + let path = read_link(format!("/proc/self/fd/{}", raw_file.file().as_raw_fd())) + .map_or_else(|_| "".to_string(), |p| p.display().to_string()); + warn!("QCOW2 image is marked corrupt, opening read-only: {path}"); + } + + // Image already has dirty bit set. Refcounts may be invalid. + if IncompatFeatures::from_bits_truncate(header.incompatible_features) + .contains(IncompatFeatures::DIRTY) + { + log::warn!("QCOW2 image not cleanly closed, rebuilding refcounts"); + refcount_rebuild_required = true; + } + + // Skip refcount rebuilding for readonly files. 
+ if refcount_rebuild_required && is_writable { + QcowFile::rebuild_refcounts(&mut raw_file, header.clone())?; + } + + let entries_per_cluster = cluster_size / size_of::() as u64; + let num_clusters = div_round_up_u64(header.size, cluster_size); + let num_l2_clusters = div_round_up_u64(num_clusters, entries_per_cluster); + let l1_clusters = div_round_up_u64(num_l2_clusters, entries_per_cluster); + let header_clusters = div_round_up_u64(size_of::() as u64, cluster_size); + if num_l2_clusters > MAX_RAM_POINTER_TABLE_SIZE { + return Err(Error::TooManyL1Entries(num_l2_clusters)); + } + let l1_table = VecCache::from_vec( + raw_file + .read_pointer_table( + header.l1_table_offset, + num_l2_clusters, + Some(L1_TABLE_OFFSET_MASK), + ) + .map_err(Error::ReadingHeader)?, + ); + + let num_clusters = div_round_up_u64(header.size, cluster_size); + let refcount_clusters = max_refcount_clusters( + header.refcount_order, + cluster_size as u32, + (num_clusters + l1_clusters + num_l2_clusters + header_clusters) as u32, + ); + // Check that the given header doesn't have a suspiciously sized refcount table. + if u64::from(header.refcount_table_clusters) > 2 * refcount_clusters { + return Err(Error::RefcountTableTooLarge); + } + if l1_clusters + refcount_clusters > MAX_RAM_POINTER_TABLE_SIZE { + return Err(Error::TooManyRefcounts(refcount_clusters)); + } + let refcount_block_entries = cluster_size * 8 / refcount_bits; + let mut refcounts = RefCount::new( + &mut raw_file, + header.refcount_table_offset, + refcount_clusters, + refcount_block_entries, + cluster_size, + refcount_bits, + ) + .map_err(Error::ReadingRefCounts)?; + + let l2_entries = cluster_size / size_of::() as u64; + + // Check that the L1 and refcount tables fit in a 64bit address space. 
+ let l1_index = (header.size / cluster_size) / l2_entries; + header + .l1_table_offset + .checked_add(l1_index * size_of::() as u64) + .ok_or(Error::InvalidL1TableOffset)?; + header + .refcount_table_offset + .checked_add(u64::from(header.refcount_table_clusters) * cluster_size) + .ok_or(Error::InvalidRefcountTableOffset)?; + + // Find available (refcount == 0) clusters for the free list. + let file_size = raw_file + .file_mut() + .metadata() + .map_err(Error::GettingFileSize)? + .len(); + let mut avail_clusters = Vec::new(); + for i in (0..file_size).step_by(cluster_size as usize) { + let refcount = refcounts + .get_cluster_refcount(&mut raw_file, i) + .map_err(Error::GettingRefcount)?; + if refcount == 0 { + avail_clusters.push(i); + } + } + + if is_writable { + if !IncompatFeatures::from_bits_truncate(header.incompatible_features) + .contains(IncompatFeatures::DIRTY) + { + header.set_dirty_bit(raw_file.file_mut(), true)?; + } + + header.clear_autoclear_features(raw_file.file_mut())?; + } + + let inner = metadata::QcowState { + raw_file, + header, + l1_table, + l2_entries, + l2_cache: CacheMap::new(100), + refcounts, + avail_clusters, + unref_clusters: Vec::new(), + }; + + Ok((inner, backing_file, sparse)) +} + /// Represents a qcow2 file. This is a sparse file format maintained by the qemu project. /// Full documentation of the format can be found in the qemu repository. /// @@ -321,190 +525,34 @@ impl QcowFile { /// Creates a QcowFile from `file` and with a max nesting depth. File must be a valid qcow2 /// image. pub fn from_with_nesting_depth( - mut file: RawFile, + file: RawFile, max_nesting_depth: u32, sparse: bool, ) -> Result { - let header = QcowHeader::new(&mut file)?; - - // Only v2 and v3 files are supported. - if header.version != 2 && header.version != 3 { - return Err(Error::UnsupportedVersion(header.version)); - } - - // Make sure that the L1 table fits in RAM. 
- if u64::from(header.l1_size) > MAX_RAM_POINTER_TABLE_SIZE { - return Err(Error::InvalidL1TableSize(header.l1_size)); - } - - let cluster_bits: u32 = header.cluster_bits; - if !(MIN_CLUSTER_BITS..=MAX_CLUSTER_BITS).contains(&cluster_bits) { - return Err(Error::InvalidClusterSize); - } - let cluster_size = 0x01u64 << cluster_bits; - - // Limit the total size of the disk. - if header.size > MAX_QCOW_FILE_SIZE { - return Err(Error::FileTooBig(header.size)); - } - - let direct_io = file.is_direct(); - - let backing_file = BackingFile::new( - header.backing_file.as_ref(), - direct_io, - max_nesting_depth, - sparse, - )?; - - // Validate refcount order to be 0..6 - let refcount_bits: u64 = 0x01u64 - .checked_shl(header.refcount_order) - .ok_or(Error::UnsupportedRefcountOrder)?; - if refcount_bits > 64 { - return Err(Error::UnsupportedRefcountOrder); - } - - // Need at least one refcount cluster - if header.refcount_table_clusters == 0 { - return Err(Error::NoRefcountClusters); - } - offset_is_cluster_boundary(header.l1_table_offset, header.cluster_bits)?; - offset_is_cluster_boundary(header.snapshots_offset, header.cluster_bits)?; - // refcount table must be a cluster boundary, and within the file's virtual or actual size. - offset_is_cluster_boundary(header.refcount_table_offset, header.cluster_bits)?; - let file_size = file.metadata().map_err(Error::GettingFileSize)?.len(); - if header.refcount_table_offset > max(file_size, header.size) { - return Err(Error::RefcountTableOffEnd); - } - - // The first cluster should always have a non-zero refcount, so if it is 0, - // this is an old file with broken refcounts, which requires a rebuild. 
- let mut refcount_rebuild_required = true; - file.seek(SeekFrom::Start(header.refcount_table_offset)) - .map_err(Error::SeekingFile)?; - let first_refblock_addr = u64::read_be(&mut file).map_err(Error::ReadingHeader)?; - if first_refblock_addr != 0 { - file.seek(SeekFrom::Start(first_refblock_addr)) - .map_err(Error::SeekingFile)?; - let first_cluster_refcount = u16::read_be(&mut file).map_err(Error::ReadingHeader)?; - if first_cluster_refcount != 0 { - refcount_rebuild_required = false; - } - } - - if (header.compatible_features & COMPATIBLE_FEATURES_LAZY_REFCOUNTS) != 0 { - refcount_rebuild_required = true; - } - - let mut raw_file = QcowRawFile::from(file, cluster_size, refcount_bits) - .ok_or(Error::InvalidClusterSize)?; - let is_writable = raw_file.file().is_writable(); - - if header.is_corrupt() { - if is_writable { - return Err(Error::CorruptImage); - } - let path = read_link(format!("/proc/self/fd/{}", raw_file.file().as_raw_fd())) - .map_or_else(|_| "".to_string(), |p| p.display().to_string()); - warn!("QCOW2 image is marked corrupt, opening read-only: {path}"); - } - - // Image already has dirty bit set. Refcounts may be invalid. - if IncompatFeatures::from_bits_truncate(header.incompatible_features) - .contains(IncompatFeatures::DIRTY) - { - log::warn!("QCOW2 image not cleanly closed, rebuilding refcounts"); - refcount_rebuild_required = true; - } - - // Skip refcount rebuilding for readonly files. 
- if refcount_rebuild_required && is_writable { - QcowFile::rebuild_refcounts(&mut raw_file, header.clone())?; - } - - let entries_per_cluster = cluster_size / size_of::() as u64; - let num_clusters = div_round_up_u64(header.size, cluster_size); - let num_l2_clusters = div_round_up_u64(num_clusters, entries_per_cluster); - let l1_clusters = div_round_up_u64(num_l2_clusters, entries_per_cluster); - let header_clusters = div_round_up_u64(size_of::() as u64, cluster_size); - if num_l2_clusters > MAX_RAM_POINTER_TABLE_SIZE { - return Err(Error::TooManyL1Entries(num_l2_clusters)); - } - let l1_table = VecCache::from_vec( - raw_file - .read_pointer_table( - header.l1_table_offset, - num_l2_clusters, - Some(L1_TABLE_OFFSET_MASK), - ) - .map_err(Error::ReadingHeader)?, - ); - - let num_clusters = div_round_up_u64(header.size, cluster_size); - let refcount_clusters = max_refcount_clusters( - header.refcount_order, - cluster_size as u32, - (num_clusters + l1_clusters + num_l2_clusters + header_clusters) as u32, - ); - // Check that the given header doesn't have a suspiciously sized refcount table. 
- if u64::from(header.refcount_table_clusters) > 2 * refcount_clusters { - return Err(Error::RefcountTableTooLarge); - } - if l1_clusters + refcount_clusters > MAX_RAM_POINTER_TABLE_SIZE { - return Err(Error::TooManyRefcounts(refcount_clusters)); - } - let refcount_block_entries = cluster_size * 8 / refcount_bits; - let refcounts = RefCount::new( - &mut raw_file, - header.refcount_table_offset, - refcount_clusters, - refcount_block_entries, - cluster_size, - refcount_bits, - ) - .map_err(Error::ReadingRefCounts)?; - - let l2_entries = cluster_size / size_of::() as u64; - - let mut qcow = QcowFile { + let (inner, backing_file, sparse) = parse_qcow(file, max_nesting_depth, sparse)?; + let metadata::QcowState { + raw_file, + header, + l1_table, + l2_entries, + l2_cache, + refcounts, + avail_clusters, + unref_clusters, + } = inner; + Ok(QcowFile { raw_file, header, l1_table, l2_entries, - l2_cache: CacheMap::new(100), + l2_cache, refcounts, current_offset: 0, - unref_clusters: Vec::new(), - avail_clusters: Vec::new(), + unref_clusters, + avail_clusters, backing_file, sparse, - }; - - // Check that the L1 and refcount tables fit in a 64bit address space. - qcow.header - .l1_table_offset - .checked_add(qcow.l1_address_offset(qcow.virtual_size())) - .ok_or(Error::InvalidL1TableOffset)?; - qcow.header - .refcount_table_offset - .checked_add(u64::from(qcow.header.refcount_table_clusters) * cluster_size) - .ok_or(Error::InvalidRefcountTableOffset)?; - - qcow.find_avail_clusters()?; - - if is_writable { - if !IncompatFeatures::from_bits_truncate(qcow.header.incompatible_features) - .contains(IncompatFeatures::DIRTY) - { - qcow.header.set_dirty_bit(qcow.raw_file.file_mut(), true)?; - } - - qcow.header - .clear_autoclear_features(qcow.raw_file.file_mut())?; - } - - Ok(qcow) + }) } /// Creates a new QcowFile at the given path. 
@@ -779,29 +827,6 @@ impl QcowFile { Ok(()) } - fn find_avail_clusters(&mut self) -> Result<()> { - let cluster_size = self.raw_file.cluster_size(); - - let file_size = self - .raw_file - .file_mut() - .metadata() - .map_err(Error::GettingFileSize)? - .len(); - - for i in (0..file_size).step_by(cluster_size as usize) { - let refcount = self - .refcounts - .get_cluster_refcount(&mut self.raw_file, i) - .map_err(Error::GettingRefcount)?; - if refcount == 0 { - self.avail_clusters.push(i); - } - } - - Ok(()) - } - /// Rebuild the reference count tables. fn rebuild_refcounts(raw_file: &mut QcowRawFile, header: QcowHeader) -> Result<()> { fn add_ref( @@ -1130,12 +1155,6 @@ impl QcowFile { self.header.size } - // Gets the offset of `address` in the L1 table. - fn l1_address_offset(&self, address: u64) -> u64 { - let l1_index = self.l1_table_index(address); - l1_index * size_of::() as u64 - } - // Gets the offset of `address` in the L1 table. fn l1_table_index(&self, address: u64) -> u64 { (address / self.raw_file.cluster_size()) / self.l2_entries From a4a5b19f64a0f6322042e110166e528c3f648c78 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Tue, 17 Feb 2026 22:59:21 +0100 Subject: [PATCH 088/742] block: qcow: Add resize() to QcowMetadata Add resize() and grow_l1_table() so the metadata layer can grow the virtual disk size. Only grow is supported. 
Signed-off-by: Anatol Belski --- block/src/qcow/metadata.rs | 121 +++++++++++++++++++++++++++++++++++-- 1 file changed, 117 insertions(+), 4 deletions(-) diff --git a/block/src/qcow/metadata.rs b/block/src/qcow/metadata.rs index 88077236ca..c78a2af0be 100644 --- a/block/src/qcow/metadata.rs +++ b/block/src/qcow/metadata.rs @@ -26,8 +26,9 @@ use libc::{EINVAL, EIO}; use super::qcow_raw_file::QcowRawFile; use super::refcount::RefCount; use super::util::{ - l2_entry_compressed_cluster_layout, l2_entry_is_compressed, l2_entry_is_empty, - l2_entry_is_zero, l2_entry_make_std, l2_entry_make_zero, l2_entry_std_cluster_addr, + div_round_up_u64, l1_entry_make, l2_entry_compressed_cluster_layout, l2_entry_is_compressed, + l2_entry_is_empty, l2_entry_is_zero, l2_entry_make_std, l2_entry_make_zero, + l2_entry_std_cluster_addr, }; use super::vec_cache::{CacheMap, Cacheable, VecCache}; use super::{QcowHeader, refcount}; @@ -237,6 +238,16 @@ impl QcowMetadata { Ok(()) } + /// Resizes the QCOW2 image to the given new size. Only grow is + /// supported, shrink would require walking all L2 tables to reclaim + /// clusters beyond the new size and risks data loss. + /// + /// Returns an error if the new size is smaller than the current size. + pub fn resize(&self, new_size: u64) -> io::Result<()> { + let mut inner = self.inner.write().unwrap(); + inner.resize(new_size) + } + /// Deallocates a range of bytes. Full clusters are deallocated via metadata. /// Partial clusters need the caller to write zeros. This method returns a /// list of actions the caller should take. @@ -658,6 +669,110 @@ impl QcowState { Ok(()) } + /// Resizes the image to the given new size. Only grow is supported, + /// shrink would require walking all L2 tables to reclaim clusters + /// beyond the new size and risks data loss. 
+ fn resize(&mut self, new_size: u64) -> io::Result<()> { + let current_size = self.header.size; + + if new_size == current_size { + return Ok(()); + } + + if new_size < current_size { + return Err(io::Error::other("shrinking QCOW2 images is not supported")); + } + + let cluster_size = self.raw_file.cluster_size(); + let entries_per_cluster = cluster_size / size_of::() as u64; + let new_clusters = div_round_up_u64(new_size, cluster_size); + let needed_l1_entries = div_round_up_u64(new_clusters, entries_per_cluster) as u32; + + if needed_l1_entries > self.header.l1_size { + self.grow_l1_table(needed_l1_entries)?; + } + + self.header.size = new_size; + + self.raw_file.file_mut().rewind()?; + self.header + .write_to(self.raw_file.file_mut()) + .map_err(|e| io::Error::other(format!("failed to write header during resize: {e}")))?; + + self.raw_file.file_mut().sync_all()?; + + Ok(()) + } + + /// Grows the L1 table to accommodate at least the requested number of entries. + fn grow_l1_table(&mut self, new_l1_size: u32) -> io::Result<()> { + let old_l1_size = self.header.l1_size; + let old_l1_offset = self.header.l1_table_offset; + let cluster_size = self.raw_file.cluster_size(); + + let new_l1_bytes = new_l1_size as u64 * size_of::() as u64; + let new_l1_clusters = div_round_up_u64(new_l1_bytes, cluster_size); + + // Allocate contiguous clusters at file end for new L1 table + let file_size = self.raw_file.file_mut().seek(io::SeekFrom::End(0))?; + let new_l1_offset = self.raw_file.cluster_address(file_size + cluster_size - 1); + + let new_file_end = new_l1_offset + new_l1_clusters * cluster_size; + self.raw_file.file_mut().set_len(new_file_end)?; + + // Set refcounts for the contiguous range + for i in 0..new_l1_clusters { + self.set_cluster_refcount_track_freed(new_l1_offset + i * cluster_size, 1)?; + } + + let mut new_l1_data = vec![0u64; new_l1_size as usize]; + let old_entries = self.l1_table.get_values(); + new_l1_data[..old_entries.len()].copy_from_slice(old_entries); 
+ + for l2_addr in new_l1_data.iter_mut() { + if *l2_addr != 0 { + let refcount = self + .refcounts + .get_cluster_refcount(&mut self.raw_file, *l2_addr) + .map_err(|e| { + io::Error::other(format!("failed to get refcount during resize: {e}")) + })?; + *l2_addr = l1_entry_make(*l2_addr, refcount == 1); + } + } + + // Write the new L1 table to disk + self.raw_file + .write_pointer_table_direct(new_l1_offset, new_l1_data.iter())?; + + self.raw_file.file_mut().sync_all()?; + + self.header.l1_size = new_l1_size; + self.header.l1_table_offset = new_l1_offset; + + self.raw_file.file_mut().rewind()?; + self.header + .write_to(self.raw_file.file_mut()) + .map_err(|e| io::Error::other(format!("failed to write header during resize: {e}")))?; + + self.raw_file.file_mut().sync_all()?; + + // Free old L1 table clusters + let old_l1_bytes = old_l1_size as u64 * size_of::() as u64; + let old_l1_clusters = div_round_up_u64(old_l1_bytes, cluster_size); + for i in 0..old_l1_clusters { + let cluster_addr = old_l1_offset + i * cluster_size; + // Best effort: the old L1 clusters are no longer reachable, + // so a refcount update failure just leaks space. + let _ = self.set_cluster_refcount(cluster_addr, 0); + } + + // Update L1 table cache + self.l1_table.extend(new_l1_size as usize); + + Ok(()) + } + /// Deallocates a cluster at the given guest address. /// /// If sparse is true, fully deallocates and returns the host offset if @@ -804,8 +919,6 @@ impl QcowState { /// Flushes all dirty metadata to disk. pub(super) fn sync_caches(&mut self) -> io::Result<()> { - use super::l1_entry_make; - // Write out all dirty L2 tables. 
for (l1_index, l2_table) in self.l2_cache.iter_mut().filter(|(_k, v)| v.dirty()) { let addr = self.l1_table[*l1_index]; From 57e89b04f6d9e7da4d5d0b84d0021c9f1b706b7a Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Sat, 21 Feb 2026 16:52:20 +0100 Subject: [PATCH 089/742] block: qcow: Refactor BackingFile for ownership based decomposition Replace the clone based BackingFileOps trait with a BackingKind enum so backing files can be decomposed into their concrete owned types. BackingFile::new() for QCOW2 backings now calls parse_qcow() directly instead of building a full QcowFile. Remove Clone for BackingFile and QcowFile. Prerequisite for the qcow_sync rewrite which decomposes a BackingFile into a raw fd or QcowMetadata for lock free I/O. Signed-off-by: Anatol Belski --- block/src/qcow/metadata.rs | 33 +- block/src/qcow/mod.rs | 145 ++++-- block/src/qcow_sync.rs | 1005 ++++++++++++++++++++++++------------ 3 files changed, 816 insertions(+), 367 deletions(-) diff --git a/block/src/qcow/metadata.rs b/block/src/qcow/metadata.rs index c78a2af0be..b4b64cabd0 100644 --- a/block/src/qcow/metadata.rs +++ b/block/src/qcow/metadata.rs @@ -130,7 +130,7 @@ pub(crate) struct QcowState { } impl QcowMetadata { - pub(super) fn new(inner: QcowState) -> Self { + pub(crate) fn new(inner: QcowState) -> Self { QcowMetadata { inner: RwLock::new(inner), } @@ -238,6 +238,21 @@ impl QcowMetadata { Ok(()) } + /// Flushes dirty metadata caches and clears the dirty bit for + /// clean shutdown. + pub fn shutdown(&self) { + let mut inner = self.inner.write().unwrap(); + let _ = inner.sync_caches(); + let QcowState { + ref mut header, + ref mut raw_file, + .. + } = *inner; + if raw_file.file().is_writable() { + let _ = header.set_dirty_bit(raw_file.file_mut(), false); + } + } + /// Resizes the QCOW2 image to the given new size. Only grow is /// supported, shrink would require walking all L2 tables to reclaim /// clusters beyond the new size and risks data loss. 
@@ -260,6 +275,9 @@ impl QcowMetadata { cluster_size: u64, backing_file: Option<&dyn BackingRead>, ) -> io::Result> { + if address.checked_add(length as u64).is_none() { + return Ok(Vec::new()); + } let mut inner = self.inner.write().unwrap(); let mut actions = Vec::new(); @@ -476,6 +494,19 @@ impl QcowState { } } + /// Maps a single cluster region for a sequential read. + pub(crate) fn map_cluster_read( + &mut self, + address: u64, + count: usize, + has_backing_file: bool, + ) -> io::Result { + match self.try_map_read(address, count, has_backing_file)? { + Some(mapping) => Ok(mapping), + None => self.map_read_with_populate(address, count, has_backing_file), + } + } + /// Write path mapping. Always called under write lock. fn map_write( &mut self, diff --git a/block/src/qcow/mod.rs b/block/src/qcow/mod.rs index 06dce2c4ce..c0b4e8c720 100644 --- a/block/src/qcow/mod.rs +++ b/block/src/qcow/mod.rs @@ -7,7 +7,7 @@ mod decoder; mod header; pub(crate) mod metadata; -mod qcow_raw_file; +pub(crate) mod qcow_raw_file; mod raw_file; mod refcount; mod util; @@ -37,6 +37,7 @@ use header::{ }; use libc::{EINVAL, EIO, ENOSPC}; use log::{error, warn}; +use metadata::ClusterReadMapping; use remain::sorted; use thiserror::Error; pub(crate) use util::MAX_NESTING_DEPTH; @@ -162,29 +163,22 @@ pub enum Error { pub type Result = std::result::Result; -trait BackingFileOps: Send + Seek + Read { - fn read_at(&mut self, address: u64, buf: &mut [u8]) -> std::io::Result<()> { - self.seek(SeekFrom::Start(address))?; - self.read_exact(buf) - } - fn clone_box(&self) -> Box; -} - -impl BackingFileOps for QcowFile { - fn clone_box(&self) -> Box { - Box::new(self.clone()) - } -} - -impl BackingFileOps for RawFile { - fn clone_box(&self) -> Box { - Box::new(self.clone()) - } +/// Concrete backing file variants. +pub(crate) enum BackingKind { + /// Raw backing file. + Raw(RawFile), + /// QCOW2 backing parsed into metadata and raw file. 
+ Qcow { + inner: Box, + backing: Option>, + }, + /// Full QcowFile used as backing, only in tests. + #[cfg(test)] + QcowFile(Box), } - /// Backing file wrapper pub(crate) struct BackingFile { - inner: Box, + kind: BackingKind, virtual_size: u64, } @@ -217,56 +211,108 @@ impl BackingFile { None => detect_image_type(&mut raw_file)?, }; - let (inner, virtual_size): (Box, u64) = match backing_format { + let (kind, virtual_size) = match backing_format { ImageType::Raw => { let size = raw_file .seek(SeekFrom::End(0)) .map_err(Error::BackingFileIo)?; raw_file.rewind().map_err(Error::BackingFileIo)?; - (Box::new(raw_file), size) + (BackingKind::Raw(raw_file), size) } ImageType::Qcow2 => { - let backing_qcow = - QcowFile::from_with_nesting_depth(raw_file, max_nesting_depth - 1, sparse) + let (inner, nested_backing, _sparse) = + parse_qcow(raw_file, max_nesting_depth - 1, sparse) .map_err(|e| Error::BackingFileOpen(Box::new(e)))?; - let size = backing_qcow.virtual_size(); - (Box::new(backing_qcow), size) + let size = inner.header.size; + ( + BackingKind::Qcow { + inner: Box::new(inner), + backing: nested_backing.map(Box::new), + }, + size, + ) } }; - Ok(Some(Self { - inner, - virtual_size, - })) + Ok(Some(Self { kind, virtual_size })) + } + + /// Consume and return the kind and virtual size. + pub(crate) fn into_kind(self) -> (BackingKind, u64) { + (self.kind, self.virtual_size) } /// Read from backing file, returning zeros for any portion beyond backing file size. 
#[inline] - fn read_at(&mut self, address: u64, buf: &mut [u8]) -> std::io::Result<()> { + pub(crate) fn read_at(&mut self, address: u64, buf: &mut [u8]) -> std::io::Result<()> { if address >= self.virtual_size { - // Entire read is beyond backing file buf.fill(0); return Ok(()); } let available = (self.virtual_size - address) as usize; - if available >= buf.len() { - // Entire read is within backing file - self.inner.read_at(address, buf) + let (target, overflow) = if available >= buf.len() { + (buf, &mut [][..]) } else { - // Partial read, fill the rest with zeroes - self.inner.read_at(address, &mut buf[..available])?; - buf[available..].fill(0); - Ok(()) - } + buf.split_at_mut(available) + }; + Self::read_at_inner(&mut self.kind, address, target)?; + overflow.fill(0); + Ok(()) } -} -impl Clone for BackingFile { - fn clone(&self) -> Self { - Self { - inner: self.inner.clone_box(), - virtual_size: self.virtual_size, + fn read_at_inner(kind: &mut BackingKind, address: u64, buf: &mut [u8]) -> std::io::Result<()> { + match kind { + BackingKind::Raw(file) => { + file.seek(SeekFrom::Start(address))?; + file.read_exact(buf) + } + #[cfg(test)] + BackingKind::QcowFile(qcow) => { + qcow.seek(SeekFrom::Start(address))?; + qcow.read_exact(buf) + } + BackingKind::Qcow { inner, backing } => { + let has_backing = backing.is_some(); + let cluster_size = inner.raw_file.cluster_size(); + let mut pos = 0usize; + while pos < buf.len() { + let curr_addr = address + pos as u64; + let intra = inner.raw_file.cluster_offset(curr_addr) as usize; + let count = min(buf.len() - pos, cluster_size as usize - intra); + let mapping = inner.map_cluster_read(curr_addr, count, has_backing)?; + match mapping { + ClusterReadMapping::Zero { length } => { + buf[pos..pos + length as usize].fill(0); + } + ClusterReadMapping::Allocated { + offset: host_off, + length, + } => { + inner.raw_file.file_mut().seek(SeekFrom::Start(host_off))?; + inner + .raw_file + .file_mut() + .read_exact(&mut buf[pos..pos + 
length as usize])?; + } + ClusterReadMapping::Compressed { data } => { + buf[pos..pos + data.len()].copy_from_slice(&data); + } + ClusterReadMapping::Backing { + offset: backing_off, + length, + } => { + if let Some(bf) = backing.as_mut() { + bf.read_at(backing_off, &mut buf[pos..pos + length as usize])?; + } else { + buf[pos..pos + length as usize].fill(0); + } + } + } + pos += count; + } + Ok(()) + } } } } @@ -497,7 +543,7 @@ pub(crate) fn parse_qcow( /// # Ok(()) /// # } /// ``` -#[derive(Clone, Debug)] +#[derive(Debug)] pub struct QcowFile { raw_file: QcowRawFile, header: QcowHeader, @@ -605,11 +651,12 @@ impl QcowFile { Ok(qcow) } + #[cfg(test)] pub fn set_backing_file(&mut self, backing: Option>) { self.backing_file = backing.map(|b| { let virtual_size = b.virtual_size(); BackingFile { - inner: Box::new(*b), + kind: BackingKind::QcowFile(b), virtual_size, } }); diff --git a/block/src/qcow_sync.rs b/block/src/qcow_sync.rs index 2707f5dfba..22d361adad 100644 --- a/block/src/qcow_sync.rs +++ b/block/src/qcow_sync.rs @@ -2,80 +2,236 @@ // // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause +use std::cmp::min; use std::collections::VecDeque; use std::fs::File; -use std::io::{self, Seek, SeekFrom}; -use std::os::fd::AsRawFd; -use std::sync::{Arc, Mutex}; +use std::os::fd::{AsRawFd, FromRawFd, OwnedFd, RawFd}; +use std::sync::Arc; +use std::{io, ptr, slice}; use vmm_sys_util::eventfd::EventFd; -use vmm_sys_util::write_zeroes::PunchHole; +use vmm_sys_util::write_zeroes::{PunchHole, WriteZeroesAt}; use crate::async_io::{ AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFile, DiskFileError, DiskFileResult, }; -use crate::qcow::{Error as QcowError, MAX_NESTING_DEPTH, QcowFile, RawFile, Result as QcowResult}; -use crate::{AsyncAdaptor, BlockBackend}; +use crate::qcow::metadata::{ + BackingRead, ClusterReadMapping, ClusterWriteMapping, DeallocAction, QcowMetadata, +}; +use crate::qcow::qcow_raw_file::QcowRawFile; +use crate::qcow::{ + BackingFile, 
BackingKind, Error as QcowError, MAX_NESTING_DEPTH, RawFile, Result as QcowResult, + parse_qcow, +}; + +/// Raw backing file using pread64 on a duplicated fd. +struct RawBacking { + fd: OwnedFd, + virtual_size: u64, +} + +// SAFETY: The only I/O operation is pread64 which is position independent +// and safe for concurrent use from multiple threads. +unsafe impl Sync for RawBacking {} + +impl BackingRead for RawBacking { + fn read_at(&self, address: u64, buf: &mut [u8]) -> io::Result<()> { + if address >= self.virtual_size { + buf.fill(0); + return Ok(()); + } + let available = (self.virtual_size - address) as usize; + if available >= buf.len() { + pread_exact(self.fd.as_raw_fd(), buf, address) + } else { + pread_exact(self.fd.as_raw_fd(), &mut buf[..available], address)?; + buf[available..].fill(0); + Ok(()) + } + } +} + +/// QCOW2 backing file with RwLock metadata and pread64 data reads. +/// +/// Read only because backing files never receive writes. Nested backing +/// files are handled recursively. +struct Qcow2MetadataBacking { + metadata: Arc, + data_fd: OwnedFd, + backing_file: Option>, +} + +// SAFETY: All reads go through QcowMetadata which uses RwLock +// and pread64 which is position independent and thread safe. +unsafe impl Sync for Qcow2MetadataBacking {} + +impl BackingRead for Qcow2MetadataBacking { + fn read_at(&self, address: u64, buf: &mut [u8]) -> io::Result<()> { + let virtual_size = self.metadata.virtual_size(); + if address >= virtual_size { + buf.fill(0); + return Ok(()); + } + let available = (virtual_size - address) as usize; + if available < buf.len() { + self.read_clusters(address, &mut buf[..available])?; + buf[available..].fill(0); + return Ok(()); + } + self.read_clusters(address, buf) + } +} + +impl Qcow2MetadataBacking { + /// Resolve cluster mappings via metadata then read allocated clusters + /// with pread64. 
+ fn read_clusters(&self, address: u64, buf: &mut [u8]) -> io::Result<()> { + let total_len = buf.len(); + let has_backing = self.backing_file.is_some(); + + let mappings = self + .metadata + .map_clusters_for_read(address, total_len, has_backing)?; + + let mut buf_offset = 0usize; + for mapping in mappings { + match mapping { + ClusterReadMapping::Zero { length } => { + buf[buf_offset..buf_offset + length as usize].fill(0); + buf_offset += length as usize; + } + ClusterReadMapping::Allocated { + offset: host_offset, + length, + } => { + pread_exact( + self.data_fd.as_raw_fd(), + &mut buf[buf_offset..buf_offset + length as usize], + host_offset, + )?; + buf_offset += length as usize; + } + ClusterReadMapping::Compressed { data } => { + let len = data.len(); + buf[buf_offset..buf_offset + len].copy_from_slice(&data); + buf_offset += len; + } + ClusterReadMapping::Backing { + offset: backing_offset, + length, + } => { + self.backing_file.as_ref().unwrap().read_at( + backing_offset, + &mut buf[buf_offset..buf_offset + length as usize], + )?; + buf_offset += length as usize; + } + } + } + Ok(()) + } +} + +impl Drop for Qcow2MetadataBacking { + fn drop(&mut self) { + self.metadata.shutdown(); + } +} + +/// Construct a thread safe backing file reader. +fn shared_backing_from(bf: BackingFile) -> QcowResult> { + let (kind, virtual_size) = bf.into_kind(); + match kind { + BackingKind::Raw(raw_file) => { + // SAFETY: raw_file holds a valid open fd. + let dup_fd = unsafe { libc::dup(raw_file.as_raw_fd()) }; + if dup_fd < 0 { + return Err(QcowError::BackingFileIo(io::Error::last_os_error())); + } + // SAFETY: dup_fd is a freshly duplicated valid fd. + let fd = unsafe { OwnedFd::from_raw_fd(dup_fd) }; + Ok(Arc::new(RawBacking { fd, virtual_size })) + } + BackingKind::Qcow { inner, backing } => { + // SAFETY: inner.raw_file holds a valid open fd. 
+ let dup_fd = unsafe { libc::dup(inner.raw_file.as_raw_fd()) }; + if dup_fd < 0 { + return Err(QcowError::BackingFileIo(io::Error::last_os_error())); + } + // SAFETY: dup_fd is a freshly duplicated valid fd. + let data_fd = unsafe { OwnedFd::from_raw_fd(dup_fd) }; + Ok(Arc::new(Qcow2MetadataBacking { + metadata: Arc::new(QcowMetadata::new(*inner)), + data_fd, + backing_file: backing.map(|bf| shared_backing_from(*bf)).transpose()?, + })) + } + #[cfg(test)] + BackingKind::QcowFile(_) => { + unreachable!("QcowFile variant is only used by set_backing_file() in tests") + } + } +} pub struct QcowDiskSync { - // FIXME: The Mutex serializes all QCOW2 I/O operations across queues, which - // is necessary for correctness but eliminates any parallelism benefit from - // multiqueue. QcowFile has internal mutable state (L2 cache, refcounts, file - // position) that is not safe to share across threads via Clone. - // - // A proper fix would require restructuring QcowFile to separate metadata - // operations (which need synchronization) from data I/O (which could be - // parallelized with per queue file descriptors). See #7560 for details. - qcow_file: Arc>, + metadata: Arc, + /// Shared across queues, resolved once at construction. 
+ backing_file: Option>, + sparse: bool, + data_raw_file: QcowRawFile, } impl QcowDiskSync { pub fn new(file: File, direct_io: bool, backing_files: bool, sparse: bool) -> QcowResult { let max_nesting_depth = if backing_files { MAX_NESTING_DEPTH } else { 0 }; - let qcow_file = QcowFile::from_with_nesting_depth( - RawFile::new(file, direct_io), - max_nesting_depth, - sparse, - ) - .map_err(|e| match e { - QcowError::MaxNestingDepthExceeded if !backing_files => QcowError::BackingFilesDisabled, - other => other, - })?; + let (inner, backing_file, sparse) = + parse_qcow(RawFile::new(file, direct_io), max_nesting_depth, sparse).map_err(|e| { + match e { + QcowError::MaxNestingDepthExceeded if !backing_files => { + QcowError::BackingFilesDisabled + } + other => other, + } + })?; + let data_raw_file = inner.raw_file.clone(); Ok(QcowDiskSync { - qcow_file: Arc::new(Mutex::new(qcow_file)), + metadata: Arc::new(QcowMetadata::new(inner)), + backing_file: backing_file.map(shared_backing_from).transpose()?, + sparse, + data_raw_file, }) } } impl DiskFile for QcowDiskSync { fn logical_size(&mut self) -> DiskFileResult { - self.qcow_file - .lock() - .unwrap() - .seek(SeekFrom::End(0)) - .map_err(DiskFileError::Size) + Ok(self.metadata.virtual_size()) } fn physical_size(&mut self) -> DiskFileResult { - self.qcow_file.lock().unwrap().physical_size().map_err(|e| { - let io_inner = match e { - crate::Error::GetFileMetadata(e) => e, - _ => unreachable!(), - }; - DiskFileError::Size(io_inner) - }) + self.data_raw_file + .physical_size() + .map_err(DiskFileError::Size) } fn new_async_io(&self, _ring_depth: u32) -> DiskFileResult> { - Ok(Box::new(QcowSync::new(Arc::clone(&self.qcow_file))) as Box) + Ok(Box::new(QcowSync::new( + Arc::clone(&self.metadata), + self.data_raw_file.clone(), + self.backing_file.as_ref().map(Arc::clone), + self.sparse, + )) as Box) } fn resize(&mut self, size: u64) -> DiskFileResult<()> { - self.qcow_file - .lock() - .unwrap() + if self.backing_file.is_some() { + 
return Err(DiskFileError::ResizeError(io::Error::other( + "resize not supported with backing file", + ))); + } + self.metadata .resize(size) - .map_err(|e| DiskFileError::ResizeError(io::Error::other(e))) + .map_err(DiskFileError::ResizeError) } fn supports_sparse_operations(&self) -> bool { @@ -87,20 +243,38 @@ impl DiskFile for QcowDiskSync { } fn fd(&mut self) -> BorrowedDiskFd<'_> { - BorrowedDiskFd::new(self.qcow_file.lock().unwrap().as_raw_fd()) + BorrowedDiskFd::new(self.data_raw_file.as_raw_fd()) + } +} + +impl Drop for QcowDiskSync { + fn drop(&mut self) { + self.metadata.shutdown(); } } pub struct QcowSync { - qcow_file: Arc>, + metadata: Arc, + data_file: QcowRawFile, + /// See the backing_file field on QcowDiskSync. + backing_file: Option>, + sparse: bool, eventfd: EventFd, completion_list: VecDeque<(u64, i32)>, } impl QcowSync { - pub fn new(qcow_file: Arc>) -> Self { + fn new( + metadata: Arc, + data_file: QcowRawFile, + backing_file: Option>, + sparse: bool, + ) -> Self { QcowSync { - qcow_file, + metadata, + data_file, + backing_file, + sparse, eventfd: EventFd::new(libc::EFD_NONBLOCK) .expect("Failed creating EventFd for QcowSync"), completion_list: VecDeque::new(), @@ -108,7 +282,152 @@ impl QcowSync { } } -impl AsyncAdaptor for QcowFile {} +// -- Position independent I/O helpers -- +// +// Duplicated file descriptors share the kernel file description and thus the +// file position. Using seek then read from multiple queues races on that +// shared position. pread64 and pwrite64 are atomic and never touch the position. + +/// Read exactly the requested bytes at offset, looping on short reads. +fn pread_exact(fd: RawFd, buf: &mut [u8], offset: u64) -> io::Result<()> { + let mut total = 0usize; + while total < buf.len() { + // SAFETY: buf and fd are valid for the lifetime of the call. 
+ let ret = unsafe { + libc::pread64( + fd, + buf[total..].as_mut_ptr() as *mut libc::c_void, + buf.len() - total, + (offset + total as u64) as libc::off_t, + ) + }; + if ret < 0 { + return Err(io::Error::last_os_error()); + } + if ret == 0 { + return Err(io::Error::from(io::ErrorKind::UnexpectedEof)); + } + total += ret as usize; + } + Ok(()) +} + +/// Write all bytes to fd at offset, looping on short writes. +fn pwrite_all(fd: RawFd, buf: &[u8], offset: u64) -> io::Result<()> { + let mut total = 0usize; + while total < buf.len() { + // SAFETY: buf and fd are valid for the lifetime of the call. + let ret = unsafe { + libc::pwrite64( + fd, + buf[total..].as_ptr() as *const libc::c_void, + buf.len() - total, + (offset + total as u64) as libc::off_t, + ) + }; + if ret < 0 { + return Err(io::Error::last_os_error()); + } + if ret == 0 { + return Err(io::Error::other("pwrite64 wrote 0 bytes")); + } + total += ret as usize; + } + Ok(()) +} + +// -- iovec helper functions -- +// +// Operate on the iovec array as a flat byte stream. + +/// Copy data into iovecs starting at the given byte offset. +/// +/// # Safety +/// Caller must ensure iovecs point to valid, writable memory of sufficient size. +unsafe fn scatter_to_iovecs(iovecs: &[libc::iovec], start: usize, data: &[u8]) { + let mut remaining = data; + let mut pos = 0usize; + for iov in iovecs { + let iov_end = pos + iov.iov_len; + if iov_end <= start || remaining.is_empty() { + pos = iov_end; + continue; + } + let iov_start = start.saturating_sub(pos); + let available = iov.iov_len - iov_start; + let count = min(available, remaining.len()); + // SAFETY: iov_base is valid for iov_len bytes per caller contract. + unsafe { + let dst = (iov.iov_base as *mut u8).add(iov_start); + ptr::copy_nonoverlapping(remaining.as_ptr(), dst, count); + } + remaining = &remaining[count..]; + if remaining.is_empty() { + break; + } + pos = iov_end; + } +} + +/// Zero fill iovecs starting at the given byte offset for the given length. 
+/// +/// # Safety +/// Caller must ensure iovecs point to valid, writable memory of sufficient size. +unsafe fn zero_fill_iovecs(iovecs: &[libc::iovec], start: usize, len: usize) { + let mut remaining = len; + let mut pos = 0usize; + for iov in iovecs { + let iov_end = pos + iov.iov_len; + if iov_end <= start || remaining == 0 { + pos = iov_end; + continue; + } + let iov_start = start.saturating_sub(pos); + let available = iov.iov_len - iov_start; + let count = min(available, remaining); + // SAFETY: iov_base is valid for iov_len bytes per caller contract. + unsafe { + let dst = (iov.iov_base as *mut u8).add(iov_start); + ptr::write_bytes(dst, 0, count); + } + remaining -= count; + if remaining == 0 { + break; + } + pos = iov_end; + } +} + +/// Gather bytes from iovecs starting at the given byte offset into a Vec. +/// +/// # Safety +/// Caller must ensure iovecs point to valid, readable memory of sufficient size. +unsafe fn gather_from_iovecs(iovecs: &[libc::iovec], start: usize, len: usize) -> Vec { + let mut result = Vec::with_capacity(len); + let mut remaining = len; + let mut pos = 0usize; + for iov in iovecs { + let iov_end = pos + iov.iov_len; + if iov_end <= start || remaining == 0 { + pos = iov_end; + continue; + } + let iov_start = start.saturating_sub(pos); + let available = iov.iov_len - iov_start; + let count = min(available, remaining); + // SAFETY: iov_base is valid for iov_len bytes per caller contract. 
+ unsafe { + let src = (iov.iov_base as *const u8).add(iov_start); + result.extend_from_slice(slice::from_raw_parts(src, count)); + } + remaining -= count; + if remaining == 0 { + break; + } + pos = iov_end; + } + result +} impl AsyncIo for QcowSync { fn notifier(&self) -> &EventFd { @@ -121,13 +440,61 @@ impl AsyncIo for QcowSync { iovecs: &[libc::iovec], user_data: u64, ) -> AsyncIoResult<()> { - self.qcow_file.lock().unwrap().read_vectored_sync( - offset, - iovecs, - user_data, - &self.eventfd, - &mut self.completion_list, - ) + let address = offset as u64; + let total_len: usize = iovecs.iter().map(|v| v.iov_len).sum(); + + let has_backing = self.backing_file.is_some(); + let mappings = self + .metadata + .map_clusters_for_read(address, total_len, has_backing) + .map_err(AsyncIoError::ReadVectored)?; + + let mut buf_offset = 0usize; + for mapping in mappings { + match mapping { + ClusterReadMapping::Zero { length } => { + // SAFETY: iovecs point to valid guest memory buffers + unsafe { zero_fill_iovecs(iovecs, buf_offset, length as usize) }; + buf_offset += length as usize; + } + ClusterReadMapping::Allocated { + offset: host_offset, + length, + } => { + let mut buf = vec![0u8; length as usize]; + pread_exact(self.data_file.as_raw_fd(), &mut buf, host_offset) + .map_err(AsyncIoError::ReadVectored)?; + // SAFETY: iovecs point to valid guest memory buffers + unsafe { scatter_to_iovecs(iovecs, buf_offset, &buf) }; + buf_offset += length as usize; + } + ClusterReadMapping::Compressed { data } => { + let len = data.len(); + // SAFETY: iovecs point to valid guest memory buffers + unsafe { scatter_to_iovecs(iovecs, buf_offset, &data) }; + buf_offset += len; + } + ClusterReadMapping::Backing { + offset: backing_offset, + length, + } => { + let mut buf = vec![0u8; length as usize]; + self.backing_file + .as_ref() + .unwrap() + .read_at(backing_offset, &mut buf) + .map_err(AsyncIoError::ReadVectored)?; + // SAFETY: iovecs point to valid guest memory buffers + unsafe { 
scatter_to_iovecs(iovecs, buf_offset, &buf) }; + buf_offset += length as usize; + } + } + } + + self.completion_list + .push_back((user_data, total_len as i32)); + self.eventfd.write(1).unwrap(); + Ok(()) } fn write_vectored( @@ -136,21 +503,65 @@ impl AsyncIo for QcowSync { iovecs: &[libc::iovec], user_data: u64, ) -> AsyncIoResult<()> { - self.qcow_file.lock().unwrap().write_vectored_sync( - offset, - iovecs, - user_data, - &self.eventfd, - &mut self.completion_list, - ) + let address = offset as u64; + let total_len: usize = iovecs.iter().map(|v| v.iov_len).sum(); + let mut buf_offset = 0usize; + + while buf_offset < total_len { + let curr_addr = address + buf_offset as u64; + let cluster_size = self.metadata.cluster_size(); + let intra_offset = self.metadata.cluster_offset(curr_addr); + let remaining_in_cluster = (cluster_size - intra_offset) as usize; + let count = min(total_len - buf_offset, remaining_in_cluster); + + // Read backing data for COW if this is a partial cluster + // write to an unallocated cluster with a backing file. 
+ let backing_data = if let Some(backing) = self + .backing_file + .as_ref() + .filter(|_| intra_offset != 0 || count < cluster_size as usize) + { + let cluster_begin = curr_addr - intra_offset; + let mut data = vec![0u8; cluster_size as usize]; + backing + .read_at(cluster_begin, &mut data) + .map_err(AsyncIoError::WriteVectored)?; + Some(data) + } else { + None + }; + + let mapping = self + .metadata + .map_cluster_for_write(curr_addr, backing_data) + .map_err(AsyncIoError::WriteVectored)?; + + match mapping { + ClusterWriteMapping::Allocated { + offset: host_offset, + } => { + // SAFETY: iovecs point to valid guest memory buffers + let buf = unsafe { gather_from_iovecs(iovecs, buf_offset, count) }; + pwrite_all(self.data_file.as_raw_fd(), &buf, host_offset) + .map_err(AsyncIoError::WriteVectored)?; + } + } + buf_offset += count; + } + + self.completion_list + .push_back((user_data, total_len as i32)); + self.eventfd.write(1).unwrap(); + Ok(()) } fn fsync(&mut self, user_data: Option) -> AsyncIoResult<()> { - self.qcow_file.lock().unwrap().fsync_sync( - user_data, - &self.eventfd, - &mut self.completion_list, - ) + self.metadata.flush().map_err(AsyncIoError::Fsync)?; + if let Some(user_data) = user_data { + self.completion_list.push_back((user_data, 0)); + self.eventfd.write(1).unwrap(); + } + Ok(()) } fn next_completed_request(&mut self) -> Option<(u64, i32)> { @@ -158,26 +569,49 @@ impl AsyncIo for QcowSync { } fn punch_hole(&mut self, offset: u64, length: u64, user_data: u64) -> AsyncIoResult<()> { - // For QCOW2, punch_hole calls deallocate_cluster + let virtual_size = self.metadata.virtual_size(); + let cluster_size = self.metadata.cluster_size(); + let result = self - .qcow_file - .lock() - .unwrap() - .punch_hole(offset, length) - .map(|_| 0i32) + .metadata + .deallocate_bytes( + offset, + length as usize, + self.sparse, + virtual_size, + cluster_size, + self.backing_file.as_deref(), + ) .map_err(AsyncIoError::PunchHole); match result { - Ok(res) => { - 
self.completion_list.push_back((user_data, res)); + Ok(actions) => { + for action in actions { + match action { + DeallocAction::PunchHole { + host_offset, + length, + } => { + let _ = self.data_file.file_mut().punch_hole(host_offset, length); + } + DeallocAction::WriteZeroes { + host_offset, + length, + } => { + let _ = self + .data_file + .file_mut() + .write_zeroes_at(host_offset, length); + } + } + } + self.completion_list.push_back((user_data, 0)); self.eventfd.write(1).unwrap(); Ok(()) } Err(e) => { - // CRITICAL: Always signal completion even on error to avoid hangs - let errno = if let AsyncIoError::PunchHole(io_err) = &e { - let err = io_err.raw_os_error().unwrap_or(libc::EIO); - -err + let errno = if let AsyncIoError::PunchHole(ref io_err) = e { + -io_err.raw_os_error().unwrap_or(libc::EIO) } else { -libc::EIO }; @@ -189,85 +623,90 @@ impl AsyncIo for QcowSync { } fn write_zeroes(&mut self, offset: u64, length: u64, user_data: u64) -> AsyncIoResult<()> { - // For QCOW2, write_zeroes is implemented by deallocating clusters via punch_hole. - // This is more efficient than writing actual zeros and reduces disk usage. + // For QCOW2 write_zeroes uses cluster deallocation, same as punch_hole. // Unallocated clusters inherently read as zero in the QCOW2 format. 
- let result = self - .qcow_file - .lock() - .unwrap() - .punch_hole(offset, length) - .map(|_| 0i32) - .map_err(AsyncIoError::WriteZeroes); - - match result { - Ok(res) => { - self.completion_list.push_back((user_data, res)); - self.eventfd.write(1).unwrap(); - Ok(()) - } - Err(e) => { - // Always signal completion even on error to avoid hangs - let errno = if let AsyncIoError::WriteZeroes(io_err) = &e { - let err = io_err.raw_os_error().unwrap_or(libc::EIO); - -err - } else { - -libc::EIO - }; - self.completion_list.push_back((user_data, errno)); - self.eventfd.write(1).unwrap(); - Ok(()) - } - } + self.punch_hole(offset, length, user_data) } } #[cfg(test)] mod unit_tests { - use std::io::{Read, Seek, SeekFrom, Write}; + use std::io::{Seek, SeekFrom, Write}; use vmm_sys_util::tempfile::TempFile; use super::*; - use crate::qcow::{QcowFile, QcowHeader, RawFile}; + use crate::async_io::DiskFile; + use crate::qcow::{QcowFile, RawFile}; + + fn create_disk_with_data( + file_size: u64, + data: &[u8], + offset: u64, + sparse: bool, + ) -> (TempFile, QcowDiskSync) { + let temp_file = TempFile::new().unwrap(); + { + let raw_file = RawFile::new(temp_file.as_file().try_clone().unwrap(), false); + let mut qcow_file = QcowFile::new(raw_file, 3, file_size, sparse).unwrap(); + qcow_file.seek(SeekFrom::Start(offset)).unwrap(); + qcow_file.write_all(data).unwrap(); + qcow_file.flush().unwrap(); + } + let disk = QcowDiskSync::new( + temp_file.as_file().try_clone().unwrap(), + false, + false, + sparse, + ) + .unwrap(); + (temp_file, disk) + } + + fn async_read(disk: &QcowDiskSync, offset: u64, len: usize) -> Vec { + let mut async_io = disk.new_async_io(1).unwrap(); + let mut buf = vec![0xFFu8; len]; + let iovec = libc::iovec { + iov_base: buf.as_mut_ptr() as *mut libc::c_void, + iov_len: buf.len(), + }; + async_io + .read_vectored(offset as libc::off_t, &[iovec], 1) + .unwrap(); + let (user_data, result) = async_io.next_completed_request().unwrap(); + assert_eq!(user_data, 1); + 
assert_eq!(result as usize, len, "read should return requested length"); + buf + } + + fn async_write(disk: &QcowDiskSync, offset: u64, data: &[u8]) { + let mut async_io = disk.new_async_io(1).unwrap(); + let iovec = libc::iovec { + iov_base: data.as_ptr() as *mut libc::c_void, + iov_len: data.len(), + }; + async_io + .write_vectored(offset as libc::off_t, &[iovec], 1) + .unwrap(); + let (user_data, result) = async_io.next_completed_request().unwrap(); + assert_eq!(user_data, 1); + assert_eq!(result as usize, data.len()); + } #[test] fn test_qcow_async_punch_hole_completion() { - // Create a QCOW2 image with valid header - let temp_file = TempFile::new().unwrap(); - let raw_file = RawFile::new(temp_file.into_file(), false); - let file_size = 1024 * 1024 * 100; // 100MB - let mut qcow_file = QcowFile::new(raw_file, 3, file_size, true).unwrap(); - - // Write some data - let data = vec![0xDD; 128 * 1024]; // 128KB - let offset = 0; - qcow_file.seek(SeekFrom::Start(offset)).unwrap(); - qcow_file.write_all(&data).unwrap(); - qcow_file.flush().unwrap(); - - // Create async wrapper - let qcow_file = Arc::new(Mutex::new(qcow_file)); - let mut async_qcow = QcowSync::new(qcow_file.clone()); - - // Punch hole - async_qcow - .punch_hole(offset, data.len() as u64, 100) - .unwrap(); + let data = vec![0xDD; 128 * 1024]; + let offset = 0u64; + let (_temp, disk) = create_disk_with_data(100 * 1024 * 1024, &data, offset, true); - // Verify completion event was generated - let (user_data, result) = async_qcow.next_completed_request().unwrap(); + let mut async_io = disk.new_async_io(1).unwrap(); + async_io.punch_hole(offset, data.len() as u64, 100).unwrap(); + let (user_data, result) = async_io.next_completed_request().unwrap(); assert_eq!(user_data, 100); assert_eq!(result, 0, "punch_hole should succeed"); + drop(async_io); - // Verify data reads as zeros - let mut read_buf = vec![0; data.len()]; - qcow_file - .lock() - .unwrap() - .seek(SeekFrom::Start(offset)) - .unwrap(); - 
qcow_file.lock().unwrap().read_exact(&mut read_buf).unwrap(); + let read_buf = async_read(&disk, offset, data.len()); assert!( read_buf.iter().all(|&b| b == 0), "Punched hole should read as zeros" @@ -276,41 +715,20 @@ mod unit_tests { #[test] fn test_qcow_async_write_zeroes_completion() { - // Create a QCOW2 image with valid header - let temp_file = TempFile::new().unwrap(); - let raw_file = RawFile::new(temp_file.into_file(), false); - let file_size = 1024 * 1024 * 100; // 100MB - let mut qcow_file = QcowFile::new(raw_file, 3, file_size, true).unwrap(); - - // Write some data - let data = vec![0xEE; 256 * 1024]; // 256KB - let offset = 64 * 1024; // Start at 64KB offset - qcow_file.seek(SeekFrom::Start(offset)).unwrap(); - qcow_file.write_all(&data).unwrap(); - qcow_file.flush().unwrap(); - - // Create async wrapper - let qcow_file = Arc::new(Mutex::new(qcow_file)); - let mut async_qcow = QcowSync::new(qcow_file.clone()); - - // Write zeros - async_qcow + let data = vec![0xEE; 256 * 1024]; + let offset = 64 * 1024u64; + let (_temp, disk) = create_disk_with_data(100 * 1024 * 1024, &data, offset, true); + + let mut async_io = disk.new_async_io(1).unwrap(); + async_io .write_zeroes(offset, data.len() as u64, 200) .unwrap(); - - // Verify completion event was generated - let (user_data, result) = async_qcow.next_completed_request().unwrap(); + let (user_data, result) = async_io.next_completed_request().unwrap(); assert_eq!(user_data, 200); assert_eq!(result, 0, "write_zeroes should succeed"); + drop(async_io); - // Verify data reads as zeros - let mut read_buf = vec![0; data.len()]; - qcow_file - .lock() - .unwrap() - .seek(SeekFrom::Start(offset)) - .unwrap(); - qcow_file.lock().unwrap().read_exact(&mut read_buf).unwrap(); + let read_buf = async_read(&disk, offset, data.len()); assert!( read_buf.iter().all(|&b| b == 0), "Zeroed region should read as zeros" @@ -319,186 +737,139 @@ mod unit_tests { #[test] fn test_qcow_async_multiple_operations() { - // Create a QCOW2 
image with valid header - let temp_file = TempFile::new().unwrap(); - let raw_file = RawFile::new(temp_file.into_file(), false); - let file_size = 1024 * 1024 * 100; // 100MB - let mut qcow_file = QcowFile::new(raw_file, 3, file_size, true).unwrap(); - - // Write data at multiple offsets - let data = vec![0xFF; 64 * 1024]; // 64KB chunks - for i in 0..4 { - let offset = i * 128 * 1024; // 128KB spacing - qcow_file.seek(SeekFrom::Start(offset)).unwrap(); - qcow_file.write_all(&data).unwrap(); - } - qcow_file.flush().unwrap(); - - // Create async wrapper - let qcow_file = Arc::new(Mutex::new(qcow_file)); - let mut async_qcow = QcowSync::new(qcow_file.clone()); - - // Queue multiple punch_hole operations - async_qcow.punch_hole(0, 64 * 1024, 1).unwrap(); - async_qcow.punch_hole(128 * 1024, 64 * 1024, 2).unwrap(); - async_qcow.punch_hole(256 * 1024, 64 * 1024, 3).unwrap(); - - // Verify all completions - let (user_data, result) = async_qcow.next_completed_request().unwrap(); - assert_eq!(user_data, 1); - assert_eq!(result, 0); + let data = vec![0xFF; 64 * 1024]; + let (_temp, _) = create_disk_with_data(100 * 1024 * 1024, &[], 0, true); - let (user_data, result) = async_qcow.next_completed_request().unwrap(); - assert_eq!(user_data, 2); - assert_eq!(result, 0); - - let (user_data, result) = async_qcow.next_completed_request().unwrap(); - assert_eq!(user_data, 3); - assert_eq!(result, 0); + // Write data at multiple offsets via QcowFile first, then punch + { + let temp_file = _temp.as_file().try_clone().unwrap(); + let raw_file = RawFile::new(temp_file, false); + let mut qcow_file = QcowFile::from(raw_file).unwrap(); + for i in 0..4u64 { + let off = i * 128 * 1024; + qcow_file.seek(SeekFrom::Start(off)).unwrap(); + qcow_file.write_all(&data).unwrap(); + } + qcow_file.flush().unwrap(); + } - // Verify no more completions - assert!(async_qcow.next_completed_request().is_none()); + let disk = + QcowDiskSync::new(_temp.as_file().try_clone().unwrap(), false, false, 
true).unwrap(); + + let mut async_io = disk.new_async_io(1).unwrap(); + + async_io.punch_hole(0, 64 * 1024, 1).unwrap(); + async_io.punch_hole(128 * 1024, 64 * 1024, 2).unwrap(); + async_io.punch_hole(256 * 1024, 64 * 1024, 3).unwrap(); + + let (ud, res) = async_io.next_completed_request().unwrap(); + assert_eq!(ud, 1); + assert_eq!(res, 0); + let (ud, res) = async_io.next_completed_request().unwrap(); + assert_eq!(ud, 2); + assert_eq!(res, 0); + let (ud, res) = async_io.next_completed_request().unwrap(); + assert_eq!(ud, 3); + assert_eq!(res, 0); + assert!(async_io.next_completed_request().is_none()); } #[test] - fn test_qcow_punch_hole_with_shared_instance() { - // This test verifies that with Arc>, multiple async I/O operations - // share the same QcowFile instance and see each other's changes. + fn test_qcow_punch_hole_then_read() { + // Verify that after punch_hole, a second async_io sees zeros. + let data = vec![0xAB; 128 * 1024]; + let offset = 0u64; + let (_temp, disk) = create_disk_with_data(100 * 1024 * 1024, &data, offset, true); - // Create a QCOW2 image - let temp_file = TempFile::new().unwrap(); - let raw_file = RawFile::new(temp_file.into_file(), false); - let file_size = 1024 * 1024 * 100; // 100MB - let mut qcow_file = QcowFile::new(raw_file, 3, file_size, true).unwrap(); - - // Write some data at offset 0 - let data = vec![0xAB; 128 * 1024]; // 128KB of 0xAB pattern - let offset = 0; - qcow_file.seek(SeekFrom::Start(offset)).unwrap(); - qcow_file.write_all(&data).unwrap(); - qcow_file.flush().unwrap(); - - let qcow_shared = Arc::new(Mutex::new(qcow_file)); - - // First async I/O: punch hole - let mut async_qcow1 = QcowSync::new(qcow_shared.clone()); - async_qcow1 + let mut async_io1 = disk.new_async_io(1).unwrap(); + async_io1 .punch_hole(offset, data.len() as u64, 100) .unwrap(); - - // Verify punch_hole completed - let (user_data, result) = async_qcow1.next_completed_request().unwrap(); + let (user_data, result) = 
async_io1.next_completed_request().unwrap(); assert_eq!(user_data, 100); - assert_eq!(result, 0, "punch_hole should succeed"); - - // Second async I/O: read from same shared instance - // This should see the deallocated cluster because they share the same QcowFile - let mut read_buf = vec![0xFF; data.len()]; - qcow_shared - .lock() - .unwrap() - .seek(SeekFrom::Start(offset)) - .unwrap(); - qcow_shared - .lock() - .unwrap() - .read_exact(&mut read_buf) - .unwrap(); + assert_eq!(result, 0); + drop(async_io1); - // The read should return zeros because the cluster was deallocated + // Read via second async_io, should see zeros + let read_buf = async_read(&disk, offset, data.len()); assert!( read_buf.iter().all(|&b| b == 0), - "After punch_hole, shared QcowFile instance should read zeros from deallocated cluster" + "After punch_hole, read should return zeros" ); } #[test] fn test_qcow_disk_sync_punch_hole_with_new_async_io() { - // This test simulates the EXACT real usage pattern: QcowDiskSync.new_async_io() - // creates a new QcowSync with a cloned QcowFile for each I/O operation. - - use std::io::Write; - - use crate::async_io::DiskFile; - - // Create a QCOW2 image - let temp_file = TempFile::new().unwrap(); - let file_size = 1024 * 1024 * 100; // 100MB - - { - let raw_file = RawFile::new(temp_file.as_file().try_clone().unwrap(), false); - let mut qcow_file = QcowFile::new(raw_file, 3, file_size, true).unwrap(); - - // Write data at offset 1MB - use single cluster (64KB) to simplify test - let data = vec![0xCD; 64 * 1024]; // 64KB (one cluster) - let offset = 1024 * 1024u64; - qcow_file.seek(SeekFrom::Start(offset)).unwrap(); - qcow_file.write_all(&data).unwrap(); - qcow_file.flush().unwrap(); - } - - // Open with QcowDiskSync (like real code does) - let disk = - QcowDiskSync::new(temp_file.as_file().try_clone().unwrap(), false, true, true).unwrap(); + // Simulates the real usage pattern of write data, punch hole, then read back. 
+ let data = vec![0xCD; 64 * 1024]; // one cluster + let offset = 1024 * 1024u64; // 1MB offset + let (_temp, disk) = create_disk_with_data(100 * 1024 * 1024, &data, offset, true); - // First async I/O: punch hole (simulates DISCARD command) + // Punch hole to simulate DISCARD let mut async_io1 = disk.new_async_io(1).unwrap(); - let offset = 1024 * 1024u64; - let length = 64 * 1024u64; // Single cluster - async_io1.punch_hole(offset, length, 1).unwrap(); + async_io1.punch_hole(offset, data.len() as u64, 1).unwrap(); let (user_data, result) = async_io1.next_completed_request().unwrap(); assert_eq!(user_data, 1); assert_eq!(result, 0, "punch_hole should succeed"); drop(async_io1); - // Second async I/O: read from the same location (simulates READ command) - let mut async_io2 = disk.new_async_io(1).unwrap(); - let mut read_buf = vec![0xFF; length as usize]; - let iovec = libc::iovec { - iov_base: read_buf.as_mut_ptr() as *mut libc::c_void, - iov_len: read_buf.len(), - }; + // Read from the same location to verify + let read_buf = async_read(&disk, offset, data.len()); + assert!( + read_buf.iter().all(|&b| b == 0), + "After punch_hole via new_async_io, read should return zeros" + ); + } - // These assertions are critical to prevent compiler optimization bugs - // that can reorder operations. Without them, the test can fail even - // though the QCOW2 implementation is correct. 
- assert_eq!(iovec.iov_base as *const u8, read_buf.as_ptr()); - assert_eq!(iovec.iov_len, read_buf.len()); + #[test] + fn test_qcow_async_read_write_roundtrip() { + let (_temp, disk) = create_disk_with_data(100 * 1024 * 1024, &[], 0, true); - async_io2 - .read_vectored(offset as libc::off_t, &[iovec], 2) - .unwrap(); + let data = vec![0x42u8; 64 * 1024]; + let offset = 0u64; - let (user_data, result) = async_io2.next_completed_request().unwrap(); - assert_eq!(user_data, 2); - assert_eq!( - result as usize, length as usize, - "read should complete successfully" - ); + async_write(&disk, offset, &data); + + let mut async_io = disk.new_async_io(1).unwrap(); + async_io.fsync(Some(10)).unwrap(); + let (ud, res) = async_io.next_completed_request().unwrap(); + assert_eq!(ud, 10); + assert_eq!(res, 0); + drop(async_io); - // Verify the data is all zeros + let read_buf = async_read(&disk, offset, data.len()); + assert_eq!(read_buf, data, "Read-back should match written data"); + } + + #[test] + fn test_qcow_async_read_unallocated() { + // Reading from an unallocated region should return zeros. 
+ let (_temp, disk) = create_disk_with_data(100 * 1024 * 1024, &[], 0, true); + let read_buf = async_read(&disk, 0, 64 * 1024); assert!( read_buf.iter().all(|&b| b == 0), - "After punch_hole via new_async_io, read should return zeros" + "Unallocated region should read as zeros" ); } #[test] - fn backing_files_disabled_error() { - let header = - QcowHeader::create_for_size_and_path(3, 0x10_0000, Some("/path/to/backing/file")) - .expect("Failed to create header."); - let temp_file = TempFile::new().unwrap(); - let mut raw_file = RawFile::new(temp_file.as_file().try_clone().unwrap(), false); - header - .write_to(&mut raw_file) - .expect("Failed to write header."); - - let file = temp_file.into_file(); - match QcowDiskSync::new(file, false, false, true) { - Err(QcowError::BackingFilesDisabled) => {} - Err(other) => panic!("Expected BackingFilesDisabled, got: {other:?}"), - Ok(_) => panic!("Expected BackingFilesDisabled error, but succeeded"), - } + fn test_qcow_async_cross_cluster_read_write() { + let (_temp, disk) = create_disk_with_data(100 * 1024 * 1024, &[], 0, true); + + // Default cluster size is 64KB. Write 96KB starting at 32KB to cross the boundary. + let data: Vec = (0..96 * 1024).map(|i| (i % 251) as u8).collect(); + let offset = 32 * 1024u64; + + async_write(&disk, offset, &data); + + let mut async_io = disk.new_async_io(1).unwrap(); + async_io.fsync(Some(99)).unwrap(); + drop(async_io); + + let read_buf = async_read(&disk, offset, data.len()); + assert_eq!( + read_buf, data, + "Cross-cluster read should match written data" + ); } } From fd6891db626024a5d3237e129530d80730ab5ae4 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Sun, 15 Feb 2026 00:56:00 +0100 Subject: [PATCH 090/742] block: qcow: Extend unit tests Add tests for multiqueue concurrent reads, raw and QCOW2 backing files, three layer backing chains, COW on partial cluster writes, discard with backing fallthrough, cross cluster boundary operations, reads beyond virtual size, and resize. 
Signed-off-by: Anatol Belski --- block/src/qcow_sync.rs | 836 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 834 insertions(+), 2 deletions(-) diff --git a/block/src/qcow_sync.rs b/block/src/qcow_sync.rs index 22d361adad..0aeaaef44d 100644 --- a/block/src/qcow_sync.rs +++ b/block/src/qcow_sync.rs @@ -632,12 +632,13 @@ impl AsyncIo for QcowSync { #[cfg(test)] mod unit_tests { use std::io::{Seek, SeekFrom, Write}; + use std::thread; use vmm_sys_util::tempfile::TempFile; use super::*; use crate::async_io::DiskFile; - use crate::qcow::{QcowFile, RawFile}; + use crate::qcow::{BackingFileConfig, ImageType, QcowFile, RawFile}; fn create_disk_with_data( file_size: u64, @@ -869,7 +870,838 @@ mod unit_tests { let read_buf = async_read(&disk, offset, data.len()); assert_eq!( read_buf, data, - "Cross-cluster read should match written data" + "Cross cluster read should match written data" + ); + } + + #[test] + fn test_backing_file_read() { + let backing_temp = TempFile::new().unwrap(); + let cluster_size = 1u64 << 16; + let file_size = cluster_size * 4; + let pattern: Vec = (0..file_size as usize).map(|i| (i % 251) as u8).collect(); + backing_temp.as_file().write_all(&pattern).unwrap(); + backing_temp.as_file().sync_all().unwrap(); + let backing_path = backing_temp.as_path().to_str().unwrap().to_string(); + + let overlay_temp = TempFile::new().unwrap(); + { + let raw = RawFile::new(overlay_temp.as_file().try_clone().unwrap(), false); + let backing_config = BackingFileConfig { + path: backing_path, + format: Some(ImageType::Raw), + }; + let _overlay = + QcowFile::new_from_backing(raw, 3, file_size, &backing_config, true).unwrap(); + } + + let file = overlay_temp.as_file().try_clone().unwrap(); + let disk = QcowDiskSync::new(file, false, true, true).unwrap(); + + // Read first cluster - should come from backing file + let buf = async_read(&disk, 0, cluster_size as usize); + assert_eq!( + &buf[..], + &pattern[..cluster_size as usize], + "First cluster should match 
backing file data" + ); + + let buf = async_read(&disk, cluster_size, cluster_size as usize); + assert_eq!( + &buf[..], + &pattern[cluster_size as usize..2 * cluster_size as usize], + "Second cluster should match backing file data" + ); + + // Read a partial range spanning cluster boundary + let mid = cluster_size - 512; + let len = 1024usize; + let buf = async_read(&disk, mid, len); + assert_eq!( + &buf[..], + &pattern[mid as usize..mid as usize + len], + "Cross cluster read from backing should match" + ); + + let buf = async_read(&disk, 0, file_size as usize); + assert_eq!( + &buf[..], + &pattern[..], + "Full file read from backing should match" + ); + } + + #[test] + fn test_backing_file_read_qcow2_backing() { + let backing_temp = TempFile::new().unwrap(); + let cluster_size = 1u64 << 16; + let file_size = cluster_size * 4; + let pattern: Vec = (0..file_size as usize).map(|i| (i % 251) as u8).collect(); + { + let raw = RawFile::new(backing_temp.as_file().try_clone().unwrap(), false); + let mut qcow = QcowFile::new(raw, 3, file_size, true).unwrap(); + qcow.seek(SeekFrom::Start(0)).unwrap(); + qcow.write_all(&pattern).unwrap(); + qcow.flush().unwrap(); + } + let backing_path = backing_temp.as_path().to_str().unwrap().to_string(); + + let overlay_temp = TempFile::new().unwrap(); + { + let raw = RawFile::new(overlay_temp.as_file().try_clone().unwrap(), false); + let backing_config = BackingFileConfig { + path: backing_path, + format: Some(ImageType::Qcow2), + }; + let _overlay = + QcowFile::new_from_backing(raw, 3, file_size, &backing_config, true).unwrap(); + } + + let file = overlay_temp.as_file().try_clone().unwrap(); + let disk = QcowDiskSync::new(file, false, true, true).unwrap(); + + // Read first cluster - should come from QCOW2 backing + let buf = async_read(&disk, 0, cluster_size as usize); + assert_eq!( + &buf[..], + &pattern[..cluster_size as usize], + "First cluster from QCOW2 backing should match" + ); + + let buf = async_read(&disk, 0, file_size as 
usize); + assert_eq!( + &buf[..], + &pattern[..], + "Full file from QCOW2 backing should match" + ); + + // Write to first cluster, then verify second cluster still reads from backing + let new_data = vec![0xAB; cluster_size as usize]; + async_write(&disk, 0, &new_data); + { + let mut async_io = disk.new_async_io(1).unwrap(); + async_io.fsync(Some(99)).unwrap(); + } + + let buf = async_read(&disk, 0, cluster_size as usize); + assert_eq!( + &buf[..], + &new_data[..], + "Written cluster should be new data" + ); + + let buf = async_read(&disk, cluster_size, cluster_size as usize); + assert_eq!( + &buf[..], + &pattern[cluster_size as usize..2 * cluster_size as usize], + "Unwritten cluster should still come from backing" + ); + } + + #[test] + fn test_multi_queue_concurrent_reads() { + // Verify that multiple queues (threads) can read simultaneously. + // This exercises the RwLock + pread64 design: concurrent L2 cache hits + // proceed in parallel and data reads are position independent. + let cluster_size = 1u64 << 16; + let file_size = cluster_size * 16; + let pattern: Vec = (0..file_size as usize).map(|i| (i % 251) as u8).collect(); + let (_temp, disk) = create_disk_with_data(file_size, &pattern, 0, true); + let disk = Arc::new(disk); + + let threads: Vec<_> = (0..8) + .map(|t| { + let disk = Arc::clone(&disk); + let pattern = pattern.clone(); + thread::spawn(move || { + for i in 0..16u64 { + // Each thread reads clusters in a different order + let cluster_idx = (i + t * 2) % 16; + let offset = cluster_idx * cluster_size; + let buf = async_read(&disk, offset, cluster_size as usize); + assert_eq!( + &buf[..], + &pattern[offset as usize..(offset + cluster_size) as usize], + "Thread {t} cluster {cluster_idx} mismatch" + ); + } + }) + }) + .collect(); + + for t in threads { + t.join().unwrap(); + } + } + + #[test] + fn test_multi_queue_concurrent_reads_qcow2_backing() { + // Same as above but reads go through a Qcow2MetadataBacking, + // exercising concurrent metadata 
resolution + pread64 in the backing. + let backing_temp = TempFile::new().unwrap(); + let cluster_size = 1u64 << 16; + let file_size = cluster_size * 16; + let pattern: Vec = (0..file_size as usize).map(|i| (i % 251) as u8).collect(); + { + let raw = RawFile::new(backing_temp.as_file().try_clone().unwrap(), false); + let mut qcow = QcowFile::new(raw, 3, file_size, true).unwrap(); + qcow.seek(SeekFrom::Start(0)).unwrap(); + qcow.write_all(&pattern).unwrap(); + qcow.flush().unwrap(); + } + let backing_path = backing_temp.as_path().to_str().unwrap().to_string(); + + let overlay_temp = TempFile::new().unwrap(); + { + let raw = RawFile::new(overlay_temp.as_file().try_clone().unwrap(), false); + let backing_config = BackingFileConfig { + path: backing_path, + format: Some(ImageType::Qcow2), + }; + let _overlay = + QcowFile::new_from_backing(raw, 3, file_size, &backing_config, true).unwrap(); + } + + let file = overlay_temp.as_file().try_clone().unwrap(); + let disk = Arc::new(QcowDiskSync::new(file, false, true, true).unwrap()); + + let threads: Vec<_> = (0..8) + .map(|t| { + let disk = Arc::clone(&disk); + let pattern = pattern.clone(); + thread::spawn(move || { + for i in 0..16u64 { + let cluster_idx = (i + t * 2) % 16; + let offset = cluster_idx * cluster_size; + let buf = async_read(&disk, offset, cluster_size as usize); + assert_eq!( + &buf[..], + &pattern[offset as usize..(offset + cluster_size) as usize], + "Thread {t} cluster {cluster_idx} mismatch (qcow2 backing)" + ); + } + }) + }) + .collect(); + + for t in threads { + t.join().unwrap(); + } + } + + #[test] + fn test_three_layer_backing_chain() { + // raw base -> qcow2 mid -> qcow2 overlay + // Tests recursive shared_backing_from() with nested backing. 
+ let cluster_size = 1u64 << 16; + let file_size = cluster_size * 4; + let base_pattern: Vec = (0..file_size as usize).map(|i| (i % 251) as u8).collect(); + + // Layer 0: raw base + let base_temp = TempFile::new().unwrap(); + base_temp.as_file().write_all(&base_pattern).unwrap(); + base_temp.as_file().sync_all().unwrap(); + let base_path = base_temp.as_path().to_str().unwrap().to_string(); + + // Layer 1: qcow2 mid pointing at raw base, write to cluster 0 only + let mid_temp = TempFile::new().unwrap(); + let mid_pattern = vec![0xBBu8; cluster_size as usize]; + { + let raw = RawFile::new(mid_temp.as_file().try_clone().unwrap(), false); + let backing_config = BackingFileConfig { + path: base_path, + format: Some(ImageType::Raw), + }; + let mut mid = + QcowFile::new_from_backing(raw, 3, file_size, &backing_config, true).unwrap(); + mid.seek(SeekFrom::Start(0)).unwrap(); + mid.write_all(&mid_pattern).unwrap(); + mid.flush().unwrap(); + } + let mid_path = mid_temp.as_path().to_str().unwrap().to_string(); + + // Layer 2: qcow2 overlay pointing at qcow2 mid, write to cluster 1 only + let overlay_temp = TempFile::new().unwrap(); + let overlay_pattern = vec![0xCCu8; cluster_size as usize]; + { + let raw = RawFile::new(overlay_temp.as_file().try_clone().unwrap(), false); + let backing_config = BackingFileConfig { + path: mid_path, + format: Some(ImageType::Qcow2), + }; + let mut overlay = + QcowFile::new_from_backing(raw, 3, file_size, &backing_config, true).unwrap(); + overlay.seek(SeekFrom::Start(cluster_size)).unwrap(); + overlay.write_all(&overlay_pattern).unwrap(); + overlay.flush().unwrap(); + } + + let file = overlay_temp.as_file().try_clone().unwrap(); + let disk = QcowDiskSync::new(file, false, true, true).unwrap(); + + // Cluster 0: mid wrote 0xBB + let buf = async_read(&disk, 0, cluster_size as usize); + assert!( + buf.iter().all(|&b| b == 0xBB), + "Cluster 0 should come from mid layer" + ); + + // Cluster 1: overlay wrote 0xCC + let buf = async_read(&disk, 
cluster_size, cluster_size as usize); + assert!( + buf.iter().all(|&b| b == 0xCC), + "Cluster 1 should come from overlay" + ); + + // Cluster 2: falls through mid (unwritten) to raw base + let buf = async_read(&disk, cluster_size * 2, cluster_size as usize); + let expected_start = (cluster_size * 2) as usize; + assert_eq!( + &buf[..], + &base_pattern[expected_start..expected_start + cluster_size as usize], + "Cluster 2 should come from raw base" + ); + + // Cluster 3: also falls through to raw base + let buf = async_read(&disk, cluster_size * 3, cluster_size as usize); + let expected_start = (cluster_size * 3) as usize; + assert_eq!( + &buf[..], + &base_pattern[expected_start..expected_start + cluster_size as usize], + "Cluster 3 should come from raw base" + ); + } + + #[test] + fn test_backing_cow_preserves_all_unwritten_clusters() { + // Write to specific clusters in the overlay, verify all others still + // read from the qcow2 backing correctly. + let cluster_size = 1u64 << 16; + let num_clusters = 8u64; + let file_size = cluster_size * num_clusters; + let pattern: Vec = (0..file_size as usize).map(|i| (i % 251) as u8).collect(); + + let backing_temp = TempFile::new().unwrap(); + { + let raw = RawFile::new(backing_temp.as_file().try_clone().unwrap(), false); + let mut qcow = QcowFile::new(raw, 3, file_size, true).unwrap(); + qcow.seek(SeekFrom::Start(0)).unwrap(); + qcow.write_all(&pattern).unwrap(); + qcow.flush().unwrap(); + } + let backing_path = backing_temp.as_path().to_str().unwrap().to_string(); + + let overlay_temp = TempFile::new().unwrap(); + { + let raw = RawFile::new(overlay_temp.as_file().try_clone().unwrap(), false); + let backing_config = BackingFileConfig { + path: backing_path, + format: Some(ImageType::Qcow2), + }; + let _overlay = + QcowFile::new_from_backing(raw, 3, file_size, &backing_config, true).unwrap(); + } + + let file = overlay_temp.as_file().try_clone().unwrap(); + let disk = QcowDiskSync::new(file, false, true, true).unwrap(); + + 
let written = vec![0xFFu8; cluster_size as usize]; + for &idx in &[0u64, 3, 7] { + async_write(&disk, idx * cluster_size, &written); + } + { + let mut async_io = disk.new_async_io(1).unwrap(); + async_io.fsync(Some(99)).unwrap(); + } + + for &idx in &[0u64, 3, 7] { + let buf = async_read(&disk, idx * cluster_size, cluster_size as usize); + assert!( + buf.iter().all(|&b| b == 0xFF), + "Cluster {idx} should be written data" + ); + } + + // Verify unwritten clusters read from backing + for idx in 0..num_clusters { + if idx == 0 || idx == 3 || idx == 7 { + continue; + } + let offset = idx * cluster_size; + let buf = async_read(&disk, offset, cluster_size as usize); + assert_eq!( + &buf[..], + &pattern[offset as usize..(offset + cluster_size) as usize], + "Cluster {idx} should come from backing" + ); + } + } + + #[test] + fn test_qcow2_backing_read_beyond_virtual_size() { + // Read starting past the backing file virtual_size should return zeros. + let cluster_size = 1u64 << 16; + let backing_size = cluster_size * 2; + let overlay_size = cluster_size * 4; // overlay is larger than backing + + let backing_temp = TempFile::new().unwrap(); + { + let raw = RawFile::new(backing_temp.as_file().try_clone().unwrap(), false); + let mut qcow = QcowFile::new(raw, 3, backing_size, true).unwrap(); + qcow.seek(SeekFrom::Start(0)).unwrap(); + qcow.write_all(&vec![0xAA; backing_size as usize]).unwrap(); + qcow.flush().unwrap(); + } + let backing_path = backing_temp.as_path().to_str().unwrap().to_string(); + + let overlay_temp = TempFile::new().unwrap(); + { + let raw = RawFile::new(overlay_temp.as_file().try_clone().unwrap(), false); + let backing_config = BackingFileConfig { + path: backing_path, + format: Some(ImageType::Qcow2), + }; + let _overlay = + QcowFile::new_from_backing(raw, 3, overlay_size, &backing_config, true).unwrap(); + } + + let file = overlay_temp.as_file().try_clone().unwrap(); + let disk = QcowDiskSync::new(file, false, true, true).unwrap(); + + // Read cluster 2 
(past backing virtual_size) - should be zeros + let buf = async_read(&disk, backing_size, cluster_size as usize); + assert!( + buf.iter().all(|&b| b == 0), + "Read beyond backing virtual_size should return zeros" + ); + } + + #[test] + fn test_qcow2_backing_read_spanning_virtual_size() { + // Read that starts within backing bounds but extends past virtual_size. + // First part should have backing data, remainder should be zeros. + let cluster_size = 1u64 << 16; + let backing_size = cluster_size * 2; + let overlay_size = cluster_size * 4; + + let backing_temp = TempFile::new().unwrap(); + let backing_data = vec![0xBBu8; backing_size as usize]; + { + let raw = RawFile::new(backing_temp.as_file().try_clone().unwrap(), false); + let mut qcow = QcowFile::new(raw, 3, backing_size, true).unwrap(); + qcow.seek(SeekFrom::Start(0)).unwrap(); + qcow.write_all(&backing_data).unwrap(); + qcow.flush().unwrap(); + } + let backing_path = backing_temp.as_path().to_str().unwrap().to_string(); + + let overlay_temp = TempFile::new().unwrap(); + { + let raw = RawFile::new(overlay_temp.as_file().try_clone().unwrap(), false); + let backing_config = BackingFileConfig { + path: backing_path, + format: Some(ImageType::Qcow2), + }; + let _overlay = + QcowFile::new_from_backing(raw, 3, overlay_size, &backing_config, true).unwrap(); + } + + let file = overlay_temp.as_file().try_clone().unwrap(); + let disk = QcowDiskSync::new(file, false, true, true).unwrap(); + + // Read 2 clusters starting at cluster 1 (spans backing boundary) + let read_len = cluster_size as usize * 2; + let buf = async_read(&disk, cluster_size, read_len); + + // First cluster should be backing data + assert!( + buf[..cluster_size as usize].iter().all(|&b| b == 0xBB), + "First half should come from backing" + ); + + // Second cluster is past backing virtual_size - zeros + assert!( + buf[cluster_size as usize..].iter().all(|&b| b == 0), + "Second half should be zeros (past backing virtual_size)" + ); + } + + #[test] + fn 
test_raw_backing_read_beyond_virtual_size() { + // Read past raw backing file virtual_size should return zeros. + let cluster_size = 1u64 << 16; + let backing_size = cluster_size * 2; + let overlay_size = cluster_size * 4; + + let backing_temp = TempFile::new().unwrap(); + let backing_data = vec![0xDD; backing_size as usize]; + backing_temp.as_file().write_all(&backing_data).unwrap(); + backing_temp.as_file().sync_all().unwrap(); + let backing_path = backing_temp.as_path().to_str().unwrap().to_string(); + + let overlay_temp = TempFile::new().unwrap(); + { + let raw = RawFile::new(overlay_temp.as_file().try_clone().unwrap(), false); + let backing_config = BackingFileConfig { + path: backing_path, + format: Some(ImageType::Raw), + }; + let _overlay = + QcowFile::new_from_backing(raw, 3, overlay_size, &backing_config, true).unwrap(); + } + + let file = overlay_temp.as_file().try_clone().unwrap(); + let disk = QcowDiskSync::new(file, false, true, true).unwrap(); + + // Read cluster 2 (past backing size) - should be zeros + let buf = async_read(&disk, backing_size, cluster_size as usize); + assert!( + buf.iter().all(|&b| b == 0), + "Read beyond raw backing virtual_size should return zeros" + ); + + // Read spanning boundary: cluster 1 has data, cluster 2 zeros + let read_len = cluster_size as usize * 2; + let buf = async_read(&disk, cluster_size, read_len); + assert!( + buf[..cluster_size as usize].iter().all(|&b| b == 0xDD), + "First half should come from raw backing" + ); + assert!( + buf[cluster_size as usize..].iter().all(|&b| b == 0), + "Second half should be zeros (past raw backing size)" + ); + } + + #[test] + fn test_qcow2_backing_cross_cluster_read() { + // Read spanning a cluster boundary through qcow2 backing. + // Exercises the read_clusters loop in Qcow2MetadataBacking. 
+ let cluster_size = 1u64 << 16; + let file_size = cluster_size * 4; + let pattern: Vec = (0..file_size as usize).map(|i| (i % 251) as u8).collect(); + + let backing_temp = TempFile::new().unwrap(); + { + let raw = RawFile::new(backing_temp.as_file().try_clone().unwrap(), false); + let mut qcow = QcowFile::new(raw, 3, file_size, true).unwrap(); + qcow.seek(SeekFrom::Start(0)).unwrap(); + qcow.write_all(&pattern).unwrap(); + qcow.flush().unwrap(); + } + let backing_path = backing_temp.as_path().to_str().unwrap().to_string(); + + let overlay_temp = TempFile::new().unwrap(); + { + let raw = RawFile::new(overlay_temp.as_file().try_clone().unwrap(), false); + let backing_config = BackingFileConfig { + path: backing_path, + format: Some(ImageType::Qcow2), + }; + let _overlay = + QcowFile::new_from_backing(raw, 3, file_size, &backing_config, true).unwrap(); + } + + let file = overlay_temp.as_file().try_clone().unwrap(); + let disk = QcowDiskSync::new(file, false, true, true).unwrap(); + + // Read spanning clusters 1-2 boundary: 512 bytes before + 512 after + let mid = cluster_size - 512; + let len = 1024usize; + let buf = async_read(&disk, mid, len); + assert_eq!( + &buf[..], + &pattern[mid as usize..mid as usize + len], + "Cross cluster read through qcow2 backing should match" + ); + + // Read spanning clusters 0-1-2 (3 clusters worth) + let start = cluster_size / 2; + let len = cluster_size as usize * 2; + let buf = async_read(&disk, start, len); + assert_eq!( + &buf[..], + &pattern[start as usize..start as usize + len], + "Multi cluster read through qcow2 backing should match" + ); + } + + #[test] + fn test_punch_hole_with_backing_fallthrough() { + // Write to overlay, then punch hole. After punch, the cluster should + // fall through to backing data (not zeros). 
+ let cluster_size = 1u64 << 16; + let file_size = cluster_size * 4; + let pattern: Vec = (0..file_size as usize).map(|i| (i % 251) as u8).collect(); + + let backing_temp = TempFile::new().unwrap(); + backing_temp.as_file().write_all(&pattern).unwrap(); + backing_temp.as_file().sync_all().unwrap(); + let backing_path = backing_temp.as_path().to_str().unwrap().to_string(); + + let overlay_temp = TempFile::new().unwrap(); + { + let raw = RawFile::new(overlay_temp.as_file().try_clone().unwrap(), false); + let backing_config = BackingFileConfig { + path: backing_path, + format: Some(ImageType::Raw), + }; + let _overlay = + QcowFile::new_from_backing(raw, 3, file_size, &backing_config, true).unwrap(); + } + + let file = overlay_temp.as_file().try_clone().unwrap(); + let disk = QcowDiskSync::new(file, false, true, true).unwrap(); + + let written = vec![0xFFu8; cluster_size as usize]; + async_write(&disk, 0, &written); + { + let mut async_io = disk.new_async_io(1).unwrap(); + async_io.fsync(Some(99)).unwrap(); + } + + let buf = async_read(&disk, 0, cluster_size as usize); + assert!(buf.iter().all(|&b| b == 0xFF), "Should read written data"); + + // Punch hole on cluster 0 - should deallocate and fall through to backing + { + let mut async_io = disk.new_async_io(1).unwrap(); + async_io.punch_hole(0, cluster_size, 42).unwrap(); + let (ud, res) = async_io.next_completed_request().unwrap(); + assert_eq!(ud, 42); + assert_eq!(res, 0); + } + + // Now read should return backing data, not zeros + let buf = async_read(&disk, 0, cluster_size as usize); + assert_eq!( + &buf[..], + &pattern[..cluster_size as usize], + "After punch_hole with backing, should read backing data" + ); + + // Cluster 1 should still be backing data throughout + let buf = async_read(&disk, cluster_size, cluster_size as usize); + assert_eq!( + &buf[..], + &pattern[cluster_size as usize..2 * cluster_size as usize], + "Untouched cluster should read from backing" + ); + } + + #[test] + fn 
test_rewrite_allocated_cluster() { + // Write to a cluster, then overwrite it. The second write should hit + // the already allocated path in map_write (no new cluster allocation). + let (_temp, disk) = create_disk_with_data(100 * 1024 * 1024, &[], 0, true); + let cluster_size = 1u64 << 16; + + let data1 = vec![0xAAu8; cluster_size as usize]; + async_write(&disk, 0, &data1); + { + let mut aio = disk.new_async_io(1).unwrap(); + aio.fsync(Some(1)).unwrap(); + } + let buf = async_read(&disk, 0, cluster_size as usize); + assert!(buf.iter().all(|&b| b == 0xAA), "First write should stick"); + + let data2 = vec![0xBBu8; cluster_size as usize]; + async_write(&disk, 0, &data2); + { + let mut aio = disk.new_async_io(1).unwrap(); + aio.fsync(Some(2)).unwrap(); + } + let buf = async_read(&disk, 0, cluster_size as usize); + assert!( + buf.iter().all(|&b| b == 0xBB), + "Overwrite should replace data" + ); + } + + #[test] + fn test_partial_cluster_write_with_backing_cow() { + // Partial cluster write to an overlay with a backing file triggers COW. + // The unwritten part of the cluster must be copied from backing. 
+ let cluster_size = 1u64 << 16; + let file_size = cluster_size * 4; + let pattern: Vec = (0..file_size as usize).map(|i| (i % 251) as u8).collect(); + + let backing_temp = TempFile::new().unwrap(); + backing_temp.as_file().write_all(&pattern).unwrap(); + backing_temp.as_file().sync_all().unwrap(); + let backing_path = backing_temp.as_path().to_str().unwrap().to_string(); + + let overlay_temp = TempFile::new().unwrap(); + { + let raw = RawFile::new(overlay_temp.as_file().try_clone().unwrap(), false); + let backing_config = BackingFileConfig { + path: backing_path, + format: Some(ImageType::Raw), + }; + let _overlay = + QcowFile::new_from_backing(raw, 3, file_size, &backing_config, true).unwrap(); + } + + let file = overlay_temp.as_file().try_clone().unwrap(); + let disk = QcowDiskSync::new(file, false, true, true).unwrap(); + + // Write 4KB at offset 4KB within cluster 0 (partial cluster) + let write_offset = 4096u64; + let write_len = 4096usize; + let write_data = vec![0xEEu8; write_len]; + async_write(&disk, write_offset, &write_data); + { + let mut aio = disk.new_async_io(1).unwrap(); + aio.fsync(Some(1)).unwrap(); + } + + let buf = async_read(&disk, 0, cluster_size as usize); + + // Before the write: should be COW'd from backing + assert_eq!( + &buf[..write_offset as usize], + &pattern[..write_offset as usize], + "Pre write region should be COW from backing" + ); + + assert_eq!( + &buf[write_offset as usize..write_offset as usize + write_len], + &write_data[..], + "Written region should be new data" + ); + + // After the write: should be COW'd from backing + let after_offset = write_offset as usize + write_len; + assert_eq!( + &buf[after_offset..cluster_size as usize], + &pattern[after_offset..cluster_size as usize], + "Post write region should be COW from backing" + ); + } + + #[test] + fn test_partial_cluster_deallocate() { + // Punch hole on a partial cluster range. The deallocate_bytes path + // should produce WriteZeroes actions for partial clusters. 
+ let cluster_size = 1u64 << 16; + let file_size = cluster_size * 4; + + let data: Vec = (0..2 * cluster_size as usize) + .map(|i| (i % 251) as u8) + .collect(); + let (_temp, disk) = create_disk_with_data(file_size, &data, 0, true); + + // Punch a partial range: last 4KB of cluster 0 + first 4KB of cluster 1 + let punch_offset = cluster_size - 4096; + let punch_len = 8192u64; + { + let mut aio = disk.new_async_io(1).unwrap(); + aio.punch_hole(punch_offset, punch_len, 10).unwrap(); + let (ud, res) = aio.next_completed_request().unwrap(); + assert_eq!(ud, 10); + assert_eq!(res, 0); + } + + let buf = async_read(&disk, 0, 2 * cluster_size as usize); + + // Before punch: unchanged + assert_eq!( + &buf[..punch_offset as usize], + &data[..punch_offset as usize], + "Data before punch should be unchanged" + ); + + // Punched region: zeros + assert!( + buf[punch_offset as usize..(punch_offset + punch_len) as usize] + .iter() + .all(|&b| b == 0), + "Punched region should be zeros" + ); + + // After punch: unchanged + let after = (punch_offset + punch_len) as usize; + assert_eq!( + &buf[after..2 * cluster_size as usize], + &data[after..2 * cluster_size as usize], + "Data after punch should be unchanged" + ); + } + + #[test] + fn test_resize_grow() { + let cluster_size = 1u64 << 16; + let initial_size = cluster_size * 4; + let data = vec![0xAA; cluster_size as usize]; + let (_temp, mut disk) = create_disk_with_data(initial_size, &data, 0, true); + + assert_eq!(disk.logical_size().unwrap(), initial_size); + + let new_size = cluster_size * 8; + disk.resize(new_size).unwrap(); + assert_eq!(disk.logical_size().unwrap(), new_size); + + // Original data intact + let buf = async_read(&disk, 0, cluster_size as usize); + assert!( + buf.iter().all(|&b| b == 0xAA), + "Original data should survive resize" + ); + + // New region reads as zeros + let buf = async_read(&disk, initial_size, cluster_size as usize); + assert!( + buf.iter().all(|&b| b == 0), + "Newly grown region should read as 
zeros" + ); + + // Can write to newly grown region + let new_data = vec![0xBB; cluster_size as usize]; + async_write(&disk, initial_size, &new_data); + { + let mut aio = disk.new_async_io(1).unwrap(); + aio.fsync(Some(1)).unwrap(); + } + let buf = async_read(&disk, initial_size, cluster_size as usize); + assert!( + buf.iter().all(|&b| b == 0xBB), + "Write to grown region should work" + ); + } + + #[test] + fn test_resize_with_backing_file_rejected() { + let backing_temp = TempFile::new().unwrap(); + let cluster_size = 1u64 << 16; + let file_size = cluster_size * 4; + backing_temp + .as_file() + .write_all(&vec![0u8; file_size as usize]) + .unwrap(); + backing_temp.as_file().sync_all().unwrap(); + let backing_path = backing_temp.as_path().to_str().unwrap().to_string(); + + let overlay_temp = TempFile::new().unwrap(); + { + let raw = RawFile::new(overlay_temp.as_file().try_clone().unwrap(), false); + let backing_config = BackingFileConfig { + path: backing_path, + format: Some(ImageType::Raw), + }; + let _overlay = + QcowFile::new_from_backing(raw, 3, file_size, &backing_config, true).unwrap(); + } + + let file = overlay_temp.as_file().try_clone().unwrap(); + let mut disk = QcowDiskSync::new(file, false, true, true).unwrap(); + + assert_eq!(disk.logical_size().unwrap(), file_size); + let result = disk.resize(file_size * 2); + assert!(result.is_err(), "resize with backing file should fail"); + assert_eq!( + disk.logical_size().unwrap(), + file_size, + "size should be unchanged after failed resize" ); } } From 26fac58a7c4a21ac2bd74e545f3b10102fc7bc1c Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Mon, 9 Mar 2026 18:47:42 +0000 Subject: [PATCH 091/742] docs: CONTRIBUTING.md: Use the up to date link for DCO Since the project claims to follow Linux's process, update the link to point to Linux's process, instead of relying on an archived page which contains outdated information. 
Signed-off-by: Wei Liu --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 157098201c..ac65550c05 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -58,7 +58,7 @@ commit you make. ## Certificate of Origin -In order to get a clear contribution chain of trust we use the [signed-off-by language](https://web.archive.org/web/20230406041855/https://01.org/community/signed-process) +In order to get a clear contribution chain of trust we use the [signed-off-by language](https://www.kernel.org/doc/Documentation/process/submitting-patches.rst) used by the Linux kernel project. ## Patch format From a179f67b203bdb31693f9d1daba1e6e1da3fa06d Mon Sep 17 00:00:00 2001 From: sohakpt Date: Sun, 8 Mar 2026 23:21:47 +0800 Subject: [PATCH 092/742] main: Fix formatting issue in arguments Signed-off-by: sohakpt --- cloud-hypervisor/src/main.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cloud-hypervisor/src/main.rs b/cloud-hypervisor/src/main.rs index 80cd502914..6680483d6d 100644 --- a/cloud-hypervisor/src/main.rs +++ b/cloud-hypervisor/src/main.rs @@ -199,11 +199,10 @@ fn get_cli_options_sorted( .long("cmdline") .help("Kernel command line") .num_args(1) - .group("vm-config"), Arg::new("console") + .group("vm-config"), + Arg::new("console") .long("console") - .help( - "Control (virtio) console: \"off|null|pty|tty|file=,iommu=on|off\"", - ) + .help("Control (virtio) console: \"off|null|pty|tty|file=,iommu=on|off\"") .default_value("tty") .group("vm-config"), Arg::new("cpus") From 40768086b973bf01ba2eb8ae308e3c3fa93d407c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 10 Mar 2026 00:06:36 +0000 Subject: [PATCH 093/742] build: Bump the non-rust-vmm group across 2 directories with 15 updates Bumps the non-rust-vmm group with 9 updates in the / directory: | Package | From | To | | --- | --- | --- | | 
[libc](https://github.com/rust-lang/libc) | `0.2.182` | `0.2.183` | | [uuid](https://github.com/uuid-rs/uuid) | `1.21.0` | `1.22.0` | | [zerocopy](https://github.com/google/zerocopy) | `0.8.40` | `0.8.42` | | [getrandom](https://github.com/rust-random/getrandom) | `0.4.1` | `0.4.2` | | [jiff](https://github.com/BurntSushi/jiff) | `0.2.22` | `0.2.23` | | [proc-macro-crate](https://github.com/bkchr/proc-macro-crate) | `3.4.0` | `3.5.0` | | [quote](https://github.com/dtolnay/quote) | `1.0.44` | `1.0.45` | | [uds_windows](https://github.com/haraldh/rust_uds_windows) | `1.1.0` | `1.2.0` | | [winnow](https://github.com/winnow-rs/winnow) | `0.7.14` | `0.7.15` | Bumps the non-rust-vmm group with 6 updates in the /fuzz directory: | Package | From | To | | --- | --- | --- | | [libc](https://github.com/rust-lang/libc) | `0.2.182` | `0.2.183` | | [uuid](https://github.com/uuid-rs/uuid) | `1.21.0` | `1.22.0` | | [zerocopy](https://github.com/google/zerocopy) | `0.8.40` | `0.8.42` | | [proc-macro-crate](https://github.com/bkchr/proc-macro-crate) | `3.4.0` | `3.5.0` | | [quote](https://github.com/dtolnay/quote) | `1.0.44` | `1.0.45` | | [winnow](https://github.com/winnow-rs/winnow) | `0.7.14` | `0.7.15` | Updates `libc` from 0.2.182 to 0.2.183 - [Release notes](https://github.com/rust-lang/libc/releases) - [Changelog](https://github.com/rust-lang/libc/blob/0.2.183/CHANGELOG.md) - [Commits](https://github.com/rust-lang/libc/compare/0.2.182...0.2.183) Updates `uuid` from 1.21.0 to 1.22.0 - [Release notes](https://github.com/uuid-rs/uuid/releases) - [Commits](https://github.com/uuid-rs/uuid/compare/v1.21.0...v1.22.0) Updates `zerocopy` from 0.8.40 to 0.8.42 - [Release notes](https://github.com/google/zerocopy/releases) - [Changelog](https://github.com/google/zerocopy/blob/main/CHANGELOG.md) - [Commits](https://github.com/google/zerocopy/compare/v0.8.40...v0.8.42) Updates `getrandom` from 0.4.1 to 0.4.2 - [Changelog](https://github.com/rust-random/getrandom/blob/master/CHANGELOG.md) 
- [Commits](https://github.com/rust-random/getrandom/compare/v0.4.1...v0.4.2) Updates `jiff` from 0.2.22 to 0.2.23 - [Release notes](https://github.com/BurntSushi/jiff/releases) - [Changelog](https://github.com/BurntSushi/jiff/blob/master/CHANGELOG.md) - [Commits](https://github.com/BurntSushi/jiff/compare/jiff-static-0.2.22...jiff-static-0.2.23) Updates `jiff-static` from 0.2.22 to 0.2.23 - [Release notes](https://github.com/BurntSushi/jiff/releases) - [Changelog](https://github.com/BurntSushi/jiff/blob/master/CHANGELOG.md) - [Commits](https://github.com/BurntSushi/jiff/compare/jiff-static-0.2.22...jiff-static-0.2.23) Updates `proc-macro-crate` from 3.4.0 to 3.5.0 - [Release notes](https://github.com/bkchr/proc-macro-crate/releases) - [Commits](https://github.com/bkchr/proc-macro-crate/compare/v3.4.0...v3.5.0) Updates `quote` from 1.0.44 to 1.0.45 - [Release notes](https://github.com/dtolnay/quote/releases) - [Commits](https://github.com/dtolnay/quote/compare/1.0.44...1.0.45) Updates `rand_core` from 0.9.5 to 0.10.0 - [Release notes](https://github.com/rust-random/rand_core/releases) - [Changelog](https://github.com/rust-random/rand_core/blob/master/CHANGELOG.md) - [Commits](https://github.com/rust-random/rand_core/commits/v0.10.0) Updates `toml_datetime` from 0.7.5+spec-1.1.0 to 1.0.0+spec-1.1.0 - [Commits](https://github.com/toml-rs/toml/compare/toml_datetime-v0.7.5...toml_datetime-v1.0.0) Updates `toml_edit` from 0.23.10+spec-1.0.0 to 0.25.4+spec-1.1.0 - [Commits](https://github.com/toml-rs/toml/compare/v0.23.10...v0.25.4) Updates `uds_windows` from 1.1.0 to 1.2.0 - [Release notes](https://github.com/haraldh/rust_uds_windows/releases) - [Changelog](https://github.com/haraldh/rust_uds_windows/blob/master/CHANGELOG.md) - [Commits](https://github.com/haraldh/rust_uds_windows/compare/v1.1.0...v1.2.0) Updates `winnow` from 0.7.14 to 0.7.15 - [Changelog](https://github.com/winnow-rs/winnow/blob/main/CHANGELOG.md) - 
[Commits](https://github.com/winnow-rs/winnow/compare/v0.7.14...v0.7.15) Updates `zerocopy-derive` from 0.8.40 to 0.8.42 - [Release notes](https://github.com/google/zerocopy/releases) - [Changelog](https://github.com/google/zerocopy/blob/main/CHANGELOG.md) - [Commits](https://github.com/google/zerocopy/compare/v0.8.40...v0.8.42) Updates `libc` from 0.2.182 to 0.2.183 - [Release notes](https://github.com/rust-lang/libc/releases) - [Changelog](https://github.com/rust-lang/libc/blob/0.2.183/CHANGELOG.md) - [Commits](https://github.com/rust-lang/libc/compare/0.2.182...0.2.183) Updates `uuid` from 1.21.0 to 1.22.0 - [Release notes](https://github.com/uuid-rs/uuid/releases) - [Commits](https://github.com/uuid-rs/uuid/compare/v1.21.0...v1.22.0) Updates `zerocopy` from 0.8.40 to 0.8.42 - [Release notes](https://github.com/google/zerocopy/releases) - [Changelog](https://github.com/google/zerocopy/blob/main/CHANGELOG.md) - [Commits](https://github.com/google/zerocopy/compare/v0.8.40...v0.8.42) Updates `rand` from 0.9.2 to 0.10.0 - [Release notes](https://github.com/rust-random/rand/releases) - [Changelog](https://github.com/rust-random/rand/blob/master/CHANGELOG.md) - [Commits](https://github.com/rust-random/rand/compare/rand_core-0.9.2...0.10.0) Updates `proc-macro-crate` from 3.4.0 to 3.5.0 - [Release notes](https://github.com/bkchr/proc-macro-crate/releases) - [Commits](https://github.com/bkchr/proc-macro-crate/compare/v3.4.0...v3.5.0) Updates `quote` from 1.0.44 to 1.0.45 - [Release notes](https://github.com/dtolnay/quote/releases) - [Commits](https://github.com/dtolnay/quote/compare/1.0.44...1.0.45) Updates `rand_core` from 0.9.5 to 0.10.0 - [Release notes](https://github.com/rust-random/rand_core/releases) - [Changelog](https://github.com/rust-random/rand_core/blob/master/CHANGELOG.md) - [Commits](https://github.com/rust-random/rand_core/commits/v0.10.0) Updates `toml_datetime` from 0.7.5+spec-1.1.0 to 1.0.0+spec-1.1.0 - 
[Commits](https://github.com/toml-rs/toml/compare/toml_datetime-v0.7.5...toml_datetime-v1.0.0) Updates `toml_edit` from 0.23.10+spec-1.0.0 to 0.25.4+spec-1.1.0 - [Commits](https://github.com/toml-rs/toml/compare/v0.23.10...v0.25.4) Updates `winnow` from 0.7.14 to 0.7.15 - [Changelog](https://github.com/winnow-rs/winnow/blob/main/CHANGELOG.md) - [Commits](https://github.com/winnow-rs/winnow/compare/v0.7.14...v0.7.15) Updates `zerocopy-derive` from 0.8.40 to 0.8.42 - [Release notes](https://github.com/google/zerocopy/releases) - [Changelog](https://github.com/google/zerocopy/blob/main/CHANGELOG.md) - [Commits](https://github.com/google/zerocopy/compare/v0.8.40...v0.8.42) --- updated-dependencies: - dependency-name: libc dependency-version: 0.2.183 dependency-type: direct:production update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: uuid dependency-version: 1.22.0 dependency-type: direct:production update-type: version-update:semver-minor dependency-group: non-rust-vmm - dependency-name: zerocopy dependency-version: 0.8.42 dependency-type: direct:production update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: getrandom dependency-version: 0.4.2 dependency-type: direct:production update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: jiff dependency-version: 0.2.23 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: jiff-static dependency-version: 0.2.23 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: proc-macro-crate dependency-version: 3.5.0 dependency-type: indirect update-type: version-update:semver-minor dependency-group: non-rust-vmm - dependency-name: quote dependency-version: 1.0.45 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: rand_core 
dependency-version: 0.10.0 dependency-type: indirect update-type: version-update:semver-minor dependency-group: non-rust-vmm - dependency-name: toml_datetime dependency-version: 1.0.0+spec-1.1.0 dependency-type: indirect update-type: version-update:semver-major dependency-group: non-rust-vmm - dependency-name: toml_edit dependency-version: 0.25.4+spec-1.1.0 dependency-type: indirect update-type: version-update:semver-minor dependency-group: non-rust-vmm - dependency-name: uds_windows dependency-version: 1.2.0 dependency-type: indirect update-type: version-update:semver-minor dependency-group: non-rust-vmm - dependency-name: winnow dependency-version: 0.7.15 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: zerocopy-derive dependency-version: 0.8.42 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: libc dependency-version: 0.2.183 dependency-type: direct:production update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: uuid dependency-version: 1.22.0 dependency-type: direct:production update-type: version-update:semver-minor dependency-group: non-rust-vmm - dependency-name: zerocopy dependency-version: 0.8.42 dependency-type: direct:production update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: rand dependency-version: 0.10.0 dependency-type: indirect update-type: version-update:semver-minor dependency-group: non-rust-vmm - dependency-name: proc-macro-crate dependency-version: 3.5.0 dependency-type: indirect update-type: version-update:semver-minor dependency-group: non-rust-vmm - dependency-name: quote dependency-version: 1.0.45 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: rand_core dependency-version: 0.10.0 dependency-type: indirect update-type: version-update:semver-minor 
dependency-group: non-rust-vmm - dependency-name: toml_datetime dependency-version: 1.0.0+spec-1.1.0 dependency-type: indirect update-type: version-update:semver-major dependency-group: non-rust-vmm - dependency-name: toml_edit dependency-version: 0.25.4+spec-1.1.0 dependency-type: indirect update-type: version-update:semver-minor dependency-group: non-rust-vmm - dependency-name: winnow dependency-version: 0.7.15 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: zerocopy-derive dependency-version: 0.8.42 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm ... Signed-off-by: dependabot[bot] --- Cargo.lock | 114 ++++++++++++++++---------------------------- Cargo.toml | 6 +-- devices/Cargo.toml | 2 +- fuzz/Cargo.lock | 90 +++++++++++++++++----------------- fuzz/Cargo.toml | 2 +- net_util/Cargo.toml | 2 +- 6 files changed, 92 insertions(+), 124 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 83ef1dc8c8..1833fda938 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -394,7 +394,7 @@ checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601" dependencies = [ "cfg-if", "cpufeatures", - "rand_core 0.10.0", + "rand_core", ] [[package]] @@ -936,20 +936,20 @@ checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" dependencies = [ "cfg-if", "libc", - "r-efi", + "r-efi 5.3.0", "wasip2", ] [[package]] name = "getrandom" -version = "0.4.1" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "139ef39800118c7683f2fd3c98c1b23c09ae076556b435f8e9064ae108aaeeec" +checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" dependencies = [ "cfg-if", "libc", - "r-efi", - "rand_core 0.10.0", + "r-efi 6.0.0", + "rand_core", "wasip2", "wasip3", ] @@ -1136,9 +1136,9 @@ checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" [[package]] name = "jiff" 
-version = "0.2.22" +version = "0.2.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "819b44bc7c87d9117eb522f14d46e918add69ff12713c475946b0a29363ed1c2" +checksum = "1a3546dc96b6d42c5f24902af9e2538e82e39ad350b0c766eb3fbf2d8f3d8359" dependencies = [ "jiff-static", "log", @@ -1149,9 +1149,9 @@ dependencies = [ [[package]] name = "jiff-static" -version = "0.2.22" +version = "0.2.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "470252db18ecc35fd766c0891b1e3ec6cbbcd62507e85276c01bf75d8e94d4a1" +checksum = "2a8c8b344124222efd714b73bb41f8b5120b27a7cc1c75593a6ff768d9d05aa4" dependencies = [ "proc-macro2", "quote", @@ -1226,9 +1226,9 @@ checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" [[package]] name = "libc" -version = "0.2.182" +version = "0.2.183" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6800badb6cb2082ffd7b6a67e6125bb39f18782f793520caee8cb8846be06112" +checksum = "b5b646652bf6661599e1da8901b3b9522896f01e736bad5f723fe7a3a27f899d" [[package]] name = "libredox" @@ -1379,7 +1379,7 @@ name = "net_util" version = "0.1.0" dependencies = [ "epoll", - "getrandom 0.4.1", + "getrandom 0.4.2", "libc", "log", "net_gen", @@ -1730,15 +1730,6 @@ dependencies = [ "portable-atomic", ] -[[package]] -name = "ppv-lite86" -version = "0.2.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" -dependencies = [ - "zerocopy", -] - [[package]] name = "prettyplease" version = "0.2.37" @@ -1751,9 +1742,9 @@ dependencies = [ [[package]] name = "proc-macro-crate" -version = "3.4.0" +version = "3.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "219cb19e96be00ab2e37d6e299658a0cfa83e52429179969b0f0121b4ac46983" +checksum = "e67ba7e9b2b56446f1d419b1d807906278ffa1a658a8a5d8a39dcb1f5a78614f" dependencies = [ "toml_edit", ] @@ -1769,9 +1760,9 @@ 
dependencies = [ [[package]] name = "quote" -version = "1.0.44" +version = "1.0.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21b2ebcf727b7760c461f091f9f0f539b77b8e87f2fd88131e7f1b433b3cece4" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" dependencies = [ "proc-macro2", ] @@ -1783,14 +1774,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" [[package]] -name = "rand" -version = "0.9.2" +name = "r-efi" +version = "6.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" -dependencies = [ - "rand_chacha", - "rand_core 0.9.5", -] +checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" [[package]] name = "rand" @@ -1799,27 +1786,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bc266eb313df6c5c09c1c7b1fbe2510961e5bcd3add930c1e31f7ed9da0feff8" dependencies = [ "chacha20", - "getrandom 0.4.1", - "rand_core 0.10.0", -] - -[[package]] -name = "rand_chacha" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" -dependencies = [ - "ppv-lite86", - "rand_core 0.9.5", -] - -[[package]] -name = "rand_core" -version = "0.9.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" -dependencies = [ - "getrandom 0.3.4", + "getrandom 0.4.2", + "rand_core", ] [[package]] @@ -2132,7 +2100,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "82a72c767771b47409d2345987fda8628641887d5466101319899796367354a0" dependencies = [ "fastrand", - "getrandom 0.3.4", + "getrandom 0.4.2", "once_cell", "rustix", "windows-sys 0.60.2", @@ -2155,7 +2123,7 @@ 
dependencies = [ "dirs", "epoll", "libc", - "rand 0.10.0", + "rand", "serde_json", "ssh2", "thiserror 2.0.18", @@ -2211,18 +2179,18 @@ checksum = "3bf63baf9f5039dadc247375c29eb13706706cfde997d0330d05aa63a77d8820" [[package]] name = "toml_datetime" -version = "0.7.5+spec-1.1.0" +version = "1.0.0+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92e1cfed4a3038bc5a127e35a2d360f145e1f4b971b551a2ba5fd7aedf7e1347" +checksum = "32c2555c699578a4f59f0cc68e5116c8d7cabbd45e1409b989d4be085b53f13e" dependencies = [ "serde_core", ] [[package]] name = "toml_edit" -version = "0.23.10+spec-1.0.0" +version = "0.25.4+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84c8b9f757e028cee9fa244aea147aab2a9ec09d5325a9b01e0a49730c2b5269" +checksum = "7193cbd0ce53dc966037f54351dbbcf0d5a642c7f0038c382ef9e677ce8c13f2" dependencies = [ "indexmap", "toml_datetime", @@ -2294,13 +2262,13 @@ dependencies = [ [[package]] name = "uds_windows" -version = "1.1.0" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89daebc3e6fd160ac4aa9fc8b3bf71e1f74fbf92367ae71fb83a037e8bf164b9" +checksum = "51b70b87d15e91f553711b40df3048faf27a7a04e01e0ddc0cf9309f0af7c2ca" dependencies = [ "memoffset", "tempfile", - "winapi", + "windows-sys 0.60.2", ] [[package]] @@ -2323,13 +2291,13 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.21.0" +version = "1.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b672338555252d43fd2240c714dc444b8c6fb0a5c5335e65a07bba7742735ddb" +checksum = "a68d3c8f01c0cfa54a75291d83601161799e4a89a39e0929f4b0354d88757a37" dependencies = [ - "getrandom 0.4.1", + "getrandom 0.4.2", "js-sys", - "rand 0.9.2", + "rand", "serde_core", "wasm-bindgen", ] @@ -2868,9 +2836,9 @@ checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" [[package]] name = "winnow" 
-version = "0.7.14" +version = "0.7.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a5364e9d77fcdeeaa6062ced926ee3381faa2ee02d3eb83a5c27a8825540829" +checksum = "df79d97927682d2fd8adb29682d1140b343be4ac0f08fd68b7765d9c059d3945" dependencies = [ "memchr", ] @@ -3026,18 +2994,18 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.8.40" +version = "0.8.42" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a789c6e490b576db9f7e6b6d661bcc9799f7c0ac8352f56ea20193b2681532e5" +checksum = "f2578b716f8a7a858b7f02d5bd870c14bf4ddbbcf3a4c05414ba6503640505e3" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.40" +version = "0.8.42" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f65c489a7071a749c849713807783f70672b28094011623e200cb86dcb835953" +checksum = "7e6cc098ea4d3bd6246687de65af3f920c430e236bee1e3bf2e441463f08a02f" dependencies = [ "proc-macro2", "quote", diff --git a/Cargo.toml b/Cargo.toml index a3e76a797e..93765ba718 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -84,13 +84,13 @@ env_logger = "0.11.8" epoll = "4.4.0" flume = "0.12.0" itertools = "0.14.0" -libc = "0.2.182" +libc = "0.2.183" log = "0.4.29" signal-hook = "0.4.3" thiserror = "2.0.18" -uuid = { version = "1.21.0" } +uuid = { version = "1.22.0" } wait-timeout = "0.2.1" -zerocopy = { version = "0.8.40", default-features = false } +zerocopy = { version = "0.8.42", default-features = false } [workspace.lints.clippy] # Any clippy lint (group) in alphabetical order: diff --git a/devices/Cargo.toml b/devices/Cargo.toml index d9ce839882..06c99a1674 100644 --- a/devices/Cargo.toml +++ b/devices/Cargo.toml @@ -34,7 +34,7 @@ vm-memory = { workspace = true, features = [ ] } vm-migration = { path = "../vm-migration" } vmm-sys-util = { workspace = true } -zerocopy = { version = "0.8.40", features = [ +zerocopy = { version = "0.8.42", features = [ "alloc", "derive", ], optional = 
true } diff --git a/fuzz/Cargo.lock b/fuzz/Cargo.lock index 6d98900f31..5d5f5e7a45 100644 --- a/fuzz/Cargo.lock +++ b/fuzz/Cargo.lock @@ -189,6 +189,17 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" +[[package]] +name = "chacha20" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601" +dependencies = [ + "cfg-if", + "cpufeatures", + "rand_core", +] + [[package]] name = "clap" version = "4.5.60" @@ -258,6 +269,15 @@ dependencies = [ "syn", ] +[[package]] +name = "cpufeatures" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201" +dependencies = [ + "libc", +] + [[package]] name = "crc-any" version = "2.5.0" @@ -530,6 +550,7 @@ dependencies = [ "cfg-if", "libc", "r-efi", + "rand_core", "wasip2", "wasip3", ] @@ -704,9 +725,9 @@ checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" [[package]] name = "libc" -version = "0.2.182" +version = "0.2.183" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6800badb6cb2082ffd7b6a67e6125bb39f18782f793520caee8cb8846be06112" +checksum = "b5b646652bf6661599e1da8901b3b9522896f01e736bad5f723fe7a3a27f899d" [[package]] name = "libfuzzer-sys" @@ -916,15 +937,6 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" -[[package]] -name = "ppv-lite86" -version = "0.2.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" -dependencies = [ - "zerocopy", -] - [[package]] name = "prettyplease" version = "0.2.37" @@ -937,9 +949,9 @@ dependencies = [ [[package]] 
name = "proc-macro-crate" -version = "3.4.0" +version = "3.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "219cb19e96be00ab2e37d6e299658a0cfa83e52429179969b0f0121b4ac46983" +checksum = "e67ba7e9b2b56446f1d419b1d807906278ffa1a658a8a5d8a39dcb1f5a78614f" dependencies = [ "toml_edit", ] @@ -955,9 +967,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.44" +version = "1.0.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21b2ebcf727b7760c461f091f9f0f539b77b8e87f2fd88131e7f1b433b3cece4" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" dependencies = [ "proc-macro2", ] @@ -970,32 +982,20 @@ checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" [[package]] name = "rand" -version = "0.9.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" -dependencies = [ - "rand_chacha", - "rand_core", -] - -[[package]] -name = "rand_chacha" -version = "0.9.0" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +checksum = "bc266eb313df6c5c09c1c7b1fbe2510961e5bcd3add930c1e31f7ed9da0feff8" dependencies = [ - "ppv-lite86", + "chacha20", + "getrandom 0.4.1", "rand_core", ] [[package]] name = "rand_core" -version = "0.9.5" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" -dependencies = [ - "getrandom 0.3.4", -] +checksum = "0c8d0fd677905edcbeedbf2edb6494d676f0e98d54d5cf9bda0b061cb8fb8aba" [[package]] name = "rate_limiter" @@ -1221,18 +1221,18 @@ dependencies = [ [[package]] name = "toml_datetime" -version = "0.7.5+spec-1.1.0" +version = "1.0.0+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"92e1cfed4a3038bc5a127e35a2d360f145e1f4b971b551a2ba5fd7aedf7e1347" +checksum = "32c2555c699578a4f59f0cc68e5116c8d7cabbd45e1409b989d4be085b53f13e" dependencies = [ "serde_core", ] [[package]] name = "toml_edit" -version = "0.23.10+spec-1.0.0" +version = "0.25.4+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84c8b9f757e028cee9fa244aea147aab2a9ec09d5325a9b01e0a49730c2b5269" +checksum = "7193cbd0ce53dc966037f54351dbbcf0d5a642c7f0038c382ef9e677ce8c13f2" dependencies = [ "indexmap", "toml_datetime", @@ -1291,9 +1291,9 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.21.0" +version = "1.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b672338555252d43fd2240c714dc444b8c6fb0a5c5335e65a07bba7742735ddb" +checksum = "a68d3c8f01c0cfa54a75291d83601161799e4a89a39e0929f4b0354d88757a37" dependencies = [ "getrandom 0.4.1", "js-sys", @@ -1681,9 +1681,9 @@ dependencies = [ [[package]] name = "winnow" -version = "0.7.14" +version = "0.7.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a5364e9d77fcdeeaa6062ced926ee3381faa2ee02d3eb83a5c27a8825540829" +checksum = "df79d97927682d2fd8adb29682d1140b343be4ac0f08fd68b7765d9c059d3945" dependencies = [ "memchr", ] @@ -1778,18 +1778,18 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.8.40" +version = "0.8.42" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a789c6e490b576db9f7e6b6d661bcc9799f7c0ac8352f56ea20193b2681532e5" +checksum = "f2578b716f8a7a858b7f02d5bd870c14bf4ddbbcf3a4c05414ba6503640505e3" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.40" +version = "0.8.42" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f65c489a7071a749c849713807783f70672b28094011623e200cb86dcb835953" +checksum = 
"7e6cc098ea4d3bd6246687de65af3f920c430e236bee1e3bf2e441463f08a02f" dependencies = [ "proc-macro2", "quote", diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index 888928af06..88d31a152a 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -21,7 +21,7 @@ block = { path = "../block" } devices = { path = "../devices" } epoll = "4.3.3" hypervisor = { path = "../hypervisor", features = ["mshv_emulator"] } -libc = "0.2.182" +libc = "0.2.183" libfuzzer-sys = "0.4.12" linux-loader = { version = "0.13.1", features = ["bzimage", "elf", "pe"] } micro_http = { git = "https://github.com/firecracker-microvm/micro-http", branch = "main" } diff --git a/net_util/Cargo.toml b/net_util/Cargo.toml index e9a8f5badf..9f124cf613 100644 --- a/net_util/Cargo.toml +++ b/net_util/Cargo.toml @@ -6,7 +6,7 @@ version = "0.1.0" [dependencies] epoll = { workspace = true } -getrandom = "0.4.1" +getrandom = "0.4.2" libc = { workspace = true } log = { workspace = true } net_gen = { path = "../net_gen" } From 19d019629177aa48f13a75293263aabbf7dc5f06 Mon Sep 17 00:00:00 2001 From: wuxinyue Date: Tue, 2 Dec 2025 11:45:32 +0800 Subject: [PATCH 094/742] virtio-devices: block: Reduce latency in completion handling Signal the guest before processing queue submissions to enable earlier guest side completion event handling, reducing end-to-end latency for block device operations. FIO benchmarks show up to 7.4% bandwidth improvement at 16 iodepth and 4k block size with NVMe devices. 
Signed-off-by: wuxinyue --- virtio-devices/src/block.rs | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/virtio-devices/src/block.rs b/virtio-devices/src/block.rs index 9bb97d31c3..bf7d9123c1 100644 --- a/virtio-devices/src/block.rs +++ b/virtio-devices/src/block.rs @@ -607,17 +607,14 @@ impl EpollHelperHandler for BlockEpollHandler { )) })?; + self.try_signal_used_queue()?; + let rate_limit_reached = self.rate_limiter.as_ref().is_some_and(|r| r.is_blocked()); // Process the queue only when the rate limit is not reached if !rate_limit_reached { - self.process_queue_submit().map_err(|e| { - EpollHelperError::HandleEvent(anyhow!( - "Failed to process queue (submit): {e:?}" - )) - })?; + self.process_queue_submit_and_signal()?; } - self.try_signal_used_queue()?; } RATE_LIMITER_EVENT => { if let Some(rate_limiter) = &mut self.rate_limiter { From f211170fa2c773437f3c8bcee8087828e5cdc472 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Wed, 11 Mar 2026 07:28:02 -0700 Subject: [PATCH 095/742] vmm: openapi: Create enum types for enums This makes it clearer that these are enums that can be reused and also helps generation by providing names for those types. 
Signed-off-by: Rob Bradford --- vmm/src/api/openapi/cloud-hypervisor.yaml | 44 +++++++++++++++-------- 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/vmm/src/api/openapi/cloud-hypervisor.yaml b/vmm/src/api/openapi/cloud-hypervisor.yaml index 77b16e97ff..cdba385c74 100644 --- a/vmm/src/api/openapi/cloud-hypervisor.yaml +++ b/vmm/src/api/openapi/cloud-hypervisor.yaml @@ -520,6 +520,10 @@ components: type: string description: Virtual Machine Monitor information + VmState: + type: string + enum: [Created, Running, Shutdown, Paused] + VmInfo: required: - config @@ -529,8 +533,8 @@ components: config: $ref: "#/components/schemas/VmConfig" state: - type: string - enum: [Created, Running, Shutdown, Paused] + $ref: "#/components/schemas/VmState" + memory_actual_size: type: integer format: int64 @@ -710,6 +714,11 @@ components: packages: type: integer + CoreSchedulingMode: + type: string + enum: ["Vm", "Vcpu", "Off"] + default: "Vm" + CpusConfig: required: - boot_vcpus @@ -739,9 +748,8 @@ components: features: $ref: "#/components/schemas/CpuFeatures" core_scheduling: - type: string - enum: ["Vm", "Vcpu", "Off"] - default: "Vm" + $ref: "#/components/schemas/CoreSchedulingMode" + PciSegmentConfig: required: @@ -931,6 +939,15 @@ components: items: type: integer + ImageType: + type: string + enum: ["FixedVhd", "Qcow2", "Raw", "Vhdx", "Unknown"] + + LockGranularity: + type: string + enum: [byte-range, full] + default: byte-range + DiskConfig: type: object properties: @@ -978,12 +995,9 @@ components: type: boolean default: true image_type: - type: string - enum: [FixedVhd, Qcow2, Raw, Vhdx, Unknown] + $ref: "#/components/schemas/ImageType" lock_granularity: - type: string - enum: [byte-range, full] - default: byte-range + $ref: "#/components/schemas/LockGranularity" NetConfig: type: object @@ -1132,6 +1146,10 @@ components: id: type: string + ConsoleMode: + type: string + enum: ["Off", "Pty", "Tty", "File", "Socket", "Null"] + ConsoleConfig: required: - mode @@ 
-1142,8 +1160,7 @@ components: socket: type: string mode: - type: string - enum: ["Off", "Pty", "Tty", "File", "Socket", "Null"] + $ref: "#/components/schemas/ConsoleMode" iommu: type: boolean default: false @@ -1156,8 +1173,7 @@ components: file: type: string mode: - type: string - enum: ["Off", "Pty", "Tty", "File", "Null"] + $ref: "#/components/schemas/ConsoleMode" iobase: type: integer From 0a4fe0e41eae99fbe361ba41f46f45c379519bdd Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Wed, 11 Mar 2026 08:16:13 -0700 Subject: [PATCH 096/742] vmm: Fix OpenAPI definition of for lock granularity Use CamelCase as per the existing definitions (which allows the removal of the serde transformation) Signed-off-by: Rob Bradford --- block/src/fcntl.rs | 1 - vmm/src/api/openapi/cloud-hypervisor.yaml | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/block/src/fcntl.rs b/block/src/fcntl.rs index 23c6f9f167..98084748cf 100644 --- a/block/src/fcntl.rs +++ b/block/src/fcntl.rs @@ -147,7 +147,6 @@ impl LockGranularity { /// image. Without a byte-range lock, some NFS implementations may treat the /// entire file as exclusively locked and prevent such operations (e.g. NetApp). #[derive(Clone, Copy, Debug, Default, PartialEq, Eq, serde::Deserialize, serde::Serialize)] -#[serde(rename_all = "kebab-case")] pub enum LockGranularityChoice { /// Byte-range lock covering [0, size). 
#[default] diff --git a/vmm/src/api/openapi/cloud-hypervisor.yaml b/vmm/src/api/openapi/cloud-hypervisor.yaml index cdba385c74..efdcf7a678 100644 --- a/vmm/src/api/openapi/cloud-hypervisor.yaml +++ b/vmm/src/api/openapi/cloud-hypervisor.yaml @@ -945,8 +945,8 @@ components: LockGranularity: type: string - enum: [byte-range, full] - default: byte-range + enum: [ByteRange, Full] + default: ByteRange DiskConfig: type: object From d90f852ddf5d3bacc7afe373df1764a45fc12b01 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Tue, 10 Mar 2026 13:58:40 +0100 Subject: [PATCH 097/742] virtio-devices: block: add helpful message on failed sector 0 write We ran the Microsoft Windows installer in CHV (via network + VNC) and the installation always failed when the installer wanted to write the partition table. Since recently, for very good reason, sector 0 writes are disabled if the image type is not set explicitly and only implicitly auto-detected as raw [0]. To ease troubleshooting, I've added a descriptive log message. It is a little spammy, but it is what is required to help users to troubleshoot. 
[0] https://github.com/cloud-hypervisor/cloud-hypervisor/pull/7728/changes/6ecdf90e22adeecbaf1ce311b1abd358a4788d48 Signed-off-by: Philipp Schuster On-behalf-of: SAP philipp.schuster@sap.com --- virtio-devices/src/block.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/virtio-devices/src/block.rs b/virtio-devices/src/block.rs index bf7d9123c1..0f6ca08820 100644 --- a/virtio-devices/src/block.rs +++ b/virtio-devices/src/block.rs @@ -186,6 +186,9 @@ impl BlockEpollHandler { } if request_type == RequestType::Out && disable_sector0_writes && request.sector == 0 { + warn!( + "Attempting to write to sector 0 on a raw disk without specifying image_type=raw" + ); return Err(ExecuteError::ReadOnly); } From 5b55286099a3e55ec85f93218cb372148b0f89e4 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Tue, 10 Mar 2026 13:58:49 +0100 Subject: [PATCH 098/742] virtio-devices: block: add another helpful log message Signed-off-by: Philipp Schuster On-behalf-of: SAP philipp.schuster@sap.com --- virtio-devices/src/block.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/virtio-devices/src/block.rs b/virtio-devices/src/block.rs index 0f6ca08820..22b8ef31e6 100644 --- a/virtio-devices/src/block.rs +++ b/virtio-devices/src/block.rs @@ -182,6 +182,9 @@ impl BlockEpollHandler { // For virtio spec compliance // "A device MUST set the status byte to VIRTIO_BLK_S_IOERR for a write request // if the VIRTIO_BLK_F_RO feature if offered, and MUST NOT write any data." 
+ warn!( + "Rejecting block request {request_type:?}: device is read-only (VIRTIO_BLK_F_RO negotiated)" + ); return Err(ExecuteError::ReadOnly); } From 57e64a0848ca63546b12159bfd29bac9897bdde2 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 11 Mar 2026 20:45:31 +0100 Subject: [PATCH 099/742] performance-metrics: Kill stale processes on test timeout When a test times out, the spawned thread containing the cloud-hypervisor child process, iperf3/ethr sub processes, and all associated resources (TAP devices, file descriptors, hugepage reservations) is abandoned without cleanup. This attaches a cleanup routine that kills cloud-hypervisor, iperf3, and ethr processes on timeout, then waits briefly for the kernel to reclaim their resources. This prevents leaked processes from interfering with subsequent tests. Removes the existing TODO comment. Signed-off-by: Anatol Belski --- performance-metrics/src/main.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/performance-metrics/src/main.rs b/performance-metrics/src/main.rs index c48e5a906d..34986395ae 100644 --- a/performance-metrics/src/main.rs +++ b/performance-metrics/src/main.rs @@ -1214,7 +1214,6 @@ fn run_test_with_timeout( let _ = sender.send(output); }); - // Todo: Need to cleanup/kill all hanging child processes let test_timeout = test.calc_timeout(&test_iterations, &test_timeout); receiver .recv_timeout(Duration::from_secs(test_timeout)) @@ -1223,10 +1222,18 @@ fn run_test_with_timeout( "[Error] Test '{}' time-out after {} seconds", test.name, test_timeout ); + cleanup_stale_processes(); Error::TestTimeout })? 
} +fn cleanup_stale_processes() { + for proc in &["cloud-hypervisor", "iperf3", "ethr"] { + let _ = Command::new("pkill").args(["-9", "-f", proc]).status(); + } + thread::sleep(Duration::from_secs(2)); +} + fn date() -> String { let output = test_infra::exec_host_command_output("date"); String::from_utf8_lossy(&output.stdout).trim().to_string() From c0a81bc903021784d2d117470c6091e52dc8382e Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 11 Mar 2026 20:54:11 +0100 Subject: [PATCH 100/742] performance-metrics: Settle host before each test Flush host writeback queues, drop the page cache and sleep 1s for kernel housekeeping before each test run. The cloud-hypervisor block backend does buffered I/O on the host side, so dirty pages from prior write tests can accumulate and compete for I/O bandwidth with subsequent tests. Dropping caches ensures cold read tests get a consistent baseline rather than benefiting from data cached by prior tests. The brief cooldown lets the kernel finish tearing down KVM state and freeing pages from the previous VM before the next one starts. Requires root, which the metrics container provides. Silently fails otherwise. 
Signed-off-by: Anatol Belski --- performance-metrics/src/main.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/performance-metrics/src/main.rs b/performance-metrics/src/main.rs index 34986395ae..bef0b74ab1 100644 --- a/performance-metrics/src/main.rs +++ b/performance-metrics/src/main.rs @@ -1234,6 +1234,14 @@ fn cleanup_stale_processes() { thread::sleep(Duration::from_secs(2)); } +fn settle_host() { + let _ = Command::new("sync").status(); + let _ = Command::new("bash") + .args(["-c", "echo 3 > /proc/sys/vm/drop_caches"]) + .status(); + thread::sleep(Duration::from_secs(1)); +} + fn date() -> String { let output = test_infra::exec_host_command_output("date"); String::from_utf8_lossy(&output.stdout).trim().to_string() @@ -1334,6 +1342,7 @@ fn main() { for test in test_list.iter() { if test_filter.is_empty() || test_filter.iter().any(|&s| test.name.contains(s)) { + settle_host(); match run_test_with_timeout(test, &overrides) { Ok(r) => { metrics_report.results.push(r); From 766f4206c9a8a127fcdfb71c74e5a77f21ab48fd Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 11 Mar 2026 23:02:31 +0100 Subject: [PATCH 101/742] performance-metrics: Set image_type=raw for block I/O test image Without explicit image_type fio first sequential write hits sector 0 and gets VIRTIO_BLK_S_IOERR. fio then hangs, causing block_write_MiBps and all other write tests using BLK_IO_TEST_IMG to time out. 
Signed-off-by: Anatol Belski --- performance-metrics/src/performance_tests.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/performance-metrics/src/performance_tests.rs b/performance-metrics/src/performance_tests.rs index 7eb07f368a..c3787f4da5 100644 --- a/performance-metrics/src/performance_tests.rs +++ b/performance-metrics/src/performance_tests.rs @@ -442,6 +442,8 @@ pub fn performance_block_io(control: &PerformanceTestControl) -> f64 { format!("path={test_file},queue_size={queue_size},num_queues={num_queues}"); if test_file == OVERLAY_WITH_QCOW2_BACKING || test_file == OVERLAY_WITH_RAW_BACKING { test_disk_arg.push_str(",image_type=qcow2,backing_files=on"); + } else if test_file == BLK_IO_TEST_IMG { + test_disk_arg.push_str(",image_type=raw"); } let mut child = GuestCommand::new(&guest) From 6850b04fa61954cf7ba45d1b164922c3e9e62fa6 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 11 Mar 2026 23:40:09 +0100 Subject: [PATCH 102/742] performance-metrics: Set image_type=qcow2 for remaining qcow2 test disks Without it the VMM autodetects the format and logs warnings that specifying image_type will become mandatory for non raw images in the future. 
Signed-off-by: Anatol Belski --- performance-metrics/src/performance_tests.rs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/performance-metrics/src/performance_tests.rs b/performance-metrics/src/performance_tests.rs index c3787f4da5..1022aac19e 100644 --- a/performance-metrics/src/performance_tests.rs +++ b/performance-metrics/src/performance_tests.rs @@ -440,8 +440,16 @@ pub fn performance_block_io(control: &PerformanceTestControl) -> f64 { let mut test_disk_arg = format!("path={test_file},queue_size={queue_size},num_queues={num_queues}"); - if test_file == OVERLAY_WITH_QCOW2_BACKING || test_file == OVERLAY_WITH_RAW_BACKING { - test_disk_arg.push_str(",image_type=qcow2,backing_files=on"); + if test_file == OVERLAY_WITH_QCOW2_BACKING + || test_file == OVERLAY_WITH_RAW_BACKING + || test_file == QCOW2_UNCOMPRESSED_IMG + || test_file == QCOW2_ZLIB_IMG + || test_file == QCOW2_ZSTD_IMG + { + test_disk_arg.push_str(",image_type=qcow2"); + if test_file == OVERLAY_WITH_QCOW2_BACKING || test_file == OVERLAY_WITH_RAW_BACKING { + test_disk_arg.push_str(",backing_files=on"); + } } else if test_file == BLK_IO_TEST_IMG { test_disk_arg.push_str(",image_type=raw"); } From 8671e193ff331a80c16d54b23c6364890ef48fb5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 9 Mar 2026 23:55:26 +0000 Subject: [PATCH 103/742] build: Bump docker/metadata-action from 5 to 6 Bumps [docker/metadata-action](https://github.com/docker/metadata-action) from 5 to 6. - [Release notes](https://github.com/docker/metadata-action/releases) - [Commits](https://github.com/docker/metadata-action/compare/v5...v6) --- updated-dependencies: - dependency-name: docker/metadata-action dependency-version: '6' dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] --- .github/workflows/docker-image.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker-image.yaml b/.github/workflows/docker-image.yaml index 0f6a51f2d4..f077b51738 100644 --- a/.github/workflows/docker-image.yaml +++ b/.github/workflows/docker-image.yaml @@ -36,7 +36,7 @@ jobs: - name: Docker meta id: meta - uses: docker/metadata-action@v5 + uses: docker/metadata-action@v6 with: images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} # generate Docker tags based on the following events/attributes From 623af62743222277b637a77f4fd90397f38ab551 Mon Sep 17 00:00:00 2001 From: Emir Beganovic Date: Tue, 10 Mar 2026 13:02:21 +0100 Subject: [PATCH 104/742] block: Implement write_zeroes and punch_hole for AIO backend The AIO block backend advertises VIRTIO_BLK_F_WRITE_ZEROES and VIRTIO_BLK_F_DISCARD to guests because the filesystem probe (supports_sparse_operations) returns true on ext4/XFS. However, RawFileAsyncAio::write_zeroes() and punch_hole() return errors because Linux AIO (io_submit) has no IOCB command for fallocate. When io_uring is unavailable (e.g. io_uring_disabled=2, a common security hardening on enterprise Linux), Cloud Hypervisor falls back to the AIO backend. The guest negotiates the feature, issues WRITE_ZEROES requests, and gets I/O errors. Implement write_zeroes and punch_hole using synchronous libc::fallocate() calls, matching the pattern used by the sync backend (RawFileSync). A VecDeque-based completion list signals results to the caller via the existing eventfd mechanism. Unit tests mirror the existing raw_sync.rs test suite. Integration tests add AIO-specific variants of the discard and fstrim tests using _disable_io_uring=on. 
Signed-off-by: Emir Beganovic --- block/src/raw_async_aio.rs | 240 +++++++++++++++++++++++++- cloud-hypervisor/tests/integration.rs | 64 ++++++- 2 files changed, 292 insertions(+), 12 deletions(-) diff --git a/block/src/raw_async_aio.rs b/block/src/raw_async_aio.rs index fe7196ebba..20fb26c2c4 100644 --- a/block/src/raw_async_aio.rs +++ b/block/src/raw_async_aio.rs @@ -5,6 +5,7 @@ // Copyright © 2023 Crusoe Energy Systems LLC // +use std::collections::VecDeque; use std::fs::File; use std::io::{Seek, SeekFrom}; use std::os::unix::io::{AsRawFd, RawFd}; @@ -73,6 +74,7 @@ pub struct RawFileAsyncAio { ctx: aio::IoContext, eventfd: EventFd, alignment: u64, + completion_list: VecDeque<(u64, i32)>, } impl RawFileAsyncAio { @@ -85,6 +87,7 @@ impl RawFileAsyncAio { ctx, eventfd, alignment: SECTOR_SIZE, + completion_list: VecDeque::new(), }) } } @@ -168,6 +171,11 @@ impl AsyncIo for RawFileAsyncAio { } fn next_completed_request(&mut self) -> Option<(u64, i32)> { + // Drain synchronous completions first (from punch_hole/write_zeroes). + if let Some(completed) = self.completion_list.pop_front() { + return Some(completed); + } + let mut events: [aio::IoEvent; 1] = [aio::IoEvent::default()]; let rc = self.ctx.get_events(0, &mut events, None).unwrap(); if rc == 0 { @@ -177,15 +185,231 @@ impl AsyncIo for RawFileAsyncAio { } } - fn punch_hole(&mut self, _offset: u64, _length: u64, _user_data: u64) -> AsyncIoResult<()> { - Err(AsyncIoError::PunchHole(std::io::Error::other( - "punch_hole not supported with AIO backend", - ))) + fn punch_hole(&mut self, offset: u64, length: u64, user_data: u64) -> AsyncIoResult<()> { + // Linux AIO has no IOCB command for fallocate, so perform the operation + // synchronously and signal completion via the completion list, matching + // the pattern used by the sync backend (RawFileSync). 
+ const FALLOC_FL_PUNCH_HOLE: i32 = 0x02; + const FALLOC_FL_KEEP_SIZE: i32 = 0x01; + let mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; + + // SAFETY: FFI call with valid arguments + let result = unsafe { + libc::fallocate( + self.fd as libc::c_int, + mode, + offset as libc::off_t, + length as libc::off_t, + ) + }; + if result < 0 { + return Err(AsyncIoError::PunchHole(std::io::Error::last_os_error())); + } + + self.completion_list.push_back((user_data, result)); + self.eventfd.write(1).unwrap(); + + Ok(()) + } + + fn write_zeroes(&mut self, offset: u64, length: u64, user_data: u64) -> AsyncIoResult<()> { + // Linux AIO has no IOCB command for fallocate, so perform the operation + // synchronously and signal completion via the completion list, matching + // the pattern used by the sync backend (RawFileSync). + const FALLOC_FL_ZERO_RANGE: i32 = 0x10; + const FALLOC_FL_KEEP_SIZE: i32 = 0x01; + let mode = FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE; + + // SAFETY: FFI call with valid arguments + let result = unsafe { + libc::fallocate( + self.fd as libc::c_int, + mode, + offset as libc::off_t, + length as libc::off_t, + ) + }; + if result < 0 { + return Err(AsyncIoError::WriteZeroes(std::io::Error::last_os_error())); + } + + self.completion_list.push_back((user_data, result)); + self.eventfd.write(1).unwrap(); + + Ok(()) } +} + +#[cfg(test)] +mod unit_tests { + use std::io::{Read, Seek, SeekFrom, Write}; + + use vmm_sys_util::tempfile::TempFile; + + use super::*; + + #[test] + fn test_punch_hole() { + let temp_file = TempFile::new().unwrap(); + let mut file = temp_file.into_file(); + + // Write 4MB of data + let data = vec![0xAA; 4 * 1024 * 1024]; + file.write_all(&data).unwrap(); + file.sync_all().unwrap(); + + // Create async IO instance + let mut async_io = RawFileAsyncAio::new(file.as_raw_fd(), 128).unwrap(); + + // Punch hole in the middle (1MB at offset 1MB) + let offset = 1024 * 1024; + let length = 1024 * 1024; + async_io.punch_hole(offset, length, 
1).unwrap(); + + // Check completion + let (user_data, result) = async_io.next_completed_request().unwrap(); + assert_eq!(user_data, 1); + assert_eq!(result, 0); + + // Verify the hole reads as zeros + file.seek(SeekFrom::Start(offset)).unwrap(); + let mut read_buf = vec![0; length as usize]; + file.read_exact(&mut read_buf).unwrap(); + assert!( + read_buf.iter().all(|&b| b == 0), + "Punched hole should read as zeros" + ); + + // Verify data before hole is intact + file.seek(SeekFrom::Start(0)).unwrap(); + let mut read_buf = vec![0; 1024]; + file.read_exact(&mut read_buf).unwrap(); + assert!( + read_buf.iter().all(|&b| b == 0xAA), + "Data before hole should be intact" + ); + + // Verify data after hole is intact + file.seek(SeekFrom::Start(offset + length)).unwrap(); + let mut read_buf = vec![0; 1024]; + file.read_exact(&mut read_buf).unwrap(); + assert!( + read_buf.iter().all(|&b| b == 0xAA), + "Data after hole should be intact" + ); + } + + #[test] + fn test_write_zeroes() { + let temp_file = TempFile::new().unwrap(); + let mut file = temp_file.into_file(); + + // Write 4MB of data + let data = vec![0xBB; 4 * 1024 * 1024]; + file.write_all(&data).unwrap(); + file.sync_all().unwrap(); + + // Create async IO instance + let mut async_io = RawFileAsyncAio::new(file.as_raw_fd(), 128).unwrap(); + + // Write zeros in the middle (512KB at offset 2MB) + let offset = 2 * 1024 * 1024; + let length = 512 * 1024; + let write_zeroes_result = async_io.write_zeroes(offset, length, 2); + + // FALLOC_FL_ZERO_RANGE might not be supported on all filesystems (e.g., tmpfs) + // If it fails with ENOTSUP, skip the test + if let Err(AsyncIoError::WriteZeroes(ref e)) = write_zeroes_result + && (e.raw_os_error() == Some(libc::EOPNOTSUPP) + || e.raw_os_error() == Some(libc::ENOTSUP)) + { + eprintln!( + "Skipping test_write_zeroes: filesystem doesn't support FALLOC_FL_ZERO_RANGE" + ); + return; + } + write_zeroes_result.unwrap(); + + // Check completion + let (user_data, result) = 
async_io.next_completed_request().unwrap(); + assert_eq!(user_data, 2); + assert_eq!(result, 0); + + // Verify the zeroed region reads as zeros + file.seek(SeekFrom::Start(offset)).unwrap(); + let mut read_buf = vec![0; length as usize]; + file.read_exact(&mut read_buf).unwrap(); + assert!( + read_buf.iter().all(|&b| b == 0), + "Zeroed region should read as zeros" + ); + + // Verify data before zeroed region is intact + file.seek(SeekFrom::Start(offset - 1024)).unwrap(); + let mut read_buf = vec![0; 1024]; + file.read_exact(&mut read_buf).unwrap(); + assert!( + read_buf.iter().all(|&b| b == 0xBB), + "Data before zeroed region should be intact" + ); + + // Verify data after zeroed region is intact + file.seek(SeekFrom::Start(offset + length)).unwrap(); + let mut read_buf = vec![0; 1024]; + file.read_exact(&mut read_buf).unwrap(); + assert!( + read_buf.iter().all(|&b| b == 0xBB), + "Data after zeroed region should be intact" + ); + } + + #[test] + fn test_punch_hole_multiple_operations() { + let temp_file = TempFile::new().unwrap(); + let mut file = temp_file.into_file(); + + // Write 8MB of data + let data = vec![0xCC; 8 * 1024 * 1024]; + file.write_all(&data).unwrap(); + file.sync_all().unwrap(); + + // Create async IO instance + let mut async_io = RawFileAsyncAio::new(file.as_raw_fd(), 128).unwrap(); + + // Punch multiple holes + async_io.punch_hole(1024 * 1024, 512 * 1024, 10).unwrap(); + async_io + .punch_hole(3 * 1024 * 1024, 512 * 1024, 11) + .unwrap(); + async_io + .punch_hole(5 * 1024 * 1024, 512 * 1024, 12) + .unwrap(); + + // Check all completions + let (user_data, result) = async_io.next_completed_request().unwrap(); + assert_eq!(user_data, 10); + assert_eq!(result, 0); + + let (user_data, result) = async_io.next_completed_request().unwrap(); + assert_eq!(user_data, 11); + assert_eq!(result, 0); + + let (user_data, result) = async_io.next_completed_request().unwrap(); + assert_eq!(user_data, 12); + assert_eq!(result, 0); + + // Verify all holes read as 
zeros + file.seek(SeekFrom::Start(1024 * 1024)).unwrap(); + let mut read_buf = vec![0; 512 * 1024]; + file.read_exact(&mut read_buf).unwrap(); + assert!(read_buf.iter().all(|&b| b == 0)); + + file.seek(SeekFrom::Start(3 * 1024 * 1024)).unwrap(); + file.read_exact(&mut read_buf).unwrap(); + assert!(read_buf.iter().all(|&b| b == 0)); - fn write_zeroes(&mut self, _offset: u64, _length: u64, _user_data: u64) -> AsyncIoResult<()> { - Err(AsyncIoError::WriteZeroes(std::io::Error::other( - "write_zeroes not supported with AIO backend", - ))) + file.seek(SeekFrom::Start(5 * 1024 * 1024)).unwrap(); + file.read_exact(&mut read_buf).unwrap(); + assert!(read_buf.iter().all(|&b| b == 0)); } } diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 96963bc4ed..1d198828e9 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -7532,6 +7532,24 @@ mod common_parallel { extra_create_args: &[&str], expect_discard_success: bool, verify_disk: bool, + ) { + _test_virtio_block_discard_with_backend( + format_name, + qemu_img_format, + extra_create_args, + expect_discard_success, + verify_disk, + false, + ); + } + + fn _test_virtio_block_discard_with_backend( + format_name: &str, + qemu_img_format: &str, + extra_create_args: &[&str], + expect_discard_success: bool, + verify_disk: bool, + disable_io_uring: bool, ) { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); let guest = Guest::new(Box::new(disk_config)); @@ -7574,9 +7592,14 @@ mod common_parallel { ) .as_str(), format!( - "path={},num_queues=4,image_type={}", + "path={},num_queues=4,image_type={}{}", test_disk_path.to_str().unwrap(), - format_name.to_lowercase() + format_name.to_lowercase(), + if disable_io_uring { + ",_disable_io_uring=on" + } else { + "" + } ) .as_str(), ]) @@ -7754,6 +7777,11 @@ mod common_parallel { _test_virtio_block_discard("raw", "raw", &[], true, false); } + #[test] + fn test_virtio_block_discard_raw_aio() 
{ + _test_virtio_block_discard_with_backend("raw", "raw", &[], true, false, true); + } + #[test] fn test_virtio_block_discard_unsupported_vhd() { _test_virtio_block_discard("vhd", "vpc", &["-o", "subformat=fixed"], false, false); @@ -8056,6 +8084,24 @@ mod common_parallel { extra_create_args: &[&str], expect_fstrim_success: bool, verify_disk: bool, + ) { + _test_virtio_block_fstrim_with_backend( + format_name, + qemu_img_format, + extra_create_args, + expect_fstrim_success, + verify_disk, + false, + ); + } + + fn _test_virtio_block_fstrim_with_backend( + format_name: &str, + qemu_img_format: &str, + extra_create_args: &[&str], + expect_fstrim_success: bool, + verify_disk: bool, + disable_io_uring: bool, ) { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); let guest = Guest::new(Box::new(disk_config)); @@ -8101,9 +8147,14 @@ mod common_parallel { ) .as_str(), format!( - "path={},num_queues=4,image_type={}", + "path={},num_queues=4,image_type={}{}", test_disk_path.to_str().unwrap(), - format_name.to_lowercase() + format_name.to_lowercase(), + if disable_io_uring { + ",_disable_io_uring=on" + } else { + "" + } ) .as_str(), ]) @@ -8242,6 +8293,11 @@ mod common_parallel { _test_virtio_block_fstrim("raw", "raw", &[], true, false); } + #[test] + fn test_virtio_block_fstrim_raw_aio() { + _test_virtio_block_fstrim_with_backend("raw", "raw", &[], true, false, true); + } + #[test] fn test_virtio_block_fstrim_unsupported_vhd() { _test_virtio_block_fstrim("vhd", "vpc", &["-o", "subformat=fixed"], false, false); From f184a0f0f3b909592228c19ab81196793310be7d Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 4 Mar 2026 22:26:12 +0100 Subject: [PATCH 105/742] block: Add error module skeleton Introduce error.rs as the home for a unified error hierarchy that will replace the per format error types at the public crate boundary. This commit is intentionally empty beyond the copyright header and module declaration in lib.rs. 
Signed-off-by: Anatol Belski --- block/src/error.rs | 17 +++++++++++++++++ block/src/lib.rs | 1 + 2 files changed, 18 insertions(+) create mode 100644 block/src/error.rs diff --git a/block/src/error.rs b/block/src/error.rs new file mode 100644 index 0000000000..a8dc2c1ad2 --- /dev/null +++ b/block/src/error.rs @@ -0,0 +1,17 @@ +// Copyright 2025 The Cloud Hypervisor Authors. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +//! Unified error handling for the block crate. +//! +//! # Architecture +//! +//! ```text +//! BlockError -- single public error type +//! |-- BlockErrorKind -- small, stable, matchable classification +//! |-- ErrorContext -- optional diagnostic metadata (path, offset, op) +//! +-- source -- format-specific error (boxed) +//! |-- QcowError +//! |-- VhdError / RawError / ... +//! +-- io::Error / etc. +//! ``` diff --git a/block/src/lib.rs b/block/src/lib.rs index f8d56cf102..1b739beafc 100644 --- a/block/src/lib.rs +++ b/block/src/lib.rs @@ -9,6 +9,7 @@ // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause pub mod async_io; +pub mod error; pub mod fcntl; pub mod fixed_vhd; #[cfg(feature = "io_uring")] From e4e74a9d9331d25159dd8176f09e5ae679ddb263 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 4 Mar 2026 22:28:01 +0100 Subject: [PATCH 106/742] block: Add BlockErrorKind classification enum Add a small, stable enum that classifies block errors into broad categories - I/O, invalid format, unsupported feature, corrupt image, out of bounds, not found, overflow. Callers match on this for control flow rather than on format specific error variants. Signed-off-by: Anatol Belski --- block/src/error.rs | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/block/src/error.rs b/block/src/error.rs index a8dc2c1ad2..3b7ae1f291 100644 --- a/block/src/error.rs +++ b/block/src/error.rs @@ -15,3 +15,42 @@ //! |-- VhdError / RawError / ... //! +-- io::Error / etc. //! 
``` + +use std::fmt::{self, Display, Formatter}; + +/// Small, stable classification of block errors. +/// +/// Callers match on this for control flow. Adding new format specific +/// errors does not require new variants here. +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +#[non_exhaustive] +pub enum BlockErrorKind { + /// An underlying I/O operation failed. + Io, + /// The disk image format is structurally invalid. + InvalidFormat, + /// The disk image requires a feature that is not implemented. + UnsupportedFeature, + /// The image is marked or detected as corrupt. + CorruptImage, + /// An address, offset, or index is outside the valid range. + OutOfBounds, + /// A file or required internal structure could not be found. + NotFound, + /// An internal counter or limit was exceeded. + Overflow, +} + +impl Display for BlockErrorKind { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + match self { + Self::Io => write!(f, "I/O error"), + Self::InvalidFormat => write!(f, "invalid format"), + Self::UnsupportedFeature => write!(f, "unsupported feature"), + Self::CorruptImage => write!(f, "corrupt image"), + Self::OutOfBounds => write!(f, "out of bounds"), + Self::NotFound => write!(f, "not found"), + Self::Overflow => write!(f, "overflow"), + } + } +} From 55504177cbd4519e1a43a534508adbad177f3eb5 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 4 Mar 2026 22:28:52 +0100 Subject: [PATCH 107/742] block: Add ErrorContext for path/offset/op diagnostics Add a struct that carries optional diagnostic metadata - file path, byte offset, and operation name that can be attached to any BlockError. This lets errors report *where* and *during what* a failure occurred, which is especially useful when the same I/O kind shows up at multiple call sites. 
Signed-off-by: Anatol Belski --- block/src/error.rs | 55 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/block/src/error.rs b/block/src/error.rs index 3b7ae1f291..b94ddda068 100644 --- a/block/src/error.rs +++ b/block/src/error.rs @@ -17,6 +17,7 @@ //! ``` use std::fmt::{self, Display, Formatter}; +use std::path::PathBuf; /// Small, stable classification of block errors. /// @@ -54,3 +55,57 @@ impl Display for BlockErrorKind { } } } + +/// Classification of the operation that was in progress when an error occurred. +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +#[non_exhaustive] +pub enum ErrorOp { + /// Opening a disk image file. + Open, + /// Detecting the image format. + DetectImageType, + /// Duplicating a backing-file descriptor. + DupBackingFd, +} + +impl Display for ErrorOp { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + match self { + Self::Open => write!(f, "open"), + Self::DetectImageType => write!(f, "detect_image_type"), + Self::DupBackingFd => write!(f, "dup_backing_fd"), + } + } +} + +/// Optional diagnostic context attached to a [`BlockError`]. +#[derive(Debug, Default, Clone)] +pub struct ErrorContext { + pub path: Option, + pub offset: Option, + pub op: Option, +} + +impl Display for ErrorContext { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + let mut first = true; + if let Some(path) = &self.path { + write!(f, "path={}", path.display())?; + first = false; + } + if let Some(offset) = self.offset { + if !first { + write!(f, " ")?; + } + write!(f, "offset={offset:#x}")?; + first = false; + } + if let Some(op) = self.op { + if !first { + write!(f, " ")?; + } + write!(f, "op={op}")?; + } + Ok(()) + } +} From d5467dca8e10c5940820f218b8b418fd3146783d Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 4 Mar 2026 22:29:54 +0100 Subject: [PATCH 108/742] block: Add BlockError struct with Display and Error impls Add the single public crate error type. 
It combines a BlockErrorKind for classification, an optional boxed source for the underlying cause, and an optional ErrorContext for diagnostics. Display renders the kind and context only, leaving source traversal to error reporters so the cause chain is not duplicated in human readable output. Signed-off-by: Anatol Belski --- block/src/error.rs | 46 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 40 insertions(+), 6 deletions(-) diff --git a/block/src/error.rs b/block/src/error.rs index b94ddda068..13d9716854 100644 --- a/block/src/error.rs +++ b/block/src/error.rs @@ -16,6 +16,7 @@ //! +-- io::Error / etc. //! ``` +use std::error::Error as StdError; use std::fmt::{self, Display, Formatter}; use std::path::PathBuf; @@ -46,12 +47,12 @@ impl Display for BlockErrorKind { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { match self { Self::Io => write!(f, "I/O error"), - Self::InvalidFormat => write!(f, "invalid format"), - Self::UnsupportedFeature => write!(f, "unsupported feature"), - Self::CorruptImage => write!(f, "corrupt image"), - Self::OutOfBounds => write!(f, "out of bounds"), - Self::NotFound => write!(f, "not found"), - Self::Overflow => write!(f, "overflow"), + Self::InvalidFormat => write!(f, "Invalid format"), + Self::UnsupportedFeature => write!(f, "Unsupported feature"), + Self::CorruptImage => write!(f, "Corrupt image"), + Self::OutOfBounds => write!(f, "Out of bounds"), + Self::NotFound => write!(f, "Not found"), + Self::Overflow => write!(f, "Overflow"), } } } @@ -109,3 +110,36 @@ impl Display for ErrorContext { Ok(()) } } + +/// Unified error type for the block crate. +/// +/// Pairs a stable [`BlockErrorKind`] classification with an optional +/// boxed source error (format-specific) and optional [`ErrorContext`]. +/// +/// Display renders kind + context only; the underlying cause is +/// exposed via [`std::error::Error::source()`] for reporters that +/// walk the chain. 
+#[derive(Debug)] +pub struct BlockError { + kind: BlockErrorKind, + source: Option>, + ctx: Option, +} + +impl Display for BlockError { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.kind)?; + if let Some(ctx) = &self.ctx { + write!(f, " ({ctx})")?; + } + Ok(()) + } +} + +impl StdError for BlockError { + fn source(&self) -> Option<&(dyn StdError + 'static)> { + self.source + .as_ref() + .map(|e| e.as_ref() as &(dyn StdError + 'static)) + } +} From 4c295487364d9575f1689d56432794c55e245dda Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 4 Mar 2026 22:31:15 +0100 Subject: [PATCH 109/742] block: Add BlockError constructors and builder methods Add the construction and inspection API for BlockError, consisting on constructors that accept a kind and optional source, builder methods that attach context after the fact, and accessors for retrieving the kind, context, and typed source references. The builder pattern allows callers to enrich errors at each level of the call stack. Signed-off-by: Anatol Belski --- block/src/error.rs | 76 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/block/src/error.rs b/block/src/error.rs index 13d9716854..9ea710c006 100644 --- a/block/src/error.rs +++ b/block/src/error.rs @@ -126,6 +126,82 @@ pub struct BlockError { ctx: Option, } +impl BlockError { + /// Create a new `BlockError` from a kind and a source error. + pub fn new(kind: BlockErrorKind, source: E) -> Self + where + E: StdError + Send + Sync + 'static, + { + Self { + kind, + source: Some(Box::new(source)), + ctx: None, + } + } + + /// Create a `BlockError` from just a kind, with no underlying cause. + pub fn from_kind(kind: BlockErrorKind) -> Self { + Self { + kind, + source: None, + ctx: None, + } + } + + /// Attach or replace the source error (builder-style). 
+ pub fn with_source(mut self, source: E) -> Self + where + E: StdError + Send + Sync + 'static, + { + self.source = Some(Box::new(source)); + self + } + + /// Attach diagnostic context. + pub fn with_ctx(mut self, ctx: ErrorContext) -> Self { + self.ctx = Some(ctx); + self + } + + /// Shorthand: attach an operation name. + pub fn with_op(mut self, op: ErrorOp) -> Self { + self.ctx.get_or_insert_with(ErrorContext::default).op = Some(op); + self + } + + /// Shorthand: attach a file path. + pub fn with_path(mut self, path: impl Into) -> Self { + self.ctx.get_or_insert_with(ErrorContext::default).path = Some(path.into()); + self + } + + /// Shorthand: attach a byte offset. + pub fn with_offset(mut self, offset: u64) -> Self { + self.ctx.get_or_insert_with(ErrorContext::default).offset = Some(offset); + self + } + + /// The error classification. + pub fn kind(&self) -> BlockErrorKind { + self.kind + } + + /// The diagnostic context, if any. + pub fn context(&self) -> Option<&ErrorContext> { + self.ctx.as_ref() + } + + /// Access the underlying source error, if any. + pub fn source_ref(&self) -> Option<&(dyn StdError + Send + Sync + 'static)> { + self.source.as_deref() + } + + /// Try to downcast the source to a concrete type. + pub fn downcast_ref(&self) -> Option<&T> { + self.source.as_ref()?.downcast_ref::() + } +} + impl Display for BlockError { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { write!(f, "{}", self.kind)?; From 9be154b03cd79c2debe9a089059149ef602c046b Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 5 Mar 2026 15:20:45 +0100 Subject: [PATCH 110/742] block: Add BlockResult, From Add the public BlockResult type alias and a From impl so that bare I/O errors automatically convert into BlockError with BlockErrorKind::Io via the ? operator. 
Signed-off-by: Anatol Belski --- block/src/error.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/block/src/error.rs b/block/src/error.rs index 9ea710c006..4b89bbb212 100644 --- a/block/src/error.rs +++ b/block/src/error.rs @@ -18,6 +18,7 @@ use std::error::Error as StdError; use std::fmt::{self, Display, Formatter}; +use std::io; use std::path::PathBuf; /// Small, stable classification of block errors. @@ -219,3 +220,12 @@ impl StdError for BlockError { .map(|e| e.as_ref() as &(dyn StdError + 'static)) } } + +/// Convenience: wrap an `io::Error` as `BlockErrorKind::Io`. +impl From for BlockError { + fn from(e: io::Error) -> Self { + Self::new(BlockErrorKind::Io, e) + } +} + +pub type BlockResult = Result; From 8c2794533dd69b4c77182385ae080130d8da35a1 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 11 Mar 2026 00:05:19 +0100 Subject: [PATCH 111/742] block: qcow: impl AsFd for RawFile and QcowRawFile Implement AsFd for both RawFile and QcowRawFile by delegating to the inner File handle. This enables safe fd borrowing through the standard AsFd trait, which is a prerequisite for replacing unsafe libc::dup calls with BorrowedFd::try_clone_to_owned(). 
Suggested-by: Rob Bradford Signed-off-by: Anatol Belski --- block/src/qcow/qcow_raw_file.rs | 8 +++++++- block/src/qcow/raw_file.rs | 7 +++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/block/src/qcow/qcow_raw_file.rs b/block/src/qcow/qcow_raw_file.rs index 92a569d347..232f6b5a5c 100644 --- a/block/src/qcow/qcow_raw_file.rs +++ b/block/src/qcow/qcow_raw_file.rs @@ -7,7 +7,7 @@ use std::fmt::Debug; use std::io::{self, BufWriter, Read, Seek, SeekFrom, Write}; use std::mem::size_of; -use std::os::fd::{AsRawFd, RawFd}; +use std::os::fd::{AsFd, AsRawFd, BorrowedFd, RawFd}; use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use vmm_sys_util::write_zeroes::WriteZeroes; @@ -361,3 +361,9 @@ impl AsRawFd for QcowRawFile { self.file.as_raw_fd() } } + +impl AsFd for QcowRawFile { + fn as_fd(&self) -> BorrowedFd<'_> { + self.file.as_fd() + } +} diff --git a/block/src/qcow/raw_file.rs b/block/src/qcow/raw_file.rs index eda7751c3f..06ec4975f4 100644 --- a/block/src/qcow/raw_file.rs +++ b/block/src/qcow/raw_file.rs @@ -11,6 +11,7 @@ use std::alloc::{Layout, alloc_zeroed, dealloc}; use std::fs::{File, Metadata}; use std::io::{self, Read, Seek, SeekFrom, Write}; +use std::os::fd::{AsFd, BorrowedFd}; use std::os::unix::io::{AsRawFd, RawFd}; use std::slice; @@ -397,3 +398,9 @@ impl AsRawFd for RawFile { self.file.as_raw_fd() } } + +impl AsFd for RawFile { + fn as_fd(&self) -> BorrowedFd<'_> { + self.file.as_fd() + } +} From 58bdfaee3ab689e9eb624c4355fab2091c17774f Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 5 Mar 2026 15:21:06 +0100 Subject: [PATCH 112/742] block: qcow: QcowDiskSync returns BlockResult with path context Change QcowDiskSync::new() to return BlockResult instead of qcow::Result, mapping format specific errors to the appropriate BlockErrorKind at the crate boundary. The vmm caller attaches the disk image path to the error so failures identify which file was being opened. 
Signed-off-by: Anatol Belski --- block/src/qcow_sync.rs | 58 +++++++++++++++++++++++---------------- vmm/src/device_manager.rs | 7 ++++- 2 files changed, 41 insertions(+), 24 deletions(-) diff --git a/block/src/qcow_sync.rs b/block/src/qcow_sync.rs index 0aeaaef44d..d2f17b0599 100644 --- a/block/src/qcow_sync.rs +++ b/block/src/qcow_sync.rs @@ -5,7 +5,7 @@ use std::cmp::min; use std::collections::VecDeque; use std::fs::File; -use std::os::fd::{AsRawFd, FromRawFd, OwnedFd, RawFd}; +use std::os::fd::{AsFd, AsRawFd, BorrowedFd, OwnedFd, RawFd}; use std::sync::Arc; use std::{io, ptr, slice}; @@ -15,13 +15,13 @@ use vmm_sys_util::write_zeroes::{PunchHole, WriteZeroesAt}; use crate::async_io::{ AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFile, DiskFileError, DiskFileResult, }; +use crate::error::{BlockError, BlockErrorKind, BlockResult, ErrorOp}; use crate::qcow::metadata::{ BackingRead, ClusterReadMapping, ClusterWriteMapping, DeallocAction, QcowMetadata, }; use crate::qcow::qcow_raw_file::QcowRawFile; use crate::qcow::{ - BackingFile, BackingKind, Error as QcowError, MAX_NESTING_DEPTH, RawFile, Result as QcowResult, - parse_qcow, + BackingFile, BackingKind, Error as QcowError, MAX_NESTING_DEPTH, RawFile, parse_qcow, }; /// Raw backing file using pread64 on a duplicated fd. @@ -139,27 +139,23 @@ impl Drop for Qcow2MetadataBacking { } /// Construct a thread safe backing file reader. -fn shared_backing_from(bf: BackingFile) -> QcowResult> { +fn shared_backing_from(bf: BackingFile) -> BlockResult> { let (kind, virtual_size) = bf.into_kind(); + + let dup_fd = |fd: BorrowedFd<'_>| -> BlockResult { + fd.try_clone_to_owned().map_err(|e| { + BlockError::new(BlockErrorKind::Io, QcowError::BackingFileIo(e)) + .with_op(ErrorOp::DupBackingFd) + }) + }; + match kind { BackingKind::Raw(raw_file) => { - // SAFETY: raw_file holds a valid open fd. 
- let dup_fd = unsafe { libc::dup(raw_file.as_raw_fd()) }; - if dup_fd < 0 { - return Err(QcowError::BackingFileIo(io::Error::last_os_error())); - } - // SAFETY: dup_fd is a freshly duplicated valid fd. - let fd = unsafe { OwnedFd::from_raw_fd(dup_fd) }; + let fd = dup_fd(raw_file.as_fd())?; Ok(Arc::new(RawBacking { fd, virtual_size })) } BackingKind::Qcow { inner, backing } => { - // SAFETY: inner.raw_file holds a valid open fd. - let dup_fd = unsafe { libc::dup(inner.raw_file.as_raw_fd()) }; - if dup_fd < 0 { - return Err(QcowError::BackingFileIo(io::Error::last_os_error())); - } - // SAFETY: dup_fd is a freshly duplicated valid fd. - let data_fd = unsafe { OwnedFd::from_raw_fd(dup_fd) }; + let data_fd = dup_fd(inner.raw_file.as_fd())?; Ok(Arc::new(Qcow2MetadataBacking { metadata: Arc::new(QcowMetadata::new(*inner)), data_fd, @@ -182,17 +178,33 @@ pub struct QcowDiskSync { } impl QcowDiskSync { - pub fn new(file: File, direct_io: bool, backing_files: bool, sparse: bool) -> QcowResult { + pub fn new( + file: File, + direct_io: bool, + backing_files: bool, + sparse: bool, + ) -> BlockResult { let max_nesting_depth = if backing_files { MAX_NESTING_DEPTH } else { 0 }; let (inner, backing_file, sparse) = - parse_qcow(RawFile::new(file, direct_io), max_nesting_depth, sparse).map_err(|e| { - match e { + parse_qcow(RawFile::new(file, direct_io), max_nesting_depth, sparse) + .map_err(|e| match e { QcowError::MaxNestingDepthExceeded if !backing_files => { QcowError::BackingFilesDisabled } other => other, - } - })?; + }) + .map_err(|e| { + let kind = match &e { + QcowError::InvalidMagic | QcowError::UnsupportedVersion(_) => { + BlockErrorKind::InvalidFormat + } + QcowError::UnsupportedFeature(_) | QcowError::BackingFilesDisabled => { + BlockErrorKind::UnsupportedFeature + } + _ => BlockErrorKind::Io, + }; + BlockError::new(kind, e).with_op(ErrorOp::Open) + })?; let data_raw_file = inner.raw_file.clone(); Ok(QcowDiskSync { metadata: Arc::new(QcowMetadata::new(inner)), diff 
--git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index c3c5618bda..aa02f479eb 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -33,6 +33,7 @@ use arch::layout::{APIC_START, IOAPIC_SIZE, IOAPIC_START}; use arch::{DeviceType, MmioDeviceInfo}; use arch::{NumaNodes, layout}; use block::async_io::DiskFile; +use block::error::BlockError; use block::fixed_vhd_sync::FixedVhdDiskSync; use block::qcow_sync::QcowDiskSync; use block::raw_async_aio::RawFileDiskAio; @@ -575,7 +576,7 @@ pub enum DeviceManagerError { /// Failed to create QcowDiskSync #[error("Failed to create QcowDiskSync")] - CreateQcowDiskSync(#[source] qcow::Error), + CreateQcowDiskSync(#[source] BlockError), /// Failed to create FixedVhdxDiskSync #[error("Failed to create FixedVhdxDiskSync")] @@ -2776,6 +2777,10 @@ impl DeviceManager { disk_cfg.backing_files, disk_cfg.sparse, ) + .map_err(|e| match &disk_cfg.path { + Some(p) => e.with_path(p), + None => e, + }) .map_err(DeviceManagerError::CreateQcowDiskSync)?, ) as Box } From b1bc376c9145666c24bec432b93db85b8fee7240 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 5 Mar 2026 16:03:19 +0100 Subject: [PATCH 113/742] block: Make detect_image_type return BlockResult with context Convert detect_image_type() from io::Result to BlockResult so that I/O failures carry the operation name in the error context. Update the corresponding vmm error variant to wrap BlockError. 
Signed-off-by: Anatol Belski --- block/src/lib.rs | 10 +++++++--- vmm/src/device_manager.rs | 4 ++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/block/src/lib.rs b/block/src/lib.rs index 1b739beafc..6507076a74 100644 --- a/block/src/lib.rs +++ b/block/src/lib.rs @@ -59,6 +59,7 @@ use vmm_sys_util::eventfd::EventFd; use vmm_sys_util::{aio, ioctl_io_nr}; use crate::async_io::{AsyncIo, AsyncIoError, AsyncIoResult}; +use crate::error::{BlockError, BlockErrorKind, BlockResult, ErrorOp}; use crate::vhdx::VhdxError; const SECTOR_SHIFT: u8 = 9; @@ -1065,13 +1066,16 @@ pub fn read_aligned_block_size(f: &mut File) -> std::io::Result> { } /// Determine image type through file parsing. -pub fn detect_image_type(f: &mut File) -> std::io::Result { - let block = read_aligned_block_size(f)?; +pub fn detect_image_type(f: &mut File) -> BlockResult { + let block = read_aligned_block_size(f) + .map_err(|e| BlockError::new(BlockErrorKind::Io, e).with_op(ErrorOp::DetectImageType))?; // Check 4 first bytes to get the header value and determine the image type let image_type = if u32::from_be_bytes(block[0..4].try_into().unwrap()) == QCOW_MAGIC { ImageType::Qcow2 - } else if vhd::is_fixed_vhd(f)? { + } else if vhd::is_fixed_vhd(f) + .map_err(|e| BlockError::new(BlockErrorKind::Io, e).with_op(ErrorOp::DetectImageType))? 
+ { ImageType::FixedVhd } else if u64::from_le_bytes(block[0..8].try_into().unwrap()) == VHDX_SIGN { ImageType::Vhdx diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index aa02f479eb..3250843951 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -41,7 +41,7 @@ use block::raw_sync::RawFileDiskSync; use block::vhdx_sync::VhdxDiskSync; use block::{ ImageType, block_aio_is_supported, block_io_uring_is_supported, detect_image_type, - preallocate_disk, qcow, vhdx, + preallocate_disk, vhdx, }; #[cfg(feature = "io_uring")] use block::{fixed_vhd_async::FixedVhdDiskAsync, raw_async::RawFileDisk}; @@ -266,7 +266,7 @@ pub enum DeviceManagerError { /// Failed to parse disk image format #[error("Failed to parse disk image format")] - DetectImageType(#[source] io::Error), + DetectImageType(#[source] BlockError), /// Cannot create serial manager #[error("Cannot create serial manager")] From 2bcbe25539d2d329fe359a364c6b8805ec243f50 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 5 Mar 2026 16:57:20 +0100 Subject: [PATCH 114/742] block: qcow: Add backing file path to qcow error context Extend the BackingFileIo and BackingFileOpen variants of qcow::Error with a path field so that backing file failures report which file was involved. The path is populated from the backing file configuration. 
Signed-off-by: Anatol Belski --- block/src/qcow/mod.rs | 18 ++++++++++-------- block/src/qcow_sync.rs | 7 +++++-- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/block/src/qcow/mod.rs b/block/src/qcow/mod.rs index c0b4e8c720..9c9d561527 100644 --- a/block/src/qcow/mod.rs +++ b/block/src/qcow/mod.rs @@ -59,10 +59,10 @@ use crate::qcow::vec_cache::{CacheMap, Cacheable, VecCache}; #[sorted] #[derive(Debug, Error)] pub enum Error { - #[error("Backing file io error")] - BackingFileIo(#[source] io::Error), - #[error("Backing file open error")] - BackingFileOpen(#[source] Box), + #[error("Backing file I/O error: {0}")] + BackingFileIo(String /* path */, #[source] io::Error), + #[error("Backing file open error: {0}")] + BackingFileOpen(String /* path */, #[source] Box), #[error("Backing file support is disabled")] BackingFilesDisabled, #[error("Backing file name is too long: {0} bytes over")] @@ -201,7 +201,7 @@ impl BackingFile { let backing_raw_file = OpenOptions::new() .read(true) .open(&config.path) - .map_err(Error::BackingFileIo)?; + .map_err(|e| Error::BackingFileIo(config.path.clone(), e))?; let mut raw_file = RawFile::new(backing_raw_file, direct_io); @@ -215,14 +215,16 @@ impl BackingFile { ImageType::Raw => { let size = raw_file .seek(SeekFrom::End(0)) - .map_err(Error::BackingFileIo)?; - raw_file.rewind().map_err(Error::BackingFileIo)?; + .map_err(|e| Error::BackingFileIo(config.path.clone(), e))?; + raw_file + .rewind() + .map_err(|e| Error::BackingFileIo(config.path.clone(), e))?; (BackingKind::Raw(raw_file), size) } ImageType::Qcow2 => { let (inner, nested_backing, _sparse) = parse_qcow(raw_file, max_nesting_depth - 1, sparse) - .map_err(|e| Error::BackingFileOpen(Box::new(e)))?; + .map_err(|e| Error::BackingFileOpen(config.path.clone(), Box::new(e)))?; let size = inner.header.size; ( BackingKind::Qcow { diff --git a/block/src/qcow_sync.rs b/block/src/qcow_sync.rs index d2f17b0599..e1ad08f4ab 100644 --- a/block/src/qcow_sync.rs +++ 
b/block/src/qcow_sync.rs @@ -144,8 +144,11 @@ fn shared_backing_from(bf: BackingFile) -> BlockResult> { let dup_fd = |fd: BorrowedFd<'_>| -> BlockResult { fd.try_clone_to_owned().map_err(|e| { - BlockError::new(BlockErrorKind::Io, QcowError::BackingFileIo(e)) - .with_op(ErrorOp::DupBackingFd) + BlockError::new( + BlockErrorKind::Io, + QcowError::BackingFileIo(String::new(), e), + ) + .with_op(ErrorOp::DupBackingFd) }) }; From 4fea912d181d41da3699bebb94f62536dc063ad8 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 5 Mar 2026 18:26:54 +0100 Subject: [PATCH 115/742] block: qcow: Add open_disk_image helper with path context Add a small helper in the block crate that opens a disk image file and wraps any failure in a BlockError carrying the file path and operation context. Use it from the vmm device manager so that a failed open now reports which path couldn't be opened. Signed-off-by: Anatol Belski --- block/src/lib.rs | 12 +++++++++++- vmm/src/device_manager.rs | 19 ++++++++----------- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/block/src/lib.rs b/block/src/lib.rs index 6507076a74..f477cd36c6 100644 --- a/block/src/lib.rs +++ b/block/src/lib.rs @@ -32,7 +32,7 @@ pub mod vhdx_sync; use std::alloc::{Layout, alloc_zeroed, dealloc}; use std::collections::VecDeque; use std::fmt::{self, Debug}; -use std::fs::File; +use std::fs::{File, OpenOptions}; use std::io::{self, IoSlice, IoSliceMut, Read, Seek, SeekFrom, Write}; use std::os::linux::fs::MetadataExt; use std::os::unix::io::AsRawFd; @@ -1065,6 +1065,16 @@ pub fn read_aligned_block_size(f: &mut File) -> std::io::Result> { Ok(data) } +/// Open a disk image file, returning a [`BlockError`] with path context +/// on failure. +pub fn open_disk_image(path: &Path, options: &OpenOptions) -> BlockResult { + options.open(path).map_err(|e| { + BlockError::new(BlockErrorKind::Io, e) + .with_op(ErrorOp::Open) + .with_path(path) + }) +} + /// Determine image type through file parsing. 
pub fn detect_image_type(f: &mut File) -> BlockResult { let block = read_aligned_block_size(f) diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 3250843951..04b9e3b44b 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -41,7 +41,7 @@ use block::raw_sync::RawFileDiskSync; use block::vhdx_sync::VhdxDiskSync; use block::{ ImageType, block_aio_is_supported, block_io_uring_is_supported, detect_image_type, - preallocate_disk, vhdx, + open_disk_image, preallocate_disk, vhdx, }; #[cfg(feature = "io_uring")] use block::{fixed_vhd_async::FixedVhdDiskAsync, raw_async::RawFileDisk}; @@ -177,7 +177,7 @@ pub enum DeviceManagerError { /// Cannot open disk path #[error("Cannot open disk path")] - Disk(#[source] io::Error), + Disk(#[source] BlockError), /// Cannot create vhost-user-net device #[error("Cannot create vhost-user-net device")] @@ -2663,15 +2663,12 @@ impl DeviceManager { options.custom_flags(libc::O_DIRECT); } // Open block device path - let mut file: File = options - .open( - disk_cfg - .path - .as_ref() - .ok_or(DeviceManagerError::NoDiskPath)? - .clone(), - ) - .map_err(DeviceManagerError::Disk)?; + let disk_path = disk_cfg + .path + .as_ref() + .ok_or(DeviceManagerError::NoDiskPath)?; + let mut file: File = + open_disk_image(disk_path, &options).map_err(DeviceManagerError::Disk)?; let detected_image_type = detect_image_type(&mut file).map_err(DeviceManagerError::DetectImageType)?; From 732cddb8b33c63edb979258b718d1a38187406ef Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Mon, 9 Mar 2026 23:07:10 +0100 Subject: [PATCH 116/742] block: qcow: Migrate dirty/corrupt bit helpers to BlockResult Switch the header dirty and corrupt bit helpers from qcow::Result to BlockResult. Their callers either discard the result or unwrap in tests, so no caller signatures change. A map_err bridge in parse_qcow() converts back where needed. 
Signed-off-by: Anatol Belski --- block/src/qcow/header.rs | 18 +++++++++++------- block/src/qcow/mod.rs | 4 +++- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/block/src/qcow/header.rs b/block/src/qcow/header.rs index 4549668418..22a5492b19 100644 --- a/block/src/qcow/header.rs +++ b/block/src/qcow/header.rs @@ -20,6 +20,7 @@ use super::decoder::{Decoder, ZlibDecoder, ZstdDecoder}; use super::qcow_raw_file::BeUint; use super::raw_file::RawFile; use super::{Error, Result, div_round_up_u32, div_round_up_u64}; +use crate::error::{BlockError, BlockErrorKind, BlockResult}; #[derive(Copy, Clone, Debug, PartialEq, Eq)] pub enum ImageType { @@ -511,13 +512,14 @@ impl QcowHeader { } /// Write only the incompatible_features field to the file at its fixed offset. - fn write_incompatible_features(&self, file: &mut F) -> Result<()> { + fn write_incompatible_features(&self, file: &mut F) -> BlockResult<()> { if self.version != 3 { return Ok(()); } file.seek(SeekFrom::Start(V2_BARE_HEADER_SIZE as u64)) - .map_err(Error::WritingHeader)?; - u64::write_be(file, self.incompatible_features).map_err(Error::WritingHeader)?; + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::WritingHeader(e)))?; + u64::write_be(file, self.incompatible_features) + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::WritingHeader(e)))?; Ok(()) } @@ -529,7 +531,7 @@ impl QcowHeader { &mut self, file: &mut F, dirty: bool, - ) -> Result<()> { + ) -> BlockResult<()> { if self.version == 3 { if dirty { self.incompatible_features |= IncompatFeatures::DIRTY.bits(); @@ -537,7 +539,8 @@ impl QcowHeader { self.incompatible_features &= !IncompatFeatures::DIRTY.bits(); } self.write_incompatible_features(file)?; - file.fsync().map_err(Error::SyncingHeader)?; + file.fsync() + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SyncingHeader(e)))?; } Ok(()) } @@ -546,11 +549,12 @@ impl QcowHeader { /// /// This marks the image as corrupted. 
Once set, the image can only be /// opened read-only until repaired. - pub fn set_corrupt_bit(&mut self, file: &mut F) -> Result<()> { + pub fn set_corrupt_bit(&mut self, file: &mut F) -> BlockResult<()> { if self.version == 3 { self.incompatible_features |= IncompatFeatures::CORRUPT.bits(); self.write_incompatible_features(file)?; - file.fsync().map_err(Error::SyncingHeader)?; + file.fsync() + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SyncingHeader(e)))?; } Ok(()) } diff --git a/block/src/qcow/mod.rs b/block/src/qcow/mod.rs index 9c9d561527..af118d6f7a 100644 --- a/block/src/qcow/mod.rs +++ b/block/src/qcow/mod.rs @@ -508,7 +508,9 @@ pub(crate) fn parse_qcow( if !IncompatFeatures::from_bits_truncate(header.incompatible_features) .contains(IncompatFeatures::DIRTY) { - header.set_dirty_bit(raw_file.file_mut(), true)?; + header + .set_dirty_bit(raw_file.file_mut(), true) + .map_err(|e| Error::WritingHeader(io::Error::other(e)))?; } header.clear_autoclear_features(raw_file.file_mut())?; From c59c5687d26e5979bc8225a5e009592f99364339 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Mon, 9 Mar 2026 23:15:05 +0100 Subject: [PATCH 117/742] block: qcow: Migrate convert_copy() to BlockResult Switch convert_copy() to BlockResult, preserving the original qcow::Error variants as the BlockError source for diagnostics. A map_err bridge at the caller converts back where needed. 
Signed-off-by: Anatol Belski --- block/src/qcow/mod.rs | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/block/src/qcow/mod.rs b/block/src/qcow/mod.rs index af118d6f7a..2f6ddf4416 100644 --- a/block/src/qcow/mod.rs +++ b/block/src/qcow/mod.rs @@ -51,6 +51,7 @@ use vmm_sys_util::seek_hole::SeekHole; use vmm_sys_util::write_zeroes::{PunchHole, WriteZeroesAt}; use crate::BlockBackend; +use crate::error::{BlockError, BlockErrorKind, BlockResult}; use crate::qcow::qcow_raw_file::{BeUint, QcowRawFile}; pub use crate::qcow::raw_file::RawFile; use crate::qcow::refcount::RefCount; @@ -2028,7 +2029,7 @@ impl BlockBackend for QcowFile { } } -fn convert_copy(reader: &mut R, writer: &mut W, offset: u64, size: u64) -> Result<()> +fn convert_copy(reader: &mut R, writer: &mut W, offset: u64, size: u64) -> BlockResult<()> where R: Read + Seek, W: Write + Seek, @@ -2038,16 +2039,18 @@ where let mut read_count = 0; reader .seek(SeekFrom::Start(offset)) - .map_err(Error::SeekingFile)?; + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SeekingFile(e)))?; writer .seek(SeekFrom::Start(offset)) - .map_err(Error::SeekingFile)?; + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SeekingFile(e)))?; loop { let this_count = min(CHUNK_SIZE as u64, size - read_count) as usize; let nread = reader .read(&mut buf[..this_count]) - .map_err(Error::ReadingData)?; - writer.write(&buf[..nread]).map_err(Error::WritingData)?; + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::ReadingData(e)))?; + writer + .write(&buf[..nread]) + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::WritingData(e)))?; read_count += nread as u64; if nread == 0 || read_count == size { break; @@ -2081,7 +2084,8 @@ where } }; let count = next_hole - next_data; - convert_copy(reader, writer, next_data, count)?; + convert_copy(reader, writer, next_data, count) + .map_err(|e| Error::ReadingData(io::Error::other(e)))?; offset = next_hole; } From 
748666fe4d642f17eeaaa36f56c9f252c600d5c0 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Mon, 9 Mar 2026 23:18:34 +0100 Subject: [PATCH 118/742] block: qcow: Migrate convert_reader_writer() to BlockResult Switch convert_reader_writer() to BlockResult, preserving the original qcow::Error variants as the BlockError source. The inner convert_copy() call now propagates BlockResult naturally. Callers get map_err bridges where they still return qcow::Error. Signed-off-by: Anatol Belski --- block/src/qcow/mod.rs | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/block/src/qcow/mod.rs b/block/src/qcow/mod.rs index 2f6ddf4416..b00523da8d 100644 --- a/block/src/qcow/mod.rs +++ b/block/src/qcow/mod.rs @@ -2060,7 +2060,7 @@ where Ok(()) } -fn convert_reader_writer(reader: &mut R, writer: &mut W, size: u64) -> Result<()> +fn convert_reader_writer(reader: &mut R, writer: &mut W, size: u64) -> BlockResult<()> where R: Read + Seek + SeekHole, W: Write + Seek, @@ -2068,24 +2068,32 @@ where let mut offset = 0; while offset < size { // Find the next range of data. - let next_data = match reader.seek_data(offset).map_err(Error::SeekingFile)? { + let next_data = match reader + .seek_data(offset) + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SeekingFile(e)))? + { Some(o) => o, None => { // No more data in the file. break; } }; - let next_hole = match reader.seek_hole(next_data).map_err(Error::SeekingFile)? { + let next_hole = match reader + .seek_hole(next_data) + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SeekingFile(e)))? + { Some(o) => o, None => { // This should not happen - there should always be at least one hole // after any data. 
- return Err(Error::SeekingFile(io::Error::from_raw_os_error(EINVAL))); + return Err(BlockError::new( + BlockErrorKind::Io, + Error::SeekingFile(io::Error::from_raw_os_error(EINVAL)), + )); } }; let count = next_hole - next_data; - convert_copy(reader, writer, next_data, count) - .map_err(|e| Error::ReadingData(io::Error::other(e)))?; + convert_copy(reader, writer, next_data, count)?; offset = next_hole; } @@ -2106,6 +2114,7 @@ where ImageType::Qcow2 => { let mut dst_writer = QcowFile::new(dst_file, 3, src_size, true)?; convert_reader_writer(reader, &mut dst_writer, src_size) + .map_err(|e| Error::WritingData(io::Error::other(e))) } ImageType::Raw => { let mut dst_writer = dst_file; @@ -2115,6 +2124,7 @@ where .set_len(src_size) .map_err(Error::SettingFileSize)?; convert_reader_writer(reader, &mut dst_writer, src_size) + .map_err(|e| Error::WritingData(io::Error::other(e))) } } } From 4930d93090ff8ab816159819960b83e88b1f156c Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Mon, 9 Mar 2026 23:19:50 +0100 Subject: [PATCH 119/742] block: qcow: Migrate convert_reader() to BlockResult Switch convert_reader() to BlockResult, preserving the original qcow::Error variants as the BlockError source. The inner convert_reader_writer() call now propagates naturally. Callers get map_err bridges where they still return qcow::Error. 
Signed-off-by: Anatol Belski --- block/src/qcow/mod.rs | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/block/src/qcow/mod.rs b/block/src/qcow/mod.rs index b00523da8d..974e6ec410 100644 --- a/block/src/qcow/mod.rs +++ b/block/src/qcow/mod.rs @@ -2100,21 +2100,27 @@ where Ok(()) } -fn convert_reader(reader: &mut R, dst_file: RawFile, dst_type: ImageType) -> Result<()> +fn convert_reader(reader: &mut R, dst_file: RawFile, dst_type: ImageType) -> BlockResult<()> where R: Read + Seek + SeekHole, { - let src_size = reader.seek(SeekFrom::End(0)).map_err(Error::SeekingFile)?; - reader.rewind().map_err(Error::SeekingFile)?; + let src_size = reader + .seek(SeekFrom::End(0)) + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SeekingFile(e)))?; + reader + .rewind() + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SeekingFile(e)))?; // Ensure the destination file is empty before writing to it. - dst_file.set_len(0).map_err(Error::SettingFileSize)?; + dst_file + .set_len(0) + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SettingFileSize(e)))?; match dst_type { ImageType::Qcow2 => { - let mut dst_writer = QcowFile::new(dst_file, 3, src_size, true)?; + let mut dst_writer = QcowFile::new(dst_file, 3, src_size, true) + .map_err(|e| BlockError::new(BlockErrorKind::Io, e))?; convert_reader_writer(reader, &mut dst_writer, src_size) - .map_err(|e| Error::WritingData(io::Error::other(e))) } ImageType::Raw => { let mut dst_writer = dst_file; @@ -2122,9 +2128,8 @@ where // of the desired size. 
dst_writer .set_len(src_size) - .map_err(Error::SettingFileSize)?; + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SettingFileSize(e)))?; convert_reader_writer(reader, &mut dst_writer, src_size) - .map_err(|e| Error::WritingData(io::Error::other(e))) } } } @@ -2144,11 +2149,13 @@ pub fn convert( let mut src_reader = QcowFile::from_with_nesting_depth(src_file, src_max_nesting_depth, true)?; convert_reader(&mut src_reader, dst_file, dst_type) + .map_err(|e| Error::ReadingData(io::Error::other(e))) } ImageType::Raw => { // src_file is a raw file. let mut src_reader = src_file; convert_reader(&mut src_reader, dst_file, dst_type) + .map_err(|e| Error::ReadingData(io::Error::other(e))) } } } From 1accf47db4510ae9345b62b83b0d229fb9b5f2d0 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Mon, 9 Mar 2026 23:22:38 +0100 Subject: [PATCH 120/742] block: qcow: Migrate convert() to BlockResult Switch the public convert() entry point to BlockResult. Inner calls to functions already returning BlockResult propagate naturally; those still returning qcow::Error get map_err bridges. 
Signed-off-by: Anatol Belski --- block/src/qcow/mod.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/block/src/qcow/mod.rs b/block/src/qcow/mod.rs index 974e6ec410..a5714badd7 100644 --- a/block/src/qcow/mod.rs +++ b/block/src/qcow/mod.rs @@ -2142,20 +2142,20 @@ pub fn convert( dst_file: RawFile, dst_type: ImageType, src_max_nesting_depth: u32, -) -> Result<()> { - let src_type = detect_image_type(&mut src_file)?; +) -> BlockResult<()> { + let src_type = + detect_image_type(&mut src_file).map_err(|e| BlockError::new(BlockErrorKind::Io, e))?; match src_type { ImageType::Qcow2 => { let mut src_reader = - QcowFile::from_with_nesting_depth(src_file, src_max_nesting_depth, true)?; + QcowFile::from_with_nesting_depth(src_file, src_max_nesting_depth, true) + .map_err(|e| BlockError::new(BlockErrorKind::Io, e))?; convert_reader(&mut src_reader, dst_file, dst_type) - .map_err(|e| Error::ReadingData(io::Error::other(e))) } ImageType::Raw => { // src_file is a raw file. let mut src_reader = src_file; convert_reader(&mut src_reader, dst_file, dst_type) - .map_err(|e| Error::ReadingData(io::Error::other(e))) } } } From 32d339c59edd90566ca0c36fe2cadc350f7b0ad0 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Mon, 9 Mar 2026 23:51:32 +0100 Subject: [PATCH 121/742] block: qcow: Migrate debug/test helpers to BlockResult Switch l2_table(), refcount_block(), and first_zero_refcount() to BlockResult. These are public inspection helpers with no callers within the crate. Signed-off-by: Anatol Belski --- block/src/qcow/mod.rs | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/block/src/qcow/mod.rs b/block/src/qcow/mod.rs index a5714badd7..0ed4cd858d 100644 --- a/block/src/qcow/mod.rs +++ b/block/src/qcow/mod.rs @@ -678,8 +678,11 @@ impl QcowFile { } /// Returns an L2_table of cluster addresses, only used for debugging. 
- pub fn l2_table(&mut self, l1_index: usize) -> Result> { - let l2_addr_disk = *self.l1_table.get(l1_index).ok_or(Error::InvalidIndex)?; + pub fn l2_table(&mut self, l1_index: usize) -> BlockResult> { + let l2_addr_disk = *self + .l1_table + .get(l1_index) + .ok_or_else(|| BlockError::new(BlockErrorKind::OutOfBounds, Error::InvalidIndex))?; if l2_addr_disk == 0 { // Reading from an unallocated cluster will return zeros. @@ -690,7 +693,7 @@ impl QcowFile { // Not in the cache. let table = VecCache::from_vec( Self::read_l2_cluster(&mut self.raw_file, l2_addr_disk) - .map_err(Error::ReadingPointers)?, + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::ReadingPointers(e)))?, ); let l1_table = &self.l1_table; let raw_file = &mut self.raw_file; @@ -698,7 +701,7 @@ impl QcowFile { .insert(l1_index, table, |index, evicted| { raw_file.write_pointer_table_direct(l1_table[index], evicted.iter()) }) - .map_err(Error::EvictingCache)?; + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::EvictingCache(e)))?; } // The index must exist as it was just inserted if it didn't already. @@ -711,19 +714,19 @@ impl QcowFile { } /// Returns the `index`th refcount block from the file. - pub fn refcount_block(&mut self, index: usize) -> Result> { + pub fn refcount_block(&mut self, index: usize) -> BlockResult> { self.refcounts .refcount_block(&mut self.raw_file, index) - .map_err(Error::ReadingRefCountBlock) + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::ReadingRefCountBlock(e))) } /// Returns the first cluster in the file with a 0 refcount. Used for testing. - pub fn first_zero_refcount(&mut self) -> Result> { + pub fn first_zero_refcount(&mut self) -> BlockResult> { let file_size = self .raw_file .file_mut() .metadata() - .map_err(Error::GettingFileSize)? + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::GettingFileSize(e)))? 
.len(); let cluster_size = 0x01u64 << self.header.cluster_bits; @@ -732,7 +735,7 @@ impl QcowFile { let cluster_refcount = self .refcounts .get_cluster_refcount(&mut self.raw_file, cluster_addr) - .map_err(Error::GettingRefcount)?; + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::GettingRefcount(e)))?; if cluster_refcount == 0 { return Ok(Some(cluster_addr)); } From 84b8d25bb631b5dc2a0c9788918625498f6e9f5e Mon Sep 17 00:00:00 2001 From: Julian Schindel Date: Thu, 12 Mar 2026 16:24:31 +0100 Subject: [PATCH 122/742] vm-migration: fix UB in `MemoryRangeTable::read_from` The pointer created by `Vec::as_ptr` may not be used for mutation of the underlying data [0]. This PR switches to `Vec::as_mut_ptr` and uses `cast` to avoid mutability changes when casting. Also improves safety reasoning, separates the unsafe call from the call to `read_exact` to improve clarity and simplifies the vector creation. [0]: https://doc.rust-lang.org/alloc/vec/struct.Vec.html#method.as_ptr On-behalf-of: SAP julian.schindel@sap.com Signed-off-by: Julian Schindel --- vm-migration/src/protocol.rs | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/vm-migration/src/protocol.rs b/vm-migration/src/protocol.rs index 3ae226ece2..4dfec4f625 100644 --- a/vm-migration/src/protocol.rs +++ b/vm-migration/src/protocol.rs @@ -332,19 +332,19 @@ impl MemoryRangeTable { pub fn read_from(fd: &mut dyn Read, length: u64) -> Result { assert!((length as usize).is_multiple_of(size_of::())); - let mut data: Vec = Vec::new(); - data.resize_with( - length as usize / (std::mem::size_of::()), - Default::default, - ); - // SAFETY: the slice is constructed with the correct arguments - fd.read_exact(unsafe { - std::slice::from_raw_parts_mut( - data.as_ptr() as *mut MemoryRange as *mut u8, - length as usize, - ) - }) - .map_err(MigratableError::MigrateSocket)?; + let mut data: Vec = + vec![MemoryRange::default(); length as usize / size_of::()]; + + // SAFETY: The pointer 
points to the just created vector data. + // `MemoryRange` can be read from and written to bytes since it's `[repr(C)]`. + // The vector data was initialized with `length as usize / size_of::()` valid + // `MemoryRange`s so the memory is valid for `length` bytes. + // During the lifetime of the slice, neither the backing vector nor the pointed to memory are accessed. + let data_slice_bytes = + unsafe { std::slice::from_raw_parts_mut(data.as_mut_ptr().cast(), length as usize) }; + + fd.read_exact(data_slice_bytes) + .map_err(MigratableError::MigrateSocket)?; Ok(Self { data }) } From 2df41986b792fc102cf1542ade5335aaeb3bd06c Mon Sep 17 00:00:00 2001 From: Sebastian Walz Date: Tue, 3 Mar 2026 18:19:40 +0100 Subject: [PATCH 123/742] main: add `.action(ArgAction::Append)` to all `.num_args(1..)` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With `.num_args(1..)`, multiple values can be specified for a CLI option, but the option cannot be specified more than once. In my experience, it’s more common to specify flags with a single argument multiple times to specify multiple arguments. One might thus expect to call cloud-hypervisor with e.g. `--disk path=foo --disk path==bar`. With this commit, both `--disk path=foo path=bar path=baz` and `--disk path=foo -disk path=bar path=baz` (note: combinations as well) are allowed. Signed-off-by: Sebastian Walz --- cloud-hypervisor/src/main.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/cloud-hypervisor/src/main.rs b/cloud-hypervisor/src/main.rs index 6680483d6d..9966711820 100644 --- a/cloud-hypervisor/src/main.rs +++ b/cloud-hypervisor/src/main.rs @@ -246,11 +246,13 @@ fn get_cli_options_sorted( .long("device") .help(DeviceConfig::SYNTAX) .num_args(1..) + .action(ArgAction::Append) .group("vm-config"), Arg::new("disk") .long("disk") .help(DiskConfig::SYNTAX) .num_args(1..) 
+ .action(ArgAction::Append) .group("vm-config"), Arg::new("event-monitor") .long("event-monitor") @@ -266,6 +268,7 @@ fn get_cli_options_sorted( .long("fs") .help(FsConfig::SYNTAX) .num_args(1..) + .action(ArgAction::Append) .group("vm-config"), #[cfg(feature = "fw_cfg")] Arg::new("fw-cfg-config") @@ -283,6 +286,7 @@ fn get_cli_options_sorted( .long("generic-vhost-user") .help(GenericVhostUserConfig::SYNTAX) .num_args(1..) + .action(ArgAction::Append) .group("vm-config"), #[cfg(feature = "igvm")] Arg::new("igvm") @@ -328,6 +332,7 @@ fn get_cli_options_sorted( .long("landlock-rules") .help(LandlockConfig::SYNTAX) .num_args(1..) + .action(ArgAction::Append) .group("vm-config"), Arg::new("log-file") .long("log-file") @@ -360,21 +365,25 @@ fn get_cli_options_sorted( prefault=on|off\"", ) .num_args(1..) + .action(ArgAction::Append) .group("vm-config"), Arg::new("net") .long("net") .help(NetConfig::SYNTAX) .num_args(1..) + .action(ArgAction::Append) .group("vm-config"), Arg::new("numa") .long("numa") .help(NumaConfig::SYNTAX) .num_args(1..) + .action(ArgAction::Append) .group("vm-config"), Arg::new("pci-segment") .long("pci-segment") .help(PciSegmentConfig::SYNTAX) .num_args(1..) + .action(ArgAction::Append) .group("vm-config"), Arg::new("platform") .long("platform") @@ -387,6 +396,7 @@ fn get_cli_options_sorted( .long("pmem") .help(PmemConfig::SYNTAX) .num_args(1..) + .action(ArgAction::Append) .group("vm-config"), #[cfg(feature = "pvmemcontrol")] Arg::new("pvmemcontrol") @@ -405,6 +415,7 @@ fn get_cli_options_sorted( .long("rate-limit-group") .help(RateLimiterGroupConfig::SYNTAX) .num_args(1..) + .action(ArgAction::Append) .group("vm-config"), Arg::new("restore") .long("restore") @@ -437,6 +448,7 @@ fn get_cli_options_sorted( .long("user-device") .help(UserDeviceConfig::SYNTAX) .num_args(1..) + .action(ArgAction::Append) .group("vm-config"), Arg::new("v") .short('v') @@ -447,6 +459,7 @@ fn get_cli_options_sorted( .long("vdpa") .help(VdpaConfig::SYNTAX) .num_args(1..) 
+ .action(ArgAction::Append) .group("vm-config"), Arg::new("version") .short('V') From 4e7f9595c8e5b15f22ae67682f0a1edef47a8112 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Thu, 12 Mar 2026 12:52:16 -0700 Subject: [PATCH 124/742] vmm: remove nested virtualization check for arm64/riscv64 Remove the architecture check that prevented nested virtualization control on arm64 and riscv64. This allows nested virtualization to be disabled where supported, particularly when using MSHV. Note that on arm64 disabling nested virtualization may not fully disable the capability depending on the underlying platform. Use of this functionality is left to the user's discretion. Signed-off-by: Muminul Islam --- vmm/src/config.rs | 8 -------- 1 file changed, 8 deletions(-) diff --git a/vmm/src/config.rs b/vmm/src/config.rs index b4c04570f5..42cbcfdbc0 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -717,14 +717,6 @@ impl CpusConfig { .map_err(Error::ParseCpus)? .is_none_or(|toggle| toggle.0); - // Nested virtualization is always turned on for aarch64 and riscv64 - // TODO: revisit this when nested support can be turned of on these architectures - #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] - if !nested { - return Err(Error::ParseCpus(OptionParserError::InvalidValue( - "nested=off is not supported on aarch64 and riscv64 architectures".to_string(), - ))); - } let core_scheduling = parser .convert("core_scheduling") .map_err(Error::ParseCpus)? 
From a747e2b72a2183938b1728c4ceefc3385eaf7728 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Wed, 11 Mar 2026 21:38:50 +0100 Subject: [PATCH 125/742] hypervisor: kvm: cleanup unneeded Arc Signed-off-by: Philipp Schuster On-behalf-of: SAP philipp.schuster@sap.com --- hypervisor/src/kvm/mod.rs | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/hypervisor/src/kvm/mod.rs b/hypervisor/src/kvm/mod.rs index 75073ec8d5..1c1abd4b68 100644 --- a/hypervisor/src/kvm/mod.rs +++ b/hypervisor/src/kvm/mod.rs @@ -427,10 +427,10 @@ struct KvmDirtyLogSlot { /// Wrapper over KVM VM ioctls. pub struct KvmVm { - fd: Arc, + fd: VmFd, #[cfg(target_arch = "x86_64")] msrs: Vec, - dirty_log_slots: Arc>>, + dirty_log_slots: RwLock>, } impl KvmVm { @@ -1238,8 +1238,6 @@ impl hypervisor::Hypervisor for KvmHypervisor { break; } - let vm_fd = Arc::new(fd); - #[cfg(target_arch = "x86_64")] { let msr_list = self.get_msr_list()?; @@ -1256,17 +1254,17 @@ impl hypervisor::Hypervisor for KvmHypervisor { } Ok(Arc::new(KvmVm { - fd: vm_fd, + fd, msrs, - dirty_log_slots: Arc::new(RwLock::new(HashMap::new())), + dirty_log_slots: RwLock::new(HashMap::new()), })) } #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] { Ok(Arc::new(KvmVm { - fd: vm_fd, - dirty_log_slots: Arc::new(RwLock::new(HashMap::new())), + fd, + dirty_log_slots: RwLock::new(HashMap::new()), })) } } From 001adbe15a3b6db7413d9124003fa55736fa5756 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Thu, 12 Mar 2026 09:32:28 +0100 Subject: [PATCH 126/742] hypervisor: mshv: cleanup unneeded Arc Signed-off-by: Philipp Schuster On-behalf-of: SAP philipp.schuster@sap.com --- hypervisor/src/mshv/mod.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hypervisor/src/mshv/mod.rs b/hypervisor/src/mshv/mod.rs index 613c4dc77e..00cc4a6844 100644 --- a/hypervisor/src/mshv/mod.rs +++ b/hypervisor/src/mshv/mod.rs @@ -346,7 +346,7 @@ impl hypervisor::Hypervisor for MshvHypervisor { 
Ok(Arc::new(MshvVm { fd: vm_fd, msrs: ArcSwap::new(Vec::::new().into()), - dirty_log_slots: Arc::new(RwLock::new(HashMap::new())), + dirty_log_slots: RwLock::new(HashMap::new()), #[cfg(feature = "sev_snp")] sev_snp_enabled: mshv_vm_type == VmType::Snp, #[cfg(feature = "sev_snp")] @@ -364,7 +364,7 @@ impl hypervisor::Hypervisor for MshvHypervisor { { Ok(Arc::new(MshvVm { fd: vm_fd, - dirty_log_slots: Arc::new(RwLock::new(HashMap::new())), + dirty_log_slots: RwLock::new(HashMap::new()), })) } } @@ -1716,7 +1716,7 @@ pub struct MshvVm { fd: Arc, #[cfg(target_arch = "x86_64")] msrs: ArcSwap>, - dirty_log_slots: Arc>>, + dirty_log_slots: RwLock>, #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, #[cfg(feature = "sev_snp")] From f630694bb0a709052e7067c8c37f2e7fd7c828ea Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Wed, 11 Feb 2026 20:08:45 -0500 Subject: [PATCH 127/742] virtio-devices: Use const fn to compute PCI BAR offsets This is much less error-prone than manual computation. No functional change intended. Signed-off-by: Demi Marie Obenour --- virtio-devices/src/transport/pci_device.rs | 32 +++++++++++++++++----- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/virtio-devices/src/transport/pci_device.rs b/virtio-devices/src/transport/pci_device.rs index 408611e29a..bf1d169c83 100644 --- a/virtio-devices/src/transport/pci_device.rs +++ b/virtio-devices/src/transport/pci_device.rs @@ -231,28 +231,46 @@ impl PciSubclass for PciVirtioSubclass { } } +/// Max number of virtio queues Cloud Hypervisor supports. +/// This is set by the current size of the notification BAR. +const MAX_QUEUES: u64 = 0x400; + +// Automatically compute the position of the next entry in the BAR. +// This handles alignment properly and is much less error-prone than +// manual calculation. 
+const fn next_bar_addr_align(offset: u64, size: u64, align: u64) -> u64 { + assert!(align >= 0x2000, "too small alignment for structure in BAR"); + assert!(align.is_power_of_two(), "alignment must be a power of 2"); + (offset + size).next_multiple_of(align) +} +// Same as next_bar_addr_align(), but with the default alignment (8K). +const fn next_bar_addr(offset: u64, size: u64) -> u64 { + next_bar_addr_align(offset, size, 0x2000) +} + // Allocate one bar for the structs pointed to by the capability structures. // As per the PCI specification, because the same BAR shares MSI-X and non // MSI-X structures, it is recommended to use 8KiB alignment for all those // structures. const COMMON_CONFIG_BAR_OFFSET: u64 = 0x0000; const COMMON_CONFIG_SIZE: u64 = 56; -const ISR_CONFIG_BAR_OFFSET: u64 = 0x2000; +const ISR_CONFIG_BAR_OFFSET: u64 = next_bar_addr(COMMON_CONFIG_BAR_OFFSET, COMMON_CONFIG_SIZE); const ISR_CONFIG_SIZE: u64 = 1; -const DEVICE_CONFIG_BAR_OFFSET: u64 = 0x4000; +const DEVICE_CONFIG_BAR_OFFSET: u64 = next_bar_addr(ISR_CONFIG_BAR_OFFSET, ISR_CONFIG_SIZE); const DEVICE_CONFIG_SIZE: u64 = 0x1000; -const NOTIFICATION_BAR_OFFSET: u64 = 0x6000; -const NOTIFICATION_SIZE: u64 = 0x1000; -const MSIX_TABLE_BAR_OFFSET: u64 = 0x8000; +const NOTIFICATION_BAR_OFFSET: u64 = next_bar_addr(DEVICE_CONFIG_BAR_OFFSET, DEVICE_CONFIG_SIZE); +const NOTIFICATION_SIZE: u64 = MAX_QUEUES * NOTIFY_OFF_MULTIPLIER as u64; +const MSIX_TABLE_BAR_OFFSET: u64 = next_bar_addr(NOTIFICATION_BAR_OFFSET, NOTIFICATION_SIZE); + // The size is 256KiB because the table can hold up to 2048 entries, with each // entry being 128 bits (4 DWORDS). const MSIX_TABLE_SIZE: u64 = 0x40000; -const MSIX_PBA_BAR_OFFSET: u64 = 0x48000; +const MSIX_PBA_BAR_OFFSET: u64 = next_bar_addr(MSIX_TABLE_BAR_OFFSET, MSIX_TABLE_SIZE); // The size is 2KiB because the Pending Bit Array has one bit per vector and it // can support up to 2048 vectors. const MSIX_PBA_SIZE: u64 = 0x800; // The BAR size must be a power of 2. 
-const CAPABILITY_BAR_SIZE: u64 = 0x80000; +const CAPABILITY_BAR_SIZE: u64 = (MSIX_PBA_BAR_OFFSET + MSIX_PBA_SIZE).next_power_of_two(); const VIRTIO_COMMON_BAR_INDEX: usize = 0; const VIRTIO_SHM_BAR_INDEX: usize = 2; From e265543e3c283c7ef8fc381968fbebcc592d9883 Mon Sep 17 00:00:00 2001 From: Julian Schindel Date: Thu, 12 Mar 2026 17:00:57 +0100 Subject: [PATCH 128/742] misc: make MSRV workspace-wide for cloud-hypervisor dependencies Moves the MSRV requirement to the workspace and expands it to all cloud-hypervisor dependencies and dev-dependencies. This improves discoverability for new contributors working on crates other than the cloud-hypervisor itself and creates consistency regarding the MSRV of cloud-hypervisor dependencies. Functionally, this doesn't change anything for dependencies of the cloud-hypervisor crate as the MSRV requirement is already enforced by CI when building the cloud-hypervisor with the MSRV versioned compiler. On-behalf-of: SAP julian.schindel@sap.com Signed-off-by: Julian Schindel --- Cargo.toml | 8 ++++++++ api_client/Cargo.toml | 1 + arch/Cargo.toml | 1 + block/Cargo.toml | 1 + cloud-hypervisor/Cargo.toml | 9 +-------- devices/Cargo.toml | 1 + event_monitor/Cargo.toml | 1 + hypervisor/Cargo.toml | 1 + net_gen/Cargo.toml | 1 + net_util/Cargo.toml | 1 + option_parser/Cargo.toml | 1 + pci/Cargo.toml | 1 + rate_limiter/Cargo.toml | 1 + serial_buffer/Cargo.toml | 1 + test_infra/Cargo.toml | 1 + tpm/Cargo.toml | 1 + tracer/Cargo.toml | 1 + virtio-devices/Cargo.toml | 1 + vm-allocator/Cargo.toml | 1 + vm-device/Cargo.toml | 1 + vm-migration/Cargo.toml | 1 + vm-virtio/Cargo.toml | 1 + vmm/Cargo.toml | 1 + 23 files changed, 30 insertions(+), 8 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 93765ba718..4c4ad78e8a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,6 +41,14 @@ members = [ "vmm", ] package.edition = "2024" +# Minimum buildable version: +# Keep in sync with version in .github/workflows/build.yaml +# Policy on MSRV (see #4318): 
+# Can only be bumped if satisfying any of the following: +# a.) A dependency requires it, +# b.) If we want to use a new feature and that MSRV is at least 6 months old, +# c.) There is a security issue that is addressed by the toolchain update. +package.rust-version = "1.89.0" resolver = "3" [workspace.dependencies] diff --git a/api_client/Cargo.toml b/api_client/Cargo.toml index 93a7836fcc..1ab0e5862e 100644 --- a/api_client/Cargo.toml +++ b/api_client/Cargo.toml @@ -3,6 +3,7 @@ authors = ["The Cloud Hypervisor Authors"] edition.workspace = true license = "Apache-2.0" name = "api_client" +rust-version.workspace = true version = "0.1.0" [dependencies] diff --git a/arch/Cargo.toml b/arch/Cargo.toml index 804be793d0..2e30b9e532 100644 --- a/arch/Cargo.toml +++ b/arch/Cargo.toml @@ -2,6 +2,7 @@ authors = ["The Chromium OS Authors"] edition.workspace = true name = "arch" +rust-version.workspace = true version = "0.1.0" [features] diff --git a/block/Cargo.toml b/block/Cargo.toml index 70a731a731..ab62c2308c 100644 --- a/block/Cargo.toml +++ b/block/Cargo.toml @@ -2,6 +2,7 @@ authors = ["The Chromium OS Authors", "The Cloud Hypervisor Authors"] edition.workspace = true name = "block" +rust-version.workspace = true version = "0.1.0" [features] diff --git a/cloud-hypervisor/Cargo.toml b/cloud-hypervisor/Cargo.toml index d69773e743..8259b716fa 100644 --- a/cloud-hypervisor/Cargo.toml +++ b/cloud-hypervisor/Cargo.toml @@ -7,15 +7,8 @@ edition = "2024" homepage = "https://github.com/cloud-hypervisor/cloud-hypervisor" license = "Apache-2.0 AND BSD-3-Clause" name = "cloud-hypervisor" +rust-version.workspace = true version = "51.0.0" -# Minimum buildable version: -# Keep in sync with version in .github/workflows/build.yaml -# Policy on MSRV (see #4318): -# Can only be bumped if satisfying any of the following: -# a.) A dependency requires it, -# b.) If we want to use a new feature and that MSRV is at least 6 months old, -# c.) 
There is a security issue that is addressed by the toolchain update. -rust-version = "1.89.0" [dependencies] anyhow = { workspace = true } diff --git a/devices/Cargo.toml b/devices/Cargo.toml index 06c99a1674..af4d7b73ce 100644 --- a/devices/Cargo.toml +++ b/devices/Cargo.toml @@ -2,6 +2,7 @@ authors = ["The Chromium OS Authors"] edition.workspace = true name = "devices" +rust-version.workspace = true version = "0.1.0" [dependencies] diff --git a/event_monitor/Cargo.toml b/event_monitor/Cargo.toml index 18ac2567c3..ee52a7fd3d 100644 --- a/event_monitor/Cargo.toml +++ b/event_monitor/Cargo.toml @@ -2,6 +2,7 @@ authors = ["The Cloud Hypervisor Authors"] edition.workspace = true name = "event_monitor" +rust-version.workspace = true version = "0.1.0" [dependencies] diff --git a/hypervisor/Cargo.toml b/hypervisor/Cargo.toml index e18f9ba390..707779fefc 100644 --- a/hypervisor/Cargo.toml +++ b/hypervisor/Cargo.toml @@ -3,6 +3,7 @@ authors = ["Microsoft Authors"] edition.workspace = true license = "Apache-2.0 OR BSD-3-Clause" name = "hypervisor" +rust-version.workspace = true version = "0.1.0" [features] diff --git a/net_gen/Cargo.toml b/net_gen/Cargo.toml index a99c7c995d..dfbcee2af9 100644 --- a/net_gen/Cargo.toml +++ b/net_gen/Cargo.toml @@ -3,6 +3,7 @@ authors = ["The Chromium OS Authors"] edition = "2021" #edition.workspace = true name = "net_gen" +rust-version.workspace = true version = "0.1.0" [dependencies] diff --git a/net_util/Cargo.toml b/net_util/Cargo.toml index 9f124cf613..fccb89320f 100644 --- a/net_util/Cargo.toml +++ b/net_util/Cargo.toml @@ -2,6 +2,7 @@ authors = ["The Chromium OS Authors"] edition.workspace = true name = "net_util" +rust-version.workspace = true version = "0.1.0" [dependencies] diff --git a/option_parser/Cargo.toml b/option_parser/Cargo.toml index 3d76690b41..54c77e296b 100644 --- a/option_parser/Cargo.toml +++ b/option_parser/Cargo.toml @@ -2,6 +2,7 @@ authors = ["The Cloud Hypervisor Authors"] edition.workspace = true name = 
"option_parser" +rust-version.workspace = true version = "0.1.0" [dependencies] diff --git a/pci/Cargo.toml b/pci/Cargo.toml index 760baae03d..c1b69f9854 100644 --- a/pci/Cargo.toml +++ b/pci/Cargo.toml @@ -2,6 +2,7 @@ authors = ["Samuel Ortiz "] edition.workspace = true name = "pci" +rust-version.workspace = true version = "0.1.0" [features] diff --git a/rate_limiter/Cargo.toml b/rate_limiter/Cargo.toml index 206ec7b7f8..2d32e8a25b 100644 --- a/rate_limiter/Cargo.toml +++ b/rate_limiter/Cargo.toml @@ -1,6 +1,7 @@ [package] edition.workspace = true name = "rate_limiter" +rust-version.workspace = true version = "0.1.0" [dependencies] diff --git a/serial_buffer/Cargo.toml b/serial_buffer/Cargo.toml index 767c8a97ff..89766d5e86 100644 --- a/serial_buffer/Cargo.toml +++ b/serial_buffer/Cargo.toml @@ -2,6 +2,7 @@ authors = ["The Cloud Hypervisor Authors"] edition.workspace = true name = "serial_buffer" +rust-version.workspace = true version = "0.1.0" [lints] diff --git a/test_infra/Cargo.toml b/test_infra/Cargo.toml index 6c53e9ca0f..b5854521ad 100644 --- a/test_infra/Cargo.toml +++ b/test_infra/Cargo.toml @@ -2,6 +2,7 @@ authors = ["The Cloud Hypervisor Authors"] edition.workspace = true name = "test_infra" +rust-version.workspace = true version = "0.1.0" [dependencies] diff --git a/tpm/Cargo.toml b/tpm/Cargo.toml index 82dc8f79be..5d6bba1a04 100644 --- a/tpm/Cargo.toml +++ b/tpm/Cargo.toml @@ -3,6 +3,7 @@ authors = ["Microsoft Authors"] edition = "2021" license = "Apache-2.0" name = "tpm" +rust-version.workspace = true version = "0.1.0" [dependencies] diff --git a/tracer/Cargo.toml b/tracer/Cargo.toml index 1ac9f4e393..64f3399902 100644 --- a/tracer/Cargo.toml +++ b/tracer/Cargo.toml @@ -2,6 +2,7 @@ authors = ["The Cloud Hypervisor Authors"] edition.workspace = true name = "tracer" +rust-version.workspace = true version = "0.1.0" [dependencies] diff --git a/virtio-devices/Cargo.toml b/virtio-devices/Cargo.toml index 5cbfe145f4..41b9da8e0a 100644 --- 
a/virtio-devices/Cargo.toml +++ b/virtio-devices/Cargo.toml @@ -2,6 +2,7 @@ authors = ["The Cloud Hypervisor Authors"] edition.workspace = true name = "virtio-devices" +rust-version.workspace = true version = "0.1.0" [features] diff --git a/vm-allocator/Cargo.toml b/vm-allocator/Cargo.toml index a4996d6dc3..3826479313 100644 --- a/vm-allocator/Cargo.toml +++ b/vm-allocator/Cargo.toml @@ -2,6 +2,7 @@ authors = ["The Chromium OS Authors"] edition.workspace = true name = "vm-allocator" +rust-version.workspace = true version = "0.1.0" [features] diff --git a/vm-device/Cargo.toml b/vm-device/Cargo.toml index a57ea57f5b..358ffc7435 100644 --- a/vm-device/Cargo.toml +++ b/vm-device/Cargo.toml @@ -2,6 +2,7 @@ authors = ["The Cloud Hypervisor Authors"] edition.workspace = true name = "vm-device" +rust-version.workspace = true version = "0.1.0" [features] diff --git a/vm-migration/Cargo.toml b/vm-migration/Cargo.toml index b17475065c..66b4e4f6a9 100644 --- a/vm-migration/Cargo.toml +++ b/vm-migration/Cargo.toml @@ -2,6 +2,7 @@ authors = ["The Cloud Hypervisor Authors"] edition.workspace = true name = "vm-migration" +rust-version.workspace = true version = "0.1.0" [dependencies] diff --git a/vm-virtio/Cargo.toml b/vm-virtio/Cargo.toml index 228f552416..de90b209d0 100644 --- a/vm-virtio/Cargo.toml +++ b/vm-virtio/Cargo.toml @@ -2,6 +2,7 @@ authors = ["The Cloud Hypervisor Authors"] edition.workspace = true name = "vm-virtio" +rust-version.workspace = true version = "0.1.0" [features] diff --git a/vmm/Cargo.toml b/vmm/Cargo.toml index 43b1de14b4..35fe314299 100644 --- a/vmm/Cargo.toml +++ b/vmm/Cargo.toml @@ -2,6 +2,7 @@ authors = ["The Cloud Hypervisor Authors"] edition.workspace = true name = "vmm" +rust-version.workspace = true version = "0.1.0" [features] From 7d582ec0f68a71f6232658939de02fc0949b8aed Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 12 Mar 2026 23:53:56 +0000 Subject: [PATCH 129/742] build: Bump 
docker/setup-qemu-action from 3 to 4 Bumps [docker/setup-qemu-action](https://github.com/docker/setup-qemu-action) from 3 to 4. - [Release notes](https://github.com/docker/setup-qemu-action/releases) - [Commits](https://github.com/docker/setup-qemu-action/compare/v3...v4) --- updated-dependencies: - dependency-name: docker/setup-qemu-action dependency-version: '4' dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- .github/workflows/docker-image.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker-image.yaml b/.github/workflows/docker-image.yaml index f077b51738..8636d35f00 100644 --- a/.github/workflows/docker-image.yaml +++ b/.github/workflows/docker-image.yaml @@ -21,7 +21,7 @@ jobs: uses: actions/checkout@v6 - name: Set up QEMU - uses: docker/setup-qemu-action@v3 + uses: docker/setup-qemu-action@v4 - name: Set up Docker Buildx uses: docker/setup-buildx-action@v4 From 4ba2d770d150bc725a33770f053f46c8fa2381d9 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Thu, 12 Mar 2026 19:34:54 -0700 Subject: [PATCH 130/742] scripts: update CVM test script to add thread - Modified the integration test script to support CVM test threads - Add more parameters to cargo nextest to match other files Signed-off-by: Muminul Islam --- scripts/run_integration_tests_cvm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/run_integration_tests_cvm.sh b/scripts/run_integration_tests_cvm.sh index edf543fc88..37e2cc74e7 100755 --- a/scripts/run_integration_tests_cvm.sh +++ b/scripts/run_integration_tests_cvm.sh @@ -27,7 +27,7 @@ popd || exit cargo build --features $build_features --all --release --target "$BUILD_TARGET" export RUST_BACKTRACE=1 -cargo nextest run $test_features "common_cvm::$test_filter" -- ${test_binary_args[*]} +time cargo nextest run $test_features --retries 3 --no-fail-fast --no-tests=pass --test-threads=$(($(nproc) / 4)) 
"common_cvm::$test_filter" -- ${test_binary_args[*]} RES=$? exit $RES From f6a1d821d7a070fc00c4f6b1b6ded2a567dbf0dc Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Thu, 12 Mar 2026 18:52:34 -0700 Subject: [PATCH 131/742] tests: extend timeout for CVM tests Confidential VMs require additional time during boot to load the IGVM image, complete page measurements, and perform Reverse Map Table (RMP) validation. In addition, PSP latency can further delay the boot process. Extend the test timeout to accommodate these additional initialization steps. Signed-off-by: Muminul Islam --- test_infra/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_infra/src/lib.rs b/test_infra/src/lib.rs index fba95ed977..22a1ea99dd 100644 --- a/test_infra/src/lib.rs +++ b/test_infra/src/lib.rs @@ -76,7 +76,7 @@ pub struct GuestNetworkConfig { pub const DEFAULT_TCP_LISTENER_MESSAGE: &str = "booted"; pub const DEFAULT_TCP_LISTENER_PORT: u16 = 8000; pub const DEFAULT_TCP_LISTENER_TIMEOUT: u32 = 120; -pub const DEFAULT_CVM_TCP_LISTENER_TIMEOUT: u32 = 120; +pub const DEFAULT_CVM_TCP_LISTENER_TIMEOUT: u32 = 140; #[derive(Error, Debug)] pub enum WaitForBootError { From 3cbbce353e770dab2109e04d5c56f96c22980670 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 13 Mar 2026 09:25:16 +0100 Subject: [PATCH 132/742] tests: Set image_type=raw for rate limiter block test images The rate limiter tests create raw block images with dd but do not specify image_type=raw. Without it the VMM autodetects the format and enables sector 0 write protection for unknown image types, causing I/O errors when fio writes to sector 0 and making the test hang until timeout. 
Signed-off-by: Anatol Belski --- cloud-hypervisor/tests/integration.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 1d198828e9..bbc1999335 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -14402,11 +14402,11 @@ mod rate_limiter { let test_blk_params = if bandwidth { format!( - "path={blk_rate_limiter_test_img},num_queues={num_queues},bw_size={bw_size},bw_refill_time={bw_refill_time}" + "path={blk_rate_limiter_test_img},num_queues={num_queues},bw_size={bw_size},bw_refill_time={bw_refill_time},image_type=raw" ) } else { format!( - "path={blk_rate_limiter_test_img},num_queues={num_queues},ops_size={bw_size},ops_refill_time={bw_refill_time}" + "path={blk_rate_limiter_test_img},num_queues={num_queues},ops_size={bw_size},ops_refill_time={bw_refill_time},image_type=raw" ) }; @@ -14512,7 +14512,7 @@ mod rate_limiter { ); disk_args.push(format!( - "path={test_img_path},num_queues={num_queues},rate_limit_group=group0" + "path={test_img_path},num_queues={num_queues},rate_limit_group=group0,image_type=raw" )); } From b5169ff4198833301ecfab1aaf277ae934c102ec Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Tue, 10 Mar 2026 09:26:28 +0100 Subject: [PATCH 133/742] vmm: migration: flatten control flow in send_migration() Move the error branch to the top and remove unnecessary nesting in send_migration(). This change is purely mechanical and introduces no functional changes. It simplifies the control flow and prepares the code for the following migration-related improvements in this series. 
Signed-off-by: Philipp Schuster On-behalf-of: SAP philipp.schuster@sap.com --- vmm/src/lib.rs | 63 +++++++++++++++++++++++++------------------------- 1 file changed, 32 insertions(+), 31 deletions(-) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 5c75a2db0e..cea3d2fa46 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -2364,41 +2364,42 @@ impl RequestHandler for Vmm { ))); } - if let Some(vm) = self.vm.as_mut() { - Self::send_migration( - vm, - #[cfg(all(feature = "kvm", target_arch = "x86_64"))] - self.hypervisor.as_ref(), - &send_data_migration, - ) - .map_err(|migration_err| { - error!("Migration failed: {migration_err:?}"); + let vm = self + .vm + .as_mut() + .ok_or_else(|| MigratableError::MigrateSend(anyhow!("VM is not running")))?; - // Stop logging dirty pages only for non-local migrations - if !send_data_migration.local - && let Err(e) = vm.stop_dirty_log() - { - return e; - } + Self::send_migration( + vm, + #[cfg(all(feature = "kvm", target_arch = "x86_64"))] + self.hypervisor.as_ref(), + &send_data_migration, + ) + .map_err(|migration_err| { + error!("Migration failed: {migration_err:?}"); - if vm.get_state() == VmState::Paused - && let Err(e) = vm.resume() - { - return e; - } + // Stop logging dirty pages only for non-local migrations + if !send_data_migration.local + && let Err(e) = vm.stop_dirty_log() + { + return e; + } - migration_err - })?; + if vm.get_state() == VmState::Paused + && let Err(e) = vm.resume() + { + return e; + } - // Shutdown the VM after the migration succeeded - self.exit_evt.write(1).map_err(|e| { - MigratableError::MigrateSend(anyhow!( - "Failed shutting down the VM after migration: {e:?}" - )) - }) - } else { - Err(MigratableError::MigrateSend(anyhow!("VM is not running"))) - } + migration_err + })?; + + // Shutdown the VM after the migration succeeded + self.exit_evt.write(1).map_err(|e| { + MigratableError::MigrateSend(anyhow!( + "Failed shutting down the VM after migration: {e:?}" + )) + }) } } From 
5c93bcf2d708b5dd5abbc19776bc1eab9e3b4b7d Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Mon, 9 Mar 2026 15:32:30 +0100 Subject: [PATCH 134/742] vmm: migration: only permit migration of running VMs Currently, it is not possible to migrate a paused VM. It fails with the following error: ``` [2026-03-09T14:43:42Z ERROR cloud_hypervisor] Fatal error: HttpApiClient(ServerResponse(InternalServerError, Some("[\"Error from API\",\"Error starting migration sender\",\"Failed to pause migratable component\",\"Invalid transition: InvalidStateTransition(Paused, Paused)\"]"))) Error: ch-remote exited with the following chain of errors: 0: http client error 1: Server responded with InternalServerError 2: Error from API 3: Error starting migration sender 4: Failed to pause migratable component 5: Invalid transition: InvalidStateTransition(Paused, Paused) ``` and even worse, after that, the VM is resumed on the source! Make the behavior explicit by only allowing migration of VMs in the Running state. This avoids unintended state transitions during migration and clarifies the current expected semantics. Future work could extend the migration protocol to work with paused VMs and preserve the VM runtime state, allowing paused VMs to be migrated without altering their state. Signed-off-by: Philipp Schuster On-behalf-of: SAP philipp.schuster@sap.com --- vmm/src/lib.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index cea3d2fa46..96331fffa9 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -2369,6 +2369,16 @@ impl RequestHandler for Vmm { .as_mut() .ok_or_else(|| MigratableError::MigrateSend(anyhow!("VM is not running")))?; + // Only running VMs can be migrated: Future work can fix this to allow + // also the migration of paused VMs while preserving the state in success + // and error case. See #7815. 
+ if vm.get_state() != VmState::Running { + return Err(MigratableError::MigrateSend(anyhow!( + "VM is not in running state: {:?}", + vm.get_state() + ))); + } + Self::send_migration( vm, #[cfg(all(feature = "kvm", target_arch = "x86_64"))] From b90634a88714066eeb9da446fa7063a2f6623735 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Thu, 5 Mar 2026 21:02:43 +0100 Subject: [PATCH 135/742] vmm: migration: emit lifecycle events Emit migration lifecycle events via the event monitor. This aligns migration with other VM lifecycle operations such as boot, pause, and resume, allowing external management software to observe migration progress consistently. Events emitted: src: vm.migration-started vm.migration-finished vm.migration-failed dst: vm.migration-receive-started vm.migration-receive-finished vm.migration-receive-failed Please note that these features are independent of an upcoming new endpoint to fetch migration statistics. Signed-off-by: Philipp Schuster On-behalf-of: SAP philipp.schuster@sap.com --- vmm/src/lib.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 96331fffa9..bcfa41e27e 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -2305,6 +2305,8 @@ impl RequestHandler for Vmm { // Accept the connection and get the socket let mut socket = Vmm::receive_migration_socket(&receive_data_migration.receiver_url)?; + event!("vm", "migration-receive-started"); + let mut state = ReceiveMigrationState::Established; while !state.finished() { @@ -2334,8 +2336,11 @@ impl RequestHandler for Vmm { } if let ReceiveMigrationState::Aborted = state { + event!("vm", "migration-receive-failed"); self.vm = None; self.vm_config = None; + } else { + event!("vm", "migration-receive-finished"); } Ok(()) @@ -2379,6 +2384,7 @@ impl RequestHandler for Vmm { ))); } + event!("vm", "migration-started"); Self::send_migration( vm, #[cfg(all(feature = "kvm", target_arch = "x86_64"))] @@ -2387,6 +2393,7 @@ impl RequestHandler for Vmm { ) 
.map_err(|migration_err| { error!("Migration failed: {migration_err:?}"); + event!("vm", "migration-failed"); // Stop logging dirty pages only for non-local migrations if !send_data_migration.local @@ -2404,6 +2411,8 @@ impl RequestHandler for Vmm { migration_err })?; + event!("vm", "migration-finished"); + // Shutdown the VM after the migration succeeded self.exit_evt.write(1).map_err(|e| { MigratableError::MigrateSend(anyhow!( From d4b5502472d706c8e9c2241eb856fcac72740f62 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Fri, 6 Mar 2026 13:28:59 +0100 Subject: [PATCH 136/742] vmm: reduce verbosity of dirty logging output Lower several informational messages in the dirty logging path to debug level. These messages are noisy in practice and provide little value since dirty logging is known to work reliably. More useful migration metrics (e.g., dirty size per iteration) is logged per iteration in subsequent commits. Signed-off-by: Philipp Schuster On-behalf-of: SAP philipp.schuster@sap.com --- vmm/src/memory_manager.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vmm/src/memory_manager.rs b/vmm/src/memory_manager.rs index 03b7b8a837..2cb735ecd1 100644 --- a/vmm/src/memory_manager.rs +++ b/vmm/src/memory_manager.rs @@ -2673,11 +2673,11 @@ impl Migratable for MemoryManager { let sub_table = MemoryRangeTable::from_dirty_bitmap(dirty_bitmap, r.gpa, 4096); if sub_table.regions().is_empty() { - info!("Dirty Memory Range Table is empty"); + debug!("Dirty Memory Range Table is empty"); } else { - info!("Dirty Memory Range Table:"); + debug!("Dirty Memory Range Table:"); for range in sub_table.regions() { - info!("GPA: {:x} size: {} (KiB)", range.gpa, range.length / 1024); + debug!("GPA: {:x} size: {} (KiB)", range.gpa, range.length / 1024); } } From 1f5c5093e44089b13a68747c14a73bcd7a8b0d84 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Thu, 12 Mar 2026 07:09:42 +0100 Subject: [PATCH 137/742] vmm: migration: refactor memory migration 
into iteration helpers Refactor the precopy memory migration path into dedicated helpers that handle the different migration phases: - initial full memory transfer - repeated dirty-page iterations while the VM is running - final iteration after the VM is paused This separates concerns in the migration code and provides the infrastructure needed for collecting migration metrics in the following changes. These changes are inspired by [0] but differ significantly in details. [0] https://github.com/cloud-hypervisor/cloud-hypervisor/pull/7033 Signed-off-by: Philipp Schuster On-behalf-of: SAP philipp.schuster@sap.com --- vmm/src/lib.rs | 120 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 78 insertions(+), 42 deletions(-) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index bcfa41e27e..4186386787 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -27,7 +27,7 @@ use console_devices::{ConsoleInfo, pre_create_console_devices}; use event_monitor::event; use landlock::LandlockError; use libc::{EFD_NONBLOCK, SIGINT, SIGTERM, TCSANOW, tcsetattr, termios}; -use log::{error, info, trace, warn}; +use log::{debug, error, info, trace, warn}; use memory_manager::MemoryManagerSnapshotData; use pci::PciBdf; use seccompiler::{SeccompAction, apply_filter}; @@ -1216,29 +1216,94 @@ impl Vmm { } } - // Returns true if there were dirty pages to send - fn vm_maybe_send_dirty_pages( + /// Transmits the given [`MemoryRangeTable`] over the wire if there is at + /// least one region. + /// + /// Sends a memory migration request, the range table, and the corresponding + /// guest memory regions over the given socket. Waits for acknowledgment + /// from the destination. 
+ fn vm_send_dirty_pages( vm: &mut Vm, socket: &mut SocketStream, - ) -> result::Result { - // Send (dirty) memory table - let table = vm.dirty_log()?; - - // But if there are no regions go straight to pause + table: &MemoryRangeTable, + ) -> result::Result<(), MigratableError> { if table.regions().is_empty() { - return Ok(false); + return Ok(()); } - Request::memory(table.length()).write_to(socket).unwrap(); + Request::memory(table.length()).write_to(socket)?; table.write_to(socket)?; // And then the memory itself - vm.send_memory_regions(&table, socket)?; + vm.send_memory_regions(table, socket)?; Response::read_from(socket)?.ok_or_abandon( socket, MigratableError::MigrateSend(anyhow!("Error during dirty memory migration")), )?; - Ok(true) + Ok(()) + } + + /// Performs the initial memory transmission (iteration zero) plus a + /// variable number of memory iterations with the goal to eventually migrate + /// the VM in a reasonably small downtime. + /// + /// This returns as soon as the precopy migration indicates it is converged + /// (e.g., reasonably small downtime) is reached. + fn do_memory_iterations( + vm: &mut Vm, + socket: &mut SocketStream, + iteration_counter: &mut u64, + is_converged: impl Fn(u64) -> bool, + ) -> result::Result { + loop { + let iteration_table = if *iteration_counter == 0 { + vm.memory_range_table()? + } else { + // TODO do this in a thread #7816 + vm.dirty_log()? + }; + + if is_converged(*iteration_counter) { + debug!("Precopy converged: iter={iteration_counter}"); + break Ok(iteration_table); + } + + // Send the current dirty pages + Self::vm_send_dirty_pages(vm, socket, &iteration_table)?; + + // Prepare next iteration. + *iteration_counter += 1; + } + } + + /// Performs the memory migration including multiple iterations. 
+ /// + /// This includes: + /// - initial memory - VM is running + /// - multiple memory delta transmissions - VM is running + /// - final memory iteration - VM is paused + fn do_memory_migration( + vm: &mut Vm, + socket: &mut SocketStream, + ) -> result::Result<(), MigratableError> { + const MAX_ITERATIONS: u64 = 5; + + let mut iteration_counter = 0; + let is_converged = |iteration_counter: u64| iteration_counter >= MAX_ITERATIONS; + + vm.start_dirty_log()?; + let remaining = + Self::do_memory_iterations(vm, socket, &mut iteration_counter, is_converged)?; + vm.pause()?; + + // Send last batch of dirty pages + let mut final_table = vm.dirty_log()?; + final_table.extend(remaining); + Vmm::vm_send_dirty_pages(vm, socket, &final_table)?; + + info!("Memory migration complete"); + + Ok(()) } fn send_migration( @@ -1323,36 +1388,7 @@ impl Vmm { // Now pause VM vm.pause()?; } else { - // Start logging dirty pages - vm.start_dirty_log()?; - - // Send memory table - let table = vm.memory_range_table()?; - Request::memory(table.length()) - .write_to(&mut socket) - .unwrap(); - table.write_to(&mut socket)?; - // And then the memory itself - vm.send_memory_regions(&table, &mut socket)?; - Response::read_from(&mut socket)?.ok_or_abandon( - &mut socket, - MigratableError::MigrateSend(anyhow!("Error during dirty memory migration")), - )?; - - // Try at most 5 passes of dirty memory sending - const MAX_DIRTY_MIGRATIONS: usize = 5; - for i in 0..MAX_DIRTY_MIGRATIONS { - info!("Dirty memory migration {i} of {MAX_DIRTY_MIGRATIONS}"); - if !Self::vm_maybe_send_dirty_pages(vm, &mut socket)? { - break; - } - } - - // Now pause VM - vm.pause()?; - - // Send last batch of dirty pages - Self::vm_maybe_send_dirty_pages(vm, &mut socket)?; + Self::do_memory_migration(vm, &mut socket)?; } // We release the locks early to enable locking them on the destination host. 
From 447a4c236b9ae8140b813511bfffec45e84be5fa Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Thu, 12 Mar 2026 07:09:51 +0100 Subject: [PATCH 138/742] vmm: migration: add code comment Signed-off-by: Philipp Schuster On-behalf-of: SAP philipp.schuster@sap.com --- vmm/src/vm.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index 3f793cd807..a094803ee4 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -2850,6 +2850,8 @@ impl Vm { Ok(()) } + /// Writes the contents of the given guest memory regions to the provided sink. + /// Used, for example, during VM live migration to transfer memory to a socket. pub fn send_memory_regions( &mut self, ranges: &MemoryRangeTable, From beb58084065d16d73307a5faf423e05080c77592 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Tue, 10 Mar 2026 09:19:58 +0100 Subject: [PATCH 139/742] vm-migration: add MemoryMigrationContext for precopy metrics Introduce MemoryMigrationContext to track internal metrics of an ongoing precopy memory migration. The context aggregates information such as iteration count, transferred bytes, durations, bandwidth, and estimated downtime. This enables migration logic to make decisions based on runtime characteristics, such as terminating iterations once the expected downtime is below a target threshold. The type is used in the next commit to implement iteration-based migration metrics. Signed-off-by: Philipp Schuster On-behalf-of: SAP philipp.schuster@sap.com --- vm-migration/src/context.rs | 406 +++++++++++++++++++++++++++++++++++ vm-migration/src/lib.rs | 2 + vm-migration/src/protocol.rs | 5 + 3 files changed, 413 insertions(+) create mode 100644 vm-migration/src/context.rs diff --git a/vm-migration/src/context.rs b/vm-migration/src/context.rs new file mode 100644 index 0000000000..7dfa5b7d9e --- /dev/null +++ b/vm-migration/src/context.rs @@ -0,0 +1,406 @@ +// Copyright © 2026 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 +// + +//! 
Module for [`MemoryMigrationContext`]. + +use std::fmt; +use std::fmt::Display; +use std::time::{Duration, Instant}; + +use crate::protocol::MemoryRangeTable; + +/// Internal metrics for the precopy migration phase. +/// +/// The context aggregates runtime statistics such as iteration count, +/// transferred bytes, durations, bandwidth, and estimated downtime. +/// These metrics allow the migration logic to make decisions based on +/// observed runtime behavior, for example terminating further iterations +/// once the expected downtime falls below a configured threshold. +/// +/// The structure is updated both between iterations and during an +/// iteration so that it always reflects the most recent state. +#[derive(Debug, PartialEq)] +pub struct MemoryMigrationContext { + /// Current iteration: 0 initial total transmission, >0 delta transmission. + pub iteration: usize, + /// Total bytes sent across all iterations. + total_sent_bytes: u64, + /// Total bytes to send in the current iteration. + pub current_iteration_total_bytes: u64, + /// The currently measured bandwidth. + /// + /// This is updated (at least) after each completed iteration. + bandwidth_bytes_per_second: f64, + /// Calculated downtime in milliseconds regarding the current bandwidth and + /// the remaining memory. + /// + /// This is only `None` for iteration 0. + /// + /// Please note that this ignores any additional migration overhead and + /// only looks at the memory transfer itself. + estimated_downtime: Option, + /// Begin of the memory migration. + migration_begin: Instant, + /// Duration of the memory migration. + /// + /// This is only `None` until the last iteration is finished. + migration_duration: Option, + /// Begin of the current iteration. + iteration_begin: Instant, + /// Duration of the current iteration. + /// + /// This includes the transmission, all logging, and update of any metrics. + /// + /// This is only `None` for iteration 0. 
+ iteration_duration: Option, + /// Begin of the current transfer. + transfer_begin: Instant, + /// Duration of the current transfer. + /// + /// This is only `None` for iteration 0. + transfer_duration: Option, +} + +impl MemoryMigrationContext { + /// Creates a new context. + /// + /// Please note that you should create this struct right before the precopy + /// memory migration starts, as the field `migration_begin` is set to + /// [`Instant::now`]. + pub fn new() -> Self { + Self { + iteration: 0, + total_sent_bytes: 0, + current_iteration_total_bytes: 0, + bandwidth_bytes_per_second: 0.0, + estimated_downtime: None, + migration_begin: Instant::now(), + migration_duration: None, + // Will be updated soon -> so this value is never read + iteration_begin: Instant::now(), + iteration_duration: None, + // Will be updated soon -> so this value is never read + transfer_begin: Instant::now(), + transfer_duration: None, + } + } + + /// Updates the metrics right before the transfer over the wire. + /// + /// Supposed to be called once per precopy memory iteration. + /// + /// This helps to feed the "is converged?" with fresh metrics to + /// potentially stop the precopy phase. + pub fn update_metrics_before_transfer( + &mut self, + iteration_begin: Instant, + iteration_table: &MemoryRangeTable, + ) { + self.iteration_begin = iteration_begin; + self.current_iteration_total_bytes = iteration_table.effective_size(); + self.estimated_downtime = if self.current_iteration_total_bytes == 0 { + Some(Duration::ZERO) + } else if self.bandwidth_bytes_per_second == 0.0 { + // Only happens on the very first iteration + None + } else { + let calculated_downtime_s = + self.current_iteration_total_bytes as f64 / (self.bandwidth_bytes_per_second); + Some(Duration::from_secs_f64(calculated_downtime_s)) + } + } + + /// Updates the metrics right after the transfer over the wire. + /// + /// Supposed to be called once per precopy memory iteration. 
+ /// + /// This updates the bandwidth and ensures that + /// [`Self::update_metrics_before_transfer`] operates on fresh metrics on + /// the new iteration. + /// + /// # Panics + /// + /// If the transfer duration is longer than the iteration duration, this + /// function panics. This can never happen with real-world data but in + /// artificial unit test scenarios. + pub fn update_metrics_after_transfer( + &mut self, + transfer_begin: Instant, + transfer_duration: Duration, + ) { + self.transfer_begin = transfer_begin; + self.transfer_duration = Some(transfer_duration); + self.total_sent_bytes += self.current_iteration_total_bytes; + self.bandwidth_bytes_per_second = + Self::calculate_bandwidth(self.current_iteration_total_bytes, transfer_duration); + + // We might have a few operations after that before the loop starts + // (e.g., logging) again, but practically, this is negligible for this + // metric. + self.iteration_duration = Some(self.iteration_begin.elapsed()); + + // Catch programming errors: + // unwrap is fine as both values are set by now + assert!( + self.iteration_duration.unwrap() >= self.transfer_duration.unwrap(), + "iteration_duration must be larger than transfer_duration: {}ms < {}ms", + self.iteration_duration.unwrap().as_millis(), + self.transfer_duration.unwrap().as_millis(), + ); + } + + /// Finalizes the metrics. + /// + /// From now on, the metrics are considered finalized and should not be + /// modified. They can be stored for further analysis. + #[inline] + pub fn finalize(&mut self) { + // Any overhead from the function call is negligible. + self.migration_duration = Some(self.migration_begin.elapsed()); + } + + /// Returns the average bandwidth over the whole duration of the migration. + #[inline] + pub fn average_bandwidth(&self) -> f64 { + Self::calculate_bandwidth(self.total_sent_bytes, self.migration_begin.elapsed()) + } + + /// Calculates the bandwidth in bytes per second. 
+ /// + /// Returns `0.0` if the duration is zero to avoid division by zero. + #[inline] + fn calculate_bandwidth(bytes: u64, duration: Duration) -> f64 { + if duration == Duration::ZERO { + 0.0 + } else { + bytes as f64 / duration.as_secs_f64() + } + } +} + +impl Default for MemoryMigrationContext { + fn default() -> Self { + Self::new() + } +} + +// The display format must be a compact one-liner to enable concise log messages per iteration. +impl Display for MemoryMigrationContext { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let curr_mib = self.current_iteration_total_bytes.div_ceil(1024 * 1024); + let total_mib = self.total_sent_bytes.div_ceil(1024 * 1024); + + // Current bandwidth in MiB/s + let curr_bw_mib_s = self.bandwidth_bytes_per_second / 1024.0 / 1024.0; + + // Time elapsed since memory migration start. + let elapsed = self + .migration_duration + .unwrap_or_else(|| Instant::now() - self.migration_begin) + .as_secs_f64(); + + // Internally, this again evaluates `self.migration_begin.elapsed()` + // but this is negligible. 
+ let avg_bw_mib_s = self.average_bandwidth() / 1024.0 / 1024.0; + + // Transfer duration and iteration overhead + let transfer_s = self.transfer_duration.map_or(0.0, |d| d.as_secs_f64()); + let iteration_overhead_ms = self + .iteration_duration + .and_then(|iter| { + self.transfer_duration.map(|tr| { + // This is guaranteed by update_metrics_after_transfer() + assert!(iter >= tr); + (iter - tr).as_millis() + }) + }) + .unwrap_or(0); + + let est_downtime_ms = self.estimated_downtime.map_or(0, |d| d.as_millis()); + + write!( + f, + "iter={} \ + curr={curr_mib}MiB \ + total={total_mib}MiB \ + bw={curr_bw_mib_s:.2}MiB/s \ + transfer={transfer_s:.2}s \ + overhead={iteration_overhead_ms}ms \ + est_downtime={est_downtime_ms}ms \ + elapsed={elapsed:.2}s \ + avg_bw={avg_bw_mib_s:.2}MiB/s", + self.iteration, + ) + } +} + +#[cfg(test)] +mod unit_tests { + use std::time::{Duration, Instant}; + + use super::*; + use crate::protocol::MemoryRange; + + fn make_table(bytes: u64) -> MemoryRangeTable { + let mut table = MemoryRangeTable::default(); + if bytes > 0 { + table.push(MemoryRange { + gpa: 0, + length: bytes, + }); + } + table + } + + /// A controlled migration scenario with fixed timing offsets. + /// + /// ```text + /// migration_begin + /// + 1.0s -> iteration_begin + /// + 1.1s -> transfer_begin + /// + 2.0s -> transfer ends (transfer_duration = 0.9s) + /// + 2.1s -> iteration ends (iteration_duration = 1.1s, overhead = 0.2s) + /// ``` + struct Scenario { + migration_begin: Instant, + iteration_begin: Instant, + transfer_begin: Instant, + transfer_duration: Duration, + } + + impl Scenario { + /// We use a fixed point in the past so all offsets are in the past too, + /// meaning elapsed() calls in the code under test will be >= our durations. + const FIXPOINT_PAST: Duration = Duration::from_secs(10); + + fn new() -> Self { + // Use a fixed point in the past so all offsets are in the past too, + // meaning elapsed() calls in the code under test will be >= our durations. 
+ let migration_begin = Instant::now() - Self::FIXPOINT_PAST; + Self { + migration_begin, + iteration_begin: migration_begin + Duration::from_millis(1000), + transfer_begin: migration_begin + Duration::from_millis(1100), + transfer_duration: Duration::from_millis(900), + } + } + + fn make_ctx(&self) -> MemoryMigrationContext { + let mut ctx = MemoryMigrationContext::new(); + // Override migration_begin with our controlled value. + ctx.migration_begin = self.migration_begin; + ctx + } + } + + #[test] + fn before_transfer_updates_begin_and_bytes() { + let s = Scenario::new(); + let mut ctx = s.make_ctx(); + + ctx.update_metrics_before_transfer(s.iteration_begin, &make_table(4096)); + + assert_eq!(ctx.iteration_begin, s.iteration_begin); + assert_eq!(ctx.current_iteration_total_bytes, 4096); + } + + #[test] + fn before_transfer_estimated_downtime() { + let s = Scenario::new(); + let mut ctx = s.make_ctx(); + + // Empty table -> zero downtime regardless of bandwidth + ctx.update_metrics_before_transfer(s.iteration_begin, &make_table(0)); + assert_eq!(ctx.estimated_downtime, Some(Duration::ZERO)); + + // No bandwidth yet -> None + ctx.update_metrics_before_transfer(s.iteration_begin, &make_table(1024)); + assert_eq!(ctx.estimated_downtime, None); + + // 1024 B/s, 1024 bytes -> 1s + ctx.bandwidth_bytes_per_second = 1024.0; + ctx.update_metrics_before_transfer(s.iteration_begin, &make_table(1024)); + assert_eq!(ctx.estimated_downtime, Some(Duration::from_secs(1))); + } + + #[test] + fn after_transfer_updates_timing_and_bandwidth() { + let s = Scenario::new(); + let mut ctx = s.make_ctx(); + + ctx.update_metrics_before_transfer(s.iteration_begin, &make_table(1024)); + ctx.update_metrics_after_transfer(s.transfer_begin, s.transfer_duration); + + assert_eq!(ctx.transfer_begin, s.transfer_begin); + assert_eq!(ctx.transfer_duration, Some(s.transfer_duration)); + // 1024 bytes / 0.9s + assert_eq!(ctx.bandwidth_bytes_per_second, 1024.0 / 0.9); + // iteration_duration = time from 
iteration_begin until now (>= transfer_duration) + assert!(ctx.iteration_duration.unwrap() >= s.transfer_duration); + // Zero transfer_duration -> bandwidth is 0.0, no division by zero + ctx.update_metrics_before_transfer(s.iteration_begin, &make_table(1024)); + ctx.update_metrics_after_transfer(s.transfer_begin, Duration::ZERO); + assert_eq!(ctx.bandwidth_bytes_per_second, 0.0); + + // Check finalize() sets migration duration + assert_eq!(ctx.migration_duration, None); + ctx.finalize(); + assert!(matches!(ctx.migration_duration, Some(d) if d >= Scenario::FIXPOINT_PAST)); + } + + #[test] + fn two_iterations_accumulate_bytes_and_feed_downtime_estimate() { + let s = Scenario::new(); + let mut ctx = s.make_ctx(); + + // Iteration 0: no bandwidth yet -> downtime is None + ctx.update_metrics_before_transfer(s.iteration_begin, &make_table(1024)); + assert_eq!(ctx.estimated_downtime, None); + ctx.update_metrics_after_transfer(s.transfer_begin, s.transfer_duration); + assert_eq!(ctx.total_sent_bytes, 1024); + + // Iteration 1: bandwidth now known -> downtime is Some + ctx.update_metrics_before_transfer(s.iteration_begin, &make_table(2048)); + assert!(ctx.estimated_downtime.is_some()); + ctx.update_metrics_after_transfer(s.transfer_begin, s.transfer_duration); + assert_eq!(ctx.total_sent_bytes, 1024 + 2048); + + // Check finalize() sets migration duration + assert_eq!(ctx.migration_duration, None); + ctx.finalize(); + assert!(matches!(ctx.migration_duration, Some(d) if d >= Scenario::FIXPOINT_PAST)); + } + + #[test] + /// The display format is specifically crafted to be very insightful in logs. + /// Therefore, we have a dedicated test for that format. 
+ fn display_format() { + let s = Scenario::new(); + let mut ctx = s.make_ctx(); + + // Iteration 0: 1 MiB in 1s + ctx.update_metrics_before_transfer(s.iteration_begin, &make_table(1024 * 1024)); + ctx.update_metrics_after_transfer(s.transfer_begin, Duration::from_secs(1)); + ctx.iteration += 1; + + // Iteration 1: 512 KiB in 1s; fix migration_duration for deterministic elapsed/avg_bw + ctx.update_metrics_before_transfer(s.iteration_begin, &make_table(512 * 1024)); + ctx.update_metrics_after_transfer(s.transfer_begin, Duration::from_secs(1)); + + ctx.migration_duration = Some(Duration::from_secs(2)); + let out = ctx.to_string(); + + assert_eq!( + out, + "iter=1 curr=1MiB total=2MiB bw=0.50MiB/s transfer=1.00s overhead=8000ms est_downtime=500ms elapsed=2.00s avg_bw=0.15MiB/s" + ); + + // Should change elapsed() time! + // Since this is at least 10s, we never face timing issues in CI! + ctx.finalize(); + let out2 = ctx.to_string(); + assert_ne!(out2, out, "elapsed time should have changed! is={out2}"); + } +} diff --git a/vm-migration/src/lib.rs b/vm-migration/src/lib.rs index 921ae5b3db..22da8df7f4 100644 --- a/vm-migration/src/lib.rs +++ b/vm-migration/src/lib.rs @@ -4,12 +4,14 @@ // use anyhow::anyhow; +pub use context::MemoryMigrationContext; use serde::{Deserialize, Serialize}; use thiserror::Error; use crate::protocol::MemoryRangeTable; mod bitpos_iterator; +mod context; pub mod protocol; #[derive(Error, Debug)] diff --git a/vm-migration/src/protocol.rs b/vm-migration/src/protocol.rs index 4dfec4f625..ab5975d4e5 100644 --- a/vm-migration/src/protocol.rs +++ b/vm-migration/src/protocol.rs @@ -376,6 +376,11 @@ impl MemoryRangeTable { } Self { data } } + + /// Returns the effective size in bytes. 
+ pub fn effective_size(&self) -> u64 { + self.data.iter().map(|r| r.length).sum() + } } #[cfg(test)] From 39768704f3227afad4a77f4dd36ce526d5bf686d Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Tue, 10 Mar 2026 09:20:03 +0100 Subject: [PATCH 140/742] vmm: migration: add iteration metrics and downtime estimation Add infrastructure to collect metrics during precopy memory migration iterations. For each iteration we now track transferred bytes, duration, bandwidth, and estimate the expected downtime based on the remaining memory of the current iteration and measured bandwidth. These metrics are logged and used to decide when to stop the precopy phase. This also introduces basic termination conditions such as: - maximum number of iterations - reaching a target downtime - maximum migration duration This is the foundation for an upcoming API call to publicly export statistics about an ongoing live migration. The changes are, however, self-contained and helpful by themselves. The new log now looks somewhat as in the following, providing lots of helpful insights (especially the bandwidth and estimated downtime are helpful).
The metrics were measured with CHV build with `--release`, a VM under heavy load (lots of memory writes), same-host TCP migration and prefault=on: ``` cloud-hypervisor: 12.702682s: DEBUG:vmm/src/lib.rs:1313 -- Precopy: iter=0 total=6144MiB curr=6144MiB bw=1986.83MiB/s transfer=3.09s overhead=0ms est_downtime=0ms elapsed=3.11s avg_bw=1975.41MiB/s cloud-hypervisor: 15.728419s: DEBUG:vmm/src/lib.rs:1313 -- Precopy: iter=1 total=11562MiB curr=5418MiB bw=1824.44MiB/s transfer=2.97s overhead=56ms est_downtime=2726ms elapsed=6.14s avg_bw=1884.21MiB/s cloud-hypervisor: 18.710428s: DEBUG:vmm/src/lib.rs:1313 -- Precopy: iter=2 total=16980MiB curr=5418MiB bw=1854.25MiB/s transfer=2.92s overhead=59ms est_downtime=2969ms elapsed=9.12s avg_bw=1862.17MiB/s cloud-hypervisor: 21.783699s: DEBUG:vmm/src/lib.rs:1313 -- Precopy: iter=3 total=22407MiB curr=5428MiB bw=1799.43MiB/s transfer=3.02s overhead=56ms est_downtime=2926ms elapsed=12.19s avg_bw=1837.92MiB/s cloud-hypervisor: 25.785696s: DEBUG:vmm/src/lib.rs:1313 -- Precopy: iter=4 total=27825MiB curr=5418MiB bw=1375.53MiB/s transfer=3.94s overhead=62ms est_downtime=3010ms elapsed=16.19s avg_bw=1718.26MiB/s cloud-hypervisor: 29.000349s: DEBUG:vmm/src/lib.rs:1313 -- Precopy: iter=5 total=33243MiB curr=5418MiB bw=1727.60MiB/s transfer=3.14s overhead=78ms est_downtime=3938ms elapsed=19.41s avg_bw=1712.82MiB/s cloud-hypervisor: 32.215805s: DEBUG:vmm/src/lib.rs:1313 -- Precopy: iter=6 total=38671MiB curr=5429MiB bw=1724.03MiB/s transfer=3.15s overhead=66ms est_downtime=3142ms elapsed=22.62s avg_bw=1709.33MiB/s cloud-hypervisor: 32.275215s: DEBUG:vmm/src/lib.rs:1286 -- Precopy converged: iter=7 total=38671MiB curr=5418MiB bw=1720.46MiB/s transfer=3.15s overhead=66ms est_downtime=3142ms elapsed=22.68s avg_bw=1704.85MiB/s ... 
cloud-hypervisor: 33.411682s: INFO:vmm/src/lib.rs:1365 -- Precopy complete: iter=8 total=44339MiB curr=5668MiB bw=1799.98MiB/s transfer=3.15s overhead=66ms est_downtime=3142ms elapsed=23.82s avg_bw=1861.45MiB/s ``` # Outlook We can add user-configurable downtimes and migration downtimes next. These changes are inspired by [0] but differ significantly in details. [0] https://github.com/cloud-hypervisor/cloud-hypervisor/pull/7033 Signed-off-by: Philipp Schuster On-behalf-of: SAP philipp.schuster@sap.com --- vmm/src/lib.rs | 62 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 18 deletions(-) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 4186386787..007db270e6 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -39,7 +39,10 @@ use tracer::trace_scoped; use vm_memory::bitmap::{AtomicBitmap, BitmapSlice}; use vm_memory::{ReadVolatile, VolatileMemoryError, VolatileSlice, WriteVolatile}; use vm_migration::protocol::*; -use vm_migration::{Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable}; +use vm_migration::{ + MemoryMigrationContext, Migratable, MigratableError, Pausable, Snapshot, Snapshottable, + Transportable, +}; use vmm_sys_util::eventfd::EventFd; use vmm_sys_util::signal::unblock_signal; use vmm_sys_util::sock_ctrl_msg::ScmSocket; @@ -1252,27 +1255,37 @@ impl Vmm { fn do_memory_iterations( vm: &mut Vm, socket: &mut SocketStream, - iteration_counter: &mut u64, - is_converged: impl Fn(u64) -> bool, + ctx: &mut MemoryMigrationContext, + is_converged: impl Fn(&MemoryMigrationContext) -> bool, ) -> result::Result { loop { - let iteration_table = if *iteration_counter == 0 { + let iteration_begin = Instant::now(); + + let iteration_table = if ctx.iteration == 0 { vm.memory_range_table()? } else { // TODO do this in a thread #7816 vm.dirty_log()? 
}; - if is_converged(*iteration_counter) { - debug!("Precopy converged: iter={iteration_counter}"); + ctx.update_metrics_before_transfer(iteration_begin, &iteration_table); + if is_converged(ctx) { + debug!("Precopy converged: {ctx}"); break Ok(iteration_table); } // Send the current dirty pages + let transfer_begin = Instant::now(); Self::vm_send_dirty_pages(vm, socket, &iteration_table)?; + let transfer_duration = transfer_begin.elapsed(); + ctx.update_metrics_after_transfer(transfer_begin, transfer_duration); - // Prepare next iteration. - *iteration_counter += 1; + // Log progress of the current iteration + debug!("Precopy: {ctx}"); + + // Increment iteration last: This way we ensure that the logging + // above matches the actual iteration. + ctx.iteration += 1; } } @@ -1286,22 +1299,35 @@ impl Vmm { vm: &mut Vm, socket: &mut SocketStream, ) -> result::Result<(), MigratableError> { - const MAX_ITERATIONS: u64 = 5; + const MAX_ITERATIONS: usize = 5; - let mut iteration_counter = 0; - let is_converged = |iteration_counter: u64| iteration_counter >= MAX_ITERATIONS; + let mut ctx = MemoryMigrationContext::new(); + let is_converged = |ctx: &MemoryMigrationContext| { + // TODO: Add check for configurable downtime and max migration time #7111 + ctx.iteration >= MAX_ITERATIONS || ctx.current_iteration_total_bytes == 0 + }; vm.start_dirty_log()?; - let remaining = - Self::do_memory_iterations(vm, socket, &mut iteration_counter, is_converged)?; + let remaining = Self::do_memory_iterations(vm, socket, &mut ctx, is_converged)?; vm.pause()?; - // Send last batch of dirty pages - let mut final_table = vm.dirty_log()?; - final_table.extend(remaining); - Vmm::vm_send_dirty_pages(vm, socket, &final_table)?; + // Send last batch of dirty pages: final iteration + { + let iteration_begin = Instant::now(); + + let mut final_table = vm.dirty_log()?; + final_table.extend(remaining); + + ctx.update_metrics_before_transfer(iteration_begin, &final_table); + let transfer_begin = 
Instant::now(); + Vmm::vm_send_dirty_pages(vm, socket, &final_table)?; + let transfer_duration = transfer_begin.elapsed(); + ctx.update_metrics_after_transfer(transfer_begin, transfer_duration); + ctx.iteration += 1; + } + ctx.finalize(); - info!("Memory migration complete"); + info!("Precopy complete: {ctx}"); Ok(()) } From fcdb10373bfc954711e9cdd915f09231f8690942 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Fri, 6 Mar 2026 13:16:45 +0100 Subject: [PATCH 141/742] vmm: migration: emit event for each memory iteration Emit a "vm.migration-memory-iteration" event after every precopy memory iteration to allow management software to observe forward progress during migration. This event is primarily intended for integration with management software such as libvirt, where it maps to VIR_DOMAIN_EVENT_ID_MIGRATION_ITERATION. The event is intentionally independent of any upcoming migration metrics endpoint. Detailed migration statistics will be exposed via that endpoint, while this event provides a lightweight progress signal expected by external management layers. With this event, management software can detect forward progress during migration without being blocked on any upcoming migration metrics endpoint. Signed-off-by: Philipp Schuster On-behalf-of: SAP philipp.schuster@sap.com --- vmm/src/lib.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 007db270e6..f03927ce3e 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -1283,6 +1283,14 @@ impl Vmm { // Log progress of the current iteration debug!("Precopy: {ctx}"); + // Enables management software (e.g., libvirt) to easily track forward progress. + event!( + "vm", + "migration-memory-iteration", + "id", + ctx.iteration.to_string() + ); + // Increment iteration last: This way we ensure that the logging // above matches the actual iteration. 
ctx.iteration += 1; From de37f2794503546dbde292f9c8c240640107aab5 Mon Sep 17 00:00:00 2001 From: Shayon Mukherjee Date: Fri, 13 Mar 2026 05:47:16 -0700 Subject: [PATCH 142/742] vmm: add userfaultfd constants module Add a small constants module with the ioctl numbers and protocol constants needed for userfaultfd-based demand-paged snapshot restore. These are derived from the kernel's include/uapi/linux/userfaultfd.h. Signed-off-by: Shayon Mukherjee --- vmm/src/userfaultfd.rs | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 vmm/src/userfaultfd.rs diff --git a/vmm/src/userfaultfd.rs b/vmm/src/userfaultfd.rs new file mode 100644 index 0000000000..3f8447a327 --- /dev/null +++ b/vmm/src/userfaultfd.rs @@ -0,0 +1,25 @@ +// Copyright © 2026 Cloud Hypervisor Authors +// +// SPDX-License-Identifier: Apache-2.0 + +// See include/uapi/linux/userfaultfd.h in the kernel code. +pub const UFFDIO_API: u64 = 0xc018_aa3f; // _IOWR(0xAA, 0x3F, struct uffdio_api) +pub const UFFDIO_REGISTER: u64 = 0xc020_aa00; // _IOWR(0xAA, 0x00, struct uffdio_register) +pub const UFFDIO_COPY: u64 = 0xc028_aa03; // _IOWR(0xAA, 0x03, struct uffdio_copy) +pub const UFFDIO_WAKE: u64 = 0x4010_aa02; // _IOW(0xAA, 0x02, struct uffdio_range) + +// Seccomp compares these as Dword (u32); ensure they fit. 
+const _: () = assert!(UFFDIO_API <= u32::MAX as u64); +const _: () = assert!(UFFDIO_REGISTER <= u32::MAX as u64); +const _: () = assert!(UFFDIO_COPY <= u32::MAX as u64); +const _: () = assert!(UFFDIO_WAKE <= u32::MAX as u64); + +pub const UFFD_API: u64 = 0xAA; +pub const UFFDIO_REGISTER_MODE_MISSING: u64 = 1; +pub const UFFD_EVENT_PAGEFAULT: u8 = 0x12; +pub const UFFD_FEATURE_MISSING_SHMEM: u64 = 1 << 5; +pub const UFFD_FEATURE_MISSING_HUGETLBFS: u64 = 1 << 6; + +const _UFFDIO_COPY: u64 = 0x03; +const _UFFDIO_WAKE: u64 = 0x02; +pub const UFFD_API_RANGE_IOCTLS_BASIC: u64 = (1 << _UFFDIO_WAKE) | (1 << _UFFDIO_COPY); From 8340307ace21305ad76757159dac88432dd4c8d6 Mon Sep 17 00:00:00 2001 From: Shayon Mukherjee Date: Fri, 13 Mar 2026 05:47:16 -0700 Subject: [PATCH 143/742] vmm: add uffd abstraction module Add safe Rust wrappers around the raw userfaultfd ioctls: create (syscall + API handshake), register (missing-page mode), copy (resolve fault), and wake (unblock threads after EEXIST race). These are used by the demand-paged snapshot restore handler in a subsequent commit. Signed-off-by: Shayon Mukherjee --- vmm/src/uffd.rs | 167 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 167 insertions(+) create mode 100644 vmm/src/uffd.rs diff --git a/vmm/src/uffd.rs b/vmm/src/uffd.rs new file mode 100644 index 0000000000..eb73f46592 --- /dev/null +++ b/vmm/src/uffd.rs @@ -0,0 +1,167 @@ +// Copyright © 2026 Cloud Hypervisor Authors +// +// SPDX-License-Identifier: Apache-2.0 + +//! Minimal userfaultfd bindings for demand-paged snapshot restore. +//! +//! Uses the `userfaultfd(2)` syscall (available since Linux 4.3) to create a +//! fault descriptor, then `UFFDIO_API` / `UFFDIO_REGISTER` / `UFFDIO_COPY` +//! ioctls to handle page faults from a background thread. +//! +//! Unlike an mmap(MAP_PRIVATE) overlay approach, UFFD does not replace the +//! original memory mapping, so it remains compatible with VFIO device +//! 
passthrough and shared-memory-backed guest RAM. + +use std::os::fd::{AsRawFd, BorrowedFd, FromRawFd, OwnedFd}; + +use crate::userfaultfd; + +#[repr(C)] +pub(crate) struct UffdioApi { + pub api: u64, + pub features: u64, + pub ioctls: u64, +} + +#[repr(C)] +pub(crate) struct UffdioRegister { + pub range_start: u64, + pub range_len: u64, + pub mode: u64, + pub ioctls: u64, +} + +#[repr(C)] +pub(crate) struct UffdioCopy { + pub dst: u64, + pub src: u64, + pub len: u64, + pub mode: u64, + pub copy: i64, +} + +/// Flat representation of `struct uffd_msg` (32 bytes). +/// +/// The kernel struct contains an 8-byte header followed by a 24-byte +/// union (`arg`). We only use the `arg.pagefault` variant, so the +/// union is flattened into its pagefault fields here. The trailing +/// 8 bytes (`arg.pagefault.feat` + padding) are unused. +#[repr(C)] +pub(crate) struct UffdMsg { + pub event: u8, + _reserved1: u8, + _reserved2: u16, + _reserved3: u32, + pub pf_flags: u64, + pub pf_address: u64, + _pad: [u8; 8], +} + +const _: () = assert!(std::mem::size_of::() == 32); + +/// Create a userfaultfd file descriptor and perform the API handshake. +pub(crate) fn create(required_features: u64) -> Result { + // SAFETY: `userfaultfd` syscall with O_CLOEXEC | O_NONBLOCK flags. + let fd = unsafe { libc::syscall(libc::SYS_userfaultfd, libc::O_CLOEXEC | libc::O_NONBLOCK) }; + if fd < 0 { + return Err(std::io::Error::last_os_error()); + } + // SAFETY: the syscall returned a valid fd above. + let fd = unsafe { OwnedFd::from_raw_fd(fd as std::os::unix::io::RawFd) }; + + let mut api = UffdioApi { + api: userfaultfd::UFFD_API, + features: required_features, + ioctls: 0, + }; + // SAFETY: `api` is a valid, correctly-sized struct for this ioctl. 
+ let ret = unsafe { + libc::ioctl( + fd.as_raw_fd(), + userfaultfd::UFFDIO_API as libc::Ioctl, + &mut api, + ) + }; + if ret < 0 { + return Err(std::io::Error::last_os_error()); + } + + Ok(fd) +} + +/// Register a memory range for missing-page fault handling. +pub(crate) fn register(fd: BorrowedFd<'_>, addr: u64, len: u64) -> Result { + let mut reg = UffdioRegister { + range_start: addr, + range_len: len, + mode: userfaultfd::UFFDIO_REGISTER_MODE_MISSING, + ioctls: 0, + }; + // SAFETY: `reg` is a valid, correctly-sized struct for this ioctl. + let ret = unsafe { + libc::ioctl( + fd.as_raw_fd(), + userfaultfd::UFFDIO_REGISTER as libc::Ioctl, + &mut reg, + ) + }; + if ret < 0 { + return Err(std::io::Error::last_os_error()); + } + Ok(reg.ioctls) +} + +/// Resolve a page fault by copying data into the faulted address. +pub(crate) fn copy( + fd: BorrowedFd<'_>, + dst: u64, + src: *const u8, + len: u64, +) -> Result<(), std::io::Error> { + let mut cp = UffdioCopy { + dst, + src: src as u64, + len, + mode: 0, + copy: 0, + }; + // SAFETY: `cp` is a valid, correctly-sized struct for this ioctl. + let ret = unsafe { + libc::ioctl( + fd.as_raw_fd(), + userfaultfd::UFFDIO_COPY as libc::Ioctl, + &mut cp, + ) + }; + if ret < 0 { + return Err(std::io::Error::last_os_error()); + } + Ok(()) +} + +#[repr(C)] +struct UffdioRange { + start: u64, + len: u64, +} + +/// Wake threads waiting on a fault in the given range without copying data. +/// +/// Needed after UFFDIO_COPY returns EEXIST: the page was already resolved +/// by a concurrent fault, but any additional threads blocked on that page +/// may not have been woken. +pub(crate) fn wake(fd: BorrowedFd<'_>, addr: u64, len: u64) -> Result<(), std::io::Error> { + let mut range = UffdioRange { start: addr, len }; + // SAFETY: `range` is a valid, correctly-sized struct for this ioctl. 
+ let ret = unsafe { + libc::ioctl( + fd.as_raw_fd(), + userfaultfd::UFFDIO_WAKE as libc::Ioctl, + &mut range, + ) + }; + if ret < 0 { + return Err(std::io::Error::last_os_error()); + } + Ok(()) +} From bf85af907e000b02aa6327a59f5fdc6b3ac22b33 Mon Sep 17 00:00:00 2001 From: Shayon Mukherjee Date: Fri, 13 Mar 2026 05:47:16 -0700 Subject: [PATCH 144/742] vmm: config: add memory_restore_mode to RestoreConfig Add a MemoryRestoreMode enum (Copy | OnDemand) to RestoreConfig so the restore path can be selected at restore time. Copy preserves the existing eager read-copy behavior. OnDemand enables userfaultfd-based demand paging and fails restore if the kernel does not support it. Validate that prefault=on is not combined with OnDemand mode. Update the OpenAPI spec with the new enum field. Signed-off-by: Shayon Mukherjee --- docs/snapshot_restore.md | 22 ++++++ vmm/src/api/openapi/cloud-hypervisor.yaml | 7 ++ vmm/src/config.rs | 95 ++++++++++++++++++++++- 3 files changed, 121 insertions(+), 3 deletions(-) diff --git a/docs/snapshot_restore.md b/docs/snapshot_restore.md index df7248805e..2cf8eda5a8 100644 --- a/docs/snapshot_restore.md +++ b/docs/snapshot_restore.md @@ -93,6 +93,28 @@ start using it. At this point, the VM is fully restored and is identical to the VM which was snapshot earlier. +Restore also supports selecting how guest memory is populated: + +```bash +./cloud-hypervisor \ + --api-socket /tmp/cloud-hypervisor.sock \ + --restore source_url=file:///home/foo/snapshot,memory_restore_mode=ondemand +``` + +If `memory_restore_mode` is omitted, Cloud Hypervisor uses the eager-copy +restore path (`copy`). + +With `memory_restore_mode=ondemand`, restore uses `userfaultfd` to fault snapshot +pages in on first access instead of copying the full `memory-ranges` file into +guest RAM before restore completes. This mode is strict: if Cloud Hypervisor +cannot enable the `userfaultfd` restore path, restore fails instead of falling +back to `copy`. 
+ +Current constraints for `memory_restore_mode=ondemand`: + +- `prefault=on` is not supported +- the snapshot memory ranges must be page-aligned + ## Restore a VM with new Net FDs For a VM created with FDs explicitly passed to NetConfig, a set of valid FDs need to be provided along with the VM restore command in the following syntax: diff --git a/vmm/src/api/openapi/cloud-hypervisor.yaml b/vmm/src/api/openapi/cloud-hypervisor.yaml index efdcf7a678..8bdf14e50f 100644 --- a/vmm/src/api/openapi/cloud-hypervisor.yaml +++ b/vmm/src/api/openapi/cloud-hypervisor.yaml @@ -1342,6 +1342,11 @@ components: destination_url: type: string + MemoryRestoreMode: + type: string + enum: [Copy, OnDemand] + default: Copy + RestoreConfig: required: - source_url @@ -1351,6 +1356,8 @@ components: type: string prefault: type: boolean + memory_restore_mode: + $ref: "#/components/schemas/MemoryRestoreMode" ReceiveMigrationData: required: diff --git a/vmm/src/config.rs b/vmm/src/config.rs index 42cbcfdbc0..8b284660a1 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -356,6 +356,9 @@ pub enum ValidationError { /// Number of FDs passed during Restore are incorrect to the NetConfig #[error("Number of Net FDs passed for '{0}' during Restore: {1}. Expected: {2}")] RestoreNetFdCountMismatch(String, usize, usize), + /// Prefault cannot be combined with on-demand restore + #[error("'prefault' cannot be combined with 'memory_restore_mode=ondemand'")] + InvalidRestorePrefaultWithOnDemand, /// Path provided in landlock-rules doesn't exist #[error("Path {0:?} provided in landlock-rules does not exist")] LandlockPathDoesNotExist(PathBuf), @@ -2564,27 +2567,61 @@ where } } +#[derive(Clone, Copy, Debug, PartialEq, Eq, Deserialize, Serialize, Default)] +pub enum MemoryRestoreMode { + /// Restore by eagerly copying the snapshot into guest RAM before resume. + #[default] + Copy, + /// Restore lazily by faulting snapshot pages into guest RAM on demand. 
+ OnDemand, +} + +#[derive(Debug, Error)] +pub enum MemoryRestoreModeParseError { + #[error("Invalid value: {0}")] + InvalidValue(String), +} + +impl FromStr for MemoryRestoreMode { + type Err = MemoryRestoreModeParseError; + + fn from_str(s: &str) -> result::Result { + match s.to_lowercase().as_str() { + "copy" => Ok(Self::Copy), + "ondemand" => Ok(Self::OnDemand), + _ => Err(MemoryRestoreModeParseError::InvalidValue(s.to_owned())), + } + } +} + #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize, Default)] pub struct RestoreConfig { pub source_url: PathBuf, #[serde(default)] pub prefault: bool, #[serde(default)] + pub memory_restore_mode: MemoryRestoreMode, + #[serde(default)] pub net_fds: Option>, } impl RestoreConfig { pub const SYNTAX: &'static str = "Restore from a VM snapshot. \ - \nRestore parameters \"source_url=,prefault=on|off,\ + \nRestore parameters \"source_url=,prefault=on|off,memory_restore_mode=copy|ondemand,\ net_fds=\" \ \n`source_url` should be a valid URL (e.g file:///foo/bar or tcp://192.168.1.10/foo) \ - \n`prefault` brings memory pages in when enabled (disabled by default) \ + \n`prefault` controls eager prefaulting for the copy-based restore path (disabled by default) \ + \n`memory_restore_mode=copy` preserves the existing eager read-copy restore behavior, while `memory_restore_mode=ondemand` enables lazy demand paging and fails restore if userfaultfd support is unavailable \ \n`net_fds` is a list of net ids with new file descriptors. \ Only net devices backed by FDs directly are needed as input."; pub fn parse(restore: &str) -> Result { let mut parser = OptionParser::new(); - parser.add("source_url").add("prefault").add("net_fds"); + parser + .add("source_url") + .add("prefault") + .add("memory_restore_mode") + .add("net_fds"); parser.parse(restore).map_err(Error::ParseRestore)?; let source_url = parser @@ -2596,6 +2633,10 @@ impl RestoreConfig { .map_err(Error::ParseRestore)? 
.unwrap_or(Toggle(false)) .0; + let memory_restore_mode = parser + .convert::("memory_restore_mode") + .map_err(Error::ParseRestore)? + .unwrap_or_default(); let net_fds = parser .convert::>>("net_fds") .map_err(Error::ParseRestore)? @@ -2612,6 +2653,7 @@ impl RestoreConfig { Ok(RestoreConfig { source_url, prefault, + memory_restore_mode, net_fds, }) } @@ -2620,6 +2662,10 @@ impl RestoreConfig { // corresponding 'RestoreNetConfig' with a matched 'id' and expected // number of FDs. pub fn validate(&self, vm_config: &VmConfig) -> ValidationResult<()> { + if self.memory_restore_mode == MemoryRestoreMode::OnDemand && self.prefault { + return Err(ValidationError::InvalidRestorePrefaultWithOnDemand); + } + let mut restored_net_with_fds = HashMap::new(); for n in self.net_fds.iter().flatten() { assert_eq!( @@ -4498,6 +4544,7 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" RestoreConfig { source_url: PathBuf::from("/path/to/snapshot"), prefault: false, + memory_restore_mode: MemoryRestoreMode::Copy, net_fds: None, } ); @@ -4508,6 +4555,7 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" RestoreConfig { source_url: PathBuf::from("/path/to/snapshot"), prefault: false, + memory_restore_mode: MemoryRestoreMode::Copy, net_fds: Some(vec![ RestoredNetConfig { id: "net0".to_string(), @@ -4522,11 +4570,39 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" ]), } ); + assert_eq!( + RestoreConfig::parse("source_url=/path/to/snapshot,memory_restore_mode=ondemand")?, + RestoreConfig { + source_url: PathBuf::from("/path/to/snapshot"), + prefault: false, + memory_restore_mode: MemoryRestoreMode::OnDemand, + net_fds: None, + } + ); // Parsing should fail as source_url is a required field RestoreConfig::parse("prefault=off").unwrap_err(); + RestoreConfig::parse("source_url=/path/to/snapshot,memory_restore_mode=bogus").unwrap_err(); Ok(()) } + #[test] + fn test_restore_config_serde() { + assert_eq!( + 
serde_json::from_str::(r#"{"source_url":"/path/to/snapshot"}"#) + .unwrap() + .memory_restore_mode, + MemoryRestoreMode::Copy + ); + assert_eq!( + serde_json::from_str::( + r#"{"source_url":"/path/to/snapshot","memory_restore_mode":"OnDemand"}"# + ) + .unwrap() + .memory_restore_mode, + MemoryRestoreMode::OnDemand + ); + } + #[test] fn test_restore_config_validation() { // interested in only VmConfig.net, so set rest to default values @@ -4589,6 +4665,7 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" let valid_config = RestoreConfig { source_url: PathBuf::from("/path/to/snapshot"), prefault: false, + memory_restore_mode: MemoryRestoreMode::Copy, net_fds: Some(vec![ RestoredNetConfig { id: "net0".to_string(), @@ -4663,6 +4740,7 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" let another_valid_config = RestoreConfig { source_url: PathBuf::from("/path/to/snapshot"), prefault: false, + memory_restore_mode: MemoryRestoreMode::Copy, net_fds: None, }; snapshot_vm_config.net = Some(vec![NetConfig { @@ -4671,6 +4749,17 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" ..net_fixture() }]); another_valid_config.validate(&snapshot_vm_config).unwrap(); + + let invalid_restore_mode = RestoreConfig { + source_url: PathBuf::from("/path/to/snapshot"), + prefault: true, + memory_restore_mode: MemoryRestoreMode::OnDemand, + net_fds: None, + }; + assert_eq!( + invalid_restore_mode.validate(&snapshot_vm_config), + Err(ValidationError::InvalidRestorePrefaultWithOnDemand) + ); } fn platform_fixture() -> PlatformConfig { From c417924a29566945d3ec96eb99cc3dbddf4c6692 Mon Sep 17 00:00:00 2001 From: Shayon Mukherjee Date: Fri, 13 Mar 2026 08:16:33 -0700 Subject: [PATCH 145/742] vmm: memory_manager: add on-demand snapshot restore via userfaultfd When memory_restore_mode=ondemand is specified on the restore command, the memory manager creates a userfaultfd descriptor, registers each guest RAM range for missing-page fault interception, 
and spawns a handler thread that serves page faults from the snapshot file using UFFDIO_COPY. This avoids reading the entire memory-ranges file into guest RAM before restore completes. The handler uses epoll to multiplex the userfaultfd and a stop eventfd for clean shutdown. Concurrent faults from multiple vCPUs are handled by treating EEXIST as a benign race and waking blocked threads with UFFDIO_WAKE. Once all pages have been served the handler exits automatically. If the handler thread panics the VMM is signalled to exit since the VM cannot continue without page fault service. MemoryZone gains a backing_page_size field so the handler resolves fault granularity from the zone rather than the top-level config. Errors from the UFFD setup path use a structured UffdError enum and a new MigratableError::OnDemandRestore variant, with a From impl to keep call sites concise. The seccomp filter is updated to allow the userfaultfd syscall and the four uffd ioctls (UFFDIO_API, UFFDIO_COPY, UFFDIO_REGISTER, UFFDIO_WAKE) under the VMM thread profile. 
Signed-off-by: Shayon Mukherjee --- vm-migration/src/lib.rs | 35 +++ vmm/src/lib.rs | 31 ++- vmm/src/memory_manager.rs | 456 ++++++++++++++++++++++++++++++++++++- vmm/src/seccomp_filters.rs | 7 + vmm/src/vm.rs | 7 +- 5 files changed, 512 insertions(+), 24 deletions(-) diff --git a/vm-migration/src/lib.rs b/vm-migration/src/lib.rs index 22da8df7f4..2283ff8bed 100644 --- a/vm-migration/src/lib.rs +++ b/vm-migration/src/lib.rs @@ -14,6 +14,38 @@ mod bitpos_iterator; mod context; pub mod protocol; +#[derive(Error, Debug)] +pub enum UffdError { + #[error("Snapshot ranges are not page-aligned")] + UnalignedRanges, + + #[error("Failed to create userfaultfd")] + Create(#[source] std::io::Error), + + #[error("Cannot translate GPA {gpa:#x} to host address")] + GpaTranslation { gpa: u64 }, + + #[error("Failed to register region at {addr:#x}+{len:#x}")] + Register { + addr: u64, + len: u64, + #[source] + source: std::io::Error, + }, + + #[error("Region at {addr:#x}+{len:#x} missing COPY/WAKE support")] + MissingIoctlSupport { addr: u64, len: u64 }, + + #[error("Failed to spawn handler thread")] + SpawnThread(#[source] std::io::Error), + + #[error("Handler terminated before startup completed")] + HandlerStartup, + + #[error("Handler failed after startup")] + HandlerFailed(#[source] std::io::Error), +} + #[derive(Error, Debug)] pub enum MigratableError { #[error("Failed to pause migratable component")] @@ -34,6 +66,9 @@ pub enum MigratableError { #[error("Failed to receive migratable component snapshot")] MigrateReceive(#[source] anyhow::Error), + #[error("On-demand restore failed")] + OnDemandRestore(#[source] UffdError), + #[error("Socket error")] MigrateSocket(#[source] std::io::Error), diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index f03927ce3e..0b82be49e6 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -51,7 +51,7 @@ use crate::api::{ ApiRequest, ApiResponse, RequestHandler, VmInfoResponse, VmReceiveMigrationData, VmSendMigrationData, VmmPingResponse, }; -use 
crate::config::{RestoreConfig, add_to_config}; +use crate::config::{MemoryRestoreMode, RestoreConfig, add_to_config}; #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] use crate::coredump::GuestDebuggable; use crate::landlock::Landlock; @@ -88,6 +88,8 @@ mod pci_segment; pub mod seccomp_filters; mod serial_manager; mod sigwinch_listener; +mod uffd; +mod userfaultfd; pub mod vm; pub mod vm_config; @@ -1506,6 +1508,7 @@ impl Vmm { source_url: &str, vm_config: Arc>, prefault: bool, + memory_restore_mode: MemoryRestoreMode, ) -> std::result::Result<(), VmError> { let snapshot = recv_vm_state(source_url).map_err(VmError::Restore)?; #[cfg(all(feature = "kvm", target_arch = "x86_64"))] @@ -1548,6 +1551,7 @@ impl Vmm { Some(&snapshot), Some(source_url), Some(prefault), + Some(memory_restore_mode), )?; self.vm = Some(vm); @@ -1754,6 +1758,7 @@ impl RequestHandler for Vmm { None, None, None, + None, )?; self.vm = Some(vm); @@ -1838,17 +1843,22 @@ impl RequestHandler for Vmm { } } - self.vm_restore(source_url, vm_config, restore_cfg.prefault) - .map_err(|vm_restore_err| { - error!("VM Restore failed: {vm_restore_err:?}"); + self.vm_restore( + source_url, + vm_config, + restore_cfg.prefault, + restore_cfg.memory_restore_mode, + ) + .map_err(|vm_restore_err| { + error!("VM Restore failed: {vm_restore_err:?}"); - // Cleanup the VM being created while vm restore - if let Err(e) = self.vm_delete() { - return e; - } + // Cleanup the VM being created while vm restore + if let Err(e) = self.vm_delete() { + return e; + } - vm_restore_err - }) + vm_restore_err + }) } #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] @@ -1930,6 +1940,7 @@ impl RequestHandler for Vmm { None, None, None, + None, )?; // And we boot it diff --git a/vmm/src/memory_manager.rs b/vmm/src/memory_manager.rs index 2cb735ecd1..ba0313b29a 100644 --- a/vmm/src/memory_manager.rs +++ b/vmm/src/memory_manager.rs @@ -7,13 +7,13 @@ use std::collections::BTreeMap; use std::collections::HashMap; use 
std::fs::{File, OpenOptions}; -use std::io::{self}; +use std::io::{self, Read as _, Seek, SeekFrom}; use std::ops::{BitAnd, Not, Sub}; -#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] -use std::os::fd::AsFd; +use std::os::fd::{AsFd, OwnedFd}; use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::mpsc::{self, Receiver, SyncSender}; use std::sync::{Arc, Barrier, Mutex}; use std::{ffi, result, thread}; @@ -44,15 +44,31 @@ use vm_memory::{ use vm_migration::protocol::{MemoryRange, MemoryRangeTable}; use vm_migration::{ Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable, Transportable, + UffdError, }; +use vmm_sys_util::eventfd::EventFd; +use crate::config::MemoryRestoreMode; #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] use crate::coredump::{ CoredumpMemoryRegion, CoredumpMemoryRegions, DumpState, GuestDebuggableError, }; use crate::migration::url_to_path; use crate::vm_config::{HotplugMethod, MemoryConfig, MemoryZoneConfig}; -use crate::{GuestMemoryMmap, GuestRegionMmap, MEMORY_MANAGER_SNAPSHOT_ID}; +use crate::{GuestMemoryMmap, GuestRegionMmap, MEMORY_MANAGER_SNAPSHOT_ID, uffd}; + +struct UffdHandler { + stop_event: EventFd, + result_rx: Receiver>, + handle: thread::JoinHandle<()>, +} + +struct UffdRange { + host_addr: u64, + length: u64, + file_offset: u64, + page_size: u64, +} pub const MEMORY_MANAGER_ACPI_SIZE: usize = 0x18; @@ -116,13 +132,25 @@ impl VirtioMemZone { } } -#[derive(Default)] pub struct MemoryZone { regions: Vec>, virtio_mem_zone: Option, + shared: bool, + hugepages: bool, + backing_page_size: u64, } impl MemoryZone { + fn new(shared: bool, hugepages: bool, backing_page_size: u64) -> Self { + Self { + regions: Vec::new(), + virtio_mem_zone: None, + shared, + hugepages, + backing_page_size, + } + } + pub fn regions(&self) -> &Vec> { &self.regions } @@ -132,6 +160,21 @@ impl 
MemoryZone { pub fn virtio_mem_zone_mut(&mut self) -> Option<&mut VirtioMemZone> { self.virtio_mem_zone.as_mut() } + + fn backing_page_size_for_gpa(&self, gpa: u64) -> Option { + if self.regions.iter().any(|region| { + let start = region.start_addr().raw_value(); + gpa >= start && gpa < start + region.len() + }) { + return Some(self.backing_page_size); + } + + self.virtio_mem_zone.as_ref().and_then(|virtio_mem_zone| { + let start = virtio_mem_zone.region.start_addr().raw_value(); + (gpa >= start && gpa < start + virtio_mem_zone.region.len()) + .then_some(self.backing_page_size) + }) + } } pub type MemoryZones = HashMap; @@ -187,6 +230,7 @@ pub struct MemoryManager { // This is useful for getting the dirty pages as we need to know the // slots that the mapping is created in. guest_ram_mappings: Vec, + uffd_handler: Option, pub acpi_address: Option, #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] @@ -350,6 +394,12 @@ pub enum Error { MisalignedMemorySize, } +impl From for Error { + fn from(e: UffdError) -> Self { + Error::Restore(MigratableError::OnDemandRestore(e)) + } +} + const ENABLE_FLAG: usize = 0; const INSERTING_FLAG: usize = 1; const REMOVING_FLAG: usize = 2; @@ -551,7 +601,10 @@ impl MemoryManager { } // Add zone id to the list of memory zones. 
- memory_zones.insert(zone.id.clone(), MemoryZone::default()); + memory_zones.insert( + zone.id.clone(), + MemoryZone::new(zone.shared, zone.hugepages, zone_align_size), + ); for ram_region in ram_regions.iter() { let mut ram_region_offset = 0; @@ -642,7 +695,10 @@ impl MemoryManager { ); return Err(Error::DuplicateZoneId); } - memory_zones.insert(zone.id.clone(), MemoryZone::default()); + memory_zones.insert( + zone.id.clone(), + MemoryZone::new(zone.shared, zone.hugepages, zone_align_size), + ); } if ram_region_consumed { @@ -670,7 +726,11 @@ impl MemoryManager { let mut memory_zones = HashMap::new(); for zone_config in zones_config { - memory_zones.insert(zone_config.id.clone(), MemoryZone::default()); + let zone_page_size = memory_zone_get_align_size(zone_config)?; + memory_zones.insert( + zone_config.id.clone(), + MemoryZone::new(zone_config.shared, zone_config.hugepages, zone_page_size), + ); } for guest_ram_mapping in guest_ram_mappings { @@ -760,6 +820,360 @@ impl MemoryManager { Ok(()) } + /// Restore guest memory using userfaultfd for lazy demand paging. + /// + /// Instead of reading the entire snapshot into guest RAM upfront (which + /// blocks restore for hundreds of milliseconds at multi-GB sizes), this + /// registers the guest memory regions with a userfaultfd. A background + /// thread handles page faults by reading the corresponding page from the + /// snapshot file and copying it into guest memory via `UFFDIO_COPY`. + /// + /// This preserves the original memory mapping type (anonymous or shared), + /// making it compatible with VFIO device passthrough and shared-memory + /// guest RAM. + /// + /// Fails the restore if UFFD setup cannot be completed successfully. + /// + /// The handler thread keeps the snapshot file open while lazy restore + /// is active. The file must remain available until the VM is shut down or + /// all faulted pages have been served. 
+ fn restore_by_uffd( + &mut self, + file_path: &Path, + saved_regions: &MemoryRangeTable, + exit_evt: &EventFd, + ) -> Result<(), Error> { + if saved_regions.is_empty() { + return Ok(()); + } + + let guest_memory = self.guest_memory.memory(); + let required_uffd_features = self.required_uffd_features(); + + // SAFETY: FFI call. Trivially safe. + let base_page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) } as u64; + + info!( + "UFFD restore: attempting demand-paged restore for {} region(s)", + saved_regions.regions().len() + ); + + if saved_regions + .regions() + .iter() + .any(|range| range.gpa % base_page_size != 0 || range.length % base_page_size != 0) + { + return Err(UffdError::UnalignedRanges.into()); + } + + let snapshot_file = File::open(file_path).map_err(Error::SnapshotOpen)?; + + let uffd_fd = uffd::create(required_uffd_features).map_err(UffdError::Create)?; + + let mut handler_ranges: Vec = Vec::new(); + let mut file_offset: u64 = 0; + + for range in saved_regions.regions() { + let host_addr = guest_memory + .get_host_address(GuestAddress(range.gpa)) + .map_err(|_| UffdError::GpaTranslation { gpa: range.gpa })? 
+ as u64; + + let ioctls = uffd::register(uffd_fd.as_fd(), host_addr, range.length).map_err(|e| { + UffdError::Register { + addr: host_addr, + len: range.length, + source: e, + } + })?; + + if ioctls & crate::userfaultfd::UFFD_API_RANGE_IOCTLS_BASIC + != crate::userfaultfd::UFFD_API_RANGE_IOCTLS_BASIC + { + return Err(UffdError::MissingIoctlSupport { + addr: host_addr, + len: range.length, + } + .into()); + } + + let range_page_size = self + .memory_zones + .values() + .find_map(|zone| zone.backing_page_size_for_gpa(range.gpa)) + .unwrap_or(base_page_size); + + handler_ranges.push(UffdRange { + host_addr, + length: range.length, + file_offset, + page_size: range_page_size, + }); + + file_offset += range.length; + } + + info!( + "UFFD restore: registered {} region(s), {} total bytes, spawning handler", + handler_ranges.len(), + file_offset + ); + + let stop_event = EventFd::new(libc::EFD_NONBLOCK).map_err(Error::EventFdFail)?; + let thread_stop_event = stop_event.try_clone().map_err(Error::EventFdFail)?; + let thread_exit_evt = exit_evt.try_clone().map_err(Error::EventFdFail)?; + let (ready_tx, ready_rx) = mpsc::sync_channel(1); + let (result_tx, result_rx) = mpsc::sync_channel(1); + let handle = thread::Builder::new() + .name("uffd-handler".to_string()) + .spawn(move || { + std::panic::catch_unwind(std::panic::AssertUnwindSafe(move || { + let max_page_size = handler_ranges + .iter() + .map(|r| r.page_size) + .max() + .unwrap_or(base_page_size); + let result = Self::uffd_handler_loop( + uffd_fd, + thread_stop_event, + snapshot_file, + &handler_ranges, + max_page_size, + &ready_tx, + ); + + if let Err(e) = &result { + error!("UFFD handler exited with error: {e}"); + } + + result_tx.send(result).ok(); + })) + .map_err(|_| { + error!("uffd-handler thread panicked"); + thread_exit_evt.write(1).ok(); + }) + .ok(); + }) + .map_err(UffdError::SpawnThread)?; + + if ready_rx.recv().is_err() { + handle.join().ok(); + return Err(UffdError::HandlerStartup.into()); + } + + if 
let Ok(Err(e)) = result_rx.try_recv() { + handle.join().ok(); + return Err(UffdError::HandlerFailed(e).into()); + } + + self.uffd_handler = Some(UffdHandler { + stop_event, + result_rx, + handle, + }); + + info!("UFFD restore: demand-paged restore enabled"); + + Ok(()) + } + + fn required_uffd_features(&self) -> u64 { + let mut features = 0u64; + if self.memory_zones.values().any(|z| z.shared && !z.hugepages) { + features |= crate::userfaultfd::UFFD_FEATURE_MISSING_SHMEM; + } + if self.memory_zones.values().any(|z| z.hugepages) { + features |= crate::userfaultfd::UFFD_FEATURE_MISSING_HUGETLBFS; + } + features + } + + fn stop_uffd_handler(&mut self) { + if let Some(uffd_handler) = self.uffd_handler.take() { + uffd_handler.stop_event.write(1).ok(); + uffd_handler.handle.join().ok(); + + match uffd_handler.result_rx.try_recv() { + Ok(Err(e)) => error!("UFFD handler terminated with error: {e}"), + Err(mpsc::TryRecvError::Disconnected) => { + warn!("UFFD handler terminated unexpectedly (possible panic)"); + } + _ => {} + } + } + } + + /// Poll the UFFD fd and serve page faults from the snapshot file. + /// + /// Runs until the fd is closed (EPOLLHUP) or an unrecoverable error occurs. + /// Each fault triggers a seek + read from the snapshot file followed by a + /// `UFFDIO_COPY` to resolve the fault and wake the faulting thread. + #[allow(clippy::needless_pass_by_value)] + fn uffd_handler_loop( + uffd_fd: OwnedFd, + stop_event: EventFd, + mut snapshot_file: File, + ranges: &[UffdRange], + page_size: u64, + ready_tx: &SyncSender<()>, + ) -> Result<(), io::Error> { + let uffd_raw_fd = uffd_fd.as_raw_fd(); + let mut page_buf = vec![0u8; page_size as usize]; + + let total_pages: u64 = ranges.iter().map(|r| r.length.div_ceil(r.page_size)).sum(); + let mut pages_served: u64 = 0; + + const EVENT_STOP: u64 = 0; + const EVENT_UFFD: u64 = 1; + + let epoll_fd = epoll::create(true).map_err(io::Error::other)?; + // SAFETY: epoll_fd is valid and owned by this scope. 
+ let _epoll_file = unsafe { File::from_raw_fd(epoll_fd) }; + + epoll::ctl( + epoll_fd, + epoll::ControlOptions::EPOLL_CTL_ADD, + stop_event.as_raw_fd(), + epoll::Event::new(epoll::Events::EPOLLIN, EVENT_STOP), + ) + .map_err(io::Error::other)?; + + epoll::ctl( + epoll_fd, + epoll::ControlOptions::EPOLL_CTL_ADD, + uffd_raw_fd, + epoll::Event::new(epoll::Events::EPOLLIN | epoll::Events::EPOLLHUP, EVENT_UFFD), + ) + .map_err(io::Error::other)?; + + ready_tx.send(()).ok(); + + let mut events = vec![epoll::Event::new(epoll::Events::empty(), 0); 2]; + loop { + let num_events = match epoll::wait(epoll_fd, -1, &mut events) { + Ok(n) => n, + Err(e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => return Err(e), + }; + + let mut got_uffd_data = false; + for event in events.iter().take(num_events) { + let token = event.data; + let evt_flags = event.events; + + if token == EVENT_STOP { + stop_event.read().ok(); + info!("UFFD handler: received stop event, exiting"); + return Ok(()); + } + + if token == EVENT_UFFD + && (evt_flags & epoll::Events::EPOLLHUP.bits()) != 0 + && (evt_flags & epoll::Events::EPOLLIN.bits()) == 0 + { + info!("UFFD handler: fd closed (EPOLLHUP), exiting"); + return Ok(()); + } + + if token == EVENT_UFFD && (evt_flags & epoll::Events::EPOLLIN.bits()) != 0 { + got_uffd_data = true; + } + } + + if !got_uffd_data { + continue; + } + + // SAFETY: UffdMsg is a plain repr(C) struct, safe to zero-init. + let mut msg: uffd::UffdMsg = unsafe { std::mem::zeroed() }; + // SAFETY: reading a uffd_msg-sized struct from the valid uffd fd. 
+ let n = unsafe { + libc::read( + uffd_raw_fd, + &mut msg as *mut uffd::UffdMsg as *mut libc::c_void, + std::mem::size_of::(), + ) + }; + if n < 0 { + let err = io::Error::last_os_error(); + if err.kind() == io::ErrorKind::WouldBlock { + continue; + } + return Err(err); + } + if n == 0 { + info!("UFFD handler: EOF on fd, exiting"); + return Ok(()); + } + if n as usize != std::mem::size_of::() { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "Short read from userfaultfd", + )); + } + + if msg.event != crate::userfaultfd::UFFD_EVENT_PAGEFAULT { + continue; + } + + let fault_addr = msg.pf_address; + + let mut served = false; + for range in ranges { + // Round down to the page boundary containing the faulted address. + let page_addr = fault_addr & !(range.page_size - 1); + if page_addr >= range.host_addr && page_addr < range.host_addr + range.length { + let offset_in_range = page_addr - range.host_addr; + let file_pos = range.file_offset + offset_in_range; + + snapshot_file.seek(SeekFrom::Start(file_pos))?; + snapshot_file.read_exact(&mut page_buf[..range.page_size as usize])?; + + loop { + match uffd::copy( + uffd_fd.as_fd(), + page_addr, + page_buf.as_ptr(), + range.page_size, + ) { + Ok(()) => { + pages_served += 1; + break; + } + Err(e) if e.raw_os_error() == Some(libc::EEXIST) => { + if let Err(e) = + uffd::wake(uffd_fd.as_fd(), page_addr, range.page_size) + { + warn!("UFFDIO_WAKE failed at {page_addr:#x}: {e}"); + } + break; + } + Err(e) if e.raw_os_error() == Some(libc::EAGAIN) => { + // The kernel can report a transient EAGAIN while the fault + // is being resolved; yield and retry instead of aborting restore. 
+ thread::yield_now(); + } + Err(e) => return Err(e), + } + } + served = true; + break; + } + } + + if !served { + return Err(io::Error::other(format!( + "UFFD handler: fault at {fault_addr:#x} does not belong to any registered range", + ))); + } + + if pages_served == total_pages { + info!("UFFD handler: all {pages_served} pages served, exiting"); + return Ok(()); + } + } + } + fn validate_memory_config( config: &MemoryConfig, user_provided_zones: bool, @@ -1227,6 +1641,7 @@ impl MemoryManager { snapshot_memory_ranges: MemoryRangeTable::default(), memory_zones, guest_ram_mappings: Vec::new(), + uffd_handler: None, acpi_address, log_dirty: dynamic, // Cannot log dirty pages on a TD arch_mem_regions, @@ -1240,13 +1655,16 @@ impl MemoryManager { Ok(Arc::new(Mutex::new(memory_manager))) } + #[allow(clippy::too_many_arguments)] pub fn new_from_snapshot( snapshot: &Snapshot, vm: Arc, config: &MemoryConfig, source_url: Option<&str>, prefault: bool, + memory_restore_mode: MemoryRestoreMode, phys_bits: u8, + exit_evt: &EventFd, ) -> Result>, Error> { if let Some(source_url) = source_url { let mut memory_file_path = url_to_path(source_url).map_err(Error::Restore)?; @@ -1266,9 +1684,17 @@ impl MemoryManager { Default::default(), )?; - mm.lock() - .unwrap() - .fill_saved_regions(memory_file_path, &mem_snapshot.memory_ranges)?; + if memory_restore_mode == MemoryRestoreMode::OnDemand { + mm.lock().unwrap().restore_by_uffd( + &memory_file_path, + &mem_snapshot.memory_ranges, + exit_evt, + )?; + } else { + mm.lock() + .unwrap() + .fill_saved_regions(memory_file_path, &mem_snapshot.memory_ranges)?; + } Ok(mm) } else { @@ -2530,6 +2956,12 @@ impl Aml for MemoryManager { impl Pausable for MemoryManager {} +impl Drop for MemoryManager { + fn drop(&mut self) { + self.stop_uffd_handler(); + } +} + #[derive(Clone, Serialize, Deserialize)] pub struct MemoryManagerSnapshotData { memory_ranges: MemoryRangeTable, diff --git a/vmm/src/seccomp_filters.rs b/vmm/src/seccomp_filters.rs index 
bb40b99f60..938e7832f3 100644 --- a/vmm/src/seccomp_filters.rs +++ b/vmm/src/seccomp_filters.rs @@ -25,6 +25,8 @@ use vhost::vhost_kern::vhost_binding::{ VHOST_VDPA_SET_STATUS, VHOST_VDPA_SET_VRING_ENABLE, VHOST_VDPA_SUSPEND, }; +use crate::userfaultfd::{UFFDIO_API, UFFDIO_COPY, UFFDIO_REGISTER, UFFDIO_WAKE}; + #[derive(Copy, Clone)] pub enum Thread { HttpApi, @@ -362,6 +364,10 @@ fn create_vmm_ioctl_seccomp_rule_common( VHOST_VDPA_GET_CONFIG_SIZE() )?], and![Cond::new(1, ArgLen::Dword, Eq, VHOST_VDPA_SUSPEND())?], + and![Cond::new(1, ArgLen::Dword, Eq, UFFDIO_API)?], + and![Cond::new(1, ArgLen::Dword, Eq, UFFDIO_COPY)?], + and![Cond::new(1, ArgLen::Dword, Eq, UFFDIO_REGISTER)?], + and![Cond::new(1, ArgLen::Dword, Eq, UFFDIO_WAKE)?], ]; let hypervisor_rules = create_vmm_ioctl_seccomp_rule_hypervisor(hypervisor_type)?; @@ -691,6 +697,7 @@ fn vmm_thread_rules( (libc::SYS_unlink, vec![]), #[cfg(target_arch = "aarch64")] (libc::SYS_unlinkat, vec![]), + (libc::SYS_userfaultfd, vec![]), (libc::SYS_wait4, vec![]), (libc::SYS_write, vec![]), (libc::SYS_writev, vec![]), diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index a094803ee4..710847fab3 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -76,7 +76,7 @@ use vm_migration::{ use vmm_sys_util::eventfd::EventFd; use vmm_sys_util::sock_ctrl_msg::ScmSocket; -use crate::config::{ValidationError, add_to_config}; +use crate::config::{MemoryRestoreMode, ValidationError, add_to_config}; use crate::console_devices::{ConsoleDeviceError, ConsoleInfo}; #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] use crate::coredump::{ @@ -1265,6 +1265,7 @@ impl Vm { snapshot: Option<&Snapshot>, source_url: Option<&str>, prefault: Option, + memory_restore_mode: Option, ) -> Result { trace_scoped!("Vm::new"); @@ -1300,8 +1301,10 @@ impl Vm { vm.clone(), &vm_config.lock().unwrap().memory.clone(), source_url, - prefault.unwrap(), + prefault.unwrap_or(false), + memory_restore_mode.unwrap_or_default(), phys_bits, + &exit_evt, ) 
.map_err(Error::MemoryManager)? } else { From ec389c4faed4281687698dc5311308cd91385d46 Mon Sep 17 00:00:00 2001 From: Shayon Mukherjee Date: Fri, 13 Mar 2026 08:16:33 -0700 Subject: [PATCH 146/742] tests: add integration tests for on-demand snapshot restore Add UFFD restore tests to common_sequential: basic anonymous RAM, shared memory, and hugepage-backed zone memory. Each exercises the full snapshot/restore cycle with memory_restore_mode=ondemand and verifies CPU count, memory size, and device health after resume. Signed-off-by: Shayon Mukherjee --- cloud-hypervisor/tests/integration.rs | 168 ++++++++++++++++++++++++++ 1 file changed, 168 insertions(+) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index bbc1999335..f0d5f6f252 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -11025,6 +11025,174 @@ mod common_sequential { handle_child_output(r, &output); } + #[test] + #[cfg(not(feature = "mshv"))] + fn test_snapshot_restore_uffd() { + _test_snapshot_restore_uffd("size=2G", &[], 1_920_000); + } + + #[test] + #[cfg(not(feature = "mshv"))] + fn test_snapshot_restore_uffd_shared_memory() { + _test_snapshot_restore_uffd("size=512M,shared=on", &[], 480_000); + } + + #[test] + #[cfg(not(feature = "mshv"))] + fn test_snapshot_restore_uffd_hugepage_zone() { + if !exec_host_command_status( + "grep -q '^Hugepagesize:[[:space:]]*2048 kB' /proc/meminfo && test $(awk '/HugePages_Free/ {print $2}' /proc/meminfo) -ge 256", + ) + .success() + { + println!("SKIPPED: not enough free 2MiB hugepages for UFFD restore test"); + return; + } + + _test_snapshot_restore_uffd( + "size=0", + &["id=mem0,size=512M,hugepages=on,hugepage_size=2M"], + 480_000, + ); + } + + fn _test_snapshot_restore_uffd( + memory_config: &str, + memory_zone_config: &[&str], + min_total_memory_kib: u32, + ) { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = 
Guest::new(Box::new(disk_config)); + let kernel_path = direct_kernel_boot_path(); + + let api_socket_source = format!("{}.1", temp_api_path(&guest.tmp_dir)); + + let console_text = String::from("On a branch floating down river a cricket, singing."); + let snapshot_dir = temp_snapshot_dir_path(&guest.tmp_dir); + let socket = temp_vsock_path(&guest.tmp_dir); + let event_path = temp_event_monitor_path(&guest.tmp_dir); + + let mut source_cmd = GuestCommand::new(&guest); + source_cmd + .args(["--api-socket", &api_socket_source]) + .args(["--event-monitor", format!("path={event_path}").as_str()]) + .args(["--cpus", "boot=4"]) + .args(["--memory", memory_config]); + + if !memory_zone_config.is_empty() { + source_cmd.args(["--memory-zone"]).args(memory_zone_config); + } + + let mut child = source_cmd + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .default_disks() + .default_net() + .args(["--vsock", format!("cid=3,socket={socket}").as_str()]) + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + assert_eq!(guest.get_cpu_count().unwrap_or_default(), 4); + assert!(guest.get_total_memory().unwrap_or_default() > min_total_memory_kib); + + guest.check_devices_common(Some(&socket), Some(&console_text), None); + + snapshot_and_check_events(&api_socket_source, &snapshot_dir, &event_path); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + handle_child_output(r, &output); + + let r = std::panic::catch_unwind(|| { + assert!(String::from_utf8_lossy(&output.stdout).contains(&console_text)); + }); + handle_child_output(r, &output); + + Command::new("rm") + .arg("-f") + .arg(socket.as_str()) + .output() + .unwrap(); + + let api_socket_restored = format!("{}.2", temp_api_path(&guest.tmp_dir)); + let event_path_restored = format!("{}.2", temp_event_monitor_path(&guest.tmp_dir)); + + let mut child = GuestCommand::new(&guest) + 
.args(["--api-socket", &api_socket_restored]) + .args([ + "--event-monitor", + format!("path={event_path_restored}").as_str(), + ]) + .args([ + "--restore", + format!("source_url=file://{snapshot_dir},memory_restore_mode=ondemand").as_str(), + ]) + .capture_output() + .spawn() + .unwrap(); + + thread::sleep(std::time::Duration::new(20, 0)); + + let latest_events = [&MetaEvent { + event: "restored".to_string(), + device_id: None, + }]; + assert!(check_latest_events_exact( + &latest_events, + &event_path_restored + )); + + let r = std::panic::catch_unwind(|| { + assert!(remote_command(&api_socket_restored, "resume", None)); + thread::sleep(std::time::Duration::new(1, 0)); + let latest_events = [ + &MetaEvent { + event: "resuming".to_string(), + device_id: None, + }, + &MetaEvent { + event: "resumed".to_string(), + device_id: None, + }, + ]; + assert!(check_latest_events_exact( + &latest_events, + &event_path_restored + )); + + assert_eq!(guest.get_cpu_count().unwrap_or_default(), 4); + assert!(guest.get_total_memory().unwrap_or_default() > min_total_memory_kib); + + guest.check_devices_common(Some(&socket), Some(&console_text), None); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + handle_child_output(r, &output); + + let r = std::panic::catch_unwind(|| { + assert!(String::from_utf8_lossy(&output.stdout).contains(&console_text)); + + let logs = format!( + "{}\n{}", + String::from_utf8_lossy(&output.stdout), + String::from_utf8_lossy(&output.stderr) + ); + assert!( + logs.contains("UFFD restore: demand-paged restore enabled"), + "Expected UFFD restore path to be enabled. 
output: {logs}" + ); + }); + handle_child_output(r, &output); + + let _ = remove_dir_all(snapshot_dir.as_str()); + } + #[test] #[cfg(not(feature = "mshv"))] // See issue #7437 #[ignore = "See #6970"] From ab8169c855553ee7609ac29f37d4349a590ec6ad Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Fri, 13 Mar 2026 13:11:44 -0700 Subject: [PATCH 147/742] build: Bump timeout on integration tests We now have more tests and are hitting up against the timeout Signed-off-by: Rob Bradford --- .github/workflows/integration-x86-64.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration-x86-64.yaml b/.github/workflows/integration-x86-64.yaml index 9334b242ea..b620954d04 100644 --- a/.github/workflows/integration-x86-64.yaml +++ b/.github/workflows/integration-x86-64.yaml @@ -6,7 +6,7 @@ concurrency: jobs: build: - timeout-minutes: 60 + timeout-minutes: 80 strategy: fail-fast: false matrix: @@ -41,7 +41,7 @@ jobs: run: sudo modprobe openvswitch - name: Run integration tests if: ${{ github.event_name != 'pull_request' || (matrix.runner == 'garm-jammy' && matrix.libc == 'gnu') }} - timeout-minutes: 40 + timeout-minutes: 60 run: scripts/dev_cli.sh tests --integration --libc ${{ matrix.libc }} - name: Run live-migration integration tests if: ${{ github.event_name != 'pull_request' || (matrix.runner == 'garm-jammy' && matrix.libc == 'gnu') }} From f77c6ef78bdca486eec0e6d81033af7082f9997d Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Thu, 12 Mar 2026 09:50:37 -0700 Subject: [PATCH 148/742] virtio-devices: introduce ActivationContext for device activation Signed-off-by: Peter Oskolkov --- fuzz/fuzz_targets/balloon.rs | 10 +++--- fuzz/fuzz_targets/block.rs | 10 +++--- fuzz/fuzz_targets/console.rs | 10 +++--- fuzz/fuzz_targets/iommu.rs | 10 +++--- fuzz/fuzz_targets/mem.rs | 10 +++--- fuzz/fuzz_targets/net.rs | 10 +++--- fuzz/fuzz_targets/pmem.rs | 10 +++--- fuzz/fuzz_targets/rng.rs | 10 +++--- fuzz/fuzz_targets/vsock.rs | 10 +++--- 
fuzz/fuzz_targets/watchdog.rs | 10 +++--- virtio-devices/src/balloon.rs | 13 ++++---- virtio-devices/src/block.rs | 12 +++---- virtio-devices/src/console.rs | 13 ++++---- virtio-devices/src/device.rs | 13 ++++---- virtio-devices/src/iommu.rs | 13 ++++---- virtio-devices/src/lib.rs | 4 +-- virtio-devices/src/mem.rs | 13 ++++---- virtio-devices/src/net.rs | 12 +++---- virtio-devices/src/pmem.rs | 13 ++++---- virtio-devices/src/rng.rs | 13 ++++---- .../src/transport/pci_common_config.rs | 12 ++----- virtio-devices/src/transport/pci_device.rs | 13 ++++---- virtio-devices/src/vdpa.rs | 13 ++++---- virtio-devices/src/vhost_user/blk.rs | 14 ++++----- virtio-devices/src/vhost_user/fs.rs | 14 ++++----- .../src/vhost_user/generic_vhost_user.rs | 14 ++++----- virtio-devices/src/vhost_user/net.rs | 15 ++++----- virtio-devices/src/vsock/device.rs | 31 ++++++++++--------- virtio-devices/src/watchdog.rs | 13 ++++---- vmm/src/device_manager.rs | 2 +- 30 files changed, 183 insertions(+), 177 deletions(-) diff --git a/fuzz/fuzz_targets/balloon.rs b/fuzz/fuzz_targets/balloon.rs index b745cf6187..edb4f0cf1a 100644 --- a/fuzz/fuzz_targets/balloon.rs +++ b/fuzz/fuzz_targets/balloon.rs @@ -95,15 +95,15 @@ fuzz_target!(|bytes: &[u8]| -> Corpus { reporting_queue_evt.write(1).unwrap(); balloon - .activate( - guest_memory, - Arc::new(NoopVirtioInterrupt {}), - vec![ + .activate(virtio_devices::ActivationContext { + mem: guest_memory, + interrupt_cb: Arc::new(NoopVirtioInterrupt {}), + queues: vec![ (0, inflate_q, inflate_evt), (1, deflate_q, deflate_evt), (2, reporting_q, reporting_evt), ], - ) + }) .ok(); // Wait for the events to finish and balloon device worker thread to return diff --git a/fuzz/fuzz_targets/block.rs b/fuzz/fuzz_targets/block.rs index 0ad9193fdb..19b875f044 100644 --- a/fuzz/fuzz_targets/block.rs +++ b/fuzz/fuzz_targets/block.rs @@ -91,11 +91,11 @@ fuzz_target!(|bytes: &[u8]| -> Corpus { queue_evt.write(1).unwrap(); block - .activate( - guest_memory, - 
Arc::new(NoopVirtioInterrupt {}), - vec![(0, q, evt)], - ) + .activate(virtio_devices::ActivationContext { + mem: guest_memory, + interrupt_cb: Arc::new(NoopVirtioInterrupt {}), + queues: vec![(0, q, evt)], + }) .ok(); // Wait for the events to finish and block device worker thread to return diff --git a/fuzz/fuzz_targets/console.rs b/fuzz/fuzz_targets/console.rs index 4b3a49df91..7f5fafaebc 100644 --- a/fuzz/fuzz_targets/console.rs +++ b/fuzz/fuzz_targets/console.rs @@ -128,11 +128,11 @@ fuzz_target!(|bytes: &[u8]| -> Corpus { pipe_tx.write_all(console_input_bytes).unwrap(); // To use fuzzed data; console - .activate( - guest_memory, - Arc::new(NoopVirtioInterrupt {}), - vec![(0, input_queue, input_evt), (1, output_queue, output_evt)], - ) + .activate(virtio_devices::ActivationContext { + mem: guest_memory, + interrupt_cb: Arc::new(NoopVirtioInterrupt {}), + queues: vec![(0, input_queue, input_evt), (1, output_queue, output_evt)], + }) .unwrap(); // Wait for the events to finish and console device worker thread to return diff --git a/fuzz/fuzz_targets/iommu.rs b/fuzz/fuzz_targets/iommu.rs index 8c9f26b262..791ab6b000 100644 --- a/fuzz/fuzz_targets/iommu.rs +++ b/fuzz/fuzz_targets/iommu.rs @@ -107,14 +107,14 @@ fuzz_target!(|bytes: &[u8]| -> Corpus { request_queue_evt.write(1).unwrap(); iommu - .activate( - guest_memory, - Arc::new(NoopVirtioInterrupt {}), - vec![ + .activate(virtio_devices::ActivationContext { + mem: guest_memory, + interrupt_cb: Arc::new(NoopVirtioInterrupt {}), + queues: vec![ (0, request_queue, request_evt), (0, _event_queue, _event_evt), ], - ) + }) .ok(); // Wait for the events to finish and vIOMMU device worker thread to return diff --git a/fuzz/fuzz_targets/mem.rs b/fuzz/fuzz_targets/mem.rs index 57fc9a91dd..46627e9315 100644 --- a/fuzz/fuzz_targets/mem.rs +++ b/fuzz/fuzz_targets/mem.rs @@ -105,11 +105,11 @@ fuzz_target!(|bytes: &[u8]| -> Corpus { queue_evt.write(1).unwrap(); virtio_mem - .activate( - guest_memory, - 
Arc::new(NoopVirtioInterrupt {}), - vec![(0, q, evt)], - ) + .activate(virtio_devices::ActivationContext { + mem: guest_memory, + interrupt_cb: Arc::new(NoopVirtioInterrupt {}), + queues: vec![(0, q, evt)], + }) .ok(); // Wait for the events to finish and virtio-mem device worker thread to return diff --git a/fuzz/fuzz_targets/net.rs b/fuzz/fuzz_targets/net.rs index 30968d2a47..0af835ac06 100644 --- a/fuzz/fuzz_targets/net.rs +++ b/fuzz/fuzz_targets/net.rs @@ -143,11 +143,11 @@ fuzz_target!(|bytes: &[u8]| -> Corpus { input_queue_evt.write(1).unwrap(); output_queue_evt.write(1).unwrap(); - net.activate( - guest_memory, - Arc::new(NoopVirtioInterrupt {}), - vec![(0, input_queue, input_evt), (1, output_queue, output_evt)], - ) + net.activate(virtio_devices::ActivationContext { + mem: guest_memory, + interrupt_cb: Arc::new(NoopVirtioInterrupt {}), + queues: vec![(0, input_queue, input_evt), (1, output_queue, output_evt)], + }) .unwrap(); // Wait for the events to finish and net device worker thread to return diff --git a/fuzz/fuzz_targets/pmem.rs b/fuzz/fuzz_targets/pmem.rs index a8fcb7a774..b42c20daea 100644 --- a/fuzz/fuzz_targets/pmem.rs +++ b/fuzz/fuzz_targets/pmem.rs @@ -61,11 +61,11 @@ fuzz_target!(|bytes: &[u8]| -> Corpus { // Kick the 'queue' event before activate the pmem device queue_evt.write(1).unwrap(); - pmem.activate( - guest_memory, - Arc::new(NoopVirtioInterrupt {}), - vec![(0, q, evt)], - ) + pmem.activate(virtio_devices::ActivationContext { + mem: guest_memory, + interrupt_cb: Arc::new(NoopVirtioInterrupt {}), + queues: vec![(0, q, evt)], + }) .ok(); // Wait for the events to finish and pmem device worker thread to return diff --git a/fuzz/fuzz_targets/rng.rs b/fuzz/fuzz_targets/rng.rs index 8d5ffe35b3..d9cd11f099 100644 --- a/fuzz/fuzz_targets/rng.rs +++ b/fuzz/fuzz_targets/rng.rs @@ -99,11 +99,11 @@ fuzz_target!(|bytes: &[u8]| -> Corpus { // Kick the 'queue' event before activate the rng device queue_evt.write(1).unwrap(); - rng.activate( - 
guest_memory, - Arc::new(NoopVirtioInterrupt {}), - vec![(0, q, evt)], - ) + rng.activate(virtio_devices::ActivationContext { + mem: guest_memory, + interrupt_cb: Arc::new(NoopVirtioInterrupt {}), + queues: vec![(0, q, evt)], + }) .ok(); // Wait for the events to finish and rng device worker thread to return diff --git a/fuzz/fuzz_targets/vsock.rs b/fuzz/fuzz_targets/vsock.rs index 144b8b4057..72bdeb4d63 100644 --- a/fuzz/fuzz_targets/vsock.rs +++ b/fuzz/fuzz_targets/vsock.rs @@ -108,11 +108,11 @@ fuzz_target!(|bytes: &[u8]| -> Corpus { .unwrap(); vsock - .activate( - guest_memory, - Arc::new(NoopVirtioInterrupt {}), - vec![(0, q, evt)], - ) + .activate(virtio_devices::ActivationContext { + mem: guest_memory, + interrupt_cb: Arc::new(NoopVirtioInterrupt {}), + queues: vec![(0, q, evt)], + }) .ok(); // Wait for the events to finish and vsock device worker thread to return diff --git a/fuzz/fuzz_targets/watchdog.rs b/fuzz/fuzz_targets/watchdog.rs index f203a228f9..8736f8af3f 100644 --- a/fuzz/fuzz_targets/watchdog.rs +++ b/fuzz/fuzz_targets/watchdog.rs @@ -64,11 +64,11 @@ fuzz_target!(|bytes: &[u8]| -> Corpus { queue_evt.write(1).unwrap(); watchdog - .activate( - guest_memory, - Arc::new(NoopVirtioInterrupt {}), - vec![(0, q, evt)], - ) + .activate(virtio_devices::ActivationContext { + mem: guest_memory, + interrupt_cb: Arc::new(NoopVirtioInterrupt {}), + queues: vec![(0, q, evt)], + }) .ok(); // Wait for the events to finish and watchdog device worker thread to return diff --git a/virtio-devices/src/balloon.rs b/virtio-devices/src/balloon.rs index 3db6832617..bb9c46cbc8 100644 --- a/virtio-devices/src/balloon.rs +++ b/virtio-devices/src/balloon.rs @@ -590,12 +590,13 @@ impl VirtioDevice for Balloon { } } - fn activate( - &mut self, - mem: GuestMemoryAtomic, - interrupt_cb: Arc, - mut queues: Vec<(usize, Queue, EventFd)>, - ) -> ActivateResult { + fn activate(&mut self, context: crate::device::ActivationContext) -> ActivateResult { + let 
crate::device::ActivationContext { + mem, + interrupt_cb, + mut queues, + .. + } = context; self.common.activate(&queues, interrupt_cb.clone())?; let (kill_evt, pause_evt) = self.common.dup_eventfds(); diff --git a/virtio-devices/src/block.rs b/virtio-devices/src/block.rs index 22b8ef31e6..82eca61827 100644 --- a/virtio-devices/src/block.rs +++ b/virtio-devices/src/block.rs @@ -1008,12 +1008,12 @@ impl VirtioDevice for Block { self.update_writeback(); } - fn activate( - &mut self, - mem: GuestMemoryAtomic, - interrupt_cb: Arc, - mut queues: Vec<(usize, Queue, EventFd)>, - ) -> ActivateResult { + fn activate(&mut self, context: crate::device::ActivationContext) -> ActivateResult { + let crate::device::ActivationContext { + mem, + interrupt_cb, + mut queues, + } = context; // See if the guest didn't ack the device being read-only. // If so, warn and pretend it did. let original_acked_features = self.common.acked_features; diff --git a/virtio-devices/src/console.rs b/virtio-devices/src/console.rs index ab907db20f..74b42c46b5 100644 --- a/virtio-devices/src/console.rs +++ b/virtio-devices/src/console.rs @@ -710,12 +710,13 @@ impl VirtioDevice for Console { self.read_config_from_slice(self.config.lock().unwrap().as_slice(), offset, data); } - fn activate( - &mut self, - mem: GuestMemoryAtomic, - interrupt_cb: Arc, - mut queues: Vec<(usize, Queue, EventFd)>, - ) -> ActivateResult { + fn activate(&mut self, context: crate::device::ActivationContext) -> ActivateResult { + let crate::device::ActivationContext { + mem, + interrupt_cb, + mut queues, + .. 
+ } = context; self.common.activate(&queues, interrupt_cb.clone())?; self.resizer .acked_features diff --git a/virtio-devices/src/device.rs b/virtio-devices/src/device.rs index f0ed28f517..d5e873bf42 100644 --- a/virtio-devices/src/device.rs +++ b/virtio-devices/src/device.rs @@ -53,6 +53,12 @@ pub struct VirtioSharedMemoryList { pub region_list: Vec, } +pub struct ActivationContext { + pub mem: GuestMemoryAtomic, + pub interrupt_cb: Arc, + pub queues: Vec<(usize, Queue, EventFd)>, +} + /// Trait for virtio devices to be driven by a virtio transport. /// /// The lifecycle of a virtio device is to be moved to a virtio transport, which will then query the @@ -94,12 +100,7 @@ pub trait VirtioDevice: Send { } /// Activates this device for real usage. - fn activate( - &mut self, - mem: GuestMemoryAtomic, - interrupt_evt: Arc, - queues: Vec<(usize, Queue, EventFd)>, - ) -> ActivateResult; + fn activate(&mut self, context: ActivationContext) -> ActivateResult; /// Optionally deactivates this device and returns ownership of the guest memory map, interrupt /// event, and queue events. diff --git a/virtio-devices/src/iommu.rs b/virtio-devices/src/iommu.rs index 1097b6582e..513d510b56 100644 --- a/virtio-devices/src/iommu.rs +++ b/virtio-devices/src/iommu.rs @@ -1075,12 +1075,13 @@ impl VirtioDevice for Iommu { self.update_bypass(); } - fn activate( - &mut self, - mem: GuestMemoryAtomic, - interrupt_cb: Arc, - mut queues: Vec<(usize, Queue, EventFd)>, - ) -> ActivateResult { + fn activate(&mut self, context: crate::device::ActivationContext) -> ActivateResult { + let crate::device::ActivationContext { + mem, + interrupt_cb, + mut queues, + .. 
+ } = context; self.common.activate(&queues, interrupt_cb.clone())?; let (kill_evt, pause_evt) = self.common.dup_eventfds(); diff --git a/virtio-devices/src/lib.rs b/virtio-devices/src/lib.rs index 111c9007e1..14e565ccb8 100644 --- a/virtio-devices/src/lib.rs +++ b/virtio-devices/src/lib.rs @@ -42,8 +42,8 @@ pub use self::balloon::Balloon; pub use self::block::{Block, BlockState}; pub use self::console::{Console, ConsoleResizer, Endpoint}; pub use self::device::{ - DmaRemapping, VirtioCommon, VirtioDevice, VirtioInterrupt, VirtioInterruptType, - VirtioSharedMemoryList, + ActivationContext, DmaRemapping, VirtioCommon, VirtioDevice, VirtioInterrupt, + VirtioInterruptType, VirtioSharedMemoryList, }; pub use self::epoll_helper::{ EPOLL_HELPER_EVENT_LAST, EpollHelper, EpollHelperError, EpollHelperHandler, diff --git a/virtio-devices/src/mem.rs b/virtio-devices/src/mem.rs index 936fdbe42a..aed8ed48d2 100644 --- a/virtio-devices/src/mem.rs +++ b/virtio-devices/src/mem.rs @@ -950,12 +950,13 @@ impl VirtioDevice for Mem { self.read_config_from_slice(self.config.lock().unwrap().as_slice(), offset, data); } - fn activate( - &mut self, - mem: GuestMemoryAtomic, - interrupt_cb: Arc, - mut queues: Vec<(usize, Queue, EventFd)>, - ) -> ActivateResult { + fn activate(&mut self, context: crate::device::ActivationContext) -> ActivateResult { + let crate::device::ActivationContext { + mem, + interrupt_cb, + mut queues, + .. 
+ } = context; self.common.activate(&queues, interrupt_cb.clone())?; let (kill_evt, pause_evt) = self.common.dup_eventfds(); diff --git a/virtio-devices/src/net.rs b/virtio-devices/src/net.rs index 63b9de4116..2e3aa02eaf 100644 --- a/virtio-devices/src/net.rs +++ b/virtio-devices/src/net.rs @@ -693,12 +693,12 @@ impl VirtioDevice for Net { self.read_config_from_slice(self.config.as_slice(), offset, data); } - fn activate( - &mut self, - mem: GuestMemoryAtomic, - interrupt_cb: Arc, - mut queues: Vec<(usize, Queue, EventFd)>, - ) -> ActivateResult { + fn activate(&mut self, context: crate::device::ActivationContext) -> ActivateResult { + let crate::device::ActivationContext { + mem, + interrupt_cb, + mut queues, + } = context; self.common.activate(&queues, interrupt_cb.clone())?; let num_queues = queues.len(); diff --git a/virtio-devices/src/pmem.rs b/virtio-devices/src/pmem.rs index b7be248fa4..fd995747c2 100644 --- a/virtio-devices/src/pmem.rs +++ b/virtio-devices/src/pmem.rs @@ -377,12 +377,13 @@ impl VirtioDevice for Pmem { self.read_config_from_slice(self.config.as_slice(), offset, data); } - fn activate( - &mut self, - mem: GuestMemoryAtomic, - interrupt_cb: Arc, - mut queues: Vec<(usize, Queue, EventFd)>, - ) -> ActivateResult { + fn activate(&mut self, context: crate::device::ActivationContext) -> ActivateResult { + let crate::device::ActivationContext { + mem, + interrupt_cb, + mut queues, + .. 
+ } = context; self.common.activate(&queues, interrupt_cb.clone())?; let (kill_evt, pause_evt) = self.common.dup_eventfds(); if let Some(disk) = self.disk.as_ref() { diff --git a/virtio-devices/src/rng.rs b/virtio-devices/src/rng.rs index 2409e272c1..16a539f923 100644 --- a/virtio-devices/src/rng.rs +++ b/virtio-devices/src/rng.rs @@ -244,12 +244,13 @@ impl VirtioDevice for Rng { self.common.ack_features(value); } - fn activate( - &mut self, - mem: GuestMemoryAtomic, - interrupt_cb: Arc, - mut queues: Vec<(usize, Queue, EventFd)>, - ) -> ActivateResult { + fn activate(&mut self, context: crate::device::ActivationContext) -> ActivateResult { + let crate::device::ActivationContext { + mem, + interrupt_cb, + mut queues, + .. + } = context; self.common.activate(&queues, interrupt_cb.clone())?; let (kill_evt, pause_evt) = self.common.dup_eventfds(); diff --git a/virtio-devices/src/transport/pci_common_config.rs b/virtio-devices/src/transport/pci_common_config.rs index 5a7b5f57a4..68ed01c60a 100644 --- a/virtio-devices/src/transport/pci_common_config.rs +++ b/virtio-devices/src/transport/pci_common_config.rs @@ -404,11 +404,8 @@ impl Snapshottable for VirtioPciCommonConfig { #[cfg(test)] mod unit_tests { - use vm_memory::GuestMemoryAtomic; - use vmm_sys_util::eventfd::EventFd; - use super::*; - use crate::{ActivateResult, GuestMemoryMmap, VirtioInterrupt}; + use crate::{ActivateResult, ActivationContext}; struct DummyDevice(u32); const QUEUE_SIZE: u16 = 256; @@ -421,12 +418,7 @@ mod unit_tests { fn queue_max_sizes(&self) -> &[u16] { QUEUE_SIZES } - fn activate( - &mut self, - _mem: GuestMemoryAtomic, - _interrupt_evt: Arc, - _queues: Vec<(usize, Queue, EventFd)>, - ) -> ActivateResult { + fn activate(&mut self, _context: ActivationContext) -> ActivateResult { Ok(()) } diff --git a/virtio-devices/src/transport/pci_device.rs b/virtio-devices/src/transport/pci_device.rs index bf1d169c83..26440ddf2e 100644 --- a/virtio-devices/src/transport/pci_device.rs +++ 
b/virtio-devices/src/transport/pci_device.rs @@ -309,12 +309,13 @@ pub struct VirtioPciDeviceActivator { } impl VirtioPciDeviceActivator { - pub fn activate(&mut self) -> ActivateResult { - self.device.lock().unwrap().activate( - self.memory.take().unwrap(), - self.interrupt.take().unwrap(), - self.queues.take().unwrap(), - )?; + pub fn activate(mut self) -> ActivateResult { + let mut locked_device = self.device.lock().unwrap(); + locked_device.activate(crate::device::ActivationContext { + mem: self.memory.take().unwrap(), + interrupt_cb: self.interrupt.take().unwrap(), + queues: self.queues.take().unwrap(), + })?; self.device_activated.store(true, Ordering::SeqCst); if let Some(barrier) = self.barrier.take() { diff --git a/virtio-devices/src/vdpa.rs b/virtio-devices/src/vdpa.rs index 9d20aac92e..4773ef55fa 100644 --- a/virtio-devices/src/vdpa.rs +++ b/virtio-devices/src/vdpa.rs @@ -428,12 +428,13 @@ impl VirtioDevice for Vdpa { } } - fn activate( - &mut self, - mem: GuestMemoryAtomic, - virtio_interrupt: Arc, - queues: Vec<(usize, Queue, EventFd)>, - ) -> ActivateResult { + fn activate(&mut self, context: crate::device::ActivationContext) -> ActivateResult { + let crate::device::ActivationContext { + mem, + interrupt_cb: virtio_interrupt, + queues, + .. 
+ } = context; self.activate_vdpa(&mem.memory(), virtio_interrupt.as_ref(), &queues) .map_err(ActivateError::ActivateVdpa)?; diff --git a/virtio-devices/src/vhost_user/blk.rs b/virtio-devices/src/vhost_user/blk.rs index 22896ba7c6..9125e79909 100644 --- a/virtio-devices/src/vhost_user/blk.rs +++ b/virtio-devices/src/vhost_user/blk.rs @@ -19,7 +19,6 @@ use virtio_bindings::virtio_blk::{ VIRTIO_BLK_F_GEOMETRY, VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_RO, VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_WRITE_ZEROES, }; -use virtio_queue::Queue; use vm_memory::{ByteValued, GuestMemoryAtomic}; use vm_migration::protocol::MemoryRangeTable; use vm_migration::{Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable}; @@ -279,12 +278,13 @@ impl VirtioDevice for Blk { } } - fn activate( - &mut self, - mem: GuestMemoryAtomic, - interrupt_cb: Arc, - queues: Vec<(usize, Queue, EventFd)>, - ) -> ActivateResult { + fn activate(&mut self, context: crate::device::ActivationContext) -> ActivateResult { + let crate::device::ActivationContext { + mem, + interrupt_cb, + queues, + .. 
+ } = context; self.common.activate(&queues, interrupt_cb.clone())?; self.guest_memory = Some(mem.clone()); diff --git a/virtio-devices/src/vhost_user/fs.rs b/virtio-devices/src/vhost_user/fs.rs index 434454fcef..fb21105c8a 100644 --- a/virtio-devices/src/vhost_user/fs.rs +++ b/virtio-devices/src/vhost_user/fs.rs @@ -12,7 +12,6 @@ use serde::{Deserialize, Serialize}; use serde_with::{Bytes, serde_as}; use vhost::vhost_user::message::{VhostUserProtocolFeatures, VhostUserVirtioFeatures}; use vhost::vhost_user::{FrontendReqHandler, VhostUserFrontend, VhostUserFrontendReqHandler}; -use virtio_queue::Queue; use vm_device::UserspaceMapping; use vm_memory::{ByteValued, GuestMemoryAtomic}; use vm_migration::protocol::MemoryRangeTable; @@ -261,12 +260,13 @@ impl VirtioDevice for Fs { self.read_config_from_slice(self.config.as_slice(), offset, data); } - fn activate( - &mut self, - mem: GuestMemoryAtomic, - interrupt_cb: Arc, - queues: Vec<(usize, Queue, EventFd)>, - ) -> ActivateResult { + fn activate(&mut self, context: crate::device::ActivationContext) -> ActivateResult { + let crate::device::ActivationContext { + mem, + interrupt_cb, + queues, + .. 
+ } = context; self.common.activate(&queues, interrupt_cb.clone())?; self.guest_memory = Some(mem.clone()); diff --git a/virtio-devices/src/vhost_user/generic_vhost_user.rs b/virtio-devices/src/vhost_user/generic_vhost_user.rs index d38eee3a92..b8af44d75d 100644 --- a/virtio-devices/src/vhost_user/generic_vhost_user.rs +++ b/virtio-devices/src/vhost_user/generic_vhost_user.rs @@ -14,7 +14,6 @@ use vhost::vhost_user::message::{ VhostUserConfigFlags, VhostUserProtocolFeatures, VhostUserVirtioFeatures, }; use vhost::vhost_user::{FrontendReqHandler, VhostUserFrontend, VhostUserFrontendReqHandler}; -use virtio_queue::Queue; use vm_device::UserspaceMapping; use vm_memory::GuestMemoryAtomic; use vm_migration::protocol::MemoryRangeTable; @@ -277,12 +276,13 @@ impl VirtioDevice for GenericVhostUser { } } - fn activate( - &mut self, - mem: GuestMemoryAtomic, - interrupt_cb: Arc, - queues: Vec<(usize, Queue, EventFd)>, - ) -> ActivateResult { + fn activate(&mut self, context: crate::device::ActivationContext) -> ActivateResult { + let crate::device::ActivationContext { + mem, + interrupt_cb, + queues, + .. 
+ } = context; self.common.activate(&queues, interrupt_cb.clone())?; self.guest_memory = Some(mem.clone()); diff --git a/virtio-devices/src/vhost_user/net.rs b/virtio-devices/src/vhost_user/net.rs index 187d710e39..165115ae7e 100644 --- a/virtio-devices/src/vhost_user/net.rs +++ b/virtio-devices/src/vhost_user/net.rs @@ -19,7 +19,7 @@ use virtio_bindings::virtio_net::{ VIRTIO_NET_F_MAC, VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_MTU, }; use virtio_bindings::virtio_ring::VIRTIO_RING_F_EVENT_IDX; -use virtio_queue::{Queue, QueueT}; +use virtio_queue::QueueT; use vm_memory::{ByteValued, GuestMemoryAtomic}; use vm_migration::protocol::MemoryRangeTable; use vm_migration::{Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable}; @@ -288,12 +288,13 @@ impl VirtioDevice for Net { self.read_config_from_slice(self.config.as_slice(), offset, data); } - fn activate( - &mut self, - mem: GuestMemoryAtomic, - interrupt_cb: Arc, - mut queues: Vec<(usize, Queue, EventFd)>, - ) -> ActivateResult { + fn activate(&mut self, context: crate::device::ActivationContext) -> ActivateResult { + let crate::device::ActivationContext { + mem, + interrupt_cb, + mut queues, + .. + } = context; self.common.activate(&queues, interrupt_cb.clone())?; self.guest_memory = Some(mem.clone()); diff --git a/virtio-devices/src/vsock/device.rs b/virtio-devices/src/vsock/device.rs index aa86aa0c95..5b215c3597 100644 --- a/virtio-devices/src/vsock/device.rs +++ b/virtio-devices/src/vsock/device.rs @@ -435,12 +435,13 @@ where } } - fn activate( - &mut self, - mem: GuestMemoryAtomic, - interrupt_cb: Arc, - queues: Vec<(usize, Queue, EventFd)>, - ) -> ActivateResult { + fn activate(&mut self, context: crate::device::ActivationContext) -> ActivateResult { + let crate::device::ActivationContext { + mem, + interrupt_cb, + queues, + .. 
+ } = context; self.common.activate(&queues, interrupt_cb.clone())?; let (kill_evt, pause_evt) = self.common.dup_eventfds(); @@ -593,9 +594,11 @@ mod unit_tests { let memory = GuestMemoryAtomic::new(ctx.mem.clone()); // Test a bad activation. - let bad_activate = - ctx.device - .activate(memory.clone(), Arc::new(NoopVirtioInterrupt {}), Vec::new()); + let bad_activate = ctx.device.activate(crate::device::ActivationContext { + mem: memory.clone(), + interrupt_cb: Arc::new(NoopVirtioInterrupt {}), + queues: Vec::new(), + }); match bad_activate { Err(ActivateError::BadActivate) => (), other => panic!("{other:?}"), @@ -603,10 +606,10 @@ mod unit_tests { // Test a correct activation. ctx.device - .activate( - memory, - Arc::new(NoopVirtioInterrupt {}), - vec![ + .activate(crate::device::ActivationContext { + mem: memory, + interrupt_cb: Arc::new(NoopVirtioInterrupt {}), + queues: vec![ ( 0, Queue::new(256).unwrap(), @@ -623,7 +626,7 @@ mod unit_tests { EventFd::new(EFD_NONBLOCK).unwrap(), ), ], - ) + }) .unwrap(); } diff --git a/virtio-devices/src/watchdog.rs b/virtio-devices/src/watchdog.rs index 6b9f7cc0ac..742a2e0241 100644 --- a/virtio-devices/src/watchdog.rs +++ b/virtio-devices/src/watchdog.rs @@ -326,12 +326,13 @@ impl VirtioDevice for Watchdog { self.common.ack_features(value); } - fn activate( - &mut self, - mem: GuestMemoryAtomic, - interrupt_cb: Arc, - mut queues: Vec<(usize, Queue, EventFd)>, - ) -> ActivateResult { + fn activate(&mut self, context: crate::device::ActivationContext) -> ActivateResult { + let crate::device::ActivationContext { + mem, + interrupt_cb, + mut queues, + .. 
+ } = context; self.common.activate(&queues, interrupt_cb.clone())?; let (kill_evt, pause_evt) = self.common.dup_eventfds(); diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 04b9e3b44b..e560b02d7a 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -4543,7 +4543,7 @@ impl DeviceManager { } pub fn activate_virtio_devices(&self) -> DeviceManagerResult<()> { - for mut activator in self.pending_activations.lock().unwrap().drain(..) { + for activator in self.pending_activations.lock().unwrap().drain(..) { activator .activate() .map_err(DeviceManagerError::VirtioActivate)?; From 21bd3ae91661104a7c58662316b4538eac09fbc7 Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Thu, 12 Mar 2026 09:51:50 -0700 Subject: [PATCH 149/742] virtio-devices: switch driver_status to Arc Signed-off-by: Peter Oskolkov --- virtio-devices/src/transport/pci_common_config.rs | 14 +++++++------- virtio-devices/src/transport/pci_device.rs | 10 ++++++---- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/virtio-devices/src/transport/pci_common_config.rs b/virtio-devices/src/transport/pci_common_config.rs index 68ed01c60a..dcd65f7bc1 100644 --- a/virtio-devices/src/transport/pci_common_config.rs +++ b/virtio-devices/src/transport/pci_common_config.rs @@ -6,7 +6,7 @@ // // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause -use std::sync::atomic::{AtomicU16, Ordering}; +use std::sync::atomic::{AtomicU8, AtomicU16, Ordering}; use std::sync::{Arc, Mutex}; use byteorder::{ByteOrder, LittleEndian}; @@ -125,7 +125,7 @@ pub fn get_vring_size(t: VringType, queue_size: u16) -> u64 { /// le64 queue_used; // 0x30 // read-write pub struct VirtioPciCommonConfig { pub access_platform: Option>, - pub driver_status: u8, + pub driver_status: Arc, pub config_generation: u8, pub device_feature_select: u32, pub driver_feature_select: u32, @@ -141,7 +141,7 @@ impl VirtioPciCommonConfig { ) -> Self { VirtioPciCommonConfig { access_platform, - driver_status: 
state.driver_status, + driver_status: Arc::new(AtomicU8::new(state.driver_status)), config_generation: state.config_generation, device_feature_select: state.device_feature_select, driver_feature_select: state.driver_feature_select, @@ -153,7 +153,7 @@ impl VirtioPciCommonConfig { fn state(&self) -> VirtioPciCommonConfigState { VirtioPciCommonConfigState { - driver_status: self.driver_status, + driver_status: self.driver_status.load(Ordering::Acquire), config_generation: self.config_generation, device_feature_select: self.device_feature_select, driver_feature_select: self.driver_feature_select, @@ -223,7 +223,7 @@ impl VirtioPciCommonConfig { debug!("read_common_config_byte: offset 0x{offset:x}"); // The driver is only allowed to do aligned, properly sized access. match offset { - 0x14 => self.driver_status, + 0x14 => self.driver_status.load(Ordering::Acquire), 0x15 => self.config_generation, _ => { warn!("invalid virtio config byte read: 0x{offset:x}"); @@ -235,7 +235,7 @@ impl VirtioPciCommonConfig { fn write_common_config_byte(&mut self, offset: u64, value: u8) { debug!("write_common_config_byte: offset 0x{offset:x}"); match offset { - 0x14 => self.driver_status = value, + 0x14 => self.driver_status.store(value, Ordering::Release), _ => { warn!("invalid virtio config byte write: 0x{offset:x}"); } @@ -437,7 +437,7 @@ mod unit_tests { fn write_base_regs() { let mut regs = VirtioPciCommonConfig { access_platform: None, - driver_status: 0xaa, + driver_status: Arc::new(AtomicU8::new(0xaa)), config_generation: 0x55, device_feature_select: 0x0, driver_feature_select: 0x0, diff --git a/virtio-devices/src/transport/pci_device.rs b/virtio-devices/src/transport/pci_device.rs index 26440ddf2e..1a8d3da96e 100644 --- a/virtio-devices/src/transport/pci_device.rs +++ b/virtio-devices/src/transport/pci_device.rs @@ -660,13 +660,13 @@ impl VirtioPciDevice { fn is_driver_ready(&self) -> bool { let ready_bits = (DEVICE_ACKNOWLEDGE | DEVICE_DRIVER | DEVICE_DRIVER_OK | 
DEVICE_FEATURES_OK) as u8; - self.common_config.driver_status == ready_bits - && self.common_config.driver_status & DEVICE_FAILED as u8 == 0 + let driver_status = self.common_config.driver_status.load(Ordering::SeqCst); + driver_status == ready_bits && (driver_status & DEVICE_FAILED as u8) == 0 } /// Determines if the driver has requested the device (re)init / reset itself fn is_driver_init(&self) -> bool { - self.common_config.driver_status == DEVICE_INIT as u8 + self.common_config.driver_status.load(Ordering::SeqCst) == DEVICE_INIT as u8 } pub fn config_bar_addr(&self) -> u64 { @@ -1238,7 +1238,9 @@ impl PciDevice for VirtioPciDevice { self.common_config.queue_select = 0; } else { error!("Attempt to reset device when not implemented in underlying device"); - self.common_config.driver_status = crate::DEVICE_FAILED as u8; + self.common_config + .driver_status + .store(crate::DEVICE_FAILED as u8, Ordering::SeqCst); } } From b5053ae4dededfda406bc36c8c1956c4fc0ed704 Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Thu, 12 Mar 2026 09:55:03 -0700 Subject: [PATCH 150/742] virtio-devices: wire driver_status to EpollHandler Signed-off-by: Peter Oskolkov --- fuzz/fuzz_targets/balloon.rs | 1 + fuzz/fuzz_targets/block.rs | 1 + fuzz/fuzz_targets/console.rs | 1 + fuzz/fuzz_targets/iommu.rs | 1 + fuzz/fuzz_targets/mem.rs | 1 + fuzz/fuzz_targets/net.rs | 1 + fuzz/fuzz_targets/pmem.rs | 1 + fuzz/fuzz_targets/rng.rs | 1 + fuzz/fuzz_targets/vsock.rs | 1 + fuzz/fuzz_targets/watchdog.rs | 1 + virtio-devices/src/block.rs | 9 ++++++++- virtio-devices/src/device.rs | 3 ++- virtio-devices/src/net.rs | 9 ++++++++- virtio-devices/src/transport/pci_device.rs | 5 ++++- virtio-devices/src/vsock/device.rs | 2 ++ 15 files changed, 34 insertions(+), 4 deletions(-) diff --git a/fuzz/fuzz_targets/balloon.rs b/fuzz/fuzz_targets/balloon.rs index edb4f0cf1a..58b9b30582 100644 --- a/fuzz/fuzz_targets/balloon.rs +++ b/fuzz/fuzz_targets/balloon.rs @@ -103,6 +103,7 @@ fuzz_target!(|bytes: &[u8]| -> 
Corpus { (1, deflate_q, deflate_evt), (2, reporting_q, reporting_evt), ], + device_status: Arc::new(std::sync::atomic::AtomicU8::new(0)), }) .ok(); diff --git a/fuzz/fuzz_targets/block.rs b/fuzz/fuzz_targets/block.rs index 19b875f044..51007fe384 100644 --- a/fuzz/fuzz_targets/block.rs +++ b/fuzz/fuzz_targets/block.rs @@ -95,6 +95,7 @@ fuzz_target!(|bytes: &[u8]| -> Corpus { mem: guest_memory, interrupt_cb: Arc::new(NoopVirtioInterrupt {}), queues: vec![(0, q, evt)], + device_status: Arc::new(std::sync::atomic::AtomicU8::new(0)), }) .ok(); diff --git a/fuzz/fuzz_targets/console.rs b/fuzz/fuzz_targets/console.rs index 7f5fafaebc..e27331ed01 100644 --- a/fuzz/fuzz_targets/console.rs +++ b/fuzz/fuzz_targets/console.rs @@ -132,6 +132,7 @@ fuzz_target!(|bytes: &[u8]| -> Corpus { mem: guest_memory, interrupt_cb: Arc::new(NoopVirtioInterrupt {}), queues: vec![(0, input_queue, input_evt), (1, output_queue, output_evt)], + device_status: Arc::new(std::sync::atomic::AtomicU8::new(0)), }) .unwrap(); diff --git a/fuzz/fuzz_targets/iommu.rs b/fuzz/fuzz_targets/iommu.rs index 791ab6b000..a10640487f 100644 --- a/fuzz/fuzz_targets/iommu.rs +++ b/fuzz/fuzz_targets/iommu.rs @@ -114,6 +114,7 @@ fuzz_target!(|bytes: &[u8]| -> Corpus { (0, request_queue, request_evt), (0, _event_queue, _event_evt), ], + device_status: Arc::new(std::sync::atomic::AtomicU8::new(0)), }) .ok(); diff --git a/fuzz/fuzz_targets/mem.rs b/fuzz/fuzz_targets/mem.rs index 46627e9315..73ec11b025 100644 --- a/fuzz/fuzz_targets/mem.rs +++ b/fuzz/fuzz_targets/mem.rs @@ -109,6 +109,7 @@ fuzz_target!(|bytes: &[u8]| -> Corpus { mem: guest_memory, interrupt_cb: Arc::new(NoopVirtioInterrupt {}), queues: vec![(0, q, evt)], + device_status: Arc::new(std::sync::atomic::AtomicU8::new(0)), }) .ok(); diff --git a/fuzz/fuzz_targets/net.rs b/fuzz/fuzz_targets/net.rs index 0af835ac06..df9a1dce5a 100644 --- a/fuzz/fuzz_targets/net.rs +++ b/fuzz/fuzz_targets/net.rs @@ -147,6 +147,7 @@ fuzz_target!(|bytes: &[u8]| -> Corpus { mem: 
guest_memory, interrupt_cb: Arc::new(NoopVirtioInterrupt {}), queues: vec![(0, input_queue, input_evt), (1, output_queue, output_evt)], + device_status: Arc::new(std::sync::atomic::AtomicU8::new(0)), }) .unwrap(); diff --git a/fuzz/fuzz_targets/pmem.rs b/fuzz/fuzz_targets/pmem.rs index b42c20daea..0bd083a1c2 100644 --- a/fuzz/fuzz_targets/pmem.rs +++ b/fuzz/fuzz_targets/pmem.rs @@ -65,6 +65,7 @@ fuzz_target!(|bytes: &[u8]| -> Corpus { mem: guest_memory, interrupt_cb: Arc::new(NoopVirtioInterrupt {}), queues: vec![(0, q, evt)], + device_status: Arc::new(std::sync::atomic::AtomicU8::new(0)), }) .ok(); diff --git a/fuzz/fuzz_targets/rng.rs b/fuzz/fuzz_targets/rng.rs index d9cd11f099..13548664a8 100644 --- a/fuzz/fuzz_targets/rng.rs +++ b/fuzz/fuzz_targets/rng.rs @@ -103,6 +103,7 @@ fuzz_target!(|bytes: &[u8]| -> Corpus { mem: guest_memory, interrupt_cb: Arc::new(NoopVirtioInterrupt {}), queues: vec![(0, q, evt)], + device_status: Arc::new(std::sync::atomic::AtomicU8::new(0)), }) .ok(); diff --git a/fuzz/fuzz_targets/vsock.rs b/fuzz/fuzz_targets/vsock.rs index 72bdeb4d63..33ebe78886 100644 --- a/fuzz/fuzz_targets/vsock.rs +++ b/fuzz/fuzz_targets/vsock.rs @@ -112,6 +112,7 @@ fuzz_target!(|bytes: &[u8]| -> Corpus { mem: guest_memory, interrupt_cb: Arc::new(NoopVirtioInterrupt {}), queues: vec![(0, q, evt)], + device_status: Arc::new(std::sync::atomic::AtomicU8::new(0)), }) .ok(); diff --git a/fuzz/fuzz_targets/watchdog.rs b/fuzz/fuzz_targets/watchdog.rs index 8736f8af3f..31361755df 100644 --- a/fuzz/fuzz_targets/watchdog.rs +++ b/fuzz/fuzz_targets/watchdog.rs @@ -68,6 +68,7 @@ fuzz_target!(|bytes: &[u8]| -> Corpus { mem: guest_memory, interrupt_cb: Arc::new(NoopVirtioInterrupt {}), queues: vec![(0, q, evt)], + device_status: Arc::new(std::sync::atomic::AtomicU8::new(0)), }) .ok(); diff --git a/virtio-devices/src/block.rs b/virtio-devices/src/block.rs index 82eca61827..ee05b6e8d6 100644 --- a/virtio-devices/src/block.rs +++ b/virtio-devices/src/block.rs @@ -13,7 +13,7 @@ 
use std::num::Wrapping; use std::ops::Deref; use std::os::unix::io::AsRawFd; use std::path::PathBuf; -use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicU8, AtomicU64, Ordering}; use std::sync::{Arc, Barrier}; use std::{io, result}; @@ -161,6 +161,8 @@ struct BlockEpollHandler { host_cpus: Option>, acked_features: u64, disable_sector0_writes: bool, + #[allow(unused)] + device_status: Arc, } fn has_feature(features: u64, feature_flag: u64) -> bool { @@ -666,6 +668,7 @@ pub struct Block { queue_affinity: BTreeMap>, disable_sector0_writes: bool, lock_granularity_choice: LockGranularityChoice, + device_status: Arc, } #[derive(Serialize, Deserialize)] @@ -813,6 +816,7 @@ impl Block { queue_affinity, disable_sector0_writes, lock_granularity_choice: lock_granularity, + device_status: Arc::new(AtomicU8::new(0)), }) } @@ -1013,7 +1017,9 @@ impl VirtioDevice for Block { mem, interrupt_cb, mut queues, + device_status, } = context; + self.device_status = device_status; // See if the guest didn't ack the device being read-only. // If so, warn and pretend it did. 
let original_acked_features = self.common.acked_features; @@ -1069,6 +1075,7 @@ impl VirtioDevice for Block { host_cpus: self.queue_affinity.get(&queue_idx).cloned(), acked_features: self.common.acked_features, disable_sector0_writes: self.disable_sector0_writes, + device_status: self.device_status.clone(), }; let paused = self.common.paused.clone(); diff --git a/virtio-devices/src/device.rs b/virtio-devices/src/device.rs index d5e873bf42..4b5cdaf03f 100644 --- a/virtio-devices/src/device.rs +++ b/virtio-devices/src/device.rs @@ -9,7 +9,7 @@ use std::collections::HashMap; use std::io::Write; use std::num::Wrapping; -use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicU8, Ordering}; use std::sync::{Arc, Barrier}; use std::thread; @@ -57,6 +57,7 @@ pub struct ActivationContext { pub mem: GuestMemoryAtomic, pub interrupt_cb: Arc, pub queues: Vec<(usize, Queue, EventFd)>, + pub device_status: Arc, } /// Trait for virtio devices to be driven by a virtio transport. diff --git a/virtio-devices/src/net.rs b/virtio-devices/src/net.rs index 2e3aa02eaf..9f72b93103 100644 --- a/virtio-devices/src/net.rs +++ b/virtio-devices/src/net.rs @@ -10,7 +10,7 @@ use std::net::IpAddr; use std::num::Wrapping; use std::ops::Deref; use std::os::unix::io::{AsRawFd, RawFd}; -use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicU8, Ordering}; use std::sync::{Arc, Barrier}; use std::{result, thread}; @@ -179,6 +179,8 @@ struct NetEpollHandler { // a restore as the vCPU thread isn't ready to handle the interrupt. This causes // issues when combined with VIRTIO_RING_F_EVENT_IDX interrupt suppression. 
driver_awake: bool, + #[allow(unused)] + device_status: Arc, } impl NetEpollHandler { @@ -414,6 +416,7 @@ pub struct Net { seccomp_action: SeccompAction, rate_limiter_config: Option, exit_evt: EventFd, + device_status: Arc, } #[derive(Serialize, Deserialize)] @@ -535,6 +538,7 @@ impl Net { seccomp_action, rate_limiter_config, exit_evt, + device_status: Arc::new(AtomicU8::new(0)), }) } @@ -698,7 +702,9 @@ impl VirtioDevice for Net { mem, interrupt_cb, mut queues, + device_status, } = context; + self.device_status = device_status; self.common.activate(&queues, interrupt_cb.clone())?; let num_queues = queues.len(); @@ -803,6 +809,7 @@ impl VirtioDevice for Net { kill_evt, pause_evt, driver_awake: false, + device_status: self.device_status.clone(), }; let paused = self.common.paused.clone(); diff --git a/virtio-devices/src/transport/pci_device.rs b/virtio-devices/src/transport/pci_device.rs index 1a8d3da96e..7f049070d6 100644 --- a/virtio-devices/src/transport/pci_device.rs +++ b/virtio-devices/src/transport/pci_device.rs @@ -10,7 +10,7 @@ use std::any::Any; use std::cmp; use std::io::Write; use std::ops::Deref; -use std::sync::atomic::{AtomicBool, AtomicU16, AtomicUsize, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicU8, AtomicU16, AtomicUsize, Ordering}; use std::sync::{Arc, Barrier, Mutex}; use anyhow::anyhow; @@ -306,6 +306,7 @@ pub struct VirtioPciDeviceActivator { queues: Option>, barrier: Option>, id: String, + status: Arc, } impl VirtioPciDeviceActivator { @@ -315,6 +316,7 @@ impl VirtioPciDeviceActivator { mem: self.memory.take().unwrap(), interrupt_cb: self.interrupt.take().unwrap(), queues: self.queues.take().unwrap(), + device_status: self.status, })?; self.device_activated.store(true, Ordering::SeqCst); @@ -820,6 +822,7 @@ impl VirtioPciDevice { device_activated: self.device_activated.clone(), barrier, id: self.id.clone(), + status: self.common_config.driver_status.clone(), } } diff --git a/virtio-devices/src/vsock/device.rs 
b/virtio-devices/src/vsock/device.rs index 5b215c3597..6d38ecf398 100644 --- a/virtio-devices/src/vsock/device.rs +++ b/virtio-devices/src/vsock/device.rs @@ -598,6 +598,7 @@ mod unit_tests { mem: memory.clone(), interrupt_cb: Arc::new(NoopVirtioInterrupt {}), queues: Vec::new(), + device_status: Arc::new(std::sync::atomic::AtomicU8::new(0)), }); match bad_activate { Err(ActivateError::BadActivate) => (), @@ -626,6 +627,7 @@ mod unit_tests { EventFd::new(EFD_NONBLOCK).unwrap(), ), ], + device_status: Arc::new(std::sync::atomic::AtomicU8::new(0)), }) .unwrap(); } From 563303b50a6d4a06b0660840887684d6167da3c2 Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Thu, 12 Mar 2026 09:56:49 -0700 Subject: [PATCH 151/742] virtio-devices: net: handle corrupted requests with NEEDS_RESET A buggy or malicious guest may write an inappropriate value into virtqueue's next_avail field. This will result in an error when iterating over the queue: https://github.com/rust-vmm/vm-virtio/blob/863837ef863f6880bb8357e60bbac49e72c0844c/virtio-queue/src/queue.rs#L708 but this error is (logged and) ignored if pop_descriptor_chain() is used: https://github.com/rust-vmm/vm-virtio/blob/863837ef863f6880bb8357e60bbac49e72c0844c/virtio-queue/src/queue.rs#L583 A reasonable approach, implemented here, is to mark the device as NEEDS_RESET and ignore further queue events until the guest reinitializes the device. How this patch was tested: Linux kernel was patched to trigger a bad next_avail when the virtqueue queue counter reaches 5000: --------------- START OF LINUX KERNEL PATCH ---------- $ git diff diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index b784aab668670..989f2a0c64a77 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -15,6 +15,9 @@ #include #include + +void virtqueue_kick_always(struct virtqueue *vq); + #ifdef DEBUG /* For development, we want to crash whenever the ring is screwed. */ #define BAD_RING(_vq, fmt, args...) 
\ @@ -677,6 +680,12 @@ static inline int virtqueue_add_split( struct virtqueue *_vq, * new available array entries. */ virtio_wmb(vq->weak_barriers); vq->split.avail_idx_shadow++; + { + if ((vq->split.avail_idx_shadow % 100) == 0) + printk(KERN_ERR "avail idx: %d", + (int)vq->split.avail_idx_shadow); + if (vq->split.avail_idx_shadow == 5000) + vq->split.avail_idx_shadow = 0; + } vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->split.avail_idx_shadow); vq->num_added++; @@ -689,6 +698,11 @@ static inline int virtqueue_add_split( struct virtqueue *_vq, if (unlikely(vq->num_added == (1 << 16) - 1)) virtqueue_kick(_vq); + { + if (unlikely(vq->split.avail_idx_shadow == 0)) + virtqueue_kick_always(_vq); + } + return 0; unmap_release: @@ -2515,6 +2529,11 @@ bool virtqueue_kick(struct virtqueue *vq) } EXPORT_SYMBOL_GPL(virtqueue_kick); +void virtqueue_kick_always(struct virtqueue *vq) +{ + virtqueue_kick_prepare(vq); + virtqueue_notify(vq); +} /** * virtqueue_get_buf_ctx - get the next used buffer * @_vq: the struct virtqueue we're talking about. --------------- END OF LINUX KERNEL PATCH ---------- Then the kernel was booted, and the host pinged until the nic became unresponsive: ping -i 0.002 192.168.4.1 Device status was confirmed using cat /sys/class/net/eth0/device/status (it was 0x4f). Then the device was re-initialized: DEV_NAME=$(basename $(readlink -f /sys/class/net/eth0/device)) echo $DEV_NAME | tee /sys/bus/virtio/drivers/virtio_net/unbind echo $DEV_NAME | tee /sys/bus/virtio/drivers/virtio_net/bind ip link set eth0 up At this point networking became healthy again. 
Signed-off-by: Peter Oskolkov --- net_util/src/queue_pair.rs | 16 +++++++-- virtio-devices/src/lib.rs | 1 + virtio-devices/src/net.rs | 67 +++++++++++++++++++++++++++++++------- 3 files changed, 70 insertions(+), 14 deletions(-) diff --git a/net_util/src/queue_pair.rs b/net_util/src/queue_pair.rs index 86a1c758dc..c0b8825e71 100644 --- a/net_util/src/queue_pair.rs +++ b/net_util/src/queue_pair.rs @@ -51,7 +51,13 @@ impl TxVirtio { let mut retry_write = false; let mut rate_limit_reached = false; - while let Some(mut desc_chain) = queue.pop_descriptor_chain(mem) { + loop { + let mut iter = queue + .iter(mem) + .map_err(NetQueuePairError::QueueIteratorFailed)?; + let Some(mut desc_chain) = iter.next() else { + break; + }; if rate_limit_reached { queue.go_to_previous_position(); break; @@ -180,7 +186,13 @@ impl RxVirtio { let mut exhausted_descs = true; let mut rate_limit_reached = false; - while let Some(mut desc_chain) = queue.pop_descriptor_chain(mem) { + loop { + let mut iter = queue + .iter(mem) + .map_err(NetQueuePairError::QueueIteratorFailed)?; + let Some(mut desc_chain) = iter.next() else { + break; + }; if rate_limit_reached { exhausted_descs = false; queue.go_to_previous_position(); diff --git a/virtio-devices/src/lib.rs b/virtio-devices/src/lib.rs index 14e565ccb8..f69ff5579e 100644 --- a/virtio-devices/src/lib.rs +++ b/virtio-devices/src/lib.rs @@ -66,6 +66,7 @@ const DEVICE_ACKNOWLEDGE: u32 = 0x01; const DEVICE_DRIVER: u32 = 0x02; const DEVICE_DRIVER_OK: u32 = 0x04; const DEVICE_FEATURES_OK: u32 = 0x08; +const DEVICE_NEEDS_RESET: u32 = 0x40; const DEVICE_FAILED: u32 = 0x80; const VIRTIO_F_RING_INDIRECT_DESC: u32 = 28; diff --git a/virtio-devices/src/net.rs b/virtio-devices/src/net.rs index 9f72b93103..ec8afc2cc7 100644 --- a/virtio-devices/src/net.rs +++ b/virtio-devices/src/net.rs @@ -16,7 +16,7 @@ use std::{result, thread}; use anyhow::anyhow; use event_monitor::event; -use log::{debug, error, info}; +use log::{debug, error, info, warn}; 
#[cfg(not(fuzzing))] use net_util::virtio_features_to_tap_offload; use net_util::{ @@ -179,7 +179,6 @@ struct NetEpollHandler { // a restore as the vCPU thread isn't ready to handle the interrupt. This causes // issues when combined with VIRTIO_RING_F_EVENT_IDX interrupt suppression. driver_awake: bool, - #[allow(unused)] device_status: Arc, } @@ -194,6 +193,9 @@ impl NetEpollHandler { } fn handle_rx_event(&mut self) -> result::Result<(), DeviceError> { + if self.needs_reset() { + return Ok(()); + } let queue_evt = &self.queue_evt_pair.0; if let Err(e) = queue_evt.read() { error!("Failed to get rx queue event: {e:?}"); @@ -222,13 +224,43 @@ impl NetEpollHandler { Ok(()) } + fn handle_queue_iterator_error(&mut self, err: &virtio_queue::Error) { + // The guest submitted a corrupted VirtQ request, and the error + // was logged during queue processing. We cannot just ignore the + // error, as the guest could continue spamming the VMM with bad + // requests, triggering excessive error logging. So we mark + // the device "NEEDS_RESET", effectively stopping all request + // processing (see self.needs_reset() usage) until the guest + // resets and reactivates the device. + + warn!( + "Corrupted request detected (virtqueue error: {err:?}). \ +Setting device status to 'NEEDS_RESET' and stopping processing queues until reset." + ); + + self.device_status + .fetch_or(crate::DEVICE_NEEDS_RESET as u8, Ordering::SeqCst); + + // Let the guest know that the device status has changed. + if let Err(e) = self.interrupt_cb.trigger(VirtioInterruptType::Config) { + error!("Failed to signal config interrupt: {e:?}"); + } + } + fn process_tx(&mut self) -> result::Result<(), DeviceError> { - if self + if self.needs_reset() { + return Ok(()); + } + let res = self .net - .process_tx(&self.mem.memory(), &mut self.queue_pair.1) - .map_err(DeviceError::NetQueuePair)? 
- || !self.driver_awake - { + .process_tx(&self.mem.memory(), &mut self.queue_pair.1); + + if let Err(net_util::NetQueuePairError::QueueIteratorFailed(err)) = res { + self.handle_queue_iterator_error(&err); + return Ok(()); + } + + if res.map_err(DeviceError::NetQueuePair)? || !self.driver_awake { self.signal_used_queue(self.queue_index_base + 1)?; debug!("Signalling TX queue"); } else { @@ -252,12 +284,19 @@ impl NetEpollHandler { } fn handle_rx_tap_event(&mut self) -> result::Result<(), DeviceError> { - if self + if self.needs_reset() { + return Ok(()); + } + let res = self .net - .process_rx(&self.mem.memory(), &mut self.queue_pair.0) - .map_err(DeviceError::NetQueuePair)? - || !self.driver_awake - { + .process_rx(&self.mem.memory(), &mut self.queue_pair.0); + + if let Err(net_util::NetQueuePairError::QueueIteratorFailed(err)) = res { + self.handle_queue_iterator_error(&err); + return Ok(()); + } + + if res.map_err(DeviceError::NetQueuePair)? || !self.driver_awake { self.signal_used_queue(self.queue_index_base)?; debug!("Signalling RX queue"); } else { @@ -307,6 +346,10 @@ impl NetEpollHandler { Ok(()) } + + fn needs_reset(&self) -> bool { + (self.device_status.load(Ordering::Acquire) & crate::DEVICE_NEEDS_RESET as u8) != 0 + } } impl EpollHelperHandler for NetEpollHandler { From 8b60b38281f95a519802e21358080582c28c46ef Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Thu, 12 Mar 2026 09:58:15 -0700 Subject: [PATCH 152/742] virtio-devices: block: handle corrupted requests with NEEDS_RESET Signed-off-by: Peter Oskolkov --- virtio-devices/src/block.rs | 46 +++++++++++++++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/virtio-devices/src/block.rs b/virtio-devices/src/block.rs index ee05b6e8d6..1668565340 100644 --- a/virtio-devices/src/block.rs +++ b/virtio-devices/src/block.rs @@ -161,7 +161,6 @@ struct BlockEpollHandler { host_cpus: Option>, acked_features: u64, disable_sector0_writes: bool, - #[allow(unused)] 
device_status: Arc, } @@ -170,6 +169,10 @@ fn has_feature(features: u64, feature_flag: u64) -> bool { } impl BlockEpollHandler { + fn needs_reset(&self) -> bool { + (self.device_status.load(Ordering::Acquire) & crate::DEVICE_NEEDS_RESET as u8) != 0 + } + fn check_request( features: u64, request: &Request, @@ -200,12 +203,48 @@ impl BlockEpollHandler { Ok(()) } + fn handle_queue_iterator_error(&mut self, err: &virtio_queue::Error) { + // The guest submitted a corrupted VirtQ request, and the error + // was logged during queue processing. We cannot just ignore the + // error, as the guest could continue spamming the VMM with bad + // requests, triggering excessive error logging. So we mark + // the device "NEEDS_RESET", effectively stopping all request + // processing (see self.needs_reset() usage) until the guest + // resets and reactivates the device. + + warn!( + "Corrupted request detected (virtqueue error: {err:?}). \ +Setting device status to 'NEEDS_RESET' and stopping processing queues until reset." + ); + + self.device_status + .fetch_or(crate::DEVICE_NEEDS_RESET as u8, Ordering::SeqCst); + + // Let the guest know that the device status has changed. 
+ if let Err(e) = self.interrupt_cb.trigger(VirtioInterruptType::Config) { + error!("Failed to signal config interrupt: {e:?}"); + } + } + fn process_queue_submit(&mut self) -> Result<()> { + if self.needs_reset() { + return Ok(()); + } let queue = &mut self.queue; let mut batch_requests = Vec::new(); let mut batch_inflight_requests = Vec::new(); - while let Some(mut desc_chain) = queue.pop_descriptor_chain(self.mem.memory()) { + loop { + let mut desc_chain = match queue.iter(self.mem.memory()) { + Ok(mut iter) => match iter.next() { + Some(c) => c, + None => break, + }, + Err(err) => { + self.handle_queue_iterator_error(&err); + return Ok(()); + } + }; let mut request = Request::parse(&mut desc_chain, self.access_platform.as_deref()) .map_err(Error::RequestParsing)?; @@ -388,6 +427,9 @@ impl BlockEpollHandler { } fn process_queue_complete(&mut self) -> Result<()> { + if self.needs_reset() { + return Ok(()); + } let mem = self.mem.memory(); let mut read_bytes = Wrapping(0); let mut write_bytes = Wrapping(0); From c53bc3d1701b6e1c2fc67aea3ba90d2ac473aa78 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Fri, 13 Mar 2026 18:15:12 -0700 Subject: [PATCH 153/742] scripts: refactor prepare_vdpa for distro-specific setup Split the vDPA preparation flow into helper functions for building modules, validating availability, loading modules, and creating devices. Build the vdpa_sim modules only on Ubuntu, where the script installs dependencies and compiles them from the matching kernel source. On other distributions, reuse the installed kernel modules and verify that they are available before continuing. This makes the script easier to follow and supports systems such as Azure Linux, where the modules are provided by the kernel package. 
Signed-off-by: Muminul Islam --- scripts/prepare_vdpa.sh | 92 +++++++++++++++++++++++++++++------------ 1 file changed, 65 insertions(+), 27 deletions(-) diff --git a/scripts/prepare_vdpa.sh b/scripts/prepare_vdpa.sh index 4a99daaf7b..601e61dbcf 100755 --- a/scripts/prepare_vdpa.sh +++ b/scripts/prepare_vdpa.sh @@ -1,35 +1,73 @@ #!/usr/bin/env bash set -x -sudo apt install -y libncurses-dev gawk flex bison openssl libssl-dev dkms libelf-dev libudev-dev libpci-dev libiberty-dev autoconf git make dpkg-dev libmnl-dev pkg-config iproute2 -sudo sed -i -- 's/# deb-src/deb-src/g' /etc/apt/sources.list -sudo apt update -apt-get source linux-image-unsigned-"$(uname -r)" -pushd linux-azure*/drivers/vdpa/vdpa_sim/ || exit -# REUSE-IgnoreStart -cat <<'EOF' >Makefile +build_install_vdpa_sim_modules_ubuntu() { + sudo apt install -y libncurses-dev gawk flex bison openssl libssl-dev dkms libelf-dev libudev-dev libpci-dev libiberty-dev autoconf git make dpkg-dev libmnl-dev pkg-config iproute2 + sudo sed -i -- 's/# deb-src/deb-src/g' /etc/apt/sources.list + sudo apt update + apt-get source linux-image-unsigned-"$(uname -r)" + pushd linux-azure*/drivers/vdpa/vdpa_sim/ || exit + # REUSE-IgnoreStart + cat <<'EOF' >Makefile # SPDX-License-Identifier: GPL-2.0 obj-m += vdpa_sim.o obj-m += vdpa_sim_net.o obj-m += vdpa_sim_blk.o EOF -# REUSE-IgnoreEnd -make -C /lib/modules/"$(uname -r)"/build M="$PWD" -sudo make -C /lib/modules/"$(uname -r)"/build M="$PWD" modules_install -popd || exit -sudo depmod -a -sudo modprobe vdpa -sudo modprobe vhost_vdpa -sudo modprobe vdpa_sim -sudo modprobe vdpa_sim_blk -sudo modprobe vdpa_sim_net -# Create /dev/vhost-vdpa-0 -sudo vdpa dev add name vdpa-blk1 mgmtdev vdpasim_blk -# Create /dev/vhost-vdpa-1 -sudo vdpa dev add name vdpa-blk2 mgmtdev vdpasim_blk -# Create /dev/vhost-vdpa-2 -sudo vdpa dev add name vdpa-net1 mgmtdev vdpasim_net -sudo chmod 660 /dev/vhost-vdpa-0 -sudo chmod 660 /dev/vhost-vdpa-1 -sudo chmod 660 /dev/vhost-vdpa-2 -vdpa dev show -jp + # 
REUSE-IgnoreEnd + make -C /lib/modules/"$(uname -r)"/build M="$PWD" + sudo make -C /lib/modules/"$(uname -r)"/build M="$PWD" modules_install + popd || exit + sudo depmod -a +} + +check_vdpa_sim_modules() { + for module in $MODULES; do + modinfo "$module" || { + echo "Module $module is not installed. Please build and install it first." + exit 1 + } + done +} + +modproobe_modules() { + for module in $MODULES; do + sudo modprobe "$module" || { + echo "Failed to load module $module. Please check if it is installed correctly." + exit 1 + } + done +} + +prepare_vdpa() { + # Create /dev/vhost-vdpa-0 + sudo vdpa dev add name vdpa-blk1 mgmtdev vdpasim_blk + # Create /dev/vhost-vdpa-1 + sudo vdpa dev add name vdpa-blk2 mgmtdev vdpasim_blk + # Create /dev/vhost-vdpa-2 + sudo vdpa dev add name vdpa-net1 mgmtdev vdpasim_net + sudo chmod 660 /dev/vhost-vdpa-0 + sudo chmod 660 /dev/vhost-vdpa-1 + sudo chmod 660 /dev/vhost-vdpa-2 + vdpa dev show -jp +} + +MODULES="vdpa vhost_vdpa vdpa_sim vdpa_sim_blk vdpa_sim_net" +DISTRO_NAME="ubuntu" +if [[ -f /etc/lsb-release ]]; then + DISTRO_NAME=$(grep DISTRIB_ID /etc/lsb-release | cut -d '=' -f 2) + # Converts the value of the DISTRO_NAME variable to lowercase letters. + DISTRO_NAME=$(echo "$DISTRO_NAME" | tr '[:upper:]' '[:lower:]') + echo "Distribution Name: $DISTRO_NAME" +fi + +if [[ "$DISTRO_NAME" == "ubuntu" ]]; then + build_install_vdpa_sim_modules_ubuntu +fi +# For other distros, we assume the modules are already built and installed +# For Azure Linux, the modules are included in the kernel and should be available by default +check_vdpa_sim_modules + +modproobe_modules + +prepare_vdpa From 5aa68ddf9ebc0c36fc85eb64e03007b965675284 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Sat, 14 Mar 2026 12:05:17 +0100 Subject: [PATCH 154/742] block: Add raw_async_io_tests shared test helpers Add raw_async_io_tests.rs with punch_hole, write_zeroes, and multiple_operations helpers that take &mut dyn AsyncIo + &mut File. 
These are raw-backend-specific. They verify data by reading the underlying file directly, which only works for plain file backends without container format metadata. Signed-off-by: Anatol Belski --- block/src/lib.rs | 2 + block/src/raw_async_io_tests.rs | 162 ++++++++++++++++++++++++++++++++ 2 files changed, 164 insertions(+) create mode 100644 block/src/raw_async_io_tests.rs diff --git a/block/src/lib.rs b/block/src/lib.rs index f477cd36c6..6e3e50178f 100644 --- a/block/src/lib.rs +++ b/block/src/lib.rs @@ -24,6 +24,8 @@ pub mod qcow_sync; /// Enabled with the `"io_uring"` feature pub mod raw_async; pub mod raw_async_aio; +#[cfg(test)] +mod raw_async_io_tests; pub mod raw_sync; pub mod vhd; pub mod vhdx; diff --git a/block/src/raw_async_io_tests.rs b/block/src/raw_async_io_tests.rs new file mode 100644 index 0000000000..560e41e334 --- /dev/null +++ b/block/src/raw_async_io_tests.rs @@ -0,0 +1,162 @@ +// Copyright 2026 The Cloud Hypervisor Authors. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +//! Shared test helpers for [`AsyncIo`] backends. +//! +//! Each helper takes a `&mut dyn AsyncIo` together with the [`File`] handle +//! that backs the I/O object, so the same logic exercises every backend with +//! only the constructor differing. + +use std::fs::File; +use std::io::{Read, Seek, SeekFrom, Write}; + +use crate::async_io::{AsyncIo, AsyncIoError}; + +/// Tests punching a hole in the middle of a 4 MB file and verifying data +/// integrity around the hole. 
+pub fn test_punch_hole(async_io: &mut dyn AsyncIo, file: &mut File) { + // Write 4MB of data + let data = vec![0xAA; 4 * 1024 * 1024]; + file.write_all(&data).unwrap(); + file.sync_all().unwrap(); + + // Punch hole in the middle (1MB at offset 1MB) + let offset = 1024 * 1024; + let length = 1024 * 1024; + async_io.punch_hole(offset, length, 1).unwrap(); + + // Check completion + let (user_data, result) = async_io.next_completed_request().unwrap(); + assert_eq!(user_data, 1); + assert_eq!(result, 0); + + // Verify the hole reads as zeros + file.seek(SeekFrom::Start(offset)).unwrap(); + let mut read_buf = vec![0; length as usize]; + file.read_exact(&mut read_buf).unwrap(); + assert!( + read_buf.iter().all(|&b| b == 0), + "Punched hole should read as zeros" + ); + + // Verify data before hole is intact + file.seek(SeekFrom::Start(0)).unwrap(); + let mut read_buf = vec![0; 1024]; + file.read_exact(&mut read_buf).unwrap(); + assert!( + read_buf.iter().all(|&b| b == 0xAA), + "Data before hole should be intact" + ); + + // Verify data after hole is intact + file.seek(SeekFrom::Start(offset + length)).unwrap(); + let mut read_buf = vec![0; 1024]; + file.read_exact(&mut read_buf).unwrap(); + assert!( + read_buf.iter().all(|&b| b == 0xAA), + "Data after hole should be intact" + ); +} + +/// Tests writing zeroes to a 512 KB region inside a 4 MB file and verifying +/// surrounding data is preserved. Gracefully skips when the filesystem does +/// not support `FALLOC_FL_ZERO_RANGE`. 
+pub fn test_write_zeroes(async_io: &mut dyn AsyncIo, file: &mut File) { + // Write 4MB of data + let data = vec![0xBB; 4 * 1024 * 1024]; + file.write_all(&data).unwrap(); + file.sync_all().unwrap(); + + // Write zeros in the middle (512KB at offset 2MB) + let offset = 2 * 1024 * 1024; + let length = 512 * 1024; + let write_zeroes_result = async_io.write_zeroes(offset, length, 2); + + // FALLOC_FL_ZERO_RANGE might not be supported on all filesystems (e.g., tmpfs) + // If it fails with ENOTSUP, skip the test + if let Err(AsyncIoError::WriteZeroes(ref e)) = write_zeroes_result + && (e.raw_os_error() == Some(libc::EOPNOTSUPP) || e.raw_os_error() == Some(libc::ENOTSUP)) + { + eprintln!("Skipping test_write_zeroes: filesystem doesn't support FALLOC_FL_ZERO_RANGE"); + return; + } + write_zeroes_result.unwrap(); + + // Check completion + let (user_data, result) = async_io.next_completed_request().unwrap(); + assert_eq!(user_data, 2); + assert_eq!(result, 0); + + // Verify the zeroed region reads as zeros + file.seek(SeekFrom::Start(offset)).unwrap(); + let mut read_buf = vec![0; length as usize]; + file.read_exact(&mut read_buf).unwrap(); + assert!( + read_buf.iter().all(|&b| b == 0), + "Zeroed region should read as zeros" + ); + + // Verify data before zeroed region is intact + file.seek(SeekFrom::Start(offset - 1024)).unwrap(); + let mut read_buf = vec![0; 1024]; + file.read_exact(&mut read_buf).unwrap(); + assert!( + read_buf.iter().all(|&b| b == 0xBB), + "Data before zeroed region should be intact" + ); + + // Verify data after zeroed region is intact + file.seek(SeekFrom::Start(offset + length)).unwrap(); + let mut read_buf = vec![0; 1024]; + file.read_exact(&mut read_buf).unwrap(); + assert!( + read_buf.iter().all(|&b| b == 0xBB), + "Data after zeroed region should be intact" + ); +} + +/// Tests punching multiple holes in an 8 MB file and verifying each hole +/// independently reads as zeroes. 
+pub fn test_punch_hole_multiple_operations(async_io: &mut dyn AsyncIo, file: &mut File) { + // Write 8MB of data + let data = vec![0xCC; 8 * 1024 * 1024]; + file.write_all(&data).unwrap(); + file.sync_all().unwrap(); + + // Punch multiple holes + async_io.punch_hole(1024 * 1024, 512 * 1024, 10).unwrap(); + async_io + .punch_hole(3 * 1024 * 1024, 512 * 1024, 11) + .unwrap(); + async_io + .punch_hole(5 * 1024 * 1024, 512 * 1024, 12) + .unwrap(); + + // Check all completions + let (user_data, result) = async_io.next_completed_request().unwrap(); + assert_eq!(user_data, 10); + assert_eq!(result, 0); + + let (user_data, result) = async_io.next_completed_request().unwrap(); + assert_eq!(user_data, 11); + assert_eq!(result, 0); + + let (user_data, result) = async_io.next_completed_request().unwrap(); + assert_eq!(user_data, 12); + assert_eq!(result, 0); + + // Verify all holes read as zeros + file.seek(SeekFrom::Start(1024 * 1024)).unwrap(); + let mut read_buf = vec![0; 512 * 1024]; + file.read_exact(&mut read_buf).unwrap(); + assert!(read_buf.iter().all(|&b| b == 0)); + + file.seek(SeekFrom::Start(3 * 1024 * 1024)).unwrap(); + file.read_exact(&mut read_buf).unwrap(); + assert!(read_buf.iter().all(|&b| b == 0)); + + file.seek(SeekFrom::Start(5 * 1024 * 1024)).unwrap(); + file.read_exact(&mut read_buf).unwrap(); + assert!(read_buf.iter().all(|&b| b == 0)); +} From d883b54fb7cf6fff2ea53645b16fb6f51f727f82 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Sat, 14 Mar 2026 12:07:57 +0100 Subject: [PATCH 155/742] block: Deduplicate raw_sync and raw_async_aio tests Replace duplicated test bodies with thin wrappers that construct the backend-specific AsyncIo instance and delegate to the shared raw_async_io_tests helpers. 
Signed-off-by: Anatol Belski --- block/src/raw_async_aio.rs | 150 ++----------------------------------- block/src/raw_sync.rs | 150 ++----------------------------------- 2 files changed, 10 insertions(+), 290 deletions(-) diff --git a/block/src/raw_async_aio.rs b/block/src/raw_async_aio.rs index 20fb26c2c4..7f66ce4114 100644 --- a/block/src/raw_async_aio.rs +++ b/block/src/raw_async_aio.rs @@ -242,174 +242,34 @@ impl AsyncIo for RawFileAsyncAio { #[cfg(test)] mod unit_tests { - use std::io::{Read, Seek, SeekFrom, Write}; + use std::os::unix::io::AsRawFd; use vmm_sys_util::tempfile::TempFile; use super::*; + use crate::raw_async_io_tests; #[test] fn test_punch_hole() { let temp_file = TempFile::new().unwrap(); let mut file = temp_file.into_file(); - - // Write 4MB of data - let data = vec![0xAA; 4 * 1024 * 1024]; - file.write_all(&data).unwrap(); - file.sync_all().unwrap(); - - // Create async IO instance let mut async_io = RawFileAsyncAio::new(file.as_raw_fd(), 128).unwrap(); - - // Punch hole in the middle (1MB at offset 1MB) - let offset = 1024 * 1024; - let length = 1024 * 1024; - async_io.punch_hole(offset, length, 1).unwrap(); - - // Check completion - let (user_data, result) = async_io.next_completed_request().unwrap(); - assert_eq!(user_data, 1); - assert_eq!(result, 0); - - // Verify the hole reads as zeros - file.seek(SeekFrom::Start(offset)).unwrap(); - let mut read_buf = vec![0; length as usize]; - file.read_exact(&mut read_buf).unwrap(); - assert!( - read_buf.iter().all(|&b| b == 0), - "Punched hole should read as zeros" - ); - - // Verify data before hole is intact - file.seek(SeekFrom::Start(0)).unwrap(); - let mut read_buf = vec![0; 1024]; - file.read_exact(&mut read_buf).unwrap(); - assert!( - read_buf.iter().all(|&b| b == 0xAA), - "Data before hole should be intact" - ); - - // Verify data after hole is intact - file.seek(SeekFrom::Start(offset + length)).unwrap(); - let mut read_buf = vec![0; 1024]; - file.read_exact(&mut read_buf).unwrap(); - 
assert!( - read_buf.iter().all(|&b| b == 0xAA), - "Data after hole should be intact" - ); + raw_async_io_tests::test_punch_hole(&mut async_io, &mut file); } #[test] fn test_write_zeroes() { let temp_file = TempFile::new().unwrap(); let mut file = temp_file.into_file(); - - // Write 4MB of data - let data = vec![0xBB; 4 * 1024 * 1024]; - file.write_all(&data).unwrap(); - file.sync_all().unwrap(); - - // Create async IO instance let mut async_io = RawFileAsyncAio::new(file.as_raw_fd(), 128).unwrap(); - - // Write zeros in the middle (512KB at offset 2MB) - let offset = 2 * 1024 * 1024; - let length = 512 * 1024; - let write_zeroes_result = async_io.write_zeroes(offset, length, 2); - - // FALLOC_FL_ZERO_RANGE might not be supported on all filesystems (e.g., tmpfs) - // If it fails with ENOTSUP, skip the test - if let Err(AsyncIoError::WriteZeroes(ref e)) = write_zeroes_result - && (e.raw_os_error() == Some(libc::EOPNOTSUPP) - || e.raw_os_error() == Some(libc::ENOTSUP)) - { - eprintln!( - "Skipping test_write_zeroes: filesystem doesn't support FALLOC_FL_ZERO_RANGE" - ); - return; - } - write_zeroes_result.unwrap(); - - // Check completion - let (user_data, result) = async_io.next_completed_request().unwrap(); - assert_eq!(user_data, 2); - assert_eq!(result, 0); - - // Verify the zeroed region reads as zeros - file.seek(SeekFrom::Start(offset)).unwrap(); - let mut read_buf = vec![0; length as usize]; - file.read_exact(&mut read_buf).unwrap(); - assert!( - read_buf.iter().all(|&b| b == 0), - "Zeroed region should read as zeros" - ); - - // Verify data before zeroed region is intact - file.seek(SeekFrom::Start(offset - 1024)).unwrap(); - let mut read_buf = vec![0; 1024]; - file.read_exact(&mut read_buf).unwrap(); - assert!( - read_buf.iter().all(|&b| b == 0xBB), - "Data before zeroed region should be intact" - ); - - // Verify data after zeroed region is intact - file.seek(SeekFrom::Start(offset + length)).unwrap(); - let mut read_buf = vec![0; 1024]; - 
file.read_exact(&mut read_buf).unwrap(); - assert!( - read_buf.iter().all(|&b| b == 0xBB), - "Data after zeroed region should be intact" - ); + raw_async_io_tests::test_write_zeroes(&mut async_io, &mut file); } #[test] fn test_punch_hole_multiple_operations() { let temp_file = TempFile::new().unwrap(); let mut file = temp_file.into_file(); - - // Write 8MB of data - let data = vec![0xCC; 8 * 1024 * 1024]; - file.write_all(&data).unwrap(); - file.sync_all().unwrap(); - - // Create async IO instance let mut async_io = RawFileAsyncAio::new(file.as_raw_fd(), 128).unwrap(); - - // Punch multiple holes - async_io.punch_hole(1024 * 1024, 512 * 1024, 10).unwrap(); - async_io - .punch_hole(3 * 1024 * 1024, 512 * 1024, 11) - .unwrap(); - async_io - .punch_hole(5 * 1024 * 1024, 512 * 1024, 12) - .unwrap(); - - // Check all completions - let (user_data, result) = async_io.next_completed_request().unwrap(); - assert_eq!(user_data, 10); - assert_eq!(result, 0); - - let (user_data, result) = async_io.next_completed_request().unwrap(); - assert_eq!(user_data, 11); - assert_eq!(result, 0); - - let (user_data, result) = async_io.next_completed_request().unwrap(); - assert_eq!(user_data, 12); - assert_eq!(result, 0); - - // Verify all holes read as zeros - file.seek(SeekFrom::Start(1024 * 1024)).unwrap(); - let mut read_buf = vec![0; 512 * 1024]; - file.read_exact(&mut read_buf).unwrap(); - assert!(read_buf.iter().all(|&b| b == 0)); - - file.seek(SeekFrom::Start(3 * 1024 * 1024)).unwrap(); - file.read_exact(&mut read_buf).unwrap(); - assert!(read_buf.iter().all(|&b| b == 0)); - - file.seek(SeekFrom::Start(5 * 1024 * 1024)).unwrap(); - file.read_exact(&mut read_buf).unwrap(); - assert!(read_buf.iter().all(|&b| b == 0)); + raw_async_io_tests::test_punch_hole_multiple_operations(&mut async_io, &mut file); } } diff --git a/block/src/raw_sync.rs b/block/src/raw_sync.rs index 9c2d6b7893..0e60a48a75 100644 --- a/block/src/raw_sync.rs +++ b/block/src/raw_sync.rs @@ -211,174 +211,34 @@ impl 
AsyncIo for RawFileSync { #[cfg(test)] mod unit_tests { - use std::io::{Read, Seek, SeekFrom, Write}; + use std::os::unix::io::AsRawFd; use vmm_sys_util::tempfile::TempFile; use super::*; + use crate::raw_async_io_tests; #[test] fn test_punch_hole() { let temp_file = TempFile::new().unwrap(); let mut file = temp_file.into_file(); - - // Write 4MB of data - let data = vec![0xAA; 4 * 1024 * 1024]; - file.write_all(&data).unwrap(); - file.sync_all().unwrap(); - - // Create async IO instance let mut async_io = RawFileSync::new(file.as_raw_fd()); - - // Punch hole in the middle (1MB at offset 1MB) - let offset = 1024 * 1024; - let length = 1024 * 1024; - async_io.punch_hole(offset, length, 1).unwrap(); - - // Check completion - let (user_data, result) = async_io.next_completed_request().unwrap(); - assert_eq!(user_data, 1); - assert_eq!(result, 0); - - // Verify the hole reads as zeros - file.seek(SeekFrom::Start(offset)).unwrap(); - let mut read_buf = vec![0; length as usize]; - file.read_exact(&mut read_buf).unwrap(); - assert!( - read_buf.iter().all(|&b| b == 0), - "Punched hole should read as zeros" - ); - - // Verify data before hole is intact - file.seek(SeekFrom::Start(0)).unwrap(); - let mut read_buf = vec![0; 1024]; - file.read_exact(&mut read_buf).unwrap(); - assert!( - read_buf.iter().all(|&b| b == 0xAA), - "Data before hole should be intact" - ); - - // Verify data after hole is intact - file.seek(SeekFrom::Start(offset + length)).unwrap(); - let mut read_buf = vec![0; 1024]; - file.read_exact(&mut read_buf).unwrap(); - assert!( - read_buf.iter().all(|&b| b == 0xAA), - "Data after hole should be intact" - ); + raw_async_io_tests::test_punch_hole(&mut async_io, &mut file); } #[test] fn test_write_zeroes() { let temp_file = TempFile::new().unwrap(); let mut file = temp_file.into_file(); - - // Write 4MB of data - let data = vec![0xBB; 4 * 1024 * 1024]; - file.write_all(&data).unwrap(); - file.sync_all().unwrap(); - - // Create async IO instance let mut 
async_io = RawFileSync::new(file.as_raw_fd()); - - // Write zeros in the middle (512KB at offset 2MB) - let offset = 2 * 1024 * 1024; - let length = 512 * 1024; - let write_zeroes_result = async_io.write_zeroes(offset, length, 2); - - // FALLOC_FL_ZERO_RANGE might not be supported on all filesystems (e.g., tmpfs) - // If it fails with ENOTSUP, skip the test - if let Err(AsyncIoError::WriteZeroes(ref e)) = write_zeroes_result - && (e.raw_os_error() == Some(libc::EOPNOTSUPP) - || e.raw_os_error() == Some(libc::ENOTSUP)) - { - eprintln!( - "Skipping test_write_zeroes: filesystem doesn't support FALLOC_FL_ZERO_RANGE" - ); - return; - } - write_zeroes_result.unwrap(); - - // Check completion - let (user_data, result) = async_io.next_completed_request().unwrap(); - assert_eq!(user_data, 2); - assert_eq!(result, 0); - - // Verify the zeroed region reads as zeros - file.seek(SeekFrom::Start(offset)).unwrap(); - let mut read_buf = vec![0; length as usize]; - file.read_exact(&mut read_buf).unwrap(); - assert!( - read_buf.iter().all(|&b| b == 0), - "Zeroed region should read as zeros" - ); - - // Verify data before zeroed region is intact - file.seek(SeekFrom::Start(offset - 1024)).unwrap(); - let mut read_buf = vec![0; 1024]; - file.read_exact(&mut read_buf).unwrap(); - assert!( - read_buf.iter().all(|&b| b == 0xBB), - "Data before zeroed region should be intact" - ); - - // Verify data after zeroed region is intact - file.seek(SeekFrom::Start(offset + length)).unwrap(); - let mut read_buf = vec![0; 1024]; - file.read_exact(&mut read_buf).unwrap(); - assert!( - read_buf.iter().all(|&b| b == 0xBB), - "Data after zeroed region should be intact" - ); + raw_async_io_tests::test_write_zeroes(&mut async_io, &mut file); } #[test] fn test_punch_hole_multiple_operations() { let temp_file = TempFile::new().unwrap(); let mut file = temp_file.into_file(); - - // Write 8MB of data - let data = vec![0xCC; 8 * 1024 * 1024]; - file.write_all(&data).unwrap(); - file.sync_all().unwrap(); - - 
// Create async IO instance let mut async_io = RawFileSync::new(file.as_raw_fd()); - - // Punch multiple holes - async_io.punch_hole(1024 * 1024, 512 * 1024, 10).unwrap(); - async_io - .punch_hole(3 * 1024 * 1024, 512 * 1024, 11) - .unwrap(); - async_io - .punch_hole(5 * 1024 * 1024, 512 * 1024, 12) - .unwrap(); - - // Check all completions - let (user_data, result) = async_io.next_completed_request().unwrap(); - assert_eq!(user_data, 10); - assert_eq!(result, 0); - - let (user_data, result) = async_io.next_completed_request().unwrap(); - assert_eq!(user_data, 11); - assert_eq!(result, 0); - - let (user_data, result) = async_io.next_completed_request().unwrap(); - assert_eq!(user_data, 12); - assert_eq!(result, 0); - - // Verify all holes read as zeros - file.seek(SeekFrom::Start(1024 * 1024)).unwrap(); - let mut read_buf = vec![0; 512 * 1024]; - file.read_exact(&mut read_buf).unwrap(); - assert!(read_buf.iter().all(|&b| b == 0)); - - file.seek(SeekFrom::Start(3 * 1024 * 1024)).unwrap(); - file.read_exact(&mut read_buf).unwrap(); - assert!(read_buf.iter().all(|&b| b == 0)); - - file.seek(SeekFrom::Start(5 * 1024 * 1024)).unwrap(); - file.read_exact(&mut read_buf).unwrap(); - assert!(read_buf.iter().all(|&b| b == 0)); + raw_async_io_tests::test_punch_hole_multiple_operations(&mut async_io, &mut file); } } From 05aa9ebcd412476d79964d749a9a17f6c8473984 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Sat, 14 Mar 2026 13:26:52 +0100 Subject: [PATCH 156/742] block: Replace local FALLOC_FL_* constants with libc::* in probe probe_file_sparse_support() defined FALLOC_FL_KEEP_SIZE, FALLOC_FL_PUNCH_HOLE, and FALLOC_FL_ZERO_RANGE as local constants. These are available from the libc crate directly. 
Signed-off-by: Anatol Belski --- block/src/lib.rs | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/block/src/lib.rs b/block/src/lib.rs index 6e3e50178f..c02b315cc2 100644 --- a/block/src/lib.rs +++ b/block/src/lib.rs @@ -45,7 +45,9 @@ use std::{cmp, mem, result}; #[cfg(feature = "io_uring")] use io_uring::{IoUring, Probe, opcode}; -use libc::{S_IFBLK, S_IFMT, ioctl}; +use libc::{ + FALLOC_FL_KEEP_SIZE, FALLOC_FL_PUNCH_HOLE, FALLOC_FL_ZERO_RANGE, S_IFBLK, S_IFMT, ioctl, +}; use log::{debug, error, info, warn}; use serde::{Deserialize, Serialize}; use smallvec::SmallVec; @@ -807,10 +809,6 @@ pub fn probe_sparse_support(file: &File) -> bool { /// Probe sparse support for a regular file using fallocate(). fn probe_file_sparse_support(fd: libc::c_int) -> bool { - const FALLOC_FL_KEEP_SIZE: libc::c_int = 0x01; - const FALLOC_FL_PUNCH_HOLE: libc::c_int = 0x02; - const FALLOC_FL_ZERO_RANGE: libc::c_int = 0x10; - // SAFETY: FFI call with valid fd let file_size = unsafe { libc::lseek(fd, 0, libc::SEEK_END) }; if file_size < 0 { From 1733e08a0f075c804db299bd0ae883ed94dffe9b Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Sat, 14 Mar 2026 13:27:18 +0100 Subject: [PATCH 157/742] block: raw: Replace FALLOC_FL_* consts with libc::* in raw backends raw_sync, raw_async, and raw_async_aio each defined FALLOC_FL_PUNCH_HOLE, FALLOC_FL_KEEP_SIZE, and FALLOC_FL_ZERO_RANGE as local constants in their punch_hole() and write_zeroes() implementations. These are available from the libc crate directly. 
Signed-off-by: Anatol Belski --- block/src/raw_async.rs | 5 +---- block/src/raw_async_aio.rs | 5 +---- block/src/raw_sync.rs | 5 +---- 3 files changed, 3 insertions(+), 12 deletions(-) diff --git a/block/src/raw_async.rs b/block/src/raw_async.rs index 152e5fa3ba..8544040f50 100644 --- a/block/src/raw_async.rs +++ b/block/src/raw_async.rs @@ -7,6 +7,7 @@ use std::io::{Error, Seek, SeekFrom}; use std::os::unix::io::{AsRawFd, RawFd}; use io_uring::{IoUring, opcode, types}; +use libc::{FALLOC_FL_KEEP_SIZE, FALLOC_FL_PUNCH_HOLE, FALLOC_FL_ZERO_RANGE}; use log::warn; use vmm_sys_util::eventfd::EventFd; @@ -268,8 +269,6 @@ impl AsyncIo for RawFileAsync { fn punch_hole(&mut self, offset: u64, length: u64, user_data: u64) -> AsyncIoResult<()> { let (submitter, mut sq, _) = self.io_uring.split(); - const FALLOC_FL_PUNCH_HOLE: i32 = 0x02; - const FALLOC_FL_KEEP_SIZE: i32 = 0x01; let mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; // SAFETY: The file descriptor is known to be valid. @@ -295,8 +294,6 @@ impl AsyncIo for RawFileAsync { fn write_zeroes(&mut self, offset: u64, length: u64, user_data: u64) -> AsyncIoResult<()> { let (submitter, mut sq, _) = self.io_uring.split(); - const FALLOC_FL_ZERO_RANGE: i32 = 0x10; - const FALLOC_FL_KEEP_SIZE: i32 = 0x01; let mode = FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE; // SAFETY: The file descriptor is known to be valid. 
diff --git a/block/src/raw_async_aio.rs b/block/src/raw_async_aio.rs index 7f66ce4114..f59e463b4c 100644 --- a/block/src/raw_async_aio.rs +++ b/block/src/raw_async_aio.rs @@ -10,6 +10,7 @@ use std::fs::File; use std::io::{Seek, SeekFrom}; use std::os::unix::io::{AsRawFd, RawFd}; +use libc::{FALLOC_FL_KEEP_SIZE, FALLOC_FL_PUNCH_HOLE, FALLOC_FL_ZERO_RANGE}; use log::warn; use vmm_sys_util::aio; use vmm_sys_util::eventfd::EventFd; @@ -189,8 +190,6 @@ impl AsyncIo for RawFileAsyncAio { // Linux AIO has no IOCB command for fallocate, so perform the operation // synchronously and signal completion via the completion list, matching // the pattern used by the sync backend (RawFileSync). - const FALLOC_FL_PUNCH_HOLE: i32 = 0x02; - const FALLOC_FL_KEEP_SIZE: i32 = 0x01; let mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; // SAFETY: FFI call with valid arguments @@ -216,8 +215,6 @@ impl AsyncIo for RawFileAsyncAio { // Linux AIO has no IOCB command for fallocate, so perform the operation // synchronously and signal completion via the completion list, matching // the pattern used by the sync backend (RawFileSync). 
- const FALLOC_FL_ZERO_RANGE: i32 = 0x10; - const FALLOC_FL_KEEP_SIZE: i32 = 0x01; let mode = FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE; // SAFETY: FFI call with valid arguments diff --git a/block/src/raw_sync.rs b/block/src/raw_sync.rs index 0e60a48a75..b9f89dde05 100644 --- a/block/src/raw_sync.rs +++ b/block/src/raw_sync.rs @@ -7,6 +7,7 @@ use std::fs::File; use std::io::{Seek, SeekFrom}; use std::os::unix::io::{AsRawFd, RawFd}; +use libc::{FALLOC_FL_KEEP_SIZE, FALLOC_FL_PUNCH_HOLE, FALLOC_FL_ZERO_RANGE}; use log::warn; use vmm_sys_util::eventfd::EventFd; @@ -161,8 +162,6 @@ impl AsyncIo for RawFileSync { } fn punch_hole(&mut self, offset: u64, length: u64, user_data: u64) -> AsyncIoResult<()> { - const FALLOC_FL_PUNCH_HOLE: i32 = 0x02; - const FALLOC_FL_KEEP_SIZE: i32 = 0x01; let mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; // SAFETY: FFI call with valid arguments @@ -185,8 +184,6 @@ impl AsyncIo for RawFileSync { } fn write_zeroes(&mut self, offset: u64, length: u64, user_data: u64) -> AsyncIoResult<()> { - const FALLOC_FL_ZERO_RANGE: i32 = 0x10; - const FALLOC_FL_KEEP_SIZE: i32 = 0x01; let mode = FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE; // SAFETY: FFI call with valid arguments From faebd57db88539feef8dbb037956e2135cea7cff Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Mon, 16 Mar 2026 10:51:05 +0100 Subject: [PATCH 158/742] docs: Add git commit hygiene guidelines to CONTRIBUTING.md Write down our policy for git commit hygiene, especially when it comes to the history, i.e., multiple git commits in a PR. TL;DR: Commits must be revieable units guiding reviewers how the developer got from A to B. 
Signed-off-by: Philipp Schuster On-behalf-of: SAP philipp.schuster@sap.com --- CONTRIBUTING.md | 53 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 13 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ac65550c05..1518d0f1fb 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -61,22 +61,31 @@ commit you make. In order to get a clear contribution chain of trust we use the [signed-off-by language](https://www.kernel.org/doc/Documentation/process/submitting-patches.rst) used by the Linux kernel project. -## Patch format +## Patch format & Git Commit Hygiene -Beside the signed-off-by footer, we expect each patch to comply with the following format: +_We use **Patch** as synonym for **Commit**._ -``` -: Change summary +We require patches to: -More detailed explanation of your changes: Why and how. -Wrap it to 72 characters. -See http://chris.beams.io/posts/git-commit/ -for some more good pieces of advice. +- Have a `Signed-off-by: Name ` footer +- Follow the pattern: \ + ``` + : Change summary + + More detailed explanation of your changes: Why and how. + Wrap it to 72 characters. + See http://chris.beams.io/posts/git-commit/ + for some more good pieces of advice. + + Signed-off-by: + ``` + -Signed-off-by: -``` +Valid components are listed in `TitleStartsWithComponent.py`. In short, each +cargo workspace member is a valid component as well as `build`, `ci`, `docs` and +`misc`. -For example: +Example patch: ``` vm-virtio: Reset underlying device on driver request @@ -94,6 +103,20 @@ configure it anyway. Signed-off-by: Rob Bradford ``` +### Git Commit History + +We value a clean, **reviewable** commit history. Each commit should represent +a self-contained, logical step that guides reviewers clearly from A to B. + +Avoid patterns like `init A -> init B -> fix A` or \ +`init design A -> revert A -> use design B`. 
Commits must be independently +reviewable - don't leave "fix previous commit" or earlier design attempts in +the history. + +Intermediate work-in-progress changes are acceptable only if a subsequent +commit in the same series cleans them up (e.g. a temporary `#[allow(unused)]` +removed in the next commit). + ## Pull requests Cloud Hypervisor uses the “fork-and-pull” development model. Follow these steps if @@ -104,10 +127,14 @@ you want to merge your changes to `cloud-hypervisor`: 1. Within your fork, create a branch for your contribution. 1. [Create a pull request](https://help.github.com/articles/creating-a-pull-request-from-a-fork/) against the main branch of the Cloud Hypervisor repository. -1. To update your pull request amend existing commits whenever applicable and - then push the new changes to your pull request branch. +1. Each commit must comply with the Commit Hygiene guidelines above. +1. A pull request should address a single component or concern to keep review + focused and approvals straightforward. 1. Once the pull request is approved it can be integrated. +Please squash any changes done during review already into the corresponding +commits instead of pushing `: addressing review for A`-style commits. + ## Issue tracking If you have a problem, please let us know. 
We recommend using From a97348d24e05a8dcea018bae00f85a8a42e6be0d Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Sun, 15 Mar 2026 02:38:50 -0700 Subject: [PATCH 159/742] net_util: Fix clippy errors related to use of String error: this argument is passed by value, but not consumed in the function body --> net_util/src/tap.rs:685:17 | 685 | ifname: String, | ^^^^^^ help: consider changing the type to: `&str` | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#needless_pass_by_value = note: requested on the command line with `-D clippy::needless-pass-by-value` Signed-off-by: Rob Bradford --- net_util/src/tap.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net_util/src/tap.rs b/net_util/src/tap.rs index 36f0e2ba33..1622add3a6 100644 --- a/net_util/src/tap.rs +++ b/net_util/src/tap.rs @@ -663,7 +663,7 @@ mod unit_tests { } // Sends a test packet on the interface named "ifname". - fn pnet_send_packet(ifname: String) { + fn pnet_send_packet(ifname: &str) { let payload = DATA_STRING.as_bytes(); // eth hdr + ip hdr + udp hdr + payload len @@ -682,7 +682,7 @@ mod unit_tests { // interface, an object that can be used to send Ethernet frames, and a receiver of // Ethernet frames arriving at the specified interface. fn pnet_get_mac_tx_rx( - ifname: String, + ifname: &str, ) -> (MacAddr, Box, Box) { let interface_name_matches = |iface: &NetworkInterface| iface.name == ifname; @@ -778,7 +778,7 @@ mod unit_tests { tap.enable().unwrap(); // Send a packet to the interface. We expect to be able to receive it on the associated fd. 
- pnet_send_packet(tap.if_name_as_str().to_owned()); + pnet_send_packet(tap.if_name_as_str()); let mut buf = [0u8; 4096]; @@ -836,7 +836,7 @@ mod unit_tests { tap.set_ip_addr(ip_addr, Some(netmask)).unwrap(); tap.enable().unwrap(); - let (mac, _, mut rx) = pnet_get_mac_tx_rx(tap.if_name_as_str().to_owned()); + let (mac, _, mut rx) = pnet_get_mac_tx_rx(tap.if_name_as_str()); let payload = DATA_STRING.as_bytes(); From 5b23f0a154f9adc5d71f9a9eb2802ee24a230ecf Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Sun, 15 Mar 2026 02:43:57 -0700 Subject: [PATCH 160/742] tests: Fix clippy issues related to format string use error: variables can be used directly in the `format!` string --> cloud-hypervisor/tests/integration.rs:12770:27 | 12770 | let driver_path = format!("{}/driver", NVIDIA_VFIO_DEVICE); | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#uninlined_format_args help: change this to | 12770 - let driver_path = format!("{}/driver", NVIDIA_VFIO_DEVICE); 12770 + let driver_path = format!("{NVIDIA_VFIO_DEVICE}/driver"); Signed-off-by: Rob Bradford --- cloud-hypervisor/tests/integration.rs | 142 +++++++++----------------- 1 file changed, 51 insertions(+), 91 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index f0d5f6f252..b8ab87e652 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -4599,15 +4599,14 @@ mod common_parallel { let output = child.wait_with_output().unwrap(); let stderr = String::from_utf8_lossy(&output.stderr); panic!( - "VM should not have exited when opening corrupt image as readonly. Exit status: {}, stderr: {}", - status, stderr + "VM should not have exited when opening corrupt image as readonly. 
Exit status: {status}, stderr: {stderr}" ); } Ok(None) => { // VM is still running as expected } Err(e) => { - panic!("Error checking process status: {}", e); + panic!("Error checking process status: {e}"); } } @@ -4617,8 +4616,7 @@ mod common_parallel { let stderr = String::from_utf8_lossy(&output.stderr); assert!( stderr.contains("QCOW2 image is marked corrupt, opening read-only"), - "Expected warning about corrupt image being opened read-only. stderr: {}", - stderr + "Expected warning about corrupt image being opened read-only. stderr: {stderr}" ); assert_eq!( @@ -6764,9 +6762,9 @@ mod common_parallel { // Create a disk image that we can write to assert!( - exec_host_command_output(&format!( - "sudo dd if=/dev/zero of=/tmp/resize.img bs=1M count=16" - )) + exec_host_command_output( + &"sudo dd if=/dev/zero of=/tmp/resize.img bs=1M count=16".to_string() + ) .status .success() ); @@ -7416,7 +7414,7 @@ mod common_parallel { // Helper function to verify sparse file fn verify_sparse_file(test_disk_path: &str, expected_ratio: f64) { - let res = exec_host_command_output(&format!("ls -s --block-size=1 {}", test_disk_path)); + let res = exec_host_command_output(&format!("ls -s --block-size=1 {test_disk_path}")); assert!(res.status.success(), "ls -s command failed"); let out = String::from_utf8_lossy(&res.stdout); let actual_bytes: u64 = out @@ -7425,7 +7423,7 @@ mod common_parallel { .and_then(|s| s.parse().ok()) .expect("Failed to parse ls -s output"); - let res = exec_host_command_output(&format!("ls -l {}", test_disk_path)); + let res = exec_host_command_output(&format!("ls -l {test_disk_path}")); assert!(res.status.success(), "ls -l command failed"); let out = String::from_utf8_lossy(&res.stdout); let apparent_size: u64 = out @@ -7437,17 +7435,14 @@ mod common_parallel { let threshold = (apparent_size as f64 * expected_ratio) as u64; assert!( actual_bytes < threshold, - "Expected file to be sparse: apparent_size={} bytes, actual_disk_usage={} bytes (threshold={})", - 
apparent_size, - actual_bytes, - threshold + "Expected file to be sparse: apparent_size={apparent_size} bytes, actual_disk_usage={actual_bytes} bytes (threshold={threshold})" ); } // Helper function to count zero flagged regions in QCOW2 image fn count_qcow2_zero_regions(test_disk_path: &str) -> Option { let res = - exec_host_command_output(&format!("qemu-img map --output=json -U {}", test_disk_path)); + exec_host_command_output(&format!("qemu-img map --output=json -U {test_disk_path}")); if !res.status.success() { return None; } @@ -7476,8 +7471,7 @@ mod common_parallel { // - RAW: Verify actual holes (unallocated extents) exist in sparse regions // - Could parse extent output to count holes vs allocated regions fn verify_fiemap_extents(test_disk_path: &str, format_type: &str) { - let blocksize_output = - exec_host_command_output(&format!("stat -f -c %S {}", test_disk_path)); + let blocksize_output = exec_host_command_output(&format!("stat -f -c %S {test_disk_path}")); let blocksize = if blocksize_output.status.success() { String::from_utf8_lossy(&blocksize_output.stdout) .trim() @@ -7488,7 +7482,7 @@ mod common_parallel { }; let fiemap_output = - exec_host_command_output(&format!("filefrag -b {} -v {}", blocksize, test_disk_path)); + exec_host_command_output(&format!("filefrag -b {blocksize} -v {test_disk_path}")); if fiemap_output.status.success() { let fiemap_str = String::from_utf8_lossy(&fiemap_output.stdout); @@ -7498,8 +7492,7 @@ mod common_parallel { assert!( has_extents || has_holes, - "FIEMAP should show extent information or holes for {} file", - format_type + "FIEMAP should show extent information or holes for {format_type} file" ); } } @@ -7508,8 +7501,7 @@ mod common_parallel { fn assert_guest_disk_region_is_zero(guest: &Guest, device: &str, offset: u64, length: u64) { let result = guest .ssh_command(&format!( - "sudo hexdump -v -s {} -n {} -e '1/1 \"%02x\"' {} | grep -qv '^00*$' && echo 'NONZERO' || echo 'ZEROS'", - offset, length, device + "sudo 
hexdump -v -s {offset} -n {length} -e '1/1 \"%02x\"' {device} | grep -qv '^00*$' && echo 'NONZERO' || echo 'ZEROS'" )) .unwrap(); @@ -7560,7 +7552,7 @@ mod common_parallel { .as_path() .join(format!("discard_test.{}", format_name.to_lowercase())); - let mut cmd = format!("qemu-img create -f {} ", qemu_img_format); + let mut cmd = format!("qemu-img create -f {qemu_img_format} "); if !extra_create_args.is_empty() { cmd.push_str(&extra_create_args.join(" ")); cmd.push(' '); @@ -7570,8 +7562,7 @@ mod common_parallel { let res = exec_host_command_output(&cmd); assert!( res.status.success(), - "Failed to create {} test image", - format_name + "Failed to create {format_name} test image" ); let mut child = GuestCommand::new(&guest) @@ -7641,8 +7632,7 @@ mod common_parallel { // Write one 4MB block at offset 1MB guest .ssh_command(&format!( - "sudo dd if=/dev/zero of=/dev/vdc bs=1M count={} seek={} oflag=direct", - WRITE_SIZE_MB, WRITE_OFFSET_MB + "sudo dd if=/dev/zero of=/dev/vdc bs=1M count={WRITE_SIZE_MB} seek={WRITE_OFFSET_MB} oflag=direct" )) .unwrap(); guest.ssh_command("sync").unwrap(); @@ -7668,19 +7658,14 @@ mod common_parallel { for (i, (offset, length)) in discard_operations.iter().enumerate() { let result = guest .ssh_command(&format!( - "sudo blkdiscard -v -o {} -l {} /dev/vdc 2>&1 || true", - offset, length + "sudo blkdiscard -v -o {offset} -l {length} /dev/vdc 2>&1 || true" )) .unwrap(); assert!( !result.contains("Operation not supported") && !result.contains("BLKDISCARD"), - "blkdiscard #{} at offset {} length {} failed: {}", - i, - offset, - length, - result + "blkdiscard #{i} at offset {offset} length {length} failed: {result}" ); } @@ -7746,8 +7731,7 @@ mod common_parallel { let all_zeros = buffer.iter().all(|&b| b == 0); assert!( all_zeros, - "Expected discarded region at offset {} length {} to contain all zeros", - offset, length + "Expected discarded region at offset {offset} length {length} to contain all zeros" ); } @@ -8112,7 +8096,7 @@ mod 
common_parallel { .as_path() .join(format!("fstrim_test.{}", format_name.to_lowercase())); - let mut cmd = format!("qemu-img create -f {} ", qemu_img_format); + let mut cmd = format!("qemu-img create -f {qemu_img_format} "); if !extra_create_args.is_empty() { cmd.push_str(&extra_create_args.join(" ")); cmd.push(' '); @@ -8122,8 +8106,7 @@ mod common_parallel { let res = exec_host_command_output(&cmd); assert!( res.status.success(), - "Failed to create {} test image", - format_name + "Failed to create {format_name} test image" ); const WRITE_SIZE_MB: u64 = 4; @@ -8187,8 +8170,7 @@ mod common_parallel { for (iteration, &write_size_kb) in BLOCK_DISCARD_TEST_SIZES_KB.iter().enumerate() { guest .ssh_command(&format!( - "sudo dd if=/dev/zero of=/mnt/test/testfile{} bs=1K count={}", - iteration, write_size_kb + "sudo dd if=/dev/zero of=/mnt/test/testfile{iteration} bs=1K count={write_size_kb}" )) .unwrap(); @@ -8200,20 +8182,19 @@ mod common_parallel { "ls -s --block-size=1 {}", test_disk_path.to_str().unwrap() )); - if res.status.success() { - if let Some(size) = String::from_utf8_lossy(&res.stdout) + if res.status.success() + && let Some(size) = String::from_utf8_lossy(&res.stdout) .split_whitespace() .next() .and_then(|s| s.parse::().ok()) - { - max_size_during_writes.set(max_size_during_writes.get().max(size)); - } + { + max_size_during_writes.set(max_size_during_writes.get().max(size)); } } // Make blocks available for discard guest - .ssh_command(&format!("sudo rm /mnt/test/testfile{}", iteration)) + .ssh_command(&format!("sudo rm /mnt/test/testfile{iteration}")) .unwrap(); guest.ssh_command("sync").unwrap(); @@ -8224,10 +8205,7 @@ mod common_parallel { // Would output like "/mnt/test: X bytes (Y MB) trimmed" assert!( fstrim_result.contains("trimmed") || fstrim_result.contains("bytes"), - "fstrim iteration {} ({}KB) should report trimmed bytes: {}", - iteration, - write_size_kb, - fstrim_result + "fstrim iteration {iteration} ({write_size_kb}KB) should report 
trimmed bytes: {fstrim_result}" ); } else { // For unsupported formats, expect fstrim to fail @@ -8322,13 +8300,11 @@ mod common_parallel { let test_disk_path = guest.tmp_dir.as_path().join("sparse_off_test.raw"); let test_disk_path = test_disk_path.to_str().unwrap(); - let res = exec_host_command_output(&format!( - "truncate -s {} {}", - TEST_DISK_SIZE, test_disk_path - )); + let res = + exec_host_command_output(&format!("truncate -s {TEST_DISK_SIZE} {test_disk_path}")); assert!(res.status.success(), "Failed to create sparse test file"); - let res = exec_host_command_output(&format!("ls -s --block-size=1 {}", test_disk_path)); + let res = exec_host_command_output(&format!("ls -s --block-size=1 {test_disk_path}")); assert!(res.status.success()); let initial_bytes: u64 = String::from_utf8_lossy(&res.stdout) .split_whitespace() @@ -8337,8 +8313,7 @@ mod common_parallel { .expect("Failed to parse initial disk usage"); assert!( initial_bytes < INITIAL_ALLOCATION_THRESHOLD, - "File should be initially sparse: {} bytes allocated", - initial_bytes + "File should be initially sparse: {initial_bytes} bytes allocated" ); let mut child = GuestCommand::new(&guest) @@ -8358,7 +8333,7 @@ mod common_parallel { guest.disk_config.disk(DiskType::CloudInit).unwrap() ) .as_str(), - format!("path={},sparse=off", test_disk_path).as_str(), + format!("path={test_disk_path},sparse=off").as_str(), ]) .default_net() .capture_output() @@ -8388,7 +8363,7 @@ mod common_parallel { // - physical >= logical is fully allocated, modulo block alignment // - physical < logical is still sparse - let res = exec_host_command_output(&format!("ls -l {}", test_disk_path)); + let res = exec_host_command_output(&format!("ls -l {test_disk_path}")); assert!(res.status.success()); let logical_size: u64 = String::from_utf8_lossy(&res.stdout) .split_whitespace() @@ -8396,7 +8371,7 @@ mod common_parallel { .and_then(|s| s.parse().ok()) .expect("Failed to parse logical size"); - let res = 
exec_host_command_output(&format!("ls -s --block-size=1 {}", test_disk_path)); + let res = exec_host_command_output(&format!("ls -s --block-size=1 {test_disk_path}")); assert!(res.status.success()); let physical_size: u64 = String::from_utf8_lossy(&res.stdout) .split_whitespace() @@ -8406,33 +8381,26 @@ mod common_parallel { assert_eq!( logical_size, TEST_DISK_SIZE_BYTES, - "Logical size should be exactly {} bytes, got {}", - TEST_DISK_SIZE_BYTES, logical_size + "Logical size should be exactly {TEST_DISK_SIZE_BYTES} bytes, got {logical_size}" ); - let res = exec_host_command_output(&format!("stat -c '%o' {}", test_disk_path)); + let res = exec_host_command_output(&format!("stat -c '%o' {test_disk_path}")); assert!(res.status.success()); let block_size: u64 = String::from_utf8_lossy(&res.stdout) .trim() .parse() .expect("Failed to parse block size from stat"); - let expected_max = ((logical_size + block_size - 1) / block_size) * block_size; + let expected_max = logical_size.div_ceil(block_size) * block_size; assert!( physical_size >= logical_size, - "File should be fully allocated with sparse=off: logical={} bytes, physical={} bytes (physical < logical means still sparse)", - logical_size, - physical_size + "File should be fully allocated with sparse=off: logical={logical_size} bytes, physical={physical_size} bytes (physical < logical means still sparse)" ); assert!( physical_size <= expected_max, - "Physical size seems too large: logical={} bytes, physical={} bytes, expected_max={} bytes (block_size={})", - logical_size, - physical_size, - expected_max, - block_size + "Physical size seems too large: logical={logical_size} bytes, physical={physical_size} bytes, expected_max={expected_max} bytes (block_size={block_size})" ); } @@ -8449,8 +8417,7 @@ mod common_parallel { let test_disk_path = test_disk_path.to_str().unwrap(); let res = exec_host_command_output(&format!( - "qemu-img create -f qcow2 {} {}", - test_disk_path, TEST_DISK_SIZE + "qemu-img create -f qcow2 
{test_disk_path} {TEST_DISK_SIZE}" )); assert!(res.status.success(), "Failed to create QCOW2 test image"); @@ -8474,7 +8441,7 @@ mod common_parallel { guest.disk_config.disk(DiskType::CloudInit).unwrap() ) .as_str(), - format!("path={},sparse=off,num_queues=4", test_disk_path).as_str(), + format!("path={test_disk_path},sparse=off,num_queues=4").as_str(), ]) .default_net() .capture_output() @@ -8496,11 +8463,10 @@ mod common_parallel { let mut current_offset_kb = 1024; - for (_iteration, &size_kb) in BLOCK_DISCARD_TEST_SIZES_KB.iter().enumerate() { + for &size_kb in BLOCK_DISCARD_TEST_SIZES_KB.iter() { guest .ssh_command(&format!( - "sudo dd if=/dev/urandom of=/dev/vdc bs=1K count={} seek={} oflag=direct", - size_kb, current_offset_kb + "sudo dd if=/dev/urandom of=/dev/vdc bs=1K count={size_kb} seek={current_offset_kb} oflag=direct" )) .unwrap(); @@ -8538,9 +8504,7 @@ mod common_parallel { assert!( zero_regions_after > zero_regions_before, - "Expected zero-flagged regions to increase with sparse=off: before={}, after={}", - zero_regions_before, - zero_regions_after + "Expected zero-flagged regions to increase with sparse=off: before={zero_regions_before}, after={zero_regions_after}" ); disk_check_consistency(&test_disk_path, None); @@ -12762,26 +12726,22 @@ mod vfio { fn test_nvidia_guest_numa_generic_initiator() { // Skip test if VFIO device is not available or not ready if !std::path::Path::new(NVIDIA_VFIO_DEVICE).exists() { - println!("SKIPPED: VFIO device {} not found", NVIDIA_VFIO_DEVICE); + println!("SKIPPED: VFIO device {NVIDIA_VFIO_DEVICE} not found"); return; } // Check if device is bound to vfio-pci driver - let driver_path = format!("{}/driver", NVIDIA_VFIO_DEVICE); + let driver_path = format!("{NVIDIA_VFIO_DEVICE}/driver"); if let Ok(driver) = std::fs::read_link(&driver_path) { let driver_name = driver.file_name().unwrap_or_default().to_string_lossy(); if driver_name != "vfio-pci" { println!( - "SKIPPED: VFIO device {} bound to {}, not vfio-pci", - 
NVIDIA_VFIO_DEVICE, driver_name + "SKIPPED: VFIO device {NVIDIA_VFIO_DEVICE} bound to {driver_name}, not vfio-pci" ); return; } } else { - println!( - "SKIPPED: VFIO device {} not bound to any driver", - NVIDIA_VFIO_DEVICE - ); + println!("SKIPPED: VFIO device {NVIDIA_VFIO_DEVICE} not bound to any driver"); return; } @@ -12802,7 +12762,7 @@ mod vfio { ]) .args([ "--device", - &format!("id=vfio0,path={},iommu=on", NVIDIA_VFIO_DEVICE), + &format!("id=vfio0,path={NVIDIA_VFIO_DEVICE},iommu=on"), ]) .args(["--kernel", fw_path(FwType::RustHypervisorFirmware).as_str()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) From c14fd5c5754b44bd95a217ecbe4ae8e5ef8d36c5 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Sun, 15 Mar 2026 02:47:18 -0700 Subject: [PATCH 161/742] tests: Fix clippy issue related to formatting error: consider adding a `;` to the last statement for consistent formatting --> cloud-hypervisor/tests/integration.rs:2516:9 | 2516 | _test_simple_launch(&guest) | ^^^^^^^^^^^^^^^^^^^^^^^^^^^ help: add a `;` here: `_test_simple_launch(&guest);` | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#semicolon_if_nothing_returned Signed-off-by: Rob Bradford --- cloud-hypervisor/tests/integration.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index b8ab87e652..53fcbc17c3 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -2513,7 +2513,7 @@ mod common_parallel { let disk_config = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); let mut guest = Guest::new(Box::new(disk_config)); guest.kernel_path = Some(fw_path(FwType::RustHypervisorFirmware)); - _test_simple_launch(&guest) + _test_simple_launch(&guest); } #[test] @@ -2522,7 +2522,7 @@ mod common_parallel { let disk_config = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); let mut guest = 
Guest::new(Box::new(disk_config)); guest.kernel_path = Some(fw_path(FwType::Ovmf)); - _test_simple_launch(&guest) + _test_simple_launch(&guest); } #[test] @@ -14787,7 +14787,7 @@ mod common_cvm { let guest = GuestFactory::new_confidential_guest_factory().create_guest(Box::new(disk_config)); - _test_simple_launch(&guest) + _test_simple_launch(&guest); } #[test] From b92ab6e4b10e92408f97fd557931e02f97747cfc Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Sun, 15 Mar 2026 02:48:05 -0700 Subject: [PATCH 162/742] tests: Fix clippy issue related to unnecessary closure error: unnecessary closure used with `bool::then` --> cloud-hypervisor/tests/integration.rs:3488:9 | 3488 | output.status.success().then(|| ())?; | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#unnecessary_lazy_evaluations = note: `-D clippy::unnecessary-lazy-evaluations` implied by `-D clippy::all` = help: to override `-D clippy::all` add `#[allow(clippy::unnecessary_lazy_evaluations)]` help: use `then_some` instead | 3488 - output.status.success().then(|| ())?; 3488 + output.status.success().then_some(())?; | Signed-off-by: Rob Bradford --- cloud-hypervisor/tests/integration.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 53fcbc17c3..25dde15c03 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -3485,7 +3485,7 @@ mod common_parallel { fn get_image_info(path: &std::path::Path) -> Option { let output = run_qemu_img(path, &["info", "-U", "--output=json"], None); - output.status.success().then(|| ())?; + output.status.success().then_some(())?; serde_json::from_slice(&output.stdout).ok() } From 41a2a8ea6278a4c24b30f766476bb478e618c638 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Sun, 15 Mar 2026 02:49:43 -0700 Subject: [PATCH 163/742] tests: Fix clippy issue related 
to unnecessary enumerate() error: you seem to use `.enumerate()` and immediately discard the index --> cloud-hypervisor/tests/integration.rs:7675:72 | 7675 | for (_i, (offset, length)) in discard_operations.iter().enumerate() { | ^^^^^^^^^^^^ | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#unused_enumerate_index = note: `-D clippy::unused-enumerate-index` implied by `-D clippy::all` = help: to override `-D clippy::all` add `#[allow(clippy::unused_enumerate_index)]` help: remove the `.enumerate()` call | 7675 - for (_i, (offset, length)) in discard_operations.iter().enumerate() { 7675 + for (offset, length) in discard_operations.iter() { | Signed-off-by: Rob Bradford --- cloud-hypervisor/tests/integration.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 25dde15c03..f7345bf8d9 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -7673,7 +7673,7 @@ mod common_parallel { guest.ssh_command("sync").unwrap(); // Verify VM sees zeros in discarded regions - for (_i, (offset, length)) in discard_operations.iter().enumerate() { + for (offset, length) in discard_operations.iter() { assert_guest_disk_region_is_zero(&guest, "/dev/vdc", *offset, *length); } From 93d8896042a794fd63f11819ecb665c725755c52 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Sun, 15 Mar 2026 02:50:13 -0700 Subject: [PATCH 164/742] tests: Fix clippy issue related to unnecessary move error: the borrowed expression implements the required traits --> cloud-hypervisor/tests/integration.rs:8510:32 | 8510 | disk_check_consistency(&test_disk_path, None); | ^^^^^^^^^^^^^^^ help: change this to: `test_disk_path` | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#needless_borrows_for_generic_args = note: `-D clippy::needless-borrows-for-generic-args` implied by `-D 
clippy::all` = help: to override `-D clippy::all` add `#[allow(clippy::needless_borrows_for_generic_args)]` error: could not compile `cloud-hypervisor` (test "integration") due to 7 previous errors Signed-off-by: Rob Bradford --- cloud-hypervisor/tests/integration.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index f7345bf8d9..730b5b4688 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -8507,7 +8507,7 @@ mod common_parallel { "Expected zero-flagged regions to increase with sparse=off: before={zero_regions_before}, after={zero_regions_after}" ); - disk_check_consistency(&test_disk_path, None); + disk_check_consistency(test_disk_path, None); } #[test] From 0a7e32c31217af05102e9b30b1da4453b5061389 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Sun, 15 Mar 2026 02:49:13 -0700 Subject: [PATCH 165/742] tests: Fix clippy issue related to unnecessary use of .to_string() error: unnecessary use of `to_string` --> cloud-hypervisor/tests/integration.rs:6765:38 | 6765 | exec_host_command_output(&"sudo dd if=/dev/zero of=/tmp/resize.img bs=1M count=16".to_string()) | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ help: use: `"sudo dd if=/dev/zero of=/tmp/resize.img bs=1M count=16"` | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#unnecessary_to_owned = note: `-D clippy::unnecessary-to-owned` implied by `-D clippy::all` = help: to override `-D clippy::all` add `#[allow(clippy::unnecessary_to_owned)]` Signed-off-by: Rob Bradford --- cloud-hypervisor/tests/integration.rs | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 730b5b4688..d48b18adbf 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -6762,11 +6762,9 
@@ mod common_parallel { // Create a disk image that we can write to assert!( - exec_host_command_output( - &"sudo dd if=/dev/zero of=/tmp/resize.img bs=1M count=16".to_string() - ) - .status - .success() + exec_host_command_output("sudo dd if=/dev/zero of=/tmp/resize.img bs=1M count=16") + .status + .success() ); let mut cmd = GuestCommand::new(&guest); From 14d8cf5f1e0a817e7194bf3f7bd89900ed6dfeb0 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Sun, 15 Mar 2026 03:01:27 -0700 Subject: [PATCH 166/742] tests: Fix clippy issue related to passing by value warning: this argument is passed by value, but not consumed in the function body --> cloud-hypervisor/tests/integration.rs:3785:51 | 3785 | fn run_multiqueue_qcow2_test(image_config: QcowTestImageConfig, test_fn: F) | ^^^^^^^^^^^^^^^^^^^ | help: or consider marking this type as `Copy` --> cloud-hypervisor/tests/integration.rs:3774:5 | 3774 | enum QcowTestImageConfig { | ^^^^^^^^^^^^^^^^^^^^^^^^ = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#needless_pass_by_value = note: requested on the command line with `-D clippy::needless-pass-by-value` Signed-off-by: Rob Bradford --- cloud-hypervisor/tests/integration.rs | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index d48b18adbf..8841b00aea 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -3782,7 +3782,7 @@ mod common_parallel { /// /// Creates a VM with multiple virtio queues on the test disk, then runs the /// provided test closure. Handles VM lifecycle and consistency checks. 
- fn run_multiqueue_qcow2_test(image_config: QcowTestImageConfig, test_fn: F) + fn run_multiqueue_qcow2_test(image_config: &QcowTestImageConfig, test_fn: F) where F: FnOnce(&Guest) + std::panic::UnwindSafe, { @@ -3793,7 +3793,7 @@ mod common_parallel { let test_image_path = guest.tmp_dir.as_path().join("test.qcow2"); // Create test image based on configuration and capture backing checksum if applicable - let initial_backing_checksum = match image_config { + let initial_backing_checksum = match *image_config { QcowTestImageConfig::Simple(size) => { Command::new("qemu-img") .arg("create") @@ -3876,7 +3876,7 @@ mod common_parallel { #[test] fn test_virtio_block_qcow2_multiqueue_writes() { - run_multiqueue_qcow2_test(QcowTestImageConfig::Simple("256M"), |guest| { + run_multiqueue_qcow2_test(&QcowTestImageConfig::Simple("256M"), |guest| { assert_eq!( guest .ssh_command("ls -ll /sys/block/vdc/mq | grep ^d | wc -l") @@ -3946,7 +3946,7 @@ mod common_parallel { #[test] fn test_virtio_block_qcow2_multiqueue_mixed_rw() { - run_multiqueue_qcow2_test(QcowTestImageConfig::Simple("512M"), |guest| { + run_multiqueue_qcow2_test(&QcowTestImageConfig::Simple("512M"), |guest| { guest .ssh_command("sudo mkfs.ext4 -F /dev/vdc") .expect("Failed to format disk"); @@ -3997,7 +3997,7 @@ mod common_parallel { #[test] fn test_virtio_block_qcow2_multiqueue_backing() { - run_multiqueue_qcow2_test(QcowTestImageConfig::WithBacking, |guest| { + run_multiqueue_qcow2_test(&QcowTestImageConfig::WithBacking, |guest| { guest .ssh_command("sudo mkfs.ext4 -F /dev/vdc") .expect("Failed to format disk"); @@ -4041,7 +4041,7 @@ mod common_parallel { #[test] fn test_virtio_block_qcow2_multiqueue_random_4k() { - run_multiqueue_qcow2_test(QcowTestImageConfig::Simple("256M"), |guest| { + run_multiqueue_qcow2_test(&QcowTestImageConfig::Simple("256M"), |guest| { guest .ssh_command( "for i in $(seq 1 8); do \ @@ -4071,7 +4071,7 @@ mod common_parallel { #[test] fn test_virtio_block_qcow2_multiqueue_fsync() { - 
run_multiqueue_qcow2_test(QcowTestImageConfig::Simple("256M"), |guest| { + run_multiqueue_qcow2_test(&QcowTestImageConfig::Simple("256M"), |guest| { guest .ssh_command("sudo mkfs.ext4 -F /dev/vdc") .expect("Failed to format disk"); @@ -4118,7 +4118,7 @@ mod common_parallel { #[test] fn test_virtio_block_qcow2_multiqueue_metadata() { - run_multiqueue_qcow2_test(QcowTestImageConfig::Simple("256M"), |guest| { + run_multiqueue_qcow2_test(&QcowTestImageConfig::Simple("256M"), |guest| { guest .ssh_command("sudo mkfs.ext4 -F /dev/vdc") .expect("Failed to format disk"); @@ -4197,7 +4197,7 @@ mod common_parallel { #[test] fn test_virtio_block_qcow2_multiqueue_discard_mount() { - run_multiqueue_qcow2_test(QcowTestImageConfig::Simple("256M"), |guest| { + run_multiqueue_qcow2_test(&QcowTestImageConfig::Simple("256M"), |guest| { guest .ssh_command("sudo mkfs.ext4 -F /dev/vdc") .expect("Failed to format disk"); @@ -4260,7 +4260,7 @@ mod common_parallel { } #[test] fn test_virtio_block_qcow2_multiqueue_wide_writes() { - run_multiqueue_qcow2_test(QcowTestImageConfig::Simple("1G"), |guest| { + run_multiqueue_qcow2_test(&QcowTestImageConfig::Simple("1G"), |guest| { // Scattered write pattern - write to widely separated offsets in parallel. // This should initiate many L2 table allocations simultaneously across different queues. guest @@ -4300,7 +4300,7 @@ mod common_parallel { #[test] fn test_virtio_block_qcow2_multiqueue_discard_stress() { - run_multiqueue_qcow2_test(QcowTestImageConfig::Simple("512M"), |guest| { + run_multiqueue_qcow2_test(&QcowTestImageConfig::Simple("512M"), |guest| { guest .ssh_command("sudo mkfs.ext4 -F /dev/vdc") .expect("Failed to format disk"); From 0da63a4507995ddf73f261e54837e970b5df4da1 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Mon, 16 Mar 2026 02:53:39 -0700 Subject: [PATCH 167/742] tests: Ensure clippy --tests runs on integration.rs Include the file when running clippy as well as when building with devcli_testenv set. 
Fixes: #7846 Signed-off-by: Rob Bradford --- cloud-hypervisor/tests/integration.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 8841b00aea..fb1049100a 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 // -#![cfg(devcli_testenv)] +#![cfg(any(devcli_testenv, clippy))] #![allow(clippy::undocumented_unsafe_blocks)] // When enabling the `mshv` feature, we skip quite some tests and // hence have known dead-code. This annotation silences dead-code From f1875668e0f1cc2cda99ca54f8bb6c488ea65bdc Mon Sep 17 00:00:00 2001 From: Souradeep Date: Thu, 12 Mar 2026 11:59:03 +0000 Subject: [PATCH 168/742] scripts: Rename sha1sums-aarch64 to sha1sums-aarch64-common Rename the aarch64 sha1sums file to sha1sums-aarch64-common to follow the same naming convention as sha1sums-x86_64-common. This allows run_metrics.sh to use the generic sha1sums-${TEST_ARCH}-common pattern for all architectures, removing the need for aarch64-specific conditionals. Update run_integration_tests_aarch64.sh to reference the renamed file. 
Signed-off-by: Souradeep --- scripts/run_integration_tests_aarch64.sh | 4 ++-- scripts/{sha1sums-aarch64 => sha1sums-aarch64-common} | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename scripts/{sha1sums-aarch64 => sha1sums-aarch64-common} (100%) diff --git a/scripts/run_integration_tests_aarch64.sh b/scripts/run_integration_tests_aarch64.sh index 68500028bc..2489d7892a 100755 --- a/scripts/run_integration_tests_aarch64.sh +++ b/scripts/run_integration_tests_aarch64.sh @@ -26,7 +26,7 @@ build_virtiofsd() { } update_workloads() { - cp scripts/sha1sums-aarch64 "$WORKLOADS_DIR" + cp scripts/sha1sums-aarch64-common "$WORKLOADS_DIR" FOCAL_OS_RAW_IMAGE_NAME="focal-server-cloudimg-arm64-custom-20210929-0.raw" FOCAL_OS_RAW_IMAGE_DOWNLOAD_URL="https://ch-images.azureedge.net/$FOCAL_OS_RAW_IMAGE_NAME" @@ -138,7 +138,7 @@ update_workloads() { pushd "$WORKLOADS_DIR" || exit - if ! sha1sum sha1sums-aarch64 --check; then + if ! sha1sum sha1sums-aarch64-common --check; then echo "sha1sum validation of images failed, remove invalid images to fix the issue." exit 1 fi diff --git a/scripts/sha1sums-aarch64 b/scripts/sha1sums-aarch64-common similarity index 100% rename from scripts/sha1sums-aarch64 rename to scripts/sha1sums-aarch64-common From 1539b195d12c23d93ddd9f8ed4c347da14745a8b Mon Sep 17 00:00:00 2001 From: Alyssa Ross Date: Mon, 16 Mar 2026 17:44:25 +0100 Subject: [PATCH 169/742] virtio-devices: remove incorrect comment This device is not called virtio-vhost-user; that's something else. I don't think the comment really clarifies anything anyway, so just remove it. 
Fixes: 8c618ff5e ("virtio-devices: generic-vhost-user: implement device") Signed-off-by: Alyssa Ross --- virtio-devices/src/vhost_user/generic_vhost_user.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtio-devices/src/vhost_user/generic_vhost_user.rs b/virtio-devices/src/vhost_user/generic_vhost_user.rs index b8af44d75d..b90c6c079d 100644 --- a/virtio-devices/src/vhost_user/generic_vhost_user.rs +++ b/virtio-devices/src/vhost_user/generic_vhost_user.rs @@ -121,7 +121,7 @@ since the backend only supports {backend_num_queues}\n", ); return Err(Error::BadQueueNum); } - // Create virtio-vhost-user device configuration. + ( acked_features, // If part of the available features that have been acked, the From a7fefb63dd58687afcefe066e00baade1ef5db55 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Mon, 16 Mar 2026 17:02:39 +0100 Subject: [PATCH 170/742] virtio-devices: block: Populate discard and write zeroes config When VIRTIO_BLK_F_DISCARD or VIRTIO_BLK_F_WRITE_ZEROES features are advertised, the virtio spec v1.2, sections 5.2.4 and 5.2.6.1, requires the corresponding VirtioBlockConfig fields to contain valid, non zero values. Leaving them at zero causes strictly behaved drivers to either reject the features or crash. Populate max_discard_sectors, max_discard_seg, discard_sector_alignment, max_write_zeroes_sectors, max_write_zeroes_seg and write_zeroes_may_unmap after feature advertisement so drivers can safely negotiate these features. 
Fixes: #7849 Signed-off-by: Anatol Belski --- virtio-devices/src/block.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/virtio-devices/src/block.rs b/virtio-devices/src/block.rs index 1668565340..af6d638dd0 100644 --- a/virtio-devices/src/block.rs +++ b/virtio-devices/src/block.rs @@ -777,10 +777,12 @@ impl Block { // - Always advertise WRITE_ZEROES // - Advertise DISCARD only if sparse=true OR format supports marking // clusters as zero without deallocating + let mut discard_supported = false; if disk_image.supports_sparse_operations() { avail_features |= 1u64 << VIRTIO_BLK_F_WRITE_ZEROES; if sparse || disk_image.supports_zero_flag() { avail_features |= 1u64 << VIRTIO_BLK_F_DISCARD; + discard_supported = true; } } else if sparse { warn!("sparse=on requested but backend does not support sparse operations"); @@ -823,6 +825,17 @@ impl Block { ..Default::default() }; + if avail_features & (1u64 << VIRTIO_BLK_F_WRITE_ZEROES) != 0 { + config.max_write_zeroes_sectors = u32::MAX; + config.max_write_zeroes_seg = 1; + config.write_zeroes_may_unmap = if discard_supported { 1 } else { 0 }; + } + if avail_features & (1u64 << VIRTIO_BLK_F_DISCARD) != 0 { + config.max_discard_sectors = u32::MAX; + config.max_discard_seg = 1; + config.discard_sector_alignment = 1; + } + if num_queues > 1 { avail_features |= 1u64 << VIRTIO_BLK_F_MQ; config.num_queues = num_queues as u16; From 083892b2e22d2a79cb92656e22aa259854718940 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Mon, 16 Mar 2026 17:13:52 +0100 Subject: [PATCH 171/742] tests: windows: remove sparse=off workaround The config space fix in the previous commit correctly populates the discard and write zeroes fields, so the sparse=off workaround is no longer needed for Windows guests. Replace default_disks_sparse_off() with default_disks() in all Windows test cases and remove the explicit sparse=off from the multi queue test. 
Signed-off-by: Anatol Belski --- cloud-hypervisor/tests/integration.rs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index fb1049100a..7218cbb393 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -11844,7 +11844,7 @@ mod windows { .args(["--kernel", edk2_path().to_str().unwrap()]) .args(["--serial", "tty"]) .args(["--console", "off"]) - .default_disks_sparse_off() + .default_disks() .default_net() .capture_output() .spawn() @@ -11893,7 +11893,7 @@ mod windows { .args([ "--disk", format!( - "path={},num_queues=4,sparse=off", + "path={},num_queues=4", windows_guest .guest() .disk_config @@ -11961,7 +11961,7 @@ mod windows { .args(["--kernel", ovmf_path.to_str().unwrap()]) .args(["--serial", "tty"]) .args(["--console", "off"]) - .default_disks_sparse_off() + .default_disks() .default_net() .capture_output() .spawn() @@ -12050,7 +12050,7 @@ mod windows { .args(["--kernel", ovmf_path.to_str().unwrap()]) .args(["--serial", "tty"]) .args(["--console", "off"]) - .default_disks_sparse_off() + .default_disks() .default_net() .capture_output() .spawn() @@ -12125,7 +12125,7 @@ mod windows { .args(["--kernel", ovmf_path.to_str().unwrap()]) .args(["--serial", "tty"]) .args(["--console", "off"]) - .default_disks_sparse_off() + .default_disks() .default_net() .capture_output() .spawn() @@ -12199,7 +12199,7 @@ mod windows { .args(["--kernel", ovmf_path.to_str().unwrap()]) .args(["--serial", "tty"]) .args(["--console", "off"]) - .default_disks_sparse_off() + .default_disks() .default_net() .capture_output() .spawn() @@ -12273,7 +12273,7 @@ mod windows { .args(["--kernel", ovmf_path.to_str().unwrap()]) .args(["--serial", "tty"]) .args(["--console", "off"]) - .default_disks_sparse_off() + .default_disks() .default_net() .capture_output() .spawn() @@ -12369,7 +12369,7 @@ mod windows { .args(["--kernel", 
ovmf_path.to_str().unwrap()]) .args(["--serial", "tty"]) .args(["--console", "off"]) - .default_disks_sparse_off() + .default_disks() .default_net() .capture_output() .spawn() @@ -12501,7 +12501,7 @@ mod windows { .args(["--kernel", ovmf_path.to_str().unwrap()]) .args(["--serial", "tty"]) .args(["--console", "off"]) - .default_disks_sparse_off() + .default_disks() // The multi net dev config is borrowed from test_multiple_network_interfaces .args([ "--net", From ad3179fe11502a0c2fbb554b6b305c72f2f5bfaf Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 17 Mar 2026 00:07:55 +0000 Subject: [PATCH 172/742] build: Bump the non-rust-vmm group across 2 directories with 23 updates Bumps the non-rust-vmm group with 15 updates in the / directory: | Package | From | To | | --- | --- | --- | | [serde_with](https://github.com/jonasbb/serde_with) | `3.17.0` | `3.18.0` | | [clap](https://github.com/clap-rs/clap) | `4.5.60` | `4.6.0` | | [num_enum](https://github.com/illicitonion/num_enum) | `0.7.5` | `0.7.6` | | [gdbstub](https://github.com/daniel5151/gdbstub) | `0.7.9` | `0.7.10` | | [gdbstub_arch](https://github.com/daniel5151/gdbstub) | `0.3.2` | `0.3.3` | | [anstyle](https://github.com/rust-cli/anstyle) | `1.0.13` | `1.0.14` | | [cc](https://github.com/rust-lang/cc-rs) | `1.2.56` | `1.2.57` | | [clap_lex](https://github.com/clap-rs/clap) | `1.0.0` | `1.1.0` | | [colorchoice](https://github.com/rust-cli/anstyle) | `1.0.4` | `1.0.5` | | [libz-sys](https://github.com/rust-lang/libz-sys) | `1.1.24` | `1.1.25` | | [once_cell](https://github.com/matklad/once_cell) | `1.21.3` | `1.21.4` | | [openssl-sys](https://github.com/rust-openssl/rust-openssl) | `0.9.111` | `0.9.112` | | [portable-atomic-util](https://github.com/taiki-e/portable-atomic-util) | `0.2.5` | `0.2.6` | | [tempfile](https://github.com/Stebalien/tempfile) | `3.26.0` | `3.27.0` | | [uds_windows](https://github.com/haraldh/rust_uds_windows) | `1.2.0` | 
`1.2.1` | Bumps the non-rust-vmm group with 10 updates in the /fuzz directory: | Package | From | To | | --- | --- | --- | | [serde_with](https://github.com/jonasbb/serde_with) | `3.17.0` | `3.18.0` | | [clap](https://github.com/clap-rs/clap) | `4.5.60` | `4.6.0` | | [num_enum](https://github.com/illicitonion/num_enum) | `0.7.5` | `0.7.6` | | [gdbstub](https://github.com/daniel5151/gdbstub) | `0.7.9` | `0.7.10` | | [gdbstub_arch](https://github.com/daniel5151/gdbstub) | `0.3.2` | `0.3.3` | | [anstyle](https://github.com/rust-cli/anstyle) | `1.0.13` | `1.0.14` | | [cc](https://github.com/rust-lang/cc-rs) | `1.2.56` | `1.2.57` | | [clap_lex](https://github.com/clap-rs/clap) | `1.0.0` | `1.1.0` | | [colorchoice](https://github.com/rust-cli/anstyle) | `1.0.4` | `1.0.5` | | [once_cell](https://github.com/matklad/once_cell) | `1.21.3` | `1.21.4` | Updates `serde_with` from 3.17.0 to 3.18.0 - [Release notes](https://github.com/jonasbb/serde_with/releases) - [Commits](https://github.com/jonasbb/serde_with/compare/v3.17.0...v3.18.0) Updates `clap` from 4.5.60 to 4.6.0 - [Release notes](https://github.com/clap-rs/clap/releases) - [Changelog](https://github.com/clap-rs/clap/blob/master/CHANGELOG.md) - [Commits](https://github.com/clap-rs/clap/compare/clap_complete-v4.5.60...clap_complete-v4.6.0) Updates `num_enum` from 0.7.5 to 0.7.6 - [Commits](https://github.com/illicitonion/num_enum/compare/0.7.5...0.7.6) Updates `gdbstub` from 0.7.9 to 0.7.10 - [Release notes](https://github.com/daniel5151/gdbstub/releases) - [Changelog](https://github.com/daniel5151/gdbstub/blob/master/CHANGELOG.md) - [Commits](https://github.com/daniel5151/gdbstub/compare/0.7.9...0.7.10) Updates `gdbstub_arch` from 0.3.2 to 0.3.3 - [Release notes](https://github.com/daniel5151/gdbstub/releases) - [Changelog](https://github.com/daniel5151/gdbstub/blob/master/CHANGELOG.md) - [Commits](https://github.com/daniel5151/gdbstub/commits) Updates `anstyle` from 1.0.13 to 1.0.14 - 
[Commits](https://github.com/rust-cli/anstyle/compare/v1.0.13...v1.0.14) Updates `cc` from 1.2.56 to 1.2.57 - [Release notes](https://github.com/rust-lang/cc-rs/releases) - [Changelog](https://github.com/rust-lang/cc-rs/blob/main/CHANGELOG.md) - [Commits](https://github.com/rust-lang/cc-rs/compare/cc-v1.2.56...cc-v1.2.57) Updates `clap_builder` from 4.5.60 to 4.6.0 - [Release notes](https://github.com/clap-rs/clap/releases) - [Changelog](https://github.com/clap-rs/clap/blob/master/CHANGELOG.md) - [Commits](https://github.com/clap-rs/clap/compare/v4.5.60...v4.6.0) Updates `clap_lex` from 1.0.0 to 1.1.0 - [Release notes](https://github.com/clap-rs/clap/releases) - [Changelog](https://github.com/clap-rs/clap/blob/master/CHANGELOG.md) - [Commits](https://github.com/clap-rs/clap/compare/clap_lex-v1.0.0...clap_lex-v1.1.0) Updates `colorchoice` from 1.0.4 to 1.0.5 - [Commits](https://github.com/rust-cli/anstyle/compare/colorchoice-v1.0.4...colorchoice-v1.0.5) Updates `darling` from 0.21.3 to 0.23.0 - [Release notes](https://github.com/TedDriggs/darling/releases) - [Changelog](https://github.com/TedDriggs/darling/blob/master/CHANGELOG.md) - [Commits](https://github.com/TedDriggs/darling/compare/v0.21.3...v0.23.0) Updates `darling_core` from 0.21.3 to 0.23.0 - [Release notes](https://github.com/TedDriggs/darling/releases) - [Changelog](https://github.com/TedDriggs/darling/blob/master/CHANGELOG.md) - [Commits](https://github.com/TedDriggs/darling/compare/v0.21.3...v0.23.0) Updates `darling_macro` from 0.21.3 to 0.23.0 - [Release notes](https://github.com/TedDriggs/darling/releases) - [Changelog](https://github.com/TedDriggs/darling/blob/master/CHANGELOG.md) - [Commits](https://github.com/TedDriggs/darling/compare/v0.21.3...v0.23.0) Updates `libz-sys` from 1.1.24 to 1.1.25 - [Release notes](https://github.com/rust-lang/libz-sys/releases) - [Commits](https://github.com/rust-lang/libz-sys/compare/1.1.24...1.1.25) Updates `num_enum_derive` from 0.7.5 to 0.7.6 - 
[Commits](https://github.com/illicitonion/num_enum/compare/0.7.5...0.7.6) Updates `once_cell` from 1.21.3 to 1.21.4 - [Changelog](https://github.com/matklad/once_cell/blob/master/CHANGELOG.md) - [Commits](https://github.com/matklad/once_cell/compare/v1.21.3...v1.21.4) Updates `openssl-sys` from 0.9.111 to 0.9.112 - [Release notes](https://github.com/rust-openssl/rust-openssl/releases) - [Commits](https://github.com/rust-openssl/rust-openssl/compare/openssl-sys-v0.9.111...openssl-sys-v0.9.112) Updates `portable-atomic-util` from 0.2.5 to 0.2.6 - [Release notes](https://github.com/taiki-e/portable-atomic-util/releases) - [Changelog](https://github.com/taiki-e/portable-atomic-util/blob/main/CHANGELOG.md) - [Commits](https://github.com/taiki-e/portable-atomic-util/compare/v0.2.5...v0.2.6) Updates `serde_with_macros` from 3.17.0 to 3.18.0 - [Release notes](https://github.com/jonasbb/serde_with/releases) - [Commits](https://github.com/jonasbb/serde_with/compare/v3.17.0...v3.18.0) Updates `tempfile` from 3.26.0 to 3.27.0 - [Changelog](https://github.com/Stebalien/tempfile/blob/master/CHANGELOG.md) - [Commits](https://github.com/Stebalien/tempfile/compare/v3.26.0...v3.27.0) Updates `uds_windows` from 1.2.0 to 1.2.1 - [Release notes](https://github.com/haraldh/rust_uds_windows/releases) - [Changelog](https://github.com/haraldh/rust_uds_windows/blob/master/CHANGELOG.md) - [Commits](https://github.com/haraldh/rust_uds_windows/compare/v1.2.0...v1.2.1) Updates `serde_with` from 3.17.0 to 3.18.0 - [Release notes](https://github.com/jonasbb/serde_with/releases) - [Commits](https://github.com/jonasbb/serde_with/compare/v3.17.0...v3.18.0) Updates `clap` from 4.5.60 to 4.6.0 - [Release notes](https://github.com/clap-rs/clap/releases) - [Changelog](https://github.com/clap-rs/clap/blob/master/CHANGELOG.md) - [Commits](https://github.com/clap-rs/clap/compare/clap_complete-v4.5.60...clap_complete-v4.6.0) Updates `num_enum` from 0.7.5 to 0.7.6 - 
[Commits](https://github.com/illicitonion/num_enum/compare/0.7.5...0.7.6) Updates `gdbstub` from 0.7.9 to 0.7.10 - [Release notes](https://github.com/daniel5151/gdbstub/releases) - [Changelog](https://github.com/daniel5151/gdbstub/blob/master/CHANGELOG.md) - [Commits](https://github.com/daniel5151/gdbstub/compare/0.7.9...0.7.10) Updates `gdbstub_arch` from 0.3.2 to 0.3.3 - [Release notes](https://github.com/daniel5151/gdbstub/releases) - [Changelog](https://github.com/daniel5151/gdbstub/blob/master/CHANGELOG.md) - [Commits](https://github.com/daniel5151/gdbstub/commits) Updates `anstream` from 0.6.21 to 1.0.0 - [Commits](https://github.com/rust-cli/anstyle/compare/anstream-v0.6.21...anstream-v1.0.0) Updates `anstyle` from 1.0.13 to 1.0.14 - [Commits](https://github.com/rust-cli/anstyle/compare/v1.0.13...v1.0.14) Updates `anstyle-parse` from 0.2.7 to 1.0.0 - [Commits](https://github.com/rust-cli/anstyle/compare/anstyle-parse-v0.2.7...anstyle-parse-v1.0.0) Updates `cc` from 1.2.56 to 1.2.57 - [Release notes](https://github.com/rust-lang/cc-rs/releases) - [Changelog](https://github.com/rust-lang/cc-rs/blob/main/CHANGELOG.md) - [Commits](https://github.com/rust-lang/cc-rs/compare/cc-v1.2.56...cc-v1.2.57) Updates `clap_builder` from 4.5.60 to 4.6.0 - [Release notes](https://github.com/clap-rs/clap/releases) - [Changelog](https://github.com/clap-rs/clap/blob/master/CHANGELOG.md) - [Commits](https://github.com/clap-rs/clap/compare/v4.5.60...v4.6.0) Updates `clap_lex` from 1.0.0 to 1.1.0 - [Release notes](https://github.com/clap-rs/clap/releases) - [Changelog](https://github.com/clap-rs/clap/blob/master/CHANGELOG.md) - [Commits](https://github.com/clap-rs/clap/compare/clap_lex-v1.0.0...clap_lex-v1.1.0) Updates `colorchoice` from 1.0.4 to 1.0.5 - [Commits](https://github.com/rust-cli/anstyle/compare/colorchoice-v1.0.4...colorchoice-v1.0.5) Updates `darling` from 0.21.3 to 0.23.0 - [Release notes](https://github.com/TedDriggs/darling/releases) - 
[Changelog](https://github.com/TedDriggs/darling/blob/master/CHANGELOG.md) - [Commits](https://github.com/TedDriggs/darling/compare/v0.21.3...v0.23.0) Updates `darling_core` from 0.21.3 to 0.23.0 - [Release notes](https://github.com/TedDriggs/darling/releases) - [Changelog](https://github.com/TedDriggs/darling/blob/master/CHANGELOG.md) - [Commits](https://github.com/TedDriggs/darling/compare/v0.21.3...v0.23.0) Updates `darling_macro` from 0.21.3 to 0.23.0 - [Release notes](https://github.com/TedDriggs/darling/releases) - [Changelog](https://github.com/TedDriggs/darling/blob/master/CHANGELOG.md) - [Commits](https://github.com/TedDriggs/darling/compare/v0.21.3...v0.23.0) Updates `num_enum_derive` from 0.7.5 to 0.7.6 - [Commits](https://github.com/illicitonion/num_enum/compare/0.7.5...0.7.6) Updates `once_cell` from 1.21.3 to 1.21.4 - [Changelog](https://github.com/matklad/once_cell/blob/master/CHANGELOG.md) - [Commits](https://github.com/matklad/once_cell/compare/v1.21.3...v1.21.4) Updates `serde_with_macros` from 3.17.0 to 3.18.0 - [Release notes](https://github.com/jonasbb/serde_with/releases) - [Commits](https://github.com/jonasbb/serde_with/compare/v3.17.0...v3.18.0) --- updated-dependencies: - dependency-name: serde_with dependency-version: 3.18.0 dependency-type: direct:production update-type: version-update:semver-minor dependency-group: non-rust-vmm - dependency-name: clap dependency-version: 4.6.0 dependency-type: direct:production update-type: version-update:semver-minor dependency-group: non-rust-vmm - dependency-name: num_enum dependency-version: 0.7.6 dependency-type: direct:production update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: gdbstub dependency-version: 0.7.10 dependency-type: direct:production update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: gdbstub_arch dependency-version: 0.3.3 dependency-type: direct:production update-type: version-update:semver-patch 
dependency-group: non-rust-vmm - dependency-name: anstyle dependency-version: 1.0.14 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: cc dependency-version: 1.2.57 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: clap_builder dependency-version: 4.6.0 dependency-type: indirect update-type: version-update:semver-minor dependency-group: non-rust-vmm - dependency-name: clap_lex dependency-version: 1.1.0 dependency-type: indirect update-type: version-update:semver-minor dependency-group: non-rust-vmm - dependency-name: colorchoice dependency-version: 1.0.5 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: darling dependency-version: 0.23.0 dependency-type: indirect update-type: version-update:semver-minor dependency-group: non-rust-vmm - dependency-name: darling_core dependency-version: 0.23.0 dependency-type: indirect update-type: version-update:semver-minor dependency-group: non-rust-vmm - dependency-name: darling_macro dependency-version: 0.23.0 dependency-type: indirect update-type: version-update:semver-minor dependency-group: non-rust-vmm - dependency-name: libz-sys dependency-version: 1.1.25 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: num_enum_derive dependency-version: 0.7.6 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: once_cell dependency-version: 1.21.4 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: openssl-sys dependency-version: 0.9.112 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: portable-atomic-util dependency-version: 0.2.6 dependency-type: indirect update-type: 
version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: serde_with_macros dependency-version: 3.18.0 dependency-type: indirect update-type: version-update:semver-minor dependency-group: non-rust-vmm - dependency-name: tempfile dependency-version: 3.27.0 dependency-type: indirect update-type: version-update:semver-minor dependency-group: non-rust-vmm - dependency-name: uds_windows dependency-version: 1.2.1 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: serde_with dependency-version: 3.18.0 dependency-type: direct:production update-type: version-update:semver-minor dependency-group: non-rust-vmm - dependency-name: clap dependency-version: 4.6.0 dependency-type: direct:production update-type: version-update:semver-minor dependency-group: non-rust-vmm - dependency-name: num_enum dependency-version: 0.7.6 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: gdbstub dependency-version: 0.7.10 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: gdbstub_arch dependency-version: 0.3.3 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: anstream dependency-version: 1.0.0 dependency-type: indirect update-type: version-update:semver-major dependency-group: non-rust-vmm - dependency-name: anstyle dependency-version: 1.0.14 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: anstyle-parse dependency-version: 1.0.0 dependency-type: indirect update-type: version-update:semver-major dependency-group: non-rust-vmm - dependency-name: cc dependency-version: 1.2.57 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: clap_builder dependency-version: 4.6.0 dependency-type: 
indirect update-type: version-update:semver-minor dependency-group: non-rust-vmm - dependency-name: clap_lex dependency-version: 1.1.0 dependency-type: indirect update-type: version-update:semver-minor dependency-group: non-rust-vmm - dependency-name: colorchoice dependency-version: 1.0.5 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: darling dependency-version: 0.23.0 dependency-type: indirect update-type: version-update:semver-minor dependency-group: non-rust-vmm - dependency-name: darling_core dependency-version: 0.23.0 dependency-type: indirect update-type: version-update:semver-minor dependency-group: non-rust-vmm - dependency-name: darling_macro dependency-version: 0.23.0 dependency-type: indirect update-type: version-update:semver-minor dependency-group: non-rust-vmm - dependency-name: num_enum_derive dependency-version: 0.7.6 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: once_cell dependency-version: 1.21.4 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: serde_with_macros dependency-version: 3.18.0 dependency-type: indirect update-type: version-update:semver-minor dependency-group: non-rust-vmm ... 
Signed-off-by: dependabot[bot] --- Cargo.lock | 143 +++++++++++++++++++++++++-------------------- Cargo.toml | 4 +- devices/Cargo.toml | 2 +- fuzz/Cargo.lock | 107 +++++++++++++++++---------------- vmm/Cargo.toml | 4 +- 5 files changed, 138 insertions(+), 122 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1833fda938..083f421012 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -42,7 +42,22 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" dependencies = [ "anstyle", - "anstyle-parse", + "anstyle-parse 0.2.7", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstream" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d" +dependencies = [ + "anstyle", + "anstyle-parse 1.0.0", "anstyle-query", "anstyle-wincon", "colorchoice", @@ -52,9 +67,9 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.13" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" +checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" [[package]] name = "anstyle-parse" @@ -65,13 +80,22 @@ dependencies = [ "utf8parse", ] +[[package]] +name = "anstyle-parse" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e" +dependencies = [ + "utf8parse", +] + [[package]] name = "anstyle-query" version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" dependencies = [ - "windows-sys 0.60.2", + "windows-sys 0.61.0", ] [[package]] @@ -82,7 +106,7 @@ checksum = 
"291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys 0.60.2", + "windows-sys 0.61.0", ] [[package]] @@ -370,9 +394,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "cc" -version = "1.2.56" +version = "1.2.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aebf35691d1bfb0ac386a69bac2fde4dd276fb618cf8bf4f5318fe285e821bb2" +checksum = "7a0dd1ca384932ff3641c8718a02769f1698e7563dc6974ffd03346116310423" dependencies = [ "find-msvc-tools", "jobserver", @@ -399,20 +423,20 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.60" +version = "4.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2797f34da339ce31042b27d23607e051786132987f595b02ba4f6a6dffb7030a" +checksum = "b193af5b67834b676abd72466a96c1024e6a6ad978a1f484bd90b85c94041351" dependencies = [ "clap_builder", ] [[package]] name = "clap_builder" -version = "4.5.60" +version = "4.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24a241312cea5059b13574bb9b3861cabf758b879c15190b37b6d6fd63ab6876" +checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" dependencies = [ - "anstream", + "anstream 1.0.0", "anstyle", "clap_lex", "strsim", @@ -421,9 +445,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "1.0.0" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a822ea5bc7590f9d40f1ba12c0dc3c2760f3482c6984db1573ad11031420831" +checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" [[package]] name = "cloud-hypervisor" @@ -459,9 +483,9 @@ dependencies = [ [[package]] name = "colorchoice" -version = "1.0.4" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" +checksum = 
"1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" [[package]] name = "concat-idents" @@ -517,9 +541,9 @@ checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" [[package]] name = "darling" -version = "0.21.3" +version = "0.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cdf337090841a411e2a7f3deb9187445851f91b309c0c0a29e05f74a00a48c0" +checksum = "25ae13da2f202d56bd7f91c25fba009e7717a1e4a1cc98a76d844b65ae912e9d" dependencies = [ "darling_core", "darling_macro", @@ -527,11 +551,10 @@ dependencies = [ [[package]] name = "darling_core" -version = "0.21.3" +version = "0.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1247195ecd7e3c85f83c8d2a366e4210d588e802133e1e355180a9870b517ea4" +checksum = "9865a50f7c335f53564bb694ef660825eb8610e0a53d3e11bf1b0d3df31e03b0" dependencies = [ - "fnv", "ident_case", "proc-macro2", "quote", @@ -541,9 +564,9 @@ dependencies = [ [[package]] name = "darling_macro" -version = "0.21.3" +version = "0.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d38308df82d1080de0afee5d069fa14b0326a88c14f15c5ccda35b4a6c414c81" +checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d" dependencies = [ "darling_core", "quote", @@ -618,7 +641,7 @@ dependencies = [ "libc", "option-ext", "redox_users", - "windows-sys 0.60.2", + "windows-sys 0.61.0", ] [[package]] @@ -670,7 +693,7 @@ version = "0.11.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2daee4ea451f429a58296525ddf28b45a3b64f1acf6587e2067437bb11e218d" dependencies = [ - "anstream", + "anstream 0.6.21", "anstyle", "env_filter", "jiff", @@ -700,7 +723,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.60.2", + "windows-sys 0.61.0", ] [[package]] @@ -778,12 +801,6 @@ 
dependencies = [ "spin", ] -[[package]] -name = "fnv" -version = "1.0.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" - [[package]] name = "foldhash" version = "0.1.5" @@ -893,23 +910,23 @@ dependencies = [ [[package]] name = "gdbstub" -version = "0.7.9" +version = "0.7.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bf845b08f7c2ef3b5ad19f80779d43ae20d278652b91bb80adda65baf2d8ed6" +checksum = "5bafc7e33650ab9f05dcc16325f05d56b8d10393114e31a19a353b86fa60cfe7" dependencies = [ "bitflags 2.11.0", "cfg-if", "log", "managed", "num-traits", - "paste", + "pastey", ] [[package]] name = "gdbstub_arch" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22dde0e1b68787036ccedd0b1ff6f953527a0e807e571fbe898975203027278f" +checksum = "6c02bfe7bd65f42bcda751456869dfa1eb2bd1c36e309b9ec27f4888d41cf258" dependencies = [ "gdbstub", "num-traits", @@ -1255,9 +1272,9 @@ dependencies = [ [[package]] name = "libz-sys" -version = "1.1.24" +version = "1.1.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4735e9cbde5aac84a5ce588f6b23a90b9b0b528f6c5a8db8a4aff300463a0839" +checksum = "d52f4c29e2a68ac30c9087e1b772dc9f44a2b66ed44edf2266cf2be9b03dafc1" dependencies = [ "cc", "libc", @@ -1413,9 +1430,9 @@ dependencies = [ [[package]] name = "num_enum" -version = "0.7.5" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1207a7e20ad57b847bbddc6776b968420d38292bbfe2089accff5e19e82454c" +checksum = "5d0bca838442ec211fa11de3a8b0e0e8f3a4522575b5c4c06ed722e005036f26" dependencies = [ "num_enum_derive", "rustversion", @@ -1423,9 +1440,9 @@ dependencies = [ [[package]] name = "num_enum_derive" -version = "0.7.5" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"ff32365de1b6743cb203b710788263c44a03de03802daf96092f2da4fe6ba4d7" +checksum = "680998035259dcfcafe653688bf2aa6d3e2dc05e98be6ab46afb089dc84f1df8" dependencies = [ "proc-macro-crate", "proc-macro2", @@ -1444,9 +1461,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.21.3" +version = "1.21.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" +checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" [[package]] name = "once_cell_polyfill" @@ -1485,9 +1502,9 @@ dependencies = [ [[package]] name = "openssl-sys" -version = "0.9.111" +version = "0.9.112" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82cab2d520aa75e3c58898289429321eb788c3106963d0dc886ec7a5f4adc321" +checksum = "57d55af3b3e226502be1526dfdba67ab0e9c96fc293004e79576b2b9edb0dbdb" dependencies = [ "cc", "libc", @@ -1549,10 +1566,10 @@ dependencies = [ ] [[package]] -name = "paste" -version = "1.0.15" +name = "pastey" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" +checksum = "b867cad97c0791bbd3aaa6472142568c6c9e8f71937e98379f584cfb0cf35bec" [[package]] name = "pci" @@ -1723,9 +1740,9 @@ checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" [[package]] name = "portable-atomic-util" -version = "0.2.5" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a9db96d7fa8782dd8c15ce32ffe8680bbd1e978a43bf51a34d39483540495f5" +checksum = "091397be61a01d4be58e7841595bd4bfedb15f1cd54977d79b8271e94ed799a3" dependencies = [ "portable-atomic", ] @@ -1895,7 +1912,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys", - "windows-sys 0.60.2", + "windows-sys 0.61.0", ] [[package]] @@ -1981,9 +1998,9 @@ dependencies = [ [[package]] name = "serde_with" -version = "3.17.0" +version = 
"3.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "381b283ce7bc6b476d903296fb59d0d36633652b633b27f64db4fb46dcbfc3b9" +checksum = "dd5414fad8e6907dbdd5bc441a50ae8d6e26151a03b1de04d89a5576de61d01f" dependencies = [ "serde_core", "serde_with_macros", @@ -1991,9 +2008,9 @@ dependencies = [ [[package]] name = "serde_with_macros" -version = "3.17.0" +version = "3.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6d4e30573c8cb306ed6ab1dca8423eec9a463ea0e155f45399455e0368b27e0" +checksum = "d3db8978e608f1fe7357e211969fd9abdcae80bac1ba7a3369bb7eb6b404eb65" dependencies = [ "darling", "proc-macro2", @@ -2095,15 +2112,15 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.26.0" +version = "3.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82a72c767771b47409d2345987fda8628641887d5466101319899796367354a0" +checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" dependencies = [ "fastrand", "getrandom 0.4.2", "once_cell", "rustix", - "windows-sys 0.60.2", + "windows-sys 0.61.0", ] [[package]] @@ -2262,13 +2279,13 @@ dependencies = [ [[package]] name = "uds_windows" -version = "1.2.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51b70b87d15e91f553711b40df3048faf27a7a04e01e0ddc0cf9309f0af7c2ca" +checksum = "f2f6fb2847f6742cd76af783a2a2c49e9375d0a111c7bef6f71cd9e738c72d6e" dependencies = [ "memoffset", "tempfile", - "windows-sys 0.60.2", + "windows-sys 0.61.0", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 4c4ad78e8a..b8fc54d9e1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -78,14 +78,14 @@ igvm_defs = "0.4.0" # serde crates serde = "1.0.228" serde_json = "1.0.149" -serde_with = { version = "3.17.0", default-features = false } +serde_with = { version = "3.18.0", default-features = false } # other crates anyhow = "1.0.102" bitflags = "2.11.0" byteorder = "1.5.0" cfg-if = "1.0.4" -clap 
= "4.5.60" +clap = "4.6.0" dhat = "0.3.3" dirs = "6.0.0" env_logger = "0.11.8" diff --git a/devices/Cargo.toml b/devices/Cargo.toml index af4d7b73ce..516f41e834 100644 --- a/devices/Cargo.toml +++ b/devices/Cargo.toml @@ -21,7 +21,7 @@ linux-loader = { workspace = true, features = [ "pe", ], optional = true } log = { workspace = true } -num_enum = "0.7.5" +num_enum = "0.7.6" pci = { path = "../pci" } serde = { workspace = true, features = ["derive"] } thiserror = { workspace = true } diff --git a/fuzz/Cargo.lock b/fuzz/Cargo.lock index 5d5f5e7a45..be0380751f 100644 --- a/fuzz/Cargo.lock +++ b/fuzz/Cargo.lock @@ -19,9 +19,9 @@ checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" [[package]] name = "anstream" -version = "0.6.21" +version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" +checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d" dependencies = [ "anstyle", "anstyle-parse", @@ -34,15 +34,15 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.13" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" +checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" [[package]] name = "anstyle-parse" -version = "0.2.7" +version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" +checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e" dependencies = [ "utf8parse", ] @@ -173,9 +173,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "cc" -version = "1.2.56" +version = "1.2.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"aebf35691d1bfb0ac386a69bac2fde4dd276fb618cf8bf4f5318fe285e821bb2" +checksum = "7a0dd1ca384932ff3641c8718a02769f1698e7563dc6974ffd03346116310423" dependencies = [ "find-msvc-tools", "jobserver", @@ -202,18 +202,18 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.60" +version = "4.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2797f34da339ce31042b27d23607e051786132987f595b02ba4f6a6dffb7030a" +checksum = "b193af5b67834b676abd72466a96c1024e6a6ad978a1f484bd90b85c94041351" dependencies = [ "clap_builder", ] [[package]] name = "clap_builder" -version = "4.5.60" +version = "4.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24a241312cea5059b13574bb9b3861cabf758b879c15190b37b6d6fd63ab6876" +checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" dependencies = [ "anstream", "anstyle", @@ -223,9 +223,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "1.0.0" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a822ea5bc7590f9d40f1ba12c0dc3c2760f3482c6984db1573ad11031420831" +checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" [[package]] name = "cloud-hypervisor-fuzz" @@ -255,9 +255,9 @@ dependencies = [ [[package]] name = "colorchoice" -version = "1.0.4" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" +checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" [[package]] name = "concat-idents" @@ -298,9 +298,9 @@ dependencies = [ [[package]] name = "darling" -version = "0.21.3" +version = "0.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cdf337090841a411e2a7f3deb9187445851f91b309c0c0a29e05f74a00a48c0" +checksum = "25ae13da2f202d56bd7f91c25fba009e7717a1e4a1cc98a76d844b65ae912e9d" dependencies = [ "darling_core", 
"darling_macro", @@ -308,11 +308,10 @@ dependencies = [ [[package]] name = "darling_core" -version = "0.21.3" +version = "0.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1247195ecd7e3c85f83c8d2a366e4210d588e802133e1e355180a9870b517ea4" +checksum = "9865a50f7c335f53564bb694ef660825eb8610e0a53d3e11bf1b0d3df31e03b0" dependencies = [ - "fnv", "ident_case", "proc-macro2", "quote", @@ -322,9 +321,9 @@ dependencies = [ [[package]] name = "darling_macro" -version = "0.21.3" +version = "0.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d38308df82d1080de0afee5d069fa14b0326a88c14f15c5ccda35b4a6c414c81" +checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d" dependencies = [ "darling_core", "quote", @@ -468,12 +467,6 @@ dependencies = [ "spin", ] -[[package]] -name = "fnv" -version = "1.0.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" - [[package]] name = "foldhash" version = "0.1.5" @@ -494,23 +487,23 @@ checksum = "c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893" [[package]] name = "gdbstub" -version = "0.7.9" +version = "0.7.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bf845b08f7c2ef3b5ad19f80779d43ae20d278652b91bb80adda65baf2d8ed6" +checksum = "5bafc7e33650ab9f05dcc16325f05d56b8d10393114e31a19a353b86fa60cfe7" dependencies = [ "bitflags 2.11.0", "cfg-if", "log", "managed", "num-traits", - "paste", + "pastey", ] [[package]] name = "gdbstub_arch" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22dde0e1b68787036ccedd0b1ff6f953527a0e807e571fbe898975203027278f" +checksum = "6c02bfe7bd65f42bcda751456869dfa1eb2bd1c36e309b9ec27f4888d41cf258" dependencies = [ "gdbstub", "num-traits", @@ -537,19 +530,19 @@ checksum = 
"899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" dependencies = [ "cfg-if", "libc", - "r-efi", + "r-efi 5.3.0", "wasip2", ] [[package]] name = "getrandom" -version = "0.4.1" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "139ef39800118c7683f2fd3c98c1b23c09ae076556b435f8e9064ae108aaeeec" +checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" dependencies = [ "cfg-if", "libc", - "r-efi", + "r-efi 6.0.0", "rand_core", "wasip2", "wasip3", @@ -820,7 +813,7 @@ name = "net_util" version = "0.1.0" dependencies = [ "epoll", - "getrandom 0.4.1", + "getrandom 0.4.2", "libc", "log", "net_gen", @@ -845,9 +838,9 @@ dependencies = [ [[package]] name = "num_enum" -version = "0.7.5" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1207a7e20ad57b847bbddc6776b968420d38292bbfe2089accff5e19e82454c" +checksum = "5d0bca838442ec211fa11de3a8b0e0e8f3a4522575b5c4c06ed722e005036f26" dependencies = [ "num_enum_derive", "rustversion", @@ -855,9 +848,9 @@ dependencies = [ [[package]] name = "num_enum_derive" -version = "0.7.5" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff32365de1b6743cb203b710788263c44a03de03802daf96092f2da4fe6ba4d7" +checksum = "680998035259dcfcafe653688bf2aa6d3e2dc05e98be6ab46afb089dc84f1df8" dependencies = [ "proc-macro-crate", "proc-macro2", @@ -867,9 +860,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.21.3" +version = "1.21.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" +checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" [[package]] name = "once_cell_polyfill" @@ -905,10 +898,10 @@ dependencies = [ ] [[package]] -name = "paste" -version = "1.0.15" +name = "pastey" +version = "0.2.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" +checksum = "b867cad97c0791bbd3aaa6472142568c6c9e8f71937e98379f584cfb0cf35bec" [[package]] name = "pci" @@ -980,6 +973,12 @@ version = "5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" +[[package]] +name = "r-efi" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" + [[package]] name = "rand" version = "0.10.0" @@ -987,7 +986,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bc266eb313df6c5c09c1c7b1fbe2510961e5bcd3add930c1e31f7ed9da0feff8" dependencies = [ "chacha20", - "getrandom 0.4.1", + "getrandom 0.4.2", "rand_core", ] @@ -1091,9 +1090,9 @@ dependencies = [ [[package]] name = "serde_with" -version = "3.17.0" +version = "3.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "381b283ce7bc6b476d903296fb59d0d36633652b633b27f64db4fb46dcbfc3b9" +checksum = "dd5414fad8e6907dbdd5bc441a50ae8d6e26151a03b1de04d89a5576de61d01f" dependencies = [ "serde_core", "serde_with_macros", @@ -1101,9 +1100,9 @@ dependencies = [ [[package]] name = "serde_with_macros" -version = "3.17.0" +version = "3.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6d4e30573c8cb306ed6ab1dca8423eec9a463ea0e155f45399455e0368b27e0" +checksum = "d3db8978e608f1fe7357e211969fd9abdcae80bac1ba7a3369bb7eb6b404eb65" dependencies = [ "darling", "proc-macro2", @@ -1295,7 +1294,7 @@ version = "1.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a68d3c8f01c0cfa54a75291d83601161799e4a89a39e0929f4b0354d88757a37" dependencies = [ - "getrandom 0.4.1", + "getrandom 0.4.2", "js-sys", "rand", "wasm-bindgen", diff --git a/vmm/Cargo.toml 
b/vmm/Cargo.toml index 35fe314299..ab0278e6d1 100644 --- a/vmm/Cargo.toml +++ b/vmm/Cargo.toml @@ -49,8 +49,8 @@ epoll = { workspace = true } event_monitor = { path = "../event_monitor" } flume = { workspace = true } futures = { version = "0.3.32", optional = true } -gdbstub = { version = "0.7.9", optional = true } -gdbstub_arch = { version = "0.3.2", optional = true } +gdbstub = { version = "0.7.10", optional = true } +gdbstub_arch = { version = "0.3.3", optional = true } hex = { version = "0.4.3", optional = true } hypervisor = { path = "../hypervisor" } igvm = { workspace = true, optional = true } From ef91fc64e577cb7621ebeef9e8695ad3279c5e38 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Sun, 15 Mar 2026 06:35:43 -0700 Subject: [PATCH 173/742] virtio-devices: vhost_user: Trigger interrupts in guest on resume Trigger the interrupts in the guest for the virtio device queues behind the vhost-user devices when resuming. This avoids a situation where interrupts from the backend get lost when they are dispatched from the backend when then guest is paused leading to the guest/backend effectively waiting for each other to move forward. This is more reproducible with longer durations between pause and resume as there is more opportunity for the backend to completely process it's queue and fire all the interrupts. It's perfectly safe and allowed by the virtio spec to generate these interrupts and the performance impact is negligible and is a safe way to ensure forward progress after a resume. 
See: #7850 Signed-off-by: Rob Bradford --- virtio-devices/src/vhost_user/mod.rs | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/virtio-devices/src/vhost_user/mod.rs b/virtio-devices/src/vhost_user/mod.rs index 158da3d800..0dad19acea 100644 --- a/virtio-devices/src/vhost_user/mod.rs +++ b/virtio-devices/src/vhost_user/mod.rs @@ -302,6 +302,7 @@ pub struct VhostUserCommon { pub vu_num_queues: usize, pub migration_started: bool, pub server: bool, + pub interrupt_cb: Option>, } impl VhostUserCommon { @@ -345,6 +346,8 @@ impl VhostUserCommon { ) .map_err(ActivateError::VhostUserSetup)?; + self.interrupt_cb = Some(interrupt_cb.clone()); + Ok(VhostUserEpollHandler { vu: vu.clone(), mem, @@ -425,10 +428,16 @@ impl VhostUserCommon { if let Some(vu) = &self.vu { vu.lock().unwrap().resume_vhost_user().map_err(|e| { MigratableError::Resume(anyhow!("Error resuming vhost-user backend: {e:?}")) - }) - } else { - Ok(()) + })?; + } + if let Some(interrupt_cb) = &self.interrupt_cb { + for i in 0..self.vu_num_queues { + interrupt_cb + .trigger(crate::VirtioInterruptType::Queue(i as u16)) + .ok(); + } } + Ok(()) } pub fn snapshot<'a, T>(&mut self, state: &T) -> std::result::Result From c42cc478febdeee2135238a159fb4dcd26563d5d Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Sat, 14 Mar 2026 14:34:07 +0100 Subject: [PATCH 174/742] performance-metrics: Add micro benchmark support Introduce support for in process micro benchmarks alongside the existing VM level performance tests. Micro benchmarks are integrated into the same PerformanceTest/TEST_LIST infrastructure and follow the same iteration, timeout, and reporting pipeline. They are distinguished by a micro_* name prefix. The test dispatch loop is refactored to pre filter the test list and gate init/cleanup behind a flag, so that pure micro benchmark runs skip the expensive VM lifecycle entirely. Mixed runs (VM + micro) continue to work correctly. 
Signed-off-by: Anatol Belski --- performance-metrics/src/main.rs | 37 +++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/performance-metrics/src/main.rs b/performance-metrics/src/main.rs index bef0b74ab1..d1228153da 100644 --- a/performance-metrics/src/main.rs +++ b/performance-metrics/src/main.rs @@ -1338,24 +1338,35 @@ fn main() { .unwrap_or_default(), }); - init_tests(&overrides); + // Determine which tests will actually run. + let tests_to_run: Vec<&&PerformanceTest> = test_list + .iter() + .filter(|t| test_filter.is_empty() || test_filter.iter().any(|&s| t.name.contains(s))) + .collect(); - for test in test_list.iter() { - if test_filter.is_empty() || test_filter.iter().any(|&s| test.name.contains(s)) { - settle_host(); - match run_test_with_timeout(test, &overrides) { - Ok(r) => { - metrics_report.results.push(r); - } - Err(e) => { - eprintln!("Aborting test due to error: '{e:?}'"); - std::process::exit(1); - } + // Skip heavy VM level init/cleanup when only micro benchmarks are selected. + let needs_vm_tests = tests_to_run.iter().any(|t| !t.name.starts_with("micro_")); + + if needs_vm_tests { + init_tests(&overrides); + } + + for test in tests_to_run { + settle_host(); + match run_test_with_timeout(test, &overrides) { + Ok(r) => { + metrics_report.results.push(r); + } + Err(e) => { + eprintln!("Aborting test due to error: '{e:?}'"); + std::process::exit(1); } } } - cleanup_tests(); + if needs_vm_tests { + cleanup_tests(); + } let mut report_file: Box = if let Some(file) = cmd_arguments.get_one::("report-file") { From b12620cf2545a97b930b44f505f05890eef77baf Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Sun, 15 Mar 2026 11:47:37 +0100 Subject: [PATCH 175/742] performance-metrics: Add num_ops field to PerformanceTestControl Add an optional num_ops parameter for micro benchmarks to configure workload size (e.g. number of AIO operations to submit). 
A warning is emitted if it is accidentally set on a non micro test where it has no effect. Signed-off-by: Anatol Belski --- performance-metrics/src/main.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/performance-metrics/src/main.rs b/performance-metrics/src/main.rs index d1228153da..cdc06fe108 100644 --- a/performance-metrics/src/main.rs +++ b/performance-metrics/src/main.rs @@ -179,6 +179,7 @@ pub struct PerformanceTestControl { net_control: Option<(bool, bool)>, // First bool is for RX(true)/TX(false), second bool is for bandwidth or PPS block_control: Option, num_boot_vcpus: Option, + num_ops: Option, // Workload size for micro benchmarks } impl fmt::Display for PerformanceTestControl { @@ -203,6 +204,9 @@ impl fmt::Display for PerformanceTestControl { o.fio_ops, o.bandwidth, o.test_file ); } + if let Some(o) = self.num_ops { + output = format!("{output}, num_ops = {o}"); + } write!(f, "{output}") } @@ -219,6 +223,7 @@ impl PerformanceTestControl { net_control: None, block_control: None, num_boot_vcpus: Some(1), + num_ops: None, } } } @@ -235,6 +240,13 @@ struct PerformanceTest { impl PerformanceTest { pub fn run(&self, overrides: &PerformanceTestOverrides) -> PerformanceTestResult { + if self.control.num_ops.is_some() && !self.name.starts_with("micro_") { + eprintln!( + "Warning: num_ops is set on '{}' but has no effect on non micro benchmarks", + self.name + ); + } + // Run warmup iterations if configured (results discarded) for _ in 0..self.control.warmup_iterations { if let Some(test_timeout) = overrides.test_timeout { From be6f63a740c857c1f6c9b5b569eedfa9e4fdc7c5 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Sat, 14 Mar 2026 15:28:14 +0100 Subject: [PATCH 176/742] performance-metrics: Add util module with shared micro benchmark helpers These factor out common setup and synchronization patterns used by block layer micro benchmarks. 
Signed-off-by: Anatol Belski --- Cargo.lock | 1 + performance-metrics/Cargo.toml | 1 + performance-metrics/src/main.rs | 1 + performance-metrics/src/util.rs | 36 +++++++++++++++++++++++++++++++++ 4 files changed, 39 insertions(+) create mode 100644 performance-metrics/src/util.rs diff --git a/Cargo.lock b/Cargo.lock index 083f421012..af88d4db99 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1602,6 +1602,7 @@ dependencies = [ "serde_json", "test_infra", "thiserror 2.0.18", + "vmm-sys-util", ] [[package]] diff --git a/performance-metrics/Cargo.toml b/performance-metrics/Cargo.toml index 472f1159b3..60be8f45a6 100644 --- a/performance-metrics/Cargo.toml +++ b/performance-metrics/Cargo.toml @@ -11,6 +11,7 @@ serde = { workspace = true, features = ["derive", "rc"] } serde_json = { workspace = true } test_infra = { path = "../test_infra" } thiserror = { workspace = true } +vmm-sys-util = { workspace = true } [lints] workspace = true diff --git a/performance-metrics/src/main.rs b/performance-metrics/src/main.rs index cdc06fe108..bcfee6d974 100644 --- a/performance-metrics/src/main.rs +++ b/performance-metrics/src/main.rs @@ -5,6 +5,7 @@ // Custom harness to run performance tests mod performance_tests; +mod util; use std::process::Command; use std::sync::Arc; diff --git a/performance-metrics/src/util.rs b/performance-metrics/src/util.rs new file mode 100644 index 0000000000..dcc2257501 --- /dev/null +++ b/performance-metrics/src/util.rs @@ -0,0 +1,36 @@ +// Copyright 2026 The Cloud Hypervisor Authors. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +//! Shared benchmark helpers. + +use std::io::ErrorKind; +use std::thread; +use std::time::Duration; + +use vmm_sys_util::eventfd::EventFd; +use vmm_sys_util::tempfile::TempFile; + +pub const BLOCK_SIZE: u64 = 4096; + +/// Create a temporary file pre sized to hold `num_blocks` blocks. 
+pub fn sized_tempfile(num_blocks: usize) -> TempFile { + let tmp = TempFile::new().expect("failed to create tempfile"); + tmp.as_file() + .set_len(BLOCK_SIZE * num_blocks as u64) + .expect("failed to set file length"); + tmp +} + +/// Spin and wait until the given eventfd becomes readable. +pub fn wait_for_eventfd(notifier: &EventFd) { + loop { + match notifier.read() { + Ok(_) => return, + Err(e) if e.kind() == ErrorKind::WouldBlock => { + thread::sleep(Duration::from_micros(50)); + } + Err(e) => panic!("eventfd read failed: {e}"), + } + } +} From 00957fa9dbe7004eafefecb805e1e775e5aad36f Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Sat, 14 Mar 2026 15:28:29 +0100 Subject: [PATCH 177/742] performance-metrics: Add AIO completion drain micro benchmark Add micro_block_raw_aio_drain_128_us and micro_block_raw_aio_drain_256_us tests that submit N AIO writes to a temporary file, wait for the eventfd signal, then time how long it takes to drain all completions via next_completed_request(). This measures per completion syscall overhead and provides a baseline before any batching optimizations. 
Signed-off-by: Anatol Belski --- Cargo.lock | 2 + performance-metrics/Cargo.toml | 2 + performance-metrics/src/main.rs | 31 +++++++++++- performance-metrics/src/micro_bench_block.rs | 53 ++++++++++++++++++++ 4 files changed, 87 insertions(+), 1 deletion(-) create mode 100644 performance-metrics/src/micro_bench_block.rs diff --git a/Cargo.lock b/Cargo.lock index af88d4db99..7fa87d6512 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1596,8 +1596,10 @@ dependencies = [ name = "performance-metrics" version = "0.1.0" dependencies = [ + "block", "clap", "dirs", + "libc", "serde", "serde_json", "test_infra", diff --git a/performance-metrics/Cargo.toml b/performance-metrics/Cargo.toml index 60be8f45a6..516ea9e0a2 100644 --- a/performance-metrics/Cargo.toml +++ b/performance-metrics/Cargo.toml @@ -5,8 +5,10 @@ name = "performance-metrics" version = "0.1.0" [dependencies] +block = { path = "../block" } clap = { workspace = true, features = ["wrap_help"] } dirs = { workspace = true } +libc = { workspace = true } serde = { workspace = true, features = ["derive", "rc"] } serde_json = { workspace = true } test_infra = { path = "../test_infra" } diff --git a/performance-metrics/src/main.rs b/performance-metrics/src/main.rs index bcfee6d974..c648d189c7 100644 --- a/performance-metrics/src/main.rs +++ b/performance-metrics/src/main.rs @@ -4,6 +4,7 @@ // // Custom harness to run performance tests +mod micro_bench_block; mod performance_tests; mod util; @@ -336,6 +337,10 @@ mod adjuster { v * 1000.0 } + pub fn s_to_us(v: f64) -> f64 { + v * 1_000_000.0 + } + pub fn bps_to_gbps(v: f64) -> f64 { v / (1_000_000_000_f64) } @@ -346,7 +351,7 @@ mod adjuster { } } -const TEST_LIST: [PerformanceTest; 60] = [ +const TEST_LIST: [PerformanceTest; 62] = [ PerformanceTest { name: "boot_time_ms", func_ptr: performance_boot_time, @@ -1197,6 +1202,30 @@ const TEST_LIST: [PerformanceTest; 60] = [ }, unit_adjuster: adjuster::Bps_to_MiBps, }, + PerformanceTest { + name: "micro_block_raw_aio_drain_128_us", 
+ func_ptr: micro_bench_block::micro_bench_aio_drain, + control: PerformanceTestControl { + test_timeout: 5, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(128), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_raw_aio_drain_256_us", + func_ptr: micro_bench_block::micro_bench_aio_drain, + control: PerformanceTestControl { + test_timeout: 5, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(256), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, ]; fn run_test_with_timeout( diff --git a/performance-metrics/src/micro_bench_block.rs b/performance-metrics/src/micro_bench_block.rs new file mode 100644 index 0000000000..6dc51af658 --- /dev/null +++ b/performance-metrics/src/micro_bench_block.rs @@ -0,0 +1,53 @@ +// Copyright 2026 The Cloud Hypervisor Authors. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +//! In process micro benchmarks for block layer internals. +//! +//! These run without booting a VM and measure hot path operations +//! (e.g. AIO completion draining) at the syscall level. + +use std::os::unix::io::AsRawFd; +use std::time::Instant; + +use block::async_io::AsyncIo; +use block::raw_async_aio::RawFileAsyncAio; + +use crate::PerformanceTestControl; +use crate::util::{self, BLOCK_SIZE}; + +/// Submit num_ops AIO writes, wait for them all to land, then time +/// how long it takes to drain every completion via next_completed_request(). +/// +/// Returns the drain wall clock time in seconds. +pub fn micro_bench_aio_drain(control: &PerformanceTestControl) -> f64 { + let num_ops = control.num_ops.expect("num_ops required") as usize; + let tmp = util::sized_tempfile(num_ops); + let fd = tmp.as_file().as_raw_fd(); + let mut aio = RawFileAsyncAio::new(fd, num_ops as u32).expect("failed to create AIO context"); + + let buf = vec![0xA5u8; BLOCK_SIZE as usize]; + + // Submit all writes. 
+ for i in 0..num_ops { + let iovec = libc::iovec { + iov_base: buf.as_ptr() as *mut _, + iov_len: buf.len(), + }; + aio.write_vectored((i as u64 * BLOCK_SIZE) as libc::off_t, &[iovec], i as u64) + .expect("write_vectored failed"); + } + + // Wait until the eventfd signals that completions are available. + util::wait_for_eventfd(aio.notifier()); + + // Drain all completions and measure. + let start = Instant::now(); + let mut drained = 0usize; + while drained < num_ops { + if aio.next_completed_request().is_some() { + drained += 1; + } + } + start.elapsed().as_secs_f64() +} From 0f7dc514ba4d13e29b51db491d7447152e74a4f9 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Thu, 12 Mar 2026 23:08:23 -0400 Subject: [PATCH 178/742] vmm: Wrap all InterruptRoute operations in a mutex The InterruptRoute code tried to be thread-safe, but it wasn't. In particular, concurrently enabling and disabling an InterruptRoute could result in the route thinking it was enabled (when it was disabled) or vice versa. Wrap all operations in a mutex and drop the attempt at being lock-free.
Signed-off-by: Demi Marie Obenour --- vmm/src/interrupt.rs | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/vmm/src/interrupt.rs b/vmm/src/interrupt.rs index e42ba2f76b..70a58dfb12 100644 --- a/vmm/src/interrupt.rs +++ b/vmm/src/interrupt.rs @@ -5,7 +5,6 @@ use std::collections::HashMap; use std::io; -use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::{Arc, Mutex}; use devices::interrupt_controller::InterruptController; @@ -23,7 +22,7 @@ pub type Result = std::io::Result; struct InterruptRoute { gsi: u32, irq_fd: EventFd, - registered: AtomicBool, + registered: bool, } impl InterruptRoute { @@ -36,39 +35,39 @@ impl InterruptRoute { Ok(InterruptRoute { gsi, irq_fd, - registered: AtomicBool::new(false), + registered: false, }) } - pub fn enable(&self, vm: &dyn hypervisor::Vm) -> Result<()> { - if !self.registered.load(Ordering::Acquire) { + pub fn enable(&mut self, vm: &dyn hypervisor::Vm) -> Result<()> { + if !self.registered { vm.register_irqfd(&self.irq_fd, self.gsi) .map_err(|e| io::Error::other(format!("Failed registering irq_fd: {e}")))?; // Update internals to track the irq_fd as "registered". - self.registered.store(true, Ordering::Release); + self.registered = true; } Ok(()) } - pub fn disable(&self, vm: &dyn hypervisor::Vm) -> Result<()> { - if self.registered.load(Ordering::Acquire) { + pub fn disable(&mut self, vm: &dyn hypervisor::Vm) -> Result<()> { + if self.registered { vm.unregister_irqfd(&self.irq_fd, self.gsi) .map_err(|e| io::Error::other(format!("Failed unregistering irq_fd: {e}")))?; // Update internals to track the irq_fd as "unregistered". 
- self.registered.store(false, Ordering::Release); + self.registered = false; } Ok(()) } - pub fn trigger(&self) -> Result<()> { + pub fn trigger(&mut self) -> Result<()> { self.irq_fd.write(1) } - pub fn notifier(&self) -> Option { + pub fn notifier(&mut self) -> Option { Some( self.irq_fd .try_clone() @@ -85,7 +84,7 @@ pub struct RoutingEntry { pub struct MsiInterruptGroup { vm: Arc, gsi_msi_routes: Arc>>, - irq_routes: HashMap, + irq_routes: HashMap>, } impl MsiInterruptGroup { @@ -109,7 +108,7 @@ impl MsiInterruptGroup { fn new( vm: Arc, gsi_msi_routes: Arc>>, - irq_routes: HashMap, + irq_routes: HashMap>, ) -> Self { MsiInterruptGroup { vm, @@ -122,7 +121,7 @@ impl MsiInterruptGroup { impl InterruptSourceGroup for MsiInterruptGroup { fn enable(&self) -> Result<()> { for (_, route) in self.irq_routes.iter() { - route.enable(self.vm.as_ref())?; + route.lock().unwrap().enable(self.vm.as_ref())?; } Ok(()) @@ -130,7 +129,7 @@ impl InterruptSourceGroup for MsiInterruptGroup { fn disable(&self) -> Result<()> { for (_, route) in self.irq_routes.iter() { - route.disable(self.vm.as_ref())?; + route.lock().unwrap().disable(self.vm.as_ref())?; } Ok(()) @@ -138,7 +137,7 @@ impl InterruptSourceGroup for MsiInterruptGroup { fn trigger(&self, index: InterruptIndex) -> Result<()> { if let Some(route) = self.irq_routes.get(&index) { - return route.trigger(); + return route.lock().unwrap().trigger(); } Err(io::Error::other(format!( @@ -148,7 +147,7 @@ impl InterruptSourceGroup for MsiInterruptGroup { fn notifier(&self, index: InterruptIndex) -> Option { if let Some(route) = self.irq_routes.get(&index) { - return route.notifier(); + return route.lock().unwrap().notifier(); } None @@ -162,6 +161,7 @@ impl InterruptSourceGroup for MsiInterruptGroup { set_gsi: bool, ) -> Result<()> { if let Some(route) = self.irq_routes.get(&index) { + let mut route = route.lock().unwrap(); let entry = RoutingEntry { route: self.vm.make_routing_entry(route.gsi, &config), masked, @@ -293,10 +293,10 
@@ impl InterruptManager for MsiInterruptManager { fn create_group(&self, config: Self::GroupConfig) -> Result> { let mut allocator = self.allocator.lock().unwrap(); - let mut irq_routes: HashMap = + let mut irq_routes: HashMap> = HashMap::with_capacity(config.count as usize); for i in config.base..config.base + config.count { - irq_routes.insert(i, InterruptRoute::new(&mut allocator)?); + irq_routes.insert(i, Mutex::new(InterruptRoute::new(&mut allocator)?)); } Ok(Arc::new(MsiInterruptGroup::new( From e1c40211ae977df51949e3d8967101059ef64d4c Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Thu, 12 Mar 2026 21:48:12 -0400 Subject: [PATCH 179/742] virtio-devices: Add set_notifier() method to VirtioInterrupt It is currently left as unimplemented!(). No functional change intended as there are no callers. Signed-off-by: Demi Marie Obenour --- Cargo.lock | 1 + fuzz/Cargo.lock | 1 + fuzz/fuzz_targets/balloon.rs | 9 +++++++++ fuzz/fuzz_targets/block.rs | 9 +++++++++ fuzz/fuzz_targets/console.rs | 9 +++++++++ fuzz/fuzz_targets/iommu.rs | 9 +++++++++ fuzz/fuzz_targets/mem.rs | 9 +++++++++ fuzz/fuzz_targets/net.rs | 9 +++++++++ fuzz/fuzz_targets/pmem.rs | 9 +++++++++ fuzz/fuzz_targets/rng.rs | 9 +++++++++ fuzz/fuzz_targets/vsock.rs | 9 +++++++++ fuzz/fuzz_targets/watchdog.rs | 9 +++++++++ virtio-devices/Cargo.toml | 1 + virtio-devices/src/device.rs | 6 ++++++ virtio-devices/src/transport/pci_device.rs | 9 +++++++++ virtio-devices/src/vsock/mod.rs | 9 +++++++++ 16 files changed, 117 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 7fa87d6512..1c02d2604f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2455,6 +2455,7 @@ dependencies = [ "byteorder", "epoll", "event_monitor", + "hypervisor", "libc", "log", "mshv-ioctls", diff --git a/fuzz/Cargo.lock b/fuzz/Cargo.lock index be0380751f..e03789eaf4 100644 --- a/fuzz/Cargo.lock +++ b/fuzz/Cargo.lock @@ -1372,6 +1372,7 @@ dependencies = [ "byteorder", "epoll", "event_monitor", + "hypervisor", "libc", "log", "net_util", 
diff --git a/fuzz/fuzz_targets/balloon.rs b/fuzz/fuzz_targets/balloon.rs index 58b9b30582..69f0c07e84 100644 --- a/fuzz/fuzz_targets/balloon.rs +++ b/fuzz/fuzz_targets/balloon.rs @@ -119,6 +119,15 @@ impl VirtioInterrupt for NoopVirtioInterrupt { fn trigger(&self, _int_type: VirtioInterruptType) -> std::result::Result<(), std::io::Error> { Ok(()) } + + fn set_notifier( + &self, + _interrupt: u32, + _eventfd: Option, + _vm: &dyn hypervisor::Vm, + ) -> std::io::Result<()> { + unimplemented!() + } } macro_rules! align { diff --git a/fuzz/fuzz_targets/block.rs b/fuzz/fuzz_targets/block.rs index 51007fe384..952011b55b 100644 --- a/fuzz/fuzz_targets/block.rs +++ b/fuzz/fuzz_targets/block.rs @@ -121,6 +121,15 @@ impl VirtioInterrupt for NoopVirtioInterrupt { fn trigger(&self, _int_type: VirtioInterruptType) -> std::result::Result<(), std::io::Error> { Ok(()) } + + fn set_notifier( + &self, + _interrupt: u32, + _eventfd: Option, + _vm: &dyn hypervisor::Vm, + ) -> std::io::Result<()> { + unimplemented!() + } } fn setup_virt_queue(bytes: &[u8; QUEUE_DATA_SIZE]) -> Queue { diff --git a/fuzz/fuzz_targets/console.rs b/fuzz/fuzz_targets/console.rs index e27331ed01..a335a96027 100644 --- a/fuzz/fuzz_targets/console.rs +++ b/fuzz/fuzz_targets/console.rs @@ -148,6 +148,15 @@ impl VirtioInterrupt for NoopVirtioInterrupt { fn trigger(&self, _int_type: VirtioInterruptType) -> std::result::Result<(), std::io::Error> { Ok(()) } + + fn set_notifier( + &self, + _interrupt: u32, + _eventfd: Option, + _vm: &dyn hypervisor::Vm, + ) -> std::io::Result<()> { + unimplemented!() + } } fn setup_virt_queues(bytes: &[&[u8; QUEUE_DATA_SIZE]], base_addr: u64) -> Vec { diff --git a/fuzz/fuzz_targets/iommu.rs b/fuzz/fuzz_targets/iommu.rs index a10640487f..11600a36a7 100644 --- a/fuzz/fuzz_targets/iommu.rs +++ b/fuzz/fuzz_targets/iommu.rs @@ -130,6 +130,15 @@ impl VirtioInterrupt for NoopVirtioInterrupt { fn trigger(&self, _int_type: VirtioInterruptType) -> std::result::Result<(), std::io::Error> { 
Ok(()) } + + fn set_notifier( + &self, + _interrupt: u32, + _eventfd: Option, + _vm: &dyn hypervisor::Vm, + ) -> std::io::Result<()> { + unimplemented!() + } } fn setup_virt_queue(bytes: &[u8; QUEUE_DATA_SIZE]) -> Queue { diff --git a/fuzz/fuzz_targets/mem.rs b/fuzz/fuzz_targets/mem.rs index 73ec11b025..e430e195aa 100644 --- a/fuzz/fuzz_targets/mem.rs +++ b/fuzz/fuzz_targets/mem.rs @@ -125,6 +125,15 @@ impl VirtioInterrupt for NoopVirtioInterrupt { fn trigger(&self, _int_type: VirtioInterruptType) -> std::result::Result<(), std::io::Error> { Ok(()) } + + fn set_notifier( + &self, + _interrupt: u32, + _eventfd: Option, + _vm: &dyn hypervisor::Vm, + ) -> std::io::Result<()> { + unimplemented!() + } } // Create a dummy virtio-mem device for fuzzing purpose only diff --git a/fuzz/fuzz_targets/net.rs b/fuzz/fuzz_targets/net.rs index df9a1dce5a..efc9605806 100644 --- a/fuzz/fuzz_targets/net.rs +++ b/fuzz/fuzz_targets/net.rs @@ -166,6 +166,15 @@ impl VirtioInterrupt for NoopVirtioInterrupt { fn trigger(&self, _int_type: VirtioInterruptType) -> std::result::Result<(), std::io::Error> { Ok(()) } + + fn set_notifier( + &self, + _interrupt: u32, + _eventfd: Option, + _vm: &dyn hypervisor::Vm, + ) -> std::io::Result<()> { + unimplemented!() + } } fn setup_virt_queues(bytes: &[&[u8; QUEUE_DATA_SIZE]], base_addr: u64) -> Vec { diff --git a/fuzz/fuzz_targets/pmem.rs b/fuzz/fuzz_targets/pmem.rs index 0bd083a1c2..37eabf86cd 100644 --- a/fuzz/fuzz_targets/pmem.rs +++ b/fuzz/fuzz_targets/pmem.rs @@ -95,6 +95,15 @@ impl VirtioInterrupt for NoopVirtioInterrupt { fn trigger(&self, _int_type: VirtioInterruptType) -> std::result::Result<(), std::io::Error> { Ok(()) } + + fn set_notifier( + &self, + _interrupt: u32, + _eventfd: Option, + _vm: &dyn hypervisor::Vm, + ) -> std::io::Result<()> { + unimplemented!() + } } // Create a dummy virtio-pmem device for fuzzing purpose only diff --git a/fuzz/fuzz_targets/rng.rs b/fuzz/fuzz_targets/rng.rs index 13548664a8..c3029f33b4 100644 --- 
a/fuzz/fuzz_targets/rng.rs +++ b/fuzz/fuzz_targets/rng.rs @@ -119,6 +119,15 @@ impl VirtioInterrupt for NoopVirtioInterrupt { fn trigger(&self, _int_type: VirtioInterruptType) -> std::result::Result<(), std::io::Error> { Ok(()) } + + fn set_notifier( + &self, + _interrupt: u32, + _eventfd: Option, + _vm: &dyn hypervisor::Vm, + ) -> std::io::Result<()> { + unimplemented!() + } } fn setup_virt_queue(bytes: &[u8; QUEUE_DATA_SIZE]) -> Queue { diff --git a/fuzz/fuzz_targets/vsock.rs b/fuzz/fuzz_targets/vsock.rs index 33ebe78886..559f2ec138 100644 --- a/fuzz/fuzz_targets/vsock.rs +++ b/fuzz/fuzz_targets/vsock.rs @@ -128,6 +128,15 @@ impl VirtioInterrupt for NoopVirtioInterrupt { fn trigger(&self, _int_type: VirtioInterruptType) -> std::result::Result<(), std::io::Error> { Ok(()) } + + fn set_notifier( + &self, + _interrupt: u32, + _eventfd: Option, + _vm: &dyn hypervisor::Vm, + ) -> std::io::Result<()> { + unimplemented!() + } } fn setup_virt_queue(bytes: &[u8; QUEUE_DATA_SIZE]) -> Queue { diff --git a/fuzz/fuzz_targets/watchdog.rs b/fuzz/fuzz_targets/watchdog.rs index 31361755df..60f4afab55 100644 --- a/fuzz/fuzz_targets/watchdog.rs +++ b/fuzz/fuzz_targets/watchdog.rs @@ -84,6 +84,15 @@ impl VirtioInterrupt for NoopVirtioInterrupt { fn trigger(&self, _int_type: VirtioInterruptType) -> std::result::Result<(), std::io::Error> { Ok(()) } + + fn set_notifier( + &self, + _interrupt: u32, + _eventfd: Option, + _vm: &dyn hypervisor::Vm, + ) -> std::io::Result<()> { + unimplemented!() + } } fn setup_virt_queue(bytes: &[u8; QUEUE_DATA_SIZE]) -> Queue { diff --git a/virtio-devices/Cargo.toml b/virtio-devices/Cargo.toml index 41b9da8e0a..d2658eeeca 100644 --- a/virtio-devices/Cargo.toml +++ b/virtio-devices/Cargo.toml @@ -17,6 +17,7 @@ block = { path = "../block" } byteorder = { workspace = true } epoll = { workspace = true } event_monitor = { path = "../event_monitor" } +hypervisor = { path = "../hypervisor" } libc = { workspace = true } log = { workspace = true } mshv-ioctls = { 
workspace = true, optional = true } diff --git a/virtio-devices/src/device.rs b/virtio-devices/src/device.rs index 4b5cdaf03f..91b742a0bc 100644 --- a/virtio-devices/src/device.rs +++ b/virtio-devices/src/device.rs @@ -37,6 +37,12 @@ pub trait VirtioInterrupt: Send + Sync { fn notifier(&self, _int_type: VirtioInterruptType) -> Option { None } + fn set_notifier( + &self, + int_type: u32, + notifier: Option, + vm: &dyn hypervisor::Vm, + ) -> std::io::Result<()>; } #[derive(Clone)] diff --git a/virtio-devices/src/transport/pci_device.rs b/virtio-devices/src/transport/pci_device.rs index 7f049070d6..70e03d0287 100644 --- a/virtio-devices/src/transport/pci_device.rs +++ b/virtio-devices/src/transport/pci_device.rs @@ -914,6 +914,15 @@ impl VirtioInterrupt for VirtioInterruptMsix { self.interrupt_source_group .notifier(vector as InterruptIndex) } + + fn set_notifier( + &self, + _interrupt: u32, + _eventfd: Option, + _vm: &dyn hypervisor::Vm, + ) -> std::io::Result<()> { + unimplemented!() + } } impl PciDevice for VirtioPciDevice { diff --git a/virtio-devices/src/vsock/mod.rs b/virtio-devices/src/vsock/mod.rs index 7895587855..34561f5d46 100644 --- a/virtio-devices/src/vsock/mod.rs +++ b/virtio-devices/src/vsock/mod.rs @@ -188,6 +188,15 @@ pub mod unit_tests { ) -> std::result::Result<(), std::io::Error> { Ok(()) } + + fn set_notifier( + &self, + _interrupt: u32, + _eventfd: Option, + _vm: &dyn hypervisor::Vm, + ) -> std::io::Result<()> { + unimplemented!() + } } pub struct TestBackend { From 9f62c33d0064b5687343c409753138b74aa5deab Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Thu, 12 Mar 2026 22:07:43 -0400 Subject: [PATCH 180/742] vmm: Support external FDs for InterruptSourceGroup This allows creating an InterruptSourceGroup with an externally provided file descriptor. It also allows changing the file descriptor afterwards. 
Signed-off-by: Demi Marie Obenour --- vmm/src/interrupt.rs | 49 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 7 deletions(-) diff --git a/vmm/src/interrupt.rs b/vmm/src/interrupt.rs index 70a58dfb12..07727d1ad6 100644 --- a/vmm/src/interrupt.rs +++ b/vmm/src/interrupt.rs @@ -21,13 +21,16 @@ pub type Result = std::io::Result; struct InterruptRoute { gsi: u32, - irq_fd: EventFd, + irq_fd: Option, registered: bool, } impl InterruptRoute { pub fn new(allocator: &mut SystemAllocator) -> Result { - let irq_fd = EventFd::new(libc::EFD_NONBLOCK)?; + Self::new_with_fd(allocator, Some(EventFd::new(libc::EFD_NONBLOCK)?)) + } + + pub fn new_with_fd(allocator: &mut SystemAllocator, irq_fd: Option) -> Result { let gsi = allocator .allocate_gsi() .ok_or_else(|| io::Error::other("Failed allocating new GSI"))?; @@ -41,8 +44,10 @@ impl InterruptRoute { pub fn enable(&mut self, vm: &dyn hypervisor::Vm) -> Result<()> { if !self.registered { - vm.register_irqfd(&self.irq_fd, self.gsi) - .map_err(|e| io::Error::other(format!("Failed registering irq_fd: {e}")))?; + if let Some(ref irq_fd) = self.irq_fd { + vm.register_irqfd(irq_fd, self.gsi) + .map_err(|e| io::Error::other(format!("Failed registering irq_fd: {e}")))?; + } // Update internals to track the irq_fd as "registered". self.registered = true; @@ -53,8 +58,10 @@ impl InterruptRoute { pub fn disable(&mut self, vm: &dyn hypervisor::Vm) -> Result<()> { if self.registered { - vm.unregister_irqfd(&self.irq_fd, self.gsi) - .map_err(|e| io::Error::other(format!("Failed unregistering irq_fd: {e}")))?; + if let Some(ref irq_fd) = self.irq_fd { + vm.unregister_irqfd(irq_fd, self.gsi) + .map_err(|e| io::Error::other(format!("Failed unregistering irq_fd: {e}")))?; + } // Update internals to track the irq_fd as "unregistered". 
self.registered = false; @@ -64,16 +71,44 @@ impl InterruptRoute { } pub fn trigger(&mut self) -> Result<()> { - self.irq_fd.write(1) + match self.irq_fd { + Some(ref fd) => fd.write(1), + None => Ok(()), + } } pub fn notifier(&mut self) -> Option { Some( self.irq_fd + .as_ref()? .try_clone() .expect("Failed cloning interrupt's EventFd"), ) } + + #[allow(dead_code)] + pub fn set_notifier( + &mut self, + eventfd: Option, + vm: &dyn hypervisor::Vm, + ) -> Result<()> { + let old_irqfd = core::mem::replace(&mut self.irq_fd, eventfd); + if self.registered { + if let Some(ref irq_fd) = self.irq_fd { + vm.register_irqfd(irq_fd, self.gsi) + .map_err(|e| io::Error::other(format!("Failed registering irq_fd: {e}")))? + } + // If the irqfd cannot be unregistered, what to do? Spin? + // Returning an error isn't helpful as the new irqfd is already registered. + if let Some(old_irq_fd) = old_irqfd { + match vm.unregister_irqfd(&old_irq_fd, self.gsi) { + Ok(()) => {} + Err(e) => log::warn!("Failed unregistering old irqfd: {e}"), + } + } + } + Ok(()) + } } pub struct RoutingEntry { From d609410b8ba5c1e44ec10697bb48aebddc36b47b Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Wed, 11 Feb 2026 20:08:45 -0500 Subject: [PATCH 181/742] pci: Support injecting interrupts from externally-provided irqfds The virtio vhost-user device backend prefers to use externally-provided eventfds as irqfds. This allows the frontend VM to notify the backend VM directly, without the need for a userspace proxy process. Since the frontend can provide irqfds at any time, the backend needs to register and unregister irqfds dynamically. This is tricky because the functions that access the irqfd table all take `&self`, not `&mut self`. The obvious solution to this problem is to wrap the table in a mutex. Most of these functions are not called on hot paths, but `.notifier()` is called whenever Cloud Hypervisor needs to inject an interrupt into a guest. 
Most devices don't need to register irqfds at runtime, and for them, slowing down interrupt injection would be wasteful. Instead, require devices to opt-in to irqfd registration. The irqfd table now comes in two forms: one that contains a mutex and one that does not. The one containing a mutex can be mutated freely, while attempting to mutate the one that does not will panic. Right now, no code registers irqfds at runtime, but this will change in subsequent commits. Signed-off-by: Demi Marie Obenour --- pci/src/lib.rs | 5 +- pci/src/msix.rs | 62 ++++++++++++++++++++-- pci/src/vfio.rs | 4 +- virtio-devices/src/device.rs | 12 +++++ virtio-devices/src/transport/pci_device.rs | 52 ++++++++++-------- vm-device/src/interrupt/mod.rs | 42 ++++++++++++++- vmm/src/device_manager.rs | 4 +- vmm/src/interrupt.rs | 46 +++++++++++++++- 8 files changed, 194 insertions(+), 33 deletions(-) diff --git a/pci/src/lib.rs b/pci/src/lib.rs index 5ab87cf19d..17c3ab7235 100644 --- a/pci/src/lib.rs +++ b/pci/src/lib.rs @@ -32,7 +32,10 @@ pub use self::device::{ BarReprogrammingParams, DeviceRelocation, Error as PciDeviceError, PciDevice, }; pub use self::msi::{MsiCap, MsiConfig, msi_num_enabled_vectors}; -pub use self::msix::{MSIX_CONFIG_ID, MSIX_TABLE_ENTRY_SIZE, MsixCap, MsixConfig, MsixTableEntry}; +pub use self::msix::{ + MSIX_CONFIG_ID, MSIX_TABLE_ENTRY_SIZE, MaybeMutInterruptSourceGroup, MsixCap, MsixConfig, + MsixTableEntry, +}; pub use self::vfio::{MmioRegion, VfioDmaMapping, VfioPciDevice, VfioPciError}; pub use self::vfio_user::{VfioUserDmaMapping, VfioUserPciDevice, VfioUserPciDeviceError}; diff --git a/pci/src/msix.rs b/pci/src/msix.rs index 9bc5e63f3a..49b379b02a 100644 --- a/pci/src/msix.rs +++ b/pci/src/msix.rs @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause // -use std::sync::Arc; +use std::sync::{Arc, Mutex}; use std::{io, result}; use byteorder::{ByteOrder, LittleEndian}; @@ -15,6 +15,7 @@ use vm_device::interrupt::{ }; use vm_memory::ByteValued; use
vm_migration::{MigratableError, Pausable, Snapshot, Snapshottable}; +use vmm_sys_util::eventfd::EventFd; use crate::{PciCapability, PciCapabilityId}; @@ -72,11 +73,66 @@ pub struct MsixConfigState { enabled: bool, } +#[derive(Clone)] +pub enum MaybeMutInterruptSourceGroup { + Immutable(Arc), + Mutable(Arc>), +} + +macro_rules! impl_method { + ($( + fn $i: ident(&self $(,$index:ident : $InterruptIndex:ty)*$(,)?) -> $r: ty; + )*) => { + $( + fn $i(&self $(,$index: $InterruptIndex)*) -> $r { + match self { + Self::Immutable(source) => source.$i($($index),*), + Self::Mutable(source) => source.lock().unwrap().$i($($index),*), + } + } + )* + }; +} + +impl InterruptSourceGroup for MaybeMutInterruptSourceGroup { + impl_method! { + fn trigger(&self, index: InterruptIndex) -> vm_device::interrupt::Result<()>; + + fn notifier(&self, index: InterruptIndex) -> Option; + + fn update( + &self, + index: InterruptIndex, + config: InterruptSourceConfig, + masked: bool, + set_gsi: bool, + ) -> vm_device::interrupt::Result<()>; + + fn set_gsi(&self) -> vm_device::interrupt::Result<()>; + } +} + +impl MaybeMutInterruptSourceGroup { + pub fn set_notifier( + &self, + index: InterruptIndex, + eventfd: Option, + vm: &dyn hypervisor::Vm, + ) -> std::io::Result<()> { + match self { + Self::Immutable(_) => panic!( + "Attempted to set a notifier of an immutable source. You must mark your device as needing a mutable source by having sets_irqfd() return true." 
+ ), + Self::Mutable(source) => source.lock().unwrap().set_notifier(index, eventfd, vm), + } + } +} + pub struct MsixConfig { pub table_entries: Vec, pub pba_entries: Vec, pub devid: u32, - interrupt_source_group: Arc, + interrupt_source_group: MaybeMutInterruptSourceGroup, masked: bool, enabled: bool, } @@ -84,7 +140,7 @@ pub struct MsixConfig { impl MsixConfig { pub fn new( msix_vectors: u16, - interrupt_source_group: Arc, + interrupt_source_group: MaybeMutInterruptSourceGroup, devid: u32, state: Option, ) -> result::Result { diff --git a/pci/src/vfio.rs b/pci/src/vfio.rs index 9e8e7e3163..e46e276aa6 100644 --- a/pci/src/vfio.rs +++ b/pci/src/vfio.rs @@ -37,7 +37,7 @@ use vmm_sys_util::eventfd::EventFd; use crate::mmap::MmapRegion; use crate::msi::{MSI_CONFIG_ID, MsiConfigState}; -use crate::msix::MsixConfigState; +use crate::msix::{MaybeMutInterruptSourceGroup, MsixConfigState}; use crate::{ BarReprogrammingParams, MSIX_CONFIG_ID, MSIX_TABLE_ENTRY_SIZE, MsiCap, MsiConfig, MsixCap, MsixConfig, PCI_CONFIGURATION_ID, PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, @@ -863,7 +863,7 @@ impl VfioCommon { let msix_config = MsixConfig::new( msix_cap.table_size(), - interrupt_source_group.clone(), + MaybeMutInterruptSourceGroup::Immutable(interrupt_source_group.clone()), bdf.into(), state, ) diff --git a/virtio-devices/src/device.rs b/virtio-devices/src/device.rs index 91b742a0bc..f0673f5614 100644 --- a/virtio-devices/src/device.rs +++ b/virtio-devices/src/device.rs @@ -80,6 +80,18 @@ pub trait VirtioDevice: Send { /// The maximum size of each queue that this device supports. fn queue_max_sizes(&self) -> &[u16]; + /// Whether the device needs to register extra irqfds at runtime + /// from external sources. + /// The default is false. If this is true, locking is required for + /// most operations involving interrupts (but not for sending) + /// interrupts from external irqfds). 
+ /// + /// If the device claims to not need to register irqfds, but + /// attempts to do so, a panic will ensue. + fn interrupt_source_mutable(&self) -> bool { + false + } + /// The set of feature bits that this device supports. fn features(&self) -> u64 { 0 diff --git a/virtio-devices/src/transport/pci_device.rs b/virtio-devices/src/transport/pci_device.rs index 70e03d0287..3e2a96ccd9 100644 --- a/virtio-devices/src/transport/pci_device.rs +++ b/virtio-devices/src/transport/pci_device.rs @@ -17,9 +17,10 @@ use anyhow::anyhow; use libc::EFD_NONBLOCK; use log::{error, info}; use pci::{ - BarReprogrammingParams, MsixCap, MsixConfig, PciBarConfiguration, PciBarRegionType, - PciCapability, PciCapabilityId, PciClassCode, PciConfiguration, PciDevice, PciDeviceError, - PciHeaderType, PciMassStorageSubclass, PciNetworkControllerSubclass, PciSubclass, + BarReprogrammingParams, MaybeMutInterruptSourceGroup, MsixCap, MsixConfig, PciBarConfiguration, + PciBarRegionType, PciCapability, PciCapabilityId, PciClassCode, PciConfiguration, PciDevice, + PciDeviceError, PciHeaderType, PciMassStorageSubclass, PciNetworkControllerSubclass, + PciSubclass, }; use serde::{Deserialize, Serialize}; use thiserror::Error; @@ -359,7 +360,7 @@ pub struct VirtioPciDevice { // PCI interrupts. 
interrupt_status: Arc, virtio_interrupt: Option>, - interrupt_source_group: Arc, + interrupt_source_group: MaybeMutInterruptSourceGroup, // virtio queues queues: Vec, @@ -433,17 +434,26 @@ impl VirtioPciDevice { let pci_device_id = VIRTIO_PCI_DEVICE_ID_BASE + locked_device.device_type() as u16; - let interrupt_source_group = interrupt_manager - .create_group(MsiIrqGroupConfig { + let interrupt_source_group: MaybeMutInterruptSourceGroup = { + let config = MsiIrqGroupConfig { base: 0, count: msix_num as InterruptIndex, + }; + (if locked_device.interrupt_source_mutable() { + interrupt_manager + .create_group_mut(config) + .map(MaybeMutInterruptSourceGroup::Mutable) + } else { + interrupt_manager + .create_group(config) + .map(MaybeMutInterruptSourceGroup::Immutable) }) .map_err(|e| { VirtioPciDeviceError::CreateVirtioPciDevice(anyhow!( "Failed creating MSI interrupt group: {e}" )) - })?; - + })? + }; let msix_state = vm_migration::state_from_id(snapshot, pci::MSIX_CONFIG_ID).map_err(|e| { VirtioPciDeviceError::CreateVirtioPciDevice(anyhow!( @@ -452,14 +462,11 @@ impl VirtioPciDevice { })?; let (msix_config, msix_config_clone) = if msix_num > 0 { + let interrupt_source_group: MaybeMutInterruptSourceGroup = + interrupt_source_group.clone(); let msix_config = Arc::new(Mutex::new( - MsixConfig::new( - msix_num, - interrupt_source_group.clone(), - pci_device_bdf, - msix_state, - ) - .unwrap(), + MsixConfig::new(msix_num, interrupt_source_group, pci_device_bdf, msix_state) + .unwrap(), )); let msix_config_clone = msix_config.clone(); (Some(msix_config), Some(msix_config_clone)) @@ -598,7 +605,7 @@ impl VirtioPciDevice { memory, settings_bar: 0, use_64bit_bar, - interrupt_source_group, + interrupt_source_group: interrupt_source_group.clone(), cap_pci_cfg_info, bar_regions: vec![], activate_evt, @@ -855,7 +862,7 @@ pub struct VirtioInterruptMsix { msix_config: Arc>, config_vector: Arc, queues_vectors: Arc>>, - interrupt_source_group: Arc, + interrupt_source_group: 
MaybeMutInterruptSourceGroup, } impl VirtioInterruptMsix { @@ -863,7 +870,7 @@ impl VirtioInterruptMsix { msix_config: Arc>, config_vector: Arc, queues_vectors: Arc>>, - interrupt_source_group: Arc, + interrupt_source_group: MaybeMutInterruptSourceGroup, ) -> Self { VirtioInterruptMsix { msix_config, @@ -917,11 +924,12 @@ impl VirtioInterrupt for VirtioInterruptMsix { fn set_notifier( &self, - _interrupt: u32, - _eventfd: Option, - _vm: &dyn hypervisor::Vm, + interrupt: u32, + eventfd: Option, + vm: &dyn hypervisor::Vm, ) -> std::io::Result<()> { - unimplemented!() + self.interrupt_source_group + .set_notifier(interrupt, eventfd, vm) } } diff --git a/vm-device/src/interrupt/mod.rs b/vm-device/src/interrupt/mod.rs index 342cbe0631..e9b0180d29 100644 --- a/vm-device/src/interrupt/mod.rs +++ b/vm-device/src/interrupt/mod.rs @@ -57,7 +57,8 @@ //! * The virtual device backend requests the interrupt manager to create an interrupt group //! according to guest configuration information -use std::sync::Arc; +use std::io::{Error, ErrorKind}; +use std::sync::{Arc, Mutex}; pub use hypervisor::{InterruptSourceConfig, LegacyIrqSourceConfig, MsiIrqSourceConfig}; use vmm_sys_util::eventfd::EventFd; @@ -107,6 +108,30 @@ pub trait InterruptManager: Send + Sync { /// * count: number of Interrupt Sources to be managed by the group object. fn create_group(&self, config: Self::GroupConfig) -> Result>; + /// Create an [InterruptSourceGroup](trait.InterruptSourceGroup.html) object to manage + /// interrupt sources for a virtual device + /// + /// An [InterruptSourceGroup](trait.InterruptSourceGroup.html) object manages all interrupt + /// sources of the same type for a virtual device. + /// + /// This is the same as [`Self::create_group`], except that the returned + /// [`InterruptSourceGroup`] allows setting the irqfd used as notifier via + /// [`InterruptSourceGroup::set_notifier`]. + /// + /// # Arguments + /// * interrupt_type: type of interrupt source. 
+ /// * base: base Interrupt Source ID to be managed by the group object. + /// * count: number of Interrupt Sources to be managed by the group object. + fn create_group_mut( + &self, + _config: Self::GroupConfig, + ) -> Result>> { + Err(Error::new( + ErrorKind::Unsupported, + "setting notifiers not supported", + )) + } + /// Destroy an [InterruptSourceGroup](trait.InterruptSourceGroup.html) object created by /// [create_group()](trait.InterruptManager.html#tymethod.create_group). /// @@ -137,7 +162,7 @@ pub trait InterruptSourceGroup: Send + Sync { /// Returns an interrupt notifier from this interrupt. /// /// An interrupt notifier allows for external components and processes - /// to inject interrupts into a guest, by writing to the file returned + /// to inject interrupts into a guest, by writing to the [`EventFd`] returned /// by this method. #[allow(unused_variables)] fn notifier(&self, index: InterruptIndex) -> Option; @@ -159,4 +184,17 @@ pub trait InterruptSourceGroup: Send + Sync { /// Set the interrupt group GSI routing table. fn set_gsi(&self) -> Result<()>; + + /// Sets the [`EventFd`] used to trigger interrupts. + fn set_notifier( + &mut self, + _index: InterruptIndex, + _eventfd: Option, + _vm: &dyn hypervisor::Vm, + ) -> Result<()> { + Err(Error::new( + ErrorKind::Unsupported, + "setting notifiers not supported", + )) + } } diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index e560b02d7a..958d3086b1 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -4201,8 +4201,8 @@ impl DeviceManager { return Err(DeviceManagerError::MissingNode); } - // Allows support for one MSI-X vector per queue. It also adds 1 - // as we need to take into account the dedicated vector to notify + // Allows support for one MSI-X vector per interrupt needed by the device. + // It also adds 1 as we need to take into account the dedicated vector to notify // about a virtio config change. 
let msix_num = (virtio_device.lock().unwrap().queue_max_sizes().len() + 1) as u16; diff --git a/vmm/src/interrupt.rs b/vmm/src/interrupt.rs index 07727d1ad6..0995d83567 100644 --- a/vmm/src/interrupt.rs +++ b/vmm/src/interrupt.rs @@ -86,6 +86,9 @@ impl InterruptRoute { ) } + // This is currently not used, but the upcoming vhost-guest feature + // will use it. Use #[allow(dead_code)] to suppress a compiler + // warning. #[allow(dead_code)] pub fn set_notifier( &mut self, @@ -96,7 +99,7 @@ impl InterruptRoute { if self.registered { if let Some(ref irq_fd) = self.irq_fd { vm.register_irqfd(irq_fd, self.gsi) - .map_err(|e| io::Error::other(format!("Failed registering irq_fd: {e}")))? + .map_err(|e| io::Error::other(format!("Failed registering irq_fd: {e}")))?; } // If the irqfd cannot be unregistered, what to do? Spin? // Returning an error isn't helpful as the new irqfd is already registered. @@ -235,6 +238,19 @@ impl InterruptSourceGroup for MsiInterruptGroup { let routes = self.gsi_msi_routes.lock().unwrap(); self.set_gsi_routes(&routes) } + + fn set_notifier( + &mut self, + index: InterruptIndex, + eventfd: Option, + vm: &dyn hypervisor::Vm, + ) -> Result<()> { + if let Some(route) = self.irq_routes.get(&index) { + return route.lock().unwrap().set_notifier(eventfd, vm); + } + + Ok(()) + } } pub struct LegacyUserspaceInterruptGroup { @@ -323,6 +339,26 @@ impl InterruptManager for LegacyUserspaceInterruptManager { } } +impl MsiInterruptManager { + fn create_group_raw( + &self, + config: ::GroupConfig, + ) -> Result { + let mut allocator = self.allocator.lock().unwrap(); + let mut irq_routes: HashMap> = + HashMap::with_capacity(config.count as usize); + for i in config.base..config.base + config.count { + irq_routes.insert(i, Mutex::new(InterruptRoute::new(&mut allocator)?)); + } + + Ok(MsiInterruptGroup::new( + self.vm.clone(), + self.gsi_msi_routes.clone(), + irq_routes, + )) + } +} + impl InterruptManager for MsiInterruptManager { type GroupConfig = 
MsiIrqGroupConfig; @@ -341,6 +377,14 @@ impl InterruptManager for MsiInterruptManager { ))) } + fn create_group_mut( + &self, + config: Self::GroupConfig, + ) -> vm_device::interrupt::Result>> { + let r = self.create_group_raw(config)?; + Ok(Arc::new(Mutex::new(r))) + } + fn destroy_group(&self, _group: Arc) -> Result<()> { Ok(()) } From 3e2e453d89813d5bef2e51e2b3673451ba6fd946 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Tue, 17 Mar 2026 17:46:36 +0100 Subject: [PATCH 182/742] block: Honor unmap flag in write zeroes requests The write zeroes segment descriptor (struct virtio_blk_discard_write_zeroes, virtio spec v1.2 section 5.2.6) includes a flags field with an unmap bit. Per section 5.2.6.2, if unmap is set, the device MAY deallocate the specified range of sectors in the device backend storage, as if the discard command had been sent. Read the flags field and when the unmap bit is set, use punch_hole to deallocate the range. Otherwise continue using write_zeroes via ZERO_RANGE which preserves allocation. This allows the guest to reclaim host disk space through write zeroes requests on thin provisioned images. 
Signed-off-by: Anatol Belski --- block/src/lib.rs | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/block/src/lib.rs b/block/src/lib.rs index c02b315cc2..2c8556c395 100644 --- a/block/src/lib.rs +++ b/block/src/lib.rs @@ -621,15 +621,19 @@ impl Request { } let mut wz_sector = [0u8; 8]; let mut wz_num_sectors = [0u8; 4]; + let mut wz_flags = [0u8; 4]; mem.read_slice(&mut wz_sector, data_addr) .map_err(ExecuteError::Read)?; mem.read_slice(&mut wz_num_sectors, data_addr.checked_add(8).unwrap()) .map_err(ExecuteError::Read)?; + mem.read_slice(&mut wz_flags, data_addr.checked_add(12).unwrap()) + .map_err(ExecuteError::Read)?; let wz_sector = u64::from_le_bytes(wz_sector); let wz_num_sectors = u32::from_le_bytes(wz_num_sectors); + let wz_flags = u32::from_le_bytes(wz_flags); let wz_offset = wz_sector * SECTOR_SIZE; if wz_offset == 0 && disable_sector0_writes { @@ -637,9 +640,15 @@ impl Request { } let wz_length = (wz_num_sectors as u64) * SECTOR_SIZE; - disk_image - .write_zeroes(wz_offset, wz_length, user_data) - .map_err(ExecuteError::AsyncWriteZeroes)?; + if wz_flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP != 0 { + disk_image + .punch_hole(wz_offset, wz_length, user_data) + .map_err(ExecuteError::AsyncPunchHole)?; + } else { + disk_image + .write_zeroes(wz_offset, wz_length, user_data) + .map_err(ExecuteError::AsyncWriteZeroes)?; + } } RequestType::Unsupported(t) => return Err(ExecuteError::Unsupported(t)), } From 26035df8e5ec737b178ba22eb24b2e0f85741887 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Tue, 17 Mar 2026 21:37:29 +0100 Subject: [PATCH 183/742] tests: add integration test for WRITE_ZEROES with UNMAP flag Add test_virtio_block_write_zeroes_unmap_raw to verify that the VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP code path works correctly with raw disk images.
The test creates a 128M raw disk and writes 64M of random data, then uses fallocate --punch-hole on the guest block device, which the Linux virtio-blk driver translates to VIRTIO_BLK_T_WRITE_ZEROES with VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP set. It then verifies: - the zeroed region reads back as zero from the guest - the host file became sparse (punch_hole succeeded) - FIEMAP confirms the file has holes Signed-off-by: Anatol Belski --- cloud-hypervisor/tests/integration.rs | 88 +++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 7218cbb393..6bcbecddbc 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -7764,6 +7764,94 @@ mod common_parallel { _test_virtio_block_discard_with_backend("raw", "raw", &[], true, false, true); } + #[test] + fn test_virtio_block_write_zeroes_unmap_raw() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)); + + let test_disk_path = guest.tmp_dir.as_path().join("write_zeroes_unmap_test.raw"); + + let res = exec_host_command_output(&format!( + "dd if=/dev/zero of={} bs=1M count=128", + test_disk_path.to_str().unwrap() + )); + assert!(res.status.success(), "Failed to create raw test image"); + + let mut child = GuestCommand::new(&guest) + .default_cpus() + .default_memory() + .default_kernel_cmdline() + .args([ + "--disk", + format!( + "path={}", + guest.disk_config.disk(DiskType::OperatingSystem).unwrap() + ) + .as_str(), + format!( + "path={}", + guest.disk_config.disk(DiskType::CloudInit).unwrap() + ) + .as_str(), + format!("path={},image_type=raw", test_disk_path.to_str().unwrap()).as_str(), + ]) + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + assert_eq!( + guest + .ssh_command("lsblk | grep -c vdc") + .unwrap() + .trim() + 
.parse::() + .unwrap_or_default(), + 1 + ); + + let wz_max = guest + .ssh_command("cat /sys/block/vdc/queue/write_zeroes_max_bytes") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(); + assert!( + wz_max > 0, + "write_zeroes_max_bytes={wz_max}, VIRTIO_BLK_F_WRITE_ZEROES not negotiated" + ); + + guest + .ssh_command("sudo dd if=/dev/urandom of=/dev/vdc bs=1M count=64 oflag=direct") + .unwrap(); + guest.ssh_command("sync").unwrap(); + + // fallocate --punch-hole on a block device sends + // WRITE_ZEROES with VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP set. + let result = guest + .ssh_command("sudo fallocate -p -o 0 -l 67108864 /dev/vdc 2>&1 || true") + .unwrap(); + assert!( + !result.contains("Operation not supported") && !result.contains("not supported"), + "fallocate --punch-hole failed: {result}" + ); + guest.ssh_command("sync").unwrap(); + + assert_guest_disk_region_is_zero(&guest, "/dev/vdc", 0, 4096 * 256); + + let test_disk_str = test_disk_path.to_str().unwrap(); + verify_sparse_file(test_disk_str, 1.0); + verify_fiemap_extents(test_disk_str, "raw"); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + handle_child_output(r, &output); + } + #[test] fn test_virtio_block_discard_unsupported_vhd() { _test_virtio_block_discard("vhd", "vpc", &["-o", "subformat=fixed"], false, false); From 0e7b2d68fc2e278a15bea7e12f39ad766059eef8 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 18 Mar 2026 00:08:45 +0100 Subject: [PATCH 184/742] performance-metrics: Add test exclude filter support Add a --test-exclude flag that excludes tests matching the provided keywords. Both --test-filter and --test-exclude are now applied before --list-tests, so listing respects the active filters. 
Signed-off-by: Anatol Belski --- performance-metrics/src/main.rs | 37 ++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/performance-metrics/src/main.rs b/performance-metrics/src/main.rs index c648d189c7..2f6b8bdb50 100644 --- a/performance-metrics/src/main.rs +++ b/performance-metrics/src/main.rs @@ -1301,6 +1301,13 @@ fn main() { .num_args(1) .required(false), ) + .arg( + Arg::new("test-exclude") + .long("test-exclude") + .help("Exclude metrics tests matching the provided keywords") + .num_args(1) + .required(false), + ) .arg( Arg::new("list-tests") .long("list-tests") @@ -1346,19 +1353,31 @@ fn main() { .filter(|t| !(cfg!(target_arch = "aarch64") && t.name == "virtio_net_latency_us")) .collect(); + let test_filter = match cmd_arguments.get_many::("test-filter") { + Some(s) => s.collect(), + None => Vec::new(), + }; + + let test_exclude = match cmd_arguments.get_many::("test-exclude") { + Some(s) => s.collect(), + None => Vec::new(), + }; + + // Determine which tests will actually run. + let tests_to_run: Vec<&&PerformanceTest> = test_list + .iter() + .filter(|t| test_filter.is_empty() || test_filter.iter().any(|&s| t.name.contains(s))) + .filter(|t| !test_exclude.iter().any(|&s| t.name.contains(s))) + .collect(); + if cmd_arguments.get_flag("list-tests") { - for test in test_list.iter() { + for test in tests_to_run.iter() { println!("\"{}\" ({})", test.name, test.control); } return; } - let test_filter = match cmd_arguments.get_many::("test-filter") { - Some(s) => s.collect(), - None => Vec::new(), - }; - // Run performance tests sequentially and report results (in both readable/json format) let mut metrics_report: MetricsReport = Default::default(); @@ -1380,12 +1399,6 @@ fn main() { .unwrap_or_default(), }); - // Determine which tests will actually run. 
- let tests_to_run: Vec<&&PerformanceTest> = test_list - .iter() - .filter(|t| test_filter.is_empty() || test_filter.iter().any(|&s| t.name.contains(s))) - .collect(); - // Skip heavy VM level init/cleanup when only micro benchmarks are selected. let needs_vm_tests = tests_to_run.iter().any(|t| !t.name.starts_with("micro_")); From 07479536abcd087b18dc71a62e996966af5552af Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 18 Mar 2026 00:09:22 +0100 Subject: [PATCH 185/742] scripts: Wire --test-exclude through test harness Add --test-exclude to process_common_args in test-util.sh and forward it to the performance-metrics binary from run_metrics.sh. Signed-off-by: Anatol Belski --- scripts/run_metrics.sh | 4 ++++ scripts/test-util.sh | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/scripts/run_metrics.sh b/scripts/run_metrics.sh index f70ccf1cde..88c207b322 100755 --- a/scripts/run_metrics.sh +++ b/scripts/run_metrics.sh @@ -108,6 +108,10 @@ if [ -n "$test_filter" ]; then test_binary_args+=("--test-filter $test_filter") fi +if [ -n "$test_exclude" ]; then + test_binary_args+=("--test-exclude $test_exclude") +fi + # Ensure that git commands can be run in this directory (for metrics report) git config --global --add safe.directory "$PWD" diff --git a/scripts/test-util.sh b/scripts/test-util.sh index 8958439330..5b414f8583 100644 --- a/scripts/test-util.sh +++ b/scripts/test-util.sh @@ -122,6 +122,10 @@ process_common_args() { shift test_filter="$1" ;; + "--test-exclude") + shift + test_exclude="$1" + ;; "--build-guest-kernel") build_kernel=true ;; From f4772e7f4c58d72eb8e03e729c81a2e0a3432dfc Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 18 Mar 2026 00:10:37 +0100 Subject: [PATCH 186/742] ci: Exclude micro benchmarks from metrics CI Skip micro_ prefixed tests in the metrics CI workflow to avoid dashboard pollution. They can still be run on demand via --test-filter micro_. 
Signed-off-by: Anatol Belski --- .github/workflows/integration-metrics.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration-metrics.yaml b/.github/workflows/integration-metrics.yaml index 952e938fdf..4e66f4b614 100644 --- a/.github/workflows/integration-metrics.yaml +++ b/.github/workflows/integration-metrics.yaml @@ -17,6 +17,6 @@ jobs: fetch-depth: 0 - name: Run metrics tests timeout-minutes: 60 - run: scripts/dev_cli.sh tests --metrics -- -- --report-file /root/workloads/metrics.json + run: scripts/dev_cli.sh tests --metrics -- --test-exclude micro_ -- --report-file /root/workloads/metrics.json - name: Upload metrics report run: 'curl -X PUT https://ch-metrics.azurewebsites.net/api/publishmetrics -H "x-functions-key: $METRICS_PUBLISH_KEY" -T ~/workloads/metrics.json' From 068b5ecb6318ae7ec56a1025cff174b953708609 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Tue, 17 Mar 2026 09:30:01 -0700 Subject: [PATCH 187/742] vmm: Add support for resuming automatically on restore Add an option that can be used when restoring to resume the VM. This is particularly useful when restoring the VM via the direct VMM command line, when you might not want/have an API socket configured. 
Signed-off-by: Rob Bradford --- cloud-hypervisor/tests/integration.rs | 83 +++++++++++++++++++-------- docs/snapshot_restore.md | 9 +++ vmm/src/config.rs | 32 ++++++++++- vmm/src/lib.rs | 18 ++++-- 4 files changed, 108 insertions(+), 34 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 6bcbecddbc..6e4f8c1fec 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -10814,16 +10814,22 @@ mod common_sequential { #[test] #[cfg(not(feature = "mshv"))] fn test_snapshot_restore_hotplug_virtiomem() { - _test_snapshot_restore(true); + _test_snapshot_restore(true, false); } #[test] #[cfg(not(feature = "mshv"))] // See issue #7437 fn test_snapshot_restore_basic() { - _test_snapshot_restore(false); + _test_snapshot_restore(false, false); } - fn _test_snapshot_restore(use_hotplug: bool) { + #[test] + #[cfg(not(feature = "mshv"))] + fn test_snapshot_restore_with_resume() { + _test_snapshot_restore(false, true); + } + + fn _test_snapshot_restore(use_hotplug: bool, use_resume_option: bool) { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); let guest = Guest::new(Box::new(disk_config)); let kernel_path = direct_kernel_boot_path(); @@ -10975,7 +10981,7 @@ mod common_sequential { ]) .args([ "--restore", - format!("source_url=file://{snapshot_dir}").as_str(), + format!("source_url=file://{snapshot_dir},resume={use_resume_option}").as_str(), ]) .capture_output() .spawn() @@ -11005,28 +11011,12 @@ mod common_sequential { &expected_events, &event_path_restored )); - let latest_events = [&MetaEvent { - event: "restored".to_string(), - device_id: None, - }]; - assert!(check_latest_events_exact( - &latest_events, - &event_path_restored - )); - - // Remove the snapshot dir - let _ = remove_dir_all(snapshot_dir.as_str()); - - let r = std::panic::catch_unwind(|| { - // Resume the VM - assert!(remote_command(&api_socket_restored, "resume", None)); - // There is no way that 
we can ensure the 'write()' to the - // event file is completed when the 'resume' request is - // returned successfully, because the 'write()' was done - // asynchronously from a different thread of Cloud - // Hypervisor (e.g. the event-monitor thread). - thread::sleep(std::time::Duration::new(1, 0)); + if use_resume_option { let latest_events = [ + &MetaEvent { + event: "restored".to_string(), + device_id: None, + }, &MetaEvent { event: "resuming".to_string(), device_id: None, @@ -11040,6 +11030,49 @@ mod common_sequential { &latest_events, &event_path_restored )); + } else { + let latest_events = [&MetaEvent { + event: "restored".to_string(), + device_id: None, + }]; + assert!(check_latest_events_exact( + &latest_events, + &event_path_restored + )); + } + + // Remove the snapshot dir + let _ = remove_dir_all(snapshot_dir.as_str()); + + let r = std::panic::catch_unwind(|| { + if use_resume_option { + // VM was automatically resumed via restore option, just wait for events + thread::sleep(std::time::Duration::new(1, 0)); + } else { + // Resume the VM manually + assert!(remote_command(&api_socket_restored, "resume", None)); + // There is no way that we can ensure the 'write()' to the + // event file is completed when the 'resume' request is + // returned successfully, because the 'write()' was done + // asynchronously from a different thread of Cloud + // Hypervisor (e.g. the event-monitor thread). 
+ thread::sleep(std::time::Duration::new(1, 0)); + + let latest_events = [ + &MetaEvent { + event: "resuming".to_string(), + device_id: None, + }, + &MetaEvent { + event: "resumed".to_string(), + device_id: None, + }, + ]; + assert!(check_latest_events_exact( + &latest_events, + &event_path_restored + )); + } // Perform same checks to validate VM has been properly restored assert_eq!(guest.get_cpu_count().unwrap_or_default(), 4); diff --git a/docs/snapshot_restore.md b/docs/snapshot_restore.md index 2cf8eda5a8..567f77a9a6 100644 --- a/docs/snapshot_restore.md +++ b/docs/snapshot_restore.md @@ -90,6 +90,15 @@ start using it. ./ch-remote --api-socket=/tmp/cloud-hypervisor.sock resume ``` +Alternatively, the `resume` option can be used to automatically resume the VM +after restore completes: + +```bash +./cloud-hypervisor \ + --api-socket /tmp/cloud-hypervisor.sock \ + --restore source_url=file:///home/foo/snapshot,resume=true +``` + At this point, the VM is fully restored and is identical to the VM which was snapshot earlier. diff --git a/vmm/src/config.rs b/vmm/src/config.rs index 8b284660a1..93ec6c8915 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -2603,17 +2603,20 @@ pub struct RestoreConfig { pub memory_restore_mode: MemoryRestoreMode, #[serde(default)] pub net_fds: Option>, + #[serde(default)] + pub resume: bool, } impl RestoreConfig { pub const SYNTAX: &'static str = "Restore from a VM snapshot. 
\ \nRestore parameters \"source_url=,prefault=on|off,memory_restore_mode=copy|ondemand,\ - net_fds=\" \ + net_fds=,resume=true|false\" \ \n`source_url` should be a valid URL (e.g file:///foo/bar or tcp://192.168.1.10/foo) \ \n`prefault` controls eager prefaulting for the copy-based restore path (disabled by default) \ \n`memory_restore_mode=copy` preserves the existing eager read-copy restore behavior, while `memory_restore_mode=ondemand` enables lazy demand paging and fails restore if userfaultfd support is unavailable \ \n`net_fds` is a list of net ids with new file descriptors. \ - Only net devices backed by FDs directly are needed as input."; + Only net devices backed by FDs directly are needed as input.\ + \n `resume` controls whether the VM will be directly resumed after restore "; pub fn parse(restore: &str) -> Result { let mut parser = OptionParser::new(); @@ -2621,7 +2624,8 @@ impl RestoreConfig { .add("source_url") .add("prefault") .add("memory_restore_mode") - .add("net_fds"); + .add("net_fds") + .add("resume"); parser.parse(restore).map_err(Error::ParseRestore)?; let source_url = parser @@ -2649,12 +2653,18 @@ impl RestoreConfig { }) .collect() }); + let resume = parser + .convert::("resume") + .map_err(Error::ParseRestore)? 
+ .unwrap_or(Toggle(false)) + .0; Ok(RestoreConfig { source_url, prefault, memory_restore_mode, net_fds, + resume, }) } @@ -4546,6 +4556,7 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" prefault: false, memory_restore_mode: MemoryRestoreMode::Copy, net_fds: None, + resume: false, } ); assert_eq!( @@ -4568,6 +4579,7 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" fds: Some(vec![5, 6, 7, 8]), } ]), + resume: false, } ); assert_eq!( @@ -4577,6 +4589,17 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" prefault: false, memory_restore_mode: MemoryRestoreMode::OnDemand, net_fds: None, + resume: false, + } + ); + assert_eq!( + RestoreConfig::parse("source_url=/path/to/snapshot,resume=on")?, + RestoreConfig { + source_url: PathBuf::from("/path/to/snapshot"), + prefault: false, + memory_restore_mode: MemoryRestoreMode::Copy, + net_fds: None, + resume: true, } ); // Parsing should fail as source_url is a required field @@ -4678,6 +4701,7 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" fds: Some(vec![7, 8]), }, ]), + resume: false, }; valid_config.validate(&snapshot_vm_config).unwrap(); @@ -4742,6 +4766,7 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" prefault: false, memory_restore_mode: MemoryRestoreMode::Copy, net_fds: None, + resume: false, }; snapshot_vm_config.net = Some(vec![NetConfig { id: Some("net2".to_owned()), @@ -4755,6 +4780,7 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" prefault: true, memory_restore_mode: MemoryRestoreMode::OnDemand, net_fds: None, + resume: false, }; assert_eq!( invalid_restore_mode.validate(&snapshot_vm_config), diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 0b82be49e6..cb72d69d96 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -1849,16 +1849,22 @@ impl RequestHandler for Vmm { restore_cfg.prefault, restore_cfg.memory_restore_mode, ) - .map_err(|vm_restore_err| { - error!("VM Restore failed: {vm_restore_err:?}"); 
- - // Cleanup the VM being created while vm restore + .and_then(|()| { + if restore_cfg.resume { + self.vm_resume() + } else { + Ok(()) + } + }) + .map_err(|e| { + error!("VM Restore failed: {e:?}"); if let Err(e) = self.vm_delete() { return e; } + e + })?; - vm_restore_err - }) + Ok(()) } #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] From 54b27d8812a49b70af80c4d598796c5b88e667f4 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Wed, 18 Mar 2026 08:10:34 -0700 Subject: [PATCH 188/742] vmm: openapi: Add resume field to RestoreConfig Signed-off-by: Rob Bradford --- vmm/src/api/openapi/cloud-hypervisor.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vmm/src/api/openapi/cloud-hypervisor.yaml b/vmm/src/api/openapi/cloud-hypervisor.yaml index 8bdf14e50f..e8d3350dba 100644 --- a/vmm/src/api/openapi/cloud-hypervisor.yaml +++ b/vmm/src/api/openapi/cloud-hypervisor.yaml @@ -1358,6 +1358,8 @@ components: type: boolean memory_restore_mode: $ref: "#/components/schemas/MemoryRestoreMode" + resume: + type: boolean ReceiveMigrationData: required: From 9655eaddd5b6414e8c629045a40c14eb0996ba3d Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 18 Mar 2026 13:49:12 +0100 Subject: [PATCH 189/742] block: Use offset_of! for virtio_blk_discard_write_zeroes field offsets Replace magic numeric offsets with mem::offset_of!() referencing the virtio_blk_discard_write_zeroes struct from the virtio-bindings crate when reading the sector, num_sectors and flags fields in the DISCARD and WRITE_ZEROES request handlers. No functional change. 
Signed-off-by: Anatol Belski --- block/src/lib.rs | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/block/src/lib.rs b/block/src/lib.rs index 2c8556c395..4a6ea4979d 100644 --- a/block/src/lib.rs +++ b/block/src/lib.rs @@ -69,6 +69,13 @@ use crate::vhdx::VhdxError; const SECTOR_SHIFT: u8 = 9; pub const SECTOR_SIZE: u64 = 0x01 << SECTOR_SHIFT; +/// Field offsets within `struct virtio_blk_discard_write_zeroes`. +const DISCARD_WZ_SECTOR_OFFSET: u64 = + mem::offset_of!(virtio_blk_discard_write_zeroes, sector) as u64; +const DISCARD_WZ_NUM_SECTORS_OFFSET: u64 = + mem::offset_of!(virtio_blk_discard_write_zeroes, num_sectors) as u64; +const DISCARD_WZ_FLAGS_OFFSET: u64 = mem::offset_of!(virtio_blk_discard_write_zeroes, flags) as u64; + #[derive(Error, Debug)] pub enum Error { #[error("Guest gave us bad memory addresses")] @@ -589,9 +596,15 @@ impl Request { let mut discard_sector = [0u8; 8]; let mut discard_num_sectors = [0u8; 4]; - mem.read_slice(&mut discard_sector, data_addr) + + let sector_addr = data_addr.checked_add(DISCARD_WZ_SECTOR_OFFSET).unwrap(); + mem.read_slice(&mut discard_sector, sector_addr) .map_err(ExecuteError::Read)?; - mem.read_slice(&mut discard_num_sectors, data_addr.checked_add(8).unwrap()) + + let num_sectors_addr = data_addr + .checked_add(DISCARD_WZ_NUM_SECTORS_OFFSET) + .unwrap(); + mem.read_slice(&mut discard_num_sectors, num_sectors_addr) .map_err(ExecuteError::Read)?; let discard_sector = u64::from_le_bytes(discard_sector); @@ -623,11 +636,19 @@ impl Request { let mut wz_sector = [0u8; 8]; let mut wz_num_sectors = [0u8; 4]; let mut wz_flags = [0u8; 4]; - mem.read_slice(&mut wz_sector, data_addr) + + let sector_addr = data_addr.checked_add(DISCARD_WZ_SECTOR_OFFSET).unwrap(); + mem.read_slice(&mut wz_sector, sector_addr) .map_err(ExecuteError::Read)?; - mem.read_slice(&mut wz_num_sectors, data_addr.checked_add(8).unwrap()) + + let num_sectors_addr = data_addr + 
.checked_add(DISCARD_WZ_NUM_SECTORS_OFFSET) + .unwrap(); + mem.read_slice(&mut wz_num_sectors, num_sectors_addr) .map_err(ExecuteError::Read)?; - mem.read_slice(&mut wz_flags, data_addr.checked_add(12).unwrap()) + + let flags_addr = data_addr.checked_add(DISCARD_WZ_FLAGS_OFFSET).unwrap(); + mem.read_slice(&mut wz_flags, flags_addr) .map_err(ExecuteError::Read)?; let wz_sector = u64::from_le_bytes(wz_sector); From 0ee0441f2b79cbcca3049f5b697b6f6aa2b7e8d6 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Tue, 17 Mar 2026 17:49:09 +0100 Subject: [PATCH 190/742] virtio-devices: block: Derive discard alignment from topology Set discard_sector_alignment from the logical block size reported by the backend topology instead of hardcoding it to 1 sector. This gives the guest accurate alignment hints so it can avoid sub block discards that the filesystem might silently ignore. For example, on a 4K block filesystem the alignment is now 8 sectors (4096/512) instead of 1. For image formats with their own allocation units (QCOW2 clusters, VHD/VHDX block sizes), the ideal alignment would be derived from the format cluster/block size. This is left for a followup that surfaces allocation granularity through DiskTopology. 
Signed-off-by: Anatol Belski --- virtio-devices/src/block.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtio-devices/src/block.rs b/virtio-devices/src/block.rs index af6d638dd0..2adbff74f8 100644 --- a/virtio-devices/src/block.rs +++ b/virtio-devices/src/block.rs @@ -833,7 +833,7 @@ impl Block { if avail_features & (1u64 << VIRTIO_BLK_F_DISCARD) != 0 { config.max_discard_sectors = u32::MAX; config.max_discard_seg = 1; - config.discard_sector_alignment = 1; + config.discard_sector_alignment = (logical_block_size / SECTOR_SIZE) as u32; } if num_queues > 1 { From 19fa512f0245f96c95d0a06c6f2e7c7e83bf38e4 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 18 Mar 2026 15:27:47 +0100 Subject: [PATCH 191/742] block: Batch drain AIO completions in next_completed_request Collect up to 32 completions per io_getevents call instead of one at a time, buffering them in the existing VecDeque. This reduces syscalls from 128 to 4 per drain cycle at the default queue depth. The stack cost is 1 KB per call. Signed-off-by: Anatol Belski --- block/src/raw_async_aio.rs | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/block/src/raw_async_aio.rs b/block/src/raw_async_aio.rs index f59e463b4c..c2e6a174e0 100644 --- a/block/src/raw_async_aio.rs +++ b/block/src/raw_async_aio.rs @@ -172,18 +172,16 @@ impl AsyncIo for RawFileAsyncAio { } fn next_completed_request(&mut self) -> Option<(u64, i32)> { - // Drain synchronous completions first (from punch_hole/write_zeroes). - if let Some(completed) = self.completion_list.pop_front() { - return Some(completed); - } - - let mut events: [aio::IoEvent; 1] = [aio::IoEvent::default()]; - let rc = self.ctx.get_events(0, &mut events, None).unwrap(); - if rc == 0 { - None - } else { - Some((events[0].data, events[0].res as i32)) + if self.completion_list.is_empty() { + // Drain pending AIO completions batched into the same queue. 
+ let mut events = [aio::IoEvent::default(); 32]; + let rc = self.ctx.get_events(0, &mut events, None).unwrap(); + for event in &events[..rc] { + self.completion_list + .push_back((event.data, event.res as i32)); + } } + self.completion_list.pop_front() } fn punch_hole(&mut self, offset: u64, length: u64, user_data: u64) -> AsyncIoResult<()> { From e278e5e931d62e0a11c0a584c290a412ab41c8e3 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Thu, 5 Mar 2026 17:20:38 -0800 Subject: [PATCH 192/742] tests: Refactor _test_power_button to accept Guest parameter Remove the acpi bool parameter and internal guest and kernel setup from _test_power_button. The function now accepts a Guest reference and uses default_kernel_cmdline() instead of hardcoded kernel paths. Update test_power_button in common_parallel to create a regular guest via GuestFactory. Update test_power_button_acpi in aarch64_acpi to use with_kernel_path(edk2_path()) for ACPI firmware support. Add with_kernel_path() builder method on Guest in test_infra to allow overriding the kernel path after guest creation. 
Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 28 +++++++++++---------------- test_infra/src/lib.rs | 5 +++++ 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 6e4f8c1fec..b65a2c4427 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -1052,25 +1052,13 @@ fn _test_guest_numa_nodes(acpi: bool) { } #[allow(unused_variables)] -fn _test_power_button(acpi: bool) { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - let mut cmd = GuestCommand::new(&guest); +fn _test_power_button(guest: &Guest) { + let mut cmd = GuestCommand::new(guest); let api_socket = temp_api_path(&guest.tmp_dir); - #[cfg(target_arch = "x86_64")] - let kernel_path = direct_kernel_boot_path(); - #[cfg(target_arch = "aarch64")] - let kernel_path = if acpi { - edk2_path() - } else { - direct_kernel_boot_path() - }; - cmd.default_cpus() .default_memory() - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .default_kernel_cmdline() .capture_output() .default_disks() .default_net() @@ -2828,7 +2816,9 @@ mod common_parallel { #[test] fn test_power_button() { - _test_power_button(false); + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = GuestFactory::new_regular_guest_factory().create_guest(Box::new(disk_config)); + _test_power_button(&guest); } #[test] @@ -14533,7 +14523,11 @@ mod aarch64_acpi { #[test] fn test_power_button_acpi() { - _test_power_button(true); + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = GuestFactory::new_regular_guest_factory() + .create_guest(Box::new(disk_config)) + .with_kernel_path(edk2_path().to_str().unwrap()); + _test_power_button(&guest); } #[test] diff --git a/test_infra/src/lib.rs b/test_infra/src/lib.rs index 
22a1ea99dd..c96e568997 100644 --- a/test_infra/src/lib.rs +++ b/test_infra/src/lib.rs @@ -1013,6 +1013,11 @@ impl Guest { self } + pub fn with_kernel_path(mut self, kernel_path: &str) -> Self { + self.kernel_path = Some(kernel_path.to_string()); + self + } + pub fn default_net_string(&self) -> String { format!( "tap=,mac={},ip={},mask=255.255.255.128", From b6d8df772d9bf025c26d4b5cb3001b6cf3b7ad8c Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Thu, 5 Mar 2026 17:32:42 -0800 Subject: [PATCH 193/742] tests: Add power button test for confidential VMs Add test_power_button to the common_cvm integration test module to verify that power button functionality works correctly in confidential guest environments. The test creates an Ubuntu Jammy-based confidential VM using GuestFactory and delegates to the existing _test_power_button helper to validate the power button signal handling. Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index b65a2c4427..1cca190168 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -14934,4 +14934,12 @@ mod common_cvm { let target_api = TargetApi::new_http_api(&guest.tmp_dir); _test_api_delete(&target_api, &guest); } + + #[test] + fn test_power_button() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = + GuestFactory::new_confidential_guest_factory().create_guest(Box::new(disk_config)); + _test_power_button(&guest); + } } From dacc92a08fd84379cdcc2afe7dec94b4c0f6b3b5 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Thu, 5 Mar 2026 17:50:10 -0800 Subject: [PATCH 194/742] tests: Refactor _test_virtio_vsock to accept Guest parameter Extract guest and kernel setup out of _test_virtio_vsock and pass a Guest reference as a parameter instead. 
Replace explicit kernel and cmdline arguments with default_kernel_cmdline(). Move guest creation to the test call sites using GuestFactory::new_regular_guest_factory(), enabling reuse of the helper with different guest types such as confidential VMs. Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 32 ++++++++++++--------------- 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 1cca190168..144a8706d5 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -1781,28 +1781,15 @@ fn get_fd_count(pid: u32) -> usize { fs::read_dir(format!("/proc/{pid}/fd")).unwrap().count() } -fn _test_virtio_vsock(hotplug: bool) { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - - #[cfg(target_arch = "x86_64")] - let kernel_path = direct_kernel_boot_path(); - #[cfg(target_arch = "aarch64")] - let kernel_path = if hotplug { - edk2_path() - } else { - direct_kernel_boot_path() - }; - +fn _test_virtio_vsock(guest: &Guest, hotplug: bool) { let socket = temp_vsock_path(&guest.tmp_dir); let api_socket = temp_api_path(&guest.tmp_dir); - let mut cmd = GuestCommand::new(&guest); + let mut cmd = GuestCommand::new(guest); cmd.args(["--api-socket", &api_socket]); cmd.default_cpus(); cmd.default_memory(); - cmd.args(["--kernel", kernel_path.to_str().unwrap()]); - cmd.args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]); + cmd.default_kernel_cmdline(); cmd.default_disks(); cmd.default_net(); @@ -5965,12 +5952,21 @@ mod common_parallel { #[test] fn test_virtio_vsock() { - _test_virtio_vsock(false); + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = GuestFactory::new_regular_guest_factory().create_guest(Box::new(disk_config)); + _test_virtio_vsock(&guest, false); } #[test] fn test_virtio_vsock_hotplug() { - _test_virtio_vsock(true); + let 
disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + #[cfg(target_arch = "x86_64")] + let guest = GuestFactory::new_regular_guest_factory().create_guest(Box::new(disk_config)); + #[cfg(target_arch = "aarch64")] + let guest = GuestFactory::new_regular_guest_factory() + .create_guest(Box::new(disk_config)) + .with_kernel_path(edk2_path().to_str().unwrap()); + _test_virtio_vsock(&guest, true); } #[test] From c44b7679cf68f96e3190a57470197ed9b5fa814f Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Thu, 5 Mar 2026 17:57:21 -0800 Subject: [PATCH 195/742] tests: Add virtio vsock test for confidential VMs Add test_virtio_vsock to the common_cvm integration test module to verify that virtio vsock functionality works correctly in confidential guest environments. The test creates an Ubuntu Jammy-based confidential VM using GuestFactory and delegates to the existing _test_virtio_vsock helper with hotplug disabled. Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 144a8706d5..47c3c1f040 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -14938,4 +14938,12 @@ mod common_cvm { GuestFactory::new_confidential_guest_factory().create_guest(Box::new(disk_config)); _test_power_button(&guest); } + + #[test] + fn test_virtio_vsock() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = + GuestFactory::new_confidential_guest_factory().create_guest(Box::new(disk_config)); + _test_virtio_vsock(&guest, false); + } } From 18ed8e61d201cc1de0cba0b523e5ed4a9853629e Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Thu, 5 Mar 2026 18:38:10 -0800 Subject: [PATCH 196/742] tests: Refactor test_multi_cpu into reusable helper Extract the multi-CPU test logic from the test_multi_cpu test into a standalone _test_multi_cpu helper that accepts a 
Guest reference as a parameter. Update the test_multi_cpu call site in common_parallel to create the guest via GuestFactory::new_regular_guest_factory() and delegate to the new helper. This enables reuse of the test logic with different guest types such as confidential VMs. Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 69 +++++++++++++-------------- 1 file changed, 34 insertions(+), 35 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 47c3c1f040..ccc874bad0 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -2471,6 +2471,37 @@ fn _test_simple_launch(guest: &Guest) { handle_child_output(r, &output); } +fn _test_multi_cpu(guest: &Guest) { + let mut cmd = GuestCommand::new(guest); + cmd.args(["--cpus", "boot=2,max=4"]) + .default_memory() + .default_kernel_cmdline() + .capture_output() + .default_disks() + .default_net(); + + let mut child = cmd.spawn().unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + assert_eq!(guest.get_cpu_count().unwrap_or_default(), 2); + + assert_eq!( + guest + .ssh_command(r#"sudo dmesg | grep "smp: Brought up" | sed "s/\[\ *[0-9.]*\] //""#) + .unwrap() + .trim(), + "smp: Brought up 1 node, 2 CPUs" + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + mod common_parallel { use std::cmp; use std::fs::{File, OpenOptions, copy}; @@ -2502,41 +2533,9 @@ mod common_parallel { #[test] fn test_multi_cpu() { - let jammy_image = JAMMY_IMAGE_NAME.to_string(); - let disk_config = UbuntuDiskConfig::new(jammy_image); - let guest = Guest::new(Box::new(disk_config)); - - let mut cmd = GuestCommand::new(&guest); - cmd.args(["--cpus", "boot=2,max=4"]) - .default_memory() - .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .capture_output() - 
.default_disks() - .default_net(); - - let mut child = cmd.spawn().unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - assert_eq!(guest.get_cpu_count().unwrap_or_default(), 2); - - assert_eq!( - guest - .ssh_command( - r#"sudo dmesg | grep "smp: Brought up" | sed "s/\[\ *[0-9.]*\] //""# - ) - .unwrap() - .trim(), - "smp: Brought up 1 node, 2 CPUs" - ); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = GuestFactory::new_regular_guest_factory().create_guest(Box::new(disk_config)); + _test_multi_cpu(&guest); } #[test] From 848a28048364f5d77789cfec5b6c3c1188935a1a Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Thu, 5 Mar 2026 18:40:16 -0800 Subject: [PATCH 197/742] tests: Add multi-CPU test for confidential VMs Add test_multi_cpu to the common_cvm integration test module to verify that multi-CPU functionality works correctly in confidential guest environments. The test creates an Ubuntu Jammy-based confidential VM using GuestFactory and delegates to the existing _test_multi_cpu helper to validate SMP boot with multiple vCPUs. 
Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index ccc874bad0..d138e184a3 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -14945,4 +14945,12 @@ mod common_cvm { GuestFactory::new_confidential_guest_factory().create_guest(Box::new(disk_config)); _test_virtio_vsock(&guest, false); } + + #[test] + fn test_multi_cpu() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = + GuestFactory::new_confidential_guest_factory().create_guest(Box::new(disk_config)); + _test_multi_cpu(&guest); + } } From 9059fb902d5bf896fe0fffcf9b603ede50d9c075 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Thu, 5 Mar 2026 19:44:59 -0800 Subject: [PATCH 198/742] tests: Refactor test_cpu_affinity into reusable helper Extract CPU affinity test logic from test_cpu_affinity into a standalone _test_cpu_affinity helper that accepts a Guest reference. The helper verifies the host has at least 4 CPUs, boots a VM with affinity settings, and asserts vcpu0 is pinned to cores 0,2 and vcpu1 to cores 1,3. Add default_cpus_with_affinity_string() to Guest and default_cpus_with_affinity() to GuestCommand in test_infra to generate CPU arguments with affinity configuration. Update the test_cpu_affinity call site in common_parallel to use GuestFactory and delegate to the new helper, enabling reuse with different guest types. 
Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 76 ++++++++++++++------------- test_infra/src/lib.rs | 18 +++++++ 2 files changed, 58 insertions(+), 36 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index d138e184a3..50d9f4e4c7 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -2502,6 +2502,42 @@ fn _test_multi_cpu(guest: &Guest) { handle_child_output(r, &output); } +fn _test_cpu_affinity(guest: &Guest) { + // We need the host to have at least 4 CPUs if we want to be able + // to run this test. + let host_cpus_count = exec_host_command_output("nproc"); + assert!( + String::from_utf8_lossy(&host_cpus_count.stdout) + .trim() + .parse::() + .unwrap_or(0) + >= 4 + ); + + let mut child = GuestCommand::new(guest) + .default_cpus_with_affinity() + .default_memory() + .default_kernel_cmdline() + .default_disks() + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + let pid = child.id(); + let taskset_vcpu0 = exec_host_command_output(format!("taskset -pc $(ps -T -p {pid} | grep vcpu0 | xargs | cut -f 2 -d \" \") | cut -f 6 -d \" \"").as_str()); + assert_eq!(String::from_utf8_lossy(&taskset_vcpu0.stdout).trim(), "0,2"); + let taskset_vcpu1 = exec_host_command_output(format!("taskset -pc $(ps -T -p {pid} | grep vcpu1 | xargs | cut -f 2 -d \" \") | cut -f 6 -d \" \"").as_str()); + assert_eq!(String::from_utf8_lossy(&taskset_vcpu1.stdout).trim(), "1,3"); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + handle_child_output(r, &output); +} + mod common_parallel { use std::cmp; use std::fs::{File, OpenOptions, copy}; @@ -2638,42 +2674,10 @@ mod common_parallel { #[test] fn test_cpu_affinity() { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - - // We need the 
host to have at least 4 CPUs if we want to be able - // to run this test. - let host_cpus_count = exec_host_command_output("nproc"); - assert!( - String::from_utf8_lossy(&host_cpus_count.stdout) - .trim() - .parse::() - .unwrap_or(0) - >= 4 - ); - - let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=2,affinity=[0@[0,2],1@[1,3]]"]) - .default_memory() - .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .default_net() - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - let pid = child.id(); - let taskset_vcpu0 = exec_host_command_output(format!("taskset -pc $(ps -T -p {pid} | grep vcpu0 | xargs | cut -f 2 -d \" \") | cut -f 6 -d \" \"").as_str()); - assert_eq!(String::from_utf8_lossy(&taskset_vcpu0.stdout).trim(), "0,2"); - let taskset_vcpu1 = exec_host_command_output(format!("taskset -pc $(ps -T -p {pid} | grep vcpu1 | xargs | cut -f 2 -d \" \") | cut -f 6 -d \" \"").as_str()); - assert_eq!(String::from_utf8_lossy(&taskset_vcpu1.stdout).trim(), "1,3"); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - handle_child_output(r, &output); + let guest = GuestFactory::new_regular_guest_factory() + .create_guest(Box::new(disk_config)) + .with_cpu(2); + _test_cpu_affinity(&guest); } #[test] diff --git a/test_infra/src/lib.rs b/test_infra/src/lib.rs index c96e568997..e1af41bc0f 100644 --- a/test_infra/src/lib.rs +++ b/test_infra/src/lib.rs @@ -1442,6 +1442,14 @@ impl Guest { ) } + pub fn default_cpus_with_affinity_string(&self) -> String { + format!( + "boot={},affinity=[0@[0,2],1@[1,3]]{}", + self.num_cpu, + if self.nested { "" } else { ",nested=off" } + ) + } + pub fn default_memory_string(&self) -> String { format!("size={}", self.mem_size_str) } @@ -1716,6 +1724,16 @@ impl<'a> GuestCommand<'a> { self.args(["--cpus", self.guest.default_cpus_string().as_str()]) } + pub 
fn default_cpus_with_affinity(&mut self) -> &mut Self { + // Only support cpu affinity for 2 VCPUs for now, + // as it is only used in a test that validates cpu affinity is applied correctly. + assert_eq!(self.guest.num_cpu, 2); + self.args([ + "--cpus", + self.guest.default_cpus_with_affinity_string().as_str(), + ]) + } + pub fn default_memory(&mut self) -> &mut Self { self.args(["--memory", self.guest.default_memory_string().as_str()]) } From 7bf439975f0a23cf23b51245fc18cd560c55adf8 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Thu, 5 Mar 2026 19:45:48 -0800 Subject: [PATCH 199/742] tests: Add CPU affinity test for confidential VMs Add test_cpu_affinity to the common_cvm integration test module to verify that CPU pinning works correctly in confidential guest environments. The test creates a 2-vCPU Ubuntu Jammy-based confidential VM using GuestFactory and delegates to the existing _test_cpu_affinity helper to validate vCPU-to-core affinity. Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 50d9f4e4c7..dbb29b675a 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -14957,4 +14957,13 @@ mod common_cvm { GuestFactory::new_confidential_guest_factory().create_guest(Box::new(disk_config)); _test_multi_cpu(&guest); } + + #[test] + fn test_cpu_affinity() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = GuestFactory::new_confidential_guest_factory() + .create_guest(Box::new(disk_config)) + .with_cpu(2); + _test_cpu_affinity(&guest); + } } From c761d741bf7b696849d952ea0391e2f2fce01c5d Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Thu, 5 Mar 2026 21:02:38 -0800 Subject: [PATCH 200/742] tests: Refactor test_virtio_queue_affinity into reusable helper Extract virtio queue affinity test logic into a standalone 
_test_virtio_queue_affinity helper that accepts a Guest reference. The helper verifies the host has at least 4 CPUs, boots a VM with per-queue affinity on the cloud-init disk, and asserts each disk queue thread is pinned to the expected cores. Replace hardcoded kernel and cmdline arguments with default_cpus() and default_kernel_cmdline(). Update the test call site in common_parallel to use GuestFactory and delegate to the new helper, enabling reuse with different guest types. Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 108 +++++++++++++------------- 1 file changed, 56 insertions(+), 52 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index dbb29b675a..ddb4ea00a7 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -2538,6 +2538,58 @@ fn _test_cpu_affinity(guest: &Guest) { handle_child_output(r, &output); } +fn _test_virtio_queue_affinity(guest: &Guest) { + // We need the host to have at least 4 CPUs if we want to be able + // to run this test. 
+ let host_cpus_count = exec_host_command_output("nproc"); + assert!( + String::from_utf8_lossy(&host_cpus_count.stdout) + .trim() + .parse::() + .unwrap_or(0) + >= 4 + ); + + let mut child = GuestCommand::new(guest) + .default_cpus() + .default_memory() + .default_kernel_cmdline() + .args([ + "--disk", + format!( + "path={}", + guest.disk_config.disk(DiskType::OperatingSystem).unwrap() + ) + .as_str(), + format!( + "path={},num_queues=4,queue_affinity=[0@[0,2],1@[1,3],2@[1],3@[3]]", + guest.disk_config.disk(DiskType::CloudInit).unwrap() + ) + .as_str(), + ]) + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + let pid = child.id(); + let taskset_q0 = exec_host_command_output(format!("taskset -pc $(ps -T -p {pid} | grep disk1_q0 | xargs | cut -f 2 -d \" \") | cut -f 6 -d \" \"").as_str()); + assert_eq!(String::from_utf8_lossy(&taskset_q0.stdout).trim(), "0,2"); + let taskset_q1 = exec_host_command_output(format!("taskset -pc $(ps -T -p {pid} | grep disk1_q1 | xargs | cut -f 2 -d \" \") | cut -f 6 -d \" \"").as_str()); + assert_eq!(String::from_utf8_lossy(&taskset_q1.stdout).trim(), "1,3"); + let taskset_q2 = exec_host_command_output(format!("taskset -pc $(ps -T -p {pid} | grep disk1_q2 | xargs | cut -f 2 -d \" \") | cut -f 6 -d \" \"").as_str()); + assert_eq!(String::from_utf8_lossy(&taskset_q2.stdout).trim(), "1"); + let taskset_q3 = exec_host_command_output(format!("taskset -pc $(ps -T -p {pid} | grep disk1_q3 | xargs | cut -f 2 -d \" \") | cut -f 6 -d \" \"").as_str()); + assert_eq!(String::from_utf8_lossy(&taskset_q3.stdout).trim(), "3"); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + handle_child_output(r, &output); +} + mod common_parallel { use std::cmp; use std::fs::{File, OpenOptions, copy}; @@ -2683,58 +2735,10 @@ mod common_parallel { #[test] fn test_virtio_queue_affinity() { let disk_config = 
UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - - // We need the host to have at least 4 CPUs if we want to be able - // to run this test. - let host_cpus_count = exec_host_command_output("nproc"); - assert!( - String::from_utf8_lossy(&host_cpus_count.stdout) - .trim() - .parse::() - .unwrap_or(0) - >= 4 - ); - - let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=4"]) - .default_memory() - .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .args([ - "--disk", - format!( - "path={}", - guest.disk_config.disk(DiskType::OperatingSystem).unwrap() - ) - .as_str(), - format!( - "path={},num_queues=4,queue_affinity=[0@[0,2],1@[1,3],2@[1],3@[3]]", - guest.disk_config.disk(DiskType::CloudInit).unwrap() - ) - .as_str(), - ]) - .default_net() - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - let pid = child.id(); - let taskset_q0 = exec_host_command_output(format!("taskset -pc $(ps -T -p {pid} | grep disk1_q0 | xargs | cut -f 2 -d \" \") | cut -f 6 -d \" \"").as_str()); - assert_eq!(String::from_utf8_lossy(&taskset_q0.stdout).trim(), "0,2"); - let taskset_q1 = exec_host_command_output(format!("taskset -pc $(ps -T -p {pid} | grep disk1_q1 | xargs | cut -f 2 -d \" \") | cut -f 6 -d \" \"").as_str()); - assert_eq!(String::from_utf8_lossy(&taskset_q1.stdout).trim(), "1,3"); - let taskset_q2 = exec_host_command_output(format!("taskset -pc $(ps -T -p {pid} | grep disk1_q2 | xargs | cut -f 2 -d \" \") | cut -f 6 -d \" \"").as_str()); - assert_eq!(String::from_utf8_lossy(&taskset_q2.stdout).trim(), "1"); - let taskset_q3 = exec_host_command_output(format!("taskset -pc $(ps -T -p {pid} | grep disk1_q3 | xargs | cut -f 2 -d \" \") | cut -f 6 -d \" \"").as_str()); - assert_eq!(String::from_utf8_lossy(&taskset_q3.stdout).trim(), "3"); - }); - - kill_child(&mut child); - let output 
= child.wait_with_output().unwrap(); - handle_child_output(r, &output); + let guest = GuestFactory::new_regular_guest_factory() + .create_guest(Box::new(disk_config)) + .with_cpu(4); + _test_virtio_queue_affinity(&guest); } #[test] From bad5da622ef013f4db229ae93c2b63c185b209bc Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Thu, 5 Mar 2026 21:03:44 -0800 Subject: [PATCH 201/742] tests: Add virtio queue affinity test for confidential VMs Add test_virtio_queue_affinity to the common_cvm integration test module to verify that per-queue CPU pinning works correctly in confidential guest environments. The test creates a 4-vCPU Ubuntu Jammy-based confidential VM using GuestFactory and delegates to the existing _test_virtio_queue_affinity helper to validate queue-to-core affinity settings. Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index ddb4ea00a7..cd5118642f 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -14970,4 +14970,13 @@ mod common_cvm { .with_cpu(2); _test_cpu_affinity(&guest); } + + #[test] + fn test_virtio_queue_affinity() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = GuestFactory::new_confidential_guest_factory() + .create_guest(Box::new(disk_config)) + .with_cpu(4); + _test_virtio_queue_affinity(&guest); + } } From acec5b00d64f1268cfc69703a287029a3eb261f9 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Thu, 5 Mar 2026 22:31:34 -0800 Subject: [PATCH 202/742] tests: Refactor test_pci_msi into reusable helper Extract PCI MSI interrupt test logic from test_pci_msi into a standalone _test_pci_msi helper that accepts a Guest reference. The helper boots a VM, waits for boot, and asserts that 12 MSI interrupts are present in /proc/interrupts. 
Replace hardcoded kernel and cmdline arguments with default_kernel_cmdline(). Update the test call site in common_parallel to use GuestFactory and delegate to the new helper, enabling reuse with different guest types. Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 67 ++++++++++++++------------- 1 file changed, 35 insertions(+), 32 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index cd5118642f..4e919884bf 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -2590,6 +2590,39 @@ fn _test_virtio_queue_affinity(guest: &Guest) { handle_child_output(r, &output); } +fn _test_pci_msi(guest: &Guest) { + let mut cmd = GuestCommand::new(guest); + cmd.default_cpus() + .default_memory() + .default_kernel_cmdline() + .capture_output() + .default_disks() + .default_net(); + + let mut child = cmd.spawn().unwrap(); + + guest.wait_vm_boot().unwrap(); + + let grep_cmd = format!("grep -c {} /proc/interrupts", get_msi_interrupt_pattern()); + + let r = std::panic::catch_unwind(|| { + assert_eq!( + guest + .ssh_command(&grep_cmd) + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 12 + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + mod common_parallel { use std::cmp; use std::fs::{File, OpenOptions, copy}; @@ -2972,38 +3005,8 @@ mod common_parallel { #[test] fn test_pci_msi() { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - let mut cmd = GuestCommand::new(&guest); - cmd.default_cpus() - .default_memory() - .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .capture_output() - .default_disks() - .default_net(); - - let mut child = cmd.spawn().unwrap(); - - guest.wait_vm_boot().unwrap(); - - let grep_cmd = format!("grep -c {} 
/proc/interrupts", get_msi_interrupt_pattern()); - - let r = std::panic::catch_unwind(|| { - assert_eq!( - guest - .ssh_command(&grep_cmd) - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 12 - ); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); + let guest = GuestFactory::new_regular_guest_factory().create_guest(Box::new(disk_config)); + _test_pci_msi(&guest); } #[test] From 31f1e67be73648c62704a598e1e3b8cdfca5f36d Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Thu, 5 Mar 2026 22:32:32 -0800 Subject: [PATCH 203/742] tests: Add PCI MSI interrupt test for confidential VMs Add test_pci_msi to the common_cvm integration test module to verify that PCI MSI interrupt functionality works correctly in confidential guest environments. The test creates an Ubuntu Jammy-based confidential VM using GuestFactory and delegates to the existing _test_pci_msi helper to validate MSI interrupts. Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 4e919884bf..9ea1e0d882 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -14982,4 +14982,12 @@ mod common_cvm { .with_cpu(4); _test_virtio_queue_affinity(&guest); } + + #[test] + fn test_pci_msi() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = + GuestFactory::new_confidential_guest_factory().create_guest(Box::new(disk_config)); + _test_pci_msi(&guest); + } } From 5378f8d6146698f2a249b8b25b9b1e8ca7a28835 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Thu, 5 Mar 2026 22:45:00 -0800 Subject: [PATCH 204/742] tests: Refactor test_virtio_net_ctrl_queue into reusable helper Extract virtio net control queue test logic into a standalone _test_virtio_net_ctrl_queue helper that accepts a Guest reference. 
The helper boots a VM with MTU 3000, verifies ethtool can disable rx-gro-hw, and asserts the guest interface MTU is correctly set. Replace hardcoded kernel and cmdline arguments with default_kernel_cmdline(). Update the test call site in common_parallel to use GuestFactory and delegate to the new helper, enabling reuse with different guest types. Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 87 ++++++++++++++------------- 1 file changed, 45 insertions(+), 42 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 9ea1e0d882..5ee867c81c 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -2623,6 +2623,49 @@ fn _test_pci_msi(guest: &Guest) { handle_child_output(r, &output); } +fn _test_virtio_net_ctrl_queue(guest: &Guest) { + let mut cmd = GuestCommand::new(guest); + cmd.default_cpus() + .default_memory() + .default_kernel_cmdline() + .args(["--net", guest.default_net_string_w_mtu(3000).as_str()]) + .capture_output() + .default_disks(); + + let mut child = cmd.spawn().unwrap(); + + guest.wait_vm_boot().unwrap(); + + #[cfg(target_arch = "aarch64")] + let iface = "enp0s4"; + #[cfg(target_arch = "x86_64")] + let iface = "ens4"; + + let r = std::panic::catch_unwind(|| { + assert_eq!( + guest + .ssh_command( + format!("sudo ethtool -K {iface} rx-gro-hw off && echo success").as_str() + ) + .unwrap() + .trim(), + "success" + ); + assert_eq!( + guest + .ssh_command(format!("cat /sys/class/net/{iface}/mtu").as_str()) + .unwrap() + .trim(), + "3000" + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + mod common_parallel { use std::cmp; use std::fs::{File, OpenOptions, copy}; @@ -3012,48 +3055,8 @@ mod common_parallel { #[test] fn test_virtio_net_ctrl_queue() { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = 
Guest::new(Box::new(disk_config)); - let mut cmd = GuestCommand::new(&guest); - cmd.default_cpus() - .default_memory() - .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .args(["--net", guest.default_net_string_w_mtu(3000).as_str()]) - .capture_output() - .default_disks(); - - let mut child = cmd.spawn().unwrap(); - - guest.wait_vm_boot().unwrap(); - - #[cfg(target_arch = "aarch64")] - let iface = "enp0s4"; - #[cfg(target_arch = "x86_64")] - let iface = "ens4"; - - let r = std::panic::catch_unwind(|| { - assert_eq!( - guest - .ssh_command( - format!("sudo ethtool -K {iface} rx-gro-hw off && echo success").as_str() - ) - .unwrap() - .trim(), - "success" - ); - assert_eq!( - guest - .ssh_command(format!("cat /sys/class/net/{iface}/mtu").as_str()) - .unwrap() - .trim(), - "3000" - ); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); + let guest = GuestFactory::new_regular_guest_factory().create_guest(Box::new(disk_config)); + _test_virtio_net_ctrl_queue(&guest); } #[test] From 6f8776ac50bebbedd60ee155e1f42fecb9c28812 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Thu, 5 Mar 2026 22:46:19 -0800 Subject: [PATCH 205/742] tests: Add virtio net ctrl queue test for confidential VMs Add test_virtio_net_ctrl_queue to the common_cvm integration test module to verify that virtio net control queue functionality works correctly in confidential guest environments. The test creates an Ubuntu Jammy-based confidential VM using GuestFactory and delegates to the existing _test_virtio_net_ctrl_queue helper to validate MTU configuration and ethtool offload settings. 
Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 5ee867c81c..2ffe0bef04 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -14993,4 +14993,12 @@ mod common_cvm { GuestFactory::new_confidential_guest_factory().create_guest(Box::new(disk_config)); _test_pci_msi(&guest); } + + #[test] + fn test_virtio_net_ctrl_queue() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = + GuestFactory::new_confidential_guest_factory().create_guest(Box::new(disk_config)); + _test_virtio_net_ctrl_queue(&guest); + } } From 2aa92bf4b4ec1c717caccbb10e2edb81902bdaa9 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 7 Mar 2026 22:07:18 -0800 Subject: [PATCH 206/742] tests: Add platform argument support to kernel cmdline builder Introduce default_kernel_cmdline_with_platform() in GuestCommand that accepts an optional platform parameter. For confidential VMs, the platform arg is prepended to sev_snp=on. For regular VMs, it is passed via --platform if provided. Retain default_kernel_cmdline() as a convenience wrapper that calls the new method with None, preserving backward compatibility. This enables tests to pass additional platform configuration such as num_pci_segments alongside the default kernel and cmdline setup. 
Signed-off-by: Muminul Islam --- test_infra/src/lib.rs | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/test_infra/src/lib.rs b/test_infra/src/lib.rs index e1af41bc0f..b6d7641f6b 100644 --- a/test_infra/src/lib.rs +++ b/test_infra/src/lib.rs @@ -1694,7 +1694,7 @@ impl<'a> GuestCommand<'a> { self.args(["--net", self.guest.default_net_string().as_str()]) } - pub fn default_kernel_cmdline(&mut self) -> &mut Self { + pub fn default_kernel_cmdline_with_platform(&mut self, platform: Option<&str>) -> &mut Self { if self.guest.vm_type == GuestVmType::Confidential { let console_str = if let Some(c) = &self.guest.console_type { c.as_str() @@ -1709,17 +1709,34 @@ impl<'a> GuestCommand<'a> { ]); self.command .args(["--host-data", generate_host_data().as_str()]); - self.command.args(["--platform", "sev_snp=on"]); + self.command.args([ + "--platform", + &format!( + "{}sev_snp=on", + if let Some(p) = platform { + format!("{p},") + } else { + String::new() + } + ), + ]); } else if let Some(kernel) = &self.guest.kernel_path { self.command.args(["--kernel", kernel.as_str()]); if let Some(cmdline) = &self.guest.kernel_cmdline { self.command.args(["--cmdline", cmdline]); } + if let Some(platform_arg) = platform { + self.command.args(["--platform", platform_arg]); + } } self } + pub fn default_kernel_cmdline(&mut self) -> &mut Self { + self.default_kernel_cmdline_with_platform(None) + } + pub fn default_cpus(&mut self) -> &mut Self { self.args(["--cpus", self.guest.default_cpus_string().as_str()]) } From 62bb7bfe452366cbd2434e7e9b6c9869bfa94c78 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 7 Mar 2026 15:47:20 -0800 Subject: [PATCH 207/742] tests: Refactor _test_pci_multiple_segments Extract common PCI multiple segment disk test logic into _test_pci_multiple_segments() and reuse it from the test case. Switch guest creation to GuestFactory in test_pci_multiple_segments and pass segment values through helper parameters. 
Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 203 +++++++++++++------------- 1 file changed, 104 insertions(+), 99 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 2ffe0bef04..745128ce6c 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -2666,6 +2666,108 @@ fn _test_virtio_net_ctrl_queue(guest: &Guest) { handle_child_output(r, &output); } +fn _test_pci_multiple_segments( + guest: &Guest, + max_num_pci_segments: u16, + pci_segments_for_disk: u16, +) { + // Prepare another disk file for the virtio-disk device + let test_disk_path = String::from( + guest + .tmp_dir + .as_path() + .join("test-disk.raw") + .to_str() + .unwrap(), + ); + assert!( + exec_host_command_status(format!("truncate {test_disk_path} -s 4M").as_str()).success() + ); + assert!(exec_host_command_status(format!("mkfs.ext4 {test_disk_path}").as_str()).success()); + + let mut cmd = GuestCommand::new(guest); + cmd.default_cpus() + .default_memory() + .default_kernel_cmdline_with_platform(Some(&format!( + "num_pci_segments={max_num_pci_segments}" + ))) + .args([ + "--disk", + format!( + "path={}", + guest.disk_config.disk(DiskType::OperatingSystem).unwrap() + ) + .as_str(), + format!( + "path={}", + guest.disk_config.disk(DiskType::CloudInit).unwrap() + ) + .as_str(), + format!("path={test_disk_path},pci_segment={pci_segments_for_disk},image_type=raw") + .as_str(), + ]) + .capture_output() + .default_net(); + + let mut child = cmd.spawn().unwrap(); + + guest.wait_vm_boot().unwrap(); + + let grep_cmd = "lspci | grep \"Host bridge\" | wc -l"; + + let r = std::panic::catch_unwind(|| { + // There should be MAX_NUM_PCI_SEGMENTS PCI host bridges in the guest. + assert_eq!( + guest + .ssh_command(grep_cmd) + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + max_num_pci_segments + ); + + // Check both if /dev/vdc exists and if the block size is 4M. 
+ assert_eq!( + guest + .ssh_command("lsblk | grep vdc | grep -c 4M") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 1 + ); + + // Mount the device. + guest.ssh_command("mkdir mount_image").unwrap(); + guest + .ssh_command("sudo mount -o rw -t ext4 /dev/vdc mount_image/") + .unwrap(); + // Grant all users with write permission. + guest.ssh_command("sudo chmod a+w mount_image/").unwrap(); + + // Write something to the device. + guest + .ssh_command("sudo echo \"bar\" >> mount_image/foo") + .unwrap(); + + // Check the content of the block device. The file "foo" should + // contain "bar". + assert_eq!( + guest + .ssh_command("sudo cat mount_image/foo") + .unwrap() + .trim(), + "bar" + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + mod common_parallel { use std::cmp; use std::fs::{File, OpenOptions, copy}; @@ -3062,105 +3164,8 @@ mod common_parallel { #[test] fn test_pci_multiple_segments() { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - - // Prepare another disk file for the virtio-disk device - let test_disk_path = String::from( - guest - .tmp_dir - .as_path() - .join("test-disk.raw") - .to_str() - .unwrap(), - ); - assert!( - exec_host_command_status(format!("truncate {test_disk_path} -s 4M").as_str()).success() - ); - assert!(exec_host_command_status(format!("mkfs.ext4 {test_disk_path}").as_str()).success()); - - let mut cmd = GuestCommand::new(&guest); - cmd.default_cpus() - .default_memory() - .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .args([ - "--platform", - &format!("num_pci_segments={MAX_NUM_PCI_SEGMENTS}"), - ]) - .args([ - "--disk", - format!( - "path={}", - guest.disk_config.disk(DiskType::OperatingSystem).unwrap() - ) - .as_str(), - format!( - "path={}", - guest.disk_config.disk(DiskType::CloudInit).unwrap() - ) 
- .as_str(), - format!("path={test_disk_path},pci_segment=15,image_type=raw").as_str(), - ]) - .capture_output() - .default_net(); - - let mut child = cmd.spawn().unwrap(); - - guest.wait_vm_boot().unwrap(); - - let grep_cmd = "lspci | grep \"Host bridge\" | wc -l"; - - let r = std::panic::catch_unwind(|| { - // There should be MAX_NUM_PCI_SEGMENTS PCI host bridges in the guest. - assert_eq!( - guest - .ssh_command(grep_cmd) - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - MAX_NUM_PCI_SEGMENTS - ); - - // Check both if /dev/vdc exists and if the block size is 4M. - assert_eq!( - guest - .ssh_command("lsblk | grep vdc | grep -c 4M") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 1 - ); - - // Mount the device. - guest.ssh_command("mkdir mount_image").unwrap(); - guest - .ssh_command("sudo mount -o rw -t ext4 /dev/vdc mount_image/") - .unwrap(); - // Grant all users with write permission. - guest.ssh_command("sudo chmod a+w mount_image/").unwrap(); - - // Write something to the device. - guest - .ssh_command("sudo echo \"bar\" >> mount_image/foo") - .unwrap(); - - // Check the content of the block device. The file "foo" should - // contain "bar". - assert_eq!( - guest - .ssh_command("sudo cat mount_image/foo") - .unwrap() - .trim(), - "bar" - ); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); + let guest = GuestFactory::new_regular_guest_factory().create_guest(Box::new(disk_config)); + _test_pci_multiple_segments(&guest, MAX_NUM_PCI_SEGMENTS, 15u16); } #[test] From 38dc35cc337e5e079b2a222f9f43cabcab7a037f Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 7 Mar 2026 15:49:27 -0800 Subject: [PATCH 208/742] tests: Add PCI multiple segments test for confidential VMs Add test_pci_multiple_segments to the common_cvm integration test module to verify multiple PCI segment support in confidential guest environments. 
The test uses 8 PCI segments, which exceeds the Linux default of 6 and matches the maximum supported by the IGVM file for SEV-SNP guests. A test disk is placed on segment 5 to validate cross-segment device functionality. Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 745128ce6c..a064a5bbb0 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -15006,4 +15006,16 @@ mod common_cvm { GuestFactory::new_confidential_guest_factory().create_guest(Box::new(disk_config)); _test_virtio_net_ctrl_queue(&guest); } + + #[test] + fn test_pci_multiple_segments() { + // Use 8 segments to test the multiple segment support since it's more than the default 6 + // supported by Linux + // IGVM file used by Sev-Snp Guest now support up to 8 segments, so we can use 8 segments for testing. + let num_pci_segments: u16 = 8; + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = + GuestFactory::new_confidential_guest_factory().create_guest(Box::new(disk_config)); + _test_pci_multiple_segments(&guest, num_pci_segments, 5); + } } From 3b56ec240a32af6456456e30fdfb99050d4ca1d3 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 7 Mar 2026 15:55:24 -0800 Subject: [PATCH 209/742] tests: Refactor test_direct_kernel_boot into reusable helper Extract direct kernel boot test logic into a standalone _test_direct_kernel_boot helper that accepts a Guest reference. The helper boots a VM, validates CPU count and memory using generic validate_cpu_count and validate_memory methods, and asserts 12 MSI interrupts in /proc/interrupts. Replace hardcoded kernel and cmdline arguments with default_kernel_cmdline(). Update the test call site in common_parallel to use GuestFactory and delegate to the new helper, enabling reuse with different guest types. 
Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 74 +++++++++++++-------------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index a064a5bbb0..2fc61ef51c 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -2768,6 +2768,41 @@ fn _test_pci_multiple_segments( handle_child_output(r, &output); } +fn _test_direct_kernel_boot(guest: &Guest) { + let mut child = GuestCommand::new(guest) + .default_cpus() + .default_memory() + .default_kernel_cmdline() + .default_disks() + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + guest.validate_cpu_count(None); + guest.validate_memory(None); + + let grep_cmd = format!("grep -c {} /proc/interrupts", get_msi_interrupt_pattern()); + assert_eq!( + guest + .ssh_command(&grep_cmd) + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 12 + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + mod common_parallel { use std::cmp; use std::fs::{File, OpenOptions, copy}; @@ -3250,43 +3285,8 @@ mod common_parallel { #[test] fn test_direct_kernel_boot() { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - - let kernel_path = direct_kernel_boot_path(); - - let mut child = GuestCommand::new(&guest) - .default_cpus() - .default_memory() - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .default_net() - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - assert_eq!(guest.get_cpu_count().unwrap_or_default(), 1); - assert!(guest.get_total_memory().unwrap_or_default() > 480_000); - - let grep_cmd = 
format!("grep -c {} /proc/interrupts", get_msi_interrupt_pattern()); - assert_eq!( - guest - .ssh_command(&grep_cmd) - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 12 - ); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); + let guest = GuestFactory::new_regular_guest_factory().create_guest(Box::new(disk_config)); + _test_direct_kernel_boot(&guest); } #[test] From af4a14fa70d0f21e6386f0d7db81757bbd65cc3e Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 7 Mar 2026 15:56:23 -0800 Subject: [PATCH 210/742] tests: Add direct kernel boot test for confidential VMs Add test_direct_kernel_boot to the common_cvm integration test module to verify that boot, CPU, memory, and MSI interrupt functionality work correctly in confidential guest environments. The test creates an Ubuntu Jammy-based confidential VM using GuestFactory and delegates to the existing _test_direct_kernel_boot helper. Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 2fc61ef51c..fed59fbe7e 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -15018,4 +15018,12 @@ mod common_cvm { GuestFactory::new_confidential_guest_factory().create_guest(Box::new(disk_config)); _test_pci_multiple_segments(&guest, num_pci_segments, 5); } + + #[test] + fn test_direct_kernel_boot() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = + GuestFactory::new_confidential_guest_factory().create_guest(Box::new(disk_config)); + _test_direct_kernel_boot(&guest); + } } From ec730fde21701f3a466bac816702e19ad4ab3331 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 7 Mar 2026 22:50:03 -0800 Subject: [PATCH 211/742] tests: Refactor _test_virtio_block and utilities to module scope Move _test_virtio_block 
to module-level scope, accepting a Guest reference instead of an image name string. Replace hardcoded CPU, kernel, and cmdline arguments with default_cpus and default_kernel_cmdline. Promote all supporting disk utilities to module-level scope: compute_backing_checksum, disk_check_consistency, run_qemu_img, get_image_info, get_qcow2_v3_info, check_dirty_flag, check_corrupt_flag, set_corrupt_flag, resolve_disk_path, and compute_file_checksum. Update all test_virtio_block_* call sites in common_parallel to create guests via GuestFactory::new_regular_guest_factory() with 4 vCPUs and pass them to the helper. This enables reuse with different guest types such as confidential VMs. Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 692 +++++++++++++------------- 1 file changed, 337 insertions(+), 355 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index fed59fbe7e..de2eee6fbe 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -11,7 +11,7 @@ use std::collections::HashMap; use std::ffi::CStr; -use std::fs::OpenOptions; +use std::fs::{File, OpenOptions, copy}; use std::io::{BufRead, Read, Seek, SeekFrom, Write}; use std::net::TcpListener; use std::os::unix::io::AsRawFd; @@ -21,8 +21,9 @@ use std::string::String; use std::sync::mpsc::Receiver; use std::sync::{Mutex, mpsc}; use std::time::{Duration, SystemTime, UNIX_EPOCH}; -use std::{fs, io, thread}; +use std::{cmp, fs, io, thread}; +use block::ImageType; use net_util::MacAddr; use test_infra::*; use vmm_sys_util::tempdir::TempDir; @@ -2803,13 +2804,305 @@ fn _test_direct_kernel_boot(guest: &Guest) { handle_child_output(r, &output); } +fn _test_virtio_block( + guest: &Guest, + disable_io_uring: bool, + disable_aio: bool, + verify_os_disk: bool, + backing_files: bool, + image_type: ImageType, +) { + let mut workload_path = dirs::home_dir().unwrap(); + workload_path.push("workloads"); + + let mut 
blk_file_path = workload_path; + blk_file_path.push("blk.img"); + + let initial_backing_checksum = if verify_os_disk { + compute_backing_checksum(guest.disk_config.disk(DiskType::OperatingSystem).unwrap()) + } else { + None + }; + assert!( + guest.num_cpu >= 4, + "_test_virtio_block requires at least 4 CPUs to match num_queues=4" + ); + let mut cloud_child = GuestCommand::new(guest) + .default_cpus() + .args(["--memory", "size=512M,shared=on"]) + .default_kernel_cmdline() + .args([ + "--disk", + format!( + "path={},backing_files={},image_type={image_type}", + guest.disk_config.disk(DiskType::OperatingSystem).unwrap(), + if backing_files { "on" } else { "off" }, + ) + .as_str(), + format!( + "path={}", + guest.disk_config.disk(DiskType::CloudInit).unwrap() + ) + .as_str(), + format!( + "path={},readonly=on,direct=on,num_queues=4,_disable_io_uring={},_disable_aio={}", + blk_file_path.to_str().unwrap(), + disable_io_uring, + disable_aio, + ) + .as_str(), + ]) + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + // Check both if /dev/vdc exists and if the block size is 16M. + assert_eq!( + guest + .ssh_command("lsblk | grep vdc | grep -c 16M") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 1 + ); + + // Check both if /dev/vdc exists and if this block is RO. + assert_eq!( + guest + .ssh_command("lsblk | grep vdc | awk '{print $5}'") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 1 + ); + + // Check if the number of queues is 4. + assert_eq!( + guest + .ssh_command("ls -ll /sys/block/vdc/mq | grep ^d | wc -l") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 4 + ); + }); + + if verify_os_disk { + // Use clean shutdown to allow cloud-hypervisor to clear + // the dirty bit in the QCOW2 v3 image. 
+ kill_child(&mut cloud_child); + } else { + let _ = cloud_child.kill(); + } + let output = cloud_child.wait_with_output().unwrap(); + + handle_child_output(r, &output); + + if verify_os_disk { + disk_check_consistency( + guest.disk_config.disk(DiskType::OperatingSystem).unwrap(), + initial_backing_checksum, + ); + } +} + +fn compute_backing_checksum( + path_or_image_name: impl AsRef, +) -> Option<(std::path::PathBuf, String, u32)> { + let path = resolve_disk_path(path_or_image_name); + + let mut file = File::open(&path).ok()?; + if !matches!( + block::detect_image_type(&mut file).ok()?, + block::ImageType::Qcow2 + ) { + return None; + } + + let info = get_image_info(&path)?; + + let backing_file = info["backing-filename"].as_str()?; + let backing_path = if std::path::Path::new(backing_file).is_absolute() { + std::path::PathBuf::from(backing_file) + } else { + path.parent() + .unwrap_or_else(|| std::path::Path::new(".")) + .join(backing_file) + }; + + let backing_info = get_image_info(&backing_path)?; + let backing_format = backing_info["format"].as_str()?.to_string(); + let mut file = File::open(&backing_path).ok()?; + let file_size = file.metadata().ok()?.len(); + let checksum = compute_file_checksum(&mut file, file_size); + + Some((backing_path, backing_format, checksum)) +} + +/// Uses `qemu-img check` to verify disk image consistency. +/// +/// Supported formats are `qcow2` (compressed and uncompressed), +/// `vhdx`, `qed`, `parallels`, `vmdk`, and `vdi`. See man page +/// for more details. +/// +/// It takes either a full path to the image or just the name of +/// the image located in the `workloads` directory. +/// +/// For QCOW2 images with backing files, also verifies the backing file +/// integrity and checks that the backing file hasn't been modified +/// during the test. +/// +/// For QCOW2 v3 images, also verifies the dirty bit is cleared. 
+fn disk_check_consistency( + path_or_image_name: impl AsRef, + initial_backing_checksum: Option<(std::path::PathBuf, String, u32)>, +) { + let path = resolve_disk_path(path_or_image_name); + let output = run_qemu_img(&path, &["check"], None); + + assert!( + output.status.success(), + "qemu-img check failed: {}", + String::from_utf8_lossy(&output.stderr) + ); + + match check_dirty_flag(&path) { + Ok(Some(dirty)) => { + assert!(!dirty, "QCOW2 image shutdown unclean"); + } + Ok(None) => {} // Not a QCOW2 v3 image, skip dirty flag check + Err(e) => panic!("Failed to check dirty flag: {e}"), + } + + if let Some((backing_path, format, initial_checksum)) = initial_backing_checksum { + if format.parse::().ok() != Some(block::qcow::ImageType::Raw) { + let output = run_qemu_img(&backing_path, &["check"], None); + + assert!( + output.status.success(), + "qemu-img check of backing file failed: {}", + String::from_utf8_lossy(&output.stderr) + ); + } + + let mut file = File::open(&backing_path).unwrap(); + let file_size = file.metadata().unwrap().len(); + assert_eq!( + initial_checksum, + compute_file_checksum(&mut file, file_size) + ); + } +} + +fn run_qemu_img( + path: &std::path::Path, + args: &[&str], + trailing_args: Option<&[&str]>, +) -> std::process::Output { + let mut cmd = std::process::Command::new("qemu-img"); + cmd.arg(args[0]) + .args(&args[1..]) + .arg(path.to_str().unwrap()); + if let Some(extra) = trailing_args { + cmd.args(extra); + } + cmd.output().unwrap() +} + +fn get_image_info(path: &std::path::Path) -> Option { + let output = run_qemu_img(path, &["info", "-U", "--output=json"], None); + + output.status.success().then_some(())?; + serde_json::from_slice(&output.stdout).ok() +} + +fn get_qcow2_v3_info(path: &Path) -> Result, String> { + let info = get_image_info(path) + .ok_or_else(|| format!("qemu-img info failed for {}", path.display()))?; + if info["format"].as_str() != Some("qcow2") { + return Ok(None); + } + // QCOW2 v3 has compat "1.1", v2 has "0.10" 
+ if info["format-specific"]["data"]["compat"].as_str() != Some("1.1") { + return Ok(None); + } + Ok(Some(info)) +} + +fn check_dirty_flag(path: &Path) -> Result, String> { + Ok(get_qcow2_v3_info(path)?.and_then(|info| info["dirty-flag"].as_bool())) +} + +fn check_corrupt_flag(path: &Path) -> Result, String> { + Ok(get_qcow2_v3_info(path)? + .and_then(|info| info["format-specific"]["data"]["corrupt"].as_bool())) +} + +const QCOW2_INCOMPATIBLE_FEATURES_OFFSET: u64 = 72; + +fn set_corrupt_flag(path: &Path, corrupt: bool) -> io::Result<()> { + let mut file = OpenOptions::new().read(true).write(true).open(path)?; + + file.seek(SeekFrom::Start(QCOW2_INCOMPATIBLE_FEATURES_OFFSET))?; + let mut buf = [0u8; 8]; + file.read_exact(&mut buf)?; + let mut features = u64::from_be_bytes(buf); + + if corrupt { + features |= 0x02; + } else { + features &= !0x02; + } + + file.seek(SeekFrom::Start(QCOW2_INCOMPATIBLE_FEATURES_OFFSET))?; + file.write_all(&features.to_be_bytes())?; + file.sync_all()?; + Ok(()) +} + +fn resolve_disk_path(path_or_image_name: impl AsRef) -> std::path::PathBuf { + if path_or_image_name.as_ref().exists() { + // A full path is provided + path_or_image_name.as_ref().to_path_buf() + } else { + // An image name is provided + let mut workload_path = dirs::home_dir().unwrap(); + workload_path.push("workloads"); + workload_path.as_path().join(path_or_image_name.as_ref()) + } +} + +fn compute_file_checksum(reader: &mut dyn std::io::Read, size: u64) -> u32 { + // Read first 16MB or entire data if smaller + let read_size = cmp::min(size, 16 * 1024 * 1024) as usize; + + let mut buffer = vec![0u8; read_size]; + reader.read_exact(&mut buffer).unwrap(); + + // DJB2 hash + let mut hash: u32 = 5381; + for byte in buffer.iter() { + hash = hash.wrapping_mul(33).wrapping_add(*byte as u32); + } + hash +} + +fn make_virtio_block_guest(factory: &GuestFactory, image_name: &str) -> Guest { + let disk_config = UbuntuDiskConfig::new(image_name.to_string()); + 
factory.create_guest(Box::new(disk_config)).with_cpu(4) +} + mod common_parallel { - use std::cmp; - use std::fs::{File, OpenOptions, copy}; use std::io::{self, SeekFrom}; use std::process::Command; - use block::ImageType; use test_infra::GuestFactory; use crate::*; @@ -3335,224 +3628,27 @@ mod common_parallel { handle_child_output(r, &output); } - fn _test_virtio_block( - image_name: &str, - disable_io_uring: bool, - disable_aio: bool, - verify_os_disk: bool, - backing_files: bool, - image_type: ImageType, - ) { - let disk_config = UbuntuDiskConfig::new(image_name.to_string()); - let guest = Guest::new(Box::new(disk_config)); - - let mut workload_path = dirs::home_dir().unwrap(); - workload_path.push("workloads"); - - let mut blk_file_path = workload_path; - blk_file_path.push("blk.img"); - - let kernel_path = direct_kernel_boot_path(); - - let initial_backing_checksum = if verify_os_disk { - compute_backing_checksum(guest.disk_config.disk(DiskType::OperatingSystem).unwrap()) - } else { - None - }; - - let mut cloud_child = GuestCommand::new(&guest) - .args(["--cpus", "boot=4"]) - .args(["--memory", "size=512M,shared=on"]) - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .args([ - "--disk", - format!( - "path={},backing_files={},image_type={image_type}", - guest.disk_config.disk(DiskType::OperatingSystem).unwrap(), - if backing_files { "on"} else {"off"}, - ) - .as_str(), - format!( - "path={}", - guest.disk_config.disk(DiskType::CloudInit).unwrap() - ) - .as_str(), - format!( - "path={},readonly=on,direct=on,num_queues=4,_disable_io_uring={},_disable_aio={}", - blk_file_path.to_str().unwrap(), - disable_io_uring, - disable_aio, - ) - .as_str(), - ]) - .default_net() - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - // Check both if /dev/vdc exists and if the block size is 16M. 
- assert_eq!( - guest - .ssh_command("lsblk | grep vdc | grep -c 16M") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 1 - ); - - // Check both if /dev/vdc exists and if this block is RO. - assert_eq!( - guest - .ssh_command("lsblk | grep vdc | awk '{print $5}'") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 1 - ); - - // Check if the number of queues is 4. - assert_eq!( - guest - .ssh_command("ls -ll /sys/block/vdc/mq | grep ^d | wc -l") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 4 - ); - }); - - if verify_os_disk { - // Use clean shutdown to allow cloud-hypervisor to clear - // the dirty bit in the QCOW2 v3 image. - kill_child(&mut cloud_child); - } else { - let _ = cloud_child.kill(); - } - let output = cloud_child.wait_with_output().unwrap(); - - handle_child_output(r, &output); - - if verify_os_disk { - disk_check_consistency( - guest.disk_config.disk(DiskType::OperatingSystem).unwrap(), - initial_backing_checksum, - ); - } - } - #[test] fn test_virtio_block_io_uring() { - _test_virtio_block(FOCAL_IMAGE_NAME, false, true, false, false, ImageType::Raw); + let guest = + make_virtio_block_guest(&GuestFactory::new_regular_guest_factory(), FOCAL_IMAGE_NAME); + _test_virtio_block(&guest, false, true, false, false, ImageType::Raw); } #[test] fn test_virtio_block_aio() { - _test_virtio_block(FOCAL_IMAGE_NAME, true, false, false, false, ImageType::Raw); + let guest = + make_virtio_block_guest(&GuestFactory::new_regular_guest_factory(), FOCAL_IMAGE_NAME) + .with_cpu(4); + _test_virtio_block(&guest, true, false, false, false, ImageType::Raw); } #[test] fn test_virtio_block_sync() { - _test_virtio_block(FOCAL_IMAGE_NAME, true, true, false, false, ImageType::Raw); - } - - fn run_qemu_img( - path: &std::path::Path, - args: &[&str], - trailing_args: Option<&[&str]>, - ) -> std::process::Output { - let mut cmd = std::process::Command::new("qemu-img"); - cmd.arg(args[0]) - .args(&args[1..]) - .arg(path.to_str().unwrap()); - if let 
Some(extra) = trailing_args { - cmd.args(extra); - } - cmd.output().unwrap() - } - - fn get_image_info(path: &std::path::Path) -> Option { - let output = run_qemu_img(path, &["info", "-U", "--output=json"], None); - - output.status.success().then_some(())?; - serde_json::from_slice(&output.stdout).ok() - } - - fn get_qcow2_v3_info(path: &Path) -> Result, String> { - let info = get_image_info(path) - .ok_or_else(|| format!("qemu-img info failed for {}", path.display()))?; - if info["format"].as_str() != Some("qcow2") { - return Ok(None); - } - // QCOW2 v3 has compat "1.1", v2 has "0.10" - if info["format-specific"]["data"]["compat"].as_str() != Some("1.1") { - return Ok(None); - } - Ok(Some(info)) - } - - fn check_dirty_flag(path: &Path) -> Result, String> { - Ok(get_qcow2_v3_info(path)?.and_then(|info| info["dirty-flag"].as_bool())) - } - - fn check_corrupt_flag(path: &Path) -> Result, String> { - Ok(get_qcow2_v3_info(path)? - .and_then(|info| info["format-specific"]["data"]["corrupt"].as_bool())) - } - - const QCOW2_INCOMPATIBLE_FEATURES_OFFSET: u64 = 72; - - fn set_corrupt_flag(path: &Path, corrupt: bool) -> io::Result<()> { - let mut file = OpenOptions::new().read(true).write(true).open(path)?; - - file.seek(SeekFrom::Start(QCOW2_INCOMPATIBLE_FEATURES_OFFSET))?; - let mut buf = [0u8; 8]; - file.read_exact(&mut buf)?; - let mut features = u64::from_be_bytes(buf); - - if corrupt { - features |= 0x02; - } else { - features &= !0x02; - } - - file.seek(SeekFrom::Start(QCOW2_INCOMPATIBLE_FEATURES_OFFSET))?; - file.write_all(&features.to_be_bytes())?; - file.sync_all()?; - Ok(()) - } - - fn resolve_disk_path(path_or_image_name: impl AsRef) -> std::path::PathBuf { - if path_or_image_name.as_ref().exists() { - // A full path is provided - path_or_image_name.as_ref().to_path_buf() - } else { - // An image name is provided - let mut workload_path = dirs::home_dir().unwrap(); - workload_path.push("workloads"); - workload_path.as_path().join(path_or_image_name.as_ref()) - } 
- } - - fn compute_file_checksum(reader: &mut dyn std::io::Read, size: u64) -> u32 { - // Read first 16MB or entire data if smaller - let read_size = cmp::min(size, 16 * 1024 * 1024) as usize; - - let mut buffer = vec![0u8; read_size]; - reader.read_exact(&mut buffer).unwrap(); - - // DJB2 hash - let mut hash: u32 = 5381; - for byte in buffer.iter() { - hash = hash.wrapping_mul(33).wrapping_add(*byte as u32); - } - hash + let guest = + make_virtio_block_guest(&GuestFactory::new_regular_guest_factory(), FOCAL_IMAGE_NAME) + .with_cpu(4); + _test_virtio_block(&guest, true, true, false, false, ImageType::Raw); } #[test] @@ -3605,164 +3701,58 @@ mod common_parallel { assert_eq!(position, 16 * 1024 * 1024); } - fn compute_backing_checksum( - path_or_image_name: impl AsRef, - ) -> Option<(std::path::PathBuf, String, u32)> { - let path = resolve_disk_path(path_or_image_name); - - let mut file = File::open(&path).ok()?; - if !matches!( - block::detect_image_type(&mut file).ok()?, - block::ImageType::Qcow2 - ) { - return None; - } - - let info = get_image_info(&path)?; - - let backing_file = info["backing-filename"].as_str()?; - let backing_path = if std::path::Path::new(backing_file).is_absolute() { - std::path::PathBuf::from(backing_file) - } else { - path.parent() - .unwrap_or_else(|| std::path::Path::new(".")) - .join(backing_file) - }; - - let backing_info = get_image_info(&backing_path)?; - let backing_format = backing_info["format"].as_str()?.to_string(); - let mut file = File::open(&backing_path).ok()?; - let file_size = file.metadata().ok()?.len(); - let checksum = compute_file_checksum(&mut file, file_size); - - Some((backing_path, backing_format, checksum)) - } - - /// Uses `qemu-img check` to verify disk image consistency. - /// - /// Supported formats are `qcow2` (compressed and uncompressed), - /// `vhdx`, `qed`, `parallels`, `vmdk`, and `vdi`. See man page - /// for more details. 
- /// - /// It takes either a full path to the image or just the name of - /// the image located in the `workloads` directory. - /// - /// For QCOW2 images with backing files, also verifies the backing file - /// integrity and checks that the backing file hasn't been modified - /// during the test. - /// - /// For QCOW2 v3 images, also verifies the dirty bit is cleared. - fn disk_check_consistency( - path_or_image_name: impl AsRef, - initial_backing_checksum: Option<(std::path::PathBuf, String, u32)>, - ) { - let path = resolve_disk_path(path_or_image_name); - let output = run_qemu_img(&path, &["check"], None); - - assert!( - output.status.success(), - "qemu-img check failed: {}", - String::from_utf8_lossy(&output.stderr) - ); - - match check_dirty_flag(&path) { - Ok(Some(dirty)) => { - assert!(!dirty, "QCOW2 image shutdown unclean"); - } - Ok(None) => {} // Not a QCOW2 v3 image, skip dirty flag check - Err(e) => panic!("Failed to check dirty flag: {e}"), - } - - if let Some((backing_path, format, initial_checksum)) = initial_backing_checksum { - if format.parse::().ok() != Some(block::qcow::ImageType::Raw) { - let output = run_qemu_img(&backing_path, &["check"], None); - - assert!( - output.status.success(), - "qemu-img check of backing file failed: {}", - String::from_utf8_lossy(&output.stderr) - ); - } - - let mut file = File::open(&backing_path).unwrap(); - let file_size = file.metadata().unwrap().len(); - assert_eq!( - initial_checksum, - compute_file_checksum(&mut file, file_size) - ); - } - } - #[test] fn test_virtio_block_qcow2() { - _test_virtio_block( - JAMMY_IMAGE_NAME_QCOW2, - false, - false, - true, - false, - ImageType::Qcow2, - ); + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME_QCOW2.to_string()); + let guest = GuestFactory::new_regular_guest_factory() + .create_guest(Box::new(disk_config)) + .with_cpu(4); + _test_virtio_block(&guest, false, false, true, false, ImageType::Qcow2); } #[test] fn test_virtio_block_qcow2_zlib() { - 
_test_virtio_block( - JAMMY_IMAGE_NAME_QCOW2_ZLIB, - false, - false, - true, - false, - ImageType::Qcow2, - ); + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME_QCOW2_ZLIB.to_string()); + let guest = GuestFactory::new_regular_guest_factory() + .create_guest(Box::new(disk_config)) + .with_cpu(4); + _test_virtio_block(&guest, false, false, true, false, ImageType::Qcow2); } #[test] fn test_virtio_block_qcow2_zstd() { - _test_virtio_block( + let guest = make_virtio_block_guest( + &GuestFactory::new_regular_guest_factory(), JAMMY_IMAGE_NAME_QCOW2_ZSTD, - false, - false, - true, - false, - ImageType::Qcow2, ); + _test_virtio_block(&guest, false, false, true, false, ImageType::Qcow2); } #[test] fn test_virtio_block_qcow2_backing_zstd_file() { - _test_virtio_block( + let guest = make_virtio_block_guest( + &GuestFactory::new_regular_guest_factory(), JAMMY_IMAGE_NAME_QCOW2_BACKING_ZSTD_FILE, - false, - false, - true, - true, - ImageType::Qcow2, ); + _test_virtio_block(&guest, false, false, true, true, ImageType::Qcow2); } #[test] fn test_virtio_block_qcow2_backing_uncompressed_file() { - _test_virtio_block( + let guest = make_virtio_block_guest( + &GuestFactory::new_regular_guest_factory(), JAMMY_IMAGE_NAME_QCOW2_BACKING_UNCOMPRESSED_FILE, - false, - false, - true, - true, - ImageType::Qcow2, ); + _test_virtio_block(&guest, false, false, true, true, ImageType::Qcow2); } #[test] fn test_virtio_block_qcow2_backing_raw_file() { - _test_virtio_block( + let guest = make_virtio_block_guest( + &GuestFactory::new_regular_guest_factory(), JAMMY_IMAGE_NAME_QCOW2_BACKING_RAW_FILE, - false, - false, - true, - true, - ImageType::Qcow2, ); + _test_virtio_block(&guest, false, false, true, true, ImageType::Qcow2); } /// Configuration for QCOW2 multiqueue test image setup @@ -4642,15 +4632,11 @@ mod common_parallel { .arg(vhd_file_path.to_str().unwrap()) .output() .expect("Expect generating VHD image from RAW image"); - - _test_virtio_block( + let guest = make_virtio_block_guest( + 
&GuestFactory::new_regular_guest_factory(), FOCAL_IMAGE_NAME_VHD, - false, - false, - false, - false, - ImageType::FixedVhd, ); + _test_virtio_block(&guest, false, false, false, false, ImageType::FixedVhd); } #[test] @@ -4673,15 +4659,11 @@ mod common_parallel { .arg(vhdx_file_path.to_str().unwrap()) .output() .expect("Expect generating dynamic VHDx image from RAW image"); - - _test_virtio_block( + let guest = make_virtio_block_guest( + &GuestFactory::new_regular_guest_factory(), FOCAL_IMAGE_NAME_VHDX, - false, - false, - true, - false, - ImageType::Vhdx, ); + _test_virtio_block(&guest, false, false, true, false, ImageType::Vhdx); } #[test] From 297b683fcb6c9a9b225d04213dc04b0faf8f9715 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 7 Mar 2026 23:07:59 -0800 Subject: [PATCH 212/742] tests: Add virtio block tests for confidential VMs Add a full suite of test_virtio_block_* tests to the common_cvm integration test module to verify virtio block functionality in confidential guest environments. The following tests are added, all using 4-vCPU confidential VMs created via GuestFactory: - test_virtio_block_io_uring (Raw image, io_uring backend) - test_virtio_block_aio (Raw image, AIO backend) - test_virtio_block_sync (Raw image, sync backend) - test_virtio_block_qcow2 (QCOW2 image) - test_virtio_block_qcow2_zlib (QCOW2 with zlib compression) - test_virtio_block_qcow2_zstd (QCOW2 with zstd compression) - test_virtio_block_qcow2_backing_zstd_file - test_virtio_block_qcow2_backing_uncompressed_file - test_virtio_block_qcow2_backing_raw_file This extends CVM test coverage to all virtio block I/O backends and disk image formats. 
Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 83 +++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index de2eee6fbe..8f6dec37a9 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -15008,4 +15008,87 @@ mod common_cvm { GuestFactory::new_confidential_guest_factory().create_guest(Box::new(disk_config)); _test_direct_kernel_boot(&guest); } + + #[test] + fn test_virtio_block_io_uring() { + let guest = make_virtio_block_guest( + &GuestFactory::new_confidential_guest_factory(), + FOCAL_IMAGE_NAME, + ); + _test_virtio_block(&guest, false, true, false, false, ImageType::Raw); + } + + #[test] + fn test_virtio_block_aio() { + let guest = make_virtio_block_guest( + &GuestFactory::new_confidential_guest_factory(), + FOCAL_IMAGE_NAME, + ); + _test_virtio_block(&guest, true, false, false, false, ImageType::Raw); + } + + #[test] + fn test_virtio_block_sync() { + let guest = make_virtio_block_guest( + &GuestFactory::new_confidential_guest_factory(), + FOCAL_IMAGE_NAME, + ); + _test_virtio_block(&guest, true, true, false, false, ImageType::Raw); + } + + #[test] + fn test_virtio_block_qcow2() { + let guest = make_virtio_block_guest( + &GuestFactory::new_confidential_guest_factory(), + JAMMY_IMAGE_NAME_QCOW2, + ); + _test_virtio_block(&guest, false, false, true, false, ImageType::Qcow2); + } + + #[test] + fn test_virtio_block_qcow2_zlib() { + let guest = make_virtio_block_guest( + &GuestFactory::new_confidential_guest_factory(), + JAMMY_IMAGE_NAME_QCOW2_ZLIB, + ); + _test_virtio_block(&guest, false, false, true, false, ImageType::Qcow2); + } + + #[test] + fn test_virtio_block_qcow2_zstd() { + let guest = make_virtio_block_guest( + &GuestFactory::new_confidential_guest_factory(), + JAMMY_IMAGE_NAME_QCOW2_ZSTD, + ); + _test_virtio_block(&guest, false, false, true, false, ImageType::Qcow2); + } + + #[test] + fn 
test_virtio_block_qcow2_backing_zstd_file() { + let guest = make_virtio_block_guest( + &GuestFactory::new_confidential_guest_factory(), + JAMMY_IMAGE_NAME_QCOW2_BACKING_ZSTD_FILE, + ); + + _test_virtio_block(&guest, false, false, true, true, ImageType::Qcow2); + } + + #[test] + fn test_virtio_block_qcow2_backing_uncompressed_file() { + let guest = make_virtio_block_guest( + &GuestFactory::new_confidential_guest_factory(), + JAMMY_IMAGE_NAME_QCOW2_BACKING_UNCOMPRESSED_FILE, + ); + + _test_virtio_block(&guest, false, false, true, true, ImageType::Qcow2); + } + + #[test] + fn test_virtio_block_qcow2_backing_raw_file() { + let guest = make_virtio_block_guest( + &GuestFactory::new_confidential_guest_factory(), + JAMMY_IMAGE_NAME_QCOW2_BACKING_RAW_FILE, + ); + _test_virtio_block(&guest, false, false, true, true, ImageType::Qcow2); + } } From a15bfcfee6ce7dc1e42b131ea9d1a7ec1722ad8a Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 12 Mar 2026 20:13:01 +0100 Subject: [PATCH 213/742] block: Add disk_file module skeleton Composable disk capability traits with DiskFile as a supertrait bundling DiskSize and Geometry. Optional capabilities are separate traits: PhysicalSize, DiskFd, SparseCapable, Resizable. AsyncDiskFile extends DiskFile with async I/O construction. Empty module with doc comment, trait definitions follow. Signed-off-by: Anatol Belski --- block/src/disk_file.rs | 34 ++++++++++++++++++++++++++++++++++ block/src/lib.rs | 1 + 2 files changed, 35 insertions(+) create mode 100644 block/src/disk_file.rs diff --git a/block/src/disk_file.rs b/block/src/disk_file.rs new file mode 100644 index 0000000000..30efaff551 --- /dev/null +++ b/block/src/disk_file.rs @@ -0,0 +1,34 @@ +// Copyright 2026 The Cloud Hypervisor Authors. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +//! Composable disk capability traits for the block crate. +//! +//! Small traits define individual capabilities: +//! +//! - [`DiskSize`] - reported capacity (logical size) +//! 
- [`PhysicalSize`] - host allocation size +//! - [`DiskFd`] - backing file descriptor access +//! - [`Geometry`] - sector/cluster geometry (default 512B) +//! - [`SparseCapable`] - sparse and zero flag support +//! - [`Resizable`] - online resize +//! +//! [`DiskFile`] is a supertrait that bundles the universal capabilities +//! (`DiskSize` + `Geometry`). [`FullDiskFile`] adds all optional +//! capabilities. [`AsyncDiskFile`] extends `DiskFile` with async I/O +//! construction for virtio queue workers. [`AsyncFullDiskFile`] +//! combines both axes. +//! +//! ```text +//! DiskFile: DiskSize + Geometry + Sync +//! / \ +//! FullDiskFile: AsyncDiskFile: +//! DiskFile + PhysicalSize + DiskFile + Unpin +//! DiskFd + SparseCapable + try_clone, new_async_io +//! Resizable +//! \ / +//! AsyncFullDiskFile: FullDiskFile + AsyncDiskFile +//! ``` +//! +//! Readonly accessors take `&self`. Only [`Resizable::resize`] requires +//! `&mut self`. Errors are returned as [`BlockResult`]. diff --git a/block/src/lib.rs b/block/src/lib.rs index 4a6ea4979d..288db3fbfd 100644 --- a/block/src/lib.rs +++ b/block/src/lib.rs @@ -9,6 +9,7 @@ // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause pub mod async_io; +pub mod disk_file; pub mod error; pub mod fcntl; pub mod fixed_vhd; From 85e4e5027c42afa79273221935ff8d1cc26c12d7 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 12 Mar 2026 20:13:47 +0100 Subject: [PATCH 214/742] block: disk_file: Add DiskSize trait Reported capacity of a disk image. Every format, be it file backed, network, memory, exposes a logical size. Single method logical_size() returning the virtual size in bytes. Signed-off-by: Anatol Belski --- block/src/disk_file.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/block/src/disk_file.rs b/block/src/disk_file.rs index 30efaff551..3f7edfabf1 100644 --- a/block/src/disk_file.rs +++ b/block/src/disk_file.rs @@ -32,3 +32,13 @@ //! //! Readonly accessors take `&self`. Only [`Resizable::resize`] requires //! 
`&mut self`. Errors are returned as [`BlockResult`]. + +use std::fmt::Debug; + +use crate::BlockResult; + +/// Reported capacity of a disk image. +pub trait DiskSize: Send + Debug { + /// Virtual size of the disk image in bytes (reported capacity). + fn logical_size(&self) -> BlockResult; +} From 2df731b8d3817de0aab0732cdfccdb9e532b1f96 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 12 Mar 2026 20:14:57 +0100 Subject: [PATCH 215/742] block: disk_file: Add PhysicalSize trait Host allocation size for file-backed disk images. Reports actual bytes occupied on the host filesystem. Not every format supports this, e.g. network or memory backed disks. Signed-off-by: Anatol Belski --- block/src/disk_file.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/block/src/disk_file.rs b/block/src/disk_file.rs index 3f7edfabf1..1dbaf19458 100644 --- a/block/src/disk_file.rs +++ b/block/src/disk_file.rs @@ -42,3 +42,9 @@ pub trait DiskSize: Send + Debug { /// Virtual size of the disk image in bytes (reported capacity). fn logical_size(&self) -> BlockResult; } + +/// Host allocation size of a file-backed disk image. +pub trait PhysicalSize: Send + Debug { + /// Actual bytes occupied on the host filesystem. + fn physical_size(&self) -> BlockResult; +} From 2fa877e087b86c4d03aacbb46b82827ab6b72336 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 12 Mar 2026 20:19:00 +0100 Subject: [PATCH 216/742] block: disk_file: Add DiskFd trait Backing file descriptor access for disk images backed by a file. Returns a BorrowedDiskFd that wraps the raw fd with lifetime tracking. Not available for network or memory backed disk formats. 
Signed-off-by: Anatol Belski --- block/src/disk_file.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/block/src/disk_file.rs b/block/src/disk_file.rs index 1dbaf19458..31c5ee085e 100644 --- a/block/src/disk_file.rs +++ b/block/src/disk_file.rs @@ -36,6 +36,7 @@ use std::fmt::Debug; use crate::BlockResult; +use crate::async_io::BorrowedDiskFd; /// Reported capacity of a disk image. pub trait DiskSize: Send + Debug { @@ -48,3 +49,9 @@ pub trait PhysicalSize: Send + Debug { /// Actual bytes occupied on the host filesystem. fn physical_size(&self) -> BlockResult; } + +/// Backing file descriptor access for disk images backed by a file. +pub trait DiskFd: Send + Debug { + /// Borrows the underlying file descriptor. + fn fd(&self) -> BorrowedDiskFd<'_>; +} From b98a5cc58dcd9951e1333d38a59a82f910158cca Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 12 Mar 2026 20:22:55 +0100 Subject: [PATCH 217/742] block: disk_file: Add Geometry trait Sector and cluster geometry of a disk image. Returns DiskTopology with a default implementation providing 512B logical and physical block sizes. Formats that probe the underlying device override this. Signed-off-by: Anatol Belski --- block/src/disk_file.rs | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/block/src/disk_file.rs b/block/src/disk_file.rs index 31c5ee085e..9dd52be6dc 100644 --- a/block/src/disk_file.rs +++ b/block/src/disk_file.rs @@ -35,8 +35,8 @@ use std::fmt::Debug; -use crate::BlockResult; use crate::async_io::BorrowedDiskFd; +use crate::{BlockResult, DiskTopology}; /// Reported capacity of a disk image. pub trait DiskSize: Send + Debug { @@ -55,3 +55,13 @@ pub trait DiskFd: Send + Debug { /// Borrows the underlying file descriptor. fn fd(&self) -> BorrowedDiskFd<'_>; } + +/// Sector and cluster geometry of a disk image. +/// +/// Default returns `DiskTopology::default()` (512B logical/physical). +pub trait Geometry: Send + Debug { + /// Returns the disk topology. 
+ fn topology(&self) -> DiskTopology { + DiskTopology::default() + } +} From a0a5718b420c73ca233802aa1b775178509eca5b Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 12 Mar 2026 20:24:11 +0100 Subject: [PATCH 218/742] block: disk_file: Add SparseCapable trait Sparse and zero flag support for thin provisioned disk images. Two methods with false defaults: sparse operations (punch hole, write zeroes, discard) and zero flag optimization in WRITE_ZEROES. Signed-off-by: Anatol Belski --- block/src/disk_file.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/block/src/disk_file.rs b/block/src/disk_file.rs index 9dd52be6dc..3b3aa0fec3 100644 --- a/block/src/disk_file.rs +++ b/block/src/disk_file.rs @@ -65,3 +65,19 @@ pub trait Geometry: Send + Debug { DiskTopology::default() } } + +/// Sparse and zero flag support for thin provisioned disk images. +pub trait SparseCapable: Send + Debug { + /// Indicates support for sparse operations (punch hole, write zeroes, discard). + fn supports_sparse_operations(&self) -> bool { + false + } + + /// Indicates support for a metadata level zero flag optimization in + /// virtio `VIRTIO_BLK_T_WRITE_ZEROES` requests. When true, the format + /// can mark regions as reading zeros via a metadata bit rather than + /// writing actual zero bytes to disk. + fn supports_zero_flag(&self) -> bool { + false + } +} From da1b1044937e946a3d382737a1986cede158e822 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 12 Mar 2026 20:25:01 +0100 Subject: [PATCH 219/742] block: disk_file: Add Resizable trait Live disk resize support. Single method resize() taking &mut self and the new size in bytes. Implementations may return an error if the backend does not support resizing. 
Signed-off-by: Anatol Belski --- block/src/disk_file.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/block/src/disk_file.rs b/block/src/disk_file.rs index 3b3aa0fec3..69a1152c85 100644 --- a/block/src/disk_file.rs +++ b/block/src/disk_file.rs @@ -81,3 +81,12 @@ pub trait SparseCapable: Send + Debug { false } } + +/// Live disk resize support. +/// +/// Implementations may return an error if the backend does not +/// support resizing (e.g. fixed size formats). +pub trait Resizable: Send + Debug { + /// Resizes the disk image to the given size in bytes, if the backend supports it. + fn resize(&mut self, size: u64) -> BlockResult<()>; +} From 5d50d646715b9447ab39c61d8a21ced3a857fb2d Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 12 Mar 2026 20:35:13 +0100 Subject: [PATCH 220/742] block: disk_file: Add DiskFile supertrait Bundles DiskSize and Geometry as the universal disk capabilities every format must implement. Adds Sync so that Arc can be shared across threads for concurrent readonly access. Signed-off-by: Anatol Belski --- block/src/disk_file.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/block/src/disk_file.rs b/block/src/disk_file.rs index 69a1152c85..632b370e7c 100644 --- a/block/src/disk_file.rs +++ b/block/src/disk_file.rs @@ -90,3 +90,10 @@ pub trait Resizable: Send + Debug { /// Resizes the disk image to the given size in bytes, if the backend supports it. fn resize(&mut self, size: u64) -> BlockResult<()>; } + +/// Supertrait bundling universal disk capabilities. +/// +/// Every disk format implements `DiskSize` and `Geometry`. +/// `Sync` is required so that `Arc` can be shared +/// across threads for concurrent readonly access. 
+pub trait DiskFile: DiskSize + Geometry + Sync {} From 50741cca2a7bfaf4814eb4b95d4a4025042b90c9 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 12 Mar 2026 20:45:16 +0100 Subject: [PATCH 221/742] block: disk_file: Add AsyncDiskFile trait Extend DiskFile with async I/O construction for virtio queue workers. AsyncDiskFile adds try_clone() for creating independent handles to the same backing storage, and new_async_io() for constructing an async I/O engine at the given ring depth. Bounds: DiskFile + Unpin. Unpin ensures trait objects can be moved freely (all concrete disk file types are naturally Unpin since they hold no self referential state). Signed-off-by: Anatol Belski --- block/src/disk_file.rs | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/block/src/disk_file.rs b/block/src/disk_file.rs index 632b370e7c..2f280e3315 100644 --- a/block/src/disk_file.rs +++ b/block/src/disk_file.rs @@ -35,7 +35,7 @@ use std::fmt::Debug; -use crate::async_io::BorrowedDiskFd; +use crate::async_io::{AsyncIo, BorrowedDiskFd}; use crate::{BlockResult, DiskTopology}; /// Reported capacity of a disk image. @@ -97,3 +97,33 @@ pub trait Resizable: Send + Debug { /// `Sync` is required so that `Arc` can be shared /// across threads for concurrent readonly access. pub trait DiskFile: DiskSize + Geometry + Sync {} + +/// Extended disk file trait for virtio queue workers. +/// +/// Adds cloning and async I/O construction on top of [`DiskFile`]. +/// `Unpin` is required so trait objects can be moved freely. +pub trait AsyncDiskFile: DiskFile + Unpin { + /// Creates an independent handle for a queue worker. + /// + /// The clone shares internally reference counted state (e.g. + /// `Arc`) with the original, but owns its own file + /// descriptor and I/O completion resources. Each virtio queue + /// gets one clone so that workers can operate in parallel + /// without contending on I/O state. 
+ /// + /// Returns `Box` (not `AsyncFullDiskFile`) + /// because clones only serve as data plane handles for queue + /// workers. The original remains the control plane for feature + /// negotiation and configuration. + fn try_clone(&self) -> BlockResult>; + + /// Constructs a per queue async I/O engine. + /// + /// # Arguments + /// + /// * `ring_depth` - maximum number of in flight I/O operations. + /// Callers typically pass the virtio queue size. Must be greater + /// than zero. Backends that do not use an async ring (e.g. sync + /// fallback implementations) may ignore this value. + fn new_async_io(&self, ring_depth: u32) -> BlockResult>; +} From 8623edb8aae2c4acaff56afd62c2b08f32521b51 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 12 Mar 2026 22:56:44 +0100 Subject: [PATCH 222/742] block: disk_file: Add FullDiskFile trait Marker trait bundling all optional capabilities (PhysicalSize, DiskFd, SparseCapable, Resizable) on top of DiskFile. Blanket impl covers any type implementing all constituent traits. Signed-off-by: Anatol Belski --- block/src/disk_file.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/block/src/disk_file.rs b/block/src/disk_file.rs index 2f280e3315..f76a99662f 100644 --- a/block/src/disk_file.rs +++ b/block/src/disk_file.rs @@ -98,6 +98,18 @@ pub trait Resizable: Send + Debug { /// across threads for concurrent readonly access. pub trait DiskFile: DiskSize + Geometry + Sync {} +/// Full capability disk file trait. +/// +/// Bundles all optional capabilities on top of [`DiskFile`]: +/// file descriptor access, physical size, sparse operations, and resize. +/// Used by consumers that need feature negotiation without async I/O +/// (e.g. vhost user block). +pub trait FullDiskFile: DiskFile + PhysicalSize + DiskFd + SparseCapable + Resizable {} + +/// Blanket implementation: any type implementing all constituent traits +/// automatically satisfies [`FullDiskFile`]. 
+impl FullDiskFile for T {} + /// Extended disk file trait for virtio queue workers. /// /// Adds cloning and async I/O construction on top of [`DiskFile`]. From dac8707c35e1cddf13c7df9848d31e3e05d36451 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 12 Mar 2026 22:57:36 +0100 Subject: [PATCH 223/742] block: disk_file: Add AsyncFullDiskFile trait Marker trait combining FullDiskFile and AsyncDiskFile. Blanket impl covers any type implementing both supertraits. Signed-off-by: Anatol Belski --- block/src/disk_file.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/block/src/disk_file.rs b/block/src/disk_file.rs index f76a99662f..9e057bd05b 100644 --- a/block/src/disk_file.rs +++ b/block/src/disk_file.rs @@ -139,3 +139,20 @@ pub trait AsyncDiskFile: DiskFile + Unpin { /// fallback implementations) may ignore this value. fn new_async_io(&self, ring_depth: u32) -> BlockResult>; } + +/// Full capability async disk file trait. +/// +/// Combines [`FullDiskFile`] (all optional capabilities) with +/// [`AsyncDiskFile`] (async I/O construction). This is the top level +/// trait for virtio block devices that need both feature negotiation +/// and async queue workers. +/// +/// The type narrowing on [`AsyncDiskFile::try_clone`] is intentional: +/// clones only serve as data plane handles for queue workers, while +/// the original `AsyncFullDiskFile` handle remains the control plane +/// for feature negotiation and configuration. +pub trait AsyncFullDiskFile: FullDiskFile + AsyncDiskFile {} + +/// Blanket implementation: any type implementing both [`FullDiskFile`] +/// and [`AsyncDiskFile`] automatically satisfies [`AsyncFullDiskFile`]. +impl AsyncFullDiskFile for T {} From ad789024f906c8610cb3b338dd71210cf27e0b90 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 12 Mar 2026 20:55:47 +0100 Subject: [PATCH 224/742] block: qcow_sync: Add Debug impl for QcowDiskSync The new composable traits require Debug. 
Implement it manually since QcowMetadata contains RwLock state that cannot auto derive. Signed-off-by: Anatol Belski --- block/src/qcow_sync.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/block/src/qcow_sync.rs b/block/src/qcow_sync.rs index e1ad08f4ab..73e7c89ec0 100644 --- a/block/src/qcow_sync.rs +++ b/block/src/qcow_sync.rs @@ -7,7 +7,7 @@ use std::collections::VecDeque; use std::fs::File; use std::os::fd::{AsFd, AsRawFd, BorrowedFd, OwnedFd, RawFd}; use std::sync::Arc; -use std::{io, ptr, slice}; +use std::{fmt, io, ptr, slice}; use vmm_sys_util::eventfd::EventFd; use vmm_sys_util::write_zeroes::{PunchHole, WriteZeroesAt}; @@ -180,6 +180,15 @@ pub struct QcowDiskSync { data_raw_file: QcowRawFile, } +impl fmt::Debug for QcowDiskSync { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("QcowDiskSync") + .field("sparse", &self.sparse) + .field("has_backing", &self.backing_file.is_some()) + .finish_non_exhaustive() + } +} + impl QcowDiskSync { pub fn new( file: File, From 0a7b6b089bcbb243a3d4b2e3b15ce9c81595da17 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 12 Mar 2026 20:56:32 +0100 Subject: [PATCH 225/742] block: qcow_sync: impl DiskSize for QcowDiskSync Delegate to QcowMetadata::virtual_size() which returns the guest visible capacity stored in the QCOW2 header. 
Signed-off-by: Anatol Belski --- block/src/qcow_sync.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/block/src/qcow_sync.rs b/block/src/qcow_sync.rs index 73e7c89ec0..ce06ad6816 100644 --- a/block/src/qcow_sync.rs +++ b/block/src/qcow_sync.rs @@ -15,6 +15,7 @@ use vmm_sys_util::write_zeroes::{PunchHole, WriteZeroesAt}; use crate::async_io::{ AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFile, DiskFileError, DiskFileResult, }; +use crate::disk_file; use crate::error::{BlockError, BlockErrorKind, BlockResult, ErrorOp}; use crate::qcow::metadata::{ BackingRead, ClusterReadMapping, ClusterWriteMapping, DeallocAction, QcowMetadata, @@ -277,6 +278,12 @@ impl Drop for QcowDiskSync { } } +impl disk_file::DiskSize for QcowDiskSync { + fn logical_size(&self) -> BlockResult { + Ok(self.metadata.virtual_size()) + } +} + pub struct QcowSync { metadata: Arc, data_file: QcowRawFile, From 4b731ee771387d7efec2cd12e5261a07dfa10431 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 12 Mar 2026 20:57:38 +0100 Subject: [PATCH 226/742] block: qcow_sync: impl PhysicalSize for QcowDiskSync Delegate to QcowRawFile::physical_size() which returns the actual host allocation size of the QCOW2 container file. Signed-off-by: Anatol Belski --- block/src/qcow_sync.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/block/src/qcow_sync.rs b/block/src/qcow_sync.rs index ce06ad6816..3ff5a64723 100644 --- a/block/src/qcow_sync.rs +++ b/block/src/qcow_sync.rs @@ -284,6 +284,12 @@ impl disk_file::DiskSize for QcowDiskSync { } } +impl disk_file::PhysicalSize for QcowDiskSync { + fn physical_size(&self) -> BlockResult { + Ok(self.data_raw_file.physical_size()?) 
+ } +} + pub struct QcowSync { metadata: Arc, data_file: QcowRawFile, From b305a7670bb88712ae9e0c1d3fe48f49d38b1b76 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 12 Mar 2026 20:58:26 +0100 Subject: [PATCH 227/742] block: qcow_sync: impl DiskFd for QcowDiskSync Borrows the raw file descriptor from the underlying QcowRawFile for fcntl() operations. Uses &self for shared access. Signed-off-by: Anatol Belski --- block/src/qcow_sync.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/block/src/qcow_sync.rs b/block/src/qcow_sync.rs index 3ff5a64723..a2898a49bc 100644 --- a/block/src/qcow_sync.rs +++ b/block/src/qcow_sync.rs @@ -290,6 +290,12 @@ impl disk_file::PhysicalSize for QcowDiskSync { } } +impl disk_file::DiskFd for QcowDiskSync { + fn fd(&self) -> BorrowedDiskFd<'_> { + BorrowedDiskFd::new(self.data_raw_file.as_fd().as_raw_fd()) + } +} + pub struct QcowSync { metadata: Arc, data_file: QcowRawFile, From 63f5e6e97f781563679b271df2da6f743f1b7794 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 12 Mar 2026 20:59:46 +0100 Subject: [PATCH 228/742] block: qcow_sync: impl Geometry for QcowDiskSync Uses the default DiskTopology (512B logical/physical) since QCOW2 does not probe the underlying device geometry. Signed-off-by: Anatol Belski --- block/src/qcow_sync.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/src/qcow_sync.rs b/block/src/qcow_sync.rs index a2898a49bc..56c19307f3 100644 --- a/block/src/qcow_sync.rs +++ b/block/src/qcow_sync.rs @@ -296,6 +296,8 @@ impl disk_file::DiskFd for QcowDiskSync { } } +impl disk_file::Geometry for QcowDiskSync {} + pub struct QcowSync { metadata: Arc, data_file: QcowRawFile, From 554562fef24d49cfab2dccee5e163c8da5bb5e7d Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 12 Mar 2026 21:06:48 +0100 Subject: [PATCH 229/742] block: qcow_sync: impl SparseCapable for QcowDiskSync Advertise support for sparse operations and the zero flag. QCOW2 inherently supports both through cluster deallocation. 
Signed-off-by: Anatol Belski --- block/src/qcow_sync.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/block/src/qcow_sync.rs b/block/src/qcow_sync.rs index 56c19307f3..c75e895e8b 100644 --- a/block/src/qcow_sync.rs +++ b/block/src/qcow_sync.rs @@ -298,6 +298,16 @@ impl disk_file::DiskFd for QcowDiskSync { impl disk_file::Geometry for QcowDiskSync {} +impl disk_file::SparseCapable for QcowDiskSync { + fn supports_sparse_operations(&self) -> bool { + true + } + + fn supports_zero_flag(&self) -> bool { + true + } +} + pub struct QcowSync { metadata: Arc, data_file: QcowRawFile, From f35bec19e8b2d5085874d2c2bd7eafa77d338c79 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 12 Mar 2026 21:20:08 +0100 Subject: [PATCH 230/742] block: qcow_sync: impl Resizable for QcowDiskSync Add ErrorOp::Resize variant and implement the Resizable trait. Resize is rejected when a backing file is present. Signed-off-by: Anatol Belski --- block/src/error.rs | 3 +++ block/src/qcow_sync.rs | 18 ++++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/block/src/error.rs b/block/src/error.rs index 4b89bbb212..ebaa33ec5b 100644 --- a/block/src/error.rs +++ b/block/src/error.rs @@ -68,6 +68,8 @@ pub enum ErrorOp { DetectImageType, /// Duplicating a backing-file descriptor. DupBackingFd, + /// Resizing a disk image. 
+ Resize, } impl Display for ErrorOp { @@ -76,6 +78,7 @@ Self::Open => write!(f, "open"), Self::DetectImageType => write!(f, "detect_image_type"), Self::DupBackingFd => write!(f, "dup_backing_fd"), + Self::Resize => write!(f, "resize"), } } } diff --git a/block/src/qcow_sync.rs b/block/src/qcow_sync.rs index c75e895e8b..4c2611a5a4 100644 --- a/block/src/qcow_sync.rs +++ b/block/src/qcow_sync.rs @@ -308,6 +308,24 @@ impl disk_file::SparseCapable for QcowDiskSync { } } +impl disk_file::Resizable for QcowDiskSync { + fn resize(&mut self, size: u64) -> BlockResult<()> { + if self.backing_file.is_some() { + return Err(BlockError::new( + BlockErrorKind::UnsupportedFeature, + DiskFileError::ResizeError(io::Error::other( + "resize not supported with backing file", + )), + ) + .with_op(ErrorOp::Resize)); + } + self.metadata.resize(size).map_err(|e| { + BlockError::new(BlockErrorKind::Io, DiskFileError::ResizeError(e)) + .with_op(ErrorOp::Resize) + }) + } +} + pub struct QcowSync { metadata: Arc, data_file: QcowRawFile, From 1fc8e4adb6c7aebd890ef270e7006ca749b580af Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 12 Mar 2026 21:25:40 +0100 Subject: [PATCH 231/742] block: qcow_sync: impl DiskFile for QcowDiskSync Marker impl binding the DiskSize and Geometry supertraits.
Signed-off-by: Anatol Belski --- block/src/qcow_sync.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/src/qcow_sync.rs b/block/src/qcow_sync.rs index 4c2611a5a4..e9b64726ee 100644 --- a/block/src/qcow_sync.rs +++ b/block/src/qcow_sync.rs @@ -326,6 +326,8 @@ impl disk_file::Resizable for QcowDiskSync { } } +impl disk_file::DiskFile for QcowDiskSync {} + pub struct QcowSync { metadata: Arc, data_file: QcowRawFile, From fe6e3e8feff2fc5b9266c1cfb4e8ee18d30c14ec Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 12 Mar 2026 21:30:43 +0100 Subject: [PATCH 232/742] block: qcow_sync: impl AsyncDiskFile for QcowDiskSync Implement try_clone by sharing the metadata Arc and cloning the data file descriptor. The new_async_io method creates a QcowSync worker identical to the async_io::DiskFile version. Signed-off-by: Anatol Belski --- block/src/qcow_sync.rs | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/block/src/qcow_sync.rs b/block/src/qcow_sync.rs index e9b64726ee..e539754fcb 100644 --- a/block/src/qcow_sync.rs +++ b/block/src/qcow_sync.rs @@ -328,6 +328,28 @@ impl disk_file::Resizable for QcowDiskSync { impl disk_file::DiskFile for QcowDiskSync {} +impl disk_file::AsyncDiskFile for QcowDiskSync { + fn try_clone(&self) -> BlockResult> { + Ok(Box::new(QcowDiskSync { + metadata: Arc::clone(&self.metadata), + backing_file: self.backing_file.as_ref().map(Arc::clone), + sparse: self.sparse, + data_raw_file: self.data_raw_file.clone(), + })) + } + + // ring_depth is unused - this sync backend performs blocking I/O + // instead of submitting to an async ring. 
+ fn new_async_io(&self, _ring_depth: u32) -> BlockResult> { + Ok(Box::new(QcowSync::new( + Arc::clone(&self.metadata), + self.data_raw_file.clone(), + self.backing_file.as_ref().map(Arc::clone), + self.sparse, + ))) + } +} + pub struct QcowSync { metadata: Arc, data_file: QcowRawFile, From 264013b4243c00355dab68aedd7e9c868c104326 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 12 Mar 2026 23:45:35 +0100 Subject: [PATCH 233/742] block: disk_file: Add DiskBackend dispatch enum Introduce DiskBackend with two variants: - Legacy: wraps Box for existing formats - Next: wraps Box Methods return BlockResult, with DiskFileError converted up to BlockError on the Legacy path. The Next path passes through directly with zero conversion overhead. This is a transitional type. Once all formats implement AsyncFullDiskFile, DiskBackend and Legacy are removed and callers hold Box directly. Signed-off-by: Anatol Belski --- block/src/disk_file.rs | 83 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 82 insertions(+), 1 deletion(-) diff --git a/block/src/disk_file.rs b/block/src/disk_file.rs index 9e057bd05b..fea7243abb 100644 --- a/block/src/disk_file.rs +++ b/block/src/disk_file.rs @@ -34,8 +34,10 @@ //! `&mut self`. Errors are returned as [`BlockResult`]. use std::fmt::Debug; +use std::io; -use crate::async_io::{AsyncIo, BorrowedDiskFd}; +use crate::async_io::{self, AsyncIo, BorrowedDiskFd}; +use crate::error::{BlockError, BlockErrorKind}; use crate::{BlockResult, DiskTopology}; /// Reported capacity of a disk image. @@ -156,3 +158,82 @@ pub trait AsyncFullDiskFile: FullDiskFile + AsyncDiskFile {} /// Blanket implementation: any type implementing both [`FullDiskFile`] /// and [`AsyncDiskFile`] automatically satisfies [`AsyncFullDiskFile`]. impl AsyncFullDiskFile for T {} + +/// A disk backend that dispatches to either the existing [`async_io::DiskFile`] +/// trait or the next-generation [`AsyncFullDiskFile`] trait. 
+pub enum DiskBackend { + /// Existing disk file backend (raw, vhd, vhdx, etc.). + Legacy(Box), + /// Next-generation disk file backend (qcow2, and more formats as they migrate). + Next(Box), +} + +impl DiskBackend { + pub fn logical_size(&mut self) -> BlockResult { + match self { + Self::Legacy(d) => d + .logical_size() + .map_err(|e| BlockError::new(BlockErrorKind::Io, io::Error::other(e))), + Self::Next(d) => d.logical_size(), + } + } + + pub fn physical_size(&mut self) -> BlockResult { + match self { + Self::Legacy(d) => d + .physical_size() + .map_err(|e| BlockError::new(BlockErrorKind::Io, io::Error::other(e))), + Self::Next(d) => d.physical_size(), + } + } + + pub fn topology(&mut self) -> DiskTopology { + match self { + Self::Legacy(d) => d.topology(), + Self::Next(d) => d.topology(), + } + } + + pub fn supports_sparse_operations(&self) -> bool { + match self { + Self::Legacy(d) => d.supports_sparse_operations(), + Self::Next(d) => d.supports_sparse_operations(), + } + } + + pub fn supports_zero_flag(&self) -> bool { + match self { + Self::Legacy(d) => d.supports_zero_flag(), + Self::Next(d) => d.supports_zero_flag(), + } + } + + pub fn fd(&mut self) -> BorrowedDiskFd<'_> { + match self { + Self::Legacy(d) => d.fd(), + Self::Next(d) => d.fd(), + } + } + + pub fn new_async_io(&self, ring_depth: u32) -> BlockResult> { + match self { + Self::Legacy(d) => d + .new_async_io(ring_depth) + .map_err(|e| BlockError::new(BlockErrorKind::Io, io::Error::other(e))), + Self::Next(d) => d.new_async_io(ring_depth), + } + } + + pub fn resize(&mut self, new_size: u64) -> BlockResult<()> { + match self { + Self::Legacy(d) => d.resize(new_size).map_err(|e| match e { + async_io::DiskFileError::Unsupported => BlockError::new( + BlockErrorKind::UnsupportedFeature, + io::Error::other("resize not supported"), + ), + _ => BlockError::new(BlockErrorKind::Io, io::Error::other(e)), + }), + Self::Next(d) => d.resize(new_size), + } + } +} From b4dad66d35f13d8f398c4a4350ca85a4f9000e5d Mon 
Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 12 Mar 2026 23:51:58 +0100 Subject: [PATCH 234/742] virtio-devices: vmm: fuzz: Switch to DiskBackend Change Block to hold DiskBackend instead of Box. In device_manager, existing formats (raw, vhd, vhdx) are wrapped in DiskBackend::Legacy while QcowDiskSync uses DiskBackend::Next. The fuzz target is updated accordingly. The Error::DiskResize variant now carries BlockError instead of DiskFileError, matching the BlockResult return type of DiskBackend::resize(). Signed-off-by: Anatol Belski --- fuzz/fuzz_targets/block.rs | 3 ++- virtio-devices/src/block.rs | 10 ++++++---- vmm/src/device_manager.rs | 27 ++++++++++++++++----------- 3 files changed, 24 insertions(+), 16 deletions(-) diff --git a/fuzz/fuzz_targets/block.rs b/fuzz/fuzz_targets/block.rs index 952011b55b..35d59c9850 100644 --- a/fuzz/fuzz_targets/block.rs +++ b/fuzz/fuzz_targets/block.rs @@ -16,6 +16,7 @@ use std::sync::Arc; use std::{ffi, io}; use block::async_io::DiskFile; +use block::disk_file::DiskBackend; use block::fcntl::LockGranularityChoice; use block::raw_sync::RawFileDiskSync; use libfuzzer_sys::{fuzz_target, Corpus}; @@ -56,7 +57,7 @@ fuzz_target!(|bytes: &[u8]| -> Corpus { let queue_affinity = BTreeMap::new(); let mut block = Block::new( "tmp".to_owned(), - qcow_disk, + DiskBackend::Legacy(qcow_disk), PathBuf::from(""), false, false, diff --git a/virtio-devices/src/block.rs b/virtio-devices/src/block.rs index 2adbff74f8..edd1327f45 100644 --- a/virtio-devices/src/block.rs +++ b/virtio-devices/src/block.rs @@ -18,7 +18,9 @@ use std::sync::{Arc, Barrier}; use std::{io, result}; use anyhow::anyhow; -use block::async_io::{AsyncIo, AsyncIoError, DiskFile, DiskFileError}; +use block::async_io::{AsyncIo, AsyncIoError}; +use block::disk_file::DiskBackend; +use block::error::BlockError; use block::fcntl::{LockError, LockGranularity, LockGranularityChoice, LockType, get_lock_state}; use block::{ ExecuteAsync, ExecuteError, Request, RequestType, 
VirtioBlockConfig, build_serial, fcntl, @@ -104,7 +106,7 @@ pub enum Error { #[error("Failed signal config interrupt")] ConfigChange(#[source] io::Error), #[error("Disk resize failed")] - DiskResize(#[source] DiskFileError), + DiskResize(#[source] BlockError), } pub type Result = result::Result; @@ -697,7 +699,7 @@ impl EpollHelperHandler for BlockEpollHandler { pub struct Block { common: VirtioCommon, id: String, - disk_image: Box, + disk_image: DiskBackend, disk_path: PathBuf, disk_nsectors: Arc, config: VirtioBlockConfig, @@ -727,7 +729,7 @@ impl Block { #[allow(clippy::too_many_arguments)] pub fn new( id: String, - mut disk_image: Box, + mut disk_image: DiskBackend, disk_path: PathBuf, read_only: bool, iommu: bool, diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 958d3086b1..51a5e476a7 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -33,6 +33,7 @@ use arch::layout::{APIC_START, IOAPIC_SIZE, IOAPIC_START}; use arch::{DeviceType, MmioDeviceInfo}; use arch::{NumaNodes, layout}; use block::async_io::DiskFile; +use block::disk_file::DiskBackend; use block::error::BlockError; use block::fixed_vhd_sync::FixedVhdDiskSync; use block::qcow_sync::QcowDiskSync; @@ -2721,17 +2722,17 @@ impl DeviceManager { unreachable!("Checked in if statement above"); #[cfg(feature = "io_uring")] { - Box::new( + DiskBackend::Legacy(Box::new( FixedVhdDiskAsync::new(file) .map_err(DeviceManagerError::CreateFixedVhdDiskAsync)?, - ) as Box + ) as Box) } } else { info!("Using synchronous fixed VHD disk file"); - Box::new( + DiskBackend::Legacy(Box::new( FixedVhdDiskSync::new(file) .map_err(DeviceManagerError::CreateFixedVhdDiskSync)?, - ) as Box + ) as Box) } } ImageType::Raw => { @@ -2755,19 +2756,23 @@ impl DeviceManager { unreachable!("Checked in if statement above"); #[cfg(feature = "io_uring")] { - Box::new(RawFileDisk::new(file)) as Box + DiskBackend::Legacy( + Box::new(RawFileDisk::new(file)) as Box + ) } } else if !disk_cfg.disable_aio 
&& self.aio_is_supported() { info!("Using asynchronous RAW disk file (aio)"); - Box::new(RawFileDiskAio::new(file)) as Box + DiskBackend::Legacy(Box::new(RawFileDiskAio::new(file)) as Box) } else { info!("Using synchronous RAW disk file"); - Box::new(RawFileDiskSync::new(file)) as Box + DiskBackend::Legacy( + Box::new(RawFileDiskSync::new(file)) as Box + ) } } ImageType::Qcow2 => { info!("Using synchronous QCOW2 disk file"); - Box::new( + DiskBackend::Next(Box::new( QcowDiskSync::new( file, disk_cfg.direct, @@ -2779,14 +2784,14 @@ impl DeviceManager { None => e, }) .map_err(DeviceManagerError::CreateQcowDiskSync)?, - ) as Box + )) } ImageType::Vhdx => { info!("Using synchronous VHDX disk file"); - Box::new( + DiskBackend::Legacy(Box::new( VhdxDiskSync::new(file) .map_err(DeviceManagerError::CreateFixedVhdxDiskSync)?, - ) as Box + ) as Box) } ImageType::Unknown => unreachable!(), }; From 0d062962acc3dde1b61f30ddd50fc62c46e5039c Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 12 Mar 2026 23:56:41 +0100 Subject: [PATCH 235/742] block: qcow: Remove async_io::DiskFile impl from QcowDiskSync QcowDiskSync now exclusively uses disk_file::DiskFile and disk_file::AsyncDiskFile. The old async_io::DiskFile impl is removed along with its unused imports (DiskFile, DiskFileError, DiskFileResult). Tests are updated to import the new traits. 
Signed-off-by: Anatol Belski --- block/src/qcow_sync.rs | 50 ++---------------------------------------- 1 file changed, 2 insertions(+), 48 deletions(-) diff --git a/block/src/qcow_sync.rs b/block/src/qcow_sync.rs index e539754fcb..cce4c192d7 100644 --- a/block/src/qcow_sync.rs +++ b/block/src/qcow_sync.rs @@ -12,9 +12,7 @@ use std::{fmt, io, ptr, slice}; use vmm_sys_util::eventfd::EventFd; use vmm_sys_util::write_zeroes::{PunchHole, WriteZeroesAt}; -use crate::async_io::{ - AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFile, DiskFileError, DiskFileResult, -}; +use crate::async_io::{AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFileError}; use crate::disk_file; use crate::error::{BlockError, BlockErrorKind, BlockResult, ErrorOp}; use crate::qcow::metadata::{ @@ -228,50 +226,6 @@ impl QcowDiskSync { } } -impl DiskFile for QcowDiskSync { - fn logical_size(&mut self) -> DiskFileResult { - Ok(self.metadata.virtual_size()) - } - - fn physical_size(&mut self) -> DiskFileResult { - self.data_raw_file - .physical_size() - .map_err(DiskFileError::Size) - } - - fn new_async_io(&self, _ring_depth: u32) -> DiskFileResult> { - Ok(Box::new(QcowSync::new( - Arc::clone(&self.metadata), - self.data_raw_file.clone(), - self.backing_file.as_ref().map(Arc::clone), - self.sparse, - )) as Box) - } - - fn resize(&mut self, size: u64) -> DiskFileResult<()> { - if self.backing_file.is_some() { - return Err(DiskFileError::ResizeError(io::Error::other( - "resize not supported with backing file", - ))); - } - self.metadata - .resize(size) - .map_err(DiskFileError::ResizeError) - } - - fn supports_sparse_operations(&self) -> bool { - true - } - - fn supports_zero_flag(&self) -> bool { - true - } - - fn fd(&mut self) -> BorrowedDiskFd<'_> { - BorrowedDiskFd::new(self.data_raw_file.as_raw_fd()) - } -} - impl Drop for QcowDiskSync { fn drop(&mut self) { self.metadata.shutdown(); @@ -734,7 +688,7 @@ mod unit_tests { use vmm_sys_util::tempfile::TempFile; use super::*; - use 
crate::async_io::DiskFile; + use crate::disk_file::{AsyncDiskFile, DiskSize, Resizable}; + use crate::qcow::{BackingFileConfig, ImageType, QcowFile, RawFile}; fn create_disk_with_data( From 5724a0189c86417660d11a9c74c32eebd786773d Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Thu, 19 Mar 2026 16:26:59 +0000 Subject: [PATCH 236/742] tests: Make the IOMMU segment test case more flexible Linux kernel's behavior changes over time. The grouping can be different across different versions and different architectures. We only care that the exact SBDF exists somewhere. It doesn't matter which group it is under. Change the check so that this test case is no longer tied to the grouping behavior of a particular kernel. Signed-off-by: Wei Liu --- cloud-hypervisor/tests/integration.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 8f6dec37a9..3339c99d0d 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -3460,12 +3460,11 @@ mod common_parallel { .does_device_vendor_pair_match("0x1057", "0x1af4") .unwrap_or_default() ); - assert_eq!( + assert!( guest - .ssh_command("ls /sys/kernel/iommu_groups/1/devices") + .ssh_command("ls /sys/kernel/iommu_groups/*/devices") .unwrap() - .trim(), - "0001:00:01.0" + .contains("0001:00:01.0") ); }); From 6e6127bbda8c4b61e1278d509e7cb503d3abc9ee Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Thu, 19 Mar 2026 16:43:50 +0000 Subject: [PATCH 237/742] tests: Make test_vdpa_block more flexible We only care that the device exists under an IOMMU group but not which one.
Signed-off-by: Wei Liu --- cloud-hypervisor/tests/integration.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 3339c99d0d..a0f3333761 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -9921,12 +9921,11 @@ mod common_parallel { .does_device_vendor_pair_match("0x1057", "0x1af4") .unwrap_or_default() ); - assert_eq!( + assert!( guest - .ssh_command("ls /sys/kernel/iommu_groups/1/devices") + .ssh_command("ls /sys/kernel/iommu_groups/*/devices") .unwrap() - .trim(), - "0001:00:01.0" + .contains("0001:00:01.0") ); // Check both if /dev/vdd exists and if the block size is 128M. From 43642d8df8d3edefd09f1bbed89693645ffc225d Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Thu, 19 Mar 2026 17:01:32 +0000 Subject: [PATCH 238/742] tests: Make test_virtio_iommu more flexible We only verify devices are under some group but not which one. With the change, the acpi variable is only needed for aarch64. Add an underscore prefix to avoid a compilation warning on x86_64. Signed-off-by: Wei Liu --- cloud-hypervisor/tests/integration.rs | 38 ++++++++++----------------- 1 file changed, 14 insertions(+), 24 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index a0f3333761..3e2a33da70 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -2128,7 +2128,7 @@ fn vm_state(api_socket: &str) -> String { // The last interesting part of this test is that it exercises the network // interface attached to the virtual IOMMU since this is the one used to // send all commands through SSH. -fn _test_virtio_iommu(acpi: bool) { +fn _test_virtio_iommu(_acpi: bool /* not needed on x86_64 */) { // Virtio-iommu support is ready in recent kernel (v5.14). But the kernel in // Focal image is still old.
// So if ACPI is enabled on AArch64, we use a modified Focal image in which @@ -2143,7 +2143,7 @@ fn _test_virtio_iommu(acpi: bool) { #[cfg(target_arch = "x86_64")] let kernel_path = direct_kernel_boot_path(); #[cfg(target_arch = "aarch64")] - let kernel_path = if acpi { + let kernel_path = if _acpi { edk2_path() } else { direct_kernel_boot_path() @@ -2187,39 +2187,29 @@ fn _test_virtio_iommu(acpi: bool) { // All devices on the PCI bus will be attached to the virtual IOMMU, except the // virtio-iommu device itself. So these devices will all be added to IOMMU groups, // and appear under folder '/sys/kernel/iommu_groups/'. - // The result is, in the case of FDT, IOMMU group '0' contains "0000:00:01.0" - // which is the console. The first disk "0000:00:02.0" is in group '1'. - // While on ACPI, console device is not attached to IOMMU. So the IOMMU group '0' - // contains "0000:00:02.0" which is the first disk. // - // Verify the iommu group of the first disk. - let iommu_group = if acpi { 0 } else { 2 }; - assert_eq!( + // Verify the first disk is in an iommu group. + assert!( guest - .ssh_command(format!("ls /sys/kernel/iommu_groups/{iommu_group}/devices").as_str()) + .ssh_command("ls /sys/kernel/iommu_groups/*/devices") .unwrap() - .trim(), - "0000:00:02.0" + .contains("0000:00:02.0") ); - // Verify the iommu group of the second disk. - let iommu_group = if acpi { 1 } else { 3 }; - assert_eq!( + // Verify the second disk is in an iommu group. + assert!( guest - .ssh_command(format!("ls /sys/kernel/iommu_groups/{iommu_group}/devices").as_str()) + .ssh_command("ls /sys/kernel/iommu_groups/*/devices") .unwrap() - .trim(), - "0000:00:03.0" + .contains("0000:00:03.0") ); - // Verify the iommu group of the network card. - let iommu_group = if acpi { 2 } else { 4 }; - assert_eq!( + // Verify the network card is in an iommu group. 
+ assert!( guest - .ssh_command(format!("ls /sys/kernel/iommu_groups/{iommu_group}/devices").as_str()) + .ssh_command("ls /sys/kernel/iommu_groups/*/devices") .unwrap() - .trim(), - "0000:00:04.0" + .contains("0000:00:04.0") ); }); From 0c3249b14f718f1b3aae10a7ae3618930e76c1e5 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Thu, 19 Mar 2026 23:30:42 -0700 Subject: [PATCH 239/742] tests: split integration helpers into common modules Move shared integration test logic out of tests/integration.rs. Add tests/common/{mod.rs,tests_wrappers.rs,utils.rs} and migrate API, VM lifecycle, disk/net, and utility helpers. Update integration.rs to import common modules and keep test entrypoints thin. Benefits: Reduces integration.rs size and duplication Groups reusable helpers by role Improves readability and future maintenance Fixes: https://github.com/cloud-hypervisor/cloud-hypervisor/issues/7808 Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/common/mod.rs | 6 + .../tests/common/tests_wrappers.rs | 2059 +++++++++++ cloud-hypervisor/tests/common/utils.rs | 1045 ++++++ cloud-hypervisor/tests/integration.rs | 3074 +---------------- 4 files changed, 3117 insertions(+), 3067 deletions(-) create mode 100644 cloud-hypervisor/tests/common/mod.rs create mode 100644 cloud-hypervisor/tests/common/tests_wrappers.rs create mode 100644 cloud-hypervisor/tests/common/utils.rs diff --git a/cloud-hypervisor/tests/common/mod.rs b/cloud-hypervisor/tests/common/mod.rs new file mode 100644 index 0000000000..da58f907e8 --- /dev/null +++ b/cloud-hypervisor/tests/common/mod.rs @@ -0,0 +1,6 @@ +// Copyright 2025 The Cloud Hypervisor Authors. All rights reserved. 
+// +// SPDX-License-Identifier: Apache-2.0 + +pub(crate) mod tests_wrappers; +pub(crate) mod utils; diff --git a/cloud-hypervisor/tests/common/tests_wrappers.rs b/cloud-hypervisor/tests/common/tests_wrappers.rs new file mode 100644 index 0000000000..afe54ed5ef --- /dev/null +++ b/cloud-hypervisor/tests/common/tests_wrappers.rs @@ -0,0 +1,2059 @@ +// Copyright 2025 The Cloud Hypervisor Authors. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 +use std::ffi::CStr; +use std::fs::{self, OpenOptions}; +use std::io::{Seek, SeekFrom, Write}; +use std::path::{Path, PathBuf}; +use std::string::String; +use std::sync::mpsc; +use std::thread; + +use block::ImageType; +use net_util::MacAddr; +use test_infra::*; +use vmm_sys_util::tempdir::TempDir; +use vmm_sys_util::tempfile::TempFile; + +use crate::common::utils::{TargetApi, *}; + +// Start cloud-hypervisor with no VM parameters, only the API server running. +// From the API: Create a VM, boot it and check that it looks as expected. 
+pub(crate) fn _test_api_create_boot(target_api: &TargetApi, guest: &Guest) { + let mut child = GuestCommand::new(guest) + .args(target_api.guest_args()) + .capture_output() + .spawn() + .unwrap(); + + thread::sleep(std::time::Duration::new(1, 0)); + + // Verify API server is running + assert!(target_api.remote_command("ping", None)); + + // Create the VM first + let request_body = guest.api_create_body(); + + let temp_config_path = guest.tmp_dir.as_path().join("config"); + std::fs::write(&temp_config_path, request_body).unwrap(); + let create_config = temp_config_path.as_os_str().to_str().unwrap(); + + assert!(target_api.remote_command("create", Some(create_config),)); + + // Then boot it + assert!(target_api.remote_command("boot", None)); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + // Check that the VM booted as expected + guest.validate_cpu_count(None); + guest.validate_memory(None); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +// Start cloud-hypervisor with no VM parameters, only the API server running. 
+// From the API: Create a VM, boot it and check it can be shutdown and then +// booted again +pub(crate) fn _test_api_shutdown(target_api: &TargetApi, guest: &Guest) { + let mut child = GuestCommand::new(guest) + .args(target_api.guest_args()) + .capture_output() + .spawn() + .unwrap(); + + thread::sleep(std::time::Duration::new(1, 0)); + + // Verify API server is running + assert!(target_api.remote_command("ping", None)); + + // Create the VM first + let request_body = guest.api_create_body(); + + let temp_config_path = guest.tmp_dir.as_path().join("config"); + std::fs::write(&temp_config_path, request_body).unwrap(); + let create_config = temp_config_path.as_os_str().to_str().unwrap(); + + let r = std::panic::catch_unwind(|| { + assert!(target_api.remote_command("create", Some(create_config))); + + // Then boot it + assert!(target_api.remote_command("boot", None)); + + guest.wait_vm_boot().unwrap(); + + // Check that the VM booted as expected + guest.validate_cpu_count(None); + guest.validate_memory(None); + + // Sync and shutdown without powering off to prevent filesystem + // corruption. + guest.ssh_command("sync").unwrap(); + guest.ssh_command("sudo shutdown -H now").unwrap(); + + // Wait for the guest to be fully shutdown + thread::sleep(std::time::Duration::new(20, 0)); + + // Then shut it down + assert!(target_api.remote_command("shutdown", None)); + + // Then boot it again + assert!(target_api.remote_command("boot", None)); + + guest.wait_vm_boot().unwrap(); + + // Check that the VM booted as expected + guest.validate_cpu_count(None); + guest.validate_memory(None); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +// Start cloud-hypervisor with no VM parameters, only the API server running. +// From the API: Create a VM, boot it and check it can be deleted and then recreated +// booted again. 
+pub(crate) fn _test_api_delete(target_api: &TargetApi, guest: &Guest) { + let mut child = GuestCommand::new(guest) + .args(target_api.guest_args()) + .capture_output() + .spawn() + .unwrap(); + + thread::sleep(std::time::Duration::new(1, 0)); + + // Verify API server is running + assert!(target_api.remote_command("ping", None)); + + // Create the VM first + let request_body = guest.api_create_body(); + + let temp_config_path = guest.tmp_dir.as_path().join("config"); + std::fs::write(&temp_config_path, request_body).unwrap(); + let create_config = temp_config_path.as_os_str().to_str().unwrap(); + + let r = std::panic::catch_unwind(|| { + assert!(target_api.remote_command("create", Some(create_config))); + + // Then boot it + assert!(target_api.remote_command("boot", None)); + + guest.wait_vm_boot().unwrap(); + + // Check that the VM booted as expected + guest.validate_cpu_count(None); + guest.validate_memory(None); + + // Sync and shutdown without powering off to prevent filesystem + // corruption. + guest.ssh_command("sync").unwrap(); + guest.ssh_command("sudo shutdown -H now").unwrap(); + + // Wait for the guest to be fully shutdown + thread::sleep(std::time::Duration::new(20, 0)); + + // Then delete it + assert!(target_api.remote_command("delete", None)); + + assert!(target_api.remote_command("create", Some(create_config))); + + // Then boot it again + assert!(target_api.remote_command("boot", None)); + + guest.wait_vm_boot().unwrap(); + + // Check that the VM booted as expected + guest.validate_cpu_count(None); + guest.validate_memory(None); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +// Start cloud-hypervisor with no VM parameters, only the API server running. +// From the API: Create a VM, boot it and check that it looks as expected. +// Then we pause the VM, check that it's no longer available. +// Finally we resume the VM and check that it's available. 
+pub(crate) fn _test_api_pause_resume(target_api: &TargetApi, guest: &Guest) { + let mut child = GuestCommand::new(guest) + .args(target_api.guest_args()) + .capture_output() + .spawn() + .unwrap(); + + thread::sleep(std::time::Duration::new(1, 0)); + + // Verify API server is running + assert!(target_api.remote_command("ping", None)); + + // Create the VM first + let request_body = guest.api_create_body(); + + let temp_config_path = guest.tmp_dir.as_path().join("config"); + std::fs::write(&temp_config_path, request_body).unwrap(); + let create_config = temp_config_path.as_os_str().to_str().unwrap(); + + assert!(target_api.remote_command("create", Some(create_config))); + + // Then boot it + assert!(target_api.remote_command("boot", None)); + thread::sleep(std::time::Duration::new(20, 0)); + + let r = std::panic::catch_unwind(|| { + // Check that the VM booted as expected + guest.validate_cpu_count(None); + guest.validate_memory(None); + + // We now pause the VM + assert!(target_api.remote_command("pause", None)); + + // Check pausing again fails + assert!(!target_api.remote_command("pause", None)); + + thread::sleep(std::time::Duration::new(2, 0)); + + // SSH into the VM should fail + ssh_command_ip( + "grep -c processor /proc/cpuinfo", + &guest.network.guest_ip0, + 2, + 5, + ) + .unwrap_err(); + + // Resume the VM + assert!(target_api.remote_command("resume", None)); + + // Check resuming again fails + assert!(!target_api.remote_command("resume", None)); + + thread::sleep(std::time::Duration::new(2, 0)); + + // Now we should be able to SSH back in and get the right number of CPUs + guest.validate_cpu_count(None); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +pub(crate) fn _test_pty_interaction(pty_path: PathBuf) { + let mut cf = std::fs::OpenOptions::new() + .write(true) + .read(true) + .open(pty_path) + .unwrap(); + + // Some dumb sleeps but we don't want to write + // before the 
console is up and we don't want + // to try and write the next line before the + // login process is ready. + thread::sleep(std::time::Duration::new(5, 0)); + assert_eq!(cf.write(b"cloud\n").unwrap(), 6); + thread::sleep(std::time::Duration::new(2, 0)); + assert_eq!(cf.write(b"cloud123\n").unwrap(), 9); + thread::sleep(std::time::Duration::new(2, 0)); + assert_eq!(cf.write(b"echo test_pty_console\n").unwrap(), 22); + thread::sleep(std::time::Duration::new(2, 0)); + + // read pty and ensure they have a login shell + // some fairly hacky workarounds to avoid looping + // forever in case the channel is blocked getting output + let ptyc = pty_read(cf); + let mut empty = 0; + let mut prev = String::new(); + loop { + thread::sleep(std::time::Duration::new(2, 0)); + match ptyc.try_recv() { + Ok(line) => { + empty = 0; + prev = prev + &line; + if prev.contains("test_pty_console") { + break; + } + } + Err(mpsc::TryRecvError::Empty) => { + empty += 1; + assert!(empty <= 5, "No login on pty"); + } + _ => { + panic!("No login on pty") + } + } + } +} + +pub(crate) fn test_cpu_topology( + threads_per_core: u8, + cores_per_package: u8, + packages: u8, + use_fw: bool, +) { + let disk_config = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)); + let total_vcpus = threads_per_core * cores_per_package * packages; + let direct_kernel_boot_path = direct_kernel_boot_path(); + let mut kernel_path = direct_kernel_boot_path.to_str().unwrap(); + let fw_path = fw_path(FwType::RustHypervisorFirmware); + if use_fw { + kernel_path = fw_path.as_str(); + } + + let mut child = GuestCommand::new(&guest) + .args([ + "--cpus", + &format!( + "boot={total_vcpus},topology={threads_per_core}:{cores_per_package}:1:{packages}" + ), + ]) + .default_memory() + .args(["--kernel", kernel_path]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .default_disks() + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let r = 
std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + assert_eq!( + guest.get_cpu_count().unwrap_or_default(), + u32::from(total_vcpus) + ); + assert_eq!( + guest + .ssh_command("lscpu | grep \"per core\" | cut -f 2 -d \":\" | sed \"s# *##\"") + .unwrap() + .trim() + .parse::() + .unwrap_or(0), + threads_per_core + ); + + assert_eq!( + guest + .ssh_command("lscpu | grep \"per socket\" | cut -f 2 -d \":\" | sed \"s# *##\"") + .unwrap() + .trim() + .parse::() + .unwrap_or(0), + cores_per_package + ); + + assert_eq!( + guest + .ssh_command("lscpu | grep \"Socket\" | cut -f 2 -d \":\" | sed \"s# *##\"") + .unwrap() + .trim() + .parse::() + .unwrap_or(0), + packages + ); + + #[cfg(target_arch = "x86_64")] + { + let mut cpu_id = 0; + for package_id in 0..packages { + for core_id in 0..cores_per_package { + for _ in 0..threads_per_core { + assert_eq!( + guest + .ssh_command(&format!("cat /sys/devices/system/cpu/cpu{cpu_id}/topology/physical_package_id")) + .unwrap() + .trim() + .parse::() + .unwrap_or(0), + package_id + ); + + assert_eq!( + guest + .ssh_command(&format!( + "cat /sys/devices/system/cpu/cpu{cpu_id}/topology/core_id" + )) + .unwrap() + .trim() + .parse::() + .unwrap_or(0), + core_id + ); + + cpu_id += 1; + } + } + } + } + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +#[allow(unused_variables)] +pub(crate) fn _test_guest_numa_nodes(acpi: bool) { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)); + let api_socket = temp_api_path(&guest.tmp_dir); + #[cfg(target_arch = "x86_64")] + let kernel_path = direct_kernel_boot_path(); + #[cfg(target_arch = "aarch64")] + let kernel_path = if acpi { + edk2_path() + } else { + direct_kernel_boot_path() + }; + + let mut child = GuestCommand::new(&guest) + .args(["--cpus", "boot=6,max=12"]) + .args(["--memory", "size=0,hotplug_method=virtio-mem"]) + .args([ + 
"--memory-zone", + "id=mem0,size=1G,hotplug_size=3G", + "id=mem1,size=2G,hotplug_size=3G", + "id=mem2,size=3G,hotplug_size=3G", + ]) + .args([ + "--numa", + "guest_numa_id=0,cpus=[0-2,9],distances=[1@15,2@20],memory_zones=mem0", + "guest_numa_id=1,cpus=[3-4,6-8],distances=[0@20,2@25],memory_zones=mem1", + "guest_numa_id=2,cpus=[5,10-11],distances=[0@25,1@30],memory_zones=mem2", + ]) + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .args(["--api-socket", &api_socket]) + .capture_output() + .default_disks() + .default_net() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + guest.check_numa_common( + Some(&[960_000, 1_920_000, 2_880_000]), + Some(&[&[0, 1, 2], &[3, 4], &[5]]), + Some(&["10 15 20", "20 10 25", "25 30 10"]), + ); + + // AArch64 currently does not support hotplug, and therefore we only + // test hotplug-related function on x86_64 here. + #[cfg(target_arch = "x86_64")] + { + guest.enable_memory_hotplug(); + + // Resize every memory zone and check each associated NUMA node + // has been assigned the right amount of memory. + resize_zone_command(&api_socket, "mem0", "4G"); + resize_zone_command(&api_socket, "mem1", "4G"); + resize_zone_command(&api_socket, "mem2", "4G"); + // Resize to the maximum amount of CPUs and check each NUMA + // node has been assigned the right CPUs set. 
+ resize_command(&api_socket, Some(12), None, None, None); + thread::sleep(std::time::Duration::new(5, 0)); + + guest.check_numa_common( + Some(&[3_840_000, 3_840_000, 3_840_000]), + Some(&[&[0, 1, 2, 9], &[3, 4, 6, 7, 8], &[5, 10, 11]]), + None, + ); + } + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +#[allow(unused_variables)] +pub(crate) fn _test_power_button(guest: &Guest) { + let mut cmd = GuestCommand::new(guest); + let api_socket = temp_api_path(&guest.tmp_dir); + + cmd.default_cpus() + .default_memory() + .default_kernel_cmdline() + .capture_output() + .default_disks() + .default_net() + .args(["--api-socket", &api_socket]); + + let child = cmd.spawn().unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + assert!(remote_command(&api_socket, "power-button", None)); + }); + + let output = child.wait_with_output().unwrap(); + assert!(output.status.success()); + handle_child_output(r, &output); +} + +pub(crate) fn test_vhost_user_net( + tap: Option<&str>, + num_queues: usize, + prepare_daemon: &PrepareNetDaemon, + generate_host_mac: bool, + client_mode_daemon: bool, +) { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)); + let api_socket = temp_api_path(&guest.tmp_dir); + + let kernel_path = direct_kernel_boot_path(); + + let host_mac = if generate_host_mac { + Some(MacAddr::local_random()) + } else { + None + }; + + let mtu = Some(3000); + + let (mut daemon_command, vunet_socket_path) = prepare_daemon( + &guest.tmp_dir, + &guest.network.host_ip0, + tap, + mtu, + num_queues, + client_mode_daemon, + ); + + let net_params = format!( + "vhost_user=true,mac={},socket={},num_queues={},queue_size=1024{},vhost_mode={},mtu=3000", + guest.network.guest_mac0, + vunet_socket_path, + num_queues, + if let Some(host_mac) = host_mac { + format!(",host_mac={host_mac}") + } else { + String::new() + 
}, + if client_mode_daemon { + "server" + } else { + "client" + }, + ); + + let mut ch_command = GuestCommand::new(&guest); + ch_command + .args(["--cpus", format!("boot={}", num_queues / 2).as_str()]) + .args(["--memory", "size=512M,hotplug_size=2048M,shared=on"]) + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .default_disks() + .args(["--net", net_params.as_str()]) + .args(["--api-socket", &api_socket]) + .capture_output(); + + let mut daemon_child: std::process::Child; + let mut child: std::process::Child; + + if client_mode_daemon { + child = ch_command.spawn().unwrap(); + // Make sure the VMM is waiting for the backend to connect + thread::sleep(std::time::Duration::new(10, 0)); + daemon_child = daemon_command.spawn().unwrap(); + } else { + daemon_child = daemon_command.spawn().unwrap(); + // Make sure the backend is waiting for the VMM to connect + thread::sleep(std::time::Duration::new(10, 0)); + child = ch_command.spawn().unwrap(); + } + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + if let Some(tap_name) = tap { + let tap_count = exec_host_command_output(&format!("ip link | grep -c {tap_name}")); + assert_eq!(String::from_utf8_lossy(&tap_count.stdout).trim(), "1"); + } + + if let Some(host_mac) = tap { + let mac_count = exec_host_command_output(&format!("ip link | grep -c {host_mac}")); + assert_eq!(String::from_utf8_lossy(&mac_count.stdout).trim(), "1"); + } + + #[cfg(target_arch = "aarch64")] + let iface = "enp0s4"; + #[cfg(target_arch = "x86_64")] + let iface = "ens4"; + + assert_eq!( + guest + .ssh_command(format!("cat /sys/class/net/{iface}/mtu").as_str()) + .unwrap() + .trim(), + "3000" + ); + + // 1 network interface + default localhost ==> 2 interfaces + // It's important to note that this test is fully exercising the + // vhost-user-net implementation and the associated backend since + // it does not define any --net network interface. 
That means all + // the ssh communication in that test happens through the network + // interface backed by vhost-user-net. + assert_eq!( + guest + .ssh_command("ip -o link | wc -l") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 2 + ); + + // The following pci devices will appear on guest with PCI-MSI + // interrupt vectors assigned. + // 1 virtio-console with 3 vectors: config, Rx, Tx + // 1 virtio-blk with 2 vectors: config, Request + // 1 virtio-blk with 2 vectors: config, Request + // 1 virtio-rng with 2 vectors: config, Request + // Since virtio-net has 2 queue pairs, its vectors is as follows: + // 1 virtio-net with 5 vectors: config, Rx (2), Tx (2) + // Based on the above, the total vectors should 14. + let grep_cmd = format!("grep -c {} /proc/interrupts", get_msi_interrupt_pattern()); + + assert_eq!( + guest + .ssh_command(&grep_cmd) + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 10 + (num_queues as u32) + ); + + // ACPI feature is needed. + #[cfg(target_arch = "x86_64")] + { + guest.enable_memory_hotplug(); + + // Add RAM to the VM + let desired_ram = 1024 << 20; + resize_command(&api_socket, None, Some(desired_ram), None, None); + + thread::sleep(std::time::Duration::new(10, 0)); + + // Here by simply checking the size (through ssh), we validate + // the connection is still working, which means vhost-user-net + // keeps working after the resize. 
+ assert!(guest.get_total_memory().unwrap_or_default() > 960_000); + } + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + thread::sleep(std::time::Duration::new(5, 0)); + let _ = daemon_child.kill(); + let _ = daemon_child.wait(); + + handle_child_output(r, &output); +} + +type PrepareBlkDaemon = dyn Fn(&TempDir, &str, usize, bool, bool) -> (std::process::Child, String); + +pub(crate) fn test_vhost_user_blk( + num_queues: usize, + readonly: bool, + direct: bool, + prepare_vhost_user_blk_daemon: Option<&PrepareBlkDaemon>, +) { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)); + let api_socket = temp_api_path(&guest.tmp_dir); + + let kernel_path = direct_kernel_boot_path(); + + let (blk_params, daemon_child) = { + let prepare_daemon = prepare_vhost_user_blk_daemon.unwrap(); + // Start the daemon + let (daemon_child, vubd_socket_path) = + prepare_daemon(&guest.tmp_dir, "blk.img", num_queues, readonly, direct); + + ( + format!( + "vhost_user=true,socket={vubd_socket_path},num_queues={num_queues},queue_size=128", + ), + Some(daemon_child), + ) + }; + + let mut child = GuestCommand::new(&guest) + .args(["--cpus", format!("boot={num_queues}").as_str()]) + .args(["--memory", "size=512M,hotplug_size=2048M,shared=on"]) + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .args([ + "--disk", + format!( + "path={}", + guest.disk_config.disk(DiskType::OperatingSystem).unwrap() + ) + .as_str(), + format!( + "path={}", + guest.disk_config.disk(DiskType::CloudInit).unwrap() + ) + .as_str(), + blk_params.as_str(), + ]) + .default_net() + .args(["--api-socket", &api_socket]) + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + // Check both if /dev/vdc exists and if the block size is 16M. 
+ assert_eq!( + guest + .ssh_command("lsblk | grep vdc | grep -c 16M") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 1 + ); + + // Check if this block is RO or RW. + assert_eq!( + guest + .ssh_command("lsblk | grep vdc | awk '{print $5}'") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + readonly as u32 + ); + + // Check if the number of queues in /sys/block/vdc/mq matches the + // expected num_queues. + assert_eq!( + guest + .ssh_command("ls -ll /sys/block/vdc/mq | grep ^d | wc -l") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + num_queues as u32 + ); + + // Mount the device + let mount_ro_rw_flag = if readonly { "ro,noload" } else { "rw" }; + guest.ssh_command("mkdir mount_image").unwrap(); + guest + .ssh_command( + format!("sudo mount -o {mount_ro_rw_flag} -t ext4 /dev/vdc mount_image/").as_str(), + ) + .unwrap(); + + // Check the content of the block device. The file "foo" should + // contain "bar". + assert_eq!( + guest.ssh_command("cat mount_image/foo").unwrap().trim(), + "bar" + ); + + // ACPI feature is needed. + #[cfg(target_arch = "x86_64")] + { + guest.enable_memory_hotplug(); + + // Add RAM to the VM + let desired_ram = 1024 << 20; + resize_command(&api_socket, None, Some(desired_ram), None, None); + + thread::sleep(std::time::Duration::new(10, 0)); + + assert!(guest.get_total_memory().unwrap_or_default() > 960_000); + + // Check again the content of the block device after the resize + // has been performed. 
+ assert_eq!( + guest.ssh_command("cat mount_image/foo").unwrap().trim(), + "bar" + ); + } + + // Unmount the device + guest.ssh_command("sudo umount /dev/vdc").unwrap(); + guest.ssh_command("rm -r mount_image").unwrap(); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + if let Some(mut daemon_child) = daemon_child { + thread::sleep(std::time::Duration::new(5, 0)); + let _ = daemon_child.kill(); + let _ = daemon_child.wait(); + } + + handle_child_output(r, &output); +} + +pub(crate) fn test_boot_from_vhost_user_blk( + num_queues: usize, + readonly: bool, + direct: bool, + prepare_vhost_user_blk_daemon: Option<&PrepareBlkDaemon>, +) { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)); + + let kernel_path = direct_kernel_boot_path(); + + let disk_path = guest.disk_config.disk(DiskType::OperatingSystem).unwrap(); + + let (blk_boot_params, daemon_child) = { + let prepare_daemon = prepare_vhost_user_blk_daemon.unwrap(); + // Start the daemon + let (daemon_child, vubd_socket_path) = prepare_daemon( + &guest.tmp_dir, + disk_path.as_str(), + num_queues, + readonly, + direct, + ); + + ( + format!( + "vhost_user=true,socket={vubd_socket_path},num_queues={num_queues},queue_size=128", + ), + Some(daemon_child), + ) + }; + + let mut child = GuestCommand::new(&guest) + .args(["--cpus", format!("boot={num_queues}").as_str()]) + .args(["--memory", "size=512M,shared=on"]) + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .args([ + "--disk", + blk_boot_params.as_str(), + format!( + "path={}", + guest.disk_config.disk(DiskType::CloudInit).unwrap() + ) + .as_str(), + ]) + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + // Just check the VM booted correctly. 
+ assert_eq!(guest.get_cpu_count().unwrap_or_default(), num_queues as u32); + assert!(guest.get_total_memory().unwrap_or_default() > 480_000); + }); + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + if let Some(mut daemon_child) = daemon_child { + thread::sleep(std::time::Duration::new(5, 0)); + let _ = daemon_child.kill(); + let _ = daemon_child.wait(); + } + + handle_child_output(r, &output); +} + +pub(crate) fn _test_virtio_fs( + prepare_daemon: &dyn Fn(&TempDir, &str) -> (std::process::Child, String), + hotplug: bool, + use_generic_vhost_user: bool, + pci_segment: Option, +) { + #[cfg(target_arch = "aarch64")] + let focal_image = if hotplug { + FOCAL_IMAGE_UPDATE_KERNEL_NAME.to_string() + } else { + FOCAL_IMAGE_NAME.to_string() + }; + #[cfg(target_arch = "x86_64")] + let focal_image = FOCAL_IMAGE_NAME.to_string(); + let disk_config = UbuntuDiskConfig::new(focal_image); + let guest = Guest::new(Box::new(disk_config)); + let api_socket = temp_api_path(&guest.tmp_dir); + + let mut workload_path = dirs::home_dir().unwrap(); + workload_path.push("workloads"); + + let mut shared_dir = workload_path; + shared_dir.push("shared_dir"); + + #[cfg(target_arch = "x86_64")] + let kernel_path = direct_kernel_boot_path(); + #[cfg(target_arch = "aarch64")] + let kernel_path = if hotplug { + edk2_path() + } else { + direct_kernel_boot_path() + }; + + let (mut daemon_child, virtiofsd_socket_path) = + prepare_daemon(&guest.tmp_dir, shared_dir.to_str().unwrap()); + + let mut guest_command = GuestCommand::new(&guest); + guest_command + .default_cpus() + .args(["--memory", "size=512M,hotplug_size=2048M,shared=on"]) + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .default_disks() + .default_net() + .args(["--api-socket", &api_socket]); + if pci_segment.is_some() { + guest_command.args([ + "--platform", + &format!("num_pci_segments={MAX_NUM_PCI_SEGMENTS}"), + ]); + } + + let fs_params = format!( + 
"socket={},id=myfs0,{}{}", + virtiofsd_socket_path, + if use_generic_vhost_user { + "queue_sizes=[1024,1024],virtio_id=26" + } else { + "tag=myfs,num_queues=1,queue_size=1024" + }, + if let Some(pci_segment) = pci_segment { + format!(",pci_segment={pci_segment}") + } else { + String::new() + } + ); + + if !hotplug { + guest_command.args([ + if use_generic_vhost_user { + "--generic-vhost-user" + } else { + "--fs" + }, + fs_params.as_str(), + ]); + } + + let mut child = guest_command.capture_output().spawn().unwrap(); + let add_arg = if use_generic_vhost_user { + "add-generic-vhost-user" + } else { + "add-fs" + }; + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + if hotplug { + // Add fs to the VM + let (cmd_success, cmd_output) = + remote_command_w_output(&api_socket, add_arg, Some(&fs_params)); + assert!(cmd_success); + + if let Some(pci_segment) = pci_segment { + assert!(String::from_utf8_lossy(&cmd_output).contains(&format!( + "{{\"id\":\"myfs0\",\"bdf\":\"{pci_segment:04x}:00:01.0\"}}" + ))); + } else { + assert!( + String::from_utf8_lossy(&cmd_output) + .contains("{\"id\":\"myfs0\",\"bdf\":\"0000:00:06.0\"}") + ); + } + + thread::sleep(std::time::Duration::new(10, 0)); + } + + // Mount shared directory through virtio_fs filesystem + guest + .ssh_command("mkdir -p mount_dir && sudo mount -t virtiofs myfs mount_dir/") + .unwrap(); + + // Check file1 exists and its content is "foo" + assert_eq!( + guest.ssh_command("cat mount_dir/file1").unwrap().trim(), + "foo" + ); + // Check file2 does not exist + guest + .ssh_command("[ ! -f 'mount_dir/file2' ] || true") + .unwrap(); + + // Check file3 exists and its content is "bar" + assert_eq!( + guest.ssh_command("cat mount_dir/file3").unwrap().trim(), + "bar" + ); + + // ACPI feature is needed. 
+ #[cfg(target_arch = "x86_64")] + { + guest.enable_memory_hotplug(); + + // Add RAM to the VM + let desired_ram = 1024 << 20; + resize_command(&api_socket, None, Some(desired_ram), None, None); + + thread::sleep(std::time::Duration::new(30, 0)); + assert!(guest.get_total_memory().unwrap_or_default() > 960_000); + + // After the resize, check again that file1 exists and its + // content is "foo". + assert_eq!( + guest.ssh_command("cat mount_dir/file1").unwrap().trim(), + "foo" + ); + } + + if hotplug { + // Remove from VM + guest.ssh_command("sudo umount mount_dir").unwrap(); + assert!(remote_command(&api_socket, "remove-device", Some("myfs0"))); + } + }); + + let (r, hotplug_daemon_child) = if r.is_ok() && hotplug { + thread::sleep(std::time::Duration::new(10, 0)); + let (daemon_child, virtiofsd_socket_path) = + prepare_daemon(&guest.tmp_dir, shared_dir.to_str().unwrap()); + + let r = std::panic::catch_unwind(|| { + thread::sleep(std::time::Duration::new(10, 0)); + let fs_params = format!( + "id=myfs0,socket={},{}{}", + virtiofsd_socket_path, + if use_generic_vhost_user { + "queue_sizes=[1024,1024],virtio_id=26" + } else { + "tag=myfs,num_queues=1,queue_size=1024" + }, + if let Some(pci_segment) = pci_segment { + format!(",pci_segment={pci_segment}") + } else { + String::new() + } + ); + + // Add back and check it works + let (cmd_success, cmd_output) = + remote_command_w_output(&api_socket, add_arg, Some(&fs_params)); + assert!(cmd_success); + if let Some(pci_segment) = pci_segment { + assert!(String::from_utf8_lossy(&cmd_output).contains(&format!( + "{{\"id\":\"myfs0\",\"bdf\":\"{pci_segment:04x}:00:01.0\"}}" + ))); + } else { + assert!( + String::from_utf8_lossy(&cmd_output) + .contains("{\"id\":\"myfs0\",\"bdf\":\"0000:00:06.0\"}") + ); + } + + thread::sleep(std::time::Duration::new(10, 0)); + // Mount shared directory through virtio_fs filesystem + guest + .ssh_command("mkdir -p mount_dir && sudo mount -t virtiofs myfs mount_dir/") + .unwrap(); + + // Check 
file1 exists and its content is "foo" + assert_eq!( + guest.ssh_command("cat mount_dir/file1").unwrap().trim(), + "foo" + ); + }); + + (r, Some(daemon_child)) + } else { + (r, None) + }; + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + let _ = daemon_child.kill(); + let _ = daemon_child.wait(); + + if let Some(mut daemon_child) = hotplug_daemon_child { + let _ = daemon_child.kill(); + let _ = daemon_child.wait(); + } + + handle_child_output(r, &output); +} + +pub(crate) fn test_virtio_pmem(discard_writes: bool, specify_size: bool) { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)); + + let kernel_path = direct_kernel_boot_path(); + + let pmem_temp_file = TempFile::new().unwrap(); + pmem_temp_file.as_file().set_len(128 << 20).unwrap(); + + std::process::Command::new("mkfs.ext4") + .arg(pmem_temp_file.as_path()) + .output() + .expect("Expect creating disk image to succeed"); + + let mut child = GuestCommand::new(&guest) + .default_cpus() + .default_memory() + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .default_disks() + .default_net() + .args([ + "--pmem", + format!( + "file={}{}{}", + pmem_temp_file.as_path().to_str().unwrap(), + if specify_size { ",size=128M" } else { "" }, + if discard_writes { + ",discard_writes=on" + } else { + "" + } + ) + .as_str(), + ]) + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + // Check for the presence of /dev/pmem0 + assert_eq!( + guest.ssh_command("ls /dev/pmem0").unwrap().trim(), + "/dev/pmem0" + ); + + // Check changes persist after reboot + assert_eq!(guest.ssh_command("sudo mount /dev/pmem0 /mnt").unwrap(), ""); + assert_eq!(guest.ssh_command("ls /mnt").unwrap(), "lost+found\n"); + guest + .ssh_command("echo test123 | sudo tee /mnt/test") + .unwrap(); + assert_eq!(guest.ssh_command("sudo 
umount /mnt").unwrap(), ""); + assert_eq!(guest.ssh_command("ls /mnt").unwrap(), ""); + + guest.reboot_linux(0); + assert_eq!(guest.ssh_command("sudo mount /dev/pmem0 /mnt").unwrap(), ""); + assert_eq!( + guest + .ssh_command("sudo cat /mnt/test || true") + .unwrap() + .trim(), + if discard_writes { "" } else { "test123" } + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +pub(crate) fn _test_virtio_vsock(guest: &Guest, hotplug: bool) { + let socket = temp_vsock_path(&guest.tmp_dir); + let api_socket = temp_api_path(&guest.tmp_dir); + + let mut cmd = GuestCommand::new(guest); + cmd.args(["--api-socket", &api_socket]); + cmd.default_cpus(); + cmd.default_memory(); + cmd.default_kernel_cmdline(); + cmd.default_disks(); + cmd.default_net(); + + if !hotplug { + cmd.args(["--vsock", format!("cid=3,socket={socket}").as_str()]); + } + + let mut child = cmd.capture_output().spawn().unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + if hotplug { + let (cmd_success, cmd_output) = remote_command_w_output( + &api_socket, + "add-vsock", + Some(format!("cid=3,socket={socket},id=test0").as_str()), + ); + assert!(cmd_success); + assert!( + String::from_utf8_lossy(&cmd_output) + .contains("{\"id\":\"test0\",\"bdf\":\"0000:00:06.0\"}") + ); + thread::sleep(std::time::Duration::new(10, 0)); + // Check adding a second one fails + assert!(!remote_command( + &api_socket, + "add-vsock", + Some("cid=1234,socket=/tmp/fail") + )); + } + + // Validate vsock works as expected. + guest.check_vsock(socket.as_str()); + guest.reboot_linux(0); + // Validate vsock still works after a reboot. 
+ guest.check_vsock(socket.as_str()); + + if hotplug { + assert!(remote_command(&api_socket, "remove-device", Some("test0"))); + } + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +pub(crate) fn test_memory_mergeable(mergeable: bool) { + let memory_param = if mergeable { + "mergeable=on" + } else { + "mergeable=off" + }; + + // We assume the number of shared pages in the rest of the system to be constant + let ksm_ps_init = get_ksm_pages_shared(); + + let disk_config1 = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); + let guest1 = Guest::new(Box::new(disk_config1)); + let mut child1 = GuestCommand::new(&guest1) + .default_cpus() + .args(["--memory", format!("size=512M,{memory_param}").as_str()]) + .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .default_disks() + .args(["--net", guest1.default_net_string().as_str()]) + .args(["--serial", "tty", "--console", "off"]) + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest1.wait_vm_boot().unwrap(); + }); + if r.is_err() { + kill_child(&mut child1); + let output = child1.wait_with_output().unwrap(); + handle_child_output(r, &output); + panic!("Test should already be failed/panicked"); // To explicitly mark this block never return + } + + let ksm_ps_guest1 = get_ksm_pages_shared(); + + let disk_config2 = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); + let guest2 = Guest::new(Box::new(disk_config2)); + let mut child2 = GuestCommand::new(&guest2) + .default_cpus() + .args(["--memory", format!("size=512M,{memory_param}").as_str()]) + .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .default_disks() + .args(["--net", guest2.default_net_string().as_str()]) + .args(["--serial", "tty", "--console", "off"]) + .capture_output() + .spawn() + .unwrap(); + + let r = 
std::panic::catch_unwind(|| { + guest2.wait_vm_boot().unwrap(); + let ksm_ps_guest2 = get_ksm_pages_shared(); + + if mergeable { + println!( + "ksm pages_shared after vm1 booted '{ksm_ps_guest1}', ksm pages_shared after vm2 booted '{ksm_ps_guest2}'" + ); + // We are expecting the number of shared pages to increase as the number of VM increases + assert!(ksm_ps_guest1 < ksm_ps_guest2); + } else { + assert!(ksm_ps_guest1 == ksm_ps_init); + assert!(ksm_ps_guest2 == ksm_ps_init); + } + }); + + kill_child(&mut child1); + kill_child(&mut child2); + + let output = child1.wait_with_output().unwrap(); + child2.wait().unwrap(); + + handle_child_output(r, &output); +} + +// This test validates that it can find the virtio-iommu device at first. +// It also verifies that both disks and the network card are attached to +// the virtual IOMMU by looking at /sys/kernel/iommu_groups directory. +// The last interesting part of this test is that it exercises the network +// interface attached to the virtual IOMMU since this is the one used to +// send all commands through SSH. +pub(crate) fn _test_virtio_iommu(_acpi: bool /* not needed on x86_64 */) { + // Virtio-iommu support is ready in recent kernel (v5.14). But the kernel in + // Focal image is still old. + // So if ACPI is enabled on AArch64, we use a modified Focal image in which + // the kernel binary has been updated. 
+ #[cfg(target_arch = "aarch64")] + let focal_image = FOCAL_IMAGE_UPDATE_KERNEL_NAME.to_string(); + #[cfg(target_arch = "x86_64")] + let focal_image = FOCAL_IMAGE_NAME.to_string(); + let disk_config = UbuntuDiskConfig::new(focal_image); + let guest = Guest::new(Box::new(disk_config)); + + #[cfg(target_arch = "x86_64")] + let kernel_path = direct_kernel_boot_path(); + #[cfg(target_arch = "aarch64")] + let kernel_path = if _acpi { + edk2_path() + } else { + direct_kernel_boot_path() + }; + + let mut child = GuestCommand::new(&guest) + .default_cpus() + .default_memory() + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .args([ + "--disk", + format!( + "path={},iommu=on", + guest.disk_config.disk(DiskType::OperatingSystem).unwrap() + ) + .as_str(), + format!( + "path={},iommu=on", + guest.disk_config.disk(DiskType::CloudInit).unwrap() + ) + .as_str(), + ]) + .args(["--net", guest.default_net_string_w_iommu().as_str()]) + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + // Verify the virtio-iommu device is present. + assert!( + guest + .does_device_vendor_pair_match("0x1057", "0x1af4") + .unwrap_or_default() + ); + + // On AArch64, if the guest system boots from FDT, the behavior of IOMMU is a bit + // different with ACPI. + // All devices on the PCI bus will be attached to the virtual IOMMU, except the + // virtio-iommu device itself. So these devices will all be added to IOMMU groups, + // and appear under folder '/sys/kernel/iommu_groups/'. + // + // Verify the first disk is in an iommu group. + assert!( + guest + .ssh_command("ls /sys/kernel/iommu_groups/*/devices") + .unwrap() + .contains("0000:00:02.0") + ); + + // Verify the second disk is in an iommu group. + assert!( + guest + .ssh_command("ls /sys/kernel/iommu_groups/*/devices") + .unwrap() + .contains("0000:00:03.0") + ); + + // Verify the network card is in an iommu group. 
+ assert!(
+ guest
+ .ssh_command("ls /sys/kernel/iommu_groups/*/devices")
+ .unwrap()
+ .contains("0000:00:04.0")
+ );
+ });
+
+ kill_child(&mut child);
+ let output = child.wait_with_output().unwrap();
+
+ handle_child_output(r, &output);
+}
+
+// ivshmem test
+// This case validates that read data from host(host write data to ivshmem backend file,
+// guest read data from ivshmem pci bar2 memory)
+// and write data to host(guest write data to ivshmem pci bar2 memory, host read it from
+// ivshmem backend file).
+// It also checks the size of the shared memory region.
+pub(crate) fn _test_ivshmem(guest: &Guest, ivshmem_file_path: impl AsRef<Path>, file_size: &str) {
+ let ivshmem_file_path = ivshmem_file_path.as_ref();
+ let test_message_read = String::from("ivshmem device test data read");
+ // Modify backend file data before function test
+ let mut file = OpenOptions::new()
+ .read(true)
+ .write(true)
+ .open(ivshmem_file_path)
+ .unwrap();
+ file.seek(SeekFrom::Start(0)).unwrap();
+ file.write_all(test_message_read.as_bytes()).unwrap();
+ file.write_all(b"\0").unwrap();
+ file.flush().unwrap();
+
+ let output = fs::read_to_string(ivshmem_file_path).unwrap();
+ let nul_pos = output.as_bytes().iter().position(|&b| b == 0).unwrap();
+ let c_str = CStr::from_bytes_until_nul(&output.as_bytes()[..=nul_pos]).unwrap();
+ let file_message = c_str.to_string_lossy().to_string();
+ // Check if the backend file data is correct
+ assert_eq!(test_message_read, file_message);
+
+ let device_id_line = String::from(
+ guest
+ .ssh_command("lspci -D | grep \"Inter-VM shared memory\"")
+ .unwrap()
+ .trim(),
+ );
+ // Check if ivshmem exists
+ assert!(!device_id_line.is_empty());
+ let device_id = device_id_line.split(" ").next().unwrap();
+ // Check shared memory size
+ assert_eq!(
+ guest
+ .ssh_command(
+ format!("lspci -vv -s {device_id} | grep -c \"Region 2.*size={file_size}\"")
+ .as_str(),
+ )
+ .unwrap()
+ .trim()
+ .parse::<u32>()
+ .unwrap_or_default(),
+ 1
+ );
+
+ // guest doesn't 
have gcc or g++, try to use python to test :( + // This python program try to mmap the ivshmem pci bar2 memory and read the data from it. + let ivshmem_test_read = format!( + r#" +import os +import mmap +from ctypes import create_string_buffer, c_char, memmove + +if __name__ == "__main__": + device_path = f"/sys/bus/pci/devices/{device_id}/resource2" + fd = os.open(device_path, os.O_RDWR | os.O_SYNC) + + PAGE_SIZE = os.sysconf('SC_PAGESIZE') + + with mmap.mmap(fd, PAGE_SIZE, flags=mmap.MAP_SHARED, + prot=mmap.PROT_READ | mmap.PROT_WRITE, offset=0) as shmem: + c_buf = (c_char * PAGE_SIZE).from_buffer(shmem) + null_pos = c_buf.raw.find(b'\x00') + valid_data = c_buf.raw[:null_pos] if null_pos != -1 else c_buf.raw + print(valid_data.decode('utf-8', errors='replace'), end="") + shmem.flush() + del c_buf + + os.close(fd) + "# + ); + guest + .ssh_command( + format!( + r#"cat << EOF > test_read.py +{ivshmem_test_read} +EOF +"# + ) + .as_str(), + ) + .unwrap(); + let guest_message = guest.ssh_command("sudo python3 test_read.py").unwrap(); + + // Check the probe message in host and guest + assert_eq!(test_message_read, guest_message); + + let test_message_write = "ivshmem device test data write"; + // Then the program writes a test message to the memory and flush it. 
+ let ivshmem_test_write = format!( + r#" +import os +import mmap +from ctypes import create_string_buffer, c_char, memmove + +if __name__ == "__main__": + device_path = f"/sys/bus/pci/devices/{device_id}/resource2" + test_message = "{test_message_write}" + fd = os.open(device_path, os.O_RDWR | os.O_SYNC) + + PAGE_SIZE = os.sysconf('SC_PAGESIZE') + + with mmap.mmap(fd, PAGE_SIZE, flags=mmap.MAP_SHARED, + prot=mmap.PROT_READ | mmap.PROT_WRITE, offset=0) as shmem: + shmem.flush() + c_buf = (c_char * PAGE_SIZE).from_buffer(shmem) + encoded_msg = test_message.encode('utf-8').ljust(1000, b'\x00') + memmove(c_buf, encoded_msg, len(encoded_msg)) + shmem.flush() + del c_buf + + os.close(fd) + "# + ); + + guest + .ssh_command( + format!( + r#"cat << EOF > test_write.py +{ivshmem_test_write} +EOF +"# + ) + .as_str(), + ) + .unwrap(); + + let _ = guest.ssh_command("sudo python3 test_write.py").unwrap(); + + let output = fs::read_to_string(ivshmem_file_path).unwrap(); + let nul_pos = output.as_bytes().iter().position(|&b| b == 0).unwrap(); + let c_str = CStr::from_bytes_until_nul(&output.as_bytes()[..=nul_pos]).unwrap(); + let file_message = c_str.to_string_lossy().to_string(); + // Check to send data from guest to host + assert_eq!(test_message_write, file_message); +} + +pub(crate) fn _test_simple_launch(guest: &Guest) { + let event_path = temp_event_monitor_path(&guest.tmp_dir); + + let mut child = GuestCommand::new(guest) + .default_cpus() + .default_memory() + .default_kernel_cmdline() + .default_disks() + .default_net() + .args(["--serial", "tty", "--console", "off"]) + .args(["--event-monitor", format!("path={event_path}").as_str()]) + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + guest.validate_cpu_count(None); + guest.validate_memory(None); + assert_eq!(guest.get_pci_bridge_class().unwrap_or_default(), "0x060000"); + assert!(check_sequential_events( + &guest + 
.get_expected_seq_events_for_simple_launch() + .iter() + .collect::>(), + &event_path + )); + + // It's been observed on the Bionic image that udev and snapd + // services can cause some delay in the VM's shutdown. Disabling + // them improves the reliability of this test. + let _ = guest.ssh_command("sudo systemctl disable udev"); + let _ = guest.ssh_command("sudo systemctl stop udev"); + let _ = guest.ssh_command("sudo systemctl disable snapd"); + let _ = guest.ssh_command("sudo systemctl stop snapd"); + + guest.ssh_command("sudo poweroff").unwrap(); + thread::sleep(std::time::Duration::new(20, 0)); + let latest_events = [ + &MetaEvent { + event: "shutdown".to_string(), + device_id: None, + }, + &MetaEvent { + event: "deleted".to_string(), + device_id: None, + }, + &MetaEvent { + event: "shutdown".to_string(), + device_id: None, + }, + ]; + assert!(check_latest_events_exact(&latest_events, &event_path)); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +pub(crate) fn _test_multi_cpu(guest: &Guest) { + let mut cmd = GuestCommand::new(guest); + cmd.args(["--cpus", "boot=2,max=4"]) + .default_memory() + .default_kernel_cmdline() + .capture_output() + .default_disks() + .default_net(); + + let mut child = cmd.spawn().unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + assert_eq!(guest.get_cpu_count().unwrap_or_default(), 2); + + assert_eq!( + guest + .ssh_command(r#"sudo dmesg | grep "smp: Brought up" | sed "s/\[\ *[0-9.]*\] //""#) + .unwrap() + .trim(), + "smp: Brought up 1 node, 2 CPUs" + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +pub(crate) fn _test_cpu_affinity(guest: &Guest) { + // We need the host to have at least 4 CPUs if we want to be able + // to run this test. 
+ let host_cpus_count = exec_host_command_output("nproc"); + assert!( + String::from_utf8_lossy(&host_cpus_count.stdout) + .trim() + .parse::() + .unwrap_or(0) + >= 4 + ); + + let mut child = GuestCommand::new(guest) + .default_cpus_with_affinity() + .default_memory() + .default_kernel_cmdline() + .default_disks() + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + let pid = child.id(); + let taskset_vcpu0 = exec_host_command_output(format!("taskset -pc $(ps -T -p {pid} | grep vcpu0 | xargs | cut -f 2 -d \" \") | cut -f 6 -d \" \"").as_str()); + assert_eq!(String::from_utf8_lossy(&taskset_vcpu0.stdout).trim(), "0,2"); + let taskset_vcpu1 = exec_host_command_output(format!("taskset -pc $(ps -T -p {pid} | grep vcpu1 | xargs | cut -f 2 -d \" \") | cut -f 6 -d \" \"").as_str()); + assert_eq!(String::from_utf8_lossy(&taskset_vcpu1.stdout).trim(), "1,3"); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + handle_child_output(r, &output); +} + +pub(crate) fn _test_virtio_queue_affinity(guest: &Guest) { + // We need the host to have at least 4 CPUs if we want to be able + // to run this test. 
+ let host_cpus_count = exec_host_command_output("nproc"); + assert!( + String::from_utf8_lossy(&host_cpus_count.stdout) + .trim() + .parse::() + .unwrap_or(0) + >= 4 + ); + + let mut child = GuestCommand::new(guest) + .default_cpus() + .default_memory() + .default_kernel_cmdline() + .args([ + "--disk", + format!( + "path={}", + guest.disk_config.disk(DiskType::OperatingSystem).unwrap() + ) + .as_str(), + format!( + "path={},num_queues=4,queue_affinity=[0@[0,2],1@[1,3],2@[1],3@[3]]", + guest.disk_config.disk(DiskType::CloudInit).unwrap() + ) + .as_str(), + ]) + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + let pid = child.id(); + let taskset_q0 = exec_host_command_output(format!("taskset -pc $(ps -T -p {pid} | grep disk1_q0 | xargs | cut -f 2 -d \" \") | cut -f 6 -d \" \"").as_str()); + assert_eq!(String::from_utf8_lossy(&taskset_q0.stdout).trim(), "0,2"); + let taskset_q1 = exec_host_command_output(format!("taskset -pc $(ps -T -p {pid} | grep disk1_q1 | xargs | cut -f 2 -d \" \") | cut -f 6 -d \" \"").as_str()); + assert_eq!(String::from_utf8_lossy(&taskset_q1.stdout).trim(), "1,3"); + let taskset_q2 = exec_host_command_output(format!("taskset -pc $(ps -T -p {pid} | grep disk1_q2 | xargs | cut -f 2 -d \" \") | cut -f 6 -d \" \"").as_str()); + assert_eq!(String::from_utf8_lossy(&taskset_q2.stdout).trim(), "1"); + let taskset_q3 = exec_host_command_output(format!("taskset -pc $(ps -T -p {pid} | grep disk1_q3 | xargs | cut -f 2 -d \" \") | cut -f 6 -d \" \"").as_str()); + assert_eq!(String::from_utf8_lossy(&taskset_q3.stdout).trim(), "3"); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + handle_child_output(r, &output); +} + +pub(crate) fn _test_pci_msi(guest: &Guest) { + let mut cmd = GuestCommand::new(guest); + cmd.default_cpus() + .default_memory() + .default_kernel_cmdline() + .capture_output() + .default_disks() + .default_net(); + + let 
mut child = cmd.spawn().unwrap(); + + guest.wait_vm_boot().unwrap(); + + let grep_cmd = format!("grep -c {} /proc/interrupts", get_msi_interrupt_pattern()); + + let r = std::panic::catch_unwind(|| { + assert_eq!( + guest + .ssh_command(&grep_cmd) + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 12 + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +pub(crate) fn _test_virtio_net_ctrl_queue(guest: &Guest) { + let mut cmd = GuestCommand::new(guest); + cmd.default_cpus() + .default_memory() + .default_kernel_cmdline() + .args(["--net", guest.default_net_string_w_mtu(3000).as_str()]) + .capture_output() + .default_disks(); + + let mut child = cmd.spawn().unwrap(); + + guest.wait_vm_boot().unwrap(); + + #[cfg(target_arch = "aarch64")] + let iface = "enp0s4"; + #[cfg(target_arch = "x86_64")] + let iface = "ens4"; + + let r = std::panic::catch_unwind(|| { + assert_eq!( + guest + .ssh_command( + format!("sudo ethtool -K {iface} rx-gro-hw off && echo success").as_str() + ) + .unwrap() + .trim(), + "success" + ); + assert_eq!( + guest + .ssh_command(format!("cat /sys/class/net/{iface}/mtu").as_str()) + .unwrap() + .trim(), + "3000" + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +pub(crate) fn _test_pci_multiple_segments( + guest: &Guest, + max_num_pci_segments: u16, + pci_segments_for_disk: u16, +) { + // Prepare another disk file for the virtio-disk device + let test_disk_path = String::from( + guest + .tmp_dir + .as_path() + .join("test-disk.raw") + .to_str() + .unwrap(), + ); + assert!( + exec_host_command_status(format!("truncate {test_disk_path} -s 4M").as_str()).success() + ); + assert!(exec_host_command_status(format!("mkfs.ext4 {test_disk_path}").as_str()).success()); + + let mut cmd = GuestCommand::new(guest); + cmd.default_cpus() + .default_memory() + 
.default_kernel_cmdline_with_platform(Some(&format!( + "num_pci_segments={max_num_pci_segments}" + ))) + .args([ + "--disk", + format!( + "path={}", + guest.disk_config.disk(DiskType::OperatingSystem).unwrap() + ) + .as_str(), + format!( + "path={}", + guest.disk_config.disk(DiskType::CloudInit).unwrap() + ) + .as_str(), + format!("path={test_disk_path},pci_segment={pci_segments_for_disk},image_type=raw") + .as_str(), + ]) + .capture_output() + .default_net(); + + let mut child = cmd.spawn().unwrap(); + + guest.wait_vm_boot().unwrap(); + + let grep_cmd = "lspci | grep \"Host bridge\" | wc -l"; + + let r = std::panic::catch_unwind(|| { + // There should be MAX_NUM_PCI_SEGMENTS PCI host bridges in the guest. + assert_eq!( + guest + .ssh_command(grep_cmd) + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + max_num_pci_segments + ); + + // Check both if /dev/vdc exists and if the block size is 4M. + assert_eq!( + guest + .ssh_command("lsblk | grep vdc | grep -c 4M") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 1 + ); + + // Mount the device. + guest.ssh_command("mkdir mount_image").unwrap(); + guest + .ssh_command("sudo mount -o rw -t ext4 /dev/vdc mount_image/") + .unwrap(); + // Grant all users with write permission. + guest.ssh_command("sudo chmod a+w mount_image/").unwrap(); + + // Write something to the device. + guest + .ssh_command("sudo echo \"bar\" >> mount_image/foo") + .unwrap(); + + // Check the content of the block device. The file "foo" should + // contain "bar". 
+ assert_eq!( + guest + .ssh_command("sudo cat mount_image/foo") + .unwrap() + .trim(), + "bar" + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +pub(crate) fn _test_direct_kernel_boot(guest: &Guest) { + let mut child = GuestCommand::new(guest) + .default_cpus() + .default_memory() + .default_kernel_cmdline() + .default_disks() + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + guest.validate_cpu_count(None); + guest.validate_memory(None); + + let grep_cmd = format!("grep -c {} /proc/interrupts", get_msi_interrupt_pattern()); + assert_eq!( + guest + .ssh_command(&grep_cmd) + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 12 + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +pub(crate) fn _test_virtio_block( + guest: &Guest, + disable_io_uring: bool, + disable_aio: bool, + verify_os_disk: bool, + backing_files: bool, + image_type: ImageType, +) { + let mut workload_path = dirs::home_dir().unwrap(); + workload_path.push("workloads"); + + let mut blk_file_path = workload_path; + blk_file_path.push("blk.img"); + + let initial_backing_checksum = if verify_os_disk { + compute_backing_checksum(guest.disk_config.disk(DiskType::OperatingSystem).unwrap()) + } else { + None + }; + assert!( + guest.num_cpu >= 4, + "_test_virtio_block requires at least 4 CPUs to match num_queues=4" + ); + let mut cloud_child = GuestCommand::new(guest) + .default_cpus() + .args(["--memory", "size=512M,shared=on"]) + .default_kernel_cmdline() + .args([ + "--disk", + format!( + "path={},backing_files={},image_type={image_type}", + guest.disk_config.disk(DiskType::OperatingSystem).unwrap(), + if backing_files { "on" } else { "off" }, + ) + .as_str(), + format!( + "path={}", + guest.disk_config.disk(DiskType::CloudInit).unwrap() + ) + 
.as_str(), + format!( + "path={},readonly=on,direct=on,num_queues=4,_disable_io_uring={},_disable_aio={}", + blk_file_path.to_str().unwrap(), + disable_io_uring, + disable_aio, + ) + .as_str(), + ]) + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + // Check both if /dev/vdc exists and if the block size is 16M. + assert_eq!( + guest + .ssh_command("lsblk | grep vdc | grep -c 16M") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 1 + ); + + // Check both if /dev/vdc exists and if this block is RO. + assert_eq!( + guest + .ssh_command("lsblk | grep vdc | awk '{print $5}'") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 1 + ); + + // Check if the number of queues is 4. + assert_eq!( + guest + .ssh_command("ls -ll /sys/block/vdc/mq | grep ^d | wc -l") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 4 + ); + }); + + if verify_os_disk { + // Use clean shutdown to allow cloud-hypervisor to clear + // the dirty bit in the QCOW2 v3 image. + kill_child(&mut cloud_child); + } else { + let _ = cloud_child.kill(); + } + let output = cloud_child.wait_with_output().unwrap(); + + handle_child_output(r, &output); + + if verify_os_disk { + disk_check_consistency( + guest.disk_config.disk(DiskType::OperatingSystem).unwrap(), + initial_backing_checksum, + ); + } +} diff --git a/cloud-hypervisor/tests/common/utils.rs b/cloud-hypervisor/tests/common/utils.rs new file mode 100644 index 0000000000..ac48641826 --- /dev/null +++ b/cloud-hypervisor/tests/common/utils.rs @@ -0,0 +1,1045 @@ +// Copyright 2025 The Cloud Hypervisor Authors. All rights reserved. 
+// +// SPDX-License-Identifier: Apache-2.0 +use std::collections::HashMap; +use std::fs::{File, OpenOptions}; +use std::io::{BufRead, Read, Seek, SeekFrom, Write}; +use std::path::{Path, PathBuf}; +use std::process::{Child, Command}; +use std::string::String; +use std::sync::mpsc; +use std::sync::mpsc::Receiver; +use std::{cmp, fs, io, thread}; + +use test_infra::*; +use vmm_sys_util::tempdir::TempDir; + +const QCOW2_INCOMPATIBLE_FEATURES_OFFSET: u64 = 72; +// 10MB is our maximum accepted overhead. +pub(crate) const MAXIMUM_VMM_OVERHEAD_KB: u32 = 10 * 1024; + +// This enum exists to make it more convenient to +// implement test for both D-Bus and REST APIs. +pub(crate) enum TargetApi { + // API socket + HttpApi(String), + // well known service name, object path + DBusApi(String, String), +} + +impl TargetApi { + pub(crate) fn new_http_api(tmp_dir: &TempDir) -> Self { + Self::HttpApi(temp_api_path(tmp_dir)) + } + + pub(crate) fn new_dbus_api(tmp_dir: &TempDir) -> Self { + // `tmp_dir` is in the form of "/tmp/chXXXXXX" + // and we take the `chXXXXXX` part as a unique identifier for the guest + let id = tmp_dir.as_path().file_name().unwrap().to_str().unwrap(); + + Self::DBusApi( + format!("org.cloudhypervisor.{id}"), + format!("/org/cloudhypervisor/{id}"), + ) + } + + pub(crate) fn guest_args(&self) -> Vec { + match self { + TargetApi::HttpApi(api_socket) => { + vec![format!("--api-socket={}", api_socket.as_str())] + } + TargetApi::DBusApi(service_name, object_path) => { + vec![ + format!("--dbus-service-name={}", service_name.as_str()), + format!("--dbus-object-path={}", object_path.as_str()), + ] + } + } + } + + pub(crate) fn remote_args(&self) -> Vec { + // `guest_args` and `remote_args` are consistent with each other + self.guest_args() + } + + pub(crate) fn remote_command(&self, command: &str, arg: Option<&str>) -> bool { + let mut cmd = Command::new(clh_command("ch-remote")); + cmd.args(self.remote_args()); + cmd.arg(command); + + if let Some(arg) = arg { + 
cmd.arg(arg); + } + + let output = cmd.output().unwrap(); + if output.status.success() { + true + } else { + eprintln!("Error running ch-remote command: {:?}", &cmd); + let stderr = String::from_utf8_lossy(&output.stderr); + eprintln!("stderr: {stderr}"); + false + } + } +} + +pub(crate) fn temp_api_path(tmp_dir: &TempDir) -> String { + String::from( + tmp_dir + .as_path() + .join("cloud-hypervisor.sock") + .to_str() + .unwrap(), + ) +} + +pub(crate) fn prepare_virtiofsd( + tmp_dir: &TempDir, + shared_dir: &str, +) -> (std::process::Child, String) { + let mut workload_path = dirs::home_dir().unwrap(); + workload_path.push("workloads"); + + let mut virtiofsd_path = workload_path; + virtiofsd_path.push("virtiofsd"); + let virtiofsd_path = String::from(virtiofsd_path.to_str().unwrap()); + + let virtiofsd_socket_path = + String::from(tmp_dir.as_path().join("virtiofs.sock").to_str().unwrap()); + + // Start the daemon + let child = Command::new(virtiofsd_path.as_str()) + .args(["--shared-dir", shared_dir]) + .args(["--socket-path", virtiofsd_socket_path.as_str()]) + .args(["--cache", "never"]) + .args(["--tag", "myfs"]) + .spawn() + .unwrap(); + + thread::sleep(std::time::Duration::new(10, 0)); + + (child, virtiofsd_socket_path) +} + +pub(crate) fn prepare_vubd( + tmp_dir: &TempDir, + blk_img: &str, + num_queues: usize, + rdonly: bool, + direct: bool, +) -> (std::process::Child, String) { + let mut workload_path = dirs::home_dir().unwrap(); + workload_path.push("workloads"); + + let mut blk_file_path = workload_path; + blk_file_path.push(blk_img); + let blk_file_path = String::from(blk_file_path.to_str().unwrap()); + + let vubd_socket_path = String::from(tmp_dir.as_path().join("vub.sock").to_str().unwrap()); + + // Start the daemon + let child = Command::new(clh_command("vhost_user_block")) + .args([ + "--block-backend", + format!( + "path={blk_file_path},socket={vubd_socket_path},num_queues={num_queues},readonly={rdonly},direct={direct}" + ) + .as_str(), + ]) + .spawn() 
+ .unwrap(); + + thread::sleep(std::time::Duration::new(10, 0)); + + (child, vubd_socket_path) +} + +pub(crate) fn temp_vsock_path(tmp_dir: &TempDir) -> String { + String::from(tmp_dir.as_path().join("vsock").to_str().unwrap()) +} + +pub(crate) fn temp_event_monitor_path(tmp_dir: &TempDir) -> String { + String::from(tmp_dir.as_path().join("event.json").to_str().unwrap()) +} + +// Creates the directory and returns the path. +pub(crate) fn temp_snapshot_dir_path(tmp_dir: &TempDir) -> String { + let snapshot_dir = String::from(tmp_dir.as_path().join("snapshot").to_str().unwrap()); + std::fs::create_dir(&snapshot_dir).unwrap(); + snapshot_dir +} + +pub(crate) fn temp_vmcore_file_path(tmp_dir: &TempDir) -> String { + String::from(tmp_dir.as_path().join("vmcore").to_str().unwrap()) +} + +pub(crate) fn cloud_hypervisor_release_path() -> String { + let mut workload_path = dirs::home_dir().unwrap(); + workload_path.push("workloads"); + + let mut ch_release_path = workload_path; + #[cfg(target_arch = "x86_64")] + ch_release_path.push("cloud-hypervisor-static"); + #[cfg(target_arch = "aarch64")] + ch_release_path.push("cloud-hypervisor-static-aarch64"); + + ch_release_path.into_os_string().into_string().unwrap() +} + +pub(crate) fn prepare_vhost_user_net_daemon( + tmp_dir: &TempDir, + ip: &str, + tap: Option<&str>, + mtu: Option, + num_queues: usize, + client_mode: bool, +) -> (std::process::Command, String) { + let vunet_socket_path = String::from(tmp_dir.as_path().join("vunet.sock").to_str().unwrap()); + + // Start the daemon + let mut net_params = format!( + "ip={ip},mask=255.255.255.128,socket={vunet_socket_path},num_queues={num_queues},queue_size=1024,client={client_mode}" + ); + + if let Some(tap) = tap { + net_params.push_str(format!(",tap={tap}").as_str()); + } + + if let Some(mtu) = mtu { + net_params.push_str(format!(",mtu={mtu}").as_str()); + } + + let mut command = Command::new(clh_command("vhost_user_net")); + command.args(["--net-backend", net_params.as_str()]); 
+ + (command, vunet_socket_path) +} + +pub(crate) fn prepare_swtpm_daemon(tmp_dir: &TempDir) -> (std::process::Command, String) { + let swtpm_tpm_dir = String::from(tmp_dir.as_path().join("swtpm").to_str().unwrap()); + let swtpm_socket_path = String::from( + tmp_dir + .as_path() + .join("swtpm") + .join("swtpm.sock") + .to_str() + .unwrap(), + ); + std::fs::create_dir(&swtpm_tpm_dir).unwrap(); + + let mut swtpm_command = Command::new("swtpm"); + let swtpm_args = [ + "socket", + "--tpmstate", + &format!("dir={swtpm_tpm_dir}"), + "--ctrl", + &format!("type=unixio,path={swtpm_socket_path}"), + "--flags", + "startup-clear", + "--tpm2", + ]; + swtpm_command.args(swtpm_args); + + (swtpm_command, swtpm_socket_path) +} + +pub(crate) fn remote_command(api_socket: &str, command: &str, arg: Option<&str>) -> bool { + let mut cmd = Command::new(clh_command("ch-remote")); + cmd.args([&format!("--api-socket={api_socket}"), command]); + + if let Some(arg) = arg { + cmd.arg(arg); + } + let output = cmd.output().unwrap(); + if output.status.success() { + true + } else { + eprintln!("Error running ch-remote command: {:?}", &cmd); + let stderr = String::from_utf8_lossy(&output.stderr); + eprintln!("stderr: {stderr}"); + false + } +} + +pub(crate) fn remote_command_w_output( + api_socket: &str, + command: &str, + arg: Option<&str>, +) -> (bool, Vec) { + let mut cmd = Command::new(clh_command("ch-remote")); + cmd.args([&format!("--api-socket={api_socket}"), command]); + + if let Some(arg) = arg { + cmd.arg(arg); + } + + let output = cmd.output().expect("Failed to launch ch-remote"); + + (output.status.success(), output.stdout) +} + +pub(crate) fn resize_command( + api_socket: &str, + desired_vcpus: Option, + desired_ram: Option, + desired_balloon: Option, + event_file: Option<&str>, +) -> bool { + let mut cmd = Command::new(clh_command("ch-remote")); + cmd.args([&format!("--api-socket={api_socket}"), "resize"]); + + if let Some(desired_vcpus) = desired_vcpus { + 
cmd.arg(format!("--cpus={desired_vcpus}")); + } + + if let Some(desired_ram) = desired_ram { + cmd.arg(format!("--memory={desired_ram}")); + } + + if let Some(desired_balloon) = desired_balloon { + cmd.arg(format!("--balloon={desired_balloon}")); + } + + let ret = cmd.status().expect("Failed to launch ch-remote").success(); + + if let Some(event_path) = event_file { + let latest_events = [ + &MetaEvent { + event: "resizing".to_string(), + device_id: None, + }, + &MetaEvent { + event: "resized".to_string(), + device_id: None, + }, + ]; + // See: #5938 + thread::sleep(std::time::Duration::new(1, 0)); + assert!(check_latest_events_exact(&latest_events, event_path)); + } + + ret +} + +pub(crate) fn resize_zone_command(api_socket: &str, id: &str, desired_size: &str) -> bool { + let mut cmd = Command::new(clh_command("ch-remote")); + cmd.args([ + &format!("--api-socket={api_socket}"), + "resize-zone", + &format!("--id={id}"), + &format!("--size={desired_size}"), + ]); + + cmd.status().expect("Failed to launch ch-remote").success() +} + +pub(crate) fn resize_disk_command(api_socket: &str, id: &str, desired_size: &str) -> bool { + let mut cmd = Command::new(clh_command("ch-remote")); + cmd.args([ + &format!("--api-socket={api_socket}"), + "resize-disk", + &format!("--disk={id}"), + &format!("--size={desired_size}"), + ]); + + cmd.status().expect("Failed to launch ch-remote").success() +} + +// setup OVS-DPDK bridge and ports +pub(crate) fn setup_ovs_dpdk() { + // setup OVS-DPDK + assert!(exec_host_command_status("service openvswitch-switch start").success()); + assert!(exec_host_command_status("ovs-vsctl init").success()); + assert!( + exec_host_command_status("ovs-vsctl set Open_vSwitch . 
other_config:dpdk-init=true") + .success() + ); + assert!(exec_host_command_status("service openvswitch-switch restart").success()); + + // Create OVS-DPDK bridge and ports + assert!( + exec_host_command_status( + "ovs-vsctl add-br ovsbr0 -- set bridge ovsbr0 datapath_type=netdev", + ) + .success() + ); + assert!(exec_host_command_status("ovs-vsctl add-port ovsbr0 vhost-user1 -- set Interface vhost-user1 type=dpdkvhostuserclient options:vhost-server-path=/tmp/dpdkvhostclient1").success()); + assert!(exec_host_command_status("ovs-vsctl add-port ovsbr0 vhost-user2 -- set Interface vhost-user2 type=dpdkvhostuserclient options:vhost-server-path=/tmp/dpdkvhostclient2").success()); + assert!(exec_host_command_status("ip link set up dev ovsbr0").success()); + assert!(exec_host_command_status("service openvswitch-switch restart").success()); +} + +pub(crate) fn cleanup_ovs_dpdk() { + assert!(exec_host_command_status("ovs-vsctl del-br ovsbr0").success()); + exec_host_command_status("rm -f ovs-vsctl /tmp/dpdkvhostclient1 /tmp/dpdkvhostclient2"); +} + +// Setup two guests and ensure they are connected through ovs-dpdk +pub(crate) fn setup_ovs_dpdk_guests( + guest1: &Guest, + guest2: &Guest, + api_socket: &str, + release_binary: bool, +) -> (Child, Child) { + setup_ovs_dpdk(); + + let clh_path = if release_binary { + cloud_hypervisor_release_path() + } else { + clh_command("cloud-hypervisor") + }; + + let mut child1 = GuestCommand::new_with_binary_path(guest1, &clh_path) + .args(["--cpus", "boot=2"]) + .args(["--memory", "size=0,shared=on"]) + .args(["--memory-zone", "id=mem0,size=1G,shared=on,host_numa_node=0"]) + .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .default_disks() + .args(["--net", guest1.default_net_string().as_str(), "vhost_user=true,socket=/tmp/dpdkvhostclient1,num_queues=2,queue_size=256,vhost_mode=server"]) + .capture_output() + .spawn() + .unwrap(); + + #[cfg(target_arch = "x86_64")] + 
let guest_net_iface = "ens5"; + #[cfg(target_arch = "aarch64")] + let guest_net_iface = "enp0s5"; + + let r = std::panic::catch_unwind(|| { + guest1.wait_vm_boot().unwrap(); + + guest1 + .ssh_command(&format!( + "sudo ip addr add 172.100.0.1/24 dev {guest_net_iface}" + )) + .unwrap(); + guest1 + .ssh_command(&format!("sudo ip link set up dev {guest_net_iface}")) + .unwrap(); + + let guest_ip = guest1.network.guest_ip0.clone(); + thread::spawn(move || { + ssh_command_ip( + "nc -l 12345", + &guest_ip, + DEFAULT_SSH_RETRIES, + DEFAULT_SSH_TIMEOUT, + ) + .unwrap(); + }); + }); + if r.is_err() { + cleanup_ovs_dpdk(); + + let _ = child1.kill(); + let output = child1.wait_with_output().unwrap(); + handle_child_output(r, &output); + panic!("Test should already be failed/panicked"); // To explicitly mark this block never return + } + + let mut child2 = GuestCommand::new_with_binary_path(guest2, &clh_path) + .args(["--api-socket", api_socket]) + .args(["--cpus", "boot=2"]) + .args(["--memory", "size=0,shared=on"]) + .args(["--memory-zone", "id=mem0,size=1G,shared=on,host_numa_node=0"]) + .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .default_disks() + .args(["--net", guest2.default_net_string().as_str(), "vhost_user=true,socket=/tmp/dpdkvhostclient2,num_queues=2,queue_size=256,vhost_mode=server"]) + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest2.wait_vm_boot().unwrap(); + + guest2 + .ssh_command(&format!( + "sudo ip addr add 172.100.0.2/24 dev {guest_net_iface}" + )) + .unwrap(); + guest2 + .ssh_command(&format!("sudo ip link set up dev {guest_net_iface}")) + .unwrap(); + + // Check the connection works properly between the two VMs + guest2.ssh_command("nc -vz 172.100.0.1 12345").unwrap(); + }); + if r.is_err() { + cleanup_ovs_dpdk(); + + let _ = child1.kill(); + let _ = child2.kill(); + let output = child2.wait_with_output().unwrap(); + 
handle_child_output(r, &output); + panic!("Test should already be failed/panicked"); // To explicitly mark this block never return + } + + (child1, child2) +} + +pub enum FwType { + Ovmf, + RustHypervisorFirmware, +} + +pub(crate) fn fw_path(_fw_type: FwType) -> String { + let mut workload_path = dirs::home_dir().unwrap(); + workload_path.push("workloads"); + + let mut fw_path = workload_path; + #[cfg(target_arch = "aarch64")] + fw_path.push("CLOUDHV_EFI.fd"); + #[cfg(target_arch = "x86_64")] + { + match _fw_type { + FwType::Ovmf => fw_path.push(OVMF_NAME), + FwType::RustHypervisorFirmware => fw_path.push("hypervisor-fw"), + } + } + + fw_path.to_str().unwrap().to_string() +} + +// Parse the event_monitor file based on the format that each event +// is followed by a double newline +fn parse_event_file(event_file: &str) -> Vec { + let content = fs::read(event_file).unwrap(); + let mut ret = Vec::new(); + for entry in String::from_utf8_lossy(&content) + .trim() + .split("\n\n") + .collect::>() + { + ret.push(serde_json::from_str(entry).unwrap()); + } + + ret +} + +// Return true if all events from the input 'expected_events' are matched sequentially +// with events from the 'event_file' +pub(crate) fn check_sequential_events(expected_events: &[&MetaEvent], event_file: &str) -> bool { + let json_events = parse_event_file(event_file); + let len = expected_events.len(); + let mut idx = 0; + for e in &json_events { + if idx == len { + break; + } + if expected_events[idx].match_with_json_event(e) { + idx += 1; + } + } + + let ret = idx == len; + + if !ret { + eprintln!( + "\n\n==== Start 'check_sequential_events' failed ==== \ + \n\nexpected_events={expected_events:?}\nactual_events={json_events:?} \ + \n\n==== End 'check_sequential_events' failed ====", + ); + } + + ret +} + +// Return true if all events from the input 'expected_events' are matched exactly +// with events from the 'event_file' +pub(crate) fn check_sequential_events_exact( + expected_events: &[&MetaEvent], 
+ event_file: &str, +) -> bool { + let json_events = parse_event_file(event_file); + assert!(expected_events.len() <= json_events.len()); + let json_events = &json_events[..expected_events.len()]; + + for (idx, e) in json_events.iter().enumerate() { + if !expected_events[idx].match_with_json_event(e) { + eprintln!( + "\n\n==== Start 'check_sequential_events_exact' failed ==== \ + \n\nexpected_events={expected_events:?}\nactual_events={json_events:?} \ + \n\n==== End 'check_sequential_events_exact' failed ====", + ); + + return false; + } + } + + true +} + +// Return true if events from the input 'latest_events' are matched exactly +// with the most recent events from the 'event_file' +pub(crate) fn check_latest_events_exact(latest_events: &[&MetaEvent], event_file: &str) -> bool { + let json_events = parse_event_file(event_file); + assert!(latest_events.len() <= json_events.len()); + let json_events = &json_events[(json_events.len() - latest_events.len())..]; + + for (idx, e) in json_events.iter().enumerate() { + if !latest_events[idx].match_with_json_event(e) { + eprintln!( + "\n\n==== Start 'check_latest_events_exact' failed ==== \ + \n\nexpected_events={latest_events:?}\nactual_events={json_events:?} \ + \n\n==== End 'check_latest_events_exact' failed ====", + ); + + return false; + } + } + + true +} + +pub(super) fn get_msi_interrupt_pattern() -> String { + #[cfg(target_arch = "x86_64")] + { + "PCI-MSI".to_string() + } + #[cfg(target_arch = "aarch64")] + { + if cfg!(feature = "mshv") { + "GICv2m-PCI-MSIX".to_string() + } else { + "ITS-PCI-MSIX".to_string() + } + } +} + +pub(super) type PrepareNetDaemon = dyn Fn( + &TempDir, + &str, + Option<&str>, + Option, + usize, + bool, +) -> (std::process::Command, String); + +pub(super) fn get_ksm_pages_shared() -> u32 { + fs::read_to_string("/sys/kernel/mm/ksm/pages_shared") + .unwrap() + .trim() + .parse::() + .unwrap() +} + +fn _get_vmm_overhead(pid: u32, guest_memory_size: u32) -> HashMap { + let smaps = 
fs::File::open(format!("/proc/{pid}/smaps")).unwrap(); + let reader = io::BufReader::new(smaps); + + let mut skip_map: bool = false; + let mut region_name: String = String::new(); + let mut region_maps = HashMap::new(); + for line in reader.lines() { + let l = line.unwrap(); + + if l.contains('-') { + let values: Vec<&str> = l.split_whitespace().collect(); + region_name = values.last().unwrap().trim().to_string(); + if region_name == "0" { + region_name = "anonymous".to_string(); + } + } + + // Each section begins with something that looks like: + // Size: 2184 kB + if l.starts_with("Size:") { + let values: Vec<&str> = l.split_whitespace().collect(); + let map_size = values[1].parse::().unwrap(); + // We skip the assigned guest RAM map, its RSS is only + // dependent on the guest actual memory usage. + // Everything else can be added to the VMM overhead. + skip_map = map_size >= guest_memory_size; + continue; + } + + // If this is a map we're taking into account, then we only + // count the RSS. The sum of all counted RSS is the VMM overhead. 
+ if !skip_map && l.starts_with("Rss:") { + let values: Vec<&str> = l.split_whitespace().collect(); + let value = values[1].trim().parse::().unwrap(); + *region_maps.entry(region_name.clone()).or_insert(0) += value; + } + } + + region_maps +} + +pub(crate) fn get_vmm_overhead(pid: u32, guest_memory_size: u32) -> u32 { + let mut total = 0; + + for (region_name, value) in &_get_vmm_overhead(pid, guest_memory_size) { + eprintln!("{region_name}: {value}"); + total += value; + } + + total +} + +pub(crate) fn process_rss_kib(pid: u32) -> usize { + let command = format!("ps -q {pid} -o rss="); + let rss = exec_host_command_output(&command); + String::from_utf8_lossy(&rss.stdout).trim().parse().unwrap() +} + +#[derive(PartialEq, Eq, PartialOrd)] +pub struct Counters { + rx_bytes: u64, + rx_frames: u64, + tx_bytes: u64, + tx_frames: u64, + read_bytes: u64, + write_bytes: u64, + read_ops: u64, + write_ops: u64, +} + +pub(crate) fn get_counters(api_socket: &str) -> Counters { + // Get counters + let (cmd_success, cmd_output) = remote_command_w_output(api_socket, "counters", None); + assert!(cmd_success); + + let counters: HashMap<&str, HashMap<&str, u64>> = + serde_json::from_slice(&cmd_output).unwrap_or_default(); + + let rx_bytes = *counters.get("_net2").unwrap().get("rx_bytes").unwrap(); + let rx_frames = *counters.get("_net2").unwrap().get("rx_frames").unwrap(); + let tx_bytes = *counters.get("_net2").unwrap().get("tx_bytes").unwrap(); + let tx_frames = *counters.get("_net2").unwrap().get("tx_frames").unwrap(); + + let read_bytes = *counters.get("_disk0").unwrap().get("read_bytes").unwrap(); + let write_bytes = *counters.get("_disk0").unwrap().get("write_bytes").unwrap(); + let read_ops = *counters.get("_disk0").unwrap().get("read_ops").unwrap(); + let write_ops = *counters.get("_disk0").unwrap().get("write_ops").unwrap(); + + Counters { + rx_bytes, + rx_frames, + tx_bytes, + tx_frames, + read_bytes, + write_bytes, + read_ops, + write_ops, + } +} + +pub(super) fn 
pty_read(mut pty: std::fs::File) -> Receiver { + let (tx, rx) = mpsc::channel::(); + thread::spawn(move || { + loop { + thread::sleep(std::time::Duration::new(1, 0)); + let mut buf = [0; 512]; + match pty.read(&mut buf) { + Ok(_bytes) => { + let output = std::str::from_utf8(&buf).unwrap().to_string(); + match tx.send(output) { + Ok(_) => (), + Err(_) => break, + } + } + Err(_) => break, + } + } + }); + rx +} + +pub(crate) fn get_pty_path(api_socket: &str, pty_type: &str) -> PathBuf { + let (cmd_success, cmd_output) = remote_command_w_output(api_socket, "info", None); + assert!(cmd_success); + let info: serde_json::Value = serde_json::from_slice(&cmd_output).unwrap_or_default(); + assert_eq!("Pty", info["config"][pty_type]["mode"]); + PathBuf::from( + info["config"][pty_type]["file"] + .as_str() + .expect("Missing pty path"), + ) +} + +// VFIO test network setup. +// We reserve a different IP class for it: 172.18.0.0/24. +#[cfg(target_arch = "x86_64")] +pub(crate) fn setup_vfio_network_interfaces() { + // 'vfio-br0' + assert!(exec_host_command_status("sudo ip link add name vfio-br0 type bridge").success()); + assert!(exec_host_command_status("sudo ip link set vfio-br0 up").success()); + assert!(exec_host_command_status("sudo ip addr add 172.18.0.1/24 dev vfio-br0").success()); + // 'vfio-tap0' + assert!(exec_host_command_status("sudo ip tuntap add vfio-tap0 mode tap").success()); + assert!(exec_host_command_status("sudo ip link set vfio-tap0 master vfio-br0").success()); + assert!(exec_host_command_status("sudo ip link set vfio-tap0 up").success()); + // 'vfio-tap1' + assert!(exec_host_command_status("sudo ip tuntap add vfio-tap1 mode tap").success()); + assert!(exec_host_command_status("sudo ip link set vfio-tap1 master vfio-br0").success()); + assert!(exec_host_command_status("sudo ip link set vfio-tap1 up").success()); + // 'vfio-tap2' + assert!(exec_host_command_status("sudo ip tuntap add vfio-tap2 mode tap").success()); + assert!(exec_host_command_status("sudo 
ip link set vfio-tap2 master vfio-br0").success()); + assert!(exec_host_command_status("sudo ip link set vfio-tap2 up").success()); + // 'vfio-tap3' + assert!(exec_host_command_status("sudo ip tuntap add vfio-tap3 mode tap").success()); + assert!(exec_host_command_status("sudo ip link set vfio-tap3 master vfio-br0").success()); + assert!(exec_host_command_status("sudo ip link set vfio-tap3 up").success()); +} + +// Tear VFIO test network down +#[cfg(target_arch = "x86_64")] +pub(crate) fn cleanup_vfio_network_interfaces() { + assert!(exec_host_command_status("sudo ip link del vfio-br0").success()); + assert!(exec_host_command_status("sudo ip link del vfio-tap0").success()); + assert!(exec_host_command_status("sudo ip link del vfio-tap1").success()); + assert!(exec_host_command_status("sudo ip link del vfio-tap2").success()); + assert!(exec_host_command_status("sudo ip link del vfio-tap3").success()); +} + +pub(crate) fn balloon_size(api_socket: &str) -> u64 { + let (cmd_success, cmd_output) = remote_command_w_output(api_socket, "info", None); + assert!(cmd_success); + + let info: serde_json::Value = serde_json::from_slice(&cmd_output).unwrap_or_default(); + let total_mem = &info["config"]["memory"]["size"] + .to_string() + .parse::() + .unwrap(); + let actual_mem = &info["memory_actual_size"] + .to_string() + .parse::() + .unwrap(); + total_mem - actual_mem +} + +pub(crate) fn vm_state(api_socket: &str) -> String { + let (cmd_success, cmd_output) = remote_command_w_output(api_socket, "info", None); + assert!(cmd_success); + + let info: serde_json::Value = serde_json::from_slice(&cmd_output).unwrap_or_default(); + let state = &info["state"].as_str().unwrap(); + + state.to_string() +} + +pub(crate) fn make_virtio_block_guest(factory: &GuestFactory, image_name: &str) -> Guest { + let disk_config = UbuntuDiskConfig::new(image_name.to_string()); + factory.create_guest(Box::new(disk_config)).with_cpu(4) +} + +pub(crate) fn compute_backing_checksum( + path_or_image_name: 
impl AsRef, +) -> Option<(std::path::PathBuf, String, u32)> { + let path = resolve_disk_path(path_or_image_name); + + let mut file = File::open(&path).ok()?; + if !matches!( + block::detect_image_type(&mut file).ok()?, + block::ImageType::Qcow2 + ) { + return None; + } + + let info = get_image_info(&path)?; + + let backing_file = info["backing-filename"].as_str()?; + let backing_path = if std::path::Path::new(backing_file).is_absolute() { + std::path::PathBuf::from(backing_file) + } else { + path.parent() + .unwrap_or_else(|| std::path::Path::new(".")) + .join(backing_file) + }; + + let backing_info = get_image_info(&backing_path)?; + let backing_format = backing_info["format"].as_str()?.to_string(); + let mut file = File::open(&backing_path).ok()?; + let file_size = file.metadata().ok()?.len(); + let checksum = compute_file_checksum(&mut file, file_size); + + Some((backing_path, backing_format, checksum)) +} + +/// Uses `qemu-img check` to verify disk image consistency. +/// +/// Supported formats are `qcow2` (compressed and uncompressed), +/// `vhdx`, `qed`, `parallels`, `vmdk`, and `vdi`. See man page +/// for more details. +/// +/// It takes either a full path to the image or just the name of +/// the image located in the `workloads` directory. +/// +/// For QCOW2 images with backing files, also verifies the backing file +/// integrity and checks that the backing file hasn't been modified +/// during the test. +/// +/// For QCOW2 v3 images, also verifies the dirty bit is cleared. 
+pub(crate) fn disk_check_consistency( + path_or_image_name: impl AsRef, + initial_backing_checksum: Option<(std::path::PathBuf, String, u32)>, +) { + let path = resolve_disk_path(path_or_image_name); + let output = run_qemu_img(&path, &["check"], None); + + assert!( + output.status.success(), + "qemu-img check failed: {}", + String::from_utf8_lossy(&output.stderr) + ); + + match check_dirty_flag(&path) { + Ok(Some(dirty)) => { + assert!(!dirty, "QCOW2 image shutdown unclean"); + } + Ok(None) => {} // Not a QCOW2 v3 image, skip dirty flag check + Err(e) => panic!("Failed to check dirty flag: {e}"), + } + + if let Some((backing_path, format, initial_checksum)) = initial_backing_checksum { + if format.parse::().ok() != Some(block::qcow::ImageType::Raw) { + let output = run_qemu_img(&backing_path, &["check"], None); + + assert!( + output.status.success(), + "qemu-img check of backing file failed: {}", + String::from_utf8_lossy(&output.stderr) + ); + } + + let mut file = File::open(&backing_path).unwrap(); + let file_size = file.metadata().unwrap().len(); + assert_eq!( + initial_checksum, + compute_file_checksum(&mut file, file_size) + ); + } +} + +pub(crate) fn run_qemu_img( + path: &std::path::Path, + args: &[&str], + trailing_args: Option<&[&str]>, +) -> std::process::Output { + let mut cmd = std::process::Command::new("qemu-img"); + cmd.arg(args[0]) + .args(&args[1..]) + .arg(path.to_str().unwrap()); + if let Some(extra) = trailing_args { + cmd.args(extra); + } + cmd.output().unwrap() +} + +fn get_image_info(path: &std::path::Path) -> Option { + let output = run_qemu_img(path, &["info", "-U", "--output=json"], None); + + output.status.success().then_some(())?; + serde_json::from_slice(&output.stdout).ok() +} + +fn get_qcow2_v3_info(path: &Path) -> Result, String> { + let info = get_image_info(path) + .ok_or_else(|| format!("qemu-img info failed for {}", path.display()))?; + if info["format"].as_str() != Some("qcow2") { + return Ok(None); + } + // QCOW2 v3 has 
compat "1.1", v2 has "0.10" + if info["format-specific"]["data"]["compat"].as_str() != Some("1.1") { + return Ok(None); + } + Ok(Some(info)) +} + +pub(crate) fn check_dirty_flag(path: &Path) -> Result, String> { + Ok(get_qcow2_v3_info(path)?.and_then(|info| info["dirty-flag"].as_bool())) +} + +pub(crate) fn check_corrupt_flag(path: &Path) -> Result, String> { + Ok(get_qcow2_v3_info(path)? + .and_then(|info| info["format-specific"]["data"]["corrupt"].as_bool())) +} + +pub(crate) fn set_corrupt_flag(path: &Path, corrupt: bool) -> io::Result<()> { + let mut file = OpenOptions::new().read(true).write(true).open(path)?; + + file.seek(SeekFrom::Start(QCOW2_INCOMPATIBLE_FEATURES_OFFSET))?; + let mut buf = [0u8; 8]; + file.read_exact(&mut buf)?; + let mut features = u64::from_be_bytes(buf); + + if corrupt { + features |= 0x02; + } else { + features &= !0x02; + } + + file.seek(SeekFrom::Start(QCOW2_INCOMPATIBLE_FEATURES_OFFSET))?; + file.write_all(&features.to_be_bytes())?; + file.sync_all()?; + Ok(()) +} + +fn resolve_disk_path(path_or_image_name: impl AsRef) -> std::path::PathBuf { + if path_or_image_name.as_ref().exists() { + // A full path is provided + path_or_image_name.as_ref().to_path_buf() + } else { + // An image name is provided + let mut workload_path = dirs::home_dir().unwrap(); + workload_path.push("workloads"); + workload_path.as_path().join(path_or_image_name.as_ref()) + } +} + +pub(crate) fn compute_file_checksum(reader: &mut dyn std::io::Read, size: u64) -> u32 { + // Read first 16MB or entire data if smaller + let read_size = cmp::min(size, 16 * 1024 * 1024) as usize; + + let mut buffer = vec![0u8; read_size]; + reader.read_exact(&mut buffer).unwrap(); + + // DJB2 hash + let mut hash: u32 = 5381; + for byte in buffer.iter() { + hash = hash.wrapping_mul(33).wrapping_add(*byte as u32); + } + hash +} + +pub(crate) fn get_reboot_count(guest: &Guest) -> u32 { + guest + .ssh_command("sudo last | grep -c reboot") + .unwrap() + .trim() + .parse::() + 
.unwrap_or_default() +} + +pub(crate) fn enable_guest_watchdog(guest: &Guest, watchdog_sec: u32) { + // Check for PCI device + assert!( + guest + .does_device_vendor_pair_match("0x1063", "0x1af4") + .unwrap_or_default() + ); + + // Enable systemd watchdog + guest + .ssh_command(&format!( + "echo RuntimeWatchdogSec={watchdog_sec}s | sudo tee -a /etc/systemd/system.conf" + )) + .unwrap(); + + guest.ssh_command("sudo systemctl daemon-reexec").unwrap(); +} + +pub(crate) fn make_guest_panic(guest: &Guest) { + // Check for pvpanic device + assert!( + guest + .does_device_vendor_pair_match("0x0011", "0x1b36") + .unwrap_or_default() + ); + + // Trigger guest a panic + guest.ssh_command("screen -dmS reboot sh -c \"sleep 5; echo s | tee /proc/sysrq-trigger; echo c | sudo tee /proc/sysrq-trigger\"").unwrap(); +} diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 3e2a33da70..4b8a6b2e37 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -8,3086 +8,26 @@ // hence have known dead-code. This annotation silences dead-code // related warnings for our quality workflow to pass. 
#![allow(dead_code)] - -use std::collections::HashMap; -use std::ffi::CStr; use std::fs::{File, OpenOptions, copy}; -use std::io::{BufRead, Read, Seek, SeekFrom, Write}; +use std::io::{Read, Seek, Write}; use std::net::TcpListener; use std::os::unix::io::AsRawFd; -use std::path::{Path, PathBuf}; +use std::path::PathBuf; use std::process::{Child, Command, Stdio}; use std::string::String; -use std::sync::mpsc::Receiver; -use std::sync::{Mutex, mpsc}; +use std::sync::Mutex; use std::time::{Duration, SystemTime, UNIX_EPOCH}; -use std::{cmp, fs, io, thread}; +use std::{fs, thread}; use block::ImageType; -use net_util::MacAddr; use test_infra::*; use vmm_sys_util::tempdir::TempDir; use vmm_sys_util::tempfile::TempFile; use wait_timeout::ChildExt; -// This enum exists to make it more convenient to -// implement test for both D-Bus and REST APIs. -enum TargetApi { - // API socket - HttpApi(String), - // well known service name, object path - DBusApi(String, String), -} - -impl TargetApi { - fn new_http_api(tmp_dir: &TempDir) -> Self { - Self::HttpApi(temp_api_path(tmp_dir)) - } - - fn new_dbus_api(tmp_dir: &TempDir) -> Self { - // `tmp_dir` is in the form of "/tmp/chXXXXXX" - // and we take the `chXXXXXX` part as a unique identifier for the guest - let id = tmp_dir.as_path().file_name().unwrap().to_str().unwrap(); - - Self::DBusApi( - format!("org.cloudhypervisor.{id}"), - format!("/org/cloudhypervisor/{id}"), - ) - } - - fn guest_args(&self) -> Vec { - match self { - TargetApi::HttpApi(api_socket) => { - vec![format!("--api-socket={}", api_socket.as_str())] - } - TargetApi::DBusApi(service_name, object_path) => { - vec![ - format!("--dbus-service-name={}", service_name.as_str()), - format!("--dbus-object-path={}", object_path.as_str()), - ] - } - } - } - - fn remote_args(&self) -> Vec { - // `guest_args` and `remote_args` are consistent with each other - self.guest_args() - } - - fn remote_command(&self, command: &str, arg: Option<&str>) -> bool { - let mut cmd = 
Command::new(clh_command("ch-remote")); - cmd.args(self.remote_args()); - cmd.arg(command); - - if let Some(arg) = arg { - cmd.arg(arg); - } - - let output = cmd.output().unwrap(); - if output.status.success() { - true - } else { - eprintln!("Error running ch-remote command: {:?}", &cmd); - let stderr = String::from_utf8_lossy(&output.stderr); - eprintln!("stderr: {stderr}"); - false - } - } -} - -// Start cloud-hypervisor with no VM parameters, only the API server running. -// From the API: Create a VM, boot it and check that it looks as expected. -fn _test_api_create_boot(target_api: &TargetApi, guest: &Guest) { - let mut child = GuestCommand::new(guest) - .args(target_api.guest_args()) - .capture_output() - .spawn() - .unwrap(); - - thread::sleep(std::time::Duration::new(1, 0)); - - // Verify API server is running - assert!(target_api.remote_command("ping", None)); - - // Create the VM first - let request_body = guest.api_create_body(); - - let temp_config_path = guest.tmp_dir.as_path().join("config"); - std::fs::write(&temp_config_path, request_body).unwrap(); - let create_config = temp_config_path.as_os_str().to_str().unwrap(); - - assert!(target_api.remote_command("create", Some(create_config),)); - - // Then boot it - assert!(target_api.remote_command("boot", None)); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - // Check that the VM booted as expected - guest.validate_cpu_count(None); - guest.validate_memory(None); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); -} - -// Start cloud-hypervisor with no VM parameters, only the API server running. 
-// From the API: Create a VM, boot it and check it can be shutdown and then -// booted again -fn _test_api_shutdown(target_api: &TargetApi, guest: &Guest) { - let mut child = GuestCommand::new(guest) - .args(target_api.guest_args()) - .capture_output() - .spawn() - .unwrap(); - - thread::sleep(std::time::Duration::new(1, 0)); - - // Verify API server is running - assert!(target_api.remote_command("ping", None)); - - // Create the VM first - let request_body = guest.api_create_body(); - - let temp_config_path = guest.tmp_dir.as_path().join("config"); - std::fs::write(&temp_config_path, request_body).unwrap(); - let create_config = temp_config_path.as_os_str().to_str().unwrap(); - - let r = std::panic::catch_unwind(|| { - assert!(target_api.remote_command("create", Some(create_config))); - - // Then boot it - assert!(target_api.remote_command("boot", None)); - - guest.wait_vm_boot().unwrap(); - - // Check that the VM booted as expected - guest.validate_cpu_count(None); - guest.validate_memory(None); - - // Sync and shutdown without powering off to prevent filesystem - // corruption. - guest.ssh_command("sync").unwrap(); - guest.ssh_command("sudo shutdown -H now").unwrap(); - - // Wait for the guest to be fully shutdown - thread::sleep(std::time::Duration::new(20, 0)); - - // Then shut it down - assert!(target_api.remote_command("shutdown", None)); - - // Then boot it again - assert!(target_api.remote_command("boot", None)); - - guest.wait_vm_boot().unwrap(); - - // Check that the VM booted as expected - guest.validate_cpu_count(None); - guest.validate_memory(None); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); -} - -// Start cloud-hypervisor with no VM parameters, only the API server running. -// From the API: Create a VM, boot it and check it can be deleted and then recreated -// booted again. 
-fn _test_api_delete(target_api: &TargetApi, guest: &Guest) { - let mut child = GuestCommand::new(guest) - .args(target_api.guest_args()) - .capture_output() - .spawn() - .unwrap(); - - thread::sleep(std::time::Duration::new(1, 0)); - - // Verify API server is running - assert!(target_api.remote_command("ping", None)); - - // Create the VM first - let request_body = guest.api_create_body(); - - let temp_config_path = guest.tmp_dir.as_path().join("config"); - std::fs::write(&temp_config_path, request_body).unwrap(); - let create_config = temp_config_path.as_os_str().to_str().unwrap(); - - let r = std::panic::catch_unwind(|| { - assert!(target_api.remote_command("create", Some(create_config))); - - // Then boot it - assert!(target_api.remote_command("boot", None)); - - guest.wait_vm_boot().unwrap(); - - // Check that the VM booted as expected - guest.validate_cpu_count(None); - guest.validate_memory(None); - - // Sync and shutdown without powering off to prevent filesystem - // corruption. - guest.ssh_command("sync").unwrap(); - guest.ssh_command("sudo shutdown -H now").unwrap(); - - // Wait for the guest to be fully shutdown - thread::sleep(std::time::Duration::new(20, 0)); - - // Then delete it - assert!(target_api.remote_command("delete", None)); - - assert!(target_api.remote_command("create", Some(create_config))); - - // Then boot it again - assert!(target_api.remote_command("boot", None)); - - guest.wait_vm_boot().unwrap(); - - // Check that the VM booted as expected - guest.validate_cpu_count(None); - guest.validate_memory(None); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); -} - -// Start cloud-hypervisor with no VM parameters, only the API server running. -// From the API: Create a VM, boot it and check that it looks as expected. -// Then we pause the VM, check that it's no longer available. -// Finally we resume the VM and check that it's available. 
-fn _test_api_pause_resume(target_api: &TargetApi, guest: &Guest) { - let mut child = GuestCommand::new(guest) - .args(target_api.guest_args()) - .capture_output() - .spawn() - .unwrap(); - - thread::sleep(std::time::Duration::new(1, 0)); - - // Verify API server is running - assert!(target_api.remote_command("ping", None)); - - // Create the VM first - let request_body = guest.api_create_body(); - - let temp_config_path = guest.tmp_dir.as_path().join("config"); - std::fs::write(&temp_config_path, request_body).unwrap(); - let create_config = temp_config_path.as_os_str().to_str().unwrap(); - - assert!(target_api.remote_command("create", Some(create_config))); - - // Then boot it - assert!(target_api.remote_command("boot", None)); - thread::sleep(std::time::Duration::new(20, 0)); - - let r = std::panic::catch_unwind(|| { - // Check that the VM booted as expected - guest.validate_cpu_count(None); - guest.validate_memory(None); - - // We now pause the VM - assert!(target_api.remote_command("pause", None)); - - // Check pausing again fails - assert!(!target_api.remote_command("pause", None)); - - thread::sleep(std::time::Duration::new(2, 0)); - - // SSH into the VM should fail - ssh_command_ip( - "grep -c processor /proc/cpuinfo", - &guest.network.guest_ip0, - 2, - 5, - ) - .unwrap_err(); - - // Resume the VM - assert!(target_api.remote_command("resume", None)); - - // Check resuming again fails - assert!(!target_api.remote_command("resume", None)); - - thread::sleep(std::time::Duration::new(2, 0)); - - // Now we should be able to SSH back in and get the right number of CPUs - guest.validate_cpu_count(None); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); -} - -fn _test_pty_interaction(pty_path: PathBuf) { - let mut cf = std::fs::OpenOptions::new() - .write(true) - .read(true) - .open(pty_path) - .unwrap(); - - // Some dumb sleeps but we don't want to write - // before the console is up and we 
don't want - // to try and write the next line before the - // login process is ready. - thread::sleep(std::time::Duration::new(5, 0)); - assert_eq!(cf.write(b"cloud\n").unwrap(), 6); - thread::sleep(std::time::Duration::new(2, 0)); - assert_eq!(cf.write(b"cloud123\n").unwrap(), 9); - thread::sleep(std::time::Duration::new(2, 0)); - assert_eq!(cf.write(b"echo test_pty_console\n").unwrap(), 22); - thread::sleep(std::time::Duration::new(2, 0)); - - // read pty and ensure they have a login shell - // some fairly hacky workarounds to avoid looping - // forever in case the channel is blocked getting output - let ptyc = pty_read(cf); - let mut empty = 0; - let mut prev = String::new(); - loop { - thread::sleep(std::time::Duration::new(2, 0)); - match ptyc.try_recv() { - Ok(line) => { - empty = 0; - prev = prev + &line; - if prev.contains("test_pty_console") { - break; - } - } - Err(mpsc::TryRecvError::Empty) => { - empty += 1; - assert!(empty <= 5, "No login on pty"); - } - _ => { - panic!("No login on pty") - } - } - } -} - -fn prepare_virtiofsd(tmp_dir: &TempDir, shared_dir: &str) -> (std::process::Child, String) { - let mut workload_path = dirs::home_dir().unwrap(); - workload_path.push("workloads"); - - let mut virtiofsd_path = workload_path; - virtiofsd_path.push("virtiofsd"); - let virtiofsd_path = String::from(virtiofsd_path.to_str().unwrap()); - - let virtiofsd_socket_path = - String::from(tmp_dir.as_path().join("virtiofs.sock").to_str().unwrap()); - - // Start the daemon - let child = Command::new(virtiofsd_path.as_str()) - .args(["--shared-dir", shared_dir]) - .args(["--socket-path", virtiofsd_socket_path.as_str()]) - .args(["--cache", "never"]) - .args(["--tag", "myfs"]) - .spawn() - .unwrap(); - - thread::sleep(std::time::Duration::new(10, 0)); - - (child, virtiofsd_socket_path) -} - -fn prepare_vubd( - tmp_dir: &TempDir, - blk_img: &str, - num_queues: usize, - rdonly: bool, - direct: bool, -) -> (std::process::Child, String) { - let mut workload_path = 
dirs::home_dir().unwrap(); - workload_path.push("workloads"); - - let mut blk_file_path = workload_path; - blk_file_path.push(blk_img); - let blk_file_path = String::from(blk_file_path.to_str().unwrap()); - - let vubd_socket_path = String::from(tmp_dir.as_path().join("vub.sock").to_str().unwrap()); - - // Start the daemon - let child = Command::new(clh_command("vhost_user_block")) - .args([ - "--block-backend", - format!( - "path={blk_file_path},socket={vubd_socket_path},num_queues={num_queues},readonly={rdonly},direct={direct}" - ) - .as_str(), - ]) - .spawn() - .unwrap(); - - thread::sleep(std::time::Duration::new(10, 0)); - - (child, vubd_socket_path) -} - -fn temp_vsock_path(tmp_dir: &TempDir) -> String { - String::from(tmp_dir.as_path().join("vsock").to_str().unwrap()) -} - -fn temp_api_path(tmp_dir: &TempDir) -> String { - String::from( - tmp_dir - .as_path() - .join("cloud-hypervisor.sock") - .to_str() - .unwrap(), - ) -} - -fn temp_event_monitor_path(tmp_dir: &TempDir) -> String { - String::from(tmp_dir.as_path().join("event.json").to_str().unwrap()) -} - -// Creates the directory and returns the path. 
-fn temp_snapshot_dir_path(tmp_dir: &TempDir) -> String { - let snapshot_dir = String::from(tmp_dir.as_path().join("snapshot").to_str().unwrap()); - std::fs::create_dir(&snapshot_dir).unwrap(); - snapshot_dir -} - -fn temp_vmcore_file_path(tmp_dir: &TempDir) -> String { - String::from(tmp_dir.as_path().join("vmcore").to_str().unwrap()) -} - -fn cloud_hypervisor_release_path() -> String { - let mut workload_path = dirs::home_dir().unwrap(); - workload_path.push("workloads"); - - let mut ch_release_path = workload_path; - #[cfg(target_arch = "x86_64")] - ch_release_path.push("cloud-hypervisor-static"); - #[cfg(target_arch = "aarch64")] - ch_release_path.push("cloud-hypervisor-static-aarch64"); - - ch_release_path.into_os_string().into_string().unwrap() -} - -fn prepare_vhost_user_net_daemon( - tmp_dir: &TempDir, - ip: &str, - tap: Option<&str>, - mtu: Option, - num_queues: usize, - client_mode: bool, -) -> (std::process::Command, String) { - let vunet_socket_path = String::from(tmp_dir.as_path().join("vunet.sock").to_str().unwrap()); - - // Start the daemon - let mut net_params = format!( - "ip={ip},mask=255.255.255.128,socket={vunet_socket_path},num_queues={num_queues},queue_size=1024,client={client_mode}" - ); - - if let Some(tap) = tap { - net_params.push_str(format!(",tap={tap}").as_str()); - } - - if let Some(mtu) = mtu { - net_params.push_str(format!(",mtu={mtu}").as_str()); - } - - let mut command = Command::new(clh_command("vhost_user_net")); - command.args(["--net-backend", net_params.as_str()]); - - (command, vunet_socket_path) -} - -fn prepare_swtpm_daemon(tmp_dir: &TempDir) -> (std::process::Command, String) { - let swtpm_tpm_dir = String::from(tmp_dir.as_path().join("swtpm").to_str().unwrap()); - let swtpm_socket_path = String::from( - tmp_dir - .as_path() - .join("swtpm") - .join("swtpm.sock") - .to_str() - .unwrap(), - ); - std::fs::create_dir(&swtpm_tpm_dir).unwrap(); - - let mut swtpm_command = Command::new("swtpm"); - let swtpm_args = [ - "socket", 
- "--tpmstate", - &format!("dir={swtpm_tpm_dir}"), - "--ctrl", - &format!("type=unixio,path={swtpm_socket_path}"), - "--flags", - "startup-clear", - "--tpm2", - ]; - swtpm_command.args(swtpm_args); - - (swtpm_command, swtpm_socket_path) -} - -fn remote_command(api_socket: &str, command: &str, arg: Option<&str>) -> bool { - let mut cmd = Command::new(clh_command("ch-remote")); - cmd.args([&format!("--api-socket={api_socket}"), command]); - - if let Some(arg) = arg { - cmd.arg(arg); - } - let output = cmd.output().unwrap(); - if output.status.success() { - true - } else { - eprintln!("Error running ch-remote command: {:?}", &cmd); - let stderr = String::from_utf8_lossy(&output.stderr); - eprintln!("stderr: {stderr}"); - false - } -} - -fn remote_command_w_output(api_socket: &str, command: &str, arg: Option<&str>) -> (bool, Vec) { - let mut cmd = Command::new(clh_command("ch-remote")); - cmd.args([&format!("--api-socket={api_socket}"), command]); - - if let Some(arg) = arg { - cmd.arg(arg); - } - - let output = cmd.output().expect("Failed to launch ch-remote"); - - (output.status.success(), output.stdout) -} - -fn resize_command( - api_socket: &str, - desired_vcpus: Option, - desired_ram: Option, - desired_balloon: Option, - event_file: Option<&str>, -) -> bool { - let mut cmd = Command::new(clh_command("ch-remote")); - cmd.args([&format!("--api-socket={api_socket}"), "resize"]); - - if let Some(desired_vcpus) = desired_vcpus { - cmd.arg(format!("--cpus={desired_vcpus}")); - } - - if let Some(desired_ram) = desired_ram { - cmd.arg(format!("--memory={desired_ram}")); - } - - if let Some(desired_balloon) = desired_balloon { - cmd.arg(format!("--balloon={desired_balloon}")); - } - - let ret = cmd.status().expect("Failed to launch ch-remote").success(); - - if let Some(event_path) = event_file { - let latest_events = [ - &MetaEvent { - event: "resizing".to_string(), - device_id: None, - }, - &MetaEvent { - event: "resized".to_string(), - device_id: None, - }, - ]; - // 
See: #5938 - thread::sleep(std::time::Duration::new(1, 0)); - assert!(check_latest_events_exact(&latest_events, event_path)); - } - - ret -} - -fn resize_zone_command(api_socket: &str, id: &str, desired_size: &str) -> bool { - let mut cmd = Command::new(clh_command("ch-remote")); - cmd.args([ - &format!("--api-socket={api_socket}"), - "resize-zone", - &format!("--id={id}"), - &format!("--size={desired_size}"), - ]); - - cmd.status().expect("Failed to launch ch-remote").success() -} - -fn resize_disk_command(api_socket: &str, id: &str, desired_size: &str) -> bool { - let mut cmd = Command::new(clh_command("ch-remote")); - cmd.args([ - &format!("--api-socket={api_socket}"), - "resize-disk", - &format!("--disk={id}"), - &format!("--size={desired_size}"), - ]); - - cmd.status().expect("Failed to launch ch-remote").success() -} - -// setup OVS-DPDK bridge and ports -fn setup_ovs_dpdk() { - // setup OVS-DPDK - assert!(exec_host_command_status("service openvswitch-switch start").success()); - assert!(exec_host_command_status("ovs-vsctl init").success()); - assert!( - exec_host_command_status("ovs-vsctl set Open_vSwitch . 
other_config:dpdk-init=true") - .success() - ); - assert!(exec_host_command_status("service openvswitch-switch restart").success()); - - // Create OVS-DPDK bridge and ports - assert!( - exec_host_command_status( - "ovs-vsctl add-br ovsbr0 -- set bridge ovsbr0 datapath_type=netdev", - ) - .success() - ); - assert!(exec_host_command_status("ovs-vsctl add-port ovsbr0 vhost-user1 -- set Interface vhost-user1 type=dpdkvhostuserclient options:vhost-server-path=/tmp/dpdkvhostclient1").success()); - assert!(exec_host_command_status("ovs-vsctl add-port ovsbr0 vhost-user2 -- set Interface vhost-user2 type=dpdkvhostuserclient options:vhost-server-path=/tmp/dpdkvhostclient2").success()); - assert!(exec_host_command_status("ip link set up dev ovsbr0").success()); - assert!(exec_host_command_status("service openvswitch-switch restart").success()); -} -fn cleanup_ovs_dpdk() { - assert!(exec_host_command_status("ovs-vsctl del-br ovsbr0").success()); - exec_host_command_status("rm -f ovs-vsctl /tmp/dpdkvhostclient1 /tmp/dpdkvhostclient2"); -} -// Setup two guests and ensure they are connected through ovs-dpdk -fn setup_ovs_dpdk_guests( - guest1: &Guest, - guest2: &Guest, - api_socket: &str, - release_binary: bool, -) -> (Child, Child) { - setup_ovs_dpdk(); - - let clh_path = if release_binary { - cloud_hypervisor_release_path() - } else { - clh_command("cloud-hypervisor") - }; - - let mut child1 = GuestCommand::new_with_binary_path(guest1, &clh_path) - .args(["--cpus", "boot=2"]) - .args(["--memory", "size=0,shared=on"]) - .args(["--memory-zone", "id=mem0,size=1G,shared=on,host_numa_node=0"]) - .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .args(["--net", guest1.default_net_string().as_str(), "vhost_user=true,socket=/tmp/dpdkvhostclient1,num_queues=2,queue_size=256,vhost_mode=server"]) - .capture_output() - .spawn() - .unwrap(); - - #[cfg(target_arch = "x86_64")] - let guest_net_iface = 
"ens5"; - #[cfg(target_arch = "aarch64")] - let guest_net_iface = "enp0s5"; - - let r = std::panic::catch_unwind(|| { - guest1.wait_vm_boot().unwrap(); - - guest1 - .ssh_command(&format!( - "sudo ip addr add 172.100.0.1/24 dev {guest_net_iface}" - )) - .unwrap(); - guest1 - .ssh_command(&format!("sudo ip link set up dev {guest_net_iface}")) - .unwrap(); - - let guest_ip = guest1.network.guest_ip0.clone(); - thread::spawn(move || { - ssh_command_ip( - "nc -l 12345", - &guest_ip, - DEFAULT_SSH_RETRIES, - DEFAULT_SSH_TIMEOUT, - ) - .unwrap(); - }); - }); - if r.is_err() { - cleanup_ovs_dpdk(); - - let _ = child1.kill(); - let output = child1.wait_with_output().unwrap(); - handle_child_output(r, &output); - panic!("Test should already be failed/panicked"); // To explicitly mark this block never return - } - - let mut child2 = GuestCommand::new_with_binary_path(guest2, &clh_path) - .args(["--api-socket", api_socket]) - .args(["--cpus", "boot=2"]) - .args(["--memory", "size=0,shared=on"]) - .args(["--memory-zone", "id=mem0,size=1G,shared=on,host_numa_node=0"]) - .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .args(["--net", guest2.default_net_string().as_str(), "vhost_user=true,socket=/tmp/dpdkvhostclient2,num_queues=2,queue_size=256,vhost_mode=server"]) - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest2.wait_vm_boot().unwrap(); - - guest2 - .ssh_command(&format!( - "sudo ip addr add 172.100.0.2/24 dev {guest_net_iface}" - )) - .unwrap(); - guest2 - .ssh_command(&format!("sudo ip link set up dev {guest_net_iface}")) - .unwrap(); - - // Check the connection works properly between the two VMs - guest2.ssh_command("nc -vz 172.100.0.1 12345").unwrap(); - }); - if r.is_err() { - cleanup_ovs_dpdk(); - - let _ = child1.kill(); - let _ = child2.kill(); - let output = child2.wait_with_output().unwrap(); - handle_child_output(r, &output); - 
panic!("Test should already be failed/panicked"); // To explicitly mark this block never return - } - - (child1, child2) -} - -enum FwType { - Ovmf, - RustHypervisorFirmware, -} - -fn fw_path(_fw_type: FwType) -> String { - let mut workload_path = dirs::home_dir().unwrap(); - workload_path.push("workloads"); - - let mut fw_path = workload_path; - #[cfg(target_arch = "aarch64")] - fw_path.push("CLOUDHV_EFI.fd"); - #[cfg(target_arch = "x86_64")] - { - match _fw_type { - FwType::Ovmf => fw_path.push(OVMF_NAME), - FwType::RustHypervisorFirmware => fw_path.push("hypervisor-fw"), - } - } - - fw_path.to_str().unwrap().to_string() -} - -// Parse the event_monitor file based on the format that each event -// is followed by a double newline -fn parse_event_file(event_file: &str) -> Vec { - let content = fs::read(event_file).unwrap(); - let mut ret = Vec::new(); - for entry in String::from_utf8_lossy(&content) - .trim() - .split("\n\n") - .collect::>() - { - ret.push(serde_json::from_str(entry).unwrap()); - } - - ret -} - -// Return true if all events from the input 'expected_events' are matched sequentially -// with events from the 'event_file' -fn check_sequential_events(expected_events: &[&MetaEvent], event_file: &str) -> bool { - let json_events = parse_event_file(event_file); - let len = expected_events.len(); - let mut idx = 0; - for e in &json_events { - if idx == len { - break; - } - if expected_events[idx].match_with_json_event(e) { - idx += 1; - } - } - - let ret = idx == len; - - if !ret { - eprintln!( - "\n\n==== Start 'check_sequential_events' failed ==== \ - \n\nexpected_events={expected_events:?}\nactual_events={json_events:?} \ - \n\n==== End 'check_sequential_events' failed ====", - ); - } - - ret -} - -// Return true if all events from the input 'expected_events' are matched exactly -// with events from the 'event_file' -fn check_sequential_events_exact(expected_events: &[&MetaEvent], event_file: &str) -> bool { - let json_events = 
parse_event_file(event_file); - assert!(expected_events.len() <= json_events.len()); - let json_events = &json_events[..expected_events.len()]; - - for (idx, e) in json_events.iter().enumerate() { - if !expected_events[idx].match_with_json_event(e) { - eprintln!( - "\n\n==== Start 'check_sequential_events_exact' failed ==== \ - \n\nexpected_events={expected_events:?}\nactual_events={json_events:?} \ - \n\n==== End 'check_sequential_events_exact' failed ====", - ); - - return false; - } - } - - true -} - -// Return true if events from the input 'latest_events' are matched exactly -// with the most recent events from the 'event_file' -fn check_latest_events_exact(latest_events: &[&MetaEvent], event_file: &str) -> bool { - let json_events = parse_event_file(event_file); - assert!(latest_events.len() <= json_events.len()); - let json_events = &json_events[(json_events.len() - latest_events.len())..]; - - for (idx, e) in json_events.iter().enumerate() { - if !latest_events[idx].match_with_json_event(e) { - eprintln!( - "\n\n==== Start 'check_latest_events_exact' failed ==== \ - \n\nexpected_events={latest_events:?}\nactual_events={json_events:?} \ - \n\n==== End 'check_latest_events_exact' failed ====", - ); - - return false; - } - } - - true -} - -fn test_cpu_topology(threads_per_core: u8, cores_per_package: u8, packages: u8, use_fw: bool) { - let disk_config = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - let total_vcpus = threads_per_core * cores_per_package * packages; - let direct_kernel_boot_path = direct_kernel_boot_path(); - let mut kernel_path = direct_kernel_boot_path.to_str().unwrap(); - let fw_path = fw_path(FwType::RustHypervisorFirmware); - if use_fw { - kernel_path = fw_path.as_str(); - } - - let mut child = GuestCommand::new(&guest) - .args([ - "--cpus", - &format!( - "boot={total_vcpus},topology={threads_per_core}:{cores_per_package}:1:{packages}" - ), - ]) - .default_memory() - 
.args(["--kernel", kernel_path]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .default_net() - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - assert_eq!( - guest.get_cpu_count().unwrap_or_default(), - u32::from(total_vcpus) - ); - assert_eq!( - guest - .ssh_command("lscpu | grep \"per core\" | cut -f 2 -d \":\" | sed \"s# *##\"") - .unwrap() - .trim() - .parse::() - .unwrap_or(0), - threads_per_core - ); - - assert_eq!( - guest - .ssh_command("lscpu | grep \"per socket\" | cut -f 2 -d \":\" | sed \"s# *##\"") - .unwrap() - .trim() - .parse::() - .unwrap_or(0), - cores_per_package - ); - - assert_eq!( - guest - .ssh_command("lscpu | grep \"Socket\" | cut -f 2 -d \":\" | sed \"s# *##\"") - .unwrap() - .trim() - .parse::() - .unwrap_or(0), - packages - ); - - #[cfg(target_arch = "x86_64")] - { - let mut cpu_id = 0; - for package_id in 0..packages { - for core_id in 0..cores_per_package { - for _ in 0..threads_per_core { - assert_eq!( - guest - .ssh_command(&format!("cat /sys/devices/system/cpu/cpu{cpu_id}/topology/physical_package_id")) - .unwrap() - .trim() - .parse::() - .unwrap_or(0), - package_id - ); - - assert_eq!( - guest - .ssh_command(&format!( - "cat /sys/devices/system/cpu/cpu{cpu_id}/topology/core_id" - )) - .unwrap() - .trim() - .parse::() - .unwrap_or(0), - core_id - ); - - cpu_id += 1; - } - } - } - } - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); -} - -#[allow(unused_variables)] -fn _test_guest_numa_nodes(acpi: bool) { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - let api_socket = temp_api_path(&guest.tmp_dir); - #[cfg(target_arch = "x86_64")] - let kernel_path = direct_kernel_boot_path(); - #[cfg(target_arch = "aarch64")] - let kernel_path = if acpi { - edk2_path() - } else { - direct_kernel_boot_path() - 
}; - - let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=6,max=12"]) - .args(["--memory", "size=0,hotplug_method=virtio-mem"]) - .args([ - "--memory-zone", - "id=mem0,size=1G,hotplug_size=3G", - "id=mem1,size=2G,hotplug_size=3G", - "id=mem2,size=3G,hotplug_size=3G", - ]) - .args([ - "--numa", - "guest_numa_id=0,cpus=[0-2,9],distances=[1@15,2@20],memory_zones=mem0", - "guest_numa_id=1,cpus=[3-4,6-8],distances=[0@20,2@25],memory_zones=mem1", - "guest_numa_id=2,cpus=[5,10-11],distances=[0@25,1@30],memory_zones=mem2", - ]) - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .args(["--api-socket", &api_socket]) - .capture_output() - .default_disks() - .default_net() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - guest.check_numa_common( - Some(&[960_000, 1_920_000, 2_880_000]), - Some(&[&[0, 1, 2], &[3, 4], &[5]]), - Some(&["10 15 20", "20 10 25", "25 30 10"]), - ); - - // AArch64 currently does not support hotplug, and therefore we only - // test hotplug-related function on x86_64 here. - #[cfg(target_arch = "x86_64")] - { - guest.enable_memory_hotplug(); - - // Resize every memory zone and check each associated NUMA node - // has been assigned the right amount of memory. - resize_zone_command(&api_socket, "mem0", "4G"); - resize_zone_command(&api_socket, "mem1", "4G"); - resize_zone_command(&api_socket, "mem2", "4G"); - // Resize to the maximum amount of CPUs and check each NUMA - // node has been assigned the right CPUs set. 
- resize_command(&api_socket, Some(12), None, None, None); - thread::sleep(std::time::Duration::new(5, 0)); - - guest.check_numa_common( - Some(&[3_840_000, 3_840_000, 3_840_000]), - Some(&[&[0, 1, 2, 9], &[3, 4, 6, 7, 8], &[5, 10, 11]]), - None, - ); - } - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); -} - -#[allow(unused_variables)] -fn _test_power_button(guest: &Guest) { - let mut cmd = GuestCommand::new(guest); - let api_socket = temp_api_path(&guest.tmp_dir); - - cmd.default_cpus() - .default_memory() - .default_kernel_cmdline() - .capture_output() - .default_disks() - .default_net() - .args(["--api-socket", &api_socket]); - - let child = cmd.spawn().unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - assert!(remote_command(&api_socket, "power-button", None)); - }); - - let output = child.wait_with_output().unwrap(); - assert!(output.status.success()); - handle_child_output(r, &output); -} - -fn get_msi_interrupt_pattern() -> String { - #[cfg(target_arch = "x86_64")] - { - "PCI-MSI".to_string() - } - #[cfg(target_arch = "aarch64")] - { - if cfg!(feature = "mshv") { - "GICv2m-PCI-MSIX".to_string() - } else { - "ITS-PCI-MSIX".to_string() - } - } -} - -type PrepareNetDaemon = dyn Fn( - &TempDir, - &str, - Option<&str>, - Option, - usize, - bool, -) -> (std::process::Command, String); - -fn test_vhost_user_net( - tap: Option<&str>, - num_queues: usize, - prepare_daemon: &PrepareNetDaemon, - generate_host_mac: bool, - client_mode_daemon: bool, -) { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - let api_socket = temp_api_path(&guest.tmp_dir); - - let kernel_path = direct_kernel_boot_path(); - - let host_mac = if generate_host_mac { - Some(MacAddr::local_random()) - } else { - None - }; - - let mtu = Some(3000); - - let (mut daemon_command, vunet_socket_path) = prepare_daemon( - 
&guest.tmp_dir, - &guest.network.host_ip0, - tap, - mtu, - num_queues, - client_mode_daemon, - ); - - let net_params = format!( - "vhost_user=true,mac={},socket={},num_queues={},queue_size=1024{},vhost_mode={},mtu=3000", - guest.network.guest_mac0, - vunet_socket_path, - num_queues, - if let Some(host_mac) = host_mac { - format!(",host_mac={host_mac}") - } else { - String::new() - }, - if client_mode_daemon { - "server" - } else { - "client" - }, - ); - - let mut ch_command = GuestCommand::new(&guest); - ch_command - .args(["--cpus", format!("boot={}", num_queues / 2).as_str()]) - .args(["--memory", "size=512M,hotplug_size=2048M,shared=on"]) - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .args(["--net", net_params.as_str()]) - .args(["--api-socket", &api_socket]) - .capture_output(); - - let mut daemon_child: std::process::Child; - let mut child: std::process::Child; - - if client_mode_daemon { - child = ch_command.spawn().unwrap(); - // Make sure the VMM is waiting for the backend to connect - thread::sleep(std::time::Duration::new(10, 0)); - daemon_child = daemon_command.spawn().unwrap(); - } else { - daemon_child = daemon_command.spawn().unwrap(); - // Make sure the backend is waiting for the VMM to connect - thread::sleep(std::time::Duration::new(10, 0)); - child = ch_command.spawn().unwrap(); - } - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - if let Some(tap_name) = tap { - let tap_count = exec_host_command_output(&format!("ip link | grep -c {tap_name}")); - assert_eq!(String::from_utf8_lossy(&tap_count.stdout).trim(), "1"); - } - - if let Some(host_mac) = tap { - let mac_count = exec_host_command_output(&format!("ip link | grep -c {host_mac}")); - assert_eq!(String::from_utf8_lossy(&mac_count.stdout).trim(), "1"); - } - - #[cfg(target_arch = "aarch64")] - let iface = "enp0s4"; - #[cfg(target_arch = "x86_64")] - let iface = "ens4"; - - assert_eq!( - 
guest - .ssh_command(format!("cat /sys/class/net/{iface}/mtu").as_str()) - .unwrap() - .trim(), - "3000" - ); - - // 1 network interface + default localhost ==> 2 interfaces - // It's important to note that this test is fully exercising the - // vhost-user-net implementation and the associated backend since - // it does not define any --net network interface. That means all - // the ssh communication in that test happens through the network - // interface backed by vhost-user-net. - assert_eq!( - guest - .ssh_command("ip -o link | wc -l") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 2 - ); - - // The following pci devices will appear on guest with PCI-MSI - // interrupt vectors assigned. - // 1 virtio-console with 3 vectors: config, Rx, Tx - // 1 virtio-blk with 2 vectors: config, Request - // 1 virtio-blk with 2 vectors: config, Request - // 1 virtio-rng with 2 vectors: config, Request - // Since virtio-net has 2 queue pairs, its vectors is as follows: - // 1 virtio-net with 5 vectors: config, Rx (2), Tx (2) - // Based on the above, the total vectors should 14. - let grep_cmd = format!("grep -c {} /proc/interrupts", get_msi_interrupt_pattern()); - - assert_eq!( - guest - .ssh_command(&grep_cmd) - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 10 + (num_queues as u32) - ); - - // ACPI feature is needed. - #[cfg(target_arch = "x86_64")] - { - guest.enable_memory_hotplug(); - - // Add RAM to the VM - let desired_ram = 1024 << 20; - resize_command(&api_socket, None, Some(desired_ram), None, None); - - thread::sleep(std::time::Duration::new(10, 0)); - - // Here by simply checking the size (through ssh), we validate - // the connection is still working, which means vhost-user-net - // keeps working after the resize. 
- assert!(guest.get_total_memory().unwrap_or_default() > 960_000); - } - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - thread::sleep(std::time::Duration::new(5, 0)); - let _ = daemon_child.kill(); - let _ = daemon_child.wait(); - - handle_child_output(r, &output); -} - -type PrepareBlkDaemon = dyn Fn(&TempDir, &str, usize, bool, bool) -> (std::process::Child, String); - -fn test_vhost_user_blk( - num_queues: usize, - readonly: bool, - direct: bool, - prepare_vhost_user_blk_daemon: Option<&PrepareBlkDaemon>, -) { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - let api_socket = temp_api_path(&guest.tmp_dir); - - let kernel_path = direct_kernel_boot_path(); - - let (blk_params, daemon_child) = { - let prepare_daemon = prepare_vhost_user_blk_daemon.unwrap(); - // Start the daemon - let (daemon_child, vubd_socket_path) = - prepare_daemon(&guest.tmp_dir, "blk.img", num_queues, readonly, direct); - - ( - format!( - "vhost_user=true,socket={vubd_socket_path},num_queues={num_queues},queue_size=128", - ), - Some(daemon_child), - ) - }; - - let mut child = GuestCommand::new(&guest) - .args(["--cpus", format!("boot={num_queues}").as_str()]) - .args(["--memory", "size=512M,hotplug_size=2048M,shared=on"]) - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .args([ - "--disk", - format!( - "path={}", - guest.disk_config.disk(DiskType::OperatingSystem).unwrap() - ) - .as_str(), - format!( - "path={}", - guest.disk_config.disk(DiskType::CloudInit).unwrap() - ) - .as_str(), - blk_params.as_str(), - ]) - .default_net() - .args(["--api-socket", &api_socket]) - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - // Check both if /dev/vdc exists and if the block size is 16M. 
- assert_eq!( - guest - .ssh_command("lsblk | grep vdc | grep -c 16M") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 1 - ); - - // Check if this block is RO or RW. - assert_eq!( - guest - .ssh_command("lsblk | grep vdc | awk '{print $5}'") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - readonly as u32 - ); - - // Check if the number of queues in /sys/block/vdc/mq matches the - // expected num_queues. - assert_eq!( - guest - .ssh_command("ls -ll /sys/block/vdc/mq | grep ^d | wc -l") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - num_queues as u32 - ); - - // Mount the device - let mount_ro_rw_flag = if readonly { "ro,noload" } else { "rw" }; - guest.ssh_command("mkdir mount_image").unwrap(); - guest - .ssh_command( - format!("sudo mount -o {mount_ro_rw_flag} -t ext4 /dev/vdc mount_image/").as_str(), - ) - .unwrap(); - - // Check the content of the block device. The file "foo" should - // contain "bar". - assert_eq!( - guest.ssh_command("cat mount_image/foo").unwrap().trim(), - "bar" - ); - - // ACPI feature is needed. - #[cfg(target_arch = "x86_64")] - { - guest.enable_memory_hotplug(); - - // Add RAM to the VM - let desired_ram = 1024 << 20; - resize_command(&api_socket, None, Some(desired_ram), None, None); - - thread::sleep(std::time::Duration::new(10, 0)); - - assert!(guest.get_total_memory().unwrap_or_default() > 960_000); - - // Check again the content of the block device after the resize - // has been performed. 
- assert_eq!( - guest.ssh_command("cat mount_image/foo").unwrap().trim(), - "bar" - ); - } - - // Unmount the device - guest.ssh_command("sudo umount /dev/vdc").unwrap(); - guest.ssh_command("rm -r mount_image").unwrap(); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - if let Some(mut daemon_child) = daemon_child { - thread::sleep(std::time::Duration::new(5, 0)); - let _ = daemon_child.kill(); - let _ = daemon_child.wait(); - } - - handle_child_output(r, &output); -} - -fn test_boot_from_vhost_user_blk( - num_queues: usize, - readonly: bool, - direct: bool, - prepare_vhost_user_blk_daemon: Option<&PrepareBlkDaemon>, -) { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - - let kernel_path = direct_kernel_boot_path(); - - let disk_path = guest.disk_config.disk(DiskType::OperatingSystem).unwrap(); - - let (blk_boot_params, daemon_child) = { - let prepare_daemon = prepare_vhost_user_blk_daemon.unwrap(); - // Start the daemon - let (daemon_child, vubd_socket_path) = prepare_daemon( - &guest.tmp_dir, - disk_path.as_str(), - num_queues, - readonly, - direct, - ); - - ( - format!( - "vhost_user=true,socket={vubd_socket_path},num_queues={num_queues},queue_size=128", - ), - Some(daemon_child), - ) - }; - - let mut child = GuestCommand::new(&guest) - .args(["--cpus", format!("boot={num_queues}").as_str()]) - .args(["--memory", "size=512M,shared=on"]) - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .args([ - "--disk", - blk_boot_params.as_str(), - format!( - "path={}", - guest.disk_config.disk(DiskType::CloudInit).unwrap() - ) - .as_str(), - ]) - .default_net() - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - // Just check the VM booted correctly. 
- assert_eq!(guest.get_cpu_count().unwrap_or_default(), num_queues as u32); - assert!(guest.get_total_memory().unwrap_or_default() > 480_000); - }); - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - if let Some(mut daemon_child) = daemon_child { - thread::sleep(std::time::Duration::new(5, 0)); - let _ = daemon_child.kill(); - let _ = daemon_child.wait(); - } - - handle_child_output(r, &output); -} - -fn _test_virtio_fs( - prepare_daemon: &dyn Fn(&TempDir, &str) -> (std::process::Child, String), - hotplug: bool, - use_generic_vhost_user: bool, - pci_segment: Option, -) { - #[cfg(target_arch = "aarch64")] - let focal_image = if hotplug { - FOCAL_IMAGE_UPDATE_KERNEL_NAME.to_string() - } else { - FOCAL_IMAGE_NAME.to_string() - }; - #[cfg(target_arch = "x86_64")] - let focal_image = FOCAL_IMAGE_NAME.to_string(); - let disk_config = UbuntuDiskConfig::new(focal_image); - let guest = Guest::new(Box::new(disk_config)); - let api_socket = temp_api_path(&guest.tmp_dir); - - let mut workload_path = dirs::home_dir().unwrap(); - workload_path.push("workloads"); - - let mut shared_dir = workload_path; - shared_dir.push("shared_dir"); - - #[cfg(target_arch = "x86_64")] - let kernel_path = direct_kernel_boot_path(); - #[cfg(target_arch = "aarch64")] - let kernel_path = if hotplug { - edk2_path() - } else { - direct_kernel_boot_path() - }; - - let (mut daemon_child, virtiofsd_socket_path) = - prepare_daemon(&guest.tmp_dir, shared_dir.to_str().unwrap()); - - let mut guest_command = GuestCommand::new(&guest); - guest_command - .default_cpus() - .args(["--memory", "size=512M,hotplug_size=2048M,shared=on"]) - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .default_net() - .args(["--api-socket", &api_socket]); - if pci_segment.is_some() { - guest_command.args([ - "--platform", - &format!("num_pci_segments={MAX_NUM_PCI_SEGMENTS}"), - ]); - } - - let fs_params = format!( - 
"socket={},id=myfs0,{}{}", - virtiofsd_socket_path, - if use_generic_vhost_user { - "queue_sizes=[1024,1024],virtio_id=26" - } else { - "tag=myfs,num_queues=1,queue_size=1024" - }, - if let Some(pci_segment) = pci_segment { - format!(",pci_segment={pci_segment}") - } else { - String::new() - } - ); - - if !hotplug { - guest_command.args([ - if use_generic_vhost_user { - "--generic-vhost-user" - } else { - "--fs" - }, - fs_params.as_str(), - ]); - } - - let mut child = guest_command.capture_output().spawn().unwrap(); - let add_arg = if use_generic_vhost_user { - "add-generic-vhost-user" - } else { - "add-fs" - }; - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - if hotplug { - // Add fs to the VM - let (cmd_success, cmd_output) = - remote_command_w_output(&api_socket, add_arg, Some(&fs_params)); - assert!(cmd_success); - - if let Some(pci_segment) = pci_segment { - assert!(String::from_utf8_lossy(&cmd_output).contains(&format!( - "{{\"id\":\"myfs0\",\"bdf\":\"{pci_segment:04x}:00:01.0\"}}" - ))); - } else { - assert!( - String::from_utf8_lossy(&cmd_output) - .contains("{\"id\":\"myfs0\",\"bdf\":\"0000:00:06.0\"}") - ); - } - - thread::sleep(std::time::Duration::new(10, 0)); - } - - // Mount shared directory through virtio_fs filesystem - guest - .ssh_command("mkdir -p mount_dir && sudo mount -t virtiofs myfs mount_dir/") - .unwrap(); - - // Check file1 exists and its content is "foo" - assert_eq!( - guest.ssh_command("cat mount_dir/file1").unwrap().trim(), - "foo" - ); - // Check file2 does not exist - guest - .ssh_command("[ ! -f 'mount_dir/file2' ] || true") - .unwrap(); - - // Check file3 exists and its content is "bar" - assert_eq!( - guest.ssh_command("cat mount_dir/file3").unwrap().trim(), - "bar" - ); - - // ACPI feature is needed. 
- #[cfg(target_arch = "x86_64")] - { - guest.enable_memory_hotplug(); - - // Add RAM to the VM - let desired_ram = 1024 << 20; - resize_command(&api_socket, None, Some(desired_ram), None, None); - - thread::sleep(std::time::Duration::new(30, 0)); - assert!(guest.get_total_memory().unwrap_or_default() > 960_000); - - // After the resize, check again that file1 exists and its - // content is "foo". - assert_eq!( - guest.ssh_command("cat mount_dir/file1").unwrap().trim(), - "foo" - ); - } - - if hotplug { - // Remove from VM - guest.ssh_command("sudo umount mount_dir").unwrap(); - assert!(remote_command(&api_socket, "remove-device", Some("myfs0"))); - } - }); - - let (r, hotplug_daemon_child) = if r.is_ok() && hotplug { - thread::sleep(std::time::Duration::new(10, 0)); - let (daemon_child, virtiofsd_socket_path) = - prepare_daemon(&guest.tmp_dir, shared_dir.to_str().unwrap()); - - let r = std::panic::catch_unwind(|| { - thread::sleep(std::time::Duration::new(10, 0)); - let fs_params = format!( - "id=myfs0,socket={},{}{}", - virtiofsd_socket_path, - if use_generic_vhost_user { - "queue_sizes=[1024,1024],virtio_id=26" - } else { - "tag=myfs,num_queues=1,queue_size=1024" - }, - if let Some(pci_segment) = pci_segment { - format!(",pci_segment={pci_segment}") - } else { - String::new() - } - ); - - // Add back and check it works - let (cmd_success, cmd_output) = - remote_command_w_output(&api_socket, add_arg, Some(&fs_params)); - assert!(cmd_success); - if let Some(pci_segment) = pci_segment { - assert!(String::from_utf8_lossy(&cmd_output).contains(&format!( - "{{\"id\":\"myfs0\",\"bdf\":\"{pci_segment:04x}:00:01.0\"}}" - ))); - } else { - assert!( - String::from_utf8_lossy(&cmd_output) - .contains("{\"id\":\"myfs0\",\"bdf\":\"0000:00:06.0\"}") - ); - } - - thread::sleep(std::time::Duration::new(10, 0)); - // Mount shared directory through virtio_fs filesystem - guest - .ssh_command("mkdir -p mount_dir && sudo mount -t virtiofs myfs mount_dir/") - .unwrap(); - - // Check 
file1 exists and its content is "foo" - assert_eq!( - guest.ssh_command("cat mount_dir/file1").unwrap().trim(), - "foo" - ); - }); - - (r, Some(daemon_child)) - } else { - (r, None) - }; - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - let _ = daemon_child.kill(); - let _ = daemon_child.wait(); - - if let Some(mut daemon_child) = hotplug_daemon_child { - let _ = daemon_child.kill(); - let _ = daemon_child.wait(); - } - - handle_child_output(r, &output); -} - -fn test_virtio_pmem(discard_writes: bool, specify_size: bool) { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - - let kernel_path = direct_kernel_boot_path(); - - let pmem_temp_file = TempFile::new().unwrap(); - pmem_temp_file.as_file().set_len(128 << 20).unwrap(); - - std::process::Command::new("mkfs.ext4") - .arg(pmem_temp_file.as_path()) - .output() - .expect("Expect creating disk image to succeed"); - - let mut child = GuestCommand::new(&guest) - .default_cpus() - .default_memory() - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .default_net() - .args([ - "--pmem", - format!( - "file={}{}{}", - pmem_temp_file.as_path().to_str().unwrap(), - if specify_size { ",size=128M" } else { "" }, - if discard_writes { - ",discard_writes=on" - } else { - "" - } - ) - .as_str(), - ]) - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - // Check for the presence of /dev/pmem0 - assert_eq!( - guest.ssh_command("ls /dev/pmem0").unwrap().trim(), - "/dev/pmem0" - ); - - // Check changes persist after reboot - assert_eq!(guest.ssh_command("sudo mount /dev/pmem0 /mnt").unwrap(), ""); - assert_eq!(guest.ssh_command("ls /mnt").unwrap(), "lost+found\n"); - guest - .ssh_command("echo test123 | sudo tee /mnt/test") - .unwrap(); - assert_eq!(guest.ssh_command("sudo umount 
/mnt").unwrap(), ""); - assert_eq!(guest.ssh_command("ls /mnt").unwrap(), ""); - - guest.reboot_linux(0); - assert_eq!(guest.ssh_command("sudo mount /dev/pmem0 /mnt").unwrap(), ""); - assert_eq!( - guest - .ssh_command("sudo cat /mnt/test || true") - .unwrap() - .trim(), - if discard_writes { "" } else { "test123" } - ); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); -} - -fn get_fd_count(pid: u32) -> usize { - fs::read_dir(format!("/proc/{pid}/fd")).unwrap().count() -} - -fn _test_virtio_vsock(guest: &Guest, hotplug: bool) { - let socket = temp_vsock_path(&guest.tmp_dir); - let api_socket = temp_api_path(&guest.tmp_dir); - - let mut cmd = GuestCommand::new(guest); - cmd.args(["--api-socket", &api_socket]); - cmd.default_cpus(); - cmd.default_memory(); - cmd.default_kernel_cmdline(); - cmd.default_disks(); - cmd.default_net(); - - if !hotplug { - cmd.args(["--vsock", format!("cid=3,socket={socket}").as_str()]); - } - - let mut child = cmd.capture_output().spawn().unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - if hotplug { - let (cmd_success, cmd_output) = remote_command_w_output( - &api_socket, - "add-vsock", - Some(format!("cid=3,socket={socket},id=test0").as_str()), - ); - assert!(cmd_success); - assert!( - String::from_utf8_lossy(&cmd_output) - .contains("{\"id\":\"test0\",\"bdf\":\"0000:00:06.0\"}") - ); - thread::sleep(std::time::Duration::new(10, 0)); - // Check adding a second one fails - assert!(!remote_command( - &api_socket, - "add-vsock", - Some("cid=1234,socket=/tmp/fail") - )); - } - - // Validate vsock works as expected. - guest.check_vsock(socket.as_str()); - guest.reboot_linux(0); - // Validate vsock still works after a reboot. 
- guest.check_vsock(socket.as_str()); - - if hotplug { - assert!(remote_command(&api_socket, "remove-device", Some("test0"))); - } - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); -} - -fn get_ksm_pages_shared() -> u32 { - fs::read_to_string("/sys/kernel/mm/ksm/pages_shared") - .unwrap() - .trim() - .parse::() - .unwrap() -} - -fn test_memory_mergeable(mergeable: bool) { - let memory_param = if mergeable { - "mergeable=on" - } else { - "mergeable=off" - }; - - // We assume the number of shared pages in the rest of the system to be constant - let ksm_ps_init = get_ksm_pages_shared(); - - let disk_config1 = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); - let guest1 = Guest::new(Box::new(disk_config1)); - let mut child1 = GuestCommand::new(&guest1) - .default_cpus() - .args(["--memory", format!("size=512M,{memory_param}").as_str()]) - .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .args(["--net", guest1.default_net_string().as_str()]) - .args(["--serial", "tty", "--console", "off"]) - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest1.wait_vm_boot().unwrap(); - }); - if r.is_err() { - kill_child(&mut child1); - let output = child1.wait_with_output().unwrap(); - handle_child_output(r, &output); - panic!("Test should already be failed/panicked"); // To explicitly mark this block never return - } - - let ksm_ps_guest1 = get_ksm_pages_shared(); - - let disk_config2 = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); - let guest2 = Guest::new(Box::new(disk_config2)); - let mut child2 = GuestCommand::new(&guest2) - .default_cpus() - .args(["--memory", format!("size=512M,{memory_param}").as_str()]) - .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .args(["--net", 
guest2.default_net_string().as_str()]) - .args(["--serial", "tty", "--console", "off"]) - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest2.wait_vm_boot().unwrap(); - let ksm_ps_guest2 = get_ksm_pages_shared(); - - if mergeable { - println!( - "ksm pages_shared after vm1 booted '{ksm_ps_guest1}', ksm pages_shared after vm2 booted '{ksm_ps_guest2}'" - ); - // We are expecting the number of shared pages to increase as the number of VM increases - assert!(ksm_ps_guest1 < ksm_ps_guest2); - } else { - assert!(ksm_ps_guest1 == ksm_ps_init); - assert!(ksm_ps_guest2 == ksm_ps_init); - } - }); - - kill_child(&mut child1); - kill_child(&mut child2); - - let output = child1.wait_with_output().unwrap(); - child2.wait().unwrap(); - - handle_child_output(r, &output); -} - -fn _get_vmm_overhead(pid: u32, guest_memory_size: u32) -> HashMap { - let smaps = fs::File::open(format!("/proc/{pid}/smaps")).unwrap(); - let reader = io::BufReader::new(smaps); - - let mut skip_map: bool = false; - let mut region_name: String = String::new(); - let mut region_maps = HashMap::new(); - for line in reader.lines() { - let l = line.unwrap(); - - if l.contains('-') { - let values: Vec<&str> = l.split_whitespace().collect(); - region_name = values.last().unwrap().trim().to_string(); - if region_name == "0" { - region_name = "anonymous".to_string(); - } - } - - // Each section begins with something that looks like: - // Size: 2184 kB - if l.starts_with("Size:") { - let values: Vec<&str> = l.split_whitespace().collect(); - let map_size = values[1].parse::().unwrap(); - // We skip the assigned guest RAM map, its RSS is only - // dependent on the guest actual memory usage. - // Everything else can be added to the VMM overhead. - skip_map = map_size >= guest_memory_size; - continue; - } - - // If this is a map we're taking into account, then we only - // count the RSS. The sum of all counted RSS is the VMM overhead. 
- if !skip_map && l.starts_with("Rss:") { - let values: Vec<&str> = l.split_whitespace().collect(); - let value = values[1].trim().parse::().unwrap(); - *region_maps.entry(region_name.clone()).or_insert(0) += value; - } - } - - region_maps -} - -fn get_vmm_overhead(pid: u32, guest_memory_size: u32) -> u32 { - let mut total = 0; - - for (region_name, value) in &_get_vmm_overhead(pid, guest_memory_size) { - eprintln!("{region_name}: {value}"); - total += value; - } - - total -} - -fn process_rss_kib(pid: u32) -> usize { - let command = format!("ps -q {pid} -o rss="); - let rss = exec_host_command_output(&command); - String::from_utf8_lossy(&rss.stdout).trim().parse().unwrap() -} - -// 10MB is our maximum accepted overhead. -const MAXIMUM_VMM_OVERHEAD_KB: u32 = 10 * 1024; - -#[derive(PartialEq, Eq, PartialOrd)] -struct Counters { - rx_bytes: u64, - rx_frames: u64, - tx_bytes: u64, - tx_frames: u64, - read_bytes: u64, - write_bytes: u64, - read_ops: u64, - write_ops: u64, -} - -fn get_counters(api_socket: &str) -> Counters { - // Get counters - let (cmd_success, cmd_output) = remote_command_w_output(api_socket, "counters", None); - assert!(cmd_success); - - let counters: HashMap<&str, HashMap<&str, u64>> = - serde_json::from_slice(&cmd_output).unwrap_or_default(); - - let rx_bytes = *counters.get("_net2").unwrap().get("rx_bytes").unwrap(); - let rx_frames = *counters.get("_net2").unwrap().get("rx_frames").unwrap(); - let tx_bytes = *counters.get("_net2").unwrap().get("tx_bytes").unwrap(); - let tx_frames = *counters.get("_net2").unwrap().get("tx_frames").unwrap(); - - let read_bytes = *counters.get("_disk0").unwrap().get("read_bytes").unwrap(); - let write_bytes = *counters.get("_disk0").unwrap().get("write_bytes").unwrap(); - let read_ops = *counters.get("_disk0").unwrap().get("read_ops").unwrap(); - let write_ops = *counters.get("_disk0").unwrap().get("write_ops").unwrap(); - - Counters { - rx_bytes, - rx_frames, - tx_bytes, - tx_frames, - read_bytes, - write_bytes, 
- read_ops, - write_ops, - } -} - -fn pty_read(mut pty: std::fs::File) -> Receiver { - let (tx, rx) = mpsc::channel::(); - thread::spawn(move || { - loop { - thread::sleep(std::time::Duration::new(1, 0)); - let mut buf = [0; 512]; - match pty.read(&mut buf) { - Ok(_bytes) => { - let output = std::str::from_utf8(&buf).unwrap().to_string(); - match tx.send(output) { - Ok(_) => (), - Err(_) => break, - } - } - Err(_) => break, - } - } - }); - rx -} - -fn get_pty_path(api_socket: &str, pty_type: &str) -> PathBuf { - let (cmd_success, cmd_output) = remote_command_w_output(api_socket, "info", None); - assert!(cmd_success); - let info: serde_json::Value = serde_json::from_slice(&cmd_output).unwrap_or_default(); - assert_eq!("Pty", info["config"][pty_type]["mode"]); - PathBuf::from( - info["config"][pty_type]["file"] - .as_str() - .expect("Missing pty path"), - ) -} - -// VFIO test network setup. -// We reserve a different IP class for it: 172.18.0.0/24. -#[cfg(target_arch = "x86_64")] -fn setup_vfio_network_interfaces() { - // 'vfio-br0' - assert!(exec_host_command_status("sudo ip link add name vfio-br0 type bridge").success()); - assert!(exec_host_command_status("sudo ip link set vfio-br0 up").success()); - assert!(exec_host_command_status("sudo ip addr add 172.18.0.1/24 dev vfio-br0").success()); - // 'vfio-tap0' - assert!(exec_host_command_status("sudo ip tuntap add vfio-tap0 mode tap").success()); - assert!(exec_host_command_status("sudo ip link set vfio-tap0 master vfio-br0").success()); - assert!(exec_host_command_status("sudo ip link set vfio-tap0 up").success()); - // 'vfio-tap1' - assert!(exec_host_command_status("sudo ip tuntap add vfio-tap1 mode tap").success()); - assert!(exec_host_command_status("sudo ip link set vfio-tap1 master vfio-br0").success()); - assert!(exec_host_command_status("sudo ip link set vfio-tap1 up").success()); - // 'vfio-tap2' - assert!(exec_host_command_status("sudo ip tuntap add vfio-tap2 mode tap").success()); - 
assert!(exec_host_command_status("sudo ip link set vfio-tap2 master vfio-br0").success()); - assert!(exec_host_command_status("sudo ip link set vfio-tap2 up").success()); - // 'vfio-tap3' - assert!(exec_host_command_status("sudo ip tuntap add vfio-tap3 mode tap").success()); - assert!(exec_host_command_status("sudo ip link set vfio-tap3 master vfio-br0").success()); - assert!(exec_host_command_status("sudo ip link set vfio-tap3 up").success()); -} - -// Tear VFIO test network down -#[cfg(target_arch = "x86_64")] -fn cleanup_vfio_network_interfaces() { - assert!(exec_host_command_status("sudo ip link del vfio-br0").success()); - assert!(exec_host_command_status("sudo ip link del vfio-tap0").success()); - assert!(exec_host_command_status("sudo ip link del vfio-tap1").success()); - assert!(exec_host_command_status("sudo ip link del vfio-tap2").success()); - assert!(exec_host_command_status("sudo ip link del vfio-tap3").success()); -} - -fn balloon_size(api_socket: &str) -> u64 { - let (cmd_success, cmd_output) = remote_command_w_output(api_socket, "info", None); - assert!(cmd_success); - - let info: serde_json::Value = serde_json::from_slice(&cmd_output).unwrap_or_default(); - let total_mem = &info["config"]["memory"]["size"] - .to_string() - .parse::() - .unwrap(); - let actual_mem = &info["memory_actual_size"] - .to_string() - .parse::() - .unwrap(); - total_mem - actual_mem -} - -fn vm_state(api_socket: &str) -> String { - let (cmd_success, cmd_output) = remote_command_w_output(api_socket, "info", None); - assert!(cmd_success); - - let info: serde_json::Value = serde_json::from_slice(&cmd_output).unwrap_or_default(); - let state = &info["state"].as_str().unwrap(); - - state.to_string() -} - -// This test validates that it can find the virtio-iommu device at first. -// It also verifies that both disks and the network card are attached to -// the virtual IOMMU by looking at /sys/kernel/iommu_groups directory. 
-// The last interesting part of this test is that it exercises the network -// interface attached to the virtual IOMMU since this is the one used to -// send all commands through SSH. -fn _test_virtio_iommu(_acpi: bool /* not needed on x86_64 */) { - // Virtio-iommu support is ready in recent kernel (v5.14). But the kernel in - // Focal image is still old. - // So if ACPI is enabled on AArch64, we use a modified Focal image in which - // the kernel binary has been updated. - #[cfg(target_arch = "aarch64")] - let focal_image = FOCAL_IMAGE_UPDATE_KERNEL_NAME.to_string(); - #[cfg(target_arch = "x86_64")] - let focal_image = FOCAL_IMAGE_NAME.to_string(); - let disk_config = UbuntuDiskConfig::new(focal_image); - let guest = Guest::new(Box::new(disk_config)); - - #[cfg(target_arch = "x86_64")] - let kernel_path = direct_kernel_boot_path(); - #[cfg(target_arch = "aarch64")] - let kernel_path = if _acpi { - edk2_path() - } else { - direct_kernel_boot_path() - }; - - let mut child = GuestCommand::new(&guest) - .default_cpus() - .default_memory() - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .args([ - "--disk", - format!( - "path={},iommu=on", - guest.disk_config.disk(DiskType::OperatingSystem).unwrap() - ) - .as_str(), - format!( - "path={},iommu=on", - guest.disk_config.disk(DiskType::CloudInit).unwrap() - ) - .as_str(), - ]) - .args(["--net", guest.default_net_string_w_iommu().as_str()]) - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - // Verify the virtio-iommu device is present. - assert!( - guest - .does_device_vendor_pair_match("0x1057", "0x1af4") - .unwrap_or_default() - ); - - // On AArch64, if the guest system boots from FDT, the behavior of IOMMU is a bit - // different with ACPI. - // All devices on the PCI bus will be attached to the virtual IOMMU, except the - // virtio-iommu device itself. 
So these devices will all be added to IOMMU groups, - // and appear under folder '/sys/kernel/iommu_groups/'. - // - // Verify the first disk is in an iommu group. - assert!( - guest - .ssh_command("ls /sys/kernel/iommu_groups/*/devices") - .unwrap() - .contains("0000:00:02.0") - ); - - // Verify the second disk is in an iommu group. - assert!( - guest - .ssh_command("ls /sys/kernel/iommu_groups/*/devices") - .unwrap() - .contains("0000:00:03.0") - ); - - // Verify the network card is in an iommu group. - assert!( - guest - .ssh_command("ls /sys/kernel/iommu_groups/*/devices") - .unwrap() - .contains("0000:00:04.0") - ); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); -} - -fn get_reboot_count(guest: &Guest) -> u32 { - guest - .ssh_command("sudo last | grep -c reboot") - .unwrap() - .trim() - .parse::() - .unwrap_or_default() -} - -fn enable_guest_watchdog(guest: &Guest, watchdog_sec: u32) { - // Check for PCI device - assert!( - guest - .does_device_vendor_pair_match("0x1063", "0x1af4") - .unwrap_or_default() - ); - - // Enable systemd watchdog - guest - .ssh_command(&format!( - "echo RuntimeWatchdogSec={watchdog_sec}s | sudo tee -a /etc/systemd/system.conf" - )) - .unwrap(); - - guest.ssh_command("sudo systemctl daemon-reexec").unwrap(); -} - -fn make_guest_panic(guest: &Guest) { - // Check for pvpanic device - assert!( - guest - .does_device_vendor_pair_match("0x0011", "0x1b36") - .unwrap_or_default() - ); - - // Trigger guest a panic - guest.ssh_command("screen -dmS reboot sh -c \"sleep 5; echo s | tee /proc/sysrq-trigger; echo c | sudo tee /proc/sysrq-trigger\"").unwrap(); -} - -// ivshmem test -// This case validates that read data from host(host write data to ivshmem backend file, -// guest read data from ivshmem pci bar2 memory) -// and write data to host(guest write data to ivshmem pci bar2 memory, host read it from -// ivshmem backend file). 
-// It also checks the size of the shared memory region. -fn _test_ivshmem(guest: &Guest, ivshmem_file_path: impl AsRef, file_size: &str) { - let ivshmem_file_path = ivshmem_file_path.as_ref(); - let test_message_read = String::from("ivshmem device test data read"); - // Modify backend file data before function test - let mut file = OpenOptions::new() - .read(true) - .write(true) - .open(ivshmem_file_path) - .unwrap(); - file.seek(SeekFrom::Start(0)).unwrap(); - file.write_all(test_message_read.as_bytes()).unwrap(); - file.write_all(b"\0").unwrap(); - file.flush().unwrap(); - - let output = fs::read_to_string(ivshmem_file_path).unwrap(); - let nul_pos = output.as_bytes().iter().position(|&b| b == 0).unwrap(); - let c_str = CStr::from_bytes_until_nul(&output.as_bytes()[..=nul_pos]).unwrap(); - let file_message = c_str.to_string_lossy().to_string(); - // Check if the backend file data is correct - assert_eq!(test_message_read, file_message); - - let device_id_line = String::from( - guest - .ssh_command("lspci -D | grep \"Inter-VM shared memory\"") - .unwrap() - .trim(), - ); - // Check if ivshmem exists - assert!(!device_id_line.is_empty()); - let device_id = device_id_line.split(" ").next().unwrap(); - // Check shard memory size - assert_eq!( - guest - .ssh_command( - format!("lspci -vv -s {device_id} | grep -c \"Region 2.*size={file_size}\"") - .as_str(), - ) - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 1 - ); - - // guest don't have gcc or g++, try to use python to test :( - // This python program try to mmap the ivshmem pci bar2 memory and read the data from it. 
- let ivshmem_test_read = format!( - r#" -import os -import mmap -from ctypes import create_string_buffer, c_char, memmove - -if __name__ == "__main__": - device_path = f"/sys/bus/pci/devices/{device_id}/resource2" - fd = os.open(device_path, os.O_RDWR | os.O_SYNC) - - PAGE_SIZE = os.sysconf('SC_PAGESIZE') - - with mmap.mmap(fd, PAGE_SIZE, flags=mmap.MAP_SHARED, - prot=mmap.PROT_READ | mmap.PROT_WRITE, offset=0) as shmem: - c_buf = (c_char * PAGE_SIZE).from_buffer(shmem) - null_pos = c_buf.raw.find(b'\x00') - valid_data = c_buf.raw[:null_pos] if null_pos != -1 else c_buf.raw - print(valid_data.decode('utf-8', errors='replace'), end="") - shmem.flush() - del c_buf - - os.close(fd) - "# - ); - guest - .ssh_command( - format!( - r#"cat << EOF > test_read.py -{ivshmem_test_read} -EOF -"# - ) - .as_str(), - ) - .unwrap(); - let guest_message = guest.ssh_command("sudo python3 test_read.py").unwrap(); - - // Check the probe message in host and guest - assert_eq!(test_message_read, guest_message); - - let test_message_write = "ivshmem device test data write"; - // Then the program writes a test message to the memory and flush it. 
- let ivshmem_test_write = format!( - r#" -import os -import mmap -from ctypes import create_string_buffer, c_char, memmove - -if __name__ == "__main__": - device_path = f"/sys/bus/pci/devices/{device_id}/resource2" - test_message = "{test_message_write}" - fd = os.open(device_path, os.O_RDWR | os.O_SYNC) - - PAGE_SIZE = os.sysconf('SC_PAGESIZE') - - with mmap.mmap(fd, PAGE_SIZE, flags=mmap.MAP_SHARED, - prot=mmap.PROT_READ | mmap.PROT_WRITE, offset=0) as shmem: - shmem.flush() - c_buf = (c_char * PAGE_SIZE).from_buffer(shmem) - encoded_msg = test_message.encode('utf-8').ljust(1000, b'\x00') - memmove(c_buf, encoded_msg, len(encoded_msg)) - shmem.flush() - del c_buf - - os.close(fd) - "# - ); - - guest - .ssh_command( - format!( - r#"cat << EOF > test_write.py -{ivshmem_test_write} -EOF -"# - ) - .as_str(), - ) - .unwrap(); - - let _ = guest.ssh_command("sudo python3 test_write.py").unwrap(); - - let output = fs::read_to_string(ivshmem_file_path).unwrap(); - let nul_pos = output.as_bytes().iter().position(|&b| b == 0).unwrap(); - let c_str = CStr::from_bytes_until_nul(&output.as_bytes()[..=nul_pos]).unwrap(); - let file_message = c_str.to_string_lossy().to_string(); - // Check to send data from guest to host - assert_eq!(test_message_write, file_message); -} - -fn _test_simple_launch(guest: &Guest) { - let event_path = temp_event_monitor_path(&guest.tmp_dir); - - let mut child = GuestCommand::new(guest) - .default_cpus() - .default_memory() - .default_kernel_cmdline() - .default_disks() - .default_net() - .args(["--serial", "tty", "--console", "off"]) - .args(["--event-monitor", format!("path={event_path}").as_str()]) - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - guest.validate_cpu_count(None); - guest.validate_memory(None); - assert_eq!(guest.get_pci_bridge_class().unwrap_or_default(), "0x060000"); - assert!(check_sequential_events( - &guest - .get_expected_seq_events_for_simple_launch() 
- .iter() - .collect::>(), - &event_path - )); - - // It's been observed on the Bionic image that udev and snapd - // services can cause some delay in the VM's shutdown. Disabling - // them improves the reliability of this test. - let _ = guest.ssh_command("sudo systemctl disable udev"); - let _ = guest.ssh_command("sudo systemctl stop udev"); - let _ = guest.ssh_command("sudo systemctl disable snapd"); - let _ = guest.ssh_command("sudo systemctl stop snapd"); - - guest.ssh_command("sudo poweroff").unwrap(); - thread::sleep(std::time::Duration::new(20, 0)); - let latest_events = [ - &MetaEvent { - event: "shutdown".to_string(), - device_id: None, - }, - &MetaEvent { - event: "deleted".to_string(), - device_id: None, - }, - &MetaEvent { - event: "shutdown".to_string(), - device_id: None, - }, - ]; - assert!(check_latest_events_exact(&latest_events, &event_path)); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); -} - -fn _test_multi_cpu(guest: &Guest) { - let mut cmd = GuestCommand::new(guest); - cmd.args(["--cpus", "boot=2,max=4"]) - .default_memory() - .default_kernel_cmdline() - .capture_output() - .default_disks() - .default_net(); - - let mut child = cmd.spawn().unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - assert_eq!(guest.get_cpu_count().unwrap_or_default(), 2); - - assert_eq!( - guest - .ssh_command(r#"sudo dmesg | grep "smp: Brought up" | sed "s/\[\ *[0-9.]*\] //""#) - .unwrap() - .trim(), - "smp: Brought up 1 node, 2 CPUs" - ); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); -} - -fn _test_cpu_affinity(guest: &Guest) { - // We need the host to have at least 4 CPUs if we want to be able - // to run this test. 
- let host_cpus_count = exec_host_command_output("nproc"); - assert!( - String::from_utf8_lossy(&host_cpus_count.stdout) - .trim() - .parse::() - .unwrap_or(0) - >= 4 - ); - - let mut child = GuestCommand::new(guest) - .default_cpus_with_affinity() - .default_memory() - .default_kernel_cmdline() - .default_disks() - .default_net() - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - let pid = child.id(); - let taskset_vcpu0 = exec_host_command_output(format!("taskset -pc $(ps -T -p {pid} | grep vcpu0 | xargs | cut -f 2 -d \" \") | cut -f 6 -d \" \"").as_str()); - assert_eq!(String::from_utf8_lossy(&taskset_vcpu0.stdout).trim(), "0,2"); - let taskset_vcpu1 = exec_host_command_output(format!("taskset -pc $(ps -T -p {pid} | grep vcpu1 | xargs | cut -f 2 -d \" \") | cut -f 6 -d \" \"").as_str()); - assert_eq!(String::from_utf8_lossy(&taskset_vcpu1.stdout).trim(), "1,3"); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - handle_child_output(r, &output); -} - -fn _test_virtio_queue_affinity(guest: &Guest) { - // We need the host to have at least 4 CPUs if we want to be able - // to run this test. 
- let host_cpus_count = exec_host_command_output("nproc"); - assert!( - String::from_utf8_lossy(&host_cpus_count.stdout) - .trim() - .parse::() - .unwrap_or(0) - >= 4 - ); - - let mut child = GuestCommand::new(guest) - .default_cpus() - .default_memory() - .default_kernel_cmdline() - .args([ - "--disk", - format!( - "path={}", - guest.disk_config.disk(DiskType::OperatingSystem).unwrap() - ) - .as_str(), - format!( - "path={},num_queues=4,queue_affinity=[0@[0,2],1@[1,3],2@[1],3@[3]]", - guest.disk_config.disk(DiskType::CloudInit).unwrap() - ) - .as_str(), - ]) - .default_net() - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - let pid = child.id(); - let taskset_q0 = exec_host_command_output(format!("taskset -pc $(ps -T -p {pid} | grep disk1_q0 | xargs | cut -f 2 -d \" \") | cut -f 6 -d \" \"").as_str()); - assert_eq!(String::from_utf8_lossy(&taskset_q0.stdout).trim(), "0,2"); - let taskset_q1 = exec_host_command_output(format!("taskset -pc $(ps -T -p {pid} | grep disk1_q1 | xargs | cut -f 2 -d \" \") | cut -f 6 -d \" \"").as_str()); - assert_eq!(String::from_utf8_lossy(&taskset_q1.stdout).trim(), "1,3"); - let taskset_q2 = exec_host_command_output(format!("taskset -pc $(ps -T -p {pid} | grep disk1_q2 | xargs | cut -f 2 -d \" \") | cut -f 6 -d \" \"").as_str()); - assert_eq!(String::from_utf8_lossy(&taskset_q2.stdout).trim(), "1"); - let taskset_q3 = exec_host_command_output(format!("taskset -pc $(ps -T -p {pid} | grep disk1_q3 | xargs | cut -f 2 -d \" \") | cut -f 6 -d \" \"").as_str()); - assert_eq!(String::from_utf8_lossy(&taskset_q3.stdout).trim(), "3"); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - handle_child_output(r, &output); -} - -fn _test_pci_msi(guest: &Guest) { - let mut cmd = GuestCommand::new(guest); - cmd.default_cpus() - .default_memory() - .default_kernel_cmdline() - .capture_output() - .default_disks() - .default_net(); - - let mut child 
= cmd.spawn().unwrap(); - - guest.wait_vm_boot().unwrap(); - - let grep_cmd = format!("grep -c {} /proc/interrupts", get_msi_interrupt_pattern()); - - let r = std::panic::catch_unwind(|| { - assert_eq!( - guest - .ssh_command(&grep_cmd) - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 12 - ); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); -} - -fn _test_virtio_net_ctrl_queue(guest: &Guest) { - let mut cmd = GuestCommand::new(guest); - cmd.default_cpus() - .default_memory() - .default_kernel_cmdline() - .args(["--net", guest.default_net_string_w_mtu(3000).as_str()]) - .capture_output() - .default_disks(); - - let mut child = cmd.spawn().unwrap(); - - guest.wait_vm_boot().unwrap(); - - #[cfg(target_arch = "aarch64")] - let iface = "enp0s4"; - #[cfg(target_arch = "x86_64")] - let iface = "ens4"; - - let r = std::panic::catch_unwind(|| { - assert_eq!( - guest - .ssh_command( - format!("sudo ethtool -K {iface} rx-gro-hw off && echo success").as_str() - ) - .unwrap() - .trim(), - "success" - ); - assert_eq!( - guest - .ssh_command(format!("cat /sys/class/net/{iface}/mtu").as_str()) - .unwrap() - .trim(), - "3000" - ); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); -} - -fn _test_pci_multiple_segments( - guest: &Guest, - max_num_pci_segments: u16, - pci_segments_for_disk: u16, -) { - // Prepare another disk file for the virtio-disk device - let test_disk_path = String::from( - guest - .tmp_dir - .as_path() - .join("test-disk.raw") - .to_str() - .unwrap(), - ); - assert!( - exec_host_command_status(format!("truncate {test_disk_path} -s 4M").as_str()).success() - ); - assert!(exec_host_command_status(format!("mkfs.ext4 {test_disk_path}").as_str()).success()); - - let mut cmd = GuestCommand::new(guest); - cmd.default_cpus() - .default_memory() - .default_kernel_cmdline_with_platform(Some(&format!( - 
"num_pci_segments={max_num_pci_segments}" - ))) - .args([ - "--disk", - format!( - "path={}", - guest.disk_config.disk(DiskType::OperatingSystem).unwrap() - ) - .as_str(), - format!( - "path={}", - guest.disk_config.disk(DiskType::CloudInit).unwrap() - ) - .as_str(), - format!("path={test_disk_path},pci_segment={pci_segments_for_disk},image_type=raw") - .as_str(), - ]) - .capture_output() - .default_net(); - - let mut child = cmd.spawn().unwrap(); - - guest.wait_vm_boot().unwrap(); - - let grep_cmd = "lspci | grep \"Host bridge\" | wc -l"; - - let r = std::panic::catch_unwind(|| { - // There should be MAX_NUM_PCI_SEGMENTS PCI host bridges in the guest. - assert_eq!( - guest - .ssh_command(grep_cmd) - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - max_num_pci_segments - ); - - // Check both if /dev/vdc exists and if the block size is 4M. - assert_eq!( - guest - .ssh_command("lsblk | grep vdc | grep -c 4M") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 1 - ); - - // Mount the device. - guest.ssh_command("mkdir mount_image").unwrap(); - guest - .ssh_command("sudo mount -o rw -t ext4 /dev/vdc mount_image/") - .unwrap(); - // Grant all users with write permission. - guest.ssh_command("sudo chmod a+w mount_image/").unwrap(); - - // Write something to the device. - guest - .ssh_command("sudo echo \"bar\" >> mount_image/foo") - .unwrap(); - - // Check the content of the block device. The file "foo" should - // contain "bar". 
- assert_eq!( - guest - .ssh_command("sudo cat mount_image/foo") - .unwrap() - .trim(), - "bar" - ); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); -} - -fn _test_direct_kernel_boot(guest: &Guest) { - let mut child = GuestCommand::new(guest) - .default_cpus() - .default_memory() - .default_kernel_cmdline() - .default_disks() - .default_net() - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - guest.validate_cpu_count(None); - guest.validate_memory(None); - - let grep_cmd = format!("grep -c {} /proc/interrupts", get_msi_interrupt_pattern()); - assert_eq!( - guest - .ssh_command(&grep_cmd) - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 12 - ); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); -} - -fn _test_virtio_block( - guest: &Guest, - disable_io_uring: bool, - disable_aio: bool, - verify_os_disk: bool, - backing_files: bool, - image_type: ImageType, -) { - let mut workload_path = dirs::home_dir().unwrap(); - workload_path.push("workloads"); - - let mut blk_file_path = workload_path; - blk_file_path.push("blk.img"); - - let initial_backing_checksum = if verify_os_disk { - compute_backing_checksum(guest.disk_config.disk(DiskType::OperatingSystem).unwrap()) - } else { - None - }; - assert!( - guest.num_cpu >= 4, - "_test_virtio_block requires at least 4 CPUs to match num_queues=4" - ); - let mut cloud_child = GuestCommand::new(guest) - .default_cpus() - .args(["--memory", "size=512M,shared=on"]) - .default_kernel_cmdline() - .args([ - "--disk", - format!( - "path={},backing_files={},image_type={image_type}", - guest.disk_config.disk(DiskType::OperatingSystem).unwrap(), - if backing_files { "on" } else { "off" }, - ) - .as_str(), - format!( - "path={}", - guest.disk_config.disk(DiskType::CloudInit).unwrap() - ) - .as_str(), - format!( - 
"path={},readonly=on,direct=on,num_queues=4,_disable_io_uring={},_disable_aio={}", - blk_file_path.to_str().unwrap(), - disable_io_uring, - disable_aio, - ) - .as_str(), - ]) - .default_net() - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - // Check both if /dev/vdc exists and if the block size is 16M. - assert_eq!( - guest - .ssh_command("lsblk | grep vdc | grep -c 16M") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 1 - ); - - // Check both if /dev/vdc exists and if this block is RO. - assert_eq!( - guest - .ssh_command("lsblk | grep vdc | awk '{print $5}'") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 1 - ); - - // Check if the number of queues is 4. - assert_eq!( - guest - .ssh_command("ls -ll /sys/block/vdc/mq | grep ^d | wc -l") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 4 - ); - }); - - if verify_os_disk { - // Use clean shutdown to allow cloud-hypervisor to clear - // the dirty bit in the QCOW2 v3 image. 
- kill_child(&mut cloud_child); - } else { - let _ = cloud_child.kill(); - } - let output = cloud_child.wait_with_output().unwrap(); - - handle_child_output(r, &output); - - if verify_os_disk { - disk_check_consistency( - guest.disk_config.disk(DiskType::OperatingSystem).unwrap(), - initial_backing_checksum, - ); - } -} - -fn compute_backing_checksum( - path_or_image_name: impl AsRef, -) -> Option<(std::path::PathBuf, String, u32)> { - let path = resolve_disk_path(path_or_image_name); - - let mut file = File::open(&path).ok()?; - if !matches!( - block::detect_image_type(&mut file).ok()?, - block::ImageType::Qcow2 - ) { - return None; - } - - let info = get_image_info(&path)?; - - let backing_file = info["backing-filename"].as_str()?; - let backing_path = if std::path::Path::new(backing_file).is_absolute() { - std::path::PathBuf::from(backing_file) - } else { - path.parent() - .unwrap_or_else(|| std::path::Path::new(".")) - .join(backing_file) - }; - - let backing_info = get_image_info(&backing_path)?; - let backing_format = backing_info["format"].as_str()?.to_string(); - let mut file = File::open(&backing_path).ok()?; - let file_size = file.metadata().ok()?.len(); - let checksum = compute_file_checksum(&mut file, file_size); - - Some((backing_path, backing_format, checksum)) -} - -/// Uses `qemu-img check` to verify disk image consistency. -/// -/// Supported formats are `qcow2` (compressed and uncompressed), -/// `vhdx`, `qed`, `parallels`, `vmdk`, and `vdi`. See man page -/// for more details. -/// -/// It takes either a full path to the image or just the name of -/// the image located in the `workloads` directory. -/// -/// For QCOW2 images with backing files, also verifies the backing file -/// integrity and checks that the backing file hasn't been modified -/// during the test. -/// -/// For QCOW2 v3 images, also verifies the dirty bit is cleared. 
-fn disk_check_consistency( - path_or_image_name: impl AsRef, - initial_backing_checksum: Option<(std::path::PathBuf, String, u32)>, -) { - let path = resolve_disk_path(path_or_image_name); - let output = run_qemu_img(&path, &["check"], None); - - assert!( - output.status.success(), - "qemu-img check failed: {}", - String::from_utf8_lossy(&output.stderr) - ); - - match check_dirty_flag(&path) { - Ok(Some(dirty)) => { - assert!(!dirty, "QCOW2 image shutdown unclean"); - } - Ok(None) => {} // Not a QCOW2 v3 image, skip dirty flag check - Err(e) => panic!("Failed to check dirty flag: {e}"), - } - - if let Some((backing_path, format, initial_checksum)) = initial_backing_checksum { - if format.parse::().ok() != Some(block::qcow::ImageType::Raw) { - let output = run_qemu_img(&backing_path, &["check"], None); - - assert!( - output.status.success(), - "qemu-img check of backing file failed: {}", - String::from_utf8_lossy(&output.stderr) - ); - } - - let mut file = File::open(&backing_path).unwrap(); - let file_size = file.metadata().unwrap().len(); - assert_eq!( - initial_checksum, - compute_file_checksum(&mut file, file_size) - ); - } -} - -fn run_qemu_img( - path: &std::path::Path, - args: &[&str], - trailing_args: Option<&[&str]>, -) -> std::process::Output { - let mut cmd = std::process::Command::new("qemu-img"); - cmd.arg(args[0]) - .args(&args[1..]) - .arg(path.to_str().unwrap()); - if let Some(extra) = trailing_args { - cmd.args(extra); - } - cmd.output().unwrap() -} - -fn get_image_info(path: &std::path::Path) -> Option { - let output = run_qemu_img(path, &["info", "-U", "--output=json"], None); - - output.status.success().then_some(())?; - serde_json::from_slice(&output.stdout).ok() -} - -fn get_qcow2_v3_info(path: &Path) -> Result, String> { - let info = get_image_info(path) - .ok_or_else(|| format!("qemu-img info failed for {}", path.display()))?; - if info["format"].as_str() != Some("qcow2") { - return Ok(None); - } - // QCOW2 v3 has compat "1.1", v2 has "0.10" 
- if info["format-specific"]["data"]["compat"].as_str() != Some("1.1") { - return Ok(None); - } - Ok(Some(info)) -} - -fn check_dirty_flag(path: &Path) -> Result, String> { - Ok(get_qcow2_v3_info(path)?.and_then(|info| info["dirty-flag"].as_bool())) -} - -fn check_corrupt_flag(path: &Path) -> Result, String> { - Ok(get_qcow2_v3_info(path)? - .and_then(|info| info["format-specific"]["data"]["corrupt"].as_bool())) -} - -const QCOW2_INCOMPATIBLE_FEATURES_OFFSET: u64 = 72; - -fn set_corrupt_flag(path: &Path, corrupt: bool) -> io::Result<()> { - let mut file = OpenOptions::new().read(true).write(true).open(path)?; - - file.seek(SeekFrom::Start(QCOW2_INCOMPATIBLE_FEATURES_OFFSET))?; - let mut buf = [0u8; 8]; - file.read_exact(&mut buf)?; - let mut features = u64::from_be_bytes(buf); - - if corrupt { - features |= 0x02; - } else { - features &= !0x02; - } - - file.seek(SeekFrom::Start(QCOW2_INCOMPATIBLE_FEATURES_OFFSET))?; - file.write_all(&features.to_be_bytes())?; - file.sync_all()?; - Ok(()) -} - -fn resolve_disk_path(path_or_image_name: impl AsRef) -> std::path::PathBuf { - if path_or_image_name.as_ref().exists() { - // A full path is provided - path_or_image_name.as_ref().to_path_buf() - } else { - // An image name is provided - let mut workload_path = dirs::home_dir().unwrap(); - workload_path.push("workloads"); - workload_path.as_path().join(path_or_image_name.as_ref()) - } -} - -fn compute_file_checksum(reader: &mut dyn std::io::Read, size: u64) -> u32 { - // Read first 16MB or entire data if smaller - let read_size = cmp::min(size, 16 * 1024 * 1024) as usize; - - let mut buffer = vec![0u8; read_size]; - reader.read_exact(&mut buffer).unwrap(); - - // DJB2 hash - let mut hash: u32 = 5381; - for byte in buffer.iter() { - hash = hash.wrapping_mul(33).wrapping_add(*byte as u32); - } - hash -} - -fn make_virtio_block_guest(factory: &GuestFactory, image_name: &str) -> Guest { - let disk_config = UbuntuDiskConfig::new(image_name.to_string()); - 
factory.create_guest(Box::new(disk_config)).with_cpu(4) -} +mod common; +use common::tests_wrappers::*; +use common::utils::*; mod common_parallel { use std::io::{self, SeekFrom}; From d77e3e7ca25257443fc20d424359fcce592c92ed Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Sat, 14 Mar 2026 09:45:38 +0100 Subject: [PATCH 240/742] block: qcow: Add From for BlockError Temporary From impl that classifies each qcow::Error variant into the appropriate BlockErrorKind. This enables an incremental migration of qcow functions from qcow::Result to BlockResult, where each subsequent commit replaces bare ? sites with explicit BlockError::new calls until this impl can be removed. The mapping assigns InvalidFormat for structural header violations, UnsupportedFeature for version and feature mismatches, CorruptImage for internal inconsistencies, Overflow for nesting depth and Io for everything else. Signed-off-by: Anatol Belski --- block/src/error.rs | 52 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/block/src/error.rs b/block/src/error.rs index ebaa33ec5b..afd4b5533a 100644 --- a/block/src/error.rs +++ b/block/src/error.rs @@ -231,4 +231,56 @@ impl From for BlockError { } } +/// Temporary scaffolding: classify a `qcow::Error` into the appropriate +/// `BlockErrorKind`. +/// +/// This impl exists only to allow an incremental migration of the qcow +/// parse/construct chain from `qcow::Result` to `BlockResult`. Each +/// subsequent commit replaces bare `?` sites with explicit +/// `BlockError::new(kind, e)` calls. Once every site is migrated this +/// impl will be removed. 
+impl From for BlockError { + fn from(e: crate::qcow::Error) -> Self { + use crate::qcow::Error as E; + let kind = match &e { + // Structural / format violations + E::InvalidMagic + | E::BackingFileTooLong(_) + | E::InvalidBackingFileName(_) + | E::InvalidClusterSize + | E::InvalidL1TableSize(_) + | E::InvalidL1TableOffset + | E::InvalidOffset(_) + | E::InvalidRefcountTableOffset + | E::InvalidRefcountTableSize(_) + | E::FileTooBig(_) + | E::NoRefcountClusters + | E::RefcountTableOffEnd + | E::RefcountTableTooLarge + | E::TooManyL1Entries(_) + | E::TooManyRefcounts(_) + | E::SizeTooSmallForNumberOfClusters => BlockErrorKind::InvalidFormat, + + // Unsupported features / versions + E::UnsupportedVersion(_) + | E::UnsupportedFeature(_) + | E::UnsupportedCompressionType + | E::UnsupportedBackingFileFormat(_) + | E::UnsupportedRefcountOrder + | E::BackingFilesDisabled + | E::ShrinkNotSupported => BlockErrorKind::UnsupportedFeature, + + // Corrupt image + E::CorruptImage => BlockErrorKind::CorruptImage, + + // Nesting depth overflow + E::MaxNestingDepthExceeded => BlockErrorKind::Overflow, + + // Everything else is I/O + _ => BlockErrorKind::Io, + }; + Self::new(kind, e) + } +} + pub type BlockResult = Result; From c4a5c7f843b000720e666cc9348fb3fd85277259 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Sat, 14 Mar 2026 09:46:03 +0100 Subject: [PATCH 241/742] block: qcow: Switch QcowFile constructors to BlockResult Switch the public QcowFile constructors (new, new_from_backing, new_from_header, from, from_backing, from_file_and_header) from qcow::Result to BlockResult. Internal calls to header functions that still return qcow::Result are wrapped with explicit error classification at each call site. Test assertions are updated to match on BlockErrorKind and use downcast to inspect the underlying qcow::Error variant. 
Signed-off-by: Anatol Belski --- block/src/qcow/mod.rs | 81 +++++++++++++++++++++++++++++++++---------- 1 file changed, 62 insertions(+), 19 deletions(-) diff --git a/block/src/qcow/mod.rs b/block/src/qcow/mod.rs index 0ed4cd858d..8248c3f846 100644 --- a/block/src/qcow/mod.rs +++ b/block/src/qcow/mod.rs @@ -569,7 +569,7 @@ impl QcowFile { /// Creates a QcowFile from `file`. File must be a valid qcow2 image. /// /// Additionally, max nesting depth of this qcow2 image will be set to default value 10. - pub fn from(file: RawFile) -> Result { + pub fn from(file: RawFile) -> BlockResult { Self::from_with_nesting_depth(file, MAX_NESTING_DEPTH, true) } @@ -579,7 +579,7 @@ impl QcowFile { file: RawFile, max_nesting_depth: u32, sparse: bool, - ) -> Result { + ) -> BlockResult { let (inner, backing_file, sparse) = parse_qcow(file, max_nesting_depth, sparse)?; let metadata::QcowState { raw_file, @@ -607,8 +607,20 @@ impl QcowFile { } /// Creates a new QcowFile at the given path. - pub fn new(file: RawFile, version: u32, virtual_size: u64, sparse: bool) -> Result { - let header = QcowHeader::create_for_size_and_path(version, virtual_size, None)?; + pub fn new( + file: RawFile, + version: u32, + virtual_size: u64, + sparse: bool, + ) -> BlockResult { + let header = + QcowHeader::create_for_size_and_path(version, virtual_size, None).map_err(|e| { + let kind = match &e { + Error::BackingFileTooLong(_) => BlockErrorKind::InvalidFormat, + _ => BlockErrorKind::Io, + }; + BlockError::new(kind, e) + })?; QcowFile::new_from_header(file, &header, sparse) } @@ -619,12 +631,19 @@ impl QcowFile { backing_file_size: u64, backing_config: &BackingFileConfig, sparse: bool, - ) -> Result { + ) -> BlockResult { let mut header = QcowHeader::create_for_size_and_path( version, backing_file_size, Some(&backing_config.path), - )?; + ) + .map_err(|e| { + let kind = match &e { + Error::BackingFileTooLong(_) => BlockErrorKind::InvalidFormat, + _ => BlockErrorKind::Io, + }; + BlockError::new(kind, e) + 
})?; if let Some(backing_file) = &mut header.backing_file { backing_file.format = backing_config.format; } @@ -632,9 +651,16 @@ impl QcowFile { // backing_file is loaded by new_from_header -> Self::from() based on the header } - fn new_from_header(mut file: RawFile, header: &QcowHeader, sparse: bool) -> Result { - file.rewind().map_err(Error::SeekingFile)?; - header.write_to(&mut file)?; + fn new_from_header( + mut file: RawFile, + header: &QcowHeader, + sparse: bool, + ) -> BlockResult { + file.rewind() + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SeekingFile(e)))?; + header + .write_to(&mut file) + .map_err(|e| BlockError::new(BlockErrorKind::Io, e))?; let mut qcow = Self::from_with_nesting_depth(file, MAX_NESTING_DEPTH, sparse)?; @@ -646,9 +672,9 @@ impl QcowFile { let mut cluster_addr = 0; while cluster_addr < end_cluster_addr { - let mut unref_clusters = qcow - .set_cluster_refcount(cluster_addr, 1) - .map_err(Error::SettingRefcountRefcount)?; + let mut unref_clusters = qcow.set_cluster_refcount(cluster_addr, 1).map_err(|e| { + BlockError::new(BlockErrorKind::Io, Error::SettingRefcountRefcount(e)) + })?; qcow.unref_clusters.append(&mut unref_clusters); cluster_addr += cluster_size; } @@ -2180,6 +2206,7 @@ pub fn detect_image_type(file: &mut RawFile) -> Result { #[cfg(test)] mod unit_tests { + use std::error::Error as StdError; use std::fs::File; use std::path::Path; @@ -2531,7 +2558,11 @@ mod unit_tests { disk_file.rewind().unwrap(); // The maximum nesting depth is 0, which means backing file is not allowed. let res = QcowFile::from_with_nesting_depth(disk_file, 0, true); - assert!(matches!(res.unwrap_err(), Error::MaxNestingDepthExceeded)); + let err = res.unwrap_err(); + assert!(matches!(err.kind(), BlockErrorKind::Overflow)); + let source = StdError::source(&err).unwrap(); + let qcow_err = source.downcast_ref::().unwrap(); + assert!(matches!(qcow_err, Error::MaxNestingDepthExceeded)); } /// Create a qcow2 file with itself as its backing file. 
@@ -3839,7 +3870,7 @@ mod unit_tests { assert!(result.is_err()); let err = result.unwrap_err(); assert!( - matches!(err, Error::CorruptImage), + matches!(err.kind(), BlockErrorKind::CorruptImage), "Expected CorruptImage error, got: {err:?}" ); }); @@ -3853,8 +3884,11 @@ mod unit_tests { let result = QcowFile::from(disk_file); assert!(result.is_err()); let err = result.unwrap_err(); + assert!(matches!(err.kind(), BlockErrorKind::UnsupportedFeature)); + let source = StdError::source(&err).unwrap(); + let qcow_err = source.downcast_ref::().unwrap(); assert!( - matches!(err, Error::UnsupportedFeature(ref v) if v.to_string().contains("external")), + matches!(qcow_err, Error::UnsupportedFeature(v) if v.to_string().contains("external")), "Expected UnsupportedFeature error mentioning external, got: {err:?}" ); }); @@ -3868,8 +3902,11 @@ mod unit_tests { let result = QcowFile::from(disk_file); assert!(result.is_err()); let err = result.unwrap_err(); + assert!(matches!(err.kind(), BlockErrorKind::UnsupportedFeature)); + let source = StdError::source(&err).unwrap(); + let qcow_err = source.downcast_ref::().unwrap(); assert!( - matches!(err, Error::UnsupportedFeature(ref v) if v.to_string().contains("extended")), + matches!(qcow_err, Error::UnsupportedFeature(v) if v.to_string().contains("extended")), "Expected UnsupportedFeature error mentioning extended, got: {err:?}" ); }); @@ -3882,7 +3919,10 @@ mod unit_tests { with_basic_file(&header, |disk_file: RawFile| { let result = QcowFile::from(disk_file); assert!(result.is_err()); - assert!(matches!(result.unwrap_err(), Error::UnsupportedFeature(_))); + assert!(matches!( + result.unwrap_err().kind(), + BlockErrorKind::UnsupportedFeature + )); }); } @@ -3894,8 +3934,11 @@ mod unit_tests { let result = QcowFile::from(disk_file); assert!(result.is_err()); let err = result.unwrap_err(); + assert!(matches!(err.kind(), BlockErrorKind::UnsupportedFeature)); + let source = StdError::source(&err).unwrap(); + let qcow_err = 
source.downcast_ref::().unwrap(); assert!( - matches!(err, Error::UnsupportedFeature(ref v) if v.to_string().contains("unknown")), + matches!(qcow_err, Error::UnsupportedFeature(v) if v.to_string().contains("unknown")), "Expected UnsupportedFeature error mentioning unknown, got: {err:?}" ); }); @@ -4095,7 +4138,7 @@ mod unit_tests { assert!(result.is_err()); let err = result.unwrap_err(); assert!( - matches!(err, Error::CorruptImage), + matches!(err.kind(), BlockErrorKind::CorruptImage), "Expected CorruptImage error, got: {err:?}" ); }); From be9ef116aad907525ba5bb14fc7783aa6c392472 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 13 Mar 2026 23:39:45 +0100 Subject: [PATCH 242/742] block: qcow: Switch parse_qcow and BackingFile::new to BlockResult Switch parse_qcow and BackingFile::new from qcow::Result to BlockResult. Every early return site now produces an explicit BlockError with the appropriate kind. Remaining internal calls to functions still on qcow::Result rely on the From scaffolding and will be converted in subsequent commits. Two helpers are added to BlockError. with_kind replaces the classification on an existing error, used in QcowDiskSync::new to avoid double wrapping when the caller needs a different kind. into_source consumes the error and returns the boxed source, used at the recursive BackingFile open to extract the qcow::Error for BackingFileOpen without letting qcow::Error hold a BlockError. The qcow_sync boundary is simplified to a single closure that operates on the BlockError already returned by parse_qcow. 
Signed-off-by: Anatol Belski --- block/src/error.rs | 11 ++++++ block/src/qcow/mod.rs | 90 ++++++++++++++++++++++++++++++++++-------- block/src/qcow_sync.rs | 27 ++++--------- 3 files changed, 92 insertions(+), 36 deletions(-) diff --git a/block/src/error.rs b/block/src/error.rs index afd4b5533a..b235b17e48 100644 --- a/block/src/error.rs +++ b/block/src/error.rs @@ -167,6 +167,12 @@ impl BlockError { self } + /// Replace the error classification (builder-style). + pub fn with_kind(mut self, kind: BlockErrorKind) -> Self { + self.kind = kind; + self + } + /// Shorthand: attach an operation name. pub fn with_op(mut self, op: ErrorOp) -> Self { self.ctx.get_or_insert_with(ErrorContext::default).op = Some(op); @@ -204,6 +210,11 @@ impl BlockError { pub fn downcast_ref(&self) -> Option<&T> { self.source.as_ref()?.downcast_ref::() } + + /// Consume the error and return the boxed source, if any. + pub fn into_source(self) -> Option> { + self.source + } } impl Display for BlockError { diff --git a/block/src/qcow/mod.rs b/block/src/qcow/mod.rs index 8248c3f846..27c7e3a8cd 100644 --- a/block/src/qcow/mod.rs +++ b/block/src/qcow/mod.rs @@ -189,14 +189,17 @@ impl BackingFile { direct_io: bool, max_nesting_depth: u32, sparse: bool, - ) -> Result> { + ) -> BlockResult> { let Some(config) = backing_file_config else { return Ok(None); }; // Check nesting depth - applies to any backing file if max_nesting_depth == 0 { - return Err(Error::MaxNestingDepthExceeded); + return Err(BlockError::new( + BlockErrorKind::Overflow, + Error::MaxNestingDepthExceeded, + )); } let backing_raw_file = OpenOptions::new() @@ -224,8 +227,17 @@ impl BackingFile { } ImageType::Qcow2 => { let (inner, nested_backing, _sparse) = - parse_qcow(raw_file, max_nesting_depth - 1, sparse) - .map_err(|e| Error::BackingFileOpen(config.path.clone(), Box::new(e)))?; + parse_qcow(raw_file, max_nesting_depth - 1, sparse).map_err(|e| { + let kind = e.kind(); + let source = e + .into_source() + .and_then(|s| 
s.downcast::().ok()) + .map(|qcow_err| Error::BackingFileOpen(config.path.clone(), qcow_err)); + match source { + Some(err) => BlockError::new(kind, err), + None => BlockError::from_kind(kind), + } + })?; let size = inner.header.size; ( BackingKind::Qcow { @@ -335,28 +347,51 @@ pub(crate) fn parse_qcow( mut file: RawFile, max_nesting_depth: u32, sparse: bool, -) -> Result<(metadata::QcowState, Option, bool)> { - let mut header = QcowHeader::new(&mut file)?; +) -> BlockResult<(metadata::QcowState, Option, bool)> { + let mut header = QcowHeader::new(&mut file).map_err(|e| { + let kind = match &e { + Error::InvalidMagic + | Error::BackingFileTooLong(_) + | Error::InvalidBackingFileName(_) => BlockErrorKind::InvalidFormat, + Error::UnsupportedFeature(_) | Error::UnsupportedCompressionType => { + BlockErrorKind::UnsupportedFeature + } + _ => BlockErrorKind::Io, + }; + BlockError::new(kind, e) + })?; // Only v2 and v3 files are supported. if header.version != 2 && header.version != 3 { - return Err(Error::UnsupportedVersion(header.version)); + return Err(BlockError::new( + BlockErrorKind::UnsupportedFeature, + Error::UnsupportedVersion(header.version), + )); } // Make sure that the L1 table fits in RAM. if u64::from(header.l1_size) > MAX_RAM_POINTER_TABLE_SIZE { - return Err(Error::InvalidL1TableSize(header.l1_size)); + return Err(BlockError::new( + BlockErrorKind::InvalidFormat, + Error::InvalidL1TableSize(header.l1_size), + )); } let cluster_bits: u32 = header.cluster_bits; if !(MIN_CLUSTER_BITS..=MAX_CLUSTER_BITS).contains(&cluster_bits) { - return Err(Error::InvalidClusterSize); + return Err(BlockError::new( + BlockErrorKind::InvalidFormat, + Error::InvalidClusterSize, + )); } let cluster_size = 0x01u64 << cluster_bits; // Limit the total size of the disk. 
if header.size > MAX_QCOW_FILE_SIZE { - return Err(Error::FileTooBig(header.size)); + return Err(BlockError::new( + BlockErrorKind::InvalidFormat, + Error::FileTooBig(header.size), + )); } let direct_io = file.is_direct(); @@ -373,12 +408,18 @@ pub(crate) fn parse_qcow( .checked_shl(header.refcount_order) .ok_or(Error::UnsupportedRefcountOrder)?; if refcount_bits > 64 { - return Err(Error::UnsupportedRefcountOrder); + return Err(BlockError::new( + BlockErrorKind::UnsupportedFeature, + Error::UnsupportedRefcountOrder, + )); } // Need at least one refcount cluster if header.refcount_table_clusters == 0 { - return Err(Error::NoRefcountClusters); + return Err(BlockError::new( + BlockErrorKind::InvalidFormat, + Error::NoRefcountClusters, + )); } offset_is_cluster_boundary(header.l1_table_offset, header.cluster_bits)?; offset_is_cluster_boundary(header.snapshots_offset, header.cluster_bits)?; @@ -386,7 +427,10 @@ pub(crate) fn parse_qcow( offset_is_cluster_boundary(header.refcount_table_offset, header.cluster_bits)?; let file_size = file.metadata().map_err(Error::GettingFileSize)?.len(); if header.refcount_table_offset > max(file_size, header.size) { - return Err(Error::RefcountTableOffEnd); + return Err(BlockError::new( + BlockErrorKind::CorruptImage, + Error::RefcountTableOffEnd, + )); } // The first cluster should always have a non-zero refcount, so if it is 0, @@ -414,7 +458,10 @@ pub(crate) fn parse_qcow( if header.is_corrupt() { if is_writable { - return Err(Error::CorruptImage); + return Err(BlockError::new( + BlockErrorKind::CorruptImage, + Error::CorruptImage, + )); } let path = read_link(format!("/proc/self/fd/{}", raw_file.file().as_raw_fd())) .map_or_else(|_| "".to_string(), |p| p.display().to_string()); @@ -440,7 +487,10 @@ pub(crate) fn parse_qcow( let l1_clusters = div_round_up_u64(num_l2_clusters, entries_per_cluster); let header_clusters = div_round_up_u64(size_of::() as u64, cluster_size); if num_l2_clusters > MAX_RAM_POINTER_TABLE_SIZE { - return 
Err(Error::TooManyL1Entries(num_l2_clusters)); + return Err(BlockError::new( + BlockErrorKind::CorruptImage, + Error::TooManyL1Entries(num_l2_clusters), + )); } let l1_table = VecCache::from_vec( raw_file @@ -460,10 +510,16 @@ pub(crate) fn parse_qcow( ); // Check that the given header doesn't have a suspiciously sized refcount table. if u64::from(header.refcount_table_clusters) > 2 * refcount_clusters { - return Err(Error::RefcountTableTooLarge); + return Err(BlockError::new( + BlockErrorKind::CorruptImage, + Error::RefcountTableTooLarge, + )); } if l1_clusters + refcount_clusters > MAX_RAM_POINTER_TABLE_SIZE { - return Err(Error::TooManyRefcounts(refcount_clusters)); + return Err(BlockError::new( + BlockErrorKind::InvalidFormat, + Error::TooManyRefcounts(refcount_clusters), + )); } let refcount_block_entries = cluster_size * 8 / refcount_bits; let mut refcounts = RefCount::new( diff --git a/block/src/qcow_sync.rs b/block/src/qcow_sync.rs index cce4c192d7..7340a7aa40 100644 --- a/block/src/qcow_sync.rs +++ b/block/src/qcow_sync.rs @@ -197,25 +197,14 @@ impl QcowDiskSync { ) -> BlockResult { let max_nesting_depth = if backing_files { MAX_NESTING_DEPTH } else { 0 }; let (inner, backing_file, sparse) = - parse_qcow(RawFile::new(file, direct_io), max_nesting_depth, sparse) - .map_err(|e| match e { - QcowError::MaxNestingDepthExceeded if !backing_files => { - QcowError::BackingFilesDisabled - } - other => other, - }) - .map_err(|e| { - let kind = match &e { - QcowError::InvalidMagic | QcowError::UnsupportedVersion(_) => { - BlockErrorKind::InvalidFormat - } - QcowError::UnsupportedFeature(_) | QcowError::BackingFilesDisabled => { - BlockErrorKind::UnsupportedFeature - } - _ => BlockErrorKind::Io, - }; - BlockError::new(kind, e).with_op(ErrorOp::Open) - })?; + parse_qcow(RawFile::new(file, direct_io), max_nesting_depth, sparse).map_err(|e| { + let e = if !backing_files && matches!(e.kind(), BlockErrorKind::Overflow) { + e.with_kind(BlockErrorKind::UnsupportedFeature) + 
} else { + e + }; + e.with_op(ErrorOp::Open) + })?; let data_raw_file = inner.raw_file.clone(); Ok(QcowDiskSync { metadata: Arc::new(QcowMetadata::new(inner)), From 524e620240d82bcbf31c8327073cf51b65adb0e6 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 13 Mar 2026 23:53:42 +0100 Subject: [PATCH 243/742] block: qcow: Classify errors in parse_qcow and BackingFile::new Replace remaining automatic From conversions in parse_qcow and BackingFile::new with explicit BlockError::new calls carrying the appropriate BlockErrorKind at every error site. Internal functions that still return qcow::Result (QcowHeader::new, offset_is_cluster_boundary, clear_autoclear_features and others) are wrapped with map_err at the boundary. These functions stay on qcow::Result as they are internal to the qcow module and the classification belongs at the call site rather than inside the function itself. Signed-off-by: Anatol Belski --- block/src/qcow/mod.rs | 92 ++++++++++++++++++++++++++++++------------- 1 file changed, 64 insertions(+), 28 deletions(-) diff --git a/block/src/qcow/mod.rs b/block/src/qcow/mod.rs index 27c7e3a8cd..c926642e64 100644 --- a/block/src/qcow/mod.rs +++ b/block/src/qcow/mod.rs @@ -205,7 +205,12 @@ impl BackingFile { let backing_raw_file = OpenOptions::new() .read(true) .open(&config.path) - .map_err(|e| Error::BackingFileIo(config.path.clone(), e))?; + .map_err(|e| { + BlockError::new( + BlockErrorKind::Io, + Error::BackingFileIo(config.path.clone(), e), + ) + })?; let mut raw_file = RawFile::new(backing_raw_file, direct_io); @@ -217,12 +222,18 @@ impl BackingFile { let (kind, virtual_size) = match backing_format { ImageType::Raw => { - let size = raw_file - .seek(SeekFrom::End(0)) - .map_err(|e| Error::BackingFileIo(config.path.clone(), e))?; - raw_file - .rewind() - .map_err(|e| Error::BackingFileIo(config.path.clone(), e))?; + let size = raw_file.seek(SeekFrom::End(0)).map_err(|e| { + BlockError::new( + BlockErrorKind::Io, + 
Error::BackingFileIo(config.path.clone(), e), + ) + })?; + raw_file.rewind().map_err(|e| { + BlockError::new( + BlockErrorKind::Io, + Error::BackingFileIo(config.path.clone(), e), + ) + })?; (BackingKind::Raw(raw_file), size) } ImageType::Qcow2 => { @@ -404,9 +415,12 @@ pub(crate) fn parse_qcow( )?; // Validate refcount order to be 0..6 - let refcount_bits: u64 = 0x01u64 - .checked_shl(header.refcount_order) - .ok_or(Error::UnsupportedRefcountOrder)?; + let refcount_bits: u64 = 0x01u64.checked_shl(header.refcount_order).ok_or_else(|| { + BlockError::new( + BlockErrorKind::UnsupportedFeature, + Error::UnsupportedRefcountOrder, + ) + })?; if refcount_bits > 64 { return Err(BlockError::new( BlockErrorKind::UnsupportedFeature, @@ -421,11 +435,17 @@ pub(crate) fn parse_qcow( Error::NoRefcountClusters, )); } - offset_is_cluster_boundary(header.l1_table_offset, header.cluster_bits)?; - offset_is_cluster_boundary(header.snapshots_offset, header.cluster_bits)?; + offset_is_cluster_boundary(header.l1_table_offset, header.cluster_bits) + .map_err(|e| BlockError::new(BlockErrorKind::CorruptImage, e))?; + offset_is_cluster_boundary(header.snapshots_offset, header.cluster_bits) + .map_err(|e| BlockError::new(BlockErrorKind::CorruptImage, e))?; // refcount table must be a cluster boundary, and within the file's virtual or actual size. - offset_is_cluster_boundary(header.refcount_table_offset, header.cluster_bits)?; - let file_size = file.metadata().map_err(Error::GettingFileSize)?.len(); + offset_is_cluster_boundary(header.refcount_table_offset, header.cluster_bits) + .map_err(|e| BlockError::new(BlockErrorKind::CorruptImage, e))?; + let file_size = file + .metadata() + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::GettingFileSize(e)))? 
+ .len(); if header.refcount_table_offset > max(file_size, header.size) { return Err(BlockError::new( BlockErrorKind::CorruptImage, @@ -437,12 +457,14 @@ pub(crate) fn parse_qcow( // this is an old file with broken refcounts, which requires a rebuild. let mut refcount_rebuild_required = true; file.seek(SeekFrom::Start(header.refcount_table_offset)) - .map_err(Error::SeekingFile)?; - let first_refblock_addr = u64::read_be(&mut file).map_err(Error::ReadingHeader)?; + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SeekingFile(e)))?; + let first_refblock_addr = u64::read_be(&mut file) + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::ReadingHeader(e)))?; if first_refblock_addr != 0 { file.seek(SeekFrom::Start(first_refblock_addr)) - .map_err(Error::SeekingFile)?; - let first_cluster_refcount = u16::read_be(&mut file).map_err(Error::ReadingHeader)?; + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SeekingFile(e)))?; + let first_cluster_refcount = u16::read_be(&mut file) + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::ReadingHeader(e)))?; if first_cluster_refcount != 0 { refcount_rebuild_required = false; } @@ -452,8 +474,8 @@ pub(crate) fn parse_qcow( refcount_rebuild_required = true; } - let mut raw_file = - QcowRawFile::from(file, cluster_size, refcount_bits).ok_or(Error::InvalidClusterSize)?; + let mut raw_file = QcowRawFile::from(file, cluster_size, refcount_bits) + .ok_or_else(|| BlockError::new(BlockErrorKind::InvalidFormat, Error::InvalidClusterSize))?; let is_writable = raw_file.file().is_writable(); if header.is_corrupt() { @@ -499,7 +521,7 @@ pub(crate) fn parse_qcow( num_l2_clusters, Some(L1_TABLE_OFFSET_MASK), ) - .map_err(Error::ReadingHeader)?, + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::ReadingHeader(e)))?, ); let num_clusters = div_round_up_u64(header.size, cluster_size); @@ -530,7 +552,7 @@ pub(crate) fn parse_qcow( cluster_size, refcount_bits, ) - .map_err(Error::ReadingRefCounts)?; + .map_err(|e| 
BlockError::new(BlockErrorKind::Io, Error::ReadingRefCounts(e)))?; let l2_entries = cluster_size / size_of::() as u64; @@ -539,23 +561,30 @@ pub(crate) fn parse_qcow( header .l1_table_offset .checked_add(l1_index * size_of::() as u64) - .ok_or(Error::InvalidL1TableOffset)?; + .ok_or_else(|| { + BlockError::new(BlockErrorKind::CorruptImage, Error::InvalidL1TableOffset) + })?; header .refcount_table_offset .checked_add(u64::from(header.refcount_table_clusters) * cluster_size) - .ok_or(Error::InvalidRefcountTableOffset)?; + .ok_or_else(|| { + BlockError::new( + BlockErrorKind::CorruptImage, + Error::InvalidRefcountTableOffset, + ) + })?; // Find available (refcount == 0) clusters for the free list. let file_size = raw_file .file_mut() .metadata() - .map_err(Error::GettingFileSize)? + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::GettingFileSize(e)))? .len(); let mut avail_clusters = Vec::new(); for i in (0..file_size).step_by(cluster_size as usize) { let refcount = refcounts .get_cluster_refcount(&mut raw_file, i) - .map_err(Error::GettingRefcount)?; + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::GettingRefcount(e)))?; if refcount == 0 { avail_clusters.push(i); } @@ -567,10 +596,17 @@ pub(crate) fn parse_qcow( { header .set_dirty_bit(raw_file.file_mut(), true) - .map_err(|e| Error::WritingHeader(io::Error::other(e)))?; + .map_err(|e| { + BlockError::new( + BlockErrorKind::Io, + Error::WritingHeader(io::Error::other(e)), + ) + })?; } - header.clear_autoclear_features(raw_file.file_mut())?; + header + .clear_autoclear_features(raw_file.file_mut()) + .map_err(|e| BlockError::new(BlockErrorKind::Io, e))?; } let inner = metadata::QcowState { From f6ec817b9b24f8724b1d15b7905d6afc1d86ec91 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Sat, 14 Mar 2026 00:00:14 +0100 Subject: [PATCH 244/742] block: qcow: Switch resize and grow_l1_table to BlockResult Switch resize and grow_l1_table from qcow::Result to BlockResult. 
All I/O error sites use explicit BlockError::new with the Io kind. The write_to call in grow_l1_table rewraps WritingHeader as ResizeIo to preserve the existing error semantics. The two resize tests that check for ShrinkNotSupported and ResizeWithBackingFile are updated to match on BlockErrorKind with downcast to inspect the underlying variant. Signed-off-by: Anatol Belski --- block/src/qcow/mod.rs | 58 +++++++++++++++++++++++++++++-------------- 1 file changed, 39 insertions(+), 19 deletions(-) diff --git a/block/src/qcow/mod.rs b/block/src/qcow/mod.rs index c926642e64..75afd268dc 100644 --- a/block/src/qcow/mod.rs +++ b/block/src/qcow/mod.rs @@ -868,7 +868,7 @@ impl QcowFile { /// if needed. Shrinking is not supported, as it could lead to data /// loss. Not supported when a backing file is present in that case /// an error is returned. - pub fn resize(&mut self, new_size: u64) -> Result<()> { + pub fn resize(&mut self, new_size: u64) -> BlockResult<()> { let current_size = self.virtual_size(); if new_size == current_size { @@ -876,11 +876,17 @@ impl QcowFile { } if new_size < current_size { - return Err(Error::ShrinkNotSupported); + return Err(BlockError::new( + BlockErrorKind::UnsupportedFeature, + Error::ShrinkNotSupported, + )); } if self.backing_file.is_some() { - return Err(Error::ResizeWithBackingFile); + return Err(BlockError::new( + BlockErrorKind::UnsupportedFeature, + Error::ResizeWithBackingFile, + )); } // Grow the L1 table if needed @@ -898,18 +904,20 @@ impl QcowFile { self.raw_file .file_mut() .rewind() - .map_err(Error::SeekingFile)?; + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SeekingFile(e)))?; self.header .write_to(self.raw_file.file_mut()) .map_err(|e| match e { - Error::WritingHeader(io_err) => Error::ResizeIo(io_err), - other => other, + Error::WritingHeader(io_err) => { + BlockError::new(BlockErrorKind::Io, Error::ResizeIo(io_err)) + } + other => BlockError::new(BlockErrorKind::Io, other), })?; self.raw_file .file_mut() 
.sync_all() - .map_err(Error::SyncingHeader)?; + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SyncingHeader(e)))?; Ok(()) } @@ -919,7 +927,7 @@ impl QcowFile { /// This allocates a new L1 table at file end (guaranteeing contiguity), /// copies existing entries, updates refcounts, and atomically switches /// to the new table. - fn grow_l1_table(&mut self, new_l1_size: u32) -> Result<()> { + fn grow_l1_table(&mut self, new_l1_size: u32) -> BlockResult<()> { let old_l1_size = self.header.l1_size; let old_l1_offset = self.header.l1_table_offset; let cluster_size = self.raw_file.cluster_size(); @@ -932,7 +940,7 @@ impl QcowFile { .raw_file .file_mut() .seek(SeekFrom::End(0)) - .map_err(Error::ResizeIo)?; + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::ResizeIo(e)))?; let new_l1_offset = self.raw_file.cluster_address(file_size + cluster_size - 1); // Extend file to fit all L1 clusters @@ -940,12 +948,12 @@ impl QcowFile { self.raw_file .file_mut() .set_len(new_file_end) - .map_err(Error::SettingFileSize)?; + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SettingFileSize(e)))?; // Set refcounts for the contiguous range for i in 0..new_l1_clusters { self.set_cluster_refcount(new_l1_offset + i * cluster_size, 1) - .map_err(Error::ResizeIo)?; + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::ResizeIo(e)))?; } let mut new_l1_data = vec![0u64; new_l1_size as usize]; @@ -957,7 +965,7 @@ impl QcowFile { let refcount = self .refcounts .get_cluster_refcount(&mut self.raw_file, *l2_addr) - .map_err(Error::GettingRefcount)?; + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::GettingRefcount(e)))?; *l2_addr = l1_entry_make(*l2_addr, refcount == 1); } } @@ -965,12 +973,12 @@ impl QcowFile { // Write the new L1 table to the file. 
self.raw_file .write_pointer_table_direct(new_l1_offset, new_l1_data.iter()) - .map_err(Error::ResizeIo)?; + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::ResizeIo(e)))?; self.raw_file .file_mut() .sync_all() - .map_err(Error::SyncingHeader)?; + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SyncingHeader(e)))?; self.header.l1_size = new_l1_size; self.header.l1_table_offset = new_l1_offset; @@ -978,13 +986,15 @@ impl QcowFile { self.raw_file .file_mut() .rewind() - .map_err(Error::SeekingFile)?; - self.header.write_to(self.raw_file.file_mut())?; + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SeekingFile(e)))?; + self.header + .write_to(self.raw_file.file_mut()) + .map_err(|e| BlockError::new(BlockErrorKind::Io, e))?; self.raw_file .file_mut() .sync_all() - .map_err(Error::SyncingHeader)?; + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SyncingHeader(e)))?; // Free old L1 table clusters let old_l1_bytes = old_l1_size as u64 * size_of::() as u64; @@ -3141,7 +3151,12 @@ mod unit_tests { let result = q.resize(smaller_size); assert!(result.is_err()); - assert!(matches!(result.unwrap_err(), Error::ShrinkNotSupported)); + let err = result.unwrap_err(); + assert!(matches!(err.kind(), BlockErrorKind::UnsupportedFeature)); + assert!(matches!( + err.downcast_ref::(), + Some(Error::ShrinkNotSupported) + )); assert_eq!(q.virtual_size(), original_size); }); @@ -3172,7 +3187,12 @@ mod unit_tests { let result = overlay.resize(backing_size * 2); assert!(result.is_err()); - assert!(matches!(result.unwrap_err(), Error::ResizeWithBackingFile)); + let err = result.unwrap_err(); + assert!(matches!(err.kind(), BlockErrorKind::UnsupportedFeature)); + assert!(matches!( + err.downcast_ref::(), + Some(Error::ResizeWithBackingFile) + )); assert_eq!(overlay.virtual_size(), backing_size); } From 9daf1782a87ba983a5c8cafe143e575c7a02be98 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Sat, 14 Mar 2026 00:31:37 +0100 Subject: [PATCH 245/742] block: qcow: 
Switch rebuild_refcounts to BlockResult Switch rebuild_refcounts from qcow::Result to BlockResult. The inner helper functions remain on qcow::Result since they are purely internal, and are wrapped with map_err at each call site where they cross the BlockResult boundary. InvalidRefcountTableSize errors are classified as CorruptImage since they indicate inconsistent internal refcount structures rather than a format violation. Signed-off-by: Anatol Belski --- block/src/qcow/mod.rs | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/block/src/qcow/mod.rs b/block/src/qcow/mod.rs index 75afd268dc..95df69d36f 100644 --- a/block/src/qcow/mod.rs +++ b/block/src/qcow/mod.rs @@ -1011,7 +1011,7 @@ impl QcowFile { } /// Rebuild the reference count tables. - fn rebuild_refcounts(raw_file: &mut QcowRawFile, header: QcowHeader) -> Result<()> { + fn rebuild_refcounts(raw_file: &mut QcowRawFile, header: QcowHeader) -> BlockResult<()> { fn add_ref( refcounts: &mut [u64], cluster_size: u64, @@ -1234,7 +1234,7 @@ impl QcowFile { let file_size = raw_file .file_mut() .metadata() - .map_err(Error::GettingFileSize)? + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::GettingFileSize(e)))? 
.len(); let refcount_bits = 1u64 << header.refcount_order; @@ -1264,25 +1264,33 @@ impl QcowFile { max_valid_cluster_index += refblocks_for_refs + reftable_clusters_for_refs; if max_valid_cluster_index > MAX_RAM_POINTER_TABLE_SIZE { - return Err(Error::InvalidRefcountTableSize(max_valid_cluster_index)); + return Err(BlockError::new( + BlockErrorKind::CorruptImage, + Error::InvalidRefcountTableSize(max_valid_cluster_index), + )); } let max_valid_cluster_offset = max_valid_cluster_index * cluster_size; if max_valid_cluster_offset < file_size - cluster_size { - return Err(Error::InvalidRefcountTableSize(max_valid_cluster_offset)); + return Err(BlockError::new( + BlockErrorKind::CorruptImage, + Error::InvalidRefcountTableSize(max_valid_cluster_offset), + )); } let mut refcounts = vec![0; max_valid_cluster_index as usize]; // Find all references clusters and rebuild refcounts. - set_header_refcount(&mut refcounts, cluster_size, max_refcount, refcount_bits)?; + set_header_refcount(&mut refcounts, cluster_size, max_refcount, refcount_bits) + .map_err(|e| BlockError::new(BlockErrorKind::Io, e))?; set_l1_refcounts( &mut refcounts, &header, cluster_size, max_refcount, refcount_bits, - )?; + ) + .map_err(|e| BlockError::new(BlockErrorKind::Io, e))?; set_data_refcounts( &mut refcounts, &header, @@ -1290,14 +1298,16 @@ impl QcowFile { raw_file, max_refcount, refcount_bits, - )?; + ) + .map_err(|e| BlockError::new(BlockErrorKind::Io, e))?; set_refcount_table_refcounts( &mut refcounts, &header, cluster_size, max_refcount, refcount_bits, - )?; + ) + .map_err(|e| BlockError::new(BlockErrorKind::Io, e))?; // Allocate clusters to store the new reference count blocks. let ref_table = alloc_refblocks( @@ -1306,7 +1316,8 @@ impl QcowFile { refblock_clusters, max_refcount, refcount_bits, - )?; + ) + .map_err(|e| BlockError::new(BlockErrorKind::Io, e))?; // Write updated reference counts and point the reftable at them. 
write_refblocks( @@ -1316,6 +1327,7 @@ impl QcowFile { raw_file, refcount_block_entries, ) + .map_err(|e| BlockError::new(BlockErrorKind::Io, e)) } // Limits the range so that it doesn't exceed the virtual size of the file. From 5410d4b2d568154069338bab25f88398c22a4e7b Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Sat, 14 Mar 2026 00:46:20 +0100 Subject: [PATCH 246/742] block: qcow: Switch detect_image_type to BlockResult Switch detect_image_type from qcow::Result to BlockResult with explicit error classification at every I/O site. This is the last function migrated before the From scaffolding can be removed. Signed-off-by: Anatol Belski --- block/src/qcow/mod.rs | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/block/src/qcow/mod.rs b/block/src/qcow/mod.rs index 95df69d36f..0c77b865cc 100644 --- a/block/src/qcow/mod.rs +++ b/block/src/qcow/mod.rs @@ -2286,8 +2286,7 @@ pub fn convert( dst_type: ImageType, src_max_nesting_depth: u32, ) -> BlockResult<()> { - let src_type = - detect_image_type(&mut src_file).map_err(|e| BlockError::new(BlockErrorKind::Io, e))?; + let src_type = detect_image_type(&mut src_file)?; match src_type { ImageType::Qcow2 => { let mut src_reader = @@ -2304,17 +2303,21 @@ pub fn convert( } /// Detect the type of an image file by checking for a valid qcow2 header. 
-pub fn detect_image_type(file: &mut RawFile) -> Result { - let orig_seek = file.stream_position().map_err(Error::SeekingFile)?; - file.rewind().map_err(Error::SeekingFile)?; - let magic = u32::read_be(file).map_err(Error::ReadingHeader)?; +pub fn detect_image_type(file: &mut RawFile) -> BlockResult { + let orig_seek = file + .stream_position() + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SeekingFile(e)))?; + file.rewind() + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SeekingFile(e)))?; + let magic = u32::read_be(file) + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::ReadingHeader(e)))?; let image_type = if magic == QCOW_MAGIC { ImageType::Qcow2 } else { ImageType::Raw }; file.seek(SeekFrom::Start(orig_seek)) - .map_err(Error::SeekingFile)?; + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SeekingFile(e)))?; Ok(image_type) } From fc79d08d7ddffa2a5ca743327e89e0e689f02b53 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Sat, 14 Mar 2026 01:07:31 +0100 Subject: [PATCH 247/742] block: qcow: Remove From for BlockError All public qcow functions now return BlockResult with explicit error classification at every site. The temporary From impl introduced in the first commit of this series is no longer needed and is removed. Internal functions in header.rs and the rebuild_refcounts helpers stay on qcow::Result. Classification happens at the call site boundary where qcow::Result meets BlockResult. Signed-off-by: Anatol Belski --- block/src/error.rs | 52 ---------------------------------------------- 1 file changed, 52 deletions(-) diff --git a/block/src/error.rs b/block/src/error.rs index b235b17e48..645057005e 100644 --- a/block/src/error.rs +++ b/block/src/error.rs @@ -242,56 +242,4 @@ impl From for BlockError { } } -/// Temporary scaffolding: classify a `qcow::Error` into the appropriate -/// `BlockErrorKind`. 
-/// -/// This impl exists only to allow an incremental migration of the qcow -/// parse/construct chain from `qcow::Result` to `BlockResult`. Each -/// subsequent commit replaces bare `?` sites with explicit -/// `BlockError::new(kind, e)` calls. Once every site is migrated this -/// impl will be removed. -impl From for BlockError { - fn from(e: crate::qcow::Error) -> Self { - use crate::qcow::Error as E; - let kind = match &e { - // Structural / format violations - E::InvalidMagic - | E::BackingFileTooLong(_) - | E::InvalidBackingFileName(_) - | E::InvalidClusterSize - | E::InvalidL1TableSize(_) - | E::InvalidL1TableOffset - | E::InvalidOffset(_) - | E::InvalidRefcountTableOffset - | E::InvalidRefcountTableSize(_) - | E::FileTooBig(_) - | E::NoRefcountClusters - | E::RefcountTableOffEnd - | E::RefcountTableTooLarge - | E::TooManyL1Entries(_) - | E::TooManyRefcounts(_) - | E::SizeTooSmallForNumberOfClusters => BlockErrorKind::InvalidFormat, - - // Unsupported features / versions - E::UnsupportedVersion(_) - | E::UnsupportedFeature(_) - | E::UnsupportedCompressionType - | E::UnsupportedBackingFileFormat(_) - | E::UnsupportedRefcountOrder - | E::BackingFilesDisabled - | E::ShrinkNotSupported => BlockErrorKind::UnsupportedFeature, - - // Corrupt image - E::CorruptImage => BlockErrorKind::CorruptImage, - - // Nesting depth overflow - E::MaxNestingDepthExceeded => BlockErrorKind::Overflow, - - // Everything else is I/O - _ => BlockErrorKind::Io, - }; - Self::new(kind, e) - } -} - pub type BlockResult = Result; From d3cad420a5c6714db13d0c9b222fe435548fbc10 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Sat, 21 Mar 2026 09:37:35 +0100 Subject: [PATCH 248/742] block: Validate segment count for DISCARD and WRITE_ZEROES requests Split the data length check into two conditions: - reject descriptors shorter than one virtio_blk_discard_write_zeroes segment, and - reject payloads exceeding MAX_DISCARD_WRITE_ZEROES_SEG segments Previously only the minimum length was checked 
and extra segments were silently dropped. Signed-off-by: Anatol Belski --- block/src/lib.rs | 18 +++++++++++++++--- virtio-devices/src/block.rs | 7 ++++--- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/block/src/lib.rs b/block/src/lib.rs index 288db3fbfd..b6c083cefb 100644 --- a/block/src/lib.rs +++ b/block/src/lib.rs @@ -70,7 +70,11 @@ use crate::vhdx::VhdxError; const SECTOR_SHIFT: u8 = 9; pub const SECTOR_SIZE: u64 = 0x01 << SECTOR_SHIFT; -/// Field offsets within `struct virtio_blk_discard_write_zeroes`. +/// Maximum number of segments per DISCARD or WRITE_ZEROES request. +pub const MAX_DISCARD_WRITE_ZEROES_SEG: u32 = 1; + +/// Size and field offsets within `struct virtio_blk_discard_write_zeroes`. +const DISCARD_WZ_SEG_SIZE: u32 = mem::size_of::() as u32; const DISCARD_WZ_SECTOR_OFFSET: u64 = mem::offset_of!(virtio_blk_discard_write_zeroes, sector) as u64; const DISCARD_WZ_NUM_SECTORS_OFFSET: u64 = @@ -105,6 +109,8 @@ pub enum Error { RawFileError(#[source] std::io::Error), #[error("The requested operation does not support multiple descriptors")] TooManyDescriptors, + #[error("Request contains too many segments")] + TooManySegments, #[error("Failure in vhdx")] VhdxError(#[source] VhdxError), } @@ -591,9 +597,12 @@ impl Request { return Err(ExecuteError::BadRequest(Error::TooManyDescriptors)); }; - if data_len < 16 { + if data_len < DISCARD_WZ_SEG_SIZE { return Err(ExecuteError::BadRequest(Error::DescriptorLengthTooSmall)); } + if data_len > DISCARD_WZ_SEG_SIZE * MAX_DISCARD_WRITE_ZEROES_SEG { + return Err(ExecuteError::BadRequest(Error::TooManySegments)); + } let mut discard_sector = [0u8; 8]; let mut discard_num_sectors = [0u8; 4]; @@ -630,9 +639,12 @@ impl Request { return Err(ExecuteError::BadRequest(Error::TooManyDescriptors)); }; - if data_len < 16 { + if data_len < DISCARD_WZ_SEG_SIZE { return Err(ExecuteError::BadRequest(Error::DescriptorLengthTooSmall)); } + if data_len > DISCARD_WZ_SEG_SIZE * MAX_DISCARD_WRITE_ZEROES_SEG { + return 
Err(ExecuteError::BadRequest(Error::TooManySegments)); + } let mut wz_sector = [0u8; 8]; let mut wz_num_sectors = [0u8; 4]; diff --git a/virtio-devices/src/block.rs b/virtio-devices/src/block.rs index edd1327f45..97d7c58c15 100644 --- a/virtio-devices/src/block.rs +++ b/virtio-devices/src/block.rs @@ -23,7 +23,8 @@ use block::disk_file::DiskBackend; use block::error::BlockError; use block::fcntl::{LockError, LockGranularity, LockGranularityChoice, LockType, get_lock_state}; use block::{ - ExecuteAsync, ExecuteError, Request, RequestType, VirtioBlockConfig, build_serial, fcntl, + ExecuteAsync, ExecuteError, MAX_DISCARD_WRITE_ZEROES_SEG, Request, RequestType, + VirtioBlockConfig, build_serial, fcntl, }; use event_monitor::event; use log::{debug, error, info, warn}; @@ -829,12 +830,12 @@ impl Block { if avail_features & (1u64 << VIRTIO_BLK_F_WRITE_ZEROES) != 0 { config.max_write_zeroes_sectors = u32::MAX; - config.max_write_zeroes_seg = 1; + config.max_write_zeroes_seg = MAX_DISCARD_WRITE_ZEROES_SEG; config.write_zeroes_may_unmap = if discard_supported { 1 } else { 0 }; } if avail_features & (1u64 << VIRTIO_BLK_F_DISCARD) != 0 { config.max_discard_sectors = u32::MAX; - config.max_discard_seg = 1; + config.max_discard_seg = MAX_DISCARD_WRITE_ZEROES_SEG; config.discard_sector_alignment = (logical_block_size / SECTOR_SIZE) as u32; } From c79f3acfabf352af75f8b2081768503812a5127a Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 20 Mar 2026 23:25:29 +0100 Subject: [PATCH 249/742] block: Validate sector range for DISCARD and WRITE_ZEROES requests Add range validation for DISCARD and WRITE_ZEROES, matching the existing check in the read/write path. Per virtio spec section 5.2.6.1, a driver must not submit a request which would cause a read or write beyond capacity. Use checked_add to guard against overflow, then compare against disk_nsectors. 
Without this, requests beyond device capacity pass through to the host punch_hole/write_zeroes calls, relying on backend-specific behavior rather than returning VIRTIO_BLK_S_IOERR consistently. Signed-off-by: Anatol Belski --- block/src/lib.rs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/block/src/lib.rs b/block/src/lib.rs index b6c083cefb..81b3f27c00 100644 --- a/block/src/lib.rs +++ b/block/src/lib.rs @@ -625,6 +625,13 @@ impl Request { let discard_num_sectors = u32::from_le_bytes(discard_num_sectors); + let top = discard_sector + .checked_add(discard_num_sectors as u64) + .ok_or(ExecuteError::BadRequest(Error::InvalidOffset))?; + if top > disk_nsectors { + return Err(ExecuteError::BadRequest(Error::InvalidOffset)); + } + let discard_offset = discard_sector * SECTOR_SIZE; let discard_length = (discard_num_sectors as u64) * SECTOR_SIZE; @@ -672,6 +679,14 @@ impl Request { if wz_offset == 0 && disable_sector0_writes { return Err(ExecuteError::BadRequest(Error::InvalidOffset)); } + + let top = wz_sector + .checked_add(wz_num_sectors as u64) + .ok_or(ExecuteError::BadRequest(Error::InvalidOffset))?; + if top > disk_nsectors { + return Err(ExecuteError::BadRequest(Error::InvalidOffset)); + } + let wz_length = (wz_num_sectors as u64) * SECTOR_SIZE; if wz_flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP != 0 { From 4f52e9355e922413047a35190041670800f767f9 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Sun, 22 Mar 2026 11:27:55 +0000 Subject: [PATCH 250/742] virtio-devices: seccomp: Allow fcntl unconditionally This is now required after Rust-VMM crate bumps. Also reorder some syscalls so that they are now in alphabetical order.
Signed-off-by: Rob Bradford --- virtio-devices/src/seccomp_filters.rs | 12 ++++-------- vmm/src/seccomp_filters.rs | 27 +++++++++++++-------------- 2 files changed, 17 insertions(+), 22 deletions(-) diff --git a/virtio-devices/src/seccomp_filters.rs b/virtio-devices/src/seccomp_filters.rs index 63d01a5d8d..f44fdc1b92 100644 --- a/virtio-devices/src/seccomp_filters.rs +++ b/virtio-devices/src/seccomp_filters.rs @@ -146,11 +146,11 @@ fn virtio_mem_thread_rules() -> Vec<(i64, Vec)> { fn virtio_net_thread_rules() -> Vec<(i64, Vec)> { vec![ + #[cfg(feature = "sev_snp")] + (libc::SYS_ioctl, create_mshv_sev_snp_ioctl_seccomp_rule()), (libc::SYS_readv, vec![]), (libc::SYS_timerfd_settime, vec![]), (libc::SYS_writev, vec![]), - #[cfg(feature = "sev_snp")] - (libc::SYS_ioctl, create_mshv_sev_snp_ioctl_seccomp_rule()), ] } @@ -254,14 +254,11 @@ fn virtio_vsock_thread_rules() -> Vec<(i64, Vec)> { vec![ (libc::SYS_accept4, vec![]), (libc::SYS_connect, vec![]), + (libc::SYS_fcntl, vec![]), (libc::SYS_ioctl, create_vsock_ioctl_seccomp_rule()), (libc::SYS_recvfrom, vec![]), (libc::SYS_sendto, vec![]), (libc::SYS_socket, vec![]), - // If debug_assertions is enabled, closing a file first checks - // whether the FD is valid with fcntl. 
- #[cfg(debug_assertions)] - (libc::SYS_fcntl, vec![]), ] } @@ -308,6 +305,7 @@ fn virtio_thread_common() -> Vec<(i64, Vec)> { #[cfg(target_arch = "x86_64")] (libc::SYS_epoll_wait, vec![]), (libc::SYS_exit, vec![]), + (libc::SYS_fcntl, vec![]), (libc::SYS_futex, vec![]), (libc::SYS_gettid, vec![]), (libc::SYS_madvise, vec![]), @@ -321,8 +319,6 @@ fn virtio_thread_common() -> Vec<(i64, Vec)> { (libc::SYS_rt_sigreturn, vec![]), (libc::SYS_sigaltstack, vec![]), (libc::SYS_write, vec![]), - #[cfg(debug_assertions)] - (libc::SYS_fcntl, vec![]), ] } diff --git a/vmm/src/seccomp_filters.rs b/vmm/src/seccomp_filters.rs index 938e7832f3..18b8ba097d 100644 --- a/vmm/src/seccomp_filters.rs +++ b/vmm/src/seccomp_filters.rs @@ -501,6 +501,7 @@ fn signal_handler_thread_rules() -> Result)>, Backend (libc::SYS_close, vec![]), (libc::SYS_exit, vec![]), (libc::SYS_exit_group, vec![]), + (libc::SYS_fcntl, vec![]), (libc::SYS_futex, vec![]), (libc::SYS_ioctl, create_signal_handler_ioctl_seccomp_rule()?), (libc::SYS_landlock_create_ruleset, vec![]), @@ -517,8 +518,6 @@ fn signal_handler_thread_rules() -> Result)>, Backend (libc::SYS_sendto, vec![]), (libc::SYS_sigaltstack, vec![]), (libc::SYS_write, vec![]), - #[cfg(debug_assertions)] - (libc::SYS_fcntl, vec![]), ]) } @@ -534,7 +533,10 @@ fn pty_foreground_thread_rules() -> Result)>, Backend Ok(vec![ (libc::SYS_close, vec![]), (libc::SYS_exit_group, vec![]), + (libc::SYS_fcntl, vec![]), + (libc::SYS_getcwd, vec![]), (libc::SYS_getpgid, vec![]), + (libc::SYS_gettid, vec![]), #[cfg(target_arch = "x86_64")] (libc::SYS_getpgrp, vec![]), (libc::SYS_ioctl, create_pty_foreground_ioctl_seccomp_rule()?), @@ -549,12 +551,8 @@ fn pty_foreground_thread_rules() -> Result)>, Backend (libc::SYS_rt_sigreturn, vec![]), (libc::SYS_sched_yield, vec![]), (libc::SYS_setsid, vec![]), - (libc::SYS_gettid, vec![]), (libc::SYS_sigaltstack, vec![]), (libc::SYS_write, vec![]), - #[cfg(debug_assertions)] - (libc::SYS_fcntl, vec![]), - (libc::SYS_getcwd, vec![]), 
]) } @@ -598,6 +596,7 @@ fn vmm_thread_rules( #[cfg(target_arch = "aarch64")] (libc::SYS_newfstatat, vec![]), (libc::SYS_futex, vec![]), + (libc::SYS_getcwd, vec![]), (libc::SYS_getdents64, vec![]), (libc::SYS_getpgid, vec![]), #[cfg(target_arch = "x86_64")] @@ -701,7 +700,6 @@ fn vmm_thread_rules( (libc::SYS_wait4, vec![]), (libc::SYS_write, vec![]), (libc::SYS_writev, vec![]), - (libc::SYS_getcwd, vec![]), ]) } @@ -801,11 +799,13 @@ fn vcpu_thread_rules( (libc::SYS_dup, vec![]), (libc::SYS_exit, vec![]), (libc::SYS_epoll_ctl, vec![]), + (libc::SYS_fcntl, vec![]), (libc::SYS_fstat, vec![]), - (libc::SYS_gettid, vec![]), (libc::SYS_futex, vec![]), + (libc::SYS_getcwd, vec![]), (libc::SYS_getrandom, vec![]), (libc::SYS_getpid, vec![]), + (libc::SYS_gettid, vec![]), ( libc::SYS_ioctl, create_vcpu_ioctl_seccomp_rule(hypervisor_type)?, @@ -842,8 +842,6 @@ fn vcpu_thread_rules( (libc::SYS_unlinkat, vec![]), (libc::SYS_write, vec![]), (libc::SYS_writev, vec![]), - (libc::SYS_fcntl, vec![]), - (libc::SYS_getcwd, vec![]), ]) } @@ -863,6 +861,7 @@ fn http_api_thread_rules() -> Result)>, BackendError> (libc::SYS_epoll_wait, vec![]), (libc::SYS_exit, vec![]), (libc::SYS_fcntl, vec![]), + (libc::SYS_getcwd, vec![]), (libc::SYS_gettid, vec![]), (libc::SYS_futex, vec![]), (libc::SYS_getrandom, vec![]), @@ -876,12 +875,11 @@ fn http_api_thread_rules() -> Result)>, BackendError> (libc::SYS_prctl, vec![]), (libc::SYS_recvfrom, vec![]), (libc::SYS_recvmsg, vec![]), + (libc::SYS_rt_sigprocmask, vec![]), (libc::SYS_sched_yield, vec![]), (libc::SYS_sendto, vec![]), (libc::SYS_sigaltstack, vec![]), (libc::SYS_write, vec![]), - (libc::SYS_rt_sigprocmask, vec![]), - (libc::SYS_getcwd, vec![]), ]) } @@ -899,7 +897,9 @@ fn dbus_api_thread_rules() -> Result)>, BackendError> (libc::SYS_epoll_ctl, vec![]), (libc::SYS_exit, vec![]), (libc::SYS_gettid, vec![]), + (libc::SYS_fcntl, vec![]), (libc::SYS_futex, vec![]), + (libc::SYS_getcwd, vec![]), (libc::SYS_getrandom, vec![]), (libc::SYS_madvise, 
vec![]), (libc::SYS_mmap, vec![]), @@ -915,7 +915,6 @@ fn dbus_api_thread_rules() -> Result)>, BackendError> (libc::SYS_set_robust_list, vec![]), (libc::SYS_sigaltstack, vec![]), (libc::SYS_write, vec![]), - (libc::SYS_getcwd, vec![]), ]) } @@ -923,6 +922,7 @@ fn event_monitor_thread_rules() -> Result)>, BackendE Ok(vec![ (libc::SYS_brk, vec![]), (libc::SYS_close, vec![]), + (libc::SYS_getcwd, vec![]), (libc::SYS_gettid, vec![]), (libc::SYS_futex, vec![]), (libc::SYS_landlock_create_ruleset, vec![]), @@ -932,7 +932,6 @@ fn event_monitor_thread_rules() -> Result)>, BackendE (libc::SYS_prctl, vec![]), (libc::SYS_sched_yield, vec![]), (libc::SYS_write, vec![]), - (libc::SYS_getcwd, vec![]), ]) } From e05065f509dedf4c9477ca0a47c12c1c3440efd5 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Thu, 19 Mar 2026 17:20:59 -0700 Subject: [PATCH 251/742] build: Bump rust-vmm dependencies Bump to the released versions that are compatible wherever possible but for the vhost and vfio crates they are git hashes as no releases with compatible versions have yet been made. 
Signed-off-by: Rob Bradford --- Cargo.lock | 244 ++++++++---------- Cargo.toml | 22 +- arch/src/aarch64/uefi.rs | 2 +- fuzz/Cargo.lock | 170 +++++------- fuzz/Cargo.toml | 12 +- hypervisor/src/kvm/mod.rs | 2 +- pci/src/vfio.rs | 35 +-- vhost_user_block/src/lib.rs | 24 +- vhost_user_net/src/lib.rs | 23 +- virtio-devices/src/console.rs | 2 +- virtio-devices/src/lib.rs | 4 +- virtio-devices/src/rng.rs | 2 +- virtio-devices/src/vhost_user/mod.rs | 3 +- .../src/vhost_user/vu_common_ctrl.rs | 11 +- vmm/src/device_manager.rs | 17 +- vmm/src/memory_manager.rs | 24 +- vmm/src/vm.rs | 6 +- 17 files changed, 272 insertions(+), 331 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1c02d2604f..2fb1cfe887 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -95,7 +95,7 @@ version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" dependencies = [ - "windows-sys 0.61.0", + "windows-sys 0.61.2", ] [[package]] @@ -106,7 +106,7 @@ checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys 0.61.0", + "windows-sys 0.61.2", ] [[package]] @@ -119,7 +119,7 @@ checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" name = "api_client" version = "0.1.0" dependencies = [ - "thiserror 2.0.18", + "thiserror", "vmm-sys-util", ] @@ -144,7 +144,7 @@ dependencies = [ "linux-loader", "log", "serde", - "thiserror 2.0.18", + "thiserror", "uuid", "vm-fdt", "vm-memory", @@ -204,7 +204,7 @@ dependencies = [ "polling", "rustix", "slab", - "windows-sys 0.61.0", + "windows-sys 0.61.2", ] [[package]] @@ -262,7 +262,7 @@ dependencies = [ "rustix", "signal-hook-registry", "slab", - "windows-sys 0.61.0", + "windows-sys 0.61.2", ] [[package]] @@ -357,7 +357,7 @@ dependencies = [ "remain", "serde", "smallvec", - "thiserror 2.0.18", + "thiserror", "uuid", "virtio-bindings", "virtio-queue", @@ -471,7 +471,7 @@ 
dependencies = [ "serde_json", "signal-hook", "test_infra", - "thiserror 2.0.18", + "thiserror", "tpm", "tracer", "vm-memory", @@ -597,7 +597,7 @@ dependencies = [ "num_enum", "pci", "serde", - "thiserror 2.0.18", + "thiserror", "tpm", "vm-allocator", "vm-device", @@ -641,7 +641,7 @@ dependencies = [ "libc", "option-ext", "redox_users", - "windows-sys 0.61.0", + "windows-sys 0.61.2", ] [[package]] @@ -723,7 +723,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.61.0", + "windows-sys 0.61.2", ] [[package]] @@ -764,7 +764,7 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" dependencies = [ - "getrandom 0.2.15", + "getrandom 0.2.17", ] [[package]] @@ -934,9 +934,9 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.15" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" +checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" dependencies = [ "cfg-if", "js-sys", @@ -1040,7 +1040,7 @@ dependencies = [ "serde", "serde_json", "serde_with", - "thiserror 2.0.18", + "thiserror", "vfio-ioctls", "vm-memory", "vmm-sys-util", @@ -1081,7 +1081,7 @@ dependencies = [ "open-enum", "range_map_vec", "static_assertions", - "thiserror 2.0.18", + "thiserror", "tracing", "zerocopy", ] @@ -1147,9 +1147,9 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.17" +version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" [[package]] name = "jiff" @@ -1187,9 +1187,9 @@ dependencies = [ [[package]] name = 
"js-sys" -version = "0.3.77" +version = "0.3.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f" +checksum = "b49715b7073f385ba4bc528e5747d02e66cb39c6146efb66b781f131f0fb399c" dependencies = [ "once_cell", "wasm-bindgen", @@ -1197,9 +1197,9 @@ dependencies = [ [[package]] name = "kvm-bindings" -version = "0.12.1" +version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a537873e15e8daabb416667e606d9b0abc2a8fb9a45bd5853b888ae0ead82f9" +checksum = "4b3c06ff73c7ce03e780887ec2389d62d2a2a9ddf471ab05c2ff69207cd3f3b4" dependencies = [ "serde", "vmm-sys-util", @@ -1208,9 +1208,9 @@ dependencies = [ [[package]] name = "kvm-ioctls" -version = "0.22.1" +version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c8f7370330b4f57981e300fa39b02088f2f2a5c2d0f1f994e8090589619c56d" +checksum = "333f77a20344a448f3f70664918135fddeb804e938f28a99d685bd92926e0b19" dependencies = [ "bitflags 2.11.0", "kvm-bindings", @@ -1226,7 +1226,7 @@ checksum = "49fefd6652c57d68aaa32544a4c0e642929725bdc1fd929367cdeb673ab81088" dependencies = [ "enumflags2", "libc", - "thiserror 2.0.18", + "thiserror", ] [[package]] @@ -1284,9 +1284,9 @@ dependencies = [ [[package]] name = "linux-loader" -version = "0.13.1" +version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53802c0b111faf302a16fa20a2e3a33bd0eab408f60fc34cbfe052f6b153791e" +checksum = "de72cb02c55ecffcf75fe78295926f872eb6eb0a58d629c58a8c324dc26380f6" dependencies = [ "vm-memory", ] @@ -1336,7 +1336,7 @@ dependencies = [ [[package]] name = "micro_http" version = "0.1.0" -source = "git+https://github.com/firecracker-microvm/micro-http?branch=main#3248ceeae41461d034624b582d5d358cd6e6f89f" +source = "git+https://github.com/firecracker-microvm/micro-http?branch=main#5c2254d6cf4f32a668d0d8e57ba20bebad9d4fba" dependencies = [ "libc", 
"vmm-sys-util", @@ -1380,7 +1380,7 @@ checksum = "f035616abe1e4cbc026a1a8094ff8d3900f5063fe6608309098bc745926fdfd8" dependencies = [ "libc", "mshv-bindings", - "thiserror 2.0.18", + "thiserror", "vmm-sys-util", ] @@ -1405,7 +1405,7 @@ dependencies = [ "rate_limiter", "serde", "serde_json", - "thiserror 2.0.18", + "thiserror", "virtio-bindings", "virtio-queue", "vm-memory", @@ -1523,7 +1523,7 @@ checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" name = "option_parser" version = "0.1.0" dependencies = [ - "thiserror 2.0.18", + "thiserror", ] [[package]] @@ -1581,7 +1581,7 @@ dependencies = [ "libc", "log", "serde", - "thiserror 2.0.18", + "thiserror", "vfio-bindings", "vfio-ioctls", "vfio_user", @@ -1603,7 +1603,7 @@ dependencies = [ "serde", "serde_json", "test_infra", - "thiserror 2.0.18", + "thiserror", "vmm-sys-util", ] @@ -1732,7 +1732,7 @@ dependencies = [ "hermit-abi", "pin-project-lite", "rustix", - "windows-sys 0.61.0", + "windows-sys 0.61.2", ] [[package]] @@ -1829,7 +1829,7 @@ dependencies = [ "epoll", "libc", "log", - "thiserror 2.0.18", + "thiserror", "vmm-sys-util", ] @@ -1848,9 +1848,9 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a4e608c6638b9c18977b00b475ac1f28d14e84b27d8d42f70e0bf1e3dec127ac" dependencies = [ - "getrandom 0.2.15", + "getrandom 0.2.17", "libredox", - "thiserror 2.0.18", + "thiserror", ] [[package]] @@ -1915,7 +1915,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys", - "windows-sys 0.61.0", + "windows-sys 0.61.2", ] [[package]] @@ -2123,7 +2123,7 @@ dependencies = [ "getrandom 0.4.2", "once_cell", "rustix", - "windows-sys 0.61.0", + "windows-sys 0.61.2", ] [[package]] @@ -2146,38 +2146,18 @@ dependencies = [ "rand", "serde_json", "ssh2", - "thiserror 2.0.18", + "thiserror", "vmm-sys-util", "wait-timeout", ] -[[package]] -name = "thiserror" -version = "1.0.62" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"f2675633b1499176c2dff06b0856a27976a8f9d436737b4cf4f312d4d91d8bbb" -dependencies = [ - "thiserror-impl 1.0.62", -] - [[package]] name = "thiserror" version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" dependencies = [ - "thiserror-impl 2.0.18", -] - -[[package]] -name = "thiserror-impl" -version = "1.0.62" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d20468752b09f49e909e55a5d338caa8bedf615594e9d80bc4c565d30faf798c" -dependencies = [ - "proc-macro2", - "quote", - "syn", + "thiserror-impl", ] [[package]] @@ -2199,32 +2179,32 @@ checksum = "3bf63baf9f5039dadc247375c29eb13706706cfde997d0330d05aa63a77d8820" [[package]] name = "toml_datetime" -version = "1.0.0+spec-1.1.0" +version = "1.0.1+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32c2555c699578a4f59f0cc68e5116c8d7cabbd45e1409b989d4be085b53f13e" +checksum = "9b320e741db58cac564e26c607d3cc1fdc4a88fd36c879568c07856ed83ff3e9" dependencies = [ "serde_core", ] [[package]] name = "toml_edit" -version = "0.25.4+spec-1.1.0" +version = "0.25.5+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7193cbd0ce53dc966037f54351dbbcf0d5a642c7f0038c382ef9e677ce8c13f2" +checksum = "8ca1a40644a28bce036923f6a431df0b34236949d111cc07cb6dca830c9ef2e1" dependencies = [ "indexmap", "toml_datetime", "toml_parser", - "winnow", + "winnow 1.0.0", ] [[package]] name = "toml_parser" -version = "1.0.9+spec-1.1.0" +version = "1.0.10+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "702d4415e08923e7e1ef96cd5727c0dfed80b4d2fa25db9647fe5eb6f7c5a4c4" +checksum = "7df25b4befd31c4816df190124375d5a20c6b6921e2cad937316de3fccd63420" dependencies = [ - "winnow", + "winnow 1.0.0", ] [[package]] @@ -2235,7 +2215,7 @@ dependencies = [ "libc", "log", "net_gen", - "thiserror 2.0.18", + "thiserror", 
"vmm-sys-util", ] @@ -2288,7 +2268,7 @@ checksum = "f2f6fb2847f6742cd76af783a2a2c49e9375d0a111c7bef6f71cd9e738c72d6e" dependencies = [ "memoffset", "tempfile", - "windows-sys 0.61.0", + "windows-sys 0.61.2", ] [[package]] @@ -2330,18 +2310,16 @@ checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" [[package]] name = "vfio-bindings" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "698c66a4522a31ab407a410a59c9660da036178e4fe3f371825cd6aad7d46837" +version = "0.6.1" +source = "git+https://github.com/rust-vmm/vfio?rev=df861a878168ad71602d8a1945bd3b7acbd22693#df861a878168ad71602d8a1945bd3b7acbd22693" dependencies = [ "vmm-sys-util", ] [[package]] name = "vfio-ioctls" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7af7e8d49719333e5eb52209417f26695c9ab2b117a82596a63a44947f97c5d6" +version = "0.5.2" +source = "git+https://github.com/rust-vmm/vfio?rev=df861a878168ad71602d8a1945bd3b7acbd22693#df861a878168ad71602d8a1945bd3b7acbd22693" dependencies = [ "byteorder", "kvm-bindings", @@ -2350,7 +2328,7 @@ dependencies = [ "log", "mshv-bindings", "mshv-ioctls", - "thiserror 2.0.18", + "thiserror", "vfio-bindings", "vm-memory", "vmm-sys-util", @@ -2358,9 +2336,8 @@ dependencies = [ [[package]] name = "vfio_user" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8db5bc783aad75202ad4cbcdc5e893cff1dd8fa24a1bcdb4de8998d3c4d169a" +version = "0.1.2" +source = "git+https://github.com/rust-vmm/vfio?rev=df861a878168ad71602d8a1945bd3b7acbd22693#df861a878168ad71602d8a1945bd3b7acbd22693" dependencies = [ "bitflags 2.11.0", "libc", @@ -2368,7 +2345,7 @@ dependencies = [ "serde", "serde_derive", "serde_json", - "thiserror 2.0.18", + "thiserror", "vfio-bindings", "vm-memory", "vmm-sys-util", @@ -2376,9 +2353,8 @@ dependencies = [ [[package]] name = "vhost" -version = "0.14.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a4dcad85a129d97d5d4b2f3c47a4affdeedd76bdcd02094bcb5d9b76cac2d05" +version = "0.15.0" +source = "git+https://github.com/rust-vmm/vhost?rev=c9b80a1c93bac7820e4aee4269aa904568937035#c9b80a1c93bac7820e4aee4269aa904568937035" dependencies = [ "bitflags 2.11.0", "libc", @@ -2389,9 +2365,8 @@ dependencies = [ [[package]] name = "vhost-user-backend" -version = "0.20.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e183205a9ba7cb9c47fcb0fc0a07fc295a110efbb11ab78ad0d793b0a38a7bde" +version = "0.21.0" +source = "git+https://github.com/rust-vmm/vhost?rev=c9b80a1c93bac7820e4aee4269aa904568937035#c9b80a1c93bac7820e4aee4269aa904568937035" dependencies = [ "libc", "log", @@ -2412,7 +2387,7 @@ dependencies = [ "libc", "log", "option_parser", - "thiserror 2.0.18", + "thiserror", "vhost", "vhost-user-backend", "virtio-bindings", @@ -2432,7 +2407,7 @@ dependencies = [ "log", "net_util", "option_parser", - "thiserror 2.0.18", + "thiserror", "vhost", "vhost-user-backend", "virtio-bindings", @@ -2442,9 +2417,9 @@ dependencies = [ [[package]] name = "virtio-bindings" -version = "0.2.6" +version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "804f498a26d5a63be7bbb8bdcd3869c3f286c4c4a17108905276454da0caf8cb" +checksum = "091f1f09cfbf2a78563b562e7a949465cce1aef63b6065645188d995162f8868" [[package]] name = "virtio-devices" @@ -2466,7 +2441,7 @@ dependencies = [ "serde", "serde_with", "serial_buffer", - "thiserror 2.0.18", + "thiserror", "vhost", "virtio-bindings", "virtio-queue", @@ -2480,10 +2455,11 @@ dependencies = [ [[package]] name = "virtio-queue" -version = "0.16.0" +version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb0479158f863e59323771a1f684d843962f76960b86fecfec2bfa9c8f0f9180" +checksum = "e358084f32ed165fddb41d98ff1b7ff3c08b9611d8d6114a1b422e2e85688baf" dependencies = [ + "libc", "log", 
"virtio-bindings", "vm-memory", @@ -2505,7 +2481,7 @@ version = "0.1.0" dependencies = [ "hypervisor", "serde", - "thiserror 2.0.18", + "thiserror", "vfio-ioctls", "vm-memory", "vmm-sys-util", @@ -2519,13 +2495,13 @@ checksum = "7e21282841a059bb62627ce8441c491f09603622cd5a21c43bfedc85a2952f23" [[package]] name = "vm-memory" -version = "0.16.2" +version = "0.17.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fd5e56d48353c5f54ef50bd158a0452fc82f5383da840f7b8efc31695dd3b9d" +checksum = "f39348a049689cabd3377cdd9182bf526ec76a6f823b79903896452e9d7a7380" dependencies = [ "arc-swap", "libc", - "thiserror 1.0.62", + "thiserror", "winapi", ] @@ -2537,7 +2513,7 @@ dependencies = [ "itertools", "serde", "serde_json", - "thiserror 2.0.18", + "thiserror", "vm-memory", ] @@ -2589,7 +2565,7 @@ dependencies = [ "serde_json", "serial_buffer", "signal-hook", - "thiserror 2.0.18", + "thiserror", "tracer", "uuid", "vfio-ioctls", @@ -2609,9 +2585,9 @@ dependencies = [ [[package]] name = "vmm-sys-util" -version = "0.14.0" +version = "0.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d21f366bf22bfba3e868349978766a965cbe628c323d58e026be80b8357ab789" +checksum = "506c62fdf617a5176827c2f9afbcf1be155b03a9b4bf9617a60dbc07e3a1642f" dependencies = [ "bitflags 1.3.2", "libc", @@ -2654,35 +2630,22 @@ dependencies = [ [[package]] name = "wasm-bindgen" -version = "0.2.100" +version = "0.2.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" +checksum = "6532f9a5c1ece3798cb1c2cfdba640b9b3ba884f5db45973a6f442510a87d38e" dependencies = [ "cfg-if", "once_cell", "rustversion", "wasm-bindgen-macro", -] - -[[package]] -name = "wasm-bindgen-backend" -version = "0.2.100" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" -dependencies = [ - "bumpalo", 
- "log", - "proc-macro2", - "quote", - "syn", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-macro" -version = "0.2.100" +version = "0.2.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" +checksum = "18a2d50fcf105fb33bb15f00e7a77b772945a2ee45dcf454961fd843e74c18e6" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -2690,22 +2653,22 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.100" +version = "0.2.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" +checksum = "03ce4caeaac547cdf713d280eda22a730824dd11e6b8c3ca9e42247b25c631e3" dependencies = [ + "bumpalo", "proc-macro2", "quote", "syn", - "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.100" +version = "0.2.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" +checksum = "75a326b8c223ee17883a4251907455a2431acc2791c98c26279376490c378c16" dependencies = [ "unicode-ident", ] @@ -2783,9 +2746,9 @@ dependencies = [ [[package]] name = "windows-sys" -version = "0.61.0" +version = "0.61.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e201184e40b2ede64bc2ea34968b28e33622acdbbf37104f0e4a33f7abe657aa" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" dependencies = [ "windows-link", ] @@ -2864,6 +2827,15 @@ dependencies = [ "memchr", ] +[[package]] +name = "winnow" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a90e88e4667264a994d34e6d1ab2d26d398dcdca8b7f52bec8668957517fc7d8" +dependencies = [ + "memchr", +] + [[package]] name = "wit-bindgen" version = "0.51.0" @@ -2980,8 +2952,8 @@ dependencies = [ "tracing", 
"uds_windows", "uuid", - "windows-sys 0.61.0", - "winnow", + "windows-sys 0.61.2", + "winnow 0.7.15", "zbus_macros", "zbus_names", "zvariant", @@ -3009,24 +2981,24 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ffd8af6d5b78619bab301ff3c560a5bd22426150253db278f164d6cf3b72c50f" dependencies = [ "serde", - "winnow", + "winnow 0.7.15", "zvariant", ] [[package]] name = "zerocopy" -version = "0.8.42" +version = "0.8.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2578b716f8a7a858b7f02d5bd870c14bf4ddbbcf3a4c05414ba6503640505e3" +checksum = "efbb2a062be311f2ba113ce66f697a4dc589f85e78a4aea276200804cea0ed87" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.42" +version = "0.8.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e6cc098ea4d3bd6246687de65af3f920c430e236bee1e3bf2e441463f08a02f" +checksum = "0e8bc7269b54418e7aeeef514aa68f8690b8c0489a06b0136e5f57c4c5ccab89" dependencies = [ "proc-macro2", "quote", @@ -3076,7 +3048,7 @@ dependencies = [ "endi", "enumflags2", "serde", - "winnow", + "winnow 0.7.15", "zvariant_derive", "zvariant_utils", ] @@ -3104,5 +3076,5 @@ dependencies = [ "quote", "serde", "syn", - "winnow", + "winnow 0.7.15", ] diff --git a/Cargo.toml b/Cargo.toml index b8fc54d9e1..bd2a53cad2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -54,22 +54,22 @@ resolver = "3" [workspace.dependencies] # rust-vmm crates acpi_tables = "0.2.0" -kvm-bindings = "0.12.1" -kvm-ioctls = "0.22.1" -linux-loader = "0.13.1" +kvm-bindings = "0.14.0" +kvm-ioctls = "0.24.0" +linux-loader = "0.13.2" mshv-bindings = "0.6.7" mshv-ioctls = "0.6.7" seccompiler = "0.5.0" -vfio-bindings = { version = "0.6.0", default-features = false } -vfio-ioctls = { version = "0.5.1", default-features = false } -vfio_user = { version = "0.1.1", default-features = false } -vhost = { version = "0.14.0", default-features = false } -vhost-user-backend = { version = 
"0.20.0", default-features = false } +vfio-bindings = { git = "https://github.com/rust-vmm/vfio", rev = "df861a878168ad71602d8a1945bd3b7acbd22693", default-features = false } +vfio-ioctls = { git = "https://github.com/rust-vmm/vfio", rev = "df861a878168ad71602d8a1945bd3b7acbd22693", default-features = false } +vfio_user = { git = "https://github.com/rust-vmm/vfio", rev = "df861a878168ad71602d8a1945bd3b7acbd22693", default-features = false } +vhost = { git = "https://github.com/rust-vmm/vhost", rev = "c9b80a1c93bac7820e4aee4269aa904568937035", default-features = false } +vhost-user-backend = { git = "https://github.com/rust-vmm/vhost", rev = "c9b80a1c93bac7820e4aee4269aa904568937035", default-features = false } virtio-bindings = "0.2.6" -virtio-queue = "0.16.0" +virtio-queue = "0.17.0" vm-fdt = "0.3.0" -vm-memory = "0.16.1" -vmm-sys-util = "0.14.0" +vm-memory = "0.17.1" +vmm-sys-util = "0.15.0" # igvm crates igvm = "0.4.0" diff --git a/arch/src/aarch64/uefi.rs b/arch/src/aarch64/uefi.rs index bd40e36ff0..2ff3a8638f 100644 --- a/arch/src/aarch64/uefi.rs +++ b/arch/src/aarch64/uefi.rs @@ -7,7 +7,7 @@ use std::os::fd::AsFd; use std::result; use thiserror::Error; -use vm_memory::{GuestAddress, GuestMemory}; +use vm_memory::{Bytes, GuestAddress, GuestMemory}; /// Errors thrown while loading UEFI binary #[derive(Debug, Error)] diff --git a/fuzz/Cargo.lock b/fuzz/Cargo.lock index e03789eaf4..4dfe9600e5 100644 --- a/fuzz/Cargo.lock +++ b/fuzz/Cargo.lock @@ -100,7 +100,7 @@ dependencies = [ "linux-loader", "log", "serde", - "thiserror 2.0.18", + "thiserror", "uuid", "vm-fdt", "vm-memory", @@ -149,7 +149,7 @@ dependencies = [ "remain", "serde", "smallvec", - "thiserror 2.0.18", + "thiserror", "uuid", "virtio-bindings", "virtio-queue", @@ -352,7 +352,7 @@ dependencies = [ "num_enum", "pci", "serde", - "thiserror 2.0.18", + "thiserror", "tpm", "vm-allocator", "vm-device", @@ -589,7 +589,7 @@ dependencies = [ "serde", "serde_json", "serde_with", - "thiserror 2.0.18", + 
"thiserror", "vfio-ioctls", "vm-memory", "vmm-sys-util", @@ -646,9 +646,9 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.17" +version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" [[package]] name = "jobserver" @@ -662,9 +662,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.77" +version = "0.3.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f" +checksum = "b49715b7073f385ba4bc528e5747d02e66cb39c6146efb66b781f131f0fb399c" dependencies = [ "once_cell", "wasm-bindgen", @@ -672,9 +672,9 @@ dependencies = [ [[package]] name = "kvm-bindings" -version = "0.12.1" +version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a537873e15e8daabb416667e606d9b0abc2a8fb9a45bd5853b888ae0ead82f9" +checksum = "4b3c06ff73c7ce03e780887ec2389d62d2a2a9ddf471ab05c2ff69207cd3f3b4" dependencies = [ "serde", "vmm-sys-util", @@ -683,9 +683,9 @@ dependencies = [ [[package]] name = "kvm-ioctls" -version = "0.22.1" +version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c8f7370330b4f57981e300fa39b02088f2f2a5c2d0f1f994e8090589619c56d" +checksum = "333f77a20344a448f3f70664918135fddeb804e938f28a99d685bd92926e0b19" dependencies = [ "bitflags 2.11.0", "kvm-bindings", @@ -701,7 +701,7 @@ checksum = "49fefd6652c57d68aaa32544a4c0e642929725bdc1fd929367cdeb673ab81088" dependencies = [ "enumflags2", "libc", - "thiserror 2.0.18", + "thiserror", ] [[package]] @@ -734,9 +734,9 @@ dependencies = [ [[package]] name = "linux-loader" -version = "0.13.1" +version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53802c0b111faf302a16fa20a2e3a33bd0eab408f60fc34cbfe052f6b153791e" 
+checksum = "de72cb02c55ecffcf75fe78295926f872eb6eb0a58d629c58a8c324dc26380f6" dependencies = [ "vm-memory", ] @@ -771,7 +771,7 @@ checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" [[package]] name = "micro_http" version = "0.1.0" -source = "git+https://github.com/firecracker-microvm/micro-http?branch=main#3248ceeae41461d034624b582d5d358cd6e6f89f" +source = "git+https://github.com/firecracker-microvm/micro-http?branch=main#5c2254d6cf4f32a668d0d8e57ba20bebad9d4fba" dependencies = [ "libc", "vmm-sys-util", @@ -819,7 +819,7 @@ dependencies = [ "net_gen", "rate_limiter", "serde", - "thiserror 2.0.18", + "thiserror", "virtio-bindings", "virtio-queue", "vm-memory", @@ -894,7 +894,7 @@ dependencies = [ name = "option_parser" version = "0.1.0" dependencies = [ - "thiserror 2.0.18", + "thiserror", ] [[package]] @@ -913,7 +913,7 @@ dependencies = [ "libc", "log", "serde", - "thiserror 2.0.18", + "thiserror", "vfio-bindings", "vfio-ioctls", "vfio_user", @@ -1003,7 +1003,7 @@ dependencies = [ "epoll", "libc", "log", - "thiserror 2.0.18", + "thiserror", "vmm-sys-util", ] @@ -1178,33 +1178,13 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "thiserror" -version = "1.0.64" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d50af8abc119fb8bb6dbabcfa89656f46f84aa0ac7688088608076ad2b459a84" -dependencies = [ - "thiserror-impl 1.0.64", -] - [[package]] name = "thiserror" version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" dependencies = [ - "thiserror-impl 2.0.18", -] - -[[package]] -name = "thiserror-impl" -version = "1.0.64" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08904e7672f5eb876eaaf87e0ce17857500934f4981c4a0ab2b4aa98baac7fc3" -dependencies = [ - "proc-macro2", - "quote", - "syn", + "thiserror-impl", ] [[package]] @@ -1220,18 +1200,18 @@ dependencies = [ [[package]] 
name = "toml_datetime" -version = "1.0.0+spec-1.1.0" +version = "1.0.1+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32c2555c699578a4f59f0cc68e5116c8d7cabbd45e1409b989d4be085b53f13e" +checksum = "9b320e741db58cac564e26c607d3cc1fdc4a88fd36c879568c07856ed83ff3e9" dependencies = [ "serde_core", ] [[package]] name = "toml_edit" -version = "0.25.4+spec-1.1.0" +version = "0.25.5+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7193cbd0ce53dc966037f54351dbbcf0d5a642c7f0038c382ef9e677ce8c13f2" +checksum = "8ca1a40644a28bce036923f6a431df0b34236949d111cc07cb6dca830c9ef2e1" dependencies = [ "indexmap", "toml_datetime", @@ -1241,9 +1221,9 @@ dependencies = [ [[package]] name = "toml_parser" -version = "1.0.9+spec-1.1.0" +version = "1.0.10+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "702d4415e08923e7e1ef96cd5727c0dfed80b4d2fa25db9647fe5eb6f7c5a4c4" +checksum = "7df25b4befd31c4816df190124375d5a20c6b6921e2cad937316de3fccd63420" dependencies = [ "winnow", ] @@ -1256,7 +1236,7 @@ dependencies = [ "libc", "log", "net_gen", - "thiserror 2.0.18", + "thiserror", "vmm-sys-util", ] @@ -1302,25 +1282,23 @@ dependencies = [ [[package]] name = "vfio-bindings" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "698c66a4522a31ab407a410a59c9660da036178e4fe3f371825cd6aad7d46837" +version = "0.6.1" +source = "git+https://github.com/rust-vmm/vfio?rev=df861a878168ad71602d8a1945bd3b7acbd22693#df861a878168ad71602d8a1945bd3b7acbd22693" dependencies = [ "vmm-sys-util", ] [[package]] name = "vfio-ioctls" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7af7e8d49719333e5eb52209417f26695c9ab2b117a82596a63a44947f97c5d6" +version = "0.5.2" +source = "git+https://github.com/rust-vmm/vfio?rev=df861a878168ad71602d8a1945bd3b7acbd22693#df861a878168ad71602d8a1945bd3b7acbd22693" dependencies 
= [ "byteorder", "kvm-bindings", "kvm-ioctls", "libc", "log", - "thiserror 2.0.18", + "thiserror", "vfio-bindings", "vm-memory", "vmm-sys-util", @@ -1328,9 +1306,8 @@ dependencies = [ [[package]] name = "vfio_user" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8db5bc783aad75202ad4cbcdc5e893cff1dd8fa24a1bcdb4de8998d3c4d169a" +version = "0.1.2" +source = "git+https://github.com/rust-vmm/vfio?rev=df861a878168ad71602d8a1945bd3b7acbd22693#df861a878168ad71602d8a1945bd3b7acbd22693" dependencies = [ "bitflags 2.11.0", "libc", @@ -1338,7 +1315,7 @@ dependencies = [ "serde", "serde_derive", "serde_json", - "thiserror 2.0.18", + "thiserror", "vfio-bindings", "vm-memory", "vmm-sys-util", @@ -1346,9 +1323,8 @@ dependencies = [ [[package]] name = "vhost" -version = "0.14.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a4dcad85a129d97d5d4b2f3c47a4affdeedd76bdcd02094bcb5d9b76cac2d05" +version = "0.15.0" +source = "git+https://github.com/rust-vmm/vhost?rev=c9b80a1c93bac7820e4aee4269aa904568937035#c9b80a1c93bac7820e4aee4269aa904568937035" dependencies = [ "bitflags 2.11.0", "libc", @@ -1359,9 +1335,9 @@ dependencies = [ [[package]] name = "virtio-bindings" -version = "0.2.6" +version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "804f498a26d5a63be7bbb8bdcd3869c3f286c4c4a17108905276454da0caf8cb" +checksum = "091f1f09cfbf2a78563b562e7a949465cce1aef63b6065645188d995162f8868" [[package]] name = "virtio-devices" @@ -1382,7 +1358,7 @@ dependencies = [ "serde", "serde_with", "serial_buffer", - "thiserror 2.0.18", + "thiserror", "vhost", "virtio-bindings", "virtio-queue", @@ -1396,10 +1372,11 @@ dependencies = [ [[package]] name = "virtio-queue" -version = "0.16.0" +version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb0479158f863e59323771a1f684d843962f76960b86fecfec2bfa9c8f0f9180" +checksum = 
"e358084f32ed165fddb41d98ff1b7ff3c08b9611d8d6114a1b422e2e85688baf" dependencies = [ + "libc", "log", "virtio-bindings", "vm-memory", @@ -1421,7 +1398,7 @@ version = "0.1.0" dependencies = [ "hypervisor", "serde", - "thiserror 2.0.18", + "thiserror", "vfio-ioctls", "vm-memory", "vmm-sys-util", @@ -1435,13 +1412,13 @@ checksum = "7e21282841a059bb62627ce8441c491f09603622cd5a21c43bfedc85a2952f23" [[package]] name = "vm-memory" -version = "0.16.2" +version = "0.17.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fd5e56d48353c5f54ef50bd158a0452fc82f5383da840f7b8efc31695dd3b9d" +checksum = "f39348a049689cabd3377cdd9182bf526ec76a6f823b79903896452e9d7a7380" dependencies = [ "arc-swap", "libc", - "thiserror 1.0.64", + "thiserror", "winapi", ] @@ -1453,7 +1430,7 @@ dependencies = [ "itertools", "serde", "serde_json", - "thiserror 2.0.18", + "thiserror", "vm-memory", ] @@ -1497,7 +1474,7 @@ dependencies = [ "serde_json", "serial_buffer", "signal-hook", - "thiserror 2.0.18", + "thiserror", "tracer", "uuid", "vfio-ioctls", @@ -1516,9 +1493,9 @@ dependencies = [ [[package]] name = "vmm-sys-util" -version = "0.14.0" +version = "0.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d21f366bf22bfba3e868349978766a965cbe628c323d58e026be80b8357ab789" +checksum = "506c62fdf617a5176827c2f9afbcf1be155b03a9b4bf9617a60dbc07e3a1642f" dependencies = [ "bitflags 1.3.2", "libc", @@ -1552,35 +1529,22 @@ dependencies = [ [[package]] name = "wasm-bindgen" -version = "0.2.100" +version = "0.2.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" +checksum = "6532f9a5c1ece3798cb1c2cfdba640b9b3ba884f5db45973a6f442510a87d38e" dependencies = [ "cfg-if", "once_cell", "rustversion", "wasm-bindgen-macro", -] - -[[package]] -name = "wasm-bindgen-backend" -version = "0.2.100" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum 
= "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" -dependencies = [ - "bumpalo", - "log", - "proc-macro2", - "quote", - "syn", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-macro" -version = "0.2.100" +version = "0.2.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" +checksum = "18a2d50fcf105fb33bb15f00e7a77b772945a2ee45dcf454961fd843e74c18e6" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -1588,22 +1552,22 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.100" +version = "0.2.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" +checksum = "03ce4caeaac547cdf713d280eda22a730824dd11e6b8c3ca9e42247b25c631e3" dependencies = [ + "bumpalo", "proc-macro2", "quote", "syn", - "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.100" +version = "0.2.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" +checksum = "75a326b8c223ee17883a4251907455a2431acc2791c98c26279376490c378c16" dependencies = [ "unicode-ident", ] @@ -1681,9 +1645,9 @@ dependencies = [ [[package]] name = "winnow" -version = "0.7.15" +version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df79d97927682d2fd8adb29682d1140b343be4ac0f08fd68b7765d9c059d3945" +checksum = "a90e88e4667264a994d34e6d1ab2d26d398dcdca8b7f52bec8668957517fc7d8" dependencies = [ "memchr", ] @@ -1778,18 +1742,18 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.8.42" +version = "0.8.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2578b716f8a7a858b7f02d5bd870c14bf4ddbbcf3a4c05414ba6503640505e3" +checksum = 
"efbb2a062be311f2ba113ce66f697a4dc589f85e78a4aea276200804cea0ed87" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.42" +version = "0.8.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e6cc098ea4d3bd6246687de65af3f920c430e236bee1e3bf2e441463f08a02f" +checksum = "0e8bc7269b54418e7aeeef514aa68f8690b8c0489a06b0136e5f57c4c5ccab89" dependencies = [ "proc-macro2", "quote", diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index 88d31a152a..80c59be630 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -19,23 +19,23 @@ pvmemcontrol = [] arbitrary = "1.4.2" block = { path = "../block" } devices = { path = "../devices" } -epoll = "4.3.3" +epoll = "4.4.0" hypervisor = { path = "../hypervisor", features = ["mshv_emulator"] } libc = "0.2.183" libfuzzer-sys = "0.4.12" -linux-loader = { version = "0.13.1", features = ["bzimage", "elf", "pe"] } +linux-loader = { version = "0.13.2", features = ["bzimage", "elf", "pe"] } micro_http = { git = "https://github.com/firecracker-microvm/micro-http", branch = "main" } -mshv-bindings = "0.6.6" +mshv-bindings = "0.6.7" net_util = { path = "../net_util" } seccompiler = "0.5.0" virtio-devices = { path = "../virtio-devices" } -virtio-queue = "0.16.0" +virtio-queue = "0.17.0" vm-device = { path = "../vm-device" } -vm-memory = "0.16.0" +vm-memory = "0.17.1" vm-migration = { path = "../vm-migration" } vm-virtio = { path = "../vm-virtio" } vmm = { path = "../vmm", features = ["guest_debug"] } -vmm-sys-util = "0.14.0" +vmm-sys-util = "0.15.0" # Prevent this from interfering with workspaces [workspace] diff --git a/hypervisor/src/kvm/mod.rs b/hypervisor/src/kvm/mod.rs index 1c1abd4b68..d18785fd90 100644 --- a/hypervisor/src/kvm/mod.rs +++ b/hypervisor/src/kvm/mod.rs @@ -2991,7 +2991,7 @@ impl KvmVcpu { let maybe_size = self .fd - .get_nested_state(&mut buffer) + .nested_state(&mut buffer) .map_err(|e| cpu::HypervisorCpuError::GetNestedState(e.into()))?; if let Some(_size) = 
maybe_size { diff --git a/pci/src/vfio.rs b/pci/src/vfio.rs index e46e276aa6..ee4e43e31e 100644 --- a/pci/src/vfio.rs +++ b/pci/src/vfio.rs @@ -1709,10 +1709,8 @@ impl VfioPciDevice { unsafe { self.container.vfio_dma_map( user_memory_region.start, - user_memory_region.mapping.len().try_into().unwrap(), - (user_memory_region.mapping.addr() as usize) - .try_into() - .unwrap(), + user_memory_region.mapping.len(), + user_memory_region.mapping.addr(), ) } .map_err(|e| VfioPciError::DmaMap(e, self.device_path.clone(), self.bdf))?; @@ -1734,7 +1732,7 @@ impl VfioPciDevice { if !self.iommu_attached && let Err(e) = self .container - .vfio_dma_unmap(user_memory_region.start, len.try_into().unwrap()) + .vfio_dma_unmap(user_memory_region.start, len) .map_err(|e| VfioPciError::DmaUnmap(e, self.device_path.clone(), self.bdf)) { error!( @@ -1892,7 +1890,7 @@ impl PciDevice for VfioPciDevice { if !self.iommu_attached && let Err(e) = self .container - .vfio_dma_unmap(user_memory_region.start, len.try_into().unwrap()) + .vfio_dma_unmap(user_memory_region.start, len) .map_err(|e| { VfioPciError::DmaUnmap(e, self.device_path.clone(), self.bdf) }) @@ -1950,11 +1948,8 @@ iova 0x{:x}, size 0x{:x}: {}, ", // host_addr points to len bytes of // valid memory that will only be unmapped with munmap(). 
unsafe { - self.container.vfio_dma_map( - user_memory_region.start, - len.try_into().unwrap(), - (host_addr as usize).try_into().unwrap(), - ) + self.container + .vfio_dma_map(user_memory_region.start, len, host_addr) } .map_err(|e| VfioPciError::DmaMap(e, self.device_path.clone(), self.bdf)) .map_err(|e| { @@ -2069,11 +2064,7 @@ impl ExternalDmaMapping for VfioDmaMapping ExternalDmaMapping for VfioDmaMapping std::result::Result<(), io::Error> { - self.container.vfio_dma_unmap(iova, size).map_err(|e| { - io::Error::other(format!( - "failed to unmap memory for VFIO container, \ + self.container + .vfio_dma_unmap(iova, size as usize) + .map_err(|e| { + io::Error::other(format!( + "failed to unmap memory for VFIO container, \ iova 0x{iova:x}, size 0x{size:x}: {e:?}" - )) - }) + )) + }) } } diff --git a/vhost_user_block/src/lib.rs b/vhost_user_block/src/lib.rs index 13668e1cdb..9b0e429ee4 100644 --- a/vhost_user_block/src/lib.rs +++ b/vhost_user_block/src/lib.rs @@ -12,6 +12,7 @@ use std::fs::{File, OpenOptions}; use std::io::{Read, Seek, SeekFrom, Write}; use std::ops::{Deref, DerefMut}; use std::os::unix::fs::OpenOptionsExt; +use std::os::unix::io::{FromRawFd, IntoRawFd}; use std::path::PathBuf; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::{Arc, Mutex, RwLock, RwLockWriteGuard}; @@ -34,6 +35,7 @@ use virtio_bindings::virtio_ring::VIRTIO_RING_F_EVENT_IDX; use virtio_queue::QueueT; use vm_memory::{ByteValued, Bytes, GuestAddressSpace, GuestMemoryAtomic}; use vmm_sys_util::epoll::EventSet; +use vmm_sys_util::event::{EventConsumer, EventNotifier}; use vmm_sys_util::eventfd::EventFd; type GuestMemoryMmap = vm_memory::GuestMemoryMmap; @@ -423,15 +425,15 @@ impl VhostUserBackendMut for VhostUserBlkBackend { Ok(()) } - fn exit_event(&self, thread_index: usize) -> Option { - Some( - self.threads[thread_index] - .lock() - .unwrap() - .kill_evt - .try_clone() - .unwrap(), - ) + fn exit_event(&self, thread_index: usize) -> Option<(EventConsumer, EventNotifier)> 
{ + let kill_evt = &self.threads[thread_index].lock().unwrap().kill_evt; + // SAFETY: kill_evt is a valid eventfd + unsafe { + Some(( + EventConsumer::from_raw_fd(kill_evt.try_clone().unwrap().into_raw_fd()), + EventNotifier::from_raw_fd(kill_evt.try_clone().unwrap().into_raw_fd()), + )) + } } fn queues_per_thread(&self) -> Vec { @@ -533,14 +535,14 @@ pub fn start_block_backend(backend_command: &str) { debug!("blk_backend is created!\n"); - let listener = Listener::new(&backend_config.socket, true).unwrap(); + let mut listener = Listener::new(&backend_config.socket, true).unwrap(); let name = "vhost-user-blk-backend"; let mut blk_daemon = VhostUserDaemon::new(name.to_string(), blk_backend.clone(), mem).unwrap(); debug!("blk_daemon is created!\n"); - if let Err(e) = blk_daemon.start(listener) { + if let Err(e) = blk_daemon.start(&mut listener) { error!("Failed to start daemon for vhost-user-block with error: {e:?}\n"); process::exit(1); } diff --git a/vhost_user_net/src/lib.rs b/vhost_user_net/src/lib.rs index 0e89a763a8..254058abd3 100644 --- a/vhost_user_net/src/lib.rs +++ b/vhost_user_net/src/lib.rs @@ -8,7 +8,7 @@ use std::net::{IpAddr, Ipv4Addr}; use std::ops::Deref; -use std::os::unix::io::{AsRawFd, RawFd}; +use std::os::unix::io::{AsRawFd, FromRawFd, IntoRawFd, RawFd}; use std::sync::{Arc, Mutex, RwLock}; use std::{io, process}; @@ -27,6 +27,7 @@ use virtio_bindings::virtio_config::{VIRTIO_F_NOTIFY_ON_EMPTY, VIRTIO_F_VERSION_ use virtio_bindings::virtio_net::*; use vm_memory::{GuestAddressSpace, GuestMemoryAtomic}; use vmm_sys_util::epoll::EventSet; +use vmm_sys_util::event::{EventConsumer, EventNotifier}; use vmm_sys_util::eventfd::EventFd; type GuestMemoryMmap = vm_memory::GuestMemoryMmap; @@ -249,15 +250,15 @@ impl VhostUserBackendMut for VhostUserNetBackend { Ok(()) } - fn exit_event(&self, thread_index: usize) -> Option { - Some( - self.threads[thread_index] - .lock() - .unwrap() - .kill_evt - .try_clone() - .unwrap(), - ) + fn exit_event(&self, 
thread_index: usize) -> Option<(EventConsumer, EventNotifier)> { + let kill_evt = &self.threads[thread_index].lock().unwrap().kill_evt; + // SAFETY: kill_evt is a valid eventfd + unsafe { + Some(( + EventConsumer::from_raw_fd(kill_evt.try_clone().unwrap().into_raw_fd()), + EventNotifier::from_raw_fd(kill_evt.try_clone().unwrap().into_raw_fd()), + )) + } } fn queues_per_thread(&self) -> Vec { @@ -394,7 +395,7 @@ pub fn start_net_backend(backend_command: &str) { if let Err(e) = if backend_config.client { net_daemon.start_client(&backend_config.socket) } else { - net_daemon.start(Listener::new(&backend_config.socket, true).unwrap()) + net_daemon.start(&mut Listener::new(&backend_config.socket, true).unwrap()) } { error!("failed to start daemon for vhost-user-net with error: {e:?}"); process::exit(1); diff --git a/virtio-devices/src/console.rs b/virtio-devices/src/console.rs index 74b42c46b5..96282b5228 100644 --- a/virtio-devices/src/console.rs +++ b/virtio-devices/src/console.rs @@ -18,7 +18,7 @@ use serde::{Deserialize, Serialize}; use serial_buffer::SerialBuffer; use thiserror::Error; use virtio_queue::{Queue, QueueT}; -use vm_memory::{ByteValued, Bytes, GuestAddressSpace, GuestMemory, GuestMemoryAtomic}; +use vm_memory::{ByteValued, Bytes, GuestAddressSpace, GuestMemoryAtomic}; use vm_migration::{Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable}; use vm_virtio::{AccessPlatform, Translatable}; use vmm_sys_util::eventfd::EventFd; diff --git a/virtio-devices/src/lib.rs b/virtio-devices/src/lib.rs index f69ff5579e..d57673ad34 100644 --- a/virtio-devices/src/lib.rs +++ b/virtio-devices/src/lib.rs @@ -168,9 +168,7 @@ pub fn get_host_address_range( if mem.check_range(addr, size) { let slice = mem.get_slice(addr, size).unwrap(); assert!(slice.len() >= size); - // TODO: return a VolatileSlice and fix all callers. 
- #[allow(deprecated)] - Some(slice.as_ptr()) + Some(slice.ptr_guard_mut().as_ptr()) } else { None } diff --git a/virtio-devices/src/rng.rs b/virtio-devices/src/rng.rs index 16a539f923..2133cb79ee 100644 --- a/virtio-devices/src/rng.rs +++ b/virtio-devices/src/rng.rs @@ -17,7 +17,7 @@ use seccompiler::SeccompAction; use serde::{Deserialize, Serialize}; use thiserror::Error; use virtio_queue::{Queue, QueueT}; -use vm_memory::{GuestAddressSpace, GuestMemory, GuestMemoryAtomic}; +use vm_memory::{Bytes, GuestAddressSpace, GuestMemoryAtomic}; use vm_migration::{Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable}; use vm_virtio::{AccessPlatform, Translatable}; use vmm_sys_util::eventfd::EventFd; diff --git a/virtio-devices/src/vhost_user/mod.rs b/virtio-devices/src/vhost_user/mod.rs index 0dad19acea..aca9aba113 100644 --- a/virtio-devices/src/vhost_user/mod.rs +++ b/virtio-devices/src/vhost_user/mod.rs @@ -17,8 +17,9 @@ use vhost::vhost_user::message::{ }; use vhost::vhost_user::{FrontendReqHandler, VhostUserFrontendReqHandler}; use virtio_queue::{Error as QueueError, Queue}; +use vm_memory::guest_memory::Error as MmapError; use vm_memory::mmap::MmapRegionError; -use vm_memory::{Address, Error as MmapError, GuestAddressSpace, GuestMemory, GuestMemoryAtomic}; +use vm_memory::{Address, GuestAddressSpace, GuestMemory, GuestMemoryAtomic}; use vm_migration::protocol::MemoryRangeTable; use vm_migration::{MigratableError, Snapshot}; use vmm_sys_util::eventfd::EventFd; diff --git a/virtio-devices/src/vhost_user/vu_common_ctrl.rs b/virtio-devices/src/vhost_user/vu_common_ctrl.rs index 264635149c..b603463fb4 100644 --- a/virtio-devices/src/vhost_user/vu_common_ctrl.rs +++ b/virtio-devices/src/vhost_user/vu_common_ctrl.rs @@ -21,9 +21,8 @@ use vhost::vhost_user::{ use vhost::{VhostBackend, VhostUserDirtyLogRegion, VhostUserMemoryRegionInfo, VringConfigData}; use virtio_queue::desc::RawDescriptor; use virtio_queue::{Queue, QueueT}; -use vm_memory::{ - 
Address, Error as MmapError, FileOffset, GuestAddress, GuestMemory, GuestMemoryRegion, -}; +use vm_memory::guest_memory::Error as MmapError; +use vm_memory::{Address, FileOffset, GuestAddress, GuestMemory, GuestMemoryRegion}; use vm_migration::protocol::MemoryRangeTable; use vmm_sys_util::eventfd::EventFd; @@ -67,7 +66,11 @@ impl VhostUserHandle { for region in mem.iter() { let (mmap_handle, mmap_offset) = match region.file_offset() { Some(_file_offset) => (_file_offset.file().as_raw_fd(), _file_offset.start()), - None => return Err(Error::VhostUserMemoryRegion(MmapError::NoMemoryRegion)), + None => { + return Err(Error::VhostUserMemoryRegion( + MmapError::InvalidGuestAddress(region.start_addr()), + )); + } }; let vhost_user_net_reg = VhostUserMemoryRegionInfo { diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 51a5e476a7..50b82772d0 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -91,7 +91,7 @@ use seccompiler::SeccompAction; use serde::{Deserialize, Serialize}; use thiserror::Error; use tracer::trace_scoped; -use vfio_ioctls::{VfioContainer, VfioDevice, VfioDeviceFd}; +use vfio_ioctls::{VfioContainer, VfioDevice, VfioDeviceFd, VfioOps}; use virtio_devices::transport::{VirtioPciDevice, VirtioPciDeviceActivator, VirtioTransport}; use virtio_devices::vhost_user::VhostUserConfig; use virtio_devices::{ @@ -3867,8 +3867,11 @@ impl DeviceManager { vfio_container }; - let vfio_device = VfioDevice::new(&device_cfg.path, Arc::clone(&vfio_container)) - .map_err(DeviceManagerError::VfioCreate)?; + let vfio_device = VfioDevice::new( + &device_cfg.path, + Arc::clone(&vfio_container) as Arc, + ) + .map_err(DeviceManagerError::VfioCreate)?; if needs_dma_mapping { // Register DMA mapping in IOMMU. 
@@ -3884,8 +3887,8 @@ impl DeviceManager { unsafe { vfio_container.vfio_dma_map( region.start_addr().raw_value(), - region.len(), - region.as_ptr() as u64, + region.len() as usize, + region.as_ptr(), ) } .map_err(DeviceManagerError::VfioDmaMap)?; @@ -4519,8 +4522,8 @@ impl DeviceManager { unsafe { vfio_container.vfio_dma_map( new_region.start_addr().raw_value(), - new_region.len(), - new_region.as_ptr() as u64, + new_region.len() as usize, + new_region.as_ptr(), ) } .map_err(DeviceManagerError::UpdateMemoryForVfioPciDevice)?; diff --git a/vmm/src/memory_manager.rs b/vmm/src/memory_manager.rs index ba0313b29a..c8f64b15ad 100644 --- a/vmm/src/memory_manager.rs +++ b/vmm/src/memory_manager.rs @@ -35,10 +35,10 @@ use vm_allocator::GsiApic; use vm_allocator::{AddressAllocator, MemorySlotAllocator, SystemAllocator}; use vm_device::BusDevice; use vm_memory::bitmap::AtomicBitmap; -use vm_memory::guest_memory::FileOffset; +use vm_memory::guest_memory::{Error as MmapError, FileOffset}; use vm_memory::mmap::MmapRegionError; use vm_memory::{ - Address, Error as MmapError, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, + Address, Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, GuestMemoryError, GuestMemoryRegion, GuestUsize, MmapRegion, ReadVolatile, }; use vm_migration::protocol::{MemoryRange, MemoryRangeTable}; @@ -251,6 +251,10 @@ pub enum Error { #[error("Mmap backed guest memory error")] GuestMemory(#[source] MmapError), + /// Guest region collection error + #[error("Guest region collection error")] + GuestRegionCollection(#[source] vm_memory::GuestRegionCollectionError), + /// Failed to allocate a memory range. 
#[error("Failed to allocate a memory range")] MemoryRangeAllocation, @@ -1448,7 +1452,7 @@ impl MemoryManager { config.thp, )?; let guest_memory = - GuestMemoryMmap::from_arc_regions(regions).map_err(Error::GuestMemory)?; + GuestMemoryMmap::from_arc_regions(regions).map_err(Error::GuestRegionCollection)?; let boot_guest_memory = guest_memory.clone(); ( GuestAddress(data.start_of_device_area), @@ -1485,8 +1489,8 @@ impl MemoryManager { let (mem_regions, mut memory_zones) = Self::create_memory_regions_from_zones(&ram_regions, &zones, prefault, config.thp)?; - let mut guest_memory = - GuestMemoryMmap::from_arc_regions(mem_regions).map_err(Error::GuestMemory)?; + let mut guest_memory = GuestMemoryMmap::from_arc_regions(mem_regions) + .map_err(Error::GuestRegionCollection)?; let boot_guest_memory = guest_memory.clone(); @@ -1534,7 +1538,7 @@ impl MemoryManager { guest_memory = guest_memory .insert_region(Arc::clone(®ion)) - .map_err(Error::GuestMemory)?; + .map_err(Error::GuestRegionCollection)?; let hotplugged_size = zone.hotplugged_size.unwrap_or(0); let region_size = region.len(); @@ -1961,9 +1965,9 @@ impl MemoryManager { thp, )?; - Ok(Arc::new( - GuestRegionMmap::new(r, start_addr).map_err(Error::GuestMemory)?, - )) + Ok(Arc::new(GuestRegionMmap::new(r, start_addr).ok_or( + Error::GuestMemory(MmapError::InvalidGuestAddress(start_addr)), + )?)) } // Duplicate of `memory_zone_get_align_size` that does not require a `zone` @@ -2024,7 +2028,7 @@ impl MemoryManager { .guest_memory .memory() .insert_region(region) - .map_err(Error::GuestMemory)?; + .map_err(Error::GuestRegionCollection)?; self.guest_memory.lock().unwrap().replace(guest_memory); Ok(()) diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index 710847fab3..430c9bdc74 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -65,10 +65,10 @@ use thiserror::Error; use tracer::trace_scoped; use vm_device::Bus; #[cfg(feature = "tdx")] +use vm_memory::GuestMemory; +#[cfg(feature = "tdx")] use vm_memory::{Address, ByteValued, 
GuestMemoryRegion, ReadVolatile}; -use vm_memory::{ - Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, WriteVolatile, -}; +use vm_memory::{Bytes, GuestAddress, GuestAddressSpace, GuestMemoryAtomic, WriteVolatile}; use vm_migration::protocol::{MemoryRangeTable, Request, Response}; use vm_migration::{ Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable, snapshot_from_id, From 3a56f20ee156a1c550dae98f5f57e3b3f8deb965 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Sun, 22 Mar 2026 18:52:26 +0000 Subject: [PATCH 252/742] vmm: seccomp: Add readlink/readlinkat to vcpu filter Now needed for IOMMU group resolution with VFIO. Fixes: #7876 Signed-off-by: Rob Bradford --- vmm/src/seccomp_filters.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vmm/src/seccomp_filters.rs b/vmm/src/seccomp_filters.rs index 18b8ba097d..d295761518 100644 --- a/vmm/src/seccomp_filters.rs +++ b/vmm/src/seccomp_filters.rs @@ -824,6 +824,10 @@ fn vcpu_thread_rules( (libc::SYS_pread64, vec![]), (libc::SYS_pwrite64, vec![]), (libc::SYS_read, vec![]), + #[cfg(target_arch = "x86_64")] + (libc::SYS_readlink, vec![]), + #[cfg(target_arch = "aarch64")] + (libc::SYS_readlinkat, vec![]), (libc::SYS_recvfrom, vec![]), (libc::SYS_recvmsg, vec![]), (libc::SYS_rt_sigaction, vec![]), From 2b28c5b15ac0f9733dcf556695fb1ab98fc0b220 Mon Sep 17 00:00:00 2001 From: Julian Schindel Date: Fri, 13 Mar 2026 15:11:14 +0100 Subject: [PATCH 253/742] net_gen: replace net_gen with libc The libc crate provides all functionality provided by the net_gen crate. Removing the net_gen crate reduces the maintenance burden. The switch to libc required some fixes, most notably the switch from a `Vec` to a `CString` for the `net_util::Tap.if_name` field. 
On-behalf-of: SAP julian.schindel@sap.com Signed-off-by: Julian Schindel --- Cargo.lock | 9 - Cargo.toml | 1 - fuzz/Cargo.lock | 9 - fuzz/fuzz_targets/net.rs | 2 +- net_gen/Cargo.toml | 13 - net_gen/src/if_tun.rs | 327 ---------- net_gen/src/iff.rs | 1228 -------------------------------------- net_gen/src/inn.rs | 294 --------- net_gen/src/ipv6.rs | 41 -- net_gen/src/lib.rs | 66 -- net_gen/src/sockios.rs | 95 --- net_util/Cargo.toml | 1 - net_util/src/lib.rs | 28 +- net_util/src/open_tap.rs | 6 +- net_util/src/tap.rs | 228 +++---- tpm/Cargo.toml | 1 - tpm/src/emulator.rs | 10 +- 17 files changed, 142 insertions(+), 2217 deletions(-) delete mode 100644 net_gen/Cargo.toml delete mode 100644 net_gen/src/if_tun.rs delete mode 100644 net_gen/src/iff.rs delete mode 100644 net_gen/src/inn.rs delete mode 100644 net_gen/src/ipv6.rs delete mode 100644 net_gen/src/lib.rs delete mode 100644 net_gen/src/sockios.rs diff --git a/Cargo.lock b/Cargo.lock index 2fb1cfe887..048c911371 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1384,13 +1384,6 @@ dependencies = [ "vmm-sys-util", ] -[[package]] -name = "net_gen" -version = "0.1.0" -dependencies = [ - "vmm-sys-util", -] - [[package]] name = "net_util" version = "0.1.0" @@ -1399,7 +1392,6 @@ dependencies = [ "getrandom 0.4.2", "libc", "log", - "net_gen", "pnet", "pnet_datalink", "rate_limiter", @@ -2214,7 +2206,6 @@ dependencies = [ "anyhow", "libc", "log", - "net_gen", "thiserror", "vmm-sys-util", ] diff --git a/Cargo.toml b/Cargo.toml index bd2a53cad2..39a79ba815 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,7 +22,6 @@ members = [ "devices", "event_monitor", "hypervisor", - "net_gen", "net_util", "option_parser", "pci", diff --git a/fuzz/Cargo.lock b/fuzz/Cargo.lock index 4dfe9600e5..475f0b8344 100644 --- a/fuzz/Cargo.lock +++ b/fuzz/Cargo.lock @@ -801,13 +801,6 @@ dependencies = [ "zerocopy", ] -[[package]] -name = "net_gen" -version = "0.1.0" -dependencies = [ - "vmm-sys-util", -] - [[package]] name = "net_util" version = "0.1.0" 
@@ -816,7 +809,6 @@ dependencies = [ "getrandom 0.4.2", "libc", "log", - "net_gen", "rate_limiter", "serde", "thiserror", @@ -1235,7 +1227,6 @@ dependencies = [ "anyhow", "libc", "log", - "net_gen", "thiserror", "vmm-sys-util", ] diff --git a/fuzz/fuzz_targets/net.rs b/fuzz/fuzz_targets/net.rs index efc9605806..55c98bdcfc 100644 --- a/fuzz/fuzz_targets/net.rs +++ b/fuzz/fuzz_targets/net.rs @@ -64,7 +64,7 @@ fuzz_target!(|bytes: &[u8]| -> Corpus { } let (dummy_tap_frontend, dummy_tap_backend) = create_socketpair().unwrap(); - let if_name = "fuzzer_tap_name".as_bytes().to_vec(); + let if_name = "fuzzer_tap_name"; let tap = net_util::Tap::new_for_fuzzing(dummy_tap_frontend, if_name); let mut net = virtio_devices::Net::new_with_tap( diff --git a/net_gen/Cargo.toml b/net_gen/Cargo.toml deleted file mode 100644 index dfbcee2af9..0000000000 --- a/net_gen/Cargo.toml +++ /dev/null @@ -1,13 +0,0 @@ -[package] -authors = ["The Chromium OS Authors"] -edition = "2021" -#edition.workspace = true -name = "net_gen" -rust-version.workspace = true -version = "0.1.0" - -[dependencies] -vmm-sys-util = { workspace = true } - -[lints] -workspace = true diff --git a/net_gen/src/if_tun.rs b/net_gen/src/if_tun.rs deleted file mode 100644 index ab9f327b94..0000000000 --- a/net_gen/src/if_tun.rs +++ /dev/null @@ -1,327 +0,0 @@ -// Copyright © 2021 Intel Corporation -// -// SPDX-License-Identifier: Apache-2.0 -// - -// bindgen /usr/include/linux/if_tun.h --no-layout-tests - -/* automatically generated by rust-bindgen 0.58.1 */ - -#[repr(C)] -#[derive(Default)] -pub struct __IncompleteArrayField(::std::marker::PhantomData, [T; 0]); -#[allow(clippy::missing_safety_doc)] -impl __IncompleteArrayField { - #[inline] - pub const fn new() -> Self { - __IncompleteArrayField(::std::marker::PhantomData, []) - } - #[inline] - pub fn as_ptr(&self) -> *const T { - self as *const _ as *const T - } - #[inline] - pub fn as_mut_ptr(&mut self) -> *mut T { - self as *mut _ as *mut T - } - #[inline] - pub unsafe 
fn as_slice(&self, len: usize) -> &[T] { - ::std::slice::from_raw_parts(self.as_ptr(), len) - } - #[inline] - pub unsafe fn as_mut_slice(&mut self, len: usize) -> &mut [T] { - ::std::slice::from_raw_parts_mut(self.as_mut_ptr(), len) - } -} -impl ::std::fmt::Debug for __IncompleteArrayField { - fn fmt(&self, fmt: &mut ::std::fmt::Formatter<'_>) -> ::std::fmt::Result { - fmt.write_str("__IncompleteArrayField") - } -} -pub const __BITS_PER_LONG: u32 = 64; -pub const __FD_SETSIZE: u32 = 1024; -pub const ETH_ALEN: u32 = 6; -pub const ETH_TLEN: u32 = 2; -pub const ETH_HLEN: u32 = 14; -pub const ETH_ZLEN: u32 = 60; -pub const ETH_DATA_LEN: u32 = 1500; -pub const ETH_FRAME_LEN: u32 = 1514; -pub const ETH_FCS_LEN: u32 = 4; -pub const ETH_MIN_MTU: u32 = 68; -pub const ETH_MAX_MTU: u32 = 65535; -pub const ETH_P_LOOP: u32 = 96; -pub const ETH_P_PUP: u32 = 512; -pub const ETH_P_PUPAT: u32 = 513; -pub const ETH_P_TSN: u32 = 8944; -pub const ETH_P_ERSPAN2: u32 = 8939; -pub const ETH_P_IP: u32 = 2048; -pub const ETH_P_X25: u32 = 2053; -pub const ETH_P_ARP: u32 = 2054; -pub const ETH_P_BPQ: u32 = 2303; -pub const ETH_P_IEEEPUP: u32 = 2560; -pub const ETH_P_IEEEPUPAT: u32 = 2561; -pub const ETH_P_BATMAN: u32 = 17157; -pub const ETH_P_DEC: u32 = 24576; -pub const ETH_P_DNA_DL: u32 = 24577; -pub const ETH_P_DNA_RC: u32 = 24578; -pub const ETH_P_DNA_RT: u32 = 24579; -pub const ETH_P_LAT: u32 = 24580; -pub const ETH_P_DIAG: u32 = 24581; -pub const ETH_P_CUST: u32 = 24582; -pub const ETH_P_SCA: u32 = 24583; -pub const ETH_P_TEB: u32 = 25944; -pub const ETH_P_RARP: u32 = 32821; -pub const ETH_P_ATALK: u32 = 32923; -pub const ETH_P_AARP: u32 = 33011; -pub const ETH_P_8021Q: u32 = 33024; -pub const ETH_P_ERSPAN: u32 = 35006; -pub const ETH_P_IPX: u32 = 33079; -pub const ETH_P_IPV6: u32 = 34525; -pub const ETH_P_PAUSE: u32 = 34824; -pub const ETH_P_SLOW: u32 = 34825; -pub const ETH_P_WCCP: u32 = 34878; -pub const ETH_P_MPLS_UC: u32 = 34887; -pub const ETH_P_MPLS_MC: u32 = 34888; -pub const 
ETH_P_ATMMPOA: u32 = 34892; -pub const ETH_P_PPP_DISC: u32 = 34915; -pub const ETH_P_PPP_SES: u32 = 34916; -pub const ETH_P_LINK_CTL: u32 = 34924; -pub const ETH_P_ATMFATE: u32 = 34948; -pub const ETH_P_PAE: u32 = 34958; -pub const ETH_P_AOE: u32 = 34978; -pub const ETH_P_8021AD: u32 = 34984; -pub const ETH_P_802_EX1: u32 = 34997; -pub const ETH_P_PREAUTH: u32 = 35015; -pub const ETH_P_TIPC: u32 = 35018; -pub const ETH_P_LLDP: u32 = 35020; -pub const ETH_P_MRP: u32 = 35043; -pub const ETH_P_MACSEC: u32 = 35045; -pub const ETH_P_8021AH: u32 = 35047; -pub const ETH_P_MVRP: u32 = 35061; -pub const ETH_P_1588: u32 = 35063; -pub const ETH_P_NCSI: u32 = 35064; -pub const ETH_P_PRP: u32 = 35067; -pub const ETH_P_FCOE: u32 = 35078; -pub const ETH_P_IBOE: u32 = 35093; -pub const ETH_P_TDLS: u32 = 35085; -pub const ETH_P_FIP: u32 = 35092; -pub const ETH_P_80221: u32 = 35095; -pub const ETH_P_HSR: u32 = 35119; -pub const ETH_P_NSH: u32 = 35151; -pub const ETH_P_LOOPBACK: u32 = 36864; -pub const ETH_P_QINQ1: u32 = 37120; -pub const ETH_P_QINQ2: u32 = 37376; -pub const ETH_P_QINQ3: u32 = 37632; -pub const ETH_P_EDSA: u32 = 56026; -pub const ETH_P_DSA_8021Q: u32 = 56027; -pub const ETH_P_IFE: u32 = 60734; -pub const ETH_P_AF_IUCV: u32 = 64507; -pub const ETH_P_802_3_MIN: u32 = 1536; -pub const ETH_P_802_3: u32 = 1; -pub const ETH_P_AX25: u32 = 2; -pub const ETH_P_ALL: u32 = 3; -pub const ETH_P_802_2: u32 = 4; -pub const ETH_P_SNAP: u32 = 5; -pub const ETH_P_DDCMP: u32 = 6; -pub const ETH_P_WAN_PPP: u32 = 7; -pub const ETH_P_PPP_MP: u32 = 8; -pub const ETH_P_LOCALTALK: u32 = 9; -pub const ETH_P_CAN: u32 = 12; -pub const ETH_P_CANFD: u32 = 13; -pub const ETH_P_PPPTALK: u32 = 16; -pub const ETH_P_TR_802_2: u32 = 17; -pub const ETH_P_MOBITEX: u32 = 21; -pub const ETH_P_CONTROL: u32 = 22; -pub const ETH_P_IRDA: u32 = 23; -pub const ETH_P_ECONET: u32 = 24; -pub const ETH_P_HDLC: u32 = 25; -pub const ETH_P_ARCNET: u32 = 26; -pub const ETH_P_DSA: u32 = 27; -pub const ETH_P_TRAILER: u32 
= 28; -pub const ETH_P_PHONET: u32 = 245; -pub const ETH_P_IEEE802154: u32 = 246; -pub const ETH_P_CAIF: u32 = 247; -pub const ETH_P_XDSA: u32 = 248; -pub const ETH_P_MAP: u32 = 249; -pub const __UAPI_DEF_ETHHDR: u32 = 1; -pub const BPF_LD: u32 = 0; -pub const BPF_LDX: u32 = 1; -pub const BPF_ST: u32 = 2; -pub const BPF_STX: u32 = 3; -pub const BPF_ALU: u32 = 4; -pub const BPF_JMP: u32 = 5; -pub const BPF_RET: u32 = 6; -pub const BPF_MISC: u32 = 7; -pub const BPF_W: u32 = 0; -pub const BPF_H: u32 = 8; -pub const BPF_B: u32 = 16; -pub const BPF_IMM: u32 = 0; -pub const BPF_ABS: u32 = 32; -pub const BPF_IND: u32 = 64; -pub const BPF_MEM: u32 = 96; -pub const BPF_LEN: u32 = 128; -pub const BPF_MSH: u32 = 160; -pub const BPF_ADD: u32 = 0; -pub const BPF_SUB: u32 = 16; -pub const BPF_MUL: u32 = 32; -pub const BPF_DIV: u32 = 48; -pub const BPF_OR: u32 = 64; -pub const BPF_AND: u32 = 80; -pub const BPF_LSH: u32 = 96; -pub const BPF_RSH: u32 = 112; -pub const BPF_NEG: u32 = 128; -pub const BPF_MOD: u32 = 144; -pub const BPF_XOR: u32 = 160; -pub const BPF_JA: u32 = 0; -pub const BPF_JEQ: u32 = 16; -pub const BPF_JGT: u32 = 32; -pub const BPF_JGE: u32 = 48; -pub const BPF_JSET: u32 = 64; -pub const BPF_K: u32 = 0; -pub const BPF_X: u32 = 8; -pub const BPF_MAXINSNS: u32 = 4096; -pub const BPF_MAJOR_VERSION: u32 = 1; -pub const BPF_MINOR_VERSION: u32 = 1; -pub const BPF_A: u32 = 16; -pub const BPF_TAX: u32 = 0; -pub const BPF_TXA: u32 = 128; -pub const BPF_MEMWORDS: u32 = 16; -pub const SKF_AD_OFF: i32 = -4096; -pub const SKF_AD_PROTOCOL: u32 = 0; -pub const SKF_AD_PKTTYPE: u32 = 4; -pub const SKF_AD_IFINDEX: u32 = 8; -pub const SKF_AD_NLATTR: u32 = 12; -pub const SKF_AD_NLATTR_NEST: u32 = 16; -pub const SKF_AD_MARK: u32 = 20; -pub const SKF_AD_QUEUE: u32 = 24; -pub const SKF_AD_HATYPE: u32 = 28; -pub const SKF_AD_RXHASH: u32 = 32; -pub const SKF_AD_CPU: u32 = 36; -pub const SKF_AD_ALU_XOR_X: u32 = 40; -pub const SKF_AD_VLAN_TAG: u32 = 44; -pub const SKF_AD_VLAN_TAG_PRESENT: 
u32 = 48; -pub const SKF_AD_PAY_OFFSET: u32 = 52; -pub const SKF_AD_RANDOM: u32 = 56; -pub const SKF_AD_VLAN_TPID: u32 = 60; -pub const SKF_AD_MAX: u32 = 64; -pub const SKF_NET_OFF: i32 = -1048576; -pub const SKF_LL_OFF: i32 = -2097152; -pub const BPF_NET_OFF: i32 = -1048576; -pub const BPF_LL_OFF: i32 = -2097152; -pub const TUN_READQ_SIZE: u32 = 500; -pub const TUN_TYPE_MASK: u32 = 15; -pub const IFF_TUN: u32 = 1; -pub const IFF_TAP: u32 = 2; -pub const IFF_NAPI: u32 = 16; -pub const IFF_NAPI_FRAGS: u32 = 32; -pub const IFF_NO_PI: u32 = 4096; -pub const IFF_ONE_QUEUE: u32 = 8192; -pub const IFF_VNET_HDR: u32 = 16384; -pub const IFF_TUN_EXCL: u32 = 32768; -pub const IFF_MULTI_QUEUE: u32 = 256; -pub const IFF_ATTACH_QUEUE: u32 = 512; -pub const IFF_DETACH_QUEUE: u32 = 1024; -pub const IFF_PERSIST: u32 = 2048; -pub const IFF_NOFILTER: u32 = 4096; -pub const TUN_TX_TIMESTAMP: u32 = 1; -pub const TUN_F_CSUM: u32 = 1; -pub const TUN_F_TSO4: u32 = 2; -pub const TUN_F_TSO6: u32 = 4; -pub const TUN_F_TSO_ECN: u32 = 8; -pub const TUN_F_UFO: u32 = 16; -pub const TUN_PKT_STRIP: u32 = 1; -pub const TUN_FLT_ALLMULTI: u32 = 1; -pub type __s8 = ::std::os::raw::c_schar; -pub type __u8 = ::std::os::raw::c_uchar; -pub type __s16 = ::std::os::raw::c_short; -pub type __u16 = ::std::os::raw::c_ushort; -pub type __s32 = ::std::os::raw::c_int; -pub type __u32 = ::std::os::raw::c_uint; -pub type __s64 = ::std::os::raw::c_longlong; -pub type __u64 = ::std::os::raw::c_ulonglong; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct __kernel_fd_set { - pub fds_bits: [::std::os::raw::c_ulong; 16usize], -} -pub type __kernel_sighandler_t = - ::std::option::Option; -pub type __kernel_key_t = ::std::os::raw::c_int; -pub type __kernel_mqd_t = ::std::os::raw::c_int; -pub type __kernel_old_uid_t = ::std::os::raw::c_ushort; -pub type __kernel_old_gid_t = ::std::os::raw::c_ushort; -pub type __kernel_old_dev_t = ::std::os::raw::c_ulong; -pub type __kernel_long_t = ::std::os::raw::c_long; -pub type 
__kernel_ulong_t = ::std::os::raw::c_ulong; -pub type __kernel_ino_t = __kernel_ulong_t; -pub type __kernel_mode_t = ::std::os::raw::c_uint; -pub type __kernel_pid_t = ::std::os::raw::c_int; -pub type __kernel_ipc_pid_t = ::std::os::raw::c_int; -pub type __kernel_uid_t = ::std::os::raw::c_uint; -pub type __kernel_gid_t = ::std::os::raw::c_uint; -pub type __kernel_suseconds_t = __kernel_long_t; -pub type __kernel_daddr_t = ::std::os::raw::c_int; -pub type __kernel_uid32_t = ::std::os::raw::c_uint; -pub type __kernel_gid32_t = ::std::os::raw::c_uint; -pub type __kernel_size_t = __kernel_ulong_t; -pub type __kernel_ssize_t = __kernel_long_t; -pub type __kernel_ptrdiff_t = __kernel_long_t; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct __kernel_fsid_t { - pub val: [::std::os::raw::c_int; 2usize], -} -pub type __kernel_off_t = __kernel_long_t; -pub type __kernel_loff_t = ::std::os::raw::c_longlong; -pub type __kernel_old_time_t = __kernel_long_t; -pub type __kernel_time_t = __kernel_long_t; -pub type __kernel_time64_t = ::std::os::raw::c_longlong; -pub type __kernel_clock_t = __kernel_long_t; -pub type __kernel_timer_t = ::std::os::raw::c_int; -pub type __kernel_clockid_t = ::std::os::raw::c_int; -pub type __kernel_caddr_t = *mut ::std::os::raw::c_char; -pub type __kernel_uid16_t = ::std::os::raw::c_ushort; -pub type __kernel_gid16_t = ::std::os::raw::c_ushort; -pub type __le16 = __u16; -pub type __be16 = __u16; -pub type __le32 = __u32; -pub type __be32 = __u32; -pub type __le64 = __u64; -pub type __be64 = __u64; -pub type __sum16 = __u16; -pub type __wsum = __u32; -pub type __poll_t = ::std::os::raw::c_uint; -#[repr(C, packed)] -#[derive(Debug, Copy, Clone)] -pub struct ethhdr { - pub h_dest: [::std::os::raw::c_uchar; 6usize], - pub h_source: [::std::os::raw::c_uchar; 6usize], - pub h_proto: __be16, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct sock_filter { - pub code: __u16, - pub jt: __u8, - pub jf: __u8, - pub k: __u32, -} -#[repr(C)] 
-#[derive(Debug, Copy, Clone)] -pub struct sock_fprog { - pub len: ::std::os::raw::c_ushort, - pub filter: *mut sock_filter, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct tun_pi { - pub flags: __u16, - pub proto: __be16, -} -#[repr(C)] -#[derive(Debug)] -pub struct tun_filter { - pub flags: __u16, - pub count: __u16, - pub addr: __IncompleteArrayField<[__u8; 6usize]>, -} diff --git a/net_gen/src/iff.rs b/net_gen/src/iff.rs deleted file mode 100644 index 974e01b42e..0000000000 --- a/net_gen/src/iff.rs +++ /dev/null @@ -1,1228 +0,0 @@ -// Copyright © 2021 Intel Corporation -// -// SPDX-License-Identifier: Apache-2.0 -// - -// bindgen /usr/include/linux/if.h --no-layout-tests - -/* automatically generated by rust-bindgen 0.58.1 */ - -#[repr(C)] -#[derive(Default)] -pub struct __IncompleteArrayField(::std::marker::PhantomData, [T; 0]); -#[allow(clippy::missing_safety_doc)] -impl __IncompleteArrayField { - #[inline] - pub const fn new() -> Self { - __IncompleteArrayField(::std::marker::PhantomData, []) - } - #[inline] - pub fn as_ptr(&self) -> *const T { - self as *const _ as *const T - } - #[inline] - pub fn as_mut_ptr(&mut self) -> *mut T { - self as *mut _ as *mut T - } - #[inline] - pub unsafe fn as_slice(&self, len: usize) -> &[T] { - ::std::slice::from_raw_parts(self.as_ptr(), len) - } - #[inline] - pub unsafe fn as_mut_slice(&mut self, len: usize) -> &mut [T] { - ::std::slice::from_raw_parts_mut(self.as_mut_ptr(), len) - } -} -impl ::std::fmt::Debug for __IncompleteArrayField { - fn fmt(&self, fmt: &mut ::std::fmt::Formatter<'_>) -> ::std::fmt::Result { - fmt.write_str("__IncompleteArrayField") - } -} -pub const __UAPI_DEF_IF_IFCONF: u32 = 1; -pub const __UAPI_DEF_IF_IFMAP: u32 = 1; -pub const __UAPI_DEF_IF_IFNAMSIZ: u32 = 1; -pub const __UAPI_DEF_IF_IFREQ: u32 = 1; -pub const __UAPI_DEF_IF_NET_DEVICE_FLAGS: u32 = 1; -pub const __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO: u32 = 1; -pub const __UAPI_DEF_IN_ADDR: u32 = 1; -pub const 
__UAPI_DEF_IN_IPPROTO: u32 = 1; -pub const __UAPI_DEF_IN_PKTINFO: u32 = 1; -pub const __UAPI_DEF_IP_MREQ: u32 = 1; -pub const __UAPI_DEF_SOCKADDR_IN: u32 = 1; -pub const __UAPI_DEF_IN_CLASS: u32 = 1; -pub const __UAPI_DEF_IN6_ADDR: u32 = 1; -pub const __UAPI_DEF_IN6_ADDR_ALT: u32 = 1; -pub const __UAPI_DEF_SOCKADDR_IN6: u32 = 1; -pub const __UAPI_DEF_IPV6_MREQ: u32 = 1; -pub const __UAPI_DEF_IPPROTO_V6: u32 = 1; -pub const __UAPI_DEF_IPV6_OPTIONS: u32 = 1; -pub const __UAPI_DEF_IN6_PKTINFO: u32 = 1; -pub const __UAPI_DEF_IP6_MTUINFO: u32 = 1; -pub const __UAPI_DEF_SOCKADDR_IPX: u32 = 1; -pub const __UAPI_DEF_IPX_ROUTE_DEFINITION: u32 = 1; -pub const __UAPI_DEF_IPX_INTERFACE_DEFINITION: u32 = 1; -pub const __UAPI_DEF_IPX_CONFIG_DATA: u32 = 1; -pub const __UAPI_DEF_IPX_ROUTE_DEF: u32 = 1; -pub const __UAPI_DEF_XATTR: u32 = 1; -pub const __BITS_PER_LONG: u32 = 64; -pub const __FD_SETSIZE: u32 = 1024; -pub const _K_SS_MAXSIZE: u32 = 128; -pub const _SYS_SOCKET_H: u32 = 1; -pub const _FEATURES_H: u32 = 1; -pub const _DEFAULT_SOURCE: u32 = 1; -pub const __GLIBC_USE_ISOC2X: u32 = 0; -pub const __USE_ISOC11: u32 = 1; -pub const __USE_ISOC99: u32 = 1; -pub const __USE_ISOC95: u32 = 1; -pub const __USE_POSIX_IMPLICITLY: u32 = 1; -pub const _POSIX_SOURCE: u32 = 1; -pub const _POSIX_C_SOURCE: u32 = 200809; -pub const __USE_POSIX: u32 = 1; -pub const __USE_POSIX2: u32 = 1; -pub const __USE_POSIX199309: u32 = 1; -pub const __USE_POSIX199506: u32 = 1; -pub const __USE_XOPEN2K: u32 = 1; -pub const __USE_XOPEN2K8: u32 = 1; -pub const _ATFILE_SOURCE: u32 = 1; -pub const __USE_MISC: u32 = 1; -pub const __USE_ATFILE: u32 = 1; -pub const __USE_FORTIFY_LEVEL: u32 = 0; -pub const __GLIBC_USE_DEPRECATED_GETS: u32 = 0; -pub const __GLIBC_USE_DEPRECATED_SCANF: u32 = 0; -pub const _STDC_PREDEF_H: u32 = 1; -pub const __STDC_IEC_559__: u32 = 1; -pub const __STDC_IEC_559_COMPLEX__: u32 = 1; -pub const __STDC_ISO_10646__: u32 = 201706; -pub const __GNU_LIBRARY__: u32 = 6; -pub const __GLIBC__: 
u32 = 2; -pub const __GLIBC_MINOR__: u32 = 32; -pub const _SYS_CDEFS_H: u32 = 1; -pub const __glibc_c99_flexarr_available: u32 = 1; -pub const __WORDSIZE: u32 = 64; -pub const __WORDSIZE_TIME64_COMPAT32: u32 = 1; -pub const __SYSCALL_WORDSIZE: u32 = 64; -pub const __LDOUBLE_REDIRECTS_TO_FLOAT128_ABI: u32 = 0; -pub const __HAVE_GENERIC_SELECTION: u32 = 1; -pub const __iovec_defined: u32 = 1; -pub const _SYS_TYPES_H: u32 = 1; -pub const _BITS_TYPES_H: u32 = 1; -pub const __TIMESIZE: u32 = 64; -pub const _BITS_TYPESIZES_H: u32 = 1; -pub const __OFF_T_MATCHES_OFF64_T: u32 = 1; -pub const __INO_T_MATCHES_INO64_T: u32 = 1; -pub const __RLIM_T_MATCHES_RLIM64_T: u32 = 1; -pub const __STATFS_MATCHES_STATFS64: u32 = 1; -pub const __KERNEL_OLD_TIMEVAL_MATCHES_TIMEVAL64: u32 = 1; -pub const _BITS_TIME64_H: u32 = 1; -pub const __clock_t_defined: u32 = 1; -pub const __clockid_t_defined: u32 = 1; -pub const __time_t_defined: u32 = 1; -pub const __timer_t_defined: u32 = 1; -pub const _BITS_STDINT_INTN_H: u32 = 1; -pub const __BIT_TYPES_DEFINED__: u32 = 1; -pub const _ENDIAN_H: u32 = 1; -pub const _BITS_ENDIAN_H: u32 = 1; -pub const __LITTLE_ENDIAN: u32 = 1234; -pub const __BIG_ENDIAN: u32 = 4321; -pub const __PDP_ENDIAN: u32 = 3412; -pub const _BITS_ENDIANNESS_H: u32 = 1; -pub const __BYTE_ORDER: u32 = 1234; -pub const __FLOAT_WORD_ORDER: u32 = 1234; -pub const LITTLE_ENDIAN: u32 = 1234; -pub const BIG_ENDIAN: u32 = 4321; -pub const PDP_ENDIAN: u32 = 3412; -pub const BYTE_ORDER: u32 = 1234; -pub const _BITS_BYTESWAP_H: u32 = 1; -pub const _BITS_UINTN_IDENTITY_H: u32 = 1; -pub const _SYS_SELECT_H: u32 = 1; -pub const __sigset_t_defined: u32 = 1; -pub const __timeval_defined: u32 = 1; -pub const _STRUCT_TIMESPEC: u32 = 1; -pub const FD_SETSIZE: u32 = 1024; -pub const _BITS_PTHREADTYPES_COMMON_H: u32 = 1; -pub const _THREAD_SHARED_TYPES_H: u32 = 1; -pub const _BITS_PTHREADTYPES_ARCH_H: u32 = 1; -pub const __SIZEOF_PTHREAD_MUTEX_T: u32 = 40; -pub const __SIZEOF_PTHREAD_ATTR_T: u32 = 
56; -pub const __SIZEOF_PTHREAD_RWLOCK_T: u32 = 56; -pub const __SIZEOF_PTHREAD_BARRIER_T: u32 = 32; -pub const __SIZEOF_PTHREAD_MUTEXATTR_T: u32 = 4; -pub const __SIZEOF_PTHREAD_COND_T: u32 = 48; -pub const __SIZEOF_PTHREAD_CONDATTR_T: u32 = 4; -pub const __SIZEOF_PTHREAD_RWLOCKATTR_T: u32 = 8; -pub const __SIZEOF_PTHREAD_BARRIERATTR_T: u32 = 4; -pub const _THREAD_MUTEX_INTERNAL_H: u32 = 1; -pub const __PTHREAD_MUTEX_HAVE_PREV: u32 = 1; -pub const __have_pthread_attr_t: u32 = 1; -pub const PF_UNSPEC: u32 = 0; -pub const PF_LOCAL: u32 = 1; -pub const PF_UNIX: u32 = 1; -pub const PF_FILE: u32 = 1; -pub const PF_INET: u32 = 2; -pub const PF_AX25: u32 = 3; -pub const PF_IPX: u32 = 4; -pub const PF_APPLETALK: u32 = 5; -pub const PF_NETROM: u32 = 6; -pub const PF_BRIDGE: u32 = 7; -pub const PF_ATMPVC: u32 = 8; -pub const PF_X25: u32 = 9; -pub const PF_INET6: u32 = 10; -pub const PF_ROSE: u32 = 11; -pub const PF_DECnet: u32 = 12; -pub const PF_NETBEUI: u32 = 13; -pub const PF_SECURITY: u32 = 14; -pub const PF_KEY: u32 = 15; -pub const PF_NETLINK: u32 = 16; -pub const PF_ROUTE: u32 = 16; -pub const PF_PACKET: u32 = 17; -pub const PF_ASH: u32 = 18; -pub const PF_ECONET: u32 = 19; -pub const PF_ATMSVC: u32 = 20; -pub const PF_RDS: u32 = 21; -pub const PF_SNA: u32 = 22; -pub const PF_IRDA: u32 = 23; -pub const PF_PPPOX: u32 = 24; -pub const PF_WANPIPE: u32 = 25; -pub const PF_LLC: u32 = 26; -pub const PF_IB: u32 = 27; -pub const PF_MPLS: u32 = 28; -pub const PF_CAN: u32 = 29; -pub const PF_TIPC: u32 = 30; -pub const PF_BLUETOOTH: u32 = 31; -pub const PF_IUCV: u32 = 32; -pub const PF_RXRPC: u32 = 33; -pub const PF_ISDN: u32 = 34; -pub const PF_PHONET: u32 = 35; -pub const PF_IEEE802154: u32 = 36; -pub const PF_CAIF: u32 = 37; -pub const PF_ALG: u32 = 38; -pub const PF_NFC: u32 = 39; -pub const PF_VSOCK: u32 = 40; -pub const PF_KCM: u32 = 41; -pub const PF_QIPCRTR: u32 = 42; -pub const PF_SMC: u32 = 43; -pub const PF_XDP: u32 = 44; -pub const PF_MAX: u32 = 45; -pub const 
AF_UNSPEC: u32 = 0; -pub const AF_LOCAL: u32 = 1; -pub const AF_UNIX: u32 = 1; -pub const AF_FILE: u32 = 1; -pub const AF_INET: u32 = 2; -pub const AF_AX25: u32 = 3; -pub const AF_IPX: u32 = 4; -pub const AF_APPLETALK: u32 = 5; -pub const AF_NETROM: u32 = 6; -pub const AF_BRIDGE: u32 = 7; -pub const AF_ATMPVC: u32 = 8; -pub const AF_X25: u32 = 9; -pub const AF_INET6: u32 = 10; -pub const AF_ROSE: u32 = 11; -pub const AF_DECnet: u32 = 12; -pub const AF_NETBEUI: u32 = 13; -pub const AF_SECURITY: u32 = 14; -pub const AF_KEY: u32 = 15; -pub const AF_NETLINK: u32 = 16; -pub const AF_ROUTE: u32 = 16; -pub const AF_PACKET: u32 = 17; -pub const AF_ASH: u32 = 18; -pub const AF_ECONET: u32 = 19; -pub const AF_ATMSVC: u32 = 20; -pub const AF_RDS: u32 = 21; -pub const AF_SNA: u32 = 22; -pub const AF_IRDA: u32 = 23; -pub const AF_PPPOX: u32 = 24; -pub const AF_WANPIPE: u32 = 25; -pub const AF_LLC: u32 = 26; -pub const AF_IB: u32 = 27; -pub const AF_MPLS: u32 = 28; -pub const AF_CAN: u32 = 29; -pub const AF_TIPC: u32 = 30; -pub const AF_BLUETOOTH: u32 = 31; -pub const AF_IUCV: u32 = 32; -pub const AF_RXRPC: u32 = 33; -pub const AF_ISDN: u32 = 34; -pub const AF_PHONET: u32 = 35; -pub const AF_IEEE802154: u32 = 36; -pub const AF_CAIF: u32 = 37; -pub const AF_ALG: u32 = 38; -pub const AF_NFC: u32 = 39; -pub const AF_VSOCK: u32 = 40; -pub const AF_KCM: u32 = 41; -pub const AF_QIPCRTR: u32 = 42; -pub const AF_SMC: u32 = 43; -pub const AF_XDP: u32 = 44; -pub const AF_MAX: u32 = 45; -pub const SOL_RAW: u32 = 255; -pub const SOL_DECNET: u32 = 261; -pub const SOL_X25: u32 = 262; -pub const SOL_PACKET: u32 = 263; -pub const SOL_ATM: u32 = 264; -pub const SOL_AAL: u32 = 265; -pub const SOL_IRDA: u32 = 266; -pub const SOL_NETBEUI: u32 = 267; -pub const SOL_LLC: u32 = 268; -pub const SOL_DCCP: u32 = 269; -pub const SOL_NETLINK: u32 = 270; -pub const SOL_TIPC: u32 = 271; -pub const SOL_RXRPC: u32 = 272; -pub const SOL_PPPOL2TP: u32 = 273; -pub const SOL_BLUETOOTH: u32 = 274; -pub const 
SOL_PNPIPE: u32 = 275; -pub const SOL_RDS: u32 = 276; -pub const SOL_IUCV: u32 = 277; -pub const SOL_CAIF: u32 = 278; -pub const SOL_ALG: u32 = 279; -pub const SOL_NFC: u32 = 280; -pub const SOL_KCM: u32 = 281; -pub const SOL_TLS: u32 = 282; -pub const SOL_XDP: u32 = 283; -pub const SOMAXCONN: u32 = 4096; -pub const _BITS_SOCKADDR_H: u32 = 1; -pub const _SS_SIZE: u32 = 128; -pub const FIOSETOWN: u32 = 35073; -pub const SIOCSPGRP: u32 = 35074; -pub const FIOGETOWN: u32 = 35075; -pub const SIOCGPGRP: u32 = 35076; -pub const SIOCATMARK: u32 = 35077; -pub const SIOCGSTAMP_OLD: u32 = 35078; -pub const SIOCGSTAMPNS_OLD: u32 = 35079; -pub const SOL_SOCKET: u32 = 1; -pub const SO_DEBUG: u32 = 1; -pub const SO_REUSEADDR: u32 = 2; -pub const SO_TYPE: u32 = 3; -pub const SO_ERROR: u32 = 4; -pub const SO_DONTROUTE: u32 = 5; -pub const SO_BROADCAST: u32 = 6; -pub const SO_SNDBUF: u32 = 7; -pub const SO_RCVBUF: u32 = 8; -pub const SO_SNDBUFFORCE: u32 = 32; -pub const SO_RCVBUFFORCE: u32 = 33; -pub const SO_KEEPALIVE: u32 = 9; -pub const SO_OOBINLINE: u32 = 10; -pub const SO_NO_CHECK: u32 = 11; -pub const SO_PRIORITY: u32 = 12; -pub const SO_LINGER: u32 = 13; -pub const SO_BSDCOMPAT: u32 = 14; -pub const SO_REUSEPORT: u32 = 15; -pub const SO_PASSCRED: u32 = 16; -pub const SO_PEERCRED: u32 = 17; -pub const SO_RCVLOWAT: u32 = 18; -pub const SO_SNDLOWAT: u32 = 19; -pub const SO_RCVTIMEO_OLD: u32 = 20; -pub const SO_SNDTIMEO_OLD: u32 = 21; -pub const SO_SECURITY_AUTHENTICATION: u32 = 22; -pub const SO_SECURITY_ENCRYPTION_TRANSPORT: u32 = 23; -pub const SO_SECURITY_ENCRYPTION_NETWORK: u32 = 24; -pub const SO_BINDTODEVICE: u32 = 25; -pub const SO_ATTACH_FILTER: u32 = 26; -pub const SO_DETACH_FILTER: u32 = 27; -pub const SO_GET_FILTER: u32 = 26; -pub const SO_PEERNAME: u32 = 28; -pub const SO_ACCEPTCONN: u32 = 30; -pub const SO_PEERSEC: u32 = 31; -pub const SO_PASSSEC: u32 = 34; -pub const SO_MARK: u32 = 36; -pub const SO_PROTOCOL: u32 = 38; -pub const SO_DOMAIN: u32 = 39; -pub const 
SO_RXQ_OVFL: u32 = 40; -pub const SO_WIFI_STATUS: u32 = 41; -pub const SCM_WIFI_STATUS: u32 = 41; -pub const SO_PEEK_OFF: u32 = 42; -pub const SO_NOFCS: u32 = 43; -pub const SO_LOCK_FILTER: u32 = 44; -pub const SO_SELECT_ERR_QUEUE: u32 = 45; -pub const SO_BUSY_POLL: u32 = 46; -pub const SO_MAX_PACING_RATE: u32 = 47; -pub const SO_BPF_EXTENSIONS: u32 = 48; -pub const SO_INCOMING_CPU: u32 = 49; -pub const SO_ATTACH_BPF: u32 = 50; -pub const SO_DETACH_BPF: u32 = 27; -pub const SO_ATTACH_REUSEPORT_CBPF: u32 = 51; -pub const SO_ATTACH_REUSEPORT_EBPF: u32 = 52; -pub const SO_CNX_ADVICE: u32 = 53; -pub const SCM_TIMESTAMPING_OPT_STATS: u32 = 54; -pub const SO_MEMINFO: u32 = 55; -pub const SO_INCOMING_NAPI_ID: u32 = 56; -pub const SO_COOKIE: u32 = 57; -pub const SCM_TIMESTAMPING_PKTINFO: u32 = 58; -pub const SO_PEERGROUPS: u32 = 59; -pub const SO_ZEROCOPY: u32 = 60; -pub const SO_TXTIME: u32 = 61; -pub const SCM_TXTIME: u32 = 61; -pub const SO_BINDTOIFINDEX: u32 = 62; -pub const SO_TIMESTAMP_OLD: u32 = 29; -pub const SO_TIMESTAMPNS_OLD: u32 = 35; -pub const SO_TIMESTAMPING_OLD: u32 = 37; -pub const SO_TIMESTAMP_NEW: u32 = 63; -pub const SO_TIMESTAMPNS_NEW: u32 = 64; -pub const SO_TIMESTAMPING_NEW: u32 = 65; -pub const SO_RCVTIMEO_NEW: u32 = 66; -pub const SO_SNDTIMEO_NEW: u32 = 67; -pub const SO_DETACH_REUSEPORT_BPF: u32 = 68; -pub const SO_TIMESTAMP: u32 = 29; -pub const SO_TIMESTAMPNS: u32 = 35; -pub const SO_TIMESTAMPING: u32 = 37; -pub const SO_RCVTIMEO: u32 = 20; -pub const SO_SNDTIMEO: u32 = 21; -pub const SCM_TIMESTAMP: u32 = 29; -pub const SCM_TIMESTAMPNS: u32 = 35; -pub const SCM_TIMESTAMPING: u32 = 37; -pub const __osockaddr_defined: u32 = 1; -pub const IFNAMSIZ: u32 = 16; -pub const IFALIASZ: u32 = 256; -pub const ALTIFNAMSIZ: u32 = 128; -pub const GENERIC_HDLC_VERSION: u32 = 4; -pub const CLOCK_DEFAULT: u32 = 0; -pub const CLOCK_EXT: u32 = 1; -pub const CLOCK_INT: u32 = 2; -pub const CLOCK_TXINT: u32 = 3; -pub const CLOCK_TXFROMRX: u32 = 4; -pub const 
ENCODING_DEFAULT: u32 = 0; -pub const ENCODING_NRZ: u32 = 1; -pub const ENCODING_NRZI: u32 = 2; -pub const ENCODING_FM_MARK: u32 = 3; -pub const ENCODING_FM_SPACE: u32 = 4; -pub const ENCODING_MANCHESTER: u32 = 5; -pub const PARITY_DEFAULT: u32 = 0; -pub const PARITY_NONE: u32 = 1; -pub const PARITY_CRC16_PR0: u32 = 2; -pub const PARITY_CRC16_PR1: u32 = 3; -pub const PARITY_CRC16_PR0_CCITT: u32 = 4; -pub const PARITY_CRC16_PR1_CCITT: u32 = 5; -pub const PARITY_CRC32_PR0_CCITT: u32 = 6; -pub const PARITY_CRC32_PR1_CCITT: u32 = 7; -pub const LMI_DEFAULT: u32 = 0; -pub const LMI_NONE: u32 = 1; -pub const LMI_ANSI: u32 = 2; -pub const LMI_CCITT: u32 = 3; -pub const LMI_CISCO: u32 = 4; -pub const IF_GET_IFACE: u32 = 1; -pub const IF_GET_PROTO: u32 = 2; -pub const IF_IFACE_V35: u32 = 4096; -pub const IF_IFACE_V24: u32 = 4097; -pub const IF_IFACE_X21: u32 = 4098; -pub const IF_IFACE_T1: u32 = 4099; -pub const IF_IFACE_E1: u32 = 4100; -pub const IF_IFACE_SYNC_SERIAL: u32 = 4101; -pub const IF_IFACE_X21D: u32 = 4102; -pub const IF_PROTO_HDLC: u32 = 8192; -pub const IF_PROTO_PPP: u32 = 8193; -pub const IF_PROTO_CISCO: u32 = 8194; -pub const IF_PROTO_FR: u32 = 8195; -pub const IF_PROTO_FR_ADD_PVC: u32 = 8196; -pub const IF_PROTO_FR_DEL_PVC: u32 = 8197; -pub const IF_PROTO_X25: u32 = 8198; -pub const IF_PROTO_HDLC_ETH: u32 = 8199; -pub const IF_PROTO_FR_ADD_ETH_PVC: u32 = 8200; -pub const IF_PROTO_FR_DEL_ETH_PVC: u32 = 8201; -pub const IF_PROTO_FR_PVC: u32 = 8202; -pub const IF_PROTO_FR_ETH_PVC: u32 = 8203; -pub const IF_PROTO_RAW: u32 = 8204; -pub const IFHWADDRLEN: u32 = 6; -pub type __s8 = ::std::os::raw::c_schar; -pub type __u8 = ::std::os::raw::c_uchar; -pub type __s16 = ::std::os::raw::c_short; -pub type __u16 = ::std::os::raw::c_ushort; -pub type __s32 = ::std::os::raw::c_int; -pub type __u32 = ::std::os::raw::c_uint; -pub type __s64 = ::std::os::raw::c_longlong; -pub type __u64 = ::std::os::raw::c_ulonglong; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct 
__kernel_fd_set { - pub fds_bits: [::std::os::raw::c_ulong; 16usize], -} -pub type __kernel_sighandler_t = - ::std::option::Option; -pub type __kernel_key_t = ::std::os::raw::c_int; -pub type __kernel_mqd_t = ::std::os::raw::c_int; -pub type __kernel_old_uid_t = ::std::os::raw::c_ushort; -pub type __kernel_old_gid_t = ::std::os::raw::c_ushort; -pub type __kernel_old_dev_t = ::std::os::raw::c_ulong; -pub type __kernel_long_t = ::std::os::raw::c_long; -pub type __kernel_ulong_t = ::std::os::raw::c_ulong; -pub type __kernel_ino_t = __kernel_ulong_t; -pub type __kernel_mode_t = ::std::os::raw::c_uint; -pub type __kernel_pid_t = ::std::os::raw::c_int; -pub type __kernel_ipc_pid_t = ::std::os::raw::c_int; -pub type __kernel_uid_t = ::std::os::raw::c_uint; -pub type __kernel_gid_t = ::std::os::raw::c_uint; -pub type __kernel_suseconds_t = __kernel_long_t; -pub type __kernel_daddr_t = ::std::os::raw::c_int; -pub type __kernel_uid32_t = ::std::os::raw::c_uint; -pub type __kernel_gid32_t = ::std::os::raw::c_uint; -pub type __kernel_size_t = __kernel_ulong_t; -pub type __kernel_ssize_t = __kernel_long_t; -pub type __kernel_ptrdiff_t = __kernel_long_t; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct __kernel_fsid_t { - pub val: [::std::os::raw::c_int; 2usize], -} -pub type __kernel_off_t = __kernel_long_t; -pub type __kernel_loff_t = ::std::os::raw::c_longlong; -pub type __kernel_old_time_t = __kernel_long_t; -pub type __kernel_time_t = __kernel_long_t; -pub type __kernel_time64_t = ::std::os::raw::c_longlong; -pub type __kernel_clock_t = __kernel_long_t; -pub type __kernel_timer_t = ::std::os::raw::c_int; -pub type __kernel_clockid_t = ::std::os::raw::c_int; -pub type __kernel_caddr_t = *mut ::std::os::raw::c_char; -pub type __kernel_uid16_t = ::std::os::raw::c_ushort; -pub type __kernel_gid16_t = ::std::os::raw::c_ushort; -pub type __le16 = __u16; -pub type __be16 = __u16; -pub type __le32 = __u32; -pub type __be32 = __u32; -pub type __le64 = __u64; -pub type __be64 = 
__u64; -pub type __sum16 = __u16; -pub type __wsum = __u32; -pub type __poll_t = ::std::os::raw::c_uint; -pub type __kernel_sa_family_t = ::std::os::raw::c_ushort; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct __kernel_sockaddr_storage { - pub __bindgen_anon_1: __kernel_sockaddr_storage__bindgen_ty_1, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub union __kernel_sockaddr_storage__bindgen_ty_1 { - pub __bindgen_anon_1: __kernel_sockaddr_storage__bindgen_ty_1__bindgen_ty_1, - pub __align: *mut ::std::os::raw::c_void, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct __kernel_sockaddr_storage__bindgen_ty_1__bindgen_ty_1 { - pub ss_family: __kernel_sa_family_t, - pub __data: [::std::os::raw::c_char; 126usize], -} -pub type size_t = ::std::os::raw::c_ulong; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct iovec { - pub iov_base: *mut ::std::os::raw::c_void, - pub iov_len: size_t, -} -pub type __u_char = ::std::os::raw::c_uchar; -pub type __u_short = ::std::os::raw::c_ushort; -pub type __u_int = ::std::os::raw::c_uint; -pub type __u_long = ::std::os::raw::c_ulong; -pub type __int8_t = ::std::os::raw::c_schar; -pub type __uint8_t = ::std::os::raw::c_uchar; -pub type __int16_t = ::std::os::raw::c_short; -pub type __uint16_t = ::std::os::raw::c_ushort; -pub type __int32_t = ::std::os::raw::c_int; -pub type __uint32_t = ::std::os::raw::c_uint; -pub type __int64_t = ::std::os::raw::c_long; -pub type __uint64_t = ::std::os::raw::c_ulong; -pub type __int_least8_t = __int8_t; -pub type __uint_least8_t = __uint8_t; -pub type __int_least16_t = __int16_t; -pub type __uint_least16_t = __uint16_t; -pub type __int_least32_t = __int32_t; -pub type __uint_least32_t = __uint32_t; -pub type __int_least64_t = __int64_t; -pub type __uint_least64_t = __uint64_t; -pub type __quad_t = ::std::os::raw::c_long; -pub type __u_quad_t = ::std::os::raw::c_ulong; -pub type __intmax_t = ::std::os::raw::c_long; -pub type __uintmax_t = ::std::os::raw::c_ulong; -pub type __dev_t = 
::std::os::raw::c_ulong; -pub type __uid_t = ::std::os::raw::c_uint; -pub type __gid_t = ::std::os::raw::c_uint; -pub type __ino_t = ::std::os::raw::c_ulong; -pub type __ino64_t = ::std::os::raw::c_ulong; -pub type __mode_t = ::std::os::raw::c_uint; -pub type __nlink_t = ::std::os::raw::c_ulong; -pub type __off_t = ::std::os::raw::c_long; -pub type __off64_t = ::std::os::raw::c_long; -pub type __pid_t = ::std::os::raw::c_int; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct __fsid_t { - pub __val: [::std::os::raw::c_int; 2usize], -} -pub type __clock_t = ::std::os::raw::c_long; -pub type __rlim_t = ::std::os::raw::c_ulong; -pub type __rlim64_t = ::std::os::raw::c_ulong; -pub type __id_t = ::std::os::raw::c_uint; -pub type __time_t = ::std::os::raw::c_long; -pub type __useconds_t = ::std::os::raw::c_uint; -pub type __suseconds_t = ::std::os::raw::c_long; -pub type __suseconds64_t = ::std::os::raw::c_long; -pub type __daddr_t = ::std::os::raw::c_int; -pub type __key_t = ::std::os::raw::c_int; -pub type __clockid_t = ::std::os::raw::c_int; -pub type __timer_t = *mut ::std::os::raw::c_void; -pub type __blksize_t = ::std::os::raw::c_long; -pub type __blkcnt_t = ::std::os::raw::c_long; -pub type __blkcnt64_t = ::std::os::raw::c_long; -pub type __fsblkcnt_t = ::std::os::raw::c_ulong; -pub type __fsblkcnt64_t = ::std::os::raw::c_ulong; -pub type __fsfilcnt_t = ::std::os::raw::c_ulong; -pub type __fsfilcnt64_t = ::std::os::raw::c_ulong; -pub type __fsword_t = ::std::os::raw::c_long; -pub type __ssize_t = ::std::os::raw::c_long; -pub type __syscall_slong_t = ::std::os::raw::c_long; -pub type __syscall_ulong_t = ::std::os::raw::c_ulong; -pub type __loff_t = __off64_t; -pub type __caddr_t = *mut ::std::os::raw::c_char; -pub type __intptr_t = ::std::os::raw::c_long; -pub type __socklen_t = ::std::os::raw::c_uint; -pub type __sig_atomic_t = ::std::os::raw::c_int; -pub type u_char = __u_char; -pub type u_short = __u_short; -pub type u_int = __u_int; -pub type u_long = 
__u_long; -pub type quad_t = __quad_t; -pub type u_quad_t = __u_quad_t; -pub type fsid_t = __fsid_t; -pub type loff_t = __loff_t; -pub type ino_t = __ino_t; -pub type dev_t = __dev_t; -pub type gid_t = __gid_t; -pub type mode_t = __mode_t; -pub type nlink_t = __nlink_t; -pub type uid_t = __uid_t; -pub type off_t = __off_t; -pub type pid_t = __pid_t; -pub type id_t = __id_t; -pub type ssize_t = __ssize_t; -pub type daddr_t = __daddr_t; -pub type caddr_t = __caddr_t; -pub type key_t = __key_t; -pub type clock_t = __clock_t; -pub type clockid_t = __clockid_t; -pub type time_t = __time_t; -pub type timer_t = __timer_t; -pub type ulong = ::std::os::raw::c_ulong; -pub type ushort = ::std::os::raw::c_ushort; -pub type uint = ::std::os::raw::c_uint; -pub type u_int8_t = __uint8_t; -pub type u_int16_t = __uint16_t; -pub type u_int32_t = __uint32_t; -pub type u_int64_t = __uint64_t; -pub type register_t = ::std::os::raw::c_long; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct __sigset_t { - pub __val: [::std::os::raw::c_ulong; 16usize], -} -pub type sigset_t = __sigset_t; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct timeval { - pub tv_sec: __time_t, - pub tv_usec: __suseconds_t, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct timespec { - pub tv_sec: __time_t, - pub tv_nsec: __syscall_slong_t, -} -pub type suseconds_t = __suseconds_t; -pub type __fd_mask = ::std::os::raw::c_long; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct fd_set { - pub __fds_bits: [__fd_mask; 16usize], -} -pub type fd_mask = __fd_mask; -extern "C" { - pub fn select( - __nfds: ::std::os::raw::c_int, - __readfds: *mut fd_set, - __writefds: *mut fd_set, - __exceptfds: *mut fd_set, - __timeout: *mut timeval, - ) -> ::std::os::raw::c_int; -} -extern "C" { - pub fn pselect( - __nfds: ::std::os::raw::c_int, - __readfds: *mut fd_set, - __writefds: *mut fd_set, - __exceptfds: *mut fd_set, - __timeout: *const timespec, - __sigmask: *const __sigset_t, - ) -> 
::std::os::raw::c_int; -} -pub type blksize_t = __blksize_t; -pub type blkcnt_t = __blkcnt_t; -pub type fsblkcnt_t = __fsblkcnt_t; -pub type fsfilcnt_t = __fsfilcnt_t; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct __pthread_internal_list { - pub __prev: *mut __pthread_internal_list, - pub __next: *mut __pthread_internal_list, -} -pub type __pthread_list_t = __pthread_internal_list; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct __pthread_internal_slist { - pub __next: *mut __pthread_internal_slist, -} -pub type __pthread_slist_t = __pthread_internal_slist; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct __pthread_mutex_s { - pub __lock: ::std::os::raw::c_int, - pub __count: ::std::os::raw::c_uint, - pub __owner: ::std::os::raw::c_int, - pub __nusers: ::std::os::raw::c_uint, - pub __kind: ::std::os::raw::c_int, - pub __spins: ::std::os::raw::c_short, - pub __elision: ::std::os::raw::c_short, - pub __list: __pthread_list_t, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct __pthread_rwlock_arch_t { - pub __readers: ::std::os::raw::c_uint, - pub __writers: ::std::os::raw::c_uint, - pub __wrphase_futex: ::std::os::raw::c_uint, - pub __writers_futex: ::std::os::raw::c_uint, - pub __pad3: ::std::os::raw::c_uint, - pub __pad4: ::std::os::raw::c_uint, - pub __cur_writer: ::std::os::raw::c_int, - pub __shared: ::std::os::raw::c_int, - pub __rwelision: ::std::os::raw::c_schar, - pub __pad1: [::std::os::raw::c_uchar; 7usize], - pub __pad2: ::std::os::raw::c_ulong, - pub __flags: ::std::os::raw::c_uint, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct __pthread_cond_s { - pub __bindgen_anon_1: __pthread_cond_s__bindgen_ty_1, - pub __bindgen_anon_2: __pthread_cond_s__bindgen_ty_2, - pub __g_refs: [::std::os::raw::c_uint; 2usize], - pub __g_size: [::std::os::raw::c_uint; 2usize], - pub __g1_orig_size: ::std::os::raw::c_uint, - pub __wrefs: ::std::os::raw::c_uint, - pub __g_signals: [::std::os::raw::c_uint; 2usize], -} -#[repr(C)] -#[derive(Copy, 
Clone)] -pub union __pthread_cond_s__bindgen_ty_1 { - pub __wseq: ::std::os::raw::c_ulonglong, - pub __wseq32: __pthread_cond_s__bindgen_ty_1__bindgen_ty_1, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct __pthread_cond_s__bindgen_ty_1__bindgen_ty_1 { - pub __low: ::std::os::raw::c_uint, - pub __high: ::std::os::raw::c_uint, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub union __pthread_cond_s__bindgen_ty_2 { - pub __g1_start: ::std::os::raw::c_ulonglong, - pub __g1_start32: __pthread_cond_s__bindgen_ty_2__bindgen_ty_1, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct __pthread_cond_s__bindgen_ty_2__bindgen_ty_1 { - pub __low: ::std::os::raw::c_uint, - pub __high: ::std::os::raw::c_uint, -} -pub type __tss_t = ::std::os::raw::c_uint; -pub type __thrd_t = ::std::os::raw::c_ulong; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct __once_flag { - pub __data: ::std::os::raw::c_int, -} -pub type pthread_t = ::std::os::raw::c_ulong; -#[repr(C)] -#[derive(Copy, Clone)] -pub union pthread_mutexattr_t { - pub __size: [::std::os::raw::c_char; 4usize], - pub __align: ::std::os::raw::c_int, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub union pthread_condattr_t { - pub __size: [::std::os::raw::c_char; 4usize], - pub __align: ::std::os::raw::c_int, -} -pub type pthread_key_t = ::std::os::raw::c_uint; -pub type pthread_once_t = ::std::os::raw::c_int; -#[repr(C)] -#[derive(Copy, Clone)] -pub union pthread_attr_t { - pub __size: [::std::os::raw::c_char; 56usize], - pub __align: ::std::os::raw::c_long, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub union pthread_mutex_t { - pub __data: __pthread_mutex_s, - pub __size: [::std::os::raw::c_char; 40usize], - pub __align: ::std::os::raw::c_long, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub union pthread_cond_t { - pub __data: __pthread_cond_s, - pub __size: [::std::os::raw::c_char; 48usize], - pub __align: ::std::os::raw::c_longlong, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub union pthread_rwlock_t { - pub __data: 
__pthread_rwlock_arch_t, - pub __size: [::std::os::raw::c_char; 56usize], - pub __align: ::std::os::raw::c_long, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub union pthread_rwlockattr_t { - pub __size: [::std::os::raw::c_char; 8usize], - pub __align: ::std::os::raw::c_long, -} -pub type pthread_spinlock_t = ::std::os::raw::c_int; -#[repr(C)] -#[derive(Copy, Clone)] -pub union pthread_barrier_t { - pub __size: [::std::os::raw::c_char; 32usize], - pub __align: ::std::os::raw::c_long, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub union pthread_barrierattr_t { - pub __size: [::std::os::raw::c_char; 4usize], - pub __align: ::std::os::raw::c_int, -} -pub type socklen_t = __socklen_t; -pub const __socket_type_SOCK_STREAM: __socket_type = 1; -pub const __socket_type_SOCK_DGRAM: __socket_type = 2; -pub const __socket_type_SOCK_RAW: __socket_type = 3; -pub const __socket_type_SOCK_RDM: __socket_type = 4; -pub const __socket_type_SOCK_SEQPACKET: __socket_type = 5; -pub const __socket_type_SOCK_DCCP: __socket_type = 6; -pub const __socket_type_SOCK_PACKET: __socket_type = 10; -pub const __socket_type_SOCK_CLOEXEC: __socket_type = 524288; -pub const __socket_type_SOCK_NONBLOCK: __socket_type = 2048; -pub type __socket_type = ::std::os::raw::c_uint; -pub type sa_family_t = ::std::os::raw::c_ushort; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct sockaddr { - pub sa_family: sa_family_t, - pub sa_data: [::std::os::raw::c_uchar; 14usize], -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct sockaddr_storage { - pub ss_family: sa_family_t, - pub __ss_padding: [::std::os::raw::c_char; 118usize], - pub __ss_align: ::std::os::raw::c_ulong, -} -pub const MSG_OOB: ::std::os::raw::c_uint = 1; -pub const MSG_PEEK: ::std::os::raw::c_uint = 2; -pub const MSG_DONTROUTE: ::std::os::raw::c_uint = 4; -pub const MSG_CTRUNC: ::std::os::raw::c_uint = 8; -pub const MSG_PROXY: ::std::os::raw::c_uint = 16; -pub const MSG_TRUNC: ::std::os::raw::c_uint = 32; -pub const MSG_DONTWAIT: 
::std::os::raw::c_uint = 64; -pub const MSG_EOR: ::std::os::raw::c_uint = 128; -pub const MSG_WAITALL: ::std::os::raw::c_uint = 256; -pub const MSG_FIN: ::std::os::raw::c_uint = 512; -pub const MSG_SYN: ::std::os::raw::c_uint = 1024; -pub const MSG_CONFIRM: ::std::os::raw::c_uint = 2048; -pub const MSG_RST: ::std::os::raw::c_uint = 4096; -pub const MSG_ERRQUEUE: ::std::os::raw::c_uint = 8192; -pub const MSG_NOSIGNAL: ::std::os::raw::c_uint = 16384; -pub const MSG_MORE: ::std::os::raw::c_uint = 32768; -pub const MSG_WAITFORONE: ::std::os::raw::c_uint = 65536; -pub const MSG_BATCH: ::std::os::raw::c_uint = 262144; -pub const MSG_ZEROCOPY: ::std::os::raw::c_uint = 67108864; -pub const MSG_FASTOPEN: ::std::os::raw::c_uint = 536870912; -pub const MSG_CMSG_CLOEXEC: ::std::os::raw::c_uint = 1073741824; -pub type _bindgen_ty_1 = ::std::os::raw::c_uint; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct msghdr { - pub msg_name: *mut ::std::os::raw::c_void, - pub msg_namelen: socklen_t, - pub msg_iov: *mut iovec, - pub msg_iovlen: size_t, - pub msg_control: *mut ::std::os::raw::c_void, - pub msg_controllen: size_t, - pub msg_flags: ::std::os::raw::c_int, -} -#[repr(C)] -#[derive(Debug)] -pub struct cmsghdr { - pub cmsg_len: size_t, - pub cmsg_level: ::std::os::raw::c_int, - pub cmsg_type: ::std::os::raw::c_int, - pub __cmsg_data: __IncompleteArrayField<::std::os::raw::c_uchar>, -} -extern "C" { - pub fn __cmsg_nxthdr(__mhdr: *mut msghdr, __cmsg: *mut cmsghdr) -> *mut cmsghdr; -} -pub const SCM_RIGHTS: ::std::os::raw::c_uint = 1; -pub type _bindgen_ty_2 = ::std::os::raw::c_uint; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct linger { - pub l_onoff: ::std::os::raw::c_int, - pub l_linger: ::std::os::raw::c_int, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct osockaddr { - pub sa_family: ::std::os::raw::c_ushort, - pub sa_data: [::std::os::raw::c_uchar; 14usize], -} -pub const SHUT_RD: ::std::os::raw::c_uint = 0; -pub const SHUT_WR: ::std::os::raw::c_uint = 
1; -pub const SHUT_RDWR: ::std::os::raw::c_uint = 2; -pub type _bindgen_ty_3 = ::std::os::raw::c_uint; -extern "C" { - pub fn socket( - __domain: ::std::os::raw::c_int, - __type: ::std::os::raw::c_int, - __protocol: ::std::os::raw::c_int, - ) -> ::std::os::raw::c_int; -} -extern "C" { - pub fn socketpair( - __domain: ::std::os::raw::c_int, - __type: ::std::os::raw::c_int, - __protocol: ::std::os::raw::c_int, - __fds: *mut ::std::os::raw::c_int, - ) -> ::std::os::raw::c_int; -} -extern "C" { - pub fn bind( - __fd: ::std::os::raw::c_int, - __addr: *const sockaddr, - __len: socklen_t, - ) -> ::std::os::raw::c_int; -} -extern "C" { - pub fn getsockname( - __fd: ::std::os::raw::c_int, - __addr: *mut sockaddr, - __len: *mut socklen_t, - ) -> ::std::os::raw::c_int; -} -extern "C" { - pub fn connect( - __fd: ::std::os::raw::c_int, - __addr: *const sockaddr, - __len: socklen_t, - ) -> ::std::os::raw::c_int; -} -extern "C" { - pub fn getpeername( - __fd: ::std::os::raw::c_int, - __addr: *mut sockaddr, - __len: *mut socklen_t, - ) -> ::std::os::raw::c_int; -} -extern "C" { - pub fn send( - __fd: ::std::os::raw::c_int, - __buf: *const ::std::os::raw::c_void, - __n: size_t, - __flags: ::std::os::raw::c_int, - ) -> ssize_t; -} -extern "C" { - pub fn recv( - __fd: ::std::os::raw::c_int, - __buf: *mut ::std::os::raw::c_void, - __n: size_t, - __flags: ::std::os::raw::c_int, - ) -> ssize_t; -} -extern "C" { - pub fn sendto( - __fd: ::std::os::raw::c_int, - __buf: *const ::std::os::raw::c_void, - __n: size_t, - __flags: ::std::os::raw::c_int, - __addr: *const sockaddr, - __addr_len: socklen_t, - ) -> ssize_t; -} -extern "C" { - pub fn recvfrom( - __fd: ::std::os::raw::c_int, - __buf: *mut ::std::os::raw::c_void, - __n: size_t, - __flags: ::std::os::raw::c_int, - __addr: *mut sockaddr, - __addr_len: *mut socklen_t, - ) -> ssize_t; -} -extern "C" { - pub fn sendmsg( - __fd: ::std::os::raw::c_int, - __message: *const msghdr, - __flags: ::std::os::raw::c_int, - ) -> ssize_t; -} -extern 
"C" { - pub fn recvmsg( - __fd: ::std::os::raw::c_int, - __message: *mut msghdr, - __flags: ::std::os::raw::c_int, - ) -> ssize_t; -} -extern "C" { - pub fn getsockopt( - __fd: ::std::os::raw::c_int, - __level: ::std::os::raw::c_int, - __optname: ::std::os::raw::c_int, - __optval: *mut ::std::os::raw::c_void, - __optlen: *mut socklen_t, - ) -> ::std::os::raw::c_int; -} -extern "C" { - pub fn setsockopt( - __fd: ::std::os::raw::c_int, - __level: ::std::os::raw::c_int, - __optname: ::std::os::raw::c_int, - __optval: *const ::std::os::raw::c_void, - __optlen: socklen_t, - ) -> ::std::os::raw::c_int; -} -extern "C" { - pub fn listen(__fd: ::std::os::raw::c_int, __n: ::std::os::raw::c_int) - -> ::std::os::raw::c_int; -} -extern "C" { - pub fn accept( - __fd: ::std::os::raw::c_int, - __addr: *mut sockaddr, - __addr_len: *mut socklen_t, - ) -> ::std::os::raw::c_int; -} -extern "C" { - pub fn shutdown( - __fd: ::std::os::raw::c_int, - __how: ::std::os::raw::c_int, - ) -> ::std::os::raw::c_int; -} -extern "C" { - pub fn sockatmark(__fd: ::std::os::raw::c_int) -> ::std::os::raw::c_int; -} -extern "C" { - pub fn isfdtype( - __fd: ::std::os::raw::c_int, - __fdtype: ::std::os::raw::c_int, - ) -> ::std::os::raw::c_int; -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct sync_serial_settings { - pub clock_rate: ::std::os::raw::c_uint, - pub clock_type: ::std::os::raw::c_uint, - pub loopback: ::std::os::raw::c_ushort, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct te1_settings { - pub clock_rate: ::std::os::raw::c_uint, - pub clock_type: ::std::os::raw::c_uint, - pub loopback: ::std::os::raw::c_ushort, - pub slot_map: ::std::os::raw::c_uint, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct raw_hdlc_proto { - pub encoding: ::std::os::raw::c_ushort, - pub parity: ::std::os::raw::c_ushort, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct fr_proto { - pub t391: ::std::os::raw::c_uint, - pub t392: ::std::os::raw::c_uint, - pub n391: 
::std::os::raw::c_uint, - pub n392: ::std::os::raw::c_uint, - pub n393: ::std::os::raw::c_uint, - pub lmi: ::std::os::raw::c_ushort, - pub dce: ::std::os::raw::c_ushort, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct fr_proto_pvc { - pub dlci: ::std::os::raw::c_uint, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct fr_proto_pvc_info { - pub dlci: ::std::os::raw::c_uint, - pub master: [::std::os::raw::c_char; 16usize], -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct cisco_proto { - pub interval: ::std::os::raw::c_uint, - pub timeout: ::std::os::raw::c_uint, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct x25_hdlc_proto { - pub dce: ::std::os::raw::c_ushort, - pub modulo: ::std::os::raw::c_uint, - pub window: ::std::os::raw::c_uint, - pub t1: ::std::os::raw::c_uint, - pub t2: ::std::os::raw::c_uint, - pub n2: ::std::os::raw::c_uint, -} -pub const net_device_flags_IFF_UP: net_device_flags = 1; -pub const net_device_flags_IFF_BROADCAST: net_device_flags = 2; -pub const net_device_flags_IFF_DEBUG: net_device_flags = 4; -pub const net_device_flags_IFF_LOOPBACK: net_device_flags = 8; -pub const net_device_flags_IFF_POINTOPOINT: net_device_flags = 16; -pub const net_device_flags_IFF_NOTRAILERS: net_device_flags = 32; -pub const net_device_flags_IFF_RUNNING: net_device_flags = 64; -pub const net_device_flags_IFF_NOARP: net_device_flags = 128; -pub const net_device_flags_IFF_PROMISC: net_device_flags = 256; -pub const net_device_flags_IFF_ALLMULTI: net_device_flags = 512; -pub const net_device_flags_IFF_MASTER: net_device_flags = 1024; -pub const net_device_flags_IFF_SLAVE: net_device_flags = 2048; -pub const net_device_flags_IFF_MULTICAST: net_device_flags = 4096; -pub const net_device_flags_IFF_PORTSEL: net_device_flags = 8192; -pub const net_device_flags_IFF_AUTOMEDIA: net_device_flags = 16384; -pub const net_device_flags_IFF_DYNAMIC: net_device_flags = 32768; -pub const net_device_flags_IFF_LOWER_UP: net_device_flags = 65536; -pub 
const net_device_flags_IFF_DORMANT: net_device_flags = 131072; -pub const net_device_flags_IFF_ECHO: net_device_flags = 262144; -#[doc = " enum net_device_flags - &struct net_device flags"] -#[doc = ""] -#[doc = " These are the &struct net_device flags, they can be set by drivers, the"] -#[doc = " kernel and some can be triggered by userspace. Userspace can query and"] -#[doc = " set these flags using userspace utilities but there is also a sysfs"] -#[doc = " entry available for all dev flags which can be queried and set. These flags"] -#[doc = " are shared for all types of net_devices. The sysfs entries are available"] -#[doc = " via /sys/class/net//flags. Flags which can be toggled through sysfs"] -#[doc = " are annotated below, note that only a few flags can be toggled and some"] -#[doc = " other flags are always preserved from the original net_device flags"] -#[doc = " even if you try to set them via sysfs. Flags which are always preserved"] -#[doc = " are kept under the flag grouping @IFF_VOLATILE. Flags which are __volatile__"] -#[doc = " are annotated below as such."] -#[doc = ""] -#[doc = " You should have a pretty good reason to be extending these flags."] -#[doc = ""] -#[doc = " @IFF_UP: interface is up. Can be toggled through sysfs."] -#[doc = " @IFF_BROADCAST: broadcast address valid. Volatile."] -#[doc = " @IFF_DEBUG: turn on debugging. Can be toggled through sysfs."] -#[doc = " @IFF_LOOPBACK: is a loopback net. Volatile."] -#[doc = " @IFF_POINTOPOINT: interface is has p-p link. Volatile."] -#[doc = " @IFF_NOTRAILERS: avoid use of trailers. Can be toggled through sysfs."] -#[doc = "\tVolatile."] -#[doc = " @IFF_RUNNING: interface RFC2863 OPER_UP. Volatile."] -#[doc = " @IFF_NOARP: no ARP protocol. Can be toggled through sysfs. Volatile."] -#[doc = " @IFF_PROMISC: receive all packets. Can be toggled through sysfs."] -#[doc = " @IFF_ALLMULTI: receive all multicast packets. 
Can be toggled through"] -#[doc = "\tsysfs."] -#[doc = " @IFF_MASTER: master of a load balancer. Volatile."] -#[doc = " @IFF_SLAVE: slave of a load balancer. Volatile."] -#[doc = " @IFF_MULTICAST: Supports multicast. Can be toggled through sysfs."] -#[doc = " @IFF_PORTSEL: can set media type. Can be toggled through sysfs."] -#[doc = " @IFF_AUTOMEDIA: auto media select active. Can be toggled through sysfs."] -#[doc = " @IFF_DYNAMIC: dialup device with changing addresses. Can be toggled"] -#[doc = "\tthrough sysfs."] -#[doc = " @IFF_LOWER_UP: driver signals L1 up. Volatile."] -#[doc = " @IFF_DORMANT: driver signals dormant. Volatile."] -#[doc = " @IFF_ECHO: echo sent packets. Volatile."] -pub type net_device_flags = ::std::os::raw::c_uint; -pub const IF_OPER_UNKNOWN: ::std::os::raw::c_uint = 0; -pub const IF_OPER_NOTPRESENT: ::std::os::raw::c_uint = 1; -pub const IF_OPER_DOWN: ::std::os::raw::c_uint = 2; -pub const IF_OPER_LOWERLAYERDOWN: ::std::os::raw::c_uint = 3; -pub const IF_OPER_TESTING: ::std::os::raw::c_uint = 4; -pub const IF_OPER_DORMANT: ::std::os::raw::c_uint = 5; -pub const IF_OPER_UP: ::std::os::raw::c_uint = 6; -pub type _bindgen_ty_4 = ::std::os::raw::c_uint; -pub const IF_LINK_MODE_DEFAULT: ::std::os::raw::c_uint = 0; -pub const IF_LINK_MODE_DORMANT: ::std::os::raw::c_uint = 1; -pub const IF_LINK_MODE_TESTING: ::std::os::raw::c_uint = 2; -pub type _bindgen_ty_5 = ::std::os::raw::c_uint; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct ifmap { - pub mem_start: ::std::os::raw::c_ulong, - pub mem_end: ::std::os::raw::c_ulong, - pub base_addr: ::std::os::raw::c_ushort, - pub irq: ::std::os::raw::c_uchar, - pub dma: ::std::os::raw::c_uchar, - pub port: ::std::os::raw::c_uchar, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct if_settings { - pub type_: ::std::os::raw::c_uint, - pub size: ::std::os::raw::c_uint, - pub ifs_ifsu: if_settings__bindgen_ty_1, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub union if_settings__bindgen_ty_1 { - pub raw_hdlc: 
*mut raw_hdlc_proto, - pub cisco: *mut cisco_proto, - pub fr: *mut fr_proto, - pub fr_pvc: *mut fr_proto_pvc, - pub fr_pvc_info: *mut fr_proto_pvc_info, - pub x25: *mut x25_hdlc_proto, - pub sync: *mut sync_serial_settings, - pub te1: *mut te1_settings, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct ifreq { - pub ifr_ifrn: ifreq__bindgen_ty_1, - pub ifr_ifru: ifreq__bindgen_ty_2, -} - -impl Default for ifreq { - fn default() -> Self { - // SAFETY: all zeros is a valid pattern for this data type - unsafe { std::mem::zeroed() } - } -} -#[repr(C)] -#[derive(Copy, Clone)] -pub union ifreq__bindgen_ty_1 { - pub ifrn_name: [::std::os::raw::c_uchar; 16usize], -} -#[repr(C)] -#[derive(Copy, Clone)] -pub union ifreq__bindgen_ty_2 { - pub ifru_addr: sockaddr, - pub ifru_dstaddr: sockaddr, - pub ifru_broadaddr: sockaddr, - pub ifru_netmask: sockaddr, - pub ifru_hwaddr: sockaddr, - pub ifru_flags: ::std::os::raw::c_short, - pub ifru_ivalue: ::std::os::raw::c_int, - pub ifru_mtu: ::std::os::raw::c_int, - pub ifru_map: ifmap, - pub ifru_slave: [::std::os::raw::c_uchar; 16usize], - pub ifru_newname: [::std::os::raw::c_uchar; 16usize], - pub ifru_data: *mut ::std::os::raw::c_void, - pub ifru_settings: if_settings, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct ifconf { - pub ifc_len: ::std::os::raw::c_int, - pub ifc_ifcu: ifconf__bindgen_ty_1, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub union ifconf__bindgen_ty_1 { - pub ifcu_buf: *mut ::std::os::raw::c_char, - pub ifcu_req: *mut ifreq, -} diff --git a/net_gen/src/inn.rs b/net_gen/src/inn.rs deleted file mode 100644 index f7a4e508a4..0000000000 --- a/net_gen/src/inn.rs +++ /dev/null @@ -1,294 +0,0 @@ -// Copyright © 2021 Intel Corporation -// -// SPDX-License-Identifier: Apache-2.0 - -// bindgen /usr/include/linux/in.h --no-layout-tests - -/* automatically generated by rust-bindgen 0.58.1 */ - -pub const __BITS_PER_LONG: u32 = 64; -pub const __FD_SETSIZE: u32 = 1024; -pub const __UAPI_DEF_IF_IFCONF: u32 = 1; -pub const 
__UAPI_DEF_IF_IFMAP: u32 = 1; -pub const __UAPI_DEF_IF_IFNAMSIZ: u32 = 1; -pub const __UAPI_DEF_IF_IFREQ: u32 = 1; -pub const __UAPI_DEF_IF_NET_DEVICE_FLAGS: u32 = 1; -pub const __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO: u32 = 1; -pub const __UAPI_DEF_IN_ADDR: u32 = 1; -pub const __UAPI_DEF_IN_IPPROTO: u32 = 1; -pub const __UAPI_DEF_IN_PKTINFO: u32 = 1; -pub const __UAPI_DEF_IP_MREQ: u32 = 1; -pub const __UAPI_DEF_SOCKADDR_IN: u32 = 1; -pub const __UAPI_DEF_IN_CLASS: u32 = 1; -pub const __UAPI_DEF_IN6_ADDR: u32 = 1; -pub const __UAPI_DEF_IN6_ADDR_ALT: u32 = 1; -pub const __UAPI_DEF_SOCKADDR_IN6: u32 = 1; -pub const __UAPI_DEF_IPV6_MREQ: u32 = 1; -pub const __UAPI_DEF_IPPROTO_V6: u32 = 1; -pub const __UAPI_DEF_IPV6_OPTIONS: u32 = 1; -pub const __UAPI_DEF_IN6_PKTINFO: u32 = 1; -pub const __UAPI_DEF_IP6_MTUINFO: u32 = 1; -pub const __UAPI_DEF_SOCKADDR_IPX: u32 = 1; -pub const __UAPI_DEF_IPX_ROUTE_DEFINITION: u32 = 1; -pub const __UAPI_DEF_IPX_INTERFACE_DEFINITION: u32 = 1; -pub const __UAPI_DEF_IPX_CONFIG_DATA: u32 = 1; -pub const __UAPI_DEF_IPX_ROUTE_DEF: u32 = 1; -pub const __UAPI_DEF_XATTR: u32 = 1; -pub const _K_SS_MAXSIZE: u32 = 128; -pub const IP_TOS: u32 = 1; -pub const IP_TTL: u32 = 2; -pub const IP_HDRINCL: u32 = 3; -pub const IP_OPTIONS: u32 = 4; -pub const IP_ROUTER_ALERT: u32 = 5; -pub const IP_RECVOPTS: u32 = 6; -pub const IP_RETOPTS: u32 = 7; -pub const IP_PKTINFO: u32 = 8; -pub const IP_PKTOPTIONS: u32 = 9; -pub const IP_MTU_DISCOVER: u32 = 10; -pub const IP_RECVERR: u32 = 11; -pub const IP_RECVTTL: u32 = 12; -pub const IP_RECVTOS: u32 = 13; -pub const IP_MTU: u32 = 14; -pub const IP_FREEBIND: u32 = 15; -pub const IP_IPSEC_POLICY: u32 = 16; -pub const IP_XFRM_POLICY: u32 = 17; -pub const IP_PASSSEC: u32 = 18; -pub const IP_TRANSPARENT: u32 = 19; -pub const IP_RECVRETOPTS: u32 = 7; -pub const IP_ORIGDSTADDR: u32 = 20; -pub const IP_RECVORIGDSTADDR: u32 = 20; -pub const IP_MINTTL: u32 = 21; -pub const IP_NODEFRAG: u32 = 22; -pub const 
IP_CHECKSUM: u32 = 23; -pub const IP_BIND_ADDRESS_NO_PORT: u32 = 24; -pub const IP_RECVFRAGSIZE: u32 = 25; -pub const IP_PMTUDISC_DONT: u32 = 0; -pub const IP_PMTUDISC_WANT: u32 = 1; -pub const IP_PMTUDISC_DO: u32 = 2; -pub const IP_PMTUDISC_PROBE: u32 = 3; -pub const IP_PMTUDISC_INTERFACE: u32 = 4; -pub const IP_PMTUDISC_OMIT: u32 = 5; -pub const IP_MULTICAST_IF: u32 = 32; -pub const IP_MULTICAST_TTL: u32 = 33; -pub const IP_MULTICAST_LOOP: u32 = 34; -pub const IP_ADD_MEMBERSHIP: u32 = 35; -pub const IP_DROP_MEMBERSHIP: u32 = 36; -pub const IP_UNBLOCK_SOURCE: u32 = 37; -pub const IP_BLOCK_SOURCE: u32 = 38; -pub const IP_ADD_SOURCE_MEMBERSHIP: u32 = 39; -pub const IP_DROP_SOURCE_MEMBERSHIP: u32 = 40; -pub const IP_MSFILTER: u32 = 41; -pub const MCAST_JOIN_GROUP: u32 = 42; -pub const MCAST_BLOCK_SOURCE: u32 = 43; -pub const MCAST_UNBLOCK_SOURCE: u32 = 44; -pub const MCAST_LEAVE_GROUP: u32 = 45; -pub const MCAST_JOIN_SOURCE_GROUP: u32 = 46; -pub const MCAST_LEAVE_SOURCE_GROUP: u32 = 47; -pub const MCAST_MSFILTER: u32 = 48; -pub const IP_MULTICAST_ALL: u32 = 49; -pub const IP_UNICAST_IF: u32 = 50; -pub const MCAST_EXCLUDE: u32 = 0; -pub const MCAST_INCLUDE: u32 = 1; -pub const IP_DEFAULT_MULTICAST_TTL: u32 = 1; -pub const IP_DEFAULT_MULTICAST_LOOP: u32 = 1; -pub const __SOCK_SIZE__: u32 = 16; -pub const IN_CLASSA_NET: u32 = 4278190080; -pub const IN_CLASSA_NSHIFT: u32 = 24; -pub const IN_CLASSA_HOST: u32 = 16777215; -pub const IN_CLASSA_MAX: u32 = 128; -pub const IN_CLASSB_NET: u32 = 4294901760; -pub const IN_CLASSB_NSHIFT: u32 = 16; -pub const IN_CLASSB_HOST: u32 = 65535; -pub const IN_CLASSB_MAX: u32 = 65536; -pub const IN_CLASSC_NET: u32 = 4294967040; -pub const IN_CLASSC_NSHIFT: u32 = 8; -pub const IN_CLASSC_HOST: u32 = 255; -pub const IN_MULTICAST_NET: u32 = 3758096384; -pub const IN_CLASSE_NET: u32 = 4294967295; -pub const IN_CLASSE_NSHIFT: u32 = 0; -pub const IN_LOOPBACKNET: u32 = 127; -pub const INADDR_LOOPBACK: u32 = 2130706433; -pub const 
INADDR_UNSPEC_GROUP: u32 = 3758096384; -pub const INADDR_ALLHOSTS_GROUP: u32 = 3758096385; -pub const INADDR_ALLRTRS_GROUP: u32 = 3758096386; -pub const INADDR_ALLSNOOPERS_GROUP: u32 = 3758096490; -pub const INADDR_MAX_LOCAL_GROUP: u32 = 3758096639; -pub const __LITTLE_ENDIAN: u32 = 1234; -pub type __s8 = ::std::os::raw::c_schar; -pub type __u8 = ::std::os::raw::c_uchar; -pub type __s16 = ::std::os::raw::c_short; -pub type __u16 = ::std::os::raw::c_ushort; -pub type __s32 = ::std::os::raw::c_int; -pub type __u32 = ::std::os::raw::c_uint; -pub type __s64 = ::std::os::raw::c_longlong; -pub type __u64 = ::std::os::raw::c_ulonglong; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct __kernel_fd_set { - pub fds_bits: [::std::os::raw::c_ulong; 16usize], -} -pub type __kernel_sighandler_t = - ::std::option::Option; -pub type __kernel_key_t = ::std::os::raw::c_int; -pub type __kernel_mqd_t = ::std::os::raw::c_int; -pub type __kernel_old_uid_t = ::std::os::raw::c_ushort; -pub type __kernel_old_gid_t = ::std::os::raw::c_ushort; -pub type __kernel_old_dev_t = ::std::os::raw::c_ulong; -pub type __kernel_long_t = ::std::os::raw::c_long; -pub type __kernel_ulong_t = ::std::os::raw::c_ulong; -pub type __kernel_ino_t = __kernel_ulong_t; -pub type __kernel_mode_t = ::std::os::raw::c_uint; -pub type __kernel_pid_t = ::std::os::raw::c_int; -pub type __kernel_ipc_pid_t = ::std::os::raw::c_int; -pub type __kernel_uid_t = ::std::os::raw::c_uint; -pub type __kernel_gid_t = ::std::os::raw::c_uint; -pub type __kernel_suseconds_t = __kernel_long_t; -pub type __kernel_daddr_t = ::std::os::raw::c_int; -pub type __kernel_uid32_t = ::std::os::raw::c_uint; -pub type __kernel_gid32_t = ::std::os::raw::c_uint; -pub type __kernel_size_t = __kernel_ulong_t; -pub type __kernel_ssize_t = __kernel_long_t; -pub type __kernel_ptrdiff_t = __kernel_long_t; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct __kernel_fsid_t { - pub val: [::std::os::raw::c_int; 2usize], -} -pub type __kernel_off_t = 
__kernel_long_t; -pub type __kernel_loff_t = ::std::os::raw::c_longlong; -pub type __kernel_old_time_t = __kernel_long_t; -pub type __kernel_time_t = __kernel_long_t; -pub type __kernel_time64_t = ::std::os::raw::c_longlong; -pub type __kernel_clock_t = __kernel_long_t; -pub type __kernel_timer_t = ::std::os::raw::c_int; -pub type __kernel_clockid_t = ::std::os::raw::c_int; -pub type __kernel_caddr_t = *mut ::std::os::raw::c_char; -pub type __kernel_uid16_t = ::std::os::raw::c_ushort; -pub type __kernel_gid16_t = ::std::os::raw::c_ushort; -pub type __le16 = __u16; -pub type __be16 = __u16; -pub type __le32 = __u32; -pub type __be32 = __u32; -pub type __le64 = __u64; -pub type __be64 = __u64; -pub type __sum16 = __u16; -pub type __wsum = __u32; -pub type __poll_t = ::std::os::raw::c_uint; -pub type __kernel_sa_family_t = ::std::os::raw::c_ushort; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct __kernel_sockaddr_storage { - pub __bindgen_anon_1: __kernel_sockaddr_storage__bindgen_ty_1, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub union __kernel_sockaddr_storage__bindgen_ty_1 { - pub __bindgen_anon_1: __kernel_sockaddr_storage__bindgen_ty_1__bindgen_ty_1, - pub __align: *mut ::std::os::raw::c_void, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct __kernel_sockaddr_storage__bindgen_ty_1__bindgen_ty_1 { - pub ss_family: __kernel_sa_family_t, - pub __data: [::std::os::raw::c_char; 126usize], -} -pub const IPPROTO_IP: ::std::os::raw::c_uint = 0; -pub const IPPROTO_ICMP: ::std::os::raw::c_uint = 1; -pub const IPPROTO_IGMP: ::std::os::raw::c_uint = 2; -pub const IPPROTO_IPIP: ::std::os::raw::c_uint = 4; -pub const IPPROTO_TCP: ::std::os::raw::c_uint = 6; -pub const IPPROTO_EGP: ::std::os::raw::c_uint = 8; -pub const IPPROTO_PUP: ::std::os::raw::c_uint = 12; -pub const IPPROTO_UDP: ::std::os::raw::c_uint = 17; -pub const IPPROTO_IDP: ::std::os::raw::c_uint = 22; -pub const IPPROTO_TP: ::std::os::raw::c_uint = 29; -pub const IPPROTO_DCCP: ::std::os::raw::c_uint = 33; -pub 
const IPPROTO_IPV6: ::std::os::raw::c_uint = 41; -pub const IPPROTO_RSVP: ::std::os::raw::c_uint = 46; -pub const IPPROTO_GRE: ::std::os::raw::c_uint = 47; -pub const IPPROTO_ESP: ::std::os::raw::c_uint = 50; -pub const IPPROTO_AH: ::std::os::raw::c_uint = 51; -pub const IPPROTO_MTP: ::std::os::raw::c_uint = 92; -pub const IPPROTO_BEETPH: ::std::os::raw::c_uint = 94; -pub const IPPROTO_ENCAP: ::std::os::raw::c_uint = 98; -pub const IPPROTO_PIM: ::std::os::raw::c_uint = 103; -pub const IPPROTO_COMP: ::std::os::raw::c_uint = 108; -pub const IPPROTO_SCTP: ::std::os::raw::c_uint = 132; -pub const IPPROTO_UDPLITE: ::std::os::raw::c_uint = 136; -pub const IPPROTO_MPLS: ::std::os::raw::c_uint = 137; -pub const IPPROTO_ETHERNET: ::std::os::raw::c_uint = 143; -pub const IPPROTO_RAW: ::std::os::raw::c_uint = 255; -pub const IPPROTO_MPTCP: ::std::os::raw::c_uint = 262; -pub const IPPROTO_MAX: ::std::os::raw::c_uint = 263; -pub type _bindgen_ty_1 = ::std::os::raw::c_uint; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct in_addr { - pub s_addr: __be32, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct ip_mreq { - pub imr_multiaddr: in_addr, - pub imr_interface: in_addr, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct ip_mreqn { - pub imr_multiaddr: in_addr, - pub imr_address: in_addr, - pub imr_ifindex: ::std::os::raw::c_int, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct ip_mreq_source { - pub imr_multiaddr: __be32, - pub imr_interface: __be32, - pub imr_sourceaddr: __be32, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct ip_msfilter { - pub imsf_multiaddr: __be32, - pub imsf_interface: __be32, - pub imsf_fmode: __u32, - pub imsf_numsrc: __u32, - pub imsf_slist: [__be32; 1usize], -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct group_req { - pub gr_interface: __u32, - pub gr_group: __kernel_sockaddr_storage, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct group_source_req { - pub gsr_interface: __u32, - pub gsr_group: 
__kernel_sockaddr_storage, - pub gsr_source: __kernel_sockaddr_storage, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct group_filter { - pub gf_interface: __u32, - pub gf_group: __kernel_sockaddr_storage, - pub gf_fmode: __u32, - pub gf_numsrc: __u32, - pub gf_slist: [__kernel_sockaddr_storage; 1usize], -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct in_pktinfo { - pub ipi_ifindex: ::std::os::raw::c_int, - pub ipi_spec_dst: in_addr, - pub ipi_addr: in_addr, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct sockaddr_in { - pub sin_family: __kernel_sa_family_t, - pub sin_port: __be16, - pub sin_addr: in_addr, - pub __pad: [::std::os::raw::c_uchar; 8usize], -} diff --git a/net_gen/src/ipv6.rs b/net_gen/src/ipv6.rs deleted file mode 100644 index 65d9349ec3..0000000000 --- a/net_gen/src/ipv6.rs +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright © 2025 Cloud Hypervisor Authors -// -// SPDX-License-Identifier: Apache-2.0 - -// bindgen /usr/include/linux/ipv6.h --no-layout-tests --constified-enum '*' --allowlist-type 'sockaddr_in6|in6_ifreq' - -/* automatically generated by rust-bindgen 0.71.1 */ - -pub type __u8 = ::std::os::raw::c_uchar; -pub type __u16 = ::std::os::raw::c_ushort; -pub type __u32 = ::std::os::raw::c_uint; -pub type __be16 = __u16; -pub type __be32 = __u32; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct in6_addr { - pub in6_u: in6_addr__bindgen_ty_1, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub union in6_addr__bindgen_ty_1 { - pub u6_addr8: [__u8; 16usize], - pub u6_addr16: [__be16; 8usize], - pub u6_addr32: [__be32; 4usize], -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct sockaddr_in6 { - pub sin6_family: ::std::os::raw::c_ushort, - pub sin6_port: __be16, - pub sin6_flowinfo: __be32, - pub sin6_addr: in6_addr, - pub sin6_scope_id: __u32, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct in6_ifreq { - pub ifr6_addr: in6_addr, - pub ifr6_prefixlen: __u32, - pub ifr6_ifindex: ::std::os::raw::c_int, -} diff --git a/net_gen/src/lib.rs 
b/net_gen/src/lib.rs deleted file mode 100644 index 91a5c8c15d..0000000000 --- a/net_gen/src/lib.rs +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright TUNTAP, 2017 The Chromium OS Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the THIRD-PARTY file. -// -// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause - -#![allow(non_upper_case_globals)] -#![allow(non_camel_case_types)] -#![allow(non_snake_case)] - -// generated with bindgen /usr/include/linux/if.h --no-unstable-rust -// --constified-enum '*' --with-derive-default -- -D __UAPI_DEF_IF_IFNAMSIZ -D -// __UAPI_DEF_IF_NET_DEVICE_FLAGS -D __UAPI_DEF_IF_IFREQ -D __UAPI_DEF_IF_IFMAP -// Name is "iff" to avoid conflicting with "if" keyword. -// Generated against Linux 4.11 to include fix "uapi: fix linux/if.h userspace -// compilation errors". -// Manual fixup of ifrn_name to be of type c_uchar instead of c_char. -pub mod iff; -// generated with bindgen /usr/include/linux/if_tun.h --no-unstable-rust -// --constified-enum '*' --with-derive-default -pub mod if_tun; -// generated with bindgen /usr/include/linux/in.h --no-unstable-rust -// --constified-enum '*' --with-derive-default -// Name is "inn" to avoid conflicting with "in" keyword. 
-pub mod inn; -// generated with bindgen /usr/include/linux/ipv6.h --no-layout-tests --constified-enum '*' -// --allowlist-type 'sockaddr_in6|in6_ifreq' -pub mod ipv6; -// generated with bindgen /usr/include/linux/sockios.h --no-unstable-rust -// --constified-enum '*' --with-derive-default -pub mod sockios; -pub use if_tun::{ - sock_fprog, IFF_MULTI_QUEUE, IFF_NO_PI, IFF_TAP, IFF_VNET_HDR, TUN_F_CSUM, TUN_F_TSO4, - TUN_F_TSO6, TUN_F_TSO_ECN, TUN_F_UFO, -}; -pub use iff::{ifreq, net_device_flags_IFF_UP, setsockopt, sockaddr, AF_INET}; -pub use inn::sockaddr_in; -pub use ipv6::{in6_ifreq, sockaddr_in6}; -use vmm_sys_util::{ioctl_ior_nr, ioctl_iow_nr}; - -pub const TUNTAP: ::std::os::raw::c_uint = 84; - -ioctl_iow_nr!(TUNSETNOCSUM, TUNTAP, 200, ::std::os::raw::c_int); -ioctl_iow_nr!(TUNSETDEBUG, TUNTAP, 201, ::std::os::raw::c_int); -ioctl_iow_nr!(TUNSETIFF, TUNTAP, 202, ::std::os::raw::c_int); -ioctl_iow_nr!(TUNSETPERSIST, TUNTAP, 203, ::std::os::raw::c_int); -ioctl_iow_nr!(TUNSETOWNER, TUNTAP, 204, ::std::os::raw::c_int); -ioctl_iow_nr!(TUNSETLINK, TUNTAP, 205, ::std::os::raw::c_int); -ioctl_iow_nr!(TUNSETGROUP, TUNTAP, 206, ::std::os::raw::c_int); -ioctl_ior_nr!(TUNGETFEATURES, TUNTAP, 207, ::std::os::raw::c_uint); -ioctl_iow_nr!(TUNSETOFFLOAD, TUNTAP, 208, ::std::os::raw::c_uint); -ioctl_iow_nr!(TUNSETTXFILTER, TUNTAP, 209, ::std::os::raw::c_uint); -ioctl_ior_nr!(TUNGETIFF, TUNTAP, 210, ::std::os::raw::c_uint); -ioctl_ior_nr!(TUNGETSNDBUF, TUNTAP, 211, ::std::os::raw::c_int); -ioctl_iow_nr!(TUNSETSNDBUF, TUNTAP, 212, ::std::os::raw::c_int); -ioctl_iow_nr!(TUNATTACHFILTER, TUNTAP, 213, sock_fprog); -ioctl_iow_nr!(TUNDETACHFILTER, TUNTAP, 214, sock_fprog); -ioctl_ior_nr!(TUNGETVNETHDRSZ, TUNTAP, 215, ::std::os::raw::c_int); -ioctl_iow_nr!(TUNSETVNETHDRSZ, TUNTAP, 216, ::std::os::raw::c_int); -ioctl_iow_nr!(TUNSETQUEUE, TUNTAP, 217, ::std::os::raw::c_int); -ioctl_iow_nr!(TUNSETIFINDEX, TUNTAP, 218, ::std::os::raw::c_uint); -ioctl_ior_nr!(TUNGETFILTER, TUNTAP, 219, 
sock_fprog); -ioctl_iow_nr!(TUNSETVNETLE, TUNTAP, 220, ::std::os::raw::c_int); -ioctl_ior_nr!(TUNGETVNETLE, TUNTAP, 221, ::std::os::raw::c_int); -ioctl_iow_nr!(TUNSETVNETBE, TUNTAP, 222, ::std::os::raw::c_int); -ioctl_ior_nr!(TUNGETVNETBE, TUNTAP, 223, ::std::os::raw::c_int); diff --git a/net_gen/src/sockios.rs b/net_gen/src/sockios.rs deleted file mode 100644 index ad3dee7c8a..0000000000 --- a/net_gen/src/sockios.rs +++ /dev/null @@ -1,95 +0,0 @@ -// Copyright © 2021 Intel Corporation -// -// SPDX-License-Identifier: Apache-2.0 -// - -// bindgen /usr/include/linux/sockios.h --no-layout-tests - -/* automatically generated by rust-bindgen 0.58.1 */ - -pub const __BITS_PER_LONG: u32 = 64; -pub const FIOSETOWN: u32 = 35073; -pub const SIOCSPGRP: u32 = 35074; -pub const FIOGETOWN: u32 = 35075; -pub const SIOCGPGRP: u32 = 35076; -pub const SIOCATMARK: u32 = 35077; -pub const SIOCGSTAMP_OLD: u32 = 35078; -pub const SIOCGSTAMPNS_OLD: u32 = 35079; -pub const SOCK_IOC_TYPE: u32 = 137; -pub const SIOCGSTAMP: u32 = 35078; -pub const SIOCGSTAMPNS: u32 = 35079; -pub const SIOCADDRT: u32 = 35083; -pub const SIOCDELRT: u32 = 35084; -pub const SIOCRTMSG: u32 = 35085; -pub const SIOCGIFNAME: u32 = 35088; -pub const SIOCSIFLINK: u32 = 35089; -pub const SIOCGIFCONF: u32 = 35090; -pub const SIOCGIFFLAGS: u32 = 35091; -pub const SIOCSIFFLAGS: u32 = 35092; -pub const SIOCGIFADDR: u32 = 35093; -pub const SIOCSIFADDR: u32 = 35094; -pub const SIOCGIFDSTADDR: u32 = 35095; -pub const SIOCSIFDSTADDR: u32 = 35096; -pub const SIOCGIFBRDADDR: u32 = 35097; -pub const SIOCSIFBRDADDR: u32 = 35098; -pub const SIOCGIFNETMASK: u32 = 35099; -pub const SIOCSIFNETMASK: u32 = 35100; -pub const SIOCGIFMETRIC: u32 = 35101; -pub const SIOCSIFMETRIC: u32 = 35102; -pub const SIOCGIFMEM: u32 = 35103; -pub const SIOCSIFMEM: u32 = 35104; -pub const SIOCGIFMTU: u32 = 35105; -pub const SIOCSIFMTU: u32 = 35106; -pub const SIOCSIFNAME: u32 = 35107; -pub const SIOCSIFHWADDR: u32 = 35108; -pub const SIOCGIFENCAP: u32 = 
35109; -pub const SIOCSIFENCAP: u32 = 35110; -pub const SIOCGIFHWADDR: u32 = 35111; -pub const SIOCGIFSLAVE: u32 = 35113; -pub const SIOCSIFSLAVE: u32 = 35120; -pub const SIOCADDMULTI: u32 = 35121; -pub const SIOCDELMULTI: u32 = 35122; -pub const SIOCGIFINDEX: u32 = 35123; -pub const SIOGIFINDEX: u32 = 35123; -pub const SIOCSIFPFLAGS: u32 = 35124; -pub const SIOCGIFPFLAGS: u32 = 35125; -pub const SIOCDIFADDR: u32 = 35126; -pub const SIOCSIFHWBROADCAST: u32 = 35127; -pub const SIOCGIFCOUNT: u32 = 35128; -pub const SIOCGIFBR: u32 = 35136; -pub const SIOCSIFBR: u32 = 35137; -pub const SIOCGIFTXQLEN: u32 = 35138; -pub const SIOCSIFTXQLEN: u32 = 35139; -pub const SIOCETHTOOL: u32 = 35142; -pub const SIOCGMIIPHY: u32 = 35143; -pub const SIOCGMIIREG: u32 = 35144; -pub const SIOCSMIIREG: u32 = 35145; -pub const SIOCWANDEV: u32 = 35146; -pub const SIOCOUTQNSD: u32 = 35147; -pub const SIOCGSKNS: u32 = 35148; -pub const SIOCDARP: u32 = 35155; -pub const SIOCGARP: u32 = 35156; -pub const SIOCSARP: u32 = 35157; -pub const SIOCDRARP: u32 = 35168; -pub const SIOCGRARP: u32 = 35169; -pub const SIOCSRARP: u32 = 35170; -pub const SIOCGIFMAP: u32 = 35184; -pub const SIOCSIFMAP: u32 = 35185; -pub const SIOCADDDLCI: u32 = 35200; -pub const SIOCDELDLCI: u32 = 35201; -pub const SIOCGIFVLAN: u32 = 35202; -pub const SIOCSIFVLAN: u32 = 35203; -pub const SIOCBONDENSLAVE: u32 = 35216; -pub const SIOCBONDRELEASE: u32 = 35217; -pub const SIOCBONDSETHWADDR: u32 = 35218; -pub const SIOCBONDSLAVEINFOQUERY: u32 = 35219; -pub const SIOCBONDINFOQUERY: u32 = 35220; -pub const SIOCBONDCHANGEACTIVE: u32 = 35221; -pub const SIOCBRADDBR: u32 = 35232; -pub const SIOCBRDELBR: u32 = 35233; -pub const SIOCBRADDIF: u32 = 35234; -pub const SIOCBRDELIF: u32 = 35235; -pub const SIOCSHWTSTAMP: u32 = 35248; -pub const SIOCGHWTSTAMP: u32 = 35249; -pub const SIOCDEVPRIVATE: u32 = 35312; -pub const SIOCPROTOPRIVATE: u32 = 35296; diff --git a/net_util/Cargo.toml b/net_util/Cargo.toml index fccb89320f..03a40defce 100644 
--- a/net_util/Cargo.toml +++ b/net_util/Cargo.toml @@ -10,7 +10,6 @@ epoll = { workspace = true } getrandom = "0.4.2" libc = { workspace = true } log = { workspace = true } -net_gen = { path = "../net_gen" } rate_limiter = { path = "../rate_limiter" } serde = { workspace = true, features = ["derive"] } thiserror = { workspace = true } diff --git a/net_util/src/lib.rs b/net_util/src/lib.rs index 6cf5791507..7152c1676f 100644 --- a/net_util/src/lib.rs +++ b/net_util/src/lib.rs @@ -59,15 +59,15 @@ unsafe impl ByteValued for VirtioNetConfig {} /// Create a sockaddr_in from an IPv4 address, and expose it as /// an opaque sockaddr suitable for usage by socket ioctls. -fn create_sockaddr(ip_addr: net::Ipv4Addr) -> net_gen::sockaddr { - // IPv4 addresses big-endian (network order), but Ipv4Addr will give us - // a view of those bytes directly so we can avoid any endian trickiness. - let addr_in = net_gen::sockaddr_in { - sin_family: net_gen::AF_INET as u16, +fn create_sockaddr(ip_addr: net::Ipv4Addr) -> libc::sockaddr { + let addr_in = libc::sockaddr_in { + sin_family: libc::AF_INET as u16, sin_port: 0, - // SAFETY: ip_addr can be safely transmute to in_addr - sin_addr: unsafe { mem::transmute::<[u8; 4], net_gen::inn::in_addr>(ip_addr.octets()) }, - __pad: [0; 8usize], + sin_addr: libc::in_addr { + // Use network byte order (big endian). 
+ s_addr: ip_addr.to_bits().to_be(), + }, + sin_zero: [0; 8], }; // SAFETY: addr_in can be safely transmute to sockaddr @@ -167,19 +167,19 @@ pub fn build_net_config_space_with_mq( pub fn virtio_features_to_tap_offload(features: u64) -> c_uint { let mut tap_offloads: c_uint = 0; if features & (1 << VIRTIO_NET_F_GUEST_CSUM) != 0 { - tap_offloads |= net_gen::TUN_F_CSUM; + tap_offloads |= libc::TUN_F_CSUM; } if features & (1 << VIRTIO_NET_F_GUEST_TSO4) != 0 { - tap_offloads |= net_gen::TUN_F_TSO4; + tap_offloads |= libc::TUN_F_TSO4; } if features & (1 << VIRTIO_NET_F_GUEST_TSO6) != 0 { - tap_offloads |= net_gen::TUN_F_TSO6; + tap_offloads |= libc::TUN_F_TSO6; } if features & (1 << VIRTIO_NET_F_GUEST_ECN) != 0 { - tap_offloads |= net_gen::TUN_F_TSO_ECN; + tap_offloads |= libc::TUN_F_TSO_ECN; } if features & (1 << VIRTIO_NET_F_GUEST_UFO) != 0 { - tap_offloads |= net_gen::TUN_F_UFO; + tap_offloads |= libc::TUN_F_UFO; } tap_offloads @@ -194,7 +194,7 @@ mod unit_tests { let addr: net::Ipv4Addr = "10.0.0.1".parse().unwrap(); let sockaddr = create_sockaddr(addr); - assert_eq!(sockaddr.sa_family, net_gen::AF_INET as u16); + assert_eq!(sockaddr.sa_family, libc::AF_INET as u16); let data = &sockaddr.sa_data[..]; diff --git a/net_util/src/open_tap.rs b/net_util/src/open_tap.rs index 39d4285df3..a5168d22a0 100644 --- a/net_util/src/open_tap.rs +++ b/net_util/src/open_tap.rs @@ -48,11 +48,11 @@ fn check_mq_support(if_name: &Option<&str>, queue_pairs: usize) -> Result<()> { return Ok(()); } let tun_flags_str = fs::read_to_string(path).map_err(Error::ReadSysfsTunFlags)?; - let tun_flags = u32::from_str_radix(tun_flags_str.trim().trim_start_matches("0x"), 16) + let tun_flags = i32::from_str_radix(tun_flags_str.trim().trim_start_matches("0x"), 16) .map_err(Error::ConvertHexStringToInt)?; - if (tun_flags & net_gen::IFF_MULTI_QUEUE != 0) && !mq { + if (tun_flags & libc::IFF_MULTI_QUEUE != 0) && !mq { return Err(Error::MultiQueueNoDeviceSupport); - } else if (tun_flags & 
net_gen::IFF_MULTI_QUEUE == 0) && mq { + } else if (tun_flags & libc::IFF_MULTI_QUEUE == 0) && mq { return Err(Error::MultiQueueNoTapSupport); } } diff --git a/net_util/src/tap.rs b/net_util/src/tap.rs index 1622add3a6..43b42a4a2e 100644 --- a/net_util/src/tap.rs +++ b/net_util/src/tap.rs @@ -5,12 +5,14 @@ // Use of this source code is governed by a BSD-style license that can be // found in the THIRD-PARTY file. +use std::ffi::{CStr, CString}; use std::fs::File; use std::io::{Error as IoError, Read, Result as IoResult, Write}; use std::net::{IpAddr, Ipv6Addr}; use std::os::raw::*; use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; +use libc::{__c_anonymous_ifr_ifru, ifreq}; use thiserror::Error; use vmm_sys_util::ioctl::{ioctl_with_mut_ref, ioctl_with_ref, ioctl_with_val}; @@ -43,6 +45,8 @@ pub enum Error { NetUtil(#[source] NetUtilError), #[error("Interface name too long (max length is {MAX_INTERFACE_NAME_LEN}): {0}")] IfnameTooLong(String), + #[error("Interface name contains interior NUL byte: {0:?}")] + IfnameContainsNUL(String), #[error("Invalid interface name (does it exist?): {0}")] InvalidIfname(String), #[error("Error parsing MAC data")] @@ -62,7 +66,8 @@ pub type Result = ::std::result::Result; #[derive(Debug)] pub struct Tap { tap_file: File, - if_name: Vec, + /// The name does not exceed [`MAX_INTERFACE_NAME_LEN`] bytes excluding the NUL byte. + if_name: CString, } impl PartialEq for Tap { @@ -80,23 +85,6 @@ impl std::clone::Clone for Tap { } } -// Returns a byte vector representing the contents of a null terminated C string which -// contains if_name. -fn build_terminated_if_name(if_name: &str) -> Result> { - // Convert the string slice to bytes, and shadow the variable, - // since we no longer need the &str version. 
- let bytes = if_name.as_bytes(); - - if bytes.len() > MAX_INTERFACE_NAME_LEN { - return Err(Error::IfnameTooLong(if_name.to_string())); - } - - let mut terminated_if_name = vec![b'\0'; bytes.len() + 1]; - terminated_if_name[..bytes.len()].copy_from_slice(bytes); - - Ok(terminated_if_name) -} - fn ipv6_mask_to_prefix(mask: Ipv6Addr) -> Result { let mask = mask.segments(); let mut iter = mask.iter(); @@ -169,7 +157,12 @@ impl Tap { } pub fn open_named(if_name: &str, num_queue_pairs: usize, flags: Option) -> Result { - let terminated_if_name = build_terminated_if_name(if_name)?; + if if_name.len() > MAX_INTERFACE_NAME_LEN { + return Err(Error::IfnameTooLong(if_name.to_string())); + } + + let terminated_if_name = + CString::new(if_name).map_err(|_| Error::IfnameContainsNUL(if_name.to_string()))?; // SAFETY: FFI call let fd = unsafe { @@ -192,42 +185,48 @@ impl Tap { // value. let mut features = 0; // SAFETY: IOCTL with correct arguments - let ret = unsafe { ioctl_with_mut_ref(&tuntap, net_gen::TUNGETFEATURES(), &mut features) }; + let ret = + unsafe { ioctl_with_mut_ref(&tuntap, libc::TUNGETFEATURES as c_ulong, &mut features) }; if ret < 0 { return Err(Error::GetFeatures(IoError::last_os_error())); } // Check if the user parameters match the kernel support for MQ - if (features & net_gen::IFF_MULTI_QUEUE == 0) && num_queue_pairs > 1 { + if (features & libc::IFF_MULTI_QUEUE == 0) && num_queue_pairs > 1 { return Err(Error::MultiQueueKernelSupport); } - // This is pretty messy because of the unions used by ifreq. Since we - // don't call as_mut on the same union field more than once, this block - // is safe. - let mut ifreq: net_gen::ifreq = Default::default(); - // SAFETY: see the comment above. 
- unsafe { - let ifrn_name = ifreq.ifr_ifrn.ifrn_name.as_mut(); - let name_slice = &mut ifrn_name[..terminated_if_name.len()]; - name_slice.copy_from_slice(terminated_if_name.as_slice()); - ifreq.ifr_ifru.ifru_flags = - (net_gen::IFF_TAP | net_gen::IFF_NO_PI | net_gen::IFF_VNET_HDR) as c_short; - if num_queue_pairs > 1 { - ifreq.ifr_ifru.ifru_flags |= net_gen::IFF_MULTI_QUEUE as c_short; - } + let mut ifru_flags = (libc::IFF_TAP | libc::IFF_NO_PI | libc::IFF_VNET_HDR) as c_short; + if num_queue_pairs > 1 { + ifru_flags |= libc::IFF_MULTI_QUEUE as c_short; } + let mut ifreq = libc::ifreq { + ifr_name: [0; libc::IFNAMSIZ], + ifr_ifru: __c_anonymous_ifr_ifru { ifru_flags }, + }; + + // Convert and copy bytes to `ifr_name` buffer. + // `terminated_if_name` will fit into `ifr_name` since we enforce the length limit + // above. + ifreq + .ifr_name + .iter_mut() + .zip(terminated_if_name.as_bytes_with_nul()) + .for_each(|(ifr_name_char, terminated_if_name_byte)| { + *ifr_name_char = *terminated_if_name_byte as c_char; + }); + // SAFETY: ioctl is safe since we call it with a valid tap fd and check the return // value. - let ret = unsafe { ioctl_with_mut_ref(&tuntap, net_gen::TUNSETIFF(), &mut ifreq) }; + let ret = unsafe { ioctl_with_mut_ref(&tuntap, libc::TUNSETIFF as c_ulong, &mut ifreq) }; if ret < 0 { return Err(Error::ConfigureTap(IoError::last_os_error())); } - // SAFETY: only the name is accessed, and it's cloned out. - let mut if_name = unsafe { ifreq.ifr_ifrn.ifrn_name }.to_vec(); - if_name.truncate(terminated_if_name.len() - 1); + // SAFETY: `ifreq.ifr_name` is set by the `ioctl_with_mut_ref` call and we checked the + // return code, so the name must be a valid `CStr`. 
+ let if_name = unsafe { CStr::from_ptr(ifreq.ifr_name.as_ptr()) }.to_owned(); Ok(Tap { tap_file: tuntap, if_name, @@ -254,27 +253,36 @@ impl Tap { // SAFETY: fd is a tap fd let tap_file = unsafe { File::from_raw_fd(fd) }; - let mut ifreq: net_gen::ifreq = Default::default(); + let mut ifreq: libc::ifreq = ifreq { + ifr_name: [0; libc::IFNAMSIZ], + ifr_ifru: __c_anonymous_ifr_ifru { ifru_flags: 0 }, + }; // Get current config including name // SAFETY: IOCTL with correct arguments - unsafe { Self::ioctl_with_mut_ref(&tap_file, net_gen::TUNGETIFF(), &mut ifreq)? }; - - // SAFETY: We only access one field of the ifru union - let if_name = unsafe { ifreq.ifr_ifrn.ifrn_name }.to_vec(); + unsafe { Self::ioctl_with_mut_ref(&tap_file, libc::TUNGETIFF as c_ulong, &mut ifreq)? }; + + let if_name = { + let ifr_ptr = ifreq.ifr_name.as_ptr(); + // SAFETY: The `ifr_name` field of the union is a valid, nul-terminated C string since it + // was just set by the ioctl call, and we checked for errors. + // We immediately convert the `CStr` to the owned `CString, so the memory of the union field + // is not accessed or mutated during the lifetime of the `Cstr`. + unsafe { CStr::from_ptr(ifr_ptr).to_owned() } + }; // Try and update flags. Depending on how the tap was created (macvtap // or via open_named()) this might return -EEXIST so we just ignore that. 
// SAFETY: access union fields unsafe { ifreq.ifr_ifru.ifru_flags = - (net_gen::IFF_TAP | net_gen::IFF_NO_PI | net_gen::IFF_VNET_HDR) as c_short; + (libc::IFF_TAP | libc::IFF_NO_PI | libc::IFF_VNET_HDR) as c_short; if num_queue_pairs > 1 { - ifreq.ifr_ifru.ifru_flags |= net_gen::IFF_MULTI_QUEUE as c_short; + ifreq.ifr_ifru.ifru_flags |= libc::IFF_MULTI_QUEUE as c_short; } } // SAFETY: IOCTL with correct arguments - let ret = unsafe { ioctl_with_mut_ref(&tap_file, net_gen::TUNSETIFF(), &mut ifreq) }; + let ret = unsafe { ioctl_with_mut_ref(&tap_file, libc::TUNSETIFF as c_ulong, &mut ifreq) }; if ret < 0 && IoError::last_os_error().raw_os_error().unwrap() != libc::EEXIST { return Err(Error::ConfigureTap(IoError::last_os_error())); } @@ -300,7 +308,7 @@ impl Tap { // SAFETY: ioctl is safe. Called with a valid sock fd, and we check the return. unsafe { - Self::ioctl_with_ref(&sock, net_gen::sockios::SIOCSIFADDR as c_ulong, &ifreq)?; + Self::ioctl_with_ref(&sock, libc::SIOCSIFADDR as c_ulong, &ifreq)?; } if let Some(IpAddr::V4(mask)) = netmask { @@ -308,11 +316,7 @@ impl Tap { // SAFETY: ioctl is safe. Called with a valid sock fd, and we check the return. unsafe { - Self::ioctl_with_ref( - &sock, - net_gen::sockios::SIOCSIFNETMASK as c_ulong, - &ifreq, - )?; + Self::ioctl_with_ref(&sock, libc::SIOCSIFNETMASK as c_ulong, &ifreq)?; } } @@ -322,18 +326,14 @@ impl Tap { let ifindex = { // SAFETY: ioctl is safe. Called with a valid sock fd, and we check the return. 
unsafe { - Self::ioctl_with_ref( - &sock, - net_gen::sockios::SIOCGIFINDEX as c_ulong, - &ifreq, - )?; + Self::ioctl_with_ref(&sock, libc::SIOCGIFINDEX as c_ulong, &ifreq)?; } // SAFETY: ifru_ivalue contains the ifindex and is set by the previous ioctl unsafe { - match ifreq.ifr_ifru.ifru_ivalue { + match ifreq.ifr_ifru.ifru_ifindex { 0 => { - let name = String::from_utf8_lossy(&self.if_name).to_string(); + let name = self.if_name.to_string_lossy().to_string(); return Err(Error::InvalidIfname(name)); } i => i, @@ -347,19 +347,17 @@ impl Tap { None => 0, }; - let ifreq = net_gen::in6_ifreq { + let ifreq = libc::in6_ifreq { // SAFETY: addr can be safely transmuted to in6_addr ifr6_addr: unsafe { - std::mem::transmute::<[u8; 16], net_gen::ipv6::in6_addr>(addr.octets()) + std::mem::transmute::<[u8; 16], libc::in6_addr>(addr.octets()) }, ifr6_prefixlen: prefixlen as u32, ifr6_ifindex: ifindex, }; // SAFETY: ioctl is safe. Called with a valid sock fd, and we check the return. - unsafe { - Self::ioctl_with_ref(&sock, net_gen::sockios::SIOCSIFADDR as c_ulong, &ifreq) - } + unsafe { Self::ioctl_with_ref(&sock, libc::SIOCSIFADDR as c_ulong, &ifreq) } } } } @@ -380,18 +378,18 @@ impl Tap { let mut ifreq = self.get_ifreq(); // SAFETY: ioctl is safe. Called with a valid sock fd, and we check the return. - unsafe { Self::ioctl_with_ref(&sock, net_gen::sockios::SIOCGIFHWADDR as c_ulong, &ifreq)? }; + unsafe { Self::ioctl_with_ref(&sock, libc::SIOCGIFHWADDR as c_ulong, &ifreq)? }; // SAFETY: We only access one field of the ifru union unsafe { let ifru_hwaddr = &mut ifreq.ifr_ifru.ifru_hwaddr; for (i, v) in addr.get_bytes().iter().enumerate() { - ifru_hwaddr.sa_data[i] = *v as c_uchar; + ifru_hwaddr.sa_data[i] = *v as c_char; } } // SAFETY: ioctl is safe. Called with a valid sock fd, and we check the return. 
- unsafe { Self::ioctl_with_ref(&sock, net_gen::sockios::SIOCSIFHWADDR as c_ulong, &ifreq) } + unsafe { Self::ioctl_with_ref(&sock, libc::SIOCSIFHWADDR as c_ulong, &ifreq) } } /// Get mac addr for tap interface. @@ -401,12 +399,21 @@ impl Tap { let ifreq = self.get_ifreq(); // SAFETY: ioctl is safe. Called with a valid sock fd, and we check the return. - unsafe { Self::ioctl_with_ref(&sock, net_gen::sockios::SIOCGIFHWADDR as c_ulong, &ifreq)? }; - - // SAFETY: We only access one field of the ifru union - let addr = unsafe { - MacAddr::from_bytes(&ifreq.ifr_ifru.ifru_hwaddr.sa_data[0..MAC_ADDR_LEN]) - .map_err(Error::MacParsing)? + unsafe { Self::ioctl_with_ref(&sock, libc::SIOCGIFHWADDR as c_ulong, &ifreq)? }; + + let addr = { + let bytes: Vec = + // SAFETY: The `ioctl_with_ref` ensures accessing `ifru_hwaddr` is valid. + unsafe { ifreq.ifr_ifru.ifru_hwaddr.sa_data[0..MAC_ADDR_LEN].iter() } + .map(|byte| { + // On some architectures, `c_char` is already a `u8`. + #[allow(clippy::unnecessary_cast)] + { + *byte as u8 + } + }) + .collect(); + MacAddr::from_bytes(&bytes).map_err(Error::MacParsing)? }; Ok(addr) } @@ -418,7 +425,7 @@ impl Tap { let ifreq = self.get_ifreq(); // SAFETY: ioctl is safe. Called with a valid sock fd, and we check the return. - unsafe { Self::ioctl_with_ref(&sock, net_gen::sockios::SIOCGIFMTU as c_ulong, &ifreq)? }; + unsafe { Self::ioctl_with_ref(&sock, libc::SIOCGIFMTU as c_ulong, &ifreq)? }; // SAFETY: access a union field let mtu = unsafe { ifreq.ifr_ifru.ifru_mtu }; @@ -439,13 +446,19 @@ impl Tap { ifreq.ifr_ifru.ifru_mtu = mtu; // SAFETY: ioctl is safe. Called with a valid sock fd, and we check the return. - unsafe { Self::ioctl_with_ref(&sock, net_gen::sockios::SIOCSIFMTU as c_ulong, &ifreq) } + unsafe { Self::ioctl_with_ref(&sock, libc::SIOCSIFMTU as c_ulong, &ifreq) } } /// Set the offload flags for the tap interface. pub fn set_offload(&self, flags: c_uint) -> Result<()> { // SAFETY: ioctl is safe. 
Called with a valid tap fd, and we check the return. - unsafe { Self::ioctl_with_val(&self.tap_file, net_gen::TUNSETOFFLOAD(), flags as c_ulong) } + unsafe { + Self::ioctl_with_val( + &self.tap_file, + libc::TUNSETOFFLOAD as c_ulong, + flags as c_ulong, + ) + } } /// Enable the tap interface. @@ -455,48 +468,44 @@ impl Tap { let mut ifreq = self.get_ifreq(); // SAFETY: IOCTL with correct arguments - unsafe { Self::ioctl_with_ref(&sock, net_gen::sockios::SIOCGIFFLAGS as c_ulong, &ifreq)? }; + unsafe { Self::ioctl_with_ref(&sock, libc::SIOCGIFFLAGS as c_ulong, &ifreq)? }; // If TAP device is already up don't try and enable it // SAFETY: access a union field let ifru_flags = unsafe { ifreq.ifr_ifru.ifru_flags }; - if ifru_flags & net_gen::net_device_flags_IFF_UP as i16 - == net_gen::net_device_flags_IFF_UP as i16 - { + if ifru_flags & libc::IFF_UP as i16 == libc::IFF_UP as i16 { return Ok(()); } - ifreq.ifr_ifru.ifru_flags = net_gen::net_device_flags_IFF_UP as i16; + ifreq.ifr_ifru.ifru_flags = libc::IFF_UP as i16; // SAFETY: ioctl is safe. Called with a valid sock fd, and we check the return. - unsafe { Self::ioctl_with_ref(&sock, net_gen::sockios::SIOCSIFFLAGS as c_ulong, &ifreq) } + unsafe { Self::ioctl_with_ref(&sock, libc::SIOCSIFFLAGS as c_ulong, &ifreq) } } /// Set the size of the vnet hdr. pub fn set_vnet_hdr_size(&self, size: c_int) -> Result<()> { // SAFETY: ioctl is safe. Called with a valid tap fd, and we check the return. - unsafe { Self::ioctl_with_ref(&self.tap_file, net_gen::TUNSETVNETHDRSZ(), &size) } + unsafe { Self::ioctl_with_ref(&self.tap_file, libc::TUNSETVNETHDRSZ as c_ulong, &size) } } - fn get_ifreq(&self) -> net_gen::ifreq { - let mut ifreq: net_gen::ifreq = Default::default(); - - // This sets the name of the interface, which is the only entry - // in a single-field union. - // SAFETY: access union fields and we're sure the copy is okay. 
- unsafe { - let ifrn_name = ifreq.ifr_ifrn.ifrn_name.as_mut(); - let name_slice = &mut ifrn_name[..self.if_name.len()]; - name_slice.copy_from_slice(&self.if_name); - } + fn get_ifreq(&self) -> libc::ifreq { + let mut ifreq: libc::ifreq = libc::ifreq { + ifr_name: [0; libc::IFNAMSIZ], + ifr_ifru: __c_anonymous_ifr_ifru { ifru_flags: 0 }, + }; + // Convert and copy bytes to `ifr_name` buffer. + // `self.if_name` will fit into `ifr_name` since we enforce the length when setting it. ifreq - } + .ifr_name + .iter_mut() + .zip(self.if_name.as_bytes_with_nul()) + .for_each(|(ifr_name_char, terminated_if_name_byte)| { + *ifr_name_char = *terminated_if_name_byte as c_char; + }); - /// Returns the raw bytes of the interface name, which may or may not be - /// valid UTF-8. - pub fn if_name_as_bytes(&self) -> &[u8] { - &self.if_name + ifreq } /// Returns the interface name as a string, truncated at the first NUL byte @@ -509,19 +518,20 @@ impl Tap { /// thus valid UTF-8. Also, self-generated interface names form CHV are /// also always created from Rust strings, thus valid UTF-8. pub fn if_name_as_str(&self) -> &str { - // All bytes until first NUL. - let nul_terminated = self - .if_name_as_bytes() - .split(|&b| b == 0) - .next() - .unwrap_or(&[]); - // Panicking here is fine, see function documentation. 
- std::str::from_utf8(nul_terminated).expect("Tap interface name should be valid UTF-8") + std::str::from_utf8(self.if_name.as_bytes()) + .expect("Tap interface name should be valid UTF-8") } #[cfg(fuzzing)] - pub fn new_for_fuzzing(tap_file: File, if_name: Vec) -> Self { + pub fn new_for_fuzzing(tap_file: File, if_name: &str) -> Self { + if if_name.len() > MAX_INTERFACE_NAME_LEN { + panic!("provided name longer than `MAX_INTERFACE_NAME_LEN`") + } + + let if_name = CString::new(if_name) + .map_err(|_| Error::IfnameContainsNUL(if_name.to_string())) + .unwrap(); Tap { tap_file, if_name } } } diff --git a/tpm/Cargo.toml b/tpm/Cargo.toml index 5d6bba1a04..dd87f99371 100644 --- a/tpm/Cargo.toml +++ b/tpm/Cargo.toml @@ -10,7 +10,6 @@ version = "0.1.0" anyhow = { workspace = true } libc = { workspace = true } log = { workspace = true } -net_gen = { path = "../net_gen" } thiserror = { workspace = true } vmm-sys-util = { workspace = true } diff --git a/tpm/src/emulator.rs b/tpm/src/emulator.rs index b27a069aa0..0ffd3a62e4 100644 --- a/tpm/src/emulator.rs +++ b/tpm/src/emulator.rs @@ -150,16 +150,16 @@ impl Emulator { // SAFETY: FFI calls and return value of the unsafe call is checked unsafe { - let tv = net_gen::iff::timeval { + let tv = libc::timeval { tv_sec: 0, tv_usec: 100000, // Set recv timeout to 100ms }; - let ret = net_gen::setsockopt( + let ret = libc::setsockopt( fds[0], - net_gen::iff::SOL_SOCKET as i32, - net_gen::iff::SO_RCVTIMEO as i32, + libc::SOL_SOCKET, + libc::SO_RCVTIMEO, &tv as *const _ as *const libc::c_void, - std::mem::size_of::() as u32, + std::mem::size_of::() as u32, ); if ret == -1 { return Err(Error::PrepareDataFd(anyhow!( From 27a40ed32b9cbaed2c65a5779fddf0a5b78d557b Mon Sep 17 00:00:00 2001 From: Julian Schindel Date: Wed, 18 Mar 2026 15:47:23 +0100 Subject: [PATCH 254/742] ci: remove `net_gen` from allowed titles On-behalf-of: SAP julian.schindel@sap.com Signed-off-by: Julian Schindel --- scripts/gitlint/rules/TitleStartsWithComponent.py | 1 
- 1 file changed, 1 deletion(-) diff --git a/scripts/gitlint/rules/TitleStartsWithComponent.py b/scripts/gitlint/rules/TitleStartsWithComponent.py index 1310e45ccd..a25172a629 100644 --- a/scripts/gitlint/rules/TitleStartsWithComponent.py +++ b/scripts/gitlint/rules/TitleStartsWithComponent.py @@ -46,7 +46,6 @@ def validate(self, line, _commit): 'hypervisor', 'main', 'misc', - 'net_gen', 'net_util', 'openapi', 'option_parser', From edfbb7e18086e898fea282b4d350b26c5c792739 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Mon, 23 Mar 2026 12:02:14 +0100 Subject: [PATCH 255/742] block: Add DISCARD_WZ_MAX_PAYLOAD constant Introduce DISCARD_WZ_MAX_PAYLOAD as the precomputed product of DISCARD_WZ_SEG_SIZE and MAX_DISCARD_WRITE_ZEROES_SEG. Use it in the DISCARD and WRITE_ZEROES segment count checks instead of repeating the multiplication inline. Suggested-by: Philipp Schuster Signed-off-by: Anatol Belski --- block/src/lib.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/block/src/lib.rs b/block/src/lib.rs index 81b3f27c00..e6b23b3870 100644 --- a/block/src/lib.rs +++ b/block/src/lib.rs @@ -75,6 +75,7 @@ pub const MAX_DISCARD_WRITE_ZEROES_SEG: u32 = 1; /// Size and field offsets within `struct virtio_blk_discard_write_zeroes`. 
const DISCARD_WZ_SEG_SIZE: u32 = mem::size_of::() as u32; +const DISCARD_WZ_MAX_PAYLOAD: u32 = DISCARD_WZ_SEG_SIZE * MAX_DISCARD_WRITE_ZEROES_SEG; const DISCARD_WZ_SECTOR_OFFSET: u64 = mem::offset_of!(virtio_blk_discard_write_zeroes, sector) as u64; const DISCARD_WZ_NUM_SECTORS_OFFSET: u64 = @@ -600,7 +601,7 @@ impl Request { if data_len < DISCARD_WZ_SEG_SIZE { return Err(ExecuteError::BadRequest(Error::DescriptorLengthTooSmall)); } - if data_len > DISCARD_WZ_SEG_SIZE * MAX_DISCARD_WRITE_ZEROES_SEG { + if data_len > DISCARD_WZ_MAX_PAYLOAD { return Err(ExecuteError::BadRequest(Error::TooManySegments)); } @@ -649,7 +650,7 @@ impl Request { if data_len < DISCARD_WZ_SEG_SIZE { return Err(ExecuteError::BadRequest(Error::DescriptorLengthTooSmall)); } - if data_len > DISCARD_WZ_SEG_SIZE * MAX_DISCARD_WRITE_ZEROES_SEG { + if data_len > DISCARD_WZ_MAX_PAYLOAD { return Err(ExecuteError::BadRequest(Error::TooManySegments)); } From 92109136f16dcc664ffa7e9a0bdc16ee2e3b1719 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Mon, 23 Mar 2026 12:04:08 +0100 Subject: [PATCH 256/742] block: Include actual segment count in TooManySegments error Include the number of segments found in the request payload in the TooManySegments error variant so the logged message shows both the actual and maximum values. 
Suggested-by: Philipp Schuster Signed-off-by: Anatol Belski --- block/src/lib.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/block/src/lib.rs b/block/src/lib.rs index e6b23b3870..0640611c78 100644 --- a/block/src/lib.rs +++ b/block/src/lib.rs @@ -110,8 +110,8 @@ pub enum Error { RawFileError(#[source] std::io::Error), #[error("The requested operation does not support multiple descriptors")] TooManyDescriptors, - #[error("Request contains too many segments")] - TooManySegments, + #[error("Request contains too many segments ({0}, max {MAX_DISCARD_WRITE_ZEROES_SEG})")] + TooManySegments(u32), #[error("Failure in vhdx")] VhdxError(#[source] VhdxError), } @@ -602,7 +602,9 @@ impl Request { return Err(ExecuteError::BadRequest(Error::DescriptorLengthTooSmall)); } if data_len > DISCARD_WZ_MAX_PAYLOAD { - return Err(ExecuteError::BadRequest(Error::TooManySegments)); + return Err(ExecuteError::BadRequest(Error::TooManySegments( + data_len.div_ceil(DISCARD_WZ_SEG_SIZE), + ))); } let mut discard_sector = [0u8; 8]; @@ -651,7 +653,9 @@ impl Request { return Err(ExecuteError::BadRequest(Error::DescriptorLengthTooSmall)); } if data_len > DISCARD_WZ_MAX_PAYLOAD { - return Err(ExecuteError::BadRequest(Error::TooManySegments)); + return Err(ExecuteError::BadRequest(Error::TooManySegments( + data_len.div_ceil(DISCARD_WZ_SEG_SIZE), + ))); } let mut wz_sector = [0u8; 8]; From 882f82f04bffa3ae9dc3591217aff40787528958 Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Fri, 20 Mar 2026 16:17:16 +0100 Subject: [PATCH 257/742] virtio-devices: fix barrier handling in virtio-net When configuring multiple queues for a virtio device, the guest can activate between 1 and the configured amount of queues. The firmware, for example, may activate only one queue, while a Linux guest would likely activate all available queues. The constructor of virtio-net initializes the `paused_sync` barrier using the configured queue count (plus one for the main thread).
This can be wrong if the guest enables a different number of queues at activation time, which can make pause hang. Thus, we now recompute the barrier size from the queues that are actually activated. On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- virtio-devices/src/net.rs | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/virtio-devices/src/net.rs b/virtio-devices/src/net.rs index ec8afc2cc7..3bb360c646 100644 --- a/virtio-devices/src/net.rs +++ b/virtio-devices/src/net.rs @@ -752,7 +752,15 @@ impl VirtioDevice for Net { let num_queues = queues.len(); let event_idx = self.common.feature_acked(VIRTIO_RING_F_EVENT_IDX.into()); - if self.common.feature_acked(VIRTIO_NET_F_CTRL_VQ.into()) && !num_queues.is_multiple_of(2) { + + // Recompute the barrier size from the queues that are actually activated. + let has_ctrl_queue = + self.common.feature_acked(VIRTIO_NET_F_CTRL_VQ.into()) && !num_queues.is_multiple_of(2); + let ctrl_threads = if has_ctrl_queue { 1 } else { 0 }; + let qp_threads = (num_queues - ctrl_threads) / 2; + self.common.paused_sync = Some(Arc::new(Barrier::new(1 + qp_threads + ctrl_threads))); + + if has_ctrl_queue { let ctrl_queue_index = num_queues - 1; let (_, mut ctrl_queue, ctrl_queue_evt) = queues.remove(ctrl_queue_index); @@ -772,10 +780,6 @@ impl VirtioDevice for Net { }; let paused = self.common.paused.clone(); - // Let's update the barrier as we need 1 for each RX/TX pair + - // 1 for the control queue + 1 for the main thread signalling - // the pause. 
- self.common.paused_sync = Some(Arc::new(Barrier::new(self.taps.len() + 2))); let paused_sync = self.common.paused_sync.clone(); let mut epoll_threads = Vec::new(); From 5aeb9f55d190abd9a1d220b12d799a020b9800f1 Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Fri, 20 Mar 2026 16:21:02 +0100 Subject: [PATCH 258/742] virtio-devices: fix barrier handling in virtio-blk When configuring multiple queues for a virtio device, the guest can activate between 1 and the configured amount of queues. The firmware, for example, may activate only one queue, while a Linux guest would likely activate all available queues. The constructor of virtio-blk initializes the `paused_sync` barrier using the configured queue count (plus one for the main thread). This can be wrong if the guest enables a different number of queues at activation time, which can make pause hang. Thus, we now recompute the barrier size from the queues that are actually activated. On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- virtio-devices/src/block.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/virtio-devices/src/block.rs b/virtio-devices/src/block.rs index 97d7c58c15..47309dd877 100644 --- a/virtio-devices/src/block.rs +++ b/virtio-devices/src/block.rs @@ -1087,6 +1087,9 @@ impl VirtioDevice for Block { } self.common.activate(&queues, interrupt_cb.clone())?; + // Recompute the barrier size from the queues that are actually activated.
+ self.common.paused_sync = Some(Arc::new(Barrier::new(queues.len() + 1))); + self.update_writeback(); let mut epoll_threads = Vec::new(); From 7e0f8f7163d79982ad0491a4ec7702006fab9929 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 24 Mar 2026 00:05:50 +0000 Subject: [PATCH 259/742] build: Bump the non-rust-vmm group across 2 directories with 10 updates Bumps the non-rust-vmm group with 6 updates in the / directory: | Package | From | To | | --- | --- | --- | | [env_logger](https://github.com/rust-cli/env_logger) | `0.11.9` | `0.11.10` | | [arc-swap](https://github.com/vorner/arc-swap) | `1.8.2` | `1.9.0` | | [env_filter](https://github.com/rust-cli/env_logger) | `1.0.0` | `1.0.1` | | [terminal_size](https://github.com/eminence/terminal-size) | `0.4.3` | `0.4.4` | | [toml_datetime](https://github.com/toml-rs/toml) | `1.0.1+spec-1.1.0` | `1.1.0+spec-1.1.0` | | [toml_edit](https://github.com/toml-rs/toml) | `0.25.5+spec-1.1.0` | `0.25.8+spec-1.1.0` | Bumps the non-rust-vmm group with 3 updates in the /fuzz directory: [arc-swap](https://github.com/vorner/arc-swap), [toml_datetime](https://github.com/toml-rs/toml) and [toml_edit](https://github.com/toml-rs/toml). 
Updates `env_logger` from 0.11.9 to 0.11.10 - [Release notes](https://github.com/rust-cli/env_logger/releases) - [Changelog](https://github.com/rust-cli/env_logger/blob/main/CHANGELOG.md) - [Commits](https://github.com/rust-cli/env_logger/compare/v0.11.9...v0.11.10) Updates `arc-swap` from 1.8.2 to 1.9.0 - [Changelog](https://github.com/vorner/arc-swap/blob/master/CHANGELOG.md) - [Commits](https://github.com/vorner/arc-swap/compare/v1.8.2...v1.9.0) Updates `anstream` from 0.6.21 to 1.0.0 - [Commits](https://github.com/rust-cli/anstyle/compare/anstream-v0.6.21...anstream-v1.0.0) Updates `anstyle-parse` from 0.2.7 to 1.0.0 - [Commits](https://github.com/rust-cli/anstyle/compare/anstyle-parse-v0.2.7...anstyle-parse-v1.0.0) Updates `env_filter` from 1.0.0 to 1.0.1 - [Release notes](https://github.com/rust-cli/env_logger/releases) - [Changelog](https://github.com/rust-cli/env_logger/blob/main/CHANGELOG.md) - [Commits](https://github.com/rust-cli/env_logger/compare/env_filter-v1.0.0...env_filter-v1.0.1) Updates `terminal_size` from 0.4.3 to 0.4.4 - [Release notes](https://github.com/eminence/terminal-size/releases) - [Commits](https://github.com/eminence/terminal-size/compare/v0.4.3...v0.4.4) Updates `toml_datetime` from 1.0.1+spec-1.1.0 to 1.1.0+spec-1.1.0 - [Commits](https://github.com/toml-rs/toml/compare/toml_datetime-v1.0.1...toml_datetime-v1.1.0) Updates `toml_edit` from 0.25.5+spec-1.1.0 to 0.25.8+spec-1.1.0 - [Commits](https://github.com/toml-rs/toml/compare/v0.25.5...v0.25.8) Updates `toml_parser` from 1.0.10+spec-1.1.0 to 1.1.0+spec-1.1.0 - [Commits](https://github.com/toml-rs/toml/compare/toml_parser-v1.0.10...toml_parser-v1.1.0) Updates `windows-sys` from 0.60.2 to 0.61.2 - [Release notes](https://github.com/microsoft/windows-rs/releases) - [Commits](https://github.com/microsoft/windows-rs/commits) Updates `arc-swap` from 1.8.2 to 1.9.0 - [Changelog](https://github.com/vorner/arc-swap/blob/master/CHANGELOG.md) - 
[Commits](https://github.com/vorner/arc-swap/compare/v1.8.2...v1.9.0) Updates `toml_datetime` from 1.0.1+spec-1.1.0 to 1.1.0+spec-1.1.0 - [Commits](https://github.com/toml-rs/toml/compare/toml_datetime-v1.0.1...toml_datetime-v1.1.0) Updates `toml_edit` from 0.25.5+spec-1.1.0 to 0.25.8+spec-1.1.0 - [Commits](https://github.com/toml-rs/toml/compare/v0.25.5...v0.25.8) Updates `toml_parser` from 1.0.10+spec-1.1.0 to 1.1.0+spec-1.1.0 - [Commits](https://github.com/toml-rs/toml/compare/toml_parser-v1.0.10...toml_parser-v1.1.0) --- updated-dependencies: - dependency-name: env_logger dependency-version: 0.11.10 dependency-type: direct:production update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: arc-swap dependency-version: 1.9.0 dependency-type: direct:production update-type: version-update:semver-minor dependency-group: non-rust-vmm - dependency-name: anstream dependency-version: 1.0.0 dependency-type: indirect update-type: version-update:semver-major dependency-group: non-rust-vmm - dependency-name: anstyle-parse dependency-version: 1.0.0 dependency-type: indirect update-type: version-update:semver-major dependency-group: non-rust-vmm - dependency-name: env_filter dependency-version: 1.0.1 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: terminal_size dependency-version: 0.4.4 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: toml_datetime dependency-version: 1.1.0+spec-1.1.0 dependency-type: indirect update-type: version-update:semver-minor dependency-group: non-rust-vmm - dependency-name: toml_edit dependency-version: 0.25.8+spec-1.1.0 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: toml_parser dependency-version: 1.1.0+spec-1.1.0 dependency-type: indirect update-type: version-update:semver-minor dependency-group: non-rust-vmm - 
dependency-name: windows-sys dependency-version: 0.61.2 dependency-type: indirect update-type: version-update:semver-minor dependency-group: non-rust-vmm - dependency-name: arc-swap dependency-version: 1.9.0 dependency-type: indirect update-type: version-update:semver-minor dependency-group: non-rust-vmm - dependency-name: toml_datetime dependency-version: 1.1.0+spec-1.1.0 dependency-type: indirect update-type: version-update:semver-minor dependency-group: non-rust-vmm - dependency-name: toml_edit dependency-version: 0.25.8+spec-1.1.0 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: toml_parser dependency-version: 1.1.0+spec-1.1.0 dependency-type: indirect update-type: version-update:semver-minor dependency-group: non-rust-vmm ... Signed-off-by: dependabot[bot] --- Cargo.lock | 156 ++++++++---------------------------------- Cargo.toml | 2 +- fuzz/Cargo.lock | 16 ++--- hypervisor/Cargo.toml | 2 +- 4 files changed, 39 insertions(+), 137 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 048c911371..b009dfa901 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -35,21 +35,6 @@ dependencies = [ "memchr", ] -[[package]] -name = "anstream" -version = "0.6.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" -dependencies = [ - "anstyle", - "anstyle-parse 0.2.7", - "anstyle-query", - "anstyle-wincon", - "colorchoice", - "is_terminal_polyfill", - "utf8parse", -] - [[package]] name = "anstream" version = "1.0.0" @@ -57,7 +42,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d" dependencies = [ "anstyle", - "anstyle-parse 1.0.0", + "anstyle-parse", "anstyle-query", "anstyle-wincon", "colorchoice", @@ -71,15 +56,6 @@ version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" -[[package]] -name = "anstyle-parse" -version = "0.2.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" -dependencies = [ - "utf8parse", -] - [[package]] name = "anstyle-parse" version = "1.0.0" @@ -95,7 +71,7 @@ version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" dependencies = [ - "windows-sys 0.61.2", + "windows-sys", ] [[package]] @@ -106,7 +82,7 @@ checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys 0.61.2", + "windows-sys", ] [[package]] @@ -125,9 +101,9 @@ dependencies = [ [[package]] name = "arc-swap" -version = "1.8.2" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9f3647c145568cec02c42054e07bdf9a5a698e15b466fb2341bfc393cd24aa5" +checksum = "a07d1f37ff60921c83bdfc7407723bdefe89b44b98a9b772f225c8f9d67141a6" dependencies = [ "rustversion", ] @@ -204,7 +180,7 @@ dependencies = [ "polling", "rustix", "slab", - "windows-sys 0.61.2", + "windows-sys", ] [[package]] @@ -262,7 +238,7 @@ dependencies = [ "rustix", "signal-hook-registry", "slab", - "windows-sys 0.61.2", + "windows-sys", ] [[package]] @@ -436,7 +412,7 @@ version = "4.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" dependencies = [ - "anstream 1.0.0", + "anstream", "anstyle", "clap_lex", "strsim", @@ -641,7 +617,7 @@ dependencies = [ "libc", "option-ext", "redox_users", - "windows-sys 0.61.2", + "windows-sys", ] [[package]] @@ -679,9 +655,9 @@ dependencies = [ [[package]] name = "env_filter" -version = "1.0.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "7a1c3cc8e57274ec99de65301228b537f1e4eedc1b8e0f9411c6caac8ae7308f" +checksum = "32e90c2accc4b07a8456ea0debdc2e7587bdd890680d71173a15d4ae604f6eef" dependencies = [ "log", "regex", @@ -689,11 +665,11 @@ dependencies = [ [[package]] name = "env_logger" -version = "0.11.9" +version = "0.11.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2daee4ea451f429a58296525ddf28b45a3b64f1acf6587e2067437bb11e218d" +checksum = "0621c04f2196ac3f488dd583365b9c09be011a4ab8b9f37248ffcc8f6198b56a" dependencies = [ - "anstream 0.6.21", + "anstream", "anstyle", "env_filter", "jiff", @@ -723,7 +699,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.61.2", + "windows-sys", ] [[package]] @@ -1724,7 +1700,7 @@ dependencies = [ "hermit-abi", "pin-project-lite", "rustix", - "windows-sys 0.61.2", + "windows-sys", ] [[package]] @@ -1907,7 +1883,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys", - "windows-sys 0.61.2", + "windows-sys", ] [[package]] @@ -2115,17 +2091,17 @@ dependencies = [ "getrandom 0.4.2", "once_cell", "rustix", - "windows-sys 0.61.2", + "windows-sys", ] [[package]] name = "terminal_size" -version = "0.4.3" +version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60b8cb979cb11c32ce1603f8137b22262a9d131aaa5c37b5678025f22b8becd0" +checksum = "230a1b821ccbd75b185820a1f1ff7b14d21da1e442e22c0863ea5f08771a8874" dependencies = [ "rustix", - "windows-sys 0.60.2", + "windows-sys", ] [[package]] @@ -2171,18 +2147,18 @@ checksum = "3bf63baf9f5039dadc247375c29eb13706706cfde997d0330d05aa63a77d8820" [[package]] name = "toml_datetime" -version = "1.0.1+spec-1.1.0" +version = "1.1.0+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b320e741db58cac564e26c607d3cc1fdc4a88fd36c879568c07856ed83ff3e9" +checksum = 
"97251a7c317e03ad83774a8752a7e81fb6067740609f75ea2b585b569a59198f" dependencies = [ "serde_core", ] [[package]] name = "toml_edit" -version = "0.25.5+spec-1.1.0" +version = "0.25.8+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ca1a40644a28bce036923f6a431df0b34236949d111cc07cb6dca830c9ef2e1" +checksum = "16bff38f1d86c47f9ff0647e6838d7bb362522bdf44006c7068c2b1e606f1f3c" dependencies = [ "indexmap", "toml_datetime", @@ -2192,9 +2168,9 @@ dependencies = [ [[package]] name = "toml_parser" -version = "1.0.10+spec-1.1.0" +version = "1.1.0+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7df25b4befd31c4816df190124375d5a20c6b6921e2cad937316de3fccd63420" +checksum = "2334f11ee363607eb04df9b8fc8a13ca1715a72ba8662a26ac285c98aabb4011" dependencies = [ "winnow 1.0.0", ] @@ -2259,7 +2235,7 @@ checksum = "f2f6fb2847f6742cd76af783a2a2c49e9375d0a111c7bef6f71cd9e738c72d6e" dependencies = [ "memoffset", "tempfile", - "windows-sys 0.61.2", + "windows-sys", ] [[package]] @@ -2726,15 +2702,6 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" -[[package]] -name = "windows-sys" -version = "0.60.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" -dependencies = [ - "windows-targets", -] - [[package]] name = "windows-sys" version = "0.61.2" @@ -2744,71 +2711,6 @@ dependencies = [ "windows-link", ] -[[package]] -name = "windows-targets" -version = "0.53.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" -dependencies = [ - "windows-link", - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_gnullvm", - "windows_i686_msvc", - "windows_x86_64_gnu", - 
"windows_x86_64_gnullvm", - "windows_x86_64_msvc", -] - -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" - -[[package]] -name = "windows_aarch64_msvc" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" - -[[package]] -name = "windows_i686_gnu" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3" - -[[package]] -name = "windows_i686_gnullvm" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" - -[[package]] -name = "windows_i686_msvc" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" - -[[package]] -name = "windows_x86_64_gnu" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" - -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" - -[[package]] -name = "windows_x86_64_msvc" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" - [[package]] name = "winnow" version = "0.7.15" @@ -2943,7 +2845,7 @@ dependencies = [ "tracing", "uds_windows", "uuid", - "windows-sys 0.61.2", + "windows-sys", "winnow 0.7.15", "zbus_macros", "zbus_names", diff --git a/Cargo.toml b/Cargo.toml index 
39a79ba815..b8320172e1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -87,7 +87,7 @@ cfg-if = "1.0.4" clap = "4.6.0" dhat = "0.3.3" dirs = "6.0.0" -env_logger = "0.11.8" +env_logger = "0.11.10" epoll = "4.4.0" flume = "0.12.0" itertools = "0.14.0" diff --git a/fuzz/Cargo.lock b/fuzz/Cargo.lock index 475f0b8344..d5cda1e24f 100644 --- a/fuzz/Cargo.lock +++ b/fuzz/Cargo.lock @@ -81,9 +81,9 @@ checksum = "c3d036a3c4ab069c7b410a2ce876bd74808d2d0888a82667669f8e783a898bf1" [[package]] name = "arc-swap" -version = "1.8.2" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9f3647c145568cec02c42054e07bdf9a5a698e15b466fb2341bfc393cd24aa5" +checksum = "a07d1f37ff60921c83bdfc7407723bdefe89b44b98a9b772f225c8f9d67141a6" dependencies = [ "rustversion", ] @@ -1192,18 +1192,18 @@ dependencies = [ [[package]] name = "toml_datetime" -version = "1.0.1+spec-1.1.0" +version = "1.1.0+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b320e741db58cac564e26c607d3cc1fdc4a88fd36c879568c07856ed83ff3e9" +checksum = "97251a7c317e03ad83774a8752a7e81fb6067740609f75ea2b585b569a59198f" dependencies = [ "serde_core", ] [[package]] name = "toml_edit" -version = "0.25.5+spec-1.1.0" +version = "0.25.8+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ca1a40644a28bce036923f6a431df0b34236949d111cc07cb6dca830c9ef2e1" +checksum = "16bff38f1d86c47f9ff0647e6838d7bb362522bdf44006c7068c2b1e606f1f3c" dependencies = [ "indexmap", "toml_datetime", @@ -1213,9 +1213,9 @@ dependencies = [ [[package]] name = "toml_parser" -version = "1.0.10+spec-1.1.0" +version = "1.1.0+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7df25b4befd31c4816df190124375d5a20c6b6921e2cad937316de3fccd63420" +checksum = "2334f11ee363607eb04df9b8fc8a13ca1715a72ba8662a26ac285c98aabb4011" dependencies = [ "winnow", ] diff --git a/hypervisor/Cargo.toml b/hypervisor/Cargo.toml index 
707779fefc..1ffaa46b78 100644 --- a/hypervisor/Cargo.toml +++ b/hypervisor/Cargo.toml @@ -15,7 +15,7 @@ tdx = [] [dependencies] anyhow = { workspace = true } -arc-swap = "1.8.2" +arc-swap = "1.9.0" bitfield-struct = "0.12.0" byteorder = { workspace = true } cfg-if = { workspace = true } From 040fcaed92f7629f5a0d9ae57b9cb175a52e5e70 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Tue, 17 Mar 2026 11:11:54 +0100 Subject: [PATCH 260/742] vmm: add VmSendMigrationData::parse(); integrate with OptionParser This change prepares upcoming options (following commit) that are added to VmSendMigrationData. VmSendMigrationData is a special case as it is currently the only "rich configuration" type that lives outside `config.rs`, as it is purely API-facing. Therefore, it isn't integrated into the existing OptionParser infrastructure. We therefore introduce a `parse()` method to use that in `ch-remote` in the following. In `ch-remote`, we remove `--local` for `send-migration` and switch to the new option string parsing constructor (breaking change!). This prepares the addition of downtime and timeout options in the following and streamlines the `ch-remote` command line interface with other commands, such as `ch-remote add-net`. Lastly, this commit updates the integration tests. 
Signed-off-by: Philipp Schuster On-behalf-of: SAP philipp.schuster@sap.com --- cloud-hypervisor/src/bin/ch-remote.rs | 38 +++++++----------- cloud-hypervisor/tests/integration.rs | 13 +++---- vmm/src/api/mod.rs | 55 ++++++++++++++++++++++++++- 3 files changed, 73 insertions(+), 33 deletions(-) diff --git a/cloud-hypervisor/src/bin/ch-remote.rs b/cloud-hypervisor/src/bin/ch-remote.rs index 6939eaa385..afc41e7e96 100644 --- a/cloud-hypervisor/src/bin/ch-remote.rs +++ b/cloud-hypervisor/src/bin/ch-remote.rs @@ -16,7 +16,9 @@ use api_client::{ Error as ApiClientError, simple_api_command, simple_api_command_with_fds, simple_api_full_command, }; -use clap::{Arg, ArgAction, ArgMatches, Command}; +#[cfg(feature = "dbus_api")] +use clap::ArgAction; +use clap::{Arg, ArgMatches, Command}; use log::error; use option_parser::{ByteSized, ByteSizedParseError}; use thiserror::Error; @@ -69,6 +71,8 @@ enum Error { ReadingFile(#[source] std::io::Error), #[error("Invalid disk size")] InvalidDiskSize(#[source] ByteSizedParseError), + #[error("Error parsing send migration configuration")] + SendMigrationConfig(#[from] vmm::api::VmSendMigrationParseError), } enum TargetApi<'a> { @@ -519,11 +523,7 @@ fn rest_api_do_command(matches: &ArgMatches, socket: &mut UnixStream) -> ApiResu .unwrap() .get_one::("send_migration_config") .unwrap(), - matches - .subcommand_matches("send-migration") - .unwrap() - .get_flag("send_migration_local"), - ); + )?; simple_api_command(socket, "PUT", "send-migration", Some(&send_migration_data)) .map_err(Error::HttpApiClient) } @@ -743,11 +743,7 @@ fn dbus_api_do_command(matches: &ArgMatches, proxy: &DBusApi1ProxyBlocking<'_>) .unwrap() .get_one::("send_migration_config") .unwrap(), - matches - .subcommand_matches("send-migration") - .unwrap() - .get_flag("send_migration_local"), - ); + )?; proxy.api_vm_send_migration(&send_migration_data) } Some("receive-migration") => { @@ -953,13 +949,11 @@ fn receive_migration_data(url: &str) -> String { 
serde_json::to_string(&receive_migration_data).unwrap() } -fn send_migration_data(url: &str, local: bool) -> String { - let send_migration_data = vmm::api::VmSendMigrationData { - destination_url: url.to_owned(), - local, - }; - - serde_json::to_string(&send_migration_data).unwrap() +fn send_migration_data(config: &str) -> Result { + let send_migration_data = + vmm::api::VmSendMigrationData::parse(config).map_err(Error::SendMigrationConfig)?; + let send_migration_config = serde_json::to_string(&send_migration_data).unwrap(); + Ok(send_migration_config) } fn create_data(path: &str) -> Result { @@ -1141,13 +1135,7 @@ fn get_cli_commands_sorted() -> Box<[Command]> { .arg( Arg::new("send_migration_config") .index(1) - .help(""), - ) - .arg( - Arg::new("send_migration_local") - .long("local") - .num_args(0) - .action(ArgAction::SetTrue), + .help(vmm::api::VmSendMigrationData::SYNTAX), ), Command::new("shutdown").about("Shutdown the VM"), Command::new("shutdown-vmm").about("Shutdown the VMM"), diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 4b8a6b2e37..5454cae6b6 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -9891,17 +9891,16 @@ mod live_migration { thread::sleep(std::time::Duration::new(1, 0)); // Start to send migration from the source VM - let mut args = [ + let args = [ format!("--api-socket={}", &src_api_socket), "send-migration".to_string(), - format! 
{"unix:{migration_socket}"}, + format!( + "destination_url=unix:{migration_socket},local={}", + if local { "on" } else { "off" } + ), ] .to_vec(); - if local { - args.insert(2, "--local".to_string()); - } - let mut send_migration = Command::new(clh_command("ch-remote")) .args(&args) .stderr(Stdio::piped()) @@ -11066,7 +11065,7 @@ mod live_migration { .args([ &format!("--api-socket={src_api_socket}"), "send-migration", - &format!("tcp:{host_ip}:{migration_port}"), + &format!("destination_url=tcp:{host_ip}:{migration_port}"), ]) .stdin(Stdio::null()) .stderr(Stdio::piped()) diff --git a/vmm/src/api/mod.rs b/vmm/src/api/mod.rs index 17ba0011b0..fd290a8a7c 100644 --- a/vmm/src/api/mod.rs +++ b/vmm/src/api/mod.rs @@ -38,6 +38,7 @@ use std::sync::mpsc::{RecvError, SendError, Sender, channel}; use log::info; use micro_http::Body; +use option_parser::{OptionParser, OptionParserError, Toggle}; use serde::{Deserialize, Serialize}; use thiserror::Error; use vm_migration::MigratableError; @@ -266,7 +267,12 @@ pub struct VmReceiveMigrationData { pub receiver_url: String, } -#[derive(Clone, Deserialize, Serialize, Default, Debug)] +#[derive(Debug, Error)] +#[error("Error parsing send migration parameters")] +pub struct VmSendMigrationParseError(#[source] OptionParserError); + +/// Configuration for an outgoing migration. 
+#[derive(Clone, Deserialize, Serialize, Debug)] pub struct VmSendMigrationData { /// URL to migrate the VM to pub destination_url: String, @@ -275,6 +281,33 @@ pub struct VmSendMigrationData { pub local: bool, } +impl VmSendMigrationData { + pub const SYNTAX: &'static str = "VM send migration parameters \ + \"destination_url=[,local=on|off]\""; + + pub fn parse(migration: &str) -> Result { + let mut parser = OptionParser::new(); + parser.add("destination_url").add("local"); + parser.parse(migration).map_err(VmSendMigrationParseError)?; + + let destination_url = parser.get("destination_url").ok_or_else(|| { + VmSendMigrationParseError(OptionParserError::InvalidSyntax( + "destination_url is required".to_string(), + )) + })?; + let local = parser + .convert::("local") + .map_err(VmSendMigrationParseError)? + .unwrap_or(Toggle(false)) + .0; + + Ok(Self { + destination_url, + local, + }) + } +} + pub enum ApiResponsePayload { /// No data is sent on the channel. Empty, @@ -1541,3 +1574,23 @@ impl ApiAction for VmNmi { get_response_body(self, api_evt, api_sender, data) } } + +#[cfg(test)] +mod unit_tests { + use super::*; + + #[test] + fn test_vm_send_migration_data_parse() { + // Fully specified + let data = VmSendMigrationData::parse("destination_url=tcp://192.168.1.1:8080,local=on") + .expect("valid migration string should parse"); + assert_eq!(data.destination_url, "tcp://192.168.1.1:8080"); + assert!(data.local); + + // Unknown option is an error + VmSendMigrationData::parse("destination_url=unix:/tmp/sock,unknown_field=foo").unwrap_err(); + + // Invalid toggle value is an error + VmSendMigrationData::parse("destination_url=unix:/tmp/sock,local=yes").unwrap_err(); + } +} From bbb0f083b01cf88bfcb4e2d66f78fe7681158cf4 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Fri, 13 Mar 2026 23:26:28 +0100 Subject: [PATCH 261/742] vmm: api: add configurable downtime and timeout to VmSendMigrationData Management software needs fine-grained control over live migration to 
meet QoS requirements for VM guests. Add `downtime_ms`, `timeout_s`, and `timeout_strategy` fields to `VmSendMigrationData`, exposed via API. This commit contains the API changes only; the VMM does not yet act on these values. This follows in the next commit. For the JSON API, downtime and timeout are represented as plain integers (downtime_ms and timeout_s) to make the units explicit. Using Duration directly would require custom (de)serialization logic, so instead the internal raw integers are exposed as Duration via getters. This introduces minor conversion overhead but keeps the Rust API clear and unambiguous. Signed-off-by: Philipp Schuster On-behalf-of: SAP philipp.schuster@sap.com --- vmm/src/api/mod.rs | 165 +++++++++++++++++++++++++++++++++++++++++++-- vmm/src/lib.rs | 12 +++- 2 files changed, 170 insertions(+), 7 deletions(-) diff --git a/vmm/src/api/mod.rs b/vmm/src/api/mod.rs index fd290a8a7c..4b51c7eb3d 100644 --- a/vmm/src/api/mod.rs +++ b/vmm/src/api/mod.rs @@ -34,7 +34,10 @@ pub mod dbus; pub mod http; use std::io; +use std::num::NonZeroU64; +use std::str::FromStr; use std::sync::mpsc::{RecvError, SendError, Sender, channel}; +use std::time::Duration; use log::info; use micro_http::Body; @@ -267,27 +270,83 @@ pub struct VmReceiveMigrationData { pub receiver_url: String, } +#[derive(Copy, Clone, Default, Deserialize, Serialize, Debug, PartialEq, Eq)] +/// The migration timeout strategy. +/// +/// This strategy describes the behavior of the migration when the target +/// downtime can't be reached in the given timeout. +pub enum TimeoutStrategy { + #[default] + /// Cancel the migration and keep the VM running on the source. + Cancel, + /// Ignore the timeout and migrate anyway. 
+ Ignore, +} + +impl FromStr for TimeoutStrategy { + type Err = String; + fn from_str(s: &str) -> Result { + match s.to_lowercase().as_str() { + "cancel" => Ok(TimeoutStrategy::Cancel), + "ignore" => Ok(TimeoutStrategy::Ignore), + _ => Err(format!("Invalid timeout strategy: {s}")), + } + } +} + #[derive(Debug, Error)] #[error("Error parsing send migration parameters")] pub struct VmSendMigrationParseError(#[source] OptionParserError); /// Configuration for an outgoing migration. #[derive(Clone, Deserialize, Serialize, Debug)] +#[cfg_attr(test, derive(PartialEq))] pub struct VmSendMigrationData { /// URL to migrate the VM to pub destination_url: String, /// Send memory across socket without copying #[serde(default)] pub local: bool, + /// The maximum downtime the migration aims for. + /// + /// Usually, on the order of a few hundred milliseconds. + #[serde(default = "VmSendMigrationData::default_downtime_ms")] + downtime_ms: NonZeroU64, + /// The timeout for the migration, i.e., the maximum duration. + #[serde(default = "VmSendMigrationData::default_timeout_s")] + timeout_s: NonZeroU64, + /// The timeout strategy for the migration. + #[serde(default)] + pub timeout_strategy: TimeoutStrategy, } impl VmSendMigrationData { pub const SYNTAX: &'static str = "VM send migration parameters \ - \"destination_url=[,local=on|off]\""; + \"destination_url=[,local=on|off,\ + downtime_ms=,timeout_s=,\ + timeout_strategy=cancel|ignore]\""; + + // Same as QEMU. 
+ pub const DEFAULT_DOWNTIME: Duration = Duration::from_millis(300); + pub const DEFAULT_TIMEOUT: Duration = Duration::from_secs(60 * 60 /* one hour */); + + fn default_downtime_ms() -> NonZeroU64 { + let ms_u64 = u64::try_from(Self::DEFAULT_DOWNTIME.as_millis()).unwrap(); + NonZeroU64::new(ms_u64).unwrap() + } + + fn default_timeout_s() -> NonZeroU64 { + NonZeroU64::new(Self::DEFAULT_TIMEOUT.as_secs()).unwrap() + } pub fn parse(migration: &str) -> Result { let mut parser = OptionParser::new(); - parser.add("destination_url").add("local"); + parser + .add("destination_url") + .add("local") + .add("downtime_ms") + .add("timeout_s") + .add("timeout_strategy"); parser.parse(migration).map_err(VmSendMigrationParseError)?; let destination_url = parser.get("destination_url").ok_or_else(|| { @@ -300,12 +359,49 @@ impl VmSendMigrationData { .map_err(VmSendMigrationParseError)? .unwrap_or(Toggle(false)) .0; + let downtime_ms = match parser + .convert::("downtime_ms") + .map_err(VmSendMigrationParseError)? + { + Some(v) => NonZeroU64::new(v).ok_or_else(|| { + VmSendMigrationParseError(OptionParserError::InvalidValue( + "downtime_ms must be non-zero".to_string(), + )) + })?, + None => Self::default_downtime_ms(), + }; + let timeout_s = match parser + .convert::("timeout_s") + .map_err(VmSendMigrationParseError)? + { + Some(v) => NonZeroU64::new(v).ok_or_else(|| { + VmSendMigrationParseError(OptionParserError::InvalidValue( + "timeout_s must be non-zero".to_string(), + )) + })?, + None => Self::default_timeout_s(), + }; + let timeout_strategy = parser + .convert("timeout_strategy") + .map_err(VmSendMigrationParseError)? 
+ .unwrap_or_default(); Ok(Self { destination_url, local, + downtime_ms, + timeout_s, + timeout_strategy, }) } + + pub fn downtime(&self) -> Duration { + Duration::from_millis(self.downtime_ms.get()) + } + + pub fn timeout(&self) -> Duration { + Duration::from_secs(self.timeout_s.get()) + } } pub enum ApiResponsePayload { @@ -1582,15 +1678,76 @@ mod unit_tests { #[test] fn test_vm_send_migration_data_parse() { // Fully specified - let data = VmSendMigrationData::parse("destination_url=tcp://192.168.1.1:8080,local=on") - .expect("valid migration string should parse"); + let data = VmSendMigrationData::parse( + "destination_url=tcp://192.168.1.1:8080,local=on,downtime_ms=200,timeout_s=3600,timeout_strategy=cancel" + ).expect("valid migration string should parse"); assert_eq!(data.destination_url, "tcp://192.168.1.1:8080"); assert!(data.local); + assert_eq!(data.downtime_ms.get(), 200); + assert_eq!(data.timeout_s.get(), 3600); + assert_eq!(data.timeout_strategy, TimeoutStrategy::Cancel); + + // Defaults applied when optional fields are omitted + let data = VmSendMigrationData::parse("destination_url=tcp://192.168.1.1:8080") + .expect("minimal migration string should parse"); + assert_eq!(data.destination_url, "tcp://192.168.1.1:8080"); + assert!(!data.local); + assert_eq!(data.downtime_ms, VmSendMigrationData::default_downtime_ms()); + assert_eq!(data.timeout_s, VmSendMigrationData::default_timeout_s()); + assert_eq!(data.timeout_strategy, TimeoutStrategy::default()); + + // Missing destination_url is an error + VmSendMigrationData::parse("local=on,downtime_ms=200").unwrap_err(); + + // Zero downtime_ms is rejected + let _data = + VmSendMigrationData::parse("destination_url=tcp://192.168.1.1:8080,downtime_ms=0") + .expect_err("zero downtime_ms should be rejected"); + + // Zero timeout_s is rejected + let _data = VmSendMigrationData::parse("destination_url=unix:/tmp/sock,timeout_s=0") + .expect_err("zero timeout_s should be rejected"); // Unknown option is an error 
VmSendMigrationData::parse("destination_url=unix:/tmp/sock,unknown_field=foo").unwrap_err(); // Invalid toggle value is an error VmSendMigrationData::parse("destination_url=unix:/tmp/sock,local=yes").unwrap_err(); + + // Timeout strategy + let _data = VmSendMigrationData::parse( + "destination_url=tcp://192.168.1.1:8080,timeout_strategy=invalid", + ) + .expect_err("invalid timeout_strategy should be rejected"); + + // Happy path with some defaults + let data = + VmSendMigrationData::parse("destination_url=tcp://192.168.1.1:8080,downtime_ms=150") + .unwrap(); + assert_eq!( + data, + VmSendMigrationData { + destination_url: "tcp://192.168.1.1:8080".to_string(), + local: false, + downtime_ms: NonZeroU64::new(150).unwrap(), + timeout_s: VmSendMigrationData::default_timeout_s(), + timeout_strategy: Default::default(), + } + ); + + // Happy path, fully specified + let data = + VmSendMigrationData::parse("destination_url=tcp://192.168.1.1:8080,downtime_ms=150,timeout_s=900,timeout_strategy=ignore") + .unwrap(); + assert_eq!( + data, + VmSendMigrationData { + destination_url: "tcp://192.168.1.1:8080".to_string(), + local: false, + downtime_ms: NonZeroU64::new(150).unwrap(), + timeout_s: NonZeroU64::new(900).unwrap(), + timeout_strategy: TimeoutStrategy::Ignore, + } + ); } } diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index cb72d69d96..bb775390a4 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -1308,6 +1308,8 @@ impl Vmm { fn do_memory_migration( vm: &mut Vm, socket: &mut SocketStream, + // Used in next commit + _send_data_migration: &VmSendMigrationData, ) -> result::Result<(), MigratableError> { const MAX_ITERATIONS: usize = 5; @@ -1424,7 +1426,7 @@ impl Vmm { // Now pause VM vm.pause()?; } else { - Self::do_memory_migration(vm, &mut socket)?; + Self::do_memory_migration(vm, &mut socket, send_data_migration)?; } // We release the locks early to enable locking them on the destination host.
@@ -2438,8 +2440,12 @@ impl RequestHandler for Vmm { send_data_migration: VmSendMigrationData, ) -> result::Result<(), MigratableError> { info!( - "Sending migration: destination_url = {}, local = {}", - send_data_migration.destination_url, send_data_migration.local + "Sending migration: destination_url={},local={},downtime={}ms,timeout={}s,timeout_strategy={:?}", + send_data_migration.destination_url, + send_data_migration.local, + send_data_migration.downtime().as_millis(), + send_data_migration.timeout().as_secs(), + send_data_migration.timeout_strategy ); if !self From 49e03c8bc529fbab3b8d31393fe9f1d069919c29 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Fri, 13 Mar 2026 23:26:47 +0100 Subject: [PATCH 262/742] vmm: migration: Converge based on user-provided downtime and timeout Wire the new `downtime_ms`, `timeout_s`, and `timeout_strategy` fields from `VmSendMigrationData` into the precopy loop, replacing the previous hard-coded 5-iteration cap. Each iteration now evaluates three convergence criteria in order: - no dirty pages remain; - the estimated final-iteration downtime is within the configured budget - or the overall migration timeout has elapsed. On timeout, `TimeoutStrategy::Cancel` aborts and keeps the VM live on the source, while `TimeoutStrategy::Ignore` proceeds regardless of the downtime target. The convergence callback is updated to return a Result to propagate the cancel error cleanly up the call stack. With the recent changes [0], it is fairly easy to implement the new checks and operate on actual metrics. These changes are inspired by [1] but differ significantly in details.
[0] https://github.com/cloud-hypervisor/cloud-hypervisor/pull/7799 [1] https://github.com/cloud-hypervisor/cloud-hypervisor/pull/7033 Signed-off-by: Philipp Schuster On-behalf-of: SAP philipp.schuster@sap.com --- vm-migration/src/context.rs | 4 +- vmm/src/lib.rs | 115 ++++++++++++++++++++++++++++++++---- 2 files changed, 104 insertions(+), 15 deletions(-) diff --git a/vm-migration/src/context.rs b/vm-migration/src/context.rs index 7dfa5b7d9e..d7680821cc 100644 --- a/vm-migration/src/context.rs +++ b/vm-migration/src/context.rs @@ -40,9 +40,9 @@ pub struct MemoryMigrationContext { /// /// Please note that this ignores any additional migration overhead and /// only looks at the memory transfer itself. - estimated_downtime: Option, + pub estimated_downtime: Option, /// Begin of the memory migration. - migration_begin: Instant, + pub migration_begin: Instant, /// Duration of the memory migration. /// /// This is only `None` until the last iteration is finished. diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index bb775390a4..d1cf6693a3 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -13,6 +13,7 @@ use std::panic::AssertUnwindSafe; use std::path::PathBuf; use std::sync::mpsc::{Receiver, RecvError, SendError, Sender}; use std::sync::{Arc, Mutex}; +use std::time::Duration; #[cfg(not(target_arch = "riscv64"))] use std::time::Instant; use std::{io, result, thread}; @@ -48,8 +49,8 @@ use vmm_sys_util::signal::unblock_signal; use vmm_sys_util::sock_ctrl_msg::ScmSocket; use crate::api::{ - ApiRequest, ApiResponse, RequestHandler, VmInfoResponse, VmReceiveMigrationData, - VmSendMigrationData, VmmPingResponse, + ApiRequest, ApiResponse, RequestHandler, TimeoutStrategy, VmInfoResponse, + VmReceiveMigrationData, VmSendMigrationData, VmmPingResponse, }; use crate::config::{MemoryRestoreMode, RestoreConfig, add_to_config}; #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] @@ -1258,7 +1259,7 @@ impl Vmm { vm: &mut Vm, socket: &mut SocketStream, ctx: &mut 
MemoryMigrationContext, - is_converged: impl Fn(&MemoryMigrationContext) -> bool, + is_converged: impl Fn(&MemoryMigrationContext) -> result::Result, ) -> result::Result { loop { let iteration_begin = Instant::now(); @@ -1271,7 +1272,7 @@ impl Vmm { }; ctx.update_metrics_before_transfer(iteration_begin, &iteration_table); - if is_converged(ctx) { + if is_converged(ctx)? { debug!("Precopy converged: {ctx}"); break Ok(iteration_table); } @@ -1299,6 +1300,95 @@ impl Vmm { } } + /// Checks whether the precopy memory migration has converged and it is safe + /// to proceed to the final (paused) memory iteration. + /// + /// Once this returns, the VM is expected to stop as soon as possible. + /// + /// Convergence is reached when any of the following criteria is met: + /// + /// 1. **No dirty pages remain** – the current iteration would transfer zero + /// bytes. + /// 2. **Downtime budget is met** – the estimated downtime for the final + /// (paused) iteration is within the caller-specified + /// [`VmSendMigrationData::downtime`] budget. + /// 3. **Timeout** – the precopy phase has been running for at least + /// [`VmSendMigrationData::timeout`]. The outcome depends on + /// [`VmSendMigrationData::timeout_strategy`]: + /// - [`TimeoutStrategy::Cancel`] – returns + /// [`MigratableError::MigrateSend`] so the caller can abort the + /// migration cleanly. + /// - [`TimeoutStrategy::Ignore`] – the migration completes despite not + /// meeting the downtime budget. + /// + /// # Returns + /// + /// * `Ok(true)` – convergence criterion met; the caller should stop precopy + /// iterations. + /// * `Ok(false)` – not yet converged; the caller should run another + /// dirty-page iteration. + /// * `Err(_)` – the timeout was reached and [`TimeoutStrategy::Cancel`] + /// is in effect.
+ fn is_precopy_converged( + ctx: &MemoryMigrationContext, + send_data_migration: &VmSendMigrationData, + ) -> result::Result { + if ctx.current_iteration_total_bytes == 0 { + debug!("Precopy: No more memory to transfer"); + return Ok(true); + } + + // We currently ignore the time required to transfer the final + // VM state (device state and vCPUs) and the time needed on the + // receiver to create the VM and initialize its data structures + // before execution can resume. + // + // Manual testing showed that migrating an idle VM on a modern + // AMD CPU (CHV release build) adds ~5 ms of overhead when + // scaling from 1 to 200 vCPUs. Given this small cost, we + // deliberately avoid additional heuristics to estimate the + // downtime more precisely - for now. Instead, we approximate + // the downtime just by the transfer time of the final memory + // delta. + if let Some(memory_downtime) = ctx.estimated_downtime + && memory_downtime <= send_data_migration.downtime() + { + debug!( + "Precopy: Target downtime can be met: {}ms <= {}ms", + memory_downtime.as_millis(), + send_data_migration.downtime().as_millis() + ); + return Ok(true); + } + + // We check the beginning of the precopy migration and not the overall migration, and + // this is fine: precopy takes the longest and the earlier steps are negligible. 
+ if ctx.migration_begin.elapsed() >= send_data_migration.timeout() { + return match send_data_migration.timeout_strategy { + TimeoutStrategy::Cancel => { + let msg = format!( + "Precopy: Timeout reached: {}s: migration didn't converge in time", + send_data_migration.timeout().as_secs() + ); + Err(MigratableError::MigrateSend(anyhow!("{msg}"))) + } + TimeoutStrategy::Ignore => { + info!( + "Precopy: Pausing VM, ignoring target downtime ({}ms) due to timeout ({}s): Estimated downtime: {}ms", + send_data_migration.downtime().as_millis(), + send_data_migration.timeout().as_secs(), + ctx.estimated_downtime + .unwrap_or(Duration::from_secs(0)) + .as_millis() + ); + Ok(true) + } + }; + } + + Ok(false) + } + /// Performs the memory migration including multiple iterations. /// /// This includes: @@ -1308,19 +1398,18 @@ impl Vmm { fn do_memory_migration( vm: &mut Vm, socket: &mut SocketStream, - // Used in next commit - _send_data_migration: &VmSendMigrationData, + send_data_migration: &VmSendMigrationData, ) -> result::Result<(), MigratableError> { - const MAX_ITERATIONS: usize = 5; - let mut ctx = MemoryMigrationContext::new(); - let is_converged = |ctx: &MemoryMigrationContext| { - // TODO: Add check for configurable downtime and max migration time #7111 - ctx.iteration >= MAX_ITERATIONS || ctx.current_iteration_total_bytes == 0 - }; vm.start_dirty_log()?; - let remaining = Self::do_memory_iterations(vm, socket, &mut ctx, is_converged)?; + let remaining = Self::do_memory_iterations( + vm, + socket, + &mut ctx, + // We bind send_data_migration to the callback + |ctx| Self::is_precopy_converged(ctx, send_data_migration), + )?; vm.pause()?; // Send last batch of dirty pages: final iteration From 49868f483ee1b79df7af799d32731e778f570c10 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Fri, 13 Mar 2026 23:24:26 +0100 Subject: [PATCH 263/742] vmm: update openapi spec Signed-off-by: Philipp Schuster On-behalf-of: SAP philipp.schuster@sap.com --- 
vmm/src/api/openapi/cloud-hypervisor.yaml | 27 +++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/vmm/src/api/openapi/cloud-hypervisor.yaml b/vmm/src/api/openapi/cloud-hypervisor.yaml index e8d3350dba..14f74018ea 100644 --- a/vmm/src/api/openapi/cloud-hypervisor.yaml +++ b/vmm/src/api/openapi/cloud-hypervisor.yaml @@ -1369,6 +1369,15 @@ components: receiver_url: type: string + TimeoutStrategy: + type: string + enum: ["Cancel", "Ignore"] + default: "Cancel" + description: > + The strategy to apply when the migration timeout is reached. + Cancel will abort the migration and keep the VM running on the source. + Ignore will proceed with the migration regardless of the downtime requirement. + SendMigrationData: required: - destination_url @@ -1378,6 +1387,24 @@ components: type: string local: type: boolean + downtime_ms: + type: integer + format: int64 + minimum: 1 + default: 300 + description: > + The maximum downtime the migration aims for, in milliseconds. + Defaults to 300ms. + timeout_s: + type: integer + format: int64 + minimum: 1 + default: 3600 + description: > + The timeout for the migration (maximum total duration), in seconds. + Defaults to 3600s (one hour). + timeout_strategy: + $ref: "#/components/schemas/TimeoutStrategy" VmAddUserDevice: required: From c8cee779b000de8288987becd6957e9ee75e8493 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Fri, 13 Mar 2026 23:23:58 +0100 Subject: [PATCH 264/742] docs: update live_migration.md Signed-off-by: Philipp Schuster On-behalf-of: SAP philipp.schuster@sap.com --- docs/live_migration.md | 39 +++++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/docs/live_migration.md b/docs/live_migration.md index ac5bf93f75..ac842d3172 100644 --- a/docs/live_migration.md +++ b/docs/live_migration.md @@ -3,8 +3,9 @@ This document gives examples of how to use the live migration support in Cloud Hypervisor: -1. 
local migration - migrating a VM from one Cloud Hypervisor instance to another on the same machine; -1. remote migration - migrating a VM between two machines; +1. **Local Migration**: Migrating a VM from one Cloud Hypervisor instance to another on the same machine; also called + UNIX socket migration. +1. **Remote Migration** (TCP Migration): migrating a VM between two TCP/IP hosts. > :warning: These examples place sockets in /tmp. This is done for > simplicity and should not be done in production. @@ -28,7 +29,8 @@ Launch the destination VM from the same directory (on the host machine): $ target/release/cloud-hypervisor --api-socket=/tmp/api2 ``` -Get ready for receiving migration for the destination VM (on the host machine): +Get ready for receiving migration for the destination VM (on the host +machine): ```console $ target/release/ch-remote --api-socket=/tmp/api2 receive-migration unix:/tmp/sock @@ -37,14 +39,16 @@ $ target/release/ch-remote --api-socket=/tmp/api2 receive-migration unix:/tmp/so Start to send migration for the source VM (on the host machine): ```console -$ target/release/ch-remote --api-socket=/tmp/api1 send-migration --local unix:/tmp/sock +$ target/release/ch-remote --api-socket=/tmp/api1 send-migration destination_url=unix:/tmp/sock,local=true ``` When the above commands completed, the source VM should be successfully migrated to the destination VM. Now the destination VM is running while the source VM is terminated gracefully. -## Remote Migration +## Remote Migration (TCP Migration) + +_Hint: For developing purposes, same-host TCP migrations are also supported._ In this example, we will migrate a VM from one machine (`src`) to another (`dst`) across the network. To keep it simple, we will use a @@ -171,7 +175,13 @@ After a few seconds the VM should be up and you can interact with it. 
Initiate the Migration over TCP: ```console -src $ ch-remote --api-socket=/tmp/api send-migration tcp:{dst}:{port} +src $ ch-remote --api-socket=/tmp/api send-migration destination_url=tcp:{dst}:{port} +``` + +With migration parameters: + +```console +src $ ch-remote --api-socket=/tmp/api send-migration destination_url=tcp:{dst}:{port},downtime_ms=200,timeout_s=3600,timeout_strategy=cancel ``` > Replace {dst}:{port} with the actual IP address and port of your destination host. @@ -180,3 +190,20 @@ After completing the above commands, the source VM will be migrated to the destination host and continue running there. The source VM instance will terminate normally. All ongoing processes and connections within the VM should remain intact after the migration. + +#### Migration Parameters + +Cloud Hypervisor supports additional parameters to control the +migration process. Via the API or `ch-remote`, you may specify: + +- `downtime_ms `: \ + The maximum downtime the migration aims for, in milliseconds. + Defaults to `300ms`. +- `timeout_s `: \ + The timeout for the migration (maximum total duration), in seconds. + Defaults to `3600s` (one hour). +- `timeout_strategy ` (`[cancel, ignore]`): \ + The strategy to apply when the migration timeout is reached. + Cancel will abort the migration and keep the VM running on the source. + Ignore will proceed with the migration regardless of the downtime requirement. + Defaults to `cancel`. From 98fd139111491277368b8a426d631ab1449ce14d Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Fri, 20 Mar 2026 18:04:32 +0100 Subject: [PATCH 265/742] tests: add integration tests This adds two new integration tests for the new functionality: - VM under load, downtime=1ms, timeout=1s, timeout_strategy=cancel - VM under load, downtime=1ms, timeout=1s, timeout_strategy=force By using a short downtime and timeout plus adding a stress worker in the guest, we can prevent quick migration. Therefore, we can nicely test the timeout_strategy. 
Testing for a specific downtime is cumbersome to do and highly depends on CPU/host utilization. To prevent flakiness, there is no such test integration test. I did, however, manual testing of that functionality. Signed-off-by: Philipp Schuster On-behalf-of: SAP philipp.schuster@sap.com --- cloud-hypervisor/tests/integration.rs | 173 ++++++++++++++++++++++++++ 1 file changed, 173 insertions(+) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 5454cae6b6..811dd431f0 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -9868,6 +9868,8 @@ mod vfio { } mod live_migration { + use vmm::api::TimeoutStrategy; + use crate::*; pub fn start_live_migration( @@ -11244,7 +11246,168 @@ mod live_migration { handle_child_output(r, &dest_output); } + fn _test_live_migration_tcp_timeout(timeout_strategy: TimeoutStrategy) { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)); + let kernel_path = direct_kernel_boot_path(); + let net_id = "net1337"; + let net_params = format!( + "id={},tap=,mac={},ip={},mask=255.255.255.128", + net_id, guest.network.guest_mac0, guest.network.host_ip0 + ); + let memory_param: &[&str] = &["--memory", "size=2G,shared=on"]; + let boot_vcpus = 2; + + let src_vm_path = clh_command("cloud-hypervisor"); + let src_api_socket = temp_api_path(&guest.tmp_dir); + let mut src_vm_cmd = GuestCommand::new_with_binary_path(&guest, &src_vm_path); + src_vm_cmd + .args(["--cpus", format!("boot={boot_vcpus}").as_str()]) + .args(memory_param) + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .default_disks() + .args(["--net", net_params.as_str()]) + .args(["--api-socket", &src_api_socket]) + .capture_output(); + let mut src_child = src_vm_cmd.spawn().unwrap(); + + let mut dest_api_socket = temp_api_path(&guest.tmp_dir); + dest_api_socket.push_str(".dest"); + let 
mut dest_child = GuestCommand::new(&guest) + .args(["--api-socket", &dest_api_socket]) + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { + guest.wait_vm_boot().unwrap(); + + assert_eq!(guest.get_cpu_count().unwrap_or_default(), boot_vcpus); + assert!(guest.get_total_memory().unwrap_or_default() > 2_000_000); + + // Start a memory stressor in the background to keep pages dirty, + // ensuring the precopy loop cannot converge within the 1s timeout. + guest + .ssh_command("nohup stress --vm 1 --vm-bytes 1G --vm-keep &>/dev/null &") + .unwrap(); + // Give stress a moment to actually start dirtying memory + thread::sleep(Duration::from_secs(3)); + + let migration_port = get_available_port(); + let host_ip = "127.0.0.1"; + + let mut receive_migration = Command::new(clh_command("ch-remote")) + .args([ + &format!("--api-socket={dest_api_socket}"), + "receive-migration", + &format!("tcp:0.0.0.0:{migration_port}"), + ]) + .stdin(Stdio::null()) + .stderr(Stdio::piped()) + .stdout(Stdio::piped()) + .spawn() + .unwrap(); + + thread::sleep(Duration::from_secs(1)); + + // Use a tight downtime budget (50ms) combined with a 1s timeout so the + // migration cannot converge regardless of strategy. 
+ let mut send_migration = Command::new(clh_command("ch-remote")) + .args([ + &format!("--api-socket={src_api_socket}"), + "send-migration", + &format!( + "destination_url=tcp:{host_ip}:{migration_port},downtime_ms=50,timeout_s=1,timeout_strategy={timeout_strategy:?}" + ), + ]) + .stdin(Stdio::null()) + .stderr(Stdio::piped()) + .stdout(Stdio::piped()) + .spawn() + .unwrap(); + + let send_status = send_migration + .wait_timeout(Duration::from_secs(60)) + .unwrap(); + let receive_status = receive_migration + .wait_timeout(Duration::from_secs(60)) + .unwrap(); + + // Clean up receive-migration regardless of its outcome + if receive_status.is_none() { + let _ = receive_migration.kill(); + } + + // Kill the stressor now that migration has completed or aborted, + // to reduce system load during post-migration checks. + let _ = guest.ssh_command("pkill -f 'stress --vm'"); + + match timeout_strategy { + TimeoutStrategy::Cancel => { + // With cancel strategy the send must fail and the source VM + // must keep running. + let send_failed = match send_status { + Some(status) => !status.success(), + None => { + let _ = send_migration.kill(); + false + } + }; + assert!( + send_failed, + "send-migration should have failed due to 1s timeout with cancel strategy" + ); + + thread::sleep(Duration::from_secs(2)); + assert!( + src_child.try_wait().unwrap().is_none(), + "Source VM should still be running after a cancelled migration" + ); + + // Confirm the source VM is still responsive over SSH + assert_eq!(guest.get_cpu_count().unwrap_or_default(), boot_vcpus); + assert!(guest.get_total_memory().unwrap_or_default() >= 2_000_000); + } + TimeoutStrategy::Ignore => { + // With Ignore strategy the send must succeed despite the timeout + // being reached, and the source VM must have terminated. 
+ let send_succeeded = match send_status { + Some(status) => status.success(), + None => { + let _ = send_migration.kill(); + false + } + }; + assert!( + send_succeeded, + "send-migration should have succeeded with timeout_strategy=ignore" + ); + + thread::sleep(Duration::from_secs(3)); + assert!( + src_child.try_wait().unwrap().is_some(), + "Source VM should have terminated after a forced migration" + ); + + // Confirm the VM is still responsive over SSH on the new host + assert_eq!(guest.get_cpu_count().unwrap_or_default(), boot_vcpus); + assert!(guest.get_total_memory().unwrap_or_default() >= 2_000_000); + } + } + })); + + let _ = src_child.kill(); + let src_output = src_child.wait_with_output().unwrap(); + let _ = dest_child.kill(); + let _dest_output = dest_child.wait_with_output().unwrap(); + + handle_child_output(r, &src_output); + } + mod live_migration_parallel { + use vmm::api::TimeoutStrategy; + use super::*; #[test] fn test_live_migration_basic() { @@ -11261,6 +11424,16 @@ mod live_migration { _test_live_migration_tcp(); } + #[test] + fn test_live_migration_tcp_timeout_cancel() { + _test_live_migration_tcp_timeout(TimeoutStrategy::Cancel); + } + + #[test] + fn test_live_migration_tcp_timeout_ignore() { + _test_live_migration_tcp_timeout(TimeoutStrategy::Ignore); + } + #[test] fn test_live_migration_watchdog() { _test_live_migration_watchdog(false, false); From 81495241eb1f49e7a1623501f8a49b092be55b05 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Sat, 21 Mar 2026 09:11:32 +0100 Subject: [PATCH 266/742] tests: reduce memory pressure, fix ARM tests in CI Mosts tests used 4GB of RAM, although the VM is mostly idling. In CI, we experienced OOM issues on the ARM runners. If we reduce the VM memory of the parallel live migration tests to 1.5GB RAM, we still have enough capacity in the VM so that everything succeeds while reducing resource usage. 
Signed-off-by: Philipp Schuster On-behalf-of: SAP philipp.schuster@sap.com --- cloud-hypervisor/tests/integration.rs | 35 ++++++++++++--------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 811dd431f0..87cf4f2396 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -10026,9 +10026,9 @@ mod live_migration { ); let memory_param: &[&str] = if local { - &["--memory", "size=4G,shared=on"] + &["--memory", "size=1500M,shared=on"] } else { - &["--memory", "size=4G"] + &["--memory", "size=1500M"] }; let boot_vcpus = 2; @@ -10084,7 +10084,7 @@ mod live_migration { assert_eq!(guest.get_cpu_count().unwrap_or_default(), boot_vcpus); // Check the guest RAM - assert!(guest.get_total_memory().unwrap_or_default() > 3_840_000); + assert!(guest.get_total_memory().unwrap_or_default() > 1_400_000); // Check the guest virtio-devices, e.g. block, rng, console, and net guest.check_devices_common(None, Some(&console_text), Some(&pmem_path)); @@ -10151,7 +10151,7 @@ mod live_migration { let r = std::panic::catch_unwind(|| { // Perform same checks to validate VM has been properly migrated assert_eq!(guest.get_cpu_count().unwrap_or_default(), boot_vcpus); - assert!(guest.get_total_memory().unwrap_or_default() > 3_840_000); + assert!(guest.get_total_memory().unwrap_or_default() > 1_400_000); guest.check_devices_common(None, Some(&console_text), Some(&pmem_path)); }); @@ -10616,9 +10616,9 @@ mod live_migration { ); let memory_param: &[&str] = if local { - &["--memory", "size=4G,shared=on"] + &["--memory", "size=1500M,shared=on"] } else { - &["--memory", "size=4G"] + &["--memory", "size=1500M"] }; let boot_vcpus = 2; @@ -10674,7 +10674,7 @@ mod live_migration { // Check the number of vCPUs assert_eq!(guest.get_cpu_count().unwrap_or_default(), boot_vcpus); // Check the guest RAM - assert!(guest.get_total_memory().unwrap_or_default() > 3_840_000); 
+ assert!(guest.get_total_memory().unwrap_or_default() > 1_400_000); // Check the guest virtio-devices, e.g. block, rng, console, and net guest.check_devices_common(None, Some(&console_text), Some(&pmem_path)); // x86_64: Following what's done in the `test_snapshot_restore`, we need @@ -10759,7 +10759,7 @@ mod live_migration { let r = std::panic::catch_unwind(|| { // Perform same checks to validate VM has been properly migrated assert_eq!(guest.get_cpu_count().unwrap_or_default(), boot_vcpus); - assert!(guest.get_total_memory().unwrap_or_default() > 3_840_000); + assert!(guest.get_total_memory().unwrap_or_default() > 1_400_000); guest.check_devices_common(None, Some(&console_text), Some(&pmem_path)); @@ -10934,7 +10934,7 @@ mod live_migration { "--cpus", format!("boot={boot_vcpus},max={max_vcpus}").as_str(), ]) - .args(["--memory", "size=4G,shared=on"]) + .args(["--memory", "size=1500M,shared=on"]) .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .default_disks() @@ -10966,7 +10966,7 @@ mod live_migration { assert_eq!(guest.get_cpu_count().unwrap_or_default(), boot_vcpus); // Check the guest RAM - assert!(guest.get_total_memory().unwrap_or_default() > 3_840_000); + assert!(guest.get_total_memory().unwrap_or_default() > 1_400_000); // Check Landlock is enabled by hot-plugging a disk. assert!(!remote_command( @@ -11016,7 +11016,7 @@ mod live_migration { let r = std::panic::catch_unwind(|| { // Perform same checks to validate VM has been properly migrated assert_eq!(guest.get_cpu_count().unwrap_or_default(), boot_vcpus); - assert!(guest.get_total_memory().unwrap_or_default() > 3_840_000); + assert!(guest.get_total_memory().unwrap_or_default() > 1_400_000); }); // Check Landlock is enabled on destination VM by hot-plugging a disk. 
@@ -11128,7 +11128,7 @@ mod live_migration { "id={},tap=,mac={},ip={},mask=255.255.255.128", net_id, guest.network.guest_mac0, guest.network.host_ip0 ); - let memory_param: &[&str] = &["--memory", "size=4G,shared=on"]; + let memory_param: &[&str] = &["--memory", "size=1500M,shared=on"]; let boot_vcpus = 2; let max_vcpus = 4; let pmem_temp_file = TempFile::new().unwrap(); @@ -11178,7 +11178,7 @@ mod live_migration { guest.wait_vm_boot().unwrap(); // Ensure the source VM is running normally assert_eq!(guest.get_cpu_count().unwrap_or_default(), boot_vcpus); - assert!(guest.get_total_memory().unwrap_or_default() > 3_840_000); + assert!(guest.get_total_memory().unwrap_or_default() > 1_400_000); guest.check_devices_common(None, Some(&console_text), Some(&pmem_path)); // On x86_64 architecture, remove and re-add the virtio-net device @@ -11230,7 +11230,7 @@ mod live_migration { let r = std::panic::catch_unwind(|| { // Perform the same checks to ensure the VM has migrated correctly assert_eq!(guest.get_cpu_count().unwrap_or_default(), boot_vcpus); - assert!(guest.get_total_memory().unwrap_or_default() > 3_840_000); + assert!(guest.get_total_memory().unwrap_or_default() > 1_400_000); guest.check_devices_common(None, Some(&console_text), Some(&pmem_path)); }); @@ -11255,7 +11255,7 @@ mod live_migration { "id={},tap=,mac={},ip={},mask=255.255.255.128", net_id, guest.network.guest_mac0, guest.network.host_ip0 ); - let memory_param: &[&str] = &["--memory", "size=2G,shared=on"]; + let memory_param: &[&str] = &["--memory", "size=1500M,shared=on"]; let boot_vcpus = 2; let src_vm_path = clh_command("cloud-hypervisor"); @@ -11284,12 +11284,11 @@ mod live_migration { guest.wait_vm_boot().unwrap(); assert_eq!(guest.get_cpu_count().unwrap_or_default(), boot_vcpus); - assert!(guest.get_total_memory().unwrap_or_default() > 2_000_000); // Start a memory stressor in the background to keep pages dirty, // ensuring the precopy loop cannot converge within the 1s timeout. 
guest - .ssh_command("nohup stress --vm 1 --vm-bytes 1G --vm-keep &>/dev/null &") + .ssh_command("nohup stress --vm 2 --vm-bytes 200M --vm-keep &>/dev/null &") .unwrap(); // Give stress a moment to actually start dirtying memory thread::sleep(Duration::from_secs(3)); @@ -11367,7 +11366,6 @@ mod live_migration { // Confirm the source VM is still responsive over SSH assert_eq!(guest.get_cpu_count().unwrap_or_default(), boot_vcpus); - assert!(guest.get_total_memory().unwrap_or_default() >= 2_000_000); } TimeoutStrategy::Ignore => { // With Ignore strategy the send must succeed despite the timeout @@ -11392,7 +11390,6 @@ mod live_migration { // Confirm the VM is still responsive over SSH on the new host assert_eq!(guest.get_cpu_count().unwrap_or_default(), boot_vcpus); - assert!(guest.get_total_memory().unwrap_or_default() >= 2_000_000); } } })); From 00b3a48900bbed866011bc6ef5738ddcfe31ca71 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Tue, 24 Mar 2026 18:40:42 +0100 Subject: [PATCH 267/742] docs: revisit AI/LLM policy in CONTRIBUTING.md This summarizes the latest state all relevant parties have agreed on in the last meeting. This commit was rephrased and improved with the help of Claude Code using the Sonnet 4.6 model. Signed-off-by: Philipp Schuster On-behalf-of: SAP philipp.schuster@sap.com --- CONTRIBUTING.md | 51 ++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 7 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 1518d0f1fb..ead1cbe200 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -164,12 +164,49 @@ Signed-off-by: Sebastien Boeuf Then, after the corresponding PR is merged, GitHub will automatically close that issue when parsing the [commit message](https://help.github.com/articles/closing-issues-via-commit-messages/). 
-## AI Generated Code +## AI/LLM Assistance & Generated Code -Our policy is to decline any contributions known to contain contents -generated or derived from using Large Language Models (LLMs). This -includes ChatGPT, Gemini, Claude, Copilot and similar tools. +We recommend **a careful and conservative approach** to LLM usage, guided by +sound engineering judgment. Please use AI/LLM-assisted tooling thoughtfully and +responsibly to ensure efficient use of limited project resources, particularly +in code review and long-term maintenance. Our primary goals are to avoid +ambiguity in license compliance and to keep contributions clear and easy to +review. -The goal is to avoid ambiguity in license compliance and optimize the -use of limited project resources, especially for code review and -maintenance. This policy can be revisited as LLMs evolve and mature. +Or in other words: please apply common sense and don't blindly accept LLM +suggestions. + +This policy can be revisited as LLMs evolve and mature. + +### Code Review + +We generally recommend doing early coarse-grained reviews using state-of-the-art +LLMs. This can help identify rough edges, copy & paste errors, and typos early +on. This reduces review cycles for human reviewers. + +Please **do not** use GitHub Copilot directly in PRs to keep discussions clean. +Instead, ask an LLM of your choice for a review. A convenient way to do this is + +- appending `.patch` to the GitHub PR URL + (e.g., `https://github.com/cloud-hypervisor/cloud-hypervisor/pull/1234.patch`) + and pasting it into the LLM of your choice, or +- using a local agent in your terminal, such as `codex` or `claude`. + +### Contributions assisted by LLMs + +All contributions **must** be submitted by a human contributor. Automated or +bot-driven PRs are not accepted. + +You are responsible for every piece of code you submit, and you must understand +both the design and the implementation details. 
LLMs are useful for prototyping +and generating boilerplate code. However, large or complex logic must be +authored and fully understood by the contributor - LLM output should not be +submitted without careful review and comprehension. + +Please disclose LLM use in your commit message and PR description if it +meaningfully contributed to the submitted code. Again, we recommend careful and +conservative use of LLMs, guided by common sense. + +Maintainers reserve the right to request additional clarification or decline +contributions where LLM usage raises concerns. Ultimately, acceptance of any +contribution is at the maintainers' discretion. From 56771a0f4cdee8b7c044403a5176e01fcd06d7c9 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Tue, 24 Mar 2026 18:40:55 +0100 Subject: [PATCH 268/742] docs: remove trailing spaces in CONTRIBUTING.md Signed-off-by: Philipp Schuster On-behalf-of: SAP philipp.schuster@sap.com --- CONTRIBUTING.md | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ead1cbe200..cdda170b4a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -21,7 +21,7 @@ convention and enforce it through the Continuous Integration (CI) process callin ```sh # We currently rely on nightly-only formatting features -cargo +nightly fmt --all +cargo +nightly fmt --all cargo check --all-targets --tests cargo clippy --all-targets --tests # Please note that this will not execute integration tests. @@ -36,7 +36,7 @@ gitlint --commits "HEAD~3..HEAD" _Caution: These tests are taking a long time to complete (40+ mins) and need special setup._ ```sh - bash ./scripts/dev_cli.sh tests --integration -- --test-filter '' + bash ./scripts/dev_cli.sh tests --integration -- --test-filter '' ``` ### Setup Commit Hook @@ -71,35 +71,35 @@ We require patches to: - Follow the pattern: \ ``` : Change summary - + More detailed explanation of your changes: Why and how. Wrap it to 72 characters. 
See http://chris.beams.io/posts/git-commit/ for some more good pieces of advice. - + Signed-off-by: ``` - + Valid components are listed in `TitleStartsWithComponent.py`. In short, each -cargo workspace member is a valid component as well as `build`, `ci`, `docs` and +cargo workspace member is a valid component as well as `build`, `ci`, `docs` and `misc`. Example patch: ``` vm-virtio: Reset underlying device on driver request - + If the driver triggers a reset by writing zero into the status register then reset the underlying device if supported. A device reset also requires resetting various aspects of the queue. - + In order to be able to do a subsequent reactivate it is required to reclaim certain resources (interrupt and queue EventFDs.) If a device reset is requested by the driver but the underlying device does not support it then generate an error as the driver would not be able to configure it anyway. - + Signed-off-by: Rob Bradford ``` @@ -109,11 +109,11 @@ We value a clean, **reviewable** commit history. Each commit should represent a self-contained, logical step that guides reviewers clearly from A to B. Avoid patterns like `init A -> init B -> fix A` or \ -`init design A -> revert A -> use design B`. Commits must be independently +`init design A -> revert A -> use design B`. Commits must be independently reviewable - don't leave "fix previous commit" or earlier design attempts in the history. -Intermediate work-in-progress changes are acceptable only if a subsequent +Intermediate work-in-progress changes are acceptable only if a subsequent commit in the same series cleans them up (e.g. a temporary `#[allow(unused)]` removed in the next commit). @@ -150,14 +150,14 @@ comments or by adding the `Fixes` keyword to your commit message: ``` serial: Set terminal in raw mode - + In order to have proper output from the serial, we need to setup the terminal in raw mode. 
When the VM is shutting down, it is also the VMM responsibility to set the terminal back into canonical mode if we don't want to get any weird behavior from the shell. - + Fixes #88 - + Signed-off-by: Sebastien Boeuf ``` From c75504c833e3c0fc5c614986145b260b5db3dce7 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 00:17:38 -0700 Subject: [PATCH 269/742] tests: move CVM tests to integration_cvm.rs Move all CVM (Confidential VM) integration tests from integration.rs into a dedicated integration_cvm.rs file. This separation improves code organization and makes it easier to manage CVM-specific test cases independently from regular integration tests. Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 207 -------------------- cloud-hypervisor/tests/integration_cvm.rs | 223 ++++++++++++++++++++++ 2 files changed, 223 insertions(+), 207 deletions(-) create mode 100644 cloud-hypervisor/tests/integration_cvm.rs diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 87cf4f2396..3af45c35c3 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -11982,210 +11982,3 @@ mod fw_cfg { handle_child_output(r, &output); } } - -#[cfg(all(feature = "sev_snp", target_arch = "x86_64"))] -mod common_cvm { - use crate::*; - - #[test] - fn test_focal_simple_launch() { - let disk_config = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); - let guest = - GuestFactory::new_confidential_guest_factory().create_guest(Box::new(disk_config)); - - _test_simple_launch(&guest); - } - - #[test] - fn test_api_http_create_boot() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = GuestFactory::new_confidential_guest_factory() - .create_guest(Box::new(disk_config)) - .with_cpu(4); - let target_api = TargetApi::new_http_api(&guest.tmp_dir); - _test_api_create_boot(&target_api, &guest); - } - - #[test] - fn test_api_http_shutdown() { - let 
disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = GuestFactory::new_confidential_guest_factory() - .create_guest(Box::new(disk_config)) - .with_cpu(4); - - let target_api = TargetApi::new_http_api(&guest.tmp_dir); - _test_api_shutdown(&target_api, &guest); - } - - #[test] - fn test_api_http_delete() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = GuestFactory::new_confidential_guest_factory() - .create_guest(Box::new(disk_config)) - .with_cpu(4); - - let target_api = TargetApi::new_http_api(&guest.tmp_dir); - _test_api_delete(&target_api, &guest); - } - - #[test] - fn test_power_button() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = - GuestFactory::new_confidential_guest_factory().create_guest(Box::new(disk_config)); - _test_power_button(&guest); - } - - #[test] - fn test_virtio_vsock() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = - GuestFactory::new_confidential_guest_factory().create_guest(Box::new(disk_config)); - _test_virtio_vsock(&guest, false); - } - - #[test] - fn test_multi_cpu() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = - GuestFactory::new_confidential_guest_factory().create_guest(Box::new(disk_config)); - _test_multi_cpu(&guest); - } - - #[test] - fn test_cpu_affinity() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = GuestFactory::new_confidential_guest_factory() - .create_guest(Box::new(disk_config)) - .with_cpu(2); - _test_cpu_affinity(&guest); - } - - #[test] - fn test_virtio_queue_affinity() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = GuestFactory::new_confidential_guest_factory() - .create_guest(Box::new(disk_config)) - .with_cpu(4); - _test_virtio_queue_affinity(&guest); - } - - #[test] - fn test_pci_msi() { - let disk_config = 
UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = - GuestFactory::new_confidential_guest_factory().create_guest(Box::new(disk_config)); - _test_pci_msi(&guest); - } - - #[test] - fn test_virtio_net_ctrl_queue() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = - GuestFactory::new_confidential_guest_factory().create_guest(Box::new(disk_config)); - _test_virtio_net_ctrl_queue(&guest); - } - - #[test] - fn test_pci_multiple_segments() { - // Use 8 segments to test the multiple segment support since it's more than the default 6 - // supported by Linux - // IGVM file used by Sev-Snp Guest now support up to 8 segments, so we can use 8 segments for testing. - let num_pci_segments: u16 = 8; - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = - GuestFactory::new_confidential_guest_factory().create_guest(Box::new(disk_config)); - _test_pci_multiple_segments(&guest, num_pci_segments, 5); - } - - #[test] - fn test_direct_kernel_boot() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = - GuestFactory::new_confidential_guest_factory().create_guest(Box::new(disk_config)); - _test_direct_kernel_boot(&guest); - } - - #[test] - fn test_virtio_block_io_uring() { - let guest = make_virtio_block_guest( - &GuestFactory::new_confidential_guest_factory(), - FOCAL_IMAGE_NAME, - ); - _test_virtio_block(&guest, false, true, false, false, ImageType::Raw); - } - - #[test] - fn test_virtio_block_aio() { - let guest = make_virtio_block_guest( - &GuestFactory::new_confidential_guest_factory(), - FOCAL_IMAGE_NAME, - ); - _test_virtio_block(&guest, true, false, false, false, ImageType::Raw); - } - - #[test] - fn test_virtio_block_sync() { - let guest = make_virtio_block_guest( - &GuestFactory::new_confidential_guest_factory(), - FOCAL_IMAGE_NAME, - ); - _test_virtio_block(&guest, true, true, false, false, ImageType::Raw); - } - - #[test] - fn test_virtio_block_qcow2() { - 
let guest = make_virtio_block_guest( - &GuestFactory::new_confidential_guest_factory(), - JAMMY_IMAGE_NAME_QCOW2, - ); - _test_virtio_block(&guest, false, false, true, false, ImageType::Qcow2); - } - - #[test] - fn test_virtio_block_qcow2_zlib() { - let guest = make_virtio_block_guest( - &GuestFactory::new_confidential_guest_factory(), - JAMMY_IMAGE_NAME_QCOW2_ZLIB, - ); - _test_virtio_block(&guest, false, false, true, false, ImageType::Qcow2); - } - - #[test] - fn test_virtio_block_qcow2_zstd() { - let guest = make_virtio_block_guest( - &GuestFactory::new_confidential_guest_factory(), - JAMMY_IMAGE_NAME_QCOW2_ZSTD, - ); - _test_virtio_block(&guest, false, false, true, false, ImageType::Qcow2); - } - - #[test] - fn test_virtio_block_qcow2_backing_zstd_file() { - let guest = make_virtio_block_guest( - &GuestFactory::new_confidential_guest_factory(), - JAMMY_IMAGE_NAME_QCOW2_BACKING_ZSTD_FILE, - ); - - _test_virtio_block(&guest, false, false, true, true, ImageType::Qcow2); - } - - #[test] - fn test_virtio_block_qcow2_backing_uncompressed_file() { - let guest = make_virtio_block_guest( - &GuestFactory::new_confidential_guest_factory(), - JAMMY_IMAGE_NAME_QCOW2_BACKING_UNCOMPRESSED_FILE, - ); - - _test_virtio_block(&guest, false, false, true, true, ImageType::Qcow2); - } - - #[test] - fn test_virtio_block_qcow2_backing_raw_file() { - let guest = make_virtio_block_guest( - &GuestFactory::new_confidential_guest_factory(), - JAMMY_IMAGE_NAME_QCOW2_BACKING_RAW_FILE, - ); - _test_virtio_block(&guest, false, false, true, true, ImageType::Qcow2); - } -} diff --git a/cloud-hypervisor/tests/integration_cvm.rs b/cloud-hypervisor/tests/integration_cvm.rs new file mode 100644 index 0000000000..ed848c8f1d --- /dev/null +++ b/cloud-hypervisor/tests/integration_cvm.rs @@ -0,0 +1,223 @@ +// Copyright 2026 The Cloud Hypervisor Authors. All rights reserved. 
+// +// SPDX-License-Identifier: Apache-2.0 +// +#![cfg(any(devcli_testenv, clippy))] +#![allow(clippy::undocumented_unsafe_blocks)] +// When enabling the `mshv` feature, we skip quite some tests and +// hence have known dead-code. This annotation silences dead-code +// related warnings for our quality workflow to pass. +#![allow(dead_code)] +mod common; + +#[cfg(all(feature = "sev_snp", target_arch = "x86_64"))] +mod common_cvm { + use block::ImageType; + use common::tests_wrappers::*; + use common::utils::*; + use test_infra::*; + + use super::*; + + #[test] + fn test_focal_simple_launch() { + let disk_config = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); + let guest = + GuestFactory::new_confidential_guest_factory().create_guest(Box::new(disk_config)); + + _test_simple_launch(&guest); + } + + #[test] + fn test_api_http_create_boot() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = GuestFactory::new_confidential_guest_factory() + .create_guest(Box::new(disk_config)) + .with_cpu(4); + let target_api = TargetApi::new_http_api(&guest.tmp_dir); + _test_api_create_boot(&target_api, &guest); + } + + #[test] + fn test_api_http_shutdown() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = GuestFactory::new_confidential_guest_factory() + .create_guest(Box::new(disk_config)) + .with_cpu(4); + + let target_api = TargetApi::new_http_api(&guest.tmp_dir); + _test_api_shutdown(&target_api, &guest); + } + + #[test] + fn test_api_http_delete() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = GuestFactory::new_confidential_guest_factory() + .create_guest(Box::new(disk_config)) + .with_cpu(4); + + let target_api = TargetApi::new_http_api(&guest.tmp_dir); + _test_api_delete(&target_api, &guest); + } + + #[test] + fn test_power_button() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = + 
GuestFactory::new_confidential_guest_factory().create_guest(Box::new(disk_config)); + _test_power_button(&guest); + } + + #[test] + fn test_virtio_vsock() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = + GuestFactory::new_confidential_guest_factory().create_guest(Box::new(disk_config)); + _test_virtio_vsock(&guest, false); + } + + #[test] + fn test_multi_cpu() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = + GuestFactory::new_confidential_guest_factory().create_guest(Box::new(disk_config)); + _test_multi_cpu(&guest); + } + + #[test] + fn test_cpu_affinity() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = GuestFactory::new_confidential_guest_factory() + .create_guest(Box::new(disk_config)) + .with_cpu(2); + _test_cpu_affinity(&guest); + } + + #[test] + fn test_virtio_queue_affinity() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = GuestFactory::new_confidential_guest_factory() + .create_guest(Box::new(disk_config)) + .with_cpu(4); + _test_virtio_queue_affinity(&guest); + } + + #[test] + fn test_pci_msi() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = + GuestFactory::new_confidential_guest_factory().create_guest(Box::new(disk_config)); + _test_pci_msi(&guest); + } + + #[test] + fn test_virtio_net_ctrl_queue() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = + GuestFactory::new_confidential_guest_factory().create_guest(Box::new(disk_config)); + _test_virtio_net_ctrl_queue(&guest); + } + + #[test] + fn test_pci_multiple_segments() { + // Use 8 segments to test the multiple segment support since it's more than the default 6 + // supported by Linux + // IGVM file used by Sev-Snp Guest now support up to 8 segments, so we can use 8 segments for testing. 
+ let num_pci_segments: u16 = 8; + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = + GuestFactory::new_confidential_guest_factory().create_guest(Box::new(disk_config)); + _test_pci_multiple_segments(&guest, num_pci_segments, 5); + } + + #[test] + fn test_direct_kernel_boot() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = + GuestFactory::new_confidential_guest_factory().create_guest(Box::new(disk_config)); + _test_direct_kernel_boot(&guest); + } + + #[test] + fn test_virtio_block_io_uring() { + let guest = make_virtio_block_guest( + &GuestFactory::new_confidential_guest_factory(), + FOCAL_IMAGE_NAME, + ); + _test_virtio_block(&guest, false, true, false, false, ImageType::Raw); + } + + #[test] + fn test_virtio_block_aio() { + let guest = make_virtio_block_guest( + &GuestFactory::new_confidential_guest_factory(), + FOCAL_IMAGE_NAME, + ); + _test_virtio_block(&guest, true, false, false, false, ImageType::Raw); + } + + #[test] + fn test_virtio_block_sync() { + let guest = make_virtio_block_guest( + &GuestFactory::new_confidential_guest_factory(), + FOCAL_IMAGE_NAME, + ); + _test_virtio_block(&guest, true, true, false, false, ImageType::Raw); + } + + #[test] + fn test_virtio_block_qcow2() { + let guest = make_virtio_block_guest( + &GuestFactory::new_confidential_guest_factory(), + JAMMY_IMAGE_NAME_QCOW2, + ); + _test_virtio_block(&guest, false, false, true, false, ImageType::Qcow2); + } + + #[test] + fn test_virtio_block_qcow2_zlib() { + let guest = make_virtio_block_guest( + &GuestFactory::new_confidential_guest_factory(), + JAMMY_IMAGE_NAME_QCOW2_ZLIB, + ); + _test_virtio_block(&guest, false, false, true, false, ImageType::Qcow2); + } + + #[test] + fn test_virtio_block_qcow2_zstd() { + let guest = make_virtio_block_guest( + &GuestFactory::new_confidential_guest_factory(), + JAMMY_IMAGE_NAME_QCOW2_ZSTD, + ); + _test_virtio_block(&guest, false, false, true, false, ImageType::Qcow2); + } + + 
#[test] + fn test_virtio_block_qcow2_backing_zstd_file() { + let guest = make_virtio_block_guest( + &GuestFactory::new_confidential_guest_factory(), + JAMMY_IMAGE_NAME_QCOW2_BACKING_ZSTD_FILE, + ); + + _test_virtio_block(&guest, false, false, true, true, ImageType::Qcow2); + } + + #[test] + fn test_virtio_block_qcow2_backing_uncompressed_file() { + let guest = make_virtio_block_guest( + &GuestFactory::new_confidential_guest_factory(), + JAMMY_IMAGE_NAME_QCOW2_BACKING_UNCOMPRESSED_FILE, + ); + + _test_virtio_block(&guest, false, false, true, true, ImageType::Qcow2); + } + + #[test] + fn test_virtio_block_qcow2_backing_raw_file() { + let guest = make_virtio_block_guest( + &GuestFactory::new_confidential_guest_factory(), + JAMMY_IMAGE_NAME_QCOW2_BACKING_RAW_FILE, + ); + _test_virtio_block(&guest, false, false, true, true, ImageType::Qcow2); + } +} From 7802470906cc1cf7c034345116f1cf5ea26cb88e Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 16:17:55 -0700 Subject: [PATCH 270/742] tests: add basic_cvm_guest macro for CVM tests Introduce a basic_cvm_guest! macro in integration_cvm.rs to reduce boilerplate when creating confidential VM guest instances. This replaces repetitive UbuntuDiskConfig and GuestFactory::new_confidential_guest_factory() calls across multiple CVM test functions. Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration_cvm.rs | 64 +++++++---------------- 1 file changed, 19 insertions(+), 45 deletions(-) diff --git a/cloud-hypervisor/tests/integration_cvm.rs b/cloud-hypervisor/tests/integration_cvm.rs index ed848c8f1d..e2b7b08add 100644 --- a/cloud-hypervisor/tests/integration_cvm.rs +++ b/cloud-hypervisor/tests/integration_cvm.rs @@ -18,32 +18,30 @@ mod common_cvm { use test_infra::*; use super::*; + macro_rules! 
basic_cvm_guest { + ($image_name:expr) => {{ + let disk_config = UbuntuDiskConfig::new($image_name.to_string()); + GuestFactory::new_confidential_guest_factory().create_guest(Box::new(disk_config)) + }}; + } #[test] fn test_focal_simple_launch() { - let disk_config = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); - let guest = - GuestFactory::new_confidential_guest_factory().create_guest(Box::new(disk_config)); + let guest = basic_cvm_guest!(FOCAL_IMAGE_NAME); _test_simple_launch(&guest); } #[test] fn test_api_http_create_boot() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = GuestFactory::new_confidential_guest_factory() - .create_guest(Box::new(disk_config)) - .with_cpu(4); + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME).with_cpu(4); let target_api = TargetApi::new_http_api(&guest.tmp_dir); _test_api_create_boot(&target_api, &guest); } #[test] fn test_api_http_shutdown() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = GuestFactory::new_confidential_guest_factory() - .create_guest(Box::new(disk_config)) - .with_cpu(4); + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME).with_cpu(4); let target_api = TargetApi::new_http_api(&guest.tmp_dir); _test_api_shutdown(&target_api, &guest); @@ -51,70 +49,50 @@ mod common_cvm { #[test] fn test_api_http_delete() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = GuestFactory::new_confidential_guest_factory() - .create_guest(Box::new(disk_config)) - .with_cpu(4); - + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); let target_api = TargetApi::new_http_api(&guest.tmp_dir); _test_api_delete(&target_api, &guest); } #[test] fn test_power_button() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = - GuestFactory::new_confidential_guest_factory().create_guest(Box::new(disk_config)); + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); _test_power_button(&guest); } #[test] fn 
test_virtio_vsock() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = - GuestFactory::new_confidential_guest_factory().create_guest(Box::new(disk_config)); + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); _test_virtio_vsock(&guest, false); } #[test] fn test_multi_cpu() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = - GuestFactory::new_confidential_guest_factory().create_guest(Box::new(disk_config)); + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); _test_multi_cpu(&guest); } #[test] fn test_cpu_affinity() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = GuestFactory::new_confidential_guest_factory() - .create_guest(Box::new(disk_config)) - .with_cpu(2); + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME).with_cpu(2); _test_cpu_affinity(&guest); } #[test] fn test_virtio_queue_affinity() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = GuestFactory::new_confidential_guest_factory() - .create_guest(Box::new(disk_config)) - .with_cpu(4); + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME).with_cpu(4); _test_virtio_queue_affinity(&guest); } #[test] fn test_pci_msi() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = - GuestFactory::new_confidential_guest_factory().create_guest(Box::new(disk_config)); + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); _test_pci_msi(&guest); } #[test] fn test_virtio_net_ctrl_queue() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = - GuestFactory::new_confidential_guest_factory().create_guest(Box::new(disk_config)); + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); _test_virtio_net_ctrl_queue(&guest); } @@ -124,17 +102,13 @@ mod common_cvm { // supported by Linux // IGVM file used by Sev-Snp Guest now support up to 8 segments, so we can use 8 segments for testing. 
let num_pci_segments: u16 = 8; - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = - GuestFactory::new_confidential_guest_factory().create_guest(Box::new(disk_config)); + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); _test_pci_multiple_segments(&guest, num_pci_segments, 5); } #[test] fn test_direct_kernel_boot() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = - GuestFactory::new_confidential_guest_factory().create_guest(Box::new(disk_config)); + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); _test_direct_kernel_boot(&guest); } From 0fc0f2bd0e84c38d6b61bd77c09b85710353f32d Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 16:36:41 -0700 Subject: [PATCH 271/742] tests: add basic_regular_guest macro and with_kernel Introduce a basic_regular_guest! macro in integration.rs to reduce boilerplate when creating regular (non-CVM) guest instances. Also add a with_kernel() builder method to Guest in test_infra, allowing fluent configuration of the kernel path. Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 11 +++++++++-- test_infra/src/lib.rs | 5 +++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 3af45c35c3..04f61f59ee 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -29,6 +29,13 @@ mod common; use common::tests_wrappers::*; use common::utils::*; +macro_rules! 
basic_regular_guest { + ($image_name:expr) => {{ + let disk_config = UbuntuDiskConfig::new($image_name.to_string()); + GuestFactory::new_regular_guest_factory().create_guest(Box::new(disk_config)) + }}; +} + mod common_parallel { use std::io::{self, SeekFrom}; use std::process::Command; @@ -41,8 +48,8 @@ mod common_parallel { #[cfg(target_arch = "x86_64")] fn test_focal_hypervisor_fw() { let disk_config = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); - let mut guest = Guest::new(Box::new(disk_config)); - guest.kernel_path = Some(fw_path(FwType::RustHypervisorFirmware)); + let guest = basic_regular_guest!(FOCAL_IMAGE_NAME) + .with_kernel(fw_path(FwType::RustHypervisorFirmware)); _test_simple_launch(&guest); } diff --git a/test_infra/src/lib.rs b/test_infra/src/lib.rs index b6d7641f6b..a299b2c494 100644 --- a/test_infra/src/lib.rs +++ b/test_infra/src/lib.rs @@ -1018,6 +1018,11 @@ impl Guest { self } + pub fn with_kernel(mut self, kernel: String) -> Self { + self.kernel_path = Some(kernel); + self + } + pub fn default_net_string(&self) -> String { format!( "tap=,mac={},ip={},mask=255.255.255.128", From 6aad6e40c8df3e94ee7d99dd83dec7510cf4fb31 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 16:59:34 -0700 Subject: [PATCH 272/742] tests: use basic_regular_guest macro in OVMF test Refactor test_focal_ovmf to use the basic_regular_guest! macro with the with_kernel() builder instead of manually constructing UbuntuDiskConfig and setting kernel_path. This also removes leftover unused disk_config in test_focal_hypervisor_fw. 
Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 04f61f59ee..972b979fb9 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -47,7 +47,6 @@ mod common_parallel { #[test] #[cfg(target_arch = "x86_64")] fn test_focal_hypervisor_fw() { - let disk_config = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); let guest = basic_regular_guest!(FOCAL_IMAGE_NAME) .with_kernel(fw_path(FwType::RustHypervisorFirmware)); _test_simple_launch(&guest); @@ -56,9 +55,7 @@ mod common_parallel { #[test] #[cfg(target_arch = "x86_64")] fn test_focal_ovmf() { - let disk_config = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); - let mut guest = Guest::new(Box::new(disk_config)); - guest.kernel_path = Some(fw_path(FwType::Ovmf)); + let guest = basic_regular_guest!(FOCAL_IMAGE_NAME).with_kernel(fw_path(FwType::Ovmf)); _test_simple_launch(&guest); } From ac8d4c295352507210e49bb031f7b40621694d53 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 00:39:00 -0700 Subject: [PATCH 273/742] tests: extract _test_virtio_block_dynamic_vhdx_expand Move the virtio block dynamic VHDX expand test logic from integration.rs into a shared _test_virtio_block_dynamic_ vhdx_expand() function in tests_wrappers.rs. The original test in integration.rs now calls this shared function, enabling reuse by CVM tests. 
Signed-off-by: Muminul Islam --- .../tests/common/tests_wrappers.rs | 82 ++++++++++++++++++ cloud-hypervisor/tests/integration.rs | 86 +------------------ 2 files changed, 84 insertions(+), 84 deletions(-) diff --git a/cloud-hypervisor/tests/common/tests_wrappers.rs b/cloud-hypervisor/tests/common/tests_wrappers.rs index afe54ed5ef..fa14ac6b93 100644 --- a/cloud-hypervisor/tests/common/tests_wrappers.rs +++ b/cloud-hypervisor/tests/common/tests_wrappers.rs @@ -2057,3 +2057,85 @@ pub(crate) fn _test_virtio_block( ); } } + +pub fn _test_virtio_block_dynamic_vhdx_expand(guest: &Guest) { + const VIRTUAL_DISK_SIZE: u64 = 100 << 20; + const EMPTY_VHDX_FILE_SIZE: u64 = 8 << 20; + const FULL_VHDX_FILE_SIZE: u64 = 112 << 20; + const DYNAMIC_VHDX_NAME: &str = "dynamic.vhdx"; + + let vhdx_pathbuf = guest.tmp_dir.as_path().join(DYNAMIC_VHDX_NAME); + let vhdx_path = vhdx_pathbuf.to_str().unwrap(); + + // Generate a 100 MiB dynamic VHDX file + std::process::Command::new("qemu-img") + .arg("create") + .args(["-f", "vhdx"]) + .arg(vhdx_path) + .arg(VIRTUAL_DISK_SIZE.to_string()) + .output() + .expect("Expect generating dynamic VHDX image"); + + // Check if the size matches with empty VHDx file size + assert_eq!(vhdx_image_size(vhdx_path), EMPTY_VHDX_FILE_SIZE); + + let mut cloud_child = GuestCommand::new(guest) + .default_cpus() + .default_memory() + .default_kernel_cmdline() + .args([ + "--disk", + format!( + "path={}", + guest.disk_config.disk(DiskType::OperatingSystem).unwrap() + ) + .as_str(), + format!( + "path={}", + guest.disk_config.disk(DiskType::CloudInit).unwrap() + ) + .as_str(), + format!("path={vhdx_path}").as_str(), + ]) + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + // Check both if /dev/vdc exists and if the block size is 100 MiB. 
+ assert_eq!( + guest + .ssh_command("lsblk | grep vdc | grep -c 100M") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 1 + ); + + // Write 100 MB of data to the VHDx disk + guest + .ssh_command("sudo dd if=/dev/urandom of=/dev/vdc bs=1M count=100") + .unwrap(); + }); + + // Check if the size matches with expected expanded VHDx file size + assert_eq!(vhdx_image_size(vhdx_path), FULL_VHDX_FILE_SIZE); + + kill_child(&mut cloud_child); + let output = cloud_child.wait_with_output().unwrap(); + + handle_child_output(r, &output); + + disk_check_consistency(vhdx_path, None); +} + +fn vhdx_image_size(disk_name: &str) -> u64 { + std::fs::File::open(disk_name) + .unwrap() + .seek(SeekFrom::End(0)) + .unwrap() +} diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 972b979fb9..b17ecfa7e4 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -1601,90 +1601,8 @@ mod common_parallel { #[test] fn test_virtio_block_dynamic_vhdx_expand() { - const VIRTUAL_DISK_SIZE: u64 = 100 << 20; - const EMPTY_VHDX_FILE_SIZE: u64 = 8 << 20; - const FULL_VHDX_FILE_SIZE: u64 = 112 << 20; - const DYNAMIC_VHDX_NAME: &str = "dynamic.vhdx"; - - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - let kernel_path = direct_kernel_boot_path(); - - let vhdx_pathbuf = guest.tmp_dir.as_path().join(DYNAMIC_VHDX_NAME); - let vhdx_path = vhdx_pathbuf.to_str().unwrap(); - - // Generate a 100 MiB dynamic VHDX file - std::process::Command::new("qemu-img") - .arg("create") - .args(["-f", "vhdx"]) - .arg(vhdx_path) - .arg(VIRTUAL_DISK_SIZE.to_string()) - .output() - .expect("Expect generating dynamic VHDX image"); - - // Check if the size matches with empty VHDx file size - assert_eq!(vhdx_image_size(vhdx_path), EMPTY_VHDX_FILE_SIZE); - - let mut cloud_child = GuestCommand::new(&guest) - .default_cpus() - .default_memory() - .args(["--kernel", 
kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .args([ - "--disk", - format!( - "path={}", - guest.disk_config.disk(DiskType::OperatingSystem).unwrap() - ) - .as_str(), - format!( - "path={}", - guest.disk_config.disk(DiskType::CloudInit).unwrap() - ) - .as_str(), - format!("path={vhdx_path}").as_str(), - ]) - .default_net() - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - // Check both if /dev/vdc exists and if the block size is 100 MiB. - assert_eq!( - guest - .ssh_command("lsblk | grep vdc | grep -c 100M") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 1 - ); - - // Write 100 MB of data to the VHDx disk - guest - .ssh_command("sudo dd if=/dev/urandom of=/dev/vdc bs=1M count=100") - .unwrap(); - }); - - // Check if the size matches with expected expanded VHDx file size - assert_eq!(vhdx_image_size(vhdx_path), FULL_VHDX_FILE_SIZE); - - kill_child(&mut cloud_child); - let output = cloud_child.wait_with_output().unwrap(); - - handle_child_output(r, &output); - - disk_check_consistency(vhdx_path, None); - } - - fn vhdx_image_size(disk_name: &str) -> u64 { - std::fs::File::open(disk_name) - .unwrap() - .seek(SeekFrom::End(0)) - .unwrap() + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); + _test_virtio_block_dynamic_vhdx_expand(&guest); } #[test] From 2929cc9d1615d4bd354b9519bb90f1c213ffe751 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 00:50:38 -0700 Subject: [PATCH 274/742] tests: add CVM test for virtio_block_dynamic_vhdx_expand Add a CVM variant of the virtio block dynamic VHDX expand test in integration_cvm.rs. This test creates a confidential guest and invokes the shared _test_virtio_block_dynamic_vhdx_expand() wrapper to validate VHDX expansion under CVM. 
Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration_cvm.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cloud-hypervisor/tests/integration_cvm.rs b/cloud-hypervisor/tests/integration_cvm.rs index e2b7b08add..6e5b0bd32c 100644 --- a/cloud-hypervisor/tests/integration_cvm.rs +++ b/cloud-hypervisor/tests/integration_cvm.rs @@ -194,4 +194,10 @@ mod common_cvm { ); _test_virtio_block(&guest, false, false, true, true, ImageType::Qcow2); } + + #[test] + fn test_virtio_block_dynamic_vhdx_expand() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); + _test_virtio_block_dynamic_vhdx_expand(&guest); + } } From 56696ed901d6f0045eb9e5fd6c91a635562235a4 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 00:55:52 -0700 Subject: [PATCH 275/742] tests: extract _test_split_irqchip to tests_wrappers Move the split IRQ chip test logic from integration.rs into a shared _test_split_irqchip() function in tests_wrappers.rs. The original test now delegates to this shared function, enabling reuse by CVM tests. 
Signed-off-by: Muminul Islam --- .../tests/common/tests_wrappers.rs | 41 ++++++++++++++++++ cloud-hypervisor/tests/integration.rs | 43 +------------------ 2 files changed, 43 insertions(+), 41 deletions(-) diff --git a/cloud-hypervisor/tests/common/tests_wrappers.rs b/cloud-hypervisor/tests/common/tests_wrappers.rs index fa14ac6b93..2663a6499d 100644 --- a/cloud-hypervisor/tests/common/tests_wrappers.rs +++ b/cloud-hypervisor/tests/common/tests_wrappers.rs @@ -2139,3 +2139,44 @@ fn vhdx_image_size(disk_name: &str) -> u64 { .seek(SeekFrom::End(0)) .unwrap() } + +#[cfg(target_arch = "x86_64")] +pub fn _test_split_irqchip(guest: &Guest) { + let mut child = GuestCommand::new(guest) + .default_cpus() + .default_memory() + .default_kernel_cmdline() + .default_disks() + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + assert_eq!( + guest + .ssh_command("grep -c IO-APIC.*timer /proc/interrupts || true") + .unwrap() + .trim() + .parse::() + .unwrap_or(1), + 0 + ); + assert_eq!( + guest + .ssh_command("grep -c IO-APIC.*cascade /proc/interrupts || true") + .unwrap() + .trim() + .parse::() + .unwrap_or(1), + 0 + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index b17ecfa7e4..1b6562fab5 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -1731,47 +1731,8 @@ mod common_parallel { #[test] #[cfg(target_arch = "x86_64")] fn test_split_irqchip() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - - let mut child = GuestCommand::new(&guest) - .default_cpus() - .default_memory() - .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - 
.default_disks() - .default_net() - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - assert_eq!( - guest - .ssh_command("grep -c IO-APIC.*timer /proc/interrupts || true") - .unwrap() - .trim() - .parse::() - .unwrap_or(1), - 0 - ); - assert_eq!( - guest - .ssh_command("grep -c IO-APIC.*cascade /proc/interrupts || true") - .unwrap() - .trim() - .parse::() - .unwrap_or(1), - 0 - ); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); + _test_split_irqchip(&guest); } #[test] From b65a3a58f379b9effe5c4943b3c25013920d52ba Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 00:56:42 -0700 Subject: [PATCH 276/742] tests: add CVM test for split_irqchip Add a CVM variant of the split IRQ chip test in integration_cvm.rs. This test creates a confidential guest and invokes the shared _test_split_irqchip() wrapper to validate split IRQ chip behavior under CVM. 
Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration_cvm.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cloud-hypervisor/tests/integration_cvm.rs b/cloud-hypervisor/tests/integration_cvm.rs index 6e5b0bd32c..1df8d0f115 100644 --- a/cloud-hypervisor/tests/integration_cvm.rs +++ b/cloud-hypervisor/tests/integration_cvm.rs @@ -200,4 +200,10 @@ mod common_cvm { let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); _test_virtio_block_dynamic_vhdx_expand(&guest); } + + #[test] + fn test_split_irqchip() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); + _test_split_irqchip(&guest); + } } From 5f4ad4bb1e78d5311b692dccb153259b9c6be395 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 16:08:51 -0700 Subject: [PATCH 277/742] tests: extract _test_dmi_serial_number to tests_wrappers Move the DMI serial number test logic from integration.rs into a shared _test_dmi_serial_number() function in tests_wrappers.rs. The original test now delegates to this shared function, enabling reuse by CVM tests. 
Signed-off-by: Muminul Islam --- .../tests/common/tests_wrappers.rs | 30 +++++++++++++++++ cloud-hypervisor/tests/integration.rs | 32 ++----------------- 2 files changed, 32 insertions(+), 30 deletions(-) diff --git a/cloud-hypervisor/tests/common/tests_wrappers.rs b/cloud-hypervisor/tests/common/tests_wrappers.rs index 2663a6499d..053cdf4414 100644 --- a/cloud-hypervisor/tests/common/tests_wrappers.rs +++ b/cloud-hypervisor/tests/common/tests_wrappers.rs @@ -2180,3 +2180,33 @@ pub fn _test_split_irqchip(guest: &Guest) { handle_child_output(r, &output); } + +#[cfg(target_arch = "x86_64")] +pub(crate) fn _test_dmi_serial_number(guest: &Guest) { + let mut child = GuestCommand::new(guest) + .default_cpus() + .default_memory() + .default_kernel_cmdline_with_platform(Some("serial_number=a=b;c=d")) + .default_disks() + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + assert_eq!( + guest + .ssh_command("sudo cat /sys/class/dmi/id/product_serial") + .unwrap() + .trim(), + "a=b;c=d" + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 1b6562fab5..9cb2dc209d 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -1738,37 +1738,9 @@ mod common_parallel { #[test] #[cfg(target_arch = "x86_64")] fn test_dmi_serial_number() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - - let mut child = GuestCommand::new(&guest) - .default_cpus() - .default_memory() - .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .args(["--platform", "serial_number=a=b;c=d"]) - .default_disks() - .default_net() - .capture_output() - .spawn() - .unwrap(); - - let r 
= std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - assert_eq!( - guest - .ssh_command("sudo cat /sys/class/dmi/id/product_serial") - .unwrap() - .trim(), - "a=b;c=d" - ); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); - handle_child_output(r, &output); + _test_dmi_serial_number(&guest); } #[test] From b1a0ab4d1f78daac35c62e03aed3c87b0c0eb92a Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 17:34:09 -0700 Subject: [PATCH 278/742] tests: use basic_regular_guest macro in test_multi_cpu Replace manual UbuntuDiskConfig and GuestFactory guest creation with the basic_regular_guest! macro in test_multi_cpu for consistency and reduced boilerplate. Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 9cb2dc209d..b09d72cf9c 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -61,8 +61,7 @@ mod common_parallel { #[test] fn test_multi_cpu() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = GuestFactory::new_regular_guest_factory().create_guest(Box::new(disk_config)); + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); _test_multi_cpu(&guest); } From bcdbc875be71fced1ccbddc84a6bdcac634cb096 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 17:34:21 -0700 Subject: [PATCH 279/742] tests: use basic_regular_guest macro in test_cpu_affinity Replace manual UbuntuDiskConfig and GuestFactory guest creation with the basic_regular_guest! macro in test_cpu_affinity, chaining with_cpu(2) for the required CPU count configuration. 
Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index b09d72cf9c..5927597393 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -164,10 +164,7 @@ mod common_parallel { #[test] fn test_cpu_affinity() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = GuestFactory::new_regular_guest_factory() - .create_guest(Box::new(disk_config)) - .with_cpu(2); + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME).with_cpu(2); _test_cpu_affinity(&guest); } From 43fb142afb135378745da114598463486e585ef3 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 17:34:34 -0700 Subject: [PATCH 280/742] tests: use basic_regular_guest macro in test_virtio_queue_affinity Replace manual UbuntuDiskConfig and GuestFactory guest creation with the basic_regular_guest! macro in test_virtio_queue_affinity, chaining with_cpu(4) for the required CPU count configuration. 
Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 5927597393..3d8c3a8917 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -170,10 +170,7 @@ mod common_parallel { #[test] fn test_virtio_queue_affinity() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = GuestFactory::new_regular_guest_factory() - .create_guest(Box::new(disk_config)) - .with_cpu(4); + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME).with_cpu(4); _test_virtio_queue_affinity(&guest); } From 43731149c874109366db5fa31b332bbc8c13b599 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 17:34:46 -0700 Subject: [PATCH 281/742] tests: use basic_regular_guest macro in test_power_button Replace manual UbuntuDiskConfig and GuestFactory guest creation with the basic_regular_guest! macro in test_power_button for consistency and reduced boilerplate. 
Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 3d8c3a8917..82f9c54530 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -243,8 +243,7 @@ mod common_parallel { #[test] fn test_power_button() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = GuestFactory::new_regular_guest_factory().create_guest(Box::new(disk_config)); + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); _test_power_button(&guest); } From 4c4cf940ec2e5041698377ae02b14a5b6b85cf2a Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 17:35:01 -0700 Subject: [PATCH 282/742] tests: use basic_regular_guest macro in test_pci_msi Replace manual UbuntuDiskConfig and GuestFactory guest creation with the basic_regular_guest! macro in test_pci_msi for consistency and reduced boilerplate. 
Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 82f9c54530..2f0a85f137 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -402,8 +402,7 @@ mod common_parallel { #[test] fn test_pci_msi() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = GuestFactory::new_regular_guest_factory().create_guest(Box::new(disk_config)); + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); _test_pci_msi(&guest); } From 54c91b02814addfe1992104b72de94143e6a0c87 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 17:35:13 -0700 Subject: [PATCH 283/742] tests: use basic_regular_guest macro in test_virtio_net_ctrl_queue Replace manual UbuntuDiskConfig and GuestFactory guest creation with the basic_regular_guest! macro in test_virtio_net_ctrl_queue for consistency and reduced boilerplate. 
Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 2f0a85f137..ca7cc88378 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -408,8 +408,7 @@ mod common_parallel { #[test] fn test_virtio_net_ctrl_queue() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = GuestFactory::new_regular_guest_factory().create_guest(Box::new(disk_config)); + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); _test_virtio_net_ctrl_queue(&guest); } From 84951127b7509a47d1e05aaa1d0280e1093e1002 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 17:35:29 -0700 Subject: [PATCH 284/742] tests: use basic_regular_guest macro in test_pci_multiple_segments Replace manual UbuntuDiskConfig and GuestFactory guest creation with the basic_regular_guest! macro in test_pci_multiple_segments for consistency and reduced boilerplate. 
Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index ca7cc88378..2eac9f0697 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -414,8 +414,7 @@ mod common_parallel { #[test] fn test_pci_multiple_segments() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = GuestFactory::new_regular_guest_factory().create_guest(Box::new(disk_config)); + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); _test_pci_multiple_segments(&guest, MAX_NUM_PCI_SEGMENTS, 15u16); } From a6724ba79f1b13736c3666ffae8267bfa0df4e81 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 17:35:41 -0700 Subject: [PATCH 285/742] tests: use basic_regular_guest macro in test_direct_kernel_boot Replace manual UbuntuDiskConfig and GuestFactory guest creation with the basic_regular_guest! macro in test_direct_kernel_boot for consistency and reduced boilerplate. 
Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 2eac9f0697..acbbdbb27b 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -499,8 +499,7 @@ mod common_parallel { #[test] fn test_direct_kernel_boot() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = GuestFactory::new_regular_guest_factory().create_guest(Box::new(disk_config)); + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); _test_direct_kernel_boot(&guest); } From b9c9e7265d1ad19ab72ef1269d353094084a539d Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 17:35:53 -0700 Subject: [PATCH 286/742] tests: use basic_regular_guest macro in test_virtio_vsock Replace manual UbuntuDiskConfig and GuestFactory guest creation with the basic_regular_guest! macro in test_virtio_vsock for consistency and reduced boilerplate. 
Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index acbbdbb27b..b5c8f04773 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -2724,8 +2724,7 @@ mod common_parallel { #[test] fn test_virtio_vsock() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = GuestFactory::new_regular_guest_factory().create_guest(Box::new(disk_config)); + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); _test_virtio_vsock(&guest, false); } From aead6dd59f96f5bdfdf2ef829ec91cc4f502484d Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 17:36:12 -0700 Subject: [PATCH 287/742] tests: use basic_regular_guest macro in test_virtio_vsock_hotplug Replace manual UbuntuDiskConfig and GuestFactory guest creation with the basic_regular_guest! macro in test_virtio_vsock_hotplug. The aarch64 variant chains with_kernel_path for EDK2 firmware support. 
Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index b5c8f04773..d56c25f783 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -2730,13 +2730,11 @@ mod common_parallel { #[test] fn test_virtio_vsock_hotplug() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); #[cfg(target_arch = "x86_64")] - let guest = GuestFactory::new_regular_guest_factory().create_guest(Box::new(disk_config)); + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); #[cfg(target_arch = "aarch64")] - let guest = GuestFactory::new_regular_guest_factory() - .create_guest(Box::new(disk_config)) - .with_kernel_path(edk2_path().to_str().unwrap()); + let guest = + basic_regular_guest!(JAMMY_IMAGE_NAME).with_kernel_path(edk2_path().to_str().unwrap()); _test_virtio_vsock(&guest, true); } From e5b589ee542f8e5db8de7dd583a1bee7c0136ca8 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 17:37:19 -0700 Subject: [PATCH 288/742] tests: use basic_regular_guest macro in test_api_http_shutdown Replace manual UbuntuDiskConfig and GuestFactory guest creation with the basic_regular_guest! macro in test_api_http_shutdown, chaining with_cpu(4) for the required CPU count configuration. 
Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index d56c25f783..2c68b82f17 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -2740,10 +2740,7 @@ mod common_parallel { #[test] fn test_api_http_shutdown() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = GuestFactory::new_regular_guest_factory() - .create_guest(Box::new(disk_config)) - .with_cpu(4); + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME).with_cpu(4); let target_api = TargetApi::new_http_api(&guest.tmp_dir); _test_api_shutdown(&target_api, &guest); From 1b3cd88b25ba365a97e7b8b4f30283cc3a28fffd Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 17:37:32 -0700 Subject: [PATCH 289/742] tests: use basic_regular_guest macro in test_api_http_delete Replace manual UbuntuDiskConfig and GuestFactory guest creation with the basic_regular_guest! macro in test_api_http_delete, chaining with_cpu(4) for the required CPU count configuration. 
Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 2c68b82f17..97009f470d 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -2748,10 +2748,7 @@ mod common_parallel { #[test] fn test_api_http_delete() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = GuestFactory::new_regular_guest_factory() - .create_guest(Box::new(disk_config)) - .with_cpu(4); + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME).with_cpu(4); let target_api = TargetApi::new_http_api(&guest.tmp_dir); _test_api_delete(&target_api, &guest); From 698084f3d7d521d1f39a00ae48b401f4fdf63d3b Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 17:37:45 -0700 Subject: [PATCH 290/742] tests: use basic_regular_guest macro in test_api_http_pause_resume Replace manual UbuntuDiskConfig and GuestFactory guest creation with the basic_regular_guest! macro in test_api_http_pause_resume, chaining with_cpu(4) for the required CPU count configuration. 
Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 97009f470d..4d5f56270a 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -2756,10 +2756,7 @@ mod common_parallel { #[test] fn test_api_http_pause_resume() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = GuestFactory::new_regular_guest_factory() - .create_guest(Box::new(disk_config)) - .with_cpu(4); + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME).with_cpu(4); let target_api = TargetApi::new_http_api(&guest.tmp_dir); _test_api_pause_resume(&target_api, &guest); From 6f9794ded04d4f1d02a395b1f8da902b0bdcb2e7 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 17:37:58 -0700 Subject: [PATCH 291/742] tests: use basic_regular_guest macro in test_api_http_create_boot Replace manual UbuntuDiskConfig and GuestFactory guest creation with the basic_regular_guest! macro in test_api_http_create_boot, chaining with_cpu(4) for the required CPU count configuration. 
Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 4d5f56270a..e0279c1f0e 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -2764,10 +2764,7 @@ mod common_parallel { #[test] fn test_api_http_create_boot() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = GuestFactory::new_regular_guest_factory() - .create_guest(Box::new(disk_config)) - .with_cpu(4); + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME).with_cpu(4); let target_api = TargetApi::new_http_api(&guest.tmp_dir); _test_api_create_boot(&target_api, &guest); From dcb4d99a50760c26894ffac12ec23f28ce51ccd7 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 17:49:45 -0700 Subject: [PATCH 292/742] tests: extract _test_dmi_uuid to tests_wrappers Move the DMI UUID test logic from integration.rs into a shared _test_dmi_uuid() function in tests_wrappers.rs. The original test in integration.rs now uses the basic_regular_guest macro and delegates to this shared function, enabling reuse by CVM tests. 
Signed-off-by: Muminul Islam --- .../tests/common/tests_wrappers.rs | 29 ++++++++++++++++ cloud-hypervisor/tests/integration.rs | 33 ++----------------- 2 files changed, 31 insertions(+), 31 deletions(-) diff --git a/cloud-hypervisor/tests/common/tests_wrappers.rs b/cloud-hypervisor/tests/common/tests_wrappers.rs index 053cdf4414..972c115f0e 100644 --- a/cloud-hypervisor/tests/common/tests_wrappers.rs +++ b/cloud-hypervisor/tests/common/tests_wrappers.rs @@ -2210,3 +2210,32 @@ pub(crate) fn _test_dmi_serial_number(guest: &Guest) { handle_child_output(r, &output); } + +pub(crate) fn _test_dmi_uuid(guest: &Guest) { + let mut child = GuestCommand::new(guest) + .default_cpus() + .default_memory() + .default_kernel_cmdline_with_platform(Some("uuid=1e8aa28a-435d-4027-87f4-40dceff1fa0a")) + .default_disks() + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + assert_eq!( + guest + .ssh_command("sudo cat /sys/class/dmi/id/product_uuid") + .unwrap() + .trim(), + "1e8aa28a-435d-4027-87f4-40dceff1fa0a" + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} \ No newline at end of file diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index e0279c1f0e..fc78b62fe7 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -1734,37 +1734,8 @@ mod common_parallel { #[test] #[cfg(target_arch = "x86_64")] fn test_dmi_uuid() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - - let mut child = GuestCommand::new(&guest) - .default_cpus() - .default_memory() - .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .args(["--platform", "uuid=1e8aa28a-435d-4027-87f4-40dceff1fa0a"]) - .default_disks() - .default_net() 
- .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - assert_eq!( - guest - .ssh_command("sudo cat /sys/class/dmi/id/product_uuid") - .unwrap() - .trim(), - "1e8aa28a-435d-4027-87f4-40dceff1fa0a" - ); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); + _test_dmi_uuid(&guest); } #[test] From 43641cf7146edfbf84852e55aad4a58e2ee17492 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 17:51:16 -0700 Subject: [PATCH 293/742] tests: add CVM test for dmi_uuid Add a CVM variant of the DMI UUID test in integration_cvm.rs. This test creates a confidential guest using the basic_cvm_guest! macro and invokes the shared _test_dmi_uuid() wrapper to validate DMI UUID behavior under CVM. Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/common/tests_wrappers.rs | 2 +- cloud-hypervisor/tests/integration_cvm.rs | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/cloud-hypervisor/tests/common/tests_wrappers.rs b/cloud-hypervisor/tests/common/tests_wrappers.rs index 972c115f0e..d421e9c136 100644 --- a/cloud-hypervisor/tests/common/tests_wrappers.rs +++ b/cloud-hypervisor/tests/common/tests_wrappers.rs @@ -2238,4 +2238,4 @@ pub(crate) fn _test_dmi_uuid(guest: &Guest) { let output = child.wait_with_output().unwrap(); handle_child_output(r, &output); -} \ No newline at end of file +} diff --git a/cloud-hypervisor/tests/integration_cvm.rs b/cloud-hypervisor/tests/integration_cvm.rs index 1df8d0f115..e599541f6c 100644 --- a/cloud-hypervisor/tests/integration_cvm.rs +++ b/cloud-hypervisor/tests/integration_cvm.rs @@ -206,4 +206,10 @@ mod common_cvm { let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); _test_split_irqchip(&guest); } + + #[test] + fn test_dmi_uuid() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); + _test_dmi_uuid(&guest); + } } From 
c3976ccbdae0fe86c109dbd47f0cd3c590f99161 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 17:56:23 -0700 Subject: [PATCH 294/742] tests: extract _test_dmi_oem_strings to tests_wrappers Move the DMI OEM strings test logic from integration.rs into a shared _test_dmi_oem_strings() function in tests_wrappers.rs. The original test in integration.rs now uses the basic_regular_guest! macro and delegates to this shared function, enabling reuse by CVM tests. Signed-off-by: Muminul Islam --- .../tests/common/tests_wrappers.rs | 50 +++++++++++++++++ cloud-hypervisor/tests/integration.rs | 54 +------------------ 2 files changed, 52 insertions(+), 52 deletions(-) diff --git a/cloud-hypervisor/tests/common/tests_wrappers.rs b/cloud-hypervisor/tests/common/tests_wrappers.rs index d421e9c136..7c1c19e6be 100644 --- a/cloud-hypervisor/tests/common/tests_wrappers.rs +++ b/cloud-hypervisor/tests/common/tests_wrappers.rs @@ -2239,3 +2239,53 @@ pub(crate) fn _test_dmi_uuid(guest: &Guest) { handle_child_output(r, &output); } + +pub(crate) fn _test_dmi_oem_strings(guest: &Guest) { + let s1 = "io.systemd.credential:xx=yy"; + let s2 = "This is a test string"; + + let oem_strings = format!("oem_strings=[{s1},{s2}]"); + + let mut child = GuestCommand::new(guest) + .default_cpus() + .default_memory() + .default_kernel_cmdline_with_platform(Some(&oem_strings)) + .default_disks() + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + assert_eq!( + guest + .ssh_command("sudo dmidecode --oem-string count") + .unwrap() + .trim(), + "2" + ); + + assert_eq!( + guest + .ssh_command("sudo dmidecode --oem-string 1") + .unwrap() + .trim(), + s1 + ); + + assert_eq!( + guest + .ssh_command("sudo dmidecode --oem-string 2") + .unwrap() + .trim(), + s2 + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} diff --git 
a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index fc78b62fe7..25b043e8f2 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -1741,58 +1741,8 @@ mod common_parallel { #[test] #[cfg(target_arch = "x86_64")] fn test_dmi_oem_strings() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - - let s1 = "io.systemd.credential:xx=yy"; - let s2 = "This is a test string"; - - let oem_strings = format!("oem_strings=[{s1},{s2}]"); - - let mut child = GuestCommand::new(&guest) - .default_cpus() - .default_memory() - .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .args(["--platform", &oem_strings]) - .default_disks() - .default_net() - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - assert_eq!( - guest - .ssh_command("sudo dmidecode --oem-string count") - .unwrap() - .trim(), - "2" - ); - - assert_eq!( - guest - .ssh_command("sudo dmidecode --oem-string 1") - .unwrap() - .trim(), - s1 - ); - - assert_eq!( - guest - .ssh_command("sudo dmidecode --oem-string 2") - .unwrap() - .trim(), - s2 - ); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); + _test_dmi_oem_strings(&guest); } #[test] From b7a7366ef90ac2b40f3dc425af8a5a838ced4cf4 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 17:56:38 -0700 Subject: [PATCH 295/742] tests: add CVM test for dmi_oem_strings Add a CVM variant of the DMI OEM strings test in integration_cvm.rs. This test creates a confidential guest using the basic_cvm_guest! macro and invokes the shared _test_dmi_oem_strings() wrapper to validate DMI OEM strings behavior under CVM. 
Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration_cvm.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cloud-hypervisor/tests/integration_cvm.rs b/cloud-hypervisor/tests/integration_cvm.rs index e599541f6c..fd47e21f01 100644 --- a/cloud-hypervisor/tests/integration_cvm.rs +++ b/cloud-hypervisor/tests/integration_cvm.rs @@ -212,4 +212,10 @@ mod common_cvm { let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); _test_dmi_uuid(&guest); } + + #[test] + fn test_dmi_oem_strings() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); + _test_dmi_oem_strings(&guest); + } } From 8d9e8ea6f822810b0bd25b440fe763e8628b08ef Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 18:20:42 -0700 Subject: [PATCH 296/742] tests: extract _test_serial_off to tests_wrappers Extract test logic from test_serial_off into a shared _test_serial_off wrapper function in tests_wrappers.rs. Update the parent test case to use the basic_regular_guest macro. The wrapper uses default_kernel_cmdline() for kernel/cmdline setup. 
Signed-off-by: Muminul Islam --- .../tests/common/tests_wrappers.rs | 33 +++++++++++++++++ cloud-hypervisor/tests/integration.rs | 35 ++----------------- 2 files changed, 35 insertions(+), 33 deletions(-) diff --git a/cloud-hypervisor/tests/common/tests_wrappers.rs b/cloud-hypervisor/tests/common/tests_wrappers.rs index 7c1c19e6be..f983de5519 100644 --- a/cloud-hypervisor/tests/common/tests_wrappers.rs +++ b/cloud-hypervisor/tests/common/tests_wrappers.rs @@ -2289,3 +2289,36 @@ pub(crate) fn _test_dmi_oem_strings(guest: &Guest) { handle_child_output(r, &output); } + +pub(crate) fn _test_serial_off(guest: &Guest) { + let mut child = GuestCommand::new(guest) + .default_cpus() + .default_memory() + .default_kernel_cmdline() + .default_disks() + .default_net() + .args(["--serial", "off"]) + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + // Test that there is no ttyS0 + assert_eq!( + guest + .ssh_command(GREP_SERIAL_IRQ_CMD) + .unwrap() + .trim() + .parse::() + .unwrap_or(1), + 0 + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 25b043e8f2..990847de35 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -1936,39 +1936,8 @@ mod common_parallel { #[test] fn test_serial_off() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - let mut child = GuestCommand::new(&guest) - .default_cpus() - .default_memory() - .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .default_net() - .args(["--serial", "off"]) - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - // 
Test that there is no ttyS0 - assert_eq!( - guest - .ssh_command(GREP_SERIAL_IRQ_CMD) - .unwrap() - .trim() - .parse::() - .unwrap_or(1), - 0 - ); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); + _test_serial_off(&guest); } #[test] From 77266daf86d31b2a3753e5fe1c0cbbb0021cba1b Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 18:21:54 -0700 Subject: [PATCH 297/742] tests: extract _test_multiple_network_interfaces wrapper Extract test logic from test_multiple_network_interfaces into a shared _test_multiple_network_interfaces wrapper function in tests_wrappers.rs. Update the parent test case to use the basic_regular_guest macro. The wrapper uses default_kernel_cmdline() for kernel/cmdline setup. Signed-off-by: Muminul Islam --- .../tests/common/tests_wrappers.rs | 40 +++++++++++++++++ cloud-hypervisor/tests/integration.rs | 45 +------------------ 2 files changed, 42 insertions(+), 43 deletions(-) diff --git a/cloud-hypervisor/tests/common/tests_wrappers.rs b/cloud-hypervisor/tests/common/tests_wrappers.rs index f983de5519..c1a21986bb 100644 --- a/cloud-hypervisor/tests/common/tests_wrappers.rs +++ b/cloud-hypervisor/tests/common/tests_wrappers.rs @@ -2322,3 +2322,43 @@ pub(crate) fn _test_serial_off(guest: &Guest) { handle_child_output(r, &output); } + +pub(crate) fn _test_multiple_network_interfaces(guest: &Guest) { + let mut child = GuestCommand::new(guest) + .default_cpus() + .default_memory() + .default_kernel_cmdline() + .default_disks() + .args([ + "--net", + guest.default_net_string().as_str(), + "tap=,mac=8a:6b:6f:5a:de:ac,ip=192.168.3.1,mask=255.255.255.128", + "tap=mytap1,mac=fe:1f:9e:e1:60:f2,ip=192.168.4.1,mask=255.255.255.128", + ]) + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + let tap_count = exec_host_command_output("ip link | grep 
-c mytap1"); + assert_eq!(String::from_utf8_lossy(&tap_count.stdout).trim(), "1"); + + // 3 network interfaces + default localhost ==> 4 interfaces + assert_eq!( + guest + .ssh_command("ip -o link | wc -l") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 4 + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 990847de35..3677fbe2ad 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -1852,49 +1852,8 @@ mod common_parallel { #[test] fn test_multiple_network_interfaces() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - - let kernel_path = direct_kernel_boot_path(); - - let mut child = GuestCommand::new(&guest) - .default_cpus() - .default_memory() - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .args([ - "--net", - guest.default_net_string().as_str(), - "tap=,mac=8a:6b:6f:5a:de:ac,ip=192.168.3.1,mask=255.255.255.128", - "tap=mytap1,mac=fe:1f:9e:e1:60:f2,ip=192.168.4.1,mask=255.255.255.128", - ]) - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - let tap_count = exec_host_command_output("ip link | grep -c mytap1"); - assert_eq!(String::from_utf8_lossy(&tap_count.stdout).trim(), "1"); - - // 3 network interfaces + default localhost ==> 4 interfaces - assert_eq!( - guest - .ssh_command("ip -o link | wc -l") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 4 - ); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); + _test_multiple_network_interfaces(&guest); } #[test] From 
dfd07ae6d0f48c1f856a60d642594543fdc4c4a9 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 18:22:59 -0700 Subject: [PATCH 298/742] tests: extract _test_virtio_console to tests_wrappers Extract test logic from test_virtio_console into a shared _test_virtio_console wrapper function in tests_wrappers.rs. Update the parent test case to use the basic_regular_guest macro. The wrapper uses default_kernel_cmdline() for kernel/cmdline setup. Signed-off-by: Muminul Islam --- .../tests/common/tests_wrappers.rs | 39 ++++++++++++++++ cloud-hypervisor/tests/integration.rs | 44 +------------------ 2 files changed, 41 insertions(+), 42 deletions(-) diff --git a/cloud-hypervisor/tests/common/tests_wrappers.rs b/cloud-hypervisor/tests/common/tests_wrappers.rs index c1a21986bb..874ad795c3 100644 --- a/cloud-hypervisor/tests/common/tests_wrappers.rs +++ b/cloud-hypervisor/tests/common/tests_wrappers.rs @@ -2362,3 +2362,42 @@ pub(crate) fn _test_multiple_network_interfaces(guest: &Guest) { handle_child_output(r, &output); } + +pub(crate) fn _test_virtio_console(guest: &Guest) { + let mut child = GuestCommand::new(guest) + .default_cpus() + .default_memory() + .default_kernel_cmdline() + .default_disks() + .default_net() + .args(["--console", "tty"]) + .args(["--serial", "null"]) + .capture_output() + .spawn() + .unwrap(); + + let text = String::from("On a branch floating down river a cricket, singing."); + let cmd = format!("echo {text} | sudo tee /dev/hvc0"); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + assert!( + guest + .does_device_vendor_pair_match("0x1043", "0x1af4") + .unwrap_or_default() + ); + + guest.ssh_command(&cmd).unwrap(); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + handle_child_output(r, &output); + + let r = std::panic::catch_unwind(|| { + assert!(String::from_utf8_lossy(&output.stdout).contains(&text)); + }); + + handle_child_output(r, &output); +} diff --git 
a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 3677fbe2ad..55f081d12f 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -2197,48 +2197,8 @@ mod common_parallel { #[test] fn test_virtio_console() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - - let kernel_path = direct_kernel_boot_path(); - - let mut child = GuestCommand::new(&guest) - .default_cpus() - .default_memory() - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .default_net() - .args(["--console", "tty"]) - .args(["--serial", "null"]) - .capture_output() - .spawn() - .unwrap(); - - let text = String::from("On a branch floating down river a cricket, singing."); - let cmd = format!("echo {text} | sudo tee /dev/hvc0"); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - assert!( - guest - .does_device_vendor_pair_match("0x1043", "0x1af4") - .unwrap_or_default() - ); - - guest.ssh_command(&cmd).unwrap(); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - handle_child_output(r, &output); - - let r = std::panic::catch_unwind(|| { - assert!(String::from_utf8_lossy(&output.stdout).contains(&text)); - }); - - handle_child_output(r, &output); + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); + _test_virtio_console(&guest); } #[test] From fb02146e2cbc99767e9e41f437fbfe5c4062494b Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 18:24:50 -0700 Subject: [PATCH 299/742] tests: extract _test_console_file to tests_wrappers Extract test logic from test_console_file into a shared _test_console_file wrapper function in tests_wrappers.rs. Update the parent test case to use the basic_regular_guest macro. The wrapper uses default_kernel_cmdline() for kernel/cmdline setup. 
Signed-off-by: Muminul Islam --- .../tests/common/tests_wrappers.rs | 48 ++++++++++++++++++- cloud-hypervisor/tests/integration.rs | 48 +------------------ 2 files changed, 49 insertions(+), 47 deletions(-) diff --git a/cloud-hypervisor/tests/common/tests_wrappers.rs b/cloud-hypervisor/tests/common/tests_wrappers.rs index 874ad795c3..f268ed75dd 100644 --- a/cloud-hypervisor/tests/common/tests_wrappers.rs +++ b/cloud-hypervisor/tests/common/tests_wrappers.rs @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 use std::ffi::CStr; use std::fs::{self, OpenOptions}; -use std::io::{Seek, SeekFrom, Write}; +use std::io::{Read, Seek, SeekFrom, Write}; use std::path::{Path, PathBuf}; use std::string::String; use std::sync::mpsc; @@ -14,6 +14,7 @@ use net_util::MacAddr; use test_infra::*; use vmm_sys_util::tempdir::TempDir; use vmm_sys_util::tempfile::TempFile; +use wait_timeout::ChildExt; use crate::common::utils::{TargetApi, *}; @@ -2401,3 +2402,48 @@ pub(crate) fn _test_virtio_console(guest: &Guest) { handle_child_output(r, &output); } + +pub(crate) fn _test_console_file(guest: &Guest) { + let console_path = guest.tmp_dir.as_path().join("console-output"); + let mut child = GuestCommand::new(guest) + .default_cpus() + .default_memory() + .default_kernel_cmdline() + .default_disks() + .default_net() + .args([ + "--console", + format!("file={}", console_path.to_str().unwrap()).as_str(), + ]) + .capture_output() + .spawn() + .unwrap(); + + guest.wait_vm_boot().unwrap(); + + guest.ssh_command("sudo shutdown -h now").unwrap(); + + let _ = child.wait_timeout(std::time::Duration::from_secs(20)); + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + let r = std::panic::catch_unwind(|| { + // Check that the cloud-hypervisor binary actually terminated + assert!(output.status.success()); + + // Do this check after shutdown of the VM as an easy way to ensure + // all writes are flushed to disk + let mut f = std::fs::File::open(console_path).unwrap(); + let 
mut buf = String::new(); + f.read_to_string(&mut buf).unwrap(); + + if !buf.contains(CONSOLE_TEST_STRING) { + eprintln!( + "\n\n==== Console file output ====\n\n{buf}\n\n==== End console file output ====" + ); + } + assert!(buf.contains(CONSOLE_TEST_STRING)); + }); + + handle_child_output(r, &output); +} diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 55f081d12f..4e63ad9320 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -2203,52 +2203,8 @@ mod common_parallel { #[test] fn test_console_file() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - - let console_path = guest.tmp_dir.as_path().join("console-output"); - let mut child = GuestCommand::new(&guest) - .default_cpus() - .default_memory() - .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .default_net() - .args([ - "--console", - format!("file={}", console_path.to_str().unwrap()).as_str(), - ]) - .capture_output() - .spawn() - .unwrap(); - - guest.wait_vm_boot().unwrap(); - - guest.ssh_command("sudo shutdown -h now").unwrap(); - - let _ = child.wait_timeout(std::time::Duration::from_secs(20)); - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - let r = std::panic::catch_unwind(|| { - // Check that the cloud-hypervisor binary actually terminated - assert!(output.status.success()); - - // Do this check after shutdown of the VM as an easy way to ensure - // all writes are flushed to disk - let mut f = std::fs::File::open(console_path).unwrap(); - let mut buf = String::new(); - f.read_to_string(&mut buf).unwrap(); - - if !buf.contains(CONSOLE_TEST_STRING) { - eprintln!( - "\n\n==== Console file output ====\n\n{buf}\n\n==== End console file output ====" - ); - } - assert!(buf.contains(CONSOLE_TEST_STRING)); - }); - - 
handle_child_output(r, &output); + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); + _test_console_file(&guest); } #[test] From bff6e40eaaca6de2b495a776140ad8411af38420 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 18:25:41 -0700 Subject: [PATCH 300/742] tests: extract _test_direct_kernel_boot_noacpi wrapper Extract test logic from test_direct_kernel_boot_noacpi into a shared _test_direct_kernel_boot_noacpi wrapper function in tests_wrappers.rs. The kernel cmdline modification (acpi=off) is kept in the parent test case. The wrapper uses default_kernel_cmdline() for kernel/cmdline setup. Signed-off-by: Muminul Islam --- .../tests/common/tests_wrappers.rs | 24 ++++++++++++++ cloud-hypervisor/tests/integration.rs | 33 ++----------------- 2 files changed, 27 insertions(+), 30 deletions(-) diff --git a/cloud-hypervisor/tests/common/tests_wrappers.rs b/cloud-hypervisor/tests/common/tests_wrappers.rs index f268ed75dd..dbabe1503b 100644 --- a/cloud-hypervisor/tests/common/tests_wrappers.rs +++ b/cloud-hypervisor/tests/common/tests_wrappers.rs @@ -2447,3 +2447,27 @@ pub(crate) fn _test_console_file(guest: &Guest) { handle_child_output(r, &output); } + +pub(crate) fn _test_direct_kernel_boot_noacpi(guest: &Guest) { + let mut child = GuestCommand::new(guest) + .default_cpus() + .default_memory() + .default_kernel_cmdline() + .default_disks() + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + assert_eq!(guest.get_cpu_count().unwrap_or_default(), 1); + assert!(guest.get_total_memory().unwrap_or_default() > 480_000); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 4e63ad9320..ec8db5f8ee 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -2455,36 
+2455,9 @@ mod common_parallel { #[test] fn test_direct_kernel_boot_noacpi() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - - let kernel_path = direct_kernel_boot_path(); - - let mut child = GuestCommand::new(&guest) - .default_cpus() - .default_memory() - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args([ - "--cmdline", - format!("{DIRECT_KERNEL_BOOT_CMDLINE} acpi=off").as_str(), - ]) - .default_disks() - .default_net() - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - assert_eq!(guest.get_cpu_count().unwrap_or_default(), 1); - assert!(guest.get_total_memory().unwrap_or_default() > 480_000); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); + let mut guest = basic_regular_guest!(JAMMY_IMAGE_NAME); + guest.kernel_cmdline = Some(format!("{DIRECT_KERNEL_BOOT_CMDLINE} acpi=off")); + _test_direct_kernel_boot_noacpi(&guest); } #[test] From b478fa0ffd0a48f52e37ff405905443e721a5a97 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 18:27:35 -0700 Subject: [PATCH 301/742] tests: extract _test_pci_bar_reprogramming wrapper Extract test logic from test_pci_bar_reprogramming into a shared _test_pci_bar_reprogramming wrapper function in tests_wrappers.rs. The kernel_path selection for aarch64 (edk2) is kept in the parent test case. The wrapper uses default_kernel_cmdline() for kernel/cmdline setup. 
Signed-off-by: Muminul Islam --- .../tests/common/tests_wrappers.rs | 81 ++++++++++++++++ cloud-hypervisor/tests/integration.rs | 95 +------------------ 2 files changed, 86 insertions(+), 90 deletions(-) diff --git a/cloud-hypervisor/tests/common/tests_wrappers.rs b/cloud-hypervisor/tests/common/tests_wrappers.rs index dbabe1503b..5088192b11 100644 --- a/cloud-hypervisor/tests/common/tests_wrappers.rs +++ b/cloud-hypervisor/tests/common/tests_wrappers.rs @@ -2471,3 +2471,84 @@ pub(crate) fn _test_direct_kernel_boot_noacpi(guest: &Guest) { handle_child_output(r, &output); } + +pub(crate) fn _test_pci_bar_reprogramming(guest: &Guest) { + let mut child = GuestCommand::new(guest) + .default_cpus() + .default_memory() + .default_kernel_cmdline() + .default_disks() + .args([ + "--net", + guest.default_net_string().as_str(), + "tap=,mac=8a:6b:6f:5a:de:ac,ip=192.168.3.1,mask=255.255.255.128", + ]) + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + // 2 network interfaces + default localhost ==> 3 interfaces + assert_eq!( + guest + .ssh_command("ip -o link | wc -l") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 3 + ); + + let init_bar_addr = guest + .ssh_command("sudo awk '{print $1; exit}' /sys/bus/pci/devices/0000:00:05.0/resource") + .unwrap(); + + // Remove the PCI device + guest + .ssh_command("echo 1 | sudo tee /sys/bus/pci/devices/0000:00:05.0/remove") + .unwrap(); + + // Only 1 network interface left + default localhost ==> 2 interfaces + assert_eq!( + guest + .ssh_command("ip -o link | wc -l") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 2 + ); + + // Remove the PCI device + guest + .ssh_command("echo 1 | sudo tee /sys/bus/pci/rescan") + .unwrap(); + + // Back to 2 network interface + default localhost ==> 3 interfaces + assert_eq!( + guest + .ssh_command("ip -o link | wc -l") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 3 + ); + + let new_bar_addr 
= guest + .ssh_command("sudo awk '{print $1; exit}' /sys/bus/pci/devices/0000:00:05.0/resource") + .unwrap(); + + // Let's compare the BAR addresses for our virtio-net device. + // They should be different as we expect the BAR reprogramming + // to have happened. + assert_ne!(init_bar_addr, new_bar_addr); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index ec8db5f8ee..746291f36f 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -2524,97 +2524,12 @@ mod common_parallel { // properly probed first, then removing it, and adding it again by doing a // rescan. fn test_pci_bar_reprogramming() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - - #[cfg(target_arch = "x86_64")] - let kernel_path = direct_kernel_boot_path(); #[cfg(target_arch = "aarch64")] - let kernel_path = edk2_path(); - - let mut child = GuestCommand::new(&guest) - .default_cpus() - .default_memory() - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .args([ - "--net", - guest.default_net_string().as_str(), - "tap=,mac=8a:6b:6f:5a:de:ac,ip=192.168.3.1,mask=255.255.255.128", - ]) - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - // 2 network interfaces + default localhost ==> 3 interfaces - assert_eq!( - guest - .ssh_command("ip -o link | wc -l") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 3 - ); - - let init_bar_addr = guest - .ssh_command( - "sudo awk '{print $1; exit}' /sys/bus/pci/devices/0000:00:05.0/resource", - ) - .unwrap(); - - // Remove the PCI device - guest - .ssh_command("echo 1 | sudo tee /sys/bus/pci/devices/0000:00:05.0/remove") - .unwrap(); - 
- // Only 1 network interface left + default localhost ==> 2 interfaces - assert_eq!( - guest - .ssh_command("ip -o link | wc -l") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 2 - ); - - // Remove the PCI device - guest - .ssh_command("echo 1 | sudo tee /sys/bus/pci/rescan") - .unwrap(); - - // Back to 2 network interface + default localhost ==> 3 interfaces - assert_eq!( - guest - .ssh_command("ip -o link | wc -l") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 3 - ); - - let new_bar_addr = guest - .ssh_command( - "sudo awk '{print $1; exit}' /sys/bus/pci/devices/0000:00:05.0/resource", - ) - .unwrap(); - - // Let's compare the BAR addresses for our virtio-net device. - // They should be different as we expect the BAR reprogramming - // to have happened. - assert_ne!(init_bar_addr, new_bar_addr); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); + let guest = + basic_regular_guest!(JAMMY_IMAGE_NAME).with_kernel_path(edk2_path().to_str().unwrap()); + #[cfg(target_arch = "x86_64")] + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); + _test_pci_bar_reprogramming(&guest); } #[test] From af7fabd2c665a0ff409af2b0bd1261cbf5bbfe72 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 18:29:16 -0700 Subject: [PATCH 302/742] tests: extract _test_memory_overhead to tests_wrappers Extract test logic from test_memory_overhead into a shared _test_memory_overhead wrapper function in tests_wrappers.rs. The custom memory size is set in the parent test case via with_memory(). The wrapper uses default_kernel_cmdline() and default_memory() for setup. 
Signed-off-by: Muminul Islam --- .../tests/common/tests_wrappers.rs | 25 ++++++++++++++ cloud-hypervisor/tests/integration.rs | 34 +++---------------- 2 files changed, 29 insertions(+), 30 deletions(-) diff --git a/cloud-hypervisor/tests/common/tests_wrappers.rs b/cloud-hypervisor/tests/common/tests_wrappers.rs index 5088192b11..965e644333 100644 --- a/cloud-hypervisor/tests/common/tests_wrappers.rs +++ b/cloud-hypervisor/tests/common/tests_wrappers.rs @@ -2552,3 +2552,28 @@ pub(crate) fn _test_pci_bar_reprogramming(guest: &Guest) { handle_child_output(r, &output); } + +pub(crate) fn _test_memory_overhead(guest: &Guest, guest_memory_size_kb: u32) { + let mut child = GuestCommand::new(guest) + .default_cpus() + .default_memory() + .default_kernel_cmdline() + .default_net() + .default_disks() + .capture_output() + .spawn() + .unwrap(); + + guest.wait_vm_boot().unwrap(); + + let r = std::panic::catch_unwind(|| { + let overhead = get_vmm_overhead(child.id(), guest_memory_size_kb); + eprintln!("Guest memory overhead: {overhead} vs {MAXIMUM_VMM_OVERHEAD_KB}"); + assert!(overhead <= MAXIMUM_VMM_OVERHEAD_KB); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 746291f36f..cb3616cc44 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -2852,36 +2852,10 @@ mod common_parallel { #[test] fn test_memory_overhead() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - - let kernel_path = direct_kernel_boot_path(); - - let guest_memory_size_kb = 512 * 1024; - - let mut child = GuestCommand::new(&guest) - .default_cpus() - .args(["--memory", format!("size={guest_memory_size_kb}K").as_str()]) - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - 
.default_net() - .default_disks() - .capture_output() - .spawn() - .unwrap(); - - guest.wait_vm_boot().unwrap(); - - let r = std::panic::catch_unwind(|| { - let overhead = get_vmm_overhead(child.id(), guest_memory_size_kb); - eprintln!("Guest memory overhead: {overhead} vs {MAXIMUM_VMM_OVERHEAD_KB}"); - assert!(overhead <= MAXIMUM_VMM_OVERHEAD_KB); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); + let guest_memory_size_kb: u32 = 512 * 1024; + let guest = + basic_regular_guest!(JAMMY_IMAGE_NAME).with_memory(&format!("{guest_memory_size_kb}K")); + _test_memory_overhead(&guest, guest_memory_size_kb); } #[test] From c1d929d0cf243b6d3f752e5ce92e1eaee9444ed9 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 18:30:30 -0700 Subject: [PATCH 303/742] tests: extract _test_landlock to tests_wrappers Extract test logic from test_landlock into a shared _test_landlock wrapper function in tests_wrappers.rs. Update the parent test case to use the basic_regular_guest macro. The wrapper uses default_kernel_cmdline() for kernel/cmdline setup. 
Signed-off-by: Muminul Islam --- .../tests/common/tests_wrappers.rs | 54 ++++++++++++++++ cloud-hypervisor/tests/integration.rs | 62 +------------------ 2 files changed, 56 insertions(+), 60 deletions(-) diff --git a/cloud-hypervisor/tests/common/tests_wrappers.rs b/cloud-hypervisor/tests/common/tests_wrappers.rs index 965e644333..ecfc9ac2e2 100644 --- a/cloud-hypervisor/tests/common/tests_wrappers.rs +++ b/cloud-hypervisor/tests/common/tests_wrappers.rs @@ -2577,3 +2577,57 @@ pub(crate) fn _test_memory_overhead(guest: &Guest, guest_memory_size_kb: u32) { handle_child_output(r, &output); } + +pub(crate) fn _test_landlock(guest: &Guest) { + let api_socket = temp_api_path(&guest.tmp_dir); + + let mut child = GuestCommand::new(guest) + .args(["--api-socket", &api_socket]) + .default_cpus() + .default_memory() + .default_kernel_cmdline() + .args(["--landlock"]) + .default_disks() + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + // Check /dev/vdc is not there + assert_eq!( + guest + .ssh_command("lsblk | grep -c vdc.*16M || true") + .unwrap() + .trim() + .parse::() + .unwrap_or(1), + 0 + ); + + // Now let's add the extra disk. + let mut blk_file_path = dirs::home_dir().unwrap(); + blk_file_path.push("workloads"); + blk_file_path.push("blk.img"); + // As the path to the hotplug disk is not pre-added, this remote + // command will fail. 
+ assert!(!remote_command( + &api_socket, + "add-disk", + Some( + format!( + "path={},id=test0,readonly=true", + blk_file_path.to_str().unwrap() + ) + .as_str() + ), + )); + }); + + let _ = child.kill(); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index cb3616cc44..18801ff1d3 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -2864,66 +2864,8 @@ mod common_parallel { // the path for the hotplug disk is not pre-added to Landlock rules, this // the test will result in a failure. fn test_landlock() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - - #[cfg(target_arch = "x86_64")] - let kernel_path = direct_kernel_boot_path(); - #[cfg(target_arch = "aarch64")] - let kernel_path = edk2_path(); - - let api_socket = temp_api_path(&guest.tmp_dir); - - let mut child = GuestCommand::new(&guest) - .args(["--api-socket", &api_socket]) - .default_cpus() - .default_memory() - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .args(["--landlock"]) - .default_disks() - .default_net() - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - // Check /dev/vdc is not there - assert_eq!( - guest - .ssh_command("lsblk | grep -c vdc.*16M || true") - .unwrap() - .trim() - .parse::() - .unwrap_or(1), - 0 - ); - - // Now let's add the extra disk. - let mut blk_file_path = dirs::home_dir().unwrap(); - blk_file_path.push("workloads"); - blk_file_path.push("blk.img"); - // As the path to the hotplug disk is not pre-added, this remote - // command will fail. 
- assert!(!remote_command( - &api_socket, - "add-disk", - Some( - format!( - "path={},id=test0,readonly=true", - blk_file_path.to_str().unwrap() - ) - .as_str() - ), - )); - }); - - let _ = child.kill(); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); + _test_landlock(&guest); } fn _test_disk_hotplug(landlock_enabled: bool) { From 8f40aed0ca1e5074269ba6c90c9465d2131d82cd Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 18:32:15 -0700 Subject: [PATCH 304/742] tests: extract _test_disk_hotplug to tests_wrappers Extract test logic from _test_disk_hotplug into a shared wrapper function in tests_wrappers.rs. Update both test_disk_hotplug and test_disk_hotplug_with_landlock to use the basic_regular_guest macro. The wrapper uses default_kernel_cmdline() for kernel/cmdline setup. Signed-off-by: Muminul Islam --- .../tests/common/tests_wrappers.rs | 172 ++++++++++++++++ cloud-hypervisor/tests/integration.rs | 188 +----------------- 2 files changed, 179 insertions(+), 181 deletions(-) diff --git a/cloud-hypervisor/tests/common/tests_wrappers.rs b/cloud-hypervisor/tests/common/tests_wrappers.rs index ecfc9ac2e2..f4379127c7 100644 --- a/cloud-hypervisor/tests/common/tests_wrappers.rs +++ b/cloud-hypervisor/tests/common/tests_wrappers.rs @@ -2631,3 +2631,175 @@ pub(crate) fn _test_landlock(guest: &Guest) { handle_child_output(r, &output); } + +pub(crate) fn _test_disk_hotplug(guest: &Guest, landlock_enabled: bool) { + let api_socket = temp_api_path(&guest.tmp_dir); + + let mut blk_file_path = dirs::home_dir().unwrap(); + blk_file_path.push("workloads"); + blk_file_path.push("blk.img"); + + let mut cmd = GuestCommand::new(guest); + if landlock_enabled { + cmd.args(["--landlock"]).args([ + "--landlock-rules", + format!("path={blk_file_path:?},access=rw").as_str(), + ]); + } + + cmd.args(["--api-socket", &api_socket]) + .default_cpus() + .default_memory() + 
.default_kernel_cmdline() + .default_disks() + .default_net() + .capture_output(); + + let mut child = cmd.spawn().unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + // Check /dev/vdc is not there + assert_eq!( + guest + .ssh_command("lsblk | grep -c vdc.*16M || true") + .unwrap() + .trim() + .parse::() + .unwrap_or(1), + 0 + ); + + // Now let's add the extra disk. + let (cmd_success, cmd_output) = remote_command_w_output( + &api_socket, + "add-disk", + Some( + format!( + "path={},id=test0,readonly=true", + blk_file_path.to_str().unwrap() + ) + .as_str(), + ), + ); + assert!(cmd_success); + assert!( + String::from_utf8_lossy(&cmd_output) + .contains("{\"id\":\"test0\",\"bdf\":\"0000:00:06.0\"}") + ); + + thread::sleep(std::time::Duration::new(10, 0)); + + // Check that /dev/vdc exists and the block size is 16M. + assert_eq!( + guest + .ssh_command("lsblk | grep vdc | grep -c 16M") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 1 + ); + // And check the block device can be read. + guest + .ssh_command("sudo dd if=/dev/vdc of=/dev/null bs=1M iflag=direct count=16") + .unwrap(); + + // Let's remove it the extra disk. + assert!(remote_command(&api_socket, "remove-device", Some("test0"))); + thread::sleep(std::time::Duration::new(5, 0)); + // And check /dev/vdc is not there + assert_eq!( + guest + .ssh_command("lsblk | grep -c vdc.*16M || true") + .unwrap() + .trim() + .parse::() + .unwrap_or(1), + 0 + ); + + // And add it back to validate unplug did work correctly. + let (cmd_success, cmd_output) = remote_command_w_output( + &api_socket, + "add-disk", + Some( + format!( + "path={},id=test0,readonly=true", + blk_file_path.to_str().unwrap() + ) + .as_str(), + ), + ); + assert!(cmd_success); + assert!( + String::from_utf8_lossy(&cmd_output) + .contains("{\"id\":\"test0\",\"bdf\":\"0000:00:06.0\"}") + ); + + thread::sleep(std::time::Duration::new(10, 0)); + + // Check that /dev/vdc exists and the block size is 16M. 
+ assert_eq!( + guest + .ssh_command("lsblk | grep vdc | grep -c 16M") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 1 + ); + // And check the block device can be read. + guest + .ssh_command("sudo dd if=/dev/vdc of=/dev/null bs=1M iflag=direct count=16") + .unwrap(); + + // Reboot the VM. + guest.reboot_linux(0); + + // Check still there after reboot + assert_eq!( + guest + .ssh_command("lsblk | grep vdc | grep -c 16M") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 1 + ); + + assert!(remote_command(&api_socket, "remove-device", Some("test0"))); + + thread::sleep(std::time::Duration::new(20, 0)); + + // Check device has gone away + assert_eq!( + guest + .ssh_command("lsblk | grep -c vdc.*16M || true") + .unwrap() + .trim() + .parse::() + .unwrap_or(1), + 0 + ); + + guest.reboot_linux(1); + + // Check device still absent + assert_eq!( + guest + .ssh_command("lsblk | grep -c vdc.*16M || true") + .unwrap() + .trim() + .parse::() + .unwrap_or(1), + 0 + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 18801ff1d3..4bc3a36bbb 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -2868,196 +2868,22 @@ mod common_parallel { _test_landlock(&guest); } - fn _test_disk_hotplug(landlock_enabled: bool) { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - + #[test] + fn test_disk_hotplug() { #[cfg(target_arch = "x86_64")] let kernel_path = direct_kernel_boot_path(); #[cfg(target_arch = "aarch64")] let kernel_path = edk2_path(); - - let api_socket = temp_api_path(&guest.tmp_dir); - - let mut blk_file_path = dirs::home_dir().unwrap(); - blk_file_path.push("workloads"); - blk_file_path.push("blk.img"); - - let mut cmd = GuestCommand::new(&guest); - if landlock_enabled 
{ - cmd.args(["--landlock"]).args([ - "--landlock-rules", - format!("path={blk_file_path:?},access=rw").as_str(), - ]); - } - - cmd.args(["--api-socket", &api_socket]) - .default_cpus() - .default_memory() - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .default_net() - .capture_output(); - - let mut child = cmd.spawn().unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - // Check /dev/vdc is not there - assert_eq!( - guest - .ssh_command("lsblk | grep -c vdc.*16M || true") - .unwrap() - .trim() - .parse::() - .unwrap_or(1), - 0 - ); - - // Now let's add the extra disk. - let (cmd_success, cmd_output) = remote_command_w_output( - &api_socket, - "add-disk", - Some( - format!( - "path={},id=test0,readonly=true", - blk_file_path.to_str().unwrap() - ) - .as_str(), - ), - ); - assert!(cmd_success); - assert!( - String::from_utf8_lossy(&cmd_output) - .contains("{\"id\":\"test0\",\"bdf\":\"0000:00:06.0\"}") - ); - - thread::sleep(std::time::Duration::new(10, 0)); - - // Check that /dev/vdc exists and the block size is 16M. - assert_eq!( - guest - .ssh_command("lsblk | grep vdc | grep -c 16M") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 1 - ); - // And check the block device can be read. - guest - .ssh_command("sudo dd if=/dev/vdc of=/dev/null bs=1M iflag=direct count=16") - .unwrap(); - - // Let's remove it the extra disk. - assert!(remote_command(&api_socket, "remove-device", Some("test0"))); - thread::sleep(std::time::Duration::new(5, 0)); - // And check /dev/vdc is not there - assert_eq!( - guest - .ssh_command("lsblk | grep -c vdc.*16M || true") - .unwrap() - .trim() - .parse::() - .unwrap_or(1), - 0 - ); - - // And add it back to validate unplug did work correctly. 
- let (cmd_success, cmd_output) = remote_command_w_output( - &api_socket, - "add-disk", - Some( - format!( - "path={},id=test0,readonly=true", - blk_file_path.to_str().unwrap() - ) - .as_str(), - ), - ); - assert!(cmd_success); - assert!( - String::from_utf8_lossy(&cmd_output) - .contains("{\"id\":\"test0\",\"bdf\":\"0000:00:06.0\"}") - ); - - thread::sleep(std::time::Duration::new(10, 0)); - - // Check that /dev/vdc exists and the block size is 16M. - assert_eq!( - guest - .ssh_command("lsblk | grep vdc | grep -c 16M") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 1 - ); - // And check the block device can be read. - guest - .ssh_command("sudo dd if=/dev/vdc of=/dev/null bs=1M iflag=direct count=16") - .unwrap(); - - // Reboot the VM. - guest.reboot_linux(0); - - // Check still there after reboot - assert_eq!( - guest - .ssh_command("lsblk | grep vdc | grep -c 16M") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 1 - ); - - assert!(remote_command(&api_socket, "remove-device", Some("test0"))); - - thread::sleep(std::time::Duration::new(20, 0)); - - // Check device has gone away - assert_eq!( - guest - .ssh_command("lsblk | grep -c vdc.*16M || true") - .unwrap() - .trim() - .parse::() - .unwrap_or(1), - 0 - ); - - guest.reboot_linux(1); - - // Check device still absent - assert_eq!( - guest - .ssh_command("lsblk | grep -c vdc.*16M || true") - .unwrap() - .trim() - .parse::() - .unwrap_or(1), - 0 - ); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); - } - - #[test] - fn test_disk_hotplug() { - _test_disk_hotplug(false); + let guest = + basic_regular_guest!(JAMMY_IMAGE_NAME).with_kernel_path(kernel_path.to_str().unwrap()); + _test_disk_hotplug(&guest, false); } #[test] #[cfg(target_arch = "x86_64")] fn test_disk_hotplug_with_landlock() { - _test_disk_hotplug(true); + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); + _test_disk_hotplug(&guest, true); } #[test] From 
ef81bb135898932dd355357e8df09e055b0d6cee Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 18:35:09 -0700 Subject: [PATCH 305/742] tests: extract _test_virtio_block_topology wrapper Extract test logic from test_virtio_block_topology into a shared _test_virtio_block_topology wrapper function in tests_wrappers.rs. The loop device creation and cleanup are kept in the parent test case. The wrapper uses default_kernel_cmdline() for kernel/cmdline setup. Signed-off-by: Muminul Islam --- .../tests/common/tests_wrappers.rs | 65 +++++++++++++++++ cloud-hypervisor/tests/integration.rs | 71 +------------------ 2 files changed, 67 insertions(+), 69 deletions(-) diff --git a/cloud-hypervisor/tests/common/tests_wrappers.rs b/cloud-hypervisor/tests/common/tests_wrappers.rs index f4379127c7..4c39043aa3 100644 --- a/cloud-hypervisor/tests/common/tests_wrappers.rs +++ b/cloud-hypervisor/tests/common/tests_wrappers.rs @@ -2803,3 +2803,68 @@ pub(crate) fn _test_disk_hotplug(guest: &Guest, landlock_enabled: bool) { handle_child_output(r, &output); } + +pub(crate) fn _test_virtio_block_topology(guest: &Guest, loop_dev: &str) { + let mut child = GuestCommand::new(guest) + .default_cpus() + .default_memory() + .default_kernel_cmdline() + .args([ + "--disk", + format!( + "path={}", + guest.disk_config.disk(DiskType::OperatingSystem).unwrap() + ) + .as_str(), + format!( + "path={}", + guest.disk_config.disk(DiskType::CloudInit).unwrap() + ) + .as_str(), + format!("path={loop_dev}").as_str(), + ]) + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + // MIN-IO column + assert_eq!( + guest + .ssh_command("lsblk -t| grep vdc | awk '{print $3}'") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 4096 + ); + // PHY-SEC column + assert_eq!( + guest + .ssh_command("lsblk -t| grep vdc | awk '{print $5}'") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 4096 + ); + // 
LOG-SEC column + assert_eq!( + guest + .ssh_command("lsblk -t| grep vdc | awk '{print $6}'") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 4096 + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 4bc3a36bbb..d67570ee22 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -3251,10 +3251,7 @@ mod common_parallel { #[test] fn test_virtio_block_topology() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - - let kernel_path = direct_kernel_boot_path(); + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); let test_disk_path = guest.tmp_dir.as_path().join("test.img"); let output = exec_host_command_output( @@ -3271,71 +3268,7 @@ mod common_parallel { } let loop_dev = create_loop_device(test_disk_path.to_str().unwrap(), 4096, 5); - - let mut child = GuestCommand::new(&guest) - .default_cpus() - .default_memory() - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .args([ - "--disk", - format!( - "path={}", - guest.disk_config.disk(DiskType::OperatingSystem).unwrap() - ) - .as_str(), - format!( - "path={}", - guest.disk_config.disk(DiskType::CloudInit).unwrap() - ) - .as_str(), - format!("path={}", &loop_dev).as_str(), - ]) - .default_net() - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - // MIN-IO column - assert_eq!( - guest - .ssh_command("lsblk -t| grep vdc | awk '{print $3}'") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 4096 - ); - // PHY-SEC column - assert_eq!( - guest - .ssh_command("lsblk -t| grep vdc | awk '{print $5}'") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 4096 - ); - // LOG-SEC column - 
assert_eq!( - guest - .ssh_command("lsblk -t| grep vdc | awk '{print $6}'") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 4096 - ); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); - + _test_virtio_block_topology(&guest, &loop_dev); Command::new("losetup") .args(["-d", &loop_dev]) .output() From 7284c02d31cb4cd3d8b8898e112da72b059cbcb4 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 18:37:48 -0700 Subject: [PATCH 306/742] tests: extract _test_net_hotplug to tests_wrappers Extract test logic from _test_net_hotplug into a shared wrapper function in tests_wrappers.rs. Update both test_net_hotplug and test_net_multi_segment_hotplug to use the basic_regular_guest macro. The wrapper uses default_kernel_cmdline() for kernel/cmdline setup. Signed-off-by: Muminul Islam --- .../tests/common/tests_wrappers.rs | 172 ++++++++++++++++ cloud-hypervisor/tests/integration.rs | 186 ++---------------- 2 files changed, 183 insertions(+), 175 deletions(-) diff --git a/cloud-hypervisor/tests/common/tests_wrappers.rs b/cloud-hypervisor/tests/common/tests_wrappers.rs index 4c39043aa3..6f002fee71 100644 --- a/cloud-hypervisor/tests/common/tests_wrappers.rs +++ b/cloud-hypervisor/tests/common/tests_wrappers.rs @@ -2868,3 +2868,175 @@ pub(crate) fn _test_virtio_block_topology(guest: &Guest, loop_dev: &str) { handle_child_output(r, &output); } + +pub(crate) fn _test_net_hotplug( + guest: &Guest, + max_num_pci_segments: u16, + pci_segment: Option, +) { + let api_socket = temp_api_path(&guest.tmp_dir); + + // Boot without network + let mut cmd = GuestCommand::new(guest); + + cmd.args(["--api-socket", &api_socket]) + .default_cpus() + .default_memory() + .default_net() + .default_disks() + .capture_output(); + + if pci_segment.is_some() { + cmd.default_kernel_cmdline_with_platform(Some(&format!( + "num_pci_segments={max_num_pci_segments}" + ))); + } else { + cmd.default_kernel_cmdline(); + 
} + + let mut child = cmd.spawn().unwrap(); + + guest.wait_vm_boot().unwrap(); + + let r = std::panic::catch_unwind(|| { + // Add network + let (cmd_success, cmd_output) = remote_command_w_output( + &api_socket, + "add-net", + Some( + format!( + "id=test0,tap=,mac={},ip={},mask=255.255.255.128{}", + guest.network.guest_mac1, + guest.network.host_ip1, + if let Some(pci_segment) = pci_segment { + format!(",pci_segment={pci_segment}") + } else { + String::new() + } + ) + .as_str(), + ), + ); + assert!(cmd_success); + + if let Some(pci_segment) = pci_segment { + assert!(String::from_utf8_lossy(&cmd_output).contains(&format!( + "{{\"id\":\"test0\",\"bdf\":\"{pci_segment:04x}:00:01.0\"}}" + ))); + } else { + assert!( + String::from_utf8_lossy(&cmd_output) + .contains("{\"id\":\"test0\",\"bdf\":\"0000:00:06.0\"}") + ); + } + + thread::sleep(std::time::Duration::new(5, 0)); + + // 2 network interfaces + default localhost ==> 3 interfaces + assert_eq!( + guest + .ssh_command("ip -o link | wc -l") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 3 + ); + + // Test the same using the added network interface's IP + assert_eq!( + ssh_command_ip( + "ip -o link | wc -l", + &guest.network.guest_ip1, + DEFAULT_SSH_RETRIES, + DEFAULT_SSH_TIMEOUT + ) + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 3 + ); + + // Remove network + assert!(remote_command(&api_socket, "remove-device", Some("test0"),)); + thread::sleep(std::time::Duration::new(5, 0)); + + // Add network + let (cmd_success, cmd_output) = remote_command_w_output( + &api_socket, + "add-net", + Some( + format!( + "id=test1,tap=,mac={},ip={},mask=255.255.255.128{}", + guest.network.guest_mac1, + guest.network.host_ip1, + if let Some(pci_segment) = pci_segment { + format!(",pci_segment={pci_segment}") + } else { + String::new() + } + ) + .as_str(), + ), + ); + assert!(cmd_success); + + if let Some(pci_segment) = pci_segment { + assert!(String::from_utf8_lossy(&cmd_output).contains(&format!( + 
"{{\"id\":\"test1\",\"bdf\":\"{pci_segment:04x}:00:01.0\"}}" + ))); + } else { + assert!( + String::from_utf8_lossy(&cmd_output) + .contains("{\"id\":\"test1\",\"bdf\":\"0000:00:06.0\"}") + ); + } + + thread::sleep(std::time::Duration::new(5, 0)); + + // 2 network interfaces + default localhost ==> 3 interfaces + assert_eq!( + guest + .ssh_command("ip -o link | wc -l") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 3 + ); + + guest.reboot_linux(0); + + // 2 network interfaces + default localhost ==> 3 interfaces + assert_eq!( + guest + .ssh_command("ip -o link | wc -l") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 3 + ); + + // Test the same using the added network interface's IP + assert_eq!( + ssh_command_ip( + "ip -o link | wc -l", + &guest.network.guest_ip1, + DEFAULT_SSH_RETRIES, + DEFAULT_SSH_TIMEOUT + ) + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 3 + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index d67570ee22..770313eeee 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -4936,189 +4936,25 @@ mod common_parallel { #[test] fn test_net_hotplug() { - _test_net_hotplug(None); + #[cfg(target_arch = "x86_64")] + let kernel_path = direct_kernel_boot_path(); + #[cfg(target_arch = "aarch64")] + let kernel_path = edk2_path(); + let guest = + basic_regular_guest!(JAMMY_IMAGE_NAME).with_kernel_path(kernel_path.to_str().unwrap()); + + _test_net_hotplug(&guest, MAX_NUM_PCI_SEGMENTS, None); } #[test] fn test_net_multi_segment_hotplug() { - _test_net_hotplug(Some(15)); - } - - fn _test_net_hotplug(pci_segment: Option) { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - #[cfg(target_arch = "x86_64")] let kernel_path = 
direct_kernel_boot_path(); #[cfg(target_arch = "aarch64")] let kernel_path = edk2_path(); - - let api_socket = temp_api_path(&guest.tmp_dir); - - // Boot without network - let mut cmd = GuestCommand::new(&guest); - - cmd.args(["--api-socket", &api_socket]) - .default_cpus() - .default_memory() - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_net() - .default_disks() - .capture_output(); - - if pci_segment.is_some() { - cmd.args([ - "--platform", - &format!("num_pci_segments={MAX_NUM_PCI_SEGMENTS}"), - ]); - } - - let mut child = cmd.spawn().unwrap(); - - guest.wait_vm_boot().unwrap(); - - let r = std::panic::catch_unwind(|| { - // Add network - let (cmd_success, cmd_output) = remote_command_w_output( - &api_socket, - "add-net", - Some( - format!( - "id=test0,tap=,mac={},ip={},mask=255.255.255.128{}", - guest.network.guest_mac1, - guest.network.host_ip1, - if let Some(pci_segment) = pci_segment { - format!(",pci_segment={pci_segment}") - } else { - String::new() - } - ) - .as_str(), - ), - ); - assert!(cmd_success); - - if let Some(pci_segment) = pci_segment { - assert!(String::from_utf8_lossy(&cmd_output).contains(&format!( - "{{\"id\":\"test0\",\"bdf\":\"{pci_segment:04x}:00:01.0\"}}" - ))); - } else { - assert!( - String::from_utf8_lossy(&cmd_output) - .contains("{\"id\":\"test0\",\"bdf\":\"0000:00:06.0\"}") - ); - } - - thread::sleep(std::time::Duration::new(5, 0)); - - // 2 network interfaces + default localhost ==> 3 interfaces - assert_eq!( - guest - .ssh_command("ip -o link | wc -l") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 3 - ); - - // Test the same using the added network interface's IP - assert_eq!( - ssh_command_ip( - "ip -o link | wc -l", - &guest.network.guest_ip1, - DEFAULT_SSH_RETRIES, - DEFAULT_SSH_TIMEOUT - ) - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 3 - ); - - // Remove network - assert!(remote_command(&api_socket, "remove-device", 
Some("test0"),)); - thread::sleep(std::time::Duration::new(5, 0)); - - // Add network - let (cmd_success, cmd_output) = remote_command_w_output( - &api_socket, - "add-net", - Some( - format!( - "id=test1,tap=,mac={},ip={},mask=255.255.255.128{}", - guest.network.guest_mac1, - guest.network.host_ip1, - if let Some(pci_segment) = pci_segment { - format!(",pci_segment={pci_segment}") - } else { - String::new() - } - ) - .as_str(), - ), - ); - assert!(cmd_success); - - if let Some(pci_segment) = pci_segment { - assert!(String::from_utf8_lossy(&cmd_output).contains(&format!( - "{{\"id\":\"test1\",\"bdf\":\"{pci_segment:04x}:00:01.0\"}}" - ))); - } else { - assert!( - String::from_utf8_lossy(&cmd_output) - .contains("{\"id\":\"test1\",\"bdf\":\"0000:00:06.0\"}") - ); - } - - thread::sleep(std::time::Duration::new(5, 0)); - - // 2 network interfaces + default localhost ==> 3 interfaces - assert_eq!( - guest - .ssh_command("ip -o link | wc -l") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 3 - ); - - guest.reboot_linux(0); - - // 2 network interfaces + default localhost ==> 3 interfaces - assert_eq!( - guest - .ssh_command("ip -o link | wc -l") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 3 - ); - - // Test the same using the added network interface's IP - assert_eq!( - ssh_command_ip( - "ip -o link | wc -l", - &guest.network.guest_ip1, - DEFAULT_SSH_RETRIES, - DEFAULT_SSH_TIMEOUT - ) - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 3 - ); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); + let guest = + basic_regular_guest!(JAMMY_IMAGE_NAME).with_kernel_path(kernel_path.to_str().unwrap()); + _test_net_hotplug(&guest, MAX_NUM_PCI_SEGMENTS, Some(15)); } #[test] From 4b77ac26d14c1ee432710415522be46b504681df Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 18:39:05 -0700 Subject: [PATCH 307/742] tests: extract _test_counters to tests_wrappers 
Extract test logic from test_counters into a shared _test_counters wrapper function in tests_wrappers.rs. Update the parent test case to use the basic_regular_guest macro. The wrapper uses default_kernel_cmdline() for kernel/cmdline setup. Signed-off-by: Muminul Islam --- .../tests/common/tests_wrappers.rs | 34 ++++++++++++++++++ cloud-hypervisor/tests/integration.rs | 36 ++----------------- 2 files changed, 36 insertions(+), 34 deletions(-) diff --git a/cloud-hypervisor/tests/common/tests_wrappers.rs b/cloud-hypervisor/tests/common/tests_wrappers.rs index 6f002fee71..81afdadc60 100644 --- a/cloud-hypervisor/tests/common/tests_wrappers.rs +++ b/cloud-hypervisor/tests/common/tests_wrappers.rs @@ -3040,3 +3040,37 @@ pub(crate) fn _test_net_hotplug( handle_child_output(r, &output); } + +pub(crate) fn _test_counters(guest: &Guest) { + let api_socket = temp_api_path(&guest.tmp_dir); + + let mut cmd = GuestCommand::new(guest); + cmd.default_cpus() + .default_memory() + .default_kernel_cmdline() + .default_disks() + .args(["--net", guest.default_net_string().as_str()]) + .args(["--api-socket", &api_socket]) + .capture_output(); + + let mut child = cmd.spawn().unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + let orig_counters = get_counters(&api_socket); + guest + .ssh_command("dd if=/dev/zero of=test count=8 bs=1M") + .unwrap(); + + let new_counters = get_counters(&api_socket); + + // Check that all the counters have increased + assert!(new_counters > orig_counters); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 770313eeee..47b8980298 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -5008,40 +5008,8 @@ mod common_parallel { #[test] fn test_counters() { - let disk_config = 
UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - let api_socket = temp_api_path(&guest.tmp_dir); - - let mut cmd = GuestCommand::new(&guest); - cmd.default_cpus() - .default_memory() - .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .args(["--net", guest.default_net_string().as_str()]) - .args(["--api-socket", &api_socket]) - .capture_output(); - - let mut child = cmd.spawn().unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - let orig_counters = get_counters(&api_socket); - guest - .ssh_command("dd if=/dev/zero of=test count=8 bs=1M") - .unwrap(); - - let new_counters = get_counters(&api_socket); - - // Check that all the counters have increased - assert!(new_counters > orig_counters); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); + _test_counters(&guest); } #[test] From 93a3fc5b913a1458463c989a15ab5d7faaaba56c Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 18:40:43 -0700 Subject: [PATCH 308/742] tests: extract _test_watchdog to tests_wrappers Extract test logic from test_watchdog into a shared _test_watchdog wrapper function in tests_wrappers.rs. Update the parent test case to use the basic_regular_guest macro with FOCAL_IMAGE_NAME. The wrapper uses default_kernel_cmdline() for kernel/cmdline setup. 
Signed-off-by: Muminul Islam --- .../tests/common/tests_wrappers.rs | 77 ++++++++++++++++++ cloud-hypervisor/tests/integration.rs | 81 +------------------ 2 files changed, 79 insertions(+), 79 deletions(-) diff --git a/cloud-hypervisor/tests/common/tests_wrappers.rs b/cloud-hypervisor/tests/common/tests_wrappers.rs index 81afdadc60..65d3be97ac 100644 --- a/cloud-hypervisor/tests/common/tests_wrappers.rs +++ b/cloud-hypervisor/tests/common/tests_wrappers.rs @@ -3074,3 +3074,80 @@ pub(crate) fn _test_counters(guest: &Guest) { handle_child_output(r, &output); } + +pub(crate) fn _test_watchdog(guest: &Guest) { + let api_socket = temp_api_path(&guest.tmp_dir); + let event_path = temp_event_monitor_path(&guest.tmp_dir); + + let mut cmd = GuestCommand::new(guest); + cmd.default_cpus() + .default_memory() + .default_kernel_cmdline() + .default_disks() + .args(["--net", guest.default_net_string().as_str()]) + .args(["--watchdog"]) + .args(["--api-socket", &api_socket]) + .args(["--event-monitor", format!("path={event_path}").as_str()]) + .capture_output(); + + let mut child = cmd.spawn().unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + let mut expected_reboot_count = 1; + + // Enable the watchdog with a 15s timeout + enable_guest_watchdog(guest, 15); + + assert_eq!(get_reboot_count(guest), expected_reboot_count); + assert_eq!( + guest + .ssh_command("sudo journalctl | grep -c -- \"Watchdog started\"") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 1 + ); + + // Allow some normal time to elapse to check we don't get spurious reboots + thread::sleep(std::time::Duration::new(40, 0)); + // Check no reboot + assert_eq!(get_reboot_count(guest), expected_reboot_count); + + // Trigger a panic (sync first). We need to do this inside a screen with a delay so the SSH command returns. 
+ guest.ssh_command("screen -dmS reboot sh -c \"sleep 5; echo s | tee /proc/sysrq-trigger; echo c | sudo tee /proc/sysrq-trigger\"").unwrap(); + // Allow some time for the watchdog to trigger (max 30s) and reboot to happen + guest.wait_vm_boot_custom_timeout(50).unwrap(); + // Check a reboot is triggered by the watchdog + expected_reboot_count += 1; + assert_eq!(get_reboot_count(guest), expected_reboot_count); + + #[cfg(target_arch = "x86_64")] + { + // Now pause the VM and remain offline for 30s + assert!(remote_command(&api_socket, "pause", None)); + let latest_events = [ + &MetaEvent { + event: "pausing".to_string(), + device_id: None, + }, + &MetaEvent { + event: "paused".to_string(), + device_id: None, + }, + ]; + assert!(check_latest_events_exact(&latest_events, &event_path)); + assert!(remote_command(&api_socket, "resume", None)); + + // Check no reboot + assert_eq!(get_reboot_count(guest), expected_reboot_count); + } + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 47b8980298..b91ba34966 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -5099,85 +5099,8 @@ mod common_parallel { #[test] fn test_watchdog() { - let disk_config = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - let api_socket = temp_api_path(&guest.tmp_dir); - - let kernel_path = direct_kernel_boot_path(); - let event_path = temp_event_monitor_path(&guest.tmp_dir); - - let mut cmd = GuestCommand::new(&guest); - cmd.default_cpus() - .default_memory() - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .args(["--net", guest.default_net_string().as_str()]) - .args(["--watchdog"]) - .args(["--api-socket", &api_socket]) - .args(["--event-monitor", 
format!("path={event_path}").as_str()]) - .capture_output(); - - let mut child = cmd.spawn().unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - let mut expected_reboot_count = 1; - - // Enable the watchdog with a 15s timeout - enable_guest_watchdog(&guest, 15); - - assert_eq!(get_reboot_count(&guest), expected_reboot_count); - assert_eq!( - guest - .ssh_command("sudo journalctl | grep -c -- \"Watchdog started\"") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 1 - ); - - // Allow some normal time to elapse to check we don't get spurious reboots - thread::sleep(std::time::Duration::new(40, 0)); - // Check no reboot - assert_eq!(get_reboot_count(&guest), expected_reboot_count); - - // Trigger a panic (sync first). We need to do this inside a screen with a delay so the SSH command returns. - guest.ssh_command("screen -dmS reboot sh -c \"sleep 5; echo s | tee /proc/sysrq-trigger; echo c | sudo tee /proc/sysrq-trigger\"").unwrap(); - // Allow some time for the watchdog to trigger (max 30s) and reboot to happen - guest.wait_vm_boot_custom_timeout(50).unwrap(); - // Check a reboot is triggered by the watchdog - expected_reboot_count += 1; - assert_eq!(get_reboot_count(&guest), expected_reboot_count); - - #[cfg(target_arch = "x86_64")] - { - // Now pause the VM and remain offline for 30s - assert!(remote_command(&api_socket, "pause", None)); - let latest_events = [ - &MetaEvent { - event: "pausing".to_string(), - device_id: None, - }, - &MetaEvent { - event: "paused".to_string(), - device_id: None, - }, - ]; - assert!(check_latest_events_exact(&latest_events, &event_path)); - assert!(remote_command(&api_socket, "resume", None)); - - // Check no reboot - assert_eq!(get_reboot_count(&guest), expected_reboot_count); - } - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); + let guest = basic_regular_guest!(FOCAL_IMAGE_NAME); + _test_watchdog(&guest); } 
#[test] From 17774ee5644464007113a3bf05ec72b003afb075 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 18:41:52 -0700 Subject: [PATCH 309/742] tests: extract _test_pvpanic to tests_wrappers Extract test logic from test_pvpanic into a shared _test_pvpanic wrapper function in tests_wrappers.rs. Update the parent test case to use the basic_regular_guest macro. The wrapper uses default_kernel_cmdline() for kernel/cmdline setup. Signed-off-by: Muminul Islam --- .../tests/common/tests_wrappers.rs | 42 +++++++++++++++++ cloud-hypervisor/tests/integration.rs | 46 +------------------ 2 files changed, 44 insertions(+), 44 deletions(-) diff --git a/cloud-hypervisor/tests/common/tests_wrappers.rs b/cloud-hypervisor/tests/common/tests_wrappers.rs index 65d3be97ac..bfc954b256 100644 --- a/cloud-hypervisor/tests/common/tests_wrappers.rs +++ b/cloud-hypervisor/tests/common/tests_wrappers.rs @@ -3151,3 +3151,45 @@ pub(crate) fn _test_watchdog(guest: &Guest) { handle_child_output(r, &output); } + +pub(crate) fn _test_pvpanic(guest: &Guest) { + let api_socket = temp_api_path(&guest.tmp_dir); + let event_path = temp_event_monitor_path(&guest.tmp_dir); + + let mut cmd = GuestCommand::new(guest); + cmd.default_cpus() + .default_memory() + .default_kernel_cmdline() + .default_disks() + .args(["--net", guest.default_net_string().as_str()]) + .args(["--pvpanic"]) + .args(["--api-socket", &api_socket]) + .args(["--event-monitor", format!("path={event_path}").as_str()]) + .capture_output(); + + let mut child = cmd.spawn().unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + // Trigger guest a panic + make_guest_panic(guest); + + // Wait a while for guest + thread::sleep(std::time::Duration::new(10, 0)); + + let expected_sequential_events = [&MetaEvent { + event: "panic".to_string(), + device_id: None, + }]; + assert!(check_latest_events_exact( + &expected_sequential_events, + &event_path + )); + }); + + kill_child(&mut child); + let 
output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index b91ba34966..e31c08a408 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -5105,50 +5105,8 @@ mod common_parallel { #[test] fn test_pvpanic() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - let api_socket = temp_api_path(&guest.tmp_dir); - let event_path = temp_event_monitor_path(&guest.tmp_dir); - - let kernel_path = direct_kernel_boot_path(); - - let mut cmd = GuestCommand::new(&guest); - cmd.default_cpus() - .default_memory() - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .args(["--net", guest.default_net_string().as_str()]) - .args(["--pvpanic"]) - .args(["--api-socket", &api_socket]) - .args(["--event-monitor", format!("path={event_path}").as_str()]) - .capture_output(); - - let mut child = cmd.spawn().unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - // Trigger guest a panic - make_guest_panic(&guest); - - // Wait a while for guest - thread::sleep(std::time::Duration::new(10, 0)); - - let expected_sequential_events = [&MetaEvent { - event: "panic".to_string(), - device_id: None, - }]; - assert!(check_latest_events_exact( - &expected_sequential_events, - &event_path - )); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); + _test_pvpanic(&guest); } #[test] From a3423a4483762dd9d7743fa19fae67e80cb71540 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 18:43:39 -0700 Subject: [PATCH 310/742] tests: extract _test_tap_from_fd to tests_wrappers Extract test logic from test_tap_from_fd into a shared 
_test_tap_from_fd wrapper function in tests_wrappers.rs. Update the parent test case to use the basic_regular_guest macro with with_cpu(2). The wrapper uses default_kernel_cmdline() for kernel/cmdline setup. Signed-off-by: Muminul Islam --- .../tests/common/tests_wrappers.rs | 70 ++++++++++++++++++ cloud-hypervisor/tests/integration.rs | 73 +------------------ 2 files changed, 72 insertions(+), 71 deletions(-) diff --git a/cloud-hypervisor/tests/common/tests_wrappers.rs b/cloud-hypervisor/tests/common/tests_wrappers.rs index bfc954b256..0b8ffd7fe6 100644 --- a/cloud-hypervisor/tests/common/tests_wrappers.rs +++ b/cloud-hypervisor/tests/common/tests_wrappers.rs @@ -4,6 +4,7 @@ use std::ffi::CStr; use std::fs::{self, OpenOptions}; use std::io::{Read, Seek, SeekFrom, Write}; +use std::os::unix::io::AsRawFd; use std::path::{Path, PathBuf}; use std::string::String; use std::sync::mpsc; @@ -3193,3 +3194,72 @@ pub(crate) fn _test_pvpanic(guest: &Guest) { handle_child_output(r, &output); } + +pub(crate) fn _test_tap_from_fd(guest: &Guest) { + // Create a TAP interface with multi-queue enabled + let num_queue_pairs: usize = 2; + + use std::str::FromStr; + let taps = net_util::open_tap( + Some("chtap0"), + Some(std::net::IpAddr::V4( + std::net::Ipv4Addr::from_str(&guest.network.host_ip0).unwrap(), + )), + None, + &mut None, + None, + num_queue_pairs, + Some(libc::O_RDWR | libc::O_NONBLOCK), + ) + .unwrap(); + + let mut child = GuestCommand::new(guest) + .default_cpus() + .default_memory() + .default_kernel_cmdline() + .default_disks() + .args([ + "--net", + &format!( + "fd=[{},{}],mac={},num_queues={}", + taps[0].as_raw_fd(), + taps[1].as_raw_fd(), + guest.network.guest_mac0, + num_queue_pairs * 2 + ), + ]) + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + assert_eq!( + guest + .ssh_command("ip -o link | wc -l") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 2 + ); + + 
guest.reboot_linux(0); + + assert_eq!( + guest + .ssh_command("ip -o link | wc -l") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 2 + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index e31c08a408..744986fee9 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -5111,77 +5111,8 @@ mod common_parallel { #[test] fn test_tap_from_fd() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - let kernel_path = direct_kernel_boot_path(); - - // Create a TAP interface with multi-queue enabled - let num_queue_pairs: usize = 2; - - use std::str::FromStr; - let taps = net_util::open_tap( - Some("chtap0"), - Some(std::net::IpAddr::V4( - std::net::Ipv4Addr::from_str(&guest.network.host_ip0).unwrap(), - )), - None, - &mut None, - None, - num_queue_pairs, - Some(libc::O_RDWR | libc::O_NONBLOCK), - ) - .unwrap(); - - let mut child = GuestCommand::new(&guest) - .args(["--cpus", &format!("boot={num_queue_pairs}")]) - .default_memory() - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .args([ - "--net", - &format!( - "fd=[{},{}],mac={},num_queues={}", - taps[0].as_raw_fd(), - taps[1].as_raw_fd(), - guest.network.guest_mac0, - num_queue_pairs * 2 - ), - ]) - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - assert_eq!( - guest - .ssh_command("ip -o link | wc -l") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 2 - ); - - guest.reboot_linux(0); - - assert_eq!( - guest - .ssh_command("ip -o link | wc -l") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 2 - ); - }); - - kill_child(&mut child); - let output = 
child.wait_with_output().unwrap(); - - handle_child_output(r, &output); + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME).with_cpu(2); + _test_tap_from_fd(&guest); } // By design, a guest VM won't be able to connect to the host From 205bb5ff5e040bab83fc0b9c0e2c21c9599a589f Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 18:46:58 -0700 Subject: [PATCH 311/742] tests: extract _test_macvtap to tests_wrappers Extract test logic from _test_macvtap into a shared wrapper function in tests_wrappers.rs. Update both test_macvtap and test_macvtap_hotplug to use the basic_regular_guest macro with with_cpu(2). The wrapper uses default_kernel_cmdline() for kernel/cmdline setup. Signed-off-by: Muminul Islam --- .../tests/common/tests_wrappers.rs | 145 +++++++++++++++- cloud-hypervisor/tests/integration.rs | 158 +----------------- 2 files changed, 148 insertions(+), 155 deletions(-) diff --git a/cloud-hypervisor/tests/common/tests_wrappers.rs b/cloud-hypervisor/tests/common/tests_wrappers.rs index 0b8ffd7fe6..64db4e7de8 100644 --- a/cloud-hypervisor/tests/common/tests_wrappers.rs +++ b/cloud-hypervisor/tests/common/tests_wrappers.rs @@ -1,7 +1,7 @@ // Copyright 2025 The Cloud Hypervisor Authors. All rights reserved. // // SPDX-License-Identifier: Apache-2.0 -use std::ffi::CStr; +use std::ffi::{CStr, CString}; use std::fs::{self, OpenOptions}; use std::io::{Read, Seek, SeekFrom, Write}; use std::os::unix::io::AsRawFd; @@ -3263,3 +3263,146 @@ pub(crate) fn _test_tap_from_fd(guest: &Guest) { handle_child_output(r, &output); } + +// test creates two macvtap interfaces in 'bridge' mode on the +// same physical net interface, one for the guest and one for +// the host. With additional setup on the IP address and the +// routing table, it enables the communications between the +// guest VM and the host machine. 
+// Details: https://wiki.libvirt.org/page/TroubleshootMacvtapHostFail +pub(crate) fn _test_macvtap( + guest: &Guest, + hotplug: bool, + guest_macvtap_name: &str, + host_macvtap_name: &str, +) { + let api_socket = temp_api_path(&guest.tmp_dir); + + let phy_net = "eth0"; + + // Create a macvtap interface for the guest VM to use + assert!( + exec_host_command_status(&format!( + "sudo ip link add link {phy_net} name {guest_macvtap_name} type macvtap mod bridge" + )) + .success() + ); + assert!( + exec_host_command_status(&format!( + "sudo ip link set {} address {} up", + guest_macvtap_name, guest.network.guest_mac0 + )) + .success() + ); + assert!(exec_host_command_status(&format!("sudo ip link show {guest_macvtap_name}")).success()); + + let tap_index = + fs::read_to_string(format!("/sys/class/net/{guest_macvtap_name}/ifindex")).unwrap(); + let tap_device = format!("/dev/tap{}", tap_index.trim()); + + assert!(exec_host_command_status(&format!("sudo chown $UID.$UID {tap_device}")).success()); + + let cstr_tap_device = CString::new(tap_device).unwrap(); + let tap_fd1 = unsafe { libc::open(cstr_tap_device.as_ptr(), libc::O_RDWR) }; + assert!(tap_fd1 > 0); + let tap_fd2 = unsafe { libc::open(cstr_tap_device.as_ptr(), libc::O_RDWR) }; + assert!(tap_fd2 > 0); + + // Create a macvtap on the same physical net interface for + // the host machine to use + assert!( + exec_host_command_status(&format!( + "sudo ip link add link {phy_net} name {host_macvtap_name} type macvtap mod bridge" + )) + .success() + ); + // Use default mask "255.255.255.0" + assert!( + exec_host_command_status(&format!( + "sudo ip address add {}/24 dev {}", + guest.network.host_ip0, host_macvtap_name + )) + .success() + ); + assert!( + exec_host_command_status(&format!("sudo ip link set dev {host_macvtap_name} up")).success() + ); + + let mut guest_command = GuestCommand::new(guest); + guest_command + .default_cpus() + .default_memory() + .default_kernel_cmdline() + .default_disks() + 
.args(["--api-socket", &api_socket]); + + let net_params = format!( + "fd=[{},{}],mac={},num_queues=4", + tap_fd1, tap_fd2, guest.network.guest_mac0 + ); + + if !hotplug { + guest_command.args(["--net", &net_params]); + } + + let mut child = guest_command.capture_output().spawn().unwrap(); + + if hotplug { + // Give some time to the VMM process to listen to the API + // socket. This is the only requirement to avoid the following + // call to ch-remote from failing. + thread::sleep(std::time::Duration::new(10, 0)); + // Hotplug the virtio-net device + let (cmd_success, cmd_output) = + remote_command_w_output(&api_socket, "add-net", Some(&net_params)); + assert!(cmd_success); + #[cfg(target_arch = "x86_64")] + assert!( + String::from_utf8_lossy(&cmd_output) + .contains("{\"id\":\"_net2\",\"bdf\":\"0000:00:05.0\"}") + ); + #[cfg(target_arch = "aarch64")] + assert!( + String::from_utf8_lossy(&cmd_output) + .contains("{\"id\":\"_net0\",\"bdf\":\"0000:00:05.0\"}") + ); + } + + // The functional connectivity provided by the virtio-net device + // gets tested through wait_vm_boot() as it expects to receive a + // HTTP request, and through the SSH command as well. 
+ let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + assert_eq!( + guest + .ssh_command("ip -o link | wc -l") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 2 + ); + + guest.reboot_linux(0); + + assert_eq!( + guest + .ssh_command("ip -o link | wc -l") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 2 + ); + }); + + kill_child(&mut child); + + exec_host_command_status(&format!("sudo ip link del {guest_macvtap_name}")); + exec_host_command_status(&format!("sudo ip link del {host_macvtap_name}")); + + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 744986fee9..b4042400c8 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -5115,168 +5115,18 @@ mod common_parallel { _test_tap_from_fd(&guest); } - // By design, a guest VM won't be able to connect to the host - // machine when using a macvtap network interface (while it can - // communicate externally). As a workaround, this integration - // test creates two macvtap interfaces in 'bridge' mode on the - // same physical net interface, one for the guest and one for - // the host. With additional setup on the IP address and the - // routing table, it enables the communications between the - // guest VM and the host machine. 
- // Details: https://wiki.libvirt.org/page/TroubleshootMacvtapHostFail - fn _test_macvtap(hotplug: bool, guest_macvtap_name: &str, host_macvtap_name: &str) { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - let api_socket = temp_api_path(&guest.tmp_dir); - - #[cfg(target_arch = "x86_64")] - let kernel_path = direct_kernel_boot_path(); - #[cfg(target_arch = "aarch64")] - let kernel_path = edk2_path(); - - let phy_net = "eth0"; - - // Create a macvtap interface for the guest VM to use - assert!( - exec_host_command_status(&format!( - "sudo ip link add link {phy_net} name {guest_macvtap_name} type macvtap mod bridge" - )) - .success() - ); - assert!( - exec_host_command_status(&format!( - "sudo ip link set {} address {} up", - guest_macvtap_name, guest.network.guest_mac0 - )) - .success() - ); - assert!( - exec_host_command_status(&format!("sudo ip link show {guest_macvtap_name}")).success() - ); - - let tap_index = - fs::read_to_string(format!("/sys/class/net/{guest_macvtap_name}/ifindex")).unwrap(); - let tap_device = format!("/dev/tap{}", tap_index.trim()); - - assert!(exec_host_command_status(&format!("sudo chown $UID.$UID {tap_device}")).success()); - - let cstr_tap_device = std::ffi::CString::new(tap_device).unwrap(); - let tap_fd1 = unsafe { libc::open(cstr_tap_device.as_ptr(), libc::O_RDWR) }; - assert!(tap_fd1 > 0); - let tap_fd2 = unsafe { libc::open(cstr_tap_device.as_ptr(), libc::O_RDWR) }; - assert!(tap_fd2 > 0); - - // Create a macvtap on the same physical net interface for - // the host machine to use - assert!( - exec_host_command_status(&format!( - "sudo ip link add link {phy_net} name {host_macvtap_name} type macvtap mod bridge" - )) - .success() - ); - // Use default mask "255.255.255.0" - assert!( - exec_host_command_status(&format!( - "sudo ip address add {}/24 dev {}", - guest.network.host_ip0, host_macvtap_name - )) - .success() - ); - assert!( - 
exec_host_command_status(&format!("sudo ip link set dev {host_macvtap_name} up")) - .success() - ); - - let mut guest_command = GuestCommand::new(&guest); - guest_command - .args(["--cpus", "boot=2"]) - .default_memory() - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .args(["--api-socket", &api_socket]); - - let net_params = format!( - "fd=[{},{}],mac={},num_queues=4", - tap_fd1, tap_fd2, guest.network.guest_mac0 - ); - - if !hotplug { - guest_command.args(["--net", &net_params]); - } - - let mut child = guest_command.capture_output().spawn().unwrap(); - - if hotplug { - // Give some time to the VMM process to listen to the API - // socket. This is the only requirement to avoid the following - // call to ch-remote from failing. - thread::sleep(std::time::Duration::new(10, 0)); - // Hotplug the virtio-net device - let (cmd_success, cmd_output) = - remote_command_w_output(&api_socket, "add-net", Some(&net_params)); - assert!(cmd_success); - #[cfg(target_arch = "x86_64")] - assert!( - String::from_utf8_lossy(&cmd_output) - .contains("{\"id\":\"_net2\",\"bdf\":\"0000:00:05.0\"}") - ); - #[cfg(target_arch = "aarch64")] - assert!( - String::from_utf8_lossy(&cmd_output) - .contains("{\"id\":\"_net0\",\"bdf\":\"0000:00:05.0\"}") - ); - } - - // The functional connectivity provided by the virtio-net device - // gets tested through wait_vm_boot() as it expects to receive a - // HTTP request, and through the SSH command as well. 
- let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - assert_eq!( - guest - .ssh_command("ip -o link | wc -l") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 2 - ); - - guest.reboot_linux(0); - - assert_eq!( - guest - .ssh_command("ip -o link | wc -l") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 2 - ); - }); - - kill_child(&mut child); - - exec_host_command_status(&format!("sudo ip link del {guest_macvtap_name}")); - exec_host_command_status(&format!("sudo ip link del {host_macvtap_name}")); - - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); - } - #[test] #[cfg_attr(target_arch = "aarch64", ignore = "See #5443")] fn test_macvtap() { - _test_macvtap(false, "guestmacvtap0", "hostmacvtap0"); + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME).with_cpu(2); + _test_macvtap(&guest, false, "guestmacvtap0", "hostmacvtap0"); } #[test] #[cfg_attr(target_arch = "aarch64", ignore = "See #5443")] fn test_macvtap_hotplug() { - _test_macvtap(true, "guestmacvtap1", "hostmacvtap1"); + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME).with_cpu(2); + _test_macvtap(&guest, true, "guestmacvtap1", "hostmacvtap1"); } #[test] From e89b58469453deaa17697ca84c6c32ba68210ba7 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 18:49:43 -0700 Subject: [PATCH 312/742] tests: extract _test_vdpa_block to tests_wrappers Extract test logic from test_vdpa_block into a shared _test_vdpa_block wrapper function in tests_wrappers.rs. The vdpa module check is kept in the parent test case. Update the parent to use basic_regular_guest macro with with_cpu(2). The wrapper uses default_kernel_cmdline() for kernel/cmdline setup. 
Signed-off-by: Muminul Islam --- .../tests/common/tests_wrappers.rs | 112 +++++++++++++++++ cloud-hypervisor/tests/integration.rs | 117 +----------------- 2 files changed, 114 insertions(+), 115 deletions(-) diff --git a/cloud-hypervisor/tests/common/tests_wrappers.rs b/cloud-hypervisor/tests/common/tests_wrappers.rs index 64db4e7de8..159a7b0ca8 100644 --- a/cloud-hypervisor/tests/common/tests_wrappers.rs +++ b/cloud-hypervisor/tests/common/tests_wrappers.rs @@ -3406,3 +3406,115 @@ pub(crate) fn _test_macvtap( handle_child_output(r, &output); } + +pub(crate) fn _test_vdpa_block(guest: &Guest) { + let api_socket = temp_api_path(&guest.tmp_dir); + + let mut child = GuestCommand::new(guest) + .default_cpus() + .args(["--memory", "size=512M,hugepages=on"]) + .default_kernel_cmdline_with_platform(Some("num_pci_segments=2,iommu_segments=1")) + .default_disks() + .default_net() + .args(["--vdpa", "path=/dev/vhost-vdpa-0,num_queues=1"]) + .args(["--api-socket", &api_socket]) + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + // Check both if /dev/vdc exists and if the block size is 128M. + assert_eq!( + guest + .ssh_command("lsblk | grep vdc | grep -c 128M") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 1 + ); + + // Check the content of the block device after we wrote to it. + // The vpda-sim-blk should let us read what we previously wrote. 
+ guest + .ssh_command("sudo bash -c 'echo foobar > /dev/vdc'") + .unwrap(); + assert_eq!( + guest.ssh_command("sudo head -1 /dev/vdc").unwrap().trim(), + "foobar" + ); + + // Hotplug an extra vDPA block device behind the vIOMMU + // Add a new vDPA device to the VM + let (cmd_success, cmd_output) = remote_command_w_output( + &api_socket, + "add-vdpa", + Some("id=myvdpa0,path=/dev/vhost-vdpa-1,num_queues=1,pci_segment=1,iommu=on"), + ); + assert!(cmd_success); + assert!( + String::from_utf8_lossy(&cmd_output) + .contains("{\"id\":\"myvdpa0\",\"bdf\":\"0001:00:01.0\"}") + ); + + thread::sleep(std::time::Duration::new(10, 0)); + + // Check IOMMU setup + assert!( + guest + .does_device_vendor_pair_match("0x1057", "0x1af4") + .unwrap_or_default() + ); + assert!( + guest + .ssh_command("ls /sys/kernel/iommu_groups/*/devices") + .unwrap() + .contains("0001:00:01.0") + ); + + // Check both if /dev/vdd exists and if the block size is 128M. + assert_eq!( + guest + .ssh_command("lsblk | grep vdd | grep -c 128M") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 1 + ); + + // Write some content to the block device we've just plugged. + guest + .ssh_command("sudo bash -c 'echo foobar > /dev/vdd'") + .unwrap(); + + // Check we can read the content back. 
+ assert_eq!( + guest.ssh_command("sudo head -1 /dev/vdd").unwrap().trim(), + "foobar" + ); + + // Unplug the device + let cmd_success = remote_command(&api_socket, "remove-device", Some("myvdpa0")); + assert!(cmd_success); + thread::sleep(std::time::Duration::new(10, 0)); + + // Check /dev/vdd doesn't exist anymore + assert_eq!( + guest + .ssh_command("lsblk | grep -c vdd || true") + .unwrap() + .trim() + .parse::() + .unwrap_or(1), + 0 + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index b4042400c8..a9acca630d 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -5408,121 +5408,8 @@ mod common_parallel { // Before trying to run the test, verify the vdpa_sim_blk module is correctly loaded. assert!(exec_host_command_status("lsmod | grep vdpa_sim_blk").success()); - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - let api_socket = temp_api_path(&guest.tmp_dir); - - let kernel_path = direct_kernel_boot_path(); - - let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=2"]) - .args(["--memory", "size=512M,hugepages=on"]) - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .default_net() - .args(["--vdpa", "path=/dev/vhost-vdpa-0,num_queues=1"]) - .args(["--platform", "num_pci_segments=2,iommu_segments=1"]) - .args(["--api-socket", &api_socket]) - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - // Check both if /dev/vdc exists and if the block size is 128M. 
- assert_eq!( - guest - .ssh_command("lsblk | grep vdc | grep -c 128M") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 1 - ); - - // Check the content of the block device after we wrote to it. - // The vpda-sim-blk should let us read what we previously wrote. - guest - .ssh_command("sudo bash -c 'echo foobar > /dev/vdc'") - .unwrap(); - assert_eq!( - guest.ssh_command("sudo head -1 /dev/vdc").unwrap().trim(), - "foobar" - ); - - // Hotplug an extra vDPA block device behind the vIOMMU - // Add a new vDPA device to the VM - let (cmd_success, cmd_output) = remote_command_w_output( - &api_socket, - "add-vdpa", - Some("id=myvdpa0,path=/dev/vhost-vdpa-1,num_queues=1,pci_segment=1,iommu=on"), - ); - assert!(cmd_success); - assert!( - String::from_utf8_lossy(&cmd_output) - .contains("{\"id\":\"myvdpa0\",\"bdf\":\"0001:00:01.0\"}") - ); - - thread::sleep(std::time::Duration::new(10, 0)); - - // Check IOMMU setup - assert!( - guest - .does_device_vendor_pair_match("0x1057", "0x1af4") - .unwrap_or_default() - ); - assert!( - guest - .ssh_command("ls /sys/kernel/iommu_groups/*/devices") - .unwrap() - .contains("0001:00:01.0") - ); - - // Check both if /dev/vdd exists and if the block size is 128M. - assert_eq!( - guest - .ssh_command("lsblk | grep vdd | grep -c 128M") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 1 - ); - - // Write some content to the block device we've just plugged. - guest - .ssh_command("sudo bash -c 'echo foobar > /dev/vdd'") - .unwrap(); - - // Check we can read the content back. 
- assert_eq!( - guest.ssh_command("sudo head -1 /dev/vdd").unwrap().trim(), - "foobar" - ); - - // Unplug the device - let cmd_success = remote_command(&api_socket, "remove-device", Some("myvdpa0")); - assert!(cmd_success); - thread::sleep(std::time::Duration::new(10, 0)); - - // Check /dev/vdd doesn't exist anymore - assert_eq!( - guest - .ssh_command("lsblk | grep -c vdd || true") - .unwrap() - .trim() - .parse::() - .unwrap_or(1), - 0 - ); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME).with_cpu(2); + _test_vdpa_block(&guest); } #[test] From 108391f720587176e6c79065184c80daaec03b01 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 18:51:42 -0700 Subject: [PATCH 313/742] tests: add CVM test for multiple_network_interfaces Add a CVM integration test for multiple_network_interfaces that validates the same functionality using a confidential guest with the basic_cvm_guest macro. Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration_cvm.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cloud-hypervisor/tests/integration_cvm.rs b/cloud-hypervisor/tests/integration_cvm.rs index fd47e21f01..25c6730d3b 100644 --- a/cloud-hypervisor/tests/integration_cvm.rs +++ b/cloud-hypervisor/tests/integration_cvm.rs @@ -218,4 +218,10 @@ mod common_cvm { let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); _test_dmi_oem_strings(&guest); } + + #[test] + fn test_multiple_network_interfaces() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); + _test_multiple_network_interfaces(&guest); + } } From 36871b3547fef9c42e78aa1ba56d8ec1225d4179 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 18:51:52 -0700 Subject: [PATCH 314/742] tests: add CVM test for serial_off Add a CVM integration test for serial_off that validates the same functionality using a confidential guest with the basic_cvm_guest macro. 
Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration_cvm.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cloud-hypervisor/tests/integration_cvm.rs b/cloud-hypervisor/tests/integration_cvm.rs index 25c6730d3b..3f464961d3 100644 --- a/cloud-hypervisor/tests/integration_cvm.rs +++ b/cloud-hypervisor/tests/integration_cvm.rs @@ -224,4 +224,10 @@ mod common_cvm { let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); _test_multiple_network_interfaces(&guest); } + + #[test] + fn test_serial_off() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); + _test_serial_off(&guest); + } } From e9d8ecf38ae7fa9c442cdf5e678b295daeef0802 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 18:52:05 -0700 Subject: [PATCH 315/742] tests: add CVM test for virtio_console Add a CVM integration test for virtio_console that validates the same functionality using a confidential guest with the basic_cvm_guest macro. Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration_cvm.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cloud-hypervisor/tests/integration_cvm.rs b/cloud-hypervisor/tests/integration_cvm.rs index 3f464961d3..b8261b30a9 100644 --- a/cloud-hypervisor/tests/integration_cvm.rs +++ b/cloud-hypervisor/tests/integration_cvm.rs @@ -230,4 +230,10 @@ mod common_cvm { let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); _test_serial_off(&guest); } + + #[test] + fn test_virtio_console() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); + _test_virtio_console(&guest); + } } From a4f1c65b14edeb54ce585a315c9af49f403810ad Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 18:52:15 -0700 Subject: [PATCH 316/742] tests: add CVM test for console_file Add a CVM integration test for console_file that validates the same functionality using a confidential guest with the basic_cvm_guest macro. 
Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration_cvm.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cloud-hypervisor/tests/integration_cvm.rs b/cloud-hypervisor/tests/integration_cvm.rs index b8261b30a9..02ad0e730c 100644 --- a/cloud-hypervisor/tests/integration_cvm.rs +++ b/cloud-hypervisor/tests/integration_cvm.rs @@ -236,4 +236,10 @@ mod common_cvm { let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); _test_virtio_console(&guest); } + + #[test] + fn test_console_file() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); + _test_console_file(&guest); + } } From f4df95c142b70fdee3e3d4fa6393a982efbfeebc Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 18:52:25 -0700 Subject: [PATCH 317/742] tests: add CVM test for direct_kernel_boot_noacpi Add a CVM integration test for direct_kernel_boot_noacpi that validates the same functionality using a confidential guest with the basic_cvm_guest macro. Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration_cvm.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cloud-hypervisor/tests/integration_cvm.rs b/cloud-hypervisor/tests/integration_cvm.rs index 02ad0e730c..26cb3b1921 100644 --- a/cloud-hypervisor/tests/integration_cvm.rs +++ b/cloud-hypervisor/tests/integration_cvm.rs @@ -242,4 +242,10 @@ mod common_cvm { let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); _test_console_file(&guest); } + + #[test] + fn test_direct_kernel_boot_noacpi() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); + _test_direct_kernel_boot_noacpi(&guest); + } } From 0c477c12ecfebc3d653c3bea6940809ed881ed1e Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 18:52:36 -0700 Subject: [PATCH 318/742] tests: add CVM test for pci_bar_reprogramming Add a CVM integration test for pci_bar_reprogramming that validates the same functionality using a confidential guest with the basic_cvm_guest macro. 
Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration_cvm.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cloud-hypervisor/tests/integration_cvm.rs b/cloud-hypervisor/tests/integration_cvm.rs index 26cb3b1921..c28ecc810f 100644 --- a/cloud-hypervisor/tests/integration_cvm.rs +++ b/cloud-hypervisor/tests/integration_cvm.rs @@ -248,4 +248,10 @@ mod common_cvm { let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); _test_direct_kernel_boot_noacpi(&guest); } + + #[test] + fn test_pci_bar_reprogramming() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); + _test_pci_bar_reprogramming(&guest); + } } From 833d1d39a5a6f87fdfe7ba9066d0b8c3674853ce Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 18:53:59 -0700 Subject: [PATCH 319/742] tests: add CVM test for memory_overhead Add a CVM integration test for memory_overhead that validates the same functionality using a confidential guest with the basic_cvm_guest macro and custom memory size. Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration_cvm.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cloud-hypervisor/tests/integration_cvm.rs b/cloud-hypervisor/tests/integration_cvm.rs index c28ecc810f..41cd75192f 100644 --- a/cloud-hypervisor/tests/integration_cvm.rs +++ b/cloud-hypervisor/tests/integration_cvm.rs @@ -254,4 +254,12 @@ mod common_cvm { let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); _test_pci_bar_reprogramming(&guest); } + + #[test] + fn test_memory_overhead() { + let guest_memory_size_kb: u32 = 512 * 1024; + let guest = + basic_cvm_guest!(JAMMY_IMAGE_NAME).with_memory(&format!("{guest_memory_size_kb}K")); + _test_memory_overhead(&guest, guest_memory_size_kb); + } } From 9564b4f278a737f088da5a3c8ff1cc461ac8be74 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 18:54:09 -0700 Subject: [PATCH 320/742] tests: add CVM test for landlock Add a CVM integration test for landlock that validates the same functionality using a confidential guest 
with the basic_cvm_guest macro. Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration_cvm.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cloud-hypervisor/tests/integration_cvm.rs b/cloud-hypervisor/tests/integration_cvm.rs index 41cd75192f..9b2647c5e7 100644 --- a/cloud-hypervisor/tests/integration_cvm.rs +++ b/cloud-hypervisor/tests/integration_cvm.rs @@ -262,4 +262,10 @@ mod common_cvm { basic_cvm_guest!(JAMMY_IMAGE_NAME).with_memory(&format!("{guest_memory_size_kb}K")); _test_memory_overhead(&guest, guest_memory_size_kb); } + + #[test] + fn test_landlock() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); + _test_landlock(&guest); + } } From 60abdf0ea15d3db357b7adf1a9661deb0e98334b Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 18:54:18 -0700 Subject: [PATCH 321/742] tests: add CVM test for disk_hotplug Add a CVM integration test for disk_hotplug that validates the same functionality using a confidential guest with the basic_cvm_guest macro. Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration_cvm.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cloud-hypervisor/tests/integration_cvm.rs b/cloud-hypervisor/tests/integration_cvm.rs index 9b2647c5e7..d18ddc2c8b 100644 --- a/cloud-hypervisor/tests/integration_cvm.rs +++ b/cloud-hypervisor/tests/integration_cvm.rs @@ -268,4 +268,10 @@ mod common_cvm { let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); _test_landlock(&guest); } + + #[test] + fn test_disk_hotplug() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); + _test_disk_hotplug(&guest, false); + } } From 9a88b9a42e45bde82decb8c3d6fbb07d1c725c6b Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 18:54:27 -0700 Subject: [PATCH 322/742] tests: add CVM test for net_hotplug Add a CVM integration test for net_hotplug that validates the same functionality using a confidential guest with the basic_cvm_guest macro. 
Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration_cvm.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/cloud-hypervisor/tests/integration_cvm.rs b/cloud-hypervisor/tests/integration_cvm.rs index d18ddc2c8b..b99b262538 100644 --- a/cloud-hypervisor/tests/integration_cvm.rs +++ b/cloud-hypervisor/tests/integration_cvm.rs @@ -16,6 +16,7 @@ mod common_cvm { use common::tests_wrappers::*; use common::utils::*; use test_infra::*; + const NUM_PCI_SEGMENTS: u16 = 8; use super::*; macro_rules! basic_cvm_guest { @@ -101,9 +102,8 @@ mod common_cvm { // Use 8 segments to test the multiple segment support since it's more than the default 6 // supported by Linux // IGVM file used by Sev-Snp Guest now support up to 8 segments, so we can use 8 segments for testing. - let num_pci_segments: u16 = 8; let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); - _test_pci_multiple_segments(&guest, num_pci_segments, 5); + _test_pci_multiple_segments(&guest, NUM_PCI_SEGMENTS, 5); } #[test] @@ -274,4 +274,10 @@ mod common_cvm { let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); _test_disk_hotplug(&guest, false); } + + #[test] + fn test_net_hotplug() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); + _test_net_hotplug(&guest, NUM_PCI_SEGMENTS, None); + } } From 81f26bd99514444007d374df031169b0af7d0102 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 18:54:37 -0700 Subject: [PATCH 323/742] tests: add CVM test for counters Add a CVM integration test for counters that validates the same functionality using a confidential guest with the basic_cvm_guest macro. 
Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration_cvm.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cloud-hypervisor/tests/integration_cvm.rs b/cloud-hypervisor/tests/integration_cvm.rs index b99b262538..2b0275bfe1 100644 --- a/cloud-hypervisor/tests/integration_cvm.rs +++ b/cloud-hypervisor/tests/integration_cvm.rs @@ -280,4 +280,10 @@ mod common_cvm { let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); _test_net_hotplug(&guest, NUM_PCI_SEGMENTS, None); } + + #[test] + fn test_counters() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); + _test_counters(&guest); + } } From 5ad9822a5005eb8d257560c234951c4f630602d7 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 18:54:46 -0700 Subject: [PATCH 324/742] tests: add CVM test for watchdog Add a CVM integration test for watchdog that validates the same functionality using a confidential guest with the basic_cvm_guest macro and FOCAL_IMAGE_NAME. Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration_cvm.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cloud-hypervisor/tests/integration_cvm.rs b/cloud-hypervisor/tests/integration_cvm.rs index 2b0275bfe1..fedc1c6080 100644 --- a/cloud-hypervisor/tests/integration_cvm.rs +++ b/cloud-hypervisor/tests/integration_cvm.rs @@ -286,4 +286,10 @@ mod common_cvm { let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); _test_counters(&guest); } + + #[test] + fn test_watchdog() { + let guest = basic_cvm_guest!(FOCAL_IMAGE_NAME); + _test_watchdog(&guest); + } } From 7d672872d634eb9e5ab087a35af2425a53a51fba Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 18:54:55 -0700 Subject: [PATCH 325/742] tests: add CVM test for pvpanic Add a CVM integration test for pvpanic that validates the same functionality using a confidential guest with the basic_cvm_guest macro. 
Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration_cvm.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cloud-hypervisor/tests/integration_cvm.rs b/cloud-hypervisor/tests/integration_cvm.rs index fedc1c6080..254542427c 100644 --- a/cloud-hypervisor/tests/integration_cvm.rs +++ b/cloud-hypervisor/tests/integration_cvm.rs @@ -292,4 +292,10 @@ mod common_cvm { let guest = basic_cvm_guest!(FOCAL_IMAGE_NAME); _test_watchdog(&guest); } + + #[test] + fn test_pvpanic() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); + _test_pvpanic(&guest); + } } From 6fa6ead2b3830fb2f1d1330e0ac1079bfc89fc0b Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 18:55:05 -0700 Subject: [PATCH 326/742] tests: add CVM test for tap_from_fd Add a CVM integration test for tap_from_fd that validates the same functionality using a confidential guest with the basic_cvm_guest macro and with_cpu(2). Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration_cvm.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cloud-hypervisor/tests/integration_cvm.rs b/cloud-hypervisor/tests/integration_cvm.rs index 254542427c..0002f6084e 100644 --- a/cloud-hypervisor/tests/integration_cvm.rs +++ b/cloud-hypervisor/tests/integration_cvm.rs @@ -298,4 +298,10 @@ mod common_cvm { let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); _test_pvpanic(&guest); } + + #[test] + fn test_tap_from_fd() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME).with_cpu(2); + _test_tap_from_fd(&guest); + } } From b7bd19b24a34d1c728c49941a754d94120f519f7 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 18:55:49 -0700 Subject: [PATCH 327/742] tests: add CVM test for macvtap Add a CVM integration test for macvtap that validates the same functionality using a confidential guest with the basic_cvm_guest macro and with_cpu(2). 
Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration_cvm.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cloud-hypervisor/tests/integration_cvm.rs b/cloud-hypervisor/tests/integration_cvm.rs index 0002f6084e..8b8ab00f2b 100644 --- a/cloud-hypervisor/tests/integration_cvm.rs +++ b/cloud-hypervisor/tests/integration_cvm.rs @@ -304,4 +304,10 @@ mod common_cvm { let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME).with_cpu(2); _test_tap_from_fd(&guest); } + + #[test] + fn test_macvtap() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME).with_cpu(2); + _test_macvtap(&guest, false, "guestmacvtap0", "hostmacvtap0"); + } } From 7e62565cca6371d2aaf6c0a0be1c70b56f74d5ae Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 18:56:04 -0700 Subject: [PATCH 328/742] tests: add CVM test for macvtap_hotplug Add a CVM integration test for macvtap_hotplug that validates the same functionality using a confidential guest with the basic_cvm_guest macro and with_cpu(2). Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration_cvm.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cloud-hypervisor/tests/integration_cvm.rs b/cloud-hypervisor/tests/integration_cvm.rs index 8b8ab00f2b..9ee575ca7a 100644 --- a/cloud-hypervisor/tests/integration_cvm.rs +++ b/cloud-hypervisor/tests/integration_cvm.rs @@ -310,4 +310,10 @@ mod common_cvm { let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME).with_cpu(2); _test_macvtap(&guest, false, "guestmacvtap0", "hostmacvtap0"); } + + #[test] + fn test_macvtap_hotplug() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME).with_cpu(2); + _test_macvtap(&guest, true, "guestmacvtap1", "hostmacvtap1"); + } } From 80ce6960062a001f92afe19d5f69f290818c6654 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Sat, 21 Mar 2026 18:56:19 -0700 Subject: [PATCH 329/742] tests: add CVM test for vdpa_block Add a CVM integration test for vdpa_block that validates the same functionality using a confidential guest with the 
basic_cvm_guest macro and with_cpu(2). Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/integration_cvm.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cloud-hypervisor/tests/integration_cvm.rs b/cloud-hypervisor/tests/integration_cvm.rs index 9ee575ca7a..039f8840ff 100644 --- a/cloud-hypervisor/tests/integration_cvm.rs +++ b/cloud-hypervisor/tests/integration_cvm.rs @@ -316,4 +316,12 @@ mod common_cvm { let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME).with_cpu(2); _test_macvtap(&guest, true, "guestmacvtap1", "hostmacvtap1"); } + + #[test] + fn test_vdpa_block() { + assert!(exec_host_command_status("lsmod | grep vdpa_sim_blk").success()); + + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME).with_cpu(2); + _test_vdpa_block(&guest); + } } From 11d8ac48cbf5ef2e8417c9e58cd3406769d0c07f Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Wed, 25 Mar 2026 05:33:14 -0700 Subject: [PATCH 330/742] build: Use released vfio & vhost crates Replace the git dependencies with newly released versions Signed-off-by: Rob Bradford --- Cargo.lock | 25 +++++++++++++++---------- Cargo.toml | 10 +++++----- fuzz/Cargo.lock | 20 ++++++++++++-------- 3 files changed, 32 insertions(+), 23 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b009dfa901..ad3fe2b934 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2277,16 +2277,18 @@ checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" [[package]] name = "vfio-bindings" -version = "0.6.1" -source = "git+https://github.com/rust-vmm/vfio?rev=df861a878168ad71602d8a1945bd3b7acbd22693#df861a878168ad71602d8a1945bd3b7acbd22693" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "188dac3057a0cbc94470085204c84b82ff7ec5dac629a514323cd133d1f9abe0" dependencies = [ "vmm-sys-util", ] [[package]] name = "vfio-ioctls" -version = "0.5.2" -source = "git+https://github.com/rust-vmm/vfio?rev=df861a878168ad71602d8a1945bd3b7acbd22693#df861a878168ad71602d8a1945bd3b7acbd22693" 
+version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c80c6d52f8e592e31a8f7eb45e882a9617aa61ec2479981a175e9f0a79f2434e" dependencies = [ "byteorder", "kvm-bindings", @@ -2303,8 +2305,9 @@ dependencies = [ [[package]] name = "vfio_user" -version = "0.1.2" -source = "git+https://github.com/rust-vmm/vfio?rev=df861a878168ad71602d8a1945bd3b7acbd22693#df861a878168ad71602d8a1945bd3b7acbd22693" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "731c2582dd43f4f174ab47b4c933a1a9bb872d9d1b7f54c5867e12dbc1491b75" dependencies = [ "bitflags 2.11.0", "libc", @@ -2320,8 +2323,9 @@ dependencies = [ [[package]] name = "vhost" -version = "0.15.0" -source = "git+https://github.com/rust-vmm/vhost?rev=c9b80a1c93bac7820e4aee4269aa904568937035#c9b80a1c93bac7820e4aee4269aa904568937035" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee90657203a8644e9a0860a0db6a7887d8ef0c7bc09fc22dfa4ae75df65bac86" dependencies = [ "bitflags 2.11.0", "libc", @@ -2332,8 +2336,9 @@ dependencies = [ [[package]] name = "vhost-user-backend" -version = "0.21.0" -source = "git+https://github.com/rust-vmm/vhost?rev=c9b80a1c93bac7820e4aee4269aa904568937035#c9b80a1c93bac7820e4aee4269aa904568937035" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d5925983d8fb537752ad3e26604c0a17abfa5de77cb6773a096c8a959c9eca0f" dependencies = [ "libc", "log", diff --git a/Cargo.toml b/Cargo.toml index b8320172e1..cc2bd3c175 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -59,11 +59,11 @@ linux-loader = "0.13.2" mshv-bindings = "0.6.7" mshv-ioctls = "0.6.7" seccompiler = "0.5.0" -vfio-bindings = { git = "https://github.com/rust-vmm/vfio", rev = "df861a878168ad71602d8a1945bd3b7acbd22693", default-features = false } -vfio-ioctls = { git = "https://github.com/rust-vmm/vfio", rev = "df861a878168ad71602d8a1945bd3b7acbd22693", default-features = false } 
-vfio_user = { git = "https://github.com/rust-vmm/vfio", rev = "df861a878168ad71602d8a1945bd3b7acbd22693", default-features = false } -vhost = { git = "https://github.com/rust-vmm/vhost", rev = "c9b80a1c93bac7820e4aee4269aa904568937035", default-features = false } -vhost-user-backend = { git = "https://github.com/rust-vmm/vhost", rev = "c9b80a1c93bac7820e4aee4269aa904568937035", default-features = false } +vfio-bindings = { version = "0.6.2", default-features = false } +vfio-ioctls = { version = "0.5.3", default-features = false } +vfio_user = { version = "0.1.3", default-features = false } +vhost = { version = "0.16.0", default-features = false } +vhost-user-backend = { version = "0.22.0", default-features = false } virtio-bindings = "0.2.6" virtio-queue = "0.17.0" vm-fdt = "0.3.0" diff --git a/fuzz/Cargo.lock b/fuzz/Cargo.lock index d5cda1e24f..559fd35ce6 100644 --- a/fuzz/Cargo.lock +++ b/fuzz/Cargo.lock @@ -1273,16 +1273,18 @@ dependencies = [ [[package]] name = "vfio-bindings" -version = "0.6.1" -source = "git+https://github.com/rust-vmm/vfio?rev=df861a878168ad71602d8a1945bd3b7acbd22693#df861a878168ad71602d8a1945bd3b7acbd22693" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "188dac3057a0cbc94470085204c84b82ff7ec5dac629a514323cd133d1f9abe0" dependencies = [ "vmm-sys-util", ] [[package]] name = "vfio-ioctls" -version = "0.5.2" -source = "git+https://github.com/rust-vmm/vfio?rev=df861a878168ad71602d8a1945bd3b7acbd22693#df861a878168ad71602d8a1945bd3b7acbd22693" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c80c6d52f8e592e31a8f7eb45e882a9617aa61ec2479981a175e9f0a79f2434e" dependencies = [ "byteorder", "kvm-bindings", @@ -1297,8 +1299,9 @@ dependencies = [ [[package]] name = "vfio_user" -version = "0.1.2" -source = "git+https://github.com/rust-vmm/vfio?rev=df861a878168ad71602d8a1945bd3b7acbd22693#df861a878168ad71602d8a1945bd3b7acbd22693" +version = "0.1.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "731c2582dd43f4f174ab47b4c933a1a9bb872d9d1b7f54c5867e12dbc1491b75" dependencies = [ "bitflags 2.11.0", "libc", @@ -1314,8 +1317,9 @@ dependencies = [ [[package]] name = "vhost" -version = "0.15.0" -source = "git+https://github.com/rust-vmm/vhost?rev=c9b80a1c93bac7820e4aee4269aa904568937035#c9b80a1c93bac7820e4aee4269aa904568937035" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee90657203a8644e9a0860a0db6a7887d8ef0c7bc09fc22dfa4ae75df65bac86" dependencies = [ "bitflags 2.11.0", "libc", From a8d962640f0dabde12dd7738384a13081c23b3a2 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Wed, 25 Mar 2026 06:28:19 -0700 Subject: [PATCH 331/742] build: Use newer virtiofsd Switch to a git hash of virtiofsd as unfortunately the last release is some time ago and does not incorporate a dependency bump that enables it to work with snapshot-restore. Signed-off-by: Rob Bradford --- scripts/run_integration_tests_aarch64.sh | 2 +- scripts/run_integration_tests_x86_64.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/run_integration_tests_aarch64.sh b/scripts/run_integration_tests_aarch64.sh index 2489d7892a..b4c3482d0e 100755 --- a/scripts/run_integration_tests_aarch64.sh +++ b/scripts/run_integration_tests_aarch64.sh @@ -13,7 +13,7 @@ build_virtiofsd() { VIRTIOFSD_DIR="$WORKLOADS_DIR/virtiofsd_build" VIRTIOFSD_REPO="https://gitlab.com/virtio-fs/virtiofsd.git" - checkout_repo "$VIRTIOFSD_DIR" "$VIRTIOFSD_REPO" v1.13.3 "bbf82173682a3e48083771a0a23331e5c23b4924" + checkout_repo "$VIRTIOFSD_DIR" "$VIRTIOFSD_REPO" main "0f5865629dc995a3e9d5a73b4eb45bb91740bccb" if [ ! 
-f "$VIRTIOFSD_DIR/.built" ]; then pushd "$VIRTIOFSD_DIR" || exit diff --git a/scripts/run_integration_tests_x86_64.sh b/scripts/run_integration_tests_x86_64.sh index 80ac279349..858bd2f872 100755 --- a/scripts/run_integration_tests_x86_64.sh +++ b/scripts/run_integration_tests_x86_64.sh @@ -124,7 +124,7 @@ if [ ! -f "$VIRTIOFSD" ]; then pushd "$WORKLOADS_DIR" || exit git clone "https://gitlab.com/virtio-fs/virtiofsd.git" $VIRTIOFSD_DIR pushd $VIRTIOFSD_DIR || exit - git checkout v1.13.3 + git checkout 0f5865629dc995a3e9d5a73b4eb45bb91740bccb time cargo build --release cp target/release/virtiofsd "$VIRTIOFSD" || exit 1 popd || exit From 0b90180266363e39019538b61312709380aed1d1 Mon Sep 17 00:00:00 2001 From: Alyssa Ross Date: Wed, 25 Mar 2026 14:49:22 +0100 Subject: [PATCH 332/742] vmm: use UAPI name for PIDTYPE_PID As far as I can tell, PIDTYPE_PID is a kernel-internal name, and PR_SCHED_CORE_SCOPE_THREAD is the UAPI name. There's no PIDTYPE_PID in the UAPI headers, and the core scheduling documentation says that the fourth prctl argument should be a "PR_SCHED_CORE_SCOPE_-prefixed macro constant". Link: https://www.kernel.org/doc/html/v6.19/admin-guide/hw-vuln/core-scheduling.html#usage Fixes: 3f800d2bb ("vmm: Add core scheduling support for vCPU threads") Signed-off-by: Alyssa Ross --- vmm/src/cpu.rs | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/vmm/src/cpu.rs b/vmm/src/cpu.rs index 852850b9fa..e855204566 100644 --- a/vmm/src/cpu.rs +++ b/vmm/src/cpu.rs @@ -230,14 +230,22 @@ const PR_SCHED_CORE: libc::c_int = 62; const PR_SCHED_CORE_GET: libc::c_int = 0; const PR_SCHED_CORE_CREATE: libc::c_int = 1; const PR_SCHED_CORE_SHARE_FROM: libc::c_int = 3; -const PIDTYPE_PID: libc::c_int = 0; +const PR_SCHED_CORE_SCOPE_THREAD: libc::c_int = 0; /// Create a new unique core scheduling cookie for the current thread. /// Silently succeeds on kernels that don't support PR_SCHED_CORE. 
fn core_scheduling_create() -> Result<()> { // SAFETY: prctl with PR_SCHED_CORE_CREATE on the current thread (pid=0). // All arguments are valid constants. We check the return value. - let ret = unsafe { libc::prctl(PR_SCHED_CORE, PR_SCHED_CORE_CREATE, 0, PIDTYPE_PID, 0) }; + let ret = unsafe { + libc::prctl( + PR_SCHED_CORE, + PR_SCHED_CORE_CREATE, + 0, + PR_SCHED_CORE_SCOPE_THREAD, + 0, + ) + }; if ret == -1 { let err = io::Error::last_os_error(); // EINVAL: kernel < 5.14 where PR_SCHED_CORE is unknown. @@ -261,7 +269,15 @@ fn core_scheduling_create() -> Result<()> { fn core_scheduling_share_from(tid: i32) -> Result<()> { // SAFETY: prctl with PR_SCHED_CORE_SHARE_FROM targeting tid. // All arguments are valid. We check the return value. - let ret = unsafe { libc::prctl(PR_SCHED_CORE, PR_SCHED_CORE_SHARE_FROM, tid, PIDTYPE_PID, 0) }; + let ret = unsafe { + libc::prctl( + PR_SCHED_CORE, + PR_SCHED_CORE_SHARE_FROM, + tid, + PR_SCHED_CORE_SCOPE_THREAD, + 0, + ) + }; if ret == -1 { let err = io::Error::last_os_error(); match err.raw_os_error() { @@ -283,7 +299,7 @@ fn core_scheduling_share_from(tid: i32) -> Result<()> { fn core_scheduling_cookie() -> u64 { PR_SCHED_CORE, PR_SCHED_CORE_GET, 0, - PIDTYPE_PID, + PR_SCHED_CORE_SCOPE_THREAD, &mut cookie as *mut u64, ) }; From 65073259c6a9567f8f40475da00e093aff61498c Mon Sep 17 00:00:00 2001 From: Dylan Reid Date: Wed, 25 Mar 2026 18:08:21 -0700 Subject: [PATCH 333/742] vmm: handle malformed balloon actual from guest The actual size of the balloon is taken directly from the guest. A misbehaving guest can set it to an arbitrary value and cause underflow on the next vm.info call. Use a saturating_sub instead to avoid a panic in a debug build or a crazy number in a release build. 
Signed-off-by: Dylan Reid --- vmm/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index d1cf6693a3..03edec26b5 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -2061,7 +2061,7 @@ impl RequestHandler for Vmm { let mut memory_actual_size = config.memory.total_size(); if let Some(vm) = &self.vm { - memory_actual_size -= vm.balloon_size(); + memory_actual_size = memory_actual_size.saturating_sub(vm.balloon_size()); } let device_tree = self From 57e766bdbbfcdf1f36f696fc735fbebbea97f5ca Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Thu, 26 Mar 2026 06:44:23 -0700 Subject: [PATCH 334/742] virtio-devices: Only try and activate if the device became ready Previously this code could lead to the device trying to be activated multiple times as the code to trigger the activation was based on the state of the device (not yet activated and device being ready). This could occur if another vCPU wrote to a PCI BAR on this device before the device activation was completed by the VMM thread. Now we only trigger the activation if the device readiness has changed as a result of this BAR write (by checking that the readiness was originally unready.) 
Signed-off-by: Rob Bradford --- virtio-devices/src/transport/pci_device.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/virtio-devices/src/transport/pci_device.rs b/virtio-devices/src/transport/pci_device.rs index 3e2a96ccd9..1eb1cc03f7 100644 --- a/virtio-devices/src/transport/pci_device.rs +++ b/virtio-devices/src/transport/pci_device.rs @@ -1179,6 +1179,7 @@ impl PciDevice for VirtioPciDevice { } fn write_bar(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option> { + let initial_ready = self.is_driver_ready(); match offset { o if o < COMMON_CONFIG_BAR_OFFSET + COMMON_CONFIG_SIZE => self.common_config.write( o - COMMON_CONFIG_BAR_OFFSET, @@ -1230,8 +1231,8 @@ impl PciDevice for VirtioPciDevice { _ => (), } - // Try and activate the device if the driver status has changed - if self.needs_activation() { + // Try and activate the device if the driver status has changed (from unready to ready) + if !initial_ready && self.needs_activation() { let barrier = Arc::new(Barrier::new(2)); let activator = self.prepare_activator(Some(barrier.clone())); self.pending_activations.lock().unwrap().push(activator); From 5bb4ea20a57615439629cc83736e5cde14e10820 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Thu, 26 Mar 2026 18:49:56 -0700 Subject: [PATCH 335/742] scripts: Fix volume argument splitting in dev_cli.sh Use IFS-based splitting instead of parameter expansion to correctly separate '#'-delimited volume paths in process_volumes_args(). The previous approach placed all volumes into a single array element, causing Docker to receive malformed --volume arguments. 
Signed-off-by: Muminul Islam --- scripts/dev_cli.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/dev_cli.sh b/scripts/dev_cli.sh index 0e190a0072..0c0a2d5e4f 100755 --- a/scripts/dev_cli.sh +++ b/scripts/dev_cli.sh @@ -165,7 +165,7 @@ process_volumes_args() { return fi exported_volumes="" - arr_vols=("${arg_vols//#/ }") + IFS='#' read -ra arr_vols <<<"$arg_vols" for var in "${arr_vols[@]}"; do dev=$(echo "$var" | cut -d ':' -f 1) if [[ ! -e "$dev" ]]; then From 7360bfe33a848f7d13b3ccc7df0759715f8e2f87 Mon Sep 17 00:00:00 2001 From: Bo Chen Date: Wed, 21 May 2025 21:02:56 +0000 Subject: [PATCH 336/742] pci, vmm: Switch to use more generic `VfioOps` trait Replace the concrete `VfioContainer` type with the `VfioOps` trait object for device passthrough. This decouples the VFIO DMA mapping interface from the legacy VFIO container/group implementation, allowing it to be extended to support VFIO cdev and iommufd in the future. Signed-off-by: Bo Chen --- pci/src/vfio.rs | 12 +++++------- vmm/src/device_manager.rs | 4 ++-- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/pci/src/vfio.rs b/pci/src/vfio.rs index ee4e43e31e..7f97e73dad 100644 --- a/pci/src/vfio.rs +++ b/pci/src/vfio.rs @@ -19,9 +19,7 @@ use log::{error, info}; use serde::{Deserialize, Serialize}; use thiserror::Error; use vfio_bindings::bindings::vfio::*; -use vfio_ioctls::{ - VfioContainer, VfioDevice, VfioIrq, VfioRegionInfoCap, VfioRegionSparseMmapArea, -}; +use vfio_ioctls::{VfioDevice, VfioIrq, VfioOps, VfioRegionInfoCap, VfioRegionSparseMmapArea}; use vm_allocator::page_size::{ align_page_size_down, align_page_size_up, is_4k_aligned, is_4k_multiple, is_page_size_aligned, }; @@ -1468,7 +1466,7 @@ pub struct VfioPciDevice { id: String, vm: Arc, device: Arc, - container: Arc, + container: Arc, common: VfioCommon, iommu_attached: bool, memory_slot_allocator: MemorySlotAllocator, @@ -1483,7 +1481,7 @@ impl VfioPciDevice { id: String, vm: Arc, device: VfioDevice, - 
container: Arc, + container: Arc, msi_interrupt_manager: Arc>, legacy_interrupt_group: Option>, iommu_attached: bool, @@ -2000,7 +1998,7 @@ impl Migratable for VfioPciDevice {} /// be used when the caller tries to provide a way to update the mappings /// associated with a specific VFIO container. pub struct VfioDmaMapping { - container: Arc, + container: Arc, memory: Arc, mmio_regions: Arc>>, } @@ -2012,7 +2010,7 @@ impl VfioDmaMapping { /// * `memory`: guest memory to mmap. /// * `mmio_regions`: mmio_regions to mmap. pub fn new( - container: Arc, + container: Arc, memory: Arc, mmio_regions: Arc>>, ) -> Self { diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 50b82772d0..8dc2399e34 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -1032,7 +1032,7 @@ pub struct DeviceManager { // VFIO container // Only one container can be created, therefore it is stored as part of the // DeviceManager to be reused. - vfio_container: Option>, + vfio_container: Option>, // Paravirtualized IOMMU iommu_device: Option>>, @@ -3798,7 +3798,7 @@ impl DeviceManager { self.add_vfio_device(device_cfg) } - fn create_vfio_container(&self) -> DeviceManagerResult> { + fn create_vfio_container(&self) -> DeviceManagerResult> { let passthrough_device = self .passthrough_device .as_ref() From 1bc49758a0e10bc8a8515baba105c40bc30678b0 Mon Sep 17 00:00:00 2001 From: Bo Chen Date: Thu, 26 Mar 2026 22:38:01 +0000 Subject: [PATCH 337/742] pci, vmm: Cleanup the naming and references to VFIO container Following the `VfioContainer` to `VfioOps` trait switch, update the remaining field names, method names, comments, and log messages to use `vfio_ops` and "host IOMMU address space" consistently. No fucntional changes. 
Signed-off-by: Bo Chen --- pci/src/vfio.rs | 44 ++++++++++----------- vmm/src/device_manager.rs | 81 ++++++++++++++++++++------------------- 2 files changed, 64 insertions(+), 61 deletions(-) diff --git a/pci/src/vfio.rs b/pci/src/vfio.rs index 7f97e73dad..dbcf59fbe8 100644 --- a/pci/src/vfio.rs +++ b/pci/src/vfio.rs @@ -1466,7 +1466,7 @@ pub struct VfioPciDevice { id: String, vm: Arc, device: Arc, - container: Arc, + vfio_ops: Arc, common: VfioCommon, iommu_attached: bool, memory_slot_allocator: MemorySlotAllocator, @@ -1481,7 +1481,7 @@ impl VfioPciDevice { id: String, vm: Arc, device: VfioDevice, - container: Arc, + vfio_ops: Arc, msi_interrupt_manager: Arc>, legacy_interrupt_group: Option>, iommu_attached: bool, @@ -1510,7 +1510,7 @@ impl VfioPciDevice { id, vm, device, - container, + vfio_ops, common, iommu_attached, memory_slot_allocator, @@ -1705,7 +1705,7 @@ impl VfioPciDevice { // user_memory_region.mapping.len() bytes of // valid memory that will only be unmapped with munmap(). unsafe { - self.container.vfio_dma_map( + self.vfio_ops.vfio_dma_map( user_memory_region.start, user_memory_region.mapping.len(), user_memory_region.mapping.addr(), @@ -1726,15 +1726,15 @@ impl VfioPciDevice { for user_memory_region in region.user_memory_regions.drain(..) 
{ let len = user_memory_region.mapping.len(); let host_addr = user_memory_region.mapping.addr(); - // Unmap from vfio container + // Unmap MMIO region from the host IOMMU address space via VfioOps if !self.iommu_attached && let Err(e) = self - .container + .vfio_ops .vfio_dma_unmap(user_memory_region.start, len) .map_err(|e| VfioPciError::DmaUnmap(e, self.device_path.clone(), self.bdf)) { error!( - "Could not unmap mmio region from vfio container: \ + "Could not unmap MMIO region from the host IOMMU address space: \ iova 0x{:x}, size 0x{:x}: {}, ", user_memory_region.start, len, e ); @@ -1884,17 +1884,17 @@ impl PciDevice for VfioPciDevice { for user_memory_region in region.user_memory_regions.iter_mut() { let len = user_memory_region.mapping.len(); let host_addr = user_memory_region.mapping.addr(); - // Unmap the old MMIO region from vfio container + // Unmap the old MMIO region from the host IOMMU address space via VfioOps if !self.iommu_attached && let Err(e) = self - .container + .vfio_ops .vfio_dma_unmap(user_memory_region.start, len) .map_err(|e| { VfioPciError::DmaUnmap(e, self.device_path.clone(), self.bdf) }) { error!( - "Could not unmap mmio region from vfio container: \ + "Could not unmap MMIO region from the host IOMMU address space: \ iova 0x{:x}, size 0x{:x}: {}, ", user_memory_region.start, len, e ); @@ -1938,7 +1938,7 @@ iova 0x{:x}, size 0x{:x}: {}, ", } .map_err(io::Error::other)?; - // Map the moved mmio region to vfio container + // Map the moved MMIO region into the host IOMMU address space via VfioOps if !self.iommu_attached { // vfio_dma_map is unsound and ought to be marked as unsafe #[allow(unused_unsafe)] @@ -1946,13 +1946,13 @@ iova 0x{:x}, size 0x{:x}: {}, ", // host_addr points to len bytes of // valid memory that will only be unmapped with munmap(). 
unsafe { - self.container + self.vfio_ops .vfio_dma_map(user_memory_region.start, len, host_addr) } .map_err(|e| VfioPciError::DmaMap(e, self.device_path.clone(), self.bdf)) .map_err(|e| { io::Error::other(format!( - "Could not map mmio region to vfio container: \ + "Could not map MMIO region into the host IOMMU address space: \ iova 0x{:x}, size 0x{:x}: {}, ", user_memory_region.start, len, e )) @@ -1996,9 +1996,9 @@ impl Migratable for VfioPciDevice {} /// This structure implements the ExternalDmaMapping trait. It is meant to /// be used when the caller tries to provide a way to update the mappings -/// associated with a specific VFIO container. +/// associated with a specific VfioOps instance. pub struct VfioDmaMapping { - container: Arc, + vfio_ops: Arc, memory: Arc, mmio_regions: Arc>>, } @@ -2006,16 +2006,16 @@ pub struct VfioDmaMapping { impl VfioDmaMapping { /// Create a DmaMapping object. /// # Parameters - /// * `container`: VFIO container object. + /// * `vfio_ops`: VfioOps instance. /// * `memory`: guest memory to mmap. /// * `mmio_regions`: mmio_regions to mmap. 
pub fn new( - container: Arc, + vfio_ops: Arc, memory: Arc, mmio_regions: Arc>>, ) -> Self { VfioDmaMapping { - container, + vfio_ops, memory, mmio_regions, } @@ -2062,20 +2062,20 @@ impl ExternalDmaMapping for VfioDmaMapping std::result::Result<(), io::Error> { - self.container + self.vfio_ops .vfio_dma_unmap(iova, size as usize) .map_err(|e| { io::Error::other(format!( - "failed to unmap memory for VFIO container, \ + "failed to unmap memory from the host IOMMU address space, \ iova 0x{iova:x}, size 0x{size:x}: {e:?}" )) }) diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 8dc2399e34..95814cc92d 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -1029,10 +1029,10 @@ pub struct DeviceManager { // Passthrough device handle passthrough_device: Option, - // VFIO container - // Only one container can be created, therefore it is stored as part of the + // VFIO operation instance + // Only one can be created, therefore it is stored as part of the // DeviceManager to be reused. - vfio_container: Option>, + vfio_ops: Option>, // Paravirtualized IOMMU iommu_device: Option>>, @@ -1350,7 +1350,7 @@ impl DeviceManager { msi_interrupt_manager, legacy_interrupt_manager: None, passthrough_device: None, - vfio_container: None, + vfio_ops: None, iommu_device: None, iommu_mapping: None, iommu_attached_devices: None, @@ -3798,7 +3798,7 @@ impl DeviceManager { self.add_vfio_device(device_cfg) } - fn create_vfio_container(&self) -> DeviceManagerResult> { + fn create_vfio_ops(&self) -> DeviceManagerResult> { let passthrough_device = self .passthrough_device .as_ref() @@ -3830,19 +3830,24 @@ impl DeviceManager { let mut needs_dma_mapping = false; - // Here we create a new VFIO container for two reasons. Either this is - // the first VFIO device, meaning we need a new VFIO container, which - // will be shared with other VFIO devices. Or the new VFIO device is - // attached to a vIOMMU, meaning we must create a dedicated VFIO - // container. 
In the vIOMMU use case, we can't let all devices under - // the same VFIO container since we couldn't map/unmap memory for each - // device. That's simply because the map/unmap operations happen at the - // VFIO container level. - let vfio_container = if device_cfg.iommu { - let vfio_container = self.create_vfio_container()?; + // Here we create a new VfioOps for two reasons: + // 1) This is the first VFIO device, meaning we need a new VfioOps + // which will be shared with other VFIO devices. + // 2) The new VFIO device is attached to a vIOMMU, meaning we must + // create a dedicated VfioOps. In the vIOMMU use case, we can't + // let all devices share the same VfioOps since we couldn't + // map/unmap memory for each device independently. That's simply + // because the map/unmap operations happen at the VfioOps level. + // + // Note: this is a limitation of the legacy VFIO interface using + // container/group. The VFIO cdev and iommufd do not have such a + // limitation, and this will be revised once we have VFIO cdev and + // iommufd support. 
+ let vfio_ops = if device_cfg.iommu { + let vfio_ops = self.create_vfio_ops()?; let vfio_mapping = Arc::new(VfioDmaMapping::new( - Arc::clone(&vfio_container), + Arc::clone(&vfio_ops), Arc::new(self.memory_manager.lock().unwrap().guest_memory()), Arc::clone(&self.mmio_regions), )); @@ -3856,22 +3861,20 @@ impl DeviceManager { return Err(DeviceManagerError::MissingVirtualIommu); } - vfio_container - } else if let Some(vfio_container) = &self.vfio_container { - Arc::clone(vfio_container) + vfio_ops + } else if let Some(vfio_ops) = &self.vfio_ops { + Arc::clone(vfio_ops) } else { - let vfio_container = self.create_vfio_container()?; + let vfio_ops = self.create_vfio_ops()?; needs_dma_mapping = true; - self.vfio_container = Some(Arc::clone(&vfio_container)); + self.vfio_ops = Some(Arc::clone(&vfio_ops)); - vfio_container + vfio_ops }; - let vfio_device = VfioDevice::new( - &device_cfg.path, - Arc::clone(&vfio_container) as Arc, - ) - .map_err(DeviceManagerError::VfioCreate)?; + let vfio_device = + VfioDevice::new(&device_cfg.path, Arc::clone(&vfio_ops) as Arc) + .map_err(DeviceManagerError::VfioCreate)?; if needs_dma_mapping { // Register DMA mapping in IOMMU. @@ -3885,7 +3888,7 @@ impl DeviceManager { // to len bytes of valid memory starting at as_ptr() // that will only be freed with munmap(). 
unsafe { - vfio_container.vfio_dma_map( + vfio_ops.vfio_dma_map( region.start_addr().raw_value(), region.len() as usize, region.as_ptr(), @@ -3896,7 +3899,7 @@ impl DeviceManager { } let vfio_mapping = Arc::new(VfioDmaMapping::new( - Arc::clone(&vfio_container), + Arc::clone(&vfio_ops), Arc::new(self.memory_manager.lock().unwrap().guest_memory()), Arc::clone(&self.mmio_regions), )); @@ -3934,7 +3937,7 @@ impl DeviceManager { vfio_name.clone(), self.address_manager.vm.clone(), vfio_device, - vfio_container, + vfio_ops, self.msi_interrupt_manager.clone(), legacy_interrupt_group, device_cfg.iommu, @@ -4513,14 +4516,14 @@ impl DeviceManager { } // Take care of updating the memory for VFIO PCI devices. - if let Some(vfio_container) = &self.vfio_container { + if let Some(vfio_ops) = &self.vfio_ops { // vfio_dma_map is unsound and ought to be marked as unsafe #[allow(unused_unsafe)] // SAFETY: GuestMemoryMmap guarantees that region points // to len bytes of valid memory starting at as_ptr() // that will only be freed with munmap(). unsafe { - vfio_container.vfio_dma_map( + vfio_ops.vfio_dma_map( new_region.start_addr().raw_value(), new_region.len() as usize, new_region.as_ptr(), @@ -4764,7 +4767,7 @@ impl DeviceManager { let (pci_device, bus_device, virtio_device, remove_dma_handler) = match pci_device_handle { // VirtioMemMappingSource::Container cleanup is handled by - // cleanup_vfio_container when the last VFIO device is removed. + // cleanup_vfio_ops when the last VFIO device is removed. PciDeviceHandle::Vfio(vfio_pci_device) => { // Remove this device's MMIO regions from the DeviceManager's // mmio_regions list. 
We match on UserMemoryRegion slot numbers @@ -5154,11 +5157,11 @@ impl DeviceManager { &self.acpi_platform_addresses } - fn cleanup_vfio_container(&mut self) { - // Drop the 'vfio container' instance when "Self" is the only reference - if let Some(1) = self.vfio_container.as_ref().map(Arc::strong_count) { - debug!("Drop 'vfio container' given no active 'vfio devices'."); - self.vfio_container = None; + fn cleanup_vfio_ops(&mut self) { + // Drop the VfioOps instance when "Self" is the only reference + if let Some(1) = self.vfio_ops.as_ref().map(Arc::strong_count) { + debug!("Drop VfioOps given no active VFIO devices."); + self.vfio_ops = None; } } } @@ -5644,7 +5647,7 @@ impl BusDevice for DeviceManager { if let Err(e) = self.eject_device(self.selected_segment as u16, slot_id as u8) { error!("Failed ejecting device {slot_id}: {e:?}"); } - self.cleanup_vfio_container(); + self.cleanup_vfio_ops(); slot_bitmap &= !(1 << slot_id); } } From 9156758828dd3774abbad245ce7fb8e2e9dc08db Mon Sep 17 00:00:00 2001 From: Saravanan D Date: Wed, 25 Mar 2026 21:44:33 -0700 Subject: [PATCH 338/742] pci: clamp sparse mmap holes to physical BAR For VFIO devices with non page aligned MSI-X offsets, fixup_msix_region() relocates MSI-X table and PBA offsets into an enlarged virtual BAR by mutating msix.cap in place. generate_sparse_areas() later reads those relocated offsets to carve mmap holes, but receives the physical BAR size as region_size. The relocated offsets exceed the physical BAR boundary, and the kernel rejects the mmap with EINVAL. Guard inter_ranges insertion with an offset < region_size check so relocated entries are skipped. The full physical BAR is mmapped as a single region. The relocated MSI-X in the upper half of the virtual BAR remains trapped because it has no mmap backing. Linux kernel commit a32295c612c5 ("vfio-pci: Allow mapping MSIX BAR") allows mmapping the entire BAR including the MSI-X region when VFIO_REGION_INFO_CAP_MSIX_MAPPABLE is advertised. 
The actual security guarantees come from IOMMU isolation and interrupt remapping, not from filtering MSI-X table accesses. QEMU follows the same pattern, mmapping the entire physical BAR when MsixMappable is present. Fixes: #7898 Signed-off-by: Saravanan D --- pci/src/vfio.rs | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/pci/src/vfio.rs b/pci/src/vfio.rs index dbcf59fbe8..f1d22ff63c 100644 --- a/pci/src/vfio.rs +++ b/pci/src/vfio.rs @@ -1562,13 +1562,33 @@ impl VfioPciDevice { let (offset, size) = msix.cap.table_range(); let offset = align_page_size_down(offset); let size = align_page_size_up(size); - inter_ranges.insert(offset, size); + // MSI-X mmap region safety: when a device has a non page + // aligned MSI-X offset, fixup_msix_region() relocates MSI-X + // to the upper half of an enlarged virtual BAR, causing the + // offsets in msix.cap to exceed the physical BAR size. This + // check skips carving a hole, preventing invalid offsets from + // reaching the mmap path. With no holes, + // generate_sparse_areas() returns a single sparse region + // covering the entire physical BAR. The relocated MSI-X in + // the virtual BAR remains trapped because its upper half has + // no mmap backing. Exposing the physical MSI-X region through + // mmap is safe when the kernel advertises + // VFIO_REGION_INFO_CAP_MSIX_MAPPABLE. When MSI-X offsets are + // already page aligned, fixup_msix_region() does not relocate + // and this check is satisfied, so a hole is carved at the + // intended offset as before. + if offset < region_size { + inter_ranges.insert(offset, size); + } } if region_index == msix.cap.pba_bir() { let (offset, size) = msix.cap.pba_range(); let offset = align_page_size_down(offset); let size = align_page_size_up(size); - inter_ranges.insert(offset, size); + // See MSI-X mmap safety comment above. 
+ if offset < region_size { + inter_ranges.insert(offset, size); + } } } From c5dba9ea3b73d21d61e98d293dd41b2f4a2f68cc Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 25 Mar 2026 22:44:43 +0100 Subject: [PATCH 339/742] vmm: seccomp: Allow BLKGETSIZE64 ioctl Add BLKGETSIZE64 0x80081272 to the VMM seccomp ioctl allow list alongside the existing BLK* ioctls. This is needed for querying block device size without seeking. Signed-off-by: Anatol Belski --- vmm/src/seccomp_filters.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vmm/src/seccomp_filters.rs b/vmm/src/seccomp_filters.rs index d295761518..97f020e650 100644 --- a/vmm/src/seccomp_filters.rs +++ b/vmm/src/seccomp_filters.rs @@ -110,9 +110,10 @@ mod kvm { pub const KVM_SET_NESTED_STATE: u64 = 1082175167; } -// Block device ioctls for sparse support probing (not exported by libc) +// Block device ioctls (not exported by libc) const BLKDISCARD: u64 = 0x1277; // _IO(0x12, 119) const BLKZEROOUT: u64 = 0x127f; // _IO(0x12, 127) +const BLKGETSIZE64: u64 = 0x80081272; // _IOR(0x12, 114, size_t) // MSHV IOCTL code. This is unstable until the kernel code has been declared stable. 
#[cfg(feature = "mshv")] @@ -265,6 +266,7 @@ fn create_vmm_ioctl_seccomp_rule_common( and![Cond::new(1, ArgLen::Dword, Eq, BLKPBSZGET as _)?], and![Cond::new(1, ArgLen::Dword, Eq, BLKIOMIN as _)?], and![Cond::new(1, ArgLen::Dword, Eq, BLKIOOPT as _)?], + and![Cond::new(1, ArgLen::Dword, Eq, BLKGETSIZE64 as _)?], and![Cond::new(1, ArgLen::Dword, Eq, BLKDISCARD as _)?], and![Cond::new(1, ArgLen::Dword, Eq, BLKZEROOUT as _)?], and![Cond::new(1, ArgLen::Dword, Eq, FIOCLEX as _)?], From 051a6eff5c563e5f4787b37b848212d914226dea Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 25 Mar 2026 22:45:44 +0100 Subject: [PATCH 340/742] block: Add query_device_size() for file and block device size Add a shared helper that returns the logical size in bytes for both regular files and block devices using an immutable &File reference. Regular files use metadata().len(). Block devices use the BLKGETSIZE64 ioctl. Any other file type returns an InvalidInput error. This avoids seek(SeekFrom::End(0)) which requires &mut self and can return incorrect results for block devices when the file position is in an unexpected state. 
Signed-off-by: Anatol Belski --- block/src/lib.rs | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/block/src/lib.rs b/block/src/lib.rs index 0640611c78..2c07276e23 100644 --- a/block/src/lib.rs +++ b/block/src/lib.rs @@ -38,6 +38,7 @@ use std::fmt::{self, Debug}; use std::fs::{File, OpenOptions}; use std::io::{self, IoSlice, IoSliceMut, Read, Seek, SeekFrom, Write}; use std::os::linux::fs::MetadataExt; +use std::os::unix::fs::FileTypeExt; use std::os::unix::io::AsRawFd; use std::path::Path; use std::str::FromStr; @@ -61,7 +62,7 @@ use vm_memory::{ }; use vm_virtio::{AccessPlatform, Translatable}; use vmm_sys_util::eventfd::EventFd; -use vmm_sys_util::{aio, ioctl_io_nr}; +use vmm_sys_util::{aio, ioctl_io_nr, ioctl_ior_nr}; use crate::async_io::{AsyncIo, AsyncIoError, AsyncIoResult}; use crate::error::{BlockError, BlockErrorKind, BlockResult, ErrorOp}; @@ -1193,6 +1194,36 @@ ioctl_io_nr!(BLKSSZGET, 0x12, 104); ioctl_io_nr!(BLKPBSZGET, 0x12, 123); ioctl_io_nr!(BLKIOMIN, 0x12, 120); ioctl_io_nr!(BLKIOOPT, 0x12, 121); +ioctl_ior_nr!(BLKGETSIZE64, 0x12, 114, u64); + +/// Returns `(logical_size, physical_size)` in bytes for regular files and block devices. +/// +/// For regular files, logical size is `st_size` and physical size is +/// `st_blocks * 512` (actual host allocation). For block devices both +/// values equal the `BLKGETSIZE64` result. +pub fn query_device_size(file: &File) -> io::Result<(u64, u64)> { + let m = file.metadata()?; + if m.is_file() { + // st_blocks is always in 512-byte units on Linux + Ok((m.len(), m.st_blocks() * 512)) + } else if m.file_type().is_block_device() { + let mut size: u64 = 0; + // SAFETY: BLKGETSIZE64 reads the device size into a u64 pointer. 
+ let ret = unsafe { libc::ioctl(file.as_raw_fd(), BLKGETSIZE64() as _, &mut size) }; + if ret != 0 { + return Err(io::Error::last_os_error()); + } + Ok((size, size)) + } else { + Err(io::Error::new( + io::ErrorKind::InvalidInput, + format!( + "disk image must be a regular file or block device, is: {:?}", + m.file_type() + ), + )) + } +} #[derive(Copy, Clone)] enum BlockSize { From f14e2d2b40c23e7d133b120ae5c535d1422120d5 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 25 Mar 2026 23:20:57 +0100 Subject: [PATCH 341/742] block: raw_async: Use query_device_size() for size queries Use query_device_size() instead of seek(End(0)) and metadata().len() to correctly handle block device and regular file handles. Signed-off-by: Anatol Belski --- block/src/raw_async.rs | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/block/src/raw_async.rs b/block/src/raw_async.rs index 8544040f50..4efff45e9c 100644 --- a/block/src/raw_async.rs +++ b/block/src/raw_async.rs @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause use std::fs::File; -use std::io::{Error, Seek, SeekFrom}; +use std::io::Error; use std::os::unix::io::{AsRawFd, RawFd}; use io_uring::{IoUring, opcode, types}; @@ -14,7 +14,9 @@ use vmm_sys_util::eventfd::EventFd; use crate::async_io::{ AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFile, DiskFileError, DiskFileResult, }; -use crate::{BatchRequest, DiskTopology, RequestType, SECTOR_SIZE, probe_sparse_support}; +use crate::{ + BatchRequest, DiskTopology, RequestType, SECTOR_SIZE, probe_sparse_support, query_device_size, +}; pub struct RawFileDisk { file: File, @@ -28,16 +30,15 @@ impl RawFileDisk { impl DiskFile for RawFileDisk { fn logical_size(&mut self) -> DiskFileResult { - self.file - .seek(SeekFrom::End(0)) - .map_err(DiskFileError::Size) + Ok(query_device_size(&self.file) + .map_err(DiskFileError::Size)? 
+ .0) } fn physical_size(&mut self) -> DiskFileResult { - self.file - .metadata() - .map(|m| m.len()) - .map_err(DiskFileError::Size) + Ok(query_device_size(&self.file) + .map_err(DiskFileError::Size)? + .1) } fn new_async_io(&self, ring_depth: u32) -> DiskFileResult> { From 290f57a6e3aaf470a75a95ebd4f67947b7b406bd Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 25 Mar 2026 23:21:18 +0100 Subject: [PATCH 342/742] block: raw_sync: Use query_device_size() for size queries Use query_device_size() instead of seek(End(0)) and metadata().len() to correctly handle block device and regular file handles. Signed-off-by: Anatol Belski --- block/src/raw_sync.rs | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/block/src/raw_sync.rs b/block/src/raw_sync.rs index b9f89dde05..c045c5942d 100644 --- a/block/src/raw_sync.rs +++ b/block/src/raw_sync.rs @@ -4,7 +4,6 @@ use std::collections::VecDeque; use std::fs::File; -use std::io::{Seek, SeekFrom}; use std::os::unix::io::{AsRawFd, RawFd}; use libc::{FALLOC_FL_KEEP_SIZE, FALLOC_FL_PUNCH_HOLE, FALLOC_FL_ZERO_RANGE}; @@ -14,7 +13,7 @@ use vmm_sys_util::eventfd::EventFd; use crate::async_io::{ AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFile, DiskFileError, DiskFileResult, }; -use crate::{DiskTopology, SECTOR_SIZE, probe_sparse_support}; +use crate::{DiskTopology, SECTOR_SIZE, probe_sparse_support, query_device_size}; pub struct RawFileDiskSync { file: File, @@ -28,16 +27,15 @@ impl RawFileDiskSync { impl DiskFile for RawFileDiskSync { fn logical_size(&mut self) -> DiskFileResult { - self.file - .seek(SeekFrom::End(0)) - .map_err(DiskFileError::Size) + Ok(query_device_size(&self.file) + .map_err(DiskFileError::Size)? + .0) } fn physical_size(&mut self) -> DiskFileResult { - self.file - .metadata() - .map(|m| m.len()) - .map_err(DiskFileError::Size) + Ok(query_device_size(&self.file) + .map_err(DiskFileError::Size)? 
+ .1) } fn new_async_io(&self, _ring_depth: u32) -> DiskFileResult> { From 1b92af5534e7b8d2207fbb33e536dc5b35a17d32 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 25 Mar 2026 23:21:36 +0100 Subject: [PATCH 343/742] block: raw_async_aio: Use query_device_size() for size queries Use query_device_size() instead of seek(End(0)) and metadata().len() to correctly handle block device and regular file handles. Signed-off-by: Anatol Belski --- block/src/raw_async_aio.rs | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/block/src/raw_async_aio.rs b/block/src/raw_async_aio.rs index c2e6a174e0..66f7a667bf 100644 --- a/block/src/raw_async_aio.rs +++ b/block/src/raw_async_aio.rs @@ -7,7 +7,6 @@ use std::collections::VecDeque; use std::fs::File; -use std::io::{Seek, SeekFrom}; use std::os::unix::io::{AsRawFd, RawFd}; use libc::{FALLOC_FL_KEEP_SIZE, FALLOC_FL_PUNCH_HOLE, FALLOC_FL_ZERO_RANGE}; @@ -18,7 +17,7 @@ use vmm_sys_util::eventfd::EventFd; use crate::async_io::{ AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFile, DiskFileError, DiskFileResult, }; -use crate::{DiskTopology, SECTOR_SIZE, probe_sparse_support}; +use crate::{DiskTopology, SECTOR_SIZE, probe_sparse_support, query_device_size}; pub struct RawFileDiskAio { file: File, @@ -32,16 +31,15 @@ impl RawFileDiskAio { impl DiskFile for RawFileDiskAio { fn logical_size(&mut self) -> DiskFileResult { - self.file - .seek(SeekFrom::End(0)) - .map_err(DiskFileError::Size) + Ok(query_device_size(&self.file) + .map_err(DiskFileError::Size)? + .0) } fn physical_size(&mut self) -> DiskFileResult { - self.file - .metadata() - .map(|m| m.len()) - .map_err(DiskFileError::Size) + Ok(query_device_size(&self.file) + .map_err(DiskFileError::Size)? 
+ .1) } fn new_async_io(&self, ring_depth: u32) -> DiskFileResult> { From 8b6eb83f2a833cf721edacdf5d286bbc63d9730a Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 25 Mar 2026 23:21:45 +0100 Subject: [PATCH 344/742] block: qcow: raw_file: Use query_device_size() for size queries Use query_device_size() instead of metadata().len() to correctly handle block device and regular file handles. Signed-off-by: Anatol Belski --- block/src/qcow/raw_file.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/block/src/qcow/raw_file.rs b/block/src/qcow/raw_file.rs index 06ec4975f4..56ec797355 100644 --- a/block/src/qcow/raw_file.rs +++ b/block/src/qcow/raw_file.rs @@ -20,7 +20,7 @@ use vmm_sys_util::file_traits::FileSync; use vmm_sys_util::seek_hole::SeekHole; use vmm_sys_util::write_zeroes::{PunchHole, WriteZeroesAt}; -use crate::BlockBackend; +use crate::{BlockBackend, query_device_size}; #[derive(Debug)] pub struct RawFile { @@ -374,11 +374,15 @@ impl SeekHole for RawFile { impl BlockBackend for RawFile { fn logical_size(&self) -> std::result::Result { - Ok(self.metadata().map_err(crate::Error::RawFileError)?.len()) + Ok(query_device_size(&self.file) + .map_err(crate::Error::RawFileError)? + .0) } fn physical_size(&self) -> std::result::Result { - Ok(self.metadata().map_err(crate::Error::RawFileError)?.len()) + Ok(query_device_size(&self.file) + .map_err(crate::Error::RawFileError)? + .1) } } From a21b9588ecd2b9530892ba94b05a839e14ddf236 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 25 Mar 2026 23:32:25 +0100 Subject: [PATCH 345/742] virtio-devices: block: Use logical_size() for advisory lock range Use logical_size() instead of physical_size() for the byte-range advisory lock. physical_size() returns st_blocks*512 which is the actual host allocation and can be smaller than the guest visible extent on sparse files, leaving part of the range unprotected. 
Signed-off-by: Anatol Belski --- virtio-devices/src/block.rs | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/virtio-devices/src/block.rs b/virtio-devices/src/block.rs index 47309dd877..a38a378d1b 100644 --- a/virtio-devices/src/block.rs +++ b/virtio-devices/src/block.rs @@ -8,6 +8,7 @@ // // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause +use std::cmp::max; use std::collections::{BTreeMap, HashMap, VecDeque}; use std::num::Wrapping; use std::ops::Deref; @@ -887,10 +888,16 @@ impl Block { match self.lock_granularity_choice { LockGranularityChoice::Full => LockGranularity::WholeFile, LockGranularityChoice::ByteRange => { - // Byte-range lock covering [0, size) - self.disk_image.physical_size().map_or_else( - // use a safe fallback - |e| { + // Byte range lock covering [0, max(logical, physical)) + // logical > physical for sparse files, physical > logical + // for small dense files due to filesystem block rounding. + let logical = self.disk_image.logical_size(); + let physical = self.disk_image.physical_size(); + match (logical, physical) { + (Ok(l), Ok(p)) => LockGranularity::ByteRange(0, max(l, p)), + (Ok(l), Err(_)) => LockGranularity::ByteRange(0, l), + (Err(_), Ok(p)) => LockGranularity::ByteRange(0, p), + (Err(e), Err(_)) => { let fallback = LockGranularity::WholeFile; warn!( "Can't get disk size for id={},path={}, falling back to {:?}: error: {e}", @@ -899,9 +906,8 @@ impl Block { fallback ); fallback - }, - |size| LockGranularity::ByteRange(0, size), - ) + } + } } } } From c21d6bb85635b0799ec53e69de16bc11d2302fe5 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 26 Mar 2026 09:27:38 +0100 Subject: [PATCH 346/742] block: Add unit tests for query_device_size() Test regular file with odd size (not page/sector aligned), sparse file with punch hole verifying physical < logical, and char device rejection, etc. 
Signed-off-by: Anatol Belski --- block/src/lib.rs | 64 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/block/src/lib.rs b/block/src/lib.rs index 2c07276e23..d62e2717b2 100644 --- a/block/src/lib.rs +++ b/block/src/lib.rs @@ -1473,4 +1473,68 @@ mod unit_tests { // SAFETY: buf was allocated with this layout via alloc_zeroed. unsafe { dealloc(buf, layout) }; } + + #[test] + fn test_query_device_size_regular_file() { + let temp_file = TempFile::new().unwrap(); + let mut f = temp_file.into_file(); + // 5 sectors + 13 extra bytes - not page aligned, not sectoraligned + f.write_all(&[0xAB; 5 * 512 + 13]).unwrap(); + f.sync_all().unwrap(); + + let (logical, physical) = query_device_size(&f).unwrap(); + assert_eq!(logical, 5 * 512 + 13); + assert!(physical > 0); + } + + #[test] + fn test_query_device_size_sparse_file_punch_hole() { + let temp_file = TempFile::new().unwrap(); + let f = temp_file.as_file(); + // Allocate 1 MiB + let size: i64 = 1 << 20; + f.set_len(size as u64).unwrap(); + // SAFETY: fd is valid, range is within file size. + let ret = unsafe { + libc::fallocate( + f.as_raw_fd(), + 0, // allocate + 0, + size, + ) + }; + assert_eq!(ret, 0, "fallocate failed: {}", io::Error::last_os_error()); + f.sync_all().unwrap(); + + let (log_before, phys_before) = query_device_size(f).unwrap(); + assert_eq!(log_before, size as u64); + assert_eq!(phys_before, size as u64); + + // Punch a hole in the middle 512 KiB + // SAFETY: fd is valid, range is within file size. 
+ let ret = unsafe { + libc::fallocate( + f.as_raw_fd(), + libc::FALLOC_FL_PUNCH_HOLE | libc::FALLOC_FL_KEEP_SIZE, + size / 4, + size / 2, + ) + }; + assert_eq!(ret, 0, "punch hole failed: {}", io::Error::last_os_error()); + f.sync_all().unwrap(); + + let (logical, physical) = query_device_size(f).unwrap(); + assert_eq!(logical, size as u64, "logical size must not change"); + assert!( + physical < logical, + "physical ({physical}) should be less than logical ({logical}) after punch hole" + ); + } + + #[test] + fn test_query_device_size_rejects_char_device() { + let f = std::fs::File::open("/dev/zero").unwrap(); + let err = query_device_size(&f).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidInput); + } } From e1e6d0a25b1a0feb6987af17d9adef6f7b667efb Mon Sep 17 00:00:00 2001 From: Alyssa Ross Date: Fri, 27 Mar 2026 11:09:33 +0100 Subject: [PATCH 347/742] vmm: fix rebooting with landlock and pty console When landlock support was added, creation of file descriptors was moved out into a function called pre_create_console_devices, with the idea being that this could be run before Landlock rules are applied and access to all the necessary paths are dropped. This idea didn't take reboots into account, though. When a VM is rebooted, pre_create_console_devices is called again, but now the Landlock rules have been applied, so they need to allow access to all those paths anyway. I imagine the way this was intended to work was that file descriptors would be preserved across reboot, but that's not currently the case, and it's not a trivial change to make because they get dropped when the VM is destroyed. Longer term it would be ideal if Cloud Hypervisor's implementation was more focused on file descriptors than paths[1], and if created VMs only took references to file descriptors, so they were easily preserved across reboots. 
Fixes: b3e5738b4 ("vmm: Introduce ApplyLandlock trait") Closes: https://github.com/cloud-hypervisor/cloud-hypervisor/issues/7547 Link: https://github.com/cloud-hypervisor/cloud-hypervisor/issues/7704 [1] Signed-off-by: Alyssa Ross --- vmm/src/vm_config.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/vmm/src/vm_config.rs b/vmm/src/vm_config.rs index d453ead2d9..88f8af4acf 100644 --- a/vmm/src/vm_config.rs +++ b/vmm/src/vm_config.rs @@ -553,6 +553,10 @@ pub fn default_consoleconfig_file() -> Option { impl ApplyLandlock for ConsoleConfig { fn apply_landlock(&self, landlock: &mut Landlock) -> LandlockResult<()> { + if self.mode == ConsoleOutputMode::Pty { + landlock.add_rule_with_access(Path::new("/dev/pts"), "rw")?; + landlock.add_rule_with_access(Path::new("/dev/ptmx"), "rw")?; + } if let Some(file) = &self.file { landlock.add_rule_with_access(file, "rw")?; } @@ -586,6 +590,10 @@ impl Default for DebugConsoleConfig { #[cfg(target_arch = "x86_64")] impl ApplyLandlock for DebugConsoleConfig { fn apply_landlock(&self, landlock: &mut Landlock) -> LandlockResult<()> { + if self.mode == ConsoleOutputMode::Pty { + landlock.add_rule_with_access(Path::new("/dev/pts"), "rw")?; + landlock.add_rule_with_access(Path::new("/dev/ptmx"), "rw")?; + } if let Some(file) = &self.file { landlock.add_rule_with_access(file, "rw")?; } From a77f89847ec1e0f172a6e95b7a6d89aa6d4dc958 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Thu, 26 Mar 2026 08:48:32 -0700 Subject: [PATCH 348/742] virtio-devices: vhost_user: Consolidate device State into VhostUserState Each vhost-user device type had near identical State structs. Deduplicate those by introducing a new common struct (and parameterising it when it needs to embed a config member.) This will make it easier to reuse more code in the future and to add more struct members to handle the requirements of snapshot/restore. These changes have been designed to have no impact on the existing snapshot/restore state. 
Signed-off-by: Rob Bradford --- virtio-devices/src/vhost_user/blk.rs | 13 +++--------- virtio-devices/src/vhost_user/fs.rs | 14 +++---------- .../src/vhost_user/generic_vhost_user.rs | 13 +++--------- virtio-devices/src/vhost_user/mod.rs | 20 +++++++++++++++++-- virtio-devices/src/vhost_user/net.rs | 13 +++--------- 5 files changed, 30 insertions(+), 43 deletions(-) diff --git a/virtio-devices/src/vhost_user/blk.rs b/virtio-devices/src/vhost_user/blk.rs index 9125e79909..66958e32c3 100644 --- a/virtio-devices/src/vhost_user/blk.rs +++ b/virtio-devices/src/vhost_user/blk.rs @@ -9,7 +9,6 @@ use block::VirtioBlockConfig; use event_monitor::event; use log::{error, info}; use seccompiler::SeccompAction; -use serde::{Deserialize, Serialize}; use vhost::vhost_user::message::{ VhostUserConfigFlags, VhostUserProtocolFeatures, VhostUserVirtioFeatures, }; @@ -29,19 +28,12 @@ use super::vu_common_ctrl::{VhostUserConfig, VhostUserHandle}; use super::{DEFAULT_VIRTIO_FEATURES, Error, Result}; use crate::seccomp_filters::Thread; use crate::thread_helper::spawn_virtio_thread; -use crate::vhost_user::VhostUserCommon; +use crate::vhost_user::{VhostUserCommon, VhostUserState}; use crate::{GuestMemoryMmap, GuestRegionMmap, VIRTIO_F_ACCESS_PLATFORM, VirtioInterrupt}; const DEFAULT_QUEUE_NUMBER: usize = 1; -#[derive(Serialize, Deserialize)] -pub struct State { - pub avail_features: u64, - pub acked_features: u64, - pub config: VirtioBlockConfig, - pub acked_protocol_features: u64, - pub vu_num_queues: usize, -} +pub type State = VhostUserState; struct BackendReqHandler {} impl VhostUserFrontendReqHandler for BackendReqHandler {} @@ -206,6 +198,7 @@ impl Blk { config: self.config, acked_protocol_features: self.vu_common.acked_protocol_features, vu_num_queues: self.vu_common.vu_num_queues, + ..Default::default() } } } diff --git a/virtio-devices/src/vhost_user/fs.rs b/virtio-devices/src/vhost_user/fs.rs index fb21105c8a..f9f819c0cc 100644 --- a/virtio-devices/src/vhost_user/fs.rs +++ 
b/virtio-devices/src/vhost_user/fs.rs @@ -22,7 +22,7 @@ use super::vu_common_ctrl::VhostUserHandle; use super::{DEFAULT_VIRTIO_FEATURES, Error, Result}; use crate::seccomp_filters::Thread; use crate::thread_helper::spawn_virtio_thread; -use crate::vhost_user::VhostUserCommon; +use crate::vhost_user::{VhostUserCommon, VhostUserState}; use crate::{ ActivateResult, GuestMemoryMmap, GuestRegionMmap, MmapRegion, VIRTIO_F_ACCESS_PLATFORM, VirtioCommon, VirtioDevice, VirtioDeviceType, VirtioInterrupt, VirtioSharedMemoryList, @@ -31,15 +31,7 @@ use crate::{ const NUM_QUEUE_OFFSET: usize = 1; const DEFAULT_QUEUE_NUMBER: usize = 2; -#[derive(Serialize, Deserialize)] -pub struct State { - pub avail_features: u64, - pub acked_features: u64, - pub config: VirtioFsConfig, - pub acked_protocol_features: u64, - pub vu_num_queues: usize, - pub backend_req_support: bool, -} +pub type State = VhostUserState; struct BackendReqHandler {} impl VhostUserFrontendReqHandler for BackendReqHandler {} @@ -215,7 +207,7 @@ impl Fs { config: self.config, acked_protocol_features: self.vu_common.acked_protocol_features, vu_num_queues: self.vu_common.vu_num_queues, - backend_req_support: false, + ..Default::default() } } } diff --git a/virtio-devices/src/vhost_user/generic_vhost_user.rs b/virtio-devices/src/vhost_user/generic_vhost_user.rs index b90c6c079d..d0880b02e1 100644 --- a/virtio-devices/src/vhost_user/generic_vhost_user.rs +++ b/virtio-devices/src/vhost_user/generic_vhost_user.rs @@ -9,7 +9,6 @@ use std::{result, thread}; use event_monitor::event; use log::{error, info, warn}; use seccompiler::SeccompAction; -use serde::{Deserialize, Serialize}; use vhost::vhost_user::message::{ VhostUserConfigFlags, VhostUserProtocolFeatures, VhostUserVirtioFeatures, }; @@ -24,20 +23,13 @@ use super::vu_common_ctrl::VhostUserHandle; use super::{Error, Result}; use crate::seccomp_filters::Thread; use crate::thread_helper::spawn_virtio_thread; -use crate::vhost_user::VhostUserCommon; +use 
crate::vhost_user::{VhostUserCommon, VhostUserState}; use crate::{ ActivateResult, GuestMemoryMmap, GuestRegionMmap, MmapRegion, VIRTIO_F_ACCESS_PLATFORM, VirtioCommon, VirtioDevice, VirtioInterrupt, VirtioSharedMemoryList, }; -#[derive(Serialize, Deserialize)] -pub struct State { - pub avail_features: u64, - pub acked_features: u64, - pub acked_protocol_features: u64, - pub vu_num_queues: usize, - pub backend_req_support: bool, -} +pub type State = VhostUserState<()>; struct BackendReqHandler {} impl VhostUserFrontendReqHandler for BackendReqHandler {} @@ -171,6 +163,7 @@ since the backend only supports {backend_num_queues}\n", acked_protocol_features: self.vu_common.acked_protocol_features, vu_num_queues: self.vu_common.vu_num_queues, backend_req_support: false, + ..Default::default() } } diff --git a/virtio-devices/src/vhost_user/mod.rs b/virtio-devices/src/vhost_user/mod.rs index aca9aba113..8632125391 100644 --- a/virtio-devices/src/vhost_user/mod.rs +++ b/virtio-devices/src/vhost_user/mod.rs @@ -295,6 +295,22 @@ impl EpollHelperHandler for VhostUserEpollHandle } } +/// Common snapshot state for all vhost-user device types. +/// +/// Generic over `C` which is the device-specific config type +/// (e.g. VirtioBlockConfig, VirtioFsConfig, VirtioNetConfig). +/// Devices without a config type use `()`. 
+#[derive(Default, Serialize, Deserialize)] +pub struct VhostUserState { + pub avail_features: u64, + pub acked_features: u64, + pub config: C, + pub acked_protocol_features: u64, + pub vu_num_queues: usize, + #[serde(default)] + pub backend_req_support: bool, +} + #[derive(Default)] pub struct VhostUserCommon { pub vu: Option>>, @@ -441,9 +457,9 @@ impl VhostUserCommon { Ok(()) } - pub fn snapshot<'a, T>(&mut self, state: &T) -> std::result::Result + pub fn snapshot(&mut self, state: &T) -> std::result::Result where - T: Serialize + Deserialize<'a>, + T: Serialize, { let snapshot = Snapshot::new_from_state(state)?; diff --git a/virtio-devices/src/vhost_user/net.rs b/virtio-devices/src/vhost_user/net.rs index 165115ae7e..d6d142ba6d 100644 --- a/virtio-devices/src/vhost_user/net.rs +++ b/virtio-devices/src/vhost_user/net.rs @@ -9,7 +9,6 @@ use event_monitor::event; use log::{error, info}; use net_util::{CtrlQueue, MacAddr, VirtioNetConfig, build_net_config_space}; use seccompiler::SeccompAction; -use serde::{Deserialize, Serialize}; use vhost::vhost_user::message::{VhostUserProtocolFeatures, VhostUserVirtioFeatures}; use vhost::vhost_user::{FrontendReqHandler, VhostUserFrontend, VhostUserFrontendReqHandler}; use virtio_bindings::virtio_net::{ @@ -28,7 +27,7 @@ use vmm_sys_util::eventfd::EventFd; use crate::seccomp_filters::Thread; use crate::thread_helper::spawn_virtio_thread; use crate::vhost_user::vu_common_ctrl::{VhostUserConfig, VhostUserHandle}; -use crate::vhost_user::{DEFAULT_VIRTIO_FEATURES, Error, Result, VhostUserCommon}; +use crate::vhost_user::{DEFAULT_VIRTIO_FEATURES, Error, Result, VhostUserCommon, VhostUserState}; use crate::{ ActivateResult, GuestMemoryMmap, GuestRegionMmap, NetCtrlEpollHandler, VIRTIO_F_ACCESS_PLATFORM, VirtioCommon, VirtioDevice, VirtioDeviceType, VirtioInterrupt, @@ -36,14 +35,7 @@ use crate::{ const DEFAULT_QUEUE_NUMBER: usize = 2; -#[derive(Serialize, Deserialize)] -pub struct State { - pub avail_features: u64, - pub 
acked_features: u64, - pub config: VirtioNetConfig, - pub acked_protocol_features: u64, - pub vu_num_queues: usize, -} +pub type State = VhostUserState; struct BackendReqHandler {} impl VhostUserFrontendReqHandler for BackendReqHandler {} @@ -235,6 +227,7 @@ impl Net { config: self.config, acked_protocol_features: self.vu_common.acked_protocol_features, vu_num_queues: self.vu_common.vu_num_queues, + ..Default::default() } } } From bd56214d5453f03ca3d98323a8c41647c0e4c3a1 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Thu, 26 Mar 2026 08:48:41 -0700 Subject: [PATCH 349/742] virtio-devices: vhost_user: Add VhostUserCommon::state() helper With a common state structure for all vhost-user devices the state() methods can also be refactored for reuse. This will make it easier to add new common fields in the future for snapshot/restore. Signed-off-by: Rob Bradford --- virtio-devices/src/vhost_user/blk.rs | 9 +-------- virtio-devices/src/vhost_user/fs.rs | 9 +-------- virtio-devices/src/vhost_user/generic_vhost_user.rs | 9 +-------- virtio-devices/src/vhost_user/mod.rs | 11 +++++++++++ virtio-devices/src/vhost_user/net.rs | 9 +-------- 5 files changed, 15 insertions(+), 32 deletions(-) diff --git a/virtio-devices/src/vhost_user/blk.rs b/virtio-devices/src/vhost_user/blk.rs index 66958e32c3..7b574a583e 100644 --- a/virtio-devices/src/vhost_user/blk.rs +++ b/virtio-devices/src/vhost_user/blk.rs @@ -192,14 +192,7 @@ impl Blk { } fn state(&self) -> State { - State { - avail_features: self.common.avail_features, - acked_features: self.common.acked_features, - config: self.config, - acked_protocol_features: self.vu_common.acked_protocol_features, - vu_num_queues: self.vu_common.vu_num_queues, - ..Default::default() - } + self.vu_common.state(&self.common, self.config) } } diff --git a/virtio-devices/src/vhost_user/fs.rs b/virtio-devices/src/vhost_user/fs.rs index f9f819c0cc..3ae5262649 100644 --- a/virtio-devices/src/vhost_user/fs.rs +++ b/virtio-devices/src/vhost_user/fs.rs 
@@ -201,14 +201,7 @@ impl Fs { } fn state(&self) -> State { - State { - avail_features: self.common.avail_features, - acked_features: self.common.acked_features, - config: self.config, - acked_protocol_features: self.vu_common.acked_protocol_features, - vu_num_queues: self.vu_common.vu_num_queues, - ..Default::default() - } + self.vu_common.state(&self.common, self.config) } } diff --git a/virtio-devices/src/vhost_user/generic_vhost_user.rs b/virtio-devices/src/vhost_user/generic_vhost_user.rs index d0880b02e1..2110e6842e 100644 --- a/virtio-devices/src/vhost_user/generic_vhost_user.rs +++ b/virtio-devices/src/vhost_user/generic_vhost_user.rs @@ -157,14 +157,7 @@ since the backend only supports {backend_num_queues}\n", } fn state(&self) -> State { - State { - avail_features: self.common.avail_features, - acked_features: self.common.acked_features, - acked_protocol_features: self.vu_common.acked_protocol_features, - vu_num_queues: self.vu_common.vu_num_queues, - backend_req_support: false, - ..Default::default() - } + self.vu_common.state(&self.common, ()) } #[cold] diff --git a/virtio-devices/src/vhost_user/mod.rs b/virtio-devices/src/vhost_user/mod.rs index 8632125391..811ff92e11 100644 --- a/virtio-devices/src/vhost_user/mod.rs +++ b/virtio-devices/src/vhost_user/mod.rs @@ -457,6 +457,17 @@ impl VhostUserCommon { Ok(()) } + pub fn state(&self, common: &crate::VirtioCommon, config: C) -> VhostUserState { + VhostUserState { + avail_features: common.avail_features, + acked_features: common.acked_features, + config, + acked_protocol_features: self.acked_protocol_features, + vu_num_queues: self.vu_num_queues, + ..Default::default() + } + } + pub fn snapshot(&mut self, state: &T) -> std::result::Result where T: Serialize, diff --git a/virtio-devices/src/vhost_user/net.rs b/virtio-devices/src/vhost_user/net.rs index d6d142ba6d..2145691a00 100644 --- a/virtio-devices/src/vhost_user/net.rs +++ b/virtio-devices/src/vhost_user/net.rs @@ -221,14 +221,7 @@ impl Net { } fn 
state(&self) -> State { - State { - avail_features: self.common.avail_features, - acked_features: self.common.acked_features, - config: self.config, - acked_protocol_features: self.vu_common.acked_protocol_features, - vu_num_queues: self.vu_common.vu_num_queues, - ..Default::default() - } + self.vu_common.state(&self.common, self.config) } } From 3e233af654e18acc4422e69cd375367d3b49bf38 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Thu, 26 Mar 2026 08:49:29 -0700 Subject: [PATCH 350/742] virtio-devices: vhost_user: Access acked_protocol_features directly In GenericVhostUser's read_config and write_config, access vu_common.acked_protocol_features directly instead of going through the state() method which creates a struct. This removes creating the struct just to access two fields that are already directly accessible. Signed-off-by: Rob Bradford --- virtio-devices/src/vhost_user/generic_vhost_user.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/virtio-devices/src/vhost_user/generic_vhost_user.rs b/virtio-devices/src/vhost_user/generic_vhost_user.rs index 2110e6842e..368f46982b 100644 --- a/virtio-devices/src/vhost_user/generic_vhost_user.rs +++ b/virtio-devices/src/vhost_user/generic_vhost_user.rs @@ -213,7 +213,8 @@ impl VirtioDevice for GenericVhostUser { } fn read_config(&self, offset: u64, data: &mut [u8]) { - if (VhostUserProtocolFeatures::CONFIG.bits() & self.state().acked_protocol_features) == 0 { + if (VhostUserProtocolFeatures::CONFIG.bits() & self.vu_common.acked_protocol_features) == 0 + { self.warn_no_config_access(); data.fill(0xFF); @@ -240,7 +241,8 @@ impl VirtioDevice for GenericVhostUser { } fn write_config(&mut self, offset: u64, data: &[u8]) { - if (VhostUserProtocolFeatures::CONFIG.bits() & self.state().acked_protocol_features) == 0 { + if (VhostUserProtocolFeatures::CONFIG.bits() & self.vu_common.acked_protocol_features) == 0 + { self.warn_no_config_access(); return; } From 8536a2536e95a3a0762c03d426ff8eaaec7f1acf 
Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Thu, 26 Mar 2026 14:14:37 -0700 Subject: [PATCH 351/742] virtio-devices: vhost_user: Adapt state() to return Result This is a refactoring step in preparation for fetching backend device state via SET_DEVICE_STATE_FD which can fail. Signed-off-by: Rob Bradford --- virtio-devices/src/vhost_user/blk.rs | 4 ++-- virtio-devices/src/vhost_user/fs.rs | 4 ++-- virtio-devices/src/vhost_user/generic_vhost_user.rs | 4 ++-- virtio-devices/src/vhost_user/mod.rs | 12 +++++++++--- virtio-devices/src/vhost_user/net.rs | 4 ++-- 5 files changed, 17 insertions(+), 11 deletions(-) diff --git a/virtio-devices/src/vhost_user/blk.rs b/virtio-devices/src/vhost_user/blk.rs index 7b574a583e..8d6003216c 100644 --- a/virtio-devices/src/vhost_user/blk.rs +++ b/virtio-devices/src/vhost_user/blk.rs @@ -191,7 +191,7 @@ impl Blk { }) } - fn state(&self) -> State { + fn state(&self) -> std::result::Result { self.vu_common.state(&self.common, self.config) } } @@ -367,7 +367,7 @@ impl Snapshottable for Blk { } fn snapshot(&mut self) -> std::result::Result { - self.vu_common.snapshot(&self.state()) + self.vu_common.snapshot(&self.state()?) } } impl Transportable for Blk {} diff --git a/virtio-devices/src/vhost_user/fs.rs b/virtio-devices/src/vhost_user/fs.rs index 3ae5262649..b5bf495baa 100644 --- a/virtio-devices/src/vhost_user/fs.rs +++ b/virtio-devices/src/vhost_user/fs.rs @@ -200,7 +200,7 @@ impl Fs { }) } - fn state(&self) -> State { + fn state(&self) -> std::result::Result { self.vu_common.state(&self.common, self.config) } } @@ -377,7 +377,7 @@ impl Snapshottable for Fs { } fn snapshot(&mut self) -> std::result::Result { - self.vu_common.snapshot(&self.state()) + self.vu_common.snapshot(&self.state()?) 
} } impl Transportable for Fs {} diff --git a/virtio-devices/src/vhost_user/generic_vhost_user.rs b/virtio-devices/src/vhost_user/generic_vhost_user.rs index 368f46982b..9b4c8e4990 100644 --- a/virtio-devices/src/vhost_user/generic_vhost_user.rs +++ b/virtio-devices/src/vhost_user/generic_vhost_user.rs @@ -156,7 +156,7 @@ since the backend only supports {backend_num_queues}\n", }) } - fn state(&self) -> State { + fn state(&self) -> std::result::Result { self.vu_common.state(&self.common, ()) } @@ -396,7 +396,7 @@ impl Snapshottable for GenericVhostUser { } fn snapshot(&mut self) -> std::result::Result { - self.vu_common.snapshot(&self.state()) + self.vu_common.snapshot(&self.state()?) } } impl Transportable for GenericVhostUser {} diff --git a/virtio-devices/src/vhost_user/mod.rs b/virtio-devices/src/vhost_user/mod.rs index 811ff92e11..07df6f0130 100644 --- a/virtio-devices/src/vhost_user/mod.rs +++ b/virtio-devices/src/vhost_user/mod.rs @@ -457,15 +457,21 @@ impl VhostUserCommon { Ok(()) } - pub fn state(&self, common: &crate::VirtioCommon, config: C) -> VhostUserState { - VhostUserState { + pub fn state( + &self, + common: &crate::VirtioCommon, + config: C, + ) -> std::result::Result, MigratableError> { + let state = VhostUserState { avail_features: common.avail_features, acked_features: common.acked_features, config, acked_protocol_features: self.acked_protocol_features, vu_num_queues: self.vu_num_queues, ..Default::default() - } + }; + + Ok(state) } pub fn snapshot(&mut self, state: &T) -> std::result::Result diff --git a/virtio-devices/src/vhost_user/net.rs b/virtio-devices/src/vhost_user/net.rs index 2145691a00..ac3719d9d0 100644 --- a/virtio-devices/src/vhost_user/net.rs +++ b/virtio-devices/src/vhost_user/net.rs @@ -220,7 +220,7 @@ impl Net { }) } - fn state(&self) -> State { + fn state(&self) -> std::result::Result { self.vu_common.state(&self.common, self.config) } } @@ -425,7 +425,7 @@ impl Snapshottable for Net { } fn snapshot(&mut self) -> 
std::result::Result { - self.vu_common.snapshot(&self.state()) + self.vu_common.snapshot(&self.state()?) } } impl Transportable for Net {} From afc3cb8e1f5b93307b58d3ee06983c0b614fadf1 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Thu, 26 Mar 2026 14:15:09 -0700 Subject: [PATCH 352/742] virtio-devices: vhost_user: Rename update_supports_migration Rename update_supports_migration() to update_supported_features() as this method will be extended to track additional capability flags beyond just migration support (e.g. DEVICE_STATE for snapshot/restore.) Signed-off-by: Rob Bradford --- virtio-devices/src/vhost_user/vu_common_ctrl.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/virtio-devices/src/vhost_user/vu_common_ctrl.rs b/virtio-devices/src/vhost_user/vu_common_ctrl.rs index b603463fb4..497888ca80 100644 --- a/virtio-devices/src/vhost_user/vu_common_ctrl.rs +++ b/virtio-devices/src/vhost_user/vu_common_ctrl.rs @@ -150,7 +150,7 @@ impl VhostUserHandle { self.vu.set_hdr_flags(VhostUserHeaderFlag::NEED_REPLY); } - self.update_supports_migration(acked_features, acked_protocol_features.bits()); + self.update_supported_features(acked_features, acked_protocol_features.bits()); Ok((acked_features, acked_protocol_features.bits())) } @@ -334,7 +334,7 @@ impl VhostUserHandle { } } - self.update_supports_migration(acked_features, acked_protocol_features); + self.update_supported_features(acked_features, acked_protocol_features); Ok(()) } @@ -438,7 +438,7 @@ impl VhostUserHandle { Ok(()) } - fn update_supports_migration(&mut self, acked_features: u64, acked_protocol_features: u64) { + fn update_supported_features(&mut self, acked_features: u64, acked_protocol_features: u64) { if (acked_features & u64::from(vhost::vhost_kern::vhost_binding::VHOST_F_LOG_ALL) != 0) && (acked_protocol_features & VhostUserProtocolFeatures::LOG_SHMFD.bits() != 0) { From a6b945540253790777396219bbb24e7cbbfb3233 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Thu, 26 Mar 
2026 13:34:54 -0700 Subject: [PATCH 353/742] virtio-devices: vhost_user: Add get/set of backend state support The vhost-user protocol now has support for the backend to provide an opaque blob of data (read or written through a pipe) that the VMM can use to save/restore state after snapshot/restore or live migration. It also adds a command for checking the backend accepts the uploaded device state. One quirk of saving the state is that GET_VRING_BASE must be used first to quiesce the state of the backend and flush any in-flight requests. This then also requires saving that index for use on the restore. Signed-off-by: Rob Bradford --- virtio-devices/src/vhost_user/mod.rs | 9 ++ .../src/vhost_user/vu_common_ctrl.rs | 122 ++++++++++++++++-- 2 files changed, 120 insertions(+), 11 deletions(-) diff --git a/virtio-devices/src/vhost_user/mod.rs b/virtio-devices/src/vhost_user/mod.rs index 07df6f0130..92e1b73d8b 100644 --- a/virtio-devices/src/vhost_user/mod.rs +++ b/virtio-devices/src/vhost_user/mod.rs @@ -148,6 +148,14 @@ pub enum Error { NewMmapRegion(#[source] MmapRegionError), #[error("Could not find the shm log region")] MissingShmLogRegion, + #[error("Failed setting device state fd")] + VhostUserSetDeviceStateFd(#[source] VhostError), + #[error("Failed checking device state")] + VhostUserCheckDeviceState(#[source] VhostError), + #[error("Failed saving/restoring backend state")] + SaveRestoreBackendState(#[source] io::Error), + #[error("Vring bases count ({0}) does not match queue count ({1})")] + VringBasesCountMismatch(usize, usize), } type Result = std::result::Result; @@ -360,6 +368,7 @@ impl VhostUserCommon { acked_features, &backend_req_handler, inflight.as_mut(), + None, ) .map_err(ActivateError::VhostUserSetup)?; diff --git a/virtio-devices/src/vhost_user/vu_common_ctrl.rs b/virtio-devices/src/vhost_user/vu_common_ctrl.rs index 497888ca80..0e59e23990 100644 --- a/virtio-devices/src/vhost_user/vu_common_ctrl.rs +++ 
b/virtio-devices/src/vhost_user/vu_common_ctrl.rs @@ -3,8 +3,9 @@ use std::ffi; use std::fs::File; -use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; -use std::os::unix::net::UnixListener; +use std::io::{Read, Write}; +use std::os::unix::io::{AsRawFd, FromRawFd, OwnedFd, RawFd}; +use std::os::unix::net::{UnixListener, UnixStream}; use std::sync::Arc; use std::sync::atomic::Ordering; use std::thread::sleep; @@ -13,7 +14,8 @@ use std::time::{Duration, Instant}; use log::{error, info}; use vhost::vhost_kern::vhost_binding::{VHOST_F_LOG_ALL, VHOST_VRING_F_LOG}; use vhost::vhost_user::message::{ - VhostUserHeaderFlag, VhostUserInflight, VhostUserProtocolFeatures, VhostUserVirtioFeatures, + VhostTransferStateDirection, VhostTransferStatePhase, VhostUserHeaderFlag, VhostUserInflight, + VhostUserProtocolFeatures, VhostUserVirtioFeatures, }; use vhost::vhost_user::{ Frontend, FrontendReqHandler, VhostUserFrontend, VhostUserFrontendReqHandler, @@ -54,6 +56,7 @@ pub struct VhostUserHandle { vu: Frontend, ready: bool, supports_migration: bool, + supports_device_state: bool, shm_log: Option>, acked_features: u64, vrings_info: Option>, @@ -164,7 +167,14 @@ impl VhostUserHandle { acked_features: u64, backend_req_handler: &Option>, inflight: Option<&mut Inflight>, + vring_bases: Option<&[u64]>, ) -> Result<()> { + if let Some(bases) = &vring_bases + && bases.len() != queues.len() + { + return Err(Error::VringBasesCountMismatch(bases.len(), queues.len())); + } + self.vu .set_features(acked_features) .map_err(Error::VhostUserSetFeatures)?; @@ -207,7 +217,7 @@ impl VhostUserHandle { } let mut vrings_info = Vec::new(); - for (queue_index, queue, queue_evt) in queues.iter() { + for (i, (queue_index, queue, queue_evt)) in queues.iter().enumerate() { let actual_size: usize = queue.size().into(); let config_data = VringConfigData { @@ -247,14 +257,16 @@ impl VhostUserHandle { self.vu .set_vring_addr(*queue_index, &config_data) .map_err(Error::VhostUserSetVringAddr)?; + let base = if let 
Some(bases) = vring_bases { + bases[i] as u16 + } else { + queue + .avail_idx(mem, Ordering::Acquire) + .map_err(Error::GetAvailableIndex)? + .0 + }; self.vu - .set_vring_base( - *queue_index, - queue - .avail_idx(mem, Ordering::Acquire) - .map_err(Error::GetAvailableIndex)? - .0, - ) + .set_vring_base(*queue_index, base) .map_err(Error::VhostUserSetVringBase)?; if let Some(eventfd) = @@ -359,6 +371,7 @@ impl VhostUserHandle { acked_features, backend_req_handler, inflight, + None, ) } @@ -382,6 +395,7 @@ impl VhostUserHandle { vu: Frontend::from_stream(stream, num_queues), ready: false, supports_migration: false, + supports_device_state: false, shm_log: None, acked_features: 0, vrings_info: None, @@ -398,6 +412,7 @@ impl VhostUserHandle { vu: m, ready: false, supports_migration: false, + supports_device_state: false, shm_log: None, acked_features: 0, vrings_info: None, @@ -444,6 +459,91 @@ impl VhostUserHandle { { self.supports_migration = true; } + self.supports_device_state = + acked_protocol_features & VhostUserProtocolFeatures::DEVICE_STATE.bits() != 0; + } + + pub fn supports_device_state(&self) -> bool { + self.supports_device_state + } + + /// Save backend device state via the SET_DEVICE_STATE_FD protocol. + /// Returns the opaque state blob and per-queue vring base indices. + pub fn save_backend_state(&mut self) -> Result<(Vec, Vec)> { + // GET_VRING_BASE for each queue to stop the backend and capture indices + let mut vring_bases = Vec::new(); + for queue_index in &self.queue_indexes { + let base = self + .vu + .get_vring_base(*queue_index) + .map_err(Error::VhostUserGetVringBase)?; + vring_bases.push(base as u64); + } + + // The backend considers the vrings stopped after GET_VRING_BASE. 
+ self.ready = false; + + let (local, remote) = UnixStream::pair().map_err(Error::SaveRestoreBackendState)?; + + let mut read_file: File = match self + .vu + .set_device_state_fd( + VhostTransferStateDirection::SAVE, + VhostTransferStatePhase::STOPPED, + remote.into(), + ) + .map_err(Error::VhostUserSetDeviceStateFd)? + { + Some(file) => file, + None => OwnedFd::from(local).into(), + }; + + // Read all state from the socket + let mut state = Vec::new(); + read_file + .read_to_end(&mut state) + .map_err(Error::SaveRestoreBackendState)?; + + // Verify the transfer succeeded + self.vu + .check_device_state() + .map_err(Error::VhostUserCheckDeviceState)?; + + Ok((state, vring_bases)) + } + + /// Restore backend device state via the SET_DEVICE_STATE_FD protocol. + /// Sends the saved opaque state blob to the backend via a socket. + pub fn restore_backend_state(&mut self, state: &[u8]) -> Result<()> { + let (local, remote) = UnixStream::pair().map_err(Error::SaveRestoreBackendState)?; + + // Explicit scope to close the write end and signal EOF to the backend + { + let mut write_file: File = match self + .vu + .set_device_state_fd( + VhostTransferStateDirection::LOAD, + VhostTransferStatePhase::STOPPED, + remote.into(), + ) + .map_err(Error::VhostUserSetDeviceStateFd)? 
+ { + Some(file) => file, + None => OwnedFd::from(local).into(), + }; + + // Write the saved state to the socket + write_file + .write_all(state) + .map_err(Error::SaveRestoreBackendState)?; + } + + // Verify the transfer succeeded + self.vu + .check_device_state() + .map_err(Error::VhostUserCheckDeviceState)?; + + Ok(()) } fn update_log_base(&mut self, last_ram_addr: u64) -> Result>> { From 02688993a0fd5334803077f17f7c4bf2995ee041 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Thu, 26 Mar 2026 13:41:38 -0700 Subject: [PATCH 354/742] virtio-devices: vhost_user: Fetch and store the backend state/vring Fetch the opaque device state from the backend and store it along with the last vring used in the state used for the snapshot. Signed-off-by: Rob Bradford --- virtio-devices/src/vhost_user/mod.rs | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/virtio-devices/src/vhost_user/mod.rs b/virtio-devices/src/vhost_user/mod.rs index 92e1b73d8b..3858c5c052 100644 --- a/virtio-devices/src/vhost_user/mod.rs +++ b/virtio-devices/src/vhost_user/mod.rs @@ -317,6 +317,10 @@ pub struct VhostUserState { pub vu_num_queues: usize, #[serde(default)] pub backend_req_support: bool, + #[serde(default)] + pub vring_bases: Option>, + #[serde(default)] + pub backend_state: Option>, } #[derive(Default)] @@ -328,6 +332,7 @@ pub struct VhostUserCommon { pub migration_started: bool, pub server: bool, pub interrupt_cb: Option>, + pub vring_bases: Option>, } impl VhostUserCommon { @@ -359,6 +364,7 @@ impl VhostUserCommon { .iter() .map(|(i, q, e)| (*i, vm_virtio::clone_queue(q), e.try_clone().unwrap())) .collect::>(); + let vring_bases = self.vring_bases.take(); vu.lock() .unwrap() .setup_vhost_user( @@ -368,7 +374,7 @@ impl VhostUserCommon { acked_features, &backend_req_handler, inflight.as_mut(), - None, + vring_bases.as_deref(), ) .map_err(ActivateError::VhostUserSetup)?; @@ -471,7 +477,7 @@ impl VhostUserCommon { common: &crate::VirtioCommon, config: C, 
) -> std::result::Result, MigratableError> { - let state = VhostUserState { + let mut state = VhostUserState { avail_features: common.avail_features, acked_features: common.acked_features, config, @@ -480,6 +486,17 @@ impl VhostUserCommon { ..Default::default() }; + if let Some(vu) = &self.vu { + let mut vu_locked = vu.lock().unwrap(); + if vu_locked.supports_device_state() { + let (backend_state, vring_bases) = vu_locked.save_backend_state().map_err(|e| { + MigratableError::Snapshot(anyhow!("Failed saving backend state: {e:?}")) + })?; + state.backend_state = Some(backend_state); + state.vring_bases = Some(vring_bases); + } + } + Ok(state) } From 2fbb98e2c3ee6e52164f9c0d3577a14832cde415 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Thu, 26 Mar 2026 15:16:58 -0700 Subject: [PATCH 355/742] virtio-devices: vhost_user: Add common code for restoring state Add a common method for validating the state (checking vrings & device_state) and then restoring the backend state if present. Signed-off-by: Rob Bradford --- virtio-devices/src/vhost_user/mod.rs | 11 +++++++++++ virtio-devices/src/vhost_user/vu_common_ctrl.rs | 10 +++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/virtio-devices/src/vhost_user/mod.rs b/virtio-devices/src/vhost_user/mod.rs index 3858c5c052..86a774737a 100644 --- a/virtio-devices/src/vhost_user/mod.rs +++ b/virtio-devices/src/vhost_user/mod.rs @@ -156,6 +156,8 @@ pub enum Error { SaveRestoreBackendState(#[source] io::Error), #[error("Vring bases count ({0}) does not match queue count ({1})")] VringBasesCountMismatch(usize, usize), + #[error("Backend state and vring bases must both be present or both be absent")] + InconsistentBackendState, } type Result = std::result::Result; @@ -323,6 +325,15 @@ pub struct VhostUserState { pub backend_state: Option>, } +impl VhostUserState { + pub fn validate(&self) -> Result<()> { + if self.backend_state.is_some() != self.vring_bases.is_some() { + return Err(Error::InconsistentBackendState); 
+ } + Ok(()) + } +} + #[derive(Default)] pub struct VhostUserCommon { pub vu: Option>>, diff --git a/virtio-devices/src/vhost_user/vu_common_ctrl.rs b/virtio-devices/src/vhost_user/vu_common_ctrl.rs index 0e59e23990..b2ee2ae5dc 100644 --- a/virtio-devices/src/vhost_user/vu_common_ctrl.rs +++ b/virtio-devices/src/vhost_user/vu_common_ctrl.rs @@ -28,7 +28,7 @@ use vm_memory::{Address, FileOffset, GuestAddress, GuestMemory, GuestMemoryRegio use vm_migration::protocol::MemoryRangeTable; use vmm_sys_util::eventfd::EventFd; -use super::{Error, Result}; +use super::{Error, Result, VhostUserState}; use crate::vhost_user::Inflight; use crate::{ GuestMemoryMmap, GuestRegionMmap, MmapRegion, VirtioInterrupt, VirtioInterruptType, @@ -512,6 +512,14 @@ impl VhostUserHandle { Ok((state, vring_bases)) } + pub fn restore_state(&mut self, state: &VhostUserState) -> Result<()> { + state.validate()?; + if let Some(backend_state) = &state.backend_state { + self.restore_backend_state(backend_state)?; + } + Ok(()) + } + /// Restore backend device state via the SET_DEVICE_STATE_FD protocol. /// Sends the saved opaque state blob to the backend via a socket. pub fn restore_backend_state(&mut self, state: &[u8]) -> Result<()> { From cd3334a3c27f9b46eb20d2c7a28d2d0d3e6c4ff8 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Thu, 26 Mar 2026 14:22:42 -0700 Subject: [PATCH 356/742] virtio-devices: vhost_user: Enable snapshot/restore for vhost-user-* Enable the DEVICE_STATE protocol feature negotiation for all vhost-user devices (block, fs, net, and generic). Restoring the state (including the backend state if present) and vrings. 
Signed-off-by: Rob Bradford --- virtio-devices/src/vhost_user/blk.rs | 9 +- virtio-devices/src/vhost_user/fs.rs | 9 +- .../src/vhost_user/generic_vhost_user.rs | 125 ++++++++++-------- virtio-devices/src/vhost_user/net.rs | 9 +- 4 files changed, 92 insertions(+), 60 deletions(-) diff --git a/virtio-devices/src/vhost_user/blk.rs b/virtio-devices/src/vhost_user/blk.rs index 8d6003216c..203012b8ec 100644 --- a/virtio-devices/src/vhost_user/blk.rs +++ b/virtio-devices/src/vhost_user/blk.rs @@ -72,6 +72,7 @@ impl Blk { vu_num_queues, config, paused, + vring_bases, ) = if let Some(state) = state { info!("Restoring vhost-user-block {id}"); @@ -80,6 +81,8 @@ impl Blk { state.acked_protocol_features, )?; + vu.restore_state(&state)?; + ( state.avail_features, state.acked_features, @@ -87,6 +90,7 @@ impl Blk { state.vu_num_queues, state.config, true, + state.vring_bases, ) } else { // Filling device and vring features VMM supports. @@ -111,7 +115,8 @@ impl Blk { | VhostUserProtocolFeatures::CONFIGURE_MEM_SLOTS | VhostUserProtocolFeatures::REPLY_ACK | VhostUserProtocolFeatures::INFLIGHT_SHMFD - | VhostUserProtocolFeatures::LOG_SHMFD; + | VhostUserProtocolFeatures::LOG_SHMFD + | VhostUserProtocolFeatures::DEVICE_STATE; let (acked_features, acked_protocol_features) = vu.negotiate_features_vhost_user(avail_features, avail_protocol_features)?; @@ -160,6 +165,7 @@ impl Blk { num_queues, config, false, + None, ) }; @@ -179,6 +185,7 @@ impl Blk { acked_protocol_features, socket_path: vu_cfg.socket, vu_num_queues, + vring_bases, ..Default::default() }, id, diff --git a/virtio-devices/src/vhost_user/fs.rs b/virtio-devices/src/vhost_user/fs.rs index b5bf495baa..1b8edfe6bd 100644 --- a/virtio-devices/src/vhost_user/fs.rs +++ b/virtio-devices/src/vhost_user/fs.rs @@ -101,6 +101,7 @@ impl Fs { vu_num_queues, config, paused, + vring_bases, ) = if let Some(state) = state { info!("Restoring vhost-user-fs {id}"); @@ -109,6 +110,8 @@ impl Fs { state.acked_protocol_features, )?; + 
vu.restore_state(&state)?; + ( state.avail_features, state.acked_features, @@ -116,6 +119,7 @@ impl Fs { state.vu_num_queues, state.config, true, + state.vring_bases, ) } else { // Filling device and vring features VMM supports. @@ -125,7 +129,8 @@ impl Fs { | VhostUserProtocolFeatures::CONFIGURE_MEM_SLOTS | VhostUserProtocolFeatures::REPLY_ACK | VhostUserProtocolFeatures::INFLIGHT_SHMFD - | VhostUserProtocolFeatures::LOG_SHMFD; + | VhostUserProtocolFeatures::LOG_SHMFD + | VhostUserProtocolFeatures::DEVICE_STATE; let (acked_features, acked_protocol_features) = vu.negotiate_features_vhost_user(avail_features, avail_protocol_features)?; @@ -168,6 +173,7 @@ impl Fs { num_queues, config, false, + None, ) }; @@ -187,6 +193,7 @@ impl Fs { acked_protocol_features, socket_path: path.to_string(), vu_num_queues, + vring_bases, ..Default::default() }, id, diff --git a/virtio-devices/src/vhost_user/generic_vhost_user.rs b/virtio-devices/src/vhost_user/generic_vhost_user.rs index 9b4c8e4990..da8fe53a88 100644 --- a/virtio-devices/src/vhost_user/generic_vhost_user.rs +++ b/virtio-devices/src/vhost_user/generic_vhost_user.rs @@ -68,64 +68,74 @@ impl GenericVhostUser { // Connect to the vhost-user socket. 
let mut vu = VhostUserHandle::connect_vhost_user(false, path, num_queues as u64, false)?; - let (avail_features, acked_features, acked_protocol_features, vu_num_queues, paused) = - if let Some(state) = state { - info!("Restoring generic vhost-user {id}"); - vu.set_protocol_features_vhost_user( - state.acked_features, - state.acked_protocol_features, - )?; - - ( - state.avail_features, - state.acked_features, - state.acked_protocol_features, - state.vu_num_queues, - true, - ) - } else { - let avail_protocol_features = VhostUserProtocolFeatures::CONFIG - | VhostUserProtocolFeatures::MQ - | VhostUserProtocolFeatures::CONFIGURE_MEM_SLOTS - | VhostUserProtocolFeatures::REPLY_ACK - | VhostUserProtocolFeatures::INFLIGHT_SHMFD - | VhostUserProtocolFeatures::LOG_SHMFD; - - let avail_features = super::DEFAULT_VIRTIO_FEATURES; - - let (acked_features, acked_protocol_features) = - vu.negotiate_features_vhost_user(avail_features, avail_protocol_features)?; - - let backend_num_queues = - if acked_protocol_features & VhostUserProtocolFeatures::MQ.bits() != 0 { - vu.socket_handle() - .get_queue_num() - .map_err(Error::VhostUserGetQueueMaxNum)? 
- as usize - } else { - num_queues - }; - - if num_queues > backend_num_queues { - error!( - "generic vhost-user requested too many queues ({num_queues}) \ + let ( + avail_features, + acked_features, + acked_protocol_features, + vu_num_queues, + paused, + vring_bases, + ) = if let Some(state) = state { + info!("Restoring generic vhost-user {id}"); + vu.set_protocol_features_vhost_user( + state.acked_features, + state.acked_protocol_features, + )?; + + vu.restore_state(&state)?; + + ( + state.avail_features, + state.acked_features, + state.acked_protocol_features, + state.vu_num_queues, + true, + state.vring_bases, + ) + } else { + let avail_protocol_features = VhostUserProtocolFeatures::CONFIG + | VhostUserProtocolFeatures::MQ + | VhostUserProtocolFeatures::CONFIGURE_MEM_SLOTS + | VhostUserProtocolFeatures::REPLY_ACK + | VhostUserProtocolFeatures::INFLIGHT_SHMFD + | VhostUserProtocolFeatures::LOG_SHMFD + | VhostUserProtocolFeatures::DEVICE_STATE; + + let avail_features = super::DEFAULT_VIRTIO_FEATURES; + + let (acked_features, acked_protocol_features) = + vu.negotiate_features_vhost_user(avail_features, avail_protocol_features)?; + + let backend_num_queues = + if acked_protocol_features & VhostUserProtocolFeatures::MQ.bits() != 0 { + vu.socket_handle() + .get_queue_num() + .map_err(Error::VhostUserGetQueueMaxNum)? as usize + } else { + num_queues + }; + + if num_queues > backend_num_queues { + error!( + "generic vhost-user requested too many queues ({num_queues}) \ since the backend only supports {backend_num_queues}\n", - ); - return Err(Error::BadQueueNum); - } - - ( - acked_features, - // If part of the available features that have been acked, the - // PROTOCOL_FEATURES bit must be already set through the VIRTIO - // acked features as we know the guest would never ack it, thus - // the feature would be lost. 
- acked_features & VhostUserVirtioFeatures::PROTOCOL_FEATURES.bits(), - acked_protocol_features, - num_queues, - false, - ) - }; + ); + return Err(Error::BadQueueNum); + } + + ( + acked_features, + // If part of the available features that have been acked, the + // PROTOCOL_FEATURES bit must be already set through the VIRTIO + // acked features as we know the guest would never ack it, thus + // the feature would be lost. + acked_features & VhostUserVirtioFeatures::PROTOCOL_FEATURES.bits(), + acked_protocol_features, + num_queues, + false, + None, + ) + }; Ok(GenericVhostUser { common: VirtioCommon { @@ -143,6 +153,7 @@ since the backend only supports {backend_num_queues}\n", acked_protocol_features, socket_path: path.to_string(), vu_num_queues, + vring_bases, ..Default::default() }, id, diff --git a/virtio-devices/src/vhost_user/net.rs b/virtio-devices/src/vhost_user/net.rs index ac3719d9d0..d05626901a 100644 --- a/virtio-devices/src/vhost_user/net.rs +++ b/virtio-devices/src/vhost_user/net.rs @@ -82,6 +82,7 @@ impl Net { vu_num_queues, config, paused, + vring_bases, ) = if let Some(state) = state { info!("Restoring vhost-user-net {id}"); @@ -95,6 +96,8 @@ impl Net { state.acked_protocol_features, )?; + vu.restore_state(&state)?; + // If the control queue feature has been negotiated, let's // increase the number of queues. if state.acked_features & (1 << VIRTIO_NET_F_CTRL_VQ) != 0 { @@ -108,6 +111,7 @@ impl Net { state.vu_num_queues, state.config, true, + state.vring_bases, ) } else { // Filling device and vring features VMM supports. 
@@ -144,7 +148,8 @@ impl Net { | VhostUserProtocolFeatures::CONFIGURE_MEM_SLOTS | VhostUserProtocolFeatures::REPLY_ACK | VhostUserProtocolFeatures::INFLIGHT_SHMFD - | VhostUserProtocolFeatures::LOG_SHMFD; + | VhostUserProtocolFeatures::LOG_SHMFD + | VhostUserProtocolFeatures::DEVICE_STATE; let (mut acked_features, acked_protocol_features) = vu.negotiate_features_vhost_user(avail_features, avail_protocol_features)?; @@ -187,6 +192,7 @@ impl Net { vu_num_queues, config, false, + None, ) }; @@ -208,6 +214,7 @@ impl Net { socket_path: vu_cfg.socket, vu_num_queues, server, + vring_bases, ..Default::default() }, config, From 0bc3ca1103142be52a52bf820f09c93305deb81c Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Thu, 26 Mar 2026 09:01:57 -0700 Subject: [PATCH 357/742] tests: Add snapshot/restore integration test for virtio-fs Add test_snapshot_restore_virtio_fs which validates that virtio-fs continues to work correctly across a snapshot/restore cycle. Signed-off-by: Rob Bradford --- cloud-hypervisor/tests/integration.rs | 152 ++++++++++++++++++++++++++ 1 file changed, 152 insertions(+) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index a9acca630d..8fbe6dd73b 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -7077,6 +7077,158 @@ mod common_sequential { handle_child_output(r, &output); } + #[test] + #[cfg(not(feature = "mshv"))] + fn test_snapshot_restore_virtio_fs() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)); + let kernel_path = direct_kernel_boot_path(); + + let api_socket_source = format!("{}.1", temp_api_path(&guest.tmp_dir)); + + let mut workload_path = dirs::home_dir().unwrap(); + workload_path.push("workloads"); + let mut shared_dir = workload_path; + shared_dir.push("shared_dir"); + + let (mut daemon_child, virtiofsd_socket_path) = + prepare_virtiofsd(&guest.tmp_dir, 
shared_dir.to_str().unwrap()); + + let event_path = temp_event_monitor_path(&guest.tmp_dir); + + let mut child = GuestCommand::new(&guest) + .args(["--api-socket", &api_socket_source]) + .args(["--event-monitor", format!("path={event_path}").as_str()]) + .args(["--cpus", "boot=2"]) + .args(["--memory", "size=512M,shared=on"]) + .args(["--kernel", kernel_path.to_str().unwrap()]) + .default_disks() + .default_net() + .args([ + "--fs", + format!("socket={virtiofsd_socket_path},tag=myfs,num_queues=1,queue_size=1024") + .as_str(), + ]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .capture_output() + .spawn() + .unwrap(); + + let snapshot_dir = temp_snapshot_dir_path(&guest.tmp_dir); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + // Mount virtiofs and write a test file + guest + .ssh_command("mkdir -p mount_dir && sudo mount -t virtiofs myfs mount_dir/") + .unwrap(); + + // Verify the shared directory is accessible + assert_eq!( + guest.ssh_command("cat mount_dir/file1").unwrap().trim(), + "foo" + ); + + // Write a file from the guest + guest + .ssh_command( + "sudo bash -c 'echo snapshot_test_data > mount_dir/snapshot_test_file'", + ) + .unwrap(); + + snapshot_and_check_events(&api_socket_source, &snapshot_dir, &event_path); + }); + + // Shutdown the source VM + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + handle_child_output(r, &output); + + // Kill the old virtiofsd + let _ = daemon_child.kill(); + let _ = daemon_child.wait(); + + // Start a fresh virtiofsd (reusing the same socket path) + let (mut daemon_child, _) = prepare_virtiofsd(&guest.tmp_dir, shared_dir.to_str().unwrap()); + + let api_socket_restored = format!("{}.2", temp_api_path(&guest.tmp_dir)); + let event_path_restored = format!("{}.2", temp_event_monitor_path(&guest.tmp_dir)); + + // Restore the VM from the snapshot + let mut child = GuestCommand::new(&guest) + .args(["--api-socket", &api_socket_restored]) + .args([ + 
"--event-monitor", + format!("path={event_path_restored}").as_str(), + ]) + .args([ + "--restore", + format!("source_url=file://{snapshot_dir}").as_str(), + ]) + .capture_output() + .spawn() + .unwrap(); + + // Wait for the VM to be restored + thread::sleep(std::time::Duration::new(20, 0)); + + let latest_events = [&MetaEvent { + event: "restored".to_string(), + device_id: None, + }]; + assert!(check_latest_events_exact( + &latest_events, + &event_path_restored + )); + + // Remove the snapshot dir + let _ = remove_dir_all(snapshot_dir.as_str()); + + let r = std::panic::catch_unwind(|| { + // Resume the VM + assert!(remote_command(&api_socket_restored, "resume", None)); + thread::sleep(std::time::Duration::new(5, 0)); + + // Verify virtiofs still works after restore + // Read the file written before snapshot + assert_eq!( + guest + .ssh_command("cat mount_dir/snapshot_test_file") + .unwrap() + .trim(), + "snapshot_test_data" + ); + + // Read the pre-existing shared file + assert_eq!( + guest.ssh_command("cat mount_dir/file1").unwrap().trim(), + "foo" + ); + + // Write a new file after restore + guest + .ssh_command("sudo bash -c 'echo post_restore_data > mount_dir/post_restore_file'") + .unwrap(); + + // Verify the new file exists on the host + let post_restore_content = + std::fs::read_to_string(shared_dir.join("post_restore_file")).unwrap(); + assert_eq!(post_restore_content.trim(), "post_restore_data"); + }); + + // Shutdown the target VM + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + handle_child_output(r, &output); + + // Clean up virtiofsd and test files + let _ = daemon_child.kill(); + let _ = daemon_child.wait(); + let _ = std::fs::remove_file(shared_dir.join("snapshot_test_file")); + let _ = std::fs::remove_file(shared_dir.join("post_restore_file")); + } + #[test] fn test_virtio_pmem_persist_writes() { test_virtio_pmem(false, false); From c06f6c32932e3241676ba658a8fa9b2852d9e305 Mon Sep 17 00:00:00 2001 From: Philipp Schuster 
Date: Fri, 27 Mar 2026 15:47:26 +0100 Subject: [PATCH 358/742] tests: fix CI flakiness This test has failed at least once in upstream CI. With the applied stress workload, reducing the downtime to 1 ms makes it virtually impossible for CI runners with various speeds to complete VM migration. In other words: we will always be able to cancel. On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- cloud-hypervisor/tests/integration.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 8fbe6dd73b..bac62af116 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -9949,7 +9949,7 @@ mod live_migration { // Start a memory stressor in the background to keep pages dirty, // ensuring the precopy loop cannot converge within the 1s timeout. guest - .ssh_command("nohup stress --vm 2 --vm-bytes 200M --vm-keep &>/dev/null &") + .ssh_command("nohup stress --vm 2 --vm-bytes 220M --vm-keep &>/dev/null &") .unwrap(); // Give stress a moment to actually start dirtying memory thread::sleep(Duration::from_secs(3)); @@ -9971,14 +9971,14 @@ mod live_migration { thread::sleep(Duration::from_secs(1)); - // Use a tight downtime budget (50ms) combined with a 1s timeout so the - // migration cannot converge regardless of strategy. + // Use a tight downtime budget (1ms) combined with a 1s timeout so the + // migration practically cannot converge regardless of strategy. 
let mut send_migration = Command::new(clh_command("ch-remote")) .args([ &format!("--api-socket={src_api_socket}"), "send-migration", &format!( - "destination_url=tcp:{host_ip}:{migration_port},downtime_ms=50,timeout_s=1,timeout_strategy={timeout_strategy:?}" + "destination_url=tcp:{host_ip}:{migration_port},downtime_ms=1,timeout_s=1,timeout_strategy={timeout_strategy:?}" ), ]) .stdin(Stdio::null()) From baacaea35e5bcd2440f8a8078345f08d6c6b38d4 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Fri, 27 Mar 2026 16:45:59 -0700 Subject: [PATCH 359/742] build: Bump vfio-ioctls to 0.6.0 This version is identical but with a new version number as the old version is yanked due to a semver break. Signed-off-by: Rob Bradford --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- fuzz/Cargo.lock | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ad3fe2b934..c1ce72b0b6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2286,9 +2286,9 @@ dependencies = [ [[package]] name = "vfio-ioctls" -version = "0.5.3" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c80c6d52f8e592e31a8f7eb45e882a9617aa61ec2479981a175e9f0a79f2434e" +checksum = "d4b1d98dff7f0d219278e406323e7eda4d426447bd203c7828189baf0d8c07b7" dependencies = [ "byteorder", "kvm-bindings", diff --git a/Cargo.toml b/Cargo.toml index cc2bd3c175..e22ace8382 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -60,7 +60,7 @@ mshv-bindings = "0.6.7" mshv-ioctls = "0.6.7" seccompiler = "0.5.0" vfio-bindings = { version = "0.6.2", default-features = false } -vfio-ioctls = { version = "0.5.3", default-features = false } +vfio-ioctls = { version = "0.6.0", default-features = false } vfio_user = { version = "0.1.3", default-features = false } vhost = { version = "0.16.0", default-features = false } vhost-user-backend = { version = "0.22.0", default-features = false } diff --git a/fuzz/Cargo.lock b/fuzz/Cargo.lock index 559fd35ce6..488476e9e9 100644 --- 
a/fuzz/Cargo.lock +++ b/fuzz/Cargo.lock @@ -1282,9 +1282,9 @@ dependencies = [ [[package]] name = "vfio-ioctls" -version = "0.5.3" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c80c6d52f8e592e31a8f7eb45e882a9617aa61ec2479981a175e9f0a79f2434e" +checksum = "d4b1d98dff7f0d219278e406323e7eda4d426447bd203c7828189baf0d8c07b7" dependencies = [ "byteorder", "kvm-bindings", From ce3b3fa1b248e86ae1bccacc273e87427b4b18d2 Mon Sep 17 00:00:00 2001 From: Bo Chen Date: Fri, 27 Mar 2026 18:41:24 +0000 Subject: [PATCH 360/742] build: Enable the vfio CI worker This reverts commit 8aaf3734aadbf550ad7df06328f5c7964fc38220. Fixes: #7751 Signed-off-by: Saravanan D Signed-off-by: Bo Chen --- .github/workflows/integration-vfio.yaml | 33 +++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 .github/workflows/integration-vfio.yaml diff --git a/.github/workflows/integration-vfio.yaml b/.github/workflows/integration-vfio.yaml new file mode 100644 index 0000000000..218e897270 --- /dev/null +++ b/.github/workflows/integration-vfio.yaml @@ -0,0 +1,33 @@ +name: Cloud Hypervisor Tests (VFIO) +on: [merge_group, pull_request] +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event_name }} + cancel-in-progress: true + +jobs: + build: + name: Tests (VFIO) + runs-on: ${{ github.event_name == 'pull_request' && 'ubuntu-latest' || 'vfio-nvidia' }} + env: + AUTH_DOWNLOAD_TOKEN: ${{ secrets.AUTH_DOWNLOAD_TOKEN }} + steps: + - name: Fix workspace permissions + if: ${{ github.event_name != 'pull_request' }} + run: sudo chown -R runner:runner ${GITHUB_WORKSPACE} + - name: Code checkout + if: ${{ github.event_name != 'pull_request' }} + uses: actions/checkout@v6 + with: + fetch-depth: 0 + - name: Run VFIO integration tests + if: ${{ github.event_name != 'pull_request' }} + timeout-minutes: 15 + run: scripts/dev_cli.sh tests --integration-vfio + # Most tests are failing with 
musl see #6790 + # - name: Run VFIO integration tests for musl + # if: ${{ github.event_name != 'pull_request' }} + # timeout-minutes: 15 + # run: scripts/dev_cli.sh tests --integration-vfio --libc musl + - name: Skipping build for PR + if: ${{ github.event_name == 'pull_request' }} + run: echo "Skipping build for PR" From ea0a0393b50ac56e3a06217dab0f3f2a0d9e2a1b Mon Sep 17 00:00:00 2001 From: Bo Chen Date: Fri, 27 Mar 2026 18:53:31 +0000 Subject: [PATCH 361/742] build: Fix the vfio integration test workflow Switch the workspace ownership fix from 'runner' to 'github-runner' to match the new dedicated service account used on the self-hosted VFIO runner. Signed-off-by: Bo Chen --- .github/workflows/integration-vfio.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration-vfio.yaml b/.github/workflows/integration-vfio.yaml index 218e897270..19d5cadba6 100644 --- a/.github/workflows/integration-vfio.yaml +++ b/.github/workflows/integration-vfio.yaml @@ -13,7 +13,7 @@ jobs: steps: - name: Fix workspace permissions if: ${{ github.event_name != 'pull_request' }} - run: sudo chown -R runner:runner ${GITHUB_WORKSPACE} + run: sudo chown -R github-runner:github-runner ${GITHUB_WORKSPACE} - name: Code checkout if: ${{ github.event_name != 'pull_request' }} uses: actions/checkout@v6 From 9b9950335fcec2d8c555ebcf04b361d4d6cc8936 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Sat, 28 Mar 2026 12:55:09 -0400 Subject: [PATCH 362/742] ci: Run pull_request_target workflows with no permissions The MSHV tests need access to secrets so that they can run workloads in Azure. It does not need privileged access to GitHub. Ensure its GITHUB_TOKEN has no permissions.
Signed-off-by: Demi Marie Obenour --- .github/workflows/mshv-integration.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/mshv-integration.yaml b/.github/workflows/mshv-integration.yaml index 261b84d546..ad544249eb 100644 --- a/.github/workflows/mshv-integration.yaml +++ b/.github/workflows/mshv-integration.yaml @@ -1,5 +1,6 @@ name: Cloud Hypervisor Tests (MSHV) (x86_64) on: [pull_request_target, merge_group] +permissions: {} jobs: infra-setup: From cbe7018f84d4ac52c5cab1385d9a5d754cf5b788 Mon Sep 17 00:00:00 2001 From: Anirudh Rayabharam Date: Mon, 30 Mar 2026 13:49:08 +0000 Subject: [PATCH 363/742] vmm: cpu: fix broken URL Fix broken URL reported by Lychee. The binutils-gdb github repo no longer exists. Use the equivalent sourceware.org link. Signed-off-by: Anirudh Rayabharam --- vmm/src/cpu.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vmm/src/cpu.rs b/vmm/src/cpu.rs index e855204566..f8fff2b299 100644 --- a/vmm/src/cpu.rs +++ b/vmm/src/cpu.rs @@ -2762,7 +2762,7 @@ impl Debuggable for CpuManager { ]; // GDB exposes 32-bit eflags instead of 64-bit rflags. - // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml + // https://sourceware.org/git/?p=binutils-gdb.git;a=blob;f=gdb/features/i386/64bit-core.xml let eflags = gregs.get_rflags() as u32; let rip = gregs.get_rip(); From 823ab99e027a280685c398aabc29d78254b04d3c Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Sat, 28 Mar 2026 14:46:01 +0100 Subject: [PATCH 364/742] virtio-devices: block: Use error specific status in sync fallback path The sync fallback path in process_queue_submit() hardcoded VIRTIO_BLK_S_IOERR for all errors. This caused unsupported request errors to report IOERR to the guest instead of the correct VIRTIO_BLK_S_UNSUPP. Use ExecuteError::status() to return the appropriate status code for each error variant. 
Signed-off-by: Anatol Belski --- virtio-devices/src/block.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtio-devices/src/block.rs b/virtio-devices/src/block.rs index a38a378d1b..8f919e3e04 100644 --- a/virtio-devices/src/block.rs +++ b/virtio-devices/src/block.rs @@ -340,7 +340,7 @@ Setting device status to 'NEEDS_RESET' and stopping processing queues until rese Ok(_) => VIRTIO_BLK_S_OK, Err(e) => { warn!("Request failed: {request:x?} {e:?}"); - VIRTIO_BLK_S_IOERR + e.status() as u32 } }; From 6261cad0d8da1124e4460cbded8a259cecbb993f Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Tue, 24 Mar 2026 18:22:56 -0700 Subject: [PATCH 365/742] block: Add DiskFileError::Clone variant Add a Clone variant to DiskFileError for error handling in the upcoming AsyncDiskFile::try_clone() implementations. This variant will be used by RawFileDisk and RawFileDiskSync when cloning the underlying file descriptor fails. Signed-off-by: Muminul Islam --- block/src/async_io.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/src/async_io.rs b/block/src/async_io.rs index fe3349e6a3..2d8ea37878 100644 --- a/block/src/async_io.rs +++ b/block/src/async_io.rs @@ -24,6 +24,8 @@ pub enum DiskFileError { /// Resize failed #[error("Resize failed")] ResizeError(#[source] std::io::Error), + #[error("Failed cloning disk file")] + Clone(#[source] std::io::Error), } pub type DiskFileResult = std::result::Result; From 4586ca133a171ebfae8d0aa558827e0f5c69e9bf Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Tue, 24 Mar 2026 17:17:33 -0700 Subject: [PATCH 366/742] block: Derive Debug on RawFileDisk Add #[derive(Debug)] to RawFileDisk. This is required by the new disk_file traits which have Send + Debug bounds. 
Signed-off-by: Muminul Islam --- block/src/raw_async.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/block/src/raw_async.rs b/block/src/raw_async.rs index 4efff45e9c..b5ae8f50a3 100644 --- a/block/src/raw_async.rs +++ b/block/src/raw_async.rs @@ -18,6 +18,7 @@ use crate::{ BatchRequest, DiskTopology, RequestType, SECTOR_SIZE, probe_sparse_support, query_device_size, }; +#[derive(Debug)] pub struct RawFileDisk { file: File, } From 064720221590e0a94a68ef4bab78d8ddc3055feb Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Tue, 24 Mar 2026 17:24:02 -0700 Subject: [PATCH 367/742] block: Implement DiskSize trait for RawFileDisk Add disk_file::DiskSize trait implementation for RawFileDisk using BlockError and BlockResult. Uses metadata().len() instead of seek(SeekFrom::End(0)), taking &self instead of &mut self. Add BlockError, BlockErrorKind, BlockResult, and disk_file imports needed by this and subsequent trait impls. Signed-off-by: Muminul Islam --- block/src/raw_async.rs | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/block/src/raw_async.rs b/block/src/raw_async.rs index b5ae8f50a3..7dcda5bbdb 100644 --- a/block/src/raw_async.rs +++ b/block/src/raw_async.rs @@ -14,8 +14,10 @@ use vmm_sys_util::eventfd::EventFd; use crate::async_io::{ AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFile, DiskFileError, DiskFileResult, }; +use crate::error::{BlockError, BlockErrorKind, BlockResult}; use crate::{ - BatchRequest, DiskTopology, RequestType, SECTOR_SIZE, probe_sparse_support, query_device_size, + BatchRequest, DiskTopology, RequestType, SECTOR_SIZE, disk_file, probe_sparse_support, + query_device_size, }; #[derive(Debug)] @@ -72,6 +74,14 @@ impl DiskFile for RawFileDisk { } } +impl disk_file::DiskSize for RawFileDisk { + fn logical_size(&self) -> BlockResult { + query_device_size(&self.file) + .map(|(logical_size, _)| logical_size) + .map_err(|e| BlockError::new(BlockErrorKind::Io, DiskFileError::Size(e))) + } +} + pub struct 
RawFileAsync { fd: RawFd, io_uring: IoUring, From aa6a7aea0e6af7c84f9b9e72378e252f1ec20f57 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Tue, 24 Mar 2026 17:24:25 -0700 Subject: [PATCH 368/742] block: Implement PhysicalSize trait for RawFileDisk Add disk_file::PhysicalSize trait implementation for RawFileDisk. Returns metadata().len() wrapped in BlockError on failure, consistent with the DiskSize impl. Signed-off-by: Muminul Islam --- block/src/raw_async.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/block/src/raw_async.rs b/block/src/raw_async.rs index 7dcda5bbdb..477077effd 100644 --- a/block/src/raw_async.rs +++ b/block/src/raw_async.rs @@ -82,6 +82,14 @@ impl disk_file::DiskSize for RawFileDisk { } } +impl disk_file::PhysicalSize for RawFileDisk { + fn physical_size(&self) -> BlockResult { + query_device_size(&self.file) + .map(|(_, physical_size)| physical_size) + .map_err(|e| BlockError::new(BlockErrorKind::Io, DiskFileError::Size(e))) + } +} + pub struct RawFileAsync { fd: RawFd, io_uring: IoUring, From ca1f48a7179c86132a630fbdaadafe7f7e832903 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Tue, 24 Mar 2026 17:24:42 -0700 Subject: [PATCH 369/742] block: Implement DiskFd trait for RawFileDisk Add disk_file::DiskFd trait implementation for RawFileDisk. Delegates to file.as_raw_fd() via BorrowedDiskFd, taking &self instead of &mut self. 
Signed-off-by: Muminul Islam --- block/src/raw_async.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/block/src/raw_async.rs b/block/src/raw_async.rs index 477077effd..c08f501c4b 100644 --- a/block/src/raw_async.rs +++ b/block/src/raw_async.rs @@ -90,6 +90,12 @@ impl disk_file::PhysicalSize for RawFileDisk { } } +impl disk_file::DiskFd for RawFileDisk { + fn fd(&self) -> BorrowedDiskFd<'_> { + BorrowedDiskFd::new(self.file.as_raw_fd()) + } +} + pub struct RawFileAsync { fd: RawFd, io_uring: IoUring, From 7e514d8d0e8b099c2dab2ede0427ac38609475d3 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Tue, 24 Mar 2026 17:24:54 -0700 Subject: [PATCH 370/742] block: Implement Geometry trait for RawFileDisk Add disk_file::Geometry trait implementation for RawFileDisk. Probes disk topology from the file, falling back to defaults on failure. Takes &self instead of &mut self and uses unwrap_or_else for cleaner error handling. Signed-off-by: Muminul Islam --- block/src/raw_async.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/block/src/raw_async.rs b/block/src/raw_async.rs index c08f501c4b..e5ba96de31 100644 --- a/block/src/raw_async.rs +++ b/block/src/raw_async.rs @@ -96,6 +96,15 @@ impl disk_file::DiskFd for RawFileDisk { } } +impl disk_file::Geometry for RawFileDisk { + fn topology(&self) -> DiskTopology { + DiskTopology::probe(&self.file).unwrap_or_else(|_| { + warn!("Unable to get device topology. Using default topology"); + DiskTopology::default() + }) + } +} + pub struct RawFileAsync { fd: RawFd, io_uring: IoUring, From 899dec4c0433f98f039c30409efc7036b382c4d4 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Tue, 24 Mar 2026 17:25:05 -0700 Subject: [PATCH 371/742] block: Implement SparseCapable trait for RawFileDisk Add disk_file::SparseCapable trait implementation for RawFileDisk. Delegates to probe_sparse_support() to detect whether the underlying file supports hole-punching. 
Signed-off-by: Muminul Islam --- block/src/raw_async.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/block/src/raw_async.rs b/block/src/raw_async.rs index e5ba96de31..86f23b1fa9 100644 --- a/block/src/raw_async.rs +++ b/block/src/raw_async.rs @@ -105,6 +105,12 @@ impl disk_file::Geometry for RawFileDisk { } } +impl disk_file::SparseCapable for RawFileDisk { + fn supports_sparse_operations(&self) -> bool { + probe_sparse_support(&self.file) + } +} + pub struct RawFileAsync { fd: RawFd, io_uring: IoUring, From 64b65caddd78262252d186f7a23841e05fb1f180 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Tue, 24 Mar 2026 17:25:17 -0700 Subject: [PATCH 372/742] block: Implement Resizable trait for RawFileDisk Add disk_file::Resizable trait implementation for RawFileDisk. Calls file.set_len(size) and wraps the I/O error in BlockError on failure. Signed-off-by: Muminul Islam --- block/src/raw_async.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/block/src/raw_async.rs b/block/src/raw_async.rs index 86f23b1fa9..0dc0bff2f1 100644 --- a/block/src/raw_async.rs +++ b/block/src/raw_async.rs @@ -111,6 +111,14 @@ impl disk_file::SparseCapable for RawFileDisk { } } +impl disk_file::Resizable for RawFileDisk { + fn resize(&mut self, size: u64) -> BlockResult<()> { + self.file + .set_len(size) + .map_err(|e| BlockError::new(BlockErrorKind::Io, DiskFileError::ResizeError(e))) + } +} + pub struct RawFileAsync { fd: RawFd, io_uring: IoUring, From 607a2c769d97edde98da0ddd882dd3289591032a Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Tue, 24 Mar 2026 17:25:29 -0700 Subject: [PATCH 373/742] block: Implement DiskFile marker trait for RawFileDisk Add empty disk_file::DiskFile impl for RawFileDisk. This marker supertrait requires DiskSize + Geometry + Sync, all of which are now satisfied. 
Signed-off-by: Muminul Islam --- block/src/raw_async.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/src/raw_async.rs b/block/src/raw_async.rs index 0dc0bff2f1..153a0a94f1 100644 --- a/block/src/raw_async.rs +++ b/block/src/raw_async.rs @@ -119,6 +119,8 @@ impl disk_file::Resizable for RawFileDisk { } } +impl disk_file::DiskFile for RawFileDisk {} + pub struct RawFileAsync { fd: RawFd, io_uring: IoUring, From 491c5493c5c8dc72c8f972770940df3f7ddb1247 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Tue, 24 Mar 2026 17:25:44 -0700 Subject: [PATCH 374/742] block: Implement AsyncDiskFile trait for RawFileDisk Add disk_file::AsyncDiskFile trait implementation for RawFileDisk with try_clone() and new_async_io() methods. try_clone() duplicates the underlying file descriptor and wraps it in a new RawFileDisk. new_async_io() creates a RawFileAsync (io_uring) backend, wrapping errors in BlockError instead of DiskFileError. Signed-off-by: Muminul Islam --- block/src/raw_async.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/block/src/raw_async.rs b/block/src/raw_async.rs index 153a0a94f1..42309edda7 100644 --- a/block/src/raw_async.rs +++ b/block/src/raw_async.rs @@ -121,6 +121,24 @@ impl disk_file::Resizable for RawFileDisk { impl disk_file::DiskFile for RawFileDisk {} +impl disk_file::AsyncDiskFile for RawFileDisk { + fn try_clone(&self) -> BlockResult> { + let file = self + .file + .try_clone() + .map_err(|e| BlockError::new(BlockErrorKind::Io, DiskFileError::Clone(e)))?; + Ok(Box::new(RawFileDisk { file })) + } + + fn new_async_io(&self, ring_depth: u32) -> BlockResult> { + let mut raw = RawFileAsync::new(self.file.as_raw_fd(), ring_depth) + .map_err(|e| BlockError::new(BlockErrorKind::Io, DiskFileError::NewAsyncIo(e)))?; + raw.alignment = + DiskTopology::probe(&self.file).map_or(SECTOR_SIZE, |t| t.logical_block_size); + Ok(Box::new(raw) as Box) + } +} + pub struct RawFileAsync { fd: RawFd, io_uring: IoUring, From 
573f3af77d1f049f382f694328ba0f15ace72434 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Tue, 24 Mar 2026 18:15:00 -0700 Subject: [PATCH 375/742] vmm: Switch RawFileDisk to DiskBackend::Next Update device_manager.rs to construct DiskBackend::Next instead of DiskBackend::Legacy for the io_uring raw backend. Signed-off-by: Muminul Islam --- block/src/raw_async.rs | 47 +-------------------------------------- vmm/src/device_manager.rs | 4 +--- 2 files changed, 2 insertions(+), 49 deletions(-) diff --git a/block/src/raw_async.rs b/block/src/raw_async.rs index 42309edda7..90332aa4b8 100644 --- a/block/src/raw_async.rs +++ b/block/src/raw_async.rs @@ -11,9 +11,7 @@ use libc::{FALLOC_FL_KEEP_SIZE, FALLOC_FL_PUNCH_HOLE, FALLOC_FL_ZERO_RANGE}; use log::warn; use vmm_sys_util::eventfd::EventFd; -use crate::async_io::{ - AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFile, DiskFileError, DiskFileResult, -}; +use crate::async_io::{AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFileError}; use crate::error::{BlockError, BlockErrorKind, BlockResult}; use crate::{ BatchRequest, DiskTopology, RequestType, SECTOR_SIZE, disk_file, probe_sparse_support, @@ -31,49 +29,6 @@ impl RawFileDisk { } } -impl DiskFile for RawFileDisk { - fn logical_size(&mut self) -> DiskFileResult { - Ok(query_device_size(&self.file) - .map_err(DiskFileError::Size)? - .0) - } - - fn physical_size(&mut self) -> DiskFileResult { - Ok(query_device_size(&self.file) - .map_err(DiskFileError::Size)? - .1) - } - - fn new_async_io(&self, ring_depth: u32) -> DiskFileResult> { - let mut raw = RawFileAsync::new(self.file.as_raw_fd(), ring_depth) - .map_err(DiskFileError::NewAsyncIo)?; - raw.alignment = - DiskTopology::probe(&self.file).map_or(SECTOR_SIZE, |t| t.logical_block_size); - Ok(Box::new(raw) as Box) - } - - fn topology(&mut self) -> DiskTopology { - if let Ok(topology) = DiskTopology::probe(&self.file) { - topology - } else { - warn!("Unable to get device topology. 
Using default topology"); - DiskTopology::default() - } - } - - fn resize(&mut self, size: u64) -> DiskFileResult<()> { - self.file.set_len(size).map_err(DiskFileError::ResizeError) - } - - fn supports_sparse_operations(&self) -> bool { - probe_sparse_support(&self.file) - } - - fn fd(&mut self) -> BorrowedDiskFd<'_> { - BorrowedDiskFd::new(self.file.as_raw_fd()) - } -} - impl disk_file::DiskSize for RawFileDisk { fn logical_size(&self) -> BlockResult { query_device_size(&self.file) diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 95814cc92d..080fe9a3d6 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -2756,9 +2756,7 @@ impl DeviceManager { unreachable!("Checked in if statement above"); #[cfg(feature = "io_uring")] { - DiskBackend::Legacy( - Box::new(RawFileDisk::new(file)) as Box - ) + DiskBackend::Next(Box::new(RawFileDisk::new(file))) } } else if !disk_cfg.disable_aio && self.aio_is_supported() { info!("Using asynchronous RAW disk file (aio)"); From 98fedac77f4c66caf5cf5f7b109741f3b6ff3acc Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Mon, 23 Mar 2026 13:31:50 -0700 Subject: [PATCH 376/742] block: Derive Debug on RawFileDiskSync Add #[derive(Debug)] to RawFileDiskSync. This is required by the new disk_file traits which have Send + Debug bounds. 
Signed-off-by: Muminul Islam --- block/src/raw_sync.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/block/src/raw_sync.rs b/block/src/raw_sync.rs index c045c5942d..0051c1b677 100644 --- a/block/src/raw_sync.rs +++ b/block/src/raw_sync.rs @@ -15,6 +15,7 @@ use crate::async_io::{ }; use crate::{DiskTopology, SECTOR_SIZE, probe_sparse_support, query_device_size}; +#[derive(Debug)] pub struct RawFileDiskSync { file: File, } From 00a355d273cf8edd767dabb51d5d4bfa684c424a Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Tue, 24 Mar 2026 17:34:57 -0700 Subject: [PATCH 377/742] block: Implement DiskSize trait for RawFileDiskSync Add disk_file::DiskSize trait implementation for RawFileDiskSync using BlockError and BlockResult. Uses metadata().len() instead of seek(SeekFrom::End(0)), taking &self instead of &mut self. Add BlockError, BlockErrorKind, BlockResult, and disk_file imports needed by this and subsequent trait impls. Signed-off-by: Muminul Islam --- block/src/raw_sync.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/block/src/raw_sync.rs b/block/src/raw_sync.rs index 0051c1b677..23f2d10e54 100644 --- a/block/src/raw_sync.rs +++ b/block/src/raw_sync.rs @@ -13,7 +13,8 @@ use vmm_sys_util::eventfd::EventFd; use crate::async_io::{ AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFile, DiskFileError, DiskFileResult, }; -use crate::{DiskTopology, SECTOR_SIZE, probe_sparse_support, query_device_size}; +use crate::error::{BlockError, BlockErrorKind, BlockResult}; +use crate::{DiskTopology, SECTOR_SIZE, disk_file, probe_sparse_support, query_device_size}; #[derive(Debug)] pub struct RawFileDiskSync { @@ -64,6 +65,14 @@ impl DiskFile for RawFileDiskSync { } } +impl disk_file::DiskSize for RawFileDiskSync { + fn logical_size(&self) -> BlockResult { + query_device_size(&self.file) + .map(|(logical_size, _)| logical_size) + .map_err(|e| BlockError::new(BlockErrorKind::Io, DiskFileError::Size(e))) + } +} + pub struct RawFileSync { 
fd: RawFd, eventfd: EventFd, From 744c365faab91fb7ea1c6cd83b6b63703b2b5491 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Tue, 24 Mar 2026 17:35:15 -0700 Subject: [PATCH 378/742] block: Implement PhysicalSize trait for RawFileDiskSync Add disk_file::PhysicalSize trait implementation for RawFileDiskSync. Returns metadata().len() wrapped in BlockError on failure, consistent with the DiskSize impl. Signed-off-by: Muminul Islam --- block/src/raw_sync.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/block/src/raw_sync.rs b/block/src/raw_sync.rs index 23f2d10e54..35eb2fbf44 100644 --- a/block/src/raw_sync.rs +++ b/block/src/raw_sync.rs @@ -73,6 +73,14 @@ impl disk_file::DiskSize for RawFileDiskSync { } } +impl disk_file::PhysicalSize for RawFileDiskSync { + fn physical_size(&self) -> BlockResult { + query_device_size(&self.file) + .map(|(_, physical_size)| physical_size) + .map_err(|e| BlockError::new(BlockErrorKind::Io, DiskFileError::Size(e))) + } +} + pub struct RawFileSync { fd: RawFd, eventfd: EventFd, From bd026ce4cb8cc5b6a75d9683fe98cea6760dd50d Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Tue, 24 Mar 2026 17:35:27 -0700 Subject: [PATCH 379/742] block: Implement DiskFd trait for RawFileDiskSync Add disk_file::DiskFd trait implementation for RawFileDiskSync. Delegates to file.as_raw_fd() via BorrowedDiskFd, taking &self instead of &mut self. 
Signed-off-by: Muminul Islam --- block/src/raw_sync.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/block/src/raw_sync.rs b/block/src/raw_sync.rs index 35eb2fbf44..46488ff79f 100644 --- a/block/src/raw_sync.rs +++ b/block/src/raw_sync.rs @@ -81,6 +81,12 @@ impl disk_file::PhysicalSize for RawFileDiskSync { } } +impl disk_file::DiskFd for RawFileDiskSync { + fn fd(&self) -> BorrowedDiskFd<'_> { + BorrowedDiskFd::new(self.file.as_raw_fd()) + } +} + pub struct RawFileSync { fd: RawFd, eventfd: EventFd, From aa5aa6fe89fe38597c85084c4529ce5c6d6b782c Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Tue, 24 Mar 2026 17:35:39 -0700 Subject: [PATCH 380/742] block: Implement Geometry trait for RawFileDiskSync Add disk_file::Geometry trait implementation for RawFileDiskSync. Probes disk topology from the file, falling back to defaults on failure. Takes &self instead of &mut self and uses unwrap_or_else for cleaner error handling. Signed-off-by: Muminul Islam --- block/src/raw_sync.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/block/src/raw_sync.rs b/block/src/raw_sync.rs index 46488ff79f..c6f30f53e8 100644 --- a/block/src/raw_sync.rs +++ b/block/src/raw_sync.rs @@ -87,6 +87,15 @@ impl disk_file::DiskFd for RawFileDiskSync { } } +impl disk_file::Geometry for RawFileDiskSync { + fn topology(&self) -> DiskTopology { + DiskTopology::probe(&self.file).unwrap_or_else(|_| { + warn!("Unable to get device topology. Using default topology"); + DiskTopology::default() + }) + } +} + pub struct RawFileSync { fd: RawFd, eventfd: EventFd, From fe39929a2ad751a7985828bbb240c4a9d8b7105e Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Tue, 24 Mar 2026 17:35:52 -0700 Subject: [PATCH 381/742] block: Implement SparseCapable trait for RawFileDiskSync Add disk_file::SparseCapable trait implementation for RawFileDiskSync. Delegates to probe_sparse_support() to detect whether the underlying file supports hole-punching. 
Signed-off-by: Muminul Islam --- block/src/raw_sync.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/block/src/raw_sync.rs b/block/src/raw_sync.rs index c6f30f53e8..5a6a65b361 100644 --- a/block/src/raw_sync.rs +++ b/block/src/raw_sync.rs @@ -96,6 +96,12 @@ impl disk_file::Geometry for RawFileDiskSync { } } +impl disk_file::SparseCapable for RawFileDiskSync { + fn supports_sparse_operations(&self) -> bool { + probe_sparse_support(&self.file) + } +} + pub struct RawFileSync { fd: RawFd, eventfd: EventFd, From 6b14d27f305ab4ba7aef6be082d90e14a8983d17 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Tue, 24 Mar 2026 17:36:03 -0700 Subject: [PATCH 382/742] block: Implement Resizable trait for RawFileDiskSync Add disk_file::Resizable trait implementation for RawFileDiskSync. Calls file.set_len(size) and wraps the I/O error in BlockError on failure. Signed-off-by: Muminul Islam --- block/src/raw_sync.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/block/src/raw_sync.rs b/block/src/raw_sync.rs index 5a6a65b361..59d842debc 100644 --- a/block/src/raw_sync.rs +++ b/block/src/raw_sync.rs @@ -102,6 +102,14 @@ impl disk_file::SparseCapable for RawFileDiskSync { } } +impl disk_file::Resizable for RawFileDiskSync { + fn resize(&mut self, size: u64) -> BlockResult<()> { + self.file + .set_len(size) + .map_err(|e| BlockError::new(BlockErrorKind::Io, DiskFileError::ResizeError(e))) + } +} + pub struct RawFileSync { fd: RawFd, eventfd: EventFd, From da72a3abfb0e3125aa51cb0584dd4d1023b75be5 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Tue, 24 Mar 2026 17:36:15 -0700 Subject: [PATCH 383/742] block: Implement DiskFile marker trait for RawFileDiskSync Add empty disk_file::DiskFile impl for RawFileDiskSync. This marker supertrait requires DiskSize + Geometry + Sync, all of which are now satisfied. 
Signed-off-by: Muminul Islam --- block/src/raw_sync.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/src/raw_sync.rs b/block/src/raw_sync.rs index 59d842debc..d25709d800 100644 --- a/block/src/raw_sync.rs +++ b/block/src/raw_sync.rs @@ -110,6 +110,8 @@ impl disk_file::Resizable for RawFileDiskSync { } } +impl disk_file::DiskFile for RawFileDiskSync {} + pub struct RawFileSync { fd: RawFd, eventfd: EventFd, From 662d350cf9c086f043d103a4a72a52c24b26ee86 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Tue, 24 Mar 2026 17:36:39 -0700 Subject: [PATCH 384/742] block: Implement AsyncDiskFile trait for RawFileDiskSync Add disk_file::AsyncDiskFile trait implementation for RawFileDiskSync with try_clone() and new_async_io() methods. try_clone() duplicates the underlying file descriptor and wraps it in a new RawFileDiskSync. new_async_io() creates a RawFileSync (synchronous fallback) backend, wrapping errors in BlockError instead of DiskFileError. Add DiskFileError::Clone variant in async_io.rs for the try_clone() error path. 
Signed-off-by: Muminul Islam --- block/src/raw_sync.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/block/src/raw_sync.rs b/block/src/raw_sync.rs index d25709d800..0e95fa8e2a 100644 --- a/block/src/raw_sync.rs +++ b/block/src/raw_sync.rs @@ -112,6 +112,23 @@ impl disk_file::Resizable for RawFileDiskSync { impl disk_file::DiskFile for RawFileDiskSync {} +impl disk_file::AsyncDiskFile for RawFileDiskSync { + fn try_clone(&self) -> BlockResult> { + let file = self + .file + .try_clone() + .map_err(|e| BlockError::new(BlockErrorKind::Io, DiskFileError::Clone(e)))?; + Ok(Box::new(RawFileDiskSync { file })) + } + + fn new_async_io(&self, _ring_depth: u32) -> BlockResult> { + let mut raw = RawFileSync::new(self.file.as_raw_fd()); + raw.alignment = + DiskTopology::probe(&self.file).map_or(SECTOR_SIZE, |t| t.logical_block_size); + Ok(Box::new(raw) as Box) + } +} + pub struct RawFileSync { fd: RawFd, eventfd: EventFd, From afd018f9d1e4a769dc73f3333d82eb3b822b8630 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Tue, 24 Mar 2026 17:32:30 -0700 Subject: [PATCH 385/742] fuzz: Switch RawFileDiskSync to DiskBackend::Next Update fuzz/block.rs to construct DiskBackend::Next instead of DiskBackend::Legacy for the synchronous raw backend. Remove the unused async_io::DiskFile import. 
Signed-off-by: Muminul Islam --- fuzz/fuzz_targets/block.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fuzz/fuzz_targets/block.rs b/fuzz/fuzz_targets/block.rs index 35d59c9850..abddc27b41 100644 --- a/fuzz/fuzz_targets/block.rs +++ b/fuzz/fuzz_targets/block.rs @@ -15,7 +15,6 @@ use std::path::PathBuf; use std::sync::Arc; use std::{ffi, io}; -use block::async_io::DiskFile; use block::disk_file::DiskBackend; use block::fcntl::LockGranularityChoice; use block::raw_sync::RawFileDiskSync; @@ -53,11 +52,10 @@ fuzz_target!(|bytes: &[u8]| -> Corpus { // Create a virtio-block device backed by a synchronous raw file let shm = memfd_create(&ffi::CString::new("fuzz").unwrap(), 0).unwrap(); let disk_file: File = unsafe { File::from_raw_fd(shm) }; - let qcow_disk = Box::new(RawFileDiskSync::new(disk_file)) as Box; let queue_affinity = BTreeMap::new(); let mut block = Block::new( "tmp".to_owned(), - DiskBackend::Legacy(qcow_disk), + DiskBackend::Next(Box::new(RawFileDiskSync::new(disk_file))), PathBuf::from(""), false, false, From 4f44cd9ed3c06a345f0f87bd969fb755b5ca19bb Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Tue, 24 Mar 2026 17:32:39 -0700 Subject: [PATCH 386/742] vmm: Switch RawFileDiskSync to DiskBackend::Next Update device_manager.rs to construct DiskBackend::Next instead of DiskBackend::Legacy for the synchronous raw backend. 
Signed-off-by: Muminul Islam --- vmm/src/device_manager.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 080fe9a3d6..342f99b373 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -2763,9 +2763,7 @@ impl DeviceManager { DiskBackend::Legacy(Box::new(RawFileDiskAio::new(file)) as Box) } else { info!("Using synchronous RAW disk file"); - DiskBackend::Legacy( - Box::new(RawFileDiskSync::new(file)) as Box - ) + DiskBackend::Next(Box::new(RawFileDiskSync::new(file))) } } ImageType::Qcow2 => { From 38eb10d209544156e67e03f52346211359f0d49a Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Tue, 24 Mar 2026 17:32:20 -0700 Subject: [PATCH 387/742] block: Remove legacy DiskFile impl from RawFileDiskSync Remove the legacy async_io::DiskFile implementation from RawFileDiskSync now that the new disk_file trait impls are in place. Remove unused imports: Seek, SeekFrom, DiskFile, and DiskFileResult. 
Signed-off-by: Muminul Islam --- block/src/raw_sync.rs | 42 +----------------------------------------- 1 file changed, 1 insertion(+), 41 deletions(-) diff --git a/block/src/raw_sync.rs b/block/src/raw_sync.rs index 0e95fa8e2a..491ef0563d 100644 --- a/block/src/raw_sync.rs +++ b/block/src/raw_sync.rs @@ -10,9 +10,7 @@ use libc::{FALLOC_FL_KEEP_SIZE, FALLOC_FL_PUNCH_HOLE, FALLOC_FL_ZERO_RANGE}; use log::warn; use vmm_sys_util::eventfd::EventFd; -use crate::async_io::{ - AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFile, DiskFileError, DiskFileResult, -}; +use crate::async_io::{AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFileError}; use crate::error::{BlockError, BlockErrorKind, BlockResult}; use crate::{DiskTopology, SECTOR_SIZE, disk_file, probe_sparse_support, query_device_size}; @@ -27,44 +25,6 @@ impl RawFileDiskSync { } } -impl DiskFile for RawFileDiskSync { - fn logical_size(&mut self) -> DiskFileResult { - Ok(query_device_size(&self.file) - .map_err(DiskFileError::Size)? - .0) - } - - fn physical_size(&mut self) -> DiskFileResult { - Ok(query_device_size(&self.file) - .map_err(DiskFileError::Size)? - .1) - } - - fn new_async_io(&self, _ring_depth: u32) -> DiskFileResult> { - let mut raw = RawFileSync::new(self.file.as_raw_fd()); - raw.alignment = - DiskTopology::probe(&self.file).map_or(SECTOR_SIZE, |t| t.logical_block_size); - Ok(Box::new(raw) as Box) - } - - fn topology(&mut self) -> DiskTopology { - if let Ok(topology) = DiskTopology::probe(&self.file) { - topology - } else { - warn!("Unable to get device topology. 
Using default topology"); - DiskTopology::default() - } - } - - fn supports_sparse_operations(&self) -> bool { - probe_sparse_support(&self.file) - } - - fn fd(&mut self) -> BorrowedDiskFd<'_> { - BorrowedDiskFd::new(self.file.as_raw_fd()) - } -} - impl disk_file::DiskSize for RawFileDiskSync { fn logical_size(&self) -> BlockResult { query_device_size(&self.file) From 15073edf081c03949344526ba1a15a8f7c6f16c9 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Tue, 24 Mar 2026 17:01:55 +0100 Subject: [PATCH 388/742] block: vhd: Switch FixedVhdDiskAsync::new to BlockResult Map FixedVhd::new io::Error to BlockError with ErrorOp::Open. Update vmm CreateFixedVhdDiskAsync source type accordingly. Signed-off-by: Anatol Belski --- block/src/fixed_vhd_async.rs | 7 +++++-- vmm/src/device_manager.rs | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/block/src/fixed_vhd_async.rs b/block/src/fixed_vhd_async.rs index 6e858f74a5..141435420c 100644 --- a/block/src/fixed_vhd_async.rs +++ b/block/src/fixed_vhd_async.rs @@ -10,6 +10,7 @@ use vmm_sys_util::eventfd::EventFd; use crate::async_io::{ AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFile, DiskFileError, DiskFileResult, }; +use crate::error::{BlockError, BlockResult, ErrorOp}; use crate::fixed_vhd::FixedVhd; use crate::raw_async::RawFileAsync; use crate::{BatchRequest, BlockBackend}; @@ -17,8 +18,10 @@ use crate::{BatchRequest, BlockBackend}; pub struct FixedVhdDiskAsync(FixedVhd); impl FixedVhdDiskAsync { - pub fn new(file: File) -> std::io::Result { - Ok(Self(FixedVhd::new(file)?)) + pub fn new(file: File) -> BlockResult { + Ok(Self( + FixedVhd::new(file).map_err(|e| BlockError::from(e).with_op(ErrorOp::Open))?, + )) } } diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 342f99b373..26bc24dbc0 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -569,7 +569,7 @@ pub enum DeviceManagerError { /// Failed to create FixedVhdDiskAsync #[error("Failed to 
create FixedVhdDiskAsync")] - CreateFixedVhdDiskAsync(#[source] io::Error), + CreateFixedVhdDiskAsync(#[source] BlockError), /// Failed to create FixedVhdDiskSync #[error("Failed to create FixedVhdDiskSync")] From d82b1101d9099867a1dad0778ff20e9677c9cb7b Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Tue, 24 Mar 2026 17:04:53 +0100 Subject: [PATCH 389/742] block: vhd: impl DiskSize for FixedVhdDiskAsync Delegate to infallible FixedVhd::logical_size(). Signed-off-by: Anatol Belski --- block/src/fixed_vhd_async.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/block/src/fixed_vhd_async.rs b/block/src/fixed_vhd_async.rs index 141435420c..5f4c6d842d 100644 --- a/block/src/fixed_vhd_async.rs +++ b/block/src/fixed_vhd_async.rs @@ -13,8 +13,9 @@ use crate::async_io::{ use crate::error::{BlockError, BlockResult, ErrorOp}; use crate::fixed_vhd::FixedVhd; use crate::raw_async::RawFileAsync; -use crate::{BatchRequest, BlockBackend}; +use crate::{BatchRequest, BlockBackend, disk_file}; +#[derive(Debug)] pub struct FixedVhdDiskAsync(FixedVhd); impl FixedVhdDiskAsync { @@ -50,6 +51,12 @@ impl DiskFile for FixedVhdDiskAsync { } } +impl disk_file::DiskSize for FixedVhdDiskAsync { + fn logical_size(&self) -> BlockResult { + Ok(self.0.logical_size().unwrap()) + } +} + pub struct FixedVhdAsync { raw_file_async: RawFileAsync, size: u64, From 42af5913d95d60152ea53bd1b2db2610f58d01bb Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Tue, 24 Mar 2026 17:05:38 +0100 Subject: [PATCH 390/742] block: vhd: impl PhysicalSize for FixedVhdDiskAsync Fix .unwrap() bug: use explicit GetFileMetadata match. 
Signed-off-by: Anatol Belski --- block/src/fixed_vhd_async.rs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/block/src/fixed_vhd_async.rs b/block/src/fixed_vhd_async.rs index 5f4c6d842d..5b15d52076 100644 --- a/block/src/fixed_vhd_async.rs +++ b/block/src/fixed_vhd_async.rs @@ -10,7 +10,7 @@ use vmm_sys_util::eventfd::EventFd; use crate::async_io::{ AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFile, DiskFileError, DiskFileResult, }; -use crate::error::{BlockError, BlockResult, ErrorOp}; +use crate::error::{BlockError, BlockErrorKind, BlockResult, ErrorOp}; use crate::fixed_vhd::FixedVhd; use crate::raw_async::RawFileAsync; use crate::{BatchRequest, BlockBackend, disk_file}; @@ -57,6 +57,17 @@ impl disk_file::DiskSize for FixedVhdDiskAsync { } } +impl disk_file::PhysicalSize for FixedVhdDiskAsync { + fn physical_size(&self) -> BlockResult { + self.0.physical_size().map_err(|e| match e { + crate::Error::GetFileMetadata(io) => { + BlockError::new(BlockErrorKind::Io, crate::Error::GetFileMetadata(io)) + } + _ => BlockError::new(BlockErrorKind::Io, e), + }) + } +} + pub struct FixedVhdAsync { raw_file_async: RawFileAsync, size: u64, From 2973987c811240b1e78844da8863099fe33e2936 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Tue, 24 Mar 2026 17:06:38 +0100 Subject: [PATCH 391/742] block: vhd: impl DiskFd for FixedVhdDiskAsync Delegate to FixedVhd::as_raw_fd(). 
Signed-off-by: Anatol Belski --- block/src/fixed_vhd_async.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/block/src/fixed_vhd_async.rs b/block/src/fixed_vhd_async.rs index 5b15d52076..b5b622e7ee 100644 --- a/block/src/fixed_vhd_async.rs +++ b/block/src/fixed_vhd_async.rs @@ -68,6 +68,12 @@ impl disk_file::PhysicalSize for FixedVhdDiskAsync { } } +impl disk_file::DiskFd for FixedVhdDiskAsync { + fn fd(&self) -> BorrowedDiskFd<'_> { + BorrowedDiskFd::new(self.0.as_raw_fd()) + } +} + pub struct FixedVhdAsync { raw_file_async: RawFileAsync, size: u64, From 1db64f511efb8e06e6734860e6716408317cb865 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Tue, 24 Mar 2026 17:09:05 +0100 Subject: [PATCH 392/742] block: vhd: impl Geometry for FixedVhdDiskAsync Use default DiskTopology with 512 byte sectors. Signed-off-by: Anatol Belski --- block/src/fixed_vhd_async.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/src/fixed_vhd_async.rs b/block/src/fixed_vhd_async.rs index b5b622e7ee..3ce44e8dbd 100644 --- a/block/src/fixed_vhd_async.rs +++ b/block/src/fixed_vhd_async.rs @@ -74,6 +74,8 @@ impl disk_file::DiskFd for FixedVhdDiskAsync { } } +impl disk_file::Geometry for FixedVhdDiskAsync {} + pub struct FixedVhdAsync { raw_file_async: RawFileAsync, size: u64, From 0dece29fa8b33008ecf6bce916bc73f99bd4815a Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Tue, 24 Mar 2026 17:09:47 +0100 Subject: [PATCH 393/742] block: vhd: impl SparseCapable for FixedVhdDiskAsync Fixed VHD does not support sparse operations. 
Signed-off-by: Anatol Belski --- block/src/fixed_vhd_async.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/src/fixed_vhd_async.rs b/block/src/fixed_vhd_async.rs index 3ce44e8dbd..bba4f07de2 100644 --- a/block/src/fixed_vhd_async.rs +++ b/block/src/fixed_vhd_async.rs @@ -76,6 +76,8 @@ impl disk_file::DiskFd for FixedVhdDiskAsync { impl disk_file::Geometry for FixedVhdDiskAsync {} +impl disk_file::SparseCapable for FixedVhdDiskAsync {} + pub struct FixedVhdAsync { raw_file_async: RawFileAsync, size: u64, From df5d2d64b228c9a52a23713e10968d86463b3bf8 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Tue, 24 Mar 2026 17:10:08 +0100 Subject: [PATCH 394/742] block: vhd: impl Resizable for FixedVhdDiskAsync Fixed VHD does not support resize, return UnsupportedFeature. Signed-off-by: Anatol Belski --- block/src/fixed_vhd_async.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/block/src/fixed_vhd_async.rs b/block/src/fixed_vhd_async.rs index bba4f07de2..c265636c8d 100644 --- a/block/src/fixed_vhd_async.rs +++ b/block/src/fixed_vhd_async.rs @@ -78,6 +78,16 @@ impl disk_file::Geometry for FixedVhdDiskAsync {} impl disk_file::SparseCapable for FixedVhdDiskAsync {} +impl disk_file::Resizable for FixedVhdDiskAsync { + fn resize(&mut self, _size: u64) -> BlockResult<()> { + Err(BlockError::new( + BlockErrorKind::UnsupportedFeature, + DiskFileError::ResizeError(std::io::Error::other("resize not supported for fixed VHD")), + ) + .with_op(ErrorOp::Resize)) + } +} + pub struct FixedVhdAsync { raw_file_async: RawFileAsync, size: u64, From be6ce5b87858337d47afcf7449430514b023d967 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Tue, 24 Mar 2026 17:10:21 +0100 Subject: [PATCH 395/742] block: vhd: impl DiskFile for FixedVhdDiskAsync Marker impl bundling DiskSize and Geometry supertraits. 
Signed-off-by: Anatol Belski --- block/src/fixed_vhd_async.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/src/fixed_vhd_async.rs b/block/src/fixed_vhd_async.rs index c265636c8d..8f6b37db19 100644 --- a/block/src/fixed_vhd_async.rs +++ b/block/src/fixed_vhd_async.rs @@ -88,6 +88,8 @@ impl disk_file::Resizable for FixedVhdDiskAsync { } } +impl disk_file::DiskFile for FixedVhdDiskAsync {} + pub struct FixedVhdAsync { raw_file_async: RawFileAsync, size: u64, From c0db1f61ac82ed7f16e5bdf11f3cca2b860e4497 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Tue, 24 Mar 2026 17:11:02 +0100 Subject: [PATCH 396/742] block: vhd: impl AsyncDiskFile for FixedVhdDiskAsync Delegate try_clone() to FixedVhd::clone() and new_async_io() to FixedVhdAsync, preserving DiskFileError::NewAsyncIo. Signed-off-by: Anatol Belski --- block/src/fixed_vhd_async.rs | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/block/src/fixed_vhd_async.rs b/block/src/fixed_vhd_async.rs index 8f6b37db19..9b768f80c6 100644 --- a/block/src/fixed_vhd_async.rs +++ b/block/src/fixed_vhd_async.rs @@ -90,6 +90,26 @@ impl disk_file::Resizable for FixedVhdDiskAsync { impl disk_file::DiskFile for FixedVhdDiskAsync {} +impl disk_file::AsyncDiskFile for FixedVhdDiskAsync { + fn try_clone(&self) -> BlockResult> { + Ok(Box::new(FixedVhdDiskAsync(self.0.clone()))) + } + + fn new_async_io(&self, ring_depth: u32) -> BlockResult> { + Ok(Box::new( + FixedVhdAsync::new( + self.0.as_raw_fd(), + ring_depth, + self.0.logical_size().unwrap(), + ) + .map_err(|e| { + BlockError::new(BlockErrorKind::Io, DiskFileError::NewAsyncIo(e)) + .with_op(ErrorOp::Open) + })?, + )) + } +} + pub struct FixedVhdAsync { raw_file_async: RawFileAsync, size: u64, From b69bd219fa6284aefc341cf97b184097abdde875 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Tue, 24 Mar 2026 17:11:36 +0100 Subject: [PATCH 397/742] vmm: Switch fixed VHD async to DiskBackend::Next Wire FixedVhdDiskAsync through the new composable trait 
system. Signed-off-by: Anatol Belski --- vmm/src/device_manager.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 26bc24dbc0..4699d7231d 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -2722,10 +2722,10 @@ impl DeviceManager { unreachable!("Checked in if statement above"); #[cfg(feature = "io_uring")] { - DiskBackend::Legacy(Box::new( + DiskBackend::Next(Box::new( FixedVhdDiskAsync::new(file) .map_err(DeviceManagerError::CreateFixedVhdDiskAsync)?, - ) as Box) + )) } } else { info!("Using synchronous fixed VHD disk file"); From 019aa528304ca70c979db343e45b43fce4f08cc4 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Tue, 24 Mar 2026 17:12:14 +0100 Subject: [PATCH 398/742] block: vhd: Remove legacy async_io::DiskFile impl from FixedVhdDiskAsync No remaining consumers after switching to DiskBackend::Next. Signed-off-by: Anatol Belski --- block/src/fixed_vhd_async.rs | 29 +---------------------------- 1 file changed, 1 insertion(+), 28 deletions(-) diff --git a/block/src/fixed_vhd_async.rs b/block/src/fixed_vhd_async.rs index 9b768f80c6..699fb2a494 100644 --- a/block/src/fixed_vhd_async.rs +++ b/block/src/fixed_vhd_async.rs @@ -7,9 +7,7 @@ use std::os::unix::io::{AsRawFd, RawFd}; use vmm_sys_util::eventfd::EventFd; -use crate::async_io::{ - AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFile, DiskFileError, DiskFileResult, -}; +use crate::async_io::{AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFileError}; use crate::error::{BlockError, BlockErrorKind, BlockResult, ErrorOp}; use crate::fixed_vhd::FixedVhd; use crate::raw_async::RawFileAsync; @@ -26,31 +24,6 @@ impl FixedVhdDiskAsync { } } -impl DiskFile for FixedVhdDiskAsync { - fn logical_size(&mut self) -> DiskFileResult { - Ok(self.0.logical_size().unwrap()) - } - - fn physical_size(&mut self) -> DiskFileResult { - Ok(self.0.physical_size().unwrap()) - } - - fn new_async_io(&self, 
ring_depth: u32) -> DiskFileResult> { - Ok(Box::new( - FixedVhdAsync::new( - self.0.as_raw_fd(), - ring_depth, - self.0.logical_size().unwrap(), - ) - .map_err(DiskFileError::NewAsyncIo)?, - ) as Box) - } - - fn fd(&mut self) -> BorrowedDiskFd<'_> { - BorrowedDiskFd::new(self.0.as_raw_fd()) - } -} - impl disk_file::DiskSize for FixedVhdDiskAsync { fn logical_size(&self) -> BlockResult { Ok(self.0.logical_size().unwrap()) From b8b32f59275b8123e2ff70afd1efd91fbc9526c0 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 25 Mar 2026 15:19:19 +0100 Subject: [PATCH 399/742] block: vhdx: Switch VhdxDiskSync::new to BlockResult Wrap VhdxError via BlockError::new(Io, e).with_op(Open). Update VMM CreateFixedVhdxDiskSync error variant from VhdxError to BlockError. Signed-off-by: Anatol Belski --- block/src/vhdx_sync.rs | 9 ++++++--- vmm/src/device_manager.rs | 4 ++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/block/src/vhdx_sync.rs b/block/src/vhdx_sync.rs index 0a0dc47bc2..a99f4f6f9c 100644 --- a/block/src/vhdx_sync.rs +++ b/block/src/vhdx_sync.rs @@ -12,7 +12,8 @@ use vmm_sys_util::eventfd::EventFd; use crate::async_io::{ AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFile, DiskFileError, DiskFileResult, }; -use crate::vhdx::{Result as VhdxResult, Vhdx}; +use crate::error::{BlockError, BlockErrorKind, BlockResult, ErrorOp}; +use crate::vhdx::Vhdx; use crate::{AsyncAdaptor, BlockBackend, Error}; pub struct VhdxDiskSync { @@ -28,9 +29,11 @@ pub struct VhdxDiskSync { } impl VhdxDiskSync { - pub fn new(f: File) -> VhdxResult { + pub fn new(f: File) -> BlockResult { Ok(VhdxDiskSync { - vhdx_file: Arc::new(Mutex::new(Vhdx::new(f)?)), + vhdx_file: Arc::new(Mutex::new(Vhdx::new(f).map_err(|e| { + BlockError::new(BlockErrorKind::Io, e).with_op(ErrorOp::Open) + })?)), }) } } diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 4699d7231d..4437cc8e94 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -42,7 
+42,7 @@ use block::raw_sync::RawFileDiskSync; use block::vhdx_sync::VhdxDiskSync; use block::{ ImageType, block_aio_is_supported, block_io_uring_is_supported, detect_image_type, - open_disk_image, preallocate_disk, vhdx, + open_disk_image, preallocate_disk, }; #[cfg(feature = "io_uring")] use block::{fixed_vhd_async::FixedVhdDiskAsync, raw_async::RawFileDisk}; @@ -581,7 +581,7 @@ pub enum DeviceManagerError { /// Failed to create FixedVhdxDiskSync #[error("Failed to create FixedVhdxDiskSync")] - CreateFixedVhdxDiskSync(#[source] vhdx::VhdxError), + CreateFixedVhdxDiskSync(#[source] BlockError), /// Failed to add DMA mapping handler to virtio-mem device. #[error("Failed to add DMA mapping handler to virtio-mem device")] From 3164b66cb5b231e82ec7adc69f64ca09b1aa7cf0 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 25 Mar 2026 16:36:15 +0100 Subject: [PATCH 400/742] block: vhdx: impl DiskSize for VhdxDiskSync Delegates to Vhdx::virtual_disk_size() which is infallible. Signed-off-by: Anatol Belski --- block/src/vhdx_sync.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/block/src/vhdx_sync.rs b/block/src/vhdx_sync.rs index a99f4f6f9c..7a863a8640 100644 --- a/block/src/vhdx_sync.rs +++ b/block/src/vhdx_sync.rs @@ -14,8 +14,9 @@ use crate::async_io::{ }; use crate::error::{BlockError, BlockErrorKind, BlockResult, ErrorOp}; use crate::vhdx::Vhdx; -use crate::{AsyncAdaptor, BlockBackend, Error}; +use crate::{AsyncAdaptor, BlockBackend, Error, disk_file}; +#[derive(Debug)] pub struct VhdxDiskSync { // FIXME: The Mutex serializes all VHDX I/O operations across queues, which // is necessary for correctness but eliminates any parallelism benefit from @@ -38,6 +39,12 @@ impl VhdxDiskSync { } } +impl disk_file::DiskSize for VhdxDiskSync { + fn logical_size(&self) -> BlockResult { + Ok(self.vhdx_file.lock().unwrap().virtual_disk_size()) + } +} + impl DiskFile for VhdxDiskSync { fn logical_size(&mut self) -> DiskFileResult { 
Ok(self.vhdx_file.lock().unwrap().virtual_disk_size()) From 5578f329cfa813326e0ee369afa50f475bdde3e0 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 25 Mar 2026 16:37:26 +0100 Subject: [PATCH 401/742] block: vhdx: impl PhysicalSize for VhdxDiskSync Explicitly matches GetFileMetadata to preserve the original error type. Signed-off-by: Anatol Belski --- block/src/vhdx_sync.rs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/block/src/vhdx_sync.rs b/block/src/vhdx_sync.rs index 7a863a8640..d52e0667e4 100644 --- a/block/src/vhdx_sync.rs +++ b/block/src/vhdx_sync.rs @@ -45,6 +45,21 @@ impl disk_file::DiskSize for VhdxDiskSync { } } +impl disk_file::PhysicalSize for VhdxDiskSync { + fn physical_size(&self) -> BlockResult { + self.vhdx_file + .lock() + .unwrap() + .physical_size() + .map_err(|e| match e { + Error::GetFileMetadata(io) => { + BlockError::new(BlockErrorKind::Io, Error::GetFileMetadata(io)) + } + _ => BlockError::new(BlockErrorKind::Io, e), + }) + } +} + impl DiskFile for VhdxDiskSync { fn logical_size(&mut self) -> DiskFileResult { Ok(self.vhdx_file.lock().unwrap().virtual_disk_size()) From eca9e14ecbc18e9715910fdd844ae9d11c770592 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 25 Mar 2026 16:39:53 +0100 Subject: [PATCH 402/742] block: vhdx: impl DiskFd for VhdxDiskSync Returns the raw fd from the inner Vhdx via mutex lock. 
Signed-off-by: Anatol Belski --- block/src/vhdx_sync.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/block/src/vhdx_sync.rs b/block/src/vhdx_sync.rs index d52e0667e4..7a009bfdff 100644 --- a/block/src/vhdx_sync.rs +++ b/block/src/vhdx_sync.rs @@ -60,6 +60,12 @@ impl disk_file::PhysicalSize for VhdxDiskSync { } } +impl disk_file::DiskFd for VhdxDiskSync { + fn fd(&self) -> BorrowedDiskFd<'_> { + BorrowedDiskFd::new(self.vhdx_file.lock().unwrap().as_raw_fd()) + } +} + impl DiskFile for VhdxDiskSync { fn logical_size(&mut self) -> DiskFileResult { Ok(self.vhdx_file.lock().unwrap().virtual_disk_size()) From 62244f94da6383b628385bdd7e9716d01be9146b Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 25 Mar 2026 16:41:01 +0100 Subject: [PATCH 403/742] block: vhdx: impl Geometry for VhdxDiskSync Uses default topology. Signed-off-by: Anatol Belski --- block/src/vhdx_sync.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/src/vhdx_sync.rs b/block/src/vhdx_sync.rs index 7a009bfdff..9f1e952c23 100644 --- a/block/src/vhdx_sync.rs +++ b/block/src/vhdx_sync.rs @@ -66,6 +66,8 @@ impl disk_file::DiskFd for VhdxDiskSync { } } +impl disk_file::Geometry for VhdxDiskSync {} + impl DiskFile for VhdxDiskSync { fn logical_size(&mut self) -> DiskFileResult { Ok(self.vhdx_file.lock().unwrap().virtual_disk_size()) From 3e904d7a279095b48c905165678424bdccf4fef0 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 25 Mar 2026 16:41:37 +0100 Subject: [PATCH 404/742] block: vhdx: impl SparseCapable for VhdxDiskSync VHDX does not support sparse operations. 
Signed-off-by: Anatol Belski --- block/src/vhdx_sync.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/src/vhdx_sync.rs b/block/src/vhdx_sync.rs index 9f1e952c23..c3b04c9d1f 100644 --- a/block/src/vhdx_sync.rs +++ b/block/src/vhdx_sync.rs @@ -68,6 +68,8 @@ impl disk_file::DiskFd for VhdxDiskSync { impl disk_file::Geometry for VhdxDiskSync {} +impl disk_file::SparseCapable for VhdxDiskSync {} + impl DiskFile for VhdxDiskSync { fn logical_size(&mut self) -> DiskFileResult { Ok(self.vhdx_file.lock().unwrap().virtual_disk_size()) From ffdc8c49d19911fa471964de36dd2fb055682320 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 25 Mar 2026 16:42:26 +0100 Subject: [PATCH 405/742] block: vhdx: impl Resizable for VhdxDiskSync Returns UnsupportedFeature, VHDX resize is not supported. Signed-off-by: Anatol Belski --- block/src/vhdx_sync.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/block/src/vhdx_sync.rs b/block/src/vhdx_sync.rs index c3b04c9d1f..3eb8a91be3 100644 --- a/block/src/vhdx_sync.rs +++ b/block/src/vhdx_sync.rs @@ -70,6 +70,16 @@ impl disk_file::Geometry for VhdxDiskSync {} impl disk_file::SparseCapable for VhdxDiskSync {} +impl disk_file::Resizable for VhdxDiskSync { + fn resize(&mut self, _size: u64) -> BlockResult<()> { + Err(BlockError::new( + BlockErrorKind::UnsupportedFeature, + DiskFileError::ResizeError(std::io::Error::other("resize not supported for VHDX")), + ) + .with_op(ErrorOp::Resize)) + } +} + impl DiskFile for VhdxDiskSync { fn logical_size(&mut self) -> DiskFileResult { Ok(self.vhdx_file.lock().unwrap().virtual_disk_size()) From e99ad15939a835c1286e1270b14b2811a32e36a8 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 25 Mar 2026 16:42:48 +0100 Subject: [PATCH 406/742] block: vhdx: impl DiskFile for VhdxDiskSync Supertrait marker, all component traits already implemented. 
Signed-off-by: Anatol Belski --- block/src/vhdx_sync.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/src/vhdx_sync.rs b/block/src/vhdx_sync.rs index 3eb8a91be3..48d25714e3 100644 --- a/block/src/vhdx_sync.rs +++ b/block/src/vhdx_sync.rs @@ -80,6 +80,8 @@ impl disk_file::Resizable for VhdxDiskSync { } } +impl disk_file::DiskFile for VhdxDiskSync {} + impl DiskFile for VhdxDiskSync { fn logical_size(&mut self) -> DiskFileResult { Ok(self.vhdx_file.lock().unwrap().virtual_disk_size()) From cf9496376f84b9bb42c2afa16929d43af87d60a5 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 25 Mar 2026 16:43:37 +0100 Subject: [PATCH 407/742] block: vhdx: impl AsyncDiskFile for VhdxDiskSync try_clone() shares the Arc>. new_async_io() creates VhdxSync with a cloned Arc (no error path, infallible). Signed-off-by: Anatol Belski --- block/src/vhdx_sync.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/block/src/vhdx_sync.rs b/block/src/vhdx_sync.rs index 48d25714e3..8679978255 100644 --- a/block/src/vhdx_sync.rs +++ b/block/src/vhdx_sync.rs @@ -82,6 +82,18 @@ impl disk_file::Resizable for VhdxDiskSync { impl disk_file::DiskFile for VhdxDiskSync {} +impl disk_file::AsyncDiskFile for VhdxDiskSync { + fn try_clone(&self) -> BlockResult> { + Ok(Box::new(VhdxDiskSync { + vhdx_file: Arc::clone(&self.vhdx_file), + })) + } + + fn new_async_io(&self, _ring_depth: u32) -> BlockResult> { + Ok(Box::new(VhdxSync::new(Arc::clone(&self.vhdx_file)))) + } +} + impl DiskFile for VhdxDiskSync { fn logical_size(&mut self) -> DiskFileResult { Ok(self.vhdx_file.lock().unwrap().virtual_disk_size()) From b703043f7718864884eb1865b10c9db314463f7e Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 25 Mar 2026 16:44:44 +0100 Subject: [PATCH 408/742] vmm: Switch VHDX to DiskBackend::Next Wire VhdxDiskSync through DiskBackend::Next instead of Legacy. 
Signed-off-by: Anatol Belski --- vmm/src/device_manager.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 4437cc8e94..ea66cda9da 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -2784,10 +2784,10 @@ impl DeviceManager { } ImageType::Vhdx => { info!("Using synchronous VHDX disk file"); - DiskBackend::Legacy(Box::new( + DiskBackend::Next(Box::new( VhdxDiskSync::new(file) .map_err(DeviceManagerError::CreateFixedVhdxDiskSync)?, - ) as Box) + )) } ImageType::Unknown => unreachable!(), }; From 6e36932e90735c4cf1fc38cb314ce50e9d22f517 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 25 Mar 2026 16:46:46 +0100 Subject: [PATCH 409/742] block: vhdx: Remove legacy async_io::DiskFile impl from VhdxDiskSync All functionality now provided by composable disk_file traits. Signed-off-by: Anatol Belski --- block/src/vhdx_sync.rs | 28 +--------------------------- 1 file changed, 1 insertion(+), 27 deletions(-) diff --git a/block/src/vhdx_sync.rs b/block/src/vhdx_sync.rs index 8679978255..0405554c1b 100644 --- a/block/src/vhdx_sync.rs +++ b/block/src/vhdx_sync.rs @@ -9,9 +9,7 @@ use std::sync::{Arc, Mutex}; use vmm_sys_util::eventfd::EventFd; -use crate::async_io::{ - AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFile, DiskFileError, DiskFileResult, -}; +use crate::async_io::{AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFileError}; use crate::error::{BlockError, BlockErrorKind, BlockResult, ErrorOp}; use crate::vhdx::Vhdx; use crate::{AsyncAdaptor, BlockBackend, Error, disk_file}; @@ -94,30 +92,6 @@ impl disk_file::AsyncDiskFile for VhdxDiskSync { } } -impl DiskFile for VhdxDiskSync { - fn logical_size(&mut self) -> DiskFileResult { - Ok(self.vhdx_file.lock().unwrap().virtual_disk_size()) - } - - fn physical_size(&mut self) -> DiskFileResult { - self.vhdx_file.lock().unwrap().physical_size().map_err(|e| { - let io_inner = match e { - 
Error::GetFileMetadata(e) => e, - _ => unreachable!(), - }; - DiskFileError::Size(io_inner) - }) - } - - fn new_async_io(&self, _ring_depth: u32) -> DiskFileResult> { - Ok(Box::new(VhdxSync::new(Arc::clone(&self.vhdx_file))) as Box) - } - - fn fd(&mut self) -> BorrowedDiskFd<'_> { - BorrowedDiskFd::new(self.vhdx_file.lock().unwrap().as_raw_fd()) - } -} - pub struct VhdxSync { vhdx_file: Arc>, eventfd: EventFd, From 24e90492806631670347afc13aa83d7462841e1d Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Mon, 23 Mar 2026 23:30:08 +0100 Subject: [PATCH 410/742] block: vhd: Switch FixedVhdDiskSync::new to BlockResult Classify the io::Error as BlockErrorKind::Io with ErrorOp::Open. Update vmm CreateFixedVhdDiskSync to take BlockError. Signed-off-by: Anatol Belski --- block/src/fixed_vhd_sync.rs | 7 +++++-- vmm/src/device_manager.rs | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/block/src/fixed_vhd_sync.rs b/block/src/fixed_vhd_sync.rs index ecb5e83ad0..bf6c431fe3 100644 --- a/block/src/fixed_vhd_sync.rs +++ b/block/src/fixed_vhd_sync.rs @@ -11,14 +11,17 @@ use crate::BlockBackend; use crate::async_io::{ AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFile, DiskFileError, DiskFileResult, }; +use crate::error::{BlockError, BlockResult, ErrorOp}; use crate::fixed_vhd::FixedVhd; use crate::raw_sync::RawFileSync; pub struct FixedVhdDiskSync(FixedVhd); impl FixedVhdDiskSync { - pub fn new(file: File) -> std::io::Result { - Ok(Self(FixedVhd::new(file)?)) + pub fn new(file: File) -> BlockResult { + Ok(Self( + FixedVhd::new(file).map_err(|e| BlockError::from(e).with_op(ErrorOp::Open))?, + )) } } diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index ea66cda9da..bdcd33602c 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -573,7 +573,7 @@ pub enum DeviceManagerError { /// Failed to create FixedVhdDiskSync #[error("Failed to create FixedVhdDiskSync")] - CreateFixedVhdDiskSync(#[source] io::Error), + 
CreateFixedVhdDiskSync(#[source] BlockError), /// Failed to create QcowDiskSync #[error("Failed to create QcowDiskSync")] From 713674454966eae112f28b943387514da750dfae Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Mon, 23 Mar 2026 23:54:02 +0100 Subject: [PATCH 411/742] block: vhd: impl DiskSize for FixedVhdDiskSync Delegate to FixedVhd::logical_size() which returns the guest visible capacity parsed from the VHD footer at construction. Signed-off-by: Anatol Belski --- block/src/fixed_vhd_sync.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/block/src/fixed_vhd_sync.rs b/block/src/fixed_vhd_sync.rs index bf6c431fe3..9ea036b90a 100644 --- a/block/src/fixed_vhd_sync.rs +++ b/block/src/fixed_vhd_sync.rs @@ -7,14 +7,15 @@ use std::os::unix::io::{AsRawFd, RawFd}; use vmm_sys_util::eventfd::EventFd; -use crate::BlockBackend; use crate::async_io::{ AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFile, DiskFileError, DiskFileResult, }; use crate::error::{BlockError, BlockResult, ErrorOp}; use crate::fixed_vhd::FixedVhd; use crate::raw_sync::RawFileSync; +use crate::{BlockBackend, disk_file}; +#[derive(Debug)] pub struct FixedVhdDiskSync(FixedVhd); impl FixedVhdDiskSync { @@ -52,6 +53,12 @@ impl DiskFile for FixedVhdDiskSync { } } +impl disk_file::DiskSize for FixedVhdDiskSync { + fn logical_size(&self) -> BlockResult { + Ok(self.0.logical_size().unwrap()) + } +} + pub struct FixedVhdSync { raw_file_sync: RawFileSync, size: u64, From 266c5fc241116d5d196af63470970365937421ca Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Tue, 24 Mar 2026 09:14:11 +0100 Subject: [PATCH 412/742] block: vhd: impl PhysicalSize for FixedVhdDiskSync Delegate to FixedVhd::physical_size() which calls file.metadata(). Preserve the crate::Error::GetFileMetadata variant as the BlockError source for diagnostic chain traversal. 
Signed-off-by: Anatol Belski --- block/src/fixed_vhd_sync.rs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/block/src/fixed_vhd_sync.rs b/block/src/fixed_vhd_sync.rs index 9ea036b90a..15624eada0 100644 --- a/block/src/fixed_vhd_sync.rs +++ b/block/src/fixed_vhd_sync.rs @@ -10,7 +10,7 @@ use vmm_sys_util::eventfd::EventFd; use crate::async_io::{ AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFile, DiskFileError, DiskFileResult, }; -use crate::error::{BlockError, BlockResult, ErrorOp}; +use crate::error::{BlockError, BlockErrorKind, BlockResult, ErrorOp}; use crate::fixed_vhd::FixedVhd; use crate::raw_sync::RawFileSync; use crate::{BlockBackend, disk_file}; @@ -59,6 +59,17 @@ impl disk_file::DiskSize for FixedVhdDiskSync { } } +impl disk_file::PhysicalSize for FixedVhdDiskSync { + fn physical_size(&self) -> BlockResult { + self.0.physical_size().map_err(|e| match e { + crate::Error::GetFileMetadata(io) => { + BlockError::new(BlockErrorKind::Io, crate::Error::GetFileMetadata(io)) + } + _ => BlockError::new(BlockErrorKind::Io, e), + }) + } +} + pub struct FixedVhdSync { raw_file_sync: RawFileSync, size: u64, From dfa9a25fd8a32469122b27958907c57162dae5ea Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Tue, 24 Mar 2026 09:33:16 +0100 Subject: [PATCH 413/742] block: vhd: impl DiskFd for FixedVhdDiskSync Delegate to FixedVhd::as_raw_fd() for the backing file descriptor. 
Signed-off-by: Anatol Belski --- block/src/fixed_vhd_sync.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/block/src/fixed_vhd_sync.rs b/block/src/fixed_vhd_sync.rs index 15624eada0..778ab16531 100644 --- a/block/src/fixed_vhd_sync.rs +++ b/block/src/fixed_vhd_sync.rs @@ -70,6 +70,12 @@ impl disk_file::PhysicalSize for FixedVhdDiskSync { } } +impl disk_file::DiskFd for FixedVhdDiskSync { + fn fd(&self) -> BorrowedDiskFd<'_> { + BorrowedDiskFd::new(self.0.as_raw_fd()) + } +} + pub struct FixedVhdSync { raw_file_sync: RawFileSync, size: u64, From 1a56cb3d0e7fdd896a80baedf1149ae92f7b4f50 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Tue, 24 Mar 2026 13:53:34 +0100 Subject: [PATCH 414/742] block: vhd: impl Geometry for FixedVhdDiskSync Use default DiskTopology with 512byte sectors. Signed-off-by: Anatol Belski --- block/src/fixed_vhd_sync.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/src/fixed_vhd_sync.rs b/block/src/fixed_vhd_sync.rs index 778ab16531..056c79818a 100644 --- a/block/src/fixed_vhd_sync.rs +++ b/block/src/fixed_vhd_sync.rs @@ -76,6 +76,8 @@ impl disk_file::DiskFd for FixedVhdDiskSync { } } +impl disk_file::Geometry for FixedVhdDiskSync {} + pub struct FixedVhdSync { raw_file_sync: RawFileSync, size: u64, From 74b0613c04d4fc7199b0474ff18e9bccf17a8d1c Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Tue, 24 Mar 2026 13:59:55 +0100 Subject: [PATCH 415/742] block: vhd: impl SparseCapable for FixedVhdDiskSync Fixed VHD does not support sparse operations. 
Signed-off-by: Anatol Belski --- block/src/fixed_vhd_sync.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/src/fixed_vhd_sync.rs b/block/src/fixed_vhd_sync.rs index 056c79818a..c66a945cb8 100644 --- a/block/src/fixed_vhd_sync.rs +++ b/block/src/fixed_vhd_sync.rs @@ -78,6 +78,8 @@ impl disk_file::DiskFd for FixedVhdDiskSync { impl disk_file::Geometry for FixedVhdDiskSync {} +impl disk_file::SparseCapable for FixedVhdDiskSync {} + pub struct FixedVhdSync { raw_file_sync: RawFileSync, size: u64, From 60af47c99ba877e8725e632f98a91647e228c11f Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Tue, 24 Mar 2026 14:01:34 +0100 Subject: [PATCH 416/742] block: vhd: impl Resizable for FixedVhdDiskSync Fixed VHD does not support resize, return UnsupportedFeature. Signed-off-by: Anatol Belski --- block/src/fixed_vhd_sync.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/block/src/fixed_vhd_sync.rs b/block/src/fixed_vhd_sync.rs index c66a945cb8..10836853a9 100644 --- a/block/src/fixed_vhd_sync.rs +++ b/block/src/fixed_vhd_sync.rs @@ -80,6 +80,16 @@ impl disk_file::Geometry for FixedVhdDiskSync {} impl disk_file::SparseCapable for FixedVhdDiskSync {} +impl disk_file::Resizable for FixedVhdDiskSync { + fn resize(&mut self, _size: u64) -> BlockResult<()> { + Err(BlockError::new( + BlockErrorKind::UnsupportedFeature, + DiskFileError::ResizeError(std::io::Error::other("resize not supported for fixed VHD")), + ) + .with_op(ErrorOp::Resize)) + } +} + pub struct FixedVhdSync { raw_file_sync: RawFileSync, size: u64, From ce1592d1df8fd9c36e850bd1dda314a36af318bf Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Tue, 24 Mar 2026 14:57:14 +0100 Subject: [PATCH 417/742] block: vhd: impl DiskFile for FixedVhdDiskSync Marker impl bundling DiskSize and Geometry supertraits. 
Signed-off-by: Anatol Belski --- block/src/fixed_vhd_sync.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/src/fixed_vhd_sync.rs b/block/src/fixed_vhd_sync.rs index 10836853a9..92250fc8e7 100644 --- a/block/src/fixed_vhd_sync.rs +++ b/block/src/fixed_vhd_sync.rs @@ -90,6 +90,8 @@ impl disk_file::Resizable for FixedVhdDiskSync { } } +impl disk_file::DiskFile for FixedVhdDiskSync {} + pub struct FixedVhdSync { raw_file_sync: RawFileSync, size: u64, From cc2f878094255b7a1dabd077be66dc722edee371 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Tue, 24 Mar 2026 15:11:10 +0100 Subject: [PATCH 418/742] block: vhd: impl AsyncDiskFile for FixedVhdDiskSync Delegate try_clone() to FixedVhd::clone() and new_async_io() to FixedVhdSync, preserving DiskFileError::NewAsyncIo. Signed-off-by: Anatol Belski --- block/src/fixed_vhd_sync.rs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/block/src/fixed_vhd_sync.rs b/block/src/fixed_vhd_sync.rs index 92250fc8e7..b28ce38689 100644 --- a/block/src/fixed_vhd_sync.rs +++ b/block/src/fixed_vhd_sync.rs @@ -92,6 +92,21 @@ impl disk_file::Resizable for FixedVhdDiskSync { impl disk_file::DiskFile for FixedVhdDiskSync {} +impl disk_file::AsyncDiskFile for FixedVhdDiskSync { + fn try_clone(&self) -> BlockResult> { + Ok(Box::new(FixedVhdDiskSync(self.0.clone()))) + } + + fn new_async_io(&self, _ring_depth: u32) -> BlockResult> { + Ok(Box::new( + FixedVhdSync::new(self.0.as_raw_fd(), self.0.logical_size().unwrap()).map_err(|e| { + BlockError::new(BlockErrorKind::Io, DiskFileError::NewAsyncIo(e)) + .with_op(ErrorOp::Open) + })?, + )) + } +} + pub struct FixedVhdSync { raw_file_sync: RawFileSync, size: u64, From c90f5a9e47f37f431f7793c633b16538a058bbc6 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Tue, 24 Mar 2026 15:21:08 +0100 Subject: [PATCH 419/742] vmm: Switch fixed VHD sync to DiskBackend::Next Wire FixedVhdDiskSync through the new composable trait system. 
Signed-off-by: Anatol Belski --- vmm/src/device_manager.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index bdcd33602c..72ce17b26a 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -2729,10 +2729,10 @@ impl DeviceManager { } } else { info!("Using synchronous fixed VHD disk file"); - DiskBackend::Legacy(Box::new( + DiskBackend::Next(Box::new( FixedVhdDiskSync::new(file) .map_err(DeviceManagerError::CreateFixedVhdDiskSync)?, - ) as Box) + )) } } ImageType::Raw => { From 5ca6ff869fd467dda2013c514a5d7d00f4653bc1 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Tue, 24 Mar 2026 16:37:17 +0100 Subject: [PATCH 420/742] block: vhd: Remove legacy async_io::DiskFile impl from FixedVhdDiskSync No remaining consumers after switching to DiskBackend::Next. Signed-off-by: Anatol Belski --- block/src/fixed_vhd_sync.rs | 31 +------------------------------ 1 file changed, 1 insertion(+), 30 deletions(-) diff --git a/block/src/fixed_vhd_sync.rs b/block/src/fixed_vhd_sync.rs index b28ce38689..14685522b3 100644 --- a/block/src/fixed_vhd_sync.rs +++ b/block/src/fixed_vhd_sync.rs @@ -7,9 +7,7 @@ use std::os::unix::io::{AsRawFd, RawFd}; use vmm_sys_util::eventfd::EventFd; -use crate::async_io::{ - AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFile, DiskFileError, DiskFileResult, -}; +use crate::async_io::{AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFileError}; use crate::error::{BlockError, BlockErrorKind, BlockResult, ErrorOp}; use crate::fixed_vhd::FixedVhd; use crate::raw_sync::RawFileSync; @@ -26,33 +24,6 @@ impl FixedVhdDiskSync { } } -impl DiskFile for FixedVhdDiskSync { - fn logical_size(&mut self) -> DiskFileResult { - Ok(self.0.logical_size().unwrap()) - } - - fn physical_size(&mut self) -> DiskFileResult { - self.0.physical_size().map_err(|e| { - let io_inner = match e { - crate::Error::GetFileMetadata(e) => e, - _ => unreachable!(), - }; - 
DiskFileError::Size(io_inner) - }) - } - - fn new_async_io(&self, _ring_depth: u32) -> DiskFileResult> { - Ok(Box::new( - FixedVhdSync::new(self.0.as_raw_fd(), self.0.logical_size().unwrap()) - .map_err(DiskFileError::NewAsyncIo)?, - ) as Box) - } - - fn fd(&mut self) -> BorrowedDiskFd<'_> { - BorrowedDiskFd::new(self.0.as_raw_fd()) - } -} - impl disk_file::DiskSize for FixedVhdDiskSync { fn logical_size(&self) -> BlockResult { Ok(self.0.logical_size().unwrap()) From c943103f7530553afe689187035be32c4c5fd881 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 31 Mar 2026 00:05:58 +0000 Subject: [PATCH 421/742] build(deps): bump the non-rust-vmm group across 2 directories with 7 updates Bumps the non-rust-vmm group with 5 updates in the / directory: | Package | From | To | | --- | --- | --- | | [uuid](https://github.com/uuid-rs/uuid) | `1.22.0` | `1.23.0` | | [zerocopy](https://github.com/google/zerocopy) | `0.8.47` | `0.8.48` | | [cc](https://github.com/rust-lang/cc-rs) | `1.2.57` | `1.2.58` | | libredox | `0.1.14` | `0.1.15` | | [simd-adler32](https://github.com/mcountryman/simd-adler32) | `0.3.8` | `0.3.9` | Bumps the non-rust-vmm group with 5 updates in the /fuzz directory: | Package | From | To | | --- | --- | --- | | [uuid](https://github.com/uuid-rs/uuid) | `1.22.0` | `1.23.0` | | [zerocopy](https://github.com/google/zerocopy) | `0.8.47` | `0.8.48` | | [cc](https://github.com/rust-lang/cc-rs) | `1.2.57` | `1.2.58` | | [simd-adler32](https://github.com/mcountryman/simd-adler32) | `0.3.8` | `0.3.9` | | [winnow](https://github.com/winnow-rs/winnow) | `1.0.0` | `1.0.1` | Updates `uuid` from 1.22.0 to 1.23.0 - [Release notes](https://github.com/uuid-rs/uuid/releases) - [Commits](https://github.com/uuid-rs/uuid/compare/v1.22.0...v1.23.0) Updates `zerocopy` from 0.8.47 to 0.8.48 - [Release notes](https://github.com/google/zerocopy/releases) - 
[Changelog](https://github.com/google/zerocopy/blob/main/CHANGELOG.md) - [Commits](https://github.com/google/zerocopy/compare/v0.8.47...v0.8.48) Updates `cc` from 1.2.57 to 1.2.58 - [Release notes](https://github.com/rust-lang/cc-rs/releases) - [Changelog](https://github.com/rust-lang/cc-rs/blob/main/CHANGELOG.md) - [Commits](https://github.com/rust-lang/cc-rs/compare/cc-v1.2.57...cc-v1.2.58) Updates `libredox` from 0.1.14 to 0.1.15 Updates `simd-adler32` from 0.3.8 to 0.3.9 - [Changelog](https://github.com/mcountryman/simd-adler32/blob/main/CHANGELOG.md) - [Commits](https://github.com/mcountryman/simd-adler32/commits/v0.3.9) Updates `zerocopy-derive` from 0.8.47 to 0.8.48 - [Release notes](https://github.com/google/zerocopy/releases) - [Changelog](https://github.com/google/zerocopy/blob/main/CHANGELOG.md) - [Commits](https://github.com/google/zerocopy/compare/v0.8.47...v0.8.48) Updates `uuid` from 1.22.0 to 1.23.0 - [Release notes](https://github.com/uuid-rs/uuid/releases) - [Commits](https://github.com/uuid-rs/uuid/compare/v1.22.0...v1.23.0) Updates `zerocopy` from 0.8.47 to 0.8.48 - [Release notes](https://github.com/google/zerocopy/releases) - [Changelog](https://github.com/google/zerocopy/blob/main/CHANGELOG.md) - [Commits](https://github.com/google/zerocopy/compare/v0.8.47...v0.8.48) Updates `cc` from 1.2.57 to 1.2.58 - [Release notes](https://github.com/rust-lang/cc-rs/releases) - [Changelog](https://github.com/rust-lang/cc-rs/blob/main/CHANGELOG.md) - [Commits](https://github.com/rust-lang/cc-rs/compare/cc-v1.2.57...cc-v1.2.58) Updates `simd-adler32` from 0.3.8 to 0.3.9 - [Changelog](https://github.com/mcountryman/simd-adler32/blob/main/CHANGELOG.md) - [Commits](https://github.com/mcountryman/simd-adler32/commits/v0.3.9) Updates `winnow` from 1.0.0 to 1.0.1 - [Changelog](https://github.com/winnow-rs/winnow/blob/main/CHANGELOG.md) - [Commits](https://github.com/winnow-rs/winnow/compare/v1.0.0...v1.0.1) Updates `zerocopy-derive` from 0.8.47 to 0.8.48 - 
[Release notes](https://github.com/google/zerocopy/releases) - [Changelog](https://github.com/google/zerocopy/blob/main/CHANGELOG.md) - [Commits](https://github.com/google/zerocopy/compare/v0.8.47...v0.8.48) --- updated-dependencies: - dependency-name: uuid dependency-version: 1.23.0 dependency-type: direct:production update-type: version-update:semver-minor dependency-group: non-rust-vmm - dependency-name: zerocopy dependency-version: 0.8.48 dependency-type: direct:production update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: cc dependency-version: 1.2.58 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: libredox dependency-version: 0.1.15 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: simd-adler32 dependency-version: 0.3.9 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: zerocopy-derive dependency-version: 0.8.48 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: uuid dependency-version: 1.23.0 dependency-type: direct:production update-type: version-update:semver-minor dependency-group: non-rust-vmm - dependency-name: zerocopy dependency-version: 0.8.48 dependency-type: direct:production update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: cc dependency-version: 1.2.58 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: simd-adler32 dependency-version: 0.3.9 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: winnow dependency-version: 1.0.1 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: zerocopy-derive dependency-version: 
0.8.48 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm ... Signed-off-by: dependabot[bot] --- Cargo.lock | 24 ++++++++++++------------ Cargo.toml | 4 ++-- devices/Cargo.toml | 2 +- fuzz/Cargo.lock | 24 ++++++++++++------------ 4 files changed, 27 insertions(+), 27 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c1ce72b0b6..ac44003adb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -370,9 +370,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "cc" -version = "1.2.57" +version = "1.2.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a0dd1ca384932ff3641c8718a02769f1698e7563dc6974ffd03346116310423" +checksum = "e1e928d4b69e3077709075a938a05ffbedfa53a84c8f766efbf8220bb1ff60e1" dependencies = [ "find-msvc-tools", "jobserver", @@ -1225,9 +1225,9 @@ checksum = "b5b646652bf6661599e1da8901b3b9522896f01e736bad5f723fe7a3a27f899d" [[package]] name = "libredox" -version = "0.1.14" +version = "0.1.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1744e39d1d6a9948f4f388969627434e31128196de472883b39f148769bfe30a" +checksum = "7ddbf48fd451246b1f8c2610bd3b4ac0cc6e149d89832867093ab69a17194f08" dependencies = [ "libc", ] @@ -2021,9 +2021,9 @@ dependencies = [ [[package]] name = "simd-adler32" -version = "0.3.8" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2" +checksum = "703d5c7ef118737c72f1af64ad2f6f8c5e1921f818cdcb97b8fe6fc69bf66214" [[package]] name = "slab" @@ -2258,9 +2258,9 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.22.0" +version = "1.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a68d3c8f01c0cfa54a75291d83601161799e4a89a39e0929f4b0354d88757a37" +checksum = 
"5ac8b6f42ead25368cf5b098aeb3dc8a1a2c05a3eee8a9a1a68c640edbfc79d9" dependencies = [ "getrandom 0.4.2", "js-sys", @@ -2885,18 +2885,18 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.8.47" +version = "0.8.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "efbb2a062be311f2ba113ce66f697a4dc589f85e78a4aea276200804cea0ed87" +checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.47" +version = "0.8.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e8bc7269b54418e7aeeef514aa68f8690b8c0489a06b0136e5f57c4c5ccab89" +checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4" dependencies = [ "proc-macro2", "quote", diff --git a/Cargo.toml b/Cargo.toml index e22ace8382..67d5398f01 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -95,9 +95,9 @@ libc = "0.2.183" log = "0.4.29" signal-hook = "0.4.3" thiserror = "2.0.18" -uuid = { version = "1.22.0" } +uuid = { version = "1.23.0" } wait-timeout = "0.2.1" -zerocopy = { version = "0.8.42", default-features = false } +zerocopy = { version = "0.8.48", default-features = false } [workspace.lints.clippy] # Any clippy lint (group) in alphabetical order: diff --git a/devices/Cargo.toml b/devices/Cargo.toml index 516f41e834..afdc3403bf 100644 --- a/devices/Cargo.toml +++ b/devices/Cargo.toml @@ -35,7 +35,7 @@ vm-memory = { workspace = true, features = [ ] } vm-migration = { path = "../vm-migration" } vmm-sys-util = { workspace = true } -zerocopy = { version = "0.8.42", features = [ +zerocopy = { version = "0.8.48", features = [ "alloc", "derive", ], optional = true } diff --git a/fuzz/Cargo.lock b/fuzz/Cargo.lock index 488476e9e9..8741ab8591 100644 --- a/fuzz/Cargo.lock +++ b/fuzz/Cargo.lock @@ -173,9 +173,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "cc" -version = "1.2.57" 
+version = "1.2.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a0dd1ca384932ff3641c8718a02769f1698e7563dc6974ffd03346116310423" +checksum = "e1e928d4b69e3077709075a938a05ffbedfa53a84c8f766efbf8220bb1ff60e1" dependencies = [ "find-msvc-tools", "jobserver", @@ -1134,9 +1134,9 @@ dependencies = [ [[package]] name = "simd-adler32" -version = "0.3.8" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2" +checksum = "703d5c7ef118737c72f1af64ad2f6f8c5e1921f818cdcb97b8fe6fc69bf66214" [[package]] name = "smallvec" @@ -1261,9 +1261,9 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.22.0" +version = "1.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a68d3c8f01c0cfa54a75291d83601161799e4a89a39e0929f4b0354d88757a37" +checksum = "5ac8b6f42ead25368cf5b098aeb3dc8a1a2c05a3eee8a9a1a68c640edbfc79d9" dependencies = [ "getrandom 0.4.2", "js-sys", @@ -1640,9 +1640,9 @@ dependencies = [ [[package]] name = "winnow" -version = "1.0.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a90e88e4667264a994d34e6d1ab2d26d398dcdca8b7f52bec8668957517fc7d8" +checksum = "09dac053f1cd375980747450bfc7250c264eaae0583872e845c0c7cd578872b5" dependencies = [ "memchr", ] @@ -1737,18 +1737,18 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.8.47" +version = "0.8.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "efbb2a062be311f2ba113ce66f697a4dc589f85e78a4aea276200804cea0ed87" +checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.47" +version = "0.8.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"0e8bc7269b54418e7aeeef514aa68f8690b8c0489a06b0136e5f57c4c5ccab89" +checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4" dependencies = [ "proc-macro2", "quote", From c52e15143950c67c2ecbe99f5e0aded159b3a74f Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Tue, 31 Mar 2026 06:05:14 +0000 Subject: [PATCH 422/742] github: Introduce cool down periods for non-rust-vmm crates This gives the community more time to react to possible security chain compromises. We have high confidence that rust-vmm crates are trusted, and the community is fully capable of spotting any issues. There is no need to delay that group. Signed-off-by: Wei Liu --- .github/dependabot.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 781912648b..2f3865fb1a 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -37,6 +37,11 @@ updates: interval: weekly allow: - dependency-type: all + cooldown: + default-days: 7 + semver-major-days: 14 + semver-minor-days: 7 + semver-patch-days: 3 ignore: - dependency-name: "acpi_tables" - dependency-name: "kvm-bindings" From ff20f183647c602bf5e5028d99959e5547a7dd84 Mon Sep 17 00:00:00 2001 From: CMGS Date: Tue, 31 Mar 2026 06:12:21 +0000 Subject: [PATCH 423/742] vmm: restore KVM clock before resuming vCPUs Reorder resume() to: set_clock, device_manager.resume, cpu_manager.resume. This matches the inverse of pause() which correctly saves the clock before pausing vCPUs. 
Signed-off-by: CMGS --- vmm/src/vm.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index 430c9bdc74..c1e21a5ac9 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -3111,7 +3111,8 @@ impl Pausable for Vm { .valid_transition(new_state) .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {e:?}")))?; - self.cpu_manager.lock().unwrap().resume()?; + // Restore KVM clock BEFORE vCPUs start running, so they see correct + // TSC/kvmclock from the first instruction after resume. #[cfg(target_arch = "x86_64")] { if let Some(clock) = &self.saved_clock { @@ -3128,6 +3129,7 @@ impl Pausable for Vm { } self.device_manager.lock().unwrap().resume()?; + self.cpu_manager.lock().unwrap().resume()?; // And we're back to the Running state. self.state = new_state; From 7832401816c5902b013630b05de6e5638b463e68 Mon Sep 17 00:00:00 2001 From: Souradeep Chakrabarti Date: Tue, 31 Mar 2026 06:18:27 +0000 Subject: [PATCH 424/742] hypervisor: Add GHCB CPUID, MSR and TERM_REQ handlers When booting an SEV-SNP guest VM using IGVM with -pvalidate_opt 1 (lazy page acceptance), the guest kernel's #VC exception handler may issue VMGEXIT with SVM_EXIT_CPUID (0x72) or SVM_EXIT_MSR (0x7c) exit codes via the GHCB page protocol. The hypervisor had no handlers for these exit codes, causing the guest's #VC handler to fail and trigger sev_es_terminate(), which sends GHCB_MSR_TERM_REQ (0x100). The hypervisor then panicked on the unhandled 0x100 operation. Add the following handlers to the GHCB VMGEXIT processing: - SVM_EXIT_CPUID (0x72): Read function/index/xcr0/xss from the GHCB page and return CPUID results via get_cpuid_values(). - SVM_EXIT_MSR (0x7c): Handle MSR read (RDMSR) and write (WRMSR) requests from the guest via the GHCB page protocol. - GHCB_MSR_TERM_REQ (0x100): Decode reason_set and reason_val from the GHCB MSR and return an error instead of panicking, allowing graceful error propagation. 
Testing: Reproducer (on Azure DC16as_cc_v5, /dev/mshv): cloud-hypervisor --cpus boot=1,nested=off --memory size=512M \ --disk path=osdisk.img path=cloudinit \ --net "tap=,mac=12:34:56:78:90:06,ip=192.168.6.1,mask=255.255.255.128" \ --serial null --console pty \ --api-socket /tmp/ch.sock \ --igvm /igvm_files/linux-ttyS0.bin \ --host-data --platform sev_snp=on -v Before fix: thread 'vcpu0' panicked at hypervisor/src/mshv/mod.rs:1207:30: Unsupported VMGEXIT operation: 100 After fix: VM boots successfully to login prompt with no panics. All virtio devices (console, rng, disks) activate normally. No regression risk for non-SEV-SNP: all new code is within the HVMSG_X64_SEV_VMGEXIT_INTERCEPT handler which is only reached for SEV-SNP guests. Signed-off-by: Souradeep Chakrabarti --- hypervisor/src/mshv/mod.rs | 101 +++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) diff --git a/hypervisor/src/mshv/mod.rs b/hypervisor/src/mshv/mod.rs index 00cc4a6844..8623531c5b 100644 --- a/hypervisor/src/mshv/mod.rs +++ b/hypervisor/src/mshv/mod.rs @@ -13,6 +13,8 @@ use anyhow::anyhow; #[cfg(target_arch = "x86_64")] use arc_swap::ArcSwap; #[cfg(feature = "sev_snp")] +use log::error; +#[cfg(feature = "sev_snp")] use log::info; use log::{debug, warn}; use mshv_bindings::*; @@ -85,6 +87,12 @@ use crate::{CpuState, IoEventAddress, IrqRoutingEntry, MpState}; pub const PAGE_SHIFT: usize = 12; +// SVM exit codes not yet defined in mshv-bindings (AMD APM Vol 2, Table 15-7) +#[cfg(feature = "sev_snp")] +const SVM_EXITCODE_CPUID: u32 = 0x72; +#[cfg(feature = "sev_snp")] +const SVM_EXITCODE_MSR: u32 = 0x7c; + #[cfg(target_arch = "x86_64")] impl From for ClockData { fn from(d: MshvClockData) -> Self { @@ -1199,11 +1207,104 @@ impl cpu::Vcpu for MshvVcpu { // Clear the SW_EXIT_INFO1 register to indicate no error self.clear_swexit_info1()?; } + SVM_EXITCODE_CPUID => { + // SAFETY: Accessing fields from the mapped GHCB page + let cpuid_fn = unsafe { (*ghcb).rax } as u32; + // SAFETY: 
Accessing fields from the mapped GHCB page + let cpuid_idx = unsafe { (*ghcb).rcx } as u32; + // SAFETY: Accessing fields from the mapped GHCB page + let xcr0 = unsafe { (*ghcb).xfem }; + // SAFETY: Accessing fields from the mapped GHCB page + let xss = unsafe { (*ghcb).xss }; + debug!("GHCB CPUID: fn=0x{cpuid_fn:x} idx=0x{cpuid_idx:x}"); + + let cpuid_result = self + .fd + .get_cpuid_values(cpuid_fn, cpuid_idx, xcr0, xss) + .unwrap_or([0u32; 4]); + + set_svm_field_u64_ptr!(ghcb, rax, cpuid_result[0] as u64); + set_svm_field_u64_ptr!(ghcb, rbx, cpuid_result[1] as u64); + set_svm_field_u64_ptr!(ghcb, rcx, cpuid_result[2] as u64); + set_svm_field_u64_ptr!(ghcb, rdx, cpuid_result[3] as u64); + + self.clear_swexit_info1()?; + } + SVM_EXITCODE_MSR => { + let exit_info1 = + info.__bindgen_anon_2.__bindgen_anon_1.sw_exit_info1; + // SAFETY: Accessing fields from the mapped GHCB page + let msr_index = unsafe { (*ghcb).rcx } as u32; + let is_write = exit_info1 & 1 != 0; + + if is_write { + // SAFETY: Accessing fields from the mapped GHCB page + let msr_lo = unsafe { (*ghcb).rax } as u32; + // SAFETY: Accessing fields from the mapped GHCB page + let msr_hi = unsafe { (*ghcb).rdx } as u32; + let msr_val = ((msr_hi as u64) << 32) | (msr_lo as u64); + debug!( + "GHCB MSR WRITE: index=0x{msr_index:x} val=0x{msr_val:x}" + ); + let entry = msr_entry { + index: msr_index, + data: msr_val, + ..Default::default() + }; + let msr_entries = MsrEntries::from_entries(&[entry]) + .map_err(|e| { + cpu::HypervisorCpuError::RunVcpu(e.into()) + })?; + self.fd.set_msrs(&msr_entries).map_err(|e| { + cpu::HypervisorCpuError::RunVcpu(e.into()) + })?; + } else { + let entry = msr_entry { + index: msr_index, + ..Default::default() + }; + let mut msr_entries = MsrEntries::from_entries(&[entry]) + .map_err(|e| { + cpu::HypervisorCpuError::RunVcpu(e.into()) + })?; + self.fd.get_msrs(&mut msr_entries).map_err(|e| { + cpu::HypervisorCpuError::RunVcpu(e.into()) + })?; + let msr_slice = 
msr_entries.as_slice(); + if msr_slice.is_empty() { + return Err(cpu::HypervisorCpuError::RunVcpu(anyhow!( + "get_msrs returned no entries for index 0x{msr_index:x}" + ))); + } + let msr_val = msr_slice[0].data; + debug!( + "GHCB MSR READ: index=0x{msr_index:x} val=0x{msr_val:x}" + ); + set_svm_field_u64_ptr!(ghcb, rax, msr_val & 0xFFFFFFFF); + set_svm_field_u64_ptr!(ghcb, rdx, msr_val >> 32); + } + + self.clear_swexit_info1()?; + } _ => { panic!("GHCB_INFO_NORMAL: Unhandled exit code: {exit_code:0x}") } } } + GHCB_INFO_SHUTDOWN_REQUEST => { + let ghcb_msr_val = { info.ghcb_msr }; + let reason_set = (ghcb_msr_val >> 12) & 0xf; + let reason_val = (ghcb_msr_val >> 16) & 0xff; + error!( + "GHCB_MSR_TERM_REQ: Guest terminated! \ + ghcb_msr=0x{ghcb_msr_val:x}, \ + reason_set={reason_set}, reason_val={reason_val}" + ); + return Err(cpu::HypervisorCpuError::RunVcpu(anyhow!( + "Guest requested termination via GHCB_MSR_TERM_REQ \ + (reason_set={reason_set}, reason_val={reason_val})" + ))); + } _ => panic!("Unsupported VMGEXIT operation: {ghcb_op:0x}"), } From 082fdc4d076086e84ab0392389354681eb603c70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tobias=20K=C3=A4sser?= Date: Sat, 28 Mar 2026 12:35:34 +0000 Subject: [PATCH 425/742] virtio-devices: fix guest-triggerable panic via OOB queue_select MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The queue_msix_vector register (offset 0x1a in virtio PCI common config) was indexed into the msix_queues Vec using the guest-controlled queue_select value without bounds checking. A malicious guest can set queue_select to any u16 value via offset 0x16, then read or write offset 0x1a to trigger an out-of-bounds panic, crashing the VMM. Replace direct Vec indexing with .get()/.get_mut() for bounds-checked access, returning VIRTQ_MSI_NO_VECTOR (0xFFFF) on OOB reads to match the virtio "no vector" sentinel. Add a regression test that sets queue_select to 0xFFFF and exercises both the read and write paths. 
AI/LLM disclosure: this patch was co-authored with Claude Code. Fixes #7917 Signed-off-by: Tobias Kässer --- .../src/transport/pci_common_config.rs | 50 ++++++++++++++++++- virtio-devices/src/transport/pci_device.rs | 2 +- 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/virtio-devices/src/transport/pci_common_config.rs b/virtio-devices/src/transport/pci_common_config.rs index dcd65f7bc1..c59c454b77 100644 --- a/virtio-devices/src/transport/pci_common_config.rs +++ b/virtio-devices/src/transport/pci_common_config.rs @@ -16,6 +16,7 @@ use virtio_queue::{Queue, QueueT}; use vm_migration::{MigratableError, Pausable, Snapshot, Snapshottable}; use vm_virtio::AccessPlatform; +use super::pci_device::VIRTQ_MSI_NO_VECTOR; use crate::VirtioDevice; pub const VIRTIO_PCI_COMMON_CONFIG_ID: &str = "virtio_pci_common_config"; @@ -249,7 +250,13 @@ impl VirtioPciCommonConfig { 0x12 => queues.len() as u16, // num_queues 0x16 => self.queue_select, 0x18 => self.with_queue(queues, |q| q.size()).unwrap_or(0), - 0x1a => self.msix_queues.lock().unwrap()[self.queue_select as usize], + 0x1a => self + .msix_queues + .lock() + .unwrap() + .get(usize::from(self.queue_select)) + .copied() + .unwrap_or(VIRTQ_MSI_NO_VECTOR), 0x1c => u16::from(self.with_queue(queues, |q| q.ready()).unwrap_or(false)), 0x1e => self.queue_select, // notify_off _ => { @@ -265,7 +272,16 @@ impl VirtioPciCommonConfig { 0x10 => self.msix_config.store(value, Ordering::Release), 0x16 => self.queue_select = value, 0x18 => self.with_queue_mut(queues, |q| q.set_size(value)), - 0x1a => self.msix_queues.lock().unwrap()[self.queue_select as usize] = value, + 0x1a => { + if let Some(entry) = self + .msix_queues + .lock() + .unwrap() + .get_mut(usize::from(self.queue_select)) + { + *entry = value; + } + } 0x1c => self.with_queue_mut(queues, |q| { let ready = value == 1; q.set_ready(ready); @@ -484,4 +500,34 @@ mod unit_tests { assert_eq!(read_back[0], 0xaa); assert_eq!(read_back[1], 0x55); } + + #[test] + fn 
oob_queue_select_does_not_panic() { + // Regression test: reading/writing queue_msix_vector (offset 0x1a) + // with an out-of-bounds queue_select must not panic. + let mut regs = VirtioPciCommonConfig { + access_platform: None, + driver_status: Arc::new(AtomicU8::new(0)), + config_generation: 0, + device_feature_select: 0, + driver_feature_select: 0, + queue_select: 0, + msix_config: Arc::new(AtomicU16::new(0)), + msix_queues: Arc::new(Mutex::new(vec![0; 1])), // only 1 queue + }; + + let dev = Arc::new(Mutex::new(DummyDevice(0))); + let mut queues = vec![Queue::new(256).unwrap()]; + + // Set queue_select to an out-of-bounds value. + regs.write(0x16, &[0xFF, 0xFF], &mut queues, dev.clone()); + + // Read queue_msix_vector — must not panic, should return VIRTQ_MSI_NO_VECTOR. + let mut read_back = vec![0x00, 0x00]; + regs.read(0x1a, &mut read_back, &queues, dev.clone()); + assert_eq!(LittleEndian::read_u16(&read_back), VIRTQ_MSI_NO_VECTOR); + + // Write queue_msix_vector — must not panic. + regs.write(0x1a, &[0xAB, 0xCD], &mut queues, dev); + } } diff --git a/virtio-devices/src/transport/pci_device.rs b/virtio-devices/src/transport/pci_device.rs index 1eb1cc03f7..dd54472d39 100644 --- a/virtio-devices/src/transport/pci_device.rs +++ b/virtio-devices/src/transport/pci_device.rs @@ -45,7 +45,7 @@ use crate::{ }; /// Vector value used to disable MSI for a queue. -const VIRTQ_MSI_NO_VECTOR: u16 = 0xffff; +pub(super) const VIRTQ_MSI_NO_VECTOR: u16 = 0xffff; enum PciCapabilityType { Common = 1, From ca58685f4cbe634e1d1c55da16205d7666e137f8 Mon Sep 17 00:00:00 2001 From: CMGS Date: Tue, 31 Mar 2026 06:13:58 +0000 Subject: [PATCH 426/742] vmm: call notify_guest_clock_paused for Hyper-V guests Previously, KVM_KVMCLOCK_CTRL was skipped when kvm_hyperv=on because Windows does not use pvclock directly. However, KVM internally uses pvclock data structures as the basis for computing the Hyper-V Reference TSC page parameters. 
Not calling KVM_KVMCLOCK_CTRL means there is no mechanism to signal time discontinuity to Windows guests after pause/resume, contributing to multi-minute hangs. Remove the kvm_hyperv guard so all guests receive the clock-paused notification. Signed-off-by: CMGS --- vmm/src/cpu.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/vmm/src/cpu.rs b/vmm/src/cpu.rs index f8fff2b299..4b15cffc31 100644 --- a/vmm/src/cpu.rs +++ b/vmm/src/cpu.rs @@ -2637,16 +2637,16 @@ impl Pausable for CpuManager { self.signal_vcpus() .map_err(|e| MigratableError::Pause(anyhow!("Error signalling vCPUs: {e}")))?; + // Notify all guests (including Hyper-V / Windows) that the clock was + // paused. KVM_KVMCLOCK_CTRL updates internal KVM state that affects + // both pvclock (Linux) and the Hyper-V TSC reference page, so it must + // be called unconditionally. #[cfg(all(feature = "kvm", target_arch = "x86_64"))] for vcpu in self.vcpus.iter() { let vcpu = vcpu.lock().unwrap(); - if !self.config.kvm_hyperv { - vcpu.vcpu.notify_guest_clock_paused().map_err(|e| { - MigratableError::Pause(anyhow!( - "Could not notify guest it has been paused {e:?}" - )) - })?; - } + vcpu.vcpu.notify_guest_clock_paused().map_err(|e| { + MigratableError::Pause(anyhow!("Could not notify guest it has been paused {e:?}")) + })?; } // The vCPU thread will change its paused state before parking, wait here for each From af1d69a9ddff03db3bdee85b299c53c1ea62a9ed Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Sat, 28 Mar 2026 14:11:55 +0100 Subject: [PATCH 427/742] block: Reject discard requests with flags set The virtio spec v1.2 in 5.2.6.2 requires that the device MUST return VIRTIO_BLK_S_UNSUPP for discard commands if the unmap flag is set or if any unknown flag is set. The discard handler was not reading the flags field at all, silently accepting requests with arbitrary flags. Read and validate the flags, rejecting any non-zero value with VIRTIO_BLK_S_UNSUPP. 
Signed-off-by: Anatol Belski --- block/src/lib.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/block/src/lib.rs b/block/src/lib.rs index d62e2717b2..912cf44011 100644 --- a/block/src/lib.rs +++ b/block/src/lib.rs @@ -610,6 +610,7 @@ impl Request { let mut discard_sector = [0u8; 8]; let mut discard_num_sectors = [0u8; 4]; + let mut discard_flags = [0u8; 4]; let sector_addr = data_addr.checked_add(DISCARD_WZ_SECTOR_OFFSET).unwrap(); mem.read_slice(&mut discard_sector, sector_addr) @@ -621,6 +622,17 @@ impl Request { mem.read_slice(&mut discard_num_sectors, num_sectors_addr) .map_err(ExecuteError::Read)?; + let flags_addr = data_addr.checked_add(DISCARD_WZ_FLAGS_OFFSET).unwrap(); + mem.read_slice(&mut discard_flags, flags_addr) + .map_err(ExecuteError::Read)?; + + let discard_flags = u32::from_le_bytes(discard_flags); + // Per virtio spec v1.2 reject discard if any flag is set, including unmap. + if discard_flags != 0 { + warn!("Unsupported flags {discard_flags:#x} in discard request"); + return Err(ExecuteError::Unsupported(VIRTIO_BLK_T_DISCARD)); + } + let discard_sector = u64::from_le_bytes(discard_sector); if discard_sector == 0 && disable_sector0_writes { From 8ca5210603d9c7ad67ccdc31ab66b2c22fce80d1 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Sat, 28 Mar 2026 14:36:15 +0100 Subject: [PATCH 428/742] block: Reject write zeroes with unknown flags The virtio spec v1.2 in 5.2.6.2 requires that the device MUST return VIRTIO_BLK_S_UNSUPP for write zeroes commands if any unknown flag is set. Add an early check that rejects requests with reserved flag bits set by returning VIRTIO_BLK_S_UNSUPP via the existing ExecuteError::Unsupported variant. 
Signed-off-by: Anatol Belski --- block/src/lib.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/block/src/lib.rs b/block/src/lib.rs index 912cf44011..d02ab5469d 100644 --- a/block/src/lib.rs +++ b/block/src/lib.rs @@ -691,7 +691,13 @@ impl Request { let wz_sector = u64::from_le_bytes(wz_sector); let wz_num_sectors = u32::from_le_bytes(wz_num_sectors); + let wz_flags = u32::from_le_bytes(wz_flags); + // Per virtio spec v1.2 reject write zeroes if any unknown flag is set. + if (wz_flags & !VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) != 0 { + warn!("Unsupported flags {wz_flags:#x} in write zeroes request"); + return Err(ExecuteError::Unsupported(VIRTIO_BLK_T_WRITE_ZEROES)); + } let wz_offset = wz_sector * SECTOR_SIZE; if wz_offset == 0 && disable_sector0_writes { From e4e2a37afabc764a9071b01f3d60ef7ced49cd7a Mon Sep 17 00:00:00 2001 From: CMGS Date: Tue, 31 Mar 2026 08:50:04 +0000 Subject: [PATCH 429/742] block: Restrict DISCARD to explicit sparse=true PR #7852 fixed the missing VirtioBlockConfig fields but did not change the feature advertisement logic. The condition `sparse || disk_image.supports_zero_flag()` causes qcow2 to advertise DISCARD even with sparse=false, because qcow2 can mark clusters as zero (supports_zero_flag() returns true). Windows viostor BSODs (DRIVER_IRQL_NOT_LESS_OR_EQUAL) when DISCARD is advertised on qcow2 backends, making sparse=off ineffective as a workaround for qcow2 images. Restrict DISCARD to explicit sparse=true only. WRITE_ZEROES remains available for all sparse-capable backends. 
Fixes #7849 Signed-off-by: CMGS --- cloud-hypervisor/tests/integration.rs | 61 +++++++++++++-------------- virtio-devices/src/block.rs | 9 ++-- 2 files changed, 35 insertions(+), 35 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index bac62af116..5ae554981c 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -4564,7 +4564,6 @@ mod common_parallel { #[test] fn test_virtio_block_sparse_off_qcow2() { const TEST_DISK_SIZE: &str = "2G"; - const CLUSTER_SIZE_BYTES: u64 = 64 * 1024; let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); let guest = Guest::new(Box::new(disk_config)); @@ -4618,37 +4617,36 @@ mod common_parallel { 1 ); - let mut current_offset_kb = 1024; - - for &size_kb in BLOCK_DISCARD_TEST_SIZES_KB.iter() { - guest - .ssh_command(&format!( - "sudo dd if=/dev/urandom of=/dev/vdc bs=1K count={size_kb} seek={current_offset_kb} oflag=direct" - )) - .unwrap(); - - guest.ssh_command("sync").unwrap(); - - guest - .ssh_command(&format!( - "sudo blkdiscard -o {} -l {} /dev/vdc", - current_offset_kb * 1024, - size_kb * 1024 - )) - .unwrap(); - - guest.ssh_command("sync").unwrap(); + // With sparse=off, DISCARD should NOT be advertised. + // blkdiscard is expected to fail. 
+ let discard_result = + guest.ssh_command("sudo blkdiscard -o 1048576 -l 1048576 /dev/vdc 2>&1; echo $?"); + let exit_code = discard_result + .unwrap() + .trim() + .lines() + .last() + .unwrap_or("1") + .parse::() + .unwrap_or(1); + assert_ne!( + exit_code, 0, + "blkdiscard should fail with sparse=off (DISCARD not advertised)" + ); - // Verify VM sees zeros in discarded region - assert_guest_disk_region_is_zero( - &guest, - "/dev/vdc", - current_offset_kb * 1024, - size_kb * 1024, - ); + // WRITE_ZEROES should still work via blkdiscard --zeroout + guest + .ssh_command( + "sudo dd if=/dev/urandom of=/dev/vdc bs=1K count=64 seek=1024 oflag=direct", + ) + .unwrap(); + guest.ssh_command("sync").unwrap(); + guest + .ssh_command("sudo blkdiscard -z -o 1048576 -l 65536 /dev/vdc") + .unwrap(); + guest.ssh_command("sync").unwrap(); - current_offset_kb += size_kb + 64; - } + assert_guest_disk_region_is_zero(&guest, "/dev/vdc", 1048576, 65536); }); kill_child(&mut child); @@ -4659,9 +4657,10 @@ mod common_parallel { handle_child_output(r, &output); + // WRITE_ZEROES should still produce zero-flagged regions assert!( zero_regions_after > zero_regions_before, - "Expected zero-flagged regions to increase with sparse=off: before={zero_regions_before}, after={zero_regions_after}" + "Expected zero-flagged regions to increase via WRITE_ZEROES: before={zero_regions_before}, after={zero_regions_after}" ); disk_check_consistency(test_disk_path, None); diff --git a/virtio-devices/src/block.rs b/virtio-devices/src/block.rs index 8f919e3e04..1080b71bcb 100644 --- a/virtio-devices/src/block.rs +++ b/virtio-devices/src/block.rs @@ -778,13 +778,14 @@ impl Block { | (1u64 << VIRTIO_RING_F_INDIRECT_DESC); // When backend supports sparse operations: - // - Always advertise WRITE_ZEROES - // - Advertise DISCARD only if sparse=true OR format supports marking - // clusters as zero without deallocating + // - Always advertise WRITE_ZEROES (safe for all drivers) + // - Advertise DISCARD only when 
sparse=true, since DISCARD + // deallocates space via punch_hole and should require + // explicit user opt in. let mut discard_supported = false; if disk_image.supports_sparse_operations() { avail_features |= 1u64 << VIRTIO_BLK_F_WRITE_ZEROES; - if sparse || disk_image.supports_zero_flag() { + if sparse { avail_features |= 1u64 << VIRTIO_BLK_F_DISCARD; discard_supported = true; } From 9f980969fced6479ad5b2de94ca1699c3f409b87 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Tue, 31 Mar 2026 22:50:25 +0200 Subject: [PATCH 430/742] block: Add UnsupportedFlags error variant for flag validation Introduce ExecuteError::UnsupportedFlags to carry both the request type and the rejected flags value, replacing the generic ExecuteError::Unsupported at discard and write zeroes flag validation sites. This provides structured context for debugging without changing the returned VIRTIO_BLK_S_UNSUPP status. Signed-off-by: Anatol Belski --- block/src/lib.rs | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/block/src/lib.rs b/block/src/lib.rs index d02ab5469d..6d093daca4 100644 --- a/block/src/lib.rs +++ b/block/src/lib.rs @@ -169,6 +169,8 @@ pub enum ExecuteError { WriteAll(#[source] io::Error), #[error("Unsupported request: {0}")] Unsupported(u32), + #[error("Unsupported flags {flags:#x} for request type {request_type}")] + UnsupportedFlags { request_type: u32, flags: u32 }, #[error("Failed to submit io uring")] SubmitIoUring(#[source] io::Error), #[error("Failed to get guest address")] @@ -199,6 +201,7 @@ impl ExecuteError { ExecuteError::Write(_) => VIRTIO_BLK_S_IOERR, ExecuteError::WriteAll(_) => VIRTIO_BLK_S_IOERR, ExecuteError::Unsupported(_) => VIRTIO_BLK_S_UNSUPP, + ExecuteError::UnsupportedFlags { .. 
} => VIRTIO_BLK_S_UNSUPP, ExecuteError::SubmitIoUring(_) => VIRTIO_BLK_S_IOERR, ExecuteError::GetHostAddress(_) => VIRTIO_BLK_S_IOERR, ExecuteError::AsyncRead(_) => VIRTIO_BLK_S_IOERR, @@ -630,7 +633,10 @@ impl Request { // Per virtio spec v1.2 reject discard if any flag is set, including unmap. if discard_flags != 0 { warn!("Unsupported flags {discard_flags:#x} in discard request"); - return Err(ExecuteError::Unsupported(VIRTIO_BLK_T_DISCARD)); + return Err(ExecuteError::UnsupportedFlags { + request_type: VIRTIO_BLK_T_DISCARD, + flags: discard_flags, + }); } let discard_sector = u64::from_le_bytes(discard_sector); @@ -696,7 +702,10 @@ impl Request { // Per virtio spec v1.2 reject write zeroes if any unknown flag is set. if (wz_flags & !VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) != 0 { warn!("Unsupported flags {wz_flags:#x} in write zeroes request"); - return Err(ExecuteError::Unsupported(VIRTIO_BLK_T_WRITE_ZEROES)); + return Err(ExecuteError::UnsupportedFlags { + request_type: VIRTIO_BLK_T_WRITE_ZEROES, + flags: wz_flags, + }); } let wz_offset = wz_sector * SECTOR_SIZE; From 820140930a5ccbd59d9bfeb9e310d4ef5ae49141 Mon Sep 17 00:00:00 2001 From: Bo Chen Date: Mon, 16 Mar 2026 17:55:52 +0000 Subject: [PATCH 431/742] vmm: interrupt: Reduce visibility of internal types and methods Signed-off-by: Bo Chen --- vmm/src/interrupt.rs | 52 ++++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 29 deletions(-) diff --git a/vmm/src/interrupt.rs b/vmm/src/interrupt.rs index 0995d83567..80ddd46ae8 100644 --- a/vmm/src/interrupt.rs +++ b/vmm/src/interrupt.rs @@ -17,7 +17,7 @@ use vm_device::interrupt::{ use vmm_sys_util::eventfd::EventFd; /// Reuse std::io::Result to simplify interoperability among crates. 
-pub type Result = std::io::Result; +type Result = std::io::Result; struct InterruptRoute { gsi: u32, @@ -26,11 +26,11 @@ struct InterruptRoute { } impl InterruptRoute { - pub fn new(allocator: &mut SystemAllocator) -> Result { + fn new(allocator: &mut SystemAllocator) -> Result { Self::new_with_fd(allocator, Some(EventFd::new(libc::EFD_NONBLOCK)?)) } - pub fn new_with_fd(allocator: &mut SystemAllocator, irq_fd: Option) -> Result { + fn new_with_fd(allocator: &mut SystemAllocator, irq_fd: Option) -> Result { let gsi = allocator .allocate_gsi() .ok_or_else(|| io::Error::other("Failed allocating new GSI"))?; @@ -42,7 +42,7 @@ impl InterruptRoute { }) } - pub fn enable(&mut self, vm: &dyn hypervisor::Vm) -> Result<()> { + fn enable(&mut self, vm: &dyn hypervisor::Vm) -> Result<()> { if !self.registered { if let Some(ref irq_fd) = self.irq_fd { vm.register_irqfd(irq_fd, self.gsi) @@ -56,7 +56,7 @@ impl InterruptRoute { Ok(()) } - pub fn disable(&mut self, vm: &dyn hypervisor::Vm) -> Result<()> { + fn disable(&mut self, vm: &dyn hypervisor::Vm) -> Result<()> { if self.registered { if let Some(ref irq_fd) = self.irq_fd { vm.unregister_irqfd(irq_fd, self.gsi) @@ -70,14 +70,14 @@ impl InterruptRoute { Ok(()) } - pub fn trigger(&mut self) -> Result<()> { + fn trigger(&mut self) -> Result<()> { match self.irq_fd { Some(ref fd) => fd.write(1), None => Ok(()), } } - pub fn notifier(&mut self) -> Option { + fn notifier(&mut self) -> Option { Some( self.irq_fd .as_ref()? @@ -90,11 +90,7 @@ impl InterruptRoute { // will use it. Use #[allow(dead_code)] to suppress a compiler // warning. 
#[allow(dead_code)] - pub fn set_notifier( - &mut self, - eventfd: Option, - vm: &dyn hypervisor::Vm, - ) -> Result<()> { + fn set_notifier(&mut self, eventfd: Option, vm: &dyn hypervisor::Vm) -> Result<()> { let old_irqfd = core::mem::replace(&mut self.irq_fd, eventfd); if self.registered { if let Some(ref irq_fd) = self.irq_fd { @@ -114,18 +110,30 @@ impl InterruptRoute { } } -pub struct RoutingEntry { +struct RoutingEntry { route: IrqRoutingEntry, masked: bool, } -pub struct MsiInterruptGroup { +struct MsiInterruptGroup { vm: Arc, gsi_msi_routes: Arc>>, irq_routes: HashMap>, } impl MsiInterruptGroup { + fn new( + vm: Arc, + gsi_msi_routes: Arc>>, + irq_routes: HashMap>, + ) -> Self { + MsiInterruptGroup { + vm, + gsi_msi_routes, + irq_routes, + } + } + fn set_gsi_routes(&self, routes: &HashMap) -> Result<()> { let mut entry_vec: Vec = Vec::new(); for (_, entry) in routes.iter() { @@ -142,20 +150,6 @@ impl MsiInterruptGroup { } } -impl MsiInterruptGroup { - fn new( - vm: Arc, - gsi_msi_routes: Arc>>, - irq_routes: HashMap>, - ) -> Self { - MsiInterruptGroup { - vm, - gsi_msi_routes, - irq_routes, - } - } -} - impl InterruptSourceGroup for MsiInterruptGroup { fn enable(&self) -> Result<()> { for (_, route) in self.irq_routes.iter() { @@ -253,7 +247,7 @@ impl InterruptSourceGroup for MsiInterruptGroup { } } -pub struct LegacyUserspaceInterruptGroup { +struct LegacyUserspaceInterruptGroup { ioapic: Arc>, irq: u32, } From 0686045290204c59661f46f987e2cd60c5f40ee7 Mon Sep 17 00:00:00 2001 From: Bo Chen Date: Fri, 27 Mar 2026 23:34:16 +0000 Subject: [PATCH 432/742] vmm: interrupt: Allocate GSIs for MSI/MSI-X interrupt vectors lazily Previously, GSIs were eagerly allocated for all MSI-X vectors a device advertises (i.e. the maximum the device can support). This can easily exhaust KVM_MAX_IRQ_ROUTES (4096) with modern NVMe devices that support up to 2048 MSI-X vectors. Defer GSI allocation to the first time an interrupt vector is unmasked. 
The EventFd is still created eagerly since external components (e.g. VFIO) need it at device init time. Signed-off-by: Bo Chen --- vmm/src/interrupt.rs | 79 +++++++++++++++++++++++++++++++++----------- 1 file changed, 60 insertions(+), 19 deletions(-) diff --git a/vmm/src/interrupt.rs b/vmm/src/interrupt.rs index 80ddd46ae8..f08aaab7fa 100644 --- a/vmm/src/interrupt.rs +++ b/vmm/src/interrupt.rs @@ -20,32 +20,49 @@ use vmm_sys_util::eventfd::EventFd; type Result = std::io::Result; struct InterruptRoute { - gsi: u32, + gsi: Option, irq_fd: Option, registered: bool, } impl InterruptRoute { - fn new(allocator: &mut SystemAllocator) -> Result { - Self::new_with_fd(allocator, Some(EventFd::new(libc::EFD_NONBLOCK)?)) + fn new() -> Result { + // The irq_fd must be created eagerly because external components + // (say, VFIO) need the fd at device initialization time via notifier(). + Self::new_with_fd(Some(EventFd::new(libc::EFD_NONBLOCK)?)) } - fn new_with_fd(allocator: &mut SystemAllocator, irq_fd: Option) -> Result { - let gsi = allocator - .allocate_gsi() - .ok_or_else(|| io::Error::other("Failed allocating new GSI"))?; - + fn new_with_fd(irq_fd: Option) -> Result { Ok(InterruptRoute { - gsi, + gsi: None, irq_fd, registered: false, }) } + fn allocate_gsi(&mut self, allocator: &mut SystemAllocator) -> Result { + match self.gsi { + Some(existing) => Ok(existing), + None => { + let new_gsi = allocator + .allocate_gsi() + .ok_or_else(|| io::Error::other("Failed allocating new GSI"))?; + self.gsi = Some(new_gsi); + Ok(new_gsi) + } + } + } + fn enable(&mut self, vm: &dyn hypervisor::Vm) -> Result<()> { + let gsi = match self.gsi { + Some(gsi) => gsi, + // Do nothing if no GSI was ever allocated for this route, which means the interrupt is still masked. 
+ None => return Ok(()), + }; + if !self.registered { if let Some(ref irq_fd) = self.irq_fd { - vm.register_irqfd(irq_fd, self.gsi) + vm.register_irqfd(irq_fd, gsi) .map_err(|e| io::Error::other(format!("Failed registering irq_fd: {e}")))?; } @@ -57,9 +74,15 @@ impl InterruptRoute { } fn disable(&mut self, vm: &dyn hypervisor::Vm) -> Result<()> { + let gsi = match self.gsi { + Some(gsi) => gsi, + // Do nothing if no GSI was ever allocated for this route, which means the interrupt is still masked. + None => return Ok(()), + }; + if self.registered { if let Some(ref irq_fd) = self.irq_fd { - vm.unregister_irqfd(irq_fd, self.gsi) + vm.unregister_irqfd(irq_fd, gsi) .map_err(|e| io::Error::other(format!("Failed unregistering irq_fd: {e}")))?; } @@ -93,14 +116,17 @@ impl InterruptRoute { fn set_notifier(&mut self, eventfd: Option, vm: &dyn hypervisor::Vm) -> Result<()> { let old_irqfd = core::mem::replace(&mut self.irq_fd, eventfd); if self.registered { + // A registered route must have a GSI allocated, since enable() + // only sets registered=true after using a valid GSI. + let gsi = self.gsi.expect("registered route has no GSI allocated"); if let Some(ref irq_fd) = self.irq_fd { - vm.register_irqfd(irq_fd, self.gsi) + vm.register_irqfd(irq_fd, gsi) .map_err(|e| io::Error::other(format!("Failed registering irq_fd: {e}")))?; } // If the irqfd cannot be unregistered, what to do? Spin? // Returning an error isn't helpful as the new irqfd is already registered. 
if let Some(old_irq_fd) = old_irqfd { - match vm.unregister_irqfd(&old_irq_fd, self.gsi) { + match vm.unregister_irqfd(&old_irq_fd, gsi) { Ok(()) => {} Err(e) => log::warn!("Failed unregistering old irqfd: {e}"), } @@ -119,6 +145,7 @@ struct MsiInterruptGroup { vm: Arc, gsi_msi_routes: Arc>>, irq_routes: HashMap>, + allocator: Arc>, } impl MsiInterruptGroup { @@ -126,11 +153,13 @@ impl MsiInterruptGroup { vm: Arc, gsi_msi_routes: Arc>>, irq_routes: HashMap>, + allocator: Arc>, ) -> Self { MsiInterruptGroup { vm, gsi_msi_routes, irq_routes, + allocator, } } @@ -194,8 +223,20 @@ impl InterruptSourceGroup for MsiInterruptGroup { ) -> Result<()> { if let Some(route) = self.irq_routes.get(&index) { let mut route = route.lock().unwrap(); + let gsi = if masked { + match route.gsi { + Some(gsi) => gsi, + // No update needed if masked and no GSI was ever allocated + None => return Ok(()), + } + } else { + // Allocate a GSI when the interrupt vector is first unmasked + let mut allocator = self.allocator.lock().unwrap(); + route.allocate_gsi(&mut allocator)? 
+ }; + let entry = RoutingEntry { - route: self.vm.make_routing_entry(route.gsi, &config), + route: self.vm.make_routing_entry(gsi, &config), masked, }; @@ -208,7 +249,7 @@ impl InterruptSourceGroup for MsiInterruptGroup { } let mut routes = self.gsi_msi_routes.lock().unwrap(); - routes.insert(route.gsi, entry); + routes.insert(gsi, entry); if set_gsi { self.set_gsi_routes(&routes)?; } @@ -338,17 +379,17 @@ impl MsiInterruptManager { &self, config: ::GroupConfig, ) -> Result { - let mut allocator = self.allocator.lock().unwrap(); let mut irq_routes: HashMap> = HashMap::with_capacity(config.count as usize); for i in config.base..config.base + config.count { - irq_routes.insert(i, Mutex::new(InterruptRoute::new(&mut allocator)?)); + irq_routes.insert(i, Mutex::new(InterruptRoute::new()?)); } Ok(MsiInterruptGroup::new( self.vm.clone(), self.gsi_msi_routes.clone(), irq_routes, + self.allocator.clone(), )) } } @@ -357,17 +398,17 @@ impl InterruptManager for MsiInterruptManager { type GroupConfig = MsiIrqGroupConfig; fn create_group(&self, config: Self::GroupConfig) -> Result> { - let mut allocator = self.allocator.lock().unwrap(); let mut irq_routes: HashMap> = HashMap::with_capacity(config.count as usize); for i in config.base..config.base + config.count { - irq_routes.insert(i, Mutex::new(InterruptRoute::new(&mut allocator)?)); + irq_routes.insert(i, Mutex::new(InterruptRoute::new()?)); } Ok(Arc::new(MsiInterruptGroup::new( self.vm.clone(), self.gsi_msi_routes.clone(), irq_routes, + self.allocator.clone(), ))) } From db93c6fdc7dfc3786128d2aaf2086fe49bc0d858 Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Tue, 31 Mar 2026 12:03:20 +0200 Subject: [PATCH 433/742] pci: Save deferred BAR reprogramming state OVMF can reprogram PCI BARs while memory space decoding is disabled. Cloud Hypervisor defers the corresponding BAR move in `pending_bar_reprogram` until the PCI command register enables Memory Space again. That deferred state was not part of `PciConfigurationState`. 
A snapshot taken in that window restored the new BAR values in PCI config space, but lost the pending BAR relocation needed to update the VMM-side BAR mapping. The restore logs show guest MMIO accesses to the reprogrammed BAR addresses `0xc0000000`, `0x100000000`, and `0x100080000` hitting unregistered addresses. The firmware serial output shows OVMF assigning those same BAR addresses during PCI resource allocation, then reaching BDS, finding the mass-storage device, and failing to boot from it. Serialize and restore `pending_bar_reprogram` so deferred BAR moves survive snapshot and restore. Co-authored-by: Thomas Prescher Co-authored-by: Julian Schindel On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- pci/src/configuration.rs | 9 ++++++++- pci/src/device.rs | 3 ++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/pci/src/configuration.rs b/pci/src/configuration.rs index 4dbd04f122..f506017b26 100644 --- a/pci/src/configuration.rs +++ b/pci/src/configuration.rs @@ -422,6 +422,9 @@ pub struct PciConfigurationState { rom_bar_used: bool, last_capability: Option<(usize, usize)>, msix_cap_reg_idx: Option, + // Preserve deferred BAR moves across snapshot and restore. + #[serde(default)] + pending_bar_reprogram: Vec, } /// Contains the configuration space of a PCI node. 
@@ -557,6 +560,7 @@ impl PciConfiguration { rom_bar_used, last_capability, msix_cap_reg_idx, + pending_bar_reprogram, ) = if let Some(state) = state { ( state.registers.try_into().unwrap(), @@ -567,6 +571,7 @@ impl PciConfiguration { state.rom_bar_used, state.last_capability, state.msix_cap_reg_idx, + state.pending_bar_reprogram, ) } else { let mut registers = [0u32; NUM_CONFIGURATION_REGISTERS]; @@ -606,6 +611,7 @@ impl PciConfiguration { false, None, None, + Vec::new(), ) }; @@ -619,7 +625,7 @@ impl PciConfiguration { last_capability, msix_cap_reg_idx, msix_config, - pending_bar_reprogram: Vec::new(), + pending_bar_reprogram, } } @@ -633,6 +639,7 @@ impl PciConfiguration { rom_bar_used: self.rom_bar_used, last_capability: self.last_capability, msix_cap_reg_idx: self.msix_cap_reg_idx, + pending_bar_reprogram: self.pending_bar_reprogram.clone(), } } diff --git a/pci/src/device.rs b/pci/src/device.rs index 3c5b3315f8..3a23ea7772 100644 --- a/pci/src/device.rs +++ b/pci/src/device.rs @@ -8,6 +8,7 @@ use std::any::Any; use std::sync::{Arc, Barrier, Mutex}; use std::{io, result}; +use serde::{Deserialize, Serialize}; use thiserror::Error; use vm_allocator::{AddressAllocator, SystemAllocator}; use vm_device::Resource; @@ -35,7 +36,7 @@ pub enum Error { } pub type Result = std::result::Result; -#[derive(Clone, Copy, Debug)] +#[derive(Clone, Copy, Debug, Serialize, Deserialize)] pub struct BarReprogrammingParams { pub old_base: u64, pub new_base: u64, From a0bbef3a761bd0824c447d06d02b79dff4ab534c Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Tue, 31 Mar 2026 04:31:35 -0700 Subject: [PATCH 434/742] virtio-devices: Embed VirtioCommon in VhostUserCommon Since vhost-user devices are always virtio devices it makes sense to structure this struct inside the VhostUserCommon struct. This then also makes some of the methods on VhostUserCommon cleaner since they can now act directly on the common virtio bits (e.g. 
for kill_evt) Signed-off-by: Rob Bradford --- virtio-devices/src/vhost_user/blk.rs | 62 +++++++------- virtio-devices/src/vhost_user/fs.rs | 62 +++++++------- .../src/vhost_user/generic_vhost_user.rs | 62 +++++++------- virtio-devices/src/vhost_user/mod.rs | 15 ++-- virtio-devices/src/vhost_user/net.rs | 83 ++++++++++--------- 5 files changed, 145 insertions(+), 139 deletions(-) diff --git a/virtio-devices/src/vhost_user/blk.rs b/virtio-devices/src/vhost_user/blk.rs index 203012b8ec..f6e9a46232 100644 --- a/virtio-devices/src/vhost_user/blk.rs +++ b/virtio-devices/src/vhost_user/blk.rs @@ -39,7 +39,6 @@ struct BackendReqHandler {} impl VhostUserFrontendReqHandler for BackendReqHandler {} pub struct Blk { - common: VirtioCommon, vu_common: VhostUserCommon, id: String, config: VirtioBlockConfig, @@ -170,17 +169,17 @@ impl Blk { }; Ok(Blk { - common: VirtioCommon { - device_type: VirtioDeviceType::Block as u32, - queue_sizes: vec![vu_cfg.queue_size; num_queues], - avail_features, - acked_features, - paused_sync: Some(Arc::new(Barrier::new(2))), - min_queues: DEFAULT_QUEUE_NUMBER as u16, - paused: Arc::new(AtomicBool::new(paused)), - ..Default::default() - }, vu_common: VhostUserCommon { + virtio_common: VirtioCommon { + device_type: VirtioDeviceType::Block as u32, + queue_sizes: vec![vu_cfg.queue_size; num_queues], + avail_features, + acked_features, + paused_sync: Some(Arc::new(Barrier::new(2))), + min_queues: DEFAULT_QUEUE_NUMBER as u16, + paused: Arc::new(AtomicBool::new(paused)), + ..Default::default() + }, vu: Some(Arc::new(Mutex::new(vu))), acked_protocol_features, socket_path: vu_cfg.socket, @@ -199,18 +198,18 @@ impl Blk { } fn state(&self) -> std::result::Result { - self.vu_common.state(&self.common, self.config) + self.vu_common.state(self.config) } } impl Drop for Blk { fn drop(&mut self) { - if let Some(kill_evt) = self.common.kill_evt.take() + if let Some(kill_evt) = self.vu_common.virtio_common.kill_evt.take() && let Err(e) = kill_evt.write(1) { 
error!("failed to kill vhost-user-blk: {e:?}"); } - self.common.wait_for_epoll_threads(); + self.vu_common.virtio_common.wait_for_epoll_threads(); if let Some(thread) = self.epoll_thread.take() && let Err(e) = thread.join() { @@ -221,15 +220,15 @@ impl Drop for Blk { impl VirtioDevice for Blk { fn device_type(&self) -> u32 { - self.common.device_type + self.vu_common.virtio_common.device_type } fn queue_max_sizes(&self) -> &[u16] { - &self.common.queue_sizes + &self.vu_common.virtio_common.queue_sizes } fn features(&self) -> u64 { - let mut features = self.common.avail_features; + let mut features = self.vu_common.virtio_common.avail_features; if self.iommu { features |= 1u64 << VIRTIO_F_ACCESS_PLATFORM; } @@ -237,7 +236,7 @@ impl VirtioDevice for Blk { } fn ack_features(&mut self, value: u64) { - self.common.ack_features(value); + self.vu_common.virtio_common.ack_features(value); } fn read_config(&self, offset: u64, data: &mut [u8]) { @@ -278,27 +277,29 @@ impl VirtioDevice for Blk { queues, .. } = context; - self.common.activate(&queues, interrupt_cb.clone())?; + self.vu_common + .virtio_common + .activate(&queues, interrupt_cb.clone())?; self.guest_memory = Some(mem.clone()); let backend_req_handler: Option> = None; // Run a dedicated thread for handling potential reconnections with // the backend. 
- let (kill_evt, pause_evt) = self.common.dup_eventfds(); + let (kill_evt, pause_evt) = self.vu_common.virtio_common.dup_eventfds(); let mut handler = self.vu_common.activate( mem, &queues, interrupt_cb, - self.common.acked_features, + self.vu_common.virtio_common.acked_features, backend_req_handler, kill_evt, pause_evt, )?; - let paused = self.common.paused.clone(); - let paused_sync = self.common.paused_sync.clone(); + let paused = self.vu_common.virtio_common.paused.clone(); + let paused_sync = self.vu_common.virtio_common.paused_sync.clone(); let mut epoll_threads = Vec::new(); @@ -317,8 +318,8 @@ impl VirtioDevice for Blk { fn reset(&mut self) -> Option> { // We first must resume the virtio thread if it was paused. - if self.common.pause_evt.take().is_some() { - self.common.resume().ok()?; + if self.vu_common.virtio_common.pause_evt.take().is_some() { + self.vu_common.virtio_common.resume().ok()?; } if let Some(vu) = &self.vu_common.vu @@ -328,7 +329,7 @@ impl VirtioDevice for Blk { return None; } - if let Some(kill_evt) = self.common.kill_evt.take() { + if let Some(kill_evt) = self.vu_common.virtio_common.kill_evt.take() { // Ignore the result because there is nothing we can do about it. 
let _ = kill_evt.write(1); } @@ -336,7 +337,7 @@ impl VirtioDevice for Blk { event!("virtio-device", "reset", "id", &self.id); // Return the interrupt - Some(self.common.interrupt_cb.take().unwrap()) + Some(self.vu_common.virtio_common.interrupt_cb.take().unwrap()) } fn shutdown(&mut self) { @@ -354,11 +355,11 @@ impl VirtioDevice for Blk { impl Pausable for Blk { fn pause(&mut self) -> result::Result<(), MigratableError> { self.vu_common.pause()?; - self.common.pause() + self.vu_common.virtio_common.pause() } fn resume(&mut self) -> result::Result<(), MigratableError> { - self.common.resume()?; + self.vu_common.virtio_common.resume()?; if let Some(epoll_thread) = &self.epoll_thread { epoll_thread.thread().unpark(); @@ -397,7 +398,6 @@ impl Migratable for Blk { } fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> { - self.vu_common - .complete_migration(self.common.kill_evt.take()) + self.vu_common.complete_migration() } } diff --git a/virtio-devices/src/vhost_user/fs.rs b/virtio-devices/src/vhost_user/fs.rs index 1b8edfe6bd..3f982ff299 100644 --- a/virtio-devices/src/vhost_user/fs.rs +++ b/virtio-devices/src/vhost_user/fs.rs @@ -59,7 +59,6 @@ impl Default for VirtioFsConfig { unsafe impl ByteValued for VirtioFsConfig {} pub struct Fs { - common: VirtioCommon, vu_common: VhostUserCommon, id: String, config: VirtioFsConfig, @@ -178,17 +177,17 @@ impl Fs { }; Ok(Fs { - common: VirtioCommon { - device_type: VirtioDeviceType::Fs as u32, - avail_features, - acked_features, - queue_sizes: vec![queue_size; num_queues], - paused_sync: Some(Arc::new(Barrier::new(2))), - min_queues: 1, - paused: Arc::new(AtomicBool::new(paused)), - ..Default::default() - }, vu_common: VhostUserCommon { + virtio_common: VirtioCommon { + device_type: VirtioDeviceType::Fs as u32, + avail_features, + acked_features, + queue_sizes: vec![queue_size; num_queues], + paused_sync: Some(Arc::new(Barrier::new(2))), + min_queues: 1, + paused: Arc::new(AtomicBool::new(paused)), + 
..Default::default() + }, vu: Some(Arc::new(Mutex::new(vu))), acked_protocol_features, socket_path: path.to_string(), @@ -208,17 +207,17 @@ impl Fs { } fn state(&self) -> std::result::Result { - self.vu_common.state(&self.common, self.config) + self.vu_common.state(self.config) } } impl Drop for Fs { fn drop(&mut self) { - if let Some(kill_evt) = self.common.kill_evt.take() { + if let Some(kill_evt) = self.vu_common.virtio_common.kill_evt.take() { // Ignore the result because there is nothing we can do about it. let _ = kill_evt.write(1); } - self.common.wait_for_epoll_threads(); + self.vu_common.virtio_common.wait_for_epoll_threads(); if let Some(thread) = self.epoll_thread.take() && let Err(e) = thread.join() { @@ -229,15 +228,15 @@ impl Drop for Fs { impl VirtioDevice for Fs { fn device_type(&self) -> u32 { - self.common.device_type + self.vu_common.virtio_common.device_type } fn queue_max_sizes(&self) -> &[u16] { - &self.common.queue_sizes + &self.vu_common.virtio_common.queue_sizes } fn features(&self) -> u64 { - let mut features = self.common.avail_features; + let mut features = self.vu_common.virtio_common.avail_features; if self.iommu { features |= 1u64 << VIRTIO_F_ACCESS_PLATFORM; } @@ -245,7 +244,7 @@ impl VirtioDevice for Fs { } fn ack_features(&mut self, value: u64) { - self.common.ack_features(value); + self.vu_common.virtio_common.ack_features(value); } fn read_config(&self, offset: u64, data: &mut [u8]) { @@ -259,26 +258,28 @@ impl VirtioDevice for Fs { queues, .. } = context; - self.common.activate(&queues, interrupt_cb.clone())?; + self.vu_common + .virtio_common + .activate(&queues, interrupt_cb.clone())?; self.guest_memory = Some(mem.clone()); let backend_req_handler: Option> = None; // Run a dedicated thread for handling potential reconnections with // the backend. 
- let (kill_evt, pause_evt) = self.common.dup_eventfds(); + let (kill_evt, pause_evt) = self.vu_common.virtio_common.dup_eventfds(); let mut handler = self.vu_common.activate( mem, &queues, interrupt_cb, - self.common.acked_features, + self.vu_common.virtio_common.acked_features, backend_req_handler, kill_evt, pause_evt, )?; - let paused = self.common.paused.clone(); - let paused_sync = self.common.paused_sync.clone(); + let paused = self.vu_common.virtio_common.paused.clone(); + let paused_sync = self.vu_common.virtio_common.paused_sync.clone(); let mut epoll_threads = Vec::new(); spawn_virtio_thread( @@ -297,8 +298,8 @@ impl VirtioDevice for Fs { fn reset(&mut self) -> Option> { // We first must resume the virtio thread if it was paused. - if self.common.pause_evt.take().is_some() { - self.common.resume().ok()?; + if self.vu_common.virtio_common.pause_evt.take().is_some() { + self.vu_common.virtio_common.resume().ok()?; } if let Some(vu) = &self.vu_common.vu @@ -308,7 +309,7 @@ impl VirtioDevice for Fs { return None; } - if let Some(kill_evt) = self.common.kill_evt.take() { + if let Some(kill_evt) = self.vu_common.virtio_common.kill_evt.take() { // Ignore the result because there is nothing we can do about it. 
let _ = kill_evt.write(1); } @@ -316,7 +317,7 @@ impl VirtioDevice for Fs { event!("virtio-device", "reset", "id", &self.id); // Return the interrupt - Some(self.common.interrupt_cb.take().unwrap()) + Some(self.vu_common.virtio_common.interrupt_cb.take().unwrap()) } fn shutdown(&mut self) { @@ -364,11 +365,11 @@ impl VirtioDevice for Fs { impl Pausable for Fs { fn pause(&mut self) -> result::Result<(), MigratableError> { self.vu_common.pause()?; - self.common.pause() + self.vu_common.virtio_common.pause() } fn resume(&mut self) -> result::Result<(), MigratableError> { - self.common.resume()?; + self.vu_common.virtio_common.resume()?; if let Some(epoll_thread) = &self.epoll_thread { epoll_thread.thread().unpark(); @@ -407,7 +408,6 @@ impl Migratable for Fs { } fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> { - self.vu_common - .complete_migration(self.common.kill_evt.take()) + self.vu_common.complete_migration() } } diff --git a/virtio-devices/src/vhost_user/generic_vhost_user.rs b/virtio-devices/src/vhost_user/generic_vhost_user.rs index da8fe53a88..0554973f82 100644 --- a/virtio-devices/src/vhost_user/generic_vhost_user.rs +++ b/virtio-devices/src/vhost_user/generic_vhost_user.rs @@ -34,7 +34,6 @@ pub type State = VhostUserState<()>; struct BackendReqHandler {} impl VhostUserFrontendReqHandler for BackendReqHandler {} pub struct GenericVhostUser { - common: VirtioCommon, vu_common: VhostUserCommon, id: String, // Hold ownership of the memory that is allocated for the device @@ -138,17 +137,17 @@ since the backend only supports {backend_num_queues}\n", }; Ok(GenericVhostUser { - common: VirtioCommon { - device_type, - avail_features, - acked_features, - queue_sizes: request_queue_sizes, - paused_sync: Some(Arc::new(Barrier::new(2))), - min_queues: 1, - paused: Arc::new(AtomicBool::new(paused)), - ..Default::default() - }, vu_common: VhostUserCommon { + virtio_common: VirtioCommon { + device_type, + avail_features, + acked_features, + 
queue_sizes: request_queue_sizes, + paused_sync: Some(Arc::new(Barrier::new(2))), + min_queues: 1, + paused: Arc::new(AtomicBool::new(paused)), + ..Default::default() + }, vu: Some(Arc::new(Mutex::new(vu))), acked_protocol_features, socket_path: path.to_string(), @@ -168,7 +167,7 @@ since the backend only supports {backend_num_queues}\n", } fn state(&self) -> std::result::Result { - self.vu_common.state(&self.common, ()) + self.vu_common.state(()) } #[cold] @@ -189,11 +188,11 @@ space access. Reads will return 0xFF and writes will be ignored." impl Drop for GenericVhostUser { fn drop(&mut self) { - if let Some(kill_evt) = self.common.kill_evt.take() { + if let Some(kill_evt) = self.vu_common.virtio_common.kill_evt.take() { // Ignore the result because there is nothing we can do about it. let _ = kill_evt.write(1); } - self.common.wait_for_epoll_threads(); + self.vu_common.virtio_common.wait_for_epoll_threads(); if let Some(thread) = self.epoll_thread.take() && let Err(e) = thread.join() { @@ -204,15 +203,15 @@ impl Drop for GenericVhostUser { impl VirtioDevice for GenericVhostUser { fn device_type(&self) -> u32 { - self.common.device_type + self.vu_common.virtio_common.device_type } fn queue_max_sizes(&self) -> &[u16] { - &self.common.queue_sizes + &self.vu_common.virtio_common.queue_sizes } fn features(&self) -> u64 { - let mut features = self.common.avail_features; + let mut features = self.vu_common.virtio_common.avail_features; if self.iommu { features |= 1u64 << VIRTIO_F_ACCESS_PLATFORM; } @@ -220,7 +219,7 @@ impl VirtioDevice for GenericVhostUser { } fn ack_features(&mut self, value: u64) { - self.common.ack_features(value); + self.vu_common.virtio_common.ack_features(value); } fn read_config(&self, offset: u64, data: &mut [u8]) { @@ -282,26 +281,28 @@ impl VirtioDevice for GenericVhostUser { queues, .. 
} = context; - self.common.activate(&queues, interrupt_cb.clone())?; + self.vu_common + .virtio_common + .activate(&queues, interrupt_cb.clone())?; self.guest_memory = Some(mem.clone()); let backend_req_handler: Option> = None; // Run a dedicated thread for handling potential reconnections with // the backend. - let (kill_evt, pause_evt) = self.common.dup_eventfds(); + let (kill_evt, pause_evt) = self.vu_common.virtio_common.dup_eventfds(); let mut handler = self.vu_common.activate( mem, &queues, interrupt_cb, - self.common.acked_features, + self.vu_common.virtio_common.acked_features, backend_req_handler, kill_evt, pause_evt, )?; - let paused = self.common.paused.clone(); - let paused_sync = self.common.paused_sync.clone(); + let paused = self.vu_common.virtio_common.paused.clone(); + let paused_sync = self.vu_common.virtio_common.paused_sync.clone(); let mut epoll_threads = Vec::new(); spawn_virtio_thread( @@ -320,8 +321,8 @@ impl VirtioDevice for GenericVhostUser { fn reset(&mut self) -> Option> { // We first must resume the virtio thread if it was paused. - if self.common.pause_evt.take().is_some() { - self.common.resume().ok()?; + if self.vu_common.virtio_common.pause_evt.take().is_some() { + self.vu_common.virtio_common.resume().ok()?; } if let Some(vu) = &self.vu_common.vu @@ -331,7 +332,7 @@ impl VirtioDevice for GenericVhostUser { return None; } - if let Some(kill_evt) = self.common.kill_evt.take() { + if let Some(kill_evt) = self.vu_common.virtio_common.kill_evt.take() { // Ignore the result because there is nothing we can do about it. 
let _ = kill_evt.write(1); } @@ -339,7 +340,7 @@ impl VirtioDevice for GenericVhostUser { event!("virtio-device", "reset", "id", &self.id); // Return the interrupt - Some(self.common.interrupt_cb.take().unwrap()) + Some(self.vu_common.virtio_common.interrupt_cb.take().unwrap()) } fn shutdown(&mut self) { @@ -387,11 +388,11 @@ impl VirtioDevice for GenericVhostUser { impl Pausable for GenericVhostUser { fn pause(&mut self) -> result::Result<(), MigratableError> { self.vu_common.pause()?; - self.common.pause() + self.vu_common.virtio_common.pause() } fn resume(&mut self) -> result::Result<(), MigratableError> { - self.common.resume()?; + self.vu_common.virtio_common.resume()?; if let Some(epoll_thread) = &self.epoll_thread { epoll_thread.thread().unpark(); @@ -430,7 +431,6 @@ impl Migratable for GenericVhostUser { } fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> { - self.vu_common - .complete_migration(self.common.kill_evt.take()) + self.vu_common.complete_migration() } } diff --git a/virtio-devices/src/vhost_user/mod.rs b/virtio-devices/src/vhost_user/mod.rs index 86a774737a..a80c5aa237 100644 --- a/virtio-devices/src/vhost_user/mod.rs +++ b/virtio-devices/src/vhost_user/mod.rs @@ -29,7 +29,7 @@ use crate::{ ActivateError, EPOLL_HELPER_EVENT_LAST, EpollHelper, EpollHelperError, EpollHelperHandler, GuestMemoryMmap, GuestRegionMmap, VIRTIO_F_IN_ORDER, VIRTIO_F_NOTIFICATION_DATA, VIRTIO_F_ORDER_PLATFORM, VIRTIO_F_RING_EVENT_IDX, VIRTIO_F_RING_INDIRECT_DESC, - VIRTIO_F_VERSION_1, VirtioInterrupt, + VIRTIO_F_VERSION_1, VirtioCommon, VirtioInterrupt, }; pub mod blk; @@ -336,6 +336,7 @@ impl VhostUserState { #[derive(Default)] pub struct VhostUserCommon { + pub virtio_common: VirtioCommon, pub vu: Option>>, pub acked_protocol_features: u64, pub socket_path: String, @@ -485,12 +486,11 @@ impl VhostUserCommon { pub fn state( &self, - common: &crate::VirtioCommon, config: C, ) -> std::result::Result, MigratableError> { let mut state = 
VhostUserState { - avail_features: common.avail_features, - acked_features: common.acked_features, + avail_features: self.virtio_common.avail_features, + acked_features: self.virtio_common.acked_features, config, acked_protocol_features: self.acked_protocol_features, vu_num_queues: self.vu_num_queues, @@ -586,15 +586,12 @@ impl VhostUserCommon { Ok(()) } - pub fn complete_migration( - &mut self, - kill_evt: Option, - ) -> std::result::Result<(), MigratableError> { + pub fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> { self.migration_started = false; // Make sure the device thread is killed in order to prevent from // reconnections to the socket. - if let Some(kill_evt) = kill_evt { + if let Some(kill_evt) = self.virtio_common.kill_evt.take() { kill_evt.write(1).map_err(|e| { MigratableError::CompleteMigration(anyhow!( "Error killing vhost-user thread: {e:?}" diff --git a/virtio-devices/src/vhost_user/net.rs b/virtio-devices/src/vhost_user/net.rs index d05626901a..a270ed5414 100644 --- a/virtio-devices/src/vhost_user/net.rs +++ b/virtio-devices/src/vhost_user/net.rs @@ -41,7 +41,6 @@ struct BackendReqHandler {} impl VhostUserFrontendReqHandler for BackendReqHandler {} pub struct Net { - common: VirtioCommon, vu_common: VhostUserCommon, id: String, config: VirtioNetConfig, @@ -198,17 +197,17 @@ impl Net { Ok(Net { id, - common: VirtioCommon { - device_type: VirtioDeviceType::Net as u32, - queue_sizes: vec![vu_cfg.queue_size; num_queues], - avail_features, - acked_features, - paused_sync: Some(Arc::new(Barrier::new(2))), - min_queues: DEFAULT_QUEUE_NUMBER as u16, - paused: Arc::new(AtomicBool::new(paused)), - ..Default::default() - }, vu_common: VhostUserCommon { + virtio_common: VirtioCommon { + device_type: VirtioDeviceType::Net as u32, + queue_sizes: vec![vu_cfg.queue_size; num_queues], + avail_features, + acked_features, + paused_sync: Some(Arc::new(Barrier::new(2))), + min_queues: DEFAULT_QUEUE_NUMBER as u16, + paused: 
Arc::new(AtomicBool::new(paused)), + ..Default::default() + }, vu: Some(Arc::new(Mutex::new(vu))), acked_protocol_features, socket_path: vu_cfg.socket, @@ -228,19 +227,19 @@ impl Net { } fn state(&self) -> std::result::Result { - self.vu_common.state(&self.common, self.config) + self.vu_common.state(self.config) } } impl Drop for Net { fn drop(&mut self) { - if let Some(kill_evt) = self.common.kill_evt.take() + if let Some(kill_evt) = self.vu_common.virtio_common.kill_evt.take() && let Err(e) = kill_evt.write(1) { error!("failed to kill vhost-user-net: {e:?}"); } - self.common.wait_for_epoll_threads(); + self.vu_common.virtio_common.wait_for_epoll_threads(); if let Some(thread) = self.epoll_thread.take() && let Err(e) = thread.join() @@ -258,15 +257,15 @@ impl Drop for Net { impl VirtioDevice for Net { fn device_type(&self) -> u32 { - self.common.device_type + self.vu_common.virtio_common.device_type } fn queue_max_sizes(&self) -> &[u16] { - &self.common.queue_sizes + &self.vu_common.virtio_common.queue_sizes } fn features(&self) -> u64 { - let mut features = self.common.avail_features; + let mut features = self.vu_common.virtio_common.avail_features; if self.iommu { features |= 1u64 << VIRTIO_F_ACCESS_PLATFORM; } @@ -274,7 +273,7 @@ impl VirtioDevice for Net { } fn ack_features(&mut self, value: u64) { - self.common.ack_features(value); + self.vu_common.virtio_common.ack_features(value); } fn read_config(&self, offset: u64, data: &mut [u8]) { @@ -288,18 +287,28 @@ impl VirtioDevice for Net { mut queues, .. 
} = context; - self.common.activate(&queues, interrupt_cb.clone())?; + self.vu_common + .virtio_common + .activate(&queues, interrupt_cb.clone())?; self.guest_memory = Some(mem.clone()); let num_queues = queues.len(); - let event_idx = self.common.feature_acked(VIRTIO_RING_F_EVENT_IDX.into()); - if self.common.feature_acked(VIRTIO_NET_F_CTRL_VQ.into()) && !num_queues.is_multiple_of(2) { + let event_idx = self + .vu_common + .virtio_common + .feature_acked(VIRTIO_RING_F_EVENT_IDX.into()); + if self + .vu_common + .virtio_common + .feature_acked(VIRTIO_NET_F_CTRL_VQ.into()) + && !num_queues.is_multiple_of(2) + { let ctrl_queue_index = num_queues - 1; let (_, mut ctrl_queue, ctrl_queue_evt) = queues.remove(ctrl_queue_index); ctrl_queue.set_event_idx(event_idx); - let (kill_evt, pause_evt) = self.common.dup_eventfds(); + let (kill_evt, pause_evt) = self.vu_common.virtio_common.dup_eventfds(); let mut ctrl_handler = NetCtrlEpollHandler { mem: mem.clone(), @@ -313,12 +322,12 @@ impl VirtioDevice for Net { queue_index: ctrl_queue_index as u16, }; - let paused = self.common.paused.clone(); + let paused = self.vu_common.virtio_common.paused.clone(); // Let's update the barrier as we need 1 for the control queue // thread + 1 for the common vhost-user thread + 1 for the main // thread signalling the pause. - self.common.paused_sync = Some(Arc::new(Barrier::new(3))); - let paused_sync = self.common.paused_sync.clone(); + self.vu_common.virtio_common.paused_sync = Some(Arc::new(Barrier::new(3))); + let paused_sync = self.vu_common.virtio_common.paused_sync.clone(); let mut epoll_threads = Vec::new(); spawn_virtio_thread( @@ -336,11 +345,12 @@ impl VirtioDevice for Net { // The backend acknowledged features must not contain VIRTIO_NET_F_MAC // since we don't expect the backend to handle it. 
- let backend_acked_features = self.common.acked_features & !(1 << VIRTIO_NET_F_MAC); + let backend_acked_features = + self.vu_common.virtio_common.acked_features & !(1 << VIRTIO_NET_F_MAC); // Run a dedicated thread for handling potential reconnections with // the backend. - let (kill_evt, pause_evt) = self.common.dup_eventfds(); + let (kill_evt, pause_evt) = self.vu_common.virtio_common.dup_eventfds(); let mut handler = self.vu_common.activate( mem, @@ -352,8 +362,8 @@ impl VirtioDevice for Net { pause_evt, )?; - let paused = self.common.paused.clone(); - let paused_sync = self.common.paused_sync.clone(); + let paused = self.vu_common.virtio_common.paused.clone(); + let paused_sync = self.vu_common.virtio_common.paused_sync.clone(); let mut epoll_threads = Vec::new(); spawn_virtio_thread( @@ -371,8 +381,8 @@ impl VirtioDevice for Net { fn reset(&mut self) -> Option> { // We first must resume the virtio thread if it was paused. - if self.common.pause_evt.take().is_some() { - self.common.resume().ok()?; + if self.vu_common.virtio_common.pause_evt.take().is_some() { + self.vu_common.virtio_common.resume().ok()?; } if let Some(vu) = &self.vu_common.vu @@ -382,7 +392,7 @@ impl VirtioDevice for Net { return None; } - if let Some(kill_evt) = self.common.kill_evt.take() { + if let Some(kill_evt) = self.vu_common.virtio_common.kill_evt.take() { // Ignore the result because there is nothing we can do about it. 
let _ = kill_evt.write(1); } @@ -390,7 +400,7 @@ impl VirtioDevice for Net { event!("virtio-device", "reset", "id", &self.id); // Return the interrupt - Some(self.common.interrupt_cb.take().unwrap()) + Some(self.vu_common.virtio_common.interrupt_cb.take().unwrap()) } fn shutdown(&mut self) { @@ -408,11 +418,11 @@ impl VirtioDevice for Net { impl Pausable for Net { fn pause(&mut self) -> result::Result<(), MigratableError> { self.vu_common.pause()?; - self.common.pause() + self.vu_common.virtio_common.pause() } fn resume(&mut self) -> result::Result<(), MigratableError> { - self.common.resume()?; + self.vu_common.virtio_common.resume()?; if let Some(epoll_thread) = &self.epoll_thread { epoll_thread.thread().unpark(); @@ -455,7 +465,6 @@ impl Migratable for Net { } fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> { - self.vu_common - .complete_migration(self.common.kill_evt.take()) + self.vu_common.complete_migration() } } From 63aeb597ef3c7a24e32d4c7cd2935eacc0e38db3 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Tue, 31 Mar 2026 04:41:51 -0700 Subject: [PATCH 435/742] virtio-devices: Move epoll_thread to VhostUserCommon This is used by all devices so it can be part of the common state. Moving it simplifies the code and simplifies some future improvements around shutdown for migration. 
Signed-off-by: Rob Bradford --- virtio-devices/src/vhost_user/blk.rs | 10 ++++------ virtio-devices/src/vhost_user/fs.rs | 10 ++++------ virtio-devices/src/vhost_user/generic_vhost_user.rs | 10 ++++------ virtio-devices/src/vhost_user/mod.rs | 3 ++- virtio-devices/src/vhost_user/net.rs | 8 +++----- 5 files changed, 17 insertions(+), 24 deletions(-) diff --git a/virtio-devices/src/vhost_user/blk.rs b/virtio-devices/src/vhost_user/blk.rs index f6e9a46232..529eb517d5 100644 --- a/virtio-devices/src/vhost_user/blk.rs +++ b/virtio-devices/src/vhost_user/blk.rs @@ -3,7 +3,7 @@ use std::sync::atomic::AtomicBool; use std::sync::{Arc, Barrier, Mutex}; -use std::{mem, result, thread}; +use std::{mem, result}; use block::VirtioBlockConfig; use event_monitor::event; @@ -43,7 +43,6 @@ pub struct Blk { id: String, config: VirtioBlockConfig, guest_memory: Option>, - epoll_thread: Option>, seccomp_action: SeccompAction, exit_evt: EventFd, iommu: bool, @@ -190,7 +189,6 @@ impl Blk { id, config, guest_memory: None, - epoll_thread: None, seccomp_action, exit_evt, iommu, @@ -210,7 +208,7 @@ impl Drop for Blk { error!("failed to kill vhost-user-blk: {e:?}"); } self.vu_common.virtio_common.wait_for_epoll_threads(); - if let Some(thread) = self.epoll_thread.take() + if let Some(thread) = self.vu_common.epoll_thread.take() && let Err(e) = thread.join() { error!("Error joining thread: {e:?}"); @@ -311,7 +309,7 @@ impl VirtioDevice for Blk { &self.exit_evt, move || handler.run(&paused, paused_sync.as_ref().unwrap()), )?; - self.epoll_thread = Some(epoll_threads.remove(0)); + self.vu_common.epoll_thread = Some(epoll_threads.remove(0)); Ok(()) } @@ -361,7 +359,7 @@ impl Pausable for Blk { fn resume(&mut self) -> result::Result<(), MigratableError> { self.vu_common.virtio_common.resume()?; - if let Some(epoll_thread) = &self.epoll_thread { + if let Some(epoll_thread) = &self.vu_common.epoll_thread { epoll_thread.thread().unpark(); } diff --git a/virtio-devices/src/vhost_user/fs.rs 
b/virtio-devices/src/vhost_user/fs.rs index 3f982ff299..71859365ed 100644 --- a/virtio-devices/src/vhost_user/fs.rs +++ b/virtio-devices/src/vhost_user/fs.rs @@ -1,9 +1,9 @@ // Copyright 2019 Intel Corporation. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 +use std::result; use std::sync::atomic::AtomicBool; use std::sync::{Arc, Barrier, Mutex}; -use std::{result, thread}; use event_monitor::event; use log::{error, info}; @@ -67,7 +67,6 @@ pub struct Fs { cache: Option<(VirtioSharedMemoryList, MmapRegion)>, seccomp_action: SeccompAction, guest_memory: Option>, - epoll_thread: Option>, exit_evt: EventFd, iommu: bool, } @@ -200,7 +199,6 @@ impl Fs { cache, seccomp_action, guest_memory: None, - epoll_thread: None, exit_evt, iommu, }) @@ -218,7 +216,7 @@ impl Drop for Fs { let _ = kill_evt.write(1); } self.vu_common.virtio_common.wait_for_epoll_threads(); - if let Some(thread) = self.epoll_thread.take() + if let Some(thread) = self.vu_common.epoll_thread.take() && let Err(e) = thread.join() { error!("Error joining thread: {e:?}"); @@ -290,7 +288,7 @@ impl VirtioDevice for Fs { &self.exit_evt, move || handler.run(&paused, paused_sync.as_ref().unwrap()), )?; - self.epoll_thread = Some(epoll_threads.remove(0)); + self.vu_common.epoll_thread = Some(epoll_threads.remove(0)); event!("virtio-device", "activated", "id", &self.id); Ok(()) @@ -371,7 +369,7 @@ impl Pausable for Fs { fn resume(&mut self) -> result::Result<(), MigratableError> { self.vu_common.virtio_common.resume()?; - if let Some(epoll_thread) = &self.epoll_thread { + if let Some(epoll_thread) = &self.vu_common.epoll_thread { epoll_thread.thread().unpark(); } diff --git a/virtio-devices/src/vhost_user/generic_vhost_user.rs b/virtio-devices/src/vhost_user/generic_vhost_user.rs index 0554973f82..83f3fe465d 100644 --- a/virtio-devices/src/vhost_user/generic_vhost_user.rs +++ b/virtio-devices/src/vhost_user/generic_vhost_user.rs @@ -2,9 +2,9 @@ // Copyright 2025 Demi Marie Obenour. 
// SPDX-License-Identifier: Apache-2.0 +use std::result; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::{Arc, Barrier, Mutex}; -use std::{result, thread}; use event_monitor::event; use log::{error, info, warn}; @@ -41,7 +41,6 @@ pub struct GenericVhostUser { cache: Option<(VirtioSharedMemoryList, MmapRegion)>, seccomp_action: SeccompAction, guest_memory: Option>, - epoll_thread: Option>, exit_evt: EventFd, iommu: bool, cfg_warning: AtomicBool, @@ -159,7 +158,6 @@ since the backend only supports {backend_num_queues}\n", cache, seccomp_action, guest_memory: None, - epoll_thread: None, exit_evt, iommu, cfg_warning: AtomicBool::new(false), @@ -193,7 +191,7 @@ impl Drop for GenericVhostUser { let _ = kill_evt.write(1); } self.vu_common.virtio_common.wait_for_epoll_threads(); - if let Some(thread) = self.epoll_thread.take() + if let Some(thread) = self.vu_common.epoll_thread.take() && let Err(e) = thread.join() { error!("Error joining thread: {e:?}"); @@ -313,7 +311,7 @@ impl VirtioDevice for GenericVhostUser { &self.exit_evt, move || handler.run(&paused, paused_sync.as_ref().unwrap()), )?; - self.epoll_thread = Some(epoll_threads.remove(0)); + self.vu_common.epoll_thread = Some(epoll_threads.remove(0)); event!("virtio-device", "activated", "id", &self.id); Ok(()) @@ -394,7 +392,7 @@ impl Pausable for GenericVhostUser { fn resume(&mut self) -> result::Result<(), MigratableError> { self.vu_common.virtio_common.resume()?; - if let Some(epoll_thread) = &self.epoll_thread { + if let Some(epoll_thread) = &self.vu_common.epoll_thread { epoll_thread.thread().unpark(); } diff --git a/virtio-devices/src/vhost_user/mod.rs b/virtio-devices/src/vhost_user/mod.rs index a80c5aa237..3ececc663c 100644 --- a/virtio-devices/src/vhost_user/mod.rs +++ b/virtio-devices/src/vhost_user/mod.rs @@ -1,11 +1,11 @@ // Copyright 2019 Intel Corporation. All Rights Reserved. 
// SPDX-License-Identifier: Apache-2.0 -use std::io; use std::ops::Deref; use std::os::unix::io::AsRawFd; use std::sync::atomic::AtomicBool; use std::sync::{Arc, Barrier, Mutex}; +use std::{io, thread}; use anyhow::anyhow; use log::error; @@ -345,6 +345,7 @@ pub struct VhostUserCommon { pub server: bool, pub interrupt_cb: Option>, pub vring_bases: Option>, + pub epoll_thread: Option>, } impl VhostUserCommon { diff --git a/virtio-devices/src/vhost_user/net.rs b/virtio-devices/src/vhost_user/net.rs index a270ed5414..7cd4e42070 100644 --- a/virtio-devices/src/vhost_user/net.rs +++ b/virtio-devices/src/vhost_user/net.rs @@ -46,7 +46,6 @@ pub struct Net { config: VirtioNetConfig, guest_memory: Option>, ctrl_queue_epoll_thread: Option>, - epoll_thread: Option>, seccomp_action: SeccompAction, exit_evt: EventFd, iommu: bool, @@ -219,7 +218,6 @@ impl Net { config, guest_memory: None, ctrl_queue_epoll_thread: None, - epoll_thread: None, seccomp_action, exit_evt, iommu, @@ -241,7 +239,7 @@ impl Drop for Net { self.vu_common.virtio_common.wait_for_epoll_threads(); - if let Some(thread) = self.epoll_thread.take() + if let Some(thread) = self.vu_common.epoll_thread.take() && let Err(e) = thread.join() { error!("Error joining thread: {e:?}"); @@ -374,7 +372,7 @@ impl VirtioDevice for Net { &self.exit_evt, move || handler.run(&paused, paused_sync.as_ref().unwrap()), )?; - self.epoll_thread = Some(epoll_threads.remove(0)); + self.vu_common.epoll_thread = Some(epoll_threads.remove(0)); Ok(()) } @@ -424,7 +422,7 @@ impl Pausable for Net { fn resume(&mut self) -> result::Result<(), MigratableError> { self.vu_common.virtio_common.resume()?; - if let Some(epoll_thread) = &self.epoll_thread { + if let Some(epoll_thread) = &self.vu_common.epoll_thread { epoll_thread.thread().unpark(); } From 68691db37bc6b133e0cf068137223daa75dad71a Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Tue, 31 Mar 2026 05:54:12 -0700 Subject: [PATCH 436/742] virtio-devices: vhost_user: Correctly shutdown epoll 
thread If the epoll thread is paused, which would be expected as a part of live migration/snapshot-restore unpause the thread so that it can receive the kill event. This mirrors the reset() behaviour of virtio devices. It is important here so as to close the connection with the vhost-user-backend to allow same host and --local migration and since after getting the device state the vhost-user backend should no longer be used. As a result of this change we can do --local and same-host migration with virtio-fs. Signed-off-by: Rob Bradford --- virtio-devices/src/vhost_user/mod.rs | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/virtio-devices/src/vhost_user/mod.rs b/virtio-devices/src/vhost_user/mod.rs index 3ececc663c..95e819e18c 100644 --- a/virtio-devices/src/vhost_user/mod.rs +++ b/virtio-devices/src/vhost_user/mod.rs @@ -3,7 +3,7 @@ use std::ops::Deref; use std::os::unix::io::AsRawFd; -use std::sync::atomic::AtomicBool; +use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::{Arc, Barrier, Mutex}; use std::{io, thread}; @@ -425,6 +425,22 @@ impl VhostUserCommon { } pub fn shutdown(&mut self) { + // Signal the epoll thread to exit, unpause it (it may be parked + // if the VM was paused for migration), then wait for it to finish. + // This ensures the thread drops its Arc, fully + // closing the vhost-user socket so the backend can accept a new + // connection from the destination. 
+ if let Some(kill_evt) = self.virtio_common.kill_evt.take() { + let _ = kill_evt.write(1); + } + self.virtio_common.paused.store(false, Ordering::SeqCst); + if let Some(t) = self.epoll_thread.as_ref() { + t.thread().unpark(); + } + if let Some(t) = self.epoll_thread.take() { + let _ = t.join(); + } + // Remove socket path if needed if self.server { let _ = std::fs::remove_file(&self.socket_path); From cc1735c3996d9873bfb8f8cc787200905578a159 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Tue, 31 Mar 2026 06:50:24 -0700 Subject: [PATCH 437/742] virtio-devices: vhost_user: Use the VhostUserHandle enum for LOG_ALL This is equivalent value but removes the need to manually use the constant to shift. Signed-off-by: Rob Bradford --- virtio-devices/src/vhost_user/vu_common_ctrl.rs | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/virtio-devices/src/vhost_user/vu_common_ctrl.rs b/virtio-devices/src/vhost_user/vu_common_ctrl.rs index b2ee2ae5dc..1e356f6485 100644 --- a/virtio-devices/src/vhost_user/vu_common_ctrl.rs +++ b/virtio-devices/src/vhost_user/vu_common_ctrl.rs @@ -12,7 +12,7 @@ use std::thread::sleep; use std::time::{Duration, Instant}; use log::{error, info}; -use vhost::vhost_kern::vhost_binding::{VHOST_F_LOG_ALL, VHOST_VRING_F_LOG}; +use vhost::vhost_kern::vhost_binding::VHOST_VRING_F_LOG; use vhost::vhost_user::message::{ VhostTransferStateDirection, VhostTransferStatePhase, VhostUserHeaderFlag, VhostUserInflight, VhostUserProtocolFeatures, VhostUserVirtioFeatures, @@ -454,11 +454,8 @@ impl VhostUserHandle { } fn update_supported_features(&mut self, acked_features: u64, acked_protocol_features: u64) { - if (acked_features & u64::from(vhost::vhost_kern::vhost_binding::VHOST_F_LOG_ALL) != 0) - && (acked_protocol_features & VhostUserProtocolFeatures::LOG_SHMFD.bits() != 0) - { - self.supports_migration = true; - } + self.supports_migration = acked_features & VhostUserVirtioFeatures::LOG_ALL.bits() != 0 + && acked_protocol_features & 
VhostUserProtocolFeatures::LOG_SHMFD.bits() != 0; self.supports_device_state = acked_protocol_features & VhostUserProtocolFeatures::DEVICE_STATE.bits() != 0; } @@ -643,7 +640,7 @@ impl VhostUserHandle { self.update_log_base(last_ram_addr)?; // Enable VHOST_F_LOG_ALL feature - let features = self.acked_features | (1 << VHOST_F_LOG_ALL); + let features = self.acked_features | VhostUserVirtioFeatures::LOG_ALL.bits(); self.vu .set_features(features) .map_err(Error::VhostUserSetFeatures)?; From 070f3bbea15f8e8ffea31afbfc6e50bbc7672625 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Tue, 31 Mar 2026 06:52:10 -0700 Subject: [PATCH 438/742] virtio-devices: Reject dirty logging if backend does not support it Signed-off-by: Rob Bradford --- virtio-devices/src/vhost_user/vu_common_ctrl.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/virtio-devices/src/vhost_user/vu_common_ctrl.rs b/virtio-devices/src/vhost_user/vu_common_ctrl.rs index 1e356f6485..23e06a3c0a 100644 --- a/virtio-devices/src/vhost_user/vu_common_ctrl.rs +++ b/virtio-devices/src/vhost_user/vu_common_ctrl.rs @@ -670,6 +670,10 @@ impl VhostUserHandle { } pub fn dirty_log(&mut self, last_ram_addr: u64) -> Result { + if !self.supports_migration { + return Err(Error::MigrationNotSupported); + } + // The log region is updated by creating a new region that is sent to // the backend. This ensures the backend stops logging to the previous // region. The previous region is returned and processed to create the From 266ad8aa0ebca38e863171236a8b7fdfe966bc37 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Tue, 31 Mar 2026 06:52:45 -0700 Subject: [PATCH 439/742] virtio-devices: vhost_user: Advertise LOG_ALL feature Advertising support for this virtio feature is required to enable support for migration. (Along with the LOG_SHMFD protocol feature.) 
Signed-off-by: Rob Bradford --- virtio-devices/src/vhost_user/mod.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/virtio-devices/src/vhost_user/mod.rs b/virtio-devices/src/vhost_user/mod.rs index 95e819e18c..01a75f0575 100644 --- a/virtio-devices/src/vhost_user/mod.rs +++ b/virtio-devices/src/vhost_user/mod.rs @@ -167,7 +167,8 @@ pub const DEFAULT_VIRTIO_FEATURES: u64 = (1 << VIRTIO_F_RING_INDIRECT_DESC) | (1 << VIRTIO_F_IN_ORDER) | (1 << VIRTIO_F_ORDER_PLATFORM) | (1 << VIRTIO_F_NOTIFICATION_DATA) - | VhostUserVirtioFeatures::PROTOCOL_FEATURES.bits(); + | VhostUserVirtioFeatures::PROTOCOL_FEATURES.bits() + | VhostUserVirtioFeatures::LOG_ALL.bits(); const HUP_CONNECTION_EVENT: u16 = EPOLL_HELPER_EVENT_LAST + 1; const BACKEND_REQ_EVENT: u16 = EPOLL_HELPER_EVENT_LAST + 2; From fa29dbd0c5ceaf80b851d1614c063ab6ac2b4576 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Tue, 31 Mar 2026 07:28:38 -0700 Subject: [PATCH 440/742] virtio-devices: Reuse common shutdown code in drop implementations Now that the VhostUserCommon::shutdown implementation has been filled out to support migration it can also be used for the drop implementations in the vhost-user devices. It's worth noting that the call to wait_for_epoll_threads() was a no-op as those threads are only configured on conventional virtio devices. 
Signed-off-by: Rob Bradford --- virtio-devices/src/vhost_user/blk.rs | 12 +----------- virtio-devices/src/vhost_user/fs.rs | 11 +---------- .../src/vhost_user/generic_vhost_user.rs | 11 +---------- virtio-devices/src/vhost_user/net.rs | 14 +------------- 4 files changed, 4 insertions(+), 44 deletions(-) diff --git a/virtio-devices/src/vhost_user/blk.rs b/virtio-devices/src/vhost_user/blk.rs index 529eb517d5..8221b7b501 100644 --- a/virtio-devices/src/vhost_user/blk.rs +++ b/virtio-devices/src/vhost_user/blk.rs @@ -202,17 +202,7 @@ impl Blk { impl Drop for Blk { fn drop(&mut self) { - if let Some(kill_evt) = self.vu_common.virtio_common.kill_evt.take() - && let Err(e) = kill_evt.write(1) - { - error!("failed to kill vhost-user-blk: {e:?}"); - } - self.vu_common.virtio_common.wait_for_epoll_threads(); - if let Some(thread) = self.vu_common.epoll_thread.take() - && let Err(e) = thread.join() - { - error!("Error joining thread: {e:?}"); - } + self.vu_common.shutdown(); } } diff --git a/virtio-devices/src/vhost_user/fs.rs b/virtio-devices/src/vhost_user/fs.rs index 71859365ed..967fdecf99 100644 --- a/virtio-devices/src/vhost_user/fs.rs +++ b/virtio-devices/src/vhost_user/fs.rs @@ -211,16 +211,7 @@ impl Fs { impl Drop for Fs { fn drop(&mut self) { - if let Some(kill_evt) = self.vu_common.virtio_common.kill_evt.take() { - // Ignore the result because there is nothing we can do about it. - let _ = kill_evt.write(1); - } - self.vu_common.virtio_common.wait_for_epoll_threads(); - if let Some(thread) = self.vu_common.epoll_thread.take() - && let Err(e) = thread.join() - { - error!("Error joining thread: {e:?}"); - } + self.vu_common.shutdown(); } } diff --git a/virtio-devices/src/vhost_user/generic_vhost_user.rs b/virtio-devices/src/vhost_user/generic_vhost_user.rs index 83f3fe465d..1778834431 100644 --- a/virtio-devices/src/vhost_user/generic_vhost_user.rs +++ b/virtio-devices/src/vhost_user/generic_vhost_user.rs @@ -186,16 +186,7 @@ space access. 
Reads will return 0xFF and writes will be ignored." impl Drop for GenericVhostUser { fn drop(&mut self) { - if let Some(kill_evt) = self.vu_common.virtio_common.kill_evt.take() { - // Ignore the result because there is nothing we can do about it. - let _ = kill_evt.write(1); - } - self.vu_common.virtio_common.wait_for_epoll_threads(); - if let Some(thread) = self.vu_common.epoll_thread.take() - && let Err(e) = thread.join() - { - error!("Error joining thread: {e:?}"); - } + self.vu_common.shutdown(); } } diff --git a/virtio-devices/src/vhost_user/net.rs b/virtio-devices/src/vhost_user/net.rs index 7cd4e42070..1c85b5f38e 100644 --- a/virtio-devices/src/vhost_user/net.rs +++ b/virtio-devices/src/vhost_user/net.rs @@ -231,19 +231,7 @@ impl Net { impl Drop for Net { fn drop(&mut self) { - if let Some(kill_evt) = self.vu_common.virtio_common.kill_evt.take() - && let Err(e) = kill_evt.write(1) - { - error!("failed to kill vhost-user-net: {e:?}"); - } - - self.vu_common.virtio_common.wait_for_epoll_threads(); - - if let Some(thread) = self.vu_common.epoll_thread.take() - && let Err(e) = thread.join() - { - error!("Error joining thread: {e:?}"); - } + self.vu_common.shutdown(); if let Some(thread) = self.ctrl_queue_epoll_thread.take() && let Err(e) = thread.join() From a99c40dbbb5565a432776344b221332fea6af9df Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Mon, 30 Mar 2026 02:21:52 -0700 Subject: [PATCH 441/742] tests: Add live migration test for virtio-fs Signed-off-by: Rob Bradford --- cloud-hypervisor/tests/common/utils.rs | 14 +- cloud-hypervisor/tests/integration.rs | 185 ++++++++++++++++++++++++- 2 files changed, 197 insertions(+), 2 deletions(-) diff --git a/cloud-hypervisor/tests/common/utils.rs b/cloud-hypervisor/tests/common/utils.rs index ac48641826..aff1e94ef5 100644 --- a/cloud-hypervisor/tests/common/utils.rs +++ b/cloud-hypervisor/tests/common/utils.rs @@ -9,6 +9,7 @@ use std::process::{Child, Command}; use std::string::String; use std::sync::mpsc; use 
std::sync::mpsc::Receiver; +use std::time::{Duration, Instant}; use std::{cmp, fs, io, thread}; use test_infra::*; @@ -93,6 +94,17 @@ pub(crate) fn temp_api_path(tmp_dir: &TempDir) -> String { ) } +pub(crate) fn wait_for_virtiofsd_socket(socket: &str) { + // Wait for virtiofds to start + let deadline = Instant::now() + Duration::from_secs(10); + while !Path::new(socket).exists() { + if Instant::now() > deadline { + panic!("virtiofsd socket did not appear within 10s"); + } + thread::sleep(Duration::from_millis(50)); + } +} + pub(crate) fn prepare_virtiofsd( tmp_dir: &TempDir, shared_dir: &str, @@ -116,7 +128,7 @@ pub(crate) fn prepare_virtiofsd( .spawn() .unwrap(); - thread::sleep(std::time::Duration::new(10, 0)); + wait_for_virtiofsd_socket(virtiofsd_socket_path.as_str()); (child, virtiofsd_socket_path) } diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 5ae554981c..a763ccba4c 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -10062,6 +10062,177 @@ mod live_migration { handle_child_output(r, &src_output); } + fn _test_live_migration_virtio_fs(local: bool) { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)); + let kernel_path = direct_kernel_boot_path(); + + let mut workload_path = dirs::home_dir().unwrap(); + workload_path.push("workloads"); + let mut shared_dir = workload_path; + shared_dir.push("shared_dir"); + + let (daemon_child, virtiofsd_socket_path) = + prepare_virtiofsd(&guest.tmp_dir, shared_dir.to_str().unwrap()); + + let src_api_socket = temp_api_path(&guest.tmp_dir); + + // Start the source VM + let mut src_child = GuestCommand::new(&guest) + .args(["--api-socket", &src_api_socket]) + .args(["--cpus", "boot=2"]) + .args(["--memory", "size=512M,shared=on"]) + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .default_disks() + 
.default_net() + .args([ + "--fs", + format!("socket={virtiofsd_socket_path},tag=myfs,num_queues=1,queue_size=1024") + .as_str(), + ]) + .capture_output() + .spawn() + .unwrap(); + + // Start the destination VM + let mut dest_api_socket = temp_api_path(&guest.tmp_dir); + dest_api_socket.push_str(".dest"); + let mut dest_child = GuestCommand::new(&guest) + .args(["--api-socket", &dest_api_socket]) + .capture_output() + .spawn() + .unwrap(); + + // Spawn a thread that waits for the old virtiofsd to exit then + // starts a replacement. During migration the source saves + // DEVICE_STATE then disconnects, causing virtiofsd to exit. + // The destination needs a fresh virtiofsd to load DEVICE_STATE. + // We remove the socket file first so the destination cannot + // accidentally connect to the old instance. + let virtiofsd_socket_clone = virtiofsd_socket_path.clone(); + let shared_dir_str = shared_dir.to_str().unwrap().to_string(); + let (restart_tx, restart_rx) = std::sync::mpsc::channel(); + let _monitor = thread::spawn(move || { + let mut child = daemon_child; + let _ = child.wait(); + let mut path = dirs::home_dir().unwrap(); + path.push("workloads"); + path.push("virtiofsd"); + let new_child = Command::new(path) + .args(["--shared-dir", &shared_dir_str]) + .args(["--socket-path", &virtiofsd_socket_clone]) + .args(["--cache", "never"]) + .args(["--tag", "myfs"]) + .spawn() + .unwrap(); + wait_for_virtiofsd_socket(&virtiofsd_socket_clone); + let _ = restart_tx.send(new_child); + }); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + // Mount virtiofs and verify it works + guest + .ssh_command("mkdir -p mount_dir && sudo mount -t virtiofs myfs mount_dir/") + .unwrap(); + + // Write a test file through virtiofs before migration + guest + .ssh_command( + "sudo bash -c 'echo pre_migration_data > mount_dir/migration_test_file'", + ) + .unwrap(); + + // Verify the file is accessible + assert_eq!( + guest + .ssh_command("cat 
mount_dir/migration_test_file") + .unwrap() + .trim(), + "pre_migration_data" + ); + + let migration_socket = String::from( + guest + .tmp_dir + .as_path() + .join("live-migration.sock") + .to_str() + .unwrap(), + ); + + // Remove the socket so the destination cannot connect to + // the old virtiofsd (which is still running). The source's + // existing connection uses an already-accepted fd. + let _ = std::fs::remove_file(&virtiofsd_socket_path); + + assert!( + start_live_migration(&migration_socket, &src_api_socket, &dest_api_socket, local), + "Unsuccessful command: 'send-migration' or 'receive-migration'." + ); + }); + + // Check and report any errors occurred during the live-migration + if r.is_err() { + print_and_panic( + src_child, + dest_child, + None, + "Error occurred during live-migration with virtio-fs", + ); + } + + // Check the source vm has been terminated successfully (give it '3s' to settle) + thread::sleep(Duration::from_secs(3)); + if !src_child.try_wait().unwrap().is_some_and(|s| s.success()) { + print_and_panic( + src_child, + dest_child, + None, + "source VM was not terminated successfully.", + ); + } + + // Post live-migration checks + let r = std::panic::catch_unwind(|| { + // Verify virtiofs still works after migration + // Read the file written before migration + assert_eq!( + guest + .ssh_command("cat mount_dir/migration_test_file") + .unwrap() + .trim(), + "pre_migration_data" + ); + + // Write a new file after migration + guest + .ssh_command( + "sudo bash -c 'echo post_migration_data > mount_dir/post_migration_file'", + ) + .unwrap(); + + // Verify the new file exists on the host + let post_content = + std::fs::read_to_string(shared_dir.join("post_migration_file")).unwrap(); + assert_eq!(post_content.trim(), "post_migration_data"); + }); + + // Clean up + let _ = dest_child.kill(); + let dest_output = dest_child.wait_with_output().unwrap(); + if let Ok(mut new_daemon) = restart_rx.try_recv() { + let _ = new_daemon.kill(); + let _ = 
new_daemon.wait(); + } + let _ = std::fs::remove_file(shared_dir.join("migration_test_file")); + let _ = std::fs::remove_file(shared_dir.join("post_migration_file")); + + handle_child_output(r, &dest_output); + } + mod live_migration_parallel { use vmm::api::TimeoutStrategy; @@ -10134,7 +10305,19 @@ mod live_migration { mod live_migration_sequential { use super::*; - // NUMA & balloon live migration tests are large so run sequentially + // NUMA, balloon, and virtio-fs live migration tests run sequentially + + #[test] + #[cfg(not(feature = "mshv"))] + fn test_live_migration_virtio_fs() { + _test_live_migration_virtio_fs(false); + } + + #[test] + #[cfg(not(feature = "mshv"))] + fn test_live_migration_virtio_fs_local() { + _test_live_migration_virtio_fs(true); + } #[test] fn test_live_migration_balloon() { From a8904a93a64a08eacabfae420268605cb464a452 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Wed, 1 Apr 2026 04:53:45 -0700 Subject: [PATCH 442/742] build: Only run ARM64 CI on merge queue Unfortunately with a single ARM64 machine this has now become a bottleneck for landing PRs. Copy the methodology we use for existing jobs that we only run on the MQ by creating dummy jobs that run on the GH hosted runner (ubuntu-latest) allowing the PR to transition into the MQ by passing the required checks. 
Signed-off-by: Rob Bradford --- .github/workflows/integration-arm64.yaml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/.github/workflows/integration-arm64.yaml b/.github/workflows/integration-arm64.yaml index 0678dbc839..d0d6966482 100644 --- a/.github/workflows/integration-arm64.yaml +++ b/.github/workflows/integration-arm64.yaml @@ -8,19 +8,24 @@ jobs: build: timeout-minutes: 120 name: Tests (ARM64) - runs-on: bookworm-arm64 + runs-on: ${{ github.event_name == 'pull_request' && 'ubuntu-latest' || 'bookworm-arm64' }} steps: - name: Fix workspace permissions + if: ${{ github.event_name != 'pull_request' }} run: sudo chown -R runner:runner ${GITHUB_WORKSPACE} - name: Code checkout + if: ${{ github.event_name != 'pull_request' }} uses: actions/checkout@v6 with: fetch-depth: 0 - name: Run unit tests (musl) + if: ${{ github.event_name != 'pull_request' }} run: scripts/dev_cli.sh tests --unit --libc musl - name: Load openvswitch module + if: ${{ github.event_name != 'pull_request' }} run: sudo modprobe openvswitch - name: Run integration tests (musl) + if: ${{ github.event_name != 'pull_request' }} timeout-minutes: 60 run: scripts/dev_cli.sh tests --integration --libc musl - name: Install Azure CLI @@ -52,3 +57,6 @@ jobs: if: ${{ github.event_name != 'pull_request' }} timeout-minutes: 30 run: scripts/dev_cli.sh tests --integration-windows --libc musl + - name: Skipping build for PR + if: ${{ github.event_name == 'pull_request' }} + run: echo "Skipping build for PR" From 6de3d5279fc4b5cb176a4b21b43925b1f445abbc Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Mon, 30 Mar 2026 11:42:45 +0200 Subject: [PATCH 443/742] tests: Constify rate limiter runtimes Introduce named constants for the net and single block rate limiter test runtimes and use them directly at the call sites. 
Signed-off-by: Anatol Belski --- cloud-hypervisor/tests/integration.rs | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index a763ccba4c..4360f71276 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -10475,6 +10475,9 @@ mod aarch64_acpi { mod rate_limiter { use super::*; + const NET_RATE_LIMITER_RUNTIME: u32 = 10; + const BLOCK_RATE_LIMITER_RUNTIME: u32 = 10; + // Check if the 'measured' rate is within the expected 'difference' (in percentage) // compared to given 'limit' rate. fn check_rate_limit(measured: f64, limit: f64, difference: f64) -> bool { @@ -10498,7 +10501,6 @@ mod rate_limiter { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); let guest = Guest::new(Box::new(disk_config)); - let test_timeout = 10; let num_queues = 2; let queue_size = 256; let bw_size = 10485760_u64; // bytes @@ -10528,9 +10530,14 @@ mod rate_limiter { let r = std::panic::catch_unwind(|| { guest.wait_vm_boot().unwrap(); - let measured_bps = - measure_virtio_net_throughput(test_timeout, num_queues / 2, &guest, rx, true) - .unwrap(); + let measured_bps = measure_virtio_net_throughput( + NET_RATE_LIMITER_RUNTIME, + num_queues / 2, + &guest, + rx, + true, + ) + .unwrap(); assert!(check_rate_limit(measured_bps, limit_bps, 0.1)); }); @@ -10550,7 +10557,6 @@ mod rate_limiter { } fn _test_rate_limiter_block(bandwidth: bool, num_queues: u32) { - let test_timeout = 10; let fio_ops = FioOps::RandRW; let bw_size = if bandwidth { @@ -10618,7 +10624,7 @@ mod rate_limiter { let fio_command = format!( "sudo fio --filename=/dev/vdc --name=test --output-format=json \ --direct=1 --bs=4k --ioengine=io_uring --iodepth=64 \ - --rw={fio_ops} --runtime={test_timeout} --numjobs={num_queues}" + --rw={fio_ops} --runtime={BLOCK_RATE_LIMITER_RUNTIME} --numjobs={num_queues}" ); let output = guest.ssh_command(&fio_command).unwrap(); 
From f1160381110ce6a362b0dd9a946f44976812b1f0 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Mon, 30 Mar 2026 11:43:32 +0200 Subject: [PATCH 444/742] tests: Increase rate limiter refill time The rate limiter token bucket has a fixed 100 ms cool down time that pauses I/O whenever the bucket empties. With a 100 ms refill time, the actual throughput drops to roughly half of the target rate and causes the tests to miss their target. Increase the net and single block refill time from 100 ms to 1000 ms and scale the bucket sizes by 10x to preserve the target rate. Set the one time burst to 0 to avoid overshooting the upper bound, and raise the runtime constants from 10 s to 20 s for steadier measurements. Signed-off-by: Anatol Belski --- cloud-hypervisor/tests/integration.rs | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 4360f71276..f7f4e55544 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -10475,8 +10475,8 @@ mod aarch64_acpi { mod rate_limiter { use super::*; - const NET_RATE_LIMITER_RUNTIME: u32 = 10; - const BLOCK_RATE_LIMITER_RUNTIME: u32 = 10; + const NET_RATE_LIMITER_RUNTIME: u32 = 20; + const BLOCK_RATE_LIMITER_RUNTIME: u32 = 20; // Check if the 'measured' rate is within the expected 'difference' (in percentage) // compared to given 'limit' rate. 
@@ -10503,12 +10503,12 @@ mod rate_limiter { let num_queues = 2; let queue_size = 256; - let bw_size = 10485760_u64; // bytes - let bw_refill_time = 100; // ms + let bw_size = 104857600_u64; // bytes + let bw_refill_time = 1000; // ms let limit_bps = (bw_size * 8 * 1000) as f64 / bw_refill_time as f64; let net_params = format!( - "tap=,mac={},ip={},mask=255.255.255.128,num_queues={},queue_size={},bw_size={},bw_refill_time={}", + "tap=,mac={},ip={},mask=255.255.255.128,num_queues={},queue_size={},bw_size={},bw_one_time_burst=0,bw_refill_time={}", guest.network.guest_mac0, guest.network.host_ip0, num_queues, @@ -10560,11 +10560,11 @@ mod rate_limiter { let fio_ops = FioOps::RandRW; let bw_size = if bandwidth { - 10485760_u64 // bytes + 104857600_u64 // bytes } else { - 100_u64 // I/O + 1000_u64 // I/O }; - let bw_refill_time = 100; // ms + let bw_refill_time = 1000; // ms let limit_rate = (bw_size * 1000) as f64 / bw_refill_time as f64; let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); @@ -10585,11 +10585,11 @@ mod rate_limiter { let test_blk_params = if bandwidth { format!( - "path={blk_rate_limiter_test_img},num_queues={num_queues},bw_size={bw_size},bw_refill_time={bw_refill_time},image_type=raw" + "path={blk_rate_limiter_test_img},num_queues={num_queues},bw_size={bw_size},bw_one_time_burst=0,bw_refill_time={bw_refill_time},image_type=raw" ) } else { format!( - "path={blk_rate_limiter_test_img},num_queues={num_queues},ops_size={bw_size},ops_refill_time={bw_refill_time},image_type=raw" + "path={blk_rate_limiter_test_img},num_queues={num_queues},ops_size={bw_size},ops_one_time_burst=0,ops_refill_time={bw_refill_time},image_type=raw" ) }; From 36d6dacee2a42626430b74dc292a086a270e5527 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Mon, 30 Mar 2026 11:43:58 +0200 Subject: [PATCH 445/742] tests: Increase group block rate limiter refill window Increase the group block refill time from 100 ms to 1000 ms and scale the shared bucket sizes to preserve 
the target rate. Set the one time burst to 0 to avoid transient overshoot and use the shared block runtime constant directly in the group path. Signed-off-by: Anatol Belski --- cloud-hypervisor/tests/integration.rs | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index f7f4e55544..07acd2d856 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -10643,15 +10643,14 @@ mod rate_limiter { } fn _test_rate_limiter_group_block(bandwidth: bool, num_queues: u32, num_disks: u32) { - let test_timeout = 10; let fio_ops = FioOps::RandRW; let bw_size = if bandwidth { - 10485760_u64 // bytes + 104857600_u64 // bytes } else { - 100_u64 // I/O + 1000_u64 // I/O }; - let bw_refill_time = 100; // ms + let bw_refill_time = 1000; // ms let limit_rate = (bw_size * 1000) as f64 / bw_refill_time as f64; let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); @@ -10660,9 +10659,13 @@ mod rate_limiter { let test_img_dir = TempDir::new_with_prefix("/var/tmp/ch").unwrap(); let rate_limit_group_arg = if bandwidth { - format!("id=group0,bw_size={bw_size},bw_refill_time={bw_refill_time}") + format!( + "id=group0,bw_size={bw_size},bw_one_time_burst=0,bw_refill_time={bw_refill_time}" + ) } else { - format!("id=group0,ops_size={bw_size},ops_refill_time={bw_refill_time}") + format!( + "id=group0,ops_size={bw_size},ops_one_time_burst=0,ops_refill_time={bw_refill_time}" + ) }; let mut disk_args = vec![ @@ -10718,7 +10721,7 @@ mod rate_limiter { let mut fio_command = format!( "sudo fio --name=global --output-format=json \ --direct=1 --bs=4k --ioengine=io_uring --iodepth=64 \ - --rw={fio_ops} --runtime={test_timeout} --numjobs={num_queues}" + --rw={fio_ops} --runtime={BLOCK_RATE_LIMITER_RUNTIME} --numjobs={num_queues}" ); // Generate additional argument for each disk: From 9db5f0439ec3fc9b8e040f8223539ad03c1be215 Mon Sep 17 00:00:00 
2001 From: Anatol Belski Date: Mon, 30 Mar 2026 11:46:49 +0200 Subject: [PATCH 446/742] tests: Stabilize block rate limiter workloads Add a ramp time before measuring the block rate limiter tests so both the single device and group workloads are measured after warm up to make the measurements less sensitive to startup transients. Signed-off-by: Anatol Belski --- cloud-hypervisor/tests/integration.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 07acd2d856..bd4168d771 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -10477,6 +10477,7 @@ mod rate_limiter { const NET_RATE_LIMITER_RUNTIME: u32 = 20; const BLOCK_RATE_LIMITER_RUNTIME: u32 = 20; + const BLOCK_RATE_LIMITER_RAMP_TIME: u32 = 5; // Check if the 'measured' rate is within the expected 'difference' (in percentage) // compared to given 'limit' rate. @@ -10624,7 +10625,8 @@ mod rate_limiter { let fio_command = format!( "sudo fio --filename=/dev/vdc --name=test --output-format=json \ --direct=1 --bs=4k --ioengine=io_uring --iodepth=64 \ - --rw={fio_ops} --runtime={BLOCK_RATE_LIMITER_RUNTIME} --numjobs={num_queues}" + --rw={fio_ops} --runtime={BLOCK_RATE_LIMITER_RUNTIME} \ + --ramp_time={BLOCK_RATE_LIMITER_RAMP_TIME} --numjobs={num_queues}", ); let output = guest.ssh_command(&fio_command).unwrap(); @@ -10721,7 +10723,8 @@ mod rate_limiter { let mut fio_command = format!( "sudo fio --name=global --output-format=json \ --direct=1 --bs=4k --ioengine=io_uring --iodepth=64 \ - --rw={fio_ops} --runtime={BLOCK_RATE_LIMITER_RUNTIME} --numjobs={num_queues}" + --rw={fio_ops} --runtime={BLOCK_RATE_LIMITER_RUNTIME} \ + --ramp_time={BLOCK_RATE_LIMITER_RAMP_TIME} --numjobs={num_queues}", ); // Generate additional argument for each disk: From 98dbe6d1283be476ccf499c163a00e3fa331f99e Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Fri, 27 Mar 2026 23:47:43 -0700 Subject: 
[PATCH 447/742] block: Derive Debug on RawFileDiskAio Add #[derive(Debug)] to RawFileDiskAio. This is required by the new disk_file traits which have Send + Debug bounds. Signed-off-by: Muminul Islam --- block/src/raw_async_aio.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/block/src/raw_async_aio.rs b/block/src/raw_async_aio.rs index 66f7a667bf..790525a008 100644 --- a/block/src/raw_async_aio.rs +++ b/block/src/raw_async_aio.rs @@ -19,6 +19,7 @@ use crate::async_io::{ }; use crate::{DiskTopology, SECTOR_SIZE, probe_sparse_support, query_device_size}; +#[derive(Debug)] pub struct RawFileDiskAio { file: File, } From fca54cb14294f8961dfe6f9582c1e6fd3d3603bc Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Fri, 27 Mar 2026 23:48:02 -0700 Subject: [PATCH 448/742] block: Implement DiskSize trait for RawFileDiskAio Add disk_file::DiskSize trait implementation for RawFileDiskAio using BlockError and BlockResult. Takes &self instead of &mut self. Add BlockError, BlockErrorKind, BlockResult, and disk_file imports needed by this and subsequent trait impls. 
Signed-off-by: Muminul Islam --- block/src/raw_async_aio.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/block/src/raw_async_aio.rs b/block/src/raw_async_aio.rs index 790525a008..c2da2ccd19 100644 --- a/block/src/raw_async_aio.rs +++ b/block/src/raw_async_aio.rs @@ -17,7 +17,8 @@ use vmm_sys_util::eventfd::EventFd; use crate::async_io::{ AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFile, DiskFileError, DiskFileResult, }; -use crate::{DiskTopology, SECTOR_SIZE, probe_sparse_support, query_device_size}; +use crate::error::{BlockError, BlockErrorKind, BlockResult}; +use crate::{DiskTopology, SECTOR_SIZE, disk_file, probe_sparse_support, query_device_size}; #[derive(Debug)] pub struct RawFileDiskAio { @@ -69,6 +70,14 @@ impl DiskFile for RawFileDiskAio { } } +impl disk_file::DiskSize for RawFileDiskAio { + fn logical_size(&self) -> BlockResult { + query_device_size(&self.file) + .map(|(logical_size, _)| logical_size) + .map_err(|e| BlockError::new(BlockErrorKind::Io, DiskFileError::Size(e))) + } +} + pub struct RawFileAsyncAio { fd: RawFd, ctx: aio::IoContext, From 78bbdbef86cf181a8322a997ab7acaebf1a9b0f2 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Fri, 27 Mar 2026 23:48:18 -0700 Subject: [PATCH 449/742] block: Implement PhysicalSize trait for RawFileDiskAio Add disk_file::PhysicalSize trait implementation for RawFileDiskAio. Returns the physical size from query_device_size wrapped in BlockError on failure, consistent with the DiskSize impl. 
Signed-off-by: Muminul Islam --- block/src/raw_async_aio.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/block/src/raw_async_aio.rs b/block/src/raw_async_aio.rs index c2da2ccd19..c627318cde 100644 --- a/block/src/raw_async_aio.rs +++ b/block/src/raw_async_aio.rs @@ -78,6 +78,14 @@ impl disk_file::DiskSize for RawFileDiskAio { } } +impl disk_file::PhysicalSize for RawFileDiskAio { + fn physical_size(&self) -> BlockResult { + query_device_size(&self.file) + .map(|(_, physical_size)| physical_size) + .map_err(|e| BlockError::new(BlockErrorKind::Io, DiskFileError::Size(e))) + } +} + pub struct RawFileAsyncAio { fd: RawFd, ctx: aio::IoContext, From df890599c20a9b29f4b266c9bde635531f9650da Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Fri, 27 Mar 2026 23:48:31 -0700 Subject: [PATCH 450/742] block: Implement DiskFd trait for RawFileDiskAio Add disk_file::DiskFd trait implementation for RawFileDiskAio. Delegates to file.as_raw_fd() via BorrowedDiskFd, taking &self instead of &mut self. Signed-off-by: Muminul Islam --- block/src/raw_async_aio.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/block/src/raw_async_aio.rs b/block/src/raw_async_aio.rs index c627318cde..86b9271cfb 100644 --- a/block/src/raw_async_aio.rs +++ b/block/src/raw_async_aio.rs @@ -86,6 +86,12 @@ impl disk_file::PhysicalSize for RawFileDiskAio { } } +impl disk_file::DiskFd for RawFileDiskAio { + fn fd(&self) -> BorrowedDiskFd<'_> { + BorrowedDiskFd::new(self.file.as_raw_fd()) + } +} + pub struct RawFileAsyncAio { fd: RawFd, ctx: aio::IoContext, From 190380c9bad3563f6d5bfb3885aafe6ea09b45fd Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Fri, 27 Mar 2026 23:49:01 -0700 Subject: [PATCH 451/742] block: Implement Geometry trait for RawFileDiskAio Add disk_file::Geometry trait implementation for RawFileDiskAio. Probes disk topology from the file, falling back to defaults on failure. Takes &self instead of &mut self and uses unwrap_or_else for cleaner error handling. 
Signed-off-by: Muminul Islam --- block/src/raw_async_aio.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/block/src/raw_async_aio.rs b/block/src/raw_async_aio.rs index 86b9271cfb..6c1fe0aad6 100644 --- a/block/src/raw_async_aio.rs +++ b/block/src/raw_async_aio.rs @@ -92,6 +92,15 @@ impl disk_file::DiskFd for RawFileDiskAio { } } +impl disk_file::Geometry for RawFileDiskAio { + fn topology(&self) -> DiskTopology { + DiskTopology::probe(&self.file).unwrap_or_else(|_| { + warn!("Unable to get device topology. Using default topology"); + DiskTopology::default() + }) + } +} + pub struct RawFileAsyncAio { fd: RawFd, ctx: aio::IoContext, From b7cf8737acbaeed0cb80129d468a4a0e773d2785 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Fri, 27 Mar 2026 23:49:15 -0700 Subject: [PATCH 452/742] block: Implement SparseCapable trait for RawFileDiskAio Add disk_file::SparseCapable trait implementation for RawFileDiskAio. Delegates to probe_sparse_support() to detect whether the underlying file supports hole-punching. Signed-off-by: Muminul Islam --- block/src/raw_async_aio.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/block/src/raw_async_aio.rs b/block/src/raw_async_aio.rs index 6c1fe0aad6..b12469312c 100644 --- a/block/src/raw_async_aio.rs +++ b/block/src/raw_async_aio.rs @@ -101,6 +101,12 @@ impl disk_file::Geometry for RawFileDiskAio { } } +impl disk_file::SparseCapable for RawFileDiskAio { + fn supports_sparse_operations(&self) -> bool { + probe_sparse_support(&self.file) + } +} + pub struct RawFileAsyncAio { fd: RawFd, ctx: aio::IoContext, From 62264cb3c730a8a53f0c18e0deb761e85dbaa5d7 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Fri, 27 Mar 2026 23:49:28 -0700 Subject: [PATCH 453/742] block: Implement Resizable trait for RawFileDiskAio Add disk_file::Resizable trait implementation for RawFileDiskAio. Calls file.set_len(size) and wraps the I/O error in BlockError on failure. 
Signed-off-by: Muminul Islam --- block/src/raw_async_aio.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/block/src/raw_async_aio.rs b/block/src/raw_async_aio.rs index b12469312c..7e7ff26c69 100644 --- a/block/src/raw_async_aio.rs +++ b/block/src/raw_async_aio.rs @@ -107,6 +107,14 @@ impl disk_file::SparseCapable for RawFileDiskAio { } } +impl disk_file::Resizable for RawFileDiskAio { + fn resize(&mut self, size: u64) -> BlockResult<()> { + self.file + .set_len(size) + .map_err(|e| BlockError::new(BlockErrorKind::Io, DiskFileError::ResizeError(e))) + } +} + pub struct RawFileAsyncAio { fd: RawFd, ctx: aio::IoContext, From 39bbbaaa598506fd6827ac7356489a70a11fe67a Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Fri, 27 Mar 2026 23:49:42 -0700 Subject: [PATCH 454/742] block: Implement DiskFile marker trait for RawFileDiskAio Add empty disk_file::DiskFile impl for RawFileDiskAio. This marker supertrait requires DiskSize + Geometry + Sync, all of which are now satisfied. Signed-off-by: Muminul Islam --- block/src/raw_async_aio.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/src/raw_async_aio.rs b/block/src/raw_async_aio.rs index 7e7ff26c69..9a990c1050 100644 --- a/block/src/raw_async_aio.rs +++ b/block/src/raw_async_aio.rs @@ -115,6 +115,8 @@ impl disk_file::Resizable for RawFileDiskAio { } } +impl disk_file::DiskFile for RawFileDiskAio {} + pub struct RawFileAsyncAio { fd: RawFd, ctx: aio::IoContext, From 6df7cda4ad342d65fcb66a25e3e469f7be6bb20d Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Fri, 27 Mar 2026 23:49:58 -0700 Subject: [PATCH 455/742] block: Implement AsyncDiskFile trait for RawFileDiskAio Add disk_file::AsyncDiskFile trait implementation for RawFileDiskAio with try_clone() and new_async_io() methods. try_clone() duplicates the underlying file descriptor and wraps it in a new RawFileDiskAio. new_async_io() creates a RawFileAsyncAio (Linux AIO) backend, wrapping errors in BlockError instead of DiskFileError. 
Signed-off-by: Muminul Islam --- block/src/raw_async_aio.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/block/src/raw_async_aio.rs b/block/src/raw_async_aio.rs index 9a990c1050..74e1611525 100644 --- a/block/src/raw_async_aio.rs +++ b/block/src/raw_async_aio.rs @@ -117,6 +117,24 @@ impl disk_file::Resizable for RawFileDiskAio { impl disk_file::DiskFile for RawFileDiskAio {} +impl disk_file::AsyncDiskFile for RawFileDiskAio { + fn try_clone(&self) -> BlockResult> { + let file = self + .file + .try_clone() + .map_err(|e| BlockError::new(BlockErrorKind::Io, DiskFileError::Clone(e)))?; + Ok(Box::new(RawFileDiskAio { file })) + } + + fn new_async_io(&self, ring_depth: u32) -> BlockResult> { + let mut raw = RawFileAsyncAio::new(self.file.as_raw_fd(), ring_depth) + .map_err(|e| BlockError::new(BlockErrorKind::Io, DiskFileError::NewAsyncIo(e)))?; + raw.alignment = + DiskTopology::probe(&self.file).map_or(SECTOR_SIZE, |t| t.logical_block_size); + Ok(Box::new(raw) as Box) + } +} + pub struct RawFileAsyncAio { fd: RawFd, ctx: aio::IoContext, From 783cc8bbd9142c17eef6d10a6d308e01e9a46cc1 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Fri, 27 Mar 2026 23:51:43 -0700 Subject: [PATCH 456/742] block: Remove legacy DiskFile impl from RawFileDiskAio Remove the old async_io::DiskFile trait implementation from RawFileDiskAio, now that the new disk_file trait hierarchy is fully implemented. Clean up unused imports: DiskFile and DiskFileResult from crate::async_io. 
Signed-off-by: Muminul Islam --- block/src/raw_async_aio.rs | 43 +------------------------------------- vmm/src/device_manager.rs | 3 +-- 2 files changed, 2 insertions(+), 44 deletions(-) diff --git a/block/src/raw_async_aio.rs b/block/src/raw_async_aio.rs index 74e1611525..f320be2115 100644 --- a/block/src/raw_async_aio.rs +++ b/block/src/raw_async_aio.rs @@ -14,9 +14,7 @@ use log::warn; use vmm_sys_util::aio; use vmm_sys_util::eventfd::EventFd; -use crate::async_io::{ - AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFile, DiskFileError, DiskFileResult, -}; +use crate::async_io::{AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFileError}; use crate::error::{BlockError, BlockErrorKind, BlockResult}; use crate::{DiskTopology, SECTOR_SIZE, disk_file, probe_sparse_support, query_device_size}; @@ -31,45 +29,6 @@ impl RawFileDiskAio { } } -impl DiskFile for RawFileDiskAio { - fn logical_size(&mut self) -> DiskFileResult { - Ok(query_device_size(&self.file) - .map_err(DiskFileError::Size)? - .0) - } - - fn physical_size(&mut self) -> DiskFileResult { - Ok(query_device_size(&self.file) - .map_err(DiskFileError::Size)? - .1) - } - - fn new_async_io(&self, ring_depth: u32) -> DiskFileResult> { - let mut raw = RawFileAsyncAio::new(self.file.as_raw_fd(), ring_depth) - .map_err(DiskFileError::NewAsyncIo)?; - raw.alignment = - DiskTopology::probe(&self.file).map_or(SECTOR_SIZE, |t| t.logical_block_size); - Ok(Box::new(raw) as Box) - } - - fn topology(&mut self) -> DiskTopology { - if let Ok(topology) = DiskTopology::probe(&self.file) { - topology - } else { - warn!("Unable to get device topology. 
Using default topology"); - DiskTopology::default() - } - } - - fn supports_sparse_operations(&self) -> bool { - probe_sparse_support(&self.file) - } - - fn fd(&mut self) -> BorrowedDiskFd<'_> { - BorrowedDiskFd::new(self.file.as_raw_fd()) - } -} - impl disk_file::DiskSize for RawFileDiskAio { fn logical_size(&self) -> BlockResult { query_device_size(&self.file) diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 72ce17b26a..747e6311c1 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -32,7 +32,6 @@ use arch::layout::{APIC_START, IOAPIC_SIZE, IOAPIC_START}; #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] use arch::{DeviceType, MmioDeviceInfo}; use arch::{NumaNodes, layout}; -use block::async_io::DiskFile; use block::disk_file::DiskBackend; use block::error::BlockError; use block::fixed_vhd_sync::FixedVhdDiskSync; @@ -2760,7 +2759,7 @@ impl DeviceManager { } } else if !disk_cfg.disable_aio && self.aio_is_supported() { info!("Using asynchronous RAW disk file (aio)"); - DiskBackend::Legacy(Box::new(RawFileDiskAio::new(file)) as Box) + DiskBackend::Next(Box::new(RawFileDiskAio::new(file))) } else { info!("Using synchronous RAW disk file"); DiskBackend::Next(Box::new(RawFileDiskSync::new(file))) From f6b4061629f0c6c4afd90aa2453dd10976c94101 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Fri, 27 Mar 2026 23:55:41 -0700 Subject: [PATCH 457/742] block: Return BlockResult from RawFileAsyncAio::new() Change the return type of RawFileAsyncAio::new() from std::io::Result to BlockResult, wrapping internal errors from EventFd::new() and IoContext::new() in BlockError with DiskFileError::NewAsyncIo. This simplifies the caller in AsyncDiskFile::new_async_io() which no longer needs its own error mapping. 
Signed-off-by: Muminul Islam --- block/src/raw_async_aio.rs | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/block/src/raw_async_aio.rs b/block/src/raw_async_aio.rs index f320be2115..980f8d13a7 100644 --- a/block/src/raw_async_aio.rs +++ b/block/src/raw_async_aio.rs @@ -86,8 +86,7 @@ impl disk_file::AsyncDiskFile for RawFileDiskAio { } fn new_async_io(&self, ring_depth: u32) -> BlockResult> { - let mut raw = RawFileAsyncAio::new(self.file.as_raw_fd(), ring_depth) - .map_err(|e| BlockError::new(BlockErrorKind::Io, DiskFileError::NewAsyncIo(e)))?; + let mut raw = RawFileAsyncAio::new(self.file.as_raw_fd(), ring_depth)?; raw.alignment = DiskTopology::probe(&self.file).map_or(SECTOR_SIZE, |t| t.logical_block_size); Ok(Box::new(raw) as Box) @@ -103,9 +102,11 @@ pub struct RawFileAsyncAio { } impl RawFileAsyncAio { - pub fn new(fd: RawFd, queue_depth: u32) -> std::io::Result { - let eventfd = EventFd::new(libc::EFD_NONBLOCK)?; - let ctx = aio::IoContext::new(queue_depth)?; + pub fn new(fd: RawFd, queue_depth: u32) -> BlockResult { + let eventfd = + EventFd::new(libc::EFD_NONBLOCK).map_err(|e| BlockError::new(BlockErrorKind::Io, e))?; + let ctx = + aio::IoContext::new(queue_depth).map_err(|e| BlockError::new(BlockErrorKind::Io, e))?; Ok(RawFileAsyncAio { fd, From 8026eb177f892fc39f27331842950bc0174d8a54 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 2 Apr 2026 04:16:59 +0000 Subject: [PATCH 458/742] build(deps): bump crate-ci/typos from 1.44.0 to 1.45.0 Bumps [crate-ci/typos](https://github.com/crate-ci/typos) from 1.44.0 to 1.45.0. 
- [Release notes](https://github.com/crate-ci/typos/releases) - [Changelog](https://github.com/crate-ci/typos/blob/master/CHANGELOG.md) - [Commits](https://github.com/crate-ci/typos/compare/v1.44.0...v1.45.0) --- updated-dependencies: - dependency-name: crate-ci/typos dependency-version: 1.45.0 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- .github/workflows/quality.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/quality.yaml b/.github/workflows/quality.yaml index 81baf1e3af..de6391186a 100644 --- a/.github/workflows/quality.yaml +++ b/.github/workflows/quality.yaml @@ -167,4 +167,4 @@ jobs: steps: - uses: actions/checkout@v6 # Executes "typos ." - - uses: crate-ci/typos@v1.44.0 + - uses: crate-ci/typos@v1.45.0 From c60168256a1befccf10172afc192f702cd39968c Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Tue, 31 Mar 2026 04:28:41 +0000 Subject: [PATCH 459/742] net_util: Tolerate some unsupported command classes Windows NetKVM driver (>= 0.1.271) issues unsupported command classes even when the corresponding features have not been advertised. According to the Virtio 1.2 specification: RX, VLAN, and ANNOUNCE control paths are only meaningful when their corresponding features are negotiated in sections 5.1.3.1, 5.1.6.5.1.2, 5.1.6.5.2.2, and 5.1.6.5.4.1. RX and VLAN are explicitly described as best-effort in sections 5.1.6.5.1 and 5.1.6.5.3. Instead of returning an error, return success to the guest. 
Fixes: #7925 Signed-off-by: Wei Liu --- net_util/src/ctrl_queue.rs | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/net_util/src/ctrl_queue.rs b/net_util/src/ctrl_queue.rs index 284b6ec4e6..f449d5527a 100644 --- a/net_util/src/ctrl_queue.rs +++ b/net_util/src/ctrl_queue.rs @@ -2,12 +2,16 @@ // // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause -use log::{error, info, warn}; +use log::{debug, error, info, warn}; use thiserror::Error; use virtio_bindings::virtio_net::{ - VIRTIO_NET_CTRL_GUEST_OFFLOADS, VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET, VIRTIO_NET_CTRL_MQ, - VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX, VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN, - VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, VIRTIO_NET_ERR, VIRTIO_NET_OK, + VIRTIO_NET_CTRL_ANNOUNCE, VIRTIO_NET_CTRL_ANNOUNCE_ACK, VIRTIO_NET_CTRL_GUEST_OFFLOADS, + VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET, VIRTIO_NET_CTRL_MQ, VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX, + VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN, VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, VIRTIO_NET_CTRL_RX, + VIRTIO_NET_CTRL_RX_ALLMULTI, VIRTIO_NET_CTRL_RX_ALLUNI, VIRTIO_NET_CTRL_RX_NOBCAST, + VIRTIO_NET_CTRL_RX_NOMULTI, VIRTIO_NET_CTRL_RX_NOUNI, VIRTIO_NET_CTRL_RX_PROMISC, + VIRTIO_NET_CTRL_VLAN, VIRTIO_NET_CTRL_VLAN_ADD, VIRTIO_NET_CTRL_VLAN_DEL, VIRTIO_NET_ERR, + VIRTIO_NET_OK, }; use virtio_queue::{Queue, QueueT}; use vm_memory::{ByteValued, Bytes, GuestMemoryError}; @@ -53,6 +57,26 @@ pub struct ControlHeader { // SAFETY: ControlHeader only contains a series of integers unsafe impl ByteValued for ControlHeader {} +fn is_tolerated_ctrl_command(ctrl_hdr: ControlHeader) -> bool { + match u32::from(ctrl_hdr.class) { + VIRTIO_NET_CTRL_RX => matches!( + u32::from(ctrl_hdr.cmd), + VIRTIO_NET_CTRL_RX_PROMISC + | VIRTIO_NET_CTRL_RX_ALLMULTI + | VIRTIO_NET_CTRL_RX_ALLUNI + | VIRTIO_NET_CTRL_RX_NOMULTI + | VIRTIO_NET_CTRL_RX_NOUNI + | VIRTIO_NET_CTRL_RX_NOBCAST + ), + VIRTIO_NET_CTRL_VLAN => matches!( + u32::from(ctrl_hdr.cmd), + VIRTIO_NET_CTRL_VLAN_ADD | 
VIRTIO_NET_CTRL_VLAN_DEL + ), + VIRTIO_NET_CTRL_ANNOUNCE => u32::from(ctrl_hdr.cmd) == VIRTIO_NET_CTRL_ANNOUNCE_ACK, + _ => false, + } +} + pub struct CtrlQueue { pub taps: Vec, } @@ -128,6 +152,10 @@ impl CtrlQueue { false } } + _ if is_tolerated_ctrl_command(ctrl_hdr) => { + debug!("Ignoring unsupported but tolerated control command {ctrl_hdr:?}"); + true + } _ => { warn!("Unsupported command {ctrl_hdr:?}"); false From 7461143194e1367869ae05dde72f1f531c1d980f Mon Sep 17 00:00:00 2001 From: CMGS Date: Tue, 31 Mar 2026 14:30:31 +0000 Subject: [PATCH 460/742] net_util: fix ctrl_queue used_len to only count written bytes The control queue handler passed the total length of all descriptors (header + data + status) as used_len to add_used. Per virtio spec section 2.6.8, used_len must only count bytes written to device-writable descriptors. The device only writes the 1-byte status/ack field. Windows NetKVM >= 0.1.285 strictly checks this value and calls NdisMRemoveMiniport when len != sizeof(virtio_net_ctrl_ack), removing the network adapter immediately after activation. Signed-off-by: CMGS --- net_util/src/ctrl_queue.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net_util/src/ctrl_queue.rs b/net_util/src/ctrl_queue.rs index f449d5527a..e42b4c0ca5 100644 --- a/net_util/src/ctrl_queue.rs +++ b/net_util/src/ctrl_queue.rs @@ -171,7 +171,9 @@ impl CtrlQueue { .translate_gva(access_platform, status_desc.len() as usize), ) .map_err(Error::GuestMemory)?; - let len = ctrl_desc.len() + data_desc.len() + status_desc.len(); + // Per virtio spec 2.6.8, used_len is the number of bytes written + // to device-writable descriptors. Only the status byte is written. 
+ let len = status_desc.len(); queue .add_used(desc_chain.memory(), desc_chain.head_index(), len) From ef9133a3eedb832e0d47ea243f7705895274707f Mon Sep 17 00:00:00 2001 From: Chinmoy Date: Mon, 16 Feb 2026 22:28:31 +0530 Subject: [PATCH 461/742] vmm: acpi: Take &T instead of &Arc> Refactor ACPI table creation functions to accept borrowed references, removing double indirection and moving locking to callers. Signed-off-by: Chinmoy --- vmm/src/acpi.rs | 79 ++++++++++++++++----------------------- vmm/src/device_manager.rs | 4 +- vmm/src/vm.rs | 18 ++++----- 3 files changed, 43 insertions(+), 58 deletions(-) diff --git a/vmm/src/acpi.rs b/vmm/src/acpi.rs index 8f46b20ddb..ac05306bcf 100644 --- a/vmm/src/acpi.rs +++ b/vmm/src/acpi.rs @@ -2,7 +2,6 @@ // // SPDX-License-Identifier: Apache-2.0 // -use std::sync::{Arc, Mutex}; use std::time::Instant; use acpi_tables::Aml; @@ -192,7 +191,7 @@ bitflags! { impl MemoryAffinity { fn from_region( - region: &Arc, + region: &GuestRegionMmap, proximity_domain: u32, flags: MemAffinityFlags, ) -> Self { @@ -258,9 +257,9 @@ struct ViotPciRangeNode { } pub fn create_dsdt_table( - device_manager: &Arc>, - cpu_manager: &Arc>, - memory_manager: &Arc>, + device_manager: &DeviceManager, + cpu_manager: &CpuManager, + memory_manager: &MemoryManager, ) -> Sdt { trace_scoped!("create_dsdt_table"); // DSDT @@ -268,9 +267,9 @@ pub fn create_dsdt_table( let mut bytes = Vec::new(); - device_manager.lock().unwrap().to_aml_bytes(&mut bytes); - cpu_manager.lock().unwrap().to_aml_bytes(&mut bytes); - memory_manager.lock().unwrap().to_aml_bytes(&mut bytes); + device_manager.to_aml_bytes(&mut bytes); + cpu_manager.to_aml_bytes(&mut bytes); + memory_manager.to_aml_bytes(&mut bytes); dsdt.append_slice(&bytes); dsdt @@ -278,14 +277,13 @@ pub fn create_dsdt_table( const FACP_DSDT_OFFSET: usize = 140; -fn create_facp_table(dsdt_offset: GuestAddress, device_manager: &Arc>) -> Sdt { +fn create_facp_table(dsdt_offset: GuestAddress, device_manager: &DeviceManager) 
-> Sdt { trace_scoped!("create_facp_table"); // Revision 6 of the ACPI FADT table is 276 bytes long let mut facp = Sdt::new(*b"FACP", 276, 6, *b"CLOUDH", *b"CHFACP ", 1); { - let device_manager = device_manager.lock().unwrap(); if let Some(address) = device_manager.acpi_platform_addresses().reset_reg_address { // RESET_REG facp.write(116, address); @@ -369,7 +367,7 @@ fn create_tpm2_table() -> Sdt { fn create_srat_table( numa_nodes: &NumaNodes, - device_manager: &Arc>, + device_manager: &DeviceManager, #[cfg(target_arch = "x86_64")] topology: Option<(u16, u16, u16, u16)>, ) -> Sdt { let mut srat = Sdt::new(*b"SRAT", 36, 3, *b"CLOUDH", *b"CHSRAT ", 1); @@ -381,7 +379,6 @@ fn create_srat_table( assert_eq!(std::mem::size_of::(), 40); // Confirm struct size matches ACPI 6.6 spec assert_eq!(std::mem::size_of::(), 32); - let dm = device_manager.lock().unwrap(); for (node_id, node) in numa_nodes.iter() { let proximity_domain = *node_id; @@ -436,7 +433,7 @@ fn create_srat_table( // Add Generic Initiator Affinity structures for device-only NUMA nodes if let Some(device_id) = &node.device_id { // Resolve device_id to guest BDF - if let Some(bdf) = dm.get_device_bdf(device_id) { + if let Some(bdf) = device_manager.get_device_bdf(device_id) { srat.append(GenericInitiatorAffinity::from_pci_bdf( bdf, proximity_domain, @@ -852,9 +849,9 @@ fn create_viot_table(iommu_bdf: &PciBdf, devices_bdf: &[PciBdf]) -> Sdt { // * `Vec` contains a list of table pointers stored in XSDT. 
fn create_acpi_tables_internal( dsdt_addr: GuestAddress, - device_manager: &Arc>, - cpu_manager: &Arc>, - memory_manager: &Arc>, + device_manager: &DeviceManager, + cpu_manager: &CpuManager, + memory_manager: &MemoryManager, numa_nodes: &NumaNodes, tpm_enabled: bool, ) -> (Rsdp, Vec, Vec) { @@ -876,15 +873,13 @@ fn create_acpi_tables_internal( // MADT #[cfg(target_arch = "aarch64")] let vgic = device_manager - .lock() - .unwrap() .get_interrupt_controller() .unwrap() .lock() .unwrap() .get_vgic() .unwrap(); - let madt = cpu_manager.lock().unwrap().create_madt( + let madt = cpu_manager.create_madt( #[cfg(target_arch = "aarch64")] vgic, ); @@ -897,7 +892,7 @@ fn create_acpi_tables_internal( // PPTT #[cfg(target_arch = "aarch64")] { - let pptt = cpu_manager.lock().unwrap().create_pptt(); + let pptt = cpu_manager.create_pptt(); let pptt_addr = prev_tbl_addr.checked_add(prev_tbl_len).unwrap(); tables_bytes.extend_from_slice(pptt.as_slice()); xsdt_table_pointers.push(pptt_addr.0); @@ -917,7 +912,7 @@ fn create_acpi_tables_internal( } // MCFG - let mcfg = create_mcfg_table(device_manager.lock().unwrap().pci_segments()); + let mcfg = create_mcfg_table(device_manager.pci_segments()); let mcfg_addr = prev_tbl_addr.checked_add(prev_tbl_len).unwrap(); tables_bytes.extend_from_slice(mcfg.as_slice()); xsdt_table_pointers.push(mcfg_addr.0); @@ -928,16 +923,12 @@ fn create_acpi_tables_internal( #[cfg(target_arch = "aarch64")] { let is_serial_on = device_manager - .lock() - .unwrap() .get_device_info() .clone() .contains_key(&(DeviceType::Serial, DeviceType::Serial.to_string())); let serial_device_addr = arch::layout::LEGACY_SERIAL_MAPPED_IO_START.raw_value(); let serial_device_irq = if is_serial_on { device_manager - .lock() - .unwrap() .get_device_info() .clone() .get(&(DeviceType::Serial, DeviceType::Serial.to_string())) @@ -979,7 +970,7 @@ fn create_acpi_tables_internal( // Only created if the NUMA nodes list is not empty. 
if !numa_nodes.is_empty() { #[cfg(target_arch = "x86_64")] - let topology = cpu_manager.lock().unwrap().get_vcpu_topology(); + let topology = cpu_manager.get_vcpu_topology(); // SRAT let srat = create_srat_table( numa_nodes, @@ -1003,7 +994,7 @@ fn create_acpi_tables_internal( #[cfg(target_arch = "aarch64")] { - let iort = create_iort_table(device_manager.lock().unwrap().pci_segments()); + let iort = create_iort_table(device_manager.pci_segments()); let iort_addr = prev_tbl_addr.checked_add(prev_tbl_len).unwrap(); tables_bytes.extend_from_slice(iort.as_slice()); xsdt_table_pointers.push(iort_addr.0); @@ -1012,8 +1003,7 @@ fn create_acpi_tables_internal( } // VIOT - if let Some((iommu_bdf, devices_bdf)) = device_manager.lock().unwrap().iommu_attached_devices() - { + if let Some((iommu_bdf, devices_bdf)) = device_manager.iommu_attached_devices() { let viot = create_viot_table(iommu_bdf, devices_bdf); let viot_addr = prev_tbl_addr.checked_add(prev_tbl_len).unwrap(); @@ -1040,9 +1030,9 @@ fn create_acpi_tables_internal( #[cfg(feature = "fw_cfg")] pub fn create_acpi_tables_for_fw_cfg( - device_manager: &Arc>, - cpu_manager: &Arc>, - memory_manager: &Arc>, + device_manager: &DeviceManager, + cpu_manager: &CpuManager, + memory_manager: &MemoryManager, numa_nodes: &NumaNodes, tpm_enabled: bool, ) -> Result<(), crate::vm::Error> { @@ -1087,8 +1077,6 @@ pub fn create_acpi_tables_for_fw_cfg( checksums.push(xsdt_checksum); device_manager - .lock() - .unwrap() .fw_cfg() .expect("fw_cfg must be present") .lock() @@ -1099,9 +1087,9 @@ pub fn create_acpi_tables_for_fw_cfg( pub fn create_acpi_tables( guest_mem: &GuestMemoryMmap, - device_manager: &Arc>, - cpu_manager: &Arc>, - memory_manager: &Arc>, + device_manager: &DeviceManager, + cpu_manager: &CpuManager, + memory_manager: &MemoryManager, numa_nodes: &NumaNodes, tpm_enabled: bool, ) -> GuestAddress { @@ -1139,9 +1127,9 @@ pub fn create_acpi_tables( #[cfg(feature = "tdx")] pub fn create_acpi_tables_tdx( - device_manager: &Arc>, 
- cpu_manager: &Arc>, - memory_manager: &Arc>, + device_manager: &DeviceManager, + cpu_manager: &CpuManager, + memory_manager: &MemoryManager, numa_nodes: &NumaNodes, ) -> Vec { // DSDT @@ -1155,18 +1143,16 @@ pub fn create_acpi_tables_tdx( tables.push(create_facp_table(GuestAddress(0), device_manager)); // MADT - tables.push(cpu_manager.lock().unwrap().create_madt()); + tables.push(cpu_manager.create_madt()); // MCFG - tables.push(create_mcfg_table( - device_manager.lock().unwrap().pci_segments(), - )); + tables.push(create_mcfg_table(device_manager.pci_segments())); // SRAT and SLIT // Only created if the NUMA nodes list is not empty. if !numa_nodes.is_empty() { #[cfg(target_arch = "x86_64")] - let topology = cpu_manager.lock().unwrap().get_vcpu_topology(); + let topology = cpu_manager.get_vcpu_topology(); // SRAT tables.push(create_srat_table( @@ -1181,8 +1167,7 @@ pub fn create_acpi_tables_tdx( } // VIOT - if let Some((iommu_bdf, devices_bdf)) = device_manager.lock().unwrap().iommu_attached_devices() - { + if let Some((iommu_bdf, devices_bdf)) = device_manager.iommu_attached_devices() { tables.push(create_viot_table(iommu_bdf, devices_bdf)); } diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 747e6311c1..b6d35964f9 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -1756,7 +1756,7 @@ impl DeviceManager { } #[cfg(target_arch = "aarch64")] - pub fn get_interrupt_controller(&mut self) -> Option<&Arc>> { + pub fn get_interrupt_controller(&self) -> Option<&Arc>> { self.interrupt_controller.as_ref() } @@ -1793,7 +1793,7 @@ impl DeviceManager { } #[cfg(target_arch = "riscv64")] - pub fn get_interrupt_controller(&mut self) -> Option<&Arc>> { + pub fn get_interrupt_controller(&self) -> Option<&Arc>> { self.interrupt_controller.as_ref() } diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index c1e21a5ac9..7a5a19133a 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -2514,9 +2514,9 @@ impl Vm { // Loop over the ACPI tables and copy 
them to the HOB. for acpi_table in crate::acpi::create_acpi_tables_tdx( - &self.device_manager, - &self.cpu_manager, - &self.memory_manager, + &self.device_manager.lock().unwrap(), + &self.cpu_manager.lock().unwrap(), + &self.memory_manager.lock().unwrap(), &self.numa_nodes, ) { hob.add_acpi_table(&mem, acpi_table.as_slice()) @@ -2576,9 +2576,9 @@ impl Vm { let tpm_enabled = self.config.lock().unwrap().tpm.is_some(); let rsdp_addr = crate::acpi::create_acpi_tables( &mem, - &self.device_manager, - &self.cpu_manager, - &self.memory_manager, + &self.device_manager.lock().unwrap(), + &self.cpu_manager.lock().unwrap(), + &self.memory_manager.lock().unwrap(), &self.numa_nodes, tpm_enabled, ); @@ -2643,9 +2643,9 @@ impl Vm { if fw_cfg_config.acpi_tables { let tpm_enabled = self.config.lock().unwrap().tpm.is_some(); crate::acpi::create_acpi_tables_for_fw_cfg( - &self.device_manager, - &self.cpu_manager, - &self.memory_manager, + &self.device_manager.lock().unwrap(), + &self.cpu_manager.lock().unwrap(), + &self.memory_manager.lock().unwrap(), &self.numa_nodes, tpm_enabled, )?; From d0b253472d3c69283a55fd22891b7b8d0a2e3c20 Mon Sep 17 00:00:00 2001 From: Chinmoy Date: Mon, 16 Feb 2026 22:31:30 +0530 Subject: [PATCH 462/742] pci, devices, virtio-devices, vmm: Refactor allocate_bars Refactor PciDevice::allocate_bars trait and all implementations to take &mut SystemAllocator instead of &Arc>, removing double indirection. The caller in device_manager.rs now acquires the lock before calling allocate_bars. 
Signed-off-by: Chinmoy --- devices/src/ivshmem.rs | 2 +- devices/src/pvmemcontrol.rs | 4 ++-- devices/src/pvpanic.rs | 4 ++-- pci/src/device.rs | 4 ++-- pci/src/vfio.rs | 6 ++---- pci/src/vfio_user.rs | 2 +- virtio-devices/src/transport/pci_device.rs | 2 +- vmm/src/device_manager.rs | 2 +- 8 files changed, 12 insertions(+), 14 deletions(-) diff --git a/devices/src/ivshmem.rs b/devices/src/ivshmem.rs index fff48a72c6..98291c74e8 100644 --- a/devices/src/ivshmem.rs +++ b/devices/src/ivshmem.rs @@ -217,7 +217,7 @@ impl BusDevice for IvshmemDevice { impl PciDevice for IvshmemDevice { fn allocate_bars( &mut self, - _allocator: &Arc>, + _allocator: &mut SystemAllocator, mmio32_allocator: &mut AddressAllocator, mmio64_allocator: &mut AddressAllocator, resources: Option>, diff --git a/devices/src/pvmemcontrol.rs b/devices/src/pvmemcontrol.rs index 06e0e24923..d4b37456be 100644 --- a/devices/src/pvmemcontrol.rs +++ b/devices/src/pvmemcontrol.rs @@ -5,7 +5,7 @@ use std::collections::HashMap; use std::ffi::CString; -use std::sync::{Arc, Barrier, Mutex, RwLock}; +use std::sync::{Arc, Barrier, RwLock}; use std::{io, result}; use log::{debug, warn}; @@ -722,7 +722,7 @@ impl PciDevice for PvmemcontrolPciDevice { fn allocate_bars( &mut self, - _allocator: &Arc>, + _allocator: &mut SystemAllocator, mmio32_allocator: &mut AddressAllocator, _mmio64_allocator: &mut AddressAllocator, resources: Option>, diff --git a/devices/src/pvpanic.rs b/devices/src/pvpanic.rs index 0451da1a05..9540a91252 100644 --- a/devices/src/pvpanic.rs +++ b/devices/src/pvpanic.rs @@ -5,7 +5,7 @@ use std::any::Any; use std::result; -use std::sync::{Arc, Barrier, Mutex}; +use std::sync::{Arc, Barrier}; use anyhow::anyhow; use event_monitor::event; @@ -174,7 +174,7 @@ impl PciDevice for PvPanicDevice { fn allocate_bars( &mut self, - _allocator: &Arc>, + _allocator: &mut SystemAllocator, mmio32_allocator: &mut AddressAllocator, _mmio64_allocator: &mut AddressAllocator, resources: Option>, diff --git 
a/pci/src/device.rs b/pci/src/device.rs index 3a23ea7772..29c89b8c42 100644 --- a/pci/src/device.rs +++ b/pci/src/device.rs @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause use std::any::Any; -use std::sync::{Arc, Barrier, Mutex}; +use std::sync::{Arc, Barrier}; use std::{io, result}; use serde::{Deserialize, Serialize}; @@ -49,7 +49,7 @@ pub trait PciDevice: Send { /// returns an address. Returns a Vec of (GuestAddress, GuestUsize) tuples. fn allocate_bars( &mut self, - _allocator: &Arc>, + _allocator: &mut SystemAllocator, _mmio32_allocator: &mut AddressAllocator, _mmio64_allocator: &mut AddressAllocator, _resources: Option>, diff --git a/pci/src/vfio.rs b/pci/src/vfio.rs index f1d22ff63c..53c77a95c1 100644 --- a/pci/src/vfio.rs +++ b/pci/src/vfio.rs @@ -602,7 +602,7 @@ impl VfioCommon { #[allow(unused_variables)] pub(crate) fn allocate_bars( &mut self, - allocator: &Arc>, + allocator: &mut SystemAllocator, mmio32_allocator: &mut AddressAllocator, mmio64_allocator: &mut AddressAllocator, resources: Option<&[Resource]>, @@ -741,8 +741,6 @@ impl VfioCommon { PciBarRegionType::IoRegion => { // The address needs to be 4 bytes aligned. allocator - .lock() - .unwrap() .allocate_io_addresses(restored_bar_addr, region_size, Some(0x4)) .ok_or(PciDeviceError::IoAllocationFailed(region_size))? 
} @@ -1852,7 +1850,7 @@ const PCI_ROM_EXP_BAR_INDEX: usize = 12; impl PciDevice for VfioPciDevice { fn allocate_bars( &mut self, - allocator: &Arc>, + allocator: &mut SystemAllocator, mmio32_allocator: &mut AddressAllocator, mmio64_allocator: &mut AddressAllocator, resources: Option>, diff --git a/pci/src/vfio_user.rs b/pci/src/vfio_user.rs index 248f87272a..456047d42d 100644 --- a/pci/src/vfio_user.rs +++ b/pci/src/vfio_user.rs @@ -391,7 +391,7 @@ impl Vfio for VfioUserClientWrapper { impl PciDevice for VfioUserPciDevice { fn allocate_bars( &mut self, - allocator: &Arc>, + allocator: &mut SystemAllocator, mmio32_allocator: &mut AddressAllocator, mmio64_allocator: &mut AddressAllocator, resources: Option>, diff --git a/virtio-devices/src/transport/pci_device.rs b/virtio-devices/src/transport/pci_device.rs index dd54472d39..36975e3f7f 100644 --- a/virtio-devices/src/transport/pci_device.rs +++ b/virtio-devices/src/transport/pci_device.rs @@ -978,7 +978,7 @@ impl PciDevice for VirtioPciDevice { fn allocate_bars( &mut self, - _allocator: &Arc>, + _allocator: &mut SystemAllocator, mmio32_allocator: &mut AddressAllocator, mmio64_allocator: &mut AddressAllocator, resources: Option>, diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index b6d35964f9..edb8b76af0 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -3995,7 +3995,7 @@ impl DeviceManager { .lock() .unwrap() .allocate_bars( - &self.address_manager.allocator, + &mut self.address_manager.allocator.lock().unwrap(), &mut self.pci_segments[segment_id as usize] .mem32_allocator .lock() From 5ae329305a047a68dce6f58afb83633ecff53e6a Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 1 Apr 2026 21:23:37 +0200 Subject: [PATCH 463/742] virtio-devices: block: Fix writeback mode update flow Virtio v1.2 says that if CONFIG_WCE is negotiated but FLUSH is not, the device must initialize writeback to 0. 
It also says that if CONFIG_WCE was not negotiated but FLUSH was, the driver should assume presence of a writeback cache. Introduce a pure is_writeback_enabled helper and a set_writeback_mode helper. This makes the two call flows explicit: * write_config resolves the guest requested mode against the negotiated features before storing it back * activate starts from the default writeback preference and then resolves it against the negotiated features * reset restores the initial writeback state This keeps the config space value and the runtime writeback flag in sync and makes the spec driven fallback easier to follow. Signed-off-by: Anatol Belski --- virtio-devices/src/block.rs | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/virtio-devices/src/block.rs b/virtio-devices/src/block.rs index 1080b71bcb..bd740fc4a5 100644 --- a/virtio-devices/src/block.rs +++ b/virtio-devices/src/block.rs @@ -974,24 +974,24 @@ impl Block { } } - fn update_writeback(&mut self) { - // Use writeback from config if VIRTIO_BLK_F_CONFIG_WCE - let writeback = if self.common.feature_acked(VIRTIO_BLK_F_CONFIG_WCE.into()) { - self.config.writeback == 1 - } else { - // Else check if VIRTIO_BLK_F_FLUSH negotiated - self.common.feature_acked(VIRTIO_BLK_F_FLUSH.into()) - }; + /// The virtio v1.2 spec says "If VIRTIO_BLK_F_CONFIG_WCE was not + /// negotiated but VIRTIO_BLK_F_FLUSH was, the driver SHOULD assume + /// presence of a writeback cache." It also says "If + /// VIRTIO_BLK_F_CONFIG_WCE is negotiated but VIRTIO_BLK_F_FLUSH is not, + /// the device MUST initialize writeback to 0." 
+ fn is_writeback_enabled(&self, desired: bool) -> bool { + let flush = self.common.feature_acked(VIRTIO_BLK_F_FLUSH.into()); + let wce = self.common.feature_acked(VIRTIO_BLK_F_CONFIG_WCE.into()); + if wce { flush && desired } else { flush } + } + fn set_writeback_mode(&mut self, enabled: bool) { + self.config.writeback = enabled as u8; + self.writeback.store(enabled, Ordering::Release); info!( "Changing cache mode to {}", - if writeback { - "writeback" - } else { - "writethrough" - } + if enabled { "writeback" } else { "writethrough" } ); - self.writeback.store(writeback, Ordering::Release); } pub fn resize(&mut self, new_size: u64) -> Result<()> { @@ -1073,8 +1073,8 @@ impl VirtioDevice for Block { return; } - self.config.writeback = data[0]; - self.update_writeback(); + let writeback = self.is_writeback_enabled(data[0] == 1); + self.set_writeback_mode(writeback); } fn activate(&mut self, context: crate::device::ActivationContext) -> ActivateResult { @@ -1097,7 +1097,8 @@ impl VirtioDevice for Block { // Recompute the barrier size from the queues that are actually activated. self.common.paused_sync = Some(Arc::new(Barrier::new(queues.len() + 1))); - self.update_writeback(); + let writeback = self.is_writeback_enabled(self.config.writeback == 1); + self.set_writeback_mode(writeback); let mut epoll_threads = Vec::new(); let event_idx = self.common.feature_acked(VIRTIO_RING_F_EVENT_IDX.into()); @@ -1167,6 +1168,7 @@ impl VirtioDevice for Block { fn reset(&mut self) -> Option> { let result = self.common.reset(); + self.set_writeback_mode(true); event!("virtio-device", "reset", "id", &self.id); result } From 488927a5eacb257a68fb7467371bf408e161d5ca Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Wed, 1 Apr 2026 19:01:56 -0700 Subject: [PATCH 464/742] performance-metrics: Remove duplicate direct_kernel_boot_path() Remove the local direct_kernel_boot_path() function and unused PathBuf import from performance_tests.rs. 
The identical public function from test_infra is already available via wildcard import. Signed-off-by: Muminul Islam --- performance-metrics/src/performance_tests.rs | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/performance-metrics/src/performance_tests.rs b/performance-metrics/src/performance_tests.rs index 1022aac19e..fa2f8ac3dc 100644 --- a/performance-metrics/src/performance_tests.rs +++ b/performance-metrics/src/performance_tests.rs @@ -5,7 +5,6 @@ // Performance tests -use std::path::PathBuf; use std::time::Duration; use std::{fs, thread}; @@ -130,22 +129,6 @@ fn performance_test_new_guest(disk_config: Box) -> Guest { const DIRECT_KERNEL_BOOT_CMDLINE: &str = "root=/dev/vda1 console=hvc0 rw systemd.journald.forward_to_console=1"; -// Creates the path for direct kernel boot and return the path. -// For x86_64, this function returns the vmlinux kernel path. -// For AArch64, this function returns the PE kernel path. -fn direct_kernel_boot_path() -> PathBuf { - let mut workload_path = dirs::home_dir().unwrap(); - workload_path.push("workloads"); - - let mut kernel_path = workload_path; - #[cfg(target_arch = "x86_64")] - kernel_path.push("vmlinux-x86_64"); - #[cfg(target_arch = "aarch64")] - kernel_path.push("Image-arm64"); - - kernel_path -} - fn remote_command(api_socket: &str, command: &str, arg: Option<&str>) -> bool { let mut cmd = std::process::Command::new(clh_command("ch-remote")); cmd.args([&format!("--api-socket={api_socket}"), command]); From 4cf10737e4fe6fcd738261eea7d22c910555569c Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Wed, 1 Apr 2026 19:02:25 -0700 Subject: [PATCH 465/742] performance-metrics: Remove duplicate DIRECT_KERNEL_BOOT_CMDLINE Remove the local DIRECT_KERNEL_BOOT_CMDLINE constant from performance_tests.rs. The identical public constant from test_infra is already available via wildcard import. 
Signed-off-by: Muminul Islam --- performance-metrics/src/performance_tests.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/performance-metrics/src/performance_tests.rs b/performance-metrics/src/performance_tests.rs index fa2f8ac3dc..c12c1955a7 100644 --- a/performance-metrics/src/performance_tests.rs +++ b/performance-metrics/src/performance_tests.rs @@ -126,9 +126,6 @@ fn performance_test_new_guest(disk_config: Box) -> Guest { Guest::new_from_ip_range(disk_config, "172.19", 0) } -const DIRECT_KERNEL_BOOT_CMDLINE: &str = - "root=/dev/vda1 console=hvc0 rw systemd.journald.forward_to_console=1"; - fn remote_command(api_socket: &str, command: &str, arg: Option<&str>) -> bool { let mut cmd = std::process::Command::new(clh_command("ch-remote")); cmd.args([&format!("--api-socket={api_socket}"), command]); From 1400e614cb7379e34f638a82c2c936a72f95593c Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Wed, 1 Apr 2026 19:03:17 -0700 Subject: [PATCH 466/742] performance-metrics: Remove duplicate x86_64 FOCAL_IMAGE_NAME Remove the local x86_64 FOCAL_IMAGE_NAME constant from performance_tests.rs. The identical public constant from test_infra is already available via wildcard import. The aarch64 definition is kept as it differs from test_infra: performance-metrics uses a specific image with the '-update-tool' suffix. 
Signed-off-by: Muminul Islam --- performance-metrics/src/performance_tests.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/performance-metrics/src/performance_tests.rs b/performance-metrics/src/performance_tests.rs index c12c1955a7..a015f13c63 100644 --- a/performance-metrics/src/performance_tests.rs +++ b/performance-metrics/src/performance_tests.rs @@ -13,8 +13,6 @@ use thiserror::Error; use crate::{ImageFormat, PerformanceTestControl, PerformanceTestOverrides, mean}; -#[cfg(target_arch = "x86_64")] -pub const FOCAL_IMAGE_NAME: &str = "focal-server-cloudimg-amd64-custom-20210609-0.raw"; #[cfg(target_arch = "aarch64")] pub const FOCAL_IMAGE_NAME: &str = "focal-server-cloudimg-arm64-custom-20210929-0-update-tool.raw"; From c8bfac66f4449c8f04b631fb3970bbf0b8bd5f57 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Wed, 1 Apr 2026 19:15:54 -0700 Subject: [PATCH 467/742] test_infra: Move remote_command to test_infra Move remote_command() and remote_command_w_output() from tests/common/utils.rs into test_infra/src/lib.rs to allow reuse across crates. The cloud-hypervisor integration tests already use 'use test_infra::*', so the functions are available without any caller changes. 
Signed-off-by: Muminul Islam --- cloud-hypervisor/tests/common/utils.rs | 35 ------------------------- test_infra/src/lib.rs | 36 ++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 35 deletions(-) diff --git a/cloud-hypervisor/tests/common/utils.rs b/cloud-hypervisor/tests/common/utils.rs index aff1e94ef5..5064e1970e 100644 --- a/cloud-hypervisor/tests/common/utils.rs +++ b/cloud-hypervisor/tests/common/utils.rs @@ -255,41 +255,6 @@ pub(crate) fn prepare_swtpm_daemon(tmp_dir: &TempDir) -> (std::process::Command, (swtpm_command, swtpm_socket_path) } -pub(crate) fn remote_command(api_socket: &str, command: &str, arg: Option<&str>) -> bool { - let mut cmd = Command::new(clh_command("ch-remote")); - cmd.args([&format!("--api-socket={api_socket}"), command]); - - if let Some(arg) = arg { - cmd.arg(arg); - } - let output = cmd.output().unwrap(); - if output.status.success() { - true - } else { - eprintln!("Error running ch-remote command: {:?}", &cmd); - let stderr = String::from_utf8_lossy(&output.stderr); - eprintln!("stderr: {stderr}"); - false - } -} - -pub(crate) fn remote_command_w_output( - api_socket: &str, - command: &str, - arg: Option<&str>, -) -> (bool, Vec) { - let mut cmd = Command::new(clh_command("ch-remote")); - cmd.args([&format!("--api-socket={api_socket}"), command]); - - if let Some(arg) = arg { - cmd.arg(arg); - } - - let output = cmd.output().expect("Failed to launch ch-remote"); - - (output.status.success(), output.stdout) -} - pub(crate) fn resize_command( api_socket: &str, desired_vcpus: Option, diff --git a/test_infra/src/lib.rs b/test_infra/src/lib.rs index a299b2c494..248e4efa34 100644 --- a/test_infra/src/lib.rs +++ b/test_infra/src/lib.rs @@ -1776,6 +1776,42 @@ pub fn clh_command(cmd: &str) -> String { String::from(full_path.to_str().unwrap()) } +pub fn remote_command(api_socket: &str, command: &str, arg: Option<&str>) -> bool { + let mut cmd = Command::new(clh_command("ch-remote")); + 
cmd.args([&format!("--api-socket={api_socket}"), command]); + + if let Some(arg) = arg { + cmd.arg(arg); + } + + let output = cmd.output().unwrap(); + if output.status.success() { + true + } else { + eprintln!("Error running ch-remote command: {:?}", &cmd); + let stderr = String::from_utf8_lossy(&output.stderr); + eprintln!("stderr: {stderr}"); + false + } +} + +pub fn remote_command_w_output( + api_socket: &str, + command: &str, + arg: Option<&str>, +) -> (bool, Vec) { + let mut cmd = Command::new(clh_command("ch-remote")); + cmd.args([&format!("--api-socket={api_socket}"), command]); + + if let Some(arg) = arg { + cmd.arg(arg); + } + + let output = cmd.output().expect("Failed to launch ch-remote"); + + (output.status.success(), output.stdout) +} + pub fn parse_iperf3_output(output: &[u8], sender: bool, bandwidth: bool) -> Result { std::panic::catch_unwind(|| { let s = String::from_utf8_lossy(output); From 1b479e40eaa31ed26aa6dc065ed0ba07eeabee7b Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Wed, 1 Apr 2026 19:15:59 -0700 Subject: [PATCH 468/742] performance-metrics: Remove duplicate remote_command Remove the local remote_command() function from performance_tests.rs. The identical function is now available from test_infra via the existing glob import. 
Signed-off-by: Muminul Islam --- performance-metrics/src/performance_tests.rs | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/performance-metrics/src/performance_tests.rs b/performance-metrics/src/performance_tests.rs index a015f13c63..813963481b 100644 --- a/performance-metrics/src/performance_tests.rs +++ b/performance-metrics/src/performance_tests.rs @@ -124,26 +124,6 @@ fn performance_test_new_guest(disk_config: Box) -> Guest { Guest::new_from_ip_range(disk_config, "172.19", 0) } -fn remote_command(api_socket: &str, command: &str, arg: Option<&str>) -> bool { - let mut cmd = std::process::Command::new(clh_command("ch-remote")); - cmd.args([&format!("--api-socket={api_socket}"), command]); - - if let Some(arg) = arg { - cmd.arg(arg); - } - let output = cmd.output().unwrap(); - if output.status.success() { - true - } else { - eprintln!( - "Error running ch-remote command: {:?}\nstderr: {}", - &cmd, - String::from_utf8_lossy(&output.stderr) - ); - false - } -} - pub fn performance_net_throughput(control: &PerformanceTestControl) -> f64 { let test_timeout = control.test_timeout; let (rx, bandwidth) = control.net_control.unwrap(); From d2f6476149ce7220a5994a3b9cff2c70c7cf3764 Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Mon, 9 Mar 2026 10:02:29 +0100 Subject: [PATCH 469/742] vmm: move migration socket helpers into transport module On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vmm/src/lib.rs | 81 ++++----------------------------- vmm/src/migration_transport.rs | 83 ++++++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+), 71 deletions(-) create mode 100644 vmm/src/migration_transport.rs diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 03edec26b5..071a4abc59 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -6,10 +6,11 @@ use std::collections::HashMap; use std::fs::File; use std::io::{Read, Write, stdout}; -use std::net::{TcpListener, TcpStream}; +use std::net::TcpStream; use 
std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; -use std::os::unix::net::{UnixListener, UnixStream}; +use std::os::unix::net::UnixStream; use std::panic::AssertUnwindSafe; +#[cfg(feature = "guest_debug")] use std::path::PathBuf; use std::sync::mpsc::{Receiver, RecvError, SendError, Sender}; use std::sync::{Arc, Mutex}; @@ -85,6 +86,7 @@ pub mod interrupt; pub mod landlock; pub mod memory_manager; pub mod migration; +pub mod migration_transport; mod pci_segment; pub mod seccomp_filters; mod serial_manager; @@ -1155,73 +1157,6 @@ impl Vmm { Ok(()) } - fn socket_url_to_path(url: &str) -> result::Result { - url.strip_prefix("unix:") - .ok_or_else(|| { - MigratableError::MigrateSend(anyhow!("Could not extract path from URL: {url}")) - }) - .map(|s| s.into()) - } - - fn send_migration_socket( - destination_url: &str, - ) -> std::result::Result { - if let Some(address) = destination_url.strip_prefix("tcp:") { - info!("Connecting to TCP socket at {address}"); - - let socket = TcpStream::connect(address).map_err(|e| { - MigratableError::MigrateSend(anyhow!("Error connecting to TCP socket: {e}")) - })?; - - Ok(SocketStream::Tcp(socket)) - } else { - let path = Vmm::socket_url_to_path(destination_url)?; - info!("Connecting to UNIX socket at {path:?}"); - - let socket = UnixStream::connect(&path).map_err(|e| { - MigratableError::MigrateSend(anyhow!("Error connecting to UNIX socket: {e}")) - })?; - - Ok(SocketStream::Unix(socket)) - } - } - - fn receive_migration_socket( - receiver_url: &str, - ) -> std::result::Result { - if let Some(address) = receiver_url.strip_prefix("tcp:") { - let listener = TcpListener::bind(address).map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Error binding to TCP socket: {e}")) - })?; - - let (socket, _addr) = listener.accept().map_err(|e| { - MigratableError::MigrateReceive(anyhow!( - "Error accepting connection on TCP socket: {e}" - )) - })?; - - Ok(SocketStream::Tcp(socket)) - } else { - let path = Vmm::socket_url_to_path(receiver_url)?; 
- let listener = UnixListener::bind(&path).map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Error binding to UNIX socket: {e}")) - })?; - - let (socket, _addr) = listener.accept().map_err(|e| { - MigratableError::MigrateReceive(anyhow!( - "Error accepting connection on UNIX socket: {e}" - )) - })?; - - // Remove the UNIX socket file after accepting the connection - std::fs::remove_file(&path).map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Error removing UNIX socket file: {e}")) - })?; - - Ok(SocketStream::Unix(socket)) - } - } - /// Transmits the given [`MemoryRangeTable`] over the wire if there is at /// least one region. /// @@ -1440,7 +1375,8 @@ impl Vmm { send_data_migration: &VmSendMigrationData, ) -> result::Result<(), MigratableError> { // Set up the socket connection - let mut socket = Self::send_migration_socket(&send_data_migration.destination_url)?; + let mut socket = + migration_transport::send_migration_socket(&send_data_migration.destination_url)?; // Start the migration Request::start().write_to(&mut socket)?; @@ -2481,7 +2417,8 @@ impl RequestHandler for Vmm { ); // Accept the connection and get the socket - let mut socket = Vmm::receive_migration_socket(&receive_data_migration.receiver_url)?; + let mut socket = + migration_transport::receive_migration_socket(&receive_data_migration.receiver_url)?; event!("vm", "migration-receive-started"); @@ -2610,6 +2547,8 @@ const DEVICE_MANAGER_SNAPSHOT_ID: &str = "device-manager"; #[cfg(test)] mod unit_tests { + use std::path::PathBuf; + use super::*; #[cfg(target_arch = "x86_64")] use crate::vm_config::DebugConsoleConfig; diff --git a/vmm/src/migration_transport.rs b/vmm/src/migration_transport.rs new file mode 100644 index 0000000000..820501a3e9 --- /dev/null +++ b/vmm/src/migration_transport.rs @@ -0,0 +1,83 @@ +// Copyright © 2026 Contributors to the Cloud Hypervisor project +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::net::{TcpListener, TcpStream}; +use 
std::os::unix::net::{UnixListener, UnixStream}; +use std::path::PathBuf; +use std::result::Result; + +use anyhow::anyhow; +use log::info; +use vm_migration::MigratableError; + +use crate::SocketStream; + +/// Extract a UNIX socket path from a "unix:" migration URL. +fn socket_url_to_path(url: &str) -> Result { + url.strip_prefix("unix:") + .ok_or_else(|| anyhow!("Could not extract path from URL: {url}")) + .map(|s| s.into()) +} + +/// Connect to a migration endpoint and return the established stream. +pub(crate) fn send_migration_socket( + destination_url: &str, +) -> Result { + if let Some(address) = destination_url.strip_prefix("tcp:") { + info!("Connecting to TCP socket at {address}"); + + let socket = TcpStream::connect(address).map_err(|e| { + MigratableError::MigrateSend(anyhow!("Error connecting to TCP socket: {e}")) + })?; + + Ok(SocketStream::Tcp(socket)) + } else { + let path = socket_url_to_path(destination_url).map_err(MigratableError::MigrateSend)?; + info!("Connecting to UNIX socket at {path:?}"); + + let socket = UnixStream::connect(&path).map_err(|e| { + MigratableError::MigrateSend(anyhow!("Error connecting to UNIX socket: {e}")) + })?; + + Ok(SocketStream::Unix(socket)) + } +} + +/// Bind and accept a migration connection for the receiver side. 
+pub(crate) fn receive_migration_socket( + receiver_url: &str, +) -> Result { + if let Some(address) = receiver_url.strip_prefix("tcp:") { + let listener = TcpListener::bind(address).map_err(|e| { + MigratableError::MigrateReceive(anyhow!("Error binding to TCP socket: {e}")) + })?; + + let (socket, _addr) = listener.accept().map_err(|e| { + MigratableError::MigrateReceive(anyhow!( + "Error accepting connection on TCP socket: {e}" + )) + })?; + + Ok(SocketStream::Tcp(socket)) + } else { + let path = socket_url_to_path(receiver_url).map_err(MigratableError::MigrateSend)?; + let listener = UnixListener::bind(&path).map_err(|e| { + MigratableError::MigrateReceive(anyhow!("Error binding to UNIX socket: {e}")) + })?; + + let (socket, _addr) = listener.accept().map_err(|e| { + MigratableError::MigrateReceive(anyhow!( + "Error accepting connection on UNIX socket: {e}" + )) + })?; + + // Remove the UNIX socket file after accepting the connection + std::fs::remove_file(&path).map_err(|e| { + MigratableError::MigrateReceive(anyhow!("Error removing UNIX socket file: {e}")) + })?; + + Ok(SocketStream::Unix(socket)) + } +} From bb3e1b4073eb31c30ab0bb01588fef5c58be42ab Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Thu, 26 Mar 2026 17:27:51 +0100 Subject: [PATCH 470/742] vmm: stop removing the UNIX socket file When doing a local migration using a UNIX socket, we removed the UNIX socket file after accepting the connection. The VMM does not own this socket file, which makes this an unsafe operation. Thus, we stop doing that. 
On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vmm/src/migration_transport.rs | 5 ----- 1 file changed, 5 deletions(-) diff --git a/vmm/src/migration_transport.rs b/vmm/src/migration_transport.rs index 820501a3e9..ddc2337b5e 100644 --- a/vmm/src/migration_transport.rs +++ b/vmm/src/migration_transport.rs @@ -73,11 +73,6 @@ pub(crate) fn receive_migration_socket( )) })?; - // Remove the UNIX socket file after accepting the connection - std::fs::remove_file(&path).map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Error removing UNIX socket file: {e}")) - })?; - Ok(SocketStream::Unix(socket)) } } From d4a8d55074da94caf770de126e80507af544b2cc Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Mon, 9 Mar 2026 10:32:06 +0100 Subject: [PATCH 471/742] vmm: extract small request/response helpers to reduce boilerplate On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vmm/src/lib.rs | 14 +++++++------- vmm/src/migration_transport.rs | 21 +++++++++++++++++++++ 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 071a4abc59..5926ac70b1 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -1176,7 +1176,7 @@ impl Vmm { table.write_to(socket)?; // And then the memory itself vm.send_memory_regions(table, socket)?; - Response::read_from(socket)?.ok_or_abandon( + migration_transport::expect_ok_response( socket, MigratableError::MigrateSend(anyhow!("Error during dirty memory migration")), )?; @@ -1379,9 +1379,9 @@ impl Vmm { migration_transport::send_migration_socket(&send_data_migration.destination_url)?; // Start the migration - Request::start().write_to(&mut socket)?; - Response::read_from(&mut socket)?.ok_or_abandon( + migration_transport::send_request_expect_ok( &mut socket, + Request::start(), MigratableError::MigrateSend(anyhow!("Error starting migration")), )?; @@ -1439,7 +1439,7 @@ impl Vmm { socket .write_all(&config_data) 
.map_err(MigratableError::MigrateSocket)?; - Response::read_from(&mut socket)?.ok_or_abandon( + migration_transport::expect_ok_response( &mut socket, MigratableError::MigrateSend(anyhow!("Error during config migration")), )?; @@ -1466,15 +1466,15 @@ impl Vmm { socket .write_all(&snapshot_data) .map_err(MigratableError::MigrateSocket)?; - Response::read_from(&mut socket)?.ok_or_abandon( + migration_transport::expect_ok_response( &mut socket, MigratableError::MigrateSend(anyhow!("Error during state migration")), )?; // Complete the migration // At this step, the receiving VMM will acquire disk locks again. - Request::complete().write_to(&mut socket)?; - Response::read_from(&mut socket)?.ok_or_abandon( + migration_transport::send_request_expect_ok( &mut socket, + Request::complete(), MigratableError::MigrateSend(anyhow!("Error completing migration")), )?; diff --git a/vmm/src/migration_transport.rs b/vmm/src/migration_transport.rs index ddc2337b5e..990f14473e 100644 --- a/vmm/src/migration_transport.rs +++ b/vmm/src/migration_transport.rs @@ -11,6 +11,7 @@ use std::result::Result; use anyhow::anyhow; use log::info; use vm_migration::MigratableError; +use vm_migration::protocol::{Request, Response}; use crate::SocketStream; @@ -76,3 +77,23 @@ pub(crate) fn receive_migration_socket( Ok(SocketStream::Unix(socket)) } } + +/// Read a response and return Ok(()) if it was a [`Response::Ok`]. +pub(crate) fn expect_ok_response( + socket: &mut SocketStream, + error: MigratableError, +) -> Result<(), MigratableError> { + Response::read_from(socket)? + .ok_or_abandon(socket, error) + .map(|_| ()) +} + +/// Send a request and validate that the peer responds with OK. 
+pub(crate) fn send_request_expect_ok( + socket: &mut SocketStream, + request: Request, + error: MigratableError, +) -> Result<(), MigratableError> { + request.write_to(socket)?; + expect_ok_response(socket, error) +} From 196e48af308f0201e856f38209e9bbdc1850756c Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Mon, 9 Mar 2026 10:41:27 +0100 Subject: [PATCH 472/742] vmm: extract a helper to send the VM config This further decreases boilerplate code in lib.rs while keeping the behavior. On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vmm/src/lib.rs | 10 +--------- vmm/src/migration_transport.rs | 24 ++++++++++++++++++++++-- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 5926ac70b1..78ff205d31 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -1434,15 +1434,7 @@ impl Vmm { common_cpuid, memory_manager_data: vm.memory_manager_data(), }; - let config_data = serde_json::to_vec(&vm_migration_config).unwrap(); - Request::config(config_data.len() as u64).write_to(&mut socket)?; - socket - .write_all(&config_data) - .map_err(MigratableError::MigrateSocket)?; - migration_transport::expect_ok_response( - &mut socket, - MigratableError::MigrateSend(anyhow!("Error during config migration")), - )?; + migration_transport::send_config(&mut socket, &vm_migration_config)?; // Let every Migratable object know about the migration being started. 
vm.start_migration()?; diff --git a/vmm/src/migration_transport.rs b/vmm/src/migration_transport.rs index 990f14473e..6cf3e5bd12 100644 --- a/vmm/src/migration_transport.rs +++ b/vmm/src/migration_transport.rs @@ -3,17 +3,19 @@ // SPDX-License-Identifier: Apache-2.0 // +use std::io::Write; use std::net::{TcpListener, TcpStream}; use std::os::unix::net::{UnixListener, UnixStream}; use std::path::PathBuf; use std::result::Result; -use anyhow::anyhow; +use anyhow::{Context, anyhow}; use log::info; +use serde_json; use vm_migration::MigratableError; use vm_migration::protocol::{Request, Response}; -use crate::SocketStream; +use crate::{SocketStream, VmMigrationConfig}; /// Extract a UNIX socket path from a "unix:" migration URL. fn socket_url_to_path(url: &str) -> Result { @@ -97,3 +99,21 @@ pub(crate) fn send_request_expect_ok( request.write_to(socket)?; expect_ok_response(socket, error) } + +/// Serialize and send the VM configuration payload. +pub(crate) fn send_config( + socket: &mut SocketStream, + config: &VmMigrationConfig, +) -> Result<(), MigratableError> { + let config_data = serde_json::to_vec(config) + .context("Error serializing VM migration config") + .map_err(MigratableError::MigrateSend)?; + Request::config(config_data.len() as u64).write_to(socket)?; + socket + .write_all(&config_data) + .map_err(MigratableError::MigrateSocket)?; + expect_ok_response( + socket, + MigratableError::MigrateSend(anyhow!("Error during config migration")), + ) +} From 693cdccbb01d5f2e83a0f3f34ece944a62e0e196 Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Mon, 9 Mar 2026 10:54:06 +0100 Subject: [PATCH 473/742] vmm: extract a helper to send the VM state On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vmm/src/lib.rs | 10 +--------- vmm/src/migration_transport.rs | 20 +++++++++++++++++++- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 78ff205d31..75de067af3 100644 --- a/vmm/src/lib.rs +++ 
b/vmm/src/lib.rs @@ -1453,15 +1453,7 @@ impl Vmm { // Capture snapshot and send it let vm_snapshot = vm.snapshot()?; - let snapshot_data = serde_json::to_vec(&vm_snapshot).unwrap(); - Request::state(snapshot_data.len() as u64).write_to(&mut socket)?; - socket - .write_all(&snapshot_data) - .map_err(MigratableError::MigrateSocket)?; - migration_transport::expect_ok_response( - &mut socket, - MigratableError::MigrateSend(anyhow!("Error during state migration")), - )?; + migration_transport::send_state(&mut socket, &vm_snapshot)?; // Complete the migration // At this step, the receiving VMM will acquire disk locks again. migration_transport::send_request_expect_ok( diff --git a/vmm/src/migration_transport.rs b/vmm/src/migration_transport.rs index 6cf3e5bd12..99d0e12dfb 100644 --- a/vmm/src/migration_transport.rs +++ b/vmm/src/migration_transport.rs @@ -12,8 +12,8 @@ use std::result::Result; use anyhow::{Context, anyhow}; use log::info; use serde_json; -use vm_migration::MigratableError; use vm_migration::protocol::{Request, Response}; +use vm_migration::{MigratableError, Snapshot}; use crate::{SocketStream, VmMigrationConfig}; @@ -117,3 +117,21 @@ pub(crate) fn send_config( MigratableError::MigrateSend(anyhow!("Error during config migration")), ) } + +/// Serialize and send the VM snapshot payload. 
+pub(crate) fn send_state( + socket: &mut SocketStream, + snapshot: &Snapshot, +) -> Result<(), MigratableError> { + let snapshot_data = serde_json::to_vec(snapshot) + .context("Error serializing VM snapshot") + .map_err(MigratableError::MigrateSend)?; + Request::state(snapshot_data.len() as u64).write_to(socket)?; + socket + .write_all(&snapshot_data) + .map_err(MigratableError::MigrateSocket)?; + expect_ok_response( + socket, + MigratableError::MigrateSend(anyhow!("Error during state migration")), + ) +} From e03c0f7708db3b662a607933b9595d1a0ed3b93e Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Mon, 9 Mar 2026 10:13:09 +0100 Subject: [PATCH 474/742] vmm: move function to send dirty pages into transport module On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vmm/src/lib.rs | 31 ++----------------------------- vmm/src/migration_transport.rs | 30 +++++++++++++++++++++++++++++- 2 files changed, 31 insertions(+), 30 deletions(-) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 75de067af3..db84542e30 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -1157,33 +1157,6 @@ impl Vmm { Ok(()) } - /// Transmits the given [`MemoryRangeTable`] over the wire if there is at - /// least one region. - /// - /// Sends a memory migration request, the range table, and the corresponding - /// guest memory regions over the given socket. Waits for acknowledgment - /// from the destination. 
- fn vm_send_dirty_pages( - vm: &mut Vm, - socket: &mut SocketStream, - table: &MemoryRangeTable, - ) -> result::Result<(), MigratableError> { - if table.regions().is_empty() { - return Ok(()); - } - - Request::memory(table.length()).write_to(socket)?; - table.write_to(socket)?; - // And then the memory itself - vm.send_memory_regions(table, socket)?; - migration_transport::expect_ok_response( - socket, - MigratableError::MigrateSend(anyhow!("Error during dirty memory migration")), - )?; - - Ok(()) - } - /// Performs the initial memory transmission (iteration zero) plus a /// variable number of memory iterations with the goal to eventually migrate /// the VM in a reasonably small downtime. @@ -1214,7 +1187,7 @@ impl Vmm { // Send the current dirty pages let transfer_begin = Instant::now(); - Self::vm_send_dirty_pages(vm, socket, &iteration_table)?; + migration_transport::vm_send_dirty_pages(vm, socket, &iteration_table)?; let transfer_duration = transfer_begin.elapsed(); ctx.update_metrics_after_transfer(transfer_begin, transfer_duration); @@ -1356,7 +1329,7 @@ impl Vmm { ctx.update_metrics_before_transfer(iteration_begin, &final_table); let transfer_begin = Instant::now(); - Vmm::vm_send_dirty_pages(vm, socket, &final_table)?; + migration_transport::vm_send_dirty_pages(vm, socket, &final_table)?; let transfer_duration = transfer_begin.elapsed(); ctx.update_metrics_after_transfer(transfer_begin, transfer_duration); ctx.iteration += 1; diff --git a/vmm/src/migration_transport.rs b/vmm/src/migration_transport.rs index 99d0e12dfb..fd3e035e11 100644 --- a/vmm/src/migration_transport.rs +++ b/vmm/src/migration_transport.rs @@ -12,9 +12,10 @@ use std::result::Result; use anyhow::{Context, anyhow}; use log::info; use serde_json; -use vm_migration::protocol::{Request, Response}; +use vm_migration::protocol::{MemoryRangeTable, Request, Response}; use vm_migration::{MigratableError, Snapshot}; +use crate::vm::Vm; use crate::{SocketStream, VmMigrationConfig}; /// Extract a 
UNIX socket path from a "unix:" migration URL. @@ -135,3 +136,30 @@ pub(crate) fn send_state( MigratableError::MigrateSend(anyhow!("Error during state migration")), ) } + +/// Transmits the given [`MemoryRangeTable`] over the wire if there is at +/// least one region. +/// +/// Sends a memory migration request, the range table, and the corresponding +/// guest memory regions over the given socket. Waits for acknowledgment +/// from the destination. +pub(crate) fn vm_send_dirty_pages( + vm: &mut Vm, + socket: &mut SocketStream, + table: &MemoryRangeTable, +) -> Result<(), MigratableError> { + if table.regions().is_empty() { + return Ok(()); + } + + Request::memory(table.length()).write_to(socket)?; + table.write_to(socket)?; + // And then the memory itself + vm.send_memory_regions(table, socket)?; + expect_ok_response( + socket, + MigratableError::MigrateSend(anyhow!("Error during dirty memory migration")), + )?; + + Ok(()) +} From 9248143e18d8bd4fb13e2b7f81dc5f4f95ebdc86 Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Mon, 9 Mar 2026 16:19:41 +0100 Subject: [PATCH 475/742] vmm: extract send_memory_regions from vm And rename it for better naming consistency. 
On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vmm/src/lib.rs | 4 +-- vmm/src/migration_transport.rs | 56 +++++++++++++++++++++++++--------- vmm/src/vm.rs | 49 +++-------------------------- 3 files changed, 48 insertions(+), 61 deletions(-) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index db84542e30..1a9c53dd31 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -1187,7 +1187,7 @@ impl Vmm { // Send the current dirty pages let transfer_begin = Instant::now(); - migration_transport::vm_send_dirty_pages(vm, socket, &iteration_table)?; + migration_transport::send_memory_ranges(&vm.guest_memory(), &iteration_table, socket)?; let transfer_duration = transfer_begin.elapsed(); ctx.update_metrics_after_transfer(transfer_begin, transfer_duration); @@ -1329,7 +1329,7 @@ impl Vmm { ctx.update_metrics_before_transfer(iteration_begin, &final_table); let transfer_begin = Instant::now(); - migration_transport::vm_send_dirty_pages(vm, socket, &final_table)?; + migration_transport::send_memory_ranges(&vm.guest_memory(), &final_table, socket)?; let transfer_duration = transfer_begin.elapsed(); ctx.update_metrics_after_transfer(transfer_begin, transfer_duration); ctx.iteration += 1; diff --git a/vmm/src/migration_transport.rs b/vmm/src/migration_transport.rs index fd3e035e11..22dac6f71d 100644 --- a/vmm/src/migration_transport.rs +++ b/vmm/src/migration_transport.rs @@ -12,11 +12,11 @@ use std::result::Result; use anyhow::{Context, anyhow}; use log::info; use serde_json; +use vm_memory::{Bytes, GuestAddress, GuestAddressSpace, GuestMemoryAtomic}; use vm_migration::protocol::{MemoryRangeTable, Request, Response}; use vm_migration::{MigratableError, Snapshot}; -use crate::vm::Vm; -use crate::{SocketStream, VmMigrationConfig}; +use crate::{GuestMemoryMmap, SocketStream, VmMigrationConfig}; /// Extract a UNIX socket path from a "unix:" migration URL. 
fn socket_url_to_path(url: &str) -> Result { @@ -137,29 +137,55 @@ pub(crate) fn send_state( ) } -/// Transmits the given [`MemoryRangeTable`] over the wire if there is at -/// least one region. +/// Transmits the given [`MemoryRangeTable`] and the corresponding guest memory +/// content over the wire if there is at least one range. /// /// Sends a memory migration request, the range table, and the corresponding -/// guest memory regions over the given socket. Waits for acknowledgment +/// guest memory range over the given socket. Waits for acknowledgment /// from the destination. -pub(crate) fn vm_send_dirty_pages( - vm: &mut Vm, +pub(crate) fn send_memory_ranges( + guest_memory: &GuestMemoryAtomic, + ranges: &MemoryRangeTable, socket: &mut SocketStream, - table: &MemoryRangeTable, ) -> Result<(), MigratableError> { - if table.regions().is_empty() { + if ranges.regions().is_empty() { return Ok(()); } - Request::memory(table.length()).write_to(socket)?; - table.write_to(socket)?; + // Send the memory table + Request::memory(ranges.length()).write_to(socket)?; + ranges.write_to(socket)?; + // And then the memory itself - vm.send_memory_regions(table, socket)?; + let mem = guest_memory.memory(); + for range in ranges.regions() { + let mut offset: u64 = 0; + // Here we are manually handling the retry in case we can't read the + // whole region at once because we can't use the implementation + // from vm-memory::GuestMemory of write_all_to() as it is not + // following the correct behavior. 
For more info about this issue + // see: https://github.com/rust-vmm/vm-memory/issues/174 + loop { + let bytes_written = mem + .write_volatile_to( + GuestAddress(range.gpa + offset), + socket, + (range.length - offset) as usize, + ) + .map_err(|e| { + MigratableError::MigrateSend(anyhow!( + "Error transferring memory to socket: {e}" + )) + })?; + offset += bytes_written as u64; + + if offset == range.length { + break; + } + } + } expect_ok_response( socket, MigratableError::MigrateSend(anyhow!("Error during dirty memory migration")), - )?; - - Ok(()) + ) } diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index 7a5a19133a..c0eaba7ed6 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -68,7 +68,7 @@ use vm_device::Bus; use vm_memory::GuestMemory; #[cfg(feature = "tdx")] use vm_memory::{Address, ByteValued, GuestMemoryRegion, ReadVolatile}; -use vm_memory::{Bytes, GuestAddress, GuestAddressSpace, GuestMemoryAtomic, WriteVolatile}; +use vm_memory::{Bytes, GuestAddress, GuestAddressSpace, GuestMemoryAtomic}; use vm_migration::protocol::{MemoryRangeTable, Request, Response}; use vm_migration::{ Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable, snapshot_from_id, @@ -2853,49 +2853,6 @@ impl Vm { Ok(()) } - /// Writes the contents of the given guest memory regions to the provided sink. - /// Used, for example, during VM live migration to transfer memory to a socket. - pub fn send_memory_regions( - &mut self, - ranges: &MemoryRangeTable, - fd: &mut F, - ) -> std::result::Result<(), MigratableError> - where - F: WriteVolatile, - { - let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); - let mem = guest_memory.memory(); - - for range in ranges.regions() { - let mut offset: u64 = 0; - // Here we are manually handling the retry in case we can't the - // whole region at once because we can't use the implementation - // from vm-memory::GuestMemory of write_all_to() as it is not - // following the correct behavior. 
For more info about this issue - // see: https://github.com/rust-vmm/vm-memory/issues/174 - loop { - let bytes_written = mem - .write_volatile_to( - GuestAddress(range.gpa + offset), - fd, - (range.length - offset) as usize, - ) - .map_err(|e| { - MigratableError::MigrateSend(anyhow!( - "Error transferring memory to socket: {e}" - )) - })?; - offset += bytes_written as u64; - - if offset == range.length { - break; - } - } - } - - Ok(()) - } - pub fn memory_range_table(&self) -> std::result::Result { self.memory_manager .lock() @@ -2903,6 +2860,10 @@ impl Vm { .memory_range_table(false) } + pub fn guest_memory(&self) -> GuestMemoryAtomic { + self.memory_manager.lock().unwrap().guest_memory() + } + pub fn device_tree(&self) -> Arc> { self.device_manager.lock().unwrap().device_tree() } From e175ad64f2403c030e781f2c7d90166030f5c6c8 Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Tue, 10 Mar 2026 09:47:19 +0100 Subject: [PATCH 476/742] vmm: move SocketStream into the migration_transport module This is mainly to clean up the lib.rs a bit more. 
On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vmm/src/lib.rs | 90 ++------------------------------ vmm/src/migration_transport.rs | 95 ++++++++++++++++++++++++++++++++-- 2 files changed, 95 insertions(+), 90 deletions(-) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 1a9c53dd31..c08ec9e28d 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -6,9 +6,7 @@ use std::collections::HashMap; use std::fs::File; use std::io::{Read, Write, stdout}; -use std::net::TcpStream; use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; -use std::os::unix::net::UnixStream; use std::panic::AssertUnwindSafe; #[cfg(feature = "guest_debug")] use std::path::PathBuf; @@ -38,8 +36,8 @@ use serde::{Deserialize, Serialize}; use signal_hook::iterator::{Handle, Signals}; use thiserror::Error; use tracer::trace_scoped; -use vm_memory::bitmap::{AtomicBitmap, BitmapSlice}; -use vm_memory::{ReadVolatile, VolatileMemoryError, VolatileSlice, WriteVolatile}; +use vm_memory::ReadVolatile; +use vm_memory::bitmap::AtomicBitmap; use vm_migration::protocol::*; use vm_migration::{ MemoryMigrationContext, Migratable, MigratableError, Pausable, Snapshot, Snapshottable, @@ -61,6 +59,7 @@ use crate::memory_manager::MemoryManager; #[cfg(all(feature = "kvm", target_arch = "x86_64"))] use crate::migration::get_vm_snapshot; use crate::migration::{recv_vm_config, recv_vm_state}; +use crate::migration_transport::SocketStream; use crate::seccomp_filters::{Thread, get_seccomp_filter}; use crate::vm::{Error as VmError, Vm, VmState}; use crate::vm_config::{ @@ -265,89 +264,6 @@ impl From for EpollDispatch { } } -enum SocketStream { - Unix(UnixStream), - Tcp(TcpStream), -} - -impl Read for SocketStream { - fn read(&mut self, buf: &mut [u8]) -> std::io::Result { - match self { - SocketStream::Unix(stream) => stream.read(buf), - SocketStream::Tcp(stream) => stream.read(buf), - } - } -} - -impl Write for SocketStream { - fn write(&mut self, buf: &[u8]) -> std::io::Result { - match self { - 
SocketStream::Unix(stream) => stream.write(buf), - SocketStream::Tcp(stream) => stream.write(buf), - } - } - - fn flush(&mut self) -> std::io::Result<()> { - match self { - SocketStream::Unix(stream) => stream.flush(), - SocketStream::Tcp(stream) => stream.flush(), - } - } -} - -impl AsRawFd for SocketStream { - fn as_raw_fd(&self) -> RawFd { - match self { - SocketStream::Unix(s) => s.as_raw_fd(), - SocketStream::Tcp(s) => s.as_raw_fd(), - } - } -} - -impl ReadVolatile for SocketStream { - fn read_volatile( - &mut self, - buf: &mut VolatileSlice, - ) -> std::result::Result { - match self { - SocketStream::Unix(s) => s.read_volatile(buf), - SocketStream::Tcp(s) => s.read_volatile(buf), - } - } - - fn read_exact_volatile( - &mut self, - buf: &mut VolatileSlice, - ) -> std::result::Result<(), VolatileMemoryError> { - match self { - SocketStream::Unix(s) => s.read_exact_volatile(buf), - SocketStream::Tcp(s) => s.read_exact_volatile(buf), - } - } -} - -impl WriteVolatile for SocketStream { - fn write_volatile( - &mut self, - buf: &VolatileSlice, - ) -> std::result::Result { - match self { - SocketStream::Unix(s) => s.write_volatile(buf), - SocketStream::Tcp(s) => s.write_volatile(buf), - } - } - - fn write_all_volatile( - &mut self, - buf: &VolatileSlice, - ) -> std::result::Result<(), VolatileMemoryError> { - match self { - SocketStream::Unix(s) => s.write_all_volatile(buf), - SocketStream::Tcp(s) => s.write_all_volatile(buf), - } - } -} - pub struct EpollContext { epoll_file: File, } diff --git a/vmm/src/migration_transport.rs b/vmm/src/migration_transport.rs index 22dac6f71d..168d718be7 100644 --- a/vmm/src/migration_transport.rs +++ b/vmm/src/migration_transport.rs @@ -3,8 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 // -use std::io::Write; +use std::io::{self, Read, Write}; use std::net::{TcpListener, TcpStream}; +use std::os::unix::io::{AsRawFd, RawFd}; use std::os::unix::net::{UnixListener, UnixStream}; use std::path::PathBuf; use std::result::Result; @@ 
-12,11 +13,99 @@ use std::result::Result; use anyhow::{Context, anyhow}; use log::info; use serde_json; -use vm_memory::{Bytes, GuestAddress, GuestAddressSpace, GuestMemoryAtomic}; +use vm_memory::bitmap::BitmapSlice; +use vm_memory::{ + Bytes, GuestAddress, GuestAddressSpace, GuestMemoryAtomic, ReadVolatile, VolatileMemoryError, + VolatileSlice, WriteVolatile, +}; use vm_migration::protocol::{MemoryRangeTable, Request, Response}; use vm_migration::{MigratableError, Snapshot}; -use crate::{GuestMemoryMmap, SocketStream, VmMigrationConfig}; +use crate::{GuestMemoryMmap, VmMigrationConfig}; + +/// Transport-agnostic stream used by the migration protocol. +pub(crate) enum SocketStream { + Unix(UnixStream), + Tcp(TcpStream), +} + +impl Read for SocketStream { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + match self { + SocketStream::Unix(stream) => stream.read(buf), + SocketStream::Tcp(stream) => stream.read(buf), + } + } +} + +impl Write for SocketStream { + fn write(&mut self, buf: &[u8]) -> io::Result { + match self { + SocketStream::Unix(stream) => stream.write(buf), + SocketStream::Tcp(stream) => stream.write(buf), + } + } + + fn flush(&mut self) -> io::Result<()> { + match self { + SocketStream::Unix(stream) => stream.flush(), + SocketStream::Tcp(stream) => stream.flush(), + } + } +} + +impl AsRawFd for SocketStream { + fn as_raw_fd(&self) -> RawFd { + match self { + SocketStream::Unix(s) => s.as_raw_fd(), + SocketStream::Tcp(s) => s.as_raw_fd(), + } + } +} + +impl ReadVolatile for SocketStream { + fn read_volatile( + &mut self, + buf: &mut VolatileSlice, + ) -> Result { + match self { + SocketStream::Unix(s) => s.read_volatile(buf), + SocketStream::Tcp(s) => s.read_volatile(buf), + } + } + + fn read_exact_volatile( + &mut self, + buf: &mut VolatileSlice, + ) -> Result<(), VolatileMemoryError> { + match self { + SocketStream::Unix(s) => s.read_exact_volatile(buf), + SocketStream::Tcp(s) => s.read_exact_volatile(buf), + } + } +} + +impl WriteVolatile for 
SocketStream { + fn write_volatile( + &mut self, + buf: &VolatileSlice, + ) -> Result { + match self { + SocketStream::Unix(s) => s.write_volatile(buf), + SocketStream::Tcp(s) => s.write_volatile(buf), + } + } + + fn write_all_volatile( + &mut self, + buf: &VolatileSlice, + ) -> Result<(), VolatileMemoryError> { + match self { + SocketStream::Unix(s) => s.write_all_volatile(buf), + SocketStream::Tcp(s) => s.write_all_volatile(buf), + } + } +} /// Extract a UNIX socket path from a "unix:" migration URL. fn socket_url_to_path(url: &str) -> Result { From ec42ee8004064c2038f7d659b64fe5df80def1ea Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Tue, 10 Mar 2026 10:07:10 +0100 Subject: [PATCH 477/742] vmm: extract receive_memory_regions from memory manager The memory manager is guarded by a mutex, thus parallel accesses to it and its members are not possible. But we have to execute this function in parallel when we introduce multiple TCP connections. Otherwise, the workers who receive the data and write it into guest memory will block on each other, and thus slow down the migration. Also rename the function to receive_memory_ranges for better naming consistency. 
On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vmm/src/lib.rs | 16 ++++--------- vmm/src/memory_manager.rs | 43 +--------------------------------- vmm/src/migration_transport.rs | 38 ++++++++++++++++++++++++++++++ 3 files changed, 44 insertions(+), 53 deletions(-) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index c08ec9e28d..c2fb3e9e68 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -36,7 +36,6 @@ use serde::{Deserialize, Serialize}; use signal_hook::iterator::{Handle, Signals}; use thiserror::Error; use tracer::trace_scoped; -use vm_memory::ReadVolatile; use vm_memory::bitmap::AtomicBitmap; use vm_migration::protocol::*; use vm_migration::{ @@ -1056,21 +1055,16 @@ impl Vmm { Ok(()) } - fn vm_receive_memory( + fn vm_receive_memory( &mut self, req: &Request, - socket: &mut T, + socket: &mut SocketStream, memory_manager: &mut MemoryManager, - ) -> std::result::Result<(), MigratableError> - where - T: Read + ReadVolatile, - { - // Read table + ) -> std::result::Result<(), MigratableError> { let table = MemoryRangeTable::read_from(socket, req.length())?; - // And then read the memory itself - memory_manager.receive_memory_regions(&table, socket)?; - Ok(()) + // And then the memory itself + migration_transport::receive_memory_ranges(&memory_manager.guest_memory(), &table, socket) } /// Performs the initial memory transmission (iteration zero) plus a diff --git a/vmm/src/memory_manager.rs b/vmm/src/memory_manager.rs index c8f64b15ad..62b4522cc2 100644 --- a/vmm/src/memory_manager.rs +++ b/vmm/src/memory_manager.rs @@ -39,7 +39,7 @@ use vm_memory::guest_memory::{Error as MmapError, FileOffset}; use vm_memory::mmap::MmapRegionError; use vm_memory::{ Address, Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, - GuestMemoryError, GuestMemoryRegion, GuestUsize, MmapRegion, ReadVolatile, + GuestMemoryError, GuestMemoryRegion, GuestUsize, MmapRegion, }; use vm_migration::protocol::{MemoryRange, MemoryRangeTable}; use 
vm_migration::{ @@ -2572,47 +2572,6 @@ impl MemoryManager { debug!("coredump total bytes {total_bytes}"); Ok(()) } - - pub fn receive_memory_regions( - &mut self, - ranges: &MemoryRangeTable, - fd: &mut F, - ) -> std::result::Result<(), MigratableError> - where - F: ReadVolatile, - { - let guest_memory = self.guest_memory(); - let mem = guest_memory.memory(); - - for range in ranges.regions() { - let mut offset: u64 = 0; - // Here we are manually handling the retry in case we can't the - // whole region at once because we can't use the implementation - // from vm-memory::GuestMemory of read_exact_from() as it is not - // following the correct behavior. For more info about this issue - // see: https://github.com/rust-vmm/vm-memory/issues/174 - loop { - let bytes_read = mem - .read_volatile_from( - GuestAddress(range.gpa + offset), - fd, - (range.length - offset) as usize, - ) - .map_err(|e| { - MigratableError::MigrateReceive(anyhow!( - "Error receiving memory from socket: {e}" - )) - })?; - offset += bytes_read as u64; - - if offset == range.length { - break; - } - } - } - - Ok(()) - } } struct MemoryNotify { diff --git a/vmm/src/migration_transport.rs b/vmm/src/migration_transport.rs index 168d718be7..12412b0a04 100644 --- a/vmm/src/migration_transport.rs +++ b/vmm/src/migration_transport.rs @@ -278,3 +278,41 @@ pub(crate) fn send_memory_ranges( MigratableError::MigrateSend(anyhow!("Error during dirty memory migration")), ) } + +/// Receive memory contents for the given range table into guest memory. 
+pub(crate) fn receive_memory_ranges( + guest_memory: &GuestMemoryAtomic, + ranges: &MemoryRangeTable, + socket: &mut SocketStream, +) -> Result<(), MigratableError> { + let mem = guest_memory.memory(); + + for range in ranges.regions() { + let mut offset: u64 = 0; + // Here we are manually handling the retry in case we can't read the + // whole region at once because we can't use the implementation + // from vm-memory::GuestMemory of read_exact_from() as it is not + // following the correct behavior. For more info about this issue + // see: https://github.com/rust-vmm/vm-memory/issues/174 + loop { + let bytes_read = mem + .read_volatile_from( + GuestAddress(range.gpa + offset), + socket, + (range.length - offset) as usize, + ) + .map_err(|e| { + MigratableError::MigrateReceive(anyhow!( + "Error receiving memory from socket: {e}" + )) + })?; + offset += bytes_read as u64; + + if offset == range.length { + break; + } + } + } + + Ok(()) +} From 765311085f06d864d32ae9644f80bfd839977962 Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Tue, 10 Mar 2026 10:26:59 +0100 Subject: [PATCH 478/742] vmm: keep direct reference to guest memory around That way we avoid having to grab a lock when receiving a chunk of memory over the migration socket. This is a necessary prerequisite for having multiple memory receiving threads. 
On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vmm/src/lib.rs | 55 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 20 deletions(-) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index c2fb3e9e68..f118e93e92 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -36,6 +36,7 @@ use serde::{Deserialize, Serialize}; use signal_hook::iterator::{Handle, Signals}; use thiserror::Error; use tracer::trace_scoped; +use vm_memory::GuestMemoryAtomic; use vm_memory::bitmap::AtomicBitmap; use vm_migration::protocol::*; use vm_migration::{ @@ -622,8 +623,16 @@ enum ReceiveMigrationState { /// We received file descriptors for memory. This can only happen on UNIX domain sockets. MemoryFdsReceived(Vec<(u32, File)>), - /// We received the VM configuration. We keep the memory configuration around to populate guest memory. From this point on, the sender can start sending memory updates. - Configured(Arc>), + /// We received the VM configuration. We keep a direct reference to the guest memory + /// around to populate it without having to acquire a lock (which we would have to do + /// when accessing the memory through the memory manager). + /// + /// We keep the memory manager around to pass it into the next state. From this point + /// on, the sender can start sending memory updates. + Configured( + Arc>, + GuestMemoryAtomic, + ), /// Memory is populated and we received the state. The VM is ready to go. 
StateReceived, @@ -835,14 +844,19 @@ impl Vmm { ))) }; - let mut configure_vm = - |socket: &mut SocketStream, - memory_files: HashMap| - -> std::result::Result>, MigratableError> { - let memory_manager = self.vm_receive_config(req, socket, memory_files)?; - - Ok(memory_manager) - }; + let mut configure_vm = |socket: &mut SocketStream, + memory_files: HashMap| + -> std::result::Result< + ( + Arc>, + GuestMemoryAtomic, + ), + MigratableError, + > { + let memory_manager = self.vm_receive_config(req, socket, memory_files)?; + let guest_memory = memory_manager.lock().unwrap().guest_memory(); + Ok((memory_manager, guest_memory)) + }; let recv_memory_fd = |socket: &mut SocketStream, mut memory_files: Vec<(u32, File)>| @@ -865,23 +879,24 @@ impl Vmm { }, Started => match req.command() { Command::MemoryFd => recv_memory_fd(socket, Vec::new()).map(MemoryFdsReceived), - Command::Config => configure_vm(socket, Default::default()).map(Configured), + Command::Config => { + configure_vm(socket, Default::default()).map(|res| Configured(res.0, res.1)) + } _ => invalid_command(), }, MemoryFdsReceived(memory_files) => match req.command() { Command::MemoryFd => recv_memory_fd(socket, memory_files).map(MemoryFdsReceived), - Command::Config => { - configure_vm(socket, HashMap::from_iter(memory_files)).map(Configured) - } + Command::Config => configure_vm(socket, HashMap::from_iter(memory_files)) + .map(|res| Configured(res.0, res.1)), _ => invalid_command(), }, - Configured(memory_manager) => match req.command() { + Configured(memory_manager, guest_memory) => match req.command() { Command::Memory => { - self.vm_receive_memory(req, socket, &mut memory_manager.lock().unwrap())?; - Ok(Configured(memory_manager)) + self.vm_receive_memory(req, socket, &guest_memory)?; + Ok(Configured(memory_manager, guest_memory)) } Command::State => { - self.vm_receive_state(req, socket, memory_manager.clone())?; + self.vm_receive_state(req, socket, memory_manager)?; Ok(StateReceived) } _ => 
invalid_command(), @@ -1059,12 +1074,12 @@ impl Vmm { &mut self, req: &Request, socket: &mut SocketStream, - memory_manager: &mut MemoryManager, + guest_mem: &GuestMemoryAtomic, ) -> std::result::Result<(), MigratableError> { let table = MemoryRangeTable::read_from(socket, req.length())?; // And then the memory itself - migration_transport::receive_memory_ranges(&memory_manager.guest_memory(), &table, socket) + migration_transport::receive_memory_ranges(guest_mem, &table, socket) } /// Performs the initial memory transmission (iteration zero) plus a From 7311211b38d018fa1743e49b4903a6590ae412ef Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Tue, 10 Mar 2026 11:17:20 +0100 Subject: [PATCH 479/742] vmm: allow keeping the socket listener around This allows accepting multiple connections in the migration receive path. On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vmm/src/lib.rs | 5 +-- vmm/src/migration_transport.rs | 63 ++++++++++++++++++++-------------- 2 files changed, 40 insertions(+), 28 deletions(-) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index f118e93e92..311ec84b88 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -2298,9 +2298,10 @@ impl RequestHandler for Vmm { receive_data_migration.receiver_url ); + let mut listener = + migration_transport::receive_migration_listener(&receive_data_migration.receiver_url)?; // Accept the connection and get the socket - let mut socket = - migration_transport::receive_migration_socket(&receive_data_migration.receiver_url)?; + let mut socket = listener.accept()?; event!("vm", "migration-receive-started"); diff --git a/vmm/src/migration_transport.rs b/vmm/src/migration_transport.rs index 12412b0a04..a33ee503cd 100644 --- a/vmm/src/migration_transport.rs +++ b/vmm/src/migration_transport.rs @@ -23,6 +23,31 @@ use vm_migration::{MigratableError, Snapshot}; use crate::{GuestMemoryMmap, VmMigrationConfig}; +/// Transport-agnostic listener used to receive connections. 
+#[derive(Debug)] +pub(crate) enum ReceiveListener { + Tcp(TcpListener), + Unix(UnixListener), +} + +impl ReceiveListener { + /// Block until a connection is accepted. + pub(crate) fn accept(&mut self) -> Result { + match self { + ReceiveListener::Tcp(listener) => listener + .accept() + .map(|(socket, _)| SocketStream::Tcp(socket)) + .context("Failed to accept TCP migration connection") + .map_err(MigratableError::MigrateReceive), + ReceiveListener::Unix(listener) => listener + .accept() + .map(|(socket, _)| SocketStream::Unix(socket)) + .context("Failed to accept Unix migration connection") + .map_err(MigratableError::MigrateReceive), + } + } +} + /// Transport-agnostic stream used by the migration protocol. pub(crate) enum SocketStream { Unix(UnixStream), @@ -138,35 +163,21 @@ pub(crate) fn send_migration_socket( } } -/// Bind and accept a migration connection for the receiver side. -pub(crate) fn receive_migration_socket( +/// Bind a migration listener for the receiver side. +pub(crate) fn receive_migration_listener( receiver_url: &str, -) -> Result { +) -> Result { if let Some(address) = receiver_url.strip_prefix("tcp:") { - let listener = TcpListener::bind(address).map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Error binding to TCP socket: {e}")) - })?; - - let (socket, _addr) = listener.accept().map_err(|e| { - MigratableError::MigrateReceive(anyhow!( - "Error accepting connection on TCP socket: {e}" - )) - })?; - - Ok(SocketStream::Tcp(socket)) + TcpListener::bind(address) + .map(ReceiveListener::Tcp) + .context("Error binding to TCP socket") + .map_err(MigratableError::MigrateReceive) } else { - let path = socket_url_to_path(receiver_url).map_err(MigratableError::MigrateSend)?; - let listener = UnixListener::bind(&path).map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Error binding to UNIX socket: {e}")) - })?; - - let (socket, _addr) = listener.accept().map_err(|e| { - MigratableError::MigrateReceive(anyhow!( - "Error accepting connection 
on UNIX socket: {e}" - )) - })?; - - Ok(SocketStream::Unix(socket)) + let path = socket_url_to_path(receiver_url).map_err(MigratableError::MigrateReceive)?; + UnixListener::bind(&path) + .map(ReceiveListener::Unix) + .context("Error binding to UNIX socket") + .map_err(MigratableError::MigrateReceive) } } From 98ece1e34713672118dd2aa08a347174f8e0e98a Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Thu, 12 Mar 2026 10:11:49 +0100 Subject: [PATCH 480/742] vmm: add functionality for an abortable accept for sockets With this, the receiver side of a migration can wait for incoming connections, while also being able to abort the accept when the migration is done. On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vmm/src/migration_transport.rs | 76 ++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/vmm/src/migration_transport.rs b/vmm/src/migration_transport.rs index a33ee503cd..cda22dfac2 100644 --- a/vmm/src/migration_transport.rs +++ b/vmm/src/migration_transport.rs @@ -5,6 +5,7 @@ use std::io::{self, Read, Write}; use std::net::{TcpListener, TcpStream}; +use std::os::fd::{AsFd, BorrowedFd}; use std::os::unix::io::{AsRawFd, RawFd}; use std::os::unix::net::{UnixListener, UnixStream}; use std::path::PathBuf; @@ -46,6 +47,32 @@ impl ReceiveListener { .map_err(MigratableError::MigrateReceive), } } + + /// Same as [`Self::accept`], but returns `None` if the abort event was signaled. + fn abortable_accept( + &mut self, + abort_event: &impl AsRawFd, + ) -> Result, MigratableError> { + if wait_for_readable(&self, abort_event) + .context("Error while waiting for socket to become readable") + .map_err(MigratableError::MigrateReceive)? + { + // The listener is readable; accept the connection. + Ok(Some(self.accept()?)) + } else { + // The abort event was signaled before any connection arrived. 
+ Ok(None) + } + } +} + +impl AsFd for ReceiveListener { + fn as_fd(&self) -> BorrowedFd<'_> { + match self { + ReceiveListener::Tcp(listener) => listener.as_fd(), + ReceiveListener::Unix(listener) => listener.as_fd(), + } + } } /// Transport-agnostic stream used by the migration protocol. @@ -132,6 +159,55 @@ impl WriteVolatile for SocketStream { } } +// Wait for `fd` to become readable. In this case, we return true. In case +// `abort_event` was signaled, return false. +fn wait_for_readable(fd: &impl AsFd, abort_event: &impl AsRawFd) -> Result { + let fd = fd.as_fd().as_raw_fd(); + let abort_event = abort_event.as_raw_fd(); + + let mut poll_fds = [ + libc::pollfd { + fd: abort_event, + events: libc::POLLIN, + revents: 0, + }, + libc::pollfd { + fd, + events: libc::POLLIN, + revents: 0, + }, + ]; + + loop { + // SAFETY: This is safe, because the file descriptors are valid and the + // poll_fds array is properly initialized. + let ret = unsafe { libc::poll(poll_fds.as_mut_ptr(), poll_fds.len() as libc::nfds_t, -1) }; + + if ret >= 0 { + break; + } + + let err = io::Error::last_os_error(); + if err.raw_os_error() == Some(libc::EINTR) { + continue; + } + + return Err(err); + } + + if poll_fds[0].revents & libc::POLLIN != 0 { + return Ok(false); + } + + if poll_fds[1].revents & libc::POLLIN != 0 { + return Ok(true); + } + + Err(io::Error::other( + "Poll returned, but neither file descriptor is readable?", + )) +} + /// Extract a UNIX socket path from a "unix:" migration URL. fn socket_url_to_path(url: &str) -> Result { url.strip_prefix("unix:") From 058954a8c10667ecbb594f79b089172c597d9742 Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Thu, 12 Mar 2026 10:29:49 +0100 Subject: [PATCH 481/742] vmm: make receive_memory_ranges take the requests directly This just removes some unnecessary indirections. 
On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vmm/src/lib.rs | 14 +------------- vmm/src/migration_transport.rs | 11 ++++++++--- 2 files changed, 9 insertions(+), 16 deletions(-) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 311ec84b88..be9ebf929b 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -892,7 +892,7 @@ impl Vmm { }, Configured(memory_manager, guest_memory) => match req.command() { Command::Memory => { - self.vm_receive_memory(req, socket, &guest_memory)?; + migration_transport::receive_memory_ranges(&guest_memory, req, socket)?; Ok(Configured(memory_manager, guest_memory)) } Command::State => { @@ -1070,18 +1070,6 @@ impl Vmm { Ok(()) } - fn vm_receive_memory( - &mut self, - req: &Request, - socket: &mut SocketStream, - guest_mem: &GuestMemoryAtomic, - ) -> std::result::Result<(), MigratableError> { - let table = MemoryRangeTable::read_from(socket, req.length())?; - - // And then the memory itself - migration_transport::receive_memory_ranges(guest_mem, &table, socket) - } - /// Performs the initial memory transmission (iteration zero) plus a /// variable number of memory iterations with the goal to eventually migrate /// the VM in a reasonably small downtime. diff --git a/vmm/src/migration_transport.rs b/vmm/src/migration_transport.rs index cda22dfac2..6294122140 100644 --- a/vmm/src/migration_transport.rs +++ b/vmm/src/migration_transport.rs @@ -19,7 +19,7 @@ use vm_memory::{ Bytes, GuestAddress, GuestAddressSpace, GuestMemoryAtomic, ReadVolatile, VolatileMemoryError, VolatileSlice, WriteVolatile, }; -use vm_migration::protocol::{MemoryRangeTable, Request, Response}; +use vm_migration::protocol::{Command, MemoryRangeTable, Request, Response}; use vm_migration::{MigratableError, Snapshot}; use crate::{GuestMemoryMmap, VmMigrationConfig}; @@ -366,12 +366,17 @@ pub(crate) fn send_memory_ranges( ) } -/// Receive memory contents for the given range table into guest memory. 
+/// Receive memory contents for the given request and copy it into guest memory. pub(crate) fn receive_memory_ranges( guest_memory: &GuestMemoryAtomic, - ranges: &MemoryRangeTable, + req: &Request, socket: &mut SocketStream, ) -> Result<(), MigratableError> { + debug_assert_eq!(req.command(), Command::Memory); + // Read the memory table + let ranges = MemoryRangeTable::read_from(socket, req.length())?; + + // And then the memory itself let mem = guest_memory.memory(); for range in ranges.regions() { From 5c556880dd63f220f3a7cc8cb05d63e68824b5e6 Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Thu, 12 Mar 2026 10:37:33 +0100 Subject: [PATCH 482/742] vmm: implement functionality to accept multiple connections Adds the functionality to accept multiple connections on the receiver side of a live migration. A thread listens for incoming connections and creates a worker for each new connection. On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vmm/src/migration_transport.rs | 192 ++++++++++++++++++++++++++++++++- 1 file changed, 189 insertions(+), 3 deletions(-) diff --git a/vmm/src/migration_transport.rs b/vmm/src/migration_transport.rs index 6294122140..27fc464fad 100644 --- a/vmm/src/migration_transport.rs +++ b/vmm/src/migration_transport.rs @@ -3,16 +3,17 @@ // SPDX-License-Identifier: Apache-2.0 // -use std::io::{self, Read, Write}; +use std::io::{self, ErrorKind, Read, Write}; use std::net::{TcpListener, TcpStream}; use std::os::fd::{AsFd, BorrowedFd}; use std::os::unix::io::{AsRawFd, RawFd}; use std::os::unix::net::{UnixListener, UnixStream}; use std::path::PathBuf; use std::result::Result; +use std::thread; use anyhow::{Context, anyhow}; -use log::info; +use log::{debug, error, info, warn}; use serde_json; use vm_memory::bitmap::BitmapSlice; use vm_memory::{ @@ -21,6 +22,7 @@ use vm_memory::{ }; use vm_migration::protocol::{Command, MemoryRangeTable, Request, Response}; use vm_migration::{MigratableError, Snapshot}; +use 
vmm_sys_util::eventfd::EventFd; use crate::{GuestMemoryMmap, VmMigrationConfig}; @@ -115,6 +117,15 @@ impl AsRawFd for SocketStream { } } +impl AsFd for SocketStream { + fn as_fd(&self) -> BorrowedFd<'_> { + match self { + SocketStream::Unix(s) => s.as_fd(), + SocketStream::Tcp(s) => s.as_fd(), + } + } +} + impl ReadVolatile for SocketStream { fn read_volatile( &mut self, @@ -208,6 +219,181 @@ fn wait_for_readable(fd: &impl AsFd, abort_event: &impl AsRawFd) -> Result>>, + + /// This fd gets signaled when the migration stops, and will then stop + /// the [`Self::accept_thread`]. + terminate_fd: EventFd, +} + +impl ReceiveAdditionalConnections { + /// Starts a thread to accept incoming connections and handle them. These + /// additional connections are used to receive additional memory regions + /// during VM migration. + pub(crate) fn new( + listener: ReceiveListener, + guest_memory: GuestMemoryAtomic, + ) -> Result { + let event_fd = EventFd::new(0) + .context("Error creating terminate fd") + .map_err(MigratableError::MigrateReceive)?; + + let terminate_fd = event_fd + .try_clone() + .context("Error cloning terminate fd") + .map_err(MigratableError::MigrateReceive)?; + + let accept_thread = thread::Builder::new() + .name("migrate-receive-accept-connections".to_owned()) + .spawn(move || Self::accept_connections(listener, &terminate_fd, &guest_memory)) + .context("Error creating connection accept thread") + .map_err(MigratableError::MigrateReceive)?; + + Ok(Self { + accept_thread: Some(accept_thread), + terminate_fd: event_fd, + }) + } + + fn accept_connections( + mut listener: ReceiveListener, + terminate_fd: &EventFd, + guest_memory: &GuestMemoryAtomic, + ) -> Result<(), MigratableError> { + let mut threads: Vec>> = Vec::new(); + while let Some(mut socket) = listener.abortable_accept(terminate_fd)? 
{ + let guest_memory = guest_memory.clone(); + let terminate_fd = terminate_fd + .try_clone() + .context("Error cloning terminate fd") + .map_err(MigratableError::MigrateReceive)?; + + match thread::Builder::new() + .name(format!("migrate-receive-memory-{}", threads.len()).to_owned()) + .spawn(move || { + Self::worker_receive_memory(&mut socket, &terminate_fd, &guest_memory) + }) { + Ok(t) => threads.push(t), + Err(e) => { + error!("Error spawning receive-memory thread: {e}"); + break; + } + } + } + + info!("Stopped accepting additional connections. Cleaning up threads."); + + // We only return the first error we encounter here. + let mut first_err = Ok(()); + for thread in threads { + let err = match thread.join() { + Ok(Ok(())) => None, + Ok(Err(e)) => Some(e), + Err(panic) => Some(MigratableError::MigrateReceive(anyhow!( + "receive-memory thread panicked: {panic:?}" + ))), + }; + + if let Some(e) = err { + warn!("Error in receive-memory thread: {e}"); + + if first_err.is_ok() { + first_err = Err(e); + } + } + } + + first_err + } + + // Handles a `Memory` request by writing its payload to the VM memory. + fn worker_receive_memory( + mut socket: &mut SocketStream, + terminate_fd: &EventFd, + guest_memory: &GuestMemoryAtomic, + ) -> Result<(), MigratableError> { + loop { + // We only check whether we should abort when waiting for a new request. If the + // sender stops sending data mid-request, we will hang forever. + if !wait_for_readable(socket, terminate_fd) + .context("Failed to poll fds") + .map_err(MigratableError::MigrateReceive)? + { + info!("Got signal to tear down connection."); + return Ok(()); + } + + let req = match Request::read_from(&mut socket) { + Ok(req) => req, + Err(MigratableError::MigrateSocket(io_error)) + if io_error.kind() == ErrorKind::UnexpectedEof => + { + // EOF is only handled here while reading the next request + // header. 
Each memory chunk is fully received and acked + // before the worker loops back to Request::read_from(), so + // EOF at this point means the sender finished sending + // memory rather than dropping a chunk mid-transfer. + debug!( + "Connection closed by peer as expected (sender finished sending memory)" + ); + return Ok(()); + } + Err(e) => return Err(e), + }; + + if req.command() != Command::Memory { + error!( + "Dropping connection. Only Memory commands are allowed on additional connections." + ); + return Err(MigratableError::MigrateReceive(anyhow!( + "Received non memory command on migration receive worker: {:?}", + req.command() + ))); + } + + receive_memory_ranges(guest_memory, &req, socket)?; + Response::ok().write_to(socket)?; + } + } + + /// Signals to the worker threads that the migration is finished and joins them. + /// If any thread encountered an error, this error is returned by this function. + pub(crate) fn cleanup(&mut self) -> Result<(), MigratableError> { + self.terminate_fd + .write(1) + .context("Failed to signal termination to worker threads.") + .map_err(MigratableError::MigrateReceive)?; + let accept_thread = self + .accept_thread + .take() + .context("Error taking accept thread.") + .map_err(MigratableError::MigrateReceive)?; + accept_thread + .join() + .map_err(|panic| { + MigratableError::MigrateReceive(anyhow!( + "Accept connections thread panicked: {panic:?}" + )) + }) + .flatten() + } +} + +impl Drop for ReceiveAdditionalConnections { + fn drop(&mut self) { + if self.accept_thread.is_some() { + warn!( + "ReceiveAdditionalConnections was not cleaned up! Either cleanup() was never called (programming error) or it failed before completing." + ); + } + } +} + /// Extract a UNIX socket path from a "unix:" migration URL. fn socket_url_to_path(url: &str) -> Result { url.strip_prefix("unix:") @@ -366,7 +552,7 @@ pub(crate) fn send_memory_ranges( ) } -/// Receive memory contents for the given request and copy it into guest memory. 
+/// Receive memory contents for the given range table into guest memory. pub(crate) fn receive_memory_ranges( guest_memory: &GuestMemoryAtomic, req: &Request, From 5e563e7ea31457743a5ebaf60c5b604351ad3c0a Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Thu, 12 Mar 2026 13:21:23 +0100 Subject: [PATCH 483/742] vm-migration: allow partitioning memory tables For sending memory over multiple connections, we need a way to split up the work. With these changes, we can chop a memory table into same-sized chunks for transmit On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vm-migration/src/protocol.rs | 229 ++++++++++++++++++++++++++++++++++- 1 file changed, 228 insertions(+), 1 deletion(-) diff --git a/vm-migration/src/protocol.rs b/vm-migration/src/protocol.rs index ab5975d4e5..0a62375f54 100644 --- a/vm-migration/src/protocol.rs +++ b/vm-migration/src/protocol.rs @@ -272,12 +272,99 @@ pub struct MemoryRange { pub length: u64, } -#[derive(Clone, Default, Serialize, Deserialize)] +/// A set of guest-memory ranges to transfer as one migration payload. +#[derive(Clone, Default, Debug, Serialize, Deserialize)] pub struct MemoryRangeTable { data: Vec, } +/// Iterator returned by [`MemoryRangeTable::partition`]. +/// +/// Each item contains at most `chunk_size` bytes. A range may be split across +/// multiple items. +/// +/// The iterator may reorder ranges for efficiency, so callers must not rely on +/// the order in which chunks or ranges are yielded. +#[derive(Clone, Default, Debug)] +struct MemoryRangeTableIterator { + chunk_size: u64, + data: Vec, +} + +impl MemoryRangeTableIterator { + /// Create an iterator that partitions `table` into chunks of at most + /// `chunk_size` bytes. 
+ pub fn new(table: MemoryRangeTable, chunk_size: u64) -> Self { + MemoryRangeTableIterator { + chunk_size, + data: table.data, + } + } +} + +impl Iterator for MemoryRangeTableIterator { + type Item = MemoryRangeTable; + + /// Return the next memory range in the table, making sure that + /// the returned range is not larger than `chunk_size`. + /// + /// **Note**: Do not rely on the order of the ranges returned by this + /// iterator. This allows for a more efficient implementation. + fn next(&mut self) -> Option { + let mut ranges: Vec = vec![]; + let mut ranges_size: u64 = 0; + + loop { + assert!(ranges_size <= self.chunk_size); + + if ranges_size == self.chunk_size || self.data.is_empty() { + break; + } + + if let Some(range) = self.data.pop() { + let next_range: MemoryRange = if ranges_size + range.length > self.chunk_size { + // How many bytes we need to put back into the table. + let leftover_bytes = ranges_size + range.length - self.chunk_size; + assert!(leftover_bytes <= range.length); + let returned_bytes = range.length - leftover_bytes; + assert!(returned_bytes <= range.length); + assert_eq!(leftover_bytes + returned_bytes, range.length); + + self.data.push(MemoryRange { + gpa: range.gpa, + length: leftover_bytes, + }); + MemoryRange { + gpa: range.gpa + leftover_bytes, + length: returned_bytes, + } + } else { + range + }; + + ranges_size += next_range.length; + ranges.push(next_range); + } + } + + if ranges.is_empty() { + None + } else { + Some(MemoryRangeTable { data: ranges }) + } + } +} + impl MemoryRangeTable { + pub fn ranges(&self) -> &[MemoryRange] { + &self.data + } + + /// Partitions the table into chunks of at most `chunk_size` bytes. + pub fn partition(self, chunk_size: u64) -> impl Iterator { + MemoryRangeTableIterator::new(self, chunk_size) + } + /// Converts an iterator over a dirty bitmap into an iterator of dirty /// [`MemoryRange`]s, merging consecutive dirty pages into contiguous ranges. 
/// @@ -413,4 +500,144 @@ mod unit_tests { ] ); } + + #[test] + fn test_memory_range_table_partition() { + // We start the test similar as the one above, but with a input that is simpler to parse for + // developers. + let input = [0b11_0011_0011_0011]; + + let start_gpa = 0x1000; + let page_size = 0x1000; + + let table = MemoryRangeTable::from_dirty_bitmap(input, start_gpa, page_size); + let expected_regions = [ + MemoryRange { + gpa: start_gpa, + length: page_size * 2, + }, + MemoryRange { + gpa: start_gpa + 4 * page_size, + length: page_size * 2, + }, + MemoryRange { + gpa: start_gpa + 8 * page_size, + length: page_size * 2, + }, + MemoryRange { + gpa: start_gpa + 12 * page_size, + length: page_size * 2, + }, + ]; + assert_eq!(table.regions(), &expected_regions); + + // In the first test, we expect to see the exact same result as above, as we use the length + // of every region (which is fixed!). + { + let chunks = table + .clone() + .partition(page_size * 2) + .map(|table| table.data) + .collect::>(); + + // The implementation currently returns the ranges in reverse order. + // For better testability, we reverse it. + let chunks = chunks + .into_iter() + .map(|vec| vec.into_iter().rev().collect::>()) + .rev() + .collect::>(); + + assert_eq!( + chunks, + &[ + [expected_regions[0].clone()].to_vec(), + [expected_regions[1].clone()].to_vec(), + [expected_regions[2].clone()].to_vec(), + [expected_regions[3].clone()].to_vec(), + ] + ); + } + + // Next, we have a more sophisticated test with a chunk size of 5 pages. + { + let chunks = table + .clone() + .partition(page_size * 5) + .map(|table| table.data) + .collect::>(); + + // The implementation currently returns the ranges in reverse order. + // For better testability, we reverse it. 
+ let chunks = chunks + .into_iter() + .map(|vec| vec.into_iter().rev().collect::>()) + .rev() + .collect::>(); + + assert_eq!( + chunks, + &[ + vec![ + MemoryRange { + gpa: start_gpa, + length: 2 * page_size + }, + MemoryRange { + gpa: start_gpa + 4 * page_size, + length: page_size + } + ], + vec![ + MemoryRange { + gpa: start_gpa + 5 * page_size, + length: page_size + }, + MemoryRange { + gpa: start_gpa + 8 * page_size, + length: 2 * page_size + }, + MemoryRange { + gpa: start_gpa + 12 * page_size, + length: 2 * page_size + } + ] + ] + ); + } + } + + #[test] + fn test_memory_range_table_partition_uneven_split() { + // Three consecutive dirty pages produce one 3-page range, which lets + // us test an uneven 1+2 page split while using the same helper as the + // other partition tests above. + let input = [0b111]; + let start_gpa = 0x1000; + let page_size = 0x1000; + + let table = MemoryRangeTable::from_dirty_bitmap(input, start_gpa, page_size); + + let chunks = table + .partition(page_size * 2) + .map(|table| table.data) + .collect::>(); + + // The implementation currently returns ranges in reverse order. + let chunks = chunks.into_iter().rev().collect::>(); + + assert_eq!( + chunks, + &[ + vec![MemoryRange { + gpa: start_gpa, + length: page_size, + }], + vec![MemoryRange { + gpa: start_gpa + page_size, + length: page_size * 2, + }], + ] + ); + } } From 5a2dea8fa6ec028df5a2642835b852aa94a52c46 Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Thu, 12 Mar 2026 13:23:20 +0100 Subject: [PATCH 484/742] vmm: implement a gate This gate behaves like a barrier, but it can be opened, meaning that threads can be released before all threads arrived at the gate. This lets us release waiting threads in case of an error, which will be important for the sender side of a live migration with multiple TCP connections. 
On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vmm/src/sync_utils.rs | 127 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 127 insertions(+) create mode 100644 vmm/src/sync_utils.rs diff --git a/vmm/src/sync_utils.rs b/vmm/src/sync_utils.rs new file mode 100644 index 0000000000..14517eac24 --- /dev/null +++ b/vmm/src/sync_utils.rs @@ -0,0 +1,127 @@ +// Copyright © 2026 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::sync::{Condvar, Mutex}; + +/// A single use abortable gate. The main thread will create the gate and pass +/// it to the memory sending threads. The main thread can always open the gate. +/// That way the main thread can also open the gate before all workers arrive +/// there, e.g. if one worker signals that an error occurred and thus cannot +/// continue. +#[derive(Debug)] +pub struct Gate { + /// True if the gate is open, false otherwise. + open: Mutex, + /// Used to notify waiting threads. + cv: Condvar, +} + +impl Gate { + pub fn new() -> Self { + Self { + open: Mutex::new(false), + cv: Condvar::new(), + } + } + + /// Wait at the gate. Only blocks if the gate is not opened. + pub fn wait(&self) { + let mut open = self.open.lock().unwrap(); + while !*open { + open = self.cv.wait(open).unwrap(); + } + } + + /// Open the gate, releasing all waiting threads. + pub fn open(&self) { + let mut open = self.open.lock().unwrap(); + *open = true; + self.cv.notify_all(); + } +} + +#[cfg(test)] +mod tests { + use std::sync::{Arc, mpsc}; + use std::thread; + use std::time::Duration; + + use super::Gate; + + #[test] + fn gate_blocks_until_open() { + let gate = Arc::new(Gate::new()); + let (tx, rx) = mpsc::channel(); + + let gate_clone = gate.clone(); + thread::spawn(move || { + gate_clone.wait(); + tx.send(()).unwrap(); + }); + + // Give the thread time to block. 
+ thread::sleep(Duration::from_millis(50)); + assert!(rx.try_recv().is_err()); + + gate.open(); + rx.recv_timeout(Duration::from_secs(1)).unwrap(); + } + + #[test] + fn gate_open_before_wait_is_non_blocking() { + let gate = Arc::new(Gate::new()); + gate.open(); + + let (tx, rx) = mpsc::channel(); + let gate_clone = gate.clone(); + thread::spawn(move || { + gate_clone.wait(); + tx.send(()).unwrap(); + }); + + rx.recv_timeout(Duration::from_secs(1)).unwrap(); + } + + #[test] + fn gate_releases_multiple_waiters() { + let gate = Arc::new(Gate::new()); + let (tx, rx) = mpsc::channel(); + + for _ in 0..4 { + let gate_clone = gate.clone(); + let tx = tx.clone(); + thread::spawn(move || { + gate_clone.wait(); + tx.send(()).unwrap(); + }); + } + + // Ensure nobody passed before open. + thread::sleep(Duration::from_millis(50)); + assert!(rx.try_recv().is_err()); + + gate.open(); + + for _ in 0..4 { + rx.recv_timeout(Duration::from_secs(1)).unwrap(); + } + } + + #[test] + fn gate_open_is_idempotent() { + let gate = Arc::new(Gate::new()); + gate.open(); + gate.open(); + + let (tx, rx) = mpsc::channel(); + let gate_clone = gate.clone(); + thread::spawn(move || { + gate_clone.wait(); + tx.send(()).unwrap(); + }); + + rx.recv_timeout(Duration::from_secs(1)).unwrap(); + } +} From 07484abd35acb70584ab6f406b5f7bb8f400a494 Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Thu, 12 Mar 2026 13:29:24 +0100 Subject: [PATCH 485/742] vmm: implement functionality to send via multiple connections Implements the functionality to send VM memory via multiple connections during a live migration. 
On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vmm/src/lib.rs | 1 + vmm/src/migration_transport.rs | 339 +++++++++++++++++++++++++++++++++ 2 files changed, 340 insertions(+) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index be9ebf929b..60ec64b617 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -90,6 +90,7 @@ mod pci_segment; pub mod seccomp_filters; mod serial_manager; mod sigwinch_listener; +mod sync_utils; mod uffd; mod userfaultfd; pub mod vm; diff --git a/vmm/src/migration_transport.rs b/vmm/src/migration_transport.rs index 27fc464fad..4568de9aa9 100644 --- a/vmm/src/migration_transport.rs +++ b/vmm/src/migration_transport.rs @@ -5,12 +5,17 @@ use std::io::{self, ErrorKind, Read, Write}; use std::net::{TcpListener, TcpStream}; +use std::num::NonZeroU32; use std::os::fd::{AsFd, BorrowedFd}; use std::os::unix::io::{AsRawFd, RawFd}; use std::os::unix::net::{UnixListener, UnixStream}; use std::path::PathBuf; use std::result::Result; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::mpsc::{Receiver, Sender, SyncSender, TrySendError, channel, sync_channel}; +use std::sync::{Arc, Mutex}; use std::thread; +use std::time::Duration; use anyhow::{Context, anyhow}; use log::{debug, error, info, warn}; @@ -24,6 +29,7 @@ use vm_migration::protocol::{Command, MemoryRangeTable, Request, Response}; use vm_migration::{MigratableError, Snapshot}; use vmm_sys_util::eventfd::EventFd; +use crate::sync_utils::Gate; use crate::{GuestMemoryMmap, VmMigrationConfig}; /// Transport-agnostic listener used to receive connections. @@ -394,6 +400,339 @@ impl Drop for ReceiveAdditionalConnections { } } +/// The different kinds of messages we can send to memory sending threads. +#[derive(Debug)] +enum SendMemoryThreadMessage { + /// A chunk of memory that the thread should send to the receiving side of the + /// live migration. + Memory(MemoryRangeTable), + /// A synchronization point after each iteration of sending memory. 
That way the + /// main thread knows when all memory is sent and acknowledged. + Gate(Arc), + /// Sending memory is done and the threads are not needed anymore. + Disconnect, +} + +/// The different kinds of messages the main thread can receive from a memory +/// sending thread. +#[derive(Debug)] +enum SendMemoryThreadNotify { + /// A sending thread arrived at the gate. The main thread does not wait at the + /// gate, otherwise we could miss error messages. + Gate, + /// A sending thread encountered an error while sending memory. + Error, +} + +/// This struct keeps track of additional threads we use to send VM memory. +pub(crate) struct SendAdditionalConnections { + guest_memory: GuestMemoryAtomic, + threads: Vec>>, + /// Sender to all workers. The receiver is shared by all workers. + message_tx: SyncSender, + /// If an error occurs in one of the memory sending threads, the main thread signals + /// this using this flag. Only the main thread checks this variable, the worker + /// threads will be stopped during cleanup. + worker_error: Arc, + /// After the main thread sent all memory chunks to the sender threads, it waits + /// until one of the workers notifies it. Either because an error occurred, or + /// because they arrived at the gate. + notify_rx: Receiver, +} + +impl SendAdditionalConnections { + /// How many requests can be queued for each connection before the main + /// thread has to wait for workers to catch up. This bounded [`SyncChannel`] + /// provides backpressure, so send_chunk() re-checks worker_error promptly + /// instead of queueing all memory descriptors up front and only noticing + /// failures at the next gate synchronization point. + const BUFFERED_REQUESTS_PER_THREAD: usize = 64; + + /// The size of each chunk of memory to send. + /// + /// We want to make this large, because each chunk is acknowledged and we wait + /// for the ack before sending the next chunk. 
The challenge is that if it is + /// _too_ large, we become more sensitive to network issues, like packet drops + /// in individual connections, because large amounts of data can pool when + /// throughput on one connection is temporarily reduced. + /// + /// We can consider making this configurable, but a better network protocol that + /// doesn't require ACKs would be more efficient. + /// + /// The best-case throughput per connection can be estimated via: + /// chunk_size / (chunk_size / throughput_per_connection + round_trip_time) + /// + /// This chunk size together with eight connections is sufficient to saturate a 100G link. + const CHUNK_SIZE: u64 = 64 /* MiB */ << 20; + + pub(crate) fn new( + destination: &str, + connections: NonZeroU32, + guest_memory: &GuestMemoryAtomic, + ) -> Result { + let mut threads = Vec::new(); + let configured_connections = connections.get(); + let buffer_size = Self::BUFFERED_REQUESTS_PER_THREAD * configured_connections as usize; + let (message_tx, message_rx) = sync_channel::(buffer_size); + let worker_error = Arc::new(AtomicBool::new(false)); + let (notify_tx, notify_rx) = channel::(); + + // If one connection is configured, we don't have to create any additional threads. + // In this case the main thread does the sending. + if configured_connections == 1 { + return Ok(Self { + guest_memory: guest_memory.clone(), + threads, + message_tx, + worker_error, + notify_rx, + }); + } + + let message_rx = Arc::new(Mutex::new(message_rx)); + // If we use multiple threads to send memory, the main thread only distributes + // the memory chunks to the workers, but does not send memory anymore. Thus in + // this case we create one additional thread for each connection. 
+ for n in 0..configured_connections { + let mut socket = send_migration_socket(destination)?; + let guest_memory = guest_memory.clone(); + let message_rx = message_rx.clone(); + let worker_error = worker_error.clone(); + let notify_tx = notify_tx.clone(); + + let thread = thread::Builder::new() + .name(format!("migrate-send-memory-{n}")) + .spawn(move || { + Self::worker_send_memory( + &mut socket, + &guest_memory, + &message_rx, + &worker_error, + ¬ify_tx, + ) + }) + .inspect_err(|_| { + // If an error occurs here, we still do some light cleanup. + for _ in 0..threads.len() { + message_tx.send(SendMemoryThreadMessage::Disconnect).ok(); + } + threads.drain(..).for_each(|thread| { + thread.join().ok(); + }); + }) + .context("Error spawning send-memory thread") + .map_err(MigratableError::MigrateSend)?; + threads.push(thread); + } + + Ok(Self { + guest_memory: guest_memory.clone(), + threads, + message_tx, + worker_error, + notify_rx, + }) + } + + fn worker_send_memory( + socket: &mut SocketStream, + guest_memory: &GuestMemoryAtomic, + message_rx: &Mutex>, + worker_error: &AtomicBool, + notify_tx: &Sender, + ) -> Result<(), MigratableError> { + info!("Spawned thread to send VM memory."); + loop { + // Every memory sending thread receives messages from the main thread through this + // channel. The lock is necessary to synchronize the multiple consumers. If the + // workers are very quick, lock contention could become a performance issue. + let message = message_rx + .lock() + .map_err(|_| MigratableError::MigrateSend(anyhow!("message_rx mutex is poisoned"))) + .inspect_err(|_| { + worker_error.store(true, Ordering::Relaxed); + // We ignore errors during error handling. + notify_tx.send(SendMemoryThreadNotify::Error).ok(); + })? 
+ .recv() + .context("Error receiving message from main thread") + .map_err(MigratableError::MigrateSend) + .inspect_err(|_| { + worker_error.store(true, Ordering::Relaxed); + notify_tx.send(SendMemoryThreadNotify::Error).ok(); + })?; + match message { + SendMemoryThreadMessage::Memory(table) => { + send_memory_ranges(guest_memory, &table, socket) + .inspect_err(|_| { + worker_error.store(true, Ordering::Relaxed); + notify_tx.send(SendMemoryThreadNotify::Error).ok(); + }) + .context("Error sending memory to receiver side") + .map_err(MigratableError::MigrateSend)?; + } + SendMemoryThreadMessage::Gate(gate) => { + notify_tx + .send(SendMemoryThreadNotify::Gate) + .context("Error sending gate notification to main thread") + .map_err(MigratableError::MigrateSend) + .inspect_err(|_| { + // Sending via `notify_tx` just failed, so we don't try to send another + // message via it. + worker_error.store(true, Ordering::Relaxed); + })?; + gate.wait(); + } + SendMemoryThreadMessage::Disconnect => { + return Ok(()); + } + } + } + } + + /// Send memory via all connections that we have. `socket` is the original socket + /// that was used to connect to the destination. Returns Ok(true) if memory was + /// sent, Ok(false) if the given table was empty. + /// + /// When this function returns, all memory has been sent and acknowledged. + pub(crate) fn send_memory( + &mut self, + table: MemoryRangeTable, + socket: &mut SocketStream, + ) -> Result { + if table.regions().is_empty() { + return Ok(false); + } + + // If we use only one connection, we send the memory directly. + if self.threads.is_empty() { + send_memory_ranges(&self.guest_memory, &table, socket)?; + return Ok(true); + } + + // The chunk size is chosen to be big enough so that even very fast links need some + // milliseconds to send it. 
+ for chunk in table.partition(Self::CHUNK_SIZE) { + self.send_chunk(chunk)?; + } + + self.wait_for_pending_data()?; + Ok(true) + } + + fn send_chunk(&mut self, chunk: MemoryRangeTable) -> Result<(), MigratableError> { + let mut chunk = SendMemoryThreadMessage::Memory(chunk); + // [`Self::message_tx`] has a limited size, so we may have to retry sending the chunk + loop { + if self.worker_error.load(Ordering::Relaxed) { + return self.cleanup(); + } + + // Use try_send() so we can keep checking worker_error while the + // workers catch up. A blocking send() could wait forever if a + // worker failed and stopped making progress. + match self.message_tx.try_send(chunk) { + Ok(()) => { + return Ok(()); + } + Err(TrySendError::Full(unsent_chunk)) => { + // The channel is full. We wait for a short time and retry. + thread::sleep(Duration::from_millis(10)); + chunk = unsent_chunk; + } + Err(TrySendError::Disconnected(_)) => { + // The workers didn't disconnect for no reason, thus we do a cleanup. + return Err(self.cleanup().err().unwrap_or(MigratableError::MigrateSend( + anyhow!("All sending threads disconnected, but none returned an error?"), + ))); + } + } + } + } + + /// Wait until all data that is in-flight has actually been sent and acknowledged. + fn wait_for_pending_data(&mut self) -> Result<(), MigratableError> { + let gate = Arc::new(Gate::new()); + for _ in 0..self.threads.len() { + self.message_tx + .send(SendMemoryThreadMessage::Gate(gate.clone())) + .context("Error sending gate message to workers") + .map_err(MigratableError::MigrateSend)?; + } + + // We cannot simply wait at the gate, otherwise we might miss it when a sender + // thread encounters an error. Thus we wait for the workers to notify us that + // they arrived at the gate. + let mut seen_threads = 0; + loop { + match self + .notify_rx + .recv() + .context("Error receiving message from workers") + .map_err(MigratableError::MigrateSend)? 
+ { + SendMemoryThreadNotify::Gate => { + seen_threads += 1; + if seen_threads == self.threads.len() { + gate.open(); + return Ok(()); + } + } + SendMemoryThreadNotify::Error => { + // If an error occurred in one of the worker threads, we open + // the gate to make sure that no thread hangs. After that, we + // receive the error from Self::cleanup() and return it. + gate.open(); + return self.cleanup(); + } + } + } + } + + /// Sends disconnect messages to all workers and joins them. + pub(crate) fn cleanup(&mut self) -> Result<(), MigratableError> { + // Send disconnect messages to all workers. + for _ in 0..self.threads.len() { + // All threads may have terminated, leading to a dropped receiver. Thus we ignore + // errors here. + self.message_tx + .try_send(SendMemoryThreadMessage::Disconnect) + .ok(); + } + + let mut first_err = Ok(()); + self.threads.drain(..).for_each(|thread| { + let err = match thread.join() { + Ok(Ok(())) => None, + Ok(Err(e)) => Some(e), + Err(panic) => Some(MigratableError::MigrateSend(anyhow!( + "send-memory thread panicked: {panic:?}" + ))), + }; + + if let Some(e) = err { + warn!("Error in send-memory thread: {e}"); + + if first_err.is_ok() { + first_err = Err(e); + } + } + }); + + first_err + } +} + +impl Drop for SendAdditionalConnections { + fn drop(&mut self) { + if !self.threads.is_empty() { + warn!( + "SendAdditionalConnections was not cleaned up! Either cleanup() was never called (programming error) or it failed before completing." + ); + } + } +} + /// Extract a UNIX socket path from a "unix:" migration URL. fn socket_url_to_path(url: &str) -> Result { url.strip_prefix("unix:") From 55e6971c47f39c80ee7a0016f98e16aad75fff46 Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Thu, 12 Mar 2026 14:30:43 +0100 Subject: [PATCH 486/742] vmm: funnel VM memory via additional connections abstraction At this point, we are still only using a single connection. 
On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vmm/src/lib.rs | 111 ++++++++++++++++++++++++--------- vmm/src/migration_transport.rs | 16 +++++ 2 files changed, 97 insertions(+), 30 deletions(-) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 60ec64b617..a93dd1bdd0 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -6,6 +6,7 @@ use std::collections::HashMap; use std::fs::File; use std::io::{Read, Write, stdout}; +use std::num::NonZero; use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; use std::panic::AssertUnwindSafe; #[cfg(feature = "guest_debug")] @@ -59,7 +60,9 @@ use crate::memory_manager::MemoryManager; #[cfg(all(feature = "kvm", target_arch = "x86_64"))] use crate::migration::get_vm_snapshot; use crate::migration::{recv_vm_config, recv_vm_state}; -use crate::migration_transport::SocketStream; +use crate::migration_transport::{ + ReceiveAdditionalConnections, ReceiveListener, SendAdditionalConnections, SocketStream, +}; use crate::seccomp_filters::{Thread, get_seccomp_filter}; use crate::vm::{Error as VmError, Vm, VmState}; use crate::vm_config::{ @@ -613,6 +616,13 @@ pub struct Vmm { console_info: Option, } +/// Just a wrapper for the data that goes into +/// [`ReceiveMigrationState::Configured`] +struct ReceiveMigrationConfiguredData { + memory_manager: Arc>, + guest_memory: GuestMemoryAtomic, + connections: ReceiveAdditionalConnections, +} /// The receiver's state machine behind the migration protocol. enum ReceiveMigrationState { /// The connection is established and we haven't received any commands yet. @@ -630,10 +640,7 @@ enum ReceiveMigrationState { /// /// We keep the memory manager around to pass it into the next state. From this point /// on, the sender can start sending memory updates. - Configured( - Arc>, - GuestMemoryAtomic, - ), + Configured(ReceiveMigrationConfiguredData), /// Memory is populated and we received the state. The VM is ready to go. 
StateReceived, @@ -833,6 +840,7 @@ impl Vmm { fn vm_receive_migration_step( &mut self, socket: &mut SocketStream, + listener: &ReceiveListener, state: ReceiveMigrationState, req: &Request, _receive_data_migration: &VmReceiveMigrationData, @@ -845,19 +853,25 @@ impl Vmm { ))) }; - let mut configure_vm = |socket: &mut SocketStream, - memory_files: HashMap| - -> std::result::Result< - ( - Arc>, - GuestMemoryAtomic, - ), - MigratableError, - > { - let memory_manager = self.vm_receive_config(req, socket, memory_files)?; - let guest_memory = memory_manager.lock().unwrap().guest_memory(); - Ok((memory_manager, guest_memory)) - }; + let mut configure_vm = + |socket: &mut SocketStream, + memory_files: HashMap| + -> std::result::Result { + let memory_manager = self.vm_receive_config(req, socket, memory_files)?; + let guest_memory = memory_manager.lock().unwrap().guest_memory(); + // Create the additional-connection receiver even in the single-connection case. + // At this point the receiver does not know whether the sender will use extra TCP + // connections. If it does not, no worker connections are accepted and memory + // requests continue to arrive on the main connection. 
+ let connections = listener + .try_clone() + .and_then(|l| ReceiveAdditionalConnections::new(l, guest_memory.clone()))?; + Ok(ReceiveMigrationConfiguredData { + memory_manager, + guest_memory, + connections, + }) + }; let recv_memory_fd = |socket: &mut SocketStream, mut memory_files: Vec<(u32, File)>| @@ -880,24 +894,42 @@ impl Vmm { }, Started => match req.command() { Command::MemoryFd => recv_memory_fd(socket, Vec::new()).map(MemoryFdsReceived), - Command::Config => { - configure_vm(socket, Default::default()).map(|res| Configured(res.0, res.1)) - } + Command::Config => configure_vm(socket, Default::default()).map(Configured), _ => invalid_command(), }, MemoryFdsReceived(memory_files) => match req.command() { Command::MemoryFd => recv_memory_fd(socket, memory_files).map(MemoryFdsReceived), - Command::Config => configure_vm(socket, HashMap::from_iter(memory_files)) - .map(|res| Configured(res.0, res.1)), + Command::Config => { + configure_vm(socket, HashMap::from_iter(memory_files)).map(Configured) + } _ => invalid_command(), }, - Configured(memory_manager, guest_memory) => match req.command() { + Configured(mut config_data) => match req.command() { + // Memory commands use the main connection only in the single-connection case. + // When multiple TCP connections are configured, the worker connections carry + // all memory commands and the main connection is used only for control traffic. Command::Memory => { - migration_transport::receive_memory_ranges(&guest_memory, req, socket)?; - Ok(Configured(memory_manager, guest_memory)) + migration_transport::receive_memory_ranges( + &config_data.guest_memory, + req, + socket, + ) + .inspect_err(|_| { + // connections.cleanup() already logs all errors that occurred in one of the + // threads. Furthermore, this path is only taken in the single-connection case, + // thus we do not expect any errors during this cleanup. The warning should + // reflect that. 
+ if let Err(e) = config_data.connections.cleanup() { + warn!( + "Unexpected error while cleaning up migration connections after a main-connection memory receive failure: {e}" + ); + } + })?; + Ok(Configured(config_data)) } Command::State => { - self.vm_receive_state(req, socket, memory_manager)?; + config_data.connections.cleanup()?; + self.vm_receive_state(req, socket, config_data.memory_manager)?; Ok(StateReceived) } _ => invalid_command(), @@ -1082,6 +1114,7 @@ impl Vmm { socket: &mut SocketStream, ctx: &mut MemoryMigrationContext, is_converged: impl Fn(&MemoryMigrationContext) -> result::Result, + mem_send: &mut SendAdditionalConnections, ) -> result::Result { loop { let iteration_begin = Instant::now(); @@ -1101,7 +1134,7 @@ impl Vmm { // Send the current dirty pages let transfer_begin = Instant::now(); - migration_transport::send_memory_ranges(&vm.guest_memory(), &iteration_table, socket)?; + mem_send.send_memory(iteration_table, socket)?; let transfer_duration = transfer_begin.elapsed(); ctx.update_metrics_after_transfer(transfer_begin, transfer_duration); @@ -1221,6 +1254,7 @@ impl Vmm { vm: &mut Vm, socket: &mut SocketStream, send_data_migration: &VmSendMigrationData, + mem_send: &mut SendAdditionalConnections, ) -> result::Result<(), MigratableError> { let mut ctx = MemoryMigrationContext::new(); @@ -1231,6 +1265,7 @@ impl Vmm { &mut ctx, // We bind send_data_migration to the callback |ctx| Self::is_precopy_converged(ctx, send_data_migration), + mem_send, )?; vm.pause()?; @@ -1243,7 +1278,7 @@ impl Vmm { ctx.update_metrics_before_transfer(iteration_begin, &final_table); let transfer_begin = Instant::now(); - migration_transport::send_memory_ranges(&vm.guest_memory(), &final_table, socket)?; + mem_send.send_memory(final_table, socket)?; let transfer_duration = transfer_begin.elapsed(); ctx.update_metrics_after_transfer(transfer_begin, transfer_duration); ctx.iteration += 1; @@ -1330,7 +1365,22 @@ impl Vmm { // Now pause VM vm.pause()?; } else { - 
Self::do_memory_migration(vm, &mut socket, send_data_migration)?; + let mut mem_send = migration_transport::SendAdditionalConnections::new( + &send_data_migration.destination_url, + NonZero::new(1).unwrap(), + &vm.guest_memory(), + )?; + + Self::do_memory_migration(vm, &mut socket, send_data_migration, &mut mem_send) + .inspect_err(|_| { + // Calling cleanup multiple times is fine, thus here we just make sure + // that it is called. + if let Err(e) = mem_send.cleanup() { + warn!("Error cleaning up migration connections: {e}"); + } + })?; + + mem_send.cleanup()?; } // We release the locks early to enable locking them on the destination host. @@ -2302,6 +2352,7 @@ impl RequestHandler for Vmm { let (response, new_state) = match self.vm_receive_migration_step( &mut socket, + &listener, state, &req, &receive_data_migration, diff --git a/vmm/src/migration_transport.rs b/vmm/src/migration_transport.rs index 4568de9aa9..1ee71506d8 100644 --- a/vmm/src/migration_transport.rs +++ b/vmm/src/migration_transport.rs @@ -72,6 +72,22 @@ impl ReceiveListener { Ok(None) } } + + /// Tries to clone a [`ReceiveListener`]. + pub(crate) fn try_clone(&self) -> Result { + match self { + ReceiveListener::Tcp(listener) => listener + .try_clone() + .map(ReceiveListener::Tcp) + .context("Failed to clone TCP listener") + .map_err(MigratableError::MigrateReceive), + ReceiveListener::Unix(listener) => listener + .try_clone() + .map(ReceiveListener::Unix) + .context("Failed to clone Unix listener") + .map_err(MigratableError::MigrateReceive), + } + } } impl AsFd for ReceiveListener { From fb19881918a43544bdf7d58a6297dba07918d06f Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Thu, 12 Mar 2026 15:08:07 +0100 Subject: [PATCH 487/742] vm-migration: add connections field to API And wire everything up. From now on the multiple connections feature can be used. This commit series is heavily based on Julian Stecklina's work, so kudos to him! 
Co-authored-by: Julian Stecklina On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vmm/src/api/mod.rs | 42 +++++++++++++++++++---- vmm/src/api/openapi/cloud-hypervisor.yaml | 5 +++ vmm/src/lib.rs | 3 +- 3 files changed, 42 insertions(+), 8 deletions(-) diff --git a/vmm/src/api/mod.rs b/vmm/src/api/mod.rs index 4b51c7eb3d..10457d7d38 100644 --- a/vmm/src/api/mod.rs +++ b/vmm/src/api/mod.rs @@ -34,7 +34,7 @@ pub mod dbus; pub mod http; use std::io; -use std::num::NonZeroU64; +use std::num::{NonZeroU32, NonZeroU64}; use std::str::FromStr; use std::sync::mpsc::{RecvError, SendError, Sender, channel}; use std::time::Duration; @@ -302,7 +302,7 @@ pub struct VmSendMigrationParseError(#[source] OptionParserError); #[derive(Clone, Deserialize, Serialize, Debug)] #[cfg_attr(test, derive(PartialEq))] pub struct VmSendMigrationData { - /// URL to migrate the VM to + /// Migration destination, e.g. `tcp::` or `unix:/path/to/socket`. pub destination_url: String, /// Send memory across socket without copying #[serde(default)] @@ -318,13 +318,17 @@ pub struct VmSendMigrationData { /// The timeout strategy for the migration. #[serde(default)] pub timeout_strategy: TimeoutStrategy, + + /// The number of parallel connections for migration. + #[serde(default = "VmSendMigrationData::default_connections")] + pub connections: NonZeroU32, } impl VmSendMigrationData { pub const SYNTAX: &'static str = "VM send migration parameters \ \"destination_url=[,local=on|off,\ downtime_ms=,timeout_s=,\ - timeout_strategy=cancel|ignore]\""; + timeout_strategy=cancel|ignore,connections=]\""; // Same as QEMU. pub const DEFAULT_DOWNTIME: Duration = Duration::from_millis(300); @@ -339,6 +343,11 @@ impl VmSendMigrationData { NonZeroU64::new(Self::DEFAULT_TIMEOUT.as_secs()).unwrap() } + // Use a single connection as default for backward compatibility. 
+ fn default_connections() -> NonZeroU32 { + NonZeroU32::new(1).unwrap() + } + pub fn parse(migration: &str) -> Result { let mut parser = OptionParser::new(); parser @@ -346,7 +355,8 @@ impl VmSendMigrationData { .add("local") .add("downtime_ms") .add("timeout_s") - .add("timeout_strategy"); + .add("timeout_strategy") + .add("connections"); parser.parse(migration).map_err(VmSendMigrationParseError)?; let destination_url = parser.get("destination_url").ok_or_else(|| { @@ -385,6 +395,17 @@ impl VmSendMigrationData { .convert("timeout_strategy") .map_err(VmSendMigrationParseError)? .unwrap_or_default(); + let connections = match parser + .convert::("connections") + .map_err(VmSendMigrationParseError)? + { + Some(v) => NonZeroU32::new(v).ok_or_else(|| { + VmSendMigrationParseError(OptionParserError::InvalidValue( + "connections must be non-zero".to_string(), + )) + })?, + None => Self::default_connections(), + }; Ok(Self { destination_url, @@ -392,6 +413,7 @@ impl VmSendMigrationData { downtime_ms, timeout_s, timeout_strategy, + connections, }) } @@ -1679,13 +1701,14 @@ mod unit_tests { fn test_vm_send_migration_data_parse() { // Fully specified let data = VmSendMigrationData::parse( - "destination_url=tcp://192.168.1.1:8080,local=on,downtime_ms=200,timeout_s=3600,timeout_strategy=cancel" + "destination_url=tcp://192.168.1.1:8080,local=on,downtime_ms=200,timeout_s=3600,timeout_strategy=cancel,connections=2" ).expect("valid migration string should parse"); assert_eq!(data.destination_url, "tcp://192.168.1.1:8080"); assert!(data.local); assert_eq!(data.downtime_ms.get(), 200); assert_eq!(data.timeout_s.get(), 3600); assert_eq!(data.timeout_strategy, TimeoutStrategy::Cancel); + assert_eq!(data.connections.get(), 2); // Defaults applied when optional fields are omitted let data = VmSendMigrationData::parse("destination_url=tcp://192.168.1.1:8080") @@ -1695,6 +1718,7 @@ mod unit_tests { assert_eq!(data.downtime_ms, VmSendMigrationData::default_downtime_ms()); 
assert_eq!(data.timeout_s, VmSendMigrationData::default_timeout_s()); assert_eq!(data.timeout_strategy, TimeoutStrategy::default()); + assert_eq!(data.connections, VmSendMigrationData::default_connections()); // Missing destination_url is an error VmSendMigrationData::parse("local=on,downtime_ms=200").unwrap_err(); @@ -1708,6 +1732,10 @@ mod unit_tests { let _data = VmSendMigrationData::parse("destination_url=unix:/tmp/sock,timeout_s=0") .expect_err("zero timeout_s should be rejected"); + // Zero connections is rejected + let _data = VmSendMigrationData::parse("destination_url=unix:/tmp/sock,connections=0") + .expect_err("zero connections should be rejected"); + // Unknown option is an error VmSendMigrationData::parse("destination_url=unix:/tmp/sock,unknown_field=foo").unwrap_err(); @@ -1732,12 +1760,13 @@ mod unit_tests { downtime_ms: NonZeroU64::new(150).unwrap(), timeout_s: VmSendMigrationData::default_timeout_s(), timeout_strategy: Default::default(), + connections: VmSendMigrationData::default_connections(), } ); // Happy path, fully specified let data = - VmSendMigrationData::parse("destination_url=tcp://192.168.1.1:8080,downtime_ms=150,timeout_s=900,timeout_strategy=ignore") + VmSendMigrationData::parse("destination_url=tcp://192.168.1.1:8080,downtime_ms=150,timeout_s=900,timeout_strategy=ignore,connections=4") .unwrap(); assert_eq!( data, @@ -1747,6 +1776,7 @@ mod unit_tests { downtime_ms: NonZeroU64::new(150).unwrap(), timeout_s: NonZeroU64::new(900).unwrap(), timeout_strategy: TimeoutStrategy::Ignore, + connections: NonZeroU32::new(4).unwrap(), } ); } diff --git a/vmm/src/api/openapi/cloud-hypervisor.yaml b/vmm/src/api/openapi/cloud-hypervisor.yaml index 14f74018ea..c2fe5af4b9 100644 --- a/vmm/src/api/openapi/cloud-hypervisor.yaml +++ b/vmm/src/api/openapi/cloud-hypervisor.yaml @@ -1405,6 +1405,11 @@ components: Defaults to 3600s (one hour). 
timeout_strategy: $ref: "#/components/schemas/TimeoutStrategy" + connections: + type: integer + format: int64 + default: 1 + minimum: 1 VmAddUserDevice: required: diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index a93dd1bdd0..f1226090c9 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -6,7 +6,6 @@ use std::collections::HashMap; use std::fs::File; use std::io::{Read, Write, stdout}; -use std::num::NonZero; use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; use std::panic::AssertUnwindSafe; #[cfg(feature = "guest_debug")] @@ -1367,7 +1366,7 @@ impl Vmm { } else { let mut mem_send = migration_transport::SendAdditionalConnections::new( &send_data_migration.destination_url, - NonZero::new(1).unwrap(), + send_data_migration.connections, &vm.guest_memory(), )?; From a9a832f392447101762f6a4f6937603a06ed8156 Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Mon, 30 Mar 2026 15:14:43 +0200 Subject: [PATCH 488/742] vmm: validate VmSendMigrationData Validates that there are no conflicting options set, and that the destination URL is valid. 
On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- cloud-hypervisor/src/bin/ch-remote.rs | 2 +- vmm/src/api/mod.rs | 116 ++++++++++++++++++++------ vmm/src/lib.rs | 7 +- 3 files changed, 96 insertions(+), 29 deletions(-) diff --git a/cloud-hypervisor/src/bin/ch-remote.rs b/cloud-hypervisor/src/bin/ch-remote.rs index afc41e7e96..236e7438e0 100644 --- a/cloud-hypervisor/src/bin/ch-remote.rs +++ b/cloud-hypervisor/src/bin/ch-remote.rs @@ -72,7 +72,7 @@ enum Error { #[error("Invalid disk size")] InvalidDiskSize(#[source] ByteSizedParseError), #[error("Error parsing send migration configuration")] - SendMigrationConfig(#[from] vmm::api::VmSendMigrationParseError), + SendMigrationConfig(#[from] vmm::api::VmSendMigrationConfigError), } enum TargetApi<'a> { diff --git a/vmm/src/api/mod.rs b/vmm/src/api/mod.rs index 10457d7d38..f66fbe9ab0 100644 --- a/vmm/src/api/mod.rs +++ b/vmm/src/api/mod.rs @@ -295,8 +295,13 @@ impl FromStr for TimeoutStrategy { } #[derive(Debug, Error)] -#[error("Error parsing send migration parameters")] -pub struct VmSendMigrationParseError(#[source] OptionParserError); +pub enum VmSendMigrationConfigError { + #[error("Error parsing send migration parameters")] + ParseError(#[source] OptionParserError), + + #[error("Error validating send migration parameters")] + ValidationError(String), +} /// Configuration for an outgoing migration. 
#[derive(Clone, Deserialize, Serialize, Debug)] @@ -348,7 +353,7 @@ impl VmSendMigrationData { NonZeroU32::new(1).unwrap() } - pub fn parse(migration: &str) -> Result { + pub fn parse(migration: &str) -> Result { let mut parser = OptionParser::new(); parser .add("destination_url") @@ -357,24 +362,26 @@ impl VmSendMigrationData { .add("timeout_s") .add("timeout_strategy") .add("connections"); - parser.parse(migration).map_err(VmSendMigrationParseError)?; + parser + .parse(migration) + .map_err(VmSendMigrationConfigError::ParseError)?; let destination_url = parser.get("destination_url").ok_or_else(|| { - VmSendMigrationParseError(OptionParserError::InvalidSyntax( + VmSendMigrationConfigError::ParseError(OptionParserError::InvalidSyntax( "destination_url is required".to_string(), )) })?; let local = parser .convert::("local") - .map_err(VmSendMigrationParseError)? + .map_err(VmSendMigrationConfigError::ParseError)? .unwrap_or(Toggle(false)) .0; let downtime_ms = match parser .convert::("downtime_ms") - .map_err(VmSendMigrationParseError)? + .map_err(VmSendMigrationConfigError::ParseError)? { Some(v) => NonZeroU64::new(v).ok_or_else(|| { - VmSendMigrationParseError(OptionParserError::InvalidValue( + VmSendMigrationConfigError::ParseError(OptionParserError::InvalidValue( "downtime_ms must be non-zero".to_string(), )) })?, @@ -382,10 +389,10 @@ impl VmSendMigrationData { }; let timeout_s = match parser .convert::("timeout_s") - .map_err(VmSendMigrationParseError)? + .map_err(VmSendMigrationConfigError::ParseError)? { Some(v) => NonZeroU64::new(v).ok_or_else(|| { - VmSendMigrationParseError(OptionParserError::InvalidValue( + VmSendMigrationConfigError::ParseError(OptionParserError::InvalidValue( "timeout_s must be non-zero".to_string(), )) })?, @@ -393,28 +400,32 @@ impl VmSendMigrationData { }; let timeout_strategy = parser .convert("timeout_strategy") - .map_err(VmSendMigrationParseError)? + .map_err(VmSendMigrationConfigError::ParseError)? 
.unwrap_or_default(); let connections = match parser .convert::("connections") - .map_err(VmSendMigrationParseError)? + .map_err(VmSendMigrationConfigError::ParseError)? { Some(v) => NonZeroU32::new(v).ok_or_else(|| { - VmSendMigrationParseError(OptionParserError::InvalidValue( + VmSendMigrationConfigError::ParseError(OptionParserError::InvalidValue( "connections must be non-zero".to_string(), )) })?, None => Self::default_connections(), }; - Ok(Self { + let data = Self { destination_url, local, downtime_ms, timeout_s, timeout_strategy, connections, - }) + }; + + data.validate()?; + + Ok(data) } pub fn downtime(&self) -> Duration { @@ -424,6 +435,47 @@ impl VmSendMigrationData { pub fn timeout(&self) -> Duration { Duration::from_secs(self.timeout_s.get()) } + + pub fn validate(&self) -> Result<(), VmSendMigrationConfigError> { + match self.destination_url.as_str() { + url if url + .strip_prefix("tcp:") + .is_some_and(|addr| !addr.is_empty()) => {} + url if url + .strip_prefix("unix:") + .is_some_and(|path| !path.is_empty()) => + { + if self.connections.get() > 1 { + return Err(VmSendMigrationConfigError::ValidationError( + "UNIX sockets and connections option cannot be used at the same time." + .to_string(), + )); + } + } + _ => { + return Err(VmSendMigrationConfigError::ValidationError( + "destination_url must use tcp:: or unix:.".to_string(), + )); + } + } + + if self.local { + if !self.destination_url.starts_with("unix:") { + return Err(VmSendMigrationConfigError::ValidationError( + "local option is only supported with UNIX sockets.".to_string(), + )); + } + + if self.connections.get() > 1 { + return Err(VmSendMigrationConfigError::ValidationError( + "local option and connections option cannot be used at the same time." 
+ .to_string(), + )); + } + } + + Ok(()) + } } pub enum ApiResponsePayload { @@ -1701,19 +1753,19 @@ mod unit_tests { fn test_vm_send_migration_data_parse() { // Fully specified let data = VmSendMigrationData::parse( - "destination_url=tcp://192.168.1.1:8080,local=on,downtime_ms=200,timeout_s=3600,timeout_strategy=cancel,connections=2" + "destination_url=unix:/tmp/migrate.sock,local=on,downtime_ms=200,timeout_s=3600,timeout_strategy=cancel" ).expect("valid migration string should parse"); - assert_eq!(data.destination_url, "tcp://192.168.1.1:8080"); + assert_eq!(data.destination_url, "unix:/tmp/migrate.sock"); assert!(data.local); assert_eq!(data.downtime_ms.get(), 200); assert_eq!(data.timeout_s.get(), 3600); assert_eq!(data.timeout_strategy, TimeoutStrategy::Cancel); - assert_eq!(data.connections.get(), 2); + assert_eq!(data.connections.get(), 1); // Defaults applied when optional fields are omitted - let data = VmSendMigrationData::parse("destination_url=tcp://192.168.1.1:8080") + let data = VmSendMigrationData::parse("destination_url=tcp:192.168.1.1:8080") .expect("minimal migration string should parse"); - assert_eq!(data.destination_url, "tcp://192.168.1.1:8080"); + assert_eq!(data.destination_url, "tcp:192.168.1.1:8080"); assert!(!data.local); assert_eq!(data.downtime_ms, VmSendMigrationData::default_downtime_ms()); assert_eq!(data.timeout_s, VmSendMigrationData::default_timeout_s()); @@ -1725,7 +1777,7 @@ mod unit_tests { // Zero downtime_ms is rejected let _data = - VmSendMigrationData::parse("destination_url=tcp://192.168.1.1:8080,downtime_ms=0") + VmSendMigrationData::parse("destination_url=tcp:192.168.1.1:8080,downtime_ms=0") .expect_err("zero downtime_ms should be rejected"); // Zero timeout_s is rejected @@ -1744,18 +1796,28 @@ mod unit_tests { // Timeout strategy let _data = VmSendMigrationData::parse( - "destination_url=tcp://192.168.1.1:8080,timeout_strategy=invalid", + "destination_url=tcp:192.168.1.1:8080,timeout_strategy=invalid", ) - 
.expect_err("zero downtime_ms should be rejected"); + .expect_err("invalid timeout strategy should be rejected"); + + // Invalid destination URL scheme is rejected + VmSendMigrationData::parse("destination_url=file:///tmp/migration").unwrap_err(); + + // Local migration requires a UNIX socket destination + VmSendMigrationData::parse("destination_url=tcp:192.168.1.1:8080,local=yes").unwrap_err(); + + // Local migration cannot use multiple connections + VmSendMigrationData::parse("destination_url=unix:/tmp/sock,local=yes,connections=2") + .unwrap_err(); // Happy path with some defaults let data = - VmSendMigrationData::parse("destination_url=tcp://192.168.1.1:8080,downtime_ms=150") + VmSendMigrationData::parse("destination_url=tcp:192.168.1.1:8080,downtime_ms=150") .unwrap(); assert_eq!( data, VmSendMigrationData { - destination_url: "tcp://192.168.1.1:8080".to_string(), + destination_url: "tcp:192.168.1.1:8080".to_string(), local: false, downtime_ms: NonZeroU64::new(150).unwrap(), timeout_s: VmSendMigrationData::default_timeout_s(), @@ -1766,12 +1828,12 @@ mod unit_tests { // Happy path, fully specified let data = - VmSendMigrationData::parse("destination_url=tcp://192.168.1.1:8080,downtime_ms=150,timeout_s=900,timeout_strategy=ignore,connections=4") + VmSendMigrationData::parse("destination_url=tcp:192.168.1.1:8080,downtime_ms=150,timeout_s=900,timeout_strategy=ignore,connections=4") .unwrap(); assert_eq!( data, VmSendMigrationData { - destination_url: "tcp://192.168.1.1:8080".to_string(), + destination_url: "tcp:192.168.1.1:8080".to_string(), local: false, downtime_ms: NonZeroU64::new(150).unwrap(), timeout_s: NonZeroU64::new(900).unwrap(), diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index f1226090c9..ce5664249b 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -17,7 +17,7 @@ use std::time::Duration; use std::time::Instant; use std::{io, result, thread}; -use anyhow::anyhow; +use anyhow::{Context, anyhow}; #[cfg(feature = "dbus_api")] use 
api::dbus::{DBusApiOptions, DBusApiShutdownChannels}; use api::http::HttpApiHandle; @@ -2387,6 +2387,11 @@ impl RequestHandler for Vmm { &mut self, send_data_migration: VmSendMigrationData, ) -> result::Result<(), MigratableError> { + send_data_migration + .validate() + .context("Invalid send migration configuration") + .map_err(MigratableError::MigrateSend)?; + info!( "Sending migration: destination_url={},local={},downtime={}ms,timeout={}s,timeout_strategy={:?}", send_data_migration.destination_url, From ecddc6f84297484b60704feed1bde9b71aa51115 Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Wed, 1 Apr 2026 16:10:19 +0200 Subject: [PATCH 489/742] vmm: add upper limit for amount of parallel connections during migration Check that the amount of parallel connections does not exceed 128 and update documentation. On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- docs/live_migration.md | 4 +++ vmm/src/api/mod.rs | 21 +++++++++++-- vmm/src/api/openapi/cloud-hypervisor.yaml | 5 +++ vmm/src/migration_transport.rs | 38 ++++++++++++++++++++--- 4 files changed, 60 insertions(+), 8 deletions(-) diff --git a/docs/live_migration.md b/docs/live_migration.md index ac842d3172..81eed06665 100644 --- a/docs/live_migration.md +++ b/docs/live_migration.md @@ -207,3 +207,7 @@ migration process. Via the API or `ch-remote`, you may specify: Cancel will abort the migration and keep the VM running on the source. Ignore will proceed with the migration regardless of the downtime requirement. Defaults to `cancel`. +- `connections `: \ + The number of parallel TCP connections to use for migration. + Must be between `1` and `128`. Defaults to `1`. + Multiple connections are not supported with local UNIX-socket migration. 
diff --git a/vmm/src/api/mod.rs b/vmm/src/api/mod.rs index f66fbe9ab0..e4ee7235ad 100644 --- a/vmm/src/api/mod.rs +++ b/vmm/src/api/mod.rs @@ -53,6 +53,7 @@ pub use self::http::{start_http_fd_thread, start_http_path_thread}; use crate::Error as VmmError; use crate::config::RestoreConfig; use crate::device_tree::DeviceTree; +use crate::migration_transport::MAX_MIGRATION_CONNECTIONS; use crate::vm::{Error as VmError, VmState}; use crate::vm_config::{ DeviceConfig, DiskConfig, FsConfig, GenericVhostUserConfig, NetConfig, PmemConfig, @@ -324,7 +325,9 @@ pub struct VmSendMigrationData { #[serde(default)] pub timeout_strategy: TimeoutStrategy, - /// The number of parallel connections for migration. + /// The number of parallel TCP connections for migration. + /// + /// Must be between 1 and `MAX_MIGRATION_CONNECTIONS` inclusive. #[serde(default = "VmSendMigrationData::default_connections")] pub connections: NonZeroU32, } @@ -459,6 +462,12 @@ impl VmSendMigrationData { } } + if self.connections.get() > MAX_MIGRATION_CONNECTIONS { + return Err(VmSendMigrationConfigError::ValidationError(format!( + "connections must not exceed {MAX_MIGRATION_CONNECTIONS}." 
+ ))); + } + if self.local { if !self.destination_url.starts_with("unix:") { return Err(VmSendMigrationConfigError::ValidationError( @@ -1785,8 +1794,14 @@ mod unit_tests { .expect_err("zero timeout_s should be rejected"); // Zero connections is rejected - let _data = VmSendMigrationData::parse("destination_url=unix:/tmp/sock,connections=0") - .expect_err("zero connections should be rejected"); + let _data = + VmSendMigrationData::parse("destination_url=tcp:192.168.1.1:8080,connections=0") + .expect_err("zero connections should be rejected"); + + // Excessive numbers of parallel connections are rejected + let _data = + VmSendMigrationData::parse("destination_url=tcp:192.168.1.1:8080,connections=129") + .expect_err("too many connections should be rejected"); // Unknown option is an error VmSendMigrationData::parse("destination_url=unix:/tmp/sock,unknown_field=foo").unwrap_err(); diff --git a/vmm/src/api/openapi/cloud-hypervisor.yaml b/vmm/src/api/openapi/cloud-hypervisor.yaml index c2fe5af4b9..01106a257b 100644 --- a/vmm/src/api/openapi/cloud-hypervisor.yaml +++ b/vmm/src/api/openapi/cloud-hypervisor.yaml @@ -1410,6 +1410,11 @@ components: format: int64 default: 1 minimum: 1 + maximum: 128 + description: > + The number of parallel TCP connections to use for migration. + Must be between 1 and 128. Multiple connections are not supported + with local UNIX-socket migration. VmAddUserDevice: required: diff --git a/vmm/src/migration_transport.rs b/vmm/src/migration_transport.rs index 1ee71506d8..6440dc8fd6 100644 --- a/vmm/src/migration_transport.rs +++ b/vmm/src/migration_transport.rs @@ -32,6 +32,10 @@ use vmm_sys_util::eventfd::EventFd; use crate::sync_utils::Gate; use crate::{GuestMemoryMmap, VmMigrationConfig}; +/// Hard upper bound for migration worker connections on both the sender and +/// receiver side. +pub(crate) const MAX_MIGRATION_CONNECTIONS: u32 = 128; + /// Transport-agnostic listener used to receive connections. 
#[derive(Debug)] pub(crate) enum ReceiveListener { @@ -288,12 +292,30 @@ impl ReceiveAdditionalConnections { guest_memory: &GuestMemoryAtomic, ) -> Result<(), MigratableError> { let mut threads: Vec>> = Vec::new(); - while let Some(mut socket) = listener.abortable_accept(terminate_fd)? { + let mut first_err = loop { + let socket = match listener.abortable_accept(terminate_fd) { + Ok(socket) => socket, + Err(e) => break Err(e), + }; + let Some(mut socket) = socket else { + break Ok(()); + }; + + if threads.len() >= MAX_MIGRATION_CONNECTIONS as usize { + break Err(MigratableError::MigrateReceive(anyhow!( + "Received more than {MAX_MIGRATION_CONNECTIONS} additional migration connections." + ))); + } + let guest_memory = guest_memory.clone(); - let terminate_fd = terminate_fd + let terminate_fd = match terminate_fd .try_clone() .context("Error cloning terminate fd") - .map_err(MigratableError::MigrateReceive)?; + .map_err(MigratableError::MigrateReceive) + { + Ok(terminate_fd) => terminate_fd, + Err(e) => break Err(e), + }; match thread::Builder::new() .name(format!("migrate-receive-memory-{}", threads.len()).to_owned()) @@ -303,15 +325,21 @@ impl ReceiveAdditionalConnections { Ok(t) => threads.push(t), Err(e) => { error!("Error spawning receive-memory thread: {e}"); - break; + break Err(MigratableError::MigrateReceive( + anyhow!(e).context("Error spawning receive-memory thread"), + )); } } + }; + + if first_err.is_err() { + warn!("Signaling termination due to an error while accepting connections."); + let _ = terminate_fd.write(1); } info!("Stopped accepting additional connections. Cleaning up threads."); // We only return the first error we encounter here. 
- let mut first_err = Ok(()); for thread in threads { let err = match thread.join() { Ok(Ok(())) => None, From 17919a7a8c5cb1b498045083f3bdb96a07178f01 Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Wed, 1 Apr 2026 16:58:06 +0200 Subject: [PATCH 490/742] tests: add integration test for migration with multiple TCP connections On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- cloud-hypervisor/tests/integration.rs | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index bd4168d771..fded5727ed 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -8528,6 +8528,8 @@ mod vfio { } mod live_migration { + use std::num::NonZeroU32; + use vmm::api::TimeoutStrategy; use crate::*; @@ -9701,7 +9703,11 @@ mod live_migration { .port() } - fn start_live_migration_tcp(src_api_socket: &str, dest_api_socket: &str) -> bool { + fn start_live_migration_tcp( + src_api_socket: &str, + dest_api_socket: &str, + connections: NonZeroU32, + ) -> bool { // Get an available TCP port let migration_port = get_available_port(); let host_ip = "127.0.0.1"; @@ -9723,11 +9729,14 @@ mod live_migration { thread::sleep(Duration::from_secs(1)); // Start the 'send-migration' command on the source + let connections = connections.get(); let mut send_migration = Command::new(clh_command("ch-remote")) .args([ &format!("--api-socket={src_api_socket}"), "send-migration", - &format!("destination_url=tcp:{host_ip}:{migration_port}"), + &format!( + "destination_url=tcp:{host_ip}:{migration_port},connections={connections}" + ), ]) .stdin(Stdio::null()) .stderr(Stdio::piped()) @@ -9778,7 +9787,7 @@ mod live_migration { send_success && receive_success } - fn _test_live_migration_tcp() { + fn _test_live_migration_tcp(connections: NonZeroU32) { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); let guest = 
Guest::new(Box::new(disk_config)); let kernel_path = direct_kernel_boot_path(); @@ -9860,7 +9869,7 @@ mod live_migration { } // Start TCP live migration assert!( - start_live_migration_tcp(&src_api_socket, &dest_api_socket), + start_live_migration_tcp(&src_api_socket, &dest_api_socket, connections), "Unsuccessful command: 'send-migration' or 'receive-migration'." ); }); @@ -10249,7 +10258,12 @@ mod live_migration { #[test] fn test_live_migration_tcp() { - _test_live_migration_tcp(); + _test_live_migration_tcp(NonZeroU32::new(1).unwrap()); + } + + #[test] + fn test_live_migration_tcp_parallel_connections() { + _test_live_migration_tcp(NonZeroU32::new(8).unwrap()); } #[test] From 8248650e799457fee3068b0bf3b54ffcade4bce1 Mon Sep 17 00:00:00 2001 From: Damian Barabonkov Date: Thu, 2 Apr 2026 10:15:31 -0700 Subject: [PATCH 491/742] pci: Handle dword MSI-X control writes Some guests update the MSI-X capability through a 32-bit write at offset 0 instead of a 16-bit write at offset 2. Update the cached Message Control state for that path as well so MSI-X enablement stays in sync with the guest configuration. Add a short comment documenting why the dword write path also updates the cached MSI-X Message Control state. This is important for passthrough GPUs, where MSI-X interrupts are used during NVIDIA Fabric Manager registration. Without updating the cached state on the dword write path, interrupt delivery can remain stale and GPU initialization or fabric registration can fail. Signed-off-by: Damian Barabonkov --- pci/src/vfio.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pci/src/vfio.rs b/pci/src/vfio.rs index 53c77a95c1..0fa3ae8365 100644 --- a/pci/src/vfio.rs +++ b/pci/src/vfio.rs @@ -158,6 +158,10 @@ impl VfioMsix { // Update "Message Control" word if offset == 2 && data.len() == 2 { self.bar.set_msg_ctl(LittleEndian::read_u16(data)); + } else if offset == 0 && data.len() == 4 { + // Some guests update MSI-X control through the dword config write path. 
+ self.bar + .set_msg_ctl((LittleEndian::read_u32(data) >> 16) as u16); } let new_enabled = self.bar.enabled(); From 676c0d320bf47549121c33df4848ac68449b574b Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Thu, 2 Apr 2026 19:40:10 -0700 Subject: [PATCH 492/742] build: use latest 0.6.8 mshv crates Use mshv-{ioctls, bindings} with the latest versions that fix a bug in the interrupt vector. Signed-off-by: Muminul Islam --- Cargo.lock | 8 ++++---- Cargo.toml | 4 ++-- fuzz/Cargo.lock | 4 ++-- fuzz/Cargo.toml | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ac44003adb..f60d47d317 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1336,9 +1336,9 @@ checksum = "c505b3e17ed6b70a7ed2e67fbb2c560ee327353556120d6e72f5232b6880d536" [[package]] name = "mshv-bindings" -version = "0.6.7" +version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3cbfd4f32d185152003679339751839da77c17e18fa8882a11051a236f841426" +checksum = "a94fc3871dd23738188e5bc76a1d1a5930ebcaf9308c560a7274aa62b1770594" dependencies = [ "libc", "num_enum", @@ -1350,9 +1350,9 @@ dependencies = [ [[package]] name = "mshv-ioctls" -version = "0.6.7" +version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f035616abe1e4cbc026a1a8094ff8d3900f5063fe6608309098bc745926fdfd8" +checksum = "1339723fe3a26baf4041459de20ad923e89d312c3bb25dbf9f60738c22a47f5e" dependencies = [ "libc", "mshv-bindings", diff --git a/Cargo.toml b/Cargo.toml index 67d5398f01..dab7e6c727 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -56,8 +56,8 @@ acpi_tables = "0.2.0" kvm-bindings = "0.14.0" kvm-ioctls = "0.24.0" linux-loader = "0.13.2" -mshv-bindings = "0.6.7" -mshv-ioctls = "0.6.7" +mshv-bindings = "0.6.8" +mshv-ioctls = "0.6.8" seccompiler = "0.5.0" vfio-bindings = { version = "0.6.2", default-features = false } vfio-ioctls = { version = "0.6.0", default-features = false } diff --git a/fuzz/Cargo.lock b/fuzz/Cargo.lock index
8741ab8591..dd07d241c8 100644 --- a/fuzz/Cargo.lock +++ b/fuzz/Cargo.lock @@ -789,9 +789,9 @@ dependencies = [ [[package]] name = "mshv-bindings" -version = "0.6.7" +version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3cbfd4f32d185152003679339751839da77c17e18fa8882a11051a236f841426" +checksum = "a94fc3871dd23738188e5bc76a1d1a5930ebcaf9308c560a7274aa62b1770594" dependencies = [ "libc", "num_enum", diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index 80c59be630..410a4968d7 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -25,7 +25,7 @@ libc = "0.2.183" libfuzzer-sys = "0.4.12" linux-loader = { version = "0.13.2", features = ["bzimage", "elf", "pe"] } micro_http = { git = "https://github.com/firecracker-microvm/micro-http", branch = "main" } -mshv-bindings = "0.6.7" +mshv-bindings = "0.6.8" net_util = { path = "../net_util" } seccompiler = "0.5.0" virtio-devices = { path = "../virtio-devices" } From f7f9895d579bcb5cccc8248a28ed20671d4216a4 Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Fri, 20 Mar 2026 13:07:10 +0100 Subject: [PATCH 493/742] hypervisor: kvm: preserve guest MTRR MSRs KVM_GET_MSR_INDEX_LIST does not consistently include all guest-programmable MTRR MSRs. During save/restore while booting, the VMM initially sets only MSR_MTRRdefType, then guest firmware or other early boot code can program additional MTRR state before the snapshot is taken. If those MSRs are missing from the vCPU MSR buffer, snapshot omits part of the guest's MTRR configuration and restore resumes with an incomplete MTRR map. Add the guest-programmable MTRR MSRs to the KVM MSR index list used to build the vCPU MSR buffer so the existing snapshot/restore path preserves the guest's MTRR state. 
On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- hypervisor/src/arch/x86/mod.rs | 30 ++++++++++++++++++++++++++++ hypervisor/src/arch/x86/msr_index.rs | 16 +++++++++++++++ hypervisor/src/kvm/mod.rs | 19 +++++++++++++++--- 3 files changed, 62 insertions(+), 3 deletions(-) diff --git a/hypervisor/src/arch/x86/mod.rs b/hypervisor/src/arch/x86/mod.rs index 78e4d7cc5d..45f820cde7 100644 --- a/hypervisor/src/arch/x86/mod.rs +++ b/hypervisor/src/arch/x86/mod.rs @@ -28,6 +28,36 @@ pub mod msr_index; // MTRR constants pub const MTRR_ENABLE: u64 = 0x800; // IA32_MTRR_DEF_TYPE MSR: E (MTRRs enabled) flag, bit 11 pub const MTRR_MEM_TYPE_WB: u64 = 0x6; +pub const MTRR_MSR_INDICES: [u32; 28] = [ + msr_index::MSR_MTRRdefType, + msr_index::MSR_IA32_MTRR_PHYSBASE0, + msr_index::MSR_IA32_MTRR_PHYSMASK0, + msr_index::MSR_IA32_MTRR_PHYSBASE1, + msr_index::MSR_IA32_MTRR_PHYSMASK1, + msr_index::MSR_IA32_MTRR_PHYSBASE2, + msr_index::MSR_IA32_MTRR_PHYSMASK2, + msr_index::MSR_IA32_MTRR_PHYSBASE3, + msr_index::MSR_IA32_MTRR_PHYSMASK3, + msr_index::MSR_IA32_MTRR_PHYSBASE4, + msr_index::MSR_IA32_MTRR_PHYSMASK4, + msr_index::MSR_IA32_MTRR_PHYSBASE5, + msr_index::MSR_IA32_MTRR_PHYSMASK5, + msr_index::MSR_IA32_MTRR_PHYSBASE6, + msr_index::MSR_IA32_MTRR_PHYSMASK6, + msr_index::MSR_IA32_MTRR_PHYSBASE7, + msr_index::MSR_IA32_MTRR_PHYSMASK7, + msr_index::MSR_MTRRfix64K_00000, + msr_index::MSR_MTRRfix16K_80000, + msr_index::MSR_MTRRfix16K_A0000, + msr_index::MSR_MTRRfix4K_C0000, + msr_index::MSR_MTRRfix4K_C8000, + msr_index::MSR_MTRRfix4K_D0000, + msr_index::MSR_MTRRfix4K_D8000, + msr_index::MSR_MTRRfix4K_E0000, + msr_index::MSR_MTRRfix4K_E8000, + msr_index::MSR_MTRRfix4K_F0000, + msr_index::MSR_MTRRfix4K_F8000, +]; // IOAPIC pins pub const NUM_IOAPIC_PINS: usize = 24; diff --git a/hypervisor/src/arch/x86/msr_index.rs b/hypervisor/src/arch/x86/msr_index.rs index 810fe08b9a..607ee3b2c1 100644 --- a/hypervisor/src/arch/x86/msr_index.rs +++ 
b/hypervisor/src/arch/x86/msr_index.rs @@ -85,6 +85,22 @@ pub const MSR_IA32_RTIT_ADDR3_B: ::std::os::raw::c_uint = 0x00000587; pub const MSR_IA32_RTIT_CR3_MATCH: ::std::os::raw::c_uint = 0x00000572; pub const MSR_IA32_RTIT_OUTPUT_BASE: ::std::os::raw::c_uint = 0x00000560; pub const MSR_IA32_RTIT_OUTPUT_MASK: ::std::os::raw::c_uint = 0x00000561; +pub const MSR_IA32_MTRR_PHYSBASE0: ::std::os::raw::c_uint = 0x00000200; +pub const MSR_IA32_MTRR_PHYSMASK0: ::std::os::raw::c_uint = 0x00000201; +pub const MSR_IA32_MTRR_PHYSBASE1: ::std::os::raw::c_uint = 0x00000202; +pub const MSR_IA32_MTRR_PHYSMASK1: ::std::os::raw::c_uint = 0x00000203; +pub const MSR_IA32_MTRR_PHYSBASE2: ::std::os::raw::c_uint = 0x00000204; +pub const MSR_IA32_MTRR_PHYSMASK2: ::std::os::raw::c_uint = 0x00000205; +pub const MSR_IA32_MTRR_PHYSBASE3: ::std::os::raw::c_uint = 0x00000206; +pub const MSR_IA32_MTRR_PHYSMASK3: ::std::os::raw::c_uint = 0x00000207; +pub const MSR_IA32_MTRR_PHYSBASE4: ::std::os::raw::c_uint = 0x00000208; +pub const MSR_IA32_MTRR_PHYSMASK4: ::std::os::raw::c_uint = 0x00000209; +pub const MSR_IA32_MTRR_PHYSBASE5: ::std::os::raw::c_uint = 0x0000020a; +pub const MSR_IA32_MTRR_PHYSMASK5: ::std::os::raw::c_uint = 0x0000020b; +pub const MSR_IA32_MTRR_PHYSBASE6: ::std::os::raw::c_uint = 0x0000020c; +pub const MSR_IA32_MTRR_PHYSMASK6: ::std::os::raw::c_uint = 0x0000020d; +pub const MSR_IA32_MTRR_PHYSBASE7: ::std::os::raw::c_uint = 0x0000020e; +pub const MSR_IA32_MTRR_PHYSMASK7: ::std::os::raw::c_uint = 0x0000020f; pub const MSR_MTRRfix64K_00000: ::std::os::raw::c_uint = 0x00000250; pub const MSR_MTRRfix16K_80000: ::std::os::raw::c_uint = 0x00000258; pub const MSR_MTRRfix16K_A0000: ::std::os::raw::c_uint = 0x00000259; diff --git a/hypervisor/src/kvm/mod.rs b/hypervisor/src/kvm/mod.rs index d18785fd90..8b21002d1d 100644 --- a/hypervisor/src/kvm/mod.rs +++ b/hypervisor/src/kvm/mod.rs @@ -70,7 +70,8 @@ pub use x86_64::{CpuId, ExtendedControlRegisters, MsrEntries, VcpuKvmState}; use 
crate::ClockData; #[cfg(target_arch = "x86_64")] use crate::arch::x86::{ - CpuIdEntry, FpuState, LapicState, MsrEntry, NUM_IOAPIC_PINS, SpecialRegisters, XsaveState, + CpuIdEntry, FpuState, LapicState, MTRR_MSR_INDICES, MsrEntry, NUM_IOAPIC_PINS, + SpecialRegisters, XsaveState, }; use crate::{CpuState, IoEventAddress, IrqRoutingEntry, MpState, StandardRegisters}; // aarch64 dependencies @@ -1128,9 +1129,21 @@ impl KvmHypervisor { /// Retrieve the list of MSRs supported by the hypervisor. /// fn get_msr_list(&self) -> hypervisor::Result { - self.kvm + let mut indices = self + .kvm .get_msr_index_list() - .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into())) + .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into()))? + .as_slice() + .to_vec(); + + // KVM_GET_MSR_INDEX_LIST does not include MTRR MSRs, but firmware may update them before an early boot snapshot. + indices.extend(MTRR_MSR_INDICES); + + let mut msr_list = MsrList::new(indices.len()) + .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into()))?; + msr_list.as_mut_slice().copy_from_slice(&indices); + + Ok(msr_list) } } From f56c8392ea48b43620be238c6650f3ed15184cb0 Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Thu, 2 Apr 2026 20:52:21 +0000 Subject: [PATCH 494/742] virtio-devices: vsock: RST vsocks on snapshot restore Otherwise guest connections just hang. 
Signed-off-by: Peter Oskolkov --- virtio-devices/src/vsock/device.rs | 8 +++++++- virtio-devices/src/vsock/mod.rs | 7 ++++++- virtio-devices/src/vsock/unix/muxer.rs | 18 +++++++++++++++++- 3 files changed, 30 insertions(+), 3 deletions(-) diff --git a/virtio-devices/src/vsock/device.rs b/virtio-devices/src/vsock/device.rs index 6d38ecf398..25412503af 100644 --- a/virtio-devices/src/vsock/device.rs +++ b/virtio-devices/src/vsock/device.rs @@ -323,6 +323,8 @@ pub struct Vsock { pub struct VsockState { pub avail_features: u64, pub acked_features: u64, + #[serde(default)] + pub connections: Vec<(u32, u32)>, } impl Vsock @@ -336,7 +338,7 @@ where id: String, cid: u32, path: PathBuf, - backend: B, + mut backend: B, iommu: bool, seccomp_action: SeccompAction, exit_evt: EventFd, @@ -344,6 +346,9 @@ where ) -> io::Result> { let (avail_features, acked_features, paused) = if let Some(state) = state { info!("Restoring virtio-vsock {id}"); + // Instead of letting the guest connection hang/timeout, proactively let + // the guest know the connection is gone. + backend.queue_rst_for_connections(state.connections.clone()); (state.avail_features, state.acked_features, true) } else { let mut avail_features = (1u64 << VIRTIO_F_VERSION_1) | (1u64 << VIRTIO_F_IN_ORDER); @@ -378,6 +383,7 @@ where VsockState { avail_features: self.common.avail_features, acked_features: self.common.acked_features, + connections: self.backend.read().unwrap().connections(), } } diff --git a/virtio-devices/src/vsock/mod.rs b/virtio-devices/src/vsock/mod.rs index 34561f5d46..cc1ef1ad23 100644 --- a/virtio-devices/src/vsock/mod.rs +++ b/virtio-devices/src/vsock/mod.rs @@ -158,7 +158,12 @@ pub trait VsockChannel { /// It that needs to be sendable through a mpsc channel (the latter due to how `vmm::EpollContext` works). /// Currently, the only implementation we have is `crate::virtio::unix::muxer::VsockMuxer`, which /// translates guest-side vsock connections to host-side Unix domain socket connections. 
-pub trait VsockBackend: VsockChannel + VsockEpollListener + Send {} +pub trait VsockBackend: VsockChannel + VsockEpollListener + Send { + fn connections(&self) -> Vec<(u32, u32)> { + Vec::new() + } + fn queue_rst_for_connections(&mut self, _conns: Vec<(u32, u32)>) {} +} #[cfg(any(test, fuzzing))] pub mod unit_tests { diff --git a/virtio-devices/src/vsock/unix/muxer.rs b/virtio-devices/src/vsock/unix/muxer.rs index 1a8570b75d..499c68eac1 100644 --- a/virtio-devices/src/vsock/unix/muxer.rs +++ b/virtio-devices/src/vsock/unix/muxer.rs @@ -345,7 +345,23 @@ impl VsockEpollListener for VsockMuxer { } } -impl VsockBackend for VsockMuxer {} +impl VsockBackend for VsockMuxer { + fn connections(&self) -> Vec<(u32, u32)> { + self.conn_map + .keys() + .map(|k| (k.local_port, k.peer_port)) + .collect() + } + + fn queue_rst_for_connections(&mut self, conns: Vec<(u32, u32)>) { + for (local_port, peer_port) in conns { + self.rxq.push(MuxerRx::RstPkt { + local_port, + peer_port, + }); + } + } +} impl VsockMuxer { /// Muxer constructor. From aef0a43b52c2c6a688e2235fc18cddfc604d51eb Mon Sep 17 00:00:00 2001 From: Max Makarov Date: Sat, 4 Apr 2026 20:32:49 +0000 Subject: [PATCH 495/742] vdpa: fix RX failure after device reset by always using base 0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After a vDPA device reset, activate_vdpa() read avail_idx from guest memory to pass as the vring base via VHOST_SET_VRING_BASE. However, the guest memory still contained the stale avail_idx from the previous session. For a 256-entry ring, this meant base=256, causing the hardware to believe the entire RX ring was consumed with no available buffers — RX silently stopped while TX continued to work. QEMU handles this correctly by tracking last_avail_idx internally (reset to 0 in virtio_reset()) and passing that value, rather than reading from guest memory. Fix by always passing base=0 to set_vring_base(). 
After a device reset, both the guest driver and the vhost backend restart their rings from index 0. For live migration, the correct base should come from VHOST_GET_VRING_BASE (saved before the migration), not guest memory. Tested with mlx5_vdpa (ConnectX-6 Dx) + Windows Server 2025 (netkvm). Before: RX=0 after 3rd driver activation. After: full connectivity. Signed-off-by: Max Makarov --- virtio-devices/src/vdpa.rs | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/virtio-devices/src/vdpa.rs b/virtio-devices/src/vdpa.rs index 4773ef55fa..7cd3415181 100644 --- a/virtio-devices/src/vdpa.rs +++ b/virtio-devices/src/vdpa.rs @@ -217,7 +217,7 @@ impl Vdpa { fn activate_vdpa( &mut self, - mem: &GuestMemoryMmap, + _mem: &GuestMemoryMmap, virtio_interrupt: &dyn VirtioInterrupt, queues: &[(usize, Queue, EventFd)], ) -> Result<()> { @@ -269,13 +269,7 @@ impl Vdpa { self.vhost .as_ref() .unwrap() - .set_vring_base( - *queue_index, - queue - .avail_idx(mem, Ordering::Acquire) - .map_err(Error::GetAvailableIndex)? - .0, - ) + .set_vring_base(*queue_index, 0) .map_err(Error::SetVringBase)?; if let Some(eventfd) = From 77ce3f6cbfa08a2a695de1760575f35670418347 Mon Sep 17 00:00:00 2001 From: Jared White Date: Sun, 5 Apr 2026 21:36:07 -0700 Subject: [PATCH 496/742] vmm: memory_actual_size reflects hotplug state It is desirable to be able to track the progress of memory hotplug. Update the memory_actual_size field to query the current plugged size from virtio-mem to enable this. 
Signed-off-by: Jared White --- virtio-devices/src/mem.rs | 4 ++++ vmm/src/config.rs | 31 +++++++++++++++++-------------- vmm/src/lib.rs | 4 +++- vmm/src/memory_manager.rs | 13 +++++++++++++ vmm/src/vm.rs | 8 ++++++++ 5 files changed, 45 insertions(+), 15 deletions(-) diff --git a/virtio-devices/src/mem.rs b/virtio-devices/src/mem.rs index aed8ed48d2..067100164e 100644 --- a/virtio-devices/src/mem.rs +++ b/virtio-devices/src/mem.rs @@ -834,6 +834,10 @@ impl Mem { }) } + pub fn plugged_size(&self) -> u64 { + self.config.lock().unwrap().plugged_size + } + pub fn resize(&mut self, size: u64) -> result::Result<(), Error> { let mut config = self.config.lock().unwrap(); config.resize(size).map_err(|e| { diff --git a/vmm/src/config.rs b/vmm/src/config.rs index 93ec6c8915..fc7eb1b8d1 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -1039,21 +1039,24 @@ impl MemoryConfig { } pub fn total_size(&self) -> u64 { - let mut size = self.size; - if let Some(hotplugged_size) = self.hotplugged_size { - size += hotplugged_size; - } - - if let Some(zones) = &self.zones { - for zone in zones.iter() { - size += zone.size; - if let Some(hotplugged_size) = zone.hotplugged_size { - size += hotplugged_size; - } - } - } + self.size + + self + .zones + .iter() + .flatten() + .map(|zone| zone.size) + .sum::() + + self.hotplugged_size() + } - size + pub fn hotplugged_size(&self) -> u64 { + self.hotplugged_size.unwrap_or(0) + + self + .zones + .iter() + .flatten() + .filter_map(|zone| zone.hotplugged_size) + .sum::() } } diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index ce5664249b..92bf4c6b70 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -1915,9 +1915,11 @@ impl RequestHandler for Vmm { }; let config = vm_config.lock().unwrap().clone(); - let mut memory_actual_size = config.memory.total_size(); + let mut memory_actual_size = + config.memory.total_size() - config.memory.hotplugged_size(); if let Some(vm) = &self.vm { memory_actual_size = 
memory_actual_size.saturating_sub(vm.balloon_size()); + memory_actual_size += vm.virtio_mem_plugged_size(); } let device_tree = self diff --git a/vmm/src/memory_manager.rs b/vmm/src/memory_manager.rs index 62b4522cc2..b2058435b4 100644 --- a/vmm/src/memory_manager.rs +++ b/vmm/src/memory_manager.rs @@ -2420,6 +2420,19 @@ impl MemoryManager { unsafe { (*stat.as_ptr()).st_nlink as usize > 0 } } + pub fn virtio_mem_plugged_size(&self) -> u64 { + self.memory_zones + .values() + .filter_map(|zone| { + zone.virtio_mem_zone + .as_ref()? + .virtio_device + .as_ref() + .map(|dev| dev.lock().unwrap().plugged_size()) + }) + .sum() + } + pub fn memory_zones(&self) -> &MemoryZones { &self.memory_zones } diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index c0eaba7ed6..1f3bcfeebd 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -2822,6 +2822,14 @@ impl Vm { self.device_manager.lock().unwrap().balloon_size() } + /// Get the actual size of the virtio_mem regions + pub fn virtio_mem_plugged_size(&self) -> u64 { + self.memory_manager + .lock() + .unwrap() + .virtio_mem_plugged_size() + } + pub fn send_memory_fds( &mut self, socket: &mut UnixStream, From fa949678d1c82362e23da3d8959cfa0ab26e6f64 Mon Sep 17 00:00:00 2001 From: Anirudh Rayabharam Date: Mon, 6 Apr 2026 14:21:58 +0000 Subject: [PATCH 497/742] scripts: build mshv feature too for dbus, fw_cfg & ivshmem tests The aarch64 dbus, fw_cfg & ivshmem tests don't build the mshv feature causing them to fail when run on MSHV. Fix by building the mshv feature too just like the x86 version of the script does. 
Signed-off-by: Anirudh Rayabharam --- scripts/run_integration_tests_aarch64.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/run_integration_tests_aarch64.sh b/scripts/run_integration_tests_aarch64.sh index b4c3482d0e..9dac761e98 100755 --- a/scripts/run_integration_tests_aarch64.sh +++ b/scripts/run_integration_tests_aarch64.sh @@ -285,7 +285,7 @@ fi # Run tests on dbus_api if [ $RES -eq 0 ]; then - cargo build --features "dbus_api" --all --release --target "$BUILD_TARGET" + cargo build --features "mshv,dbus_api" --all --release --target "$BUILD_TARGET" export RUST_BACKTRACE=1 # integration tests now do not reply on build feature "dbus_api" time cargo nextest run $test_features --retries 3 --no-fail-fast --no-tests=pass --test-threads=$(($(nproc) / 4)) "dbus_api::$test_filter" -- ${test_binary_args[*]} @@ -294,14 +294,14 @@ fi # Run tests on fw_cfg if [ $RES -eq 0 ]; then - cargo build --features "fw_cfg" --all --release --target "$BUILD_TARGET" + cargo build --features "mshv,fw_cfg" --all --release --target "$BUILD_TARGET" export RUST_BACKTRACE=1 time cargo nextest run $test_features --retries 3 --no-fail-fast --no-tests=pass --test-threads=$(($(nproc) / 4)) "fw_cfg::$test_filter" -- ${test_binary_args[*]} RES=$? fi if [ $RES -eq 0 ]; then - cargo build --features "ivshmem" --all --release --target "$BUILD_TARGET" + cargo build --features "mshv,ivshmem" --all --release --target "$BUILD_TARGET" export RUST_BACKTRACE=1 time cargo nextest run $test_features --retries 3 --no-fail-fast --no-tests=pass --test-threads=$(($(nproc) / 4)) "ivshmem::$test_filter" -- ${test_binary_args[*]} From 84c5e48bdb0dfc5028ac34a2ec80e2076c1088ea Mon Sep 17 00:00:00 2001 From: Thomas Prescher Date: Thu, 9 Apr 2026 14:40:18 +0200 Subject: [PATCH 498/742] scripts: use latest ovmf version for x86_64 This commit bumps ovmf to ch-13b4963ec4 [1]. 
[1] https://github.com/cloud-hypervisor/edk2/releases/tag/ch-13b4963ec4 On-behalf-of: SAP thomas.prescher@sap.com Signed-off-by: Thomas Prescher --- scripts/sha1sums-x86_64 | 2 +- scripts/test-util.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/sha1sums-x86_64 b/scripts/sha1sums-x86_64 index c49f00b266..1f7500e8b4 100644 --- a/scripts/sha1sums-x86_64 +++ b/scripts/sha1sums-x86_64 @@ -1,3 +1,3 @@ d4a44acc6014d5f83dea1c625c43d677a95fa75f alpine-minirootfs-x86_64.tar.gz 540ac358429305d7aa94e15363665d1c9d845982 hypervisor-fw -4e96fd0914a44005d40707b2b0c7e829e4086bd5 CLOUDHV.fd +cf89e3e052c8ef0b6192abee6128eef943393307 CLOUDHV.fd diff --git a/scripts/test-util.sh b/scripts/test-util.sh index 5b414f8583..2ad10d12fb 100644 --- a/scripts/test-util.sh +++ b/scripts/test-util.sh @@ -199,7 +199,7 @@ prepare_linux() { } download_ovmf() { - OVMF_FW_TAG="ch-a54f262b09" + OVMF_FW_TAG="ch-13b4963ec4" OVMF_FW_URL="https://github.com/cloud-hypervisor/edk2/releases/download/$OVMF_FW_TAG/CLOUDHV.fd" OVMF_FW="$WORKLOADS_DIR/CLOUDHV.fd" pushd "$WORKLOADS_DIR" || exit From 474106a067627b04b2d379856e113c6d97ba3802 Mon Sep 17 00:00:00 2001 From: JP Kobryn Date: Wed, 8 Apr 2026 17:36:00 -0700 Subject: [PATCH 499/742] vmm: skip uefi allocation on direct boot A 4M uefi_region is allocated unconditionally. When directly booting a kernel, it goes unused. Avoid the allocation in this case by moving the call to add_uefi_flash() to load_firmware(). Also extended add_uefi_flash() to riscv64 since it shares the load_firmware() path. It looked like up to this point a firmware boot on riscv64 would panic with an uninitialized uefi_flash. 
Signed-off-by: JP Kobryn --- vmm/src/memory_manager.rs | 4 ++-- vmm/src/vm.rs | 14 +++++--------- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/vmm/src/memory_manager.rs b/vmm/src/memory_manager.rs index b2058435b4..3a57ea60a4 100644 --- a/vmm/src/memory_manager.rs +++ b/vmm/src/memory_manager.rs @@ -1372,9 +1372,9 @@ impl MemoryManager { Ok(()) } - #[cfg(target_arch = "aarch64")] + #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] pub fn add_uefi_flash(&mut self) -> Result<(), Error> { - // On AArch64, the UEFI binary requires a flash device at address 0. + // The UEFI binary requires a flash device at address 0. // 4 MiB memory is mapped to simulate the flash. let uefi_mem_slot = self.allocate_memory_slot(); let uefi_region = GuestRegionMmap::new( diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index 1f3bcfeebd..04bd2d595a 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -899,14 +899,6 @@ impl Vm { .allocate_address_space() .map_err(Error::MemoryManager)?; - // Add UEFI flash for aarch64 - #[cfg(target_arch = "aarch64")] - memory_manager - .lock() - .unwrap() - .add_uefi_flash() - .map_err(Error::MemoryManager)?; - // Load payload asynchronously let load_payload_handle = if snapshot.is_none() { Self::load_payload_async( @@ -1405,7 +1397,11 @@ impl Vm { mut firmware: &File, memory_manager: Arc>, ) -> Result { - let uefi_flash = memory_manager.lock().as_ref().unwrap().uefi_flash(); + let mut memory_manager = memory_manager.lock().unwrap(); + memory_manager + .add_uefi_flash() + .map_err(Error::MemoryManager)?; + let uefi_flash = memory_manager.uefi_flash(); let mem = uefi_flash.memory(); arch::uefi::load_uefi(mem.deref(), arch::layout::UEFI_START, &mut firmware) .map_err(Error::UefiLoad)?; From de601c5eaa7651762ddba4b32772768eb23a2800 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 10 Apr 2026 00:04:48 +0000 Subject: [PATCH 500/742] build(deps): bump the non-rust-vmm group 
across 2 directories with 10 updates Bumps the non-rust-vmm group with 9 updates in the / directory: | Package | From | To | | --- | --- | --- | | [libc](https://github.com/rust-lang/libc) | `0.2.183` | `0.2.184` | | [signal-hook](https://github.com/vorner/signal-hook) | `0.4.3` | `0.4.4` | | [arc-swap](https://github.com/vorner/arc-swap) | `1.9.0` | `1.9.1` | | [cc](https://github.com/rust-lang/cc-rs) | `1.2.58` | `1.2.59` | | [indexmap](https://github.com/indexmap-rs/indexmap) | `2.13.0` | `2.13.1` | | [libz-sys](https://github.com/rust-lang/libz-sys) | `1.1.25` | `1.1.28` | | [semver](https://github.com/dtolnay/semver) | `1.0.27` | `1.0.28` | | [toml_datetime](https://github.com/toml-rs/toml) | `1.1.0+spec-1.1.0` | `1.1.1+spec-1.1.0` | | [toml_edit](https://github.com/toml-rs/toml) | `0.25.8+spec-1.1.0` | `0.25.10+spec-1.1.0` | Bumps the non-rust-vmm group with 8 updates in the /fuzz directory: | Package | From | To | | --- | --- | --- | | [libc](https://github.com/rust-lang/libc) | `0.2.183` | `0.2.184` | | [signal-hook](https://github.com/vorner/signal-hook) | `0.4.3` | `0.4.4` | | [arc-swap](https://github.com/vorner/arc-swap) | `1.9.0` | `1.9.1` | | [cc](https://github.com/rust-lang/cc-rs) | `1.2.58` | `1.2.59` | | [indexmap](https://github.com/indexmap-rs/indexmap) | `2.13.0` | `2.13.1` | | [semver](https://github.com/dtolnay/semver) | `1.0.27` | `1.0.28` | | [toml_datetime](https://github.com/toml-rs/toml) | `1.1.0+spec-1.1.0` | `1.1.1+spec-1.1.0` | | [toml_edit](https://github.com/toml-rs/toml) | `0.25.8+spec-1.1.0` | `0.25.10+spec-1.1.0` | Updates `libc` from 0.2.183 to 0.2.184 - [Release notes](https://github.com/rust-lang/libc/releases) - [Changelog](https://github.com/rust-lang/libc/blob/0.2.184/CHANGELOG.md) - [Commits](https://github.com/rust-lang/libc/compare/0.2.183...0.2.184) Updates `signal-hook` from 0.4.3 to 0.4.4 - [Changelog](https://github.com/vorner/signal-hook/blob/master/CHANGELOG.md) - 
[Commits](https://github.com/vorner/signal-hook/compare/v0.4.3...v0.4.4) Updates `arc-swap` from 1.9.0 to 1.9.1 - [Changelog](https://github.com/vorner/arc-swap/blob/master/CHANGELOG.md) - [Commits](https://github.com/vorner/arc-swap/compare/v1.9.0...v1.9.1) Updates `cc` from 1.2.58 to 1.2.59 - [Release notes](https://github.com/rust-lang/cc-rs/releases) - [Changelog](https://github.com/rust-lang/cc-rs/blob/main/CHANGELOG.md) - [Commits](https://github.com/rust-lang/cc-rs/compare/cc-v1.2.58...cc-v1.2.59) Updates `indexmap` from 2.13.0 to 2.13.1 - [Changelog](https://github.com/indexmap-rs/indexmap/blob/main/RELEASES.md) - [Commits](https://github.com/indexmap-rs/indexmap/compare/2.13.0...2.13.1) Updates `libz-sys` from 1.1.25 to 1.1.28 - [Release notes](https://github.com/rust-lang/libz-sys/releases) - [Commits](https://github.com/rust-lang/libz-sys/compare/1.1.25...1.1.28) Updates `semver` from 1.0.27 to 1.0.28 - [Release notes](https://github.com/dtolnay/semver/releases) - [Commits](https://github.com/dtolnay/semver/compare/1.0.27...1.0.28) Updates `toml_datetime` from 1.1.0+spec-1.1.0 to 1.1.1+spec-1.1.0 - [Commits](https://github.com/toml-rs/toml/compare/toml_datetime-v1.1.0...toml_datetime-v1.1.1) Updates `toml_edit` from 0.25.8+spec-1.1.0 to 0.25.10+spec-1.1.0 - [Commits](https://github.com/toml-rs/toml/compare/v0.25.8...v0.25.10) Updates `toml_parser` from 1.1.0+spec-1.1.0 to 1.1.2+spec-1.1.0 - [Commits](https://github.com/toml-rs/toml/compare/toml_parser-v1.1.0...toml_parser-v1.1.2) Updates `libc` from 0.2.183 to 0.2.184 - [Release notes](https://github.com/rust-lang/libc/releases) - [Changelog](https://github.com/rust-lang/libc/blob/0.2.184/CHANGELOG.md) - [Commits](https://github.com/rust-lang/libc/compare/0.2.183...0.2.184) Updates `signal-hook` from 0.4.3 to 0.4.4 - [Changelog](https://github.com/vorner/signal-hook/blob/master/CHANGELOG.md) - [Commits](https://github.com/vorner/signal-hook/compare/v0.4.3...v0.4.4) Updates `arc-swap` from 1.9.0 to 1.9.1 
- [Changelog](https://github.com/vorner/arc-swap/blob/master/CHANGELOG.md) - [Commits](https://github.com/vorner/arc-swap/compare/v1.9.0...v1.9.1) Updates `cc` from 1.2.58 to 1.2.59 - [Release notes](https://github.com/rust-lang/cc-rs/releases) - [Changelog](https://github.com/rust-lang/cc-rs/blob/main/CHANGELOG.md) - [Commits](https://github.com/rust-lang/cc-rs/compare/cc-v1.2.58...cc-v1.2.59) Updates `indexmap` from 2.13.0 to 2.13.1 - [Changelog](https://github.com/indexmap-rs/indexmap/blob/main/RELEASES.md) - [Commits](https://github.com/indexmap-rs/indexmap/compare/2.13.0...2.13.1) Updates `semver` from 1.0.27 to 1.0.28 - [Release notes](https://github.com/dtolnay/semver/releases) - [Commits](https://github.com/dtolnay/semver/compare/1.0.27...1.0.28) Updates `toml_datetime` from 1.1.0+spec-1.1.0 to 1.1.1+spec-1.1.0 - [Commits](https://github.com/toml-rs/toml/compare/toml_datetime-v1.1.0...toml_datetime-v1.1.1) Updates `toml_edit` from 0.25.8+spec-1.1.0 to 0.25.10+spec-1.1.0 - [Commits](https://github.com/toml-rs/toml/compare/v0.25.8...v0.25.10) Updates `toml_parser` from 1.1.0+spec-1.1.0 to 1.1.2+spec-1.1.0 - [Commits](https://github.com/toml-rs/toml/compare/toml_parser-v1.1.0...toml_parser-v1.1.2) --- updated-dependencies: - dependency-name: libc dependency-version: 0.2.184 dependency-type: direct:production update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: signal-hook dependency-version: 0.4.4 dependency-type: direct:production update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: arc-swap dependency-version: 1.9.1 dependency-type: direct:production update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: cc dependency-version: 1.2.59 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: indexmap dependency-version: 2.13.1 dependency-type: indirect update-type: 
version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: libz-sys dependency-version: 1.1.28 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: semver dependency-version: 1.0.28 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: toml_datetime dependency-version: 1.1.1+spec-1.1.0 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: toml_edit dependency-version: 0.25.10+spec-1.1.0 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: toml_parser dependency-version: 1.1.2+spec-1.1.0 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: libc dependency-version: 0.2.184 dependency-type: direct:production update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: signal-hook dependency-version: 0.4.4 dependency-type: direct:production update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: arc-swap dependency-version: 1.9.1 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: cc dependency-version: 1.2.59 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: indexmap dependency-version: 2.13.1 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: semver dependency-version: 1.0.28 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: toml_datetime dependency-version: 1.1.1+spec-1.1.0 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: toml_edit 
dependency-version: 0.25.10+spec-1.1.0 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm - dependency-name: toml_parser dependency-version: 1.1.2+spec-1.1.0 dependency-type: indirect update-type: version-update:semver-patch dependency-group: non-rust-vmm ... Signed-off-by: dependabot[bot] --- Cargo.lock | 46 +++++++++++++++++++++---------------------- Cargo.toml | 4 ++-- fuzz/Cargo.lock | 42 +++++++++++++++++++-------------------- fuzz/Cargo.toml | 2 +- hypervisor/Cargo.toml | 2 +- 5 files changed, 48 insertions(+), 48 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f60d47d317..bc927af8c4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -101,9 +101,9 @@ dependencies = [ [[package]] name = "arc-swap" -version = "1.9.0" +version = "1.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a07d1f37ff60921c83bdfc7407723bdefe89b44b98a9b772f225c8f9d67141a6" +checksum = "6a3a1fd6f75306b68087b831f025c712524bcb19aad54e557b1129cfa0a2b207" dependencies = [ "rustversion", ] @@ -370,9 +370,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "cc" -version = "1.2.58" +version = "1.2.59" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1e928d4b69e3077709075a938a05ffbedfa53a84c8f766efbf8220bb1ff60e1" +checksum = "b7a4d3ec6524d28a329fc53654bbadc9bdd7b0431f5d65f1a56ffb28a1ee5283" dependencies = [ "find-msvc-tools", "jobserver", @@ -970,9 +970,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.16.1" +version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51" [[package]] name = "heck" @@ -1076,12 +1076,12 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.13.0" +version = "2.14.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017" +checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" dependencies = [ "equivalent", - "hashbrown 0.16.1", + "hashbrown 0.17.0", "serde", "serde_core", ] @@ -1219,9 +1219,9 @@ checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" [[package]] name = "libc" -version = "0.2.183" +version = "0.2.184" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5b646652bf6661599e1da8901b3b9522896f01e736bad5f723fe7a3a27f899d" +checksum = "48f5d2a454e16a5ea0f4ced81bd44e4cfc7bd3a507b61887c99fd3538b28e4af" [[package]] name = "libredox" @@ -1248,9 +1248,9 @@ dependencies = [ [[package]] name = "libz-sys" -version = "1.1.25" +version = "1.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d52f4c29e2a68ac30c9087e1b772dc9f44a2b66ed44edf2266cf2be9b03dafc1" +checksum = "fc3a226e576f50782b3305c5ccf458698f92798987f551c6a02efe8276721e22" dependencies = [ "cc", "libc", @@ -1909,9 +1909,9 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.27" +version = "1.0.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" +checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd" [[package]] name = "serde" @@ -2001,9 +2001,9 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "signal-hook" -version = "0.4.3" +version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b57709da74f9ff9f4a27dce9526eec25ca8407c45a7887243b031a58935fb8e" +checksum = "b2a0c28ca5908dbdbcd52e6fdaa00358ab88637f8ab33e1f188dd510eb44b53d" dependencies = [ "libc", "signal-hook-registry", @@ -2147,18 +2147,18 @@ checksum = "3bf63baf9f5039dadc247375c29eb13706706cfde997d0330d05aa63a77d8820" 
[[package]] name = "toml_datetime" -version = "1.1.0+spec-1.1.0" +version = "1.1.1+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97251a7c317e03ad83774a8752a7e81fb6067740609f75ea2b585b569a59198f" +checksum = "3165f65f62e28e0115a00b2ebdd37eb6f3b641855f9d636d3cd4103767159ad7" dependencies = [ "serde_core", ] [[package]] name = "toml_edit" -version = "0.25.8+spec-1.1.0" +version = "0.25.11+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16bff38f1d86c47f9ff0647e6838d7bb362522bdf44006c7068c2b1e606f1f3c" +checksum = "0b59c4d22ed448339746c59b905d24568fcbb3ab65a500494f7b8c3e97739f2b" dependencies = [ "indexmap", "toml_datetime", @@ -2168,9 +2168,9 @@ dependencies = [ [[package]] name = "toml_parser" -version = "1.1.0+spec-1.1.0" +version = "1.1.2+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2334f11ee363607eb04df9b8fc8a13ca1715a72ba8662a26ac285c98aabb4011" +checksum = "a2abe9b86193656635d2411dc43050282ca48aa31c2451210f4202550afb7526" dependencies = [ "winnow 1.0.0", ] diff --git a/Cargo.toml b/Cargo.toml index dab7e6c727..6ce471501d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -91,9 +91,9 @@ env_logger = "0.11.10" epoll = "4.4.0" flume = "0.12.0" itertools = "0.14.0" -libc = "0.2.183" +libc = "0.2.184" log = "0.4.29" -signal-hook = "0.4.3" +signal-hook = "0.4.4" thiserror = "2.0.18" uuid = { version = "1.23.0" } wait-timeout = "0.2.1" diff --git a/fuzz/Cargo.lock b/fuzz/Cargo.lock index dd07d241c8..7906fd5cd4 100644 --- a/fuzz/Cargo.lock +++ b/fuzz/Cargo.lock @@ -81,9 +81,9 @@ checksum = "c3d036a3c4ab069c7b410a2ce876bd74808d2d0888a82667669f8e783a898bf1" [[package]] name = "arc-swap" -version = "1.9.0" +version = "1.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a07d1f37ff60921c83bdfc7407723bdefe89b44b98a9b772f225c8f9d67141a6" +checksum = "6a3a1fd6f75306b68087b831f025c712524bcb19aad54e557b1129cfa0a2b207" 
dependencies = [ "rustversion", ] @@ -173,9 +173,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "cc" -version = "1.2.58" +version = "1.2.59" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1e928d4b69e3077709075a938a05ffbedfa53a84c8f766efbf8220bb1ff60e1" +checksum = "b7a4d3ec6524d28a329fc53654bbadc9bdd7b0431f5d65f1a56ffb28a1ee5283" dependencies = [ "find-msvc-tools", "jobserver", @@ -559,9 +559,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.16.1" +version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51" [[package]] name = "heck" @@ -619,12 +619,12 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" [[package]] name = "indexmap" -version = "2.13.0" +version = "2.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017" +checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" dependencies = [ "equivalent", - "hashbrown 0.16.1", + "hashbrown 0.17.0", "serde", "serde_core", ] @@ -718,9 +718,9 @@ checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" [[package]] name = "libc" -version = "0.2.183" +version = "0.2.184" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5b646652bf6661599e1da8901b3b9522896f01e736bad5f723fe7a3a27f899d" +checksum = "48f5d2a454e16a5ea0f4ced81bd44e4cfc7bd3a507b61887c99fd3538b28e4af" [[package]] name = "libfuzzer-sys" @@ -1033,9 +1033,9 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.27" +version = "1.0.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" 
+checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd" [[package]] name = "serde" @@ -1114,9 +1114,9 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "signal-hook" -version = "0.4.3" +version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b57709da74f9ff9f4a27dce9526eec25ca8407c45a7887243b031a58935fb8e" +checksum = "b2a0c28ca5908dbdbcd52e6fdaa00358ab88637f8ab33e1f188dd510eb44b53d" dependencies = [ "libc", "signal-hook-registry", @@ -1192,18 +1192,18 @@ dependencies = [ [[package]] name = "toml_datetime" -version = "1.1.0+spec-1.1.0" +version = "1.1.1+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97251a7c317e03ad83774a8752a7e81fb6067740609f75ea2b585b569a59198f" +checksum = "3165f65f62e28e0115a00b2ebdd37eb6f3b641855f9d636d3cd4103767159ad7" dependencies = [ "serde_core", ] [[package]] name = "toml_edit" -version = "0.25.8+spec-1.1.0" +version = "0.25.11+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16bff38f1d86c47f9ff0647e6838d7bb362522bdf44006c7068c2b1e606f1f3c" +checksum = "0b59c4d22ed448339746c59b905d24568fcbb3ab65a500494f7b8c3e97739f2b" dependencies = [ "indexmap", "toml_datetime", @@ -1213,9 +1213,9 @@ dependencies = [ [[package]] name = "toml_parser" -version = "1.1.0+spec-1.1.0" +version = "1.1.2+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2334f11ee363607eb04df9b8fc8a13ca1715a72ba8662a26ac285c98aabb4011" +checksum = "a2abe9b86193656635d2411dc43050282ca48aa31c2451210f4202550afb7526" dependencies = [ "winnow", ] diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index 410a4968d7..83dead0c9e 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -21,7 +21,7 @@ block = { path = "../block" } devices = { path = "../devices" } epoll = "4.4.0" hypervisor = { path = "../hypervisor", features = ["mshv_emulator"] } -libc = 
"0.2.183" +libc = "0.2.184" libfuzzer-sys = "0.4.12" linux-loader = { version = "0.13.2", features = ["bzimage", "elf", "pe"] } micro_http = { git = "https://github.com/firecracker-microvm/micro-http", branch = "main" } diff --git a/hypervisor/Cargo.toml b/hypervisor/Cargo.toml index 1ffaa46b78..19a9ca794d 100644 --- a/hypervisor/Cargo.toml +++ b/hypervisor/Cargo.toml @@ -15,7 +15,7 @@ tdx = [] [dependencies] anyhow = { workspace = true } -arc-swap = "1.9.0" +arc-swap = "1.9.1" bitfield-struct = "0.12.0" byteorder = { workspace = true } cfg-if = { workspace = true } From 7f377eadd92b8c935c2354f2521c0ca3385e31de Mon Sep 17 00:00:00 2001 From: Bo Chen Date: Tue, 19 Aug 2025 21:35:22 +0000 Subject: [PATCH 501/742] vmm: Fix `--platform` syntax with optional feature flags The `--platform` help string was hardcoded and did not reflect which optional features (tdx, sev_snp) were actually enabled in. Build the syntax string dynamically as `PlatformConfig::syntax()`, conditionally appending feature-gated options so the CLI help stays accurate. 
Signed-off-by: Bo Chen --- cloud-hypervisor/src/main.rs | 6 +++--- vmm/src/config.rs | 25 +++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/cloud-hypervisor/src/main.rs b/cloud-hypervisor/src/main.rs index 9966711820..415e7ed922 100644 --- a/cloud-hypervisor/src/main.rs +++ b/cloud-hypervisor/src/main.rs @@ -33,8 +33,8 @@ use vmm::vm_config::FwCfgConfig; use vmm::vm_config::IvshmemConfig; use vmm::vm_config::{ BalloonConfig, DeviceConfig, DiskConfig, FsConfig, GenericVhostUserConfig, LandlockConfig, - NetConfig, NumaConfig, PciSegmentConfig, PmemConfig, RateLimiterGroupConfig, TpmConfig, - UserDeviceConfig, VdpaConfig, VmConfig, VsockConfig, + NetConfig, NumaConfig, PciSegmentConfig, PlatformConfig, PmemConfig, RateLimiterGroupConfig, + TpmConfig, UserDeviceConfig, VdpaConfig, VmConfig, VsockConfig, }; use vmm_sys_util::eventfd::EventFd; use vmm_sys_util::signal::block_signal; @@ -388,7 +388,7 @@ fn get_cli_options_sorted( Arg::new("platform") .long("platform") .help( - "num_pci_segments=,iommu_segments=,iommu_address_width=,serial_number=,uuid=,oem_strings=" + PlatformConfig::syntax() ) .num_args(1) .group("vm-config"), diff --git a/vmm/src/config.rs b/vmm/src/config.rs index fc7eb1b8d1..e37809745c 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -9,6 +9,7 @@ use std::fs; use std::path::PathBuf; use std::result; use std::str::FromStr; +use std::sync::LazyLock; use block::ImageType; use clap::ArgMatches; @@ -798,6 +799,30 @@ impl PciSegmentConfig { } impl PlatformConfig { + pub fn syntax() -> &'static str { + static SYNTAX: LazyLock = LazyLock::new(|| { + let mut syntax = "Platform configuration parameters \ + \"num_pci_segments=,iommu_segments=,\ + iommu_address_width=,serial_number=,\ + uuid=,oem_strings=" + .to_string(); + + if cfg!(feature = "tdx") { + syntax.push_str(",tdx=on|off"); + } + + if cfg!(feature = "sev_snp") { + syntax.push_str(",sev_snp=on|off"); + } + + syntax.push('"'); + + syntax + }); + + &SYNTAX 
+ } + pub fn parse(platform: &str) -> Result { let mut parser = OptionParser::new(); parser From 13972a0edf56abc1bf92f6ab2b76d7488264acf4 Mon Sep 17 00:00:00 2001 From: Bo Chen Date: Tue, 19 Aug 2025 22:01:03 +0000 Subject: [PATCH 502/742] vmm: Introduce option `--platform iommufd=on|off` This option allows user to configure VFIO device pass-through with iommufd (e.g. vfio cdev mode) or not (e.g. vfio legacy mode). Signed-off-by: Bo Chen --- vmm/src/api/openapi/cloud-hypervisor.yaml | 3 +++ vmm/src/config.rs | 12 ++++++++++-- vmm/src/vm_config.rs | 2 ++ 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/vmm/src/api/openapi/cloud-hypervisor.yaml b/vmm/src/api/openapi/cloud-hypervisor.yaml index 01106a257b..7055c9788b 100644 --- a/vmm/src/api/openapi/cloud-hypervisor.yaml +++ b/vmm/src/api/openapi/cloud-hypervisor.yaml @@ -794,6 +794,9 @@ components: sev_snp: type: boolean default: false + iommufd: + type: boolean + default: false MemoryZoneConfig: required: diff --git a/vmm/src/config.rs b/vmm/src/config.rs index e37809745c..b04ba2749c 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -804,7 +804,7 @@ impl PlatformConfig { let mut syntax = "Platform configuration parameters \ \"num_pci_segments=,iommu_segments=,\ iommu_address_width=,serial_number=,\ - uuid=,oem_strings=" + uuid=,oem_strings=,iommufd=on|off" .to_string(); if cfg!(feature = "tdx") { @@ -831,7 +831,8 @@ impl PlatformConfig { .add("iommu_address_width") .add("serial_number") .add("uuid") - .add("oem_strings"); + .add("oem_strings") + .add("iommufd"); #[cfg(feature = "tdx")] parser.add("tdx"); #[cfg(feature = "sev_snp")] @@ -858,6 +859,11 @@ impl PlatformConfig { .convert::("oem_strings") .map_err(Error::ParsePlatform)? .map(|v| v.0); + let iommufd = parser + .convert::("iommufd") + .map_err(Error::ParsePlatform)? 
+ .unwrap_or(Toggle(false)) + .0; #[cfg(feature = "tdx")] let tdx = parser .convert::("tdx") @@ -877,6 +883,7 @@ impl PlatformConfig { serial_number, uuid, oem_strings, + iommufd, #[cfg(feature = "tdx")] tdx, #[cfg(feature = "sev_snp")] @@ -4824,6 +4831,7 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" serial_number: None, uuid: None, oem_strings: None, + iommufd: false, #[cfg(feature = "tdx")] tdx: false, #[cfg(feature = "sev_snp")] diff --git a/vmm/src/vm_config.rs b/vmm/src/vm_config.rs index 88f8af4acf..cca72dde4b 100644 --- a/vmm/src/vm_config.rs +++ b/vmm/src/vm_config.rs @@ -133,6 +133,8 @@ pub struct PlatformConfig { #[cfg(feature = "sev_snp")] #[serde(default)] pub sev_snp: bool, + #[serde(default)] + pub iommufd: bool, } pub const DEFAULT_PCI_SEGMENT_APERTURE_WEIGHT: u32 = 1; From fe5f991c3765bc5e3abeee5b8eac8b2b5f3441a2 Mon Sep 17 00:00:00 2001 From: Bo Chen Date: Thu, 9 Apr 2026 04:50:54 +0000 Subject: [PATCH 503/742] vmm: Support device passthrough with vfio cdev and iommufd When `--platform iommufd=on` is set, use the vfio cdev interface backed by iommufd instead of the legacy vfio container/group interface for device passthrough. The cdev path opens '/dev/iommu' via IommuFd, allocates an IOAS, and binds VFIO devices through VfioIommufd. The legacy container/group path remains the default and is used when iommufd is not enabled. Add iommufd-ioctls as a workspace dependency and enable the "vfio_cdev" feature on vfio-ioctls for KVM builds. 
Fixes: #6892 Signed-off-by: Bo Chen --- Cargo.lock | 20 +++++++++++++++++++ Cargo.toml | 1 + vmm/Cargo.toml | 3 +++ vmm/src/device_manager.rs | 41 ++++++++++++++++++++++++++++++++++++--- 4 files changed, 62 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bc927af8c4..7c03c08766 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1097,6 +1097,23 @@ dependencies = [ "libc", ] +[[package]] +name = "iommufd-bindings" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd7de3a04f6fd55f171a6682852f7aa360bb848a85e0c610513349e006b3c139" + +[[package]] +name = "iommufd-ioctls" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4eabd3414d9c4e716c9a198fbfac484625f088c075605372daf037edfe336e18" +dependencies = [ + "iommufd-bindings", + "thiserror", + "vmm-sys-util", +] + [[package]] name = "ipnetwork" version = "0.20.0" @@ -2291,6 +2308,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d4b1d98dff7f0d219278e406323e7eda4d426447bd203c7828189baf0d8c07b7" dependencies = [ "byteorder", + "iommufd-bindings", + "iommufd-ioctls", "kvm-bindings", "kvm-ioctls", "libc", @@ -2521,6 +2540,7 @@ dependencies = [ "hypervisor", "igvm", "igvm_defs", + "iommufd-ioctls", "landlock", "libc", "linux-loader", diff --git a/Cargo.toml b/Cargo.toml index 6ce471501d..92a52f81a3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -53,6 +53,7 @@ resolver = "3" [workspace.dependencies] # rust-vmm crates acpi_tables = "0.2.0" +iommufd-ioctls = "0.1.0" kvm-bindings = "0.14.0" kvm-ioctls = "0.24.0" linux-loader = "0.13.2" diff --git a/vmm/Cargo.toml b/vmm/Cargo.toml index ab0278e6d1..1fe5e0e47b 100644 --- a/vmm/Cargo.toml +++ b/vmm/Cargo.toml @@ -17,8 +17,10 @@ ivshmem = ["devices/ivshmem"] kvm = [ "arch/kvm", "hypervisor/kvm", + "iommufd-ioctls", "pci/kvm", "vfio-ioctls/kvm", + "vfio-ioctls/vfio_cdev", "virtio-devices/kvm", "vm-device/kvm", ] @@ -55,6 +57,7 @@ hex = { version = 
"0.4.3", optional = true } hypervisor = { path = "../hypervisor" } igvm = { workspace = true, optional = true } igvm_defs = { workspace = true, optional = true } +iommufd-ioctls = { workspace = true, optional = true } landlock = "0.4.4" libc = { workspace = true } linux-loader = { workspace = true, features = ["bzimage", "elf", "pe"] } diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index edb8b76af0..a797a7bc84 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -76,6 +76,8 @@ use event_monitor::event; use hypervisor::IoEventAddress; #[cfg(target_arch = "aarch64")] use hypervisor::arch::aarch64::regs::AARCH64_PMU_IRQ; +#[cfg(feature = "kvm")] +use iommufd_ioctls::IommuFd; use libc::{ MAP_NORESERVE, MAP_PRIVATE, MAP_SHARED, O_TMPFILE, PROT_READ, PROT_WRITE, TCSANOW, tcsetattr, termios, @@ -90,6 +92,8 @@ use seccompiler::SeccompAction; use serde::{Deserialize, Serialize}; use thiserror::Error; use tracer::trace_scoped; +#[cfg(feature = "kvm")] +use vfio_ioctls::VfioIommufd; use vfio_ioctls::{VfioContainer, VfioDevice, VfioDeviceFd, VfioOps}; use virtio_devices::transport::{VirtioPciDevice, VirtioPciDeviceActivator, VirtioTransport}; use virtio_devices::vhost_user::VhostUserConfig; @@ -361,6 +365,15 @@ pub enum DeviceManagerError { #[error("Error getting pty peer")] GetPtyPeer(#[source] vmm_sys_util::errno::Error), + /// Cannot create iommufd + #[cfg(feature = "kvm")] + #[error("Cannot create iommufd")] + IommufdCreate(#[source] iommufd_ioctls::IommufdError), + + /// iommufd is not supported + #[error("iommufd is not supported without the kvm feature")] + IommufdNotSupported, + /// Cannot create a VFIO device #[error("Cannot create a VFIO device")] VfioCreate(#[source] vfio_ioctls::VfioError), @@ -3803,9 +3816,31 @@ impl DeviceManager { .try_clone() .map_err(DeviceManagerError::VfioCreate)?; - Ok(Arc::new( - VfioContainer::new(Some(Arc::new(dup))).map_err(DeviceManagerError::VfioCreate)?, - )) + let iommufd = self + .config + 
.lock() + .unwrap() + .platform + .as_ref() + .is_some_and(|p| p.iommufd); + + if iommufd { + #[cfg(feature = "kvm")] + { + info!("Using vfio cdev mode with iommufd."); + let iommufd = IommuFd::new().map_err(DeviceManagerError::IommufdCreate)?; + let vfio_iommufd = VfioIommufd::new(Arc::new(iommufd), None, Some(Arc::new(dup))) + .map_err(DeviceManagerError::VfioCreate)?; + Ok(Arc::new(vfio_iommufd)) + } + #[cfg(not(feature = "kvm"))] + Err(DeviceManagerError::IommufdNotSupported) + } else { + info!("Using vfio legacy mode with vfio container/group."); + Ok(Arc::new( + VfioContainer::new(Some(Arc::new(dup))).map_err(DeviceManagerError::VfioCreate)?, + )) + } } fn add_vfio_device( From 32c459c3dc140263d948adacbb11c48062721f86 Mon Sep 17 00:00:00 2001 From: Bo Chen Date: Fri, 5 Sep 2025 18:59:03 +0000 Subject: [PATCH 504/742] virtio-devices, vmm: Add seccomp rules for iommufd and vfio cdev Signed-off-by: Bo Chen --- virtio-devices/src/seccomp_filters.rs | 8 ++++ vmm/src/seccomp_filters.rs | 56 ++++++++++++++++++++++++++- 2 files changed, 62 insertions(+), 2 deletions(-) diff --git a/virtio-devices/src/seccomp_filters.rs b/virtio-devices/src/seccomp_filters.rs index f44fdc1b92..37c444999a 100644 --- a/virtio-devices/src/seccomp_filters.rs +++ b/virtio-devices/src/seccomp_filters.rs @@ -53,6 +53,10 @@ macro_rules! or { const VFIO_IOMMU_MAP_DMA: u64 = 0x3b71; const VFIO_IOMMU_UNMAP_DMA: u64 = 0x3b72; +// See include/uapi/linux/iommufd.h in the kernel code. 
+const IOMMU_IOAS_MAP: u64 = 0x3b85; +const IOMMU_IOAS_UNMAP: u64 = 0x3b86; + #[cfg(feature = "sev_snp")] fn mshv_sev_snp_ioctl_seccomp_rule() -> SeccompRule { and![ @@ -83,6 +87,8 @@ fn create_virtio_iommu_ioctl_seccomp_rule() -> Vec { or![ and![Cond::new(1, ArgLen::Dword, Eq, VFIO_IOMMU_MAP_DMA).unwrap()], and![Cond::new(1, ArgLen::Dword, Eq, VFIO_IOMMU_UNMAP_DMA).unwrap()], + and![Cond::new(1, ArgLen::Dword, Eq, IOMMU_IOAS_MAP).unwrap()], + and![Cond::new(1, ArgLen::Dword, Eq, IOMMU_IOAS_UNMAP).unwrap()], ] } @@ -90,6 +96,8 @@ fn create_virtio_mem_ioctl_seccomp_rule() -> Vec { or![ and![Cond::new(1, ArgLen::Dword, Eq, VFIO_IOMMU_MAP_DMA).unwrap()], and![Cond::new(1, ArgLen::Dword, Eq, VFIO_IOMMU_UNMAP_DMA).unwrap()], + and![Cond::new(1, ArgLen::Dword, Eq, IOMMU_IOAS_MAP).unwrap()], + and![Cond::new(1, ArgLen::Dword, Eq, IOMMU_IOAS_UNMAP).unwrap()], ] } diff --git a/vmm/src/seccomp_filters.rs b/vmm/src/seccomp_filters.rs index 97f020e650..25da7f9c9a 100644 --- a/vmm/src/seccomp_filters.rs +++ b/vmm/src/seccomp_filters.rs @@ -110,6 +110,18 @@ mod kvm { pub const KVM_SET_NESTED_STATE: u64 = 1082175167; } +mod iommufd { + // See include/uapi/linux/iommufd.h in the kernel code. + pub const IOMMU_IOAS_ALLOC: u64 = 0x3b81; + pub const IOMMU_IOAS_MAP: u64 = 0x3b85; + pub const IOMMU_IOAS_UNMAP: u64 = 0x3b86; + + // See include/uapi/linux/vfio.h in the kernel code. 
+ pub const VFIO_DEVICE_BIND_IOMMUFD: u64 = 0x3b76; + pub const VFIO_DEVICE_ATTACH_IOMMUFD_PT: u64 = 0x3b77; + pub const VFIO_DEVICE_DETACH_IOMMUFD_PT: u64 = 0x3b78; +} + // Block device ioctls (not exported by libc) const BLKDISCARD: u64 = 0x1277; // _IO(0x12, 119) const BLKZEROOUT: u64 = 0x127f; // _IO(0x12, 127) @@ -247,6 +259,28 @@ fn create_vmm_ioctl_seccomp_rule_common_kvm() -> Result, Backen ]) } +fn create_vmm_ioctl_seccomp_rule_iommufd() -> Result, BackendError> { + use iommufd::*; + Ok(or![ + and![Cond::new(1, ArgLen::Dword, Eq, IOMMU_IOAS_ALLOC)?], + and![Cond::new(1, ArgLen::Dword, Eq, IOMMU_IOAS_MAP)?], + and![Cond::new(1, ArgLen::Dword, Eq, IOMMU_IOAS_UNMAP)?], + and![Cond::new(1, ArgLen::Dword, Eq, VFIO_DEVICE_BIND_IOMMUFD)?], + and![Cond::new( + 1, + ArgLen::Dword, + Eq, + VFIO_DEVICE_ATTACH_IOMMUFD_PT + )?], + and![Cond::new( + 1, + ArgLen::Dword, + Eq, + VFIO_DEVICE_DETACH_IOMMUFD_PT + )?], + ]) +} + fn create_vmm_ioctl_seccomp_rule_hypervisor( hypervisor_type: HypervisorType, ) -> Result, BackendError> { @@ -373,9 +407,11 @@ fn create_vmm_ioctl_seccomp_rule_common( ]; let hypervisor_rules = create_vmm_ioctl_seccomp_rule_hypervisor(hypervisor_type)?; - common_rules.extend(hypervisor_rules); + let iommufd_rules = create_vmm_ioctl_seccomp_rule_iommufd()?; + common_rules.extend(iommufd_rules); + Ok(common_rules) } @@ -764,6 +800,20 @@ fn create_vcpu_ioctl_seccomp_rule_hypervisor( } } +fn create_vcpu_ioctl_seccomp_rule_iommufd() -> Result, BackendError> { + use iommufd::*; + Ok(or![ + and![Cond::new(1, ArgLen::Dword, Eq, IOMMU_IOAS_MAP)?], + and![Cond::new(1, ArgLen::Dword, Eq, IOMMU_IOAS_UNMAP)?], + and![Cond::new( + 1, + ArgLen::Dword, + Eq, + VFIO_DEVICE_DETACH_IOMMUFD_PT + )?], + ]) +} + fn create_vcpu_ioctl_seccomp_rule( hypervisor_type: HypervisorType, ) -> Result, BackendError> { @@ -784,9 +834,11 @@ fn create_vcpu_ioctl_seccomp_rule( ]; let hypervisor_rules = create_vcpu_ioctl_seccomp_rule_hypervisor(hypervisor_type)?; - 
rules.extend(hypervisor_rules); + let iommufd_rules = create_vcpu_ioctl_seccomp_rule_iommufd()?; + rules.extend(iommufd_rules); + Ok(rules) } From 87992c77c112a0ecb72375623c451dc29470502e Mon Sep 17 00:00:00 2001 From: Bo Chen Date: Thu, 9 Apr 2026 04:26:31 +0000 Subject: [PATCH 505/742] vmm: Introduce option `--platform vfio_p2p_dma=on|off` Add a user-configurable option to control whether VFIO device MMIO BAR regions are DMA-mapped into the host IOMMU address space. This mapping is required for peer-to-peer DMA between devices (e.g. NVLink, RDMA NIC accessing GPU VRAM). However, iommufd on upstream kernels does not support mapping device MMIO pages (VM_PFNMAP), causing IOMMU_IOAS_MAP to fail with -EFAULT. Kernels with the NVIDIA PFNMAP workaround or future kernels with DMABUF-based mapping (IOMMU_IOAS_MAP_FILE) handle this correctly. The option defaults to `on` to preserve existing behavior. Users on vanilla kernels using iommufd should set `vfio_p2p_dma=off` to skip MMIO BAR DMA mapping. A validation check ensures that `x_nv_gpudirect_clique` (which depends on P2P DMA) cannot be used when `vfio_p2p_dma=off`. Signed-off-by: Bo Chen --- pci/src/vfio.rs | 16 ++++++- vmm/src/api/openapi/cloud-hypervisor.yaml | 3 ++ vmm/src/config.rs | 58 ++++++++++++++++++++++- vmm/src/device_manager.rs | 9 ++++ vmm/src/vm_config.rs | 6 +++ 5 files changed, 88 insertions(+), 4 deletions(-) diff --git a/pci/src/vfio.rs b/pci/src/vfio.rs index 0fa3ae8365..7dc6f9c3b1 100644 --- a/pci/src/vfio.rs +++ b/pci/src/vfio.rs @@ -1471,6 +1471,9 @@ pub struct VfioPciDevice { vfio_ops: Arc, common: VfioCommon, iommu_attached: bool, + // Whether to map VFIO device MMIO BARs into the host IOMMU address space. + // Required for peer-to-peer DMA between VFIO devices. 
+ p2p_dma: bool, memory_slot_allocator: MemorySlotAllocator, bdf: PciBdf, device_path: PathBuf, @@ -1487,6 +1490,7 @@ impl VfioPciDevice { msi_interrupt_manager: Arc>, legacy_interrupt_group: Option>, iommu_attached: bool, + p2p_dma: bool, bdf: PciBdf, memory_slot_allocator: MemorySlotAllocator, snapshot: Option<&Snapshot>, @@ -1515,6 +1519,7 @@ impl VfioPciDevice { vfio_ops, common, iommu_attached, + p2p_dma, memory_slot_allocator, bdf, device_path, @@ -1719,7 +1724,9 @@ impl VfioPciDevice { } .map_err(VfioPciError::CreateUserMemoryRegion)?; - if !self.iommu_attached { + // Map the MMIO BAR into the host IOMMU address space via VfioOps + // Only needed if p2p_dma is enabled. + if !self.iommu_attached && self.p2p_dma { // vfio_dma_map should be unsafe but isn't. #[allow(unused_unsafe)] // SAFETY: MmapRegion invariants guarantee that @@ -1749,7 +1756,9 @@ impl VfioPciDevice { let len = user_memory_region.mapping.len(); let host_addr = user_memory_region.mapping.addr(); // Unmap MMIO region from the host IOMMU address space via VfioOps + // Only needed if p2p_dma is enabled. if !self.iommu_attached + && self.p2p_dma && let Err(e) = self .vfio_ops .vfio_dma_unmap(user_memory_region.start, len) @@ -1907,7 +1916,9 @@ impl PciDevice for VfioPciDevice { let len = user_memory_region.mapping.len(); let host_addr = user_memory_region.mapping.addr(); // Unmap the old MMIO region from the host IOMMU address space via VfioOps + // Only needed if p2p_dma is enabled. if !self.iommu_attached + && self.p2p_dma && let Err(e) = self .vfio_ops .vfio_dma_unmap(user_memory_region.start, len) @@ -1961,7 +1972,8 @@ iova 0x{:x}, size 0x{:x}: {}, ", .map_err(io::Error::other)?; // Map the moved MMIO region into the host IOMMU address space via VfioOps - if !self.iommu_attached { + // Only needed if p2p_dma is enabled. 
+ if !self.iommu_attached && self.p2p_dma { // vfio_dma_map is unsound and ought to be marked as unsafe #[allow(unused_unsafe)] // SAFETY: MmapRegion invariants guarantee that diff --git a/vmm/src/api/openapi/cloud-hypervisor.yaml b/vmm/src/api/openapi/cloud-hypervisor.yaml index 7055c9788b..5be55560e6 100644 --- a/vmm/src/api/openapi/cloud-hypervisor.yaml +++ b/vmm/src/api/openapi/cloud-hypervisor.yaml @@ -797,6 +797,9 @@ components: iommufd: type: boolean default: false + vfio_p2p_dma: + type: boolean + default: true MemoryZoneConfig: required: diff --git a/vmm/src/config.rs b/vmm/src/config.rs index b04ba2749c..405cb1da76 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -315,6 +315,9 @@ pub enum ValidationError { /// On a IOMMU segment but not behind IOMMU #[error("Device is on an IOMMU PCI segment ({0}) but not placed behind IOMMU")] OnIommuSegment(u16), + /// GPUDirect clique requires P2P DMA + #[error("Device with x_nv_gpudirect_clique requires vfio_p2p_dma=on")] + GpuDirectCliqueRequiresP2pDma, // On a IOMMU segment but IOMMU not supported #[error( "Device is on an IOMMU PCI segment ({0}) but does not support being placed behind IOMMU" @@ -804,7 +807,8 @@ impl PlatformConfig { let mut syntax = "Platform configuration parameters \ \"num_pci_segments=,iommu_segments=,\ iommu_address_width=,serial_number=,\ - uuid=,oem_strings=,iommufd=on|off" + uuid=,oem_strings=,iommufd=on|off,\ + vfio_p2p_dma=on|off" .to_string(); if cfg!(feature = "tdx") { @@ -832,7 +836,8 @@ impl PlatformConfig { .add("serial_number") .add("uuid") .add("oem_strings") - .add("iommufd"); + .add("iommufd") + .add("vfio_p2p_dma"); #[cfg(feature = "tdx")] parser.add("tdx"); #[cfg(feature = "sev_snp")] @@ -864,6 +869,11 @@ impl PlatformConfig { .map_err(Error::ParsePlatform)? .unwrap_or(Toggle(false)) .0; + let vfio_p2p_dma = parser + .convert::("vfio_p2p_dma") + .map_err(Error::ParsePlatform)? 
+ .unwrap_or(Toggle(true)) + .0; #[cfg(feature = "tdx")] let tdx = parser .convert::("tdx") @@ -884,6 +894,7 @@ impl PlatformConfig { uuid, oem_strings, iommufd, + vfio_p2p_dma, #[cfg(feature = "tdx")] tdx, #[cfg(feature = "sev_snp")] @@ -2277,6 +2288,13 @@ impl DeviceConfig { } } + if self.x_nv_gpudirect_clique.is_some() { + let vfio_p2p_dma = vm_config.platform.as_ref().is_none_or(|p| p.vfio_p2p_dma); + if !vfio_p2p_dma { + return Err(ValidationError::GpuDirectCliqueRequiresP2pDma); + } + } + Ok(()) } } @@ -4832,6 +4850,7 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" uuid: None, oem_strings: None, iommufd: false, + vfio_p2p_dma: default_platformconfig_vfio_p2p_dma(), #[cfg(feature = "tdx")] tdx: false, #[cfg(feature = "sev_snp")] @@ -5572,6 +5591,41 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" config_with_invalid_host_data.validate().unwrap_err(); } + // x_nv_gpudirect_clique with vfio_p2p_dma=off should fail + let mut invalid_config = valid_config.clone(); + invalid_config.platform = Some(PlatformConfig { + vfio_p2p_dma: false, + ..platform_fixture() + }); + invalid_config.devices = Some(vec![DeviceConfig { + x_nv_gpudirect_clique: Some(0), + ..device_fixture() + }]); + assert_eq!( + invalid_config.validate(), + Err(ValidationError::GpuDirectCliqueRequiresP2pDma) + ); + + // x_nv_gpudirect_clique with vfio_p2p_dma=on should pass + let mut still_valid_config = valid_config.clone(); + still_valid_config.platform = Some(PlatformConfig { + vfio_p2p_dma: true, + ..platform_fixture() + }); + still_valid_config.devices = Some(vec![DeviceConfig { + x_nv_gpudirect_clique: Some(0), + ..device_fixture() + }]); + still_valid_config.validate().unwrap(); + + // x_nv_gpudirect_clique with no platform config (default p2p_dma=on) should pass + let mut still_valid_config = valid_config.clone(); + still_valid_config.devices = Some(vec![DeviceConfig { + x_nv_gpudirect_clique: Some(0), + ..device_fixture() + }]); + 
still_valid_config.validate().unwrap(); + let mut still_valid_config = valid_config; // SAFETY: Safe as the file was just opened let fd1 = unsafe { libc::dup(File::open("/dev/null").unwrap().as_raw_fd()) }; diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index a797a7bc84..0ae8526448 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -3963,6 +3963,14 @@ impl DeviceManager { let memory_manager = self.memory_manager.clone(); + let vfio_p2p_dma = self + .config + .lock() + .unwrap() + .platform + .as_ref() + .is_none_or(|p| p.vfio_p2p_dma); + let vfio_pci_device = VfioPciDevice::new( vfio_name.clone(), self.address_manager.vm.clone(), @@ -3971,6 +3979,7 @@ impl DeviceManager { self.msi_interrupt_manager.clone(), legacy_interrupt_group, device_cfg.iommu, + vfio_p2p_dma, pci_device_bdf, memory_manager.lock().unwrap().memory_slot_allocator(), vm_migration::snapshot_from_id(self.snapshot.as_ref(), vfio_name.as_str()), diff --git a/vmm/src/vm_config.rs b/vmm/src/vm_config.rs index cca72dde4b..94b1c011c8 100644 --- a/vmm/src/vm_config.rs +++ b/vmm/src/vm_config.rs @@ -113,6 +113,10 @@ pub fn default_platformconfig_iommu_address_width_bits() -> u8 { DEFAULT_IOMMU_ADDRESS_WIDTH_BITS } +pub fn default_platformconfig_vfio_p2p_dma() -> bool { + true +} + #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] pub struct PlatformConfig { #[serde(default = "default_platformconfig_num_pci_segments")] @@ -135,6 +139,8 @@ pub struct PlatformConfig { pub sev_snp: bool, #[serde(default)] pub iommufd: bool, + #[serde(default = "default_platformconfig_vfio_p2p_dma")] + pub vfio_p2p_dma: bool, } pub const DEFAULT_PCI_SEGMENT_APERTURE_WEIGHT: u32 = 1; From 9d47769bc25e6b9aaae1aa10d28a4d3a1d5b113b Mon Sep 17 00:00:00 2001 From: Bo Chen Date: Tue, 7 Apr 2026 21:22:21 +0000 Subject: [PATCH 506/742] tests: Add iommufd integration tests Add an `iommufd` flag to existing VFIO integration tests. 
When false, tests use the legacy vfio container/group backend (existing behavior). When true, tests use vfio cdev with iommufd and vfio_p2p_dma=off. vfio_p2p_dma=off is required because the VFIO test runner uses a stock Ubuntu 24.04 kernel (v6.8) which does not support mapping device MMIO pages (VM_PFNMAP) through iommufd, causing IOMMU_IOAS_MAP to fail with -EFAULT on MMIO BAR regions. Signed-off-by: Bo Chen --- cloud-hypervisor/tests/integration.rs | 82 +++++++++++++++++++++++---- scripts/run_integration_tests_vfio.sh | 7 +++ 2 files changed, 78 insertions(+), 11 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index fded5727ed..15516e84e9 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -8240,7 +8240,15 @@ mod vfio { use crate::*; const NVIDIA_VFIO_DEVICE: &str = "/sys/bus/pci/devices/0002:00:01.0"; - fn test_nvidia_card_memory_hotplug(hotplug_method: &str) { + fn platform_cfg(iommufd: bool) -> String { + if iommufd { + "iommufd=on,vfio_p2p_dma=off".to_string() + } else { + "iommufd=off".to_string() + } + } + + fn test_nvidia_card_memory_hotplug(hotplug_method: &str, iommufd: bool) { let disk_config = UbuntuDiskConfig::new(JAMMY_VFIO_IMAGE_NAME.to_string()); let guest = Guest::new(Box::new(disk_config)); let api_socket = temp_api_path(&guest.tmp_dir); @@ -8252,6 +8260,7 @@ mod vfio { format!("size=4G,hotplug_size=4G,hotplug_method={hotplug_method}").as_str(), ]) .args(["--kernel", fw_path(FwType::RustHypervisorFirmware).as_str()]) + .args(["--platform", &platform_cfg(iommufd)]) .args(["--device", format!("path={NVIDIA_VFIO_DEVICE}").as_str()]) .args(["--api-socket", &api_socket]) .default_disks() @@ -8285,16 +8294,25 @@ mod vfio { #[test] fn test_nvidia_card_memory_hotplug_acpi() { - test_nvidia_card_memory_hotplug("acpi"); + test_nvidia_card_memory_hotplug("acpi", false); } #[test] fn test_nvidia_card_memory_hotplug_virtio_mem() { - 
test_nvidia_card_memory_hotplug("virtio-mem"); + test_nvidia_card_memory_hotplug("virtio-mem", false); } #[test] - fn test_nvidia_card_pci_hotplug() { + fn test_iommufd_nvidia_card_memory_hotplug_acpi() { + test_nvidia_card_memory_hotplug("acpi", true); + } + + #[test] + fn test_iommufd_nvidia_card_memory_hotplug_virtio_mem() { + test_nvidia_card_memory_hotplug("virtio-mem", true); + } + + fn test_nvidia_card_pci_hotplug_common(iommufd: bool) { let disk_config = UbuntuDiskConfig::new(JAMMY_VFIO_IMAGE_NAME.to_string()); let guest = Guest::new(Box::new(disk_config)); let api_socket = temp_api_path(&guest.tmp_dir); @@ -8303,6 +8321,7 @@ mod vfio { .args(["--cpus", "boot=4"]) .args(["--memory", "size=4G"]) .args(["--kernel", fw_path(FwType::RustHypervisorFirmware).as_str()]) + .args(["--platform", &platform_cfg(iommufd)]) .args(["--api-socket", &api_socket]) .default_disks() .default_net() @@ -8338,7 +8357,16 @@ mod vfio { } #[test] - fn test_nvidia_card_reboot() { + fn test_nvidia_card_pci_hotplug() { + test_nvidia_card_pci_hotplug_common(false); + } + + #[test] + fn test_iommufd_nvidia_card_pci_hotplug() { + test_nvidia_card_pci_hotplug_common(true); + } + + fn test_nvidia_card_reboot_common(iommufd: bool) { let disk_config = UbuntuDiskConfig::new(JAMMY_VFIO_IMAGE_NAME.to_string()); let guest = Guest::new(Box::new(disk_config)); let api_socket = temp_api_path(&guest.tmp_dir); @@ -8346,6 +8374,7 @@ mod vfio { let mut child = GuestCommand::new(&guest) .args(["--cpus", "boot=4"]) .args(["--memory", "size=4G"]) + .args(["--platform", &platform_cfg(iommufd)]) .args(["--kernel", fw_path(FwType::RustHypervisorFirmware).as_str()]) .args([ "--device", @@ -8377,20 +8406,31 @@ mod vfio { } #[test] - fn test_nvidia_card_iommu_address_width() { + fn test_nvidia_card_reboot() { + test_nvidia_card_reboot_common(false); + } + + #[test] + fn test_iommufd_nvidia_card_reboot() { + test_nvidia_card_reboot_common(true); + } + + fn test_nvidia_card_iommu_address_width_common(iommufd: 
bool) { let disk_config = UbuntuDiskConfig::new(JAMMY_VFIO_IMAGE_NAME.to_string()); let guest = Guest::new(Box::new(disk_config)); let api_socket = temp_api_path(&guest.tmp_dir); + let platform = format!( + "num_pci_segments=2,iommu_segments=1,iommu_address_width=42,{}", + platform_cfg(iommufd) + ); + let mut child = GuestCommand::new(&guest) .args(["--cpus", "boot=4"]) .args(["--memory", "size=4G"]) .args(["--kernel", fw_path(FwType::RustHypervisorFirmware).as_str()]) .args(["--device", format!("path={NVIDIA_VFIO_DEVICE}").as_str()]) - .args([ - "--platform", - "num_pci_segments=2,iommu_segments=1,iommu_address_width=42", - ]) + .args(["--platform", &platform]) .args(["--api-socket", &api_socket]) .default_disks() .default_net() @@ -8416,7 +8456,16 @@ mod vfio { } #[test] - fn test_nvidia_guest_numa_generic_initiator() { + fn test_nvidia_card_iommu_address_width() { + test_nvidia_card_iommu_address_width_common(false); + } + + #[test] + fn test_iommufd_nvidia_card_iommu_address_width() { + test_nvidia_card_iommu_address_width_common(true); + } + + fn test_nvidia_guest_numa_generic_initiator_common(iommufd: bool) { // Skip test if VFIO device is not available or not ready if !std::path::Path::new(NVIDIA_VFIO_DEVICE).exists() { println!("SKIPPED: VFIO device {NVIDIA_VFIO_DEVICE} not found"); @@ -8453,6 +8502,7 @@ mod vfio { "guest_numa_id=1,cpus=[2-3],distances=[0@20,2@30],memory_zones=mem1", "guest_numa_id=2,device_id=vfio0,distances=[0@25,1@30]", ]) + .args(["--platform", &platform_cfg(iommufd)]) .args([ "--device", &format!("id=vfio0,path={NVIDIA_VFIO_DEVICE},iommu=on"), @@ -8525,6 +8575,16 @@ mod vfio { handle_child_output(r, &output); } + + #[test] + fn test_nvidia_guest_numa_generic_initiator() { + test_nvidia_guest_numa_generic_initiator_common(false); + } + + #[test] + fn test_iommufd_nvidia_guest_numa_generic_initiator() { + test_nvidia_guest_numa_generic_initiator_common(true); + } } mod live_migration { diff --git a/scripts/run_integration_tests_vfio.sh 
b/scripts/run_integration_tests_vfio.sh index b32afe5a23..eecd3111b8 100755 --- a/scripts/run_integration_tests_vfio.sh +++ b/scripts/run_integration_tests_vfio.sh @@ -30,7 +30,14 @@ cargo build --features mshv --all --release --target "$BUILD_TARGET" export RUST_BACKTRACE=1 export RUSTFLAGS="$RUSTFLAGS" +# Run VFIO tests using legacy vfio interface with container/group time cargo nextest run --no-tests=pass --test-threads=1 "vfio::test_nvidia" -- ${test_binary_args[*]} RES=$? +# Run VFIO tests using vfio cdev interface backed by iommufd +if [ $RES -eq 0 ]; then + time cargo nextest run --no-tests=pass --test-threads=1 "vfio::test_iommufd" -- ${test_binary_args[*]} + RES=$? +fi + exit $RES From 23e139c0f86b3cf088d7cc797fced9805e22cabb Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Thu, 9 Apr 2026 13:00:24 -0400 Subject: [PATCH 507/742] ci: Double-quote variables in GitHub Actions This is best practice for shell scripts. Signed-off-by: Demi Marie Obenour --- .github/workflows/gitlint.yaml | 2 +- .github/workflows/integration-arm64.yaml | 2 +- .github/workflows/integration-vfio.yaml | 2 +- .github/workflows/mshv-infra.yaml | 44 ++++++++++++------------ .github/workflows/mshv-integration.yaml | 14 ++++---- 5 files changed, 31 insertions(+), 33 deletions(-) diff --git a/.github/workflows/gitlint.yaml b/.github/workflows/gitlint.yaml index 6fd0ec4ab9..178f15aa40 100644 --- a/.github/workflows/gitlint.yaml +++ b/.github/workflows/gitlint.yaml @@ -22,4 +22,4 @@ jobs: pip install --upgrade gitlint - name: Lint git commit messages run: | - gitlint --commits origin/$GITHUB_BASE_REF.. + gitlint --commits "origin/$GITHUB_BASE_REF.." 
diff --git a/.github/workflows/integration-arm64.yaml b/.github/workflows/integration-arm64.yaml index d0d6966482..fc545c2499 100644 --- a/.github/workflows/integration-arm64.yaml +++ b/.github/workflows/integration-arm64.yaml @@ -52,7 +52,7 @@ jobs: popd mkdir -p "$HOME/workloads" az storage blob download --container-name private-images --file "$IMG_GZ_PATH" --name "$IMG_GZ_BLOB_NAME" --connection-string "${{ secrets.CH_PRIVATE_IMAGES }}" - gzip -d $IMG_GZ_PATH + gzip -d "$IMG_GZ_PATH" - name: Run Windows guest integration tests if: ${{ github.event_name != 'pull_request' }} timeout-minutes: 30 diff --git a/.github/workflows/integration-vfio.yaml b/.github/workflows/integration-vfio.yaml index 19d5cadba6..b4f2ca2f94 100644 --- a/.github/workflows/integration-vfio.yaml +++ b/.github/workflows/integration-vfio.yaml @@ -13,7 +13,7 @@ jobs: steps: - name: Fix workspace permissions if: ${{ github.event_name != 'pull_request' }} - run: sudo chown -R github-runner:github-runner ${GITHUB_WORKSPACE} + run: sudo chown -R github-runner:github-runner "${GITHUB_WORKSPACE}" - name: Code checkout if: ${{ github.event_name != 'pull_request' }} uses: actions/checkout@v6 diff --git a/.github/workflows/mshv-infra.yaml b/.github/workflows/mshv-infra.yaml index ad71f69041..22f5ddbe6b 100644 --- a/.github/workflows/mshv-infra.yaml +++ b/.github/workflows/mshv-infra.yaml @@ -68,7 +68,7 @@ jobs: fi az --version echo "Logging into Azure CLI using Managed Identity" - az login --identity --client-id ${MI_CLIENT_ID} + az login --identity --client-id "${MI_CLIENT_ID}" - name: Get Location id: get-location @@ -118,7 +118,7 @@ jobs: echo "Creating Resource Group: $RG" # Create the resource group echo "Creating resource group in location: ${LOCATION}" - az group create --name ${RG} --location ${LOCATION} + az group create --name "${RG}" --location "${LOCATION}" echo "RG_NAME=${RG}" >> $GITHUB_OUTPUT echo "Resource group created successfully." 
@@ -130,7 +130,7 @@ jobs: set -e echo "Generating SSH key: $KEY" mkdir -p ~/.ssh - ssh-keygen -t rsa -b 4096 -f ~/.ssh/${KEY} -N "" + ssh-keygen -t rsa -b 4096 -f ~/.ssh/"${KEY}" -N "" - name: Create VM id: vm-setup @@ -150,7 +150,7 @@ jobs: # Extract subnet ID from the runner VM echo "Retrieving subnet ID..." - SUBNET_ID=$(az network vnet list --resource-group ${RUNNER_RG} --query "[?contains(location, '${LOCATION}')].{SUBNETS:subnets}" | jq -r ".[0].SUBNETS[0].id") + SUBNET_ID=$(az network vnet list --resource-group "$RUNNER_RG" --query "[?contains(location, '${LOCATION}')].{SUBNETS:subnets}" | jq -r ".[0].SUBNETS[0].id") if [[ -z "${SUBNET_ID}" ]]; then echo "ERROR: Failed to retrieve Subnet ID." exit 1 @@ -158,7 +158,7 @@ jobs: # Extract image ID from the runner VM echo "Retrieving image ID..." - IMAGE_ID=$(az image show --resource-group ${RUNNER_RG} --name ${VM_IMAGE_NAME} --query "id" -o tsv) + IMAGE_ID=$(az image show --resource-group "$RUNNER_RG" --name "$VM_IMAGE_NAME" --query "id" -o tsv) if [[ -z "${IMAGE_ID}" ]]; then echo "ERROR: Failed to retrieve Image ID." 
exit 1 @@ -166,24 +166,24 @@ jobs: # Create VM az vm create \ - --resource-group ${RG} \ - --name ${VM_NAME} \ - --subnet ${SUBNET_ID} \ - --size ${VM_SKU} \ - --location ${LOCATION} \ - --image ${IMAGE_ID} \ - --os-disk-size-gb ${OS_DISK_SIZE} \ + --resource-group "${RG}" \ + --name "${VM_NAME}" \ + --subnet "${SUBNET_ID}" \ + --size "${VM_SKU}" \ + --location "${LOCATION}" \ + --image "${IMAGE_ID}" \ + --os-disk-size-gb "${OS_DISK_SIZE}" \ --public-ip-sku Standard \ --storage-sku Premium_LRS \ --public-ip-address "" \ - --admin-username ${USERNAME} \ - --ssh-key-value ~/.ssh/${KEY}.pub \ + --admin-username "${USERNAME}" \ + --ssh-key-value ~/.ssh/"${KEY}".pub \ --security-type Standard \ --output json - az vm boot-diagnostics enable --name ${VM_NAME} --resource-group ${RG} + az vm boot-diagnostics enable --name "${VM_NAME}" --resource-group "${RG}" - echo "VM_NAME=${VM_NAME}" >> $GITHUB_OUTPUT + echo "VM_NAME=${VM_NAME}" >> "$GITHUB_OUTPUT" echo "VM creation process completed successfully." - name: Get VM Private IP @@ -195,12 +195,12 @@ jobs: set -e echo "Retrieving VM Private IP address..." # Retrieve VM Private IP address - PRIVATE_IP=$(az vm show -g ${RG} -n ${VM_NAME} -d --query privateIps -o tsv) + PRIVATE_IP=$(az vm show -g "${RG}" -n "${VM_NAME}" -d --query privateIps -o tsv) if [[ -z "$PRIVATE_IP" ]]; then echo "ERROR: Failed to retrieve private IP address." exit 1 fi - echo "PRIVATE_IP=$PRIVATE_IP" >> $GITHUB_OUTPUT + echo "PRIVATE_IP=$PRIVATE_IP" >> "$GITHUB_OUTPUT" - name: Wait for SSH availability env: @@ -209,7 +209,7 @@ jobs: USERNAME: ${{ secrets.USERNAME }} run: | echo "Waiting for SSH to be accessible..." - timeout 120 bash -c 'until ssh -o StrictHostKeyChecking=no -i ~/.ssh/${KEY} ${USERNAME}@${PRIVATE_IP} "exit" 2>/dev/null; do sleep 5; done' + timeout 120 bash -c 'until ssh -o StrictHostKeyChecking=no -i ~/.ssh/"${KEY}" -- "${USERNAME}@${PRIVATE_IP}" "exit" 2>/dev/null; do sleep 5; done' echo "VM is accessible!" 
- name: Remove Old Host Key @@ -218,7 +218,7 @@ jobs: run: | set -e echo "Removing the old host key" - ssh-keygen -R $PRIVATE_IP + ssh-keygen -R "$PRIVATE_IP" - name: SSH into VM and Install Dependencies env: @@ -227,7 +227,7 @@ jobs: USERNAME: ${{ secrets.USERNAME }} run: | set -e - ssh -i ~/.ssh/${KEY} -o StrictHostKeyChecking=no ${USERNAME}@${PRIVATE_IP} << EOF + ssh -i ~/.ssh/"${KEY}" -o StrictHostKeyChecking=no -- "${USERNAME}@${PRIVATE_IP}" << EOF set -e echo "Logged in successfully." echo "Installing dependencies..." @@ -243,6 +243,6 @@ jobs: sudo systemctl enable containerd.service sudo systemctl start docker sudo groupadd -f docker - sudo usermod -a -G docker ${USERNAME} + sudo usermod -a -G docker "${USERNAME}" sudo systemctl restart docker EOF diff --git a/.github/workflows/mshv-integration.yaml b/.github/workflows/mshv-integration.yaml index ad544249eb..1b83b5d292 100644 --- a/.github/workflows/mshv-integration.yaml +++ b/.github/workflows/mshv-integration.yaml @@ -38,7 +38,7 @@ jobs: run: | set -e echo "Connecting to the VM via SSH..." - ssh -i ~/.ssh/${KEY} -o StrictHostKeyChecking=no ${USERNAME}@${PRIVATE_IP} << EOF + ssh -i ~/.ssh/"${KEY}" -o StrictHostKeyChecking=no -- "${USERNAME}@${PRIVATE_IP}" << EOF set -e echo "Logged in successfully." export PATH="\$HOME/.cargo/bin:\$PATH" @@ -87,9 +87,7 @@ jobs: PRIVATE_IP: ${{ needs.infra-setup.outputs.PRIVATE_IP }} USERNAME: ${{ secrets.MSHV_USERNAME }} run: | - ssh -i ~/.ssh/${KEY} -o StrictHostKeyChecking=no ${USERNAME}@${PRIVATE_IP} << EOF - sudo dmesg - EOF + ssh -i ~/.ssh/"${KEY}" -o StrictHostKeyChecking=no -- "${USERNAME}@${PRIVATE_IP}" sudo dmesg - name: Dump serial console logs if: always() @@ -111,8 +109,8 @@ jobs: env: RG: MSHV-INTEGRATION-${{ github.run_id }} run: | - if az group exists --name ${RG}; then - az group delete --name ${RG} --yes --no-wait + if az group exists --name "${RG}"; then + az group delete --name "${RG}" --yes --no-wait else echo "Resource Group ${RG} does not exist. 
Skipping deletion." fi @@ -122,8 +120,8 @@ jobs: env: KEY: azure_key_${{ github.run_id }} run: | - if [ -f ~/.ssh/${KEY} ]; then - rm -f ~/.ssh/${KEY} ~/.ssh/${KEY}.pub + if [ -f ~/.ssh/"${KEY}" ]; then + rm -f ~/.ssh/"${KEY}" ~/.ssh/"${KEY}.pub" echo "SSH key deleted successfully." else echo "SSH key does not exist. Skipping deletion." From 5b67b8994a3686225c35186bc9ee4b7d0171356c Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Thu, 9 Apr 2026 13:17:16 -0400 Subject: [PATCH 508/742] ci: Use set -eufo pipefail Most scripts can use it and it is good at catching errors. Signed-off-by: Demi Marie Obenour --- .github/workflows/dco.yaml | 1 + .github/workflows/integration-arm64.yaml | 2 ++ .github/workflows/integration-windows.yaml | 3 +++ .github/workflows/integration-x86-64.yaml | 1 + .github/workflows/lychee.yaml | 1 + .github/workflows/mshv-infra.yaml | 18 +++++++++--------- .github/workflows/mshv-integration.yaml | 4 ++-- .github/workflows/package-consistency.yaml | 1 + .github/workflows/quality.yaml | 2 +- 9 files changed, 21 insertions(+), 12 deletions(-) diff --git a/.github/workflows/dco.yaml b/.github/workflows/dco.yaml index 655c0b5e2f..67dfadd5c4 100644 --- a/.github/workflows/dco.yaml +++ b/.github/workflows/dco.yaml @@ -16,5 +16,6 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | + set -eufo pipefail pip3 install -U dco-check dco-check -e "49699333+dependabot[bot]@users.noreply.github.com" diff --git a/.github/workflows/integration-arm64.yaml b/.github/workflows/integration-arm64.yaml index fc545c2499..873daaa747 100644 --- a/.github/workflows/integration-arm64.yaml +++ b/.github/workflows/integration-arm64.yaml @@ -31,6 +31,7 @@ jobs: - name: Install Azure CLI if: ${{ github.event_name != 'pull_request' }} run: | + set -eufo pipefail sudo apt install -y ca-certificates curl apt-transport-https lsb-release gnupg curl -sL https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor | sudo tee /etc/apt/trusted.gpg.d/microsoft.gpg > 
/dev/null echo "deb [arch=arm64] https://packages.microsoft.com/repos/azure-cli/ bookworm main" | sudo tee /etc/apt/sources.list.d/azure-cli.list @@ -40,6 +41,7 @@ jobs: if: ${{ github.event_name != 'pull_request' }} shell: bash run: | + set -eufo pipefail IMG_BASENAME=windows-11-iot-enterprise-aarch64.raw IMG_PATH=$HOME/workloads/$IMG_BASENAME IMG_GZ_PATH=$HOME/workloads/$IMG_BASENAME.gz diff --git a/.github/workflows/integration-windows.yaml b/.github/workflows/integration-windows.yaml index 51877aa476..bb1f68158c 100644 --- a/.github/workflows/integration-windows.yaml +++ b/.github/workflows/integration-windows.yaml @@ -17,6 +17,7 @@ jobs: - name: Install Docker if: ${{ github.event_name != 'pull_request' }} run: | + set -eufo pipefail sudo apt-get update sudo apt-get -y install ca-certificates curl gnupg curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg @@ -27,6 +28,7 @@ jobs: - name: Install Azure CLI if: ${{ github.event_name != 'pull_request' }} run: | + set -eufo pipefail sudo apt install -y ca-certificates curl apt-transport-https lsb-release gnupg curl -sL https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor | sudo tee /etc/apt/trusted.gpg.d/microsoft.gpg > /dev/null echo "deb [arch=amd64] https://packages.microsoft.com/repos/azure-cli/ jammy main" | sudo tee /etc/apt/sources.list.d/azure-cli.list @@ -35,6 +37,7 @@ jobs: - name: Download Windows image if: ${{ github.event_name != 'pull_request' }} run: | + set -eufo pipefail mkdir $HOME/workloads az storage blob download --container-name private-images --file "$HOME/workloads/windows-server-2022-amd64-2.raw" --name windows-server-2022-amd64-2.raw --connection-string "${{ secrets.CH_PRIVATE_IMAGES }}" - name: Run Windows guest integration tests diff --git a/.github/workflows/integration-x86-64.yaml b/.github/workflows/integration-x86-64.yaml index b620954d04..98e2a8dcdc 100644 --- 
a/.github/workflows/integration-x86-64.yaml +++ b/.github/workflows/integration-x86-64.yaml @@ -23,6 +23,7 @@ jobs: - name: Install Docker if: ${{ github.event_name != 'pull_request' || (matrix.runner == 'garm-jammy' && matrix.libc == 'gnu') }} run: | + set -eufo pipefail sudo apt-get update sudo apt-get -y install ca-certificates curl gnupg curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg diff --git a/.github/workflows/lychee.yaml b/.github/workflows/lychee.yaml index e77c595ed3..105e2e9a6e 100644 --- a/.github/workflows/lychee.yaml +++ b/.github/workflows/lychee.yaml @@ -20,6 +20,7 @@ jobs: # NEW STEP: Print all changed-files outputs for verification - name: Verify Changed Files run: | + set -eufo pipefail echo "--- tj-actions/changed-files Outputs ---" echo "any_changed: ${{ steps.changed-files.outputs.any_changed }}" echo "all_changed_files: ${{ steps.changed-files.outputs.all_changed_files }}" diff --git a/.github/workflows/mshv-infra.yaml b/.github/workflows/mshv-infra.yaml index 22f5ddbe6b..e067e9ea80 100644 --- a/.github/workflows/mshv-infra.yaml +++ b/.github/workflows/mshv-infra.yaml @@ -59,7 +59,7 @@ jobs: env: MI_CLIENT_ID: ${{ secrets.MI_CLIENT_ID }} run: | - set -e + set -eufo pipefail echo "Installing Azure CLI if not already installed" if ! 
command -v az &>/dev/null; then curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash @@ -76,7 +76,7 @@ jobs: SKU: ${{ inputs.VM_SKU }} STORAGE_ACCOUNT_PATHS: ${{ secrets.STORAGE_ACCOUNT_PATHS }} run: | - set -e + set -eufo pipefail # Extract vCPU count from SKU (e.g., "Standard_D2s_v3" => 2) vcpu=$(echo "$SKU" | sed -n 's/^Standard_[A-Za-z]\+\([0-9]\+\).*/\1/p') if [[ -z "$vcpu" ]]; then @@ -114,7 +114,7 @@ jobs: RG: ${{ inputs.RG }} STORAGE_ACCOUNT_PATHS: ${{ secrets.STORAGE_ACCOUNT_PATHS }} run: | - set -e + set -eufo pipefail echo "Creating Resource Group: $RG" # Create the resource group echo "Creating resource group in location: ${LOCATION}" @@ -127,7 +127,7 @@ jobs: env: KEY: ${{ inputs.KEY }} run: | - set -e + set -eufo pipefail echo "Generating SSH key: $KEY" mkdir -p ~/.ssh ssh-keygen -t rsa -b 4096 -f ~/.ssh/"${KEY}" -N "" @@ -145,7 +145,7 @@ jobs: VM_IMAGE_NAME: ${{ inputs.ARCH }}_${{ steps.get-location.outputs.location }}_image VM_NAME: ${{ inputs.ARCH }}_${{ steps.get-location.outputs.location }}_${{ github.run_id }} run: | - set -e + set -eufo pipefail echo "Creating $VM_SKU VM: $VM_NAME" # Extract subnet ID from the runner VM @@ -192,7 +192,7 @@ jobs: RG: ${{ inputs.RG }} VM_NAME: ${{ inputs.ARCH }}_${{ steps.get-location.outputs.location }}_${{ github.run_id }} run: | - set -e + set -eufo pipefail echo "Retrieving VM Private IP address..." # Retrieve VM Private IP address PRIVATE_IP=$(az vm show -g "${RG}" -n "${VM_NAME}" -d --query privateIps -o tsv) @@ -216,7 +216,7 @@ jobs: env: PRIVATE_IP: ${{ steps.get-vm-ip.outputs.PRIVATE_IP }} run: | - set -e + set -eufo pipefail echo "Removing the old host key" ssh-keygen -R "$PRIVATE_IP" @@ -226,9 +226,9 @@ jobs: PRIVATE_IP: ${{ steps.get-vm-ip.outputs.PRIVATE_IP }} USERNAME: ${{ secrets.USERNAME }} run: | - set -e + set -eufo pipefail ssh -i ~/.ssh/"${KEY}" -o StrictHostKeyChecking=no -- "${USERNAME}@${PRIVATE_IP}" << EOF - set -e + set -eufo pipefail echo "Logged in successfully." 
echo "Installing dependencies..." sudo tdnf install -y git moby-engine moby-cli clang llvm pkg-config make gcc glibc-devel diff --git a/.github/workflows/mshv-integration.yaml b/.github/workflows/mshv-integration.yaml index 1b83b5d292..d4ecea4ed1 100644 --- a/.github/workflows/mshv-integration.yaml +++ b/.github/workflows/mshv-integration.yaml @@ -36,7 +36,7 @@ jobs: RG: MSHV-${{ github.run_id }} USERNAME: ${{ secrets.MSHV_USERNAME }} run: | - set -e + set -eufo pipefail echo "Connecting to the VM via SSH..." ssh -i ~/.ssh/"${KEY}" -o StrictHostKeyChecking=no -- "${USERNAME}@${PRIVATE_IP}" << EOF set -e @@ -96,7 +96,7 @@ jobs: RG_NAME: ${{ needs.infra-setup.outputs.RG_NAME }} VM_NAME: ${{ needs.infra-setup.outputs.VM_NAME }} run: | - set -e + set -eufo pipefail az vm boot-diagnostics get-boot-log --name "${VM_NAME}" --resource-group "${RG_NAME}" | jq -r cleanup: diff --git a/.github/workflows/package-consistency.yaml b/.github/workflows/package-consistency.yaml index df7f01b8af..7f7808c882 100644 --- a/.github/workflows/package-consistency.yaml +++ b/.github/workflows/package-consistency.yaml @@ -27,6 +27,7 @@ jobs: - name: Check Rust VMM Package Consistency of fuzz Workspace run: | + set -eufo pipefail pushd fuzz python3 ../scripts/package-consistency-check.py github.com/rust-vmm popd diff --git a/.github/workflows/quality.yaml b/.github/workflows/quality.yaml index de6391186a..73b385811b 100644 --- a/.github/workflows/quality.yaml +++ b/.github/workflows/quality.yaml @@ -44,7 +44,7 @@ jobs: - name: Bisectability Check (default features) if: ${{ github.event_name == 'pull_request' && matrix.target == 'x86_64-unknown-linux-gnu' }} run: | - set -e + set -eufo pipefail commits=$(git rev-list origin/${{ github.base_ref }}..${{ github.sha }}) for commit in $commits; do git checkout $commit; cargo check --tests --examples --all --target=${{ matrix.target }}; done git checkout ${{ github.sha }} From 295a76ea28fb15034def7f1a9a9bfd55147c0dc3 Mon Sep 17 00:00:00 2001 From: 
Demi Marie Obenour Date: Thu, 9 Apr 2026 13:18:09 -0400 Subject: [PATCH 509/742] ci: Use a variable of type number for the OS disk size It's better to let GitHub Actions validate this. Signed-off-by: Demi Marie Obenour --- .github/workflows/mshv-infra.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/mshv-infra.yaml b/.github/workflows/mshv-infra.yaml index e067e9ea80..18e419f050 100644 --- a/.github/workflows/mshv-infra.yaml +++ b/.github/workflows/mshv-infra.yaml @@ -13,7 +13,7 @@ on: OS_DISK_SIZE: description: 'OS Disk Size in GB' required: true - type: string + type: number RG: description: 'Resource Group Name' required: true From dcdf16b8ff80e37da460b15107ccac94c2ad2099 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Thu, 9 Apr 2026 13:19:49 -0400 Subject: [PATCH 510/742] ci: Use bash regex instead of sed Easier to read and more reliable. Signed-off-by: Demi Marie Obenour --- .github/workflows/mshv-infra.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/mshv-infra.yaml b/.github/workflows/mshv-infra.yaml index 18e419f050..24ee2a59cd 100644 --- a/.github/workflows/mshv-infra.yaml +++ b/.github/workflows/mshv-infra.yaml @@ -78,11 +78,11 @@ jobs: run: | set -eufo pipefail # Extract vCPU count from SKU (e.g., "Standard_D2s_v3" => 2) - vcpu=$(echo "$SKU" | sed -n 's/^Standard_[A-Za-z]\+\([0-9]\+\).*/\1/p') - if [[ -z "$vcpu" ]]; then - echo "Cannot extract vCPU count from SKU: $SKU" + if ! [[ "$SKU" =~ ^Standard_[A-Za-z]+([1-9][0-9]*) ]]; then + printf 'Cannot extract vCPU count from SKU: %q\n' "$SKU" exit 1 fi + vcpu=${BASH_REMATCH[1]} SUPPORTED_LOCATIONS=$(echo "$STORAGE_ACCOUNT_PATHS" | jq -r 'to_entries[] | .key') From b8a61da06af84bd441c963bdff019aa093ffb684 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Thu, 9 Apr 2026 13:39:55 -0400 Subject: [PATCH 511/742] ci: Use jq instead of bash arithmetic jq's arithmetic is much more robust. 
Signed-off-by: Demi Marie Obenour --- .github/workflows/mshv-infra.yaml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/workflows/mshv-infra.yaml b/.github/workflows/mshv-infra.yaml index 24ee2a59cd..6e15c212e4 100644 --- a/.github/workflows/mshv-infra.yaml +++ b/.github/workflows/mshv-infra.yaml @@ -93,11 +93,9 @@ jobs: continue fi - usage=$(az vm list-usage --location "$location" --query "[?name.value=='$family'] | [0]" -o json) - current=$(echo "$usage" | jq -r '.currentValue') - limit=$(echo "$usage" | jq -r '.limit') - - if [[ $((limit - current)) -ge $vcpu ]]; then + remaining=$(az vm list-usage --location "$location" --query "[?name.value=='$family'] | [0]" -o json | + jq '.limit + 0 - .currentValue >= $ARGS.positional[0]' --jsonargs "$vcpu") + if [[ "$remaining" = true ]]; then echo "Sufficient quota found in $location" echo "location=$location" >> "$GITHUB_OUTPUT" exit 0 From 3a23e2f84112a9bd275f662a75404372eae044dd Mon Sep 17 00:00:00 2001 From: Anirudh Rayabharam Date: Sat, 11 Apr 2026 06:13:18 +0000 Subject: [PATCH 512/742] ci: fix jq usage in mshv-infra.yaml Workflow runs fail in the "Get Location" step with: jq: error (at :9): string ("100") and number (0) cannot be added Use tonumber to explicitly convert string to number instead of the "+ 0" trick. 
Signed-off-by: Anirudh Rayabharam --- .github/workflows/mshv-infra.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/mshv-infra.yaml b/.github/workflows/mshv-infra.yaml index 6e15c212e4..89cb5f6fbc 100644 --- a/.github/workflows/mshv-infra.yaml +++ b/.github/workflows/mshv-infra.yaml @@ -94,7 +94,7 @@ jobs: fi remaining=$(az vm list-usage --location "$location" --query "[?name.value=='$family'] | [0]" -o json | - jq '.limit + 0 - .currentValue >= $ARGS.positional[0]' --jsonargs "$vcpu") + jq '(.limit | tonumber) - (.currentValue | tonumber) >= ($ARGS.positional[0] | tonumber)' --jsonargs "$vcpu") if [[ "$remaining" = true ]]; then echo "Sufficient quota found in $location" echo "location=$location" >> "$GITHUB_OUTPUT" From bdc7a6947d6d175b65d23f45712ce55bcf4e2052 Mon Sep 17 00:00:00 2001 From: JP Kobryn Date: Wed, 8 Apr 2026 18:52:01 -0700 Subject: [PATCH 513/742] vmm: add per-zone mergeable option to --memory-zone Add a `mergeable` field to `MemoryZoneConfig` so that KSM page merging can be enabled selectively per memory zone rather than globally for all guest RAM. Previously, `MADV_MERGEABLE` was only controllable via the top-level `--memory mergeable=on` flag, which applied uniformly to all regions. With this change, users can leave boot memory unmerged while enabling KSM only on hotplug zones: --memory size=0,hotplug_method=virtio-mem --memory-zone id=boot,size=512M,shared=on,mergeable=off --memory-zone id=hotplug,size=256M,hotplug_size=1G,shared=off,mergeable=on The `MemoryZone` runtime struct now carries the `mergeable` flag so that both `allocate_address_space` and `add_ram_region` can apply per-zone `MADV_MERGEABLE` instead of the global `self.mergeable`. The top-level `--memory mergeable=on` path continues to work unchanged: the default zone is synthesised from `MemoryConfig` and inherits its `mergeable` value. AI/LLM disclosure: this patch was co-authored with GitHub Copilot and Claude Code (Opus 4.6). 
Signed-off-by: JP Kobryn --- docs/memory.md | 31 +++++++++++++- vmm/src/config.rs | 90 ++++++++++++++++++++++++++++++++++++++- vmm/src/memory_manager.rs | 31 ++++++++++---- vmm/src/vm_config.rs | 4 +- 4 files changed, 145 insertions(+), 11 deletions(-) diff --git a/docs/memory.md b/docs/memory.md index 75179e0e07..fb42e89374 100644 --- a/docs/memory.md +++ b/docs/memory.md @@ -214,11 +214,12 @@ struct MemoryZoneConfig { hotplug_size: Option, hotplugged_size: Option, prefault: bool, + mergeable: bool, } ``` ``` ---memory-zone User defined memory zone parameters "size=,file=,shared=on|off,hugepages=on|off,hugepage_size=,host_numa_node=,id=,hotplug_size=,hotplugged_size=,prefault=on|off" +--memory-zone User defined memory zone parameters "size=,file=,shared=on|off,hugepages=on|off,hugepage_size=,host_numa_node=,id=,hotplug_size=,hotplugged_size=,prefault=on|off,mergeable=on|off" ``` This parameter expects one or more occurrences, allowing for a list of memory @@ -422,6 +423,34 @@ _Example_ --memory-zone id=mem0,size=1G,prefault=on ``` +### `mergeable` + +Specifies if the pages from this memory zone must be marked as _mergeable_, +enabling Kernel Same-page Merging (KSM) for this zone. + +This is the per-zone equivalent of the top-level `--memory mergeable=on` option. +It allows KSM to be enabled selectively — for example, enabling it only on a +hotplug zone while leaving boot memory unaffected: + +``` +--memory size=2G,mergeable=off +--memory-zone id=hotplug,size=0,hotplug_size=8G,mergeable=on +``` + +For KSM to have any effect, the host kernel must have KSM enabled: +``` +echo 1 > /sys/kernel/mm/ksm/run +``` + +By default this option is turned off. 
+ +_Example_ + +``` +--memory size=0 +--memory-zone id=mem0,size=1G,mergeable=on +``` + ## NUMA settings `NumaConfig` or what is known as `--numa` from the CLI perspective has been diff --git a/vmm/src/config.rs b/vmm/src/config.rs index 405cb1da76..b2ae29a84f 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -1006,7 +1006,8 @@ impl MemoryConfig { .add("host_numa_node") .add("hotplug_size") .add("hotplugged_size") - .add("prefault"); + .add("prefault") + .add("mergeable"); parser.parse(memory_zone).map_err(Error::ParseMemoryZone)?; let id = parser.get("id").ok_or(Error::ParseMemoryZoneIdMissing)?; @@ -1047,6 +1048,11 @@ impl MemoryConfig { .map_err(Error::ParseMemoryZone)? .unwrap_or(Toggle(false)) .0; + let mergeable = parser + .convert::("mergeable") + .map_err(Error::ParseMemoryZone)? + .unwrap_or(Toggle(mergeable)) + .0; zones.push(MemoryZoneConfig { id, @@ -1059,6 +1065,7 @@ impl MemoryConfig { hotplug_size, hotplugged_size, prefault, + mergeable, }); } Some(zones) @@ -3750,6 +3757,87 @@ mod unit_tests { Ok(()) } + #[test] + fn test_mem_zone_parsing() -> Result<()> { + // mergeable defaults to false + assert_eq!( + MemoryConfig::parse("size=0", Some(vec!["id=mem0,size=1G"]))?, + MemoryConfig { + size: 0, + zones: Some(vec![MemoryZoneConfig { + id: "mem0".to_string(), + size: 1 << 30, + ..Default::default() + }]), + ..Default::default() + } + ); + // mergeable=on + assert_eq!( + MemoryConfig::parse("size=0", Some(vec!["id=mem0,size=1G,mergeable=on"]))?, + MemoryConfig { + size: 0, + zones: Some(vec![MemoryZoneConfig { + id: "mem0".to_string(), + size: 1 << 30, + mergeable: true, + ..Default::default() + }]), + ..Default::default() + } + ); + // mergeable=off is explicit false + assert_eq!( + MemoryConfig::parse("size=0", Some(vec!["id=mem0,size=1G,mergeable=off"]))?, + MemoryConfig { + size: 0, + zones: Some(vec![MemoryZoneConfig { + id: "mem0".to_string(), + size: 1 << 30, + mergeable: false, + ..Default::default() + }]), + ..Default::default() + } + 
); + // per-zone mergeable independent of global mergeable + assert_eq!( + MemoryConfig::parse( + "size=1G,mergeable=off", + Some(vec!["id=hotplug,size=0,hotplug_size=4G,mergeable=on"]) + )?, + MemoryConfig { + size: 1 << 30, + mergeable: false, + hotplug_method: HotplugMethod::Acpi, + zones: Some(vec![MemoryZoneConfig { + id: "hotplug".to_string(), + size: 0, + hotplug_size: Some(4 << 30), + mergeable: true, + ..Default::default() + }]), + ..Default::default() + } + ); + // global mergeable=on inherited by zone with no explicit mergeable + assert_eq!( + MemoryConfig::parse("size=0,mergeable=on", Some(vec!["id=mem0,size=1G"]))?, + MemoryConfig { + size: 0, + mergeable: true, + zones: Some(vec![MemoryZoneConfig { + id: "mem0".to_string(), + size: 1 << 30, + mergeable: true, + ..Default::default() + }]), + ..Default::default() + } + ); + Ok(()) + } + #[test] fn test_mem_parsing() -> Result<()> { assert_eq!(MemoryConfig::parse("", None)?, MemoryConfig::default()); diff --git a/vmm/src/memory_manager.rs b/vmm/src/memory_manager.rs index 3a57ea60a4..a01949b0fc 100644 --- a/vmm/src/memory_manager.rs +++ b/vmm/src/memory_manager.rs @@ -138,16 +138,18 @@ pub struct MemoryZone { shared: bool, hugepages: bool, backing_page_size: u64, + mergeable: bool, } impl MemoryZone { - fn new(shared: bool, hugepages: bool, backing_page_size: u64) -> Self { + fn new(shared: bool, hugepages: bool, backing_page_size: u64, mergeable: bool) -> Self { Self { regions: Vec::new(), virtio_mem_zone: None, shared, hugepages, backing_page_size, + mergeable, } } @@ -607,7 +609,7 @@ impl MemoryManager { // Add zone id to the list of memory zones. 
memory_zones.insert( zone.id.clone(), - MemoryZone::new(zone.shared, zone.hugepages, zone_align_size), + MemoryZone::new(zone.shared, zone.hugepages, zone_align_size, zone.mergeable), ); for ram_region in ram_regions.iter() { @@ -701,7 +703,12 @@ impl MemoryManager { } memory_zones.insert( zone.id.clone(), - MemoryZone::new(zone.shared, zone.hugepages, zone_align_size), + MemoryZone::new( + zone.shared, + zone.hugepages, + zone_align_size, + zone.mergeable, + ), ); } @@ -733,7 +740,12 @@ impl MemoryManager { let zone_page_size = memory_zone_get_align_size(zone_config)?; memory_zones.insert( zone_config.id.clone(), - MemoryZone::new(zone_config.shared, zone_config.hugepages, zone_page_size), + MemoryZone::new( + zone_config.shared, + zone_config.hugepages, + zone_page_size, + zone_config.mergeable, + ), ); } @@ -1295,6 +1307,7 @@ impl MemoryManager { hotplug_size: config.hotplug_size, hotplugged_size: config.hotplugged_size, prefault: config.prefault, + mergeable: config.mergeable, }]; Ok((config.size, zones, allow_mem_hotplug)) @@ -1316,10 +1329,10 @@ impl MemoryManager { regions.push((virtio_mem_zone.region().clone(), true)); } - list.push((zone_id.clone(), regions)); + list.push((zone_id.clone(), regions, memory_zone.mergeable)); } - for (zone_id, regions) in list { + for (zone_id, regions, zone_mergeable) in list { for (region, virtio_mem) in regions { // SAFETY: guaranteed by GuestRegionMmap invariants let slot = unsafe { @@ -1327,7 +1340,7 @@ impl MemoryManager { region.start_addr().raw_value(), region.len().try_into().unwrap(), region.as_ptr(), - self.mergeable, + zone_mergeable, false, self.log_dirty, ) @@ -2087,7 +2100,9 @@ impl MemoryManager { region.start_addr().0, region.len().try_into().unwrap(), region.as_ptr(), - self.mergeable, + self.memory_zones + .get(DEFAULT_MEMORY_ZONE) + .map_or(self.mergeable, |z| z.mergeable), false, self.log_dirty, ) diff --git a/vmm/src/vm_config.rs b/vmm/src/vm_config.rs index 94b1c011c8..541f6f21b0 100644 --- 
a/vmm/src/vm_config.rs +++ b/vmm/src/vm_config.rs @@ -159,7 +159,7 @@ pub struct PciSegmentConfig { pub mmio64_aperture_weight: u32, } -#[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] +#[derive(Clone, Debug, Default, PartialEq, Eq, Deserialize, Serialize)] pub struct MemoryZoneConfig { pub id: String, pub size: u64, @@ -179,6 +179,8 @@ pub struct MemoryZoneConfig { pub hotplugged_size: Option, #[serde(default)] pub prefault: bool, + #[serde(default)] + pub mergeable: bool, } impl ApplyLandlock for MemoryZoneConfig { From 490e338e16453f15b8be247d80ef0b01c1ce2e37 Mon Sep 17 00:00:00 2001 From: Changyuan Lyu Date: Fri, 10 Apr 2026 11:49:05 -0700 Subject: [PATCH 514/742] pci: synchronize VfioMsix::cap and VfioMsix::bar Currently, when snapshoting a running VFIO device with MSI-X enabled, we get a snapshot where `msix_config.state.enabled` is not consistent with `msix_state.cap.msg_ctl`, ```jsonc { "snapshots": { "vfio_common": { "snapshots": { "msix_config": { "snapshots": {}, "state": { "enabled": true // ... } }, // .. }, "state": { "msix_state": { "cap": { "msg_ctl": 3, "table": 1, "pba": 2049 }, // ... } // ... } } }, // ... } ``` The root cause is, after a `MsixCap` is parsed from the device PCI config space and propagated to a corresponding `MsixConfig`, `MsixCap::msg_ctl` is never get updated at runtime, only `MsixConfig::msg_ctl` is updated. This commit makes `VfioMsix::update` update both `VfioMsix::bar` (of type `MsixConfig`) and `VfioMsix::cap` (of type `MsixCap`). 
Signed-off-by: Changyuan Lyu --- pci/src/vfio.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pci/src/vfio.rs b/pci/src/vfio.rs index 7dc6f9c3b1..51e3aa9271 100644 --- a/pci/src/vfio.rs +++ b/pci/src/vfio.rs @@ -157,11 +157,14 @@ impl VfioMsix { // Update "Message Control" word if offset == 2 && data.len() == 2 { - self.bar.set_msg_ctl(LittleEndian::read_u16(data)); + let data = LittleEndian::read_u16(data); + self.bar.set_msg_ctl(data); + self.cap.set_msg_ctl(data); } else if offset == 0 && data.len() == 4 { // Some guests update MSI-X control through the dword config write path. - self.bar - .set_msg_ctl((LittleEndian::read_u32(data) >> 16) as u16); + let data = (LittleEndian::read_u32(data) >> 16) as u16; + self.bar.set_msg_ctl(data); + self.cap.set_msg_ctl(data); } let new_enabled = self.bar.enabled(); From 2ddbc5abbbbae8b9c0e7b8a10a54c6be7060707f Mon Sep 17 00:00:00 2001 From: Saravanan D Date: Tue, 7 Apr 2026 06:49:28 +0000 Subject: [PATCH 515/742] scripts: use arch OVMF downloads and bump fw tag Rename download_ovmf to download_amd64_ovmf and add a separate download_aarch64_ovmf for CLOUDHV_EFI.fd Replace build_edk2 with download_aarch64_ovmf in aarch64 scripts Update OVMF firmware tag to ch-1e1b96f126 Update CLOUDHV.fd sha1sum in sha1sums-x86_64 Add CLOUDHV_EFI.fd sha1sum in sha1sums-aarch64-common Fixes: #7622 Signed-off-by: Saravanan D --- scripts/run_integration_tests_aarch64.sh | 6 +++--- scripts/run_integration_tests_windows_aarch64.sh | 4 ++-- scripts/run_integration_tests_windows_x86_64.sh | 3 ++- scripts/run_integration_tests_x86_64.sh | 2 +- scripts/sha1sums-aarch64-common | 1 + scripts/sha1sums-x86_64 | 2 +- scripts/test-util.sh | 14 ++++++++++++-- 7 files changed, 22 insertions(+), 10 deletions(-) diff --git a/scripts/run_integration_tests_aarch64.sh b/scripts/run_integration_tests_aarch64.sh index 9dac761e98..5a1dce52b9 100755 --- a/scripts/run_integration_tests_aarch64.sh +++ 
b/scripts/run_integration_tests_aarch64.sh @@ -136,6 +136,9 @@ update_workloads() { popd || exit fi + # Download aarch64 ovmf + download_aarch64_ovmf + pushd "$WORKLOADS_DIR" || exit if ! sha1sum sha1sums-aarch64-common --check; then @@ -202,9 +205,6 @@ update_workloads() { echo "foo" >"$SHARED_DIR/file1" echo "bar" >"$SHARED_DIR/file3" || exit 1 fi - - # Checkout and build EDK2 - build_edk2 } process_common_args "$@" diff --git a/scripts/run_integration_tests_windows_aarch64.sh b/scripts/run_integration_tests_windows_aarch64.sh index 8f12a2740a..58523e2098 100755 --- a/scripts/run_integration_tests_windows_aarch64.sh +++ b/scripts/run_integration_tests_windows_aarch64.sh @@ -18,9 +18,9 @@ fi WIN_IMAGE_BASENAME="windows-11-iot-enterprise-aarch64.raw" WIN_IMAGE_FILE="$WORKLOADS_DIR/$WIN_IMAGE_BASENAME" -# Checkout and build EDK2 +# Download aarch64 OVMF OVMF_FW="$WORKLOADS_DIR/CLOUDHV_EFI.fd" -build_edk2 +download_aarch64_ovmf # Check if the images are present if [[ ! -f ${WIN_IMAGE_FILE} || ! -f ${OVMF_FW} ]]; then diff --git a/scripts/run_integration_tests_windows_x86_64.sh b/scripts/run_integration_tests_windows_x86_64.sh index d8f6861497..52b7796b83 100755 --- a/scripts/run_integration_tests_windows_x86_64.sh +++ b/scripts/run_integration_tests_windows_x86_64.sh @@ -17,7 +17,8 @@ WIN_IMAGE_FILE="/root/workloads/windows-server-2022-amd64-2.raw" WORKLOADS_DIR="/root/workloads" -download_ovmf +# Download amd64 ovmf +download_amd64_ovmf CFLAGS="" if [[ "${BUILD_TARGET}" == "x86_64-unknown-linux-musl" ]]; then diff --git a/scripts/run_integration_tests_x86_64.sh b/scripts/run_integration_tests_x86_64.sh index 858bd2f872..f12fa9aaaa 100755 --- a/scripts/run_integration_tests_x86_64.sh +++ b/scripts/run_integration_tests_x86_64.sh @@ -25,7 +25,7 @@ if [ ! -f "$WORKLOADS_DIR/hypervisor-fw" ]; then fi if [ ! 
-f "$WORKLOADS_DIR/CLOUDHV.fd" ]; then - download_ovmf + download_amd64_ovmf fi download_x86_guest_images diff --git a/scripts/sha1sums-aarch64-common b/scripts/sha1sums-aarch64-common index 4585509712..d955f98bec 100644 --- a/scripts/sha1sums-aarch64-common +++ b/scripts/sha1sums-aarch64-common @@ -3,3 +3,4 @@ e4addb6e212a298144f9eb0eb6e36019d013f0e7 alpine-minirootfs-aarch64.tar.gz 9953b31bb1923cdd8d91b1b7cc9ad3a9be1e0a59 focal-server-cloudimg-arm64-custom-20210929-0.raw 7118f4d4cad18c8357bc2ad9824a50f9a82a860a jammy-server-cloudimg-arm64-custom-20220329-0.qcow2 1f2b71be43b8f748f01306c4454e5c921343faa4 jammy-server-cloudimg-arm64-custom-20220329-0.raw +ce3656987f9e4238ef8afbd65fca219460c1f767 CLOUDHV_EFI.fd diff --git a/scripts/sha1sums-x86_64 b/scripts/sha1sums-x86_64 index 1f7500e8b4..e719bcc316 100644 --- a/scripts/sha1sums-x86_64 +++ b/scripts/sha1sums-x86_64 @@ -1,3 +1,3 @@ d4a44acc6014d5f83dea1c625c43d677a95fa75f alpine-minirootfs-x86_64.tar.gz 540ac358429305d7aa94e15363665d1c9d845982 hypervisor-fw -cf89e3e052c8ef0b6192abee6128eef943393307 CLOUDHV.fd +fb2e6834cc482c80a45766f6dcf12474f4fcb74e CLOUDHV.fd diff --git a/scripts/test-util.sh b/scripts/test-util.sh index 2ad10d12fb..3ba2474a49 100644 --- a/scripts/test-util.sh +++ b/scripts/test-util.sh @@ -198,8 +198,8 @@ prepare_linux() { fi } -download_ovmf() { - OVMF_FW_TAG="ch-13b4963ec4" +download_amd64_ovmf() { + OVMF_FW_TAG="ch-1e1b96f126" OVMF_FW_URL="https://github.com/cloud-hypervisor/edk2/releases/download/$OVMF_FW_TAG/CLOUDHV.fd" OVMF_FW="$WORKLOADS_DIR/CLOUDHV.fd" pushd "$WORKLOADS_DIR" || exit @@ -208,6 +208,16 @@ download_ovmf() { popd || exit } +download_aarch64_ovmf() { + OVMF_FW_TAG="ch-1e1b96f126" + OVMF_FW_URL="https://github.com/cloud-hypervisor/edk2/releases/download/$OVMF_FW_TAG/CLOUDHV_EFI.fd" + OVMF_FW="$WORKLOADS_DIR/CLOUDHV_EFI.fd" + pushd "$WORKLOADS_DIR" || exit + rm -f "$OVMF_FW" + download_with_retries $OVMF_FW_URL || exit 1 + popd || exit +} + # Function to mount image partition, 
execute commands, and cleanup. # Arguments: $1: Image file path, $2: Mount directory, $3+: Commands to execute. mount_and_exec() { From 23a980cd54136a68d8ca56f1c5392eb92a686879 Mon Sep 17 00:00:00 2001 From: Saravanan D Date: Tue, 31 Mar 2026 06:17:43 +0000 Subject: [PATCH 516/742] pci: expand sub-page VFIO BAR mmap to page size On aarch64 with 64K host pages, VFIO passthrough of devices with sub-page BARs (e.g. 16K NVMe BAR0) crashes with EINVAL from KVM_SET_USER_MEMORY_REGION, which requires memory_size to be a multiple of the host page size. Expand the mmap to page size instead of rejecting it, matching QEMU's approach. The kernel's vfio_pci_probe_mmaps() already verifies that sub-page BARs are page-aligned and reserves the remainder of the page, so expansion is safe at offset 0. Reject sub-page sparse areas at non-zero offsets where this guarantee does not apply. The expanded mmap region will not overlap with the relocated MSI-X trap region because fixup_msix_region() ensures MSI-X relocation at >= page_size offset. 
Signed-off-by: Saravanan D --- pci/src/vfio.rs | 38 +++++++++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/pci/src/vfio.rs b/pci/src/vfio.rs index 51e3aa9271..b49ed51989 100644 --- a/pci/src/vfio.rs +++ b/pci/src/vfio.rs @@ -21,7 +21,8 @@ use thiserror::Error; use vfio_bindings::bindings::vfio::*; use vfio_ioctls::{VfioDevice, VfioIrq, VfioOps, VfioRegionInfoCap, VfioRegionSparseMmapArea}; use vm_allocator::page_size::{ - align_page_size_down, align_page_size_up, is_4k_aligned, is_4k_multiple, is_page_size_aligned, + align_page_size_down, align_page_size_up, get_page_size, is_4k_aligned, is_4k_multiple, + is_page_size_aligned, }; use vm_allocator::{AddressAllocator, MemorySlotAllocator, SystemAllocator}; use vm_device::dma_mapping::ExternalDmaMapping; @@ -1686,9 +1687,40 @@ impl VfioPciDevice { self.common.interrupt.msix.as_ref(), )?; + let page_size = get_page_size(); for area in sparse_areas.iter() { + // KVM_SET_USER_MEMORY_REGION requires memory_size to be a + // multiple of the host page size. On aarch64 with 64K pages + // a device BAR can be smaller than a page (e.g. 16K NVMe + // BAR). + // + // The kernel only sets VFIO_REGION_INFO_FLAG_MMAP on sub-page + // BARs after verifying the physical BAR start is page-aligned + // and reserving the rest of the page. Expansion is only safe + // at offset 0 where the kernel reservation applies. + // + // fixup_msix_region() ensures MSI-X relocation at >= page_size + // offset, so the expanded mmap cannot overlap the trap region. 
+ let mmap_len = if area.size < page_size { + if area.offset != 0 { + error!( + "BAR {}: sub-page sparse area at non-zero offset 0x{:x} \ + cannot be safely expanded to page size", + region.index, area.offset, + ); + return Err(VfioPciError::MmapArea); + } + info!( + "BAR {}: expanding sub-page sparse area mmap from 0x{:x} to \ + page size 0x{:x}", + region.index, area.size, page_size, + ); + page_size + } else { + area.size + }; let mapping = match MmapRegion::mmap( - area.size, + mmap_len, prot, fd, mmap_offset, @@ -1699,7 +1731,7 @@ impl VfioPciDevice { error!( "Could not mmap sparse area (offset = 0x{:x}, size = 0x{:x}): {}", mmap_offset, - area.size, + mmap_len, std::io::Error::last_os_error() ); return Err(VfioPciError::MmapArea); From 8b212aafc085619381788bdcc71eb40109fdbef4 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Sun, 12 Apr 2026 05:58:50 -0400 Subject: [PATCH 517/742] ci: Escape $ in heredoc in MSHV workflow script This is a preexisting bug in the MSHV integration tests, but previously it only caused a warning. With commit Fixes: 5b67b8994a36 ("ci: Use set -eufo pipefail") it becomes an error. Fixes: 5b67b8994a36 ("ci: Use set -eufo pipefail") Fixes: #7996 Signed-off-by: Demi Marie Obenour --- .github/workflows/mshv-integration.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/mshv-integration.yaml b/.github/workflows/mshv-integration.yaml index d4ecea4ed1..437cf44f6c 100644 --- a/.github/workflows/mshv-integration.yaml +++ b/.github/workflows/mshv-integration.yaml @@ -67,12 +67,12 @@ jobs: echo "Setting permissions..." 
for i in 0 1 2; do - dev="/dev/vhost-vdpa-$i" - if [ -e "$dev" ]; then - sudo chown $USER:$USER "$dev" - sudo chmod 660 "$dev" + dev="/dev/vhost-vdpa-\$i" + if [ -e "\$dev" ]; then + sudo chown \$USER:\$USER "\$dev" + sudo chmod 660 "\$dev" else - echo "Warning: Device $dev not found" + echo "Warning: Device \$dev not found" fi done From 6adc4f2c90d1afa3cc9a6204e291772eb383cde4 Mon Sep 17 00:00:00 2001 From: Bo Chen Date: Sat, 11 Apr 2026 04:42:44 +0000 Subject: [PATCH 518/742] tests: vfio: Add more checks on the Nvidia GPU from the guest Signed-off-by: Bo Chen --- cloud-hypervisor/tests/integration.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 15516e84e9..8b16c909e3 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -8274,6 +8274,9 @@ mod vfio { assert!(guest.get_total_memory().unwrap_or_default() > 3_840_000); + // Verify the VFIO device works before memory hotplug + guest.check_nvidia_gpu(); + guest.enable_memory_hotplug(); // Add RAM to the VM @@ -8447,6 +8450,9 @@ mod vfio { .unwrap() .contains("input address: 42 bits") ); + + // Check the VFIO device works after boot + guest.check_nvidia_gpu(); }); let _ = child.kill(); From e8b6fe054859b3631ec030330a4999a3d9ef4d9c Mon Sep 17 00:00:00 2001 From: Bo Chen Date: Sat, 11 Apr 2026 04:51:22 +0000 Subject: [PATCH 519/742] tests: vfio: Capture guest dmesg when nvidia-smi failed The guest dmesg can provide more context from the guest kernel, say Nvidia driver errors, IOMMU faults, etc. 
Signed-off-by: Bo Chen --- test_infra/src/lib.rs | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/test_infra/src/lib.rs b/test_infra/src/lib.rs index 248e4efa34..47d022cd6e 100644 --- a/test_infra/src/lib.rs +++ b/test_infra/src/lib.rs @@ -1316,11 +1316,18 @@ impl Guest { #[cfg(target_arch = "x86_64")] pub fn check_nvidia_gpu(&self) { - assert!( - self.ssh_command("nvidia-smi") - .unwrap() - .contains("NVIDIA L40S") - ); + let output = self.ssh_command("nvidia-smi").unwrap(); + if !output.contains("NVIDIA L40S") { + let dmesg = self + .ssh_command("sudo dmesg") + .unwrap_or_else(|e| format!("Failed to get dmesg: {e:?}")); + eprintln!( + "\n\n==== Guest dmesg (nvidia-smi check failed) ====\n\n\ + {dmesg}\n\ + \n==== End guest dmesg ====\n\n" + ); + panic!("nvidia-smi output did not contain 'NVIDIA L40S': {output}"); + } } pub fn reboot_linux(&self, current_reboot_count: u32) { From e48b14dcacd128a2bf5301bdc9e568a66795d694 Mon Sep 17 00:00:00 2001 From: Bo Chen Date: Sat, 11 Apr 2026 04:58:26 +0000 Subject: [PATCH 520/742] scripts: Add retries for vfio integration tests Signed-off-by: Bo Chen --- scripts/run_integration_tests_vfio.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/run_integration_tests_vfio.sh b/scripts/run_integration_tests_vfio.sh index eecd3111b8..3f1b267787 100755 --- a/scripts/run_integration_tests_vfio.sh +++ b/scripts/run_integration_tests_vfio.sh @@ -31,12 +31,12 @@ export RUST_BACKTRACE=1 export RUSTFLAGS="$RUSTFLAGS" # Run VFIO tests using legacy vfio interface with container/group -time cargo nextest run --no-tests=pass --test-threads=1 "vfio::test_nvidia" -- ${test_binary_args[*]} +time cargo nextest run --retries 3 --no-tests=pass --test-threads=1 "vfio::test_nvidia" -- ${test_binary_args[*]} RES=$? 
# Run VFIO tests using vfio cdev interface backed by iommufd if [ $RES -eq 0 ]; then - time cargo nextest run --no-tests=pass --test-threads=1 "vfio::test_iommufd" -- ${test_binary_args[*]} + time cargo nextest run --retries 3 --no-tests=pass --test-threads=1 "vfio::test_iommufd" -- ${test_binary_args[*]} RES=$? fi From a3899a9783683053ed841a0b0b8bdb4fc9ba888f Mon Sep 17 00:00:00 2001 From: Bo Chen Date: Sat, 11 Apr 2026 05:20:12 +0000 Subject: [PATCH 521/742] tests: Add retries for windows integration tests Signed-off-by: Bo Chen --- scripts/run_integration_tests_windows_aarch64.sh | 2 +- scripts/run_integration_tests_windows_x86_64.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/run_integration_tests_windows_aarch64.sh b/scripts/run_integration_tests_windows_aarch64.sh index 58523e2098..69537d7769 100755 --- a/scripts/run_integration_tests_windows_aarch64.sh +++ b/scripts/run_integration_tests_windows_aarch64.sh @@ -44,7 +44,7 @@ cargo build --all --release --target "$BUILD_TARGET" # Only run with 1 thread to avoid tests interfering with one another because # Windows has a static IP configured -time cargo nextest run --no-tests=pass "windows::$test_filter" --target "$BUILD_TARGET" -- ${test_binary_args[*]} +time cargo nextest run --retries 3 --no-tests=pass "windows::$test_filter" --target "$BUILD_TARGET" -- ${test_binary_args[*]} RES=$? 
dmsetup remove_all -f diff --git a/scripts/run_integration_tests_windows_x86_64.sh b/scripts/run_integration_tests_windows_x86_64.sh index 52b7796b83..56d41c166e 100755 --- a/scripts/run_integration_tests_windows_x86_64.sh +++ b/scripts/run_integration_tests_windows_x86_64.sh @@ -48,7 +48,7 @@ export RUSTFLAGS="$RUSTFLAGS" # Only run with 1 thread to avoid tests interfering with one another because # Windows has a static IP configured -time cargo nextest run --no-tests=pass $test_features "windows::$test_filter" --target "$BUILD_TARGET" -- ${test_binary_args[*]} +time cargo nextest run --retries 3 --no-tests=pass $test_features "windows::$test_filter" --target "$BUILD_TARGET" -- ${test_binary_args[*]} RES=$? dmsetup remove_all -f From 87ba83bb01d2e2a6c24c23dbfb10381731b5d265 Mon Sep 17 00:00:00 2001 From: Max Makarov Date: Fri, 10 Apr 2026 17:17:01 +0000 Subject: [PATCH 522/742] vmm: pci_segment: use segment id as ACPI _UID The ACPI specification requires _UID to be unique across devices sharing the same _HID (ACPI 6.5 section 6.1.12). Currently every PciSegment emits _UID=0 for its PNP0A08 host bridge, which violates the spec when num_pci_segments > 1. Windows guests detect this during ACPI namespace enumeration and abort boot with BSOD 0xA5 ACPI_BIOS_ERROR, pointing at the _UID object of the second PNP0A08 node. Linux guests are lenient and silently accept the collision, so the issue has gone unnoticed. Use self.id as _UID, matching what _SEG does on the line above. For single-segment VMs (id == 0) this is a no-op at runtime. 
Signed-off-by: Max Makarov --- vmm/src/pci_segment.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vmm/src/pci_segment.rs b/vmm/src/pci_segment.rs index b334ddb5d6..81f11063ee 100644 --- a/vmm/src/pci_segment.rs +++ b/vmm/src/pci_segment.rs @@ -360,7 +360,7 @@ impl Aml for PciSegment { pci_dsdt_inner_data.push(&adr); let seg = aml::Name::new("_SEG".into(), &self.id); pci_dsdt_inner_data.push(&seg); - let uid = aml::Name::new("_UID".into(), &aml::ZERO); + let uid = aml::Name::new("_UID".into(), &self.id); pci_dsdt_inner_data.push(&uid); let cca = aml::Name::new("_CCA".into(), &aml::ONE); pci_dsdt_inner_data.push(&cca); From c99ed77d1a8f6126f22402da206a604a97dc7db8 Mon Sep 17 00:00:00 2001 From: Max Makarov Date: Sat, 11 Apr 2026 08:11:51 +0000 Subject: [PATCH 523/742] tests: pci: verify per-segment ACPI _UID in DSDT Extend test_pci_multiple_segments_numa_node to assert that every PNP0A08 host bridge in the guest DSDT exposes a unique _UID matching its PCI segment id. Linux surfaces the evaluated _UID value through /sys/bus/acpi/devices/PNP0A08:*/uid, so the check is a single additional ssh command on top of the existing test plumbing. This test is used (rather than test_pci_multiple_segments) so that the assertion runs on both x86_64 and aarch64: the numa_node variant boots through edk2 firmware on aarch64, making ACPI (and PNP0A08 host bridges) available, whereas the non-firmware variant uses FDT on aarch64 and exposes no PNP0A08 nodes. Without a per-segment _UID, two PNP0A08 nodes share _UID=0 which violates ACPI 6.5 section 6.1.12 and triggers BSOD 0xA5 on Windows guests. This assertion would catch any future regression of that kind. 
Signed-off-by: Max Makarov --- cloud-hypervisor/tests/integration.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 8b16c909e3..22d32852e6 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -489,6 +489,20 @@ mod common_parallel { .unwrap_or_default(), TEST_DISK_NODE ); + + // Each PNP0A08 host bridge in the DSDT must expose a unique + // _UID matching its PCI segment id. Linux surfaces the + // evaluated _UID via /sys/bus/acpi/devices/PNP0A08:*/uid. + // This test uses firmware boot on aarch64, so ACPI is + // available on both supported architectures. + let mut uids: Vec = guest + .ssh_command("cat /sys/bus/acpi/devices/PNP0A08:*/uid") + .unwrap() + .lines() + .filter_map(|l| l.trim().parse::().ok()) + .collect(); + uids.sort(); + assert_eq!(uids, vec![0u16, 1u16]); }); kill_child(&mut child); From b9c3cfb14d5d8a6503f34b06447974d15668eb5c Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Wed, 8 Apr 2026 17:16:19 +0200 Subject: [PATCH 524/742] vm-migration: context: move unit tests into sub module This helps to better separate the unit tests from the new ones in the following commit. 
On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- vm-migration/src/context.rs | 297 ++++++++++++++++++------------------ 1 file changed, 150 insertions(+), 147 deletions(-) diff --git a/vm-migration/src/context.rs b/vm-migration/src/context.rs index d7680821cc..69f9f3730c 100644 --- a/vm-migration/src/context.rs +++ b/vm-migration/src/context.rs @@ -238,169 +238,172 @@ impl Display for MemoryMigrationContext { #[cfg(test)] mod unit_tests { - use std::time::{Duration, Instant}; - use super::*; - use crate::protocol::MemoryRange; - - fn make_table(bytes: u64) -> MemoryRangeTable { - let mut table = MemoryRangeTable::default(); - if bytes > 0 { - table.push(MemoryRange { - gpa: 0, - length: bytes, - }); - } - table - } - - /// A controlled migration scenario with fixed timing offsets. - /// - /// ```text - /// migration_begin - /// + 1.0s -> iteration_begin - /// + 1.1s -> transfer_begin - /// + 2.0s -> transfer ends (transfer_duration = 0.9s) - /// + 2.1s -> iteration ends (iteration_duration = 1.1s, overhead = 0.2s) - /// ``` - struct Scenario { - migration_begin: Instant, - iteration_begin: Instant, - transfer_begin: Instant, - transfer_duration: Duration, - } - - impl Scenario { - /// We use a fixed point in the past so all offsets are in the past too, - /// meaning elapsed() calls in the code under test will be >= our durations. - const FIXPOINT_PAST: Duration = Duration::from_secs(10); - - fn new() -> Self { - // Use a fixed point in the past so all offsets are in the past too, - // meaning elapsed() calls in the code under test will be >= our durations. 
- let migration_begin = Instant::now() - Self::FIXPOINT_PAST; - Self { - migration_begin, - iteration_begin: migration_begin + Duration::from_millis(1000), - transfer_begin: migration_begin + Duration::from_millis(1100), - transfer_duration: Duration::from_millis(900), + mod memory_migration_ctx_tests { + use std::time::{Duration, Instant}; + + use super::*; + use crate::protocol::MemoryRange; + + fn make_table(bytes: u64) -> MemoryRangeTable { + let mut table = MemoryRangeTable::default(); + if bytes > 0 { + table.push(MemoryRange { + gpa: 0, + length: bytes, + }); } + table } - fn make_ctx(&self) -> MemoryMigrationContext { - let mut ctx = MemoryMigrationContext::new(); - // Override migration_begin with our controlled value. - ctx.migration_begin = self.migration_begin; - ctx + /// A controlled migration scenario with fixed timing offsets. + /// + /// ```text + /// migration_begin + /// + 1.0s -> iteration_begin + /// + 1.1s -> transfer_begin + /// + 2.0s -> transfer ends (transfer_duration = 0.9s) + /// + 2.1s -> iteration ends (iteration_duration = 1.1s, overhead = 0.2s) + /// ``` + struct Scenario { + migration_begin: Instant, + iteration_begin: Instant, + transfer_begin: Instant, + transfer_duration: Duration, } - } - #[test] - fn before_transfer_updates_begin_and_bytes() { - let s = Scenario::new(); - let mut ctx = s.make_ctx(); - - ctx.update_metrics_before_transfer(s.iteration_begin, &make_table(4096)); - - assert_eq!(ctx.iteration_begin, s.iteration_begin); - assert_eq!(ctx.current_iteration_total_bytes, 4096); - } + impl Scenario { + /// We use a fixed point in the past so all offsets are in the past too, + /// meaning elapsed() calls in the code under test will be >= our durations. + const FIXPOINT_PAST: Duration = Duration::from_secs(10); + + fn new() -> Self { + // Use a fixed point in the past so all offsets are in the past too, + // meaning elapsed() calls in the code under test will be >= our durations. 
+ let migration_begin = Instant::now() - Self::FIXPOINT_PAST; + Self { + migration_begin, + iteration_begin: migration_begin + Duration::from_millis(1000), + transfer_begin: migration_begin + Duration::from_millis(1100), + transfer_duration: Duration::from_millis(900), + } + } - #[test] - fn before_transfer_estimated_downtime() { - let s = Scenario::new(); - let mut ctx = s.make_ctx(); + fn make_ctx(&self) -> MemoryMigrationContext { + let mut ctx = MemoryMigrationContext::new(); + // Override migration_begin with our controlled value. + ctx.migration_begin = self.migration_begin; + ctx + } + } - // Empty table -> zero downtime regardless of bandwidth - ctx.update_metrics_before_transfer(s.iteration_begin, &make_table(0)); - assert_eq!(ctx.estimated_downtime, Some(Duration::ZERO)); + #[test] + fn before_transfer_updates_begin_and_bytes() { + let s = Scenario::new(); + let mut ctx = s.make_ctx(); - // No bandwidth yet -> None - ctx.update_metrics_before_transfer(s.iteration_begin, &make_table(1024)); - assert_eq!(ctx.estimated_downtime, None); + ctx.update_metrics_before_transfer(s.iteration_begin, &make_table(4096)); - // 1024 B/s, 1024 bytes -> 1s - ctx.bandwidth_bytes_per_second = 1024.0; - ctx.update_metrics_before_transfer(s.iteration_begin, &make_table(1024)); - assert_eq!(ctx.estimated_downtime, Some(Duration::from_secs(1))); - } + assert_eq!(ctx.iteration_begin, s.iteration_begin); + assert_eq!(ctx.current_iteration_total_bytes, 4096); + } - #[test] - fn after_transfer_updates_timing_and_bandwidth() { - let s = Scenario::new(); - let mut ctx = s.make_ctx(); - - ctx.update_metrics_before_transfer(s.iteration_begin, &make_table(1024)); - ctx.update_metrics_after_transfer(s.transfer_begin, s.transfer_duration); - - assert_eq!(ctx.transfer_begin, s.transfer_begin); - assert_eq!(ctx.transfer_duration, Some(s.transfer_duration)); - // 1024 bytes / 0.9s - assert_eq!(ctx.bandwidth_bytes_per_second, 1024.0 / 0.9); - // iteration_duration = time from iteration_begin 
until now (>= transfer_duration) - assert!(ctx.iteration_duration.unwrap() >= s.transfer_duration); - // Zero transfer_duration -> bandwidth is 0.0, no division by zero - ctx.update_metrics_before_transfer(s.iteration_begin, &make_table(1024)); - ctx.update_metrics_after_transfer(s.transfer_begin, Duration::ZERO); - assert_eq!(ctx.bandwidth_bytes_per_second, 0.0); - - // Check finalize() sets migration duration - assert_eq!(ctx.migration_duration, None); - ctx.finalize(); - assert!(matches!(ctx.migration_duration, Some(d) if d >= Scenario::FIXPOINT_PAST)); - } + #[test] + fn before_transfer_estimated_downtime() { + let s = Scenario::new(); + let mut ctx = s.make_ctx(); - #[test] - fn two_iterations_accumulate_bytes_and_feed_downtime_estimate() { - let s = Scenario::new(); - let mut ctx = s.make_ctx(); - - // Iteration 0: no bandwidth yet -> downtime is None - ctx.update_metrics_before_transfer(s.iteration_begin, &make_table(1024)); - assert_eq!(ctx.estimated_downtime, None); - ctx.update_metrics_after_transfer(s.transfer_begin, s.transfer_duration); - assert_eq!(ctx.total_sent_bytes, 1024); - - // Iteration 1: bandwidth now known -> downtime is Some - ctx.update_metrics_before_transfer(s.iteration_begin, &make_table(2048)); - assert!(ctx.estimated_downtime.is_some()); - ctx.update_metrics_after_transfer(s.transfer_begin, s.transfer_duration); - assert_eq!(ctx.total_sent_bytes, 1024 + 2048); - - // Check finalize() sets migration duration - assert_eq!(ctx.migration_duration, None); - ctx.finalize(); - assert!(matches!(ctx.migration_duration, Some(d) if d >= Scenario::FIXPOINT_PAST)); - } + // Empty table -> zero downtime regardless of bandwidth + ctx.update_metrics_before_transfer(s.iteration_begin, &make_table(0)); + assert_eq!(ctx.estimated_downtime, Some(Duration::ZERO)); - #[test] - /// The display format is specifically crafted to be very insightful in logs. - /// Therefore, we have a dedicated test for that format. 
- fn display_format() { - let s = Scenario::new(); - let mut ctx = s.make_ctx(); + // No bandwidth yet -> None + ctx.update_metrics_before_transfer(s.iteration_begin, &make_table(1024)); + assert_eq!(ctx.estimated_downtime, None); - // Iteration 0: 1 MiB in 1s - ctx.update_metrics_before_transfer(s.iteration_begin, &make_table(1024 * 1024)); - ctx.update_metrics_after_transfer(s.transfer_begin, Duration::from_secs(1)); - ctx.iteration += 1; - - // Iteration 1: 512 KiB in 1s; fix migration_duration for deterministic elapsed/avg_bw - ctx.update_metrics_before_transfer(s.iteration_begin, &make_table(512 * 1024)); - ctx.update_metrics_after_transfer(s.transfer_begin, Duration::from_secs(1)); + // 1024 B/s, 1024 bytes -> 1s + ctx.bandwidth_bytes_per_second = 1024.0; + ctx.update_metrics_before_transfer(s.iteration_begin, &make_table(1024)); + assert_eq!(ctx.estimated_downtime, Some(Duration::from_secs(1))); + } - ctx.migration_duration = Some(Duration::from_secs(2)); - let out = ctx.to_string(); + #[test] + fn after_transfer_updates_timing_and_bandwidth() { + let s = Scenario::new(); + let mut ctx = s.make_ctx(); + + ctx.update_metrics_before_transfer(s.iteration_begin, &make_table(1024)); + ctx.update_metrics_after_transfer(s.transfer_begin, s.transfer_duration); + + assert_eq!(ctx.transfer_begin, s.transfer_begin); + assert_eq!(ctx.transfer_duration, Some(s.transfer_duration)); + // 1024 bytes / 0.9s + assert_eq!(ctx.bandwidth_bytes_per_second, 1024.0 / 0.9); + // iteration_duration = time from iteration_begin until now (>= transfer_duration) + assert!(ctx.iteration_duration.unwrap() >= s.transfer_duration); + // Zero transfer_duration -> bandwidth is 0.0, no division by zero + ctx.update_metrics_before_transfer(s.iteration_begin, &make_table(1024)); + ctx.update_metrics_after_transfer(s.transfer_begin, Duration::ZERO); + assert_eq!(ctx.bandwidth_bytes_per_second, 0.0); + + // Check finalize() sets migration duration + assert_eq!(ctx.migration_duration, None); + 
ctx.finalize(); + assert!(matches!(ctx.migration_duration, Some(d) if d >= Scenario::FIXPOINT_PAST)); + } - assert_eq!( - out, - "iter=1 curr=1MiB total=2MiB bw=0.50MiB/s transfer=1.00s overhead=8000ms est_downtime=500ms elapsed=2.00s avg_bw=0.15MiB/s" - ); + #[test] + fn two_iterations_accumulate_bytes_and_feed_downtime_estimate() { + let s = Scenario::new(); + let mut ctx = s.make_ctx(); + + // Iteration 0: no bandwidth yet -> downtime is None + ctx.update_metrics_before_transfer(s.iteration_begin, &make_table(1024)); + assert_eq!(ctx.estimated_downtime, None); + ctx.update_metrics_after_transfer(s.transfer_begin, s.transfer_duration); + assert_eq!(ctx.total_sent_bytes, 1024); + + // Iteration 1: bandwidth now known -> downtime is Some + ctx.update_metrics_before_transfer(s.iteration_begin, &make_table(2048)); + assert!(ctx.estimated_downtime.is_some()); + ctx.update_metrics_after_transfer(s.transfer_begin, s.transfer_duration); + assert_eq!(ctx.total_sent_bytes, 1024 + 2048); + + // Check finalize() sets migration duration + assert_eq!(ctx.migration_duration, None); + ctx.finalize(); + assert!(matches!(ctx.migration_duration, Some(d) if d >= Scenario::FIXPOINT_PAST)); + } - // Should change elapsed() time! - // Since this is at least 10s, we never face timing issues in CI! - ctx.finalize(); - let out2 = ctx.to_string(); - assert_ne!(out2, out, "elapsed time should have changed! is={out2}"); + #[test] + /// The display format is specifically crafted to be very insightful in logs. + /// Therefore, we have a dedicated test for that format. 
+ fn display_format() { + let s = Scenario::new(); + let mut ctx = s.make_ctx(); + + // Iteration 0: 1 MiB in 1s + ctx.update_metrics_before_transfer(s.iteration_begin, &make_table(1024 * 1024)); + ctx.update_metrics_after_transfer(s.transfer_begin, Duration::from_secs(1)); + ctx.iteration += 1; + + // Iteration 1: 512 KiB in 1s; fix migration_duration for deterministic elapsed/avg_bw + ctx.update_metrics_before_transfer(s.iteration_begin, &make_table(512 * 1024)); + ctx.update_metrics_after_transfer(s.transfer_begin, Duration::from_secs(1)); + + ctx.migration_duration = Some(Duration::from_secs(2)); + let out = ctx.to_string(); + + assert_eq!( + out, + "iter=1 curr=1MiB total=2MiB bw=0.50MiB/s transfer=1.00s overhead=8000ms est_downtime=500ms elapsed=2.00s avg_bw=0.15MiB/s" + ); + + // Should change elapsed() time! + // Since this is at least 10s, we never face timing issues in CI! + ctx.finalize(); + let out2 = ctx.to_string(); + assert_ne!(out2, out, "elapsed time should have changed! is={out2}"); + } } } From f21184c325fc78ecd64f27e3c3076972f7664238 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Thu, 9 Apr 2026 09:29:52 +0200 Subject: [PATCH 525/742] vm-migration: expose memory timing needed by migration metrics Expose the finalized per-iteration timing fields needed by higher-level migration metrics and factor the iteration-overhead calculation into a small helper. This keeps the existing MemoryMigrationContext behavior intact while making the timing data easier to consume from migration-level context in the following commits. 
On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- vm-migration/src/context.rs | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/vm-migration/src/context.rs b/vm-migration/src/context.rs index 69f9f3730c..0bb9ea190f 100644 --- a/vm-migration/src/context.rs +++ b/vm-migration/src/context.rs @@ -54,13 +54,13 @@ pub struct MemoryMigrationContext { /// This includes the transmission, all logging, and update of any metrics. /// /// This is only `None` for iteration 0. - iteration_duration: Option, + pub iteration_duration: Option, /// Begin of the current transfer. transfer_begin: Instant, /// Duration of the current transfer. /// /// This is only `None` for iteration 0. - transfer_duration: Option, + pub transfer_duration: Option, } impl MemoryMigrationContext { @@ -178,6 +178,22 @@ impl MemoryMigrationContext { bytes as f64 / duration.as_secs_f64() } } + + /// Calculates the overhead of an iteration. + /// + /// This is the additional time next to the transfer time and includes + /// fetching and parsing the dirty log, for example. 
+ fn iteration_overhead(&self) -> Duration { + self.iteration_duration + .and_then(|iter| { + self.transfer_duration.map(|tr| { + // This is guaranteed by update_metrics_after_transfer() + assert!(iter >= tr); + iter - tr + }) + }) + .unwrap_or_default() + } } impl Default for MemoryMigrationContext { @@ -207,16 +223,7 @@ impl Display for MemoryMigrationContext { // Transfer duration and iteration overhead let transfer_s = self.transfer_duration.map_or(0.0, |d| d.as_secs_f64()); - let iteration_overhead_ms = self - .iteration_duration - .and_then(|iter| { - self.transfer_duration.map(|tr| { - // This is guaranteed by update_metrics_after_transfer() - assert!(iter >= tr); - (iter - tr).as_millis() - }) - }) - .unwrap_or(0); + let iteration_overhead_ms = self.iteration_overhead().as_millis(); let est_downtime_ms = self.estimated_downtime.map_or(0, |d| d.as_millis()); From 94f78e5a96f4abd25a10a7b2822dc17e210ead0a Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Thu, 9 Apr 2026 09:30:26 +0200 Subject: [PATCH 526/742] vm-migration: add migration-level context for downtime tracking Add migration-level context types that extend the existing memory-only metrics with overall migration duration and downtime breakdown. OngoingMigrationContext models the sender-side migration progress until all inputs needed for final downtime accounting are available. CompletedMigrationContext then stores the finalized migration metrics, including the final memory iteration, snapshotting, snapshot transfer, and completion phase. This provides the data needed to log effective downtime in the VMM and lays the groundwork for future migration statistics reporting. # Terminology At first glance, the use of "state" and "[VM] snapshot" may seem confusing. As discussed in [0], we use "state" consistently in the migration code. On the VM side, "snapshotting" is merely the mechanism used to obtain the VM state. 
[0] https://github.com/cloud-hypervisor/cloud-hypervisor/pull/7979#discussion_r3061359899 On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- vm-migration/src/context.rs | 272 ++++++++++++++++++++++++++++++++++- vm-migration/src/lib.rs | 5 +- vm-migration/src/protocol.rs | 1 + 3 files changed, 275 insertions(+), 3 deletions(-) diff --git a/vm-migration/src/context.rs b/vm-migration/src/context.rs index 0bb9ea190f..338dad9edd 100644 --- a/vm-migration/src/context.rs +++ b/vm-migration/src/context.rs @@ -3,14 +3,213 @@ // SPDX-License-Identifier: Apache-2.0 // -//! Module for [`MemoryMigrationContext`]. +//! Module for context and metrics of migrations. +//! +//! Main exports: +//! - [`OngoingMigrationContext`] +//! - [`CompletedMigrationContext`] +//! - [`MemoryMigrationContext`] use std::fmt; -use std::fmt::Display; +use std::fmt::{Display, Formatter}; use std::time::{Duration, Instant}; +use thiserror::Error; + use crate::protocol::MemoryRangeTable; +/// Metrics of the VM downtime during a migration. +/// +/// By downtime, we mean the time between the VM pause() and the corresponding +/// resume() on the destination. This downtime covers the time when the vCPUs +/// didn't execute a single instruction. The network downtime might be longer +/// and is not covered by this type. +/// +/// This metric is only relevant for the migration of running VMs. +#[derive(Debug, PartialEq)] +pub struct DowntimeContext { + /// The effective downtime Cloud Hypervisor observed (from the migration sender). + /// + /// This is roughly the sum of all the other durations. + pub effective_downtime: Duration, + /// The time of the final memory iteration. + pub final_memory_iteration_dur: Duration, + /// The time needed to aggregate the final VM state (i.e., snapshotting it). 
+ pub state_dur: Duration, + /// The time needed to send the final VM state including deserializing it on + /// the destination + pub send_state_dur: Duration, + /// The time of the completion request. This includes resuming the VM (if it + /// was running before the migration). + pub complete_dur: Duration, +} + +impl Display for DowntimeContext { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!( + f, + // Caution: This format is specifically crafted for the VMM log + "{}ms (final_iter:{}ms state:{}ms send_state:{}ms complete:{}ms)", + self.effective_downtime.as_millis(), + self.final_memory_iteration_dur.as_millis(), + self.state_dur.as_millis(), + self.send_state_dur.as_millis(), + self.complete_dur.as_millis() + ) + } +} + +/// The internal metrics of a completed migration. +/// +/// The properties of this type help to investigate timings of the migration, +/// with specific focus on the VM downtime. +/// +/// This type is static once it was created and should not change. +#[derive(Debug, PartialEq)] +pub struct CompletedMigrationContext { + /// Total duration of the migration. + pub migration_dur: Duration, + pub downtime_ctx: DowntimeContext, + /// The finalized context of the memory migration. + pub memory_ctx: MemoryMigrationContext, +} + +impl CompletedMigrationContext { + fn new( + migration_dur: Duration, + effective_downtime: Duration, + state_dur: Duration, + send_state_dur: Duration, + complete_dur: Duration, + memory_ctx: MemoryMigrationContext, + ) -> Self { + Self { + migration_dur, + downtime_ctx: DowntimeContext { + effective_downtime, + final_memory_iteration_dur: memory_ctx.iteration_duration.unwrap_or_default(), + state_dur, + send_state_dur, + complete_dur, + }, + memory_ctx, + } + } +} + +/// Error returned when the migration context is advanced in an invalid order. +#[derive(Clone, Copy, Debug, Eq, Error, PartialEq)] +pub enum MigrationContextError { + /// The memory migration context was not finalized before transition. 
+ #[error("memory migration context should be finalized before pausing the VM")] + MemoryContextNotFinalized, + /// The transition to `VmPaused` was attempted from an invalid state. + #[error("memory migration should only advance from the Begin state")] + InvalidVmPausedTransition, + /// Finalization was attempted before memory migration completed. + #[error("migration should only finalize after memory migration completed")] + InvalidFinalizeTransition, +} + +/// Holds context and metrics about the current ongoing migration. +/// +/// This is a state-machine to properly reflect the intermediate states and +/// their properties. This machine does not have a `Completed` variant in favor +/// of [`CompletedMigrationContext`], which is easier to work with. +#[derive(Debug, PartialEq)] +pub enum OngoingMigrationContext { + /// Migration started. + Begin { + /// Begin of the migration. + migration_begin: Instant, + }, + /// VM memory fully transferred to the destination and the VM is paused. + VmPaused { + /// Begin of the migration. + migration_begin: Instant, + /// Downtime begin of the migration. + downtime_begin: Instant, + /// The finalized context of the memory migration. + finalized_memory_ctx: MemoryMigrationContext, + }, +} + +impl OngoingMigrationContext { + /// Creates a new context. + pub fn new() -> Self { + Self::Begin { + migration_begin: Instant::now(), + } + } + + /// Marks the memory migration as completed and records when downtime + /// started. The VM is now in paused state. 
+ pub fn set_vm_paused( + &mut self, + downtime_begin: Instant, + finalized_memory_ctx: MemoryMigrationContext, + ) -> Result<(), MigrationContextError> { + if finalized_memory_ctx.migration_duration.is_none() { + return Err(MigrationContextError::MemoryContextNotFinalized); + } + let migration_begin = match self { + Self::Begin { migration_begin } => *migration_begin, + _ => return Err(MigrationContextError::InvalidVmPausedTransition), + }; + *self = Self::VmPaused { + migration_begin, + downtime_begin, + finalized_memory_ctx, + }; + Ok(()) + } + + /// Finalizes the metrics and returns a [`CompletedMigrationContext`]. + /// + /// This should be called right after the completed migration was + /// acknowledged by the receiver. From now on, the metrics are considered + /// finalized and should not be modified. They can be stored for further + /// analysis. + /// + /// # Arguments + /// - `state_dur`: The time needed to aggregate the final VM state (i.e., + /// snapshotting it). + /// - `send_state_dur`: The time needed to send the final VM state + /// including deserializing it on the destination. + /// - `complete_dur`: The time of the completion request. This includes + /// resuming the VM (if it was running before the migration). + pub fn finalize( + self, + state_dur: Duration, + send_state_dur: Duration, + complete_dur: Duration, + ) -> Result { + let (migration_begin, downtime_begin, finalized_memory_ctx) = match self { + Self::VmPaused { + migration_begin, + downtime_begin, + finalized_memory_ctx, + } => (migration_begin, downtime_begin, finalized_memory_ctx), + _ => return Err(MigrationContextError::InvalidFinalizeTransition), + }; + + Ok(CompletedMigrationContext::new( + migration_begin.elapsed(), + downtime_begin.elapsed(), + state_dur, + send_state_dur, + complete_dur, + finalized_memory_ctx, + )) + } +} + +impl Default for OngoingMigrationContext { + fn default() -> Self { + Self::new() + } +} + /// Internal metrics for the precopy migration phase. 
/// /// The context aggregates runtime statistics such as iteration count, @@ -246,6 +445,75 @@ impl Display for MemoryMigrationContext { #[cfg(test)] mod unit_tests { use super::*; + + /// Tests for [`CompletedMigrationContext`] and [`OngoingMigrationContext`]. + mod migration_ctx_tests { + use super::*; + + #[test] + fn memory_migrated_and_vm_paused_records_transition() { + let mut ctx = OngoingMigrationContext::new(); + let downtime_begin = Instant::now(); + + let mut memory_ctx = MemoryMigrationContext::new(); + memory_ctx.finalize(); + + ctx.set_vm_paused(downtime_begin, memory_ctx) + .expect("migration context should transition to VmPaused after memory migration"); + + assert!(matches!( + ctx, + OngoingMigrationContext::VmPaused { + downtime_begin: recorded_downtime_begin, + .. + } if recorded_downtime_begin == downtime_begin + )); + } + + #[test] + fn finalize_returns_completed_context() { + let mut ctx = OngoingMigrationContext::new(); + let downtime_begin = Instant::now() - Duration::from_millis(10); + + let mut memory_ctx = MemoryMigrationContext::new(); + memory_ctx.finalize(); + + ctx.set_vm_paused(downtime_begin, memory_ctx) + .expect("migration context should transition to VmPaused after memory migration"); + + let completed = ctx + .finalize( + Duration::from_millis(1), + Duration::from_millis(2), + Duration::from_millis(3), + ) + .expect("migration context should finalize after memory migration completed"); + + assert_eq!(completed.downtime_ctx.state_dur, Duration::from_millis(1)); + assert_eq!( + completed.downtime_ctx.send_state_dur, + Duration::from_millis(2) + ); + assert_eq!( + completed.downtime_ctx.complete_dur, + Duration::from_millis(3) + ); + assert!(completed.downtime_ctx.effective_downtime >= Duration::from_millis(10)); + assert!(completed.migration_dur > Duration::ZERO); + assert!(completed.memory_ctx.migration_duration.is_some()); + } + + #[test] + fn finalize_errors_before_memory_migration_completed() { + let err = 
OngoingMigrationContext::new() + .finalize(Duration::ZERO, Duration::ZERO, Duration::ZERO) + .unwrap_err(); + + assert_eq!(err, MigrationContextError::InvalidFinalizeTransition); + } + } + + /// Tests for [`MemoryMigrationContext`]. mod memory_migration_ctx_tests { use std::time::{Duration, Instant}; diff --git a/vm-migration/src/lib.rs b/vm-migration/src/lib.rs index 2283ff8bed..0faedf2858 100644 --- a/vm-migration/src/lib.rs +++ b/vm-migration/src/lib.rs @@ -4,7 +4,10 @@ // use anyhow::anyhow; -pub use context::MemoryMigrationContext; +pub use context::{ + CompletedMigrationContext, DowntimeContext, MemoryMigrationContext, MigrationContextError, + OngoingMigrationContext, +}; use serde::{Deserialize, Serialize}; use thiserror::Error; diff --git a/vm-migration/src/protocol.rs b/vm-migration/src/protocol.rs index 0a62375f54..f927d7a36e 100644 --- a/vm-migration/src/protocol.rs +++ b/vm-migration/src/protocol.rs @@ -115,6 +115,7 @@ pub enum Command { Config, State, Memory, + /// Finalizes the migration and resumes the VM on the guest. Complete, Abandon, MemoryFd, From 6a3024c13d89c143cbc600985f8c915d2bfe2ef1 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Thu, 9 Apr 2026 10:42:55 +0200 Subject: [PATCH 527/742] vm-migration: add MemoryMigrationContext::empty_finalized() helper This is helpful in the following to properly aggregate statistics for local migrations. On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- vm-migration/src/context.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vm-migration/src/context.rs b/vm-migration/src/context.rs index 338dad9edd..21801c0290 100644 --- a/vm-migration/src/context.rs +++ b/vm-migration/src/context.rs @@ -286,6 +286,15 @@ impl MemoryMigrationContext { } } + /// Returns an empty finalized block. + /// + /// This can be used if no memory was transferred (e.g., local migration). 
+ pub fn empty_finalized() -> Self { + let mut this = Self::new(); + this.finalize(); + this + } + /// Updates the metrics right before the transfer over the wire. /// /// Supposed to be called once per precopy memory iteration. From f32506447fae80edf8987a2ef3264a29de33b998 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Thu, 9 Apr 2026 09:32:14 +0200 Subject: [PATCH 528/742] vmm: add helper to measure successful operation duration Add a small helper that returns both the successful result of an operation and the time it took to complete. Subsequent migration instrumentation uses this to keep timing code compact and consistent. On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- vmm/src/lib.rs | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 92bf4c6b70..c64abac20d 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -563,6 +563,17 @@ pub fn start_vmm_thread( }) } +/// Measures the time of the callback, in case it returns `Ok`. +fn measure_ok(f: F) -> result::Result<(T, Duration), E> +where + F: FnOnce() -> result::Result, +{ + let begin = Instant::now(); + let value = f()?; + let duration = begin.elapsed(); + Ok((value, duration)) +} + #[derive(Clone, Deserialize, Serialize)] struct VmMigrationConfig { vm_config: Arc>, From f15999823dee2de36e70c3304e38d69296d0afc5 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Thu, 9 Apr 2026 09:32:21 +0200 Subject: [PATCH 529/742] vmm: outgoing migration: log effective downtime Use OngoingMigrationContext to measure and log the effective VM downtime (pause to remote resume) and the cost of each non-trivial step in the downtime window: snapshotting, sending the snapshot, and awaiting completion. This makes it straightforward to identify and reduce downtime as live migration matures. 
Example: ``` cloud-hypervisor: 7.703402s: INFO:vmm/src/lib.rs:1494 -- Migration completed after 2.2s with a downtime of 298ms (goal was 300ms) cloud-hypervisor: 7.703453s: DEBUG:vmm/src/lib.rs:1500 -- Downtime breakdown: 298ms (final_iter:269ms state:7ms send_state:19ms complete:1ms) ``` Note: downtime is measured on the source only; cross-host clock skew may cause unreliable results. # Terminology At first glance, the use of "state" and "[VM] snapshot" may seem confusing. As discussed in [0], we use "state" consistently in the migration code. On the VM side, "snapshotting" is merely the mechanism used to obtain the VM state. [0] https://github.com/cloud-hypervisor/cloud-hypervisor/pull/7979#discussion_r3061359899 On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- vmm/src/lib.rs | 99 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 70 insertions(+), 29 deletions(-) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index c64abac20d..5cedeef65e 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -40,8 +40,8 @@ use vm_memory::GuestMemoryAtomic; use vm_memory::bitmap::AtomicBitmap; use vm_migration::protocol::*; use vm_migration::{ - MemoryMigrationContext, Migratable, MigratableError, Pausable, Snapshot, Snapshottable, - Transportable, + MemoryMigrationContext, Migratable, MigratableError, OngoingMigrationContext, Pausable, + Snapshot, Snapshottable, Transportable, }; use vmm_sys_util::eventfd::EventFd; use vmm_sys_util::signal::unblock_signal; @@ -1260,23 +1260,30 @@ impl Vmm { /// - initial memory - VM is running /// - multiple memory delta transmissions - VM is running /// - final memory iteration - VM is paused + /// + /// Stores the [finalized] [`MemoryMigrationContext`] in the provided + /// [`OngoingMigrationContext`]. 
+ /// + /// [finalized]: MemoryMigrationContext::finalize fn do_memory_migration( vm: &mut Vm, socket: &mut SocketStream, send_data_migration: &VmSendMigrationData, mem_send: &mut SendAdditionalConnections, + ctx: &mut OngoingMigrationContext, ) -> result::Result<(), MigratableError> { - let mut ctx = MemoryMigrationContext::new(); + let mut mem_ctx = MemoryMigrationContext::new(); vm.start_dirty_log()?; let remaining = Self::do_memory_iterations( vm, socket, - &mut ctx, + &mut mem_ctx, // We bind send_data_migration to the callback |ctx| Self::is_precopy_converged(ctx, send_data_migration), mem_send, )?; + let downtime_begin = Instant::now(); vm.pause()?; // Send last batch of dirty pages: final iteration @@ -1286,26 +1293,31 @@ impl Vmm { let mut final_table = vm.dirty_log()?; final_table.extend(remaining); - ctx.update_metrics_before_transfer(iteration_begin, &final_table); + mem_ctx.update_metrics_before_transfer(iteration_begin, &final_table); let transfer_begin = Instant::now(); mem_send.send_memory(final_table, socket)?; let transfer_duration = transfer_begin.elapsed(); - ctx.update_metrics_after_transfer(transfer_begin, transfer_duration); - ctx.iteration += 1; + mem_ctx.update_metrics_after_transfer(transfer_begin, transfer_duration); + mem_ctx.iteration += 1; } - ctx.finalize(); - - info!("Precopy complete: {ctx}"); + mem_ctx.finalize(); + info!("Precopy complete: {mem_ctx}"); + ctx.set_vm_paused(downtime_begin, mem_ctx) + .expect("migration context should transition to VmPaused after memory migration"); Ok(()) } + /// Performs a migration including all its phases. fn send_migration( vm: &mut Vm, #[cfg(all(feature = "kvm", target_arch = "x86_64"))] hypervisor: &dyn hypervisor::Hypervisor, send_data_migration: &VmSendMigrationData, ) -> result::Result<(), MigratableError> { + // State machine that is updated with more context as we progress. 
+ let mut ctx = OngoingMigrationContext::new(); + // Set up the socket connection let mut socket = migration_transport::send_migration_socket(&send_data_migration.destination_url)?; @@ -1373,7 +1385,14 @@ impl Vmm { if send_data_migration.local { // Now pause VM + let downtime_begin = Instant::now(); vm.pause()?; + ctx.set_vm_paused( + downtime_begin, + // No memory was transferred + MemoryMigrationContext::empty_finalized(), + ) + .expect("migration context should transition to VmPaused for local migration"); } else { let mut mem_send = migration_transport::SendAdditionalConnections::new( &send_data_migration.destination_url, @@ -1381,14 +1400,20 @@ impl Vmm { &vm.guest_memory(), )?; - Self::do_memory_migration(vm, &mut socket, send_data_migration, &mut mem_send) - .inspect_err(|_| { - // Calling cleanup multiple times is fine, thus here we just make sure - // that it is called. - if let Err(e) = mem_send.cleanup() { - warn!("Error cleaning up migration connections: {e}"); - } - })?; + Self::do_memory_migration( + vm, + &mut socket, + send_data_migration, + &mut mem_send, + &mut ctx, + ) + .inspect_err(|_| { + // Calling cleanup multiple times is fine, thus here we just make sure + // that it is called. + if let Err(e) = mem_send.cleanup() { + warn!("Error cleaning up migration connections: {e}"); + } + })?; mem_send.cleanup()?; } @@ -1399,23 +1424,39 @@ impl Vmm { .map_err(|e| MigratableError::UnlockError(anyhow!("{e}")))?; // Capture snapshot and send it - let vm_snapshot = vm.snapshot()?; - migration_transport::send_state(&mut socket, &vm_snapshot)?; - // Complete the migration - // At this step, the receiving VMM will acquire disk locks again. 
- migration_transport::send_request_expect_ok( - &mut socket, - Request::complete(), - MigratableError::MigrateSend(anyhow!("Error completing migration")), - )?; + let (vm_snapshot, snapshot_duration) = measure_ok(|| vm.snapshot())?; + let (_, send_snapshot_duration) = + measure_ok(|| migration_transport::send_state(&mut socket, &vm_snapshot))?; + + // Complete the migration. + // When this returns, we know the VM was resumed (if it was running + // before the migration) and that the receiving VMM acquired disk + // locks again. + let (_, complete_duration) = measure_ok(|| { + migration_transport::send_request_expect_ok( + &mut socket, + Request::complete(), + MigratableError::MigrateSend(anyhow!("Error completing migration")), + ) + })?; + + let ctx = ctx + .finalize(snapshot_duration, send_snapshot_duration, complete_duration) + .expect("migration context should finalize after memory migration completed"); + + info!( + "Migration completed after {:.1}s with a downtime of {}ms (goal was {}ms)", + ctx.migration_dur.as_secs_f32(), + ctx.downtime_ctx.effective_downtime.as_millis(), + send_data_migration.downtime().as_millis() + ); + debug!("Downtime breakdown: {}", ctx.downtime_ctx); // Stop logging dirty pages if !send_data_migration.local { vm.stop_dirty_log()?; } - info!("Migration complete"); - // Let every Migratable object know about the migration being complete vm.complete_migration() } From 2515b06f1938a12a98f4926539a2d481f8fe09bf Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Wed, 8 Apr 2026 16:04:30 +0200 Subject: [PATCH 530/742] vmm: incoming migration: log duration of state receive and VM resume Instrument the two main downtime-phase operations on the destination side - receiving state and resuming the VM - so their costs are visible in logs and can be iterated on. 
The new log messages may look like this: ```text cloud-hypervisor: 7.283424s: DEBUG:vmm/src/lib.rs:948 -- Migration (incoming): recv_snapshot:3ms restore:10ms cloud-hypervisor: 7.284824s: DEBUG:vmm/src/lib.rs:967 -- Migration (incoming): resume:1ms cloud-hypervisor: 7.284842s: DEBUG:vmm/src/lib.rs:977 -- Migration (incoming): Receiving final state and resuming the VM took 15ms ``` On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- vmm/src/lib.rs | 126 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 85 insertions(+), 41 deletions(-) diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 5cedeef65e..df94b54e22 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -653,7 +653,10 @@ enum ReceiveMigrationState { Configured(ReceiveMigrationConfiguredData), /// Memory is populated and we received the state. The VM is ready to go. - StateReceived, + StateReceived { + /// The timestamp where the VMM started to receive the final state. + state_receive_begin: Instant, + }, /// The migration is successful. Completed, @@ -938,18 +941,43 @@ impl Vmm { Ok(Configured(config_data)) } Command::State => { + let state_receive_begin = Instant::now(); config_data.connections.cleanup()?; - self.vm_receive_state(req, socket, config_data.memory_manager)?; - Ok(StateReceived) + let (recv_state_dur, restore_vm_dur) = + self.vm_receive_state(req, socket, config_data.memory_manager)?; + debug!( + "Migration (incoming): recv_snapshot:{}ms restore:{}ms", + recv_state_dur.as_millis(), + restore_vm_dur.as_millis(), + ); + Ok(StateReceived { + state_receive_begin, + }) } _ => invalid_command(), }, - StateReceived => match req.command() { + StateReceived { + state_receive_begin, + } => match req.command() { Command::Complete => { // The unwrap is safe, because the state machine makes sure we called // vm_receive_state before, which creates the VM. 
let vm = self.vm.as_mut().unwrap(); - vm.resume()?; + let (_, resume_duration) = measure_ok(|| vm.resume())?; + debug!( + "Migration (incoming): resume:{}ms", + resume_duration.as_millis() + ); + // This logs the downtime without the final memory delta, so + // it does not reflect the actual downtime. While we could + // pass along the timestamp from when the VM was paused, + // that would rely on both VM hosts having synchronized + // clocks, which we cannot guarantee. For that reason, this + // is logged as debug! rather than info!. + debug!( + "Migration (incoming): Receiving final state and resuming the VM took {}ms", + state_receive_begin.elapsed().as_millis() + ); Ok(Completed) } _ => invalid_command(), @@ -1046,23 +1074,33 @@ impl Vmm { Ok(memory_manager) } + /// Receives the final VM state (devices, vCPUs) and restores the VM. + /// + /// Measures the time for each step. fn vm_receive_state( &mut self, req: &Request, socket: &mut T, mm: Arc>, - ) -> std::result::Result<(), MigratableError> + ) -> std::result::Result< + ( + Duration, /* state receive + deserialize */ + Duration, /* restoring */ + ), + MigratableError, + > where T: Read, { - // Read in state data - let mut data: Vec = Vec::new(); - data.resize_with(req.length() as usize, Default::default); - socket - .read_exact(&mut data) - .map_err(MigratableError::MigrateSocket)?; - let snapshot: Snapshot = serde_json::from_slice(&data).map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Error deserialising snapshot: {e}")) + let (snapshot, receive_duration): (Snapshot, Duration) = measure_ok(|| { + let mut data: Vec = Vec::new(); + data.resize_with(req.length() as usize, Default::default); + socket + .read_exact(&mut data) + .map_err(MigratableError::MigrateSocket)?; + serde_json::from_slice(&data).map_err(|e| { + MigratableError::MigrateReceive(anyhow!("Error deserialising snapshot: {e}")) + }) })?; let exit_evt = self.exit_evt.try_clone().map_err(|e| { @@ -1079,38 +1117,44 @@ impl Vmm { 
MigratableError::MigrateReceive(anyhow!("Error cloning activate EventFd: {e}")) })?; - #[cfg(not(target_arch = "riscv64"))] - let timestamp = Instant::now(); - let hypervisor_vm = mm.lock().unwrap().vm.clone(); - let mut vm = Vm::new_from_memory_manager( - self.vm_config.clone().unwrap(), - mm, - hypervisor_vm, - exit_evt, - reset_evt, - #[cfg(feature = "guest_debug")] - debug_evt, - &self.seccomp_action, - self.hypervisor.clone(), - activate_evt, + let (vm, restore_duration) = measure_ok(|| { #[cfg(not(target_arch = "riscv64"))] - timestamp, - self.console_info.clone(), - self.console_resize_pipe.clone(), - Arc::clone(&self.original_termios_opt), - Some(&snapshot), - ) - .map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Error creating VM from snapshot: {e:?}")) - })?; + let timestamp = Instant::now(); + let hypervisor_vm = mm.lock().unwrap().vm.clone(); + + let mut vm = Vm::new_from_memory_manager( + self.vm_config.clone().unwrap(), + mm, + hypervisor_vm, + exit_evt, + reset_evt, + #[cfg(feature = "guest_debug")] + debug_evt, + &self.seccomp_action, + self.hypervisor.clone(), + activate_evt, + #[cfg(not(target_arch = "riscv64"))] + timestamp, + self.console_info.clone(), + self.console_resize_pipe.clone(), + Arc::clone(&self.original_termios_opt), + Some(&snapshot), + ) + .map_err(|e| { + MigratableError::MigrateReceive(anyhow!("Error creating VM from snapshot: {e:?}")) + })?; - // Create VM - vm.restore().map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Failed restoring the Vm: {e}")) + // Create VM + vm.restore().map_err(|e| { + MigratableError::MigrateReceive(anyhow!("Failed restoring the Vm: {e}")) + })?; + + Ok(vm) })?; + self.vm = Some(vm); - Ok(()) + Ok((receive_duration, restore_duration)) } /// Performs the initial memory transmission (iteration zero) plus a From fd2d33e8ab8ce57f26d62f38b8bd1403b848a097 Mon Sep 17 00:00:00 2001 From: Anirudh Rayabharam Date: Thu, 26 Mar 2026 10:20:18 +0000 Subject: [PATCH 531/742] performance-metrics: Add 
--continue-on-failure flag and status tracking Add a --continue-on-failure CLI flag that allows the test harness to continue executing remaining tests after encountering a failure, instead of aborting immediately. When set, failed tests are recorded with zeroed metrics and a "FAILED" status, the report file is always generated, and the process exits with a non-zero code if any test failed. Without the flag, the existing fail-fast behavior is preserved. Also add a "status" field ("PASSED"/"FAILED") to PerformanceTestResult so report consumers can distinguish successful tests from failed ones. Signed-off-by: Anirudh Rayabharam --- performance-metrics/src/main.rs | 69 ++++++++++++++++++++++++++++----- 1 file changed, 60 insertions(+), 9 deletions(-) diff --git a/performance-metrics/src/main.rs b/performance-metrics/src/main.rs index 2f6b8bdb50..622d8793c7 100644 --- a/performance-metrics/src/main.rs +++ b/performance-metrics/src/main.rs @@ -28,6 +28,14 @@ enum Error { TestFailed, } +#[derive(Deserialize, Serialize)] +enum TestStatus { + #[serde(rename = "PASSED")] + Passed, + #[serde(rename = "FAILED")] + Failed, +} + #[derive(Deserialize, Serialize)] pub struct PerformanceTestResult { name: String, @@ -35,6 +43,31 @@ pub struct PerformanceTestResult { std_dev: f64, max: f64, min: f64, + status: TestStatus, +} + +impl PerformanceTestResult { + fn passed(name: &str, mean: f64, std_dev: f64, max: f64, min: f64) -> Self { + Self { + name: name.to_string(), + mean, + std_dev, + max, + min, + status: TestStatus::Passed, + } + } + + fn failed(name: &str) -> Self { + Self { + name: name.to_string(), + mean: 0.0, + std_dev: 0.0, + max: 0.0, + min: 0.0, + status: TestStatus::Failed, + } + } } #[derive(Deserialize, Serialize)] @@ -280,13 +313,7 @@ impl PerformanceTest { let max = (self.unit_adjuster)(metrics.clone().into_iter().reduce(f64::max).unwrap()); let min = (self.unit_adjuster)(metrics.clone().into_iter().reduce(f64::min).unwrap()); - PerformanceTestResult { - name: 
self.name.to_string(), - mean, - std_dev, - max, - min, - } + PerformanceTestResult::passed(self.name, mean, std_dev, max, min) } // Calculate the timeout for each test @@ -1316,6 +1343,14 @@ fn main() { .action(ArgAction::SetTrue) .required(false), ) + .arg( + Arg::new("continue-on-failure") + .long("continue-on-failure") + .help("Continue running remaining tests after a test failure") + .num_args(0) + .action(ArgAction::SetTrue) + .required(false), + ) .arg( Arg::new("report-file") .long("report-file") @@ -1406,6 +1441,9 @@ fn main() { init_tests(&overrides); } + let continue_on_failure = cmd_arguments.get_flag("continue-on-failure"); + let mut has_failure = false; + for test in tests_to_run { settle_host(); match run_test_with_timeout(test, &overrides) { @@ -1413,8 +1451,17 @@ fn main() { metrics_report.results.push(r); } Err(e) => { - eprintln!("Aborting test due to error: '{e:?}'"); - std::process::exit(1); + if continue_on_failure { + eprintln!("Test '{}' failed: '{e:?}'. Continuing.", test.name); + has_failure = true; + metrics_report + .results + .push(PerformanceTestResult::failed(test.name)); + cleanup_stale_processes(); + } else { + eprintln!("Aborting test due to error: '{e:?}'"); + std::process::exit(1); + } } } } @@ -1448,4 +1495,8 @@ fn main() { std::process::exit(1); }) .unwrap(); + + if has_failure { + std::process::exit(1); + } } From e38c5c434038776a7c2cc01d9dbe72d3c057d493 Mon Sep 17 00:00:00 2001 From: CMGS Date: Mon, 13 Apr 2026 17:23:08 +0800 Subject: [PATCH 532/742] pci: rollback BAR address on failed move_bar When BAR reprogramming is detected, detect_bar_reprogramming() eagerly updates the BAR address in config space before the actual MMIO remapping occurs. If the subsequent move_bar() fails (e.g. the new address falls outside the allocator range), the config register retains the new address while the MMIO bus still uses the old one, leaving the device broken. 
Add restore_bar_addr() to undo the config space update when move_bar() fails, so the device remains functional at its original address. For 64-bit BARs, restore both the low and high BAR slots as well as the corresponding config registers, mirroring the two-slot update logic in detect_bar_reprogramming(). Implement restore_bar_addr() for all PciDevice implementations (VirtioPciDevice, VfioPciDevice, VfioUserPciDevice, IvshmemDevice, PvPanicDevice, and PvmemcontrolPciDevice) by delegating to their respective PciConfiguration::restore_bar_addr(). Signed-off-by: CMGS --- devices/src/ivshmem.rs | 4 ++ devices/src/pvmemcontrol.rs | 4 ++ devices/src/pvpanic.rs | 4 ++ pci/src/bus.rs | 16 ++++--- pci/src/configuration.rs | 49 ++++++++++++++++++++++ pci/src/device.rs | 4 ++ pci/src/vfio.rs | 4 ++ pci/src/vfio_user.rs | 4 ++ virtio-devices/src/transport/pci_device.rs | 4 ++ 9 files changed, 88 insertions(+), 5 deletions(-) diff --git a/devices/src/ivshmem.rs b/devices/src/ivshmem.rs index 98291c74e8..932e0d9eba 100644 --- a/devices/src/ivshmem.rs +++ b/devices/src/ivshmem.rs @@ -382,6 +382,10 @@ impl PciDevice for IvshmemDevice { Ok(()) } + fn restore_bar_addr(&mut self, params: &BarReprogrammingParams) { + self.configuration.restore_bar_addr(params); + } + fn as_any_mut(&mut self) -> &mut dyn Any { self } diff --git a/devices/src/pvmemcontrol.rs b/devices/src/pvmemcontrol.rs index d4b37456be..171fdf1544 100644 --- a/devices/src/pvmemcontrol.rs +++ b/devices/src/pvmemcontrol.rs @@ -712,6 +712,10 @@ impl PciDevice for PvmemcontrolPciDevice { self.configuration.read_config_register(reg_idx) } + fn restore_bar_addr(&mut self, params: &BarReprogrammingParams) { + self.configuration.restore_bar_addr(params); + } + fn as_any_mut(&mut self) -> &mut dyn std::any::Any { self } diff --git a/devices/src/pvpanic.rs b/devices/src/pvpanic.rs index 9540a91252..3b9c9d5a80 100644 --- a/devices/src/pvpanic.rs +++ b/devices/src/pvpanic.rs @@ -231,6 +231,10 @@ impl PciDevice for PvPanicDevice { 
Ok(()) } + fn restore_bar_addr(&mut self, params: &BarReprogrammingParams) { + self.configuration.restore_bar_addr(params); + } + fn read_bar(&mut self, _base: u64, _offset: u64, data: &mut [u8]) { data[0] = self.events; } diff --git a/pci/src/bus.rs b/pci/src/bus.rs index eaae23a4d8..1fa7bd866a 100644 --- a/pci/src/bus.rs +++ b/pci/src/bus.rs @@ -10,7 +10,7 @@ use std::ops::DerefMut; use std::sync::{Arc, Barrier, Mutex}; use byteorder::{ByteOrder, LittleEndian}; -use log::error; +use log::warn; use thiserror::Error; use vm_device::{Bus, BusDevice, BusDeviceSync}; @@ -280,10 +280,15 @@ impl PciConfigIo { device.deref_mut(), params.region_type, ) { - error!( - "Failed moving device BAR: {}: 0x{:x}->0x{:x}(0x{:x})", + warn!( + "Failed moving device BAR: {}: 0x{:x}->0x{:x}(0x{:x}), keeping old BAR", e, params.old_base, params.new_base, params.len ); + // Rollback: the config register was already updated to + // new_base by detect_bar_reprogramming(). Restore it by + // writing back the old address so device state stays + // consistent with the MMIO bus mapping. + device.restore_bar_addr(params); } } @@ -405,10 +410,11 @@ impl PciConfigMmio { device.deref_mut(), params.region_type, ) { - error!( - "Failed moving device BAR: {}: 0x{:x}->0x{:x}(0x{:x})", + warn!( + "Failed moving device BAR: {}: 0x{:x}->0x{:x}(0x{:x}), keeping old BAR", e, params.old_base, params.new_base, params.len ); + device.restore_bar_addr(params); } } } diff --git a/pci/src/configuration.rs b/pci/src/configuration.rs index f506017b26..2a905e19bc 100644 --- a/pci/src/configuration.rs +++ b/pci/src/configuration.rs @@ -1093,6 +1093,55 @@ impl PciConfiguration { pub(crate) fn clear_pending_bar_reprogram(&mut self) { self.pending_bar_reprogram = Vec::new(); } + + /// Restore BAR address after a failed move. This undoes the premature + /// address update in detect_bar_reprogramming() so that config space + /// stays consistent with the actual MMIO mapping. 
+ pub fn restore_bar_addr(&mut self, params: &BarReprogrammingParams) { + match params.region_type { + PciBarRegionType::Memory64BitRegion => { + // 64-bit BAR spans two slots: bars[i] (low, type Memory64BitRegion) + // and bars[i+1] (high, type None). Mirror detect_bar_reprogramming + // by matching the combined address and restoring both halves. + for i in 0..NUM_BAR_REGS - 1 { + if self.bars[i].r#type != Some(PciBarRegionType::Memory64BitRegion) { + continue; + } + let low_mask = self.writable_bits[BAR0_REG + i]; + let high_mask = self.writable_bits[BAR0_REG + i + 1]; + let current = (u64::from(self.bars[i + 1].addr & high_mask) << 32) + | u64::from(self.bars[i].addr & low_mask); + if current == params.new_base { + let old_low = params.old_base as u32; + let old_high = (params.old_base >> 32) as u32; + self.bars[i].addr = old_low; + self.bars[i + 1].addr = old_high; + self.registers[BAR0_REG + i] = + (self.registers[BAR0_REG + i] & !low_mask) | (old_low & low_mask); + self.registers[BAR0_REG + i + 1] = (self.registers[BAR0_REG + i + 1] + & !high_mask) + | (old_high & high_mask); + return; + } + } + } + _ => { + // 32-bit Memory or IO BAR + for i in 0..NUM_BAR_REGS { + let mask = self.writable_bits[BAR0_REG + i]; + if self.bars[i].r#type == Some(params.region_type) + && u64::from(self.bars[i].addr & mask) == params.new_base + { + let old = params.old_base as u32; + self.bars[i].addr = old; + self.registers[BAR0_REG + i] = + (self.registers[BAR0_REG + i] & !mask) | (old & mask); + return; + } + } + } + } + } } impl Pausable for PciConfiguration {} diff --git a/pci/src/device.rs b/pci/src/device.rs index 29c89b8c42..482e15e404 100644 --- a/pci/src/device.rs +++ b/pci/src/device.rs @@ -93,6 +93,10 @@ pub trait PciDevice: Send { fn move_bar(&mut self, _old_base: u64, _new_base: u64) -> result::Result<(), io::Error> { Ok(()) } + /// Restore BAR address in config space after a failed move_bar. 
+ /// This rolls back the address update made by detect_bar_reprogramming() + /// so that the config register stays consistent with the MMIO bus mapping. + fn restore_bar_addr(&mut self, _params: &BarReprogrammingParams) {} /// Provides a mutable reference to the Any trait. This is useful to let /// the caller have access to the underlying type behind the trait. fn as_any_mut(&mut self) -> &mut dyn Any; diff --git a/pci/src/vfio.rs b/pci/src/vfio.rs index b49ed51989..e0c9110a86 100644 --- a/pci/src/vfio.rs +++ b/pci/src/vfio.rs @@ -2034,6 +2034,10 @@ iova 0x{:x}, size 0x{:x}: {}, ", Ok(()) } + fn restore_bar_addr(&mut self, params: &BarReprogrammingParams) { + self.common.configuration.restore_bar_addr(params); + } + fn as_any_mut(&mut self) -> &mut dyn Any { self } diff --git a/pci/src/vfio_user.rs b/pci/src/vfio_user.rs index 456047d42d..27c7dc0405 100644 --- a/pci/src/vfio_user.rs +++ b/pci/src/vfio_user.rs @@ -414,6 +414,10 @@ impl PciDevice for VfioUserPciDevice { .free_bars(allocator, mmio32_allocator, mmio64_allocator) } + fn restore_bar_addr(&mut self, params: &BarReprogrammingParams) { + self.common.configuration.restore_bar_addr(params); + } + fn as_any_mut(&mut self) -> &mut dyn Any { self } diff --git a/virtio-devices/src/transport/pci_device.rs b/virtio-devices/src/transport/pci_device.rs index 36975e3f7f..54a29caa5d 100644 --- a/virtio-devices/src/transport/pci_device.rs +++ b/virtio-devices/src/transport/pci_device.rs @@ -1133,6 +1133,10 @@ impl PciDevice for VirtioPciDevice { Ok(()) } + fn restore_bar_addr(&mut self, params: &BarReprogrammingParams) { + self.configuration.restore_bar_addr(params); + } + fn read_bar(&mut self, _base: u64, offset: u64, data: &mut [u8]) { match offset { o if o < COMMON_CONFIG_BAR_OFFSET + COMMON_CONFIG_SIZE => self.common_config.read( From 0a4be0c1c7368b21d66d26b990492a3736a28f4c Mon Sep 17 00:00:00 2001 From: CMGS Date: Mon, 13 Apr 2026 17:23:15 +0800 Subject: [PATCH 533/742] vmm: extend last MMIO64 allocator to cover 
full range The MMIO64 allocator size is computed with alignment truncation: size = (range / alignment) * alignment This loses up to one alignment unit (4 GiB) at the top of the address space. When a guest (Windows with virtio-win 0.1.285) programs a BAR near the top of the physical address space, the allocation fails because the address falls in the truncated gap. Give the last PCI segment allocator all remaining space up to the end of the device area, so no addresses are lost. The `end` parameter of create_mmio_allocators() is an inclusive address (the last valid byte). Fix the 32-bit caller and tests to pass inclusive values, consistent with the 64-bit caller which already uses the inclusive end_of_device_area(). Signed-off-by: CMGS --- vmm/src/device_manager.rs | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 0ae8526448..77ef0cc423 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -1140,6 +1140,8 @@ pub struct DeviceManager { ivshmem_device: Option>>, } +/// Create per-PCI-segment MMIO allocators over the range `[start, end]`. +/// Both `start` and `end` are inclusive addresses. fn create_mmio_allocators( start: u64, end: u64, @@ -1157,7 +1159,15 @@ fn create_mmio_allocators( for segment_id in 0..num_pci_segments as u64 { let weight = weights[segment_id as usize] as u64; let mmio_start = start + i * pci_segment_mmio_size; - let mmio_size = pci_segment_mmio_size * weight; + let is_last = segment_id == num_pci_segments as u64 - 1; + // Give the last segment all remaining space so no addresses + // near the top of the physical address space are lost to + // alignment truncation. 
+ let mmio_size = if is_last { + end - mmio_start + 1 + } else { + pci_segment_mmio_size * weight + }; let allocator = Arc::new(Mutex::new( AddressAllocator::new(GuestAddress(mmio_start), mmio_size).unwrap(), )); @@ -1218,7 +1228,8 @@ impl DeviceManager { } let start_of_mmio32_area = layout::MEM_32BIT_DEVICES_START.0; - let end_of_mmio32_area = layout::MEM_32BIT_DEVICES_START.0 + layout::MEM_32BIT_DEVICES_SIZE; + let end_of_mmio32_area = + layout::MEM_32BIT_DEVICES_START.0 + layout::MEM_32BIT_DEVICES_SIZE - 1; let pci_mmio32_allocators = create_mmio_allocators( start_of_mmio32_area, end_of_mmio32_area, @@ -5739,7 +5750,7 @@ mod unit_tests { #[test] fn test_create_mmio_allocators() { - let res = create_mmio_allocators(0x100000, 0x400000, 1, &[1], 4 << 10); + let res = create_mmio_allocators(0x100000, 0x3fffff, 1, &[1], 4 << 10); assert_eq!(res.len(), 1); assert_eq!( res[0].lock().unwrap().base(), @@ -5750,7 +5761,7 @@ mod unit_tests { vm_memory::GuestAddress(0x3fffff) ); - let res = create_mmio_allocators(0x100000, 0x400000, 2, &[1, 1], 4 << 10); + let res = create_mmio_allocators(0x100000, 0x3fffff, 2, &[1, 1], 4 << 10); assert_eq!(res.len(), 2); assert_eq!( res[0].lock().unwrap().base(), @@ -5769,7 +5780,7 @@ mod unit_tests { vm_memory::GuestAddress(0x3fffff) ); - let res = create_mmio_allocators(0x100000, 0x400000, 2, &[2, 1], 4 << 10); + let res = create_mmio_allocators(0x100000, 0x3fffff, 2, &[2, 1], 4 << 10); assert_eq!(res.len(), 2); assert_eq!( res[0].lock().unwrap().base(), From ff329126150930a88212d4c31607265d06ae619b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 13 Apr 2026 23:55:43 +0000 Subject: [PATCH 534/742] build(deps): bump crate-ci/typos from 1.45.0 to 1.45.1 Bumps [crate-ci/typos](https://github.com/crate-ci/typos) from 1.45.0 to 1.45.1. 
- [Release notes](https://github.com/crate-ci/typos/releases) - [Changelog](https://github.com/crate-ci/typos/blob/master/CHANGELOG.md) - [Commits](https://github.com/crate-ci/typos/compare/v1.45.0...v1.45.1) --- updated-dependencies: - dependency-name: crate-ci/typos dependency-version: 1.45.1 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- .github/workflows/quality.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/quality.yaml b/.github/workflows/quality.yaml index 73b385811b..776cd8eb42 100644 --- a/.github/workflows/quality.yaml +++ b/.github/workflows/quality.yaml @@ -167,4 +167,4 @@ jobs: steps: - uses: actions/checkout@v6 # Executes "typos ." - - uses: crate-ci/typos@v1.45.0 + - uses: crate-ci/typos@v1.45.1 From c77094bd5408450d2bef79f9ccdb761c8e06b5ad Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Fri, 10 Apr 2026 13:36:25 +0200 Subject: [PATCH 535/742] test_infra: add polling helpers for integration tests Add generic polling helpers for integration tests and build the SSH wait helpers on top of them. This lets follow-up test changes replace fixed sleeps with condition-based waits without duplicating retry logic at each call site. 
On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- test_infra/src/lib.rs | 108 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 107 insertions(+), 1 deletion(-) diff --git a/test_infra/src/lib.rs b/test_infra/src/lib.rs index 47d022cd6e..c66d40ec28 100644 --- a/test_infra/src/lib.rs +++ b/test_infra/src/lib.rs @@ -15,7 +15,7 @@ use std::os::unix::io::{AsRawFd, FromRawFd}; use std::path::{Path, PathBuf}; use std::process::{Child, Command, ExitStatus, Output, Stdio}; use std::str::FromStr; -use std::time::Duration; +use std::time::{Duration, Instant}; use std::{env, fmt, fs, io, thread}; use rand::Rng; @@ -57,6 +57,44 @@ pub enum Error { WaitTimeout(#[source] WaitTimeoutError), } +/// Polls a boolean condition until it becomes true or the timeout expires. +pub fn wait_until(timeout: Duration, mut condition: F) -> bool +where + F: FnMut() -> bool, +{ + const INTERVAL: Duration = Duration::from_millis(50); + let start = Instant::now(); + + loop { + if condition() { + return true; + } + + if start.elapsed() >= timeout { + return false; + } + + thread::sleep(INTERVAL); + } +} + +/// Retries an operation until it returns `Ok` or the timeout expires. 
+pub fn wait_until_succeeds(timeout: Duration, mut operation: F) -> Result +where + F: FnMut() -> Result, +{ + const INTERVAL: Duration = Duration::from_millis(50); + let start = Instant::now(); + + loop { + match operation() { + Ok(result) => return Ok(result), + Err(err) if start.elapsed() >= timeout => return Err(err), + Err(_) => thread::sleep(INTERVAL), + } + } +} + pub struct GuestNetworkConfig { pub guest_ip0: String, pub host_ip0: String, @@ -618,6 +656,25 @@ pub enum SshCommandError { WaitEof(#[source] ssh2::Error), } +#[derive(Error, Debug)] +pub enum WaitForSshError { + #[error("timed out after {timeout:?} waiting for ssh command {command:?} on {ip}: {source}")] + Timeout { + command: String, + ip: String, + timeout: Duration, + #[source] + source: SshCommandError, + }, +} + +fn default_guest_auth() -> PasswordAuth { + PasswordAuth { + username: String::from("cloud"), + password: String::from("cloud123"), + } +} + fn scp_to_guest_with_auth( path: &Path, remote_path: &Path, @@ -791,6 +848,24 @@ pub fn ssh_command_ip( ) } +/// Waits until SSH to the guest becomes available. +pub fn wait_for_ssh( + command: &str, + auth: &PasswordAuth, + ip: &str, + timeout: Duration, +) -> Result { + wait_until_succeeds(timeout, || { + ssh_command_ip_with_auth(command, auth, ip, 1, 1) + }) + .map_err(|source| WaitForSshError::Timeout { + command: command.to_string(), + ip: ip.to_string(), + timeout, + source, + }) +} + pub fn exec_host_command_with_retries(command: &str, retries: u32, interval: Duration) -> bool { for _ in 0..retries { let s = exec_host_command_output(command).status; @@ -1093,6 +1168,37 @@ impl Guest { ) } + /// Waits until SSH to the guest becomes available using the + /// [default guest authentication] and the default guest IP. 
+ /// + /// [default guest authentication]: default_guest_auth + pub fn wait_for_ssh(&self, timeout: Duration) -> Result<(), WaitForSshError> { + wait_for_ssh( + "true", + &default_guest_auth(), + &self.network.guest_ip0, + timeout, + ) + .map(|_| ()) + } + + /// Waits until the provided command succeeds via SSH on the guest using the + /// [default guest authentication] and the default guest IP. + /// + /// [default guest authentication]: default_guest_auth + pub fn wait_for_ssh_command( + &self, + command: &str, + timeout: Duration, + ) -> Result { + wait_for_ssh( + command, + &default_guest_auth(), + &self.network.guest_ip0, + timeout, + ) + } + pub fn api_create_body(&self) -> String { let mut body = serde_json::json!({ "cpus": { From a3b4687caa58fbf5b7f469afffd3e584ef4b4ba7 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Mon, 13 Apr 2026 17:10:16 +0200 Subject: [PATCH 536/742] test_infra: split SSH command helpers by retry behavior Split the SSH helpers into a one-shot execution path and a retrying wrapper with linear backoff. This makes it possible to use a single bounded SSH attempt when tests need a direct readiness probe while preserving the existing retrying behavior for callers that expect it. On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- cloud-hypervisor/tests/integration.rs | 4 +- test_infra/src/lib.rs | 98 +++++++++++++++------------ 2 files changed, 58 insertions(+), 44 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 22d32852e6..f16ea003bc 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -7286,7 +7286,7 @@ mod windows { } fn ssh_cmd(&self, cmd: &str) -> String { - ssh_command_ip_with_auth( + ssh_command_ip_with_auth_retry( cmd, &self.auth, &self.guest.network.guest_ip0, @@ -7477,7 +7477,7 @@ mod windows { // The timeout increase by n*1+n*2+n*3+..., therefore the initial // interval must be small. 
let tmo_int = 2; - let out = ssh_command_ip_with_auth( + let out = ssh_command_ip_with_auth_retry( cmd, &self.auth, &self.guest.network.guest_ip0, diff --git a/test_infra/src/lib.rs b/test_infra/src/lib.rs index c66d40ec28..21bf8f3c11 100644 --- a/test_infra/src/lib.rs +++ b/test_infra/src/lib.rs @@ -764,50 +764,63 @@ pub fn scp_to_guest( ) } +/// Executes a command on a remote host via SSH using password authentication. +/// Returns the stdout output on success, or an [`SshCommandError`] on any +/// connection, authentication, or execution failure. pub fn ssh_command_ip_with_auth( command: &str, auth: &PasswordAuth, ip: &str, - retries: u8, - timeout: u8, ) -> Result { let mut s = String::new(); + let tcp = TcpStream::connect(format!("{ip}:22")).map_err(SshCommandError::Connection)?; + let mut sess = Session::new().unwrap(); + sess.set_tcp_stream(tcp); + sess.handshake().map_err(SshCommandError::Handshake)?; + sess.userauth_password(&auth.username, &auth.password) + .map_err(SshCommandError::Authentication)?; + assert!(sess.authenticated()); + let mut channel = sess + .channel_session() + .map_err(SshCommandError::ChannelSession)?; + channel.exec(command).map_err(SshCommandError::Command)?; + // Intentionally ignore these results here as their failure + // does not precipitate a repeat + let _ = channel.read_to_string(&mut s); + let _ = channel.close(); + let _ = channel.wait_close(); + let status = channel.exit_status().map_err(SshCommandError::ExitStatus)?; + if status != 0 { + Err(SshCommandError::NonZeroExitStatus(status)) + } else { + Ok(s) + } +} +/// Executes a command on a remote host via SSH using password authentication, +/// retrying on failure with linear backoff. +/// +/// Delegates each attempt to [`ssh_command_ip_with_auth`]. After the +/// *n*-th consecutive failure the function sleeps for `timeout_s * n` seconds +/// before the next attempt. 
Once `retries` attempts are exhausted the command +/// output and error are printed to stderr and the last error is returned. +/// +/// Note that `timeout_s` is not a per-attempt deadline — individual connection +/// and I/O operations may block for as long as the OS or SSH layer allows. +// TODO since we have we probably want to migrate every single invocation to a +// more graceful combination of wait_until() and ssh_command_ip_with_auth(). +pub fn ssh_command_ip_with_auth_retry( + command: &str, + auth: &PasswordAuth, + ip: &str, + retries: u8, + // Base unit for the inter-retry sleep duration, in seconds. + timeout_s: u8, +) -> Result { let mut counter = 0; loop { - let mut closure = || -> Result<(), SshCommandError> { - let tcp = - TcpStream::connect(format!("{ip}:22")).map_err(SshCommandError::Connection)?; - let mut sess = Session::new().unwrap(); - sess.set_tcp_stream(tcp); - sess.handshake().map_err(SshCommandError::Handshake)?; - - sess.userauth_password(&auth.username, &auth.password) - .map_err(SshCommandError::Authentication)?; - assert!(sess.authenticated()); - - let mut channel = sess - .channel_session() - .map_err(SshCommandError::ChannelSession)?; - channel.exec(command).map_err(SshCommandError::Command)?; - - // Intentionally ignore these results here as their failure - // does not precipitate a repeat - let _ = channel.read_to_string(&mut s); - let _ = channel.close(); - let _ = channel.wait_close(); - - let status = channel.exit_status().map_err(SshCommandError::ExitStatus)?; - - if status != 0 { - Err(SshCommandError::NonZeroExitStatus(status)) - } else { - Ok(()) - } - }; - - match closure() { - Ok(_) => break, + match ssh_command_ip_with_auth(command, auth, ip) { + Ok(s) => return Ok(s), Err(e) => { counter += 1; if counter >= retries { @@ -816,27 +829,28 @@ pub fn ssh_command_ip_with_auth( command=\"{command}\"\n\ auth=\"{auth:#?}\"\n\ ip=\"{ip}\"\n\ - output=\"{s}\"\n\ error=\"{e:?}\"\n\ - \n==== End ssh command outout ====\n\n" + \n==== 
End ssh command output ====\n\n" ); - return Err(e); } } } - thread::sleep(std::time::Duration::new((timeout * counter).into(), 0)); + thread::sleep(std::time::Duration::new((timeout_s * counter).into(), 0)); } - Ok(s) } +/// Executes a command on a remote host via SSH using password authentication, +/// retrying on failure with linear backoff. +/// +/// Wrapper around [`ssh_command_ip_with_auth_retry`]. pub fn ssh_command_ip( command: &str, ip: &str, retries: u8, timeout: u8, ) -> Result { - ssh_command_ip_with_auth( + ssh_command_ip_with_auth_retry( command, &PasswordAuth { username: String::from("cloud"), @@ -856,7 +870,7 @@ pub fn wait_for_ssh( timeout: Duration, ) -> Result { wait_until_succeeds(timeout, || { - ssh_command_ip_with_auth(command, auth, ip, 1, 1) + ssh_command_ip_with_auth_retry(command, auth, ip, 1, 1) }) .map_err(|source| WaitForSshError::Timeout { command: command.to_string(), From b26488b1bff6aff39c0fdaa9f63d803aee23b6b3 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Mon, 13 Apr 2026 18:03:18 +0200 Subject: [PATCH 537/742] test_infra: bound SSH session runtime in wait_for_ssh Allow one-shot SSH commands to install a libssh2 session timeout and use that path from wait_for_ssh. This keeps SSH readiness probes from blocking far beyond their caller provided timeout when the guest network is slow or broken. 
On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- test_infra/src/lib.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/test_infra/src/lib.rs b/test_infra/src/lib.rs index 21bf8f3c11..f9f45d36d9 100644 --- a/test_infra/src/lib.rs +++ b/test_infra/src/lib.rs @@ -771,11 +771,15 @@ pub fn ssh_command_ip_with_auth( command: &str, auth: &PasswordAuth, ip: &str, + timeout: Option, ) -> Result { let mut s = String::new(); let tcp = TcpStream::connect(format!("{ip}:22")).map_err(SshCommandError::Connection)?; let mut sess = Session::new().unwrap(); sess.set_tcp_stream(tcp); + if let Some(timeout) = timeout { + sess.set_timeout(timeout.as_millis() as u32); + } sess.handshake().map_err(SshCommandError::Handshake)?; sess.userauth_password(&auth.username, &auth.password) .map_err(SshCommandError::Authentication)?; @@ -819,7 +823,7 @@ pub fn ssh_command_ip_with_auth_retry( ) -> Result { let mut counter = 0; loop { - match ssh_command_ip_with_auth(command, auth, ip) { + match ssh_command_ip_with_auth(command, auth, ip, None) { Ok(s) => return Ok(s), Err(e) => { counter += 1; @@ -870,7 +874,7 @@ pub fn wait_for_ssh( timeout: Duration, ) -> Result { wait_until_succeeds(timeout, || { - ssh_command_ip_with_auth_retry(command, auth, ip, 1, 1) + ssh_command_ip_with_auth(command, auth, ip, Some(timeout)) }) .map_err(|source| WaitForSshError::Timeout { command: command.to_string(), From 2c395d4ae8c1fd866708a6265302d849db1e8a67 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Fri, 10 Apr 2026 13:36:34 +0200 Subject: [PATCH 538/742] tests: Plumbing to retry when event monitor output is not ready Treat missing or still-short event monitor files as a retryable state in integration test helpers. This keeps polling-based restore and snapshot checks from failing early with file-not-found or short-file assertions while the monitor output is still being written.
In the following, we can gracefully wait for the corresponding conditions to become true. On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- cloud-hypervisor/tests/common/utils.rs | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/cloud-hypervisor/tests/common/utils.rs b/cloud-hypervisor/tests/common/utils.rs index 5064e1970e..f7cc1ea181 100644 --- a/cloud-hypervisor/tests/common/utils.rs +++ b/cloud-hypervisor/tests/common/utils.rs @@ -498,6 +498,9 @@ fn parse_event_file(event_file: &str) -> Vec { // Return true if all events from the input 'expected_events' are matched sequentially // with events from the 'event_file' pub(crate) fn check_sequential_events(expected_events: &[&MetaEvent], event_file: &str) -> bool { + if !Path::new(event_file).exists() { + return false; + } let json_events = parse_event_file(event_file); let len = expected_events.len(); let mut idx = 0; @@ -529,8 +532,13 @@ pub(crate) fn check_sequential_events_exact( expected_events: &[&MetaEvent], event_file: &str, ) -> bool { + if !Path::new(event_file).exists() { + return false; + } let json_events = parse_event_file(event_file); - assert!(expected_events.len() <= json_events.len()); + if expected_events.len() > json_events.len() { + return false; + } let json_events = &json_events[..expected_events.len()]; for (idx, e) in json_events.iter().enumerate() { @@ -551,8 +559,13 @@ pub(crate) fn check_sequential_events_exact( // Return true if events from the input 'latest_events' are matched exactly // with the most recent events from the 'event_file' pub(crate) fn check_latest_events_exact(latest_events: &[&MetaEvent], event_file: &str) -> bool { + if !Path::new(event_file).exists() { + return false; + } let json_events = parse_event_file(event_file); - assert!(latest_events.len() <= json_events.len()); + if latest_events.len() > json_events.len() { + return false; + } let json_events = &json_events[(json_events.len() - 
latest_events.len())..]; for (idx, e) in json_events.iter().enumerate() { From 47024e73ceac3ff6f6fe931bf59ee7d497f6d61d Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Fri, 10 Apr 2026 13:37:45 +0200 Subject: [PATCH 539/742] tests: Replace common integration sleeps with polling Use polling helpers in common integration tests instead of fixed sleeps where the tests already know the expected ready state. This updates CPU and memory hotplug checks as well as a few device- and restore-related waits in common_parallel to stop oversleeping on the fast path while keeping the same assertions. On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- cloud-hypervisor/tests/integration.rs | 295 +++++++++++++++----------- test_infra/src/lib.rs | 28 ++- 2 files changed, 194 insertions(+), 129 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index f16ea003bc..9b0a0a952e 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -282,17 +282,25 @@ mod common_parallel { guest.enable_memory_hotplug(); resize_zone_command(&api_socket, "mem0", "3G"); - thread::sleep(std::time::Duration::new(5, 0)); - assert!(guest.get_total_memory().unwrap_or_default() > 4_800_000); + assert!(wait_until(Duration::from_secs(5), || guest + .get_total_memory() + .unwrap_or_default() + > 4_800_000)); resize_zone_command(&api_socket, "mem2", "3G"); - thread::sleep(std::time::Duration::new(5, 0)); - assert!(guest.get_total_memory().unwrap_or_default() > 6_720_000); + assert!(wait_until(Duration::from_secs(5), || guest + .get_total_memory() + .unwrap_or_default() + > 6_720_000)); resize_zone_command(&api_socket, "mem0", "2G"); - thread::sleep(std::time::Duration::new(5, 0)); - assert!(guest.get_total_memory().unwrap_or_default() > 5_760_000); + assert!(wait_until(Duration::from_secs(5), || guest + .get_total_memory() + .unwrap_or_default() + > 5_760_000)); 
resize_zone_command(&api_socket, "mem2", "2G"); - thread::sleep(std::time::Duration::new(5, 0)); - assert!(guest.get_total_memory().unwrap_or_default() > 4_800_000); + assert!(wait_until(Duration::from_secs(5), || guest + .get_total_memory() + .unwrap_or_default() + > 4_800_000)); guest.reboot_linux(0); @@ -302,11 +310,15 @@ mod common_parallel { // Check if we can still resize down to the initial 'boot'size resize_zone_command(&api_socket, "mem0", "1G"); - thread::sleep(std::time::Duration::new(5, 0)); - assert!(guest.get_total_memory().unwrap_or_default() < 4_800_000); + assert!(wait_until(Duration::from_secs(5), || guest + .get_total_memory() + .unwrap_or_default() + < 4_800_000)); resize_zone_command(&api_socket, "mem2", "1G"); - thread::sleep(std::time::Duration::new(5, 0)); - assert!(guest.get_total_memory().unwrap_or_default() < 3_840_000); + assert!(wait_until(Duration::from_secs(5), || guest + .get_total_memory() + .unwrap_or_default() + < 3_840_000)); }); kill_child(&mut child); @@ -2332,11 +2344,28 @@ mod common_parallel { .spawn() .unwrap(); - thread::sleep(std::time::Duration::new(30, 0)); + guest.wait_for_ssh(Duration::from_secs(30)).unwrap(); let r = std::panic::catch_unwind(|| { guest.ssh_command_l1("sudo systemctl start vfio").unwrap(); - thread::sleep(std::time::Duration::new(120, 0)); + let auth = PasswordAuth { + username: String::from("cloud"), + password: String::from("cloud123"), + }; + wait_for_ssh( + "true", + &auth, + &guest.network.l2_guest_ip1, + Duration::from_secs(120), + ) + .unwrap(); + wait_for_ssh( + "true", + &auth, + &guest.network.l2_guest_ip2, + Duration::from_secs(120), + ) + .unwrap(); // We booted our cloud hypervisor L2 guest with a "VFIOTAG" tag // added to its kernel command line. 
@@ -2395,7 +2424,18 @@ mod common_parallel { 1 )); - thread::sleep(std::time::Duration::new(10, 0)); + wait_for_ssh( + "true", + &auth, + &guest.network.l2_guest_ip3, + Duration::from_secs(10), + ) + .unwrap(); + assert!(wait_until(Duration::from_secs(10), || { + guest + .ssh_command_l2_1("ls /sys/bus/pci/devices") + .is_ok_and(|output| check_lines_count(output.trim(), 9)) + })); // Let's also verify from the third virtio-net device passed to // the L2 VM. This third device has been hotplugged through the L2 @@ -2427,7 +2467,11 @@ mod common_parallel { remove-device vfio123", ) .unwrap(); - thread::sleep(std::time::Duration::new(10, 0)); + assert!(wait_until(Duration::from_secs(10), || { + guest + .ssh_command_l2_1("ls /sys/bus/pci/devices") + .is_ok_and(|output| check_lines_count(output.trim(), 8)) + })); // Check the amount of PCI devices appearing in L2 VM is back down // to 8 devices. @@ -2596,11 +2640,9 @@ mod common_parallel { guest .ssh_command("echo 1 | sudo tee /sys/bus/cpu/devices/cpu3/online") .unwrap(); - thread::sleep(std::time::Duration::new(10, 0)); - assert_eq!( - guest.get_cpu_count().unwrap_or_default(), - u32::from(desired_vcpus) - ); + assert!(wait_until(Duration::from_secs(10), || { + guest.get_cpu_count().unwrap_or_default() == u32::from(desired_vcpus) + })); guest.reboot_linux(0); @@ -2613,11 +2655,9 @@ mod common_parallel { let desired_vcpus = 2; resize_command(&api_socket, Some(desired_vcpus), None, None, None); - thread::sleep(std::time::Duration::new(10, 0)); - assert_eq!( - guest.get_cpu_count().unwrap_or_default(), - u32::from(desired_vcpus) - ); + assert!(wait_until(Duration::from_secs(10), || { + guest.get_cpu_count().unwrap_or_default() == u32::from(desired_vcpus) + })); // Resize the VM back up to 4 let desired_vcpus = 4; @@ -2629,11 +2669,9 @@ mod common_parallel { guest .ssh_command("echo 1 | sudo tee /sys/bus/cpu/devices/cpu3/online") .unwrap(); - thread::sleep(std::time::Duration::new(10, 0)); - assert_eq!( - 
guest.get_cpu_count().unwrap_or_default(), - u32::from(desired_vcpus) - ); + assert!(wait_until(Duration::from_secs(10), || { + guest.get_cpu_count().unwrap_or_default() == u32::from(desired_vcpus) + })); }); kill_child(&mut child); @@ -2681,16 +2719,18 @@ mod common_parallel { let desired_ram = 1024 << 20; resize_command(&api_socket, None, Some(desired_ram), None, None); - thread::sleep(std::time::Duration::new(10, 0)); - assert!(guest.get_total_memory().unwrap_or_default() > 960_000); + assert!(wait_until(Duration::from_secs(10), || { + guest.get_total_memory().unwrap_or_default() > 960_000 + })); // Use balloon to remove RAM from the VM let desired_balloon = 512 << 20; resize_command(&api_socket, None, None, Some(desired_balloon), None); - thread::sleep(std::time::Duration::new(10, 0)); - assert!(guest.get_total_memory().unwrap_or_default() > 480_000); - assert!(guest.get_total_memory().unwrap_or_default() < 960_000); + assert!(wait_until(Duration::from_secs(10), || { + let total_memory = guest.get_total_memory().unwrap_or_default(); + total_memory > 480_000 && total_memory < 960_000 + })); guest.reboot_linux(0); @@ -2700,9 +2740,9 @@ mod common_parallel { let desired_balloon = 0; resize_command(&api_socket, None, None, Some(desired_balloon), None); - thread::sleep(std::time::Duration::new(10, 0)); - - assert!(guest.get_total_memory().unwrap_or_default() > 960_000); + assert!(wait_until(Duration::from_secs(10), || { + guest.get_total_memory().unwrap_or_default() > 960_000 + })); guest.enable_memory_hotplug(); @@ -2710,8 +2750,9 @@ mod common_parallel { let desired_ram = 2048 << 20; resize_command(&api_socket, None, Some(desired_ram), None, None); - thread::sleep(std::time::Duration::new(10, 0)); - assert!(guest.get_total_memory().unwrap_or_default() > 1_920_000); + assert!(wait_until(Duration::from_secs(10), || { + guest.get_total_memory().unwrap_or_default() > 1_920_000 + })); // Remove RAM to the VM (only applies after reboot) let desired_ram = 1024 << 20; @@ 
-2764,23 +2805,26 @@ mod common_parallel { let desired_ram = 1024 << 20; resize_command(&api_socket, None, Some(desired_ram), None, None); - thread::sleep(std::time::Duration::new(10, 0)); - assert!(guest.get_total_memory().unwrap_or_default() > 960_000); + assert!(wait_until(Duration::from_secs(10), || { + guest.get_total_memory().unwrap_or_default() > 960_000 + })); // Add RAM to the VM let desired_ram = 2048 << 20; resize_command(&api_socket, None, Some(desired_ram), None, None); - thread::sleep(std::time::Duration::new(10, 0)); - assert!(guest.get_total_memory().unwrap_or_default() > 1_920_000); + assert!(wait_until(Duration::from_secs(10), || { + guest.get_total_memory().unwrap_or_default() > 1_920_000 + })); // Remove RAM from the VM let desired_ram = 1024 << 20; resize_command(&api_socket, None, Some(desired_ram), None, None); - thread::sleep(std::time::Duration::new(10, 0)); - assert!(guest.get_total_memory().unwrap_or_default() > 960_000); - assert!(guest.get_total_memory().unwrap_or_default() < 1_920_000); + assert!(wait_until(Duration::from_secs(10), || { + let total_memory = guest.get_total_memory().unwrap_or_default(); + total_memory > 960_000 && total_memory < 1_920_000 + })); guest.reboot_linux(0); @@ -2791,9 +2835,10 @@ mod common_parallel { // Check we can still resize to 512MiB let desired_ram = 512 << 20; resize_command(&api_socket, None, Some(desired_ram), None, None); - thread::sleep(std::time::Duration::new(10, 0)); - assert!(guest.get_total_memory().unwrap_or_default() > 480_000); - assert!(guest.get_total_memory().unwrap_or_default() < 960_000); + assert!(wait_until(Duration::from_secs(10), || { + let total_memory = guest.get_total_memory().unwrap_or_default(); + total_memory > 480_000 && total_memory < 960_000 + })); }); kill_child(&mut child); @@ -2849,11 +2894,9 @@ mod common_parallel { guest .ssh_command("echo 1 | sudo tee /sys/bus/cpu/devices/cpu3/online") .unwrap(); - thread::sleep(std::time::Duration::new(10, 0)); - assert_eq!( - 
guest.get_cpu_count().unwrap_or_default(), - u32::from(desired_vcpus) - ); + assert!(wait_until(Duration::from_secs(10), || { + guest.get_cpu_count().unwrap_or_default() == u32::from(desired_vcpus) + })); assert!(guest.get_total_memory().unwrap_or_default() > 960_000); }); @@ -4779,18 +4822,16 @@ mod common_parallel { .unwrap(); }); - // Wait for 50 seconds to make sure the stress command is consuming - // the expected amount of memory. - thread::sleep(std::time::Duration::new(50, 0)); + // Wait for guest memory consumption to reach the expected level. + assert!(wait_until(Duration::from_secs(60), || process_rss_kib(pid) >= 2097152)); let rss = process_rss_kib(pid); println!("RSS {rss} >= 2097152"); assert!(rss >= 2097152); - // Wait for an extra minute to make sure the stress command has - // completed and that the guest reported the free pages to the VMM - // through the virtio-balloon device. We expect the RSS to be under - // 2GiB. - thread::sleep(std::time::Duration::new(60, 0)); + // Wait for stress to complete and free-page reporting to shrink RSS again. + assert!(wait_until(Duration::from_secs(120), || process_rss_kib( + pid + ) < 2097152)); let rss = process_rss_kib(pid); println!("RSS {rss} < 2097152"); assert!(rss < 2097152); @@ -4914,18 +4955,12 @@ mod common_parallel { assert!(remote_command(&api_socket, "remove-device", Some("test0"))); - thread::sleep(std::time::Duration::new(20, 0)); - - // Check device has gone away - assert_eq!( + // Wait for the pmem device to disappear from lsblk. 
+ assert!(wait_until(Duration::from_secs(20), || { guest .ssh_command("lsblk | grep -c pmem0.*128M || true") - .unwrap() - .trim() - .parse::() - .unwrap_or(1), - 0 - ); + .is_ok_and(|output| output.trim().parse::().unwrap_or(1) == 0) + })); guest.reboot_linux(1); @@ -5174,8 +5209,12 @@ mod common_parallel { .unwrap(); }); - // Wait for the server to be listening - thread::sleep(std::time::Duration::new(5, 0)); + guest1 + .wait_for_ssh_command( + "ss -ltnH | awk '{print $4}' | grep -q ':12345$'", + Duration::from_secs(20), + ) + .unwrap(); // Check the connection fails this time guest2.ssh_command("nc -vz 172.100.0.1 12345").unwrap_err(); @@ -5196,8 +5235,10 @@ mod common_parallel { Some(format!("file://{snapshot_dir}").as_str()), )); - // Wait to make sure the snapshot is completed - thread::sleep(std::time::Duration::new(10, 0)); + // Wait for the source VM snapshot artifacts to be ready. + assert!(wait_until(Duration::from_secs(10), || { + std::path::Path::new(&snapshot_dir).exists() + })); }); // Shutdown the source VM @@ -5224,12 +5265,17 @@ mod common_parallel { .spawn() .unwrap(); - // Wait for the VM to be restored - thread::sleep(std::time::Duration::new(10, 0)); + // Wait for the restored VM to accept SSH again after resume. 
let r = std::panic::catch_unwind(|| { // Resume the VM + assert!(wait_until(Duration::from_secs(30), || remote_command( + &api_socket_restored, + "info", + None + ))); assert!(remote_command(&api_socket_restored, "resume", None)); + guest2.wait_for_ssh(Duration::from_secs(30)).unwrap(); // Spawn a new netcat listener in the first VM let guest_ip = guest1.network.guest_ip0.clone(); @@ -5243,8 +5289,12 @@ mod common_parallel { .unwrap(); }); - // Wait for the server to be listening - thread::sleep(std::time::Duration::new(5, 0)); + guest1 + .wait_for_ssh_command( + "ss -ltnH | awk '{print $4}' | grep -q ':12345$'", + Duration::from_secs(20), + ) + .unwrap(); // And check the connection is still functional after restore guest2.ssh_command("nc -vz 172.100.0.1 12345").unwrap(); @@ -5370,18 +5420,14 @@ mod common_parallel { .contains("{\"id\":\"vfio_user0\",\"bdf\":\"0000:00:05.0\"}") ); - thread::sleep(std::time::Duration::new(10, 0)); - // Check both if /dev/nvme exists and if the block size is 128M. 
- assert_eq!( + assert!(wait_until(Duration::from_secs(10), || { guest .ssh_command("lsblk | grep nvme0n1 | grep -c 128M") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 1 - ); + .ok() + .and_then(|output| output.trim().parse::().ok()) + == Some(1) + })); // Check changes persist after reboot assert_eq!( @@ -5528,7 +5574,9 @@ mod common_parallel { // Start swtpm daemon let mut swtpm_child = swtpm_command.spawn().unwrap(); - thread::sleep(std::time::Duration::new(10, 0)); + assert!(wait_until(Duration::from_secs(10), || { + std::path::Path::new(&swtpm_socket_path).exists() + })); let mut child = guest_cmd.spawn().unwrap(); let r = std::panic::catch_unwind(|| { guest.wait_vm_boot().unwrap(); @@ -5634,17 +5682,13 @@ mod common_parallel { assert!(remote_command(&api_socket, "nmi", None)); - // Wait a while for guest - thread::sleep(std::time::Duration::new(3, 0)); - let expected_sequential_events = [&MetaEvent { event: "panic".to_string(), device_id: None, }]; - assert!(check_latest_events_exact( - &expected_sequential_events, - &event_path - )); + assert!(wait_until(Duration::from_secs(3), || { + check_latest_events_exact(&expected_sequential_events, &event_path) + })); }); kill_child(&mut child); @@ -8296,11 +8340,13 @@ mod vfio { // Add RAM to the VM let desired_ram = 6 << 30; resize_command(&api_socket, None, Some(desired_ram), None, None); - thread::sleep(std::time::Duration::new(30, 0)); + assert!(wait_until(Duration::from_secs(5), || { + guest.get_total_memory().unwrap_or_default() > 5_760_000 + })); assert!(guest.get_total_memory().unwrap_or_default() > 5_760_000); // Check the VFIO device works when RAM is increased to 6GiB - guest.check_nvidia_gpu(); + assert!(guest.check_nvidia_gpu()); }); let _ = child.kill(); @@ -8361,10 +8407,8 @@ mod vfio { .contains("{\"id\":\"vfio0\",\"bdf\":\"0000:00:06.0\"}") ); - thread::sleep(std::time::Duration::new(10, 0)); - // Check the VFIO device works after hotplug - guest.check_nvidia_gpu(); + 
assert!(wait_until(Duration::from_secs(10), || guest.check_nvidia_gpu())); }); let _ = child.kill(); @@ -8408,12 +8452,12 @@ mod vfio { guest.wait_vm_boot().unwrap(); // Check the VFIO device works after boot - guest.check_nvidia_gpu(); + assert!(guest.check_nvidia_gpu()); guest.reboot_linux(0); // Check the VFIO device works after reboot - guest.check_nvidia_gpu(); + assert!(guest.check_nvidia_gpu()); }); let _ = child.kill(); @@ -8841,7 +8885,9 @@ mod live_migration { "remove-device", Some(net_id), )); - thread::sleep(std::time::Duration::new(10, 0)); + assert!(wait_until(Duration::from_secs(10), || { + guest.wait_for_ssh(Duration::from_secs(1)).is_err() + })); // Plug the virtio-net device again assert!(remote_command( @@ -8849,7 +8895,7 @@ mod live_migration { "add-net", Some(net_params.as_str()), )); - thread::sleep(std::time::Duration::new(10, 0)); + guest.wait_for_ssh(Duration::from_secs(10)).unwrap(); } // Start the live-migration @@ -8993,11 +9039,16 @@ mod live_migration { assert!(guest.get_total_memory().unwrap_or_default() > 3_840_000); // Increase the guest RAM resize_command(&src_api_socket, None, Some(6 << 30), None, None); - thread::sleep(std::time::Duration::new(5, 0)); + assert!(wait_until(Duration::from_secs(30), || { + guest.get_total_memory().unwrap_or_default() > 5_760_000 + })); assert!(guest.get_total_memory().unwrap_or_default() > 5_760_000); // Use balloon to remove RAM from the VM resize_command(&src_api_socket, None, None, Some(1 << 30), None); - thread::sleep(std::time::Duration::new(5, 0)); + assert!(wait_until(Duration::from_secs(5), || { + let total_memory = guest.get_total_memory().unwrap_or_default(); + total_memory > 4_800_000 && total_memory < 5_760_000 + })); let total_memory = guest.get_total_memory().unwrap_or_default(); assert!(total_memory > 4_800_000); assert!(total_memory < 5_760_000); @@ -9015,7 +9066,9 @@ mod live_migration { "remove-device", Some(net_id), )); - thread::sleep(std::time::Duration::new(10, 0)); + 
assert!(wait_until(Duration::from_secs(10), || { + guest.wait_for_ssh(Duration::from_secs(1)).is_err() + })); // Plug the virtio-net device again assert!(remote_command( @@ -9023,7 +9076,7 @@ mod live_migration { "add-net", Some(net_params.as_str()), )); - thread::sleep(std::time::Duration::new(10, 0)); + guest.wait_for_ssh(Duration::from_secs(10)).unwrap(); } // Start the live-migration @@ -9233,7 +9286,9 @@ mod live_migration { "remove-device", Some(net_id), )); - thread::sleep(std::time::Duration::new(10, 0)); + assert!(wait_until(Duration::from_secs(10), || { + guest.wait_for_ssh(Duration::from_secs(1)).is_err() + })); // Plug the virtio-net device again assert!(remote_command( @@ -9241,7 +9296,7 @@ mod live_migration { "add-net", Some(net_params.as_str()), )); - thread::sleep(std::time::Duration::new(10, 0)); + guest.wait_for_ssh(Duration::from_secs(10)).unwrap(); } // Start the live-migration @@ -9429,7 +9484,9 @@ mod live_migration { "remove-device", Some(net_id), )); - thread::sleep(std::time::Duration::new(10, 0)); + assert!(wait_until(Duration::from_secs(10), || { + guest.wait_for_ssh(Duration::from_secs(1)).is_err() + })); // Plug the virtio-net device again assert!(remote_command( @@ -9437,7 +9494,7 @@ mod live_migration { "add-net", Some(net_params.as_str()), )); - thread::sleep(std::time::Duration::new(10, 0)); + guest.wait_for_ssh(Duration::from_secs(10)).unwrap(); } // Enable watchdog and ensure its functional @@ -9938,14 +9995,16 @@ mod live_migration { "remove-device", Some(net_id), )); - thread::sleep(Duration::new(10, 0)); + assert!(wait_until(Duration::from_secs(10), || { + guest.wait_for_ssh(Duration::from_secs(1)).is_err() + })); // Re-add the virtio-net device assert!(remote_command( &src_api_socket, "add-net", Some(net_params.as_str()), )); - thread::sleep(Duration::new(10, 0)); + guest.wait_for_ssh(Duration::from_secs(10)).unwrap(); } // Start TCP live migration assert!( diff --git a/test_infra/src/lib.rs b/test_infra/src/lib.rs index 
f9f45d36d9..ba10cd5b51 100644 --- a/test_infra/src/lib.rs +++ b/test_infra/src/lib.rs @@ -1439,19 +1439,25 @@ impl Guest { } #[cfg(target_arch = "x86_64")] - pub fn check_nvidia_gpu(&self) { + pub fn check_nvidia_gpu(&self) -> bool { let output = self.ssh_command("nvidia-smi").unwrap(); - if !output.contains("NVIDIA L40S") { - let dmesg = self - .ssh_command("sudo dmesg") - .unwrap_or_else(|e| format!("Failed to get dmesg: {e:?}")); - eprintln!( - "\n\n==== Guest dmesg (nvidia-smi check failed) ====\n\n\ - {dmesg}\n\ - \n==== End guest dmesg ====\n\n" - ); - panic!("nvidia-smi output did not contain 'NVIDIA L40S': {output}"); + + if output.contains("NVIDIA L40S") { + return true; } + + let dmesg = self + .ssh_command("sudo dmesg") + .unwrap_or_else(|e| format!("Failed to get dmesg: {e:?}")); + + eprintln!( + "\n\n==== Guest dmesg (nvidia-smi check failed) ====\n\n\ + {dmesg}\n\ + \n==== End guest dmesg ====\n\n" + ); + eprintln!("nvidia-smi output did not contain 'NVIDIA L40S': {output}"); + + false } pub fn reboot_linux(&self, current_reboot_count: u32) { From 44ac7c0ba084c50bbf32b1ef4abbf806671f8e1c Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Fri, 10 Apr 2026 13:38:19 +0200 Subject: [PATCH 540/742] tests: Poll snapshot restore readiness in Linux tests Replace fixed sleeps in Linux snapshot and restore integration tests with event monitor and API readiness checks. This updates ivshmem and common_sequential snapshot paths to wait for concrete restore and snapshot completion signals instead of sleeping for an assumed amount of time. 
On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- cloud-hypervisor/tests/integration.rs | 223 +++++++++++++------------- 1 file changed, 113 insertions(+), 110 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 9b0a0a952e..3371118216 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -6162,30 +6162,26 @@ mod ivshmem { .spawn() .unwrap(); - // Wait for the VM to be restored - thread::sleep(std::time::Duration::new(20, 0)); - let latest_events = [&MetaEvent { event: "restored".to_string(), device_id: None, }]; - assert!(check_latest_events_exact( - &latest_events, - &event_path_restored - )); + // Wait for the restored event to show up in the monitor file. + assert!(wait_until(Duration::from_secs(30), || { + check_latest_events_exact(&latest_events, &event_path_restored) + })); // Remove the snapshot dir let _ = remove_dir_all(snapshot_dir.as_str()); let r = std::panic::catch_unwind(|| { // Resume the VM + assert!(wait_until(Duration::from_secs(30), || remote_command( + &api_socket_restored, + "info", + None + ))); assert!(remote_command(&api_socket_restored, "resume", None)); - // There is no way that we can ensure the 'write()' to the - // event file is completed when the 'resume' request is - // returned successfully, because the 'write()' was done - // asynchronously from a different thread of Cloud - // Hypervisor (e.g. the event-monitor thread). 
- thread::sleep(std::time::Duration::new(1, 0)); let latest_events = [ &MetaEvent { event: "resuming".to_string(), @@ -6196,10 +6192,9 @@ mod ivshmem { device_id: None, }, ]; - assert!(check_latest_events_exact( - &latest_events, - &event_path_restored - )); + assert!(wait_until(Duration::from_secs(30), || { + check_latest_events_exact(&latest_events, &event_path_restored) + })); // Check the number of vCPUs assert_eq!(guest.get_cpu_count().unwrap_or_default(), 2); @@ -6259,9 +6254,10 @@ mod common_sequential { device_id: None, }, ]; - // See: #5938 - thread::sleep(std::time::Duration::new(1, 0)); - assert!(check_latest_events_exact(&latest_events, event_path)); + + assert!(wait_until(Duration::from_secs(30), || { + check_latest_events_exact(&latest_events, event_path) + })); // Take a snapshot from the VM assert!(remote_command( @@ -6270,9 +6266,6 @@ mod common_sequential { Some(format!("file://{snapshot_dir}").as_str()), )); - // Wait to make sure the snapshot is completed - thread::sleep(std::time::Duration::new(10, 0)); - let latest_events = [ &MetaEvent { event: "snapshotting".to_string(), @@ -6283,9 +6276,10 @@ mod common_sequential { device_id: None, }, ]; - // See: #5938 - thread::sleep(std::time::Duration::new(1, 0)); - assert!(check_latest_events_exact(&latest_events, event_path)); + + assert!(wait_until(Duration::from_secs(30), || { + check_latest_events_exact(&latest_events, event_path) + })); } // One thing to note about this test. 
The virtio-net device is heavily used @@ -6321,7 +6315,7 @@ mod common_sequential { "id={},tap=,mac={},ip={},mask=255.255.255.128", net_id, guest.network.guest_mac0, guest.network.host_ip0 ); - let mut mem_params = "size=2G"; + let mut mem_params = "size=1G"; if use_hotplug { mem_params = "size=2G,hotplug_method=virtio-mem,hotplug_size=32G"; @@ -6368,7 +6362,12 @@ mod common_sequential { // Check the number of vCPUs assert_eq!(guest.get_cpu_count().unwrap_or_default(), 4); // Check the guest RAM - assert!(guest.get_total_memory().unwrap_or_default() > 1_920_000); + let total_memory = guest.get_total_memory().unwrap_or_default(); + if use_hotplug { + assert!(total_memory > 1_900_000, "total memory: {total_memory}"); + } else { + assert!(total_memory > 900_000, "total memory: {total_memory}"); + } if use_hotplug { // Increase guest RAM with virtio-mem resize_command( @@ -6390,8 +6389,8 @@ mod common_sequential { ); thread::sleep(std::time::Duration::new(5, 0)); let total_memory = guest.get_total_memory().unwrap_or_default(); - assert!(total_memory > 4_800_000); - assert!(total_memory < 5_760_000); + assert!(total_memory > 4_800_000, "total_memory is {total_memory}"); + assert!(total_memory < 5_760_000, "total_memory is {total_memory}"); } // Check the guest virtio-devices, e.g. 
block, rng, vsock, console, and net guest.check_devices_common(Some(&socket), Some(&console_text), None); @@ -6415,9 +6414,9 @@ mod common_sequential { event: "device-removed".to_string(), device_id: Some(net_id.to_string()), }]; - // See: #5938 - thread::sleep(std::time::Duration::new(1, 0)); - assert!(check_latest_events_exact(&latest_events, &event_path)); + assert!(wait_until(Duration::from_secs(30), || { + check_latest_events_exact(&latest_events, &event_path) + })); // Plug the virtio-net device again assert!(remote_command( @@ -6467,8 +6466,6 @@ mod common_sequential { .spawn() .unwrap(); - // Wait for the VM to be restored - thread::sleep(std::time::Duration::new(20, 0)); let expected_events = [ &MetaEvent { event: "starting".to_string(), @@ -6487,10 +6484,9 @@ mod common_sequential { device_id: None, }, ]; - assert!(check_sequential_events( - &expected_events, - &event_path_restored - )); + assert!(wait_until(Duration::from_secs(30), || { + check_sequential_events(&expected_events, &event_path_restored) + })); if use_resume_option { let latest_events = [ &MetaEvent { @@ -6506,21 +6502,26 @@ mod common_sequential { device_id: None, }, ]; - assert!(check_latest_events_exact( - &latest_events, - &event_path_restored - )); + assert!(wait_until(Duration::from_secs(30), || { + check_latest_events_exact(&latest_events, &event_path_restored) + })); } else { let latest_events = [&MetaEvent { event: "restored".to_string(), device_id: None, }]; - assert!(check_latest_events_exact( - &latest_events, - &event_path_restored - )); + assert!(wait_until(Duration::from_secs(30), || { + check_latest_events_exact(&latest_events, &event_path_restored) + })); } + // Wait until the restored VM API is ready before issuing follow-up requests. 
+ assert!(wait_until(Duration::from_secs(30), || remote_command( + &api_socket_restored, + "info", + None + ))); + // Remove the snapshot dir let _ = remove_dir_all(snapshot_dir.as_str()); @@ -6530,13 +6531,12 @@ mod common_sequential { thread::sleep(std::time::Duration::new(1, 0)); } else { // Resume the VM manually + assert!(wait_until(Duration::from_secs(30), || remote_command( + &api_socket_restored, + "info", + None + ))); assert!(remote_command(&api_socket_restored, "resume", None)); - // There is no way that we can ensure the 'write()' to the - // event file is completed when the 'resume' request is - // returned successfully, because the 'write()' was done - // asynchronously from a different thread of Cloud - // Hypervisor (e.g. the event-monitor thread). - thread::sleep(std::time::Duration::new(1, 0)); let latest_events = [ &MetaEvent { @@ -6548,18 +6548,17 @@ mod common_sequential { device_id: None, }, ]; - assert!(check_latest_events_exact( - &latest_events, - &event_path_restored - )); + assert!(wait_until(Duration::from_secs(30), || { + check_latest_events_exact(&latest_events, &event_path_restored) + })); } // Perform same checks to validate VM has been properly restored assert_eq!(guest.get_cpu_count().unwrap_or_default(), 4); let total_memory = guest.get_total_memory().unwrap_or_default(); if use_hotplug { - assert!(total_memory > 4_800_000); - assert!(total_memory < 5_760_000); + assert!(total_memory > 4_800_000, "total_memory is {total_memory}"); + assert!(total_memory < 5_760_000, "total_memory is {total_memory}"); // Deflate balloon to restore entire RAM to the VM resize_command(&api_socket_restored, None, None, Some(0), None); thread::sleep(std::time::Duration::new(5, 0)); @@ -6568,10 +6567,10 @@ mod common_sequential { resize_command(&api_socket_restored, None, Some(5 << 30), None, None); thread::sleep(std::time::Duration::new(5, 0)); let total_memory = guest.get_total_memory().unwrap_or_default(); - assert!(total_memory > 4_800_000); - 
assert!(total_memory < 5_760_000); + assert!(total_memory > 4_800_000, "total_memory is {total_memory}"); + assert!(total_memory < 5_760_000, "total_memory is {total_memory}"); } else { - assert!(total_memory > 1_920_000); + assert!(total_memory > 900_000, "total memory: {total_memory}"); } guest.check_devices_common(Some(&socket), Some(&console_text), None); @@ -6699,20 +6698,23 @@ mod common_sequential { .spawn() .unwrap(); - thread::sleep(std::time::Duration::new(20, 0)); - let latest_events = [&MetaEvent { event: "restored".to_string(), device_id: None, }]; - assert!(check_latest_events_exact( - &latest_events, - &event_path_restored - )); + + assert!(wait_until(Duration::from_secs(30), || { + check_latest_events_exact(&latest_events, &event_path_restored) + })); let r = std::panic::catch_unwind(|| { + assert!(wait_until(Duration::from_secs(30), || remote_command( + &api_socket_restored, + "info", + None + ))); assert!(remote_command(&api_socket_restored, "resume", None)); - thread::sleep(std::time::Duration::new(1, 0)); + let latest_events = [ &MetaEvent { event: "resuming".to_string(), @@ -6723,10 +6725,9 @@ mod common_sequential { device_id: None, }, ]; - assert!(check_latest_events_exact( - &latest_events, - &event_path_restored - )); + assert!(wait_until(Duration::from_secs(30), || { + check_latest_events_exact(&latest_events, &event_path_restored) + })); assert_eq!(guest.get_cpu_count().unwrap_or_default(), 4); assert!(guest.get_total_memory().unwrap_or_default() > min_total_memory_kib); @@ -6897,7 +6898,9 @@ mod common_sequential { )); // Wait for the VM to be restored - thread::sleep(std::time::Duration::new(20, 0)); + assert!(wait_until(Duration::from_secs(20), || { + remote_command(&api_socket_restored, "info", None) + })); // close the fds as CH duplicates them before using for tap in taps.iter() { @@ -6922,31 +6925,30 @@ mod common_sequential { device_id: None, }, ]; - assert!(check_sequential_events( - &expected_events, - &event_path_restored - )); 
+ // Wait for the restore event sequence to be recorded. + assert!(wait_until(Duration::from_secs(30), || { + check_sequential_events(&expected_events, &event_path_restored) + })); let latest_events = [&MetaEvent { event: "restored".to_string(), device_id: None, }]; - assert!(check_latest_events_exact( - &latest_events, - &event_path_restored - )); + assert!(wait_until(Duration::from_secs(30), || { + check_latest_events_exact(&latest_events, &event_path_restored) + })); // Remove the snapshot dir let _ = remove_dir_all(snapshot_dir.as_str()); let r = std::panic::catch_unwind(|| { // Resume the VM + assert!(wait_until(Duration::from_secs(20), || remote_command( + &api_socket_restored, + "info", + None + ))); assert!(remote_command(&api_socket_restored, "resume", None)); - // There is no way that we can ensure the 'write()' to the - // event file is completed when the 'resume' request is - // returned successfully, because the 'write()' was done - // asynchronously from a different thread of Cloud - // Hypervisor (e.g. the event-monitor thread). - thread::sleep(std::time::Duration::new(1, 0)); + let latest_events = [ &MetaEvent { event: "resuming".to_string(), @@ -6957,10 +6959,9 @@ mod common_sequential { device_id: None, }, ]; - assert!(check_latest_events_exact( - &latest_events, - &event_path_restored - )); + assert!(wait_until(Duration::from_secs(30), || { + check_latest_events_exact(&latest_events, &event_path_restored) + })); // Perform same checks to validate VM has been properly restored assert_eq!(guest.get_cpu_count().unwrap_or_default(), n_cpu); @@ -7063,30 +7064,26 @@ mod common_sequential { .spawn() .unwrap(); - // Wait for the VM to be restored - thread::sleep(std::time::Duration::new(20, 0)); - let latest_events = [&MetaEvent { event: "restored".to_string(), device_id: None, }]; - assert!(check_latest_events_exact( - &latest_events, - &event_path_restored - )); + // Wait for the restored event to show up in the monitor file. 
+ assert!(wait_until(Duration::from_secs(30), || { + check_latest_events_exact(&latest_events, &event_path_restored) + })); // Remove the snapshot dir let _ = remove_dir_all(snapshot_dir.as_str()); let r = std::panic::catch_unwind(|| { // Resume the VM + assert!(wait_until(Duration::from_secs(30), || remote_command( + &api_socket_restored, + "info", + None + ))); assert!(remote_command(&api_socket_restored, "resume", None)); - // There is no way that we can ensure the 'write()' to the - // event file is completed when the 'resume' request is - // returned successfully, because the 'write()' was done - // asynchronously from a different thread of Cloud - // Hypervisor (e.g. the event-monitor thread). - thread::sleep(std::time::Duration::new(1, 0)); let latest_events = [ &MetaEvent { event: "resuming".to_string(), @@ -7097,10 +7094,9 @@ mod common_sequential { device_id: None, }, ]; - assert!(check_latest_events_exact( - &latest_events, - &event_path_restored - )); + assert!(wait_until(Duration::from_secs(30), || { + check_latest_events_exact(&latest_events, &event_path_restored) + })); // Check the number of vCPUs assert_eq!(guest.get_cpu_count().unwrap_or_default(), 2); @@ -7228,7 +7224,9 @@ mod common_sequential { .unwrap(); // Wait for the VM to be restored - thread::sleep(std::time::Duration::new(20, 0)); + assert!(wait_until(Duration::from_secs(30), || { + remote_command(&api_socket_restored, "info", None) + })); let latest_events = [&MetaEvent { event: "restored".to_string(), @@ -7244,6 +7242,11 @@ mod common_sequential { let r = std::panic::catch_unwind(|| { // Resume the VM + assert!(wait_until(Duration::from_secs(30), || remote_command( + &api_socket_restored, + "info", + None + ))); assert!(remote_command(&api_socket_restored, "resume", None)); thread::sleep(std::time::Duration::new(5, 0)); From cd92f65f7b8bcf91dc1e0e05d9571e7d5a73023d Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Fri, 10 Apr 2026 13:39:53 +0200 Subject: [PATCH 541/742] tests: 
Replace Windows integration sleeps with polling Replace fixed sleeps in Windows integration tests with polling helpers that wait for boot, snapshot readiness, and device enumeration. This keeps the same test intent while avoiding long fixed delays on the fast path. On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- cloud-hypervisor/tests/integration.rs | 157 ++++++++++++++++---------- 1 file changed, 97 insertions(+), 60 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 3371118216..82b1076017 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -7518,37 +7518,19 @@ mod windows { )) } - fn wait_for_boot(&self) -> bool { - let cmd = "dir /b c:\\ | find \"Windows\""; - let tmo_max = 180; - // The timeout increase by n*1+n*2+n*3+..., therefore the initial - // interval must be small. - let tmo_int = 2; - let out = ssh_command_ip_with_auth_retry( - cmd, + fn wait_for_boot(&self) -> Result<(), WaitForSshError> { + let out = wait_for_ssh( + "dir /b c:\\ | find \"Windows\"", &self.auth, &self.guest.network.guest_ip0, - { - let mut ret = 1; - let mut tmo_acc = 0; - loop { - tmo_acc += tmo_int * ret; - if tmo_acc >= tmo_max { - break; - } - ret += 1; - } - ret - }, - tmo_int, - ) - .unwrap(); + Duration::from_secs(180), + )?; - if "Windows" == out.trim() { - return true; + if out.trim() == "Windows" { + Ok(()) + } else { + panic!("Unexpected Windows boot probe output: {:?}", out.trim()); } - - false } } @@ -7617,7 +7599,7 @@ mod windows { let r = std::panic::catch_unwind(|| { // Wait to make sure Windows boots up - assert!(windows_guest.wait_for_boot()); + windows_guest.wait_for_boot().unwrap(); windows_guest.shutdown(); }); @@ -7682,7 +7664,7 @@ mod windows { let r = std::panic::catch_unwind(|| { // Wait to make sure Windows boots up - assert!(windows_guest.wait_for_boot()); + windows_guest.wait_for_boot().unwrap(); windows_guest.shutdown(); 
}); @@ -7733,7 +7715,7 @@ mod windows { let mut child_dnsmasq = windows_guest.run_dnsmasq(); // Wait to make sure Windows boots up - assert!(windows_guest.wait_for_boot()); + windows_guest.wait_for_boot().unwrap(); let snapshot_dir = temp_snapshot_dir_path(&tmp_dir); @@ -7747,8 +7729,11 @@ mod windows { Some(format!("file://{snapshot_dir}").as_str()), )); - // Wait to make sure the snapshot is completed - thread::sleep(std::time::Duration::new(30, 0)); + let snapshot_state_path = std::path::Path::new(&snapshot_dir).join("state.json"); + let snapshot_config_path = std::path::Path::new(&snapshot_dir).join("config.json"); + assert!(wait_until(Duration::from_secs(30), || { + snapshot_state_path.exists() && snapshot_config_path.exists() + })); let _ = child.kill(); child.wait().unwrap(); @@ -7767,10 +7752,17 @@ mod windows { .unwrap(); // Wait for the VM to be restored - thread::sleep(std::time::Duration::new(20, 0)); + assert!(wait_until(Duration::from_secs(30), || { + remote_command(&api_socket_restored, "info", None) + })); let r = std::panic::catch_unwind(|| { // Resume the VM + assert!(wait_until(Duration::from_secs(30), || remote_command( + &api_socket_restored, + "info", + None + ))); assert!(remote_command(&api_socket_restored, "resume", None)); windows_guest.shutdown(); @@ -7816,7 +7808,7 @@ mod windows { let r = std::panic::catch_unwind(|| { // Wait to make sure Windows boots up - assert!(windows_guest.wait_for_boot()); + windows_guest.wait_for_boot().unwrap(); let vcpu_num = 2; // Check the initial number of CPUs the guest sees @@ -7827,8 +7819,10 @@ mod windows { let vcpu_num = 6; // Hotplug some CPUs resize_command(&api_socket, Some(vcpu_num), None, None, None); - // Wait to make sure CPUs are added - thread::sleep(std::time::Duration::new(10, 0)); + // Wait for Windows to report the hotplugged CPUs. 
+ assert!(wait_until(Duration::from_secs(10), || windows_guest + .cpu_count() + == vcpu_num)); // Check the guest sees the correct number assert_eq!(windows_guest.cpu_count(), vcpu_num); // Check the CH process has the correct number of vcpu threads @@ -7837,12 +7831,16 @@ mod windows { let vcpu_num = 4; // Remove some CPUs. Note that Windows doesn't support hot-remove. resize_command(&api_socket, Some(vcpu_num), None, None, None); - // Wait to make sure CPUs are removed thread::sleep(std::time::Duration::new(10, 0)); + // Reboot to let Windows catch up windows_guest.reboot(); - // Wait to make sure Windows completely rebooted - thread::sleep(std::time::Duration::new(60, 0)); + // Wait for Windows to come back after the reboot. + windows_guest.wait_for_boot().unwrap(); + // Wait for Windows to reflect the unplugged CPU count. + assert!(wait_until(Duration::from_secs(60), || windows_guest + .cpu_count() + == vcpu_num)); // Check the guest sees the correct number assert_eq!(windows_guest.cpu_count(), vcpu_num); // Check the CH process has the correct number of vcpu threads @@ -7891,7 +7889,7 @@ mod windows { let r = std::panic::catch_unwind(|| { // Wait to make sure Windows boots up - assert!(windows_guest.wait_for_boot()); + windows_guest.wait_for_boot().unwrap(); let ram_size = 2 * 1024 * 1024 * 1024; // Check the initial number of RAM the guest sees @@ -7906,20 +7904,22 @@ mod windows { let ram_size = 4 * 1024 * 1024 * 1024; // Hotplug some RAM resize_command(&api_socket, None, Some(ram_size), None, None); - // Wait to make sure RAM has been added - thread::sleep(std::time::Duration::new(10, 0)); - // Check the guest sees the correct number - assert_eq!(windows_guest.ram_size(), ram_size - reserved_ram_size); + // Wait for Windows to report the hotplugged memory. + assert!(wait_until(Duration::from_secs(10), || windows_guest + .ram_size() + == ram_size - reserved_ram_size)); let ram_size = 3 * 1024 * 1024 * 1024; // Unplug some RAM. 
Note that hot-remove most likely won't work. resize_command(&api_socket, None, Some(ram_size), None, None); - // Wait to make sure RAM has been added - thread::sleep(std::time::Duration::new(10, 0)); // Reboot to let Windows catch up windows_guest.reboot(); - // Wait to make sure guest completely rebooted - thread::sleep(std::time::Duration::new(60, 0)); + // Wait for Windows to come back after the reboot. + windows_guest.wait_for_boot().unwrap(); + // Wait for Windows to reflect the unplugged RAM amount. + assert!(wait_until(Duration::from_secs(60), || windows_guest + .ram_size() + == ram_size - reserved_ram_size)); // Check the guest sees the correct number assert_eq!(windows_guest.ram_size(), ram_size - reserved_ram_size); @@ -7965,7 +7965,7 @@ mod windows { let r = std::panic::catch_unwind(|| { // Wait to make sure Windows boots up - assert!(windows_guest.wait_for_boot()); + windows_guest.wait_for_boot().unwrap(); // Initially present network device let netdev_num = 1; @@ -7980,7 +7980,11 @@ mod windows { ); assert!(cmd_success); assert!(String::from_utf8_lossy(&cmd_output).contains("\"id\":\"_net2\"")); - thread::sleep(std::time::Duration::new(5, 0)); + // Wait for Windows to enumerate the added network device. + assert!(wait_until(Duration::from_secs(5), || windows_guest + .netdev_count() + == 2 + && netdev_ctrl_threads_count(child.id()) == 2)); // Verify the device is on the system let netdev_num = 2; assert_eq!(windows_guest.netdev_count(), netdev_num); @@ -7989,7 +7993,11 @@ mod windows { // Remove network device let cmd_success = remote_command(&api_socket, "remove-device", Some("_net2")); assert!(cmd_success); - thread::sleep(std::time::Duration::new(5, 0)); + // Wait for Windows to drop the removed network device. 
+ assert!(wait_until(Duration::from_secs(5), || windows_guest + .netdev_count() + == 1 + && netdev_ctrl_threads_count(child.id()) == 1)); // Verify the device has been removed let netdev_num = 1; assert_eq!(windows_guest.netdev_count(), netdev_num); @@ -8041,7 +8049,7 @@ mod windows { let r = std::panic::catch_unwind(|| { // Wait to make sure Windows boots up - assert!(windows_guest.wait_for_boot()); + windows_guest.wait_for_boot().unwrap(); // Initially present disk device let disk_num = 1; @@ -8056,10 +8064,14 @@ mod windows { ); assert!(cmd_success); assert!(String::from_utf8_lossy(&cmd_output).contains("\"id\":\"_disk2\"")); - thread::sleep(std::time::Duration::new(5, 0)); // Online disk device windows_guest.disks_set_rw(); windows_guest.disks_online(); + // Wait for Windows to enumerate the added disk. + assert!(wait_until(Duration::from_secs(5), || windows_guest + .disk_count() + == 2 + && disk_ctrl_threads_count(child.id()) == 2)); // Verify the device is on the system let disk_num = 2; assert_eq!(windows_guest.disk_count(), disk_num); @@ -8072,7 +8084,11 @@ mod windows { // Unmount disk device let cmd_success = remote_command(&api_socket, "remove-device", Some("_disk2")); assert!(cmd_success); - thread::sleep(std::time::Duration::new(5, 0)); + // Wait for Windows to drop the removed disk. + assert!(wait_until(Duration::from_secs(5), || windows_guest + .disk_count() + == 1 + && disk_ctrl_threads_count(child.id()) == 1)); // Verify the device has been removed let disk_num = 1; assert_eq!(windows_guest.disk_count(), disk_num); @@ -8085,7 +8101,11 @@ mod windows { Some(format!("path={disk},readonly=off").as_str()), ); assert!(cmd_success); - thread::sleep(std::time::Duration::new(5, 0)); + // Wait for Windows to mount the re-added disk again. 
+ assert!(wait_until(Duration::from_secs(5), || windows_guest + .disk_file_read(fname) + .trim() + == data)); let out = windows_guest.disk_file_read(fname); assert_eq!(data, out.trim()); @@ -8151,7 +8171,7 @@ mod windows { let r = std::panic::catch_unwind(|| { // Wait to make sure Windows boots up - assert!(windows_guest.wait_for_boot()); + windows_guest.wait_for_boot().unwrap(); // Initially present disk device let disk_num = 1; @@ -8161,6 +8181,10 @@ mod windows { for it in &disk_test_data { let disk_id = it[0].as_str(); let disk = it[1].as_str(); + + let expected_disk_num = windows_guest.disk_count() + 1; + let expected_ctrl_threads = disk_ctrl_threads_count(child.id()) + 1; + // Hotplug disk device let (cmd_success, cmd_output) = remote_command_w_output( &api_socket, @@ -8172,7 +8196,13 @@ mod windows { String::from_utf8_lossy(&cmd_output) .contains(format!("\"id\":\"{disk_id}\"").as_str()) ); - thread::sleep(std::time::Duration::new(5, 0)); + + // Wait for disk to appear + assert!(wait_until(Duration::from_secs(5), || { + windows_guest.disk_count() == expected_disk_num + && disk_ctrl_threads_count(child.id()) == expected_ctrl_threads + })); + // Online disk devices windows_guest.disks_set_rw(); windows_guest.disks_online(); @@ -8194,9 +8224,13 @@ mod windows { let disk_id = it[0].as_str(); let cmd_success = remote_command(&api_socket, "remove-device", Some(disk_id)); assert!(cmd_success); - thread::sleep(std::time::Duration::new(5, 0)); } + // Wait for Windows to drop all removed disks. + assert!(wait_until(Duration::from_secs(5), || windows_guest + .disk_count() + == 1 + && disk_ctrl_threads_count(child.id()) == 1)); // Verify the devices have been removed let disk_num = 1; assert_eq!(windows_guest.disk_count(), disk_num); @@ -8211,9 +8245,12 @@ mod windows { Some(format!("path={disk},readonly=off").as_str()), ); assert!(cmd_success); - thread::sleep(std::time::Duration::new(5, 0)); } + // Wait for Windows to enumerate the re-added disks. 
+ assert!(wait_until(Duration::from_secs(5), || { + windows_guest.disk_count() == 4 && disk_ctrl_threads_count(child.id()) == 4 + })); // Check the files exists with the expected contents for it in &disk_test_data { let fname = it[2].as_str(); @@ -8273,7 +8310,7 @@ mod windows { let r = std::panic::catch_unwind(|| { // Wait to make sure Windows boots up - assert!(windows_guest.wait_for_boot()); + windows_guest.wait_for_boot().unwrap(); let netdev_num = 3; assert_eq!(windows_guest.netdev_count(), netdev_num); From d54a7d27a1995c4156f323ef4aa6f39784fc0a5a Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Fri, 10 Apr 2026 13:40:06 +0200 Subject: [PATCH 542/742] tests: Reduce memory usage in integration tests This reduces pressure on CI and enables to run more tests locally on developer machines (with 16GB of RAM or less). No functional changes. On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- cloud-hypervisor/tests/integration.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 82b1076017..82c24a4f8f 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -5069,7 +5069,7 @@ mod common_parallel { let mut cmd = GuestCommand::new(&guest); cmd.args(["--cpus", "boot=4"]) - .args(["--memory", "size=4G"]) + .args(["--memory", "size=1G"]) .args(["--kernel", fw_path(FwType::RustHypervisorFirmware).as_str()]) .default_disks() .args(["--net", guest.default_net_string().as_str()]) @@ -5117,7 +5117,7 @@ mod common_parallel { let mut cmd = GuestCommand::new(&guest); cmd.args(["--cpus", "boot=4"]) - .args(["--memory", "size=4G"]) + .args(["--memory", "size=1G"]) .args(["--kernel", fw_path(FwType::RustHypervisorFirmware).as_str()]) .default_disks() .args(["--net", guest.default_net_string().as_str()]) @@ -8422,7 +8422,7 @@ mod vfio { let mut child = GuestCommand::new(&guest) .args(["--cpus", 
"boot=4"]) - .args(["--memory", "size=4G"]) + .args(["--memory", "size=1G"]) .args(["--kernel", fw_path(FwType::RustHypervisorFirmware).as_str()]) .args(["--platform", &platform_cfg(iommufd)]) .args(["--api-socket", &api_socket]) @@ -8474,7 +8474,7 @@ mod vfio { let mut child = GuestCommand::new(&guest) .args(["--cpus", "boot=4"]) - .args(["--memory", "size=4G"]) + .args(["--memory", "size=1G"]) .args(["--platform", &platform_cfg(iommufd)]) .args(["--kernel", fw_path(FwType::RustHypervisorFirmware).as_str()]) .args([ @@ -8528,7 +8528,7 @@ mod vfio { let mut child = GuestCommand::new(&guest) .args(["--cpus", "boot=4"]) - .args(["--memory", "size=4G"]) + .args(["--memory", "size=1G"]) .args(["--kernel", fw_path(FwType::RustHypervisorFirmware).as_str()]) .args(["--device", format!("path={NVIDIA_VFIO_DEVICE}").as_str()]) .args(["--platform", &platform]) @@ -10713,7 +10713,7 @@ mod rate_limiter { let mut child = GuestCommand::new(&guest) .args(["--cpus", &format!("boot={}", num_queues / 2)]) - .args(["--memory", "size=4G"]) + .args(["--memory", "size=1G"]) .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .default_disks() @@ -10789,7 +10789,7 @@ mod rate_limiter { let mut child = GuestCommand::new(&guest) .args(["--cpus", &format!("boot={num_queues}")]) - .args(["--memory", "size=4G"]) + .args(["--memory", "size=1G"]) .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .args([ @@ -10899,7 +10899,7 @@ mod rate_limiter { let mut child = GuestCommand::new(&guest) .args(["--cpus", &format!("boot={}", num_queues * num_disks)]) - .args(["--memory", "size=4G"]) + .args(["--memory", "size=1G"]) .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .args(["--rate-limit-group", &rate_limit_group_arg]) From df58e814ebbd9e79c64bd732be078224c221e795 Mon Sep 17 00:00:00 2001 From: Zhiheng Tao Date: 
Tue, 14 Apr 2026 14:12:59 +0800 Subject: [PATCH 543/742] vmm: fix UFFDIO_WAKE and UFFD_FEATURE_MISSING_HUGETLBFS UFFDIO_WAKE was 0x4010_aa02 (_IOW) but should be 0x8010_aa02, causing every wake call to silently fail with -EINVAL. UFFD_FEATURE_MISSING_HUGETLBFS was (1<<6) but should be (1<<4), colliding with UFFD_FEATURE_EVENT_UNMAP. Signed-off-by: Zhiheng Tao --- vmm/src/userfaultfd.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vmm/src/userfaultfd.rs b/vmm/src/userfaultfd.rs index 3f8447a327..f79ef52322 100644 --- a/vmm/src/userfaultfd.rs +++ b/vmm/src/userfaultfd.rs @@ -6,7 +6,7 @@ pub const UFFDIO_API: u64 = 0xc018_aa3f; // _IOWR(0xAA, 0x3F, struct uffdio_api) pub const UFFDIO_REGISTER: u64 = 0xc020_aa00; // _IOWR(0xAA, 0x00, struct uffdio_register) pub const UFFDIO_COPY: u64 = 0xc028_aa03; // _IOWR(0xAA, 0x03, struct uffdio_copy) -pub const UFFDIO_WAKE: u64 = 0x4010_aa02; // _IOW(0xAA, 0x02, struct uffdio_range) +pub const UFFDIO_WAKE: u64 = 0x8010_aa02; // _IOR(0xAA, 0x02, struct uffdio_range) // Seccomp compares these as Dword (u32); ensure they fit. const _: () = assert!(UFFDIO_API <= u32::MAX as u64); @@ -17,8 +17,8 @@ const _: () = assert!(UFFDIO_WAKE <= u32::MAX as u64); pub const UFFD_API: u64 = 0xAA; pub const UFFDIO_REGISTER_MODE_MISSING: u64 = 1; pub const UFFD_EVENT_PAGEFAULT: u8 = 0x12; +pub const UFFD_FEATURE_MISSING_HUGETLBFS: u64 = 1 << 4; pub const UFFD_FEATURE_MISSING_SHMEM: u64 = 1 << 5; -pub const UFFD_FEATURE_MISSING_HUGETLBFS: u64 = 1 << 6; const _UFFDIO_COPY: u64 = 0x03; const _UFFDIO_WAKE: u64 = 0x02; From 3e3008f3657cc7a2db3da75977a91f7e0d7e92c9 Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Tue, 14 Apr 2026 08:56:01 +0200 Subject: [PATCH 544/742] virtio-devices: signal activated queue eventfds on resume A restored virtqueue can already contain pending descriptors when the VM resumes. Before this change, the worker thread was unparked and then waited for a fresh queue eventfd signal. 
That is normally fine, but not when the queue was already non-empty at snapshot time. The virtqueue state lives in guest memory and is restored, but the original host-side queue eventfd signal is not persistent snapshot state. If the guest already notified the queue before the snapshot, it may not notify it again after resume. That can leave the worker idle while the guest is still waiting for the pending request to complete. In one observed case, this stalled a virtio-blk flush during early boot after snapshot/restore. We mitigate this in the shared `VirtioCommon` resume path. `VirtioCommon` retains cloned queue eventfds for activated virtqueues and signals each of them once on resume after unparking the worker threads. Keep virtio-net on its existing special-case path: it resumes worker threads without signaling queue eventfds so the `driver_awake` workaround remains intact until the guest performs a real notify. On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- virtio-devices/src/device.rs | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/virtio-devices/src/device.rs b/virtio-devices/src/device.rs index f0673f5614..96a40a4dad 100644 --- a/virtio-devices/src/device.rs +++ b/virtio-devices/src/device.rs @@ -13,6 +13,7 @@ use std::sync::atomic::{AtomicBool, AtomicU8, Ordering}; use std::sync::{Arc, Barrier}; use std::thread; +use anyhow::anyhow; use libc::EFD_NONBLOCK; use log::{error, info, warn}; use virtio_queue::Queue; @@ -215,6 +216,7 @@ pub struct VirtioCommon { pub paused_sync: Option>, pub epoll_threads: Option>>, pub queue_sizes: Vec, + pub queue_evts: Vec, pub device_type: u32, pub min_queues: u16, pub access_platform: Option>, @@ -252,6 +254,21 @@ impl VirtioCommon { return Err(ActivateError::BadActivate); } + // Do not retain virtio-net queue eventfds here. Signaling them on + // resume would break its `driver_awake` workaround. 
+ self.queue_evts = match VirtioDeviceType::from(self.device_type) { + VirtioDeviceType::Net => Vec::new(), + _ => queues + .iter() + .map(|(_, _, queue_evt)| { + queue_evt.try_clone().map_err(|e| { + error!("failed cloning queue EventFd: {e}"); + ActivateError::BadActivate + }) + }) + .collect::, _>>()?, + }; + let kill_evt = EventFd::new(EFD_NONBLOCK).map_err(|e| { error!("failed creating kill EventFd: {e}"); ActivateError::BadActivate @@ -272,6 +289,8 @@ impl VirtioCommon { } pub fn reset(&mut self) -> Option> { + self.queue_evts.clear(); + // We first must resume the virtio thread if it was paused. if self.pause_evt.take().is_some() { self.resume().ok()?; @@ -355,6 +374,16 @@ impl Pausable for VirtioCommon { } } + // Signal each activated queue eventfd so workers process restored queues + // that may already contain pending requests. + for queue_evt in &self.queue_evts { + queue_evt.write(1).map_err(|e| { + MigratableError::Resume(anyhow!( + "Could not notify restored virtio worker on resume: {e}" + )) + })?; + } + Ok(()) } } From 101c2590514ffaf53726aaf2a9a636673b6142b5 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Tue, 14 Apr 2026 12:10:55 +0100 Subject: [PATCH 545/742] virtio-devices: trigger interrupt into guest on resume This will wake up the guest and avoid a livelock situation by ensuring that it will process any pending queues on its side. 
Signed-off-by: Rob Bradford --- virtio-devices/src/device.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/virtio-devices/src/device.rs b/virtio-devices/src/device.rs index 96a40a4dad..0bab703137 100644 --- a/virtio-devices/src/device.rs +++ b/virtio-devices/src/device.rs @@ -384,6 +384,15 @@ impl Pausable for VirtioCommon { })?; } + // Also trigger interrupts into the guest to wake up the driver to avoid a "livelock" + if let Some(interrupt_cb) = &self.interrupt_cb { + for i in 0..self.queue_evts.len() { + interrupt_cb + .trigger(crate::VirtioInterruptType::Queue(i as u16)) + .ok(); + } + } + Ok(()) } } From 72fc0976f113284ee229bc657926d3b712f74e6a Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Tue, 14 Apr 2026 12:14:05 +0100 Subject: [PATCH 546/742] virtio-devices: net: Remove "driver_awake" workaround for restore Now on the generic restore path the worker thread is notified on the events and also the guest is notified via the interrupt. This avoids the same "livelock" situation that required this "driver_awake" workaround when restoring the net device. Signed-off-by: Rob Bradford --- virtio-devices/src/device.rs | 21 ++++++++------------- virtio-devices/src/net.rs | 14 ++------------ 2 files changed, 10 insertions(+), 25 deletions(-) diff --git a/virtio-devices/src/device.rs b/virtio-devices/src/device.rs index 0bab703137..d1b9257995 100644 --- a/virtio-devices/src/device.rs +++ b/virtio-devices/src/device.rs @@ -254,20 +254,15 @@ impl VirtioCommon { return Err(ActivateError::BadActivate); } - // Do not retain virtio-net queue eventfds here. Signaling them on - // resume would break its `driver_awake` workaround. 
- self.queue_evts = match VirtioDeviceType::from(self.device_type) { - VirtioDeviceType::Net => Vec::new(), - _ => queues - .iter() - .map(|(_, _, queue_evt)| { - queue_evt.try_clone().map_err(|e| { - error!("failed cloning queue EventFd: {e}"); - ActivateError::BadActivate - }) + self.queue_evts = queues + .iter() + .map(|(_, _, queue_evt)| { + queue_evt.try_clone().map_err(|e| { + error!("failed cloning queue EventFd: {e}"); + ActivateError::BadActivate }) - .collect::, _>>()?, - }; + }) + .collect::, _>>()?; let kill_evt = EventFd::new(EFD_NONBLOCK).map_err(|e| { error!("failed creating kill EventFd: {e}"); diff --git a/virtio-devices/src/net.rs b/virtio-devices/src/net.rs index 3bb360c646..ed8c05eeb8 100644 --- a/virtio-devices/src/net.rs +++ b/virtio-devices/src/net.rs @@ -174,11 +174,6 @@ struct NetEpollHandler { queue_index_base: u16, queue_pair: (Queue, Queue), queue_evt_pair: (EventFd, EventFd), - // Always generate interrupts until the driver has signalled to the device. - // This mitigates a problem with interrupts from tap events being "lost" upon - // a restore as the vCPU thread isn't ready to handle the interrupt. This causes - // issues when combined with VIRTIO_RING_F_EVENT_IDX interrupt suppression. - driver_awake: bool, device_status: Arc, } @@ -260,7 +255,7 @@ Setting device status to 'NEEDS_RESET' and stopping processing queues until rese return Ok(()); } - if res.map_err(DeviceError::NetQueuePair)? || !self.driver_awake { + if res.map_err(DeviceError::NetQueuePair)? { self.signal_used_queue(self.queue_index_base + 1)?; debug!("Signalling TX queue"); } else { @@ -296,7 +291,7 @@ Setting device status to 'NEEDS_RESET' and stopping processing queues until rese return Ok(()); } - if res.map_err(DeviceError::NetQueuePair)? || !self.driver_awake { + if res.map_err(DeviceError::NetQueuePair)? 
{ self.signal_used_queue(self.queue_index_base)?; debug!("Signalling RX queue"); } else { @@ -361,7 +356,6 @@ impl EpollHelperHandler for NetEpollHandler { let ev_type = event.data as u16; match ev_type { RX_QUEUE_EVENT => { - self.driver_awake = true; self.handle_rx_event().map_err(|e| { EpollHelperError::HandleEvent(anyhow!("Error processing RX queue: {e:?}")) })?; @@ -371,7 +365,6 @@ impl EpollHelperHandler for NetEpollHandler { if let Err(e) = queue_evt.read() { error!("Failed to get tx queue event: {e:?}"); } - self.driver_awake = true; self.handle_tx_event().map_err(|e| { EpollHelperError::HandleEvent(anyhow!("Error processing TX queue: {e:?}")) })?; @@ -428,8 +421,6 @@ impl EpollHelperHandler for NetEpollHandler { "Error from 'rate_limiter.event_handler()': {e:?}" )) })?; - - self.driver_awake = true; self.process_tx().map_err(|e| { EpollHelperError::HandleEvent(anyhow!("Error processing TX queue: {e:?}")) })?; @@ -855,7 +846,6 @@ impl VirtioDevice for Net { interrupt_cb: interrupt_cb.clone(), kill_evt, pause_evt, - driver_awake: false, device_status: self.device_status.clone(), }; From 0a32a9ca9194878585e530c05f77f00c63d9c6a8 Mon Sep 17 00:00:00 2001 From: Shayon Mukherjee Date: Tue, 14 Apr 2026 06:58:59 -0400 Subject: [PATCH 547/742] vmm: add compile-time validation for userfaultfd ioctl constants Cross-check each UFFDIO_* constant against the Linux _IOC(dir, type, nr, size) encoding formula at compile time so that transposed direction bits or struct sizes are caught immediately rather than silently producing wrong ioctl numbers at runtime. 
Signed-off-by: Shayon Mukherjee --- vmm/src/userfaultfd.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/vmm/src/userfaultfd.rs b/vmm/src/userfaultfd.rs index f79ef52322..bbefb16bc6 100644 --- a/vmm/src/userfaultfd.rs +++ b/vmm/src/userfaultfd.rs @@ -8,6 +8,18 @@ pub const UFFDIO_REGISTER: u64 = 0xc020_aa00; // _IOWR(0xAA, 0x00, struct uffdio pub const UFFDIO_COPY: u64 = 0xc028_aa03; // _IOWR(0xAA, 0x03, struct uffdio_copy) pub const UFFDIO_WAKE: u64 = 0x8010_aa02; // _IOR(0xAA, 0x02, struct uffdio_range) +// Validate ioctl encoding against the _IO{R,W,WR}(type, nr, size) formula so +// transposed direction bits or sizes are caught at compile time. +const fn ioctl_ioc(dir: u64, typ: u64, nr: u64, size: u64) -> u64 { + (dir << 30) | (size << 16) | (typ << 8) | nr +} +const IOC_READ: u64 = 2; +const IOC_READWRITE: u64 = 3; +const _: () = assert!(UFFDIO_API == ioctl_ioc(IOC_READWRITE, 0xAA, 0x3F, 24)); +const _: () = assert!(UFFDIO_REGISTER == ioctl_ioc(IOC_READWRITE, 0xAA, 0x00, 32)); +const _: () = assert!(UFFDIO_COPY == ioctl_ioc(IOC_READWRITE, 0xAA, 0x03, 40)); +const _: () = assert!(UFFDIO_WAKE == ioctl_ioc(IOC_READ, 0xAA, 0x02, 16)); + // Seccomp compares these as Dword (u32); ensure they fit. const _: () = assert!(UFFDIO_API <= u32::MAX as u64); const _: () = assert!(UFFDIO_REGISTER <= u32::MAX as u64); From 1268539b26b42c45af4f591902f429a91dca00fb Mon Sep 17 00:00:00 2001 From: Nguyen Dinh Phi Date: Wed, 8 Apr 2026 23:11:27 +0800 Subject: [PATCH 548/742] main: remove api socket path when start_vmm fails Previously the UNIX socket file was only removed on the success path (start_vmm returned Ok(Some(path))). If start_vmm failed after the HTTP API had bound a path-based socket, the file could be left on disk. Parse --api-socket in parse_api_socket(), call start_vmm with the result, then unlink the path in main after start_vmm returns for both success and failure (fd= mode unchanged: no path to remove). 
Signed-off-by: Nguyen Dinh Phi --- cloud-hypervisor/src/main.rs | 80 ++++++++++++++++++++++-------------- 1 file changed, 49 insertions(+), 31 deletions(-) diff --git a/cloud-hypervisor/src/main.rs b/cloud-hypervisor/src/main.rs index 415e7ed922..06ef1fe30f 100644 --- a/cloud-hypervisor/src/main.rs +++ b/cloud-hypervisor/src/main.rs @@ -504,7 +504,37 @@ fn create_app(default_vcpus: String, default_memory: String, default_rng: String .args(args) } -fn start_vmm(cmd_arguments: &ArgMatches) -> Result, Error> { +fn parse_api_socket(cmd_arguments: &ArgMatches) -> Result<(Option, Option), Error> { + if let Some(socket_config) = cmd_arguments.get_one::("api-socket") { + let mut parser = OptionParser::new(); + parser.add("path").add("fd"); + parser.parse(socket_config).unwrap_or_default(); + + if let Some(fd) = parser.get("fd") { + Ok(( + None, + Some(fd.parse::().map_err(Error::ParsingApiSocket)?), + )) + } else if let Some(path) = parser.get("path") { + Ok((Some(path), None)) + } else { + Ok(( + cmd_arguments + .get_one::("api-socket") + .map(|s| s.to_string()), + None, + )) + } + } else { + Ok((None, None)) + } +} + +fn start_vmm( + cmd_arguments: &ArgMatches, + api_socket_path: &Option, + api_socket_fd: Option, +) -> Result<(), Error> { let log_level = match cmd_arguments.get_count("v") { 0 => LevelFilter::Warn, 1 => LevelFilter::Info, @@ -527,31 +557,6 @@ fn start_vmm(cmd_arguments: &ArgMatches) -> Result, Error> { .map(|()| log::set_max_level(log_level)) .map_err(Error::LoggerSetup)?; - let (api_socket_path, api_socket_fd) = - if let Some(socket_config) = cmd_arguments.get_one::("api-socket") { - let mut parser = OptionParser::new(); - parser.add("path").add("fd"); - parser.parse(socket_config).unwrap_or_default(); - - if let Some(fd) = parser.get("fd") { - ( - None, - Some(fd.parse::().map_err(Error::ParsingApiSocket)?), - ) - } else if let Some(path) = parser.get("path") { - (Some(path), None) - } else { - ( - cmd_arguments - .get_one::("api-socket") - .map(|s| 
s.to_string()), - None, - ) - } - } else { - (None, None) - }; - let (api_request_sender, api_request_receiver) = channel(); let api_evt = EventFd::new(EFD_NONBLOCK).map_err(Error::CreateApiEventFd)?; @@ -712,7 +717,7 @@ fn start_vmm(cmd_arguments: &ArgMatches) -> Result, Error> { let vmm_thread_handle = vmm::start_vmm_thread( vmm::VmmVersionInfo::new(env!("BUILD_VERSION"), env!("CARGO_PKG_VERSION")), - &api_socket_path, + api_socket_path, api_socket_fd, #[cfg(feature = "dbus_api")] dbus_options, @@ -798,7 +803,7 @@ fn start_vmm(cmd_arguments: &ArgMatches) -> Result, Error> { dbus_api_graceful_shutdown(chs); } - r.map(|_| api_socket_path) + r } // This is a best-effort solution to the latency induced by the RCU @@ -904,9 +909,22 @@ fn main() { warn!("Error expanding FD table: {e}"); } - let exit_code = match start_vmm(&cmd_arguments) { - Ok(path) => { - path.map(|s| std::fs::remove_file(s).ok()); + let (api_socket_path, api_socket_fd) = match parse_api_socket(&cmd_arguments) { + Ok(p) => p, + Err(top_error) => { + cloud_hypervisor::cli_print_error_chain(&top_error, "Cloud Hypervisor", |_, _, _| None); + std::process::exit(1); + } + }; + + let vmm_result = start_vmm(&cmd_arguments, &api_socket_path, api_socket_fd); + + if let Some(ref p) = api_socket_path { + let _ = std::fs::remove_file(p); + } + + let exit_code = match vmm_result { + Ok(()) => { info!("Cloud Hypervisor exited successfully"); 0 } From a8ff2c50afcff47afae04faa3de820b8c76d32b8 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Fri, 3 Apr 2026 17:12:30 +0100 Subject: [PATCH 549/742] option_parser: Introduce parse_subset() tolerating unknown options Refactor parse() into a version that can control whether to tolerate unknown options. This can then be used to then parse a subset of the options. 
Signed-off-by: Rob Bradford --- option_parser/src/lib.rs | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/option_parser/src/lib.rs b/option_parser/src/lib.rs index 699a26252c..72b9783de1 100644 --- a/option_parser/src/lib.rs +++ b/option_parser/src/lib.rs @@ -93,7 +93,7 @@ impl OptionParser { } } - pub fn parse(&mut self, input: &str) -> OptionParserResult<()> { + fn parse_inner(&mut self, input: &str, ignore_unknown: bool) -> OptionParserResult<()> { if input.trim().is_empty() { return Ok(()); } @@ -101,7 +101,11 @@ impl OptionParser { for option in split_commas(input)?.iter() { let parts: Vec<&str> = option.splitn(2, '=').collect(); match self.options.get_mut(parts[0]) { - None => return Err(OptionParserError::UnknownOption(parts[0].to_owned())), + None => { + if !ignore_unknown { + return Err(OptionParserError::UnknownOption(parts[0].to_owned())); + } + } Some(value) => { if value.requires_value { if parts.len() != 2 { @@ -118,6 +122,14 @@ impl OptionParser { Ok(()) } + pub fn parse(&mut self, input: &str) -> OptionParserResult<()> { + self.parse_inner(input, false) + } + + pub fn parse_subset(&mut self, input: &str) -> OptionParserResult<()> { + self.parse_inner(input, true) + } + pub fn add(&mut self, option: &str) -> &mut Self { // Check that option=value has balanced // quotes and brackets iff value does. From 791889cefd62f96c80475b36bf5d1613fa739c35 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Sat, 4 Apr 2026 08:44:11 -0700 Subject: [PATCH 550/742] option_parser: Add support for adding from a slice Add an OptionParser::add_all method that takes a slice of option names and use that to add to the set of parameters that the parser works on. 
Signed-off-by: Rob Bradford --- option_parser/src/lib.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/option_parser/src/lib.rs b/option_parser/src/lib.rs index 72b9783de1..67c12bd055 100644 --- a/option_parser/src/lib.rs +++ b/option_parser/src/lib.rs @@ -148,6 +148,14 @@ impl OptionParser { self } + pub fn add_all(&mut self, options: &[&str]) -> &mut Self { + for option in options { + self.add(option); + } + + self + } + pub fn add_valueless(&mut self, option: &str) -> &mut Self { self.options.insert( option.to_owned(), From d212255073b411db6806f04999b79bb4821ffbb7 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Tue, 14 Apr 2026 07:49:46 +0100 Subject: [PATCH 551/742] option_parser: Add documentation strings Autogenerated with Claude Opus 4.6 and reviewed with human eyes. Signed-off-by: Rob Bradford --- option_parser/src/lib.rs | 94 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) diff --git a/option_parser/src/lib.rs b/option_parser/src/lib.rs index 67c12bd055..85d265f6a0 100644 --- a/option_parser/src/lib.rs +++ b/option_parser/src/lib.rs @@ -3,6 +3,29 @@ // SPDX-License-Identifier: Apache-2.0 // +//! A parser for comma-separated `key=value` option strings. +//! +//! This crate provides [`OptionParser`], which parses strings of the form +//! `"key1=value1,key2=value2,..."` into a set of named options that can then +//! be retrieved and converted to various types. +//! +//! Values may be quoted with `"` to embed commas and other special characters, +//! and brackets `[` `]` are tracked so that list-valued options like +//! `topology=[1,2,3]` are not split at inner commas. +//! +//! # Example +//! +//! ``` +//! use option_parser::OptionParser; +//! +//! let mut parser = OptionParser::new(); +//! parser.add("size").add("mergeable"); +//! parser.parse("size=128M,mergeable=on").unwrap(); +//! +//! assert_eq!(parser.get("size"), Some("128M".to_owned())); +//! assert_eq!(parser.get("mergeable"), Some("on".to_owned())); +//! 
``` + use std::collections::HashMap; use std::fmt::{Display, Write}; use std::num::ParseIntError; @@ -27,6 +50,12 @@ mod private_trait { } use private_trait::Parseable; +/// A parser for comma-separated `key=value` option strings. +/// +/// Options must be registered with [`add`](Self::add) or +/// [`add_valueless`](Self::add_valueless) before parsing. After calling +/// [`parse`](Self::parse), values can be retrieved with [`get`](Self::get) +/// or converted to a specific type with [`convert`](Self::convert). #[derive(Default)] pub struct OptionParser { options: HashMap, @@ -37,14 +66,19 @@ struct OptionParserValue { requires_value: bool, } +/// Errors returned when parsing or converting options. #[derive(Debug, Error)] pub enum OptionParserError { + /// An option name was not previously registered with [`OptionParser::add`]. #[error("unknown option: {0}")] UnknownOption(String), + /// The input string has invalid syntax (unbalanced quotes/brackets, missing `=`). #[error("unknown option: {0}")] InvalidSyntax(String), + /// A value could not be converted to the requested type. #[error("unable to convert {1} for {0}")] Conversion(String /* field */, String /* value */), + /// A value was syntactically valid but semantically wrong. #[error("invalid value: {0}")] InvalidValue(String), } @@ -87,6 +121,7 @@ fn split_commas(s: &str) -> OptionParserResult> { } impl OptionParser { + /// Creates an empty `OptionParser` with no registered options. pub fn new() -> Self { Self { options: HashMap::new(), @@ -122,14 +157,30 @@ impl OptionParser { Ok(()) } + /// Parses a comma-separated `key=value` string, updating registered options. + /// + /// Returns an error if the input contains an unknown option name, has + /// unbalanced quotes or brackets, or a value-requiring option lacks `=`. pub fn parse(&mut self, input: &str) -> OptionParserResult<()> { self.parse_inner(input, false) } + /// Like [`parse`](Self::parse), but silently ignores unknown option names. 
+ /// + /// This is useful when multiple parsers share the same input string and + /// each only cares about a subset of the options. pub fn parse_subset(&mut self, input: &str) -> OptionParserResult<()> { self.parse_inner(input, true) } + /// Registers a named option that requires a value (i.e. `key=value`). + /// + /// Option names must not contain `"`, `[`, `]`, `=`, or `,`. + /// Returns `&mut Self` for chaining. + /// + /// # Panics + /// + /// Panics if the option name contains a forbidden character. pub fn add(&mut self, option: &str) -> &mut Self { // Check that option=value has balanced // quotes and brackets iff value does. @@ -148,6 +199,9 @@ impl OptionParser { self } + /// Registers multiple value-requiring options at once. + /// + /// Equivalent to calling [`add`](Self::add) for each element in the slice. pub fn add_all(&mut self, options: &[&str]) -> &mut Self { for option in options { self.add(option); @@ -156,6 +210,10 @@ impl OptionParser { self } + /// Registers a flag-style option that does not take a value. + /// + /// When this option appears in the input string (without `=`), it is + /// marked as set. Use [`is_set`](Self::is_set) to query it. pub fn add_valueless(&mut self, option: &str) -> &mut Self { self.options.insert( option.to_owned(), @@ -168,6 +226,10 @@ impl OptionParser { self } + /// Returns the raw string value of an option, or `None` if the option was + /// not set or if its value is an empty string (e.g. `key=`). + /// + /// Surrounding double-quotes in the value are removed. pub fn get(&self, option: &str) -> Option { self.options .get(option) @@ -181,6 +243,9 @@ impl OptionParser { }) } + /// Returns `true` if the option was present in the parsed input. + /// + /// This works for both value-requiring and valueless options. pub fn is_set(&self, option: &str) -> bool { self.options .get(option) @@ -188,6 +253,14 @@ impl OptionParser { .is_some() } + /// Retrieves and converts an option value to type `T`. 
+ /// + /// Returns `Ok(None)` if the option was not set or its value is empty. + /// Returns `Err` if the value cannot be converted to `T`. + /// + /// `T` can be any type that implements `FromStr` (e.g. `u32`, `String`), + /// or one of this crate's types such as [`Toggle`], [`IntegerList`], + /// [`Tuple`], or [`StringList`]. pub fn convert(&self, option: &str) -> OptionParserResult> { match self.options.get(option).and_then(|v| v.value.as_ref()) { None => Ok(None), @@ -204,6 +277,9 @@ impl OptionParser { } } +/// A boolean-like value that accepts `"on"`, `"true"`, `"off"`, `"false"`, or `""`. +/// +/// An empty string is treated as `false`. pub struct Toggle(pub bool); #[derive(Error, Debug)] @@ -227,6 +303,10 @@ impl Parseable for Toggle { } } +/// A byte size parsed from a human-readable string with optional `K`, `M`, or `G` suffix. +/// +/// The suffix is binary (1K = 1024, 1M = 1048576, 1G = 1073741824). +/// A bare integer is treated as bytes. pub struct ByteSized(pub u64); #[derive(Error, Debug)] @@ -259,6 +339,9 @@ impl FromStr for ByteSized { } } +/// A list of integers parsed from a bracket-enclosed, comma-separated string. +/// +/// Ranges are supported with `-`: `"[0,2-4,6]"` produces `[0, 2, 3, 4, 6]`. pub struct IntegerList(pub Vec); impl Display for IntegerList { @@ -324,7 +407,11 @@ impl Parseable for IntegerList { } } +/// Types that can appear as the second element of a [`Tuple`] pair. +/// +/// Implemented for `u64`, `Vec`, `Vec`, and `Vec`. pub trait TupleValue { + /// Parses the value portion of a `key@value` tuple element. fn parse_value(input: &str) -> Result where Self: Sized; @@ -366,6 +453,10 @@ impl TupleValue for Vec { } } +/// A list of `key@value` pairs parsed from a bracket-enclosed string. +/// +/// The format is `[key1@value1,key2@value2,...]` where `@` separates each +/// pair's elements. `S` is the key type and `T` is the value type. 
#[derive(PartialEq, Eq, Debug)] pub struct Tuple(pub Vec<(S, T)>); @@ -421,6 +512,9 @@ impl Parseable for Tuple { } } +/// A list of strings parsed from a bracket-enclosed, comma-separated string. +/// +/// The format is `[str1,str2,...]`. Brackets are optional. #[derive(Default)] pub struct StringList(pub Vec); From e5496a093d50242144138d94b5022da6a31cfc84 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Tue, 14 Apr 2026 07:52:07 +0100 Subject: [PATCH 552/742] option_parser: Fix incorrect error message The error message for the InvalidSyntax was copied from UnknownOption. Correct it to "invalid syntax". Signed-off-by: Rob Bradford --- option_parser/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/option_parser/src/lib.rs b/option_parser/src/lib.rs index 85d265f6a0..e674507d4d 100644 --- a/option_parser/src/lib.rs +++ b/option_parser/src/lib.rs @@ -73,7 +73,7 @@ pub enum OptionParserError { #[error("unknown option: {0}")] UnknownOption(String), /// The input string has invalid syntax (unbalanced quotes/brackets, missing `=`). - #[error("unknown option: {0}")] + #[error("invalid syntax: {0}")] InvalidSyntax(String), /// A value could not be converted to the requested type. #[error("unable to convert {1} for {0}")] From d93770c11dc63d239f0e4f9af3256104aad7adbe Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Tue, 14 Apr 2026 08:03:39 +0100 Subject: [PATCH 553/742] option_parser: Fill out unit testing Add unit tests generated with Claude Opus 4.6 and reviewed by human eyes. 
Signed-off-by: Rob Bradford --- option_parser/src/lib.rs | 230 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 230 insertions(+) diff --git a/option_parser/src/lib.rs b/option_parser/src/lib.rs index e674507d4d..e57cd895e2 100644 --- a/option_parser/src/lib.rs +++ b/option_parser/src/lib.rs @@ -647,4 +647,234 @@ mod unit_tests { fn check_dequote() { assert_eq!(dequote("a\u{3b2}\"a\"\"\""), "a\u{3b2}a\""); } + + #[test] + fn test_empty_input() { + let mut parser = OptionParser::new(); + parser.add("foo"); + parser.parse("").unwrap(); + parser.parse(" ").unwrap(); + assert!(!parser.is_set("foo")); + } + + #[test] + fn test_parse_subset_ignores_unknown() { + let mut parser = OptionParser::new(); + parser.add("known"); + parser.parse_subset("known=val,unknown=other").unwrap(); + assert_eq!(parser.get("known"), Some("val".to_owned())); + assert!(!parser.is_set("unknown")); + } + + #[test] + fn test_add_all() { + let mut parser = OptionParser::new(); + parser.add_all(&["a", "b", "c"]); + parser.parse("a=1,b=2,c=3").unwrap(); + assert_eq!(parser.get("a"), Some("1".to_owned())); + assert_eq!(parser.get("b"), Some("2".to_owned())); + assert_eq!(parser.get("c"), Some("3".to_owned())); + } + + #[test] + fn test_add_valueless() { + let mut parser = OptionParser::new(); + parser.add_valueless("readonly"); + parser.add("path"); + parser.parse("path=/dev/sda,readonly").unwrap(); + assert!(parser.is_set("readonly")); + assert_eq!(parser.get("readonly"), None); + assert_eq!(parser.get("path"), Some("/dev/sda".to_owned())); + } + + #[test] + fn test_convert_integer() { + let mut parser = OptionParser::new(); + parser.add("count"); + parser.parse("count=42").unwrap(); + assert_eq!(parser.convert::("count").unwrap(), Some(42)); + assert_eq!(parser.convert::("count").unwrap(), Some(42)); + } + + #[test] + fn test_convert_unset_returns_none() { + let mut parser = OptionParser::new(); + parser.add("count"); + assert_eq!(parser.convert::("count").unwrap(), None); + } + + #[test] + 
fn test_convert_invalid_returns_error() { + let mut parser = OptionParser::new(); + parser.add("count"); + parser.parse("count=notanumber").unwrap(); + parser.convert::("count").unwrap_err(); + } + + #[test] + fn test_toggle() { + for (input, expected) in [ + ("on", true), + ("off", false), + ("true", true), + ("false", false), + ("ON", true), + ("OFF", false), + ("True", true), + ("False", false), + ] { + let mut parser = OptionParser::new(); + parser.add("flag"); + parser.parse(&format!("flag={input}")).unwrap(); + let toggle = parser.convert::("flag").unwrap().unwrap(); + assert_eq!(toggle.0, expected, "Toggle({input}) should be {expected}"); + } + } + + #[test] + fn test_toggle_invalid() { + let mut parser = OptionParser::new(); + parser.add("flag"); + parser.parse("flag=maybe").unwrap(); + assert!(parser.convert::("flag").is_err()); + } + + #[test] + fn test_byte_sized() { + let cases = [ + ("1024", 1024u64), + ("1K", 1024), + ("2M", 2 * 1024 * 1024), + ("4G", 4 * 1024 * 1024 * 1024), + ("0K", 0), + ]; + for (input, expected) in cases { + let mut parser = OptionParser::new(); + parser.add("size"); + parser.parse(&format!("size={input}")).unwrap(); + let bs = parser.convert::("size").unwrap().unwrap(); + assert_eq!(bs.0, expected, "ByteSized({input}) should be {expected}"); + } + } + + #[test] + fn test_byte_sized_invalid() { + assert!("xyzK".parse::().is_err()); + assert!("".parse::().is_err()); + } + + #[test] + fn test_integer_list_single_values() { + let list = IntegerList::from_str("[1,3,5]").unwrap(); + assert_eq!(list.0, vec![1, 3, 5]); + } + + #[test] + fn test_integer_list_ranges() { + let list = IntegerList::from_str("[0,2-4,7]").unwrap(); + assert_eq!(list.0, vec![0, 2, 3, 4, 7]); + } + + #[test] + fn test_integer_list_invalid_range() { + assert!(IntegerList::from_str("[5-3]").is_err()); + assert!(IntegerList::from_str("[5-5]").is_err()); + } + + #[test] + fn test_integer_list_too_many_dashes() { + assert!(IntegerList::from_str("[1-2-3]").is_err()); 
+ } + + #[test] + fn test_integer_list_display() { + let list = IntegerList(vec![1, 2, 3]); + assert_eq!(format!("{list}"), "[1,2,3]"); + + let empty = IntegerList(vec![]); + assert_eq!(format!("{empty}"), "[]"); + + let single = IntegerList(vec![42]); + assert_eq!(format!("{single}"), "[42]"); + } + + #[test] + fn test_string_list() { + let list = StringList::from_str("[foo,bar,baz]").unwrap(); + assert_eq!(list.0, vec!["foo", "bar", "baz"]); + } + + #[test] + fn test_string_list_no_brackets() { + let list = StringList::from_str("foo,bar").unwrap(); + assert_eq!(list.0, vec!["foo", "bar"]); + } + + #[test] + fn test_tuple_single_pair() { + let t = Tuple::::from_str("[foo@42]").unwrap(); + assert_eq!(t, Tuple(vec![("foo".to_owned(), 42)])); + } + + #[test] + fn test_tuple_multiple_pairs() { + let t = Tuple::>::from_str("[a@[1,2],b@[3,4]]").unwrap(); + assert_eq!( + t, + Tuple(vec![ + ("a".to_owned(), vec![1, 2]), + ("b".to_owned(), vec![3, 4]), + ]) + ); + } + + #[test] + fn test_tuple_missing_at_separator() { + Tuple::::from_str("[foo42]").unwrap_err(); + } + + #[test] + fn test_tuple_missing_brackets() { + Tuple::::from_str("foo@42").unwrap_err(); + } + + #[test] + fn test_split_commas_unbalanced_bracket() { + split_commas("[a,b").unwrap_err(); + split_commas("a]").unwrap_err(); + } + + #[test] + fn test_split_commas_unbalanced_quote() { + split_commas("\"abc").unwrap_err(); + } + + #[test] + fn test_quoted_value_with_commas() { + let mut parser = OptionParser::new(); + parser.add("cmd"); + parser.parse("cmd=\"a,b,c\"").unwrap(); + assert_eq!(parser.get("cmd"), Some("a,b,c".to_owned())); + } + + #[test] + #[should_panic(expected = "forbidden character")] + fn test_add_option_with_equals() { + let mut parser = OptionParser::new(); + parser.add("bad=name"); + } + + #[test] + #[should_panic(expected = "forbidden character")] + fn test_add_option_with_comma() { + let mut parser = OptionParser::new(); + parser.add("bad,name"); + } + + #[test] + #[should_panic(expected 
= "forbidden character")] + fn test_add_option_with_bracket() { + let mut parser = OptionParser::new(); + parser.add("bad[name"); + } } From 2148f2e0bc69b7563358029b7ee8db94aa7720f0 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Tue, 14 Apr 2026 08:25:17 +0100 Subject: [PATCH 554/742] option_parser: Fix incorrect unit test This unit test was trying to test with extra "="s in the input but was instead testing using an unknown option. Add the option to the parser to not hit that incorrect error. Signed-off-by: Rob Bradford --- option_parser/src/lib.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/option_parser/src/lib.rs b/option_parser/src/lib.rs index e57cd895e2..6be4bcbb1a 100644 --- a/option_parser/src/lib.rs +++ b/option_parser/src/lib.rs @@ -588,11 +588,13 @@ mod unit_tests { assert_eq!(split_commas("\"\"").unwrap(), vec!["\"\""]); parser.parse("size=128M,hanging_param").unwrap_err(); - parser - .parse("size=128M,too_many_equals=foo=bar") - .unwrap_err(); parser.parse("size=128M,file=/dev/shm").unwrap_err(); + // Equals signs within a value are fine (splitn(2, '=') keeps them) + parser.add("extra"); + parser.parse("extra=foo=bar").unwrap(); + assert_eq!(parser.get("extra"), Some("foo=bar".to_owned())); + parser.parse("size=128M").unwrap(); assert_eq!(parser.get("size"), Some("128M".to_owned())); assert!(!parser.is_set("mergeable")); From 4f7ff8fe48bd7c6eb2d257920873378c71fdfa8e Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Fri, 3 Apr 2026 16:46:46 +0100 Subject: [PATCH 555/742] vmm: Introduce a PciDeviceCommonConfig struct Introduce a common struct that can encompass all the config fields used by devices that are PCI based. The use of `skip_serializing_if` means that the iommu field will only be included if set (otherwise falling back to default false). This neatly handles the devices that don't support an iommu. 
Signed-off-by: Rob Bradford --- vmm/src/vm_config.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/vmm/src/vm_config.rs b/vmm/src/vm_config.rs index 541f6f21b0..1b20d76da6 100644 --- a/vmm/src/vm_config.rs +++ b/vmm/src/vm_config.rs @@ -273,6 +273,16 @@ pub struct VirtQueueAffinity { pub host_cpus: Vec, } +#[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize, Default)] +pub struct PciDeviceCommonConfig { + #[serde(default)] + pub id: Option, + #[serde(default, skip_serializing_if = "<&bool as std::ops::Not>::not")] + pub iommu: bool, + #[serde(default)] + pub pci_segment: u16, +} + #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] pub struct DiskConfig { pub path: Option, From c66c2b847053bcc22f1c367d59914536b69903c8 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Fri, 3 Apr 2026 17:15:54 +0100 Subject: [PATCH 556/742] vmm: config: Implement PciDeviceCommonConfig::parse This parses a subset of the device configuration options used for devices that are PCI based. 
Signed-off-by: Rob Bradford --- vmm/src/config.rs | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/vmm/src/config.rs b/vmm/src/config.rs index b2ae29a84f..2cf80191d7 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -209,6 +209,8 @@ pub enum Error { /// Failed Parsing FwCfgItem config #[error("Error parsing --fw-cfg-config items")] ParseFwCfgItem(#[source] OptionParserError), + #[error("Error parsing common PCI device config")] + ParsePciDeviceCommonConfig(#[source] OptionParserError), } #[derive(Debug, PartialEq, Eq, Error)] @@ -1197,6 +1199,35 @@ impl RateLimiterGroupConfig { } } +impl PciDeviceCommonConfig { + pub fn parse(input: &str) -> Result { + let mut parser = OptionParser::new(); + + parser.add("id").add("iommu").add("pci_segment"); + + parser + .parse_subset(input) + .map_err(Error::ParsePciDeviceCommonConfig)?; + + let id = parser.get("id"); + let iommu = parser + .convert::("iommu") + .map_err(Error::ParsePciDeviceCommonConfig)? + .unwrap_or(Toggle(false)) + .0; + let pci_segment = parser + .convert("pci_segment") + .map_err(Error::ParsePciDeviceCommonConfig)? + .unwrap_or_default(); + + Ok(Self { + id, + iommu, + pci_segment, + }) + } +} + impl DiskConfig { pub const SYNTAX: &'static str = "Disk parameters \ \"path=,readonly=on|off,direct=on|off,iommu=on|off,\ From a96da9d4bc8e82f25a673439d5320ec0638c94ce Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Fri, 3 Apr 2026 11:16:24 -0700 Subject: [PATCH 557/742] vmm: Introduce PciDeviceCommonConfig::validate() Implement some common PCI segment validation. This can be used to reduce duplication across the different validation methods. 
Signed-off-by: Rob Bradford --- vmm/src/config.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/vmm/src/config.rs b/vmm/src/config.rs index 2cf80191d7..6c399dbdd9 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -1226,6 +1226,23 @@ impl PciDeviceCommonConfig { pci_segment, }) } + + pub fn validate(&self, vm_config: &VmConfig) -> ValidationResult<()> { + if let Some(platform_config) = vm_config.platform.as_ref() { + if self.pci_segment >= platform_config.num_pci_segments { + return Err(ValidationError::InvalidPciSegment(self.pci_segment)); + } + + if let Some(iommu_segments) = platform_config.iommu_segments.as_ref() + && iommu_segments.contains(&self.pci_segment) + && !self.iommu + { + return Err(ValidationError::OnIommuSegment(self.pci_segment)); + } + } + + Ok(()) + } } impl DiskConfig { From d2ce7667bc9db5f81bfbd4bb0217d846fca9bfed Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Fri, 3 Apr 2026 11:18:09 -0700 Subject: [PATCH 558/742] vmm: config: Switch DiskConfig to use PciDeviceCommonConfig Switch DiskConfig over to using the newly extracted struct members as used by all PCI based devices. The use of #[serde(flatten)] means that this change has no impact on the JSON format that the data is stored as. Signed-off-by: Rob Bradford --- vmm/src/config.rs | 77 ++++++++++++++++++--------------------- vmm/src/device_manager.rs | 16 ++++---- vmm/src/vm_config.rs | 8 +--- 3 files changed, 46 insertions(+), 55 deletions(-) diff --git a/vmm/src/config.rs b/vmm/src/config.rs index 6c399dbdd9..9ff05bb5f6 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -1299,11 +1299,6 @@ impl DiskConfig { .map_err(Error::ParseDisk)? .unwrap_or(Toggle(false)) .0; - let iommu = parser - .convert::("iommu") - .map_err(Error::ParseDisk)? - .unwrap_or(Toggle(false)) - .0; let queue_size = parser .convert("queue_size") .map_err(Error::ParseDisk)? 
@@ -1318,7 +1313,6 @@ impl DiskConfig { .unwrap_or(Toggle(false)) .0; let vhost_socket = parser.get("socket"); - let id = parser.get("id"); let disable_io_uring = parser .convert::("_disable_io_uring") .map_err(Error::ParseDisk)? @@ -1329,10 +1323,6 @@ impl DiskConfig { .map_err(Error::ParseDisk)? .unwrap_or(Toggle(false)) .0; - let pci_segment = parser - .convert("pci_segment") - .map_err(Error::ParseDisk)? - .unwrap_or_default(); let rate_limit_group = parser.get("rate_limit_group"); let bw_size = parser .convert("bw_size") @@ -1423,21 +1413,21 @@ impl DiskConfig { .unwrap_or_else(|| Toggle(default_diskconfig_sparse())) .0; + let pci_common = PciDeviceCommonConfig::parse(disk)?; + Ok(DiskConfig { + pci_common, path, readonly, direct, - iommu, num_queues, queue_size, vhost_user, vhost_socket, rate_limit_group, rate_limiter_config, - id, disable_io_uring, disable_aio, - pci_segment, serial, queue_affinity, backing_files, @@ -1448,6 +1438,8 @@ impl DiskConfig { } pub fn validate(&self, vm_config: &VmConfig) -> ValidationResult<()> { + self.pci_common.validate(vm_config)?; + if self.num_queues > vm_config.cpus.boot_vcpus as usize { return Err(ValidationError::TooManyQueues( self.num_queues, @@ -1459,23 +1451,10 @@ impl DiskConfig { return Err(ValidationError::InvalidQueueSize(self.queue_size)); } - if self.vhost_user && self.iommu { + if self.vhost_user && self.pci_common.iommu { return Err(ValidationError::IommuNotSupported); } - if let Some(platform_config) = vm_config.platform.as_ref() { - if self.pci_segment >= platform_config.num_pci_segments { - return Err(ValidationError::InvalidPciSegment(self.pci_segment)); - } - - if let Some(iommu_segments) = platform_config.iommu_segments.as_ref() - && iommu_segments.contains(&self.pci_segment) - && !self.iommu - { - return Err(ValidationError::OnIommuSegment(self.pci_segment)); - } - } - if self.rate_limiter_config.is_some() && self.rate_limit_group.is_some() { return Err(ValidationError::InvalidRateLimiterGroup); } @@ 
-3075,9 +3054,9 @@ impl VmConfig { } disk.validate(self)?; - self.iommu |= disk.iommu; + self.iommu |= disk.pci_common.iommu; - Self::validate_identifier(&mut id_list, &disk.id)?; + Self::validate_identifier(&mut id_list, &disk.pci_common.id)?; } } @@ -3564,7 +3543,7 @@ impl VmConfig { // Remove if disk device if let Some(disks) = self.disks.as_mut() { let len = disks.len(); - disks.retain(|dev| dev.id.as_ref().map(|id| id.as_ref()) != Some(id)); + disks.retain(|dev| dev.pci_common.id.as_ref().map(|id| id.as_ref()) != Some(id)); removed |= disks.len() != len; } @@ -4025,20 +4004,18 @@ mod unit_tests { fn disk_fixture() -> DiskConfig { DiskConfig { + pci_common: PciDeviceCommonConfig::default(), path: Some(PathBuf::from("/path/to_file")), readonly: false, direct: false, - iommu: false, num_queues: 1, queue_size: 128, vhost_user: false, vhost_socket: None, - id: None, disable_io_uring: false, disable_aio: false, rate_limit_group: None, rate_limiter_config: None, - pci_segment: 0, serial: None, queue_affinity: None, backing_files: false, @@ -4057,7 +4034,10 @@ mod unit_tests { assert_eq!( DiskConfig::parse("path=/path/to_file,id=mydisk0")?, DiskConfig { - id: Some("mydisk0".to_owned()), + pci_common: PciDeviceCommonConfig { + id: Some("mydisk0".to_owned()), + ..Default::default() + }, ..disk_fixture() } ); @@ -4074,14 +4054,20 @@ mod unit_tests { assert_eq!( DiskConfig::parse("path=/path/to_file,iommu=on")?, DiskConfig { - iommu: true, + pci_common: PciDeviceCommonConfig { + iommu: true, + ..Default::default() + }, ..disk_fixture() } ); assert_eq!( DiskConfig::parse("path=/path/to_file,iommu=on,queue_size=256")?, DiskConfig { - iommu: true, + pci_common: PciDeviceCommonConfig { + iommu: true, + ..Default::default() + }, queue_size: 256, ..disk_fixture() } @@ -4089,7 +4075,10 @@ mod unit_tests { assert_eq!( DiskConfig::parse("path=/path/to_file,iommu=on,queue_size=256,num_queues=4")?, DiskConfig { - iommu: true, + pci_common: PciDeviceCommonConfig { + iommu: true, + 
..Default::default() + }, queue_size: 256, num_queues: 4, ..disk_fixture() @@ -5326,8 +5315,11 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" ..platform_fixture() }); still_valid_config.disks = Some(vec![DiskConfig { - iommu: true, - pci_segment: 1, + pci_common: PciDeviceCommonConfig { + iommu: true, + pci_segment: 1, + ..Default::default() + }, ..disk_fixture() }]); still_valid_config.validate().unwrap(); @@ -5388,8 +5380,11 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" ..platform_fixture() }); invalid_config.disks = Some(vec![DiskConfig { - iommu: false, - pci_segment: 1, + pci_common: PciDeviceCommonConfig { + iommu: false, + pci_segment: 1, + ..Default::default() + }, ..disk_fixture() }]); assert_eq!( diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 77ef0cc423..9cfbbc4e21 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -2636,11 +2636,11 @@ impl DeviceManager { disk_cfg: &mut DiskConfig, is_hotplug: bool, ) -> DeviceManagerResult { - let id = if let Some(id) = &disk_cfg.id { + let id = if let Some(id) = &disk_cfg.pci_common.id { id.clone() } else { let id = self.next_device_name(DISK_DEVICE_NAME_PREFIX)?; - disk_cfg.id = Some(id.clone()); + disk_cfg.pci_common.id = Some(id.clone()); id }; @@ -2822,7 +2822,7 @@ impl DeviceManager { let bw = rate_limiter_cfg.bandwidth.unwrap_or_default(); let ops = rate_limiter_cfg.ops.unwrap_or_default(); let mut rate_limit_group = RateLimiterGroup::new( - disk_cfg.id.as_ref().unwrap(), + disk_cfg.pci_common.id.as_ref().unwrap(), bw.size, bw.one_time_burst.unwrap_or(0), bw.refill_time, @@ -2865,7 +2865,7 @@ impl DeviceManager { .ok_or(DeviceManagerError::NoDiskPath)? 
.clone(), disk_cfg.readonly, - self.force_iommu | disk_cfg.iommu, + self.force_iommu | disk_cfg.pci_common.iommu, disk_cfg.num_queues, disk_cfg.queue_size, disk_cfg.serial.clone(), @@ -2913,9 +2913,9 @@ impl DeviceManager { Ok(MetaVirtioDevice { virtio_device, - iommu: disk_cfg.iommu, + iommu: disk_cfg.pci_common.iommu, id, - pci_segment: disk_cfg.pci_segment, + pci_segment: disk_cfg.pci_common.pci_segment, dma_handler: None, }) } @@ -5028,9 +5028,9 @@ impl DeviceManager { } pub fn add_disk(&mut self, disk_cfg: &mut DiskConfig) -> DeviceManagerResult { - self.validate_identifier(&disk_cfg.id)?; + self.validate_identifier(&disk_cfg.pci_common.id)?; - if disk_cfg.iommu && !self.is_iommu_segment(disk_cfg.pci_segment) { + if disk_cfg.pci_common.iommu && !self.is_iommu_segment(disk_cfg.pci_common.pci_segment) { return Err(DeviceManagerError::InvalidIommuHotplug); } diff --git a/vmm/src/vm_config.rs b/vmm/src/vm_config.rs index 1b20d76da6..d5465d9245 100644 --- a/vmm/src/vm_config.rs +++ b/vmm/src/vm_config.rs @@ -285,13 +285,13 @@ pub struct PciDeviceCommonConfig { #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] pub struct DiskConfig { + #[serde(flatten)] + pub pci_common: PciDeviceCommonConfig, pub path: Option, #[serde(default)] pub readonly: bool, #[serde(default)] pub direct: bool, - #[serde(default)] - pub iommu: bool, #[serde(default = "default_diskconfig_num_queues")] pub num_queues: usize, #[serde(default = "default_diskconfig_queue_size")] @@ -303,8 +303,6 @@ pub struct DiskConfig { pub rate_limit_group: Option, #[serde(default)] pub rate_limiter_config: Option, - #[serde(default)] - pub id: Option, // For testing use only. Not exposed in API. 
#[serde(default)] pub disable_io_uring: bool, @@ -312,8 +310,6 @@ pub struct DiskConfig { #[serde(default)] pub disable_aio: bool, #[serde(default)] - pub pci_segment: u16, - #[serde(default)] pub serial: Option, #[serde(default)] pub queue_affinity: Option>, From 92c2cf0103e25505b2cf034c97676be7e76c809f Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Fri, 3 Apr 2026 11:49:54 -0700 Subject: [PATCH 559/742] vmm: config: Switch NetConfig to use PciDeviceCommonConfig Switch NetConfig over to using the newly extracted struct members as used by all PCI based devices. The use of #[serde(flatten)] means that this change has no impact on the JSON format that the data is stored as. Signed-off-by: Rob Bradford --- vmm/src/api/http/http_endpoint.rs | 2 +- vmm/src/config.rs | 88 ++++++++++++++++--------------- vmm/src/device_manager.rs | 20 +++---- vmm/src/lib.rs | 4 +- vmm/src/vm_config.rs | 8 +-- 5 files changed, 61 insertions(+), 61 deletions(-) diff --git a/vmm/src/api/http/http_endpoint.rs b/vmm/src/api/http/http_endpoint.rs index 15ebfd2f9e..92b53ac68e 100644 --- a/vmm/src/api/http/http_endpoint.rs +++ b/vmm/src/api/http/http_endpoint.rs @@ -122,7 +122,7 @@ mod fds_helper { impl ConfigWithFDs for NetConfig { fn id(&self) -> Option<&str> { - self.id.as_deref() + self.pci_common.id.as_deref() } fn fds_from_http_body(&self) -> Option<&[RawFd]> { diff --git a/vmm/src/config.rs b/vmm/src/config.rs index 9ff05bb5f6..5e53e8bdac 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -1554,11 +1554,6 @@ impl NetConfig { .unwrap_or(Toggle(true)) .0; let mtu = parser.convert("mtu").map_err(Error::ParseNetwork)?; - let iommu = parser - .convert::("iommu") - .map_err(Error::ParseNetwork)? - .unwrap_or(Toggle(false)) - .0; let queue_size = parser .convert("queue_size") .map_err(Error::ParseNetwork)? @@ -1577,15 +1572,10 @@ impl NetConfig { .convert("vhost_mode") .map_err(Error::ParseNetwork)? 
.unwrap_or_default(); - let id = parser.get("id"); let fds = parser .convert::("fd") .map_err(Error::ParseNetwork)? .map(|v| v.0.iter().map(|e| *e as i32).collect()); - let pci_segment = parser - .convert("pci_segment") - .map_err(Error::ParseNetwork)? - .unwrap_or_default(); let bw_size = parser .convert("bw_size") .map_err(Error::ParseNetwork)? @@ -1637,23 +1627,23 @@ impl NetConfig { None }; + let pci_common = PciDeviceCommonConfig::parse(net)?; + let config = NetConfig { + pci_common, tap, ip, mask, mac, host_mac, mtu, - iommu, num_queues, queue_size, vhost_user, vhost_socket, vhost_mode, - id, fds, rate_limiter_config, - pci_segment, offload_tso, offload_ufo, offload_csum, @@ -1662,6 +1652,8 @@ impl NetConfig { } pub fn validate(&self, vm_config: &VmConfig) -> ValidationResult<()> { + self.pci_common.validate(vm_config)?; + if self.num_queues < 2 { return Err(ValidationError::VnetQueueLowerThan2(self.num_queues)); } @@ -1689,23 +1681,10 @@ impl NetConfig { )); } - if self.vhost_user && self.iommu { + if self.vhost_user && self.pci_common.iommu { return Err(ValidationError::IommuNotSupported); } - if let Some(platform_config) = vm_config.platform.as_ref() { - if self.pci_segment >= platform_config.num_pci_segments { - return Err(ValidationError::InvalidPciSegment(self.pci_segment)); - } - - if let Some(iommu_segments) = platform_config.iommu_segments.as_ref() - && iommu_segments.contains(&self.pci_segment) - && !self.iommu - { - return Err(ValidationError::OnIommuSegment(self.pci_segment)); - } - } - if let Some(mtu) = self.mtu && mtu < virtio_devices::net::MIN_MTU { @@ -2778,6 +2757,7 @@ impl RestoreConfig { for net_fds in vm_config.net.iter().flatten() { if let Some(expected_fds) = &net_fds.fds { let expected_id = net_fds + .pci_common .id .as_ref() .expect("Invalid 'NetConfig' with empty 'id' for VM restore."); @@ -3066,9 +3046,9 @@ impl VmConfig { return Err(ValidationError::VhostUserRequiresSharedMemory); } net.validate(self)?; - self.iommu |= net.iommu; + 
self.iommu |= net.pci_common.iommu; - Self::validate_identifier(&mut id_list, &net.id)?; + Self::validate_identifier(&mut id_list, &net.pci_common.id)?; } } @@ -3564,7 +3544,7 @@ impl VmConfig { // Remove if net device if let Some(net) = self.net.as_mut() { let len = net.len(); - net.retain(|dev| dev.id.as_ref().map(|id| id.as_ref()) != Some(id)); + net.retain(|dev| dev.pci_common.id.as_ref().map(|id| id.as_ref()) != Some(id)); removed |= net.len() != len; } @@ -4148,22 +4128,20 @@ mod unit_tests { fn net_fixture() -> NetConfig { NetConfig { + pci_common: PciDeviceCommonConfig::default(), tap: None, ip: None, mask: None, mac: MacAddr::parse_str("de:ad:be:ef:12:34").unwrap(), host_mac: Some(MacAddr::parse_str("12:34:de:ad:be:ef").unwrap()), mtu: None, - iommu: false, num_queues: 2, queue_size: 256, vhost_user: false, vhost_socket: None, vhost_mode: VhostMode::Client, - id: None, fds: None, rate_limiter_config: None, - pci_segment: 0, offload_tso: true, offload_ufo: true, offload_csum: true, @@ -4181,7 +4159,10 @@ mod unit_tests { assert_eq!( NetConfig::parse("mac=de:ad:be:ef:12:34,host_mac=12:34:de:ad:be:ef,id=mynet0")?, NetConfig { - id: Some("mynet0".to_owned()), + pci_common: PciDeviceCommonConfig { + id: Some("mynet0".to_owned()), + ..Default::default() + }, ..net_fixture() } ); @@ -4214,9 +4195,12 @@ mod unit_tests { "mac=de:ad:be:ef:12:34,host_mac=12:34:de:ad:be:ef,num_queues=4,queue_size=1024,iommu=on" )?, NetConfig { + pci_common: PciDeviceCommonConfig { + iommu: true, + ..Default::default() + }, num_queues: 4, queue_size: 1024, - iommu: true, ..net_fixture() } ); @@ -4840,19 +4824,28 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" preserved_fds: None, net: Some(vec![ NetConfig { - id: Some("net0".to_owned()), + pci_common: PciDeviceCommonConfig { + id: Some("net0".to_owned()), + ..Default::default() + }, num_queues: 2, fds: Some(vec![-1, -1, -1, -1]), ..net_fixture() }, NetConfig { - id: Some("net1".to_owned()), + pci_common: 
PciDeviceCommonConfig { + id: Some("net1".to_owned()), + ..Default::default() + }, num_queues: 1, fds: Some(vec![-1, -1]), ..net_fixture() }, NetConfig { - id: Some("net2".to_owned()), + pci_common: PciDeviceCommonConfig { + id: Some("net2".to_owned()), + ..Default::default() + }, fds: None, ..net_fixture() }, @@ -4947,7 +4940,10 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" resume: false, }; snapshot_vm_config.net = Some(vec![NetConfig { - id: Some("net2".to_owned()), + pci_common: PciDeviceCommonConfig { + id: Some("net2".to_owned()), + ..Default::default() + }, fds: None, ..net_fixture() }]); @@ -5330,8 +5326,11 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" ..platform_fixture() }); still_valid_config.net = Some(vec![NetConfig { - iommu: true, - pci_segment: 1, + pci_common: PciDeviceCommonConfig { + iommu: true, + pci_segment: 1, + ..Default::default() + }, ..net_fixture() }]); still_valid_config.validate().unwrap(); @@ -5398,8 +5397,11 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" ..platform_fixture() }); invalid_config.net = Some(vec![NetConfig { - iommu: false, - pci_segment: 1, + pci_common: PciDeviceCommonConfig { + iommu: false, + pci_segment: 1, + ..Default::default() + }, ..net_fixture() }]); assert_eq!( diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 9cfbbc4e21..9a5bdff56f 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -2937,11 +2937,11 @@ impl DeviceManager { &mut self, net_cfg: &mut NetConfig, ) -> DeviceManagerResult { - let id = if let Some(id) = &net_cfg.id { + let id = if let Some(id) = &net_cfg.pci_common.id { id.clone() } else { let id = self.next_device_name(NET_DEVICE_NAME_PREFIX)?; - net_cfg.id = Some(id.clone()); + net_cfg.pci_common.id = Some(id.clone()); id }; info!("Creating virtio-net device: {net_cfg:?}"); @@ -2999,7 +2999,7 @@ impl DeviceManager { Some(net_cfg.mac), &mut net_cfg.host_mac, net_cfg.mtu, - self.force_iommu 
| net_cfg.iommu, + self.force_iommu | net_cfg.pci_common.iommu, net_cfg.num_queues, net_cfg.queue_size, self.seccomp_action.clone(), @@ -3020,7 +3020,7 @@ impl DeviceManager { fds, Some(net_cfg.mac), net_cfg.mtu, - self.force_iommu | net_cfg.iommu, + self.force_iommu | net_cfg.pci_common.iommu, net_cfg.queue_size, self.seccomp_action.clone(), net_cfg.rate_limiter_config, @@ -3050,7 +3050,7 @@ impl DeviceManager { Some(net_cfg.mac), &mut net_cfg.host_mac, net_cfg.mtu, - self.force_iommu | net_cfg.iommu, + self.force_iommu | net_cfg.pci_common.iommu, net_cfg.num_queues, net_cfg.queue_size, self.seccomp_action.clone(), @@ -3083,9 +3083,9 @@ impl DeviceManager { Ok(MetaVirtioDevice { virtio_device, - iommu: net_cfg.iommu, + iommu: net_cfg.pci_common.iommu, id, - pci_segment: net_cfg.pci_segment, + pci_segment: net_cfg.pci_common.pci_segment, dma_handler: None, }) } @@ -4737,7 +4737,7 @@ impl DeviceManager { let nets = config.net.as_deref_mut().unwrap(); let net_dev_cfg = nets .iter_mut() - .find(|net| net.id.as_deref() == Some(id)) + .find(|net| net.pci_common.id.as_deref() == Some(id)) // unwrap: the device could not have been removed without an ID .unwrap(); let fds = net_dev_cfg.fds.take().unwrap_or(Vec::new()); @@ -5067,9 +5067,9 @@ impl DeviceManager { } pub fn add_net(&mut self, net_cfg: &mut NetConfig) -> DeviceManagerResult { - self.validate_identifier(&net_cfg.id)?; + self.validate_identifier(&net_cfg.pci_common.id)?; - if net_cfg.iommu && !self.is_iommu_segment(net_cfg.pci_segment) { + if net_cfg.pci_common.iommu && !self.is_iommu_segment(net_cfg.pci_common.pci_segment) { return Err(DeviceManagerError::InvalidIommuHotplug); } diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index df94b54e22..66a2a104d0 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -1879,7 +1879,9 @@ impl RequestHandler for Vmm { for net in restored_nets.iter() { for net_config in vm_net_configs.iter_mut() { // update only if the net dev is backed by FDs - if net_config.id.as_ref() == 
Some(&net.id) && net_config.fds.is_some() { + if net_config.pci_common.id.as_ref() == Some(&net.id) + && net_config.fds.is_some() + { net_config.fds.clone_from(&net.fds); } } diff --git a/vmm/src/vm_config.rs b/vmm/src/vm_config.rs index d5465d9245..e318aba7d4 100644 --- a/vmm/src/vm_config.rs +++ b/vmm/src/vm_config.rs @@ -350,6 +350,8 @@ pub fn default_diskconfig_sparse() -> bool { #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] pub struct NetConfig { + #[serde(flatten)] + pub pci_common: PciDeviceCommonConfig, #[serde(default = "default_netconfig_tap")] pub tap: Option, pub ip: Option, @@ -360,8 +362,6 @@ pub struct NetConfig { pub host_mac: Option, #[serde(default)] pub mtu: Option, - #[serde(default)] - pub iommu: bool, #[serde(default = "default_netconfig_num_queues")] pub num_queues: usize, #[serde(default = "default_netconfig_queue_size")] @@ -371,8 +371,6 @@ pub struct NetConfig { pub vhost_socket: Option, #[serde(default)] pub vhost_mode: VhostMode, - #[serde(default)] - pub id: Option, // Special deserialize handling: // Therefore, we don't serialize FDs, and whatever value is here after // deserialization is invalid. @@ -383,8 +381,6 @@ pub struct NetConfig { pub fds: Option>, #[serde(default)] pub rate_limiter_config: Option, - #[serde(default)] - pub pci_segment: u16, #[serde(default = "default_netconfig_true")] pub offload_tso: bool, #[serde(default = "default_netconfig_true")] From 40150dd72d288f7b2999e6ca08c88e22a18e4fbc Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Fri, 3 Apr 2026 11:56:41 -0700 Subject: [PATCH 560/742] vmm: config: Switch FsConfig to use PciDeviceCommonConfig Switch FsConfig over to using the newly extracted struct members as used by all PCI based devices. The use of #[serde(flatten)] means that this change has no impact on the JSON format that the data is stored as. As virtio-fs does not support being placed behind an IOMMU an error is now raised if iommu is set. 
This option is not exposed via the CLI but could happen with a misconstructed JSON/API call. Signed-off-by: Rob Bradford --- vmm/src/config.rs | 42 +++++++++++++-------------------- vmm/src/device_manager.rs | 8 ++++---- vmm/src/vm_config.rs | 6 ++---- 3 files changed, 20 insertions(+), 36 deletions(-) diff --git a/vmm/src/config.rs b/vmm/src/config.rs index 5e53e8bdac..a447b89f37 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -1937,20 +1937,14 @@ impl FsConfig { .map_err(Error::ParseFileSystem)? .unwrap_or_else(default_fsconfig_num_queues); - let id = parser.get("id"); - - let pci_segment = parser - .convert("pci_segment") - .map_err(Error::ParseFileSystem)? - .unwrap_or_default(); + let pci_common = PciDeviceCommonConfig::parse(fs)?; Ok(FsConfig { + pci_common, tag, socket, num_queues, queue_size, - id, - pci_segment, }) } @@ -1962,21 +1956,11 @@ impl FsConfig { )); } - if let Some(platform_config) = vm_config.platform.as_ref() { - if self.pci_segment >= platform_config.num_pci_segments { - return Err(ValidationError::InvalidPciSegment(self.pci_segment)); - } - - if let Some(iommu_segments) = platform_config.iommu_segments.as_ref() - && iommu_segments.contains(&self.pci_segment) - { - return Err(ValidationError::IommuNotSupportedOnSegment( - self.pci_segment, - )); - } + if self.pci_common.iommu { + return Err(ValidationError::IommuNotSupported); } - Ok(()) + self.pci_common.validate(vm_config) } } @@ -3059,7 +3043,7 @@ impl VmConfig { for fs in fses { fs.validate(self)?; - Self::validate_identifier(&mut id_list, &fs.id)?; + Self::validate_identifier(&mut id_list, &fs.pci_common.id)?; } } @@ -3530,7 +3514,7 @@ impl VmConfig { // Remove if fs device if let Some(fs) = self.fs.as_mut() { let len = fs.len(); - fs.retain(|dev| dev.id.as_ref().map(|id| id.as_ref()) != Some(id)); + fs.retain(|dev| dev.pci_common.id.as_ref().map(|id| id.as_ref()) != Some(id)); removed |= fs.len() != len; } @@ -4256,12 +4240,11 @@ mod unit_tests { fn fs_fixture() -> 
FsConfig { FsConfig { + pci_common: PciDeviceCommonConfig::default(), socket: PathBuf::from("/tmp/sock"), tag: "mytag".to_owned(), num_queues: 1, queue_size: 1024, - id: None, - pci_segment: 0, } } @@ -5471,7 +5454,7 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" }]); assert_eq!( invalid_config.validate(), - Err(ValidationError::IommuNotSupportedOnSegment(1)) + Err(ValidationError::OnIommuSegment(1)) ); let mut invalid_config = valid_config.clone(); @@ -5495,12 +5478,15 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" ..platform_fixture() }); invalid_config.fs = Some(vec![FsConfig { - pci_segment: 1, + pci_common: PciDeviceCommonConfig { + pci_segment: 1, + ..Default::default() + }, ..fs_fixture() }]); assert_eq!( invalid_config.validate(), - Err(ValidationError::IommuNotSupportedOnSegment(1)) + Err(ValidationError::OnIommuSegment(1)) ); let mut invalid_config = valid_config.clone(); diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 9a5bdff56f..c431fc7564 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -3216,11 +3216,11 @@ impl DeviceManager { &mut self, fs_cfg: &mut FsConfig, ) -> DeviceManagerResult { - let id = if let Some(id) = &fs_cfg.id { + let id = if let Some(id) = &fs_cfg.pci_common.id { id.clone() } else { let id = self.next_device_name(FS_DEVICE_NAME_PREFIX)?; - fs_cfg.id = Some(id.clone()); + fs_cfg.pci_common.id = Some(id.clone()); id }; @@ -3257,7 +3257,7 @@ impl DeviceManager { as Arc>, iommu: false, id, - pci_segment: fs_cfg.pci_segment, + pci_segment: fs_cfg.pci_common.pci_segment, dma_handler: None, }) } else { @@ -5039,7 +5039,7 @@ impl DeviceManager { } pub fn add_fs(&mut self, fs_cfg: &mut FsConfig) -> DeviceManagerResult { - self.validate_identifier(&fs_cfg.id)?; + self.validate_identifier(&fs_cfg.pci_common.id)?; let device = self.make_virtio_fs_device(fs_cfg)?; self.hotplug_virtio_pci_device(device) diff --git a/vmm/src/vm_config.rs 
b/vmm/src/vm_config.rs index e318aba7d4..847a79bdac 100644 --- a/vmm/src/vm_config.rs +++ b/vmm/src/vm_config.rs @@ -471,16 +471,14 @@ pub struct PvmemcontrolConfig {} #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] pub struct FsConfig { + #[serde(flatten)] + pub pci_common: PciDeviceCommonConfig, pub tag: String, pub socket: PathBuf, #[serde(default = "default_fsconfig_num_queues")] pub num_queues: usize, #[serde(default = "default_fsconfig_queue_size")] pub queue_size: u16, - #[serde(default)] - pub id: Option, - #[serde(default)] - pub pci_segment: u16, } pub fn default_fsconfig_num_queues() -> usize { From 0d3080e036c479e7791cc201f44239f0b0686f7b Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Fri, 3 Apr 2026 11:59:42 -0700 Subject: [PATCH 561/742] vmm: config: Switch GenericVhostUserConfig to use PciDeviceCommonConfig Switch GenericVhostUserConfig over to using the newly extracted struct members as used by all PCI based devices. The use of #[serde(flatten)] means that this change has no impact on the JSON format that the data is stored as. As generic vhost-user devices do not support being placed behind an IOMMU an error is now raised if iommu is set. This can't happen via the CLI but could via the JSON/API. Signed-off-by: Rob Bradford --- vmm/src/config.rs | 37 +++++++++++++------------------------ vmm/src/device_manager.rs | 8 ++++---- vmm/src/vm_config.rs | 6 ++---- 3 files changed, 19 insertions(+), 32 deletions(-) diff --git a/vmm/src/config.rs b/vmm/src/config.rs index a447b89f37..d3aad9d93b 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -1861,11 +1861,7 @@ impl GenericVhostUserConfig { } _ => {} } - let id = parser.get("id"); - let pci_segment = parser - .convert("pci_segment") - .map_err(Error::ParseGenericVhostUser)? 
- .unwrap_or_default(); + let pci_common = PciDeviceCommonConfig::parse(vhost_user)?; let mut converted_queue_sizes: Vec = Vec::new(); for (offset, &queue_size) in queue_sizes.iter().enumerate() { match queue_size.try_into() { @@ -1879,30 +1875,19 @@ impl GenericVhostUserConfig { } Ok(GenericVhostUserConfig { + pci_common, socket: socket.into(), device_type, - id, - pci_segment, queue_sizes: converted_queue_sizes, }) } pub fn validate(&self, vm_config: &VmConfig) -> ValidationResult<()> { - if let Some(platform_config) = vm_config.platform.as_ref() { - if self.pci_segment >= platform_config.num_pci_segments { - return Err(ValidationError::InvalidPciSegment(self.pci_segment)); - } - - if let Some(iommu_segments) = platform_config.iommu_segments.as_ref() - && iommu_segments.contains(&self.pci_segment) - { - return Err(ValidationError::IommuNotSupportedOnSegment( - self.pci_segment, - )); - } + if self.pci_common.iommu { + return Err(ValidationError::IommuNotSupported); } - Ok(()) + self.pci_common.validate(vm_config) } } @@ -3054,7 +3039,7 @@ impl VmConfig { for generic_vhost_user_device in generic_vhost_user_devices { generic_vhost_user_device.validate(self)?; - Self::validate_identifier(&mut id_list, &generic_vhost_user_device.id)?; + Self::validate_identifier(&mut id_list, &generic_vhost_user_device.pci_common.id)?; } } @@ -3521,7 +3506,8 @@ impl VmConfig { // Remove if generic vhost-user device if let Some(generic_vhost_user) = self.generic_vhost_user.as_mut() { let len = generic_vhost_user.len(); - generic_vhost_user.retain(|dev| dev.id.as_ref().map(|id| id.as_ref()) != Some(id)); + generic_vhost_user + .retain(|dev| dev.pci_common.id.as_ref().map(|id| id.as_ref()) != Some(id)); removed |= generic_vhost_user.len() != len; } @@ -4292,10 +4278,13 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" assert_eq!( config.unwrap(), GenericVhostUserConfig { + pci_common: PciDeviceCommonConfig { + id: Some(id.to_owned()), + pci_segment: 
u16::try_from(pci_segment).unwrap(), + ..Default::default() + }, socket: socket.into(), - id: Some(id.to_owned()), device_type: u32::try_from(virtio_id).unwrap(), - pci_segment: u16::try_from(pci_segment).unwrap(), queue_sizes: queue_sizes .0 .iter() diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index c431fc7564..cb3f7c0ef5 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -3150,11 +3150,11 @@ impl DeviceManager { &mut self, generic_vhost_user_cfg: &mut GenericVhostUserConfig, ) -> DeviceManagerResult { - let id = if let Some(id) = &generic_vhost_user_cfg.id { + let id = if let Some(id) = &generic_vhost_user_cfg.pci_common.id { id.clone() } else { let id = self.next_device_name(GENERIC_VHOST_USER_DEVICE_NAME_PREFIX)?; - generic_vhost_user_cfg.id = Some(id.clone()); + generic_vhost_user_cfg.pci_common.id = Some(id.clone()); id }; @@ -3191,7 +3191,7 @@ impl DeviceManager { as Arc>, iommu: false, id, - pci_segment: generic_vhost_user_cfg.pci_segment, + pci_segment: generic_vhost_user_cfg.pci_common.pci_segment, dma_handler: None, }) } else { @@ -5049,7 +5049,7 @@ impl DeviceManager { &mut self, generic_vhost_user_cfg: &mut GenericVhostUserConfig, ) -> DeviceManagerResult { - self.validate_identifier(&generic_vhost_user_cfg.id)?; + self.validate_identifier(&generic_vhost_user_cfg.pci_common.id)?; let device = self.make_generic_vhost_user_device(generic_vhost_user_cfg)?; self.hotplug_virtio_pci_device(device) diff --git a/vmm/src/vm_config.rs b/vmm/src/vm_config.rs index 847a79bdac..b9b6609882 100644 --- a/vmm/src/vm_config.rs +++ b/vmm/src/vm_config.rs @@ -498,12 +498,10 @@ impl ApplyLandlock for FsConfig { #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] pub struct GenericVhostUserConfig { + #[serde(flatten)] + pub pci_common: PciDeviceCommonConfig, pub socket: PathBuf, pub queue_sizes: Vec, - #[serde(default)] - pub id: Option, - #[serde(default)] - pub pci_segment: u16, pub device_type: u32, } From 
c9082570b0f589426409d8d144264041f168db7a Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Fri, 3 Apr 2026 12:05:29 -0700 Subject: [PATCH 562/742] vmm: config: Switch PmemConfig to use PciDeviceCommonConfig Switch PmemConfig over to using the newly extracted struct members as used by all PCI based devices. The use of #[serde(flatten)] means that this change has no impact on the JSON format that the data is stored as. Signed-off-by: Rob Bradford --- vmm/src/config.rs | 63 +++++++++++++++------------------------ vmm/src/device_manager.rs | 18 +++++------ vmm/src/vm_config.rs | 8 ++--- 3 files changed, 35 insertions(+), 54 deletions(-) diff --git a/vmm/src/config.rs b/vmm/src/config.rs index d3aad9d93b..c9104007f1 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -2061,52 +2061,28 @@ impl PmemConfig { .add("pci_segment"); parser.parse(pmem).map_err(Error::ParsePersistentMemory)?; + let pci_common = PciDeviceCommonConfig::parse(pmem)?; let file = PathBuf::from(parser.get("file").ok_or(Error::ParsePmemFileMissing)?); let size = parser .convert::("size") .map_err(Error::ParsePersistentMemory)? .map(|v| v.0); - let iommu = parser - .convert::("iommu") - .map_err(Error::ParsePersistentMemory)? - .unwrap_or(Toggle(false)) - .0; let discard_writes = parser .convert::("discard_writes") .map_err(Error::ParsePersistentMemory)? .unwrap_or(Toggle(false)) .0; - let id = parser.get("id"); - let pci_segment = parser - .convert("pci_segment") - .map_err(Error::ParsePersistentMemory)? 
- .unwrap_or_default(); Ok(PmemConfig { + pci_common, file, size, - iommu, discard_writes, - id, - pci_segment, }) } pub fn validate(&self, vm_config: &VmConfig) -> ValidationResult<()> { - if let Some(platform_config) = vm_config.platform.as_ref() { - if self.pci_segment >= platform_config.num_pci_segments { - return Err(ValidationError::InvalidPciSegment(self.pci_segment)); - } - - if let Some(iommu_segments) = platform_config.iommu_segments.as_ref() - && iommu_segments.contains(&self.pci_segment) - && !self.iommu - { - return Err(ValidationError::OnIommuSegment(self.pci_segment)); - } - } - - Ok(()) + self.pci_common.validate(vm_config) } } @@ -3046,9 +3022,9 @@ impl VmConfig { if let Some(pmems) = &self.pmem { for pmem in pmems { pmem.validate(self)?; - self.iommu |= pmem.iommu; + self.iommu |= pmem.pci_common.iommu; - Self::validate_identifier(&mut id_list, &pmem.id)?; + Self::validate_identifier(&mut id_list, &pmem.pci_common.id)?; } } @@ -3521,7 +3497,7 @@ impl VmConfig { // Remove if pmem device if let Some(pmem) = self.pmem.as_mut() { let len = pmem.len(); - pmem.retain(|dev| dev.id.as_ref().map(|id| id.as_ref()) != Some(id)); + pmem.retain(|dev| dev.pci_common.id.as_ref().map(|id| id.as_ref()) != Some(id)); removed |= pmem.len() != len; } @@ -4345,12 +4321,10 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" fn pmem_fixture() -> PmemConfig { PmemConfig { + pci_common: PciDeviceCommonConfig::default(), file: PathBuf::from("/tmp/pmem"), size: Some(128 << 20), - iommu: false, discard_writes: false, - id: None, - pci_segment: 0, } } @@ -4366,15 +4340,21 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" assert_eq!( PmemConfig::parse("file=/tmp/pmem,size=128M,id=mypmem0")?, PmemConfig { - id: Some("mypmem0".to_owned()), + pci_common: PciDeviceCommonConfig { + id: Some("mypmem0".to_owned()), + ..Default::default() + }, ..pmem_fixture() } ); assert_eq!( PmemConfig::parse("file=/tmp/pmem,size=128M,iommu=on,discard_writes=on")?, 
PmemConfig { + pci_common: PciDeviceCommonConfig { + iommu: true, + ..Default::default() + }, discard_writes: true, - iommu: true, ..pmem_fixture() } ); @@ -5313,8 +5293,11 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" ..platform_fixture() }); still_valid_config.pmem = Some(vec![PmemConfig { - iommu: true, - pci_segment: 1, + pci_common: PciDeviceCommonConfig { + iommu: true, + pci_segment: 1, + ..Default::default() + }, ..pmem_fixture() }]); still_valid_config.validate().unwrap(); @@ -5388,8 +5371,10 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" ..platform_fixture() }); invalid_config.pmem = Some(vec![PmemConfig { - iommu: false, - pci_segment: 1, + pci_common: PciDeviceCommonConfig { + pci_segment: 1, + ..Default::default() + }, ..pmem_fixture() }]); assert_eq!( diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index cb3f7c0ef5..061c80f62f 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -3282,11 +3282,11 @@ impl DeviceManager { &mut self, pmem_cfg: &mut PmemConfig, ) -> DeviceManagerResult { - let id = if let Some(id) = &pmem_cfg.id { + let id = if let Some(id) = &pmem_cfg.pci_common.id { id.clone() } else { let id = self.next_device_name(PMEM_DEVICE_NAME_PREFIX)?; - pmem_cfg.id = Some(id.clone()); + pmem_cfg.pci_common.id = Some(id.clone()); id }; @@ -3358,7 +3358,7 @@ impl DeviceManager { let (region_base, region_size) = if let Some((base, size)) = region_range { // The memory needs to be 2MiB aligned in order to support // hugepages. - self.pci_segments[pmem_cfg.pci_segment as usize] + self.pci_segments[pmem_cfg.pci_common.pci_segment as usize] .mem64_allocator .lock() .unwrap() @@ -3373,7 +3373,7 @@ impl DeviceManager { } else { // The memory needs to be 2MiB aligned in order to support // hugepages. 
- let base = self.pci_segments[pmem_cfg.pci_segment as usize] + let base = self.pci_segments[pmem_cfg.pci_common.pci_segment as usize] .mem64_allocator .lock() .unwrap() @@ -3421,7 +3421,7 @@ impl DeviceManager { file, GuestAddress(region_base), mapping, - self.force_iommu | pmem_cfg.iommu, + self.force_iommu | pmem_cfg.pci_common.iommu, self.seccomp_action.clone(), self.exit_evt .try_clone() @@ -3444,9 +3444,9 @@ impl DeviceManager { Ok(MetaVirtioDevice { virtio_device: Arc::clone(&virtio_pmem_device) as Arc>, - iommu: pmem_cfg.iommu, + iommu: pmem_cfg.pci_common.iommu, id, - pci_segment: pmem_cfg.pci_segment, + pci_segment: pmem_cfg.pci_common.pci_segment, dma_handler: None, }) } @@ -5056,9 +5056,9 @@ impl DeviceManager { } pub fn add_pmem(&mut self, pmem_cfg: &mut PmemConfig) -> DeviceManagerResult { - self.validate_identifier(&pmem_cfg.id)?; + self.validate_identifier(&pmem_cfg.pci_common.id)?; - if pmem_cfg.iommu && !self.is_iommu_segment(pmem_cfg.pci_segment) { + if pmem_cfg.pci_common.iommu && !self.is_iommu_segment(pmem_cfg.pci_common.pci_segment) { return Err(DeviceManagerError::InvalidIommuHotplug); } diff --git a/vmm/src/vm_config.rs b/vmm/src/vm_config.rs index b9b6609882..079dc1ac61 100644 --- a/vmm/src/vm_config.rs +++ b/vmm/src/vm_config.rs @@ -514,17 +514,13 @@ impl ApplyLandlock for GenericVhostUserConfig { #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] pub struct PmemConfig { + #[serde(flatten)] + pub pci_common: PciDeviceCommonConfig, pub file: PathBuf, #[serde(default)] pub size: Option, #[serde(default)] - pub iommu: bool, - #[serde(default)] pub discard_writes: bool, - #[serde(default)] - pub id: Option, - #[serde(default)] - pub pci_segment: u16, } impl ApplyLandlock for PmemConfig { From 47182201f9aadc1420df5bc266a6a44bdc361d0f Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Fri, 3 Apr 2026 12:08:38 -0700 Subject: [PATCH 563/742] vmm: config: Switch DeviceConfig to use PciDeviceCommonConfig Switch DeviceConfig over to using 
the newly extracted struct members as used by all PCI based devices. The use of #[serde(flatten)] means that this change has no impact on the JSON format that the data is stored as. Signed-off-by: Rob Bradford --- vmm/src/config.rs | 63 ++++++++++++++++----------------------- vmm/src/device_manager.rs | 20 +++++++------ vmm/src/vm_config.rs | 8 ++--- 3 files changed, 38 insertions(+), 53 deletions(-) diff --git a/vmm/src/config.rs b/vmm/src/config.rs index c9104007f1..c2c943020e 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -2206,45 +2206,23 @@ impl DeviceConfig { .add("x_nv_gpudirect_clique"); parser.parse(device).map_err(Error::ParseDevice)?; + let pci_common = PciDeviceCommonConfig::parse(device)?; let path = parser .get("path") .map(PathBuf::from) .ok_or(Error::ParseDevicePathMissing)?; - let iommu = parser - .convert::("iommu") - .map_err(Error::ParseDevice)? - .unwrap_or(Toggle(false)) - .0; - let id = parser.get("id"); - let pci_segment = parser - .convert::("pci_segment") - .map_err(Error::ParseDevice)? 
- .unwrap_or_default(); let x_nv_gpudirect_clique = parser .convert::("x_nv_gpudirect_clique") .map_err(Error::ParseDevice)?; Ok(DeviceConfig { + pci_common, path, - iommu, - id, - pci_segment, x_nv_gpudirect_clique, }) } pub fn validate(&self, vm_config: &VmConfig) -> ValidationResult<()> { - if let Some(platform_config) = vm_config.platform.as_ref() { - if self.pci_segment >= platform_config.num_pci_segments { - return Err(ValidationError::InvalidPciSegment(self.pci_segment)); - } - - if let Some(iommu_segments) = platform_config.iommu_segments.as_ref() - && iommu_segments.contains(&self.pci_segment) - && !self.iommu - { - return Err(ValidationError::OnIommuSegment(self.pci_segment)); - } - } + self.pci_common.validate(vm_config)?; if self.x_nv_gpudirect_clique.is_some() { let vfio_p2p_dma = vm_config.platform.as_ref().is_none_or(|p| p.vfio_p2p_dma); @@ -3120,9 +3098,9 @@ impl VmConfig { } device.validate(self)?; - self.iommu |= device.iommu; + self.iommu |= device.pci_common.iommu; - Self::validate_identifier(&mut id_list, &device.id)?; + Self::validate_identifier(&mut id_list, &device.pci_common.id)?; } } @@ -3454,7 +3432,7 @@ impl VmConfig { // Remove if VFIO device if let Some(devices) = self.devices.as_mut() { let len = devices.len(); - devices.retain(|dev| dev.id.as_ref().map(|id| id.as_ref()) != Some(id)); + devices.retain(|dev| dev.pci_common.id.as_ref().map(|id| id.as_ref()) != Some(id)); removed |= devices.len() != len; } @@ -4443,10 +4421,8 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" fn device_fixture() -> DeviceConfig { DeviceConfig { + pci_common: PciDeviceCommonConfig::default(), path: PathBuf::from("/path/to/device"), - id: None, - iommu: false, - pci_segment: 0, x_nv_gpudirect_clique: None, } } @@ -4463,7 +4439,10 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" assert_eq!( DeviceConfig::parse("path=/path/to/device,iommu=on")?, DeviceConfig { - iommu: true, + pci_common: PciDeviceCommonConfig { + iommu: 
true, + ..Default::default() + }, ..device_fixture() } ); @@ -4471,8 +4450,11 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" assert_eq!( DeviceConfig::parse("path=/path/to/device,iommu=on,id=mydevice0")?, DeviceConfig { - id: Some("mydevice0".to_owned()), - iommu: true, + pci_common: PciDeviceCommonConfig { + id: Some("mydevice0".to_owned()), + iommu: true, + ..Default::default() + }, ..device_fixture() } ); @@ -5308,8 +5290,11 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" ..platform_fixture() }); still_valid_config.devices = Some(vec![DeviceConfig { - iommu: true, - pci_segment: 1, + pci_common: PciDeviceCommonConfig { + iommu: true, + pci_segment: 1, + ..Default::default() + }, ..device_fixture() }]); still_valid_config.validate().unwrap(); @@ -5389,8 +5374,10 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" ..platform_fixture() }); invalid_config.devices = Some(vec![DeviceConfig { - iommu: false, - pci_segment: 1, + pci_common: PciDeviceCommonConfig { + pci_segment: 1, + ..Default::default() + }, ..device_fixture() }]); assert_eq!( diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 061c80f62f..da03c6b553 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -3858,16 +3858,16 @@ impl DeviceManager { &mut self, device_cfg: &mut DeviceConfig, ) -> DeviceManagerResult<(PciBdf, String)> { - let vfio_name = if let Some(id) = &device_cfg.id { + let vfio_name = if let Some(id) = &device_cfg.pci_common.id { id.clone() } else { let id = self.next_device_name(VFIO_DEVICE_NAME_PREFIX)?; - device_cfg.id = Some(id.clone()); + device_cfg.pci_common.id = Some(id.clone()); id }; let (pci_segment_id, pci_device_bdf, resources) = - self.pci_resources(&vfio_name, device_cfg.pci_segment)?; + self.pci_resources(&vfio_name, device_cfg.pci_common.pci_segment)?; let mut needs_dma_mapping = false; @@ -3884,7 +3884,7 @@ impl DeviceManager { // container/group. 
The VFIO cdev and iommufd do not have such a // limitation, and this will be revised once we have VFIO cdev and // iommufd support. - let vfio_ops = if device_cfg.iommu { + let vfio_ops = if device_cfg.pci_common.iommu { let vfio_ops = self.create_vfio_ops()?; let vfio_mapping = Arc::new(VfioDmaMapping::new( @@ -3989,7 +3989,7 @@ impl DeviceManager { vfio_ops, self.msi_interrupt_manager.clone(), legacy_interrupt_group, - device_cfg.iommu, + device_cfg.pci_common.iommu, vfio_p2p_dma, pci_device_bdf, memory_manager.lock().unwrap().memory_slot_allocator(), @@ -4104,7 +4104,7 @@ impl DeviceManager { if let Some(device_list_cfg) = &mut devices { for device_cfg in device_list_cfg.iter_mut() { let (device_id, _) = self.add_passthrough_device(device_cfg)?; - if device_cfg.iommu && self.iommu_device.is_some() { + if device_cfg.pci_common.iommu && self.iommu_device.is_some() { iommu_attached_device_ids.push(device_id); } } @@ -4630,16 +4630,18 @@ impl DeviceManager { &mut self, device_cfg: &mut DeviceConfig, ) -> DeviceManagerResult { - self.validate_identifier(&device_cfg.id)?; + self.validate_identifier(&device_cfg.pci_common.id)?; - if device_cfg.iommu && !self.is_iommu_segment(device_cfg.pci_segment) { + if device_cfg.pci_common.iommu && !self.is_iommu_segment(device_cfg.pci_common.pci_segment) + { return Err(DeviceManagerError::InvalidIommuHotplug); } let (bdf, device_name) = self.add_passthrough_device(device_cfg)?; // Update the PCIU bitmap - self.pci_segments[device_cfg.pci_segment as usize].pci_devices_up |= 1 << bdf.device(); + self.pci_segments[device_cfg.pci_common.pci_segment as usize].pci_devices_up |= + 1 << bdf.device(); Ok(PciDeviceInfo { id: device_name, diff --git a/vmm/src/vm_config.rs b/vmm/src/vm_config.rs index 079dc1ac61..f6eb8af687 100644 --- a/vmm/src/vm_config.rs +++ b/vmm/src/vm_config.rs @@ -607,14 +607,10 @@ impl ApplyLandlock for DebugConsoleConfig { #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] pub struct DeviceConfig { + 
#[serde(flatten)] + pub pci_common: PciDeviceCommonConfig, pub path: PathBuf, #[serde(default)] - pub iommu: bool, - #[serde(default)] - pub id: Option, - #[serde(default)] - pub pci_segment: u16, - #[serde(default)] pub x_nv_gpudirect_clique: Option, } From 2c50be4753dbdb5d8b793ca2b7708c3a9fe935c7 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Fri, 3 Apr 2026 12:11:01 -0700 Subject: [PATCH 564/742] vmm: config: Switch UserDeviceConfig to use PciDeviceCommonConfig Switch UserDeviceConfig over to using the newly extracted struct members as used by all PCI based devices. The use of #[serde(flatten)] means that this change has no impact on the JSON format that the data is stored as. As VFIO user devices do not support being placed behind an IOMMU an error is now raised if iommu is set. This can't happen via the CLI but could via the JSON/API. Signed-off-by: Rob Bradford --- vmm/src/config.rs | 38 +++++++++++--------------------------- vmm/src/device_manager.rs | 11 ++++++----- vmm/src/vm_config.rs | 6 ++---- 3 files changed, 19 insertions(+), 36 deletions(-) diff --git a/vmm/src/config.rs b/vmm/src/config.rs index c2c943020e..b0d0de1e48 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -2244,39 +2244,21 @@ impl UserDeviceConfig { parser.add("socket").add("id").add("pci_segment"); parser.parse(user_device).map_err(Error::ParseUserDevice)?; + let pci_common = PciDeviceCommonConfig::parse(user_device)?; let socket = parser .get("socket") .map(PathBuf::from) .ok_or(Error::ParseUserDeviceSocketMissing)?; - let id = parser.get("id"); - let pci_segment = parser - .convert::("pci_segment") - .map_err(Error::ParseUserDevice)? 
- .unwrap_or_default(); - Ok(UserDeviceConfig { - socket, - id, - pci_segment, - }) + Ok(UserDeviceConfig { pci_common, socket }) } pub fn validate(&self, vm_config: &VmConfig) -> ValidationResult<()> { - if let Some(platform_config) = vm_config.platform.as_ref() { - if self.pci_segment >= platform_config.num_pci_segments { - return Err(ValidationError::InvalidPciSegment(self.pci_segment)); - } - - if let Some(iommu_segments) = platform_config.iommu_segments.as_ref() - && iommu_segments.contains(&self.pci_segment) - { - return Err(ValidationError::IommuNotSupportedOnSegment( - self.pci_segment, - )); - } + if self.pci_common.iommu { + return Err(ValidationError::IommuNotSupported); } - Ok(()) + self.pci_common.validate(vm_config) } } @@ -3052,7 +3034,7 @@ impl VmConfig { for user_device in user_devices { user_device.validate(self)?; - Self::validate_identifier(&mut id_list, &user_device.id)?; + Self::validate_identifier(&mut id_list, &user_device.pci_common.id)?; } } @@ -3439,7 +3421,7 @@ impl VmConfig { // Remove if VFIO user device if let Some(user_devices) = self.user_devices.as_mut() { let len = user_devices.len(); - user_devices.retain(|dev| dev.id.as_ref().map(|id| id.as_ref()) != Some(id)); + user_devices.retain(|dev| dev.pci_common.id.as_ref().map(|id| id.as_ref()) != Some(id)); removed |= user_devices.len() != len; } @@ -5409,9 +5391,11 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" ..platform_fixture() }); invalid_config.user_devices = Some(vec![UserDeviceConfig { - pci_segment: 1, + pci_common: PciDeviceCommonConfig { + pci_segment: 1, + ..Default::default() + }, socket: PathBuf::new(), - id: None, }]); assert_eq!( invalid_config.validate(), diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index da03c6b553..43eb7257cc 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -4120,16 +4120,16 @@ impl DeviceManager { &mut self, device_cfg: &mut UserDeviceConfig, ) -> DeviceManagerResult<(PciBdf, 
String)> { - let vfio_user_name = if let Some(id) = &device_cfg.id { + let vfio_user_name = if let Some(id) = &device_cfg.pci_common.id { id.clone() } else { let id = self.next_device_name(VFIO_USER_DEVICE_NAME_PREFIX)?; - device_cfg.id = Some(id.clone()); + device_cfg.pci_common.id = Some(id.clone()); id }; let (pci_segment_id, pci_device_bdf, resources) = - self.pci_resources(&vfio_user_name, device_cfg.pci_segment)?; + self.pci_resources(&vfio_user_name, device_cfg.pci_common.pci_segment)?; let legacy_interrupt_group = if let Some(legacy_interrupt_manager) = &self.legacy_interrupt_manager { @@ -4653,12 +4653,13 @@ impl DeviceManager { &mut self, device_cfg: &mut UserDeviceConfig, ) -> DeviceManagerResult { - self.validate_identifier(&device_cfg.id)?; + self.validate_identifier(&device_cfg.pci_common.id)?; let (bdf, device_name) = self.add_vfio_user_device(device_cfg)?; // Update the PCIU bitmap - self.pci_segments[device_cfg.pci_segment as usize].pci_devices_up |= 1 << bdf.device(); + self.pci_segments[device_cfg.pci_common.pci_segment as usize].pci_devices_up |= + 1 << bdf.device(); Ok(PciDeviceInfo { id: device_name, diff --git a/vmm/src/vm_config.rs b/vmm/src/vm_config.rs index f6eb8af687..6afdb8f6c0 100644 --- a/vmm/src/vm_config.rs +++ b/vmm/src/vm_config.rs @@ -633,11 +633,9 @@ impl ApplyLandlock for DeviceConfig { #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] pub struct UserDeviceConfig { + #[serde(flatten)] + pub pci_common: PciDeviceCommonConfig, pub socket: PathBuf, - #[serde(default)] - pub id: Option, - #[serde(default)] - pub pci_segment: u16, } impl ApplyLandlock for UserDeviceConfig { From ece77c3c52a8a9d4b297386c6f45604cb3c81b52 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Fri, 3 Apr 2026 12:14:02 -0700 Subject: [PATCH 565/742] vmm: config: Switch VdpaConfig to use PciDeviceCommonConfig Switch VdpaConfig over to using the newly extracted struct members as used by all PCI based devices. 
The use of #[serde(flatten)] means that this change has no impact on the JSON format that the data is stored as. Signed-off-by: Rob Bradford --- vmm/src/config.rs | 50 ++++++++++++--------------------------- vmm/src/device_manager.rs | 12 +++++----- vmm/src/vm_config.rs | 8 ++----- 3 files changed, 23 insertions(+), 47 deletions(-) diff --git a/vmm/src/config.rs b/vmm/src/config.rs index b0d0de1e48..e7ae70564a 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -2277,6 +2277,7 @@ impl VdpaConfig { .add("pci_segment"); parser.parse(vdpa).map_err(Error::ParseVdpa)?; + let pci_common = PciDeviceCommonConfig::parse(vdpa)?; let path = parser .get("path") .map(PathBuf::from) @@ -2285,41 +2286,16 @@ impl VdpaConfig { .convert("num_queues") .map_err(Error::ParseVdpa)? .unwrap_or_else(default_vdpaconfig_num_queues); - let iommu = parser - .convert::("iommu") - .map_err(Error::ParseVdpa)? - .unwrap_or(Toggle(false)) - .0; - let id = parser.get("id"); - let pci_segment = parser - .convert("pci_segment") - .map_err(Error::ParseVdpa)? 
- .unwrap_or_default(); Ok(VdpaConfig { + pci_common, path, num_queues, - iommu, - id, - pci_segment, }) } pub fn validate(&self, vm_config: &VmConfig) -> ValidationResult<()> { - if let Some(platform_config) = vm_config.platform.as_ref() { - if self.pci_segment >= platform_config.num_pci_segments { - return Err(ValidationError::InvalidPciSegment(self.pci_segment)); - } - - if let Some(iommu_segments) = platform_config.iommu_segments.as_ref() - && iommu_segments.contains(&self.pci_segment) - && !self.iommu - { - return Err(ValidationError::OnIommuSegment(self.pci_segment)); - } - } - - Ok(()) + self.pci_common.validate(vm_config) } } @@ -3041,9 +3017,9 @@ impl VmConfig { if let Some(vdpa_devices) = &self.vdpa { for vdpa_device in vdpa_devices { vdpa_device.validate(self)?; - self.iommu |= vdpa_device.iommu; + self.iommu |= vdpa_device.pci_common.iommu; - Self::validate_identifier(&mut id_list, &vdpa_device.id)?; + Self::validate_identifier(&mut id_list, &vdpa_device.pci_common.id)?; } } @@ -3464,7 +3440,7 @@ impl VmConfig { // Remove if vDPA device if let Some(vdpa) = self.vdpa.as_mut() { let len = vdpa.len(); - vdpa.retain(|dev| dev.id.as_ref().map(|id| id.as_ref()) != Some(id)); + vdpa.retain(|dev| dev.pci_common.id.as_ref().map(|id| id.as_ref()) != Some(id)); removed |= vdpa.len() != len; } @@ -4446,11 +4422,9 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" fn vdpa_fixture() -> VdpaConfig { VdpaConfig { + pci_common: PciDeviceCommonConfig::default(), path: PathBuf::from("/dev/vhost-vdpa"), num_queues: 1, - iommu: false, - id: None, - pci_segment: 0, } } @@ -4462,8 +4436,11 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" assert_eq!( VdpaConfig::parse("path=/dev/vhost-vdpa,num_queues=2,id=my_vdpa")?, VdpaConfig { + pci_common: PciDeviceCommonConfig { + id: Some("my_vdpa".to_owned()), + ..Default::default() + }, num_queues: 2, - id: Some("my_vdpa".to_owned()), ..vdpa_fixture() } ); @@ -5408,7 +5385,10 @@ 
id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" ..platform_fixture() }); invalid_config.vdpa = Some(vec![VdpaConfig { - pci_segment: 1, + pci_common: PciDeviceCommonConfig { + pci_segment: 1, + ..Default::default() + }, ..vdpa_fixture() }]); assert_eq!( diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 43eb7257cc..d16b9c2539 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -3715,11 +3715,11 @@ impl DeviceManager { &mut self, vdpa_cfg: &mut VdpaConfig, ) -> DeviceManagerResult { - let id = if let Some(id) = &vdpa_cfg.id { + let id = if let Some(id) = &vdpa_cfg.pci_common.id { id.clone() } else { let id = self.next_device_name(VDPA_DEVICE_NAME_PREFIX)?; - vdpa_cfg.id = Some(id.clone()); + vdpa_cfg.pci_common.id = Some(id.clone()); id }; @@ -3755,9 +3755,9 @@ impl DeviceManager { Ok(MetaVirtioDevice { virtio_device: vdpa_device as Arc>, - iommu: vdpa_cfg.iommu, + iommu: vdpa_cfg.pci_common.iommu, id, - pci_segment: vdpa_cfg.pci_segment, + pci_segment: vdpa_cfg.pci_common.pci_segment, dma_handler: Some(vdpa_mapping), }) } @@ -5081,9 +5081,9 @@ impl DeviceManager { } pub fn add_vdpa(&mut self, vdpa_cfg: &mut VdpaConfig) -> DeviceManagerResult { - self.validate_identifier(&vdpa_cfg.id)?; + self.validate_identifier(&vdpa_cfg.pci_common.id)?; - if vdpa_cfg.iommu && !self.is_iommu_segment(vdpa_cfg.pci_segment) { + if vdpa_cfg.pci_common.iommu && !self.is_iommu_segment(vdpa_cfg.pci_common.pci_segment) { return Err(DeviceManagerError::InvalidIommuHotplug); } diff --git a/vmm/src/vm_config.rs b/vmm/src/vm_config.rs index 6afdb8f6c0..e915f955b7 100644 --- a/vmm/src/vm_config.rs +++ b/vmm/src/vm_config.rs @@ -647,15 +647,11 @@ impl ApplyLandlock for UserDeviceConfig { #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] pub struct VdpaConfig { + #[serde(flatten)] + pub pci_common: PciDeviceCommonConfig, pub path: PathBuf, #[serde(default = "default_vdpaconfig_num_queues")] pub num_queues: usize, - 
#[serde(default)] - pub iommu: bool, - #[serde(default)] - pub id: Option, - #[serde(default)] - pub pci_segment: u16, } pub fn default_vdpaconfig_num_queues() -> usize { From 37b1ed1b84f09b5b8e7d6b7935054cfe91296f13 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Fri, 3 Apr 2026 12:16:49 -0700 Subject: [PATCH 566/742] vmm: config: Switch VsockConfig to use PciDeviceCommonConfig Switch VsockConfig over to using the newly extracted struct members as used by all PCI based devices. The use of #[serde(flatten)] means that this change has no impact on the JSON format that the data is stored as. Signed-off-by: Rob Bradford --- vmm/src/config.rs | 62 +++++++++++++-------------------------- vmm/src/device_manager.rs | 14 ++++----- vmm/src/vm_config.rs | 8 ++--- 3 files changed, 29 insertions(+), 55 deletions(-) diff --git a/vmm/src/config.rs b/vmm/src/config.rs index e7ae70564a..4b7e4a8ff3 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -2313,49 +2313,25 @@ impl VsockConfig { .add("pci_segment"); parser.parse(vsock).map_err(Error::ParseVsock)?; + let pci_common = PciDeviceCommonConfig::parse(vsock)?; let socket = parser .get("socket") .map(PathBuf::from) .ok_or(Error::ParseVsockSockMissing)?; - let iommu = parser - .convert::("iommu") - .map_err(Error::ParseVsock)? - .unwrap_or(Toggle(false)) - .0; let cid = parser .convert("cid") .map_err(Error::ParseVsock)? .ok_or(Error::ParseVsockCidMissing)?; - let id = parser.get("id"); - let pci_segment = parser - .convert("pci_segment") - .map_err(Error::ParseVsock)? 
- .unwrap_or_default(); Ok(VsockConfig { + pci_common, cid, socket, - iommu, - id, - pci_segment, }) } pub fn validate(&self, vm_config: &VmConfig) -> ValidationResult<()> { - if let Some(platform_config) = vm_config.platform.as_ref() { - if self.pci_segment >= platform_config.num_pci_segments { - return Err(ValidationError::InvalidPciSegment(self.pci_segment)); - } - - if let Some(iommu_segments) = platform_config.iommu_segments.as_ref() - && iommu_segments.contains(&self.pci_segment) - && !self.iommu - { - return Err(ValidationError::OnIommuSegment(self.pci_segment)); - } - } - - Ok(()) + self.pci_common.validate(vm_config) } } @@ -3064,9 +3040,9 @@ impl VmConfig { if let Some(vsock) = &self.vsock { vsock.validate(self)?; - self.iommu |= vsock.iommu; + self.iommu |= vsock.pci_common.iommu; - Self::validate_identifier(&mut id_list, &vsock.id)?; + Self::validate_identifier(&mut id_list, &vsock.pci_common.id)?; } let num_pci_segments = match &self.platform { @@ -3446,7 +3422,7 @@ impl VmConfig { // Remove if vsock device if let Some(vsock) = self.vsock.as_ref() - && vsock.id.as_ref().map(|id| id.as_ref()) == Some(id) + && vsock.pci_common.id.as_ref().map(|id| id.as_ref()) == Some(id) { self.vsock = None; removed = true; @@ -4467,21 +4443,20 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" assert_eq!( VsockConfig::parse("socket=/tmp/sock,cid=3")?, VsockConfig { + pci_common: PciDeviceCommonConfig::default(), cid: 3, socket: PathBuf::from("/tmp/sock"), - iommu: false, - id: None, - pci_segment: 0, } ); assert_eq!( VsockConfig::parse("socket=/tmp/sock,cid=3,iommu=on")?, VsockConfig { + pci_common: PciDeviceCommonConfig { + iommu: true, + ..Default::default() + }, cid: 3, socket: PathBuf::from("/tmp/sock"), - iommu: true, - id: None, - pci_segment: 0, } ); Ok(()) @@ -5264,11 +5239,13 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" ..platform_fixture() }); still_valid_config.vsock = Some(VsockConfig { + pci_common: 
PciDeviceCommonConfig { + iommu: true, + pci_segment: 1, + ..Default::default() + }, cid: 3, socket: PathBuf::new(), - id: None, - iommu: true, - pci_segment: 1, }); still_valid_config.validate().unwrap(); @@ -5350,11 +5327,12 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" ..platform_fixture() }); invalid_config.vsock = Some(VsockConfig { + pci_common: PciDeviceCommonConfig { + pci_segment: 1, + ..Default::default() + }, cid: 3, socket: PathBuf::new(), - id: None, - iommu: false, - pci_segment: 1, }); assert_eq!( invalid_config.validate(), diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index d16b9c2539..12c1e41a5b 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -3469,11 +3469,11 @@ impl DeviceManager { &mut self, vsock_cfg: &mut VsockConfig, ) -> DeviceManagerResult { - let id = if let Some(id) = &vsock_cfg.id { + let id = if let Some(id) = &vsock_cfg.pci_common.id { id.clone() } else { let id = self.next_device_name(VSOCK_DEVICE_NAME_PREFIX)?; - vsock_cfg.id = Some(id.clone()); + vsock_cfg.pci_common.id = Some(id.clone()); id }; @@ -3493,7 +3493,7 @@ impl DeviceManager { vsock_cfg.cid, vsock_cfg.socket.clone(), backend, - self.force_iommu | vsock_cfg.iommu, + self.force_iommu | vsock_cfg.pci_common.iommu, self.seccomp_action.clone(), self.exit_evt .try_clone() @@ -3515,9 +3515,9 @@ impl DeviceManager { Ok(MetaVirtioDevice { virtio_device: Arc::clone(&vsock_device) as Arc>, - iommu: vsock_cfg.iommu, + iommu: vsock_cfg.pci_common.iommu, id, - pci_segment: vsock_cfg.pci_segment, + pci_segment: vsock_cfg.pci_common.pci_segment, dma_handler: None, }) } @@ -5092,9 +5092,9 @@ impl DeviceManager { } pub fn add_vsock(&mut self, vsock_cfg: &mut VsockConfig) -> DeviceManagerResult { - self.validate_identifier(&vsock_cfg.id)?; + self.validate_identifier(&vsock_cfg.pci_common.id)?; - if vsock_cfg.iommu && !self.is_iommu_segment(vsock_cfg.pci_segment) { + if vsock_cfg.pci_common.iommu && 
!self.is_iommu_segment(vsock_cfg.pci_common.pci_segment) { return Err(DeviceManagerError::InvalidIommuHotplug); } diff --git a/vmm/src/vm_config.rs b/vmm/src/vm_config.rs index e915f955b7..156650cb2e 100644 --- a/vmm/src/vm_config.rs +++ b/vmm/src/vm_config.rs @@ -667,14 +667,10 @@ impl ApplyLandlock for VdpaConfig { #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] pub struct VsockConfig { + #[serde(flatten)] + pub pci_common: PciDeviceCommonConfig, pub cid: u32, pub socket: PathBuf, - #[serde(default)] - pub iommu: bool, - #[serde(default)] - pub id: Option, - #[serde(default)] - pub pci_segment: u16, } impl ApplyLandlock for VsockConfig { From dde28dc38aac8ce50e26c9017e208b854a2f8d8e Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Fri, 3 Apr 2026 12:40:55 -0700 Subject: [PATCH 567/742] vmm: config: Remove unused error variant The IommuNotSupportedOnSegment variant is no longer needed as the common PciDeviceCommonConfig::validate() handles this case with the OnIommuSegment variant along with more use of the IommuNotSupported error variant. 
Signed-off-by: Rob Bradford --- vmm/src/config.rs | 5 ----- 1 file changed, 5 deletions(-) diff --git a/vmm/src/config.rs b/vmm/src/config.rs index 4b7e4a8ff3..d80e389f58 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -320,11 +320,6 @@ pub enum ValidationError { /// GPUDirect clique requires P2P DMA #[error("Device with x_nv_gpudirect_clique requires vfio_p2p_dma=on")] GpuDirectCliqueRequiresP2pDma, - // On a IOMMU segment but IOMMU not supported - #[error( - "Device is on an IOMMU PCI segment ({0}) but does not support being placed behind IOMMU" - )] - IommuNotSupportedOnSegment(u16), // Identifier is not unique #[error("Identifier {0} is not unique")] IdentifierNotUnique(String), From 0c837abff2b4b9bfb817fef57bb659bf6777d3ae Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Sat, 4 Apr 2026 08:56:49 -0700 Subject: [PATCH 568/742] vmm: config: Put common options in an array This can then be used with the OptionParser::add_all() API to reduce the number of locations the same options are added to the parser. The only quirk is that some devices do not support an IOMMU (because they are vhost-user / vfio-user based). There are two different versions of the array to support that. 
Signed-off-by: Rob Bradford --- vmm/src/config.rs | 39 ++++++++++++++------------------------- 1 file changed, 14 insertions(+), 25 deletions(-) diff --git a/vmm/src/config.rs b/vmm/src/config.rs index d80e389f58..8e0c4232af 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -1195,10 +1195,13 @@ impl RateLimiterGroupConfig { } impl PciDeviceCommonConfig { + const OPTIONS: &[&str] = &["id", "pci_segment"]; + const OPTIONS_IOMMU: &[&str] = &["id", "iommu", "pci_segment"]; + pub fn parse(input: &str) -> Result { let mut parser = OptionParser::new(); - parser.add("id").add("iommu").add("pci_segment"); + parser.add_all(Self::OPTIONS_IOMMU); parser .parse_subset(input) @@ -1258,7 +1261,6 @@ impl DiskConfig { .add("path") .add("readonly") .add("direct") - .add("iommu") .add("queue_size") .add("num_queues") .add("vhost_user") @@ -1269,17 +1271,16 @@ impl DiskConfig { .add("ops_size") .add("ops_one_time_burst") .add("ops_refill_time") - .add("id") .add("_disable_io_uring") .add("_disable_aio") - .add("pci_segment") .add("serial") .add("rate_limit_group") .add("queue_affinity") .add("backing_files") .add("sparse") .add("image_type") - .add("lock_granularity"); + .add("lock_granularity") + .add_all(PciDeviceCommonConfig::OPTIONS_IOMMU); parser.parse(disk).map_err(Error::ParseDisk)?; @@ -1507,13 +1508,11 @@ impl NetConfig { .add("offload_ufo") .add("offload_csum") .add("mtu") - .add("iommu") .add("queue_size") .add("num_queues") .add("vhost_user") .add("socket") .add("vhost_mode") - .add("id") .add("fd") .add("bw_size") .add("bw_one_time_burst") @@ -1521,7 +1520,7 @@ impl NetConfig { .add("ops_size") .add("ops_one_time_burst") .add("ops_refill_time") - .add("pci_segment"); + .add_all(PciDeviceCommonConfig::OPTIONS_IOMMU); parser.parse(net).map_err(Error::ParseNetwork)?; let tap = parser.get("tap"); @@ -1772,8 +1771,7 @@ impl GenericVhostUserConfig { .add("virtio_id") .add("queue_sizes") .add("socket") - .add("id") - .add("pci_segment"); + 
.add_all(PciDeviceCommonConfig::OPTIONS); parser .parse(vhost_user) .map_err(Error::ParseGenericVhostUser)?; @@ -1898,8 +1896,7 @@ impl FsConfig { .add("queue_size") .add("num_queues") .add("socket") - .add("id") - .add("pci_segment"); + .add_all(PciDeviceCommonConfig::OPTIONS); parser.parse(fs).map_err(Error::ParseFileSystem)?; let tag = parser.get("tag").ok_or(Error::ParseFsTagMissing)?; @@ -2050,10 +2047,8 @@ impl PmemConfig { parser .add("size") .add("file") - .add("iommu") .add("discard_writes") - .add("id") - .add("pci_segment"); + .add_all(PciDeviceCommonConfig::OPTIONS_IOMMU); parser.parse(pmem).map_err(Error::ParsePersistentMemory)?; let pci_common = PciDeviceCommonConfig::parse(pmem)?; @@ -2195,9 +2190,7 @@ impl DeviceConfig { let mut parser = OptionParser::new(); parser .add("path") - .add("id") - .add("iommu") - .add("pci_segment") + .add_all(PciDeviceCommonConfig::OPTIONS_IOMMU) .add("x_nv_gpudirect_clique"); parser.parse(device).map_err(Error::ParseDevice)?; @@ -2236,7 +2229,7 @@ impl UserDeviceConfig { pub fn parse(user_device: &str) -> Result { let mut parser = OptionParser::new(); - parser.add("socket").add("id").add("pci_segment"); + parser.add("socket").add_all(PciDeviceCommonConfig::OPTIONS); parser.parse(user_device).map_err(Error::ParseUserDevice)?; let pci_common = PciDeviceCommonConfig::parse(user_device)?; @@ -2267,9 +2260,7 @@ impl VdpaConfig { parser .add("path") .add("num_queues") - .add("iommu") - .add("id") - .add("pci_segment"); + .add_all(PciDeviceCommonConfig::OPTIONS_IOMMU); parser.parse(vdpa).map_err(Error::ParseVdpa)?; let pci_common = PciDeviceCommonConfig::parse(vdpa)?; @@ -2303,9 +2294,7 @@ impl VsockConfig { parser .add("socket") .add("cid") - .add("iommu") - .add("id") - .add("pci_segment"); + .add_all(PciDeviceCommonConfig::OPTIONS_IOMMU); parser.parse(vsock).map_err(Error::ParseVsock)?; let pci_common = PciDeviceCommonConfig::parse(vsock)?; From 7ac877cc26c405c65542289a44eeda929b2070a4 Mon Sep 17 00:00:00 2001 From: Rob 
Bradford Date: Fri, 3 Apr 2026 13:32:07 -0700 Subject: [PATCH 569/742] vmm: device_manager: Reuse PciDeviceCommonConfig in MetaVirtioDevice This struct has the same members and it can be reused to reduce complexity now and if other common PCI related fields need to be added in the future. Signed-off-by: Rob Bradford --- vmm/src/device_manager.rs | 109 ++++++++++++++++++++------------------ 1 file changed, 56 insertions(+), 53 deletions(-) diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 12c1e41a5b..d32f5b7716 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -132,8 +132,8 @@ use crate::serial_manager::{Error as SerialManagerError, SerialManager}; use crate::vm_config::IvshmemConfig; use crate::vm_config::{ ConsoleOutputMode, DEFAULT_IOMMU_ADDRESS_WIDTH_BITS, DEFAULT_PCI_SEGMENT_APERTURE_WEIGHT, - DeviceConfig, DiskConfig, FsConfig, GenericVhostUserConfig, NetConfig, PmemConfig, - UserDeviceConfig, VdpaConfig, VhostMode, VmConfig, VsockConfig, + DeviceConfig, DiskConfig, FsConfig, GenericVhostUserConfig, NetConfig, PciDeviceCommonConfig, + PmemConfig, UserDeviceConfig, VdpaConfig, VhostMode, VmConfig, VsockConfig, }; use crate::{DEVICE_MANAGER_SNAPSHOT_ID, GuestRegionMmap, PciDeviceInfo, device_node}; @@ -926,12 +926,19 @@ pub enum PciDeviceHandle { #[derive(Clone)] struct MetaVirtioDevice { virtio_device: Arc>, - iommu: bool, - id: String, - pci_segment: u16, + pci_common: PciDeviceCommonConfig, dma_handler: Option>, } +impl MetaVirtioDevice { + fn id(&self) -> &str { + self.pci_common + .id + .as_deref() + .expect("ID should have been assigned before use") + } +} + #[derive(Default)] pub struct AcpiPlatformAddresses { pub pm_timer_address: Option, @@ -1669,24 +1676,25 @@ impl DeviceManager { let mut iommu_attached_devices = Vec::new(); { for handle in self.virtio_devices.clone() { - let mapping: Option> = if handle.iommu { + let mapping: Option> = if handle.pci_common.iommu { self.iommu_mapping.clone() } else { None 
}; + let id = handle.id().to_owned(); let dev_id = self.add_virtio_pci_device( handle.virtio_device, &mapping, - &handle.id, - handle.pci_segment, + &id, + handle.pci_common.pci_segment, handle.dma_handler, )?; // Track device BDF for Generic Initiator support - self.device_id_to_bdf.insert(handle.id.clone(), dev_id); + self.device_id_to_bdf.insert(id, dev_id); - if handle.iommu { + if handle.pci_common.iommu { iommu_attached_devices.push(dev_id); } } @@ -2417,9 +2425,11 @@ impl DeviceManager { self.virtio_devices.push(MetaVirtioDevice { virtio_device: Arc::clone(&virtio_console_device) as Arc>, - iommu: console_config.iommu, - id: id.clone(), - pci_segment: 0, + pci_common: PciDeviceCommonConfig { + id: Some(id.clone()), + iommu: console_config.iommu, + ..Default::default() + }, dma_handler: None, }); @@ -2913,9 +2923,7 @@ impl DeviceManager { Ok(MetaVirtioDevice { virtio_device, - iommu: disk_cfg.pci_common.iommu, - id, - pci_segment: disk_cfg.pci_common.pci_segment, + pci_common: disk_cfg.pci_common.clone(), dma_handler: None, }) } @@ -3083,9 +3091,7 @@ impl DeviceManager { Ok(MetaVirtioDevice { virtio_device, - iommu: net_cfg.pci_common.iommu, - id, - pci_segment: net_cfg.pci_common.pci_segment, + pci_common: net_cfg.pci_common.clone(), dma_handler: None, }) } @@ -3128,9 +3134,11 @@ impl DeviceManager { self.virtio_devices.push(MetaVirtioDevice { virtio_device: Arc::clone(&virtio_rng_device) as Arc>, - iommu: rng_config.iommu, - id: id.clone(), - pci_segment: 0, + pci_common: PciDeviceCommonConfig { + id: Some(id.clone()), + iommu: rng_config.iommu, + ..Default::default() + }, dma_handler: None, }); @@ -3189,9 +3197,7 @@ impl DeviceManager { Ok(MetaVirtioDevice { virtio_device: Arc::clone(&generic_vhost_user_device) as Arc>, - iommu: false, - id, - pci_segment: generic_vhost_user_cfg.pci_common.pci_segment, + pci_common: generic_vhost_user_cfg.pci_common.clone(), dma_handler: None, }) } else { @@ -3255,9 +3261,7 @@ impl DeviceManager { Ok(MetaVirtioDevice { 
virtio_device: Arc::clone(&virtio_fs_device) as Arc>, - iommu: false, - id, - pci_segment: fs_cfg.pci_common.pci_segment, + pci_common: fs_cfg.pci_common.clone(), dma_handler: None, }) } else { @@ -3444,9 +3448,7 @@ impl DeviceManager { Ok(MetaVirtioDevice { virtio_device: Arc::clone(&virtio_pmem_device) as Arc>, - iommu: pmem_cfg.pci_common.iommu, - id, - pci_segment: pmem_cfg.pci_common.pci_segment, + pci_common: pmem_cfg.pci_common.clone(), dma_handler: None, }) } @@ -3515,9 +3517,7 @@ impl DeviceManager { Ok(MetaVirtioDevice { virtio_device: Arc::clone(&vsock_device) as Arc>, - iommu: vsock_cfg.pci_common.iommu, - id, - pci_segment: vsock_cfg.pci_common.pci_segment, + pci_common: vsock_cfg.pci_common.clone(), dma_handler: None, }) } @@ -3571,9 +3571,10 @@ impl DeviceManager { self.virtio_devices.push(MetaVirtioDevice { virtio_device: Arc::clone(&virtio_mem_device) as Arc>, - iommu: false, - id: memory_zone_id.clone(), - pci_segment: 0, + pci_common: PciDeviceCommonConfig { + id: Some(memory_zone_id.clone()), + ..Default::default() + }, dma_handler: None, }); @@ -3658,9 +3659,10 @@ impl DeviceManager { self.virtio_devices.push(MetaVirtioDevice { virtio_device: Arc::clone(&virtio_balloon_device) as Arc>, - iommu: false, - id: id.clone(), - pci_segment: 0, + pci_common: PciDeviceCommonConfig { + id: Some(id.clone()), + ..Default::default() + }, dma_handler: None, }); @@ -3697,9 +3699,10 @@ impl DeviceManager { self.virtio_devices.push(MetaVirtioDevice { virtio_device: Arc::clone(&virtio_watchdog_device) as Arc>, - iommu: false, - id: id.clone(), - pci_segment: 0, + pci_common: PciDeviceCommonConfig { + id: Some(id.clone()), + ..Default::default() + }, dma_handler: None, }); @@ -3755,9 +3758,7 @@ impl DeviceManager { Ok(MetaVirtioDevice { virtio_device: vdpa_device as Arc>, - iommu: vdpa_cfg.pci_common.iommu, - id, - pci_segment: vdpa_cfg.pci_common.pci_segment, + pci_common: vdpa_cfg.pci_common.clone(), dma_handler: Some(vdpa_mapping), }) } @@ -4555,7 +4556,7 @@ 
impl DeviceManager { .map_err(DeviceManagerError::UpdateMemoryForVirtioDevice)?; if let Some(dma_handler) = &handle.dma_handler - && !handle.iommu + && !handle.pci_common.iommu { let gpa = new_region.start_addr().0; let size = new_region.len(); @@ -4994,24 +4995,26 @@ impl DeviceManager { // for instance. self.virtio_devices.push(handle.clone()); - let mapping: Option> = if handle.iommu { + let mapping: Option> = if handle.pci_common.iommu { self.iommu_mapping.clone() } else { None }; + let id = handle.id().to_owned(); let bdf = self.add_virtio_pci_device( handle.virtio_device, &mapping, - &handle.id, - handle.pci_segment, + &id, + handle.pci_common.pci_segment, handle.dma_handler, )?; // Update the PCIU bitmap - self.pci_segments[handle.pci_segment as usize].pci_devices_up |= 1 << bdf.device(); + self.pci_segments[handle.pci_common.pci_segment as usize].pci_devices_up |= + 1 << bdf.device(); - Ok(PciDeviceInfo { id: handle.id, bdf }) + Ok(PciDeviceInfo { id, bdf }) } fn is_iommu_segment(&self, pci_segment_id: u16) -> bool { @@ -5108,7 +5111,7 @@ impl DeviceManager { for handle in &self.virtio_devices { let virtio_device = handle.virtio_device.lock().unwrap(); if let Some(device_counters) = virtio_device.counters() { - counters.insert(handle.id.clone(), device_counters.clone()); + counters.insert(handle.id().to_owned(), device_counters.clone()); } } From cc7e56fa078cabe57802312228a333184f6985f2 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Tue, 14 Apr 2026 08:20:59 +0100 Subject: [PATCH 570/742] vmm: device_manager: Use more idiomatic Rust for ID assignment Use a more idiomatic Rust approach when establishing an autogenerated ID when none is set. 
Signed-off-by: Rob Bradford --- vmm/src/device_manager.rs | 91 +++++++++++++++++++++------------------ 1 file changed, 49 insertions(+), 42 deletions(-) diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index d32f5b7716..91d9509f38 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -2646,12 +2646,13 @@ impl DeviceManager { disk_cfg: &mut DiskConfig, is_hotplug: bool, ) -> DeviceManagerResult { - let id = if let Some(id) = &disk_cfg.pci_common.id { - id.clone() - } else { - let id = self.next_device_name(DISK_DEVICE_NAME_PREFIX)?; - disk_cfg.pci_common.id = Some(id.clone()); - id + let id = match disk_cfg.pci_common.id.as_ref() { + Some(id) => id.clone(), + None => disk_cfg + .pci_common + .id + .insert(self.next_device_name(DISK_DEVICE_NAME_PREFIX)?) + .clone(), }; info!("Creating virtio-block device: {disk_cfg:?}"); @@ -2945,12 +2946,13 @@ impl DeviceManager { &mut self, net_cfg: &mut NetConfig, ) -> DeviceManagerResult { - let id = if let Some(id) = &net_cfg.pci_common.id { - id.clone() - } else { - let id = self.next_device_name(NET_DEVICE_NAME_PREFIX)?; - net_cfg.pci_common.id = Some(id.clone()); - id + let id = match net_cfg.pci_common.id.as_ref() { + Some(id) => id.clone(), + None => net_cfg + .pci_common + .id + .insert(self.next_device_name(NET_DEVICE_NAME_PREFIX)?) + .clone(), }; info!("Creating virtio-net device: {net_cfg:?}"); @@ -3158,12 +3160,13 @@ impl DeviceManager { &mut self, generic_vhost_user_cfg: &mut GenericVhostUserConfig, ) -> DeviceManagerResult { - let id = if let Some(id) = &generic_vhost_user_cfg.pci_common.id { - id.clone() - } else { - let id = self.next_device_name(GENERIC_VHOST_USER_DEVICE_NAME_PREFIX)?; - generic_vhost_user_cfg.pci_common.id = Some(id.clone()); - id + let id = match generic_vhost_user_cfg.pci_common.id.as_ref() { + Some(id) => id.clone(), + None => generic_vhost_user_cfg + .pci_common + .id + .insert(self.next_device_name(GENERIC_VHOST_USER_DEVICE_NAME_PREFIX)?) 
+ .clone(), }; info!("Creating generic vhost-user device: {generic_vhost_user_cfg:?}"); @@ -3222,12 +3225,13 @@ impl DeviceManager { &mut self, fs_cfg: &mut FsConfig, ) -> DeviceManagerResult { - let id = if let Some(id) = &fs_cfg.pci_common.id { - id.clone() - } else { - let id = self.next_device_name(FS_DEVICE_NAME_PREFIX)?; - fs_cfg.pci_common.id = Some(id.clone()); - id + let id = match fs_cfg.pci_common.id.as_ref() { + Some(id) => id.clone(), + None => fs_cfg + .pci_common + .id + .insert(self.next_device_name(FS_DEVICE_NAME_PREFIX)?) + .clone(), }; info!("Creating virtio-fs device: {fs_cfg:?}"); @@ -3286,12 +3290,13 @@ impl DeviceManager { &mut self, pmem_cfg: &mut PmemConfig, ) -> DeviceManagerResult { - let id = if let Some(id) = &pmem_cfg.pci_common.id { - id.clone() - } else { - let id = self.next_device_name(PMEM_DEVICE_NAME_PREFIX)?; - pmem_cfg.pci_common.id = Some(id.clone()); - id + let id = match pmem_cfg.pci_common.id.as_ref() { + Some(id) => id.clone(), + None => pmem_cfg + .pci_common + .id + .insert(self.next_device_name(PMEM_DEVICE_NAME_PREFIX)?) + .clone(), }; info!("Creating virtio-pmem device: {pmem_cfg:?}"); @@ -3471,12 +3476,13 @@ impl DeviceManager { &mut self, vsock_cfg: &mut VsockConfig, ) -> DeviceManagerResult { - let id = if let Some(id) = &vsock_cfg.pci_common.id { - id.clone() - } else { - let id = self.next_device_name(VSOCK_DEVICE_NAME_PREFIX)?; - vsock_cfg.pci_common.id = Some(id.clone()); - id + let id = match vsock_cfg.pci_common.id.as_ref() { + Some(id) => id.clone(), + None => vsock_cfg + .pci_common + .id + .insert(self.next_device_name(VSOCK_DEVICE_NAME_PREFIX)?) 
+ .clone(), }; info!("Creating virtio-vsock device: {vsock_cfg:?}"); @@ -3718,12 +3724,13 @@ impl DeviceManager { &mut self, vdpa_cfg: &mut VdpaConfig, ) -> DeviceManagerResult { - let id = if let Some(id) = &vdpa_cfg.pci_common.id { - id.clone() - } else { - let id = self.next_device_name(VDPA_DEVICE_NAME_PREFIX)?; - vdpa_cfg.pci_common.id = Some(id.clone()); - id + let id = match vdpa_cfg.pci_common.id.as_ref() { + Some(id) => id.clone(), + None => vdpa_cfg + .pci_common + .id + .insert(self.next_device_name(VDPA_DEVICE_NAME_PREFIX)?) + .clone(), }; info!("Creating vDPA device: {vdpa_cfg:?}"); From e6c8b5e8168d22e406c1b1de048d52d52982150e Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Mon, 13 Apr 2026 21:09:03 +0200 Subject: [PATCH 571/742] tests: run more tests in parallel They can safely run in parallel. This further speeds up the CI by ~5. On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- cloud-hypervisor/tests/integration.rs | 444 +++++++++++++------------- 1 file changed, 225 insertions(+), 219 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 82c24a4f8f..3c4987fccb 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -6125,7 +6125,7 @@ mod ivshmem { // Check the number of vCPUs assert_eq!(guest.get_cpu_count().unwrap_or_default(), 2); - common_sequential::snapshot_and_check_events( + snapshot_restore_common::snapshot_and_check_events( &api_socket_source, &snapshot_dir, &event_path, @@ -6224,19 +6224,57 @@ mod ivshmem { fn test_live_migration_ivshmem_local() { _test_live_migration_ivshmem(true); } -} -mod common_sequential { - use std::fs::remove_dir_all; + #[test] + #[cfg(not(feature = "mshv"))] + fn test_snapshot_restore_hotplug_virtiomem() { + snapshot_restore_common::_test_snapshot_restore(true, false); + } - use crate::*; + #[test] + #[cfg(not(feature = "mshv"))] // See issue #7437 + fn 
test_snapshot_restore_basic() { + snapshot_restore_common::_test_snapshot_restore(false, false); + } #[test] #[cfg(not(feature = "mshv"))] - fn test_memory_mergeable_on() { - test_memory_mergeable(true); + fn test_snapshot_restore_with_resume() { + snapshot_restore_common::_test_snapshot_restore(false, true); + } + + #[test] + #[cfg(not(feature = "mshv"))] + fn test_snapshot_restore_uffd() { + snapshot_restore_common::_test_snapshot_restore_uffd("size=2G", &[], 1_920_000); + } + + #[test] + #[cfg(not(feature = "mshv"))] + fn test_snapshot_restore_uffd_shared_memory() { + snapshot_restore_common::_test_snapshot_restore_uffd("size=512M,shared=on", &[], 480_000); + } + + #[test] + #[cfg(not(feature = "mshv"))] // See issue #7437 + #[cfg(target_arch = "x86_64")] + fn test_snapshot_restore_pvpanic() { + snapshot_restore_common::_test_snapshot_restore_devices(true); } + #[test] + fn test_virtio_pmem_persist_writes() { + test_virtio_pmem(false, false); + } +} + +#[cfg(not(feature = "mshv"))] +mod snapshot_restore_common { + use std::fs::remove_dir_all; + use std::process::Command; + + use crate::*; + pub(crate) fn snapshot_and_check_events( api_socket: &str, snapshot_dir: &str, @@ -6282,28 +6320,7 @@ mod common_sequential { })); } - // One thing to note about this test. The virtio-net device is heavily used - // through each ssh command. There's no need to perform a dedicated test to - // verify the migration went well for virtio-net. 
- #[test] - #[cfg(not(feature = "mshv"))] - fn test_snapshot_restore_hotplug_virtiomem() { - _test_snapshot_restore(true, false); - } - - #[test] - #[cfg(not(feature = "mshv"))] // See issue #7437 - fn test_snapshot_restore_basic() { - _test_snapshot_restore(false, false); - } - - #[test] - #[cfg(not(feature = "mshv"))] - fn test_snapshot_restore_with_resume() { - _test_snapshot_restore(false, true); - } - - fn _test_snapshot_restore(use_hotplug: bool, use_resume_option: bool) { + pub(crate) fn _test_snapshot_restore(use_hotplug: bool, use_resume_option: bool) { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); let guest = Guest::new(Box::new(disk_config)); let kernel_path = direct_kernel_boot_path(); @@ -6427,7 +6444,11 @@ mod common_sequential { thread::sleep(std::time::Duration::new(10, 0)); } - snapshot_and_check_events(&api_socket_source, &snapshot_dir, &event_path); + snapshot_restore_common::snapshot_and_check_events( + &api_socket_source, + &snapshot_dir, + &event_path, + ); }); // Shutdown the source VM and check console output @@ -6587,38 +6608,7 @@ mod common_sequential { handle_child_output(r, &output); } - #[test] - #[cfg(not(feature = "mshv"))] - fn test_snapshot_restore_uffd() { - _test_snapshot_restore_uffd("size=2G", &[], 1_920_000); - } - - #[test] - #[cfg(not(feature = "mshv"))] - fn test_snapshot_restore_uffd_shared_memory() { - _test_snapshot_restore_uffd("size=512M,shared=on", &[], 480_000); - } - - #[test] - #[cfg(not(feature = "mshv"))] - fn test_snapshot_restore_uffd_hugepage_zone() { - if !exec_host_command_status( - "grep -q '^Hugepagesize:[[:space:]]*2048 kB' /proc/meminfo && test $(awk '/HugePages_Free/ {print $2}' /proc/meminfo) -ge 256", - ) - .success() - { - println!("SKIPPED: not enough free 2MiB hugepages for UFFD restore test"); - return; - } - - _test_snapshot_restore_uffd( - "size=0", - &["id=mem0,size=512M,hugepages=on,hugepage_size=2M"], - 480_000, - ); - } - - fn _test_snapshot_restore_uffd( + pub(crate) 
fn _test_snapshot_restore_uffd( memory_config: &str, memory_zone_config: &[&str], min_total_memory_kib: u32, @@ -6757,6 +6747,169 @@ mod common_sequential { let _ = remove_dir_all(snapshot_dir.as_str()); } + pub(crate) fn _test_snapshot_restore_devices(pvpanic: bool) { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)); + let kernel_path = direct_kernel_boot_path(); + + let api_socket_source = format!("{}.1", temp_api_path(&guest.tmp_dir)); + + let device_params = { + let mut data = vec![]; + if pvpanic { + data.push(String::from("--pvpanic")); + } + data + }; + + let socket = temp_vsock_path(&guest.tmp_dir); + let event_path = temp_event_monitor_path(&guest.tmp_dir); + + let mut child = GuestCommand::new(&guest) + .args(["--api-socket", &api_socket_source]) + .args(["--event-monitor", format!("path={event_path}").as_str()]) + .args(["--cpus", "boot=2"]) + .args(["--memory", "size=1G"]) + .args(["--kernel", kernel_path.to_str().unwrap()]) + .default_disks() + .default_net() + .args(["--vsock", format!("cid=3,socket={socket}").as_str()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .args(device_params) + .capture_output() + .spawn() + .unwrap(); + + let console_text = String::from("On a branch floating down river a cricket, singing."); + let snapshot_dir = temp_snapshot_dir_path(&guest.tmp_dir); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + assert_eq!(guest.get_cpu_count().unwrap_or_default(), 2); + + snapshot_and_check_events(&api_socket_source, &snapshot_dir, &event_path); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + handle_child_output(r, &output); + + Command::new("rm") + .arg("-f") + .arg(socket.as_str()) + .output() + .unwrap(); + + let api_socket_restored = format!("{}.2", temp_api_path(&guest.tmp_dir)); + let event_path_restored = format!("{}.2", temp_event_monitor_path(&guest.tmp_dir)); + + let mut child = 
GuestCommand::new(&guest) + .args(["--api-socket", &api_socket_restored]) + .args([ + "--event-monitor", + format!("path={event_path_restored}").as_str(), + ]) + .args([ + "--restore", + format!("source_url=file://{snapshot_dir}").as_str(), + ]) + .capture_output() + .spawn() + .unwrap(); + + let latest_events = [&MetaEvent { + event: "restored".to_string(), + device_id: None, + }]; + assert!(wait_until(Duration::from_secs(30), || { + check_latest_events_exact(&latest_events, &event_path_restored) + })); + + let _ = remove_dir_all(snapshot_dir.as_str()); + + let r = std::panic::catch_unwind(|| { + assert!(wait_until(Duration::from_secs(30), || remote_command( + &api_socket_restored, + "info", + None + ))); + assert!(remote_command(&api_socket_restored, "resume", None)); + let latest_events = [ + &MetaEvent { + event: "resuming".to_string(), + device_id: None, + }, + &MetaEvent { + event: "resumed".to_string(), + device_id: None, + }, + ]; + assert!(wait_until(Duration::from_secs(30), || { + check_latest_events_exact(&latest_events, &event_path_restored) + })); + + assert_eq!(guest.get_cpu_count().unwrap_or_default(), 2); + guest.check_devices_common(Some(&socket), Some(&console_text), None); + + if pvpanic { + make_guest_panic(&guest); + thread::sleep(std::time::Duration::new(10, 0)); + + let expected_sequential_events = [&MetaEvent { + event: "panic".to_string(), + device_id: None, + }]; + assert!(check_latest_events_exact( + &expected_sequential_events, + &event_path_restored + )); + } + }); + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + handle_child_output(r, &output); + + let r = std::panic::catch_unwind(|| { + assert!(String::from_utf8_lossy(&output.stdout).contains(&console_text)); + }); + + handle_child_output(r, &output); + } +} + +mod common_sequential { + #[cfg(not(feature = "mshv"))] + use std::fs::remove_dir_all; + + #[cfg(not(feature = "mshv"))] + use crate::*; + + #[test] + #[cfg(not(feature = "mshv"))] + fn 
test_memory_mergeable_on() { + test_memory_mergeable(true); + } + + #[test] + #[cfg(not(feature = "mshv"))] + fn test_snapshot_restore_uffd_hugepage_zone() { + if !exec_host_command_status( + "grep -q '^Hugepagesize:[[:space:]]*2048 kB' /proc/meminfo && test $(awk '/HugePages_Free/ {print $2}' /proc/meminfo) -ge 256", + ) + .success() + { + println!("SKIPPED: not enough free 2MiB hugepages for UFFD restore test"); + return; + } + + snapshot_restore_common::_test_snapshot_restore_uffd( + "size=0", + &["id=mem0,size=512M,hugepages=on,hugepage_size=2M"], + 480_000, + ); + } + #[test] #[cfg(not(feature = "mshv"))] // See issue #7437 #[ignore = "See #6970"] @@ -6843,7 +6996,11 @@ mod common_sequential { // Check the guest virtio-devices, e.g. block, rng, vsock, console, and net guest.check_devices_common(None, Some(&console_text), None); - snapshot_and_check_events(&api_socket_source, &snapshot_dir, &event_path); + snapshot_restore_common::snapshot_and_check_events( + &api_socket_source, + &snapshot_dir, + &event_path, + ); }); // Shutdown the source VM and check console output @@ -6981,155 +7138,6 @@ mod common_sequential { handle_child_output(r, &output); } - #[test] - #[cfg(not(feature = "mshv"))] // See issue #7437 - #[cfg(target_arch = "x86_64")] - fn test_snapshot_restore_pvpanic() { - _test_snapshot_restore_devices(true); - } - - fn _test_snapshot_restore_devices(pvpanic: bool) { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - let kernel_path = direct_kernel_boot_path(); - - let api_socket_source = format!("{}.1", temp_api_path(&guest.tmp_dir)); - - let device_params = { - let mut data = vec![]; - if pvpanic { - data.push(String::from("--pvpanic")); - } - data - }; - - let socket = temp_vsock_path(&guest.tmp_dir); - let event_path = temp_event_monitor_path(&guest.tmp_dir); - - let mut child = GuestCommand::new(&guest) - .args(["--api-socket", &api_socket_source]) - 
.args(["--event-monitor", format!("path={event_path}").as_str()]) - .args(["--cpus", "boot=2"]) - .args(["--memory", "size=1G"]) - .args(["--kernel", kernel_path.to_str().unwrap()]) - .default_disks() - .default_net() - .args(["--vsock", format!("cid=3,socket={socket}").as_str()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .args(device_params) - .capture_output() - .spawn() - .unwrap(); - - let console_text = String::from("On a branch floating down river a cricket, singing."); - // Create the snapshot directory - let snapshot_dir = temp_snapshot_dir_path(&guest.tmp_dir); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - // Check the number of vCPUs - assert_eq!(guest.get_cpu_count().unwrap_or_default(), 2); - - snapshot_and_check_events(&api_socket_source, &snapshot_dir, &event_path); - }); - - // Shutdown the source VM and check console output - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - handle_child_output(r, &output); - - // Remove the vsock socket file. - Command::new("rm") - .arg("-f") - .arg(socket.as_str()) - .output() - .unwrap(); - - let api_socket_restored = format!("{}.2", temp_api_path(&guest.tmp_dir)); - let event_path_restored = format!("{}.2", temp_event_monitor_path(&guest.tmp_dir)); - - // Restore the VM from the snapshot - let mut child = GuestCommand::new(&guest) - .args(["--api-socket", &api_socket_restored]) - .args([ - "--event-monitor", - format!("path={event_path_restored}").as_str(), - ]) - .args([ - "--restore", - format!("source_url=file://{snapshot_dir}").as_str(), - ]) - .capture_output() - .spawn() - .unwrap(); - - let latest_events = [&MetaEvent { - event: "restored".to_string(), - device_id: None, - }]; - // Wait for the restored event to show up in the monitor file. 
- assert!(wait_until(Duration::from_secs(30), || { - check_latest_events_exact(&latest_events, &event_path_restored) - })); - - // Remove the snapshot dir - let _ = remove_dir_all(snapshot_dir.as_str()); - - let r = std::panic::catch_unwind(|| { - // Resume the VM - assert!(wait_until(Duration::from_secs(30), || remote_command( - &api_socket_restored, - "info", - None - ))); - assert!(remote_command(&api_socket_restored, "resume", None)); - let latest_events = [ - &MetaEvent { - event: "resuming".to_string(), - device_id: None, - }, - &MetaEvent { - event: "resumed".to_string(), - device_id: None, - }, - ]; - assert!(wait_until(Duration::from_secs(30), || { - check_latest_events_exact(&latest_events, &event_path_restored) - })); - - // Check the number of vCPUs - assert_eq!(guest.get_cpu_count().unwrap_or_default(), 2); - guest.check_devices_common(Some(&socket), Some(&console_text), None); - - if pvpanic { - // Trigger guest a panic - make_guest_panic(&guest); - // Wait a while for guest - thread::sleep(std::time::Duration::new(10, 0)); - - let expected_sequential_events = [&MetaEvent { - event: "panic".to_string(), - device_id: None, - }]; - assert!(check_latest_events_exact( - &expected_sequential_events, - &event_path_restored - )); - } - }); - // Shutdown the target VM and check console output - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - handle_child_output(r, &output); - - let r = std::panic::catch_unwind(|| { - assert!(String::from_utf8_lossy(&output.stdout).contains(&console_text)); - }); - - handle_child_output(r, &output); - } - #[test] #[cfg(not(feature = "mshv"))] fn test_snapshot_restore_virtio_fs() { @@ -7189,8 +7197,11 @@ mod common_sequential { "sudo bash -c 'echo snapshot_test_data > mount_dir/snapshot_test_file'", ) .unwrap(); - - snapshot_and_check_events(&api_socket_source, &snapshot_dir, &event_path); + snapshot_restore_common::snapshot_and_check_events( + &api_socket_source, + &snapshot_dir, + &event_path, + ); 
}); // Shutdown the source VM @@ -7288,11 +7299,6 @@ mod common_sequential { let _ = std::fs::remove_file(shared_dir.join("snapshot_test_file")); let _ = std::fs::remove_file(shared_dir.join("post_restore_file")); } - - #[test] - fn test_virtio_pmem_persist_writes() { - test_virtio_pmem(false, false); - } } mod windows { From 98aa9d9c1212d051b235623aead4ee33666e9d34 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Tue, 14 Apr 2026 10:30:26 +0200 Subject: [PATCH 572/742] tests: make VFIO memory hotplug more robust After memory hotplug, it may happen that it takes a few seconds until a VFIO device is available again (IOMMU/DMA mappings need update). On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- cloud-hypervisor/tests/integration.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 3c4987fccb..ebf08d8417 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -8391,8 +8391,10 @@ mod vfio { })); assert!(guest.get_total_memory().unwrap_or_default() > 5_760_000); - // Check the VFIO device works when RAM is increased to 6GiB - assert!(guest.check_nvidia_gpu()); + // Check the VFIO device works when RAM is increased to 6GiB. + // After guest memory hotplug, the VMM must refresh VFIO/iommufd DMA + // mappings for the passthrough GPU. + assert!(wait_until(Duration::from_secs(10), || guest.check_nvidia_gpu())); }); let _ = child.kill(); From 73680c38c7d3e69bdfb6269609c58147fa2c499d Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 3 Apr 2026 17:25:59 +0200 Subject: [PATCH 573/742] ci: Switch to Windows Server 2025 for AMD64 The updated image is configured in a same way as the previously used 2022. SAC, SSH, and RDP are configured. All Windows updates to the curent date are installed. Includes latest stable virtio-win 0.1.285 drivers. 
Signed-off-by: Anatol Belski --- .github/workflows/integration-windows.yaml | 2 +- scripts/run_integration_tests_windows_x86_64.sh | 2 +- test_infra/src/lib.rs | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/integration-windows.yaml b/.github/workflows/integration-windows.yaml index bb1f68158c..1010ab73ec 100644 --- a/.github/workflows/integration-windows.yaml +++ b/.github/workflows/integration-windows.yaml @@ -39,7 +39,7 @@ jobs: run: | set -eufo pipefail mkdir $HOME/workloads - az storage blob download --container-name private-images --file "$HOME/workloads/windows-server-2022-amd64-2.raw" --name windows-server-2022-amd64-2.raw --connection-string "${{ secrets.CH_PRIVATE_IMAGES }}" + az storage blob download --container-name private-images --file "$HOME/workloads/windows-server-2025-amd64-1.raw" --name windows-server-2025-amd64-1.raw --connection-string "${{ secrets.CH_PRIVATE_IMAGES }}" - name: Run Windows guest integration tests if: ${{ github.event_name != 'pull_request' }} timeout-minutes: 15 diff --git a/scripts/run_integration_tests_windows_x86_64.sh b/scripts/run_integration_tests_windows_x86_64.sh index 56d41c166e..6b358c5c28 100755 --- a/scripts/run_integration_tests_windows_x86_64.sh +++ b/scripts/run_integration_tests_windows_x86_64.sh @@ -13,7 +13,7 @@ test_features="" if [ "$hypervisor" = "mshv" ]; then test_features="--features mshv" fi -WIN_IMAGE_FILE="/root/workloads/windows-server-2022-amd64-2.raw" +WIN_IMAGE_FILE="/root/workloads/windows-server-2025-amd64-1.raw" WORKLOADS_DIR="/root/workloads" diff --git a/test_infra/src/lib.rs b/test_infra/src/lib.rs index ba10cd5b51..e32ee791fc 100644 --- a/test_infra/src/lib.rs +++ b/test_infra/src/lib.rs @@ -2412,7 +2412,7 @@ pub mod x86_64 { "jammy-server-cloudimg-amd64-custom-20241017-0-backing-uncompressed.qcow2"; pub const JAMMY_IMAGE_NAME_QCOW2_BACKING_RAW_FILE: &str = "jammy-server-cloudimg-amd64-custom-20241017-0-backing-raw.qcow2"; - pub const 
WINDOWS_IMAGE_NAME: &str = "windows-server-2022-amd64-2.raw"; + pub const WINDOWS_IMAGE_NAME: &str = "windows-server-2025-amd64-1.raw"; pub const OVMF_NAME: &str = "CLOUDHV.fd"; pub const GREP_SERIAL_IRQ_CMD: &str = "grep -c 'IO-APIC.*ttyS0' /proc/interrupts || true"; } From 7f3dfe2154cbde10e74ae654049b45172949c85e Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 20 Mar 2026 13:52:04 +0100 Subject: [PATCH 574/742] block: qcow: Extract positional I/O helpers into a common module These position independent I/O helpers use pread64/pwrite64 to avoid races on the shared file position when multiple queues operate on duplicated file descriptors. Extracting them prepares for reuse by the upcoming qcow_async backend. Signed-off-by: Anatol Belski --- block/src/lib.rs | 1 + block/src/qcow_common.rs | 67 ++++++++++++++++++++++++++++++++++++++++ block/src/qcow_sync.rs | 61 ++---------------------------------- 3 files changed, 70 insertions(+), 59 deletions(-) create mode 100644 block/src/qcow_common.rs diff --git a/block/src/lib.rs b/block/src/lib.rs index 6d093daca4..ad25f4593c 100644 --- a/block/src/lib.rs +++ b/block/src/lib.rs @@ -18,6 +18,7 @@ pub mod fixed_vhd; pub mod fixed_vhd_async; pub mod fixed_vhd_sync; pub mod qcow; +pub(crate) mod qcow_common; pub mod qcow_sync; #[cfg(feature = "io_uring")] /// Async primitives based on `io-uring` diff --git a/block/src/qcow_common.rs b/block/src/qcow_common.rs new file mode 100644 index 0000000000..f118314257 --- /dev/null +++ b/block/src/qcow_common.rs @@ -0,0 +1,67 @@ +// Copyright © 2021 Intel Corporation +// +// Copyright 2026 The Cloud Hypervisor Authors. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +//! Shared helpers for QCOW2 sync and async backends. +//! +//! Position-independent I/O (`pread_exact`, `pwrite_all`) and iovec +//! scatter/gather helpers used by both `qcow_sync` and `qcow_async`. 
+ +use std::io; +use std::os::fd::RawFd; + +// -- Position independent I/O helpers -- +// +// Duplicated file descriptors share the kernel file description and thus the +// file position. Using seek then read from multiple queues races on that +// shared position. pread64 and pwrite64 are atomic and never touch the position. + +/// Read exactly the requested bytes at offset, looping on short reads. +pub fn pread_exact(fd: RawFd, buf: &mut [u8], offset: u64) -> io::Result<()> { + let mut total = 0usize; + while total < buf.len() { + // SAFETY: buf and fd are valid for the lifetime of the call. + let ret = unsafe { + libc::pread64( + fd, + buf[total..].as_mut_ptr() as *mut libc::c_void, + buf.len() - total, + (offset + total as u64) as libc::off_t, + ) + }; + if ret < 0 { + return Err(io::Error::last_os_error()); + } + if ret == 0 { + return Err(io::Error::from(io::ErrorKind::UnexpectedEof)); + } + total += ret as usize; + } + Ok(()) +} + +/// Write all bytes to fd at offset, looping on short writes. +pub fn pwrite_all(fd: RawFd, buf: &[u8], offset: u64) -> io::Result<()> { + let mut total = 0usize; + while total < buf.len() { + // SAFETY: buf and fd are valid for the lifetime of the call. 
+ let ret = unsafe { + libc::pwrite64( + fd, + buf[total..].as_ptr() as *const libc::c_void, + buf.len() - total, + (offset + total as u64) as libc::off_t, + ) + }; + if ret < 0 { + return Err(io::Error::last_os_error()); + } + if ret == 0 { + return Err(io::Error::other("pwrite64 wrote 0 bytes")); + } + total += ret as usize; + } + Ok(()) +} diff --git a/block/src/qcow_sync.rs b/block/src/qcow_sync.rs index 7340a7aa40..cde534314e 100644 --- a/block/src/qcow_sync.rs +++ b/block/src/qcow_sync.rs @@ -5,7 +5,7 @@ use std::cmp::min; use std::collections::VecDeque; use std::fs::File; -use std::os::fd::{AsFd, AsRawFd, BorrowedFd, OwnedFd, RawFd}; +use std::os::fd::{AsFd, AsRawFd, BorrowedFd, OwnedFd}; use std::sync::Arc; use std::{fmt, io, ptr, slice}; @@ -22,6 +22,7 @@ use crate::qcow::qcow_raw_file::QcowRawFile; use crate::qcow::{ BackingFile, BackingKind, Error as QcowError, MAX_NESTING_DEPTH, RawFile, parse_qcow, }; +use crate::qcow_common::{pread_exact, pwrite_all}; /// Raw backing file using pread64 on a duplicated fd. struct RawBacking { @@ -322,64 +323,6 @@ impl QcowSync { } } -// -- Position independent I/O helpers -- -// -// Duplicated file descriptors share the kernel file description and thus the -// file position. Using seek then read from multiple queues races on that -// shared position. pread64 and pwrite64 are atomic and never touch the position. - -/// Read exactly the requested bytes at offset, looping on short reads. -fn pread_exact(fd: RawFd, buf: &mut [u8], offset: u64) -> io::Result<()> { - let mut total = 0usize; - while total < buf.len() { - // SAFETY: buf and fd are valid for the lifetime of the call. 
- let ret = unsafe { - libc::pread64( - fd, - buf[total..].as_mut_ptr() as *mut libc::c_void, - buf.len() - total, - (offset + total as u64) as libc::off_t, - ) - }; - if ret < 0 { - return Err(io::Error::last_os_error()); - } - if ret == 0 { - return Err(io::Error::from(io::ErrorKind::UnexpectedEof)); - } - total += ret as usize; - } - Ok(()) -} - -/// Write all bytes to fd at offset, looping on short writes. -fn pwrite_all(fd: RawFd, buf: &[u8], offset: u64) -> io::Result<()> { - let mut total = 0usize; - while total < buf.len() { - // SAFETY: buf and fd are valid for the lifetime of the call. - let ret = unsafe { - libc::pwrite64( - fd, - buf[total..].as_ptr() as *const libc::c_void, - buf.len() - total, - (offset + total as u64) as libc::off_t, - ) - }; - if ret < 0 { - return Err(io::Error::last_os_error()); - } - if ret == 0 { - return Err(io::Error::other("pwrite64 wrote 0 bytes")); - } - total += ret as usize; - } - Ok(()) -} - -// -- iovec helper functions -- -// -// Operate on the iovec array as a flat byte stream. - /// Copy data into iovecs starting at the given byte offset. /// /// # Safety From 81f43f96c3eed97bd13f619738aae223a0fcb607 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 20 Mar 2026 13:55:23 +0100 Subject: [PATCH 575/742] block: qcow: Move iovec scatter/gather helpers to qcow_common Move scatter_to_iovecs, zero_fill_iovecs and gather_from_iovecs into qcow_common so they can be shared with the upcoming qcow_async backend. These helpers treat an iovec array as a flat byte stream and are used by both read_vectored and write_vectored code paths. 
Signed-off-by: Anatol Belski --- block/src/qcow_common.rs | 96 +++++++++++++++++++++++++++++++++++++++- block/src/qcow_sync.rs | 95 ++------------------------------------- 2 files changed, 99 insertions(+), 92 deletions(-) diff --git a/block/src/qcow_common.rs b/block/src/qcow_common.rs index f118314257..dc492e36d2 100644 --- a/block/src/qcow_common.rs +++ b/block/src/qcow_common.rs @@ -9,8 +9,9 @@ //! Position-independent I/O (`pread_exact`, `pwrite_all`) and iovec //! scatter/gather helpers used by both `qcow_sync` and `qcow_async`. -use std::io; +use std::cmp::min; use std::os::fd::RawFd; +use std::{io, ptr, slice}; // -- Position independent I/O helpers -- // @@ -65,3 +66,96 @@ pub fn pwrite_all(fd: RawFd, buf: &[u8], offset: u64) -> io::Result<()> { } Ok(()) } + +// -- iovec helper functions -- +// +// Operate on the iovec array as a flat byte stream. + +/// Copy data into iovecs starting at the given byte offset. +/// +/// # Safety +/// Caller must ensure iovecs point to valid, writable memory of sufficient size. +pub unsafe fn scatter_to_iovecs(iovecs: &[libc::iovec], start: usize, data: &[u8]) { + let mut remaining = data; + let mut pos = 0usize; + for iov in iovecs { + let iov_end = pos + iov.iov_len; + if iov_end <= start || remaining.is_empty() { + pos = iov_end; + continue; + } + let iov_start = start.saturating_sub(pos); + let available = iov.iov_len - iov_start; + let count = min(available, remaining.len()); + // SAFETY: iov_base is valid for iov_len bytes per caller contract. + unsafe { + let dst = (iov.iov_base as *mut u8).add(iov_start); + ptr::copy_nonoverlapping(remaining.as_ptr(), dst, count); + } + remaining = &remaining[count..]; + if remaining.is_empty() { + break; + } + pos = iov_end; + } +} + +/// Zero fill iovecs starting at the given byte offset for the given length. +/// +/// # Safety +/// Caller must ensure iovecs point to valid, writable memory of sufficient size. 
+pub unsafe fn zero_fill_iovecs(iovecs: &[libc::iovec], start: usize, len: usize) { + let mut remaining = len; + let mut pos = 0usize; + for iov in iovecs { + let iov_end = pos + iov.iov_len; + if iov_end <= start || remaining == 0 { + pos = iov_end; + continue; + } + let iov_start = start.saturating_sub(pos); + let available = iov.iov_len - iov_start; + let count = min(available, remaining); + // SAFETY: iov_base is valid for iov_len bytes per caller contract. + unsafe { + let dst = (iov.iov_base as *mut u8).add(iov_start); + ptr::write_bytes(dst, 0, count); + } + remaining -= count; + if remaining == 0 { + break; + } + pos = iov_end; + } +} + +/// Gather bytes from iovecs starting at the given byte offset into a Vec. +/// +/// # Safety +/// Caller must ensure iovecs point to valid, readable memory of sufficient size. +pub unsafe fn gather_from_iovecs(iovecs: &[libc::iovec], start: usize, len: usize) -> Vec { + let mut result = Vec::with_capacity(len); + let mut remaining = len; + let mut pos = 0usize; + for iov in iovecs { + let iov_end = pos + iov.iov_len; + if iov_end <= start || remaining == 0 { + pos = iov_end; + continue; + } + let iov_start = start.saturating_sub(pos); + let available = iov.iov_len - iov_start; + let count = min(available, remaining); + // SAFETY: iov_base is valid for iov_len bytes per caller contract. 
+ unsafe { + let src = (iov.iov_base as *const u8).add(iov_start); + result.extend_from_slice(slice::from_raw_parts(src, count)); + } + remaining -= count; + if remaining == 0 { + break; + } + pos = iov_end; + } + result +} diff --git a/block/src/qcow_sync.rs b/block/src/qcow_sync.rs index cde534314e..694fe80af0 100644 --- a/block/src/qcow_sync.rs +++ b/block/src/qcow_sync.rs @@ -7,7 +7,7 @@ use std::collections::VecDeque; use std::fs::File; use std::os::fd::{AsFd, AsRawFd, BorrowedFd, OwnedFd}; use std::sync::Arc; -use std::{fmt, io, ptr, slice}; +use std::{fmt, io}; use vmm_sys_util::eventfd::EventFd; use vmm_sys_util::write_zeroes::{PunchHole, WriteZeroesAt}; @@ -22,7 +22,9 @@ use crate::qcow::qcow_raw_file::QcowRawFile; use crate::qcow::{ BackingFile, BackingKind, Error as QcowError, MAX_NESTING_DEPTH, RawFile, parse_qcow, }; -use crate::qcow_common::{pread_exact, pwrite_all}; +use crate::qcow_common::{ + gather_from_iovecs, pread_exact, pwrite_all, scatter_to_iovecs, zero_fill_iovecs, +}; /// Raw backing file using pread64 on a duplicated fd. struct RawBacking { @@ -323,95 +325,6 @@ impl QcowSync { } } -/// Copy data into iovecs starting at the given byte offset. -/// -/// # Safety -/// Caller must ensure iovecs point to valid, writable memory of sufficient size. -unsafe fn scatter_to_iovecs(iovecs: &[libc::iovec], start: usize, data: &[u8]) { - let mut remaining = data; - let mut pos = 0usize; - for iov in iovecs { - let iov_end = pos + iov.iov_len; - if iov_end <= start || remaining.is_empty() { - pos = iov_end; - continue; - } - let iov_start = start.saturating_sub(pos); - let available = iov.iov_len - iov_start; - let count = min(available, remaining.len()); - // SAFETY: iov_base is valid for iov_len bytes per caller contract. 
- unsafe { - let dst = (iov.iov_base as *mut u8).add(iov_start); - ptr::copy_nonoverlapping(remaining.as_ptr(), dst, count); - } - remaining = &remaining[count..]; - if remaining.is_empty() { - break; - } - pos = iov_end; - } -} - -/// Zero fill iovecs starting at the given byte offset for the given length. -/// -/// # Safety -/// Caller must ensure iovecs point to valid, writable memory of sufficient size. -unsafe fn zero_fill_iovecs(iovecs: &[libc::iovec], start: usize, len: usize) { - let mut remaining = len; - let mut pos = 0usize; - for iov in iovecs { - let iov_end = pos + iov.iov_len; - if iov_end <= start || remaining == 0 { - pos = iov_end; - continue; - } - let iov_start = start.saturating_sub(pos); - let available = iov.iov_len - iov_start; - let count = min(available, remaining); - // SAFETY: iov_base is valid for iov_len bytes per caller contract. - unsafe { - let dst = (iov.iov_base as *mut u8).add(iov_start); - ptr::write_bytes(dst, 0, count); - } - remaining -= count; - if remaining == 0 { - break; - } - pos = iov_end; - } -} - -/// Gather bytes from iovecs starting at the given byte offset into a Vec. -/// -/// # Safety -/// Caller must ensure iovecs point to valid, readable memory of sufficient size. -unsafe fn gather_from_iovecs(iovecs: &[libc::iovec], start: usize, len: usize) -> Vec { - let mut result = Vec::with_capacity(len); - let mut remaining = len; - let mut pos = 0usize; - for iov in iovecs { - let iov_end = pos + iov.iov_len; - if iov_end <= start || remaining == 0 { - pos = iov_end; - continue; - } - let iov_start = start.saturating_sub(pos); - let available = iov.iov_len - iov_start; - let count = min(available, remaining); - // SAFETY: iov_base is valid for iov_len bytes per caller contract. 
- unsafe { - let src = (iov.iov_base as *const u8).add(iov_start); - result.extend_from_slice(slice::from_raw_parts(src, count)); - } - remaining -= count; - if remaining == 0 { - break; - } - pos = iov_end; - } - result -} - impl AsyncIo for QcowSync { fn notifier(&self) -> &EventFd { &self.eventfd From e345299f4de1f5af7963aea7050f350d36fc9d68 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 20 Mar 2026 16:03:24 +0100 Subject: [PATCH 576/742] block: qcow: Move RawBacking to qcow/backing module Move the raw backing file reader into the new qcow/backing module so it can be shared between qcow_sync and the upcoming qcow_async backend. Signed-off-by: Anatol Belski --- block/src/qcow/backing.rs | 40 +++++++++++++++++++++++++++++++++++++++ block/src/qcow/mod.rs | 1 + block/src/qcow_sync.rs | 28 +-------------------------- 3 files changed, 42 insertions(+), 27 deletions(-) create mode 100644 block/src/qcow/backing.rs diff --git a/block/src/qcow/backing.rs b/block/src/qcow/backing.rs new file mode 100644 index 0000000000..1ad1b74b9f --- /dev/null +++ b/block/src/qcow/backing.rs @@ -0,0 +1,40 @@ +// Copyright © 2021 Intel Corporation +// +// Copyright 2026 The Cloud Hypervisor Authors. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +//! Thread safe backing file readers for QCOW2 images. + +use std::io; +use std::os::fd::{AsRawFd, OwnedFd}; + +use crate::qcow::metadata::BackingRead; +use crate::qcow_common::pread_exact; + +/// Raw backing file using pread64 on a duplicated fd. +pub(crate) struct RawBacking { + pub(crate) fd: OwnedFd, + pub(crate) virtual_size: u64, +} + +// SAFETY: The only I/O operation is pread64 which is position independent +// and safe for concurrent use from multiple threads. 
+unsafe impl Sync for RawBacking {} + +impl BackingRead for RawBacking { + fn read_at(&self, address: u64, buf: &mut [u8]) -> io::Result<()> { + if address >= self.virtual_size { + buf.fill(0); + return Ok(()); + } + let available = (self.virtual_size - address) as usize; + if available >= buf.len() { + pread_exact(self.fd.as_raw_fd(), buf, address) + } else { + pread_exact(self.fd.as_raw_fd(), &mut buf[..available], address)?; + buf[available..].fill(0); + Ok(()) + } + } +} diff --git a/block/src/qcow/mod.rs b/block/src/qcow/mod.rs index 0c77b865cc..4fc4916f30 100644 --- a/block/src/qcow/mod.rs +++ b/block/src/qcow/mod.rs @@ -4,6 +4,7 @@ // // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause +pub(crate) mod backing; mod decoder; mod header; pub(crate) mod metadata; diff --git a/block/src/qcow_sync.rs b/block/src/qcow_sync.rs index 694fe80af0..5665ad475b 100644 --- a/block/src/qcow_sync.rs +++ b/block/src/qcow_sync.rs @@ -15,6 +15,7 @@ use vmm_sys_util::write_zeroes::{PunchHole, WriteZeroesAt}; use crate::async_io::{AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFileError}; use crate::disk_file; use crate::error::{BlockError, BlockErrorKind, BlockResult, ErrorOp}; +use crate::qcow::backing::RawBacking; use crate::qcow::metadata::{ BackingRead, ClusterReadMapping, ClusterWriteMapping, DeallocAction, QcowMetadata, }; @@ -26,33 +27,6 @@ use crate::qcow_common::{ gather_from_iovecs, pread_exact, pwrite_all, scatter_to_iovecs, zero_fill_iovecs, }; -/// Raw backing file using pread64 on a duplicated fd. -struct RawBacking { - fd: OwnedFd, - virtual_size: u64, -} - -// SAFETY: The only I/O operation is pread64 which is position independent -// and safe for concurrent use from multiple threads. 
-unsafe impl Sync for RawBacking {} - -impl BackingRead for RawBacking { - fn read_at(&self, address: u64, buf: &mut [u8]) -> io::Result<()> { - if address >= self.virtual_size { - buf.fill(0); - return Ok(()); - } - let available = (self.virtual_size - address) as usize; - if available >= buf.len() { - pread_exact(self.fd.as_raw_fd(), buf, address) - } else { - pread_exact(self.fd.as_raw_fd(), &mut buf[..available], address)?; - buf[available..].fill(0); - Ok(()) - } - } -} - /// QCOW2 backing file with RwLock metadata and pread64 data reads. /// /// Read only because backing files never receive writes. Nested backing From 5ec80d45ab5fb609678443ced7c6030166bb1f3a Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 20 Mar 2026 16:07:51 +0100 Subject: [PATCH 577/742] block: qcow: Move Qcow2MetadataBacking to qcow/backing Move the QCOW2 metadata backed reader into qcow/backing alongside RawBacking. Signed-off-by: Anatol Belski --- block/src/qcow/backing.rs | 92 ++++++++++++++++++++++++++++++++++++++- block/src/qcow_sync.rs | 89 +------------------------------------ 2 files changed, 92 insertions(+), 89 deletions(-) diff --git a/block/src/qcow/backing.rs b/block/src/qcow/backing.rs index 1ad1b74b9f..c688900c5e 100644 --- a/block/src/qcow/backing.rs +++ b/block/src/qcow/backing.rs @@ -8,8 +8,9 @@ use std::io; use std::os::fd::{AsRawFd, OwnedFd}; +use std::sync::Arc; -use crate::qcow::metadata::BackingRead; +use crate::qcow::metadata::{BackingRead, ClusterReadMapping, QcowMetadata}; use crate::qcow_common::pread_exact; /// Raw backing file using pread64 on a duplicated fd. @@ -38,3 +39,92 @@ impl BackingRead for RawBacking { } } } + +/// QCOW2 image used as a backing file for another QCOW2 image. +/// +/// Resolves guest offsets through the QCOW2 cluster mapping (L1/L2 +/// tables, refcounts) before reading the underlying data. Read only +/// because backing files never receive writes. 
Nested backing chains +/// are handled recursively via the optional `backing_file` field. +pub(crate) struct Qcow2MetadataBacking { + pub(crate) metadata: Arc, + pub(crate) data_fd: OwnedFd, + pub(crate) backing_file: Option>, +} + +// SAFETY: All reads go through QcowMetadata which uses RwLock +// and pread64 which is position independent and thread safe. +unsafe impl Sync for Qcow2MetadataBacking {} + +impl BackingRead for Qcow2MetadataBacking { + fn read_at(&self, address: u64, buf: &mut [u8]) -> io::Result<()> { + let virtual_size = self.metadata.virtual_size(); + if address >= virtual_size { + buf.fill(0); + return Ok(()); + } + let available = (virtual_size - address) as usize; + if available < buf.len() { + self.read_clusters(address, &mut buf[..available])?; + buf[available..].fill(0); + return Ok(()); + } + self.read_clusters(address, buf) + } +} + +impl Qcow2MetadataBacking { + /// Resolve cluster mappings via metadata then read allocated clusters + /// with pread64. + fn read_clusters(&self, address: u64, buf: &mut [u8]) -> io::Result<()> { + let total_len = buf.len(); + let has_backing = self.backing_file.is_some(); + + let mappings = self + .metadata + .map_clusters_for_read(address, total_len, has_backing)?; + + let mut buf_offset = 0usize; + for mapping in mappings { + match mapping { + ClusterReadMapping::Zero { length } => { + buf[buf_offset..buf_offset + length as usize].fill(0); + buf_offset += length as usize; + } + ClusterReadMapping::Allocated { + offset: host_offset, + length, + } => { + pread_exact( + self.data_fd.as_raw_fd(), + &mut buf[buf_offset..buf_offset + length as usize], + host_offset, + )?; + buf_offset += length as usize; + } + ClusterReadMapping::Compressed { data } => { + let len = data.len(); + buf[buf_offset..buf_offset + len].copy_from_slice(&data); + buf_offset += len; + } + ClusterReadMapping::Backing { + offset: backing_offset, + length, + } => { + self.backing_file.as_ref().unwrap().read_at( + backing_offset, + &mut 
buf[buf_offset..buf_offset + length as usize], + )?; + buf_offset += length as usize; + } + } + } + Ok(()) + } +} + +impl Drop for Qcow2MetadataBacking { + fn drop(&mut self) { + self.metadata.shutdown(); + } +} diff --git a/block/src/qcow_sync.rs b/block/src/qcow_sync.rs index 5665ad475b..483db80f50 100644 --- a/block/src/qcow_sync.rs +++ b/block/src/qcow_sync.rs @@ -15,7 +15,7 @@ use vmm_sys_util::write_zeroes::{PunchHole, WriteZeroesAt}; use crate::async_io::{AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFileError}; use crate::disk_file; use crate::error::{BlockError, BlockErrorKind, BlockResult, ErrorOp}; -use crate::qcow::backing::RawBacking; +use crate::qcow::backing::{Qcow2MetadataBacking, RawBacking}; use crate::qcow::metadata::{ BackingRead, ClusterReadMapping, ClusterWriteMapping, DeallocAction, QcowMetadata, }; @@ -27,93 +27,6 @@ use crate::qcow_common::{ gather_from_iovecs, pread_exact, pwrite_all, scatter_to_iovecs, zero_fill_iovecs, }; -/// QCOW2 backing file with RwLock metadata and pread64 data reads. -/// -/// Read only because backing files never receive writes. Nested backing -/// files are handled recursively. -struct Qcow2MetadataBacking { - metadata: Arc, - data_fd: OwnedFd, - backing_file: Option>, -} - -// SAFETY: All reads go through QcowMetadata which uses RwLock -// and pread64 which is position independent and thread safe. 
-unsafe impl Sync for Qcow2MetadataBacking {} - -impl BackingRead for Qcow2MetadataBacking { - fn read_at(&self, address: u64, buf: &mut [u8]) -> io::Result<()> { - let virtual_size = self.metadata.virtual_size(); - if address >= virtual_size { - buf.fill(0); - return Ok(()); - } - let available = (virtual_size - address) as usize; - if available < buf.len() { - self.read_clusters(address, &mut buf[..available])?; - buf[available..].fill(0); - return Ok(()); - } - self.read_clusters(address, buf) - } -} - -impl Qcow2MetadataBacking { - /// Resolve cluster mappings via metadata then read allocated clusters - /// with pread64. - fn read_clusters(&self, address: u64, buf: &mut [u8]) -> io::Result<()> { - let total_len = buf.len(); - let has_backing = self.backing_file.is_some(); - - let mappings = self - .metadata - .map_clusters_for_read(address, total_len, has_backing)?; - - let mut buf_offset = 0usize; - for mapping in mappings { - match mapping { - ClusterReadMapping::Zero { length } => { - buf[buf_offset..buf_offset + length as usize].fill(0); - buf_offset += length as usize; - } - ClusterReadMapping::Allocated { - offset: host_offset, - length, - } => { - pread_exact( - self.data_fd.as_raw_fd(), - &mut buf[buf_offset..buf_offset + length as usize], - host_offset, - )?; - buf_offset += length as usize; - } - ClusterReadMapping::Compressed { data } => { - let len = data.len(); - buf[buf_offset..buf_offset + len].copy_from_slice(&data); - buf_offset += len; - } - ClusterReadMapping::Backing { - offset: backing_offset, - length, - } => { - self.backing_file.as_ref().unwrap().read_at( - backing_offset, - &mut buf[buf_offset..buf_offset + length as usize], - )?; - buf_offset += length as usize; - } - } - } - Ok(()) - } -} - -impl Drop for Qcow2MetadataBacking { - fn drop(&mut self) { - self.metadata.shutdown(); - } -} - /// Construct a thread safe backing file reader. 
fn shared_backing_from(bf: BackingFile) -> BlockResult> { let (kind, virtual_size) = bf.into_kind(); From 8cd2c957ef53da1e12cbf0d3ce991e9af2488f70 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 20 Mar 2026 16:13:23 +0100 Subject: [PATCH 578/742] block: qcow: Move shared_backing_from to qcow/backing Move the backing file constructor into qcow/backing alongside the types it creates. Both qcow_sync and qcow_async can now import shared_backing_from directly from qcow/backing. Signed-off-by: Anatol Belski --- block/src/qcow/backing.rs | 38 ++++++++++++++++++++++++++++++++++- block/src/qcow_sync.rs | 42 +++------------------------------------ 2 files changed, 40 insertions(+), 40 deletions(-) diff --git a/block/src/qcow/backing.rs b/block/src/qcow/backing.rs index c688900c5e..e5e037b0ad 100644 --- a/block/src/qcow/backing.rs +++ b/block/src/qcow/backing.rs @@ -7,10 +7,12 @@ //! Thread safe backing file readers for QCOW2 images. use std::io; -use std::os::fd::{AsRawFd, OwnedFd}; +use std::os::fd::{AsFd, AsRawFd, BorrowedFd, OwnedFd}; use std::sync::Arc; +use crate::error::{BlockError, BlockErrorKind, BlockResult, ErrorOp}; use crate::qcow::metadata::{BackingRead, ClusterReadMapping, QcowMetadata}; +use crate::qcow::{BackingFile, BackingKind, Error as QcowError}; use crate::qcow_common::pread_exact; /// Raw backing file using pread64 on a duplicated fd. @@ -128,3 +130,37 @@ impl Drop for Qcow2MetadataBacking { self.metadata.shutdown(); } } + +/// Construct a thread safe backing file reader. 
+pub fn shared_backing_from(bf: BackingFile) -> BlockResult> { + let (kind, virtual_size) = bf.into_kind(); + + let dup_fd = |fd: BorrowedFd<'_>| -> BlockResult { + fd.try_clone_to_owned().map_err(|e| { + BlockError::new( + BlockErrorKind::Io, + QcowError::BackingFileIo(String::new(), e), + ) + .with_op(ErrorOp::DupBackingFd) + }) + }; + + match kind { + BackingKind::Raw(raw_file) => { + let fd = dup_fd(raw_file.as_fd())?; + Ok(Arc::new(RawBacking { fd, virtual_size })) + } + BackingKind::Qcow { inner, backing } => { + let data_fd = dup_fd(inner.raw_file.as_fd())?; + Ok(Arc::new(Qcow2MetadataBacking { + metadata: Arc::new(QcowMetadata::new(*inner)), + data_fd, + backing_file: backing.map(|bf| shared_backing_from(*bf)).transpose()?, + })) + } + #[cfg(test)] + BackingKind::QcowFile(_) => { + unreachable!("QcowFile variant is only used by set_backing_file() in tests") + } + } +} diff --git a/block/src/qcow_sync.rs b/block/src/qcow_sync.rs index 483db80f50..f948a3b5fd 100644 --- a/block/src/qcow_sync.rs +++ b/block/src/qcow_sync.rs @@ -5,7 +5,7 @@ use std::cmp::min; use std::collections::VecDeque; use std::fs::File; -use std::os::fd::{AsFd, AsRawFd, BorrowedFd, OwnedFd}; +use std::os::fd::{AsFd, AsRawFd}; use std::sync::Arc; use std::{fmt, io}; @@ -15,52 +15,16 @@ use vmm_sys_util::write_zeroes::{PunchHole, WriteZeroesAt}; use crate::async_io::{AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFileError}; use crate::disk_file; use crate::error::{BlockError, BlockErrorKind, BlockResult, ErrorOp}; -use crate::qcow::backing::{Qcow2MetadataBacking, RawBacking}; +use crate::qcow::backing::shared_backing_from; use crate::qcow::metadata::{ BackingRead, ClusterReadMapping, ClusterWriteMapping, DeallocAction, QcowMetadata, }; use crate::qcow::qcow_raw_file::QcowRawFile; -use crate::qcow::{ - BackingFile, BackingKind, Error as QcowError, MAX_NESTING_DEPTH, RawFile, parse_qcow, -}; +use crate::qcow::{MAX_NESTING_DEPTH, RawFile, parse_qcow}; use crate::qcow_common::{ 
gather_from_iovecs, pread_exact, pwrite_all, scatter_to_iovecs, zero_fill_iovecs, }; -/// Construct a thread safe backing file reader. -fn shared_backing_from(bf: BackingFile) -> BlockResult> { - let (kind, virtual_size) = bf.into_kind(); - - let dup_fd = |fd: BorrowedFd<'_>| -> BlockResult { - fd.try_clone_to_owned().map_err(|e| { - BlockError::new( - BlockErrorKind::Io, - QcowError::BackingFileIo(String::new(), e), - ) - .with_op(ErrorOp::DupBackingFd) - }) - }; - - match kind { - BackingKind::Raw(raw_file) => { - let fd = dup_fd(raw_file.as_fd())?; - Ok(Arc::new(RawBacking { fd, virtual_size })) - } - BackingKind::Qcow { inner, backing } => { - let data_fd = dup_fd(inner.raw_file.as_fd())?; - Ok(Arc::new(Qcow2MetadataBacking { - metadata: Arc::new(QcowMetadata::new(*inner)), - data_fd, - backing_file: backing.map(|bf| shared_backing_from(*bf)).transpose()?, - })) - } - #[cfg(test)] - BackingKind::QcowFile(_) => { - unreachable!("QcowFile variant is only used by set_backing_file() in tests") - } - } -} - pub struct QcowDiskSync { metadata: Arc, /// Shared across queues, resolved once at construction. From 3422a8b25853469b8185b87abbedd161ebd7aa0d Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 20 Mar 2026 17:37:19 +0100 Subject: [PATCH 579/742] block: qcow: Add QcowDiskAsync struct stub Introduce the device level handle for the async QCOW2 backend. QcowDiskAsync mirrors QcowDiskSync. It parses the image, resolves the backing chain and wraps QcowMetadata in an Arc for sharing across virtio queues. No trait impls yet, just the struct, constructor, Drop and Debug. 
Signed-off-by: Anatol Belski --- block/src/lib.rs | 1 + block/src/qcow_async.rs | 70 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 block/src/qcow_async.rs diff --git a/block/src/lib.rs b/block/src/lib.rs index ad25f4593c..c98f20ae87 100644 --- a/block/src/lib.rs +++ b/block/src/lib.rs @@ -18,6 +18,7 @@ pub mod fixed_vhd; pub mod fixed_vhd_async; pub mod fixed_vhd_sync; pub mod qcow; +pub mod qcow_async; pub(crate) mod qcow_common; pub mod qcow_sync; #[cfg(feature = "io_uring")] diff --git a/block/src/qcow_async.rs b/block/src/qcow_async.rs new file mode 100644 index 0000000000..71bc6f4742 --- /dev/null +++ b/block/src/qcow_async.rs @@ -0,0 +1,70 @@ +// Copyright © 2021 Intel Corporation +// +// Copyright 2026 The Cloud Hypervisor Authors. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +//! QCOW2 async disk backend. + +use std::fmt; +use std::fs::File; +use std::sync::Arc; + +use crate::error::{BlockErrorKind, BlockResult, ErrorOp}; +use crate::qcow::backing::shared_backing_from; +use crate::qcow::metadata::{BackingRead, QcowMetadata}; +use crate::qcow::qcow_raw_file::QcowRawFile; +use crate::qcow::{MAX_NESTING_DEPTH, RawFile, parse_qcow}; + +/// Device level handle for a QCOW2 image. +/// +/// Owns the parsed metadata and backing file chain. One instance is +/// created per disk and shared across virtio queues. 
+pub struct QcowDiskAsync { + metadata: Arc, + backing_file: Option>, + sparse: bool, + data_raw_file: QcowRawFile, +} + +impl fmt::Debug for QcowDiskAsync { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("QcowDiskAsync") + .field("sparse", &self.sparse) + .field("has_backing", &self.backing_file.is_some()) + .finish_non_exhaustive() + } +} + +impl QcowDiskAsync { + pub fn new( + file: File, + direct_io: bool, + backing_files: bool, + sparse: bool, + ) -> BlockResult { + let max_nesting_depth = if backing_files { MAX_NESTING_DEPTH } else { 0 }; + let (inner, backing_file, sparse) = + parse_qcow(RawFile::new(file, direct_io), max_nesting_depth, sparse).map_err(|e| { + let e = if !backing_files && matches!(e.kind(), BlockErrorKind::Overflow) { + e.with_kind(BlockErrorKind::UnsupportedFeature) + } else { + e + }; + e.with_op(ErrorOp::Open) + })?; + let data_raw_file = inner.raw_file.clone(); + Ok(QcowDiskAsync { + metadata: Arc::new(QcowMetadata::new(inner)), + backing_file: backing_file.map(shared_backing_from).transpose()?, + sparse, + data_raw_file, + }) + } +} + +impl Drop for QcowDiskAsync { + fn drop(&mut self) { + self.metadata.shutdown(); + } +} From f7771138180bda48c50ce39535b25b388f9d1d42 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 20 Mar 2026 17:42:37 +0100 Subject: [PATCH 580/742] block: qcow_async: impl DiskSize for QcowDiskAsync Delegates to QcowMetadata::virtual_size, identical to the sync backend. 
Signed-off-by: Anatol Belski --- block/src/qcow_async.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/block/src/qcow_async.rs b/block/src/qcow_async.rs index 71bc6f4742..d6b14ae773 100644 --- a/block/src/qcow_async.rs +++ b/block/src/qcow_async.rs @@ -10,6 +10,7 @@ use std::fmt; use std::fs::File; use std::sync::Arc; +use crate::disk_file; use crate::error::{BlockErrorKind, BlockResult, ErrorOp}; use crate::qcow::backing::shared_backing_from; use crate::qcow::metadata::{BackingRead, QcowMetadata}; @@ -68,3 +69,9 @@ impl Drop for QcowDiskAsync { self.metadata.shutdown(); } } + +impl disk_file::DiskSize for QcowDiskAsync { + fn logical_size(&self) -> BlockResult { + Ok(self.metadata.virtual_size()) + } +} From 403b75656cd1fdcb50db0b5262e6d6d2cea1cb95 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 20 Mar 2026 17:43:37 +0100 Subject: [PATCH 581/742] block: qcow_async: impl PhysicalSize for QcowDiskAsync Queries the underlying raw file for on disk size. Signed-off-by: Anatol Belski --- block/src/qcow_async.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/block/src/qcow_async.rs b/block/src/qcow_async.rs index d6b14ae773..388dcab551 100644 --- a/block/src/qcow_async.rs +++ b/block/src/qcow_async.rs @@ -75,3 +75,9 @@ impl disk_file::DiskSize for QcowDiskAsync { Ok(self.metadata.virtual_size()) } } + +impl disk_file::PhysicalSize for QcowDiskAsync { + fn physical_size(&self) -> BlockResult { + Ok(self.data_raw_file.physical_size()?) + } +} From 91d6356db4fb364daa4e4762128e60db3b0092d3 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 20 Mar 2026 17:44:19 +0100 Subject: [PATCH 582/742] block: qcow_async: impl DiskFd for QcowDiskAsync Returns a borrowed file descriptor for the underlying data file. 
Signed-off-by: Anatol Belski --- block/src/qcow_async.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/block/src/qcow_async.rs b/block/src/qcow_async.rs index 388dcab551..226ed7bcf3 100644 --- a/block/src/qcow_async.rs +++ b/block/src/qcow_async.rs @@ -8,8 +8,10 @@ use std::fmt; use std::fs::File; +use std::os::fd::{AsFd, AsRawFd}; use std::sync::Arc; +use crate::async_io::BorrowedDiskFd; use crate::disk_file; use crate::error::{BlockErrorKind, BlockResult, ErrorOp}; use crate::qcow::backing::shared_backing_from; @@ -81,3 +83,9 @@ impl disk_file::PhysicalSize for QcowDiskAsync { Ok(self.data_raw_file.physical_size()?) } } + +impl disk_file::DiskFd for QcowDiskAsync { + fn fd(&self) -> BorrowedDiskFd<'_> { + BorrowedDiskFd::new(self.data_raw_file.as_fd().as_raw_fd()) + } +} From fa700f6493b1fbbe0123e4b9f6480a7785a11fc3 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 20 Mar 2026 17:49:23 +0100 Subject: [PATCH 583/742] block: qcow_async: impl Geometry for QcowDiskAsync Uses the default geometry, same as the sync backend. Signed-off-by: Anatol Belski --- block/src/qcow_async.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/src/qcow_async.rs b/block/src/qcow_async.rs index 226ed7bcf3..f9cd0572e9 100644 --- a/block/src/qcow_async.rs +++ b/block/src/qcow_async.rs @@ -89,3 +89,5 @@ impl disk_file::DiskFd for QcowDiskAsync { BorrowedDiskFd::new(self.data_raw_file.as_fd().as_raw_fd()) } } + +impl disk_file::Geometry for QcowDiskAsync {} From 52cd45874a7b5d4f8bd84e57b55191f35d2d637e Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 20 Mar 2026 17:50:00 +0100 Subject: [PATCH 584/742] block: qcow_async: impl SparseCapable for QcowDiskAsync QCOW2 images support both sparse operations and the zero flag. 
Signed-off-by: Anatol Belski --- block/src/qcow_async.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/block/src/qcow_async.rs b/block/src/qcow_async.rs index f9cd0572e9..62b2872b2d 100644 --- a/block/src/qcow_async.rs +++ b/block/src/qcow_async.rs @@ -91,3 +91,13 @@ impl disk_file::DiskFd for QcowDiskAsync { } impl disk_file::Geometry for QcowDiskAsync {} + +impl disk_file::SparseCapable for QcowDiskAsync { + fn supports_sparse_operations(&self) -> bool { + true + } + + fn supports_zero_flag(&self) -> bool { + true + } +} From 8c3b6cb04be777ecf2d37fcd1d8c58dc7cc97177 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 20 Mar 2026 17:51:27 +0100 Subject: [PATCH 585/742] block: qcow_async: impl Resizable for QcowDiskAsync Delegates to QcowMetadata::resize. Rejects resize when a backing file is present, same as the sync backend. Signed-off-by: Anatol Belski --- block/src/qcow_async.rs | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/block/src/qcow_async.rs b/block/src/qcow_async.rs index 62b2872b2d..a90fc2d08a 100644 --- a/block/src/qcow_async.rs +++ b/block/src/qcow_async.rs @@ -6,14 +6,14 @@ //! QCOW2 async disk backend. 
-use std::fmt; use std::fs::File; use std::os::fd::{AsFd, AsRawFd}; use std::sync::Arc; +use std::{fmt, io}; -use crate::async_io::BorrowedDiskFd; +use crate::async_io::{BorrowedDiskFd, DiskFileError}; use crate::disk_file; -use crate::error::{BlockErrorKind, BlockResult, ErrorOp}; +use crate::error::{BlockError, BlockErrorKind, BlockResult, ErrorOp}; use crate::qcow::backing::shared_backing_from; use crate::qcow::metadata::{BackingRead, QcowMetadata}; use crate::qcow::qcow_raw_file::QcowRawFile; @@ -101,3 +101,21 @@ impl disk_file::SparseCapable for QcowDiskAsync { true } } + +impl disk_file::Resizable for QcowDiskAsync { + fn resize(&mut self, size: u64) -> BlockResult<()> { + if self.backing_file.is_some() { + return Err(BlockError::new( + BlockErrorKind::UnsupportedFeature, + DiskFileError::ResizeError(io::Error::other( + "resize not supported with backing file", + )), + ) + .with_op(ErrorOp::Resize)); + } + self.metadata.resize(size).map_err(|e| { + BlockError::new(BlockErrorKind::Io, DiskFileError::ResizeError(e)) + .with_op(ErrorOp::Resize) + }) + } +} From 896660789825b11c96c6b7e50a249660a98c6e3e Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 20 Mar 2026 17:52:25 +0100 Subject: [PATCH 586/742] block: qcow_async: impl DiskFile for QcowDiskAsync Marker supertrait combining all composable capability traits. QcowDiskAsync now satisfies the full DiskFile contract. 
Signed-off-by: Anatol Belski --- block/src/qcow_async.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/src/qcow_async.rs b/block/src/qcow_async.rs index a90fc2d08a..4e0826c652 100644 --- a/block/src/qcow_async.rs +++ b/block/src/qcow_async.rs @@ -119,3 +119,5 @@ impl disk_file::Resizable for QcowDiskAsync { }) } } + +impl disk_file::DiskFile for QcowDiskAsync {} From 8d684cad98957751557dd664d203f45c6144672b Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 20 Mar 2026 18:22:58 +0100 Subject: [PATCH 587/742] block: qcow_async: Add QcowAsync struct and constructor Per queue I/O worker that uses io_uring for asynchronous reads against fully allocated clusters. The struct holds the shared metadata, data file, optional backing reader, the io_uring instance and a synthetic completion list. Feature gated on io_uring in lib.rs. Signed-off-by: Anatol Belski --- block/src/lib.rs | 1 + block/src/qcow_async.rs | 47 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/block/src/lib.rs b/block/src/lib.rs index c98f20ae87..811ee974ef 100644 --- a/block/src/lib.rs +++ b/block/src/lib.rs @@ -18,6 +18,7 @@ pub mod fixed_vhd; pub mod fixed_vhd_async; pub mod fixed_vhd_sync; pub mod qcow; +#[cfg(feature = "io_uring")] pub mod qcow_async; pub(crate) mod qcow_common; pub mod qcow_sync; diff --git a/block/src/qcow_async.rs b/block/src/qcow_async.rs index 4e0826c652..b0d93e6876 100644 --- a/block/src/qcow_async.rs +++ b/block/src/qcow_async.rs @@ -6,11 +6,15 @@ //! QCOW2 async disk backend. 
+use std::collections::VecDeque; use std::fs::File; use std::os::fd::{AsFd, AsRawFd}; use std::sync::Arc; use std::{fmt, io}; +use io_uring::IoUring; +use vmm_sys_util::eventfd::EventFd; + use crate::async_io::{BorrowedDiskFd, DiskFileError}; use crate::disk_file; use crate::error::{BlockError, BlockErrorKind, BlockResult, ErrorOp}; @@ -121,3 +125,46 @@ impl disk_file::Resizable for QcowDiskAsync { } impl disk_file::DiskFile for QcowDiskAsync {} + +/// Per queue QCOW2 I/O worker using io_uring. +/// +/// Reads against fully allocated single mapping clusters are submitted +/// to io_uring for true asynchronous completion. All other cluster +/// types (zero, compressed, backing) and multi mapping reads fall back +/// to synchronous I/O with synthetic completions. +/// +/// Writes are synchronous because metadata allocation must complete +/// before the host offset is known. +pub struct QcowAsync { + metadata: Arc, + data_file: QcowRawFile, + backing_file: Option>, + sparse: bool, + io_uring: IoUring, + eventfd: EventFd, + completion_list: VecDeque<(u64, i32)>, +} + +impl QcowAsync { + fn new( + metadata: Arc, + data_file: QcowRawFile, + backing_file: Option>, + sparse: bool, + ring_depth: u32, + ) -> io::Result { + let io_uring = IoUring::new(ring_depth)?; + let eventfd = EventFd::new(libc::EFD_NONBLOCK)?; + io_uring.submitter().register_eventfd(eventfd.as_raw_fd())?; + + Ok(QcowAsync { + metadata, + data_file, + backing_file, + sparse, + io_uring, + eventfd, + completion_list: VecDeque::new(), + }) + } +} From 85df0fef0df9430323a3d2d773af9918c07fe3fc Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 20 Mar 2026 18:41:18 +0100 Subject: [PATCH 588/742] block: qcow_async: impl AsyncIo scaffold for QcowAsync Add the AsyncIo trait impl with notifier and next_completed_request filled in. The remaining methods are stubbed with unimplemented and will be filled in by subsequent commits. 
next_completed_request drains io_uring completions first, then falls back to the synthetic completion list. Signed-off-by: Anatol Belski --- block/src/qcow_async.rs | 47 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/block/src/qcow_async.rs b/block/src/qcow_async.rs index b0d93e6876..43b1b6aaaf 100644 --- a/block/src/qcow_async.rs +++ b/block/src/qcow_async.rs @@ -15,7 +15,7 @@ use std::{fmt, io}; use io_uring::IoUring; use vmm_sys_util::eventfd::EventFd; -use crate::async_io::{BorrowedDiskFd, DiskFileError}; +use crate::async_io::{AsyncIo, AsyncIoResult, BorrowedDiskFd, DiskFileError}; use crate::disk_file; use crate::error::{BlockError, BlockErrorKind, BlockResult, ErrorOp}; use crate::qcow::backing::shared_backing_from; @@ -168,3 +168,48 @@ impl QcowAsync { }) } } + +impl AsyncIo for QcowAsync { + fn notifier(&self) -> &EventFd { + &self.eventfd + } + + fn read_vectored( + &mut self, + offset: libc::off_t, + iovecs: &[libc::iovec], + user_data: u64, + ) -> AsyncIoResult<()> { + unimplemented!() + } + + fn write_vectored( + &mut self, + offset: libc::off_t, + iovecs: &[libc::iovec], + user_data: u64, + ) -> AsyncIoResult<()> { + unimplemented!() + } + + fn fsync(&mut self, user_data: Option) -> AsyncIoResult<()> { + unimplemented!() + } + + fn next_completed_request(&mut self) -> Option<(u64, i32)> { + // Drain io_uring completions first, then synthetic ones. 
+ self.io_uring + .completion() + .next() + .map(|entry| (entry.user_data(), entry.result())) + .or_else(|| self.completion_list.pop_front()) + } + + fn punch_hole(&mut self, offset: u64, length: u64, user_data: u64) -> AsyncIoResult<()> { + unimplemented!() + } + + fn write_zeroes(&mut self, offset: u64, length: u64, user_data: u64) -> AsyncIoResult<()> { + unimplemented!() + } +} From e7621abfbdb7daafb2cbd4730b63acff04331f51 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 20 Mar 2026 18:43:55 +0100 Subject: [PATCH 589/742] block: qcow_async: impl read_vectored for QcowAsync Single allocated cluster reads are submitted to io_uring for true async completion. Mixed mapping reads (zero, compressed, backing, multi cluster) fall back to synchronous pread64 with synthetic completions. Signed-off-by: Anatol Belski --- block/src/qcow_async.rs | 130 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 126 insertions(+), 4 deletions(-) diff --git a/block/src/qcow_async.rs b/block/src/qcow_async.rs index 43b1b6aaaf..e3bb399c0f 100644 --- a/block/src/qcow_async.rs +++ b/block/src/qcow_async.rs @@ -8,20 +8,22 @@ use std::collections::VecDeque; use std::fs::File; +use std::io::Error; use std::os::fd::{AsFd, AsRawFd}; use std::sync::Arc; use std::{fmt, io}; -use io_uring::IoUring; +use io_uring::{IoUring, opcode, types}; use vmm_sys_util::eventfd::EventFd; -use crate::async_io::{AsyncIo, AsyncIoResult, BorrowedDiskFd, DiskFileError}; +use crate::async_io::{AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFileError}; use crate::disk_file; use crate::error::{BlockError, BlockErrorKind, BlockResult, ErrorOp}; use crate::qcow::backing::shared_backing_from; -use crate::qcow::metadata::{BackingRead, QcowMetadata}; +use crate::qcow::metadata::{BackingRead, ClusterReadMapping, QcowMetadata}; use crate::qcow::qcow_raw_file::QcowRawFile; use crate::qcow::{MAX_NESTING_DEPTH, RawFile, parse_qcow}; +use crate::qcow_common::{pread_exact, scatter_to_iovecs, 
zero_fill_iovecs}; /// Device level handle for a QCOW2 image. /// @@ -180,7 +182,40 @@ impl AsyncIo for QcowAsync { iovecs: &[libc::iovec], user_data: u64, ) -> AsyncIoResult<()> { - unimplemented!() + let total_len: usize = iovecs.iter().map(|v| v.iov_len).sum(); + + if let Some(host_offset) = Self::resolve_read( + &self.metadata, + &self.data_file, + &self.backing_file, + offset as u64, + iovecs, + total_len, + )? { + let fd = self.data_file.as_raw_fd(); + let (submitter, mut sq, _) = self.io_uring.split(); + + // SAFETY: fd is valid and iovecs point to valid guest memory. + unsafe { + sq.push( + &opcode::Readv::new(types::Fd(fd), iovecs.as_ptr(), iovecs.len() as u32) + .offset(host_offset) + .build() + .user_data(user_data), + ) + .map_err(|_| { + AsyncIoError::ReadVectored(Error::other("Submission queue is full")) + })?; + }; + + sq.sync(); + submitter.submit().map_err(AsyncIoError::ReadVectored)?; + } else { + self.completion_list + .push_back((user_data, total_len as i32)); + self.eventfd.write(1).unwrap(); + } + Ok(()) } fn write_vectored( @@ -213,3 +248,90 @@ impl AsyncIo for QcowAsync { unimplemented!() } } + +impl QcowAsync { + /// Resolves read mappings for a guest read request. + /// + /// Returns `Some(host_offset)` if the entire read falls within a single + /// allocated cluster (fast path). Otherwise handles the read + /// synchronously via `scatter_read_sync` and returns `None`. 
+ fn resolve_read( + metadata: &QcowMetadata, + data_file: &QcowRawFile, + backing_file: &Option>, + address: u64, + iovecs: &[libc::iovec], + total_len: usize, + ) -> AsyncIoResult> { + let has_backing = backing_file.is_some(); + let mappings = metadata + .map_clusters_for_read(address, total_len, has_backing) + .map_err(AsyncIoError::ReadVectored)?; + + if mappings.len() == 1 + && let ClusterReadMapping::Allocated { + offset: host_offset, + length, + } = &mappings[0] + && *length as usize == total_len + { + return Ok(Some(*host_offset)); + } + + Self::scatter_read_sync(mappings, iovecs, data_file, backing_file)?; + Ok(None) + } + + /// Scatter-read cluster mappings synchronously into iovec buffers. + fn scatter_read_sync( + mappings: Vec, + iovecs: &[libc::iovec], + data_file: &QcowRawFile, + backing_file: &Option>, + ) -> AsyncIoResult<()> { + let mut buf_offset = 0usize; + for mapping in mappings { + match mapping { + ClusterReadMapping::Zero { length } => { + // SAFETY: iovecs point to valid guest memory buffers. + unsafe { + zero_fill_iovecs(iovecs, buf_offset, length as usize); + } + buf_offset += length as usize; + } + ClusterReadMapping::Allocated { + offset: host_offset, + length, + } => { + let mut buf = vec![0u8; length as usize]; + pread_exact(data_file.as_raw_fd(), &mut buf, host_offset) + .map_err(AsyncIoError::ReadVectored)?; + // SAFETY: iovecs point to valid guest memory buffers. + unsafe { scatter_to_iovecs(iovecs, buf_offset, &buf) }; + buf_offset += length as usize; + } + ClusterReadMapping::Compressed { data } => { + let len = data.len(); + // SAFETY: iovecs point to valid guest memory buffers. 
+ unsafe { scatter_to_iovecs(iovecs, buf_offset, &data) }; + buf_offset += len; + } + ClusterReadMapping::Backing { + offset: backing_offset, + length, + } => { + let mut buf = vec![0u8; length as usize]; + backing_file + .as_ref() + .unwrap() + .read_at(backing_offset, &mut buf) + .map_err(AsyncIoError::ReadVectored)?; + // SAFETY: iovecs point to valid guest memory buffers. + unsafe { scatter_to_iovecs(iovecs, buf_offset, &buf) }; + buf_offset += length as usize; + } + } + } + Ok(()) + } +} From a97260c5e0ee1728d2e383ac8bb10b9c9076db77 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 20 Mar 2026 20:01:09 +0100 Subject: [PATCH 590/742] block: qcow_async: impl write_vectored for QcowAsync Synchronous per cluster write path - gather guest data from iovecs, map each cluster through QcowMetadata, and pwrite to the allocated host offset. Partial cluster writes with a backing file read the backing data first so map_cluster_for_write can perform COW. Signed-off-by: Anatol Belski --- block/src/qcow_async.rs | 77 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 74 insertions(+), 3 deletions(-) diff --git a/block/src/qcow_async.rs b/block/src/qcow_async.rs index e3bb399c0f..2239867c87 100644 --- a/block/src/qcow_async.rs +++ b/block/src/qcow_async.rs @@ -6,6 +6,7 @@ //! QCOW2 async disk backend. 
+use std::cmp::min; use std::collections::VecDeque; use std::fs::File; use std::io::Error; @@ -20,10 +21,12 @@ use crate::async_io::{AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, Disk use crate::disk_file; use crate::error::{BlockError, BlockErrorKind, BlockResult, ErrorOp}; use crate::qcow::backing::shared_backing_from; -use crate::qcow::metadata::{BackingRead, ClusterReadMapping, QcowMetadata}; +use crate::qcow::metadata::{BackingRead, ClusterReadMapping, ClusterWriteMapping, QcowMetadata}; use crate::qcow::qcow_raw_file::QcowRawFile; use crate::qcow::{MAX_NESTING_DEPTH, RawFile, parse_qcow}; -use crate::qcow_common::{pread_exact, scatter_to_iovecs, zero_fill_iovecs}; +use crate::qcow_common::{ + gather_from_iovecs, pread_exact, pwrite_all, scatter_to_iovecs, zero_fill_iovecs, +}; /// Device level handle for a QCOW2 image. /// @@ -218,13 +221,30 @@ impl AsyncIo for QcowAsync { Ok(()) } + // TODO Make writes async. + // Writes are synchronous. Async writes require a multi step + // state machine for COW (backing read, cluster allocation, data + // write, L2 commit) with per request buffer lifetime tracking + // and write ordering. fn write_vectored( &mut self, offset: libc::off_t, iovecs: &[libc::iovec], user_data: u64, ) -> AsyncIoResult<()> { - unimplemented!() + Self::cow_write_sync( + offset as u64, + iovecs, + &self.metadata, + &self.data_file, + &self.backing_file, + )?; + + let total_len: usize = iovecs.iter().map(|v| v.iov_len).sum(); + self.completion_list + .push_back((user_data, total_len as i32)); + self.eventfd.write(1).unwrap(); + Ok(()) } fn fsync(&mut self, user_data: Option) -> AsyncIoResult<()> { @@ -334,4 +354,55 @@ impl QcowAsync { } Ok(()) } + + /// Write iovec data cluster-by-cluster with COW from backing file. 
+ fn cow_write_sync( + address: u64, + iovecs: &[libc::iovec], + metadata: &QcowMetadata, + data_file: &QcowRawFile, + backing_file: &Option>, + ) -> AsyncIoResult<()> { + let total_len: usize = iovecs.iter().map(|v| v.iov_len).sum(); + let cluster_size = metadata.cluster_size(); + let mut buf_offset = 0usize; + + while buf_offset < total_len { + let curr_addr = address + buf_offset as u64; + let intra_offset = metadata.cluster_offset(curr_addr); + let remaining_in_cluster = (cluster_size - intra_offset) as usize; + let count = min(total_len - buf_offset, remaining_in_cluster); + + let backing_data = if let Some(backing) = backing_file + .as_ref() + .filter(|_| intra_offset != 0 || count < cluster_size as usize) + { + let cluster_begin = curr_addr - intra_offset; + let mut data = vec![0u8; cluster_size as usize]; + backing + .read_at(cluster_begin, &mut data) + .map_err(AsyncIoError::WriteVectored)?; + Some(data) + } else { + None + }; + + let mapping = metadata + .map_cluster_for_write(curr_addr, backing_data) + .map_err(AsyncIoError::WriteVectored)?; + + match mapping { + ClusterWriteMapping::Allocated { + offset: host_offset, + } => { + // SAFETY: iovecs point to valid guest memory buffers. + let buf = unsafe { gather_from_iovecs(iovecs, buf_offset, count) }; + pwrite_all(data_file.as_raw_fd(), &buf, host_offset) + .map_err(AsyncIoError::WriteVectored)?; + } + } + buf_offset += count; + } + Ok(()) + } } From bf70c19857dcdfd13b375b12fba202f30dcf0f70 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 20 Mar 2026 20:07:32 +0100 Subject: [PATCH 591/742] block: qcow_async: impl fsync for QcowAsync Flush dirty metadata caches and sync the underlying file via QcowMetadata::flush, then signal synthetic completion. 
Signed-off-by: Anatol Belski --- block/src/qcow_async.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/block/src/qcow_async.rs b/block/src/qcow_async.rs index 2239867c87..ad94f74afa 100644 --- a/block/src/qcow_async.rs +++ b/block/src/qcow_async.rs @@ -248,7 +248,12 @@ impl AsyncIo for QcowAsync { } fn fsync(&mut self, user_data: Option) -> AsyncIoResult<()> { - unimplemented!() + self.metadata.flush().map_err(AsyncIoError::Fsync)?; + if let Some(user_data) = user_data { + self.completion_list.push_back((user_data, 0)); + self.eventfd.write(1).unwrap(); + } + Ok(()) } fn next_completed_request(&mut self) -> Option<(u64, i32)> { From 261361c1b0741be2276b556336349ab738614813 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 20 Mar 2026 20:12:14 +0100 Subject: [PATCH 592/742] block: qcow_async: impl punch_hole and write_zeroes for QcowAsync Deallocate clusters through QcowMetadata::deallocate_bytes, then apply the resulting DeallocAction list (punch hole or write zeroes at host offsets). write_zeroes delegates to punch_hole since unallocated QCOW2 clusters inherently read as zero. 
Signed-off-by: Anatol Belski --- block/src/qcow_async.rs | 65 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 62 insertions(+), 3 deletions(-) diff --git a/block/src/qcow_async.rs b/block/src/qcow_async.rs index ad94f74afa..629ca8df46 100644 --- a/block/src/qcow_async.rs +++ b/block/src/qcow_async.rs @@ -16,12 +16,15 @@ use std::{fmt, io}; use io_uring::{IoUring, opcode, types}; use vmm_sys_util::eventfd::EventFd; +use vmm_sys_util::write_zeroes::{PunchHole, WriteZeroesAt}; use crate::async_io::{AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFileError}; use crate::disk_file; use crate::error::{BlockError, BlockErrorKind, BlockResult, ErrorOp}; use crate::qcow::backing::shared_backing_from; -use crate::qcow::metadata::{BackingRead, ClusterReadMapping, ClusterWriteMapping, QcowMetadata}; +use crate::qcow::metadata::{ + BackingRead, ClusterReadMapping, ClusterWriteMapping, DeallocAction, QcowMetadata, +}; use crate::qcow::qcow_raw_file::QcowRawFile; use crate::qcow::{MAX_NESTING_DEPTH, RawFile, parse_qcow}; use crate::qcow_common::{ @@ -172,6 +175,26 @@ impl QcowAsync { completion_list: VecDeque::new(), }) } + + fn apply_dealloc_action(&mut self, action: &DeallocAction) { + match action { + DeallocAction::PunchHole { + host_offset, + length, + } => { + let _ = self.data_file.file_mut().punch_hole(*host_offset, *length); + } + DeallocAction::WriteZeroes { + host_offset, + length, + } => { + let _ = self + .data_file + .file_mut() + .write_zeroes_at(*host_offset, *length); + } + } + } } impl AsyncIo for QcowAsync { @@ -266,11 +289,47 @@ impl AsyncIo for QcowAsync { } fn punch_hole(&mut self, offset: u64, length: u64, user_data: u64) -> AsyncIoResult<()> { - unimplemented!() + let virtual_size = self.metadata.virtual_size(); + let cluster_size = self.metadata.cluster_size(); + + let result = self + .metadata + .deallocate_bytes( + offset, + length as usize, + self.sparse, + virtual_size, + cluster_size, + self.backing_file.as_deref(), + ) + 
.map_err(AsyncIoError::PunchHole); + + match result { + Ok(actions) => { + for action in &actions { + self.apply_dealloc_action(action); + } + self.completion_list.push_back((user_data, 0)); + self.eventfd.write(1).unwrap(); + Ok(()) + } + Err(e) => { + let errno = if let AsyncIoError::PunchHole(ref io_err) = e { + -io_err.raw_os_error().unwrap_or(libc::EIO) + } else { + -libc::EIO + }; + self.completion_list.push_back((user_data, errno)); + self.eventfd.write(1).unwrap(); + Ok(()) + } + } } fn write_zeroes(&mut self, offset: u64, length: u64, user_data: u64) -> AsyncIoResult<()> { - unimplemented!() + // For QCOW2, zeroing and hole punching are the same operation. + // Both discard guest data so the range reads back as zero. + self.punch_hole(offset, length, user_data) } } From ffc579b913687bdd0ce9191517fe01f695eb7e4f Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 20 Mar 2026 20:17:01 +0100 Subject: [PATCH 593/742] block: qcow_async: impl AsyncDiskFile for QcowDiskAsync try_clone shares the Arc wrapped metadata and backing file. new_async_io creates a QcowAsync worker with its own io_uring instance for the given ring depth. 
Signed-off-by: Anatol Belski --- block/src/qcow_async.rs | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/block/src/qcow_async.rs b/block/src/qcow_async.rs index 629ca8df46..b5720bdca8 100644 --- a/block/src/qcow_async.rs +++ b/block/src/qcow_async.rs @@ -134,6 +134,30 @@ impl disk_file::Resizable for QcowDiskAsync { impl disk_file::DiskFile for QcowDiskAsync {} +impl disk_file::AsyncDiskFile for QcowDiskAsync { + fn try_clone(&self) -> BlockResult> { + Ok(Box::new(QcowDiskAsync { + metadata: Arc::clone(&self.metadata), + backing_file: self.backing_file.as_ref().map(Arc::clone), + sparse: self.sparse, + data_raw_file: self.data_raw_file.clone(), + })) + } + + fn new_async_io(&self, ring_depth: u32) -> BlockResult> { + Ok(Box::new( + QcowAsync::new( + Arc::clone(&self.metadata), + self.data_raw_file.clone(), + self.backing_file.as_ref().map(Arc::clone), + self.sparse, + ring_depth, + ) + .map_err(|e| BlockError::new(BlockErrorKind::Io, DiskFileError::NewAsyncIo(e)))?, + )) + } +} + /// Per queue QCOW2 I/O worker using io_uring. /// /// Reads against fully allocated single mapping clusters are submitted From 3d5a40dfa68e4717044a93cc07c51b072d98d38f Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 20 Mar 2026 20:30:53 +0100 Subject: [PATCH 594/742] vmm: device_manager: Wire up QcowDiskAsync with io_uring When io_uring is available and not disabled, open QCOW2 images with QcowDiskAsync for asynchronous reads. Falls back to QcowDiskSync otherwise. 
Signed-off-by: Anatol Belski --- vmm/src/device_manager.rs | 60 ++++++++++++++++++++++++++++++--------- 1 file changed, 46 insertions(+), 14 deletions(-) diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 91d9509f38..9522be53d2 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -35,6 +35,8 @@ use arch::{NumaNodes, layout}; use block::disk_file::DiskBackend; use block::error::BlockError; use block::fixed_vhd_sync::FixedVhdDiskSync; +#[cfg(feature = "io_uring")] +use block::qcow_async::QcowDiskAsync; use block::qcow_sync::QcowDiskSync; use block::raw_async_aio::RawFileDiskAio; use block::raw_sync::RawFileDiskSync; @@ -591,6 +593,10 @@ pub enum DeviceManagerError { #[error("Failed to create QcowDiskSync")] CreateQcowDiskSync(#[source] BlockError), + /// Failed to create QcowDiskAsync + #[error("Failed to create QcowDiskAsync")] + CreateQcowDiskAsync(#[source] BlockError), + /// Failed to create FixedVhdxDiskSync #[error("Failed to create FixedVhdxDiskSync")] CreateFixedVhdxDiskSync(#[source] BlockError), @@ -2801,20 +2807,46 @@ impl DeviceManager { } } ImageType::Qcow2 => { - info!("Using synchronous QCOW2 disk file"); - DiskBackend::Next(Box::new( - QcowDiskSync::new( - file, - disk_cfg.direct, - disk_cfg.backing_files, - disk_cfg.sparse, - ) - .map_err(|e| match &disk_cfg.path { - Some(p) => e.with_path(p), - None => e, - }) - .map_err(DeviceManagerError::CreateQcowDiskSync)?, - )) + if cfg!(feature = "io_uring") + && !disk_cfg.disable_io_uring + && self.io_uring_is_supported() + { + info!("Using asynchronous QCOW2 disk file (io_uring)"); + + #[cfg(not(feature = "io_uring"))] + unreachable!("Checked in if statement above"); + #[cfg(feature = "io_uring")] + { + DiskBackend::Next(Box::new( + QcowDiskAsync::new( + file, + disk_cfg.direct, + disk_cfg.backing_files, + disk_cfg.sparse, + ) + .map_err(|e| match &disk_cfg.path { + Some(p) => e.with_path(p), + None => e, + }) + .map_err(DeviceManagerError::CreateQcowDiskAsync)?, + 
)) + } + } else { + info!("Using synchronous QCOW2 disk file"); + DiskBackend::Next(Box::new( + QcowDiskSync::new( + file, + disk_cfg.direct, + disk_cfg.backing_files, + disk_cfg.sparse, + ) + .map_err(|e| match &disk_cfg.path { + Some(p) => e.with_path(p), + None => e, + }) + .map_err(DeviceManagerError::CreateQcowDiskSync)?, + )) + } } ImageType::Vhdx => { info!("Using synchronous VHDX disk file"); From b1d126fdbff5a7d2b843a1d41e489bf68f15f696 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Sat, 21 Mar 2026 17:22:30 +0100 Subject: [PATCH 595/742] block: qcow: Implement batch request submission Implement batch_requests_enabled() and submit_batch_requests() for QcowAsync. Without batching, each read_vectored call performs its own io_uring submit() syscall. With batching, the virtio queue handler collects all pending requests and submits them in a single call, pushing multiple SQEs before one submit() syscall. Each request in the batch is classified through the metadata layer. Requests that hit the fast path (single allocated cluster mapping) are pushed to the io_uring submission queue. Requests that require the slow path (compressed, backing, zero fill, or mixed mappings) are completed synchronously and queued as synthetic completions. 
Signed-off-by: Anatol Belski --- block/src/qcow_async.rs | 80 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 79 insertions(+), 1 deletion(-) diff --git a/block/src/qcow_async.rs b/block/src/qcow_async.rs index b5720bdca8..11e7554618 100644 --- a/block/src/qcow_async.rs +++ b/block/src/qcow_async.rs @@ -19,7 +19,6 @@ use vmm_sys_util::eventfd::EventFd; use vmm_sys_util::write_zeroes::{PunchHole, WriteZeroesAt}; use crate::async_io::{AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFileError}; -use crate::disk_file; use crate::error::{BlockError, BlockErrorKind, BlockResult, ErrorOp}; use crate::qcow::backing::shared_backing_from; use crate::qcow::metadata::{ @@ -30,6 +29,7 @@ use crate::qcow::{MAX_NESTING_DEPTH, RawFile, parse_qcow}; use crate::qcow_common::{ gather_from_iovecs, pread_exact, pwrite_all, scatter_to_iovecs, zero_fill_iovecs, }; +use crate::{BatchRequest, RequestType, disk_file}; /// Device level handle for a QCOW2 image. /// @@ -355,6 +355,84 @@ impl AsyncIo for QcowAsync { // Both discard guest data so the range reads back as zero. self.punch_hole(offset, length, user_data) } + + fn batch_requests_enabled(&self) -> bool { + true + } + + fn submit_batch_requests(&mut self, batch_request: &[BatchRequest]) -> AsyncIoResult<()> { + let (submitter, mut sq, _) = self.io_uring.split(); + let mut needs_submit = false; + let mut sync_completions: Vec<(u64, i32)> = Vec::new(); + + for req in batch_request { + match req.request_type { + RequestType::In => { + let total_len: usize = req.iovecs.iter().map(|v| v.iov_len).sum(); + + if let Some(host_offset) = Self::resolve_read( + &self.metadata, + &self.data_file, + &self.backing_file, + req.offset as u64, + &req.iovecs, + total_len, + )? { + let fd = self.data_file.as_raw_fd(); + // SAFETY: fd is valid and iovecs point to valid guest memory. 
+ unsafe { + sq.push( + &opcode::Readv::new( + types::Fd(fd), + req.iovecs.as_ptr(), + req.iovecs.len() as u32, + ) + .offset(host_offset) + .build() + .user_data(req.user_data), + ) + .map_err(|_| { + AsyncIoError::ReadVectored(Error::other("Submission queue is full")) + })?; + } + needs_submit = true; + } else { + sync_completions.push((req.user_data, total_len as i32)); + } + } + RequestType::Out => { + let total_len: usize = req.iovecs.iter().map(|v| v.iov_len).sum(); + Self::cow_write_sync( + req.offset as u64, + &req.iovecs, + &self.metadata, + &self.data_file, + &self.backing_file, + )?; + sync_completions.push((req.user_data, total_len as i32)); + } + _ => { + unreachable!("Unexpected batch request type: {:?}", req.request_type) + } + } + } + + if needs_submit { + sq.sync(); + submitter + .submit() + .map_err(AsyncIoError::SubmitBatchRequests)?; + } + + if !sync_completions.is_empty() { + for c in sync_completions { + self.completion_list.push_back(c); + } + self.eventfd.write(1).unwrap(); + } + + Ok(()) + } } impl QcowAsync { From 8ffbe9d6feec43e4c06f567133e800bd071165bf Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Mon, 30 Mar 2026 22:40:56 +0200 Subject: [PATCH 596/742] block: qcow: Test async punch hole Add a QcowAsync unit test for punch hole completion. The test verifies that a punch hole request reports successful completion and that the deallocated range reads back as zeroes. 
Signed-off-by: Anatol Belski --- block/src/qcow_async.rs | 85 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/block/src/qcow_async.rs b/block/src/qcow_async.rs index 11e7554618..74cf9abef6 100644 --- a/block/src/qcow_async.rs +++ b/block/src/qcow_async.rs @@ -572,3 +572,88 @@ impl QcowAsync { Ok(()) } } + +#[cfg(test)] +mod unit_tests { + use std::io::{Seek, SeekFrom, Write}; + + use vmm_sys_util::tempfile::TempFile; + + use super::*; + use crate::disk_file::AsyncDiskFile; + use crate::qcow::{QcowFile, RawFile}; + + fn create_disk_with_data( + file_size: u64, + data: &[u8], + offset: u64, + sparse: bool, + ) -> (TempFile, QcowDiskAsync) { + let temp_file = TempFile::new().unwrap(); + { + let raw_file = RawFile::new(temp_file.as_file().try_clone().unwrap(), false); + let mut qcow_file = QcowFile::new(raw_file, 3, file_size, sparse).unwrap(); + qcow_file.seek(SeekFrom::Start(offset)).unwrap(); + qcow_file.write_all(data).unwrap(); + qcow_file.flush().unwrap(); + } + let disk = QcowDiskAsync::new( + temp_file.as_file().try_clone().unwrap(), + false, + false, + sparse, + ) + .unwrap(); + (temp_file, disk) + } + + fn wait_for_completion(async_io: &mut dyn AsyncIo) -> (u64, i32) { + loop { + if let Some(c) = async_io.next_completed_request() { + return c; + } + let fd = async_io.notifier().as_raw_fd(); + let mut val = 0u64; + // SAFETY: reading 8 bytes from a valid eventfd. 
+ unsafe { + libc::read(fd, &mut val as *mut u64 as *mut libc::c_void, 8); + } + } + } + + fn async_read(disk: &QcowDiskAsync, offset: u64, len: usize) -> Vec { + let mut async_io = disk.new_async_io(1).unwrap(); + let mut buf = vec![0xFFu8; len]; + let iovec = libc::iovec { + iov_base: buf.as_mut_ptr() as *mut libc::c_void, + iov_len: buf.len(), + }; + async_io + .read_vectored(offset as libc::off_t, &[iovec], 1) + .unwrap(); + let (user_data, result) = wait_for_completion(async_io.as_mut()); + assert_eq!(user_data, 1); + assert_eq!(result as usize, len, "read should return requested length"); + buf + } + + #[test] + fn test_qcow_async_punch_hole_completion() { + let data = vec![0xDD; 128 * 1024]; + let offset = 0u64; + let (_temp, disk) = create_disk_with_data(100 * 1024 * 1024, &data, offset, true); + + let mut async_io = disk.new_async_io(1).unwrap(); + async_io.punch_hole(offset, data.len() as u64, 100).unwrap(); + let (user_data, result) = async_io.next_completed_request().unwrap(); + assert_eq!(user_data, 100); + assert_eq!(result, 0, "punch_hole should succeed"); + drop(async_io); + + let read_buf = async_read(&disk, offset, data.len()); + assert!( + read_buf.iter().all(|&b| b == 0), + "Punched hole should read as zeros" + ); + } +} From e522d3a0aa41138dc4dd7b342682676488f3e46b Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Mon, 30 Mar 2026 22:43:01 +0200 Subject: [PATCH 597/742] block: qcow: Test async write zeroes Add a QcowAsync unit test for write zeroes completion. The test verifies that a write zeroes request reports successful completion and that the zeroed range reads back as zeroes. 
Signed-off-by: Anatol Belski --- block/src/qcow_async.rs | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/block/src/qcow_async.rs b/block/src/qcow_async.rs index 74cf9abef6..a01acf604f 100644 --- a/block/src/qcow_async.rs +++ b/block/src/qcow_async.rs @@ -656,4 +656,26 @@ mod unit_tests { "Punched hole should read as zeros" ); } + + #[test] + fn test_qcow_async_write_zeroes_completion() { + let data = vec![0xAA; 128 * 1024]; + let offset = 0u64; + let (_temp, disk) = create_disk_with_data(100 * 1024 * 1024, &data, offset, true); + + let mut async_io = disk.new_async_io(1).unwrap(); + async_io + .write_zeroes(offset, data.len() as u64, 200) + .unwrap(); + let (user_data, result) = async_io.next_completed_request().unwrap(); + assert_eq!(user_data, 200); + assert_eq!(result, 0, "write_zeroes should succeed"); + drop(async_io); + + let read_buf = async_read(&disk, offset, data.len()); + assert!( + read_buf.iter().all(|&b| b == 0), + "Write zeroes region should read as zeros" + ); + } } From 1e66c144f4950e7f3c1999f42cd49c8f7b649414 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Mon, 30 Mar 2026 22:55:45 +0200 Subject: [PATCH 598/742] block: qcow: Test async write and read roundtrip Add a QcowAsync unit test that writes a byte pattern through write_vectored, reads it back through read_vectored, and verifies the data matches. This exercises the core async write and read paths end to end. Also add an async_write helper for future tests. Signed-off-by: Anatol Belski --- block/src/qcow_async.rs | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/block/src/qcow_async.rs b/block/src/qcow_async.rs index a01acf604f..31ad2584d6 100644 --- a/block/src/qcow_async.rs +++ b/block/src/qcow_async.rs @@ -612,6 +612,7 @@ mod unit_tests { if let Some(c) = async_io.next_completed_request() { return c; } + // Block until the eventfd is signaled (io_uring or synthetic). 
let fd = async_io.notifier().as_raw_fd(); let mut val = 0u64; // SAFETY: reading 8 bytes from a valid eventfd. @@ -621,6 +622,24 @@ mod unit_tests { } } + fn async_write(disk: &QcowDiskAsync, offset: u64, data: &[u8]) { + let mut async_io = disk.new_async_io(1).unwrap(); + let iovec = libc::iovec { + iov_base: data.as_ptr() as *mut libc::c_void, + iov_len: data.len(), + }; + async_io + .write_vectored(offset as libc::off_t, &[iovec], 2) + .unwrap(); + let (user_data, result) = wait_for_completion(async_io.as_mut()); + assert_eq!(user_data, 2); + assert_eq!( + result as usize, + data.len(), + "write should return requested length" + ); + } + fn async_read(disk: &QcowDiskAsync, offset: u64, len: usize) -> Vec { let mut async_io = disk.new_async_io(1).unwrap(); let mut buf = vec![0xFFu8; len]; @@ -678,4 +697,23 @@ mod unit_tests { "Write zeroes region should read as zeros" ); } + + #[test] + fn test_qcow_async_write_read_roundtrip() { + let file_size = 100 * 1024 * 1024; + let temp_file = TempFile::new().unwrap(); + { + let raw_file = RawFile::new(temp_file.as_file().try_clone().unwrap(), false); + QcowFile::new(raw_file, 3, file_size, true).unwrap(); + } + let disk = QcowDiskAsync::new(temp_file.as_file().try_clone().unwrap(), false, false, true) + .unwrap(); + + let pattern: Vec = (0..128 * 1024).map(|i| (i % 251) as u8).collect(); + let offset = 64 * 1024; + + async_write(&disk, offset, &pattern); + let read_buf = async_read(&disk, offset, pattern.len()); + assert_eq!(read_buf, pattern, "read should match written data"); + } } From 093922ff24abcdb86375b2d6f00299b4e48bb417 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Mon, 30 Mar 2026 22:59:45 +0200 Subject: [PATCH 599/742] block: qcow: Test async read spanning cluster boundary Add a QcowAsync unit test that writes distinct patterns into two adjacent clusters, then issues a single read spanning the cluster boundary. Verifies that multi mapping read resolution returns the correct data from both clusters. 
Signed-off-by: Anatol Belski --- block/src/qcow_async.rs | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/block/src/qcow_async.rs b/block/src/qcow_async.rs index 31ad2584d6..c26fdaea2b 100644 --- a/block/src/qcow_async.rs +++ b/block/src/qcow_async.rs @@ -716,4 +716,30 @@ mod unit_tests { let read_buf = async_read(&disk, offset, pattern.len()); assert_eq!(read_buf, pattern, "read should match written data"); } + + #[test] + fn test_qcow_async_read_spanning_cluster_boundary() { + let cluster_size: u64 = 65536; + let file_size = 100 * 1024 * 1024; + + // Write distinct patterns into two adjacent clusters. + let pattern_a = vec![0xAA; cluster_size as usize]; + let pattern_b = vec![0xBB; cluster_size as usize]; + let (_temp, disk) = create_disk_with_data(file_size, &pattern_a, 0, true); + async_write(&disk, cluster_size, &pattern_b); + + // Read across the boundary: last 4K of cluster 0 + first 4K of cluster 1. + let read_offset = cluster_size - 4096; + let read_len = 8192; + let buf = async_read(&disk, read_offset, read_len); + + assert!( + buf[..4096].iter().all(|&b| b == 0xAA), + "first half should come from cluster 0" + ); + assert!( + buf[4096..].iter().all(|&b| b == 0xBB), + "second half should come from cluster 1" + ); + } } From 34a9f7246dd3778d4f9ad2797ad5e2a7447ad86a Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Mon, 30 Mar 2026 23:04:56 +0200 Subject: [PATCH 600/742] block: qcow: Test async batch mixed requests Add a QcowAsync unit test that submits a batch of interleaved write and read requests via submit_batch_requests. Verifies that all completions arrive with the correct user_data and that the read back data matches the written data. 
Signed-off-by: Anatol Belski --- block/src/qcow_async.rs | 97 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/block/src/qcow_async.rs b/block/src/qcow_async.rs index c26fdaea2b..4501502a92 100644 --- a/block/src/qcow_async.rs +++ b/block/src/qcow_async.rs @@ -582,6 +582,7 @@ mod unit_tests { use super::*; use crate::disk_file::AsyncDiskFile; use crate::qcow::{QcowFile, RawFile}; + use crate::{BatchRequest, RequestType}; fn create_disk_with_data( file_size: u64, @@ -742,4 +743,100 @@ mod unit_tests { "second half should come from cluster 1" ); } + + #[test] + fn test_qcow_async_batch_mixed_requests() { + let file_size = 100 * 1024 * 1024; + let temp_file = TempFile::new().unwrap(); + { + let raw_file = RawFile::new(temp_file.as_file().try_clone().unwrap(), false); + QcowFile::new(raw_file, 3, file_size, true).unwrap(); + } + let disk = QcowDiskAsync::new(temp_file.as_file().try_clone().unwrap(), false, false, true) + .unwrap(); + + let mut async_io = disk.new_async_io(8).unwrap(); + + // Prepare write data for two regions. 
+ let write_a = vec![0xAA; 4096]; + let write_b = vec![0xBB; 4096]; + let offset_a: u64 = 0; + let offset_b: u64 = 65536; + + let iov_a = libc::iovec { + iov_base: write_a.as_ptr() as *mut libc::c_void, + iov_len: write_a.len(), + }; + let iov_b = libc::iovec { + iov_base: write_b.as_ptr() as *mut libc::c_void, + iov_len: write_b.len(), + }; + + let batch = vec![ + BatchRequest { + offset: offset_a as libc::off_t, + iovecs: smallvec::smallvec![iov_a], + user_data: 10, + request_type: RequestType::Out, + }, + BatchRequest { + offset: offset_b as libc::off_t, + iovecs: smallvec::smallvec![iov_b], + user_data: 20, + request_type: RequestType::Out, + }, + ]; + + async_io.submit_batch_requests(&batch).unwrap(); + + let mut completions = [ + wait_for_completion(async_io.as_mut()), + wait_for_completion(async_io.as_mut()), + ]; + completions.sort_by_key(|c| c.0); + assert_eq!(completions[0], (10, 4096)); + assert_eq!(completions[1], (20, 4096)); + drop(async_io); + + // Batch read both regions back. 
+ let mut read_a = vec![0u8; 4096]; + let mut read_b = vec![0u8; 4096]; + let riov_a = libc::iovec { + iov_base: read_a.as_mut_ptr() as *mut libc::c_void, + iov_len: read_a.len(), + }; + let riov_b = libc::iovec { + iov_base: read_b.as_mut_ptr() as *mut libc::c_void, + iov_len: read_b.len(), + }; + + let mut async_io = disk.new_async_io(8).unwrap(); + let read_batch = vec![ + BatchRequest { + offset: offset_a as libc::off_t, + iovecs: smallvec::smallvec![riov_a], + user_data: 30, + request_type: RequestType::In, + }, + BatchRequest { + offset: offset_b as libc::off_t, + iovecs: smallvec::smallvec![riov_b], + user_data: 40, + request_type: RequestType::In, + }, + ]; + + async_io.submit_batch_requests(&read_batch).unwrap(); + + let mut completions = [ + wait_for_completion(async_io.as_mut()), + wait_for_completion(async_io.as_mut()), + ]; + completions.sort_by_key(|c| c.0); + assert_eq!(completions[0], (30, 4096)); + assert_eq!(completions[1], (40, 4096)); + + assert_eq!(read_a, write_a, "batch read A should match written data"); + assert_eq!(read_b, write_b, "batch read B should match written data"); + } } From 6208f3caee063ec1f813d237cd39934b365575bb Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Mon, 30 Mar 2026 23:16:36 +0200 Subject: [PATCH 601/742] block: qcow: Test async read of unallocated region Add a QcowAsync unit test that reads from a range that was never written. Verifies the fundamental QCOW contract that unallocated clusters return zeroes. 
Signed-off-by: Anatol Belski --- block/src/qcow_async.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/block/src/qcow_async.rs b/block/src/qcow_async.rs index 4501502a92..bdc4fe341a 100644 --- a/block/src/qcow_async.rs +++ b/block/src/qcow_async.rs @@ -839,4 +839,22 @@ mod unit_tests { assert_eq!(read_a, write_a, "batch read A should match written data"); assert_eq!(read_b, write_b, "batch read B should match written data"); } + + #[test] + fn test_qcow_async_read_unallocated() { + let file_size = 100 * 1024 * 1024; + let temp_file = TempFile::new().unwrap(); + { + let raw_file = RawFile::new(temp_file.as_file().try_clone().unwrap(), false); + QcowFile::new(raw_file, 3, file_size, true).unwrap(); + } + let disk = QcowDiskAsync::new(temp_file.as_file().try_clone().unwrap(), false, false, true) + .unwrap(); + + let buf = async_read(&disk, 0, 128 * 1024); + assert!( + buf.iter().all(|&b| b == 0), + "unallocated region should read as zeroes" + ); + } } From 48155c4151816ff57aee0bc615aea876b00b3c0b Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Mon, 30 Mar 2026 23:36:27 +0200 Subject: [PATCH 602/742] block: qcow: Test async sub cluster write Add a QcowAsync unit test that writes 4K into the middle of a cluster, then reads the entire cluster back. Verifies that the written region matches and surrounding bytes remain zero. This exercises the COW path where unwritten parts of a newly allocated cluster must be zero filled. 
Signed-off-by: Anatol Belski --- block/src/qcow_async.rs | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/block/src/qcow_async.rs b/block/src/qcow_async.rs index bdc4fe341a..9063fd7d58 100644 --- a/block/src/qcow_async.rs +++ b/block/src/qcow_async.rs @@ -857,4 +857,42 @@ mod unit_tests { "unallocated region should read as zeroes" ); } + + #[test] + fn test_qcow_async_sub_cluster_write() { + let cluster_size = 65536usize; + let file_size = 100 * 1024 * 1024; + let temp_file = TempFile::new().unwrap(); + { + let raw_file = RawFile::new(temp_file.as_file().try_clone().unwrap(), false); + QcowFile::new(raw_file, 3, file_size, true).unwrap(); + } + let disk = QcowDiskAsync::new(temp_file.as_file().try_clone().unwrap(), false, false, true) + .unwrap(); + + // Write 4K into the middle of a cluster. + let write_offset = 4096u64; + let write_len = 4096; + let pattern = vec![0xCC; write_len]; + async_write(&disk, write_offset, &pattern); + + // Read the entire cluster back. + let buf = async_read(&disk, 0, cluster_size); + + assert!( + buf[..write_offset as usize].iter().all(|&b| b == 0), + "bytes before the write should be zero" + ); + assert_eq!( + &buf[write_offset as usize..write_offset as usize + write_len], + &pattern[..], + "written region should match" + ); + assert!( + buf[write_offset as usize + write_len..] + .iter() + .all(|&b| b == 0), + "bytes after the write should be zero" + ); + } } From 072b3a85d243748202b6be9193978a15ca85ea28 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Mon, 30 Mar 2026 23:45:12 +0200 Subject: [PATCH 603/742] block: qcow_async: Add write after punch hole test Write data, punch hole to deallocate, then rewrite the same range and verify the new contents read back correctly. 
Signed-off-by: Anatol Belski --- block/src/qcow_async.rs | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/block/src/qcow_async.rs b/block/src/qcow_async.rs index 9063fd7d58..daf8d1634c 100644 --- a/block/src/qcow_async.rs +++ b/block/src/qcow_async.rs @@ -895,4 +895,32 @@ mod unit_tests { "bytes after the write should be zero" ); } + + #[test] + fn test_qcow_async_write_after_punch_hole() { + let data = vec![0xAA; 64 * 1024]; + let offset = 0u64; + let (_temp, disk) = create_disk_with_data(100 * 1024 * 1024, &data, offset, true); + + let buf = async_read(&disk, offset, data.len()); + assert!(buf.iter().all(|&b| b == 0xAA)); + + let mut async_io = disk.new_async_io(1).unwrap(); + async_io.punch_hole(offset, data.len() as u64, 10).unwrap(); + let (_, result) = wait_for_completion(async_io.as_mut()); + assert_eq!(result, 0); + drop(async_io); + + let buf = async_read(&disk, offset, data.len()); + assert!( + buf.iter().all(|&b| b == 0), + "should be zero after punch hole" + ); + + let new_data = vec![0xBB; 64 * 1024]; + async_write(&disk, offset, &new_data); + + let buf = async_read(&disk, offset, new_data.len()); + assert_eq!(buf, new_data, "should read new data after rewrite"); + } } From bb833a90e5a26b4bc086fe9539be81b123d5b5fd Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Mon, 30 Mar 2026 23:51:03 +0200 Subject: [PATCH 604/742] block: qcow_async: Add large sequential I/O test Write a distinct byte pattern into each of eight consecutive clusters in a single operation, then read the full range back and verify per cluster contents. 
Signed-off-by: Anatol Belski --- block/src/qcow_async.rs | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/block/src/qcow_async.rs b/block/src/qcow_async.rs index daf8d1634c..d58b1b1b75 100644 --- a/block/src/qcow_async.rs +++ b/block/src/qcow_async.rs @@ -923,4 +923,28 @@ mod unit_tests { let buf = async_read(&disk, offset, new_data.len()); assert_eq!(buf, new_data, "should read new data after rewrite"); } + + #[test] + fn test_qcow_async_large_sequential_io() { + let cluster_size = 64 * 1024; + let num_clusters = 8; + let total_len = cluster_size * num_clusters; + let offset = 0u64; + + let mut data = vec![0u8; total_len]; + for (i, chunk) in data.chunks_mut(cluster_size).enumerate() { + chunk.fill((i + 1) as u8); + } + + let (_temp, disk) = create_disk_with_data(100 * 1024 * 1024, &data, offset, true); + + let buf = async_read(&disk, offset, total_len); + assert_eq!(buf.len(), total_len); + for (i, chunk) in buf.chunks(cluster_size).enumerate() { + assert!( + chunk.iter().all(|&b| b == (i + 1) as u8), + "cluster {i} mismatch" + ); + } + } } From d4fc1d38c8e978f420e7a26f5224840131ec0cf7 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Tue, 31 Mar 2026 00:56:26 +0200 Subject: [PATCH 605/742] scripts: dev_cli: Allow io_uring syscalls in unit tests The unit test container runs with Docker default seccomp profile which blocks io_uring_setup, io_uring_enter and io_uring_register. This causes all qcow_async unit tests to fail with EPERM when creating an io_uring instance. Add --security-opt seccomp=unconfined to the unit test docker run invocation. The container already has --device access and cap_net_admin, so this does not materially change the security posture. 
Signed-off-by: Anatol Belski --- scripts/dev_cli.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/dev_cli.sh b/scripts/dev_cli.sh index 0c0a2d5e4f..6e9d58eb7b 100755 --- a/scripts/dev_cli.sh +++ b/scripts/dev_cli.sh @@ -438,6 +438,7 @@ cmd_tests() { --device $exported_device \ --device /dev/net/tun \ --cap-add net_admin \ + --security-opt seccomp=unconfined \ --volume "$CLH_ROOT_DIR:$CTR_CLH_ROOT_DIR" \ ${exported_volumes:+$exported_volumes} \ --env BUILD_TARGET="$target" \ From a6d3901f3e1d4077703e04f6d32f771da0ae60de Mon Sep 17 00:00:00 2001 From: Dylan Reid Date: Tue, 7 Apr 2026 10:43:57 -0700 Subject: [PATCH 606/742] misc: return errors from IOMMU address translation instead of panicking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The address that is passed from the guest should be treated as untrusted. Currently an invalid address will panic the VMM. This only allows the guest to hurt itself, but we shouldn't have the VMM crashing. Instead let's return an error if possible or invalidate the queue if it happen during setup. The data flow from guest to translate_gva/translate_gpa is: 1. Guest writes a raw u64 address into a virtio descriptor in the shared descriptor table (guest memory). 2. The virtio-queue crate reads this descriptor via read_obj() and returns the addr field as-is in a GuestAddress — no validation. 3. Device code calls .translate_gva(access_platform, len) on the GuestAddress. 4. With IOMMU (access_platform is Some): the address is an IOVA that must be translated to a GPA via the IOMMU mapping table. If the guest provides an unmapped IOVA, translation returns Err. Previously, .unwrap() here panicked the VMM. 5. Without IOMMU (access_platform is None): translate_gva is a no-op (returns self). The raw address flows to GuestMemory::read_obj() which validates it — out-of-range addresses return Err(InvalidGuestAddress), so no host memory corruption is possible. 
Signed-off-by: Dylan Reid --- block/src/lib.rs | 9 ++-- net_util/src/ctrl_queue.rs | 9 ++-- net_util/src/queue_pair.rs | 15 ++++-- virtio-devices/src/console.rs | 14 +++-- virtio-devices/src/pmem.rs | 6 ++- virtio-devices/src/rng.rs | 5 +- .../src/transport/pci_common_config.rs | 33 +++++++++--- virtio-devices/src/vdpa.rs | 35 ++++++++----- virtio-devices/src/vsock/packet.rs | 31 +++++++---- vm-virtio/src/lib.rs | 52 ++++++++++++++----- 10 files changed, 153 insertions(+), 56 deletions(-) diff --git a/block/src/lib.rs b/block/src/lib.rs index 811ee974ef..7a55cc4498 100644 --- a/block/src/lib.rs +++ b/block/src/lib.rs @@ -312,7 +312,8 @@ impl Request { let hdr_desc_addr = hdr_desc .addr() - .translate_gva(access_platform, hdr_desc.len() as usize); + .translate_gva(access_platform, hdr_desc.len() as usize) + .map_err(|e| Error::GuestMemory(GuestMemoryError::IOError(e)))?; let mut req = Request { request_type: request_type(desc_chain.memory(), hdr_desc_addr)?, @@ -353,7 +354,8 @@ impl Request { req.data_descriptors.push(( desc.addr() - .translate_gva(access_platform, desc.len() as usize), + .translate_gva(access_platform, desc.len() as usize) + .map_err(|e| Error::GuestMemory(GuestMemoryError::IOError(e)))?, desc.len(), )); desc = desc_chain @@ -384,7 +386,8 @@ impl Request { req.status_addr = status_desc .addr() - .translate_gva(access_platform, status_desc.len() as usize); + .translate_gva(access_platform, status_desc.len() as usize) + .map_err(|e| Error::GuestMemory(GuestMemoryError::IOError(e)))?; Ok(req) } diff --git a/net_util/src/ctrl_queue.rs b/net_util/src/ctrl_queue.rs index e42b4c0ca5..b14b380364 100644 --- a/net_util/src/ctrl_queue.rs +++ b/net_util/src/ctrl_queue.rs @@ -100,14 +100,16 @@ impl CtrlQueue { .read_obj( ctrl_desc .addr() - .translate_gva(access_platform, ctrl_desc.len() as usize), + .translate_gva(access_platform, ctrl_desc.len() as usize) + .map_err(|e| Error::GuestMemory(GuestMemoryError::IOError(e)))?, ) .map_err(Error::GuestMemory)?; let 
data_desc = desc_chain.next().ok_or(Error::NoDataDescriptor)?; let data_desc_addr = data_desc .addr() - .translate_gva(access_platform, data_desc.len() as usize); + .translate_gva(access_platform, data_desc.len() as usize) + .map_err(|e| Error::GuestMemory(GuestMemoryError::IOError(e)))?; let status_desc = desc_chain.next().ok_or(Error::NoStatusDescriptor)?; @@ -168,7 +170,8 @@ impl CtrlQueue { if ok { VIRTIO_NET_OK } else { VIRTIO_NET_ERR } as u8, status_desc .addr() - .translate_gva(access_platform, status_desc.len() as usize), + .translate_gva(access_platform, status_desc.len() as usize) + .map_err(|e| Error::GuestMemory(GuestMemoryError::IOError(e)))?, ) .map_err(Error::GuestMemory)?; // Per virtio spec 2.6.8, used_len is the number of bytes written diff --git a/net_util/src/queue_pair.rs b/net_util/src/queue_pair.rs index c0b8825e71..a569031815 100644 --- a/net_util/src/queue_pair.rs +++ b/net_util/src/queue_pair.rs @@ -69,7 +69,10 @@ impl TxVirtio { while let Some(desc) = next_desc { let desc_addr = desc .addr() - .translate_gva(access_platform, desc.len() as usize); + .translate_gva(access_platform, desc.len() as usize) + .map_err(|e| { + NetQueuePairError::GuestMemory(vm_memory::GuestMemoryError::IOError(e)) + })?; if !desc.is_write_only() && desc.len() > 0 { let buf = desc_chain .memory() @@ -207,7 +210,10 @@ impl RxVirtio { .memory() .checked_offset( desc.addr() - .translate_gva(access_platform, desc.len() as usize), + .translate_gva(access_platform, desc.len() as usize) + .map_err(|e| { + NetQueuePairError::GuestMemory(vm_memory::GuestMemoryError::IOError(e)) + })?, 10, ) .ok_or(NetQueuePairError::DescriptorInvalidHeader)?; @@ -217,7 +223,10 @@ impl RxVirtio { while let Some(desc) = next_desc { let desc_addr = desc .addr() - .translate_gva(access_platform, desc.len() as usize); + .translate_gva(access_platform, desc.len() as usize) + .map_err(|e| { + NetQueuePairError::GuestMemory(vm_memory::GuestMemoryError::IOError(e)) + })?; if desc.is_write_only() && 
desc.len() > 0 { let buf = desc_chain .memory() diff --git a/virtio-devices/src/console.rs b/virtio-devices/src/console.rs index 96282b5228..d2d57b9e50 100644 --- a/virtio-devices/src/console.rs +++ b/virtio-devices/src/console.rs @@ -221,7 +221,10 @@ impl ConsoleEpollHandler { .write_slice( &source_slice[..], desc.addr() - .translate_gva(self.access_platform.as_deref(), desc.len() as usize), + .translate_gva(self.access_platform.as_deref(), desc.len() as usize) + .map_err(|e| { + Error::GuestMemoryWrite(vm_memory::GuestMemoryError::IOError(e)) + })?, ) .map_err(Error::GuestMemoryWrite)?; @@ -259,10 +262,11 @@ impl ConsoleEpollHandler { desc_chain .memory() .write_volatile_to( - desc.addr().translate_gva( - self.access_platform.as_deref(), - desc.len() as usize, - ), + desc.addr() + .translate_gva(self.access_platform.as_deref(), desc.len() as usize) + .map_err(|e| { + Error::GuestMemoryRead(vm_memory::GuestMemoryError::IOError(e)) + })?, &mut buf, desc.len() as usize, ) diff --git a/virtio-devices/src/pmem.rs b/virtio-devices/src/pmem.rs index fd995747c2..3abec1c0f0 100644 --- a/virtio-devices/src/pmem.rs +++ b/virtio-devices/src/pmem.rs @@ -123,7 +123,8 @@ impl Request { .memory() .read_obj( desc.addr() - .translate_gva(access_platform, desc.len() as usize), + .translate_gva(access_platform, desc.len() as usize) + .map_err(|e| Error::GuestMemory(GuestMemoryError::IOError(e)))?, ) .map_err(Error::GuestMemory)?; @@ -147,7 +148,8 @@ impl Request { type_: request_type, status_addr: status_desc .addr() - .translate_gva(access_platform, status_desc.len() as usize), + .translate_gva(access_platform, status_desc.len() as usize) + .map_err(|e| Error::GuestMemory(GuestMemoryError::IOError(e)))?, }) } } diff --git a/virtio-devices/src/rng.rs b/virtio-devices/src/rng.rs index 2133cb79ee..6bb0269c5e 100644 --- a/virtio-devices/src/rng.rs +++ b/virtio-devices/src/rng.rs @@ -78,7 +78,10 @@ impl RngEpollHandler { .memory() .read_volatile_from( desc.addr() - 
.translate_gva(self.access_platform.as_deref(), desc.len() as usize), + .translate_gva(self.access_platform.as_deref(), desc.len() as usize) + .map_err(|e| { + Error::GuestMemoryWrite(vm_memory::GuestMemoryError::IOError(e)) + })?, &mut self.random_file, desc.len() as usize, ) diff --git a/virtio-devices/src/transport/pci_common_config.rs b/virtio-devices/src/transport/pci_common_config.rs index c59c454b77..11e1d3ac70 100644 --- a/virtio-devices/src/transport/pci_common_config.rs +++ b/virtio-devices/src/transport/pci_common_config.rs @@ -287,15 +287,36 @@ impl VirtioPciCommonConfig { q.set_ready(ready); // Translate address of descriptor table and vrings. if ready && let Some(access_platform) = &self.access_platform { - let desc_table = access_platform + let desc_table = match access_platform .translate_gva(q.desc_table(), get_vring_size(VringType::Desc, q.size())) - .unwrap(); - let avail_ring = access_platform + { + Ok(addr) => addr, + Err(e) => { + error!("Failed to translate desc_table GVA: {e}"); + q.set_ready(false); + return; + } + }; + let avail_ring = match access_platform .translate_gva(q.avail_ring(), get_vring_size(VringType::Avail, q.size())) - .unwrap(); - let used_ring = access_platform + { + Ok(addr) => addr, + Err(e) => { + error!("Failed to translate avail_ring GVA: {e}"); + q.set_ready(false); + return; + } + }; + let used_ring = match access_platform .translate_gva(q.used_ring(), get_vring_size(VringType::Used, q.size())) - .unwrap(); + { + Ok(addr) => addr, + Err(e) => { + error!("Failed to translate used_ring GVA: {e}"); + q.set_ready(false); + return; + } + }; q.set_desc_table_address( Some((desc_table & 0xffff_ffff) as u32), Some((desc_table >> 32) as u32), diff --git a/virtio-devices/src/vdpa.rs b/virtio-devices/src/vdpa.rs index 7cd3415181..a35c35eb56 100644 --- a/virtio-devices/src/vdpa.rs +++ b/virtio-devices/src/vdpa.rs @@ -88,6 +88,8 @@ pub enum Error { SetVringKick(#[source] vhost::Error), #[error("Failed to set vring size")] 
SetVringNum(#[source] vhost::Error), + #[error("Failed to translate address")] + TranslateAddress(#[source] std::io::Error), } pub type Result = std::result::Result; @@ -246,18 +248,27 @@ impl Vdpa { queue_max_size, queue_size, flags: 0u32, - desc_table_addr: queue.desc_table().translate_gpa( - self.common.access_platform.as_deref(), - queue_size as usize * std::mem::size_of::(), - ), - used_ring_addr: queue.used_ring().translate_gpa( - self.common.access_platform.as_deref(), - 4 + queue_size as usize * 8, - ), - avail_ring_addr: queue.avail_ring().translate_gpa( - self.common.access_platform.as_deref(), - 4 + queue_size as usize * 2, - ), + desc_table_addr: queue + .desc_table() + .translate_gpa( + self.common.access_platform.as_deref(), + queue_size as usize * std::mem::size_of::(), + ) + .map_err(Error::TranslateAddress)?, + used_ring_addr: queue + .used_ring() + .translate_gpa( + self.common.access_platform.as_deref(), + 4 + queue_size as usize * 8, + ) + .map_err(Error::TranslateAddress)?, + avail_ring_addr: queue + .avail_ring() + .translate_gpa( + self.common.access_platform.as_deref(), + 4 + queue_size as usize * 2, + ) + .map_err(Error::TranslateAddress)?, log_addr: None, }; diff --git a/virtio-devices/src/vsock/packet.rs b/virtio-devices/src/vsock/packet.rs index 57218a5b87..e6b4c5afbb 100644 --- a/virtio-devices/src/vsock/packet.rs +++ b/virtio-devices/src/vsock/packet.rs @@ -142,7 +142,8 @@ impl VsockPacket { let guest_hdr_addr = head .addr() - .translate_gva(access_platform, VSOCK_PKT_HDR_SIZE); + .translate_gva(access_platform, VSOCK_PKT_HDR_SIZE) + .map_err(|_| VsockError::GuestMemory)?; // To avoid TOCTOU issues when reading/writing the VSock packet header in guest memory, // we need to copy the content of the header in the VMM's memory. @@ -178,8 +179,9 @@ impl VsockPacket { desc_chain.memory(), head.addr() .checked_add(VSOCK_PKT_HDR_SIZE as u64) - .unwrap() - .translate_gva(access_platform, buf_size), + .ok_or(VsockError::GuestMemory)? 
+ .translate_gva(access_platform, buf_size) + .map_err(|_| VsockError::GuestMemory)?, buf_size, ) .ok_or(VsockError::GuestMemory)?; @@ -214,7 +216,10 @@ impl VsockPacket { let desc_len = desc.len() as usize; if desc_len > 0 && offset < total_len { let to_copy = std::cmp::min(desc_len, total_len - offset); - let desc_addr = desc.addr().translate_gva(access_platform, desc_len); + let desc_addr = desc + .addr() + .translate_gva(access_platform, desc_len) + .map_err(|_| VsockError::GuestMemory)?; desc_chain .memory() .read_slice(&mut owned[offset..offset + to_copy], desc_addr) @@ -242,7 +247,10 @@ impl VsockPacket { let buf_size = buf_desc.len() as usize; let buf_ptr = get_host_address_range( desc_chain.memory(), - buf_desc.addr().translate_gva(access_platform, buf_size), + buf_desc + .addr() + .translate_gva(access_platform, buf_size) + .map_err(|_| VsockError::GuestMemory)?, buf_size, ) .ok_or(VsockError::GuestMemory)?; @@ -283,7 +291,8 @@ impl VsockPacket { let guest_hdr_addr = head .addr() - .translate_gva(access_platform, VSOCK_PKT_HDR_SIZE); + .translate_gva(access_platform, VSOCK_PKT_HDR_SIZE) + .map_err(|_| VsockError::GuestMemory)?; // To avoid TOCTOU issues when reading/writing the VSock packet header in guest memory, // we need to copy the content of the header in the VMM's memory. @@ -313,7 +322,10 @@ impl VsockPacket { buf: Some(PacketBuffer::Borrowed { ptr: get_host_address_range( desc_chain.memory(), - buf_desc.addr().translate_gva(access_platform, buf_size), + buf_desc + .addr() + .translate_gva(access_platform, buf_size) + .map_err(|_| VsockError::GuestMemory)?, buf_size, ) .ok_or(VsockError::GuestMemory)?, @@ -330,8 +342,9 @@ impl VsockPacket { desc_chain.memory(), head.addr() .checked_add(VSOCK_PKT_HDR_SIZE as u64) - .unwrap() - .translate_gva(access_platform, buf_size), + .ok_or(VsockError::GuestMemory)? 
+ .translate_gva(access_platform, buf_size) + .map_err(|_| VsockError::GuestMemory)?, buf_size, ) .ok_or(VsockError::GuestMemory)?, diff --git a/vm-virtio/src/lib.rs b/vm-virtio/src/lib.rs index c560e5c86e..fbd94b2b72 100644 --- a/vm-virtio/src/lib.rs +++ b/vm-virtio/src/lib.rs @@ -101,32 +101,60 @@ pub trait AccessPlatform: Send + Sync + Debug { } pub trait Translatable { - fn translate_gva(&self, access_platform: Option<&dyn AccessPlatform>, len: usize) -> Self; - fn translate_gpa(&self, access_platform: Option<&dyn AccessPlatform>, len: usize) -> Self; + fn translate_gva( + &self, + access_platform: Option<&dyn AccessPlatform>, + len: usize, + ) -> std::result::Result + where + Self: Sized; + fn translate_gpa( + &self, + access_platform: Option<&dyn AccessPlatform>, + len: usize, + ) -> std::result::Result + where + Self: Sized; } impl Translatable for GuestAddress { - fn translate_gva(&self, access_platform: Option<&dyn AccessPlatform>, len: usize) -> Self { - GuestAddress(self.0.translate_gva(access_platform, len)) + fn translate_gva( + &self, + access_platform: Option<&dyn AccessPlatform>, + len: usize, + ) -> std::result::Result { + Ok(GuestAddress(self.0.translate_gva(access_platform, len)?)) } - fn translate_gpa(&self, access_platform: Option<&dyn AccessPlatform>, len: usize) -> Self { - GuestAddress(self.0.translate_gpa(access_platform, len)) + fn translate_gpa( + &self, + access_platform: Option<&dyn AccessPlatform>, + len: usize, + ) -> std::result::Result { + Ok(GuestAddress(self.0.translate_gpa(access_platform, len)?)) } } impl Translatable for u64 { - fn translate_gva(&self, access_platform: Option<&dyn AccessPlatform>, len: usize) -> Self { + fn translate_gva( + &self, + access_platform: Option<&dyn AccessPlatform>, + len: usize, + ) -> std::result::Result { if let Some(access_platform) = access_platform { - access_platform.translate_gva(*self, len as u64).unwrap() + access_platform.translate_gva(*self, len as u64) } else { - *self + Ok(*self) } } - 
fn translate_gpa(&self, access_platform: Option<&dyn AccessPlatform>, len: usize) -> Self { + fn translate_gpa( + &self, + access_platform: Option<&dyn AccessPlatform>, + len: usize, + ) -> std::result::Result { if let Some(access_platform) = access_platform { - access_platform.translate_gpa(*self, len as u64).unwrap() + access_platform.translate_gpa(*self, len as u64) } else { - *self + Ok(*self) } } } From c657ea6e2368c8017525293abe5d5b9ee887507c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 14 Apr 2026 23:53:59 +0000 Subject: [PATCH 607/742] build(deps): bump softprops/action-gh-release from 2 to 3 Bumps [softprops/action-gh-release](https://github.com/softprops/action-gh-release) from 2 to 3. - [Release notes](https://github.com/softprops/action-gh-release/releases) - [Changelog](https://github.com/softprops/action-gh-release/blob/master/CHANGELOG.md) - [Commits](https://github.com/softprops/action-gh-release/compare/v2...v3) --- updated-dependencies: - dependency-name: softprops/action-gh-release dependency-version: '3' dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] --- .github/workflows/release.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index a6e8defba4..bc7c3e152e 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -86,7 +86,7 @@ jobs: name: cloud-hypervisor-${{ github.event.ref }}.tar.xz - name: Create GitHub Release if: github.event_name == 'create' && github.event.ref_type == 'tag' - uses: softprops/action-gh-release@v2 + uses: softprops/action-gh-release@v3 with: draft: true files: | From 6d0d4bc5e2fdf9fc8da1ff6c6a37ff14f9718e25 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Thu, 9 Apr 2026 22:24:37 +0200 Subject: [PATCH 608/742] vmm: protect vcpu states in CpuManager with a mutex This is a prerequisite for the next commit where we need shared access. On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- vmm/src/cpu.rs | 101 ++++++++++++++++++++++++++++++------------------- 1 file changed, 62 insertions(+), 39 deletions(-) diff --git a/vmm/src/cpu.rs b/vmm/src/cpu.rs index 4b15cffc31..061c20d47e 100644 --- a/vmm/src/cpu.rs +++ b/vmm/src/cpu.rs @@ -682,7 +682,7 @@ pub struct CpuManager { reset_evt: EventFd, #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, - vcpu_states: Vec, + vcpu_states: Arc>>, selected_cpu: u32, vcpus: Vec>>, seccomp_action: SeccompAction, @@ -741,6 +741,7 @@ impl BusDevice for CpuManager { fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { // The Linux kernel, quite reasonably, doesn't zero the memory it gives us. 
data.fill(0); + let vcpu_states = self.vcpu_states.lock().unwrap(); match offset { CPU_SELECTION_OFFSET => { @@ -750,7 +751,7 @@ impl BusDevice for CpuManager { } CPU_STATUS_OFFSET => { if self.selected_cpu < self.max_vcpus() { - let state = &self.vcpu_states[usize::try_from(self.selected_cpu).unwrap()]; + let state = &vcpu_states[usize::try_from(self.selected_cpu).unwrap()]; if state.active() { data[0] |= 1 << CPU_ENABLE_FLAG; } @@ -779,23 +780,28 @@ impl BusDevice for CpuManager { } CPU_STATUS_OFFSET => { if self.selected_cpu < self.max_vcpus() { - let state = &mut self.vcpu_states[usize::try_from(self.selected_cpu).unwrap()]; - // The ACPI code writes back a 1 to acknowledge the insertion - if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG) - && state.inserting - { - state.inserting = false; - } - // Ditto for removal - if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG) - && state.removing - { - state.removing = false; - } - // Trigger removal of vCPU - if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG - && let Err(e) = self.remove_vcpu(self.selected_cpu) - { + let eject = { + // This structure is not shared with the vCPU thread, therefore, holding the + // lock for the entire function doesn't cause any deadlock. 
+ let mut vcpu_states = self.vcpu_states.lock().unwrap(); + let state = &mut vcpu_states[usize::try_from(self.selected_cpu).unwrap()]; + + if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG) + && state.inserting + { + state.inserting = false; + } + + if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG) + && state.removing + { + state.removing = false; + } + + data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG + }; + + if eject && let Err(e) = self.remove_vcpu(self.selected_cpu) { error!("Error removing vCPU: {e:?}"); } } else { @@ -907,6 +913,7 @@ impl CpuManager { let max_vcpus = usize::try_from(config.max_vcpus).unwrap(); let mut vcpu_states = Vec::with_capacity(max_vcpus); vcpu_states.resize_with(max_vcpus, VcpuState::default); + let vcpu_states = Arc::new(Mutex::new(vcpu_states)); let hypervisor_type = hypervisor.hypervisor_type(); #[cfg(target_arch = "x86_64")] let cpu_vendor = hypervisor.get_cpu_vendor(); @@ -1176,14 +1183,14 @@ impl CpuManager { let vcpus_pause_signalled = self.vcpus_pause_signalled.clone(); let vcpus_kick_signalled = self.vcpus_kick_signalled.clone(); - let vcpu_kill = self.vcpu_states[usize::try_from(vcpu_id).unwrap()] - .kill - .clone(); - let vcpu_run_interrupted = self.vcpu_states[usize::try_from(vcpu_id).unwrap()] + let mut vcpu_states = self.vcpu_states.lock().unwrap(); + + let vcpu_kill = vcpu_states[usize::try_from(vcpu_id).unwrap()].kill.clone(); + let vcpu_run_interrupted = vcpu_states[usize::try_from(vcpu_id).unwrap()] .vcpu_run_interrupted .clone(); let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone(); - let vcpu_paused = self.vcpu_states[usize::try_from(vcpu_id).unwrap()] + let vcpu_paused = vcpu_states[usize::try_from(vcpu_id).unwrap()] .paused .clone(); @@ -1470,8 +1477,8 @@ impl CpuManager { // On hot plug calls into this function entry_point is None. It is for // those hotplug CPU additions that we need to set the inserting flag. 
- self.vcpu_states[usize::try_from(vcpu_id).unwrap()].handle = handle; - self.vcpu_states[usize::try_from(vcpu_id).unwrap()].inserting = inserting; + vcpu_states[usize::try_from(vcpu_id).unwrap()].handle = handle; + vcpu_states[usize::try_from(vcpu_id).unwrap()].inserting = inserting; Ok(()) } @@ -1515,17 +1522,20 @@ impl CpuManager { } fn mark_vcpus_for_removal(&mut self, desired_vcpus: u32) { + let mut vcpu_states = self.vcpu_states.lock().unwrap(); + let present_vcpus = Self::active_vcpus(&vcpu_states); + // Mark vCPUs for removal, actual removal happens on ejection - for cpu_id in desired_vcpus..self.present_vcpus() { - self.vcpu_states[usize::try_from(cpu_id).unwrap()].removing = true; - self.vcpu_states[usize::try_from(cpu_id).unwrap()] + for cpu_id in desired_vcpus..present_vcpus { + vcpu_states[usize::try_from(cpu_id).unwrap()].removing = true; + vcpu_states[usize::try_from(cpu_id).unwrap()] .pending_removal .store(true, Ordering::SeqCst); } } pub fn check_pending_removed_vcpu(&mut self) -> bool { - for state in self.vcpu_states.iter() { + for state in self.vcpu_states.lock().unwrap().iter() { if state.active() && state.pending_removal.load(Ordering::SeqCst) { return true; } @@ -1535,7 +1545,8 @@ impl CpuManager { fn remove_vcpu(&mut self, cpu_id: u32) -> Result<()> { info!("Removing vCPU: cpu_id = {cpu_id}"); - let state = &mut self.vcpu_states[usize::try_from(cpu_id).unwrap()]; + let mut vcpu_states = self.vcpu_states.lock().unwrap(); + let state = &mut vcpu_states[usize::try_from(cpu_id).unwrap()]; state.kill.store(true, Ordering::SeqCst); state.signal_thread(); state.wait_until_signal_acknowledged()?; @@ -1631,12 +1642,15 @@ impl CpuManager { /// For the vCPU threads this will interrupt the KVM_RUN ioctl() allowing /// the loop to check the shared state booleans. 
fn signal_vcpus(&mut self) -> Result<()> { + // Holding the lock for the whole operation is correct: + let vcpu_states = self.vcpu_states.lock().unwrap(); + // Splitting this into two loops reduced the time to pause many vCPUs // massively. Example: 254 vCPUs. >254ms -> ~4ms. - for state in self.vcpu_states.iter() { + for state in vcpu_states.iter() { state.signal_thread(); } - for state in self.vcpu_states.iter() { + for state in vcpu_states.iter() { state.wait_until_signal_acknowledged()?; } @@ -1651,14 +1665,14 @@ impl CpuManager { self.vcpus_pause_signalled.store(false, Ordering::SeqCst); // Unpark all the VCPU threads. - for state in self.vcpu_states.iter() { + for state in self.vcpu_states.lock().unwrap().iter() { state.unpark_thread(); } self.signal_vcpus()?; // Wait for all the threads to finish. This removes the state from the vector. - for mut state in self.vcpu_states.drain(..) { + for mut state in self.vcpu_states.lock().unwrap().drain(..) { state.join_thread()?; } @@ -1691,8 +1705,15 @@ impl CpuManager { self.cpuid.clone() } + /// Locks the vCPU states and calls [`Self::active_vcpus`]. fn present_vcpus(&self) -> u32 { - self.vcpu_states + let lock = self.vcpu_states.lock().unwrap(); + Self::active_vcpus(&lock) + } + + /// Counts the number of active vCPUs (running vCPU threads). + fn active_vcpus(vcpu_states: &[VcpuState]) -> u32 { + vcpu_states .iter() .fold(0, |acc, state| acc + state.active() as u32) } @@ -2651,7 +2672,7 @@ impl Pausable for CpuManager { // The vCPU thread will change its paused state before parking, wait here for each // activated vCPU change their state to ensure they have parked. - for state in self.vcpu_states.iter() { + for state in self.vcpu_states.lock().unwrap().iter() { if state.active() { // wait for vCPU to update state while !state.paused.load(Ordering::SeqCst) { @@ -2669,16 +2690,18 @@ impl Pausable for CpuManager { // their run vCPU loop. 
self.vcpus_pause_signalled.store(false, Ordering::SeqCst); + let vcpu_states = self.vcpu_states.lock().unwrap(); + // Unpark all the vCPU threads. // Step 1/2: signal each thread { - for state in self.vcpu_states.iter() { + for state in vcpu_states.iter() { state.unpark_thread(); } } // Step 2/2: wait for state ACK { - for state in self.vcpu_states.iter() { + for state in vcpu_states.iter() { // wait for vCPU to update state while state.paused.load(Ordering::SeqCst) { // To avoid a priority inversion with the vCPU thread From 5ff4696cea911a5a90ff286c134ff3ebbd3134bd Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Thu, 9 Apr 2026 22:17:32 +0200 Subject: [PATCH 609/742] vmm: introduce ACPI CPU hotplug controller (fix deadlock) Extract AcpiCpuHotplugController from CpuManager and move the BusDevice implementation to the new type. This separates VMM-internal vCPU management from the guest-visible ACPI CPU hotplug MMIO interface. Besides clarifying responsibilities and reducing technical debt, this fixes a rare deadlock involving pause handling and MMIO access. New responsibilities: - CpuManager manages VMM-internal vCPU lifecycle and coordination - AcpiCpuHotplugController implements the guest-visible ACPI CPU hotplug MMIO interface A vCPU thread may exit KVM_RUN to perform an MMIO access previously handled by CpuManager. If the VMM thread begins processing a `pause` event before that MMIO operation acquires access to CpuManager, CpuManager::pause() will block waiting for the vCPU thread to ACK the pause, while the vCPU thread is blocked waiting to complete the MMIO operation through the same CpuManager - which it can never lock - the VMM is deadlocked. This can occur during early boot or CPU hotplug when pause events race with MMIO accesses. The issue is rare and timing-dependent, but real. For reproducing: run `ch-remote pause|resume` in a loop while booting a Linux VM (via direct kernel boot). 
With the new design, these MMIO operations no longer depend on CpuManager, which removes the deadlock path entirely. On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- vmm/src/cpu.rs | 233 +++++++++++++++++++++----------------- vmm/src/device_manager.rs | 13 ++- 2 files changed, 138 insertions(+), 108 deletions(-) diff --git a/vmm/src/cpu.rs b/vmm/src/cpu.rs index 061c20d47e..eb892552b0 100644 --- a/vmm/src/cpu.rs +++ b/vmm/src/cpu.rs @@ -682,8 +682,8 @@ pub struct CpuManager { reset_evt: EventFd, #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, + // Shared with AcpiCpuHotplugController vcpu_states: Arc>>, - selected_cpu: u32, vcpus: Vec>>, seccomp_action: SeccompAction, vm_ops: Arc, @@ -699,14 +699,6 @@ pub struct CpuManager { core_scheduling_group_leader: Arc, } -const CPU_ENABLE_FLAG: usize = 0; -const CPU_INSERTING_FLAG: usize = 1; -const CPU_REMOVING_FLAG: usize = 2; -const CPU_EJECT_FLAG: usize = 3; - -const CPU_STATUS_OFFSET: u64 = 4; -const CPU_SELECTION_OFFSET: u64 = 0; - /// State of the core scheduling group leader election for VM-wide cookie /// sharing. /// @@ -737,85 +729,6 @@ impl TryFrom for CoreSchedulingLeader { } } -impl BusDevice for CpuManager { - fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { - // The Linux kernel, quite reasonably, doesn't zero the memory it gives us. 
- data.fill(0); - let vcpu_states = self.vcpu_states.lock().unwrap(); - - match offset { - CPU_SELECTION_OFFSET => { - assert!(data.len() >= core::mem::size_of::()); - data[0..core::mem::size_of::()] - .copy_from_slice(&self.selected_cpu.to_le_bytes()); - } - CPU_STATUS_OFFSET => { - if self.selected_cpu < self.max_vcpus() { - let state = &vcpu_states[usize::try_from(self.selected_cpu).unwrap()]; - if state.active() { - data[0] |= 1 << CPU_ENABLE_FLAG; - } - if state.inserting { - data[0] |= 1 << CPU_INSERTING_FLAG; - } - if state.removing { - data[0] |= 1 << CPU_REMOVING_FLAG; - } - } else { - warn!("Out of range vCPU id: {}", self.selected_cpu); - } - } - _ => { - warn!("Unexpected offset for accessing CPU manager device: {offset:#}"); - } - } - } - - fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option> { - match offset { - CPU_SELECTION_OFFSET => { - assert!(data.len() >= core::mem::size_of::()); - self.selected_cpu = - u32::from_le_bytes(data[0..core::mem::size_of::()].try_into().unwrap()); - } - CPU_STATUS_OFFSET => { - if self.selected_cpu < self.max_vcpus() { - let eject = { - // This structure is not shared with the vCPU thread, therefore, holding the - // lock for the entire function doesn't cause any deadlock. 
- let mut vcpu_states = self.vcpu_states.lock().unwrap(); - let state = &mut vcpu_states[usize::try_from(self.selected_cpu).unwrap()]; - - if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG) - && state.inserting - { - state.inserting = false; - } - - if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG) - && state.removing - { - state.removing = false; - } - - data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG - }; - - if eject && let Err(e) = self.remove_vcpu(self.selected_cpu) { - error!("Error removing vCPU: {e:?}"); - } - } else { - warn!("Out of range vCPU id: {}", self.selected_cpu); - } - } - _ => { - warn!("Unexpected offset for accessing CPU manager device: {offset:#}"); - } - } - None - } -} - #[derive(Default)] struct VcpuState { inserting: bool, @@ -965,7 +878,6 @@ impl CpuManager { reset_evt, #[cfg(feature = "guest_debug")] vm_debug_evt, - selected_cpu: 0, vcpus: Vec::with_capacity(max_vcpus), seccomp_action, vm_ops, @@ -1543,23 +1455,6 @@ impl CpuManager { false } - fn remove_vcpu(&mut self, cpu_id: u32) -> Result<()> { - info!("Removing vCPU: cpu_id = {cpu_id}"); - let mut vcpu_states = self.vcpu_states.lock().unwrap(); - let state = &mut vcpu_states[usize::try_from(cpu_id).unwrap()]; - state.kill.store(true, Ordering::SeqCst); - state.signal_thread(); - state.wait_until_signal_acknowledged()?; - state.join_thread()?; - state.handle = None; - - // Once the thread has exited, clear the "kill" so that it can reused - state.kill.store(false, Ordering::SeqCst); - state.pending_removal.store(false, Ordering::SeqCst); - - Ok(()) - } - pub fn create_boot_vcpus( &mut self, snapshot: Option<&Snapshot>, @@ -3203,6 +3098,132 @@ impl CpuElf64Writable for CpuManager { } } +/// MMIO-accessible controller for handling ACPI hotplug and unplug events. +/// +/// Shares state about the vCPUs with the [`CpuManager`]. +pub struct AcpiCpuHotplugController { + /// The currently selected CPU by the guest. 
+ selected_cpu: u32, + /// Shared vCPU state with [`CpuManager`]. + vcpu_states: Arc>>, + /// Maximum number of vCPUS of the VM. + max_vcpus: u32, +} + +impl AcpiCpuHotplugController { + const CPU_ENABLE_FLAG: usize = 0; + const CPU_INSERTING_FLAG: usize = 1; + const CPU_REMOVING_FLAG: usize = 2; + const CPU_EJECT_FLAG: usize = 3; + + const CPU_SELECTION_OFFSET: u64 = 0; + const CPU_STATUS_OFFSET: u64 = 4; + + /// Creates a new [`AcpiCpuHotplugController`]. + pub fn new(cpu_manager: &CpuManager) -> AcpiCpuHotplugController { + Self { + max_vcpus: cpu_manager.config.max_vcpus, + selected_cpu: 0, + vcpu_states: cpu_manager.vcpu_states.clone(), + } + } + + /// Removes a vCPU from the guest. + /// + /// The corresponding vCPU thread will be gracefully stopped and joined. + fn remove_vcpu(cpu_id: u32, state: &mut VcpuState) -> Result<()> { + info!("Removing vCPU: cpu_id = {cpu_id}"); + state.kill.store(true, Ordering::SeqCst); + state.signal_thread(); + state.wait_until_signal_acknowledged()?; + state.join_thread()?; + state.handle = None; + + // Once the thread has exited, clear the "kill" so that it can reused + state.kill.store(false, Ordering::SeqCst); + state.pending_removal.store(false, Ordering::SeqCst); + + Ok(()) + } +} + +impl BusDevice for AcpiCpuHotplugController { + fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { + // The Linux kernel, quite reasonably, doesn't zero the memory it gives us. 
+ data.fill(0); + let vcpu_states = self.vcpu_states.lock().unwrap(); + + match offset { + Self::CPU_SELECTION_OFFSET => { + assert!(data.len() >= core::mem::size_of::()); + data[0..core::mem::size_of::()] + .copy_from_slice(&self.selected_cpu.to_le_bytes()); + } + Self::CPU_STATUS_OFFSET => { + if self.selected_cpu < self.max_vcpus { + let state = &vcpu_states[usize::try_from(self.selected_cpu).unwrap()]; + if state.active() { + data[0] |= 1 << Self::CPU_ENABLE_FLAG; + } + if state.inserting { + data[0] |= 1 << Self::CPU_INSERTING_FLAG; + } + if state.removing { + data[0] |= 1 << Self::CPU_REMOVING_FLAG; + } + } else { + warn!("Out of range vCPU id: {}", self.selected_cpu); + } + } + _ => { + warn!("Unexpected offset for accessing CPU manager device: {offset:#}"); + } + } + } + + fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option> { + match offset { + Self::CPU_SELECTION_OFFSET => { + assert!(data.len() >= core::mem::size_of::()); + self.selected_cpu = + u32::from_le_bytes(data[0..core::mem::size_of::()].try_into().unwrap()); + } + Self::CPU_STATUS_OFFSET => { + if self.selected_cpu < self.max_vcpus { + // This structure is not shared with the vCPU thread, therefore, holding the + // lock for the entire function doesn't cause any deadlock. 
+ let mut vcpu_states = self.vcpu_states.lock().unwrap(); + let state = &mut vcpu_states[usize::try_from(self.selected_cpu).unwrap()]; + // The ACPI code writes back a 1 to acknowledge the insertion + if (data[0] & (1 << Self::CPU_INSERTING_FLAG) == 1 << Self::CPU_INSERTING_FLAG) + && state.inserting + { + state.inserting = false; + } + // Ditto for removal + if (data[0] & (1 << Self::CPU_REMOVING_FLAG) == 1 << Self::CPU_REMOVING_FLAG) + && state.removing + { + state.removing = false; + } + // Trigger removal of vCPU: + if data[0] & (1 << Self::CPU_EJECT_FLAG) == 1 << Self::CPU_EJECT_FLAG + && let Err(e) = Self::remove_vcpu(self.selected_cpu, state) + { + error!("Error removing vCPU: {e:?}"); + } + } else { + warn!("Out of range vCPU id: {}", self.selected_cpu); + } + } + _ => { + warn!("Unexpected offset for accessing CPU manager device: {offset:#}"); + } + } + None + } +} + #[cfg(all(feature = "kvm", target_arch = "x86_64"))] #[cfg(test)] mod unit_tests { diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 9522be53d2..39281e53ed 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -124,7 +124,7 @@ use vm_virtio::{AccessPlatform, VirtioDeviceType}; use vmm_sys_util::eventfd::EventFd; use crate::console_devices::{ConsoleDeviceError, ConsoleInfo, ConsoleTransport}; -use crate::cpu::{CPU_MANAGER_ACPI_SIZE, CpuManager}; +use crate::cpu::{AcpiCpuHotplugController, CPU_MANAGER_ACPI_SIZE, CpuManager}; use crate::device_tree::{DeviceNode, DeviceTree}; use crate::interrupt::{LegacyUserspaceInterruptManager, MsiInterruptManager}; use crate::memory_manager::{Error as MemoryManagerError, MEMORY_MANAGER_ACPI_SIZE, MemoryManager}; @@ -1026,6 +1026,10 @@ pub struct DeviceManager { // CPU Manager cpu_manager: Arc>, + /// Owned version needed to keep the bus device alive (the bus only holds + /// a weak reference). 
+ _acpi_cpu_hotplug_controller: Arc>, + // The virtio devices on the system virtio_devices: Vec, @@ -1324,6 +1328,10 @@ impl DeviceManager { )?); } + let acpi_cpu_hotplug_controller = + AcpiCpuHotplugController::new(&cpu_manager.lock().unwrap()); + let acpi_cpu_hotplug_controller = Arc::new(Mutex::new(acpi_cpu_hotplug_controller)); + if dynamic { let acpi_address = address_manager .allocator @@ -1335,7 +1343,7 @@ impl DeviceManager { address_manager .mmio_bus .insert( - cpu_manager.clone(), + acpi_cpu_hotplug_controller.clone(), acpi_address.0, CPU_MANAGER_ACPI_SIZE as u64, ) @@ -1429,6 +1437,7 @@ impl DeviceManager { fw_cfg: None, #[cfg(feature = "ivshmem")] ivshmem_device: None, + _acpi_cpu_hotplug_controller: acpi_cpu_hotplug_controller, }; let device_manager = Arc::new(Mutex::new(device_manager)); From 7eab5901adf733932d0e856b9cd5595e220e4e20 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Thu, 9 Apr 2026 22:24:50 +0200 Subject: [PATCH 610/742] vmm: improve misc documentation This improves the documentation at various places. On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- vm-device/src/bus.rs | 3 +++ vmm/src/cpu.rs | 17 ++++++++++++++--- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/vm-device/src/bus.rs b/vm-device/src/bus.rs index 2897ac303e..eacca24987 100644 --- a/vm-device/src/bus.rs +++ b/vm-device/src/bus.rs @@ -147,6 +147,9 @@ impl Bus { None } + /// Inserts a bus device into the bus. + /// + /// The bus will only hold a weak reference to the object. #[allow(clippy::needless_pass_by_value)] pub fn insert(&self, device: Arc, base: u64, len: u64) -> Result<()> { if len == 0 { diff --git a/vmm/src/cpu.rs b/vmm/src/cpu.rs index eb892552b0..1450e0a8e2 100644 --- a/vmm/src/cpu.rs +++ b/vmm/src/cpu.rs @@ -729,13 +729,17 @@ impl TryFrom for CoreSchedulingLeader { } } +/// Management structure for a vCPU (thread). 
#[derive(Default)] struct VcpuState { inserting: bool, removing: bool, pending_removal: Arc, + /// Handle to the vCPU thread. handle: Option>, + /// Instructs the thread to exit the run-vCPU loop. kill: Arc, + /// Used to ACK interruption from the run vCPU loop to the CPU Manager. vcpu_run_interrupted: Arc, /// Used to ACK state changes from the run vCPU loop to the CPU Manager. paused: Arc, @@ -750,6 +754,13 @@ impl VcpuState { /// /// Please call [`Self::wait_until_signal_acknowledged`] afterward to block /// until the vCPU thread has acknowledged the signal. + /// + /// If the thread is in KVM_RUN (or MSHV_RUN_VP or equivalent), this kicks + /// the thread out of kernel space. If the thread is in user-space, the + /// thread will just handle the event eventually. If the thread is in + /// user-space but about to enter kernel-space, the user-space signal + /// handler will make sure that the next kernel entry of the given + /// vCPU thread immediately exits to handle the event in user-space. fn signal_thread(&self) { if let Some(handle) = self.handle.as_ref() { // SAFETY: FFI call with correct arguments @@ -1532,10 +1543,10 @@ impl CpuManager { } } - /// Signal to the spawned threads (vCPUs and console signal handler). + /// Signals all vCPU threads and waits for them to ACK the interruption. /// - /// For the vCPU threads this will interrupt the KVM_RUN ioctl() allowing - /// the loop to check the shared state booleans. + /// Calls [`VcpuState::signal_thread`] and + /// [`VcpuState::wait_until_signal_acknowledged`] for each vCPU. 
fn signal_vcpus(&mut self) -> Result<()> { // Holding the lock for the whole operation is correct: let vcpu_states = self.vcpu_states.lock().unwrap(); From edfd597993583dd728c4102b67eae04b3c46059b Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Fri, 10 Apr 2026 15:23:45 +0200 Subject: [PATCH 611/742] tests: fix weird "console=ttyS0rw" string On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- cloud-hypervisor/tests/integration.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index ebf08d8417..c4bccc46ad 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -1941,7 +1941,7 @@ mod common_parallel { .args([ "--cmdline", DIRECT_KERNEL_BOOT_CMDLINE - .replace("console=hvc0 ", console_str) + .replace("console=hvc0", console_str) .as_str(), ]) .default_disks() @@ -1997,7 +1997,7 @@ mod common_parallel { .args([ "--cmdline", DIRECT_KERNEL_BOOT_CMDLINE - .replace("console=hvc0 ", console_str) + .replace("console=hvc0", console_str) .as_str(), ]) .default_disks() @@ -2055,7 +2055,7 @@ mod common_parallel { .args([ "--cmdline", DIRECT_KERNEL_BOOT_CMDLINE - .replace("console=hvc0 ", console_str) + .replace("console=hvc0", console_str) .as_str(), ]) .default_disks() @@ -2613,7 +2613,7 @@ mod common_parallel { .args([ "--cmdline", DIRECT_KERNEL_BOOT_CMDLINE - .replace("console=hvc0 ", console_str) + .replace("console=hvc0", console_str) .as_str(), ]) .args(["--serial", "tty"]) @@ -5621,7 +5621,7 @@ mod common_parallel { .args([ "--cmdline", DIRECT_KERNEL_BOOT_CMDLINE - .replace("console=hvc0 ", tty_str) + .replace("console=hvc0", tty_str) .as_str(), ]) .capture_output() From 39844e883943146fff19f8d3450b0882ac069d6f Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Wed, 15 Apr 2026 10:22:14 +0200 Subject: [PATCH 612/742] docs: refine coding standards in CONTRIBUTING.md TL;DR: Add note about 
how we expect code comments/documentation This updates the coding standards as discussed [0]. The general guideline is to write down as little process as possible and leave room for pragmatic exceptions, maintainer and contributor preferences while still striving for excellent code quality. [0]: https://github.com/cloud-hypervisor/cloud-hypervisor/pull/7990#issuecomment-4245571054 On-behalf-of: SAP philipp.schuster@sap.com Signed-off-by: Philipp Schuster --- CONTRIBUTING.md | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index cdda170b4a..943f53c39d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -11,11 +11,30 @@ license of those projects. New code should be under the [Apache v2 License](https://opensource.org/licenses/Apache-2.0). -## Coding Style +## Coding Style & Code Comments -We follow the [Rust Style](https://github.com/rust-lang/rust/tree/HEAD/src/doc/style-guide/src) -convention and enforce it through the Continuous Integration (CI) process calling into `rustfmt`, -`clippy`, and other well-known code quality tool of the ecosystem for each submitted Pull Request (PR). +We use the [Rust Style] guide and enforce formatting and linting in CI, +including `rustfmt`, `clippy`, and other common Rust quality checks, for every +pull request. We adapt to best practices, new lints and new tooling as the +ecosystem evolves. + +Code should **speak for itself** (for example, by using descriptive identifiers) +and be **easy to read and maintain**. Beyond the conventions and tooling +described above, contributors have _some_ room to apply their own style and +preferred structure. Maintainers may still suggest refactorings where they +believe readability, consistency, or maintainability can be improved. + +For new code, add documentation and comments where they **provide additional value**: + +* **Rustdoc** explains the API to its users. 
+* **Inline comments** explain the code the reader, especially *why* it is + written that way. +* **Commit messages** explain the broader context of a change (for more + information on commit messages, see below). + +Comments should be concise and add additional context or information to the code. + +[Rust Style]: https://github.com/rust-lang/rust/tree/HEAD/src/doc/style-guide/src ## Basic Checks From c35749fb3959f2d09b233571b2a3a36ce40c933a Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 15 Apr 2026 15:37:55 +0200 Subject: [PATCH 613/742] block: qcow: Rename Qcow2MetadataBacking to Qcow2Backing The old name read as 'metadata for a QCOW2 backing file' rather than what it actually is: a QCOW2 backing file reader. Rename to Qcow2Backing to parallel RawBacking and clarify intent. Suggested-by: Philipp Schuster Signed-off-by: Anatol Belski --- block/src/qcow/backing.rs | 12 ++++++------ block/src/qcow_sync.rs | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/block/src/qcow/backing.rs b/block/src/qcow/backing.rs index e5e037b0ad..754618f132 100644 --- a/block/src/qcow/backing.rs +++ b/block/src/qcow/backing.rs @@ -48,7 +48,7 @@ impl BackingRead for RawBacking { /// tables, refcounts) before reading the underlying data. Read only /// because backing files never receive writes. Nested backing chains /// are handled recursively via the optional `backing_file` field. -pub(crate) struct Qcow2MetadataBacking { +pub(crate) struct Qcow2Backing { pub(crate) metadata: Arc, pub(crate) data_fd: OwnedFd, pub(crate) backing_file: Option>, @@ -56,9 +56,9 @@ pub(crate) struct Qcow2MetadataBacking { // SAFETY: All reads go through QcowMetadata which uses RwLock // and pread64 which is position independent and thread safe. 
-unsafe impl Sync for Qcow2MetadataBacking {} +unsafe impl Sync for Qcow2Backing {} -impl BackingRead for Qcow2MetadataBacking { +impl BackingRead for Qcow2Backing { fn read_at(&self, address: u64, buf: &mut [u8]) -> io::Result<()> { let virtual_size = self.metadata.virtual_size(); if address >= virtual_size { @@ -75,7 +75,7 @@ impl BackingRead for Qcow2MetadataBacking { } } -impl Qcow2MetadataBacking { +impl Qcow2Backing { /// Resolve cluster mappings via metadata then read allocated clusters /// with pread64. fn read_clusters(&self, address: u64, buf: &mut [u8]) -> io::Result<()> { @@ -125,7 +125,7 @@ impl Qcow2MetadataBacking { } } -impl Drop for Qcow2MetadataBacking { +impl Drop for Qcow2Backing { fn drop(&mut self) { self.metadata.shutdown(); } @@ -152,7 +152,7 @@ pub fn shared_backing_from(bf: BackingFile) -> BlockResult> } BackingKind::Qcow { inner, backing } => { let data_fd = dup_fd(inner.raw_file.as_fd())?; - Ok(Arc::new(Qcow2MetadataBacking { + Ok(Arc::new(Qcow2Backing { metadata: Arc::new(QcowMetadata::new(*inner)), data_fd, backing_file: backing.map(|bf| shared_backing_from(*bf)).transpose()?, diff --git a/block/src/qcow_sync.rs b/block/src/qcow_sync.rs index f948a3b5fd..ec1637543a 100644 --- a/block/src/qcow_sync.rs +++ b/block/src/qcow_sync.rs @@ -783,7 +783,7 @@ mod unit_tests { #[test] fn test_multi_queue_concurrent_reads_qcow2_backing() { - // Same as above but reads go through a Qcow2MetadataBacking, + // Same as above but reads go through a Qcow2Backing, // exercising concurrent metadata resolution + pread64 in the backing. let backing_temp = TempFile::new().unwrap(); let cluster_size = 1u64 << 16; @@ -1123,7 +1123,7 @@ mod unit_tests { #[test] fn test_qcow2_backing_cross_cluster_read() { // Read spanning a cluster boundary through qcow2 backing. - // Exercises the read_clusters loop in Qcow2MetadataBacking. + // Exercises the read_clusters loop in Qcow2Backing. 
let cluster_size = 1u64 << 16; let file_size = cluster_size * 4; let pattern: Vec = (0..file_size as usize).map(|i| (i % 251) as u8).collect(); From 6c1da4f5c1172edf21ea349167189ef7049d5723 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Mon, 13 Apr 2026 19:50:15 +0200 Subject: [PATCH 614/742] block: qcow: Expose RawFile alignment as a public accessor Add pub fn alignment() to RawFile so that callers can query the O_DIRECT buffer alignment requirement probed at file open time. Signed-off-by: Anatol Belski --- block/src/qcow/raw_file.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/block/src/qcow/raw_file.rs b/block/src/qcow/raw_file.rs index 56ec797355..c2a01811f8 100644 --- a/block/src/qcow/raw_file.rs +++ b/block/src/qcow/raw_file.rs @@ -125,6 +125,10 @@ impl RawFile { self.direct_io } + pub fn alignment(&self) -> usize { + self.alignment + } + /// Returns true if the file was opened with write access. pub fn is_writable(&self) -> bool { // SAFETY: fcntl with F_GETFL is safe and doesn't modify the file descriptor From f50b5ab1f2a0accea210d6bafafe7860e274ffcc Mon Sep 17 00:00:00 2001 From: CMGS Date: Wed, 15 Apr 2026 12:17:24 +0200 Subject: [PATCH 615/742] block: qcow: Add aligned bounce buffers for O_DIRECT I/O When the data file is opened with O_DIRECT, buffer address, length, and file offset must satisfy the device alignment. Add AlignedBuf RAII wrapper and aligned_pread/aligned_pwrite helpers in qcow_common that use bounce buffers when alignment constraints are not met. For writes with misaligned offset, a read modify write is performed on the aligned region. gather_from_iovecs_into gathers iovec data directly into a caller provided buffer, avoiding an intermediate Vec allocation. 
Fixes: #8007 Signed-off-by: CMGS Co-authored-by: Anatol Belski Signed-off-by: Anatol Belski --- block/src/qcow_common.rs | 121 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 112 insertions(+), 9 deletions(-) diff --git a/block/src/qcow_common.rs b/block/src/qcow_common.rs index dc492e36d2..08a1a9ca3e 100644 --- a/block/src/qcow_common.rs +++ b/block/src/qcow_common.rs @@ -9,6 +9,7 @@ //! Position-independent I/O (`pread_exact`, `pwrite_all`) and iovec //! scatter/gather helpers used by both `qcow_sync` and `qcow_async`. +use std::alloc::{Layout, alloc_zeroed, dealloc}; use std::cmp::min; use std::os::fd::RawFd; use std::{io, ptr, slice}; @@ -67,6 +68,98 @@ pub fn pwrite_all(fd: RawFd, buf: &[u8], offset: u64) -> io::Result<()> { Ok(()) } +/// RAII wrapper for an aligned heap buffer required by O_DIRECT. +pub struct AlignedBuf { + ptr: *mut u8, + layout: Layout, +} + +impl AlignedBuf { + pub fn new(size: usize, alignment: usize) -> io::Result { + let size = size.max(1).next_multiple_of(alignment); + let layout = Layout::from_size_align(size, alignment) + .map_err(|e| io::Error::other(format!("invalid aligned layout: {e}")))?; + // SAFETY: layout has non-zero size. + let ptr = unsafe { alloc_zeroed(layout) }; + if ptr.is_null() { + return Err(io::Error::new( + io::ErrorKind::OutOfMemory, + "aligned allocation failed", + )); + } + Ok(AlignedBuf { ptr, layout }) + } + + pub fn as_mut_slice(&mut self, len: usize) -> &mut [u8] { + let len = len.min(self.layout.size()); + // SAFETY: ptr is valid for layout.size() bytes; len <= layout.size(). + unsafe { slice::from_raw_parts_mut(self.ptr, len) } + } + + pub fn as_slice(&self, len: usize) -> &[u8] { + let len = len.min(self.layout.size()); + // SAFETY: ptr is valid for layout.size() bytes; len <= layout.size(). 
+ unsafe { slice::from_raw_parts(self.ptr, len) } + } + + #[cfg(test)] + pub fn layout(&self) -> &Layout { + &self.layout + } + + #[cfg(test)] + pub fn ptr(&self) -> *const u8 { + self.ptr + } +} + +impl Drop for AlignedBuf { + fn drop(&mut self) { + // SAFETY: ptr was allocated by alloc_zeroed with self.layout. + unsafe { dealloc(self.ptr, self.layout) }; + } +} + +/// Read into `buf` via an aligned bounce buffer when O_DIRECT requires it. +pub fn aligned_pread(fd: RawFd, buf: &mut [u8], offset: u64, alignment: usize) -> io::Result<()> { + if alignment == 0 + || ((buf.as_ptr() as usize).is_multiple_of(alignment) + && buf.len().is_multiple_of(alignment) + && (offset as usize).is_multiple_of(alignment)) + { + return pread_exact(fd, buf, offset); + } + + let aligned_offset = offset & !(alignment as u64 - 1); + let head = (offset - aligned_offset) as usize; + let aligned_len = (head + buf.len()).next_multiple_of(alignment); + let mut bounce = AlignedBuf::new(aligned_len, alignment)?; + pread_exact(fd, bounce.as_mut_slice(aligned_len), aligned_offset)?; + buf.copy_from_slice(&bounce.as_slice(aligned_len)[head..head + buf.len()]); + Ok(()) +} + +/// Write `buf` via an aligned bounce buffer when O_DIRECT requires it. +pub fn aligned_pwrite(fd: RawFd, buf: &[u8], offset: u64, alignment: usize) -> io::Result<()> { + if alignment == 0 + || ((buf.as_ptr() as usize).is_multiple_of(alignment) + && buf.len().is_multiple_of(alignment) + && (offset as usize).is_multiple_of(alignment)) + { + return pwrite_all(fd, buf, offset); + } + + let aligned_offset = offset & !(alignment as u64 - 1); + let head = (offset - aligned_offset) as usize; + let aligned_len = (head + buf.len()).next_multiple_of(alignment); + let mut bounce = AlignedBuf::new(aligned_len, alignment)?; + + // Read-modify-write: read the existing aligned region, overlay our data. 
+ pread_exact(fd, bounce.as_mut_slice(aligned_len), aligned_offset)?; + bounce.as_mut_slice(aligned_len)[head..head + buf.len()].copy_from_slice(buf); + pwrite_all(fd, bounce.as_slice(aligned_len), aligned_offset) +} + // -- iovec helper functions -- // // Operate on the iovec array as a flat byte stream. @@ -129,33 +222,43 @@ pub unsafe fn zero_fill_iovecs(iovecs: &[libc::iovec], start: usize, len: usize) } } -/// Gather bytes from iovecs starting at the given byte offset into a Vec. +/// Gather bytes from iovecs starting at the given byte offset into `dst`. /// /// # Safety /// Caller must ensure iovecs point to valid, readable memory of sufficient size. -pub unsafe fn gather_from_iovecs(iovecs: &[libc::iovec], start: usize, len: usize) -> Vec { - let mut result = Vec::with_capacity(len); - let mut remaining = len; +pub unsafe fn gather_from_iovecs_into(iovecs: &[libc::iovec], start: usize, dst: &mut [u8]) { + let len = dst.len(); + let mut written = 0usize; let mut pos = 0usize; for iov in iovecs { let iov_end = pos + iov.iov_len; - if iov_end <= start || remaining == 0 { + if iov_end <= start || written == len { pos = iov_end; continue; } let iov_start = start.saturating_sub(pos); let available = iov.iov_len - iov_start; - let count = min(available, remaining); + let count = min(available, len - written); // SAFETY: iov_base is valid for iov_len bytes per caller contract. unsafe { let src = (iov.iov_base as *const u8).add(iov_start); - result.extend_from_slice(slice::from_raw_parts(src, count)); + ptr::copy_nonoverlapping(src, dst.as_mut_ptr().add(written), count); } - remaining -= count; - if remaining == 0 { + written += count; + if written == len { break; } pos = iov_end; } +} + +/// Gather bytes from iovecs starting at the given byte offset into a Vec. +/// +/// # Safety +/// Caller must ensure iovecs point to valid, readable memory of sufficient size. 
+pub unsafe fn gather_from_iovecs(iovecs: &[libc::iovec], start: usize, len: usize) -> Vec { + let mut result = vec![0u8; len]; + // SAFETY: caller guarantees iovecs are valid; result has len bytes. + unsafe { gather_from_iovecs_into(iovecs, start, &mut result) }; result } From fd8495b342238f35b98f61c27917a2849befd0cb Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 15 Apr 2026 12:18:06 +0200 Subject: [PATCH 616/742] block: qcow: Use aligned I/O in QcowSync Store the alignment from the data file in QcowSync. Use AlignedBuf directly in read_vectored and write_vectored as the intermediate buffer so that aligned_pread/aligned_pwrite can skip the bounce copy when offset and length are naturally aligned. Use gather_from_iovecs_into to gather iovec data directly into the aligned buffer. Signed-off-by: Anatol Belski --- block/src/qcow_sync.rs | 58 +++++++++++++++++++++++++++++++++++------- 1 file changed, 49 insertions(+), 9 deletions(-) diff --git a/block/src/qcow_sync.rs b/block/src/qcow_sync.rs index ec1637543a..648f8d6445 100644 --- a/block/src/qcow_sync.rs +++ b/block/src/qcow_sync.rs @@ -22,7 +22,8 @@ use crate::qcow::metadata::{ use crate::qcow::qcow_raw_file::QcowRawFile; use crate::qcow::{MAX_NESTING_DEPTH, RawFile, parse_qcow}; use crate::qcow_common::{ - gather_from_iovecs, pread_exact, pwrite_all, scatter_to_iovecs, zero_fill_iovecs, + AlignedBuf, aligned_pread, aligned_pwrite, gather_from_iovecs, gather_from_iovecs_into, + pread_exact, pwrite_all, scatter_to_iovecs, zero_fill_iovecs, }; pub struct QcowDiskSync { @@ -153,6 +154,8 @@ pub struct QcowSync { /// See the backing_file field on QcowDiskSync. backing_file: Option>, sparse: bool, + /// O_DIRECT alignment requirement (0 = no alignment needed). 
+ alignment: usize, eventfd: EventFd, completion_list: VecDeque<(u64, i32)>, } @@ -164,11 +167,13 @@ impl QcowSync { backing_file: Option>, sparse: bool, ) -> Self { + let alignment = data_file.file().alignment(); QcowSync { metadata, data_file, backing_file, sparse, + alignment, eventfd: EventFd::new(libc::EFD_NONBLOCK) .expect("Failed creating EventFd for QcowSync"), completion_list: VecDeque::new(), @@ -208,12 +213,29 @@ impl AsyncIo for QcowSync { offset: host_offset, length, } => { - let mut buf = vec![0u8; length as usize]; - pread_exact(self.data_file.as_raw_fd(), &mut buf, host_offset) + let len = length as usize; + if self.alignment > 0 { + // O_DIRECT, aligned buffer avoids bounce copy. + let mut abuf = AlignedBuf::new(len, self.alignment) + .map_err(AsyncIoError::ReadVectored)?; + aligned_pread( + self.data_file.as_raw_fd(), + abuf.as_mut_slice(len), + host_offset, + self.alignment, + ) .map_err(AsyncIoError::ReadVectored)?; - // SAFETY: iovecs point to valid guest memory buffers - unsafe { scatter_to_iovecs(iovecs, buf_offset, &buf) }; - buf_offset += length as usize; + // SAFETY: iovecs point to valid guest memory buffers + unsafe { scatter_to_iovecs(iovecs, buf_offset, abuf.as_slice(len)) }; + } else { + // No O_DIRECT, plain buffer is fine. + let mut buf = vec![0u8; len]; + pread_exact(self.data_file.as_raw_fd(), &mut buf, host_offset) + .map_err(AsyncIoError::ReadVectored)?; + // SAFETY: iovecs point to valid guest memory buffers + unsafe { scatter_to_iovecs(iovecs, buf_offset, &buf) }; + } + buf_offset += len; } ClusterReadMapping::Compressed { data } => { let len = data.len(); @@ -287,10 +309,28 @@ impl AsyncIo for QcowSync { ClusterWriteMapping::Allocated { offset: host_offset, } => { - // SAFETY: iovecs point to valid guest memory buffers - let buf = unsafe { gather_from_iovecs(iovecs, buf_offset, count) }; - pwrite_all(self.data_file.as_raw_fd(), &buf, host_offset) + if self.alignment > 0 { + // O_DIRECT, gather directly into aligned buffer. 
+ let mut abuf = AlignedBuf::new(count, self.alignment) + .map_err(AsyncIoError::WriteVectored)?; + // SAFETY: iovecs point to valid guest memory buffers + unsafe { + gather_from_iovecs_into(iovecs, buf_offset, abuf.as_mut_slice(count)); + } + aligned_pwrite( + self.data_file.as_raw_fd(), + abuf.as_slice(count), + host_offset, + self.alignment, + ) .map_err(AsyncIoError::WriteVectored)?; + } else { + // No O_DIRECT, plain buffer is fine. + // SAFETY: iovecs point to valid guest memory buffers + let buf = unsafe { gather_from_iovecs(iovecs, buf_offset, count) }; + pwrite_all(self.data_file.as_raw_fd(), &buf, host_offset) + .map_err(AsyncIoError::WriteVectored)?; + } } } buf_offset += count; From cfa60a95b98683c514dc9aa5a241b98a9cf8aee0 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 15 Apr 2026 12:18:36 +0200 Subject: [PATCH 617/742] block: qcow: Use aligned I/O in QcowAsync Store the alignment from the data file in QcowAsync. Use aligned_pread in scatter_read_sync and aligned_pwrite with gather_from_iovecs_into in cow_write_sync, matching the QcowSync approach. 
Signed-off-by: Anatol Belski --- block/src/qcow_async.rs | 68 +++++++++++++++++++++++++++++++++++------ 1 file changed, 58 insertions(+), 10 deletions(-) diff --git a/block/src/qcow_async.rs b/block/src/qcow_async.rs index d58b1b1b75..e65a4ac757 100644 --- a/block/src/qcow_async.rs +++ b/block/src/qcow_async.rs @@ -27,7 +27,8 @@ use crate::qcow::metadata::{ use crate::qcow::qcow_raw_file::QcowRawFile; use crate::qcow::{MAX_NESTING_DEPTH, RawFile, parse_qcow}; use crate::qcow_common::{ - gather_from_iovecs, pread_exact, pwrite_all, scatter_to_iovecs, zero_fill_iovecs, + AlignedBuf, aligned_pread, aligned_pwrite, gather_from_iovecs_into, pread_exact, pwrite_all, + scatter_to_iovecs, zero_fill_iovecs, }; use crate::{BatchRequest, RequestType, disk_file}; @@ -172,6 +173,8 @@ pub struct QcowAsync { data_file: QcowRawFile, backing_file: Option>, sparse: bool, + /// O_DIRECT alignment requirement (0 = no alignment needed). + alignment: usize, io_uring: IoUring, eventfd: EventFd, completion_list: VecDeque<(u64, i32)>, @@ -185,6 +188,7 @@ impl QcowAsync { sparse: bool, ring_depth: u32, ) -> io::Result { + let alignment = data_file.file().alignment(); let io_uring = IoUring::new(ring_depth)?; let eventfd = EventFd::new(libc::EFD_NONBLOCK)?; io_uring.submitter().register_eventfd(eventfd.as_raw_fd())?; @@ -194,6 +198,7 @@ impl QcowAsync { data_file, backing_file, sparse, + alignment, io_uring, eventfd, completion_list: VecDeque::new(), @@ -241,6 +246,7 @@ impl AsyncIo for QcowAsync { offset as u64, iovecs, total_len, + self.alignment, )? { let fd = self.data_file.as_raw_fd(); let (submitter, mut sq, _) = self.io_uring.split(); @@ -285,6 +291,7 @@ impl AsyncIo for QcowAsync { &self.metadata, &self.data_file, &self.backing_file, + self.alignment, )?; let total_len: usize = iovecs.iter().map(|v| v.iov_len).sum(); @@ -377,6 +384,7 @@ impl AsyncIo for QcowAsync { req.offset as u64, &req.iovecs, total_len, + self.alignment, )? 
{ let fd = self.data_file.as_raw_fd(); // SAFETY: fd is valid and iovecs point to valid guest memory. @@ -408,6 +416,7 @@ impl AsyncIo for QcowAsync { &self.metadata, &self.data_file, &self.backing_file, + self.alignment, )?; sync_completions.push((req.user_data, total_len as i32)); } @@ -448,6 +457,7 @@ impl QcowAsync { address: u64, iovecs: &[libc::iovec], total_len: usize, + alignment: usize, ) -> AsyncIoResult> { let has_backing = backing_file.is_some(); let mappings = metadata @@ -464,7 +474,7 @@ impl QcowAsync { return Ok(Some(*host_offset)); } - Self::scatter_read_sync(mappings, iovecs, data_file, backing_file)?; + Self::scatter_read_sync(mappings, iovecs, data_file, backing_file, alignment)?; Ok(None) } @@ -474,6 +484,7 @@ impl QcowAsync { iovecs: &[libc::iovec], data_file: &QcowRawFile, backing_file: &Option>, + alignment: usize, ) -> AsyncIoResult<()> { let mut buf_offset = 0usize; for mapping in mappings { @@ -489,12 +500,27 @@ impl QcowAsync { offset: host_offset, length, } => { - let mut buf = vec![0u8; length as usize]; - pread_exact(data_file.as_raw_fd(), &mut buf, host_offset) + let len = length as usize; + if alignment > 0 { + let mut abuf = + AlignedBuf::new(len, alignment).map_err(AsyncIoError::ReadVectored)?; + aligned_pread( + data_file.as_raw_fd(), + abuf.as_mut_slice(len), + host_offset, + alignment, + ) .map_err(AsyncIoError::ReadVectored)?; - // SAFETY: iovecs point to valid guest memory buffers. - unsafe { scatter_to_iovecs(iovecs, buf_offset, &buf) }; - buf_offset += length as usize; + // SAFETY: iovecs point to valid guest memory buffers. + unsafe { scatter_to_iovecs(iovecs, buf_offset, abuf.as_slice(len)) }; + } else { + let mut buf = vec![0u8; len]; + pread_exact(data_file.as_raw_fd(), &mut buf, host_offset) + .map_err(AsyncIoError::ReadVectored)?; + // SAFETY: iovecs point to valid guest memory buffers. 
+ unsafe { scatter_to_iovecs(iovecs, buf_offset, &buf) }; + } + buf_offset += len; } ClusterReadMapping::Compressed { data } => { let len = data.len(); @@ -528,6 +554,7 @@ impl QcowAsync { metadata: &QcowMetadata, data_file: &QcowRawFile, backing_file: &Option>, + alignment: usize, ) -> AsyncIoResult<()> { let total_len: usize = iovecs.iter().map(|v| v.iov_len).sum(); let cluster_size = metadata.cluster_size(); @@ -561,10 +588,31 @@ impl QcowAsync { ClusterWriteMapping::Allocated { offset: host_offset, } => { - // SAFETY: iovecs point to valid guest memory buffers. - let buf = unsafe { gather_from_iovecs(iovecs, buf_offset, count) }; - pwrite_all(data_file.as_raw_fd(), &buf, host_offset) + if alignment > 0 { + // O_DIRECT, gather directly into aligned buffer. + let mut abuf = AlignedBuf::new(count, alignment) + .map_err(AsyncIoError::WriteVectored)?; + // SAFETY: iovecs point to valid guest memory buffers + unsafe { + gather_from_iovecs_into(iovecs, buf_offset, abuf.as_mut_slice(count)); + } + aligned_pwrite( + data_file.as_raw_fd(), + abuf.as_slice(count), + host_offset, + alignment, + ) .map_err(AsyncIoError::WriteVectored)?; + } else { + // No O_DIRECT, plain buffer is fine. + let mut buf = vec![0u8; count]; + // SAFETY: iovecs point to valid guest memory buffers. + unsafe { + gather_from_iovecs_into(iovecs, buf_offset, &mut buf); + } + pwrite_all(data_file.as_raw_fd(), &buf, host_offset) + .map_err(AsyncIoError::WriteVectored)?; + } } } buf_offset += count; From b62525f79226e5455772d4a79b53b7b00cada42c Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Mon, 13 Apr 2026 22:15:56 +0200 Subject: [PATCH 618/742] block: qcow: Add direct_io test coverage for QcowSync Add direct_io variants for suitable tests by extracting test bodies into _impl(direct_io: bool) functions. Each original test calls _impl(false) and a new _direct_io test calls _impl(true). 
When direct_io is true, RawFile probes alignment and QcowSync exercises the AlignedBuf and bounce buffer paths in read_vectored and write_vectored. Signed-off-by: Anatol Belski --- block/src/qcow_sync.rs | 227 +++++++++++++++++++++++++++++++++-------- 1 file changed, 186 insertions(+), 41 deletions(-) diff --git a/block/src/qcow_sync.rs b/block/src/qcow_sync.rs index 648f8d6445..e60392df10 100644 --- a/block/src/qcow_sync.rs +++ b/block/src/qcow_sync.rs @@ -432,6 +432,7 @@ mod unit_tests { data: &[u8], offset: u64, sparse: bool, + direct_io: bool, ) -> (TempFile, QcowDiskSync) { let temp_file = TempFile::new().unwrap(); { @@ -443,7 +444,7 @@ mod unit_tests { } let disk = QcowDiskSync::new( temp_file.as_file().try_clone().unwrap(), - false, + direct_io, false, sparse, ) @@ -485,7 +486,7 @@ mod unit_tests { fn test_qcow_async_punch_hole_completion() { let data = vec![0xDD; 128 * 1024]; let offset = 0u64; - let (_temp, disk) = create_disk_with_data(100 * 1024 * 1024, &data, offset, true); + let (_temp, disk) = create_disk_with_data(100 * 1024 * 1024, &data, offset, true, false); let mut async_io = disk.new_async_io(1).unwrap(); async_io.punch_hole(offset, data.len() as u64, 100).unwrap(); @@ -505,7 +506,7 @@ mod unit_tests { fn test_qcow_async_write_zeroes_completion() { let data = vec![0xEE; 256 * 1024]; let offset = 64 * 1024u64; - let (_temp, disk) = create_disk_with_data(100 * 1024 * 1024, &data, offset, true); + let (_temp, disk) = create_disk_with_data(100 * 1024 * 1024, &data, offset, true, false); let mut async_io = disk.new_async_io(1).unwrap(); async_io @@ -526,7 +527,7 @@ mod unit_tests { #[test] fn test_qcow_async_multiple_operations() { let data = vec![0xFF; 64 * 1024]; - let (_temp, _) = create_disk_with_data(100 * 1024 * 1024, &[], 0, true); + let (_temp, _) = create_disk_with_data(100 * 1024 * 1024, &[], 0, true, false); // Write data at multiple offsets via QcowFile first, then punch { @@ -567,7 +568,7 @@ mod unit_tests { // Verify that after 
punch_hole, a second async_io sees zeros. let data = vec![0xAB; 128 * 1024]; let offset = 0u64; - let (_temp, disk) = create_disk_with_data(100 * 1024 * 1024, &data, offset, true); + let (_temp, disk) = create_disk_with_data(100 * 1024 * 1024, &data, offset, true, false); let mut async_io1 = disk.new_async_io(1).unwrap(); async_io1 @@ -591,7 +592,7 @@ mod unit_tests { // Simulates the real usage pattern of write data, punch hole, then read back. let data = vec![0xCD; 64 * 1024]; // one cluster let offset = 1024 * 1024u64; // 1MB offset - let (_temp, disk) = create_disk_with_data(100 * 1024 * 1024, &data, offset, true); + let (_temp, disk) = create_disk_with_data(100 * 1024 * 1024, &data, offset, true, false); // Punch hole to simulate DISCARD let mut async_io1 = disk.new_async_io(1).unwrap(); @@ -609,9 +610,8 @@ mod unit_tests { ); } - #[test] - fn test_qcow_async_read_write_roundtrip() { - let (_temp, disk) = create_disk_with_data(100 * 1024 * 1024, &[], 0, true); + fn test_qcow_async_read_write_roundtrip_impl(direct_io: bool) { + let (_temp, disk) = create_disk_with_data(100 * 1024 * 1024, &[], 0, true, direct_io); let data = vec![0x42u8; 64 * 1024]; let offset = 0u64; @@ -630,9 +630,18 @@ mod unit_tests { } #[test] - fn test_qcow_async_read_unallocated() { + fn test_qcow_async_read_write_roundtrip() { + test_qcow_async_read_write_roundtrip_impl(false); + } + + #[test] + fn test_qcow_async_read_write_roundtrip_direct_io() { + test_qcow_async_read_write_roundtrip_impl(true); + } + + fn test_qcow_async_read_unallocated_impl(direct_io: bool) { // Reading from an unallocated region should return zeros. 
- let (_temp, disk) = create_disk_with_data(100 * 1024 * 1024, &[], 0, true); + let (_temp, disk) = create_disk_with_data(100 * 1024 * 1024, &[], 0, true, direct_io); let read_buf = async_read(&disk, 0, 64 * 1024); assert!( read_buf.iter().all(|&b| b == 0), @@ -641,8 +650,17 @@ mod unit_tests { } #[test] - fn test_qcow_async_cross_cluster_read_write() { - let (_temp, disk) = create_disk_with_data(100 * 1024 * 1024, &[], 0, true); + fn test_qcow_async_read_unallocated() { + test_qcow_async_read_unallocated_impl(false); + } + + #[test] + fn test_qcow_async_read_unallocated_direct_io() { + test_qcow_async_read_unallocated_impl(true); + } + + fn test_qcow_async_cross_cluster_read_write_impl(direct_io: bool) { + let (_temp, disk) = create_disk_with_data(100 * 1024 * 1024, &[], 0, true, direct_io); // Default cluster size is 64KB. Write 96KB starting at 32KB to cross the boundary. let data: Vec = (0..96 * 1024).map(|i| (i % 251) as u8).collect(); @@ -662,7 +680,16 @@ mod unit_tests { } #[test] - fn test_backing_file_read() { + fn test_qcow_async_cross_cluster_read_write() { + test_qcow_async_cross_cluster_read_write_impl(false); + } + + #[test] + fn test_qcow_async_cross_cluster_read_write_direct_io() { + test_qcow_async_cross_cluster_read_write_impl(true); + } + + fn test_backing_file_read_impl(direct_io: bool) { let backing_temp = TempFile::new().unwrap(); let cluster_size = 1u64 << 16; let file_size = cluster_size * 4; @@ -683,7 +710,7 @@ mod unit_tests { } let file = overlay_temp.as_file().try_clone().unwrap(); - let disk = QcowDiskSync::new(file, false, true, true).unwrap(); + let disk = QcowDiskSync::new(file, direct_io, true, true).unwrap(); // Read first cluster - should come from backing file let buf = async_read(&disk, 0, cluster_size as usize); @@ -719,7 +746,16 @@ mod unit_tests { } #[test] - fn test_backing_file_read_qcow2_backing() { + fn test_backing_file_read() { + test_backing_file_read_impl(false); + } + + #[test] + fn test_backing_file_read_direct_io() 
{ + test_backing_file_read_impl(true); + } + + fn test_backing_file_read_qcow2_backing_impl(direct_io: bool) { let backing_temp = TempFile::new().unwrap(); let cluster_size = 1u64 << 16; let file_size = cluster_size * 4; @@ -745,7 +781,7 @@ mod unit_tests { } let file = overlay_temp.as_file().try_clone().unwrap(); - let disk = QcowDiskSync::new(file, false, true, true).unwrap(); + let disk = QcowDiskSync::new(file, direct_io, true, true).unwrap(); // Read first cluster - should come from QCOW2 backing let buf = async_read(&disk, 0, cluster_size as usize); @@ -786,14 +822,23 @@ mod unit_tests { } #[test] - fn test_multi_queue_concurrent_reads() { + fn test_backing_file_read_qcow2_backing() { + test_backing_file_read_qcow2_backing_impl(false); + } + + #[test] + fn test_backing_file_read_qcow2_backing_direct_io() { + test_backing_file_read_qcow2_backing_impl(true); + } + + fn test_multi_queue_concurrent_reads_impl(direct_io: bool) { // Verify that multiple queues (threads) can read simultaneously. // This exercises the RwLock + pread64 design: concurrent L2 cache hits // proceed in parallel and data reads are position independent. 
let cluster_size = 1u64 << 16; let file_size = cluster_size * 16; let pattern: Vec = (0..file_size as usize).map(|i| (i % 251) as u8).collect(); - let (_temp, disk) = create_disk_with_data(file_size, &pattern, 0, true); + let (_temp, disk) = create_disk_with_data(file_size, &pattern, 0, true, direct_io); let disk = Arc::new(disk); let threads: Vec<_> = (0..8) @@ -822,7 +867,16 @@ mod unit_tests { } #[test] - fn test_multi_queue_concurrent_reads_qcow2_backing() { + fn test_multi_queue_concurrent_reads() { + test_multi_queue_concurrent_reads_impl(false); + } + + #[test] + fn test_multi_queue_concurrent_reads_direct_io() { + test_multi_queue_concurrent_reads_impl(true); + } + + fn test_multi_queue_concurrent_reads_qcow2_backing_impl(direct_io: bool) { // Same as above but reads go through a Qcow2Backing, // exercising concurrent metadata resolution + pread64 in the backing. let backing_temp = TempFile::new().unwrap(); @@ -850,7 +904,7 @@ mod unit_tests { } let file = overlay_temp.as_file().try_clone().unwrap(); - let disk = Arc::new(QcowDiskSync::new(file, false, true, true).unwrap()); + let disk = Arc::new(QcowDiskSync::new(file, direct_io, true, true).unwrap()); let threads: Vec<_> = (0..8) .map(|t| { @@ -877,7 +931,16 @@ mod unit_tests { } #[test] - fn test_three_layer_backing_chain() { + fn test_multi_queue_concurrent_reads_qcow2_backing() { + test_multi_queue_concurrent_reads_qcow2_backing_impl(false); + } + + #[test] + fn test_multi_queue_concurrent_reads_qcow2_backing_direct_io() { + test_multi_queue_concurrent_reads_qcow2_backing_impl(true); + } + + fn test_three_layer_backing_chain_impl(direct_io: bool) { // raw base -> qcow2 mid -> qcow2 overlay // Tests recursive shared_backing_from() with nested backing. 
let cluster_size = 1u64 << 16; @@ -924,7 +987,7 @@ mod unit_tests { } let file = overlay_temp.as_file().try_clone().unwrap(); - let disk = QcowDiskSync::new(file, false, true, true).unwrap(); + let disk = QcowDiskSync::new(file, direct_io, true, true).unwrap(); // Cluster 0: mid wrote 0xBB let buf = async_read(&disk, 0, cluster_size as usize); @@ -960,7 +1023,16 @@ mod unit_tests { } #[test] - fn test_backing_cow_preserves_all_unwritten_clusters() { + fn test_three_layer_backing_chain() { + test_three_layer_backing_chain_impl(false); + } + + #[test] + fn test_three_layer_backing_chain_direct_io() { + test_three_layer_backing_chain_impl(true); + } + + fn test_backing_cow_preserves_all_unwritten_clusters_impl(direct_io: bool) { // Write to specific clusters in the overlay, verify all others still // read from the qcow2 backing correctly. let cluster_size = 1u64 << 16; @@ -990,7 +1062,7 @@ mod unit_tests { } let file = overlay_temp.as_file().try_clone().unwrap(); - let disk = QcowDiskSync::new(file, false, true, true).unwrap(); + let disk = QcowDiskSync::new(file, direct_io, true, true).unwrap(); let written = vec![0xFFu8; cluster_size as usize]; for &idx in &[0u64, 3, 7] { @@ -1025,7 +1097,16 @@ mod unit_tests { } #[test] - fn test_qcow2_backing_read_beyond_virtual_size() { + fn test_backing_cow_preserves_all_unwritten_clusters() { + test_backing_cow_preserves_all_unwritten_clusters_impl(false); + } + + #[test] + fn test_backing_cow_preserves_all_unwritten_clusters_direct_io() { + test_backing_cow_preserves_all_unwritten_clusters_impl(true); + } + + fn test_qcow2_backing_read_beyond_virtual_size_impl(direct_io: bool) { // Read starting past the backing file virtual_size should return zeros. 
let cluster_size = 1u64 << 16; let backing_size = cluster_size * 2; @@ -1053,7 +1134,7 @@ mod unit_tests { } let file = overlay_temp.as_file().try_clone().unwrap(); - let disk = QcowDiskSync::new(file, false, true, true).unwrap(); + let disk = QcowDiskSync::new(file, direct_io, true, true).unwrap(); // Read cluster 2 (past backing virtual_size) - should be zeros let buf = async_read(&disk, backing_size, cluster_size as usize); @@ -1064,7 +1145,16 @@ mod unit_tests { } #[test] - fn test_qcow2_backing_read_spanning_virtual_size() { + fn test_qcow2_backing_read_beyond_virtual_size() { + test_qcow2_backing_read_beyond_virtual_size_impl(false); + } + + #[test] + fn test_qcow2_backing_read_beyond_virtual_size_direct_io() { + test_qcow2_backing_read_beyond_virtual_size_impl(true); + } + + fn test_qcow2_backing_read_spanning_virtual_size_impl(direct_io: bool) { // Read that starts within backing bounds but extends past virtual_size. // First part should have backing data, remainder should be zeros. let cluster_size = 1u64 << 16; @@ -1094,7 +1184,7 @@ mod unit_tests { } let file = overlay_temp.as_file().try_clone().unwrap(); - let disk = QcowDiskSync::new(file, false, true, true).unwrap(); + let disk = QcowDiskSync::new(file, direct_io, true, true).unwrap(); // Read 2 clusters starting at cluster 1 (spans backing boundary) let read_len = cluster_size as usize * 2; @@ -1114,7 +1204,16 @@ mod unit_tests { } #[test] - fn test_raw_backing_read_beyond_virtual_size() { + fn test_qcow2_backing_read_spanning_virtual_size() { + test_qcow2_backing_read_spanning_virtual_size_impl(false); + } + + #[test] + fn test_qcow2_backing_read_spanning_virtual_size_direct_io() { + test_qcow2_backing_read_spanning_virtual_size_impl(true); + } + + fn test_raw_backing_read_beyond_virtual_size_impl(direct_io: bool) { // Read past raw backing file virtual_size should return zeros. 
let cluster_size = 1u64 << 16; let backing_size = cluster_size * 2; @@ -1138,7 +1237,7 @@ mod unit_tests { } let file = overlay_temp.as_file().try_clone().unwrap(); - let disk = QcowDiskSync::new(file, false, true, true).unwrap(); + let disk = QcowDiskSync::new(file, direct_io, true, true).unwrap(); // Read cluster 2 (past backing size) - should be zeros let buf = async_read(&disk, backing_size, cluster_size as usize); @@ -1161,7 +1260,16 @@ mod unit_tests { } #[test] - fn test_qcow2_backing_cross_cluster_read() { + fn test_raw_backing_read_beyond_virtual_size() { + test_raw_backing_read_beyond_virtual_size_impl(false); + } + + #[test] + fn test_raw_backing_read_beyond_virtual_size_direct_io() { + test_raw_backing_read_beyond_virtual_size_impl(true); + } + + fn test_qcow2_backing_cross_cluster_read_impl(direct_io: bool) { // Read spanning a cluster boundary through qcow2 backing. // Exercises the read_clusters loop in Qcow2Backing. let cluster_size = 1u64 << 16; @@ -1190,7 +1298,7 @@ mod unit_tests { } let file = overlay_temp.as_file().try_clone().unwrap(); - let disk = QcowDiskSync::new(file, false, true, true).unwrap(); + let disk = QcowDiskSync::new(file, direct_io, true, true).unwrap(); // Read spanning clusters 1-2 boundary: 512 bytes before + 512 after let mid = cluster_size - 512; @@ -1214,7 +1322,16 @@ mod unit_tests { } #[test] - fn test_punch_hole_with_backing_fallthrough() { + fn test_qcow2_backing_cross_cluster_read() { + test_qcow2_backing_cross_cluster_read_impl(false); + } + + #[test] + fn test_qcow2_backing_cross_cluster_read_direct_io() { + test_qcow2_backing_cross_cluster_read_impl(true); + } + + fn test_punch_hole_with_backing_fallthrough_impl(direct_io: bool) { // Write to overlay, then punch hole. After punch, the cluster should // fall through to backing data (not zeros). 
let cluster_size = 1u64 << 16; @@ -1238,7 +1355,7 @@ mod unit_tests { } let file = overlay_temp.as_file().try_clone().unwrap(); - let disk = QcowDiskSync::new(file, false, true, true).unwrap(); + let disk = QcowDiskSync::new(file, direct_io, true, true).unwrap(); let written = vec![0xFFu8; cluster_size as usize]; async_write(&disk, 0, &written); @@ -1277,10 +1394,19 @@ mod unit_tests { } #[test] - fn test_rewrite_allocated_cluster() { + fn test_punch_hole_with_backing_fallthrough() { + test_punch_hole_with_backing_fallthrough_impl(false); + } + + #[test] + fn test_punch_hole_with_backing_fallthrough_direct_io() { + test_punch_hole_with_backing_fallthrough_impl(true); + } + + fn test_rewrite_allocated_cluster_impl(direct_io: bool) { // Write to a cluster, then overwrite it. The second write should hit // the already allocated path in map_write (no new cluster allocation). - let (_temp, disk) = create_disk_with_data(100 * 1024 * 1024, &[], 0, true); + let (_temp, disk) = create_disk_with_data(100 * 1024 * 1024, &[], 0, true, direct_io); let cluster_size = 1u64 << 16; let data1 = vec![0xAAu8; cluster_size as usize]; @@ -1306,7 +1432,16 @@ mod unit_tests { } #[test] - fn test_partial_cluster_write_with_backing_cow() { + fn test_rewrite_allocated_cluster() { + test_rewrite_allocated_cluster_impl(false); + } + + #[test] + fn test_rewrite_allocated_cluster_direct_io() { + test_rewrite_allocated_cluster_impl(true); + } + + fn test_partial_cluster_write_with_backing_cow_impl(direct_io: bool) { // Partial cluster write to an overlay with a backing file triggers COW. // The unwritten part of the cluster must be copied from backing. 
let cluster_size = 1u64 << 16; @@ -1330,7 +1465,7 @@ mod unit_tests { } let file = overlay_temp.as_file().try_clone().unwrap(); - let disk = QcowDiskSync::new(file, false, true, true).unwrap(); + let disk = QcowDiskSync::new(file, direct_io, true, true).unwrap(); // Write 4KB at offset 4KB within cluster 0 (partial cluster) let write_offset = 4096u64; @@ -1366,6 +1501,16 @@ mod unit_tests { ); } + #[test] + fn test_partial_cluster_write_with_backing_cow() { + test_partial_cluster_write_with_backing_cow_impl(false); + } + + #[test] + fn test_partial_cluster_write_with_backing_cow_direct_io() { + test_partial_cluster_write_with_backing_cow_impl(true); + } + #[test] fn test_partial_cluster_deallocate() { // Punch hole on a partial cluster range. The deallocate_bytes path @@ -1376,7 +1521,7 @@ mod unit_tests { let data: Vec = (0..2 * cluster_size as usize) .map(|i| (i % 251) as u8) .collect(); - let (_temp, disk) = create_disk_with_data(file_size, &data, 0, true); + let (_temp, disk) = create_disk_with_data(file_size, &data, 0, true, false); // Punch a partial range: last 4KB of cluster 0 + first 4KB of cluster 1 let punch_offset = cluster_size - 4096; @@ -1420,7 +1565,7 @@ mod unit_tests { let cluster_size = 1u64 << 16; let initial_size = cluster_size * 4; let data = vec![0xAA; cluster_size as usize]; - let (_temp, mut disk) = create_disk_with_data(initial_size, &data, 0, true); + let (_temp, mut disk) = create_disk_with_data(initial_size, &data, 0, true, false); assert_eq!(disk.logical_size().unwrap(), initial_size); From 7fd5e74f0ef75e728ba5f8ec24f7a61930a9f5ef Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Mon, 13 Apr 2026 22:19:36 +0200 Subject: [PATCH 619/742] block: qcow: Add multi iovec read/write test Exercise scatter/gather with multiple iovecs per operation, covering both the standard and direct_io paths. Write uses 3 iovecs with distinct patterns, read uses 3 iovecs with different sizes, then reassembles and compares. 
Signed-off-by: Anatol Belski --- block/src/qcow_sync.rs | 83 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/block/src/qcow_sync.rs b/block/src/qcow_sync.rs index e60392df10..8b5f5c3f60 100644 --- a/block/src/qcow_sync.rs +++ b/block/src/qcow_sync.rs @@ -1636,4 +1636,87 @@ mod unit_tests { "size should be unchanged after failed resize" ); } + + fn test_multi_iovec_read_write_impl(direct_io: bool) { + // Exercise scatter/gather with multiple iovecs per operation. + let (_temp, disk) = create_disk_with_data(100 * 1024 * 1024, &[], 0, true, direct_io); + + // Write: 3 iovecs with distinct patterns + let a = vec![0xAAu8; 16 * 1024]; + let b = vec![0xBBu8; 32 * 1024]; + let c = vec![0xCCu8; 16 * 1024]; + let iovecs_w = [ + libc::iovec { + iov_base: a.as_ptr() as *mut libc::c_void, + iov_len: a.len(), + }, + libc::iovec { + iov_base: b.as_ptr() as *mut libc::c_void, + iov_len: b.len(), + }, + libc::iovec { + iov_base: c.as_ptr() as *mut libc::c_void, + iov_len: c.len(), + }, + ]; + let total = a.len() + b.len() + c.len(); + + let mut aio = disk.new_async_io(1).unwrap(); + aio.write_vectored(0, &iovecs_w, 1).unwrap(); + let (ud, res) = aio.next_completed_request().unwrap(); + assert_eq!(ud, 1); + assert_eq!(res as usize, total); + aio.fsync(Some(2)).unwrap(); + drop(aio); + + // Read back into 3 iovecs of different sizes + let mut r1 = vec![0u8; 8 * 1024]; + let mut r2 = vec![0u8; 48 * 1024]; + let mut r3 = vec![0u8; 8 * 1024]; + let iovecs_r = [ + libc::iovec { + iov_base: r1.as_mut_ptr() as *mut libc::c_void, + iov_len: r1.len(), + }, + libc::iovec { + iov_base: r2.as_mut_ptr() as *mut libc::c_void, + iov_len: r2.len(), + }, + libc::iovec { + iov_base: r3.as_mut_ptr() as *mut libc::c_void, + iov_len: r3.len(), + }, + ]; + + let mut aio = disk.new_async_io(1).unwrap(); + aio.read_vectored(0, &iovecs_r, 10).unwrap(); + let (ud, res) = aio.next_completed_request().unwrap(); + assert_eq!(ud, 10); + assert_eq!(res as usize, total); + 
drop(aio); + + // Reassemble the read buffers into a flat vec + let mut got = Vec::with_capacity(total); + got.extend_from_slice(&r1); + got.extend_from_slice(&r2); + got.extend_from_slice(&r3); + + // Build expected from the write buffers + let mut expected = Vec::with_capacity(total); + expected.extend_from_slice(&a); + expected.extend_from_slice(&b); + expected.extend_from_slice(&c); + + assert_eq!(got, expected, "Multi iovec read should match written data"); + } + + #[test] + fn test_multi_iovec_read_write() { + test_multi_iovec_read_write_impl(false); + } + + #[test] + fn test_multi_iovec_read_write_direct_io() { + test_multi_iovec_read_write_impl(true); + } } From fe711b3a0b63bb09496c35c7419a4dfbc2753517 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Mon, 13 Apr 2026 22:29:05 +0200 Subject: [PATCH 620/742] block: qcow: Add aligned_pread pass through test Test that aligned_pread takes the fast path when buffer address, length, and offset are all properly aligned. Signed-off-by: Anatol Belski --- block/src/qcow_sync.rs | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/block/src/qcow_sync.rs b/block/src/qcow_sync.rs index 8b5f5c3f60..c3e57da2b9 100644 --- a/block/src/qcow_sync.rs +++ b/block/src/qcow_sync.rs @@ -419,6 +419,7 @@ impl AsyncIo for QcowSync { #[cfg(test)] mod unit_tests { use std::io::{Seek, SeekFrom, Write}; + use std::os::fd::RawFd; use std::thread; use vmm_sys_util::tempfile::TempFile; @@ -1719,4 +1720,36 @@ mod unit_tests { fn test_multi_iovec_read_write_direct_io() { test_multi_iovec_read_write_impl(true); } + + // -- Low level aligned I/O function tests -- + // + // Test aligned_pread and aligned_pwrite directly with controlled + // alignment values on a plain temp file. + + /// Create a temp file filled with a repeating pattern of the given size. + /// Returns the TempFile (must be kept alive) and the raw fd. 
+ fn create_pattern_file(size: usize) -> (TempFile, RawFd) { + let tf = TempFile::new().unwrap(); + let pattern: Vec = (0..size).map(|i| (i % 251) as u8).collect(); + tf.as_file().write_all(&pattern).unwrap(); + tf.as_file().sync_all().unwrap(); + let fd = tf.as_file().as_raw_fd(); + (tf, fd) + } + + #[test] + fn test_aligned_pread_pass_through() { + // When buffer address, length, and offset are all aligned, + // aligned_pread should take the fast path (no bounce buffer). + let size = 4096usize; + let (_tf, fd) = create_pattern_file(size); + let alignment = 512; + + // Use AlignedBuf to guarantee buffer address alignment. + let mut abuf = AlignedBuf::new(size, alignment).unwrap(); + aligned_pread(fd, abuf.as_mut_slice(size), 0, alignment).unwrap(); + + let expected: Vec = (0..size).map(|i| (i % 251) as u8).collect(); + assert_eq!(abuf.as_slice(size), &expected[..]); + } } From 021838b63cb963b4bb8e323f8d2ab74a1c54557b Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Mon, 13 Apr 2026 22:30:25 +0200 Subject: [PATCH 621/742] block: qcow: Add aligned_pread bounce buffer test Test that aligned_pread correctly uses a bounce buffer when the caller buffer address is not aligned. Signed-off-by: Anatol Belski --- block/src/qcow_sync.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/block/src/qcow_sync.rs b/block/src/qcow_sync.rs index c3e57da2b9..cf3b27c254 100644 --- a/block/src/qcow_sync.rs +++ b/block/src/qcow_sync.rs @@ -1752,4 +1752,21 @@ mod unit_tests { let expected: Vec = (0..size).map(|i| (i % 251) as u8).collect(); assert_eq!(abuf.as_slice(size), &expected[..]); } + + #[test] + fn test_aligned_pread_bounce_unaligned_buffer() { + // Force a misaligned buffer so aligned_pread must take the + // bounce path. A plain vec![0u8; 4096] is often page-aligned + // by the allocator, which would skip the bounce entirely. 
+ let size = 4096usize; + let (_tf, fd) = create_pattern_file(size); + let alignment = 512; + + let mut backing = vec![0u8; size + 1]; + let buf = &mut backing[1..size + 1]; + aligned_pread(fd, buf, 0, alignment).unwrap(); + + let expected: Vec = (0..size).map(|i| (i % 251) as u8).collect(); + assert_eq!(buf, &expected[..]); + } } From 05d52d035371047f0e82a77b422ef20a0a9659a1 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Mon, 13 Apr 2026 22:31:35 +0200 Subject: [PATCH 622/742] block: qcow: Add aligned_pread unaligned offset test Test that aligned_pread handles a non aligned offset by rounding down, reading an aligned region, and returning the correct slice from within the bounce buffer. Signed-off-by: Anatol Belski --- block/src/qcow_sync.rs | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/block/src/qcow_sync.rs b/block/src/qcow_sync.rs index cf3b27c254..5b4a6b2168 100644 --- a/block/src/qcow_sync.rs +++ b/block/src/qcow_sync.rs @@ -1769,4 +1769,24 @@ mod unit_tests { let expected: Vec = (0..size).map(|i| (i % 251) as u8).collect(); assert_eq!(buf, &expected[..]); } + + #[test] + fn test_aligned_pread_unaligned_offset() { + // Read at an offset that is not a multiple of alignment. + // aligned_pread should round down the offset, read an aligned + // region, then copy the correct slice into the caller buffer. 
+ let file_size = 8192usize; + let (_tf, fd) = create_pattern_file(file_size); + let alignment = 512; + + let offset = 100u64; + let len = 200usize; + let mut buf = vec![0u8; len]; + aligned_pread(fd, &mut buf, offset, alignment).unwrap(); + + let expected: Vec = (offset as usize..offset as usize + len) + .map(|i| (i % 251) as u8) + .collect(); + assert_eq!(buf, expected); + } } From 32af2f2a22bbc49d4c8bd07e0bc09809cd7180cd Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Mon, 13 Apr 2026 22:35:44 +0200 Subject: [PATCH 623/742] block: qcow: Test aligned_pwrite pass through path Write 4096 bytes of pattern data at offset 0 using AlignedBuf and verify data integrity via plain pread_exact. All parameters are naturally aligned to 512 so the fast path is exercised. Signed-off-by: Anatol Belski --- block/src/qcow_sync.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/block/src/qcow_sync.rs b/block/src/qcow_sync.rs index 5b4a6b2168..af6bfc1ba7 100644 --- a/block/src/qcow_sync.rs +++ b/block/src/qcow_sync.rs @@ -1789,4 +1789,22 @@ mod unit_tests { .collect(); assert_eq!(buf, expected); } + + #[test] + fn test_aligned_pwrite_pass_through() { + // When buffer address, length, and offset are all aligned, + // aligned_pwrite should take the fast path. 
+ let size = 4096usize; + let (_tf, fd) = create_pattern_file(size); + let alignment = 512; + + let data: Vec = (0..size).map(|i| ((i + 1) % 251) as u8).collect(); + let mut abuf = AlignedBuf::new(size, alignment).unwrap(); + abuf.as_mut_slice(size).copy_from_slice(&data); + aligned_pwrite(fd, abuf.as_slice(size), 0, alignment).unwrap(); + + let mut readback = vec![0u8; size]; + pread_exact(fd, &mut readback, 0).unwrap(); + assert_eq!(readback, data); + } } From 5ed9f2e3d8a480f11e06594182e924a13702be53 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Mon, 13 Apr 2026 22:37:40 +0200 Subject: [PATCH 624/742] block: qcow: Test aligned_pwrite bounce unaligned buffer Write 4096 bytes via plain Vec whose address is not guaranteed to be aligned. The bounce buffer path copies data into an aligned allocation before the syscall. Read back with pread_exact to verify data integrity. Signed-off-by: Anatol Belski --- block/src/qcow_sync.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/block/src/qcow_sync.rs b/block/src/qcow_sync.rs index af6bfc1ba7..8989354c3f 100644 --- a/block/src/qcow_sync.rs +++ b/block/src/qcow_sync.rs @@ -1807,4 +1807,22 @@ mod unit_tests { pread_exact(fd, &mut readback, 0).unwrap(); assert_eq!(readback, data); } + + #[test] + fn test_aligned_pwrite_bounce_unaligned_buffer() { + // Force a misaligned buffer so aligned_pwrite must take the + // bounce path. A plain vec![0u8; 4096] is often page-aligned + // by the allocator, which would skip the bounce entirely. 
+ let size = 4096usize; + let (_tf, fd) = create_pattern_file(size); + let alignment = 512; + + let backing: Vec = (0..size + 1).map(|i| ((i + 1) % 251) as u8).collect(); + let data = &backing[1..size + 1]; + aligned_pwrite(fd, data, 0, alignment).unwrap(); + + let mut readback = vec![0u8; size]; + pread_exact(fd, &mut readback, 0).unwrap(); + assert_eq!(readback, data); + } } From 7aa477936e16dd6867b6a9c5c8e8db7b8fc1e35a Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Mon, 13 Apr 2026 22:39:19 +0200 Subject: [PATCH 625/742] block: qcow: Test aligned_pwrite unaligned offset Write at offset 100 with alignment 512 so the read modify write path is exercised. Verify the written region and that surrounding data is preserved. Signed-off-by: Anatol Belski --- block/src/qcow_sync.rs | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/block/src/qcow_sync.rs b/block/src/qcow_sync.rs index 8989354c3f..14b9c7c362 100644 --- a/block/src/qcow_sync.rs +++ b/block/src/qcow_sync.rs @@ -1825,4 +1825,35 @@ mod unit_tests { pread_exact(fd, &mut readback, 0).unwrap(); assert_eq!(readback, data); } + + #[test] + fn test_aligned_pwrite_unaligned_offset() { + // Write at an offset that is not a multiple of alignment. + // aligned_pwrite should do read-modify-write and preserve + // surrounding data. + let file_size = 8192usize; + let (_tf, fd) = create_pattern_file(file_size); + let alignment = 512; + + let offset = 100u64; + let len = 200usize; + let data: Vec = (0..len).map(|i| ((i + 1) % 239) as u8).collect(); + aligned_pwrite(fd, &data, offset, alignment).unwrap(); + + // Read entire file and verify the written region plus untouched areas. + let mut whole = vec![0u8; file_size]; + pread_exact(fd, &mut whole, 0).unwrap(); + + // Before the write region: original pattern. + let before: Vec = (0..offset as usize).map(|i| (i % 251) as u8).collect(); + assert_eq!(&whole[..offset as usize], &before[..]); + + // The written region. 
+ assert_eq!(&whole[offset as usize..offset as usize + len], &data[..]); + + // After the write region: original pattern. + let after_start = offset as usize + len; + let after: Vec = (after_start..file_size).map(|i| (i % 251) as u8).collect(); + assert_eq!(&whole[after_start..], &after[..]); + } } From ab811126189cdaf113f98cda31aa33118a1fd650 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Mon, 13 Apr 2026 22:40:26 +0200 Subject: [PATCH 626/742] block: qcow: Test aligned pread and pwrite with 4096 alignment Exercise both aligned_pread and aligned_pwrite with 4096 byte alignment instead of 512. Verify written data and that surrounding regions are preserved. Signed-off-by: Anatol Belski --- block/src/qcow_sync.rs | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/block/src/qcow_sync.rs b/block/src/qcow_sync.rs index 14b9c7c362..a98557abe5 100644 --- a/block/src/qcow_sync.rs +++ b/block/src/qcow_sync.rs @@ -1856,4 +1856,32 @@ mod unit_tests { let after: Vec = (after_start..file_size).map(|i| (i % 251) as u8).collect(); assert_eq!(&whole[after_start..], &after[..]); } + + #[test] + fn test_aligned_pread_pwrite_4096_alignment() { + // Exercise aligned I/O with 4096 byte alignment. + let file_size = 16384usize; + let (_tf, fd) = create_pattern_file(file_size); + let alignment = 4096; + + // Write 4096 bytes at offset 4096 via unaligned Vec. + let offset = 4096u64; + let len = 4096usize; + let data: Vec = (0..len).map(|i| ((i + 1) % 239) as u8).collect(); + aligned_pwrite(fd, &data, offset, alignment).unwrap(); + + // Read back the written region via unaligned Vec. + let mut buf = vec![0u8; len]; + aligned_pread(fd, &mut buf, offset, alignment).unwrap(); + assert_eq!(buf, data); + + // Verify untouched regions. 
+ let mut whole = vec![0u8; file_size]; + pread_exact(fd, &mut whole, 0).unwrap(); + let before: Vec = (0..offset as usize).map(|i| (i % 251) as u8).collect(); + assert_eq!(&whole[..offset as usize], &before[..]); + let after_start = offset as usize + len; + let after: Vec = (after_start..file_size).map(|i| (i % 251) as u8).collect(); + assert_eq!(&whole[after_start..], &after[..]); + } } From e5ad85d7def285d093f85916b7096ec5cb412297 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Mon, 13 Apr 2026 23:34:26 +0200 Subject: [PATCH 627/742] test_infra: Prepare QCOW2 overlay for Windows guests WindowsDiskConfig now creates a qcow2 overlay backed by the raw Windows image during prepare_files(). The overlay is placed under ~/workloads alongside the raw image. Writes go into the overlay so the backing raw image stays unmodified, matching the CoW semantics already provided by the dm snapshot for raw tests. Drop removes the qcow2 file. The DiskConfig trait gains a qcow2_disk() default method returning None. WindowsDiskConfig overrides it to expose the overlay path. 
Signed-off-by: Anatol Belski --- test_infra/src/lib.rs | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/test_infra/src/lib.rs b/test_infra/src/lib.rs index e32ee791fc..091fef7c8f 100644 --- a/test_infra/src/lib.rs +++ b/test_infra/src/lib.rs @@ -226,6 +226,9 @@ pub trait DiskConfig { fn prepare_files(&mut self, tmp_dir: &TempDir, network: &GuestNetworkConfig); fn prepare_cloudinit(&self, tmp_dir: &TempDir, network: &GuestNetworkConfig) -> String; fn disk(&self, disk_type: DiskType) -> Option; + fn qcow2_disk(&self) -> Option { + None + } } #[derive(Clone)] @@ -248,6 +251,7 @@ impl UbuntuDiskConfig { pub struct WindowsDiskConfig { image_name: String, osdisk_path: String, + osdisk_qcow2_path: String, loopback_device: String, windows_snapshot_cow: String, windows_snapshot: String, @@ -258,6 +262,7 @@ impl WindowsDiskConfig { WindowsDiskConfig { image_name, osdisk_path: String::new(), + osdisk_qcow2_path: String::new(), loopback_device: String::new(), windows_snapshot_cow: String::new(), windows_snapshot: String::new(), @@ -286,6 +291,10 @@ impl Drop for WindowsDiskConfig { .args(["-d", self.loopback_device.as_str()]) .output() .expect("Expect removing loopback device to succeed"); + + if !self.osdisk_qcow2_path.is_empty() { + let _ = fs::remove_file(&self.osdisk_qcow2_path); + } } } @@ -451,7 +460,7 @@ impl DiskConfig for WindowsDiskConfig { let mut osdisk_path = workload_path; osdisk_path.push(&self.image_name); - let osdisk_blk_size = fs::metadata(osdisk_path) + let osdisk_blk_size = fs::metadata(&osdisk_path) .expect("Expect retrieving Windows image metadata") .len() >> 9; @@ -530,6 +539,27 @@ impl DiskConfig for WindowsDiskConfig { self.osdisk_path = format!("/dev/mapper/{windows_snapshot}"); self.windows_snapshot_cow = windows_snapshot_cow; self.windows_snapshot = windows_snapshot; + + // Create a qcow2 overlay backed by the raw image. 
+ let mut workload_path = dirs::home_dir().unwrap(); + workload_path.push("workloads"); + let qcow2_name = format!("windows-qcow2-{}.qcow2", random_extension.to_str().unwrap()); + let qcow2_path = workload_path.join(&qcow2_name); + let output = Command::new("qemu-img") + .args([ + "create", + "-f", + "qcow2", + "-b", + osdisk_path.to_str().unwrap(), + "-F", + "raw", + qcow2_path.to_str().unwrap(), + ]) + .output() + .expect("Expect creating qcow2 overlay to succeed"); + assert!(output.status.success(), "qemu-img create failed"); + self.osdisk_qcow2_path = qcow2_path.to_str().unwrap().to_string(); } fn disk(&self, disk_type: DiskType) -> Option { @@ -538,6 +568,10 @@ impl DiskConfig for WindowsDiskConfig { DiskType::CloudInit => None, } } + + fn qcow2_disk(&self) -> Option { + Some(self.osdisk_qcow2_path.clone()) + } } pub fn rate_limited_copy, Q: AsRef>(from: P, to: Q) -> io::Result { From 11cf332114028a2f1d33357527e8a741b23eaeab Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Mon, 13 Apr 2026 23:44:45 +0200 Subject: [PATCH 628/742] tests: Add Windows QCOW2 guest boot with direct I/O test Boot a Windows guest from a qcow2 overlay with direct=on. After boot, write 5 randomly filled files from 4MB to 20MB, copy each file, and compare SHA256 hashes to verify data integrity through the aligned bounce buffer path. 
Signed-off-by: Anatol Belski --- cloud-hypervisor/tests/integration.rs | 67 +++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index c4bccc46ad..cc24cb8ebb 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -8337,6 +8337,73 @@ mod windows { handle_child_output(r, &output); } + + #[test] + fn test_windows_guest_qcow2_backing_direct() { + let windows_guest = WindowsGuest::new(); + + let qcow2_path = windows_guest.guest().disk_config.qcow2_disk().unwrap(); + + let mut child = GuestCommand::new(windows_guest.guest()) + .args(["--cpus", "boot=2,kvm_hyperv=on"]) + .args(["--memory", "size=4G"]) + .args(["--kernel", edk2_path().to_str().unwrap()]) + .args(["--serial", "tty"]) + .args(["--console", "off"]) + .args([ + "--disk", + format!("path={qcow2_path},image_type=qcow2,backing_files=on,direct=on").as_str(), + ]) + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let fd = child.stdout.as_ref().unwrap().as_raw_fd(); + let pipesize = unsafe { libc::fcntl(fd, libc::F_SETPIPE_SZ, PIPE_SIZE) }; + let fd = child.stderr.as_ref().unwrap().as_raw_fd(); + let pipesize1 = unsafe { libc::fcntl(fd, libc::F_SETPIPE_SZ, PIPE_SIZE) }; + + assert!(pipesize >= PIPE_SIZE && pipesize1 >= PIPE_SIZE); + + let mut child_dnsmasq = windows_guest.run_dnsmasq(); + + let r = std::panic::catch_unwind(|| { + windows_guest.wait_for_boot().unwrap(); + + // Write and read back files through qcow2 + direct I/O. 
+ for i in 0..5 { + let fname = format!("c:\\test-dio-{i}.bin"); + let fname2 = format!("c:\\test-dio-{i}-copy.bin"); + let size = (i + 1) * 4 * 1024 * 1024; + windows_guest.ssh_cmd(&format!( + "powershell -Command \"\ + $r = New-Object byte[] {size}; \ + (New-Object Random {i}).NextBytes($r); \ + [IO.File]::WriteAllBytes('{fname}', $r)\"" + )); + let hash_write = windows_guest.ssh_cmd(&format!( + "powershell -Command \"(Get-FileHash '{fname}' -Algorithm SHA256).Hash\"" + )); + windows_guest.ssh_cmd(&format!("copy {fname} {fname2}")); + let hash_read = windows_guest.ssh_cmd(&format!( + "powershell -Command \"(Get-FileHash '{fname2}' -Algorithm SHA256).Hash\"" + )); + assert_eq!(hash_write.trim(), hash_read.trim()); + } + + windows_guest.shutdown(); + }); + + let _ = child.wait_timeout(std::time::Duration::from_secs(60)); + let _ = child.kill(); + let output = child.wait_with_output().unwrap(); + + let _ = child_dnsmasq.kill(); + let _ = child_dnsmasq.wait(); + + handle_child_output(r, &output); + } } #[cfg(target_arch = "x86_64")] From a84a0b8b25b2eb5c9dc7c343318a544bdb2fc1e4 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Tue, 14 Apr 2026 18:18:25 +0200 Subject: [PATCH 629/742] block: qcow: Add AlignedBuf allocation and access test Test AlignedBuf with 512 and 4096 byte alignment. Verify pointer alignment, zero initialization, and write/read round trip. Passes under miri. 
Signed-off-by: Anatol Belski --- block/src/qcow_sync.rs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/block/src/qcow_sync.rs b/block/src/qcow_sync.rs index a98557abe5..e696638eb4 100644 --- a/block/src/qcow_sync.rs +++ b/block/src/qcow_sync.rs @@ -1884,4 +1884,23 @@ mod unit_tests { let after: Vec = (after_start..file_size).map(|i| (i % 251) as u8).collect(); assert_eq!(&whole[after_start..], &after[..]); } + + #[test] + fn test_aligned_buf_allocation_and_access() { + for alignment in [512, 4096] { + let size = 1024usize; + let mut abuf = AlignedBuf::new(size, alignment).unwrap(); + let aligned_size = size.next_multiple_of(alignment); + + assert!( + (abuf.ptr() as usize).is_multiple_of(alignment), + "ptr not aligned to {alignment}" + ); + assert!(abuf.as_slice(aligned_size).iter().all(|&b| b == 0)); + + let pattern: Vec = (0..size).map(|i| (i % 251) as u8).collect(); + abuf.as_mut_slice(size).copy_from_slice(&pattern); + assert_eq!(abuf.as_slice(size), &pattern[..]); + } + } } From dc0e003be05133cabfa00b157319c758f1381877 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Tue, 14 Apr 2026 18:19:02 +0200 Subject: [PATCH 630/742] block: qcow: Add AlignedBuf size rounding test Verify that AlignedBuf rounds the allocation size up to the requested alignment. Passes under miri. 
Signed-off-by: Anatol Belski --- block/src/qcow_sync.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/block/src/qcow_sync.rs b/block/src/qcow_sync.rs index e696638eb4..44b8efaf5a 100644 --- a/block/src/qcow_sync.rs +++ b/block/src/qcow_sync.rs @@ -1903,4 +1903,13 @@ mod unit_tests { assert_eq!(abuf.as_slice(size), &pattern[..]); } } + + #[test] + fn test_aligned_buf_size_rounds_up() { + let abuf = AlignedBuf::new(1, 512).unwrap(); + assert_eq!(abuf.layout().size(), 512); + + let abuf = AlignedBuf::new(513, 512).unwrap(); + assert_eq!(abuf.layout().size(), 1024); + } } From c698075157c231a75c5fee0ba9e1c61290157364 Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Mon, 9 Mar 2026 16:02:40 +0100 Subject: [PATCH 631/742] vmm: add guest exit event path Introduce a dedicated guest_exit_evt and a matching epoll dispatch path for guest-triggered shutdowns. This series is needed because management software such as libvirt may still need the Cloud Hypervisor process to stay alive after the guest has shut down. Today a guest-triggered shutdown can make the VMM disappear immediately, which means the management software can lose track of the VM run-state. This must only apply to guest-triggered shutdowns. Fatal error paths and other internal exit paths must keep using the existing VMM exit handling. For now GuestExit still calls vmm_shutdown(), so this commit only adds the separate plumbing and keeps the current behavior unchanged.
On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- vmm/src/device_manager.rs | 7 +++++++ vmm/src/lib.rs | 35 +++++++++++++++++++++++++++++++++++ vmm/src/vm.rs | 6 ++++++ 3 files changed, 48 insertions(+) diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 39281e53ed..193b411d45 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -1080,6 +1080,7 @@ pub struct DeviceManager { // Exit event exit_evt: EventFd, reset_evt: EventFd, + guest_exit_evt: EventFd, #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] id_to_dev_info: HashMap<(DeviceType, String), MmioDeviceInfo>, @@ -1206,6 +1207,7 @@ impl DeviceManager { cpu_manager: Arc>, exit_evt: EventFd, reset_evt: EventFd, + guest_exit_evt: EventFd, seccomp_action: SeccompAction, numa_nodes: NumaNodes, activate_evt: &EventFd, @@ -1402,6 +1404,7 @@ impl DeviceManager { device_tree, exit_evt, reset_evt, + guest_exit_evt, #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] id_to_dev_info: HashMap::new(), seccomp_action, @@ -1519,6 +1522,9 @@ impl DeviceManager { self.exit_evt .try_clone() .map_err(DeviceManagerError::EventFd)?, + self.guest_exit_evt + .try_clone() + .map_err(DeviceManagerError::EventFd)?, )?; } @@ -1889,6 +1895,7 @@ impl DeviceManager { interrupt_manager: &dyn InterruptManager, reset_evt: EventFd, exit_evt: EventFd, + _guest_exit_evt: EventFd, ) -> DeviceManagerResult>>> { let vcpus_kill_signalled = self .cpu_manager diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 66a2a104d0..eec56c5c1f 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -250,6 +250,7 @@ pub enum EpollDispatch { Api = 2, ActivateVirtioDevices = 3, Debug = 4, + GuestExit = 5, Unknown, } @@ -262,6 +263,7 @@ impl From for EpollDispatch { 2 => Api, 3 => ActivateVirtioDevices, 4 => Debug, + 5 => GuestExit, _ => Unknown, } } @@ -608,6 +610,7 @@ pub struct Vmm { epoll: EpollContext, exit_evt: EventFd, reset_evt: EventFd, + guest_exit_evt: EventFd, api_evt: 
EventFd, #[cfg(feature = "guest_debug")] debug_evt: EventFd, @@ -781,6 +784,7 @@ impl Vmm { ) -> Result { let mut epoll = EpollContext::new().map_err(Error::Epoll)?; let reset_evt = EventFd::new(EFD_NONBLOCK).map_err(Error::EventFdCreate)?; + let guest_exit_evt = EventFd::new(EFD_NONBLOCK).map_err(Error::EventFdCreate)?; let activate_evt = EventFd::new(EFD_NONBLOCK).map_err(Error::EventFdCreate)?; epoll @@ -791,6 +795,10 @@ impl Vmm { .add_event(&reset_evt, EpollDispatch::Reset) .map_err(Error::Epoll)?; + epoll + .add_event(&guest_exit_evt, EpollDispatch::GuestExit) + .map_err(Error::Epoll)?; + epoll .add_event(&activate_evt, EpollDispatch::ActivateVirtioDevices) .map_err(Error::Epoll)?; @@ -808,6 +816,7 @@ impl Vmm { epoll, exit_evt, reset_evt, + guest_exit_evt, api_evt, #[cfg(feature = "guest_debug")] debug_evt, @@ -1109,6 +1118,9 @@ impl Vmm { let reset_evt = self.reset_evt.try_clone().map_err(|e| { MigratableError::MigrateReceive(anyhow!("Error cloning reset EventFd: {e}")) })?; + let guest_exit_evt = self.guest_exit_evt.try_clone().map_err(|e| { + MigratableError::MigrateReceive(anyhow!("Error cloning guest exit EventFd: {e}")) + })?; #[cfg(feature = "guest_debug")] let debug_evt = self.vm_debug_evt.try_clone().map_err(|e| { MigratableError::MigrateReceive(anyhow!("Error cloning debug EventFd: {e}")) @@ -1128,6 +1140,7 @@ impl Vmm { hypervisor_vm, exit_evt, reset_evt, + guest_exit_evt, #[cfg(feature = "guest_debug")] debug_evt, &self.seccomp_action, @@ -1569,6 +1582,10 @@ impl Vmm { let exit_evt = self.exit_evt.try_clone().map_err(VmError::EventFdClone)?; let reset_evt = self.reset_evt.try_clone().map_err(VmError::EventFdClone)?; + let guest_exit_evt = self + .guest_exit_evt + .try_clone() + .map_err(VmError::EventFdClone)?; #[cfg(feature = "guest_debug")] let debug_evt = self .vm_debug_evt @@ -1583,6 +1600,7 @@ impl Vmm { vm_config, exit_evt, reset_evt, + guest_exit_evt, #[cfg(feature = "guest_debug")] debug_evt, &self.seccomp_action, @@ -1667,6 +1685,13 @@ 
impl Vmm { self.reset_evt.read().map_err(Error::EventFdRead)?; self.vm_reboot().map_err(Error::VmReboot)?; } + EpollDispatch::GuestExit => { + info!("VM guest exit event"); + self.guest_exit_evt.read().map_err(Error::EventFdRead)?; + self.vmm_shutdown().map_err(Error::VmmShutdown)?; + + break 'outer; + } EpollDispatch::ActivateVirtioDevices => { if let Some(ref vm) = self.vm { let count = self.activate_evt.read().map_err(Error::EventFdRead)?; @@ -1775,6 +1800,10 @@ impl RequestHandler for Vmm { if self.vm.is_none() { let exit_evt = self.exit_evt.try_clone().map_err(VmError::EventFdClone)?; let reset_evt = self.reset_evt.try_clone().map_err(VmError::EventFdClone)?; + let guest_exit_evt = self + .guest_exit_evt + .try_clone() + .map_err(VmError::EventFdClone)?; #[cfg(feature = "guest_debug")] let vm_debug_evt = self .vm_debug_evt @@ -1790,6 +1819,7 @@ impl RequestHandler for Vmm { Arc::clone(vm_config), exit_evt, reset_evt, + guest_exit_evt, #[cfg(feature = "guest_debug")] vm_debug_evt, &self.seccomp_action, @@ -1955,6 +1985,10 @@ impl RequestHandler for Vmm { let exit_evt = self.exit_evt.try_clone().map_err(VmError::EventFdClone)?; let reset_evt = self.reset_evt.try_clone().map_err(VmError::EventFdClone)?; + let guest_exit_evt = self + .guest_exit_evt + .try_clone() + .map_err(VmError::EventFdClone)?; #[cfg(feature = "guest_debug")] let debug_evt = self .vm_debug_evt @@ -1980,6 +2014,7 @@ impl RequestHandler for Vmm { config, exit_evt, reset_evt, + guest_exit_evt, #[cfg(feature = "guest_debug")] debug_evt, &self.seccomp_action, diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index 04bd2d595a..6b3f6c42e1 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -536,6 +536,7 @@ impl Vm { vm: Arc, exit_evt: EventFd, reset_evt: EventFd, + guest_exit_evt: EventFd, #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, seccomp_action: &SeccompAction, hypervisor: Arc, @@ -604,6 +605,7 @@ impl Vm { cpu_manager.clone(), exit_evt.try_clone().map_err(Error::EventFdClone)?, reset_evt, + 
guest_exit_evt, seccomp_action.clone(), numa_nodes.clone(), &activate_evt, @@ -790,6 +792,7 @@ impl Vm { cpu_manager: Arc>, exit_evt: EventFd, reset_evt: EventFd, + guest_exit_evt: EventFd, seccomp_action: SeccompAction, numa_nodes: NumaNodes, activate_evt: &EventFd, @@ -812,6 +815,7 @@ impl Vm { cpu_manager, exit_evt, reset_evt, + guest_exit_evt, seccomp_action, numa_nodes, activate_evt, @@ -1247,6 +1251,7 @@ impl Vm { vm_config: Arc>, exit_evt: EventFd, reset_evt: EventFd, + guest_exit_evt: EventFd, #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, seccomp_action: &SeccompAction, hypervisor: Arc, @@ -1319,6 +1324,7 @@ impl Vm { vm, exit_evt, reset_evt, + guest_exit_evt, #[cfg(feature = "guest_debug")] vm_debug_evt, seccomp_action, From a159152e4139b048ed412a8aedecb19895f2a893 Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Mon, 9 Mar 2026 16:27:41 +0100 Subject: [PATCH 632/742] devices: route guest shutdown via guest exit Plumb ACPI S5 shutdown through guest_exit_evt instead of the shared exit path. This keeps guest-triggered shutdown separate from fatal VMM exit handling. Management software, for example libvirt, expects that distinction, and making it explicit aligns Cloud Hypervisor more closely with QEMU. Only the guest shutdown path is moved here. Reboot handling stays on reset_evt and non-guest exit paths are left unchanged. 
On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- devices/src/acpi.rs | 8 ++++---- vmm/src/device_manager.rs | 8 ++------ 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/devices/src/acpi.rs b/devices/src/acpi.rs index 69bcb80d76..a9c86aa18e 100644 --- a/devices/src/acpi.rs +++ b/devices/src/acpi.rs @@ -21,7 +21,7 @@ pub const GED_DEVICE_ACPI_SIZE: usize = 0x1; /// A device for handling ACPI shutdown and reboot pub struct AcpiShutdownDevice { - exit_evt: EventFd, + guest_exit_evt: EventFd, reset_evt: EventFd, vcpus_kill_signalled: Arc, } @@ -29,12 +29,12 @@ pub struct AcpiShutdownDevice { impl AcpiShutdownDevice { /// Constructs a device that will signal the given event when the guest requests it. pub fn new( - exit_evt: EventFd, + guest_exit_evt: EventFd, reset_evt: EventFd, vcpus_kill_signalled: Arc, ) -> AcpiShutdownDevice { AcpiShutdownDevice { - exit_evt, + guest_exit_evt, reset_evt, vcpus_kill_signalled, } @@ -68,7 +68,7 @@ impl BusDevice for AcpiShutdownDevice { const SLEEP_VALUE_BIT: u8 = 2; if data[0] == (S5_SLEEP_VALUE << SLEEP_VALUE_BIT) | (1 << SLEEP_STATUS_EN_BIT) { info!("ACPI Shutdown signalled"); - if let Err(e) = self.exit_evt.write(1) { + if let Err(e) = self.guest_exit_evt.write(1) { error!("Error triggering ACPI shutdown event: {e}"); } // Spin until we are sure the reset_evt has been handled and that when diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 193b411d45..2ea8efe350 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -1519,9 +1519,6 @@ impl DeviceManager { self.reset_evt .try_clone() .map_err(DeviceManagerError::EventFd)?, - self.exit_evt - .try_clone() - .map_err(DeviceManagerError::EventFd)?, self.guest_exit_evt .try_clone() .map_err(DeviceManagerError::EventFd)?, @@ -1894,8 +1891,7 @@ impl DeviceManager { &mut self, interrupt_manager: &dyn InterruptManager, reset_evt: EventFd, - exit_evt: EventFd, - _guest_exit_evt: EventFd, + guest_exit_evt: 
EventFd, ) -> DeviceManagerResult>>> { let vcpus_kill_signalled = self .cpu_manager @@ -1904,7 +1900,7 @@ impl DeviceManager { .vcpus_kill_signalled() .clone(); let shutdown_device = Arc::new(Mutex::new(devices::AcpiShutdownDevice::new( - exit_evt, + guest_exit_evt, reset_evt, vcpus_kill_signalled, ))); From 005ce38ffd9b53aefca411184e4fa38d9548c9c5 Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Mon, 9 Mar 2026 16:35:34 +0100 Subject: [PATCH 633/742] main: add --no-shutdown Add a CLI-only --no-shutdown flag that keeps the VMM process alive after a guest-triggered shutdown. Management software may still need the Cloud Hypervisor process after the guest has powered off. Exposing this separately lets management software, for example libvirt, keep the VMM around in a way that is closer to QEMU. The flag only affects the GuestExit path. Fatal exits and other existing VMM shutdown paths remain unchanged. On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- cloud-hypervisor/src/main.rs | 8 ++++++++ vmm/src/lib.rs | 19 ++++++++++++++++--- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/cloud-hypervisor/src/main.rs b/cloud-hypervisor/src/main.rs index 06ef1fe30f..51b1f38fdd 100644 --- a/cloud-hypervisor/src/main.rs +++ b/cloud-hypervisor/src/main.rs @@ -373,6 +373,12 @@ fn get_cli_options_sorted( .num_args(1..) 
.action(ArgAction::Append) .group("vm-config"), + Arg::new("no-shutdown") + .long("no-shutdown") + .help("Do not exit the VMM when the guest shuts down") + .num_args(0) + .action(ArgAction::SetTrue) + .group("vmm-config"), Arg::new("numa") .long("numa") .help(NumaConfig::SYNTAX) @@ -637,6 +643,7 @@ fn start_vmm( let exit_evt = EventFd::new(EFD_NONBLOCK).map_err(Error::CreateExitEventFd)?; let landlock_enable = cmd_arguments.get_flag("landlock"); + let no_shutdown = cmd_arguments.get_flag("no-shutdown"); #[allow(unused_mut)] let mut event_monitor = cmd_arguments @@ -733,6 +740,7 @@ fn start_vmm( exit_evt.try_clone().unwrap(), &seccomp_action, hypervisor, + no_shutdown, landlock_enable, ) .map_err(Error::StartVmmThread)?; diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index eec56c5c1f..5dfea07e78 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -162,6 +162,10 @@ pub enum Error { #[error("Error rebooting VM")] VmReboot(#[source] VmError), + /// Cannot shut the VM down + #[error("Error shutting down VM")] + VmShutdown(#[source] VmError), + /// Cannot create VMM thread #[error("Error spawning VMM thread")] VmmThreadSpawn(#[source] io::Error), @@ -449,6 +453,7 @@ pub fn start_vmm_thread( exit_event: EventFd, seccomp_action: &SeccompAction, hypervisor: Arc, + no_shutdown: bool, landlock_enable: bool, ) -> Result { #[cfg(feature = "guest_debug")] @@ -488,6 +493,7 @@ pub fn start_vmm_thread( vmm_seccomp_action, hypervisor, exit_event, + no_shutdown, )?; vmm.setup_signal_handler(landlock_enable)?; @@ -627,6 +633,7 @@ pub struct Vmm { original_termios_opt: Arc>>, console_resize_pipe: Option>, console_info: Option, + no_shutdown: bool, } /// Just a wrapper for the data that goes into @@ -781,6 +788,7 @@ impl Vmm { seccomp_action: SeccompAction, hypervisor: Arc, exit_evt: EventFd, + no_shutdown: bool, ) -> Result { let mut epoll = EpollContext::new().map_err(Error::Epoll)?; let reset_evt = EventFd::new(EFD_NONBLOCK).map_err(Error::EventFdCreate)?; @@ -833,6 +841,7 @@ impl Vmm 
{ original_termios_opt: Arc::new(Mutex::new(None)), console_resize_pipe: None, console_info: None, + no_shutdown, }) } @@ -1688,9 +1697,12 @@ impl Vmm { EpollDispatch::GuestExit => { info!("VM guest exit event"); self.guest_exit_evt.read().map_err(Error::EventFdRead)?; - self.vmm_shutdown().map_err(Error::VmmShutdown)?; - - break 'outer; + if self.no_shutdown { + self.vm_shutdown().map_err(Error::VmShutdown)?; + } else { + self.vmm_shutdown().map_err(Error::VmmShutdown)?; + break 'outer; + } } EpollDispatch::ActivateVirtioDevices => { if let Some(ref vm) = self.vm { @@ -2630,6 +2642,7 @@ mod unit_tests { SeccompAction::Allow, hypervisor::new().unwrap(), EventFd::new(EFD_NONBLOCK).unwrap(), + false, ) .unwrap() } From c7a152ee7950005ca6e20e775226e23f640382f6 Mon Sep 17 00:00:00 2001 From: Anirudh Rayabharam Date: Thu, 16 Apr 2026 10:02:00 +0000 Subject: [PATCH 634/742] performance-metrics: fix overly broad process cleanup Drop the -f flag from the process termination command in cleanup_stale_processes() so it matches by process name only, not the full command line. This prevents terminating unrelated processes whose arguments happen to contain target strings (e.g., the test runner invoked with --report-file /cloud-hypervisor/report.json). Use the truncated name 'cloud-hyperviso' because Linux limits process names to 15 characters. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Anirudh Rayabharam --- performance-metrics/src/main.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/performance-metrics/src/main.rs b/performance-metrics/src/main.rs index 622d8793c7..c18ef70dc1 100644 --- a/performance-metrics/src/main.rs +++ b/performance-metrics/src/main.rs @@ -1297,8 +1297,9 @@ fn run_test_with_timeout( } fn cleanup_stale_processes() { - for proc in &["cloud-hypervisor", "iperf3", "ethr"] { - let _ = Command::new("pkill").args(["-9", "-f", proc]).status(); + // "cloud-hyperviso" - process name truncated to 15 chars by the kernel + for proc in &["cloud-hyperviso", "iperf3", "ethr"] { + let _ = Command::new("pkill").args(["-9", proc]).status(); } thread::sleep(Duration::from_secs(2)); } From 07b77b0f4b6aa8734aaf9213453cd00d13f0bf9c Mon Sep 17 00:00:00 2001 From: Keith Adler Date: Wed, 15 Apr 2026 23:02:07 -0500 Subject: [PATCH 635/742] ci: remove pinned cross version from quality.yaml Remove the pinned cross-version commit hash from all houseabsolute/actions-rust-cross usages. The pin was added as a workaround for virtio-bindings build issues that have since been resolved upstream. 
Closes #7180 Signed-off-by: Keith Adler --- .github/workflows/quality.yaml | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/.github/workflows/quality.yaml b/.github/workflows/quality.yaml index 776cd8eb42..47156beaba 100644 --- a/.github/workflows/quality.yaml +++ b/.github/workflows/quality.yaml @@ -53,7 +53,6 @@ jobs: uses: houseabsolute/actions-rust-cross@v1 with: command: clippy - cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 toolchain: ${{ matrix.rust }} target: ${{ matrix.target }} args: --locked --all --all-targets --no-default-features --tests --examples --features "kvm" -- -D warnings @@ -62,7 +61,6 @@ jobs: uses: houseabsolute/actions-rust-cross@v1 with: command: clippy - cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 toolchain: ${{ matrix.rust }} target: ${{ matrix.target }} args: --locked --all --all-targets --no-default-features --tests --examples --features "mshv" -- -D warnings @@ -71,7 +69,6 @@ jobs: uses: houseabsolute/actions-rust-cross@v1 with: command: clippy - cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 toolchain: ${{ matrix.rust }} target: ${{ matrix.target }} args: --locked --all --all-targets --no-default-features --tests --examples --features "mshv,kvm" -- -D warnings @@ -80,7 +77,6 @@ jobs: uses: houseabsolute/actions-rust-cross@v1 with: command: clippy - cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 toolchain: ${{ matrix.rust }} target: ${{ matrix.target }} args: --locked --all --all-targets --tests --examples -- -D warnings @@ -89,7 +85,6 @@ jobs: uses: houseabsolute/actions-rust-cross@v1 with: command: clippy - cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 toolchain: ${{ matrix.rust }} target: ${{ matrix.target }} args: --locked --all --all-targets --tests --examples --features "guest_debug" -- -D warnings @@ -98,7 +93,6 @@ jobs: uses: houseabsolute/actions-rust-cross@v1 with: command: clippy - cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 toolchain: ${{ 
matrix.rust }} target: ${{ matrix.target }} args: --locked --all --all-targets --tests --examples --features "pvmemcontrol" -- -D warnings @@ -107,7 +101,6 @@ jobs: uses: houseabsolute/actions-rust-cross@v1 with: command: clippy - cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 toolchain: ${{ matrix.rust }} target: ${{ matrix.target }} args: --locked --all --all-targets --tests --examples --features "tracing" -- -D warnings @@ -122,7 +115,6 @@ jobs: uses: houseabsolute/actions-rust-cross@v1 with: command: clippy - cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 toolchain: ${{ matrix.rust }} target: ${{ matrix.target }} args: --locked --all --all-targets --tests --examples --features "ivshmem" -- -D warnings @@ -132,7 +124,6 @@ jobs: uses: houseabsolute/actions-rust-cross@v1 with: command: clippy - cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 toolchain: ${{ matrix.rust }} target: ${{ matrix.target }} args: --locked --all --all-targets --no-default-features --tests --examples --features "sev_snp" -- -D warnings @@ -142,7 +133,6 @@ jobs: uses: houseabsolute/actions-rust-cross@v1 with: command: clippy - cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 toolchain: ${{ matrix.rust }} target: ${{ matrix.target }} args: --locked --all --all-targets --no-default-features --tests --examples --features "igvm" -- -D warnings @@ -152,7 +142,6 @@ jobs: uses: houseabsolute/actions-rust-cross@v1 with: command: clippy - cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 toolchain: ${{ matrix.rust }} target: ${{ matrix.target }} args: --locked --all --all-targets --no-default-features --tests --examples --features "tdx,kvm" -- -D warnings From c1b4fcc7505b4deebbfa68caf13bfc89f3ff36c0 Mon Sep 17 00:00:00 2001 From: Dylan Reid Date: Thu, 16 Apr 2026 09:43:16 -0700 Subject: [PATCH 636/742] virtio-devices: More detailed vhost user errors Make it easier to chase down which vhost user socket failed and why in systems that have many vhost user devices. 
Signed-off-by: Dylan Reid --- virtio-devices/src/vhost_user/blk.rs | 11 +++++++++-- virtio-devices/src/vhost_user/fs.rs | 5 ++++- virtio-devices/src/vhost_user/generic_vhost_user.rs | 5 ++++- virtio-devices/src/vhost_user/mod.rs | 8 +++++--- virtio-devices/src/vhost_user/net.rs | 5 ++++- virtio-devices/src/vhost_user/vu_common_ctrl.rs | 4 +++- 6 files changed, 29 insertions(+), 9 deletions(-) diff --git a/virtio-devices/src/vhost_user/blk.rs b/virtio-devices/src/vhost_user/blk.rs index 8221b7b501..83653147bd 100644 --- a/virtio-devices/src/vhost_user/blk.rs +++ b/virtio-devices/src/vhost_user/blk.rs @@ -254,7 +254,11 @@ impl VirtioDevice for Blk { .set_config(offset as u32, VhostUserConfigFlags::WRITABLE, data) .map_err(Error::VhostUserSetConfig) { - error!("Failed setting vhost-user-blk configuration: {e:?}"); + error!( + "Failed setting vhost-user-blk configuration for socket {} at offset 0x{offset:x} with length {}: {e:?}", + self.vu_common.socket_path, + data.len() + ); } } @@ -313,7 +317,10 @@ impl VirtioDevice for Blk { if let Some(vu) = &self.vu_common.vu && let Err(e) = vu.lock().unwrap().reset_vhost_user() { - error!("Failed to reset vhost-user daemon: {e:?}"); + error!( + "Failed to reset vhost-user daemon for socket {}: {e:?}", + self.vu_common.socket_path + ); return None; } diff --git a/virtio-devices/src/vhost_user/fs.rs b/virtio-devices/src/vhost_user/fs.rs index 967fdecf99..509a7a34f3 100644 --- a/virtio-devices/src/vhost_user/fs.rs +++ b/virtio-devices/src/vhost_user/fs.rs @@ -294,7 +294,10 @@ impl VirtioDevice for Fs { if let Some(vu) = &self.vu_common.vu && let Err(e) = vu.lock().unwrap().reset_vhost_user() { - error!("Failed to reset vhost-user daemon: {e:?}"); + error!( + "Failed to reset vhost-user daemon for socket {}: {e:?}", + self.vu_common.socket_path + ); return None; } diff --git a/virtio-devices/src/vhost_user/generic_vhost_user.rs b/virtio-devices/src/vhost_user/generic_vhost_user.rs index 1778834431..aed24b082d 100644 --- 
a/virtio-devices/src/vhost_user/generic_vhost_user.rs +++ b/virtio-devices/src/vhost_user/generic_vhost_user.rs @@ -317,7 +317,10 @@ impl VirtioDevice for GenericVhostUser { if let Some(vu) = &self.vu_common.vu && let Err(e) = vu.lock().unwrap().reset_vhost_user() { - error!("Failed to reset vhost-user daemon: {e:?}"); + error!( + "Failed to reset vhost-user daemon for socket {}: {e:?}", + self.vu_common.socket_path + ); return None; } diff --git a/virtio-devices/src/vhost_user/mod.rs b/virtio-devices/src/vhost_user/mod.rs index 01a75f0575..abca12c058 100644 --- a/virtio-devices/src/vhost_user/mod.rs +++ b/virtio-devices/src/vhost_user/mod.rs @@ -231,7 +231,8 @@ impl VhostUserEpollHandler { ) .map_err(|e| { EpollHelperError::IoError(std::io::Error::other(format!( - "failed connecting vhost-user backend {e:?}" + "failed connecting vhost-user backend for socket {}: {e:?}", + self.socket_path ))) })?; @@ -282,7 +283,8 @@ impl EpollHelperHandler for VhostUserEpollHandle HUP_CONNECTION_EVENT => { self.reconnect(helper).map_err(|e| { EpollHelperError::HandleEvent(anyhow!( - "failed to reconnect vhost-user backend: {e:?}" + "failed to reconnect vhost-user backend for socket {}: {e:?}", + self.socket_path )) })?; } @@ -370,7 +372,7 @@ impl VhostUserCommon { }; if self.vu.is_none() { - error!("Missing vhost-user handle"); + error!("Missing vhost-user handle for socket {}", self.socket_path); return Err(ActivateError::BadActivate); } let vu = self.vu.as_ref().unwrap(); diff --git a/virtio-devices/src/vhost_user/net.rs b/virtio-devices/src/vhost_user/net.rs index 1c85b5f38e..4803ade33c 100644 --- a/virtio-devices/src/vhost_user/net.rs +++ b/virtio-devices/src/vhost_user/net.rs @@ -374,7 +374,10 @@ impl VirtioDevice for Net { if let Some(vu) = &self.vu_common.vu && let Err(e) = vu.lock().unwrap().reset_vhost_user() { - error!("Failed to reset vhost-user daemon: {e:?}"); + error!( + "Failed to reset vhost-user daemon for socket {}: {e:?}", + self.vu_common.socket_path + ); 
return None; } diff --git a/virtio-devices/src/vhost_user/vu_common_ctrl.rs b/virtio-devices/src/vhost_user/vu_common_ctrl.rs index 23e06a3c0a..5ad9425c5a 100644 --- a/virtio-devices/src/vhost_user/vu_common_ctrl.rs +++ b/virtio-devices/src/vhost_user/vu_common_ctrl.rs @@ -428,7 +428,9 @@ impl VhostUserHandle { } }; - error!("Failed connecting the backend after trying for 1 minute: {err:?}"); + error!( + "Failed connecting the backend after trying for 1 minute for socket {socket_path}: {err:?}" + ); Err(Error::VhostUserConnect) } } From 4a607ed82ddaede31d414f88dc4762a515974271 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 26 Mar 2026 23:12:08 +0100 Subject: [PATCH 637/742] performance-metrics: Enable io_uring feature on block crate Enable the io_uring feature so that QcowDiskAsync and QcowAsync are available for async path micro benchmarks. Signed-off-by: Anatol Belski --- performance-metrics/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/performance-metrics/Cargo.toml b/performance-metrics/Cargo.toml index 516ea9e0a2..776a46fc2c 100644 --- a/performance-metrics/Cargo.toml +++ b/performance-metrics/Cargo.toml @@ -5,7 +5,7 @@ name = "performance-metrics" version = "0.1.0" [dependencies] -block = { path = "../block" } +block = { path = "../block", features = ["io_uring"] } clap = { workspace = true, features = ["wrap_help"] } dirs = { workspace = true } libc = { workspace = true } From ab1dddb62a69a6e948b5ea2c808a89155c85f9bd Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 26 Mar 2026 20:09:01 +0100 Subject: [PATCH 638/742] performance-metrics: Add QCOW2 tempfile helpers Add qcow_tempfile() which creates a QCOW2 v3 image with all clusters allocated via QcowFile::new plus sequential writes, then reopens it as QcowDiskSync. Add QCOW_CLUSTER_SIZE constant for the default 64 KiB cluster size. 
Signed-off-by: Anatol Belski --- performance-metrics/src/util.rs | 45 ++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/performance-metrics/src/util.rs b/performance-metrics/src/util.rs index dcc2257501..98fd56828d 100644 --- a/performance-metrics/src/util.rs +++ b/performance-metrics/src/util.rs @@ -4,14 +4,18 @@ //! Shared benchmark helpers. -use std::io::ErrorKind; +use std::io::{ErrorKind, Seek, SeekFrom, Write}; use std::thread; use std::time::Duration; +use block::qcow::{QcowFile, RawFile}; +use block::qcow_async::QcowDiskAsync; +use block::qcow_sync::QcowDiskSync; use vmm_sys_util::eventfd::EventFd; use vmm_sys_util::tempfile::TempFile; pub const BLOCK_SIZE: u64 = 4096; +pub const QCOW_CLUSTER_SIZE: u64 = 65536; /// Create a temporary file pre sized to hold `num_blocks` blocks. pub fn sized_tempfile(num_blocks: usize) -> TempFile { @@ -22,6 +26,45 @@ pub fn sized_tempfile(num_blocks: usize) -> TempFile { tmp } +/// Create a QCOW2 image with `num_clusters` allocated clusters and return +/// the tempfile handle. +/// +/// Each cluster is default QCOW2 cluster size of 64 KiB. The image is +/// created via `QcowFile::new` then populated with writes so that the +/// clusters are actually allocated in the L2 / refcount tables. 
+fn create_qcow_tempfile(num_clusters: usize) -> TempFile { + let tmp = TempFile::new().expect("failed to create tempfile"); + let virtual_size = QCOW_CLUSTER_SIZE * num_clusters as u64; + let raw = RawFile::new(tmp.as_file().try_clone().unwrap(), false); + let mut qcow = QcowFile::new(raw, 3, virtual_size, true).expect("failed to create QCOW2 file"); + let buf = vec![0xA5u8; QCOW_CLUSTER_SIZE as usize]; + for i in 0..num_clusters { + qcow.seek(SeekFrom::Start(i as u64 * QCOW_CLUSTER_SIZE)) + .expect("seek failed"); + qcow.write_all(&buf).expect("write failed"); + } + qcow.flush().expect("flush failed"); + tmp +} + +/// Create a QCOW2 image with `num_clusters` allocated clusters opened +/// via `QcowDiskSync` (blocking I/O backend). +pub fn qcow_tempfile(num_clusters: usize) -> (TempFile, QcowDiskSync) { + let tmp = create_qcow_tempfile(num_clusters); + let disk = QcowDiskSync::new(tmp.as_file().try_clone().unwrap(), false, false, true) + .expect("failed to open QCOW2 via QcowDiskSync"); + (tmp, disk) +} + +/// Create a QCOW2 image with `num_clusters` allocated clusters opened +/// via `QcowDiskAsync` (io_uring backend). +pub fn qcow_async_tempfile(num_clusters: usize) -> (TempFile, QcowDiskAsync) { + let tmp = create_qcow_tempfile(num_clusters); + let disk = QcowDiskAsync::new(tmp.as_file().try_clone().unwrap(), false, false, true) + .expect("failed to open QCOW2 via QcowDiskAsync"); + (tmp, disk) +} + /// Spin and wait until the given eventfd becomes readable. 
pub fn wait_for_eventfd(notifier: &EventFd) { loop { From 97b109bc89eee86d67afe53aaa7cb9ff703bf341 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 27 Mar 2026 22:32:07 +0100 Subject: [PATCH 639/742] performance-metrics: Add sync drain completions helper Signed-off-by: Anatol Belski --- performance-metrics/src/util.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/performance-metrics/src/util.rs b/performance-metrics/src/util.rs index 98fd56828d..0b42667df6 100644 --- a/performance-metrics/src/util.rs +++ b/performance-metrics/src/util.rs @@ -8,6 +8,7 @@ use std::io::{ErrorKind, Seek, SeekFrom, Write}; use std::thread; use std::time::Duration; +use block::async_io::AsyncIo; use block::qcow::{QcowFile, RawFile}; use block::qcow_async::QcowDiskAsync; use block::qcow_sync::QcowDiskSync; @@ -65,6 +66,13 @@ pub fn qcow_async_tempfile(num_clusters: usize) -> (TempFile, QcowDiskAsync) { (tmp, disk) } +/// Drain `count` completions from a synchronous async_io backend. +pub fn drain_completions(async_io: &mut dyn AsyncIo, count: usize) { + for _ in 0..count { + async_io.next_completed_request(); + } +} + /// Spin and wait until the given eventfd becomes readable. pub fn wait_for_eventfd(notifier: &EventFd) { loop { From f8deeb8a1cee3e26c97178484d452b7872eae495 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Sat, 28 Mar 2026 11:39:56 +0100 Subject: [PATCH 640/742] performance-metrics: Add submit_reads helper Signed-off-by: Anatol Belski --- performance-metrics/src/util.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/performance-metrics/src/util.rs b/performance-metrics/src/util.rs index 0b42667df6..9ec9102b03 100644 --- a/performance-metrics/src/util.rs +++ b/performance-metrics/src/util.rs @@ -73,6 +73,15 @@ pub fn drain_completions(async_io: &mut dyn AsyncIo, count: usize) { } } +/// Submit `count` sequential read_vectored calls at `stride`-byte intervals. 
+pub fn submit_reads(async_io: &mut dyn AsyncIo, count: usize, stride: u64, iovec: &[libc::iovec]) { + for i in 0..count { + async_io + .read_vectored((i as u64 * stride) as libc::off_t, iovec, i as u64) + .expect("read_vectored failed"); + } +} + /// Spin and wait until the given eventfd becomes readable. pub fn wait_for_eventfd(notifier: &EventFd) { loop { From beaa98728cf915d1e21eb4f34157ca3c922e074e Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Sat, 28 Mar 2026 12:43:11 +0100 Subject: [PATCH 641/742] performance-metrics: Add iovec construction helpers Signed-off-by: Anatol Belski --- performance-metrics/src/util.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/performance-metrics/src/util.rs b/performance-metrics/src/util.rs index 9ec9102b03..ce480aa4f4 100644 --- a/performance-metrics/src/util.rs +++ b/performance-metrics/src/util.rs @@ -73,6 +73,22 @@ pub fn drain_completions(async_io: &mut dyn AsyncIo, count: usize) { } } +/// Build an iovec suitable for a read into `buf`. +pub fn read_iovec(buf: &mut [u8]) -> libc::iovec { + libc::iovec { + iov_base: buf.as_mut_ptr() as *mut libc::c_void, + iov_len: buf.len(), + } +} + +/// Build an iovec suitable for a write from `buf`. +pub fn write_iovec(buf: &[u8]) -> libc::iovec { + libc::iovec { + iov_base: buf.as_ptr() as *mut libc::c_void, + iov_len: buf.len(), + } +} + /// Submit `count` sequential read_vectored calls at `stride`-byte intervals. pub fn submit_reads(async_io: &mut dyn AsyncIo, count: usize, stride: u64, iovec: &[libc::iovec]) { for i in 0..count { From 6cd3395a55d3193ceb7bd9d76113202acf3bc680 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 26 Mar 2026 20:15:10 +0100 Subject: [PATCH 642/742] performance-metrics: Add QCOW2 read micro benchmark Add micro_bench_qcow_read which times read_vectored calls through QcowSync on a prepopulated QCOW2 image. This exercises the hot read path including L2 lookup, pread64 for allocated clusters and iovec scatter. 
Two TEST_LIST entries: micro_block_qcow_read_128_us and micro_block_qcow_read_256_us with 128 and 256 cluster workloads. Signed-off-by: Anatol Belski --- performance-metrics/src/main.rs | 26 ++++++++++++++++- performance-metrics/src/micro_bench_block.rs | 30 +++++++++++++++++++- 2 files changed, 54 insertions(+), 2 deletions(-) diff --git a/performance-metrics/src/main.rs b/performance-metrics/src/main.rs index c18ef70dc1..1668bc288c 100644 --- a/performance-metrics/src/main.rs +++ b/performance-metrics/src/main.rs @@ -378,7 +378,7 @@ mod adjuster { } } -const TEST_LIST: [PerformanceTest; 62] = [ +const TEST_LIST: [PerformanceTest; 64] = [ PerformanceTest { name: "boot_time_ms", func_ptr: performance_boot_time, @@ -1253,6 +1253,30 @@ const TEST_LIST: [PerformanceTest; 62] = [ }, unit_adjuster: adjuster::s_to_us, }, + PerformanceTest { + name: "micro_block_qcow_read_128_us", + func_ptr: micro_bench_block::micro_bench_qcow_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(128), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_read_256_us", + func_ptr: micro_bench_block::micro_bench_qcow_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(256), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, ]; fn run_test_with_timeout( diff --git a/performance-metrics/src/micro_bench_block.rs b/performance-metrics/src/micro_bench_block.rs index 6dc51af658..1a8dea3e15 100644 --- a/performance-metrics/src/micro_bench_block.rs +++ b/performance-metrics/src/micro_bench_block.rs @@ -11,10 +11,13 @@ use std::os::unix::io::AsRawFd; use std::time::Instant; use block::async_io::AsyncIo; +use block::disk_file::AsyncDiskFile; use block::raw_async_aio::RawFileAsyncAio; use crate::PerformanceTestControl; -use crate::util::{self, 
BLOCK_SIZE}; +use crate::util::{ + self, BLOCK_SIZE, QCOW_CLUSTER_SIZE, drain_completions, read_iovec, submit_reads, +}; /// Submit num_ops AIO writes, wait for them all to land, then time /// how long it takes to drain every completion via next_completed_request(). @@ -51,3 +54,28 @@ pub fn micro_bench_aio_drain(control: &PerformanceTestControl) -> f64 { } start.elapsed().as_secs_f64() } + +/// Read num_ops clusters from a prepopulated qcow2 image through the +/// QcowSync async_io path and time the total read_vectored wall clock. +/// +/// This exercises the hot read path: L2 lookup via map_clusters_for_read, +/// pread64 for allocated data, and iovec scatter. +/// +/// Returns the total read wall clock time in seconds. +pub fn micro_bench_qcow_read(control: &PerformanceTestControl) -> f64 { + let num_ops = control.num_ops.expect("num_ops required") as usize; + let (_tmp, disk) = util::qcow_tempfile(num_ops); + let mut async_io = disk.new_async_io(1).expect("new_async_io failed"); + + let mut buf = vec![0u8; QCOW_CLUSTER_SIZE as usize]; + let iovec = read_iovec(&mut buf); + + let start = Instant::now(); + submit_reads(async_io.as_mut(), num_ops, QCOW_CLUSTER_SIZE, &[iovec]); + let elapsed = start.elapsed().as_secs_f64(); + + // Drain completions so Drop is clean. + drain_completions(async_io.as_mut(), num_ops); + + elapsed +} From 0854c3e082726edd156b65079c4c6c62f58bc352 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 26 Mar 2026 20:17:20 +0100 Subject: [PATCH 643/742] performance-metrics: Add empty QCOW2 tempfile helper Add empty_qcow_tempfile() which creates a QCOW2 v3 image with no allocated clusters so every write triggers the full cluster allocation path including L2 entry allocation and refcount updates. 
Signed-off-by: Anatol Belski --- performance-metrics/src/util.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/performance-metrics/src/util.rs b/performance-metrics/src/util.rs index ce480aa4f4..345e62a41e 100644 --- a/performance-metrics/src/util.rs +++ b/performance-metrics/src/util.rs @@ -98,6 +98,24 @@ pub fn submit_reads(async_io: &mut dyn AsyncIo, count: usize, stride: u64, iovec } } +/// Create an empty QCOW2 image sized for `num_clusters` clusters. +/// No data clusters are allocated. +fn create_empty_qcow_tempfile(num_clusters: usize) -> TempFile { + let tmp = TempFile::new().expect("failed to create tempfile"); + let virtual_size = QCOW_CLUSTER_SIZE * num_clusters as u64; + let raw = RawFile::new(tmp.as_file().try_clone().unwrap(), false); + QcowFile::new(raw, 3, virtual_size, true).expect("failed to create qcow2 file"); + tmp +} + +/// Empty QCOW2 opened via QcowDiskSync. +pub fn empty_qcow_tempfile(num_clusters: usize) -> (TempFile, QcowDiskSync) { + let tmp = create_empty_qcow_tempfile(num_clusters); + let disk = QcowDiskSync::new(tmp.as_file().try_clone().unwrap(), false, false, true) + .expect("failed to open qcow2 via QcowDiskSync"); + (tmp, disk) +} + /// Spin and wait until the given eventfd becomes readable. 
pub fn wait_for_eventfd(notifier: &EventFd) { loop { From 9f317895b573c80d121173b264ad23ae686e3e8d Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Sat, 28 Mar 2026 11:41:15 +0100 Subject: [PATCH 644/742] performance-metrics: Add submit_writes helper Signed-off-by: Anatol Belski --- performance-metrics/src/util.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/performance-metrics/src/util.rs b/performance-metrics/src/util.rs index 345e62a41e..bbafea0d34 100644 --- a/performance-metrics/src/util.rs +++ b/performance-metrics/src/util.rs @@ -98,6 +98,15 @@ pub fn submit_reads(async_io: &mut dyn AsyncIo, count: usize, stride: u64, iovec } } +/// Submit `count` sequential write_vectored calls at `stride`-byte intervals. +pub fn submit_writes(async_io: &mut dyn AsyncIo, count: usize, stride: u64, iovec: &[libc::iovec]) { + for i in 0..count { + async_io + .write_vectored((i as u64 * stride) as libc::off_t, iovec, i as u64) + .expect("write_vectored failed"); + } +} + /// Create an empty QCOW2 image sized for `num_clusters` clusters. /// No data clusters are allocated. fn create_empty_qcow_tempfile(num_clusters: usize) -> TempFile { From f7e40eec9af2aa89a4a12a642ee56e4ba351e549 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 26 Mar 2026 20:19:46 +0100 Subject: [PATCH 645/742] performance-metrics: Add QCOW2 write micro benchmark Add micro_bench_qcow_write which times write_vectored calls through QcowSync on an empty QCOW2 image. Each write allocates a new cluster exercising map_cluster_for_write with L2 entry allocation and refcount updates followed by pwrite_all. Two TEST_LIST entries: micro_block_qcow_write_128_us and micro_block_qcow_write_256_us with 128 and 256 cluster workloads. 
Signed-off-by: Anatol Belski --- performance-metrics/src/main.rs | 26 ++++++++++++++++++- performance-metrics/src/micro_bench_block.rs | 27 ++++++++++++++++++++ 2 files changed, 52 insertions(+), 1 deletion(-) diff --git a/performance-metrics/src/main.rs b/performance-metrics/src/main.rs index 1668bc288c..225212b8f9 100644 --- a/performance-metrics/src/main.rs +++ b/performance-metrics/src/main.rs @@ -378,7 +378,7 @@ mod adjuster { } } -const TEST_LIST: [PerformanceTest; 64] = [ +const TEST_LIST: [PerformanceTest; 66] = [ PerformanceTest { name: "boot_time_ms", func_ptr: performance_boot_time, @@ -1277,6 +1277,30 @@ const TEST_LIST: [PerformanceTest; 64] = [ }, unit_adjuster: adjuster::s_to_us, }, + PerformanceTest { + name: "micro_block_qcow_write_128_us", + func_ptr: micro_bench_block::micro_bench_qcow_write, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(128), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_write_256_us", + func_ptr: micro_bench_block::micro_bench_qcow_write, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(256), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, ]; fn run_test_with_timeout( diff --git a/performance-metrics/src/micro_bench_block.rs b/performance-metrics/src/micro_bench_block.rs index 1a8dea3e15..d02e997626 100644 --- a/performance-metrics/src/micro_bench_block.rs +++ b/performance-metrics/src/micro_bench_block.rs @@ -17,6 +17,7 @@ use block::raw_async_aio::RawFileAsyncAio; use crate::PerformanceTestControl; use crate::util::{ self, BLOCK_SIZE, QCOW_CLUSTER_SIZE, drain_completions, read_iovec, submit_reads, + submit_writes, write_iovec, }; /// Submit num_ops AIO writes, wait for them all to land, then time @@ -79,3 +80,29 @@ pub fn micro_bench_qcow_read(control: 
&PerformanceTestControl) -> f64 { elapsed } + +/// Write num_ops clusters into an empty qcow2 image through the +/// QcowSync async_io path and time the total write_vectored wall clock. +/// +/// This exercises the write allocation path: map_cluster_for_write +/// allocates a new cluster and bumps refcounts, then pwrite_all writes +/// the data. +/// +/// Returns the total write wall clock time in seconds. +pub fn micro_bench_qcow_write(control: &PerformanceTestControl) -> f64 { + let num_ops = control.num_ops.expect("num_ops required") as usize; + let (_tmp, disk) = util::empty_qcow_tempfile(num_ops); + let mut async_io = disk.new_async_io(1).expect("new_async_io failed"); + + let buf = vec![0xA5u8; QCOW_CLUSTER_SIZE as usize]; + let iovec = write_iovec(&buf); + + let start = Instant::now(); + submit_writes(async_io.as_mut(), num_ops, QCOW_CLUSTER_SIZE, &[iovec]); + let elapsed = start.elapsed().as_secs_f64(); + + // Drain completions so Drop is clean. + drain_completions(async_io.as_mut(), num_ops); + + elapsed +} From c3312dca3ff0cf275f50095e1ee6775c9ee525db Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Sun, 15 Mar 2026 14:13:46 +0100 Subject: [PATCH 646/742] performance-metrics: Add qcow2 punch hole micro benchmark Add micro_bench_qcow_punch_hole which times punch_hole calls through QcowSync on a prepopulated qcow2 image. Each call deallocates one cluster exercising deallocate_bytes with refcount decrement and fallocate punch_hole on the host file. Two TEST_LIST entries: micro_block_qcow_punch_hole_128_us and micro_block_qcow_punch_hole_256_us with 128 and 256 cluster workloads.
Signed-off-by: Anatol Belski --- performance-metrics/src/main.rs | 26 +++++++++++++++++++- performance-metrics/src/micro_bench_block.rs | 26 ++++++++++++++++++++ 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/performance-metrics/src/main.rs b/performance-metrics/src/main.rs index 225212b8f9..93ab45e316 100644 --- a/performance-metrics/src/main.rs +++ b/performance-metrics/src/main.rs @@ -378,7 +378,7 @@ mod adjuster { } } -const TEST_LIST: [PerformanceTest; 66] = [ +const TEST_LIST: [PerformanceTest; 68] = [ PerformanceTest { name: "boot_time_ms", func_ptr: performance_boot_time, @@ -1301,6 +1301,30 @@ const TEST_LIST: [PerformanceTest; 66] = [ }, unit_adjuster: adjuster::s_to_us, }, + PerformanceTest { + name: "micro_block_qcow_punch_hole_128_us", + func_ptr: micro_bench_block::micro_bench_qcow_punch_hole, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(128), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_punch_hole_256_us", + func_ptr: micro_bench_block::micro_bench_qcow_punch_hole, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(256), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, ]; fn run_test_with_timeout( diff --git a/performance-metrics/src/micro_bench_block.rs b/performance-metrics/src/micro_bench_block.rs index d02e997626..2ebde205fa 100644 --- a/performance-metrics/src/micro_bench_block.rs +++ b/performance-metrics/src/micro_bench_block.rs @@ -106,3 +106,29 @@ pub fn micro_bench_qcow_write(control: &PerformanceTestControl) -> f64 { elapsed } + +/// Punch holes for num_ops clusters in a prepopulated qcow2 image through +/// the QcowSync async_io path and time the total punch_hole wall clock. 
+/// +/// This exercises the discard path: deallocate_bytes decrements refcounts, +/// frees clusters and issues fallocate punch_hole on the host file. +/// +/// Returns the total punch_hole wall clock time in seconds. +pub fn micro_bench_qcow_punch_hole(control: &PerformanceTestControl) -> f64 { + let num_ops = control.num_ops.expect("num_ops required") as usize; + let (_tmp, disk) = util::qcow_tempfile(num_ops); + let mut async_io = disk.new_async_io(1).expect("new_async_io failed"); + + let start = Instant::now(); + for i in 0..num_ops { + async_io + .punch_hole(i as u64 * QCOW_CLUSTER_SIZE, QCOW_CLUSTER_SIZE, i as u64) + .expect("punch_hole failed"); + } + let elapsed = start.elapsed().as_secs_f64(); + + // Drain completions so Drop is clean. + drain_completions(async_io.as_mut(), num_ops); + + elapsed +} From e802c0d8b975fb38145509cd0daae452938d6e0f Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Sun, 15 Mar 2026 14:17:55 +0100 Subject: [PATCH 647/742] performance-metrics: Add qcow2 fsync micro benchmark Add micro_bench_qcow_fsync which writes num_ops clusters into an empty qcow2 image to dirty L2 and refcount metadata then times a single fsync call that flushes all dirty tables to disk. This isolates the metadata flush cost which scales with the number of dirty L2 table entries and refcount blocks. Two TEST_LIST entries: micro_block_qcow_fsync_128_us and micro_block_qcow_fsync_256_us with 128 and 256 cluster workloads.
Signed-off-by: Anatol Belski --- performance-metrics/src/main.rs | 26 ++++++++++++++++- performance-metrics/src/micro_bench_block.rs | 30 ++++++++++++++++++++ 2 files changed, 55 insertions(+), 1 deletion(-) diff --git a/performance-metrics/src/main.rs b/performance-metrics/src/main.rs index 93ab45e316..14ecfb12be 100644 --- a/performance-metrics/src/main.rs +++ b/performance-metrics/src/main.rs @@ -378,7 +378,7 @@ mod adjuster { } } -const TEST_LIST: [PerformanceTest; 68] = [ +const TEST_LIST: [PerformanceTest; 70] = [ PerformanceTest { name: "boot_time_ms", func_ptr: performance_boot_time, @@ -1325,6 +1325,30 @@ const TEST_LIST: [PerformanceTest; 68] = [ }, unit_adjuster: adjuster::s_to_us, }, + PerformanceTest { + name: "micro_block_qcow_fsync_128_us", + func_ptr: micro_bench_block::micro_bench_qcow_fsync, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(128), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_fsync_256_us", + func_ptr: micro_bench_block::micro_bench_qcow_fsync, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(256), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, ]; fn run_test_with_timeout( diff --git a/performance-metrics/src/micro_bench_block.rs b/performance-metrics/src/micro_bench_block.rs index 2ebde205fa..30a737780e 100644 --- a/performance-metrics/src/micro_bench_block.rs +++ b/performance-metrics/src/micro_bench_block.rs @@ -132,3 +132,33 @@ pub fn micro_bench_qcow_punch_hole(control: &PerformanceTestControl) -> f64 { elapsed } + +/// Write num_ops clusters into an empty qcow2 image to dirty L2 and +/// refcount metadata, then time a single fsync that flushes all dirty +/// tables to disk. 
+/// +/// This isolates the metadata flush cost which scales with the number +/// of dirty L2 table entries and refcount blocks. +/// +/// Returns the fsync wall clock time in seconds. +pub fn micro_bench_qcow_fsync(control: &PerformanceTestControl) -> f64 { + let num_ops = control.num_ops.expect("num_ops required") as usize; + let (_tmp, disk) = util::empty_qcow_tempfile(num_ops); + let mut async_io = disk.new_async_io(1).expect("new_async_io failed"); + + // Write num_ops clusters to dirty L2 and refcount metadata. + let buf = vec![0xA5u8; QCOW_CLUSTER_SIZE as usize]; + let iovec = write_iovec(&buf); + submit_writes(async_io.as_mut(), num_ops, QCOW_CLUSTER_SIZE, &[iovec]); + // Drain write completions. + drain_completions(async_io.as_mut(), num_ops); + + // Time the flush. + let start = Instant::now(); + async_io.fsync(Some(num_ops as u64)).expect("fsync failed"); + let elapsed = start.elapsed().as_secs_f64(); + + drain_completions(async_io.as_mut(), 1); + + elapsed +} From 638cb3d7f2bcf36a81f9d5d2f34de75901aa1823 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 15 Apr 2026 19:09:48 +0200 Subject: [PATCH 648/742] performance-metrics: Add deterministic permutation helper Add deterministic_permutation() which produces a reproducible pseudo random permutation of [0, n) using a Fisher-Yates shuffle seeded by DefaultHasher. This is used by the random read micro benchmarks to generate a fixed access pattern that is identical across runs. Signed-off-by: Anatol Belski --- performance-metrics/src/util.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/performance-metrics/src/util.rs b/performance-metrics/src/util.rs index bbafea0d34..09d6ad7e64 100644 --- a/performance-metrics/src/util.rs +++ b/performance-metrics/src/util.rs @@ -89,6 +89,24 @@ pub fn write_iovec(buf: &[u8]) -> libc::iovec { } } +/// Build a deterministic pseudo-random permutation of `[0, n)`. 
+/// +/// Uses a Fisher-Yates shuffle seeded by `DefaultHasher` so the +/// permutation is identical across runs. +pub fn deterministic_permutation(n: usize) -> Vec { + use std::collections::hash_map::DefaultHasher; + use std::hash::{Hash, Hasher}; + + let mut indices: Vec = (0..n).collect(); + for i in (1..n).rev() { + let mut h = DefaultHasher::new(); + i.hash(&mut h); + let j = h.finish() as usize % (i + 1); + indices.swap(i, j); + } + indices +} + /// Submit `count` sequential read_vectored calls at `stride`-byte intervals. pub fn submit_reads(async_io: &mut dyn AsyncIo, count: usize, stride: u64, iovec: &[libc::iovec]) { for i in 0..count { From 99b43202797cd2e2c3973d8c29cd6bcc3b72c701 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 26 Mar 2026 19:20:18 +0100 Subject: [PATCH 649/742] performance-metrics: Add qcow2 random read micro benchmark Add micro_bench_qcow_random_read which reads clusters from a prepopulated qcow2 image in a deterministic pseudo-random order. Unlike the sequential read benchmark, this exercises L2 cache miss and eviction behaviour under random access patterns. Uses Fisher-Yates shuffle with DefaultHasher for reproducible permutation across runs. Two TEST_LIST entries: micro_block_qcow_random_read_128_us and micro_block_qcow_random_read_256_us with 128 and 256 cluster workloads. 
Signed-off-by: Anatol Belski --- performance-metrics/src/main.rs | 26 +++++++++++++- performance-metrics/src/micro_bench_block.rs | 38 ++++++++++++++++++-- 2 files changed, 61 insertions(+), 3 deletions(-) diff --git a/performance-metrics/src/main.rs b/performance-metrics/src/main.rs index 14ecfb12be..3020d5827c 100644 --- a/performance-metrics/src/main.rs +++ b/performance-metrics/src/main.rs @@ -378,7 +378,7 @@ mod adjuster { } } -const TEST_LIST: [PerformanceTest; 70] = [ +const TEST_LIST: [PerformanceTest; 72] = [ PerformanceTest { name: "boot_time_ms", func_ptr: performance_boot_time, @@ -1277,6 +1277,30 @@ const TEST_LIST: [PerformanceTest; 70] = [ }, unit_adjuster: adjuster::s_to_us, }, + PerformanceTest { + name: "micro_block_qcow_random_read_128_us", + func_ptr: micro_bench_block::micro_bench_qcow_random_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(128), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_random_read_256_us", + func_ptr: micro_bench_block::micro_bench_qcow_random_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(256), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, PerformanceTest { name: "micro_block_qcow_write_128_us", func_ptr: micro_bench_block::micro_bench_qcow_write, diff --git a/performance-metrics/src/micro_bench_block.rs b/performance-metrics/src/micro_bench_block.rs index 30a737780e..40869afbc0 100644 --- a/performance-metrics/src/micro_bench_block.rs +++ b/performance-metrics/src/micro_bench_block.rs @@ -16,8 +16,8 @@ use block::raw_async_aio::RawFileAsyncAio; use crate::PerformanceTestControl; use crate::util::{ - self, BLOCK_SIZE, QCOW_CLUSTER_SIZE, drain_completions, read_iovec, submit_reads, - submit_writes, write_iovec, + self, BLOCK_SIZE, QCOW_CLUSTER_SIZE, 
deterministic_permutation, drain_completions, read_iovec, + submit_reads, submit_writes, write_iovec, }; /// Submit num_ops AIO writes, wait for them all to land, then time @@ -81,6 +81,40 @@ pub fn micro_bench_qcow_read(control: &PerformanceTestControl) -> f64 { elapsed } +/// Read num_ops clusters from a prepopulated qcow2 image in random order. +/// +/// Unlike micro_bench_qcow_read which reads sequentially, this shuffles +/// the cluster indices to exercise L2 cache miss and eviction behaviour +/// under random access patterns. +/// +/// Returns the total read wall clock time in seconds. +pub fn micro_bench_qcow_random_read(control: &PerformanceTestControl) -> f64 { + let num_ops = control.num_ops.expect("num_ops required") as usize; + let (_tmp, disk) = util::qcow_tempfile(num_ops); + let mut async_io = disk.new_async_io(1).expect("new_async_io failed"); + + let indices = deterministic_permutation(num_ops); + + let mut buf = vec![0u8; QCOW_CLUSTER_SIZE as usize]; + let iovec = read_iovec(&mut buf); + + let start = Instant::now(); + for (seq, &cluster_idx) in indices.iter().enumerate() { + async_io + .read_vectored( + (cluster_idx as u64 * QCOW_CLUSTER_SIZE) as libc::off_t, + &[iovec], + seq as u64, + ) + .expect("read_vectored failed"); + } + let elapsed = start.elapsed().as_secs_f64(); + + drain_completions(async_io.as_mut(), num_ops); + + elapsed +} + /// Write num_ops clusters into an empty qcow2 image through the /// QcowSync async_io path and time the total write_vectored wall clock. /// From bdce007aacd447b8cd5bca6bbadda63078d59191 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 26 Mar 2026 21:08:14 +0100 Subject: [PATCH 650/742] performance-metrics: Add QCOW2 overlay tempfile helper Add qcow_overlay_tempfile() which creates a raw backing file with pre-populated data and a QCOW2 overlay on top with no allocated clusters. The overlay is opened with backing file support via QcowDiskSync so reads fall through to the backing file. 
To be used by backing file read and copy-on-write write benchmarks. Signed-off-by: Anatol Belski --- performance-metrics/src/util.rs | 41 ++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/performance-metrics/src/util.rs b/performance-metrics/src/util.rs index 09d6ad7e64..3ba9a9eeef 100644 --- a/performance-metrics/src/util.rs +++ b/performance-metrics/src/util.rs @@ -5,11 +5,12 @@ //! Shared benchmark helpers. use std::io::{ErrorKind, Seek, SeekFrom, Write}; +use std::os::unix::fs::FileExt; use std::thread; use std::time::Duration; use block::async_io::AsyncIo; -use block::qcow::{QcowFile, RawFile}; +use block::qcow::{BackingFileConfig, ImageType, QcowFile, RawFile}; use block::qcow_async::QcowDiskAsync; use block::qcow_sync::QcowDiskSync; use vmm_sys_util::eventfd::EventFd; @@ -143,6 +144,44 @@ pub fn empty_qcow_tempfile(num_clusters: usize) -> (TempFile, QcowDiskSync) { (tmp, disk) } +/// Create a QCOW2 overlay backed by a raw file with `num_clusters` +/// pre-populated clusters. Returns (backing_tempfile, overlay_tempfile). 
+fn create_overlay_tempfiles(num_clusters: usize) -> (TempFile, TempFile) { + let virtual_size = QCOW_CLUSTER_SIZE * num_clusters as u64; + + let backing = TempFile::new().expect("failed to create backing tempfile"); + { + let f = backing.as_file(); + f.set_len(virtual_size).expect("set_len failed"); + let buf = vec![0xA5u8; QCOW_CLUSTER_SIZE as usize]; + for i in 0..num_clusters { + f.write_at(&buf, i as u64 * QCOW_CLUSTER_SIZE) + .expect("write_at failed"); + } + } + + let overlay = TempFile::new().expect("failed to create overlay tempfile"); + { + let raw = RawFile::new(overlay.as_file().try_clone().unwrap(), false); + let backing_config = BackingFileConfig { + path: backing.as_path().to_str().unwrap().to_string(), + format: Some(ImageType::Raw), + }; + QcowFile::new_from_backing(raw, 3, virtual_size, &backing_config, true) + .expect("failed to create overlay qcow2"); + } + + (backing, overlay) +} + +/// QCOW2 overlay with raw backing opened via QcowDiskSync. +pub fn qcow_overlay_tempfile(num_clusters: usize) -> (TempFile, TempFile, QcowDiskSync) { + let (backing, overlay) = create_overlay_tempfiles(num_clusters); + let disk = QcowDiskSync::new(overlay.as_file().try_clone().unwrap(), false, true, true) + .expect("failed to open overlay qcow2 via QcowDiskSync"); + (backing, overlay, disk) +} + /// Spin and wait until the given eventfd becomes readable. pub fn wait_for_eventfd(notifier: &EventFd) { loop { From 79b58c0fafe77ae03913fd6ab3661821577a91cc Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 26 Mar 2026 21:10:21 +0100 Subject: [PATCH 651/742] performance-metrics: Add QCOW2 backing file read micro benchmark Add micro_bench_qcow_backing_read which reads clusters from a QCOW2 overlay where all data lives in a raw backing file. Every read falls through the L2 lookup to the backing file, exercising the backing chain read path. Workloads: 128 and 256 clusters. 
Signed-off-by: Anatol Belski --- performance-metrics/src/main.rs | 26 +++++++++++++++++++- performance-metrics/src/micro_bench_block.rs | 25 +++++++++++++++++++ 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/performance-metrics/src/main.rs b/performance-metrics/src/main.rs index 3020d5827c..e40323788c 100644 --- a/performance-metrics/src/main.rs +++ b/performance-metrics/src/main.rs @@ -378,7 +378,7 @@ mod adjuster { } } -const TEST_LIST: [PerformanceTest; 72] = [ +const TEST_LIST: [PerformanceTest; 74] = [ PerformanceTest { name: "boot_time_ms", func_ptr: performance_boot_time, @@ -1373,6 +1373,30 @@ const TEST_LIST: [PerformanceTest; 72] = [ }, unit_adjuster: adjuster::s_to_us, }, + PerformanceTest { + name: "micro_block_qcow_backing_read_128_us", + func_ptr: micro_bench_block::micro_bench_qcow_backing_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(128), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_backing_read_256_us", + func_ptr: micro_bench_block::micro_bench_qcow_backing_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(256), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, ]; fn run_test_with_timeout( diff --git a/performance-metrics/src/micro_bench_block.rs b/performance-metrics/src/micro_bench_block.rs index 40869afbc0..efb73559d8 100644 --- a/performance-metrics/src/micro_bench_block.rs +++ b/performance-metrics/src/micro_bench_block.rs @@ -196,3 +196,28 @@ pub fn micro_bench_qcow_fsync(control: &PerformanceTestControl) -> f64 { elapsed } + +/// Read num_ops clusters from a QCOW2 overlay whose data lives entirely +/// in a raw backing file. 
+/// +/// This exercises the backing file read path: L2 lookup finds no +/// allocated cluster and falls through to the backing file for every +/// read. +/// +/// Returns the total read wall clock time in seconds. +pub fn micro_bench_qcow_backing_read(control: &PerformanceTestControl) -> f64 { + let num_ops = control.num_ops.expect("num_ops required") as usize; + let (_backing, _overlay, disk) = util::qcow_overlay_tempfile(num_ops); + let mut async_io = disk.new_async_io(1).expect("new_async_io failed"); + + let mut buf = vec![0u8; QCOW_CLUSTER_SIZE as usize]; + let iovec = read_iovec(&mut buf); + + let start = Instant::now(); + submit_reads(async_io.as_mut(), num_ops, QCOW_CLUSTER_SIZE, &[iovec]); + let elapsed = start.elapsed().as_secs_f64(); + + drain_completions(async_io.as_mut(), num_ops); + + elapsed +} From b2430d701bc7df964de751f13295fd65372fd448 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 26 Mar 2026 21:13:09 +0100 Subject: [PATCH 652/742] performance-metrics: Add QCOW2 copy-on-write write micro benchmark Add micro_bench_qcow_cow_write which writes clusters into a QCOW2 overlay backed by a raw file. Each write triggers copy-on-write: cluster allocation, L2 and refcount table updates, then the data write. This measures COW allocation overhead compared to writing into a plain empty image. Workloads: 128 and 256 clusters. 
Signed-off-by: Anatol Belski --- performance-metrics/src/main.rs | 26 +++++++++++++++++++- performance-metrics/src/micro_bench_block.rs | 26 ++++++++++++++++++++ 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/performance-metrics/src/main.rs b/performance-metrics/src/main.rs index e40323788c..879cce9262 100644 --- a/performance-metrics/src/main.rs +++ b/performance-metrics/src/main.rs @@ -378,7 +378,7 @@ mod adjuster { } } -const TEST_LIST: [PerformanceTest; 74] = [ +const TEST_LIST: [PerformanceTest; 76] = [ PerformanceTest { name: "boot_time_ms", func_ptr: performance_boot_time, @@ -1397,6 +1397,30 @@ const TEST_LIST: [PerformanceTest; 74] = [ }, unit_adjuster: adjuster::s_to_us, }, + PerformanceTest { + name: "micro_block_qcow_cow_write_128_us", + func_ptr: micro_bench_block::micro_bench_qcow_cow_write, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(128), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_cow_write_256_us", + func_ptr: micro_bench_block::micro_bench_qcow_cow_write, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(256), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, ]; fn run_test_with_timeout( diff --git a/performance-metrics/src/micro_bench_block.rs b/performance-metrics/src/micro_bench_block.rs index efb73559d8..035ca1d435 100644 --- a/performance-metrics/src/micro_bench_block.rs +++ b/performance-metrics/src/micro_bench_block.rs @@ -221,3 +221,29 @@ pub fn micro_bench_qcow_backing_read(control: &PerformanceTestControl) -> f64 { elapsed } + +/// Write num_ops clusters into a QCOW2 overlay backed by a raw file. +/// +/// Each write triggers copy-on-write: the overlay must allocate a new +/// cluster, update L2 and refcount tables, then write the data. 
This +/// measures the COW allocation overhead compared to writing into an +/// empty image (no backing read needed since we overwrite the full +/// cluster). +/// +/// Returns the total write wall clock time in seconds. +pub fn micro_bench_qcow_cow_write(control: &PerformanceTestControl) -> f64 { + let num_ops = control.num_ops.expect("num_ops required") as usize; + let (_backing, _overlay, disk) = util::qcow_overlay_tempfile(num_ops); + let mut async_io = disk.new_async_io(1).expect("new_async_io failed"); + + let buf = vec![0xBBu8; QCOW_CLUSTER_SIZE as usize]; + let iovec = write_iovec(&buf); + + let start = Instant::now(); + submit_writes(async_io.as_mut(), num_ops, QCOW_CLUSTER_SIZE, &[iovec]); + let elapsed = start.elapsed().as_secs_f64(); + + drain_completions(async_io.as_mut(), num_ops); + + elapsed +} From aca64ced8f60a11f1f432846b2fde19912431113 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 26 Mar 2026 21:20:14 +0100 Subject: [PATCH 653/742] performance-metrics: Add compressed QCOW2 tempfile helper Add compressed_qcow_tempfile() which creates a zlib compressed QCOW2 image by populating a RAW tempfile with data and converting it via qemu-img convert -c. Every cluster in the resulting image is stored compressed so reads exercise the decompression path. To be used by the compressed read benchmark. Signed-off-by: Anatol Belski --- performance-metrics/src/util.rs | 55 +++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/performance-metrics/src/util.rs b/performance-metrics/src/util.rs index 3ba9a9eeef..b859c8764b 100644 --- a/performance-metrics/src/util.rs +++ b/performance-metrics/src/util.rs @@ -4,8 +4,10 @@ //! Shared benchmark helpers. 
+use std::fs::File; use std::io::{ErrorKind, Seek, SeekFrom, Write}; use std::os::unix::fs::FileExt; +use std::process::Command; use std::thread; use std::time::Duration; @@ -182,6 +184,59 @@ pub fn qcow_overlay_tempfile(num_clusters: usize) -> (TempFile, TempFile, QcowDi (backing, overlay, disk) } +/// Create a zlib compressed QCOW2 image with `num_clusters` clusters +/// via `qemu-img convert -c`. +fn create_compressed_qcow_tempfile(num_clusters: usize) -> TempFile { + let virtual_size = QCOW_CLUSTER_SIZE * num_clusters as u64; + + let raw_tmp = TempFile::new().expect("failed to create raw tempfile"); + { + let f = raw_tmp.as_file(); + f.set_len(virtual_size).expect("set_len failed"); + let buf = vec![0xA5u8; QCOW_CLUSTER_SIZE as usize]; + for i in 0..num_clusters { + f.write_at(&buf, i as u64 * QCOW_CLUSTER_SIZE) + .expect("write_at failed"); + } + } + + let qcow_tmp = TempFile::new().expect("failed to create qcow2 tempfile"); + let qcow_path = qcow_tmp.as_path().to_str().unwrap().to_string(); + let raw_path = raw_tmp.as_path().to_str().unwrap().to_string(); + let status = Command::new("qemu-img") + .args([ + "convert", + "-f", + "raw", + "-O", + "qcow2", + "-c", + "-o", + "compression_type=zlib", + &raw_path, + &qcow_path, + ]) + .status() + .expect("failed to run qemu-img"); + assert!(status.success(), "qemu-img convert failed"); + + qcow_tmp +} + +/// Compressed QCOW2 opened via QcowDiskSync. +pub fn compressed_qcow_tempfile(num_clusters: usize) -> (TempFile, QcowDiskSync) { + let tmp = create_compressed_qcow_tempfile(num_clusters); + let path = tmp.as_path().to_str().unwrap().to_string(); + let disk = QcowDiskSync::new( + File::open(&path).expect("failed to open compressed qcow2"), + false, + false, + true, + ) + .expect("failed to open compressed qcow2 via QcowDiskSync"); + (tmp, disk) +} + /// Spin and wait until the given eventfd becomes readable. 
pub fn wait_for_eventfd(notifier: &EventFd) { loop { From 7dd1978fceed7d6c9e178c7474bc9781f790b2b7 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 26 Mar 2026 21:25:35 +0100 Subject: [PATCH 654/742] performance-metrics: Add QCOW2 compressed read micro benchmark Add micro_bench_qcow_compressed_read which reads clusters from a zlib compressed QCOW2 image. Every cluster triggers decompression, isolating the decompression overhead from the normal allocated cluster read path. Workloads: 128 and 256 clusters. Signed-off-by: Anatol Belski --- performance-metrics/src/main.rs | 26 +++++++++++++++++++- performance-metrics/src/micro_bench_block.rs | 24 ++++++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/performance-metrics/src/main.rs b/performance-metrics/src/main.rs index 879cce9262..d78e451b8b 100644 --- a/performance-metrics/src/main.rs +++ b/performance-metrics/src/main.rs @@ -378,7 +378,7 @@ mod adjuster { } } -const TEST_LIST: [PerformanceTest; 76] = [ +const TEST_LIST: [PerformanceTest; 78] = [ PerformanceTest { name: "boot_time_ms", func_ptr: performance_boot_time, @@ -1421,6 +1421,30 @@ const TEST_LIST: [PerformanceTest; 76] = [ }, unit_adjuster: adjuster::s_to_us, }, + PerformanceTest { + name: "micro_block_qcow_compressed_read_128_us", + func_ptr: micro_bench_block::micro_bench_qcow_compressed_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(128), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_compressed_read_256_us", + func_ptr: micro_bench_block::micro_bench_qcow_compressed_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(256), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, ]; fn run_test_with_timeout( diff --git a/performance-metrics/src/micro_bench_block.rs 
b/performance-metrics/src/micro_bench_block.rs index 035ca1d435..09cb093577 100644 --- a/performance-metrics/src/micro_bench_block.rs +++ b/performance-metrics/src/micro_bench_block.rs @@ -247,3 +247,27 @@ pub fn micro_bench_qcow_cow_write(control: &PerformanceTestControl) -> f64 { elapsed } + +/// Read num_ops clusters from a zlib compressed QCOW2 image. +/// +/// Every cluster is stored compressed, so each read triggers +/// decompression. This isolates the decompression overhead from +/// the normal allocated-cluster read path. +/// +/// Returns the total read wall clock time in seconds. +pub fn micro_bench_qcow_compressed_read(control: &PerformanceTestControl) -> f64 { + let num_ops = control.num_ops.expect("num_ops required") as usize; + let (_tmp, disk) = util::compressed_qcow_tempfile(num_ops); + let mut async_io = disk.new_async_io(1).expect("new_async_io failed"); + + let mut buf = vec![0u8; QCOW_CLUSTER_SIZE as usize]; + let iovec = read_iovec(&mut buf); + + let start = Instant::now(); + submit_reads(async_io.as_mut(), num_ops, QCOW_CLUSTER_SIZE, &[iovec]); + let elapsed = start.elapsed().as_secs_f64(); + + drain_completions(async_io.as_mut(), num_ops); + + elapsed +} From 94f78edcf06321189d48e9961bb4bc12e2e3c389 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 26 Mar 2026 22:53:32 +0100 Subject: [PATCH 655/742] performance-metrics: Add QCOW2 multi-cluster read micro benchmark Add micro_bench_qcow_multi_cluster_read which issues large reads spanning 8 contiguous clusters (512 KiB) per read_vectored call. This exercises the mapping coalesce path where multiple L2 entries are merged into fewer host I/O operations. Workloads: 128 and 256 total clusters (16 and 32 reads). 
Signed-off-by: Anatol Belski --- performance-metrics/src/main.rs | 26 ++++++++++++++++- performance-metrics/src/micro_bench_block.rs | 30 ++++++++++++++++++++ 2 files changed, 55 insertions(+), 1 deletion(-) diff --git a/performance-metrics/src/main.rs b/performance-metrics/src/main.rs index d78e451b8b..ffb42edfdd 100644 --- a/performance-metrics/src/main.rs +++ b/performance-metrics/src/main.rs @@ -378,7 +378,7 @@ mod adjuster { } } -const TEST_LIST: [PerformanceTest; 78] = [ +const TEST_LIST: [PerformanceTest; 80] = [ PerformanceTest { name: "boot_time_ms", func_ptr: performance_boot_time, @@ -1445,6 +1445,30 @@ const TEST_LIST: [PerformanceTest; 78] = [ }, unit_adjuster: adjuster::s_to_us, }, + PerformanceTest { + name: "micro_block_qcow_multi_cluster_read_128_us", + func_ptr: micro_bench_block::micro_bench_qcow_multi_cluster_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(128), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_multi_cluster_read_256_us", + func_ptr: micro_bench_block::micro_bench_qcow_multi_cluster_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(256), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, ]; fn run_test_with_timeout( diff --git a/performance-metrics/src/micro_bench_block.rs b/performance-metrics/src/micro_bench_block.rs index 09cb093577..1b9ff3b8ef 100644 --- a/performance-metrics/src/micro_bench_block.rs +++ b/performance-metrics/src/micro_bench_block.rs @@ -271,3 +271,33 @@ pub fn micro_bench_qcow_compressed_read(control: &PerformanceTestControl) -> f64 elapsed } + +/// Issue large multicluster reads from a prepopulated QCOW2 image. +/// +/// Each read_vectored call spans `CLUSTERS_PER_READ` contiguous clusters +/// (8 x 64 KiB = 512 KiB). 
This exercises the mapping coalesce path +/// where multiple L2 entries are merged into fewer host I/O operations. +/// `num_ops` is the total number of clusters; reads are issued in +/// chunks of CLUSTERS_PER_READ. +/// +/// Returns the total read wall clock time in seconds. +pub fn micro_bench_qcow_multi_cluster_read(control: &PerformanceTestControl) -> f64 { + const CLUSTERS_PER_READ: usize = 8; + + let num_ops = control.num_ops.expect("num_ops required") as usize; + let (_tmp, disk) = util::qcow_tempfile(num_ops); + let mut async_io = disk.new_async_io(1).expect("new_async_io failed"); + + let read_size = CLUSTERS_PER_READ * QCOW_CLUSTER_SIZE as usize; + let mut buf = vec![0u8; read_size]; + let iovec = read_iovec(&mut buf); + + let num_reads = num_ops / CLUSTERS_PER_READ; + let start = Instant::now(); + submit_reads(async_io.as_mut(), num_reads, read_size as u64, &[iovec]); + let elapsed = start.elapsed().as_secs_f64(); + + drain_completions(async_io.as_mut(), num_reads); + + elapsed +} From 73d99c044c94232040857e28889b824029c97a54 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 26 Mar 2026 22:58:48 +0100 Subject: [PATCH 656/742] performance-metrics: Add sparse QCOW2 tempfile helper Add sparse_qcow_tempfile() which creates a QCOW2 image with one allocated cluster per L2 table, spread across num_l2_tables distinct L2 tables. Reading these clusters in sequence forces L2 cache misses when the count exceeds the cache capacity. Also add the L2_ENTRIES_PER_TABLE constant, 8192 for 64 KiB clusters. To be used by the L2 cache cold miss benchmark. 
Signed-off-by: Anatol Belski --- performance-metrics/src/util.rs | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/performance-metrics/src/util.rs b/performance-metrics/src/util.rs index b859c8764b..e4604f22d8 100644 --- a/performance-metrics/src/util.rs +++ b/performance-metrics/src/util.rs @@ -237,6 +237,35 @@ pub fn compressed_qcow_tempfile(num_clusters: usize) -> (TempFile, QcowDiskSync) (tmp, disk) } +/// Number of data clusters covered by a single L2 table (64 KiB cluster, +/// 8-byte entries -> 8192 entries per L2 table). +pub const L2_ENTRIES_PER_TABLE: usize = QCOW_CLUSTER_SIZE as usize / 8; + +/// Create a sparse QCOW2 image with one allocated cluster per L2 table, +/// spanning `num_l2_tables` L2 tables. +fn create_sparse_qcow_tempfile(num_l2_tables: usize) -> TempFile { + let virtual_size = QCOW_CLUSTER_SIZE * (num_l2_tables as u64 * L2_ENTRIES_PER_TABLE as u64); + let tmp = TempFile::new().expect("failed to create tempfile"); + let raw = RawFile::new(tmp.as_file().try_clone().unwrap(), false); + let mut qcow = QcowFile::new(raw, 3, virtual_size, true).expect("failed to create qcow2 file"); + let buf = vec![0xA5u8; QCOW_CLUSTER_SIZE as usize]; + for i in 0..num_l2_tables { + let offset = i as u64 * L2_ENTRIES_PER_TABLE as u64 * QCOW_CLUSTER_SIZE; + qcow.seek(SeekFrom::Start(offset)).expect("seek failed"); + qcow.write_all(&buf).expect("write failed"); + } + qcow.flush().expect("flush failed"); + tmp +} + +/// Sparse QCOW2 opened via QcowDiskSync. +pub fn sparse_qcow_tempfile(num_l2_tables: usize) -> (TempFile, QcowDiskSync) { + let tmp = create_sparse_qcow_tempfile(num_l2_tables); + let disk = QcowDiskSync::new(tmp.as_file().try_clone().unwrap(), false, false, true) + .expect("failed to open qcow2 via QcowDiskSync"); + (tmp, disk) +} + /// Spin and wait until the given eventfd becomes readable. 
pub fn wait_for_eventfd(notifier: &EventFd) { loop { From faf6f7b6353aa5c07e09a2bf710b7ed91b4050fd Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 26 Mar 2026 23:03:38 +0100 Subject: [PATCH 657/742] performance-metrics: Add QCOW2 L2 cache cold miss micro benchmark Add micro_bench_qcow_l2_cache_miss which reads one cluster from each of num_ops distinct L2 tables in a sparsely allocated image. Clusters are spaced L2_ENTRIES_PER_TABLE apart so every read touches a different L2 table, forcing eviction when num_ops exceeds the cache capacity. Workloads: 128 and 256 L2 tables. Signed-off-by: Anatol Belski --- performance-metrics/src/main.rs | 26 +++++++++++++++- performance-metrics/src/micro_bench_block.rs | 31 ++++++++++++++++++-- 2 files changed, 54 insertions(+), 3 deletions(-) diff --git a/performance-metrics/src/main.rs b/performance-metrics/src/main.rs index ffb42edfdd..2c33bf9a83 100644 --- a/performance-metrics/src/main.rs +++ b/performance-metrics/src/main.rs @@ -378,7 +378,7 @@ mod adjuster { } } -const TEST_LIST: [PerformanceTest; 80] = [ +const TEST_LIST: [PerformanceTest; 82] = [ PerformanceTest { name: "boot_time_ms", func_ptr: performance_boot_time, @@ -1469,6 +1469,30 @@ const TEST_LIST: [PerformanceTest; 80] = [ }, unit_adjuster: adjuster::s_to_us, }, + PerformanceTest { + name: "micro_block_qcow_l2_cache_miss_128_us", + func_ptr: micro_bench_block::micro_bench_qcow_l2_cache_miss, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(128), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_l2_cache_miss_256_us", + func_ptr: micro_bench_block::micro_bench_qcow_l2_cache_miss, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(256), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, ]; fn 
run_test_with_timeout( diff --git a/performance-metrics/src/micro_bench_block.rs b/performance-metrics/src/micro_bench_block.rs index 1b9ff3b8ef..2b83ab495e 100644 --- a/performance-metrics/src/micro_bench_block.rs +++ b/performance-metrics/src/micro_bench_block.rs @@ -16,8 +16,8 @@ use block::raw_async_aio::RawFileAsyncAio; use crate::PerformanceTestControl; use crate::util::{ - self, BLOCK_SIZE, QCOW_CLUSTER_SIZE, deterministic_permutation, drain_completions, read_iovec, - submit_reads, submit_writes, write_iovec, + self, BLOCK_SIZE, L2_ENTRIES_PER_TABLE, QCOW_CLUSTER_SIZE, deterministic_permutation, + drain_completions, read_iovec, submit_reads, submit_writes, write_iovec, }; /// Submit num_ops AIO writes, wait for them all to land, then time @@ -301,3 +301,30 @@ pub fn micro_bench_qcow_multi_cluster_read(control: &PerformanceTestControl) -> elapsed } + +/// Read one cluster from each of num_ops distinct L2 tables in a +/// sparsely allocated QCOW2 image. +/// +/// The clusters are spaced L2_ENTRIES_PER_TABLE apart so every read +/// touches a different L2 table. With num_ops exceeding the L2 cache +/// capacity (100 entries), this forces eviction on nearly every read +/// and measures the cold L2 cache miss overhead. +/// +/// Returns the total read wall clock time in seconds. 
+pub fn micro_bench_qcow_l2_cache_miss(control: &PerformanceTestControl) -> f64 { + let num_ops = control.num_ops.expect("num_ops required") as usize; + let (_tmp, disk) = util::sparse_qcow_tempfile(num_ops); + let mut async_io = disk.new_async_io(1).expect("new_async_io failed"); + + let mut buf = vec![0u8; QCOW_CLUSTER_SIZE as usize]; + let iovec = read_iovec(&mut buf); + + let stride = L2_ENTRIES_PER_TABLE as u64 * QCOW_CLUSTER_SIZE; + let start = Instant::now(); + submit_reads(async_io.as_mut(), num_ops, stride, &[iovec]); + let elapsed = start.elapsed().as_secs_f64(); + + drain_completions(async_io.as_mut(), num_ops); + + elapsed +} From 3cb116fcfcec332778cbf520253091a33f3a6538 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 27 Mar 2026 23:11:34 +0100 Subject: [PATCH 658/742] performance-metrics: Add async drain completions helper Signed-off-by: Anatol Belski --- performance-metrics/src/util.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/performance-metrics/src/util.rs b/performance-metrics/src/util.rs index e4604f22d8..f9f322f22f 100644 --- a/performance-metrics/src/util.rs +++ b/performance-metrics/src/util.rs @@ -128,6 +128,18 @@ pub fn submit_writes(async_io: &mut dyn AsyncIo, count: usize, stride: u64, iove } } +/// Drain `count` completions from an asynchronous I/O backend that delivers +/// results via eventfd notification (e.g. io_uring). +pub fn drain_async_completions(async_io: &mut dyn AsyncIo, count: usize) { + let mut drained = 0usize; + while drained < count { + wait_for_eventfd(async_io.notifier()); + while async_io.next_completed_request().is_some() { + drained += 1; + } + } +} + /// Create an empty QCOW2 image sized for `num_clusters` clusters. /// No data clusters are allocated. 
fn create_empty_qcow_tempfile(num_clusters: usize) -> TempFile { From f8dbec0abbcab8058c2a0a8884195ea02a54d53f Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 26 Mar 2026 23:26:35 +0100 Subject: [PATCH 659/742] performance-metrics: Add QCOW2 async read micro benchmark Add micro_bench_qcow_async_read which reads clusters through the QcowDiskAsync io_uring backend. Single allocated cluster reads go through io_uring for true asynchronous completion, unlike the sync benchmarks which use QcowDiskSync with blocking I/O. Workloads: 128 and 256 clusters. Signed-off-by: Anatol Belski --- performance-metrics/src/main.rs | 26 +++++++++++++++++- performance-metrics/src/micro_bench_block.rs | 29 +++++++++++++++++++- 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/performance-metrics/src/main.rs b/performance-metrics/src/main.rs index 2c33bf9a83..1177b54f4f 100644 --- a/performance-metrics/src/main.rs +++ b/performance-metrics/src/main.rs @@ -378,7 +378,7 @@ mod adjuster { } } -const TEST_LIST: [PerformanceTest; 82] = [ +const TEST_LIST: [PerformanceTest; 84] = [ PerformanceTest { name: "boot_time_ms", func_ptr: performance_boot_time, @@ -1493,6 +1493,30 @@ const TEST_LIST: [PerformanceTest; 82] = [ }, unit_adjuster: adjuster::s_to_us, }, + PerformanceTest { + name: "micro_block_qcow_async_read_128_us", + func_ptr: micro_bench_block::micro_bench_qcow_async_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(128), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_async_read_256_us", + func_ptr: micro_bench_block::micro_bench_qcow_async_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(256), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, ]; fn run_test_with_timeout( diff --git 
a/performance-metrics/src/micro_bench_block.rs b/performance-metrics/src/micro_bench_block.rs index 2b83ab495e..5ab907a87a 100644 --- a/performance-metrics/src/micro_bench_block.rs +++ b/performance-metrics/src/micro_bench_block.rs @@ -17,7 +17,8 @@ use block::raw_async_aio::RawFileAsyncAio; use crate::PerformanceTestControl; use crate::util::{ self, BLOCK_SIZE, L2_ENTRIES_PER_TABLE, QCOW_CLUSTER_SIZE, deterministic_permutation, - drain_completions, read_iovec, submit_reads, submit_writes, write_iovec, + drain_async_completions, drain_completions, read_iovec, submit_reads, submit_writes, + write_iovec, }; /// Submit num_ops AIO writes, wait for them all to land, then time @@ -328,3 +329,29 @@ pub fn micro_bench_qcow_l2_cache_miss(control: &PerformanceTestControl) -> f64 { elapsed } + +/// Read num_ops clusters from a prepopulated qcow2 image through the +/// QcowAsync io_uring path and time the total wall clock. +/// +/// Unlike micro_bench_qcow_read which uses QcowDiskSync (blocking), +/// this uses QcowDiskAsync where single-allocated-cluster reads go +/// through io_uring for true asynchronous completion. +/// +/// Returns the total read wall clock time in seconds. +pub fn micro_bench_qcow_async_read(control: &PerformanceTestControl) -> f64 { + let num_ops = control.num_ops.expect("num_ops required") as usize; + let (_tmp, disk) = util::qcow_async_tempfile(num_ops); + let mut async_io = disk + .new_async_io(num_ops as u32) + .expect("new_async_io failed"); + + let mut buf = vec![0u8; QCOW_CLUSTER_SIZE as usize]; + let iovec = read_iovec(&mut buf); + + let start = Instant::now(); + submit_reads(async_io.as_mut(), num_ops, QCOW_CLUSTER_SIZE, &[iovec]); + + // Drain all io_uring completions before stopping the clock. 
+ drain_async_completions(async_io.as_mut(), num_ops); + start.elapsed().as_secs_f64() +} From e9d1ffd24f0112477dd35a137316d746d5a920f8 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 27 Mar 2026 09:30:13 +0100 Subject: [PATCH 660/742] performance-metrics: Add QCOW2 batch read micro benchmark Add micro_bench_qcow_batch_read which builds a batch of num_ops read requests and submits them all at once through submit_batch_requests. This exercises the io_uring batch submission path added in qcow_async, where multiple SQEs are packed into a single io_uring_enter call. Signed-off-by: Anatol Belski --- performance-metrics/src/main.rs | 26 +++++++++++- performance-metrics/src/micro_bench_block.rs | 42 ++++++++++++++++++++ 2 files changed, 67 insertions(+), 1 deletion(-) diff --git a/performance-metrics/src/main.rs b/performance-metrics/src/main.rs index 1177b54f4f..f68471fd6d 100644 --- a/performance-metrics/src/main.rs +++ b/performance-metrics/src/main.rs @@ -378,7 +378,7 @@ mod adjuster { } } -const TEST_LIST: [PerformanceTest; 84] = [ +const TEST_LIST: [PerformanceTest; 86] = [ PerformanceTest { name: "boot_time_ms", func_ptr: performance_boot_time, @@ -1517,6 +1517,30 @@ const TEST_LIST: [PerformanceTest; 84] = [ }, unit_adjuster: adjuster::s_to_us, }, + PerformanceTest { + name: "micro_block_qcow_batch_read_128_us", + func_ptr: micro_bench_block::micro_bench_qcow_batch_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(128), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_batch_read_256_us", + func_ptr: micro_bench_block::micro_bench_qcow_batch_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(256), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, ]; fn run_test_with_timeout( diff --git 
a/performance-metrics/src/micro_bench_block.rs b/performance-metrics/src/micro_bench_block.rs index 5ab907a87a..5f2eabc28f 100644 --- a/performance-metrics/src/micro_bench_block.rs +++ b/performance-metrics/src/micro_bench_block.rs @@ -13,6 +13,7 @@ use std::time::Instant; use block::async_io::AsyncIo; use block::disk_file::AsyncDiskFile; use block::raw_async_aio::RawFileAsyncAio; +use block::{BatchRequest, RequestType}; use crate::PerformanceTestControl; use crate::util::{ @@ -355,3 +356,44 @@ pub fn micro_bench_qcow_async_read(control: &PerformanceTestControl) -> f64 { drain_async_completions(async_io.as_mut(), num_ops); start.elapsed().as_secs_f64() } + +/// Measure QCOW2 batch read submission via io_uring. +/// +/// Builds a batch of `num_ops` read requests and submits them all at once +/// through `submit_batch_requests`, which packs multiple SQEs into a single +/// io_uring submission. Returns the total wall clock time in seconds. +pub fn micro_bench_qcow_batch_read(control: &PerformanceTestControl) -> f64 { + let num_ops = control.num_ops.expect("num_ops required") as usize; + let (_tmp, disk) = util::qcow_async_tempfile(num_ops); + let mut async_io = disk + .new_async_io(num_ops as u32) + .expect("new_async_io failed"); + + let mut buf = vec![0u8; num_ops * QCOW_CLUSTER_SIZE as usize]; + + let batch: Vec = (0..num_ops) + .map(|i| { + let slice = + &mut buf[i * QCOW_CLUSTER_SIZE as usize..(i + 1) * QCOW_CLUSTER_SIZE as usize]; + BatchRequest { + offset: (i as u64 * QCOW_CLUSTER_SIZE) as libc::off_t, + iovecs: vec![libc::iovec { + iov_base: slice.as_mut_ptr() as *mut libc::c_void, + iov_len: QCOW_CLUSTER_SIZE as usize, + }] + .into(), + user_data: i as u64, + request_type: RequestType::In, + } + }) + .collect(); + + let start = Instant::now(); + async_io + .submit_batch_requests(&batch) + .expect("submit_batch_requests failed"); + + // Drain all io_uring completions before stopping the clock. 
+ drain_async_completions(async_io.as_mut(), num_ops); + start.elapsed().as_secs_f64() +} From 377c260196bbeb2d1eeab31be50da364192d8fbb Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 27 Mar 2026 10:23:27 +0100 Subject: [PATCH 661/742] performance-metrics: Add QCOW2 async random read micro benchmark Add micro_bench_qcow_async_random_read which reads clusters in random order through the QcowAsync io_uring path. This mirrors the existing sync random read benchmark and measures io_uring completion handling under random access patterns. Signed-off-by: Anatol Belski --- performance-metrics/src/main.rs | 26 +++++++++++++++- performance-metrics/src/micro_bench_block.rs | 31 ++++++++++++++++++++ 2 files changed, 56 insertions(+), 1 deletion(-) diff --git a/performance-metrics/src/main.rs b/performance-metrics/src/main.rs index f68471fd6d..94896ae907 100644 --- a/performance-metrics/src/main.rs +++ b/performance-metrics/src/main.rs @@ -378,7 +378,7 @@ mod adjuster { } } -const TEST_LIST: [PerformanceTest; 86] = [ +const TEST_LIST: [PerformanceTest; 88] = [ PerformanceTest { name: "boot_time_ms", func_ptr: performance_boot_time, @@ -1541,6 +1541,30 @@ const TEST_LIST: [PerformanceTest; 86] = [ }, unit_adjuster: adjuster::s_to_us, }, + PerformanceTest { + name: "micro_block_qcow_async_random_read_128_us", + func_ptr: micro_bench_block::micro_bench_qcow_async_random_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(128), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_async_random_read_256_us", + func_ptr: micro_bench_block::micro_bench_qcow_async_random_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(256), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, ]; fn run_test_with_timeout( diff --git 
a/performance-metrics/src/micro_bench_block.rs b/performance-metrics/src/micro_bench_block.rs index 5f2eabc28f..ae204906ec 100644 --- a/performance-metrics/src/micro_bench_block.rs +++ b/performance-metrics/src/micro_bench_block.rs @@ -397,3 +397,34 @@ pub fn micro_bench_qcow_batch_read(control: &PerformanceTestControl) -> f64 { drain_async_completions(async_io.as_mut(), num_ops); start.elapsed().as_secs_f64() } + +/// Read num_ops clusters from a prepopulated QCOW2 image in random order +/// through the QcowAsync io_uring path. +/// +/// Returns the total read wall clock time in seconds. +pub fn micro_bench_qcow_async_random_read(control: &PerformanceTestControl) -> f64 { + let num_ops = control.num_ops.expect("num_ops required") as usize; + let (_tmp, disk) = util::qcow_async_tempfile(num_ops); + let mut async_io = disk + .new_async_io(num_ops as u32) + .expect("new_async_io failed"); + + let indices = deterministic_permutation(num_ops); + + let mut buf = vec![0u8; QCOW_CLUSTER_SIZE as usize]; + let iovec = read_iovec(&mut buf); + + let start = Instant::now(); + for (seq, &cluster_idx) in indices.iter().enumerate() { + async_io + .read_vectored( + (cluster_idx as u64 * QCOW_CLUSTER_SIZE) as libc::off_t, + &[iovec], + seq as u64, + ) + .expect("read_vectored failed"); + } + + drain_async_completions(async_io.as_mut(), num_ops); + start.elapsed().as_secs_f64() +} From 2041ba5a919fa4db286db9235688fa5fcdaa4330 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 27 Mar 2026 11:06:08 +0100 Subject: [PATCH 662/742] performance-metrics: Add QCOW2 async multicluster read micro benchmark Add micro_bench_qcow_async_multi_cluster_read which reads 8 contiguous clusters (512 KiB) per request through the QcowAsync io_uring path. With coalesced mappings this can hit the io_uring fast path for a single Readv SQE. 
Signed-off-by: Anatol Belski --- performance-metrics/src/main.rs | 26 +++++++++++++++++- performance-metrics/src/micro_bench_block.rs | 28 ++++++++++++++++++++ 2 files changed, 53 insertions(+), 1 deletion(-) diff --git a/performance-metrics/src/main.rs b/performance-metrics/src/main.rs index 94896ae907..868d3ee15e 100644 --- a/performance-metrics/src/main.rs +++ b/performance-metrics/src/main.rs @@ -378,7 +378,7 @@ mod adjuster { } } -const TEST_LIST: [PerformanceTest; 88] = [ +const TEST_LIST: [PerformanceTest; 90] = [ PerformanceTest { name: "boot_time_ms", func_ptr: performance_boot_time, @@ -1565,6 +1565,30 @@ const TEST_LIST: [PerformanceTest; 88] = [ }, unit_adjuster: adjuster::s_to_us, }, + PerformanceTest { + name: "micro_block_qcow_async_multi_cluster_read_128_us", + func_ptr: micro_bench_block::micro_bench_qcow_async_multi_cluster_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(128), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_async_multi_cluster_read_256_us", + func_ptr: micro_bench_block::micro_bench_qcow_async_multi_cluster_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(256), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, ]; fn run_test_with_timeout( diff --git a/performance-metrics/src/micro_bench_block.rs b/performance-metrics/src/micro_bench_block.rs index ae204906ec..4ec0051a40 100644 --- a/performance-metrics/src/micro_bench_block.rs +++ b/performance-metrics/src/micro_bench_block.rs @@ -428,3 +428,31 @@ pub fn micro_bench_qcow_async_random_read(control: &PerformanceTestControl) -> f drain_async_completions(async_io.as_mut(), num_ops); start.elapsed().as_secs_f64() } + +/// Issue large multi-cluster reads from a prepopulated QCOW2 image +/// through the QcowAsync io_uring 
path. +/// +/// Each read spans 8 contiguous clusters (512 KiB). With coalesced +/// mappings, this can hit the io_uring fast path for a single Readv. +/// +/// Returns the total read wall clock time in seconds. +pub fn micro_bench_qcow_async_multi_cluster_read(control: &PerformanceTestControl) -> f64 { + const CLUSTERS_PER_READ: usize = 8; + + let num_ops = control.num_ops.expect("num_ops required") as usize; + let (_tmp, disk) = util::qcow_async_tempfile(num_ops); + let mut async_io = disk + .new_async_io(num_ops as u32) + .expect("new_async_io failed"); + + let read_size = CLUSTERS_PER_READ * QCOW_CLUSTER_SIZE as usize; + let mut buf = vec![0u8; read_size]; + let iovec = read_iovec(&mut buf); + + let num_reads = num_ops / CLUSTERS_PER_READ; + let start = Instant::now(); + submit_reads(async_io.as_mut(), num_reads, read_size as u64, &[iovec]); + + drain_async_completions(async_io.as_mut(), num_reads); + start.elapsed().as_secs_f64() +} From c8fee5953fe8e2308474c9a3601c2b57e44482bc Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 27 Mar 2026 11:40:22 +0100 Subject: [PATCH 663/742] performance-metrics: Add async QCOW2 overlay tempfile helper Add qcow_async_overlay_tempfile which creates a QCOW2 overlay backed by a RAW file and opens it via QcowDiskAsync. Mirrors the existing qcow_overlay_tempfile for io_uring benchmarks. Signed-off-by: Anatol Belski --- performance-metrics/src/util.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/performance-metrics/src/util.rs b/performance-metrics/src/util.rs index f9f322f22f..c96d07253b 100644 --- a/performance-metrics/src/util.rs +++ b/performance-metrics/src/util.rs @@ -196,6 +196,14 @@ pub fn qcow_overlay_tempfile(num_clusters: usize) -> (TempFile, TempFile, QcowDi (backing, overlay, disk) } +/// QCOW2 overlay with raw backing opened via QcowDiskAsync. 
+pub fn qcow_async_overlay_tempfile(num_clusters: usize) -> (TempFile, TempFile, QcowDiskAsync) { + let (backing, overlay) = create_overlay_tempfiles(num_clusters); + let disk = QcowDiskAsync::new(overlay.as_file().try_clone().unwrap(), false, true, true) + .expect("failed to open overlay qcow2 via QcowDiskAsync"); + (backing, overlay, disk) +} + /// Create a zlib compressed QCOW2 image with `num_clusters` clusters /// via `qemu-img convert -c`. fn create_compressed_qcow_tempfile(num_clusters: usize) -> TempFile { From 324f16861d431fd53cc702d82b3256330b1bab91 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 27 Mar 2026 12:55:47 +0100 Subject: [PATCH 664/742] performance-metrics: Add QCOW2 async backing file read micro benchmark Add micro_bench_qcow_async_backing_read which reads clusters from a QCOW2 overlay through the QcowAsync io_uring path. All reads fall through to the backing file, exercising the sync fallback path in QcowAsync. Signed-off-by: Anatol Belski --- performance-metrics/src/main.rs | 26 +++++++++++++++++++- performance-metrics/src/micro_bench_block.rs | 24 ++++++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/performance-metrics/src/main.rs b/performance-metrics/src/main.rs index 868d3ee15e..56c6a08342 100644 --- a/performance-metrics/src/main.rs +++ b/performance-metrics/src/main.rs @@ -378,7 +378,7 @@ mod adjuster { } } -const TEST_LIST: [PerformanceTest; 90] = [ +const TEST_LIST: [PerformanceTest; 92] = [ PerformanceTest { name: "boot_time_ms", func_ptr: performance_boot_time, @@ -1589,6 +1589,30 @@ const TEST_LIST: [PerformanceTest; 90] = [ }, unit_adjuster: adjuster::s_to_us, }, + PerformanceTest { + name: "micro_block_qcow_async_backing_read_128_us", + func_ptr: micro_bench_block::micro_bench_qcow_async_backing_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(128), + ..PerformanceTestControl::default() + }, + unit_adjuster: 
adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_async_backing_read_256_us", + func_ptr: micro_bench_block::micro_bench_qcow_async_backing_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(256), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, ]; fn run_test_with_timeout( diff --git a/performance-metrics/src/micro_bench_block.rs b/performance-metrics/src/micro_bench_block.rs index 4ec0051a40..0d485aaa48 100644 --- a/performance-metrics/src/micro_bench_block.rs +++ b/performance-metrics/src/micro_bench_block.rs @@ -456,3 +456,27 @@ pub fn micro_bench_qcow_async_multi_cluster_read(control: &PerformanceTestContro drain_async_completions(async_io.as_mut(), num_reads); start.elapsed().as_secs_f64() } + +/// Read num_ops clusters from a QCOW2 overlay backed by a raw file +/// through the QcowAsync io_uring path. +/// +/// All reads fall through to the backing file (sync fallback in +/// QcowAsync since the mapping is not a single allocated cluster). +/// +/// Returns the total read wall clock time in seconds. 
+pub fn micro_bench_qcow_async_backing_read(control: &PerformanceTestControl) -> f64 { + let num_ops = control.num_ops.expect("num_ops required") as usize; + let (_backing, _overlay, disk) = util::qcow_async_overlay_tempfile(num_ops); + let mut async_io = disk + .new_async_io(num_ops as u32) + .expect("new_async_io failed"); + + let mut buf = vec![0u8; QCOW_CLUSTER_SIZE as usize]; + let iovec = read_iovec(&mut buf); + + let start = Instant::now(); + submit_reads(async_io.as_mut(), num_ops, QCOW_CLUSTER_SIZE, &[iovec]); + + drain_async_completions(async_io.as_mut(), num_ops); + start.elapsed().as_secs_f64() +} From 4bf3672fad35a80e7ccadffaad35a5d1044ae17d Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 27 Mar 2026 13:00:20 +0100 Subject: [PATCH 665/742] performance-metrics: Add compressed async QCOW2 tempfile helper Add compressed_qcow_async_tempfile which creates a zlib compressed QCOW2 image via qemu-img and opens it via QcowDiskAsync. Mirrors the existing compressed_qcow_tempfile for io_uring benchmarks. Signed-off-by: Anatol Belski --- performance-metrics/src/util.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/performance-metrics/src/util.rs b/performance-metrics/src/util.rs index c96d07253b..6482bd1151 100644 --- a/performance-metrics/src/util.rs +++ b/performance-metrics/src/util.rs @@ -257,6 +257,20 @@ pub fn compressed_qcow_tempfile(num_clusters: usize) -> (TempFile, QcowDiskSync) (tmp, disk) } +/// Compressed QCOW2 opened via QcowDiskAsync. 
+pub fn compressed_qcow_async_tempfile(num_clusters: usize) -> (TempFile, QcowDiskAsync) { + let tmp = create_compressed_qcow_tempfile(num_clusters); + let path = tmp.as_path().to_str().unwrap().to_string(); + let disk = QcowDiskAsync::new( + File::open(&path).expect("failed to open compressed qcow2"), + false, + false, + true, + ) + .expect("failed to open compressed qcow2 via QcowDiskAsync"); + (tmp, disk) +} + /// Number of data clusters covered by a single L2 table (64 KiB cluster, /// 8-byte entries -> 8192 entries per L2 table). pub const L2_ENTRIES_PER_TABLE: usize = QCOW_CLUSTER_SIZE as usize / 8; From fca6429e9bd8335ccd85cca68f20d8364b467faa Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 27 Mar 2026 13:01:33 +0100 Subject: [PATCH 666/742] performance-metrics: Add QCOW2 async compressed read micro benchmark Add micro_bench_qcow_async_compressed_read which reads from a zlib compressed QCOW2 image through the QcowAsync io_uring path. Compressed clusters take the sync fallback since they require decompression. 
Signed-off-by: Anatol Belski --- performance-metrics/src/main.rs | 26 +++++++++++++++++++- performance-metrics/src/micro_bench_block.rs | 22 +++++++++++++++++ 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/performance-metrics/src/main.rs b/performance-metrics/src/main.rs index 56c6a08342..1a7956bba7 100644 --- a/performance-metrics/src/main.rs +++ b/performance-metrics/src/main.rs @@ -378,7 +378,7 @@ mod adjuster { } } -const TEST_LIST: [PerformanceTest; 92] = [ +const TEST_LIST: [PerformanceTest; 94] = [ PerformanceTest { name: "boot_time_ms", func_ptr: performance_boot_time, @@ -1613,6 +1613,30 @@ const TEST_LIST: [PerformanceTest; 92] = [ }, unit_adjuster: adjuster::s_to_us, }, + PerformanceTest { + name: "micro_block_qcow_async_compressed_read_128_us", + func_ptr: micro_bench_block::micro_bench_qcow_async_compressed_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(128), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_async_compressed_read_256_us", + func_ptr: micro_bench_block::micro_bench_qcow_async_compressed_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(256), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, ]; fn run_test_with_timeout( diff --git a/performance-metrics/src/micro_bench_block.rs b/performance-metrics/src/micro_bench_block.rs index 0d485aaa48..6083c28f89 100644 --- a/performance-metrics/src/micro_bench_block.rs +++ b/performance-metrics/src/micro_bench_block.rs @@ -480,3 +480,25 @@ pub fn micro_bench_qcow_async_backing_read(control: &PerformanceTestControl) -> drain_async_completions(async_io.as_mut(), num_ops); start.elapsed().as_secs_f64() } + +/// Compressed clusters take the sync fallback in QcowAsync since they +/// require decompression. 
This measures decompression overhead through +/// the async code path. +/// +/// Returns the total read wall clock time in seconds. +pub fn micro_bench_qcow_async_compressed_read(control: &PerformanceTestControl) -> f64 { + let num_ops = control.num_ops.expect("num_ops required") as usize; + let (_tmp, disk) = util::compressed_qcow_async_tempfile(num_ops); + let mut async_io = disk + .new_async_io(num_ops as u32) + .expect("new_async_io failed"); + + let mut buf = vec![0u8; QCOW_CLUSTER_SIZE as usize]; + let iovec = read_iovec(&mut buf); + + let start = Instant::now(); + submit_reads(async_io.as_mut(), num_ops, QCOW_CLUSTER_SIZE, &[iovec]); + + drain_async_completions(async_io.as_mut(), num_ops); + start.elapsed().as_secs_f64() +} From 1069505151baea26c9e494c46d8e13a172b706ad Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 27 Mar 2026 13:03:17 +0100 Subject: [PATCH 667/742] performance-metrics: Add empty async QCOW2 tempfile helper Add empty_qcow_async_tempfile which creates an empty QCOW2 image and opens it via QcowDiskAsync. Mirrors the existing empty_qcow_tempfile for io_uring write benchmarks. Signed-off-by: Anatol Belski --- performance-metrics/src/util.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/performance-metrics/src/util.rs b/performance-metrics/src/util.rs index 6482bd1151..9536173938 100644 --- a/performance-metrics/src/util.rs +++ b/performance-metrics/src/util.rs @@ -158,6 +158,14 @@ pub fn empty_qcow_tempfile(num_clusters: usize) -> (TempFile, QcowDiskSync) { (tmp, disk) } +/// Empty QCOW2 opened via QcowDiskAsync. +pub fn empty_qcow_async_tempfile(num_clusters: usize) -> (TempFile, QcowDiskAsync) { + let tmp = create_empty_qcow_tempfile(num_clusters); + let disk = QcowDiskAsync::new(tmp.as_file().try_clone().unwrap(), false, false, true) + .expect("failed to open qcow2 via QcowDiskAsync"); + (tmp, disk) +} + /// Create a QCOW2 overlay backed by a raw file with `num_clusters` /// pre-populated clusters. 
Returns (backing_tempfile, overlay_tempfile). fn create_overlay_tempfiles(num_clusters: usize) -> (TempFile, TempFile) { From 1d5d13eb7bb3dba77026b65c0b16529337460b69 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 27 Mar 2026 13:04:35 +0100 Subject: [PATCH 668/742] performance-metrics: Add QCOW2 async write micro benchmark Add micro_bench_qcow_async_write which writes clusters into an empty QCOW2 image through the QcowAsync io_uring path. Writes in QcowAsync are synchronous due to COW metadata allocation, so this measures the write path overhead through the async code path. Signed-off-by: Anatol Belski --- performance-metrics/src/main.rs | 26 +++++++++++++++++++- performance-metrics/src/micro_bench_block.rs | 25 +++++++++++++++++++ 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/performance-metrics/src/main.rs b/performance-metrics/src/main.rs index 1a7956bba7..ede10c9538 100644 --- a/performance-metrics/src/main.rs +++ b/performance-metrics/src/main.rs @@ -378,7 +378,7 @@ mod adjuster { } } -const TEST_LIST: [PerformanceTest; 94] = [ +const TEST_LIST: [PerformanceTest; 96] = [ PerformanceTest { name: "boot_time_ms", func_ptr: performance_boot_time, @@ -1637,6 +1637,30 @@ const TEST_LIST: [PerformanceTest; 94] = [ }, unit_adjuster: adjuster::s_to_us, }, + PerformanceTest { + name: "micro_block_qcow_async_write_128_us", + func_ptr: micro_bench_block::micro_bench_qcow_async_write, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(128), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_async_write_256_us", + func_ptr: micro_bench_block::micro_bench_qcow_async_write, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(256), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, ]; fn run_test_with_timeout( 
diff --git a/performance-metrics/src/micro_bench_block.rs b/performance-metrics/src/micro_bench_block.rs index 6083c28f89..9e0c44c210 100644 --- a/performance-metrics/src/micro_bench_block.rs +++ b/performance-metrics/src/micro_bench_block.rs @@ -502,3 +502,28 @@ pub fn micro_bench_qcow_async_compressed_read(control: &PerformanceTestControl) drain_async_completions(async_io.as_mut(), num_ops); start.elapsed().as_secs_f64() } + +/// Write num_ops clusters into an empty QCOW2 image through the +/// QcowAsync io_uring path. +/// +/// Writes in QcowAsync are synchronous (COW metadata allocation must +/// complete before the host offset is known), so this measures the +/// write path overhead through the async code path. +/// +/// Returns the total write wall clock time in seconds. +pub fn micro_bench_qcow_async_write(control: &PerformanceTestControl) -> f64 { + let num_ops = control.num_ops.expect("num_ops required") as usize; + let (_tmp, disk) = util::empty_qcow_async_tempfile(num_ops); + let mut async_io = disk + .new_async_io(num_ops as u32) + .expect("new_async_io failed"); + + let buf = vec![0xA5u8; QCOW_CLUSTER_SIZE as usize]; + let iovec = write_iovec(&buf); + + let start = Instant::now(); + submit_writes(async_io.as_mut(), num_ops, QCOW_CLUSTER_SIZE, &[iovec]); + + drain_async_completions(async_io.as_mut(), num_ops); + start.elapsed().as_secs_f64() +} From 3804968eeff5243be295dc4d7f28057761368506 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 27 Mar 2026 13:06:01 +0100 Subject: [PATCH 669/742] performance-metrics: Add sparse async QCOW2 tempfile helper Add sparse_qcow_async_tempfile which creates a sparse QCOW2 image with one cluster per L2 table and opens it via QcowDiskAsync. Mirrors the existing sparse_qcow_tempfile for io_uring benchmarks. 
Signed-off-by: Anatol Belski --- performance-metrics/src/util.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/performance-metrics/src/util.rs b/performance-metrics/src/util.rs index 9536173938..3f68fdd7eb 100644 --- a/performance-metrics/src/util.rs +++ b/performance-metrics/src/util.rs @@ -308,6 +308,14 @@ pub fn sparse_qcow_tempfile(num_l2_tables: usize) -> (TempFile, QcowDiskSync) { (tmp, disk) } +/// Sparse QCOW2 opened via QcowDiskAsync. +pub fn sparse_qcow_async_tempfile(num_l2_tables: usize) -> (TempFile, QcowDiskAsync) { + let tmp = create_sparse_qcow_tempfile(num_l2_tables); + let disk = QcowDiskAsync::new(tmp.as_file().try_clone().unwrap(), false, false, true) + .expect("failed to open qcow2 via QcowDiskAsync"); + (tmp, disk) +} + /// Spin and wait until the given eventfd becomes readable. pub fn wait_for_eventfd(notifier: &EventFd) { loop { From 8052c5a66b29d492e0c68d55278f1d50c2fcc8f5 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 27 Mar 2026 13:07:14 +0100 Subject: [PATCH 670/742] performance-metrics: Add QCOW2 async L2 cache miss micro benchmark Add micro_bench_qcow_async_l2_cache_miss which reads one cluster from each of num_ops distinct L2 tables through the QcowAsync io_uring path, forcing L2 cache eviction on nearly every read. 
Signed-off-by: Anatol Belski --- performance-metrics/src/main.rs | 26 +++++++++++++++++++- performance-metrics/src/micro_bench_block.rs | 22 +++++++++++++++++ 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/performance-metrics/src/main.rs b/performance-metrics/src/main.rs index ede10c9538..2e42cc03cf 100644 --- a/performance-metrics/src/main.rs +++ b/performance-metrics/src/main.rs @@ -378,7 +378,7 @@ mod adjuster { } } -const TEST_LIST: [PerformanceTest; 96] = [ +const TEST_LIST: [PerformanceTest; 98] = [ PerformanceTest { name: "boot_time_ms", func_ptr: performance_boot_time, @@ -1661,6 +1661,30 @@ const TEST_LIST: [PerformanceTest; 96] = [ }, unit_adjuster: adjuster::s_to_us, }, + PerformanceTest { + name: "micro_block_qcow_async_l2_cache_miss_128_us", + func_ptr: micro_bench_block::micro_bench_qcow_async_l2_cache_miss, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(128), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_async_l2_cache_miss_256_us", + func_ptr: micro_bench_block::micro_bench_qcow_async_l2_cache_miss, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(256), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, ]; fn run_test_with_timeout( diff --git a/performance-metrics/src/micro_bench_block.rs b/performance-metrics/src/micro_bench_block.rs index 9e0c44c210..f545facda6 100644 --- a/performance-metrics/src/micro_bench_block.rs +++ b/performance-metrics/src/micro_bench_block.rs @@ -527,3 +527,25 @@ pub fn micro_bench_qcow_async_write(control: &PerformanceTestControl) -> f64 { drain_async_completions(async_io.as_mut(), num_ops); start.elapsed().as_secs_f64() } + +/// Read one cluster from each of num_ops distinct L2 tables in a sparse +/// QCOW2 image through the QcowAsync io_uring path. 
+/// +/// Returns the total read wall clock time in seconds. +pub fn micro_bench_qcow_async_l2_cache_miss(control: &PerformanceTestControl) -> f64 { + let num_ops = control.num_ops.expect("num_ops required") as usize; + let (_tmp, disk) = util::sparse_qcow_async_tempfile(num_ops); + let mut async_io = disk + .new_async_io(num_ops as u32) + .expect("new_async_io failed"); + + let mut buf = vec![0u8; QCOW_CLUSTER_SIZE as usize]; + let iovec = read_iovec(&mut buf); + + let stride = L2_ENTRIES_PER_TABLE as u64 * QCOW_CLUSTER_SIZE; + let start = Instant::now(); + submit_reads(async_io.as_mut(), num_ops, stride, &[iovec]); + + drain_async_completions(async_io.as_mut(), num_ops); + start.elapsed().as_secs_f64() +} From 1ab877882ddbe96f5aee108f1881502e79449ef7 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 27 Mar 2026 13:14:37 +0100 Subject: [PATCH 671/742] performance-metrics: Add QCOW2 batch write micro benchmark Add micro_bench_qcow_batch_write which builds a batch of num_ops write requests and submits them all at once through submit_batch_requests. Writes in QcowAsync are synchronous (COW path), so this measures whether batching reduces per-request overhead compared to individual write_vectored calls. 
Signed-off-by: Anatol Belski --- performance-metrics/src/main.rs | 26 +++++++++++- performance-metrics/src/micro_bench_block.rs | 42 ++++++++++++++++++++ 2 files changed, 67 insertions(+), 1 deletion(-) diff --git a/performance-metrics/src/main.rs b/performance-metrics/src/main.rs index 2e42cc03cf..5192a4406b 100644 --- a/performance-metrics/src/main.rs +++ b/performance-metrics/src/main.rs @@ -378,7 +378,7 @@ mod adjuster { } } -const TEST_LIST: [PerformanceTest; 98] = [ +const TEST_LIST: [PerformanceTest; 100] = [ PerformanceTest { name: "boot_time_ms", func_ptr: performance_boot_time, @@ -1685,6 +1685,30 @@ const TEST_LIST: [PerformanceTest; 98] = [ }, unit_adjuster: adjuster::s_to_us, }, + PerformanceTest { + name: "micro_block_qcow_batch_write_128_us", + func_ptr: micro_bench_block::micro_bench_qcow_batch_write, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(128), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_batch_write_256_us", + func_ptr: micro_bench_block::micro_bench_qcow_batch_write, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(256), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, ]; fn run_test_with_timeout( diff --git a/performance-metrics/src/micro_bench_block.rs b/performance-metrics/src/micro_bench_block.rs index f545facda6..aab20308c1 100644 --- a/performance-metrics/src/micro_bench_block.rs +++ b/performance-metrics/src/micro_bench_block.rs @@ -549,3 +549,45 @@ pub fn micro_bench_qcow_async_l2_cache_miss(control: &PerformanceTestControl) -> drain_async_completions(async_io.as_mut(), num_ops); start.elapsed().as_secs_f64() } + +/// Measure QCOW2 batch write submission via io_uring. 
+/// +/// Builds a batch of num_ops write requests and submits them all at once +/// through submit_batch_requests. Writes in QcowAsync are synchronous +/// (COW path), so this measures whether batching reduces per-request +/// overhead compared to individual write_vectored calls. +/// +/// Returns the total wall clock time in seconds. +pub fn micro_bench_qcow_batch_write(control: &PerformanceTestControl) -> f64 { + let num_ops = control.num_ops.expect("num_ops required") as usize; + let (_tmp, disk) = util::empty_qcow_async_tempfile(num_ops); + let mut async_io = disk + .new_async_io(num_ops as u32) + .expect("new_async_io failed"); + + let buf = vec![0xA5u8; num_ops * QCOW_CLUSTER_SIZE as usize]; + + let batch: Vec = (0..num_ops) + .map(|i| { + let slice = &buf[i * QCOW_CLUSTER_SIZE as usize..(i + 1) * QCOW_CLUSTER_SIZE as usize]; + BatchRequest { + offset: (i as u64 * QCOW_CLUSTER_SIZE) as libc::off_t, + iovecs: vec![libc::iovec { + iov_base: slice.as_ptr() as *mut libc::c_void, + iov_len: QCOW_CLUSTER_SIZE as usize, + }] + .into(), + user_data: i as u64, + request_type: RequestType::Out, + } + }) + .collect(); + + let start = Instant::now(); + async_io + .submit_batch_requests(&batch) + .expect("submit_batch_requests failed"); + + drain_async_completions(async_io.as_mut(), num_ops); + start.elapsed().as_secs_f64() +} From e2c51042d325bc776fb917275278d0436295a988 Mon Sep 17 00:00:00 2001 From: Keith Adler Date: Thu, 16 Apr 2026 03:33:25 -0500 Subject: [PATCH 672/742] vmm: preserve underlying errors in vm.rs instead of dropping them Several error mappings in vm.rs dropped the underlying error with map_err(|_| ...), making failures harder to diagnose. Preserve the source error by adding #[source] fields to InitramfsLoad and ErrorNmi. 
- InitramfsLoad: now wraps std::io::Error from seek/rewind operations - ErrorNmi: now wraps cpu::Error from the CPU manager nmi() call Partially addresses #7563 Signed-off-by: Keith Adler --- vmm/src/vm.rs | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index 6b3f6c42e1..6bb088cb0d 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -129,7 +129,13 @@ pub enum Error { UefiLoad(#[source] arch::riscv64::uefi::Error), #[error("Cannot load the initramfs into memory")] - InitramfsLoad, + InitramfsLoad(#[source] std::io::Error), + + #[error("Cannot determine initramfs load address")] + InitramfsAddress(#[source] arch::Error), + + #[error("Cannot read initramfs into guest memory")] + InitramfsRead(#[source] vm_memory::GuestMemoryError), #[error("Cannot load the kernel command line in memory")] LoadCmdLine(#[source] linux_loader::loader::Error), @@ -334,7 +340,7 @@ pub enum Error { IgvmLoad(#[source] igvm_loader::Error), #[error("Error injecting NMI")] - ErrorNmi, + ErrorNmi(#[source] cpu::Error), #[error("Error resuming the VM")] ResumeVm(#[source] hypervisor::HypervisorVmError), @@ -1362,18 +1368,18 @@ impl Vm { let initramfs = self.initramfs.as_mut().unwrap(); let size: usize = initramfs .seek(SeekFrom::End(0)) - .map_err(|_| Error::InitramfsLoad)? + .map_err(Error::InitramfsLoad)? 
.try_into() .unwrap(); - initramfs.rewind().map_err(|_| Error::InitramfsLoad)?; + initramfs.rewind().map_err(Error::InitramfsLoad)?; let address = - arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?; + arch::initramfs_load_addr(guest_mem, size).map_err(Error::InitramfsAddress)?; let address = GuestAddress(address); guest_mem .read_volatile_from(address, initramfs, size) - .map_err(|_| Error::InitramfsLoad)?; + .map_err(Error::InitramfsRead)?; info!("Initramfs loaded: address = 0x{:x}", address.0); Ok(arch::InitramfsConfig { address, size }) @@ -3032,7 +3038,7 @@ impl Vm { .lock() .unwrap() .nmi() - .map_err(|_| Error::ErrorNmi); + .map_err(Error::ErrorNmi); } } From cdfedfaab24e51f034dd7fd37885d0188c07eaa5 Mon Sep 17 00:00:00 2001 From: Max Makarov Date: Thu, 16 Apr 2026 21:05:27 +0000 Subject: [PATCH 673/742] vmm: device_manager: reject duplicate socket in add_user_device Calling vm.add-user-device a second time with a socket path already in use makes the VMM thread block indefinitely inside vfio_user::Client::new(). libvfio-user servers (SPDK, the reference libvfio-user daemon) accept a single active client per socket, so the second connect(2) succeeds at the OS level but the handshake recvmsg(2) waits for a response that never arrives. All subsequent API requests queue behind the stuck VMM event loop and also hang (vm.info, vmm.ping, vm.remove-device). The VM itself keeps running on vcpu threads, making the symptom confusing: the guest looks healthy, only the API is unreachable. This is easy to hit from management software that uses an idempotent reconcile / ensure pattern for user devices. Reject the call up-front when another user_device already has the same socket path, returning an HTTP 500 with a descriptive UserDeviceSocketInUse error in milliseconds instead of hanging. 
Signed-off-by: Max Makarov --- vmm/src/device_manager.rs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 2ea8efe350..97e774f98e 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -657,6 +657,10 @@ pub enum DeviceManagerError { #[error("Invalid identifier: {0}")] InvalidIdentifier(String), + /// vfio-user socket path already in use by another user device. + #[error("vfio-user socket path already in use: {0:?}")] + UserDeviceSocketInUse(std::path::PathBuf), + /// Error activating virtio device #[error("Error activating virtio device")] VirtioActivate(#[source] ActivateError), @@ -4707,6 +4711,17 @@ impl DeviceManager { ) -> DeviceManagerResult { self.validate_identifier(&device_cfg.pci_common.id)?; + // Reject duplicate socket up-front: libvfio-user servers accept a + // single client, so a second Client::new() on the same socket blocks + // indefinitely in the handshake recvmsg() and hangs the VMM thread. + if let Some(existing) = &self.config.lock().unwrap().user_devices + && existing.iter().any(|d| d.socket == device_cfg.socket) + { + return Err(DeviceManagerError::UserDeviceSocketInUse( + device_cfg.socket.clone(), + )); + } + let (bdf, device_name) = self.add_vfio_user_device(device_cfg)?; // Update the PCIU bitmap From 030e63476e6c2dd923a122d4b0760a82fa15ce9d Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Wed, 15 Apr 2026 08:42:32 +0100 Subject: [PATCH 674/742] tests: Reduce explicit sleep time in _test_api_* tests Use new `wait_until()` and existing boot response mechanisms to remove explicit sleeps from these tests. 
Signed-off-by: Rob Bradford --- .../tests/common/tests_wrappers.rs | 36 +++++++++---------- test_infra/src/lib.rs | 11 ++++++ 2 files changed, 28 insertions(+), 19 deletions(-) diff --git a/cloud-hypervisor/tests/common/tests_wrappers.rs b/cloud-hypervisor/tests/common/tests_wrappers.rs index 159a7b0ca8..6c012164da 100644 --- a/cloud-hypervisor/tests/common/tests_wrappers.rs +++ b/cloud-hypervisor/tests/common/tests_wrappers.rs @@ -9,6 +9,7 @@ use std::path::{Path, PathBuf}; use std::string::String; use std::sync::mpsc; use std::thread; +use std::time::Duration; use block::ImageType; use net_util::MacAddr; @@ -28,10 +29,9 @@ pub(crate) fn _test_api_create_boot(target_api: &TargetApi, guest: &Guest) { .spawn() .unwrap(); - thread::sleep(std::time::Duration::new(1, 0)); - - // Verify API server is running - assert!(target_api.remote_command("ping", None)); + // Wait for API server to be ready + assert!(wait_until(Duration::from_secs(5), || target_api + .remote_command("ping", None))); // Create the VM first let request_body = guest.api_create_body(); @@ -68,10 +68,9 @@ pub(crate) fn _test_api_shutdown(target_api: &TargetApi, guest: &Guest) { .spawn() .unwrap(); - thread::sleep(std::time::Duration::new(1, 0)); - - // Verify API server is running - assert!(target_api.remote_command("ping", None)); + // Wait for API server to be ready + assert!(wait_until(Duration::from_secs(5), || target_api + .remote_command("ping", None))); // Create the VM first let request_body = guest.api_create_body(); @@ -98,7 +97,7 @@ pub(crate) fn _test_api_shutdown(target_api: &TargetApi, guest: &Guest) { guest.ssh_command("sudo shutdown -H now").unwrap(); // Wait for the guest to be fully shutdown - thread::sleep(std::time::Duration::new(20, 0)); + assert!(guest.wait_for_ssh_unresponsive(Duration::from_secs(20))); // Then shut it down assert!(target_api.remote_command("shutdown", None)); @@ -129,10 +128,9 @@ pub(crate) fn _test_api_delete(target_api: &TargetApi, guest: &Guest) { .spawn() 
.unwrap(); - thread::sleep(std::time::Duration::new(1, 0)); - - // Verify API server is running - assert!(target_api.remote_command("ping", None)); + // Wait for API server to be ready + assert!(wait_until(Duration::from_secs(5), || target_api + .remote_command("ping", None))); // Create the VM first let request_body = guest.api_create_body(); @@ -159,7 +157,7 @@ pub(crate) fn _test_api_delete(target_api: &TargetApi, guest: &Guest) { guest.ssh_command("sudo shutdown -H now").unwrap(); // Wait for the guest to be fully shutdown - thread::sleep(std::time::Duration::new(20, 0)); + assert!(guest.wait_for_ssh_unresponsive(Duration::from_secs(20))); // Then delete it assert!(target_api.remote_command("delete", None)); @@ -193,10 +191,9 @@ pub(crate) fn _test_api_pause_resume(target_api: &TargetApi, guest: &Guest) { .spawn() .unwrap(); - thread::sleep(std::time::Duration::new(1, 0)); - - // Verify API server is running - assert!(target_api.remote_command("ping", None)); + // Wait for API server to be ready + assert!(wait_until(Duration::from_secs(5), || target_api + .remote_command("ping", None))); // Create the VM first let request_body = guest.api_create_body(); @@ -209,9 +206,10 @@ pub(crate) fn _test_api_pause_resume(target_api: &TargetApi, guest: &Guest) { // Then boot it assert!(target_api.remote_command("boot", None)); - thread::sleep(std::time::Duration::new(20, 0)); let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + // Check that the VM booted as expected guest.validate_cpu_count(None); guest.validate_memory(None); diff --git a/test_infra/src/lib.rs b/test_infra/src/lib.rs index 091fef7c8f..3f7f359804 100644 --- a/test_infra/src/lib.rs +++ b/test_infra/src/lib.rs @@ -1251,6 +1251,17 @@ impl Guest { ) } + /// Waits until the guest's SSH port is no longer reachable, indicating + /// the guest has probably shutdown. 
+ pub fn wait_for_ssh_unresponsive(&self, timeout: Duration) -> bool { + let addr = format!("{}:22", self.network.guest_ip0) + .parse::() + .unwrap(); + wait_until(timeout, || { + std::net::TcpStream::connect_timeout(&addr, Duration::from_secs(2)).is_err() + }) + } + pub fn api_create_body(&self) -> String { let mut body = serde_json::json!({ "cpus": { From 2eafee69bee6eb22ef06875c8e24e5a02a6e9ec7 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Wed, 15 Apr 2026 08:46:35 +0100 Subject: [PATCH 675/742] tests: Remove explicit sleeps from balloon tests Use the new `wait_until()` to test the balloon size. Signed-off-by: Rob Bradford --- cloud-hypervisor/tests/integration.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index cc24cb8ebb..c009a0ed62 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -4752,8 +4752,9 @@ mod common_parallel { // Wait for balloon memory's initialization and check its size. // The virtio-balloon driver might take a few seconds to report the // balloon effective size back to the VMM. - thread::sleep(std::time::Duration::new(20, 0)); - + assert!(wait_until(Duration::from_secs(20), || { + balloon_size(&api_socket) == 2147483648 + })); let orig_balloon = balloon_size(&api_socket); println!("The original balloon memory size is {orig_balloon} bytes"); assert!(orig_balloon == 2147483648); @@ -4766,7 +4767,9 @@ mod common_parallel { // Give some time for the OOM to happen in the guest and be reported // back to the host. 
- thread::sleep(std::time::Duration::new(20, 0)); + assert!(wait_until(Duration::from_secs(20), || { + balloon_size(&api_socket) < 2147483648 + })); // 2nd: check balloon_mem's value to verify balloon has been automatically deflated let deflated_balloon = balloon_size(&api_socket); From 52c24136b5aff750eb89276809c0d05b127e2f0d Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Wed, 15 Apr 2026 08:59:42 +0100 Subject: [PATCH 676/742] tests: Remove explicit sleeps before killing vhost-user daemons If we're about to kill the daemons we don't need to spin waiting for them. Signed-off-by: Rob Bradford --- cloud-hypervisor/tests/common/tests_wrappers.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/cloud-hypervisor/tests/common/tests_wrappers.rs b/cloud-hypervisor/tests/common/tests_wrappers.rs index 6c012164da..036b815f40 100644 --- a/cloud-hypervisor/tests/common/tests_wrappers.rs +++ b/cloud-hypervisor/tests/common/tests_wrappers.rs @@ -663,7 +663,6 @@ pub(crate) fn test_vhost_user_net( kill_child(&mut child); let output = child.wait_with_output().unwrap(); - thread::sleep(std::time::Duration::new(5, 0)); let _ = daemon_child.kill(); let _ = daemon_child.wait(); @@ -806,7 +805,6 @@ pub(crate) fn test_vhost_user_blk( let output = child.wait_with_output().unwrap(); if let Some(mut daemon_child) = daemon_child { - thread::sleep(std::time::Duration::new(5, 0)); let _ = daemon_child.kill(); let _ = daemon_child.wait(); } @@ -876,7 +874,6 @@ pub(crate) fn test_boot_from_vhost_user_blk( let output = child.wait_with_output().unwrap(); if let Some(mut daemon_child) = daemon_child { - thread::sleep(std::time::Duration::new(5, 0)); let _ = daemon_child.kill(); let _ = daemon_child.wait(); } From a1cfbd6f5c8d15500c5932ab369aa89ee31a0dcc Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Wed, 15 Apr 2026 09:07:40 +0100 Subject: [PATCH 677/742] tests: Use `wait_until()` to check for vhost-user socket Rather than use a fixed time to wait for the socket to be opened instead test 
for its existence using `wait_until()`. Signed-off-by: Rob Bradford --- .../tests/common/tests_wrappers.rs | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/cloud-hypervisor/tests/common/tests_wrappers.rs b/cloud-hypervisor/tests/common/tests_wrappers.rs index 036b815f40..bc68cd6b66 100644 --- a/cloud-hypervisor/tests/common/tests_wrappers.rs +++ b/cloud-hypervisor/tests/common/tests_wrappers.rs @@ -569,13 +569,19 @@ pub(crate) fn test_vhost_user_net( if client_mode_daemon { child = ch_command.spawn().unwrap(); - // Make sure the VMM is waiting for the backend to connect - thread::sleep(std::time::Duration::new(10, 0)); + // Wait for the VMM to create the socket before starting the daemon + assert!(wait_until(Duration::from_secs(10), || Path::new( + &vunet_socket_path + ) + .exists())); daemon_child = daemon_command.spawn().unwrap(); } else { daemon_child = daemon_command.spawn().unwrap(); - // Make sure the backend is waiting for the VMM to connect - thread::sleep(std::time::Duration::new(10, 0)); + // Wait for the daemon to create the socket before starting the VMM + assert!(wait_until(Duration::from_secs(10), || Path::new( + &vunet_socket_path + ) + .exists())); child = ch_command.spawn().unwrap(); } @@ -1043,7 +1049,11 @@ pub(crate) fn _test_virtio_fs( prepare_daemon(&guest.tmp_dir, shared_dir.to_str().unwrap()); let r = std::panic::catch_unwind(|| { - thread::sleep(std::time::Duration::new(10, 0)); + // Wait for the daemon socket to be ready + assert!(wait_until(Duration::from_secs(10), || Path::new( + &virtiofsd_socket_path + ) + .exists())); let fs_params = format!( "id=myfs0,socket={},{}{}", virtiofsd_socket_path, From f55a90c170940e8d77b32ef6b95926e91489cc19 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Wed, 15 Apr 2026 09:09:33 +0100 Subject: [PATCH 678/742] tests: Remove explicit sleeps from "liveness" checks The vhost-user tests uses SSH and checking the RAM to test for the liveness of the VM - replace the 
explicit sleep before them with `wait_until()` allowing them to potentially finish earlier. Signed-off-by: Rob Bradford --- .../tests/common/tests_wrappers.rs | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/cloud-hypervisor/tests/common/tests_wrappers.rs b/cloud-hypervisor/tests/common/tests_wrappers.rs index bc68cd6b66..f1f41ecb90 100644 --- a/cloud-hypervisor/tests/common/tests_wrappers.rs +++ b/cloud-hypervisor/tests/common/tests_wrappers.rs @@ -657,12 +657,13 @@ pub(crate) fn test_vhost_user_net( let desired_ram = 1024 << 20; resize_command(&api_socket, None, Some(desired_ram), None, None); - thread::sleep(std::time::Duration::new(10, 0)); - // Here by simply checking the size (through ssh), we validate // the connection is still working, which means vhost-user-net // keeps working after the resize. - assert!(guest.get_total_memory().unwrap_or_default() > 960_000); + assert!(wait_until(Duration::from_secs(10), || guest + .get_total_memory() + .unwrap_or_default() + > 960_000)); } }); @@ -790,9 +791,10 @@ pub(crate) fn test_vhost_user_blk( let desired_ram = 1024 << 20; resize_command(&api_socket, None, Some(desired_ram), None, None); - thread::sleep(std::time::Duration::new(10, 0)); - - assert!(guest.get_total_memory().unwrap_or_default() > 960_000); + assert!(wait_until(Duration::from_secs(10), || guest + .get_total_memory() + .unwrap_or_default() + > 960_000)); // Check again the content of the block device after the resize // has been performed. @@ -1025,8 +1027,10 @@ pub(crate) fn _test_virtio_fs( let desired_ram = 1024 << 20; resize_command(&api_socket, None, Some(desired_ram), None, None); - thread::sleep(std::time::Duration::new(30, 0)); - assert!(guest.get_total_memory().unwrap_or_default() > 960_000); + assert!(wait_until(Duration::from_secs(30), || guest + .get_total_memory() + .unwrap_or_default() + > 960_000)); // After the resize, check again that file1 exists and its // content is "foo". 
From 01decd964f224b7fd626eab639652204eecbe318 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Wed, 15 Apr 2026 09:16:37 +0100 Subject: [PATCH 679/742] tests: Remove explicit sleep from tests_simple_launch tests On the shutdown path remove the explicit sleep and instead wait for the event to be delivered. Signed-off-by: Rob Bradford --- cloud-hypervisor/tests/common/tests_wrappers.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cloud-hypervisor/tests/common/tests_wrappers.rs b/cloud-hypervisor/tests/common/tests_wrappers.rs index f1f41ecb90..4d2b452845 100644 --- a/cloud-hypervisor/tests/common/tests_wrappers.rs +++ b/cloud-hypervisor/tests/common/tests_wrappers.rs @@ -1604,7 +1604,6 @@ pub(crate) fn _test_simple_launch(guest: &Guest) { let _ = guest.ssh_command("sudo systemctl stop snapd"); guest.ssh_command("sudo poweroff").unwrap(); - thread::sleep(std::time::Duration::new(20, 0)); let latest_events = [ &MetaEvent { event: "shutdown".to_string(), @@ -1619,7 +1618,9 @@ pub(crate) fn _test_simple_launch(guest: &Guest) { device_id: None, }, ]; - assert!(check_latest_events_exact(&latest_events, &event_path)); + assert!(wait_until(Duration::from_secs(20), || { + check_latest_events_exact(&latest_events, &event_path) + })); }); kill_child(&mut child); From 78224baac6e59af77bbb69f3d09ad7bb56afde21 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Wed, 15 Apr 2026 11:30:55 +0100 Subject: [PATCH 680/742] tests: Remove explicit sleeps from virtio-fs tests Instead use `wait_until()` for mounting of the filesystem. 
Signed-off-by: Rob Bradford --- .../tests/common/tests_wrappers.rs | 30 ++++++++++++++----- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/cloud-hypervisor/tests/common/tests_wrappers.rs b/cloud-hypervisor/tests/common/tests_wrappers.rs index 4d2b452845..05a5ca0d4d 100644 --- a/cloud-hypervisor/tests/common/tests_wrappers.rs +++ b/cloud-hypervisor/tests/common/tests_wrappers.rs @@ -906,6 +906,7 @@ pub(crate) fn _test_virtio_fs( let disk_config = UbuntuDiskConfig::new(focal_image); let guest = Guest::new(Box::new(disk_config)); let api_socket = temp_api_path(&guest.tmp_dir); + let event_path = temp_event_monitor_path(&guest.tmp_dir); let mut workload_path = dirs::home_dir().unwrap(); workload_path.push("workloads"); @@ -933,7 +934,8 @@ pub(crate) fn _test_virtio_fs( .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .default_disks() .default_net() - .args(["--api-socket", &api_socket]); + .args(["--api-socket", &api_socket]) + .args(["--event-monitor", format!("path={event_path}").as_str()]); if pci_segment.is_some() { guest_command.args([ "--platform", @@ -993,13 +995,14 @@ pub(crate) fn _test_virtio_fs( .contains("{\"id\":\"myfs0\",\"bdf\":\"0000:00:06.0\"}") ); } - - thread::sleep(std::time::Duration::new(10, 0)); } // Mount shared directory through virtio_fs filesystem guest - .ssh_command("mkdir -p mount_dir && sudo mount -t virtiofs myfs mount_dir/") + .wait_for_ssh_command( + "mkdir -p mount_dir && sudo mount -t virtiofs myfs mount_dir/", + Duration::from_secs(10), + ) .unwrap(); // Check file1 exists and its content is "foo" @@ -1044,11 +1047,19 @@ pub(crate) fn _test_virtio_fs( // Remove from VM guest.ssh_command("sudo umount mount_dir").unwrap(); assert!(remote_command(&api_socket, "remove-device", Some("myfs0"))); + + // Wait for the device to be fully removed before re-adding + let removed_event = MetaEvent { + event: "device-removed".to_string(), + device_id: Some("myfs0".to_string()), + }; + assert!(wait_until(Duration::from_secs(10), || { 
+ check_sequential_events(&[&removed_event], &event_path) + })); } }); let (r, hotplug_daemon_child) = if r.is_ok() && hotplug { - thread::sleep(std::time::Duration::new(10, 0)); let (daemon_child, virtiofsd_socket_path) = prepare_daemon(&guest.tmp_dir, shared_dir.to_str().unwrap()); @@ -1088,10 +1099,13 @@ pub(crate) fn _test_virtio_fs( ); } - thread::sleep(std::time::Duration::new(10, 0)); - // Mount shared directory through virtio_fs filesystem + // Mount shared directory through virtio_fs filesystem, retrying + // until the hotplugged device is recognized by the guest guest - .ssh_command("mkdir -p mount_dir && sudo mount -t virtiofs myfs mount_dir/") + .wait_for_ssh_command( + "mkdir -p mount_dir && sudo mount -t virtiofs myfs mount_dir/", + Duration::from_secs(10), + ) .unwrap(); // Check file1 exists and its content is "foo" From 9ca6d4ca41620dc9e83395706314ad7074220931 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Wed, 15 Apr 2026 11:33:15 +0100 Subject: [PATCH 681/742] tests: Remove explicit sleeps from block tests Use `wait_until()` with the SSH command for detecting if the block device is present/absent as part of hotplugging/unplugging. Signed-off-by: Rob Bradford --- .../tests/common/tests_wrappers.rs | 86 ++++++------------- 1 file changed, 24 insertions(+), 62 deletions(-) diff --git a/cloud-hypervisor/tests/common/tests_wrappers.rs b/cloud-hypervisor/tests/common/tests_wrappers.rs index 05a5ca0d4d..8094aa8223 100644 --- a/cloud-hypervisor/tests/common/tests_wrappers.rs +++ b/cloud-hypervisor/tests/common/tests_wrappers.rs @@ -2714,18 +2714,12 @@ pub(crate) fn _test_disk_hotplug(guest: &Guest, landlock_enabled: bool) { .contains("{\"id\":\"test0\",\"bdf\":\"0000:00:06.0\"}") ); - thread::sleep(std::time::Duration::new(10, 0)); - - // Check that /dev/vdc exists and the block size is 16M. 
- assert_eq!( + // Wait for the hotplugged disk to appear in the guest + assert!(wait_until(Duration::from_secs(10), || { guest .ssh_command("lsblk | grep vdc | grep -c 16M") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 1 - ); + .is_ok_and(|s| s.trim().parse::().unwrap_or_default() == 1) + })); // And check the block device can be read. guest .ssh_command("sudo dd if=/dev/vdc of=/dev/null bs=1M iflag=direct count=16") @@ -2733,17 +2727,10 @@ pub(crate) fn _test_disk_hotplug(guest: &Guest, landlock_enabled: bool) { // Let's remove it the extra disk. assert!(remote_command(&api_socket, "remove-device", Some("test0"))); - thread::sleep(std::time::Duration::new(5, 0)); - // And check /dev/vdc is not there - assert_eq!( - guest - .ssh_command("lsblk | grep -c vdc.*16M || true") - .unwrap() - .trim() - .parse::() - .unwrap_or(1), - 0 - ); + // Wait for the disk to disappear + assert!(wait_until(Duration::from_secs(10), || guest + .ssh_command("lsblk | grep -c vdc.*16M || true") + .is_ok_and(|s| s.trim().parse::().unwrap_or(1) == 0))); // And add it back to validate unplug did work correctly. let (cmd_success, cmd_output) = remote_command_w_output( @@ -2763,18 +2750,12 @@ pub(crate) fn _test_disk_hotplug(guest: &Guest, landlock_enabled: bool) { .contains("{\"id\":\"test0\",\"bdf\":\"0000:00:06.0\"}") ); - thread::sleep(std::time::Duration::new(10, 0)); - - // Check that /dev/vdc exists and the block size is 16M. - assert_eq!( + // Wait for the hotplugged disk to appear in the guest + assert!(wait_until(Duration::from_secs(10), || { guest .ssh_command("lsblk | grep vdc | grep -c 16M") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 1 - ); + .is_ok_and(|s| s.trim().parse::().unwrap_or_default() == 1) + })); // And check the block device can be read. 
guest .ssh_command("sudo dd if=/dev/vdc of=/dev/null bs=1M iflag=direct count=16") @@ -2796,18 +2777,10 @@ pub(crate) fn _test_disk_hotplug(guest: &Guest, landlock_enabled: bool) { assert!(remote_command(&api_socket, "remove-device", Some("test0"))); - thread::sleep(std::time::Duration::new(20, 0)); - - // Check device has gone away - assert_eq!( - guest - .ssh_command("lsblk | grep -c vdc.*16M || true") - .unwrap() - .trim() - .parse::() - .unwrap_or(1), - 0 - ); + // Wait for the disk to disappear + assert!(wait_until(Duration::from_secs(20), || guest + .ssh_command("lsblk | grep -c vdc.*16M || true") + .is_ok_and(|s| s.trim().parse::().unwrap_or(1) == 0))); guest.reboot_linux(1); @@ -3483,14 +3456,10 @@ pub(crate) fn _test_vdpa_block(guest: &Guest) { .contains("{\"id\":\"myvdpa0\",\"bdf\":\"0001:00:01.0\"}") ); - thread::sleep(std::time::Duration::new(10, 0)); - - // Check IOMMU setup - assert!( - guest - .does_device_vendor_pair_match("0x1057", "0x1af4") - .unwrap_or_default() - ); + // Wait for the hotplugged device to appear + assert!(wait_until(Duration::from_secs(10), || guest + .does_device_vendor_pair_match("0x1057", "0x1af4") + .unwrap_or_default())); assert!( guest .ssh_command("ls /sys/kernel/iommu_groups/*/devices") @@ -3523,18 +3492,11 @@ pub(crate) fn _test_vdpa_block(guest: &Guest) { // Unplug the device let cmd_success = remote_command(&api_socket, "remove-device", Some("myvdpa0")); assert!(cmd_success); - thread::sleep(std::time::Duration::new(10, 0)); - // Check /dev/vdd doesn't exist anymore - assert_eq!( - guest - .ssh_command("lsblk | grep -c vdd || true") - .unwrap() - .trim() - .parse::() - .unwrap_or(1), - 0 - ); + // Wait for the device to disappear + assert!(wait_until(Duration::from_secs(10), || guest + .ssh_command("lsblk | grep -c vdd || true") + .is_ok_and(|s| s.trim().parse::().unwrap_or(1) == 0))); }); kill_child(&mut child); From 0107675eb1116f24747d1278895b8f8ae2e8bf34 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Wed, 15 Apr 
2026 11:33:15 +0100 Subject: [PATCH 682/742] tests: Remove explicit sleeps from net tests Use `wait_until()` with the SSH command for detecting if the net device is present/absent as part of hotplugging/unplugging. Signed-off-by: Rob Bradford --- .../tests/common/tests_wrappers.rs | 46 ++++++++----------- 1 file changed, 20 insertions(+), 26 deletions(-) diff --git a/cloud-hypervisor/tests/common/tests_wrappers.rs b/cloud-hypervisor/tests/common/tests_wrappers.rs index 8094aa8223..f42fce9029 100644 --- a/cloud-hypervisor/tests/common/tests_wrappers.rs +++ b/cloud-hypervisor/tests/common/tests_wrappers.rs @@ -2928,18 +2928,12 @@ pub(crate) fn _test_net_hotplug( ); } - thread::sleep(std::time::Duration::new(5, 0)); - - // 2 network interfaces + default localhost ==> 3 interfaces - assert_eq!( + // Wait for the hotplugged network interface to appear + assert!(wait_until(Duration::from_secs(10), || { guest .ssh_command("ip -o link | wc -l") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 3 - ); + .is_ok_and(|s| s.trim().parse::().unwrap_or_default() == 3) + })); // Test the same using the added network interface's IP assert_eq!( @@ -2956,9 +2950,13 @@ pub(crate) fn _test_net_hotplug( 3 ); - // Remove network + // Remove network and wait for it to disappear assert!(remote_command(&api_socket, "remove-device", Some("test0"),)); - thread::sleep(std::time::Duration::new(5, 0)); + assert!(wait_until(Duration::from_secs(10), || { + guest + .ssh_command("ip -o link | wc -l") + .is_ok_and(|s| s.trim().parse::().unwrap_or_default() == 2) + })); // Add network let (cmd_success, cmd_output) = remote_command_w_output( @@ -2991,18 +2989,12 @@ pub(crate) fn _test_net_hotplug( ); } - thread::sleep(std::time::Duration::new(5, 0)); - - // 2 network interfaces + default localhost ==> 3 interfaces - assert_eq!( + // Wait for the hotplugged network interface to appear + assert!(wait_until(Duration::from_secs(10), || { guest .ssh_command("ip -o link | wc -l") - .unwrap() - 
.trim() - .parse::() - .unwrap_or_default(), - 3 - ); + .is_ok_and(|s| s.trim().parse::().unwrap_or_default() == 3) + })); guest.reboot_linux(0); @@ -3345,10 +3337,12 @@ pub(crate) fn _test_macvtap( let mut child = guest_command.capture_output().spawn().unwrap(); if hotplug { - // Give some time to the VMM process to listen to the API - // socket. This is the only requirement to avoid the following - // call to ch-remote from failing. - thread::sleep(std::time::Duration::new(10, 0)); + // Wait for the VMM process to listen to the API socket + assert!(wait_until(Duration::from_secs(10), || remote_command( + &api_socket, + "ping", + None + ))); // Hotplug the virtio-net device let (cmd_success, cmd_output) = remote_command_w_output(&api_socket, "add-net", Some(&net_params)); From 1269475c2aa8894c7a594045e118b5cdcb6b5b7e Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Wed, 15 Apr 2026 11:44:03 +0100 Subject: [PATCH 683/742] tests: Remove explicit sleeps from pvpanic test Instead wait for the event to be delivered that it has panicked. 
Signed-off-by: Rob Bradford --- cloud-hypervisor/tests/common/tests_wrappers.rs | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/cloud-hypervisor/tests/common/tests_wrappers.rs b/cloud-hypervisor/tests/common/tests_wrappers.rs index f42fce9029..6479f2e2a6 100644 --- a/cloud-hypervisor/tests/common/tests_wrappers.rs +++ b/cloud-hypervisor/tests/common/tests_wrappers.rs @@ -3165,17 +3165,14 @@ pub(crate) fn _test_pvpanic(guest: &Guest) { // Trigger guest a panic make_guest_panic(guest); - // Wait a while for guest - thread::sleep(std::time::Duration::new(10, 0)); - + // Wait for the panic event to be recorded let expected_sequential_events = [&MetaEvent { event: "panic".to_string(), device_id: None, }]; - assert!(check_latest_events_exact( - &expected_sequential_events, - &event_path - )); + assert!(wait_until(Duration::from_secs(10), || { + check_latest_events_exact(&expected_sequential_events, &event_path) + })); }); kill_child(&mut child); From 43b5a474f9422f79b5df824e16e763f41d023931 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Wed, 15 Apr 2026 11:45:03 +0100 Subject: [PATCH 684/742] tests: Remove explicit sleep from test_api_dbus_and_http_interleaved Instead wait for the guest to stop responding on the SSH port. 
Signed-off-by: Rob Bradford --- cloud-hypervisor/tests/integration.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index c009a0ed62..508e5e93dc 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -5753,7 +5753,7 @@ mod dbus_api { guest.ssh_command("sudo shutdown -H now").unwrap(); // Wait for the guest to be fully shutdown - thread::sleep(std::time::Duration::new(20, 0)); + assert!(guest.wait_for_ssh_unresponsive(Duration::from_secs(20))); // Then shutdown the VM assert!(dbus_api.remote_command("shutdown", None)); From afd155d578e0205a5beab94fe3860ca1238a876e Mon Sep 17 00:00:00 2001 From: Pascal Scholz Date: Mon, 16 Mar 2026 15:26:12 +0100 Subject: [PATCH 685/742] pci: Refactor bus.rs to better fit a PCI bus's semantics This commit refactors the PCI bus struct. It has two major focuses. First, we change the type of `device_ids` in `PciBus` to an array. A fixed-size array better reflects real PCI bus constraints, especially its limited number of PCI devices. Moreover, it can't be grown accidentally. The second focus is changing the type of the key of `devices` in `PciBus` to `u8`, since device IDs are not allowed to exceed 31. We furthermore replace magic numbers with constants and make them publicly available so we can use them in a follow-up change when parsing user input. 
Signed-off-by: Pascal Scholz On-behalf-of: SAP pascal.scholz@sap.com --- pci/src/bus.rs | 32 ++++++++++++++++++-------------- pci/src/lib.rs | 4 +++- vmm/src/device_manager.rs | 2 +- 3 files changed, 22 insertions(+), 16 deletions(-) diff --git a/pci/src/bus.rs b/pci/src/bus.rs index 1fa7bd866a..57e71551b4 100644 --- a/pci/src/bus.rs +++ b/pci/src/bus.rs @@ -20,9 +20,13 @@ use crate::configuration::{ }; use crate::device::{BarReprogrammingParams, DeviceRelocation, Error as PciDeviceError, PciDevice}; +/// Denotes the PCI device ID of a bus' root bridge device. +pub const PCI_ROOT_DEVICE_ID: u8 = 0; +/// Denotes the maximum number of PCI devices allowed on a bus. 32 per PCI spec. +pub const NUM_DEVICE_IDS: u8 = 32; + const VENDOR_ID_INTEL: u16 = 0x8086; const DEVICE_ID_INTEL_VIRT_PCIE_HOST: u16 = 0x0d57; -const NUM_DEVICE_IDS: usize = 32; /// Errors for device manager. #[derive(Error, Debug)] @@ -113,18 +117,18 @@ impl PciDevice for PciRoot { pub struct PciBus { /// Devices attached to this bus. /// Device 0 is host bridge. 
- devices: HashMap>>, + devices: HashMap>>, device_reloc: Arc, - device_ids: Vec, + device_ids: [bool; NUM_DEVICE_IDS as usize], } impl PciBus { pub fn new(pci_root: PciRoot, device_reloc: Arc) -> Self { - let mut devices: HashMap>> = HashMap::new(); - let mut device_ids: Vec = vec![false; NUM_DEVICE_IDS]; + let mut devices: HashMap>> = HashMap::new(); + let mut device_ids = [false; NUM_DEVICE_IDS as usize]; - devices.insert(0, Arc::new(Mutex::new(pci_root))); - device_ids[0] = true; + devices.insert(PCI_ROOT_DEVICE_ID, Arc::new(Mutex::new(pci_root))); + device_ids[PCI_ROOT_DEVICE_ID as usize] = true; PciBus { devices, @@ -158,7 +162,7 @@ impl PciBus { Ok(()) } - pub fn add_device(&mut self, device_id: u32, device: Arc>) -> Result<()> { + pub fn add_device(&mut self, device_id: u8, device: Arc>) -> Result<()> { self.devices.insert(device_id, device); Ok(()) } @@ -180,7 +184,7 @@ impl PciBus { } pub fn get_device_id(&mut self, id: usize) -> Result<()> { - if id < NUM_DEVICE_IDS { + if id < NUM_DEVICE_IDS as usize { if self.device_ids[id] { Err(PciRootError::AlreadyInUsePciDeviceSlot(id)) } else { @@ -193,7 +197,7 @@ impl PciBus { } pub fn put_device_id(&mut self, id: usize) -> Result<()> { - if id < NUM_DEVICE_IDS { + if id < NUM_DEVICE_IDS as usize { self.device_ids[id] = false; Ok(()) } else { @@ -240,7 +244,7 @@ impl PciConfigIo { .lock() .unwrap() .devices - .get(&(device as u32)) + .get(&(device as u8)) .map_or(0xffff_ffff, |d| { d.lock().unwrap().read_config_register(register) }) @@ -265,7 +269,7 @@ impl PciConfigIo { } let pci_bus = self.pci_bus.as_ref().lock().unwrap(); - if let Some(d) = pci_bus.devices.get(&(device as u32)) { + if let Some(d) = pci_bus.devices.get(&(device as u8)) { let mut device = d.lock().unwrap(); // Update the register value @@ -376,7 +380,7 @@ impl PciConfigMmio { .lock() .unwrap() .devices - .get(&(device as u32)) + .get(&(device as u8)) .map_or(0xffff_ffff, |d| { d.lock().unwrap().read_config_register(register) }) @@ -395,7 +399,7 
@@ impl PciConfigMmio { } let pci_bus = self.pci_bus.lock().unwrap(); - if let Some(d) = pci_bus.devices.get(&(device as u32)) { + if let Some(d) = pci_bus.devices.get(&(device as u8)) { let mut device = d.lock().unwrap(); // Update the register value diff --git a/pci/src/lib.rs b/pci/src/lib.rs index 17c3ab7235..c5bba16d29 100644 --- a/pci/src/lib.rs +++ b/pci/src/lib.rs @@ -21,7 +21,9 @@ use std::str::FromStr; use serde::de::Visitor; -pub use self::bus::{PciBus, PciConfigIo, PciConfigMmio, PciRoot, PciRootError}; +pub use self::bus::{ + NUM_DEVICE_IDS, PCI_ROOT_DEVICE_ID, PciBus, PciConfigIo, PciConfigMmio, PciRoot, PciRootError, +}; pub use self::configuration::{ PCI_CONFIGURATION_ID, PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciCapability, PciCapabilityId, PciClassCode, PciConfiguration, PciExpressCapabilityId, PciHeaderType, diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 97e774f98e..7f96e1d080 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -4125,7 +4125,7 @@ impl DeviceManager { .unwrap(); pci_bus - .add_device(bdf.device() as u32, pci_device) + .add_device(bdf.device(), pci_device) .map_err(DeviceManagerError::AddPciDevice)?; self.bus_devices.push(Arc::clone(&bus_device)); From 34f08002e16f4900e2d21e94cca3e47e25b8c418 Mon Sep 17 00:00:00 2001 From: Pascal Scholz Date: Thu, 26 Mar 2026 14:16:06 +0100 Subject: [PATCH 686/742] vmm: Allow for device ID allocation on a segment Allocating a device ID is crucial for assigning a specific ID to a device. We need this to implement configurable PCI device ID. 
Signed-off-by: Pascal Scholz On-behalf-of: SAP pascal.scholz@sap.com Signed-off-by: Rob Bradford --- pci/src/bus.rs | 156 +++++++++++++++++++++++++++++++++++--- vmm/src/device_manager.rs | 5 +- vmm/src/pci_segment.rs | 13 +++- 3 files changed, 160 insertions(+), 14 deletions(-) diff --git a/pci/src/bus.rs b/pci/src/bus.rs index 57e71551b4..bd0abec303 100644 --- a/pci/src/bus.rs +++ b/pci/src/bus.rs @@ -47,10 +47,10 @@ pub enum PciRootError { #[error("Could not find an available device slot on the PCI bus")] NoPciDeviceSlotAvailable, /// Invalid PCI device identifier provided. - #[error("Invalid PCI device identifier provided")] + #[error("Invalid PCI device identifier provided: {0}")] InvalidPciDeviceSlot(usize), /// Valid PCI device identifier but already used. - #[error("Valid PCI device identifier but already used")] + #[error("Valid PCI device identifier but already used: {0}")] AlreadyInUsePciDeviceSlot(usize), } pub type Result = std::result::Result; @@ -172,15 +172,42 @@ impl PciBus { Ok(()) } - pub fn next_device_id(&mut self) -> Result { - for (idx, device_id) in self.device_ids.iter_mut().enumerate() { - if !(*device_id) { - *device_id = true; - return Ok(idx as u32); + /// Allocates a PCI device ID on the bus. + /// + /// - `id`: ID to allocate on the bus. If [`None`], the next free + /// device ID on the bus is allocated, else the ID given is + /// allocated + /// + /// ## Errors + /// * Returns [`PciRootError::AlreadyInUsePciDeviceSlot`] in case + /// the ID requested is already allocated. + /// * Returns [`PciRootError::InvalidPciDeviceSlot`] in case the + /// requested ID exceeds the maximum number of devices allowed per + /// bus (see [`NUM_DEVICE_IDS`]). + /// * If `id` is [`None`]: Returns + /// [`PciRootError::NoPciDeviceSlotAvailable`] if no free device + /// slot is available on the bus. 
+ pub fn allocate_device_id(&mut self, id: Option) -> Result { + if let Some(idx) = id.map(|i| i as usize) { + if idx < NUM_DEVICE_IDS as usize { + if self.device_ids[idx] { + Err(PciRootError::AlreadyInUsePciDeviceSlot(idx)) + } else { + self.device_ids[idx] = true; + Ok(idx as u8) + } + } else { + Err(PciRootError::InvalidPciDeviceSlot(idx)) + } + } else { + for (idx, device_id) in self.device_ids.iter_mut().enumerate() { + if !(*device_id) { + *device_id = true; + return Ok(idx as u8); + } } + Err(PciRootError::NoPciDeviceSlotAvailable) } - - Err(PciRootError::NoPciDeviceSlotAvailable) } pub fn get_device_id(&mut self, id: usize) -> Result<()> { @@ -496,3 +523,114 @@ fn parse_io_config_address(config_address: u32) -> (usize, usize, usize, usize) shift_and_mask(config_address, REGISTER_NUMBER_OFFSET, REGISTER_NUMBER_MASK), ) } + +#[cfg(test)] +mod unit_tests { + use std::error::Error; + use std::result::Result; + + use super::*; + + #[derive(Debug)] + /// Helper struct that mocks the implementation of DeviceRelocation + struct MockDeviceRelocation; + + impl DeviceRelocation for MockDeviceRelocation { + fn move_bar( + &self, + _old_base: u64, + _new_base: u64, + _len: u64, + _pci_dev: &mut dyn PciDevice, + _region_type: PciBarRegionType, + ) -> Result<(), std::io::Error> { + Ok(()) + } + } + + fn setup_bus() -> PciBus { + let pci_root = PciRoot::new(None); + let mock_device_reloc = Arc::new(MockDeviceRelocation {}); + PciBus::new(pci_root, mock_device_reloc) + } + + #[test] + // Test to acquire all IDs that can be acquired + fn allocate_device_id_next_free() { + // The first address is occupied by the root + let mut bus = setup_bus(); + for expected_id in 1..NUM_DEVICE_IDS { + assert_eq!(expected_id, bus.allocate_device_id(None).unwrap()); + } + } + + #[test] + // Test that requesting specific ID work + fn allocate_device_id_request_id() -> Result<(), Box> { + // The first address is occupied by the root + let mut bus = setup_bus(); + let max_id = NUM_DEVICE_IDS - 
1; + assert_eq!(0x01_u8, bus.allocate_device_id(Some(0x01))?); + assert_eq!(0x10_u8, bus.allocate_device_id(Some(0x10))?); + assert_eq!(max_id, bus.allocate_device_id(Some(max_id))?); + Ok(()) + } + + #[test] + // Test that gaps resulting from explicit allocations are filled by implicit ones, + // beginning with the first free slot + fn allocate_device_id_fills_gaps() -> Result<(), Box> { + // The first address is occupied by the root + let mut bus = setup_bus(); + assert_eq!(0x01_u8, bus.allocate_device_id(Some(0x01))?); + assert_eq!(0x03_u8, bus.allocate_device_id(Some(0x03))?); + assert_eq!(0x06_u8, bus.allocate_device_id(Some(0x06))?); + assert_eq!(0x02_u8, bus.allocate_device_id(None)?); + assert_eq!(0x04_u8, bus.allocate_device_id(None)?); + assert_eq!(0x05_u8, bus.allocate_device_id(None)?); + assert_eq!(0x07_u8, bus.allocate_device_id(None)?); + Ok(()) + } + + #[test] + // Test that requesting the same ID twice fails + fn allocate_device_id_request_id_twice_fails() -> Result<(), Box> { + let mut bus = setup_bus(); + let max_id = NUM_DEVICE_IDS - 1; + bus.allocate_device_id(Some(max_id))?; + let result = bus.allocate_device_id(Some(max_id)); + assert!(matches!( + result, + Err(PciRootError::AlreadyInUsePciDeviceSlot(x)) if x == usize::from(max_id), + )); + Ok(()) + } + + #[test] + // Test to request an invalid ID + fn allocate_device_id_request_invalid_id_fails() -> Result<(), Box> { + let mut bus = setup_bus(); + let max_id = NUM_DEVICE_IDS + 1; + let result = bus.allocate_device_id(Some(max_id)); + assert!(matches!( + result, + Err(PciRootError::InvalidPciDeviceSlot(x)) if x == usize::from(max_id), + )); + Ok(()) + } + + #[test] + // Test to acquire an ID when all IDs were already acquired + fn allocate_device_id_none_left() { + // The first address is occupied by the root + let mut bus = setup_bus(); + for expected_id in 1..NUM_DEVICE_IDS { + assert_eq!(expected_id, bus.allocate_device_id(None).unwrap()); + } + let result = bus.allocate_device_id(None); + 
assert!(matches!( + result, + Err(PciRootError::NoPciDeviceSlotAvailable), + )); + } +} diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 7f96e1d080..1499f4d73b 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -494,7 +494,7 @@ pub enum DeviceManagerError { /// Failed to find an available PCI device ID. #[error("Failed to find an available PCI device ID")] - NextPciDeviceId(#[source] pci::PciRootError), + AllocatePciDeviceId(#[source] pci::PciRootError), /// Could not reserve the PCI device ID. #[error("Could not reserve the PCI device ID")] @@ -4555,7 +4555,8 @@ impl DeviceManager { (pci_segment_id, pci_device_bdf, resources) } else { - let pci_device_bdf = self.pci_segments[pci_segment_id as usize].next_device_bdf()?; + let pci_device_bdf = + self.pci_segments[pci_segment_id as usize].allocate_device_id(None)?; (pci_segment_id, pci_device_bdf, None) }) diff --git a/vmm/src/pci_segment.rs b/vmm/src/pci_segment.rs index 81f11063ee..8ed03c3e26 100644 --- a/vmm/src/pci_segment.rs +++ b/vmm/src/pci_segment.rs @@ -164,15 +164,22 @@ impl PciSegment { ) } - pub(crate) fn next_device_bdf(&self) -> DeviceManagerResult { + /// Allocates a device's ID on this PCI segment. + /// + /// - `device_id`: Device ID to request for allocation + /// + /// ## Errors + /// * [`DeviceManagerError::AllocatePciDeviceId`] if device ID + /// allocation on the bus fails. + pub(crate) fn allocate_device_id(&self, device_id: Option) -> DeviceManagerResult { Ok(PciBdf::new( self.id, 0, self.pci_bus .lock() .unwrap() - .next_device_id() - .map_err(DeviceManagerError::NextPciDeviceId)? 
as u8, + .allocate_device_id(device_id) + .map_err(DeviceManagerError::AllocatePciDeviceId)?, 0, )) } From 93c17cb29156fb61be57395f360a12a89f95ed4b Mon Sep 17 00:00:00 2001 From: Pascal Scholz Date: Thu, 26 Mar 2026 14:04:58 +0100 Subject: [PATCH 687/742] vmm: Add tests for `allocate_device_id` in `PciSegment` In addition to tests for `allocate_device_id`, we introduce a new constructor `new_without_address_manager`, only available in the test build. As there is no way to instantiate an `AddressManager` in the tests, we use this constructor to work around this. Signed-off-by: Pascal Scholz On-behalf-of: SAP pascal.scholz@sap.com Signed-off-by: Rob Bradford --- vmm/src/pci_segment.rs | 157 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 157 insertions(+) diff --git a/vmm/src/pci_segment.rs b/vmm/src/pci_segment.rs index 8ed03c3e26..35d9e1fef5 100644 --- a/vmm/src/pci_segment.rs +++ b/vmm/src/pci_segment.rs @@ -209,6 +209,65 @@ impl PciSegment { Ok(()) } + + #[cfg(test)] + /// Creates a PciSegment without the need for an [`AddressManager`] + /// for testing purposes. + /// + /// An [`AddressManager`] would otherwise be required to create + /// [`PciBus`] instances. Instead, we use any struct that implements + /// [`DeviceRelocation`] to instantiate a [`PciBus`].
+ pub(crate) fn new_without_address_manager( + id: u16, + numa_node: u32, + mem32_allocator: Arc>, + mem64_allocator: Arc>, + pci_irq_slots: &[u8; 32], + device_reloc: &Arc, + ) -> DeviceManagerResult { + let pci_root = PciRoot::new(None); + let pci_bus = Arc::new(Mutex::new(PciBus::new(pci_root, device_reloc.clone()))); + + let pci_config_mmio = Arc::new(Mutex::new(PciConfigMmio::new(Arc::clone(&pci_bus)))); + let mmio_config_address = + layout::PCI_MMCONFIG_START.0 + layout::PCI_MMIO_CONFIG_SIZE_PER_SEGMENT * id as u64; + + let start_of_mem32_area = mem32_allocator.lock().unwrap().base().0; + let end_of_mem32_area = mem32_allocator.lock().unwrap().end().0; + + let start_of_mem64_area = mem64_allocator.lock().unwrap().base().0; + let end_of_mem64_area = mem64_allocator.lock().unwrap().end().0; + + let segment = PciSegment { + id, + pci_bus, + pci_config_mmio, + mmio_config_address, + proximity_domain: numa_node, + pci_devices_up: 0, + pci_devices_down: 0, + #[cfg(target_arch = "x86_64")] + pci_config_io: None, + mem32_allocator, + mem64_allocator, + start_of_mem32_area, + end_of_mem32_area, + start_of_mem64_area, + end_of_mem64_area, + pci_irq_slots: *pci_irq_slots, + }; + + info!( + "Adding PCI segment: id={}, PCI MMIO config address: 0x{:x}, mem32 area [0x{:x}-0x{:x}], mem64 area [0x{:x}-0x{:x}]", + segment.id, + segment.mmio_config_address, + segment.start_of_mem32_area, + segment.end_of_mem32_area, + segment.start_of_mem64_area, + segment.end_of_mem64_area + ); + Ok(segment) + } } struct PciDevSlot { @@ -481,3 +540,101 @@ impl Aml for PciSegment { .to_aml_bytes(sink); } } + +#[cfg(test)] +mod unit_tests { + use std::result::Result; + + use vm_memory::GuestAddress; + + use super::*; + + #[derive(Debug)] + struct MockDeviceRelocation; + impl DeviceRelocation for MockDeviceRelocation { + fn move_bar( + &self, + _old_base: u64, + _new_base: u64, + _len: u64, + _pci_dev: &mut dyn pci::PciDevice, + _region_type: pci::PciBarRegionType, + ) -> Result<(), 
std::io::Error> { + Ok(()) + } + } + + fn setup() -> PciSegment { + let guest_addr = 0_u64; + let guest_size = 0x1000_usize; + let allocator_1 = Arc::new(Mutex::new( + AddressAllocator::new(GuestAddress(guest_addr), guest_size as u64).unwrap(), + )); + let allocator_2 = Arc::new(Mutex::new( + AddressAllocator::new(GuestAddress(guest_addr), guest_size as u64).unwrap(), + )); + let mock_device_reloc: Arc = Arc::new(MockDeviceRelocation {}); + let arr = [0_u8; 32]; + + PciSegment::new_without_address_manager( + 0, + 0, + allocator_1, + allocator_2, + &arr, + &mock_device_reloc, + ) + .unwrap() + } + + #[test] + // Test the default device ID for a segment with an empty bus (except for the root device). + fn allocate_device_id_default() { + // The first address is occupied by the root + let segment = setup(); + let bdf = segment.allocate_device_id(None).unwrap(); + assert_eq!(bdf.segment(), segment.id); + assert_eq!(bdf.bus(), 0); + assert_eq!(bdf.device(), 1); + assert_eq!(bdf.function(), 0); + } + + #[test] + // Test to acquire a specific device ID + fn allocate_device_id_fixed_device_id() { + // The first address is occupied by the root + let expect_device_id = 0x10_u8; + let segment = setup(); + let bdf = segment.allocate_device_id(Some(expect_device_id)).unwrap(); + assert_eq!(bdf.segment(), segment.id); + assert_eq!(bdf.bus(), 0); + assert_eq!(bdf.device(), expect_device_id); + assert_eq!(bdf.function(), 0); + } + + #[test] + // Test to acquire a device ID that is invalid, one that is already taken + // and one being greater than the number of allowed devices per bus. 
+ fn allocate_device_id_invalid_device_id() { + // The first address is occupied by the root + let already_taken_device_id = 0x0_u8; + let overflow_device_id = 0xff_u8; + let segment = setup(); + let bdf_res = segment.allocate_device_id(Some(already_taken_device_id)); + assert!(matches!( + bdf_res, + Err(DeviceManagerError::GetPciDeviceId(e)) if matches!( + e, + pci::PciRootError::AlreadyInUsePciDeviceSlot(0x0) + ) + )); + let bdf_res = segment.allocate_device_id(Some(overflow_device_id)); + assert!(matches!( + bdf_res, + Err(DeviceManagerError::AllocatePciDeviceId(e)) if matches!( + e, + pci::PciRootError::InvalidPciDeviceSlot(0xff) + ) + )); + } +} From 3a5fad22b9221a86c6e1a763b4b14242c5c6be15 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Sat, 4 Apr 2026 13:30:51 -0700 Subject: [PATCH 688/742] vmm: Fix segment log message formatting Signed-off-by: Rob Bradford --- vmm/src/pci_segment.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vmm/src/pci_segment.rs b/vmm/src/pci_segment.rs index 35d9e1fef5..37cc0dcc6c 100644 --- a/vmm/src/pci_segment.rs +++ b/vmm/src/pci_segment.rs @@ -105,7 +105,7 @@ impl PciSegment { }; info!( - "Adding PCI segment: id={}, PCI MMIO config address: 0x{:x}, mem32 area [0x{:x}-0x{:x}, mem64 area [0x{:x}-0x{:x}", + "Adding PCI segment: id={}, PCI MMIO config address: 0x{:x}, mem32 area [0x{:x}-0x{:x}], mem64 area [0x{:x}-0x{:x}]", segment.id, segment.mmio_config_address, segment.start_of_mem32_area, From 51a729a87454358c0f2d2c89b2baeca3b14e9aad Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Sat, 4 Apr 2026 11:05:01 -0700 Subject: [PATCH 689/742] pci: Add support for reserving but not allocating slots This can be used in a two pass approach where all configs that can hold PCI devices are evaluated to reserve any specific PCI device IDs they may need. Those device IDs will later be allocated when the devices are added to the bus. 
The tri-state Free, Reserved, Allocated also catches the problem of hotplugging a device with a specific, already used, device ID. Signed-off-by: Rob Bradford --- pci/src/bus.rs | 88 ++++++++++++++++++++++++++------------- vmm/src/device_manager.rs | 6 +-- vmm/src/pci_segment.rs | 19 +++++++-- 3 files changed, 76 insertions(+), 37 deletions(-) diff --git a/pci/src/bus.rs b/pci/src/bus.rs index bd0abec303..4e52ebc9be 100644 --- a/pci/src/bus.rs +++ b/pci/src/bus.rs @@ -114,21 +114,28 @@ impl PciDevice for PciRoot { } } +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +enum DeviceIdState { + Free, + Reserved, + Allocated, +} + pub struct PciBus { /// Devices attached to this bus. /// Device 0 is host bridge. devices: HashMap>>, device_reloc: Arc, - device_ids: [bool; NUM_DEVICE_IDS as usize], + device_ids: [DeviceIdState; NUM_DEVICE_IDS as usize], } impl PciBus { pub fn new(pci_root: PciRoot, device_reloc: Arc) -> Self { let mut devices: HashMap>> = HashMap::new(); - let mut device_ids = [false; NUM_DEVICE_IDS as usize]; + let mut device_ids = [DeviceIdState::Free; NUM_DEVICE_IDS as usize]; devices.insert(PCI_ROOT_DEVICE_ID, Arc::new(Mutex::new(pci_root))); - device_ids[PCI_ROOT_DEVICE_ID as usize] = true; + device_ids[PCI_ROOT_DEVICE_ID as usize] = DeviceIdState::Allocated; PciBus { devices, @@ -172,6 +179,31 @@ impl PciBus { Ok(()) } + /// Reserves a PCI device ID on the bus, marking it as in-use so + /// that automatic allocation will not use it. + /// + /// - `id`: Preferred ID to reserve on the bus. + /// + /// ## Errors + /// + /// * Returns [`PciRootError::AlreadyInUsePciDeviceSlot`] if the + /// slot is already reserved or allocated. + /// * Returns [`PciRootError::InvalidPciDeviceSlot`] if the slot + /// exceeds [`NUM_DEVICE_IDS`]. 
+ pub fn reserve_device_id(&mut self, id: u8) -> Result { + let idx = id as usize; + if idx < NUM_DEVICE_IDS as usize { + if self.device_ids[idx] == DeviceIdState::Free { + self.device_ids[idx] = DeviceIdState::Reserved; + Ok(id) + } else { + Err(PciRootError::AlreadyInUsePciDeviceSlot(idx)) + } + } else { + Err(PciRootError::InvalidPciDeviceSlot(idx)) + } + } + /// Allocates a PCI device ID on the bus. /// /// - `id`: ID to allocate on the bus. If [`None`], the next free @@ -179,6 +211,7 @@ impl PciBus { /// allocated /// /// ## Errors + /// /// * Returns [`PciRootError::AlreadyInUsePciDeviceSlot`] in case /// the ID requested is already allocated. /// * Returns [`PciRootError::InvalidPciDeviceSlot`] in case the @@ -190,10 +223,10 @@ impl PciBus { pub fn allocate_device_id(&mut self, id: Option) -> Result { if let Some(idx) = id.map(|i| i as usize) { if idx < NUM_DEVICE_IDS as usize { - if self.device_ids[idx] { + if self.device_ids[idx] == DeviceIdState::Allocated { Err(PciRootError::AlreadyInUsePciDeviceSlot(idx)) } else { - self.device_ids[idx] = true; + self.device_ids[idx] = DeviceIdState::Allocated; Ok(idx as u8) } } else { @@ -201,8 +234,8 @@ impl PciBus { } } else { for (idx, device_id) in self.device_ids.iter_mut().enumerate() { - if !(*device_id) { - *device_id = true; + if *device_id == DeviceIdState::Free { + *device_id = DeviceIdState::Allocated; return Ok(idx as u8); } } @@ -210,22 +243,9 @@ impl PciBus { } } - pub fn get_device_id(&mut self, id: usize) -> Result<()> { - if id < NUM_DEVICE_IDS as usize { - if self.device_ids[id] { - Err(PciRootError::AlreadyInUsePciDeviceSlot(id)) - } else { - self.device_ids[id] = true; - Ok(()) - } - } else { - Err(PciRootError::InvalidPciDeviceSlot(id)) - } - } - pub fn put_device_id(&mut self, id: usize) -> Result<()> { if id < NUM_DEVICE_IDS as usize { - self.device_ids[id] = false; + self.device_ids[id] = DeviceIdState::Free; Ok(()) } else { Err(PciRootError::InvalidPciDeviceSlot(id)) @@ -577,14 +597,13 @@ mod 
unit_tests { } #[test] - // Test that gaps resulting from explicit allocations are filled by implicit ones, - // beginning with the first free slot + // Test that reserved IDs are skipped by automatic allocation fn allocate_device_id_fills_gaps() -> Result<(), Box> { // The first address is occupied by the root let mut bus = setup_bus(); - assert_eq!(0x01_u8, bus.allocate_device_id(Some(0x01))?); - assert_eq!(0x03_u8, bus.allocate_device_id(Some(0x03))?); - assert_eq!(0x06_u8, bus.allocate_device_id(Some(0x06))?); + bus.reserve_device_id(0x01)?; + bus.reserve_device_id(0x03)?; + bus.reserve_device_id(0x06)?; assert_eq!(0x02_u8, bus.allocate_device_id(None)?); assert_eq!(0x04_u8, bus.allocate_device_id(None)?); assert_eq!(0x05_u8, bus.allocate_device_id(None)?); @@ -593,12 +612,12 @@ mod unit_tests { } #[test] - // Test that requesting the same ID twice fails - fn allocate_device_id_request_id_twice_fails() -> Result<(), Box> { + // Test that reserving the same ID twice fails + fn reserve_device_id_twice_fails() -> Result<(), Box> { let mut bus = setup_bus(); let max_id = NUM_DEVICE_IDS - 1; - bus.allocate_device_id(Some(max_id))?; - let result = bus.allocate_device_id(Some(max_id)); + bus.reserve_device_id(max_id)?; + let result = bus.reserve_device_id(max_id); assert!(matches!( result, Err(PciRootError::AlreadyInUsePciDeviceSlot(x)) if x == usize::from(max_id), @@ -606,6 +625,15 @@ mod unit_tests { Ok(()) } + #[test] + // Test that allocating a previously reserved ID succeeds (idempotent) + fn allocate_device_id_after_reserve() -> Result<(), Box> { + let mut bus = setup_bus(); + bus.reserve_device_id(0x10)?; + assert_eq!(0x10_u8, bus.allocate_device_id(Some(0x10))?); + Ok(()) + } + #[test] // Test to request an invalid ID fn allocate_device_id_request_invalid_id_fails() -> Result<(), Box> { diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 1499f4d73b..cb88fb8ccd 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -498,7 
+498,7 @@ pub enum DeviceManagerError { /// Could not reserve the PCI device ID. #[error("Could not reserve the PCI device ID")] - GetPciDeviceId(#[source] pci::PciRootError), + ReservePciDeviceId(#[source] pci::PciRootError), /// Could not give the PCI device ID back. #[error("Could not give the PCI device ID back")] @@ -4550,8 +4550,8 @@ impl DeviceManager { .pci_bus .lock() .unwrap() - .get_device_id(pci_device_bdf.device() as usize) - .map_err(DeviceManagerError::GetPciDeviceId)?; + .allocate_device_id(Some(pci_device_bdf.device())) + .map_err(DeviceManagerError::AllocatePciDeviceId)?; (pci_segment_id, pci_device_bdf, resources) } else { diff --git a/vmm/src/pci_segment.rs b/vmm/src/pci_segment.rs index 37cc0dcc6c..6a4f10aa70 100644 --- a/vmm/src/pci_segment.rs +++ b/vmm/src/pci_segment.rs @@ -164,6 +164,17 @@ impl PciSegment { ) } + /// Reserves a device ID on this PCI segment, marking it as in-use + /// so that automatic allocation will not use it. + pub(crate) fn reserve_device_id(&self, device_id: u8) -> DeviceManagerResult<()> { + self.pci_bus + .lock() + .unwrap() + .reserve_device_id(device_id) + .map_err(DeviceManagerError::ReservePciDeviceId)?; + Ok(()) + } + /// Allocates a device's ID on this PCI segment. /// /// - `device_id`: Device ID to request for allocation @@ -613,17 +624,17 @@ mod unit_tests { } #[test] - // Test to acquire a device ID that is invalid, one that is already taken - // and one being greater than the number of allowed devices per bus. + // Test that reserving an already taken device ID fails and that + // allocating an out-of-range device ID fails. 
fn allocate_device_id_invalid_device_id() { // The first address is occupied by the root let already_taken_device_id = 0x0_u8; let overflow_device_id = 0xff_u8; let segment = setup(); - let bdf_res = segment.allocate_device_id(Some(already_taken_device_id)); + let bdf_res = segment.reserve_device_id(already_taken_device_id); assert!(matches!( bdf_res, - Err(DeviceManagerError::GetPciDeviceId(e)) if matches!( + Err(DeviceManagerError::ReservePciDeviceId(e)) if matches!( e, pci::PciRootError::AlreadyInUsePciDeviceSlot(0x0) ) From c3ec804a44e3475bd5a3ca66f5e8a8564754d19f Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Sat, 4 Apr 2026 09:07:11 -0700 Subject: [PATCH 690/742] vmm: config: Add pci_device_id to PciDeviceCommonConfig This adds it to all device types that use the common PCI device configuration. Signed-off-by: Rob Bradford --- vmm/src/config.rs | 8 ++++++-- vmm/src/vm_config.rs | 2 ++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/vmm/src/config.rs b/vmm/src/config.rs index 8e0c4232af..a1453a011d 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -1195,8 +1195,8 @@ impl RateLimiterGroupConfig { } impl PciDeviceCommonConfig { - const OPTIONS: &[&str] = &["id", "pci_segment"]; - const OPTIONS_IOMMU: &[&str] = &["id", "iommu", "pci_segment"]; + const OPTIONS: &[&str] = &["id", "pci_segment", "pci_device_id"]; + const OPTIONS_IOMMU: &[&str] = &["id", "iommu", "pci_segment", "pci_device_id"]; pub fn parse(input: &str) -> Result { let mut parser = OptionParser::new(); @@ -1217,11 +1217,15 @@ impl PciDeviceCommonConfig { .convert("pci_segment") .map_err(Error::ParsePciDeviceCommonConfig)? 
.unwrap_or_default(); + let pci_device_id = parser + .convert::("pci_device_id") + .map_err(Error::ParsePciDeviceCommonConfig)?; Ok(Self { id, iommu, pci_segment, + pci_device_id, }) } diff --git a/vmm/src/vm_config.rs b/vmm/src/vm_config.rs index 156650cb2e..9162045404 100644 --- a/vmm/src/vm_config.rs +++ b/vmm/src/vm_config.rs @@ -281,6 +281,8 @@ pub struct PciDeviceCommonConfig { pub iommu: bool, #[serde(default)] pub pci_segment: u16, + #[serde(default)] + pub pci_device_id: Option, } #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] From e5d73159f6f10e800918c6ee18587cdc097f7c4a Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Sat, 4 Apr 2026 11:57:13 -0700 Subject: [PATCH 691/742] vmm: openapi: Add pci_device_id to the required device entries Also add pci_segment that was missing from vfio-user devices. Signed-off-by: Rob Bradford --- vmm/src/api/openapi/cloud-hypervisor.yaml | 30 +++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/vmm/src/api/openapi/cloud-hypervisor.yaml b/vmm/src/api/openapi/cloud-hypervisor.yaml index 5be55560e6..422660d07a 100644 --- a/vmm/src/api/openapi/cloud-hypervisor.yaml +++ b/vmm/src/api/openapi/cloud-hypervisor.yaml @@ -984,6 +984,9 @@ components: pci_segment: type: integer format: int16 + pci_device_id: + type: integer + format: uint8 id: type: string serial: @@ -1046,6 +1049,9 @@ components: pci_segment: type: integer format: int16 + pci_device_id: + type: integer + format: uint8 rate_limiter_config: $ref: "#/components/schemas/RateLimiterConfig" offload_tso: @@ -1107,6 +1113,9 @@ components: pci_segment: type: integer format: int16 + pci_device_id: + type: integer + format: uint8 id: type: string @@ -1127,6 +1136,9 @@ components: pci_segment: type: integer format: int16 + pci_device_id: + type: integer + format: uint8 virtio_id: type: uint32 @@ -1149,6 +1161,9 @@ components: pci_segment: type: integer format: int16 + pci_device_id: + type: integer + format: uint8 id: type: string @@ -1196,6 
+1211,9 @@ components: pci_segment: type: integer format: int16 + pci_device_id: + type: integer + format: uint8 id: type: string x_nv_gpudirect_clique: @@ -1226,6 +1244,9 @@ components: pci_segment: type: integer format: int16 + pci_device_id: + type: integer + format: uint8 id: type: string @@ -1249,7 +1270,9 @@ components: pci_segment: type: integer format: int16 + pci_device_id: + type: integer + format: uint8 id: type: string @@ -1429,6 +1453,12 @@ components: properties: socket: type: string + pci_segment: + type: integer + format: int16 + pci_device_id: + type: integer + format: uint8 LandlockConfig: required: From 4e247cf91dfc69c9a311d74a36e2ebe1543bb371 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Sat, 4 Apr 2026 12:02:55 -0700 Subject: [PATCH 692/742] vmm: config: Add pci_device_id to SYNTAX for supported devices For those device types that have the ability to support specifying the PCI device ID add it to their help syntax. Signed-off-by: Rob Bradford --- vmm/src/config.rs | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/vmm/src/config.rs b/vmm/src/config.rs index a1453a011d..5e1414aa1c 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -1254,7 +1254,8 @@ impl DiskConfig { vhost_user=on|off,socket=,\ bw_size=,bw_one_time_burst=,bw_refill_time=,\ ops_size=,ops_one_time_burst=,ops_refill_time=,\ - id=,pci_segment=,rate_limit_group=,\ + id=,pci_segment=,pci_device_id=,\ + rate_limit_group=,\ queue_affinity=,\ serial=,backing_files=on|off,sparse=on|off,\ image_type=,lock_granularity=byte-range|full"; @@ -1496,7 +1497,8 @@ impl NetConfig { num_queues=,queue_size=,id=,\ vhost_user=,socket=,vhost_mode=client|server,\ bw_size=,bw_one_time_burst=,bw_refill_time=,\ - ops_size=,ops_one_time_burst=,ops_refill_time=,pci_segment=,\ + ops_size=,ops_one_time_burst=,ops_refill_time=,\ + pci_segment=,pci_device_id=,\ offload_tso=on|off,offload_ufo=on|off,offload_csum=on|off\""; pub fn parse(net: &str) ->
Result { @@ -1767,7 +1769,7 @@ impl GenericVhostUserConfig { \"virtio_id=,\ socket=,\ queue_sizes=,\ - id=,pci_segment=\""; + id=,pci_segment=,pci_device_id=\""; pub fn parse(vhost_user: &str) -> Result { let mut parser = OptionParser::new(); @@ -1891,7 +1893,8 @@ impl GenericVhostUserConfig { impl FsConfig { pub const SYNTAX: &'static str = "virtio-fs parameters \ \"tag=,socket=,num_queues=,\ - queue_size=,id=,pci_segment=\""; + queue_size=,id=,\ + pci_segment=,pci_device_id=\""; pub fn parse(fs: &str) -> Result { let mut parser = OptionParser::new(); @@ -2044,7 +2047,8 @@ impl FwCfgItem { impl PmemConfig { pub const SYNTAX: &'static str = "Persistent memory parameters \ \"file=,size=,iommu=on|off,\ - discard_writes=on|off,id=,pci_segment=\""; + discard_writes=on|off,id=,\ + pci_segment=,pci_device_id=\""; pub fn parse(pmem: &str) -> Result { let mut parser = OptionParser::new(); @@ -2188,7 +2192,9 @@ impl DebugConsoleConfig { } impl DeviceConfig { - pub const SYNTAX: &'static str = "Direct device assignment parameters \"path=,iommu=on|off,id=,pci_segment=\""; + pub const SYNTAX: &'static str = "Direct device assignment parameters \ + \"path=,iommu=on|off,id=,\ + pci_segment=,pci_device_id=\""; pub fn parse(device: &str) -> Result { let mut parser = OptionParser::new(); @@ -2228,8 +2234,8 @@ impl DeviceConfig { } impl UserDeviceConfig { - pub const SYNTAX: &'static str = - "Userspace device socket=,id=,pci_segment=\""; + pub const SYNTAX: &'static str = "Userspace device socket=,id=,\ + pci_segment=,pci_device_id=\""; pub fn parse(user_device: &str) -> Result { let mut parser = OptionParser::new(); @@ -2257,7 +2263,7 @@ impl UserDeviceConfig { impl VdpaConfig { pub const SYNTAX: &'static str = "vDPA device \ \"path=,num_queues=,iommu=on|off,\ - id=,pci_segment=\""; + id=,pci_segment=,pci_device_id=\""; pub fn parse(vdpa: &str) -> Result { let mut parser = OptionParser::new(); @@ -2291,7 +2297,8 @@ impl VdpaConfig { impl VsockConfig { pub const SYNTAX: &'static str 
= "Virtio VSOCK parameters \ - \"cid=,socket=,iommu=on|off,id=,pci_segment=\""; + \"cid=,socket=,iommu=on|off,id=,\ + pci_segment=,pci_device_id=\""; pub fn parse(vsock: &str) -> Result { let mut parser = OptionParser::new(); From 7315a38a024e678963deb27b46e1c97956a62291 Mon Sep 17 00:00:00 2001 From: Pascal Scholz Date: Thu, 26 Mar 2026 09:17:44 +0100 Subject: [PATCH 693/742] vmm: Validate PCI device ID Validate that the PCI device ID is within range and not using the reserved value. We need this option to ensure that invalid device IDs received via an API call result in an error as soon as possible. In this case, this would be after deserialization. On this code path, validation via `parse` is skipped and must be invoked by calling `validate`. Signed-off-by: Pascal Scholz On-behalf-of: SAP pascal.scholz@sap.com Signed-off-by: Rob Bradford --- vmm/src/config.rs | 68 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 67 insertions(+), 1 deletion(-) diff --git a/vmm/src/config.rs b/vmm/src/config.rs index 5e1414aa1c..72f0f6d47f 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -17,6 +17,7 @@ use log::{debug, warn}; use option_parser::{ ByteSized, IntegerList, OptionParser, OptionParserError, StringList, Toggle, Tuple, }; +use pci::NUM_DEVICE_IDS; use serde::{Deserialize, Serialize}; use thiserror::Error; use virtio_bindings::virtio_blk::VIRTIO_BLK_ID_BYTES; @@ -402,6 +403,13 @@ pub enum ValidationError { /// Invalid NUMA Configuration #[error("NUMA Configuration is invalid")] InvalidNumaConfig(String), + /// The supplied PCI ID was greater than the max.
supported number + /// of devices per Bus + #[error("Given PCI device ID ({0}) is out of the supported range of 0..{NUM_DEVICE_IDS}")] + InvalidPciDeviceId(u8), + /// The supplied PCI ID is reserved + #[error("Given PCI device ID ({0}) is reserved")] + ReservedPciDeviceId(u8), } type ValidationResult = std::result::Result; @@ -414,6 +422,21 @@ pub fn add_to_config(items: &mut Option>, item: T) { } } +/// Check that the PCI device supplied is neither out of range nor does +/// it use any reserved device ID. +fn validate_pci_device_id(device_id: u8) -> ValidationResult<()> { + if device_id >= pci::NUM_DEVICE_IDS { + // Check the given ID is not out of range + return Err(ValidationError::InvalidPciDeviceId(device_id)); + } else if device_id == pci::PCI_ROOT_DEVICE_ID { + // Check the ID isn't any reserved one. Currently, only the device ID + // for the root device is reserved. + return Err(ValidationError::ReservedPciDeviceId(device_id)); + } + + Ok(()) +} + pub type Result = result::Result; pub struct VmParams<'a> { @@ -1243,6 +1266,10 @@ impl PciDeviceCommonConfig { } } + if let Some(device_id) = self.pci_device_id { + validate_pci_device_id(device_id)?; + } + Ok(()) } } @@ -5643,7 +5670,7 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" }]); still_valid_config.validate().unwrap(); - let mut still_valid_config = valid_config; + let mut still_valid_config = valid_config.clone(); // SAFETY: Safe as the file was just opened let fd1 = unsafe { libc::dup(File::open("/dev/null").unwrap().as_raw_fd()) }; // SAFETY: Safe as the file was just opened @@ -5653,6 +5680,45 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" still_valid_config.add_preserved_fds(vec![fd1, fd2]); } let _still_valid_config = still_valid_config.clone(); + + // Valid BDF test + let mut still_valid_config = valid_config.clone(); + still_valid_config.disks = Some(vec![DiskConfig { + pci_common: PciDeviceCommonConfig { + pci_device_id: Some(8), + ..Default::default() + 
}, + ..disk_fixture() + }]); + still_valid_config.validate().unwrap(); + // Invalid BDF - Same ID as Root device + let mut invalid_config = valid_config.clone(); + invalid_config.disks = Some(vec![DiskConfig { + pci_common: PciDeviceCommonConfig { + pci_device_id: Some(pci::PCI_ROOT_DEVICE_ID), + ..Default::default() + }, + ..disk_fixture() + }]); + assert_eq!( + invalid_config.validate(), + Err(ValidationError::ReservedPciDeviceId( + pci::PCI_ROOT_DEVICE_ID + )) + ); + // Invalid BDF - Out of range + let mut invalid_config = valid_config.clone(); + invalid_config.disks = Some(vec![DiskConfig { + pci_common: PciDeviceCommonConfig { + pci_device_id: Some(pci::NUM_DEVICE_IDS + 1), + ..Default::default() + }, + ..disk_fixture() + }]); + assert_eq!( + invalid_config.validate(), + Err(ValidationError::InvalidPciDeviceId(pci::NUM_DEVICE_IDS + 1)) + ); } #[test] fn test_landlock_parsing() -> Result<()> { From aace90f270503a6429254751c41a6287d0f88d80 Mon Sep 17 00:00:00 2001 From: Pascal Scholz Date: Fri, 13 Mar 2026 09:00:41 +0100 Subject: [PATCH 694/742] vmm: Propagate PCI device ID from the config We pass the device ID from the config to the allocation routine, where it is then used as the preferred device ID alongside the existing PCI segment ID. 
Signed-off-by: Pascal Scholz On-behalf-of: SAP pascal.scholz@sap.com Signed-off-by: Rob Bradford --- vmm/src/device_manager.rs | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index cb88fb8ccd..1f1dd2f056 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -1711,6 +1711,7 @@ impl DeviceManager { &id, handle.pci_common.pci_segment, handle.dma_handler, + handle.pci_common.pci_device_id, )?; // Track device BDF for Generic Initiator support @@ -1742,7 +1743,8 @@ impl DeviceManager { } if let Some(iommu_device) = iommu_device { - let dev_id = self.add_virtio_pci_device(iommu_device, &None, &iommu_id, 0, None)?; + let dev_id = + self.add_virtio_pci_device(iommu_device, &None, &iommu_id, 0, None, None)?; self.iommu_attached_devices = Some((dev_id, iommu_attached_devices)); } } @@ -3656,7 +3658,7 @@ impl DeviceManager { let pci_segment_id = 0x0_u16; let (pci_segment_id, pci_device_bdf, resources) = - self.pci_resources(&id, pci_segment_id)?; + self.pci_resources(&id, pci_segment_id, None)?; info!("Creating pvmemcontrol device: id = {id}"); let (pvmemcontrol_pci_device, pvmemcontrol_bus_device) = @@ -3922,8 +3924,11 @@ impl DeviceManager { id }; - let (pci_segment_id, pci_device_bdf, resources) = - self.pci_resources(&vfio_name, device_cfg.pci_common.pci_segment)?; + let (pci_segment_id, pci_device_bdf, resources) = self.pci_resources( + &vfio_name, + device_cfg.pci_common.pci_segment, + device_cfg.pci_common.pci_device_id, + )?; let mut needs_dma_mapping = false; @@ -4184,8 +4189,11 @@ impl DeviceManager { id }; - let (pci_segment_id, pci_device_bdf, resources) = - self.pci_resources(&vfio_user_name, device_cfg.pci_common.pci_segment)?; + let (pci_segment_id, pci_device_bdf, resources) = self.pci_resources( + &vfio_user_name, + device_cfg.pci_common.pci_segment, + device_cfg.pci_common.pci_device_id, + )?; let legacy_interrupt_group = if let 
Some(legacy_interrupt_manager) = &self.legacy_interrupt_manager { @@ -4301,6 +4309,7 @@ impl DeviceManager { virtio_device_id: &str, pci_segment_id: u16, dma_handler: Option>, + pci_device_id: Option, ) -> DeviceManagerResult { let id = format!("{VIRTIO_PCI_DEVICE_NAME_PREFIX}-{virtio_device_id}"); @@ -4309,7 +4318,7 @@ impl DeviceManager { node.children = vec![virtio_device_id.to_string()]; let (pci_segment_id, pci_device_bdf, resources) = - self.pci_resources(&id, pci_segment_id)?; + self.pci_resources(&id, pci_segment_id, pci_device_id)?; // Update the existing virtio node by setting the parent. if let Some(node) = self.device_tree.lock().unwrap().get_mut(virtio_device_id) { @@ -4446,7 +4455,7 @@ impl DeviceManager { info!("Creating pvpanic device {id}"); let (pci_segment_id, pci_device_bdf, resources) = - self.pci_resources(&id, pci_segment_id)?; + self.pci_resources(&id, pci_segment_id, None)?; let snapshot = snapshot_from_id(self.snapshot.as_ref(), id.as_str()); @@ -4484,7 +4493,7 @@ impl DeviceManager { info!("Creating ivshmem device {id}"); let (pci_segment_id, pci_device_bdf, resources) = - self.pci_resources(&id, pci_segment_id)?; + self.pci_resources(&id, pci_segment_id, None)?; let snapshot = snapshot_from_id(self.snapshot.as_ref(), id.as_str()); let ivshmem_ops = Arc::new(Mutex::new(IvshmemHandler { @@ -4529,6 +4538,7 @@ impl DeviceManager { &self, id: &str, pci_segment_id: u16, + pci_device_id: Option, ) -> DeviceManagerResult<(u16, PciBdf, Option>)> { // Look for the id in the device tree. If it can be found, that means // the device is being restored, otherwise it's created from scratch. 
@@ -4556,7 +4566,7 @@ impl DeviceManager { (pci_segment_id, pci_device_bdf, resources) } else { let pci_device_bdf = - self.pci_segments[pci_segment_id as usize].allocate_device_id(None)?; + self.pci_segments[pci_segment_id as usize].allocate_device_id(pci_device_id)?; (pci_segment_id, pci_device_bdf, None) }) @@ -5075,6 +5085,7 @@ impl DeviceManager { &id, handle.pci_common.pci_segment, handle.dma_handler, + handle.pci_common.pci_device_id, )?; // Update the PCIU bitmap From 5aa3692c6d3bad8abc87f9b0933f76393ec491ca Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Sat, 4 Apr 2026 11:09:01 -0700 Subject: [PATCH 695/742] vmm: device_manager: Reserve explicitly used PCI device IDs Use two passes to first reserve PCI device IDs and then allocate them when adding the devices to the bus. This prevents a situation where an anonymous PCI device allocation clashes with an explicitly allocated PCI device ID. Signed-off-by: Rob Bradford --- vmm/src/device_manager.rs | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 1f1dd2f056..6c52026e1a 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -1697,6 +1697,10 @@ impl DeviceManager { let mut iommu_attached_devices = Vec::new(); { + // Reserve all explicit PCI device IDs before any device creation + // so that they won't be picked for dynamic allocation. 
+ self.reserve_explicit_device_ids()?; + for handle in self.virtio_devices.clone() { let mapping: Option> = if handle.pci_common.iommu { self.iommu_mapping.clone() @@ -4534,6 +4538,37 @@ impl DeviceManager { Ok(Some(ivshmem_device)) } + fn reserve_explicit_device_ids(&self) -> DeviceManagerResult<()> { + for handle in &self.virtio_devices { + if let Some(device_id) = handle.pci_common.pci_device_id { + self.pci_segments[handle.pci_common.pci_segment as usize] + .reserve_device_id(device_id)?; + } + } + + let config = self.config.lock().unwrap(); + + if let Some(devices) = &config.devices { + for device_cfg in devices { + if let Some(device_id) = device_cfg.pci_common.pci_device_id { + self.pci_segments[device_cfg.pci_common.pci_segment as usize] + .reserve_device_id(device_id)?; + } + } + } + + if let Some(user_devices) = &config.user_devices { + for device_cfg in user_devices { + if let Some(device_id) = device_cfg.pci_common.pci_device_id { + self.pci_segments[device_cfg.pci_common.pci_segment as usize] + .reserve_device_id(device_id)?; + } + } + } + + Ok(()) + } + fn pci_resources( &self, id: &str, From b4723999f8c53a3605deb0a8cff94c9ee14f6884 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Sat, 4 Apr 2026 12:49:53 -0700 Subject: [PATCH 696/742] docs: Update the relevant documentation Some of the documentation references PCI segment ID. For those documents add a mention of the new PCI device ID. Signed-off-by: Rob Bradford --- docs/device_model.md | 5 +++-- docs/vdpa.md | 24 ++++++++++++++++++++---- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/docs/device_model.md b/docs/device_model.md index ed4577a2cd..e915c47eec 100644 --- a/docs/device_model.md +++ b/docs/device_model.md @@ -90,8 +90,9 @@ feature is enabled by default. For all virtio devices listed below, only `virtio-pci` transport layer is supported. 
Cloud Hypervisor supports multiple PCI segments, and users can -append `,pci_segment=` to the device flag in the Cloud -Hypervisor command line to assign devices to a specific PCI segment. +append `,pci_segment=` or `,pci_device_id=` to +the device flag in the Cloud Hypervisor command line to assign devices to a specific +PCI segment or into a specific device slot. ### virtio-block diff --git a/docs/vdpa.md b/docs/vdpa.md index c1aa34c571..1e171670ad 100644 --- a/docs/vdpa.md +++ b/docs/vdpa.md @@ -32,11 +32,12 @@ struct VdpaConfig { iommu: bool, id: Option, pci_segment: u16, + pci_device_id: Option } ``` ``` ---vdpa vDPA device "path=,num_queues=,iommu=on|off,id=,pci_segment=" +--vdpa vDPA device "path=,num_queues=,iommu=on|off,id=,pci_segment=,pci_device_id=" ``` ### `path` @@ -96,6 +97,21 @@ _Example_ --vdpa path=/dev/vhost-vdpa-0,pci_segment=1 ``` +### `pci_device_id` + +PCI device ID to assign to the vDPA device on its PCI bus. + +This parameter is optional. If not specified, a device ID is automatically +allocated. + +Value is an unsigned integer in the range 1-31. + +_Example_ + +``` +--vdpa path=/dev/vhost-vdpa-0,pci_device_id=5 +``` + ## Example with vDPA block simulator The vDPA framework provides a simulator with both `virtio-block` and @@ -146,10 +162,10 @@ The `virtio-block` device backed by the vDPA simulator can be found as ``` cloud@cloud:~$ lsblk NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT -nullb0 252:0 0 250G 0 disk -vda 254:0 0 2.2G 0 disk +nullb0 252:0 0 250G 0 disk +vda 254:0 0 2.2G 0 disk ├─vda1 254:1 0 2.1G 0 part / -├─vda14 254:14 0 4M 0 part +├─vda14 254:14 0 4M 0 part └─vda15 254:15 0 106M 0 part /boot/efi vdb 254:16 0 128M 0 disk ``` From 8259f929096904f943e1fb648179f04c99e8ebc0 Mon Sep 17 00:00:00 2001 From: Pascal Scholz Date: Wed, 25 Mar 2026 19:34:56 +0100 Subject: [PATCH 697/742] tests: Return stderr when executing commands If we want to test for error cases, it can be useful to inspect the `stderr` of a `Command` to analyze the errors. 
For example, this allows us to ensure that a `Command` returns an `IoError` by parsing the error trace, if an `IoError` is expected. This commit prepares the implementation of negative integration tests for the configurable BDFs. Signed-off-by: Pascal Scholz On-behalf-of: SAP pascal.scholz@sap.com --- .../tests/common/tests_wrappers.rs | 18 +++++++-------- cloud-hypervisor/tests/common/utils.rs | 8 +++---- cloud-hypervisor/tests/integration.rs | 22 +++++++++---------- test_infra/src/lib.rs | 4 ++-- 4 files changed, 26 insertions(+), 26 deletions(-) diff --git a/cloud-hypervisor/tests/common/tests_wrappers.rs b/cloud-hypervisor/tests/common/tests_wrappers.rs index 6479f2e2a6..71087a9009 100644 --- a/cloud-hypervisor/tests/common/tests_wrappers.rs +++ b/cloud-hypervisor/tests/common/tests_wrappers.rs @@ -981,7 +981,7 @@ pub(crate) fn _test_virtio_fs( if hotplug { // Add fs to the VM - let (cmd_success, cmd_output) = + let (cmd_success, cmd_output, _) = remote_command_w_output(&api_socket, add_arg, Some(&fs_params)); assert!(cmd_success); @@ -1085,7 +1085,7 @@ pub(crate) fn _test_virtio_fs( ); // Add back and check it works - let (cmd_success, cmd_output) = + let (cmd_success, cmd_output, _) = remote_command_w_output(&api_socket, add_arg, Some(&fs_params)); assert!(cmd_success); if let Some(pci_segment) = pci_segment { @@ -1230,7 +1230,7 @@ pub(crate) fn _test_virtio_vsock(guest: &Guest, hotplug: bool) { guest.wait_vm_boot().unwrap(); if hotplug { - let (cmd_success, cmd_output) = remote_command_w_output( + let (cmd_success, cmd_output, _) = remote_command_w_output( &api_socket, "add-vsock", Some(format!("cid=3,socket={socket},id=test0").as_str()), @@ -2697,7 +2697,7 @@ pub(crate) fn _test_disk_hotplug(guest: &Guest, landlock_enabled: bool) { ); // Now let's add the extra disk. 
- let (cmd_success, cmd_output) = remote_command_w_output( + let (cmd_success, cmd_output, _) = remote_command_w_output( &api_socket, "add-disk", Some( @@ -2733,7 +2733,7 @@ pub(crate) fn _test_disk_hotplug(guest: &Guest, landlock_enabled: bool) { .is_ok_and(|s| s.trim().parse::().unwrap_or(1) == 0))); // And add it back to validate unplug did work correctly. - let (cmd_success, cmd_output) = remote_command_w_output( + let (cmd_success, cmd_output, _) = remote_command_w_output( &api_socket, "add-disk", Some( @@ -2898,7 +2898,7 @@ pub(crate) fn _test_net_hotplug( let r = std::panic::catch_unwind(|| { // Add network - let (cmd_success, cmd_output) = remote_command_w_output( + let (cmd_success, cmd_output, _) = remote_command_w_output( &api_socket, "add-net", Some( @@ -2959,7 +2959,7 @@ pub(crate) fn _test_net_hotplug( })); // Add network - let (cmd_success, cmd_output) = remote_command_w_output( + let (cmd_success, cmd_output, _) = remote_command_w_output( &api_socket, "add-net", Some( @@ -3341,7 +3341,7 @@ pub(crate) fn _test_macvtap( None ))); // Hotplug the virtio-net device - let (cmd_success, cmd_output) = + let (cmd_success, cmd_output, _) = remote_command_w_output(&api_socket, "add-net", Some(&net_params)); assert!(cmd_success); #[cfg(target_arch = "x86_64")] @@ -3436,7 +3436,7 @@ pub(crate) fn _test_vdpa_block(guest: &Guest) { // Hotplug an extra vDPA block device behind the vIOMMU // Add a new vDPA device to the VM - let (cmd_success, cmd_output) = remote_command_w_output( + let (cmd_success, cmd_output, _) = remote_command_w_output( &api_socket, "add-vdpa", Some("id=myvdpa0,path=/dev/vhost-vdpa-1,num_queues=1,pci_segment=1,iommu=on"), diff --git a/cloud-hypervisor/tests/common/utils.rs b/cloud-hypervisor/tests/common/utils.rs index f7cc1ea181..d39cbcc580 100644 --- a/cloud-hypervisor/tests/common/utils.rs +++ b/cloud-hypervisor/tests/common/utils.rs @@ -688,7 +688,7 @@ pub struct Counters { pub(crate) fn get_counters(api_socket: &str) -> Counters { // Get 
counters - let (cmd_success, cmd_output) = remote_command_w_output(api_socket, "counters", None); + let (cmd_success, cmd_output, _) = remote_command_w_output(api_socket, "counters", None); assert!(cmd_success); let counters: HashMap<&str, HashMap<&str, u64>> = @@ -738,7 +738,7 @@ pub(super) fn pty_read(mut pty: std::fs::File) -> Receiver { } pub(crate) fn get_pty_path(api_socket: &str, pty_type: &str) -> PathBuf { - let (cmd_success, cmd_output) = remote_command_w_output(api_socket, "info", None); + let (cmd_success, cmd_output, _) = remote_command_w_output(api_socket, "info", None); assert!(cmd_success); let info: serde_json::Value = serde_json::from_slice(&cmd_output).unwrap_or_default(); assert_eq!("Pty", info["config"][pty_type]["mode"]); @@ -786,7 +786,7 @@ pub(crate) fn cleanup_vfio_network_interfaces() { } pub(crate) fn balloon_size(api_socket: &str) -> u64 { - let (cmd_success, cmd_output) = remote_command_w_output(api_socket, "info", None); + let (cmd_success, cmd_output, _) = remote_command_w_output(api_socket, "info", None); assert!(cmd_success); let info: serde_json::Value = serde_json::from_slice(&cmd_output).unwrap_or_default(); @@ -802,7 +802,7 @@ pub(crate) fn balloon_size(api_socket: &str) -> u64 { } pub(crate) fn vm_state(api_socket: &str) -> String { - let (cmd_success, cmd_output) = remote_command_w_output(api_socket, "info", None); + let (cmd_success, cmd_output, _) = remote_command_w_output(api_socket, "info", None); assert!(cmd_success); let info: serde_json::Value = serde_json::from_slice(&cmd_output).unwrap_or_default(); diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 508e5e93dc..17c93ba1e7 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -375,7 +375,7 @@ mod common_parallel { guest.wait_vm_boot().unwrap(); let r = std::panic::catch_unwind(|| { - let (cmd_success, cmd_output) = remote_command_w_output( + let (cmd_success, cmd_output, _) = 
remote_command_w_output( &api_socket, "add-disk", Some( @@ -2979,7 +2979,7 @@ mod common_parallel { guest.wait_vm_boot().unwrap(); // Add the disk to the VM - let (cmd_success, cmd_output) = remote_command_w_output( + let (cmd_success, cmd_output, _) = remote_command_w_output( &api_socket, "add-disk", Some("path=/tmp/resize.img,id=test0"), @@ -3094,7 +3094,7 @@ mod common_parallel { guest.wait_vm_boot().unwrap(); // Add the QCOW2 disk to the VM - let (cmd_success, cmd_output) = remote_command_w_output( + let (cmd_success, cmd_output, _) = remote_command_w_output( &api_socket, "add-disk", Some(&format!( @@ -4907,7 +4907,7 @@ mod common_parallel { let pmem_temp_file = TempFile::new().unwrap(); pmem_temp_file.as_file().set_len(128 << 20).unwrap(); - let (cmd_success, cmd_output) = remote_command_w_output( + let (cmd_success, cmd_output, _) = remote_command_w_output( &api_socket, "add-pmem", Some(&format!( @@ -5405,7 +5405,7 @@ mod common_parallel { guest.wait_vm_boot().unwrap(); // Hotplug the SPDK-NVMe device to the VM - let (cmd_success, cmd_output) = remote_command_w_output( + let (cmd_success, cmd_output, _) = remote_command_w_output( &api_socket, "add-user-device", Some(&format!( @@ -7982,7 +7982,7 @@ mod windows { assert_eq!(netdev_ctrl_threads_count(child.id()), netdev_num); // Hotplug network device - let (cmd_success, cmd_output) = remote_command_w_output( + let (cmd_success, cmd_output, _) = remote_command_w_output( &api_socket, "add-net", Some(windows_guest.guest().default_net_string().as_str()), @@ -8066,7 +8066,7 @@ mod windows { assert_eq!(disk_ctrl_threads_count(child.id()), disk_num); // Hotplug disk device - let (cmd_success, cmd_output) = remote_command_w_output( + let (cmd_success, cmd_output, _) = remote_command_w_output( &api_socket, "add-disk", Some(format!("path={disk},readonly=off").as_str()), @@ -8104,7 +8104,7 @@ mod windows { assert_eq!(disk_ctrl_threads_count(child.id()), disk_num); // Remount and check the file exists with the expected 
contents - let (cmd_success, _cmd_output) = remote_command_w_output( + let (cmd_success, _cmd_output, _) = remote_command_w_output( &api_socket, "add-disk", Some(format!("path={disk},readonly=off").as_str()), @@ -8195,7 +8195,7 @@ mod windows { let expected_ctrl_threads = disk_ctrl_threads_count(child.id()) + 1; // Hotplug disk device - let (cmd_success, cmd_output) = remote_command_w_output( + let (cmd_success, cmd_output, _) = remote_command_w_output( &api_socket, "add-disk", Some(format!("path={disk},readonly=off").as_str()), @@ -8248,7 +8248,7 @@ mod windows { // Remount for it in &disk_test_data { let disk = it[1].as_str(); - let (cmd_success, _cmd_output) = remote_command_w_output( + let (cmd_success, _cmd_output, _) = remote_command_w_output( &api_socket, "add-disk", Some(format!("path={disk},readonly=off").as_str()), @@ -8514,7 +8514,7 @@ mod vfio { guest.wait_vm_boot().unwrap(); // Hotplug the card to the VM - let (cmd_success, cmd_output) = remote_command_w_output( + let (cmd_success, cmd_output, _) = remote_command_w_output( &api_socket, "add-device", Some(format!("id=vfio0,path={NVIDIA_VFIO_DEVICE}").as_str()), diff --git a/test_infra/src/lib.rs b/test_infra/src/lib.rs index 3f7f359804..b39e5a0842 100644 --- a/test_infra/src/lib.rs +++ b/test_infra/src/lib.rs @@ -1981,7 +1981,7 @@ pub fn remote_command_w_output( api_socket: &str, command: &str, arg: Option<&str>, -) -> (bool, Vec) { +) -> (bool, Vec /* stdout */, Vec /* stderr */) { let mut cmd = Command::new(clh_command("ch-remote")); cmd.args([&format!("--api-socket={api_socket}"), command]); @@ -1991,7 +1991,7 @@ pub fn remote_command_w_output( let output = cmd.output().expect("Failed to launch ch-remote"); - (output.status.success(), output.stdout) + (output.status.success(), output.stdout, output.stderr) } pub fn parse_iperf3_output(output: &[u8], sender: bool, bandwidth: bool) -> Result { From f82eebc0b0c7113ec179b85f709bef5bfbc1f833 Mon Sep 17 00:00:00 2001 From: Pascal Scholz Date: Thu, 26 Mar 
2026 13:12:15 +0100 Subject: [PATCH 698/742] tests: Add an integration test to verify PCI device allocations This commit adds an integration test to verify that the guest sees the correct BDF. Moreover, we check that we can allocate a random free BDF and that freeing BDFs works. Signed-off-by: Pascal Scholz On-behalf-of: SAP pascal.scholz@sap.com Signed-off-by: Rob Bradford --- cloud-hypervisor/tests/common/utils.rs | 26 +++++ cloud-hypervisor/tests/integration.rs | 138 +++++++++++++++++++++++++ 2 files changed, 164 insertions(+) diff --git a/cloud-hypervisor/tests/common/utils.rs b/cloud-hypervisor/tests/common/utils.rs index d39cbcc580..1821575d49 100644 --- a/cloud-hypervisor/tests/common/utils.rs +++ b/cloud-hypervisor/tests/common/utils.rs @@ -1033,3 +1033,29 @@ pub(crate) fn make_guest_panic(guest: &Guest) { // Trigger guest a panic guest.ssh_command("screen -dmS reboot sh -c \"sleep 5; echo s | tee /proc/sysrq-trigger; echo c | sudo tee /proc/sysrq-trigger\"").unwrap(); } + +/// Extracts a BDF from a CHV returned response +pub(crate) fn bdf_from_hotplug_response( + s: &str, +) -> ( + u16, /* Segment ID */ + u8, /* Bus ID */ + u8, /* Device ID */ + u8, /* Function ID */ +) { + let json: serde_json::Value = serde_json::from_str(s).expect("should be valid JSON"); + let bdf_str = json["bdf"] + .as_str() + .expect("should contain string key `bdf`"); + + // BDF format: "SSSS:BB:DD.F" + let parts: Vec<&str> = bdf_str.split(&[':', '.'][..]).collect(); + assert_eq!(parts.len(), 4, "unexpected BDF format: {bdf_str}"); + + let segment_id = u16::from_str_radix(parts[0], 16).unwrap(); + let bus_id = u8::from_str_radix(parts[1], 16).unwrap(); + let device_id = u8::from_str_radix(parts[2], 16).unwrap(); + let function_id = u8::from_str_radix(parts[3], 16).unwrap(); + + (segment_id, bus_id, device_id, function_id) +} diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 17c93ba1e7..54394903c8 100644 --- 
a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -5699,6 +5699,144 @@ mod common_parallel { handle_child_output(r, &output); } + + #[test] + fn test_pci_device_id() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)); + + #[cfg(target_arch = "x86_64")] + let kernel_path = direct_kernel_boot_path(); + #[cfg(target_arch = "aarch64")] + let kernel_path = edk2_path(); + + let api_socket = temp_api_path(&guest.tmp_dir); + + // Boot without network + let mut cmd = GuestCommand::new(&guest); + + cmd.args(["--api-socket", &api_socket]) + .default_cpus() + .default_memory() + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .default_net() + .default_disks() + .capture_output(); + + let mut child = cmd.spawn().unwrap(); + + guest.wait_vm_boot().unwrap(); + + // Add a network device with non-static device id request + let r = std::panic::catch_unwind(|| { + let (cmd_success, cmd_stdout, _) = remote_command_w_output( + &api_socket, + "add-net", + Some( + format!( + "id=test0,tap=,mac={},ip={},mask=255.255.255.128", + guest.network.guest_mac1, guest.network.host_ip1, + ) + .as_str(), + ), + ); + assert!(cmd_success); + // We now know the first free device ID on the bus + let output = String::from_utf8(cmd_stdout).expect("should work"); + let (_, _, first_free_device_id, _) = bdf_from_hotplug_response(output.as_str()); + assert_ne!(first_free_device_id, 0); + + // We expect a match from grep + let _ = String::from( + guest + .ssh_command(&format!( + "lspci -n | grep \"00:{first_free_device_id:02x}.0\"" + )) + .unwrap() + .trim(), + ); + // Calculate the succeeding device ID + let device_id_to_allocate = first_free_device_id + 1; + // We expect the succeeding device ID to be free + assert!(matches!( + guest.ssh_command(&format!( + "lspci -n | grep \"00:{device_id_to_allocate:02x}.0\"" + )), + 
Err(SshCommandError::NonZeroExitStatus(1)) + )); + + // Add a device to the next device slot explicitly + let (cmd_success, cmd_stdout, _) = remote_command_w_output( + &api_socket, + "add-net", + Some( + format!( + "id=test1337,tap=,mac={},ip={},mask=255.255.255.128,pci_device_id={}", + guest.network.guest_mac1, guest.network.host_ip1, device_id_to_allocate, + ) + .as_str(), + ), + ); + assert!(cmd_success); + // Retrieve what BDF we actually reserved and assert it's equal to that we wanted to reserve + let output = String::from_utf8(cmd_stdout).expect("should work"); + let (_, _, allocated_device_id, _) = bdf_from_hotplug_response(output.as_str()); + assert_eq!(device_id_to_allocate, allocated_device_id); + // Check that the device ID is really in use + let _ = String::from( + guest + .ssh_command(&format!( + "lspci -n | grep \"00:{allocated_device_id:02x}.0\"" + )) + .unwrap() + .trim(), + ); + // Remove the first device to create a hole + let cmd_success = remote_command(&api_socket, "remove-device", Some("test0")); + assert!(cmd_success); + thread::sleep(std::time::Duration::new(5, 0)); + // We left a hole in the used PCI IDs. 
The guest sees no device on the respective ID + assert!(matches!( + guest.ssh_command(&format!( + "lspci -n | grep \"00:{first_free_device_id:02x}.0\"" + )), + Err(SshCommandError::NonZeroExitStatus(1)) + )); + // Reuse the device ID hole by dynamically coalescing with the first free ID + let (cmd_success, cmd_stdout, _) = remote_command_w_output( + &api_socket, + "add-net", + Some( + format!( + "id=test0,tap=,mac={},ip={},mask=255.255.255.128", + guest.network.guest_mac1, guest.network.host_ip1, + ) + .as_str(), + ), + ); + assert!(cmd_success); + // Check that CHV reports that we added the same device to the same ID + let output = String::from_utf8(cmd_stdout).expect("should work"); + let (_, _, allocated_device_id, _) = bdf_from_hotplug_response(output.as_str()); + assert_eq!(first_free_device_id, allocated_device_id); + + // Check that guest sees the same device again at the same BDF + let _ = String::from( + guest + .ssh_command(&format!( + "lspci -n | grep \"00:{allocated_device_id:02x}.0\"" + )) + .unwrap() + .trim(), + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); + } } mod dbus_api { From f81faad0a15190c85b6668ac4416b1e4a78854fd Mon Sep 17 00:00:00 2001 From: Pascal Scholz Date: Thu, 26 Mar 2026 11:48:03 +0100 Subject: [PATCH 699/742] tests: Add an integration test to check duplicate PCI device IDs This integration test verifies that the same device ID cannot be allocated twice. Moreover, we check that the returned error matches our expectations. 
Signed-off-by: Pascal Scholz On-behalf-of: SAP pascal.scholz@sap.com Signed-off-by: Rob Bradford --- cloud-hypervisor/tests/integration.rs | 78 +++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 54394903c8..51d4b06e86 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -5837,6 +5837,84 @@ mod common_parallel { handle_child_output(r, &output); } + + #[test] + // Test that adding a duplicate PCI device ID fails + fn test_duplicate_pci_device_id() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)); + + #[cfg(target_arch = "x86_64")] + let kernel_path = direct_kernel_boot_path(); + #[cfg(target_arch = "aarch64")] + let kernel_path = edk2_path(); + + let api_socket = temp_api_path(&guest.tmp_dir); + + // Boot without network + let mut cmd = GuestCommand::new(&guest); + + cmd.args(["--api-socket", &api_socket]) + .default_cpus() + .default_memory() + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .default_net() + .default_disks() + .capture_output(); + + let mut child = cmd.spawn().unwrap(); + + guest.wait_vm_boot().unwrap(); + + // Add a network device with non-static device ID request + let r = std::panic::catch_unwind(|| { + let (cmd_success, cmd_stdout, _) = remote_command_w_output( + &api_socket, + "add-net", + Some( + format!( + "id=test0,tap=,mac={},ip={},mask=255.255.255.128", + guest.network.guest_mac1, guest.network.host_ip1, + ) + .as_str(), + ), + ); + assert!(cmd_success); + + // We now know the first free device ID on the bus + let output = String::from_utf8(cmd_stdout).expect("should work"); + let (_, _, first_free_device_id, _) = bdf_from_hotplug_response(output.as_str()); + assert_ne!(first_free_device_id, 0); + + let (cmd_success, _, cmd_stderr) = remote_command_w_output( + 
&api_socket, + "add-net", + Some( + format!( + "id=test1337,tap=,mac={},ip={},mask=255.255.255.128,pci_device_id={first_free_device_id}", + guest.network.guest_mac1, guest.network.host_ip1, + ) + .as_str(), + ), + ); + // Check for fail; Allocating the same device ID for two devices is disallowed + assert!(!cmd_success); + // Check that the error message contains the expected error + let std_err_str = String::from_utf8(cmd_stderr).unwrap(); + assert!( + std_err_str.contains(&format!( + "Valid PCI device identifier but already used: {first_free_device_id}" + )), + "Command return was: {std_err_str}" + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); + } } mod dbus_api { From 18873d88d7b61e02c4b4fce5fafa9172a6e2dee0 Mon Sep 17 00:00:00 2001 From: Pascal Scholz Date: Wed, 25 Mar 2026 18:34:53 +0100 Subject: [PATCH 700/742] tests: Add an integration test for PCI device ID allocation errors Adds a test that checks the correct error is returned on allocation of an invalid device ID (one that is not in the range 0-31) and when trying to allocate a reserved ID (such as that of the root bridge). Signed-off-by: Pascal Scholz On-behalf-of: SAP pascal.scholz@sap.com Signed-off-by: Rob Bradford --- cloud-hypervisor/tests/integration.rs | 80 +++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 51d4b06e86..8657edc3bd 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -5915,6 +5915,86 @@ mod common_parallel { handle_child_output(r, &output); } + + #[test] + // Test that requesting an invalid device ID fails. 
+ fn test_invalid_pci_device_id() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)); + + #[cfg(target_arch = "x86_64")] + let kernel_path = direct_kernel_boot_path(); + #[cfg(target_arch = "aarch64")] + let kernel_path = edk2_path(); + + let api_socket = temp_api_path(&guest.tmp_dir); + + // Boot without network + let mut cmd = GuestCommand::new(&guest); + + cmd.args(["--api-socket", &api_socket]) + .default_cpus() + .default_memory() + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .default_net() + .default_disks() + .capture_output(); + + let mut child = cmd.spawn().unwrap(); + + guest.wait_vm_boot().unwrap(); + + let r = std::panic::catch_unwind(|| { + // Invalid API call because the PCI device ID is out of range + let (cmd_success, _, cmd_stderr) = remote_command_w_output( + &api_socket, + "add-net", + Some( + format!( + "id=test0,tap=,mac={},ip={},mask=255.255.255.128,pci_device_id=188", + guest.network.guest_mac1, guest.network.host_ip1, + ) + .as_str(), + ), + ); + // Check for fail + assert!(!cmd_success); + // Check that the error message contains the expected error + let std_err_str = String::from_utf8(cmd_stderr).unwrap(); + assert!( + std_err_str + .contains("Given PCI device ID (188) is out of the supported range of 0..32"), + "Command return was: {std_err_str}", + ); + + // Use the reserved device ID 0 (root device) + let (cmd_success, _, cmd_stderr) = remote_command_w_output( + &api_socket, + "add-net", + Some( + format!( + "id=test0,tap=,mac={},ip={},mask=255.255.255.128,pci_device_id=0", + guest.network.guest_mac1, guest.network.host_ip1, + ) + .as_str(), + ), + ); + // Check for fail + assert!(!cmd_success); + // Check that the error message contains the expected error + let std_err_str = String::from_utf8(cmd_stderr).unwrap(); + assert!( + std_err_str.contains("Given PCI device ID (0) is reserved"), + "Command return 
was: {std_err_str}" + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); + } } mod dbus_api { From d449983495db036307e6f0acc60bfebdc1b046c9 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Thu, 16 Apr 2026 21:01:25 +0100 Subject: [PATCH 701/742] vmm: Be consistent with PCI bus reservation nomenclature Our bus slots are now Reserved/Allocated/Free so change the method to free it to free_device_id() and update error. Also update to take u8 to match the other methods. Signed-off-by: Rob Bradford --- pci/src/bus.rs | 15 +++++++++++---- vmm/src/device_manager.rs | 10 +++++----- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/pci/src/bus.rs b/pci/src/bus.rs index 4e52ebc9be..89efd2ed68 100644 --- a/pci/src/bus.rs +++ b/pci/src/bus.rs @@ -243,12 +243,19 @@ impl PciBus { } } - pub fn put_device_id(&mut self, id: usize) -> Result<()> { - if id < NUM_DEVICE_IDS as usize { - self.device_ids[id] = DeviceIdState::Free; + /// Frees a PCI device ID on the bus. + /// + /// - `id`: ID to free on the bus. + /// + /// ## Errors + /// * Returns [`PciRootError::InvalidPciDeviceSlot`] if the slot + /// exceeds [`NUM_DEVICE_IDS`]. + pub fn free_device_id(&mut self, id: u8) -> Result<()> { + if id < NUM_DEVICE_IDS { + self.device_ids[id as usize] = DeviceIdState::Free; Ok(()) } else { - Err(PciRootError::InvalidPciDeviceSlot(id)) + Err(PciRootError::InvalidPciDeviceSlot(id as usize)) } } } diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 6c52026e1a..9f65784ecb 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -500,9 +500,9 @@ pub enum DeviceManagerError { #[error("Could not reserve the PCI device ID")] ReservePciDeviceId(#[source] pci::PciRootError), - /// Could not give the PCI device ID back. - #[error("Could not give the PCI device ID back")] - PutPciDeviceId(#[source] pci::PciRootError), + /// Could not free the PCI device ID. 
+ #[error("Could not free PCI device ID")] + FreePciDeviceId(#[source] pci::PciRootError), /// No disk path was specified when one was expected #[error("No disk path was specified when one was expected")] @@ -4892,8 +4892,8 @@ impl DeviceManager { .pci_bus .lock() .unwrap() - .put_device_id(device_id as usize) - .map_err(DeviceManagerError::PutPciDeviceId)?; + .free_device_id(device_id) + .map_err(DeviceManagerError::FreePciDeviceId)?; let (pci_device_handle, id) = { // Remove the device from the device tree along with its children. From 67cf328a9e241fc6d12dd96950f1cb3e2deb6aca Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Thu, 16 Apr 2026 23:11:21 +0100 Subject: [PATCH 702/742] tests: Speed up test_pci_device_id() This test was taking > 300s due to SSH backoffs. Signed-off-by: Rob Bradford --- cloud-hypervisor/tests/integration.rs | 88 +++++++++++++++------------ test_infra/src/lib.rs | 2 +- 2 files changed, 50 insertions(+), 40 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 8657edc3bd..195ca9665b 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -5747,22 +5747,26 @@ mod common_parallel { let (_, _, first_free_device_id, _) = bdf_from_hotplug_response(output.as_str()); assert_ne!(first_free_device_id, 0); - // We expect a match from grep - let _ = String::from( - guest - .ssh_command(&format!( - "lspci -n | grep \"00:{first_free_device_id:02x}.0\"" - )) - .unwrap() - .trim(), - ); + // Wait for the hotplugged device to appear in the guest + assert!(wait_until(Duration::from_secs(10), || { + ssh_command_ip_with_auth( + &format!("lspci -n | grep \"00:{first_free_device_id:02x}.0\""), + &default_guest_auth(), + &guest.network.guest_ip0, + Some(Duration::from_secs(1)), + ) + .is_ok() + })); // Calculate the succeeding device ID let device_id_to_allocate = first_free_device_id + 1; - // We expect the succeeding device ID to be free + // We expect the 
succeeding device ID to be free (single attempt, no retries) assert!(matches!( - guest.ssh_command(&format!( - "lspci -n | grep \"00:{device_id_to_allocate:02x}.0\"" - )), + ssh_command_ip_with_auth( + &format!("lspci -n | grep \"00:{device_id_to_allocate:02x}.0\""), + &default_guest_auth(), + &guest.network.guest_ip0, + Some(Duration::from_secs(1)), + ), Err(SshCommandError::NonZeroExitStatus(1)) )); @@ -5783,26 +5787,31 @@ mod common_parallel { let output = String::from_utf8(cmd_stdout).expect("should work"); let (_, _, allocated_device_id, _) = bdf_from_hotplug_response(output.as_str()); assert_eq!(device_id_to_allocate, allocated_device_id); - // Check that the device ID is really in use - let _ = String::from( - guest - .ssh_command(&format!( - "lspci -n | grep \"00:{allocated_device_id:02x}.0\"" - )) - .unwrap() - .trim(), - ); + // Wait for the hotplugged device to appear in the guest + assert!(wait_until(Duration::from_secs(10), || { + ssh_command_ip_with_auth( + &format!("lspci -n | grep \"00:{allocated_device_id:02x}.0\""), + &default_guest_auth(), + &guest.network.guest_ip0, + Some(Duration::from_secs(1)), + ) + .is_ok() + })); // Remove the first device to create a hole let cmd_success = remote_command(&api_socket, "remove-device", Some("test0")); assert!(cmd_success); - thread::sleep(std::time::Duration::new(5, 0)); - // We left a hole in the used PCI IDs. 
The guest sees no device on the respective ID - assert!(matches!( - guest.ssh_command(&format!( - "lspci -n | grep \"00:{first_free_device_id:02x}.0\"" - )), - Err(SshCommandError::NonZeroExitStatus(1)) - )); + // Wait for the device to disappear from the guest + assert!(wait_until(Duration::from_secs(10), || { + matches!( + ssh_command_ip_with_auth( + &format!("lspci -n | grep \"00:{first_free_device_id:02x}.0\""), + &default_guest_auth(), + &guest.network.guest_ip0, + Some(Duration::from_secs(1)), + ), + Err(SshCommandError::NonZeroExitStatus(1)) + ) + })); // Reuse the device ID hole by dynamically coalescing with the first free ID let (cmd_success, cmd_stdout, _) = remote_command_w_output( &api_socket, @@ -5821,15 +5830,16 @@ mod common_parallel { let (_, _, allocated_device_id, _) = bdf_from_hotplug_response(output.as_str()); assert_eq!(first_free_device_id, allocated_device_id); - // Check that guest sees the same device again at the same BDF - let _ = String::from( - guest - .ssh_command(&format!( - "lspci -n | grep \"00:{allocated_device_id:02x}.0\"" - )) - .unwrap() - .trim(), - ); + // Wait for the re-added device to appear in the guest + assert!(wait_until(Duration::from_secs(10), || { + ssh_command_ip_with_auth( + &format!("lspci -n | grep \"00:{allocated_device_id:02x}.0\""), + &default_guest_auth(), + &guest.network.guest_ip0, + Some(Duration::from_secs(1)), + ) + .is_ok() + })); }); kill_child(&mut child); diff --git a/test_infra/src/lib.rs b/test_infra/src/lib.rs index b39e5a0842..42acca1905 100644 --- a/test_infra/src/lib.rs +++ b/test_infra/src/lib.rs @@ -702,7 +702,7 @@ pub enum WaitForSshError { }, } -fn default_guest_auth() -> PasswordAuth { +pub fn default_guest_auth() -> PasswordAuth { PasswordAuth { username: String::from("cloud"), password: String::from("cloud123"), From e5dbf5242e25eda3310787fd698ba64bd7534b96 Mon Sep 17 00:00:00 2001 From: Vincent Thomas Date: Thu, 9 Apr 2026 14:46:08 +0000 Subject: [PATCH 703/742] virtio-devices: Make 
pause idempotent to prevent deadlock Previously, calling pause() when already paused would wait on a barrier for worker threads that were already parked, causing a deadlock. This situation occurs when the VMM thread holds a device mutex while calling an operation that triggers pause(), and a vCPU thread simultaneously needs that same mutex for MMIO access. With slow I/O backends (like RBD/Ceph), the timing window for this race is larger, making the deadlock more likely to occur, see [0]. Make pause() idempotent by checking the paused state atomically and returning early if already paused, avoiding the barrier wait. [0] https://github.com/cloud-hypervisor/cloud-hypervisor/pull/7948#discussion_r305052509 Signed-off-by: Vincent Thomas --- virtio-devices/src/device.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/virtio-devices/src/device.rs b/virtio-devices/src/device.rs index d1b9257995..3e6b30e796 100644 --- a/virtio-devices/src/device.rs +++ b/virtio-devices/src/device.rs @@ -340,7 +340,13 @@ impl Pausable for VirtioCommon { "Pausing virtio-{}", VirtioDeviceType::from(self.device_type) ); - self.paused.store(true, Ordering::SeqCst); + + // If already paused, return early to avoid deadlock waiting on barrier + // for worker threads that are already parked. + if self.paused.swap(true, Ordering::SeqCst) { + return Ok(()); + } + if let Some(pause_evt) = &self.pause_evt { pause_evt .write(1) From fd8ded9d787a67843fd5c3d46c03f4e1c6afa042 Mon Sep 17 00:00:00 2001 From: Vincent Thomas Date: Thu, 9 Apr 2026 14:45:12 +0000 Subject: [PATCH 704/742] block: Fix resize for block device backends Block devices (LVM volumes, loop devices, RBD, etc.) cannot be resized via ftruncate - they are resized externally. When vm.resize-disk is called for a block device backend, verify the device size matches the requested size instead of attempting ftruncate. 
This enables the resize-disk API to work with block device backends by validating the externally-resized device matches the expected size. Signed-off-by: Vincent Thomas --- block/src/raw_async.rs | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/block/src/raw_async.rs b/block/src/raw_async.rs index 90332aa4b8..7fa3208f42 100644 --- a/block/src/raw_async.rs +++ b/block/src/raw_async.rs @@ -3,7 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause use std::fs::File; -use std::io::Error; +use std::io::{self, Error}; +use std::os::unix::fs::FileTypeExt; use std::os::unix::io::{AsRawFd, RawFd}; use io_uring::{IoUring, opcode, types}; @@ -68,9 +69,30 @@ impl disk_file::SparseCapable for RawFileDisk { impl disk_file::Resizable for RawFileDisk { fn resize(&mut self, size: u64) -> BlockResult<()> { - self.file - .set_len(size) - .map_err(|e| BlockError::new(BlockErrorKind::Io, DiskFileError::ResizeError(e))) + let fd_metadata = self + .file + .metadata() + .map_err(|e| BlockError::new(BlockErrorKind::Io, DiskFileError::ResizeError(e)))?; + + if fd_metadata.file_type().is_block_device() { + // Block devices cannot be resized via ftruncate - they are resized + // externally (LVM, losetup -c, etc.). Verify the size matches. 
+ let (actual_size, _) = query_device_size(&self.file) + .map_err(|e| BlockError::new(BlockErrorKind::Io, DiskFileError::ResizeError(e)))?; + if actual_size != size { + return Err(BlockError::new( + BlockErrorKind::Io, + DiskFileError::ResizeError(io::Error::other(format!( + "Block device size {actual_size} does not match requested size {size}" + ))), + )); + } + Ok(()) + } else { + self.file + .set_len(size) + .map_err(|e| BlockError::new(BlockErrorKind::Io, DiskFileError::ResizeError(e))) + } } } From 11a86fee3e3398f2c75ee638d076c4f0e76db2da Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 16 Apr 2026 13:54:57 +0200 Subject: [PATCH 705/742] tests: Add QCOW2 direct I/O UEFI boot integration test Boot a UEFI guest from a QCOW2 image with direct=on to exercise the aligned I/O write path during early firmware operations. Signed-off-by: Anatol Belski --- cloud-hypervisor/tests/integration.rs | 90 +++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 195ca9665b..0e59acbe70 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -1309,6 +1309,96 @@ mod common_parallel { .expect("Failed to read back data after discard stress"); }); } + + #[test] + fn test_virtio_block_qcow2_uefi_direct_io() { + // Regression test for #8007. + // Place the QCOW2 OS image on a 4096 byte sector filesystem so + // O_DIRECT forces 4096 byte alignment on all I/O buffers. 
+ let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME_QCOW2.to_string()); + let guest = Guest::new(Box::new(disk_config)); + let kernel_path = edk2_path(); + + let mut workloads_path = dirs::home_dir().unwrap(); + workloads_path.push("workloads"); + let img_dir = TempDir::new_in(workloads_path.as_path()).unwrap(); + let fs_img_path = img_dir.as_path().join("fs_4ksec.img"); + + assert!( + exec_host_command_output(&format!("truncate -s 4G {}", fs_img_path.to_str().unwrap())) + .status + .success(), + "truncate failed" + ); + + let loop_dev_path = create_loop_device(fs_img_path.to_str().unwrap(), 4096, 5); + + assert!( + exec_host_command_output(&format!("mkfs.ext4 -q {loop_dev_path}")) + .status + .success(), + "mkfs.ext4 failed" + ); + + let mnt_dir = img_dir.as_path().join("mnt"); + fs::create_dir_all(&mnt_dir).unwrap(); + assert!( + exec_host_command_output(&format!( + "mount {} {}", + &loop_dev_path, + mnt_dir.to_str().unwrap() + )) + .status + .success(), + "mount failed" + ); + + let src_qcow2 = guest.disk_config.disk(DiskType::OperatingSystem).unwrap(); + let dest_qcow2 = mnt_dir.join("os.qcow2"); + assert!( + exec_host_command_output(&format!( + "cp {} {}", + &src_qcow2, + dest_qcow2.to_str().unwrap() + )) + .status + .success(), + "cp failed" + ); + + let mut child = GuestCommand::new(&guest) + .default_cpus() + .default_memory() + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args([ + "--disk", + &format!( + "path={},direct=on,image_type=qcow2", + dest_qcow2.to_str().unwrap() + ), + &format!( + "path={}", + guest.disk_config.disk(DiskType::CloudInit).unwrap() + ), + ]) + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + let _ = exec_host_command_output(&format!("umount {}", mnt_dir.to_str().unwrap())); + let _ = exec_host_command_output(&format!("losetup -d 
{loop_dev_path}")); + + handle_child_output(r, &output); + } + #[test] fn test_virtio_block_qcow2_dirty_bit_unclean_shutdown() { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME_QCOW2.to_string()); From 854b6862937d4d310335df065dabd88d4bfb39c7 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 16 Apr 2026 22:40:42 +0200 Subject: [PATCH 706/742] block: qcow: Test async alignment() returns SECTOR_SIZE Verify that QcowAsync reports the default SECTOR_SIZE alignment when O_DIRECT is not active. Signed-off-by: Anatol Belski --- block/src/qcow_async.rs | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/block/src/qcow_async.rs b/block/src/qcow_async.rs index e65a4ac757..16272f4965 100644 --- a/block/src/qcow_async.rs +++ b/block/src/qcow_async.rs @@ -630,7 +630,7 @@ mod unit_tests { use super::*; use crate::disk_file::AsyncDiskFile; use crate::qcow::{QcowFile, RawFile}; - use crate::{BatchRequest, RequestType}; + use crate::{BatchRequest, RequestType, SECTOR_SIZE}; fn create_disk_with_data( file_size: u64, @@ -995,4 +995,18 @@ mod unit_tests { ); } } + + #[test] + fn test_qcow_async_alignment_without_direct_io() { + let file_size = 100 * 1024 * 1024; + let temp_file = TempFile::new().unwrap(); + { + let raw_file = RawFile::new(temp_file.as_file().try_clone().unwrap(), false); + QcowFile::new(raw_file, 3, file_size, true).unwrap(); + } + let disk = QcowDiskAsync::new(temp_file.as_file().try_clone().unwrap(), false, false, true) + .unwrap(); + let async_io = disk.new_async_io(1).unwrap(); + assert_eq!(async_io.alignment(), SECTOR_SIZE); + } } From dd79b1899d657c0c18b1a9ac5bf34039677ee965 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 16 Apr 2026 22:41:53 +0200 Subject: [PATCH 707/742] block: qcow: Test async alignment() with O_DIRECT Verify that QcowAsync reports at least SECTOR_SIZE alignment when O_DIRECT is active. Skipped on filesystems that do not support O_DIRECT (e.g. tmpfs). 
Signed-off-by: Anatol Belski --- block/src/qcow_async.rs | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/block/src/qcow_async.rs b/block/src/qcow_async.rs index 16272f4965..390b863e9d 100644 --- a/block/src/qcow_async.rs +++ b/block/src/qcow_async.rs @@ -1009,4 +1009,27 @@ mod unit_tests { let async_io = disk.new_async_io(1).unwrap(); assert_eq!(async_io.alignment(), SECTOR_SIZE); } + + /// Returns None if O_DIRECT is not supported (e.g. tmpfs). + fn try_create_direct_io_disk(temp_file: &TempFile, file_size: u64) -> Option { + { + let raw_file = RawFile::new(temp_file.as_file().try_clone().unwrap(), false); + QcowFile::new(raw_file, 3, file_size, true).unwrap(); + } + QcowDiskAsync::new(temp_file.as_file().try_clone().unwrap(), true, false, true).ok() + } + + #[test] + fn test_qcow_async_alignment_with_direct_io() { + let temp_file = TempFile::new().unwrap(); + let disk = match try_create_direct_io_disk(&temp_file, 100 * 1024 * 1024) { + Some(d) => d, + None => { + eprintln!("skipping: O_DIRECT not supported on this filesystem"); + return; + } + }; + let async_io = disk.new_async_io(1).unwrap(); + assert!(async_io.alignment() >= SECTOR_SIZE); + } } From e67195ce481c182887306852992e869b6fe1cfe5 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 16 Apr 2026 22:44:57 +0200 Subject: [PATCH 708/742] block: qcow: Test sub sector O_DIRECT read Verify that a 512 byte read from an allocated cluster succeeds with O_DIRECT. This exercises the synchronous fallback path in resolve_read() that is taken when alignment is nonzero. 
Signed-off-by: Anatol Belski --- block/src/qcow_async.rs | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/block/src/qcow_async.rs b/block/src/qcow_async.rs index 390b863e9d..cbcd620422 100644 --- a/block/src/qcow_async.rs +++ b/block/src/qcow_async.rs @@ -1032,4 +1032,25 @@ mod unit_tests { let async_io = disk.new_async_io(1).unwrap(); assert!(async_io.alignment() >= SECTOR_SIZE); } + + #[test] + fn test_qcow_async_sub_sector_read_with_direct_io() { + let temp_file = TempFile::new().unwrap(); + let disk = match try_create_direct_io_disk(&temp_file, 100 * 1024 * 1024) { + Some(d) => d, + None => { + eprintln!("skipping: O_DIRECT not supported on this filesystem"); + return; + } + }; + + let pattern = vec![0xAB; 65536]; + async_write(&disk, 0, &pattern); + + let buf = async_read(&disk, 0, 512); + assert!( + buf.iter().all(|&b| b == 0xAB), + "sub-sector O_DIRECT read should return written data" + ); + } } From 98c533a501d5eff51bf16bb1a78b043f86450626 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 16 Apr 2026 22:46:38 +0200 Subject: [PATCH 709/742] block: qcow: Test O_DIRECT write and read roundtrip Write 128K of patterned data and read it back with O_DIRECT active to verify the aligned I/O paths produce correct results. 
Signed-off-by: Anatol Belski --- block/src/qcow_async.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/block/src/qcow_async.rs b/block/src/qcow_async.rs index cbcd620422..2794b4235f 100644 --- a/block/src/qcow_async.rs +++ b/block/src/qcow_async.rs @@ -1053,4 +1053,22 @@ mod unit_tests { "sub-sector O_DIRECT read should return written data" ); } + + #[test] + fn test_qcow_async_direct_io_write_read_roundtrip() { + let temp_file = TempFile::new().unwrap(); + let disk = match try_create_direct_io_disk(&temp_file, 100 * 1024 * 1024) { + Some(d) => d, + None => { + eprintln!("skipping: O_DIRECT not supported on this filesystem"); + return; + } + }; + + let pattern: Vec = (0..128 * 1024).map(|i| (i % 251) as u8).collect(); + async_write(&disk, 0, &pattern); + + let buf = async_read(&disk, 0, pattern.len()); + assert_eq!(buf, pattern, "O_DIRECT roundtrip should match"); + } } From 4772235952b87190a274713df8e6d0edd8093357 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Thu, 16 Apr 2026 18:01:34 +0200 Subject: [PATCH 710/742] block: qcow: Fix O_DIRECT EINVAL in async io_uring path Override AsyncIo::alignment() to report the actual device sector size so that execute_async() correctly bounces misaligned guest memory pointers. Guard the io_uring fast path in resolve_read() with an alignment check. When O_DIRECT is active, guest requests can have I/O sizes smaller than the device sector size (e.g. 512 byte UEFI reads on a 4096 byte sector device). The kernel rejects these with EINVAL. Route such reads through scatter_read_sync() which uses AlignedBuf and aligned_pread to satisfy O_DIRECT size and offset requirements. Signed-off-by: Anatol Belski --- block/src/qcow_async.rs | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/block/src/qcow_async.rs b/block/src/qcow_async.rs index 2794b4235f..308c2e27d0 100644 --- a/block/src/qcow_async.rs +++ b/block/src/qcow_async.rs @@ -6,7 +6,7 @@ //! QCOW2 async disk backend. 
-use std::cmp::min; +use std::cmp::{max, min}; use std::collections::VecDeque; use std::fs::File; use std::io::Error; @@ -30,7 +30,7 @@ use crate::qcow_common::{ AlignedBuf, aligned_pread, aligned_pwrite, gather_from_iovecs_into, pread_exact, pwrite_all, scatter_to_iovecs, zero_fill_iovecs, }; -use crate::{BatchRequest, RequestType, disk_file}; +use crate::{BatchRequest, RequestType, SECTOR_SIZE, disk_file}; /// Device level handle for a QCOW2 image. /// @@ -175,6 +175,8 @@ pub struct QcowAsync { sparse: bool, /// O_DIRECT alignment requirement (0 = no alignment needed). alignment: usize, + /// I/O alignment for the AsyncIo trait (at least SECTOR_SIZE). + io_alignment: u64, io_uring: IoUring, eventfd: EventFd, completion_list: VecDeque<(u64, i32)>, @@ -189,6 +191,7 @@ impl QcowAsync { ring_depth: u32, ) -> io::Result { let alignment = data_file.file().alignment(); + let io_alignment = max(alignment as u64, SECTOR_SIZE); let io_uring = IoUring::new(ring_depth)?; let eventfd = EventFd::new(libc::EFD_NONBLOCK)?; io_uring.submitter().register_eventfd(eventfd.as_raw_fd())?; @@ -199,6 +202,7 @@ impl QcowAsync { backing_file, sparse, alignment, + io_alignment, io_uring, eventfd, completion_list: VecDeque::new(), @@ -367,6 +371,10 @@ impl AsyncIo for QcowAsync { true } + fn alignment(&self) -> u64 { + self.io_alignment + } + fn submit_batch_requests(&mut self, batch_request: &[BatchRequest]) -> AsyncIoResult<()> { let (submitter, mut sq, _) = self.io_uring.split(); let mut needs_submit = false; @@ -464,7 +472,15 @@ impl QcowAsync { .map_clusters_for_read(address, total_len, has_backing) .map_err(AsyncIoError::ReadVectored)?; - if mappings.len() == 1 + // The fast path returns a host offset so the caller can submit a + // single io_uring readv with the original iovecs. This only works + // without O_DIRECT because it requires I/O + // size and file offset to be multiples of the device sector size. + // Guest requests can be smaller (e.g. 
512 byte UEFI reads on a + // 4096 byte sector device), so O_DIRECT reads fall through to the + // alignment aware synchronous path instead. + if alignment == 0 + && mappings.len() == 1 && let ClusterReadMapping::Allocated { offset: host_offset, length, From e38f00644d247894442402e098435fcccf5ce428 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Fri, 17 Apr 2026 09:23:12 +0200 Subject: [PATCH 711/742] tests: Replace manual losetup with create_loop_device() call Use the ioctl based create_loop_device() helper instead of shelling out to losetup in the file backed 4K alignment test. Signed-off-by: Anatol Belski --- cloud-hypervisor/tests/integration.rs | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 0e59acbe70..0dc7a8d64d 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -3529,12 +3529,7 @@ mod common_parallel { "truncate failed" ); - let loop_dev = exec_host_command_output(&format!( - "losetup --find --show --sector-size 4096 {}", - fs_img_path.to_str().unwrap() - )); - assert!(loop_dev.status.success(), "losetup failed"); - let loop_dev_path = String::from_utf8_lossy(&loop_dev.stdout).trim().to_string(); + let loop_dev_path = create_loop_device(fs_img_path.to_str().unwrap(), 4096, 5); assert!( exec_host_command_output(&format!("mkfs.ext4 -q {loop_dev_path}")) From ea64d109c72da739590135042ab9b609c9b3814a Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Fri, 17 Apr 2026 08:51:28 +0100 Subject: [PATCH 712/742] virtio-devices: Add feature acked gated accessor for AccessPlatform Add VirtioCommon::access_platform() method. The virtio spec requires that only if the feature is acked should the accesses be transformed via the access platform implementation. This will enable that filtering. 
Signed-off-by: Rob Bradford --- virtio-devices/src/device.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/virtio-devices/src/device.rs b/virtio-devices/src/device.rs index 3e6b30e796..ca2e5834e5 100644 --- a/virtio-devices/src/device.rs +++ b/virtio-devices/src/device.rs @@ -16,6 +16,7 @@ use std::thread; use anyhow::anyhow; use libc::EFD_NONBLOCK; use log::{error, info, warn}; +use virtio_bindings::virtio_config::VIRTIO_F_ACCESS_PLATFORM; use virtio_queue::Queue; use vm_device::UserspaceMapping; use vm_memory::{GuestAddress, GuestMemoryAtomic}; @@ -332,6 +333,15 @@ impl VirtioCommon { // requires the addresses held by the descriptors to be translated. self.avail_features &= !(1 << VIRTIO_F_RING_INDIRECT_DESC); } + + /// Returns the access platform only if the feature has been acked. + pub fn access_platform(&self) -> Option> { + if self.feature_acked(VIRTIO_F_ACCESS_PLATFORM as u64) { + self.access_platform.clone() + } else { + None + } + } } impl Pausable for VirtioCommon { From 81ee260ac3336c32242b8b9872c7400d5ec76478 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Fri, 17 Apr 2026 08:56:44 +0100 Subject: [PATCH 713/742] virtio-devices: Use new VirtioCommon::access_platform() accessor Use the new virtio feature gated accessor when creating the handlers for the virtio devices. This now means that the translations via the accessor will only be applied if the feature is acked in accordance with the spec. 
Signed-off-by: Rob Bradford --- virtio-devices/src/block.rs | 2 +- virtio-devices/src/console.rs | 2 +- virtio-devices/src/net.rs | 4 ++-- virtio-devices/src/pmem.rs | 2 +- virtio-devices/src/rng.rs | 2 +- virtio-devices/src/vdpa.rs | 6 +++--- virtio-devices/src/vsock/device.rs | 2 +- 7 files changed, 10 insertions(+), 10 deletions(-) diff --git a/virtio-devices/src/block.rs b/virtio-devices/src/block.rs index bd740fc4a5..961ab9fdec 100644 --- a/virtio-devices/src/block.rs +++ b/virtio-devices/src/block.rs @@ -1140,7 +1140,7 @@ impl VirtioDevice for Block { .map(|r| r.new_handle()) .transpose() .unwrap(), - access_platform: self.common.access_platform.clone(), + access_platform: self.common.access_platform(), host_cpus: self.queue_affinity.get(&queue_idx).cloned(), acked_features: self.common.acked_features, disable_sector0_writes: self.disable_sector0_writes, diff --git a/virtio-devices/src/console.rs b/virtio-devices/src/console.rs index d2d57b9e50..b7519626d3 100644 --- a/virtio-devices/src/console.rs +++ b/virtio-devices/src/console.rs @@ -751,7 +751,7 @@ impl VirtioDevice for Console { self.resize_pipe.as_ref().map(|p| p.try_clone().unwrap()), kill_evt, pause_evt, - self.common.access_platform.clone(), + self.common.access_platform(), ); let paused = self.common.paused.clone(); diff --git a/virtio-devices/src/net.rs b/virtio-devices/src/net.rs index ed8c05eeb8..7c50b4f8b8 100644 --- a/virtio-devices/src/net.rs +++ b/virtio-devices/src/net.rs @@ -765,7 +765,7 @@ impl VirtioDevice for Net { ctrl_q: CtrlQueue::new(self.taps.clone()), queue: ctrl_queue, queue_evt: ctrl_queue_evt, - access_platform: self.common.access_platform.clone(), + access_platform: self.common.access_platform(), queue_index: ctrl_queue_index as u16, interrupt_cb: interrupt_cb.clone(), }; @@ -837,7 +837,7 @@ impl VirtioDevice for Net { rx_desc_avail: false, rx_rate_limiter, tx_rate_limiter, - access_platform: self.common.access_platform.clone(), + access_platform: 
self.common.access_platform(), }, mem: mem.clone(), queue_index_base: (i * 2) as u16, diff --git a/virtio-devices/src/pmem.rs b/virtio-devices/src/pmem.rs index 3abec1c0f0..91e0f65797 100644 --- a/virtio-devices/src/pmem.rs +++ b/virtio-devices/src/pmem.rs @@ -404,7 +404,7 @@ impl VirtioDevice for Pmem { queue_evt, kill_evt, pause_evt, - access_platform: self.common.access_platform.clone(), + access_platform: self.common.access_platform(), }; let paused = self.common.paused.clone(); diff --git a/virtio-devices/src/rng.rs b/virtio-devices/src/rng.rs index 6bb0269c5e..e8fd5b3742 100644 --- a/virtio-devices/src/rng.rs +++ b/virtio-devices/src/rng.rs @@ -273,7 +273,7 @@ impl VirtioDevice for Rng { queue_evt, kill_evt, pause_evt, - access_platform: self.common.access_platform.clone(), + access_platform: self.common.access_platform(), }; let paused = self.common.paused.clone(); diff --git a/virtio-devices/src/vdpa.rs b/virtio-devices/src/vdpa.rs index a35c35eb56..ff7305395e 100644 --- a/virtio-devices/src/vdpa.rs +++ b/virtio-devices/src/vdpa.rs @@ -251,21 +251,21 @@ impl Vdpa { desc_table_addr: queue .desc_table() .translate_gpa( - self.common.access_platform.as_deref(), + self.common.access_platform().as_deref(), queue_size as usize * std::mem::size_of::(), ) .map_err(Error::TranslateAddress)?, used_ring_addr: queue .used_ring() .translate_gpa( - self.common.access_platform.as_deref(), + self.common.access_platform().as_deref(), 4 + queue_size as usize * 8, ) .map_err(Error::TranslateAddress)?, avail_ring_addr: queue .avail_ring() .translate_gpa( - self.common.access_platform.as_deref(), + self.common.access_platform().as_deref(), 4 + queue_size as usize * 2, ) .map_err(Error::TranslateAddress)?, diff --git a/virtio-devices/src/vsock/device.rs b/virtio-devices/src/vsock/device.rs index 25412503af..dd877fa55f 100644 --- a/virtio-devices/src/vsock/device.rs +++ b/virtio-devices/src/vsock/device.rs @@ -466,7 +466,7 @@ where pause_evt, interrupt_cb, backend: 
self.backend.clone(), - access_platform: self.common.access_platform.clone(), + access_platform: self.common.access_platform(), }; let paused = self.common.paused.clone(); From 636da215e3b594f452c670ceeeb5d89fc6cad8f4 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Fri, 17 Apr 2026 09:10:12 +0100 Subject: [PATCH 714/742] virtio-devices: Add VirtioDevice::access_platform() Adding this method to the trait will allow the virtio PCI code to access a feature conditional version of the access platform and simplify the logic. Signed-off-by: Rob Bradford --- virtio-devices/src/device.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/virtio-devices/src/device.rs b/virtio-devices/src/device.rs index ca2e5834e5..89e1ee2eaf 100644 --- a/virtio-devices/src/device.rs +++ b/virtio-devices/src/device.rs @@ -188,6 +188,12 @@ pub trait VirtioDevice: Send { /// Set the access platform trait to let the device perform address /// translations if needed. fn set_access_platform(&mut self, _access_platform: Arc) {} + + /// Returns the access platform only if VIRTIO_F_ACCESS_PLATFORM was + /// negotiated with the guest. + fn access_platform(&self) -> Option> { + None + } } /// Trait to define address translation for devices managed by virtio-iommu From 128ee6d105d2594f67a4ed462e0d2e515e048a4d Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Fri, 17 Apr 2026 09:11:47 +0100 Subject: [PATCH 715/742] virtio-devices: Implement VirtioDevice::access_platform() This forwards through to the VirtioCommon implementation and can be used to simplify the virtio PCI access code. 
Signed-off-by: Rob Bradford --- virtio-devices/src/block.rs | 4 ++++ virtio-devices/src/console.rs | 4 ++++ virtio-devices/src/net.rs | 4 ++++ virtio-devices/src/pmem.rs | 4 ++++ virtio-devices/src/rng.rs | 4 ++++ virtio-devices/src/vdpa.rs | 4 ++++ virtio-devices/src/vsock/device.rs | 4 ++++ 7 files changed, 28 insertions(+) diff --git a/virtio-devices/src/block.rs b/virtio-devices/src/block.rs index 961ab9fdec..71c0b7ed39 100644 --- a/virtio-devices/src/block.rs +++ b/virtio-devices/src/block.rs @@ -1223,6 +1223,10 @@ impl VirtioDevice for Block { fn set_access_platform(&mut self, access_platform: Arc) { self.common.set_access_platform(access_platform); } + + fn access_platform(&self) -> Option> { + self.common.access_platform() + } } impl Pausable for Block { diff --git a/virtio-devices/src/console.rs b/virtio-devices/src/console.rs index b7519626d3..f141d08b14 100644 --- a/virtio-devices/src/console.rs +++ b/virtio-devices/src/console.rs @@ -782,6 +782,10 @@ impl VirtioDevice for Console { fn set_access_platform(&mut self, access_platform: Arc) { self.common.set_access_platform(access_platform); } + + fn access_platform(&self) -> Option> { + self.common.access_platform() + } } impl Pausable for Console { diff --git a/virtio-devices/src/net.rs b/virtio-devices/src/net.rs index 7c50b4f8b8..7ddbccca8e 100644 --- a/virtio-devices/src/net.rs +++ b/virtio-devices/src/net.rs @@ -900,6 +900,10 @@ impl VirtioDevice for Net { fn set_access_platform(&mut self, access_platform: Arc) { self.common.set_access_platform(access_platform); } + + fn access_platform(&self) -> Option> { + self.common.access_platform() + } } impl Pausable for Net { diff --git a/virtio-devices/src/pmem.rs b/virtio-devices/src/pmem.rs index 91e0f65797..eb94e7104d 100644 --- a/virtio-devices/src/pmem.rs +++ b/virtio-devices/src/pmem.rs @@ -441,6 +441,10 @@ impl VirtioDevice for Pmem { fn set_access_platform(&mut self, access_platform: Arc) { self.common.set_access_platform(access_platform); } + + fn 
access_platform(&self) -> Option> { + self.common.access_platform() + } } impl Pausable for Pmem { diff --git a/virtio-devices/src/rng.rs b/virtio-devices/src/rng.rs index e8fd5b3742..2f43ac66d5 100644 --- a/virtio-devices/src/rng.rs +++ b/virtio-devices/src/rng.rs @@ -305,6 +305,10 @@ impl VirtioDevice for Rng { fn set_access_platform(&mut self, access_platform: Arc) { self.common.set_access_platform(access_platform); } + + fn access_platform(&self) -> Option> { + self.common.access_platform() + } } impl Pausable for Rng { diff --git a/virtio-devices/src/vdpa.rs b/virtio-devices/src/vdpa.rs index ff7305395e..f9bf7a39de 100644 --- a/virtio-devices/src/vdpa.rs +++ b/virtio-devices/src/vdpa.rs @@ -465,6 +465,10 @@ impl VirtioDevice for Vdpa { fn set_access_platform(&mut self, access_platform: Arc) { self.common.set_access_platform(access_platform); } + + fn access_platform(&self) -> Option> { + self.common.access_platform() + } } impl Pausable for Vdpa { diff --git a/virtio-devices/src/vsock/device.rs b/virtio-devices/src/vsock/device.rs index dd877fa55f..0b202945b0 100644 --- a/virtio-devices/src/vsock/device.rs +++ b/virtio-devices/src/vsock/device.rs @@ -501,6 +501,10 @@ where fn set_access_platform(&mut self, access_platform: Arc) { self.common.set_access_platform(access_platform); } + + fn access_platform(&self) -> Option> { + self.common.access_platform() + } } impl Pausable for Vsock From adc5bb79580d15e1266232f5d4a9348fbef83f96 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Fri, 17 Apr 2026 09:14:44 +0100 Subject: [PATCH 716/742] virtio-devices: Add a VirtioDevice reference to VirtioPciCommonConfig Replace the stored AccessPlatform reference with one to the VirtioDevice. By doing this not only does it allow the code to be simplified but also now makes it virtio spec compliant by only translating via the access platform if the feature is acknowledged. 
Signed-off-by: Rob Bradford --- .../src/transport/pci_common_config.rs | 104 +++++++----------- virtio-devices/src/transport/pci_device.rs | 28 ++--- vmm/src/device_manager.rs | 2 +- 3 files changed, 50 insertions(+), 84 deletions(-) diff --git a/virtio-devices/src/transport/pci_common_config.rs b/virtio-devices/src/transport/pci_common_config.rs index 11e1d3ac70..379622bd97 100644 --- a/virtio-devices/src/transport/pci_common_config.rs +++ b/virtio-devices/src/transport/pci_common_config.rs @@ -14,7 +14,6 @@ use log::{debug, error, warn}; use serde::{Deserialize, Serialize}; use virtio_queue::{Queue, QueueT}; use vm_migration::{MigratableError, Pausable, Snapshot, Snapshottable}; -use vm_virtio::AccessPlatform; use super::pci_device::VIRTQ_MSI_NO_VECTOR; use crate::VirtioDevice; @@ -125,7 +124,7 @@ pub fn get_vring_size(t: VringType, queue_size: u16) -> u64 { /// le64 queue_avail; // 0x28 // read-write /// le64 queue_used; // 0x30 // read-write pub struct VirtioPciCommonConfig { - pub access_platform: Option>, + pub device: Arc>, pub driver_status: Arc, pub config_generation: u8, pub device_feature_select: u32, @@ -136,12 +135,9 @@ pub struct VirtioPciCommonConfig { } impl VirtioPciCommonConfig { - pub fn new( - state: VirtioPciCommonConfigState, - access_platform: Option>, - ) -> Self { + pub fn new(state: VirtioPciCommonConfigState, device: Arc>) -> Self { VirtioPciCommonConfig { - access_platform, + device, driver_status: Arc::new(AtomicU8::new(state.driver_status)), config_generation: state.config_generation, device_feature_select: state.device_feature_select, @@ -164,13 +160,7 @@ impl VirtioPciCommonConfig { } } - pub fn read( - &mut self, - offset: u64, - data: &mut [u8], - queues: &[Queue], - device: Arc>, - ) { + pub fn read(&mut self, offset: u64, data: &mut [u8], queues: &[Queue]) { assert!(data.len() <= 8); match data.len() { @@ -183,7 +173,7 @@ impl VirtioPciCommonConfig { LittleEndian::write_u16(data, v); } 4 => { - let v = 
self.read_common_config_dword(offset, device); + let v = self.read_common_config_dword(offset); LittleEndian::write_u32(data, v); } 8 => { @@ -194,26 +184,14 @@ impl VirtioPciCommonConfig { } } - #[allow(clippy::needless_pass_by_value)] - pub fn write( - &mut self, - offset: u64, - data: &[u8], - queues: &mut [Queue], - device: Arc>, - ) { + pub fn write(&mut self, offset: u64, data: &[u8], queues: &mut [Queue]) { assert!(data.len() <= 8); match data.len() { 1 => self.write_common_config_byte(offset, data[0]), 2 => self.write_common_config_word(offset, LittleEndian::read_u16(data), queues), 4 => { - self.write_common_config_dword( - offset, - LittleEndian::read_u32(data), - queues, - device, - ); + self.write_common_config_dword(offset, LittleEndian::read_u32(data), queues); } 8 => self.write_common_config_qword(offset, LittleEndian::read_u64(data), queues), _ => error!("invalid data length for virtio write: len {}", data.len()), @@ -285,8 +263,12 @@ impl VirtioPciCommonConfig { 0x1c => self.with_queue_mut(queues, |q| { let ready = value == 1; q.set_ready(ready); - // Translate address of descriptor table and vrings. 
- if ready && let Some(access_platform) = &self.access_platform { + let access_platform = if ready { + self.device.lock().unwrap().access_platform() + } else { + None + }; + if let Some(access_platform) = access_platform { let desc_table = match access_platform .translate_gva(q.desc_table(), get_vring_size(VringType::Desc, q.size())) { @@ -337,15 +319,12 @@ impl VirtioPciCommonConfig { } } - #[allow(clippy::needless_pass_by_value)] - fn read_common_config_dword(&self, offset: u64, device: Arc>) -> u32 { + fn read_common_config_dword(&self, offset: u64) -> u32 { debug!("read_common_config_dword: offset 0x{offset:x}"); match offset { 0x00 => self.device_feature_select, 0x04 => { - let locked_device = device.lock().unwrap(); - // Only 64 bits of features (2 pages) are defined for now, so limit - // device_feature_select to avoid shifting by 64 or more bits. + let locked_device = self.device.lock().unwrap(); if self.device_feature_select < 2 { (locked_device.features() >> (self.device_feature_select * 32)) as u32 } else { @@ -360,14 +339,7 @@ impl VirtioPciCommonConfig { } } - #[allow(clippy::needless_pass_by_value)] - fn write_common_config_dword( - &mut self, - offset: u64, - value: u32, - queues: &mut [Queue], - device: Arc>, - ) { + fn write_common_config_dword(&mut self, offset: u64, value: u32, queues: &mut [Queue]) { debug!("write_common_config_dword: offset 0x{offset:x}"); match offset { @@ -375,7 +347,7 @@ impl VirtioPciCommonConfig { 0x08 => self.driver_feature_select = value, 0x0c => { if self.driver_feature_select < 2 { - let mut locked_device = device.lock().unwrap(); + let mut locked_device = self.device.lock().unwrap(); locked_device .ack_features(u64::from(value) << (self.driver_feature_select * 32)); } @@ -472,8 +444,9 @@ mod unit_tests { #[test] fn write_base_regs() { + let dev: Arc> = Arc::new(Mutex::new(DummyDevice(0))); let mut regs = VirtioPciCommonConfig { - access_platform: None, + device: dev.clone(), driver_status: 
Arc::new(AtomicU8::new(0xaa)), config_generation: 0x55, device_feature_select: 0x0, @@ -483,72 +456,69 @@ mod unit_tests { msix_queues: Arc::new(Mutex::new(vec![0; 3])), }; - let dev = Arc::new(Mutex::new(DummyDevice(0))); let mut queues = Vec::new(); // Can set all bits of driver_status. - regs.write(0x14, &[0x55], &mut queues, dev.clone()); + regs.write(0x14, &[0x55], &mut queues); let mut read_back = vec![0x00]; - regs.read(0x14, &mut read_back, &queues, dev.clone()); + regs.read(0x14, &mut read_back, &queues); assert_eq!(read_back[0], 0x55); // The config generation register is read only. - regs.write(0x15, &[0xaa], &mut queues, dev.clone()); + regs.write(0x15, &[0xaa], &mut queues); let mut read_back = vec![0x00]; - regs.read(0x15, &mut read_back, &queues, dev.clone()); + regs.read(0x15, &mut read_back, &queues); assert_eq!(read_back[0], 0x55); // Device features is read-only and passed through from the device. - regs.write(0x04, &[0, 0, 0, 0], &mut queues, dev.clone()); + regs.write(0x04, &[0, 0, 0, 0], &mut queues); let mut read_back = vec![0, 0, 0, 0]; - regs.read(0x04, &mut read_back, &queues, dev.clone()); + regs.read(0x04, &mut read_back, &queues); assert_eq!(LittleEndian::read_u32(&read_back), DUMMY_FEATURES as u32); // Feature select registers are read/write. - regs.write(0x00, &[1, 2, 3, 4], &mut queues, dev.clone()); + regs.write(0x00, &[1, 2, 3, 4], &mut queues); let mut read_back = vec![0, 0, 0, 0]; - regs.read(0x00, &mut read_back, &queues, dev.clone()); + regs.read(0x00, &mut read_back, &queues); assert_eq!(LittleEndian::read_u32(&read_back), 0x0403_0201); - regs.write(0x08, &[1, 2, 3, 4], &mut queues, dev.clone()); + regs.write(0x08, &[1, 2, 3, 4], &mut queues); let mut read_back = vec![0, 0, 0, 0]; - regs.read(0x08, &mut read_back, &queues, dev.clone()); + regs.read(0x08, &mut read_back, &queues); assert_eq!(LittleEndian::read_u32(&read_back), 0x0403_0201); // 'queue_select' can be read and written. 
- regs.write(0x16, &[0xaa, 0x55], &mut queues, dev.clone()); + regs.write(0x16, &[0xaa, 0x55], &mut queues); let mut read_back = vec![0x00, 0x00]; - regs.read(0x16, &mut read_back, &queues, dev); + regs.read(0x16, &mut read_back, &queues); assert_eq!(read_back[0], 0xaa); assert_eq!(read_back[1], 0x55); } #[test] fn oob_queue_select_does_not_panic() { - // Regression test: reading/writing queue_msix_vector (offset 0x1a) - // with an out-of-bounds queue_select must not panic. + let dev: Arc> = Arc::new(Mutex::new(DummyDevice(0))); let mut regs = VirtioPciCommonConfig { - access_platform: None, + device: dev.clone(), driver_status: Arc::new(AtomicU8::new(0)), config_generation: 0, device_feature_select: 0, driver_feature_select: 0, queue_select: 0, msix_config: Arc::new(AtomicU16::new(0)), - msix_queues: Arc::new(Mutex::new(vec![0; 1])), // only 1 queue + msix_queues: Arc::new(Mutex::new(vec![0; 1])), }; - let dev = Arc::new(Mutex::new(DummyDevice(0))); let mut queues = vec![Queue::new(256).unwrap()]; // Set queue_select to an out-of-bounds value. - regs.write(0x16, &[0xFF, 0xFF], &mut queues, dev.clone()); + regs.write(0x16, &[0xFF, 0xFF], &mut queues); // Read queue_msix_vector — must not panic, should return VIRTQ_MSI_NO_VECTOR. let mut read_back = vec![0x00, 0x00]; - regs.read(0x1a, &mut read_back, &queues, dev.clone()); + regs.read(0x1a, &mut read_back, &queues); assert_eq!(LittleEndian::read_u16(&read_back), VIRTQ_MSI_NO_VECTOR); // Write queue_msix_vector — must not panic. 
- regs.write(0x1a, &[0xAB, 0xCD], &mut queues, dev); + regs.write(0x1a, &[0xAB, 0xCD], &mut queues); } } diff --git a/virtio-devices/src/transport/pci_device.rs b/virtio-devices/src/transport/pci_device.rs index 54a29caa5d..98abb04935 100644 --- a/virtio-devices/src/transport/pci_device.rs +++ b/virtio-devices/src/transport/pci_device.rs @@ -404,7 +404,7 @@ impl VirtioPciDevice { memory: GuestMemoryAtomic, device: Arc>, msix_num: u16, - access_platform: Option>, + access_platform: Option<&Arc>, interrupt_manager: &dyn InterruptManager, pci_device_bdf: u32, activate_evt: EventFd, @@ -422,7 +422,7 @@ impl VirtioPciDevice { } let num_queues = locked_device.queue_max_sizes().len(); - if let Some(access_platform) = &access_platform { + if let Some(access_platform) = access_platform { locked_device.set_access_platform(access_platform.clone()); } @@ -518,7 +518,7 @@ impl VirtioPciDevice { })?; let common_config = if let Some(common_config_state) = common_config_state { - VirtioPciCommonConfig::new(common_config_state, access_platform) + VirtioPciCommonConfig::new(common_config_state, device.clone()) } else { VirtioPciCommonConfig::new( VirtioPciCommonConfigState { @@ -530,7 +530,7 @@ impl VirtioPciDevice { msix_config: VIRTQ_MSI_NO_VECTOR, msix_queues: vec![VIRTQ_MSI_NO_VECTOR; num_queues], }, - access_platform, + device.clone(), ) }; @@ -1139,12 +1139,10 @@ impl PciDevice for VirtioPciDevice { fn read_bar(&mut self, _base: u64, offset: u64, data: &mut [u8]) { match offset { - o if o < COMMON_CONFIG_BAR_OFFSET + COMMON_CONFIG_SIZE => self.common_config.read( - o - COMMON_CONFIG_BAR_OFFSET, - data, - &self.queues, - self.device.clone(), - ), + o if o < COMMON_CONFIG_BAR_OFFSET + COMMON_CONFIG_SIZE => { + self.common_config + .read(o - COMMON_CONFIG_BAR_OFFSET, data, &self.queues); + } o if (ISR_CONFIG_BAR_OFFSET..ISR_CONFIG_BAR_OFFSET + ISR_CONFIG_SIZE).contains(&o) => { if let Some(v) = data.get_mut(0) { // Reading this register resets it to 0. 
@@ -1185,12 +1183,10 @@ impl PciDevice for VirtioPciDevice { fn write_bar(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option> { let initial_ready = self.is_driver_ready(); match offset { - o if o < COMMON_CONFIG_BAR_OFFSET + COMMON_CONFIG_SIZE => self.common_config.write( - o - COMMON_CONFIG_BAR_OFFSET, - data, - &mut self.queues, - self.device.clone(), - ), + o if o < COMMON_CONFIG_BAR_OFFSET + COMMON_CONFIG_SIZE => { + self.common_config + .write(o - COMMON_CONFIG_BAR_OFFSET, data, &mut self.queues); + } o if (ISR_CONFIG_BAR_OFFSET..ISR_CONFIG_BAR_OFFSET + ISR_CONFIG_SIZE).contains(&o) => { if let Some(v) = data.first() { self.interrupt_status diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 9f65784ecb..eec0da83c6 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -4405,7 +4405,7 @@ impl DeviceManager { memory, virtio_device, msix_num, - access_platform, + access_platform.as_ref(), self.msi_interrupt_manager.as_ref(), pci_device_bdf.into(), self.activate_evt From 0eca4d7c68562876a1bbc7421a464ba2d2322be5 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Fri, 17 Apr 2026 09:52:06 +0100 Subject: [PATCH 717/742] virtio-devices: Rename control parameter for VIRTIO_F_ACCESS_PLATFORM Rename from iommu to access_platform_enabled. The original name was iommu as this feature was exposed for devices behind an IOMMU however this feature is also now used for confidential VMs so adopt a more general name. 
Signed-off-by: Rob Bradford --- virtio-devices/src/block.rs | 4 ++-- virtio-devices/src/console.rs | 4 ++-- virtio-devices/src/net.rs | 12 ++++++------ virtio-devices/src/pmem.rs | 4 ++-- virtio-devices/src/rng.rs | 4 ++-- virtio-devices/src/vhost_user/blk.rs | 8 ++++---- virtio-devices/src/vhost_user/fs.rs | 8 ++++---- virtio-devices/src/vhost_user/generic_vhost_user.rs | 8 ++++---- virtio-devices/src/vhost_user/net.rs | 8 ++++---- virtio-devices/src/vsock/device.rs | 4 ++-- 10 files changed, 32 insertions(+), 32 deletions(-) diff --git a/virtio-devices/src/block.rs b/virtio-devices/src/block.rs index 71c0b7ed39..2e28c60da5 100644 --- a/virtio-devices/src/block.rs +++ b/virtio-devices/src/block.rs @@ -734,7 +734,7 @@ impl Block { mut disk_image: DiskBackend, disk_path: PathBuf, read_only: bool, - iommu: bool, + access_platform_enabled: bool, num_queues: usize, queue_size: u16, serial: Option, @@ -793,7 +793,7 @@ impl Block { warn!("sparse=on requested but backend does not support sparse operations"); } - if iommu { + if access_platform_enabled { avail_features |= 1u64 << VIRTIO_F_ACCESS_PLATFORM; } diff --git a/virtio-devices/src/console.rs b/virtio-devices/src/console.rs index f141d08b14..0a4cf65cff 100644 --- a/virtio-devices/src/console.rs +++ b/virtio-devices/src/console.rs @@ -602,7 +602,7 @@ impl Console { id: String, endpoint: Endpoint, resize_pipe: Option, - iommu: bool, + access_platform_enabled: bool, seccomp_action: SeccompAction, exit_evt: EventFd, state: Option, @@ -619,7 +619,7 @@ impl Console { ) } else { let mut avail_features = (1u64 << VIRTIO_F_VERSION_1) | (1u64 << VIRTIO_CONSOLE_F_SIZE); - if iommu { + if access_platform_enabled { avail_features |= 1u64 << VIRTIO_F_ACCESS_PLATFORM; } diff --git a/virtio-devices/src/net.rs b/virtio-devices/src/net.rs index 7ddbccca8e..d7e1d1f361 100644 --- a/virtio-devices/src/net.rs +++ b/virtio-devices/src/net.rs @@ -468,7 +468,7 @@ impl Net { id: String, taps: Vec, guest_mac: Option, - iommu: bool, + 
access_platform_enabled: bool, num_queues: usize, queue_size: u16, seccomp_action: SeccompAction, @@ -499,7 +499,7 @@ impl Net { | (1 << VIRTIO_RING_F_EVENT_IDX) | (1 << VIRTIO_F_VERSION_1); - if iommu { + if access_platform_enabled { avail_features |= 1u64 << VIRTIO_F_ACCESS_PLATFORM; } @@ -587,7 +587,7 @@ impl Net { guest_mac: Option, host_mac: &mut Option, mtu: Option, - iommu: bool, + access_platform_enabled: bool, num_queues: usize, queue_size: u16, seccomp_action: SeccompAction, @@ -613,7 +613,7 @@ impl Net { id, taps, guest_mac, - iommu, + access_platform_enabled, num_queues, queue_size, seccomp_action, @@ -632,7 +632,7 @@ impl Net { fds: &[RawFd], guest_mac: Option, mtu: Option, - iommu: bool, + access_platform_enabled: bool, queue_size: u16, seccomp_action: SeccompAction, rate_limiter_config: Option, @@ -666,7 +666,7 @@ impl Net { id, taps, guest_mac, - iommu, + access_platform_enabled, num_queue_pairs * 2, queue_size, seccomp_action, diff --git a/virtio-devices/src/pmem.rs b/virtio-devices/src/pmem.rs index eb94e7104d..94f2716b64 100644 --- a/virtio-devices/src/pmem.rs +++ b/virtio-devices/src/pmem.rs @@ -287,7 +287,7 @@ impl Pmem { disk: File, addr: GuestAddress, mapping: UserspaceMapping, - iommu: bool, + access_platform_enabled: bool, seccomp_action: SeccompAction, exit_evt: EventFd, state: Option, @@ -308,7 +308,7 @@ impl Pmem { let mut avail_features = 1u64 << VIRTIO_F_VERSION_1; - if iommu { + if access_platform_enabled { avail_features |= 1u64 << VIRTIO_F_ACCESS_PLATFORM; } (avail_features, 0, config, false) diff --git a/virtio-devices/src/rng.rs b/virtio-devices/src/rng.rs index 2f43ac66d5..8d11a3d7b6 100644 --- a/virtio-devices/src/rng.rs +++ b/virtio-devices/src/rng.rs @@ -169,7 +169,7 @@ impl Rng { pub fn new( id: String, path: &str, - iommu: bool, + access_platform_enabled: bool, seccomp_action: SeccompAction, exit_evt: EventFd, state: Option, @@ -182,7 +182,7 @@ impl Rng { } else { let mut avail_features = 1u64 << VIRTIO_F_VERSION_1; - if 
iommu { + if access_platform_enabled { avail_features |= 1u64 << VIRTIO_F_ACCESS_PLATFORM; } diff --git a/virtio-devices/src/vhost_user/blk.rs b/virtio-devices/src/vhost_user/blk.rs index 83653147bd..2526b36f6b 100644 --- a/virtio-devices/src/vhost_user/blk.rs +++ b/virtio-devices/src/vhost_user/blk.rs @@ -45,7 +45,7 @@ pub struct Blk { guest_memory: Option>, seccomp_action: SeccompAction, exit_evt: EventFd, - iommu: bool, + access_platform_enabled: bool, } impl Blk { @@ -55,7 +55,7 @@ impl Blk { vu_cfg: VhostUserConfig, seccomp_action: SeccompAction, exit_evt: EventFd, - iommu: bool, + access_platform_enabled: bool, state: Option, ) -> Result { let num_queues = vu_cfg.num_queues; @@ -191,7 +191,7 @@ impl Blk { guest_memory: None, seccomp_action, exit_evt, - iommu, + access_platform_enabled, }) } @@ -217,7 +217,7 @@ impl VirtioDevice for Blk { fn features(&self) -> u64 { let mut features = self.vu_common.virtio_common.avail_features; - if self.iommu { + if self.access_platform_enabled { features |= 1u64 << VIRTIO_F_ACCESS_PLATFORM; } features diff --git a/virtio-devices/src/vhost_user/fs.rs b/virtio-devices/src/vhost_user/fs.rs index 509a7a34f3..c062edf19b 100644 --- a/virtio-devices/src/vhost_user/fs.rs +++ b/virtio-devices/src/vhost_user/fs.rs @@ -68,7 +68,7 @@ pub struct Fs { seccomp_action: SeccompAction, guest_memory: Option>, exit_evt: EventFd, - iommu: bool, + access_platform_enabled: bool, } impl Fs { @@ -83,7 +83,7 @@ impl Fs { cache: Option<(VirtioSharedMemoryList, MmapRegion)>, seccomp_action: SeccompAction, exit_evt: EventFd, - iommu: bool, + access_platform_enabled: bool, state: Option, ) -> Result { // Calculate the actual number of queues needed. 
@@ -200,7 +200,7 @@ impl Fs { seccomp_action, guest_memory: None, exit_evt, - iommu, + access_platform_enabled, }) } @@ -226,7 +226,7 @@ impl VirtioDevice for Fs { fn features(&self) -> u64 { let mut features = self.vu_common.virtio_common.avail_features; - if self.iommu { + if self.access_platform_enabled { features |= 1u64 << VIRTIO_F_ACCESS_PLATFORM; } features diff --git a/virtio-devices/src/vhost_user/generic_vhost_user.rs b/virtio-devices/src/vhost_user/generic_vhost_user.rs index aed24b082d..dda891e912 100644 --- a/virtio-devices/src/vhost_user/generic_vhost_user.rs +++ b/virtio-devices/src/vhost_user/generic_vhost_user.rs @@ -42,7 +42,7 @@ pub struct GenericVhostUser { seccomp_action: SeccompAction, guest_memory: Option>, exit_evt: EventFd, - iommu: bool, + access_platform_enabled: bool, cfg_warning: AtomicBool, } @@ -57,7 +57,7 @@ impl GenericVhostUser { cache: Option<(VirtioSharedMemoryList, MmapRegion)>, seccomp_action: SeccompAction, exit_evt: EventFd, - iommu: bool, + access_platform_enabled: bool, state: Option, ) -> Result { // Calculate the actual number of queues needed. 
@@ -159,7 +159,7 @@ since the backend only supports {backend_num_queues}\n", seccomp_action, guest_memory: None, exit_evt, - iommu, + access_platform_enabled, cfg_warning: AtomicBool::new(false), }) } @@ -201,7 +201,7 @@ impl VirtioDevice for GenericVhostUser { fn features(&self) -> u64 { let mut features = self.vu_common.virtio_common.avail_features; - if self.iommu { + if self.access_platform_enabled { features |= 1u64 << VIRTIO_F_ACCESS_PLATFORM; } features diff --git a/virtio-devices/src/vhost_user/net.rs b/virtio-devices/src/vhost_user/net.rs index 4803ade33c..e5f9eda7d8 100644 --- a/virtio-devices/src/vhost_user/net.rs +++ b/virtio-devices/src/vhost_user/net.rs @@ -48,7 +48,7 @@ pub struct Net { ctrl_queue_epoll_thread: Option>, seccomp_action: SeccompAction, exit_evt: EventFd, - iommu: bool, + access_platform_enabled: bool, } impl Net { @@ -62,7 +62,7 @@ impl Net { server: bool, seccomp_action: SeccompAction, exit_evt: EventFd, - iommu: bool, + access_platform_enabled: bool, state: Option, offload_tso: bool, offload_ufo: bool, @@ -220,7 +220,7 @@ impl Net { ctrl_queue_epoll_thread: None, seccomp_action, exit_evt, - iommu, + access_platform_enabled, }) } @@ -252,7 +252,7 @@ impl VirtioDevice for Net { fn features(&self) -> u64 { let mut features = self.vu_common.virtio_common.avail_features; - if self.iommu { + if self.access_platform_enabled { features |= 1u64 << VIRTIO_F_ACCESS_PLATFORM; } features diff --git a/virtio-devices/src/vsock/device.rs b/virtio-devices/src/vsock/device.rs index 0b202945b0..c20288c2dc 100644 --- a/virtio-devices/src/vsock/device.rs +++ b/virtio-devices/src/vsock/device.rs @@ -339,7 +339,7 @@ where cid: u32, path: PathBuf, mut backend: B, - iommu: bool, + access_platform_enabled: bool, seccomp_action: SeccompAction, exit_evt: EventFd, state: Option, @@ -353,7 +353,7 @@ where } else { let mut avail_features = (1u64 << VIRTIO_F_VERSION_1) | (1u64 << VIRTIO_F_IN_ORDER); - if iommu { + if access_platform_enabled { avail_features |= 1u64 
<< VIRTIO_F_ACCESS_PLATFORM; } (avail_features, 0, false) From 5bd6fdc17d3b4c1bb417b6723f0b9c5503c31eca Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Fri, 17 Apr 2026 09:58:33 +0100 Subject: [PATCH 718/742] vmm: Rename force_iommu to force_access_platform This is a clearer name for its purpose and now matches more closely what is used for the virtio devices themselves. Signed-off-by: Rob Bradford --- vmm/src/device_manager.rs | 32 ++++++++++++++++----------------- vmm/src/vm.rs | 15 ++++++++------- 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index eec0da83c6..2c31efe336 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -1126,8 +1126,8 @@ pub struct DeviceManager { // pvpanic device pvpanic_device: Option>>, - // Flag to force setting the iommu on virtio devices - force_iommu: bool, + // Force VIRTIO_F_ACCESS_PLATFORM on all virtio devices (e.g. for TDX/SEV-SNP) + force_access_platform: bool, // io_uring availability if detected io_uring_supported: Option, @@ -1215,7 +1215,7 @@ impl DeviceManager { seccomp_action: SeccompAction, numa_nodes: NumaNodes, activate_evt: &EventFd, - force_iommu: bool, + force_access_platform: bool, boot_id_list: BTreeSet, #[cfg(not(target_arch = "riscv64"))] timestamp: Instant, snapshot: Option<&Snapshot>, @@ -1429,7 +1429,7 @@ impl DeviceManager { #[cfg(feature = "pvmemcontrol")] pvmemcontrol_devices: None, pvpanic_device: None, - force_iommu, + force_access_platform, io_uring_supported: None, aio_supported: None, boot_id_list, @@ -2440,7 +2440,7 @@ impl DeviceManager { self.console_resize_pipe .as_ref() .map(|p| p.try_clone().unwrap()), - self.force_iommu | console_config.iommu, + self.force_access_platform | console_config.iommu, self.seccomp_action.clone(), self.exit_evt .try_clone() @@ -2703,7 +2703,7 @@ impl DeviceManager { self.exit_evt .try_clone() .map_err(DeviceManagerError::EventFd)?, - self.force_iommu, + 
self.force_access_platform, state_from_id(self.snapshot.as_ref(), id.as_str()) .map_err(DeviceManagerError::RestoreGetState)?, ) { @@ -2930,7 +2930,7 @@ impl DeviceManager { .ok_or(DeviceManagerError::NoDiskPath)? .clone(), disk_cfg.readonly, - self.force_iommu | disk_cfg.pci_common.iommu, + self.force_access_platform | disk_cfg.pci_common.iommu, disk_cfg.num_queues, disk_cfg.queue_size, disk_cfg.serial.clone(), @@ -3032,7 +3032,7 @@ impl DeviceManager { self.exit_evt .try_clone() .map_err(DeviceManagerError::EventFd)?, - self.force_iommu, + self.force_access_platform, state_from_id(self.snapshot.as_ref(), id.as_str()) .map_err(DeviceManagerError::RestoreGetState)?, net_cfg.offload_tso, @@ -3063,7 +3063,7 @@ impl DeviceManager { Some(net_cfg.mac), &mut net_cfg.host_mac, net_cfg.mtu, - self.force_iommu | net_cfg.pci_common.iommu, + self.force_access_platform | net_cfg.pci_common.iommu, net_cfg.num_queues, net_cfg.queue_size, self.seccomp_action.clone(), @@ -3084,7 +3084,7 @@ impl DeviceManager { fds, Some(net_cfg.mac), net_cfg.mtu, - self.force_iommu | net_cfg.pci_common.iommu, + self.force_access_platform | net_cfg.pci_common.iommu, net_cfg.queue_size, self.seccomp_action.clone(), net_cfg.rate_limiter_config, @@ -3114,7 +3114,7 @@ impl DeviceManager { Some(net_cfg.mac), &mut net_cfg.host_mac, net_cfg.mtu, - self.force_iommu | net_cfg.pci_common.iommu, + self.force_access_platform | net_cfg.pci_common.iommu, net_cfg.num_queues, net_cfg.queue_size, self.seccomp_action.clone(), @@ -3177,7 +3177,7 @@ impl DeviceManager { virtio_devices::Rng::new( id.clone(), rng_path, - self.force_iommu | rng_config.iommu, + self.force_access_platform | rng_config.iommu, self.seccomp_action.clone(), self.exit_evt .try_clone() @@ -3239,7 +3239,7 @@ impl DeviceManager { self.exit_evt .try_clone() .map_err(DeviceManagerError::EventFd)?, - self.force_iommu, + self.force_access_platform, state_from_id(self.snapshot.as_ref(), id.as_str()) .map_err(DeviceManagerError::RestoreGetState)?, ) @@ 
-3305,7 +3305,7 @@ impl DeviceManager { self.exit_evt .try_clone() .map_err(DeviceManagerError::EventFd)?, - self.force_iommu, + self.force_access_platform, state_from_id(self.snapshot.as_ref(), id.as_str()) .map_err(DeviceManagerError::RestoreGetState)?, ) @@ -3484,7 +3484,7 @@ impl DeviceManager { file, GuestAddress(region_base), mapping, - self.force_iommu | pmem_cfg.pci_common.iommu, + self.force_access_platform | pmem_cfg.pci_common.iommu, self.seccomp_action.clone(), self.exit_evt .try_clone() @@ -3555,7 +3555,7 @@ impl DeviceManager { vsock_cfg.cid, vsock_cfg.socket.clone(), backend, - self.force_iommu | vsock_cfg.pci_common.iommu, + self.force_access_platform | vsock_cfg.pci_common.iommu, self.seccomp_action.clone(), self.exit_evt .try_clone() diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index 6bb088cb0d..ffe7b8624a 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -567,8 +567,8 @@ impl Vm { let numa_nodes = Self::create_numa_nodes(config.lock().unwrap().numa.as_deref(), &memory_manager)?; - // Determine if IOMMU should be forced based on confidential computing features - let force_iommu = Self::should_force_iommu(&config); + // Determine if VIRTIO_F_ACCESS_PLATFORM should be forced (e.g. for TDX/SEV-SNP) + let force_access_platform = Self::should_force_access_platform(&config); let stop_on_boot = Self::should_stop_on_boot(&config); @@ -615,7 +615,7 @@ impl Vm { seccomp_action.clone(), numa_nodes.clone(), &activate_evt, - force_iommu, + force_access_platform, boot_id_list, #[cfg(not(target_arch = "riscv64"))] timestamp, @@ -694,8 +694,9 @@ impl Vm { }) } - /// Determine if IOMMU should be forced based on confidential computing features. - fn should_force_iommu(_config: &Arc>) -> bool { + /// Determine if VIRTIO_F_ACCESS_PLATFORM should be forced based on + /// confidential computing features. 
+ fn should_force_access_platform(_config: &Arc>) -> bool { #[cfg(feature = "tdx")] if _config.lock().unwrap().is_tdx_enabled() { return true; @@ -802,7 +803,7 @@ impl Vm { seccomp_action: SeccompAction, numa_nodes: NumaNodes, activate_evt: &EventFd, - force_iommu: bool, + force_access_platform: bool, boot_id_list: BTreeSet, #[cfg(not(target_arch = "riscv64"))] timestamp: Instant, snapshot: Option<&Snapshot>, @@ -825,7 +826,7 @@ impl Vm { seccomp_action, numa_nodes, activate_evt, - force_iommu, + force_access_platform, boot_id_list, #[cfg(not(target_arch = "riscv64"))] timestamp, From 12dd72d88f100697ec074ee420134ad580d2380d Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Fri, 17 Apr 2026 09:33:10 +0100 Subject: [PATCH 719/742] virtio-devices: balloon: Enable use with confidential VMs Following the pattern used by the existing virtio devices make the balloon device work with confidential VMs (e.g. SEV-SNP). This requires advertising the VIRTIO_F_ACCESS_PLATFORM feature. Do not expose this to the user as a controllable option and instead only enable in on the "force" case. 
Signed-off-by: Rob Bradford --- fuzz/fuzz_targets/balloon.rs | 1 + virtio-devices/src/balloon.rs | 33 +++++++++++++++++++++++++++++---- vmm/src/device_manager.rs | 1 + 3 files changed, 31 insertions(+), 4 deletions(-) diff --git a/fuzz/fuzz_targets/balloon.rs b/fuzz/fuzz_targets/balloon.rs index 69f0c07e84..20260c5469 100644 --- a/fuzz/fuzz_targets/balloon.rs +++ b/fuzz/fuzz_targets/balloon.rs @@ -49,6 +49,7 @@ fuzz_target!(|bytes: &[u8]| -> Corpus { BALLOON_SIZE, true, true, + false, SeccompAction::Allow, EventFd::new(EFD_NONBLOCK).unwrap(), None, diff --git a/virtio-devices/src/balloon.rs b/virtio-devices/src/balloon.rs index bb9c46cbc8..f9db09bd33 100644 --- a/virtio-devices/src/balloon.rs +++ b/virtio-devices/src/balloon.rs @@ -34,14 +34,15 @@ use vm_memory::{ GuestMemoryError, GuestMemoryRegion, }; use vm_migration::{Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable}; +use vm_virtio::{AccessPlatform, Translatable}; use vmm_sys_util::eventfd::EventFd; use crate::seccomp_filters::Thread; use crate::thread_helper::spawn_virtio_thread; use crate::{ ActivateResult, EPOLL_HELPER_EVENT_LAST, EpollHelper, EpollHelperError, EpollHelperHandler, - GuestMemoryMmap, VIRTIO_F_VERSION_1, VirtioCommon, VirtioDevice, VirtioDeviceType, - VirtioInterrupt, VirtioInterruptType, + GuestMemoryMmap, VIRTIO_F_ACCESS_PLATFORM, VIRTIO_F_VERSION_1, VirtioCommon, VirtioDevice, + VirtioDeviceType, VirtioInterrupt, VirtioInterruptType, }; const QUEUE_SIZE: u16 = 128; @@ -160,6 +161,7 @@ struct BalloonEpollHandler { kill_evt: EventFd, pause_evt: EventFd, pbp: Option, + access_platform: Option>, } impl BalloonEpollHandler { @@ -277,7 +279,12 @@ impl BalloonEpollHandler { let mut offset = 0u64; while offset < desc.len() as u64 { - let addr = desc.addr().checked_add(offset).unwrap(); + let addr = desc + .addr() + .checked_add(offset) + .unwrap() + .translate_gva(self.access_platform.as_deref(), data_chunk_size) + .map_err(|e| 
Error::GuestMemory(GuestMemoryError::IOError(e)))?; let pfn: u32 = desc_chain .memory() .read_obj(addr) @@ -324,7 +331,11 @@ impl BalloonEpollHandler { let mut descs_len = 0; while let Some(desc) = desc_chain.next() { descs_len += desc.len(); - Self::release_memory_range(desc_chain.memory(), desc.addr(), desc.len() as usize)?; + let addr = desc + .addr() + .translate_gva(self.access_platform.as_deref(), desc.len() as usize) + .map_err(|e| Error::GuestMemory(GuestMemoryError::IOError(e)))?; + Self::release_memory_range(desc_chain.memory(), addr, desc.len() as usize)?; } self.queues[queue_index] @@ -437,11 +448,13 @@ pub struct Balloon { impl Balloon { // Create a new virtio-balloon. + #[allow(clippy::too_many_arguments)] pub fn new( id: String, size: u64, deflate_on_oom: bool, free_page_reporting: bool, + access_platform_enabled: bool, seccomp_action: SeccompAction, exit_evt: EventFd, state: Option, @@ -464,6 +477,9 @@ impl Balloon { if free_page_reporting { avail_features |= 1u64 << VIRTIO_BALLOON_F_REPORTING; } + if access_platform_enabled { + avail_features |= 1u64 << VIRTIO_F_ACCESS_PLATFORM; + } let config = VirtioBalloonConfig { num_pages: (size >> VIRTIO_BALLOON_PFN_SHIFT) as u32, @@ -628,6 +644,7 @@ impl VirtioDevice for Balloon { kill_evt, pause_evt, pbp: None, + access_platform: self.common.access_platform(), }; let paused = self.common.paused.clone(); @@ -648,6 +665,14 @@ impl VirtioDevice for Balloon { Ok(()) } + fn set_access_platform(&mut self, access_platform: Arc) { + self.common.set_access_platform(access_platform); + } + + fn access_platform(&self) -> Option> { + self.common.access_platform() + } + fn reset(&mut self) -> Option> { let result = self.common.reset(); event!("virtio-device", "reset", "id", &self.id); diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 2c31efe336..948afdfae7 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -3704,6 +3704,7 @@ impl DeviceManager { balloon_config.size, 
balloon_config.deflate_on_oom, balloon_config.free_page_reporting, + self.force_access_platform, self.seccomp_action.clone(), self.exit_evt .try_clone() From cdbe43f4230de5489595142b10ec78e36f76316a Mon Sep 17 00:00:00 2001 From: Keith Adler Date: Wed, 11 Mar 2026 17:27:22 -0500 Subject: [PATCH 720/742] hypervisor: kvm: Add GUEST_MEMFD and KVM_SET_USER_MEMORY_REGION2 support Add support for guest_memfd (available in Linux kernel v6.8+), which enables private memory for confidential VMs. Key changes: - Introduce UserMemoryRegion abstraction with guest_memfd fields - Add From impls between kvm_userspace_memory_region2 and UserMemoryRegion - Convert all KVM memory region operations from kvm_userspace_memory_region to kvm_userspace_memory_region2, with automatic fallback to v1 when guest_memfd is not supported - Add set_user_memory_region() wrapper that dispatches to v1/v2 based on kvm_guest_memfd_supported capability - Create guest_memfd via KVM_CREATE_GUEST_MEMFD ioctl when supported - Extend KvmDirtyLogSlot to preserve region2 fields across dirty log start/stop cycles This is prerequisite infrastructure for KVM-based confidential computing that requires private guest memory backed by guest_memfd. 
Co-authored-by: Alex Orozco Signed-off-by: Keith Adler Signed-off-by: Ruben Hakobyan --- hypervisor/src/kvm/mod.rs | 233 ++++++++++++++++++++++++++++++++----- hypervisor/src/lib.rs | 20 ++++ vmm/src/seccomp_filters.rs | 19 +++ 3 files changed, 240 insertions(+), 32 deletions(-) diff --git a/hypervisor/src/kvm/mod.rs b/hypervisor/src/kvm/mod.rs index 8b21002d1d..89090294a9 100644 --- a/hypervisor/src/kvm/mod.rs +++ b/hypervisor/src/kvm/mod.rs @@ -14,7 +14,10 @@ use std::any::Any; use std::collections::HashMap; #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] use std::mem::offset_of; -#[cfg(feature = "tdx")] +#[cfg(feature = "sev_snp")] +use std::os::fd::FromRawFd; +use std::os::fd::OwnedFd; +#[cfg(any(feature = "sev_snp", feature = "tdx"))] use std::os::unix::io::AsRawFd; #[cfg(feature = "tdx")] use std::os::unix::io::RawFd; @@ -26,9 +29,12 @@ use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::{Arc, RwLock}; use anyhow::anyhow; +#[cfg(feature = "sev_snp")] +use kvm_bindings::kvm_create_guest_memfd; use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd}; #[cfg(target_arch = "x86_64")] use log::warn; +use vmm_sys_util::errno; use vmm_sys_util::eventfd::EventFd; #[cfg(target_arch = "aarch64")] @@ -50,8 +56,6 @@ pub use crate::riscv64::{ }; #[cfg(target_arch = "riscv64")] use crate::riscv64_reg_id; -use crate::vm::{self, InterruptSourceConfig, VmOps}; -use crate::{HypervisorType, HypervisorVmConfig, cpu, hypervisor}; // x86_64 dependencies #[cfg(target_arch = "x86_64")] pub mod x86_64; @@ -73,7 +77,12 @@ use crate::arch::x86::{ CpuIdEntry, FpuState, LapicState, MTRR_MSR_INDICES, MsrEntry, NUM_IOAPIC_PINS, SpecialRegisters, XsaveState, }; -use crate::{CpuState, IoEventAddress, IrqRoutingEntry, MpState, StandardRegisters}; +use crate::{ + CpuState, HypervisorType, HypervisorVmConfig, InterruptSourceConfig, IoEventAddress, + IrqRoutingEntry, MpState, StandardRegisters, USER_MEMORY_REGION_GUEST_MEMFD, + USER_MEMORY_REGION_LOG_DIRTY, USER_MEMORY_REGION_READ, 
USER_MEMORY_REGION_WRITE, + UserMemoryRegion, VmOps, cpu, hypervisor, vm, +}; // aarch64 dependencies #[cfg(target_arch = "aarch64")] pub mod aarch64; @@ -83,6 +92,8 @@ pub mod riscv64; #[cfg(target_arch = "aarch64")] use std::mem; +#[cfg(target_arch = "x86_64")] +use kvm_bindings::KVM_X86_DEFAULT_VM; /// /// Export generically-named wrappers of kvm-bindings for Unix-based platforms /// @@ -92,10 +103,11 @@ pub use kvm_bindings::kvm_vcpu_events as VcpuEvents; use kvm_bindings::nested::KvmNestedStateBuffer; pub use kvm_bindings::{ self, KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, KVM_IRQ_ROUTING_IRQCHIP, - KVM_IRQ_ROUTING_MSI, KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID, - kvm_clock_data, kvm_create_device, kvm_create_device as CreateDevice, + KVM_IRQ_ROUTING_MSI, KVM_MEM_GUEST_MEMFD, KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, + KVM_MSI_VALID_DEVID, kvm_clock_data, kvm_create_device, kvm_create_device as CreateDevice, kvm_device_attr as DeviceAttr, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_guest_debug, kvm_irq_routing, kvm_irq_routing_entry, kvm_mp_state, kvm_run, kvm_userspace_memory_region, + kvm_userspace_memory_region2, }; #[cfg(target_arch = "aarch64")] use kvm_bindings::{ @@ -107,7 +119,7 @@ use kvm_bindings::{ #[cfg(target_arch = "riscv64")] use kvm_bindings::{KVM_REG_RISCV_CORE, kvm_riscv_core}; #[cfg(feature = "tdx")] -use kvm_bindings::{KVM_X86_DEFAULT_VM, KVM_X86_SW_PROTECTED_VM, KVMIO, kvm_run__bindgen_ty_1}; +use kvm_bindings::{KVM_X86_SW_PROTECTED_VM, KVMIO, kvm_run__bindgen_ty_1}; #[cfg(target_arch = "x86_64")] use kvm_bindings::{Xsave as xsave2, kvm_xsave2}; pub use kvm_ioctls::{self, Cap, Kvm, VcpuExit}; @@ -238,6 +250,61 @@ pub struct KvmTdxExitVmcall { pub out_rdx: u64, } +impl From for UserMemoryRegion { + fn from(region: kvm_userspace_memory_region2) -> Self { + let mut flags = USER_MEMORY_REGION_READ; + if region.flags & KVM_MEM_READONLY == 0 { + flags |= USER_MEMORY_REGION_WRITE; + } + if region.flags & 
KVM_MEM_LOG_DIRTY_PAGES != 0 { + flags |= USER_MEMORY_REGION_LOG_DIRTY; + } + if region.flags & KVM_MEM_GUEST_MEMFD != 0 { + flags |= USER_MEMORY_REGION_GUEST_MEMFD; + } + + UserMemoryRegion { + slot: region.slot, + guest_phys_addr: region.guest_phys_addr, + memory_size: region.memory_size, + userspace_addr: region.userspace_addr, + flags, + guest_memfd: Some(region.guest_memfd), + guest_memfd_offset: Some(region.guest_memfd_offset), + } + } +} + +impl From for kvm_userspace_memory_region2 { + fn from(region: UserMemoryRegion) -> Self { + assert!( + region.flags & USER_MEMORY_REGION_READ != 0, + "KVM mapped memory is always readable" + ); + + let mut flags = 0; + if region.flags & USER_MEMORY_REGION_WRITE == 0 { + flags |= KVM_MEM_READONLY; + } + if region.flags & USER_MEMORY_REGION_LOG_DIRTY != 0 { + flags |= KVM_MEM_LOG_DIRTY_PAGES; + } + if region.flags & USER_MEMORY_REGION_GUEST_MEMFD != 0 { + flags |= KVM_MEM_GUEST_MEMFD; + } + + kvm_userspace_memory_region2 { + slot: region.slot, + guest_phys_addr: region.guest_phys_addr, + memory_size: region.memory_size, + userspace_addr: region.userspace_addr, + flags, + guest_memfd: region.guest_memfd.unwrap_or(0), + guest_memfd_offset: region.guest_memfd_offset.unwrap_or(0), + ..Default::default() + } + } +} impl From for MpState { fn from(s: kvm_mp_state) -> Self { MpState::Kvm(s) @@ -424,6 +491,9 @@ struct KvmDirtyLogSlot { guest_phys_addr: u64, memory_size: u64, userspace_addr: u64, + // Following fields are used by kvm_userspace_memory_region2. + guest_memfd_offset: u64, + guest_memfd: u32, } /// Wrapper over KVM VM ioctls. @@ -432,6 +502,7 @@ pub struct KvmVm { #[cfg(target_arch = "x86_64")] msrs: Vec, dirty_log_slots: RwLock>, + guest_memfds: Option>>, } impl KvmVm { @@ -494,6 +565,47 @@ impl KvmVm { fn translate_msi_ext_dest_id(address_lo: u32, address_hi: u32) -> (u32, u32) { (address_lo, address_hi) } + + /// Set user memory region to use guest_memfd when available. 
+ /// guest_memfd is available on host linux kernel v6.8+ + /// + /// # Safety + /// + /// `region.userspace_addr` must point to `region.memory_size` bytes of + /// memory that will stay mapped until the slot is removed via + /// `remove_user_memory_region`. The memory region must + /// be uniquely owned by the caller, as mapping it into the guest + /// effectively creates a long-lived mutable reference. + unsafe fn set_user_memory_region( + &self, + region: kvm_userspace_memory_region2, + ) -> Result<(), errno::Error> { + if self.guest_memfds.is_some() { + // SAFETY: Safe as the caller guarantees that region is safe to map + // the guest and is non-overlapping. + unsafe { self.fd.set_user_memory_region2(region) } + } else { + // SAFETY: Safe because guest regions are guaranteed not to overlap. + unsafe { + self.fd.set_user_memory_region(kvm_userspace_memory_region { + slot: region.slot, + guest_phys_addr: region.guest_phys_addr, + userspace_addr: region.userspace_addr, + flags: region.flags, + memory_size: region.memory_size, + }) + } + } + } + + /// Get flag for kvm_userspace_memory_region based on memfd support. + fn get_kvm_userspace_memory_region_flag(&self, flag: u32) -> u32 { + flag | if self.guest_memfds.is_some() { + KVM_MEM_GUEST_MEMFD + } else { + 0 + } + } } /// Implementation of Vm trait for KVM @@ -759,14 +871,43 @@ impl vm::Vm for KvmVm { const _: () = assert!(core::mem::size_of::() <= core::mem::size_of::()); - let mut region = kvm_userspace_memory_region { + // Create a per-region guest_memfd when supported. + // Each region gets its own fd sized exactly to memory_size + #[cfg(feature = "sev_snp")] + let guest_memfd = if let Some(memfds) = &self.guest_memfds { + // SAFETY: Safe because guest regions are guaranteed not to overlap. 
+ let fd = unsafe { + OwnedFd::from_raw_fd( + self.fd + .create_guest_memfd(kvm_create_guest_memfd { + size: memory_size as u64, + ..Default::default() + }) + .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into()))?, + ) + }; + let raw_fd = fd.as_raw_fd() as u32; + memfds.write().unwrap().insert(slot, fd); + raw_fd + } else { + 0 + }; + #[cfg(not(feature = "sev_snp"))] + let guest_memfd = 0; + + let mut region = kvm_userspace_memory_region2 { slot, + flags: self.get_kvm_userspace_memory_region_flag(flags), guest_phys_addr, memory_size: memory_size as u64, userspace_addr: userspace_addr as usize as u64, - flags, + #[cfg(not(target_arch = "riscv64"))] + guest_memfd, + // Each guest_memfd is per-region and sized to memory_size, + // so the region's data always starts at offset 0. + guest_memfd_offset: 0, + ..Default::default() }; - if (region.flags & KVM_MEM_LOG_DIRTY_PAGES) != 0 { if (region.flags & KVM_MEM_READONLY) != 0 { return Err(vm::HypervisorVmError::CreateUserMemory(anyhow!( @@ -782,20 +923,22 @@ impl vm::Vm for KvmVm { guest_phys_addr: region.guest_phys_addr, memory_size: region.memory_size, userspace_addr: region.userspace_addr, + guest_memfd_offset: region.guest_memfd_offset, + guest_memfd: region.guest_memfd, }, ); // Always create guest physical memory region without `KVM_MEM_LOG_DIRTY_PAGES`. // For regions that need this flag, dirty pages log will be turned on in `start_dirty_log`. - region.flags = 0; + region.flags = self.get_kvm_userspace_memory_region_flag(0); } // SAFETY: Safe because caller promised this is safe. unsafe { - self.fd - .set_user_memory_region(region) - .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into())) + self.set_user_memory_region(region) + .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into()))?; } + Ok(()) } /// Removes a guest physical memory region. 
@@ -823,12 +966,13 @@ impl vm::Vm for KvmVm { const _: () = assert!(core::mem::size_of::() <= core::mem::size_of::()); - let mut region = kvm_userspace_memory_region { + let mut region = kvm_userspace_memory_region2 { slot, guest_phys_addr, memory_size: memory_size as u64, userspace_addr: userspace_addr as usize as u64, flags, + ..Default::default() }; // Remove the corresponding entry from "self.dirty_log_slots" if needed @@ -838,10 +982,16 @@ impl vm::Vm for KvmVm { region.memory_size = 0; // SAFETY: Safe because caller promised this is safe. unsafe { - self.fd - .set_user_memory_region(region) - .map_err(|e| vm::HypervisorVmError::RemoveUserMemory(e.into())) + self.set_user_memory_region(region) + .map_err(|e| vm::HypervisorVmError::RemoveUserMemory(e.into()))?; } + + // Close the per-region guest_memfd if one was created for this slot + if let Some(memfds) = &self.guest_memfds { + memfds.write().unwrap().remove(&slot); + } + + Ok(()) } /// @@ -932,17 +1082,19 @@ impl vm::Vm for KvmVm { fn start_dirty_log(&self) -> vm::Result<()> { let dirty_log_slots = self.dirty_log_slots.read().unwrap(); for (_, s) in dirty_log_slots.iter() { - let region = kvm_userspace_memory_region { + let region = kvm_userspace_memory_region2 { slot: s.slot, guest_phys_addr: s.guest_phys_addr, memory_size: s.memory_size, userspace_addr: s.userspace_addr, - flags: KVM_MEM_LOG_DIRTY_PAGES, + flags: self.get_kvm_userspace_memory_region_flag(KVM_MEM_LOG_DIRTY_PAGES), + guest_memfd: s.guest_memfd, + guest_memfd_offset: s.guest_memfd_offset, + ..Default::default() }; // SAFETY: Safe because guest regions are guaranteed not to overlap. 
unsafe { - self.fd - .set_user_memory_region(region) + self.set_user_memory_region(region) .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?; } } @@ -956,17 +1108,19 @@ impl vm::Vm for KvmVm { fn stop_dirty_log(&self) -> vm::Result<()> { let dirty_log_slots = self.dirty_log_slots.read().unwrap(); for (_, s) in dirty_log_slots.iter() { - let region = kvm_userspace_memory_region { + let region = kvm_userspace_memory_region2 { slot: s.slot, guest_phys_addr: s.guest_phys_addr, memory_size: s.memory_size, userspace_addr: s.userspace_addr, - flags: 0, + flags: self.get_kvm_userspace_memory_region_flag(0), + guest_memfd: s.guest_memfd, + guest_memfd_offset: s.guest_memfd_offset, + ..Default::default() }; // SAFETY: Safe because guest regions are guaranteed not to overlap. unsafe { - self.fd - .set_user_memory_region(region) + self.set_user_memory_region(region) .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?; } } @@ -1228,11 +1382,17 @@ impl hypervisor::Hypervisor for KvmHypervisor { vm_type = self.kvm.get_host_ipa_limit().try_into().unwrap(); } - #[cfg(feature = "tdx")] - if _config.tdx_enabled { - vm_type = KVM_X86_SW_PROTECTED_VM.into(); - } else { - vm_type = KVM_X86_DEFAULT_VM.into(); + #[cfg(target_arch = "x86_64")] + cfg_if::cfg_if! 
{ + if #[cfg(feature = "tdx")] { + if _config.tdx_enabled { + vm_type = KVM_X86_SW_PROTECTED_VM.into(); + } else { + vm_type = KVM_X86_DEFAULT_VM.into(); + } + } else { + vm_type = KVM_X86_DEFAULT_VM.into(); + } } loop { @@ -1255,7 +1415,7 @@ impl hypervisor::Hypervisor for KvmHypervisor { { let msr_list = self.get_msr_list()?; let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize; - let mut msrs: Vec = vec![ + let mut msrs = vec![ MsrEntry { ..Default::default() }; @@ -1266,10 +1426,18 @@ impl hypervisor::Hypervisor for KvmHypervisor { msrs[pos].index = *index; } + #[allow(unused_mut)] + let mut guest_memfds = None; + #[cfg(feature = "sev_snp")] + if _config.sev_snp_enabled && fd.check_extension(Cap::GuestMemfd) { + guest_memfds = Some(RwLock::new(HashMap::new())); + } + Ok(Arc::new(KvmVm { fd, msrs, dirty_log_slots: RwLock::new(HashMap::new()), + guest_memfds, })) } @@ -1278,6 +1446,7 @@ impl hypervisor::Hypervisor for KvmHypervisor { Ok(Arc::new(KvmVm { fd, dirty_log_slots: RwLock::new(HashMap::new()), + guest_memfds: None, })) } } diff --git a/hypervisor/src/lib.rs b/hypervisor/src/lib.rs index 3d919e45ce..787cfbf1f4 100644 --- a/hypervisor/src/lib.rs +++ b/hypervisor/src/lib.rs @@ -118,6 +118,26 @@ pub fn vec_with_array_field(count: usize) -> Vec { vec_with_size_in_bytes(vec_size_bytes) } +/// User memory region structure +#[derive(Debug, Default, Eq, PartialEq)] +pub struct UserMemoryRegion { + pub slot: u32, + pub guest_phys_addr: u64, + pub memory_size: u64, + pub userspace_addr: u64, + pub flags: u32, + pub guest_memfd: Option, + pub guest_memfd_offset: Option, +} + +/// Flags for user memory region +pub const USER_MEMORY_REGION_READ: u32 = 1; +pub const USER_MEMORY_REGION_WRITE: u32 = 1 << 1; +pub const USER_MEMORY_REGION_EXECUTE: u32 = 1 << 2; +pub const USER_MEMORY_REGION_LOG_DIRTY: u32 = 1 << 3; +pub const USER_MEMORY_REGION_ADJUSTABLE: u32 = 1 << 4; +pub const USER_MEMORY_REGION_GUEST_MEMFD: u32 = 1 << 5; + #[derive(Debug)] pub enum MpState { 
#[cfg(feature = "kvm")] diff --git a/vmm/src/seccomp_filters.rs b/vmm/src/seccomp_filters.rs index 25da7f9c9a..008fc60f29 100644 --- a/vmm/src/seccomp_filters.rs +++ b/vmm/src/seccomp_filters.rs @@ -90,6 +90,9 @@ mod kvm { pub const KVM_HAS_DEVICE_ATTR: u64 = 0x4018_aee3; pub const KVM_SET_ONE_REG: u64 = 0x4010_aeac; pub const KVM_SET_USER_MEMORY_REGION: u64 = 0x4020_ae46; + pub const KVM_SET_USER_MEMORY_REGION2: u64 = 0x40a0_ae49; + pub const KVM_SET_MEMORY_ATTRIBUTES: u64 = 0x4020_aed2; + pub const KVM_CREATE_GUEST_MEMFD: u64 = 0xc040_aed4; pub const KVM_IRQFD: u64 = 0x4020_ae76; pub const KVM_IOEVENTFD: u64 = 0x4040_ae79; pub const KVM_SET_VCPU_EVENTS: u64 = 0x4040_aea0; @@ -252,6 +255,14 @@ fn create_vmm_ioctl_seccomp_rule_common_kvm() -> Result, Backen and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_ONE_REG)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_REGS)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_USER_MEMORY_REGION,)?], + and![Cond::new( + 1, + ArgLen::Dword, + Eq, + KVM_SET_USER_MEMORY_REGION2, + )?], + and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_MEMORY_ATTRIBUTES,)?], + and![Cond::new(1, ArgLen::Dword, Eq, KVM_CREATE_GUEST_MEMFD,)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_VCPU_EVENTS,)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_NMI)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_NESTED_STATE)?], @@ -750,6 +761,14 @@ fn create_vcpu_ioctl_seccomp_rule_kvm() -> Result, BackendError and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_DEVICE_ATTR,)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_GSI_ROUTING,)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_USER_MEMORY_REGION,)?], + and![Cond::new( + 1, + ArgLen::Dword, + Eq, + KVM_SET_USER_MEMORY_REGION2, + )?], + and![Cond::new(1, ArgLen::Dword, Eq, KVM_CREATE_GUEST_MEMFD,)?], + and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_MEMORY_ATTRIBUTES,)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_RUN,)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_NMI)?], and![Cond::new(1, ArgLen::Dword, Eq, 
KVM_GET_NESTED_STATE)?], From c31f5d4998e1f249be6c4e73244b8a11f9f0e806 Mon Sep 17 00:00:00 2001 From: Ruben Hakobyan Date: Tue, 7 Apr 2026 07:59:20 -0700 Subject: [PATCH 721/742] vmm: allow IGVM payload alongside a kernel Previously, the payload validation rejected an IGVM file combined with a kernel or firmware. Relax this constraint to allow an IGVM carrying a firmware (e.g Oak stage0) to be paired with a separate kernel image. This enables fw_cfg-style boot where stage0 loads a kernel provided through fw_cfg rather than embedded in the IGVM file itself. Signed-off-by: Ruben Hakobyan --- vmm/src/vm_config.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vmm/src/vm_config.rs b/vmm/src/vm_config.rs index 9162045404..01d3bb0101 100644 --- a/vmm/src/vm_config.rs +++ b/vmm/src/vm_config.rs @@ -849,7 +849,7 @@ impl PayloadConfig { #[cfg(feature = "igvm")] { if self.igvm.is_some() { - if self.firmware.is_some() || self.kernel.is_some() { + if self.firmware.is_some() { return Err(PayloadConfigError::IgvmPlusOtherPayloads); } return Ok(()); From 8ee0a07ab112441286432c29a61f13768447afae Mon Sep 17 00:00:00 2001 From: Ruben Hakobyan Date: Tue, 7 Apr 2026 08:05:30 -0700 Subject: [PATCH 722/742] arch, hypervisor, vmm: skip vcpu setup when using igvm and kvm When we use igvm + kvm, we setup the regs and sregs using the cpuid page. We still need to setup the fpu in configure_vcpu. 
Co-authored-by: Keith Adler Signed-off-by: Keith Adler Co-authored-by: Alex Orozco Signed-off-by: Alex Orozco Signed-off-by: Ruben Hakobyan --- arch/src/x86_64/mod.rs | 23 ++++++++++--------- hypervisor/src/lib.rs | 2 +- vmm/src/cpu.rs | 51 +++++++++++++++++++++++++++++++----------- vmm/src/vm.rs | 10 +++++++++ 4 files changed, 62 insertions(+), 24 deletions(-) diff --git a/arch/src/x86_64/mod.rs b/arch/src/x86_64/mod.rs index d35a878e61..09577b436c 100644 --- a/arch/src/x86_64/mod.rs +++ b/arch/src/x86_64/mod.rs @@ -819,6 +819,7 @@ pub fn configure_vcpu( cpu_vendor: CpuVendor, topology: (u16, u16, u16, u16), nested: bool, + setup_registers: bool, ) -> super::Result<()> { let x2apic_id = get_x2apic_id(id, Some(topology)); @@ -892,17 +893,19 @@ pub fn configure_vcpu( regs::setup_msrs(vcpu).map_err(Error::MsrsConfiguration)?; if let Some((kernel_entry_point, guest_memory)) = boot_setup { - regs::setup_regs(vcpu, kernel_entry_point).map_err(Error::RegsConfiguration)?; + if setup_registers { + regs::setup_regs(vcpu, kernel_entry_point).map_err(Error::RegsConfiguration)?; + + // CPUs are required (by Intel sdm spec) to boot in x2apic mode if any + // of the apic IDs is larger than 255. Experimentally, the Linux kernel + // does not recognize the last vCPU if x2apic is not enabled when + // there are 256 vCPUs in a flat hierarchy (i.e. max x2apic ID is 255), + // so we need to enable x2apic in this case as well. + let enable_x2_apic_mode = get_max_x2apic_id(topology) > MAX_SUPPORTED_CPUS_LEGACY; + regs::setup_sregs(&guest_memory.memory(), vcpu, enable_x2_apic_mode) + .map_err(Error::SregsConfiguration)?; + } regs::setup_fpu(vcpu).map_err(Error::FpuConfiguration)?; - - // CPUs are required (by Intel sdm spec) to boot in x2apic mode if any - // of the apic IDs is larger than 255. Experimentally, the Linux kernel - // does not recognize the last vCPU if x2apic is not enabled when - // there are 256 vCPUs in a flat hierarchy (i.e. 
max x2apic ID is 255), - // so we need to enable x2apic in this case as well. - let enable_x2_apic_mode = get_max_x2apic_id(topology) > MAX_SUPPORTED_CPUS_LEGACY; - regs::setup_sregs(&guest_memory.memory(), vcpu, enable_x2_apic_mode) - .map_err(Error::SregsConfiguration)?; } interrupts::set_lint(vcpu).map_err(|e| Error::LocalIntConfiguration(e.into()))?; Ok(()) diff --git a/hypervisor/src/lib.rs b/hypervisor/src/lib.rs index 787cfbf1f4..8357e63b79 100644 --- a/hypervisor/src/lib.rs +++ b/hypervisor/src/lib.rs @@ -64,7 +64,7 @@ pub use vm::{ pub use crate::hypervisor::{Hypervisor, HypervisorError}; -#[derive(Debug, Copy, Clone)] +#[derive(Debug, Copy, Clone, PartialEq)] pub enum HypervisorType { #[cfg(feature = "kvm")] Kvm, diff --git a/vmm/src/cpu.rs b/vmm/src/cpu.rs index 1450e0a8e2..261f4c1c75 100644 --- a/vmm/src/cpu.rs +++ b/vmm/src/cpu.rs @@ -546,6 +546,7 @@ impl Vcpu { #[cfg(target_arch = "x86_64")] kvm_hyperv: bool, #[cfg(target_arch = "x86_64")] topology: (u16, u16, u16, u16), #[cfg(target_arch = "x86_64")] nested: bool, + #[cfg(feature = "igvm")] igvm_enabled: bool, ) -> Result<()> { #[cfg(target_arch = "aarch64")] { @@ -558,17 +559,32 @@ impl Vcpu { .map_err(Error::VcpuConfiguration)?; info!("Configuring vCPU: cpu_id = {}", self.id); #[cfg(target_arch = "x86_64")] - arch::configure_vcpu( - self.vcpu.as_ref(), - self.id, - boot_setup, - cpuid, - kvm_hyperv, - self.vendor, - topology, - nested, - ) - .map_err(Error::VcpuConfiguration)?; + { + // When IGVM is enabled, skip standard register setup here — the IGVM + // loader populates vCPU registers from the VMSA via set_sev_control_register + // (currently KVM-specific; MSHV handles this through its own import path). + // igvm_enabled is kept as an explicit flag rather than derived from sev_snp + // state because IGVM could theoretically be used independently of SEV-SNP. + cfg_if::cfg_if! 
{ + if #[cfg(feature = "igvm")] { + let setup_registers = !igvm_enabled; + } else { + let setup_registers = true; + } + } + arch::configure_vcpu( + self.vcpu.as_ref(), + self.id, + boot_setup, + cpuid, + kvm_hyperv, + self.vendor, + topology, + nested, + setup_registers, + ) + .map_err(Error::VcpuConfiguration)?; + } Ok(()) } @@ -697,6 +713,8 @@ pub struct CpuManager { sev_snp_enabled: bool, // State of the core scheduling group leader election (VM mode). core_scheduling_group_leader: Arc, + #[cfg(feature = "igvm")] + igvm_enabled: bool, } /// State of the core scheduling group leader election for VM-wide cookie @@ -826,6 +844,7 @@ impl CpuManager { #[cfg(feature = "tdx")] tdx_enabled: bool, numa_nodes: &NumaNodes, #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, + #[cfg(feature = "igvm")] igvm_enabled: bool, ) -> Result>> { if config.max_vcpus > hypervisor.get_max_vcpus() { return Err(Error::MaximumVcpusExceeded( @@ -902,6 +921,8 @@ impl CpuManager { core_scheduling_group_leader: Arc::new(AtomicI32::new( CoreSchedulingLeader::Initial as i32, )), + #[cfg(feature = "igvm")] + igvm_enabled, }))) } @@ -980,8 +1001,10 @@ impl CpuManager { vcpu: &mut Vcpu, boot_setup: Option<(EntryPoint, &GuestMemoryAtomic)>, ) -> Result<()> { - #[cfg(feature = "sev_snp")] - if self.sev_snp_enabled { + #[cfg(all(feature = "sev_snp", feature = "mshv"))] + if self.sev_snp_enabled + && self.hypervisor.hypervisor_type() == hypervisor::HypervisorType::Mshv + { if let Some((kernel_entry_point, _)) = boot_setup { vcpu.set_sev_control_register( kernel_entry_point.entry_addr.0 / crate::igvm::HV_PAGE_SIZE, @@ -1022,6 +1045,8 @@ impl CpuManager { self.config.kvm_hyperv, topology, self.config.nested, + #[cfg(feature = "igvm")] + self.igvm_enabled, )?; #[cfg(target_arch = "aarch64")] diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index ffe7b8624a..f67d5bc108 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -738,6 +738,14 @@ impl Vm { let tdx_enabled = config.lock().unwrap().is_tdx_enabled(); 
#[cfg(feature = "sev_snp")] let sev_snp_enabled = config.lock().unwrap().is_sev_snp_enabled(); + #[cfg(feature = "igvm")] + let igvm_enabled = config + .lock() + .unwrap() + .payload + .as_ref() + .and_then(|p| p.igvm.as_ref()) + .is_some(); let cpus_config = config.lock().unwrap().cpus.clone(); let cpu_manager = cpu::CpuManager::new( @@ -755,6 +763,8 @@ impl Vm { numa_nodes, #[cfg(feature = "sev_snp")] sev_snp_enabled, + #[cfg(feature = "igvm")] + igvm_enabled, ) .map_err(Error::CpuManager)?; From b545b2fc4eb8f402c4c40f0e39c196dd0e1e0a0f Mon Sep 17 00:00:00 2001 From: Ruben Hakobyan Date: Tue, 7 Apr 2026 08:09:58 -0700 Subject: [PATCH 723/742] hypervisor, vmm: pass SNP guest policy to sev_snp_init The SNP guest policy (AMD SEV-SNP ABI bits controlling SMT, migration, debug, etc.) was previously hardcoded inside the MSHV implementation. Widen Vm::sev_snp_init() to accept an SnpPolicy parameter so each hypervisor backend receives the policy at init time. Add get_default_sev_snp_guest_policy() in the VMM to construct the default policy. 
Co-authored-by: Keith Adler Signed-off-by: Keith Adler Co-authored-by: Alex Orozco Signed-off-by: Alex Orozco Signed-off-by: Ruben Hakobyan --- hypervisor/src/mshv/mod.rs | 4 ++-- hypervisor/src/vm.rs | 4 +++- vmm/Cargo.toml | 7 ++++++- vmm/src/vm.rs | 18 +++++++++++++++++- 4 files changed, 28 insertions(+), 5 deletions(-) diff --git a/hypervisor/src/mshv/mod.rs b/hypervisor/src/mshv/mod.rs index 8623531c5b..1119691273 100644 --- a/hypervisor/src/mshv/mod.rs +++ b/hypervisor/src/mshv/mod.rs @@ -58,7 +58,7 @@ pub use aarch64::VcpuMshvState; #[cfg(target_arch = "aarch64")] use aarch64::gic::{BASE_SPI_IRQ, MshvGicV2M}; #[cfg(feature = "sev_snp")] -use igvm_defs::IGVM_VHS_SNP_ID_BLOCK; +use igvm_defs::{IGVM_VHS_SNP_ID_BLOCK, SnpPolicy}; #[cfg(feature = "sev_snp")] use snp_constants::*; use vmm_sys_util::eventfd::EventFd; @@ -2254,7 +2254,7 @@ impl vm::Vm for MshvVm { /// Initialize the SEV-SNP VM #[cfg(feature = "sev_snp")] - fn sev_snp_init(&self) -> vm::Result<()> { + fn sev_snp_init(&self, _guest_policy: SnpPolicy) -> vm::Result<()> { self.fd .set_partition_property( hv_partition_property_code_HV_PARTITION_PROPERTY_ISOLATION_STATE, diff --git a/hypervisor/src/vm.rs b/hypervisor/src/vm.rs index 9d7e60a8be..e6787d19ef 100644 --- a/hypervisor/src/vm.rs +++ b/hypervisor/src/vm.rs @@ -17,6 +17,8 @@ use std::sync::Mutex; #[cfg(feature = "sev_snp")] use igvm_defs::IGVM_VHS_SNP_ID_BLOCK; +#[cfg(feature = "sev_snp")] +use igvm_defs::SnpPolicy; use thiserror::Error; use vmm_sys_util::eventfd::EventFd; @@ -392,7 +394,7 @@ pub trait Vm: Send + Sync + Any { fn get_dirty_log(&self, slot: u32, base_gpa: u64, memory_size: u64) -> Result>; #[cfg(feature = "sev_snp")] /// Initialize SEV-SNP on this VM - fn sev_snp_init(&self) -> Result<()> { + fn sev_snp_init(&self, _guest_policy: SnpPolicy) -> Result<()> { unimplemented!() } #[cfg(feature = "tdx")] diff --git a/vmm/Cargo.toml b/vmm/Cargo.toml index 1fe5e0e47b..7c8354a0cb 100644 --- a/vmm/Cargo.toml +++ b/vmm/Cargo.toml @@ -32,7 
+32,12 @@ mshv = [ "vm-device/mshv", ] pvmemcontrol = ["devices/pvmemcontrol"] -sev_snp = ["arch/sev_snp", "hypervisor/sev_snp", "virtio-devices/sev_snp"] +sev_snp = [ + "arch/sev_snp", + "hypervisor/sev_snp", + "igvm_defs", + "virtio-devices/sev_snp", +] tdx = ["arch/tdx", "hypervisor/tdx"] tracing = ["tracer/tracing"] diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index f67d5bc108..a60ee722d8 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -47,6 +47,8 @@ use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs; #[cfg(target_arch = "aarch64")] use hypervisor::arch::aarch64::regs::AARCH64_PMU_IRQ; use hypervisor::{HypervisorVmConfig, HypervisorVmError, VmOps}; +#[cfg(feature = "sev_snp")] +use igvm_defs::SnpPolicy; use libc::{SIGWINCH, termios}; use linux_loader::cmdline::Cmdline; #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] @@ -534,6 +536,19 @@ pub struct Vm { impl Vm { pub const HANDLED_SIGNALS: [i32; 1] = [SIGWINCH]; + #[cfg(feature = "sev_snp")] + pub fn get_default_sev_snp_guest_policy() -> SnpPolicy { + SnpPolicy::new() + .with_abi_minor(0) + .with_abi_major(0) + // SMT permitted: allows the guest to run on an SMT-enabled host. + // This is the permissive default; future work can expose this as a + // configurable platform option. 
+ .with_smt(1) + .with_reserved_must_be_one(1) + .with_migrate_ma(0) + } + #[allow(clippy::needless_pass_by_value)] #[allow(clippy::too_many_arguments)] pub fn new_from_memory_manager( @@ -982,7 +997,8 @@ impl Vm { .map_err(Error::CpuManager)?; // Initialize SEV-SNP - transitions guest into secure state - vm.sev_snp_init().map_err(Error::InitializeSevSnpVm)?; + vm.sev_snp_init(Self::get_default_sev_snp_guest_policy()) + .map_err(Error::InitializeSevSnpVm)?; // Load payload for SEV-SNP (IGVM parser needs cpu_manager for cpuid) let load_payload_handle = if snapshot.is_none() { From 7d65187350a12f6d6a4ea338e1e87e6160ca5b69 Mon Sep 17 00:00:00 2001 From: Ruben Hakobyan Date: Tue, 7 Apr 2026 18:39:37 -0700 Subject: [PATCH 724/742] vmm: make RSDP address optional in configure_system Change configure_system to take an Option since rsdp is wrapped into an option anyways (we use configure system to setup the mptables). Co-authored-by: Keith Adler Signed-off-by: Keith Adler Co-authored-by: Alex Orozco Signed-off-by: Alex Orozco Signed-off-by: Ruben Hakobyan --- vmm/src/vm.rs | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index a60ee722d8..ef3d0ebac1 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -1676,7 +1676,11 @@ impl Vm { } #[cfg(target_arch = "x86_64")] - fn configure_system(&mut self, rsdp_addr: GuestAddress, entry_addr: EntryPoint) -> Result<()> { + fn configure_system( + &mut self, + rsdp_addr: Option, + entry_addr: EntryPoint, + ) -> Result<()> { trace_scoped!("configure_system"); info!("Configuring system"); let mem = self.memory_manager.lock().unwrap().boot_guest_memory(); @@ -1687,7 +1691,6 @@ impl Vm { }; let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus(); - let rsdp_addr = Some(rsdp_addr); let serial_number = self .config @@ -1739,7 +1742,7 @@ impl Vm { #[cfg(target_arch = "aarch64")] fn configure_system( &mut self, - _rsdp_addr: GuestAddress, + _rsdp_addr: Option, 
_entry_addr: EntryPoint, ) -> Result<()> { let cmdline = Self::generate_cmdline( @@ -2776,16 +2779,11 @@ impl Vm { let rsdp_addr = self.create_acpi_tables(); #[cfg(not(target_arch = "riscv64"))] - { - #[cfg(not(any(feature = "sev_snp", feature = "tdx")))] - assert!(rsdp_addr.is_some()); - // Configure shared state based on loaded kernel - if let Some(rsdp_adr) = rsdp_addr { - entry_point - .map(|entry_point| self.configure_system(rsdp_adr, entry_point)) - .transpose()?; - } - } + // Configure shared state based on loaded kernel + entry_point + .map(|entry_point| self.configure_system(rsdp_addr, entry_point)) + .transpose()?; + #[cfg(target_arch = "riscv64")] self.configure_system().unwrap(); From 4f1119a78891e632a33ec2610f75f0a70f292ea1 Mon Sep 17 00:00:00 2001 From: Ruben Hakobyan Date: Tue, 7 Apr 2026 18:40:26 -0700 Subject: [PATCH 725/742] vmm: remove sev_snp_enabled parameter from payload loading The load_payload and load_payload_async functions previously received a sev_snp_enabled flag to decide whether to call load_igvm with or without the host_data parameter. Replace this with a single code path that always passes host_data behind a cfg(feature = "sev_snp") gate, removing the runtime branch and the extra parameter threaded through three call sites. Co-authored-by: Keith Adler Signed-off-by: Keith Adler Co-authored-by: Alex Orozco Signed-off-by: Alex Orozco Signed-off-by: Ruben Hakobyan --- vmm/src/vm.rs | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index ef3d0ebac1..beec204f79 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -942,8 +942,6 @@ impl Vm { config, #[cfg(feature = "igvm")] cpu_manager, - #[cfg(feature = "sev_snp")] - false, )? } else { None @@ -1007,7 +1005,6 @@ impl Vm { config, #[cfg(feature = "igvm")] cpu_manager, - true, )? 
} else { None @@ -1583,19 +1580,19 @@ impl Vm { payload: &PayloadConfig, memory_manager: Arc>, #[cfg(feature = "igvm")] cpu_manager: Arc>, - #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, ) -> Result { trace_scoped!("load_payload"); #[cfg(feature = "igvm")] { if let Some(_igvm_file) = &payload.igvm { let igvm = File::open(_igvm_file).map_err(Error::IgvmFile)?; - #[cfg(feature = "sev_snp")] - if sev_snp_enabled { - return Self::load_igvm(igvm, memory_manager, cpu_manager, &payload.host_data); - } - #[cfg(not(feature = "sev_snp"))] - return Self::load_igvm(igvm, memory_manager, cpu_manager); + return Self::load_igvm( + igvm, + memory_manager, + cpu_manager, + #[cfg(feature = "sev_snp")] + &payload.host_data, + ); } } match (&payload.firmware, &payload.kernel) { @@ -1639,7 +1636,6 @@ impl Vm { memory_manager: &Arc>, config: &Arc>, #[cfg(feature = "igvm")] cpu_manager: &Arc>, - #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, ) -> Result>>> { // Kernel with TDX is loaded in a different manner #[cfg(feature = "tdx")] @@ -1666,8 +1662,6 @@ impl Vm { memory_manager, #[cfg(feature = "igvm")] cpu_manager, - #[cfg(feature = "sev_snp")] - sev_snp_enabled, ) }) .map_err(Error::KernelLoadThreadSpawn) From 425609a8b5fc10c9c0b35a44dc4f8b08a4751031 Mon Sep 17 00:00:00 2001 From: Ruben Hakobyan Date: Thu, 9 Apr 2026 16:58:58 -0700 Subject: [PATCH 726/742] vmm: parse IGVM file early and thread it through VM setup Move IGVM file parsing from load_igvm() into a dedicated parse_igvm() helper in igvm/mod.rs, and parse the file upfront in Vm::new() so the resulting IgvmFile struct is available throughout VM initialization. This is a prerequisite for extracting VMSA SEV features from the parsed IGVM before issuing KVM_SEV_INIT2, which needs sev_features. 
Signed-off-by: Ruben Hakobyan --- vmm/src/igvm/igvm_loader.rs | 14 ++++------- vmm/src/igvm/mod.rs | 9 ++++++++ vmm/src/lib.rs | 2 ++ vmm/src/vm.rs | 46 +++++++++++++++++++++++++++++-------- 4 files changed, 52 insertions(+), 19 deletions(-) diff --git a/vmm/src/igvm/igvm_loader.rs b/vmm/src/igvm/igvm_loader.rs index 6e256c1ecb..444fbcc539 100644 --- a/vmm/src/igvm/igvm_loader.rs +++ b/vmm/src/igvm/igvm_loader.rs @@ -4,12 +4,11 @@ // use std::collections::HashMap; use std::ffi::CString; -use std::io::{Read, Seek, SeekFrom}; use std::mem::size_of; use std::sync::{Arc, Mutex}; use igvm::snp_defs::SevVmsa; -use igvm::{IgvmDirectiveHeader, IgvmFile, IgvmPlatformHeader, IsolationType}; +use igvm::{IgvmDirectiveHeader, IgvmFile, IgvmPlatformHeader}; #[cfg(feature = "sev_snp")] use igvm_defs::{IGVM_VHS_MEMORY_MAP_ENTRY, MemoryMapEntryType}; use igvm_defs::{ @@ -51,6 +50,8 @@ pub enum Error { FailedToDecodeHostData(#[source] hex::FromHexError), #[error("Error allocating address space")] MemoryManager(MemoryManagerError), + #[error("IGVM file not provided")] + MissingIgvm, } #[allow(dead_code)] @@ -135,7 +136,7 @@ fn import_parameter( /// any isolation. 
#[allow(clippy::needless_pass_by_value)] pub fn load_igvm( - mut file: &std::fs::File, + igvm_file: IgvmFile, memory_manager: Arc>, cpu_manager: Arc>, cmdline: &str, @@ -143,7 +144,6 @@ pub fn load_igvm( ) -> Result, Error> { let mut loaded_info: Box = Box::default(); let command_line = CString::new(cmdline).map_err(Error::InvalidCommandLine)?; - let mut file_contents = Vec::new(); let memory = memory_manager.lock().as_ref().unwrap().guest_memory(); let mut gpas: Vec = Vec::new(); let proc_count = cpu_manager.lock().unwrap().vcpus().len() as u32; @@ -156,12 +156,6 @@ pub fn load_igvm( .map_err(Error::FailedToDecodeHostData)?; } - file.seek(SeekFrom::Start(0)).map_err(Error::Igvm)?; - file.read_to_end(&mut file_contents).map_err(Error::Igvm)?; - - let igvm_file = IgvmFile::new_from_binary(&file_contents, Some(IsolationType::Snp)) - .map_err(Error::InvalidIgvmFile)?; - let mask = match &igvm_file.platforms()[0] { IgvmPlatformHeader::SupportedPlatform(info) => { debug_assert!(info.platform_type == IgvmPlatformType::SEV_SNP); diff --git a/vmm/src/igvm/mod.rs b/vmm/src/igvm/mod.rs index 62c32d4e89..ded102bd35 100644 --- a/vmm/src/igvm/mod.rs +++ b/vmm/src/igvm/mod.rs @@ -27,10 +27,19 @@ pub mod igvm_loader; mod loader; +use std::path::Path; + use igvm::snp_defs::SevVmsa; +use igvm::{IgvmFile, IsolationType}; use igvm_defs::IGVM_VHS_SNP_ID_BLOCK; use zerocopy::FromZeros; +pub fn parse_igvm(igvm_path: &Path) -> Result { + let file_contents = std::fs::read(igvm_path).map_err(igvm_loader::Error::Igvm)?; + IgvmFile::new_from_binary(&file_contents, Some(IsolationType::Snp)) + .map_err(igvm_loader::Error::InvalidIgvmFile) +} + #[derive(Debug, Clone)] pub struct IgvmLoadedInfo { pub gpas: Vec, diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 5dfea07e78..f98c5b7258 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -1161,6 +1161,8 @@ impl Vmm { self.console_resize_pipe.clone(), Arc::clone(&self.original_termios_opt), Some(&snapshot), + #[cfg(feature = "igvm")] + None, ) 
.map_err(|e| { MigratableError::MigrateReceive(anyhow!("Error creating VM from snapshot: {e:?}")) diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index beec204f79..fe74e6a75e 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -47,6 +47,8 @@ use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs; #[cfg(target_arch = "aarch64")] use hypervisor::arch::aarch64::regs::AARCH64_PMU_IRQ; use hypervisor::{HypervisorVmConfig, HypervisorVmError, VmOps}; +#[cfg(feature = "igvm")] +use igvm::IgvmFile; #[cfg(feature = "sev_snp")] use igvm_defs::SnpPolicy; use libc::{SIGWINCH, termios}; @@ -333,10 +335,6 @@ pub enum Error { #[error("Error coredumping VM")] Coredump(#[source] GuestDebuggableError), - #[cfg(feature = "igvm")] - #[error("Cannot open igvm file")] - IgvmFile(#[source] io::Error), - #[cfg(feature = "igvm")] #[error("Cannot load the igvm into memory")] IgvmLoad(#[source] igvm_loader::Error), @@ -567,6 +565,7 @@ impl Vm { console_resize_pipe: Option>, original_termios: Arc>>, snapshot: Option<&Snapshot>, + #[cfg(feature = "igvm")] igvm_file: Option, ) -> Result { trace_scoped!("Vm::new_from_memory_manager"); @@ -649,6 +648,8 @@ impl Vm { console_resize_pipe.as_ref(), &original_termios, snapshot, + #[cfg(feature = "igvm")] + igvm_file, )?; // Load kernel and initramfs files @@ -879,6 +880,7 @@ impl Vm { console_resize_pipe: Option<&Arc>, original_termios: &Arc>>, snapshot: Option<&Snapshot>, + #[cfg(feature = "igvm")] igvm_file: Option, ) -> Result>>> { #[cfg(feature = "mshv")] let is_mshv = matches!( @@ -913,6 +915,8 @@ impl Vm { console_resize_pipe, original_termios, snapshot, + #[cfg(feature = "igvm")] + igvm_file, ); } @@ -942,6 +946,8 @@ impl Vm { config, #[cfg(feature = "igvm")] cpu_manager, + #[cfg(feature = "igvm")] + igvm_file, )? 
} else { None @@ -986,6 +992,7 @@ impl Vm { console_resize_pipe: Option<&Arc>, original_termios: &Arc>>, snapshot: Option<&Snapshot>, + #[cfg(feature = "igvm")] igvm_file: Option, ) -> Result>>> { // Create boot vCPUs before SEV-SNP initialization cpu_manager @@ -1005,6 +1012,8 @@ impl Vm { config, #[cfg(feature = "igvm")] cpu_manager, + #[cfg(feature = "igvm")] + igvm_file, )? } else { None @@ -1306,6 +1315,18 @@ impl Vm { vm_config.lock().unwrap().is_tdx_enabled() }; + #[cfg(feature = "igvm")] + let igvm_file = { + let config = vm_config.lock().unwrap(); + config + .payload + .as_ref() + .and_then(|p| p.igvm.as_ref()) + .map(|igvm_path| crate::igvm::parse_igvm(igvm_path)) + .transpose() + .map_err(Error::IgvmLoad)? + }; + let vm = Self::create_hypervisor_vm( hypervisor.as_ref(), vm_config.as_ref().lock().unwrap().deref().into(), @@ -1366,6 +1387,8 @@ impl Vm { console_resize_pipe, original_termios, snapshot, + #[cfg(feature = "igvm")] + igvm_file, ) } @@ -1484,13 +1507,13 @@ impl Vm { #[cfg(feature = "igvm")] #[allow(clippy::needless_pass_by_value)] fn load_igvm( - igvm: File, + igvm_file: IgvmFile, memory_manager: Arc>, cpu_manager: Arc>, #[cfg(feature = "sev_snp")] host_data: &Option, ) -> Result { let res = igvm_loader::load_igvm( - &igvm, + igvm_file, memory_manager, cpu_manager.clone(), "", @@ -1580,14 +1603,16 @@ impl Vm { payload: &PayloadConfig, memory_manager: Arc>, #[cfg(feature = "igvm")] cpu_manager: Arc>, + #[cfg(feature = "igvm")] igvm_file: Option, ) -> Result { trace_scoped!("load_payload"); #[cfg(feature = "igvm")] { - if let Some(_igvm_file) = &payload.igvm { - let igvm = File::open(_igvm_file).map_err(Error::IgvmFile)?; + if payload.igvm.is_some() { + let igvm_file = + igvm_file.ok_or(Error::IgvmLoad(igvm_loader::Error::MissingIgvm))?; return Self::load_igvm( - igvm, + igvm_file, memory_manager, cpu_manager, #[cfg(feature = "sev_snp")] @@ -1636,6 +1661,7 @@ impl Vm { memory_manager: &Arc>, config: &Arc>, #[cfg(feature = "igvm")] cpu_manager: 
&Arc>, + #[cfg(feature = "igvm")] igvm_file: Option, ) -> Result>>> { // Kernel with TDX is loaded in a different manner #[cfg(feature = "tdx")] @@ -1662,6 +1688,8 @@ impl Vm { memory_manager, #[cfg(feature = "igvm")] cpu_manager, + #[cfg(feature = "igvm")] + igvm_file, ) }) .map_err(Error::KernelLoadThreadSpawn) From 2e004521e01f8c7a8d50908d4b6b3396ab03160f Mon Sep 17 00:00:00 2001 From: Ruben Hakobyan Date: Tue, 7 Apr 2026 14:23:13 -0700 Subject: [PATCH 727/742] hypervisor, vmm: Add KVM SEV_{INIT2, SNP_LAUNCH_START} support Introduce the SevFd abstraction that wraps /dev/sev and implements the KVM_SEV_INIT2 and KVM_SEV_SNP_LAUNCH_START ioctls for SEV-SNP VM initialization on KVM. Key changes: - Add sev.rs with KvmSevInit and KvmSevSnpLaunchStart ioctl structs matching the kernel layout (linux/arch/x86/include/uapi/asm/kvm.h) - Implement KVM_SEV_INIT2 and KVM_SEV_SNP_LAUNCH_START ioctls - Set KVM_MEMORY_ATTRIBUTE_PRIVATE on newly created memory regions when guest_memfd is supported - Widen SevSnpPageAccessProxy cfg gates from mshv-only to all sev_snp-enabled builds - Make sev_snp_init a required trait method (remove default impl) - Include KVM_SEV_SNP_LAUNCH_START in the seccomp allowlist - Parse VMSA SEV features from IGVM and include them in the KVM_SEV_INIT2 ioctl Co-authored-by: Keith Adler Signed-off-by: Keith Adler Co-authored-by: Alex Orozco Signed-off-by: Alex Orozco Co-authored-by: Rob Bradford Signed-off-by: Rob Bradford Signed-off-by: Ruben Hakobyan --- hypervisor/src/cpu.rs | 5 +- hypervisor/src/hypervisor.rs | 5 ++ hypervisor/src/kvm/mod.rs | 77 +++++++++++++++++---- hypervisor/src/kvm/x86_64/mod.rs | 3 + hypervisor/src/kvm/x86_64/sev.rs | 113 +++++++++++++++++++++++++++++++ hypervisor/src/lib.rs | 2 + hypervisor/src/vm.rs | 4 +- vmm/src/device_manager.rs | 8 +-- vmm/src/igvm/igvm_loader.rs | 15 ++++ vmm/src/lib.rs | 2 + vmm/src/seccomp_filters.rs | 2 + vmm/src/vm.rs | 14 ++-- 12 files changed, 225 insertions(+), 25 deletions(-) create mode 100644 
hypervisor/src/kvm/x86_64/sev.rs diff --git a/hypervisor/src/cpu.rs b/hypervisor/src/cpu.rs index 4bc348a98d..a4a029e989 100644 --- a/hypervisor/src/cpu.rs +++ b/hypervisor/src/cpu.rs @@ -587,10 +587,11 @@ pub trait Vcpu: Send + Sync { ) -> Result<[u32; 4]> { unimplemented!() } - #[cfg(feature = "mshv")] - fn set_sev_control_register(&self, _reg: u64) -> Result<()> { + #[cfg(feature = "sev_snp")] + fn set_sev_control_register(&self, _vmsa_pfn: u64) -> Result<()> { unimplemented!() } + /// /// Sets the value of GIC redistributor address /// diff --git a/hypervisor/src/hypervisor.rs b/hypervisor/src/hypervisor.rs index a25f8a9bf7..05852a230f 100644 --- a/hypervisor/src/hypervisor.rs +++ b/hypervisor/src/hypervisor.rs @@ -96,6 +96,11 @@ pub enum HypervisorError { #[cfg(target_arch = "x86_64")] #[error("Failed to enable AMX tile state components")] CouldNotEnableAmxStateComponents(#[source] crate::arch::x86::AmxGuestSupportError), + /// + /// Failed to retrieve SEV-SNP capabilities + /// + #[error("Failed to retrieve SEV-SNP capabilities:{0}")] + SevSnpCapabilities(#[source] anyhow::Error), } /// diff --git a/hypervisor/src/kvm/mod.rs b/hypervisor/src/kvm/mod.rs index 89090294a9..a697d5df79 100644 --- a/hypervisor/src/kvm/mod.rs +++ b/hypervisor/src/kvm/mod.rs @@ -140,6 +140,9 @@ use crate::kvm::x86_64::XsaveStateError; #[cfg(target_arch = "x86_64")] ioctl_io_nr!(KVM_NMI, kvm_bindings::KVMIO, 0x9a); +#[cfg(feature = "sev_snp")] +use kvm_bindings::{KVM_MEMORY_ATTRIBUTE_PRIVATE, KVM_X86_SNP_VM, kvm_memory_attributes}; + #[cfg(feature = "tdx")] const KVM_EXIT_TDX: u32 = 50; #[cfg(feature = "tdx")] @@ -498,9 +501,11 @@ struct KvmDirtyLogSlot { /// Wrapper over KVM VM ioctls. 
pub struct KvmVm { - fd: VmFd, + fd: Arc, #[cfg(target_arch = "x86_64")] msrs: Vec, + #[cfg(all(feature = "sev_snp", target_arch = "x86_64"))] + sev_fd: Option, dirty_log_slots: RwLock>, guest_memfds: Option>>, } @@ -621,6 +626,15 @@ impl KvmVm { /// let vm = hypervisor.create_vm(HypervisorVmConfig::default()).expect("new VM fd creation failed"); /// ``` impl vm::Vm for KvmVm { + #[cfg(all(feature = "sev_snp", target_arch = "x86_64"))] + fn sev_snp_init(&self, guest_policy: igvm_defs::SnpPolicy) -> vm::Result<()> { + self.sev_fd + .as_ref() + .unwrap() + .launch_start(&self.fd, guest_policy) + .map_err(|e| vm::HypervisorVmError::InitializeSevSnp(e.into())) + } + #[cfg(target_arch = "x86_64")] /// /// Sets the address of the one-page region in the VM's address space. @@ -938,6 +952,18 @@ impl vm::Vm for KvmVm { self.set_user_memory_region(region) .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into()))?; } + + #[cfg(feature = "sev_snp")] + if self.guest_memfds.is_some() { + self.fd + .set_memory_attributes(kvm_memory_attributes { + address: region.guest_phys_addr, + size: region.memory_size, + attributes: KVM_MEMORY_ATTRIBUTE_PRIVATE as u64, + flags: 0, + }) + .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into()))?; + } Ok(()) } @@ -1383,15 +1409,17 @@ impl hypervisor::Hypervisor for KvmHypervisor { } #[cfg(target_arch = "x86_64")] - cfg_if::cfg_if! 
{ - if #[cfg(feature = "tdx")] { - if _config.tdx_enabled { - vm_type = KVM_X86_SW_PROTECTED_VM.into(); - } else { - vm_type = KVM_X86_DEFAULT_VM.into(); - } - } else { - vm_type = KVM_X86_DEFAULT_VM.into(); + { + vm_type = KVM_X86_DEFAULT_VM.into(); + + #[cfg(feature = "sev_snp")] + if _config.sev_snp_enabled { + vm_type = KVM_X86_SNP_VM.into(); + } + + #[cfg(feature = "tdx")] + if _config.tdx_enabled { + vm_type = KVM_X86_SW_PROTECTED_VM.into(); } } @@ -1433,10 +1461,35 @@ impl hypervisor::Hypervisor for KvmHypervisor { guest_memfds = Some(RwLock::new(HashMap::new())); } + #[cfg(feature = "sev_snp")] + let sev_fd = { + let sev_snp_enabled = vm_type == KVM_X86_SNP_VM as u64; + if sev_snp_enabled { + let mask = self.kvm.check_extension_int(crate::kvm::Cap::ExitHypercall); + let cap = kvm_bindings::kvm_enable_cap { + cap: kvm_bindings::KVM_CAP_EXIT_HYPERCALL, + args: [mask as _, 0, 0, 0], + ..Default::default() + }; + fd.enable_cap(&cap) + .map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?; + let sev_dev = x86_64::sev::SevFd::new("/dev/sev") + .map_err(|e| hypervisor::HypervisorError::SevSnpCapabilities(e.into()))?; + sev_dev + .init2(&fd, _config.vmsa_features) + .map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?; + Some(sev_dev) + } else { + None + } + }; + Ok(Arc::new(KvmVm { - fd, + fd: Arc::new(fd), msrs, dirty_log_slots: RwLock::new(HashMap::new()), + #[cfg(feature = "sev_snp")] + sev_fd, guest_memfds, })) } @@ -1444,7 +1497,7 @@ impl hypervisor::Hypervisor for KvmHypervisor { #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] { Ok(Arc::new(KvmVm { - fd, + fd: Arc::new(fd), dirty_log_slots: RwLock::new(HashMap::new()), guest_memfds: None, })) diff --git a/hypervisor/src/kvm/x86_64/mod.rs b/hypervisor/src/kvm/x86_64/mod.rs index e338346c3f..62185fd84e 100644 --- a/hypervisor/src/kvm/x86_64/mod.rs +++ b/hypervisor/src/kvm/x86_64/mod.rs @@ -31,6 +31,9 @@ use crate::arch::x86::{ }; use crate::kvm::{Cap, Kvm, KvmError, KvmResult}; 
+#[cfg(feature = "sev_snp")] +pub(crate) mod sev; + /// /// Check KVM extension for Linux /// diff --git a/hypervisor/src/kvm/x86_64/sev.rs b/hypervisor/src/kvm/x86_64/sev.rs new file mode 100644 index 0000000000..d3497bed3b --- /dev/null +++ b/hypervisor/src/kvm/x86_64/sev.rs @@ -0,0 +1,113 @@ +// Copyright 2025 Google LLC. +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::fs::OpenOptions; +use std::os::fd::{AsRawFd, OwnedFd}; +use std::os::unix::fs::OpenOptionsExt; +use std::path::Path; + +use igvm_defs::SnpPolicy; +use kvm_bindings::kvm_sev_cmd; +use kvm_ioctls::VmFd; +use log::{error, info}; +use vmm_sys_util::errno; + +pub(crate) type Result = std::result::Result; + +// KVM SEV command IDs — linux/include/uapi/linux/kvm.h +const KVM_SEV_INIT2: u32 = 22; +const KVM_SEV_SNP_LAUNCH_START: u32 = 100; + +// SNP in VMSA - linux/arch/x86/include/asm/svm.h +const SVM_SEV_FEAT_SNP_ACTIVE: u64 = 1 << 0; + +fn sev_op(vm: &VmFd, sev_cmd: &mut kvm_sev_cmd, name: &str) -> Result<()> { + let ret = vm.encrypt_op_sev(sev_cmd); + if ret.is_err() { + error!("{name} op failed. error code: 0x{:x}", sev_cmd.error); + } + ret +} + +#[derive(Debug)] +pub struct SevFd { + pub fd: OwnedFd, +} + +// These ioctl structs must match the kernel layout exactly. 
+// Layouts from linux/arch/x86/include/uapi/asm/kvm.h + +#[repr(C, packed)] +#[derive(Debug, Copy, Clone, Default)] +pub(crate) struct KvmSevInit { + pub vmsa_features: u64, + pub flags: u32, + pub ghcb_version: u16, + pub pad1: u16, + pub pad2: [u32; 8], +} + +#[repr(C, packed)] +#[derive(Debug, Copy, Clone, Default)] +pub(crate) struct KvmSevSnpLaunchStart { + pub policy: u64, + pub gosvw: [u8; 16], + pub flags: u16, + pub pad0: [u8; 6], + pub pad1: [u64; 4], +} + +impl SevFd { + pub(crate) fn new(sev_path: impl AsRef) -> Result { + let file = OpenOptions::new() + .read(true) + .write(true) + .custom_flags(libc::O_CLOEXEC) + .open(sev_path.as_ref()) + .map_err(|e| errno::Error::new(e.raw_os_error().unwrap_or(libc::EINVAL)))?; + Ok(SevFd { + fd: OwnedFd::from(file), + }) + } + + pub(crate) fn init2(&self, vm: &VmFd, vmsa_features: u64) -> Result<()> { + // Clear the SNP bit, KVM sets it directly + let vmsa_features = vmsa_features & !SVM_SEV_FEAT_SNP_ACTIVE; + + // TODO: Query KVM for supported VMSA features before calling init2 + if vmsa_features != 0 { + info!("SEV-SNP: requesting vmsa_features: {vmsa_features:#x}"); + } + + let mut init = KvmSevInit { + vmsa_features, + ..Default::default() + }; + let mut sev_cmd = kvm_sev_cmd { + id: KVM_SEV_INIT2, + data: &mut init as *mut KvmSevInit as _, + sev_fd: self.fd.as_raw_fd() as _, + ..Default::default() + }; + sev_op(vm, &mut sev_cmd, "KVM_SEV_INIT2") + } + + pub(crate) fn launch_start(&self, vm: &VmFd, guest_policy: SnpPolicy) -> Result<()> { + // See AMD Spec Section 4.3 - Guest Policy + // Bit 17 is reserved and has to be one. 
+ // https://docs.amd.com/v/u/en-US/56860_PUB_1.58_SEV_SNP + let mut start: KvmSevSnpLaunchStart = KvmSevSnpLaunchStart { + policy: guest_policy.into_bits(), + ..Default::default() + }; + let mut sev_cmd = kvm_sev_cmd { + id: KVM_SEV_SNP_LAUNCH_START, + data: &mut start as *mut KvmSevSnpLaunchStart as _, + sev_fd: self.fd.as_raw_fd() as _, + ..Default::default() + }; + sev_op(vm, &mut sev_cmd, "KVM_SEV_SNP_LAUNCH_START") + } +} diff --git a/hypervisor/src/lib.rs b/hypervisor/src/lib.rs index 8357e63b79..f224e7217c 100644 --- a/hypervisor/src/lib.rs +++ b/hypervisor/src/lib.rs @@ -190,6 +190,8 @@ pub struct HypervisorVmConfig { pub sev_snp_enabled: bool, #[cfg(feature = "sev_snp")] pub mem_size: u64, + #[cfg(feature = "sev_snp")] + pub vmsa_features: u64, pub nested: bool, pub smt_enabled: bool, } diff --git a/hypervisor/src/vm.rs b/hypervisor/src/vm.rs index e6787d19ef..36aae27b08 100644 --- a/hypervisor/src/vm.rs +++ b/hypervisor/src/vm.rs @@ -394,9 +394,7 @@ pub trait Vm: Send + Sync + Any { fn get_dirty_log(&self, slot: u32, base_gpa: u64, memory_size: u64) -> Result>; #[cfg(feature = "sev_snp")] /// Initialize SEV-SNP on this VM - fn sev_snp_init(&self, _guest_policy: SnpPolicy) -> Result<()> { - unimplemented!() - } + fn sev_snp_init(&self, guest_policy: SnpPolicy) -> Result<()>; #[cfg(feature = "tdx")] /// Initialize TDX on this VM fn tdx_init(&self, _cpuid: &[CpuIdEntry], _max_vcpus: u32) -> Result<()> { diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 948afdfae7..dc3d827e73 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -957,26 +957,26 @@ pub struct AcpiPlatformAddresses { pub sleep_status_reg_address: Option, } -#[cfg(all(feature = "mshv", feature = "sev_snp"))] +#[cfg(feature = "sev_snp")] struct SevSnpPageAccessProxy { vm: Arc, } -#[cfg(all(feature = "mshv", feature = "sev_snp"))] +#[cfg(feature = "sev_snp")] impl std::fmt::Debug for SevSnpPageAccessProxy { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) 
-> std::fmt::Result { write!(f, "SNP Page access proxy") } } -#[cfg(all(feature = "mshv", feature = "sev_snp"))] +#[cfg(feature = "sev_snp")] impl SevSnpPageAccessProxy { fn new(vm: Arc) -> SevSnpPageAccessProxy { SevSnpPageAccessProxy { vm } } } -#[cfg(all(feature = "mshv", feature = "sev_snp"))] +#[cfg(feature = "sev_snp")] impl AccessPlatform for SevSnpPageAccessProxy { fn translate_gpa(&self, base: u64, _size: u64) -> std::result::Result { Ok(base) diff --git a/vmm/src/igvm/igvm_loader.rs b/vmm/src/igvm/igvm_loader.rs index 444fbcc539..d7a8f0cae8 100644 --- a/vmm/src/igvm/igvm_loader.rs +++ b/vmm/src/igvm/igvm_loader.rs @@ -129,6 +129,21 @@ fn import_parameter( Ok(()) } +/// +/// Extract sev_features from the boot CPU (vp_index 0) VMSA. +/// +#[cfg(feature = "sev_snp")] +pub fn extract_sev_features(igvm_file: &IgvmFile) -> u64 { + for header in igvm_file.directives() { + if let IgvmDirectiveHeader::SnpVpContext { vp_index, vmsa, .. } = header + && *vp_index == 0 + { + return vmsa.sev_features.into(); + } + } + 0 +} + /// /// Load the given IGVM file to guest memory. /// Right now it only supports SNP based isolation. 
diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index f98c5b7258..b0117d729c 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -234,6 +234,8 @@ impl From<&VmConfig> for hypervisor::HypervisorVmConfig { sev_snp_enabled: _value.is_sev_snp_enabled(), #[cfg(feature = "sev_snp")] mem_size: _value.memory.total_size(), + #[cfg(feature = "sev_snp")] + vmsa_features: 0, nested: _value.cpus.nested, smt_enabled: _value .cpus diff --git a/vmm/src/seccomp_filters.rs b/vmm/src/seccomp_filters.rs index 008fc60f29..1258959434 100644 --- a/vmm/src/seccomp_filters.rs +++ b/vmm/src/seccomp_filters.rs @@ -111,6 +111,7 @@ mod kvm { pub const KVM_NMI: u64 = 0xae9a; pub const KVM_GET_NESTED_STATE: u64 = 3229658814; pub const KVM_SET_NESTED_STATE: u64 = 1082175167; + pub const KVM_SEV_SNP_LAUNCH_START: u64 = 0x4018_aeb4; } mod iommufd { @@ -267,6 +268,7 @@ fn create_vmm_ioctl_seccomp_rule_common_kvm() -> Result, Backen and![Cond::new(1, ArgLen::Dword, Eq, KVM_NMI)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_NESTED_STATE)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_NESTED_STATE)?], + and![Cond::new(1, ArgLen::Dword, Eq, KVM_SEV_SNP_LAUNCH_START)?], ]) } diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index fe74e6a75e..74ca6a7ed0 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -1327,10 +1327,16 @@ impl Vm { .map_err(Error::IgvmLoad)? }; - let vm = Self::create_hypervisor_vm( - hypervisor.as_ref(), - vm_config.as_ref().lock().unwrap().deref().into(), - )?; + let vm = { + #[allow(unused_mut)] + let mut hv_config: hypervisor::HypervisorVmConfig = + vm_config.as_ref().lock().unwrap().deref().into(); + #[cfg(all(feature = "igvm", feature = "sev_snp"))] + if let Some(ref igvm) = igvm_file { + hv_config.vmsa_features = igvm_loader::extract_sev_features(igvm); + } + Self::create_hypervisor_vm(hypervisor.as_ref(), hv_config)? 
+ }; #[cfg(all(feature = "kvm", target_arch = "x86_64"))] if vm_config.lock().unwrap().max_apic_id() > MAX_SUPPORTED_CPUS_LEGACY { From 24db5e1efd4c5db19bb5e9e86352622c01b59bb0 Mon Sep 17 00:00:00 2001 From: Ruben Hakobyan Date: Tue, 7 Apr 2026 16:43:32 -0700 Subject: [PATCH 728/742] hypervisor, vmm: Add support for KVM_SEV_SNP_LAUNCH_UPDATE Implement the KVM_SEV_SNP_LAUNCH_UPDATE ioctl. Extend Vm::import_isolated_pages() with a uaddrs parameter carrying host virtual addresses, which KVM needs, unlike MSHV. Compute uaddrs from guest memory mappings in the IGVM loader. Add KVM_SEV_SNP_LAUNCH_UPDATE to the seccomp allowlist. Co-authored-by: Keith Adler Signed-off-by: Keith Adler Co-authored-by: Alex Orozco Signed-off-by: Alex Orozco Signed-off-by: Ruben Hakobyan --- hypervisor/src/kvm/mod.rs | 40 +++++++++++++++++++++++++++ hypervisor/src/kvm/x86_64/sev.rs | 47 ++++++++++++++++++++++++++++++++ hypervisor/src/mshv/mod.rs | 1 + hypervisor/src/vm.rs | 1 + vmm/src/igvm/igvm_loader.rs | 13 +++++++++ vmm/src/seccomp_filters.rs | 2 ++ 6 files changed, 104 insertions(+) diff --git a/hypervisor/src/kvm/mod.rs b/hypervisor/src/kvm/mod.rs index a697d5df79..db18c64169 100644 --- a/hypervisor/src/kvm/mod.rs +++ b/hypervisor/src/kvm/mod.rs @@ -142,6 +142,8 @@ ioctl_io_nr!(KVM_NMI, kvm_bindings::KVMIO, 0x9a); #[cfg(feature = "sev_snp")] use kvm_bindings::{KVM_MEMORY_ATTRIBUTE_PRIVATE, KVM_X86_SNP_VM, kvm_memory_attributes}; +#[cfg(feature = "sev_snp")] +use x86_64::sev; #[cfg(feature = "tdx")] const KVM_EXIT_TDX: u32 = 50; @@ -635,6 +637,44 @@ impl vm::Vm for KvmVm { .map_err(|e| vm::HypervisorVmError::InitializeSevSnp(e.into())) } + #[cfg(all(feature = "sev_snp", target_arch = "x86_64"))] + fn import_isolated_pages( + &self, + page_type: u32, + page_size: u32, + // host page frame numbers + pfns: &[u64], + uaddrs: &[u64], + ) -> vm::Result<()> { + if pfns.is_empty() { + return Ok(()); + } + assert_eq!(pfns.len(), uaddrs.len()); + // VMSA pages are not supported by launch_update + 
// https://elixir.bootlin.com/linux/v6.11/source/arch/x86/kvm/svm/sev.c#L2377 + if page_type == sev::SNP_PAGE_TYPE_VMSA { + return Ok(()); + } + for i in 0..pfns.len() { + self.fd + .set_memory_attributes(kvm_memory_attributes { + address: pfns[i] << sev::GPA_METADATA_SHIFT_OFFSET, + size: page_size as u64, + attributes: kvm_bindings::KVM_MEMORY_ATTRIBUTE_PRIVATE as u64, + // Flags must be zero o/w error (flags aren't being used here yet) + flags: 0, + }) + .map_err(|e| vm::HypervisorVmError::ImportIsolatedPages(e.into()))?; + self.sev_fd + .as_ref() + .unwrap() + .launch_update(&self.fd, uaddrs[i], page_size as u64, pfns[i], page_type) + .map_err(|e| vm::HypervisorVmError::ImportIsolatedPages(e.into()))?; + } + + Ok(()) + } + #[cfg(target_arch = "x86_64")] /// /// Sets the address of the one-page region in the VM's address space. diff --git a/hypervisor/src/kvm/x86_64/sev.rs b/hypervisor/src/kvm/x86_64/sev.rs index d3497bed3b..a2aebdd191 100644 --- a/hypervisor/src/kvm/x86_64/sev.rs +++ b/hypervisor/src/kvm/x86_64/sev.rs @@ -19,6 +19,14 @@ pub(crate) type Result = std::result::Result; // KVM SEV command IDs — linux/include/uapi/linux/kvm.h const KVM_SEV_INIT2: u32 = 22; const KVM_SEV_SNP_LAUNCH_START: u32 = 100; +const KVM_SEV_SNP_LAUNCH_UPDATE: u32 = 101; +// SNP_LAUNCH_UPDATE page types — linux/arch/x86/include/uapi/asm/sev-guest.h +pub const SNP_PAGE_TYPE_VMSA: u32 = 2; + +// See AMD Spec Section 8.17 — SNP_LAUNCH_UPDATE +// The last 12 bits are metadata about the guest context +// https://docs.amd.com/v/u/en-US/56860_PUB_1.58_SEV_SNP +pub const GPA_METADATA_SHIFT_OFFSET: u32 = 12; // SNP in VMSA - linux/arch/x86/include/asm/svm.h const SVM_SEV_FEAT_SNP_ACTIVE: u64 = 1 << 0; @@ -59,6 +67,19 @@ pub(crate) struct KvmSevSnpLaunchStart { pub pad1: [u64; 4], } +#[repr(C, packed)] +#[derive(Debug, Copy, Clone, Default)] +pub(crate) struct KvmSevSnpLaunchUpdate { + pub gfn_start: u64, + pub uaddr: u64, + pub len: u64, + pub type_: u8, + pub pad0: u8, + pub flags: u16, 
+ pub pad1: u32, + pub pad2: [u64; 4], +} + impl SevFd { pub(crate) fn new(sev_path: impl AsRef) -> Result { let file = OpenOptions::new() @@ -110,4 +131,30 @@ impl SevFd { }; sev_op(vm, &mut sev_cmd, "KVM_SEV_SNP_LAUNCH_START") } + + pub(crate) fn launch_update( + &self, + vm: &VmFd, + // host virtual address + hva: u64, + size: u64, + // guest frame number + gfn_start: u64, + page_type: u32, + ) -> Result<()> { + let mut update = KvmSevSnpLaunchUpdate { + gfn_start, + uaddr: hva, + len: size, + type_: page_type as u8, + ..Default::default() + }; + let mut sev_cmd = kvm_sev_cmd { + id: KVM_SEV_SNP_LAUNCH_UPDATE, + data: &mut update as *mut KvmSevSnpLaunchUpdate as _, + sev_fd: self.fd.as_raw_fd() as _, + ..Default::default() + }; + sev_op(vm, &mut sev_cmd, "KVM_SEV_SNP_LAUNCH_UPDATE") + } } diff --git a/hypervisor/src/mshv/mod.rs b/hypervisor/src/mshv/mod.rs index 1119691273..a61f2e44ef 100644 --- a/hypervisor/src/mshv/mod.rs +++ b/hypervisor/src/mshv/mod.rs @@ -2272,6 +2272,7 @@ impl vm::Vm for MshvVm { page_type: u32, page_size: u32, pages: &[u64], + _uaddrs: &[u64], ) -> vm::Result<()> { debug_assert!(page_size == hv_isolated_page_size_HV_ISOLATED_PAGE_SIZE_4KB); if pages.is_empty() { diff --git a/hypervisor/src/vm.rs b/hypervisor/src/vm.rs index 36aae27b08..6d3a4a4ae5 100644 --- a/hypervisor/src/vm.rs +++ b/hypervisor/src/vm.rs @@ -429,6 +429,7 @@ pub trait Vm: Send + Sync + Any { _page_type: u32, _page_size: u32, _pages: &[u64], + _uaddrs: &[u64], ) -> Result<()> { unimplemented!() } diff --git a/vmm/src/igvm/igvm_loader.rs b/vmm/src/igvm/igvm_loader.rs index d7a8f0cae8..e22841c88d 100644 --- a/vmm/src/igvm/igvm_loader.rs +++ b/vmm/src/igvm/igvm_loader.rs @@ -19,6 +19,8 @@ use log::debug; use log::info; use mshv_bindings::*; use thiserror::Error; +#[cfg(feature = "sev_snp")] +use vm_memory::{GuestAddress, GuestAddressSpace, GuestMemory}; use zerocopy::IntoBytes; #[cfg(feature = "sev_snp")] @@ -471,6 +473,16 @@ pub fn load_igvm( .iter() .map(|gpa| gpa.gpa >> 
HV_HYP_PAGE_SHIFT) .collect(); + let guest_memory = memory_manager.lock().unwrap().guest_memory().memory(); + let uaddrs: Vec<_> = group + .iter() + .map(|gpa| { + let guest_region_mmap = guest_memory.to_region_addr(GuestAddress(gpa.gpa)); + let uaddr_base = guest_region_mmap.unwrap().0.as_ptr() as u64; + let uaddr_offset: u64 = guest_region_mmap.unwrap().1.0; + uaddr_base + uaddr_offset + }) + .collect(); memory_manager .lock() .unwrap() @@ -479,6 +491,7 @@ pub fn load_igvm( group[0].page_type, hv_isolated_page_size_HV_ISOLATED_PAGE_SIZE_4KB, &pfns, + &uaddrs, ) .map_err(Error::ImportIsolatedPages)?; } diff --git a/vmm/src/seccomp_filters.rs b/vmm/src/seccomp_filters.rs index 1258959434..748e3b3b60 100644 --- a/vmm/src/seccomp_filters.rs +++ b/vmm/src/seccomp_filters.rs @@ -112,6 +112,7 @@ mod kvm { pub const KVM_GET_NESTED_STATE: u64 = 3229658814; pub const KVM_SET_NESTED_STATE: u64 = 1082175167; pub const KVM_SEV_SNP_LAUNCH_START: u64 = 0x4018_aeb4; + pub const KVM_SEV_SNP_LAUNCH_UPDATE: u64 = 0x8018_aeb5; } mod iommufd { @@ -269,6 +270,7 @@ fn create_vmm_ioctl_seccomp_rule_common_kvm() -> Result, Backen and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_NESTED_STATE)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_NESTED_STATE)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_SEV_SNP_LAUNCH_START)?], + and![Cond::new(1, ArgLen::Dword, Eq, KVM_SEV_SNP_LAUNCH_UPDATE)?], ]) } From 4b2538f522f0babd634b8c68885bb1d094cd33cd Mon Sep 17 00:00:00 2001 From: Ruben Hakobyan Date: Tue, 7 Apr 2026 16:57:42 -0700 Subject: [PATCH 729/742] hypervisor, vmm: Add support for KVM_SEV_SNP_LAUNCH_FINISH Add the KVM_SEV_SNP_LAUNCH_FINISH ioctl, which finalizes the SNP launch sequence and transitions the VM into a runnable encrypted state. Additionally, add KVM_SEV_SNP_LAUNCH_FINISH to the seccomp allowlist. 
Co-authored-by: Keith Adler Signed-off-by: Keith Adler Co-authored-by: Alex Orozco Signed-off-by: Alex Orozco Signed-off-by: Ruben Hakobyan --- hypervisor/src/kvm/mod.rs | 19 +++++++++++++++ hypervisor/src/kvm/x86_64/sev.rs | 42 +++++++++++++++++++++++++++++++- vmm/src/seccomp_filters.rs | 2 ++ 3 files changed, 62 insertions(+), 1 deletion(-) diff --git a/hypervisor/src/kvm/mod.rs b/hypervisor/src/kvm/mod.rs index db18c64169..8cf43de4e5 100644 --- a/hypervisor/src/kvm/mod.rs +++ b/hypervisor/src/kvm/mod.rs @@ -675,6 +675,25 @@ impl vm::Vm for KvmVm { Ok(()) } + #[cfg(all(feature = "sev_snp", target_arch = "x86_64"))] + fn complete_isolated_import( + &self, + snp_id_block: igvm_defs::IGVM_VHS_SNP_ID_BLOCK, + host_data: [u8; 32], + id_block_enabled: u8, + ) -> vm::Result<()> { + self.sev_fd + .as_ref() + .unwrap() + .launch_finish( + &self.fd, + host_data, + id_block_enabled, + snp_id_block.author_key_enabled, + ) + .map_err(|e| vm::HypervisorVmError::CompleteIsolatedImport(e.into())) + } + #[cfg(target_arch = "x86_64")] /// /// Sets the address of the one-page region in the VM's address space. 
diff --git a/hypervisor/src/kvm/x86_64/sev.rs b/hypervisor/src/kvm/x86_64/sev.rs index a2aebdd191..6249468fef 100644 --- a/hypervisor/src/kvm/x86_64/sev.rs +++ b/hypervisor/src/kvm/x86_64/sev.rs @@ -11,7 +11,7 @@ use std::path::Path; use igvm_defs::SnpPolicy; use kvm_bindings::kvm_sev_cmd; use kvm_ioctls::VmFd; -use log::{error, info}; +use log::{debug, error, info}; use vmm_sys_util::errno; pub(crate) type Result = std::result::Result; @@ -20,6 +20,7 @@ pub(crate) type Result = std::result::Result; const KVM_SEV_INIT2: u32 = 22; const KVM_SEV_SNP_LAUNCH_START: u32 = 100; const KVM_SEV_SNP_LAUNCH_UPDATE: u32 = 101; +const KVM_SEV_SNP_LAUNCH_FINISH: u32 = 102; // SNP_LAUNCH_UPDATE page types — linux/arch/x86/include/uapi/asm/sev-guest.h pub const SNP_PAGE_TYPE_VMSA: u32 = 2; @@ -80,6 +81,21 @@ pub(crate) struct KvmSevSnpLaunchUpdate { pub pad2: [u64; 4], } +#[repr(C, packed)] +#[derive(Debug, Copy, Clone, Default)] +pub(crate) struct KvmSevSnpLaunchFinish { + pub id_block_uaddr: u64, + pub id_auth_uaddr: u64, + pub id_block_en: u8, + pub auth_key_en: u8, + pub vcek_disabled: u8, + pub host_data: [u8; 32], + pub pad0: [u8; 3], + // must be zero https://elixir.bootlin.com/linux/v6.11/source/arch/x86/kvm/svm/sev.c#L2506 + pub flags: u16, + pub pad1: [u64; 4], +} + impl SevFd { pub(crate) fn new(sev_path: impl AsRef) -> Result { let file = OpenOptions::new() @@ -157,4 +173,28 @@ impl SevFd { }; sev_op(vm, &mut sev_cmd, "KVM_SEV_SNP_LAUNCH_UPDATE") } + + pub(crate) fn launch_finish( + &self, + vm: &VmFd, + host_data: [u8; 32], + id_block_en: u8, + auth_key_en: u8, + ) -> Result<()> { + let mut finish = KvmSevSnpLaunchFinish { + host_data, + id_block_en, + auth_key_en, + ..Default::default() + }; + let mut sev_cmd = kvm_sev_cmd { + id: KVM_SEV_SNP_LAUNCH_FINISH, + data: &mut finish as *mut KvmSevSnpLaunchFinish as _, + sev_fd: self.fd.as_raw_fd() as _, + ..Default::default() + }; + let flags = finish.flags; + debug!("Calling KVM_SEV_SNP_LAUNCH_FINISH, flags: {flags}"); + 
sev_op(vm, &mut sev_cmd, "KVM_SEV_SNP_LAUNCH_FINISH") + } } diff --git a/vmm/src/seccomp_filters.rs b/vmm/src/seccomp_filters.rs index 748e3b3b60..bc17cde4e8 100644 --- a/vmm/src/seccomp_filters.rs +++ b/vmm/src/seccomp_filters.rs @@ -113,6 +113,7 @@ mod kvm { pub const KVM_SET_NESTED_STATE: u64 = 1082175167; pub const KVM_SEV_SNP_LAUNCH_START: u64 = 0x4018_aeb4; pub const KVM_SEV_SNP_LAUNCH_UPDATE: u64 = 0x8018_aeb5; + pub const KVM_SEV_SNP_LAUNCH_FINISH: u64 = 0x4008_aeb7; } mod iommufd { @@ -271,6 +272,7 @@ fn create_vmm_ioctl_seccomp_rule_common_kvm() -> Result, Backen and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_NESTED_STATE)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_SEV_SNP_LAUNCH_START)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_SEV_SNP_LAUNCH_UPDATE)?], + and![Cond::new(1, ArgLen::Dword, Eq, KVM_SEV_SNP_LAUNCH_FINISH)?], ]) } From 4a0cfa02de9d90d0f4024f718a96cc79cc5fe190 Mon Sep 17 00:00:00 2001 From: Ruben Hakobyan Date: Tue, 7 Apr 2026 17:04:58 -0700 Subject: [PATCH 730/742] hypervisor: Handle KVM_HC_MAP_GPA_RANGE hypercalls SEV-SNP guests will issue this hypercall to signal a change in the page encryption status to the hypervisor. Handle VcpuExit::Hypercall in the KVM vCPU run loop: decode the GPA, page count, and private/shared attribute from the hypercall arguments, then call KVM_SET_MEMORY_ATTRIBUTES to update the page state. 
Co-authored-by: Keith Adler Signed-off-by: Keith Adler Co-authored-by: Alex Orozco Signed-off-by: Alex Orozco Signed-off-by: Ruben Hakobyan --- hypervisor/src/kvm/mod.rs | 51 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/hypervisor/src/kvm/mod.rs b/hypervisor/src/kvm/mod.rs index 8cf43de4e5..27a5358728 100644 --- a/hypervisor/src/kvm/mod.rs +++ b/hypervisor/src/kvm/mod.rs @@ -32,6 +32,8 @@ use anyhow::anyhow; #[cfg(feature = "sev_snp")] use kvm_bindings::kvm_create_guest_memfd; use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd}; +#[cfg(feature = "sev_snp")] +use log::debug; #[cfg(target_arch = "x86_64")] use log::warn; use vmm_sys_util::errno; @@ -140,6 +142,8 @@ use crate::kvm::x86_64::XsaveStateError; #[cfg(target_arch = "x86_64")] ioctl_io_nr!(KVM_NMI, kvm_bindings::KVMIO, 0x9a); +#[cfg(feature = "sev_snp")] +use igvm_defs::PAGE_SIZE_4K; #[cfg(feature = "sev_snp")] use kvm_bindings::{KVM_MEMORY_ATTRIBUTE_PRIVATE, KVM_X86_SNP_VM, kvm_memory_attributes}; #[cfg(feature = "sev_snp")] @@ -774,6 +778,8 @@ impl vm::Vm for KvmVm { hyperv_synic: AtomicBool::new(false), #[cfg(target_arch = "x86_64")] xsave_size, + #[cfg(feature = "sev_snp")] + vm_fd: self.fd.clone(), }; Ok(Box::new(vcpu)) } @@ -1643,6 +1649,8 @@ pub struct KvmVcpu { hyperv_synic: AtomicBool, #[cfg(target_arch = "x86_64")] xsave_size: i32, + #[cfg(feature = "sev_snp")] + vm_fd: Arc, } /// Implementation of Vcpu trait for KVM @@ -2302,6 +2310,49 @@ impl cpu::Vcpu for KvmVcpu { #[cfg(feature = "tdx")] VcpuExit::Unsupported(KVM_EXIT_TDX) => Ok(cpu::VmExit::Tdx), VcpuExit::Debug(_) => Ok(cpu::VmExit::Debug), + #[cfg(feature = "sev_snp")] + VcpuExit::Hypercall(hypercall) => { + // https://docs.kernel.org/virt/kvm/x86/hypercalls.html#kvm-hc-map-gpa-range + const KVM_HC_MAP_GPA_RANGE: u64 = 12; + // 4th bit of attributes argument is encrypted page bit + match hypercall.nr { + KVM_HC_MAP_GPA_RANGE => { + // guest physical address of start page + let address = hypercall.args[0]; + 
// num pages to map from start address + let num_pages = hypercall.args[1]; + // bits[0-3] = page size encoding + // bits[4] = 1 if private, 0 if shared + // bits[5-63] = zero + let attributes = hypercall.args[2]; + // TODO: Add 2mb page support + let size = num_pages * PAGE_SIZE_4K; + // bit 4 = private attribute encoding + const PRIVATE_ENCODING_BITMASK: u64 = 0b10000; + debug!( + "KVM_HC_MAP_GPA_RANGE: address={address:#x}, pages={num_pages}, attributes={attributes:#x}" + ); + let set_private_attr = if attributes & PRIVATE_ENCODING_BITMASK > 0 { + KVM_MEMORY_ATTRIBUTE_PRIVATE as u64 + } else { + // the only attribute available is private, o/w 0 + // https://docs.kernel.org/virt/kvm/api.html#kvm-set-memory-attributes + 0u64 + }; + let mem_attributes = kvm_memory_attributes { + address, + size, + attributes: set_private_attr, + ..Default::default() + }; + self.vm_fd + .set_memory_attributes(mem_attributes) + .map(|_| cpu::VmExit::Ignore) + .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into())) + } + _ => Ok(cpu::VmExit::Ignore), + } + } r => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!( "Unexpected exit reason on vcpu run: {r:?}" From b5ddcdc74ad286c0a88a00ac9637f3e19c460d17 Mon Sep 17 00:00:00 2001 From: Ruben Hakobyan Date: Tue, 7 Apr 2026 17:21:15 -0700 Subject: [PATCH 731/742] hypervisor: handle VcpuExit::MemoryFault for AP boot page conversions During SNP boot all guest RAM is initially marked KVM_MEMORY_ATTRIBUTE_PRIVATE. Pages imported via SNP_LAUNCH_UPDATE are properly accepted by the guest, but generic RAM pages (e.g. the AP trampoline at GPA 0xD000) are not. When stage0 on the BSP starts secondary vCPUs via x2APIC, the APs try to execute from the trampoline page through the shared mapping while KVM still has it marked private, causing a KVM_EXIT_MEMORY_FAULT (flags=KVM_MEMORY_EXIT_FLAG_PRIVATE) that previously fell through to the catch-all error, killing the VM. 
Handle VcpuExit::MemoryFault by toggling the page's memory attribute between private and shared based on the exit flags, allowing the vCPU to retry the access. Signed-off-by: Ruben Hakobyan --- hypervisor/src/kvm/mod.rs | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/hypervisor/src/kvm/mod.rs b/hypervisor/src/kvm/mod.rs index 27a5358728..8b2e296ef0 100644 --- a/hypervisor/src/kvm/mod.rs +++ b/hypervisor/src/kvm/mod.rs @@ -2354,6 +2354,38 @@ impl cpu::Vcpu for KvmVcpu { } } + #[cfg(feature = "sev_snp")] + VcpuExit::MemoryFault { flags, gpa, size } => { + debug!("VcpuExit::MemoryFault: flags={flags:#x}, gpa={gpa:#x}, size={size:#x}"); + + const KVM_MEMORY_EXIT_FLAG_PRIVATE: u64 = + kvm_bindings::KVM_MEMORY_EXIT_FLAG_PRIVATE as u64; + + if flags & !KVM_MEMORY_EXIT_FLAG_PRIVATE != 0 { + return Err(cpu::HypervisorCpuError::RunVcpu(anyhow!( + "VcpuExit::MemoryFault: unknown flags {flags:#x}" + ))); + } + + let attributes = if flags & KVM_MEMORY_EXIT_FLAG_PRIVATE != 0 { + KVM_MEMORY_ATTRIBUTE_PRIVATE as u64 + } else { + // the only attribute available is private, o/w 0 + // https://docs.kernel.org/virt/kvm/api.html#kvm-set-memory-attributes + 0u64 + }; + + self.vm_fd + .set_memory_attributes(kvm_memory_attributes { + address: gpa, + size, + attributes, + flags: 0, + }) + .map(|_| cpu::VmExit::Ignore) + .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into())) + } + r => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!( "Unexpected exit reason on vcpu run: {r:?}" ))), From 75ed2c9f903f44356355e6b94d2d3b0784ffc478 Mon Sep 17 00:00:00 2001 From: Ruben Hakobyan Date: Tue, 7 Apr 2026 17:30:24 -0700 Subject: [PATCH 732/742] vmm: add KVM SEV-SNP support to IGVM loader Adapt the IGVM loader to work with both MSHV and KVM backends, which differ in page type constants, CPUID page layout, and VMSA handling. Abstract page types into a PageTypeConfig struct populated at runtime from the detected hypervisor, replacing hardcoded mshv_bindings constants. 
Apply the VMSA register state to each vCPU via setup_sev_snp_regs(), translating SevSelector attributes to KVM segment format using a bitfield decoder. KVM's SNP launch path sanitizes certain CPUID bits that could lead to an insecure guest. If the VMM sets these bits, KVM rejects the CPUID page import on the first attempt, requiring a retry with the firmware-corrected values. Pre-clear the known problematic bits before import to avoid the reject-and-retry cycle: - Leaf 0x1, ECX bit 24: TSC_DEADLINE (filtered by KVM) - Leaf 0x7, EBX bit 1: SGX (filtered by KVM) - Leaf 0x7, EDX: clear entirely (contains speculative features) - Leaf 0x80000008, EBX bit 25: filtered by KVM - Leaf 0x80000021, ECX: clear entirely This keeps the CPUID page stable across launch updates and avoids noisy error logs from the retry path. Co-authored-by: Keith Adler Signed-off-by: Keith Adler Co-authored-by: Alex Orozco Signed-off-by: Alex Orozco Co-authored-by: Dylan Reid Signed-off-by: Dylan Reid Signed-off-by: Ruben Hakobyan --- hypervisor/src/cpu.rs | 4 + hypervisor/src/kvm/mod.rs | 128 ++++++++++++++- vmm/src/config.rs | 28 +++- vmm/src/cpu.rs | 17 +- vmm/src/igvm/igvm_loader.rs | 318 ++++++++++++++++++++++++++++++------ 5 files changed, 435 insertions(+), 60 deletions(-) diff --git a/hypervisor/src/cpu.rs b/hypervisor/src/cpu.rs index a4a029e989..044c81a2e8 100644 --- a/hypervisor/src/cpu.rs +++ b/hypervisor/src/cpu.rs @@ -591,6 +591,10 @@ pub trait Vcpu: Send + Sync { fn set_sev_control_register(&self, _vmsa_pfn: u64) -> Result<()> { unimplemented!() } + #[cfg(feature = "sev_snp")] + fn setup_sev_snp_regs(&self, _vmsa: igvm::snp_defs::SevVmsa) -> Result<()> { + unimplemented!() + } /// /// Sets the value of GIC redistributor address diff --git a/hypervisor/src/kvm/mod.rs b/hypervisor/src/kvm/mod.rs index 8b2e296ef0..2f7137d55a 100644 --- a/hypervisor/src/kvm/mod.rs +++ b/hypervisor/src/kvm/mod.rs @@ -145,10 +145,61 @@ ioctl_io_nr!(KVM_NMI, kvm_bindings::KVMIO, 0x9a); #[cfg(feature = 
"sev_snp")] use igvm_defs::PAGE_SIZE_4K; #[cfg(feature = "sev_snp")] -use kvm_bindings::{KVM_MEMORY_ATTRIBUTE_PRIVATE, KVM_X86_SNP_VM, kvm_memory_attributes}; +use kvm_bindings::{ + KVM_MEMORY_ATTRIBUTE_PRIVATE, KVM_X86_SNP_VM, kvm_memory_attributes, kvm_segment as Segment, +}; +use vm_memory::GuestAddress; #[cfg(feature = "sev_snp")] use x86_64::sev; +// Hardcoded GPA of a bootloader and VMSA page for KVM +// TODO: Derive these from the IGVM file's PageData/SnpVpContext directives +// instead of using fixed constants, to support arbitrary bootloader layouts. +pub const BOOTLOADER_START: GuestAddress = GuestAddress(0xffc0_0000); +pub const BOOTLOADER_SIZE: usize = 0x40_0000; // 4 MiB +pub const KVM_VMSA_PAGE_ADDRESS: GuestAddress = GuestAddress(0xffff_ffff_f000); +pub const KVM_VMSA_PAGE_SIZE: usize = 0x1000; // 4 KiB + +#[cfg(feature = "sev_snp")] +#[bitfield_struct::bitfield(u32)] +#[derive(PartialEq, Eq)] +/// AMD VMCB segment attributes +/// linux/arch/x86/include/asm/svm.h +pub struct SegAccess { + #[bits(4)] + pub seg_type: u8, + pub s_code_data: bool, + #[bits(2)] + pub priv_level: u8, + pub present: bool, + pub available: bool, + pub l_64bit: bool, + pub db_size_32: bool, + pub granularity: bool, + #[bits(20)] + _reserved: u32, +} + +#[cfg(feature = "sev_snp")] +fn make_segment(sev_selector: igvm::snp_defs::SevSelector) -> Segment { + let flags = SegAccess::from_bits(sev_selector.attrib.into()); + Segment { + base: sev_selector.base, + limit: sev_selector.limit, + selector: sev_selector.selector, + type_: flags.seg_type(), + s: flags.s_code_data() as u8, + dpl: flags.priv_level(), + present: flags.present() as u8, + avl: flags.available() as u8, + db: flags.db_size_32() as u8, + g: flags.granularity() as u8, + l: flags.l_64bit() as u8, + unusable: 0, + ..Default::default() + } +} + #[cfg(feature = "tdx")] const KVM_EXIT_TDX: u32 = 50; #[cfg(feature = "tdx")] @@ -3238,6 +3289,81 @@ impl cpu::Vcpu for KvmVcpu { Ok(_) => Ok(()), } } + + #[cfg(feature = 
"sev_snp")] + fn set_sev_control_register(&self, _vmsa_pfn: u64) -> cpu::Result<()> { + Ok(()) + } + + #[cfg(feature = "sev_snp")] + fn setup_sev_snp_regs(&self, vmsa: igvm::snp_defs::SevVmsa) -> cpu::Result<()> { + let mut sregs = self + .fd + .get_sregs() + .map_err(|e: kvm_ioctls::Error| cpu::HypervisorCpuError::GetSpecialRegs(e.into()))?; + sregs.cs = make_segment(vmsa.cs); + sregs.ds = make_segment(vmsa.ds); + sregs.es = make_segment(vmsa.es); + sregs.fs = make_segment(vmsa.fs); + sregs.gs = make_segment(vmsa.gs); + sregs.ss = make_segment(vmsa.ss); + sregs.tr = make_segment(vmsa.tr); + sregs.ldt = make_segment(vmsa.ldtr); + + sregs.cr0 = vmsa.cr0; + sregs.cr4 = vmsa.cr4; + sregs.cr3 = vmsa.cr3; + sregs.efer = vmsa.efer; + + sregs.idt.base = vmsa.idtr.base; + sregs.idt.limit = vmsa + .idtr + .limit + .try_into() + .map_err(|e: std::num::TryFromIntError| { + cpu::HypervisorCpuError::SetSpecialRegs(anyhow!(e)) + })?; + sregs.gdt.base = vmsa.gdtr.base; + sregs.gdt.limit = vmsa + .gdtr + .limit + .try_into() + .map_err(|e: std::num::TryFromIntError| { + cpu::HypervisorCpuError::SetSpecialRegs(anyhow!(e)) + })?; + self.fd + .set_sregs(&sregs) + .map_err(|e: kvm_ioctls::Error| cpu::HypervisorCpuError::SetSpecialRegs(e.into()))?; + + let mut regs = self + .fd + .get_regs() + .map_err(|e: kvm_ioctls::Error| cpu::HypervisorCpuError::GetRegister(e.into()))?; + regs.rip = vmsa.rip; + regs.rdx = vmsa.rdx; + regs.rflags = vmsa.rflags; + regs.rsp = vmsa.rsp; + regs.rax = vmsa.rax; + regs.rbx = vmsa.rbx; + regs.rcx = vmsa.rcx; + regs.rbp = vmsa.rbp; + regs.rsi = vmsa.rsi; + regs.rdi = vmsa.rdi; + regs.r8 = vmsa.r8; + regs.r9 = vmsa.r9; + regs.r10 = vmsa.r10; + regs.r11 = vmsa.r11; + regs.r12 = vmsa.r12; + regs.r13 = vmsa.r13; + regs.r14 = vmsa.r14; + regs.r15 = vmsa.r15; + + self.fd + .set_regs(®s) + .map_err(|e: kvm_ioctls::Error| cpu::HypervisorCpuError::SetRegister(e.into()))?; + + Ok(()) + } } impl KvmVcpu { diff --git a/vmm/src/config.rs b/vmm/src/config.rs index 
72f0f6d47f..d023f98a1b 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -352,6 +352,9 @@ pub enum ValidationError { #[cfg(feature = "sev_snp")] #[error("Invalid host data format")] InvalidHostData, + #[cfg(all(feature = "sev_snp", feature = "igvm"))] + #[error("SEV-SNP requires an IGVM payload (--payload igvm=)")] + SevSnpRequiresIgvm, /// Restore expects all net ids that have fds #[error("Net id {0} is associated with FDs and is required")] RestoreMissingRequiredNetId(String), @@ -2823,12 +2826,25 @@ impl VmConfig { #[cfg(feature = "sev_snp")] { - let host_data_opt = &self.payload.as_ref().unwrap().host_data; - - if let Some(host_data) = host_data_opt - && host_data.len() != 64 - { - return Err(ValidationError::InvalidHostData); + let sev_snp_enabled = self.platform.as_ref().is_some_and(|p| p.sev_snp); + if sev_snp_enabled { + let host_data_opt = &self.payload.as_ref().unwrap().host_data; + if let Some(host_data) = host_data_opt + && host_data.len() != 64 + { + return Err(ValidationError::InvalidHostData); + } + // KVM SEV-SNP requires an IGVM payload to initialise the VMSA. + // Without IGVM the vCPU register state is undefined and VM entry fails. 
+ #[cfg(feature = "igvm")] + if self + .payload + .as_ref() + .and_then(|p| p.igvm.as_ref()) + .is_none() + { + return Err(ValidationError::SevSnpRequiresIgvm); + } } } // The 'conflict' check is introduced in commit 24438e0390d3 diff --git a/vmm/src/cpu.rs b/vmm/src/cpu.rs index 261f4c1c75..cb445cda15 100644 --- a/vmm/src/cpu.rs +++ b/vmm/src/cpu.rs @@ -212,6 +212,9 @@ pub enum Error { #[cfg(feature = "sev_snp")] #[error("Failed to set sev control register")] SetSevControlRegister(#[source] hypervisor::HypervisorCpuError), + #[cfg(feature = "sev_snp")] + #[error("Failed to set up SEV-SNP vCPU registers")] + SetupSevSnpRegs(#[source] hypervisor::HypervisorCpuError), #[cfg(target_arch = "x86_64")] #[error("Failed to inject NMI")] @@ -644,6 +647,13 @@ impl Vcpu { .map_err(Error::SetSevControlRegister) } + #[cfg(feature = "sev_snp")] + pub fn setup_sev_snp_regs(&self, vmsa: igvm::snp_defs::SevVmsa) -> Result<()> { + self.vcpu + .setup_sev_snp_regs(vmsa) + .map_err(Error::SetupSevSnpRegs) + } + /// /// Sets the vCPU's GIC redistributor base address. 
/// @@ -2199,7 +2209,7 @@ impl CpuManager { &self.vcpus_kill_signalled } - #[cfg(feature = "igvm")] + #[cfg(all(feature = "igvm", feature = "mshv"))] pub(crate) fn get_cpuid_leaf( &self, cpu_id: u8, @@ -2222,6 +2232,11 @@ impl CpuManager { self.sev_snp_enabled } + #[cfg(feature = "igvm")] + pub(crate) fn hypervisor_type(&self) -> hypervisor::HypervisorType { + self.hypervisor.hypervisor_type() + } + pub(crate) fn nmi(&mut self) -> Result<()> { self.vcpus_kick_signalled.store(true, Ordering::SeqCst); self.signal_vcpus()?; diff --git a/vmm/src/igvm/igvm_loader.rs b/vmm/src/igvm/igvm_loader.rs index e22841c88d..5cdb0d01b4 100644 --- a/vmm/src/igvm/igvm_loader.rs +++ b/vmm/src/igvm/igvm_loader.rs @@ -7,6 +7,7 @@ use std::ffi::CString; use std::mem::size_of; use std::sync::{Arc, Mutex}; +use hypervisor::HypervisorType; use igvm::snp_defs::SevVmsa; use igvm::{IgvmDirectiveHeader, IgvmFile, IgvmPlatformHeader}; #[cfg(feature = "sev_snp")] @@ -15,13 +16,20 @@ use igvm_defs::{ IGVM_VHS_PARAMETER, IGVM_VHS_PARAMETER_INSERT, IgvmPageDataType, IgvmPlatformType, }; use log::debug; +#[cfg(all(feature = "kvm", feature = "sev_snp"))] +use log::error; #[cfg(feature = "sev_snp")] use log::info; +#[cfg(feature = "mshv")] use mshv_bindings::*; use thiserror::Error; #[cfg(feature = "sev_snp")] -use vm_memory::{GuestAddress, GuestAddressSpace, GuestMemory}; +use vm_memory::{Bytes, GuestAddress, GuestAddressSpace, GuestMemory}; +#[cfg(all(feature = "kvm", feature = "sev_snp"))] +use vm_migration::Snapshottable; use zerocopy::IntoBytes; +#[cfg(feature = "sev_snp")] +use zerocopy::{FromBytes, FromZeros}; #[cfg(feature = "sev_snp")] use crate::GuestMemoryMmap; @@ -30,6 +38,36 @@ use crate::igvm::loader::Loader; use crate::igvm::{BootPageAcceptance, HV_PAGE_SIZE, IgvmLoadedInfo, StartupMemoryType}; use crate::memory_manager::{Error as MemoryManagerError, MemoryManager}; +#[cfg(feature = "sev_snp")] +const ISOLATED_PAGE_SHIFT: u32 = 12; +#[cfg(feature = "sev_snp")] +const SNP_CPUID_LIMIT: u32 
= 64; +// see section 7.1 +// https://www.amd.com/content/dam/amd/en/documents/epyc-technical-docs/specifications/56860.pdf +#[cfg(feature = "sev_snp")] +#[repr(C)] +#[derive(Debug, Clone, PartialEq, Eq, IntoBytes, FromBytes)] +pub struct SnpCpuidFunc { + pub eax_in: u32, + pub ecx_in: u32, + pub xcr0_in: u64, + pub xss_in: u64, + pub eax: u32, + pub ebx: u32, + pub ecx: u32, + pub edx: u32, + pub reserved: u64, +} + +#[cfg(feature = "sev_snp")] +#[repr(C)] +#[derive(Debug, Clone, FromBytes, IntoBytes)] +pub struct SnpCpuidInfo { + pub count: u32, + pub _reserved1: u32, + pub _reserved2: u64, + pub entries: [SnpCpuidFunc; SNP_CPUID_LIMIT as usize], +} #[derive(Debug, Error)] pub enum Error { #[error("command line is not a valid C string")] @@ -54,6 +92,30 @@ pub enum Error { MemoryManager(MemoryManagerError), #[error("IGVM file not provided")] MissingIgvm, + #[error("Error applying VMSA to vCPU registers: {0}")] + SetVmsa(#[source] crate::cpu::Error), +} + +// KVM SNP page types — linux/arch/x86/include/uapi/asm/sev-guest.h +#[cfg(feature = "kvm")] +const KVM_SNP_PAGE_TYPE_NORMAL: u32 = 1; +#[cfg(feature = "kvm")] +const KVM_SNP_PAGE_TYPE_VMSA: u32 = 2; +#[cfg(feature = "kvm")] +const KVM_SNP_PAGE_TYPE_UNMEASURED: u32 = 4; +#[cfg(feature = "kvm")] +const KVM_SNP_PAGE_TYPE_SECRETS: u32 = 5; +#[cfg(feature = "kvm")] +const KVM_SNP_PAGE_TYPE_CPUID: u32 = 6; + +// Consolidated page type/size configuration per hypervisor. +struct PageTypeConfig { + isolated_page_size_4kb: u32, + normal: u32, + unmeasured: u32, + cpuid: u32, + secrets: u32, + vmsa: u32, } #[allow(dead_code)] @@ -151,6 +213,10 @@ pub fn extract_sev_features(igvm_file: &IgvmFile) -> u64 { /// Right now it only supports SNP based isolation. /// We can boot legacy VM with an igvm file without /// any isolation. +/// +/// NOTE: KVM and MSHV have different page type values and CPUID/VMSA handling. +/// Hypervisor-specific code paths are gated by runtime type checks. 
A future +/// refactor could split these into separate KVM/MSHV loader implementations. #[allow(clippy::needless_pass_by_value)] pub fn load_igvm( igvm_file: IgvmFile, @@ -159,6 +225,28 @@ pub fn load_igvm( cmdline: &str, #[cfg(feature = "sev_snp")] host_data: &Option, ) -> Result, Error> { + let hypervisor_type = cpu_manager.lock().unwrap().hypervisor_type(); + let page_types = match hypervisor_type { + #[cfg(feature = "mshv")] + HypervisorType::Mshv => PageTypeConfig { + isolated_page_size_4kb: mshv_bindings::hv_isolated_page_size_HV_ISOLATED_PAGE_SIZE_4KB, + normal: mshv_bindings::hv_isolated_page_type_HV_ISOLATED_PAGE_TYPE_NORMAL, + unmeasured: mshv_bindings::hv_isolated_page_type_HV_ISOLATED_PAGE_TYPE_UNMEASURED, + cpuid: mshv_bindings::hv_isolated_page_type_HV_ISOLATED_PAGE_TYPE_NORMAL, + secrets: mshv_bindings::hv_isolated_page_type_HV_ISOLATED_PAGE_TYPE_UNMEASURED, + vmsa: mshv_bindings::hv_isolated_page_type_HV_ISOLATED_PAGE_TYPE_VMSA, + }, + #[cfg(feature = "kvm")] + HypervisorType::Kvm => PageTypeConfig { + isolated_page_size_4kb: HV_PAGE_SIZE as u32, + normal: KVM_SNP_PAGE_TYPE_NORMAL, + unmeasured: KVM_SNP_PAGE_TYPE_UNMEASURED, + cpuid: KVM_SNP_PAGE_TYPE_CPUID, + secrets: KVM_SNP_PAGE_TYPE_SECRETS, + vmsa: KVM_SNP_PAGE_TYPE_VMSA, + }, + }; + let mut loaded_info: Box = Box::default(); let command_line = CString::new(cmdline).map_err(Error::InvalidCommandLine)?; let memory = memory_manager.lock().as_ref().unwrap().guest_memory(); @@ -173,6 +261,8 @@ pub fn load_igvm( .map_err(Error::FailedToDecodeHostData)?; } + #[cfg(feature = "sev_snp")] + let sev_snp_enabled = cpu_manager.lock().unwrap().sev_snp_enabled(); let mask = match &igvm_file.platforms()[0] { IgvmPlatformHeader::SupportedPlatform(info) => { debug_assert!(info.platform_type == IgvmPlatformType::SEV_SNP); @@ -205,15 +295,15 @@ pub fn load_igvm( if flags.unmeasured() { gpas.push(GpaPages { gpa: *gpa, - page_type: hv_isolated_page_type_HV_ISOLATED_PAGE_TYPE_UNMEASURED, - page_size: 
hv_isolated_page_size_HV_ISOLATED_PAGE_SIZE_4KB, + page_type: page_types.unmeasured, + page_size: page_types.isolated_page_size_4kb, }); BootPageAcceptance::ExclusiveUnmeasured } else { gpas.push(GpaPages { gpa: *gpa, - page_type: hv_isolated_page_type_HV_ISOLATED_PAGE_TYPE_NORMAL, - page_size: hv_isolated_page_size_HV_ISOLATED_PAGE_SIZE_4KB, + page_type: page_types.normal, + page_size: page_types.isolated_page_size_4kb, }); BootPageAcceptance::Exclusive } @@ -221,43 +311,46 @@ pub fn load_igvm( IgvmPageDataType::SECRETS => { gpas.push(GpaPages { gpa: *gpa, - page_type: hv_isolated_page_type_HV_ISOLATED_PAGE_TYPE_SECRETS, - page_size: hv_isolated_page_size_HV_ISOLATED_PAGE_SIZE_4KB, + page_type: page_types.secrets, + page_size: page_types.isolated_page_size_4kb, }); BootPageAcceptance::SecretsPage } IgvmPageDataType::CPUID_DATA => { - // SAFETY: CPUID is readonly - unsafe { - let cpuid_page_p: *mut hv_psp_cpuid_page = - data.as_ptr() as *mut hv_psp_cpuid_page; // as *mut hv_psp_cpuid_page; - let cpuid_page: &mut hv_psp_cpuid_page = &mut *cpuid_page_p; - for i in 0..cpuid_page.count { - let leaf = cpuid_page.cpuid_leaf_info[i as usize]; - let mut in_leaf = cpu_manager - .lock() - .unwrap() - .get_cpuid_leaf( - 0, - leaf.eax_in, - leaf.ecx_in, - leaf.xfem_in, - leaf.xss_in, - ) - .unwrap(); - if leaf.eax_in == 1 { - in_leaf[2] &= 0x7FFFFFFF; + #[cfg(feature = "mshv")] + if hypervisor_type == HypervisorType::Mshv { + // SAFETY: CPUID is readonly + unsafe { + let cpuid_page_p: *mut hv_psp_cpuid_page = + data.as_ptr() as *mut hv_psp_cpuid_page; // as *mut hv_psp_cpuid_page; + let cpuid_page: &mut hv_psp_cpuid_page = &mut *cpuid_page_p; + for i in 0..cpuid_page.count { + let leaf = cpuid_page.cpuid_leaf_info[i as usize]; + let mut in_leaf = cpu_manager + .lock() + .unwrap() + .get_cpuid_leaf( + 0, + leaf.eax_in, + leaf.ecx_in, + leaf.xfem_in, + leaf.xss_in, + ) + .unwrap(); + if leaf.eax_in == 1 { + in_leaf[2] &= 0x7FFFFFFF; + } + cpuid_page.cpuid_leaf_info[i as 
usize].eax_out = in_leaf[0]; + cpuid_page.cpuid_leaf_info[i as usize].ebx_out = in_leaf[1]; + cpuid_page.cpuid_leaf_info[i as usize].ecx_out = in_leaf[2]; + cpuid_page.cpuid_leaf_info[i as usize].edx_out = in_leaf[3]; } - cpuid_page.cpuid_leaf_info[i as usize].eax_out = in_leaf[0]; - cpuid_page.cpuid_leaf_info[i as usize].ebx_out = in_leaf[1]; - cpuid_page.cpuid_leaf_info[i as usize].ecx_out = in_leaf[2]; - cpuid_page.cpuid_leaf_info[i as usize].edx_out = in_leaf[3]; } } gpas.push(GpaPages { gpa: *gpa, - page_type: hv_isolated_page_type_HV_ISOLATED_PAGE_TYPE_CPUID, - page_size: hv_isolated_page_size_HV_ISOLATED_PAGE_SIZE_4KB, + page_type: page_types.cpuid, + page_size: page_types.isolated_page_size_4kb, }); BootPageAcceptance::CpuidPage } @@ -265,9 +358,69 @@ pub fn load_igvm( _ => todo!("unsupported IgvmPageDataType"), }; - loader - .import_pages(gpa / HV_PAGE_SIZE, 1, acceptance, data) - .map_err(Error::Loader)?; + #[allow(unused_mut)] + let mut imported_page = false; + #[cfg(all(feature = "kvm", feature = "sev_snp"))] + if hypervisor_type == HypervisorType::Kvm + && *data_type == IgvmPageDataType::CPUID_DATA + { + let mut new_cp = SnpCpuidInfo::new_zeroed(); + + let entries = cpu_manager.lock().unwrap().common_cpuid(); + let cp_count = std::cmp::min(SNP_CPUID_LIMIT as usize, entries.len()); + // TODO: Filter cpuid rather than truncate + for (i, entry) in entries.iter().enumerate().take(cp_count) { + new_cp.entries[i].eax_in = entry.function; + new_cp.entries[i].ecx_in = entry.index; + new_cp.entries[i].eax = entry.eax; + new_cp.entries[i].ebx = entry.ebx; + new_cp.entries[i].ecx = entry.ecx; + new_cp.entries[i].edx = entry.edx; + /* + * Guest kernels will calculate EBX themselves using the 0xD + * subfunctions corresponding to the individual XSAVE areas, so only + * encode the base XSAVE size in the initial leaves, corresponding + * to the initial XCR0=1 state. 
(https://tinyurl.com/qemu-cpuid) + */ + if new_cp.entries[i].eax_in == 0xd + && (new_cp.entries[i].ecx_in == 0x0 || new_cp.entries[i].ecx_in == 0x1) + { + new_cp.entries[i].ebx = 0x240; + new_cp.entries[i].xcr0_in = 1; + new_cp.entries[i].xss_in = 0; + } + + // KVM SNP launch may reject a CPUID page with bits it intends + // to sanitize internally. Pre-clearing the known unsafe bits keeps + // the CPUID page stable across launch updates. + match (new_cp.entries[i].eax_in, new_cp.entries[i].ecx_in) { + (0x1, 0x0) => { + new_cp.entries[i].ecx &= !(1 << 24); + } + (0x7, 0x0) => { + new_cp.entries[i].ebx &= !0x2; + new_cp.entries[i].edx = 0; + } + (0x80000008, 0x0) => { + new_cp.entries[i].ebx &= !0x0200_0000; + } + (0x80000021, 0x0) => { + new_cp.entries[i].ecx = 0; + } + _ => {} + } + } + new_cp.count = cp_count as u32; + loader + .import_pages(gpa / HV_PAGE_SIZE, 1, acceptance, new_cp.as_mut_bytes()) + .map_err(Error::Loader)?; + imported_page = true; + } + if !imported_page { + loader + .import_pages(gpa / HV_PAGE_SIZE, 1, acceptance, data) + .map_err(Error::Loader)?; + } } IgvmDirectiveHeader::ParameterArea { number_of_bytes, @@ -299,16 +452,16 @@ pub fn load_igvm( IgvmDirectiveHeader::MmioRanges(_info) => { todo!("unsupported IgvmPageDataType"); } - IgvmDirectiveHeader::MemoryMap(_info) => { + IgvmDirectiveHeader::MemoryMap(_info) => + { #[cfg(feature = "sev_snp")] - { + if sev_snp_enabled { let guest_mem = memory_manager.lock().unwrap().boot_guest_memory(); let memory_map = generate_memory_map(&guest_mem)?; import_parameter(&mut parameter_areas, _info, memory_map.as_bytes())?; + } else { + todo!("Not implemented"); } - - #[cfg(not(feature = "sev_snp"))] - todo!("Not implemented"); } IgvmDirectiveHeader::CommandLine(info) => { import_parameter(&mut parameter_areas, info, command_line.as_bytes_with_nul())?; @@ -336,7 +489,7 @@ pub fn load_igvm( vmsa, } => { assert_eq!(gpa % HV_PAGE_SIZE, 0); - let mut data: [u8; 4096] = [0; 4096]; + let mut data: [u8; HV_PAGE_SIZE 
as usize] = [0; HV_PAGE_SIZE as usize]; let len = size_of::(); loaded_info.vmsa_gpa = *gpa; loaded_info.vmsa = **vmsa; @@ -348,10 +501,28 @@ pub fn load_igvm( .map_err(Error::Loader)?; } + // Set vCPU initial register state from VMSA before SNP_LAUNCH_FINISH + #[cfg(all(feature = "kvm", feature = "sev_snp"))] + if hypervisor_type == HypervisorType::Kvm { + let vcpus = cpu_manager.lock().unwrap().vcpus(); + for vcpu in vcpus { + let vcpu_locked = vcpu.lock().unwrap(); + let vcpu_id: u16 = vcpu_locked.id().parse().unwrap(); + if vcpu_id == *vp_index { + vcpu_locked + .setup_sev_snp_regs(loaded_info.vmsa) + .map_err(Error::SetVmsa)?; + vcpu_locked + .set_sev_control_register(0) + .map_err(Error::SetVmsa)?; + } + } + } + gpas.push(GpaPages { gpa: *gpa, - page_type: hv_isolated_page_type_HV_ISOLATED_PAGE_TYPE_VMSA, - page_size: hv_isolated_page_size_HV_ISOLATED_PAGE_SIZE_4KB, + page_type: page_types.vmsa, + page_size: page_types.isolated_page_size_4kb, }); } IgvmDirectiveHeader::SnpIdBlock { @@ -419,8 +590,8 @@ pub fn load_igvm( *area = ParameterAreaState::Inserted; gpas.push(GpaPages { gpa: *gpa, - page_type: hv_isolated_page_type_HV_ISOLATED_PAGE_TYPE_UNMEASURED, - page_size: hv_isolated_page_size_HV_ISOLATED_PAGE_SIZE_4KB, + page_type: page_types.unmeasured, + page_size: page_types.isolated_page_size_4kb, }); } IgvmDirectiveHeader::ErrorRange { .. 
} => { @@ -433,7 +604,7 @@ pub fn load_igvm( } #[cfg(feature = "sev_snp")] - { + if sev_snp_enabled { memory_manager .lock() .unwrap() @@ -471,7 +642,7 @@ pub fn load_igvm( // of PFN for importing the isolated pages let pfns: Vec = group .iter() - .map(|gpa| gpa.gpa >> HV_HYP_PAGE_SHIFT) + .map(|gpa| gpa.gpa >> ISOLATED_PAGE_SHIFT) .collect(); let guest_memory = memory_manager.lock().unwrap().guest_memory().memory(); let uaddrs: Vec<_> = group @@ -483,17 +654,50 @@ pub fn load_igvm( uaddr_base + uaddr_offset }) .collect(); - memory_manager + #[cfg(feature = "kvm")] + let page_type = group[0].page_type; + let mut new_cp = SnpCpuidInfo::new_zeroed(); + let _ = guest_memory.read(new_cp.as_mut_bytes(), GuestAddress(group[0].gpa)); + let import_result = memory_manager .lock() .unwrap() .vm .import_isolated_pages( group[0].page_type, - hv_isolated_page_size_HV_ISOLATED_PAGE_SIZE_4KB, + page_types.isolated_page_size_4kb, &pfns, &uaddrs, ) - .map_err(Error::ImportIsolatedPages)?; + .map_err(Error::ImportIsolatedPages); + #[cfg(feature = "kvm")] + if hypervisor_type == HypervisorType::Kvm + && import_result.is_err() + && page_type == page_types.cpuid + { + // When we import the CPUID page, the firmware will change any cpuid fns that + // could lead to an insecure guest, we must then make sure to import the updated cpuid + // https://elixir.bootlin.com/linux/v6.11/source/arch/x86/kvm/svm/sev.c#L2322 + let mut updated_cp = SnpCpuidInfo::new_zeroed(); + let _ = guest_memory.read(updated_cp.as_mut_bytes(), GuestAddress(group[0].gpa)); + for (set, got) in std::iter::zip(new_cp.entries.iter(), updated_cp.entries.iter()) { + if set != got { + error!("Set cpuid fn: {set:#x?}, but firmware expects: {got:#x?}"); + } + } + memory_manager + .lock() + .unwrap() + .vm + .import_isolated_pages( + group[0].page_type, + page_types.isolated_page_size_4kb, + &pfns, + &uaddrs, + ) + .map_err(Error::ImportIsolatedPages)?; + continue; + } + import_result?; } info!( @@ -502,13 +706,23 @@ pub fn 
load_igvm( gpas.len() ); + let id_block_enabled = if hypervisor_type == HypervisorType::Mshv { + 1 + } else { + 0 + }; + now = Instant::now(); // Call Complete Isolated Import since we are done importing isolated pages memory_manager .lock() .unwrap() .vm - .complete_isolated_import(loaded_info.snp_id_block, host_data_contents, 1) + .complete_isolated_import( + loaded_info.snp_id_block, + host_data_contents, + id_block_enabled, + ) .map_err(Error::CompleteIsolatedImport)?; info!( From 883ca3feb2965c5fc9742498d826fddd673a1157 Mon Sep 17 00:00:00 2001 From: Ruben Hakobyan Date: Tue, 7 Apr 2026 18:14:43 -0700 Subject: [PATCH 733/742] vmm: reserve memory regions for stage0 and VMSA on KVM SEV-SNP A bootloader/firmware (e.g. stage0) and the VMSA page require dedicated memory regions at fixed GPAs. Add reserve_bootloader_regions() to allocate these regions before IGVM loading begins: - Stage0 at GPA 0xffc0_0000 (4 MB) - VMSA page at GPA 0xffff_ffff_f000 (4 KB) These reservations are KVM-only; MSHV handles stage0/VMSA placement through its own isolated import path. Also add fw_cfg device creation at boot, and add SYS_statx to the vCPU seccomp allowlist (needed by stage0's file access pattern). 
Co-authored-by: Keith Adler Signed-off-by: Keith Adler Co-authored-by: Alex Orozco Signed-off-by: Alex Orozco Signed-off-by: Ruben Hakobyan --- vmm/src/seccomp_filters.rs | 1 + vmm/src/vm.rs | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/vmm/src/seccomp_filters.rs b/vmm/src/seccomp_filters.rs index bc17cde4e8..877ea907d2 100644 --- a/vmm/src/seccomp_filters.rs +++ b/vmm/src/seccomp_filters.rs @@ -917,6 +917,7 @@ fn vcpu_thread_rules( (libc::SYS_sendto, vec![]), (libc::SYS_shutdown, vec![]), (libc::SYS_sigaltstack, vec![]), + (libc::SYS_statx, vec![]), (libc::SYS_tgkill, vec![]), (libc::SYS_tkill, vec![]), #[cfg(target_arch = "x86_64")] diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index 74ca6a7ed0..6006896691 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -46,6 +46,10 @@ use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs; use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs; #[cfg(target_arch = "aarch64")] use hypervisor::arch::aarch64::regs::AARCH64_PMU_IRQ; +#[cfg(all(feature = "kvm", feature = "sev_snp"))] +use hypervisor::kvm::{ + BOOTLOADER_SIZE, BOOTLOADER_START, KVM_VMSA_PAGE_ADDRESS, KVM_VMSA_PAGE_SIZE, +}; use hypervisor::{HypervisorVmConfig, HypervisorVmError, VmOps}; #[cfg(feature = "igvm")] use igvm::IgvmFile; @@ -1040,6 +1044,9 @@ impl Vm { ) .map_err(Error::DeviceManager)?; + #[cfg(feature = "fw_cfg")] + Self::create_fw_cfg_if_enabled(config, device_manager)?; + Ok(load_payload_handle) } @@ -1510,6 +1517,16 @@ impl Vm { Ok(EntryPoint { entry_addr }) } + #[cfg(all(feature = "kvm", feature = "sev_snp"))] + fn reserve_bootloader_regions(memory_manager: &Arc>) -> Result<()> { + let mut mm = memory_manager.lock().unwrap(); + mm.add_ram_region(BOOTLOADER_START, BOOTLOADER_SIZE) + .map_err(Error::MemoryManager)?; + mm.add_ram_region(KVM_VMSA_PAGE_ADDRESS, KVM_VMSA_PAGE_SIZE) + .map_err(Error::MemoryManager)?; + Ok(()) + } + #[cfg(feature = "igvm")] #[allow(clippy::needless_pass_by_value)] fn load_igvm( @@ -1518,6 
+1535,13 @@ impl Vm { cpu_manager: Arc>, #[cfg(feature = "sev_snp")] host_data: &Option, ) -> Result { + // Only reserve bootloader/VMSA regions for KVM + SEV-SNP; other hypervisors + // (e.g. MSHV) handle this through their own import path. + #[cfg(all(feature = "kvm", feature = "sev_snp"))] + if cpu_manager.lock().unwrap().sev_snp_enabled() { + Self::reserve_bootloader_regions(&memory_manager)?; + } + let res = igvm_loader::load_igvm( igvm_file, memory_manager, From d48d1dcd3d228a6650a52e37090ae549324420e3 Mon Sep 17 00:00:00 2001 From: Dylan Reid Date: Fri, 3 Apr 2026 16:10:20 -0700 Subject: [PATCH 734/742] vmm: export full setup-header area for x86_64 kernels The Linux x86 boot protocol defines the setup area as (setup_sects + 1) * 512 bytes. Previously we exported only the boot_params buffer (4096 bytes), which is wrong for kernels with setup_sects >= 8 where the actual setup area exceeds boot_params. Truncate or extend the existing buffer to the correct setup_sects- derived length, reading any extra bytes directly from the kernel file. This avoids an extra allocation in the common case (setup_sects <= 7) and matches QEMU's fw_cfg_add_kernel() behavior in hw/i386/x86-common.c. 
Signed-off-by: Dylan Reid --- devices/src/legacy/fw_cfg.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/devices/src/legacy/fw_cfg.rs b/devices/src/legacy/fw_cfg.rs index f33179d831..6ad86d109b 100644 --- a/devices/src/legacy/fw_cfg.rs +++ b/devices/src/legacy/fw_cfg.rs @@ -649,6 +649,18 @@ impl FwCfg { let kernel_start = bp.text_offset; #[cfg(target_arch = "x86_64")] let kernel_start = (bp.hdr.setup_sects as usize + 1) * 512; + + #[cfg(target_arch = "x86_64")] + if kernel_start <= buffer.len() { + buffer.truncate(kernel_start); + } else { + buffer.resize(kernel_start, 0); + file.read_exact_at( + &mut buffer[size_of::()..], + size_of::() as u64, + )?; + } + self.known_items[FW_CFG_SETUP_SIZE as usize] = FwCfgContent::U32(buffer.len() as u32); self.known_items[FW_CFG_SETUP_DATA as usize] = FwCfgContent::Bytes(buffer); self.known_items[FW_CFG_KERNEL_SIZE as usize] = From d5179a73c20aa5d78092ed39a6eb1aa6a37ab487 Mon Sep 17 00:00:00 2001 From: Dylan Reid Date: Fri, 3 Apr 2026 16:11:22 -0700 Subject: [PATCH 735/742] vmm: use 64-bit BARs for hotplugged virtio block devices Boot-time block devices on PCI segment 0 use 32-bit BARs so early firmware can access them without additional identity mapping in the firmware page tables. However, hot-plugged block devices are only ever seen by the OS kernel which handles 64-bit BARs natively. Switch hot-plugged block devices to 64-bit BARs to avoid exhausting the scarce 32-bit MMIO window (typically 2-3 GB between RAM and 4 GB) when many devices are hot-plugged. Extract the BAR sizing decision into use_64bit_bar_for_virtio_device() and thread an is_hotplug flag through add_virtio_pci_device(). Add unit tests covering all relevant combinations. 
Signed-off-by: Dylan Reid --- vmm/src/device_manager.rs | 56 ++++++++++++++++++++++++++++++++++----- 1 file changed, 49 insertions(+), 7 deletions(-) diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index dc3d827e73..0a263e7c42 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -1200,6 +1200,14 @@ fn create_mmio_allocators( mmio_allocators } +fn use_64bit_bar_for_virtio_device( + device_type: u32, + pci_segment_id: u16, + is_hotplug: bool, +) -> bool { + pci_segment_id > 0 || device_type != VirtioDeviceType::Block as u32 || is_hotplug +} + impl DeviceManager { #[allow(clippy::too_many_arguments)] pub fn new( @@ -1714,6 +1722,7 @@ impl DeviceManager { &mapping, &id, handle.pci_common.pci_segment, + false, handle.dma_handler, handle.pci_common.pci_device_id, )?; @@ -1747,8 +1756,15 @@ impl DeviceManager { } if let Some(iommu_device) = iommu_device { - let dev_id = - self.add_virtio_pci_device(iommu_device, &None, &iommu_id, 0, None, None)?; + let dev_id = self.add_virtio_pci_device( + iommu_device, + &None, + &iommu_id, + 0, + false, + None, + None, + )?; self.iommu_attached_devices = Some((dev_id, iommu_attached_devices)); } } @@ -4307,12 +4323,14 @@ impl DeviceManager { Ok(vec![]) } + #[allow(clippy::too_many_arguments)] fn add_virtio_pci_device( &mut self, virtio_device: Arc>, iommu_mapping: &Option>, virtio_device_id: &str, pci_segment_id: u16, + is_hotplug: bool, dma_handler: Option>, pci_device_id: Option, ) -> DeviceManagerResult { @@ -4412,11 +4430,10 @@ impl DeviceManager { self.activate_evt .try_clone() .map_err(DeviceManagerError::EventFd)?, - // All device types *except* virtio block devices should be allocated a 64-bit bar - // The block devices should be given a 32-bit BAR so that they are easily accessible - // to firmware without requiring excessive identity mapping. - // The exception being if not on the default PCI segment. 
- pci_segment_id > 0 || device_type != VirtioDeviceType::Block as u32, + // Boot-time block devices stay in 32-bit BAR space so early firmware can access + // them without additional identity mapping. Hot-plugged block devices do not have + // that constraint and should use 64-bit BARs like the rest of the virtio devices. + use_64bit_bar_for_virtio_device(device_type, pci_segment_id, is_hotplug), dma_handler, self.pending_activations.clone(), vm_migration::snapshot_from_id(self.snapshot.as_ref(), id.as_str()), @@ -5120,6 +5137,7 @@ impl DeviceManager { &mapping, &id, handle.pci_common.pci_segment, + true, handle.dma_handler, handle.pci_common.pci_device_id, )?; @@ -5868,6 +5886,30 @@ impl Drop for DeviceManager { mod unit_tests { use super::*; + #[test] + fn test_hotplugged_block_devices_use_64bit_bars() { + assert!(!use_64bit_bar_for_virtio_device( + VirtioDeviceType::Block as u32, + 0, + false, + )); + assert!(use_64bit_bar_for_virtio_device( + VirtioDeviceType::Block as u32, + 0, + true, + )); + assert!(use_64bit_bar_for_virtio_device( + VirtioDeviceType::Net as u32, + 0, + false, + )); + assert!(use_64bit_bar_for_virtio_device( + VirtioDeviceType::Block as u32, + 1, + false, + )); + } + #[test] fn test_create_mmio_allocators() { let res = create_mmio_allocators(0x100000, 0x3fffff, 1, &[1], 4 << 10); From ea2df946f63d6662cae3a93ef3299d397a543a10 Mon Sep 17 00:00:00 2001 From: Ruben Hakobyan Date: Fri, 17 Apr 2026 04:12:35 -0700 Subject: [PATCH 736/742] ci: Add CI jobs for KVM SEV-SNP Add build and clippy jobs for kvm+sev_snp+igvm+fw_cfg feature combination. 
Signed-off-by: Keith Adler Signed-off-by: Ruben Hakobyan --- .github/workflows/build.yaml | 3 +++ .github/workflows/quality.yaml | 20 ++++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index e3b1a9e7f7..628c163db8 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -64,6 +64,9 @@ jobs: - name: Build (sev_snp) run: cargo build --locked --bin cloud-hypervisor --no-default-features --features "sev_snp" + - name: Build (kvm + igvm + sev_snp + fw_cfg) + run: cargo build --locked --bin cloud-hypervisor --no-default-features --features "kvm,igvm,sev_snp,fw_cfg" + - name: Build (igvm) run: cargo build --locked --bin cloud-hypervisor --no-default-features --features "igvm" diff --git a/.github/workflows/quality.yaml b/.github/workflows/quality.yaml index 47156beaba..1290b0f872 100644 --- a/.github/workflows/quality.yaml +++ b/.github/workflows/quality.yaml @@ -146,6 +146,26 @@ jobs: target: ${{ matrix.target }} args: --locked --all --all-targets --no-default-features --tests --examples --features "tdx,kvm" -- -D warnings + - name: Clippy (kvm + igvm + sev_snp + fw_cfg) + if: ${{ matrix.target == 'x86_64-unknown-linux-gnu' }} + uses: houseabsolute/actions-rust-cross@v1 + with: + command: clippy + cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 + toolchain: ${{ matrix.rust }} + target: ${{ matrix.target }} + args: --locked --all --all-targets --no-default-features --tests --examples --features "kvm,igvm,sev_snp,fw_cfg" -- -D warnings + + - name: Clippy (default features + sev_snp + igvm + fw_cfg) + if: ${{ matrix.target == 'x86_64-unknown-linux-gnu' }} + uses: houseabsolute/actions-rust-cross@v1 + with: + command: clippy + cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 + toolchain: ${{ matrix.rust }} + target: ${{ matrix.target }} + args: --locked --all --all-targets --tests --examples --features "sev_snp,igvm,fw_cfg" -- -D warnings + - name: Check build 
did not modify any files run: test -z "$(git status --porcelain)" From a10d9a309988331141b13b2ffb5c1f4cc8da6399 Mon Sep 17 00:00:00 2001 From: Anirudh Rayabharam Date: Wed, 8 Apr 2026 17:45:35 +0000 Subject: [PATCH 737/742] tests: Ignore live migration tests on mshv arm64 Live migration is not yet supported on mshv arm64. Annotate the applicable integration tests with cfg_attr to ignore them for that configuration. Signed-off-by: Anirudh Rayabharam --- cloud-hypervisor/tests/integration.rs | 40 +++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 0dc7a8d64d..fb262b00cd 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -10895,41 +10895,73 @@ mod live_migration { use super::*; #[test] + #[cfg_attr( + all(feature = "mshv", target_arch = "aarch64"), + ignore = "live migration not yet supported on mshv arm64" + )] fn test_live_migration_basic() { _test_live_migration(false, false); } #[test] + #[cfg_attr( + all(feature = "mshv", target_arch = "aarch64"), + ignore = "live migration not yet supported on mshv arm64" + )] fn test_live_migration_local() { _test_live_migration(false, true); } #[test] + #[cfg_attr( + all(feature = "mshv", target_arch = "aarch64"), + ignore = "live migration not yet supported on mshv arm64" + )] fn test_live_migration_tcp() { _test_live_migration_tcp(NonZeroU32::new(1).unwrap()); } #[test] + #[cfg_attr( + all(feature = "mshv", target_arch = "aarch64"), + ignore = "live migration not yet supported on mshv arm64" + )] fn test_live_migration_tcp_parallel_connections() { _test_live_migration_tcp(NonZeroU32::new(8).unwrap()); } #[test] + #[cfg_attr( + all(feature = "mshv", target_arch = "aarch64"), + ignore = "live migration not yet supported on mshv arm64" + )] fn test_live_migration_tcp_timeout_cancel() { _test_live_migration_tcp_timeout(TimeoutStrategy::Cancel); } #[test] + #[cfg_attr( + 
all(feature = "mshv", target_arch = "aarch64"), + ignore = "live migration not yet supported on mshv arm64" + )] fn test_live_migration_tcp_timeout_ignore() { _test_live_migration_tcp_timeout(TimeoutStrategy::Ignore); } #[test] + #[cfg_attr( + all(feature = "mshv", target_arch = "aarch64"), + ignore = "live migration not yet supported on mshv arm64" + )] fn test_live_migration_watchdog() { _test_live_migration_watchdog(false, false); } #[test] + #[cfg_attr( + all(feature = "mshv", target_arch = "aarch64"), + ignore = "live migration not yet supported on mshv arm64" + )] fn test_live_migration_watchdog_local() { _test_live_migration_watchdog(false, true); } @@ -10982,11 +11014,19 @@ mod live_migration { } #[test] + #[cfg_attr( + all(feature = "mshv", target_arch = "aarch64"), + ignore = "live migration not yet supported on mshv arm64" + )] fn test_live_migration_balloon() { _test_live_migration_balloon(false, false); } #[test] + #[cfg_attr( + all(feature = "mshv", target_arch = "aarch64"), + ignore = "live migration not yet supported on mshv arm64" + )] fn test_live_migration_balloon_local() { _test_live_migration_balloon(false, true); } From e4e3375a8d7d40ef817030ddcf4c41160dff8c64 Mon Sep 17 00:00:00 2001 From: Keith Adler Date: Tue, 14 Apr 2026 14:47:24 -0500 Subject: [PATCH 738/742] vmm: move fw_cfg validation into PayloadConfig::validate() Move FwCfgMissingKernel/Cmdline/Initramfs error variants from ValidationError into PayloadConfigError. Change FwCfgConfig::validate() to take &PayloadConfig instead of &VmConfig and return PayloadConfigError. Wire the call through PayloadConfig::validate() so both CLI and JSON API paths are covered. 
Signed-off-by: Keith Adler --- vmm/src/config.rs | 21 ++++----------------- vmm/src/vm_config.rs | 17 +++++++++++++++++ 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/vmm/src/config.rs b/vmm/src/config.rs index d023f98a1b..a099897307 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -373,18 +373,6 @@ pub enum ValidationError { /// Invalid block device serial length #[error("Block device serial length ({0}) exceeds maximum allowed length ({1})")] InvalidSerialLength(usize, usize), - #[cfg(feature = "fw_cfg")] - /// FwCfg missing kernel - #[error("Error --fw-cfg-config: missing --kernel")] - FwCfgMissingKernel, - #[cfg(feature = "fw_cfg")] - /// FwCfg missing cmdline - #[error("Error --fw-cfg-config: missing --cmdline")] - FwCfgMissingCmdline, - #[cfg(feature = "fw_cfg")] - /// FwCfg missing initramfs - #[error("Error --fw-cfg-config: missing --initramfs")] - FwCfgMissingInitramfs, #[cfg(feature = "ivshmem")] /// Invalid Ivshmem input size #[error("Invalid ivshmem input size")] @@ -2038,14 +2026,13 @@ impl FwCfgConfig { items, }) } - pub fn validate(&self, vm_config: &VmConfig) -> ValidationResult<()> { - let payload = vm_config.payload.as_ref().unwrap(); + pub fn validate(&self, payload: &PayloadConfig) -> std::result::Result<(), PayloadConfigError> { if self.kernel && payload.kernel.is_none() { - return Err(ValidationError::FwCfgMissingKernel); + return Err(PayloadConfigError::FwCfgMissingKernel); } else if self.cmdline && payload.cmdline.is_none() { - return Err(ValidationError::FwCfgMissingCmdline); + return Err(PayloadConfigError::FwCfgMissingCmdline); } else if self.initramfs && payload.initramfs.is_none() { - return Err(ValidationError::FwCfgMissingInitramfs); + return Err(PayloadConfigError::FwCfgMissingInitramfs); } Ok(()) } diff --git a/vmm/src/vm_config.rs b/vmm/src/vm_config.rs index 01d3bb0101..3753b4b491 100644 --- a/vmm/src/vm_config.rs +++ b/vmm/src/vm_config.rs @@ -747,6 +747,18 @@ pub enum PayloadConfigError { /// Specifying 
a kernel or firmware is not supported when an igvm is provided. #[error("Specifying a kernel or firmware is not supported when an igvm is provided")] IgvmPlusOtherPayloads, + #[cfg(feature = "fw_cfg")] + /// FwCfg missing kernel + #[error("Error --fw-cfg-config: missing --kernel")] + FwCfgMissingKernel, + #[cfg(feature = "fw_cfg")] + /// FwCfg missing cmdline + #[error("Error --fw-cfg-config: missing --cmdline")] + FwCfgMissingCmdline, + #[cfg(feature = "fw_cfg")] + /// FwCfg missing initramfs + #[error("Error --fw-cfg-config: missing --initramfs")] + FwCfgMissingInitramfs, } #[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] @@ -872,6 +884,11 @@ impl PayloadConfig { (None, None) => Err(PayloadConfigError::MissingBootitem), }?; + #[cfg(feature = "fw_cfg")] + if let Some(fw_cfg_config) = &self.fw_cfg_config { + fw_cfg_config.validate(self)?; + } + Ok(()) } } From 926dd1e141124ee86ea03a35bc040e307b6cb830 Mon Sep 17 00:00:00 2001 From: Keith Adler Date: Tue, 14 Apr 2026 14:48:49 -0500 Subject: [PATCH 739/742] vmm, devices: Add fw_cfg string item support QEMU supports passing inline string values to the guest via fw_cfg (-fw_cfg name=...,string=...). Cloud Hypervisor previously only supported file-backed fw_cfg items. This adds the 'string' option so users can pass values like OVMF's X-PciMmio64Mb without creating a temporary file on the host. Each fw_cfg item now accepts exactly one of 'file' or 'string'. The FwCfgInvalidItem invariant is validated in PayloadConfig::validate() (via FwCfgConfig::validate()), covering both CLI and JSON API paths. The populate_fw_cfg match arm uses unreachable!() since validation guarantees the invariant holds at that point. 
CLI syntax: --fw-cfg-config items=[name=opt/ovmf/X-PciMmio64Mb,string=262144] Signed-off-by: Keith Adler --- cloud-hypervisor/tests/integration.rs | 41 +++++++++ devices/src/legacy/fw_cfg.rs | 26 ++++++ docs/fw_cfg.md | 19 ++++- vmm/src/config.rs | 116 ++++++++++++++++++++++---- vmm/src/vm.rs | 21 +++-- vmm/src/vm_config.rs | 10 ++- 6 files changed, 209 insertions(+), 24 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index fb262b00cd..75250bcb20 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -11538,4 +11538,45 @@ mod fw_cfg { handle_child_output(r, &output); } + + #[test] + #[cfg_attr(feature = "mshv", ignore = "See #7434")] + fn test_fw_cfg_string() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)); + let mut cmd = GuestCommand::new(&guest); + + let kernel_path = direct_kernel_boot_path(); + let cmd_line = DIRECT_KERNEL_BOOT_CMDLINE; + + cmd.args(["--cpus", "boot=4"]) + .default_memory() + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args(["--cmdline", cmd_line]) + .default_disks() + .default_net() + .args([ + "--fw-cfg-config", + "initramfs=off,items=[name=opt/org.test/test-string,string=hello-from-vmm]", + ]) + .capture_output(); + + let mut child = cmd.spawn().unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + thread::sleep(std::time::Duration::new(3, 0)); + let result = guest + .ssh_command( + "sudo cat /sys/firmware/qemu_fw_cfg/by_name/opt/org.test/test-string/raw", + ) + .unwrap(); + assert_eq!(result, "hello-from-vmm"); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); + } } diff --git a/devices/src/legacy/fw_cfg.rs b/devices/src/legacy/fw_cfg.rs index 6ad86d109b..876ceaaf3c 100644 --- a/devices/src/legacy/fw_cfg.rs +++ b/devices/src/legacy/fw_cfg.rs @@ -909,6 
+909,32 @@ mod unit_tests { } } + #[test] + fn test_string_item() { + let gm = GuestMemoryAtomic::new( + GuestMemoryMmap::from_ranges(&[(GuestAddress(0), RAM_64BIT_START.0 as usize)]).unwrap(), + ); + + let mut fw_cfg = FwCfg::new(gm); + + // Simulate OVMF X-PciMmio64Mb string item for GPU CC passthrough + let item = FwCfgItem { + name: "opt/ovmf/X-PciMmio64Mb".to_owned(), + content: FwCfgContent::Bytes("262144".as_bytes().to_vec()), + }; + fw_cfg.add_item(item).unwrap(); + + let expected = b"262144"; + let mut data = vec![0u8]; + + // Select the first file item (FW_CFG_FILE_FIRST = 0x20) + fw_cfg.write(0, SELECTOR_OFFSET, &[FW_CFG_FILE_FIRST as u8, 0]); + for &byte in expected.iter() { + fw_cfg.read(0, DATA_OFFSET, &mut data); + assert_eq!(data[0], byte); + } + } + #[test] fn test_dma() { let code = [ diff --git a/docs/fw_cfg.md b/docs/fw_cfg.md index 73f10a7808..76e6951f45 100644 --- a/docs/fw_cfg.md +++ b/docs/fw_cfg.md @@ -39,9 +39,10 @@ The `fw_cfg` device is configured using the `--fw-cfg-config` command-line optio * `cmdline=on|off`: (Default: `on`) Whether to add the kernel command line (specified by `--cmdline`) to `fw_cfg`. * `initramfs=on|off`: (Default: `on`) Whether to add the initramfs image (specified by `--initramfs`) to `fw_cfg`. * `acpi_table=on|off`: (Default: `on`) Whether to add generated ACPI tables to `fw_cfg`. -* `items=[... : ...]`: A list of custom key-value pairs to be exposed via `fw_cfg`. +* `items=[... : ...]`: A list of custom key-value pairs to be exposed via `fw_cfg`. Multiple items are separated by `:`. * `name=`: The path under which the item will appear in the guest's sysfs (e.g., `opt/org.example/my-data`). - * `file=`: The path to the file on the host whose content will be provided to the guest for this item. + * `file=`: The path to a file on the host whose content will be provided to the guest for this item. + * `string=`: An inline string value to provide to the guest for this item. 
Each item must have exactly one of `file` or `string`, not both. **Example Usage:** @@ -57,7 +58,19 @@ The `fw_cfg` device is configured using the `--fw-cfg-config` command-line optio ``` In the guest, `/tmp/guest_setup.txt` from the host will be accessible at `/sys/firmware/qemu_fw_cfg/by_name/opt/org.mycorp/setup_info/raw`. -2. **Disabling `fw_cfg` explicitly:** +2. **Inline string items (e.g., OVMF MMIO64 configuration for GPU passthrough):** + + ```bash + cloud-hypervisor \ + --firmware /path/to/OVMF.fd \ + --disk path=/path/to/rootfs.img \ + --device path=/sys/bus/pci/devices/0000:41:00.0 \ + --fw-cfg-config items=[name=opt/ovmf/X-PciMmio64Mb,string=262144] \ + ... + ``` + The string `262144` is passed directly to the guest as the content of `opt/ovmf/X-PciMmio64Mb`. + +3. **Disabling `fw_cfg` explicitly:** ```bash cloud-hypervisor \ diff --git a/vmm/src/config.rs b/vmm/src/config.rs index a099897307..7efda7c05a 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -1970,7 +1970,7 @@ impl FsConfig { impl FwCfgConfig { pub const SYNTAX: &'static str = "Boot params to pass to FW CFG device \ \"e820=on|off,kernel=on|off,cmdline=on|off,initramfs=on|off,acpi_table=on|off, \ - items=[name0=,file0=:name1=,file1=]\""; + items=[name=,file=:name=,string=]\""; pub fn parse(fw_cfg_config: &str) -> Result { let mut parser = OptionParser::new(); parser @@ -2034,6 +2034,15 @@ impl FwCfgConfig { } else if self.initramfs && payload.initramfs.is_none() { return Err(PayloadConfigError::FwCfgMissingInitramfs); } + + if let Some(items) = &self.items { + for item in &items.item_list { + if item.file.is_some() == item.string.is_some() { + return Err(PayloadConfigError::FwCfgInvalidItem(item.name.clone())); + } + } + } + Ok(()) } } @@ -2042,7 +2051,7 @@ impl FwCfgConfig { impl FwCfgItem { pub fn parse(fw_cfg: &str) -> Result { let mut parser = OptionParser::new(); - parser.add("name").add("file"); + parser.add("name").add("file").add("string"); 
parser.parse(fw_cfg).map_err(Error::ParseFwCfgItem)?; let name = @@ -2051,13 +2060,9 @@ impl FwCfgItem { .ok_or(Error::ParseFwCfgItem(OptionParserError::InvalidValue( "missing FwCfgItem name".to_string(), )))?; - let file = parser - .get("file") - .map(PathBuf::from) - .ok_or(Error::ParseFwCfgItem(OptionParserError::InvalidValue( - "missing FwCfgItem file path".to_string(), - )))?; - Ok(FwCfgItem { name, file }) + let file = parser.get("file").map(PathBuf::from); + let string = parser.get("string"); + Ok(FwCfgItem { name, file, string }) } } @@ -4981,6 +4986,33 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" )) ); + #[cfg(feature = "fw_cfg")] + { + let mut invalid_config = valid_config.clone(); + if let Some(payload) = invalid_config.payload.as_mut() { + payload.fw_cfg_config = Some(FwCfgConfig { + e820: true, + kernel: false, + cmdline: false, + initramfs: false, + acpi_tables: true, + items: Some(FwCfgItemList { + item_list: vec![FwCfgItem { + name: "opt/org.test/invalid".to_string(), + file: None, + string: None, + }], + }), + }); + } + assert_eq!( + invalid_config.validate(), + Err(ValidationError::PayloadError( + PayloadConfigError::FwCfgInvalidItem("opt/org.test/invalid".to_string()) + )) + ); + } + let mut invalid_config = valid_config.clone(); invalid_config.serial.mode = ConsoleOutputMode::File; invalid_config.serial.file = None; @@ -5747,7 +5779,7 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" // Missing closing bracket FwCfgConfig::parse("items=[name=opt/org.test/fw_cfg_test_item,file=/tmp/fw_cfg_test_item") .unwrap_err(); - // Single Item + // Single file Item assert_eq!( FwCfgConfig::parse( "items=[name=opt/org.test/fw_cfg_test_item,file=/tmp/fw_cfg_test_item]" @@ -5756,13 +5788,14 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" items: Some(FwCfgItemList { item_list: vec![FwCfgItem { name: "opt/org.test/fw_cfg_test_item".to_string(), - file: PathBuf::from("/tmp/fw_cfg_test_item"), + file: 
Some(PathBuf::from("/tmp/fw_cfg_test_item")), + string: None, }] }), ..Default::default() }, ); - // Multiple Items + // Multiple file Items assert_eq!( FwCfgConfig::parse( "items=[name=opt/org.test/fw_cfg_test_item,file=/tmp/fw_cfg_test_item:name=opt/org.test/fw_cfg_test_item2,file=/tmp/fw_cfg_test_item2]" @@ -5772,17 +5805,72 @@ id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" item_list: vec![ FwCfgItem { name: "opt/org.test/fw_cfg_test_item".to_string(), - file: PathBuf::from("/tmp/fw_cfg_test_item"), + file: Some(PathBuf::from("/tmp/fw_cfg_test_item")), + string: None, }, FwCfgItem { name: "opt/org.test/fw_cfg_test_item2".to_string(), - file: PathBuf::from("/tmp/fw_cfg_test_item2"), + file: Some(PathBuf::from("/tmp/fw_cfg_test_item2")), + string: None, } ] }), ..Default::default() }, ); + // Single string Item (for OVMF MMIO64 config, GPU CC passthrough, etc.) + assert_eq!( + FwCfgConfig::parse("items=[name=opt/ovmf/X-PciMmio64Mb,string=262144]")?, + FwCfgConfig { + items: Some(FwCfgItemList { + item_list: vec![FwCfgItem { + name: "opt/ovmf/X-PciMmio64Mb".to_string(), + file: None, + string: Some("262144".to_string()), + }] + }), + ..Default::default() + }, + ); + // Mixed file and string Items + assert_eq!( + FwCfgConfig::parse( + "items=[name=opt/org.test/fw_cfg_test_item,file=/tmp/fw_cfg_test_item:name=opt/ovmf/X-PciMmio64Mb,string=262144]" + )?, + FwCfgConfig { + items: Some(FwCfgItemList { + item_list: vec![ + FwCfgItem { + name: "opt/org.test/fw_cfg_test_item".to_string(), + file: Some(PathBuf::from("/tmp/fw_cfg_test_item")), + string: None, + }, + FwCfgItem { + name: "opt/ovmf/X-PciMmio64Mb".to_string(), + file: None, + string: Some("262144".to_string()), + } + ] + }), + ..Default::default() + }, + ); + // Missing both file and string parses OK but fails validation + let missing_content = + FwCfgConfig::parse("items=[name=opt/org.test/missing_content]").unwrap(); + assert_eq!( + missing_content.items.as_ref().unwrap().item_list[0].file, + 
None + ); + assert_eq!( + missing_content.items.as_ref().unwrap().item_list[0].string, + None + ); + // Both file and string parses OK but fails validation + let both = FwCfgConfig::parse("items=[name=opt/org.test/both,file=/tmp/test,string=test]") + .unwrap(); + assert!(both.items.as_ref().unwrap().item_list[0].file.is_some()); + assert!(both.items.as_ref().unwrap().item_list[0].string.is_some()); Ok(()) } } diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index 6006896691..cd118782e9 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -1185,15 +1185,24 @@ impl Vm { initramfs_option = initramfs; } let mut fw_cfg_item_list_option: Option> = None; - if let Some(fw_cfg_files) = &fw_cfg_config.items { + if let Some(fw_cfg_items) = &fw_cfg_config.items { let mut fw_cfg_item_list = vec![]; - for fw_cfg_file in fw_cfg_files.item_list.clone() { - fw_cfg_item_list.push(FwCfgItem { - name: fw_cfg_file.name, - content: devices::legacy::fw_cfg::FwCfgContent::File( + for fw_cfg_item in fw_cfg_items.item_list.clone() { + let content = match (fw_cfg_item.string, fw_cfg_item.file) { + (Some(string_val), None) => { + devices::legacy::fw_cfg::FwCfgContent::Bytes(string_val.into_bytes()) + } + (None, Some(file_path)) => devices::legacy::fw_cfg::FwCfgContent::File( 0, - File::open(fw_cfg_file.file).map_err(Error::AddingFwCfgItem)?, + File::open(file_path).map_err(Error::AddingFwCfgItem)?, + ), + _ => unreachable!( + "PayloadConfig::validate() ensures either 'file' or 'string' is present" ), + }; + fw_cfg_item_list.push(FwCfgItem { + name: fw_cfg_item.name, + content, }); } fw_cfg_item_list_option = Some(fw_cfg_item_list); diff --git a/vmm/src/vm_config.rs b/vmm/src/vm_config.rs index 3753b4b491..b0fbb531a2 100644 --- a/vmm/src/vm_config.rs +++ b/vmm/src/vm_config.rs @@ -759,6 +759,12 @@ pub enum PayloadConfigError { /// FwCfg missing initramfs #[error("Error --fw-cfg-config: missing --initramfs")] FwCfgMissingInitramfs, + #[cfg(feature = "fw_cfg")] + /// Invalid fw_cfg item content + 
#[error( + "Error --fw-cfg-config: invalid item '{0}' (exactly one of 'file' or 'string' is required)" + )] + FwCfgInvalidItem(String), } #[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] @@ -819,7 +825,9 @@ pub struct FwCfgItem { #[serde(default)] pub name: String, #[serde(default)] - pub file: PathBuf, + pub file: Option, + #[serde(default)] + pub string: Option, } #[cfg(feature = "fw_cfg")] From a77b6231b45e4530971b326476d315e2c390d6b5 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Mon, 20 Apr 2026 12:21:25 +0100 Subject: [PATCH 740/742] tests: Ensure that virtiofsd has exited before hotplugging Fix test flakiness where the virtiofsd daemon was still running and hotplugging was trying to reach the old version. Cleanup the socket so that waiting for it actually waits for the new instance. Signed-off-by: Rob Bradford --- cloud-hypervisor/tests/common/tests_wrappers.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cloud-hypervisor/tests/common/tests_wrappers.rs b/cloud-hypervisor/tests/common/tests_wrappers.rs index 71087a9009..67033956e1 100644 --- a/cloud-hypervisor/tests/common/tests_wrappers.rs +++ b/cloud-hypervisor/tests/common/tests_wrappers.rs @@ -1060,6 +1060,11 @@ pub(crate) fn _test_virtio_fs( }); let (r, hotplug_daemon_child) = if r.is_ok() && hotplug { + let _ = daemon_child.kill(); + let _ = daemon_child.wait(); + // Remove the stale socket so wait_for_virtiofsd_socket actually waits + let _ = std::fs::remove_file(&virtiofsd_socket_path); + let (daemon_child, virtiofsd_socket_path) = prepare_daemon(&guest.tmp_dir, shared_dir.to_str().unwrap()); From ae646f9220f51320edcd53324538f60b0df175cd Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Mon, 20 Apr 2026 12:24:52 +0100 Subject: [PATCH 741/742] tests: Use wait_until() in test_pci_device_id The SSH connection may fail initially when under load so use `wait_until()` to allow retries. 
Signed-off-by: Rob Bradford --- cloud-hypervisor/tests/integration.rs | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 75250bcb20..f8915ec4a1 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -5844,16 +5844,18 @@ mod common_parallel { })); // Calculate the succeeding device ID let device_id_to_allocate = first_free_device_id + 1; - // We expect the succeeding device ID to be free (single attempt, no retries) - assert!(matches!( - ssh_command_ip_with_auth( - &format!("lspci -n | grep \"00:{device_id_to_allocate:02x}.0\""), - &default_guest_auth(), - &guest.network.guest_ip0, - Some(Duration::from_secs(1)), - ), - Err(SshCommandError::NonZeroExitStatus(1)) - )); + // We expect the succeeding device ID to be free. + assert!(wait_until(Duration::from_secs(10), || { + matches!( + ssh_command_ip_with_auth( + &format!("lspci -n | grep \"00:{device_id_to_allocate:02x}.0\""), + &default_guest_auth(), + &guest.network.guest_ip0, + Some(Duration::from_secs(5)), + ), + Err(SshCommandError::NonZeroExitStatus(1)) + ) + })); // Add a device to the next device slot explicitly let (cmd_success, cmd_stdout, _) = remote_command_w_output( From 305451cce4f5bbfbe7a0aa7820063f7d3fbe251b Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Fri, 17 Apr 2026 12:39:52 -0700 Subject: [PATCH 742/742] scripts: dev_cli: Add signal handling for graceful cleanup During development, it is common to cancel a running test and rerun it after making changes. However, pressing Ctrl+C while dev_cli.sh runs long-running container commands (wget, qemu-img, cargo build, etc.) does not reliably terminate the process. Bash defers signal handling while a foreground process is running, so the trap only fires after the docker run command returns. This makes it difficult to cancel and restart quickly. 
Fix this by introducing a run_container() wrapper that runs docker in the background and uses 'wait', which is immediately interruptible by signals. A cleanup() trap handler is set for SIGINT and SIGTERM that kills the named container, the tracked background PID, and any remaining child processes. Each docker run invocation is assigned a unique --name based on the script PID (clh-dev-$$) to allow targeted cleanup. The interactive shell (cmd_shell) is left unwrapped since it needs foreground terminal I/O. Signed-off-by: Muminul Islam --- scripts/dev_cli.sh | 75 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 64 insertions(+), 11 deletions(-) diff --git a/scripts/dev_cli.sh b/scripts/dev_cli.sh index 6e9d58eb7b..3b94513216 100755 --- a/scripts/dev_cli.sh +++ b/scripts/dev_cli.sh @@ -53,6 +53,46 @@ CARGO_TARGET_DIR="${CLH_BUILD_DIR}/cargo_target" # Let tests know that the special environment is set up. RUSTFLAGS="${RUSTFLAGS} --cfg devcli_testenv" +# Container name used for cleanup on signal. The PID makes it unique per +# invocation so parallel runs do not collide. +CLH_CTR_NAME="clh-dev-$$" + +# PID of the docker run process launched by run_container(). +CLH_CTR_PID="" + +# Cleanup handler: kill the running container (if any) and all child +# processes, then exit. +cleanup() { + echo "[$CLI_NAME] Caught signal, terminating..." + # Disable the trap to prevent recursion + trap - INT TERM + # Kill the Docker/Podman container by name + $DOCKER_RUNTIME kill "$CLH_CTR_NAME" 2>/dev/null + $DOCKER_RUNTIME kill "${CLH_CTR_NAME}-fix" 2>/dev/null + # Kill the docker run process tracked by run_container() + [ -n "$CLH_CTR_PID" ] && kill -TERM "$CLH_CTR_PID" 2>/dev/null + # Kill any remaining child processes + pkill -TERM -P $$ 2>/dev/null + wait 2>/dev/null + exit 1 +} + +trap cleanup INT TERM + +# Run a command in the background and wait for it. 
Bash defers trap +# handling while a foreground process is running, which makes Ctrl+C +# unresponsive during long-running container commands (wget, qemu-img, +# cargo build, etc). By backgrounding the command and using `wait`, +# the trap fires immediately when a signal arrives. +run_container() { + "$@" & + CLH_CTR_PID=$! + wait $CLH_CTR_PID + local rc=$? + CLH_CTR_PID="" + return $rc +} + # Send a decorated message to stdout, followed by a new line # say() { @@ -146,6 +186,7 @@ fix_dir_perms() { # Yes, running Docker to get elevated privileges, just to chown some files # is a dirty hack. $DOCKER_RUNTIME run \ + --name "${CLH_CTR_NAME}-fix" \ --workdir "$CTR_CLH_ROOT_DIR" \ --rm \ --volume /dev:/dev \ @@ -314,7 +355,8 @@ cmd_build() { rustflags="$rustflags -C link-args=-Wl,-Bstatic -C link-args=-lc" fi - $DOCKER_RUNTIME run \ + run_container "$DOCKER_RUNTIME" run \ + --name "$CLH_CTR_NAME" \ --user "$(id -u):$(id -g)" \ --workdir "$CTR_CLH_ROOT_DIR" \ --rm \ @@ -335,7 +377,8 @@ cmd_clean() { ensure_build_dir ensure_latest_ctr - $DOCKER_RUNTIME run \ + run_container "$DOCKER_RUNTIME" run \ + --name "$CLH_CTR_NAME" \ --user "$(id -u):$(id -g)" \ --workdir "$CTR_CLH_ROOT_DIR" \ --rm \ @@ -432,7 +475,8 @@ cmd_tests() { if [[ "$unit" = true ]]; then say "Running unit tests for $target..." - $DOCKER_RUNTIME run \ + run_container "$DOCKER_RUNTIME" run \ + --name "$CLH_CTR_NAME" \ --workdir "$CTR_CLH_ROOT_DIR" \ --rm \ --device $exported_device \ @@ -451,7 +495,8 @@ cmd_tests() { if [ "$integration" = true ]; then say "Running integration tests for $target..." - $DOCKER_RUNTIME run \ + run_container "$DOCKER_RUNTIME" run \ + --name "$CLH_CTR_NAME" \ --workdir "$CTR_CLH_ROOT_DIR" \ --rm \ --privileged \ @@ -477,7 +522,8 @@ cmd_tests() { mkdir -p "$DEST_IGVM_FILES_PATH" copy_igvm_files "$SRC_IGVM_FILES_PATH" "$DEST_IGVM_FILES_PATH" say "Running CVM integration tests for $target..." 
- $DOCKER_RUNTIME run \ + run_container "$DOCKER_RUNTIME" run \ + --name "$CLH_CTR_NAME" \ --workdir "$CTR_CLH_ROOT_DIR" \ --rm \ --privileged \ @@ -502,7 +548,8 @@ cmd_tests() { if [ "$integration_vfio" = true ]; then say "Running VFIO integration tests for $target..." - $DOCKER_RUNTIME run \ + run_container "$DOCKER_RUNTIME" run \ + --name "$CLH_CTR_NAME" \ --workdir "$CTR_CLH_ROOT_DIR" \ --rm \ --privileged \ @@ -525,7 +572,8 @@ cmd_tests() { if [ "$integration_windows" = true ]; then say "Running Windows integration tests for $target..." - $DOCKER_RUNTIME run \ + run_container "$DOCKER_RUNTIME" run \ + --name "$CLH_CTR_NAME" \ --workdir "$CTR_CLH_ROOT_DIR" \ --rm \ --privileged \ @@ -548,7 +596,8 @@ cmd_tests() { if [ "$integration_live_migration" = true ]; then say "Running 'live migration' integration tests for $target..." - $DOCKER_RUNTIME run \ + run_container "$DOCKER_RUNTIME" run \ + --name "$CLH_CTR_NAME" \ --workdir "$CTR_CLH_ROOT_DIR" \ --rm \ --privileged \ @@ -573,7 +622,8 @@ cmd_tests() { if [ "$integration_rate_limiter" = true ]; then say "Running 'rate limiter' integration tests for $target..." - $DOCKER_RUNTIME run \ + run_container "$DOCKER_RUNTIME" run \ + --name "$CLH_CTR_NAME" \ --workdir "$CTR_CLH_ROOT_DIR" \ --rm \ --privileged \ @@ -596,7 +646,8 @@ cmd_tests() { if [ "$metrics" = true ]; then say "Generating performance metrics for $target..." - $DOCKER_RUNTIME run \ + run_container "$DOCKER_RUNTIME" run \ + --name "$CLH_CTR_NAME" \ --workdir "$CTR_CLH_ROOT_DIR" \ --rm \ --privileged \ @@ -620,7 +671,8 @@ cmd_tests() { if [ "$coverage" = true ]; then say "Generating code coverage information for $target..." - $DOCKER_RUNTIME run \ + run_container "$DOCKER_RUNTIME" run \ + --name "$CLH_CTR_NAME" \ --workdir "$CTR_CLH_ROOT_DIR" \ --rm \ --privileged \ @@ -721,6 +773,7 @@ cmd_shell() { fi $DOCKER_RUNTIME run \ + --name "$CLH_CTR_NAME" \ $tty_args \ --workdir "$CTR_CLH_ROOT_DIR" \ --rm \