diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 781912648b..2f3865fb1a 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -37,6 +37,11 @@ updates: interval: weekly allow: - dependency-type: all + cooldown: + default-days: 7 + semver-major-days: 14 + semver-minor-days: 7 + semver-patch-days: 3 ignore: - dependency-name: "acpi_tables" - dependency-name: "kvm-bindings" diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index e3b1a9e7f7..628c163db8 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -64,6 +64,9 @@ jobs: - name: Build (sev_snp) run: cargo build --locked --bin cloud-hypervisor --no-default-features --features "sev_snp" + - name: Build (kvm + igvm + sev_snp + fw_cfg) + run: cargo build --locked --bin cloud-hypervisor --no-default-features --features "kvm,igvm,sev_snp,fw_cfg" + - name: Build (igvm) run: cargo build --locked --bin cloud-hypervisor --no-default-features --features "igvm" diff --git a/.github/workflows/dco.yaml b/.github/workflows/dco.yaml index 655c0b5e2f..67dfadd5c4 100644 --- a/.github/workflows/dco.yaml +++ b/.github/workflows/dco.yaml @@ -16,5 +16,6 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | + set -eufo pipefail pip3 install -U dco-check dco-check -e "49699333+dependabot[bot]@users.noreply.github.com" diff --git a/.github/workflows/docker-image.yaml b/.github/workflows/docker-image.yaml index 6a52f1edf0..8636d35f00 100644 --- a/.github/workflows/docker-image.yaml +++ b/.github/workflows/docker-image.yaml @@ -21,13 +21,13 @@ jobs: uses: actions/checkout@v6 - name: Set up QEMU - uses: docker/setup-qemu-action@v3 + uses: docker/setup-qemu-action@v4 - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 + uses: docker/setup-buildx-action@v4 - name: Login to ghcr - uses: docker/login-action@v3 + uses: docker/login-action@v4 with: registry: ${{ env.REGISTRY }} username: ${{ github.actor }} @@ -36,7 +36,7 @@ jobs: - name: Docker 
meta id: meta - uses: docker/metadata-action@v5 + uses: docker/metadata-action@v6 with: images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} # generate Docker tags based on the following events/attributes @@ -46,7 +46,7 @@ jobs: - name: Build and push if: ${{ github.event_name == 'push' }} - uses: docker/build-push-action@v6 + uses: docker/build-push-action@v7 with: file: ./resources/Dockerfile platforms: linux/amd64,linux/arm64 @@ -55,7 +55,7 @@ jobs: - name: Build only if: ${{ github.event_name == 'pull_request' }} - uses: docker/build-push-action@v6 + uses: docker/build-push-action@v7 with: file: ./resources/Dockerfile platforms: linux/amd64,linux/arm64 diff --git a/.github/workflows/gitlint.yaml b/.github/workflows/gitlint.yaml index 6fd0ec4ab9..178f15aa40 100644 --- a/.github/workflows/gitlint.yaml +++ b/.github/workflows/gitlint.yaml @@ -22,4 +22,4 @@ jobs: pip install --upgrade gitlint - name: Lint git commit messages run: | - gitlint --commits origin/$GITHUB_BASE_REF.. + gitlint --commits "origin/$GITHUB_BASE_REF.." 
diff --git a/.github/workflows/integration-arm64.yaml b/.github/workflows/integration-arm64.yaml index 0678dbc839..873daaa747 100644 --- a/.github/workflows/integration-arm64.yaml +++ b/.github/workflows/integration-arm64.yaml @@ -8,24 +8,30 @@ jobs: build: timeout-minutes: 120 name: Tests (ARM64) - runs-on: bookworm-arm64 + runs-on: ${{ github.event_name == 'pull_request' && 'ubuntu-latest' || 'bookworm-arm64' }} steps: - name: Fix workspace permissions + if: ${{ github.event_name != 'pull_request' }} run: sudo chown -R runner:runner ${GITHUB_WORKSPACE} - name: Code checkout + if: ${{ github.event_name != 'pull_request' }} uses: actions/checkout@v6 with: fetch-depth: 0 - name: Run unit tests (musl) + if: ${{ github.event_name != 'pull_request' }} run: scripts/dev_cli.sh tests --unit --libc musl - name: Load openvswitch module + if: ${{ github.event_name != 'pull_request' }} run: sudo modprobe openvswitch - name: Run integration tests (musl) + if: ${{ github.event_name != 'pull_request' }} timeout-minutes: 60 run: scripts/dev_cli.sh tests --integration --libc musl - name: Install Azure CLI if: ${{ github.event_name != 'pull_request' }} run: | + set -eufo pipefail sudo apt install -y ca-certificates curl apt-transport-https lsb-release gnupg curl -sL https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor | sudo tee /etc/apt/trusted.gpg.d/microsoft.gpg > /dev/null echo "deb [arch=arm64] https://packages.microsoft.com/repos/azure-cli/ bookworm main" | sudo tee /etc/apt/sources.list.d/azure-cli.list @@ -35,6 +41,7 @@ jobs: if: ${{ github.event_name != 'pull_request' }} shell: bash run: | + set -eufo pipefail IMG_BASENAME=windows-11-iot-enterprise-aarch64.raw IMG_PATH=$HOME/workloads/$IMG_BASENAME IMG_GZ_PATH=$HOME/workloads/$IMG_BASENAME.gz @@ -47,8 +54,11 @@ jobs: popd mkdir -p "$HOME/workloads" az storage blob download --container-name private-images --file "$IMG_GZ_PATH" --name "$IMG_GZ_BLOB_NAME" --connection-string "${{ secrets.CH_PRIVATE_IMAGES }}" - 
gzip -d $IMG_GZ_PATH + gzip -d "$IMG_GZ_PATH" - name: Run Windows guest integration tests if: ${{ github.event_name != 'pull_request' }} timeout-minutes: 30 run: scripts/dev_cli.sh tests --integration-windows --libc musl + - name: Skipping build for PR + if: ${{ github.event_name == 'pull_request' }} + run: echo "Skipping build for PR" diff --git a/.github/workflows/integration-metrics.yaml b/.github/workflows/integration-metrics.yaml index 952e938fdf..4e66f4b614 100644 --- a/.github/workflows/integration-metrics.yaml +++ b/.github/workflows/integration-metrics.yaml @@ -17,6 +17,6 @@ jobs: fetch-depth: 0 - name: Run metrics tests timeout-minutes: 60 - run: scripts/dev_cli.sh tests --metrics -- -- --report-file /root/workloads/metrics.json + run: scripts/dev_cli.sh tests --metrics -- --test-exclude micro_ -- --report-file /root/workloads/metrics.json - name: Upload metrics report run: 'curl -X PUT https://ch-metrics.azurewebsites.net/api/publishmetrics -H "x-functions-key: $METRICS_PUBLISH_KEY" -T ~/workloads/metrics.json' diff --git a/.github/workflows/integration-vfio.yaml b/.github/workflows/integration-vfio.yaml index 218e897270..b4f2ca2f94 100644 --- a/.github/workflows/integration-vfio.yaml +++ b/.github/workflows/integration-vfio.yaml @@ -13,7 +13,7 @@ jobs: steps: - name: Fix workspace permissions if: ${{ github.event_name != 'pull_request' }} - run: sudo chown -R runner:runner ${GITHUB_WORKSPACE} + run: sudo chown -R github-runner:github-runner "${GITHUB_WORKSPACE}" - name: Code checkout if: ${{ github.event_name != 'pull_request' }} uses: actions/checkout@v6 diff --git a/.github/workflows/integration-windows.yaml b/.github/workflows/integration-windows.yaml index 51877aa476..1010ab73ec 100644 --- a/.github/workflows/integration-windows.yaml +++ b/.github/workflows/integration-windows.yaml @@ -17,6 +17,7 @@ jobs: - name: Install Docker if: ${{ github.event_name != 'pull_request' }} run: | + set -eufo pipefail sudo apt-get update sudo apt-get -y install 
ca-certificates curl gnupg curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg @@ -27,6 +28,7 @@ jobs: - name: Install Azure CLI if: ${{ github.event_name != 'pull_request' }} run: | + set -eufo pipefail sudo apt install -y ca-certificates curl apt-transport-https lsb-release gnupg curl -sL https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor | sudo tee /etc/apt/trusted.gpg.d/microsoft.gpg > /dev/null echo "deb [arch=amd64] https://packages.microsoft.com/repos/azure-cli/ jammy main" | sudo tee /etc/apt/sources.list.d/azure-cli.list @@ -35,8 +37,9 @@ jobs: - name: Download Windows image if: ${{ github.event_name != 'pull_request' }} run: | + set -eufo pipefail mkdir $HOME/workloads - az storage blob download --container-name private-images --file "$HOME/workloads/windows-server-2022-amd64-2.raw" --name windows-server-2022-amd64-2.raw --connection-string "${{ secrets.CH_PRIVATE_IMAGES }}" + az storage blob download --container-name private-images --file "$HOME/workloads/windows-server-2025-amd64-1.raw" --name windows-server-2025-amd64-1.raw --connection-string "${{ secrets.CH_PRIVATE_IMAGES }}" - name: Run Windows guest integration tests if: ${{ github.event_name != 'pull_request' }} timeout-minutes: 15 diff --git a/.github/workflows/integration-x86-64.yaml b/.github/workflows/integration-x86-64.yaml index 9334b242ea..98e2a8dcdc 100644 --- a/.github/workflows/integration-x86-64.yaml +++ b/.github/workflows/integration-x86-64.yaml @@ -6,7 +6,7 @@ concurrency: jobs: build: - timeout-minutes: 60 + timeout-minutes: 80 strategy: fail-fast: false matrix: @@ -23,6 +23,7 @@ jobs: - name: Install Docker if: ${{ github.event_name != 'pull_request' || (matrix.runner == 'garm-jammy' && matrix.libc == 'gnu') }} run: | + set -eufo pipefail sudo apt-get update sudo apt-get -y install ca-certificates curl gnupg curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o 
/usr/share/keyrings/docker-archive-keyring.gpg @@ -41,7 +42,7 @@ jobs: run: sudo modprobe openvswitch - name: Run integration tests if: ${{ github.event_name != 'pull_request' || (matrix.runner == 'garm-jammy' && matrix.libc == 'gnu') }} - timeout-minutes: 40 + timeout-minutes: 60 run: scripts/dev_cli.sh tests --integration --libc ${{ matrix.libc }} - name: Run live-migration integration tests if: ${{ github.event_name != 'pull_request' || (matrix.runner == 'garm-jammy' && matrix.libc == 'gnu') }} diff --git a/.github/workflows/lychee.yaml b/.github/workflows/lychee.yaml index e77c595ed3..105e2e9a6e 100644 --- a/.github/workflows/lychee.yaml +++ b/.github/workflows/lychee.yaml @@ -20,6 +20,7 @@ jobs: # NEW STEP: Print all changed-files outputs for verification - name: Verify Changed Files run: | + set -eufo pipefail echo "--- tj-actions/changed-files Outputs ---" echo "any_changed: ${{ steps.changed-files.outputs.any_changed }}" echo "all_changed_files: ${{ steps.changed-files.outputs.all_changed_files }}" diff --git a/.github/workflows/mshv-infra.yaml b/.github/workflows/mshv-infra.yaml index ad71f69041..89cb5f6fbc 100644 --- a/.github/workflows/mshv-infra.yaml +++ b/.github/workflows/mshv-infra.yaml @@ -13,7 +13,7 @@ on: OS_DISK_SIZE: description: 'OS Disk Size in GB' required: true - type: string + type: number RG: description: 'Resource Group Name' required: true @@ -59,7 +59,7 @@ jobs: env: MI_CLIENT_ID: ${{ secrets.MI_CLIENT_ID }} run: | - set -e + set -eufo pipefail echo "Installing Azure CLI if not already installed" if ! 
command -v az &>/dev/null; then curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash @@ -68,7 +68,7 @@ jobs: fi az --version echo "Logging into Azure CLI using Managed Identity" - az login --identity --client-id ${MI_CLIENT_ID} + az login --identity --client-id "${MI_CLIENT_ID}" - name: Get Location id: get-location @@ -76,13 +76,13 @@ jobs: SKU: ${{ inputs.VM_SKU }} STORAGE_ACCOUNT_PATHS: ${{ secrets.STORAGE_ACCOUNT_PATHS }} run: | - set -e + set -eufo pipefail # Extract vCPU count from SKU (e.g., "Standard_D2s_v3" => 2) - vcpu=$(echo "$SKU" | sed -n 's/^Standard_[A-Za-z]\+\([0-9]\+\).*/\1/p') - if [[ -z "$vcpu" ]]; then - echo "Cannot extract vCPU count from SKU: $SKU" + if ! [[ "$SKU" =~ ^Standard_[A-Za-z]+([1-9][0-9]*) ]]; then + printf 'Cannot extract vCPU count from SKU: %q\n' "$SKU" exit 1 fi + vcpu=${BASH_REMATCH[1]} SUPPORTED_LOCATIONS=$(echo "$STORAGE_ACCOUNT_PATHS" | jq -r 'to_entries[] | .key') @@ -93,11 +93,9 @@ jobs: continue fi - usage=$(az vm list-usage --location "$location" --query "[?name.value=='$family'] | [0]" -o json) - current=$(echo "$usage" | jq -r '.currentValue') - limit=$(echo "$usage" | jq -r '.limit') - - if [[ $((limit - current)) -ge $vcpu ]]; then + remaining=$(az vm list-usage --location "$location" --query "[?name.value=='$family'] | [0]" -o json | + jq '(.limit | tonumber) - (.currentValue | tonumber) >= ($ARGS.positional[0] | tonumber)' --jsonargs "$vcpu") + if [[ "$remaining" = true ]]; then echo "Sufficient quota found in $location" echo "location=$location" >> "$GITHUB_OUTPUT" exit 0 @@ -114,11 +112,11 @@ jobs: RG: ${{ inputs.RG }} STORAGE_ACCOUNT_PATHS: ${{ secrets.STORAGE_ACCOUNT_PATHS }} run: | - set -e + set -eufo pipefail echo "Creating Resource Group: $RG" # Create the resource group echo "Creating resource group in location: ${LOCATION}" - az group create --name ${RG} --location ${LOCATION} + az group create --name "${RG}" --location "${LOCATION}" echo "RG_NAME=${RG}" >> $GITHUB_OUTPUT echo "Resource group created 
successfully." @@ -127,10 +125,10 @@ jobs: env: KEY: ${{ inputs.KEY }} run: | - set -e + set -eufo pipefail echo "Generating SSH key: $KEY" mkdir -p ~/.ssh - ssh-keygen -t rsa -b 4096 -f ~/.ssh/${KEY} -N "" + ssh-keygen -t rsa -b 4096 -f ~/.ssh/"${KEY}" -N "" - name: Create VM id: vm-setup @@ -145,12 +143,12 @@ jobs: VM_IMAGE_NAME: ${{ inputs.ARCH }}_${{ steps.get-location.outputs.location }}_image VM_NAME: ${{ inputs.ARCH }}_${{ steps.get-location.outputs.location }}_${{ github.run_id }} run: | - set -e + set -eufo pipefail echo "Creating $VM_SKU VM: $VM_NAME" # Extract subnet ID from the runner VM echo "Retrieving subnet ID..." - SUBNET_ID=$(az network vnet list --resource-group ${RUNNER_RG} --query "[?contains(location, '${LOCATION}')].{SUBNETS:subnets}" | jq -r ".[0].SUBNETS[0].id") + SUBNET_ID=$(az network vnet list --resource-group "$RUNNER_RG" --query "[?contains(location, '${LOCATION}')].{SUBNETS:subnets}" | jq -r ".[0].SUBNETS[0].id") if [[ -z "${SUBNET_ID}" ]]; then echo "ERROR: Failed to retrieve Subnet ID." exit 1 @@ -158,7 +156,7 @@ jobs: # Extract image ID from the runner VM echo "Retrieving image ID..." - IMAGE_ID=$(az image show --resource-group ${RUNNER_RG} --name ${VM_IMAGE_NAME} --query "id" -o tsv) + IMAGE_ID=$(az image show --resource-group "$RUNNER_RG" --name "$VM_IMAGE_NAME" --query "id" -o tsv) if [[ -z "${IMAGE_ID}" ]]; then echo "ERROR: Failed to retrieve Image ID." 
exit 1 @@ -166,24 +164,24 @@ jobs: # Create VM az vm create \ - --resource-group ${RG} \ - --name ${VM_NAME} \ - --subnet ${SUBNET_ID} \ - --size ${VM_SKU} \ - --location ${LOCATION} \ - --image ${IMAGE_ID} \ - --os-disk-size-gb ${OS_DISK_SIZE} \ + --resource-group "${RG}" \ + --name "${VM_NAME}" \ + --subnet "${SUBNET_ID}" \ + --size "${VM_SKU}" \ + --location "${LOCATION}" \ + --image "${IMAGE_ID}" \ + --os-disk-size-gb "${OS_DISK_SIZE}" \ --public-ip-sku Standard \ --storage-sku Premium_LRS \ --public-ip-address "" \ - --admin-username ${USERNAME} \ - --ssh-key-value ~/.ssh/${KEY}.pub \ + --admin-username "${USERNAME}" \ + --ssh-key-value ~/.ssh/"${KEY}".pub \ --security-type Standard \ --output json - az vm boot-diagnostics enable --name ${VM_NAME} --resource-group ${RG} + az vm boot-diagnostics enable --name "${VM_NAME}" --resource-group "${RG}" - echo "VM_NAME=${VM_NAME}" >> $GITHUB_OUTPUT + echo "VM_NAME=${VM_NAME}" >> "$GITHUB_OUTPUT" echo "VM creation process completed successfully." - name: Get VM Private IP @@ -192,15 +190,15 @@ jobs: RG: ${{ inputs.RG }} VM_NAME: ${{ inputs.ARCH }}_${{ steps.get-location.outputs.location }}_${{ github.run_id }} run: | - set -e + set -eufo pipefail echo "Retrieving VM Private IP address..." # Retrieve VM Private IP address - PRIVATE_IP=$(az vm show -g ${RG} -n ${VM_NAME} -d --query privateIps -o tsv) + PRIVATE_IP=$(az vm show -g "${RG}" -n "${VM_NAME}" -d --query privateIps -o tsv) if [[ -z "$PRIVATE_IP" ]]; then echo "ERROR: Failed to retrieve private IP address." exit 1 fi - echo "PRIVATE_IP=$PRIVATE_IP" >> $GITHUB_OUTPUT + echo "PRIVATE_IP=$PRIVATE_IP" >> "$GITHUB_OUTPUT" - name: Wait for SSH availability env: @@ -209,16 +207,16 @@ jobs: USERNAME: ${{ secrets.USERNAME }} run: | echo "Waiting for SSH to be accessible..." 
- timeout 120 bash -c 'until ssh -o StrictHostKeyChecking=no -i ~/.ssh/${KEY} ${USERNAME}@${PRIVATE_IP} "exit" 2>/dev/null; do sleep 5; done' + timeout 120 bash -c 'until ssh -o StrictHostKeyChecking=no -i ~/.ssh/"${KEY}" -- "${USERNAME}@${PRIVATE_IP}" "exit" 2>/dev/null; do sleep 5; done' echo "VM is accessible!" - name: Remove Old Host Key env: PRIVATE_IP: ${{ steps.get-vm-ip.outputs.PRIVATE_IP }} run: | - set -e + set -eufo pipefail echo "Removing the old host key" - ssh-keygen -R $PRIVATE_IP + ssh-keygen -R "$PRIVATE_IP" - name: SSH into VM and Install Dependencies env: @@ -226,9 +224,9 @@ jobs: PRIVATE_IP: ${{ steps.get-vm-ip.outputs.PRIVATE_IP }} USERNAME: ${{ secrets.USERNAME }} run: | - set -e - ssh -i ~/.ssh/${KEY} -o StrictHostKeyChecking=no ${USERNAME}@${PRIVATE_IP} << EOF - set -e + set -eufo pipefail + ssh -i ~/.ssh/"${KEY}" -o StrictHostKeyChecking=no -- "${USERNAME}@${PRIVATE_IP}" << EOF + set -eufo pipefail echo "Logged in successfully." echo "Installing dependencies..." sudo tdnf install -y git moby-engine moby-cli clang llvm pkg-config make gcc glibc-devel @@ -243,6 +241,6 @@ jobs: sudo systemctl enable containerd.service sudo systemctl start docker sudo groupadd -f docker - sudo usermod -a -G docker ${USERNAME} + sudo usermod -a -G docker "${USERNAME}" sudo systemctl restart docker EOF diff --git a/.github/workflows/mshv-integration.yaml b/.github/workflows/mshv-integration.yaml index 261b84d546..437cf44f6c 100644 --- a/.github/workflows/mshv-integration.yaml +++ b/.github/workflows/mshv-integration.yaml @@ -1,5 +1,6 @@ name: Cloud Hypervisor Tests (MSHV) (x86_64) on: [pull_request_target, merge_group] +permissions: {} jobs: infra-setup: @@ -35,9 +36,9 @@ jobs: RG: MSHV-${{ github.run_id }} USERNAME: ${{ secrets.MSHV_USERNAME }} run: | - set -e + set -eufo pipefail echo "Connecting to the VM via SSH..." 
- ssh -i ~/.ssh/${KEY} -o StrictHostKeyChecking=no ${USERNAME}@${PRIVATE_IP} << EOF + ssh -i ~/.ssh/"${KEY}" -o StrictHostKeyChecking=no -- "${USERNAME}@${PRIVATE_IP}" << EOF set -e echo "Logged in successfully." export PATH="\$HOME/.cargo/bin:\$PATH" @@ -66,12 +67,12 @@ jobs: echo "Setting permissions..." for i in 0 1 2; do - dev="/dev/vhost-vdpa-$i" - if [ -e "$dev" ]; then - sudo chown $USER:$USER "$dev" - sudo chmod 660 "$dev" + dev="/dev/vhost-vdpa-\$i" + if [ -e "\$dev" ]; then + sudo chown \$USER:\$USER "\$dev" + sudo chmod 660 "\$dev" else - echo "Warning: Device $dev not found" + echo "Warning: Device \$dev not found" fi done @@ -86,9 +87,7 @@ jobs: PRIVATE_IP: ${{ needs.infra-setup.outputs.PRIVATE_IP }} USERNAME: ${{ secrets.MSHV_USERNAME }} run: | - ssh -i ~/.ssh/${KEY} -o StrictHostKeyChecking=no ${USERNAME}@${PRIVATE_IP} << EOF - sudo dmesg - EOF + ssh -i ~/.ssh/"${KEY}" -o StrictHostKeyChecking=no -- "${USERNAME}@${PRIVATE_IP}" sudo dmesg - name: Dump serial console logs if: always() @@ -97,7 +96,7 @@ jobs: RG_NAME: ${{ needs.infra-setup.outputs.RG_NAME }} VM_NAME: ${{ needs.infra-setup.outputs.VM_NAME }} run: | - set -e + set -eufo pipefail az vm boot-diagnostics get-boot-log --name "${VM_NAME}" --resource-group "${RG_NAME}" | jq -r cleanup: @@ -110,8 +109,8 @@ jobs: env: RG: MSHV-INTEGRATION-${{ github.run_id }} run: | - if az group exists --name ${RG}; then - az group delete --name ${RG} --yes --no-wait + if az group exists --name "${RG}"; then + az group delete --name "${RG}" --yes --no-wait else echo "Resource Group ${RG} does not exist. Skipping deletion." fi @@ -121,8 +120,8 @@ jobs: env: KEY: azure_key_${{ github.run_id }} run: | - if [ -f ~/.ssh/${KEY} ]; then - rm -f ~/.ssh/${KEY} ~/.ssh/${KEY}.pub + if [ -f ~/.ssh/"${KEY}" ]; then + rm -f ~/.ssh/"${KEY}" ~/.ssh/"${KEY}.pub" echo "SSH key deleted successfully." else echo "SSH key does not exist. Skipping deletion." 
diff --git a/.github/workflows/package-consistency.yaml b/.github/workflows/package-consistency.yaml index df7f01b8af..7f7808c882 100644 --- a/.github/workflows/package-consistency.yaml +++ b/.github/workflows/package-consistency.yaml @@ -27,6 +27,7 @@ jobs: - name: Check Rust VMM Package Consistency of fuzz Workspace run: | + set -eufo pipefail pushd fuzz python3 ../scripts/package-consistency-check.py github.com/rust-vmm popd diff --git a/.github/workflows/preview-riscv64-build.yaml b/.github/workflows/preview-riscv64-build.yaml deleted file mode 100644 index ad87232d3d..0000000000 --- a/.github/workflows/preview-riscv64-build.yaml +++ /dev/null @@ -1,30 +0,0 @@ -name: Cloud Hypervisor RISC-V 64-bit kvm build Preview -on: [pull_request, merge_group] -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event_name }} - cancel-in-progress: true - -jobs: - build: - name: Cargo - runs-on: riscv64-qemu-host - strategy: - fail-fast: false - - steps: - - name: Code checkout - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - - name: Install Rust toolchain - run: /opt/scripts/exec-in-qemu.sh rustup default 1.89.0 - - - name: Build test (kvm) - run: /opt/scripts/exec-in-qemu.sh cargo build --locked --no-default-features --features "kvm" -p cloud-hypervisor - - - name: Clippy test (kvm) - run: /opt/scripts/exec-in-qemu.sh cargo clippy --locked --no-default-features --features "kvm" -p cloud-hypervisor - - - name: Check no files were modified - run: test -z "$(git status --porcelain)" diff --git a/.github/workflows/preview-riscv64-modules.yaml b/.github/workflows/preview-riscv64-modules.yaml deleted file mode 100644 index 1b7ac6ed16..0000000000 --- a/.github/workflows/preview-riscv64-modules.yaml +++ /dev/null @@ -1,39 +0,0 @@ -name: Cloud Hypervisor RISC-V 64-bit Preview -on: [pull_request, merge_group] -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ 
github.event_name }} - cancel-in-progress: true - -jobs: - build: - name: Cargo - runs-on: riscv64-qemu-host - strategy: - fail-fast: false - matrix: - module: - - hypervisor - - arch - - vm-allocator - - devices - - steps: - - name: Code checkout - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - - name: Install Rust toolchain - run: /opt/scripts/exec-in-qemu.sh rustup default 1.89.0 - - - name: Build ${{ matrix.module }} Module (kvm) - run: /opt/scripts/exec-in-qemu.sh cargo build --locked -p ${{ matrix.module }} --no-default-features --features "kvm" - - - name: Clippy ${{ matrix.module }} Module (kvm) - run: /opt/scripts/exec-in-qemu.sh cargo clippy --locked -p ${{ matrix.module }} --no-default-features --features "kvm" -- -D warnings - - - name: Test ${{ matrix.module }} Module (kvm) - run: /opt/scripts/exec-in-qemu.sh cargo test --locked -p ${{ matrix.module }} --no-default-features --features "kvm" - - - name: Check no files were modified - run: test -z "$(git status --porcelain)" diff --git a/.github/workflows/quality.yaml b/.github/workflows/quality.yaml index 0427708458..1290b0f872 100644 --- a/.github/workflows/quality.yaml +++ b/.github/workflows/quality.yaml @@ -44,7 +44,7 @@ jobs: - name: Bisectability Check (default features) if: ${{ github.event_name == 'pull_request' && matrix.target == 'x86_64-unknown-linux-gnu' }} run: | - set -e + set -eufo pipefail commits=$(git rev-list origin/${{ github.base_ref }}..${{ github.sha }}) for commit in $commits; do git checkout $commit; cargo check --tests --examples --all --target=${{ matrix.target }}; done git checkout ${{ github.sha }} @@ -53,7 +53,6 @@ jobs: uses: houseabsolute/actions-rust-cross@v1 with: command: clippy - cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 toolchain: ${{ matrix.rust }} target: ${{ matrix.target }} args: --locked --all --all-targets --no-default-features --tests --examples --features "kvm" -- -D warnings @@ -62,7 +61,6 @@ jobs: uses: 
houseabsolute/actions-rust-cross@v1 with: command: clippy - cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 toolchain: ${{ matrix.rust }} target: ${{ matrix.target }} args: --locked --all --all-targets --no-default-features --tests --examples --features "mshv" -- -D warnings @@ -71,7 +69,6 @@ jobs: uses: houseabsolute/actions-rust-cross@v1 with: command: clippy - cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 toolchain: ${{ matrix.rust }} target: ${{ matrix.target }} args: --locked --all --all-targets --no-default-features --tests --examples --features "mshv,kvm" -- -D warnings @@ -80,7 +77,6 @@ jobs: uses: houseabsolute/actions-rust-cross@v1 with: command: clippy - cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 toolchain: ${{ matrix.rust }} target: ${{ matrix.target }} args: --locked --all --all-targets --tests --examples -- -D warnings @@ -89,7 +85,6 @@ jobs: uses: houseabsolute/actions-rust-cross@v1 with: command: clippy - cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 toolchain: ${{ matrix.rust }} target: ${{ matrix.target }} args: --locked --all --all-targets --tests --examples --features "guest_debug" -- -D warnings @@ -98,7 +93,6 @@ jobs: uses: houseabsolute/actions-rust-cross@v1 with: command: clippy - cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 toolchain: ${{ matrix.rust }} target: ${{ matrix.target }} args: --locked --all --all-targets --tests --examples --features "pvmemcontrol" -- -D warnings @@ -107,7 +101,6 @@ jobs: uses: houseabsolute/actions-rust-cross@v1 with: command: clippy - cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 toolchain: ${{ matrix.rust }} target: ${{ matrix.target }} args: --locked --all --all-targets --tests --examples --features "tracing" -- -D warnings @@ -122,7 +115,6 @@ jobs: uses: houseabsolute/actions-rust-cross@v1 with: command: clippy - cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 toolchain: ${{ matrix.rust }} target: ${{ matrix.target }} args: --locked --all 
--all-targets --tests --examples --features "ivshmem" -- -D warnings @@ -132,7 +124,6 @@ jobs: uses: houseabsolute/actions-rust-cross@v1 with: command: clippy - cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 toolchain: ${{ matrix.rust }} target: ${{ matrix.target }} args: --locked --all --all-targets --no-default-features --tests --examples --features "sev_snp" -- -D warnings @@ -142,7 +133,6 @@ jobs: uses: houseabsolute/actions-rust-cross@v1 with: command: clippy - cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 toolchain: ${{ matrix.rust }} target: ${{ matrix.target }} args: --locked --all --all-targets --no-default-features --tests --examples --features "igvm" -- -D warnings @@ -152,11 +142,30 @@ jobs: uses: houseabsolute/actions-rust-cross@v1 with: command: clippy - cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 toolchain: ${{ matrix.rust }} target: ${{ matrix.target }} args: --locked --all --all-targets --no-default-features --tests --examples --features "tdx,kvm" -- -D warnings + - name: Clippy (kvm + igvm + sev_snp + fw_cfg) + if: ${{ matrix.target == 'x86_64-unknown-linux-gnu' }} + uses: houseabsolute/actions-rust-cross@v1 + with: + command: clippy + cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 + toolchain: ${{ matrix.rust }} + target: ${{ matrix.target }} + args: --locked --all --all-targets --no-default-features --tests --examples --features "kvm,igvm,sev_snp,fw_cfg" -- -D warnings + + - name: Clippy (default features + sev_snp + igvm + fw_cfg) + if: ${{ matrix.target == 'x86_64-unknown-linux-gnu' }} + uses: houseabsolute/actions-rust-cross@v1 + with: + command: clippy + cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 + toolchain: ${{ matrix.rust }} + target: ${{ matrix.target }} + args: --locked --all --all-targets --tests --examples --features "sev_snp,igvm,fw_cfg" -- -D warnings + - name: Check build did not modify any files run: test -z "$(git status --porcelain)" @@ -167,4 +176,4 @@ jobs: steps: - uses: 
actions/checkout@v6 # Executes "typos ." - - uses: crate-ci/typos@v1.43.5 + - uses: crate-ci/typos@v1.45.1 diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 6a96de491b..bc7c3e152e 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -54,7 +54,7 @@ jobs: cp target/${{ matrix.platform.target }}/release/ch-remote ./${{ matrix.platform.name_ch_remote }} - name: Upload Release Artifacts if: github.event_name == 'create' && github.event.ref_type == 'tag' - uses: actions/upload-artifact@v6 + uses: actions/upload-artifact@v7 with: name: Artifacts for ${{ matrix.platform.target }} path: | @@ -80,13 +80,13 @@ jobs: github.event_name == 'create' && github.event.ref_type == 'tag' && matrix.platform.target == 'x86_64-unknown-linux-gnu' id: upload-release-cloud-hypervisor-vendored-sources - uses: actions/upload-artifact@v6 + uses: actions/upload-artifact@v7 with: path: cloud-hypervisor-${{ github.event.ref }}.tar.xz name: cloud-hypervisor-${{ github.event.ref }}.tar.xz - name: Create GitHub Release if: github.event_name == 'create' && github.event.ref_type == 'tag' - uses: softprops/action-gh-release@v2 + uses: softprops/action-gh-release@v3 with: draft: true files: | diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 157098201c..943f53c39d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -11,17 +11,36 @@ license of those projects. New code should be under the [Apache v2 License](https://opensource.org/licenses/Apache-2.0). -## Coding Style +## Coding Style & Code Comments -We follow the [Rust Style](https://github.com/rust-lang/rust/tree/HEAD/src/doc/style-guide/src) -convention and enforce it through the Continuous Integration (CI) process calling into `rustfmt`, -`clippy`, and other well-known code quality tool of the ecosystem for each submitted Pull Request (PR). 
+We use the [Rust Style] guide and enforce formatting and linting in CI, +including `rustfmt`, `clippy`, and other common Rust quality checks, for every +pull request. We adapt to best practices, new lints and new tooling as the +ecosystem evolves. + +Code should **speak for itself** (for example, by using descriptive identifiers) +and be **easy to read and maintain**. Beyond the conventions and tooling +described above, contributors have _some_ room to apply their own style and +preferred structure. Maintainers may still suggest refactorings where they +believe readability, consistency, or maintainability can be improved. + +For new code, add documentation and comments where they **provide additional value**: + +* **Rustdoc** explains the API to its users. +* **Inline comments** explain the code to the reader, especially *why* it is + written that way. +* **Commit messages** explain the broader context of a change (for more + information on commit messages, see below). + +Comments should be concise and add additional context or information to the code. + +[Rust Style]: https://github.com/rust-lang/rust/tree/HEAD/src/doc/style-guide/src ## Basic Checks ```sh # We currently rely on nightly-only formatting features -cargo +nightly fmt --all +cargo +nightly fmt --all cargo check --all-targets --tests cargo clippy --all-targets --tests # Please note that this will not execute integration tests. @@ -36,7 +55,7 @@ gitlint --commits "HEAD~3..HEAD" _Caution: These tests are taking a long time to complete (40+ mins) and need special setup._ ```sh - bash ./scripts/dev_cli.sh tests --integration -- --test-filter '' + bash ./scripts/dev_cli.sh tests --integration -- --test-filter '' ``` ### Setup Commit Hook @@ -58,42 +77,65 @@ commit you make. 
## Certificate of Origin -In order to get a clear contribution chain of trust we use the [signed-off-by language](https://web.archive.org/web/20230406041855/https://01.org/community/signed-process) +In order to get a clear contribution chain of trust we use the [signed-off-by language](https://www.kernel.org/doc/Documentation/process/submitting-patches.rst) used by the Linux kernel project. -## Patch format +## Patch format & Git Commit Hygiene -Beside the signed-off-by footer, we expect each patch to comply with the following format: +_We use **Patch** as synonym for **Commit**._ -``` -: Change summary +We require patches to: -More detailed explanation of your changes: Why and how. -Wrap it to 72 characters. -See http://chris.beams.io/posts/git-commit/ -for some more good pieces of advice. +- Have a `Signed-off-by: Name ` footer +- Follow the pattern: \ + ``` + : Change summary -Signed-off-by: -``` + More detailed explanation of your changes: Why and how. + Wrap it to 72 characters. + See http://chris.beams.io/posts/git-commit/ + for some more good pieces of advice. + + Signed-off-by: + ``` -For example: + +Valid components are listed in `TitleStartsWithComponent.py`. In short, each +cargo workspace member is a valid component as well as `build`, `ci`, `docs` and +`misc`. + +Example patch: ``` vm-virtio: Reset underlying device on driver request - + If the driver triggers a reset by writing zero into the status register then reset the underlying device if supported. A device reset also requires resetting various aspects of the queue. - + In order to be able to do a subsequent reactivate it is required to reclaim certain resources (interrupt and queue EventFDs.) If a device reset is requested by the driver but the underlying device does not support it then generate an error as the driver would not be able to configure it anyway. - + Signed-off-by: Rob Bradford ``` +### Git Commit History + +We value a clean, **reviewable** commit history. 
Each commit should represent +a self-contained, logical step that guides reviewers clearly from A to B. + +Avoid patterns like `init A -> init B -> fix A` or \ +`init design A -> revert A -> use design B`. Commits must be independently +reviewable - don't leave "fix previous commit" or earlier design attempts in +the history. + +Intermediate work-in-progress changes are acceptable only if a subsequent +commit in the same series cleans them up (e.g. a temporary `#[allow(unused)]` +removed in the next commit). + ## Pull requests Cloud Hypervisor uses the “fork-and-pull” development model. Follow these steps if @@ -104,10 +146,14 @@ you want to merge your changes to `cloud-hypervisor`: 1. Within your fork, create a branch for your contribution. 1. [Create a pull request](https://help.github.com/articles/creating-a-pull-request-from-a-fork/) against the main branch of the Cloud Hypervisor repository. -1. To update your pull request amend existing commits whenever applicable and - then push the new changes to your pull request branch. +1. Each commit must comply with the Commit Hygiene guidelines above. +1. A pull request should address a single component or concern to keep review + focused and approvals straightforward. 1. Once the pull request is approved it can be integrated. +Please squash any changes done during review already into the corresponding +commits instead of pushing `: addressing review for A`-style commits. + ## Issue tracking If you have a problem, please let us know. We recommend using @@ -123,26 +169,63 @@ comments or by adding the `Fixes` keyword to your commit message: ``` serial: Set terminal in raw mode - + In order to have proper output from the serial, we need to setup the terminal in raw mode. When the VM is shutting down, it is also the VMM responsibility to set the terminal back into canonical mode if we don't want to get any weird behavior from the shell. 
- + Fixes #88 - + Signed-off-by: Sebastien Boeuf ``` Then, after the corresponding PR is merged, GitHub will automatically close that issue when parsing the [commit message](https://help.github.com/articles/closing-issues-via-commit-messages/). -## AI Generated Code +## AI/LLM Assistance & Generated Code + +We recommend **a careful and conservative approach** to LLM usage, guided by +sound engineering judgment. Please use AI/LLM-assisted tooling thoughtfully and +responsibly to ensure efficient use of limited project resources, particularly +in code review and long-term maintenance. Our primary goals are to avoid +ambiguity in license compliance and to keep contributions clear and easy to +review. + +Or in other words: please apply common sense and don't blindly accept LLM +suggestions. + +This policy can be revisited as LLMs evolve and mature. + +### Code Review + +We generally recommend doing early coarse-grained reviews using state-of-the-art +LLMs. This can help identify rough edges, copy & paste errors, and typos early +on. This reduces review cycles for human reviewers. + +Please **do not** use GitHub Copilot directly in PRs to keep discussions clean. +Instead, ask an LLM of your choice for a review. A convenient way to do this is + +- appending `.patch` to the GitHub PR URL + (e.g., `https://github.com/cloud-hypervisor/cloud-hypervisor/pull/1234.patch`) + and pasting it into the LLM of your choice, or +- using a local agent in your terminal, such as `codex` or `claude`. + +### Contributions assisted by LLMs + +All contributions **must** be submitted by a human contributor. Automated or +bot-driven PRs are not accepted. + +You are responsible for every piece of code you submit, and you must understand +both the design and the implementation details. LLMs are useful for prototyping +and generating boilerplate code. 
However, large or complex logic must be +authored and fully understood by the contributor - LLM output should not be +submitted without careful review and comprehension. -Our policy is to decline any contributions known to contain contents -generated or derived from using Large Language Models (LLMs). This -includes ChatGPT, Gemini, Claude, Copilot and similar tools. +Please disclose LLM use in your commit message and PR description if it +meaningfully contributed to the submitted code. Again, we recommend careful and +conservative use of LLMs, guided by common sense. -The goal is to avoid ambiguity in license compliance and optimize the -use of limited project resources, especially for code review and -maintenance. This policy can be revisited as LLMs evolve and mature. +Maintainers reserve the right to request additional clarification or decline +contributions where LLM usage raises concerns. Ultimately, acceptance of any +contribution is at the maintainers' discretion. diff --git a/Cargo.lock b/Cargo.lock index 12304950bd..7c03c08766 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -37,9 +37,9 @@ dependencies = [ [[package]] name = "anstream" -version = "0.6.21" +version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" +checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d" dependencies = [ "anstyle", "anstyle-parse", @@ -52,15 +52,15 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.13" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" +checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" [[package]] name = "anstyle-parse" -version = "0.2.7" +version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" +checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e" dependencies = [ "utf8parse", ] @@ -71,7 +71,7 @@ version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" dependencies = [ - "windows-sys 0.61.0", + "windows-sys", ] [[package]] @@ -82,28 +82,28 @@ checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys 0.61.0", + "windows-sys", ] [[package]] name = "anyhow" -version = "1.0.101" +version = "1.0.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f0e0fee31ef5ed1ba1316088939cea399010ed7731dba877ed44aeb407a75ea" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" [[package]] name = "api_client" version = "0.1.0" dependencies = [ - "thiserror 2.0.18", + "thiserror", "vmm-sys-util", ] [[package]] name = "arc-swap" -version = "1.8.2" +version = "1.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9f3647c145568cec02c42054e07bdf9a5a698e15b466fb2341bfc393cd24aa5" +checksum = "6a3a1fd6f75306b68087b831f025c712524bcb19aad54e557b1129cfa0a2b207" dependencies = [ "rustversion", ] @@ -120,7 +120,7 @@ dependencies = [ "linux-loader", "log", "serde", - "thiserror 2.0.18", + "thiserror", "uuid", "vm-fdt", "vm-memory", @@ -180,7 +180,7 @@ dependencies = [ "polling", "rustix", "slab", - "windows-sys 0.61.0", + "windows-sys", ] [[package]] @@ -238,7 +238,7 @@ dependencies = [ "rustix", "signal-hook-registry", "slab", - "windows-sys 0.61.0", + "windows-sys", ] [[package]] @@ -333,7 +333,7 @@ dependencies = [ "remain", "serde", "smallvec", - "thiserror 2.0.18", + "thiserror", "uuid", "virtio-bindings", "virtio-queue", @@ -358,9 +358,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.19.1" +version = 
"3.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5dd9dc738b7a8311c7ade152424974d8115f2cdad61e8dab8dac9f2362298510" +checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" [[package]] name = "byteorder" @@ -370,9 +370,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "cc" -version = "1.2.56" +version = "1.2.59" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aebf35691d1bfb0ac386a69bac2fde4dd276fb618cf8bf4f5318fe285e821bb2" +checksum = "b7a4d3ec6524d28a329fc53654bbadc9bdd7b0431f5d65f1a56ffb28a1ee5283" dependencies = [ "find-msvc-tools", "jobserver", @@ -394,23 +394,23 @@ checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601" dependencies = [ "cfg-if", "cpufeatures", - "rand_core 0.10.0", + "rand_core", ] [[package]] name = "clap" -version = "4.5.59" +version = "4.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5caf74d17c3aec5495110c34cc3f78644bfa89af6c8993ed4de2790e49b6499" +checksum = "b193af5b67834b676abd72466a96c1024e6a6ad978a1f484bd90b85c94041351" dependencies = [ "clap_builder", ] [[package]] name = "clap_builder" -version = "4.5.59" +version = "4.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "370daa45065b80218950227371916a1633217ae42b2715b2287b606dcd618e24" +checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" dependencies = [ "anstream", "anstyle", @@ -421,9 +421,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "1.0.0" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a822ea5bc7590f9d40f1ba12c0dc3c2760f3482c6984db1573ad11031420831" +checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" [[package]] name = "cloud-hypervisor" @@ -447,7 +447,7 @@ dependencies = [ "serde_json", "signal-hook", "test_infra", - "thiserror 
2.0.18", + "thiserror", "tpm", "tracer", "vm-memory", @@ -459,9 +459,9 @@ dependencies = [ [[package]] name = "colorchoice" -version = "1.0.4" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" +checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" [[package]] name = "concat-idents" @@ -517,9 +517,9 @@ checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" [[package]] name = "darling" -version = "0.21.3" +version = "0.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cdf337090841a411e2a7f3deb9187445851f91b309c0c0a29e05f74a00a48c0" +checksum = "25ae13da2f202d56bd7f91c25fba009e7717a1e4a1cc98a76d844b65ae912e9d" dependencies = [ "darling_core", "darling_macro", @@ -527,11 +527,10 @@ dependencies = [ [[package]] name = "darling_core" -version = "0.21.3" +version = "0.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1247195ecd7e3c85f83c8d2a366e4210d588e802133e1e355180a9870b517ea4" +checksum = "9865a50f7c335f53564bb694ef660825eb8610e0a53d3e11bf1b0d3df31e03b0" dependencies = [ - "fnv", "ident_case", "proc-macro2", "quote", @@ -541,9 +540,9 @@ dependencies = [ [[package]] name = "darling_macro" -version = "0.21.3" +version = "0.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d38308df82d1080de0afee5d069fa14b0326a88c14f15c5ccda35b4a6c414c81" +checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d" dependencies = [ "darling_core", "quote", @@ -574,7 +573,7 @@ dependencies = [ "num_enum", "pci", "serde", - "thiserror 2.0.18", + "thiserror", "tpm", "vm-allocator", "vm-device", @@ -618,7 +617,7 @@ dependencies = [ "libc", "option-ext", "redox_users", - "windows-sys 0.61.0", + "windows-sys", ] [[package]] @@ -656,9 +655,9 @@ dependencies = [ [[package]] name = "env_filter" -version = "1.0.0" 
+version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a1c3cc8e57274ec99de65301228b537f1e4eedc1b8e0f9411c6caac8ae7308f" +checksum = "32e90c2accc4b07a8456ea0debdc2e7587bdd890680d71173a15d4ae604f6eef" dependencies = [ "log", "regex", @@ -666,9 +665,9 @@ dependencies = [ [[package]] name = "env_logger" -version = "0.11.9" +version = "0.11.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2daee4ea451f429a58296525ddf28b45a3b64f1acf6587e2067437bb11e218d" +checksum = "0621c04f2196ac3f488dd583365b9c09be011a4ab8b9f37248ffcc8f6198b56a" dependencies = [ "anstream", "anstyle", @@ -700,7 +699,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.61.0", + "windows-sys", ] [[package]] @@ -741,7 +740,7 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" dependencies = [ - "getrandom 0.2.15", + "getrandom 0.2.17", ] [[package]] @@ -778,12 +777,6 @@ dependencies = [ "spin", ] -[[package]] -name = "fnv" -version = "1.0.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" - [[package]] name = "foldhash" version = "0.1.5" @@ -893,23 +886,23 @@ dependencies = [ [[package]] name = "gdbstub" -version = "0.7.9" +version = "0.7.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bf845b08f7c2ef3b5ad19f80779d43ae20d278652b91bb80adda65baf2d8ed6" +checksum = "5bafc7e33650ab9f05dcc16325f05d56b8d10393114e31a19a353b86fa60cfe7" dependencies = [ "bitflags 2.11.0", "cfg-if", "log", "managed", "num-traits", - "paste", + "pastey", ] [[package]] name = "gdbstub_arch" -version = "0.3.2" +version = "0.3.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "22dde0e1b68787036ccedd0b1ff6f953527a0e807e571fbe898975203027278f" +checksum = "6c02bfe7bd65f42bcda751456869dfa1eb2bd1c36e309b9ec27f4888d41cf258" dependencies = [ "gdbstub", "num-traits", @@ -917,9 +910,9 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.15" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" +checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" dependencies = [ "cfg-if", "js-sys", @@ -936,20 +929,20 @@ checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" dependencies = [ "cfg-if", "libc", - "r-efi", + "r-efi 5.3.0", "wasip2", ] [[package]] name = "getrandom" -version = "0.4.1" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "139ef39800118c7683f2fd3c98c1b23c09ae076556b435f8e9064ae108aaeeec" +checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" dependencies = [ "cfg-if", "libc", - "r-efi", - "rand_core 0.10.0", + "r-efi 6.0.0", + "rand_core", "wasip2", "wasip3", ] @@ -977,9 +970,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.16.1" +version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51" [[package]] name = "heck" @@ -1023,7 +1016,7 @@ dependencies = [ "serde", "serde_json", "serde_with", - "thiserror 2.0.18", + "thiserror", "vfio-ioctls", "vm-memory", "vmm-sys-util", @@ -1064,7 +1057,7 @@ dependencies = [ "open-enum", "range_map_vec", "static_assertions", - "thiserror 2.0.18", + "thiserror", "tracing", "zerocopy", ] @@ -1083,12 +1076,12 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.13.0" +version = "2.14.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017" +checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" dependencies = [ "equivalent", - "hashbrown 0.16.1", + "hashbrown 0.17.0", "serde", "serde_core", ] @@ -1104,6 +1097,23 @@ dependencies = [ "libc", ] +[[package]] +name = "iommufd-bindings" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd7de3a04f6fd55f171a6682852f7aa360bb848a85e0c610513349e006b3c139" + +[[package]] +name = "iommufd-ioctls" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4eabd3414d9c4e716c9a198fbfac484625f088c075605372daf037edfe336e18" +dependencies = [ + "iommufd-bindings", + "thiserror", + "vmm-sys-util", +] + [[package]] name = "ipnetwork" version = "0.20.0" @@ -1130,15 +1140,15 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.17" +version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" [[package]] name = "jiff" -version = "0.2.20" +version = "0.2.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c867c356cc096b33f4981825ab281ecba3db0acefe60329f044c1789d94c6543" +checksum = "1a3546dc96b6d42c5f24902af9e2538e82e39ad350b0c766eb3fbf2d8f3d8359" dependencies = [ "jiff-static", "log", @@ -1149,9 +1159,9 @@ dependencies = [ [[package]] name = "jiff-static" -version = "0.2.20" +version = "0.2.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7946b4325269738f270bb55b3c19ab5c5040525f83fd625259422a9d25d9be5" +checksum = "2a8c8b344124222efd714b73bb41f8b5120b27a7cc1c75593a6ff768d9d05aa4" dependencies = [ "proc-macro2", "quote", @@ -1170,9 +1180,9 @@ dependencies = [ [[package]] name = 
"js-sys" -version = "0.3.77" +version = "0.3.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f" +checksum = "b49715b7073f385ba4bc528e5747d02e66cb39c6146efb66b781f131f0fb399c" dependencies = [ "once_cell", "wasm-bindgen", @@ -1180,9 +1190,9 @@ dependencies = [ [[package]] name = "kvm-bindings" -version = "0.12.1" +version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a537873e15e8daabb416667e606d9b0abc2a8fb9a45bd5853b888ae0ead82f9" +checksum = "4b3c06ff73c7ce03e780887ec2389d62d2a2a9ddf471ab05c2ff69207cd3f3b4" dependencies = [ "serde", "vmm-sys-util", @@ -1191,9 +1201,9 @@ dependencies = [ [[package]] name = "kvm-ioctls" -version = "0.22.1" +version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c8f7370330b4f57981e300fa39b02088f2f2a5c2d0f1f994e8090589619c56d" +checksum = "333f77a20344a448f3f70664918135fddeb804e938f28a99d685bd92926e0b19" dependencies = [ "bitflags 2.11.0", "kvm-bindings", @@ -1209,7 +1219,7 @@ checksum = "49fefd6652c57d68aaa32544a4c0e642929725bdc1fd929367cdeb673ab81088" dependencies = [ "enumflags2", "libc", - "thiserror 2.0.18", + "thiserror", ] [[package]] @@ -1226,17 +1236,16 @@ checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" [[package]] name = "libc" -version = "0.2.182" +version = "0.2.184" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6800badb6cb2082ffd7b6a67e6125bb39f18782f793520caee8cb8846be06112" +checksum = "48f5d2a454e16a5ea0f4ced81bd44e4cfc7bd3a507b61887c99fd3538b28e4af" [[package]] name = "libredox" -version = "0.1.12" +version = "0.1.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d0b95e02c851351f877147b7deea7b1afb1df71b63aa5f8270716e0c5720616" +checksum = "7ddbf48fd451246b1f8c2610bd3b4ac0cc6e149d89832867093ab69a17194f08" dependencies = [ - "bitflags 2.11.0", 
"libc", ] @@ -1256,9 +1265,9 @@ dependencies = [ [[package]] name = "libz-sys" -version = "1.1.23" +version = "1.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15d118bbf3771060e7311cc7bb0545b01d08a8b4a7de949198dec1fa0ca1c0f7" +checksum = "fc3a226e576f50782b3305c5ccf458698f92798987f551c6a02efe8276721e22" dependencies = [ "cc", "libc", @@ -1268,18 +1277,18 @@ dependencies = [ [[package]] name = "linux-loader" -version = "0.13.1" +version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53802c0b111faf302a16fa20a2e3a33bd0eab408f60fc34cbfe052f6b153791e" +checksum = "de72cb02c55ecffcf75fe78295926f872eb6eb0a58d629c58a8c324dc26380f6" dependencies = [ "vm-memory", ] [[package]] name = "linux-raw-sys" -version = "0.11.0" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039" +checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" [[package]] name = "lock_api" @@ -1320,7 +1329,7 @@ dependencies = [ [[package]] name = "micro_http" version = "0.1.0" -source = "git+https://github.com/firecracker-microvm/micro-http?branch=main#3248ceeae41461d034624b582d5d358cd6e6f89f" +source = "git+https://github.com/firecracker-microvm/micro-http?branch=main#5c2254d6cf4f32a668d0d8e57ba20bebad9d4fba" dependencies = [ "libc", "vmm-sys-util", @@ -1344,9 +1353,9 @@ checksum = "c505b3e17ed6b70a7ed2e67fbb2c560ee327353556120d6e72f5232b6880d536" [[package]] name = "mshv-bindings" -version = "0.6.7" +version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3cbfd4f32d185152003679339751839da77c17e18fa8882a11051a236f841426" +checksum = "a94fc3871dd23738188e5bc76a1d1a5930ebcaf9308c560a7274aa62b1770594" dependencies = [ "libc", "num_enum", @@ -1358,20 +1367,13 @@ dependencies = [ [[package]] name = "mshv-ioctls" -version = "0.6.7" +version = "0.6.8" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "f035616abe1e4cbc026a1a8094ff8d3900f5063fe6608309098bc745926fdfd8" +checksum = "1339723fe3a26baf4041459de20ad923e89d312c3bb25dbf9f60738c22a47f5e" dependencies = [ "libc", "mshv-bindings", - "thiserror 2.0.18", - "vmm-sys-util", -] - -[[package]] -name = "net_gen" -version = "0.1.0" -dependencies = [ + "thiserror", "vmm-sys-util", ] @@ -1380,16 +1382,15 @@ name = "net_util" version = "0.1.0" dependencies = [ "epoll", - "getrandom 0.4.1", + "getrandom 0.4.2", "libc", "log", - "net_gen", "pnet", "pnet_datalink", "rate_limiter", "serde", "serde_json", - "thiserror 2.0.18", + "thiserror", "virtio-bindings", "virtio-queue", "vm-memory", @@ -1414,9 +1415,9 @@ dependencies = [ [[package]] name = "num_enum" -version = "0.7.5" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1207a7e20ad57b847bbddc6776b968420d38292bbfe2089accff5e19e82454c" +checksum = "5d0bca838442ec211fa11de3a8b0e0e8f3a4522575b5c4c06ed722e005036f26" dependencies = [ "num_enum_derive", "rustversion", @@ -1424,9 +1425,9 @@ dependencies = [ [[package]] name = "num_enum_derive" -version = "0.7.5" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff32365de1b6743cb203b710788263c44a03de03802daf96092f2da4fe6ba4d7" +checksum = "680998035259dcfcafe653688bf2aa6d3e2dc05e98be6ab46afb089dc84f1df8" dependencies = [ "proc-macro-crate", "proc-macro2", @@ -1445,9 +1446,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.21.3" +version = "1.21.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" +checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" [[package]] name = "once_cell_polyfill" @@ -1486,9 +1487,9 @@ dependencies = [ [[package]] name = "openssl-sys" -version = "0.9.111" +version = "0.9.112" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "82cab2d520aa75e3c58898289429321eb788c3106963d0dc886ec7a5f4adc321" +checksum = "57d55af3b3e226502be1526dfdba67ab0e9c96fc293004e79576b2b9edb0dbdb" dependencies = [ "cc", "libc", @@ -1507,7 +1508,7 @@ checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" name = "option_parser" version = "0.1.0" dependencies = [ - "thiserror 2.0.18", + "thiserror", ] [[package]] @@ -1550,10 +1551,10 @@ dependencies = [ ] [[package]] -name = "paste" -version = "1.0.15" +name = "pastey" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" +checksum = "b867cad97c0791bbd3aaa6472142568c6c9e8f71937e98379f584cfb0cf35bec" [[package]] name = "pci" @@ -1565,7 +1566,7 @@ dependencies = [ "libc", "log", "serde", - "thiserror 2.0.18", + "thiserror", "vfio-bindings", "vfio-ioctls", "vfio_user", @@ -1580,25 +1581,28 @@ dependencies = [ name = "performance-metrics" version = "0.1.0" dependencies = [ + "block", "clap", "dirs", + "libc", "serde", "serde_json", "test_infra", - "thiserror 2.0.18", + "thiserror", + "vmm-sys-util", ] [[package]] name = "pin-project-lite" -version = "0.2.16" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" +checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" [[package]] name = "piper" -version = "0.2.4" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96c8c490f422ef9a4efd2cb5b42b76c8613d7e7dfc1caf667b8a3350a5acc066" +checksum = "c835479a4443ded371d6c535cbfd8d31ad92c5d23ae9770a61bc155e4992a3c1" dependencies = [ "atomic-waker", "fastrand", @@ -1713,7 +1717,7 @@ dependencies = [ "hermit-abi", "pin-project-lite", "rustix", - "windows-sys 0.61.0", + "windows-sys", ] [[package]] @@ -1724,22 
+1728,13 @@ checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" [[package]] name = "portable-atomic-util" -version = "0.2.5" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a9db96d7fa8782dd8c15ce32ffe8680bbd1e978a43bf51a34d39483540495f5" +checksum = "091397be61a01d4be58e7841595bd4bfedb15f1cd54977d79b8271e94ed799a3" dependencies = [ "portable-atomic", ] -[[package]] -name = "ppv-lite86" -version = "0.2.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" -dependencies = [ - "zerocopy", -] - [[package]] name = "prettyplease" version = "0.2.37" @@ -1752,9 +1747,9 @@ dependencies = [ [[package]] name = "proc-macro-crate" -version = "3.4.0" +version = "3.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "219cb19e96be00ab2e37d6e299658a0cfa83e52429179969b0f0121b4ac46983" +checksum = "e67ba7e9b2b56446f1d419b1d807906278ffa1a658a8a5d8a39dcb1f5a78614f" dependencies = [ "toml_edit", ] @@ -1770,9 +1765,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.44" +version = "1.0.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21b2ebcf727b7760c461f091f9f0f539b77b8e87f2fd88131e7f1b433b3cece4" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" dependencies = [ "proc-macro2", ] @@ -1784,14 +1779,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" [[package]] -name = "rand" -version = "0.9.2" +name = "r-efi" +version = "6.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" -dependencies = [ - "rand_chacha", - "rand_core 0.9.5", -] +checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" [[package]] name 
= "rand" @@ -1800,27 +1791,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bc266eb313df6c5c09c1c7b1fbe2510961e5bcd3add930c1e31f7ed9da0feff8" dependencies = [ "chacha20", - "getrandom 0.4.1", - "rand_core 0.10.0", -] - -[[package]] -name = "rand_chacha" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" -dependencies = [ - "ppv-lite86", - "rand_core 0.9.5", -] - -[[package]] -name = "rand_core" -version = "0.9.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" -dependencies = [ - "getrandom 0.3.4", + "getrandom 0.4.2", + "rand_core", ] [[package]] @@ -1842,7 +1814,7 @@ dependencies = [ "epoll", "libc", "log", - "thiserror 2.0.18", + "thiserror", "vmm-sys-util", ] @@ -1861,9 +1833,9 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a4e608c6638b9c18977b00b475ac1f28d14e84b27d8d42f70e0bf1e3dec127ac" dependencies = [ - "getrandom 0.2.15", + "getrandom 0.2.17", "libredox", - "thiserror 2.0.18", + "thiserror", ] [[package]] @@ -1891,9 +1863,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.8.9" +version = "0.8.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a96887878f22d7bad8a3b6dc5b7440e0ada9a245242924394987b21cf2210a4c" +checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" [[package]] name = "remain" @@ -1920,15 +1892,15 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" [[package]] name = "rustix" -version = "1.1.3" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "146c9e247ccc180c1f61615433868c99f3de3ae256a30a43b49f67c2d9171f34" +checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" dependencies = [ 
"bitflags 2.11.0", "errno", "libc", "linux-raw-sys", - "windows-sys 0.61.0", + "windows-sys", ] [[package]] @@ -1954,9 +1926,9 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.27" +version = "1.0.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" +checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd" [[package]] name = "serde" @@ -2014,9 +1986,9 @@ dependencies = [ [[package]] name = "serde_with" -version = "3.16.1" +version = "3.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fa237f2807440d238e0364a218270b98f767a00d3dada77b1c53ae88940e2e7" +checksum = "dd5414fad8e6907dbdd5bc441a50ae8d6e26151a03b1de04d89a5576de61d01f" dependencies = [ "serde_core", "serde_with_macros", @@ -2024,9 +1996,9 @@ dependencies = [ [[package]] name = "serde_with_macros" -version = "3.16.1" +version = "3.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52a8e3ca0ca629121f70ab50f95249e5a6f925cc0f6ffe8256c45b728875706c" +checksum = "d3db8978e608f1fe7357e211969fd9abdcae80bac1ba7a3369bb7eb6b404eb65" dependencies = [ "darling", "proc-macro2", @@ -2046,9 +2018,9 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "signal-hook" -version = "0.4.3" +version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b57709da74f9ff9f4a27dce9526eec25ca8407c45a7887243b031a58935fb8e" +checksum = "b2a0c28ca5908dbdbcd52e6fdaa00358ab88637f8ab33e1f188dd510eb44b53d" dependencies = [ "libc", "signal-hook-registry", @@ -2066,9 +2038,9 @@ dependencies = [ [[package]] name = "simd-adler32" -version = "0.3.8" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2" +checksum = 
"703d5c7ef118737c72f1af64ad2f6f8c5e1921f818cdcb97b8fe6fc69bf66214" [[package]] name = "slab" @@ -2117,9 +2089,9 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] name = "syn" -version = "2.0.116" +version = "2.0.117" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3df424c70518695237746f84cede799c9c58fcb37450d7b23716568cc8bc69cb" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" dependencies = [ "proc-macro2", "quote", @@ -2128,25 +2100,25 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.25.0" +version = "3.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0136791f7c95b1f6dd99f9cc786b91bb81c3800b639b3478e561ddb7be95e5f1" +checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" dependencies = [ "fastrand", - "getrandom 0.4.1", + "getrandom 0.4.2", "once_cell", "rustix", - "windows-sys 0.61.0", + "windows-sys", ] [[package]] name = "terminal_size" -version = "0.4.3" +version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60b8cb979cb11c32ce1603f8137b22262a9d131aaa5c37b5678025f22b8becd0" +checksum = "230a1b821ccbd75b185820a1f1ff7b14d21da1e442e22c0863ea5f08771a8874" dependencies = [ "rustix", - "windows-sys 0.60.2", + "windows-sys", ] [[package]] @@ -2156,41 +2128,21 @@ dependencies = [ "dirs", "epoll", "libc", - "rand 0.10.0", + "rand", "serde_json", "ssh2", - "thiserror 2.0.18", + "thiserror", "vmm-sys-util", "wait-timeout", ] -[[package]] -name = "thiserror" -version = "1.0.62" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2675633b1499176c2dff06b0856a27976a8f9d436737b4cf4f312d4d91d8bbb" -dependencies = [ - "thiserror-impl 1.0.62", -] - [[package]] name = "thiserror" version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" 
dependencies = [ - "thiserror-impl 2.0.18", -] - -[[package]] -name = "thiserror-impl" -version = "1.0.62" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d20468752b09f49e909e55a5d338caa8bedf615594e9d80bc4c565d30faf798c" -dependencies = [ - "proc-macro2", - "quote", - "syn", + "thiserror-impl", ] [[package]] @@ -2212,32 +2164,32 @@ checksum = "3bf63baf9f5039dadc247375c29eb13706706cfde997d0330d05aa63a77d8820" [[package]] name = "toml_datetime" -version = "0.7.5+spec-1.1.0" +version = "1.1.1+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92e1cfed4a3038bc5a127e35a2d360f145e1f4b971b551a2ba5fd7aedf7e1347" +checksum = "3165f65f62e28e0115a00b2ebdd37eb6f3b641855f9d636d3cd4103767159ad7" dependencies = [ "serde_core", ] [[package]] name = "toml_edit" -version = "0.23.10+spec-1.0.0" +version = "0.25.11+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84c8b9f757e028cee9fa244aea147aab2a9ec09d5325a9b01e0a49730c2b5269" +checksum = "0b59c4d22ed448339746c59b905d24568fcbb3ab65a500494f7b8c3e97739f2b" dependencies = [ "indexmap", "toml_datetime", "toml_parser", - "winnow", + "winnow 1.0.0", ] [[package]] name = "toml_parser" -version = "1.0.9+spec-1.1.0" +version = "1.1.2+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "702d4415e08923e7e1ef96cd5727c0dfed80b4d2fa25db9647fe5eb6f7c5a4c4" +checksum = "a2abe9b86193656635d2411dc43050282ca48aa31c2451210f4202550afb7526" dependencies = [ - "winnow", + "winnow 1.0.0", ] [[package]] @@ -2247,8 +2199,7 @@ dependencies = [ "anyhow", "libc", "log", - "net_gen", - "thiserror 2.0.18", + "thiserror", "vmm-sys-util", ] @@ -2295,13 +2246,13 @@ dependencies = [ [[package]] name = "uds_windows" -version = "1.1.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89daebc3e6fd160ac4aa9fc8b3bf71e1f74fbf92367ae71fb83a037e8bf164b9" +checksum = 
"f2f6fb2847f6742cd76af783a2a2c49e9375d0a111c7bef6f71cd9e738c72d6e" dependencies = [ "memoffset", "tempfile", - "winapi", + "windows-sys", ] [[package]] @@ -2324,13 +2275,13 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.21.0" +version = "1.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b672338555252d43fd2240c714dc444b8c6fb0a5c5335e65a07bba7742735ddb" +checksum = "5ac8b6f42ead25368cf5b098aeb3dc8a1a2c05a3eee8a9a1a68c640edbfc79d9" dependencies = [ - "getrandom 0.4.1", + "getrandom 0.4.2", "js-sys", - "rand 0.9.2", + "rand", "serde_core", "wasm-bindgen", ] @@ -2343,27 +2294,29 @@ checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" [[package]] name = "vfio-bindings" -version = "0.6.0" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "698c66a4522a31ab407a410a59c9660da036178e4fe3f371825cd6aad7d46837" +checksum = "188dac3057a0cbc94470085204c84b82ff7ec5dac629a514323cd133d1f9abe0" dependencies = [ "vmm-sys-util", ] [[package]] name = "vfio-ioctls" -version = "0.5.1" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7af7e8d49719333e5eb52209417f26695c9ab2b117a82596a63a44947f97c5d6" +checksum = "d4b1d98dff7f0d219278e406323e7eda4d426447bd203c7828189baf0d8c07b7" dependencies = [ "byteorder", + "iommufd-bindings", + "iommufd-ioctls", "kvm-bindings", "kvm-ioctls", "libc", "log", "mshv-bindings", "mshv-ioctls", - "thiserror 2.0.18", + "thiserror", "vfio-bindings", "vm-memory", "vmm-sys-util", @@ -2371,9 +2324,9 @@ dependencies = [ [[package]] name = "vfio_user" -version = "0.1.1" +version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8db5bc783aad75202ad4cbcdc5e893cff1dd8fa24a1bcdb4de8998d3c4d169a" +checksum = "731c2582dd43f4f174ab47b4c933a1a9bb872d9d1b7f54c5867e12dbc1491b75" dependencies = [ "bitflags 2.11.0", 
"libc", @@ -2381,7 +2334,7 @@ dependencies = [ "serde", "serde_derive", "serde_json", - "thiserror 2.0.18", + "thiserror", "vfio-bindings", "vm-memory", "vmm-sys-util", @@ -2389,9 +2342,9 @@ dependencies = [ [[package]] name = "vhost" -version = "0.14.0" +version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a4dcad85a129d97d5d4b2f3c47a4affdeedd76bdcd02094bcb5d9b76cac2d05" +checksum = "ee90657203a8644e9a0860a0db6a7887d8ef0c7bc09fc22dfa4ae75df65bac86" dependencies = [ "bitflags 2.11.0", "libc", @@ -2402,9 +2355,9 @@ dependencies = [ [[package]] name = "vhost-user-backend" -version = "0.20.0" +version = "0.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e183205a9ba7cb9c47fcb0fc0a07fc295a110efbb11ab78ad0d793b0a38a7bde" +checksum = "d5925983d8fb537752ad3e26604c0a17abfa5de77cb6773a096c8a959c9eca0f" dependencies = [ "libc", "log", @@ -2425,7 +2378,7 @@ dependencies = [ "libc", "log", "option_parser", - "thiserror 2.0.18", + "thiserror", "vhost", "vhost-user-backend", "virtio-bindings", @@ -2445,7 +2398,7 @@ dependencies = [ "log", "net_util", "option_parser", - "thiserror 2.0.18", + "thiserror", "vhost", "vhost-user-backend", "virtio-bindings", @@ -2455,9 +2408,9 @@ dependencies = [ [[package]] name = "virtio-bindings" -version = "0.2.6" +version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "804f498a26d5a63be7bbb8bdcd3869c3f286c4c4a17108905276454da0caf8cb" +checksum = "091f1f09cfbf2a78563b562e7a949465cce1aef63b6065645188d995162f8868" [[package]] name = "virtio-devices" @@ -2468,6 +2421,7 @@ dependencies = [ "byteorder", "epoll", "event_monitor", + "hypervisor", "libc", "log", "mshv-ioctls", @@ -2478,7 +2432,7 @@ dependencies = [ "serde", "serde_with", "serial_buffer", - "thiserror 2.0.18", + "thiserror", "vhost", "virtio-bindings", "virtio-queue", @@ -2492,10 +2446,11 @@ dependencies = [ [[package]] name = "virtio-queue" -version = "0.16.0" +version = 
"0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb0479158f863e59323771a1f684d843962f76960b86fecfec2bfa9c8f0f9180" +checksum = "e358084f32ed165fddb41d98ff1b7ff3c08b9611d8d6114a1b422e2e85688baf" dependencies = [ + "libc", "log", "virtio-bindings", "vm-memory", @@ -2517,7 +2472,7 @@ version = "0.1.0" dependencies = [ "hypervisor", "serde", - "thiserror 2.0.18", + "thiserror", "vfio-ioctls", "vm-memory", "vmm-sys-util", @@ -2531,13 +2486,13 @@ checksum = "7e21282841a059bb62627ce8441c491f09603622cd5a21c43bfedc85a2952f23" [[package]] name = "vm-memory" -version = "0.16.2" +version = "0.17.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fd5e56d48353c5f54ef50bd158a0452fc82f5383da840f7b8efc31695dd3b9d" +checksum = "f39348a049689cabd3377cdd9182bf526ec76a6f823b79903896452e9d7a7380" dependencies = [ "arc-swap", "libc", - "thiserror 1.0.62", + "thiserror", "winapi", ] @@ -2549,7 +2504,7 @@ dependencies = [ "itertools", "serde", "serde_json", - "thiserror 2.0.18", + "thiserror", "vm-memory", ] @@ -2585,6 +2540,7 @@ dependencies = [ "hypervisor", "igvm", "igvm_defs", + "iommufd-ioctls", "landlock", "libc", "linux-loader", @@ -2601,7 +2557,7 @@ dependencies = [ "serde_json", "serial_buffer", "signal-hook", - "thiserror 2.0.18", + "thiserror", "tracer", "uuid", "vfio-ioctls", @@ -2621,9 +2577,9 @@ dependencies = [ [[package]] name = "vmm-sys-util" -version = "0.14.0" +version = "0.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d21f366bf22bfba3e868349978766a965cbe628c323d58e026be80b8357ab789" +checksum = "506c62fdf617a5176827c2f9afbcf1be155b03a9b4bf9617a60dbc07e3a1642f" dependencies = [ "bitflags 1.3.2", "libc", @@ -2666,35 +2622,22 @@ dependencies = [ [[package]] name = "wasm-bindgen" -version = "0.2.100" +version = "0.2.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" 
+checksum = "6532f9a5c1ece3798cb1c2cfdba640b9b3ba884f5db45973a6f442510a87d38e" dependencies = [ "cfg-if", "once_cell", "rustversion", "wasm-bindgen-macro", -] - -[[package]] -name = "wasm-bindgen-backend" -version = "0.2.100" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" -dependencies = [ - "bumpalo", - "log", - "proc-macro2", - "quote", - "syn", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-macro" -version = "0.2.100" +version = "0.2.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" +checksum = "18a2d50fcf105fb33bb15f00e7a77b772945a2ee45dcf454961fd843e74c18e6" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -2702,22 +2645,22 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.100" +version = "0.2.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" +checksum = "03ce4caeaac547cdf713d280eda22a730824dd11e6b8c3ca9e42247b25c631e3" dependencies = [ + "bumpalo", "proc-macro2", "quote", "syn", - "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.100" +version = "0.2.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" +checksum = "75a326b8c223ee17883a4251907455a2431acc2791c98c26279376490c378c16" dependencies = [ "unicode-ident", ] @@ -2786,92 +2729,27 @@ checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" [[package]] name = "windows-sys" -version = "0.60.2" +version = "0.61.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" -dependencies = [ - "windows-targets", 
-] - -[[package]] -name = "windows-sys" -version = "0.61.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e201184e40b2ede64bc2ea34968b28e33622acdbbf37104f0e4a33f7abe657aa" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" dependencies = [ "windows-link", ] [[package]] -name = "windows-targets" -version = "0.53.5" +name = "winnow" +version = "0.7.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" +checksum = "df79d97927682d2fd8adb29682d1140b343be4ac0f08fd68b7765d9c059d3945" dependencies = [ - "windows-link", - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_gnullvm", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "memchr", ] -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" - -[[package]] -name = "windows_aarch64_msvc" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" - -[[package]] -name = "windows_i686_gnu" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3" - -[[package]] -name = "windows_i686_gnullvm" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" - -[[package]] -name = "windows_i686_msvc" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" - -[[package]] -name = "windows_x86_64_gnu" -version = 
"0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" - -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" - -[[package]] -name = "windows_x86_64_msvc" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" - [[package]] name = "winnow" -version = "0.7.14" +version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a5364e9d77fcdeeaa6062ced926ee3381faa2ee02d3eb83a5c27a8825540829" +checksum = "a90e88e4667264a994d34e6d1ab2d26d398dcdca8b7f52bec8668957517fc7d8" dependencies = [ "memchr", ] @@ -2966,9 +2844,9 @@ dependencies = [ [[package]] name = "zbus" -version = "5.13.2" +version = "5.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bfeff997a0aaa3eb20c4652baf788d2dfa6d2839a0ead0b3ff69ce2f9c4bdd1" +checksum = "ca82f95dbd3943a40a53cfded6c2d0a2ca26192011846a1810c4256ef92c60bc" dependencies = [ "async-broadcast", "async-executor", @@ -2992,8 +2870,8 @@ dependencies = [ "tracing", "uds_windows", "uuid", - "windows-sys 0.61.0", - "winnow", + "windows-sys", + "winnow 0.7.15", "zbus_macros", "zbus_names", "zvariant", @@ -3001,9 +2879,9 @@ dependencies = [ [[package]] name = "zbus_macros" -version = "5.13.2" +version = "5.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0bbd5a90dbe8feee5b13def448427ae314ccd26a49cac47905cafefb9ff846f1" +checksum = "897e79616e84aac4b2c46e9132a4f63b93105d54fe8c0e8f6bffc21fa8d49222" dependencies = [ "proc-macro-crate", "proc-macro2", @@ -3021,24 +2899,24 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"ffd8af6d5b78619bab301ff3c560a5bd22426150253db278f164d6cf3b72c50f" dependencies = [ "serde", - "winnow", + "winnow 0.7.15", "zvariant", ] [[package]] name = "zerocopy" -version = "0.8.39" +version = "0.8.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db6d35d663eadb6c932438e763b262fe1a70987f9ae936e60158176d710cae4a" +checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.39" +version = "0.8.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4122cd3169e94605190e77839c9a40d40ed048d305bfdc146e7df40ab0f3e517" +checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4" dependencies = [ "proc-macro2", "quote", @@ -3081,23 +2959,23 @@ dependencies = [ [[package]] name = "zvariant" -version = "5.9.2" +version = "5.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68b64ef4f40c7951337ddc7023dd03528a57a3ce3408ee9da5e948bd29b232c4" +checksum = "5708299b21903bbe348e94729f22c49c55d04720a004aa350f1f9c122fd2540b" dependencies = [ "endi", "enumflags2", "serde", - "winnow", + "winnow 0.7.15", "zvariant_derive", "zvariant_utils", ] [[package]] name = "zvariant_derive" -version = "5.9.2" +version = "5.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "484d5d975eb7afb52cc6b929c13d3719a20ad650fea4120e6310de3fc55e415c" +checksum = "5b59b012ebe9c46656f9cc08d8da8b4c726510aef12559da3e5f1bf72780752c" dependencies = [ "proc-macro-crate", "proc-macro2", @@ -3116,5 +2994,5 @@ dependencies = [ "quote", "serde", "syn", - "winnow", + "winnow 0.7.15", ] diff --git a/Cargo.toml b/Cargo.toml index 53537895d7..92a52f81a3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,7 +22,6 @@ members = [ "devices", "event_monitor", "hypervisor", - "net_gen", "net_util", "option_parser", "pci", @@ -41,27 +40,36 @@ members = [ "vmm", ] package.edition = 
"2024" +# Minimum buildable version: +# Keep in sync with version in .github/workflows/build.yaml +# Policy on MSRV (see #4318): +# Can only be bumped if satisfying any of the following: +# a.) A dependency requires it, +# b.) If we want to use a new feature and that MSRV is at least 6 months old, +# c.) There is a security issue that is addressed by the toolchain update. +package.rust-version = "1.89.0" resolver = "3" [workspace.dependencies] # rust-vmm crates acpi_tables = "0.2.0" -kvm-bindings = "0.12.1" -kvm-ioctls = "0.22.1" -linux-loader = "0.13.1" -mshv-bindings = "0.6.7" -mshv-ioctls = "0.6.7" +iommufd-ioctls = "0.1.0" +kvm-bindings = "0.14.0" +kvm-ioctls = "0.24.0" +linux-loader = "0.13.2" +mshv-bindings = "0.6.8" +mshv-ioctls = "0.6.8" seccompiler = "0.5.0" -vfio-bindings = { version = "0.6.0", default-features = false } -vfio-ioctls = { version = "0.5.1", default-features = false } -vfio_user = { version = "0.1.1", default-features = false } -vhost = { version = "0.14.0", default-features = false } -vhost-user-backend = { version = "0.20.0", default-features = false } +vfio-bindings = { version = "0.6.2", default-features = false } +vfio-ioctls = { version = "0.6.0", default-features = false } +vfio_user = { version = "0.1.3", default-features = false } +vhost = { version = "0.16.0", default-features = false } +vhost-user-backend = { version = "0.22.0", default-features = false } virtio-bindings = "0.2.6" -virtio-queue = "0.16.0" +virtio-queue = "0.17.0" vm-fdt = "0.3.0" -vm-memory = "0.16.1" -vmm-sys-util = "0.14.0" +vm-memory = "0.17.1" +vmm-sys-util = "0.15.0" # igvm crates igvm = "0.4.0" @@ -70,27 +78,27 @@ igvm_defs = "0.4.0" # serde crates serde = "1.0.228" serde_json = "1.0.149" -serde_with = { version = "3.16.1", default-features = false } +serde_with = { version = "3.18.0", default-features = false } # other crates -anyhow = "1.0.101" +anyhow = "1.0.102" bitflags = "2.11.0" byteorder = "1.5.0" cfg-if = "1.0.4" -clap = "4.5.59" +clap = "4.6.0" 
dhat = "0.3.3" dirs = "6.0.0" -env_logger = "0.11.8" +env_logger = "0.11.10" epoll = "4.4.0" flume = "0.12.0" itertools = "0.14.0" -libc = "0.2.182" +libc = "0.2.184" log = "0.4.29" -signal-hook = "0.4.3" +signal-hook = "0.4.4" thiserror = "2.0.18" -uuid = { version = "1.21.0" } +uuid = { version = "1.23.0" } wait-timeout = "0.2.1" -zerocopy = { version = "0.8.39", default-features = false } +zerocopy = { version = "0.8.48", default-features = false } [workspace.lints.clippy] # Any clippy lint (group) in alphabetical order: diff --git a/api_client/Cargo.toml b/api_client/Cargo.toml index 93a7836fcc..1ab0e5862e 100644 --- a/api_client/Cargo.toml +++ b/api_client/Cargo.toml @@ -3,6 +3,7 @@ authors = ["The Cloud Hypervisor Authors"] edition.workspace = true license = "Apache-2.0" name = "api_client" +rust-version.workspace = true version = "0.1.0" [dependencies] diff --git a/arch/Cargo.toml b/arch/Cargo.toml index 804be793d0..2e30b9e532 100644 --- a/arch/Cargo.toml +++ b/arch/Cargo.toml @@ -2,6 +2,7 @@ authors = ["The Chromium OS Authors"] edition.workspace = true name = "arch" +rust-version.workspace = true version = "0.1.0" [features] diff --git a/arch/src/aarch64/uefi.rs b/arch/src/aarch64/uefi.rs index bd40e36ff0..2ff3a8638f 100644 --- a/arch/src/aarch64/uefi.rs +++ b/arch/src/aarch64/uefi.rs @@ -7,7 +7,7 @@ use std::os::fd::AsFd; use std::result; use thiserror::Error; -use vm_memory::{GuestAddress, GuestMemory}; +use vm_memory::{Bytes, GuestAddress, GuestMemory}; /// Errors thrown while loading UEFI binary #[derive(Debug, Error)] diff --git a/arch/src/x86_64/mod.rs b/arch/src/x86_64/mod.rs index 979fd52a9b..09577b436c 100644 --- a/arch/src/x86_64/mod.rs +++ b/arch/src/x86_64/mod.rs @@ -641,17 +641,16 @@ pub fn generate_common_cpuid( // Update some existing CPUID for entry in cpuid.as_mut_slice().iter_mut() { + #[allow(unused_unsafe)] match entry.function { // Clear AMX related bits if the AMX feature is not enabled - 0x7 => { - if !config.amx { - if entry.index 
== 0 { - entry.edx &= !((1 << AMX_BF16) | (1 << AMX_TILE) | (1 << AMX_INT8)); - } - if entry.index == 1 { - entry.eax &= !(1 << AMX_FP16); - entry.edx &= !(1 << AMX_COMPLEX); - } + 0x7 if !config.amx => { + if entry.index == 0 { + entry.edx &= !((1 << AMX_BF16) | (1 << AMX_TILE) | (1 << AMX_INT8)); + } + if entry.index == 1 { + entry.eax &= !(1 << AMX_FP16); + entry.edx &= !(1 << AMX_COMPLEX); } } 0xd => @@ -673,55 +672,52 @@ pub fn generate_common_cpuid( } } } - 0x1d => { - // Tile Information (purely AMX related). - if !config.amx { - entry.eax = 0; - entry.ebx = 0; - entry.ecx = 0; - entry.edx = 0; - } + // Tile Information (purely AMX related). + 0x1d if !config.amx => { + entry.eax = 0; + entry.ebx = 0; + entry.ecx = 0; + entry.edx = 0; } - 0x1e => { - // TMUL information (purely AMX related) - if !config.amx { - entry.eax = 0; - entry.ebx = 0; - entry.ecx = 0; - entry.edx = 0; - } + // TMUL information (purely AMX related) + 0x1e if !config.amx => { + entry.eax = 0; + entry.ebx = 0; + entry.ecx = 0; + entry.edx = 0; } // Copy host L1 cache details if not populated by KVM - 0x8000_0005 => { - if entry.eax == 0 && entry.ebx == 0 && entry.ecx == 0 && entry.edx == 0 { - #[allow(unused_unsafe)] + 0x8000_0005 + if entry.eax == 0 + && entry.ebx == 0 + && entry.ecx == 0 + && entry.edx == 0 // SAFETY: cpuid called with valid leaves - if unsafe { std::arch::x86_64::__cpuid(0x8000_0000).eax } >= 0x8000_0005 { - // SAFETY: cpuid called with valid leaves - let leaf = unsafe { std::arch::x86_64::__cpuid(0x8000_0005) }; - entry.eax = leaf.eax; - entry.ebx = leaf.ebx; - entry.ecx = leaf.ecx; - entry.edx = leaf.edx; - } - } + && unsafe { std::arch::x86_64::__cpuid(0x8000_0000).eax } >= 0x8000_0005 => + { + // SAFETY: cpuid called with valid leaves + let leaf = unsafe { std::arch::x86_64::__cpuid(0x8000_0005) }; + entry.eax = leaf.eax; + entry.ebx = leaf.ebx; + entry.ecx = leaf.ecx; + entry.edx = leaf.edx; } // Copy host L2 cache details if not populated by KVM - 0x8000_0006 
=> { - if entry.eax == 0 && entry.ebx == 0 && entry.ecx == 0 && entry.edx == 0 { - #[allow(unused_unsafe)] + 0x8000_0006 + if entry.eax == 0 + && entry.ebx == 0 + && entry.ecx == 0 + && entry.edx == 0 // SAFETY: cpuid called with valid leaves - if unsafe { std::arch::x86_64::__cpuid(0x8000_0000).eax } >= 0x8000_0006 { - #[allow(unused_unsafe)] - // SAFETY: cpuid called with valid leaves - let leaf = unsafe { std::arch::x86_64::__cpuid(0x8000_0006) }; - entry.eax = leaf.eax; - entry.ebx = leaf.ebx; - entry.ecx = leaf.ecx; - entry.edx = leaf.edx; - } - } + && unsafe { std::arch::x86_64::__cpuid(0x8000_0000).eax } >= 0x8000_0006 => + { + // SAFETY: cpuid called with valid leaves + let leaf = unsafe { std::arch::x86_64::__cpuid(0x8000_0006) }; + entry.eax = leaf.eax; + entry.ebx = leaf.ebx; + entry.ecx = leaf.ecx; + entry.edx = leaf.edx; } // Set CPU physical bits 0x8000_0008 => { @@ -823,6 +819,7 @@ pub fn configure_vcpu( cpu_vendor: CpuVendor, topology: (u16, u16, u16, u16), nested: bool, + setup_registers: bool, ) -> super::Result<()> { let x2apic_id = get_x2apic_id(id, Some(topology)); @@ -841,11 +838,13 @@ pub fn configure_vcpu( entry.ebx &= 0xffffff; entry.ebx |= x2apic_id << 24; apic_id_patched = true; - if !nested { - // Disable nested virtualization for Intel - entry.ecx &= !(1 << VMX_ECX_BIT); + if matches!(cpu_vendor, CpuVendor::Intel) { + if !nested { + // Disable nested virtualization for Intel + entry.ecx &= !(1 << VMX_ECX_BIT); + } + break; } - break; } if entry.function == 0x8000_0001 { if !nested { @@ -894,17 +893,19 @@ pub fn configure_vcpu( regs::setup_msrs(vcpu).map_err(Error::MsrsConfiguration)?; if let Some((kernel_entry_point, guest_memory)) = boot_setup { - regs::setup_regs(vcpu, kernel_entry_point).map_err(Error::RegsConfiguration)?; + if setup_registers { + regs::setup_regs(vcpu, kernel_entry_point).map_err(Error::RegsConfiguration)?; + + // CPUs are required (by Intel sdm spec) to boot in x2apic mode if any + // of the apic IDs is larger than 
255. Experimentally, the Linux kernel + // does not recognize the last vCPU if x2apic is not enabled when + // there are 256 vCPUs in a flat hierarchy (i.e. max x2apic ID is 255), + // so we need to enable x2apic in this case as well. + let enable_x2_apic_mode = get_max_x2apic_id(topology) > MAX_SUPPORTED_CPUS_LEGACY; + regs::setup_sregs(&guest_memory.memory(), vcpu, enable_x2_apic_mode) + .map_err(Error::SregsConfiguration)?; + } regs::setup_fpu(vcpu).map_err(Error::FpuConfiguration)?; - - // CPUs are required (by Intel sdm spec) to boot in x2apic mode if any - // of the apic IDs is larger than 255. Experimentally, the Linux kernel - // does not recognize the last vCPU if x2apic is not enabled when - // there are 256 vCPUs in a flat hierarchy (i.e. max x2apic ID is 255), - // so we need to enable x2apic in this case as well. - let enable_x2_apic_mode = get_max_x2apic_id(topology) > MAX_SUPPORTED_CPUS_LEGACY; - regs::setup_sregs(&guest_memory.memory(), vcpu, enable_x2_apic_mode) - .map_err(Error::SregsConfiguration)?; } interrupts::set_lint(vcpu).map_err(|e| Error::LocalIntConfiguration(e.into()))?; Ok(()) diff --git a/block/Cargo.toml b/block/Cargo.toml index 70a731a731..ab62c2308c 100644 --- a/block/Cargo.toml +++ b/block/Cargo.toml @@ -2,6 +2,7 @@ authors = ["The Chromium OS Authors", "The Cloud Hypervisor Authors"] edition.workspace = true name = "block" +rust-version.workspace = true version = "0.1.0" [features] diff --git a/block/src/async_io.rs b/block/src/async_io.rs index a1e8fa3e46..2d8ea37878 100644 --- a/block/src/async_io.rs +++ b/block/src/async_io.rs @@ -8,7 +8,7 @@ use std::os::fd::{AsRawFd, OwnedFd, RawFd}; use thiserror::Error; use vmm_sys_util::eventfd::EventFd; -use crate::{BatchRequest, DiskTopology}; +use crate::{BatchRequest, DiskTopology, SECTOR_SIZE}; #[derive(Error, Debug)] pub enum DiskFileError { @@ -24,6 +24,8 @@ pub enum DiskFileError { /// Resize failed #[error("Resize failed")] ResizeError(#[source] std::io::Error), + #[error("Failed 
cloning disk file")] + Clone(#[source] std::io::Error), } pub type DiskFileResult = std::result::Result; @@ -145,4 +147,7 @@ pub trait AsyncIo: Send { fn submit_batch_requests(&mut self, _batch_request: &[BatchRequest]) -> AsyncIoResult<()> { Ok(()) } + fn alignment(&self) -> u64 { + SECTOR_SIZE + } } diff --git a/block/src/disk_file.rs b/block/src/disk_file.rs new file mode 100644 index 0000000000..fea7243abb --- /dev/null +++ b/block/src/disk_file.rs @@ -0,0 +1,239 @@ +// Copyright 2026 The Cloud Hypervisor Authors. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +//! Composable disk capability traits for the block crate. +//! +//! Small traits define individual capabilities: +//! +//! - [`DiskSize`] - reported capacity (logical size) +//! - [`PhysicalSize`] - host allocation size +//! - [`DiskFd`] - backing file descriptor access +//! - [`Geometry`] - sector/cluster geometry (default 512B) +//! - [`SparseCapable`] - sparse and zero flag support +//! - [`Resizable`] - online resize +//! +//! [`DiskFile`] is a supertrait that bundles the universal capabilities +//! (`DiskSize` + `Geometry`). [`FullDiskFile`] adds all optional +//! capabilities. [`AsyncDiskFile`] extends `DiskFile` with async I/O +//! construction for virtio queue workers. [`AsyncFullDiskFile`] +//! combines both axes. +//! +//! ```text +//! DiskFile: DiskSize + Geometry + Sync +//! / \ +//! FullDiskFile: AsyncDiskFile: +//! DiskFile + PhysicalSize + DiskFile + Unpin +//! DiskFd + SparseCapable + try_clone, new_async_io +//! Resizable +//! \ / +//! AsyncFullDiskFile: FullDiskFile + AsyncDiskFile +//! ``` +//! +//! Readonly accessors take `&self`. Only [`Resizable::resize`] requires +//! `&mut self`. Errors are returned as [`BlockResult`]. + +use std::fmt::Debug; +use std::io; + +use crate::async_io::{self, AsyncIo, BorrowedDiskFd}; +use crate::error::{BlockError, BlockErrorKind}; +use crate::{BlockResult, DiskTopology}; + +/// Reported capacity of a disk image. 
+pub trait DiskSize: Send + Debug { + /// Virtual size of the disk image in bytes (reported capacity). + fn logical_size(&self) -> BlockResult; +} + +/// Host allocation size of a file-backed disk image. +pub trait PhysicalSize: Send + Debug { + /// Actual bytes occupied on the host filesystem. + fn physical_size(&self) -> BlockResult; +} + +/// Backing file descriptor access for disk images backed by a file. +pub trait DiskFd: Send + Debug { + /// Borrows the underlying file descriptor. + fn fd(&self) -> BorrowedDiskFd<'_>; +} + +/// Sector and cluster geometry of a disk image. +/// +/// Default returns `DiskTopology::default()` (512B logical/physical). +pub trait Geometry: Send + Debug { + /// Returns the disk topology. + fn topology(&self) -> DiskTopology { + DiskTopology::default() + } +} + +/// Sparse and zero flag support for thin provisioned disk images. +pub trait SparseCapable: Send + Debug { + /// Indicates support for sparse operations (punch hole, write zeroes, discard). + fn supports_sparse_operations(&self) -> bool { + false + } + + /// Indicates support for a metadata level zero flag optimization in + /// virtio `VIRTIO_BLK_T_WRITE_ZEROES` requests. When true, the format + /// can mark regions as reading zeros via a metadata bit rather than + /// writing actual zero bytes to disk. + fn supports_zero_flag(&self) -> bool { + false + } +} + +/// Live disk resize support. +/// +/// Implementations may return an error if the backend does not +/// support resizing (e.g. fixed size formats). +pub trait Resizable: Send + Debug { + /// Resizes the disk image to the given size in bytes, if the backend supports it. + fn resize(&mut self, size: u64) -> BlockResult<()>; +} + +/// Supertrait bundling universal disk capabilities. +/// +/// Every disk format implements `DiskSize` and `Geometry`. +/// `Sync` is required so that `Arc` can be shared +/// across threads for concurrent readonly access. 
+pub trait DiskFile: DiskSize + Geometry + Sync {} + +/// Full capability disk file trait. +/// +/// Bundles all optional capabilities on top of [`DiskFile`]: +/// file descriptor access, physical size, sparse operations, and resize. +/// Used by consumers that need feature negotiation without async I/O +/// (e.g. vhost user block). +pub trait FullDiskFile: DiskFile + PhysicalSize + DiskFd + SparseCapable + Resizable {} + +/// Blanket implementation: any type implementing all constituent traits +/// automatically satisfies [`FullDiskFile`]. +impl FullDiskFile for T {} + +/// Extended disk file trait for virtio queue workers. +/// +/// Adds cloning and async I/O construction on top of [`DiskFile`]. +/// `Unpin` is required so trait objects can be moved freely. +pub trait AsyncDiskFile: DiskFile + Unpin { + /// Creates an independent handle for a queue worker. + /// + /// The clone shares internally reference counted state (e.g. + /// `Arc`) with the original, but owns its own file + /// descriptor and I/O completion resources. Each virtio queue + /// gets one clone so that workers can operate in parallel + /// without contending on I/O state. + /// + /// Returns `Box` (not `AsyncFullDiskFile`) + /// because clones only serve as data plane handles for queue + /// workers. The original remains the control plane for feature + /// negotiation and configuration. + fn try_clone(&self) -> BlockResult>; + + /// Constructs a per queue async I/O engine. + /// + /// # Arguments + /// + /// * `ring_depth` - maximum number of in flight I/O operations. + /// Callers typically pass the virtio queue size. Must be greater + /// than zero. Backends that do not use an async ring (e.g. sync + /// fallback implementations) may ignore this value. + fn new_async_io(&self, ring_depth: u32) -> BlockResult>; +} + +/// Full capability async disk file trait. +/// +/// Combines [`FullDiskFile`] (all optional capabilities) with +/// [`AsyncDiskFile`] (async I/O construction). 
This is the top level +/// trait for virtio block devices that need both feature negotiation +/// and async queue workers. +/// +/// The type narrowing on [`AsyncDiskFile::try_clone`] is intentional: +/// clones only serve as data plane handles for queue workers, while +/// the original `AsyncFullDiskFile` handle remains the control plane +/// for feature negotiation and configuration. +pub trait AsyncFullDiskFile: FullDiskFile + AsyncDiskFile {} + +/// Blanket implementation: any type implementing both [`FullDiskFile`] +/// and [`AsyncDiskFile`] automatically satisfies [`AsyncFullDiskFile`]. +impl AsyncFullDiskFile for T {} + +/// A disk backend that dispatches to either the existing [`async_io::DiskFile`] +/// trait or the next-generation [`AsyncFullDiskFile`] trait. +pub enum DiskBackend { + /// Existing disk file backend (raw, vhd, vhdx, etc.). + Legacy(Box), + /// Next-generation disk file backend (qcow2, and more formats as they migrate). + Next(Box), +} + +impl DiskBackend { + pub fn logical_size(&mut self) -> BlockResult { + match self { + Self::Legacy(d) => d + .logical_size() + .map_err(|e| BlockError::new(BlockErrorKind::Io, io::Error::other(e))), + Self::Next(d) => d.logical_size(), + } + } + + pub fn physical_size(&mut self) -> BlockResult { + match self { + Self::Legacy(d) => d + .physical_size() + .map_err(|e| BlockError::new(BlockErrorKind::Io, io::Error::other(e))), + Self::Next(d) => d.physical_size(), + } + } + + pub fn topology(&mut self) -> DiskTopology { + match self { + Self::Legacy(d) => d.topology(), + Self::Next(d) => d.topology(), + } + } + + pub fn supports_sparse_operations(&self) -> bool { + match self { + Self::Legacy(d) => d.supports_sparse_operations(), + Self::Next(d) => d.supports_sparse_operations(), + } + } + + pub fn supports_zero_flag(&self) -> bool { + match self { + Self::Legacy(d) => d.supports_zero_flag(), + Self::Next(d) => d.supports_zero_flag(), + } + } + + pub fn fd(&mut self) -> BorrowedDiskFd<'_> { + match self { + 
Self::Legacy(d) => d.fd(), + Self::Next(d) => d.fd(), + } + } + + pub fn new_async_io(&self, ring_depth: u32) -> BlockResult> { + match self { + Self::Legacy(d) => d + .new_async_io(ring_depth) + .map_err(|e| BlockError::new(BlockErrorKind::Io, io::Error::other(e))), + Self::Next(d) => d.new_async_io(ring_depth), + } + } + + pub fn resize(&mut self, new_size: u64) -> BlockResult<()> { + match self { + Self::Legacy(d) => d.resize(new_size).map_err(|e| match e { + async_io::DiskFileError::Unsupported => BlockError::new( + BlockErrorKind::UnsupportedFeature, + io::Error::other("resize not supported"), + ), + _ => BlockError::new(BlockErrorKind::Io, io::Error::other(e)), + }), + Self::Next(d) => d.resize(new_size), + } + } +} diff --git a/block/src/error.rs b/block/src/error.rs new file mode 100644 index 0000000000..645057005e --- /dev/null +++ b/block/src/error.rs @@ -0,0 +1,245 @@ +// Copyright 2025 The Cloud Hypervisor Authors. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +//! Unified error handling for the block crate. +//! +//! # Architecture +//! +//! ```text +//! BlockError -- single public error type +//! |-- BlockErrorKind -- small, stable, matchable classification +//! |-- ErrorContext -- optional diagnostic metadata (path, offset, op) +//! +-- source -- format-specific error (boxed) +//! |-- QcowError +//! |-- VhdError / RawError / ... +//! +-- io::Error / etc. +//! ``` + +use std::error::Error as StdError; +use std::fmt::{self, Display, Formatter}; +use std::io; +use std::path::PathBuf; + +/// Small, stable classification of block errors. +/// +/// Callers match on this for control flow. Adding new format specific +/// errors does not require new variants here. +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +#[non_exhaustive] +pub enum BlockErrorKind { + /// An underlying I/O operation failed. + Io, + /// The disk image format is structurally invalid. + InvalidFormat, + /// The disk image requires a feature that is not implemented. 
+ UnsupportedFeature, + /// The image is marked or detected as corrupt. + CorruptImage, + /// An address, offset, or index is outside the valid range. + OutOfBounds, + /// A file or required internal structure could not be found. + NotFound, + /// An internal counter or limit was exceeded. + Overflow, +} + +impl Display for BlockErrorKind { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + match self { + Self::Io => write!(f, "I/O error"), + Self::InvalidFormat => write!(f, "Invalid format"), + Self::UnsupportedFeature => write!(f, "Unsupported feature"), + Self::CorruptImage => write!(f, "Corrupt image"), + Self::OutOfBounds => write!(f, "Out of bounds"), + Self::NotFound => write!(f, "Not found"), + Self::Overflow => write!(f, "Overflow"), + } + } +} + +/// Classification of the operation that was in progress when an error occurred. +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +#[non_exhaustive] +pub enum ErrorOp { + /// Opening a disk image file. + Open, + /// Detecting the image format. + DetectImageType, + /// Duplicating a backing-file descriptor. + DupBackingFd, + /// Resizing a disk image. + Resize, +} + +impl Display for ErrorOp { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + match self { + Self::Open => write!(f, "open"), + Self::DetectImageType => write!(f, "detect_image_type"), + Self::DupBackingFd => write!(f, "dup_backing_fd"), + Self::Resize => write!(f, "resize"), + } + } +} + +/// Optional diagnostic context attached to a [`BlockError`]. 
+#[derive(Debug, Default, Clone)] +pub struct ErrorContext { + pub path: Option, + pub offset: Option, + pub op: Option, +} + +impl Display for ErrorContext { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + let mut first = true; + if let Some(path) = &self.path { + write!(f, "path={}", path.display())?; + first = false; + } + if let Some(offset) = self.offset { + if !first { + write!(f, " ")?; + } + write!(f, "offset={offset:#x}")?; + first = false; + } + if let Some(op) = self.op { + if !first { + write!(f, " ")?; + } + write!(f, "op={op}")?; + } + Ok(()) + } +} + +/// Unified error type for the block crate. +/// +/// Pairs a stable [`BlockErrorKind`] classification with an optional +/// boxed source error (format-specific) and optional [`ErrorContext`]. +/// +/// Display renders kind + context only; the underlying cause is +/// exposed via [`std::error::Error::source()`] for reporters that +/// walk the chain. +#[derive(Debug)] +pub struct BlockError { + kind: BlockErrorKind, + source: Option>, + ctx: Option, +} + +impl BlockError { + /// Create a new `BlockError` from a kind and a source error. + pub fn new(kind: BlockErrorKind, source: E) -> Self + where + E: StdError + Send + Sync + 'static, + { + Self { + kind, + source: Some(Box::new(source)), + ctx: None, + } + } + + /// Create a `BlockError` from just a kind, with no underlying cause. + pub fn from_kind(kind: BlockErrorKind) -> Self { + Self { + kind, + source: None, + ctx: None, + } + } + + /// Attach or replace the source error (builder-style). + pub fn with_source(mut self, source: E) -> Self + where + E: StdError + Send + Sync + 'static, + { + self.source = Some(Box::new(source)); + self + } + + /// Attach diagnostic context. + pub fn with_ctx(mut self, ctx: ErrorContext) -> Self { + self.ctx = Some(ctx); + self + } + + /// Replace the error classification (builder-style). 
+ pub fn with_kind(mut self, kind: BlockErrorKind) -> Self { + self.kind = kind; + self + } + + /// Shorthand: attach an operation name. + pub fn with_op(mut self, op: ErrorOp) -> Self { + self.ctx.get_or_insert_with(ErrorContext::default).op = Some(op); + self + } + + /// Shorthand: attach a file path. + pub fn with_path(mut self, path: impl Into) -> Self { + self.ctx.get_or_insert_with(ErrorContext::default).path = Some(path.into()); + self + } + + /// Shorthand: attach a byte offset. + pub fn with_offset(mut self, offset: u64) -> Self { + self.ctx.get_or_insert_with(ErrorContext::default).offset = Some(offset); + self + } + + /// The error classification. + pub fn kind(&self) -> BlockErrorKind { + self.kind + } + + /// The diagnostic context, if any. + pub fn context(&self) -> Option<&ErrorContext> { + self.ctx.as_ref() + } + + /// Access the underlying source error, if any. + pub fn source_ref(&self) -> Option<&(dyn StdError + Send + Sync + 'static)> { + self.source.as_deref() + } + + /// Try to downcast the source to a concrete type. + pub fn downcast_ref(&self) -> Option<&T> { + self.source.as_ref()?.downcast_ref::() + } + + /// Consume the error and return the boxed source, if any. + pub fn into_source(self) -> Option> { + self.source + } +} + +impl Display for BlockError { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.kind)?; + if let Some(ctx) = &self.ctx { + write!(f, " ({ctx})")?; + } + Ok(()) + } +} + +impl StdError for BlockError { + fn source(&self) -> Option<&(dyn StdError + 'static)> { + self.source + .as_ref() + .map(|e| e.as_ref() as &(dyn StdError + 'static)) + } +} + +/// Convenience: wrap an `io::Error` as `BlockErrorKind::Io`. 
+impl From for BlockError { + fn from(e: io::Error) -> Self { + Self::new(BlockErrorKind::Io, e) + } +} + +pub type BlockResult = Result; diff --git a/block/src/fcntl.rs b/block/src/fcntl.rs index a2a684f322..98084748cf 100644 --- a/block/src/fcntl.rs +++ b/block/src/fcntl.rs @@ -16,6 +16,7 @@ use std::fmt::Debug; use std::io; use std::os::fd::{AsRawFd, RawFd}; +use std::str::FromStr; use thiserror::Error; @@ -140,6 +141,37 @@ impl LockGranularity { } } +/// User-facing choice for the lock granularity. +/// +/// This allows external management software to create snapshots of the disk +/// image. Without a byte-range lock, some NFS implementations may treat the +/// entire file as exclusively locked and prevent such operations (e.g. NetApp). +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, serde::Deserialize, serde::Serialize)] +pub enum LockGranularityChoice { + /// Byte-range lock covering [0, size). + #[default] + ByteRange, + /// Whole-file lock (l_start=0, l_len=0) - original OFD whole-file lock behavior. + Full, +} + +/// Error returned when parsing a [`LockGranularityChoice`] from a string. +#[derive(Error, Debug)] +#[error("Invalid lock granularity value: {0}, expected 'byte-range' or 'full'")] +pub struct LockGranularityParseError(String); + +impl FromStr for LockGranularityChoice { + type Err = LockGranularityParseError; + + fn from_str(s: &str) -> Result { + match s { + "byte-range" => Ok(LockGranularityChoice::ByteRange), + "full" => Ok(LockGranularityChoice::Full), + _ => Err(LockGranularityParseError(s.to_owned())), + } + } +} + /// Returns a [`struct@libc::flock`] structure for the whole file. 
const fn get_flock(lock_type: LockType, granularity: LockGranularity) -> libc::flock { libc::flock { diff --git a/block/src/fixed_vhd_async.rs b/block/src/fixed_vhd_async.rs index 6e858f74a5..699fb2a494 100644 --- a/block/src/fixed_vhd_async.rs +++ b/block/src/fixed_vhd_async.rs @@ -7,43 +7,79 @@ use std::os::unix::io::{AsRawFd, RawFd}; use vmm_sys_util::eventfd::EventFd; -use crate::async_io::{ - AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFile, DiskFileError, DiskFileResult, -}; +use crate::async_io::{AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFileError}; +use crate::error::{BlockError, BlockErrorKind, BlockResult, ErrorOp}; use crate::fixed_vhd::FixedVhd; use crate::raw_async::RawFileAsync; -use crate::{BatchRequest, BlockBackend}; +use crate::{BatchRequest, BlockBackend, disk_file}; +#[derive(Debug)] pub struct FixedVhdDiskAsync(FixedVhd); impl FixedVhdDiskAsync { - pub fn new(file: File) -> std::io::Result { - Ok(Self(FixedVhd::new(file)?)) + pub fn new(file: File) -> BlockResult { + Ok(Self( + FixedVhd::new(file).map_err(|e| BlockError::from(e).with_op(ErrorOp::Open))?, + )) } } -impl DiskFile for FixedVhdDiskAsync { - fn logical_size(&mut self) -> DiskFileResult { +impl disk_file::DiskSize for FixedVhdDiskAsync { + fn logical_size(&self) -> BlockResult { Ok(self.0.logical_size().unwrap()) } +} + +impl disk_file::PhysicalSize for FixedVhdDiskAsync { + fn physical_size(&self) -> BlockResult { + self.0.physical_size().map_err(|e| match e { + crate::Error::GetFileMetadata(io) => { + BlockError::new(BlockErrorKind::Io, crate::Error::GetFileMetadata(io)) + } + _ => BlockError::new(BlockErrorKind::Io, e), + }) + } +} + +impl disk_file::DiskFd for FixedVhdDiskAsync { + fn fd(&self) -> BorrowedDiskFd<'_> { + BorrowedDiskFd::new(self.0.as_raw_fd()) + } +} - fn physical_size(&mut self) -> DiskFileResult { - Ok(self.0.physical_size().unwrap()) +impl disk_file::Geometry for FixedVhdDiskAsync {} + +impl disk_file::SparseCapable for 
FixedVhdDiskAsync {} + +impl disk_file::Resizable for FixedVhdDiskAsync { + fn resize(&mut self, _size: u64) -> BlockResult<()> { + Err(BlockError::new( + BlockErrorKind::UnsupportedFeature, + DiskFileError::ResizeError(std::io::Error::other("resize not supported for fixed VHD")), + ) + .with_op(ErrorOp::Resize)) } +} - fn new_async_io(&self, ring_depth: u32) -> DiskFileResult> { +impl disk_file::DiskFile for FixedVhdDiskAsync {} + +impl disk_file::AsyncDiskFile for FixedVhdDiskAsync { + fn try_clone(&self) -> BlockResult> { + Ok(Box::new(FixedVhdDiskAsync(self.0.clone()))) + } + + fn new_async_io(&self, ring_depth: u32) -> BlockResult> { Ok(Box::new( FixedVhdAsync::new( self.0.as_raw_fd(), ring_depth, self.0.logical_size().unwrap(), ) - .map_err(DiskFileError::NewAsyncIo)?, - ) as Box) - } - - fn fd(&mut self) -> BorrowedDiskFd<'_> { - BorrowedDiskFd::new(self.0.as_raw_fd()) + .map_err(|e| { + BlockError::new(BlockErrorKind::Io, DiskFileError::NewAsyncIo(e)) + .with_op(ErrorOp::Open) + })?, + )) } } diff --git a/block/src/fixed_vhd_sync.rs b/block/src/fixed_vhd_sync.rs index ecb5e83ad0..14685522b3 100644 --- a/block/src/fixed_vhd_sync.rs +++ b/block/src/fixed_vhd_sync.rs @@ -7,45 +7,74 @@ use std::os::unix::io::{AsRawFd, RawFd}; use vmm_sys_util::eventfd::EventFd; -use crate::BlockBackend; -use crate::async_io::{ - AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFile, DiskFileError, DiskFileResult, -}; +use crate::async_io::{AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFileError}; +use crate::error::{BlockError, BlockErrorKind, BlockResult, ErrorOp}; use crate::fixed_vhd::FixedVhd; use crate::raw_sync::RawFileSync; +use crate::{BlockBackend, disk_file}; +#[derive(Debug)] pub struct FixedVhdDiskSync(FixedVhd); impl FixedVhdDiskSync { - pub fn new(file: File) -> std::io::Result { - Ok(Self(FixedVhd::new(file)?)) + pub fn new(file: File) -> BlockResult { + Ok(Self( + FixedVhd::new(file).map_err(|e| BlockError::from(e).with_op(ErrorOp::Open))?, 
+ )) } } -impl DiskFile for FixedVhdDiskSync { - fn logical_size(&mut self) -> DiskFileResult { +impl disk_file::DiskSize for FixedVhdDiskSync { + fn logical_size(&self) -> BlockResult { Ok(self.0.logical_size().unwrap()) } +} - fn physical_size(&mut self) -> DiskFileResult { - self.0.physical_size().map_err(|e| { - let io_inner = match e { - crate::Error::GetFileMetadata(e) => e, - _ => unreachable!(), - }; - DiskFileError::Size(io_inner) +impl disk_file::PhysicalSize for FixedVhdDiskSync { + fn physical_size(&self) -> BlockResult { + self.0.physical_size().map_err(|e| match e { + crate::Error::GetFileMetadata(io) => { + BlockError::new(BlockErrorKind::Io, crate::Error::GetFileMetadata(io)) + } + _ => BlockError::new(BlockErrorKind::Io, e), }) } +} - fn new_async_io(&self, _ring_depth: u32) -> DiskFileResult> { - Ok(Box::new( - FixedVhdSync::new(self.0.as_raw_fd(), self.0.logical_size().unwrap()) - .map_err(DiskFileError::NewAsyncIo)?, - ) as Box) +impl disk_file::DiskFd for FixedVhdDiskSync { + fn fd(&self) -> BorrowedDiskFd<'_> { + BorrowedDiskFd::new(self.0.as_raw_fd()) } +} - fn fd(&mut self) -> BorrowedDiskFd<'_> { - BorrowedDiskFd::new(self.0.as_raw_fd()) +impl disk_file::Geometry for FixedVhdDiskSync {} + +impl disk_file::SparseCapable for FixedVhdDiskSync {} + +impl disk_file::Resizable for FixedVhdDiskSync { + fn resize(&mut self, _size: u64) -> BlockResult<()> { + Err(BlockError::new( + BlockErrorKind::UnsupportedFeature, + DiskFileError::ResizeError(std::io::Error::other("resize not supported for fixed VHD")), + ) + .with_op(ErrorOp::Resize)) + } +} + +impl disk_file::DiskFile for FixedVhdDiskSync {} + +impl disk_file::AsyncDiskFile for FixedVhdDiskSync { + fn try_clone(&self) -> BlockResult> { + Ok(Box::new(FixedVhdDiskSync(self.0.clone()))) + } + + fn new_async_io(&self, _ring_depth: u32) -> BlockResult> { + Ok(Box::new( + FixedVhdSync::new(self.0.as_raw_fd(), self.0.logical_size().unwrap()).map_err(|e| { + BlockError::new(BlockErrorKind::Io, 
DiskFileError::NewAsyncIo(e)) + .with_op(ErrorOp::Open) + })?, + )) } } diff --git a/block/src/lib.rs b/block/src/lib.rs index 9f78cefd9e..7a55cc4498 100644 --- a/block/src/lib.rs +++ b/block/src/lib.rs @@ -9,6 +9,8 @@ // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause pub mod async_io; +pub mod disk_file; +pub mod error; pub mod fcntl; pub mod fixed_vhd; #[cfg(feature = "io_uring")] @@ -16,6 +18,9 @@ pub mod fixed_vhd; pub mod fixed_vhd_async; pub mod fixed_vhd_sync; pub mod qcow; +#[cfg(feature = "io_uring")] +pub mod qcow_async; +pub(crate) mod qcow_common; pub mod qcow_sync; #[cfg(feature = "io_uring")] /// Async primitives based on `io-uring` @@ -23,6 +28,8 @@ pub mod qcow_sync; /// Enabled with the `"io_uring"` feature pub mod raw_async; pub mod raw_async_aio; +#[cfg(test)] +mod raw_async_io_tests; pub mod raw_sync; pub mod vhd; pub mod vhdx; @@ -31,18 +38,21 @@ pub mod vhdx_sync; use std::alloc::{Layout, alloc_zeroed, dealloc}; use std::collections::VecDeque; use std::fmt::{self, Debug}; -use std::fs::File; +use std::fs::{File, OpenOptions}; use std::io::{self, IoSlice, IoSliceMut, Read, Seek, SeekFrom, Write}; use std::os::linux::fs::MetadataExt; +use std::os::unix::fs::FileTypeExt; use std::os::unix::io::AsRawFd; use std::path::Path; use std::str::FromStr; use std::time::Instant; -use std::{cmp, result}; +use std::{cmp, mem, result}; #[cfg(feature = "io_uring")] use io_uring::{IoUring, Probe, opcode}; -use libc::{S_IFBLK, S_IFMT, ioctl}; +use libc::{ + FALLOC_FL_KEEP_SIZE, FALLOC_FL_PUNCH_HOLE, FALLOC_FL_ZERO_RANGE, S_IFBLK, S_IFMT, ioctl, +}; use log::{debug, error, info, warn}; use serde::{Deserialize, Serialize}; use smallvec::SmallVec; @@ -55,14 +65,27 @@ use vm_memory::{ }; use vm_virtio::{AccessPlatform, Translatable}; use vmm_sys_util::eventfd::EventFd; -use vmm_sys_util::{aio, ioctl_io_nr}; +use vmm_sys_util::{aio, ioctl_io_nr, ioctl_ior_nr}; use crate::async_io::{AsyncIo, AsyncIoError, AsyncIoResult}; +use crate::error::{BlockError, 
BlockErrorKind, BlockResult, ErrorOp}; use crate::vhdx::VhdxError; const SECTOR_SHIFT: u8 = 9; pub const SECTOR_SIZE: u64 = 0x01 << SECTOR_SHIFT; +/// Maximum number of segments per DISCARD or WRITE_ZEROES request. +pub const MAX_DISCARD_WRITE_ZEROES_SEG: u32 = 1; + +/// Size and field offsets within `struct virtio_blk_discard_write_zeroes`. +const DISCARD_WZ_SEG_SIZE: u32 = mem::size_of::() as u32; +const DISCARD_WZ_MAX_PAYLOAD: u32 = DISCARD_WZ_SEG_SIZE * MAX_DISCARD_WRITE_ZEROES_SEG; +const DISCARD_WZ_SECTOR_OFFSET: u64 = + mem::offset_of!(virtio_blk_discard_write_zeroes, sector) as u64; +const DISCARD_WZ_NUM_SECTORS_OFFSET: u64 = + mem::offset_of!(virtio_blk_discard_write_zeroes, num_sectors) as u64; +const DISCARD_WZ_FLAGS_OFFSET: u64 = mem::offset_of!(virtio_blk_discard_write_zeroes, flags) as u64; + #[derive(Error, Debug)] pub enum Error { #[error("Guest gave us bad memory addresses")] @@ -91,6 +114,8 @@ pub enum Error { RawFileError(#[source] std::io::Error), #[error("The requested operation does not support multiple descriptors")] TooManyDescriptors, + #[error("Request contains too many segments ({0}, max {MAX_DISCARD_WRITE_ZEROES_SEG})")] + TooManySegments(u32), #[error("Failure in vhdx")] VhdxError(#[source] VhdxError), } @@ -147,6 +172,8 @@ pub enum ExecuteError { WriteAll(#[source] io::Error), #[error("Unsupported request: {0}")] Unsupported(u32), + #[error("Unsupported flags {flags:#x} for request type {request_type}")] + UnsupportedFlags { request_type: u32, flags: u32 }, #[error("Failed to submit io uring")] SubmitIoUring(#[source] io::Error), #[error("Failed to get guest address")] @@ -177,6 +204,7 @@ impl ExecuteError { ExecuteError::Write(_) => VIRTIO_BLK_S_IOERR, ExecuteError::WriteAll(_) => VIRTIO_BLK_S_IOERR, ExecuteError::Unsupported(_) => VIRTIO_BLK_S_UNSUPP, + ExecuteError::UnsupportedFlags { .. 
} => VIRTIO_BLK_S_UNSUPP, ExecuteError::SubmitIoUring(_) => VIRTIO_BLK_S_IOERR, ExecuteError::GetHostAddress(_) => VIRTIO_BLK_S_IOERR, ExecuteError::AsyncRead(_) => VIRTIO_BLK_S_IOERR, @@ -284,7 +312,8 @@ impl Request { let hdr_desc_addr = hdr_desc .addr() - .translate_gva(access_platform, hdr_desc.len() as usize); + .translate_gva(access_platform, hdr_desc.len() as usize) + .map_err(|e| Error::GuestMemory(GuestMemoryError::IOError(e)))?; let mut req = Request { request_type: request_type(desc_chain.memory(), hdr_desc_addr)?, @@ -325,7 +354,8 @@ impl Request { req.data_descriptors.push(( desc.addr() - .translate_gva(access_platform, desc.len() as usize), + .translate_gva(access_platform, desc.len() as usize) + .map_err(|e| Error::GuestMemory(GuestMemoryError::IOError(e)))?, desc.len(), )); desc = desc_chain @@ -356,7 +386,8 @@ impl Request { req.status_addr = status_desc .addr() - .translate_gva(access_platform, status_desc.len() as usize); + .translate_gva(access_platform, status_desc.len() as usize) + .map_err(|e| Error::GuestMemory(GuestMemoryError::IOError(e)))?; Ok(req) } @@ -436,6 +467,7 @@ impl Request { let sector = self.sector; let request_type = self.request_type; let offset = (sector << SECTOR_SHIFT) as libc::off_t; + let alignment = disk_image.alignment(); let mut iovecs: SmallVec<[libc::iovec; DEFAULT_DESCRIPTOR_VEC_SIZE]> = SmallVec::with_capacity(self.data_descriptors.len()); @@ -466,14 +498,15 @@ impl Request { assert!(origin_ptr.len() >= data_len); let origin_ptr = origin_ptr.ptr_guard(); - // Verify the buffer alignment. - // In case it's not properly aligned, an intermediate buffer is - // created with the correct alignment, and a copy from/to the - // origin buffer is performed, depending on the type of operation. - let iov_base = if (origin_ptr.as_ptr() as u64).is_multiple_of(SECTOR_SIZE) { + // O_DIRECT requires buffer addresses to be aligned to the + // backend device's logical block size. 
In case it's not properly + // aligned, an intermediate buffer is created with the correct + // alignment, and a copy from/to the origin buffer is performed, + // depending on the type of operation. + let iov_base = if (origin_ptr.as_ptr() as u64).is_multiple_of(alignment) { origin_ptr.as_ptr() as *mut libc::c_void } else { - let layout = Layout::from_size_align(data_len, SECTOR_SIZE as usize).unwrap(); + let layout = Layout::from_size_align(data_len, alignment as usize).unwrap(); // SAFETY: layout has non-zero size let aligned_ptr = unsafe { alloc_zeroed(layout) }; if aligned_ptr.is_null() { @@ -575,17 +608,43 @@ impl Request { return Err(ExecuteError::BadRequest(Error::TooManyDescriptors)); }; - if data_len < 16 { + if data_len < DISCARD_WZ_SEG_SIZE { return Err(ExecuteError::BadRequest(Error::DescriptorLengthTooSmall)); } + if data_len > DISCARD_WZ_MAX_PAYLOAD { + return Err(ExecuteError::BadRequest(Error::TooManySegments( + data_len.div_ceil(DISCARD_WZ_SEG_SIZE), + ))); + } let mut discard_sector = [0u8; 8]; let mut discard_num_sectors = [0u8; 4]; - mem.read_slice(&mut discard_sector, data_addr) + let mut discard_flags = [0u8; 4]; + + let sector_addr = data_addr.checked_add(DISCARD_WZ_SECTOR_OFFSET).unwrap(); + mem.read_slice(&mut discard_sector, sector_addr) + .map_err(ExecuteError::Read)?; + + let num_sectors_addr = data_addr + .checked_add(DISCARD_WZ_NUM_SECTORS_OFFSET) + .unwrap(); + mem.read_slice(&mut discard_num_sectors, num_sectors_addr) .map_err(ExecuteError::Read)?; - mem.read_slice(&mut discard_num_sectors, data_addr.checked_add(8).unwrap()) + + let flags_addr = data_addr.checked_add(DISCARD_WZ_FLAGS_OFFSET).unwrap(); + mem.read_slice(&mut discard_flags, flags_addr) .map_err(ExecuteError::Read)?; + let discard_flags = u32::from_le_bytes(discard_flags); + // Per virtio spec v1.2 reject discard if any flag is set, including unmap. 
+ if discard_flags != 0 { + warn!("Unsupported flags {discard_flags:#x} in discard request"); + return Err(ExecuteError::UnsupportedFlags { + request_type: VIRTIO_BLK_T_DISCARD, + flags: discard_flags, + }); + } + let discard_sector = u64::from_le_bytes(discard_sector); if discard_sector == 0 && disable_sector0_writes { @@ -594,6 +653,13 @@ impl Request { let discard_num_sectors = u32::from_le_bytes(discard_num_sectors); + let top = discard_sector + .checked_add(discard_num_sectors as u64) + .ok_or(ExecuteError::BadRequest(Error::InvalidOffset))?; + if top > disk_nsectors { + return Err(ExecuteError::BadRequest(Error::InvalidOffset)); + } + let discard_offset = discard_sector * SECTOR_SIZE; let discard_length = (discard_num_sectors as u64) * SECTOR_SIZE; @@ -608,30 +674,69 @@ impl Request { return Err(ExecuteError::BadRequest(Error::TooManyDescriptors)); }; - if data_len < 16 { + if data_len < DISCARD_WZ_SEG_SIZE { return Err(ExecuteError::BadRequest(Error::DescriptorLengthTooSmall)); } + if data_len > DISCARD_WZ_MAX_PAYLOAD { + return Err(ExecuteError::BadRequest(Error::TooManySegments( + data_len.div_ceil(DISCARD_WZ_SEG_SIZE), + ))); + } let mut wz_sector = [0u8; 8]; - let mut wz_num_sectors = [0u8; 4]; - mem.read_slice(&mut wz_sector, data_addr) + let mut wz_flags = [0u8; 4]; + + let sector_addr = data_addr.checked_add(DISCARD_WZ_SECTOR_OFFSET).unwrap(); + mem.read_slice(&mut wz_sector, sector_addr) + .map_err(ExecuteError::Read)?; + + let num_sectors_addr = data_addr + .checked_add(DISCARD_WZ_NUM_SECTORS_OFFSET) + .unwrap(); + mem.read_slice(&mut wz_num_sectors, num_sectors_addr) .map_err(ExecuteError::Read)?; - mem.read_slice(&mut wz_num_sectors, data_addr.checked_add(8).unwrap()) + + let flags_addr = data_addr.checked_add(DISCARD_WZ_FLAGS_OFFSET).unwrap(); + mem.read_slice(&mut wz_flags, flags_addr) .map_err(ExecuteError::Read)?; let wz_sector = u64::from_le_bytes(wz_sector); let wz_num_sectors = u32::from_le_bytes(wz_num_sectors); + let wz_flags = 
u32::from_le_bytes(wz_flags); + // Per virtio spec v1.2 reject write zeroes if any unknown flag is set. + if (wz_flags & !VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) != 0 { + warn!("Unsupported flags {wz_flags:#x} in write zeroes request"); + return Err(ExecuteError::UnsupportedFlags { + request_type: VIRTIO_BLK_T_WRITE_ZEROES, + flags: wz_flags, + }); + } + let wz_offset = wz_sector * SECTOR_SIZE; if wz_offset == 0 && disable_sector0_writes { return Err(ExecuteError::BadRequest(Error::InvalidOffset)); } + + let top = wz_sector + .checked_add(wz_num_sectors as u64) + .ok_or(ExecuteError::BadRequest(Error::InvalidOffset))?; + if top > disk_nsectors { + return Err(ExecuteError::BadRequest(Error::InvalidOffset)); + } + let wz_length = (wz_num_sectors as u64) * SECTOR_SIZE; - disk_image - .write_zeroes(wz_offset, wz_length, user_data) - .map_err(ExecuteError::AsyncWriteZeroes)?; + if wz_flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP != 0 { + disk_image + .punch_hole(wz_offset, wz_length, user_data) + .map_err(ExecuteError::AsyncPunchHole)?; + } else { + disk_image + .write_zeroes(wz_offset, wz_length, user_data) + .map_err(ExecuteError::AsyncWriteZeroes)?; + } } RequestType::Unsupported(t) => return Err(ExecuteError::Unsupported(t)), } @@ -801,10 +906,6 @@ pub fn probe_sparse_support(file: &File) -> bool { /// Probe sparse support for a regular file using fallocate(). fn probe_file_sparse_support(fd: libc::c_int) -> bool { - const FALLOC_FL_KEEP_SIZE: libc::c_int = 0x01; - const FALLOC_FL_PUNCH_HOLE: libc::c_int = 0x02; - const FALLOC_FL_ZERO_RANGE: libc::c_int = 0x10; - // SAFETY: FFI call with valid fd let file_size = unsafe { libc::lseek(fd, 0, libc::SEEK_END) }; if file_size < 0 { @@ -846,34 +947,19 @@ fn probe_file_sparse_support(fd: libc::c_int) -> bool { supported } -/// Probe sparse support for a block device using ioctls. 
-fn probe_block_device_sparse_support(fd: libc::c_int) -> bool { - ioctl_io_nr!(BLKDISCARD, 0x12, 119); - ioctl_io_nr!(BLKZEROOUT, 0x12, 127); - - let range: [u64; 2] = [0, 0]; - - // SAFETY: FFI call with valid fd and valid range buffer - let punch_hole = unsafe { ioctl(fd, BLKDISCARD() as _, &range) } == 0; - - if !punch_hole { - let err = io::Error::last_os_error(); - debug!("Block device BLKDISCARD probe returned: {err}"); - } - - // SAFETY: FFI call with valid fd and valid range buffer - let zero_range = unsafe { ioctl(fd, BLKZEROOUT() as _, &range) } == 0; - - if !zero_range { - let err = io::Error::last_os_error(); - debug!("Block device BLKZEROOUT probe returned: {err}"); - } - - let supported = punch_hole || zero_range; - info!( - "Probed block device sparse support: punch_hole={punch_hole}, zero_range={zero_range} => {supported}" - ); - supported +/// Probe sparse support for a block device. +/// +/// Block devices always report sparse support. `BLKZEROOUT` is guaranteed to +/// succeed as the kernel provides a software fallback writing explicit zeros +/// when the hardware lacks a native write zeroes command. `BLKDISCARD` may fail +/// at runtime with `EOPNOTSUPP` on devices without trim or discard support, but +/// Linux guests handle this gracefully by ceasing discard requests. +/// +/// There is no non destructive read only ioctl to query block device discard +/// or write zeroes capabilities. +fn probe_block_device_sparse_support(_fd: libc::c_int) -> bool { + info!("Block device: assuming sparse support"); + true } /// Preallocate disk space for a disk image file. @@ -1076,14 +1162,27 @@ pub fn read_aligned_block_size(f: &mut File) -> std::io::Result> { Ok(data) } +/// Open a disk image file, returning a [`BlockError`] with path context +/// on failure. 
+pub fn open_disk_image(path: &Path, options: &OpenOptions) -> BlockResult { + options.open(path).map_err(|e| { + BlockError::new(BlockErrorKind::Io, e) + .with_op(ErrorOp::Open) + .with_path(path) + }) +} + /// Determine image type through file parsing. -pub fn detect_image_type(f: &mut File) -> std::io::Result { - let block = read_aligned_block_size(f)?; +pub fn detect_image_type(f: &mut File) -> BlockResult { + let block = read_aligned_block_size(f) + .map_err(|e| BlockError::new(BlockErrorKind::Io, e).with_op(ErrorOp::DetectImageType))?; // Check 4 first bytes to get the header value and determine the image type let image_type = if u32::from_be_bytes(block[0..4].try_into().unwrap()) == QCOW_MAGIC { ImageType::Qcow2 - } else if vhd::is_fixed_vhd(f)? { + } else if vhd::is_fixed_vhd(f) + .map_err(|e| BlockError::new(BlockErrorKind::Io, e).with_op(ErrorOp::DetectImageType))? + { ImageType::FixedVhd } else if u64::from_le_bytes(block[0..8].try_into().unwrap()) == VHDX_SIGN { ImageType::Vhdx @@ -1128,6 +1227,36 @@ ioctl_io_nr!(BLKSSZGET, 0x12, 104); ioctl_io_nr!(BLKPBSZGET, 0x12, 123); ioctl_io_nr!(BLKIOMIN, 0x12, 120); ioctl_io_nr!(BLKIOOPT, 0x12, 121); +ioctl_ior_nr!(BLKGETSIZE64, 0x12, 114, u64); + +/// Returns `(logical_size, physical_size)` in bytes for regular files and block devices. +/// +/// For regular files, logical size is `st_size` and physical size is +/// `st_blocks * 512` (actual host allocation). For block devices both +/// values equal the `BLKGETSIZE64` result. +pub fn query_device_size(file: &File) -> io::Result<(u64, u64)> { + let m = file.metadata()?; + if m.is_file() { + // st_blocks is always in 512-byte units on Linux + Ok((m.len(), m.st_blocks() * 512)) + } else if m.file_type().is_block_device() { + let mut size: u64 = 0; + // SAFETY: BLKGETSIZE64 reads the device size into a u64 pointer. 
+ let ret = unsafe { libc::ioctl(file.as_raw_fd(), BLKGETSIZE64() as _, &mut size) }; + if ret != 0 { + return Err(io::Error::last_os_error()); + } + Ok((size, size)) + } else { + Err(io::Error::new( + io::ErrorKind::InvalidInput, + format!( + "disk image must be a regular file or block device, is: {:?}", + m.file_type() + ), + )) + } +} #[derive(Copy, Clone)] enum BlockSize { @@ -1174,8 +1303,73 @@ impl DiskTopology { Ok(block_size) } + /// Query the O_DIRECT alignment requirement for a regular file. + /// + /// Uses `statx(STATX_DIOALIGN)` (Linux >= 6.1) to obtain the exact + /// memory and offset alignment the kernel requires for direct I/O on + /// this specific file. Unlike `fstatvfs().f_bsize`, which only returns + /// the filesystem's preferred I/O block size, `STATX_DIOALIGN` reports + /// the true per-file DIO constraints accounting for the filesystem, + /// underlying block device, and any stacking (loop, dm, etc.). + fn query_file_alignment(f: &File) -> u64 { + // The libc crate does not expose statx / STATX_DIOALIGN on all + // targets (e.g. musl), so define the constant and a minimal repr(C) + // struct locally and invoke the syscall directly. + const STATX_DIOALIGN: u32 = 0x2000; + + // Minimal statx layout, only the needed fields, + // everything else is padding. + #[repr(C)] + struct Statx { + stx_mask: u32, + _pad: [u8; 148], + stx_dio_mem_align: u32, + stx_dio_offset_align: u32, + _pad2: [u8; 96], + } + + let mut stx = mem::MaybeUninit::::zeroed(); + // SAFETY: FFI syscall with valid fd and correctly sized buffer. + let ret = unsafe { + libc::syscall( + libc::SYS_statx, + f.as_raw_fd(), + c"".as_ptr(), + libc::AT_EMPTY_PATH, + STATX_DIOALIGN, + stx.as_mut_ptr(), + ) + }; + if ret == 0 { + // SAFETY: statx succeeded, the struct is fully initialized. 
+            let stx = unsafe { stx.assume_init() };
+            if stx.stx_mask & STATX_DIOALIGN != 0 && stx.stx_dio_mem_align > 0 {
+                let align = cmp::max(stx.stx_dio_mem_align, stx.stx_dio_offset_align) as u64;
+                debug!("statx(STATX_DIOALIGN) returned alignment {align}");
+                return align;
+            }
+        }
+
+        debug!("O_DIRECT alignment query failed, falling back to default {SECTOR_SIZE}");
+        SECTOR_SIZE
+    }
+
     pub fn probe(f: &File) -> std::io::Result<Self> {
         if !Self::is_block_device(f)? {
+            // For regular files opened with O_DIRECT, the logical block size
+            // must reflect the filesystem DIO alignment so the guest issues
+            // correctly sized I/O.
+            // SAFETY: fcntl(F_GETFL) is always safe on a valid fd.
+            let flags = unsafe { libc::fcntl(f.as_raw_fd(), libc::F_GETFL) };
+            if flags >= 0 && (flags & libc::O_DIRECT) != 0 {
+                let alignment = Self::query_file_alignment(f);
+                return Ok(DiskTopology {
+                    logical_block_size: alignment,
+                    physical_block_size: alignment,
+                    minimum_io_size: alignment,
+                    optimal_io_size: 0,
+                });
+            }
             return Ok(DiskTopology::default());
         }
 
@@ -1187,3 +1381,193 @@ impl DiskTopology {
         })
     }
 }
+
+#[cfg(test)]
+mod unit_tests {
+    use std::alloc::{Layout, alloc_zeroed, dealloc};
+    use std::fs::OpenOptions;
+    use std::io::Write;
+    use std::os::unix::fs::OpenOptionsExt;
+    use std::{ptr, slice};
+
+    use vmm_sys_util::tempfile::TempFile;
+
+    use super::*;
+
+    #[test]
+    fn test_probe_regular_file_returns_valid_alignment() {
+        let temp_file = TempFile::new().unwrap();
+        let mut f = temp_file.into_file();
+        f.write_all(&[0u8; 4096]).unwrap();
+        f.sync_all().unwrap();
+
+        let topo = DiskTopology::probe(&f).unwrap();
+
+        assert_eq!(
+            topo.logical_block_size, SECTOR_SIZE,
+            "probe() should return {SECTOR_SIZE} for regular files without O_DIRECT, got {}",
+            topo.logical_block_size
+        );
+    }
+
+    #[test]
+    fn test_probe_regular_file_with_direct_returns_dio_alignment() {
+        let temp_file = TempFile::new().unwrap();
+        let path = temp_file.as_path().to_owned();
+        {
+            let f = temp_file.as_file();
+            f.set_len(1 << 20).unwrap(); // 1 MiB
+            f.sync_all().unwrap();
+        }
+
+        let f = OpenOptions::new()
+            .read(true)
+            .write(true)
+            .custom_flags(libc::O_DIRECT)
+            .open(&path)
+            .unwrap();
+        let topo = DiskTopology::probe(&f).unwrap();
+
+        assert!(
+            topo.logical_block_size.is_power_of_two(),
+            "logical_block_size {} is not a power of two",
+            topo.logical_block_size
+        );
+        assert!(
+            topo.logical_block_size >= SECTOR_SIZE,
+            "logical_block_size {} is less than SECTOR_SIZE ({SECTOR_SIZE})",
+            topo.logical_block_size
+        );
+
+        let alignment = topo.logical_block_size as usize;
+        let layout = Layout::from_size_align(4096, alignment);
+        assert!(
+            layout.is_ok(),
+            "Layout::from_size_align(4096, {alignment}) failed: {:?}",
+            layout.err()
+        );
+    }
+
+    #[test]
+    fn test_dio_write_read_with_probed_alignment() {
+        let temp_file = TempFile::new().unwrap();
+        let path = temp_file.as_path().to_owned();
+        {
+            let f = temp_file.as_file();
+            f.set_len(1 << 20).unwrap(); // 1 MiB
+            f.sync_all().unwrap();
+        }
+
+        let f = OpenOptions::new()
+            .read(true)
+            .write(true)
+            .custom_flags(libc::O_DIRECT)
+            .open(&path)
+            .unwrap();
+        let topo = DiskTopology::probe(&f).unwrap();
+        let alignment = topo.logical_block_size as usize;
+
+        let layout = Layout::from_size_align(alignment, alignment).unwrap();
+        // SAFETY: layout is valid (non-zero, power-of-two alignment).
+        let buf = unsafe { alloc_zeroed(layout) };
+        assert!(!buf.is_null());
+
+        // SAFETY: buf is valid for `alignment` bytes.
+        unsafe { ptr::write_bytes(buf, 0xAB, alignment) };
+
+        // SAFETY: buf is aligned and sized for O_DIRECT; fd is valid.
+        let written =
+            unsafe { libc::pwrite(f.as_raw_fd(), buf as *const libc::c_void, alignment, 0) };
+        assert_eq!(
+            written as usize,
+            alignment,
+            "O_DIRECT pwrite failed: {}",
+            io::Error::last_os_error()
+        );
+
+        // SAFETY: buf is valid for `alignment` bytes.
+        unsafe { ptr::write_bytes(buf, 0x00, alignment) };
+        // SAFETY: buf is aligned and sized for O_DIRECT; fd is valid.
+        let read = unsafe { libc::pread(f.as_raw_fd(), buf as *mut libc::c_void, alignment, 0) };
+        assert_eq!(
+            read as usize,
+            alignment,
+            "O_DIRECT pread failed: {}",
+            io::Error::last_os_error()
+        );
+
+        // SAFETY: buf is valid for `alignment` bytes after successful pread.
+        let slice = unsafe { slice::from_raw_parts(buf, alignment) };
+        assert!(
+            slice.iter().all(|&b| b == 0xAB),
+            "Data mismatch after O_DIRECT roundtrip"
+        );
+
+        // SAFETY: buf was allocated with this layout via alloc_zeroed.
+        unsafe { dealloc(buf, layout) };
+    }
+
+    #[test]
+    fn test_query_device_size_regular_file() {
+        let temp_file = TempFile::new().unwrap();
+        let mut f = temp_file.into_file();
+        // 5 sectors + 13 extra bytes - not page aligned, not sector aligned
+        f.write_all(&[0xAB; 5 * 512 + 13]).unwrap();
+        f.sync_all().unwrap();
+
+        let (logical, physical) = query_device_size(&f).unwrap();
+        assert_eq!(logical, 5 * 512 + 13);
+        assert!(physical > 0);
+    }
+
+    #[test]
+    fn test_query_device_size_sparse_file_punch_hole() {
+        let temp_file = TempFile::new().unwrap();
+        let f = temp_file.as_file();
+        // Allocate 1 MiB
+        let size: i64 = 1 << 20;
+        f.set_len(size as u64).unwrap();
+        // SAFETY: fd is valid, range is within file size.
+        let ret = unsafe {
+            libc::fallocate(
+                f.as_raw_fd(),
+                0, // allocate
+                0,
+                size,
+            )
+        };
+        assert_eq!(ret, 0, "fallocate failed: {}", io::Error::last_os_error());
+        f.sync_all().unwrap();
+
+        let (log_before, phys_before) = query_device_size(f).unwrap();
+        assert_eq!(log_before, size as u64);
+        assert_eq!(phys_before, size as u64);
+
+        // Punch a hole in the middle 512 KiB
+        // SAFETY: fd is valid, range is within file size.
+ let ret = unsafe { + libc::fallocate( + f.as_raw_fd(), + libc::FALLOC_FL_PUNCH_HOLE | libc::FALLOC_FL_KEEP_SIZE, + size / 4, + size / 2, + ) + }; + assert_eq!(ret, 0, "punch hole failed: {}", io::Error::last_os_error()); + f.sync_all().unwrap(); + + let (logical, physical) = query_device_size(f).unwrap(); + assert_eq!(logical, size as u64, "logical size must not change"); + assert!( + physical < logical, + "physical ({physical}) should be less than logical ({logical}) after punch hole" + ); + } + + #[test] + fn test_query_device_size_rejects_char_device() { + let f = std::fs::File::open("/dev/zero").unwrap(); + let err = query_device_size(&f).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidInput); + } +} diff --git a/block/src/qcow/backing.rs b/block/src/qcow/backing.rs new file mode 100644 index 0000000000..754618f132 --- /dev/null +++ b/block/src/qcow/backing.rs @@ -0,0 +1,166 @@ +// Copyright © 2021 Intel Corporation +// +// Copyright 2026 The Cloud Hypervisor Authors. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +//! Thread safe backing file readers for QCOW2 images. + +use std::io; +use std::os::fd::{AsFd, AsRawFd, BorrowedFd, OwnedFd}; +use std::sync::Arc; + +use crate::error::{BlockError, BlockErrorKind, BlockResult, ErrorOp}; +use crate::qcow::metadata::{BackingRead, ClusterReadMapping, QcowMetadata}; +use crate::qcow::{BackingFile, BackingKind, Error as QcowError}; +use crate::qcow_common::pread_exact; + +/// Raw backing file using pread64 on a duplicated fd. +pub(crate) struct RawBacking { + pub(crate) fd: OwnedFd, + pub(crate) virtual_size: u64, +} + +// SAFETY: The only I/O operation is pread64 which is position independent +// and safe for concurrent use from multiple threads. 
+unsafe impl Sync for RawBacking {}
+
+impl BackingRead for RawBacking {
+    fn read_at(&self, address: u64, buf: &mut [u8]) -> io::Result<()> {
+        if address >= self.virtual_size {
+            buf.fill(0);
+            return Ok(());
+        }
+        let available = (self.virtual_size - address) as usize;
+        if available >= buf.len() {
+            pread_exact(self.fd.as_raw_fd(), buf, address)
+        } else {
+            pread_exact(self.fd.as_raw_fd(), &mut buf[..available], address)?;
+            buf[available..].fill(0);
+            Ok(())
+        }
+    }
+}
+
+/// QCOW2 image used as a backing file for another QCOW2 image.
+///
+/// Resolves guest offsets through the QCOW2 cluster mapping (L1/L2
+/// tables, refcounts) before reading the underlying data. Read only
+/// because backing files never receive writes. Nested backing chains
+/// are handled recursively via the optional `backing_file` field.
+pub(crate) struct Qcow2Backing {
+    pub(crate) metadata: Arc<QcowMetadata>,
+    pub(crate) data_fd: OwnedFd,
+    pub(crate) backing_file: Option<Arc<dyn BackingRead>>,
+}
+
+// SAFETY: All reads go through QcowMetadata which uses RwLock
+// and pread64 which is position independent and thread safe.
+unsafe impl Sync for Qcow2Backing {}
+
+impl BackingRead for Qcow2Backing {
+    fn read_at(&self, address: u64, buf: &mut [u8]) -> io::Result<()> {
+        let virtual_size = self.metadata.virtual_size();
+        if address >= virtual_size {
+            buf.fill(0);
+            return Ok(());
+        }
+        let available = (virtual_size - address) as usize;
+        if available < buf.len() {
+            self.read_clusters(address, &mut buf[..available])?;
+            buf[available..].fill(0);
+            return Ok(());
+        }
+        self.read_clusters(address, buf)
+    }
+}
+
+impl Qcow2Backing {
+    /// Resolve cluster mappings via metadata then read allocated clusters
+    /// with pread64.
+ fn read_clusters(&self, address: u64, buf: &mut [u8]) -> io::Result<()> { + let total_len = buf.len(); + let has_backing = self.backing_file.is_some(); + + let mappings = self + .metadata + .map_clusters_for_read(address, total_len, has_backing)?; + + let mut buf_offset = 0usize; + for mapping in mappings { + match mapping { + ClusterReadMapping::Zero { length } => { + buf[buf_offset..buf_offset + length as usize].fill(0); + buf_offset += length as usize; + } + ClusterReadMapping::Allocated { + offset: host_offset, + length, + } => { + pread_exact( + self.data_fd.as_raw_fd(), + &mut buf[buf_offset..buf_offset + length as usize], + host_offset, + )?; + buf_offset += length as usize; + } + ClusterReadMapping::Compressed { data } => { + let len = data.len(); + buf[buf_offset..buf_offset + len].copy_from_slice(&data); + buf_offset += len; + } + ClusterReadMapping::Backing { + offset: backing_offset, + length, + } => { + self.backing_file.as_ref().unwrap().read_at( + backing_offset, + &mut buf[buf_offset..buf_offset + length as usize], + )?; + buf_offset += length as usize; + } + } + } + Ok(()) + } +} + +impl Drop for Qcow2Backing { + fn drop(&mut self) { + self.metadata.shutdown(); + } +} + +/// Construct a thread safe backing file reader. 
+pub fn shared_backing_from(bf: BackingFile) -> BlockResult<Arc<dyn BackingRead>> {
+    let (kind, virtual_size) = bf.into_kind();
+
+    let dup_fd = |fd: BorrowedFd<'_>| -> BlockResult<OwnedFd> {
+        fd.try_clone_to_owned().map_err(|e| {
+            BlockError::new(
+                BlockErrorKind::Io,
+                QcowError::BackingFileIo(String::new(), e),
+            )
+            .with_op(ErrorOp::DupBackingFd)
+        })
+    };
+
+    match kind {
+        BackingKind::Raw(raw_file) => {
+            let fd = dup_fd(raw_file.as_fd())?;
+            Ok(Arc::new(RawBacking { fd, virtual_size }))
+        }
+        BackingKind::Qcow { inner, backing } => {
+            let data_fd = dup_fd(inner.raw_file.as_fd())?;
+            Ok(Arc::new(Qcow2Backing {
+                metadata: Arc::new(QcowMetadata::new(*inner)),
+                data_fd,
+                backing_file: backing.map(|bf| shared_backing_from(*bf)).transpose()?,
+            }))
+        }
+        #[cfg(test)]
+        BackingKind::QcowFile(_) => {
+            unreachable!("QcowFile variant is only used by set_backing_file() in tests")
+        }
+    }
+}
diff --git a/block/src/qcow/header.rs b/block/src/qcow/header.rs
new file mode 100644
index 0000000000..22a5492b19
--- /dev/null
+++ b/block/src/qcow/header.rs
@@ -0,0 +1,605 @@
+// Copyright 2018 The Chromium OS Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE-BSD-3-Clause file.
+//
+// Copyright 2026 The Cloud Hypervisor Authors. All rights reserved.
+//
+// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
+
+//! QCOW2 header parsing, validation, and creation.
+
+use std::fmt::{Display, Formatter, Result as FmtResult};
+use std::io::{Read, Seek, SeekFrom, Write};
+use std::mem::size_of;
+use std::str::FromStr;
+
+use bitflags::bitflags;
+use vmm_sys_util::file_traits::FileSync;
+
+use super::decoder::{Decoder, ZlibDecoder, ZstdDecoder};
+use super::qcow_raw_file::BeUint;
+use super::raw_file::RawFile;
+use super::{Error, Result, div_round_up_u32, div_round_up_u64};
+use crate::error::{BlockError, BlockErrorKind, BlockResult};
+
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+pub enum ImageType {
+    Raw,
+    Qcow2,
+}
+
+impl Display for ImageType {
+    fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
+        match self {
+            ImageType::Raw => write!(f, "raw"),
+            ImageType::Qcow2 => write!(f, "qcow2"),
+        }
+    }
+}
+
+impl FromStr for ImageType {
+    type Err = Error;
+
+    fn from_str(s: &str) -> Result<Self> {
+        match s {
+            "raw" => Ok(ImageType::Raw),
+            "qcow2" => Ok(ImageType::Qcow2),
+            _ => Err(Error::UnsupportedBackingFileFormat(s.to_string())),
+        }
+    }
+}
+
+#[derive(Clone, Debug)]
+pub enum CompressionType {
+    Zlib,
+    Zstd,
+}
+
+#[derive(Debug, Clone)]
+pub struct BackingFileConfig {
+    pub path: String,
+    // If this is None, we will autodetect it.
+    pub format: Option<ImageType>,
+}
+
+// Maximum data size supported.
+pub(super) const MAX_QCOW_FILE_SIZE: u64 = 0x01 << 44; // 16 TB.
+
+// QCOW magic constant that starts the header.
+pub(super) const QCOW_MAGIC: u32 = 0x5146_49fb;
+// Default to a cluster size of 2^DEFAULT_CLUSTER_BITS
+pub(super) const DEFAULT_CLUSTER_BITS: u32 = 16;
+// Limit clusters to reasonable sizes. Choose the same limits as qemu. Making the clusters smaller
+// increases the amount of overhead for book keeping.
+pub(super) const MIN_CLUSTER_BITS: u32 = 9;
+pub(super) const MAX_CLUSTER_BITS: u32 = 21;
+// The L1 and RefCount table are kept in RAM, only handle files that require less than 35M entries.
+// This easily covers 1 TB files. When support for bigger files is needed the assumptions made to
+// keep these tables in RAM need to be thrown out.
+pub(super) const MAX_RAM_POINTER_TABLE_SIZE: u64 = 35_000_000;
+// 16-bit refcounts.
+pub(super) const DEFAULT_REFCOUNT_ORDER: u32 = 4;
+
+pub(super) const V2_BARE_HEADER_SIZE: u32 = 72;
+pub(super) const V3_BARE_HEADER_SIZE: u32 = 104;
+pub(super) const AUTOCLEAR_FEATURES_OFFSET: u64 = 88;
+
+pub(super) const COMPATIBLE_FEATURES_LAZY_REFCOUNTS: u64 = 1;
+
+// Compression types as defined in https://www.qemu.org/docs/master/interop/qcow2.html
+const COMPRESSION_TYPE_ZLIB: u64 = 0; // zlib/deflate
+const COMPRESSION_TYPE_ZSTD: u64 = 1; // zstd
+
+// Header extension types
+pub(super) const HEADER_EXT_END: u32 = 0x00000000;
+// Backing file format name (raw, qcow2)
+pub(super) const HEADER_EXT_BACKING_FORMAT: u32 = 0xe2792aca;
+// Feature name table
+const HEADER_EXT_FEATURE_NAME_TABLE: u32 = 0x6803f857;
+
+// Feature name table entry type incompatible
+const FEAT_TYPE_INCOMPATIBLE: u8 = 0;
+
+bitflags! {
+    #[derive(Debug, Clone, Copy, PartialEq, Eq)]
+    pub struct IncompatFeatures: u64 {
+        const DIRTY = 1 << 0;
+        const CORRUPT = 1 << 1;
+        const DATA_FILE = 1 << 2;
+        const COMPRESSION = 1 << 3;
+        const EXTENDED_L2 = 1 << 4;
+    }
+}
+
+impl IncompatFeatures {
+    /// Features supported by this implementation.
+    pub(super) const SUPPORTED: IncompatFeatures = IncompatFeatures::DIRTY
+        .union(IncompatFeatures::CORRUPT)
+        .union(IncompatFeatures::COMPRESSION);
+
+    /// Get the fallback name for a known feature bit.
+    fn flag_name(bit: u8) -> Option<&'static str> {
+        Some(match Self::from_bits_truncate(1u64 << bit) {
+            Self::DIRTY => "dirty bit",
+            Self::CORRUPT => "corrupt bit",
+            Self::DATA_FILE => "external data file",
+            Self::EXTENDED_L2 => "extended L2 entries",
+            _ => return None,
+        })
+    }
+}
+
+/// Error type for unsupported incompatible features.
+#[derive(Debug, Clone, thiserror::Error)]
+pub struct MissingFeatureError {
+    /// Unsupported feature bits.
+    features: IncompatFeatures,
+    /// Feature name table from the qcow2 image.
+    feature_names: Vec<(u8, String)>,
+}
+
+impl MissingFeatureError {
+    pub(super) fn new(features: IncompatFeatures, feature_names: Vec<(u8, String)>) -> Self {
+        Self {
+            features,
+            feature_names,
+        }
+    }
+}
+
+impl Display for MissingFeatureError {
+    fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
+        let names: Vec<String> = (0u8..64)
+            .filter(|&bit| self.features.bits() & (1u64 << bit) != 0)
+            .map(|bit| {
+                // First try the image's feature name table
+                self.feature_names
+                    .iter()
+                    .find(|(b, _)| *b == bit)
+                    .map(|(_, name)| name.clone())
+                    // Then try hardcoded fallback names
+                    .or_else(|| IncompatFeatures::flag_name(bit).map(|s| s.to_string()))
+                    // Finally, use generic description
+                    .unwrap_or_else(|| format!("unknown feature bit {bit}"))
+            })
+            .collect();
+        write!(f, "Missing features: {}", names.join(", "))
+    }
+}
+
+// The format supports a "header extension area" that crosvm does not use.
+const QCOW_EMPTY_HEADER_EXTENSION_SIZE: u32 = 8;
+
+// Defined by the specification
+const MAX_BACKING_FILE_SIZE: u32 = 1023;
+
+/// Contains the information from the header of a qcow file.
+#[derive(Clone, Debug)]
+pub struct QcowHeader {
+    pub magic: u32,
+    pub version: u32,
+
+    pub backing_file_offset: u64,
+    pub backing_file_size: u32,
+
+    pub cluster_bits: u32,
+    pub size: u64,
+    pub crypt_method: u32,
+
+    pub l1_size: u32,
+    pub l1_table_offset: u64,
+
+    pub refcount_table_offset: u64,
+    pub refcount_table_clusters: u32,
+
+    pub nb_snapshots: u32,
+    pub snapshots_offset: u64,
+
+    // v3 entries
+    pub incompatible_features: u64,
+    pub compatible_features: u64,
+    pub autoclear_features: u64,
+    pub refcount_order: u32,
+    pub header_size: u32,
+    pub compression_type: CompressionType,
+
+    // Post-header entries
+    pub backing_file: Option<BackingFileConfig>,
+}
+
+impl QcowHeader {
+    /// Read header extensions, optionally collecting feature names for error reporting.
+ pub(super) fn read_header_extensions( + f: &mut RawFile, + header: &mut QcowHeader, + mut feature_table: Option<&mut Vec<(u8, String)>>, + ) -> Result<()> { + // Extensions start directly after the header + f.seek(SeekFrom::Start(header.header_size as u64)) + .map_err(Error::ReadingHeader)?; + + loop { + let ext_type = u32::read_be(f).map_err(Error::ReadingHeader)?; + if ext_type == HEADER_EXT_END { + break; + } + + let ext_length = u32::read_be(f).map_err(Error::ReadingHeader)?; + + match ext_type { + HEADER_EXT_BACKING_FORMAT => { + let mut format_bytes = vec![0u8; ext_length as usize]; + f.read_exact(&mut format_bytes) + .map_err(Error::ReadingHeader)?; + let format_str = String::from_utf8(format_bytes) + .map_err(|err| Error::InvalidBackingFileName(err.utf8_error()))?; + if let Some(backing_file) = &mut header.backing_file { + backing_file.format = Some(format_str.parse()?); + } + } + HEADER_EXT_FEATURE_NAME_TABLE if feature_table.is_some() => { + const FEATURE_NAME_ENTRY_SIZE: usize = 1 + 1 + 46; // type + bit + name + let mut data = vec![0u8; ext_length as usize]; + f.read_exact(&mut data).map_err(Error::ReadingHeader)?; + let table = feature_table.as_mut().unwrap(); + for entry in data.chunks_exact(FEATURE_NAME_ENTRY_SIZE) { + if entry[0] == FEAT_TYPE_INCOMPATIBLE { + let bit_number = entry[1]; + let name_bytes = &entry[2..]; + let name_len = name_bytes.iter().position(|&b| b == 0).unwrap_or(46); + let name = String::from_utf8_lossy(&name_bytes[..name_len]).to_string(); + table.push((bit_number, name)); + } + } + } + _ => { + // Skip unknown extension + f.seek(SeekFrom::Current(ext_length as i64)) + .map_err(Error::ReadingHeader)?; + } + } + + // Skip to the next 8 byte boundary + let padding = (8 - (ext_length % 8)) % 8; + f.seek(SeekFrom::Current(padding as i64)) + .map_err(Error::ReadingHeader)?; + } + + Ok(()) + } + + /// Creates a QcowHeader from a reference to a file. 
+    pub fn new(f: &mut RawFile) -> Result<QcowHeader> {
+        f.rewind().map_err(Error::ReadingHeader)?;
+        let magic = u32::read_be(f).map_err(Error::ReadingHeader)?;
+        if magic != QCOW_MAGIC {
+            return Err(Error::InvalidMagic);
+        }
+
+        // Reads the next u32 from the file.
+        fn read_u32_be(f: &mut RawFile) -> Result<u32> {
+            u32::read_be(f).map_err(Error::ReadingHeader)
+        }
+
+        // Reads the next u64 from the file.
+        fn read_u64_be(f: &mut RawFile) -> Result<u64> {
+            u64::read_be(f).map_err(Error::ReadingHeader)
+        }
+
+        let version = read_u32_be(f)?;
+
+        let mut header = QcowHeader {
+            magic,
+            version,
+            backing_file_offset: read_u64_be(f)?,
+            backing_file_size: read_u32_be(f)?,
+            cluster_bits: read_u32_be(f)?,
+            size: read_u64_be(f)?,
+            crypt_method: read_u32_be(f)?,
+            l1_size: read_u32_be(f)?,
+            l1_table_offset: read_u64_be(f)?,
+            refcount_table_offset: read_u64_be(f)?,
+            refcount_table_clusters: read_u32_be(f)?,
+            nb_snapshots: read_u32_be(f)?,
+            snapshots_offset: read_u64_be(f)?,
+            incompatible_features: if version == 2 { 0 } else { read_u64_be(f)? },
+            compatible_features: if version == 2 { 0 } else { read_u64_be(f)? },
+            autoclear_features: if version == 2 { 0 } else { read_u64_be(f)? },
+            refcount_order: if version == 2 {
+                DEFAULT_REFCOUNT_ORDER
+            } else {
+                read_u32_be(f)?
+            },
+            header_size: if version == 2 {
+                V2_BARE_HEADER_SIZE
+            } else {
+                read_u32_be(f)?
+            },
+            compression_type: CompressionType::Zlib,
+            backing_file: None,
+        };
+        if version == 3 && header.header_size > V3_BARE_HEADER_SIZE {
+            let raw_compression_type = read_u64_be(f)? >> (64 - 8);
+            header.compression_type = if raw_compression_type == COMPRESSION_TYPE_ZLIB {
+                Ok(CompressionType::Zlib)
+            } else if raw_compression_type == COMPRESSION_TYPE_ZSTD {
+                Ok(CompressionType::Zstd)
+            } else {
+                Err(Error::UnsupportedCompressionType)
+            }?;
+        }
+        if header.backing_file_size > MAX_BACKING_FILE_SIZE {
+            return Err(Error::BackingFileTooLong(header.backing_file_size as usize));
+        }
+        if header.backing_file_offset != 0 {
+            f.seek(SeekFrom::Start(header.backing_file_offset))
+                .map_err(Error::ReadingHeader)?;
+            let mut backing_file_name_bytes = vec![0u8; header.backing_file_size as usize];
+            f.read_exact(&mut backing_file_name_bytes)
+                .map_err(Error::ReadingHeader)?;
+            let path = String::from_utf8(backing_file_name_bytes)
+                .map_err(|err| Error::InvalidBackingFileName(err.utf8_error()))?;
+            header.backing_file = Some(BackingFileConfig { path, format: None });
+        }
+
+        if version == 3 {
+            // Check for unsupported incompatible features first
+            let features = IncompatFeatures::from_bits_retain(header.incompatible_features);
+            let unsupported = features - IncompatFeatures::SUPPORTED;
+            if !unsupported.is_empty() {
+                // Read extensions only to get feature names for error reporting
+                let mut feature_table = Vec::new();
+                if header.header_size > V3_BARE_HEADER_SIZE {
+                    let _ = Self::read_header_extensions(f, &mut header, Some(&mut feature_table));
+                }
+                return Err(Error::UnsupportedFeature(MissingFeatureError::new(
+                    unsupported,
+                    feature_table,
+                )));
+            }
+
+            // Features OK, now read extensions normally
+            if header.header_size > V3_BARE_HEADER_SIZE {
+                Self::read_header_extensions(f, &mut header, None)?;
+            }
+        }
+
+        Ok(header)
+    }
+
+    pub fn get_decoder(&self) -> Box<dyn Decoder> {
+        match self.compression_type {
+            CompressionType::Zlib => Box::new(ZlibDecoder {}),
+            CompressionType::Zstd => Box::new(ZstdDecoder {}),
+        }
+    }
+
+    pub fn create_for_size_and_path(
+        version: u32,
+        size: u64,
+        backing_file: Option<&str>,
+    ) -> Result<QcowHeader> {
+        let header_size = if version == 2 {
+            V2_BARE_HEADER_SIZE
+        } else {
+            V3_BARE_HEADER_SIZE + QCOW_EMPTY_HEADER_EXTENSION_SIZE
+        };
+        let cluster_bits: u32 = DEFAULT_CLUSTER_BITS;
+        let cluster_size: u32 = 0x01 << cluster_bits;
+        let max_length: usize = (cluster_size - header_size) as usize;
+        if let Some(path) = backing_file
+            && path.len() > max_length
+        {
+            return Err(Error::BackingFileTooLong(path.len() - max_length));
+        }
+
+        // L2 blocks are always one cluster long. They contain cluster_size/sizeof(u64) addresses.
+        let entries_per_cluster: u32 = cluster_size / size_of::<u64>() as u32;
+        let num_clusters: u32 = div_round_up_u64(size, u64::from(cluster_size)) as u32;
+        let num_l2_clusters: u32 = div_round_up_u32(num_clusters, entries_per_cluster);
+        let l1_clusters: u32 = div_round_up_u32(num_l2_clusters, entries_per_cluster);
+        let header_clusters = div_round_up_u32(size_of::<QcowHeader>() as u32, cluster_size);
+        Ok(QcowHeader {
+            magic: QCOW_MAGIC,
+            version,
+            backing_file_offset: backing_file.map_or(0, |_| {
+                header_size
+                    + if version == 3 {
+                        QCOW_EMPTY_HEADER_EXTENSION_SIZE
+                    } else {
+                        0
+                    }
+            }) as u64,
+            backing_file_size: backing_file.map_or(0, |x| x.len()) as u32,
+            cluster_bits: DEFAULT_CLUSTER_BITS,
+            size,
+            crypt_method: 0,
+            l1_size: num_l2_clusters,
+            l1_table_offset: u64::from(cluster_size),
+            // The refcount table is after l1 + header.
+            refcount_table_offset: u64::from(cluster_size * (l1_clusters + 1)),
+            refcount_table_clusters: {
+                // Pre-allocate enough clusters for the entire refcount table as it must be
+                // continuous in the file. Allocate enough space to refcount all clusters, including
+                // the refcount clusters.
+                let max_refcount_clusters = max_refcount_clusters(
+                    DEFAULT_REFCOUNT_ORDER,
+                    cluster_size,
+                    num_clusters + l1_clusters + num_l2_clusters + header_clusters,
+                ) as u32;
+                // The refcount table needs to store the offset of each refcount cluster.
+                div_round_up_u32(
+                    max_refcount_clusters * size_of::<u64>() as u32,
+                    cluster_size,
+                )
+            },
+            nb_snapshots: 0,
+            snapshots_offset: 0,
+            incompatible_features: 0,
+            compatible_features: 0,
+            autoclear_features: 0,
+            refcount_order: DEFAULT_REFCOUNT_ORDER,
+            header_size,
+            compression_type: CompressionType::Zlib,
+            backing_file: backing_file.map(|path| BackingFileConfig {
+                path: String::from(path),
+                format: None,
+            }),
+        })
+    }
+
+    /// Write the header to `file`.
+    pub fn write_to<F: Write + Seek>(&self, file: &mut F) -> Result<()> {
+        // Writes the next u32 to the file.
+        fn write_u32_be<F: Write>(f: &mut F, value: u32) -> Result<()> {
+            u32::write_be(f, value).map_err(Error::WritingHeader)
+        }
+
+        // Writes the next u64 to the file.
+        fn write_u64_be<F: Write>(f: &mut F, value: u64) -> Result<()> {
+            u64::write_be(f, value).map_err(Error::WritingHeader)
+        }
+
+        write_u32_be(file, self.magic)?;
+        write_u32_be(file, self.version)?;
+        write_u64_be(file, self.backing_file_offset)?;
+        write_u32_be(file, self.backing_file_size)?;
+        write_u32_be(file, self.cluster_bits)?;
+        write_u64_be(file, self.size)?;
+        write_u32_be(file, self.crypt_method)?;
+        write_u32_be(file, self.l1_size)?;
+        write_u64_be(file, self.l1_table_offset)?;
+        write_u64_be(file, self.refcount_table_offset)?;
+        write_u32_be(file, self.refcount_table_clusters)?;
+        write_u32_be(file, self.nb_snapshots)?;
+        write_u64_be(file, self.snapshots_offset)?;
+
+        if self.version == 3 {
+            write_u64_be(file, self.incompatible_features)?;
+            write_u64_be(file, self.compatible_features)?;
+            write_u64_be(file, self.autoclear_features)?;
+            write_u32_be(file, self.refcount_order)?;
+            write_u32_be(file, self.header_size)?;
+
+            if self.header_size > V3_BARE_HEADER_SIZE {
+                write_u64_be(file, 0)?; // no compression
+            }
+
+            write_u32_be(file, 0)?; // header extension type: end of header extension area
+            write_u32_be(file, 0)?; // length of header extension data: 0
+        }
+
+        if let Some(backing_file_path) = self.backing_file.as_ref().map(|bf| &bf.path) {
+            if self.backing_file_offset > 0 {
+                file.seek(SeekFrom::Start(self.backing_file_offset))
+                    .map_err(Error::WritingHeader)?;
+            }
+            write!(file, "{backing_file_path}").map_err(Error::WritingHeader)?;
+        }
+
+        // Set the file length by seeking and writing a zero to the last byte. This avoids needing
+        // a `File` instead of anything that implements seek as the `file` argument.
+        // Zeros out the l1 and refcount table clusters.
+        let cluster_size = 0x01u64 << self.cluster_bits;
+        let refcount_blocks_size = u64::from(self.refcount_table_clusters) * cluster_size;
+        file.seek(SeekFrom::Start(
+            self.refcount_table_offset + refcount_blocks_size - 2,
+        ))
+        .map_err(Error::WritingHeader)?;
+        file.write(&[0u8]).map_err(Error::WritingHeader)?;
+
+        Ok(())
+    }
+
+    /// Write only the incompatible_features field to the file at its fixed offset.
+    fn write_incompatible_features<F: Write + Seek>(&self, file: &mut F) -> BlockResult<()> {
+        if self.version != 3 {
+            return Ok(());
+        }
+        file.seek(SeekFrom::Start(V2_BARE_HEADER_SIZE as u64))
+            .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::WritingHeader(e)))?;
+        u64::write_be(file, self.incompatible_features)
+            .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::WritingHeader(e)))?;
+        Ok(())
+    }
+
+    /// Set or clear the dirty bit for QCOW2 v3 images.
+    ///
+    /// When `dirty` is true, sets the bit to indicate the image is in use.
+    /// When `dirty` is false, clears the bit to indicate a clean shutdown.
+    pub fn set_dirty_bit<F: Write + Seek + FileSync>(
+        &mut self,
+        file: &mut F,
+        dirty: bool,
+    ) -> BlockResult<()> {
+        if self.version == 3 {
+            if dirty {
+                self.incompatible_features |= IncompatFeatures::DIRTY.bits();
+            } else {
+                self.incompatible_features &= !IncompatFeatures::DIRTY.bits();
+            }
+            self.write_incompatible_features(file)?;
+            file.fsync()
+                .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SyncingHeader(e)))?;
+        }
+        Ok(())
+    }
+
+    /// Set the corrupt bit for QCOW2 v3 images.
+    ///
+    /// This marks the image as corrupted. Once set, the image can only be
+    /// opened read-only until repaired.
+    pub fn set_corrupt_bit<F: Write + Seek + FileSync>(&mut self, file: &mut F) -> BlockResult<()> {
+        if self.version == 3 {
+            self.incompatible_features |= IncompatFeatures::CORRUPT.bits();
+            self.write_incompatible_features(file)?;
+            file.fsync()
+                .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SyncingHeader(e)))?;
+        }
+        Ok(())
+    }
+
+    pub fn is_corrupt(&self) -> bool {
+        IncompatFeatures::from_bits_truncate(self.incompatible_features)
+            .contains(IncompatFeatures::CORRUPT)
+    }
+
+    /// Clear all autoclear feature bits for QCOW2 v3 images.
+    ///
+    /// These bits indicate features that can be safely disabled when modified
+    /// by software that doesn't understand them.
+    pub fn clear_autoclear_features<F: Write + Seek + FileSync>(
+        &mut self,
+        file: &mut F,
+    ) -> Result<()> {
+        if self.version == 3 && self.autoclear_features != 0 {
+            self.autoclear_features = 0;
+            file.seek(SeekFrom::Start(AUTOCLEAR_FEATURES_OFFSET))
+                .map_err(Error::WritingHeader)?;
+            u64::write_be(file, 0).map_err(Error::WritingHeader)?;
+            file.fsync().map_err(Error::SyncingHeader)?;
+        }
+        Ok(())
+    }
+}
+
+pub(super) fn max_refcount_clusters(
+    refcount_order: u32,
+    cluster_size: u32,
+    num_clusters: u32,
+) -> u64 {
+    // Use u64 as the product of the u32 inputs can overflow.
+    let refcount_bits = 0x01u64 << u64::from(refcount_order);
+    let cluster_bits = u64::from(cluster_size) * 8;
+    let for_data = div_round_up_u64(u64::from(num_clusters) * refcount_bits, cluster_bits);
+    let for_refcounts = div_round_up_u64(for_data * refcount_bits, cluster_bits);
+    for_data + for_refcounts
+}
+
+/// Returns an Error if the given offset doesn't align to a cluster boundary.
+pub(super) fn offset_is_cluster_boundary(offset: u64, cluster_bits: u32) -> Result<()> { + if offset & ((0x01 << cluster_bits) - 1) != 0 { + return Err(Error::InvalidOffset(offset)); + } + Ok(()) +} diff --git a/block/src/qcow/metadata.rs b/block/src/qcow/metadata.rs new file mode 100644 index 0000000000..b4b64cabd0 --- /dev/null +++ b/block/src/qcow/metadata.rs @@ -0,0 +1,1068 @@ +// Copyright 2018 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. +// +// Copyright 2026 The Cloud Hypervisor Authors. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +//! QCOW2 metadata with lock based synchronization. +//! +//! QcowMetadata wraps the in memory QCOW2 metadata tables behind a single +//! coarse RwLock. This separates metadata lookup from data I/O, allowing +//! data reads and writes to proceed without holding the metadata lock. +//! +//! On L2 cache hit, map_clusters_for_read only needs a read lock with +//! pure shared reference access on the cache. Cache misses and all write +//! operations upgrade to a write lock. + +use std::cmp::min; +use std::io::{self, Seek}; +use std::mem; +use std::sync::RwLock; + +use libc::{EINVAL, EIO}; + +use super::qcow_raw_file::QcowRawFile; +use super::refcount::RefCount; +use super::util::{ + div_round_up_u64, l1_entry_make, l2_entry_compressed_cluster_layout, l2_entry_is_compressed, + l2_entry_is_empty, l2_entry_is_zero, l2_entry_make_std, l2_entry_make_zero, + l2_entry_std_cluster_addr, +}; +use super::vec_cache::{CacheMap, Cacheable, VecCache}; +use super::{QcowHeader, refcount}; + +/// Describes how to satisfy a guest read for a single cluster region. +/// +/// Returned by QcowMetadata::map_clusters_for_read. The caller performs +/// the actual data I/O using its own per queue file descriptor without +/// holding the metadata lock. 
+#[derive(Debug)]
+pub enum ClusterReadMapping {
+    /// The cluster is not allocated and the guest should see zeros.
+    /// This covers both truly unallocated clusters where the L1 or L2
+    /// entry is zero and clusters with the ZERO flag set.
+    Zero { length: u64 },
+
+    /// The cluster is allocated at the given host file offset.
+    /// The offset is the exact byte position combining cluster base and
+    /// intra cluster offset. The length is the number of bytes to read,
+    /// bounded by cluster boundary and guest request.
+    Allocated { offset: u64, length: u64 },
+
+    /// The cluster is compressed. The decompressed data is returned inline
+    /// because decompression is a CPU only operation that was done under the
+    /// write lock to access the raw compressed bytes from disk.
+    ///
+    /// The data field contains exactly the bytes the guest requested, already
+    /// sliced from the decompressed cluster.
+    Compressed { data: Vec<u8> },
+
+    /// The cluster is not allocated in this layer but may exist in a backing
+    /// file. The caller should delegate to the backing file at the given
+    /// guest offset for the specified length in bytes.
+    Backing { offset: u64, length: u64 },
+}
+
+/// Describes how to satisfy a guest write for a single cluster region.
+///
+/// Returned by QcowMetadata::map_cluster_for_write. The caller performs
+/// the actual data I/O using its own per queue file descriptor without
+/// holding the metadata lock.
+#[derive(Debug)]
+pub enum ClusterWriteMapping {
+    /// The write target is at the given host file offset.
+    /// This covers both already allocated clusters and freshly allocated ones.
+    /// The offset is the exact byte position combining cluster base and
+    /// intra cluster offset.
+    Allocated { offset: u64 },
+}
+
+/// Trait for reading from a backing file in a thread safe manner.
+///
+/// Used by QcowMetadata::deallocate_bytes so it can read COW data
+/// from the backing file without knowing the concrete backing type.
+pub(crate) trait BackingRead: Send + Sync {
+    fn read_at(&self, address: u64, buf: &mut [u8]) -> io::Result<()>;
+}
+
+/// Action that the caller must perform after deallocate_bytes.
+#[derive(Debug)]
+pub enum DeallocAction {
+    /// Punch a hole at the given host file offset for a full cluster.
+    PunchHole { host_offset: u64, length: u64 },
+    /// Write zeros at the given host file offset for a partial cluster.
+    WriteZeroes { host_offset: u64, length: usize },
+}
+
+/// Shared QCOW2 metadata protected by a coarse RwLock.
+///
+/// Holds the L1 table, L2 cache and refcount state in memory. L2 table
+/// entries and refcount blocks are read from disk on cache miss and
+/// written back on eviction or when dirty.
+///
+/// One instance is shared via Arc across all virtio blk queues. Each
+/// queue holds its own QcowRawFile clone for data I/O.
+///
+/// Steady state guest I/O is read dominant at the metadata level. Every
+/// read and every write to an already allocated cluster only needs an
+/// L1 to L2 lookup, which completes under a shared read lock. Only
+/// cluster allocation, L2 cache eviction and resize take the exclusive
+/// write lock, so contention stays low and queues scale.
+pub struct QcowMetadata {
+    inner: RwLock<QcowState>,
+}
+
+/// The actual metadata state, accessible only through the RwLock.
+pub(crate) struct QcowState {
+    pub(crate) header: QcowHeader,
+    pub(crate) l1_table: VecCache<u64>,
+    pub(crate) l2_entries: u64,
+    pub(crate) l2_cache: CacheMap<VecCache<u64>>,
+    pub(crate) refcounts: RefCount,
+    pub(crate) avail_clusters: Vec<u64>,
+    pub(crate) unref_clusters: Vec<u64>,
+    /// Dedicated file descriptor for metadata I/O covering L2 table reads,
+    /// refcount block reads and dirty eviction writes. This is a dup clone
+    /// of the original fd, separate from the per queue data I/O fds.
+ pub(crate) raw_file: QcowRawFile, +} + +impl QcowMetadata { + pub(crate) fn new(inner: QcowState) -> Self { + QcowMetadata { + inner: RwLock::new(inner), + } + } + + /// Maps a multicluster guest read range to a list of read mappings. + /// + /// This walks the range in cluster sized steps under a single lock + /// acquisition, reducing lock roundtrips for large reads. The returned + /// mappings are ordered by guest address and ready for io_uring + /// submission. The caller can coalesce adjacent allocated entries into + /// fewer submissions. + /// + /// On the read lock fast path, if all L2 tables are cached, the lookup + /// is pure memory access with no I/O and concurrent readers are allowed. + /// + /// On the write lock slow path, if an L2 cache miss occurs, the L2 + /// table is read from disk via the metadata fd, the cache is populated + /// and the mapping is returned. + /// + /// The has_backing_file flag indicates whether a backing file exists, + /// needed to distinguish zero versus backing for unallocated clusters. + pub fn map_clusters_for_read( + &self, + address: u64, + total_length: usize, + has_backing_file: bool, + ) -> io::Result> { + let inner = self.inner.read().unwrap(); + let cluster_size = inner.raw_file.cluster_size(); + let mut mappings = Vec::new(); + let mut mapped = 0usize; + let mut need_write_lock = false; + + // Fast path, try all chunks under read lock + while mapped < total_length { + let curr_addr = address + mapped as u64; + let offset_in_cluster = inner.raw_file.cluster_offset(curr_addr) as usize; + let count = min( + total_length - mapped, + cluster_size as usize - offset_in_cluster, + ); + + match inner.try_map_read(curr_addr, count, has_backing_file)? 
{ + Some(mapping) => mappings.push(mapping), + None => { + need_write_lock = true; + break; + } + } + mapped += count; + } + + if !need_write_lock { + return Ok(mappings); + } + + // Slow path, drop read lock, take write lock, redo from where we stopped + drop(inner); + let mut inner = self.inner.write().unwrap(); + + // Remap everything under write lock for consistency since the L2 cache + // may have been evicted between the read to write lock transition. + mappings.clear(); + mapped = 0; + + while mapped < total_length { + let curr_addr = address + mapped as u64; + let offset_in_cluster = inner.raw_file.cluster_offset(curr_addr) as usize; + let count = min( + total_length - mapped, + cluster_size as usize - offset_in_cluster, + ); + + mappings.push(inner.map_read_with_populate(curr_addr, count, has_backing_file)?); + mapped += count; + } + + Ok(mappings) + } + + /// Maps a guest write address to a write mapping. + /// + /// Always takes a write lock since writes may need to allocate clusters, + /// update L2 entries and update refcounts. + /// + /// The backing_data parameter is the COW source. If the cluster is + /// unallocated and a backing file exists, the caller should have already + /// read the backing cluster data and pass it here. If None, the new + /// cluster is zeroed. + pub fn map_cluster_for_write( + &self, + address: u64, + backing_data: Option>, + ) -> io::Result { + let mut inner = self.inner.write().unwrap(); + inner.map_write(address, backing_data) + } + + pub fn flush(&self) -> io::Result<()> { + let mut inner = self.inner.write().unwrap(); + inner.sync_caches()?; + let mut unref = mem::take(&mut inner.unref_clusters); + inner.avail_clusters.append(&mut unref); + Ok(()) + } + + /// Flushes dirty metadata caches and clears the dirty bit for + /// clean shutdown. + pub fn shutdown(&self) { + let mut inner = self.inner.write().unwrap(); + let _ = inner.sync_caches(); + let QcowState { + ref mut header, + ref mut raw_file, + .. 
+ } = *inner; + if raw_file.file().is_writable() { + let _ = header.set_dirty_bit(raw_file.file_mut(), false); + } + } + + /// Resizes the QCOW2 image to the given new size. Only grow is + /// supported, shrink would require walking all L2 tables to reclaim + /// clusters beyond the new size and risks data loss. + /// + /// Returns an error if the new size is smaller than the current size. + pub fn resize(&self, new_size: u64) -> io::Result<()> { + let mut inner = self.inner.write().unwrap(); + inner.resize(new_size) + } + + /// Deallocates a range of bytes. Full clusters are deallocated via metadata. + /// Partial clusters need the caller to write zeros. This method returns a + /// list of actions the caller should take. + pub(crate) fn deallocate_bytes( + &self, + address: u64, + length: usize, + sparse: bool, + virtual_size: u64, + cluster_size: u64, + backing_file: Option<&dyn BackingRead>, + ) -> io::Result> { + if address.checked_add(length as u64).is_none() { + return Ok(Vec::new()); + } + let mut inner = self.inner.write().unwrap(); + let mut actions = Vec::new(); + + let file_end = virtual_size; + let remaining_in_file = file_end.saturating_sub(address); + let write_count = min(length as u64, remaining_in_file) as usize; + + let mut nwritten = 0usize; + while nwritten < write_count { + let curr_addr = address + nwritten as u64; + let offset_in_cluster = inner.raw_file.cluster_offset(curr_addr) as usize; + let count = min( + write_count - nwritten, + cluster_size as usize - offset_in_cluster, + ); + + if count == cluster_size as usize { + let punch_offset = inner.deallocate_cluster(curr_addr, sparse)?; + if let Some(host_offset) = punch_offset { + actions.push(DeallocAction::PunchHole { + host_offset, + length: cluster_size, + }); + } + } else { + // Partial cluster - COW from backing to preserve non zeroed bytes, + // then the caller writes zeros to the partial range. 
+ let backing_data = if let Some(backing) = backing_file { + let cluster_begin = curr_addr - offset_in_cluster as u64; + let mut data = vec![0u8; cluster_size as usize]; + backing.read_at(cluster_begin, &mut data)?; + Some(data) + } else { + None + }; + let mapping = inner.map_write(curr_addr, backing_data)?; + let ClusterWriteMapping::Allocated { offset } = mapping; + actions.push(DeallocAction::WriteZeroes { + host_offset: offset, + length: count, + }); + } + + nwritten += count; + } + Ok(actions) + } + + pub fn virtual_size(&self) -> u64 { + self.inner.read().unwrap().header.size + } + + pub fn cluster_size(&self) -> u64 { + self.inner.read().unwrap().raw_file.cluster_size() + } + + /// Returns the intra cluster byte offset for a given guest address. + pub fn cluster_offset(&self, address: u64) -> u64 { + self.inner.read().unwrap().raw_file.cluster_offset(address) + } +} + +impl QcowState { + /// Fast path read mapping under read lock only. Returns None on cache + /// miss. + /// + /// All access here is through shared reference. CacheMap::get, + /// VecCache::get and index operations are all shared reference compatible. 
+    fn try_map_read(
+        &self,
+        address: u64,
+        count: usize,
+        has_backing_file: bool,
+    ) -> io::Result<Option<ClusterReadMapping>> {
+        if address >= self.header.size {
+            return Err(io::Error::from_raw_os_error(EINVAL));
+        }
+
+        let l1_index = self.l1_table_index(address) as usize;
+        let l2_addr_disk = match self.l1_table.get(l1_index) {
+            Some(&addr) => addr,
+            None => return Err(io::Error::from_raw_os_error(EINVAL)),
+        };
+
+        if l2_addr_disk == 0 {
+            return Ok(Some(self.unallocated_read_mapping(
+                address,
+                count,
+                has_backing_file,
+            )));
+        }
+
+        let l2_table = match self.l2_cache.get(l1_index) {
+            Some(table) => table,
+            None => return Ok(None), // cache miss, need write lock
+        };
+
+        let l2_index = self.l2_table_index(address) as usize;
+        let l2_entry = l2_table[l2_index];
+
+        // Compressed entries require disk I/O for decompression - can't do
+        // that under a read lock. Fall through to the write lock path.
+        if l2_entry_is_compressed(l2_entry) {
+            return Ok(None);
+        }
+
+        if l2_entry_is_empty(l2_entry) {
+            Ok(Some(self.unallocated_read_mapping(
+                address,
+                count,
+                has_backing_file,
+            )))
+        } else if l2_entry_is_zero(l2_entry) {
+            // Match original QcowFile::file_read semantics where zero flagged
+            // entries fall through to backing file when one exists or return
+            // zeros otherwise.
+            Ok(Some(self.unallocated_read_mapping(
+                address,
+                count,
+                has_backing_file,
+            )))
+        } else {
+            let cluster_addr = l2_entry_std_cluster_addr(l2_entry);
+            let cluster_size = self.raw_file.cluster_size();
+            if cluster_addr & (cluster_size - 1) != 0 {
+                // Fall through to write lock path which sets the corrupt bit
+                return Ok(None);
+            }
+            let intra_offset = self.raw_file.cluster_offset(address);
+            Ok(Some(ClusterReadMapping::Allocated {
+                offset: cluster_addr + intra_offset,
+                length: count as u64,
+            }))
+        }
+    }
+
+    /// Slow path read mapping. Requires exclusive access to populate cache.
+ fn map_read_with_populate( + &mut self, + address: u64, + count: usize, + has_backing_file: bool, + ) -> io::Result { + if address >= self.header.size { + return Err(io::Error::from_raw_os_error(EINVAL)); + } + + let l1_index = self.l1_table_index(address) as usize; + let l2_addr_disk = match self.l1_table.get(l1_index) { + Some(&addr) => addr, + None => return Err(io::Error::from_raw_os_error(EINVAL)), + }; + + if l2_addr_disk == 0 { + return Ok(self.unallocated_read_mapping(address, count, has_backing_file)); + } + + // Populate cache if needed as this does I/O via the metadata raw file + self.cache_l2_cluster(l1_index, l2_addr_disk)?; + + let l2_index = self.l2_table_index(address) as usize; + let l2_entry = self.l2_cache.get(l1_index).unwrap()[l2_index]; + + if l2_entry_is_empty(l2_entry) { + Ok(self.unallocated_read_mapping(address, count, has_backing_file)) + } else if l2_entry_is_compressed(l2_entry) { + // Under write lock we can do I/O for decompression + let decompressed = self.decompress_l2_cluster(l2_entry)?; + let start = self.raw_file.cluster_offset(address) as usize; + let end = start + .checked_add(count) + .ok_or_else(|| io::Error::from_raw_os_error(EINVAL))?; + if end > decompressed.len() { + return Err(io::Error::from_raw_os_error(EINVAL)); + } + Ok(ClusterReadMapping::Compressed { + data: decompressed[start..end].to_vec(), + }) + } else if l2_entry_is_zero(l2_entry) { + // Match original QcowFile::file_read semantics where zero flagged + // entries fall through to backing file when one exists or return + // zeros otherwise. 
+ Ok(self.unallocated_read_mapping(address, count, has_backing_file)) + } else { + let cluster_addr = l2_entry_std_cluster_addr(l2_entry); + let cluster_size = self.raw_file.cluster_size(); + if cluster_addr & (cluster_size - 1) != 0 { + self.set_corrupt_bit_best_effort(); + return Err(io::Error::from_raw_os_error(EIO)); + } + let intra_offset = self.raw_file.cluster_offset(address); + Ok(ClusterReadMapping::Allocated { + offset: cluster_addr + intra_offset, + length: count as u64, + }) + } + } + + fn unallocated_read_mapping( + &self, + address: u64, + count: usize, + has_backing_file: bool, + ) -> ClusterReadMapping { + if has_backing_file { + ClusterReadMapping::Backing { + offset: address, + length: count as u64, + } + } else { + ClusterReadMapping::Zero { + length: count as u64, + } + } + } + + /// Maps a single cluster region for a sequential read. + pub(crate) fn map_cluster_read( + &mut self, + address: u64, + count: usize, + has_backing_file: bool, + ) -> io::Result { + match self.try_map_read(address, count, has_backing_file)? { + Some(mapping) => Ok(mapping), + None => self.map_read_with_populate(address, count, has_backing_file), + } + } + + /// Write path mapping. Always called under write lock. + fn map_write( + &mut self, + address: u64, + backing_data: Option>, + ) -> io::Result { + if address >= self.header.size { + return Err(io::Error::from_raw_os_error(EINVAL)); + } + + let l1_index = self.l1_table_index(address) as usize; + let l2_addr_disk = match self.l1_table.get(l1_index) { + Some(&addr) => addr, + None => return Err(io::Error::from_raw_os_error(EINVAL)), + }; + let l2_index = self.l2_table_index(address) as usize; + + let mut set_refcounts = Vec::new(); + + if let Some(new_addr) = self.cache_l2_cluster_alloc(l1_index, l2_addr_disk)? 
{ + set_refcounts.push((new_addr, 1)); + } + + let l2_entry = self.l2_cache.get(l1_index).unwrap()[l2_index]; + let cluster_addr = if l2_entry_is_compressed(l2_entry) { + let decompressed_cluster = self.decompress_l2_cluster(l2_entry)?; + let cluster_addr = self.append_data_cluster(None)?; + self.update_cluster_addr(l1_index, l2_index, cluster_addr, &mut set_refcounts)?; + self.raw_file + .file_mut() + .seek(io::SeekFrom::Start(cluster_addr))?; + let nwritten = io::Write::write(self.raw_file.file_mut(), &decompressed_cluster)?; + if nwritten != decompressed_cluster.len() { + self.set_corrupt_bit_best_effort(); + return Err(io::Error::from_raw_os_error(EIO)); + } + self.deallocate_compressed_cluster(l2_entry)?; + cluster_addr + } else if l2_entry_is_empty(l2_entry) || l2_entry_is_zero(l2_entry) { + let cluster_addr = self.append_data_cluster(backing_data)?; + self.update_cluster_addr(l1_index, l2_index, cluster_addr, &mut set_refcounts)?; + cluster_addr + } else { + // Already allocated - validate alignment + let cluster_addr = l2_entry_std_cluster_addr(l2_entry); + if cluster_addr & (self.raw_file.cluster_size() - 1) != 0 { + self.set_corrupt_bit_best_effort(); + return Err(io::Error::from_raw_os_error(EIO)); + } + cluster_addr + }; + + // Apply deferred refcount updates + for (addr, refcount) in set_refcounts { + self.set_cluster_refcount_track_freed(addr, refcount)?; + } + + let intra_offset = self.raw_file.cluster_offset(address); + Ok(ClusterWriteMapping::Allocated { + offset: cluster_addr + intra_offset, + }) + } + + // -- Address computation helpers -- + + fn l1_table_index(&self, address: u64) -> u64 { + (address / self.raw_file.cluster_size()) / self.l2_entries + } + + fn l2_table_index(&self, address: u64) -> u64 { + (address / self.raw_file.cluster_size()) % self.l2_entries + } + + // -- Cache and allocation operations requiring exclusive access -- + + /// Populates the L2 cache for read operations without allocation. 
+    fn cache_l2_cluster(&mut self, l1_index: usize, l2_addr_disk: u64) -> io::Result<()> {
+        if !self.l2_cache.contains_key(l1_index) {
+            let cluster_size = self.raw_file.cluster_size();
+            if l2_addr_disk & (cluster_size - 1) != 0 {
+                self.set_corrupt_bit_best_effort();
+                return Err(io::Error::from_raw_os_error(EIO));
+            }
+            let l2_table =
+                VecCache::from_vec(self.raw_file.read_pointer_cluster(l2_addr_disk, None)?);
+            let l1_table = &self.l1_table;
+            let raw_file = &mut self.raw_file;
+            self.l2_cache.insert(l1_index, l2_table, |index, evicted| {
+                raw_file.write_pointer_table_direct(l1_table[index], evicted.iter())
+            })?;
+        }
+        Ok(())
+    }
+
+    /// Populates the L2 cache for write operations and may allocate a new
+    /// L2 table. Returns the address of the newly allocated cluster if any.
+    fn cache_l2_cluster_alloc(
+        &mut self,
+        l1_index: usize,
+        l2_addr_disk: u64,
+    ) -> io::Result<Option<u64>> {
+        let mut new_cluster: Option<u64> = None;
+        if !self.l2_cache.contains_key(l1_index) {
+            let l2_table = if l2_addr_disk == 0 {
+                // Allocate a new cluster to store the L2 table
+                let new_addr = self.get_new_cluster(None)?;
+                new_cluster = Some(new_addr);
+                self.l1_table[l1_index] = new_addr;
+                VecCache::new(self.l2_entries as usize)
+            } else {
+                let cluster_size = self.raw_file.cluster_size();
+                if l2_addr_disk & (cluster_size - 1) != 0 {
+                    self.set_corrupt_bit_best_effort();
+                    return Err(io::Error::from_raw_os_error(EIO));
+                }
+                VecCache::from_vec(self.raw_file.read_pointer_cluster(l2_addr_disk, None)?)
+            };
+            let l1_table = &self.l1_table;
+            let raw_file = &mut self.raw_file;
+            self.l2_cache.insert(l1_index, l2_table, |index, evicted| {
+                raw_file.write_pointer_table_direct(l1_table[index], evicted.iter())
+            })?;
+        }
+        Ok(new_cluster)
+    }
+
+    /// Allocates a new cluster from the free list or by extending the file.
+    fn get_new_cluster(&mut self, initial_data: Option<Vec<u8>>) -> io::Result<u64> {
+        if let Some(free_cluster) = self.avail_clusters.pop() {
+            if free_cluster == 0 {
+                self.set_corrupt_bit_best_effort();
+                return Err(io::Error::from_raw_os_error(EIO));
+            }
+            if let Some(initial_data) = initial_data {
+                self.raw_file.write_cluster(free_cluster, &initial_data)?;
+            } else {
+                self.raw_file.zero_cluster(free_cluster)?;
+            }
+            return Ok(free_cluster);
+        }
+
+        let max_valid = self.refcounts.max_valid_cluster_offset();
+        if let Some(new_cluster) = self.raw_file.add_cluster_end(max_valid)? {
+            if new_cluster == 0 {
+                self.set_corrupt_bit_best_effort();
+                return Err(io::Error::from_raw_os_error(EIO));
+            }
+            if let Some(initial_data) = initial_data {
+                self.raw_file.write_cluster(new_cluster, &initial_data)?;
+            }
+            Ok(new_cluster)
+        } else {
+            log::error!("No free clusters in get_new_cluster()");
+            Err(io::Error::from_raw_os_error(libc::ENOSPC))
+        }
+    }
+
+    /// Allocates a data cluster and sets its refcount to 1.
+    fn append_data_cluster(&mut self, initial_data: Option<Vec<u8>>) -> io::Result<u64> {
+        let new_addr = self.get_new_cluster(initial_data)?;
+        self.set_cluster_refcount_track_freed(new_addr, 1)?;
+        Ok(new_addr)
+    }
+
+    /// Updates the L1 and L2 tables to point to a new cluster address.
+    fn update_cluster_addr(
+        &mut self,
+        l1_index: usize,
+        l2_index: usize,
+        cluster_addr: u64,
+        set_refcounts: &mut Vec<(u64, u64)>,
+    ) -> io::Result<()> {
+        if !self.l2_cache.get(l1_index).unwrap().dirty() {
+            // Free the previously used cluster if one exists. Modified tables are always
+            // written to new clusters so the L1 table can be committed to disk after they
+            // are and L1 never points at an invalid table.
+            let addr = self.l1_table[l1_index];
+            if addr != 0 {
+                self.unref_clusters.push(addr);
+                set_refcounts.push((addr, 0));
+            }
+
+            // Allocate a new cluster to store the L2 table and update the L1 table to point
+            // to the new table. The cluster will be written when the cache is flushed.
+            let new_addr = self.get_new_cluster(None)?;
+            set_refcounts.push((new_addr, 1));
+            self.l1_table[l1_index] = new_addr; // marks l1_table dirty via IndexMut
+        }
+        // Write the L2 entry - IndexMut marks the L2 table dirty automatically.
+        self.l2_cache.get_mut(l1_index).unwrap()[l2_index] = l2_entry_make_std(cluster_addr);
+        Ok(())
+    }
+
+    /// Resizes the image to the given new size. Only grow is supported,
+    /// shrink would require walking all L2 tables to reclaim clusters
+    /// beyond the new size and risks data loss.
+    fn resize(&mut self, new_size: u64) -> io::Result<()> {
+        let current_size = self.header.size;
+
+        if new_size == current_size {
+            return Ok(());
+        }
+
+        if new_size < current_size {
+            return Err(io::Error::other("shrinking QCOW2 images is not supported"));
+        }
+
+        let cluster_size = self.raw_file.cluster_size();
+        let entries_per_cluster = cluster_size / size_of::<u64>() as u64;
+        let new_clusters = div_round_up_u64(new_size, cluster_size);
+        let needed_l1_entries = div_round_up_u64(new_clusters, entries_per_cluster) as u32;
+
+        if needed_l1_entries > self.header.l1_size {
+            self.grow_l1_table(needed_l1_entries)?;
+        }
+
+        self.header.size = new_size;
+
+        self.raw_file.file_mut().rewind()?;
+        self.header
+            .write_to(self.raw_file.file_mut())
+            .map_err(|e| io::Error::other(format!("failed to write header during resize: {e}")))?;
+
+        self.raw_file.file_mut().sync_all()?;
+
+        Ok(())
+    }
+
+    /// Grows the L1 table to accommodate at least the requested number of entries.
+ fn grow_l1_table(&mut self, new_l1_size: u32) -> io::Result<()> { + let old_l1_size = self.header.l1_size; + let old_l1_offset = self.header.l1_table_offset; + let cluster_size = self.raw_file.cluster_size(); + + let new_l1_bytes = new_l1_size as u64 * size_of::() as u64; + let new_l1_clusters = div_round_up_u64(new_l1_bytes, cluster_size); + + // Allocate contiguous clusters at file end for new L1 table + let file_size = self.raw_file.file_mut().seek(io::SeekFrom::End(0))?; + let new_l1_offset = self.raw_file.cluster_address(file_size + cluster_size - 1); + + let new_file_end = new_l1_offset + new_l1_clusters * cluster_size; + self.raw_file.file_mut().set_len(new_file_end)?; + + // Set refcounts for the contiguous range + for i in 0..new_l1_clusters { + self.set_cluster_refcount_track_freed(new_l1_offset + i * cluster_size, 1)?; + } + + let mut new_l1_data = vec![0u64; new_l1_size as usize]; + let old_entries = self.l1_table.get_values(); + new_l1_data[..old_entries.len()].copy_from_slice(old_entries); + + for l2_addr in new_l1_data.iter_mut() { + if *l2_addr != 0 { + let refcount = self + .refcounts + .get_cluster_refcount(&mut self.raw_file, *l2_addr) + .map_err(|e| { + io::Error::other(format!("failed to get refcount during resize: {e}")) + })?; + *l2_addr = l1_entry_make(*l2_addr, refcount == 1); + } + } + + // Write the new L1 table to disk + self.raw_file + .write_pointer_table_direct(new_l1_offset, new_l1_data.iter())?; + + self.raw_file.file_mut().sync_all()?; + + self.header.l1_size = new_l1_size; + self.header.l1_table_offset = new_l1_offset; + + self.raw_file.file_mut().rewind()?; + self.header + .write_to(self.raw_file.file_mut()) + .map_err(|e| io::Error::other(format!("failed to write header during resize: {e}")))?; + + self.raw_file.file_mut().sync_all()?; + + // Free old L1 table clusters + let old_l1_bytes = old_l1_size as u64 * size_of::() as u64; + let old_l1_clusters = div_round_up_u64(old_l1_bytes, cluster_size); + for i in 
0..old_l1_clusters { + let cluster_addr = old_l1_offset + i * cluster_size; + // Best effort: the old L1 clusters are no longer reachable, + // so a refcount update failure just leaks space. + let _ = self.set_cluster_refcount(cluster_addr, 0); + } + + // Update L1 table cache + self.l1_table.extend(new_l1_size as usize); + + Ok(()) + } + + /// Deallocates a cluster at the given guest address. + /// + /// If sparse is true, fully deallocates and returns the host offset if + /// the underlying storage should be punched after the refcount dropped + /// to zero. If sparse is false, uses the zero flag optimization when + /// possible. + /// + /// Returns None if no host punch_hole is needed. + pub(super) fn deallocate_cluster( + &mut self, + address: u64, + sparse: bool, + ) -> io::Result> { + if address >= self.header.size { + return Err(io::Error::from_raw_os_error(EINVAL)); + } + + let l1_index = self.l1_table_index(address) as usize; + let l2_addr_disk = match self.l1_table.get(l1_index) { + Some(&addr) => addr, + None => return Err(io::Error::from_raw_os_error(EINVAL)), + }; + let l2_index = self.l2_table_index(address) as usize; + + if l2_addr_disk == 0 { + return Ok(None); + } + + self.cache_l2_cluster(l1_index, l2_addr_disk)?; + + let l2_entry = self.l2_cache.get(l1_index).unwrap()[l2_index]; + if l2_entry_is_empty(l2_entry) || l2_entry_is_zero(l2_entry) { + return Ok(None); + } + + if l2_entry_is_compressed(l2_entry) { + self.deallocate_compressed_cluster(l2_entry)?; + self.l2_cache.get_mut(l1_index).unwrap()[l2_index] = 0; + return Ok(None); + } + + let cluster_addr = l2_entry_std_cluster_addr(l2_entry); + let refcount = self + .refcounts + .get_cluster_refcount(&mut self.raw_file, cluster_addr) + .map_err(|e| { + if matches!(e, refcount::Error::RefblockUnaligned(_)) { + self.set_corrupt_bit_best_effort(); + } + io::Error::new( + io::ErrorKind::InvalidData, + format!("failed to get cluster refcount: {e}"), + ) + })?; + if refcount == 0 { + return 
Err(io::Error::from_raw_os_error(EINVAL)); + } + + if sparse { + let new_refcount = refcount - 1; + self.set_cluster_refcount_track_freed(cluster_addr, new_refcount)?; + self.l2_cache.get_mut(l1_index).unwrap()[l2_index] = 0; + if new_refcount == 0 { + self.unref_clusters.push(cluster_addr); + return Ok(Some(cluster_addr)); + } + } else if refcount == 1 { + self.l2_cache.get_mut(l1_index).unwrap()[l2_index] = l2_entry_make_zero(cluster_addr); + } else { + self.set_cluster_refcount_track_freed(cluster_addr, refcount - 1)?; + self.l2_cache.get_mut(l1_index).unwrap()[l2_index] = 0; + } + Ok(None) + } + + /// Sets refcount for a cluster, tracking any newly freed clusters. + fn set_cluster_refcount_track_freed(&mut self, address: u64, refcount: u64) -> io::Result<()> { + let mut newly_unref = self.set_cluster_refcount(address, refcount)?; + self.unref_clusters.append(&mut newly_unref); + Ok(()) + } + + /// Sets the refcount for a cluster. Returns freed cluster addresses. + fn set_cluster_refcount(&mut self, address: u64, refcount: u64) -> io::Result> { + let mut added_clusters = Vec::new(); + let mut unref_clusters = Vec::new(); + let mut refcount_set = false; + let mut new_cluster = None; + + while !refcount_set { + match self.refcounts.set_cluster_refcount( + &mut self.raw_file, + address, + refcount, + new_cluster.take(), + ) { + Ok(None) => { + refcount_set = true; + } + Ok(Some(freed_cluster)) => { + let mut freed = self.set_cluster_refcount(freed_cluster, 0)?; + unref_clusters.append(&mut freed); + refcount_set = true; + } + Err(refcount::Error::EvictingRefCounts(e)) => { + return Err(e); + } + Err(refcount::Error::InvalidIndex) => { + self.set_corrupt_bit_best_effort(); + return Err(io::Error::from_raw_os_error(EINVAL)); + } + Err(refcount::Error::NeedCluster(addr)) => { + new_cluster = Some(( + addr, + VecCache::from_vec(self.raw_file.read_refcount_block(addr)?), + )); + } + Err(refcount::Error::NeedNewCluster) => { + let addr = self.get_new_cluster(None)?; + 
added_clusters.push(addr); + new_cluster = Some(( + addr, + VecCache::new(self.refcounts.refcounts_per_block() as usize), + )); + } + Err(refcount::Error::ReadingRefCounts(e)) => { + return Err(e); + } + Err(refcount::Error::RefcountOverflow { .. }) => { + return Err(io::Error::from_raw_os_error(EINVAL)); + } + Err(refcount::Error::RefblockUnaligned(_)) => { + self.set_corrupt_bit_best_effort(); + return Err(io::Error::from_raw_os_error(EIO)); + } + } + } + + for addr in added_clusters { + self.set_cluster_refcount(addr, 1)?; + } + Ok(unref_clusters) + } + + /// Flushes all dirty metadata to disk. + pub(super) fn sync_caches(&mut self) -> io::Result<()> { + // Write out all dirty L2 tables. + for (l1_index, l2_table) in self.l2_cache.iter_mut().filter(|(_k, v)| v.dirty()) { + let addr = self.l1_table[*l1_index]; + if addr != 0 { + self.raw_file + .write_pointer_table_direct(addr, l2_table.iter())?; + } else { + self.set_corrupt_bit_best_effort(); + return Err(io::Error::from_raw_os_error(EINVAL)); + } + l2_table.mark_clean(); + } + // Write the modified refcount blocks. + self.refcounts.flush_blocks(&mut self.raw_file)?; + // Sync metadata and data clusters. + self.raw_file.file_mut().sync_all()?; + + // Push L1 table and refcount table last. + let mut sync_required = if self.l1_table.dirty() { + let refcounts = &mut self.refcounts; + self.raw_file.write_pointer_table( + self.header.l1_table_offset, + self.l1_table.iter(), + |raw_file, l2_addr| { + if l2_addr == 0 { + Ok(0) + } else { + let refcount = refcounts + .get_cluster_refcount(raw_file, l2_addr) + .map_err(|e| io::Error::other(super::Error::GettingRefcount(e)))?; + Ok(l1_entry_make(l2_addr, refcount == 1)) + } + }, + )?; + self.l1_table.mark_clean(); + true + } else { + false + }; + sync_required |= self.refcounts.flush_table(&mut self.raw_file)?; + if sync_required { + self.raw_file.file_mut().sync_data()?; + } + + Ok(()) + } + + /// Decompresses a compressed cluster, returning the raw decompressed bytes. 
+ fn decompress_l2_cluster(&mut self, l2_entry: u64) -> io::Result> { + let (compressed_addr, compressed_size) = + l2_entry_compressed_cluster_layout(l2_entry, self.header.cluster_bits); + self.raw_file + .file_mut() + .seek(io::SeekFrom::Start(compressed_addr))?; + let mut compressed = vec![0u8; compressed_size]; + io::Read::read_exact(self.raw_file.file_mut(), &mut compressed)?; + let decoder = self.header.get_decoder(); + let cluster_size = self.raw_file.cluster_size() as usize; + let mut decompressed = vec![0u8; cluster_size]; + let decompressed_size = decoder + .decode(&compressed, &mut decompressed) + .map_err(|_| { + self.set_corrupt_bit_best_effort(); + io::Error::from_raw_os_error(EIO) + })?; + if decompressed_size as u64 != self.raw_file.cluster_size() { + self.set_corrupt_bit_best_effort(); + return Err(io::Error::from_raw_os_error(EIO)); + } + Ok(decompressed) + } + + /// Deallocates the clusters spanned by a compressed L2 entry. + fn deallocate_compressed_cluster(&mut self, l2_entry: u64) -> io::Result<()> { + let (compressed_addr, compressed_size) = + l2_entry_compressed_cluster_layout(l2_entry, self.header.cluster_bits); + let cluster_size = self.raw_file.cluster_size(); + + // Calculate the end of the compressed data region + let compressed_clusters_end = self.raw_file.cluster_address( + compressed_addr // Start of compressed data + + compressed_size as u64 // Add size to get end address + + cluster_size + - 1, // Catch possibly partially used last cluster + ); + + // Decrement refcount for each cluster spanned by the compressed data + let mut addr = self.raw_file.cluster_address(compressed_addr); + while addr < compressed_clusters_end { + let refcount = self + .refcounts + .get_cluster_refcount(&mut self.raw_file, addr) + .map_err(|e| { + if matches!(e, refcount::Error::RefblockUnaligned(_)) { + self.set_corrupt_bit_best_effort(); + } + io::Error::new( + io::ErrorKind::InvalidData, + format!("failed to get cluster refcount: {e}"), + ) + })?; + if 
refcount > 0 { + self.set_cluster_refcount_track_freed(addr, refcount - 1)?; + } + addr += cluster_size; + } + Ok(()) + } + + /// Best effort attempt to mark the image corrupt. + fn set_corrupt_bit_best_effort(&mut self) { + if let Err(e) = self.header.set_corrupt_bit(self.raw_file.file_mut()) { + log::warn!("Failed to persist corrupt bit: {e}"); + } + } +} diff --git a/block/src/qcow/mod.rs b/block/src/qcow/mod.rs index fd932406c0..4fc4916f30 100644 --- a/block/src/qcow/mod.rs +++ b/block/src/qcow/mod.rs @@ -4,46 +4,67 @@ // // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause +pub(crate) mod backing; mod decoder; -mod qcow_raw_file; +mod header; +pub(crate) mod metadata; +pub(crate) mod qcow_raw_file; mod raw_file; mod refcount; +mod util; mod vec_cache; use std::cmp::{max, min}; -use std::fmt::{Debug, Display, Formatter, Result as FmtResult}; +use std::fmt::{Debug, Formatter, Result as FmtResult}; use std::fs::{OpenOptions, read_link}; use std::io::{self, Read, Seek, SeekFrom, Write}; use std::mem::size_of; use std::os::fd::{AsRawFd, RawFd}; -use std::str::{self, FromStr}; +use std::str; -use bitflags::bitflags; +#[cfg(test)] +use header::{ + AUTOCLEAR_FEATURES_OFFSET, DEFAULT_REFCOUNT_ORDER, HEADER_EXT_BACKING_FORMAT, HEADER_EXT_END, + V2_BARE_HEADER_SIZE, V3_BARE_HEADER_SIZE, +}; +pub use header::{ + BackingFileConfig, CompressionType, ImageType, IncompatFeatures, MissingFeatureError, + QcowHeader, +}; +use header::{ + COMPATIBLE_FEATURES_LAZY_REFCOUNTS, MAX_CLUSTER_BITS, MAX_QCOW_FILE_SIZE, + MAX_RAM_POINTER_TABLE_SIZE, MIN_CLUSTER_BITS, QCOW_MAGIC, max_refcount_clusters, + offset_is_cluster_boundary, +}; use libc::{EINVAL, EIO, ENOSPC}; use log::{error, warn}; +use metadata::ClusterReadMapping; use remain::sorted; use thiserror::Error; +pub(crate) use util::MAX_NESTING_DEPTH; +use util::{ + L1_TABLE_OFFSET_MASK, L2_TABLE_OFFSET_MASK, div_round_up_u32, div_round_up_u64, l1_entry_make, + l2_entry_compressed_cluster_layout, l2_entry_is_compressed, 
l2_entry_is_empty, + l2_entry_is_zero, l2_entry_make_std, l2_entry_make_zero, l2_entry_std_cluster_addr, +}; use vmm_sys_util::file_traits::{FileSetLen, FileSync}; use vmm_sys_util::seek_hole::SeekHole; use vmm_sys_util::write_zeroes::{PunchHole, WriteZeroesAt}; use crate::BlockBackend; -use crate::qcow::decoder::{Decoder, ZlibDecoder, ZstdDecoder}; +use crate::error::{BlockError, BlockErrorKind, BlockResult}; use crate::qcow::qcow_raw_file::{BeUint, QcowRawFile}; pub use crate::qcow::raw_file::RawFile; use crate::qcow::refcount::RefCount; use crate::qcow::vec_cache::{CacheMap, Cacheable, VecCache}; -/// Nesting depth limit for disk formats that can open other disk files. -pub(super) const MAX_NESTING_DEPTH: u32 = 10; - #[sorted] #[derive(Debug, Error)] pub enum Error { - #[error("Backing file io error")] - BackingFileIo(#[source] io::Error), - #[error("Backing file open error")] - BackingFileOpen(#[source] Box), + #[error("Backing file I/O error: {0}")] + BackingFileIo(String /* path */, #[source] io::Error), + #[error("Backing file open error: {0}")] + BackingFileOpen(String /* path */, #[source] Box), #[error("Backing file support is disabled")] BackingFilesDisabled, #[error("Backing file name is too long: {0} bytes over")] @@ -144,739 +165,463 @@ pub enum Error { pub type Result = std::result::Result; -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -pub enum ImageType { - Raw, - Qcow2, -} - -impl Display for ImageType { - fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - match self { - ImageType::Raw => write!(f, "raw"), - ImageType::Qcow2 => write!(f, "qcow2"), - } - } +/// Concrete backing file variants. +pub(crate) enum BackingKind { + /// Raw backing file. + Raw(RawFile), + /// QCOW2 backing parsed into metadata and raw file. + Qcow { + inner: Box, + backing: Option>, + }, + /// Full QcowFile used as backing, only in tests. 
+ #[cfg(test)] + QcowFile(Box), } - -impl FromStr for ImageType { - type Err = Error; - - fn from_str(s: &str) -> Result { - match s { - "raw" => Ok(ImageType::Raw), - "qcow2" => Ok(ImageType::Qcow2), - _ => Err(Error::UnsupportedBackingFileFormat(s.to_string())), - } - } -} - -#[derive(Clone, Debug)] -pub enum CompressionType { - Zlib, - Zstd, -} - -#[derive(Debug, Clone)] -pub struct BackingFileConfig { - pub path: String, - // If this is None, we will autodetect it. - pub format: Option, -} - -// Maximum data size supported. -const MAX_QCOW_FILE_SIZE: u64 = 0x01 << 44; // 16 TB. - -// QCOW magic constant that starts the header. -const QCOW_MAGIC: u32 = 0x5146_49fb; -// Default to a cluster size of 2^DEFAULT_CLUSTER_BITS -const DEFAULT_CLUSTER_BITS: u32 = 16; -// Limit clusters to reasonable sizes. Choose the same limits as qemu. Making the clusters smaller -// increases the amount of overhead for book keeping. -const MIN_CLUSTER_BITS: u32 = 9; -const MAX_CLUSTER_BITS: u32 = 21; -// The L1 and RefCount table are kept in RAM, only handle files that require less than 35M entries. -// This easily covers 1 TB files. When support for bigger files is needed the assumptions made to -// keep these tables in RAM needs to be thrown out. -const MAX_RAM_POINTER_TABLE_SIZE: u64 = 35_000_000; -// 16-bit refcounts. -const DEFAULT_REFCOUNT_ORDER: u32 = 4; - -const V2_BARE_HEADER_SIZE: u32 = 72; -const V3_BARE_HEADER_SIZE: u32 = 104; -const AUTOCLEAR_FEATURES_OFFSET: u64 = 88; - -// bits 0-8 and 56-63 are reserved. 
-const L1_TABLE_OFFSET_MASK: u64 = 0x00ff_ffff_ffff_fe00; -const L2_TABLE_OFFSET_MASK: u64 = 0x00ff_ffff_ffff_fe00; -// Flags -const ZERO_FLAG: u64 = 1 << 0; -const COMPRESSED_FLAG: u64 = 1 << 62; -const COMPRESSED_SECTOR_SIZE: u64 = 512; -const CLUSTER_USED_FLAG: u64 = 1 << 63; -const COMPATIBLE_FEATURES_LAZY_REFCOUNTS: u64 = 1; - -// Compression types as defined in https://www.qemu.org/docs/master/interop/qcow2.html -const COMPRESSION_TYPE_ZLIB: u64 = 0; // zlib/deflate -const COMPRESSION_TYPE_ZSTD: u64 = 1; // zstd - -// Header extension types -const HEADER_EXT_END: u32 = 0x00000000; -// Backing file format name (raw, qcow2) -const HEADER_EXT_BACKING_FORMAT: u32 = 0xe2792aca; -// Feature name table -const HEADER_EXT_FEATURE_NAME_TABLE: u32 = 0x6803f857; - -// Feature name table entry type incompatible -const FEAT_TYPE_INCOMPATIBLE: u8 = 0; - -bitflags! { - #[derive(Debug, Clone, Copy, PartialEq, Eq)] - pub struct IncompatFeatures: u64 { - const DIRTY = 1 << 0; - const CORRUPT = 1 << 1; - const DATA_FILE = 1 << 2; - const COMPRESSION = 1 << 3; - const EXTENDED_L2 = 1 << 4; - } -} - -impl IncompatFeatures { - /// Features supported by this implementation. - const SUPPORTED: IncompatFeatures = IncompatFeatures::DIRTY - .union(IncompatFeatures::CORRUPT) - .union(IncompatFeatures::COMPRESSION); - - /// Get the fallback name for a known feature bit. - fn flag_name(bit: u8) -> Option<&'static str> { - Some(match Self::from_bits_truncate(1u64 << bit) { - Self::DIRTY => "dirty bit", - Self::CORRUPT => "corrupt bit", - Self::DATA_FILE => "external data file", - Self::EXTENDED_L2 => "extended L2 entries", - _ => return None, - }) - } +/// Backing file wrapper +pub(crate) struct BackingFile { + kind: BackingKind, + virtual_size: u64, } -/// Error type for unsupported incompatible features. -#[derive(Debug, Clone, Error)] -pub struct MissingFeatureError { - /// Unsupported feature bits. - features: IncompatFeatures, - /// Feature name table from the qcow2 image. 
- feature_names: Vec<(u8, String)>, -} +impl BackingFile { + fn new( + backing_file_config: Option<&BackingFileConfig>, + direct_io: bool, + max_nesting_depth: u32, + sparse: bool, + ) -> BlockResult> { + let Some(config) = backing_file_config else { + return Ok(None); + }; -impl MissingFeatureError { - fn new(features: IncompatFeatures, feature_names: Vec<(u8, String)>) -> Self { - Self { - features, - feature_names, + // Check nesting depth - applies to any backing file + if max_nesting_depth == 0 { + return Err(BlockError::new( + BlockErrorKind::Overflow, + Error::MaxNestingDepthExceeded, + )); } - } -} - -impl Display for MissingFeatureError { - fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - let names: Vec = (0u8..64) - .filter(|&bit| self.features.bits() & (1u64 << bit) != 0) - .map(|bit| { - // First try the image's feature name table - self.feature_names - .iter() - .find(|(b, _)| *b == bit) - .map(|(_, name)| name.clone()) - // Then try hardcoded fallback names - .or_else(|| IncompatFeatures::flag_name(bit).map(|s| s.to_string())) - // Finally, use generic description - .unwrap_or_else(|| format!("unknown feature bit {bit}")) - }) - .collect(); - write!(f, "Missing features: {}", names.join(", ")) - } -} - -// The format supports a "header extension area", that crosvm does not use. -const QCOW_EMPTY_HEADER_EXTENSION_SIZE: u32 = 8; - -// Defined by the specification -const MAX_BACKING_FILE_SIZE: u32 = 1023; - -fn l2_entry_is_empty(l2_entry: u64) -> bool { - l2_entry == 0 -} - -// Check bit 0 - only valid for standard clusters. 
-fn l2_entry_is_zero(l2_entry: u64) -> bool { - l2_entry & ZERO_FLAG != 0 -} - -fn l2_entry_is_compressed(l2_entry: u64) -> bool { - l2_entry & COMPRESSED_FLAG != 0 -} - -// Get file offset and size of compressed cluster data -fn l2_entry_compressed_cluster_layout(l2_entry: u64, cluster_bits: u32) -> (u64, usize) { - let compressed_size_shift = 62 - (cluster_bits - 8); - let compressed_size_mask = (1 << (cluster_bits - 8)) - 1; - let compressed_cluster_addr = l2_entry & ((1 << compressed_size_shift) - 1); - let nsectors = (l2_entry >> compressed_size_shift & compressed_size_mask) + 1; - let compressed_cluster_size = ((nsectors * COMPRESSED_SECTOR_SIZE) - - (compressed_cluster_addr & (COMPRESSED_SECTOR_SIZE - 1))) - as usize; - (compressed_cluster_addr, compressed_cluster_size) -} - -// Get file offset of standard (non-compressed) cluster -fn l2_entry_std_cluster_addr(l2_entry: u64) -> u64 { - l2_entry & L2_TABLE_OFFSET_MASK -} - -// Make L2 entry for standard (non-compressed) cluster -fn l2_entry_make_std(cluster_addr: u64) -> u64 { - (cluster_addr & L2_TABLE_OFFSET_MASK) | CLUSTER_USED_FLAG -} - -// Make L2 entry for preallocated zero cluster -fn l2_entry_make_zero(cluster_addr: u64) -> u64 { - (cluster_addr & L2_TABLE_OFFSET_MASK) | CLUSTER_USED_FLAG | ZERO_FLAG -} - -// Make L1 entry with optional flags -fn l1_entry_make(cluster_addr: u64, refcount_is_one: bool) -> u64 { - (cluster_addr & L1_TABLE_OFFSET_MASK) | (refcount_is_one as u64 * CLUSTER_USED_FLAG) -} - -/// Contains the information from the header of a qcow file. 
-#[derive(Clone, Debug)] -pub struct QcowHeader { - pub magic: u32, - pub version: u32, - - pub backing_file_offset: u64, - pub backing_file_size: u32, - pub cluster_bits: u32, - pub size: u64, - pub crypt_method: u32, - - pub l1_size: u32, - pub l1_table_offset: u64, - - pub refcount_table_offset: u64, - pub refcount_table_clusters: u32, - - pub nb_snapshots: u32, - pub snapshots_offset: u64, + let backing_raw_file = OpenOptions::new() + .read(true) + .open(&config.path) + .map_err(|e| { + BlockError::new( + BlockErrorKind::Io, + Error::BackingFileIo(config.path.clone(), e), + ) + })?; - // v3 entries - pub incompatible_features: u64, - pub compatible_features: u64, - pub autoclear_features: u64, - pub refcount_order: u32, - pub header_size: u32, - pub compression_type: CompressionType, + let mut raw_file = RawFile::new(backing_raw_file, direct_io); - // Post-header entries - pub backing_file: Option, -} + // Determine backing file format from header extension or auto-detect + let backing_format = match config.format { + Some(format) => format, + None => detect_image_type(&mut raw_file)?, + }; -impl QcowHeader { - /// Read header extensions, optionally collecting feature names for error reporting. 
- fn read_header_extensions( - f: &mut RawFile, - header: &mut QcowHeader, - mut feature_table: Option<&mut Vec<(u8, String)>>, - ) -> Result<()> { - // Extensions start directly after the header - f.seek(SeekFrom::Start(header.header_size as u64)) - .map_err(Error::ReadingHeader)?; - - loop { - let ext_type = u32::read_be(f).map_err(Error::ReadingHeader)?; - if ext_type == HEADER_EXT_END { - break; + let (kind, virtual_size) = match backing_format { + ImageType::Raw => { + let size = raw_file.seek(SeekFrom::End(0)).map_err(|e| { + BlockError::new( + BlockErrorKind::Io, + Error::BackingFileIo(config.path.clone(), e), + ) + })?; + raw_file.rewind().map_err(|e| { + BlockError::new( + BlockErrorKind::Io, + Error::BackingFileIo(config.path.clone(), e), + ) + })?; + (BackingKind::Raw(raw_file), size) } - - let ext_length = u32::read_be(f).map_err(Error::ReadingHeader)?; - - match ext_type { - HEADER_EXT_BACKING_FORMAT => { - let mut format_bytes = vec![0u8; ext_length as usize]; - f.read_exact(&mut format_bytes) - .map_err(Error::ReadingHeader)?; - let format_str = String::from_utf8(format_bytes) - .map_err(|err| Error::InvalidBackingFileName(err.utf8_error()))?; - if let Some(backing_file) = &mut header.backing_file { - backing_file.format = Some(format_str.parse()?); - } - } - HEADER_EXT_FEATURE_NAME_TABLE if feature_table.is_some() => { - const FEATURE_NAME_ENTRY_SIZE: usize = 1 + 1 + 46; // type + bit + name - let mut data = vec![0u8; ext_length as usize]; - f.read_exact(&mut data).map_err(Error::ReadingHeader)?; - let table = feature_table.as_mut().unwrap(); - for entry in data.chunks_exact(FEATURE_NAME_ENTRY_SIZE) { - if entry[0] == FEAT_TYPE_INCOMPATIBLE { - let bit_number = entry[1]; - let name_bytes = &entry[2..]; - let name_len = name_bytes.iter().position(|&b| b == 0).unwrap_or(46); - let name = String::from_utf8_lossy(&name_bytes[..name_len]).to_string(); - table.push((bit_number, name)); + ImageType::Qcow2 => { + let (inner, nested_backing, _sparse) = + 
parse_qcow(raw_file, max_nesting_depth - 1, sparse).map_err(|e| { + let kind = e.kind(); + let source = e + .into_source() + .and_then(|s| s.downcast::().ok()) + .map(|qcow_err| Error::BackingFileOpen(config.path.clone(), qcow_err)); + match source { + Some(err) => BlockError::new(kind, err), + None => BlockError::from_kind(kind), } - } - } - _ => { - // Skip unknown extension - f.seek(SeekFrom::Current(ext_length as i64)) - .map_err(Error::ReadingHeader)?; - } + })?; + let size = inner.header.size; + ( + BackingKind::Qcow { + inner: Box::new(inner), + backing: nested_backing.map(Box::new), + }, + size, + ) } + }; - // Skip to the next 8 byte boundary - let padding = (8 - (ext_length % 8)) % 8; - f.seek(SeekFrom::Current(padding as i64)) - .map_err(Error::ReadingHeader)?; - } - - Ok(()) + Ok(Some(Self { kind, virtual_size })) } - /// Creates a QcowHeader from a reference to a file. - pub fn new(f: &mut RawFile) -> Result { - f.rewind().map_err(Error::ReadingHeader)?; - let magic = u32::read_be(f).map_err(Error::ReadingHeader)?; - if magic != QCOW_MAGIC { - return Err(Error::InvalidMagic); - } - - // Reads the next u32 from the file. - fn read_u32_be(f: &mut RawFile) -> Result { - u32::read_be(f).map_err(Error::ReadingHeader) - } + /// Consume and return the kind and virtual size. + pub(crate) fn into_kind(self) -> (BackingKind, u64) { + (self.kind, self.virtual_size) + } - // Reads the next u64 from the file. - fn read_u64_be(f: &mut RawFile) -> Result { - u64::read_be(f).map_err(Error::ReadingHeader) + /// Read from backing file, returning zeros for any portion beyond backing file size. 
+ #[inline] + pub(crate) fn read_at(&mut self, address: u64, buf: &mut [u8]) -> std::io::Result<()> { + if address >= self.virtual_size { + buf.fill(0); + return Ok(()); } - let version = read_u32_be(f)?; - - let mut header = QcowHeader { - magic, - version, - backing_file_offset: read_u64_be(f)?, - backing_file_size: read_u32_be(f)?, - cluster_bits: read_u32_be(f)?, - size: read_u64_be(f)?, - crypt_method: read_u32_be(f)?, - l1_size: read_u32_be(f)?, - l1_table_offset: read_u64_be(f)?, - refcount_table_offset: read_u64_be(f)?, - refcount_table_clusters: read_u32_be(f)?, - nb_snapshots: read_u32_be(f)?, - snapshots_offset: read_u64_be(f)?, - incompatible_features: if version == 2 { 0 } else { read_u64_be(f)? }, - compatible_features: if version == 2 { 0 } else { read_u64_be(f)? }, - autoclear_features: if version == 2 { 0 } else { read_u64_be(f)? }, - refcount_order: if version == 2 { - DEFAULT_REFCOUNT_ORDER - } else { - read_u32_be(f)? - }, - header_size: if version == 2 { - V2_BARE_HEADER_SIZE - } else { - read_u32_be(f)? - }, - compression_type: CompressionType::Zlib, - backing_file: None, + let available = (self.virtual_size - address) as usize; + let (target, overflow) = if available >= buf.len() { + (buf, &mut [][..]) + } else { + buf.split_at_mut(available) }; - if version == 3 && header.header_size > V3_BARE_HEADER_SIZE { - let raw_compression_type = read_u64_be(f)? 
>> (64 - 8); - header.compression_type = if raw_compression_type == COMPRESSION_TYPE_ZLIB { - Ok(CompressionType::Zlib) - } else if raw_compression_type == COMPRESSION_TYPE_ZSTD { - Ok(CompressionType::Zstd) - } else { - Err(Error::UnsupportedCompressionType) - }?; - } - if header.backing_file_size > MAX_BACKING_FILE_SIZE { - return Err(Error::BackingFileTooLong(header.backing_file_size as usize)); - } - if header.backing_file_offset != 0 { - f.seek(SeekFrom::Start(header.backing_file_offset)) - .map_err(Error::ReadingHeader)?; - let mut backing_file_name_bytes = vec![0u8; header.backing_file_size as usize]; - f.read_exact(&mut backing_file_name_bytes) - .map_err(Error::ReadingHeader)?; - let path = String::from_utf8(backing_file_name_bytes) - .map_err(|err| Error::InvalidBackingFileName(err.utf8_error()))?; - header.backing_file = Some(BackingFileConfig { path, format: None }); - } + Self::read_at_inner(&mut self.kind, address, target)?; + overflow.fill(0); + Ok(()) + } - if version == 3 { - // Check for unsupported incompatible features first - let features = IncompatFeatures::from_bits_retain(header.incompatible_features); - let unsupported = features - IncompatFeatures::SUPPORTED; - if !unsupported.is_empty() { - // Read extensions only to get feature names for error reporting - let mut feature_table = Vec::new(); - if header.header_size > V3_BARE_HEADER_SIZE { - let _ = Self::read_header_extensions(f, &mut header, Some(&mut feature_table)); - } - return Err(Error::UnsupportedFeature(MissingFeatureError::new( - unsupported, - feature_table, - ))); + fn read_at_inner(kind: &mut BackingKind, address: u64, buf: &mut [u8]) -> std::io::Result<()> { + match kind { + BackingKind::Raw(file) => { + file.seek(SeekFrom::Start(address))?; + file.read_exact(buf) } - - // Features OK, now read extensions normally - if header.header_size > V3_BARE_HEADER_SIZE { - Self::read_header_extensions(f, &mut header, None)?; + #[cfg(test)] + BackingKind::QcowFile(qcow) => { + 
qcow.seek(SeekFrom::Start(address))?; + qcow.read_exact(buf) + } + BackingKind::Qcow { inner, backing } => { + let has_backing = backing.is_some(); + let cluster_size = inner.raw_file.cluster_size(); + let mut pos = 0usize; + while pos < buf.len() { + let curr_addr = address + pos as u64; + let intra = inner.raw_file.cluster_offset(curr_addr) as usize; + let count = min(buf.len() - pos, cluster_size as usize - intra); + let mapping = inner.map_cluster_read(curr_addr, count, has_backing)?; + match mapping { + ClusterReadMapping::Zero { length } => { + buf[pos..pos + length as usize].fill(0); + } + ClusterReadMapping::Allocated { + offset: host_off, + length, + } => { + inner.raw_file.file_mut().seek(SeekFrom::Start(host_off))?; + inner + .raw_file + .file_mut() + .read_exact(&mut buf[pos..pos + length as usize])?; + } + ClusterReadMapping::Compressed { data } => { + buf[pos..pos + data.len()].copy_from_slice(&data); + } + ClusterReadMapping::Backing { + offset: backing_off, + length, + } => { + if let Some(bf) = backing.as_mut() { + bf.read_at(backing_off, &mut buf[pos..pos + length as usize])?; + } else { + buf[pos..pos + length as usize].fill(0); + } + } + } + pos += count; + } + Ok(()) } } - - Ok(header) } +} - pub fn get_decoder(&self) -> Box { - match self.compression_type { - CompressionType::Zlib => Box::new(ZlibDecoder {}), - CompressionType::Zstd => Box::new(ZstdDecoder {}), - } +impl Debug for BackingFile { + fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { + f.debug_struct("BackingFile").finish() } +} - pub fn create_for_size_and_path( - version: u32, - size: u64, - backing_file: Option<&str>, - ) -> Result { - let header_size = if version == 2 { - V2_BARE_HEADER_SIZE - } else { - V3_BARE_HEADER_SIZE + QCOW_EMPTY_HEADER_EXTENSION_SIZE +/// Parses and validates a QCOW2 image file, returning the metadata, backing +/// file and sparse flag. 
+/// +/// This shared constructor is used by both QcowFile for sequential I/O +/// and QcowDiskSync for lock based parallel I/O. +pub(crate) fn parse_qcow( + mut file: RawFile, + max_nesting_depth: u32, + sparse: bool, +) -> BlockResult<(metadata::QcowState, Option, bool)> { + let mut header = QcowHeader::new(&mut file).map_err(|e| { + let kind = match &e { + Error::InvalidMagic + | Error::BackingFileTooLong(_) + | Error::InvalidBackingFileName(_) => BlockErrorKind::InvalidFormat, + Error::UnsupportedFeature(_) | Error::UnsupportedCompressionType => { + BlockErrorKind::UnsupportedFeature + } + _ => BlockErrorKind::Io, }; - let cluster_bits: u32 = DEFAULT_CLUSTER_BITS; - let cluster_size: u32 = 0x01 << cluster_bits; - let max_length: usize = (cluster_size - header_size) as usize; - if let Some(path) = backing_file - && path.len() > max_length - { - return Err(Error::BackingFileTooLong(path.len() - max_length)); - } - - // L2 blocks are always one cluster long. They contain cluster_size/sizeof(u64) addresses. - let entries_per_cluster: u32 = cluster_size / size_of::() as u32; - let num_clusters: u32 = div_round_up_u64(size, u64::from(cluster_size)) as u32; - let num_l2_clusters: u32 = div_round_up_u32(num_clusters, entries_per_cluster); - let l1_clusters: u32 = div_round_up_u32(num_l2_clusters, entries_per_cluster); - let header_clusters = div_round_up_u32(size_of::() as u32, cluster_size); - Ok(QcowHeader { - magic: QCOW_MAGIC, - version, - backing_file_offset: backing_file.map_or(0, |_| { - header_size - + if version == 3 { - QCOW_EMPTY_HEADER_EXTENSION_SIZE - } else { - 0 - } - }) as u64, - backing_file_size: backing_file.map_or(0, |x| x.len()) as u32, - cluster_bits: DEFAULT_CLUSTER_BITS, - size, - crypt_method: 0, - l1_size: num_l2_clusters, - l1_table_offset: u64::from(cluster_size), - // The refcount table is after l1 + header. 
- refcount_table_offset: u64::from(cluster_size * (l1_clusters + 1)), - refcount_table_clusters: { - // Pre-allocate enough clusters for the entire refcount table as it must be - // continuous in the file. Allocate enough space to refcount all clusters, including - // the refcount clusters. - let max_refcount_clusters = max_refcount_clusters( - DEFAULT_REFCOUNT_ORDER, - cluster_size, - num_clusters + l1_clusters + num_l2_clusters + header_clusters, - ) as u32; - // The refcount table needs to store the offset of each refcount cluster. - div_round_up_u32( - max_refcount_clusters * size_of::() as u32, - cluster_size, - ) - }, - nb_snapshots: 0, - snapshots_offset: 0, - incompatible_features: 0, - compatible_features: 0, - autoclear_features: 0, - refcount_order: DEFAULT_REFCOUNT_ORDER, - header_size, - compression_type: CompressionType::Zlib, - backing_file: backing_file.map(|path| BackingFileConfig { - path: String::from(path), - format: None, - }), - }) + BlockError::new(kind, e) + })?; + + // Only v2 and v3 files are supported. + if header.version != 2 && header.version != 3 { + return Err(BlockError::new( + BlockErrorKind::UnsupportedFeature, + Error::UnsupportedVersion(header.version), + )); } - /// Write the header to `file`. - pub fn write_to(&self, file: &mut F) -> Result<()> { - // Writes the next u32 to the file. - fn write_u32_be(f: &mut F, value: u32) -> Result<()> { - u32::write_be(f, value).map_err(Error::WritingHeader) - } - - // Writes the next u64 to the file. - fn write_u64_be(f: &mut F, value: u64) -> Result<()> { - u64::write_be(f, value).map_err(Error::WritingHeader) - } + // Make sure that the L1 table fits in RAM. 
+ if u64::from(header.l1_size) > MAX_RAM_POINTER_TABLE_SIZE { + return Err(BlockError::new( + BlockErrorKind::InvalidFormat, + Error::InvalidL1TableSize(header.l1_size), + )); + } - write_u32_be(file, self.magic)?; - write_u32_be(file, self.version)?; - write_u64_be(file, self.backing_file_offset)?; - write_u32_be(file, self.backing_file_size)?; - write_u32_be(file, self.cluster_bits)?; - write_u64_be(file, self.size)?; - write_u32_be(file, self.crypt_method)?; - write_u32_be(file, self.l1_size)?; - write_u64_be(file, self.l1_table_offset)?; - write_u64_be(file, self.refcount_table_offset)?; - write_u32_be(file, self.refcount_table_clusters)?; - write_u32_be(file, self.nb_snapshots)?; - write_u64_be(file, self.snapshots_offset)?; - - if self.version == 3 { - write_u64_be(file, self.incompatible_features)?; - write_u64_be(file, self.compatible_features)?; - write_u64_be(file, self.autoclear_features)?; - write_u32_be(file, self.refcount_order)?; - write_u32_be(file, self.header_size)?; - - if self.header_size > V3_BARE_HEADER_SIZE { - write_u64_be(file, 0)?; // no compression - } + let cluster_bits: u32 = header.cluster_bits; + if !(MIN_CLUSTER_BITS..=MAX_CLUSTER_BITS).contains(&cluster_bits) { + return Err(BlockError::new( + BlockErrorKind::InvalidFormat, + Error::InvalidClusterSize, + )); + } + let cluster_size = 0x01u64 << cluster_bits; - write_u32_be(file, 0)?; // header extension type: end of header extension area - write_u32_be(file, 0)?; // length of header extension data: 0 - } + // Limit the total size of the disk. 
+ if header.size > MAX_QCOW_FILE_SIZE { + return Err(BlockError::new( + BlockErrorKind::InvalidFormat, + Error::FileTooBig(header.size), + )); + } - if let Some(backing_file_path) = self.backing_file.as_ref().map(|bf| &bf.path) { - if self.backing_file_offset > 0 { - file.seek(SeekFrom::Start(self.backing_file_offset)) - .map_err(Error::WritingHeader)?; - } - write!(file, "{backing_file_path}").map_err(Error::WritingHeader)?; - } + let direct_io = file.is_direct(); - // Set the file length by seeking and writing a zero to the last byte. This avoids needing - // a `File` instead of anything that implements seek as the `file` argument. - // Zeros out the l1 and refcount table clusters. - let cluster_size = 0x01u64 << self.cluster_bits; - let refcount_blocks_size = u64::from(self.refcount_table_clusters) * cluster_size; - file.seek(SeekFrom::Start( - self.refcount_table_offset + refcount_blocks_size - 2, - )) - .map_err(Error::WritingHeader)?; - file.write(&[0u8]).map_err(Error::WritingHeader)?; + let backing_file = BackingFile::new( + header.backing_file.as_ref(), + direct_io, + max_nesting_depth, + sparse, + )?; - Ok(()) + // Validate refcount order to be 0..6 + let refcount_bits: u64 = 0x01u64.checked_shl(header.refcount_order).ok_or_else(|| { + BlockError::new( + BlockErrorKind::UnsupportedFeature, + Error::UnsupportedRefcountOrder, + ) + })?; + if refcount_bits > 64 { + return Err(BlockError::new( + BlockErrorKind::UnsupportedFeature, + Error::UnsupportedRefcountOrder, + )); } - /// Write only the incompatible_features field to the file at its fixed offset. 
- fn write_incompatible_features(&self, file: &mut F) -> Result<()> { - if self.version != 3 { - return Ok(()); - } - file.seek(SeekFrom::Start(V2_BARE_HEADER_SIZE as u64)) - .map_err(Error::WritingHeader)?; - u64::write_be(file, self.incompatible_features).map_err(Error::WritingHeader)?; - Ok(()) + // Need at least one refcount cluster + if header.refcount_table_clusters == 0 { + return Err(BlockError::new( + BlockErrorKind::InvalidFormat, + Error::NoRefcountClusters, + )); } - - /// Set or clear the dirty bit for QCOW2 v3 images. - /// - /// When `dirty` is true, sets the bit to indicate the image is in use. - /// When `dirty` is false, clears the bit to indicate a clean shutdown. - pub fn set_dirty_bit( - &mut self, - file: &mut F, - dirty: bool, - ) -> Result<()> { - if self.version == 3 { - if dirty { - self.incompatible_features |= IncompatFeatures::DIRTY.bits(); - } else { - self.incompatible_features &= !IncompatFeatures::DIRTY.bits(); - } - self.write_incompatible_features(file)?; - file.fsync().map_err(Error::SyncingHeader)?; - } - Ok(()) + offset_is_cluster_boundary(header.l1_table_offset, header.cluster_bits) + .map_err(|e| BlockError::new(BlockErrorKind::CorruptImage, e))?; + offset_is_cluster_boundary(header.snapshots_offset, header.cluster_bits) + .map_err(|e| BlockError::new(BlockErrorKind::CorruptImage, e))?; + // refcount table must be a cluster boundary, and within the file's virtual or actual size. + offset_is_cluster_boundary(header.refcount_table_offset, header.cluster_bits) + .map_err(|e| BlockError::new(BlockErrorKind::CorruptImage, e))?; + let file_size = file + .metadata() + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::GettingFileSize(e)))? + .len(); + if header.refcount_table_offset > max(file_size, header.size) { + return Err(BlockError::new( + BlockErrorKind::CorruptImage, + Error::RefcountTableOffEnd, + )); } - /// Set the corrupt bit for QCOW2 v3 images. - /// - /// This marks the image as corrupted. 
Once set, the image can only be - /// opened read-only until repaired. - pub fn set_corrupt_bit(&mut self, file: &mut F) -> Result<()> { - if self.version == 3 { - self.incompatible_features |= IncompatFeatures::CORRUPT.bits(); - self.write_incompatible_features(file)?; - file.fsync().map_err(Error::SyncingHeader)?; + // The first cluster should always have a non-zero refcount, so if it is 0, + // this is an old file with broken refcounts, which requires a rebuild. + let mut refcount_rebuild_required = true; + file.seek(SeekFrom::Start(header.refcount_table_offset)) + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SeekingFile(e)))?; + let first_refblock_addr = u64::read_be(&mut file) + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::ReadingHeader(e)))?; + if first_refblock_addr != 0 { + file.seek(SeekFrom::Start(first_refblock_addr)) + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SeekingFile(e)))?; + let first_cluster_refcount = u16::read_be(&mut file) + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::ReadingHeader(e)))?; + if first_cluster_refcount != 0 { + refcount_rebuild_required = false; } - Ok(()) } - pub fn is_corrupt(&self) -> bool { - IncompatFeatures::from_bits_truncate(self.incompatible_features) - .contains(IncompatFeatures::CORRUPT) + if (header.compatible_features & COMPATIBLE_FEATURES_LAZY_REFCOUNTS) != 0 { + refcount_rebuild_required = true; } - /// Clear all autoclear feature bits for QCOW2 v3 images. - /// - /// These bits indicate features that can be safely disabled when modified - /// by software that doesn't understand them. 
- pub fn clear_autoclear_features( - &mut self, - file: &mut F, - ) -> Result<()> { - if self.version == 3 && self.autoclear_features != 0 { - self.autoclear_features = 0; - file.seek(SeekFrom::Start(AUTOCLEAR_FEATURES_OFFSET)) - .map_err(Error::WritingHeader)?; - u64::write_be(file, 0).map_err(Error::WritingHeader)?; - file.fsync().map_err(Error::SyncingHeader)?; + let mut raw_file = QcowRawFile::from(file, cluster_size, refcount_bits) + .ok_or_else(|| BlockError::new(BlockErrorKind::InvalidFormat, Error::InvalidClusterSize))?; + let is_writable = raw_file.file().is_writable(); + + if header.is_corrupt() { + if is_writable { + return Err(BlockError::new( + BlockErrorKind::CorruptImage, + Error::CorruptImage, + )); } - Ok(()) + let path = read_link(format!("/proc/self/fd/{}", raw_file.file().as_raw_fd())) + .map_or_else(|_| "".to_string(), |p| p.display().to_string()); + warn!("QCOW2 image is marked corrupt, opening read-only: {path}"); } -} - -fn max_refcount_clusters(refcount_order: u32, cluster_size: u32, num_clusters: u32) -> u64 { - // Use u64 as the product of the u32 inputs can overflow. - let refcount_bits = 0x01u64 << u64::from(refcount_order); - let cluster_bits = u64::from(cluster_size) * 8; - let for_data = div_round_up_u64(u64::from(num_clusters) * refcount_bits, cluster_bits); - let for_refcounts = div_round_up_u64(for_data * refcount_bits, cluster_bits); - for_data + for_refcounts -} -trait BackingFileOps: Send + Seek + Read { - fn read_at(&mut self, address: u64, buf: &mut [u8]) -> std::io::Result<()> { - self.seek(SeekFrom::Start(address))?; - self.read_exact(buf) + // Image already has dirty bit set. Refcounts may be invalid. 
+ if IncompatFeatures::from_bits_truncate(header.incompatible_features) + .contains(IncompatFeatures::DIRTY) + { + log::warn!("QCOW2 image not cleanly closed, rebuilding refcounts"); + refcount_rebuild_required = true; } - fn clone_box(&self) -> Box; -} -impl BackingFileOps for QcowFile { - fn clone_box(&self) -> Box { - Box::new(self.clone()) + // Skip refcount rebuilding for readonly files. + if refcount_rebuild_required && is_writable { + QcowFile::rebuild_refcounts(&mut raw_file, header.clone())?; } -} -impl BackingFileOps for RawFile { - fn clone_box(&self) -> Box { - Box::new(self.clone()) + let entries_per_cluster = cluster_size / size_of::() as u64; + let num_clusters = div_round_up_u64(header.size, cluster_size); + let num_l2_clusters = div_round_up_u64(num_clusters, entries_per_cluster); + let l1_clusters = div_round_up_u64(num_l2_clusters, entries_per_cluster); + let header_clusters = div_round_up_u64(size_of::() as u64, cluster_size); + if num_l2_clusters > MAX_RAM_POINTER_TABLE_SIZE { + return Err(BlockError::new( + BlockErrorKind::CorruptImage, + Error::TooManyL1Entries(num_l2_clusters), + )); } -} - -/// Backing file wrapper -struct BackingFile { - inner: Box, - virtual_size: u64, -} - -impl BackingFile { - fn new( - backing_file_config: Option<&BackingFileConfig>, - direct_io: bool, - max_nesting_depth: u32, - sparse: bool, - ) -> Result> { - let Some(config) = backing_file_config else { - return Ok(None); - }; - - // Check nesting depth - applies to any backing file - if max_nesting_depth == 0 { - return Err(Error::MaxNestingDepthExceeded); - } - - let backing_raw_file = OpenOptions::new() - .read(true) - .open(&config.path) - .map_err(Error::BackingFileIo)?; - - let mut raw_file = RawFile::new(backing_raw_file, direct_io); - - // Determine backing file format from header extension or auto-detect - let backing_format = match config.format { - Some(format) => format, - None => detect_image_type(&mut raw_file)?, - }; - - let (inner, virtual_size): 
(Box, u64) = match backing_format { - ImageType::Raw => { - let size = raw_file - .seek(SeekFrom::End(0)) - .map_err(Error::BackingFileIo)?; - raw_file.rewind().map_err(Error::BackingFileIo)?; - (Box::new(raw_file), size) - } - ImageType::Qcow2 => { - let backing_qcow = - QcowFile::from_with_nesting_depth(raw_file, max_nesting_depth - 1, sparse) - .map_err(|e| Error::BackingFileOpen(Box::new(e)))?; - let size = backing_qcow.virtual_size(); - (Box::new(backing_qcow), size) - } - }; - - Ok(Some(Self { - inner, - virtual_size, - })) + let l1_table = VecCache::from_vec( + raw_file + .read_pointer_table( + header.l1_table_offset, + num_l2_clusters, + Some(L1_TABLE_OFFSET_MASK), + ) + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::ReadingHeader(e)))?, + ); + + let num_clusters = div_round_up_u64(header.size, cluster_size); + let refcount_clusters = max_refcount_clusters( + header.refcount_order, + cluster_size as u32, + (num_clusters + l1_clusters + num_l2_clusters + header_clusters) as u32, + ); + // Check that the given header doesn't have a suspiciously sized refcount table. + if u64::from(header.refcount_table_clusters) > 2 * refcount_clusters { + return Err(BlockError::new( + BlockErrorKind::CorruptImage, + Error::RefcountTableTooLarge, + )); } - - /// Read from backing file, returning zeros for any portion beyond backing file size. 
- #[inline] - fn read_at(&mut self, address: u64, buf: &mut [u8]) -> std::io::Result<()> { - if address >= self.virtual_size { - // Entire read is beyond backing file - buf.fill(0); - return Ok(()); - } - - let available = (self.virtual_size - address) as usize; - if available >= buf.len() { - // Entire read is within backing file - self.inner.read_at(address, buf) - } else { - // Partial read, fill the rest with zeroes - self.inner.read_at(address, &mut buf[..available])?; - buf[available..].fill(0); - Ok(()) + if l1_clusters + refcount_clusters > MAX_RAM_POINTER_TABLE_SIZE { + return Err(BlockError::new( + BlockErrorKind::InvalidFormat, + Error::TooManyRefcounts(refcount_clusters), + )); + } + let refcount_block_entries = cluster_size * 8 / refcount_bits; + let mut refcounts = RefCount::new( + &mut raw_file, + header.refcount_table_offset, + refcount_clusters, + refcount_block_entries, + cluster_size, + refcount_bits, + ) + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::ReadingRefCounts(e)))?; + + let l2_entries = cluster_size / size_of::() as u64; + + // Check that the L1 and refcount tables fit in a 64bit address space. + let l1_index = (header.size / cluster_size) / l2_entries; + header + .l1_table_offset + .checked_add(l1_index * size_of::() as u64) + .ok_or_else(|| { + BlockError::new(BlockErrorKind::CorruptImage, Error::InvalidL1TableOffset) + })?; + header + .refcount_table_offset + .checked_add(u64::from(header.refcount_table_clusters) * cluster_size) + .ok_or_else(|| { + BlockError::new( + BlockErrorKind::CorruptImage, + Error::InvalidRefcountTableOffset, + ) + })?; + + // Find available (refcount == 0) clusters for the free list. + let file_size = raw_file + .file_mut() + .metadata() + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::GettingFileSize(e)))? 
+ .len(); + let mut avail_clusters = Vec::new(); + for i in (0..file_size).step_by(cluster_size as usize) { + let refcount = refcounts + .get_cluster_refcount(&mut raw_file, i) + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::GettingRefcount(e)))?; + if refcount == 0 { + avail_clusters.push(i); } } -} -impl Clone for BackingFile { - fn clone(&self) -> Self { - Self { - inner: self.inner.clone_box(), - virtual_size: self.virtual_size, + if is_writable { + if !IncompatFeatures::from_bits_truncate(header.incompatible_features) + .contains(IncompatFeatures::DIRTY) + { + header + .set_dirty_bit(raw_file.file_mut(), true) + .map_err(|e| { + BlockError::new( + BlockErrorKind::Io, + Error::WritingHeader(io::Error::other(e)), + ) + })?; } - } -} -impl Debug for BackingFile { - fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - f.debug_struct("BackingFile").finish() - } + header + .clear_autoclear_features(raw_file.file_mut()) + .map_err(|e| BlockError::new(BlockErrorKind::Io, e))?; + } + + let inner = metadata::QcowState { + raw_file, + header, + l1_table, + l2_entries, + l2_cache: CacheMap::new(100), + refcounts, + avail_clusters, + unref_clusters: Vec::new(), + }; + + Ok((inner, backing_file, sparse)) } /// Represents a qcow2 file. This is a sparse file format maintained by the qemu project. @@ -896,7 +641,7 @@ impl Debug for BackingFile { /// # Ok(()) /// # } /// ``` -#[derive(Clone, Debug)] +#[derive(Debug)] pub struct QcowFile { raw_file: QcowRawFile, header: QcowHeader, @@ -917,202 +662,58 @@ impl QcowFile { /// Creates a QcowFile from `file`. File must be a valid qcow2 image. /// /// Additionally, max nesting depth of this qcow2 image will be set to default value 10. - pub fn from(file: RawFile) -> Result { + pub fn from(file: RawFile) -> BlockResult { Self::from_with_nesting_depth(file, MAX_NESTING_DEPTH, true) } /// Creates a QcowFile from `file` and with a max nesting depth. File must be a valid qcow2 /// image. 
pub fn from_with_nesting_depth( - mut file: RawFile, + file: RawFile, max_nesting_depth: u32, sparse: bool, - ) -> Result { - let header = QcowHeader::new(&mut file)?; - - // Only v2 and v3 files are supported. - if header.version != 2 && header.version != 3 { - return Err(Error::UnsupportedVersion(header.version)); - } - - // Make sure that the L1 table fits in RAM. - if u64::from(header.l1_size) > MAX_RAM_POINTER_TABLE_SIZE { - return Err(Error::InvalidL1TableSize(header.l1_size)); - } - - let cluster_bits: u32 = header.cluster_bits; - if !(MIN_CLUSTER_BITS..=MAX_CLUSTER_BITS).contains(&cluster_bits) { - return Err(Error::InvalidClusterSize); - } - let cluster_size = 0x01u64 << cluster_bits; - - // Limit the total size of the disk. - if header.size > MAX_QCOW_FILE_SIZE { - return Err(Error::FileTooBig(header.size)); - } - - let direct_io = file.is_direct(); - - let backing_file = BackingFile::new( - header.backing_file.as_ref(), - direct_io, - max_nesting_depth, - sparse, - )?; - - // Validate refcount order to be 0..6 - let refcount_bits: u64 = 0x01u64 - .checked_shl(header.refcount_order) - .ok_or(Error::UnsupportedRefcountOrder)?; - if refcount_bits > 64 { - return Err(Error::UnsupportedRefcountOrder); - } - - // Need at least one refcount cluster - if header.refcount_table_clusters == 0 { - return Err(Error::NoRefcountClusters); - } - offset_is_cluster_boundary(header.l1_table_offset, header.cluster_bits)?; - offset_is_cluster_boundary(header.snapshots_offset, header.cluster_bits)?; - // refcount table must be a cluster boundary, and within the file's virtual or actual size. 
- offset_is_cluster_boundary(header.refcount_table_offset, header.cluster_bits)?; - let file_size = file.metadata().map_err(Error::GettingFileSize)?.len(); - if header.refcount_table_offset > max(file_size, header.size) { - return Err(Error::RefcountTableOffEnd); - } - - // The first cluster should always have a non-zero refcount, so if it is 0, - // this is an old file with broken refcounts, which requires a rebuild. - let mut refcount_rebuild_required = true; - file.seek(SeekFrom::Start(header.refcount_table_offset)) - .map_err(Error::SeekingFile)?; - let first_refblock_addr = u64::read_be(&mut file).map_err(Error::ReadingHeader)?; - if first_refblock_addr != 0 { - file.seek(SeekFrom::Start(first_refblock_addr)) - .map_err(Error::SeekingFile)?; - let first_cluster_refcount = u16::read_be(&mut file).map_err(Error::ReadingHeader)?; - if first_cluster_refcount != 0 { - refcount_rebuild_required = false; - } - } - - if (header.compatible_features & COMPATIBLE_FEATURES_LAZY_REFCOUNTS) != 0 { - refcount_rebuild_required = true; - } - - let mut raw_file = QcowRawFile::from(file, cluster_size, refcount_bits) - .ok_or(Error::InvalidClusterSize)?; - let is_writable = raw_file.file().is_writable(); - - if header.is_corrupt() { - if is_writable { - return Err(Error::CorruptImage); - } - let path = read_link(format!("/proc/self/fd/{}", raw_file.file().as_raw_fd())) - .map_or_else(|_| "".to_string(), |p| p.display().to_string()); - warn!("QCOW2 image is marked corrupt, opening read-only: {path}"); - } - - // Image already has dirty bit set. Refcounts may be invalid. - if IncompatFeatures::from_bits_truncate(header.incompatible_features) - .contains(IncompatFeatures::DIRTY) - { - log::warn!("QCOW2 image not cleanly closed, rebuilding refcounts"); - refcount_rebuild_required = true; - } - - // Skip refcount rebuilding for readonly files. 
- if refcount_rebuild_required && is_writable { - QcowFile::rebuild_refcounts(&mut raw_file, header.clone())?; - } - - let entries_per_cluster = cluster_size / size_of::() as u64; - let num_clusters = div_round_up_u64(header.size, cluster_size); - let num_l2_clusters = div_round_up_u64(num_clusters, entries_per_cluster); - let l1_clusters = div_round_up_u64(num_l2_clusters, entries_per_cluster); - let header_clusters = div_round_up_u64(size_of::() as u64, cluster_size); - if num_l2_clusters > MAX_RAM_POINTER_TABLE_SIZE { - return Err(Error::TooManyL1Entries(num_l2_clusters)); - } - let l1_table = VecCache::from_vec( - raw_file - .read_pointer_table( - header.l1_table_offset, - num_l2_clusters, - Some(L1_TABLE_OFFSET_MASK), - ) - .map_err(Error::ReadingHeader)?, - ); - - let num_clusters = div_round_up_u64(header.size, cluster_size); - let refcount_clusters = max_refcount_clusters( - header.refcount_order, - cluster_size as u32, - (num_clusters + l1_clusters + num_l2_clusters + header_clusters) as u32, - ); - // Check that the given header doesn't have a suspiciously sized refcount table. 
- if u64::from(header.refcount_table_clusters) > 2 * refcount_clusters { - return Err(Error::RefcountTableTooLarge); - } - if l1_clusters + refcount_clusters > MAX_RAM_POINTER_TABLE_SIZE { - return Err(Error::TooManyRefcounts(refcount_clusters)); - } - let refcount_block_entries = cluster_size * 8 / refcount_bits; - let refcounts = RefCount::new( - &mut raw_file, - header.refcount_table_offset, - refcount_clusters, - refcount_block_entries, - cluster_size, - refcount_bits, - ) - .map_err(Error::ReadingRefCounts)?; - - let l2_entries = cluster_size / size_of::() as u64; - - let mut qcow = QcowFile { + ) -> BlockResult { + let (inner, backing_file, sparse) = parse_qcow(file, max_nesting_depth, sparse)?; + let metadata::QcowState { raw_file, header, l1_table, l2_entries, - l2_cache: CacheMap::new(100), + l2_cache, + refcounts, + avail_clusters, + unref_clusters, + } = inner; + Ok(QcowFile { + raw_file, + header, + l1_table, + l2_entries, + l2_cache, refcounts, current_offset: 0, - unref_clusters: Vec::new(), - avail_clusters: Vec::new(), + unref_clusters, + avail_clusters, backing_file, sparse, - }; - - // Check that the L1 and refcount tables fit in a 64bit address space. - qcow.header - .l1_table_offset - .checked_add(qcow.l1_address_offset(qcow.virtual_size())) - .ok_or(Error::InvalidL1TableOffset)?; - qcow.header - .refcount_table_offset - .checked_add(u64::from(qcow.header.refcount_table_clusters) * cluster_size) - .ok_or(Error::InvalidRefcountTableOffset)?; - - qcow.find_avail_clusters()?; - - if is_writable { - if !IncompatFeatures::from_bits_truncate(qcow.header.incompatible_features) - .contains(IncompatFeatures::DIRTY) - { - qcow.header.set_dirty_bit(qcow.raw_file.file_mut(), true)?; - } - - qcow.header - .clear_autoclear_features(qcow.raw_file.file_mut())?; - } - - Ok(qcow) + }) } /// Creates a new QcowFile at the given path. 
- pub fn new(file: RawFile, version: u32, virtual_size: u64, sparse: bool) -> Result { - let header = QcowHeader::create_for_size_and_path(version, virtual_size, None)?; + pub fn new( + file: RawFile, + version: u32, + virtual_size: u64, + sparse: bool, + ) -> BlockResult { + let header = + QcowHeader::create_for_size_and_path(version, virtual_size, None).map_err(|e| { + let kind = match &e { + Error::BackingFileTooLong(_) => BlockErrorKind::InvalidFormat, + _ => BlockErrorKind::Io, + }; + BlockError::new(kind, e) + })?; QcowFile::new_from_header(file, &header, sparse) } @@ -1123,12 +724,19 @@ impl QcowFile { backing_file_size: u64, backing_config: &BackingFileConfig, sparse: bool, - ) -> Result { + ) -> BlockResult { let mut header = QcowHeader::create_for_size_and_path( version, backing_file_size, Some(&backing_config.path), - )?; + ) + .map_err(|e| { + let kind = match &e { + Error::BackingFileTooLong(_) => BlockErrorKind::InvalidFormat, + _ => BlockErrorKind::Io, + }; + BlockError::new(kind, e) + })?; if let Some(backing_file) = &mut header.backing_file { backing_file.format = backing_config.format; } @@ -1136,9 +744,16 @@ impl QcowFile { // backing_file is loaded by new_from_header -> Self::from() based on the header } - fn new_from_header(mut file: RawFile, header: &QcowHeader, sparse: bool) -> Result { - file.rewind().map_err(Error::SeekingFile)?; - header.write_to(&mut file)?; + fn new_from_header( + mut file: RawFile, + header: &QcowHeader, + sparse: bool, + ) -> BlockResult { + file.rewind() + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SeekingFile(e)))?; + header + .write_to(&mut file) + .map_err(|e| BlockError::new(BlockErrorKind::Io, e))?; let mut qcow = Self::from_with_nesting_depth(file, MAX_NESTING_DEPTH, sparse)?; @@ -1150,9 +765,9 @@ impl QcowFile { let mut cluster_addr = 0; while cluster_addr < end_cluster_addr { - let mut unref_clusters = qcow - .set_cluster_refcount(cluster_addr, 1) - .map_err(Error::SettingRefcountRefcount)?; + let 
mut unref_clusters = qcow.set_cluster_refcount(cluster_addr, 1).map_err(|e| { + BlockError::new(BlockErrorKind::Io, Error::SettingRefcountRefcount(e)) + })?; qcow.unref_clusters.append(&mut unref_clusters); cluster_addr += cluster_size; } @@ -1160,11 +775,12 @@ impl QcowFile { Ok(qcow) } + #[cfg(test)] pub fn set_backing_file(&mut self, backing: Option>) { self.backing_file = backing.map(|b| { let virtual_size = b.virtual_size(); BackingFile { - inner: Box::new(*b), + kind: BackingKind::QcowFile(b), virtual_size, } }); @@ -1181,8 +797,11 @@ impl QcowFile { } /// Returns an L2_table of cluster addresses, only used for debugging. - pub fn l2_table(&mut self, l1_index: usize) -> Result> { - let l2_addr_disk = *self.l1_table.get(l1_index).ok_or(Error::InvalidIndex)?; + pub fn l2_table(&mut self, l1_index: usize) -> BlockResult> { + let l2_addr_disk = *self + .l1_table + .get(l1_index) + .ok_or_else(|| BlockError::new(BlockErrorKind::OutOfBounds, Error::InvalidIndex))?; if l2_addr_disk == 0 { // Reading from an unallocated cluster will return zeros. @@ -1193,7 +812,7 @@ impl QcowFile { // Not in the cache. let table = VecCache::from_vec( Self::read_l2_cluster(&mut self.raw_file, l2_addr_disk) - .map_err(Error::ReadingPointers)?, + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::ReadingPointers(e)))?, ); let l1_table = &self.l1_table; let raw_file = &mut self.raw_file; @@ -1201,7 +820,7 @@ impl QcowFile { .insert(l1_index, table, |index, evicted| { raw_file.write_pointer_table_direct(l1_table[index], evicted.iter()) }) - .map_err(Error::EvictingCache)?; + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::EvictingCache(e)))?; } // The index must exist as it was just inserted if it didn't already. @@ -1214,19 +833,19 @@ impl QcowFile { } /// Returns the `index`th refcount block from the file. 
- pub fn refcount_block(&mut self, index: usize) -> Result> { + pub fn refcount_block(&mut self, index: usize) -> BlockResult> { self.refcounts .refcount_block(&mut self.raw_file, index) - .map_err(Error::ReadingRefCountBlock) + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::ReadingRefCountBlock(e))) } /// Returns the first cluster in the file with a 0 refcount. Used for testing. - pub fn first_zero_refcount(&mut self) -> Result> { + pub fn first_zero_refcount(&mut self) -> BlockResult> { let file_size = self .raw_file .file_mut() .metadata() - .map_err(Error::GettingFileSize)? + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::GettingFileSize(e)))? .len(); let cluster_size = 0x01u64 << self.header.cluster_bits; @@ -1235,7 +854,7 @@ impl QcowFile { let cluster_refcount = self .refcounts .get_cluster_refcount(&mut self.raw_file, cluster_addr) - .map_err(Error::GettingRefcount)?; + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::GettingRefcount(e)))?; if cluster_refcount == 0 { return Ok(Some(cluster_addr)); } @@ -1250,7 +869,7 @@ impl QcowFile { /// if needed. Shrinking is not supported, as it could lead to data /// loss. Not supported when a backing file is present in that case /// an error is returned. 
- pub fn resize(&mut self, new_size: u64) -> Result<()> { + pub fn resize(&mut self, new_size: u64) -> BlockResult<()> { let current_size = self.virtual_size(); if new_size == current_size { @@ -1258,11 +877,17 @@ impl QcowFile { } if new_size < current_size { - return Err(Error::ShrinkNotSupported); + return Err(BlockError::new( + BlockErrorKind::UnsupportedFeature, + Error::ShrinkNotSupported, + )); } if self.backing_file.is_some() { - return Err(Error::ResizeWithBackingFile); + return Err(BlockError::new( + BlockErrorKind::UnsupportedFeature, + Error::ResizeWithBackingFile, + )); } // Grow the L1 table if needed @@ -1280,18 +905,20 @@ impl QcowFile { self.raw_file .file_mut() .rewind() - .map_err(Error::SeekingFile)?; + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SeekingFile(e)))?; self.header .write_to(self.raw_file.file_mut()) .map_err(|e| match e { - Error::WritingHeader(io_err) => Error::ResizeIo(io_err), - other => other, + Error::WritingHeader(io_err) => { + BlockError::new(BlockErrorKind::Io, Error::ResizeIo(io_err)) + } + other => BlockError::new(BlockErrorKind::Io, other), })?; self.raw_file .file_mut() .sync_all() - .map_err(Error::SyncingHeader)?; + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SyncingHeader(e)))?; Ok(()) } @@ -1301,7 +928,7 @@ impl QcowFile { /// This allocates a new L1 table at file end (guaranteeing contiguity), /// copies existing entries, updates refcounts, and atomically switches /// to the new table. 
- fn grow_l1_table(&mut self, new_l1_size: u32) -> Result<()> { + fn grow_l1_table(&mut self, new_l1_size: u32) -> BlockResult<()> { let old_l1_size = self.header.l1_size; let old_l1_offset = self.header.l1_table_offset; let cluster_size = self.raw_file.cluster_size(); @@ -1314,7 +941,7 @@ impl QcowFile { .raw_file .file_mut() .seek(SeekFrom::End(0)) - .map_err(Error::ResizeIo)?; + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::ResizeIo(e)))?; let new_l1_offset = self.raw_file.cluster_address(file_size + cluster_size - 1); // Extend file to fit all L1 clusters @@ -1322,12 +949,12 @@ impl QcowFile { self.raw_file .file_mut() .set_len(new_file_end) - .map_err(Error::SettingFileSize)?; + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SettingFileSize(e)))?; // Set refcounts for the contiguous range for i in 0..new_l1_clusters { self.set_cluster_refcount(new_l1_offset + i * cluster_size, 1) - .map_err(Error::ResizeIo)?; + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::ResizeIo(e)))?; } let mut new_l1_data = vec![0u64; new_l1_size as usize]; @@ -1339,7 +966,7 @@ impl QcowFile { let refcount = self .refcounts .get_cluster_refcount(&mut self.raw_file, *l2_addr) - .map_err(Error::GettingRefcount)?; + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::GettingRefcount(e)))?; *l2_addr = l1_entry_make(*l2_addr, refcount == 1); } } @@ -1347,12 +974,12 @@ impl QcowFile { // Write the new L1 table to the file. 
self.raw_file .write_pointer_table_direct(new_l1_offset, new_l1_data.iter()) - .map_err(Error::ResizeIo)?; + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::ResizeIo(e)))?; self.raw_file .file_mut() .sync_all() - .map_err(Error::SyncingHeader)?; + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SyncingHeader(e)))?; self.header.l1_size = new_l1_size; self.header.l1_table_offset = new_l1_offset; @@ -1360,13 +987,15 @@ impl QcowFile { self.raw_file .file_mut() .rewind() - .map_err(Error::SeekingFile)?; - self.header.write_to(self.raw_file.file_mut())?; + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SeekingFile(e)))?; + self.header + .write_to(self.raw_file.file_mut()) + .map_err(|e| BlockError::new(BlockErrorKind::Io, e))?; self.raw_file .file_mut() .sync_all() - .map_err(Error::SyncingHeader)?; + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SyncingHeader(e)))?; // Free old L1 table clusters let old_l1_bytes = old_l1_size as u64 * size_of::() as u64; @@ -1382,31 +1011,8 @@ impl QcowFile { Ok(()) } - fn find_avail_clusters(&mut self) -> Result<()> { - let cluster_size = self.raw_file.cluster_size(); - - let file_size = self - .raw_file - .file_mut() - .metadata() - .map_err(Error::GettingFileSize)? - .len(); - - for i in (0..file_size).step_by(cluster_size as usize) { - let refcount = self - .refcounts - .get_cluster_refcount(&mut self.raw_file, i) - .map_err(Error::GettingRefcount)?; - if refcount == 0 { - self.avail_clusters.push(i); - } - } - - Ok(()) - } - /// Rebuild the reference count tables. - fn rebuild_refcounts(raw_file: &mut QcowRawFile, header: QcowHeader) -> Result<()> { + fn rebuild_refcounts(raw_file: &mut QcowRawFile, header: QcowHeader) -> BlockResult<()> { fn add_ref( refcounts: &mut [u64], cluster_size: u64, @@ -1629,7 +1235,7 @@ impl QcowFile { let file_size = raw_file .file_mut() .metadata() - .map_err(Error::GettingFileSize)? + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::GettingFileSize(e)))? 
.len(); let refcount_bits = 1u64 << header.refcount_order; @@ -1659,25 +1265,33 @@ impl QcowFile { max_valid_cluster_index += refblocks_for_refs + reftable_clusters_for_refs; if max_valid_cluster_index > MAX_RAM_POINTER_TABLE_SIZE { - return Err(Error::InvalidRefcountTableSize(max_valid_cluster_index)); + return Err(BlockError::new( + BlockErrorKind::CorruptImage, + Error::InvalidRefcountTableSize(max_valid_cluster_index), + )); } let max_valid_cluster_offset = max_valid_cluster_index * cluster_size; if max_valid_cluster_offset < file_size - cluster_size { - return Err(Error::InvalidRefcountTableSize(max_valid_cluster_offset)); + return Err(BlockError::new( + BlockErrorKind::CorruptImage, + Error::InvalidRefcountTableSize(max_valid_cluster_offset), + )); } let mut refcounts = vec![0; max_valid_cluster_index as usize]; // Find all references clusters and rebuild refcounts. - set_header_refcount(&mut refcounts, cluster_size, max_refcount, refcount_bits)?; + set_header_refcount(&mut refcounts, cluster_size, max_refcount, refcount_bits) + .map_err(|e| BlockError::new(BlockErrorKind::Io, e))?; set_l1_refcounts( &mut refcounts, &header, cluster_size, max_refcount, refcount_bits, - )?; + ) + .map_err(|e| BlockError::new(BlockErrorKind::Io, e))?; set_data_refcounts( &mut refcounts, &header, @@ -1685,14 +1299,16 @@ impl QcowFile { raw_file, max_refcount, refcount_bits, - )?; + ) + .map_err(|e| BlockError::new(BlockErrorKind::Io, e))?; set_refcount_table_refcounts( &mut refcounts, &header, cluster_size, max_refcount, refcount_bits, - )?; + ) + .map_err(|e| BlockError::new(BlockErrorKind::Io, e))?; // Allocate clusters to store the new reference count blocks. let ref_table = alloc_refblocks( @@ -1701,7 +1317,8 @@ impl QcowFile { refblock_clusters, max_refcount, refcount_bits, - )?; + ) + .map_err(|e| BlockError::new(BlockErrorKind::Io, e))?; // Write updated reference counts and point the reftable at them. 
write_refblocks( @@ -1711,6 +1328,7 @@ impl QcowFile { raw_file, refcount_block_entries, ) + .map_err(|e| BlockError::new(BlockErrorKind::Io, e)) } // Limits the range so that it doesn't exceed the virtual size of the file. @@ -1733,12 +1351,6 @@ impl QcowFile { self.header.size } - // Gets the offset of `address` in the L1 table. - fn l1_address_offset(&self, address: u64) -> u64 { - let l1_index = self.l1_table_index(address); - l1_index * size_of::() as u64 - } - // Gets the offset of `address` in the L1 table. fn l1_table_index(&self, address: u64) -> u64 { (address / self.raw_file.cluster_size()) / self.l2_entries @@ -2561,25 +2173,7 @@ impl BlockBackend for QcowFile { } } -// Returns an Error if the given offset doesn't align to a cluster boundary. -fn offset_is_cluster_boundary(offset: u64, cluster_bits: u32) -> Result<()> { - if offset & ((0x01 << cluster_bits) - 1) != 0 { - return Err(Error::InvalidOffset(offset)); - } - Ok(()) -} - -// Ceiling of the division of `dividend`/`divisor`. -fn div_round_up_u64(dividend: u64, divisor: u64) -> u64 { - dividend / divisor + u64::from(!dividend.is_multiple_of(divisor)) -} - -// Ceiling of the division of `dividend`/`divisor`. 
-fn div_round_up_u32(dividend: u32, divisor: u32) -> u32 { - dividend / divisor + u32::from(!dividend.is_multiple_of(divisor)) -} - -fn convert_copy(reader: &mut R, writer: &mut W, offset: u64, size: u64) -> Result<()> +fn convert_copy(reader: &mut R, writer: &mut W, offset: u64, size: u64) -> BlockResult<()> where R: Read + Seek, W: Write + Seek, @@ -2589,16 +2183,18 @@ where let mut read_count = 0; reader .seek(SeekFrom::Start(offset)) - .map_err(Error::SeekingFile)?; + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SeekingFile(e)))?; writer .seek(SeekFrom::Start(offset)) - .map_err(Error::SeekingFile)?; + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SeekingFile(e)))?; loop { let this_count = min(CHUNK_SIZE as u64, size - read_count) as usize; let nread = reader .read(&mut buf[..this_count]) - .map_err(Error::ReadingData)?; - writer.write(&buf[..nread]).map_err(Error::WritingData)?; + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::ReadingData(e)))?; + writer + .write(&buf[..nread]) + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::WritingData(e)))?; read_count += nread as u64; if nread == 0 || read_count == size { break; @@ -2608,7 +2204,7 @@ where Ok(()) } -fn convert_reader_writer(reader: &mut R, writer: &mut W, size: u64) -> Result<()> +fn convert_reader_writer(reader: &mut R, writer: &mut W, size: u64) -> BlockResult<()> where R: Read + Seek + SeekHole, W: Write + Seek, @@ -2616,19 +2212,28 @@ where let mut offset = 0; while offset < size { // Find the next range of data. - let next_data = match reader.seek_data(offset).map_err(Error::SeekingFile)? { + let next_data = match reader + .seek_data(offset) + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SeekingFile(e)))? + { Some(o) => o, None => { // No more data in the file. break; } }; - let next_hole = match reader.seek_hole(next_data).map_err(Error::SeekingFile)? 
{ + let next_hole = match reader + .seek_hole(next_data) + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SeekingFile(e)))? + { Some(o) => o, None => { // This should not happen - there should always be at least one hole // after any data. - return Err(Error::SeekingFile(io::Error::from_raw_os_error(EINVAL))); + return Err(BlockError::new( + BlockErrorKind::Io, + Error::SeekingFile(io::Error::from_raw_os_error(EINVAL)), + )); } }; let count = next_hole - next_data; @@ -2639,19 +2244,26 @@ where Ok(()) } -fn convert_reader(reader: &mut R, dst_file: RawFile, dst_type: ImageType) -> Result<()> +fn convert_reader(reader: &mut R, dst_file: RawFile, dst_type: ImageType) -> BlockResult<()> where R: Read + Seek + SeekHole, { - let src_size = reader.seek(SeekFrom::End(0)).map_err(Error::SeekingFile)?; - reader.rewind().map_err(Error::SeekingFile)?; + let src_size = reader + .seek(SeekFrom::End(0)) + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SeekingFile(e)))?; + reader + .rewind() + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SeekingFile(e)))?; // Ensure the destination file is empty before writing to it. - dst_file.set_len(0).map_err(Error::SettingFileSize)?; + dst_file + .set_len(0) + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SettingFileSize(e)))?; match dst_type { ImageType::Qcow2 => { - let mut dst_writer = QcowFile::new(dst_file, 3, src_size, true)?; + let mut dst_writer = QcowFile::new(dst_file, 3, src_size, true) + .map_err(|e| BlockError::new(BlockErrorKind::Io, e))?; convert_reader_writer(reader, &mut dst_writer, src_size) } ImageType::Raw => { @@ -2660,7 +2272,7 @@ where // of the desired size. 
dst_writer .set_len(src_size) - .map_err(Error::SettingFileSize)?; + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SettingFileSize(e)))?; convert_reader_writer(reader, &mut dst_writer, src_size) } } @@ -2674,12 +2286,13 @@ pub fn convert( dst_file: RawFile, dst_type: ImageType, src_max_nesting_depth: u32, -) -> Result<()> { +) -> BlockResult<()> { let src_type = detect_image_type(&mut src_file)?; match src_type { ImageType::Qcow2 => { let mut src_reader = - QcowFile::from_with_nesting_depth(src_file, src_max_nesting_depth, true)?; + QcowFile::from_with_nesting_depth(src_file, src_max_nesting_depth, true) + .map_err(|e| BlockError::new(BlockErrorKind::Io, e))?; convert_reader(&mut src_reader, dst_file, dst_type) } ImageType::Raw => { @@ -2691,22 +2304,27 @@ pub fn convert( } /// Detect the type of an image file by checking for a valid qcow2 header. -pub fn detect_image_type(file: &mut RawFile) -> Result { - let orig_seek = file.stream_position().map_err(Error::SeekingFile)?; - file.rewind().map_err(Error::SeekingFile)?; - let magic = u32::read_be(file).map_err(Error::ReadingHeader)?; +pub fn detect_image_type(file: &mut RawFile) -> BlockResult { + let orig_seek = file + .stream_position() + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SeekingFile(e)))?; + file.rewind() + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SeekingFile(e)))?; + let magic = u32::read_be(file) + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::ReadingHeader(e)))?; let image_type = if magic == QCOW_MAGIC { ImageType::Qcow2 } else { ImageType::Raw }; file.seek(SeekFrom::Start(orig_seek)) - .map_err(Error::SeekingFile)?; + .map_err(|e| BlockError::new(BlockErrorKind::Io, Error::SeekingFile(e)))?; Ok(image_type) } #[cfg(test)] mod unit_tests { + use std::error::Error as StdError; use std::fs::File; use std::path::Path; @@ -2714,6 +2332,7 @@ mod unit_tests { use vmm_sys_util::tempfile::TempFile; use vmm_sys_util::write_zeroes::WriteZeroes; + use 
super::util::{COMPRESSED_FLAG, ZERO_FLAG}; use super::*; fn valid_header_v3() -> Vec { @@ -3057,7 +2676,11 @@ mod unit_tests { disk_file.rewind().unwrap(); // The maximum nesting depth is 0, which means backing file is not allowed. let res = QcowFile::from_with_nesting_depth(disk_file, 0, true); - assert!(matches!(res.unwrap_err(), Error::MaxNestingDepthExceeded)); + let err = res.unwrap_err(); + assert!(matches!(err.kind(), BlockErrorKind::Overflow)); + let source = StdError::source(&err).unwrap(); + let qcow_err = source.downcast_ref::().unwrap(); + assert!(matches!(qcow_err, Error::MaxNestingDepthExceeded)); } /// Create a qcow2 file with itself as its backing file. @@ -3544,7 +3167,12 @@ mod unit_tests { let result = q.resize(smaller_size); assert!(result.is_err()); - assert!(matches!(result.unwrap_err(), Error::ShrinkNotSupported)); + let err = result.unwrap_err(); + assert!(matches!(err.kind(), BlockErrorKind::UnsupportedFeature)); + assert!(matches!( + err.downcast_ref::(), + Some(Error::ShrinkNotSupported) + )); assert_eq!(q.virtual_size(), original_size); }); @@ -3575,7 +3203,12 @@ mod unit_tests { let result = overlay.resize(backing_size * 2); assert!(result.is_err()); - assert!(matches!(result.unwrap_err(), Error::ResizeWithBackingFile)); + let err = result.unwrap_err(); + assert!(matches!(err.kind(), BlockErrorKind::UnsupportedFeature)); + assert!(matches!( + err.downcast_ref::(), + Some(Error::ResizeWithBackingFile) + )); assert_eq!(overlay.virtual_size(), backing_size); } @@ -4365,7 +3998,7 @@ mod unit_tests { assert!(result.is_err()); let err = result.unwrap_err(); assert!( - matches!(err, Error::CorruptImage), + matches!(err.kind(), BlockErrorKind::CorruptImage), "Expected CorruptImage error, got: {err:?}" ); }); @@ -4379,8 +4012,11 @@ mod unit_tests { let result = QcowFile::from(disk_file); assert!(result.is_err()); let err = result.unwrap_err(); + assert!(matches!(err.kind(), BlockErrorKind::UnsupportedFeature)); + let source = 
StdError::source(&err).unwrap(); + let qcow_err = source.downcast_ref::().unwrap(); assert!( - matches!(err, Error::UnsupportedFeature(ref v) if v.to_string().contains("external")), + matches!(qcow_err, Error::UnsupportedFeature(v) if v.to_string().contains("external")), "Expected UnsupportedFeature error mentioning external, got: {err:?}" ); }); @@ -4394,8 +4030,11 @@ mod unit_tests { let result = QcowFile::from(disk_file); assert!(result.is_err()); let err = result.unwrap_err(); + assert!(matches!(err.kind(), BlockErrorKind::UnsupportedFeature)); + let source = StdError::source(&err).unwrap(); + let qcow_err = source.downcast_ref::().unwrap(); assert!( - matches!(err, Error::UnsupportedFeature(ref v) if v.to_string().contains("extended")), + matches!(qcow_err, Error::UnsupportedFeature(v) if v.to_string().contains("extended")), "Expected UnsupportedFeature error mentioning extended, got: {err:?}" ); }); @@ -4408,7 +4047,10 @@ mod unit_tests { with_basic_file(&header, |disk_file: RawFile| { let result = QcowFile::from(disk_file); assert!(result.is_err()); - assert!(matches!(result.unwrap_err(), Error::UnsupportedFeature(_))); + assert!(matches!( + result.unwrap_err().kind(), + BlockErrorKind::UnsupportedFeature + )); }); } @@ -4420,8 +4062,11 @@ mod unit_tests { let result = QcowFile::from(disk_file); assert!(result.is_err()); let err = result.unwrap_err(); + assert!(matches!(err.kind(), BlockErrorKind::UnsupportedFeature)); + let source = StdError::source(&err).unwrap(); + let qcow_err = source.downcast_ref::().unwrap(); assert!( - matches!(err, Error::UnsupportedFeature(ref v) if v.to_string().contains("unknown")), + matches!(qcow_err, Error::UnsupportedFeature(v) if v.to_string().contains("unknown")), "Expected UnsupportedFeature error mentioning unknown, got: {err:?}" ); }); @@ -4621,7 +4266,7 @@ mod unit_tests { assert!(result.is_err()); let err = result.unwrap_err(); assert!( - matches!(err, Error::CorruptImage), + matches!(err.kind(), 
BlockErrorKind::CorruptImage), "Expected CorruptImage error, got: {err:?}" ); }); diff --git a/block/src/qcow/qcow_raw_file.rs b/block/src/qcow/qcow_raw_file.rs index 92a569d347..232f6b5a5c 100644 --- a/block/src/qcow/qcow_raw_file.rs +++ b/block/src/qcow/qcow_raw_file.rs @@ -7,7 +7,7 @@ use std::fmt::Debug; use std::io::{self, BufWriter, Read, Seek, SeekFrom, Write}; use std::mem::size_of; -use std::os::fd::{AsRawFd, RawFd}; +use std::os::fd::{AsFd, AsRawFd, BorrowedFd, RawFd}; use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use vmm_sys_util::write_zeroes::WriteZeroes; @@ -361,3 +361,9 @@ impl AsRawFd for QcowRawFile { self.file.as_raw_fd() } } + +impl AsFd for QcowRawFile { + fn as_fd(&self) -> BorrowedFd<'_> { + self.file.as_fd() + } +} diff --git a/block/src/qcow/raw_file.rs b/block/src/qcow/raw_file.rs index eda7751c3f..c2a01811f8 100644 --- a/block/src/qcow/raw_file.rs +++ b/block/src/qcow/raw_file.rs @@ -11,6 +11,7 @@ use std::alloc::{Layout, alloc_zeroed, dealloc}; use std::fs::{File, Metadata}; use std::io::{self, Read, Seek, SeekFrom, Write}; +use std::os::fd::{AsFd, BorrowedFd}; use std::os::unix::io::{AsRawFd, RawFd}; use std::slice; @@ -19,7 +20,7 @@ use vmm_sys_util::file_traits::FileSync; use vmm_sys_util::seek_hole::SeekHole; use vmm_sys_util::write_zeroes::{PunchHole, WriteZeroesAt}; -use crate::BlockBackend; +use crate::{BlockBackend, query_device_size}; #[derive(Debug)] pub struct RawFile { @@ -124,6 +125,10 @@ impl RawFile { self.direct_io } + pub fn alignment(&self) -> usize { + self.alignment + } + /// Returns true if the file was opened with write access. pub fn is_writable(&self) -> bool { // SAFETY: fcntl with F_GETFL is safe and doesn't modify the file descriptor @@ -373,11 +378,15 @@ impl SeekHole for RawFile { impl BlockBackend for RawFile { fn logical_size(&self) -> std::result::Result { - Ok(self.metadata().map_err(crate::Error::RawFileError)?.len()) + Ok(query_device_size(&self.file) + .map_err(crate::Error::RawFileError)? 
+ .0) } fn physical_size(&self) -> std::result::Result { - Ok(self.metadata().map_err(crate::Error::RawFileError)?.len()) + Ok(query_device_size(&self.file) + .map_err(crate::Error::RawFileError)? + .1) } } @@ -397,3 +406,9 @@ impl AsRawFd for RawFile { self.file.as_raw_fd() } } + +impl AsFd for RawFile { + fn as_fd(&self) -> BorrowedFd<'_> { + self.file.as_fd() + } +} diff --git a/block/src/qcow/util.rs b/block/src/qcow/util.rs new file mode 100644 index 0000000000..bc8d017725 --- /dev/null +++ b/block/src/qcow/util.rs @@ -0,0 +1,79 @@ +// Copyright 2018 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. +// +// Copyright 2026 The Cloud Hypervisor Authors. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +//! Pure helper functions and constants for QCOW2 L1/L2 table entry +//! manipulation and integer arithmetic. Shared across the `qcow` submodules. + +/// Nesting depth limit for disk formats that can open other disk files. +pub(crate) const MAX_NESTING_DEPTH: u32 = 10; + +// bits 0-8 and 56-63 are reserved. +pub(super) const L1_TABLE_OFFSET_MASK: u64 = 0x00ff_ffff_ffff_fe00; +pub(super) const L2_TABLE_OFFSET_MASK: u64 = 0x00ff_ffff_ffff_fe00; +// Flags +pub(super) const ZERO_FLAG: u64 = 1 << 0; +pub(super) const COMPRESSED_FLAG: u64 = 1 << 62; +pub(super) const COMPRESSED_SECTOR_SIZE: u64 = 512; +pub(super) const CLUSTER_USED_FLAG: u64 = 1 << 63; + +/// Check if L2 entry is empty (unallocated). +pub(super) fn l2_entry_is_empty(l2_entry: u64) -> bool { + l2_entry == 0 +} + +/// Check bit 0 - only valid for standard clusters. +pub(super) fn l2_entry_is_zero(l2_entry: u64) -> bool { + l2_entry & ZERO_FLAG != 0 +} + +/// Check if L2 entry refers to a compressed cluster. 
+pub(super) fn l2_entry_is_compressed(l2_entry: u64) -> bool { + l2_entry & COMPRESSED_FLAG != 0 +} + +/// Get file offset and size of compressed cluster data. +pub(super) fn l2_entry_compressed_cluster_layout(l2_entry: u64, cluster_bits: u32) -> (u64, usize) { + let compressed_size_shift = 62 - (cluster_bits - 8); + let compressed_size_mask = (1 << (cluster_bits - 8)) - 1; + let compressed_cluster_addr = l2_entry & ((1 << compressed_size_shift) - 1); + let nsectors = (l2_entry >> compressed_size_shift & compressed_size_mask) + 1; + let compressed_cluster_size = ((nsectors * COMPRESSED_SECTOR_SIZE) + - (compressed_cluster_addr & (COMPRESSED_SECTOR_SIZE - 1))) + as usize; + (compressed_cluster_addr, compressed_cluster_size) +} + +/// Get file offset of standard (non-compressed) cluster. +pub(super) fn l2_entry_std_cluster_addr(l2_entry: u64) -> u64 { + l2_entry & L2_TABLE_OFFSET_MASK +} + +/// Make L2 entry for standard (non-compressed) cluster. +pub(super) fn l2_entry_make_std(cluster_addr: u64) -> u64 { + (cluster_addr & L2_TABLE_OFFSET_MASK) | CLUSTER_USED_FLAG +} + +/// Make L2 entry for preallocated zero cluster. +pub(super) fn l2_entry_make_zero(cluster_addr: u64) -> u64 { + (cluster_addr & L2_TABLE_OFFSET_MASK) | CLUSTER_USED_FLAG | ZERO_FLAG +} + +/// Make L1 entry with optional flags. +pub(super) fn l1_entry_make(cluster_addr: u64, refcount_is_one: bool) -> u64 { + (cluster_addr & L1_TABLE_OFFSET_MASK) | (refcount_is_one as u64 * CLUSTER_USED_FLAG) +} + +/// Ceiling of the division of `dividend`/`divisor`. +pub(super) fn div_round_up_u32(dividend: u32, divisor: u32) -> u32 { + dividend / divisor + u32::from(!dividend.is_multiple_of(divisor)) +} + +/// Ceiling of the division of `dividend`/`divisor`. 
+pub(super) fn div_round_up_u64(dividend: u64, divisor: u64) -> u64 { + dividend / divisor + u64::from(!dividend.is_multiple_of(divisor)) +} diff --git a/block/src/qcow_async.rs b/block/src/qcow_async.rs new file mode 100644 index 0000000000..308c2e27d0 --- /dev/null +++ b/block/src/qcow_async.rs @@ -0,0 +1,1090 @@ +// Copyright © 2021 Intel Corporation +// +// Copyright 2026 The Cloud Hypervisor Authors. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +//! QCOW2 async disk backend. + +use std::cmp::{max, min}; +use std::collections::VecDeque; +use std::fs::File; +use std::io::Error; +use std::os::fd::{AsFd, AsRawFd}; +use std::sync::Arc; +use std::{fmt, io}; + +use io_uring::{IoUring, opcode, types}; +use vmm_sys_util::eventfd::EventFd; +use vmm_sys_util::write_zeroes::{PunchHole, WriteZeroesAt}; + +use crate::async_io::{AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFileError}; +use crate::error::{BlockError, BlockErrorKind, BlockResult, ErrorOp}; +use crate::qcow::backing::shared_backing_from; +use crate::qcow::metadata::{ + BackingRead, ClusterReadMapping, ClusterWriteMapping, DeallocAction, QcowMetadata, +}; +use crate::qcow::qcow_raw_file::QcowRawFile; +use crate::qcow::{MAX_NESTING_DEPTH, RawFile, parse_qcow}; +use crate::qcow_common::{ + AlignedBuf, aligned_pread, aligned_pwrite, gather_from_iovecs_into, pread_exact, pwrite_all, + scatter_to_iovecs, zero_fill_iovecs, +}; +use crate::{BatchRequest, RequestType, SECTOR_SIZE, disk_file}; + +/// Device level handle for a QCOW2 image. +/// +/// Owns the parsed metadata and backing file chain. One instance is +/// created per disk and shared across virtio queues. 
+pub struct QcowDiskAsync { + metadata: Arc, + backing_file: Option>, + sparse: bool, + data_raw_file: QcowRawFile, +} + +impl fmt::Debug for QcowDiskAsync { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("QcowDiskAsync") + .field("sparse", &self.sparse) + .field("has_backing", &self.backing_file.is_some()) + .finish_non_exhaustive() + } +} + +impl QcowDiskAsync { + pub fn new( + file: File, + direct_io: bool, + backing_files: bool, + sparse: bool, + ) -> BlockResult { + let max_nesting_depth = if backing_files { MAX_NESTING_DEPTH } else { 0 }; + let (inner, backing_file, sparse) = + parse_qcow(RawFile::new(file, direct_io), max_nesting_depth, sparse).map_err(|e| { + let e = if !backing_files && matches!(e.kind(), BlockErrorKind::Overflow) { + e.with_kind(BlockErrorKind::UnsupportedFeature) + } else { + e + }; + e.with_op(ErrorOp::Open) + })?; + let data_raw_file = inner.raw_file.clone(); + Ok(QcowDiskAsync { + metadata: Arc::new(QcowMetadata::new(inner)), + backing_file: backing_file.map(shared_backing_from).transpose()?, + sparse, + data_raw_file, + }) + } +} + +impl Drop for QcowDiskAsync { + fn drop(&mut self) { + self.metadata.shutdown(); + } +} + +impl disk_file::DiskSize for QcowDiskAsync { + fn logical_size(&self) -> BlockResult { + Ok(self.metadata.virtual_size()) + } +} + +impl disk_file::PhysicalSize for QcowDiskAsync { + fn physical_size(&self) -> BlockResult { + Ok(self.data_raw_file.physical_size()?) 
+ } +} + +impl disk_file::DiskFd for QcowDiskAsync { + fn fd(&self) -> BorrowedDiskFd<'_> { + BorrowedDiskFd::new(self.data_raw_file.as_fd().as_raw_fd()) + } +} + +impl disk_file::Geometry for QcowDiskAsync {} + +impl disk_file::SparseCapable for QcowDiskAsync { + fn supports_sparse_operations(&self) -> bool { + true + } + + fn supports_zero_flag(&self) -> bool { + true + } +} + +impl disk_file::Resizable for QcowDiskAsync { + fn resize(&mut self, size: u64) -> BlockResult<()> { + if self.backing_file.is_some() { + return Err(BlockError::new( + BlockErrorKind::UnsupportedFeature, + DiskFileError::ResizeError(io::Error::other( + "resize not supported with backing file", + )), + ) + .with_op(ErrorOp::Resize)); + } + self.metadata.resize(size).map_err(|e| { + BlockError::new(BlockErrorKind::Io, DiskFileError::ResizeError(e)) + .with_op(ErrorOp::Resize) + }) + } +} + +impl disk_file::DiskFile for QcowDiskAsync {} + +impl disk_file::AsyncDiskFile for QcowDiskAsync { + fn try_clone(&self) -> BlockResult> { + Ok(Box::new(QcowDiskAsync { + metadata: Arc::clone(&self.metadata), + backing_file: self.backing_file.as_ref().map(Arc::clone), + sparse: self.sparse, + data_raw_file: self.data_raw_file.clone(), + })) + } + + fn new_async_io(&self, ring_depth: u32) -> BlockResult> { + Ok(Box::new( + QcowAsync::new( + Arc::clone(&self.metadata), + self.data_raw_file.clone(), + self.backing_file.as_ref().map(Arc::clone), + self.sparse, + ring_depth, + ) + .map_err(|e| BlockError::new(BlockErrorKind::Io, DiskFileError::NewAsyncIo(e)))?, + )) + } +} + +/// Per queue QCOW2 I/O worker using io_uring. +/// +/// Reads against fully allocated single mapping clusters are submitted +/// to io_uring for true asynchronous completion. All other cluster +/// types (zero, compressed, backing) and multi mapping reads fall back +/// to synchronous I/O with synthetic completions. +/// +/// Writes are synchronous because metadata allocation must complete +/// before the host offset is known. 
+pub struct QcowAsync { + metadata: Arc, + data_file: QcowRawFile, + backing_file: Option>, + sparse: bool, + /// O_DIRECT alignment requirement (0 = no alignment needed). + alignment: usize, + /// I/O alignment for the AsyncIo trait (at least SECTOR_SIZE). + io_alignment: u64, + io_uring: IoUring, + eventfd: EventFd, + completion_list: VecDeque<(u64, i32)>, +} + +impl QcowAsync { + fn new( + metadata: Arc, + data_file: QcowRawFile, + backing_file: Option>, + sparse: bool, + ring_depth: u32, + ) -> io::Result { + let alignment = data_file.file().alignment(); + let io_alignment = max(alignment as u64, SECTOR_SIZE); + let io_uring = IoUring::new(ring_depth)?; + let eventfd = EventFd::new(libc::EFD_NONBLOCK)?; + io_uring.submitter().register_eventfd(eventfd.as_raw_fd())?; + + Ok(QcowAsync { + metadata, + data_file, + backing_file, + sparse, + alignment, + io_alignment, + io_uring, + eventfd, + completion_list: VecDeque::new(), + }) + } + + fn apply_dealloc_action(&mut self, action: &DeallocAction) { + match action { + DeallocAction::PunchHole { + host_offset, + length, + } => { + let _ = self.data_file.file_mut().punch_hole(*host_offset, *length); + } + DeallocAction::WriteZeroes { + host_offset, + length, + } => { + let _ = self + .data_file + .file_mut() + .write_zeroes_at(*host_offset, *length); + } + } + } +} + +impl AsyncIo for QcowAsync { + fn notifier(&self) -> &EventFd { + &self.eventfd + } + + fn read_vectored( + &mut self, + offset: libc::off_t, + iovecs: &[libc::iovec], + user_data: u64, + ) -> AsyncIoResult<()> { + let total_len: usize = iovecs.iter().map(|v| v.iov_len).sum(); + + if let Some(host_offset) = Self::resolve_read( + &self.metadata, + &self.data_file, + &self.backing_file, + offset as u64, + iovecs, + total_len, + self.alignment, + )? { + let fd = self.data_file.as_raw_fd(); + let (submitter, mut sq, _) = self.io_uring.split(); + + // SAFETY: fd is valid and iovecs point to valid guest memory. 
+ unsafe { + sq.push( + &opcode::Readv::new(types::Fd(fd), iovecs.as_ptr(), iovecs.len() as u32) + .offset(host_offset) + .build() + .user_data(user_data), + ) + .map_err(|_| { + AsyncIoError::ReadVectored(Error::other("Submission queue is full")) + })?; + }; + + sq.sync(); + submitter.submit().map_err(AsyncIoError::ReadVectored)?; + } else { + self.completion_list + .push_back((user_data, total_len as i32)); + self.eventfd.write(1).unwrap(); + } + Ok(()) + } + + // TODO Make writes async. + // Writes are synchronous. Async writes require a multi step + // state machine for COW (backing read, cluster allocation, data + // write, L2 commit) with per request buffer lifetime tracking + // and write ordering. + fn write_vectored( + &mut self, + offset: libc::off_t, + iovecs: &[libc::iovec], + user_data: u64, + ) -> AsyncIoResult<()> { + Self::cow_write_sync( + offset as u64, + iovecs, + &self.metadata, + &self.data_file, + &self.backing_file, + self.alignment, + )?; + + let total_len: usize = iovecs.iter().map(|v| v.iov_len).sum(); + self.completion_list + .push_back((user_data, total_len as i32)); + self.eventfd.write(1).unwrap(); + Ok(()) + } + + fn fsync(&mut self, user_data: Option) -> AsyncIoResult<()> { + self.metadata.flush().map_err(AsyncIoError::Fsync)?; + if let Some(user_data) = user_data { + self.completion_list.push_back((user_data, 0)); + self.eventfd.write(1).unwrap(); + } + Ok(()) + } + + fn next_completed_request(&mut self) -> Option<(u64, i32)> { + // Drain io_uring completions first, then synthetic ones. 
+ self.io_uring + .completion() + .next() + .map(|entry| (entry.user_data(), entry.result())) + .or_else(|| self.completion_list.pop_front()) + } + + fn punch_hole(&mut self, offset: u64, length: u64, user_data: u64) -> AsyncIoResult<()> { + let virtual_size = self.metadata.virtual_size(); + let cluster_size = self.metadata.cluster_size(); + + let result = self + .metadata + .deallocate_bytes( + offset, + length as usize, + self.sparse, + virtual_size, + cluster_size, + self.backing_file.as_deref(), + ) + .map_err(AsyncIoError::PunchHole); + + match result { + Ok(actions) => { + for action in &actions { + self.apply_dealloc_action(action); + } + self.completion_list.push_back((user_data, 0)); + self.eventfd.write(1).unwrap(); + Ok(()) + } + Err(e) => { + let errno = if let AsyncIoError::PunchHole(ref io_err) = e { + -io_err.raw_os_error().unwrap_or(libc::EIO) + } else { + -libc::EIO + }; + self.completion_list.push_back((user_data, errno)); + self.eventfd.write(1).unwrap(); + Ok(()) + } + } + } + + fn write_zeroes(&mut self, offset: u64, length: u64, user_data: u64) -> AsyncIoResult<()> { + // For QCOW2, zeroing and hole punching are the same operation. + // Both discard guest data so the range reads back as zero. + self.punch_hole(offset, length, user_data) + } + + fn batch_requests_enabled(&self) -> bool { + true + } + + fn alignment(&self) -> u64 { + self.io_alignment + } + + fn submit_batch_requests(&mut self, batch_request: &[BatchRequest]) -> AsyncIoResult<()> { + let (submitter, mut sq, _) = self.io_uring.split(); + let mut needs_submit = false; + let mut sync_completions: Vec<(u64, i32)> = Vec::new(); + + for req in batch_request { + match req.request_type { + RequestType::In => { + let total_len: usize = req.iovecs.iter().map(|v| v.iov_len).sum(); + + if let Some(host_offset) = Self::resolve_read( + &self.metadata, + &self.data_file, + &self.backing_file, + req.offset as u64, + &req.iovecs, + total_len, + self.alignment, + )? 
{ + let fd = self.data_file.as_raw_fd(); + // SAFETY: fd is valid and iovecs point to valid guest memory. + unsafe { + sq.push( + &opcode::Readv::new( + types::Fd(fd), + req.iovecs.as_ptr(), + req.iovecs.len() as u32, + ) + .offset(host_offset) + .build() + .user_data(req.user_data), + ) + .map_err(|_| { + AsyncIoError::ReadVectored(Error::other("Submission queue is full")) + })?; + } + needs_submit = true; + } else { + sync_completions.push((req.user_data, total_len as i32)); + } + } + RequestType::Out => { + let total_len: usize = req.iovecs.iter().map(|v| v.iov_len).sum(); + Self::cow_write_sync( + req.offset as u64, + &req.iovecs, + &self.metadata, + &self.data_file, + &self.backing_file, + self.alignment, + )?; + sync_completions.push((req.user_data, total_len as i32)); + } + _ => { + unreachable!("Unexpected batch request type: {:?}", req.request_type) + } + } + } + + if needs_submit { + sq.sync(); + submitter + .submit() + .map_err(AsyncIoError::SubmitBatchRequests)?; + } + + if !sync_completions.is_empty() { + for c in sync_completions { + self.completion_list.push_back(c); + } + self.eventfd.write(1).unwrap(); + } + + Ok(()) + } +} + +impl QcowAsync { + /// Resolves read mappings for a guest read request. + /// + /// Returns `Some(host_offset)` if the entire read falls within a single + /// allocated cluster (fast path). Otherwise handles the read + /// synchronously via `scatter_read_sync` and returns `None`. + fn resolve_read( + metadata: &QcowMetadata, + data_file: &QcowRawFile, + backing_file: &Option>, + address: u64, + iovecs: &[libc::iovec], + total_len: usize, + alignment: usize, + ) -> AsyncIoResult> { + let has_backing = backing_file.is_some(); + let mappings = metadata + .map_clusters_for_read(address, total_len, has_backing) + .map_err(AsyncIoError::ReadVectored)?; + + // The fast path returns a host offset so the caller can submit a + // single io_uring readv with the original iovecs. 
This only works + // without O_DIRECT because it requires I/O + // size and file offset to be multiples of the device sector size. + // Guest requests can be smaller (e.g. 512 byte UEFI reads on a + // 4096 byte sector device), so O_DIRECT reads fall through to the + // alignment aware synchronous path instead. + if alignment == 0 + && mappings.len() == 1 + && let ClusterReadMapping::Allocated { + offset: host_offset, + length, + } = &mappings[0] + && *length as usize == total_len + { + return Ok(Some(*host_offset)); + } + + Self::scatter_read_sync(mappings, iovecs, data_file, backing_file, alignment)?; + Ok(None) + } + + /// Scatter-read cluster mappings synchronously into iovec buffers. + fn scatter_read_sync( + mappings: Vec, + iovecs: &[libc::iovec], + data_file: &QcowRawFile, + backing_file: &Option>, + alignment: usize, + ) -> AsyncIoResult<()> { + let mut buf_offset = 0usize; + for mapping in mappings { + match mapping { + ClusterReadMapping::Zero { length } => { + // SAFETY: iovecs point to valid guest memory buffers. + unsafe { + zero_fill_iovecs(iovecs, buf_offset, length as usize); + } + buf_offset += length as usize; + } + ClusterReadMapping::Allocated { + offset: host_offset, + length, + } => { + let len = length as usize; + if alignment > 0 { + let mut abuf = + AlignedBuf::new(len, alignment).map_err(AsyncIoError::ReadVectored)?; + aligned_pread( + data_file.as_raw_fd(), + abuf.as_mut_slice(len), + host_offset, + alignment, + ) + .map_err(AsyncIoError::ReadVectored)?; + // SAFETY: iovecs point to valid guest memory buffers. + unsafe { scatter_to_iovecs(iovecs, buf_offset, abuf.as_slice(len)) }; + } else { + let mut buf = vec![0u8; len]; + pread_exact(data_file.as_raw_fd(), &mut buf, host_offset) + .map_err(AsyncIoError::ReadVectored)?; + // SAFETY: iovecs point to valid guest memory buffers. 
+ unsafe { scatter_to_iovecs(iovecs, buf_offset, &buf) }; + } + buf_offset += len; + } + ClusterReadMapping::Compressed { data } => { + let len = data.len(); + // SAFETY: iovecs point to valid guest memory buffers. + unsafe { scatter_to_iovecs(iovecs, buf_offset, &data) }; + buf_offset += len; + } + ClusterReadMapping::Backing { + offset: backing_offset, + length, + } => { + let mut buf = vec![0u8; length as usize]; + backing_file + .as_ref() + .unwrap() + .read_at(backing_offset, &mut buf) + .map_err(AsyncIoError::ReadVectored)?; + // SAFETY: iovecs point to valid guest memory buffers. + unsafe { scatter_to_iovecs(iovecs, buf_offset, &buf) }; + buf_offset += length as usize; + } + } + } + Ok(()) + } + + /// Write iovec data cluster-by-cluster with COW from backing file. + fn cow_write_sync( + address: u64, + iovecs: &[libc::iovec], + metadata: &QcowMetadata, + data_file: &QcowRawFile, + backing_file: &Option>, + alignment: usize, + ) -> AsyncIoResult<()> { + let total_len: usize = iovecs.iter().map(|v| v.iov_len).sum(); + let cluster_size = metadata.cluster_size(); + let mut buf_offset = 0usize; + + while buf_offset < total_len { + let curr_addr = address + buf_offset as u64; + let intra_offset = metadata.cluster_offset(curr_addr); + let remaining_in_cluster = (cluster_size - intra_offset) as usize; + let count = min(total_len - buf_offset, remaining_in_cluster); + + let backing_data = if let Some(backing) = backing_file + .as_ref() + .filter(|_| intra_offset != 0 || count < cluster_size as usize) + { + let cluster_begin = curr_addr - intra_offset; + let mut data = vec![0u8; cluster_size as usize]; + backing + .read_at(cluster_begin, &mut data) + .map_err(AsyncIoError::WriteVectored)?; + Some(data) + } else { + None + }; + + let mapping = metadata + .map_cluster_for_write(curr_addr, backing_data) + .map_err(AsyncIoError::WriteVectored)?; + + match mapping { + ClusterWriteMapping::Allocated { + offset: host_offset, + } => { + if alignment > 0 { + // O_DIRECT, 
gather directly into aligned buffer. + let mut abuf = AlignedBuf::new(count, alignment) + .map_err(AsyncIoError::WriteVectored)?; + // SAFETY: iovecs point to valid guest memory buffers + unsafe { + gather_from_iovecs_into(iovecs, buf_offset, abuf.as_mut_slice(count)); + } + aligned_pwrite( + data_file.as_raw_fd(), + abuf.as_slice(count), + host_offset, + alignment, + ) + .map_err(AsyncIoError::WriteVectored)?; + } else { + // No O_DIRECT, plain buffer is fine. + let mut buf = vec![0u8; count]; + // SAFETY: iovecs point to valid guest memory buffers. + unsafe { + gather_from_iovecs_into(iovecs, buf_offset, &mut buf); + } + pwrite_all(data_file.as_raw_fd(), &buf, host_offset) + .map_err(AsyncIoError::WriteVectored)?; + } + } + } + buf_offset += count; + } + Ok(()) + } +} + +#[cfg(test)] +mod unit_tests { + use std::io::{Seek, SeekFrom, Write}; + + use vmm_sys_util::tempfile::TempFile; + + use super::*; + use crate::disk_file::AsyncDiskFile; + use crate::qcow::{QcowFile, RawFile}; + use crate::{BatchRequest, RequestType, SECTOR_SIZE}; + + fn create_disk_with_data( + file_size: u64, + data: &[u8], + offset: u64, + sparse: bool, + ) -> (TempFile, QcowDiskAsync) { + let temp_file = TempFile::new().unwrap(); + { + let raw_file = RawFile::new(temp_file.as_file().try_clone().unwrap(), false); + let mut qcow_file = QcowFile::new(raw_file, 3, file_size, sparse).unwrap(); + qcow_file.seek(SeekFrom::Start(offset)).unwrap(); + qcow_file.write_all(data).unwrap(); + qcow_file.flush().unwrap(); + } + let disk = QcowDiskAsync::new( + temp_file.as_file().try_clone().unwrap(), + false, + false, + sparse, + ) + .unwrap(); + (temp_file, disk) + } + + fn wait_for_completion(async_io: &mut dyn AsyncIo) -> (u64, i32) { + loop { + if let Some(c) = async_io.next_completed_request() { + return c; + } + // Block until the eventfd is signaled (io_uring or synthetic). + let fd = async_io.notifier().as_raw_fd(); + let mut val = 0u64; + // SAFETY: reading 8 bytes from a valid eventfd. 
+ unsafe { + libc::read(fd, &mut val as *mut u64 as *mut libc::c_void, 8); + } + } + } + + fn async_write(disk: &QcowDiskAsync, offset: u64, data: &[u8]) { + let mut async_io = disk.new_async_io(1).unwrap(); + let iovec = libc::iovec { + iov_base: data.as_ptr() as *mut libc::c_void, + iov_len: data.len(), + }; + async_io + .write_vectored(offset as libc::off_t, &[iovec], 2) + .unwrap(); + let (user_data, result) = wait_for_completion(async_io.as_mut()); + assert_eq!(user_data, 2); + assert_eq!( + result as usize, + data.len(), + "write should return requested length" + ); + } + + fn async_read(disk: &QcowDiskAsync, offset: u64, len: usize) -> Vec { + let mut async_io = disk.new_async_io(1).unwrap(); + let mut buf = vec![0xFFu8; len]; + let iovec = libc::iovec { + iov_base: buf.as_mut_ptr() as *mut libc::c_void, + iov_len: buf.len(), + }; + async_io + .read_vectored(offset as libc::off_t, &[iovec], 1) + .unwrap(); + let (user_data, result) = wait_for_completion(async_io.as_mut()); + assert_eq!(user_data, 1); + assert_eq!(result as usize, len, "read should return requested length"); + buf + } + + #[test] + fn test_qcow_async_punch_hole_completion() { + let data = vec![0xDD; 128 * 1024]; + let offset = 0u64; + let (_temp, disk) = create_disk_with_data(100 * 1024 * 1024, &data, offset, true); + + let mut async_io = disk.new_async_io(1).unwrap(); + async_io.punch_hole(offset, data.len() as u64, 100).unwrap(); + let (user_data, result) = async_io.next_completed_request().unwrap(); + assert_eq!(user_data, 100); + assert_eq!(result, 0, "punch_hole should succeed"); + drop(async_io); + + let read_buf = async_read(&disk, offset, data.len()); + assert!( + read_buf.iter().all(|&b| b == 0), + "Punched hole should read as zeros" + ); + } + + #[test] + fn test_qcow_async_write_zeroes_completion() { + let data = vec![0xAA; 128 * 1024]; + let offset = 0u64; + let (_temp, disk) = create_disk_with_data(100 * 1024 * 1024, &data, offset, true); + + let mut async_io = 
disk.new_async_io(1).unwrap(); + async_io + .write_zeroes(offset, data.len() as u64, 200) + .unwrap(); + let (user_data, result) = async_io.next_completed_request().unwrap(); + assert_eq!(user_data, 200); + assert_eq!(result, 0, "write_zeroes should succeed"); + drop(async_io); + + let read_buf = async_read(&disk, offset, data.len()); + assert!( + read_buf.iter().all(|&b| b == 0), + "Write zeroes region should read as zeros" + ); + } + + #[test] + fn test_qcow_async_write_read_roundtrip() { + let file_size = 100 * 1024 * 1024; + let temp_file = TempFile::new().unwrap(); + { + let raw_file = RawFile::new(temp_file.as_file().try_clone().unwrap(), false); + QcowFile::new(raw_file, 3, file_size, true).unwrap(); + } + let disk = QcowDiskAsync::new(temp_file.as_file().try_clone().unwrap(), false, false, true) + .unwrap(); + + let pattern: Vec = (0..128 * 1024).map(|i| (i % 251) as u8).collect(); + let offset = 64 * 1024; + + async_write(&disk, offset, &pattern); + let read_buf = async_read(&disk, offset, pattern.len()); + assert_eq!(read_buf, pattern, "read should match written data"); + } + + #[test] + fn test_qcow_async_read_spanning_cluster_boundary() { + let cluster_size: u64 = 65536; + let file_size = 100 * 1024 * 1024; + + // Write distinct patterns into two adjacent clusters. + let pattern_a = vec![0xAA; cluster_size as usize]; + let pattern_b = vec![0xBB; cluster_size as usize]; + let (_temp, disk) = create_disk_with_data(file_size, &pattern_a, 0, true); + async_write(&disk, cluster_size, &pattern_b); + + // Read across the boundary: last 4K of cluster 0 + first 4K of cluster 1. 
+ let read_offset = cluster_size - 4096; + let read_len = 8192; + let buf = async_read(&disk, read_offset, read_len); + + assert!( + buf[..4096].iter().all(|&b| b == 0xAA), + "first half should come from cluster 0" + ); + assert!( + buf[4096..].iter().all(|&b| b == 0xBB), + "second half should come from cluster 1" + ); + } + + #[test] + fn test_qcow_async_batch_mixed_requests() { + let file_size = 100 * 1024 * 1024; + let temp_file = TempFile::new().unwrap(); + { + let raw_file = RawFile::new(temp_file.as_file().try_clone().unwrap(), false); + QcowFile::new(raw_file, 3, file_size, true).unwrap(); + } + let disk = QcowDiskAsync::new(temp_file.as_file().try_clone().unwrap(), false, false, true) + .unwrap(); + + let mut async_io = disk.new_async_io(8).unwrap(); + + // Prepare write data for two regions. + let write_a = vec![0xAA; 4096]; + let write_b = vec![0xBB; 4096]; + let offset_a: u64 = 0; + let offset_b: u64 = 65536; + + let iov_a = libc::iovec { + iov_base: write_a.as_ptr() as *mut libc::c_void, + iov_len: write_a.len(), + }; + let iov_b = libc::iovec { + iov_base: write_b.as_ptr() as *mut libc::c_void, + iov_len: write_b.len(), + }; + + let batch = vec![ + BatchRequest { + offset: offset_a as libc::off_t, + iovecs: smallvec::smallvec![iov_a], + user_data: 10, + request_type: RequestType::Out, + }, + BatchRequest { + offset: offset_b as libc::off_t, + iovecs: smallvec::smallvec![iov_b], + user_data: 20, + request_type: RequestType::Out, + }, + ]; + + async_io.submit_batch_requests(&batch).unwrap(); + + let mut completions = [ + wait_for_completion(async_io.as_mut()), + wait_for_completion(async_io.as_mut()), + ]; + completions.sort_by_key(|c| c.0); + assert_eq!(completions[0], (10, 4096)); + assert_eq!(completions[1], (20, 4096)); + drop(async_io); + + // Batch read both regions back. 
+ let mut read_a = vec![0u8; 4096]; + let mut read_b = vec![0u8; 4096]; + let riov_a = libc::iovec { + iov_base: read_a.as_mut_ptr() as *mut libc::c_void, + iov_len: read_a.len(), + }; + let riov_b = libc::iovec { + iov_base: read_b.as_mut_ptr() as *mut libc::c_void, + iov_len: read_b.len(), + }; + + let mut async_io = disk.new_async_io(8).unwrap(); + let read_batch = vec![ + BatchRequest { + offset: offset_a as libc::off_t, + iovecs: smallvec::smallvec![riov_a], + user_data: 30, + request_type: RequestType::In, + }, + BatchRequest { + offset: offset_b as libc::off_t, + iovecs: smallvec::smallvec![riov_b], + user_data: 40, + request_type: RequestType::In, + }, + ]; + + async_io.submit_batch_requests(&read_batch).unwrap(); + + let mut completions = [ + wait_for_completion(async_io.as_mut()), + wait_for_completion(async_io.as_mut()), + ]; + completions.sort_by_key(|c| c.0); + assert_eq!(completions[0], (30, 4096)); + assert_eq!(completions[1], (40, 4096)); + + assert_eq!(read_a, write_a, "batch read A should match written data"); + assert_eq!(read_b, write_b, "batch read B should match written data"); + } + + #[test] + fn test_qcow_async_read_unallocated() { + let file_size = 100 * 1024 * 1024; + let temp_file = TempFile::new().unwrap(); + { + let raw_file = RawFile::new(temp_file.as_file().try_clone().unwrap(), false); + QcowFile::new(raw_file, 3, file_size, true).unwrap(); + } + let disk = QcowDiskAsync::new(temp_file.as_file().try_clone().unwrap(), false, false, true) + .unwrap(); + + let buf = async_read(&disk, 0, 128 * 1024); + assert!( + buf.iter().all(|&b| b == 0), + "unallocated region should read as zeroes" + ); + } + + #[test] + fn test_qcow_async_sub_cluster_write() { + let cluster_size = 65536usize; + let file_size = 100 * 1024 * 1024; + let temp_file = TempFile::new().unwrap(); + { + let raw_file = RawFile::new(temp_file.as_file().try_clone().unwrap(), false); + QcowFile::new(raw_file, 3, file_size, true).unwrap(); + } + let disk = 
QcowDiskAsync::new(temp_file.as_file().try_clone().unwrap(), false, false, true) + .unwrap(); + + // Write 4K into the middle of a cluster. + let write_offset = 4096u64; + let write_len = 4096; + let pattern = vec![0xCC; write_len]; + async_write(&disk, write_offset, &pattern); + + // Read the entire cluster back. + let buf = async_read(&disk, 0, cluster_size); + + assert!( + buf[..write_offset as usize].iter().all(|&b| b == 0), + "bytes before the write should be zero" + ); + assert_eq!( + &buf[write_offset as usize..write_offset as usize + write_len], + &pattern[..], + "written region should match" + ); + assert!( + buf[write_offset as usize + write_len..] + .iter() + .all(|&b| b == 0), + "bytes after the write should be zero" + ); + } + + #[test] + fn test_qcow_async_write_after_punch_hole() { + let data = vec![0xAA; 64 * 1024]; + let offset = 0u64; + let (_temp, disk) = create_disk_with_data(100 * 1024 * 1024, &data, offset, true); + + let buf = async_read(&disk, offset, data.len()); + assert!(buf.iter().all(|&b| b == 0xAA)); + + let mut async_io = disk.new_async_io(1).unwrap(); + async_io.punch_hole(offset, data.len() as u64, 10).unwrap(); + let (_, result) = wait_for_completion(async_io.as_mut()); + assert_eq!(result, 0); + drop(async_io); + + let buf = async_read(&disk, offset, data.len()); + assert!( + buf.iter().all(|&b| b == 0), + "should be zero after punch hole" + ); + + let new_data = vec![0xBB; 64 * 1024]; + async_write(&disk, offset, &new_data); + + let buf = async_read(&disk, offset, new_data.len()); + assert_eq!(buf, new_data, "should read new data after rewrite"); + } + + #[test] + fn test_qcow_async_large_sequential_io() { + let cluster_size = 64 * 1024; + let num_clusters = 8; + let total_len = cluster_size * num_clusters; + let offset = 0u64; + + let mut data = vec![0u8; total_len]; + for (i, chunk) in data.chunks_mut(cluster_size).enumerate() { + chunk.fill((i + 1) as u8); + } + + let (_temp, disk) = create_disk_with_data(100 * 1024 * 1024, 
&data, offset, true); + + let buf = async_read(&disk, offset, total_len); + assert_eq!(buf.len(), total_len); + for (i, chunk) in buf.chunks(cluster_size).enumerate() { + assert!( + chunk.iter().all(|&b| b == (i + 1) as u8), + "cluster {i} mismatch" + ); + } + } + + #[test] + fn test_qcow_async_alignment_without_direct_io() { + let file_size = 100 * 1024 * 1024; + let temp_file = TempFile::new().unwrap(); + { + let raw_file = RawFile::new(temp_file.as_file().try_clone().unwrap(), false); + QcowFile::new(raw_file, 3, file_size, true).unwrap(); + } + let disk = QcowDiskAsync::new(temp_file.as_file().try_clone().unwrap(), false, false, true) + .unwrap(); + let async_io = disk.new_async_io(1).unwrap(); + assert_eq!(async_io.alignment(), SECTOR_SIZE); + } + + /// Returns None if O_DIRECT is not supported (e.g. tmpfs). + fn try_create_direct_io_disk(temp_file: &TempFile, file_size: u64) -> Option<QcowDiskAsync> { + { + let raw_file = RawFile::new(temp_file.as_file().try_clone().unwrap(), false); + QcowFile::new(raw_file, 3, file_size, true).unwrap(); + } + QcowDiskAsync::new(temp_file.as_file().try_clone().unwrap(), true, false, true).ok() + } + + #[test] + fn test_qcow_async_alignment_with_direct_io() { + let temp_file = TempFile::new().unwrap(); + let disk = match try_create_direct_io_disk(&temp_file, 100 * 1024 * 1024) { + Some(d) => d, + None => { + eprintln!("skipping: O_DIRECT not supported on this filesystem"); + return; + } + }; + let async_io = disk.new_async_io(1).unwrap(); + assert!(async_io.alignment() >= SECTOR_SIZE); + } + + #[test] + fn test_qcow_async_sub_sector_read_with_direct_io() { + let temp_file = TempFile::new().unwrap(); + let disk = match try_create_direct_io_disk(&temp_file, 100 * 1024 * 1024) { + Some(d) => d, + None => { + eprintln!("skipping: O_DIRECT not supported on this filesystem"); + return; + } + }; + + let pattern = vec![0xAB; 65536]; + async_write(&disk, 0, &pattern); + + let buf = async_read(&disk, 0, 512); + assert!( + buf.iter().all(|&b| b ==
0xAB), + "sub-sector O_DIRECT read should return written data" + ); + } + + #[test] + fn test_qcow_async_direct_io_write_read_roundtrip() { + let temp_file = TempFile::new().unwrap(); + let disk = match try_create_direct_io_disk(&temp_file, 100 * 1024 * 1024) { + Some(d) => d, + None => { + eprintln!("skipping: O_DIRECT not supported on this filesystem"); + return; + } + }; + + let pattern: Vec<u8> = (0..128 * 1024).map(|i| (i % 251) as u8).collect(); + async_write(&disk, 0, &pattern); + + let buf = async_read(&disk, 0, pattern.len()); + assert_eq!(buf, pattern, "O_DIRECT roundtrip should match"); + } +} diff --git a/block/src/qcow_common.rs b/block/src/qcow_common.rs new file mode 100644 index 0000000000..08a1a9ca3e --- /dev/null +++ b/block/src/qcow_common.rs @@ -0,0 +1,264 @@ +// Copyright © 2021 Intel Corporation +// +// Copyright 2026 The Cloud Hypervisor Authors. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +//! Shared helpers for QCOW2 sync and async backends. +//! +//! Position-independent I/O (`pread_exact`, `pwrite_all`) and iovec +//! scatter/gather helpers used by both `qcow_sync` and `qcow_async`. + +use std::alloc::{Layout, alloc_zeroed, dealloc}; +use std::cmp::min; +use std::os::fd::RawFd; +use std::{io, ptr, slice}; + +// -- Position independent I/O helpers -- +// +// Duplicated file descriptors share the kernel file description and thus the +// file position. Using seek then read from multiple queues races on that +// shared position. pread64 and pwrite64 are atomic and never touch the position. + +/// Read exactly the requested bytes at offset, looping on short reads. +pub fn pread_exact(fd: RawFd, buf: &mut [u8], offset: u64) -> io::Result<()> { + let mut total = 0usize; + while total < buf.len() { + // SAFETY: buf and fd are valid for the lifetime of the call.
+ let ret = unsafe { + libc::pread64( + fd, + buf[total..].as_mut_ptr() as *mut libc::c_void, + buf.len() - total, + (offset + total as u64) as libc::off_t, + ) + }; + if ret < 0 { + return Err(io::Error::last_os_error()); + } + if ret == 0 { + return Err(io::Error::from(io::ErrorKind::UnexpectedEof)); + } + total += ret as usize; + } + Ok(()) +} + +/// Write all bytes to fd at offset, looping on short writes. +pub fn pwrite_all(fd: RawFd, buf: &[u8], offset: u64) -> io::Result<()> { + let mut total = 0usize; + while total < buf.len() { + // SAFETY: buf and fd are valid for the lifetime of the call. + let ret = unsafe { + libc::pwrite64( + fd, + buf[total..].as_ptr() as *const libc::c_void, + buf.len() - total, + (offset + total as u64) as libc::off_t, + ) + }; + if ret < 0 { + return Err(io::Error::last_os_error()); + } + if ret == 0 { + return Err(io::Error::other("pwrite64 wrote 0 bytes")); + } + total += ret as usize; + } + Ok(()) +} + +/// RAII wrapper for an aligned heap buffer required by O_DIRECT. +pub struct AlignedBuf { + ptr: *mut u8, + layout: Layout, +} + +impl AlignedBuf { + pub fn new(size: usize, alignment: usize) -> io::Result<AlignedBuf> { + let size = size.max(1).next_multiple_of(alignment); + let layout = Layout::from_size_align(size, alignment) + .map_err(|e| io::Error::other(format!("invalid aligned layout: {e}")))?; + // SAFETY: layout has non-zero size. + let ptr = unsafe { alloc_zeroed(layout) }; + if ptr.is_null() { + return Err(io::Error::new( + io::ErrorKind::OutOfMemory, + "aligned allocation failed", + )); + } + Ok(AlignedBuf { ptr, layout }) + } + + pub fn as_mut_slice(&mut self, len: usize) -> &mut [u8] { + let len = len.min(self.layout.size()); + // SAFETY: ptr is valid for layout.size() bytes; len <= layout.size(). + unsafe { slice::from_raw_parts_mut(self.ptr, len) } + } + + pub fn as_slice(&self, len: usize) -> &[u8] { + let len = len.min(self.layout.size()); + // SAFETY: ptr is valid for layout.size() bytes; len <= layout.size().
+ unsafe { slice::from_raw_parts(self.ptr, len) } + } + + #[cfg(test)] + pub fn layout(&self) -> &Layout { + &self.layout + } + + #[cfg(test)] + pub fn ptr(&self) -> *const u8 { + self.ptr + } +} + +impl Drop for AlignedBuf { + fn drop(&mut self) { + // SAFETY: ptr was allocated by alloc_zeroed with self.layout. + unsafe { dealloc(self.ptr, self.layout) }; + } +} + +/// Read into `buf` via an aligned bounce buffer when O_DIRECT requires it. +pub fn aligned_pread(fd: RawFd, buf: &mut [u8], offset: u64, alignment: usize) -> io::Result<()> { + if alignment == 0 + || ((buf.as_ptr() as usize).is_multiple_of(alignment) + && buf.len().is_multiple_of(alignment) + && (offset as usize).is_multiple_of(alignment)) + { + return pread_exact(fd, buf, offset); + } + + let aligned_offset = offset & !(alignment as u64 - 1); + let head = (offset - aligned_offset) as usize; + let aligned_len = (head + buf.len()).next_multiple_of(alignment); + let mut bounce = AlignedBuf::new(aligned_len, alignment)?; + pread_exact(fd, bounce.as_mut_slice(aligned_len), aligned_offset)?; + buf.copy_from_slice(&bounce.as_slice(aligned_len)[head..head + buf.len()]); + Ok(()) +} + +/// Write `buf` via an aligned bounce buffer when O_DIRECT requires it. +pub fn aligned_pwrite(fd: RawFd, buf: &[u8], offset: u64, alignment: usize) -> io::Result<()> { + if alignment == 0 + || ((buf.as_ptr() as usize).is_multiple_of(alignment) + && buf.len().is_multiple_of(alignment) + && (offset as usize).is_multiple_of(alignment)) + { + return pwrite_all(fd, buf, offset); + } + + let aligned_offset = offset & !(alignment as u64 - 1); + let head = (offset - aligned_offset) as usize; + let aligned_len = (head + buf.len()).next_multiple_of(alignment); + let mut bounce = AlignedBuf::new(aligned_len, alignment)?; + + // Read-modify-write: read the existing aligned region, overlay our data. 
+ pread_exact(fd, bounce.as_mut_slice(aligned_len), aligned_offset)?; + bounce.as_mut_slice(aligned_len)[head..head + buf.len()].copy_from_slice(buf); + pwrite_all(fd, bounce.as_slice(aligned_len), aligned_offset) +} + +// -- iovec helper functions -- +// +// Operate on the iovec array as a flat byte stream. + +/// Copy data into iovecs starting at the given byte offset. +/// +/// # Safety +/// Caller must ensure iovecs point to valid, writable memory of sufficient size. +pub unsafe fn scatter_to_iovecs(iovecs: &[libc::iovec], start: usize, data: &[u8]) { + let mut remaining = data; + let mut pos = 0usize; + for iov in iovecs { + let iov_end = pos + iov.iov_len; + if iov_end <= start || remaining.is_empty() { + pos = iov_end; + continue; + } + let iov_start = start.saturating_sub(pos); + let available = iov.iov_len - iov_start; + let count = min(available, remaining.len()); + // SAFETY: iov_base is valid for iov_len bytes per caller contract. + unsafe { + let dst = (iov.iov_base as *mut u8).add(iov_start); + ptr::copy_nonoverlapping(remaining.as_ptr(), dst, count); + } + remaining = &remaining[count..]; + if remaining.is_empty() { + break; + } + pos = iov_end; + } +} + +/// Zero fill iovecs starting at the given byte offset for the given length. +/// +/// # Safety +/// Caller must ensure iovecs point to valid, writable memory of sufficient size. +pub unsafe fn zero_fill_iovecs(iovecs: &[libc::iovec], start: usize, len: usize) { + let mut remaining = len; + let mut pos = 0usize; + for iov in iovecs { + let iov_end = pos + iov.iov_len; + if iov_end <= start || remaining == 0 { + pos = iov_end; + continue; + } + let iov_start = start.saturating_sub(pos); + let available = iov.iov_len - iov_start; + let count = min(available, remaining); + // SAFETY: iov_base is valid for iov_len bytes per caller contract. 
+ unsafe { + let dst = (iov.iov_base as *mut u8).add(iov_start); + ptr::write_bytes(dst, 0, count); + } + remaining -= count; + if remaining == 0 { + break; + } + pos = iov_end; + } +} + +/// Gather bytes from iovecs starting at the given byte offset into `dst`. +/// +/// # Safety +/// Caller must ensure iovecs point to valid, readable memory of sufficient size. +pub unsafe fn gather_from_iovecs_into(iovecs: &[libc::iovec], start: usize, dst: &mut [u8]) { + let len = dst.len(); + let mut written = 0usize; + let mut pos = 0usize; + for iov in iovecs { + let iov_end = pos + iov.iov_len; + if iov_end <= start || written == len { + pos = iov_end; + continue; + } + let iov_start = start.saturating_sub(pos); + let available = iov.iov_len - iov_start; + let count = min(available, len - written); + // SAFETY: iov_base is valid for iov_len bytes per caller contract. + unsafe { + let src = (iov.iov_base as *const u8).add(iov_start); + ptr::copy_nonoverlapping(src, dst.as_mut_ptr().add(written), count); + } + written += count; + if written == len { + break; + } + pos = iov_end; + } +} + +/// Gather bytes from iovecs starting at the given byte offset into a Vec. +/// +/// # Safety +/// Caller must ensure iovecs point to valid, readable memory of sufficient size. +pub unsafe fn gather_from_iovecs(iovecs: &[libc::iovec], start: usize, len: usize) -> Vec<u8> { + let mut result = vec![0u8; len]; + // SAFETY: caller guarantees iovecs are valid; result has len bytes.
+ unsafe { gather_from_iovecs_into(iovecs, start, &mut result) }; + result +} diff --git a/block/src/qcow_sync.rs b/block/src/qcow_sync.rs index 2707f5dfba..44b8efaf5a 100644 --- a/block/src/qcow_sync.rs +++ b/block/src/qcow_sync.rs @@ -2,82 +2,101 @@ // // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause +use std::cmp::min; use std::collections::VecDeque; use std::fs::File; -use std::io::{self, Seek, SeekFrom}; -use std::os::fd::AsRawFd; -use std::sync::{Arc, Mutex}; +use std::os::fd::{AsFd, AsRawFd}; +use std::sync::Arc; +use std::{fmt, io}; use vmm_sys_util::eventfd::EventFd; -use vmm_sys_util::write_zeroes::PunchHole; - -use crate::async_io::{ - AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFile, DiskFileError, DiskFileResult, +use vmm_sys_util::write_zeroes::{PunchHole, WriteZeroesAt}; + +use crate::async_io::{AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFileError}; +use crate::disk_file; +use crate::error::{BlockError, BlockErrorKind, BlockResult, ErrorOp}; +use crate::qcow::backing::shared_backing_from; +use crate::qcow::metadata::{ + BackingRead, ClusterReadMapping, ClusterWriteMapping, DeallocAction, QcowMetadata, +}; +use crate::qcow::qcow_raw_file::QcowRawFile; +use crate::qcow::{MAX_NESTING_DEPTH, RawFile, parse_qcow}; +use crate::qcow_common::{ + AlignedBuf, aligned_pread, aligned_pwrite, gather_from_iovecs, gather_from_iovecs_into, + pread_exact, pwrite_all, scatter_to_iovecs, zero_fill_iovecs, }; -use crate::qcow::{Error as QcowError, MAX_NESTING_DEPTH, QcowFile, RawFile, Result as QcowResult}; -use crate::{AsyncAdaptor, BlockBackend}; pub struct QcowDiskSync { - // FIXME: The Mutex serializes all QCOW2 I/O operations across queues, which - // is necessary for correctness but eliminates any parallelism benefit from - // multiqueue. QcowFile has internal mutable state (L2 cache, refcounts, file - // position) that is not safe to share across threads via Clone. 
- // - // A proper fix would require restructuring QcowFile to separate metadata - // operations (which need synchronization) from data I/O (which could be - // parallelized with per queue file descriptors). See #7560 for details. - qcow_file: Arc>, + metadata: Arc, + /// Shared across queues, resolved once at construction. + backing_file: Option>, + sparse: bool, + data_raw_file: QcowRawFile, +} + +impl fmt::Debug for QcowDiskSync { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("QcowDiskSync") + .field("sparse", &self.sparse) + .field("has_backing", &self.backing_file.is_some()) + .finish_non_exhaustive() + } } impl QcowDiskSync { - pub fn new(file: File, direct_io: bool, backing_files: bool, sparse: bool) -> QcowResult { + pub fn new( + file: File, + direct_io: bool, + backing_files: bool, + sparse: bool, + ) -> BlockResult { let max_nesting_depth = if backing_files { MAX_NESTING_DEPTH } else { 0 }; - let qcow_file = QcowFile::from_with_nesting_depth( - RawFile::new(file, direct_io), - max_nesting_depth, - sparse, - ) - .map_err(|e| match e { - QcowError::MaxNestingDepthExceeded if !backing_files => QcowError::BackingFilesDisabled, - other => other, - })?; + let (inner, backing_file, sparse) = + parse_qcow(RawFile::new(file, direct_io), max_nesting_depth, sparse).map_err(|e| { + let e = if !backing_files && matches!(e.kind(), BlockErrorKind::Overflow) { + e.with_kind(BlockErrorKind::UnsupportedFeature) + } else { + e + }; + e.with_op(ErrorOp::Open) + })?; + let data_raw_file = inner.raw_file.clone(); Ok(QcowDiskSync { - qcow_file: Arc::new(Mutex::new(qcow_file)), + metadata: Arc::new(QcowMetadata::new(inner)), + backing_file: backing_file.map(shared_backing_from).transpose()?, + sparse, + data_raw_file, }) } } -impl DiskFile for QcowDiskSync { - fn logical_size(&mut self) -> DiskFileResult { - self.qcow_file - .lock() - .unwrap() - .seek(SeekFrom::End(0)) - .map_err(DiskFileError::Size) +impl Drop for QcowDiskSync { + fn drop(&mut 
self) { + self.metadata.shutdown(); } +} - fn physical_size(&mut self) -> DiskFileResult<u64> { - self.qcow_file.lock().unwrap().physical_size().map_err(|e| { - let io_inner = match e { - crate::Error::GetFileMetadata(e) => e, - _ => unreachable!(), - }; - DiskFileError::Size(io_inner) - }) +impl disk_file::DiskSize for QcowDiskSync { + fn logical_size(&self) -> BlockResult<u64> { + Ok(self.metadata.virtual_size()) + } +} - fn new_async_io(&self, _ring_depth: u32) -> DiskFileResult<Box<dyn AsyncIo>> { - Ok(Box::new(QcowSync::new(Arc::clone(&self.qcow_file))) as Box<dyn AsyncIo>) +impl disk_file::PhysicalSize for QcowDiskSync { + fn physical_size(&self) -> BlockResult<u64> { + Ok(self.data_raw_file.physical_size()?) } +} - fn resize(&mut self, size: u64) -> DiskFileResult<()> { - self.qcow_file - .lock() - .unwrap() - .resize(size) - .map_err(|e| DiskFileError::ResizeError(io::Error::other(e))) +impl disk_file::DiskFd for QcowDiskSync { + fn fd(&self) -> BorrowedDiskFd<'_> { + BorrowedDiskFd::new(self.data_raw_file.as_fd().as_raw_fd()) } +} +impl disk_file::Geometry for QcowDiskSync {} +impl disk_file::SparseCapable for QcowDiskSync { fn supports_sparse_operations(&self) -> bool { true } @@ -85,22 +104,76 @@ impl DiskFile for QcowDiskSync { fn supports_zero_flag(&self) -> bool { true } +} + +impl disk_file::Resizable for QcowDiskSync { + fn resize(&mut self, size: u64) -> BlockResult<()> { + if self.backing_file.is_some() { + return Err(BlockError::new( + BlockErrorKind::UnsupportedFeature, + DiskFileError::ResizeError(io::Error::other( + "resize not supported with backing file", + )), + ) + .with_op(ErrorOp::Resize)); + } + self.metadata.resize(size).map_err(|e| { + BlockError::new(BlockErrorKind::Io, DiskFileError::ResizeError(e)) + .with_op(ErrorOp::Resize) + }) + } +} + +impl disk_file::DiskFile for QcowDiskSync {} + +impl disk_file::AsyncDiskFile for QcowDiskSync { + fn try_clone(&self) -> BlockResult<Box<dyn AsyncDiskFile>> { + Ok(Box::new(QcowDiskSync { + metadata: Arc::clone(&self.metadata), + backing_file:
self.backing_file.as_ref().map(Arc::clone), + sparse: self.sparse, + data_raw_file: self.data_raw_file.clone(), + })) + } - fn fd(&mut self) -> BorrowedDiskFd<'_> { - BorrowedDiskFd::new(self.qcow_file.lock().unwrap().as_raw_fd()) + // ring_depth is unused - this sync backend performs blocking I/O + // instead of submitting to an async ring. + fn new_async_io(&self, _ring_depth: u32) -> BlockResult> { + Ok(Box::new(QcowSync::new( + Arc::clone(&self.metadata), + self.data_raw_file.clone(), + self.backing_file.as_ref().map(Arc::clone), + self.sparse, + ))) } } pub struct QcowSync { - qcow_file: Arc>, + metadata: Arc, + data_file: QcowRawFile, + /// See the backing_file field on QcowDiskSync. + backing_file: Option>, + sparse: bool, + /// O_DIRECT alignment requirement (0 = no alignment needed). + alignment: usize, eventfd: EventFd, completion_list: VecDeque<(u64, i32)>, } impl QcowSync { - pub fn new(qcow_file: Arc>) -> Self { + fn new( + metadata: Arc, + data_file: QcowRawFile, + backing_file: Option>, + sparse: bool, + ) -> Self { + let alignment = data_file.file().alignment(); QcowSync { - qcow_file, + metadata, + data_file, + backing_file, + sparse, + alignment, eventfd: EventFd::new(libc::EFD_NONBLOCK) .expect("Failed creating EventFd for QcowSync"), completion_list: VecDeque::new(), @@ -108,8 +181,6 @@ impl QcowSync { } } -impl AsyncAdaptor for QcowFile {} - impl AsyncIo for QcowSync { fn notifier(&self) -> &EventFd { &self.eventfd @@ -121,13 +192,78 @@ impl AsyncIo for QcowSync { iovecs: &[libc::iovec], user_data: u64, ) -> AsyncIoResult<()> { - self.qcow_file.lock().unwrap().read_vectored_sync( - offset, - iovecs, - user_data, - &self.eventfd, - &mut self.completion_list, - ) + let address = offset as u64; + let total_len: usize = iovecs.iter().map(|v| v.iov_len).sum(); + + let has_backing = self.backing_file.is_some(); + let mappings = self + .metadata + .map_clusters_for_read(address, total_len, has_backing) + .map_err(AsyncIoError::ReadVectored)?; + + let 
mut buf_offset = 0usize; + for mapping in mappings { + match mapping { + ClusterReadMapping::Zero { length } => { + // SAFETY: iovecs point to valid guest memory buffers + unsafe { zero_fill_iovecs(iovecs, buf_offset, length as usize) }; + buf_offset += length as usize; + } + ClusterReadMapping::Allocated { + offset: host_offset, + length, + } => { + let len = length as usize; + if self.alignment > 0 { + // O_DIRECT, aligned buffer avoids bounce copy. + let mut abuf = AlignedBuf::new(len, self.alignment) + .map_err(AsyncIoError::ReadVectored)?; + aligned_pread( + self.data_file.as_raw_fd(), + abuf.as_mut_slice(len), + host_offset, + self.alignment, + ) + .map_err(AsyncIoError::ReadVectored)?; + // SAFETY: iovecs point to valid guest memory buffers + unsafe { scatter_to_iovecs(iovecs, buf_offset, abuf.as_slice(len)) }; + } else { + // No O_DIRECT, plain buffer is fine. + let mut buf = vec![0u8; len]; + pread_exact(self.data_file.as_raw_fd(), &mut buf, host_offset) + .map_err(AsyncIoError::ReadVectored)?; + // SAFETY: iovecs point to valid guest memory buffers + unsafe { scatter_to_iovecs(iovecs, buf_offset, &buf) }; + } + buf_offset += len; + } + ClusterReadMapping::Compressed { data } => { + let len = data.len(); + // SAFETY: iovecs point to valid guest memory buffers + unsafe { scatter_to_iovecs(iovecs, buf_offset, &data) }; + buf_offset += len; + } + ClusterReadMapping::Backing { + offset: backing_offset, + length, + } => { + let mut buf = vec![0u8; length as usize]; + self.backing_file + .as_ref() + .unwrap() + .read_at(backing_offset, &mut buf) + .map_err(AsyncIoError::ReadVectored)?; + // SAFETY: iovecs point to valid guest memory buffers + unsafe { scatter_to_iovecs(iovecs, buf_offset, &buf) }; + buf_offset += length as usize; + } + } + } + + self.completion_list + .push_back((user_data, total_len as i32)); + self.eventfd.write(1).unwrap(); + Ok(()) } fn write_vectored( @@ -136,21 +272,83 @@ impl AsyncIo for QcowSync { iovecs: &[libc::iovec], user_data: u64, 
) -> AsyncIoResult<()> { - self.qcow_file.lock().unwrap().write_vectored_sync( - offset, - iovecs, - user_data, - &self.eventfd, - &mut self.completion_list, - ) + let address = offset as u64; + let total_len: usize = iovecs.iter().map(|v| v.iov_len).sum(); + let mut buf_offset = 0usize; + + while buf_offset < total_len { + let curr_addr = address + buf_offset as u64; + let cluster_size = self.metadata.cluster_size(); + let intra_offset = self.metadata.cluster_offset(curr_addr); + let remaining_in_cluster = (cluster_size - intra_offset) as usize; + let count = min(total_len - buf_offset, remaining_in_cluster); + + // Read backing data for COW if this is a partial cluster + // write to an unallocated cluster with a backing file. + let backing_data = if let Some(backing) = self + .backing_file + .as_ref() + .filter(|_| intra_offset != 0 || count < cluster_size as usize) + { + let cluster_begin = curr_addr - intra_offset; + let mut data = vec![0u8; cluster_size as usize]; + backing + .read_at(cluster_begin, &mut data) + .map_err(AsyncIoError::WriteVectored)?; + Some(data) + } else { + None + }; + + let mapping = self + .metadata + .map_cluster_for_write(curr_addr, backing_data) + .map_err(AsyncIoError::WriteVectored)?; + + match mapping { + ClusterWriteMapping::Allocated { + offset: host_offset, + } => { + if self.alignment > 0 { + // O_DIRECT, gather directly into aligned buffer. + let mut abuf = AlignedBuf::new(count, self.alignment) + .map_err(AsyncIoError::WriteVectored)?; + // SAFETY: iovecs point to valid guest memory buffers + unsafe { + gather_from_iovecs_into(iovecs, buf_offset, abuf.as_mut_slice(count)); + } + aligned_pwrite( + self.data_file.as_raw_fd(), + abuf.as_slice(count), + host_offset, + self.alignment, + ) + .map_err(AsyncIoError::WriteVectored)?; + } else { + // No O_DIRECT, plain buffer is fine. 
+ // SAFETY: iovecs point to valid guest memory buffers + let buf = unsafe { gather_from_iovecs(iovecs, buf_offset, count) }; + pwrite_all(self.data_file.as_raw_fd(), &buf, host_offset) + .map_err(AsyncIoError::WriteVectored)?; + } + } + } + buf_offset += count; + } + + self.completion_list + .push_back((user_data, total_len as i32)); + self.eventfd.write(1).unwrap(); + Ok(()) } fn fsync(&mut self, user_data: Option) -> AsyncIoResult<()> { - self.qcow_file.lock().unwrap().fsync_sync( - user_data, - &self.eventfd, - &mut self.completion_list, - ) + self.metadata.flush().map_err(AsyncIoError::Fsync)?; + if let Some(user_data) = user_data { + self.completion_list.push_back((user_data, 0)); + self.eventfd.write(1).unwrap(); + } + Ok(()) } fn next_completed_request(&mut self) -> Option<(u64, i32)> { @@ -158,26 +356,49 @@ impl AsyncIo for QcowSync { } fn punch_hole(&mut self, offset: u64, length: u64, user_data: u64) -> AsyncIoResult<()> { - // For QCOW2, punch_hole calls deallocate_cluster + let virtual_size = self.metadata.virtual_size(); + let cluster_size = self.metadata.cluster_size(); + let result = self - .qcow_file - .lock() - .unwrap() - .punch_hole(offset, length) - .map(|_| 0i32) + .metadata + .deallocate_bytes( + offset, + length as usize, + self.sparse, + virtual_size, + cluster_size, + self.backing_file.as_deref(), + ) .map_err(AsyncIoError::PunchHole); match result { - Ok(res) => { - self.completion_list.push_back((user_data, res)); + Ok(actions) => { + for action in actions { + match action { + DeallocAction::PunchHole { + host_offset, + length, + } => { + let _ = self.data_file.file_mut().punch_hole(host_offset, length); + } + DeallocAction::WriteZeroes { + host_offset, + length, + } => { + let _ = self + .data_file + .file_mut() + .write_zeroes_at(host_offset, length); + } + } + } + self.completion_list.push_back((user_data, 0)); self.eventfd.write(1).unwrap(); Ok(()) } Err(e) => { - // CRITICAL: Always signal completion even on error to avoid hangs - 
let errno = if let AsyncIoError::PunchHole(io_err) = &e { - let err = io_err.raw_os_error().unwrap_or(libc::EIO); - -err + let errno = if let AsyncIoError::PunchHole(ref io_err) = e { + -io_err.raw_os_error().unwrap_or(libc::EIO) } else { -libc::EIO }; @@ -189,85 +410,93 @@ impl AsyncIo for QcowSync { } fn write_zeroes(&mut self, offset: u64, length: u64, user_data: u64) -> AsyncIoResult<()> { - // For QCOW2, write_zeroes is implemented by deallocating clusters via punch_hole. - // This is more efficient than writing actual zeros and reduces disk usage. + // For QCOW2 write_zeroes uses cluster deallocation, same as punch_hole. // Unallocated clusters inherently read as zero in the QCOW2 format. - let result = self - .qcow_file - .lock() - .unwrap() - .punch_hole(offset, length) - .map(|_| 0i32) - .map_err(AsyncIoError::WriteZeroes); - - match result { - Ok(res) => { - self.completion_list.push_back((user_data, res)); - self.eventfd.write(1).unwrap(); - Ok(()) - } - Err(e) => { - // Always signal completion even on error to avoid hangs - let errno = if let AsyncIoError::WriteZeroes(io_err) = &e { - let err = io_err.raw_os_error().unwrap_or(libc::EIO); - -err - } else { - -libc::EIO - }; - self.completion_list.push_back((user_data, errno)); - self.eventfd.write(1).unwrap(); - Ok(()) - } - } + self.punch_hole(offset, length, user_data) } } #[cfg(test)] mod unit_tests { - use std::io::{Read, Seek, SeekFrom, Write}; + use std::io::{Seek, SeekFrom, Write}; + use std::os::fd::RawFd; + use std::thread; use vmm_sys_util::tempfile::TempFile; use super::*; - use crate::qcow::{QcowFile, QcowHeader, RawFile}; + use crate::disk_file::{AsyncDiskFile, DiskSize, Resizable}; + use crate::qcow::{BackingFileConfig, ImageType, QcowFile, RawFile}; + + fn create_disk_with_data( + file_size: u64, + data: &[u8], + offset: u64, + sparse: bool, + direct_io: bool, + ) -> (TempFile, QcowDiskSync) { + let temp_file = TempFile::new().unwrap(); + { + let raw_file = 
RawFile::new(temp_file.as_file().try_clone().unwrap(), false); + let mut qcow_file = QcowFile::new(raw_file, 3, file_size, sparse).unwrap(); + qcow_file.seek(SeekFrom::Start(offset)).unwrap(); + qcow_file.write_all(data).unwrap(); + qcow_file.flush().unwrap(); + } + let disk = QcowDiskSync::new( + temp_file.as_file().try_clone().unwrap(), + direct_io, + false, + sparse, + ) + .unwrap(); + (temp_file, disk) + } + + fn async_read(disk: &QcowDiskSync, offset: u64, len: usize) -> Vec { + let mut async_io = disk.new_async_io(1).unwrap(); + let mut buf = vec![0xFFu8; len]; + let iovec = libc::iovec { + iov_base: buf.as_mut_ptr() as *mut libc::c_void, + iov_len: buf.len(), + }; + async_io + .read_vectored(offset as libc::off_t, &[iovec], 1) + .unwrap(); + let (user_data, result) = async_io.next_completed_request().unwrap(); + assert_eq!(user_data, 1); + assert_eq!(result as usize, len, "read should return requested length"); + buf + } + + fn async_write(disk: &QcowDiskSync, offset: u64, data: &[u8]) { + let mut async_io = disk.new_async_io(1).unwrap(); + let iovec = libc::iovec { + iov_base: data.as_ptr() as *mut libc::c_void, + iov_len: data.len(), + }; + async_io + .write_vectored(offset as libc::off_t, &[iovec], 1) + .unwrap(); + let (user_data, result) = async_io.next_completed_request().unwrap(); + assert_eq!(user_data, 1); + assert_eq!(result as usize, data.len()); + } #[test] fn test_qcow_async_punch_hole_completion() { - // Create a QCOW2 image with valid header - let temp_file = TempFile::new().unwrap(); - let raw_file = RawFile::new(temp_file.into_file(), false); - let file_size = 1024 * 1024 * 100; // 100MB - let mut qcow_file = QcowFile::new(raw_file, 3, file_size, true).unwrap(); - - // Write some data - let data = vec![0xDD; 128 * 1024]; // 128KB - let offset = 0; - qcow_file.seek(SeekFrom::Start(offset)).unwrap(); - qcow_file.write_all(&data).unwrap(); - qcow_file.flush().unwrap(); - - // Create async wrapper - let qcow_file = 
Arc::new(Mutex::new(qcow_file)); - let mut async_qcow = QcowSync::new(qcow_file.clone()); - - // Punch hole - async_qcow - .punch_hole(offset, data.len() as u64, 100) - .unwrap(); + let data = vec![0xDD; 128 * 1024]; + let offset = 0u64; + let (_temp, disk) = create_disk_with_data(100 * 1024 * 1024, &data, offset, true, false); - // Verify completion event was generated - let (user_data, result) = async_qcow.next_completed_request().unwrap(); + let mut async_io = disk.new_async_io(1).unwrap(); + async_io.punch_hole(offset, data.len() as u64, 100).unwrap(); + let (user_data, result) = async_io.next_completed_request().unwrap(); assert_eq!(user_data, 100); assert_eq!(result, 0, "punch_hole should succeed"); + drop(async_io); - // Verify data reads as zeros - let mut read_buf = vec![0; data.len()]; - qcow_file - .lock() - .unwrap() - .seek(SeekFrom::Start(offset)) - .unwrap(); - qcow_file.lock().unwrap().read_exact(&mut read_buf).unwrap(); + let read_buf = async_read(&disk, offset, data.len()); assert!( read_buf.iter().all(|&b| b == 0), "Punched hole should read as zeros" @@ -276,41 +505,20 @@ mod unit_tests { #[test] fn test_qcow_async_write_zeroes_completion() { - // Create a QCOW2 image with valid header - let temp_file = TempFile::new().unwrap(); - let raw_file = RawFile::new(temp_file.into_file(), false); - let file_size = 1024 * 1024 * 100; // 100MB - let mut qcow_file = QcowFile::new(raw_file, 3, file_size, true).unwrap(); - - // Write some data - let data = vec![0xEE; 256 * 1024]; // 256KB - let offset = 64 * 1024; // Start at 64KB offset - qcow_file.seek(SeekFrom::Start(offset)).unwrap(); - qcow_file.write_all(&data).unwrap(); - qcow_file.flush().unwrap(); - - // Create async wrapper - let qcow_file = Arc::new(Mutex::new(qcow_file)); - let mut async_qcow = QcowSync::new(qcow_file.clone()); - - // Write zeros - async_qcow + let data = vec![0xEE; 256 * 1024]; + let offset = 64 * 1024u64; + let (_temp, disk) = create_disk_with_data(100 * 1024 * 1024, &data, 
offset, true, false); + + let mut async_io = disk.new_async_io(1).unwrap(); + async_io .write_zeroes(offset, data.len() as u64, 200) .unwrap(); - - // Verify completion event was generated - let (user_data, result) = async_qcow.next_completed_request().unwrap(); + let (user_data, result) = async_io.next_completed_request().unwrap(); assert_eq!(user_data, 200); assert_eq!(result, 0, "write_zeroes should succeed"); + drop(async_io); - // Verify data reads as zeros - let mut read_buf = vec![0; data.len()]; - qcow_file - .lock() - .unwrap() - .seek(SeekFrom::Start(offset)) - .unwrap(); - qcow_file.lock().unwrap().read_exact(&mut read_buf).unwrap(); + let read_buf = async_read(&disk, offset, data.len()); assert!( read_buf.iter().all(|&b| b == 0), "Zeroed region should read as zeros" @@ -319,186 +527,1389 @@ mod unit_tests { #[test] fn test_qcow_async_multiple_operations() { - // Create a QCOW2 image with valid header - let temp_file = TempFile::new().unwrap(); - let raw_file = RawFile::new(temp_file.into_file(), false); - let file_size = 1024 * 1024 * 100; // 100MB - let mut qcow_file = QcowFile::new(raw_file, 3, file_size, true).unwrap(); - - // Write data at multiple offsets - let data = vec![0xFF; 64 * 1024]; // 64KB chunks - for i in 0..4 { - let offset = i * 128 * 1024; // 128KB spacing - qcow_file.seek(SeekFrom::Start(offset)).unwrap(); - qcow_file.write_all(&data).unwrap(); + let data = vec![0xFF; 64 * 1024]; + let (_temp, _) = create_disk_with_data(100 * 1024 * 1024, &[], 0, true, false); + + // Write data at multiple offsets via QcowFile first, then punch + { + let temp_file = _temp.as_file().try_clone().unwrap(); + let raw_file = RawFile::new(temp_file, false); + let mut qcow_file = QcowFile::from(raw_file).unwrap(); + for i in 0..4u64 { + let off = i * 128 * 1024; + qcow_file.seek(SeekFrom::Start(off)).unwrap(); + qcow_file.write_all(&data).unwrap(); + } + qcow_file.flush().unwrap(); } - qcow_file.flush().unwrap(); - // Create async wrapper - let qcow_file = 
Arc::new(Mutex::new(qcow_file)); - let mut async_qcow = QcowSync::new(qcow_file.clone()); + let disk = + QcowDiskSync::new(_temp.as_file().try_clone().unwrap(), false, false, true).unwrap(); + + let mut async_io = disk.new_async_io(1).unwrap(); + + async_io.punch_hole(0, 64 * 1024, 1).unwrap(); + async_io.punch_hole(128 * 1024, 64 * 1024, 2).unwrap(); + async_io.punch_hole(256 * 1024, 64 * 1024, 3).unwrap(); + + let (ud, res) = async_io.next_completed_request().unwrap(); + assert_eq!(ud, 1); + assert_eq!(res, 0); + let (ud, res) = async_io.next_completed_request().unwrap(); + assert_eq!(ud, 2); + assert_eq!(res, 0); + let (ud, res) = async_io.next_completed_request().unwrap(); + assert_eq!(ud, 3); + assert_eq!(res, 0); + assert!(async_io.next_completed_request().is_none()); + } - // Queue multiple punch_hole operations - async_qcow.punch_hole(0, 64 * 1024, 1).unwrap(); - async_qcow.punch_hole(128 * 1024, 64 * 1024, 2).unwrap(); - async_qcow.punch_hole(256 * 1024, 64 * 1024, 3).unwrap(); + #[test] + fn test_qcow_punch_hole_then_read() { + // Verify that after punch_hole, a second async_io sees zeros. 
+ let data = vec![0xAB; 128 * 1024]; + let offset = 0u64; + let (_temp, disk) = create_disk_with_data(100 * 1024 * 1024, &data, offset, true, false); - // Verify all completions - let (user_data, result) = async_qcow.next_completed_request().unwrap(); - assert_eq!(user_data, 1); + let mut async_io1 = disk.new_async_io(1).unwrap(); + async_io1 + .punch_hole(offset, data.len() as u64, 100) + .unwrap(); + let (user_data, result) = async_io1.next_completed_request().unwrap(); + assert_eq!(user_data, 100); assert_eq!(result, 0); + drop(async_io1); - let (user_data, result) = async_qcow.next_completed_request().unwrap(); - assert_eq!(user_data, 2); - assert_eq!(result, 0); + // Read via second async_io, should see zeros + let read_buf = async_read(&disk, offset, data.len()); + assert!( + read_buf.iter().all(|&b| b == 0), + "After punch_hole, read should return zeros" + ); + } - let (user_data, result) = async_qcow.next_completed_request().unwrap(); - assert_eq!(user_data, 3); - assert_eq!(result, 0); + #[test] + fn test_qcow_disk_sync_punch_hole_with_new_async_io() { + // Simulates the real usage pattern of write data, punch hole, then read back. 
+ let data = vec![0xCD; 64 * 1024]; // one cluster + let offset = 1024 * 1024u64; // 1MB offset + let (_temp, disk) = create_disk_with_data(100 * 1024 * 1024, &data, offset, true, false); + + // Punch hole to simulate DISCARD + let mut async_io1 = disk.new_async_io(1).unwrap(); + async_io1.punch_hole(offset, data.len() as u64, 1).unwrap(); + let (user_data, result) = async_io1.next_completed_request().unwrap(); + assert_eq!(user_data, 1); + assert_eq!(result, 0, "punch_hole should succeed"); + drop(async_io1); - // Verify no more completions - assert!(async_qcow.next_completed_request().is_none()); + // Read from the same location to verify + let read_buf = async_read(&disk, offset, data.len()); + assert!( + read_buf.iter().all(|&b| b == 0), + "After punch_hole via new_async_io, read should return zeros" + ); } - #[test] - fn test_qcow_punch_hole_with_shared_instance() { - // This test verifies that with Arc>, multiple async I/O operations - // share the same QcowFile instance and see each other's changes. 
+ fn test_qcow_async_read_write_roundtrip_impl(direct_io: bool) { + let (_temp, disk) = create_disk_with_data(100 * 1024 * 1024, &[], 0, true, direct_io); - // Create a QCOW2 image - let temp_file = TempFile::new().unwrap(); - let raw_file = RawFile::new(temp_file.into_file(), false); - let file_size = 1024 * 1024 * 100; // 100MB - let mut qcow_file = QcowFile::new(raw_file, 3, file_size, true).unwrap(); - - // Write some data at offset 0 - let data = vec![0xAB; 128 * 1024]; // 128KB of 0xAB pattern - let offset = 0; - qcow_file.seek(SeekFrom::Start(offset)).unwrap(); - qcow_file.write_all(&data).unwrap(); - qcow_file.flush().unwrap(); - - let qcow_shared = Arc::new(Mutex::new(qcow_file)); - - // First async I/O: punch hole - let mut async_qcow1 = QcowSync::new(qcow_shared.clone()); - async_qcow1 - .punch_hole(offset, data.len() as u64, 100) - .unwrap(); + let data = vec![0x42u8; 64 * 1024]; + let offset = 0u64; - // Verify punch_hole completed - let (user_data, result) = async_qcow1.next_completed_request().unwrap(); - assert_eq!(user_data, 100); - assert_eq!(result, 0, "punch_hole should succeed"); + async_write(&disk, offset, &data); - // Second async I/O: read from same shared instance - // This should see the deallocated cluster because they share the same QcowFile - let mut read_buf = vec![0xFF; data.len()]; - qcow_shared - .lock() - .unwrap() - .seek(SeekFrom::Start(offset)) - .unwrap(); - qcow_shared - .lock() - .unwrap() - .read_exact(&mut read_buf) - .unwrap(); + let mut async_io = disk.new_async_io(1).unwrap(); + async_io.fsync(Some(10)).unwrap(); + let (ud, res) = async_io.next_completed_request().unwrap(); + assert_eq!(ud, 10); + assert_eq!(res, 0); + drop(async_io); - // The read should return zeros because the cluster was deallocated + let read_buf = async_read(&disk, offset, data.len()); + assert_eq!(read_buf, data, "Read-back should match written data"); + } + + #[test] + fn test_qcow_async_read_write_roundtrip() { + 
test_qcow_async_read_write_roundtrip_impl(false); + } + + #[test] + fn test_qcow_async_read_write_roundtrip_direct_io() { + test_qcow_async_read_write_roundtrip_impl(true); + } + + fn test_qcow_async_read_unallocated_impl(direct_io: bool) { + // Reading from an unallocated region should return zeros. + let (_temp, disk) = create_disk_with_data(100 * 1024 * 1024, &[], 0, true, direct_io); + let read_buf = async_read(&disk, 0, 64 * 1024); assert!( read_buf.iter().all(|&b| b == 0), - "After punch_hole, shared QcowFile instance should read zeros from deallocated cluster" + "Unallocated region should read as zeros" ); } #[test] - fn test_qcow_disk_sync_punch_hole_with_new_async_io() { - // This test simulates the EXACT real usage pattern: QcowDiskSync.new_async_io() - // creates a new QcowSync with a cloned QcowFile for each I/O operation. + fn test_qcow_async_read_unallocated() { + test_qcow_async_read_unallocated_impl(false); + } + + #[test] + fn test_qcow_async_read_unallocated_direct_io() { + test_qcow_async_read_unallocated_impl(true); + } - use std::io::Write; + fn test_qcow_async_cross_cluster_read_write_impl(direct_io: bool) { + let (_temp, disk) = create_disk_with_data(100 * 1024 * 1024, &[], 0, true, direct_io); - use crate::async_io::DiskFile; + // Default cluster size is 64KB. Write 96KB starting at 32KB to cross the boundary. 
+ let data: Vec = (0..96 * 1024).map(|i| (i % 251) as u8).collect(); + let offset = 32 * 1024u64; - // Create a QCOW2 image - let temp_file = TempFile::new().unwrap(); - let file_size = 1024 * 1024 * 100; // 100MB + async_write(&disk, offset, &data); + + let mut async_io = disk.new_async_io(1).unwrap(); + async_io.fsync(Some(99)).unwrap(); + drop(async_io); + + let read_buf = async_read(&disk, offset, data.len()); + assert_eq!( + read_buf, data, + "Cross cluster read should match written data" + ); + } + + #[test] + fn test_qcow_async_cross_cluster_read_write() { + test_qcow_async_cross_cluster_read_write_impl(false); + } + + #[test] + fn test_qcow_async_cross_cluster_read_write_direct_io() { + test_qcow_async_cross_cluster_read_write_impl(true); + } + + fn test_backing_file_read_impl(direct_io: bool) { + let backing_temp = TempFile::new().unwrap(); + let cluster_size = 1u64 << 16; + let file_size = cluster_size * 4; + let pattern: Vec = (0..file_size as usize).map(|i| (i % 251) as u8).collect(); + backing_temp.as_file().write_all(&pattern).unwrap(); + backing_temp.as_file().sync_all().unwrap(); + let backing_path = backing_temp.as_path().to_str().unwrap().to_string(); + let overlay_temp = TempFile::new().unwrap(); { - let raw_file = RawFile::new(temp_file.as_file().try_clone().unwrap(), false); - let mut qcow_file = QcowFile::new(raw_file, 3, file_size, true).unwrap(); + let raw = RawFile::new(overlay_temp.as_file().try_clone().unwrap(), false); + let backing_config = BackingFileConfig { + path: backing_path, + format: Some(ImageType::Raw), + }; + let _overlay = + QcowFile::new_from_backing(raw, 3, file_size, &backing_config, true).unwrap(); + } - // Write data at offset 1MB - use single cluster (64KB) to simplify test - let data = vec![0xCD; 64 * 1024]; // 64KB (one cluster) - let offset = 1024 * 1024u64; - qcow_file.seek(SeekFrom::Start(offset)).unwrap(); - qcow_file.write_all(&data).unwrap(); - qcow_file.flush().unwrap(); + let file = 
overlay_temp.as_file().try_clone().unwrap(); + let disk = QcowDiskSync::new(file, direct_io, true, true).unwrap(); + + // Read first cluster - should come from backing file + let buf = async_read(&disk, 0, cluster_size as usize); + assert_eq!( + &buf[..], + &pattern[..cluster_size as usize], + "First cluster should match backing file data" + ); + + let buf = async_read(&disk, cluster_size, cluster_size as usize); + assert_eq!( + &buf[..], + &pattern[cluster_size as usize..2 * cluster_size as usize], + "Second cluster should match backing file data" + ); + + // Read a partial range spanning cluster boundary + let mid = cluster_size - 512; + let len = 1024usize; + let buf = async_read(&disk, mid, len); + assert_eq!( + &buf[..], + &pattern[mid as usize..mid as usize + len], + "Cross cluster read from backing should match" + ); + + let buf = async_read(&disk, 0, file_size as usize); + assert_eq!( + &buf[..], + &pattern[..], + "Full file read from backing should match" + ); + } + + #[test] + fn test_backing_file_read() { + test_backing_file_read_impl(false); + } + + #[test] + fn test_backing_file_read_direct_io() { + test_backing_file_read_impl(true); + } + + fn test_backing_file_read_qcow2_backing_impl(direct_io: bool) { + let backing_temp = TempFile::new().unwrap(); + let cluster_size = 1u64 << 16; + let file_size = cluster_size * 4; + let pattern: Vec = (0..file_size as usize).map(|i| (i % 251) as u8).collect(); + { + let raw = RawFile::new(backing_temp.as_file().try_clone().unwrap(), false); + let mut qcow = QcowFile::new(raw, 3, file_size, true).unwrap(); + qcow.seek(SeekFrom::Start(0)).unwrap(); + qcow.write_all(&pattern).unwrap(); + qcow.flush().unwrap(); } + let backing_path = backing_temp.as_path().to_str().unwrap().to_string(); - // Open with QcowDiskSync (like real code does) - let disk = - QcowDiskSync::new(temp_file.as_file().try_clone().unwrap(), false, true, true).unwrap(); + let overlay_temp = TempFile::new().unwrap(); + { + let raw = 
RawFile::new(overlay_temp.as_file().try_clone().unwrap(), false); + let backing_config = BackingFileConfig { + path: backing_path, + format: Some(ImageType::Qcow2), + }; + let _overlay = + QcowFile::new_from_backing(raw, 3, file_size, &backing_config, true).unwrap(); + } - // First async I/O: punch hole (simulates DISCARD command) - let mut async_io1 = disk.new_async_io(1).unwrap(); - let offset = 1024 * 1024u64; - let length = 64 * 1024u64; // Single cluster - async_io1.punch_hole(offset, length, 1).unwrap(); - let (user_data, result) = async_io1.next_completed_request().unwrap(); - assert_eq!(user_data, 1); - assert_eq!(result, 0, "punch_hole should succeed"); - drop(async_io1); + let file = overlay_temp.as_file().try_clone().unwrap(); + let disk = QcowDiskSync::new(file, direct_io, true, true).unwrap(); - // Second async I/O: read from the same location (simulates READ command) - let mut async_io2 = disk.new_async_io(1).unwrap(); - let mut read_buf = vec![0xFF; length as usize]; - let iovec = libc::iovec { - iov_base: read_buf.as_mut_ptr() as *mut libc::c_void, - iov_len: read_buf.len(), - }; + // Read first cluster - should come from QCOW2 backing + let buf = async_read(&disk, 0, cluster_size as usize); + assert_eq!( + &buf[..], + &pattern[..cluster_size as usize], + "First cluster from QCOW2 backing should match" + ); - // These assertions are critical to prevent compiler optimization bugs - // that can reorder operations. Without them, the test can fail even - // though the QCOW2 implementation is correct. 
- assert_eq!(iovec.iov_base as *const u8, read_buf.as_ptr()); - assert_eq!(iovec.iov_len, read_buf.len()); + let buf = async_read(&disk, 0, file_size as usize); + assert_eq!( + &buf[..], + &pattern[..], + "Full file from QCOW2 backing should match" + ); - async_io2 - .read_vectored(offset as libc::off_t, &[iovec], 2) - .unwrap(); + // Write to first cluster, then verify second cluster still reads from backing + let new_data = vec![0xAB; cluster_size as usize]; + async_write(&disk, 0, &new_data); + { + let mut async_io = disk.new_async_io(1).unwrap(); + async_io.fsync(Some(99)).unwrap(); + } - let (user_data, result) = async_io2.next_completed_request().unwrap(); - assert_eq!(user_data, 2); + let buf = async_read(&disk, 0, cluster_size as usize); assert_eq!( - result as usize, length as usize, - "read should complete successfully" + &buf[..], + &new_data[..], + "Written cluster should be new data" ); - // Verify the data is all zeros - assert!( - read_buf.iter().all(|&b| b == 0), - "After punch_hole via new_async_io, read should return zeros" + let buf = async_read(&disk, cluster_size, cluster_size as usize); + assert_eq!( + &buf[..], + &pattern[cluster_size as usize..2 * cluster_size as usize], + "Unwritten cluster should still come from backing" ); } #[test] - fn backing_files_disabled_error() { - let header = - QcowHeader::create_for_size_and_path(3, 0x10_0000, Some("/path/to/backing/file")) - .expect("Failed to create header."); - let temp_file = TempFile::new().unwrap(); - let mut raw_file = RawFile::new(temp_file.as_file().try_clone().unwrap(), false); - header - .write_to(&mut raw_file) - .expect("Failed to write header."); + fn test_backing_file_read_qcow2_backing() { + test_backing_file_read_qcow2_backing_impl(false); + } + + #[test] + fn test_backing_file_read_qcow2_backing_direct_io() { + test_backing_file_read_qcow2_backing_impl(true); + } + + fn test_multi_queue_concurrent_reads_impl(direct_io: bool) { + // Verify that multiple queues (threads) can read 
simultaneously. + // This exercises the RwLock + pread64 design: concurrent L2 cache hits + // proceed in parallel and data reads are position independent. + let cluster_size = 1u64 << 16; + let file_size = cluster_size * 16; + let pattern: Vec = (0..file_size as usize).map(|i| (i % 251) as u8).collect(); + let (_temp, disk) = create_disk_with_data(file_size, &pattern, 0, true, direct_io); + let disk = Arc::new(disk); + + let threads: Vec<_> = (0..8) + .map(|t| { + let disk = Arc::clone(&disk); + let pattern = pattern.clone(); + thread::spawn(move || { + for i in 0..16u64 { + // Each thread reads clusters in a different order + let cluster_idx = (i + t * 2) % 16; + let offset = cluster_idx * cluster_size; + let buf = async_read(&disk, offset, cluster_size as usize); + assert_eq!( + &buf[..], + &pattern[offset as usize..(offset + cluster_size) as usize], + "Thread {t} cluster {cluster_idx} mismatch" + ); + } + }) + }) + .collect(); + + for t in threads { + t.join().unwrap(); + } + } + + #[test] + fn test_multi_queue_concurrent_reads() { + test_multi_queue_concurrent_reads_impl(false); + } + + #[test] + fn test_multi_queue_concurrent_reads_direct_io() { + test_multi_queue_concurrent_reads_impl(true); + } + + fn test_multi_queue_concurrent_reads_qcow2_backing_impl(direct_io: bool) { + // Same as above but reads go through a Qcow2Backing, + // exercising concurrent metadata resolution + pread64 in the backing. 
+ let backing_temp = TempFile::new().unwrap(); + let cluster_size = 1u64 << 16; + let file_size = cluster_size * 16; + let pattern: Vec = (0..file_size as usize).map(|i| (i % 251) as u8).collect(); + { + let raw = RawFile::new(backing_temp.as_file().try_clone().unwrap(), false); + let mut qcow = QcowFile::new(raw, 3, file_size, true).unwrap(); + qcow.seek(SeekFrom::Start(0)).unwrap(); + qcow.write_all(&pattern).unwrap(); + qcow.flush().unwrap(); + } + let backing_path = backing_temp.as_path().to_str().unwrap().to_string(); + + let overlay_temp = TempFile::new().unwrap(); + { + let raw = RawFile::new(overlay_temp.as_file().try_clone().unwrap(), false); + let backing_config = BackingFileConfig { + path: backing_path, + format: Some(ImageType::Qcow2), + }; + let _overlay = + QcowFile::new_from_backing(raw, 3, file_size, &backing_config, true).unwrap(); + } + + let file = overlay_temp.as_file().try_clone().unwrap(); + let disk = Arc::new(QcowDiskSync::new(file, direct_io, true, true).unwrap()); + + let threads: Vec<_> = (0..8) + .map(|t| { + let disk = Arc::clone(&disk); + let pattern = pattern.clone(); + thread::spawn(move || { + for i in 0..16u64 { + let cluster_idx = (i + t * 2) % 16; + let offset = cluster_idx * cluster_size; + let buf = async_read(&disk, offset, cluster_size as usize); + assert_eq!( + &buf[..], + &pattern[offset as usize..(offset + cluster_size) as usize], + "Thread {t} cluster {cluster_idx} mismatch (qcow2 backing)" + ); + } + }) + }) + .collect(); + + for t in threads { + t.join().unwrap(); + } + } + + #[test] + fn test_multi_queue_concurrent_reads_qcow2_backing() { + test_multi_queue_concurrent_reads_qcow2_backing_impl(false); + } + + #[test] + fn test_multi_queue_concurrent_reads_qcow2_backing_direct_io() { + test_multi_queue_concurrent_reads_qcow2_backing_impl(true); + } - let file = temp_file.into_file(); - match QcowDiskSync::new(file, false, false, true) { - Err(QcowError::BackingFilesDisabled) => {} - Err(other) => panic!("Expected 
BackingFilesDisabled, got: {other:?}"), - Ok(_) => panic!("Expected BackingFilesDisabled error, but succeeded"), + fn test_three_layer_backing_chain_impl(direct_io: bool) { + // raw base -> qcow2 mid -> qcow2 overlay + // Tests recursive shared_backing_from() with nested backing. + let cluster_size = 1u64 << 16; + let file_size = cluster_size * 4; + let base_pattern: Vec = (0..file_size as usize).map(|i| (i % 251) as u8).collect(); + + // Layer 0: raw base + let base_temp = TempFile::new().unwrap(); + base_temp.as_file().write_all(&base_pattern).unwrap(); + base_temp.as_file().sync_all().unwrap(); + let base_path = base_temp.as_path().to_str().unwrap().to_string(); + + // Layer 1: qcow2 mid pointing at raw base, write to cluster 0 only + let mid_temp = TempFile::new().unwrap(); + let mid_pattern = vec![0xBBu8; cluster_size as usize]; + { + let raw = RawFile::new(mid_temp.as_file().try_clone().unwrap(), false); + let backing_config = BackingFileConfig { + path: base_path, + format: Some(ImageType::Raw), + }; + let mut mid = + QcowFile::new_from_backing(raw, 3, file_size, &backing_config, true).unwrap(); + mid.seek(SeekFrom::Start(0)).unwrap(); + mid.write_all(&mid_pattern).unwrap(); + mid.flush().unwrap(); + } + let mid_path = mid_temp.as_path().to_str().unwrap().to_string(); + + // Layer 2: qcow2 overlay pointing at qcow2 mid, write to cluster 1 only + let overlay_temp = TempFile::new().unwrap(); + let overlay_pattern = vec![0xCCu8; cluster_size as usize]; + { + let raw = RawFile::new(overlay_temp.as_file().try_clone().unwrap(), false); + let backing_config = BackingFileConfig { + path: mid_path, + format: Some(ImageType::Qcow2), + }; + let mut overlay = + QcowFile::new_from_backing(raw, 3, file_size, &backing_config, true).unwrap(); + overlay.seek(SeekFrom::Start(cluster_size)).unwrap(); + overlay.write_all(&overlay_pattern).unwrap(); + overlay.flush().unwrap(); } + + let file = overlay_temp.as_file().try_clone().unwrap(); + let disk = QcowDiskSync::new(file, 
direct_io, true, true).unwrap(); + + // Cluster 0: mid wrote 0xBB + let buf = async_read(&disk, 0, cluster_size as usize); + assert!( + buf.iter().all(|&b| b == 0xBB), + "Cluster 0 should come from mid layer" + ); + + // Cluster 1: overlay wrote 0xCC + let buf = async_read(&disk, cluster_size, cluster_size as usize); + assert!( + buf.iter().all(|&b| b == 0xCC), + "Cluster 1 should come from overlay" + ); + + // Cluster 2: falls through mid (unwritten) to raw base + let buf = async_read(&disk, cluster_size * 2, cluster_size as usize); + let expected_start = (cluster_size * 2) as usize; + assert_eq!( + &buf[..], + &base_pattern[expected_start..expected_start + cluster_size as usize], + "Cluster 2 should come from raw base" + ); + + // Cluster 3: also falls through to raw base + let buf = async_read(&disk, cluster_size * 3, cluster_size as usize); + let expected_start = (cluster_size * 3) as usize; + assert_eq!( + &buf[..], + &base_pattern[expected_start..expected_start + cluster_size as usize], + "Cluster 3 should come from raw base" + ); + } + + #[test] + fn test_three_layer_backing_chain() { + test_three_layer_backing_chain_impl(false); + } + + #[test] + fn test_three_layer_backing_chain_direct_io() { + test_three_layer_backing_chain_impl(true); + } + + fn test_backing_cow_preserves_all_unwritten_clusters_impl(direct_io: bool) { + // Write to specific clusters in the overlay, verify all others still + // read from the qcow2 backing correctly. 
+ let cluster_size = 1u64 << 16; + let num_clusters = 8u64; + let file_size = cluster_size * num_clusters; + let pattern: Vec = (0..file_size as usize).map(|i| (i % 251) as u8).collect(); + + let backing_temp = TempFile::new().unwrap(); + { + let raw = RawFile::new(backing_temp.as_file().try_clone().unwrap(), false); + let mut qcow = QcowFile::new(raw, 3, file_size, true).unwrap(); + qcow.seek(SeekFrom::Start(0)).unwrap(); + qcow.write_all(&pattern).unwrap(); + qcow.flush().unwrap(); + } + let backing_path = backing_temp.as_path().to_str().unwrap().to_string(); + + let overlay_temp = TempFile::new().unwrap(); + { + let raw = RawFile::new(overlay_temp.as_file().try_clone().unwrap(), false); + let backing_config = BackingFileConfig { + path: backing_path, + format: Some(ImageType::Qcow2), + }; + let _overlay = + QcowFile::new_from_backing(raw, 3, file_size, &backing_config, true).unwrap(); + } + + let file = overlay_temp.as_file().try_clone().unwrap(); + let disk = QcowDiskSync::new(file, direct_io, true, true).unwrap(); + + let written = vec![0xFFu8; cluster_size as usize]; + for &idx in &[0u64, 3, 7] { + async_write(&disk, idx * cluster_size, &written); + } + { + let mut async_io = disk.new_async_io(1).unwrap(); + async_io.fsync(Some(99)).unwrap(); + } + + for &idx in &[0u64, 3, 7] { + let buf = async_read(&disk, idx * cluster_size, cluster_size as usize); + assert!( + buf.iter().all(|&b| b == 0xFF), + "Cluster {idx} should be written data" + ); + } + + // Verify unwritten clusters read from backing + for idx in 0..num_clusters { + if idx == 0 || idx == 3 || idx == 7 { + continue; + } + let offset = idx * cluster_size; + let buf = async_read(&disk, offset, cluster_size as usize); + assert_eq!( + &buf[..], + &pattern[offset as usize..(offset + cluster_size) as usize], + "Cluster {idx} should come from backing" + ); + } + } + + #[test] + fn test_backing_cow_preserves_all_unwritten_clusters() { + test_backing_cow_preserves_all_unwritten_clusters_impl(false); + } + + 
#[test] + fn test_backing_cow_preserves_all_unwritten_clusters_direct_io() { + test_backing_cow_preserves_all_unwritten_clusters_impl(true); + } + + fn test_qcow2_backing_read_beyond_virtual_size_impl(direct_io: bool) { + // Read starting past the backing file virtual_size should return zeros. + let cluster_size = 1u64 << 16; + let backing_size = cluster_size * 2; + let overlay_size = cluster_size * 4; // overlay is larger than backing + + let backing_temp = TempFile::new().unwrap(); + { + let raw = RawFile::new(backing_temp.as_file().try_clone().unwrap(), false); + let mut qcow = QcowFile::new(raw, 3, backing_size, true).unwrap(); + qcow.seek(SeekFrom::Start(0)).unwrap(); + qcow.write_all(&vec![0xAA; backing_size as usize]).unwrap(); + qcow.flush().unwrap(); + } + let backing_path = backing_temp.as_path().to_str().unwrap().to_string(); + + let overlay_temp = TempFile::new().unwrap(); + { + let raw = RawFile::new(overlay_temp.as_file().try_clone().unwrap(), false); + let backing_config = BackingFileConfig { + path: backing_path, + format: Some(ImageType::Qcow2), + }; + let _overlay = + QcowFile::new_from_backing(raw, 3, overlay_size, &backing_config, true).unwrap(); + } + + let file = overlay_temp.as_file().try_clone().unwrap(); + let disk = QcowDiskSync::new(file, direct_io, true, true).unwrap(); + + // Read cluster 2 (past backing virtual_size) - should be zeros + let buf = async_read(&disk, backing_size, cluster_size as usize); + assert!( + buf.iter().all(|&b| b == 0), + "Read beyond backing virtual_size should return zeros" + ); + } + + #[test] + fn test_qcow2_backing_read_beyond_virtual_size() { + test_qcow2_backing_read_beyond_virtual_size_impl(false); + } + + #[test] + fn test_qcow2_backing_read_beyond_virtual_size_direct_io() { + test_qcow2_backing_read_beyond_virtual_size_impl(true); + } + + fn test_qcow2_backing_read_spanning_virtual_size_impl(direct_io: bool) { + // Read that starts within backing bounds but extends past virtual_size. 
+ // First part should have backing data, remainder should be zeros. + let cluster_size = 1u64 << 16; + let backing_size = cluster_size * 2; + let overlay_size = cluster_size * 4; + + let backing_temp = TempFile::new().unwrap(); + let backing_data = vec![0xBBu8; backing_size as usize]; + { + let raw = RawFile::new(backing_temp.as_file().try_clone().unwrap(), false); + let mut qcow = QcowFile::new(raw, 3, backing_size, true).unwrap(); + qcow.seek(SeekFrom::Start(0)).unwrap(); + qcow.write_all(&backing_data).unwrap(); + qcow.flush().unwrap(); + } + let backing_path = backing_temp.as_path().to_str().unwrap().to_string(); + + let overlay_temp = TempFile::new().unwrap(); + { + let raw = RawFile::new(overlay_temp.as_file().try_clone().unwrap(), false); + let backing_config = BackingFileConfig { + path: backing_path, + format: Some(ImageType::Qcow2), + }; + let _overlay = + QcowFile::new_from_backing(raw, 3, overlay_size, &backing_config, true).unwrap(); + } + + let file = overlay_temp.as_file().try_clone().unwrap(); + let disk = QcowDiskSync::new(file, direct_io, true, true).unwrap(); + + // Read 2 clusters starting at cluster 1 (spans backing boundary) + let read_len = cluster_size as usize * 2; + let buf = async_read(&disk, cluster_size, read_len); + + // First cluster should be backing data + assert!( + buf[..cluster_size as usize].iter().all(|&b| b == 0xBB), + "First half should come from backing" + ); + + // Second cluster is past backing virtual_size - zeros + assert!( + buf[cluster_size as usize..].iter().all(|&b| b == 0), + "Second half should be zeros (past backing virtual_size)" + ); + } + + #[test] + fn test_qcow2_backing_read_spanning_virtual_size() { + test_qcow2_backing_read_spanning_virtual_size_impl(false); + } + + #[test] + fn test_qcow2_backing_read_spanning_virtual_size_direct_io() { + test_qcow2_backing_read_spanning_virtual_size_impl(true); + } + + fn test_raw_backing_read_beyond_virtual_size_impl(direct_io: bool) { + // Read past raw backing file 
virtual_size should return zeros. + let cluster_size = 1u64 << 16; + let backing_size = cluster_size * 2; + let overlay_size = cluster_size * 4; + + let backing_temp = TempFile::new().unwrap(); + let backing_data = vec![0xDD; backing_size as usize]; + backing_temp.as_file().write_all(&backing_data).unwrap(); + backing_temp.as_file().sync_all().unwrap(); + let backing_path = backing_temp.as_path().to_str().unwrap().to_string(); + + let overlay_temp = TempFile::new().unwrap(); + { + let raw = RawFile::new(overlay_temp.as_file().try_clone().unwrap(), false); + let backing_config = BackingFileConfig { + path: backing_path, + format: Some(ImageType::Raw), + }; + let _overlay = + QcowFile::new_from_backing(raw, 3, overlay_size, &backing_config, true).unwrap(); + } + + let file = overlay_temp.as_file().try_clone().unwrap(); + let disk = QcowDiskSync::new(file, direct_io, true, true).unwrap(); + + // Read cluster 2 (past backing size) - should be zeros + let buf = async_read(&disk, backing_size, cluster_size as usize); + assert!( + buf.iter().all(|&b| b == 0), + "Read beyond raw backing virtual_size should return zeros" + ); + + // Read spanning boundary: cluster 1 has data, cluster 2 zeros + let read_len = cluster_size as usize * 2; + let buf = async_read(&disk, cluster_size, read_len); + assert!( + buf[..cluster_size as usize].iter().all(|&b| b == 0xDD), + "First half should come from raw backing" + ); + assert!( + buf[cluster_size as usize..].iter().all(|&b| b == 0), + "Second half should be zeros (past raw backing size)" + ); + } + + #[test] + fn test_raw_backing_read_beyond_virtual_size() { + test_raw_backing_read_beyond_virtual_size_impl(false); + } + + #[test] + fn test_raw_backing_read_beyond_virtual_size_direct_io() { + test_raw_backing_read_beyond_virtual_size_impl(true); + } + + fn test_qcow2_backing_cross_cluster_read_impl(direct_io: bool) { + // Read spanning a cluster boundary through qcow2 backing. + // Exercises the read_clusters loop in Qcow2Backing. 
+ let cluster_size = 1u64 << 16; + let file_size = cluster_size * 4; + let pattern: Vec = (0..file_size as usize).map(|i| (i % 251) as u8).collect(); + + let backing_temp = TempFile::new().unwrap(); + { + let raw = RawFile::new(backing_temp.as_file().try_clone().unwrap(), false); + let mut qcow = QcowFile::new(raw, 3, file_size, true).unwrap(); + qcow.seek(SeekFrom::Start(0)).unwrap(); + qcow.write_all(&pattern).unwrap(); + qcow.flush().unwrap(); + } + let backing_path = backing_temp.as_path().to_str().unwrap().to_string(); + + let overlay_temp = TempFile::new().unwrap(); + { + let raw = RawFile::new(overlay_temp.as_file().try_clone().unwrap(), false); + let backing_config = BackingFileConfig { + path: backing_path, + format: Some(ImageType::Qcow2), + }; + let _overlay = + QcowFile::new_from_backing(raw, 3, file_size, &backing_config, true).unwrap(); + } + + let file = overlay_temp.as_file().try_clone().unwrap(); + let disk = QcowDiskSync::new(file, direct_io, true, true).unwrap(); + + // Read spanning clusters 1-2 boundary: 512 bytes before + 512 after + let mid = cluster_size - 512; + let len = 1024usize; + let buf = async_read(&disk, mid, len); + assert_eq!( + &buf[..], + &pattern[mid as usize..mid as usize + len], + "Cross cluster read through qcow2 backing should match" + ); + + // Read spanning clusters 0-1-2 (3 clusters worth) + let start = cluster_size / 2; + let len = cluster_size as usize * 2; + let buf = async_read(&disk, start, len); + assert_eq!( + &buf[..], + &pattern[start as usize..start as usize + len], + "Multi cluster read through qcow2 backing should match" + ); + } + + #[test] + fn test_qcow2_backing_cross_cluster_read() { + test_qcow2_backing_cross_cluster_read_impl(false); + } + + #[test] + fn test_qcow2_backing_cross_cluster_read_direct_io() { + test_qcow2_backing_cross_cluster_read_impl(true); + } + + fn test_punch_hole_with_backing_fallthrough_impl(direct_io: bool) { + // Write to overlay, then punch hole. 
After punch, the cluster should + // fall through to backing data (not zeros). + let cluster_size = 1u64 << 16; + let file_size = cluster_size * 4; + let pattern: Vec = (0..file_size as usize).map(|i| (i % 251) as u8).collect(); + + let backing_temp = TempFile::new().unwrap(); + backing_temp.as_file().write_all(&pattern).unwrap(); + backing_temp.as_file().sync_all().unwrap(); + let backing_path = backing_temp.as_path().to_str().unwrap().to_string(); + + let overlay_temp = TempFile::new().unwrap(); + { + let raw = RawFile::new(overlay_temp.as_file().try_clone().unwrap(), false); + let backing_config = BackingFileConfig { + path: backing_path, + format: Some(ImageType::Raw), + }; + let _overlay = + QcowFile::new_from_backing(raw, 3, file_size, &backing_config, true).unwrap(); + } + + let file = overlay_temp.as_file().try_clone().unwrap(); + let disk = QcowDiskSync::new(file, direct_io, true, true).unwrap(); + + let written = vec![0xFFu8; cluster_size as usize]; + async_write(&disk, 0, &written); + { + let mut async_io = disk.new_async_io(1).unwrap(); + async_io.fsync(Some(99)).unwrap(); + } + + let buf = async_read(&disk, 0, cluster_size as usize); + assert!(buf.iter().all(|&b| b == 0xFF), "Should read written data"); + + // Punch hole on cluster 0 - should deallocate and fall through to backing + { + let mut async_io = disk.new_async_io(1).unwrap(); + async_io.punch_hole(0, cluster_size, 42).unwrap(); + let (ud, res) = async_io.next_completed_request().unwrap(); + assert_eq!(ud, 42); + assert_eq!(res, 0); + } + + // Now read should return backing data, not zeros + let buf = async_read(&disk, 0, cluster_size as usize); + assert_eq!( + &buf[..], + &pattern[..cluster_size as usize], + "After punch_hole with backing, should read backing data" + ); + + // Cluster 1 should still be backing data throughout + let buf = async_read(&disk, cluster_size, cluster_size as usize); + assert_eq!( + &buf[..], + &pattern[cluster_size as usize..2 * cluster_size as usize], + "Untouched 
cluster should read from backing" + ); + } + + #[test] + fn test_punch_hole_with_backing_fallthrough() { + test_punch_hole_with_backing_fallthrough_impl(false); + } + + #[test] + fn test_punch_hole_with_backing_fallthrough_direct_io() { + test_punch_hole_with_backing_fallthrough_impl(true); + } + + fn test_rewrite_allocated_cluster_impl(direct_io: bool) { + // Write to a cluster, then overwrite it. The second write should hit + // the already allocated path in map_write (no new cluster allocation). + let (_temp, disk) = create_disk_with_data(100 * 1024 * 1024, &[], 0, true, direct_io); + let cluster_size = 1u64 << 16; + + let data1 = vec![0xAAu8; cluster_size as usize]; + async_write(&disk, 0, &data1); + { + let mut aio = disk.new_async_io(1).unwrap(); + aio.fsync(Some(1)).unwrap(); + } + let buf = async_read(&disk, 0, cluster_size as usize); + assert!(buf.iter().all(|&b| b == 0xAA), "First write should stick"); + + let data2 = vec![0xBBu8; cluster_size as usize]; + async_write(&disk, 0, &data2); + { + let mut aio = disk.new_async_io(1).unwrap(); + aio.fsync(Some(2)).unwrap(); + } + let buf = async_read(&disk, 0, cluster_size as usize); + assert!( + buf.iter().all(|&b| b == 0xBB), + "Overwrite should replace data" + ); + } + + #[test] + fn test_rewrite_allocated_cluster() { + test_rewrite_allocated_cluster_impl(false); + } + + #[test] + fn test_rewrite_allocated_cluster_direct_io() { + test_rewrite_allocated_cluster_impl(true); + } + + fn test_partial_cluster_write_with_backing_cow_impl(direct_io: bool) { + // Partial cluster write to an overlay with a backing file triggers COW. + // The unwritten part of the cluster must be copied from backing. 
+ let cluster_size = 1u64 << 16; + let file_size = cluster_size * 4; + let pattern: Vec = (0..file_size as usize).map(|i| (i % 251) as u8).collect(); + + let backing_temp = TempFile::new().unwrap(); + backing_temp.as_file().write_all(&pattern).unwrap(); + backing_temp.as_file().sync_all().unwrap(); + let backing_path = backing_temp.as_path().to_str().unwrap().to_string(); + + let overlay_temp = TempFile::new().unwrap(); + { + let raw = RawFile::new(overlay_temp.as_file().try_clone().unwrap(), false); + let backing_config = BackingFileConfig { + path: backing_path, + format: Some(ImageType::Raw), + }; + let _overlay = + QcowFile::new_from_backing(raw, 3, file_size, &backing_config, true).unwrap(); + } + + let file = overlay_temp.as_file().try_clone().unwrap(); + let disk = QcowDiskSync::new(file, direct_io, true, true).unwrap(); + + // Write 4KB at offset 4KB within cluster 0 (partial cluster) + let write_offset = 4096u64; + let write_len = 4096usize; + let write_data = vec![0xEEu8; write_len]; + async_write(&disk, write_offset, &write_data); + { + let mut aio = disk.new_async_io(1).unwrap(); + aio.fsync(Some(1)).unwrap(); + } + + let buf = async_read(&disk, 0, cluster_size as usize); + + // Before the write: should be COW'd from backing + assert_eq!( + &buf[..write_offset as usize], + &pattern[..write_offset as usize], + "Pre write region should be COW from backing" + ); + + assert_eq!( + &buf[write_offset as usize..write_offset as usize + write_len], + &write_data[..], + "Written region should be new data" + ); + + // After the write: should be COW'd from backing + let after_offset = write_offset as usize + write_len; + assert_eq!( + &buf[after_offset..cluster_size as usize], + &pattern[after_offset..cluster_size as usize], + "Post write region should be COW from backing" + ); + } + + #[test] + fn test_partial_cluster_write_with_backing_cow() { + test_partial_cluster_write_with_backing_cow_impl(false); + } + + #[test] + fn 
test_partial_cluster_write_with_backing_cow_direct_io() { + test_partial_cluster_write_with_backing_cow_impl(true); + } + + #[test] + fn test_partial_cluster_deallocate() { + // Punch hole on a partial cluster range. The deallocate_bytes path + // should produce WriteZeroes actions for partial clusters. + let cluster_size = 1u64 << 16; + let file_size = cluster_size * 4; + + let data: Vec = (0..2 * cluster_size as usize) + .map(|i| (i % 251) as u8) + .collect(); + let (_temp, disk) = create_disk_with_data(file_size, &data, 0, true, false); + + // Punch a partial range: last 4KB of cluster 0 + first 4KB of cluster 1 + let punch_offset = cluster_size - 4096; + let punch_len = 8192u64; + { + let mut aio = disk.new_async_io(1).unwrap(); + aio.punch_hole(punch_offset, punch_len, 10).unwrap(); + let (ud, res) = aio.next_completed_request().unwrap(); + assert_eq!(ud, 10); + assert_eq!(res, 0); + } + + let buf = async_read(&disk, 0, 2 * cluster_size as usize); + + // Before punch: unchanged + assert_eq!( + &buf[..punch_offset as usize], + &data[..punch_offset as usize], + "Data before punch should be unchanged" + ); + + // Punched region: zeros + assert!( + buf[punch_offset as usize..(punch_offset + punch_len) as usize] + .iter() + .all(|&b| b == 0), + "Punched region should be zeros" + ); + + // After punch: unchanged + let after = (punch_offset + punch_len) as usize; + assert_eq!( + &buf[after..2 * cluster_size as usize], + &data[after..2 * cluster_size as usize], + "Data after punch should be unchanged" + ); + } + + #[test] + fn test_resize_grow() { + let cluster_size = 1u64 << 16; + let initial_size = cluster_size * 4; + let data = vec![0xAA; cluster_size as usize]; + let (_temp, mut disk) = create_disk_with_data(initial_size, &data, 0, true, false); + + assert_eq!(disk.logical_size().unwrap(), initial_size); + + let new_size = cluster_size * 8; + disk.resize(new_size).unwrap(); + assert_eq!(disk.logical_size().unwrap(), new_size); + + // Original data intact + let buf 
= async_read(&disk, 0, cluster_size as usize); + assert!( + buf.iter().all(|&b| b == 0xAA), + "Original data should survive resize" + ); + + // New region reads as zeros + let buf = async_read(&disk, initial_size, cluster_size as usize); + assert!( + buf.iter().all(|&b| b == 0), + "Newly grown region should read as zeros" + ); + + // Can write to newly grown region + let new_data = vec![0xBB; cluster_size as usize]; + async_write(&disk, initial_size, &new_data); + { + let mut aio = disk.new_async_io(1).unwrap(); + aio.fsync(Some(1)).unwrap(); + } + let buf = async_read(&disk, initial_size, cluster_size as usize); + assert!( + buf.iter().all(|&b| b == 0xBB), + "Write to grown region should work" + ); + } + + #[test] + fn test_resize_with_backing_file_rejected() { + let backing_temp = TempFile::new().unwrap(); + let cluster_size = 1u64 << 16; + let file_size = cluster_size * 4; + backing_temp + .as_file() + .write_all(&vec![0u8; file_size as usize]) + .unwrap(); + backing_temp.as_file().sync_all().unwrap(); + let backing_path = backing_temp.as_path().to_str().unwrap().to_string(); + + let overlay_temp = TempFile::new().unwrap(); + { + let raw = RawFile::new(overlay_temp.as_file().try_clone().unwrap(), false); + let backing_config = BackingFileConfig { + path: backing_path, + format: Some(ImageType::Raw), + }; + let _overlay = + QcowFile::new_from_backing(raw, 3, file_size, &backing_config, true).unwrap(); + } + + let file = overlay_temp.as_file().try_clone().unwrap(); + let mut disk = QcowDiskSync::new(file, false, true, true).unwrap(); + + assert_eq!(disk.logical_size().unwrap(), file_size); + let result = disk.resize(file_size * 2); + assert!(result.is_err(), "resize with backing file should fail"); + assert_eq!( + disk.logical_size().unwrap(), + file_size, + "size should be unchanged after failed resize" + ); + } + + fn test_multi_iovec_read_write_impl(direct_io: bool) { + // Exercise scatter/gather with multiple iovecs per operation. 
+ let (_temp, disk) = create_disk_with_data(100 * 1024 * 1024, &[], 0, true, direct_io); + + // Write: 3 iovecs with distinct patterns + let a = vec![0xAAu8; 16 * 1024]; + let b = vec![0xBBu8; 32 * 1024]; + let c = vec![0xCCu8; 16 * 1024]; + let iovecs_w = [ + libc::iovec { + iov_base: a.as_ptr() as *mut libc::c_void, + iov_len: a.len(), + }, + libc::iovec { + iov_base: b.as_ptr() as *mut libc::c_void, + iov_len: b.len(), + }, + libc::iovec { + iov_base: c.as_ptr() as *mut libc::c_void, + iov_len: c.len(), + }, + ]; + let total = a.len() + b.len() + c.len(); + + let mut aio = disk.new_async_io(1).unwrap(); + aio.write_vectored(0, &iovecs_w, 1).unwrap(); + let (ud, res) = aio.next_completed_request().unwrap(); + assert_eq!(ud, 1); + assert_eq!(res as usize, total); + aio.fsync(Some(2)).unwrap(); + drop(aio); + + // Read back into 3 iovecs of different sizes + let mut r1 = vec![0u8; 8 * 1024]; + let mut r2 = vec![0u8; 48 * 1024]; + let mut r3 = vec![0u8; 8 * 1024]; + let iovecs_r = [ + libc::iovec { + iov_base: r1.as_mut_ptr() as *mut libc::c_void, + iov_len: r1.len(), + }, + libc::iovec { + iov_base: r2.as_mut_ptr() as *mut libc::c_void, + iov_len: r2.len(), + }, + libc::iovec { + iov_base: r3.as_mut_ptr() as *mut libc::c_void, + iov_len: r3.len(), + }, + ]; + + let mut aio = disk.new_async_io(1).unwrap(); + aio.read_vectored(0, &iovecs_r, 10).unwrap(); + let (ud, res) = aio.next_completed_request().unwrap(); + assert_eq!(ud, 10); + assert_eq!(res as usize, total); + drop(aio); + + // Reassemble the read buffers into a flat vec + let mut got = Vec::with_capacity(total); + got.extend_from_slice(&r1); + got.extend_from_slice(&r2); + got.extend_from_slice(&r3); + + // Build expected from the write buffers + let mut expected = Vec::with_capacity(total); + expected.extend_from_slice(&a); + expected.extend_from_slice(&b); + expected.extend_from_slice(&c); + + assert_eq!(got, expected, "Multi iovec read should match written data"); + } + + #[test] + fn 
test_multi_iovec_read_write() { + test_multi_iovec_read_write_impl(false); + } + + #[test] + fn test_multi_iovec_read_write_direct_io() { + test_multi_iovec_read_write_impl(true); + } + + // -- Low level aligned I/O function tests -- + // + // Test aligned_pread and aligned_pwrite directly with controlled + // alignment values on a plain temp file. + + /// Create a temp file filled with a repeating pattern of the given size. + /// Returns the TempFile (must be kept alive) and the raw fd. + fn create_pattern_file(size: usize) -> (TempFile, RawFd) { + let tf = TempFile::new().unwrap(); + let pattern: Vec = (0..size).map(|i| (i % 251) as u8).collect(); + tf.as_file().write_all(&pattern).unwrap(); + tf.as_file().sync_all().unwrap(); + let fd = tf.as_file().as_raw_fd(); + (tf, fd) + } + + #[test] + fn test_aligned_pread_pass_through() { + // When buffer address, length, and offset are all aligned, + // aligned_pread should take the fast path (no bounce buffer). + let size = 4096usize; + let (_tf, fd) = create_pattern_file(size); + let alignment = 512; + + // Use AlignedBuf to guarantee buffer address alignment. + let mut abuf = AlignedBuf::new(size, alignment).unwrap(); + aligned_pread(fd, abuf.as_mut_slice(size), 0, alignment).unwrap(); + + let expected: Vec = (0..size).map(|i| (i % 251) as u8).collect(); + assert_eq!(abuf.as_slice(size), &expected[..]); + } + + #[test] + fn test_aligned_pread_bounce_unaligned_buffer() { + // Force a misaligned buffer so aligned_pread must take the + // bounce path. A plain vec![0u8; 4096] is often page-aligned + // by the allocator, which would skip the bounce entirely. 
+ let size = 4096usize; + let (_tf, fd) = create_pattern_file(size); + let alignment = 512; + + let mut backing = vec![0u8; size + 1]; + let buf = &mut backing[1..size + 1]; + aligned_pread(fd, buf, 0, alignment).unwrap(); + + let expected: Vec = (0..size).map(|i| (i % 251) as u8).collect(); + assert_eq!(buf, &expected[..]); + } + + #[test] + fn test_aligned_pread_unaligned_offset() { + // Read at an offset that is not a multiple of alignment. + // aligned_pread should round down the offset, read an aligned + // region, then copy the correct slice into the caller buffer. + let file_size = 8192usize; + let (_tf, fd) = create_pattern_file(file_size); + let alignment = 512; + + let offset = 100u64; + let len = 200usize; + let mut buf = vec![0u8; len]; + aligned_pread(fd, &mut buf, offset, alignment).unwrap(); + + let expected: Vec = (offset as usize..offset as usize + len) + .map(|i| (i % 251) as u8) + .collect(); + assert_eq!(buf, expected); + } + + #[test] + fn test_aligned_pwrite_pass_through() { + // When buffer address, length, and offset are all aligned, + // aligned_pwrite should take the fast path. + let size = 4096usize; + let (_tf, fd) = create_pattern_file(size); + let alignment = 512; + + let data: Vec = (0..size).map(|i| ((i + 1) % 251) as u8).collect(); + let mut abuf = AlignedBuf::new(size, alignment).unwrap(); + abuf.as_mut_slice(size).copy_from_slice(&data); + aligned_pwrite(fd, abuf.as_slice(size), 0, alignment).unwrap(); + + let mut readback = vec![0u8; size]; + pread_exact(fd, &mut readback, 0).unwrap(); + assert_eq!(readback, data); + } + + #[test] + fn test_aligned_pwrite_bounce_unaligned_buffer() { + // Force a misaligned buffer so aligned_pwrite must take the + // bounce path. A plain vec![0u8; 4096] is often page-aligned + // by the allocator, which would skip the bounce entirely. 
+ let size = 4096usize; + let (_tf, fd) = create_pattern_file(size); + let alignment = 512; + + let backing: Vec = (0..size + 1).map(|i| ((i + 1) % 251) as u8).collect(); + let data = &backing[1..size + 1]; + aligned_pwrite(fd, data, 0, alignment).unwrap(); + + let mut readback = vec![0u8; size]; + pread_exact(fd, &mut readback, 0).unwrap(); + assert_eq!(readback, data); + } + + #[test] + fn test_aligned_pwrite_unaligned_offset() { + // Write at an offset that is not a multiple of alignment. + // aligned_pwrite should do read-modify-write and preserve + // surrounding data. + let file_size = 8192usize; + let (_tf, fd) = create_pattern_file(file_size); + let alignment = 512; + + let offset = 100u64; + let len = 200usize; + let data: Vec = (0..len).map(|i| ((i + 1) % 239) as u8).collect(); + aligned_pwrite(fd, &data, offset, alignment).unwrap(); + + // Read entire file and verify the written region plus untouched areas. + let mut whole = vec![0u8; file_size]; + pread_exact(fd, &mut whole, 0).unwrap(); + + // Before the write region: original pattern. + let before: Vec = (0..offset as usize).map(|i| (i % 251) as u8).collect(); + assert_eq!(&whole[..offset as usize], &before[..]); + + // The written region. + assert_eq!(&whole[offset as usize..offset as usize + len], &data[..]); + + // After the write region: original pattern. + let after_start = offset as usize + len; + let after: Vec = (after_start..file_size).map(|i| (i % 251) as u8).collect(); + assert_eq!(&whole[after_start..], &after[..]); + } + + #[test] + fn test_aligned_pread_pwrite_4096_alignment() { + // Exercise aligned I/O with 4096 byte alignment. + let file_size = 16384usize; + let (_tf, fd) = create_pattern_file(file_size); + let alignment = 4096; + + // Write 4096 bytes at offset 4096 via unaligned Vec. 
+ let offset = 4096u64; + let len = 4096usize; + let data: Vec = (0..len).map(|i| ((i + 1) % 239) as u8).collect(); + aligned_pwrite(fd, &data, offset, alignment).unwrap(); + + // Read back the written region via unaligned Vec. + let mut buf = vec![0u8; len]; + aligned_pread(fd, &mut buf, offset, alignment).unwrap(); + assert_eq!(buf, data); + + // Verify untouched regions. + let mut whole = vec![0u8; file_size]; + pread_exact(fd, &mut whole, 0).unwrap(); + let before: Vec = (0..offset as usize).map(|i| (i % 251) as u8).collect(); + assert_eq!(&whole[..offset as usize], &before[..]); + let after_start = offset as usize + len; + let after: Vec = (after_start..file_size).map(|i| (i % 251) as u8).collect(); + assert_eq!(&whole[after_start..], &after[..]); + } + + #[test] + fn test_aligned_buf_allocation_and_access() { + for alignment in [512, 4096] { + let size = 1024usize; + let mut abuf = AlignedBuf::new(size, alignment).unwrap(); + let aligned_size = size.next_multiple_of(alignment); + + assert!( + (abuf.ptr() as usize).is_multiple_of(alignment), + "ptr not aligned to {alignment}" + ); + assert!(abuf.as_slice(aligned_size).iter().all(|&b| b == 0)); + + let pattern: Vec = (0..size).map(|i| (i % 251) as u8).collect(); + abuf.as_mut_slice(size).copy_from_slice(&pattern); + assert_eq!(abuf.as_slice(size), &pattern[..]); + } + } + + #[test] + fn test_aligned_buf_size_rounds_up() { + let abuf = AlignedBuf::new(1, 512).unwrap(); + assert_eq!(abuf.layout().size(), 512); + + let abuf = AlignedBuf::new(513, 512).unwrap(); + assert_eq!(abuf.layout().size(), 1024); } } diff --git a/block/src/raw_async.rs b/block/src/raw_async.rs index 539aaa9095..7fa3208f42 100644 --- a/block/src/raw_async.rs +++ b/block/src/raw_async.rs @@ -3,18 +3,23 @@ // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause use std::fs::File; -use std::io::{Error, Seek, SeekFrom}; +use std::io::{self, Error}; +use std::os::unix::fs::FileTypeExt; use std::os::unix::io::{AsRawFd, RawFd}; use 
io_uring::{IoUring, opcode, types}; +use libc::{FALLOC_FL_KEEP_SIZE, FALLOC_FL_PUNCH_HOLE, FALLOC_FL_ZERO_RANGE}; use log::warn; use vmm_sys_util::eventfd::EventFd; -use crate::async_io::{ - AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFile, DiskFileError, DiskFileResult, +use crate::async_io::{AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFileError}; +use crate::error::{BlockError, BlockErrorKind, BlockResult}; +use crate::{ + BatchRequest, DiskTopology, RequestType, SECTOR_SIZE, disk_file, probe_sparse_support, + query_device_size, }; -use crate::{BatchRequest, DiskTopology, RequestType, probe_sparse_support}; +#[derive(Debug)] pub struct RawFileDisk { file: File, } @@ -25,46 +30,89 @@ impl RawFileDisk { } } -impl DiskFile for RawFileDisk { - fn logical_size(&mut self) -> DiskFileResult { - self.file - .seek(SeekFrom::End(0)) - .map_err(DiskFileError::Size) +impl disk_file::DiskSize for RawFileDisk { + fn logical_size(&self) -> BlockResult { + query_device_size(&self.file) + .map(|(logical_size, _)| logical_size) + .map_err(|e| BlockError::new(BlockErrorKind::Io, DiskFileError::Size(e))) } +} - fn physical_size(&mut self) -> DiskFileResult { - self.file - .metadata() - .map(|m| m.len()) - .map_err(DiskFileError::Size) +impl disk_file::PhysicalSize for RawFileDisk { + fn physical_size(&self) -> BlockResult { + query_device_size(&self.file) + .map(|(_, physical_size)| physical_size) + .map_err(|e| BlockError::new(BlockErrorKind::Io, DiskFileError::Size(e))) } +} - fn new_async_io(&self, ring_depth: u32) -> DiskFileResult> { - Ok(Box::new( - RawFileAsync::new(self.file.as_raw_fd(), ring_depth) - .map_err(DiskFileError::NewAsyncIo)?, - ) as Box) +impl disk_file::DiskFd for RawFileDisk { + fn fd(&self) -> BorrowedDiskFd<'_> { + BorrowedDiskFd::new(self.file.as_raw_fd()) } +} - fn topology(&mut self) -> DiskTopology { - if let Ok(topology) = DiskTopology::probe(&self.file) { - topology - } else { +impl disk_file::Geometry for RawFileDisk { + fn 
topology(&self) -> DiskTopology { + DiskTopology::probe(&self.file).unwrap_or_else(|_| { warn!("Unable to get device topology. Using default topology"); DiskTopology::default() - } - } - - fn resize(&mut self, size: u64) -> DiskFileResult<()> { - self.file.set_len(size).map_err(DiskFileError::ResizeError) + }) } +} +impl disk_file::SparseCapable for RawFileDisk { fn supports_sparse_operations(&self) -> bool { probe_sparse_support(&self.file) } +} - fn fd(&mut self) -> BorrowedDiskFd<'_> { - BorrowedDiskFd::new(self.file.as_raw_fd()) +impl disk_file::Resizable for RawFileDisk { + fn resize(&mut self, size: u64) -> BlockResult<()> { + let fd_metadata = self + .file + .metadata() + .map_err(|e| BlockError::new(BlockErrorKind::Io, DiskFileError::ResizeError(e)))?; + + if fd_metadata.file_type().is_block_device() { + // Block devices cannot be resized via ftruncate - they are resized + // externally (LVM, losetup -c, etc.). Verify the size matches. + let (actual_size, _) = query_device_size(&self.file) + .map_err(|e| BlockError::new(BlockErrorKind::Io, DiskFileError::ResizeError(e)))?; + if actual_size != size { + return Err(BlockError::new( + BlockErrorKind::Io, + DiskFileError::ResizeError(io::Error::other(format!( + "Block device size {actual_size} does not match requested size {size}" + ))), + )); + } + Ok(()) + } else { + self.file + .set_len(size) + .map_err(|e| BlockError::new(BlockErrorKind::Io, DiskFileError::ResizeError(e))) + } + } +} + +impl disk_file::DiskFile for RawFileDisk {} + +impl disk_file::AsyncDiskFile for RawFileDisk { + fn try_clone(&self) -> BlockResult> { + let file = self + .file + .try_clone() + .map_err(|e| BlockError::new(BlockErrorKind::Io, DiskFileError::Clone(e)))?; + Ok(Box::new(RawFileDisk { file })) + } + + fn new_async_io(&self, ring_depth: u32) -> BlockResult> { + let mut raw = RawFileAsync::new(self.file.as_raw_fd(), ring_depth) + .map_err(|e| BlockError::new(BlockErrorKind::Io, DiskFileError::NewAsyncIo(e)))?; + raw.alignment = + 
DiskTopology::probe(&self.file).map_or(SECTOR_SIZE, |t| t.logical_block_size); + Ok(Box::new(raw) as Box) } } @@ -72,6 +120,7 @@ pub struct RawFileAsync { fd: RawFd, io_uring: IoUring, eventfd: EventFd, + alignment: u64, } impl RawFileAsync { @@ -87,6 +136,7 @@ impl RawFileAsync { fd, io_uring, eventfd, + alignment: SECTOR_SIZE, }) } } @@ -96,6 +146,10 @@ impl AsyncIo for RawFileAsync { &self.eventfd } + fn alignment(&self) -> u64 { + self.alignment + } + fn read_vectored( &mut self, offset: libc::off_t, @@ -261,8 +315,6 @@ impl AsyncIo for RawFileAsync { fn punch_hole(&mut self, offset: u64, length: u64, user_data: u64) -> AsyncIoResult<()> { let (submitter, mut sq, _) = self.io_uring.split(); - const FALLOC_FL_PUNCH_HOLE: i32 = 0x02; - const FALLOC_FL_KEEP_SIZE: i32 = 0x01; let mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; // SAFETY: The file descriptor is known to be valid. @@ -288,8 +340,6 @@ impl AsyncIo for RawFileAsync { fn write_zeroes(&mut self, offset: u64, length: u64, user_data: u64) -> AsyncIoResult<()> { let (submitter, mut sq, _) = self.io_uring.split(); - const FALLOC_FL_ZERO_RANGE: i32 = 0x10; - const FALLOC_FL_KEEP_SIZE: i32 = 0x01; let mode = FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE; // SAFETY: The file descriptor is known to be valid. 
diff --git a/block/src/raw_async_aio.rs b/block/src/raw_async_aio.rs index 6447a727d8..980f8d13a7 100644 --- a/block/src/raw_async_aio.rs +++ b/block/src/raw_async_aio.rs @@ -5,19 +5,20 @@ // Copyright © 2023 Crusoe Energy Systems LLC // +use std::collections::VecDeque; use std::fs::File; -use std::io::{Seek, SeekFrom}; use std::os::unix::io::{AsRawFd, RawFd}; +use libc::{FALLOC_FL_KEEP_SIZE, FALLOC_FL_PUNCH_HOLE, FALLOC_FL_ZERO_RANGE}; use log::warn; use vmm_sys_util::aio; use vmm_sys_util::eventfd::EventFd; -use crate::async_io::{ - AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFile, DiskFileError, DiskFileResult, -}; -use crate::{DiskTopology, probe_sparse_support}; +use crate::async_io::{AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFileError}; +use crate::error::{BlockError, BlockErrorKind, BlockResult}; +use crate::{DiskTopology, SECTOR_SIZE, disk_file, probe_sparse_support, query_device_size}; +#[derive(Debug)] pub struct RawFileDiskAio { file: File, } @@ -28,42 +29,67 @@ impl RawFileDiskAio { } } -impl DiskFile for RawFileDiskAio { - fn logical_size(&mut self) -> DiskFileResult { - self.file - .seek(SeekFrom::End(0)) - .map_err(DiskFileError::Size) +impl disk_file::DiskSize for RawFileDiskAio { + fn logical_size(&self) -> BlockResult { + query_device_size(&self.file) + .map(|(logical_size, _)| logical_size) + .map_err(|e| BlockError::new(BlockErrorKind::Io, DiskFileError::Size(e))) } +} - fn physical_size(&mut self) -> DiskFileResult { - self.file - .metadata() - .map(|m| m.len()) - .map_err(DiskFileError::Size) +impl disk_file::PhysicalSize for RawFileDiskAio { + fn physical_size(&self) -> BlockResult { + query_device_size(&self.file) + .map(|(_, physical_size)| physical_size) + .map_err(|e| BlockError::new(BlockErrorKind::Io, DiskFileError::Size(e))) } +} - fn new_async_io(&self, ring_depth: u32) -> DiskFileResult> { - Ok(Box::new( - RawFileAsyncAio::new(self.file.as_raw_fd(), ring_depth) - .map_err(DiskFileError::NewAsyncIo)?, - ) 
as Box) +impl disk_file::DiskFd for RawFileDiskAio { + fn fd(&self) -> BorrowedDiskFd<'_> { + BorrowedDiskFd::new(self.file.as_raw_fd()) } +} - fn topology(&mut self) -> DiskTopology { - if let Ok(topology) = DiskTopology::probe(&self.file) { - topology - } else { +impl disk_file::Geometry for RawFileDiskAio { + fn topology(&self) -> DiskTopology { + DiskTopology::probe(&self.file).unwrap_or_else(|_| { warn!("Unable to get device topology. Using default topology"); DiskTopology::default() - } + }) } +} +impl disk_file::SparseCapable for RawFileDiskAio { fn supports_sparse_operations(&self) -> bool { probe_sparse_support(&self.file) } +} - fn fd(&mut self) -> BorrowedDiskFd<'_> { - BorrowedDiskFd::new(self.file.as_raw_fd()) +impl disk_file::Resizable for RawFileDiskAio { + fn resize(&mut self, size: u64) -> BlockResult<()> { + self.file + .set_len(size) + .map_err(|e| BlockError::new(BlockErrorKind::Io, DiskFileError::ResizeError(e))) + } +} + +impl disk_file::DiskFile for RawFileDiskAio {} + +impl disk_file::AsyncDiskFile for RawFileDiskAio { + fn try_clone(&self) -> BlockResult> { + let file = self + .file + .try_clone() + .map_err(|e| BlockError::new(BlockErrorKind::Io, DiskFileError::Clone(e)))?; + Ok(Box::new(RawFileDiskAio { file })) + } + + fn new_async_io(&self, ring_depth: u32) -> BlockResult> { + let mut raw = RawFileAsyncAio::new(self.file.as_raw_fd(), ring_depth)?; + raw.alignment = + DiskTopology::probe(&self.file).map_or(SECTOR_SIZE, |t| t.logical_block_size); + Ok(Box::new(raw) as Box) } } @@ -71,14 +97,24 @@ pub struct RawFileAsyncAio { fd: RawFd, ctx: aio::IoContext, eventfd: EventFd, + alignment: u64, + completion_list: VecDeque<(u64, i32)>, } impl RawFileAsyncAio { - pub fn new(fd: RawFd, queue_depth: u32) -> std::io::Result { - let eventfd = EventFd::new(libc::EFD_NONBLOCK)?; - let ctx = aio::IoContext::new(queue_depth)?; + pub fn new(fd: RawFd, queue_depth: u32) -> BlockResult { + let eventfd = + EventFd::new(libc::EFD_NONBLOCK).map_err(|e| 
BlockError::new(BlockErrorKind::Io, e))?; + let ctx = + aio::IoContext::new(queue_depth).map_err(|e| BlockError::new(BlockErrorKind::Io, e))?; - Ok(RawFileAsyncAio { fd, ctx, eventfd }) + Ok(RawFileAsyncAio { + fd, + ctx, + eventfd, + alignment: SECTOR_SIZE, + completion_list: VecDeque::new(), + }) } } @@ -87,6 +123,10 @@ impl AsyncIo for RawFileAsyncAio { &self.eventfd } + fn alignment(&self) -> u64 { + self.alignment + } + fn read_vectored( &mut self, offset: libc::off_t, @@ -157,24 +197,99 @@ impl AsyncIo for RawFileAsyncAio { } fn next_completed_request(&mut self) -> Option<(u64, i32)> { - let mut events: [aio::IoEvent; 1] = [aio::IoEvent::default()]; - let rc = self.ctx.get_events(0, &mut events, None).unwrap(); - if rc == 0 { - None - } else { - Some((events[0].data, events[0].res as i32)) + if self.completion_list.is_empty() { + // Drain pending AIO completions batched into the same queue. + let mut events = [aio::IoEvent::default(); 32]; + let rc = self.ctx.get_events(0, &mut events, None).unwrap(); + for event in &events[..rc] { + self.completion_list + .push_back((event.data, event.res as i32)); + } } + self.completion_list.pop_front() + } + + fn punch_hole(&mut self, offset: u64, length: u64, user_data: u64) -> AsyncIoResult<()> { + // Linux AIO has no IOCB command for fallocate, so perform the operation + // synchronously and signal completion via the completion list, matching + // the pattern used by the sync backend (RawFileSync). 
+ let mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; + + // SAFETY: FFI call with valid arguments + let result = unsafe { + libc::fallocate( + self.fd as libc::c_int, + mode, + offset as libc::off_t, + length as libc::off_t, + ) + }; + if result < 0 { + return Err(AsyncIoError::PunchHole(std::io::Error::last_os_error())); + } + + self.completion_list.push_back((user_data, result)); + self.eventfd.write(1).unwrap(); + + Ok(()) + } + + fn write_zeroes(&mut self, offset: u64, length: u64, user_data: u64) -> AsyncIoResult<()> { + // Linux AIO has no IOCB command for fallocate, so perform the operation + // synchronously and signal completion via the completion list, matching + // the pattern used by the sync backend (RawFileSync). + let mode = FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE; + + // SAFETY: FFI call with valid arguments + let result = unsafe { + libc::fallocate( + self.fd as libc::c_int, + mode, + offset as libc::off_t, + length as libc::off_t, + ) + }; + if result < 0 { + return Err(AsyncIoError::WriteZeroes(std::io::Error::last_os_error())); + } + + self.completion_list.push_back((user_data, result)); + self.eventfd.write(1).unwrap(); + + Ok(()) + } +} + +#[cfg(test)] +mod unit_tests { + use std::os::unix::io::AsRawFd; + + use vmm_sys_util::tempfile::TempFile; + + use super::*; + use crate::raw_async_io_tests; + + #[test] + fn test_punch_hole() { + let temp_file = TempFile::new().unwrap(); + let mut file = temp_file.into_file(); + let mut async_io = RawFileAsyncAio::new(file.as_raw_fd(), 128).unwrap(); + raw_async_io_tests::test_punch_hole(&mut async_io, &mut file); } - fn punch_hole(&mut self, _offset: u64, _length: u64, _user_data: u64) -> AsyncIoResult<()> { - Err(AsyncIoError::PunchHole(std::io::Error::other( - "punch_hole not supported with AIO backend", - ))) + #[test] + fn test_write_zeroes() { + let temp_file = TempFile::new().unwrap(); + let mut file = temp_file.into_file(); + let mut async_io = RawFileAsyncAio::new(file.as_raw_fd(), 
128).unwrap(); + raw_async_io_tests::test_write_zeroes(&mut async_io, &mut file); } - fn write_zeroes(&mut self, _offset: u64, _length: u64, _user_data: u64) -> AsyncIoResult<()> { - Err(AsyncIoError::WriteZeroes(std::io::Error::other( - "write_zeroes not supported with AIO backend", - ))) + #[test] + fn test_punch_hole_multiple_operations() { + let temp_file = TempFile::new().unwrap(); + let mut file = temp_file.into_file(); + let mut async_io = RawFileAsyncAio::new(file.as_raw_fd(), 128).unwrap(); + raw_async_io_tests::test_punch_hole_multiple_operations(&mut async_io, &mut file); } } diff --git a/block/src/raw_async_io_tests.rs b/block/src/raw_async_io_tests.rs new file mode 100644 index 0000000000..560e41e334 --- /dev/null +++ b/block/src/raw_async_io_tests.rs @@ -0,0 +1,162 @@ +// Copyright 2026 The Cloud Hypervisor Authors. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +//! Shared test helpers for [`AsyncIo`] backends. +//! +//! Each helper takes a `&mut dyn AsyncIo` together with the [`File`] handle +//! that backs the I/O object, so the same logic exercises every backend with +//! only the constructor differing. + +use std::fs::File; +use std::io::{Read, Seek, SeekFrom, Write}; + +use crate::async_io::{AsyncIo, AsyncIoError}; + +/// Tests punching a hole in the middle of a 4 MB file and verifying data +/// integrity around the hole. 
+pub fn test_punch_hole(async_io: &mut dyn AsyncIo, file: &mut File) { + // Write 4MB of data + let data = vec![0xAA; 4 * 1024 * 1024]; + file.write_all(&data).unwrap(); + file.sync_all().unwrap(); + + // Punch hole in the middle (1MB at offset 1MB) + let offset = 1024 * 1024; + let length = 1024 * 1024; + async_io.punch_hole(offset, length, 1).unwrap(); + + // Check completion + let (user_data, result) = async_io.next_completed_request().unwrap(); + assert_eq!(user_data, 1); + assert_eq!(result, 0); + + // Verify the hole reads as zeros + file.seek(SeekFrom::Start(offset)).unwrap(); + let mut read_buf = vec![0; length as usize]; + file.read_exact(&mut read_buf).unwrap(); + assert!( + read_buf.iter().all(|&b| b == 0), + "Punched hole should read as zeros" + ); + + // Verify data before hole is intact + file.seek(SeekFrom::Start(0)).unwrap(); + let mut read_buf = vec![0; 1024]; + file.read_exact(&mut read_buf).unwrap(); + assert!( + read_buf.iter().all(|&b| b == 0xAA), + "Data before hole should be intact" + ); + + // Verify data after hole is intact + file.seek(SeekFrom::Start(offset + length)).unwrap(); + let mut read_buf = vec![0; 1024]; + file.read_exact(&mut read_buf).unwrap(); + assert!( + read_buf.iter().all(|&b| b == 0xAA), + "Data after hole should be intact" + ); +} + +/// Tests writing zeroes to a 512 KB region inside a 4 MB file and verifying +/// surrounding data is preserved. Gracefully skips when the filesystem does +/// not support `FALLOC_FL_ZERO_RANGE`. 
+pub fn test_write_zeroes(async_io: &mut dyn AsyncIo, file: &mut File) { + // Write 4MB of data + let data = vec![0xBB; 4 * 1024 * 1024]; + file.write_all(&data).unwrap(); + file.sync_all().unwrap(); + + // Write zeros in the middle (512KB at offset 2MB) + let offset = 2 * 1024 * 1024; + let length = 512 * 1024; + let write_zeroes_result = async_io.write_zeroes(offset, length, 2); + + // FALLOC_FL_ZERO_RANGE might not be supported on all filesystems (e.g., tmpfs) + // If it fails with ENOTSUP, skip the test + if let Err(AsyncIoError::WriteZeroes(ref e)) = write_zeroes_result + && (e.raw_os_error() == Some(libc::EOPNOTSUPP) || e.raw_os_error() == Some(libc::ENOTSUP)) + { + eprintln!("Skipping test_write_zeroes: filesystem doesn't support FALLOC_FL_ZERO_RANGE"); + return; + } + write_zeroes_result.unwrap(); + + // Check completion + let (user_data, result) = async_io.next_completed_request().unwrap(); + assert_eq!(user_data, 2); + assert_eq!(result, 0); + + // Verify the zeroed region reads as zeros + file.seek(SeekFrom::Start(offset)).unwrap(); + let mut read_buf = vec![0; length as usize]; + file.read_exact(&mut read_buf).unwrap(); + assert!( + read_buf.iter().all(|&b| b == 0), + "Zeroed region should read as zeros" + ); + + // Verify data before zeroed region is intact + file.seek(SeekFrom::Start(offset - 1024)).unwrap(); + let mut read_buf = vec![0; 1024]; + file.read_exact(&mut read_buf).unwrap(); + assert!( + read_buf.iter().all(|&b| b == 0xBB), + "Data before zeroed region should be intact" + ); + + // Verify data after zeroed region is intact + file.seek(SeekFrom::Start(offset + length)).unwrap(); + let mut read_buf = vec![0; 1024]; + file.read_exact(&mut read_buf).unwrap(); + assert!( + read_buf.iter().all(|&b| b == 0xBB), + "Data after zeroed region should be intact" + ); +} + +/// Tests punching multiple holes in an 8 MB file and verifying each hole +/// independently reads as zeroes. 
+pub fn test_punch_hole_multiple_operations(async_io: &mut dyn AsyncIo, file: &mut File) { + // Write 8MB of data + let data = vec![0xCC; 8 * 1024 * 1024]; + file.write_all(&data).unwrap(); + file.sync_all().unwrap(); + + // Punch multiple holes + async_io.punch_hole(1024 * 1024, 512 * 1024, 10).unwrap(); + async_io + .punch_hole(3 * 1024 * 1024, 512 * 1024, 11) + .unwrap(); + async_io + .punch_hole(5 * 1024 * 1024, 512 * 1024, 12) + .unwrap(); + + // Check all completions + let (user_data, result) = async_io.next_completed_request().unwrap(); + assert_eq!(user_data, 10); + assert_eq!(result, 0); + + let (user_data, result) = async_io.next_completed_request().unwrap(); + assert_eq!(user_data, 11); + assert_eq!(result, 0); + + let (user_data, result) = async_io.next_completed_request().unwrap(); + assert_eq!(user_data, 12); + assert_eq!(result, 0); + + // Verify all holes read as zeros + file.seek(SeekFrom::Start(1024 * 1024)).unwrap(); + let mut read_buf = vec![0; 512 * 1024]; + file.read_exact(&mut read_buf).unwrap(); + assert!(read_buf.iter().all(|&b| b == 0)); + + file.seek(SeekFrom::Start(3 * 1024 * 1024)).unwrap(); + file.read_exact(&mut read_buf).unwrap(); + assert!(read_buf.iter().all(|&b| b == 0)); + + file.seek(SeekFrom::Start(5 * 1024 * 1024)).unwrap(); + file.read_exact(&mut read_buf).unwrap(); + assert!(read_buf.iter().all(|&b| b == 0)); +} diff --git a/block/src/raw_sync.rs b/block/src/raw_sync.rs index e1a5433b89..491ef0563d 100644 --- a/block/src/raw_sync.rs +++ b/block/src/raw_sync.rs @@ -4,17 +4,17 @@ use std::collections::VecDeque; use std::fs::File; -use std::io::{Seek, SeekFrom}; use std::os::unix::io::{AsRawFd, RawFd}; +use libc::{FALLOC_FL_KEEP_SIZE, FALLOC_FL_PUNCH_HOLE, FALLOC_FL_ZERO_RANGE}; use log::warn; use vmm_sys_util::eventfd::EventFd; -use crate::async_io::{ - AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFile, DiskFileError, DiskFileResult, -}; -use crate::{DiskTopology, probe_sparse_support}; +use 
crate::async_io::{AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFileError}; +use crate::error::{BlockError, BlockErrorKind, BlockResult}; +use crate::{DiskTopology, SECTOR_SIZE, disk_file, probe_sparse_support, query_device_size}; +#[derive(Debug)] pub struct RawFileDiskSync { file: File, } @@ -25,39 +25,67 @@ impl RawFileDiskSync { } } -impl DiskFile for RawFileDiskSync { - fn logical_size(&mut self) -> DiskFileResult { - self.file - .seek(SeekFrom::End(0)) - .map_err(DiskFileError::Size) +impl disk_file::DiskSize for RawFileDiskSync { + fn logical_size(&self) -> BlockResult { + query_device_size(&self.file) + .map(|(logical_size, _)| logical_size) + .map_err(|e| BlockError::new(BlockErrorKind::Io, DiskFileError::Size(e))) } +} - fn physical_size(&mut self) -> DiskFileResult { - self.file - .metadata() - .map(|m| m.len()) - .map_err(DiskFileError::Size) +impl disk_file::PhysicalSize for RawFileDiskSync { + fn physical_size(&self) -> BlockResult { + query_device_size(&self.file) + .map(|(_, physical_size)| physical_size) + .map_err(|e| BlockError::new(BlockErrorKind::Io, DiskFileError::Size(e))) } +} - fn new_async_io(&self, _ring_depth: u32) -> DiskFileResult> { - Ok(Box::new(RawFileSync::new(self.file.as_raw_fd())) as Box) +impl disk_file::DiskFd for RawFileDiskSync { + fn fd(&self) -> BorrowedDiskFd<'_> { + BorrowedDiskFd::new(self.file.as_raw_fd()) } +} - fn topology(&mut self) -> DiskTopology { - if let Ok(topology) = DiskTopology::probe(&self.file) { - topology - } else { +impl disk_file::Geometry for RawFileDiskSync { + fn topology(&self) -> DiskTopology { + DiskTopology::probe(&self.file).unwrap_or_else(|_| { warn!("Unable to get device topology. 
Using default topology"); DiskTopology::default() - } + }) } +} +impl disk_file::SparseCapable for RawFileDiskSync { fn supports_sparse_operations(&self) -> bool { probe_sparse_support(&self.file) } +} - fn fd(&mut self) -> BorrowedDiskFd<'_> { - BorrowedDiskFd::new(self.file.as_raw_fd()) +impl disk_file::Resizable for RawFileDiskSync { + fn resize(&mut self, size: u64) -> BlockResult<()> { + self.file + .set_len(size) + .map_err(|e| BlockError::new(BlockErrorKind::Io, DiskFileError::ResizeError(e))) + } +} + +impl disk_file::DiskFile for RawFileDiskSync {} + +impl disk_file::AsyncDiskFile for RawFileDiskSync { + fn try_clone(&self) -> BlockResult> { + let file = self + .file + .try_clone() + .map_err(|e| BlockError::new(BlockErrorKind::Io, DiskFileError::Clone(e)))?; + Ok(Box::new(RawFileDiskSync { file })) + } + + fn new_async_io(&self, _ring_depth: u32) -> BlockResult> { + let mut raw = RawFileSync::new(self.file.as_raw_fd()); + raw.alignment = + DiskTopology::probe(&self.file).map_or(SECTOR_SIZE, |t| t.logical_block_size); + Ok(Box::new(raw) as Box) } } @@ -65,6 +93,7 @@ pub struct RawFileSync { fd: RawFd, eventfd: EventFd, completion_list: VecDeque<(u64, i32)>, + alignment: u64, } impl RawFileSync { @@ -73,6 +102,7 @@ impl RawFileSync { fd, eventfd: EventFd::new(libc::EFD_NONBLOCK).expect("Failed creating EventFd for RawFile"), completion_list: VecDeque::new(), + alignment: SECTOR_SIZE, } } } @@ -82,6 +112,10 @@ impl AsyncIo for RawFileSync { &self.eventfd } + fn alignment(&self) -> u64 { + self.alignment + } + fn read_vectored( &mut self, offset: libc::off_t, @@ -152,8 +186,6 @@ impl AsyncIo for RawFileSync { } fn punch_hole(&mut self, offset: u64, length: u64, user_data: u64) -> AsyncIoResult<()> { - const FALLOC_FL_PUNCH_HOLE: i32 = 0x02; - const FALLOC_FL_KEEP_SIZE: i32 = 0x01; let mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; // SAFETY: FFI call with valid arguments @@ -176,8 +208,6 @@ impl AsyncIo for RawFileSync { } fn write_zeroes(&mut self, 
offset: u64, length: u64, user_data: u64) -> AsyncIoResult<()> { - const FALLOC_FL_ZERO_RANGE: i32 = 0x10; - const FALLOC_FL_KEEP_SIZE: i32 = 0x01; let mode = FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE; // SAFETY: FFI call with valid arguments @@ -202,174 +232,34 @@ impl AsyncIo for RawFileSync { #[cfg(test)] mod unit_tests { - use std::io::{Read, Seek, SeekFrom, Write}; + use std::os::unix::io::AsRawFd; use vmm_sys_util::tempfile::TempFile; use super::*; + use crate::raw_async_io_tests; #[test] fn test_punch_hole() { let temp_file = TempFile::new().unwrap(); let mut file = temp_file.into_file(); - - // Write 4MB of data - let data = vec![0xAA; 4 * 1024 * 1024]; - file.write_all(&data).unwrap(); - file.sync_all().unwrap(); - - // Create async IO instance let mut async_io = RawFileSync::new(file.as_raw_fd()); - - // Punch hole in the middle (1MB at offset 1MB) - let offset = 1024 * 1024; - let length = 1024 * 1024; - async_io.punch_hole(offset, length, 1).unwrap(); - - // Check completion - let (user_data, result) = async_io.next_completed_request().unwrap(); - assert_eq!(user_data, 1); - assert_eq!(result, 0); - - // Verify the hole reads as zeros - file.seek(SeekFrom::Start(offset)).unwrap(); - let mut read_buf = vec![0; length as usize]; - file.read_exact(&mut read_buf).unwrap(); - assert!( - read_buf.iter().all(|&b| b == 0), - "Punched hole should read as zeros" - ); - - // Verify data before hole is intact - file.seek(SeekFrom::Start(0)).unwrap(); - let mut read_buf = vec![0; 1024]; - file.read_exact(&mut read_buf).unwrap(); - assert!( - read_buf.iter().all(|&b| b == 0xAA), - "Data before hole should be intact" - ); - - // Verify data after hole is intact - file.seek(SeekFrom::Start(offset + length)).unwrap(); - let mut read_buf = vec![0; 1024]; - file.read_exact(&mut read_buf).unwrap(); - assert!( - read_buf.iter().all(|&b| b == 0xAA), - "Data after hole should be intact" - ); + raw_async_io_tests::test_punch_hole(&mut async_io, &mut file); } #[test] fn 
test_write_zeroes() { let temp_file = TempFile::new().unwrap(); let mut file = temp_file.into_file(); - - // Write 4MB of data - let data = vec![0xBB; 4 * 1024 * 1024]; - file.write_all(&data).unwrap(); - file.sync_all().unwrap(); - - // Create async IO instance let mut async_io = RawFileSync::new(file.as_raw_fd()); - - // Write zeros in the middle (512KB at offset 2MB) - let offset = 2 * 1024 * 1024; - let length = 512 * 1024; - let write_zeroes_result = async_io.write_zeroes(offset, length, 2); - - // FALLOC_FL_ZERO_RANGE might not be supported on all filesystems (e.g., tmpfs) - // If it fails with ENOTSUP, skip the test - if let Err(AsyncIoError::WriteZeroes(ref e)) = write_zeroes_result - && (e.raw_os_error() == Some(libc::EOPNOTSUPP) - || e.raw_os_error() == Some(libc::ENOTSUP)) - { - eprintln!( - "Skipping test_write_zeroes: filesystem doesn't support FALLOC_FL_ZERO_RANGE" - ); - return; - } - write_zeroes_result.unwrap(); - - // Check completion - let (user_data, result) = async_io.next_completed_request().unwrap(); - assert_eq!(user_data, 2); - assert_eq!(result, 0); - - // Verify the zeroed region reads as zeros - file.seek(SeekFrom::Start(offset)).unwrap(); - let mut read_buf = vec![0; length as usize]; - file.read_exact(&mut read_buf).unwrap(); - assert!( - read_buf.iter().all(|&b| b == 0), - "Zeroed region should read as zeros" - ); - - // Verify data before zeroed region is intact - file.seek(SeekFrom::Start(offset - 1024)).unwrap(); - let mut read_buf = vec![0; 1024]; - file.read_exact(&mut read_buf).unwrap(); - assert!( - read_buf.iter().all(|&b| b == 0xBB), - "Data before zeroed region should be intact" - ); - - // Verify data after zeroed region is intact - file.seek(SeekFrom::Start(offset + length)).unwrap(); - let mut read_buf = vec![0; 1024]; - file.read_exact(&mut read_buf).unwrap(); - assert!( - read_buf.iter().all(|&b| b == 0xBB), - "Data after zeroed region should be intact" - ); + raw_async_io_tests::test_write_zeroes(&mut async_io, &mut 
file); } #[test] fn test_punch_hole_multiple_operations() { let temp_file = TempFile::new().unwrap(); let mut file = temp_file.into_file(); - - // Write 8MB of data - let data = vec![0xCC; 8 * 1024 * 1024]; - file.write_all(&data).unwrap(); - file.sync_all().unwrap(); - - // Create async IO instance let mut async_io = RawFileSync::new(file.as_raw_fd()); - - // Punch multiple holes - async_io.punch_hole(1024 * 1024, 512 * 1024, 10).unwrap(); - async_io - .punch_hole(3 * 1024 * 1024, 512 * 1024, 11) - .unwrap(); - async_io - .punch_hole(5 * 1024 * 1024, 512 * 1024, 12) - .unwrap(); - - // Check all completions - let (user_data, result) = async_io.next_completed_request().unwrap(); - assert_eq!(user_data, 10); - assert_eq!(result, 0); - - let (user_data, result) = async_io.next_completed_request().unwrap(); - assert_eq!(user_data, 11); - assert_eq!(result, 0); - - let (user_data, result) = async_io.next_completed_request().unwrap(); - assert_eq!(user_data, 12); - assert_eq!(result, 0); - - // Verify all holes read as zeros - file.seek(SeekFrom::Start(1024 * 1024)).unwrap(); - let mut read_buf = vec![0; 512 * 1024]; - file.read_exact(&mut read_buf).unwrap(); - assert!(read_buf.iter().all(|&b| b == 0)); - - file.seek(SeekFrom::Start(3 * 1024 * 1024)).unwrap(); - file.read_exact(&mut read_buf).unwrap(); - assert!(read_buf.iter().all(|&b| b == 0)); - - file.seek(SeekFrom::Start(5 * 1024 * 1024)).unwrap(); - file.read_exact(&mut read_buf).unwrap(); - assert!(read_buf.iter().all(|&b| b == 0)); + raw_async_io_tests::test_punch_hole_multiple_operations(&mut async_io, &mut file); } } diff --git a/block/src/vhdx_sync.rs b/block/src/vhdx_sync.rs index fc236c15df..0405554c1b 100644 --- a/block/src/vhdx_sync.rs +++ b/block/src/vhdx_sync.rs @@ -5,67 +5,107 @@ use std::collections::VecDeque; use std::fs::File; use std::os::fd::AsRawFd; +use std::sync::{Arc, Mutex}; use vmm_sys_util::eventfd::EventFd; -use crate::async_io::{ - AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, 
DiskFile, DiskFileError, DiskFileResult, -}; -use crate::vhdx::{Result as VhdxResult, Vhdx}; -use crate::{AsyncAdaptor, BlockBackend, Error}; +use crate::async_io::{AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFileError}; +use crate::error::{BlockError, BlockErrorKind, BlockResult, ErrorOp}; +use crate::vhdx::Vhdx; +use crate::{AsyncAdaptor, BlockBackend, Error, disk_file}; +#[derive(Debug)] pub struct VhdxDiskSync { - vhdx_file: Vhdx, + // FIXME: The Mutex serializes all VHDX I/O operations across queues, which + // is necessary for correctness but eliminates any parallelism benefit from + // multiqueue. Vhdx::clone() shares the underlying file description across + // threads, so concurrent I/O from multiple queues races on the file offset + // causing data corruption. + // + // A proper fix would require restructuring the VHDX I/O path so that data + // operations can proceed in parallel with independent file descriptors. + vhdx_file: Arc>, } impl VhdxDiskSync { - pub fn new(f: File) -> VhdxResult { + pub fn new(f: File) -> BlockResult { Ok(VhdxDiskSync { - vhdx_file: Vhdx::new(f)?, + vhdx_file: Arc::new(Mutex::new(Vhdx::new(f).map_err(|e| { + BlockError::new(BlockErrorKind::Io, e).with_op(ErrorOp::Open) + })?)), }) } } -impl DiskFile for VhdxDiskSync { - fn logical_size(&mut self) -> DiskFileResult { - Ok(self.vhdx_file.virtual_disk_size()) +impl disk_file::DiskSize for VhdxDiskSync { + fn logical_size(&self) -> BlockResult { + Ok(self.vhdx_file.lock().unwrap().virtual_disk_size()) } +} - fn physical_size(&mut self) -> DiskFileResult { - self.vhdx_file.physical_size().map_err(|e| { - let io_inner = match e { - Error::GetFileMetadata(e) => e, - _ => unreachable!(), - }; - DiskFileError::Size(io_inner) - }) +impl disk_file::PhysicalSize for VhdxDiskSync { + fn physical_size(&self) -> BlockResult { + self.vhdx_file + .lock() + .unwrap() + .physical_size() + .map_err(|e| match e { + Error::GetFileMetadata(io) => { + BlockError::new(BlockErrorKind::Io, 
Error::GetFileMetadata(io)) + } + _ => BlockError::new(BlockErrorKind::Io, e), + }) + } +} + +impl disk_file::DiskFd for VhdxDiskSync { + fn fd(&self) -> BorrowedDiskFd<'_> { + BorrowedDiskFd::new(self.vhdx_file.lock().unwrap().as_raw_fd()) } +} + +impl disk_file::Geometry for VhdxDiskSync {} - fn new_async_io(&self, _ring_depth: u32) -> DiskFileResult> { - Ok( - Box::new(VhdxSync::new(self.vhdx_file.clone()).map_err(DiskFileError::NewAsyncIo)?) - as Box, +impl disk_file::SparseCapable for VhdxDiskSync {} + +impl disk_file::Resizable for VhdxDiskSync { + fn resize(&mut self, _size: u64) -> BlockResult<()> { + Err(BlockError::new( + BlockErrorKind::UnsupportedFeature, + DiskFileError::ResizeError(std::io::Error::other("resize not supported for VHDX")), ) + .with_op(ErrorOp::Resize)) } +} - fn fd(&mut self) -> BorrowedDiskFd<'_> { - BorrowedDiskFd::new(self.vhdx_file.as_raw_fd()) +impl disk_file::DiskFile for VhdxDiskSync {} + +impl disk_file::AsyncDiskFile for VhdxDiskSync { + fn try_clone(&self) -> BlockResult> { + Ok(Box::new(VhdxDiskSync { + vhdx_file: Arc::clone(&self.vhdx_file), + })) + } + + fn new_async_io(&self, _ring_depth: u32) -> BlockResult> { + Ok(Box::new(VhdxSync::new(Arc::clone(&self.vhdx_file)))) } } pub struct VhdxSync { - vhdx_file: Vhdx, + vhdx_file: Arc>, eventfd: EventFd, completion_list: VecDeque<(u64, i32)>, } impl VhdxSync { - pub fn new(vhdx_file: Vhdx) -> std::io::Result { - Ok(VhdxSync { + pub fn new(vhdx_file: Arc>) -> Self { + VhdxSync { vhdx_file, - eventfd: EventFd::new(libc::EFD_NONBLOCK)?, + eventfd: EventFd::new(libc::EFD_NONBLOCK) + .expect("Failed creating EventFd for VhdxSync"), completion_list: VecDeque::new(), - }) + } } } @@ -82,7 +122,7 @@ impl AsyncIo for VhdxSync { iovecs: &[libc::iovec], user_data: u64, ) -> AsyncIoResult<()> { - self.vhdx_file.read_vectored_sync( + self.vhdx_file.lock().unwrap().read_vectored_sync( offset, iovecs, user_data, @@ -97,7 +137,7 @@ impl AsyncIo for VhdxSync { iovecs: &[libc::iovec], 
user_data: u64, ) -> AsyncIoResult<()> { - self.vhdx_file.write_vectored_sync( + self.vhdx_file.lock().unwrap().write_vectored_sync( offset, iovecs, user_data, @@ -107,8 +147,11 @@ impl AsyncIo for VhdxSync { } fn fsync(&mut self, user_data: Option) -> AsyncIoResult<()> { - self.vhdx_file - .fsync_sync(user_data, &self.eventfd, &mut self.completion_list) + self.vhdx_file.lock().unwrap().fsync_sync( + user_data, + &self.eventfd, + &mut self.completion_list, + ) } fn next_completed_request(&mut self) -> Option<(u64, i32)> { diff --git a/cloud-hypervisor/Cargo.toml b/cloud-hypervisor/Cargo.toml index 542d1859d8..8259b716fa 100644 --- a/cloud-hypervisor/Cargo.toml +++ b/cloud-hypervisor/Cargo.toml @@ -7,15 +7,8 @@ edition = "2024" homepage = "https://github.com/cloud-hypervisor/cloud-hypervisor" license = "Apache-2.0 AND BSD-3-Clause" name = "cloud-hypervisor" +rust-version.workspace = true version = "51.0.0" -# Minimum buildable version: -# Keep in sync with version in .github/workflows/build.yaml -# Policy on MSRV (see #4318): -# Can only be bumped if satisfying any of the following: -# a.) A dependency requires it, -# b.) If we want to use a new feature and that MSRV is at least 6 months old, -# c.) There is a security issue that is addressed by the toolchain update. 
-rust-version = "1.89.0" [dependencies] anyhow = { workspace = true } @@ -38,7 +31,7 @@ tracer = { path = "../tracer" } vm-memory = { workspace = true } vmm = { path = "../vmm" } vmm-sys-util = { workspace = true } -zbus = { version = "5.13.2", optional = true } +zbus = { version = "5.14.0", optional = true } [dev-dependencies] block = { path = "../block" } diff --git a/cloud-hypervisor/src/bin/ch-remote.rs b/cloud-hypervisor/src/bin/ch-remote.rs index fd48ffab1f..236e7438e0 100644 --- a/cloud-hypervisor/src/bin/ch-remote.rs +++ b/cloud-hypervisor/src/bin/ch-remote.rs @@ -16,14 +16,16 @@ use api_client::{ Error as ApiClientError, simple_api_command, simple_api_command_with_fds, simple_api_full_command, }; -use clap::{Arg, ArgAction, ArgMatches, Command}; +#[cfg(feature = "dbus_api")] +use clap::ArgAction; +use clap::{Arg, ArgMatches, Command}; use log::error; use option_parser::{ByteSized, ByteSizedParseError}; use thiserror::Error; use vmm::config::RestoreConfig; use vmm::vm_config::{ - DeviceConfig, DiskConfig, FsConfig, NetConfig, PmemConfig, UserDeviceConfig, VdpaConfig, - VsockConfig, + DeviceConfig, DiskConfig, FsConfig, GenericVhostUserConfig, NetConfig, PmemConfig, + UserDeviceConfig, VdpaConfig, VsockConfig, }; #[cfg(feature = "dbus_api")] use zbus::{proxy, zvariant::Optional}; @@ -49,6 +51,8 @@ enum Error { AddDiskConfig(#[source] vmm::config::Error), #[error("Error parsing filesystem syntax")] AddFsConfig(#[source] vmm::config::Error), + #[error("Error parsing generic vhost-user syntax")] + AddGenericVhostUserConfig(#[source] vmm::config::Error), #[error("Error parsing persistent memory syntax")] AddPmemConfig(#[source] vmm::config::Error), #[error("Error parsing network syntax")] @@ -67,6 +71,8 @@ enum Error { ReadingFile(#[source] std::io::Error), #[error("Invalid disk size")] InvalidDiskSize(#[source] ByteSizedParseError), + #[error("Error parsing send migration configuration")] + SendMigrationConfig(#[from] vmm::api::VmSendMigrationConfigError), } 
enum TargetApi<'a> { @@ -83,6 +89,10 @@ trait DBusApi1 { fn vm_add_device(&self, device_config: &str) -> zbus::Result>; fn vm_add_disk(&self, disk_config: &str) -> zbus::Result>; fn vm_add_fs(&self, fs_config: &str) -> zbus::Result>; + fn vm_add_generic_vhost_user( + &self, + generic_vhost_user_config: &str, + ) -> zbus::Result>; fn vm_add_net(&self, net_config: &str) -> zbus::Result>; fn vm_add_pmem(&self, pmem_config: &str) -> zbus::Result>; fn vm_add_user_device(&self, vm_add_user_device: &str) -> zbus::Result>; @@ -155,6 +165,10 @@ impl<'a> DBusApi1ProxyBlocking<'a> { self.print_response(self.vm_add_fs(fs_config)) } + fn api_vm_add_generic_vhost_user(&self, generic_vhost_user_config: &str) -> ApiResult { + self.print_response(self.vm_add_generic_vhost_user(generic_vhost_user_config)) + } + fn api_vm_add_net(&self, net_config: &str) -> ApiResult { self.print_response(self.vm_add_net(net_config)) } @@ -398,6 +412,22 @@ fn rest_api_do_command(matches: &ArgMatches, socket: &mut UnixStream) -> ApiResu simple_api_command(socket, "PUT", "add-fs", Some(&fs_config)) .map_err(Error::HttpApiClient) } + Some("add-generic-vhost-user") => { + let device_config = add_generic_vhost_user_config( + matches + .subcommand_matches("add-generic-vhost-user") + .unwrap() + .get_one::("generic_vhost_user_config") + .unwrap(), + )?; + simple_api_command( + socket, + "PUT", + "add-generic-vhost-user", + Some(&device_config), + ) + .map_err(Error::HttpApiClient) + } Some("add-pmem") => { let pmem_config = add_pmem_config( matches @@ -493,11 +523,7 @@ fn rest_api_do_command(matches: &ArgMatches, socket: &mut UnixStream) -> ApiResu .unwrap() .get_one::("send_migration_config") .unwrap(), - matches - .subcommand_matches("send-migration") - .unwrap() - .get_flag("send_migration_local"), - ); + )?; simple_api_command(socket, "PUT", "send-migration", Some(&send_migration_data)) .map_err(Error::HttpApiClient) } @@ -620,6 +646,16 @@ fn dbus_api_do_command(matches: &ArgMatches, proxy: 
&DBusApi1ProxyBlocking<'_>) )?; proxy.api_vm_add_fs(&fs_config) } + Some("add-generic-vhost-user") => { + let generic_vhost_user_config = add_generic_vhost_user_config( + matches + .subcommand_matches("add-generic-vhost-user") + .unwrap() + .get_one::("generic_vhost_user_config") + .unwrap(), + )?; + proxy.api_vm_add_generic_vhost_user(&generic_vhost_user_config) + } Some("add-pmem") => { let pmem_config = add_pmem_config( matches @@ -707,11 +743,7 @@ fn dbus_api_do_command(matches: &ArgMatches, proxy: &DBusApi1ProxyBlocking<'_>) .unwrap() .get_one::("send_migration_config") .unwrap(), - matches - .subcommand_matches("send-migration") - .unwrap() - .get_flag("send_migration_local"), - ); + )?; proxy.api_vm_send_migration(&send_migration_data) } Some("receive-migration") => { @@ -835,6 +867,14 @@ fn add_fs_config(config: &str) -> Result { Ok(fs_config) } +fn add_generic_vhost_user_config(config: &str) -> Result { + let generic_vhost_user_config = + GenericVhostUserConfig::parse(config).map_err(Error::AddGenericVhostUserConfig)?; + let generic_vhost_user_config = serde_json::to_string(&generic_vhost_user_config).unwrap(); + + Ok(generic_vhost_user_config) +} + fn add_pmem_config(config: &str) -> Result { let pmem_config = PmemConfig::parse(config).map_err(Error::AddPmemConfig)?; let pmem_config = serde_json::to_string(&pmem_config).unwrap(); @@ -909,13 +949,11 @@ fn receive_migration_data(url: &str) -> String { serde_json::to_string(&receive_migration_data).unwrap() } -fn send_migration_data(url: &str, local: bool) -> String { - let send_migration_data = vmm::api::VmSendMigrationData { - destination_url: url.to_owned(), - local, - }; - - serde_json::to_string(&send_migration_data).unwrap() +fn send_migration_data(config: &str) -> Result { + let send_migration_data = + vmm::api::VmSendMigrationData::parse(config).map_err(Error::SendMigrationConfig)?; + let send_migration_config = serde_json::to_string(&send_migration_data).unwrap(); + Ok(send_migration_config) } fn 
create_data(path: &str) -> Result { @@ -981,6 +1019,13 @@ fn get_cli_commands_sorted() -> Box<[Command]> { .index(1) .help(vmm::vm_config::FsConfig::SYNTAX), ), + Command::new("add-generic-vhost-user") + .about("Add generic vhost-user device") + .arg( + Arg::new("generic_vhost_user_config") + .index(1) + .help(vmm::vm_config::GenericVhostUserConfig::SYNTAX), + ), Command::new("add-net") .about("Add network device") .arg(Arg::new("net_config").index(1).help(NetConfig::SYNTAX)), @@ -1090,13 +1135,7 @@ fn get_cli_commands_sorted() -> Box<[Command]> { .arg( Arg::new("send_migration_config") .index(1) - .help(""), - ) - .arg( - Arg::new("send_migration_local") - .long("local") - .num_args(0) - .action(ArgAction::SetTrue), + .help(vmm::api::VmSendMigrationData::SYNTAX), ), Command::new("shutdown").about("Shutdown the VM"), Command::new("shutdown-vmm").about("Shutdown the VMM"), diff --git a/cloud-hypervisor/src/main.rs b/cloud-hypervisor/src/main.rs index 1d78b400bc..51b1f38fdd 100644 --- a/cloud-hypervisor/src/main.rs +++ b/cloud-hypervisor/src/main.rs @@ -32,9 +32,9 @@ use vmm::vm_config::FwCfgConfig; #[cfg(feature = "ivshmem")] use vmm::vm_config::IvshmemConfig; use vmm::vm_config::{ - BalloonConfig, DeviceConfig, DiskConfig, FsConfig, LandlockConfig, NetConfig, NumaConfig, - PciSegmentConfig, PmemConfig, RateLimiterGroupConfig, TpmConfig, UserDeviceConfig, VdpaConfig, - VmConfig, VsockConfig, + BalloonConfig, DeviceConfig, DiskConfig, FsConfig, GenericVhostUserConfig, LandlockConfig, + NetConfig, NumaConfig, PciSegmentConfig, PlatformConfig, PmemConfig, RateLimiterGroupConfig, + TpmConfig, UserDeviceConfig, VdpaConfig, VmConfig, VsockConfig, }; use vmm_sys_util::eventfd::EventFd; use vmm_sys_util::signal::block_signal; @@ -199,11 +199,10 @@ fn get_cli_options_sorted( .long("cmdline") .help("Kernel command line") .num_args(1) - .group("vm-config"), Arg::new("console") + .group("vm-config"), + Arg::new("console") .long("console") - .help( - "Control (virtio) console: 
\"off|null|pty|tty|file=,iommu=on|off\"", - ) + .help("Control (virtio) console: \"off|null|pty|tty|file=,iommu=on|off\"") .default_value("tty") .group("vm-config"), Arg::new("cpus") @@ -214,7 +213,7 @@ fn get_cli_options_sorted( kvm_hyperv=on|off,max_phys_bits=,\ affinity=,\ features=,\ - nested=on|off", + nested=on|off,core_scheduling=vm|vcpu|off", ) .default_value(default_vcpus) .group("vm-config"), @@ -247,11 +246,13 @@ fn get_cli_options_sorted( .long("device") .help(DeviceConfig::SYNTAX) .num_args(1..) + .action(ArgAction::Append) .group("vm-config"), Arg::new("disk") .long("disk") .help(DiskConfig::SYNTAX) .num_args(1..) + .action(ArgAction::Append) .group("vm-config"), Arg::new("event-monitor") .long("event-monitor") @@ -267,6 +268,7 @@ fn get_cli_options_sorted( .long("fs") .help(FsConfig::SYNTAX) .num_args(1..) + .action(ArgAction::Append) .group("vm-config"), #[cfg(feature = "fw_cfg")] Arg::new("fw-cfg-config") @@ -280,6 +282,12 @@ fn get_cli_options_sorted( .help("GDB socket (UNIX domain socket): path=") .num_args(1) .group("vmm-config"), + Arg::new("generic-vhost-user") + .long("generic-vhost-user") + .help(GenericVhostUserConfig::SYNTAX) + .num_args(1..) + .action(ArgAction::Append) + .group("vm-config"), #[cfg(feature = "igvm")] Arg::new("igvm") .long("igvm") @@ -324,6 +332,7 @@ fn get_cli_options_sorted( .long("landlock-rules") .help(LandlockConfig::SYNTAX) .num_args(1..) + .action(ArgAction::Append) .group("vm-config"), Arg::new("log-file") .long("log-file") @@ -356,26 +365,36 @@ fn get_cli_options_sorted( prefault=on|off\"", ) .num_args(1..) + .action(ArgAction::Append) .group("vm-config"), Arg::new("net") .long("net") .help(NetConfig::SYNTAX) .num_args(1..) 
+ .action(ArgAction::Append) .group("vm-config"), + Arg::new("no-shutdown") + .long("no-shutdown") + .help("Do not exit the VMM when the guest shuts down") + .num_args(0) + .action(ArgAction::SetTrue) + .group("vmm-config"), Arg::new("numa") .long("numa") .help(NumaConfig::SYNTAX) .num_args(1..) + .action(ArgAction::Append) .group("vm-config"), Arg::new("pci-segment") .long("pci-segment") .help(PciSegmentConfig::SYNTAX) .num_args(1..) + .action(ArgAction::Append) .group("vm-config"), Arg::new("platform") .long("platform") .help( - "num_pci_segments=,iommu_segments=,iommu_address_width=,serial_number=,uuid=,oem_strings=" + PlatformConfig::syntax() ) .num_args(1) .group("vm-config"), @@ -383,6 +402,7 @@ fn get_cli_options_sorted( .long("pmem") .help(PmemConfig::SYNTAX) .num_args(1..) + .action(ArgAction::Append) .group("vm-config"), #[cfg(feature = "pvmemcontrol")] Arg::new("pvmemcontrol") @@ -401,6 +421,7 @@ fn get_cli_options_sorted( .long("rate-limit-group") .help(RateLimiterGroupConfig::SYNTAX) .num_args(1..) + .action(ArgAction::Append) .group("vm-config"), Arg::new("restore") .long("restore") @@ -433,6 +454,7 @@ fn get_cli_options_sorted( .long("user-device") .help(UserDeviceConfig::SYNTAX) .num_args(1..) + .action(ArgAction::Append) .group("vm-config"), Arg::new("v") .short('v') @@ -443,6 +465,7 @@ fn get_cli_options_sorted( .long("vdpa") .help(VdpaConfig::SYNTAX) .num_args(1..) 
+ .action(ArgAction::Append) .group("vm-config"), Arg::new("version") .short('V') @@ -487,7 +510,37 @@ fn create_app(default_vcpus: String, default_memory: String, default_rng: String .args(args) } -fn start_vmm(cmd_arguments: &ArgMatches) -> Result, Error> { +fn parse_api_socket(cmd_arguments: &ArgMatches) -> Result<(Option, Option), Error> { + if let Some(socket_config) = cmd_arguments.get_one::("api-socket") { + let mut parser = OptionParser::new(); + parser.add("path").add("fd"); + parser.parse(socket_config).unwrap_or_default(); + + if let Some(fd) = parser.get("fd") { + Ok(( + None, + Some(fd.parse::().map_err(Error::ParsingApiSocket)?), + )) + } else if let Some(path) = parser.get("path") { + Ok((Some(path), None)) + } else { + Ok(( + cmd_arguments + .get_one::("api-socket") + .map(|s| s.to_string()), + None, + )) + } + } else { + Ok((None, None)) + } +} + +fn start_vmm( + cmd_arguments: &ArgMatches, + api_socket_path: &Option, + api_socket_fd: Option, +) -> Result<(), Error> { let log_level = match cmd_arguments.get_count("v") { 0 => LevelFilter::Warn, 1 => LevelFilter::Info, @@ -510,31 +563,6 @@ fn start_vmm(cmd_arguments: &ArgMatches) -> Result, Error> { .map(|()| log::set_max_level(log_level)) .map_err(Error::LoggerSetup)?; - let (api_socket_path, api_socket_fd) = - if let Some(socket_config) = cmd_arguments.get_one::("api-socket") { - let mut parser = OptionParser::new(); - parser.add("path").add("fd"); - parser.parse(socket_config).unwrap_or_default(); - - if let Some(fd) = parser.get("fd") { - ( - None, - Some(fd.parse::().map_err(Error::ParsingApiSocket)?), - ) - } else if let Some(path) = parser.get("path") { - (Some(path), None) - } else { - ( - cmd_arguments - .get_one::("api-socket") - .map(|s| s.to_string()), - None, - ) - } - } else { - (None, None) - }; - let (api_request_sender, api_request_receiver) = channel(); let api_evt = EventFd::new(EFD_NONBLOCK).map_err(Error::CreateApiEventFd)?; @@ -590,6 +618,8 @@ fn start_vmm(cmd_arguments: 
&ArgMatches) -> Result, Error> { } } + info!("{} starting", env!("BUILD_VERSION")); + let hypervisor = hypervisor::new().map_err(Error::CreateHypervisor)?; #[cfg(feature = "guest_debug")] @@ -613,6 +643,7 @@ fn start_vmm(cmd_arguments: &ArgMatches) -> Result, Error> { let exit_evt = EventFd::new(EFD_NONBLOCK).map_err(Error::CreateExitEventFd)?; let landlock_enable = cmd_arguments.get_flag("landlock"); + let no_shutdown = cmd_arguments.get_flag("no-shutdown"); #[allow(unused_mut)] let mut event_monitor = cmd_arguments @@ -693,7 +724,7 @@ fn start_vmm(cmd_arguments: &ArgMatches) -> Result, Error> { let vmm_thread_handle = vmm::start_vmm_thread( vmm::VmmVersionInfo::new(env!("BUILD_VERSION"), env!("CARGO_PKG_VERSION")), - &api_socket_path, + api_socket_path, api_socket_fd, #[cfg(feature = "dbus_api")] dbus_options, @@ -709,6 +740,7 @@ fn start_vmm(cmd_arguments: &ArgMatches) -> Result, Error> { exit_evt.try_clone().unwrap(), &seccomp_action, hypervisor, + no_shutdown, landlock_enable, ) .map_err(Error::StartVmmThread)?; @@ -779,7 +811,7 @@ fn start_vmm(cmd_arguments: &ArgMatches) -> Result, Error> { dbus_api_graceful_shutdown(chs); } - r.map(|_| api_socket_path) + r } // This is a best-effort solution to the latency induced by the RCU @@ -885,9 +917,22 @@ fn main() { warn!("Error expanding FD table: {e}"); } - let exit_code = match start_vmm(&cmd_arguments) { - Ok(path) => { - path.map(|s| std::fs::remove_file(s).ok()); + let (api_socket_path, api_socket_fd) = match parse_api_socket(&cmd_arguments) { + Ok(p) => p, + Err(top_error) => { + cloud_hypervisor::cli_print_error_chain(&top_error, "Cloud Hypervisor", |_, _, _| None); + std::process::exit(1); + } + }; + + let vmm_result = start_vmm(&cmd_arguments, &api_socket_path, api_socket_fd); + + if let Some(ref p) = api_socket_path { + let _ = std::fs::remove_file(p); + } + + let exit_code = match vmm_result { + Ok(()) => { info!("Cloud Hypervisor exited successfully"); 0 } @@ -911,8 +956,8 @@ mod unit_tests { 
#[cfg(target_arch = "x86_64")] use vmm::vm_config::DebugConsoleConfig; use vmm::vm_config::{ - ConsoleConfig, ConsoleOutputMode, CpuFeatures, CpusConfig, HotplugMethod, MemoryConfig, - PayloadConfig, RngConfig, VmConfig, + ConsoleConfig, ConsoleOutputMode, CoreScheduling, CpuFeatures, CpusConfig, HotplugMethod, + MemoryConfig, PayloadConfig, RngConfig, VmConfig, }; use crate::test_util::assert_args_sorted; @@ -963,6 +1008,7 @@ mod unit_tests { affinity: None, features: CpuFeatures::default(), nested: true, + core_scheduling: CoreScheduling::Vm, }, memory: MemoryConfig { size: 536_870_912, @@ -998,6 +1044,7 @@ mod unit_tests { }, balloon: None, fs: None, + generic_vhost_user: None, pmem: None, serial: ConsoleConfig { file: None, diff --git a/cloud-hypervisor/tests/common/mod.rs b/cloud-hypervisor/tests/common/mod.rs new file mode 100644 index 0000000000..da58f907e8 --- /dev/null +++ b/cloud-hypervisor/tests/common/mod.rs @@ -0,0 +1,6 @@ +// Copyright 2025 The Cloud Hypervisor Authors. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +pub(crate) mod tests_wrappers; +pub(crate) mod utils; diff --git a/cloud-hypervisor/tests/common/tests_wrappers.rs b/cloud-hypervisor/tests/common/tests_wrappers.rs new file mode 100644 index 0000000000..67033956e1 --- /dev/null +++ b/cloud-hypervisor/tests/common/tests_wrappers.rs @@ -0,0 +1,3502 @@ +// Copyright 2025 The Cloud Hypervisor Authors. All rights reserved. 
+// +// SPDX-License-Identifier: Apache-2.0 +use std::ffi::{CStr, CString}; +use std::fs::{self, OpenOptions}; +use std::io::{Read, Seek, SeekFrom, Write}; +use std::os::unix::io::AsRawFd; +use std::path::{Path, PathBuf}; +use std::string::String; +use std::sync::mpsc; +use std::thread; +use std::time::Duration; + +use block::ImageType; +use net_util::MacAddr; +use test_infra::*; +use vmm_sys_util::tempdir::TempDir; +use vmm_sys_util::tempfile::TempFile; +use wait_timeout::ChildExt; + +use crate::common::utils::{TargetApi, *}; + +// Start cloud-hypervisor with no VM parameters, only the API server running. +// From the API: Create a VM, boot it and check that it looks as expected. +pub(crate) fn _test_api_create_boot(target_api: &TargetApi, guest: &Guest) { + let mut child = GuestCommand::new(guest) + .args(target_api.guest_args()) + .capture_output() + .spawn() + .unwrap(); + + // Wait for API server to be ready + assert!(wait_until(Duration::from_secs(5), || target_api + .remote_command("ping", None))); + + // Create the VM first + let request_body = guest.api_create_body(); + + let temp_config_path = guest.tmp_dir.as_path().join("config"); + std::fs::write(&temp_config_path, request_body).unwrap(); + let create_config = temp_config_path.as_os_str().to_str().unwrap(); + + assert!(target_api.remote_command("create", Some(create_config),)); + + // Then boot it + assert!(target_api.remote_command("boot", None)); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + // Check that the VM booted as expected + guest.validate_cpu_count(None); + guest.validate_memory(None); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +// Start cloud-hypervisor with no VM parameters, only the API server running. 
+// From the API: Create a VM, boot it and check it can be shutdown and then +// booted again +pub(crate) fn _test_api_shutdown(target_api: &TargetApi, guest: &Guest) { + let mut child = GuestCommand::new(guest) + .args(target_api.guest_args()) + .capture_output() + .spawn() + .unwrap(); + + // Wait for API server to be ready + assert!(wait_until(Duration::from_secs(5), || target_api + .remote_command("ping", None))); + + // Create the VM first + let request_body = guest.api_create_body(); + + let temp_config_path = guest.tmp_dir.as_path().join("config"); + std::fs::write(&temp_config_path, request_body).unwrap(); + let create_config = temp_config_path.as_os_str().to_str().unwrap(); + + let r = std::panic::catch_unwind(|| { + assert!(target_api.remote_command("create", Some(create_config))); + + // Then boot it + assert!(target_api.remote_command("boot", None)); + + guest.wait_vm_boot().unwrap(); + + // Check that the VM booted as expected + guest.validate_cpu_count(None); + guest.validate_memory(None); + + // Sync and shutdown without powering off to prevent filesystem + // corruption. + guest.ssh_command("sync").unwrap(); + guest.ssh_command("sudo shutdown -H now").unwrap(); + + // Wait for the guest to be fully shutdown + assert!(guest.wait_for_ssh_unresponsive(Duration::from_secs(20))); + + // Then shut it down + assert!(target_api.remote_command("shutdown", None)); + + // Then boot it again + assert!(target_api.remote_command("boot", None)); + + guest.wait_vm_boot().unwrap(); + + // Check that the VM booted as expected + guest.validate_cpu_count(None); + guest.validate_memory(None); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +// Start cloud-hypervisor with no VM parameters, only the API server running. +// From the API: Create a VM, boot it and check it can be deleted and then recreated +// booted again. 
+pub(crate) fn _test_api_delete(target_api: &TargetApi, guest: &Guest) { + let mut child = GuestCommand::new(guest) + .args(target_api.guest_args()) + .capture_output() + .spawn() + .unwrap(); + + // Wait for API server to be ready + assert!(wait_until(Duration::from_secs(5), || target_api + .remote_command("ping", None))); + + // Create the VM first + let request_body = guest.api_create_body(); + + let temp_config_path = guest.tmp_dir.as_path().join("config"); + std::fs::write(&temp_config_path, request_body).unwrap(); + let create_config = temp_config_path.as_os_str().to_str().unwrap(); + + let r = std::panic::catch_unwind(|| { + assert!(target_api.remote_command("create", Some(create_config))); + + // Then boot it + assert!(target_api.remote_command("boot", None)); + + guest.wait_vm_boot().unwrap(); + + // Check that the VM booted as expected + guest.validate_cpu_count(None); + guest.validate_memory(None); + + // Sync and shutdown without powering off to prevent filesystem + // corruption. + guest.ssh_command("sync").unwrap(); + guest.ssh_command("sudo shutdown -H now").unwrap(); + + // Wait for the guest to be fully shutdown + assert!(guest.wait_for_ssh_unresponsive(Duration::from_secs(20))); + + // Then delete it + assert!(target_api.remote_command("delete", None)); + + assert!(target_api.remote_command("create", Some(create_config))); + + // Then boot it again + assert!(target_api.remote_command("boot", None)); + + guest.wait_vm_boot().unwrap(); + + // Check that the VM booted as expected + guest.validate_cpu_count(None); + guest.validate_memory(None); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +// Start cloud-hypervisor with no VM parameters, only the API server running. +// From the API: Create a VM, boot it and check that it looks as expected. +// Then we pause the VM, check that it's no longer available. +// Finally we resume the VM and check that it's available. 
+pub(crate) fn _test_api_pause_resume(target_api: &TargetApi, guest: &Guest) { + let mut child = GuestCommand::new(guest) + .args(target_api.guest_args()) + .capture_output() + .spawn() + .unwrap(); + + // Wait for API server to be ready + assert!(wait_until(Duration::from_secs(5), || target_api + .remote_command("ping", None))); + + // Create the VM first + let request_body = guest.api_create_body(); + + let temp_config_path = guest.tmp_dir.as_path().join("config"); + std::fs::write(&temp_config_path, request_body).unwrap(); + let create_config = temp_config_path.as_os_str().to_str().unwrap(); + + assert!(target_api.remote_command("create", Some(create_config))); + + // Then boot it + assert!(target_api.remote_command("boot", None)); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + // Check that the VM booted as expected + guest.validate_cpu_count(None); + guest.validate_memory(None); + + // We now pause the VM + assert!(target_api.remote_command("pause", None)); + + // Check pausing again fails + assert!(!target_api.remote_command("pause", None)); + + thread::sleep(std::time::Duration::new(2, 0)); + + // SSH into the VM should fail + ssh_command_ip( + "grep -c processor /proc/cpuinfo", + &guest.network.guest_ip0, + 2, + 5, + ) + .unwrap_err(); + + // Resume the VM + assert!(target_api.remote_command("resume", None)); + + // Check resuming again fails + assert!(!target_api.remote_command("resume", None)); + + thread::sleep(std::time::Duration::new(2, 0)); + + // Now we should be able to SSH back in and get the right number of CPUs + guest.validate_cpu_count(None); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +pub(crate) fn _test_pty_interaction(pty_path: PathBuf) { + let mut cf = std::fs::OpenOptions::new() + .write(true) + .read(true) + .open(pty_path) + .unwrap(); + + // Some dumb sleeps but we don't want to write + // before the console is up and we 
don't want + // to try and write the next line before the + // login process is ready. + thread::sleep(std::time::Duration::new(5, 0)); + assert_eq!(cf.write(b"cloud\n").unwrap(), 6); + thread::sleep(std::time::Duration::new(2, 0)); + assert_eq!(cf.write(b"cloud123\n").unwrap(), 9); + thread::sleep(std::time::Duration::new(2, 0)); + assert_eq!(cf.write(b"echo test_pty_console\n").unwrap(), 22); + thread::sleep(std::time::Duration::new(2, 0)); + + // read pty and ensure they have a login shell + // some fairly hacky workarounds to avoid looping + // forever in case the channel is blocked getting output + let ptyc = pty_read(cf); + let mut empty = 0; + let mut prev = String::new(); + loop { + thread::sleep(std::time::Duration::new(2, 0)); + match ptyc.try_recv() { + Ok(line) => { + empty = 0; + prev = prev + &line; + if prev.contains("test_pty_console") { + break; + } + } + Err(mpsc::TryRecvError::Empty) => { + empty += 1; + assert!(empty <= 5, "No login on pty"); + } + _ => { + panic!("No login on pty") + } + } + } +} + +pub(crate) fn test_cpu_topology( + threads_per_core: u8, + cores_per_package: u8, + packages: u8, + use_fw: bool, +) { + let disk_config = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)); + let total_vcpus = threads_per_core * cores_per_package * packages; + let direct_kernel_boot_path = direct_kernel_boot_path(); + let mut kernel_path = direct_kernel_boot_path.to_str().unwrap(); + let fw_path = fw_path(FwType::RustHypervisorFirmware); + if use_fw { + kernel_path = fw_path.as_str(); + } + + let mut child = GuestCommand::new(&guest) + .args([ + "--cpus", + &format!( + "boot={total_vcpus},topology={threads_per_core}:{cores_per_package}:1:{packages}" + ), + ]) + .default_memory() + .args(["--kernel", kernel_path]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .default_disks() + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + 
guest.wait_vm_boot().unwrap(); + assert_eq!( + guest.get_cpu_count().unwrap_or_default(), + u32::from(total_vcpus) + ); + assert_eq!( + guest + .ssh_command("lscpu | grep \"per core\" | cut -f 2 -d \":\" | sed \"s# *##\"") + .unwrap() + .trim() + .parse::() + .unwrap_or(0), + threads_per_core + ); + + assert_eq!( + guest + .ssh_command("lscpu | grep \"per socket\" | cut -f 2 -d \":\" | sed \"s# *##\"") + .unwrap() + .trim() + .parse::() + .unwrap_or(0), + cores_per_package + ); + + assert_eq!( + guest + .ssh_command("lscpu | grep \"Socket\" | cut -f 2 -d \":\" | sed \"s# *##\"") + .unwrap() + .trim() + .parse::() + .unwrap_or(0), + packages + ); + + #[cfg(target_arch = "x86_64")] + { + let mut cpu_id = 0; + for package_id in 0..packages { + for core_id in 0..cores_per_package { + for _ in 0..threads_per_core { + assert_eq!( + guest + .ssh_command(&format!("cat /sys/devices/system/cpu/cpu{cpu_id}/topology/physical_package_id")) + .unwrap() + .trim() + .parse::() + .unwrap_or(0), + package_id + ); + + assert_eq!( + guest + .ssh_command(&format!( + "cat /sys/devices/system/cpu/cpu{cpu_id}/topology/core_id" + )) + .unwrap() + .trim() + .parse::() + .unwrap_or(0), + core_id + ); + + cpu_id += 1; + } + } + } + } + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +#[allow(unused_variables)] +pub(crate) fn _test_guest_numa_nodes(acpi: bool) { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)); + let api_socket = temp_api_path(&guest.tmp_dir); + #[cfg(target_arch = "x86_64")] + let kernel_path = direct_kernel_boot_path(); + #[cfg(target_arch = "aarch64")] + let kernel_path = if acpi { + edk2_path() + } else { + direct_kernel_boot_path() + }; + + let mut child = GuestCommand::new(&guest) + .args(["--cpus", "boot=6,max=12"]) + .args(["--memory", "size=0,hotplug_method=virtio-mem"]) + .args([ + "--memory-zone", + 
"id=mem0,size=1G,hotplug_size=3G", + "id=mem1,size=2G,hotplug_size=3G", + "id=mem2,size=3G,hotplug_size=3G", + ]) + .args([ + "--numa", + "guest_numa_id=0,cpus=[0-2,9],distances=[1@15,2@20],memory_zones=mem0", + "guest_numa_id=1,cpus=[3-4,6-8],distances=[0@20,2@25],memory_zones=mem1", + "guest_numa_id=2,cpus=[5,10-11],distances=[0@25,1@30],memory_zones=mem2", + ]) + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .args(["--api-socket", &api_socket]) + .capture_output() + .default_disks() + .default_net() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + guest.check_numa_common( + Some(&[960_000, 1_920_000, 2_880_000]), + Some(&[&[0, 1, 2], &[3, 4], &[5]]), + Some(&["10 15 20", "20 10 25", "25 30 10"]), + ); + + // AArch64 currently does not support hotplug, and therefore we only + // test hotplug-related function on x86_64 here. + #[cfg(target_arch = "x86_64")] + { + guest.enable_memory_hotplug(); + + // Resize every memory zone and check each associated NUMA node + // has been assigned the right amount of memory. + resize_zone_command(&api_socket, "mem0", "4G"); + resize_zone_command(&api_socket, "mem1", "4G"); + resize_zone_command(&api_socket, "mem2", "4G"); + // Resize to the maximum amount of CPUs and check each NUMA + // node has been assigned the right CPUs set. 
+ resize_command(&api_socket, Some(12), None, None, None); + thread::sleep(std::time::Duration::new(5, 0)); + + guest.check_numa_common( + Some(&[3_840_000, 3_840_000, 3_840_000]), + Some(&[&[0, 1, 2, 9], &[3, 4, 6, 7, 8], &[5, 10, 11]]), + None, + ); + } + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +#[allow(unused_variables)] +pub(crate) fn _test_power_button(guest: &Guest) { + let mut cmd = GuestCommand::new(guest); + let api_socket = temp_api_path(&guest.tmp_dir); + + cmd.default_cpus() + .default_memory() + .default_kernel_cmdline() + .capture_output() + .default_disks() + .default_net() + .args(["--api-socket", &api_socket]); + + let child = cmd.spawn().unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + assert!(remote_command(&api_socket, "power-button", None)); + }); + + let output = child.wait_with_output().unwrap(); + assert!(output.status.success()); + handle_child_output(r, &output); +} + +pub(crate) fn test_vhost_user_net( + tap: Option<&str>, + num_queues: usize, + prepare_daemon: &PrepareNetDaemon, + generate_host_mac: bool, + client_mode_daemon: bool, +) { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)); + let api_socket = temp_api_path(&guest.tmp_dir); + + let kernel_path = direct_kernel_boot_path(); + + let host_mac = if generate_host_mac { + Some(MacAddr::local_random()) + } else { + None + }; + + let mtu = Some(3000); + + let (mut daemon_command, vunet_socket_path) = prepare_daemon( + &guest.tmp_dir, + &guest.network.host_ip0, + tap, + mtu, + num_queues, + client_mode_daemon, + ); + + let net_params = format!( + "vhost_user=true,mac={},socket={},num_queues={},queue_size=1024{},vhost_mode={},mtu=3000", + guest.network.guest_mac0, + vunet_socket_path, + num_queues, + if let Some(host_mac) = host_mac { + format!(",host_mac={host_mac}") + } else { + String::new() + 
}, + if client_mode_daemon { + "server" + } else { + "client" + }, + ); + + let mut ch_command = GuestCommand::new(&guest); + ch_command + .args(["--cpus", format!("boot={}", num_queues / 2).as_str()]) + .args(["--memory", "size=512M,hotplug_size=2048M,shared=on"]) + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .default_disks() + .args(["--net", net_params.as_str()]) + .args(["--api-socket", &api_socket]) + .capture_output(); + + let mut daemon_child: std::process::Child; + let mut child: std::process::Child; + + if client_mode_daemon { + child = ch_command.spawn().unwrap(); + // Wait for the VMM to create the socket before starting the daemon + assert!(wait_until(Duration::from_secs(10), || Path::new( + &vunet_socket_path + ) + .exists())); + daemon_child = daemon_command.spawn().unwrap(); + } else { + daemon_child = daemon_command.spawn().unwrap(); + // Wait for the daemon to create the socket before starting the VMM + assert!(wait_until(Duration::from_secs(10), || Path::new( + &vunet_socket_path + ) + .exists())); + child = ch_command.spawn().unwrap(); + } + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + if let Some(tap_name) = tap { + let tap_count = exec_host_command_output(&format!("ip link | grep -c {tap_name}")); + assert_eq!(String::from_utf8_lossy(&tap_count.stdout).trim(), "1"); + } + + if let Some(host_mac) = tap { + let mac_count = exec_host_command_output(&format!("ip link | grep -c {host_mac}")); + assert_eq!(String::from_utf8_lossy(&mac_count.stdout).trim(), "1"); + } + + #[cfg(target_arch = "aarch64")] + let iface = "enp0s4"; + #[cfg(target_arch = "x86_64")] + let iface = "ens4"; + + assert_eq!( + guest + .ssh_command(format!("cat /sys/class/net/{iface}/mtu").as_str()) + .unwrap() + .trim(), + "3000" + ); + + // 1 network interface + default localhost ==> 2 interfaces + // It's important to note that this test is fully exercising the + // vhost-user-net 
implementation and the associated backend since + // it does not define any --net network interface. That means all + // the ssh communication in that test happens through the network + // interface backed by vhost-user-net. + assert_eq!( + guest + .ssh_command("ip -o link | wc -l") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 2 + ); + + // The following pci devices will appear on guest with PCI-MSI + // interrupt vectors assigned. + // 1 virtio-console with 3 vectors: config, Rx, Tx + // 1 virtio-blk with 2 vectors: config, Request + // 1 virtio-blk with 2 vectors: config, Request + // 1 virtio-rng with 2 vectors: config, Request + // Since virtio-net has 2 queue pairs, its vectors is as follows: + // 1 virtio-net with 5 vectors: config, Rx (2), Tx (2) + // Based on the above, the total vectors should 14. + let grep_cmd = format!("grep -c {} /proc/interrupts", get_msi_interrupt_pattern()); + + assert_eq!( + guest + .ssh_command(&grep_cmd) + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 10 + (num_queues as u32) + ); + + // ACPI feature is needed. + #[cfg(target_arch = "x86_64")] + { + guest.enable_memory_hotplug(); + + // Add RAM to the VM + let desired_ram = 1024 << 20; + resize_command(&api_socket, None, Some(desired_ram), None, None); + + // Here by simply checking the size (through ssh), we validate + // the connection is still working, which means vhost-user-net + // keeps working after the resize. 
+ assert!(wait_until(Duration::from_secs(10), || guest + .get_total_memory() + .unwrap_or_default() + > 960_000)); + } + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + let _ = daemon_child.kill(); + let _ = daemon_child.wait(); + + handle_child_output(r, &output); +} + +type PrepareBlkDaemon = dyn Fn(&TempDir, &str, usize, bool, bool) -> (std::process::Child, String); + +pub(crate) fn test_vhost_user_blk( + num_queues: usize, + readonly: bool, + direct: bool, + prepare_vhost_user_blk_daemon: Option<&PrepareBlkDaemon>, +) { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)); + let api_socket = temp_api_path(&guest.tmp_dir); + + let kernel_path = direct_kernel_boot_path(); + + let (blk_params, daemon_child) = { + let prepare_daemon = prepare_vhost_user_blk_daemon.unwrap(); + // Start the daemon + let (daemon_child, vubd_socket_path) = + prepare_daemon(&guest.tmp_dir, "blk.img", num_queues, readonly, direct); + + ( + format!( + "vhost_user=true,socket={vubd_socket_path},num_queues={num_queues},queue_size=128", + ), + Some(daemon_child), + ) + }; + + let mut child = GuestCommand::new(&guest) + .args(["--cpus", format!("boot={num_queues}").as_str()]) + .args(["--memory", "size=512M,hotplug_size=2048M,shared=on"]) + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .args([ + "--disk", + format!( + "path={}", + guest.disk_config.disk(DiskType::OperatingSystem).unwrap() + ) + .as_str(), + format!( + "path={}", + guest.disk_config.disk(DiskType::CloudInit).unwrap() + ) + .as_str(), + blk_params.as_str(), + ]) + .default_net() + .args(["--api-socket", &api_socket]) + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + // Check both if /dev/vdc exists and if the block size is 16M. 
+ assert_eq!( + guest + .ssh_command("lsblk | grep vdc | grep -c 16M") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 1 + ); + + // Check if this block is RO or RW. + assert_eq!( + guest + .ssh_command("lsblk | grep vdc | awk '{print $5}'") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + readonly as u32 + ); + + // Check if the number of queues in /sys/block/vdc/mq matches the + // expected num_queues. + assert_eq!( + guest + .ssh_command("ls -ll /sys/block/vdc/mq | grep ^d | wc -l") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + num_queues as u32 + ); + + // Mount the device + let mount_ro_rw_flag = if readonly { "ro,noload" } else { "rw" }; + guest.ssh_command("mkdir mount_image").unwrap(); + guest + .ssh_command( + format!("sudo mount -o {mount_ro_rw_flag} -t ext4 /dev/vdc mount_image/").as_str(), + ) + .unwrap(); + + // Check the content of the block device. The file "foo" should + // contain "bar". + assert_eq!( + guest.ssh_command("cat mount_image/foo").unwrap().trim(), + "bar" + ); + + // ACPI feature is needed. + #[cfg(target_arch = "x86_64")] + { + guest.enable_memory_hotplug(); + + // Add RAM to the VM + let desired_ram = 1024 << 20; + resize_command(&api_socket, None, Some(desired_ram), None, None); + + assert!(wait_until(Duration::from_secs(10), || guest + .get_total_memory() + .unwrap_or_default() + > 960_000)); + + // Check again the content of the block device after the resize + // has been performed. 
+ assert_eq!( + guest.ssh_command("cat mount_image/foo").unwrap().trim(), + "bar" + ); + } + + // Unmount the device + guest.ssh_command("sudo umount /dev/vdc").unwrap(); + guest.ssh_command("rm -r mount_image").unwrap(); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + if let Some(mut daemon_child) = daemon_child { + let _ = daemon_child.kill(); + let _ = daemon_child.wait(); + } + + handle_child_output(r, &output); +} + +pub(crate) fn test_boot_from_vhost_user_blk( + num_queues: usize, + readonly: bool, + direct: bool, + prepare_vhost_user_blk_daemon: Option<&PrepareBlkDaemon>, +) { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)); + + let kernel_path = direct_kernel_boot_path(); + + let disk_path = guest.disk_config.disk(DiskType::OperatingSystem).unwrap(); + + let (blk_boot_params, daemon_child) = { + let prepare_daemon = prepare_vhost_user_blk_daemon.unwrap(); + // Start the daemon + let (daemon_child, vubd_socket_path) = prepare_daemon( + &guest.tmp_dir, + disk_path.as_str(), + num_queues, + readonly, + direct, + ); + + ( + format!( + "vhost_user=true,socket={vubd_socket_path},num_queues={num_queues},queue_size=128", + ), + Some(daemon_child), + ) + }; + + let mut child = GuestCommand::new(&guest) + .args(["--cpus", format!("boot={num_queues}").as_str()]) + .args(["--memory", "size=512M,shared=on"]) + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .args([ + "--disk", + blk_boot_params.as_str(), + format!( + "path={}", + guest.disk_config.disk(DiskType::CloudInit).unwrap() + ) + .as_str(), + ]) + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + // Just check the VM booted correctly. 
+ assert_eq!(guest.get_cpu_count().unwrap_or_default(), num_queues as u32); + assert!(guest.get_total_memory().unwrap_or_default() > 480_000); + }); + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + if let Some(mut daemon_child) = daemon_child { + let _ = daemon_child.kill(); + let _ = daemon_child.wait(); + } + + handle_child_output(r, &output); +} + +pub(crate) fn _test_virtio_fs( + prepare_daemon: &dyn Fn(&TempDir, &str) -> (std::process::Child, String), + hotplug: bool, + use_generic_vhost_user: bool, + pci_segment: Option, +) { + #[cfg(target_arch = "aarch64")] + let focal_image = if hotplug { + FOCAL_IMAGE_UPDATE_KERNEL_NAME.to_string() + } else { + FOCAL_IMAGE_NAME.to_string() + }; + #[cfg(target_arch = "x86_64")] + let focal_image = FOCAL_IMAGE_NAME.to_string(); + let disk_config = UbuntuDiskConfig::new(focal_image); + let guest = Guest::new(Box::new(disk_config)); + let api_socket = temp_api_path(&guest.tmp_dir); + let event_path = temp_event_monitor_path(&guest.tmp_dir); + + let mut workload_path = dirs::home_dir().unwrap(); + workload_path.push("workloads"); + + let mut shared_dir = workload_path; + shared_dir.push("shared_dir"); + + #[cfg(target_arch = "x86_64")] + let kernel_path = direct_kernel_boot_path(); + #[cfg(target_arch = "aarch64")] + let kernel_path = if hotplug { + edk2_path() + } else { + direct_kernel_boot_path() + }; + + let (mut daemon_child, virtiofsd_socket_path) = + prepare_daemon(&guest.tmp_dir, shared_dir.to_str().unwrap()); + + let mut guest_command = GuestCommand::new(&guest); + guest_command + .default_cpus() + .args(["--memory", "size=512M,hotplug_size=2048M,shared=on"]) + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .default_disks() + .default_net() + .args(["--api-socket", &api_socket]) + .args(["--event-monitor", format!("path={event_path}").as_str()]); + if pci_segment.is_some() { + guest_command.args([ + "--platform", + 
&format!("num_pci_segments={MAX_NUM_PCI_SEGMENTS}"), + ]); + } + + let fs_params = format!( + "socket={},id=myfs0,{}{}", + virtiofsd_socket_path, + if use_generic_vhost_user { + "queue_sizes=[1024,1024],virtio_id=26" + } else { + "tag=myfs,num_queues=1,queue_size=1024" + }, + if let Some(pci_segment) = pci_segment { + format!(",pci_segment={pci_segment}") + } else { + String::new() + } + ); + + if !hotplug { + guest_command.args([ + if use_generic_vhost_user { + "--generic-vhost-user" + } else { + "--fs" + }, + fs_params.as_str(), + ]); + } + + let mut child = guest_command.capture_output().spawn().unwrap(); + let add_arg = if use_generic_vhost_user { + "add-generic-vhost-user" + } else { + "add-fs" + }; + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + if hotplug { + // Add fs to the VM + let (cmd_success, cmd_output, _) = + remote_command_w_output(&api_socket, add_arg, Some(&fs_params)); + assert!(cmd_success); + + if let Some(pci_segment) = pci_segment { + assert!(String::from_utf8_lossy(&cmd_output).contains(&format!( + "{{\"id\":\"myfs0\",\"bdf\":\"{pci_segment:04x}:00:01.0\"}}" + ))); + } else { + assert!( + String::from_utf8_lossy(&cmd_output) + .contains("{\"id\":\"myfs0\",\"bdf\":\"0000:00:06.0\"}") + ); + } + } + + // Mount shared directory through virtio_fs filesystem + guest + .wait_for_ssh_command( + "mkdir -p mount_dir && sudo mount -t virtiofs myfs mount_dir/", + Duration::from_secs(10), + ) + .unwrap(); + + // Check file1 exists and its content is "foo" + assert_eq!( + guest.ssh_command("cat mount_dir/file1").unwrap().trim(), + "foo" + ); + // Check file2 does not exist + guest + .ssh_command("[ ! -f 'mount_dir/file2' ] || true") + .unwrap(); + + // Check file3 exists and its content is "bar" + assert_eq!( + guest.ssh_command("cat mount_dir/file3").unwrap().trim(), + "bar" + ); + + // ACPI feature is needed. 
+ #[cfg(target_arch = "x86_64")] + { + guest.enable_memory_hotplug(); + + // Add RAM to the VM + let desired_ram = 1024 << 20; + resize_command(&api_socket, None, Some(desired_ram), None, None); + + assert!(wait_until(Duration::from_secs(30), || guest + .get_total_memory() + .unwrap_or_default() + > 960_000)); + + // After the resize, check again that file1 exists and its + // content is "foo". + assert_eq!( + guest.ssh_command("cat mount_dir/file1").unwrap().trim(), + "foo" + ); + } + + if hotplug { + // Remove from VM + guest.ssh_command("sudo umount mount_dir").unwrap(); + assert!(remote_command(&api_socket, "remove-device", Some("myfs0"))); + + // Wait for the device to be fully removed before re-adding + let removed_event = MetaEvent { + event: "device-removed".to_string(), + device_id: Some("myfs0".to_string()), + }; + assert!(wait_until(Duration::from_secs(10), || { + check_sequential_events(&[&removed_event], &event_path) + })); + } + }); + + let (r, hotplug_daemon_child) = if r.is_ok() && hotplug { + let _ = daemon_child.kill(); + let _ = daemon_child.wait(); + // Remove the stale socket so wait_for_virtiofsd_socket actually waits + let _ = std::fs::remove_file(&virtiofsd_socket_path); + + let (daemon_child, virtiofsd_socket_path) = + prepare_daemon(&guest.tmp_dir, shared_dir.to_str().unwrap()); + + let r = std::panic::catch_unwind(|| { + // Wait for the daemon socket to be ready + assert!(wait_until(Duration::from_secs(10), || Path::new( + &virtiofsd_socket_path + ) + .exists())); + let fs_params = format!( + "id=myfs0,socket={},{}{}", + virtiofsd_socket_path, + if use_generic_vhost_user { + "queue_sizes=[1024,1024],virtio_id=26" + } else { + "tag=myfs,num_queues=1,queue_size=1024" + }, + if let Some(pci_segment) = pci_segment { + format!(",pci_segment={pci_segment}") + } else { + String::new() + } + ); + + // Add back and check it works + let (cmd_success, cmd_output, _) = + remote_command_w_output(&api_socket, add_arg, Some(&fs_params)); + 
assert!(cmd_success); + if let Some(pci_segment) = pci_segment { + assert!(String::from_utf8_lossy(&cmd_output).contains(&format!( + "{{\"id\":\"myfs0\",\"bdf\":\"{pci_segment:04x}:00:01.0\"}}" + ))); + } else { + assert!( + String::from_utf8_lossy(&cmd_output) + .contains("{\"id\":\"myfs0\",\"bdf\":\"0000:00:06.0\"}") + ); + } + + // Mount shared directory through virtio_fs filesystem, retrying + // until the hotplugged device is recognized by the guest + guest + .wait_for_ssh_command( + "mkdir -p mount_dir && sudo mount -t virtiofs myfs mount_dir/", + Duration::from_secs(10), + ) + .unwrap(); + + // Check file1 exists and its content is "foo" + assert_eq!( + guest.ssh_command("cat mount_dir/file1").unwrap().trim(), + "foo" + ); + }); + + (r, Some(daemon_child)) + } else { + (r, None) + }; + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + let _ = daemon_child.kill(); + let _ = daemon_child.wait(); + + if let Some(mut daemon_child) = hotplug_daemon_child { + let _ = daemon_child.kill(); + let _ = daemon_child.wait(); + } + + handle_child_output(r, &output); +} + +pub(crate) fn test_virtio_pmem(discard_writes: bool, specify_size: bool) { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)); + + let kernel_path = direct_kernel_boot_path(); + + let pmem_temp_file = TempFile::new().unwrap(); + pmem_temp_file.as_file().set_len(128 << 20).unwrap(); + + std::process::Command::new("mkfs.ext4") + .arg(pmem_temp_file.as_path()) + .output() + .expect("Expect creating disk image to succeed"); + + let mut child = GuestCommand::new(&guest) + .default_cpus() + .default_memory() + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .default_disks() + .default_net() + .args([ + "--pmem", + format!( + "file={}{}{}", + pmem_temp_file.as_path().to_str().unwrap(), + if specify_size { ",size=128M" } else { "" }, + if discard_writes { + 
",discard_writes=on" + } else { + "" + } + ) + .as_str(), + ]) + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + // Check for the presence of /dev/pmem0 + assert_eq!( + guest.ssh_command("ls /dev/pmem0").unwrap().trim(), + "/dev/pmem0" + ); + + // Check changes persist after reboot + assert_eq!(guest.ssh_command("sudo mount /dev/pmem0 /mnt").unwrap(), ""); + assert_eq!(guest.ssh_command("ls /mnt").unwrap(), "lost+found\n"); + guest + .ssh_command("echo test123 | sudo tee /mnt/test") + .unwrap(); + assert_eq!(guest.ssh_command("sudo umount /mnt").unwrap(), ""); + assert_eq!(guest.ssh_command("ls /mnt").unwrap(), ""); + + guest.reboot_linux(0); + assert_eq!(guest.ssh_command("sudo mount /dev/pmem0 /mnt").unwrap(), ""); + assert_eq!( + guest + .ssh_command("sudo cat /mnt/test || true") + .unwrap() + .trim(), + if discard_writes { "" } else { "test123" } + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +pub(crate) fn _test_virtio_vsock(guest: &Guest, hotplug: bool) { + let socket = temp_vsock_path(&guest.tmp_dir); + let api_socket = temp_api_path(&guest.tmp_dir); + + let mut cmd = GuestCommand::new(guest); + cmd.args(["--api-socket", &api_socket]); + cmd.default_cpus(); + cmd.default_memory(); + cmd.default_kernel_cmdline(); + cmd.default_disks(); + cmd.default_net(); + + if !hotplug { + cmd.args(["--vsock", format!("cid=3,socket={socket}").as_str()]); + } + + let mut child = cmd.capture_output().spawn().unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + if hotplug { + let (cmd_success, cmd_output, _) = remote_command_w_output( + &api_socket, + "add-vsock", + Some(format!("cid=3,socket={socket},id=test0").as_str()), + ); + assert!(cmd_success); + assert!( + String::from_utf8_lossy(&cmd_output) + .contains("{\"id\":\"test0\",\"bdf\":\"0000:00:06.0\"}") + ); + 
thread::sleep(std::time::Duration::new(10, 0)); + // Check adding a second one fails + assert!(!remote_command( + &api_socket, + "add-vsock", + Some("cid=1234,socket=/tmp/fail") + )); + } + + // Validate vsock works as expected. + guest.check_vsock(socket.as_str()); + guest.reboot_linux(0); + // Validate vsock still works after a reboot. + guest.check_vsock(socket.as_str()); + + if hotplug { + assert!(remote_command(&api_socket, "remove-device", Some("test0"))); + } + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +pub(crate) fn test_memory_mergeable(mergeable: bool) { + let memory_param = if mergeable { + "mergeable=on" + } else { + "mergeable=off" + }; + + // We assume the number of shared pages in the rest of the system to be constant + let ksm_ps_init = get_ksm_pages_shared(); + + let disk_config1 = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); + let guest1 = Guest::new(Box::new(disk_config1)); + let mut child1 = GuestCommand::new(&guest1) + .default_cpus() + .args(["--memory", format!("size=512M,{memory_param}").as_str()]) + .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .default_disks() + .args(["--net", guest1.default_net_string().as_str()]) + .args(["--serial", "tty", "--console", "off"]) + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest1.wait_vm_boot().unwrap(); + }); + if r.is_err() { + kill_child(&mut child1); + let output = child1.wait_with_output().unwrap(); + handle_child_output(r, &output); + panic!("Test should already be failed/panicked"); // To explicitly mark this block never return + } + + let ksm_ps_guest1 = get_ksm_pages_shared(); + + let disk_config2 = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); + let guest2 = Guest::new(Box::new(disk_config2)); + let mut child2 = GuestCommand::new(&guest2) + .default_cpus() + .args(["--memory", 
format!("size=512M,{memory_param}").as_str()]) + .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .default_disks() + .args(["--net", guest2.default_net_string().as_str()]) + .args(["--serial", "tty", "--console", "off"]) + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest2.wait_vm_boot().unwrap(); + let ksm_ps_guest2 = get_ksm_pages_shared(); + + if mergeable { + println!( + "ksm pages_shared after vm1 booted '{ksm_ps_guest1}', ksm pages_shared after vm2 booted '{ksm_ps_guest2}'" + ); + // We are expecting the number of shared pages to increase as the number of VM increases + assert!(ksm_ps_guest1 < ksm_ps_guest2); + } else { + assert!(ksm_ps_guest1 == ksm_ps_init); + assert!(ksm_ps_guest2 == ksm_ps_init); + } + }); + + kill_child(&mut child1); + kill_child(&mut child2); + + let output = child1.wait_with_output().unwrap(); + child2.wait().unwrap(); + + handle_child_output(r, &output); +} + +// This test validates that it can find the virtio-iommu device at first. +// It also verifies that both disks and the network card are attached to +// the virtual IOMMU by looking at /sys/kernel/iommu_groups directory. +// The last interesting part of this test is that it exercises the network +// interface attached to the virtual IOMMU since this is the one used to +// send all commands through SSH. +pub(crate) fn _test_virtio_iommu(_acpi: bool /* not needed on x86_64 */) { + // Virtio-iommu support is ready in recent kernel (v5.14). But the kernel in + // Focal image is still old. + // So if ACPI is enabled on AArch64, we use a modified Focal image in which + // the kernel binary has been updated. 
+ #[cfg(target_arch = "aarch64")] + let focal_image = FOCAL_IMAGE_UPDATE_KERNEL_NAME.to_string(); + #[cfg(target_arch = "x86_64")] + let focal_image = FOCAL_IMAGE_NAME.to_string(); + let disk_config = UbuntuDiskConfig::new(focal_image); + let guest = Guest::new(Box::new(disk_config)); + + #[cfg(target_arch = "x86_64")] + let kernel_path = direct_kernel_boot_path(); + #[cfg(target_arch = "aarch64")] + let kernel_path = if _acpi { + edk2_path() + } else { + direct_kernel_boot_path() + }; + + let mut child = GuestCommand::new(&guest) + .default_cpus() + .default_memory() + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .args([ + "--disk", + format!( + "path={},iommu=on", + guest.disk_config.disk(DiskType::OperatingSystem).unwrap() + ) + .as_str(), + format!( + "path={},iommu=on", + guest.disk_config.disk(DiskType::CloudInit).unwrap() + ) + .as_str(), + ]) + .args(["--net", guest.default_net_string_w_iommu().as_str()]) + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + // Verify the virtio-iommu device is present. + assert!( + guest + .does_device_vendor_pair_match("0x1057", "0x1af4") + .unwrap_or_default() + ); + + // On AArch64, if the guest system boots from FDT, the behavior of IOMMU is a bit + // different with ACPI. + // All devices on the PCI bus will be attached to the virtual IOMMU, except the + // virtio-iommu device itself. So these devices will all be added to IOMMU groups, + // and appear under folder '/sys/kernel/iommu_groups/'. + // + // Verify the first disk is in an iommu group. + assert!( + guest + .ssh_command("ls /sys/kernel/iommu_groups/*/devices") + .unwrap() + .contains("0000:00:02.0") + ); + + // Verify the second disk is in an iommu group. + assert!( + guest + .ssh_command("ls /sys/kernel/iommu_groups/*/devices") + .unwrap() + .contains("0000:00:03.0") + ); + + // Verify the network card is in an iommu group. 
+ assert!( + guest + .ssh_command("ls /sys/kernel/iommu_groups/*/devices") + .unwrap() + .contains("0000:00:04.0") + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +// ivshmem test +// This case validates that read data from host(host write data to ivshmem backend file, +// guest read data from ivshmem pci bar2 memory) +// and write data to host(guest write data to ivshmem pci bar2 memory, host read it from +// ivshmem backend file). +// It also checks the size of the shared memory region. +pub(crate) fn _test_ivshmem(guest: &Guest, ivshmem_file_path: impl AsRef, file_size: &str) { + let ivshmem_file_path = ivshmem_file_path.as_ref(); + let test_message_read = String::from("ivshmem device test data read"); + // Modify backend file data before function test + let mut file = OpenOptions::new() + .read(true) + .write(true) + .open(ivshmem_file_path) + .unwrap(); + file.seek(SeekFrom::Start(0)).unwrap(); + file.write_all(test_message_read.as_bytes()).unwrap(); + file.write_all(b"\0").unwrap(); + file.flush().unwrap(); + + let output = fs::read_to_string(ivshmem_file_path).unwrap(); + let nul_pos = output.as_bytes().iter().position(|&b| b == 0).unwrap(); + let c_str = CStr::from_bytes_until_nul(&output.as_bytes()[..=nul_pos]).unwrap(); + let file_message = c_str.to_string_lossy().to_string(); + // Check if the backend file data is correct + assert_eq!(test_message_read, file_message); + + let device_id_line = String::from( + guest + .ssh_command("lspci -D | grep \"Inter-VM shared memory\"") + .unwrap() + .trim(), + ); + // Check if ivshmem exists + assert!(!device_id_line.is_empty()); + let device_id = device_id_line.split(" ").next().unwrap(); + // Check shard memory size + assert_eq!( + guest + .ssh_command( + format!("lspci -vv -s {device_id} | grep -c \"Region 2.*size={file_size}\"") + .as_str(), + ) + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 1 + ); + + // guest don't 
have gcc or g++, try to use python to test :( + // This python program try to mmap the ivshmem pci bar2 memory and read the data from it. + let ivshmem_test_read = format!( + r#" +import os +import mmap +from ctypes import create_string_buffer, c_char, memmove + +if __name__ == "__main__": + device_path = f"/sys/bus/pci/devices/{device_id}/resource2" + fd = os.open(device_path, os.O_RDWR | os.O_SYNC) + + PAGE_SIZE = os.sysconf('SC_PAGESIZE') + + with mmap.mmap(fd, PAGE_SIZE, flags=mmap.MAP_SHARED, + prot=mmap.PROT_READ | mmap.PROT_WRITE, offset=0) as shmem: + c_buf = (c_char * PAGE_SIZE).from_buffer(shmem) + null_pos = c_buf.raw.find(b'\x00') + valid_data = c_buf.raw[:null_pos] if null_pos != -1 else c_buf.raw + print(valid_data.decode('utf-8', errors='replace'), end="") + shmem.flush() + del c_buf + + os.close(fd) + "# + ); + guest + .ssh_command( + format!( + r#"cat << EOF > test_read.py +{ivshmem_test_read} +EOF +"# + ) + .as_str(), + ) + .unwrap(); + let guest_message = guest.ssh_command("sudo python3 test_read.py").unwrap(); + + // Check the probe message in host and guest + assert_eq!(test_message_read, guest_message); + + let test_message_write = "ivshmem device test data write"; + // Then the program writes a test message to the memory and flush it. 
+ let ivshmem_test_write = format!( + r#" +import os +import mmap +from ctypes import create_string_buffer, c_char, memmove + +if __name__ == "__main__": + device_path = f"/sys/bus/pci/devices/{device_id}/resource2" + test_message = "{test_message_write}" + fd = os.open(device_path, os.O_RDWR | os.O_SYNC) + + PAGE_SIZE = os.sysconf('SC_PAGESIZE') + + with mmap.mmap(fd, PAGE_SIZE, flags=mmap.MAP_SHARED, + prot=mmap.PROT_READ | mmap.PROT_WRITE, offset=0) as shmem: + shmem.flush() + c_buf = (c_char * PAGE_SIZE).from_buffer(shmem) + encoded_msg = test_message.encode('utf-8').ljust(1000, b'\x00') + memmove(c_buf, encoded_msg, len(encoded_msg)) + shmem.flush() + del c_buf + + os.close(fd) + "# + ); + + guest + .ssh_command( + format!( + r#"cat << EOF > test_write.py +{ivshmem_test_write} +EOF +"# + ) + .as_str(), + ) + .unwrap(); + + let _ = guest.ssh_command("sudo python3 test_write.py").unwrap(); + + let output = fs::read_to_string(ivshmem_file_path).unwrap(); + let nul_pos = output.as_bytes().iter().position(|&b| b == 0).unwrap(); + let c_str = CStr::from_bytes_until_nul(&output.as_bytes()[..=nul_pos]).unwrap(); + let file_message = c_str.to_string_lossy().to_string(); + // Check to send data from guest to host + assert_eq!(test_message_write, file_message); +} + +pub(crate) fn _test_simple_launch(guest: &Guest) { + let event_path = temp_event_monitor_path(&guest.tmp_dir); + + let mut child = GuestCommand::new(guest) + .default_cpus() + .default_memory() + .default_kernel_cmdline() + .default_disks() + .default_net() + .args(["--serial", "tty", "--console", "off"]) + .args(["--event-monitor", format!("path={event_path}").as_str()]) + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + guest.validate_cpu_count(None); + guest.validate_memory(None); + assert_eq!(guest.get_pci_bridge_class().unwrap_or_default(), "0x060000"); + assert!(check_sequential_events( + &guest + 
.get_expected_seq_events_for_simple_launch() + .iter() + .collect::>(), + &event_path + )); + + // It's been observed on the Bionic image that udev and snapd + // services can cause some delay in the VM's shutdown. Disabling + // them improves the reliability of this test. + let _ = guest.ssh_command("sudo systemctl disable udev"); + let _ = guest.ssh_command("sudo systemctl stop udev"); + let _ = guest.ssh_command("sudo systemctl disable snapd"); + let _ = guest.ssh_command("sudo systemctl stop snapd"); + + guest.ssh_command("sudo poweroff").unwrap(); + let latest_events = [ + &MetaEvent { + event: "shutdown".to_string(), + device_id: None, + }, + &MetaEvent { + event: "deleted".to_string(), + device_id: None, + }, + &MetaEvent { + event: "shutdown".to_string(), + device_id: None, + }, + ]; + assert!(wait_until(Duration::from_secs(20), || { + check_latest_events_exact(&latest_events, &event_path) + })); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +pub(crate) fn _test_multi_cpu(guest: &Guest) { + let mut cmd = GuestCommand::new(guest); + cmd.args(["--cpus", "boot=2,max=4"]) + .default_memory() + .default_kernel_cmdline() + .capture_output() + .default_disks() + .default_net(); + + let mut child = cmd.spawn().unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + assert_eq!(guest.get_cpu_count().unwrap_or_default(), 2); + + assert_eq!( + guest + .ssh_command(r#"sudo dmesg | grep "smp: Brought up" | sed "s/\[\ *[0-9.]*\] //""#) + .unwrap() + .trim(), + "smp: Brought up 1 node, 2 CPUs" + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +pub(crate) fn _test_cpu_affinity(guest: &Guest) { + // We need the host to have at least 4 CPUs if we want to be able + // to run this test. 
+ let host_cpus_count = exec_host_command_output("nproc"); + assert!( + String::from_utf8_lossy(&host_cpus_count.stdout) + .trim() + .parse::() + .unwrap_or(0) + >= 4 + ); + + let mut child = GuestCommand::new(guest) + .default_cpus_with_affinity() + .default_memory() + .default_kernel_cmdline() + .default_disks() + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + let pid = child.id(); + let taskset_vcpu0 = exec_host_command_output(format!("taskset -pc $(ps -T -p {pid} | grep vcpu0 | xargs | cut -f 2 -d \" \") | cut -f 6 -d \" \"").as_str()); + assert_eq!(String::from_utf8_lossy(&taskset_vcpu0.stdout).trim(), "0,2"); + let taskset_vcpu1 = exec_host_command_output(format!("taskset -pc $(ps -T -p {pid} | grep vcpu1 | xargs | cut -f 2 -d \" \") | cut -f 6 -d \" \"").as_str()); + assert_eq!(String::from_utf8_lossy(&taskset_vcpu1.stdout).trim(), "1,3"); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + handle_child_output(r, &output); +} + +pub(crate) fn _test_virtio_queue_affinity(guest: &Guest) { + // We need the host to have at least 4 CPUs if we want to be able + // to run this test. 
+ let host_cpus_count = exec_host_command_output("nproc"); + assert!( + String::from_utf8_lossy(&host_cpus_count.stdout) + .trim() + .parse::() + .unwrap_or(0) + >= 4 + ); + + let mut child = GuestCommand::new(guest) + .default_cpus() + .default_memory() + .default_kernel_cmdline() + .args([ + "--disk", + format!( + "path={}", + guest.disk_config.disk(DiskType::OperatingSystem).unwrap() + ) + .as_str(), + format!( + "path={},num_queues=4,queue_affinity=[0@[0,2],1@[1,3],2@[1],3@[3]]", + guest.disk_config.disk(DiskType::CloudInit).unwrap() + ) + .as_str(), + ]) + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + let pid = child.id(); + let taskset_q0 = exec_host_command_output(format!("taskset -pc $(ps -T -p {pid} | grep disk1_q0 | xargs | cut -f 2 -d \" \") | cut -f 6 -d \" \"").as_str()); + assert_eq!(String::from_utf8_lossy(&taskset_q0.stdout).trim(), "0,2"); + let taskset_q1 = exec_host_command_output(format!("taskset -pc $(ps -T -p {pid} | grep disk1_q1 | xargs | cut -f 2 -d \" \") | cut -f 6 -d \" \"").as_str()); + assert_eq!(String::from_utf8_lossy(&taskset_q1.stdout).trim(), "1,3"); + let taskset_q2 = exec_host_command_output(format!("taskset -pc $(ps -T -p {pid} | grep disk1_q2 | xargs | cut -f 2 -d \" \") | cut -f 6 -d \" \"").as_str()); + assert_eq!(String::from_utf8_lossy(&taskset_q2.stdout).trim(), "1"); + let taskset_q3 = exec_host_command_output(format!("taskset -pc $(ps -T -p {pid} | grep disk1_q3 | xargs | cut -f 2 -d \" \") | cut -f 6 -d \" \"").as_str()); + assert_eq!(String::from_utf8_lossy(&taskset_q3.stdout).trim(), "3"); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + handle_child_output(r, &output); +} + +pub(crate) fn _test_pci_msi(guest: &Guest) { + let mut cmd = GuestCommand::new(guest); + cmd.default_cpus() + .default_memory() + .default_kernel_cmdline() + .capture_output() + .default_disks() + .default_net(); + + let 
mut child = cmd.spawn().unwrap(); + + guest.wait_vm_boot().unwrap(); + + let grep_cmd = format!("grep -c {} /proc/interrupts", get_msi_interrupt_pattern()); + + let r = std::panic::catch_unwind(|| { + assert_eq!( + guest + .ssh_command(&grep_cmd) + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 12 + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +pub(crate) fn _test_virtio_net_ctrl_queue(guest: &Guest) { + let mut cmd = GuestCommand::new(guest); + cmd.default_cpus() + .default_memory() + .default_kernel_cmdline() + .args(["--net", guest.default_net_string_w_mtu(3000).as_str()]) + .capture_output() + .default_disks(); + + let mut child = cmd.spawn().unwrap(); + + guest.wait_vm_boot().unwrap(); + + #[cfg(target_arch = "aarch64")] + let iface = "enp0s4"; + #[cfg(target_arch = "x86_64")] + let iface = "ens4"; + + let r = std::panic::catch_unwind(|| { + assert_eq!( + guest + .ssh_command( + format!("sudo ethtool -K {iface} rx-gro-hw off && echo success").as_str() + ) + .unwrap() + .trim(), + "success" + ); + assert_eq!( + guest + .ssh_command(format!("cat /sys/class/net/{iface}/mtu").as_str()) + .unwrap() + .trim(), + "3000" + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +pub(crate) fn _test_pci_multiple_segments( + guest: &Guest, + max_num_pci_segments: u16, + pci_segments_for_disk: u16, +) { + // Prepare another disk file for the virtio-disk device + let test_disk_path = String::from( + guest + .tmp_dir + .as_path() + .join("test-disk.raw") + .to_str() + .unwrap(), + ); + assert!( + exec_host_command_status(format!("truncate {test_disk_path} -s 4M").as_str()).success() + ); + assert!(exec_host_command_status(format!("mkfs.ext4 {test_disk_path}").as_str()).success()); + + let mut cmd = GuestCommand::new(guest); + cmd.default_cpus() + .default_memory() + 
.default_kernel_cmdline_with_platform(Some(&format!( + "num_pci_segments={max_num_pci_segments}" + ))) + .args([ + "--disk", + format!( + "path={}", + guest.disk_config.disk(DiskType::OperatingSystem).unwrap() + ) + .as_str(), + format!( + "path={}", + guest.disk_config.disk(DiskType::CloudInit).unwrap() + ) + .as_str(), + format!("path={test_disk_path},pci_segment={pci_segments_for_disk},image_type=raw") + .as_str(), + ]) + .capture_output() + .default_net(); + + let mut child = cmd.spawn().unwrap(); + + guest.wait_vm_boot().unwrap(); + + let grep_cmd = "lspci | grep \"Host bridge\" | wc -l"; + + let r = std::panic::catch_unwind(|| { + // There should be MAX_NUM_PCI_SEGMENTS PCI host bridges in the guest. + assert_eq!( + guest + .ssh_command(grep_cmd) + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + max_num_pci_segments + ); + + // Check both if /dev/vdc exists and if the block size is 4M. + assert_eq!( + guest + .ssh_command("lsblk | grep vdc | grep -c 4M") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 1 + ); + + // Mount the device. + guest.ssh_command("mkdir mount_image").unwrap(); + guest + .ssh_command("sudo mount -o rw -t ext4 /dev/vdc mount_image/") + .unwrap(); + // Grant all users with write permission. + guest.ssh_command("sudo chmod a+w mount_image/").unwrap(); + + // Write something to the device. + guest + .ssh_command("sudo echo \"bar\" >> mount_image/foo") + .unwrap(); + + // Check the content of the block device. The file "foo" should + // contain "bar". 
+ assert_eq!( + guest + .ssh_command("sudo cat mount_image/foo") + .unwrap() + .trim(), + "bar" + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +pub(crate) fn _test_direct_kernel_boot(guest: &Guest) { + let mut child = GuestCommand::new(guest) + .default_cpus() + .default_memory() + .default_kernel_cmdline() + .default_disks() + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + guest.validate_cpu_count(None); + guest.validate_memory(None); + + let grep_cmd = format!("grep -c {} /proc/interrupts", get_msi_interrupt_pattern()); + assert_eq!( + guest + .ssh_command(&grep_cmd) + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 12 + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +pub(crate) fn _test_virtio_block( + guest: &Guest, + disable_io_uring: bool, + disable_aio: bool, + verify_os_disk: bool, + backing_files: bool, + image_type: ImageType, +) { + let mut workload_path = dirs::home_dir().unwrap(); + workload_path.push("workloads"); + + let mut blk_file_path = workload_path; + blk_file_path.push("blk.img"); + + let initial_backing_checksum = if verify_os_disk { + compute_backing_checksum(guest.disk_config.disk(DiskType::OperatingSystem).unwrap()) + } else { + None + }; + assert!( + guest.num_cpu >= 4, + "_test_virtio_block requires at least 4 CPUs to match num_queues=4" + ); + let mut cloud_child = GuestCommand::new(guest) + .default_cpus() + .args(["--memory", "size=512M,shared=on"]) + .default_kernel_cmdline() + .args([ + "--disk", + format!( + "path={},backing_files={},image_type={image_type}", + guest.disk_config.disk(DiskType::OperatingSystem).unwrap(), + if backing_files { "on" } else { "off" }, + ) + .as_str(), + format!( + "path={}", + guest.disk_config.disk(DiskType::CloudInit).unwrap() + ) + 
.as_str(), + format!( + "path={},readonly=on,direct=on,num_queues=4,_disable_io_uring={},_disable_aio={}", + blk_file_path.to_str().unwrap(), + disable_io_uring, + disable_aio, + ) + .as_str(), + ]) + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + // Check both if /dev/vdc exists and if the block size is 16M. + assert_eq!( + guest + .ssh_command("lsblk | grep vdc | grep -c 16M") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 1 + ); + + // Check both if /dev/vdc exists and if this block is RO. + assert_eq!( + guest + .ssh_command("lsblk | grep vdc | awk '{print $5}'") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 1 + ); + + // Check if the number of queues is 4. + assert_eq!( + guest + .ssh_command("ls -ll /sys/block/vdc/mq | grep ^d | wc -l") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 4 + ); + }); + + if verify_os_disk { + // Use clean shutdown to allow cloud-hypervisor to clear + // the dirty bit in the QCOW2 v3 image. 
+ kill_child(&mut cloud_child); + } else { + let _ = cloud_child.kill(); + } + let output = cloud_child.wait_with_output().unwrap(); + + handle_child_output(r, &output); + + if verify_os_disk { + disk_check_consistency( + guest.disk_config.disk(DiskType::OperatingSystem).unwrap(), + initial_backing_checksum, + ); + } +} + +pub fn _test_virtio_block_dynamic_vhdx_expand(guest: &Guest) { + const VIRTUAL_DISK_SIZE: u64 = 100 << 20; + const EMPTY_VHDX_FILE_SIZE: u64 = 8 << 20; + const FULL_VHDX_FILE_SIZE: u64 = 112 << 20; + const DYNAMIC_VHDX_NAME: &str = "dynamic.vhdx"; + + let vhdx_pathbuf = guest.tmp_dir.as_path().join(DYNAMIC_VHDX_NAME); + let vhdx_path = vhdx_pathbuf.to_str().unwrap(); + + // Generate a 100 MiB dynamic VHDX file + std::process::Command::new("qemu-img") + .arg("create") + .args(["-f", "vhdx"]) + .arg(vhdx_path) + .arg(VIRTUAL_DISK_SIZE.to_string()) + .output() + .expect("Expect generating dynamic VHDX image"); + + // Check if the size matches with empty VHDx file size + assert_eq!(vhdx_image_size(vhdx_path), EMPTY_VHDX_FILE_SIZE); + + let mut cloud_child = GuestCommand::new(guest) + .default_cpus() + .default_memory() + .default_kernel_cmdline() + .args([ + "--disk", + format!( + "path={}", + guest.disk_config.disk(DiskType::OperatingSystem).unwrap() + ) + .as_str(), + format!( + "path={}", + guest.disk_config.disk(DiskType::CloudInit).unwrap() + ) + .as_str(), + format!("path={vhdx_path}").as_str(), + ]) + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + // Check both if /dev/vdc exists and if the block size is 100 MiB. 
+ assert_eq!( + guest + .ssh_command("lsblk | grep vdc | grep -c 100M") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 1 + ); + + // Write 100 MB of data to the VHDx disk + guest + .ssh_command("sudo dd if=/dev/urandom of=/dev/vdc bs=1M count=100") + .unwrap(); + }); + + // Check if the size matches with expected expanded VHDx file size + assert_eq!(vhdx_image_size(vhdx_path), FULL_VHDX_FILE_SIZE); + + kill_child(&mut cloud_child); + let output = cloud_child.wait_with_output().unwrap(); + + handle_child_output(r, &output); + + disk_check_consistency(vhdx_path, None); +} + +fn vhdx_image_size(disk_name: &str) -> u64 { + std::fs::File::open(disk_name) + .unwrap() + .seek(SeekFrom::End(0)) + .unwrap() +} + +#[cfg(target_arch = "x86_64")] +pub fn _test_split_irqchip(guest: &Guest) { + let mut child = GuestCommand::new(guest) + .default_cpus() + .default_memory() + .default_kernel_cmdline() + .default_disks() + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + assert_eq!( + guest + .ssh_command("grep -c IO-APIC.*timer /proc/interrupts || true") + .unwrap() + .trim() + .parse::() + .unwrap_or(1), + 0 + ); + assert_eq!( + guest + .ssh_command("grep -c IO-APIC.*cascade /proc/interrupts || true") + .unwrap() + .trim() + .parse::() + .unwrap_or(1), + 0 + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +#[cfg(target_arch = "x86_64")] +pub(crate) fn _test_dmi_serial_number(guest: &Guest) { + let mut child = GuestCommand::new(guest) + .default_cpus() + .default_memory() + .default_kernel_cmdline_with_platform(Some("serial_number=a=b;c=d")) + .default_disks() + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + assert_eq!( + guest + .ssh_command("sudo cat /sys/class/dmi/id/product_serial") + .unwrap() + .trim(), + 
"a=b;c=d" + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +pub(crate) fn _test_dmi_uuid(guest: &Guest) { + let mut child = GuestCommand::new(guest) + .default_cpus() + .default_memory() + .default_kernel_cmdline_with_platform(Some("uuid=1e8aa28a-435d-4027-87f4-40dceff1fa0a")) + .default_disks() + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + assert_eq!( + guest + .ssh_command("sudo cat /sys/class/dmi/id/product_uuid") + .unwrap() + .trim(), + "1e8aa28a-435d-4027-87f4-40dceff1fa0a" + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +pub(crate) fn _test_dmi_oem_strings(guest: &Guest) { + let s1 = "io.systemd.credential:xx=yy"; + let s2 = "This is a test string"; + + let oem_strings = format!("oem_strings=[{s1},{s2}]"); + + let mut child = GuestCommand::new(guest) + .default_cpus() + .default_memory() + .default_kernel_cmdline_with_platform(Some(&oem_strings)) + .default_disks() + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + assert_eq!( + guest + .ssh_command("sudo dmidecode --oem-string count") + .unwrap() + .trim(), + "2" + ); + + assert_eq!( + guest + .ssh_command("sudo dmidecode --oem-string 1") + .unwrap() + .trim(), + s1 + ); + + assert_eq!( + guest + .ssh_command("sudo dmidecode --oem-string 2") + .unwrap() + .trim(), + s2 + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +pub(crate) fn _test_serial_off(guest: &Guest) { + let mut child = GuestCommand::new(guest) + .default_cpus() + .default_memory() + .default_kernel_cmdline() + .default_disks() + .default_net() + .args(["--serial", "off"]) + .capture_output() + .spawn() + .unwrap(); + + 
let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + // Test that there is no ttyS0 + assert_eq!( + guest + .ssh_command(GREP_SERIAL_IRQ_CMD) + .unwrap() + .trim() + .parse::() + .unwrap_or(1), + 0 + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +pub(crate) fn _test_multiple_network_interfaces(guest: &Guest) { + let mut child = GuestCommand::new(guest) + .default_cpus() + .default_memory() + .default_kernel_cmdline() + .default_disks() + .args([ + "--net", + guest.default_net_string().as_str(), + "tap=,mac=8a:6b:6f:5a:de:ac,ip=192.168.3.1,mask=255.255.255.128", + "tap=mytap1,mac=fe:1f:9e:e1:60:f2,ip=192.168.4.1,mask=255.255.255.128", + ]) + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + let tap_count = exec_host_command_output("ip link | grep -c mytap1"); + assert_eq!(String::from_utf8_lossy(&tap_count.stdout).trim(), "1"); + + // 3 network interfaces + default localhost ==> 4 interfaces + assert_eq!( + guest + .ssh_command("ip -o link | wc -l") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 4 + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +pub(crate) fn _test_virtio_console(guest: &Guest) { + let mut child = GuestCommand::new(guest) + .default_cpus() + .default_memory() + .default_kernel_cmdline() + .default_disks() + .default_net() + .args(["--console", "tty"]) + .args(["--serial", "null"]) + .capture_output() + .spawn() + .unwrap(); + + let text = String::from("On a branch floating down river a cricket, singing."); + let cmd = format!("echo {text} | sudo tee /dev/hvc0"); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + assert!( + guest + .does_device_vendor_pair_match("0x1043", "0x1af4") + .unwrap_or_default() + ); + + guest.ssh_command(&cmd).unwrap(); + }); + 
+ kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + handle_child_output(r, &output); + + let r = std::panic::catch_unwind(|| { + assert!(String::from_utf8_lossy(&output.stdout).contains(&text)); + }); + + handle_child_output(r, &output); +} + +pub(crate) fn _test_console_file(guest: &Guest) { + let console_path = guest.tmp_dir.as_path().join("console-output"); + let mut child = GuestCommand::new(guest) + .default_cpus() + .default_memory() + .default_kernel_cmdline() + .default_disks() + .default_net() + .args([ + "--console", + format!("file={}", console_path.to_str().unwrap()).as_str(), + ]) + .capture_output() + .spawn() + .unwrap(); + + guest.wait_vm_boot().unwrap(); + + guest.ssh_command("sudo shutdown -h now").unwrap(); + + let _ = child.wait_timeout(std::time::Duration::from_secs(20)); + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + let r = std::panic::catch_unwind(|| { + // Check that the cloud-hypervisor binary actually terminated + assert!(output.status.success()); + + // Do this check after shutdown of the VM as an easy way to ensure + // all writes are flushed to disk + let mut f = std::fs::File::open(console_path).unwrap(); + let mut buf = String::new(); + f.read_to_string(&mut buf).unwrap(); + + if !buf.contains(CONSOLE_TEST_STRING) { + eprintln!( + "\n\n==== Console file output ====\n\n{buf}\n\n==== End console file output ====" + ); + } + assert!(buf.contains(CONSOLE_TEST_STRING)); + }); + + handle_child_output(r, &output); +} + +pub(crate) fn _test_direct_kernel_boot_noacpi(guest: &Guest) { + let mut child = GuestCommand::new(guest) + .default_cpus() + .default_memory() + .default_kernel_cmdline() + .default_disks() + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + assert_eq!(guest.get_cpu_count().unwrap_or_default(), 1); + assert!(guest.get_total_memory().unwrap_or_default() > 480_000); + }); + + 
kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +pub(crate) fn _test_pci_bar_reprogramming(guest: &Guest) { + let mut child = GuestCommand::new(guest) + .default_cpus() + .default_memory() + .default_kernel_cmdline() + .default_disks() + .args([ + "--net", + guest.default_net_string().as_str(), + "tap=,mac=8a:6b:6f:5a:de:ac,ip=192.168.3.1,mask=255.255.255.128", + ]) + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + // 2 network interfaces + default localhost ==> 3 interfaces + assert_eq!( + guest + .ssh_command("ip -o link | wc -l") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 3 + ); + + let init_bar_addr = guest + .ssh_command("sudo awk '{print $1; exit}' /sys/bus/pci/devices/0000:00:05.0/resource") + .unwrap(); + + // Remove the PCI device + guest + .ssh_command("echo 1 | sudo tee /sys/bus/pci/devices/0000:00:05.0/remove") + .unwrap(); + + // Only 1 network interface left + default localhost ==> 2 interfaces + assert_eq!( + guest + .ssh_command("ip -o link | wc -l") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 2 + ); + + // Remove the PCI device + guest + .ssh_command("echo 1 | sudo tee /sys/bus/pci/rescan") + .unwrap(); + + // Back to 2 network interface + default localhost ==> 3 interfaces + assert_eq!( + guest + .ssh_command("ip -o link | wc -l") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 3 + ); + + let new_bar_addr = guest + .ssh_command("sudo awk '{print $1; exit}' /sys/bus/pci/devices/0000:00:05.0/resource") + .unwrap(); + + // Let's compare the BAR addresses for our virtio-net device. + // They should be different as we expect the BAR reprogramming + // to have happened. 
+ assert_ne!(init_bar_addr, new_bar_addr); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +pub(crate) fn _test_memory_overhead(guest: &Guest, guest_memory_size_kb: u32) { + let mut child = GuestCommand::new(guest) + .default_cpus() + .default_memory() + .default_kernel_cmdline() + .default_net() + .default_disks() + .capture_output() + .spawn() + .unwrap(); + + guest.wait_vm_boot().unwrap(); + + let r = std::panic::catch_unwind(|| { + let overhead = get_vmm_overhead(child.id(), guest_memory_size_kb); + eprintln!("Guest memory overhead: {overhead} vs {MAXIMUM_VMM_OVERHEAD_KB}"); + assert!(overhead <= MAXIMUM_VMM_OVERHEAD_KB); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +pub(crate) fn _test_landlock(guest: &Guest) { + let api_socket = temp_api_path(&guest.tmp_dir); + + let mut child = GuestCommand::new(guest) + .args(["--api-socket", &api_socket]) + .default_cpus() + .default_memory() + .default_kernel_cmdline() + .args(["--landlock"]) + .default_disks() + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + // Check /dev/vdc is not there + assert_eq!( + guest + .ssh_command("lsblk | grep -c vdc.*16M || true") + .unwrap() + .trim() + .parse::() + .unwrap_or(1), + 0 + ); + + // Now let's add the extra disk. + let mut blk_file_path = dirs::home_dir().unwrap(); + blk_file_path.push("workloads"); + blk_file_path.push("blk.img"); + // As the path to the hotplug disk is not pre-added, this remote + // command will fail. 
+ assert!(!remote_command( + &api_socket, + "add-disk", + Some( + format!( + "path={},id=test0,readonly=true", + blk_file_path.to_str().unwrap() + ) + .as_str() + ), + )); + }); + + let _ = child.kill(); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +pub(crate) fn _test_disk_hotplug(guest: &Guest, landlock_enabled: bool) { + let api_socket = temp_api_path(&guest.tmp_dir); + + let mut blk_file_path = dirs::home_dir().unwrap(); + blk_file_path.push("workloads"); + blk_file_path.push("blk.img"); + + let mut cmd = GuestCommand::new(guest); + if landlock_enabled { + cmd.args(["--landlock"]).args([ + "--landlock-rules", + format!("path={blk_file_path:?},access=rw").as_str(), + ]); + } + + cmd.args(["--api-socket", &api_socket]) + .default_cpus() + .default_memory() + .default_kernel_cmdline() + .default_disks() + .default_net() + .capture_output(); + + let mut child = cmd.spawn().unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + // Check /dev/vdc is not there + assert_eq!( + guest + .ssh_command("lsblk | grep -c vdc.*16M || true") + .unwrap() + .trim() + .parse::() + .unwrap_or(1), + 0 + ); + + // Now let's add the extra disk. + let (cmd_success, cmd_output, _) = remote_command_w_output( + &api_socket, + "add-disk", + Some( + format!( + "path={},id=test0,readonly=true", + blk_file_path.to_str().unwrap() + ) + .as_str(), + ), + ); + assert!(cmd_success); + assert!( + String::from_utf8_lossy(&cmd_output) + .contains("{\"id\":\"test0\",\"bdf\":\"0000:00:06.0\"}") + ); + + // Wait for the hotplugged disk to appear in the guest + assert!(wait_until(Duration::from_secs(10), || { + guest + .ssh_command("lsblk | grep vdc | grep -c 16M") + .is_ok_and(|s| s.trim().parse::().unwrap_or_default() == 1) + })); + // And check the block device can be read. + guest + .ssh_command("sudo dd if=/dev/vdc of=/dev/null bs=1M iflag=direct count=16") + .unwrap(); + + // Let's remove it the extra disk. 
+ assert!(remote_command(&api_socket, "remove-device", Some("test0"))); + // Wait for the disk to disappear + assert!(wait_until(Duration::from_secs(10), || guest + .ssh_command("lsblk | grep -c vdc.*16M || true") + .is_ok_and(|s| s.trim().parse::().unwrap_or(1) == 0))); + + // And add it back to validate unplug did work correctly. + let (cmd_success, cmd_output, _) = remote_command_w_output( + &api_socket, + "add-disk", + Some( + format!( + "path={},id=test0,readonly=true", + blk_file_path.to_str().unwrap() + ) + .as_str(), + ), + ); + assert!(cmd_success); + assert!( + String::from_utf8_lossy(&cmd_output) + .contains("{\"id\":\"test0\",\"bdf\":\"0000:00:06.0\"}") + ); + + // Wait for the hotplugged disk to appear in the guest + assert!(wait_until(Duration::from_secs(10), || { + guest + .ssh_command("lsblk | grep vdc | grep -c 16M") + .is_ok_and(|s| s.trim().parse::().unwrap_or_default() == 1) + })); + // And check the block device can be read. + guest + .ssh_command("sudo dd if=/dev/vdc of=/dev/null bs=1M iflag=direct count=16") + .unwrap(); + + // Reboot the VM. 
+ guest.reboot_linux(0); + + // Check still there after reboot + assert_eq!( + guest + .ssh_command("lsblk | grep vdc | grep -c 16M") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 1 + ); + + assert!(remote_command(&api_socket, "remove-device", Some("test0"))); + + // Wait for the disk to disappear + assert!(wait_until(Duration::from_secs(20), || guest + .ssh_command("lsblk | grep -c vdc.*16M || true") + .is_ok_and(|s| s.trim().parse::().unwrap_or(1) == 0))); + + guest.reboot_linux(1); + + // Check device still absent + assert_eq!( + guest + .ssh_command("lsblk | grep -c vdc.*16M || true") + .unwrap() + .trim() + .parse::() + .unwrap_or(1), + 0 + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +pub(crate) fn _test_virtio_block_topology(guest: &Guest, loop_dev: &str) { + let mut child = GuestCommand::new(guest) + .default_cpus() + .default_memory() + .default_kernel_cmdline() + .args([ + "--disk", + format!( + "path={}", + guest.disk_config.disk(DiskType::OperatingSystem).unwrap() + ) + .as_str(), + format!( + "path={}", + guest.disk_config.disk(DiskType::CloudInit).unwrap() + ) + .as_str(), + format!("path={loop_dev}").as_str(), + ]) + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + // MIN-IO column + assert_eq!( + guest + .ssh_command("lsblk -t| grep vdc | awk '{print $3}'") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 4096 + ); + // PHY-SEC column + assert_eq!( + guest + .ssh_command("lsblk -t| grep vdc | awk '{print $5}'") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 4096 + ); + // LOG-SEC column + assert_eq!( + guest + .ssh_command("lsblk -t| grep vdc | awk '{print $6}'") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 4096 + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + 
handle_child_output(r, &output); +} + +pub(crate) fn _test_net_hotplug( + guest: &Guest, + max_num_pci_segments: u16, + pci_segment: Option, +) { + let api_socket = temp_api_path(&guest.tmp_dir); + + // Boot without network + let mut cmd = GuestCommand::new(guest); + + cmd.args(["--api-socket", &api_socket]) + .default_cpus() + .default_memory() + .default_net() + .default_disks() + .capture_output(); + + if pci_segment.is_some() { + cmd.default_kernel_cmdline_with_platform(Some(&format!( + "num_pci_segments={max_num_pci_segments}" + ))); + } else { + cmd.default_kernel_cmdline(); + } + + let mut child = cmd.spawn().unwrap(); + + guest.wait_vm_boot().unwrap(); + + let r = std::panic::catch_unwind(|| { + // Add network + let (cmd_success, cmd_output, _) = remote_command_w_output( + &api_socket, + "add-net", + Some( + format!( + "id=test0,tap=,mac={},ip={},mask=255.255.255.128{}", + guest.network.guest_mac1, + guest.network.host_ip1, + if let Some(pci_segment) = pci_segment { + format!(",pci_segment={pci_segment}") + } else { + String::new() + } + ) + .as_str(), + ), + ); + assert!(cmd_success); + + if let Some(pci_segment) = pci_segment { + assert!(String::from_utf8_lossy(&cmd_output).contains(&format!( + "{{\"id\":\"test0\",\"bdf\":\"{pci_segment:04x}:00:01.0\"}}" + ))); + } else { + assert!( + String::from_utf8_lossy(&cmd_output) + .contains("{\"id\":\"test0\",\"bdf\":\"0000:00:06.0\"}") + ); + } + + // Wait for the hotplugged network interface to appear + assert!(wait_until(Duration::from_secs(10), || { + guest + .ssh_command("ip -o link | wc -l") + .is_ok_and(|s| s.trim().parse::().unwrap_or_default() == 3) + })); + + // Test the same using the added network interface's IP + assert_eq!( + ssh_command_ip( + "ip -o link | wc -l", + &guest.network.guest_ip1, + DEFAULT_SSH_RETRIES, + DEFAULT_SSH_TIMEOUT + ) + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 3 + ); + + // Remove network and wait for it to disappear + assert!(remote_command(&api_socket, 
"remove-device", Some("test0"),)); + assert!(wait_until(Duration::from_secs(10), || { + guest + .ssh_command("ip -o link | wc -l") + .is_ok_and(|s| s.trim().parse::().unwrap_or_default() == 2) + })); + + // Add network + let (cmd_success, cmd_output, _) = remote_command_w_output( + &api_socket, + "add-net", + Some( + format!( + "id=test1,tap=,mac={},ip={},mask=255.255.255.128{}", + guest.network.guest_mac1, + guest.network.host_ip1, + if let Some(pci_segment) = pci_segment { + format!(",pci_segment={pci_segment}") + } else { + String::new() + } + ) + .as_str(), + ), + ); + assert!(cmd_success); + + if let Some(pci_segment) = pci_segment { + assert!(String::from_utf8_lossy(&cmd_output).contains(&format!( + "{{\"id\":\"test1\",\"bdf\":\"{pci_segment:04x}:00:01.0\"}}" + ))); + } else { + assert!( + String::from_utf8_lossy(&cmd_output) + .contains("{\"id\":\"test1\",\"bdf\":\"0000:00:06.0\"}") + ); + } + + // Wait for the hotplugged network interface to appear + assert!(wait_until(Duration::from_secs(10), || { + guest + .ssh_command("ip -o link | wc -l") + .is_ok_and(|s| s.trim().parse::().unwrap_or_default() == 3) + })); + + guest.reboot_linux(0); + + // 2 network interfaces + default localhost ==> 3 interfaces + assert_eq!( + guest + .ssh_command("ip -o link | wc -l") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 3 + ); + + // Test the same using the added network interface's IP + assert_eq!( + ssh_command_ip( + "ip -o link | wc -l", + &guest.network.guest_ip1, + DEFAULT_SSH_RETRIES, + DEFAULT_SSH_TIMEOUT + ) + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 3 + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +pub(crate) fn _test_counters(guest: &Guest) { + let api_socket = temp_api_path(&guest.tmp_dir); + + let mut cmd = GuestCommand::new(guest); + cmd.default_cpus() + .default_memory() + .default_kernel_cmdline() + .default_disks() + .args(["--net", 
guest.default_net_string().as_str()]) + .args(["--api-socket", &api_socket]) + .capture_output(); + + let mut child = cmd.spawn().unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + let orig_counters = get_counters(&api_socket); + guest + .ssh_command("dd if=/dev/zero of=test count=8 bs=1M") + .unwrap(); + + let new_counters = get_counters(&api_socket); + + // Check that all the counters have increased + assert!(new_counters > orig_counters); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +pub(crate) fn _test_watchdog(guest: &Guest) { + let api_socket = temp_api_path(&guest.tmp_dir); + let event_path = temp_event_monitor_path(&guest.tmp_dir); + + let mut cmd = GuestCommand::new(guest); + cmd.default_cpus() + .default_memory() + .default_kernel_cmdline() + .default_disks() + .args(["--net", guest.default_net_string().as_str()]) + .args(["--watchdog"]) + .args(["--api-socket", &api_socket]) + .args(["--event-monitor", format!("path={event_path}").as_str()]) + .capture_output(); + + let mut child = cmd.spawn().unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + let mut expected_reboot_count = 1; + + // Enable the watchdog with a 15s timeout + enable_guest_watchdog(guest, 15); + + assert_eq!(get_reboot_count(guest), expected_reboot_count); + assert_eq!( + guest + .ssh_command("sudo journalctl | grep -c -- \"Watchdog started\"") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 1 + ); + + // Allow some normal time to elapse to check we don't get spurious reboots + thread::sleep(std::time::Duration::new(40, 0)); + // Check no reboot + assert_eq!(get_reboot_count(guest), expected_reboot_count); + + // Trigger a panic (sync first). We need to do this inside a screen with a delay so the SSH command returns. 
+ guest.ssh_command("screen -dmS reboot sh -c \"sleep 5; echo s | tee /proc/sysrq-trigger; echo c | sudo tee /proc/sysrq-trigger\"").unwrap(); + // Allow some time for the watchdog to trigger (max 30s) and reboot to happen + guest.wait_vm_boot_custom_timeout(50).unwrap(); + // Check a reboot is triggered by the watchdog + expected_reboot_count += 1; + assert_eq!(get_reboot_count(guest), expected_reboot_count); + + #[cfg(target_arch = "x86_64")] + { + // Now pause the VM and remain offline for 30s + assert!(remote_command(&api_socket, "pause", None)); + let latest_events = [ + &MetaEvent { + event: "pausing".to_string(), + device_id: None, + }, + &MetaEvent { + event: "paused".to_string(), + device_id: None, + }, + ]; + assert!(check_latest_events_exact(&latest_events, &event_path)); + assert!(remote_command(&api_socket, "resume", None)); + + // Check no reboot + assert_eq!(get_reboot_count(guest), expected_reboot_count); + } + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +pub(crate) fn _test_pvpanic(guest: &Guest) { + let api_socket = temp_api_path(&guest.tmp_dir); + let event_path = temp_event_monitor_path(&guest.tmp_dir); + + let mut cmd = GuestCommand::new(guest); + cmd.default_cpus() + .default_memory() + .default_kernel_cmdline() + .default_disks() + .args(["--net", guest.default_net_string().as_str()]) + .args(["--pvpanic"]) + .args(["--api-socket", &api_socket]) + .args(["--event-monitor", format!("path={event_path}").as_str()]) + .capture_output(); + + let mut child = cmd.spawn().unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + // Trigger guest a panic + make_guest_panic(guest); + + // Wait for the panic event to be recorded + let expected_sequential_events = [&MetaEvent { + event: "panic".to_string(), + device_id: None, + }]; + assert!(wait_until(Duration::from_secs(10), || { + check_latest_events_exact(&expected_sequential_events, 
&event_path) + })); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +pub(crate) fn _test_tap_from_fd(guest: &Guest) { + // Create a TAP interface with multi-queue enabled + let num_queue_pairs: usize = 2; + + use std::str::FromStr; + let taps = net_util::open_tap( + Some("chtap0"), + Some(std::net::IpAddr::V4( + std::net::Ipv4Addr::from_str(&guest.network.host_ip0).unwrap(), + )), + None, + &mut None, + None, + num_queue_pairs, + Some(libc::O_RDWR | libc::O_NONBLOCK), + ) + .unwrap(); + + let mut child = GuestCommand::new(guest) + .default_cpus() + .default_memory() + .default_kernel_cmdline() + .default_disks() + .args([ + "--net", + &format!( + "fd=[{},{}],mac={},num_queues={}", + taps[0].as_raw_fd(), + taps[1].as_raw_fd(), + guest.network.guest_mac0, + num_queue_pairs * 2 + ), + ]) + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + assert_eq!( + guest + .ssh_command("ip -o link | wc -l") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 2 + ); + + guest.reboot_linux(0); + + assert_eq!( + guest + .ssh_command("ip -o link | wc -l") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 2 + ); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +// test creates two macvtap interfaces in 'bridge' mode on the +// same physical net interface, one for the guest and one for +// the host. With additional setup on the IP address and the +// routing table, it enables the communications between the +// guest VM and the host machine. 
+// Details: https://wiki.libvirt.org/page/TroubleshootMacvtapHostFail +pub(crate) fn _test_macvtap( + guest: &Guest, + hotplug: bool, + guest_macvtap_name: &str, + host_macvtap_name: &str, +) { + let api_socket = temp_api_path(&guest.tmp_dir); + + let phy_net = "eth0"; + + // Create a macvtap interface for the guest VM to use + assert!( + exec_host_command_status(&format!( + "sudo ip link add link {phy_net} name {guest_macvtap_name} type macvtap mod bridge" + )) + .success() + ); + assert!( + exec_host_command_status(&format!( + "sudo ip link set {} address {} up", + guest_macvtap_name, guest.network.guest_mac0 + )) + .success() + ); + assert!(exec_host_command_status(&format!("sudo ip link show {guest_macvtap_name}")).success()); + + let tap_index = + fs::read_to_string(format!("/sys/class/net/{guest_macvtap_name}/ifindex")).unwrap(); + let tap_device = format!("/dev/tap{}", tap_index.trim()); + + assert!(exec_host_command_status(&format!("sudo chown $UID.$UID {tap_device}")).success()); + + let cstr_tap_device = CString::new(tap_device).unwrap(); + let tap_fd1 = unsafe { libc::open(cstr_tap_device.as_ptr(), libc::O_RDWR) }; + assert!(tap_fd1 > 0); + let tap_fd2 = unsafe { libc::open(cstr_tap_device.as_ptr(), libc::O_RDWR) }; + assert!(tap_fd2 > 0); + + // Create a macvtap on the same physical net interface for + // the host machine to use + assert!( + exec_host_command_status(&format!( + "sudo ip link add link {phy_net} name {host_macvtap_name} type macvtap mod bridge" + )) + .success() + ); + // Use default mask "255.255.255.0" + assert!( + exec_host_command_status(&format!( + "sudo ip address add {}/24 dev {}", + guest.network.host_ip0, host_macvtap_name + )) + .success() + ); + assert!( + exec_host_command_status(&format!("sudo ip link set dev {host_macvtap_name} up")).success() + ); + + let mut guest_command = GuestCommand::new(guest); + guest_command + .default_cpus() + .default_memory() + .default_kernel_cmdline() + .default_disks() + 
.args(["--api-socket", &api_socket]); + + let net_params = format!( + "fd=[{},{}],mac={},num_queues=4", + tap_fd1, tap_fd2, guest.network.guest_mac0 + ); + + if !hotplug { + guest_command.args(["--net", &net_params]); + } + + let mut child = guest_command.capture_output().spawn().unwrap(); + + if hotplug { + // Wait for the VMM process to listen to the API socket + assert!(wait_until(Duration::from_secs(10), || remote_command( + &api_socket, + "ping", + None + ))); + // Hotplug the virtio-net device + let (cmd_success, cmd_output, _) = + remote_command_w_output(&api_socket, "add-net", Some(&net_params)); + assert!(cmd_success); + #[cfg(target_arch = "x86_64")] + assert!( + String::from_utf8_lossy(&cmd_output) + .contains("{\"id\":\"_net2\",\"bdf\":\"0000:00:05.0\"}") + ); + #[cfg(target_arch = "aarch64")] + assert!( + String::from_utf8_lossy(&cmd_output) + .contains("{\"id\":\"_net0\",\"bdf\":\"0000:00:05.0\"}") + ); + } + + // The functional connectivity provided by the virtio-net device + // gets tested through wait_vm_boot() as it expects to receive a + // HTTP request, and through the SSH command as well. 
+ let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + assert_eq!( + guest + .ssh_command("ip -o link | wc -l") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 2 + ); + + guest.reboot_linux(0); + + assert_eq!( + guest + .ssh_command("ip -o link | wc -l") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 2 + ); + }); + + kill_child(&mut child); + + exec_host_command_status(&format!("sudo ip link del {guest_macvtap_name}")); + exec_host_command_status(&format!("sudo ip link del {host_macvtap_name}")); + + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} + +pub(crate) fn _test_vdpa_block(guest: &Guest) { + let api_socket = temp_api_path(&guest.tmp_dir); + + let mut child = GuestCommand::new(guest) + .default_cpus() + .args(["--memory", "size=512M,hugepages=on"]) + .default_kernel_cmdline_with_platform(Some("num_pci_segments=2,iommu_segments=1")) + .default_disks() + .default_net() + .args(["--vdpa", "path=/dev/vhost-vdpa-0,num_queues=1"]) + .args(["--api-socket", &api_socket]) + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + // Check both if /dev/vdc exists and if the block size is 128M. + assert_eq!( + guest + .ssh_command("lsblk | grep vdc | grep -c 128M") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 1 + ); + + // Check the content of the block device after we wrote to it. + // The vpda-sim-blk should let us read what we previously wrote. 
+ guest + .ssh_command("sudo bash -c 'echo foobar > /dev/vdc'") + .unwrap(); + assert_eq!( + guest.ssh_command("sudo head -1 /dev/vdc").unwrap().trim(), + "foobar" + ); + + // Hotplug an extra vDPA block device behind the vIOMMU + // Add a new vDPA device to the VM + let (cmd_success, cmd_output, _) = remote_command_w_output( + &api_socket, + "add-vdpa", + Some("id=myvdpa0,path=/dev/vhost-vdpa-1,num_queues=1,pci_segment=1,iommu=on"), + ); + assert!(cmd_success); + assert!( + String::from_utf8_lossy(&cmd_output) + .contains("{\"id\":\"myvdpa0\",\"bdf\":\"0001:00:01.0\"}") + ); + + // Wait for the hotplugged device to appear + assert!(wait_until(Duration::from_secs(10), || guest + .does_device_vendor_pair_match("0x1057", "0x1af4") + .unwrap_or_default())); + assert!( + guest + .ssh_command("ls /sys/kernel/iommu_groups/*/devices") + .unwrap() + .contains("0001:00:01.0") + ); + + // Check both if /dev/vdd exists and if the block size is 128M. + assert_eq!( + guest + .ssh_command("lsblk | grep vdd | grep -c 128M") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 1 + ); + + // Write some content to the block device we've just plugged. + guest + .ssh_command("sudo bash -c 'echo foobar > /dev/vdd'") + .unwrap(); + + // Check we can read the content back. 
+ assert_eq!( + guest.ssh_command("sudo head -1 /dev/vdd").unwrap().trim(), + "foobar" + ); + + // Unplug the device + let cmd_success = remote_command(&api_socket, "remove-device", Some("myvdpa0")); + assert!(cmd_success); + + // Wait for the device to disappear + assert!(wait_until(Duration::from_secs(10), || guest + .ssh_command("lsblk | grep -c vdd || true") + .is_ok_and(|s| s.trim().parse::().unwrap_or(1) == 0))); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); +} diff --git a/cloud-hypervisor/tests/common/utils.rs b/cloud-hypervisor/tests/common/utils.rs new file mode 100644 index 0000000000..1821575d49 --- /dev/null +++ b/cloud-hypervisor/tests/common/utils.rs @@ -0,0 +1,1061 @@ +// Copyright 2025 The Cloud Hypervisor Authors. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 +use std::collections::HashMap; +use std::fs::{File, OpenOptions}; +use std::io::{BufRead, Read, Seek, SeekFrom, Write}; +use std::path::{Path, PathBuf}; +use std::process::{Child, Command}; +use std::string::String; +use std::sync::mpsc; +use std::sync::mpsc::Receiver; +use std::time::{Duration, Instant}; +use std::{cmp, fs, io, thread}; + +use test_infra::*; +use vmm_sys_util::tempdir::TempDir; + +const QCOW2_INCOMPATIBLE_FEATURES_OFFSET: u64 = 72; +// 10MB is our maximum accepted overhead. +pub(crate) const MAXIMUM_VMM_OVERHEAD_KB: u32 = 10 * 1024; + +// This enum exists to make it more convenient to +// implement test for both D-Bus and REST APIs. 
+pub(crate) enum TargetApi { + // API socket + HttpApi(String), + // well known service name, object path + DBusApi(String, String), +} + +impl TargetApi { + pub(crate) fn new_http_api(tmp_dir: &TempDir) -> Self { + Self::HttpApi(temp_api_path(tmp_dir)) + } + + pub(crate) fn new_dbus_api(tmp_dir: &TempDir) -> Self { + // `tmp_dir` is in the form of "/tmp/chXXXXXX" + // and we take the `chXXXXXX` part as a unique identifier for the guest + let id = tmp_dir.as_path().file_name().unwrap().to_str().unwrap(); + + Self::DBusApi( + format!("org.cloudhypervisor.{id}"), + format!("/org/cloudhypervisor/{id}"), + ) + } + + pub(crate) fn guest_args(&self) -> Vec { + match self { + TargetApi::HttpApi(api_socket) => { + vec![format!("--api-socket={}", api_socket.as_str())] + } + TargetApi::DBusApi(service_name, object_path) => { + vec![ + format!("--dbus-service-name={}", service_name.as_str()), + format!("--dbus-object-path={}", object_path.as_str()), + ] + } + } + } + + pub(crate) fn remote_args(&self) -> Vec { + // `guest_args` and `remote_args` are consistent with each other + self.guest_args() + } + + pub(crate) fn remote_command(&self, command: &str, arg: Option<&str>) -> bool { + let mut cmd = Command::new(clh_command("ch-remote")); + cmd.args(self.remote_args()); + cmd.arg(command); + + if let Some(arg) = arg { + cmd.arg(arg); + } + + let output = cmd.output().unwrap(); + if output.status.success() { + true + } else { + eprintln!("Error running ch-remote command: {:?}", &cmd); + let stderr = String::from_utf8_lossy(&output.stderr); + eprintln!("stderr: {stderr}"); + false + } + } +} + +pub(crate) fn temp_api_path(tmp_dir: &TempDir) -> String { + String::from( + tmp_dir + .as_path() + .join("cloud-hypervisor.sock") + .to_str() + .unwrap(), + ) +} + +pub(crate) fn wait_for_virtiofsd_socket(socket: &str) { + // Wait for virtiofds to start + let deadline = Instant::now() + Duration::from_secs(10); + while !Path::new(socket).exists() { + if Instant::now() > deadline { + 
panic!("virtiofsd socket did not appear within 10s"); + } + thread::sleep(Duration::from_millis(50)); + } +} + +pub(crate) fn prepare_virtiofsd( + tmp_dir: &TempDir, + shared_dir: &str, +) -> (std::process::Child, String) { + let mut workload_path = dirs::home_dir().unwrap(); + workload_path.push("workloads"); + + let mut virtiofsd_path = workload_path; + virtiofsd_path.push("virtiofsd"); + let virtiofsd_path = String::from(virtiofsd_path.to_str().unwrap()); + + let virtiofsd_socket_path = + String::from(tmp_dir.as_path().join("virtiofs.sock").to_str().unwrap()); + + // Start the daemon + let child = Command::new(virtiofsd_path.as_str()) + .args(["--shared-dir", shared_dir]) + .args(["--socket-path", virtiofsd_socket_path.as_str()]) + .args(["--cache", "never"]) + .args(["--tag", "myfs"]) + .spawn() + .unwrap(); + + wait_for_virtiofsd_socket(virtiofsd_socket_path.as_str()); + + (child, virtiofsd_socket_path) +} + +pub(crate) fn prepare_vubd( + tmp_dir: &TempDir, + blk_img: &str, + num_queues: usize, + rdonly: bool, + direct: bool, +) -> (std::process::Child, String) { + let mut workload_path = dirs::home_dir().unwrap(); + workload_path.push("workloads"); + + let mut blk_file_path = workload_path; + blk_file_path.push(blk_img); + let blk_file_path = String::from(blk_file_path.to_str().unwrap()); + + let vubd_socket_path = String::from(tmp_dir.as_path().join("vub.sock").to_str().unwrap()); + + // Start the daemon + let child = Command::new(clh_command("vhost_user_block")) + .args([ + "--block-backend", + format!( + "path={blk_file_path},socket={vubd_socket_path},num_queues={num_queues},readonly={rdonly},direct={direct}" + ) + .as_str(), + ]) + .spawn() + .unwrap(); + + thread::sleep(std::time::Duration::new(10, 0)); + + (child, vubd_socket_path) +} + +pub(crate) fn temp_vsock_path(tmp_dir: &TempDir) -> String { + String::from(tmp_dir.as_path().join("vsock").to_str().unwrap()) +} + +pub(crate) fn temp_event_monitor_path(tmp_dir: &TempDir) -> String { + 
String::from(tmp_dir.as_path().join("event.json").to_str().unwrap()) +} + +// Creates the directory and returns the path. +pub(crate) fn temp_snapshot_dir_path(tmp_dir: &TempDir) -> String { + let snapshot_dir = String::from(tmp_dir.as_path().join("snapshot").to_str().unwrap()); + std::fs::create_dir(&snapshot_dir).unwrap(); + snapshot_dir +} + +pub(crate) fn temp_vmcore_file_path(tmp_dir: &TempDir) -> String { + String::from(tmp_dir.as_path().join("vmcore").to_str().unwrap()) +} + +pub(crate) fn cloud_hypervisor_release_path() -> String { + let mut workload_path = dirs::home_dir().unwrap(); + workload_path.push("workloads"); + + let mut ch_release_path = workload_path; + #[cfg(target_arch = "x86_64")] + ch_release_path.push("cloud-hypervisor-static"); + #[cfg(target_arch = "aarch64")] + ch_release_path.push("cloud-hypervisor-static-aarch64"); + + ch_release_path.into_os_string().into_string().unwrap() +} + +pub(crate) fn prepare_vhost_user_net_daemon( + tmp_dir: &TempDir, + ip: &str, + tap: Option<&str>, + mtu: Option, + num_queues: usize, + client_mode: bool, +) -> (std::process::Command, String) { + let vunet_socket_path = String::from(tmp_dir.as_path().join("vunet.sock").to_str().unwrap()); + + // Start the daemon + let mut net_params = format!( + "ip={ip},mask=255.255.255.128,socket={vunet_socket_path},num_queues={num_queues},queue_size=1024,client={client_mode}" + ); + + if let Some(tap) = tap { + net_params.push_str(format!(",tap={tap}").as_str()); + } + + if let Some(mtu) = mtu { + net_params.push_str(format!(",mtu={mtu}").as_str()); + } + + let mut command = Command::new(clh_command("vhost_user_net")); + command.args(["--net-backend", net_params.as_str()]); + + (command, vunet_socket_path) +} + +pub(crate) fn prepare_swtpm_daemon(tmp_dir: &TempDir) -> (std::process::Command, String) { + let swtpm_tpm_dir = String::from(tmp_dir.as_path().join("swtpm").to_str().unwrap()); + let swtpm_socket_path = String::from( + tmp_dir + .as_path() + .join("swtpm") + 
.join("swtpm.sock") + .to_str() + .unwrap(), + ); + std::fs::create_dir(&swtpm_tpm_dir).unwrap(); + + let mut swtpm_command = Command::new("swtpm"); + let swtpm_args = [ + "socket", + "--tpmstate", + &format!("dir={swtpm_tpm_dir}"), + "--ctrl", + &format!("type=unixio,path={swtpm_socket_path}"), + "--flags", + "startup-clear", + "--tpm2", + ]; + swtpm_command.args(swtpm_args); + + (swtpm_command, swtpm_socket_path) +} + +pub(crate) fn resize_command( + api_socket: &str, + desired_vcpus: Option, + desired_ram: Option, + desired_balloon: Option, + event_file: Option<&str>, +) -> bool { + let mut cmd = Command::new(clh_command("ch-remote")); + cmd.args([&format!("--api-socket={api_socket}"), "resize"]); + + if let Some(desired_vcpus) = desired_vcpus { + cmd.arg(format!("--cpus={desired_vcpus}")); + } + + if let Some(desired_ram) = desired_ram { + cmd.arg(format!("--memory={desired_ram}")); + } + + if let Some(desired_balloon) = desired_balloon { + cmd.arg(format!("--balloon={desired_balloon}")); + } + + let ret = cmd.status().expect("Failed to launch ch-remote").success(); + + if let Some(event_path) = event_file { + let latest_events = [ + &MetaEvent { + event: "resizing".to_string(), + device_id: None, + }, + &MetaEvent { + event: "resized".to_string(), + device_id: None, + }, + ]; + // See: #5938 + thread::sleep(std::time::Duration::new(1, 0)); + assert!(check_latest_events_exact(&latest_events, event_path)); + } + + ret +} + +pub(crate) fn resize_zone_command(api_socket: &str, id: &str, desired_size: &str) -> bool { + let mut cmd = Command::new(clh_command("ch-remote")); + cmd.args([ + &format!("--api-socket={api_socket}"), + "resize-zone", + &format!("--id={id}"), + &format!("--size={desired_size}"), + ]); + + cmd.status().expect("Failed to launch ch-remote").success() +} + +pub(crate) fn resize_disk_command(api_socket: &str, id: &str, desired_size: &str) -> bool { + let mut cmd = Command::new(clh_command("ch-remote")); + cmd.args([ + 
&format!("--api-socket={api_socket}"), + "resize-disk", + &format!("--disk={id}"), + &format!("--size={desired_size}"), + ]); + + cmd.status().expect("Failed to launch ch-remote").success() +} + +// setup OVS-DPDK bridge and ports +pub(crate) fn setup_ovs_dpdk() { + // setup OVS-DPDK + assert!(exec_host_command_status("service openvswitch-switch start").success()); + assert!(exec_host_command_status("ovs-vsctl init").success()); + assert!( + exec_host_command_status("ovs-vsctl set Open_vSwitch . other_config:dpdk-init=true") + .success() + ); + assert!(exec_host_command_status("service openvswitch-switch restart").success()); + + // Create OVS-DPDK bridge and ports + assert!( + exec_host_command_status( + "ovs-vsctl add-br ovsbr0 -- set bridge ovsbr0 datapath_type=netdev", + ) + .success() + ); + assert!(exec_host_command_status("ovs-vsctl add-port ovsbr0 vhost-user1 -- set Interface vhost-user1 type=dpdkvhostuserclient options:vhost-server-path=/tmp/dpdkvhostclient1").success()); + assert!(exec_host_command_status("ovs-vsctl add-port ovsbr0 vhost-user2 -- set Interface vhost-user2 type=dpdkvhostuserclient options:vhost-server-path=/tmp/dpdkvhostclient2").success()); + assert!(exec_host_command_status("ip link set up dev ovsbr0").success()); + assert!(exec_host_command_status("service openvswitch-switch restart").success()); +} + +pub(crate) fn cleanup_ovs_dpdk() { + assert!(exec_host_command_status("ovs-vsctl del-br ovsbr0").success()); + exec_host_command_status("rm -f ovs-vsctl /tmp/dpdkvhostclient1 /tmp/dpdkvhostclient2"); +} + +// Setup two guests and ensure they are connected through ovs-dpdk +pub(crate) fn setup_ovs_dpdk_guests( + guest1: &Guest, + guest2: &Guest, + api_socket: &str, + release_binary: bool, +) -> (Child, Child) { + setup_ovs_dpdk(); + + let clh_path = if release_binary { + cloud_hypervisor_release_path() + } else { + clh_command("cloud-hypervisor") + }; + + let mut child1 = GuestCommand::new_with_binary_path(guest1, &clh_path) + 
.args(["--cpus", "boot=2"]) + .args(["--memory", "size=0,shared=on"]) + .args(["--memory-zone", "id=mem0,size=1G,shared=on,host_numa_node=0"]) + .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .default_disks() + .args(["--net", guest1.default_net_string().as_str(), "vhost_user=true,socket=/tmp/dpdkvhostclient1,num_queues=2,queue_size=256,vhost_mode=server"]) + .capture_output() + .spawn() + .unwrap(); + + #[cfg(target_arch = "x86_64")] + let guest_net_iface = "ens5"; + #[cfg(target_arch = "aarch64")] + let guest_net_iface = "enp0s5"; + + let r = std::panic::catch_unwind(|| { + guest1.wait_vm_boot().unwrap(); + + guest1 + .ssh_command(&format!( + "sudo ip addr add 172.100.0.1/24 dev {guest_net_iface}" + )) + .unwrap(); + guest1 + .ssh_command(&format!("sudo ip link set up dev {guest_net_iface}")) + .unwrap(); + + let guest_ip = guest1.network.guest_ip0.clone(); + thread::spawn(move || { + ssh_command_ip( + "nc -l 12345", + &guest_ip, + DEFAULT_SSH_RETRIES, + DEFAULT_SSH_TIMEOUT, + ) + .unwrap(); + }); + }); + if r.is_err() { + cleanup_ovs_dpdk(); + + let _ = child1.kill(); + let output = child1.wait_with_output().unwrap(); + handle_child_output(r, &output); + panic!("Test should already be failed/panicked"); // To explicitly mark this block never return + } + + let mut child2 = GuestCommand::new_with_binary_path(guest2, &clh_path) + .args(["--api-socket", api_socket]) + .args(["--cpus", "boot=2"]) + .args(["--memory", "size=0,shared=on"]) + .args(["--memory-zone", "id=mem0,size=1G,shared=on,host_numa_node=0"]) + .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .default_disks() + .args(["--net", guest2.default_net_string().as_str(), "vhost_user=true,socket=/tmp/dpdkvhostclient2,num_queues=2,queue_size=256,vhost_mode=server"]) + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + 
guest2.wait_vm_boot().unwrap(); + + guest2 + .ssh_command(&format!( + "sudo ip addr add 172.100.0.2/24 dev {guest_net_iface}" + )) + .unwrap(); + guest2 + .ssh_command(&format!("sudo ip link set up dev {guest_net_iface}")) + .unwrap(); + + // Check the connection works properly between the two VMs + guest2.ssh_command("nc -vz 172.100.0.1 12345").unwrap(); + }); + if r.is_err() { + cleanup_ovs_dpdk(); + + let _ = child1.kill(); + let _ = child2.kill(); + let output = child2.wait_with_output().unwrap(); + handle_child_output(r, &output); + panic!("Test should already be failed/panicked"); // To explicitly mark this block never return + } + + (child1, child2) +} + +pub enum FwType { + Ovmf, + RustHypervisorFirmware, +} + +pub(crate) fn fw_path(_fw_type: FwType) -> String { + let mut workload_path = dirs::home_dir().unwrap(); + workload_path.push("workloads"); + + let mut fw_path = workload_path; + #[cfg(target_arch = "aarch64")] + fw_path.push("CLOUDHV_EFI.fd"); + #[cfg(target_arch = "x86_64")] + { + match _fw_type { + FwType::Ovmf => fw_path.push(OVMF_NAME), + FwType::RustHypervisorFirmware => fw_path.push("hypervisor-fw"), + } + } + + fw_path.to_str().unwrap().to_string() +} + +// Parse the event_monitor file based on the format that each event +// is followed by a double newline +fn parse_event_file(event_file: &str) -> Vec { + let content = fs::read(event_file).unwrap(); + let mut ret = Vec::new(); + for entry in String::from_utf8_lossy(&content) + .trim() + .split("\n\n") + .collect::>() + { + ret.push(serde_json::from_str(entry).unwrap()); + } + + ret +} + +// Return true if all events from the input 'expected_events' are matched sequentially +// with events from the 'event_file' +pub(crate) fn check_sequential_events(expected_events: &[&MetaEvent], event_file: &str) -> bool { + if !Path::new(event_file).exists() { + return false; + } + let json_events = parse_event_file(event_file); + let len = expected_events.len(); + let mut idx = 0; + for e in &json_events { 
+ if idx == len { + break; + } + if expected_events[idx].match_with_json_event(e) { + idx += 1; + } + } + + let ret = idx == len; + + if !ret { + eprintln!( + "\n\n==== Start 'check_sequential_events' failed ==== \ + \n\nexpected_events={expected_events:?}\nactual_events={json_events:?} \ + \n\n==== End 'check_sequential_events' failed ====", + ); + } + + ret +} + +// Return true if all events from the input 'expected_events' are matched exactly +// with events from the 'event_file' +pub(crate) fn check_sequential_events_exact( + expected_events: &[&MetaEvent], + event_file: &str, +) -> bool { + if !Path::new(event_file).exists() { + return false; + } + let json_events = parse_event_file(event_file); + if expected_events.len() > json_events.len() { + return false; + } + let json_events = &json_events[..expected_events.len()]; + + for (idx, e) in json_events.iter().enumerate() { + if !expected_events[idx].match_with_json_event(e) { + eprintln!( + "\n\n==== Start 'check_sequential_events_exact' failed ==== \ + \n\nexpected_events={expected_events:?}\nactual_events={json_events:?} \ + \n\n==== End 'check_sequential_events_exact' failed ====", + ); + + return false; + } + } + + true +} + +// Return true if events from the input 'latest_events' are matched exactly +// with the most recent events from the 'event_file' +pub(crate) fn check_latest_events_exact(latest_events: &[&MetaEvent], event_file: &str) -> bool { + if !Path::new(event_file).exists() { + return false; + } + let json_events = parse_event_file(event_file); + if latest_events.len() > json_events.len() { + return false; + } + let json_events = &json_events[(json_events.len() - latest_events.len())..]; + + for (idx, e) in json_events.iter().enumerate() { + if !latest_events[idx].match_with_json_event(e) { + eprintln!( + "\n\n==== Start 'check_latest_events_exact' failed ==== \ + \n\nexpected_events={latest_events:?}\nactual_events={json_events:?} \ + \n\n==== End 'check_latest_events_exact' failed ====", + 
); + + return false; + } + } + + true +} + +pub(super) fn get_msi_interrupt_pattern() -> String { + #[cfg(target_arch = "x86_64")] + { + "PCI-MSI".to_string() + } + #[cfg(target_arch = "aarch64")] + { + if cfg!(feature = "mshv") { + "GICv2m-PCI-MSIX".to_string() + } else { + "ITS-PCI-MSIX".to_string() + } + } +} + +pub(super) type PrepareNetDaemon = dyn Fn( + &TempDir, + &str, + Option<&str>, + Option, + usize, + bool, +) -> (std::process::Command, String); + +pub(super) fn get_ksm_pages_shared() -> u32 { + fs::read_to_string("/sys/kernel/mm/ksm/pages_shared") + .unwrap() + .trim() + .parse::() + .unwrap() +} + +fn _get_vmm_overhead(pid: u32, guest_memory_size: u32) -> HashMap { + let smaps = fs::File::open(format!("/proc/{pid}/smaps")).unwrap(); + let reader = io::BufReader::new(smaps); + + let mut skip_map: bool = false; + let mut region_name: String = String::new(); + let mut region_maps = HashMap::new(); + for line in reader.lines() { + let l = line.unwrap(); + + if l.contains('-') { + let values: Vec<&str> = l.split_whitespace().collect(); + region_name = values.last().unwrap().trim().to_string(); + if region_name == "0" { + region_name = "anonymous".to_string(); + } + } + + // Each section begins with something that looks like: + // Size: 2184 kB + if l.starts_with("Size:") { + let values: Vec<&str> = l.split_whitespace().collect(); + let map_size = values[1].parse::().unwrap(); + // We skip the assigned guest RAM map, its RSS is only + // dependent on the guest actual memory usage. + // Everything else can be added to the VMM overhead. + skip_map = map_size >= guest_memory_size; + continue; + } + + // If this is a map we're taking into account, then we only + // count the RSS. The sum of all counted RSS is the VMM overhead. 
+ if !skip_map && l.starts_with("Rss:") { + let values: Vec<&str> = l.split_whitespace().collect(); + let value = values[1].trim().parse::().unwrap(); + *region_maps.entry(region_name.clone()).or_insert(0) += value; + } + } + + region_maps +} + +pub(crate) fn get_vmm_overhead(pid: u32, guest_memory_size: u32) -> u32 { + let mut total = 0; + + for (region_name, value) in &_get_vmm_overhead(pid, guest_memory_size) { + eprintln!("{region_name}: {value}"); + total += value; + } + + total +} + +pub(crate) fn process_rss_kib(pid: u32) -> usize { + let command = format!("ps -q {pid} -o rss="); + let rss = exec_host_command_output(&command); + String::from_utf8_lossy(&rss.stdout).trim().parse().unwrap() +} + +#[derive(PartialEq, Eq, PartialOrd)] +pub struct Counters { + rx_bytes: u64, + rx_frames: u64, + tx_bytes: u64, + tx_frames: u64, + read_bytes: u64, + write_bytes: u64, + read_ops: u64, + write_ops: u64, +} + +pub(crate) fn get_counters(api_socket: &str) -> Counters { + // Get counters + let (cmd_success, cmd_output, _) = remote_command_w_output(api_socket, "counters", None); + assert!(cmd_success); + + let counters: HashMap<&str, HashMap<&str, u64>> = + serde_json::from_slice(&cmd_output).unwrap_or_default(); + + let rx_bytes = *counters.get("_net2").unwrap().get("rx_bytes").unwrap(); + let rx_frames = *counters.get("_net2").unwrap().get("rx_frames").unwrap(); + let tx_bytes = *counters.get("_net2").unwrap().get("tx_bytes").unwrap(); + let tx_frames = *counters.get("_net2").unwrap().get("tx_frames").unwrap(); + + let read_bytes = *counters.get("_disk0").unwrap().get("read_bytes").unwrap(); + let write_bytes = *counters.get("_disk0").unwrap().get("write_bytes").unwrap(); + let read_ops = *counters.get("_disk0").unwrap().get("read_ops").unwrap(); + let write_ops = *counters.get("_disk0").unwrap().get("write_ops").unwrap(); + + Counters { + rx_bytes, + rx_frames, + tx_bytes, + tx_frames, + read_bytes, + write_bytes, + read_ops, + write_ops, + } +} + +pub(super) fn 
pty_read(mut pty: std::fs::File) -> Receiver { + let (tx, rx) = mpsc::channel::(); + thread::spawn(move || { + loop { + thread::sleep(std::time::Duration::new(1, 0)); + let mut buf = [0; 512]; + match pty.read(&mut buf) { + Ok(_bytes) => { + let output = std::str::from_utf8(&buf).unwrap().to_string(); + match tx.send(output) { + Ok(_) => (), + Err(_) => break, + } + } + Err(_) => break, + } + } + }); + rx +} + +pub(crate) fn get_pty_path(api_socket: &str, pty_type: &str) -> PathBuf { + let (cmd_success, cmd_output, _) = remote_command_w_output(api_socket, "info", None); + assert!(cmd_success); + let info: serde_json::Value = serde_json::from_slice(&cmd_output).unwrap_or_default(); + assert_eq!("Pty", info["config"][pty_type]["mode"]); + PathBuf::from( + info["config"][pty_type]["file"] + .as_str() + .expect("Missing pty path"), + ) +} + +// VFIO test network setup. +// We reserve a different IP class for it: 172.18.0.0/24. +#[cfg(target_arch = "x86_64")] +pub(crate) fn setup_vfio_network_interfaces() { + // 'vfio-br0' + assert!(exec_host_command_status("sudo ip link add name vfio-br0 type bridge").success()); + assert!(exec_host_command_status("sudo ip link set vfio-br0 up").success()); + assert!(exec_host_command_status("sudo ip addr add 172.18.0.1/24 dev vfio-br0").success()); + // 'vfio-tap0' + assert!(exec_host_command_status("sudo ip tuntap add vfio-tap0 mode tap").success()); + assert!(exec_host_command_status("sudo ip link set vfio-tap0 master vfio-br0").success()); + assert!(exec_host_command_status("sudo ip link set vfio-tap0 up").success()); + // 'vfio-tap1' + assert!(exec_host_command_status("sudo ip tuntap add vfio-tap1 mode tap").success()); + assert!(exec_host_command_status("sudo ip link set vfio-tap1 master vfio-br0").success()); + assert!(exec_host_command_status("sudo ip link set vfio-tap1 up").success()); + // 'vfio-tap2' + assert!(exec_host_command_status("sudo ip tuntap add vfio-tap2 mode tap").success()); + 
assert!(exec_host_command_status("sudo ip link set vfio-tap2 master vfio-br0").success()); + assert!(exec_host_command_status("sudo ip link set vfio-tap2 up").success()); + // 'vfio-tap3' + assert!(exec_host_command_status("sudo ip tuntap add vfio-tap3 mode tap").success()); + assert!(exec_host_command_status("sudo ip link set vfio-tap3 master vfio-br0").success()); + assert!(exec_host_command_status("sudo ip link set vfio-tap3 up").success()); +} + +// Tear VFIO test network down +#[cfg(target_arch = "x86_64")] +pub(crate) fn cleanup_vfio_network_interfaces() { + assert!(exec_host_command_status("sudo ip link del vfio-br0").success()); + assert!(exec_host_command_status("sudo ip link del vfio-tap0").success()); + assert!(exec_host_command_status("sudo ip link del vfio-tap1").success()); + assert!(exec_host_command_status("sudo ip link del vfio-tap2").success()); + assert!(exec_host_command_status("sudo ip link del vfio-tap3").success()); +} + +pub(crate) fn balloon_size(api_socket: &str) -> u64 { + let (cmd_success, cmd_output, _) = remote_command_w_output(api_socket, "info", None); + assert!(cmd_success); + + let info: serde_json::Value = serde_json::from_slice(&cmd_output).unwrap_or_default(); + let total_mem = &info["config"]["memory"]["size"] + .to_string() + .parse::() + .unwrap(); + let actual_mem = &info["memory_actual_size"] + .to_string() + .parse::() + .unwrap(); + total_mem - actual_mem +} + +pub(crate) fn vm_state(api_socket: &str) -> String { + let (cmd_success, cmd_output, _) = remote_command_w_output(api_socket, "info", None); + assert!(cmd_success); + + let info: serde_json::Value = serde_json::from_slice(&cmd_output).unwrap_or_default(); + let state = &info["state"].as_str().unwrap(); + + state.to_string() +} + +pub(crate) fn make_virtio_block_guest(factory: &GuestFactory, image_name: &str) -> Guest { + let disk_config = UbuntuDiskConfig::new(image_name.to_string()); + factory.create_guest(Box::new(disk_config)).with_cpu(4) +} + +pub(crate) fn 
compute_backing_checksum( + path_or_image_name: impl AsRef, +) -> Option<(std::path::PathBuf, String, u32)> { + let path = resolve_disk_path(path_or_image_name); + + let mut file = File::open(&path).ok()?; + if !matches!( + block::detect_image_type(&mut file).ok()?, + block::ImageType::Qcow2 + ) { + return None; + } + + let info = get_image_info(&path)?; + + let backing_file = info["backing-filename"].as_str()?; + let backing_path = if std::path::Path::new(backing_file).is_absolute() { + std::path::PathBuf::from(backing_file) + } else { + path.parent() + .unwrap_or_else(|| std::path::Path::new(".")) + .join(backing_file) + }; + + let backing_info = get_image_info(&backing_path)?; + let backing_format = backing_info["format"].as_str()?.to_string(); + let mut file = File::open(&backing_path).ok()?; + let file_size = file.metadata().ok()?.len(); + let checksum = compute_file_checksum(&mut file, file_size); + + Some((backing_path, backing_format, checksum)) +} + +/// Uses `qemu-img check` to verify disk image consistency. +/// +/// Supported formats are `qcow2` (compressed and uncompressed), +/// `vhdx`, `qed`, `parallels`, `vmdk`, and `vdi`. See man page +/// for more details. +/// +/// It takes either a full path to the image or just the name of +/// the image located in the `workloads` directory. +/// +/// For QCOW2 images with backing files, also verifies the backing file +/// integrity and checks that the backing file hasn't been modified +/// during the test. +/// +/// For QCOW2 v3 images, also verifies the dirty bit is cleared. 
+pub(crate) fn disk_check_consistency( + path_or_image_name: impl AsRef, + initial_backing_checksum: Option<(std::path::PathBuf, String, u32)>, +) { + let path = resolve_disk_path(path_or_image_name); + let output = run_qemu_img(&path, &["check"], None); + + assert!( + output.status.success(), + "qemu-img check failed: {}", + String::from_utf8_lossy(&output.stderr) + ); + + match check_dirty_flag(&path) { + Ok(Some(dirty)) => { + assert!(!dirty, "QCOW2 image shutdown unclean"); + } + Ok(None) => {} // Not a QCOW2 v3 image, skip dirty flag check + Err(e) => panic!("Failed to check dirty flag: {e}"), + } + + if let Some((backing_path, format, initial_checksum)) = initial_backing_checksum { + if format.parse::().ok() != Some(block::qcow::ImageType::Raw) { + let output = run_qemu_img(&backing_path, &["check"], None); + + assert!( + output.status.success(), + "qemu-img check of backing file failed: {}", + String::from_utf8_lossy(&output.stderr) + ); + } + + let mut file = File::open(&backing_path).unwrap(); + let file_size = file.metadata().unwrap().len(); + assert_eq!( + initial_checksum, + compute_file_checksum(&mut file, file_size) + ); + } +} + +pub(crate) fn run_qemu_img( + path: &std::path::Path, + args: &[&str], + trailing_args: Option<&[&str]>, +) -> std::process::Output { + let mut cmd = std::process::Command::new("qemu-img"); + cmd.arg(args[0]) + .args(&args[1..]) + .arg(path.to_str().unwrap()); + if let Some(extra) = trailing_args { + cmd.args(extra); + } + cmd.output().unwrap() +} + +fn get_image_info(path: &std::path::Path) -> Option { + let output = run_qemu_img(path, &["info", "-U", "--output=json"], None); + + output.status.success().then_some(())?; + serde_json::from_slice(&output.stdout).ok() +} + +fn get_qcow2_v3_info(path: &Path) -> Result, String> { + let info = get_image_info(path) + .ok_or_else(|| format!("qemu-img info failed for {}", path.display()))?; + if info["format"].as_str() != Some("qcow2") { + return Ok(None); + } + // QCOW2 v3 has 
compat "1.1", v2 has "0.10" + if info["format-specific"]["data"]["compat"].as_str() != Some("1.1") { + return Ok(None); + } + Ok(Some(info)) +} + +pub(crate) fn check_dirty_flag(path: &Path) -> Result, String> { + Ok(get_qcow2_v3_info(path)?.and_then(|info| info["dirty-flag"].as_bool())) +} + +pub(crate) fn check_corrupt_flag(path: &Path) -> Result, String> { + Ok(get_qcow2_v3_info(path)? + .and_then(|info| info["format-specific"]["data"]["corrupt"].as_bool())) +} + +pub(crate) fn set_corrupt_flag(path: &Path, corrupt: bool) -> io::Result<()> { + let mut file = OpenOptions::new().read(true).write(true).open(path)?; + + file.seek(SeekFrom::Start(QCOW2_INCOMPATIBLE_FEATURES_OFFSET))?; + let mut buf = [0u8; 8]; + file.read_exact(&mut buf)?; + let mut features = u64::from_be_bytes(buf); + + if corrupt { + features |= 0x02; + } else { + features &= !0x02; + } + + file.seek(SeekFrom::Start(QCOW2_INCOMPATIBLE_FEATURES_OFFSET))?; + file.write_all(&features.to_be_bytes())?; + file.sync_all()?; + Ok(()) +} + +fn resolve_disk_path(path_or_image_name: impl AsRef) -> std::path::PathBuf { + if path_or_image_name.as_ref().exists() { + // A full path is provided + path_or_image_name.as_ref().to_path_buf() + } else { + // An image name is provided + let mut workload_path = dirs::home_dir().unwrap(); + workload_path.push("workloads"); + workload_path.as_path().join(path_or_image_name.as_ref()) + } +} + +pub(crate) fn compute_file_checksum(reader: &mut dyn std::io::Read, size: u64) -> u32 { + // Read first 16MB or entire data if smaller + let read_size = cmp::min(size, 16 * 1024 * 1024) as usize; + + let mut buffer = vec![0u8; read_size]; + reader.read_exact(&mut buffer).unwrap(); + + // DJB2 hash + let mut hash: u32 = 5381; + for byte in buffer.iter() { + hash = hash.wrapping_mul(33).wrapping_add(*byte as u32); + } + hash +} + +pub(crate) fn get_reboot_count(guest: &Guest) -> u32 { + guest + .ssh_command("sudo last | grep -c reboot") + .unwrap() + .trim() + .parse::() + 
.unwrap_or_default() +} + +pub(crate) fn enable_guest_watchdog(guest: &Guest, watchdog_sec: u32) { + // Check for PCI device + assert!( + guest + .does_device_vendor_pair_match("0x1063", "0x1af4") + .unwrap_or_default() + ); + + // Enable systemd watchdog + guest + .ssh_command(&format!( + "echo RuntimeWatchdogSec={watchdog_sec}s | sudo tee -a /etc/systemd/system.conf" + )) + .unwrap(); + + guest.ssh_command("sudo systemctl daemon-reexec").unwrap(); +} + +pub(crate) fn make_guest_panic(guest: &Guest) { + // Check for pvpanic device + assert!( + guest + .does_device_vendor_pair_match("0x0011", "0x1b36") + .unwrap_or_default() + ); + + // Trigger guest a panic + guest.ssh_command("screen -dmS reboot sh -c \"sleep 5; echo s | tee /proc/sysrq-trigger; echo c | sudo tee /proc/sysrq-trigger\"").unwrap(); +} + +/// Extracts a BDF from a CHV returned response +pub(crate) fn bdf_from_hotplug_response( + s: &str, +) -> ( + u16, /* Segment ID */ + u8, /* Bus ID */ + u8, /* Device ID */ + u8, /* Function ID */ +) { + let json: serde_json::Value = serde_json::from_str(s).expect("should be valid JSON"); + let bdf_str = json["bdf"] + .as_str() + .expect("should contain string key `bdf`"); + + // BDF format: "SSSS:BB:DD.F" + let parts: Vec<&str> = bdf_str.split(&[':', '.'][..]).collect(); + assert_eq!(parts.len(), 4, "unexpected BDF format: {bdf_str}"); + + let segment_id = u16::from_str_radix(parts[0], 16).unwrap(); + let bus_id = u8::from_str_radix(parts[1], 16).unwrap(); + let device_id = u8::from_str_radix(parts[2], 16).unwrap(); + let function_id = u8::from_str_radix(parts[3], 16).unwrap(); + + (segment_id, bus_id, device_id, function_id) +} diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 1aad1a372c..f8915ec4a1 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -2,3672 +2,637 @@ // // SPDX-License-Identifier: Apache-2.0 // -#![cfg(devcli_testenv)] +#![cfg(any(devcli_testenv, 
clippy))] #![allow(clippy::undocumented_unsafe_blocks)] // When enabling the `mshv` feature, we skip quite some tests and // hence have known dead-code. This annotation silences dead-code // related warnings for our quality workflow to pass. #![allow(dead_code)] - -use std::collections::HashMap; -use std::ffi::CStr; -use std::fs::OpenOptions; -use std::io::{BufRead, Read, Seek, SeekFrom, Write}; +use std::fs::{File, OpenOptions, copy}; +use std::io::{Read, Seek, Write}; use std::net::TcpListener; use std::os::unix::io::AsRawFd; -use std::path::{Path, PathBuf}; +use std::path::PathBuf; use std::process::{Child, Command, Stdio}; use std::string::String; -use std::sync::mpsc::Receiver; -use std::sync::{Mutex, mpsc}; -use std::time::Duration; -use std::{fs, io, thread}; +use std::sync::Mutex; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; +use std::{fs, thread}; -use net_util::MacAddr; +use block::ImageType; use test_infra::*; use vmm_sys_util::tempdir::TempDir; use vmm_sys_util::tempfile::TempFile; use wait_timeout::ChildExt; -// Constant taken from the VMM crate. 
-const MAX_NUM_PCI_SEGMENTS: u16 = 96; - -#[cfg(target_arch = "x86_64")] -mod x86_64 { - pub const FOCAL_IMAGE_NAME: &str = "focal-server-cloudimg-amd64-custom-20210609-0.raw"; - pub const JAMMY_VFIO_IMAGE_NAME: &str = - "jammy-server-cloudimg-amd64-custom-vfio-20241012-0.raw"; - pub const FOCAL_IMAGE_NAME_VHD: &str = "focal-server-cloudimg-amd64-custom-20210609-0.vhd"; - pub const FOCAL_IMAGE_NAME_VHDX: &str = "focal-server-cloudimg-amd64-custom-20210609-0.vhdx"; - pub const JAMMY_IMAGE_NAME: &str = "jammy-server-cloudimg-amd64-custom-20241017-0.raw"; - pub const JAMMY_IMAGE_NAME_QCOW2: &str = "jammy-server-cloudimg-amd64-custom-20241017-0.qcow2"; - pub const JAMMY_IMAGE_NAME_QCOW2_ZLIB: &str = - "jammy-server-cloudimg-amd64-custom-20241017-0-zlib.qcow2"; - pub const JAMMY_IMAGE_NAME_QCOW2_ZSTD: &str = - "jammy-server-cloudimg-amd64-custom-20241017-0-zstd.qcow2"; - pub const JAMMY_IMAGE_NAME_QCOW2_BACKING_ZSTD_FILE: &str = - "jammy-server-cloudimg-amd64-custom-20241017-0-backing-zstd.qcow2"; - pub const JAMMY_IMAGE_NAME_QCOW2_BACKING_UNCOMPRESSED_FILE: &str = - "jammy-server-cloudimg-amd64-custom-20241017-0-backing-uncompressed.qcow2"; - pub const JAMMY_IMAGE_NAME_QCOW2_BACKING_RAW_FILE: &str = - "jammy-server-cloudimg-amd64-custom-20241017-0-backing-raw.qcow2"; - pub const WINDOWS_IMAGE_NAME: &str = "windows-server-2022-amd64-2.raw"; - pub const OVMF_NAME: &str = "CLOUDHV.fd"; - pub const GREP_SERIAL_IRQ_CMD: &str = "grep -c 'IO-APIC.*ttyS0' /proc/interrupts || true"; -} - -#[cfg(target_arch = "x86_64")] -use x86_64::*; +mod common; +use common::tests_wrappers::*; +use common::utils::*; -#[cfg(target_arch = "aarch64")] -mod aarch64 { - pub const FOCAL_IMAGE_NAME: &str = "focal-server-cloudimg-arm64-custom-20210929-0.raw"; - pub const FOCAL_IMAGE_UPDATE_KERNEL_NAME: &str = - "focal-server-cloudimg-arm64-custom-20210929-0-update-kernel.raw"; - pub const FOCAL_IMAGE_NAME_VHD: &str = "focal-server-cloudimg-arm64-custom-20210929-0.vhd"; - pub const 
FOCAL_IMAGE_NAME_VHDX: &str = "focal-server-cloudimg-arm64-custom-20210929-0.vhdx"; - pub const JAMMY_IMAGE_NAME: &str = "jammy-server-cloudimg-arm64-custom-20220329-0.raw"; - pub const JAMMY_IMAGE_NAME_QCOW2: &str = "jammy-server-cloudimg-arm64-custom-20220329-0.qcow2"; - pub const JAMMY_IMAGE_NAME_QCOW2_ZLIB: &str = - "jammy-server-cloudimg-arm64-custom-20220329-0-zlib.qcow2"; - pub const JAMMY_IMAGE_NAME_QCOW2_ZSTD: &str = - "jammy-server-cloudimg-arm64-custom-20220329-0-zstd.qcow2"; - pub const JAMMY_IMAGE_NAME_QCOW2_BACKING_ZSTD_FILE: &str = - "jammy-server-cloudimg-arm64-custom-20220329-0-backing-zstd.qcow2"; - pub const JAMMY_IMAGE_NAME_QCOW2_BACKING_UNCOMPRESSED_FILE: &str = - "jammy-server-cloudimg-arm64-custom-20220329-0-backing-uncompressed.qcow2"; - pub const JAMMY_IMAGE_NAME_QCOW2_BACKING_RAW_FILE: &str = - "jammy-server-cloudimg-arm64-custom-20220329-0-backing-raw.qcow2"; - pub const WINDOWS_IMAGE_NAME: &str = "windows-11-iot-enterprise-aarch64.raw"; - pub const OVMF_NAME: &str = "CLOUDHV_EFI.fd"; - pub const GREP_SERIAL_IRQ_CMD: &str = "grep -c 'GICv3.*uart-pl011' /proc/interrupts || true"; - pub const GREP_PMU_IRQ_CMD: &str = "grep -c 'GICv3.*arm-pmu' /proc/interrupts || true"; +macro_rules! basic_regular_guest { + ($image_name:expr) => {{ + let disk_config = UbuntuDiskConfig::new($image_name.to_string()); + GuestFactory::new_regular_guest_factory().create_guest(Box::new(disk_config)) + }}; } -#[cfg(target_arch = "aarch64")] -use aarch64::*; - -const DIRECT_KERNEL_BOOT_CMDLINE: &str = - "root=/dev/vda1 console=hvc0 rw systemd.journald.forward_to_console=1"; +mod common_parallel { + use std::io::{self, SeekFrom}; + use std::process::Command; -const CONSOLE_TEST_STRING: &str = "Started OpenBSD Secure Shell server"; + use test_infra::GuestFactory; -// This enum exists to make it more convenient to -// implement test for both D-Bus and REST APIs. 
-enum TargetApi { - // API socket - HttpApi(String), - // well known service name, object path - DBusApi(String, String), -} + use crate::*; -impl TargetApi { - fn new_http_api(tmp_dir: &TempDir) -> Self { - Self::HttpApi(temp_api_path(tmp_dir)) + #[test] + #[cfg(target_arch = "x86_64")] + fn test_focal_hypervisor_fw() { + let guest = basic_regular_guest!(FOCAL_IMAGE_NAME) + .with_kernel(fw_path(FwType::RustHypervisorFirmware)); + _test_simple_launch(&guest); } - fn new_dbus_api(tmp_dir: &TempDir) -> Self { - // `tmp_dir` is in the form of "/tmp/chXXXXXX" - // and we take the `chXXXXXX` part as a unique identifier for the guest - let id = tmp_dir.as_path().file_name().unwrap().to_str().unwrap(); - - Self::DBusApi( - format!("org.cloudhypervisor.{id}"), - format!("/org/cloudhypervisor/{id}"), - ) + #[test] + #[cfg(target_arch = "x86_64")] + fn test_focal_ovmf() { + let guest = basic_regular_guest!(FOCAL_IMAGE_NAME).with_kernel(fw_path(FwType::Ovmf)); + _test_simple_launch(&guest); } - fn guest_args(&self) -> Vec { - match self { - TargetApi::HttpApi(api_socket) => { - vec![format!("--api-socket={}", api_socket.as_str())] - } - TargetApi::DBusApi(service_name, object_path) => { - vec![ - format!("--dbus-service-name={}", service_name.as_str()), - format!("--dbus-object-path={}", object_path.as_str()), - ] - } - } + #[test] + fn test_multi_cpu() { + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); + _test_multi_cpu(&guest); } - fn remote_args(&self) -> Vec { - // `guest_args` and `remote_args` are consistent with each other - self.guest_args() + #[test] + fn test_cpu_topology_421() { + test_cpu_topology(4, 2, 1, false); } - fn remote_command(&self, command: &str, arg: Option<&str>) -> bool { - let mut cmd = Command::new(clh_command("ch-remote")); - cmd.args(self.remote_args()); - cmd.arg(command); - - if let Some(arg) = arg { - cmd.arg(arg); - } - - let output = cmd.output().unwrap(); - if output.status.success() { - true - } else { - eprintln!("Error running 
ch-remote command: {:?}", &cmd); - let stderr = String::from_utf8_lossy(&output.stderr); - eprintln!("stderr: {stderr}"); - false - } + #[test] + fn test_cpu_topology_142() { + test_cpu_topology(1, 4, 2, false); } -} - -// Start cloud-hypervisor with no VM parameters, only the API server running. -// From the API: Create a VM, boot it and check that it looks as expected. -fn _test_api_create_boot(target_api: &TargetApi, guest: &Guest) { - let mut child = GuestCommand::new(guest) - .args(target_api.guest_args()) - .capture_output() - .spawn() - .unwrap(); - - thread::sleep(std::time::Duration::new(1, 0)); - - // Verify API server is running - assert!(target_api.remote_command("ping", None)); - - // Create the VM first - let cpu_count: u8 = 4; - let request_body = guest.api_create_body( - cpu_count, - direct_kernel_boot_path().to_str().unwrap(), - DIRECT_KERNEL_BOOT_CMDLINE, - ); - - let temp_config_path = guest.tmp_dir.as_path().join("config"); - std::fs::write(&temp_config_path, request_body).unwrap(); - let create_config = temp_config_path.as_os_str().to_str().unwrap(); - - assert!(target_api.remote_command("create", Some(create_config),)); - // Then boot it - assert!(target_api.remote_command("boot", None)); - thread::sleep(std::time::Duration::new(20, 0)); + #[test] + fn test_cpu_topology_262() { + test_cpu_topology(2, 6, 2, false); + } - let r = std::panic::catch_unwind(|| { - // Check that the VM booted as expected - assert_eq!(guest.get_cpu_count().unwrap_or_default() as u8, cpu_count); - assert!(guest.get_total_memory().unwrap_or_default() > 480_000); - }); + #[test] + #[cfg(target_arch = "x86_64")] + #[cfg(not(feature = "mshv"))] + fn test_cpu_physical_bits() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)); + let max_phys_bits: u8 = 36; + let mut child = GuestCommand::new(&guest) + .args(["--cpus", &format!("max_phys_bits={max_phys_bits}")]) + .default_memory() + .args(["--kernel", 
direct_kernel_boot_path().to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .default_disks() + .default_net() + .capture_output() + .spawn() + .unwrap(); - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); - handle_child_output(r, &output); -} + assert!( + guest + .ssh_command("lscpu | grep \"Address sizes:\" | cut -f 2 -d \":\" | sed \"s# *##\" | cut -f 1 -d \" \"") + .unwrap() + .trim() + .parse::() + .unwrap_or(max_phys_bits + 1) <= max_phys_bits, + ); + }); -// Start cloud-hypervisor with no VM parameters, only the API server running. -// From the API: Create a VM, boot it and check it can be shutdown and then -// booted again -fn _test_api_shutdown(target_api: &TargetApi, guest: &Guest) { - let mut child = GuestCommand::new(guest) - .args(target_api.guest_args()) - .capture_output() - .spawn() - .unwrap(); + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); - thread::sleep(std::time::Duration::new(1, 0)); + handle_child_output(r, &output); + } - // Verify API server is running - assert!(target_api.remote_command("ping", None)); + fn _test_nested_virtualization(nested: bool) { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)).with_nested(nested); + let mut child = GuestCommand::new(&guest) + .default_cpus() + .default_memory() + .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .default_disks() + .default_net() + .capture_output() + .spawn() + .unwrap(); - // Create the VM first - let cpu_count: u8 = 4; - let request_body = guest.api_create_body( - cpu_count, - direct_kernel_boot_path().to_str().unwrap(), - DIRECT_KERNEL_BOOT_CMDLINE, - ); + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); - let temp_config_path = guest.tmp_dir.as_path().join("config"); - 
std::fs::write(&temp_config_path, request_body).unwrap(); - let create_config = temp_config_path.as_os_str().to_str().unwrap(); + let expected = if nested { "yes" } else { "no" }; + assert_eq!( + guest + .ssh_command("test -c /dev/kvm && echo yes || echo no") + .unwrap() + .trim(), + expected + ); + }); - let r = std::panic::catch_unwind(|| { - assert!(target_api.remote_command("create", Some(create_config))); + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); - // Then boot it - assert!(target_api.remote_command("boot", None)); + handle_child_output(r, &output); + } - guest.wait_vm_boot().unwrap(); + #[test] + #[cfg(target_arch = "x86_64")] + fn test_nested_virtualization_on() { + _test_nested_virtualization(true); + } - // Check that the VM booted as expected - assert_eq!(guest.get_cpu_count().unwrap_or_default() as u8, cpu_count); - assert!(guest.get_total_memory().unwrap_or_default() > 480_000); + #[test] + #[cfg(target_arch = "x86_64")] + fn test_nested_virtualization_off() { + _test_nested_virtualization(false); + } - // Sync and shutdown without powering off to prevent filesystem - // corruption. 
- guest.ssh_command("sync").unwrap(); - guest.ssh_command("sudo shutdown -H now").unwrap(); + #[test] + fn test_cpu_affinity() { + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME).with_cpu(2); + _test_cpu_affinity(&guest); + } - // Wait for the guest to be fully shutdown - thread::sleep(std::time::Duration::new(20, 0)); + #[test] + fn test_virtio_queue_affinity() { + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME).with_cpu(4); + _test_virtio_queue_affinity(&guest); + } - // Then shut it down - assert!(target_api.remote_command("shutdown", None)); + #[test] + #[cfg(not(feature = "mshv"))] + fn test_large_vm() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)); + let mut cmd = GuestCommand::new(&guest); + cmd.args(["--cpus", "boot=48"]) + .args(["--memory", "size=5120M"]) + .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .args(["--serial", "tty"]) + .args(["--console", "off"]) + .capture_output() + .default_disks() + .default_net(); - // Then boot it again - assert!(target_api.remote_command("boot", None)); + let mut child = cmd.spawn().unwrap(); guest.wait_vm_boot().unwrap(); - // Check that the VM booted as expected - assert_eq!(guest.get_cpu_count().unwrap_or_default() as u8, cpu_count); - assert!(guest.get_total_memory().unwrap_or_default() > 480_000); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); -} - -// Start cloud-hypervisor with no VM parameters, only the API server running. -// From the API: Create a VM, boot it and check it can be deleted and then recreated -// booted again. 
-fn _test_api_delete(target_api: &TargetApi, guest: &Guest) { - let mut child = GuestCommand::new(guest) - .args(target_api.guest_args()) - .capture_output() - .spawn() - .unwrap(); + let r = std::panic::catch_unwind(|| { + assert_eq!(guest.get_cpu_count().unwrap_or_default(), 48); + assert_eq!( + guest + .ssh_command("lscpu | grep \"On-line\" | cut -f 2 -d \":\" | sed \"s# *##\"") + .unwrap() + .trim(), + "0-47" + ); - thread::sleep(std::time::Duration::new(1, 0)); + assert!(guest.get_total_memory().unwrap_or_default() > 5_000_000); + }); - // Verify API server is running - assert!(target_api.remote_command("ping", None)); + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); - // Create the VM first - let cpu_count: u8 = 4; - let request_body = guest.api_create_body( - cpu_count, - direct_kernel_boot_path().to_str().unwrap(), - DIRECT_KERNEL_BOOT_CMDLINE, - ); - let temp_config_path = guest.tmp_dir.as_path().join("config"); - std::fs::write(&temp_config_path, request_body).unwrap(); - let create_config = temp_config_path.as_os_str().to_str().unwrap(); + handle_child_output(r, &output); + } - let r = std::panic::catch_unwind(|| { - assert!(target_api.remote_command("create", Some(create_config))); + #[test] + #[cfg(not(feature = "mshv"))] + fn test_huge_memory() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)); + let mut cmd = GuestCommand::new(&guest); + cmd.default_cpus() + .args(["--memory", "size=128G"]) + .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .capture_output() + .default_disks() + .default_net(); - // Then boot it - assert!(target_api.remote_command("boot", None)); + let mut child = cmd.spawn().unwrap(); guest.wait_vm_boot().unwrap(); - // Check that the VM booted as expected - assert_eq!(guest.get_cpu_count().unwrap_or_default() as u8, cpu_count); - 
assert!(guest.get_total_memory().unwrap_or_default() > 480_000); + let r = std::panic::catch_unwind(|| { + assert!(guest.get_total_memory().unwrap_or_default() > 128_000_000); + }); - // Sync and shutdown without powering off to prevent filesystem - // corruption. - guest.ssh_command("sync").unwrap(); - guest.ssh_command("sudo shutdown -H now").unwrap(); + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); - // Wait for the guest to be fully shutdown - thread::sleep(std::time::Duration::new(20, 0)); + handle_child_output(r, &output); + } - // Then delete it - assert!(target_api.remote_command("delete", None)); + #[test] + fn test_power_button() { + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); + _test_power_button(&guest); + } - assert!(target_api.remote_command("create", Some(create_config))); + #[test] + #[cfg(not(feature = "mshv"))] // See #7456 + fn test_user_defined_memory_regions() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)); + let api_socket = temp_api_path(&guest.tmp_dir); - // Then boot it again - assert!(target_api.remote_command("boot", None)); + let kernel_path = direct_kernel_boot_path(); - guest.wait_vm_boot().unwrap(); + let mut child = GuestCommand::new(&guest) + .default_cpus() + .args(["--memory", "size=0,hotplug_method=virtio-mem"]) + .args([ + "--memory-zone", + "id=mem0,size=1G,hotplug_size=2G", + "id=mem1,size=1G,shared=on", + "id=mem2,size=1G,host_numa_node=0,hotplug_size=2G", + ]) + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .args(["--api-socket", &api_socket]) + .capture_output() + .default_disks() + .default_net() + .spawn() + .unwrap(); - // Check that the VM booted as expected - assert_eq!(guest.get_cpu_count().unwrap_or_default() as u8, cpu_count); - assert!(guest.get_total_memory().unwrap_or_default() > 480_000); - }); + let r = std::panic::catch_unwind(|| { + 
guest.wait_vm_boot().unwrap(); - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); + assert!(guest.get_total_memory().unwrap_or_default() > 2_880_000); - handle_child_output(r, &output); -} + guest.enable_memory_hotplug(); -// Start cloud-hypervisor with no VM parameters, only the API server running. -// From the API: Create a VM, boot it and check that it looks as expected. -// Then we pause the VM, check that it's no longer available. -// Finally we resume the VM and check that it's available. -fn _test_api_pause_resume(target_api: &TargetApi, guest: &Guest) { - let mut child = GuestCommand::new(guest) - .args(target_api.guest_args()) - .capture_output() - .spawn() - .unwrap(); + resize_zone_command(&api_socket, "mem0", "3G"); + assert!(wait_until(Duration::from_secs(5), || guest + .get_total_memory() + .unwrap_or_default() + > 4_800_000)); + resize_zone_command(&api_socket, "mem2", "3G"); + assert!(wait_until(Duration::from_secs(5), || guest + .get_total_memory() + .unwrap_or_default() + > 6_720_000)); + resize_zone_command(&api_socket, "mem0", "2G"); + assert!(wait_until(Duration::from_secs(5), || guest + .get_total_memory() + .unwrap_or_default() + > 5_760_000)); + resize_zone_command(&api_socket, "mem2", "2G"); + assert!(wait_until(Duration::from_secs(5), || guest + .get_total_memory() + .unwrap_or_default() + > 4_800_000)); - thread::sleep(std::time::Duration::new(1, 0)); + guest.reboot_linux(0); - // Verify API server is running - assert!(target_api.remote_command("ping", None)); + // Check the amount of RAM after reboot + assert!(guest.get_total_memory().unwrap_or_default() > 4_800_000); + assert!(guest.get_total_memory().unwrap_or_default() < 5_760_000); - // Create the VM first - let cpu_count: u8 = 4; - let request_body = guest.api_create_body( - cpu_count, - direct_kernel_boot_path().to_str().unwrap(), - DIRECT_KERNEL_BOOT_CMDLINE, - ); + // Check if we can still resize down to the initial 'boot'size + 
resize_zone_command(&api_socket, "mem0", "1G"); + assert!(wait_until(Duration::from_secs(5), || guest + .get_total_memory() + .unwrap_or_default() + < 4_800_000)); + resize_zone_command(&api_socket, "mem2", "1G"); + assert!(wait_until(Duration::from_secs(5), || guest + .get_total_memory() + .unwrap_or_default() + < 3_840_000)); + }); - let temp_config_path = guest.tmp_dir.as_path().join("config"); - std::fs::write(&temp_config_path, request_body).unwrap(); - let create_config = temp_config_path.as_os_str().to_str().unwrap(); + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); - assert!(target_api.remote_command("create", Some(create_config))); + handle_child_output(r, &output); + } - // Then boot it - assert!(target_api.remote_command("boot", None)); - thread::sleep(std::time::Duration::new(20, 0)); + #[test] + #[cfg(not(feature = "mshv"))] // See #7456 + fn test_guest_numa_nodes() { + _test_guest_numa_nodes(false); + } - let r = std::panic::catch_unwind(|| { - // Check that the VM booted as expected - assert_eq!(guest.get_cpu_count().unwrap_or_default() as u8, cpu_count); - assert!(guest.get_total_memory().unwrap_or_default() > 480_000); + #[test] + #[cfg(target_arch = "x86_64")] + fn test_iommu_segments() { + let focal_image = FOCAL_IMAGE_NAME.to_string(); + let disk_config = UbuntuDiskConfig::new(focal_image); + let guest = Guest::new(Box::new(disk_config)); - // We now pause the VM - assert!(target_api.remote_command("pause", None)); + // Prepare another disk file for the virtio-disk device + let test_disk_path = String::from( + guest + .tmp_dir + .as_path() + .join("test-disk.raw") + .to_str() + .unwrap(), + ); + assert!( + exec_host_command_status(format!("truncate {test_disk_path} -s 4M").as_str()).success() + ); + assert!(exec_host_command_status(format!("mkfs.ext4 {test_disk_path}").as_str()).success()); - // Check pausing again fails - assert!(!target_api.remote_command("pause", None)); + let api_socket = 
temp_api_path(&guest.tmp_dir); + let mut cmd = GuestCommand::new(&guest); - thread::sleep(std::time::Duration::new(2, 0)); - - // SSH into the VM should fail - ssh_command_ip( - "grep -c processor /proc/cpuinfo", - &guest.network.guest_ip0, - 2, - 5, - ) - .unwrap_err(); - - // Resume the VM - assert!(target_api.remote_command("resume", None)); - - // Check resuming again fails - assert!(!target_api.remote_command("resume", None)); + cmd.default_cpus() + .args(["--api-socket", &api_socket]) + .default_memory() + .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .args([ + "--platform", + &format!("num_pci_segments={MAX_NUM_PCI_SEGMENTS},iommu_segments=[1]"), + ]) + .default_disks() + .capture_output() + .default_net(); - thread::sleep(std::time::Duration::new(2, 0)); + let mut child = cmd.spawn().unwrap(); - // Now we should be able to SSH back in and get the right number of CPUs - assert_eq!(guest.get_cpu_count().unwrap_or_default() as u8, cpu_count); - }); + guest.wait_vm_boot().unwrap(); - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); + let r = std::panic::catch_unwind(|| { + let (cmd_success, cmd_output, _) = remote_command_w_output( + &api_socket, + "add-disk", + Some( + format!( + "path={},id=test0,pci_segment=1,iommu=on", + test_disk_path.as_str() + ) + .as_str(), + ), + ); + assert!(cmd_success); + assert!( + String::from_utf8_lossy(&cmd_output) + .contains("{\"id\":\"test0\",\"bdf\":\"0001:00:01.0\"}") + ); - handle_child_output(r, &output); -} + // Check IOMMU setup + assert!( + guest + .does_device_vendor_pair_match("0x1057", "0x1af4") + .unwrap_or_default() + ); + assert!( + guest + .ssh_command("ls /sys/kernel/iommu_groups/*/devices") + .unwrap() + .contains("0001:00:01.0") + ); + }); -fn _test_pty_interaction(pty_path: PathBuf) { - let mut cf = std::fs::OpenOptions::new() - .write(true) - .read(true) - .open(pty_path) - .unwrap(); + kill_child(&mut child); + 
let output = child.wait_with_output().unwrap(); - // Some dumb sleeps but we don't want to write - // before the console is up and we don't want - // to try and write the next line before the - // login process is ready. - thread::sleep(std::time::Duration::new(5, 0)); - assert_eq!(cf.write(b"cloud\n").unwrap(), 6); - thread::sleep(std::time::Duration::new(2, 0)); - assert_eq!(cf.write(b"cloud123\n").unwrap(), 9); - thread::sleep(std::time::Duration::new(2, 0)); - assert_eq!(cf.write(b"echo test_pty_console\n").unwrap(), 22); - thread::sleep(std::time::Duration::new(2, 0)); - - // read pty and ensure they have a login shell - // some fairly hacky workarounds to avoid looping - // forever in case the channel is blocked getting output - let ptyc = pty_read(cf); - let mut empty = 0; - let mut prev = String::new(); - loop { - thread::sleep(std::time::Duration::new(2, 0)); - match ptyc.try_recv() { - Ok(line) => { - empty = 0; - prev = prev + &line; - if prev.contains("test_pty_console") { - break; - } - } - Err(mpsc::TryRecvError::Empty) => { - empty += 1; - assert!(empty <= 5, "No login on pty"); - } - _ => { - panic!("No login on pty") - } - } + handle_child_output(r, &output); } -} - -fn prepare_virtiofsd(tmp_dir: &TempDir, shared_dir: &str) -> (std::process::Child, String) { - let mut workload_path = dirs::home_dir().unwrap(); - workload_path.push("workloads"); - - let mut virtiofsd_path = workload_path; - virtiofsd_path.push("virtiofsd"); - let virtiofsd_path = String::from(virtiofsd_path.to_str().unwrap()); - let virtiofsd_socket_path = - String::from(tmp_dir.as_path().join("virtiofs.sock").to_str().unwrap()); + #[test] + fn test_pci_msi() { + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); + _test_pci_msi(&guest); + } - // Start the daemon - let child = Command::new(virtiofsd_path.as_str()) - .args(["--shared-dir", shared_dir]) - .args(["--socket-path", virtiofsd_socket_path.as_str()]) - .args(["--cache", "never"]) - .spawn() - .unwrap(); + #[test] + fn 
test_virtio_net_ctrl_queue() { + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); + _test_virtio_net_ctrl_queue(&guest); + } - thread::sleep(std::time::Duration::new(10, 0)); + #[test] + fn test_pci_multiple_segments() { + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); + _test_pci_multiple_segments(&guest, MAX_NUM_PCI_SEGMENTS, 15u16); + } - (child, virtiofsd_socket_path) -} + #[test] + fn test_pci_multiple_segments_numa_node() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)); + let api_socket = temp_api_path(&guest.tmp_dir); + #[cfg(target_arch = "x86_64")] + let kernel_path = direct_kernel_boot_path(); + #[cfg(target_arch = "aarch64")] + let kernel_path = edk2_path(); -fn prepare_vubd( - tmp_dir: &TempDir, - blk_img: &str, - num_queues: usize, - rdonly: bool, - direct: bool, -) -> (std::process::Child, String) { - let mut workload_path = dirs::home_dir().unwrap(); - workload_path.push("workloads"); - - let mut blk_file_path = workload_path; - blk_file_path.push(blk_img); - let blk_file_path = String::from(blk_file_path.to_str().unwrap()); - - let vubd_socket_path = String::from(tmp_dir.as_path().join("vub.sock").to_str().unwrap()); - - // Start the daemon - let child = Command::new(clh_command("vhost_user_block")) - .args([ - "--block-backend", - format!( - "path={blk_file_path},socket={vubd_socket_path},num_queues={num_queues},readonly={rdonly},direct={direct}" - ) - .as_str(), - ]) - .spawn() - .unwrap(); + // Prepare another disk file for the virtio-disk device + let test_disk_path = String::from( + guest + .tmp_dir + .as_path() + .join("test-disk.raw") + .to_str() + .unwrap(), + ); + assert!( + exec_host_command_status(format!("truncate {test_disk_path} -s 4M").as_str()).success() + ); + assert!(exec_host_command_status(format!("mkfs.ext4 {test_disk_path}").as_str()).success()); + const TEST_DISK_NODE: u16 = 1; - thread::sleep(std::time::Duration::new(10, 0)); + let mut child = 
GuestCommand::new(&guest) + .args(["--platform", "num_pci_segments=2"]) + .args(["--cpus", "boot=2"]) + .args(["--memory", "size=0"]) + .args(["--memory-zone", "id=mem0,size=256M", "id=mem1,size=256M"]) + .args([ + "--numa", + "guest_numa_id=0,cpus=[0],distances=[1@20],memory_zones=mem0,pci_segments=[0]", + "guest_numa_id=1,cpus=[1],distances=[0@20],memory_zones=mem1,pci_segments=[1]", + ]) + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .args(["--api-socket", &api_socket]) + .capture_output() + .args([ + "--disk", + format!( + "path={}", + guest.disk_config.disk(DiskType::OperatingSystem).unwrap() + ) + .as_str(), + format!( + "path={}", + guest.disk_config.disk(DiskType::CloudInit).unwrap() + ) + .as_str(), + format!("path={test_disk_path},pci_segment={TEST_DISK_NODE}").as_str(), + ]) + .default_net() + .spawn() + .unwrap(); - (child, vubd_socket_path) -} + let cmd = "cat /sys/block/vdc/device/../numa_node"; -fn temp_vsock_path(tmp_dir: &TempDir) -> String { - String::from(tmp_dir.as_path().join("vsock").to_str().unwrap()) -} + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); -fn temp_api_path(tmp_dir: &TempDir) -> String { - String::from( - tmp_dir - .as_path() - .join("cloud-hypervisor.sock") - .to_str() - .unwrap(), - ) -} + assert_eq!( + guest + .ssh_command(cmd) + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + TEST_DISK_NODE + ); -fn temp_event_monitor_path(tmp_dir: &TempDir) -> String { - String::from(tmp_dir.as_path().join("event.json").to_str().unwrap()) -} + // Each PNP0A08 host bridge in the DSDT must expose a unique + // _UID matching its PCI segment id. Linux surfaces the + // evaluated _UID via /sys/bus/acpi/devices/PNP0A08:*/uid. + // This test uses firmware boot on aarch64, so ACPI is + // available on both supported architectures. 
+ let mut uids: Vec = guest + .ssh_command("cat /sys/bus/acpi/devices/PNP0A08:*/uid") + .unwrap() + .lines() + .filter_map(|l| l.trim().parse::().ok()) + .collect(); + uids.sort(); + assert_eq!(uids, vec![0u16, 1u16]); + }); -// Creates the directory and returns the path. -fn temp_snapshot_dir_path(tmp_dir: &TempDir) -> String { - let snapshot_dir = String::from(tmp_dir.as_path().join("snapshot").to_str().unwrap()); - std::fs::create_dir(&snapshot_dir).unwrap(); - snapshot_dir -} + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); -fn temp_vmcore_file_path(tmp_dir: &TempDir) -> String { - String::from(tmp_dir.as_path().join("vmcore").to_str().unwrap()) -} + handle_child_output(r, &output); + } -// Creates the path for direct kernel boot and return the path. -// For x86_64, this function returns the vmlinux kernel path. -// For AArch64, this function returns the PE kernel path. -fn direct_kernel_boot_path() -> PathBuf { - let mut workload_path = dirs::home_dir().unwrap(); - workload_path.push("workloads"); + #[test] + fn test_direct_kernel_boot() { + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); + _test_direct_kernel_boot(&guest); + } - let mut kernel_path = workload_path; + #[test] #[cfg(target_arch = "x86_64")] - kernel_path.push("vmlinux-x86_64"); - #[cfg(target_arch = "aarch64")] - kernel_path.push("Image-arm64"); - - kernel_path -} - -fn edk2_path() -> PathBuf { - let mut workload_path = dirs::home_dir().unwrap(); - workload_path.push("workloads"); - let mut edk2_path = workload_path; - edk2_path.push(OVMF_NAME); + fn test_direct_kernel_boot_bzimage() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)); - edk2_path -} + let mut kernel_path = direct_kernel_boot_path(); + // Replace the default kernel with the bzImage. 
+ kernel_path.pop(); + kernel_path.push("bzImage-x86_64"); -fn cloud_hypervisor_release_path() -> String { - let mut workload_path = dirs::home_dir().unwrap(); - workload_path.push("workloads"); + let mut child = GuestCommand::new(&guest) + .default_cpus() + .default_memory() + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .default_disks() + .default_net() + .capture_output() + .spawn() + .unwrap(); - let mut ch_release_path = workload_path; - #[cfg(target_arch = "x86_64")] - ch_release_path.push("cloud-hypervisor-static"); - #[cfg(target_arch = "aarch64")] - ch_release_path.push("cloud-hypervisor-static-aarch64"); + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); - ch_release_path.into_os_string().into_string().unwrap() -} + assert_eq!(guest.get_cpu_count().unwrap_or_default(), 1); + assert!(guest.get_total_memory().unwrap_or_default() > 480_000); -fn prepare_vhost_user_net_daemon( - tmp_dir: &TempDir, - ip: &str, - tap: Option<&str>, - mtu: Option, - num_queues: usize, - client_mode: bool, -) -> (std::process::Command, String) { - let vunet_socket_path = String::from(tmp_dir.as_path().join("vunet.sock").to_str().unwrap()); + let grep_cmd = "grep -c PCI-MSI /proc/interrupts"; + assert_eq!( + guest + .ssh_command(grep_cmd) + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 12 + ); + }); - // Start the daemon - let mut net_params = format!( - "ip={ip},mask=255.255.255.128,socket={vunet_socket_path},num_queues={num_queues},queue_size=1024,client={client_mode}" - ); + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); - if let Some(tap) = tap { - net_params.push_str(format!(",tap={tap}").as_str()); + handle_child_output(r, &output); } - if let Some(mtu) = mtu { - net_params.push_str(format!(",mtu={mtu}").as_str()); + #[test] + fn test_virtio_block_io_uring() { + let guest = + make_virtio_block_guest(&GuestFactory::new_regular_guest_factory(), 
FOCAL_IMAGE_NAME); + _test_virtio_block(&guest, false, true, false, false, ImageType::Raw); } - let mut command = Command::new(clh_command("vhost_user_net")); - command.args(["--net-backend", net_params.as_str()]); - - (command, vunet_socket_path) -} - -fn prepare_swtpm_daemon(tmp_dir: &TempDir) -> (std::process::Command, String) { - let swtpm_tpm_dir = String::from(tmp_dir.as_path().join("swtpm").to_str().unwrap()); - let swtpm_socket_path = String::from( - tmp_dir - .as_path() - .join("swtpm") - .join("swtpm.sock") - .to_str() - .unwrap(), - ); - std::fs::create_dir(&swtpm_tpm_dir).unwrap(); - - let mut swtpm_command = Command::new("swtpm"); - let swtpm_args = [ - "socket", - "--tpmstate", - &format!("dir={swtpm_tpm_dir}"), - "--ctrl", - &format!("type=unixio,path={swtpm_socket_path}"), - "--flags", - "startup-clear", - "--tpm2", - ]; - swtpm_command.args(swtpm_args); - - (swtpm_command, swtpm_socket_path) -} - -fn remote_command(api_socket: &str, command: &str, arg: Option<&str>) -> bool { - let mut cmd = Command::new(clh_command("ch-remote")); - cmd.args([&format!("--api-socket={api_socket}"), command]); - - if let Some(arg) = arg { - cmd.arg(arg); - } - let output = cmd.output().unwrap(); - if output.status.success() { - true - } else { - eprintln!("Error running ch-remote command: {:?}", &cmd); - let stderr = String::from_utf8_lossy(&output.stderr); - eprintln!("stderr: {stderr}"); - false + #[test] + fn test_virtio_block_aio() { + let guest = + make_virtio_block_guest(&GuestFactory::new_regular_guest_factory(), FOCAL_IMAGE_NAME) + .with_cpu(4); + _test_virtio_block(&guest, true, false, false, false, ImageType::Raw); } -} -fn remote_command_w_output(api_socket: &str, command: &str, arg: Option<&str>) -> (bool, Vec) { - let mut cmd = Command::new(clh_command("ch-remote")); - cmd.args([&format!("--api-socket={api_socket}"), command]); - - if let Some(arg) = arg { - cmd.arg(arg); + #[test] + fn test_virtio_block_sync() { + let guest = + 
make_virtio_block_guest(&GuestFactory::new_regular_guest_factory(), FOCAL_IMAGE_NAME) + .with_cpu(4); + _test_virtio_block(&guest, true, true, false, false, ImageType::Raw); } - let output = cmd.output().expect("Failed to launch ch-remote"); - - (output.status.success(), output.stdout) -} - -fn resize_command( - api_socket: &str, - desired_vcpus: Option, - desired_ram: Option, - desired_balloon: Option, - event_file: Option<&str>, -) -> bool { - let mut cmd = Command::new(clh_command("ch-remote")); - cmd.args([&format!("--api-socket={api_socket}"), "resize"]); - - if let Some(desired_vcpus) = desired_vcpus { - cmd.arg(format!("--cpus={desired_vcpus}")); + #[test] + fn test_compute_file_checksum_empty() { + let mut reader = io::Cursor::new(vec![]); + let checksum = compute_file_checksum(&mut reader, 0); + assert_eq!(checksum, 5381); } - if let Some(desired_ram) = desired_ram { - cmd.arg(format!("--memory={desired_ram}")); + #[test] + fn test_compute_file_checksum_small() { + let data = b"hello world"; + let mut reader = io::Cursor::new(data); + let checksum = compute_file_checksum(&mut reader, data.len() as u64); + assert_eq!(checksum, 894552257); } - if let Some(desired_balloon) = desired_balloon { - cmd.arg(format!("--balloon={desired_balloon}")); + #[test] + fn test_compute_file_checksum_same_data() { + let data = b"test data 123"; + let mut reader1 = io::Cursor::new(data); + let mut reader2 = io::Cursor::new(data); + let checksum1 = compute_file_checksum(&mut reader1, data.len() as u64); + let checksum2 = compute_file_checksum(&mut reader2, data.len() as u64); + assert_eq!(checksum1, checksum2); } - let ret = cmd.status().expect("Failed to launch ch-remote").success(); - - if let Some(event_path) = event_file { - let latest_events = [ - &MetaEvent { - event: "resizing".to_string(), - device_id: None, - }, - &MetaEvent { - event: "resized".to_string(), - device_id: None, - }, - ]; - // See: #5938 - thread::sleep(std::time::Duration::new(1, 0)); - 
assert!(check_latest_events_exact(&latest_events, event_path)); - } - - ret -} - -fn resize_zone_command(api_socket: &str, id: &str, desired_size: &str) -> bool { - let mut cmd = Command::new(clh_command("ch-remote")); - cmd.args([ - &format!("--api-socket={api_socket}"), - "resize-zone", - &format!("--id={id}"), - &format!("--size={desired_size}"), - ]); - - cmd.status().expect("Failed to launch ch-remote").success() -} - -fn resize_disk_command(api_socket: &str, id: &str, desired_size: &str) -> bool { - let mut cmd = Command::new(clh_command("ch-remote")); - cmd.args([ - &format!("--api-socket={api_socket}"), - "resize-disk", - &format!("--disk={id}"), - &format!("--size={desired_size}"), - ]); - - cmd.status().expect("Failed to launch ch-remote").success() -} - -// setup OVS-DPDK bridge and ports -fn setup_ovs_dpdk() { - // setup OVS-DPDK - assert!(exec_host_command_status("service openvswitch-switch start").success()); - assert!(exec_host_command_status("ovs-vsctl init").success()); - assert!( - exec_host_command_status("ovs-vsctl set Open_vSwitch . 
other_config:dpdk-init=true") - .success() - ); - assert!(exec_host_command_status("service openvswitch-switch restart").success()); - - // Create OVS-DPDK bridge and ports - assert!( - exec_host_command_status( - "ovs-vsctl add-br ovsbr0 -- set bridge ovsbr0 datapath_type=netdev", - ) - .success() - ); - assert!(exec_host_command_status("ovs-vsctl add-port ovsbr0 vhost-user1 -- set Interface vhost-user1 type=dpdkvhostuserclient options:vhost-server-path=/tmp/dpdkvhostclient1").success()); - assert!(exec_host_command_status("ovs-vsctl add-port ovsbr0 vhost-user2 -- set Interface vhost-user2 type=dpdkvhostuserclient options:vhost-server-path=/tmp/dpdkvhostclient2").success()); - assert!(exec_host_command_status("ip link set up dev ovsbr0").success()); - assert!(exec_host_command_status("service openvswitch-switch restart").success()); -} -fn cleanup_ovs_dpdk() { - assert!(exec_host_command_status("ovs-vsctl del-br ovsbr0").success()); - exec_host_command_status("rm -f ovs-vsctl /tmp/dpdkvhostclient1 /tmp/dpdkvhostclient2"); -} -// Setup two guests and ensure they are connected through ovs-dpdk -fn setup_ovs_dpdk_guests( - guest1: &Guest, - guest2: &Guest, - api_socket: &str, - release_binary: bool, -) -> (Child, Child) { - setup_ovs_dpdk(); - - let clh_path = if release_binary { - cloud_hypervisor_release_path() - } else { - clh_command("cloud-hypervisor") - }; - - let mut child1 = GuestCommand::new_with_binary_path(guest1, &clh_path) - .args(["--cpus", "boot=2"]) - .args(["--memory", "size=0,shared=on"]) - .args(["--memory-zone", "id=mem0,size=1G,shared=on,host_numa_node=0"]) - .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .args(["--net", guest1.default_net_string().as_str(), "vhost_user=true,socket=/tmp/dpdkvhostclient1,num_queues=2,queue_size=256,vhost_mode=server"]) - .capture_output() - .spawn() - .unwrap(); - - #[cfg(target_arch = "x86_64")] - let guest_net_iface = 
"ens5"; - #[cfg(target_arch = "aarch64")] - let guest_net_iface = "enp0s5"; - - let r = std::panic::catch_unwind(|| { - guest1.wait_vm_boot().unwrap(); - - guest1 - .ssh_command(&format!( - "sudo ip addr add 172.100.0.1/24 dev {guest_net_iface}" - )) - .unwrap(); - guest1 - .ssh_command(&format!("sudo ip link set up dev {guest_net_iface}")) - .unwrap(); - - let guest_ip = guest1.network.guest_ip0.clone(); - thread::spawn(move || { - ssh_command_ip( - "nc -l 12345", - &guest_ip, - DEFAULT_SSH_RETRIES, - DEFAULT_SSH_TIMEOUT, - ) - .unwrap(); - }); - }); - if r.is_err() { - cleanup_ovs_dpdk(); - - let _ = child1.kill(); - let output = child1.wait_with_output().unwrap(); - handle_child_output(r, &output); - panic!("Test should already be failed/panicked"); // To explicitly mark this block never return - } - - let mut child2 = GuestCommand::new_with_binary_path(guest2, &clh_path) - .args(["--api-socket", api_socket]) - .args(["--cpus", "boot=2"]) - .args(["--memory", "size=0,shared=on"]) - .args(["--memory-zone", "id=mem0,size=1G,shared=on,host_numa_node=0"]) - .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .args(["--net", guest2.default_net_string().as_str(), "vhost_user=true,socket=/tmp/dpdkvhostclient2,num_queues=2,queue_size=256,vhost_mode=server"]) - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest2.wait_vm_boot().unwrap(); - - guest2 - .ssh_command(&format!( - "sudo ip addr add 172.100.0.2/24 dev {guest_net_iface}" - )) - .unwrap(); - guest2 - .ssh_command(&format!("sudo ip link set up dev {guest_net_iface}")) - .unwrap(); - - // Check the connection works properly between the two VMs - guest2.ssh_command("nc -vz 172.100.0.1 12345").unwrap(); - }); - if r.is_err() { - cleanup_ovs_dpdk(); - - let _ = child1.kill(); - let _ = child2.kill(); - let output = child2.wait_with_output().unwrap(); - handle_child_output(r, &output); - 
panic!("Test should already be failed/panicked"); // To explicitly mark this block never return - } - - (child1, child2) -} - -enum FwType { - Ovmf, - RustHypervisorFirmware, -} - -fn fw_path(_fw_type: FwType) -> String { - let mut workload_path = dirs::home_dir().unwrap(); - workload_path.push("workloads"); - - let mut fw_path = workload_path; - #[cfg(target_arch = "aarch64")] - fw_path.push("CLOUDHV_EFI.fd"); - #[cfg(target_arch = "x86_64")] - { - match _fw_type { - FwType::Ovmf => fw_path.push(OVMF_NAME), - FwType::RustHypervisorFirmware => fw_path.push("hypervisor-fw"), - } - } - - fw_path.to_str().unwrap().to_string() -} - -#[derive(Debug)] -struct MetaEvent { - event: String, - device_id: Option, -} - -impl MetaEvent { - pub fn match_with_json_event(&self, v: &serde_json::Value) -> bool { - let mut matched = false; - if v["event"].as_str().unwrap() == self.event { - if let Some(device_id) = &self.device_id { - if v["properties"]["id"].as_str().unwrap() == device_id { - matched = true; - } - } else { - matched = true; - } - } - matched - } -} - -// Parse the event_monitor file based on the format that each event -// is followed by a double newline -fn parse_event_file(event_file: &str) -> Vec { - let content = fs::read(event_file).unwrap(); - let mut ret = Vec::new(); - for entry in String::from_utf8_lossy(&content) - .trim() - .split("\n\n") - .collect::>() - { - ret.push(serde_json::from_str(entry).unwrap()); - } - - ret -} - -// Return true if all events from the input 'expected_events' are matched sequentially -// with events from the 'event_file' -fn check_sequential_events(expected_events: &[&MetaEvent], event_file: &str) -> bool { - let json_events = parse_event_file(event_file); - let len = expected_events.len(); - let mut idx = 0; - for e in &json_events { - if idx == len { - break; - } - if expected_events[idx].match_with_json_event(e) { - idx += 1; - } - } - - let ret = idx == len; - - if !ret { - eprintln!( - "\n\n==== Start 
'check_sequential_events' failed ==== \ - \n\nexpected_events={expected_events:?}\nactual_events={json_events:?} \ - \n\n==== End 'check_sequential_events' failed ====", - ); - } - - ret -} - -// Return true if all events from the input 'expected_events' are matched exactly -// with events from the 'event_file' -fn check_sequential_events_exact(expected_events: &[&MetaEvent], event_file: &str) -> bool { - let json_events = parse_event_file(event_file); - assert!(expected_events.len() <= json_events.len()); - let json_events = &json_events[..expected_events.len()]; - - for (idx, e) in json_events.iter().enumerate() { - if !expected_events[idx].match_with_json_event(e) { - eprintln!( - "\n\n==== Start 'check_sequential_events_exact' failed ==== \ - \n\nexpected_events={expected_events:?}\nactual_events={json_events:?} \ - \n\n==== End 'check_sequential_events_exact' failed ====", - ); - - return false; - } - } - - true -} - -// Return true if events from the input 'latest_events' are matched exactly -// with the most recent events from the 'event_file' -fn check_latest_events_exact(latest_events: &[&MetaEvent], event_file: &str) -> bool { - let json_events = parse_event_file(event_file); - assert!(latest_events.len() <= json_events.len()); - let json_events = &json_events[(json_events.len() - latest_events.len())..]; - - for (idx, e) in json_events.iter().enumerate() { - if !latest_events[idx].match_with_json_event(e) { - eprintln!( - "\n\n==== Start 'check_latest_events_exact' failed ==== \ - \n\nexpected_events={latest_events:?}\nactual_events={json_events:?} \ - \n\n==== End 'check_latest_events_exact' failed ====", - ); - - return false; - } - } - - true -} - -fn test_cpu_topology(threads_per_core: u8, cores_per_package: u8, packages: u8, use_fw: bool) { - let disk_config = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - let total_vcpus = threads_per_core * cores_per_package * packages; - let 
direct_kernel_boot_path = direct_kernel_boot_path(); - let mut kernel_path = direct_kernel_boot_path.to_str().unwrap(); - let fw_path = fw_path(FwType::RustHypervisorFirmware); - if use_fw { - kernel_path = fw_path.as_str(); - } - - let mut child = GuestCommand::new(&guest) - .args([ - "--cpus", - &format!( - "boot={total_vcpus},topology={threads_per_core}:{cores_per_package}:1:{packages}" - ), - ]) - .args(["--memory", "size=512M"]) - .args(["--kernel", kernel_path]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .default_net() - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - assert_eq!( - guest.get_cpu_count().unwrap_or_default(), - u32::from(total_vcpus) - ); - assert_eq!( - guest - .ssh_command("lscpu | grep \"per core\" | cut -f 2 -d \":\" | sed \"s# *##\"") - .unwrap() - .trim() - .parse::() - .unwrap_or(0), - threads_per_core - ); - - assert_eq!( - guest - .ssh_command("lscpu | grep \"per socket\" | cut -f 2 -d \":\" | sed \"s# *##\"") - .unwrap() - .trim() - .parse::() - .unwrap_or(0), - cores_per_package - ); - - assert_eq!( - guest - .ssh_command("lscpu | grep \"Socket\" | cut -f 2 -d \":\" | sed \"s# *##\"") - .unwrap() - .trim() - .parse::() - .unwrap_or(0), - packages - ); - - #[cfg(target_arch = "x86_64")] - { - let mut cpu_id = 0; - for package_id in 0..packages { - for core_id in 0..cores_per_package { - for _ in 0..threads_per_core { - assert_eq!( - guest - .ssh_command(&format!("cat /sys/devices/system/cpu/cpu{cpu_id}/topology/physical_package_id")) - .unwrap() - .trim() - .parse::() - .unwrap_or(0), - package_id - ); - - assert_eq!( - guest - .ssh_command(&format!( - "cat /sys/devices/system/cpu/cpu{cpu_id}/topology/core_id" - )) - .unwrap() - .trim() - .parse::() - .unwrap_or(0), - core_id - ); - - cpu_id += 1; - } - } - } - } - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); -} - 
-#[allow(unused_variables)] -fn _test_guest_numa_nodes(acpi: bool) { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - let api_socket = temp_api_path(&guest.tmp_dir); - #[cfg(target_arch = "x86_64")] - let kernel_path = direct_kernel_boot_path(); - #[cfg(target_arch = "aarch64")] - let kernel_path = if acpi { - edk2_path() - } else { - direct_kernel_boot_path() - }; - - let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=6,max=12"]) - .args(["--memory", "size=0,hotplug_method=virtio-mem"]) - .args([ - "--memory-zone", - "id=mem0,size=1G,hotplug_size=3G", - "id=mem1,size=2G,hotplug_size=3G", - "id=mem2,size=3G,hotplug_size=3G", - ]) - .args([ - "--numa", - "guest_numa_id=0,cpus=[0-2,9],distances=[1@15,2@20],memory_zones=mem0", - "guest_numa_id=1,cpus=[3-4,6-8],distances=[0@20,2@25],memory_zones=mem1", - "guest_numa_id=2,cpus=[5,10-11],distances=[0@25,1@30],memory_zones=mem2", - ]) - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .args(["--api-socket", &api_socket]) - .capture_output() - .default_disks() - .default_net() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - guest.check_numa_common( - Some(&[960_000, 1_920_000, 2_880_000]), - Some(&[&[0, 1, 2], &[3, 4], &[5]]), - Some(&["10 15 20", "20 10 25", "25 30 10"]), - ); - - // AArch64 currently does not support hotplug, and therefore we only - // test hotplug-related function on x86_64 here. - #[cfg(target_arch = "x86_64")] - { - guest.enable_memory_hotplug(); - - // Resize every memory zone and check each associated NUMA node - // has been assigned the right amount of memory. 
- resize_zone_command(&api_socket, "mem0", "4G"); - resize_zone_command(&api_socket, "mem1", "4G"); - resize_zone_command(&api_socket, "mem2", "4G"); - // Resize to the maximum amount of CPUs and check each NUMA - // node has been assigned the right CPUs set. - resize_command(&api_socket, Some(12), None, None, None); - thread::sleep(std::time::Duration::new(5, 0)); - - guest.check_numa_common( - Some(&[3_840_000, 3_840_000, 3_840_000]), - Some(&[&[0, 1, 2, 9], &[3, 4, 6, 7, 8], &[5, 10, 11]]), - None, - ); - } - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); -} - -#[allow(unused_variables)] -fn _test_power_button(acpi: bool) { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - let mut cmd = GuestCommand::new(&guest); - let api_socket = temp_api_path(&guest.tmp_dir); - - #[cfg(target_arch = "x86_64")] - let kernel_path = direct_kernel_boot_path(); - #[cfg(target_arch = "aarch64")] - let kernel_path = if acpi { - edk2_path() - } else { - direct_kernel_boot_path() - }; - - cmd.args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .capture_output() - .default_disks() - .default_net() - .args(["--api-socket", &api_socket]); - - let child = cmd.spawn().unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - assert!(remote_command(&api_socket, "power-button", None)); - }); - - let output = child.wait_with_output().unwrap(); - assert!(output.status.success()); - handle_child_output(r, &output); -} - -fn get_msi_interrupt_pattern() -> String { - #[cfg(target_arch = "x86_64")] - { - "PCI-MSI".to_string() - } - #[cfg(target_arch = "aarch64")] - { - if cfg!(feature = "mshv") { - "GICv2m-PCI-MSIX".to_string() - } else { - "ITS-PCI-MSIX".to_string() - } - } -} - -type PrepareNetDaemon = dyn Fn( 
- &TempDir, - &str, - Option<&str>, - Option, - usize, - bool, -) -> (std::process::Command, String); - -fn test_vhost_user_net( - tap: Option<&str>, - num_queues: usize, - prepare_daemon: &PrepareNetDaemon, - generate_host_mac: bool, - client_mode_daemon: bool, -) { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - let api_socket = temp_api_path(&guest.tmp_dir); - - let kernel_path = direct_kernel_boot_path(); - - let host_mac = if generate_host_mac { - Some(MacAddr::local_random()) - } else { - None - }; - - let mtu = Some(3000); - - let (mut daemon_command, vunet_socket_path) = prepare_daemon( - &guest.tmp_dir, - &guest.network.host_ip0, - tap, - mtu, - num_queues, - client_mode_daemon, - ); - - let net_params = format!( - "vhost_user=true,mac={},socket={},num_queues={},queue_size=1024{},vhost_mode={},mtu=3000", - guest.network.guest_mac0, - vunet_socket_path, - num_queues, - if let Some(host_mac) = host_mac { - format!(",host_mac={host_mac}") - } else { - String::new() - }, - if client_mode_daemon { - "server" - } else { - "client" - }, - ); - - let mut ch_command = GuestCommand::new(&guest); - ch_command - .args(["--cpus", format!("boot={}", num_queues / 2).as_str()]) - .args(["--memory", "size=512M,hotplug_size=2048M,shared=on"]) - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .args(["--net", net_params.as_str()]) - .args(["--api-socket", &api_socket]) - .capture_output(); - - let mut daemon_child: std::process::Child; - let mut child: std::process::Child; - - if client_mode_daemon { - child = ch_command.spawn().unwrap(); - // Make sure the VMM is waiting for the backend to connect - thread::sleep(std::time::Duration::new(10, 0)); - daemon_child = daemon_command.spawn().unwrap(); - } else { - daemon_child = daemon_command.spawn().unwrap(); - // Make sure the backend is waiting for the VMM to connect - 
thread::sleep(std::time::Duration::new(10, 0)); - child = ch_command.spawn().unwrap(); - } - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - if let Some(tap_name) = tap { - let tap_count = exec_host_command_output(&format!("ip link | grep -c {tap_name}")); - assert_eq!(String::from_utf8_lossy(&tap_count.stdout).trim(), "1"); - } - - if let Some(host_mac) = tap { - let mac_count = exec_host_command_output(&format!("ip link | grep -c {host_mac}")); - assert_eq!(String::from_utf8_lossy(&mac_count.stdout).trim(), "1"); - } - - #[cfg(target_arch = "aarch64")] - let iface = "enp0s4"; - #[cfg(target_arch = "x86_64")] - let iface = "ens4"; - - assert_eq!( - guest - .ssh_command(format!("cat /sys/class/net/{iface}/mtu").as_str()) - .unwrap() - .trim(), - "3000" - ); - - // 1 network interface + default localhost ==> 2 interfaces - // It's important to note that this test is fully exercising the - // vhost-user-net implementation and the associated backend since - // it does not define any --net network interface. That means all - // the ssh communication in that test happens through the network - // interface backed by vhost-user-net. - assert_eq!( - guest - .ssh_command("ip -o link | wc -l") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 2 - ); - - // The following pci devices will appear on guest with PCI-MSI - // interrupt vectors assigned. - // 1 virtio-console with 3 vectors: config, Rx, Tx - // 1 virtio-blk with 2 vectors: config, Request - // 1 virtio-blk with 2 vectors: config, Request - // 1 virtio-rng with 2 vectors: config, Request - // Since virtio-net has 2 queue pairs, its vectors is as follows: - // 1 virtio-net with 5 vectors: config, Rx (2), Tx (2) - // Based on the above, the total vectors should 14. 
- let grep_cmd = format!("grep -c {} /proc/interrupts", get_msi_interrupt_pattern()); - - assert_eq!( - guest - .ssh_command(&grep_cmd) - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 10 + (num_queues as u32) - ); - - // ACPI feature is needed. - #[cfg(target_arch = "x86_64")] - { - guest.enable_memory_hotplug(); - - // Add RAM to the VM - let desired_ram = 1024 << 20; - resize_command(&api_socket, None, Some(desired_ram), None, None); - - thread::sleep(std::time::Duration::new(10, 0)); - - // Here by simply checking the size (through ssh), we validate - // the connection is still working, which means vhost-user-net - // keeps working after the resize. - assert!(guest.get_total_memory().unwrap_or_default() > 960_000); - } - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - thread::sleep(std::time::Duration::new(5, 0)); - let _ = daemon_child.kill(); - let _ = daemon_child.wait(); - - handle_child_output(r, &output); -} - -type PrepareBlkDaemon = dyn Fn(&TempDir, &str, usize, bool, bool) -> (std::process::Child, String); - -fn test_vhost_user_blk( - num_queues: usize, - readonly: bool, - direct: bool, - prepare_vhost_user_blk_daemon: Option<&PrepareBlkDaemon>, -) { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - let api_socket = temp_api_path(&guest.tmp_dir); - - let kernel_path = direct_kernel_boot_path(); - - let (blk_params, daemon_child) = { - let prepare_daemon = prepare_vhost_user_blk_daemon.unwrap(); - // Start the daemon - let (daemon_child, vubd_socket_path) = - prepare_daemon(&guest.tmp_dir, "blk.img", num_queues, readonly, direct); - - ( - format!( - "vhost_user=true,socket={vubd_socket_path},num_queues={num_queues},queue_size=128", - ), - Some(daemon_child), - ) - }; - - let mut child = GuestCommand::new(&guest) - .args(["--cpus", format!("boot={num_queues}").as_str()]) - .args(["--memory", 
"size=512M,hotplug_size=2048M,shared=on"]) - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .args([ - "--disk", - format!( - "path={}", - guest.disk_config.disk(DiskType::OperatingSystem).unwrap() - ) - .as_str(), - format!( - "path={}", - guest.disk_config.disk(DiskType::CloudInit).unwrap() - ) - .as_str(), - blk_params.as_str(), - ]) - .default_net() - .args(["--api-socket", &api_socket]) - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - // Check both if /dev/vdc exists and if the block size is 16M. - assert_eq!( - guest - .ssh_command("lsblk | grep vdc | grep -c 16M") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 1 - ); - - // Check if this block is RO or RW. - assert_eq!( - guest - .ssh_command("lsblk | grep vdc | awk '{print $5}'") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - readonly as u32 - ); - - // Check if the number of queues in /sys/block/vdc/mq matches the - // expected num_queues. - assert_eq!( - guest - .ssh_command("ls -ll /sys/block/vdc/mq | grep ^d | wc -l") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - num_queues as u32 - ); - - // Mount the device - let mount_ro_rw_flag = if readonly { "ro,noload" } else { "rw" }; - guest.ssh_command("mkdir mount_image").unwrap(); - guest - .ssh_command( - format!("sudo mount -o {mount_ro_rw_flag} -t ext4 /dev/vdc mount_image/").as_str(), - ) - .unwrap(); - - // Check the content of the block device. The file "foo" should - // contain "bar". - assert_eq!( - guest.ssh_command("cat mount_image/foo").unwrap().trim(), - "bar" - ); - - // ACPI feature is needed. 
- #[cfg(target_arch = "x86_64")] - { - guest.enable_memory_hotplug(); - - // Add RAM to the VM - let desired_ram = 1024 << 20; - resize_command(&api_socket, None, Some(desired_ram), None, None); - - thread::sleep(std::time::Duration::new(10, 0)); - - assert!(guest.get_total_memory().unwrap_or_default() > 960_000); - - // Check again the content of the block device after the resize - // has been performed. - assert_eq!( - guest.ssh_command("cat mount_image/foo").unwrap().trim(), - "bar" - ); - } - - // Unmount the device - guest.ssh_command("sudo umount /dev/vdc").unwrap(); - guest.ssh_command("rm -r mount_image").unwrap(); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - if let Some(mut daemon_child) = daemon_child { - thread::sleep(std::time::Duration::new(5, 0)); - let _ = daemon_child.kill(); - let _ = daemon_child.wait(); - } - - handle_child_output(r, &output); -} - -fn test_boot_from_vhost_user_blk( - num_queues: usize, - readonly: bool, - direct: bool, - prepare_vhost_user_blk_daemon: Option<&PrepareBlkDaemon>, -) { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - - let kernel_path = direct_kernel_boot_path(); - - let disk_path = guest.disk_config.disk(DiskType::OperatingSystem).unwrap(); - - let (blk_boot_params, daemon_child) = { - let prepare_daemon = prepare_vhost_user_blk_daemon.unwrap(); - // Start the daemon - let (daemon_child, vubd_socket_path) = prepare_daemon( - &guest.tmp_dir, - disk_path.as_str(), - num_queues, - readonly, - direct, - ); - - ( - format!( - "vhost_user=true,socket={vubd_socket_path},num_queues={num_queues},queue_size=128", - ), - Some(daemon_child), - ) - }; - - let mut child = GuestCommand::new(&guest) - .args(["--cpus", format!("boot={num_queues}").as_str()]) - .args(["--memory", "size=512M,shared=on"]) - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .args([ 
- "--disk", - blk_boot_params.as_str(), - format!( - "path={}", - guest.disk_config.disk(DiskType::CloudInit).unwrap() - ) - .as_str(), - ]) - .default_net() - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - // Just check the VM booted correctly. - assert_eq!(guest.get_cpu_count().unwrap_or_default(), num_queues as u32); - assert!(guest.get_total_memory().unwrap_or_default() > 480_000); - }); - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - if let Some(mut daemon_child) = daemon_child { - thread::sleep(std::time::Duration::new(5, 0)); - let _ = daemon_child.kill(); - let _ = daemon_child.wait(); - } - - handle_child_output(r, &output); -} - -fn _test_virtio_fs( - prepare_daemon: &dyn Fn(&TempDir, &str) -> (std::process::Child, String), - hotplug: bool, - pci_segment: Option, -) { - #[cfg(target_arch = "aarch64")] - let focal_image = if hotplug { - FOCAL_IMAGE_UPDATE_KERNEL_NAME.to_string() - } else { - FOCAL_IMAGE_NAME.to_string() - }; - #[cfg(target_arch = "x86_64")] - let focal_image = FOCAL_IMAGE_NAME.to_string(); - let disk_config = UbuntuDiskConfig::new(focal_image); - let guest = Guest::new(Box::new(disk_config)); - let api_socket = temp_api_path(&guest.tmp_dir); - - let mut workload_path = dirs::home_dir().unwrap(); - workload_path.push("workloads"); - - let mut shared_dir = workload_path; - shared_dir.push("shared_dir"); - - #[cfg(target_arch = "x86_64")] - let kernel_path = direct_kernel_boot_path(); - #[cfg(target_arch = "aarch64")] - let kernel_path = if hotplug { - edk2_path() - } else { - direct_kernel_boot_path() - }; - - let (mut daemon_child, virtiofsd_socket_path) = - prepare_daemon(&guest.tmp_dir, shared_dir.to_str().unwrap()); - - let mut guest_command = GuestCommand::new(&guest); - guest_command - .args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M,hotplug_size=2048M,shared=on"]) - .args(["--kernel", kernel_path.to_str().unwrap()]) - 
.args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .default_net() - .args(["--api-socket", &api_socket]); - if pci_segment.is_some() { - guest_command.args([ - "--platform", - &format!("num_pci_segments={MAX_NUM_PCI_SEGMENTS}"), - ]); - } - - let fs_params = format!( - "id=myfs0,tag=myfs,socket={},num_queues=1,queue_size=1024{}", - virtiofsd_socket_path, - if let Some(pci_segment) = pci_segment { - format!(",pci_segment={pci_segment}") - } else { - String::new() - } - ); - - if !hotplug { - guest_command.args(["--fs", fs_params.as_str()]); - } - - let mut child = guest_command.capture_output().spawn().unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - if hotplug { - // Add fs to the VM - let (cmd_success, cmd_output) = - remote_command_w_output(&api_socket, "add-fs", Some(&fs_params)); - assert!(cmd_success); - - if let Some(pci_segment) = pci_segment { - assert!(String::from_utf8_lossy(&cmd_output).contains(&format!( - "{{\"id\":\"myfs0\",\"bdf\":\"{pci_segment:04x}:00:01.0\"}}" - ))); - } else { - assert!( - String::from_utf8_lossy(&cmd_output) - .contains("{\"id\":\"myfs0\",\"bdf\":\"0000:00:06.0\"}") - ); - } - - thread::sleep(std::time::Duration::new(10, 0)); - } - - // Mount shared directory through virtio_fs filesystem - guest - .ssh_command("mkdir -p mount_dir && sudo mount -t virtiofs myfs mount_dir/") - .unwrap(); - - // Check file1 exists and its content is "foo" - assert_eq!( - guest.ssh_command("cat mount_dir/file1").unwrap().trim(), - "foo" - ); - // Check file2 does not exist - guest - .ssh_command("[ ! -f 'mount_dir/file2' ] || true") - .unwrap(); - - // Check file3 exists and its content is "bar" - assert_eq!( - guest.ssh_command("cat mount_dir/file3").unwrap().trim(), - "bar" - ); - - // ACPI feature is needed. 
- #[cfg(target_arch = "x86_64")] - { - guest.enable_memory_hotplug(); - - // Add RAM to the VM - let desired_ram = 1024 << 20; - resize_command(&api_socket, None, Some(desired_ram), None, None); - - thread::sleep(std::time::Duration::new(30, 0)); - assert!(guest.get_total_memory().unwrap_or_default() > 960_000); - - // After the resize, check again that file1 exists and its - // content is "foo". - assert_eq!( - guest.ssh_command("cat mount_dir/file1").unwrap().trim(), - "foo" - ); - } - - if hotplug { - // Remove from VM - guest.ssh_command("sudo umount mount_dir").unwrap(); - assert!(remote_command(&api_socket, "remove-device", Some("myfs0"))); - } - }); - - let (r, hotplug_daemon_child) = if r.is_ok() && hotplug { - thread::sleep(std::time::Duration::new(10, 0)); - let (daemon_child, virtiofsd_socket_path) = - prepare_daemon(&guest.tmp_dir, shared_dir.to_str().unwrap()); - - let r = std::panic::catch_unwind(|| { - thread::sleep(std::time::Duration::new(10, 0)); - let fs_params = format!( - "id=myfs0,tag=myfs,socket={},num_queues=1,queue_size=1024{}", - virtiofsd_socket_path, - if let Some(pci_segment) = pci_segment { - format!(",pci_segment={pci_segment}") - } else { - String::new() - } - ); - - // Add back and check it works - let (cmd_success, cmd_output) = - remote_command_w_output(&api_socket, "add-fs", Some(&fs_params)); - assert!(cmd_success); - if let Some(pci_segment) = pci_segment { - assert!(String::from_utf8_lossy(&cmd_output).contains(&format!( - "{{\"id\":\"myfs0\",\"bdf\":\"{pci_segment:04x}:00:01.0\"}}" - ))); - } else { - assert!( - String::from_utf8_lossy(&cmd_output) - .contains("{\"id\":\"myfs0\",\"bdf\":\"0000:00:06.0\"}") - ); - } - - thread::sleep(std::time::Duration::new(10, 0)); - // Mount shared directory through virtio_fs filesystem - guest - .ssh_command("mkdir -p mount_dir && sudo mount -t virtiofs myfs mount_dir/") - .unwrap(); - - // Check file1 exists and its content is "foo" - assert_eq!( - guest.ssh_command("cat 
mount_dir/file1").unwrap().trim(), - "foo" - ); - }); - - (r, Some(daemon_child)) - } else { - (r, None) - }; - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - let _ = daemon_child.kill(); - let _ = daemon_child.wait(); - - if let Some(mut daemon_child) = hotplug_daemon_child { - let _ = daemon_child.kill(); - let _ = daemon_child.wait(); - } - - handle_child_output(r, &output); -} - -fn test_virtio_pmem(discard_writes: bool, specify_size: bool) { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - - let kernel_path = direct_kernel_boot_path(); - - let pmem_temp_file = TempFile::new().unwrap(); - pmem_temp_file.as_file().set_len(128 << 20).unwrap(); - - std::process::Command::new("mkfs.ext4") - .arg(pmem_temp_file.as_path()) - .output() - .expect("Expect creating disk image to succeed"); - - let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .default_net() - .args([ - "--pmem", - format!( - "file={}{}{}", - pmem_temp_file.as_path().to_str().unwrap(), - if specify_size { ",size=128M" } else { "" }, - if discard_writes { - ",discard_writes=on" - } else { - "" - } - ) - .as_str(), - ]) - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - // Check for the presence of /dev/pmem0 - assert_eq!( - guest.ssh_command("ls /dev/pmem0").unwrap().trim(), - "/dev/pmem0" - ); - - // Check changes persist after reboot - assert_eq!(guest.ssh_command("sudo mount /dev/pmem0 /mnt").unwrap(), ""); - assert_eq!(guest.ssh_command("ls /mnt").unwrap(), "lost+found\n"); - guest - .ssh_command("echo test123 | sudo tee /mnt/test") - .unwrap(); - assert_eq!(guest.ssh_command("sudo umount /mnt").unwrap(), ""); - assert_eq!(guest.ssh_command("ls 
/mnt").unwrap(), ""); - - guest.reboot_linux(0); - assert_eq!(guest.ssh_command("sudo mount /dev/pmem0 /mnt").unwrap(), ""); - assert_eq!( - guest - .ssh_command("sudo cat /mnt/test || true") - .unwrap() - .trim(), - if discard_writes { "" } else { "test123" } - ); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); -} - -fn get_fd_count(pid: u32) -> usize { - fs::read_dir(format!("/proc/{pid}/fd")).unwrap().count() -} - -fn _test_virtio_vsock(hotplug: bool) { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - - #[cfg(target_arch = "x86_64")] - let kernel_path = direct_kernel_boot_path(); - #[cfg(target_arch = "aarch64")] - let kernel_path = if hotplug { - edk2_path() - } else { - direct_kernel_boot_path() - }; - - let socket = temp_vsock_path(&guest.tmp_dir); - let api_socket = temp_api_path(&guest.tmp_dir); - - let mut cmd = GuestCommand::new(&guest); - cmd.args(["--api-socket", &api_socket]); - cmd.args(["--cpus", "boot=1"]); - cmd.args(["--memory", "size=512M"]); - cmd.args(["--kernel", kernel_path.to_str().unwrap()]); - cmd.args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]); - cmd.default_disks(); - cmd.default_net(); - - if !hotplug { - cmd.args(["--vsock", format!("cid=3,socket={socket}").as_str()]); - } - - let mut child = cmd.capture_output().spawn().unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - if hotplug { - let (cmd_success, cmd_output) = remote_command_w_output( - &api_socket, - "add-vsock", - Some(format!("cid=3,socket={socket},id=test0").as_str()), - ); - assert!(cmd_success); - assert!( - String::from_utf8_lossy(&cmd_output) - .contains("{\"id\":\"test0\",\"bdf\":\"0000:00:06.0\"}") - ); - thread::sleep(std::time::Duration::new(10, 0)); - // Check adding a second one fails - assert!(!remote_command( - &api_socket, - "add-vsock", - Some("cid=1234,socket=/tmp/fail") - )); 
- } - - // Validate vsock works as expected. - guest.check_vsock(socket.as_str()); - guest.reboot_linux(0); - // Validate vsock still works after a reboot. - guest.check_vsock(socket.as_str()); - - if hotplug { - assert!(remote_command(&api_socket, "remove-device", Some("test0"))); - } - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); -} - -fn get_ksm_pages_shared() -> u32 { - fs::read_to_string("/sys/kernel/mm/ksm/pages_shared") - .unwrap() - .trim() - .parse::() - .unwrap() -} - -fn test_memory_mergeable(mergeable: bool) { - let memory_param = if mergeable { - "mergeable=on" - } else { - "mergeable=off" - }; - - // We assume the number of shared pages in the rest of the system to be constant - let ksm_ps_init = get_ksm_pages_shared(); - - let disk_config1 = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); - let guest1 = Guest::new(Box::new(disk_config1)); - let mut child1 = GuestCommand::new(&guest1) - .args(["--cpus", "boot=1"]) - .args(["--memory", format!("size=512M,{memory_param}").as_str()]) - .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .args(["--net", guest1.default_net_string().as_str()]) - .args(["--serial", "tty", "--console", "off"]) - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest1.wait_vm_boot().unwrap(); - }); - if r.is_err() { - kill_child(&mut child1); - let output = child1.wait_with_output().unwrap(); - handle_child_output(r, &output); - panic!("Test should already be failed/panicked"); // To explicitly mark this block never return - } - - let ksm_ps_guest1 = get_ksm_pages_shared(); - - let disk_config2 = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); - let guest2 = Guest::new(Box::new(disk_config2)); - let mut child2 = GuestCommand::new(&guest2) - .args(["--cpus", "boot=1"]) - .args(["--memory", 
format!("size=512M,{memory_param}").as_str()]) - .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .args(["--net", guest2.default_net_string().as_str()]) - .args(["--serial", "tty", "--console", "off"]) - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest2.wait_vm_boot().unwrap(); - let ksm_ps_guest2 = get_ksm_pages_shared(); - - if mergeable { - println!( - "ksm pages_shared after vm1 booted '{ksm_ps_guest1}', ksm pages_shared after vm2 booted '{ksm_ps_guest2}'" - ); - // We are expecting the number of shared pages to increase as the number of VM increases - assert!(ksm_ps_guest1 < ksm_ps_guest2); - } else { - assert!(ksm_ps_guest1 == ksm_ps_init); - assert!(ksm_ps_guest2 == ksm_ps_init); - } - }); - - kill_child(&mut child1); - kill_child(&mut child2); - - let output = child1.wait_with_output().unwrap(); - child2.wait().unwrap(); - - handle_child_output(r, &output); -} - -fn _get_vmm_overhead(pid: u32, guest_memory_size: u32) -> HashMap { - let smaps = fs::File::open(format!("/proc/{pid}/smaps")).unwrap(); - let reader = io::BufReader::new(smaps); - - let mut skip_map: bool = false; - let mut region_name: String = String::new(); - let mut region_maps = HashMap::new(); - for line in reader.lines() { - let l = line.unwrap(); - - if l.contains('-') { - let values: Vec<&str> = l.split_whitespace().collect(); - region_name = values.last().unwrap().trim().to_string(); - if region_name == "0" { - region_name = "anonymous".to_string(); - } - } - - // Each section begins with something that looks like: - // Size: 2184 kB - if l.starts_with("Size:") { - let values: Vec<&str> = l.split_whitespace().collect(); - let map_size = values[1].parse::().unwrap(); - // We skip the assigned guest RAM map, its RSS is only - // dependent on the guest actual memory usage. - // Everything else can be added to the VMM overhead. 
- skip_map = map_size >= guest_memory_size; - continue; - } - - // If this is a map we're taking into account, then we only - // count the RSS. The sum of all counted RSS is the VMM overhead. - if !skip_map && l.starts_with("Rss:") { - let values: Vec<&str> = l.split_whitespace().collect(); - let value = values[1].trim().parse::().unwrap(); - *region_maps.entry(region_name.clone()).or_insert(0) += value; - } - } - - region_maps -} - -fn get_vmm_overhead(pid: u32, guest_memory_size: u32) -> u32 { - let mut total = 0; - - for (region_name, value) in &_get_vmm_overhead(pid, guest_memory_size) { - eprintln!("{region_name}: {value}"); - total += value; - } - - total -} - -fn process_rss_kib(pid: u32) -> usize { - let command = format!("ps -q {pid} -o rss="); - let rss = exec_host_command_output(&command); - String::from_utf8_lossy(&rss.stdout).trim().parse().unwrap() -} - -// 10MB is our maximum accepted overhead. -const MAXIMUM_VMM_OVERHEAD_KB: u32 = 10 * 1024; - -#[derive(PartialEq, Eq, PartialOrd)] -struct Counters { - rx_bytes: u64, - rx_frames: u64, - tx_bytes: u64, - tx_frames: u64, - read_bytes: u64, - write_bytes: u64, - read_ops: u64, - write_ops: u64, -} - -fn get_counters(api_socket: &str) -> Counters { - // Get counters - let (cmd_success, cmd_output) = remote_command_w_output(api_socket, "counters", None); - assert!(cmd_success); - - let counters: HashMap<&str, HashMap<&str, u64>> = - serde_json::from_slice(&cmd_output).unwrap_or_default(); - - let rx_bytes = *counters.get("_net2").unwrap().get("rx_bytes").unwrap(); - let rx_frames = *counters.get("_net2").unwrap().get("rx_frames").unwrap(); - let tx_bytes = *counters.get("_net2").unwrap().get("tx_bytes").unwrap(); - let tx_frames = *counters.get("_net2").unwrap().get("tx_frames").unwrap(); - - let read_bytes = *counters.get("_disk0").unwrap().get("read_bytes").unwrap(); - let write_bytes = *counters.get("_disk0").unwrap().get("write_bytes").unwrap(); - let read_ops = 
*counters.get("_disk0").unwrap().get("read_ops").unwrap(); - let write_ops = *counters.get("_disk0").unwrap().get("write_ops").unwrap(); - - Counters { - rx_bytes, - rx_frames, - tx_bytes, - tx_frames, - read_bytes, - write_bytes, - read_ops, - write_ops, - } -} - -fn pty_read(mut pty: std::fs::File) -> Receiver { - let (tx, rx) = mpsc::channel::(); - thread::spawn(move || { - loop { - thread::sleep(std::time::Duration::new(1, 0)); - let mut buf = [0; 512]; - match pty.read(&mut buf) { - Ok(_bytes) => { - let output = std::str::from_utf8(&buf).unwrap().to_string(); - match tx.send(output) { - Ok(_) => (), - Err(_) => break, - } - } - Err(_) => break, - } - } - }); - rx -} - -fn get_pty_path(api_socket: &str, pty_type: &str) -> PathBuf { - let (cmd_success, cmd_output) = remote_command_w_output(api_socket, "info", None); - assert!(cmd_success); - let info: serde_json::Value = serde_json::from_slice(&cmd_output).unwrap_or_default(); - assert_eq!("Pty", info["config"][pty_type]["mode"]); - PathBuf::from( - info["config"][pty_type]["file"] - .as_str() - .expect("Missing pty path"), - ) -} - -// VFIO test network setup. -// We reserve a different IP class for it: 172.18.0.0/24. 
-#[cfg(target_arch = "x86_64")] -fn setup_vfio_network_interfaces() { - // 'vfio-br0' - assert!(exec_host_command_status("sudo ip link add name vfio-br0 type bridge").success()); - assert!(exec_host_command_status("sudo ip link set vfio-br0 up").success()); - assert!(exec_host_command_status("sudo ip addr add 172.18.0.1/24 dev vfio-br0").success()); - // 'vfio-tap0' - assert!(exec_host_command_status("sudo ip tuntap add vfio-tap0 mode tap").success()); - assert!(exec_host_command_status("sudo ip link set vfio-tap0 master vfio-br0").success()); - assert!(exec_host_command_status("sudo ip link set vfio-tap0 up").success()); - // 'vfio-tap1' - assert!(exec_host_command_status("sudo ip tuntap add vfio-tap1 mode tap").success()); - assert!(exec_host_command_status("sudo ip link set vfio-tap1 master vfio-br0").success()); - assert!(exec_host_command_status("sudo ip link set vfio-tap1 up").success()); - // 'vfio-tap2' - assert!(exec_host_command_status("sudo ip tuntap add vfio-tap2 mode tap").success()); - assert!(exec_host_command_status("sudo ip link set vfio-tap2 master vfio-br0").success()); - assert!(exec_host_command_status("sudo ip link set vfio-tap2 up").success()); - // 'vfio-tap3' - assert!(exec_host_command_status("sudo ip tuntap add vfio-tap3 mode tap").success()); - assert!(exec_host_command_status("sudo ip link set vfio-tap3 master vfio-br0").success()); - assert!(exec_host_command_status("sudo ip link set vfio-tap3 up").success()); -} - -// Tear VFIO test network down -#[cfg(target_arch = "x86_64")] -fn cleanup_vfio_network_interfaces() { - assert!(exec_host_command_status("sudo ip link del vfio-br0").success()); - assert!(exec_host_command_status("sudo ip link del vfio-tap0").success()); - assert!(exec_host_command_status("sudo ip link del vfio-tap1").success()); - assert!(exec_host_command_status("sudo ip link del vfio-tap2").success()); - assert!(exec_host_command_status("sudo ip link del vfio-tap3").success()); -} - -fn balloon_size(api_socket: &str) -> 
u64 { - let (cmd_success, cmd_output) = remote_command_w_output(api_socket, "info", None); - assert!(cmd_success); - - let info: serde_json::Value = serde_json::from_slice(&cmd_output).unwrap_or_default(); - let total_mem = &info["config"]["memory"]["size"] - .to_string() - .parse::() - .unwrap(); - let actual_mem = &info["memory_actual_size"] - .to_string() - .parse::() - .unwrap(); - total_mem - actual_mem -} - -fn vm_state(api_socket: &str) -> String { - let (cmd_success, cmd_output) = remote_command_w_output(api_socket, "info", None); - assert!(cmd_success); - - let info: serde_json::Value = serde_json::from_slice(&cmd_output).unwrap_or_default(); - let state = &info["state"].as_str().unwrap(); - - state.to_string() -} - -// This test validates that it can find the virtio-iommu device at first. -// It also verifies that both disks and the network card are attached to -// the virtual IOMMU by looking at /sys/kernel/iommu_groups directory. -// The last interesting part of this test is that it exercises the network -// interface attached to the virtual IOMMU since this is the one used to -// send all commands through SSH. -fn _test_virtio_iommu(acpi: bool) { - // Virtio-iommu support is ready in recent kernel (v5.14). But the kernel in - // Focal image is still old. - // So if ACPI is enabled on AArch64, we use a modified Focal image in which - // the kernel binary has been updated. 
- #[cfg(target_arch = "aarch64")] - let focal_image = FOCAL_IMAGE_UPDATE_KERNEL_NAME.to_string(); - #[cfg(target_arch = "x86_64")] - let focal_image = FOCAL_IMAGE_NAME.to_string(); - let disk_config = UbuntuDiskConfig::new(focal_image); - let guest = Guest::new(Box::new(disk_config)); - - #[cfg(target_arch = "x86_64")] - let kernel_path = direct_kernel_boot_path(); - #[cfg(target_arch = "aarch64")] - let kernel_path = if acpi { - edk2_path() - } else { - direct_kernel_boot_path() - }; - - let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .args([ - "--disk", - format!( - "path={},iommu=on", - guest.disk_config.disk(DiskType::OperatingSystem).unwrap() - ) - .as_str(), - format!( - "path={},iommu=on", - guest.disk_config.disk(DiskType::CloudInit).unwrap() - ) - .as_str(), - ]) - .args(["--net", guest.default_net_string_w_iommu().as_str()]) - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - // Verify the virtio-iommu device is present. - assert!( - guest - .does_device_vendor_pair_match("0x1057", "0x1af4") - .unwrap_or_default() - ); - - // On AArch64, if the guest system boots from FDT, the behavior of IOMMU is a bit - // different with ACPI. - // All devices on the PCI bus will be attached to the virtual IOMMU, except the - // virtio-iommu device itself. So these devices will all be added to IOMMU groups, - // and appear under folder '/sys/kernel/iommu_groups/'. - // The result is, in the case of FDT, IOMMU group '0' contains "0000:00:01.0" - // which is the console. The first disk "0000:00:02.0" is in group '1'. - // While on ACPI, console device is not attached to IOMMU. So the IOMMU group '0' - // contains "0000:00:02.0" which is the first disk. - // - // Verify the iommu group of the first disk. 
- let iommu_group = if acpi { 0 } else { 2 }; - assert_eq!( - guest - .ssh_command(format!("ls /sys/kernel/iommu_groups/{iommu_group}/devices").as_str()) - .unwrap() - .trim(), - "0000:00:02.0" - ); - - // Verify the iommu group of the second disk. - let iommu_group = if acpi { 1 } else { 3 }; - assert_eq!( - guest - .ssh_command(format!("ls /sys/kernel/iommu_groups/{iommu_group}/devices").as_str()) - .unwrap() - .trim(), - "0000:00:03.0" - ); - - // Verify the iommu group of the network card. - let iommu_group = if acpi { 2 } else { 4 }; - assert_eq!( - guest - .ssh_command(format!("ls /sys/kernel/iommu_groups/{iommu_group}/devices").as_str()) - .unwrap() - .trim(), - "0000:00:04.0" - ); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); -} - -fn get_reboot_count(guest: &Guest) -> u32 { - guest - .ssh_command("sudo last | grep -c reboot") - .unwrap() - .trim() - .parse::() - .unwrap_or_default() -} - -fn enable_guest_watchdog(guest: &Guest, watchdog_sec: u32) { - // Check for PCI device - assert!( - guest - .does_device_vendor_pair_match("0x1063", "0x1af4") - .unwrap_or_default() - ); - - // Enable systemd watchdog - guest - .ssh_command(&format!( - "echo RuntimeWatchdogSec={watchdog_sec}s | sudo tee -a /etc/systemd/system.conf" - )) - .unwrap(); - - guest.ssh_command("sudo systemctl daemon-reexec").unwrap(); -} - -fn make_guest_panic(guest: &Guest) { - // Check for pvpanic device - assert!( - guest - .does_device_vendor_pair_match("0x0011", "0x1b36") - .unwrap_or_default() - ); - - // Trigger guest a panic - guest.ssh_command("screen -dmS reboot sh -c \"sleep 5; echo s | tee /proc/sysrq-trigger; echo c | sudo tee /proc/sysrq-trigger\"").unwrap(); -} - -// ivshmem test -// This case validates that read data from host(host write data to ivshmem backend file, -// guest read data from ivshmem pci bar2 memory) -// and write data to host(guest write data to ivshmem pci bar2 memory, host read it from 
-// ivshmem backend file). -// It also checks the size of the shared memory region. -fn _test_ivshmem(guest: &Guest, ivshmem_file_path: impl AsRef, file_size: &str) { - let ivshmem_file_path = ivshmem_file_path.as_ref(); - let test_message_read = String::from("ivshmem device test data read"); - // Modify backend file data before function test - let mut file = OpenOptions::new() - .read(true) - .write(true) - .open(ivshmem_file_path) - .unwrap(); - file.seek(SeekFrom::Start(0)).unwrap(); - file.write_all(test_message_read.as_bytes()).unwrap(); - file.write_all(b"\0").unwrap(); - file.flush().unwrap(); - - let output = fs::read_to_string(ivshmem_file_path).unwrap(); - let nul_pos = output.as_bytes().iter().position(|&b| b == 0).unwrap(); - let c_str = CStr::from_bytes_until_nul(&output.as_bytes()[..=nul_pos]).unwrap(); - let file_message = c_str.to_string_lossy().to_string(); - // Check if the backend file data is correct - assert_eq!(test_message_read, file_message); - - let device_id_line = String::from( - guest - .ssh_command("lspci -D | grep \"Inter-VM shared memory\"") - .unwrap() - .trim(), - ); - // Check if ivshmem exists - assert!(!device_id_line.is_empty()); - let device_id = device_id_line.split(" ").next().unwrap(); - // Check shard memory size - assert_eq!( - guest - .ssh_command( - format!("lspci -vv -s {device_id} | grep -c \"Region 2.*size={file_size}\"") - .as_str(), - ) - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 1 - ); - - // guest don't have gcc or g++, try to use python to test :( - // This python program try to mmap the ivshmem pci bar2 memory and read the data from it. 
- let ivshmem_test_read = format!( - r#" -import os -import mmap -from ctypes import create_string_buffer, c_char, memmove - -if __name__ == "__main__": - device_path = f"/sys/bus/pci/devices/{device_id}/resource2" - fd = os.open(device_path, os.O_RDWR | os.O_SYNC) - - PAGE_SIZE = os.sysconf('SC_PAGESIZE') - - with mmap.mmap(fd, PAGE_SIZE, flags=mmap.MAP_SHARED, - prot=mmap.PROT_READ | mmap.PROT_WRITE, offset=0) as shmem: - c_buf = (c_char * PAGE_SIZE).from_buffer(shmem) - null_pos = c_buf.raw.find(b'\x00') - valid_data = c_buf.raw[:null_pos] if null_pos != -1 else c_buf.raw - print(valid_data.decode('utf-8', errors='replace'), end="") - shmem.flush() - del c_buf - - os.close(fd) - "# - ); - guest - .ssh_command( - format!( - r#"cat << EOF > test_read.py -{ivshmem_test_read} -EOF -"# - ) - .as_str(), - ) - .unwrap(); - let guest_message = guest.ssh_command("sudo python3 test_read.py").unwrap(); - - // Check the probe message in host and guest - assert_eq!(test_message_read, guest_message); - - let test_message_write = "ivshmem device test data write"; - // Then the program writes a test message to the memory and flush it. 
- let ivshmem_test_write = format!( - r#" -import os -import mmap -from ctypes import create_string_buffer, c_char, memmove - -if __name__ == "__main__": - device_path = f"/sys/bus/pci/devices/{device_id}/resource2" - test_message = "{test_message_write}" - fd = os.open(device_path, os.O_RDWR | os.O_SYNC) - - PAGE_SIZE = os.sysconf('SC_PAGESIZE') - - with mmap.mmap(fd, PAGE_SIZE, flags=mmap.MAP_SHARED, - prot=mmap.PROT_READ | mmap.PROT_WRITE, offset=0) as shmem: - shmem.flush() - c_buf = (c_char * PAGE_SIZE).from_buffer(shmem) - encoded_msg = test_message.encode('utf-8').ljust(1000, b'\x00') - memmove(c_buf, encoded_msg, len(encoded_msg)) - shmem.flush() - del c_buf - - os.close(fd) - "# - ); - - guest - .ssh_command( - format!( - r#"cat << EOF > test_write.py -{ivshmem_test_write} -EOF -"# - ) - .as_str(), - ) - .unwrap(); - - let _ = guest.ssh_command("sudo python3 test_write.py").unwrap(); - - let output = fs::read_to_string(ivshmem_file_path).unwrap(); - let nul_pos = output.as_bytes().iter().position(|&b| b == 0).unwrap(); - let c_str = CStr::from_bytes_until_nul(&output.as_bytes()[..=nul_pos]).unwrap(); - let file_message = c_str.to_string_lossy().to_string(); - // Check to send data from guest to host - assert_eq!(test_message_write, file_message); -} - -fn _test_simple_launch(guest: &Guest) { - let event_path = temp_event_monitor_path(&guest.tmp_dir); - - let mut child = GuestCommand::new(guest) - .args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) - .default_kernel_cmdline() - .default_disks() - .default_net() - .args(["--serial", "tty", "--console", "off"]) - .args(["--event-monitor", format!("path={event_path}").as_str()]) - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - assert_eq!(guest.get_cpu_count().unwrap_or_default(), 1); - assert!(guest.get_total_memory().unwrap_or_default() > 480_000); - assert_eq!(guest.get_pci_bridge_class().unwrap_or_default(), "0x060000"); - 
- let expected_sequential_events = [ - &MetaEvent { - event: "starting".to_string(), - device_id: None, - }, - &MetaEvent { - event: "booting".to_string(), - device_id: None, - }, - &MetaEvent { - event: "booted".to_string(), - device_id: None, - }, - &MetaEvent { - event: "activated".to_string(), - device_id: Some("_disk0".to_string()), - }, - &MetaEvent { - event: "reset".to_string(), - device_id: Some("_disk0".to_string()), - }, - ]; - assert!(check_sequential_events( - &expected_sequential_events, - &event_path - )); - - // It's been observed on the Bionic image that udev and snapd - // services can cause some delay in the VM's shutdown. Disabling - // them improves the reliability of this test. - let _ = guest.ssh_command("sudo systemctl disable udev"); - let _ = guest.ssh_command("sudo systemctl stop udev"); - let _ = guest.ssh_command("sudo systemctl disable snapd"); - let _ = guest.ssh_command("sudo systemctl stop snapd"); - - guest.ssh_command("sudo poweroff").unwrap(); - thread::sleep(std::time::Duration::new(20, 0)); - let latest_events = [ - &MetaEvent { - event: "shutdown".to_string(), - device_id: None, - }, - &MetaEvent { - event: "deleted".to_string(), - device_id: None, - }, - &MetaEvent { - event: "shutdown".to_string(), - device_id: None, - }, - ]; - assert!(check_latest_events_exact(&latest_events, &event_path)); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); -} - -mod common_parallel { - use std::cmp; - use std::fs::{File, OpenOptions, copy}; - use std::io::{self, SeekFrom}; - use std::process::Command; - - use block::ImageType; - - use crate::*; - - #[test] - #[cfg(target_arch = "x86_64")] - fn test_focal_hypervisor_fw() { - let disk_config = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); - let mut guest = Guest::new(Box::new(disk_config)); - guest.kernel_path = Some(fw_path(FwType::RustHypervisorFirmware)); - _test_simple_launch(&guest) - } - - #[test] - 
#[cfg(target_arch = "x86_64")] - fn test_focal_ovmf() { - let disk_config = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); - let mut guest = Guest::new(Box::new(disk_config)); - guest.kernel_path = Some(fw_path(FwType::Ovmf)); - _test_simple_launch(&guest) - } - - #[test] - fn test_multi_cpu() { - let jammy_image = JAMMY_IMAGE_NAME.to_string(); - let disk_config = UbuntuDiskConfig::new(jammy_image); - let guest = Guest::new(Box::new(disk_config)); - - let mut cmd = GuestCommand::new(&guest); - cmd.args(["--cpus", "boot=2,max=4"]) - .args(["--memory", "size=512M"]) - .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .capture_output() - .default_disks() - .default_net(); - - let mut child = cmd.spawn().unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - assert_eq!(guest.get_cpu_count().unwrap_or_default(), 2); - - assert_eq!( - guest - .ssh_command( - r#"sudo dmesg | grep "smp: Brought up" | sed "s/\[\ *[0-9.]*\] //""# - ) - .unwrap() - .trim(), - "smp: Brought up 1 node, 2 CPUs" - ); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); - } - - #[test] - fn test_cpu_topology_421() { - test_cpu_topology(4, 2, 1, false); - } - - #[test] - fn test_cpu_topology_142() { - test_cpu_topology(1, 4, 2, false); - } - - #[test] - fn test_cpu_topology_262() { - test_cpu_topology(2, 6, 2, false); - } - - #[test] - #[cfg(target_arch = "x86_64")] - #[cfg(not(feature = "mshv"))] - fn test_cpu_physical_bits() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - let max_phys_bits: u8 = 36; - let mut child = GuestCommand::new(&guest) - .args(["--cpus", &format!("max_phys_bits={max_phys_bits}")]) - .args(["--memory", "size=512M"]) - .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) - .args(["--cmdline", 
DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .default_net() - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - assert!( - guest - .ssh_command("lscpu | grep \"Address sizes:\" | cut -f 2 -d \":\" | sed \"s# *##\" | cut -f 1 -d \" \"") - .unwrap() - .trim() - .parse::() - .unwrap_or(max_phys_bits + 1) <= max_phys_bits, - ); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); - } - - #[test] - fn test_cpu_affinity() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - - // We need the host to have at least 4 CPUs if we want to be able - // to run this test. - let host_cpus_count = exec_host_command_output("nproc"); - assert!( - String::from_utf8_lossy(&host_cpus_count.stdout) - .trim() - .parse::() - .unwrap_or(0) - >= 4 - ); - - let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=2,affinity=[0@[0,2],1@[1,3]]"]) - .args(["--memory", "size=512M"]) - .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .default_net() - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - let pid = child.id(); - let taskset_vcpu0 = exec_host_command_output(format!("taskset -pc $(ps -T -p {pid} | grep vcpu0 | xargs | cut -f 2 -d \" \") | cut -f 6 -d \" \"").as_str()); - assert_eq!(String::from_utf8_lossy(&taskset_vcpu0.stdout).trim(), "0,2"); - let taskset_vcpu1 = exec_host_command_output(format!("taskset -pc $(ps -T -p {pid} | grep vcpu1 | xargs | cut -f 2 -d \" \") | cut -f 6 -d \" \"").as_str()); - assert_eq!(String::from_utf8_lossy(&taskset_vcpu1.stdout).trim(), "1,3"); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - handle_child_output(r, &output); - } - - #[test] - fn 
test_virtio_queue_affinity() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - - // We need the host to have at least 4 CPUs if we want to be able - // to run this test. - let host_cpus_count = exec_host_command_output("nproc"); - assert!( - String::from_utf8_lossy(&host_cpus_count.stdout) - .trim() - .parse::() - .unwrap_or(0) - >= 4 - ); - - let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=4"]) - .args(["--memory", "size=512M"]) - .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .args([ - "--disk", - format!( - "path={}", - guest.disk_config.disk(DiskType::OperatingSystem).unwrap() - ) - .as_str(), - format!( - "path={},num_queues=4,queue_affinity=[0@[0,2],1@[1,3],2@[1],3@[3]]", - guest.disk_config.disk(DiskType::CloudInit).unwrap() - ) - .as_str(), - ]) - .default_net() - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - let pid = child.id(); - let taskset_q0 = exec_host_command_output(format!("taskset -pc $(ps -T -p {pid} | grep disk1_q0 | xargs | cut -f 2 -d \" \") | cut -f 6 -d \" \"").as_str()); - assert_eq!(String::from_utf8_lossy(&taskset_q0.stdout).trim(), "0,2"); - let taskset_q1 = exec_host_command_output(format!("taskset -pc $(ps -T -p {pid} | grep disk1_q1 | xargs | cut -f 2 -d \" \") | cut -f 6 -d \" \"").as_str()); - assert_eq!(String::from_utf8_lossy(&taskset_q1.stdout).trim(), "1,3"); - let taskset_q2 = exec_host_command_output(format!("taskset -pc $(ps -T -p {pid} | grep disk1_q2 | xargs | cut -f 2 -d \" \") | cut -f 6 -d \" \"").as_str()); - assert_eq!(String::from_utf8_lossy(&taskset_q2.stdout).trim(), "1"); - let taskset_q3 = exec_host_command_output(format!("taskset -pc $(ps -T -p {pid} | grep disk1_q3 | xargs | cut -f 2 -d \" \") | cut -f 6 -d \" \"").as_str()); - 
assert_eq!(String::from_utf8_lossy(&taskset_q3.stdout).trim(), "3"); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - handle_child_output(r, &output); - } - - #[test] - #[cfg(not(feature = "mshv"))] - fn test_large_vm() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - let mut cmd = GuestCommand::new(&guest); - cmd.args(["--cpus", "boot=48"]) - .args(["--memory", "size=5120M"]) - .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .args(["--serial", "tty"]) - .args(["--console", "off"]) - .capture_output() - .default_disks() - .default_net(); - - let mut child = cmd.spawn().unwrap(); - - guest.wait_vm_boot().unwrap(); - - let r = std::panic::catch_unwind(|| { - assert_eq!(guest.get_cpu_count().unwrap_or_default(), 48); - assert_eq!( - guest - .ssh_command("lscpu | grep \"On-line\" | cut -f 2 -d \":\" | sed \"s# *##\"") - .unwrap() - .trim(), - "0-47" - ); - - assert!(guest.get_total_memory().unwrap_or_default() > 5_000_000); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); - } - - #[test] - #[cfg(not(feature = "mshv"))] - fn test_huge_memory() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - let mut cmd = GuestCommand::new(&guest); - cmd.args(["--cpus", "boot=1"]) - .args(["--memory", "size=128G"]) - .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .capture_output() - .default_disks() - .default_net(); - - let mut child = cmd.spawn().unwrap(); - - guest.wait_vm_boot().unwrap(); - - let r = std::panic::catch_unwind(|| { - assert!(guest.get_total_memory().unwrap_or_default() > 128_000_000); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - 
handle_child_output(r, &output); - } - - #[test] - fn test_power_button() { - _test_power_button(false); - } - - #[test] - #[cfg(not(feature = "mshv"))] // See #7456 - fn test_user_defined_memory_regions() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - let api_socket = temp_api_path(&guest.tmp_dir); - - let kernel_path = direct_kernel_boot_path(); - - let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) - .args(["--memory", "size=0,hotplug_method=virtio-mem"]) - .args([ - "--memory-zone", - "id=mem0,size=1G,hotplug_size=2G", - "id=mem1,size=1G,shared=on", - "id=mem2,size=1G,host_numa_node=0,hotplug_size=2G", - ]) - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .args(["--api-socket", &api_socket]) - .capture_output() - .default_disks() - .default_net() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - assert!(guest.get_total_memory().unwrap_or_default() > 2_880_000); - - guest.enable_memory_hotplug(); - - resize_zone_command(&api_socket, "mem0", "3G"); - thread::sleep(std::time::Duration::new(5, 0)); - assert!(guest.get_total_memory().unwrap_or_default() > 4_800_000); - resize_zone_command(&api_socket, "mem2", "3G"); - thread::sleep(std::time::Duration::new(5, 0)); - assert!(guest.get_total_memory().unwrap_or_default() > 6_720_000); - resize_zone_command(&api_socket, "mem0", "2G"); - thread::sleep(std::time::Duration::new(5, 0)); - assert!(guest.get_total_memory().unwrap_or_default() > 5_760_000); - resize_zone_command(&api_socket, "mem2", "2G"); - thread::sleep(std::time::Duration::new(5, 0)); - assert!(guest.get_total_memory().unwrap_or_default() > 4_800_000); - - guest.reboot_linux(0); - - // Check the amount of RAM after reboot - assert!(guest.get_total_memory().unwrap_or_default() > 4_800_000); - assert!(guest.get_total_memory().unwrap_or_default() < 
5_760_000); - - // Check if we can still resize down to the initial 'boot'size - resize_zone_command(&api_socket, "mem0", "1G"); - thread::sleep(std::time::Duration::new(5, 0)); - assert!(guest.get_total_memory().unwrap_or_default() < 4_800_000); - resize_zone_command(&api_socket, "mem2", "1G"); - thread::sleep(std::time::Duration::new(5, 0)); - assert!(guest.get_total_memory().unwrap_or_default() < 3_840_000); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); - } - - #[test] - #[cfg(not(feature = "mshv"))] // See #7456 - fn test_guest_numa_nodes() { - _test_guest_numa_nodes(false); - } - - #[test] - #[cfg(target_arch = "x86_64")] - fn test_iommu_segments() { - let focal_image = FOCAL_IMAGE_NAME.to_string(); - let disk_config = UbuntuDiskConfig::new(focal_image); - let guest = Guest::new(Box::new(disk_config)); - - // Prepare another disk file for the virtio-disk device - let test_disk_path = String::from( - guest - .tmp_dir - .as_path() - .join("test-disk.raw") - .to_str() - .unwrap(), - ); - assert!( - exec_host_command_status(format!("truncate {test_disk_path} -s 4M").as_str()).success() - ); - assert!(exec_host_command_status(format!("mkfs.ext4 {test_disk_path}").as_str()).success()); - - let api_socket = temp_api_path(&guest.tmp_dir); - let mut cmd = GuestCommand::new(&guest); - - cmd.args(["--cpus", "boot=1"]) - .args(["--api-socket", &api_socket]) - .args(["--memory", "size=512M"]) - .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .args([ - "--platform", - &format!("num_pci_segments={MAX_NUM_PCI_SEGMENTS},iommu_segments=[1]"), - ]) - .default_disks() - .capture_output() - .default_net(); - - let mut child = cmd.spawn().unwrap(); - - guest.wait_vm_boot().unwrap(); - - let r = std::panic::catch_unwind(|| { - let (cmd_success, cmd_output) = remote_command_w_output( - &api_socket, - "add-disk", - Some( - format!( - 
"path={},id=test0,pci_segment=1,iommu=on", - test_disk_path.as_str() - ) - .as_str(), - ), - ); - assert!(cmd_success); - assert!( - String::from_utf8_lossy(&cmd_output) - .contains("{\"id\":\"test0\",\"bdf\":\"0001:00:01.0\"}") - ); - - // Check IOMMU setup - assert!( - guest - .does_device_vendor_pair_match("0x1057", "0x1af4") - .unwrap_or_default() - ); - assert_eq!( - guest - .ssh_command("ls /sys/kernel/iommu_groups/1/devices") - .unwrap() - .trim(), - "0001:00:01.0" - ); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); - } - - #[test] - fn test_pci_msi() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - let mut cmd = GuestCommand::new(&guest); - cmd.args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) - .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .capture_output() - .default_disks() - .default_net(); - - let mut child = cmd.spawn().unwrap(); - - guest.wait_vm_boot().unwrap(); - - let grep_cmd = format!("grep -c {} /proc/interrupts", get_msi_interrupt_pattern()); - - let r = std::panic::catch_unwind(|| { - assert_eq!( - guest - .ssh_command(&grep_cmd) - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 12 - ); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); - } - - #[test] - fn test_virtio_net_ctrl_queue() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - let mut cmd = GuestCommand::new(&guest); - cmd.args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) - .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .args(["--net", guest.default_net_string_w_mtu(3000).as_str()]) - .capture_output() - 
.default_disks(); - - let mut child = cmd.spawn().unwrap(); - - guest.wait_vm_boot().unwrap(); - - #[cfg(target_arch = "aarch64")] - let iface = "enp0s4"; - #[cfg(target_arch = "x86_64")] - let iface = "ens4"; - - let r = std::panic::catch_unwind(|| { - assert_eq!( - guest - .ssh_command( - format!("sudo ethtool -K {iface} rx-gro-hw off && echo success").as_str() - ) - .unwrap() - .trim(), - "success" - ); - assert_eq!( - guest - .ssh_command(format!("cat /sys/class/net/{iface}/mtu").as_str()) - .unwrap() - .trim(), - "3000" - ); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); - } - - #[test] - fn test_pci_multiple_segments() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - - // Prepare another disk file for the virtio-disk device - let test_disk_path = String::from( - guest - .tmp_dir - .as_path() - .join("test-disk.raw") - .to_str() - .unwrap(), - ); - assert!( - exec_host_command_status(format!("truncate {test_disk_path} -s 4M").as_str()).success() - ); - assert!(exec_host_command_status(format!("mkfs.ext4 {test_disk_path}").as_str()).success()); - - let mut cmd = GuestCommand::new(&guest); - cmd.args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) - .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .args([ - "--platform", - &format!("num_pci_segments={MAX_NUM_PCI_SEGMENTS}"), - ]) - .args([ - "--disk", - format!( - "path={}", - guest.disk_config.disk(DiskType::OperatingSystem).unwrap() - ) - .as_str(), - format!( - "path={}", - guest.disk_config.disk(DiskType::CloudInit).unwrap() - ) - .as_str(), - format!("path={test_disk_path},pci_segment=15,image_type=raw").as_str(), - ]) - .capture_output() - .default_net(); - - let mut child = cmd.spawn().unwrap(); - - guest.wait_vm_boot().unwrap(); - - let grep_cmd = "lspci | grep \"Host bridge\" | 
wc -l"; - - let r = std::panic::catch_unwind(|| { - // There should be MAX_NUM_PCI_SEGMENTS PCI host bridges in the guest. - assert_eq!( - guest - .ssh_command(grep_cmd) - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - MAX_NUM_PCI_SEGMENTS - ); - - // Check both if /dev/vdc exists and if the block size is 4M. - assert_eq!( - guest - .ssh_command("lsblk | grep vdc | grep -c 4M") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 1 - ); - - // Mount the device. - guest.ssh_command("mkdir mount_image").unwrap(); - guest - .ssh_command("sudo mount -o rw -t ext4 /dev/vdc mount_image/") - .unwrap(); - // Grant all users with write permission. - guest.ssh_command("sudo chmod a+w mount_image/").unwrap(); - - // Write something to the device. - guest - .ssh_command("sudo echo \"bar\" >> mount_image/foo") - .unwrap(); - - // Check the content of the block device. The file "foo" should - // contain "bar". - assert_eq!( - guest - .ssh_command("sudo cat mount_image/foo") - .unwrap() - .trim(), - "bar" - ); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); - } - - #[test] - fn test_pci_multiple_segments_numa_node() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - let api_socket = temp_api_path(&guest.tmp_dir); - #[cfg(target_arch = "x86_64")] - let kernel_path = direct_kernel_boot_path(); - #[cfg(target_arch = "aarch64")] - let kernel_path = edk2_path(); - - // Prepare another disk file for the virtio-disk device - let test_disk_path = String::from( - guest - .tmp_dir - .as_path() - .join("test-disk.raw") - .to_str() - .unwrap(), - ); - assert!( - exec_host_command_status(format!("truncate {test_disk_path} -s 4M").as_str()).success() - ); - assert!(exec_host_command_status(format!("mkfs.ext4 {test_disk_path}").as_str()).success()); - const TEST_DISK_NODE: u16 = 1; - - let mut child = GuestCommand::new(&guest) - 
.args(["--platform", "num_pci_segments=2"]) - .args(["--cpus", "boot=2"]) - .args(["--memory", "size=0"]) - .args(["--memory-zone", "id=mem0,size=256M", "id=mem1,size=256M"]) - .args([ - "--numa", - "guest_numa_id=0,cpus=[0],distances=[1@20],memory_zones=mem0,pci_segments=[0]", - "guest_numa_id=1,cpus=[1],distances=[0@20],memory_zones=mem1,pci_segments=[1]", - ]) - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .args(["--api-socket", &api_socket]) - .capture_output() - .args([ - "--disk", - format!( - "path={}", - guest.disk_config.disk(DiskType::OperatingSystem).unwrap() - ) - .as_str(), - format!( - "path={}", - guest.disk_config.disk(DiskType::CloudInit).unwrap() - ) - .as_str(), - format!("path={test_disk_path},pci_segment={TEST_DISK_NODE}").as_str(), - ]) - .default_net() - .spawn() - .unwrap(); - - let cmd = "cat /sys/block/vdc/device/../numa_node"; - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - assert_eq!( - guest - .ssh_command(cmd) - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - TEST_DISK_NODE - ); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); - } - - #[test] - fn test_direct_kernel_boot() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - - let kernel_path = direct_kernel_boot_path(); - - let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .default_net() - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - assert_eq!(guest.get_cpu_count().unwrap_or_default(), 1); - assert!(guest.get_total_memory().unwrap_or_default() > 480_000); - - let grep_cmd = format!("grep 
-c {} /proc/interrupts", get_msi_interrupt_pattern()); - assert_eq!( - guest - .ssh_command(&grep_cmd) - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 12 - ); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); - } - - #[test] - #[cfg(target_arch = "x86_64")] - fn test_direct_kernel_boot_bzimage() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - - let mut kernel_path = direct_kernel_boot_path(); - // Replace the default kernel with the bzImage. - kernel_path.pop(); - kernel_path.push("bzImage-x86_64"); - - let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .default_net() - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - assert_eq!(guest.get_cpu_count().unwrap_or_default(), 1); - assert!(guest.get_total_memory().unwrap_or_default() > 480_000); - - let grep_cmd = "grep -c PCI-MSI /proc/interrupts"; - assert_eq!( - guest - .ssh_command(grep_cmd) - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 12 - ); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); - } - - fn _test_virtio_block( - image_name: &str, - disable_io_uring: bool, - disable_aio: bool, - verify_os_disk: bool, - backing_files: bool, - image_type: ImageType, - ) { - let disk_config = UbuntuDiskConfig::new(image_name.to_string()); - let guest = Guest::new(Box::new(disk_config)); - - let mut workload_path = dirs::home_dir().unwrap(); - workload_path.push("workloads"); - - let mut blk_file_path = workload_path; - blk_file_path.push("blk.img"); - - let kernel_path = direct_kernel_boot_path(); - - let initial_backing_checksum = 
if verify_os_disk { - compute_backing_checksum(guest.disk_config.disk(DiskType::OperatingSystem).unwrap()) - } else { - None - }; - - let mut cloud_child = GuestCommand::new(&guest) - .args(["--cpus", "boot=4"]) - .args(["--memory", "size=512M,shared=on"]) - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .args([ - "--disk", - format!( - "path={},backing_files={},image_type={image_type}", - guest.disk_config.disk(DiskType::OperatingSystem).unwrap(), - if backing_files { "on"} else {"off"}, - ) - .as_str(), - format!( - "path={}", - guest.disk_config.disk(DiskType::CloudInit).unwrap() - ) - .as_str(), - format!( - "path={},readonly=on,direct=on,num_queues=4,_disable_io_uring={},_disable_aio={}", - blk_file_path.to_str().unwrap(), - disable_io_uring, - disable_aio, - ) - .as_str(), - ]) - .default_net() - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - // Check both if /dev/vdc exists and if the block size is 16M. - assert_eq!( - guest - .ssh_command("lsblk | grep vdc | grep -c 16M") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 1 - ); - - // Check both if /dev/vdc exists and if this block is RO. - assert_eq!( - guest - .ssh_command("lsblk | grep vdc | awk '{print $5}'") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 1 - ); - - // Check if the number of queues is 4. - assert_eq!( - guest - .ssh_command("ls -ll /sys/block/vdc/mq | grep ^d | wc -l") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 4 - ); - }); - - if verify_os_disk { - // Use clean shutdown to allow cloud-hypervisor to clear - // the dirty bit in the QCOW2 v3 image. 
- kill_child(&mut cloud_child); - } else { - let _ = cloud_child.kill(); - } - let output = cloud_child.wait_with_output().unwrap(); - - handle_child_output(r, &output); - - if verify_os_disk { - disk_check_consistency( - guest.disk_config.disk(DiskType::OperatingSystem).unwrap(), - initial_backing_checksum, - ); - } - } - - #[test] - fn test_virtio_block_io_uring() { - _test_virtio_block(FOCAL_IMAGE_NAME, false, true, false, false, ImageType::Raw); - } - - #[test] - fn test_virtio_block_aio() { - _test_virtio_block(FOCAL_IMAGE_NAME, true, false, false, false, ImageType::Raw); - } - - #[test] - fn test_virtio_block_sync() { - _test_virtio_block(FOCAL_IMAGE_NAME, true, true, false, false, ImageType::Raw); - } - - fn run_qemu_img(path: &std::path::Path, args: &[&str]) -> std::process::Output { - std::process::Command::new("qemu-img") - .arg(args[0]) - .args(&args[1..]) - .arg(path.to_str().unwrap()) - .output() - .unwrap() - } - - fn get_image_info(path: &std::path::Path) -> Option { - let output = run_qemu_img(path, &["info", "-U", "--output=json"]); - - output.status.success().then(|| ())?; - serde_json::from_slice(&output.stdout).ok() - } - - fn get_qcow2_v3_info(path: &Path) -> Result, String> { - let info = get_image_info(path) - .ok_or_else(|| format!("qemu-img info failed for {}", path.display()))?; - if info["format"].as_str() != Some("qcow2") { - return Ok(None); - } - // QCOW2 v3 has compat "1.1", v2 has "0.10" - if info["format-specific"]["data"]["compat"].as_str() != Some("1.1") { - return Ok(None); - } - Ok(Some(info)) - } - - fn check_dirty_flag(path: &Path) -> Result, String> { - Ok(get_qcow2_v3_info(path)?.and_then(|info| info["dirty-flag"].as_bool())) - } - - fn check_corrupt_flag(path: &Path) -> Result, String> { - Ok(get_qcow2_v3_info(path)? 
- .and_then(|info| info["format-specific"]["data"]["corrupt"].as_bool())) - } - - const QCOW2_INCOMPATIBLE_FEATURES_OFFSET: u64 = 72; - - fn set_corrupt_flag(path: &Path, corrupt: bool) -> io::Result<()> { - let mut file = OpenOptions::new().read(true).write(true).open(path)?; - - file.seek(SeekFrom::Start(QCOW2_INCOMPATIBLE_FEATURES_OFFSET))?; - let mut buf = [0u8; 8]; - file.read_exact(&mut buf)?; - let mut features = u64::from_be_bytes(buf); - - if corrupt { - features |= 0x02; - } else { - features &= !0x02; - } - - file.seek(SeekFrom::Start(QCOW2_INCOMPATIBLE_FEATURES_OFFSET))?; - file.write_all(&features.to_be_bytes())?; - file.sync_all()?; - Ok(()) - } - - fn resolve_disk_path(path_or_image_name: impl AsRef) -> std::path::PathBuf { - if path_or_image_name.as_ref().exists() { - // A full path is provided - path_or_image_name.as_ref().to_path_buf() - } else { - // An image name is provided - let mut workload_path = dirs::home_dir().unwrap(); - workload_path.push("workloads"); - workload_path.as_path().join(path_or_image_name.as_ref()) - } - } - - fn compute_file_checksum(reader: &mut dyn std::io::Read, size: u64) -> u32 { - // Read first 16MB or entire data if smaller - let read_size = cmp::min(size, 16 * 1024 * 1024) as usize; - - let mut buffer = vec![0u8; read_size]; - reader.read_exact(&mut buffer).unwrap(); - - // DJB2 hash - let mut hash: u32 = 5381; - for byte in buffer.iter() { - hash = hash.wrapping_mul(33).wrapping_add(*byte as u32); - } - hash - } - - #[test] - fn test_compute_file_checksum_empty() { - let mut reader = io::Cursor::new(vec![]); - let checksum = compute_file_checksum(&mut reader, 0); - assert_eq!(checksum, 5381); - } - - #[test] - fn test_compute_file_checksum_small() { - let data = b"hello world"; - let mut reader = io::Cursor::new(data); - let checksum = compute_file_checksum(&mut reader, data.len() as u64); - assert_eq!(checksum, 894552257); - } - - #[test] - fn test_compute_file_checksum_same_data() { - let data = b"test data 
123"; - let mut reader1 = io::Cursor::new(data); - let mut reader2 = io::Cursor::new(data); - let checksum1 = compute_file_checksum(&mut reader1, data.len() as u64); - let checksum2 = compute_file_checksum(&mut reader2, data.len() as u64); - assert_eq!(checksum1, checksum2); - } - - #[test] - fn test_compute_file_checksum_different_data() { - let data1 = b"data1"; - let data2 = b"data2"; - let mut reader1 = io::Cursor::new(data1); - let mut reader2 = io::Cursor::new(data2); - let checksum1 = compute_file_checksum(&mut reader1, data1.len() as u64); - let checksum2 = compute_file_checksum(&mut reader2, data2.len() as u64); - assert_ne!(checksum1, checksum2); - } + #[test] + fn test_compute_file_checksum_different_data() { + let data1 = b"data1"; + let data2 = b"data2"; + let mut reader1 = io::Cursor::new(data1); + let mut reader2 = io::Cursor::new(data2); + let checksum1 = compute_file_checksum(&mut reader1, data1.len() as u64); + let checksum2 = compute_file_checksum(&mut reader2, data2.len() as u64); + assert_ne!(checksum1, checksum2); + } #[test] fn test_compute_file_checksum_large_data() { @@ -3683,164 +648,58 @@ mod common_parallel { assert_eq!(position, 16 * 1024 * 1024); } - fn compute_backing_checksum( - path_or_image_name: impl AsRef, - ) -> Option<(std::path::PathBuf, String, u32)> { - let path = resolve_disk_path(path_or_image_name); - - let mut file = File::open(&path).ok()?; - if !matches!( - block::detect_image_type(&mut file).ok()?, - block::ImageType::Qcow2 - ) { - return None; - } - - let info = get_image_info(&path)?; - - let backing_file = info["backing-filename"].as_str()?; - let backing_path = if std::path::Path::new(backing_file).is_absolute() { - std::path::PathBuf::from(backing_file) - } else { - path.parent() - .unwrap_or_else(|| std::path::Path::new(".")) - .join(backing_file) - }; - - let backing_info = get_image_info(&backing_path)?; - let backing_format = backing_info["format"].as_str()?.to_string(); - let mut file = 
File::open(&backing_path).ok()?; - let file_size = file.metadata().ok()?.len(); - let checksum = compute_file_checksum(&mut file, file_size); - - Some((backing_path, backing_format, checksum)) - } - - /// Uses `qemu-img check` to verify disk image consistency. - /// - /// Supported formats are `qcow2` (compressed and uncompressed), - /// `vhdx`, `qed`, `parallels`, `vmdk`, and `vdi`. See man page - /// for more details. - /// - /// It takes either a full path to the image or just the name of - /// the image located in the `workloads` directory. - /// - /// For QCOW2 images with backing files, also verifies the backing file - /// integrity and checks that the backing file hasn't been modified - /// during the test. - /// - /// For QCOW2 v3 images, also verifies the dirty bit is cleared. - fn disk_check_consistency( - path_or_image_name: impl AsRef, - initial_backing_checksum: Option<(std::path::PathBuf, String, u32)>, - ) { - let path = resolve_disk_path(path_or_image_name); - let output = run_qemu_img(&path, &["check"]); - - assert!( - output.status.success(), - "qemu-img check failed: {}", - String::from_utf8_lossy(&output.stderr) - ); - - match check_dirty_flag(&path) { - Ok(Some(dirty)) => { - assert!(!dirty, "QCOW2 image shutdown unclean"); - } - Ok(None) => {} // Not a QCOW2 v3 image, skip dirty flag check - Err(e) => panic!("Failed to check dirty flag: {e}"), - } - - if let Some((backing_path, format, initial_checksum)) = initial_backing_checksum { - if format.parse::().ok() != Some(block::qcow::ImageType::Raw) { - let output = run_qemu_img(&backing_path, &["check"]); - - assert!( - output.status.success(), - "qemu-img check of backing file failed: {}", - String::from_utf8_lossy(&output.stderr) - ); - } - - let mut file = File::open(&backing_path).unwrap(); - let file_size = file.metadata().unwrap().len(); - assert_eq!( - initial_checksum, - compute_file_checksum(&mut file, file_size) - ); - } - } - #[test] fn test_virtio_block_qcow2() { - _test_virtio_block( 
- JAMMY_IMAGE_NAME_QCOW2, - false, - false, - true, - false, - ImageType::Qcow2, - ); + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME_QCOW2.to_string()); + let guest = GuestFactory::new_regular_guest_factory() + .create_guest(Box::new(disk_config)) + .with_cpu(4); + _test_virtio_block(&guest, false, false, true, false, ImageType::Qcow2); } #[test] fn test_virtio_block_qcow2_zlib() { - _test_virtio_block( - JAMMY_IMAGE_NAME_QCOW2_ZLIB, - false, - false, - true, - false, - ImageType::Qcow2, - ); + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME_QCOW2_ZLIB.to_string()); + let guest = GuestFactory::new_regular_guest_factory() + .create_guest(Box::new(disk_config)) + .with_cpu(4); + _test_virtio_block(&guest, false, false, true, false, ImageType::Qcow2); } #[test] fn test_virtio_block_qcow2_zstd() { - _test_virtio_block( + let guest = make_virtio_block_guest( + &GuestFactory::new_regular_guest_factory(), JAMMY_IMAGE_NAME_QCOW2_ZSTD, - false, - false, - true, - false, - ImageType::Qcow2, ); + _test_virtio_block(&guest, false, false, true, false, ImageType::Qcow2); } #[test] fn test_virtio_block_qcow2_backing_zstd_file() { - _test_virtio_block( + let guest = make_virtio_block_guest( + &GuestFactory::new_regular_guest_factory(), JAMMY_IMAGE_NAME_QCOW2_BACKING_ZSTD_FILE, - false, - false, - true, - true, - ImageType::Qcow2, ); + _test_virtio_block(&guest, false, false, true, true, ImageType::Qcow2); } #[test] fn test_virtio_block_qcow2_backing_uncompressed_file() { - _test_virtio_block( + let guest = make_virtio_block_guest( + &GuestFactory::new_regular_guest_factory(), JAMMY_IMAGE_NAME_QCOW2_BACKING_UNCOMPRESSED_FILE, - false, - false, - true, - true, - ImageType::Qcow2, ); + _test_virtio_block(&guest, false, false, true, true, ImageType::Qcow2); } #[test] fn test_virtio_block_qcow2_backing_raw_file() { - _test_virtio_block( + let guest = make_virtio_block_guest( + &GuestFactory::new_regular_guest_factory(), JAMMY_IMAGE_NAME_QCOW2_BACKING_RAW_FILE, - 
false, - false, - true, - true, - ImageType::Qcow2, ); + _test_virtio_block(&guest, false, false, true, true, ImageType::Qcow2); } /// Configuration for QCOW2 multiqueue test image setup @@ -3855,7 +714,7 @@ mod common_parallel { /// /// Creates a VM with multiple virtio queues on the test disk, then runs the /// provided test closure. Handles VM lifecycle and consistency checks. - fn run_multiqueue_qcow2_test(image_config: QcowTestImageConfig, test_fn: F) + fn run_multiqueue_qcow2_test(image_config: &QcowTestImageConfig, test_fn: F) where F: FnOnce(&Guest) + std::panic::UnwindSafe, { @@ -3866,7 +725,7 @@ mod common_parallel { let test_image_path = guest.tmp_dir.as_path().join("test.qcow2"); // Create test image based on configuration and capture backing checksum if applicable - let initial_backing_checksum = match image_config { + let initial_backing_checksum = match *image_config { QcowTestImageConfig::Simple(size) => { Command::new("qemu-img") .arg("create") @@ -3949,7 +808,7 @@ mod common_parallel { #[test] fn test_virtio_block_qcow2_multiqueue_writes() { - run_multiqueue_qcow2_test(QcowTestImageConfig::Simple("256M"), |guest| { + run_multiqueue_qcow2_test(&QcowTestImageConfig::Simple("256M"), |guest| { assert_eq!( guest .ssh_command("ls -ll /sys/block/vdc/mq | grep ^d | wc -l") @@ -4019,7 +878,7 @@ mod common_parallel { #[test] fn test_virtio_block_qcow2_multiqueue_mixed_rw() { - run_multiqueue_qcow2_test(QcowTestImageConfig::Simple("512M"), |guest| { + run_multiqueue_qcow2_test(&QcowTestImageConfig::Simple("512M"), |guest| { guest .ssh_command("sudo mkfs.ext4 -F /dev/vdc") .expect("Failed to format disk"); @@ -4070,7 +929,7 @@ mod common_parallel { #[test] fn test_virtio_block_qcow2_multiqueue_backing() { - run_multiqueue_qcow2_test(QcowTestImageConfig::WithBacking, |guest| { + run_multiqueue_qcow2_test(&QcowTestImageConfig::WithBacking, |guest| { guest .ssh_command("sudo mkfs.ext4 -F /dev/vdc") .expect("Failed to format disk"); @@ -4114,7 +973,7 @@ mod 
common_parallel { #[test] fn test_virtio_block_qcow2_multiqueue_random_4k() { - run_multiqueue_qcow2_test(QcowTestImageConfig::Simple("256M"), |guest| { + run_multiqueue_qcow2_test(&QcowTestImageConfig::Simple("256M"), |guest| { guest .ssh_command( "for i in $(seq 1 8); do \ @@ -4144,7 +1003,7 @@ mod common_parallel { #[test] fn test_virtio_block_qcow2_multiqueue_fsync() { - run_multiqueue_qcow2_test(QcowTestImageConfig::Simple("256M"), |guest| { + run_multiqueue_qcow2_test(&QcowTestImageConfig::Simple("256M"), |guest| { guest .ssh_command("sudo mkfs.ext4 -F /dev/vdc") .expect("Failed to format disk"); @@ -4191,7 +1050,7 @@ mod common_parallel { #[test] fn test_virtio_block_qcow2_multiqueue_metadata() { - run_multiqueue_qcow2_test(QcowTestImageConfig::Simple("256M"), |guest| { + run_multiqueue_qcow2_test(&QcowTestImageConfig::Simple("256M"), |guest| { guest .ssh_command("sudo mkfs.ext4 -F /dev/vdc") .expect("Failed to format disk"); @@ -4270,7 +1129,7 @@ mod common_parallel { #[test] fn test_virtio_block_qcow2_multiqueue_discard_mount() { - run_multiqueue_qcow2_test(QcowTestImageConfig::Simple("256M"), |guest| { + run_multiqueue_qcow2_test(&QcowTestImageConfig::Simple("256M"), |guest| { guest .ssh_command("sudo mkfs.ext4 -F /dev/vdc") .expect("Failed to format disk"); @@ -4333,7 +1192,7 @@ mod common_parallel { } #[test] fn test_virtio_block_qcow2_multiqueue_wide_writes() { - run_multiqueue_qcow2_test(QcowTestImageConfig::Simple("1G"), |guest| { + run_multiqueue_qcow2_test(&QcowTestImageConfig::Simple("1G"), |guest| { // Scattered write pattern - write to widely separated offsets in parallel. // This should initiate many L2 table allocations simultaneously across different queues. 
guest @@ -4373,7 +1232,7 @@ mod common_parallel { #[test] fn test_virtio_block_qcow2_multiqueue_discard_stress() { - run_multiqueue_qcow2_test(QcowTestImageConfig::Simple("512M"), |guest| { + run_multiqueue_qcow2_test(&QcowTestImageConfig::Simple("512M"), |guest| { guest .ssh_command("sudo mkfs.ext4 -F /dev/vdc") .expect("Failed to format disk"); @@ -4417,39 +1276,129 @@ mod common_parallel { "Expected 4 files after round 1" ); - // Round 2: More aggressive - 8 parallel writes with simultaneous blkdiscard on raw device - guest - .ssh_command("sudo umount /mnt/test") - .expect("Failed to unmount"); + // Round 2: More aggressive - 8 parallel writes with simultaneous blkdiscard on raw device + guest + .ssh_command("sudo umount /mnt/test") + .expect("Failed to unmount"); + + guest + .ssh_command( + "for i in $(seq 0 7); do \n\ + offset=$((i * 64)) \n\ + sudo dd if=/dev/urandom of=/dev/vdc bs=1M count=4 seek=$offset conv=notrunc,fsync & \n\ + done; wait", + ) + .expect("Failed sparse writes"); + + // Now discard half the regions while writing to the other half + guest + .ssh_command( + "for i in $(seq 0 3); do \n\ + offset=$((i * 64 * 1024 * 1024)) \n\ + sudo blkdiscard -o $offset -l $((4 * 1024 * 1024)) /dev/vdc & \n\ + done; \n\ + for i in $(seq 4 7); do \n\ + offset=$((i * 64)) \n\ + sudo dd if=/dev/zero of=/dev/vdc bs=1M count=4 seek=$offset conv=notrunc,fsync & \n\ + done; wait", + ) + .expect("Failed parallel discard and write stress test"); + + guest + .ssh_command("sudo dd if=/dev/vdc of=/dev/null bs=1M count=128") + .expect("Failed to read back data after discard stress"); + }); + } + + #[test] + fn test_virtio_block_qcow2_uefi_direct_io() { + // Regression test for #8007. + // Place the QCOW2 OS image on a 4096 byte sector filesystem so + // O_DIRECT forces 4096 byte alignment on all I/O buffers. 
+ let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME_QCOW2.to_string()); + let guest = Guest::new(Box::new(disk_config)); + let kernel_path = edk2_path(); + + let mut workloads_path = dirs::home_dir().unwrap(); + workloads_path.push("workloads"); + let img_dir = TempDir::new_in(workloads_path.as_path()).unwrap(); + let fs_img_path = img_dir.as_path().join("fs_4ksec.img"); + + assert!( + exec_host_command_output(&format!("truncate -s 4G {}", fs_img_path.to_str().unwrap())) + .status + .success(), + "truncate failed" + ); + + let loop_dev_path = create_loop_device(fs_img_path.to_str().unwrap(), 4096, 5); + + assert!( + exec_host_command_output(&format!("mkfs.ext4 -q {loop_dev_path}")) + .status + .success(), + "mkfs.ext4 failed" + ); + + let mnt_dir = img_dir.as_path().join("mnt"); + fs::create_dir_all(&mnt_dir).unwrap(); + assert!( + exec_host_command_output(&format!( + "mount {} {}", + &loop_dev_path, + mnt_dir.to_str().unwrap() + )) + .status + .success(), + "mount failed" + ); + + let src_qcow2 = guest.disk_config.disk(DiskType::OperatingSystem).unwrap(); + let dest_qcow2 = mnt_dir.join("os.qcow2"); + assert!( + exec_host_command_output(&format!( + "cp {} {}", + &src_qcow2, + dest_qcow2.to_str().unwrap() + )) + .status + .success(), + "cp failed" + ); + + let mut child = GuestCommand::new(&guest) + .default_cpus() + .default_memory() + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args([ + "--disk", + &format!( + "path={},direct=on,image_type=qcow2", + dest_qcow2.to_str().unwrap() + ), + &format!( + "path={}", + guest.disk_config.disk(DiskType::CloudInit).unwrap() + ), + ]) + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + }); - guest - .ssh_command( - "for i in $(seq 0 7); do \n\ - offset=$((i * 64)) \n\ - sudo dd if=/dev/urandom of=/dev/vdc bs=1M count=4 seek=$offset conv=notrunc,fsync & \n\ - done; wait", - ) - .expect("Failed sparse writes"); + 
kill_child(&mut child); + let output = child.wait_with_output().unwrap(); - // Now discard half the regions while writing to the other half - guest - .ssh_command( - "for i in $(seq 0 3); do \n\ - offset=$((i * 64 * 1024 * 1024)) \n\ - sudo blkdiscard -o $offset -l $((4 * 1024 * 1024)) /dev/vdc & \n\ - done; \n\ - for i in $(seq 4 7); do \n\ - offset=$((i * 64)) \n\ - sudo dd if=/dev/zero of=/dev/vdc bs=1M count=4 seek=$offset conv=notrunc,fsync & \n\ - done; wait", - ) - .expect("Failed parallel discard and write stress test"); + let _ = exec_host_command_output(&format!("umount {}", mnt_dir.to_str().unwrap())); + let _ = exec_host_command_output(&format!("losetup -d {loop_dev_path}")); - guest - .ssh_command("sudo dd if=/dev/vdc of=/dev/null bs=1M count=128") - .expect("Failed to read back data after discard stress"); - }); + handle_child_output(r, &output); } + #[test] fn test_virtio_block_qcow2_dirty_bit_unclean_shutdown() { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME_QCOW2.to_string()); @@ -4468,8 +1417,8 @@ mod common_parallel { ); let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) + .default_cpus() + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .args([ @@ -4531,8 +1480,8 @@ mod common_parallel { ); let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) + .default_cpus() + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .args([ @@ -4598,8 +1547,8 @@ mod common_parallel { ); let child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) + .default_cpus() + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .args([ @@ -4648,8 +1597,8 @@ mod common_parallel { ); let mut child = 
GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) + .default_cpus() + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .args([ @@ -4672,15 +1621,14 @@ mod common_parallel { let output = child.wait_with_output().unwrap(); let stderr = String::from_utf8_lossy(&output.stderr); panic!( - "VM should not have exited when opening corrupt image as readonly. Exit status: {}, stderr: {}", - status, stderr + "VM should not have exited when opening corrupt image as readonly. Exit status: {status}, stderr: {stderr}" ); } Ok(None) => { // VM is still running as expected } Err(e) => { - panic!("Error checking process status: {}", e); + panic!("Error checking process status: {e}"); } } @@ -4690,8 +1638,7 @@ mod common_parallel { let stderr = String::from_utf8_lossy(&output.stderr); assert!( stderr.contains("QCOW2 image is marked corrupt, opening read-only"), - "Expected warning about corrupt image being opened read-only. stderr: {}", - stderr + "Expected warning about corrupt image being opened read-only. 
stderr: {stderr}" ); assert_eq!( @@ -4722,15 +1669,11 @@ mod common_parallel { .arg(vhd_file_path.to_str().unwrap()) .output() .expect("Expect generating VHD image from RAW image"); - - _test_virtio_block( + let guest = make_virtio_block_guest( + &GuestFactory::new_regular_guest_factory(), FOCAL_IMAGE_NAME_VHD, - false, - false, - false, - false, - ImageType::FixedVhd, ); + _test_virtio_block(&guest, false, false, false, false, ImageType::FixedVhd); } #[test] @@ -4753,103 +1696,17 @@ mod common_parallel { .arg(vhdx_file_path.to_str().unwrap()) .output() .expect("Expect generating dynamic VHDx image from RAW image"); - - _test_virtio_block( + let guest = make_virtio_block_guest( + &GuestFactory::new_regular_guest_factory(), FOCAL_IMAGE_NAME_VHDX, - false, - false, - true, - false, - ImageType::Vhdx, ); + _test_virtio_block(&guest, false, false, true, false, ImageType::Vhdx); } #[test] fn test_virtio_block_dynamic_vhdx_expand() { - const VIRTUAL_DISK_SIZE: u64 = 100 << 20; - const EMPTY_VHDX_FILE_SIZE: u64 = 8 << 20; - const FULL_VHDX_FILE_SIZE: u64 = 112 << 20; - const DYNAMIC_VHDX_NAME: &str = "dynamic.vhdx"; - - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - let kernel_path = direct_kernel_boot_path(); - - let vhdx_pathbuf = guest.tmp_dir.as_path().join(DYNAMIC_VHDX_NAME); - let vhdx_path = vhdx_pathbuf.to_str().unwrap(); - - // Generate a 100 MiB dynamic VHDX file - std::process::Command::new("qemu-img") - .arg("create") - .args(["-f", "vhdx"]) - .arg(vhdx_path) - .arg(VIRTUAL_DISK_SIZE.to_string()) - .output() - .expect("Expect generating dynamic VHDX image"); - - // Check if the size matches with empty VHDx file size - assert_eq!(vhdx_image_size(vhdx_path), EMPTY_VHDX_FILE_SIZE); - - let mut cloud_child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", 
DIRECT_KERNEL_BOOT_CMDLINE]) - .args([ - "--disk", - format!( - "path={}", - guest.disk_config.disk(DiskType::OperatingSystem).unwrap() - ) - .as_str(), - format!( - "path={}", - guest.disk_config.disk(DiskType::CloudInit).unwrap() - ) - .as_str(), - format!("path={vhdx_path}").as_str(), - ]) - .default_net() - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - // Check both if /dev/vdc exists and if the block size is 100 MiB. - assert_eq!( - guest - .ssh_command("lsblk | grep vdc | grep -c 100M") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 1 - ); - - // Write 100 MB of data to the VHDx disk - guest - .ssh_command("sudo dd if=/dev/urandom of=/dev/vdc bs=1M count=100") - .unwrap(); - }); - - // Check if the size matches with expected expanded VHDx file size - assert_eq!(vhdx_image_size(vhdx_path), FULL_VHDX_FILE_SIZE); - - kill_child(&mut cloud_child); - let output = cloud_child.wait_with_output().unwrap(); - - handle_child_output(r, &output); - - disk_check_consistency(vhdx_path, None); - } - - fn vhdx_image_size(disk_name: &str) -> u64 { - std::fs::File::open(disk_name) - .unwrap() - .seek(SeekFrom::End(0)) - .unwrap() + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); + _test_virtio_block_dynamic_vhdx_expand(&guest); } #[test] @@ -4872,8 +1729,8 @@ mod common_parallel { .expect("copying of OS disk failed"); let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) + .default_cpus() + .default_memory() .args(["--kernel", fw_path(FwType::RustHypervisorFirmware).as_str()]) .args([ "--disk", @@ -4965,209 +1822,83 @@ mod common_parallel { } #[test] - #[cfg(not(target_arch = "aarch64"))] - fn test_vhost_user_blk_direct() { - test_vhost_user_blk(1, false, true, Some(&prepare_vubd)); - } - - #[test] - fn test_boot_from_vhost_user_blk_default() { - test_boot_from_vhost_user_blk(1, false, false, Some(&prepare_vubd)); - } - - #[test] - 
#[cfg(target_arch = "x86_64")] - fn test_split_irqchip() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - - let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) - .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .default_net() - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - assert_eq!( - guest - .ssh_command("grep -c IO-APIC.*timer /proc/interrupts || true") - .unwrap() - .trim() - .parse::() - .unwrap_or(1), - 0 - ); - assert_eq!( - guest - .ssh_command("grep -c IO-APIC.*cascade /proc/interrupts || true") - .unwrap() - .trim() - .parse::() - .unwrap_or(1), - 0 - ); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); - } - - #[test] - #[cfg(target_arch = "x86_64")] - fn test_dmi_serial_number() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - - let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) - .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .args(["--platform", "serial_number=a=b;c=d"]) - .default_disks() - .default_net() - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - assert_eq!( - guest - .ssh_command("sudo cat /sys/class/dmi/id/product_serial") - .unwrap() - .trim(), - "a=b;c=d" - ); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); - } - - #[test] - #[cfg(target_arch = "x86_64")] - fn test_dmi_uuid() { - let disk_config = 
UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - - let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) - .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .args(["--platform", "uuid=1e8aa28a-435d-4027-87f4-40dceff1fa0a"]) - .default_disks() - .default_net() - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - assert_eq!( - guest - .ssh_command("sudo cat /sys/class/dmi/id/product_uuid") - .unwrap() - .trim(), - "1e8aa28a-435d-4027-87f4-40dceff1fa0a" - ); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); - } - - #[test] - #[cfg(target_arch = "x86_64")] - fn test_dmi_oem_strings() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - - let s1 = "io.systemd.credential:xx=yy"; - let s2 = "This is a test string"; - - let oem_strings = format!("oem_strings=[{s1},{s2}]"); - - let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) - .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .args(["--platform", &oem_strings]) - .default_disks() - .default_net() - .capture_output() - .spawn() - .unwrap(); + #[cfg(not(target_arch = "aarch64"))] + fn test_vhost_user_blk_direct() { + test_vhost_user_blk(1, false, true, Some(&prepare_vubd)); + } - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); + #[test] + fn test_boot_from_vhost_user_blk_default() { + test_boot_from_vhost_user_blk(1, false, false, Some(&prepare_vubd)); + } - assert_eq!( - guest - .ssh_command("sudo dmidecode --oem-string count") - .unwrap() - .trim(), - "2" - ); + #[test] + #[cfg(target_arch = 
"x86_64")] + fn test_split_irqchip() { + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); + _test_split_irqchip(&guest); + } - assert_eq!( - guest - .ssh_command("sudo dmidecode --oem-string 1") - .unwrap() - .trim(), - s1 - ); + #[test] + #[cfg(target_arch = "x86_64")] + fn test_dmi_serial_number() { + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); - assert_eq!( - guest - .ssh_command("sudo dmidecode --oem-string 2") - .unwrap() - .trim(), - s2 - ); - }); + _test_dmi_serial_number(&guest); + } - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); + #[test] + #[cfg(target_arch = "x86_64")] + fn test_dmi_uuid() { + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); + _test_dmi_uuid(&guest); + } - handle_child_output(r, &output); + #[test] + #[cfg(target_arch = "x86_64")] + fn test_dmi_oem_strings() { + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); + _test_dmi_oem_strings(&guest); } #[test] fn test_virtio_fs() { - _test_virtio_fs(&prepare_virtiofsd, false, None); + _test_virtio_fs(&prepare_virtiofsd, false, false, None); } #[test] fn test_virtio_fs_hotplug() { - _test_virtio_fs(&prepare_virtiofsd, true, None); + _test_virtio_fs(&prepare_virtiofsd, true, false, None); } #[test] fn test_virtio_fs_multi_segment_hotplug() { - _test_virtio_fs(&prepare_virtiofsd, true, Some(15)); + _test_virtio_fs(&prepare_virtiofsd, true, false, Some(15)); } #[test] fn test_virtio_fs_multi_segment() { - _test_virtio_fs(&prepare_virtiofsd, false, Some(15)); + _test_virtio_fs(&prepare_virtiofsd, false, false, Some(15)); + } + + #[test] + fn test_generic_vhost_user() { + _test_virtio_fs(&prepare_virtiofsd, false, true, None); + } + + #[test] + fn test_generic_vhost_user_hotplug() { + _test_virtio_fs(&prepare_virtiofsd, true, true, None); + } + + #[test] + fn test_generic_vhost_user_multi_segment_hotplug() { + _test_virtio_fs(&prepare_virtiofsd, true, true, Some(15)); + } + + #[test] + fn test_generic_vhost_user_multi_segment() { + 
_test_virtio_fs(&prepare_virtiofsd, false, true, Some(15)); } #[test] @@ -5188,8 +1919,8 @@ mod common_parallel { let kernel_path = direct_kernel_boot_path(); let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) + .default_cpus() + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args([ "--disk", @@ -5237,49 +1968,8 @@ mod common_parallel { #[test] fn test_multiple_network_interfaces() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - - let kernel_path = direct_kernel_boot_path(); - - let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .args([ - "--net", - guest.default_net_string().as_str(), - "tap=,mac=8a:6b:6f:5a:de:ac,ip=192.168.3.1,mask=255.255.255.128", - "tap=mytap1,mac=fe:1f:9e:e1:60:f2,ip=192.168.4.1,mask=255.255.255.128", - ]) - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - let tap_count = exec_host_command_output("ip link | grep -c mytap1"); - assert_eq!(String::from_utf8_lossy(&tap_count.stdout).trim(), "1"); - - // 3 network interfaces + default localhost ==> 4 interfaces - assert_eq!( - guest - .ssh_command("ip -o link | wc -l") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 4 - ); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); + _test_multiple_network_interfaces(&guest); } #[test] @@ -5288,8 +1978,8 @@ mod common_parallel { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); let guest = Guest::new(Box::new(disk_config)); let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) - 
.args(["--memory", "size=512M"]) + .default_cpus() + .default_memory() .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .default_disks() @@ -5321,39 +2011,8 @@ mod common_parallel { #[test] fn test_serial_off() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) - .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .default_net() - .args(["--serial", "off"]) - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - // Test that there is no ttyS0 - assert_eq!( - guest - .ssh_command(GREP_SERIAL_IRQ_CMD) - .unwrap() - .trim() - .parse::() - .unwrap_or(1), - 0 - ); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); + _test_serial_off(&guest); } #[test] @@ -5366,13 +2025,13 @@ mod common_parallel { #[cfg(target_arch = "aarch64")] let console_str: &str = "console=ttyAMA0"; - cmd.args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) + cmd.default_cpus() + .default_memory() .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args([ "--cmdline", DIRECT_KERNEL_BOOT_CMDLINE - .replace("console=hvc0 ", console_str) + .replace("console=hvc0", console_str) .as_str(), ]) .default_disks() @@ -5422,13 +2081,13 @@ mod common_parallel { let console_str: &str = "console=ttyAMA0"; let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) + .default_cpus() + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args([ "--cmdline", DIRECT_KERNEL_BOOT_CMDLINE - .replace("console=hvc0 ", 
console_str) + .replace("console=hvc0", console_str) .as_str(), ]) .default_disks() @@ -5480,13 +2139,13 @@ mod common_parallel { let console_str: &str = "console=ttyAMA0"; let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) + .default_cpus() + .default_memory() .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args([ "--cmdline", DIRECT_KERNEL_BOOT_CMDLINE - .replace("console=hvc0 ", console_str) + .replace("console=hvc0", console_str) .as_str(), ]) .default_disks() @@ -5549,8 +2208,8 @@ mod common_parallel { let cmdline = DIRECT_KERNEL_BOOT_CMDLINE.to_owned() + serial_option; let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) + .default_cpus() + .default_memory() .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", &cmdline]) .default_disks() @@ -5596,8 +2255,8 @@ mod common_parallel { let cmdline = DIRECT_KERNEL_BOOT_CMDLINE.to_owned() + serial_option; let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) + .default_cpus() + .default_memory() .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", &cmdline]) .default_disks() @@ -5654,98 +2313,14 @@ mod common_parallel { #[test] fn test_virtio_console() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - - let kernel_path = direct_kernel_boot_path(); - - let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .default_net() - .args(["--console", "tty"]) - .args(["--serial", "null"]) - .capture_output() - .spawn() - .unwrap(); - - let text = String::from("On a branch floating down river a cricket, singing."); - let cmd = 
format!("echo {text} | sudo tee /dev/hvc0"); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - assert!( - guest - .does_device_vendor_pair_match("0x1043", "0x1af4") - .unwrap_or_default() - ); - - guest.ssh_command(&cmd).unwrap(); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - handle_child_output(r, &output); - - let r = std::panic::catch_unwind(|| { - assert!(String::from_utf8_lossy(&output.stdout).contains(&text)); - }); - - handle_child_output(r, &output); + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); + _test_virtio_console(&guest); } #[test] fn test_console_file() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - - let console_path = guest.tmp_dir.as_path().join("console-output"); - let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) - .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .default_net() - .args([ - "--console", - format!("file={}", console_path.to_str().unwrap()).as_str(), - ]) - .capture_output() - .spawn() - .unwrap(); - - guest.wait_vm_boot().unwrap(); - - guest.ssh_command("sudo shutdown -h now").unwrap(); - - let _ = child.wait_timeout(std::time::Duration::from_secs(20)); - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - let r = std::panic::catch_unwind(|| { - // Check that the cloud-hypervisor binary actually terminated - assert!(output.status.success()); - - // Do this check after shutdown of the VM as an easy way to ensure - // all writes are flushed to disk - let mut f = std::fs::File::open(console_path).unwrap(); - let mut buf = String::new(); - f.read_to_string(&mut buf).unwrap(); - - if !buf.contains(CONSOLE_TEST_STRING) { - eprintln!( - "\n\n==== Console file output ====\n\n{buf}\n\n==== End console file output ====" 
- ); - } - assert!(buf.contains(CONSOLE_TEST_STRING)); - }); - - handle_child_output(r, &output); + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); + _test_console_file(&guest); } #[test] @@ -5859,11 +2434,28 @@ mod common_parallel { .spawn() .unwrap(); - thread::sleep(std::time::Duration::new(30, 0)); + guest.wait_for_ssh(Duration::from_secs(30)).unwrap(); let r = std::panic::catch_unwind(|| { guest.ssh_command_l1("sudo systemctl start vfio").unwrap(); - thread::sleep(std::time::Duration::new(120, 0)); + let auth = PasswordAuth { + username: String::from("cloud"), + password: String::from("cloud123"), + }; + wait_for_ssh( + "true", + &auth, + &guest.network.l2_guest_ip1, + Duration::from_secs(120), + ) + .unwrap(); + wait_for_ssh( + "true", + &auth, + &guest.network.l2_guest_ip2, + Duration::from_secs(120), + ) + .unwrap(); // We booted our cloud hypervisor L2 guest with a "VFIOTAG" tag // added to its kernel command line. @@ -5922,7 +2514,18 @@ mod common_parallel { 1 )); - thread::sleep(std::time::Duration::new(10, 0)); + wait_for_ssh( + "true", + &auth, + &guest.network.l2_guest_ip3, + Duration::from_secs(10), + ) + .unwrap(); + assert!(wait_until(Duration::from_secs(10), || { + guest + .ssh_command_l2_1("ls /sys/bus/pci/devices") + .is_ok_and(|output| check_lines_count(output.trim(), 9)) + })); // Let's also verify from the third virtio-net device passed to // the L2 VM. This third device has been hotplugged through the L2 @@ -5954,7 +2557,11 @@ mod common_parallel { remove-device vfio123", ) .unwrap(); - thread::sleep(std::time::Duration::new(10, 0)); + assert!(wait_until(Duration::from_secs(10), || { + guest + .ssh_command_l2_1("ls /sys/bus/pci/devices") + .is_ok_and(|output| check_lines_count(output.trim(), 8)) + })); // Check the amount of PCI devices appearing in L2 VM is back down // to 8 devices. 
@@ -5994,54 +2601,32 @@ mod common_parallel { handle_child_output(r, &output); } - #[test] - fn test_direct_kernel_boot_noacpi() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - - let kernel_path = direct_kernel_boot_path(); - - let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args([ - "--cmdline", - format!("{DIRECT_KERNEL_BOOT_CMDLINE} acpi=off").as_str(), - ]) - .default_disks() - .default_net() - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - assert_eq!(guest.get_cpu_count().unwrap_or_default(), 1); - assert!(guest.get_total_memory().unwrap_or_default() > 480_000); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); + #[test] + fn test_direct_kernel_boot_noacpi() { + let mut guest = basic_regular_guest!(JAMMY_IMAGE_NAME); + guest.kernel_cmdline = Some(format!("{DIRECT_KERNEL_BOOT_CMDLINE} acpi=off")); + _test_direct_kernel_boot_noacpi(&guest); } #[test] fn test_virtio_vsock() { - _test_virtio_vsock(false); + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); + _test_virtio_vsock(&guest, false); } #[test] fn test_virtio_vsock_hotplug() { - _test_virtio_vsock(true); + #[cfg(target_arch = "x86_64")] + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); + #[cfg(target_arch = "aarch64")] + let guest = + basic_regular_guest!(JAMMY_IMAGE_NAME).with_kernel_path(edk2_path().to_str().unwrap()); + _test_virtio_vsock(&guest, true); } #[test] fn test_api_http_shutdown() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME).with_cpu(4); let target_api = TargetApi::new_http_api(&guest.tmp_dir); _test_api_shutdown(&target_api, 
&guest); @@ -6049,8 +2634,7 @@ mod common_parallel { #[test] fn test_api_http_delete() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME).with_cpu(4); let target_api = TargetApi::new_http_api(&guest.tmp_dir); _test_api_delete(&target_api, &guest); @@ -6058,8 +2642,7 @@ mod common_parallel { #[test] fn test_api_http_pause_resume() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME).with_cpu(4); let target_api = TargetApi::new_http_api(&guest.tmp_dir); _test_api_pause_resume(&target_api, &guest); @@ -6067,8 +2650,7 @@ mod common_parallel { #[test] fn test_api_http_create_boot() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME).with_cpu(4); let target_api = TargetApi::new_http_api(&guest.tmp_dir); _test_api_create_boot(&target_api, &guest); @@ -6090,97 +2672,12 @@ mod common_parallel { // properly probed first, then removing it, and adding it again by doing a // rescan. 
fn test_pci_bar_reprogramming() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - - #[cfg(target_arch = "x86_64")] - let kernel_path = direct_kernel_boot_path(); #[cfg(target_arch = "aarch64")] - let kernel_path = edk2_path(); - - let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .args([ - "--net", - guest.default_net_string().as_str(), - "tap=,mac=8a:6b:6f:5a:de:ac,ip=192.168.3.1,mask=255.255.255.128", - ]) - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - // 2 network interfaces + default localhost ==> 3 interfaces - assert_eq!( - guest - .ssh_command("ip -o link | wc -l") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 3 - ); - - let init_bar_addr = guest - .ssh_command( - "sudo awk '{print $1; exit}' /sys/bus/pci/devices/0000:00:05.0/resource", - ) - .unwrap(); - - // Remove the PCI device - guest - .ssh_command("echo 1 | sudo tee /sys/bus/pci/devices/0000:00:05.0/remove") - .unwrap(); - - // Only 1 network interface left + default localhost ==> 2 interfaces - assert_eq!( - guest - .ssh_command("ip -o link | wc -l") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 2 - ); - - // Remove the PCI device - guest - .ssh_command("echo 1 | sudo tee /sys/bus/pci/rescan") - .unwrap(); - - // Back to 2 network interface + default localhost ==> 3 interfaces - assert_eq!( - guest - .ssh_command("ip -o link | wc -l") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 3 - ); - - let new_bar_addr = guest - .ssh_command( - "sudo awk '{print $1; exit}' /sys/bus/pci/devices/0000:00:05.0/resource", - ) - .unwrap(); - - // Let's compare the BAR addresses for our virtio-net device. 
- // They should be different as we expect the BAR reprogramming - // to have happened. - assert_ne!(init_bar_addr, new_bar_addr); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); + let guest = + basic_regular_guest!(JAMMY_IMAGE_NAME).with_kernel_path(edk2_path().to_str().unwrap()); + #[cfg(target_arch = "x86_64")] + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); + _test_pci_bar_reprogramming(&guest); } #[test] @@ -6201,12 +2698,12 @@ mod common_parallel { let mut child = GuestCommand::new(&guest) .args(["--cpus", "boot=2,max=4"]) - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args([ "--cmdline", DIRECT_KERNEL_BOOT_CMDLINE - .replace("console=hvc0 ", console_str) + .replace("console=hvc0", console_str) .as_str(), ]) .args(["--serial", "tty"]) @@ -6233,11 +2730,9 @@ mod common_parallel { guest .ssh_command("echo 1 | sudo tee /sys/bus/cpu/devices/cpu3/online") .unwrap(); - thread::sleep(std::time::Duration::new(10, 0)); - assert_eq!( - guest.get_cpu_count().unwrap_or_default(), - u32::from(desired_vcpus) - ); + assert!(wait_until(Duration::from_secs(10), || { + guest.get_cpu_count().unwrap_or_default() == u32::from(desired_vcpus) + })); guest.reboot_linux(0); @@ -6250,11 +2745,9 @@ mod common_parallel { let desired_vcpus = 2; resize_command(&api_socket, Some(desired_vcpus), None, None, None); - thread::sleep(std::time::Duration::new(10, 0)); - assert_eq!( - guest.get_cpu_count().unwrap_or_default(), - u32::from(desired_vcpus) - ); + assert!(wait_until(Duration::from_secs(10), || { + guest.get_cpu_count().unwrap_or_default() == u32::from(desired_vcpus) + })); // Resize the VM back up to 4 let desired_vcpus = 4; @@ -6266,11 +2759,9 @@ mod common_parallel { guest .ssh_command("echo 1 | sudo tee /sys/bus/cpu/devices/cpu3/online") .unwrap(); - thread::sleep(std::time::Duration::new(10, 0)); - assert_eq!( - 
guest.get_cpu_count().unwrap_or_default(), - u32::from(desired_vcpus) - ); + assert!(wait_until(Duration::from_secs(10), || { + guest.get_cpu_count().unwrap_or_default() == u32::from(desired_vcpus) + })); }); kill_child(&mut child); @@ -6318,16 +2809,18 @@ mod common_parallel { let desired_ram = 1024 << 20; resize_command(&api_socket, None, Some(desired_ram), None, None); - thread::sleep(std::time::Duration::new(10, 0)); - assert!(guest.get_total_memory().unwrap_or_default() > 960_000); + assert!(wait_until(Duration::from_secs(10), || { + guest.get_total_memory().unwrap_or_default() > 960_000 + })); // Use balloon to remove RAM from the VM let desired_balloon = 512 << 20; resize_command(&api_socket, None, None, Some(desired_balloon), None); - thread::sleep(std::time::Duration::new(10, 0)); - assert!(guest.get_total_memory().unwrap_or_default() > 480_000); - assert!(guest.get_total_memory().unwrap_or_default() < 960_000); + assert!(wait_until(Duration::from_secs(10), || { + let total_memory = guest.get_total_memory().unwrap_or_default(); + total_memory > 480_000 && total_memory < 960_000 + })); guest.reboot_linux(0); @@ -6337,9 +2830,9 @@ mod common_parallel { let desired_balloon = 0; resize_command(&api_socket, None, None, Some(desired_balloon), None); - thread::sleep(std::time::Duration::new(10, 0)); - - assert!(guest.get_total_memory().unwrap_or_default() > 960_000); + assert!(wait_until(Duration::from_secs(10), || { + guest.get_total_memory().unwrap_or_default() > 960_000 + })); guest.enable_memory_hotplug(); @@ -6347,8 +2840,9 @@ mod common_parallel { let desired_ram = 2048 << 20; resize_command(&api_socket, None, Some(desired_ram), None, None); - thread::sleep(std::time::Duration::new(10, 0)); - assert!(guest.get_total_memory().unwrap_or_default() > 1_920_000); + assert!(wait_until(Duration::from_secs(10), || { + guest.get_total_memory().unwrap_or_default() > 1_920_000 + })); // Remove RAM to the VM (only applies after reboot) let desired_ram = 1024 << 20; @@ 
-6401,23 +2895,26 @@ mod common_parallel { let desired_ram = 1024 << 20; resize_command(&api_socket, None, Some(desired_ram), None, None); - thread::sleep(std::time::Duration::new(10, 0)); - assert!(guest.get_total_memory().unwrap_or_default() > 960_000); + assert!(wait_until(Duration::from_secs(10), || { + guest.get_total_memory().unwrap_or_default() > 960_000 + })); // Add RAM to the VM let desired_ram = 2048 << 20; resize_command(&api_socket, None, Some(desired_ram), None, None); - thread::sleep(std::time::Duration::new(10, 0)); - assert!(guest.get_total_memory().unwrap_or_default() > 1_920_000); + assert!(wait_until(Duration::from_secs(10), || { + guest.get_total_memory().unwrap_or_default() > 1_920_000 + })); // Remove RAM from the VM let desired_ram = 1024 << 20; resize_command(&api_socket, None, Some(desired_ram), None, None); - thread::sleep(std::time::Duration::new(10, 0)); - assert!(guest.get_total_memory().unwrap_or_default() > 960_000); - assert!(guest.get_total_memory().unwrap_or_default() < 1_920_000); + assert!(wait_until(Duration::from_secs(10), || { + let total_memory = guest.get_total_memory().unwrap_or_default(); + total_memory > 960_000 && total_memory < 1_920_000 + })); guest.reboot_linux(0); @@ -6428,9 +2925,10 @@ mod common_parallel { // Check we can still resize to 512MiB let desired_ram = 512 << 20; resize_command(&api_socket, None, Some(desired_ram), None, None); - thread::sleep(std::time::Duration::new(10, 0)); - assert!(guest.get_total_memory().unwrap_or_default() > 480_000); - assert!(guest.get_total_memory().unwrap_or_default() < 960_000); + assert!(wait_until(Duration::from_secs(10), || { + let total_memory = guest.get_total_memory().unwrap_or_default(); + total_memory > 480_000 && total_memory < 960_000 + })); }); kill_child(&mut child); @@ -6486,11 +2984,9 @@ mod common_parallel { guest .ssh_command("echo 1 | sudo tee /sys/bus/cpu/devices/cpu3/online") .unwrap(); - thread::sleep(std::time::Duration::new(10, 0)); - assert_eq!( - 
guest.get_cpu_count().unwrap_or_default(), - u32::from(desired_vcpus) - ); + assert!(wait_until(Duration::from_secs(10), || { + guest.get_cpu_count().unwrap_or_default() == u32::from(desired_vcpus) + })); assert!(guest.get_total_memory().unwrap_or_default() > 960_000); }); @@ -6503,36 +2999,10 @@ mod common_parallel { #[test] fn test_memory_overhead() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - - let kernel_path = direct_kernel_boot_path(); - - let guest_memory_size_kb = 512 * 1024; - - let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) - .args(["--memory", format!("size={guest_memory_size_kb}K").as_str()]) - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_net() - .default_disks() - .capture_output() - .spawn() - .unwrap(); - - guest.wait_vm_boot().unwrap(); - - let r = std::panic::catch_unwind(|| { - let overhead = get_vmm_overhead(child.id(), guest_memory_size_kb); - eprintln!("Guest memory overhead: {overhead} vs {MAXIMUM_VMM_OVERHEAD_KB}"); - assert!(overhead <= MAXIMUM_VMM_OVERHEAD_KB); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); + let guest_memory_size_kb: u32 = 512 * 1024; + let guest = + basic_regular_guest!(JAMMY_IMAGE_NAME).with_memory(&format!("{guest_memory_size_kb}K")); + _test_memory_overhead(&guest, guest_memory_size_kb); } #[test] @@ -6541,6 +3011,30 @@ mod common_parallel { // the path for the hotplug disk is not pre-added to Landlock rules, this // the test will result in a failure. 
fn test_landlock() { + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); + _test_landlock(&guest); + } + + #[test] + fn test_disk_hotplug() { + #[cfg(target_arch = "x86_64")] + let kernel_path = direct_kernel_boot_path(); + #[cfg(target_arch = "aarch64")] + let kernel_path = edk2_path(); + let guest = + basic_regular_guest!(JAMMY_IMAGE_NAME).with_kernel_path(kernel_path.to_str().unwrap()); + _test_disk_hotplug(&guest, false); + } + + #[test] + #[cfg(target_arch = "x86_64")] + fn test_disk_hotplug_with_landlock() { + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); + _test_disk_hotplug(&guest, true); + } + + #[test] + fn test_disk_resize() { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); let guest = Guest::new(Box::new(disk_config)); @@ -6551,59 +3045,106 @@ mod common_parallel { let api_socket = temp_api_path(&guest.tmp_dir); - let mut child = GuestCommand::new(&guest) - .args(["--api-socket", &api_socket]) - .args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) + // Create a disk image that we can write to + assert!( + exec_host_command_output("sudo dd if=/dev/zero of=/tmp/resize.img bs=1M count=16") + .status + .success() + ); + + let mut cmd = GuestCommand::new(&guest); + + cmd.args(["--api-socket", &api_socket]) + .default_cpus() + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .args(["--landlock"]) .default_disks() .default_net() - .capture_output() - .spawn() - .unwrap(); + .capture_output(); + + let mut child = cmd.spawn().unwrap(); let r = std::panic::catch_unwind(|| { guest.wait_vm_boot().unwrap(); - // Check /dev/vdc is not there + // Add the disk to the VM + let (cmd_success, cmd_output, _) = remote_command_w_output( + &api_socket, + "add-disk", + Some("path=/tmp/resize.img,id=test0"), + ); + + assert!(cmd_success); + assert!( + String::from_utf8_lossy(&cmd_output) + .contains("{\"id\":\"test0\",\"bdf\":\"0000:00:06.0\"}") + ); + + // Check that 
/dev/vdc exists and the block size is 16M. assert_eq!( guest - .ssh_command("lsblk | grep -c vdc.*16M || true") + .ssh_command("lsblk | grep vdc | grep -c 16M") .unwrap() .trim() .parse::() - .unwrap_or(1), - 0 + .unwrap_or_default(), + 1 ); + // And check the block device can be written to. + guest + .ssh_command("sudo dd if=/dev/zero of=/dev/vdc bs=1M count=16") + .unwrap(); - // Now let's add the extra disk. - let mut blk_file_path = dirs::home_dir().unwrap(); - blk_file_path.push("workloads"); - blk_file_path.push("blk.img"); - // As the path to the hotplug disk is not pre-added, this remote - // command will fail. - assert!(!remote_command( - &api_socket, - "add-disk", - Some( - format!( - "path={},id=test0,readonly=true", - blk_file_path.to_str().unwrap() - ) - .as_str() - ), - )); + // Resize disk to 32M + let resize_up_success = + resize_disk_command(&api_socket, "test0", "33554432" /* 32M */); + assert!(resize_up_success); + + assert_eq!( + guest + .ssh_command("lsblk | grep vdc | grep -c 32M") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 1 + ); + + // And check all blocks can be written to + guest + .ssh_command("sudo dd if=/dev/zero of=/dev/vdc bs=1M count=32") + .unwrap(); + + // Resize down to original size + let resize_down_success = + resize_disk_command(&api_socket, "test0", "16777216" /* 16M */); + assert!(resize_down_success); + + assert_eq!( + guest + .ssh_command("lsblk | grep vdc | grep -c 16M") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 1 + ); + + // And check all blocks can be written to, again + guest + .ssh_command("sudo dd if=/dev/zero of=/dev/vdc bs=1M count=16") + .unwrap(); }); - let _ = child.kill(); + kill_child(&mut child); let output = child.wait_with_output().unwrap(); handle_child_output(r, &output); } - fn _test_disk_hotplug(landlock_enabled: bool) { + #[test] + fn test_disk_resize_qcow2() { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); let guest = 
Guest::new(Box::new(disk_config)); @@ -6614,21 +3155,23 @@ mod common_parallel { let api_socket = temp_api_path(&guest.tmp_dir); - let mut blk_file_path = dirs::home_dir().unwrap(); - blk_file_path.push("workloads"); - blk_file_path.push("blk.img"); + let test_disk_path = guest.tmp_dir.as_path().join("resize-test.qcow2"); + + // Create a 16MB QCOW2 disk image + assert!( + exec_host_command_output(&format!( + "qemu-img create -f qcow2 {} 16M", + test_disk_path.to_str().unwrap() + )) + .status + .success() + ); let mut cmd = GuestCommand::new(&guest); - if landlock_enabled { - cmd.args(["--landlock"]).args([ - "--landlock-rules", - format!("path={blk_file_path:?},access=rw").as_str(), - ]); - } cmd.args(["--api-socket", &api_socket]) - .args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) + .default_cpus() + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .default_disks() @@ -6640,38 +3183,20 @@ mod common_parallel { let r = std::panic::catch_unwind(|| { guest.wait_vm_boot().unwrap(); - // Check /dev/vdc is not there - assert_eq!( - guest - .ssh_command("lsblk | grep -c vdc.*16M || true") - .unwrap() - .trim() - .parse::() - .unwrap_or(1), - 0 - ); - - // Now let's add the extra disk. 
- let (cmd_success, cmd_output) = remote_command_w_output( + // Add the QCOW2 disk to the VM + let (cmd_success, cmd_output, _) = remote_command_w_output( &api_socket, "add-disk", - Some( - format!( - "path={},id=test0,readonly=true", - blk_file_path.to_str().unwrap() - ) - .as_str(), - ), - ); - assert!(cmd_success); - assert!( - String::from_utf8_lossy(&cmd_output) - .contains("{\"id\":\"test0\",\"bdf\":\"0000:00:06.0\"}") + Some(&format!( + "path={},id=test0", + test_disk_path.to_str().unwrap() + )), ); - thread::sleep(std::time::Duration::new(10, 0)); + assert!(cmd_success); + assert!(String::from_utf8_lossy(&cmd_output).contains("\"id\":\"test0\"")); - // Check that /dev/vdc exists and the block size is 16M. + // Check that /dev/vdc exists and the block size is 16M assert_eq!( guest .ssh_command("lsblk | grep vdc | grep -c 16M") @@ -6681,330 +3206,633 @@ mod common_parallel { .unwrap_or_default(), 1 ); - // And check the block device can be read. + + // Write some data to verify it persists after resize guest - .ssh_command("sudo dd if=/dev/vdc of=/dev/null bs=1M iflag=direct count=16") + .ssh_command("sudo dd if=/dev/urandom of=/dev/vdc bs=1M count=8") .unwrap(); - // Let's remove it the extra disk. - assert!(remote_command(&api_socket, "remove-device", Some("test0"))); - thread::sleep(std::time::Duration::new(5, 0)); - // And check /dev/vdc is not there + // Resize disk up to 32M + let resize_up_success = + resize_disk_command(&api_socket, "test0", "33554432" /* 32M */); + assert!(resize_up_success); + + // Check new size is visible assert_eq!( guest - .ssh_command("lsblk | grep -c vdc.*16M || true") + .ssh_command("lsblk | grep vdc | grep -c 32M") .unwrap() .trim() .parse::() - .unwrap_or(1), - 0 + .unwrap_or_default(), + 1 ); - // And add it back to validate unplug did work correctly. 
- let (cmd_success, cmd_output) = remote_command_w_output( - &api_socket, - "add-disk", - Some( - format!( - "path={},id=test0,readonly=true", - blk_file_path.to_str().unwrap() - ) - .as_str(), - ), - ); - assert!(cmd_success); - assert!( - String::from_utf8_lossy(&cmd_output) - .contains("{\"id\":\"test0\",\"bdf\":\"0000:00:06.0\"}") - ); + // Write to the expanded area to verify it works + guest + .ssh_command("sudo dd if=/dev/zero of=/dev/vdc bs=1M count=32") + .unwrap(); - thread::sleep(std::time::Duration::new(10, 0)); + // Resize to 64M to exercise L1 table growth + let resize_up_again_success = + resize_disk_command(&api_socket, "test0", "67108864" /* 64M */); + assert!(resize_up_again_success); - // Check that /dev/vdc exists and the block size is 16M. assert_eq!( guest - .ssh_command("lsblk | grep vdc | grep -c 16M") + .ssh_command("lsblk | grep vdc | grep -c 64M") .unwrap() .trim() .parse::() .unwrap_or_default(), 1 ); - // And check the block device can be read. + + // Write to the full disk guest - .ssh_command("sudo dd if=/dev/vdc of=/dev/null bs=1M iflag=direct count=16") + .ssh_command("sudo dd if=/dev/zero of=/dev/vdc bs=1M count=64") + .unwrap(); + + // QCOW2 does not support shrinking, no resize down test here. 
+ }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + disk_check_consistency(&test_disk_path, None); + + handle_child_output(r, &output); + } + + fn create_loop_device(backing_file_path: &str, block_size: u32, num_retries: usize) -> String { + const LOOP_CONFIGURE: u64 = 0x4c0a; + const LOOP_CTL_GET_FREE: u64 = 0x4c82; + const LOOP_CTL_PATH: &str = "/dev/loop-control"; + const LOOP_DEVICE_PREFIX: &str = "/dev/loop"; + + #[repr(C)] + struct LoopInfo64 { + lo_device: u64, + lo_inode: u64, + lo_rdevice: u64, + lo_offset: u64, + lo_sizelimit: u64, + lo_number: u32, + lo_encrypt_type: u32, + lo_encrypt_key_size: u32, + lo_flags: u32, + lo_file_name: [u8; 64], + lo_crypt_name: [u8; 64], + lo_encrypt_key: [u8; 32], + lo_init: [u64; 2], + } + + impl Default for LoopInfo64 { + fn default() -> Self { + LoopInfo64 { + lo_device: 0, + lo_inode: 0, + lo_rdevice: 0, + lo_offset: 0, + lo_sizelimit: 0, + lo_number: 0, + lo_encrypt_type: 0, + lo_encrypt_key_size: 0, + lo_flags: 0, + lo_file_name: [0; 64], + lo_crypt_name: [0; 64], + lo_encrypt_key: [0; 32], + lo_init: [0; 2], + } + } + } + + #[derive(Default)] + #[repr(C)] + struct LoopConfig { + fd: u32, + block_size: u32, + info: LoopInfo64, + _reserved: [u64; 8], + } + + // Open loop-control device + let loop_ctl_file = OpenOptions::new() + .read(true) + .write(true) + .open(LOOP_CTL_PATH) + .unwrap(); + + // Open backing file + let backing_file = OpenOptions::new() + .read(true) + .write(true) + .open(backing_file_path) + .unwrap(); + + // Retry the whole get free -> open -> configure sequence so that a + // race with another parallel test claiming the same loop device + // is resolved by requesting a new free device on each attempt. 
+ let mut loop_device_path = String::new(); + for i in 0..num_retries { + // Request a free loop device + let loop_device_number = + unsafe { libc::ioctl(loop_ctl_file.as_raw_fd(), LOOP_CTL_GET_FREE as _) }; + + if loop_device_number < 0 { + panic!("Couldn't find a free loop device"); + } + + loop_device_path = format!("{LOOP_DEVICE_PREFIX}{loop_device_number}"); + + // Open loop device + let loop_device_file = OpenOptions::new() + .read(true) + .write(true) + .open(&loop_device_path) .unwrap(); - // Reboot the VM. - guest.reboot_linux(0); + let loop_config = LoopConfig { + fd: backing_file.as_raw_fd() as u32, + block_size, + ..Default::default() + }; + + let ret = unsafe { + libc::ioctl( + loop_device_file.as_raw_fd(), + LOOP_CONFIGURE as _, + &loop_config, + ) + }; + if ret == 0 { + break; + } + + if i < num_retries - 1 { + println!( + "Iteration {}: Failed to configure loop device {}: {}", + i, + loop_device_path, + io::Error::last_os_error() + ); + let jitter_ms = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .subsec_nanos() + % 500 + + 100; + thread::sleep(Duration::from_millis(jitter_ms as u64)); + } else { + panic!( + "Failed {} times trying to configure the loop device {}: {}", + num_retries, + loop_device_path, + io::Error::last_os_error() + ); + } + } + + loop_device_path + } + + #[test] + fn test_virtio_block_topology() { + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); + let test_disk_path = guest.tmp_dir.as_path().join("test.img"); + + let output = exec_host_command_output( + format!( + "qemu-img create -f raw {} 16M", + test_disk_path.to_str().unwrap() + ) + .as_str(), + ); + if !output.status.success() { + let stdout = String::from_utf8_lossy(&output.stdout); + let stderr = String::from_utf8_lossy(&output.stderr); + panic!("qemu-img command failed\nstdout\n{stdout}\nstderr\n{stderr}"); + } + + let loop_dev = create_loop_device(test_disk_path.to_str().unwrap(), 4096, 5); + _test_virtio_block_topology(&guest, &loop_dev); + 
Command::new("losetup") + .args(["-d", &loop_dev]) + .output() + .expect("loop device not found"); + } + + #[test] + fn test_virtio_block_direct_io_block_device_alignment_4k() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)); + let kernel_path = direct_kernel_boot_path(); + + // The backing file for the loop device must live on a filesystem that + // supports O_DIRECT (e.g. ext4). guest.tmp_dir is on tmpfs inside + // Docker, and the loop driver forwards I/O to the backing file. + let mut workloads_path = dirs::home_dir().unwrap(); + workloads_path.push("workloads"); + let img_dir = TempDir::new_in(workloads_path.as_path()).unwrap(); + let test_disk_path = img_dir.as_path().join("directio_test.img"); + // Preallocate the backing file -- a sparse file can deadlock when + // O_DIRECT writes through a loop device trigger block allocation + // in the backing filesystem. + assert!( + exec_host_command_output(&format!( + "fallocate -l 64M {}", + test_disk_path.to_str().unwrap() + )) + .status + .success(), + "fallocate failed" + ); + + let loop_dev = create_loop_device(test_disk_path.to_str().unwrap(), 4096, 5); + + let mut child = GuestCommand::new(&guest) + .args(["--cpus", "boot=1"]) + .args(["--memory", "size=512M"]) + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .args([ + "--disk", + format!( + "path={}", + guest.disk_config.disk(DiskType::OperatingSystem).unwrap() + ) + .as_str(), + format!( + "path={}", + guest.disk_config.disk(DiskType::CloudInit).unwrap() + ) + .as_str(), + format!("path={},direct=on,image_type=raw", &loop_dev).as_str(), + ]) + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); - // Check still there after reboot assert_eq!( guest - .ssh_command("lsblk | grep vdc | grep -c 16M") + .ssh_command("lsblk -t | grep vdc | awk '{print 
$6}'") .unwrap() .trim() .parse::() .unwrap_or_default(), - 1 - ); - - assert!(remote_command(&api_socket, "remove-device", Some("test0"))); - - thread::sleep(std::time::Duration::new(20, 0)); - - // Check device has gone away - assert_eq!( - guest - .ssh_command("lsblk | grep -c vdc.*16M || true") - .unwrap() - .trim() - .parse::() - .unwrap_or(1), - 0 + 4096 ); - guest.reboot_linux(1); - - // Check device still absent - assert_eq!( - guest - .ssh_command("lsblk | grep -c vdc.*16M || true") - .unwrap() - .trim() - .parse::() - .unwrap_or(1), - 0 - ); + guest + .ssh_command( + "sudo dd if=/dev/urandom of=/tmp/pattern bs=4096 count=1 && \ + sudo dd if=/tmp/pattern of=/dev/vdc bs=4096 count=1 seek=1 oflag=direct && \ + sudo dd if=/dev/vdc of=/tmp/readback bs=4096 count=1 skip=1 iflag=direct && \ + cmp /tmp/pattern /tmp/readback", + ) + .unwrap(); }); kill_child(&mut child); let output = child.wait_with_output().unwrap(); handle_child_output(r, &output); - } - - #[test] - fn test_disk_hotplug() { - _test_disk_hotplug(false); - } - #[test] - #[cfg(target_arch = "x86_64")] - fn test_disk_hotplug_with_landlock() { - _test_disk_hotplug(true); + Command::new("losetup") + .args(["-d", &loop_dev]) + .output() + .expect("loop device cleanup failed"); } #[test] - fn test_disk_resize() { + fn test_virtio_block_direct_io_file_backed_alignment_4k() { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); let guest = Guest::new(Box::new(disk_config)); - - #[cfg(target_arch = "x86_64")] let kernel_path = direct_kernel_boot_path(); - #[cfg(target_arch = "aarch64")] - let kernel_path = edk2_path(); - let api_socket = temp_api_path(&guest.tmp_dir); + let mut workloads_path = dirs::home_dir().unwrap(); + workloads_path.push("workloads"); + let img_dir = TempDir::new_in(workloads_path.as_path()).unwrap(); + let fs_img_path = img_dir.as_path().join("fs_4ksec.img"); - // Create a disk image that we can write to assert!( exec_host_command_output(&format!( - "sudo dd 
if=/dev/zero of=/tmp/resize.img bs=1M count=16" + "truncate -s 512M {}", + fs_img_path.to_str().unwrap() )) .status - .success() + .success(), + "truncate failed" ); - let mut cmd = GuestCommand::new(&guest); + let loop_dev_path = create_loop_device(fs_img_path.to_str().unwrap(), 4096, 5); - cmd.args(["--api-socket", &api_socket]) + assert!( + exec_host_command_output(&format!("mkfs.ext4 -q {loop_dev_path}")) + .status + .success(), + "mkfs.ext4 failed" + ); + + let mnt_dir = img_dir.as_path().join("mnt"); + fs::create_dir_all(&mnt_dir).unwrap(); + assert!( + exec_host_command_output(&format!( + "mount {} {}", + &loop_dev_path, + mnt_dir.to_str().unwrap() + )) + .status + .success(), + "mount failed" + ); + + let test_disk_path = mnt_dir.join("dio_file_test.raw"); + assert!( + exec_host_command_output(&format!( + "truncate -s 64M {}", + test_disk_path.to_str().unwrap() + )) + .status + .success(), + "truncate test disk failed" + ); + + let mut child = GuestCommand::new(&guest) .args(["--cpus", "boot=1"]) .args(["--memory", "size=512M"]) .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() + .args([ + "--disk", + format!( + "path={}", + guest.disk_config.disk(DiskType::OperatingSystem).unwrap() + ) + .as_str(), + format!( + "path={}", + guest.disk_config.disk(DiskType::CloudInit).unwrap() + ) + .as_str(), + format!( + "path={},direct=on,image_type=raw", + test_disk_path.to_str().unwrap() + ) + .as_str(), + ]) .default_net() - .capture_output(); - - let mut child = cmd.spawn().unwrap(); + .capture_output() + .spawn() + .unwrap(); let r = std::panic::catch_unwind(|| { guest.wait_vm_boot().unwrap(); - // Add the disk to the VM - let (cmd_success, cmd_output) = remote_command_w_output( - &api_socket, - "add-disk", - Some("path=/tmp/resize.img,id=test0"), - ); - - assert!(cmd_success); - assert!( - String::from_utf8_lossy(&cmd_output) - .contains("{\"id\":\"test0\",\"bdf\":\"0000:00:06.0\"}") - ); - - // 
Check that /dev/vdc exists and the block size is 16M. + let log_sec: u32 = guest + .ssh_command("lsblk -t | grep vdc | awk '{print $6}'") + .unwrap() + .trim() + .parse() + .unwrap_or_default(); assert_eq!( - guest - .ssh_command("lsblk | grep vdc | grep -c 16M") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 1 + log_sec, 4096, + "expected 4096-byte logical sector for file on 4k-sector fs, got {log_sec}" ); - // And check the block device can be written to. + guest - .ssh_command("sudo dd if=/dev/zero of=/dev/vdc bs=1M count=16") + .ssh_command( + "sudo dd if=/dev/urandom of=/tmp/pattern bs=4096 count=8 && \ + sudo dd if=/tmp/pattern of=/dev/vdc bs=4096 count=8 seek=1 oflag=direct && \ + sudo dd if=/dev/vdc of=/tmp/readback bs=4096 count=8 skip=1 iflag=direct && \ + cmp /tmp/pattern /tmp/readback", + ) .unwrap(); + }); - // Resize disk to 32M - let resize_up_success = - resize_disk_command(&api_socket, "test0", "33554432" /* 32M */); - assert!(resize_up_success); + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); - assert_eq!( - guest - .ssh_command("lsblk | grep vdc | grep -c 32M") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 1 - ); + handle_child_output(r, &output); - // And check all blocks can be written to - guest - .ssh_command("sudo dd if=/dev/zero of=/dev/vdc bs=1M count=32") - .unwrap(); + let _ = exec_host_command_output(&format!("umount {}", mnt_dir.to_str().unwrap())); + let _ = exec_host_command_output(&format!("losetup -d {loop_dev_path}")); + } - // Resize down to original size - let resize_down_success = - resize_disk_command(&api_socket, "test0", "16777216" /* 16M */); - assert!(resize_down_success); + // Helper function to verify sparse file + fn verify_sparse_file(test_disk_path: &str, expected_ratio: f64) { + let res = exec_host_command_output(&format!("ls -s --block-size=1 {test_disk_path}")); + assert!(res.status.success(), "ls -s command failed"); + let out = 
String::from_utf8_lossy(&res.stdout); + let actual_bytes: u64 = out + .split_whitespace() + .next() + .and_then(|s| s.parse().ok()) + .expect("Failed to parse ls -s output"); - assert_eq!( - guest - .ssh_command("lsblk | grep vdc | grep -c 16M") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 1 + let res = exec_host_command_output(&format!("ls -l {test_disk_path}")); + assert!(res.status.success(), "ls -l command failed"); + let out = String::from_utf8_lossy(&res.stdout); + let apparent_size: u64 = out + .split_whitespace() + .nth(4) + .and_then(|s| s.parse().ok()) + .expect("Failed to parse ls -l output"); + + let threshold = (apparent_size as f64 * expected_ratio) as u64; + assert!( + actual_bytes < threshold, + "Expected file to be sparse: apparent_size={apparent_size} bytes, actual_disk_usage={actual_bytes} bytes (threshold={threshold})" + ); + } + + // Helper function to count zero flagged regions in QCOW2 image + fn count_qcow2_zero_regions(test_disk_path: &str) -> Option { + let res = + exec_host_command_output(&format!("qemu-img map --output=json -U {test_disk_path}")); + if !res.status.success() { + return None; + } + + let out = String::from_utf8_lossy(&res.stdout); + let map_json = serde_json::from_str::(&out).ok()?; + let regions = map_json.as_array()?; + + Some( + regions + .iter() + .filter(|r| { + let data = r["data"].as_bool().unwrap_or(true); + let zero = r["zero"].as_bool().unwrap_or(false); + // holes - data: false + // zero flagged regions - data: true, zero: true + !data || zero + }) + .count(), + ) + } + + // Helper function to verify file extents using FIEMAP after DISCARD + // TODO: Make verification more format-specific: + // - QCOW2: Check for fragmentation patterns showing deallocated clusters + // - RAW: Verify actual holes (unallocated extents) exist in sparse regions + // - Could parse extent output to count holes vs allocated regions + fn verify_fiemap_extents(test_disk_path: &str, format_type: &str) { + let 
blocksize_output = exec_host_command_output(&format!("stat -f -c %S {test_disk_path}")); + let blocksize = if blocksize_output.status.success() { + String::from_utf8_lossy(&blocksize_output.stdout) + .trim() + .parse::() + .unwrap_or(4096) + } else { + 4096 + }; + + let fiemap_output = + exec_host_command_output(&format!("filefrag -b {blocksize} -v {test_disk_path}")); + if fiemap_output.status.success() { + let fiemap_str = String::from_utf8_lossy(&fiemap_output.stdout); + + // Verify we have extent information indicating sparse regions + let has_extents = fiemap_str.contains("extent") || fiemap_str.contains("extents"); + let has_holes = fiemap_str.contains("hole"); + + assert!( + has_extents || has_holes, + "FIEMAP should show extent information or holes for {format_type} file" ); + } + } - // And check all blocks can be written to, again - guest - .ssh_command("sudo dd if=/dev/zero of=/dev/vdc bs=1M count=16") - .unwrap(); - }); + /// Helper function to verify a disk region reads as all zeros from within the guest + fn assert_guest_disk_region_is_zero(guest: &Guest, device: &str, offset: u64, length: u64) { + let result = guest + .ssh_command(&format!( + "sudo hexdump -v -s {offset} -n {length} -e '1/1 \"%02x\"' {device} | grep -qv '^00*$' && echo 'NONZERO' || echo 'ZEROS'" + )) + .unwrap(); - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); + assert!( + result.trim() == "ZEROS", + "Expected {} region at offset {} length {} to read as zeros, but got: {}", + device, + offset, + length, + result.trim() + ); + } - handle_child_output(r, &output); + // Common test sizes for discard/fstrim tests (all formats): 9 small (≤256KB), then one 4MB + const BLOCK_DISCARD_TEST_SIZES_KB: &[u64] = &[64, 128, 256, 64, 128, 256, 64, 128, 256, 4096]; + + fn _test_virtio_block_discard( + format_name: &str, + qemu_img_format: &str, + extra_create_args: &[&str], + expect_discard_success: bool, + verify_disk: bool, + ) { + 
_test_virtio_block_discard_with_backend( + format_name, + qemu_img_format, + extra_create_args, + expect_discard_success, + verify_disk, + false, + ); } - #[test] - fn test_disk_resize_qcow2() { + fn _test_virtio_block_discard_with_backend( + format_name: &str, + qemu_img_format: &str, + extra_create_args: &[&str], + expect_discard_success: bool, + verify_disk: bool, + disable_io_uring: bool, + ) { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); let guest = Guest::new(Box::new(disk_config)); - - #[cfg(target_arch = "x86_64")] let kernel_path = direct_kernel_boot_path(); - #[cfg(target_arch = "aarch64")] - let kernel_path = edk2_path(); - let api_socket = temp_api_path(&guest.tmp_dir); + let test_disk_path = guest + .tmp_dir + .as_path() + .join(format!("discard_test.{}", format_name.to_lowercase())); - let test_disk_path = guest.tmp_dir.as_path().join("resize-test.qcow2"); + let mut cmd = format!("qemu-img create -f {qemu_img_format} "); + if !extra_create_args.is_empty() { + cmd.push_str(&extra_create_args.join(" ")); + cmd.push(' '); + } + cmd.push_str(&format!("{} 2G", test_disk_path.to_str().unwrap())); - // Create a 16MB QCOW2 disk image + let res = exec_host_command_output(&cmd); assert!( - exec_host_command_output(&format!( - "qemu-img create -f qcow2 {} 16M", - test_disk_path.to_str().unwrap() - )) - .status - .success() + res.status.success(), + "Failed to create {format_name} test image" ); - let mut cmd = GuestCommand::new(&guest); - - cmd.args(["--api-socket", &api_socket]) - .args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) + let mut child = GuestCommand::new(&guest) + .args(["--cpus", "boot=4"]) + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() + .args([ + "--disk", + format!( + "path={}", + guest.disk_config.disk(DiskType::OperatingSystem).unwrap() + ) + .as_str(), + format!( + "path={}", + 
guest.disk_config.disk(DiskType::CloudInit).unwrap() + ) + .as_str(), + format!( + "path={},num_queues=4,image_type={}{}", + test_disk_path.to_str().unwrap(), + format_name.to_lowercase(), + if disable_io_uring { + ",_disable_io_uring=on" + } else { + "" + } + ) + .as_str(), + ]) .default_net() - .capture_output(); - - let mut child = cmd.spawn().unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - // Add the QCOW2 disk to the VM - let (cmd_success, cmd_output) = remote_command_w_output( - &api_socket, - "add-disk", - Some(&format!( - "path={},id=test0", - test_disk_path.to_str().unwrap() - )), - ); - - assert!(cmd_success); - assert!(String::from_utf8_lossy(&cmd_output).contains("\"id\":\"test0\"")); - - // Check that /dev/vdc exists and the block size is 16M - assert_eq!( - guest - .ssh_command("lsblk | grep vdc | grep -c 16M") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 1 - ); + .capture_output() + .spawn() + .unwrap(); - // Write some data to verify it persists after resize - guest - .ssh_command("sudo dd if=/dev/urandom of=/dev/vdc bs=1M count=8") - .unwrap(); + const CLUSTER_SIZE_BYTES: u64 = 64 * 1024; // One QCOW2 cluster + const WRITE_SIZE_MB: u64 = 4; + const WRITE_OFFSET_MB: u64 = 1; - // Resize disk up to 32M - let resize_up_success = - resize_disk_command(&api_socket, "test0", "33554432" /* 32M */); - assert!(resize_up_success); + // Build discard operations within the written region + let write_start = WRITE_OFFSET_MB * 1024 * 1024; + let mut discard_operations: Vec<(u64, u64)> = Vec::new(); + let mut current_offset = write_start; - // Check new size is visible - assert_eq!( - guest - .ssh_command("lsblk | grep vdc | grep -c 32M") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 1 - ); + for &size_kb in BLOCK_DISCARD_TEST_SIZES_KB { + let size = size_kb * 1024; + discard_operations.push((current_offset, size)); + current_offset += size + CLUSTER_SIZE_BYTES; // Add gap between operations 
+ } - // Write to the expanded area to verify it works - guest - .ssh_command("sudo dd if=/dev/zero of=/dev/vdc bs=1M count=32") - .unwrap(); + let size_after_write = std::cell::Cell::new(0u64); - // Resize to 64M to exercise L1 table growth - let resize_up_again_success = - resize_disk_command(&api_socket, "test0", "67108864" /* 64M */); - assert!(resize_up_again_success); + let r = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { + guest.wait_vm_boot().unwrap(); assert_eq!( guest - .ssh_command("lsblk | grep vdc | grep -c 64M") + .ssh_command("lsblk | grep -c vdc") .unwrap() .trim() .parse::() @@ -7012,175 +3840,160 @@ mod common_parallel { 1 ); - // Write to the full disk + // Write one 4MB block at offset 1MB guest - .ssh_command("sudo dd if=/dev/zero of=/dev/vdc bs=1M count=64") + .ssh_command(&format!( + "sudo dd if=/dev/zero of=/dev/vdc bs=1M count={WRITE_SIZE_MB} seek={WRITE_OFFSET_MB} oflag=direct" + )) .unwrap(); + guest.ssh_command("sync").unwrap(); - // QCOW2 does not support shrinking, no resize down test here. 
- }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - disk_check_consistency(&test_disk_path, None); + // For QCOW2, measure file size after write to verify deallocation later + let write_size = if qemu_img_format == "qcow2" { + let res = exec_host_command_output(&format!( + "ls -s --block-size=1 {}", + test_disk_path.to_str().unwrap() + )); + assert!(res.status.success()); + String::from_utf8_lossy(&res.stdout) + .split_whitespace() + .next() + .and_then(|s| s.parse::().ok()) + .expect("Failed to parse file size after write") + } else { + 0 + }; + size_after_write.set(write_size); - handle_child_output(r, &output); - } + if expect_discard_success { + for (i, (offset, length)) in discard_operations.iter().enumerate() { + let result = guest + .ssh_command(&format!( + "sudo blkdiscard -v -o {offset} -l {length} /dev/vdc 2>&1 || true" + )) + .unwrap(); - fn create_loop_device(backing_file_path: &str, block_size: u32, num_retries: usize) -> String { - const LOOP_CONFIGURE: u64 = 0x4c0a; - const LOOP_CTL_GET_FREE: u64 = 0x4c82; - const LOOP_CTL_PATH: &str = "/dev/loop-control"; - const LOOP_DEVICE_PREFIX: &str = "/dev/loop"; + assert!( + !result.contains("Operation not supported") + && !result.contains("BLKDISCARD"), + "blkdiscard #{i} at offset {offset} length {length} failed: {result}" + ); + } - #[repr(C)] - struct LoopInfo64 { - lo_device: u64, - lo_inode: u64, - lo_rdevice: u64, - lo_offset: u64, - lo_sizelimit: u64, - lo_number: u32, - lo_encrypt_type: u32, - lo_encrypt_key_size: u32, - lo_flags: u32, - lo_file_name: [u8; 64], - lo_crypt_name: [u8; 64], - lo_encrypt_key: [u8; 32], - lo_init: [u64; 2], - } + // Force sync to ensure async DISCARD operations complete + guest.ssh_command("sync").unwrap(); - impl Default for LoopInfo64 { - fn default() -> Self { - LoopInfo64 { - lo_device: 0, - lo_inode: 0, - lo_rdevice: 0, - lo_offset: 0, - lo_sizelimit: 0, - lo_number: 0, - lo_encrypt_type: 0, - lo_encrypt_key_size: 0, - lo_flags: 0, 
- lo_file_name: [0; 64], - lo_crypt_name: [0; 64], - lo_encrypt_key: [0; 32], - lo_init: [0; 2], + // Verify VM sees zeros in discarded regions + for (offset, length) in discard_operations.iter() { + assert_guest_disk_region_is_zero(&guest, "/dev/vdc", *offset, *length); } + + guest.ssh_command("echo test").unwrap(); + } else { + // For unsupported formats, blkdiscard should fail with "not supported" + use test_infra::ssh_command_ip; + let result = ssh_command_ip( + "sudo blkdiscard -o 0 -l 4096 /dev/vdc 2>&1", + &guest.network.guest_ip0, + 0, + 5, + ); + assert!( + result.is_err(), + "blkdiscard should fail on unsupported format" + ); + guest.ssh_command("echo test").unwrap(); } - } - - #[derive(Default)] - #[repr(C)] - struct LoopConfig { - fd: u32, - block_size: u32, - info: LoopInfo64, - _reserved: [u64; 8], - } - // Open loop-control device - let loop_ctl_file = OpenOptions::new() - .read(true) - .write(true) - .open(LOOP_CTL_PATH) - .unwrap(); + if expect_discard_success { + if qemu_img_format == "qcow2" { + let res = exec_host_command_output(&format!( + "ls -s --block-size=1 {}", + test_disk_path.to_str().unwrap() + )); + assert!(res.status.success()); + let size_after_discard: u64 = String::from_utf8_lossy(&res.stdout) + .split_whitespace() + .next() + .and_then(|s| s.parse().ok()) + .expect("Failed to parse file size after discard"); - // Request a free loop device - let loop_device_number = - unsafe { libc::ioctl(loop_ctl_file.as_raw_fd(), LOOP_CTL_GET_FREE as _) }; + assert!( + size_after_discard < size_after_write.get(), + "QCOW2 file should shrink after DISCARD with sparse=true: after_write={} bytes, after_discard={} bytes", + size_after_write.get(), + size_after_discard + ); - if loop_device_number < 0 { - panic!("Couldn't find a free loop device"); - } + verify_fiemap_extents(test_disk_path.to_str().unwrap(), "QCOW2"); + } else if qemu_img_format == "raw" { + let mut file = File::open(&test_disk_path) + .expect("Failed to open test disk for 
verification"); - // Create loop device path - let loop_device_path = format!("{LOOP_DEVICE_PREFIX}{loop_device_number}"); + // Verify each discarded region contains all zeros + for (offset, length) in &discard_operations { + file.seek(SeekFrom::Start(*offset)) + .expect("Failed to seek to discarded region"); - // Open loop device - let loop_device_file = OpenOptions::new() - .read(true) - .write(true) - .open(&loop_device_path) - .unwrap(); + let mut buffer = vec![0u8; *length as usize]; + file.read_exact(&mut buffer) + .expect("Failed to read discarded region"); - // Open backing file - let backing_file = OpenOptions::new() - .read(true) - .write(true) - .open(backing_file_path) - .unwrap(); + let all_zeros = buffer.iter().all(|&b| b == 0); + assert!( + all_zeros, + "Expected discarded region at offset {offset} length {length} to contain all zeros" + ); + } - let loop_config = LoopConfig { - fd: backing_file.as_raw_fd() as u32, - block_size, - ..Default::default() - }; + verify_sparse_file(test_disk_path.to_str().unwrap(), 1.0); - for i in 0..num_retries { - let ret = unsafe { - libc::ioctl( - loop_device_file.as_raw_fd(), - LOOP_CONFIGURE as _, - &loop_config, - ) - }; - if ret != 0 { - if i < num_retries - 1 { - println!( - "Iteration {}: Failed to configure the loop device {}: {}", - i, - loop_device_path, - std::io::Error::last_os_error() - ); - } else { - panic!( - "Failed {} times trying to configure the loop device {}: {}", - num_retries, - loop_device_path, - std::io::Error::last_os_error() - ); + verify_fiemap_extents(test_disk_path.to_str().unwrap(), "RAW"); } - } else { - break; } + })); - // Wait for a bit before retrying - thread::sleep(std::time::Duration::new(5, 0)); + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + handle_child_output(r, &output); + + if verify_disk { + disk_check_consistency(&test_disk_path, None); } + } - loop_device_path + #[test] + fn test_virtio_block_discard_qcow2() { + 
_test_virtio_block_discard("qcow2", "qcow2", &[], true, true); } #[test] - fn test_virtio_block_topology() { + fn test_virtio_block_discard_raw() { + _test_virtio_block_discard("raw", "raw", &[], true, false); + } + + #[test] + fn test_virtio_block_discard_raw_aio() { + _test_virtio_block_discard_with_backend("raw", "raw", &[], true, false, true); + } + + #[test] + fn test_virtio_block_write_zeroes_unmap_raw() { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); let guest = Guest::new(Box::new(disk_config)); - let kernel_path = direct_kernel_boot_path(); - let test_disk_path = guest.tmp_dir.as_path().join("test.img"); - - let output = exec_host_command_output( - format!( - "qemu-img create -f raw {} 16M", - test_disk_path.to_str().unwrap() - ) - .as_str(), - ); - if !output.status.success() { - let stdout = String::from_utf8_lossy(&output.stdout); - let stderr = String::from_utf8_lossy(&output.stderr); - panic!("qemu-img command failed\nstdout\n{stdout}\nstderr\n{stderr}"); - } + let test_disk_path = guest.tmp_dir.as_path().join("write_zeroes_unmap_test.raw"); - let loop_dev = create_loop_device(test_disk_path.to_str().unwrap(), 4096, 5); + let res = exec_host_command_output(&format!( + "dd if=/dev/zero of={} bs=1M count=128", + test_disk_path.to_str().unwrap() + )); + assert!(res.status.success(), "Failed to create raw test image"); let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .default_cpus() + .default_memory() + .default_kernel_cmdline() .args([ "--disk", format!( @@ -7193,7 +4006,7 @@ mod common_parallel { guest.disk_config.disk(DiskType::CloudInit).unwrap() ) .as_str(), - format!("path={}", &loop_dev).as_str(), + format!("path={},image_type=raw", test_disk_path.to_str().unwrap()).as_str(), ]) .default_net() .capture_output() @@ -7203,193 +4016,83 @@ mod common_parallel { let 
r = std::panic::catch_unwind(|| { guest.wait_vm_boot().unwrap(); - // MIN-IO column - assert_eq!( - guest - .ssh_command("lsblk -t| grep vdc | awk '{print $3}'") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 4096 - ); - // PHY-SEC column - assert_eq!( - guest - .ssh_command("lsblk -t| grep vdc | awk '{print $5}'") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 4096 - ); - // LOG-SEC column assert_eq!( guest - .ssh_command("lsblk -t| grep vdc | awk '{print $6}'") + .ssh_command("lsblk | grep -c vdc") .unwrap() .trim() .parse::() .unwrap_or_default(), - 4096 + 1 ); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); - - Command::new("losetup") - .args(["-d", &loop_dev]) - .output() - .expect("loop device not found"); - } - - // Helper function to verify sparse file - fn verify_sparse_file(test_disk_path: &str, expected_ratio: f64) { - let res = exec_host_command_output(&format!("ls -s --block-size=1 {}", test_disk_path)); - assert!(res.status.success(), "ls -s command failed"); - let out = String::from_utf8_lossy(&res.stdout); - let actual_bytes: u64 = out - .split_whitespace() - .next() - .and_then(|s| s.parse().ok()) - .expect("Failed to parse ls -s output"); - - let res = exec_host_command_output(&format!("ls -l {}", test_disk_path)); - assert!(res.status.success(), "ls -l command failed"); - let out = String::from_utf8_lossy(&res.stdout); - let apparent_size: u64 = out - .split_whitespace() - .nth(4) - .and_then(|s| s.parse().ok()) - .expect("Failed to parse ls -l output"); - - let threshold = (apparent_size as f64 * expected_ratio) as u64; - assert!( - actual_bytes < threshold, - "Expected file to be sparse: apparent_size={} bytes, actual_disk_usage={} bytes (threshold={})", - apparent_size, - actual_bytes, - threshold - ); - } - - // Helper function to count zero flagged regions in QCOW2 image - fn count_qcow2_zero_regions(test_disk_path: &str) -> Option { - let 
res = - exec_host_command_output(&format!("qemu-img map --output=json -U {}", test_disk_path)); - if !res.status.success() { - return None; - } - - let out = String::from_utf8_lossy(&res.stdout); - let map_json = serde_json::from_str::(&out).ok()?; - let regions = map_json.as_array()?; - - Some( - regions - .iter() - .filter(|r| { - let data = r["data"].as_bool().unwrap_or(true); - let zero = r["zero"].as_bool().unwrap_or(false); - // holes - data: false - // zero flagged regions - data: true, zero: true - !data || zero - }) - .count(), - ) - } - // Helper function to verify file extents using FIEMAP after DISCARD - // TODO: Make verification more format-specific: - // - QCOW2: Check for fragmentation patterns showing deallocated clusters - // - RAW: Verify actual holes (unallocated extents) exist in sparse regions - // - Could parse extent output to count holes vs allocated regions - fn verify_fiemap_extents(test_disk_path: &str, format_type: &str) { - let blocksize_output = - exec_host_command_output(&format!("stat -f -c %S {}", test_disk_path)); - let blocksize = if blocksize_output.status.success() { - String::from_utf8_lossy(&blocksize_output.stdout) + let wz_max = guest + .ssh_command("cat /sys/block/vdc/queue/write_zeroes_max_bytes") + .unwrap() .trim() .parse::() - .unwrap_or(4096) - } else { - 4096 - }; - - let fiemap_output = - exec_host_command_output(&format!("filefrag -b {} -v {}", blocksize, test_disk_path)); - if fiemap_output.status.success() { - let fiemap_str = String::from_utf8_lossy(&fiemap_output.stdout); + .unwrap_or_default(); + assert!( + wz_max > 0, + "write_zeroes_max_bytes={wz_max}, VIRTIO_BLK_F_WRITE_ZEROES not negotiated" + ); - // Verify we have extent information indicating sparse regions - let has_extents = fiemap_str.contains("extent") || fiemap_str.contains("extents"); - let has_holes = fiemap_str.contains("hole"); + guest + .ssh_command("sudo dd if=/dev/urandom of=/dev/vdc bs=1M count=64 oflag=direct") + .unwrap(); + 
guest.ssh_command("sync").unwrap(); + // fallocate --punch-hole on a block device sends + // WRITE_ZEROES with VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP set. + let result = guest + .ssh_command("sudo fallocate -p -o 0 -l 67108864 /dev/vdc 2>&1 || true") + .unwrap(); assert!( - has_extents || has_holes, - "FIEMAP should show extent information or holes for {} file", - format_type + !result.contains("Operation not supported") && !result.contains("not supported"), + "fallocate --punch-hole failed: {result}" ); - } - } + guest.ssh_command("sync").unwrap(); - /// Helper function to verify a disk region reads as all zeros from within the guest - fn assert_guest_disk_region_is_zero(guest: &Guest, device: &str, offset: u64, length: u64) { - let result = guest - .ssh_command(&format!( - "sudo hexdump -v -s {} -n {} -e '1/1 \"%02x\"' {} | grep -qv '^00*$' && echo 'NONZERO' || echo 'ZEROS'", - offset, length, device - )) - .unwrap(); + assert_guest_disk_region_is_zero(&guest, "/dev/vdc", 0, 4096 * 256); - assert!( - result.trim() == "ZEROS", - "Expected {} region at offset {} length {} to read as zeros, but got: {}", - device, - offset, - length, - result.trim() - ); + let test_disk_str = test_disk_path.to_str().unwrap(); + verify_sparse_file(test_disk_str, 1.0); + verify_fiemap_extents(test_disk_str, "raw"); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + handle_child_output(r, &output); + } + + #[test] + fn test_virtio_block_discard_unsupported_vhd() { + _test_virtio_block_discard("vhd", "vpc", &["-o", "subformat=fixed"], false, false); } - // Common test sizes for discard/fstrim tests (all formats): 9 small (≤256KB), then one 4MB - const BLOCK_DISCARD_TEST_SIZES_KB: &[u64] = &[64, 128, 256, 64, 128, 256, 64, 128, 256, 4096]; + #[test] + fn test_virtio_block_discard_unsupported_vhdx() { + _test_virtio_block_discard("vhdx", "vhdx", &[], false, false); + } - fn _test_virtio_block_discard( - format_name: &str, - qemu_img_format: &str, - 
extra_create_args: &[&str], - expect_discard_success: bool, - verify_disk: bool, - ) { + #[test] + fn test_virtio_block_discard_loop_device() { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); let guest = Guest::new(Box::new(disk_config)); let kernel_path = direct_kernel_boot_path(); - let test_disk_path = guest - .tmp_dir - .as_path() - .join(format!("discard_test.{}", format_name.to_lowercase())); - - let mut cmd = format!("qemu-img create -f {} ", qemu_img_format); - if !extra_create_args.is_empty() { - cmd.push_str(&extra_create_args.join(" ")); - cmd.push(' '); - } - cmd.push_str(&format!("{} 2G", test_disk_path.to_str().unwrap())); - - let res = exec_host_command_output(&cmd); + let test_disk_path = guest.tmp_dir.as_path().join("loop_discard_test.raw"); + let res = run_qemu_img(&test_disk_path, &["create", "-f", "raw"], Some(&["128M"])); assert!( res.status.success(), - "Failed to create {} test image", - format_name + "Failed to create raw backing image: {}", + String::from_utf8_lossy(&res.stderr) ); + let loop_dev = create_loop_device(test_disk_path.to_str().unwrap(), 4096, 5); + let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=4"]) + .args(["--cpus", "boot=1"]) .args(["--memory", "size=512M"]) .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -7405,36 +4108,14 @@ mod common_parallel { guest.disk_config.disk(DiskType::CloudInit).unwrap() ) .as_str(), - format!( - "path={},num_queues=4,image_type={}", - test_disk_path.to_str().unwrap(), - format_name.to_lowercase() - ) - .as_str(), + format!("path={},image_type=raw", &loop_dev).as_str(), ]) .default_net() .capture_output() .spawn() .unwrap(); - const CLUSTER_SIZE_BYTES: u64 = 64 * 1024; // One QCOW2 cluster - const WRITE_SIZE_MB: u64 = 4; - const WRITE_OFFSET_MB: u64 = 1; - - // Build discard operations within the written region - let write_start = WRITE_OFFSET_MB * 1024 * 1024; - let mut discard_operations: 
Vec<(u64, u64)> = Vec::new(); - let mut current_offset = write_start; - - for &size_kb in BLOCK_DISCARD_TEST_SIZES_KB { - let size = size_kb * 1024; - discard_operations.push((current_offset, size)); - current_offset += size + CLUSTER_SIZE_BYTES; // Add gap between operations - } - - let size_after_write = std::cell::Cell::new(0u64); - - let r = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { + let r = std::panic::catch_unwind(|| { guest.wait_vm_boot().unwrap(); assert_eq!( @@ -7447,161 +4128,263 @@ mod common_parallel { 1 ); - // Write one 4MB block at offset 1MB + assert_eq!( + guest + .ssh_command("lsblk -t | grep vdc | awk '{print $6}'") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 4096 + ); + + let discard_max = guest + .ssh_command("cat /sys/block/vdc/queue/discard_max_bytes") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(); + assert!( + discard_max > 0, + "discard_max_bytes={discard_max}, VIRTIO_BLK_F_DISCARD not negotiated" + ); + guest - .ssh_command(&format!( - "sudo dd if=/dev/zero of=/dev/vdc bs=1M count={} seek={} oflag=direct", - WRITE_SIZE_MB, WRITE_OFFSET_MB - )) + .ssh_command("sudo dd if=/dev/urandom of=/dev/vdc bs=4096 count=1024 oflag=direct") .unwrap(); guest.ssh_command("sync").unwrap(); - // For QCOW2, measure file size after write to verify deallocation later - let write_size = if qemu_img_format == "qcow2" { - let res = exec_host_command_output(&format!( - "ls -s --block-size=1 {}", - test_disk_path.to_str().unwrap() - )); - assert!(res.status.success()); - String::from_utf8_lossy(&res.stdout) - .split_whitespace() - .next() - .and_then(|s| s.parse::().ok()) - .expect("Failed to parse file size after write") - } else { - 0 - }; - size_after_write.set(write_size); + let result = guest + .ssh_command("sudo blkdiscard -v -o 0 -l 4194304 /dev/vdc 2>&1 || true") + .unwrap(); + assert!( + !result.contains("Operation not supported") + && !result.contains("BLKDISCARD ioctl failed"), + "blkdiscard failed on loop 
device: {result}" + ); - if expect_discard_success { - for (i, (offset, length)) in discard_operations.iter().enumerate() { - let result = guest - .ssh_command(&format!( - "sudo blkdiscard -v -o {} -l {} /dev/vdc 2>&1 || true", - offset, length - )) - .unwrap(); + guest.ssh_command("sync").unwrap(); - assert!( - !result.contains("Operation not supported") - && !result.contains("BLKDISCARD"), - "blkdiscard #{} at offset {} length {} failed: {}", - i, - offset, - length, - result - ); - } + assert_guest_disk_region_is_zero(&guest, "/dev/vdc", 0, 4194304); + }); - // Force sync to ensure async DISCARD operations complete - guest.ssh_command("sync").unwrap(); + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + handle_child_output(r, &output); - // Verify VM sees zeros in discarded regions - for (_i, (offset, length)) in discard_operations.iter().enumerate() { - assert_guest_disk_region_is_zero(&guest, "/dev/vdc", *offset, *length); - } + Command::new("losetup") + .args(["-d", &loop_dev]) + .output() + .expect("loop device not found"); + } - guest.ssh_command("echo test").unwrap(); - } else { - // For unsupported formats, blkdiscard should fail with "not supported" - use test_infra::ssh_command_ip; - let result = ssh_command_ip( - "sudo blkdiscard -o 0 -l 4096 /dev/vdc 2>&1", - &guest.network.guest_ip0, - 0, - 5, - ); - assert!( - result.is_err(), - "blkdiscard should fail on unsupported format" - ); - guest.ssh_command("echo test").unwrap(); - } + #[test] + fn test_virtio_block_discard_dm_snapshot() { + // Verify that the guest remains stable when BLKDISCARD fails on the + // host backend. DM snapshot targets do not support discard, so the + // VMM returns VIRTIO_BLK_S_IOERR. The guest must handle this + // gracefully even under repeated attempts. + // + // DM topology follows the same pattern used by WindowsDiskConfig. 
+ let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)); + let kernel_path = direct_kernel_boot_path(); - if expect_discard_success { - if qemu_img_format == "qcow2" { - let res = exec_host_command_output(&format!( - "ls -s --block-size=1 {}", - test_disk_path.to_str().unwrap() - )); - assert!(res.status.success()); - let size_after_discard: u64 = String::from_utf8_lossy(&res.stdout) - .split_whitespace() - .next() - .and_then(|s| s.parse().ok()) - .expect("Failed to parse file size after discard"); + let origin_path = guest.tmp_dir.as_path().join("dm_origin.raw"); + let cow_path = guest.tmp_dir.as_path().join("dm_cow.raw"); - assert!( - size_after_discard < size_after_write.get(), - "QCOW2 file should shrink after DISCARD with sparse=true: after_write={} bytes, after_discard={} bytes", - size_after_write.get(), - size_after_discard - ); + let res = run_qemu_img(&origin_path, &["create", "-f", "raw"], Some(&["128M"])); + assert!( + res.status.success(), + "Failed to create origin image: {}", + String::from_utf8_lossy(&res.stderr) + ); - verify_fiemap_extents(test_disk_path.to_str().unwrap(), "QCOW2"); - } else if qemu_img_format == "raw" { - let mut file = File::open(&test_disk_path) - .expect("Failed to open test disk for verification"); + let cow_size: u64 = 128 << 20; + let cow_sectors = cow_size / 512; + let cow_file = File::create(&cow_path).expect("Expect creating COW image to succeed"); + cow_file + .set_len(cow_size) + .expect("Expect truncating COW image to succeed"); - // Verify each discarded region contains all zeros - for (offset, length) in &discard_operations { - file.seek(SeekFrom::Start(*offset)) - .expect("Failed to seek to discarded region"); + let origin_sectors: u64 = 128 * 1024 * 1024 / 512; + let origin_loop = create_loop_device(origin_path.to_str().unwrap(), 4096, 5); + let cow_loop = create_loop_device(cow_path.to_str().unwrap(), 512, 5); - let mut buffer = vec![0u8; *length 
as usize]; - file.read_exact(&mut buffer) - .expect("Failed to read discarded region"); + let unique = format!( + "ch-test-{}", + guest + .tmp_dir + .as_path() + .file_name() + .unwrap() + .to_str() + .unwrap() + ); + let cow_dm_name = format!("{unique}-cow"); + let snap_dm_name = format!("{unique}-snap"); - let all_zeros = buffer.iter().all(|&b| b == 0); - assert!( - all_zeros, - "Expected discarded region at offset {} length {} to contain all zeros", - offset, length - ); - } + let output = Command::new("dmsetup") + .args([ + "create", + &cow_dm_name, + "--table", + &format!("0 {cow_sectors} linear {cow_loop} 0"), + ]) + .output() + .expect("Failed to run dmsetup"); + assert!( + output.status.success(), + "dmsetup create (cow linear) failed: {}", + String::from_utf8_lossy(&output.stderr) + ); - verify_sparse_file(test_disk_path.to_str().unwrap(), 1.0); + Command::new("dmsetup") + .arg("mknodes") + .output() + .expect("dmsetup mknodes failed"); - verify_fiemap_extents(test_disk_path.to_str().unwrap(), "RAW"); - } + // dm-snapshot: origin + COW, non-persistent, chunk size 8 sectors. 
+ let output = Command::new("dmsetup") + .args([ + "create", + &snap_dm_name, + "--table", + &format!("0 {origin_sectors} snapshot {origin_loop} /dev/mapper/{cow_dm_name} N 8"), + ]) + .output() + .expect("Failed to run dmsetup"); + assert!( + output.status.success(), + "dmsetup create (snapshot) failed: {}", + String::from_utf8_lossy(&output.stderr) + ); + + Command::new("dmsetup") + .arg("mknodes") + .output() + .expect("dmsetup mknodes failed"); + + let dm_dev = format!("/dev/mapper/{snap_dm_name}"); + + let mut child = GuestCommand::new(&guest) + .args(["--cpus", "boot=1"]) + .args(["--memory", "size=512M"]) + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .args([ + "--disk", + format!( + "path={}", + guest.disk_config.disk(DiskType::OperatingSystem).unwrap() + ) + .as_str(), + format!( + "path={}", + guest.disk_config.disk(DiskType::CloudInit).unwrap() + ) + .as_str(), + format!("path={},image_type=raw", &dm_dev).as_str(), + ]) + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + assert_eq!( + guest + .ssh_command("lsblk | grep -c vdc") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 1 + ); + + let discard_max = guest + .ssh_command("cat /sys/block/vdc/queue/discard_max_bytes") + .unwrap() + .trim() + .parse::() + .unwrap_or_default(); + assert!( + discard_max > 0, + "discard_max_bytes={discard_max}, VIRTIO_BLK_F_DISCARD not negotiated" + ); + + guest + .ssh_command("sudo dd if=/dev/urandom of=/dev/vdc bs=4096 count=1024 oflag=direct") + .unwrap(); + guest.ssh_command("sync").unwrap(); + + // Discard is expected to fail on DM snapshot because the + // snapshot target does not support BLKDISCARD. 
+ for attempt in 1..=3 { + let result = guest + .ssh_command("sudo blkdiscard -o 0 -l 4194304 /dev/vdc 2>&1; echo rc=$?") + .unwrap(); + println!("blkdiscard attempt {attempt}: {result}"); + + let uptime = guest.ssh_command("uptime").unwrap(); + assert!( + !uptime.is_empty(), + "Guest unresponsive after blkdiscard attempt {attempt}" + ); } - })); + + guest + .ssh_command("sudo dd if=/dev/urandom of=/dev/vdc bs=4096 count=256 oflag=direct") + .unwrap(); + let readback = guest + .ssh_command("sudo dd if=/dev/vdc bs=4096 count=1 iflag=direct 2>/dev/null | od -A n -t x1 | head -1") + .unwrap(); + assert!( + !readback.trim().is_empty(), + "Failed to read back from device after discard errors" + ); + }); kill_child(&mut child); let output = child.wait_with_output().unwrap(); handle_child_output(r, &output); - if verify_disk { - disk_check_consistency(&test_disk_path, None); - } - } - - #[test] - fn test_virtio_block_discard_qcow2() { - _test_virtio_block_discard("qcow2", "qcow2", &[], true, true); - } - - #[test] - fn test_virtio_block_discard_raw() { - _test_virtio_block_discard("raw", "raw", &[], true, false); - } - - #[test] - fn test_virtio_block_discard_unsupported_vhd() { - _test_virtio_block_discard("vhd", "vpc", &["-o", "subformat=fixed"], false, false); + let _ = Command::new("dmsetup") + .args(["remove", &snap_dm_name]) + .output(); + let _ = Command::new("dmsetup") + .args(["remove", &cow_dm_name]) + .output(); + let _ = Command::new("losetup").args(["-d", &origin_loop]).output(); + let _ = Command::new("losetup").args(["-d", &cow_loop]).output(); } - #[test] - fn test_virtio_block_discard_unsupported_vhdx() { - _test_virtio_block_discard("vhdx", "vhdx", &[], false, false); + fn _test_virtio_block_fstrim( + format_name: &str, + qemu_img_format: &str, + extra_create_args: &[&str], + expect_fstrim_success: bool, + verify_disk: bool, + ) { + _test_virtio_block_fstrim_with_backend( + format_name, + qemu_img_format, + extra_create_args, + expect_fstrim_success, + 
verify_disk, + false, + ); } - fn _test_virtio_block_fstrim( + fn _test_virtio_block_fstrim_with_backend( format_name: &str, qemu_img_format: &str, extra_create_args: &[&str], expect_fstrim_success: bool, verify_disk: bool, + disable_io_uring: bool, ) { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); let guest = Guest::new(Box::new(disk_config)); @@ -7612,7 +4395,7 @@ mod common_parallel { .as_path() .join(format!("fstrim_test.{}", format_name.to_lowercase())); - let mut cmd = format!("qemu-img create -f {} ", qemu_img_format); + let mut cmd = format!("qemu-img create -f {qemu_img_format} "); if !extra_create_args.is_empty() { cmd.push_str(&extra_create_args.join(" ")); cmd.push(' '); @@ -7622,8 +4405,7 @@ mod common_parallel { let res = exec_host_command_output(&cmd); assert!( res.status.success(), - "Failed to create {} test image", - format_name + "Failed to create {format_name} test image" ); const WRITE_SIZE_MB: u64 = 4; @@ -7631,7 +4413,7 @@ mod common_parallel { let mut child = GuestCommand::new(&guest) .args(["--cpus", "boot=4"]) - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .args([ @@ -7647,9 +4429,14 @@ mod common_parallel { ) .as_str(), format!( - "path={},num_queues=4,image_type={}", + "path={},num_queues=4,image_type={}{}", test_disk_path.to_str().unwrap(), - format_name.to_lowercase() + format_name.to_lowercase(), + if disable_io_uring { + ",_disable_io_uring=on" + } else { + "" + } ) .as_str(), ]) @@ -7682,8 +4469,7 @@ mod common_parallel { for (iteration, &write_size_kb) in BLOCK_DISCARD_TEST_SIZES_KB.iter().enumerate() { guest .ssh_command(&format!( - "sudo dd if=/dev/zero of=/mnt/test/testfile{} bs=1K count={}", - iteration, write_size_kb + "sudo dd if=/dev/zero of=/mnt/test/testfile{iteration} bs=1K count={write_size_kb}" )) .unwrap(); @@ -7695,20 +4481,19 @@ mod common_parallel { "ls -s --block-size=1 {}", 
test_disk_path.to_str().unwrap() )); - if res.status.success() { - if let Some(size) = String::from_utf8_lossy(&res.stdout) + if res.status.success() + && let Some(size) = String::from_utf8_lossy(&res.stdout) .split_whitespace() .next() .and_then(|s| s.parse::().ok()) - { - max_size_during_writes.set(max_size_during_writes.get().max(size)); - } + { + max_size_during_writes.set(max_size_during_writes.get().max(size)); } } // Make blocks available for discard guest - .ssh_command(&format!("sudo rm /mnt/test/testfile{}", iteration)) + .ssh_command(&format!("sudo rm /mnt/test/testfile{iteration}")) .unwrap(); guest.ssh_command("sync").unwrap(); @@ -7719,10 +4504,7 @@ mod common_parallel { // Would output like "/mnt/test: X bytes (Y MB) trimmed" assert!( fstrim_result.contains("trimmed") || fstrim_result.contains("bytes"), - "fstrim iteration {} ({}KB) should report trimmed bytes: {}", - iteration, - write_size_kb, - fstrim_result + "fstrim iteration {iteration} ({write_size_kb}KB) should report trimmed bytes: {fstrim_result}" ); } else { // For unsupported formats, expect fstrim to fail @@ -7788,16 +4570,17 @@ mod common_parallel { _test_virtio_block_fstrim("raw", "raw", &[], true, false); } + #[test] + fn test_virtio_block_fstrim_raw_aio() { + _test_virtio_block_fstrim_with_backend("raw", "raw", &[], true, false, true); + } + #[test] fn test_virtio_block_fstrim_unsupported_vhd() { _test_virtio_block_fstrim("vhd", "vpc", &["-o", "subformat=fixed"], false, false); } - // VHDX backend has a multiqueue bug causing filesystem corruption. - // The _test_virtio_block_fstrim helper uses num_queues>1 which triggers the bug. 
- // Ref: #7665 #[test] - #[ignore] fn test_virtio_block_fstrim_unsupported_vhdx() { _test_virtio_block_fstrim("vhdx", "vhdx", &[], false, false); } @@ -7816,13 +4599,11 @@ mod common_parallel { let test_disk_path = guest.tmp_dir.as_path().join("sparse_off_test.raw"); let test_disk_path = test_disk_path.to_str().unwrap(); - let res = exec_host_command_output(&format!( - "truncate -s {} {}", - TEST_DISK_SIZE, test_disk_path - )); + let res = + exec_host_command_output(&format!("truncate -s {TEST_DISK_SIZE} {test_disk_path}")); assert!(res.status.success(), "Failed to create sparse test file"); - let res = exec_host_command_output(&format!("ls -s --block-size=1 {}", test_disk_path)); + let res = exec_host_command_output(&format!("ls -s --block-size=1 {test_disk_path}")); assert!(res.status.success()); let initial_bytes: u64 = String::from_utf8_lossy(&res.stdout) .split_whitespace() @@ -7831,13 +4612,12 @@ mod common_parallel { .expect("Failed to parse initial disk usage"); assert!( initial_bytes < INITIAL_ALLOCATION_THRESHOLD, - "File should be initially sparse: {} bytes allocated", - initial_bytes + "File should be initially sparse: {initial_bytes} bytes allocated" ); let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) + .default_cpus() + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .args([ @@ -7852,7 +4632,7 @@ mod common_parallel { guest.disk_config.disk(DiskType::CloudInit).unwrap() ) .as_str(), - format!("path={},sparse=off", test_disk_path).as_str(), + format!("path={test_disk_path},sparse=off").as_str(), ]) .default_net() .capture_output() @@ -7882,7 +4662,7 @@ mod common_parallel { // - physical >= logical is fully allocated, modulo block alignment // - physical < logical is still sparse - let res = exec_host_command_output(&format!("ls -l {}", test_disk_path)); + let res = exec_host_command_output(&format!("ls -l 
{test_disk_path}")); assert!(res.status.success()); let logical_size: u64 = String::from_utf8_lossy(&res.stdout) .split_whitespace() @@ -7890,7 +4670,7 @@ mod common_parallel { .and_then(|s| s.parse().ok()) .expect("Failed to parse logical size"); - let res = exec_host_command_output(&format!("ls -s --block-size=1 {}", test_disk_path)); + let res = exec_host_command_output(&format!("ls -s --block-size=1 {test_disk_path}")); assert!(res.status.success()); let physical_size: u64 = String::from_utf8_lossy(&res.stdout) .split_whitespace() @@ -7900,40 +4680,32 @@ mod common_parallel { assert_eq!( logical_size, TEST_DISK_SIZE_BYTES, - "Logical size should be exactly {} bytes, got {}", - TEST_DISK_SIZE_BYTES, logical_size + "Logical size should be exactly {TEST_DISK_SIZE_BYTES} bytes, got {logical_size}" ); - let res = exec_host_command_output(&format!("stat -c '%o' {}", test_disk_path)); + let res = exec_host_command_output(&format!("stat -c '%o' {test_disk_path}")); assert!(res.status.success()); let block_size: u64 = String::from_utf8_lossy(&res.stdout) .trim() .parse() .expect("Failed to parse block size from stat"); - let expected_max = ((logical_size + block_size - 1) / block_size) * block_size; + let expected_max = logical_size.div_ceil(block_size) * block_size; assert!( physical_size >= logical_size, - "File should be fully allocated with sparse=off: logical={} bytes, physical={} bytes (physical < logical means still sparse)", - logical_size, - physical_size + "File should be fully allocated with sparse=off: logical={logical_size} bytes, physical={physical_size} bytes (physical < logical means still sparse)" ); assert!( physical_size <= expected_max, - "Physical size seems too large: logical={} bytes, physical={} bytes, expected_max={} bytes (block_size={})", - logical_size, - physical_size, - expected_max, - block_size + "Physical size seems too large: logical={logical_size} bytes, physical={physical_size} bytes, expected_max={expected_max} bytes 
(block_size={block_size})" ); } #[test] fn test_virtio_block_sparse_off_qcow2() { const TEST_DISK_SIZE: &str = "2G"; - const CLUSTER_SIZE_BYTES: u64 = 64 * 1024; let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); let guest = Guest::new(Box::new(disk_config)); @@ -7943,8 +4715,7 @@ mod common_parallel { let test_disk_path = test_disk_path.to_str().unwrap(); let res = exec_host_command_output(&format!( - "qemu-img create -f qcow2 {} {}", - test_disk_path, TEST_DISK_SIZE + "qemu-img create -f qcow2 {test_disk_path} {TEST_DISK_SIZE}" )); assert!(res.status.success(), "Failed to create QCOW2 test image"); @@ -7953,7 +4724,7 @@ mod common_parallel { let mut child = GuestCommand::new(&guest) .args(["--cpus", "boot=4"]) - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .args([ @@ -7968,7 +4739,7 @@ mod common_parallel { guest.disk_config.disk(DiskType::CloudInit).unwrap() ) .as_str(), - format!("path={},sparse=off,num_queues=4", test_disk_path).as_str(), + format!("path={test_disk_path},sparse=off,num_queues=4").as_str(), ]) .default_net() .capture_output() @@ -7988,38 +4759,36 @@ mod common_parallel { 1 ); - let mut current_offset_kb = 1024; - - for (_iteration, &size_kb) in BLOCK_DISCARD_TEST_SIZES_KB.iter().enumerate() { - guest - .ssh_command(&format!( - "sudo dd if=/dev/urandom of=/dev/vdc bs=1K count={} seek={} oflag=direct", - size_kb, current_offset_kb - )) - .unwrap(); - - guest.ssh_command("sync").unwrap(); - - guest - .ssh_command(&format!( - "sudo blkdiscard -o {} -l {} /dev/vdc", - current_offset_kb * 1024, - size_kb * 1024 - )) - .unwrap(); - - guest.ssh_command("sync").unwrap(); + // With sparse=off, DISCARD should NOT be advertised. + // blkdiscard is expected to fail. 
+ let discard_result = + guest.ssh_command("sudo blkdiscard -o 1048576 -l 1048576 /dev/vdc 2>&1; echo $?"); + let exit_code = discard_result + .unwrap() + .trim() + .lines() + .last() + .unwrap_or("1") + .parse::() + .unwrap_or(1); + assert_ne!( + exit_code, 0, + "blkdiscard should fail with sparse=off (DISCARD not advertised)" + ); - // Verify VM sees zeros in discarded region - assert_guest_disk_region_is_zero( - &guest, - "/dev/vdc", - current_offset_kb * 1024, - size_kb * 1024, - ); + // WRITE_ZEROES should still work via blkdiscard --zeroout + guest + .ssh_command( + "sudo dd if=/dev/urandom of=/dev/vdc bs=1K count=64 seek=1024 oflag=direct", + ) + .unwrap(); + guest.ssh_command("sync").unwrap(); + guest + .ssh_command("sudo blkdiscard -z -o 1048576 -l 65536 /dev/vdc") + .unwrap(); + guest.ssh_command("sync").unwrap(); - current_offset_kb += size_kb + 64; - } + assert_guest_disk_region_is_zero(&guest, "/dev/vdc", 1048576, 65536); }); kill_child(&mut child); @@ -8030,14 +4799,13 @@ mod common_parallel { handle_child_output(r, &output); + // WRITE_ZEROES should still produce zero-flagged regions assert!( zero_regions_after > zero_regions_before, - "Expected zero-flagged regions to increase with sparse=off: before={}, after={}", - zero_regions_before, - zero_regions_after + "Expected zero-flagged regions to increase via WRITE_ZEROES: before={zero_regions_before}, after={zero_regions_after}" ); - disk_check_consistency(&test_disk_path, None); + disk_check_consistency(test_disk_path, None); } #[test] @@ -8052,7 +4820,7 @@ mod common_parallel { //Let's start a 4G guest with balloon occupied 2G memory let mut child = GuestCommand::new(&guest) .args(["--api-socket", &api_socket]) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", "size=4G"]) .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -8069,8 +4837,9 @@ mod common_parallel { // Wait for balloon memory's initialization and check its size. 
// The virtio-balloon driver might take a few seconds to report the // balloon effective size back to the VMM. - thread::sleep(std::time::Duration::new(20, 0)); - + assert!(wait_until(Duration::from_secs(20), || { + balloon_size(&api_socket) == 2147483648 + })); let orig_balloon = balloon_size(&api_socket); println!("The original balloon memory size is {orig_balloon} bytes"); assert!(orig_balloon == 2147483648); @@ -8083,7 +4852,9 @@ mod common_parallel { // Give some time for the OOM to happen in the guest and be reported // back to the host. - thread::sleep(std::time::Duration::new(20, 0)); + assert!(wait_until(Duration::from_secs(20), || { + balloon_size(&api_socket) < 2147483648 + })); // 2nd: check balloon_mem's value to verify balloon has been automatically deflated let deflated_balloon = balloon_size(&api_socket); @@ -8106,7 +4877,7 @@ mod common_parallel { //Let's start a 4G guest with balloon occupied 2G memory let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) + .default_cpus() .args(["--memory", "size=4G"]) .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -8139,18 +4910,16 @@ mod common_parallel { .unwrap(); }); - // Wait for 50 seconds to make sure the stress command is consuming - // the expected amount of memory. - thread::sleep(std::time::Duration::new(50, 0)); + // Wait for guest memory consumption to reach the expected level. + assert!(wait_until(Duration::from_secs(60), || process_rss_kib(pid) >= 2097152)); let rss = process_rss_kib(pid); println!("RSS {rss} >= 2097152"); assert!(rss >= 2097152); - // Wait for an extra minute to make sure the stress command has - // completed and that the guest reported the free pages to the VMM - // through the virtio-balloon device. We expect the RSS to be under - // 2GiB. - thread::sleep(std::time::Duration::new(60, 0)); + // Wait for stress to complete and free-page reporting to shrink RSS again. 
+ assert!(wait_until(Duration::from_secs(120), || process_rss_kib( + pid + ) < 2097152)); let rss = process_rss_kib(pid); println!("RSS {rss} < 2097152"); assert!(rss < 2097152); @@ -8190,8 +4959,8 @@ mod common_parallel { let mut cmd = GuestCommand::new(&guest); cmd.args(["--api-socket", &api_socket]) - .args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) + .default_cpus() + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .default_disks() @@ -8223,268 +4992,75 @@ mod common_parallel { let pmem_temp_file = TempFile::new().unwrap(); pmem_temp_file.as_file().set_len(128 << 20).unwrap(); - let (cmd_success, cmd_output) = remote_command_w_output( - &api_socket, - "add-pmem", - Some(&format!( - "file={},id=test0{}", - pmem_temp_file.as_path().to_str().unwrap(), - if let Some(pci_segment) = pci_segment { - format!(",pci_segment={pci_segment}") - } else { - String::new() - } - )), - ); - assert!(cmd_success); - if let Some(pci_segment) = pci_segment { - assert!(String::from_utf8_lossy(&cmd_output).contains(&format!( - "{{\"id\":\"test0\",\"bdf\":\"{pci_segment:04x}:00:01.0\"}}" - ))); - } else { - assert!( - String::from_utf8_lossy(&cmd_output) - .contains("{\"id\":\"test0\",\"bdf\":\"0000:00:06.0\"}") - ); - } - - // Check that /dev/pmem0 exists and the block size is 128M - assert_eq!( - guest - .ssh_command("lsblk | grep pmem0 | grep -c 128M") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 1 - ); - - guest.reboot_linux(0); - - // Check still there after reboot - assert_eq!( - guest - .ssh_command("lsblk | grep pmem0 | grep -c 128M") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 1 - ); - - assert!(remote_command(&api_socket, "remove-device", Some("test0"))); - - thread::sleep(std::time::Duration::new(20, 0)); - - // Check device has gone away - assert_eq!( - guest - .ssh_command("lsblk | grep -c pmem0.*128M || true") - .unwrap() - .trim() - .parse::() - 
.unwrap_or(1), - 0 - ); - - guest.reboot_linux(1); - - // Check still absent after reboot - assert_eq!( - guest - .ssh_command("lsblk | grep -c pmem0.*128M || true") - .unwrap() - .trim() - .parse::() - .unwrap_or(1), - 0 - ); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); - } - - #[test] - fn test_net_hotplug() { - _test_net_hotplug(None); - } - - #[test] - fn test_net_multi_segment_hotplug() { - _test_net_hotplug(Some(15)); - } - - fn _test_net_hotplug(pci_segment: Option) { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - - #[cfg(target_arch = "x86_64")] - let kernel_path = direct_kernel_boot_path(); - #[cfg(target_arch = "aarch64")] - let kernel_path = edk2_path(); - - let api_socket = temp_api_path(&guest.tmp_dir); - - // Boot without network - let mut cmd = GuestCommand::new(&guest); - - cmd.args(["--api-socket", &api_socket]) - .args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_net() - .default_disks() - .capture_output(); - - if pci_segment.is_some() { - cmd.args([ - "--platform", - &format!("num_pci_segments={MAX_NUM_PCI_SEGMENTS}"), - ]); - } - - let mut child = cmd.spawn().unwrap(); - - guest.wait_vm_boot().unwrap(); - - let r = std::panic::catch_unwind(|| { - // Add network - let (cmd_success, cmd_output) = remote_command_w_output( - &api_socket, - "add-net", - Some( - format!( - "id=test0,tap=,mac={},ip={},mask=255.255.255.128{}", - guest.network.guest_mac1, - guest.network.host_ip1, - if let Some(pci_segment) = pci_segment { - format!(",pci_segment={pci_segment}") - } else { - String::new() - } - ) - .as_str(), - ), - ); - assert!(cmd_success); - - if let Some(pci_segment) = pci_segment { - assert!(String::from_utf8_lossy(&cmd_output).contains(&format!( - 
"{{\"id\":\"test0\",\"bdf\":\"{pci_segment:04x}:00:01.0\"}}" - ))); - } else { - assert!( - String::from_utf8_lossy(&cmd_output) - .contains("{\"id\":\"test0\",\"bdf\":\"0000:00:06.0\"}") - ); - } - - thread::sleep(std::time::Duration::new(5, 0)); - - // 2 network interfaces + default localhost ==> 3 interfaces - assert_eq!( - guest - .ssh_command("ip -o link | wc -l") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 3 - ); - - // Test the same using the added network interface's IP - assert_eq!( - ssh_command_ip( - "ip -o link | wc -l", - &guest.network.guest_ip1, - DEFAULT_SSH_RETRIES, - DEFAULT_SSH_TIMEOUT - ) - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 3 - ); - - // Remove network - assert!(remote_command(&api_socket, "remove-device", Some("test0"),)); - thread::sleep(std::time::Duration::new(5, 0)); - - // Add network - let (cmd_success, cmd_output) = remote_command_w_output( + let (cmd_success, cmd_output, _) = remote_command_w_output( &api_socket, - "add-net", - Some( - format!( - "id=test1,tap=,mac={},ip={},mask=255.255.255.128{}", - guest.network.guest_mac1, - guest.network.host_ip1, - if let Some(pci_segment) = pci_segment { - format!(",pci_segment={pci_segment}") - } else { - String::new() - } - ) - .as_str(), - ), + "add-pmem", + Some(&format!( + "file={},id=test0{}", + pmem_temp_file.as_path().to_str().unwrap(), + if let Some(pci_segment) = pci_segment { + format!(",pci_segment={pci_segment}") + } else { + String::new() + } + )), ); assert!(cmd_success); - if let Some(pci_segment) = pci_segment { assert!(String::from_utf8_lossy(&cmd_output).contains(&format!( - "{{\"id\":\"test1\",\"bdf\":\"{pci_segment:04x}:00:01.0\"}}" + "{{\"id\":\"test0\",\"bdf\":\"{pci_segment:04x}:00:01.0\"}}" ))); } else { assert!( String::from_utf8_lossy(&cmd_output) - .contains("{\"id\":\"test1\",\"bdf\":\"0000:00:06.0\"}") + .contains("{\"id\":\"test0\",\"bdf\":\"0000:00:06.0\"}") ); } - thread::sleep(std::time::Duration::new(5, 0)); - - // 2 
network interfaces + default localhost ==> 3 interfaces + // Check that /dev/pmem0 exists and the block size is 128M assert_eq!( guest - .ssh_command("ip -o link | wc -l") + .ssh_command("lsblk | grep pmem0 | grep -c 128M") .unwrap() .trim() .parse::() .unwrap_or_default(), - 3 + 1 ); guest.reboot_linux(0); - // 2 network interfaces + default localhost ==> 3 interfaces + // Check still there after reboot assert_eq!( guest - .ssh_command("ip -o link | wc -l") + .ssh_command("lsblk | grep pmem0 | grep -c 128M") .unwrap() .trim() .parse::() .unwrap_or_default(), - 3 + 1 ); - // Test the same using the added network interface's IP + assert!(remote_command(&api_socket, "remove-device", Some("test0"))); + + // Wait for the pmem device to disappear from lsblk. + assert!(wait_until(Duration::from_secs(20), || { + guest + .ssh_command("lsblk | grep -c pmem0.*128M || true") + .is_ok_and(|output| output.trim().parse::().unwrap_or(1) == 0) + })); + + guest.reboot_linux(1); + + // Check still absent after reboot assert_eq!( - ssh_command_ip( - "ip -o link | wc -l", - &guest.network.guest_ip1, - DEFAULT_SSH_RETRIES, - DEFAULT_SSH_TIMEOUT - ) - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 3 + guest + .ssh_command("lsblk | grep -c pmem0.*128M || true") + .unwrap() + .trim() + .parse::() + .unwrap_or(1), + 0 ); }); @@ -8494,6 +5070,29 @@ mod common_parallel { handle_child_output(r, &output); } + #[test] + fn test_net_hotplug() { + #[cfg(target_arch = "x86_64")] + let kernel_path = direct_kernel_boot_path(); + #[cfg(target_arch = "aarch64")] + let kernel_path = edk2_path(); + let guest = + basic_regular_guest!(JAMMY_IMAGE_NAME).with_kernel_path(kernel_path.to_str().unwrap()); + + _test_net_hotplug(&guest, MAX_NUM_PCI_SEGMENTS, None); + } + + #[test] + fn test_net_multi_segment_hotplug() { + #[cfg(target_arch = "x86_64")] + let kernel_path = direct_kernel_boot_path(); + #[cfg(target_arch = "aarch64")] + let kernel_path = edk2_path(); + let guest = + 
basic_regular_guest!(JAMMY_IMAGE_NAME).with_kernel_path(kernel_path.to_str().unwrap()); + _test_net_hotplug(&guest, MAX_NUM_PCI_SEGMENTS, Some(15)); + } + #[test] fn test_initramfs() { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); @@ -8545,40 +5144,8 @@ mod common_parallel { #[test] fn test_counters() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - let api_socket = temp_api_path(&guest.tmp_dir); - - let mut cmd = GuestCommand::new(&guest); - cmd.args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) - .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .args(["--net", guest.default_net_string().as_str()]) - .args(["--api-socket", &api_socket]) - .capture_output(); - - let mut child = cmd.spawn().unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - let orig_counters = get_counters(&api_socket); - guest - .ssh_command("dd if=/dev/zero of=test count=8 bs=1M") - .unwrap(); - - let new_counters = get_counters(&api_socket); - - // Check that all the counters have increased - assert!(new_counters > orig_counters); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); + _test_counters(&guest); } #[test] @@ -8590,7 +5157,7 @@ mod common_parallel { let mut cmd = GuestCommand::new(&guest); cmd.args(["--cpus", "boot=4"]) - .args(["--memory", "size=4G"]) + .args(["--memory", "size=1G"]) .args(["--kernel", fw_path(FwType::RustHypervisorFirmware).as_str()]) .default_disks() .args(["--net", guest.default_net_string().as_str()]) @@ -8638,7 +5205,7 @@ mod common_parallel { let mut cmd = GuestCommand::new(&guest); cmd.args(["--cpus", "boot=4"]) - .args(["--memory", "size=4G"]) + .args(["--memory", "size=1G"]) .args(["--kernel", 
fw_path(FwType::RustHypervisorFirmware).as_str()]) .default_disks() .args(["--net", guest.default_net_string().as_str()]) @@ -8668,763 +5235,929 @@ mod common_parallel { #[test] fn test_watchdog() { - let disk_config = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - let api_socket = temp_api_path(&guest.tmp_dir); - - let kernel_path = direct_kernel_boot_path(); - let event_path = temp_event_monitor_path(&guest.tmp_dir); - - let mut cmd = GuestCommand::new(&guest); - cmd.args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .args(["--net", guest.default_net_string().as_str()]) - .args(["--watchdog"]) - .args(["--api-socket", &api_socket]) - .args(["--event-monitor", format!("path={event_path}").as_str()]) - .capture_output(); - - let mut child = cmd.spawn().unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - let mut expected_reboot_count = 1; - - // Enable the watchdog with a 15s timeout - enable_guest_watchdog(&guest, 15); - - assert_eq!(get_reboot_count(&guest), expected_reboot_count); - assert_eq!( - guest - .ssh_command("sudo journalctl | grep -c -- \"Watchdog started\"") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 1 - ); - - // Allow some normal time to elapse to check we don't get spurious reboots - thread::sleep(std::time::Duration::new(40, 0)); - // Check no reboot - assert_eq!(get_reboot_count(&guest), expected_reboot_count); - - // Trigger a panic (sync first). We need to do this inside a screen with a delay so the SSH command returns. 
- guest.ssh_command("screen -dmS reboot sh -c \"sleep 5; echo s | tee /proc/sysrq-trigger; echo c | sudo tee /proc/sysrq-trigger\"").unwrap(); - // Allow some time for the watchdog to trigger (max 30s) and reboot to happen - guest.wait_vm_boot_custom_timeout(50).unwrap(); - // Check a reboot is triggered by the watchdog - expected_reboot_count += 1; - assert_eq!(get_reboot_count(&guest), expected_reboot_count); + let guest = basic_regular_guest!(FOCAL_IMAGE_NAME); + _test_watchdog(&guest); + } - #[cfg(target_arch = "x86_64")] - { - // Now pause the VM and remain offline for 30s - assert!(remote_command(&api_socket, "pause", None)); - let latest_events = [ - &MetaEvent { - event: "pausing".to_string(), - device_id: None, - }, - &MetaEvent { - event: "paused".to_string(), - device_id: None, - }, - ]; - assert!(check_latest_events_exact(&latest_events, &event_path)); - assert!(remote_command(&api_socket, "resume", None)); + #[test] + fn test_pvpanic() { + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME); + _test_pvpanic(&guest); + } - // Check no reboot - assert_eq!(get_reboot_count(&guest), expected_reboot_count); - } - }); + #[test] + fn test_tap_from_fd() { + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME).with_cpu(2); + _test_tap_from_fd(&guest); + } - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); + #[test] + #[cfg_attr(target_arch = "aarch64", ignore = "See #5443")] + fn test_macvtap() { + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME).with_cpu(2); + _test_macvtap(&guest, false, "guestmacvtap0", "hostmacvtap0"); + } - handle_child_output(r, &output); + #[test] + #[cfg_attr(target_arch = "aarch64", ignore = "See #5443")] + fn test_macvtap_hotplug() { + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME).with_cpu(2); + _test_macvtap(&guest, true, "guestmacvtap1", "hostmacvtap1"); } #[test] - fn test_pvpanic() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = 
Guest::new(Box::new(disk_config)); - let api_socket = temp_api_path(&guest.tmp_dir); - let event_path = temp_event_monitor_path(&guest.tmp_dir); + #[cfg(not(feature = "mshv"))] + fn test_ovs_dpdk() { + let disk_config1 = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); + let guest1 = Guest::new(Box::new(disk_config1)); - let kernel_path = direct_kernel_boot_path(); + let disk_config2 = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); + let guest2 = Guest::new(Box::new(disk_config2)); + let api_socket_source = format!("{}.1", temp_api_path(&guest2.tmp_dir)); - let mut cmd = GuestCommand::new(&guest); - cmd.args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .args(["--net", guest.default_net_string().as_str()]) - .args(["--pvpanic"]) - .args(["--api-socket", &api_socket]) - .args(["--event-monitor", format!("path={event_path}").as_str()]) - .capture_output(); + let (mut child1, mut child2) = + setup_ovs_dpdk_guests(&guest1, &guest2, &api_socket_source, false); - let mut child = cmd.spawn().unwrap(); + // Create the snapshot directory + let snapshot_dir = temp_snapshot_dir_path(&guest2.tmp_dir); let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); + // Remove one of the two ports from the OVS bridge + assert!(exec_host_command_status("ovs-vsctl del-port vhost-user1").success()); - // Trigger guest a panic - make_guest_panic(&guest); + // Spawn a new netcat listener in the first VM + let guest_ip = guest1.network.guest_ip0.clone(); + thread::spawn(move || { + ssh_command_ip( + "nc -l 12345", + &guest_ip, + DEFAULT_SSH_RETRIES, + DEFAULT_SSH_TIMEOUT, + ) + .unwrap(); + }); - // Wait a while for guest - thread::sleep(std::time::Duration::new(10, 0)); + guest1 + .wait_for_ssh_command( + "ss -ltnH | awk '{print $4}' | grep -q ':12345$'", + Duration::from_secs(20), + ) + .unwrap(); - let expected_sequential_events = 
[&MetaEvent { - event: "panic".to_string(), - device_id: None, - }]; - assert!(check_latest_events_exact( - &expected_sequential_events, - &event_path - )); - }); + // Check the connection fails this time + guest2.ssh_command("nc -vz 172.100.0.1 12345").unwrap_err(); - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); + // Add the OVS port back + assert!(exec_host_command_status("ovs-vsctl add-port ovsbr0 vhost-user1 -- set Interface vhost-user1 type=dpdkvhostuserclient options:vhost-server-path=/tmp/dpdkvhostclient1").success()); - handle_child_output(r, &output); - } + // And finally check the connection is functional again + guest2.ssh_command("nc -vz 172.100.0.1 12345").unwrap(); - #[test] - fn test_tap_from_fd() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - let kernel_path = direct_kernel_boot_path(); + // Pause the VM + assert!(remote_command(&api_socket_source, "pause", None)); + + // Take a snapshot from the VM + assert!(remote_command( + &api_socket_source, + "snapshot", + Some(format!("file://{snapshot_dir}").as_str()), + )); - // Create a TAP interface with multi-queue enabled - let num_queue_pairs: usize = 2; + // Wait for the source VM snapshot artifacts to be ready. 
+ assert!(wait_until(Duration::from_secs(10), || { + std::path::Path::new(&snapshot_dir).exists() + })); + }); - use std::str::FromStr; - let taps = net_util::open_tap( - Some("chtap0"), - Some(std::net::IpAddr::V4( - std::net::Ipv4Addr::from_str(&guest.network.host_ip0).unwrap(), - )), - None, - &mut None, - None, - num_queue_pairs, - Some(libc::O_RDWR | libc::O_NONBLOCK), - ) - .unwrap(); + // Shutdown the source VM + kill_child(&mut child2); + let output = child2.wait_with_output().unwrap(); + handle_child_output(r, &output); - let mut child = GuestCommand::new(&guest) - .args(["--cpus", &format!("boot={num_queue_pairs}")]) - .args(["--memory", "size=512M"]) - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() + // Remove the vhost-user socket file. + Command::new("rm") + .arg("-f") + .arg("/tmp/dpdkvhostclient2") + .output() + .unwrap(); + + let api_socket_restored = format!("{}.2", temp_api_path(&guest2.tmp_dir)); + // Restore the VM from the snapshot + let mut child2 = GuestCommand::new(&guest2) + .args(["--api-socket", &api_socket_restored]) .args([ - "--net", - &format!( - "fd=[{},{}],mac={},num_queues={}", - taps[0].as_raw_fd(), - taps[1].as_raw_fd(), - guest.network.guest_mac0, - num_queue_pairs * 2 - ), + "--restore", + format!("source_url=file://{snapshot_dir}").as_str(), ]) .capture_output() .spawn() .unwrap(); + // Wait for the restored VM to accept SSH again after resume. 
+ let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); + // Resume the VM + assert!(wait_until(Duration::from_secs(30), || remote_command( + &api_socket_restored, + "info", + None + ))); + assert!(remote_command(&api_socket_restored, "resume", None)); + guest2.wait_for_ssh(Duration::from_secs(30)).unwrap(); - assert_eq!( - guest - .ssh_command("ip -o link | wc -l") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 2 - ); + // Spawn a new netcat listener in the first VM + let guest_ip = guest1.network.guest_ip0.clone(); + thread::spawn(move || { + ssh_command_ip( + "nc -l 12345", + &guest_ip, + DEFAULT_SSH_RETRIES, + DEFAULT_SSH_TIMEOUT, + ) + .unwrap(); + }); - guest.reboot_linux(0); + guest1 + .wait_for_ssh_command( + "ss -ltnH | awk '{print $4}' | grep -q ':12345$'", + Duration::from_secs(20), + ) + .unwrap(); - assert_eq!( - guest - .ssh_command("ip -o link | wc -l") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 2 - ); + // And check the connection is still functional after restore + guest2.ssh_command("nc -vz 172.100.0.1 12345").unwrap(); }); - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); + kill_child(&mut child1); + kill_child(&mut child2); - handle_child_output(r, &output); - } + let output = child1.wait_with_output().unwrap(); + child2.wait().unwrap(); - // By design, a guest VM won't be able to connect to the host - // machine when using a macvtap network interface (while it can - // communicate externally). As a workaround, this integration - // test creates two macvtap interfaces in 'bridge' mode on the - // same physical net interface, one for the guest and one for - // the host. With additional setup on the IP address and the - // routing table, it enables the communications between the - // guest VM and the host machine. 
- // Details: https://wiki.libvirt.org/page/TroubleshootMacvtapHostFail - fn _test_macvtap(hotplug: bool, guest_macvtap_name: &str, host_macvtap_name: &str) { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - let api_socket = temp_api_path(&guest.tmp_dir); + cleanup_ovs_dpdk(); - #[cfg(target_arch = "x86_64")] - let kernel_path = direct_kernel_boot_path(); - #[cfg(target_arch = "aarch64")] - let kernel_path = edk2_path(); + handle_child_output(r, &output); + } - let phy_net = "eth0"; + fn setup_spdk_nvme(nvme_dir: &std::path::Path) -> Child { + cleanup_spdk_nvme(); - // Create a macvtap interface for the guest VM to use assert!( exec_host_command_status(&format!( - "sudo ip link add link {phy_net} name {guest_macvtap_name} type macvtap mod bridge" + "mkdir -p {}", + nvme_dir.join("nvme-vfio-user").to_str().unwrap() )) .success() ); assert!( exec_host_command_status(&format!( - "sudo ip link set {} address {} up", - guest_macvtap_name, guest.network.guest_mac0 + "truncate {} -s 128M", + nvme_dir.join("test-disk.raw").to_str().unwrap() )) .success() ); - assert!( - exec_host_command_status(&format!("sudo ip link show {guest_macvtap_name}")).success() - ); - - let tap_index = - fs::read_to_string(format!("/sys/class/net/{guest_macvtap_name}/ifindex")).unwrap(); - let tap_device = format!("/dev/tap{}", tap_index.trim()); - - assert!(exec_host_command_status(&format!("sudo chown $UID.$UID {tap_device}")).success()); - - let cstr_tap_device = std::ffi::CString::new(tap_device).unwrap(); - let tap_fd1 = unsafe { libc::open(cstr_tap_device.as_ptr(), libc::O_RDWR) }; - assert!(tap_fd1 > 0); - let tap_fd2 = unsafe { libc::open(cstr_tap_device.as_ptr(), libc::O_RDWR) }; - assert!(tap_fd2 > 0); - - // Create a macvtap on the same physical net interface for - // the host machine to use assert!( exec_host_command_status(&format!( - "sudo ip link add link {phy_net} name {host_macvtap_name} type macvtap mod 
bridge" + "mkfs.ext4 {}", + nvme_dir.join("test-disk.raw").to_str().unwrap() )) .success() ); - // Use default mask "255.255.255.0" + + // Start the SPDK nvmf_tgt daemon to present NVMe device as a VFIO user device + let child = Command::new("/usr/local/bin/spdk-nvme/nvmf_tgt") + .args(["-i", "0", "-m", "0x1"]) + .spawn() + .unwrap(); + thread::sleep(std::time::Duration::new(2, 0)); + + assert!(exec_host_command_with_retries( + "/usr/local/bin/spdk-nvme/rpc.py nvmf_create_transport -t VFIOUSER", + 3, + std::time::Duration::new(5, 0), + )); assert!( exec_host_command_status(&format!( - "sudo ip address add {}/24 dev {}", - guest.network.host_ip0, host_macvtap_name + "/usr/local/bin/spdk-nvme/rpc.py bdev_aio_create {} test 512", + nvme_dir.join("test-disk.raw").to_str().unwrap() )) .success() ); - assert!( - exec_host_command_status(&format!("sudo ip link set dev {host_macvtap_name} up")) - .success() - ); + assert!(exec_host_command_status( + "/usr/local/bin/spdk-nvme/rpc.py nvmf_create_subsystem nqn.2019-07.io.spdk:cnode -a -s test" + ) + .success()); + assert!(exec_host_command_status( + "/usr/local/bin/spdk-nvme/rpc.py nvmf_subsystem_add_ns nqn.2019-07.io.spdk:cnode test" + ) + .success()); + assert!(exec_host_command_status(&format!( + "/usr/local/bin/spdk-nvme/rpc.py nvmf_subsystem_add_listener nqn.2019-07.io.spdk:cnode -t VFIOUSER -a {} -s 0", + nvme_dir.join("nvme-vfio-user").to_str().unwrap() + )) + .success()); - let mut guest_command = GuestCommand::new(&guest); - guest_command - .args(["--cpus", "boot=2"]) - .args(["--memory", "size=512M"]) - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .args(["--api-socket", &api_socket]); + child + } - let net_params = format!( - "fd=[{},{}],mac={},num_queues=4", - tap_fd1, tap_fd2, guest.network.guest_mac0 - ); + fn cleanup_spdk_nvme() { + exec_host_command_status("pkill -f nvmf_tgt"); + } - if !hotplug { - guest_command.args(["--net", 
&net_params]); - } + #[test] + fn test_vfio_user() { + let jammy_image = JAMMY_IMAGE_NAME.to_string(); + let disk_config = UbuntuDiskConfig::new(jammy_image); + let guest = Guest::new(Box::new(disk_config)); + + let spdk_nvme_dir = guest.tmp_dir.as_path().join("test-vfio-user"); + let mut spdk_child = setup_spdk_nvme(spdk_nvme_dir.as_path()); + + let api_socket = temp_api_path(&guest.tmp_dir); + let mut child = GuestCommand::new(&guest) + .args(["--api-socket", &api_socket]) + .default_cpus() + .args(["--memory", "size=1G,shared=on,hugepages=on"]) + .args(["--kernel", fw_path(FwType::RustHypervisorFirmware).as_str()]) + .args(["--serial", "tty", "--console", "off"]) + .default_disks() + .default_net() + .capture_output() + .spawn() + .unwrap(); - let mut child = guest_command.capture_output().spawn().unwrap(); + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); - if hotplug { - // Give some time to the VMM process to listen to the API - // socket. This is the only requirement to avoid the following - // call to ch-remote from failing. - thread::sleep(std::time::Duration::new(10, 0)); - // Hotplug the virtio-net device - let (cmd_success, cmd_output) = - remote_command_w_output(&api_socket, "add-net", Some(&net_params)); + // Hotplug the SPDK-NVMe device to the VM + let (cmd_success, cmd_output, _) = remote_command_w_output( + &api_socket, + "add-user-device", + Some(&format!( + "socket={},id=vfio_user0", + spdk_nvme_dir + .as_path() + .join("nvme-vfio-user/cntrl") + .to_str() + .unwrap(), + )), + ); assert!(cmd_success); - #[cfg(target_arch = "x86_64")] assert!( String::from_utf8_lossy(&cmd_output) - .contains("{\"id\":\"_net2\",\"bdf\":\"0000:00:05.0\"}") + .contains("{\"id\":\"vfio_user0\",\"bdf\":\"0000:00:05.0\"}") ); - #[cfg(target_arch = "aarch64")] - assert!( - String::from_utf8_lossy(&cmd_output) - .contains("{\"id\":\"_net0\",\"bdf\":\"0000:00:05.0\"}") + + // Check both if /dev/nvme exists and if the block size is 128M. 
+ assert!(wait_until(Duration::from_secs(10), || { + guest + .ssh_command("lsblk | grep nvme0n1 | grep -c 128M") + .ok() + .and_then(|output| output.trim().parse::().ok()) + == Some(1) + })); + + // Check changes persist after reboot + assert_eq!( + guest.ssh_command("sudo mount /dev/nvme0n1 /mnt").unwrap(), + "" + ); + assert_eq!(guest.ssh_command("ls /mnt").unwrap(), "lost+found\n"); + guest + .ssh_command("echo test123 | sudo tee /mnt/test") + .unwrap(); + assert_eq!(guest.ssh_command("sudo umount /mnt").unwrap(), ""); + assert_eq!(guest.ssh_command("ls /mnt").unwrap(), ""); + + guest.reboot_linux(0); + assert_eq!( + guest.ssh_command("sudo mount /dev/nvme0n1 /mnt").unwrap(), + "" + ); + assert_eq!( + guest.ssh_command("sudo cat /mnt/test").unwrap().trim(), + "test123" ); + }); + + let _ = spdk_child.kill(); + let _ = spdk_child.wait(); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); + } + + #[test] + #[cfg(target_arch = "x86_64")] + fn test_vdpa_block() { + // Before trying to run the test, verify the vdpa_sim_blk module is correctly loaded. + assert!(exec_host_command_status("lsmod | grep vdpa_sim_blk").success()); + + let guest = basic_regular_guest!(JAMMY_IMAGE_NAME).with_cpu(2); + _test_vdpa_block(&guest); + } + + #[test] + #[cfg(target_arch = "x86_64")] + fn test_vdpa_net() { + // Before trying to run the test, verify the vdpa_sim_net module is correctly loaded. + if !exec_host_command_status("lsmod | grep vdpa_sim_net").success() { + return; } - // The functional connectivity provided by the virtio-net device - // gets tested through wait_vm_boot() as it expects to receive a - // HTTP request, and through the SSH command as well. 
+ let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)); + + let kernel_path = direct_kernel_boot_path(); + + let mut child = GuestCommand::new(&guest) + .args(["--cpus", "boot=2"]) + .args(["--memory", "size=512M,hugepages=on"]) + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .default_disks() + .default_net() + .args(["--vdpa", "path=/dev/vhost-vdpa-2,num_queues=3"]) + .capture_output() + .spawn() + .unwrap(); + let r = std::panic::catch_unwind(|| { guest.wait_vm_boot().unwrap(); + // Check we can find network interface related to vDPA device assert_eq!( guest - .ssh_command("ip -o link | wc -l") + .ssh_command("ip -o link | grep -c ens6") .unwrap() .trim() .parse::() - .unwrap_or_default(), + .unwrap_or(0), + 1 + ); + + guest + .ssh_command("sudo ip link set dev ens6 address 00:e8:ca:33:ba:06") + .unwrap(); + guest + .ssh_command("sudo ip addr add 172.16.1.2/24 dev ens6") + .unwrap(); + guest.ssh_command("sudo ip link set up dev ens6").unwrap(); + + // Check there is no packet yet on both TX/RX of the network interface + assert_eq!( + guest + .ssh_command("ip -j -p -s link show ens6 | grep -c '\"packets\": 0'") + .unwrap() + .trim() + .parse::() + .unwrap_or(0), 2 ); - guest.reboot_linux(0); + // Send 6 packets with ping command + guest.ssh_command("ping 172.16.1.10 -c 6 || true").unwrap(); + // Check we can find 6 packets on both TX/RX of the network interface assert_eq!( guest - .ssh_command("ip -o link | wc -l") + .ssh_command("ip -j -p -s link show ens6 | grep -c '\"packets\": 6'") .unwrap() .trim() .parse::() - .unwrap_or_default(), + .unwrap_or(0), 2 ); + + // No need to check for hotplug as we already tested it through + // test_vdpa_block() }); kill_child(&mut child); - - exec_host_command_status(&format!("sudo ip link del {guest_macvtap_name}")); - exec_host_command_status(&format!("sudo ip link del {host_macvtap_name}")); - let 
output = child.wait_with_output().unwrap(); handle_child_output(r, &output); } #[test] - #[cfg_attr(target_arch = "aarch64", ignore = "See #5443")] - fn test_macvtap() { - _test_macvtap(false, "guestmacvtap0", "hostmacvtap0"); - } + #[cfg(not(feature = "mshv"))] // See issue #7439 + #[cfg(target_arch = "x86_64")] + fn test_tpm() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)); - #[test] - #[cfg_attr(target_arch = "aarch64", ignore = "See #5443")] - fn test_macvtap_hotplug() { - _test_macvtap(true, "guestmacvtap1", "hostmacvtap1"); + let (mut swtpm_command, swtpm_socket_path) = prepare_swtpm_daemon(&guest.tmp_dir); + + let mut guest_cmd = GuestCommand::new(&guest); + guest_cmd + .default_cpus() + .args(["--memory", "size=1G"]) + .args(["--kernel", fw_path(FwType::RustHypervisorFirmware).as_str()]) + .args(["--tpm", &format!("socket={swtpm_socket_path}")]) + .capture_output() + .default_disks() + .default_net(); + + // Start swtpm daemon + let mut swtpm_child = swtpm_command.spawn().unwrap(); + assert!(wait_until(Duration::from_secs(10), || { + std::path::Path::new(&swtpm_socket_path).exists() + })); + let mut child = guest_cmd.spawn().unwrap(); + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + assert_eq!( + guest.ssh_command("ls /dev/tpm0").unwrap().trim(), + "/dev/tpm0" + ); + guest.ssh_command("sudo tpm2_selftest -f").unwrap(); + guest + .ssh_command("echo 'hello' > /tmp/checksum_test; ") + .unwrap(); + guest.ssh_command("cmp <(sudo tpm2_pcrevent /tmp/checksum_test | grep sha256 | awk '{print $2}') <(sha256sum /tmp/checksum_test| awk '{print $1}')").unwrap(); + }); + + let _ = swtpm_child.kill(); + let _d_out = swtpm_child.wait_with_output().unwrap(); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); } #[test] - #[cfg(not(feature = "mshv"))] - fn test_ovs_dpdk() { - let disk_config1 = 
UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); - let guest1 = Guest::new(Box::new(disk_config1)); + #[cfg(target_arch = "x86_64")] + fn test_double_tty() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)); + let mut cmd = GuestCommand::new(&guest); + let api_socket = temp_api_path(&guest.tmp_dir); + let tty_str: &str = "console=hvc0 earlyprintk=ttyS0 "; + // linux printk module enable console log. + let con_dis_str: &str = "console [hvc0] enabled"; + // linux printk module disable console log. + let con_enb_str: &str = "bootconsole [earlyser0] disabled"; - let disk_config2 = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); - let guest2 = Guest::new(Box::new(disk_config2)); - let api_socket_source = format!("{}.1", temp_api_path(&guest2.tmp_dir)); + let kernel_path = direct_kernel_boot_path(); - let (mut child1, mut child2) = - setup_ovs_dpdk_guests(&guest1, &guest2, &api_socket_source, false); + cmd.default_cpus() + .default_memory() + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args([ + "--cmdline", + DIRECT_KERNEL_BOOT_CMDLINE + .replace("console=hvc0", tty_str) + .as_str(), + ]) + .capture_output() + .default_disks() + .default_net() + .args(["--serial", "tty"]) + .args(["--console", "tty"]) + .args(["--api-socket", &api_socket]); - // Create the snapshot directory - let snapshot_dir = temp_snapshot_dir_path(&guest2.tmp_dir); + let mut child = cmd.spawn().unwrap(); - let r = std::panic::catch_unwind(|| { - // Remove one of the two ports from the OVS bridge - assert!(exec_host_command_status("ovs-vsctl del-port vhost-user1").success()); + let mut r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + }); - // Spawn a new netcat listener in the first VM - let guest_ip = guest1.network.guest_ip0.clone(); - thread::spawn(move || { - ssh_command_ip( - "nc -l 12345", - &guest_ip, - DEFAULT_SSH_RETRIES, - DEFAULT_SSH_TIMEOUT, - ) - .unwrap(); + kill_child(&mut 
child); + let output = child.wait_with_output().unwrap(); + + if r.is_ok() { + r = std::panic::catch_unwind(|| { + let s = String::from_utf8_lossy(&output.stdout); + assert!(s.contains(tty_str)); + assert!(s.contains(con_dis_str)); + assert!(s.contains(con_enb_str)); }); + } - // Wait for the server to be listening - thread::sleep(std::time::Duration::new(5, 0)); + handle_child_output(r, &output); + } - // Check the connection fails this time - guest2.ssh_command("nc -vz 172.100.0.1 12345").unwrap_err(); + #[test] + #[cfg(target_arch = "x86_64")] + fn test_nmi() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)); + let api_socket = temp_api_path(&guest.tmp_dir); + let event_path = temp_event_monitor_path(&guest.tmp_dir); - // Add the OVS port back - assert!(exec_host_command_status("ovs-vsctl add-port ovsbr0 vhost-user1 -- set Interface vhost-user1 type=dpdkvhostuserclient options:vhost-server-path=/tmp/dpdkvhostclient1").success()); + let kernel_path = direct_kernel_boot_path(); + let cmd_line = format!("{} {}", DIRECT_KERNEL_BOOT_CMDLINE, "unknown_nmi_panic=1"); - // And finally check the connection is functional again - guest2.ssh_command("nc -vz 172.100.0.1 12345").unwrap(); + let mut cmd = GuestCommand::new(&guest); + cmd.args(["--cpus", "boot=4"]) + .default_memory() + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args(["--cmdline", cmd_line.as_str()]) + .default_disks() + .args(["--net", guest.default_net_string().as_str()]) + .args(["--pvpanic"]) + .args(["--api-socket", &api_socket]) + .args(["--event-monitor", format!("path={event_path}").as_str()]) + .capture_output(); - // Pause the VM - assert!(remote_command(&api_socket_source, "pause", None)); + let mut child = cmd.spawn().unwrap(); - // Take a snapshot from the VM - assert!(remote_command( - &api_socket_source, - "snapshot", - Some(format!("file://{snapshot_dir}").as_str()), - )); + let r = std::panic::catch_unwind(|| { 
+ guest.wait_vm_boot().unwrap(); - // Wait to make sure the snapshot is completed - thread::sleep(std::time::Duration::new(10, 0)); + assert!(remote_command(&api_socket, "nmi", None)); + + let expected_sequential_events = [&MetaEvent { + event: "panic".to_string(), + device_id: None, + }]; + assert!(wait_until(Duration::from_secs(3), || { + check_latest_events_exact(&expected_sequential_events, &event_path) + })); }); - // Shutdown the source VM - kill_child(&mut child2); - let output = child2.wait_with_output().unwrap(); + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + handle_child_output(r, &output); + } - // Remove the vhost-user socket file. - Command::new("rm") - .arg("-f") - .arg("/tmp/dpdkvhostclient2") - .output() - .unwrap(); + #[test] + fn test_pci_device_id() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)); - let api_socket_restored = format!("{}.2", temp_api_path(&guest2.tmp_dir)); - // Restore the VM from the snapshot - let mut child2 = GuestCommand::new(&guest2) - .args(["--api-socket", &api_socket_restored]) - .args([ - "--restore", - format!("source_url=file://{snapshot_dir}").as_str(), - ]) - .capture_output() - .spawn() - .unwrap(); + #[cfg(target_arch = "x86_64")] + let kernel_path = direct_kernel_boot_path(); + #[cfg(target_arch = "aarch64")] + let kernel_path = edk2_path(); + + let api_socket = temp_api_path(&guest.tmp_dir); + + // Boot without network + let mut cmd = GuestCommand::new(&guest); + + cmd.args(["--api-socket", &api_socket]) + .default_cpus() + .default_memory() + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .default_net() + .default_disks() + .capture_output(); + + let mut child = cmd.spawn().unwrap(); + + guest.wait_vm_boot().unwrap(); + + // Add a network device with non-static device id request + let r = std::panic::catch_unwind(|| { + let (cmd_success, cmd_stdout, _) 
= remote_command_w_output( + &api_socket, + "add-net", + Some( + format!( + "id=test0,tap=,mac={},ip={},mask=255.255.255.128", + guest.network.guest_mac1, guest.network.host_ip1, + ) + .as_str(), + ), + ); + assert!(cmd_success); + // We now know the first free device ID on the bus + let output = String::from_utf8(cmd_stdout).expect("should work"); + let (_, _, first_free_device_id, _) = bdf_from_hotplug_response(output.as_str()); + assert_ne!(first_free_device_id, 0); + + // Wait for the hotplugged device to appear in the guest + assert!(wait_until(Duration::from_secs(10), || { + ssh_command_ip_with_auth( + &format!("lspci -n | grep \"00:{first_free_device_id:02x}.0\""), + &default_guest_auth(), + &guest.network.guest_ip0, + Some(Duration::from_secs(1)), + ) + .is_ok() + })); + // Calculate the succeeding device ID + let device_id_to_allocate = first_free_device_id + 1; + // We expect the succeeding device ID to be free. + assert!(wait_until(Duration::from_secs(10), || { + matches!( + ssh_command_ip_with_auth( + &format!("lspci -n | grep \"00:{device_id_to_allocate:02x}.0\""), + &default_guest_auth(), + &guest.network.guest_ip0, + Some(Duration::from_secs(5)), + ), + Err(SshCommandError::NonZeroExitStatus(1)) + ) + })); - // Wait for the VM to be restored - thread::sleep(std::time::Duration::new(10, 0)); + // Add a device to the next device slot explicitly + let (cmd_success, cmd_stdout, _) = remote_command_w_output( + &api_socket, + "add-net", + Some( + format!( + "id=test1337,tap=,mac={},ip={},mask=255.255.255.128,pci_device_id={}", + guest.network.guest_mac1, guest.network.host_ip1, device_id_to_allocate, + ) + .as_str(), + ), + ); + assert!(cmd_success); + // Retrieve what BDF we actually reserved and assert it's equal to that we wanted to reserve + let output = String::from_utf8(cmd_stdout).expect("should work"); + let (_, _, allocated_device_id, _) = bdf_from_hotplug_response(output.as_str()); + assert_eq!(device_id_to_allocate, allocated_device_id); + // 
Wait for the hotplugged device to appear in the guest + assert!(wait_until(Duration::from_secs(10), || { + ssh_command_ip_with_auth( + &format!("lspci -n | grep \"00:{allocated_device_id:02x}.0\""), + &default_guest_auth(), + &guest.network.guest_ip0, + Some(Duration::from_secs(1)), + ) + .is_ok() + })); + // Remove the first device to create a hole + let cmd_success = remote_command(&api_socket, "remove-device", Some("test0")); + assert!(cmd_success); + // Wait for the device to disappear from the guest + assert!(wait_until(Duration::from_secs(10), || { + matches!( + ssh_command_ip_with_auth( + &format!("lspci -n | grep \"00:{first_free_device_id:02x}.0\""), + &default_guest_auth(), + &guest.network.guest_ip0, + Some(Duration::from_secs(1)), + ), + Err(SshCommandError::NonZeroExitStatus(1)) + ) + })); + // Reuse the device ID hole by dynamically coalescing with the first free ID + let (cmd_success, cmd_stdout, _) = remote_command_w_output( + &api_socket, + "add-net", + Some( + format!( + "id=test0,tap=,mac={},ip={},mask=255.255.255.128", + guest.network.guest_mac1, guest.network.host_ip1, + ) + .as_str(), + ), + ); + assert!(cmd_success); + // Check that CHV reports that we added the same device to the same ID + let output = String::from_utf8(cmd_stdout).expect("should work"); + let (_, _, allocated_device_id, _) = bdf_from_hotplug_response(output.as_str()); + assert_eq!(first_free_device_id, allocated_device_id); + + // Wait for the re-added device to appear in the guest + assert!(wait_until(Duration::from_secs(10), || { + ssh_command_ip_with_auth( + &format!("lspci -n | grep \"00:{allocated_device_id:02x}.0\""), + &default_guest_auth(), + &guest.network.guest_ip0, + Some(Duration::from_secs(1)), + ) + .is_ok() + })); + }); - let r = std::panic::catch_unwind(|| { - // Resume the VM - assert!(remote_command(&api_socket_restored, "resume", None)); + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); - // Spawn a new netcat listener in the 
first VM - let guest_ip = guest1.network.guest_ip0.clone(); - thread::spawn(move || { - ssh_command_ip( - "nc -l 12345", - &guest_ip, - DEFAULT_SSH_RETRIES, - DEFAULT_SSH_TIMEOUT, - ) - .unwrap(); - }); + handle_child_output(r, &output); + } - // Wait for the server to be listening - thread::sleep(std::time::Duration::new(5, 0)); + #[test] + // Test that adding a duplicate PCI device ID fails + fn test_duplicate_pci_device_id() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)); - // And check the connection is still functional after restore - guest2.ssh_command("nc -vz 172.100.0.1 12345").unwrap(); - }); + #[cfg(target_arch = "x86_64")] + let kernel_path = direct_kernel_boot_path(); + #[cfg(target_arch = "aarch64")] + let kernel_path = edk2_path(); - kill_child(&mut child1); - kill_child(&mut child2); + let api_socket = temp_api_path(&guest.tmp_dir); - let output = child1.wait_with_output().unwrap(); - child2.wait().unwrap(); + // Boot without network + let mut cmd = GuestCommand::new(&guest); - cleanup_ovs_dpdk(); + cmd.args(["--api-socket", &api_socket]) + .default_cpus() + .default_memory() + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .default_net() + .default_disks() + .capture_output(); - handle_child_output(r, &output); - } + let mut child = cmd.spawn().unwrap(); - fn setup_spdk_nvme(nvme_dir: &std::path::Path) -> Child { - cleanup_spdk_nvme(); + guest.wait_vm_boot().unwrap(); - assert!( - exec_host_command_status(&format!( - "mkdir -p {}", - nvme_dir.join("nvme-vfio-user").to_str().unwrap() - )) - .success() - ); - assert!( - exec_host_command_status(&format!( - "truncate {} -s 128M", - nvme_dir.join("test-disk.raw").to_str().unwrap() - )) - .success() - ); - assert!( - exec_host_command_status(&format!( - "mkfs.ext4 {}", - nvme_dir.join("test-disk.raw").to_str().unwrap() - )) - .success() - ); + // Add a network device with 
non-static device ID request + let r = std::panic::catch_unwind(|| { + let (cmd_success, cmd_stdout, _) = remote_command_w_output( + &api_socket, + "add-net", + Some( + format!( + "id=test0,tap=,mac={},ip={},mask=255.255.255.128", + guest.network.guest_mac1, guest.network.host_ip1, + ) + .as_str(), + ), + ); + assert!(cmd_success); - // Start the SPDK nvmf_tgt daemon to present NVMe device as a VFIO user device - let child = Command::new("/usr/local/bin/spdk-nvme/nvmf_tgt") - .args(["-i", "0", "-m", "0x1"]) - .spawn() - .unwrap(); - thread::sleep(std::time::Duration::new(2, 0)); + // We now know the first free device ID on the bus + let output = String::from_utf8(cmd_stdout).expect("should work"); + let (_, _, first_free_device_id, _) = bdf_from_hotplug_response(output.as_str()); + assert_ne!(first_free_device_id, 0); - assert!(exec_host_command_with_retries( - "/usr/local/bin/spdk-nvme/rpc.py nvmf_create_transport -t VFIOUSER", - 3, - std::time::Duration::new(5, 0), - )); - assert!( - exec_host_command_status(&format!( - "/usr/local/bin/spdk-nvme/rpc.py bdev_aio_create {} test 512", - nvme_dir.join("test-disk.raw").to_str().unwrap() - )) - .success() - ); - assert!(exec_host_command_status( - "/usr/local/bin/spdk-nvme/rpc.py nvmf_create_subsystem nqn.2019-07.io.spdk:cnode -a -s test" - ) - .success()); - assert!(exec_host_command_status( - "/usr/local/bin/spdk-nvme/rpc.py nvmf_subsystem_add_ns nqn.2019-07.io.spdk:cnode test" - ) - .success()); - assert!(exec_host_command_status(&format!( - "/usr/local/bin/spdk-nvme/rpc.py nvmf_subsystem_add_listener nqn.2019-07.io.spdk:cnode -t VFIOUSER -a {} -s 0", - nvme_dir.join("nvme-vfio-user").to_str().unwrap() - )) - .success()); + let (cmd_success, _, cmd_stderr) = remote_command_w_output( + &api_socket, + "add-net", + Some( + format!( + "id=test1337,tap=,mac={},ip={},mask=255.255.255.128,pci_device_id={first_free_device_id}", + guest.network.guest_mac1, guest.network.host_ip1, + ) + .as_str(), + ), + ); + // Check for 
fail; Allocating the same device ID for two devices is disallowed + assert!(!cmd_success); + // Check that the error message contains the expected error + let std_err_str = String::from_utf8(cmd_stderr).unwrap(); + assert!( + std_err_str.contains(&format!( + "Valid PCI device identifier but already used: {first_free_device_id}" + )), + "Command return was: {std_err_str}" + ); + }); - child - } + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); - fn cleanup_spdk_nvme() { - exec_host_command_status("pkill -f nvmf_tgt"); + handle_child_output(r, &output); } #[test] - fn test_vfio_user() { - let jammy_image = JAMMY_IMAGE_NAME.to_string(); - let disk_config = UbuntuDiskConfig::new(jammy_image); + // Test that requesting an invalid device ID fails. + fn test_invalid_pci_device_id() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); let guest = Guest::new(Box::new(disk_config)); - let spdk_nvme_dir = guest.tmp_dir.as_path().join("test-vfio-user"); - let mut spdk_child = setup_spdk_nvme(spdk_nvme_dir.as_path()); + #[cfg(target_arch = "x86_64")] + let kernel_path = direct_kernel_boot_path(); + #[cfg(target_arch = "aarch64")] + let kernel_path = edk2_path(); let api_socket = temp_api_path(&guest.tmp_dir); - let mut child = GuestCommand::new(&guest) - .args(["--api-socket", &api_socket]) - .args(["--cpus", "boot=1"]) - .args(["--memory", "size=1G,shared=on,hugepages=on"]) - .args(["--kernel", fw_path(FwType::RustHypervisorFirmware).as_str()]) - .args(["--serial", "tty", "--console", "off"]) - .default_disks() + + // Boot without network + let mut cmd = GuestCommand::new(&guest); + + cmd.args(["--api-socket", &api_socket]) + .default_cpus() + .default_memory() + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .default_net() - .capture_output() - .spawn() - .unwrap(); + .default_disks() + .capture_output(); - let r = std::panic::catch_unwind(|| { - 
guest.wait_vm_boot().unwrap(); + let mut child = cmd.spawn().unwrap(); - // Hotplug the SPDK-NVMe device to the VM - let (cmd_success, cmd_output) = remote_command_w_output( + guest.wait_vm_boot().unwrap(); + + let r = std::panic::catch_unwind(|| { + // Invalid API call because the PCI device ID is out of range + let (cmd_success, _, cmd_stderr) = remote_command_w_output( &api_socket, - "add-user-device", - Some(&format!( - "socket={},id=vfio_user0", - spdk_nvme_dir - .as_path() - .join("nvme-vfio-user/cntrl") - .to_str() - .unwrap(), - )), + "add-net", + Some( + format!( + "id=test0,tap=,mac={},ip={},mask=255.255.255.128,pci_device_id=188", + guest.network.guest_mac1, guest.network.host_ip1, + ) + .as_str(), + ), ); - assert!(cmd_success); + // Check for fail + assert!(!cmd_success); + // Check that the error message contains the expected error + let std_err_str = String::from_utf8(cmd_stderr).unwrap(); assert!( - String::from_utf8_lossy(&cmd_output) - .contains("{\"id\":\"vfio_user0\",\"bdf\":\"0000:00:05.0\"}") - ); - - thread::sleep(std::time::Duration::new(10, 0)); - - // Check both if /dev/nvme exists and if the block size is 128M. 
- assert_eq!( - guest - .ssh_command("lsblk | grep nvme0n1 | grep -c 128M") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 1 - ); - - // Check changes persist after reboot - assert_eq!( - guest.ssh_command("sudo mount /dev/nvme0n1 /mnt").unwrap(), - "" + std_err_str + .contains("Given PCI device ID (188) is out of the supported range of 0..32"), + "Command return was: {std_err_str}", ); - assert_eq!(guest.ssh_command("ls /mnt").unwrap(), "lost+found\n"); - guest - .ssh_command("echo test123 | sudo tee /mnt/test") - .unwrap(); - assert_eq!(guest.ssh_command("sudo umount /mnt").unwrap(), ""); - assert_eq!(guest.ssh_command("ls /mnt").unwrap(), ""); - guest.reboot_linux(0); - assert_eq!( - guest.ssh_command("sudo mount /dev/nvme0n1 /mnt").unwrap(), - "" + // Use the reserved device ID 0 (root device) + let (cmd_success, _, cmd_stderr) = remote_command_w_output( + &api_socket, + "add-net", + Some( + format!( + "id=test0,tap=,mac={},ip={},mask=255.255.255.128,pci_device_id=0", + guest.network.guest_mac1, guest.network.host_ip1, + ) + .as_str(), + ), ); - assert_eq!( - guest.ssh_command("sudo cat /mnt/test").unwrap().trim(), - "test123" + // Check for fail + assert!(!cmd_success); + // Check that the error message contains the expected error + let std_err_str = String::from_utf8(cmd_stderr).unwrap(); + assert!( + std_err_str.contains("Given PCI device ID (0) is reserved"), + "Command return was: {std_err_str}" ); - }); - - let _ = spdk_child.kill(); - let _ = spdk_child.wait(); + }); kill_child(&mut child); let output = child.wait_with_output().unwrap(); handle_child_output(r, &output); } +} - #[test] - #[cfg(target_arch = "x86_64")] - fn test_vdpa_block() { - // Before trying to run the test, verify the vdpa_sim_blk module is correctly loaded. - assert!(exec_host_command_status("lsmod | grep vdpa_sim_blk").success()); +mod dbus_api { + use crate::*; + // Start cloud-hypervisor with no VM parameters, running both the HTTP + // and DBus APIs. 
Alternate calls to the external APIs (HTTP and DBus) + // to create a VM, boot it, and verify that it can be shut down and then + // booted again. + #[test] + fn test_api_dbus_and_http_interleaved() { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); let guest = Guest::new(Box::new(disk_config)); - let api_socket = temp_api_path(&guest.tmp_dir); - - let kernel_path = direct_kernel_boot_path(); + let dbus_api = TargetApi::new_dbus_api(&guest.tmp_dir); + let http_api = TargetApi::new_http_api(&guest.tmp_dir); let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=2"]) - .args(["--memory", "size=512M,hugepages=on"]) - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .default_net() - .args(["--vdpa", "path=/dev/vhost-vdpa-0,num_queues=1"]) - .args(["--platform", "num_pci_segments=2,iommu_segments=1"]) - .args(["--api-socket", &api_socket]) + .args(dbus_api.guest_args()) + .args(http_api.guest_args()) .capture_output() .spawn() .unwrap(); - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); + thread::sleep(std::time::Duration::new(1, 0)); - // Check both if /dev/vdc exists and if the block size is 128M. - assert_eq!( - guest - .ssh_command("lsblk | grep vdc | grep -c 128M") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 1 - ); + // Verify API servers are running + assert!(dbus_api.remote_command("ping", None)); + assert!(http_api.remote_command("ping", None)); - // Check the content of the block device after we wrote to it. - // The vpda-sim-blk should let us read what we previously wrote. 
- guest - .ssh_command("sudo bash -c 'echo foobar > /dev/vdc'") - .unwrap(); - assert_eq!( - guest.ssh_command("sudo head -1 /dev/vdc").unwrap().trim(), - "foobar" - ); + // Create the VM first + let request_body = guest.api_create_body(); - // Hotplug an extra vDPA block device behind the vIOMMU - // Add a new vDPA device to the VM - let (cmd_success, cmd_output) = remote_command_w_output( - &api_socket, - "add-vdpa", - Some("id=myvdpa0,path=/dev/vhost-vdpa-1,num_queues=1,pci_segment=1,iommu=on"), - ); - assert!(cmd_success); - assert!( - String::from_utf8_lossy(&cmd_output) - .contains("{\"id\":\"myvdpa0\",\"bdf\":\"0001:00:01.0\"}") - ); + let temp_config_path = guest.tmp_dir.as_path().join("config"); + std::fs::write(&temp_config_path, request_body).unwrap(); + let create_config = temp_config_path.as_os_str().to_str().unwrap(); - thread::sleep(std::time::Duration::new(10, 0)); + let r = std::panic::catch_unwind(|| { + // Create the VM + assert!(dbus_api.remote_command("create", Some(create_config),)); - // Check IOMMU setup - assert!( - guest - .does_device_vendor_pair_match("0x1057", "0x1af4") - .unwrap_or_default() - ); - assert_eq!( - guest - .ssh_command("ls /sys/kernel/iommu_groups/1/devices") - .unwrap() - .trim(), - "0001:00:01.0" - ); + // Then boot it + assert!(http_api.remote_command("boot", None)); + guest.wait_vm_boot().unwrap(); - // Check both if /dev/vdd exists and if the block size is 128M. - assert_eq!( - guest - .ssh_command("lsblk | grep vdd | grep -c 128M") - .unwrap() - .trim() - .parse::() - .unwrap_or_default(), - 1 - ); + // Check that the VM booted as expected + guest.validate_cpu_count(None); + guest.validate_memory(None); - // Write some content to the block device we've just plugged. - guest - .ssh_command("sudo bash -c 'echo foobar > /dev/vdd'") - .unwrap(); + // Sync and shutdown without powering off to prevent filesystem + // corruption. 
+ guest.ssh_command("sync").unwrap(); + guest.ssh_command("sudo shutdown -H now").unwrap(); - // Check we can read the content back. - assert_eq!( - guest.ssh_command("sudo head -1 /dev/vdd").unwrap().trim(), - "foobar" - ); + // Wait for the guest to be fully shutdown + assert!(guest.wait_for_ssh_unresponsive(Duration::from_secs(20))); - // Unplug the device - let cmd_success = remote_command(&api_socket, "remove-device", Some("myvdpa0")); - assert!(cmd_success); - thread::sleep(std::time::Duration::new(10, 0)); + // Then shutdown the VM + assert!(dbus_api.remote_command("shutdown", None)); - // Check /dev/vdd doesn't exist anymore - assert_eq!( - guest - .ssh_command("lsblk | grep -c vdd || true") - .unwrap() - .trim() - .parse::() - .unwrap_or(1), - 0 - ); + // Then boot it again + assert!(http_api.remote_command("boot", None)); + guest.wait_vm_boot().unwrap(); + + // Check that the VM booted as expected + guest.validate_cpu_count(None); + guest.validate_memory(None); }); kill_child(&mut child); @@ -9434,671 +6167,900 @@ mod common_parallel { } #[test] - #[cfg(target_arch = "x86_64")] - fn test_vdpa_net() { - // Before trying to run the test, verify the vdpa_sim_net module is correctly loaded. 
- if !exec_host_command_status("lsmod | grep vdpa_sim_net").success() { - return; - } + fn test_api_dbus_create_boot() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = GuestFactory::new_regular_guest_factory() + .create_guest(Box::new(disk_config)) + .with_cpu(4); + + let target_api = TargetApi::new_dbus_api(&guest.tmp_dir); + _test_api_create_boot(&target_api, &guest); + } + #[test] + fn test_api_dbus_shutdown() { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); + let guest = GuestFactory::new_regular_guest_factory() + .create_guest(Box::new(disk_config)) + .with_cpu(4); + + let target_api = TargetApi::new_dbus_api(&guest.tmp_dir); + _test_api_shutdown(&target_api, &guest); + } + + #[test] + fn test_api_dbus_delete() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = GuestFactory::new_regular_guest_factory() + .create_guest(Box::new(disk_config)) + .with_cpu(4); + + let target_api = TargetApi::new_dbus_api(&guest.tmp_dir); + _test_api_delete(&target_api, &guest); + } + + #[test] + fn test_api_dbus_pause_resume() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = GuestFactory::new_regular_guest_factory() + .create_guest(Box::new(disk_config)) + .with_cpu(4); + + let target_api = TargetApi::new_dbus_api(&guest.tmp_dir); + _test_api_pause_resume(&target_api, &guest); + } +} + +mod ivshmem { + #[cfg(not(feature = "mshv"))] + use std::fs::remove_dir_all; + use std::process::Command; + + use test_infra::{Guest, GuestCommand, UbuntuDiskConfig, handle_child_output, kill_child}; + + use crate::*; + fn _test_live_migration_ivshmem(local: bool) { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)); let kernel_path = direct_kernel_boot_path(); + let console_text = String::from("On a branch floating down river a cricket, 
singing."); + let net_id = "net123"; + let net_params = format!( + "id={},tap=,mac={},ip={},mask=255.255.255.128", + net_id, guest.network.guest_mac0, guest.network.host_ip0 + ); + + let memory_param: &[&str] = if local { + &["--memory", "size=4G,shared=on"] + } else { + &["--memory", "size=4G"] + }; + + let boot_vcpus = 2; + let max_vcpus = 4; + + let pmem_temp_file = TempFile::new().unwrap(); + pmem_temp_file.as_file().set_len(128 << 20).unwrap(); + std::process::Command::new("mkfs.ext4") + .arg(pmem_temp_file.as_path()) + .output() + .expect("Expect creating disk image to succeed"); + let pmem_path = String::from("/dev/pmem0"); + + let ivshmem_file_path = String::from( + guest + .tmp_dir + .as_path() + .join("ivshmem.data") + .to_str() + .unwrap(), + ); + let file_size = "1M"; + + // Create a file to be used as the shared memory + Command::new("dd") + .args([ + "if=/dev/zero", + format!("of={ivshmem_file_path}").as_str(), + format!("bs={file_size}").as_str(), + "count=1", + ]) + .status() + .unwrap(); + + // Start the source VM + let src_vm_path = clh_command("cloud-hypervisor"); + let src_api_socket = temp_api_path(&guest.tmp_dir); + let mut src_vm_cmd = GuestCommand::new_with_binary_path(&guest, &src_vm_path); + src_vm_cmd + .args([ + "--cpus", + format!("boot={boot_vcpus},max={max_vcpus}").as_str(), + ]) + .args(memory_param) + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .default_disks() + .args(["--net", net_params.as_str()]) + .args(["--api-socket", &src_api_socket]) + .args([ + "--pmem", + format!("file={}", pmem_temp_file.as_path().to_str().unwrap(),).as_str(), + ]) + .args([ + "--ivshmem", + format!("path={ivshmem_file_path},size={file_size}").as_str(), + ]); + let mut src_child = src_vm_cmd.capture_output().spawn().unwrap(); + + // Start the destination VM + let mut dest_api_socket = temp_api_path(&guest.tmp_dir); + dest_api_socket.push_str(".dest"); + let mut dest_child = 
GuestCommand::new(&guest) + .args(["--api-socket", &dest_api_socket]) + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + + // Make sure the source VM is functional + // Check the number of vCPUs + assert_eq!(guest.get_cpu_count().unwrap_or_default(), boot_vcpus); + // Check the guest RAM + assert!(guest.get_total_memory().unwrap_or_default() > 3_840_000); + // Check the guest virtio-devices, e.g. block, rng, console, and net + guest.check_devices_common(None, Some(&console_text), Some(&pmem_path)); + // x86_64: Following what's done in the `test_snapshot_restore`, we need + // to make sure that removing and adding back the virtio-net device does + // not break the live-migration support for virtio-pci. + #[cfg(target_arch = "x86_64")] + { + assert!(remote_command( + &src_api_socket, + "remove-device", + Some(net_id), + )); + thread::sleep(Duration::new(10, 0)); - let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=2"]) - .args(["--memory", "size=512M,hugepages=on"]) - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .default_net() - .args(["--vdpa", "path=/dev/vhost-vdpa-2,num_queues=3"]) - .capture_output() - .spawn() - .unwrap(); + // Plug the virtio-net device again + assert!(remote_command( + &src_api_socket, + "add-net", + Some(net_params.as_str()), + )); + thread::sleep(Duration::new(10, 0)); + } - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); + // Check ivshmem device in src guest. 
+ _test_ivshmem(&guest, &ivshmem_file_path, file_size); + // Allow some normal time to elapse to check we don't get spurious reboots + thread::sleep(std::time::Duration::new(40, 0)); - // Check we can find network interface related to vDPA device - assert_eq!( + // Start the live-migration + let migration_socket = String::from( guest - .ssh_command("ip -o link | grep -c ens6") - .unwrap() - .trim() - .parse::() - .unwrap_or(0), - 1 + .tmp_dir + .as_path() + .join("live-migration.sock") + .to_str() + .unwrap(), ); - guest - .ssh_command("sudo ip link set dev ens6 address 00:e8:ca:33:ba:06") - .unwrap(); - guest - .ssh_command("sudo ip addr add 172.16.1.2/24 dev ens6") - .unwrap(); - guest.ssh_command("sudo ip link set up dev ens6").unwrap(); - - // Check there is no packet yet on both TX/RX of the network interface - assert_eq!( - guest - .ssh_command("ip -j -p -s link show ens6 | grep -c '\"packets\": 0'") - .unwrap() - .trim() - .parse::() - .unwrap_or(0), - 2 + assert!( + live_migration::start_live_migration( + &migration_socket, + &src_api_socket, + &dest_api_socket, + local + ), + "Unsuccessful command: 'send-migration' or 'receive-migration'." 
); + }); - // Send 6 packets with ping command - guest.ssh_command("ping 172.16.1.10 -c 6 || true").unwrap(); - - // Check we can find 6 packets on both TX/RX of the network interface - assert_eq!( - guest - .ssh_command("ip -j -p -s link show ens6 | grep -c '\"packets\": 6'") - .unwrap() - .trim() - .parse::() - .unwrap_or(0), - 2 + // Check and report any errors occurred during the live-migration + if r.is_err() { + live_migration::print_and_panic( + src_child, + dest_child, + None, + "Error occurred during live-migration", ); + } - // No need to check for hotplug as we already tested it through - // test_vdpa_block() - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); + // Check the source vm has been terminated successful (give it '3s' to settle) + thread::sleep(std::time::Duration::new(3, 0)); + if !src_child.try_wait().unwrap().is_some_and(|s| s.success()) { + live_migration::print_and_panic( + src_child, + dest_child, + None, + "source VM was not terminated successfully.", + ); + } - handle_child_output(r, &output); - } + // Post live-migration check to make sure the destination VM is functional + let r = std::panic::catch_unwind(|| { + // Perform same checks to validate VM has been properly migrated + assert_eq!(guest.get_cpu_count().unwrap_or_default(), boot_vcpus); + assert!(guest.get_total_memory().unwrap_or_default() > 3_840_000); - #[test] - #[cfg(not(feature = "mshv"))] // See issue #7439 - #[cfg(target_arch = "x86_64")] - fn test_tpm() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); + guest.check_devices_common(None, Some(&console_text), Some(&pmem_path)); - let (mut swtpm_command, swtpm_socket_path) = prepare_swtpm_daemon(&guest.tmp_dir); + // Check ivshmem device + _test_ivshmem(&guest, &ivshmem_file_path, file_size); + }); - let mut guest_cmd = GuestCommand::new(&guest); - guest_cmd - .args(["--cpus", "boot=1"]) - .args(["--memory", 
"size=1G"]) - .args(["--kernel", fw_path(FwType::RustHypervisorFirmware).as_str()]) - .args(["--tpm", &format!("socket={swtpm_socket_path}")]) - .capture_output() - .default_disks() - .default_net(); + // Clean-up the destination VM and make sure it terminated correctly + let _ = dest_child.kill(); + let dest_output = dest_child.wait_with_output().unwrap(); + handle_child_output(r, &dest_output); - // Start swtpm daemon - let mut swtpm_child = swtpm_command.spawn().unwrap(); - thread::sleep(std::time::Duration::new(10, 0)); - let mut child = guest_cmd.spawn().unwrap(); + // Check the destination VM has the expected 'console_text' from its output let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - assert_eq!( - guest.ssh_command("ls /dev/tpm0").unwrap().trim(), - "/dev/tpm0" - ); - guest.ssh_command("sudo tpm2_selftest -f").unwrap(); - guest - .ssh_command("echo 'hello' > /tmp/checksum_test; ") - .unwrap(); - guest.ssh_command("cmp <(sudo tpm2_pcrevent /tmp/checksum_test | grep sha256 | awk '{print $2}') <(sha256sum /tmp/checksum_test| awk '{print $1}')").unwrap(); + assert!(String::from_utf8_lossy(&dest_output.stdout).contains(&console_text)); }); - - let _ = swtpm_child.kill(); - let _d_out = swtpm_child.wait_with_output().unwrap(); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); + handle_child_output(r, &dest_output); } #[test] - #[cfg(target_arch = "x86_64")] - fn test_double_tty() { + fn test_ivshmem() { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); let guest = Guest::new(Box::new(disk_config)); - let mut cmd = GuestCommand::new(&guest); let api_socket = temp_api_path(&guest.tmp_dir); - let tty_str: &str = "console=hvc0 earlyprintk=ttyS0 "; - // linux printk module enable console log. - let con_dis_str: &str = "console [hvc0] enabled"; - // linux printk module disable console log. 
- let con_enb_str: &str = "bootconsole [earlyser0] disabled"; let kernel_path = direct_kernel_boot_path(); - cmd.args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) - .args(["--kernel", kernel_path.to_str().unwrap()]) + let ivshmem_file_path = String::from( + guest + .tmp_dir + .as_path() + .join("ivshmem.data") + .to_str() + .unwrap(), + ); + let file_size = "1M"; + + // Create a file to be used as the shared memory + Command::new("dd") .args([ - "--cmdline", - DIRECT_KERNEL_BOOT_CMDLINE - .replace("console=hvc0 ", tty_str) - .as_str(), + "if=/dev/zero", + format!("of={ivshmem_file_path}").as_str(), + format!("bs={file_size}").as_str(), + "count=1", ]) - .capture_output() - .default_disks() - .default_net() - .args(["--serial", "tty"]) - .args(["--console", "tty"]) - .args(["--api-socket", &api_socket]); - - let mut child = cmd.spawn().unwrap(); - - let mut r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - }); - - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - - if r.is_ok() { - r = std::panic::catch_unwind(|| { - let s = String::from_utf8_lossy(&output.stdout); - assert!(s.contains(tty_str)); - assert!(s.contains(con_dis_str)); - assert!(s.contains(con_enb_str)); - }); - } - - handle_child_output(r, &output); - } - - #[test] - #[cfg(target_arch = "x86_64")] - fn test_nmi() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - let api_socket = temp_api_path(&guest.tmp_dir); - let event_path = temp_event_monitor_path(&guest.tmp_dir); - - let kernel_path = direct_kernel_boot_path(); - let cmd_line = format!("{} {}", DIRECT_KERNEL_BOOT_CMDLINE, "unknown_nmi_panic=1"); + .status() + .unwrap(); - let mut cmd = GuestCommand::new(&guest); - cmd.args(["--cpus", "boot=4"]) - .args(["--memory", "size=512M"]) + let mut child = GuestCommand::new(&guest) + .args(["--cpus", "boot=2"]) + .default_memory() .args(["--kernel", 
kernel_path.to_str().unwrap()]) - .args(["--cmdline", cmd_line.as_str()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .default_disks() - .args(["--net", guest.default_net_string().as_str()]) - .args(["--pvpanic"]) + .default_net() + .args([ + "--ivshmem", + format!("path={ivshmem_file_path},size={file_size}").as_str(), + ]) .args(["--api-socket", &api_socket]) - .args(["--event-monitor", format!("path={event_path}").as_str()]) - .capture_output(); - - let mut child = cmd.spawn().unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - - assert!(remote_command(&api_socket, "nmi", None)); - - // Wait a while for guest - thread::sleep(std::time::Duration::new(3, 0)); + .capture_output() + .spawn() + .unwrap(); - let expected_sequential_events = [&MetaEvent { - event: "panic".to_string(), - device_id: None, - }]; - assert!(check_latest_events_exact( - &expected_sequential_events, - &event_path - )); + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + _test_ivshmem(&guest, &ivshmem_file_path, file_size); }); - kill_child(&mut child); let output = child.wait_with_output().unwrap(); handle_child_output(r, &output); } -} -mod dbus_api { - use crate::*; - - // Start cloud-hypervisor with no VM parameters, running both the HTTP - // and DBus APIs. Alternate calls to the external APIs (HTTP and DBus) - // to create a VM, boot it, and verify that it can be shut down and then - // booted again. 
#[test] - fn test_api_dbus_and_http_interleaved() { + #[cfg(not(feature = "mshv"))] + fn test_snapshot_restore_ivshmem() { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); let guest = Guest::new(Box::new(disk_config)); - let dbus_api = TargetApi::new_dbus_api(&guest.tmp_dir); - let http_api = TargetApi::new_http_api(&guest.tmp_dir); + let kernel_path = direct_kernel_boot_path(); + + let api_socket_source = format!("{}.1", temp_api_path(&guest.tmp_dir)); + + let ivshmem_file_path = String::from( + guest + .tmp_dir + .as_path() + .join("ivshmem.data") + .to_str() + .unwrap(), + ); + let file_size = "1M"; + + // Create a file to be used as the shared memory + Command::new("dd") + .args([ + "if=/dev/zero", + format!("of={ivshmem_file_path}").as_str(), + format!("bs={file_size}").as_str(), + "count=1", + ]) + .status() + .unwrap(); + + let socket = temp_vsock_path(&guest.tmp_dir); + let event_path = temp_event_monitor_path(&guest.tmp_dir); let mut child = GuestCommand::new(&guest) - .args(dbus_api.guest_args()) - .args(http_api.guest_args()) + .args(["--api-socket", &api_socket_source]) + .args(["--event-monitor", format!("path={event_path}").as_str()]) + .args(["--cpus", "boot=2"]) + .args(["--memory", "size=1G"]) + .args(["--kernel", kernel_path.to_str().unwrap()]) + .default_disks() + .default_net() + .args(["--vsock", format!("cid=3,socket={socket}").as_str()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .args([ + "--ivshmem", + format!("path={ivshmem_file_path},size={file_size}").as_str(), + ]) .capture_output() .spawn() .unwrap(); - thread::sleep(std::time::Duration::new(1, 0)); + let console_text = String::from("On a branch floating down river a cricket, singing."); + // Create the snapshot directory + let snapshot_dir = temp_snapshot_dir_path(&guest.tmp_dir); - // Verify API servers are running - assert!(dbus_api.remote_command("ping", None)); - assert!(http_api.remote_command("ping", None)); + let r = std::panic::catch_unwind(|| { 
+ guest.wait_vm_boot().unwrap(); - // Create the VM first - let cpu_count: u8 = 4; - let request_body = guest.api_create_body( - cpu_count, - direct_kernel_boot_path().to_str().unwrap(), - DIRECT_KERNEL_BOOT_CMDLINE, - ); + // Check the number of vCPUs + assert_eq!(guest.get_cpu_count().unwrap_or_default(), 2); - let temp_config_path = guest.tmp_dir.as_path().join("config"); - std::fs::write(&temp_config_path, request_body).unwrap(); - let create_config = temp_config_path.as_os_str().to_str().unwrap(); + snapshot_restore_common::snapshot_and_check_events( + &api_socket_source, + &snapshot_dir, + &event_path, + ); + }); - let r = std::panic::catch_unwind(|| { - // Create the VM - assert!(dbus_api.remote_command("create", Some(create_config),)); + // Shutdown the source VM and check console output + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + handle_child_output(r, &output); - // Then boot it - assert!(http_api.remote_command("boot", None)); - guest.wait_vm_boot().unwrap(); + // Remove the vsock socket file. + Command::new("rm") + .arg("-f") + .arg(socket.as_str()) + .output() + .unwrap(); - // Check that the VM booted as expected - assert_eq!(guest.get_cpu_count().unwrap_or_default() as u8, cpu_count); - assert!(guest.get_total_memory().unwrap_or_default() > 480_000); + let api_socket_restored = format!("{}.2", temp_api_path(&guest.tmp_dir)); + let event_path_restored = format!("{}.2", temp_event_monitor_path(&guest.tmp_dir)); - // Sync and shutdown without powering off to prevent filesystem - // corruption. 
- guest.ssh_command("sync").unwrap(); - guest.ssh_command("sudo shutdown -H now").unwrap(); + // Restore the VM from the snapshot + let mut child = GuestCommand::new(&guest) + .args(["--api-socket", &api_socket_restored]) + .args([ + "--event-monitor", + format!("path={event_path_restored}").as_str(), + ]) + .args([ + "--restore", + format!("source_url=file://{snapshot_dir}").as_str(), + ]) + .capture_output() + .spawn() + .unwrap(); - // Wait for the guest to be fully shutdown - thread::sleep(std::time::Duration::new(20, 0)); + let latest_events = [&MetaEvent { + event: "restored".to_string(), + device_id: None, + }]; + // Wait for the restored event to show up in the monitor file. + assert!(wait_until(Duration::from_secs(30), || { + check_latest_events_exact(&latest_events, &event_path_restored) + })); - // Then shutdown the VM - assert!(dbus_api.remote_command("shutdown", None)); + // Remove the snapshot dir + let _ = remove_dir_all(snapshot_dir.as_str()); - // Then boot it again - assert!(http_api.remote_command("boot", None)); - guest.wait_vm_boot().unwrap(); + let r = std::panic::catch_unwind(|| { + // Resume the VM + assert!(wait_until(Duration::from_secs(30), || remote_command( + &api_socket_restored, + "info", + None + ))); + assert!(remote_command(&api_socket_restored, "resume", None)); + let latest_events = [ + &MetaEvent { + event: "resuming".to_string(), + device_id: None, + }, + &MetaEvent { + event: "resumed".to_string(), + device_id: None, + }, + ]; + assert!(wait_until(Duration::from_secs(30), || { + check_latest_events_exact(&latest_events, &event_path_restored) + })); - // Check that the VM booted as expected - assert_eq!(guest.get_cpu_count().unwrap_or_default() as u8, cpu_count); - assert!(guest.get_total_memory().unwrap_or_default() > 480_000); + // Check the number of vCPUs + assert_eq!(guest.get_cpu_count().unwrap_or_default(), 2); + guest.check_devices_common(Some(&socket), Some(&console_text), None); + _test_ivshmem(&guest, 
&ivshmem_file_path, file_size); }); - + // Shutdown the target VM and check console output kill_child(&mut child); let output = child.wait_with_output().unwrap(); + handle_child_output(r, &output); + + let r = std::panic::catch_unwind(|| { + assert!(String::from_utf8_lossy(&output.stdout).contains(&console_text)); + }); handle_child_output(r, &output); } #[test] - fn test_api_dbus_create_boot() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); + #[cfg(not(feature = "mshv"))] + fn test_live_migration_ivshmem() { + _test_live_migration_ivshmem(false); + } - let target_api = TargetApi::new_dbus_api(&guest.tmp_dir); - _test_api_create_boot(&target_api, &guest); + #[test] + #[cfg(not(feature = "mshv"))] + fn test_live_migration_ivshmem_local() { + _test_live_migration_ivshmem(true); } #[test] - fn test_api_dbus_shutdown() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); + #[cfg(not(feature = "mshv"))] + fn test_snapshot_restore_hotplug_virtiomem() { + snapshot_restore_common::_test_snapshot_restore(true, false); + } - let target_api = TargetApi::new_dbus_api(&guest.tmp_dir); - _test_api_shutdown(&target_api, &guest); + #[test] + #[cfg(not(feature = "mshv"))] // See issue #7437 + fn test_snapshot_restore_basic() { + snapshot_restore_common::_test_snapshot_restore(false, false); } #[test] - fn test_api_dbus_delete() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); + #[cfg(not(feature = "mshv"))] + fn test_snapshot_restore_with_resume() { + snapshot_restore_common::_test_snapshot_restore(false, true); + } - let target_api = TargetApi::new_dbus_api(&guest.tmp_dir); - _test_api_delete(&target_api, &guest); + #[test] + #[cfg(not(feature = "mshv"))] + fn test_snapshot_restore_uffd() { + snapshot_restore_common::_test_snapshot_restore_uffd("size=2G", &[], 
1_920_000); } #[test] - fn test_api_dbus_pause_resume() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); + #[cfg(not(feature = "mshv"))] + fn test_snapshot_restore_uffd_shared_memory() { + snapshot_restore_common::_test_snapshot_restore_uffd("size=512M,shared=on", &[], 480_000); + } - let target_api = TargetApi::new_dbus_api(&guest.tmp_dir); - _test_api_pause_resume(&target_api, &guest); + #[test] + #[cfg(not(feature = "mshv"))] // See issue #7437 + #[cfg(target_arch = "x86_64")] + fn test_snapshot_restore_pvpanic() { + snapshot_restore_common::_test_snapshot_restore_devices(true); + } + + #[test] + fn test_virtio_pmem_persist_writes() { + test_virtio_pmem(false, false); } } -mod ivshmem { - #[cfg(not(feature = "mshv"))] +#[cfg(not(feature = "mshv"))] +mod snapshot_restore_common { use std::fs::remove_dir_all; use std::process::Command; - use test_infra::{Guest, GuestCommand, UbuntuDiskConfig, handle_child_output, kill_child}; + use crate::*; + + pub(crate) fn snapshot_and_check_events( + api_socket: &str, + snapshot_dir: &str, + event_path: &str, + ) { + // Pause the VM + assert!(remote_command(api_socket, "pause", None)); + let latest_events: [&MetaEvent; 2] = [ + &MetaEvent { + event: "pausing".to_string(), + device_id: None, + }, + &MetaEvent { + event: "paused".to_string(), + device_id: None, + }, + ]; + + assert!(wait_until(Duration::from_secs(30), || { + check_latest_events_exact(&latest_events, event_path) + })); + + // Take a snapshot from the VM + assert!(remote_command( + api_socket, + "snapshot", + Some(format!("file://{snapshot_dir}").as_str()), + )); + + let latest_events = [ + &MetaEvent { + event: "snapshotting".to_string(), + device_id: None, + }, + &MetaEvent { + event: "snapshotted".to_string(), + device_id: None, + }, + ]; - use crate::*; + assert!(wait_until(Duration::from_secs(30), || { + check_latest_events_exact(&latest_events, event_path) + })); + } - fn 
_test_live_migration_ivshmem(local: bool) { + pub(crate) fn _test_snapshot_restore(use_hotplug: bool, use_resume_option: bool) { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); let guest = Guest::new(Box::new(disk_config)); let kernel_path = direct_kernel_boot_path(); - let console_text = String::from("On a branch floating down river a cricket, singing."); + + let api_socket_source = format!("{}.1", temp_api_path(&guest.tmp_dir)); + let net_id = "net123"; let net_params = format!( "id={},tap=,mac={},ip={},mask=255.255.255.128", net_id, guest.network.guest_mac0, guest.network.host_ip0 ); + let mut mem_params = "size=1G"; - let memory_param: &[&str] = if local { - &["--memory", "size=4G,shared=on"] - } else { - &["--memory", "size=4G"] - }; - - let boot_vcpus = 2; - let max_vcpus = 4; - - let pmem_temp_file = TempFile::new().unwrap(); - pmem_temp_file.as_file().set_len(128 << 20).unwrap(); - std::process::Command::new("mkfs.ext4") - .arg(pmem_temp_file.as_path()) - .output() - .expect("Expect creating disk image to succeed"); - let pmem_path = String::from("/dev/pmem0"); + if use_hotplug { + mem_params = "size=2G,hotplug_method=virtio-mem,hotplug_size=32G"; + } - let ivshmem_file_path = String::from( - guest - .tmp_dir - .as_path() - .join("ivshmem.data") - .to_str() - .unwrap(), + let cloudinit_params = format!( + "path={},iommu=on", + guest.disk_config.disk(DiskType::CloudInit).unwrap() ); - let file_size = "1M"; - // Create a file to be used as the shared memory - Command::new("dd") - .args([ - "if=/dev/zero", - format!("of={ivshmem_file_path}").as_str(), - format!("bs={file_size}").as_str(), - "count=1", - ]) - .status() - .unwrap(); + let socket = temp_vsock_path(&guest.tmp_dir); + let event_path = temp_event_monitor_path(&guest.tmp_dir); - // Start the source VM - let src_vm_path = clh_command("cloud-hypervisor"); - let src_api_socket = temp_api_path(&guest.tmp_dir); - let mut src_vm_cmd = GuestCommand::new_with_binary_path(&guest, 
&src_vm_path); - src_vm_cmd - .args([ - "--cpus", - format!("boot={boot_vcpus},max={max_vcpus}").as_str(), - ]) - .args(memory_param) + let mut child = GuestCommand::new(&guest) + .args(["--api-socket", &api_socket_source]) + .args(["--event-monitor", format!("path={event_path}").as_str()]) + .args(["--cpus", "boot=4"]) + .args(["--memory", mem_params]) + .args(["--balloon", "size=0"]) .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .args(["--net", net_params.as_str()]) - .args(["--api-socket", &src_api_socket]) .args([ - "--pmem", - format!("file={}", pmem_temp_file.as_path().to_str().unwrap(),).as_str(), + "--disk", + format!( + "path={}", + guest.disk_config.disk(DiskType::OperatingSystem).unwrap() + ) + .as_str(), + cloudinit_params.as_str(), ]) - .args([ - "--ivshmem", - format!("path={ivshmem_file_path},size={file_size}").as_str(), - ]); - let mut src_child = src_vm_cmd.capture_output().spawn().unwrap(); - - // Start the destination VM - let mut dest_api_socket = temp_api_path(&guest.tmp_dir); - dest_api_socket.push_str(".dest"); - let mut dest_child = GuestCommand::new(&guest) - .args(["--api-socket", &dest_api_socket]) + .args(["--net", net_params.as_str()]) + .args(["--vsock", format!("cid=3,socket={socket}").as_str()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .capture_output() .spawn() .unwrap(); + let console_text = String::from("On a branch floating down river a cricket, singing."); + // Create the snapshot directory + let snapshot_dir = temp_snapshot_dir_path(&guest.tmp_dir); + let r = std::panic::catch_unwind(|| { guest.wait_vm_boot().unwrap(); - // Make sure the source VM is functional // Check the number of vCPUs - assert_eq!(guest.get_cpu_count().unwrap_or_default(), boot_vcpus); + assert_eq!(guest.get_cpu_count().unwrap_or_default(), 4); // Check the guest RAM - assert!(guest.get_total_memory().unwrap_or_default() > 3_840_000); - // Check the guest virtio-devices, 
e.g. block, rng, console, and net - guest.check_devices_common(None, Some(&console_text), Some(&pmem_path)); - // x86_64: Following what's done in the `test_snapshot_restore`, we need - // to make sure that removing and adding back the virtio-net device does - // not break the live-migration support for virtio-pci. + let total_memory = guest.get_total_memory().unwrap_or_default(); + if use_hotplug { + assert!(total_memory > 1_900_000, "total memory: {total_memory}"); + } else { + assert!(total_memory > 900_000, "total memory: {total_memory}"); + } + if use_hotplug { + // Increase guest RAM with virtio-mem + resize_command( + &api_socket_source, + None, + Some(6 << 30), + None, + Some(&event_path), + ); + thread::sleep(std::time::Duration::new(5, 0)); + assert!(guest.get_total_memory().unwrap_or_default() > 5_760_000); + // Use balloon to remove RAM from the VM + resize_command( + &api_socket_source, + None, + None, + Some(1 << 30), + Some(&event_path), + ); + thread::sleep(std::time::Duration::new(5, 0)); + let total_memory = guest.get_total_memory().unwrap_or_default(); + assert!(total_memory > 4_800_000, "total_memory is {total_memory}"); + assert!(total_memory < 5_760_000, "total_memory is {total_memory}"); + } + // Check the guest virtio-devices, e.g. block, rng, vsock, console, and net + guest.check_devices_common(Some(&socket), Some(&console_text), None); + + // x86_64: We check that removing and adding back the virtio-net device + // does not break the snapshot/restore support for virtio-pci. + // This is an important thing to test as the hotplug will + // trigger a PCI BAR reprogramming, which is a good way of + // checking if the stored resources are correctly restored. + // Unplug the virtio-net device + // AArch64: Device hotplug is currently not supported, skipping here. 
#[cfg(target_arch = "x86_64")] { assert!(remote_command( - &src_api_socket, + &api_socket_source, "remove-device", Some(net_id), )); - thread::sleep(Duration::new(10, 0)); + thread::sleep(std::time::Duration::new(10, 0)); + let latest_events = [&MetaEvent { + event: "device-removed".to_string(), + device_id: Some(net_id.to_string()), + }]; + assert!(wait_until(Duration::from_secs(30), || { + check_latest_events_exact(&latest_events, &event_path) + })); // Plug the virtio-net device again assert!(remote_command( - &src_api_socket, + &api_socket_source, "add-net", Some(net_params.as_str()), )); - thread::sleep(Duration::new(10, 0)); + thread::sleep(std::time::Duration::new(10, 0)); } - // Check ivshmem device in src guest. - _test_ivshmem(&guest, &ivshmem_file_path, file_size); - // Allow some normal time to elapse to check we don't get spurious reboots - thread::sleep(std::time::Duration::new(40, 0)); - - // Start the live-migration - let migration_socket = String::from( - guest - .tmp_dir - .as_path() - .join("live-migration.sock") - .to_str() - .unwrap(), - ); - - assert!( - live_migration::start_live_migration( - &migration_socket, - &src_api_socket, - &dest_api_socket, - local - ), - "Unsuccessful command: 'send-migration' or 'receive-migration'." 
- ); - }); - - // Check and report any errors occurred during the live-migration - if r.is_err() { - live_migration::print_and_panic( - src_child, - dest_child, - None, - "Error occurred during live-migration", - ); - } - - // Check the source vm has been terminated successful (give it '3s' to settle) - thread::sleep(std::time::Duration::new(3, 0)); - if !src_child.try_wait().unwrap().is_some_and(|s| s.success()) { - live_migration::print_and_panic( - src_child, - dest_child, - None, - "source VM was not terminated successfully.", + snapshot_restore_common::snapshot_and_check_events( + &api_socket_source, + &snapshot_dir, + &event_path, ); - } - - // Post live-migration check to make sure the destination VM is functional - let r = std::panic::catch_unwind(|| { - // Perform same checks to validate VM has been properly migrated - assert_eq!(guest.get_cpu_count().unwrap_or_default(), boot_vcpus); - assert!(guest.get_total_memory().unwrap_or_default() > 3_840_000); - - guest.check_devices_common(None, Some(&console_text), Some(&pmem_path)); - - // Check ivshmem device - _test_ivshmem(&guest, &ivshmem_file_path, file_size); - }); - - // Clean-up the destination VM and make sure it terminated correctly - let _ = dest_child.kill(); - let dest_output = dest_child.wait_with_output().unwrap(); - handle_child_output(r, &dest_output); - - // Check the destination VM has the expected 'console_text' from its output - let r = std::panic::catch_unwind(|| { - assert!(String::from_utf8_lossy(&dest_output.stdout).contains(&console_text)); }); - handle_child_output(r, &dest_output); - } - - #[test] - fn test_ivshmem() { - let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(disk_config)); - let api_socket = temp_api_path(&guest.tmp_dir); - - let kernel_path = direct_kernel_boot_path(); - - let ivshmem_file_path = String::from( - guest - .tmp_dir - .as_path() - .join("ivshmem.data") - .to_str() - .unwrap(), - ); - let file_size = 
"1M"; - // Create a file to be used as the shared memory - Command::new("dd") - .args([ - "if=/dev/zero", - format!("of={ivshmem_file_path}").as_str(), - format!("bs={file_size}").as_str(), - "count=1", - ]) - .status() + // Shutdown the source VM and check console output + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + handle_child_output(r, &output); + + let r = std::panic::catch_unwind(|| { + assert!(String::from_utf8_lossy(&output.stdout).contains(&console_text)); + }); + + handle_child_output(r, &output); + + // Remove the vsock socket file. + Command::new("rm") + .arg("-f") + .arg(socket.as_str()) + .output() .unwrap(); + let api_socket_restored = format!("{}.2", temp_api_path(&guest.tmp_dir)); + let event_path_restored = format!("{}.2", temp_event_monitor_path(&guest.tmp_dir)); + + // Restore the VM from the snapshot let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=2"]) - .args(["--memory", "size=512M"]) - .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .default_disks() - .default_net() + .args(["--api-socket", &api_socket_restored]) .args([ - "--ivshmem", - format!("path={ivshmem_file_path},size={file_size}").as_str(), + "--event-monitor", + format!("path={event_path_restored}").as_str(), + ]) + .args([ + "--restore", + format!("source_url=file://{snapshot_dir},resume={use_resume_option}").as_str(), ]) - .args(["--api-socket", &api_socket]) .capture_output() .spawn() .unwrap(); + let expected_events = [ + &MetaEvent { + event: "starting".to_string(), + device_id: None, + }, + &MetaEvent { + event: "activated".to_string(), + device_id: Some("__console".to_string()), + }, + &MetaEvent { + event: "activated".to_string(), + device_id: Some("__rng".to_string()), + }, + &MetaEvent { + event: "restoring".to_string(), + device_id: None, + }, + ]; + assert!(wait_until(Duration::from_secs(30), || { + check_sequential_events(&expected_events, &event_path_restored) + 
})); + if use_resume_option { + let latest_events = [ + &MetaEvent { + event: "restored".to_string(), + device_id: None, + }, + &MetaEvent { + event: "resuming".to_string(), + device_id: None, + }, + &MetaEvent { + event: "resumed".to_string(), + device_id: None, + }, + ]; + assert!(wait_until(Duration::from_secs(30), || { + check_latest_events_exact(&latest_events, &event_path_restored) + })); + } else { + let latest_events = [&MetaEvent { + event: "restored".to_string(), + device_id: None, + }]; + assert!(wait_until(Duration::from_secs(30), || { + check_latest_events_exact(&latest_events, &event_path_restored) + })); + } + + // Wait until the restored VM API is ready before issuing follow-up requests. + assert!(wait_until(Duration::from_secs(30), || remote_command( + &api_socket_restored, + "info", + None + ))); + + // Remove the snapshot dir + let _ = remove_dir_all(snapshot_dir.as_str()); + let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot().unwrap(); - _test_ivshmem(&guest, &ivshmem_file_path, file_size); + if use_resume_option { + // VM was automatically resumed via restore option, just wait for events + thread::sleep(std::time::Duration::new(1, 0)); + } else { + // Resume the VM manually + assert!(wait_until(Duration::from_secs(30), || remote_command( + &api_socket_restored, + "info", + None + ))); + assert!(remote_command(&api_socket_restored, "resume", None)); + + let latest_events = [ + &MetaEvent { + event: "resuming".to_string(), + device_id: None, + }, + &MetaEvent { + event: "resumed".to_string(), + device_id: None, + }, + ]; + assert!(wait_until(Duration::from_secs(30), || { + check_latest_events_exact(&latest_events, &event_path_restored) + })); + } + + // Perform same checks to validate VM has been properly restored + assert_eq!(guest.get_cpu_count().unwrap_or_default(), 4); + let total_memory = guest.get_total_memory().unwrap_or_default(); + if use_hotplug { + assert!(total_memory > 4_800_000, "total_memory is {total_memory}"); + 
assert!(total_memory < 5_760_000, "total_memory is {total_memory}"); + // Deflate balloon to restore entire RAM to the VM + resize_command(&api_socket_restored, None, None, Some(0), None); + thread::sleep(std::time::Duration::new(5, 0)); + assert!(guest.get_total_memory().unwrap_or_default() > 5_760_000); + // Decrease guest RAM with virtio-mem + resize_command(&api_socket_restored, None, Some(5 << 30), None, None); + thread::sleep(std::time::Duration::new(5, 0)); + let total_memory = guest.get_total_memory().unwrap_or_default(); + assert!(total_memory > 4_800_000, "total_memory is {total_memory}"); + assert!(total_memory < 5_760_000, "total_memory is {total_memory}"); + } else { + assert!(total_memory > 900_000, "total memory: {total_memory}"); + } + + guest.check_devices_common(Some(&socket), Some(&console_text), None); }); + // Shutdown the target VM and check console output kill_child(&mut child); let output = child.wait_with_output().unwrap(); + handle_child_output(r, &output); + + let r = std::panic::catch_unwind(|| { + assert!(String::from_utf8_lossy(&output.stdout).contains(&console_text)); + }); handle_child_output(r, &output); } - #[test] - #[cfg(not(feature = "mshv"))] - fn test_snapshot_restore_ivshmem() { + pub(crate) fn _test_snapshot_restore_uffd( + memory_config: &str, + memory_zone_config: &[&str], + min_total_memory_kib: u32, + ) { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); let guest = Guest::new(Box::new(disk_config)); let kernel_path = direct_kernel_boot_path(); let api_socket_source = format!("{}.1", temp_api_path(&guest.tmp_dir)); - let ivshmem_file_path = String::from( - guest - .tmp_dir - .as_path() - .join("ivshmem.data") - .to_str() - .unwrap(), - ); - let file_size = "1M"; - - // Create a file to be used as the shared memory - Command::new("dd") - .args([ - "if=/dev/zero", - format!("of={ivshmem_file_path}").as_str(), - format!("bs={file_size}").as_str(), - "count=1", - ]) - .status() - .unwrap(); - + let 
console_text = String::from("On a branch floating down river a cricket, singing."); + let snapshot_dir = temp_snapshot_dir_path(&guest.tmp_dir); let socket = temp_vsock_path(&guest.tmp_dir); let event_path = temp_event_monitor_path(&guest.tmp_dir); - let mut child = GuestCommand::new(&guest) + let mut source_cmd = GuestCommand::new(&guest); + source_cmd .args(["--api-socket", &api_socket_source]) .args(["--event-monitor", format!("path={event_path}").as_str()]) - .args(["--cpus", "boot=2"]) - .args(["--memory", "size=1G"]) + .args(["--cpus", "boot=4"]) + .args(["--memory", memory_config]); + + if !memory_zone_config.is_empty() { + source_cmd.args(["--memory-zone"]).args(memory_zone_config); + } + + let mut child = source_cmd .args(["--kernel", kernel_path.to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .default_disks() .default_net() .args(["--vsock", format!("cid=3,socket={socket}").as_str()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .args([ - "--ivshmem", - format!("path={ivshmem_file_path},size={file_size}").as_str(), - ]) .capture_output() .spawn() .unwrap(); - let console_text = String::from("On a branch floating down river a cricket, singing."); - // Create the snapshot directory - let snapshot_dir = temp_snapshot_dir_path(&guest.tmp_dir); - let r = std::panic::catch_unwind(|| { guest.wait_vm_boot().unwrap(); - // Check the number of vCPUs - assert_eq!(guest.get_cpu_count().unwrap_or_default(), 2); + assert_eq!(guest.get_cpu_count().unwrap_or_default(), 4); + assert!(guest.get_total_memory().unwrap_or_default() > min_total_memory_kib); - common_sequential::snapshot_and_check_events( - &api_socket_source, - &snapshot_dir, - &event_path, - ); + guest.check_devices_common(Some(&socket), Some(&console_text), None); + + snapshot_and_check_events(&api_socket_source, &snapshot_dir, &event_path); }); - // Shutdown the source VM and check console output kill_child(&mut child); let output = child.wait_with_output().unwrap(); 
handle_child_output(r, &output); - // Remove the vsock socket file. + let r = std::panic::catch_unwind(|| { + assert!(String::from_utf8_lossy(&output.stdout).contains(&console_text)); + }); + handle_child_output(r, &output); + Command::new("rm") .arg("-f") .arg(socket.as_str()) @@ -10108,7 +7070,6 @@ mod ivshmem { let api_socket_restored = format!("{}.2", temp_api_path(&guest.tmp_dir)); let event_path_restored = format!("{}.2", temp_event_monitor_path(&guest.tmp_dir)); - // Restore the VM from the snapshot let mut child = GuestCommand::new(&guest) .args(["--api-socket", &api_socket_restored]) .args([ @@ -10117,175 +7078,85 @@ mod ivshmem { ]) .args([ "--restore", - format!("source_url=file://{snapshot_dir}").as_str(), + format!("source_url=file://{snapshot_dir},memory_restore_mode=ondemand").as_str(), ]) .capture_output() .spawn() .unwrap(); - // Wait for the VM to be restored - thread::sleep(std::time::Duration::new(20, 0)); - let latest_events = [&MetaEvent { event: "restored".to_string(), device_id: None, }]; - assert!(check_latest_events_exact( - &latest_events, - &event_path_restored - )); - // Remove the snapshot dir - let _ = remove_dir_all(snapshot_dir.as_str()); + assert!(wait_until(Duration::from_secs(30), || { + check_latest_events_exact(&latest_events, &event_path_restored) + })); let r = std::panic::catch_unwind(|| { - // Resume the VM + assert!(wait_until(Duration::from_secs(30), || remote_command( + &api_socket_restored, + "info", + None + ))); assert!(remote_command(&api_socket_restored, "resume", None)); - // There is no way that we can ensure the 'write()' to the - // event file is completed when the 'resume' request is - // returned successfully, because the 'write()' was done - // asynchronously from a different thread of Cloud - // Hypervisor (e.g. the event-monitor thread). 
- thread::sleep(std::time::Duration::new(1, 0)); + let latest_events = [ &MetaEvent { event: "resuming".to_string(), device_id: None, - }, - &MetaEvent { - event: "resumed".to_string(), - device_id: None, - }, - ]; - assert!(check_latest_events_exact( - &latest_events, - &event_path_restored - )); - - // Check the number of vCPUs - assert_eq!(guest.get_cpu_count().unwrap_or_default(), 2); - guest.check_devices_common(Some(&socket), Some(&console_text), None); - _test_ivshmem(&guest, &ivshmem_file_path, file_size); - }); - // Shutdown the target VM and check console output - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - handle_child_output(r, &output); - - let r = std::panic::catch_unwind(|| { - assert!(String::from_utf8_lossy(&output.stdout).contains(&console_text)); - }); - - handle_child_output(r, &output); - } - - #[test] - #[cfg(not(feature = "mshv"))] - fn test_live_migration_ivshmem() { - _test_live_migration_ivshmem(false); - } - - #[test] - #[cfg(not(feature = "mshv"))] - fn test_live_migration_ivshmem_local() { - _test_live_migration_ivshmem(true); - } -} - -mod common_sequential { - use std::fs::remove_dir_all; - - use crate::*; - - #[test] - #[cfg(not(feature = "mshv"))] - fn test_memory_mergeable_on() { - test_memory_mergeable(true); - } - - pub(crate) fn snapshot_and_check_events( - api_socket: &str, - snapshot_dir: &str, - event_path: &str, - ) { - // Pause the VM - assert!(remote_command(api_socket, "pause", None)); - let latest_events: [&MetaEvent; 2] = [ - &MetaEvent { - event: "pausing".to_string(), - device_id: None, - }, - &MetaEvent { - event: "paused".to_string(), - device_id: None, - }, - ]; - // See: #5938 - thread::sleep(std::time::Duration::new(1, 0)); - assert!(check_latest_events_exact(&latest_events, event_path)); + }, + &MetaEvent { + event: "resumed".to_string(), + device_id: None, + }, + ]; + assert!(wait_until(Duration::from_secs(30), || { + check_latest_events_exact(&latest_events, 
&event_path_restored) + })); - // Take a snapshot from the VM - assert!(remote_command( - api_socket, - "snapshot", - Some(format!("file://{snapshot_dir}").as_str()), - )); + assert_eq!(guest.get_cpu_count().unwrap_or_default(), 4); + assert!(guest.get_total_memory().unwrap_or_default() > min_total_memory_kib); - // Wait to make sure the snapshot is completed - thread::sleep(std::time::Duration::new(10, 0)); + guest.check_devices_common(Some(&socket), Some(&console_text), None); + }); - let latest_events = [ - &MetaEvent { - event: "snapshotting".to_string(), - device_id: None, - }, - &MetaEvent { - event: "snapshotted".to_string(), - device_id: None, - }, - ]; - // See: #5938 - thread::sleep(std::time::Duration::new(1, 0)); - assert!(check_latest_events_exact(&latest_events, event_path)); - } + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + handle_child_output(r, &output); - // One thing to note about this test. The virtio-net device is heavily used - // through each ssh command. There's no need to perform a dedicated test to - // verify the migration went well for virtio-net. - #[test] - #[cfg(not(feature = "mshv"))] - fn test_snapshot_restore_hotplug_virtiomem() { - _test_snapshot_restore(true); - } + let r = std::panic::catch_unwind(|| { + assert!(String::from_utf8_lossy(&output.stdout).contains(&console_text)); - #[test] - #[cfg(not(feature = "mshv"))] // See issue #7437 - fn test_snapshot_restore_basic() { - _test_snapshot_restore(false); + let logs = format!( + "{}\n{}", + String::from_utf8_lossy(&output.stdout), + String::from_utf8_lossy(&output.stderr) + ); + assert!( + logs.contains("UFFD restore: demand-paged restore enabled"), + "Expected UFFD restore path to be enabled. 
output: {logs}" + ); + }); + handle_child_output(r, &output); + + let _ = remove_dir_all(snapshot_dir.as_str()); } - fn _test_snapshot_restore(use_hotplug: bool) { + pub(crate) fn _test_snapshot_restore_devices(pvpanic: bool) { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); let guest = Guest::new(Box::new(disk_config)); let kernel_path = direct_kernel_boot_path(); let api_socket_source = format!("{}.1", temp_api_path(&guest.tmp_dir)); - let net_id = "net123"; - let net_params = format!( - "id={},tap=,mac={},ip={},mask=255.255.255.128", - net_id, guest.network.guest_mac0, guest.network.host_ip0 - ); - let mut mem_params = "size=2G"; - - if use_hotplug { - mem_params = "size=2G,hotplug_method=virtio-mem,hotplug_size=32G"; - } - - let cloudinit_params = format!( - "path={},iommu=on", - guest.disk_config.disk(DiskType::CloudInit).unwrap() - ); + let device_params = { + let mut data = vec![]; + if pvpanic { + data.push(String::from("--pvpanic")); + } + data + }; let socket = temp_vsock_path(&guest.tmp_dir); let event_path = temp_event_monitor_path(&guest.tmp_dir); @@ -10293,111 +7164,33 @@ mod common_sequential { let mut child = GuestCommand::new(&guest) .args(["--api-socket", &api_socket_source]) .args(["--event-monitor", format!("path={event_path}").as_str()]) - .args(["--cpus", "boot=4"]) - .args(["--memory", mem_params]) - .args(["--balloon", "size=0"]) + .args(["--cpus", "boot=2"]) + .args(["--memory", "size=1G"]) .args(["--kernel", kernel_path.to_str().unwrap()]) - .args([ - "--disk", - format!( - "path={}", - guest.disk_config.disk(DiskType::OperatingSystem).unwrap() - ) - .as_str(), - cloudinit_params.as_str(), - ]) - .args(["--net", net_params.as_str()]) + .default_disks() + .default_net() .args(["--vsock", format!("cid=3,socket={socket}").as_str()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .args(device_params) .capture_output() .spawn() .unwrap(); let console_text = String::from("On a branch floating down river a cricket, 
singing."); - // Create the snapshot directory let snapshot_dir = temp_snapshot_dir_path(&guest.tmp_dir); let r = std::panic::catch_unwind(|| { guest.wait_vm_boot().unwrap(); - // Check the number of vCPUs - assert_eq!(guest.get_cpu_count().unwrap_or_default(), 4); - // Check the guest RAM - assert!(guest.get_total_memory().unwrap_or_default() > 1_920_000); - if use_hotplug { - // Increase guest RAM with virtio-mem - resize_command( - &api_socket_source, - None, - Some(6 << 30), - None, - Some(&event_path), - ); - thread::sleep(std::time::Duration::new(5, 0)); - assert!(guest.get_total_memory().unwrap_or_default() > 5_760_000); - // Use balloon to remove RAM from the VM - resize_command( - &api_socket_source, - None, - None, - Some(1 << 30), - Some(&event_path), - ); - thread::sleep(std::time::Duration::new(5, 0)); - let total_memory = guest.get_total_memory().unwrap_or_default(); - assert!(total_memory > 4_800_000); - assert!(total_memory < 5_760_000); - } - // Check the guest virtio-devices, e.g. block, rng, vsock, console, and net - guest.check_devices_common(Some(&socket), Some(&console_text), None); - - // x86_64: We check that removing and adding back the virtio-net device - // does not break the snapshot/restore support for virtio-pci. - // This is an important thing to test as the hotplug will - // trigger a PCI BAR reprogramming, which is a good way of - // checking if the stored resources are correctly restored. - // Unplug the virtio-net device - // AArch64: Device hotplug is currently not supported, skipping here. 
- #[cfg(target_arch = "x86_64")] - { - assert!(remote_command( - &api_socket_source, - "remove-device", - Some(net_id), - )); - thread::sleep(std::time::Duration::new(10, 0)); - let latest_events = [&MetaEvent { - event: "device-removed".to_string(), - device_id: Some(net_id.to_string()), - }]; - // See: #5938 - thread::sleep(std::time::Duration::new(1, 0)); - assert!(check_latest_events_exact(&latest_events, &event_path)); - - // Plug the virtio-net device again - assert!(remote_command( - &api_socket_source, - "add-net", - Some(net_params.as_str()), - )); - thread::sleep(std::time::Duration::new(10, 0)); - } + assert_eq!(guest.get_cpu_count().unwrap_or_default(), 2); snapshot_and_check_events(&api_socket_source, &snapshot_dir, &event_path); }); - // Shutdown the source VM and check console output kill_child(&mut child); let output = child.wait_with_output().unwrap(); handle_child_output(r, &output); - let r = std::panic::catch_unwind(|| { - assert!(String::from_utf8_lossy(&output.stdout).contains(&console_text)); - }); - - handle_child_output(r, &output); - - // Remove the vsock socket file. 
Command::new("rm") .arg("-f") .arg(socket.as_str()) @@ -10407,7 +7200,6 @@ mod common_sequential { let api_socket_restored = format!("{}.2", temp_api_path(&guest.tmp_dir)); let event_path_restored = format!("{}.2", temp_event_monitor_path(&guest.tmp_dir)); - // Restore the VM from the snapshot let mut child = GuestCommand::new(&guest) .args(["--api-socket", &api_socket_restored]) .args([ @@ -10422,51 +7214,23 @@ mod common_sequential { .spawn() .unwrap(); - // Wait for the VM to be restored - thread::sleep(std::time::Duration::new(20, 0)); - let expected_events = [ - &MetaEvent { - event: "starting".to_string(), - device_id: None, - }, - &MetaEvent { - event: "activated".to_string(), - device_id: Some("__console".to_string()), - }, - &MetaEvent { - event: "activated".to_string(), - device_id: Some("__rng".to_string()), - }, - &MetaEvent { - event: "restoring".to_string(), - device_id: None, - }, - ]; - assert!(check_sequential_events( - &expected_events, - &event_path_restored - )); let latest_events = [&MetaEvent { event: "restored".to_string(), device_id: None, }]; - assert!(check_latest_events_exact( - &latest_events, - &event_path_restored - )); + assert!(wait_until(Duration::from_secs(30), || { + check_latest_events_exact(&latest_events, &event_path_restored) + })); - // Remove the snapshot dir let _ = remove_dir_all(snapshot_dir.as_str()); let r = std::panic::catch_unwind(|| { - // Resume the VM + assert!(wait_until(Duration::from_secs(30), || remote_command( + &api_socket_restored, + "info", + None + ))); assert!(remote_command(&api_socket_restored, "resume", None)); - // There is no way that we can ensure the 'write()' to the - // event file is completed when the 'resume' request is - // returned successfully, because the 'write()' was done - // asynchronously from a different thread of Cloud - // Hypervisor (e.g. the event-monitor thread). 
- thread::sleep(std::time::Duration::new(1, 0)); let latest_events = [ &MetaEvent { event: "resuming".to_string(), @@ -10477,34 +7241,27 @@ mod common_sequential { device_id: None, }, ]; - assert!(check_latest_events_exact( - &latest_events, - &event_path_restored - )); - - // Perform same checks to validate VM has been properly restored - assert_eq!(guest.get_cpu_count().unwrap_or_default(), 4); - let total_memory = guest.get_total_memory().unwrap_or_default(); - if use_hotplug { - assert!(total_memory > 4_800_000); - assert!(total_memory < 5_760_000); - // Deflate balloon to restore entire RAM to the VM - resize_command(&api_socket_restored, None, None, Some(0), None); - thread::sleep(std::time::Duration::new(5, 0)); - assert!(guest.get_total_memory().unwrap_or_default() > 5_760_000); - // Decrease guest RAM with virtio-mem - resize_command(&api_socket_restored, None, Some(5 << 30), None, None); - thread::sleep(std::time::Duration::new(5, 0)); - let total_memory = guest.get_total_memory().unwrap_or_default(); - assert!(total_memory > 4_800_000); - assert!(total_memory < 5_760_000); - } else { - assert!(total_memory > 1_920_000); - } + assert!(wait_until(Duration::from_secs(30), || { + check_latest_events_exact(&latest_events, &event_path_restored) + })); + assert_eq!(guest.get_cpu_count().unwrap_or_default(), 2); guest.check_devices_common(Some(&socket), Some(&console_text), None); + + if pvpanic { + make_guest_panic(&guest); + thread::sleep(std::time::Duration::new(10, 0)); + + let expected_sequential_events = [&MetaEvent { + event: "panic".to_string(), + device_id: None, + }]; + assert!(check_latest_events_exact( + &expected_sequential_events, + &event_path_restored + )); + } }); - // Shutdown the target VM and check console output kill_child(&mut child); let output = child.wait_with_output().unwrap(); handle_child_output(r, &output); @@ -10515,6 +7272,39 @@ mod common_sequential { handle_child_output(r, &output); } +} + +mod common_sequential { + 
#[cfg(not(feature = "mshv"))] + use std::fs::remove_dir_all; + + #[cfg(not(feature = "mshv"))] + use crate::*; + + #[test] + #[cfg(not(feature = "mshv"))] + fn test_memory_mergeable_on() { + test_memory_mergeable(true); + } + + #[test] + #[cfg(not(feature = "mshv"))] + fn test_snapshot_restore_uffd_hugepage_zone() { + if !exec_host_command_status( + "grep -q '^Hugepagesize:[[:space:]]*2048 kB' /proc/meminfo && test $(awk '/HugePages_Free/ {print $2}' /proc/meminfo) -ge 256", + ) + .success() + { + println!("SKIPPED: not enough free 2MiB hugepages for UFFD restore test"); + return; + } + + snapshot_restore_common::_test_snapshot_restore_uffd( + "size=0", + &["id=mem0,size=512M,hugepages=on,hugepage_size=2M"], + 480_000, + ); + } #[test] #[cfg(not(feature = "mshv"))] // See issue #7437 @@ -10602,7 +7392,11 @@ mod common_sequential { // Check the guest virtio-devices, e.g. block, rng, vsock, console, and net guest.check_devices_common(None, Some(&console_text), None); - snapshot_and_check_events(&api_socket_source, &snapshot_dir, &event_path); + snapshot_restore_common::snapshot_and_check_events( + &api_socket_source, + &snapshot_dir, + &event_path, + ); }); // Shutdown the source VM and check console output @@ -10657,7 +7451,9 @@ mod common_sequential { )); // Wait for the VM to be restored - thread::sleep(std::time::Duration::new(20, 0)); + assert!(wait_until(Duration::from_secs(20), || { + remote_command(&api_socket_restored, "info", None) + })); // close the fds as CH duplicates them before using for tap in taps.iter() { @@ -10682,31 +7478,30 @@ mod common_sequential { device_id: None, }, ]; - assert!(check_sequential_events( - &expected_events, - &event_path_restored - )); + // Wait for the restore event sequence to be recorded. 
+ assert!(wait_until(Duration::from_secs(30), || { + check_sequential_events(&expected_events, &event_path_restored) + })); let latest_events = [&MetaEvent { event: "restored".to_string(), device_id: None, - }]; - assert!(check_latest_events_exact( - &latest_events, - &event_path_restored - )); + }]; + assert!(wait_until(Duration::from_secs(30), || { + check_latest_events_exact(&latest_events, &event_path_restored) + })); // Remove the snapshot dir let _ = remove_dir_all(snapshot_dir.as_str()); let r = std::panic::catch_unwind(|| { // Resume the VM + assert!(wait_until(Duration::from_secs(20), || remote_command( + &api_socket_restored, + "info", + None + ))); assert!(remote_command(&api_socket_restored, "resume", None)); - // There is no way that we can ensure the 'write()' to the - // event file is completed when the 'resume' request is - // returned successfully, because the 'write()' was done - // asynchronously from a different thread of Cloud - // Hypervisor (e.g. the event-monitor thread). 
- thread::sleep(std::time::Duration::new(1, 0)); + let latest_events = [ &MetaEvent { event: "resuming".to_string(), @@ -10717,10 +7512,9 @@ mod common_sequential { device_id: None, }, ]; - assert!(check_latest_events_exact( - &latest_events, - &event_path_restored - )); + assert!(wait_until(Duration::from_secs(30), || { + check_latest_events_exact(&latest_events, &event_path_restored) + })); // Perform same checks to validate VM has been properly restored assert_eq!(guest.get_cpu_count().unwrap_or_default(), n_cpu); @@ -10741,69 +7535,82 @@ mod common_sequential { } #[test] - #[cfg(not(feature = "mshv"))] // See issue #7437 - #[cfg(target_arch = "x86_64")] - fn test_snapshot_restore_pvpanic() { - _test_snapshot_restore_devices(true); - } - - fn _test_snapshot_restore_devices(pvpanic: bool) { + #[cfg(not(feature = "mshv"))] + fn test_snapshot_restore_virtio_fs() { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); let guest = Guest::new(Box::new(disk_config)); let kernel_path = direct_kernel_boot_path(); let api_socket_source = format!("{}.1", temp_api_path(&guest.tmp_dir)); - let device_params = { - let mut data = vec![]; - if pvpanic { - data.push(String::from("--pvpanic")); - } - data - }; + let mut workload_path = dirs::home_dir().unwrap(); + workload_path.push("workloads"); + let mut shared_dir = workload_path; + shared_dir.push("shared_dir"); + + let (mut daemon_child, virtiofsd_socket_path) = + prepare_virtiofsd(&guest.tmp_dir, shared_dir.to_str().unwrap()); - let socket = temp_vsock_path(&guest.tmp_dir); let event_path = temp_event_monitor_path(&guest.tmp_dir); let mut child = GuestCommand::new(&guest) .args(["--api-socket", &api_socket_source]) .args(["--event-monitor", format!("path={event_path}").as_str()]) .args(["--cpus", "boot=2"]) - .args(["--memory", "size=1G"]) + .args(["--memory", "size=512M,shared=on"]) .args(["--kernel", kernel_path.to_str().unwrap()]) .default_disks() .default_net() - .args(["--vsock", 
format!("cid=3,socket={socket}").as_str()]) + .args([ + "--fs", + format!("socket={virtiofsd_socket_path},tag=myfs,num_queues=1,queue_size=1024") + .as_str(), + ]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) - .args(device_params) .capture_output() .spawn() .unwrap(); - let console_text = String::from("On a branch floating down river a cricket, singing."); - // Create the snapshot directory let snapshot_dir = temp_snapshot_dir_path(&guest.tmp_dir); let r = std::panic::catch_unwind(|| { guest.wait_vm_boot().unwrap(); - // Check the number of vCPUs - assert_eq!(guest.get_cpu_count().unwrap_or_default(), 2); + // Mount virtiofs and write a test file + guest + .ssh_command("mkdir -p mount_dir && sudo mount -t virtiofs myfs mount_dir/") + .unwrap(); - snapshot_and_check_events(&api_socket_source, &snapshot_dir, &event_path); + // Verify the shared directory is accessible + assert_eq!( + guest.ssh_command("cat mount_dir/file1").unwrap().trim(), + "foo" + ); + + // Write a file from the guest + guest + .ssh_command( + "sudo bash -c 'echo snapshot_test_data > mount_dir/snapshot_test_file'", + ) + .unwrap(); + snapshot_restore_common::snapshot_and_check_events( + &api_socket_source, + &snapshot_dir, + &event_path, + ); }); - // Shutdown the source VM and check console output + // Shutdown the source VM kill_child(&mut child); let output = child.wait_with_output().unwrap(); handle_child_output(r, &output); - // Remove the vsock socket file. 
- Command::new("rm") - .arg("-f") - .arg(socket.as_str()) - .output() - .unwrap(); + // Kill the old virtiofsd + let _ = daemon_child.kill(); + let _ = daemon_child.wait(); + + // Start a fresh virtiofsd (reusing the same socket path) + let (mut daemon_child, _) = prepare_virtiofsd(&guest.tmp_dir, shared_dir.to_str().unwrap()); let api_socket_restored = format!("{}.2", temp_api_path(&guest.tmp_dir)); let event_path_restored = format!("{}.2", temp_event_monitor_path(&guest.tmp_dir)); @@ -10824,7 +7631,9 @@ mod common_sequential { .unwrap(); // Wait for the VM to be restored - thread::sleep(std::time::Duration::new(20, 0)); + assert!(wait_until(Duration::from_secs(30), || { + remote_command(&api_socket_restored, "info", None) + })); let latest_events = [&MetaEvent { event: "restored".to_string(), @@ -10840,63 +7649,51 @@ mod common_sequential { let r = std::panic::catch_unwind(|| { // Resume the VM + assert!(wait_until(Duration::from_secs(30), || remote_command( + &api_socket_restored, + "info", + None + ))); assert!(remote_command(&api_socket_restored, "resume", None)); - // There is no way that we can ensure the 'write()' to the - // event file is completed when the 'resume' request is - // returned successfully, because the 'write()' was done - // asynchronously from a different thread of Cloud - // Hypervisor (e.g. the event-monitor thread). 
- thread::sleep(std::time::Duration::new(1, 0)); - let latest_events = [ - &MetaEvent { - event: "resuming".to_string(), - device_id: None, - }, - &MetaEvent { - event: "resumed".to_string(), - device_id: None, - }, - ]; - assert!(check_latest_events_exact( - &latest_events, - &event_path_restored - )); + thread::sleep(std::time::Duration::new(5, 0)); - // Check the number of vCPUs - assert_eq!(guest.get_cpu_count().unwrap_or_default(), 2); - guest.check_devices_common(Some(&socket), Some(&console_text), None); + // Verify virtiofs still works after restore + // Read the file written before snapshot + assert_eq!( + guest + .ssh_command("cat mount_dir/snapshot_test_file") + .unwrap() + .trim(), + "snapshot_test_data" + ); - if pvpanic { - // Trigger guest a panic - make_guest_panic(&guest); - // Wait a while for guest - thread::sleep(std::time::Duration::new(10, 0)); + // Read the pre-existing shared file + assert_eq!( + guest.ssh_command("cat mount_dir/file1").unwrap().trim(), + "foo" + ); - let expected_sequential_events = [&MetaEvent { - event: "panic".to_string(), - device_id: None, - }]; - assert!(check_latest_events_exact( - &expected_sequential_events, - &event_path_restored - )); - } - }); - // Shutdown the target VM and check console output - kill_child(&mut child); - let output = child.wait_with_output().unwrap(); - handle_child_output(r, &output); + // Write a new file after restore + guest + .ssh_command("sudo bash -c 'echo post_restore_data > mount_dir/post_restore_file'") + .unwrap(); - let r = std::panic::catch_unwind(|| { - assert!(String::from_utf8_lossy(&output.stdout).contains(&console_text)); + // Verify the new file exists on the host + let post_restore_content = + std::fs::read_to_string(shared_dir.join("post_restore_file")).unwrap(); + assert_eq!(post_restore_content.trim(), "post_restore_data"); }); + // Shutdown the target VM + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); handle_child_output(r, &output); - } - 
#[test] - fn test_virtio_pmem_persist_writes() { - test_virtio_pmem(false, false); + // Clean up virtiofsd and test files + let _ = daemon_child.kill(); + let _ = daemon_child.wait(); + let _ = std::fs::remove_file(shared_dir.join("snapshot_test_file")); + let _ = std::fs::remove_file(shared_dir.join("post_restore_file")); } } @@ -10938,7 +7735,7 @@ mod windows { } fn ssh_cmd(&self, cmd: &str) -> String { - ssh_command_ip_with_auth( + ssh_command_ip_with_auth_retry( cmd, &self.auth, &self.guest.network.guest_ip0, @@ -11123,37 +7920,19 @@ mod windows { )) } - fn wait_for_boot(&self) -> bool { - let cmd = "dir /b c:\\ | find \"Windows\""; - let tmo_max = 180; - // The timeout increase by n*1+n*2+n*3+..., therefore the initial - // interval must be small. - let tmo_int = 2; - let out = ssh_command_ip_with_auth( - cmd, + fn wait_for_boot(&self) -> Result<(), WaitForSshError> { + let out = wait_for_ssh( + "dir /b c:\\ | find \"Windows\"", &self.auth, &self.guest.network.guest_ip0, - { - let mut ret = 1; - let mut tmo_acc = 0; - loop { - tmo_acc += tmo_int * ret; - if tmo_acc >= tmo_max { - break; - } - ret += 1; - } - ret - }, - tmo_int, - ) - .unwrap(); + Duration::from_secs(180), + )?; - if "Windows" == out.trim() { - return true; + if out.trim() == "Windows" { + Ok(()) + } else { + panic!("Unexpected Windows boot probe output: {:?}", out.trim()); } - - false } } @@ -11222,7 +8001,7 @@ mod windows { let r = std::panic::catch_unwind(|| { // Wait to make sure Windows boots up - assert!(windows_guest.wait_for_boot()); + windows_guest.wait_for_boot().unwrap(); windows_guest.shutdown(); }); @@ -11287,7 +8066,7 @@ mod windows { let r = std::panic::catch_unwind(|| { // Wait to make sure Windows boots up - assert!(windows_guest.wait_for_boot()); + windows_guest.wait_for_boot().unwrap(); windows_guest.shutdown(); }); @@ -11338,7 +8117,7 @@ mod windows { let mut child_dnsmasq = windows_guest.run_dnsmasq(); // Wait to make sure Windows boots up - 
assert!(windows_guest.wait_for_boot()); + windows_guest.wait_for_boot().unwrap(); let snapshot_dir = temp_snapshot_dir_path(&tmp_dir); @@ -11352,8 +8131,11 @@ mod windows { Some(format!("file://{snapshot_dir}").as_str()), )); - // Wait to make sure the snapshot is completed - thread::sleep(std::time::Duration::new(30, 0)); + let snapshot_state_path = std::path::Path::new(&snapshot_dir).join("state.json"); + let snapshot_config_path = std::path::Path::new(&snapshot_dir).join("config.json"); + assert!(wait_until(Duration::from_secs(30), || { + snapshot_state_path.exists() && snapshot_config_path.exists() + })); let _ = child.kill(); child.wait().unwrap(); @@ -11372,10 +8154,17 @@ mod windows { .unwrap(); // Wait for the VM to be restored - thread::sleep(std::time::Duration::new(20, 0)); + assert!(wait_until(Duration::from_secs(30), || { + remote_command(&api_socket_restored, "info", None) + })); let r = std::panic::catch_unwind(|| { // Resume the VM + assert!(wait_until(Duration::from_secs(30), || remote_command( + &api_socket_restored, + "info", + None + ))); assert!(remote_command(&api_socket_restored, "resume", None)); windows_guest.shutdown(); @@ -11421,7 +8210,7 @@ mod windows { let r = std::panic::catch_unwind(|| { // Wait to make sure Windows boots up - assert!(windows_guest.wait_for_boot()); + windows_guest.wait_for_boot().unwrap(); let vcpu_num = 2; // Check the initial number of CPUs the guest sees @@ -11432,8 +8221,10 @@ mod windows { let vcpu_num = 6; // Hotplug some CPUs resize_command(&api_socket, Some(vcpu_num), None, None, None); - // Wait to make sure CPUs are added - thread::sleep(std::time::Duration::new(10, 0)); + // Wait for Windows to report the hotplugged CPUs. 
+ assert!(wait_until(Duration::from_secs(10), || windows_guest + .cpu_count() + == vcpu_num)); // Check the guest sees the correct number assert_eq!(windows_guest.cpu_count(), vcpu_num); // Check the CH process has the correct number of vcpu threads @@ -11442,12 +8233,16 @@ mod windows { let vcpu_num = 4; // Remove some CPUs. Note that Windows doesn't support hot-remove. resize_command(&api_socket, Some(vcpu_num), None, None, None); - // Wait to make sure CPUs are removed thread::sleep(std::time::Duration::new(10, 0)); + // Reboot to let Windows catch up windows_guest.reboot(); - // Wait to make sure Windows completely rebooted - thread::sleep(std::time::Duration::new(60, 0)); + // Wait for Windows to come back after the reboot. + windows_guest.wait_for_boot().unwrap(); + // Wait for Windows to reflect the unplugged CPU count. + assert!(wait_until(Duration::from_secs(60), || windows_guest + .cpu_count() + == vcpu_num)); // Check the guest sees the correct number assert_eq!(windows_guest.cpu_count(), vcpu_num); // Check the CH process has the correct number of vcpu threads @@ -11496,7 +8291,7 @@ mod windows { let r = std::panic::catch_unwind(|| { // Wait to make sure Windows boots up - assert!(windows_guest.wait_for_boot()); + windows_guest.wait_for_boot().unwrap(); let ram_size = 2 * 1024 * 1024 * 1024; // Check the initial number of RAM the guest sees @@ -11511,20 +8306,22 @@ mod windows { let ram_size = 4 * 1024 * 1024 * 1024; // Hotplug some RAM resize_command(&api_socket, None, Some(ram_size), None, None); - // Wait to make sure RAM has been added - thread::sleep(std::time::Duration::new(10, 0)); - // Check the guest sees the correct number - assert_eq!(windows_guest.ram_size(), ram_size - reserved_ram_size); + // Wait for Windows to report the hotplugged memory. + assert!(wait_until(Duration::from_secs(10), || windows_guest + .ram_size() + == ram_size - reserved_ram_size)); let ram_size = 3 * 1024 * 1024 * 1024; // Unplug some RAM. 
Note that hot-remove most likely won't work. resize_command(&api_socket, None, Some(ram_size), None, None); - // Wait to make sure RAM has been added - thread::sleep(std::time::Duration::new(10, 0)); // Reboot to let Windows catch up windows_guest.reboot(); - // Wait to make sure guest completely rebooted - thread::sleep(std::time::Duration::new(60, 0)); + // Wait for Windows to come back after the reboot. + windows_guest.wait_for_boot().unwrap(); + // Wait for Windows to reflect the unplugged RAM amount. + assert!(wait_until(Duration::from_secs(60), || windows_guest + .ram_size() + == ram_size - reserved_ram_size)); // Check the guest sees the correct number assert_eq!(windows_guest.ram_size(), ram_size - reserved_ram_size); @@ -11570,7 +8367,7 @@ mod windows { let r = std::panic::catch_unwind(|| { // Wait to make sure Windows boots up - assert!(windows_guest.wait_for_boot()); + windows_guest.wait_for_boot().unwrap(); // Initially present network device let netdev_num = 1; @@ -11578,14 +8375,18 @@ mod windows { assert_eq!(netdev_ctrl_threads_count(child.id()), netdev_num); // Hotplug network device - let (cmd_success, cmd_output) = remote_command_w_output( + let (cmd_success, cmd_output, _) = remote_command_w_output( &api_socket, "add-net", Some(windows_guest.guest().default_net_string().as_str()), ); assert!(cmd_success); assert!(String::from_utf8_lossy(&cmd_output).contains("\"id\":\"_net2\"")); - thread::sleep(std::time::Duration::new(5, 0)); + // Wait for Windows to enumerate the added network device. 
+ assert!(wait_until(Duration::from_secs(5), || windows_guest + .netdev_count() + == 2 + && netdev_ctrl_threads_count(child.id()) == 2)); // Verify the device is on the system let netdev_num = 2; assert_eq!(windows_guest.netdev_count(), netdev_num); @@ -11594,7 +8395,11 @@ mod windows { // Remove network device let cmd_success = remote_command(&api_socket, "remove-device", Some("_net2")); assert!(cmd_success); - thread::sleep(std::time::Duration::new(5, 0)); + // Wait for Windows to drop the removed network device. + assert!(wait_until(Duration::from_secs(5), || windows_guest + .netdev_count() + == 1 + && netdev_ctrl_threads_count(child.id()) == 1)); // Verify the device has been removed let netdev_num = 1; assert_eq!(windows_guest.netdev_count(), netdev_num); @@ -11646,7 +8451,7 @@ mod windows { let r = std::panic::catch_unwind(|| { // Wait to make sure Windows boots up - assert!(windows_guest.wait_for_boot()); + windows_guest.wait_for_boot().unwrap(); // Initially present disk device let disk_num = 1; @@ -11654,17 +8459,21 @@ mod windows { assert_eq!(disk_ctrl_threads_count(child.id()), disk_num); // Hotplug disk device - let (cmd_success, cmd_output) = remote_command_w_output( + let (cmd_success, cmd_output, _) = remote_command_w_output( &api_socket, "add-disk", Some(format!("path={disk},readonly=off").as_str()), ); assert!(cmd_success); assert!(String::from_utf8_lossy(&cmd_output).contains("\"id\":\"_disk2\"")); - thread::sleep(std::time::Duration::new(5, 0)); // Online disk device windows_guest.disks_set_rw(); windows_guest.disks_online(); + // Wait for Windows to enumerate the added disk. 
+ assert!(wait_until(Duration::from_secs(5), || windows_guest + .disk_count() + == 2 + && disk_ctrl_threads_count(child.id()) == 2)); // Verify the device is on the system let disk_num = 2; assert_eq!(windows_guest.disk_count(), disk_num); @@ -11677,20 +8486,28 @@ mod windows { // Unmount disk device let cmd_success = remote_command(&api_socket, "remove-device", Some("_disk2")); assert!(cmd_success); - thread::sleep(std::time::Duration::new(5, 0)); + // Wait for Windows to drop the removed disk. + assert!(wait_until(Duration::from_secs(5), || windows_guest + .disk_count() + == 1 + && disk_ctrl_threads_count(child.id()) == 1)); // Verify the device has been removed let disk_num = 1; assert_eq!(windows_guest.disk_count(), disk_num); assert_eq!(disk_ctrl_threads_count(child.id()), disk_num); // Remount and check the file exists with the expected contents - let (cmd_success, _cmd_output) = remote_command_w_output( + let (cmd_success, _cmd_output, _) = remote_command_w_output( &api_socket, "add-disk", Some(format!("path={disk},readonly=off").as_str()), ); assert!(cmd_success); - thread::sleep(std::time::Duration::new(5, 0)); + // Wait for Windows to mount the re-added disk again. 
+ assert!(wait_until(Duration::from_secs(5), || windows_guest + .disk_file_read(fname) + .trim() + == data)); let out = windows_guest.disk_file_read(fname); assert_eq!(data, out.trim()); @@ -11756,7 +8573,7 @@ mod windows { let r = std::panic::catch_unwind(|| { // Wait to make sure Windows boots up - assert!(windows_guest.wait_for_boot()); + windows_guest.wait_for_boot().unwrap(); // Initially present disk device let disk_num = 1; @@ -11766,8 +8583,12 @@ mod windows { for it in &disk_test_data { let disk_id = it[0].as_str(); let disk = it[1].as_str(); + + let expected_disk_num = windows_guest.disk_count() + 1; + let expected_ctrl_threads = disk_ctrl_threads_count(child.id()) + 1; + // Hotplug disk device - let (cmd_success, cmd_output) = remote_command_w_output( + let (cmd_success, cmd_output, _) = remote_command_w_output( &api_socket, "add-disk", Some(format!("path={disk},readonly=off").as_str()), @@ -11777,7 +8598,13 @@ mod windows { String::from_utf8_lossy(&cmd_output) .contains(format!("\"id\":\"{disk_id}\"").as_str()) ); - thread::sleep(std::time::Duration::new(5, 0)); + + // Wait for disk to appear + assert!(wait_until(Duration::from_secs(5), || { + windows_guest.disk_count() == expected_disk_num + && disk_ctrl_threads_count(child.id()) == expected_ctrl_threads + })); + // Online disk devices windows_guest.disks_set_rw(); windows_guest.disks_online(); @@ -11799,9 +8626,13 @@ mod windows { let disk_id = it[0].as_str(); let cmd_success = remote_command(&api_socket, "remove-device", Some(disk_id)); assert!(cmd_success); - thread::sleep(std::time::Duration::new(5, 0)); } + // Wait for Windows to drop all removed disks. 
+ assert!(wait_until(Duration::from_secs(5), || windows_guest + .disk_count() + == 1 + && disk_ctrl_threads_count(child.id()) == 1)); // Verify the devices have been removed let disk_num = 1; assert_eq!(windows_guest.disk_count(), disk_num); @@ -11810,15 +8641,18 @@ mod windows { // Remount for it in &disk_test_data { let disk = it[1].as_str(); - let (cmd_success, _cmd_output) = remote_command_w_output( + let (cmd_success, _cmd_output, _) = remote_command_w_output( &api_socket, "add-disk", Some(format!("path={disk},readonly=off").as_str()), ); assert!(cmd_success); - thread::sleep(std::time::Duration::new(5, 0)); } + // Wait for Windows to enumerate the re-added disks. + assert!(wait_until(Duration::from_secs(5), || { + windows_guest.disk_count() == 4 && disk_ctrl_threads_count(child.id()) == 4 + })); // Check the files exists with the expected contents for it in &disk_test_data { let fname = it[2].as_str(); @@ -11878,7 +8712,7 @@ mod windows { let r = std::panic::catch_unwind(|| { // Wait to make sure Windows boots up - assert!(windows_guest.wait_for_boot()); + windows_guest.wait_for_boot().unwrap(); let netdev_num = 3; assert_eq!(windows_guest.netdev_count(), netdev_num); @@ -11899,6 +8733,73 @@ mod windows { handle_child_output(r, &output); } + + #[test] + fn test_windows_guest_qcow2_backing_direct() { + let windows_guest = WindowsGuest::new(); + + let qcow2_path = windows_guest.guest().disk_config.qcow2_disk().unwrap(); + + let mut child = GuestCommand::new(windows_guest.guest()) + .args(["--cpus", "boot=2,kvm_hyperv=on"]) + .args(["--memory", "size=4G"]) + .args(["--kernel", edk2_path().to_str().unwrap()]) + .args(["--serial", "tty"]) + .args(["--console", "off"]) + .args([ + "--disk", + format!("path={qcow2_path},image_type=qcow2,backing_files=on,direct=on").as_str(), + ]) + .default_net() + .capture_output() + .spawn() + .unwrap(); + + let fd = child.stdout.as_ref().unwrap().as_raw_fd(); + let pipesize = unsafe { libc::fcntl(fd, libc::F_SETPIPE_SZ, 
PIPE_SIZE) }; + let fd = child.stderr.as_ref().unwrap().as_raw_fd(); + let pipesize1 = unsafe { libc::fcntl(fd, libc::F_SETPIPE_SZ, PIPE_SIZE) }; + + assert!(pipesize >= PIPE_SIZE && pipesize1 >= PIPE_SIZE); + + let mut child_dnsmasq = windows_guest.run_dnsmasq(); + + let r = std::panic::catch_unwind(|| { + windows_guest.wait_for_boot().unwrap(); + + // Write and read back files through qcow2 + direct I/O. + for i in 0..5 { + let fname = format!("c:\\test-dio-{i}.bin"); + let fname2 = format!("c:\\test-dio-{i}-copy.bin"); + let size = (i + 1) * 4 * 1024 * 1024; + windows_guest.ssh_cmd(&format!( + "powershell -Command \"\ + $r = New-Object byte[] {size}; \ + (New-Object Random {i}).NextBytes($r); \ + [IO.File]::WriteAllBytes('{fname}', $r)\"" + )); + let hash_write = windows_guest.ssh_cmd(&format!( + "powershell -Command \"(Get-FileHash '{fname}' -Algorithm SHA256).Hash\"" + )); + windows_guest.ssh_cmd(&format!("copy {fname} {fname2}")); + let hash_read = windows_guest.ssh_cmd(&format!( + "powershell -Command \"(Get-FileHash '{fname2}' -Algorithm SHA256).Hash\"" + )); + assert_eq!(hash_write.trim(), hash_read.trim()); + } + + windows_guest.shutdown(); + }); + + let _ = child.wait_timeout(std::time::Duration::from_secs(60)); + let _ = child.kill(); + let output = child.wait_with_output().unwrap(); + + let _ = child_dnsmasq.kill(); + let _ = child_dnsmasq.wait(); + + handle_child_output(r, &output); + } } #[cfg(target_arch = "x86_64")] @@ -11906,7 +8807,15 @@ mod vfio { use crate::*; const NVIDIA_VFIO_DEVICE: &str = "/sys/bus/pci/devices/0002:00:01.0"; - fn test_nvidia_card_memory_hotplug(hotplug_method: &str) { + fn platform_cfg(iommufd: bool) -> String { + if iommufd { + "iommufd=on,vfio_p2p_dma=off".to_string() + } else { + "iommufd=off".to_string() + } + } + + fn test_nvidia_card_memory_hotplug(hotplug_method: &str, iommufd: bool) { let disk_config = UbuntuDiskConfig::new(JAMMY_VFIO_IMAGE_NAME.to_string()); let guest = Guest::new(Box::new(disk_config)); let 
api_socket = temp_api_path(&guest.tmp_dir); @@ -11918,6 +8827,7 @@ mod vfio { format!("size=4G,hotplug_size=4G,hotplug_method={hotplug_method}").as_str(), ]) .args(["--kernel", fw_path(FwType::RustHypervisorFirmware).as_str()]) + .args(["--platform", &platform_cfg(iommufd)]) .args(["--device", format!("path={NVIDIA_VFIO_DEVICE}").as_str()]) .args(["--api-socket", &api_socket]) .default_disks() @@ -11931,16 +8841,23 @@ mod vfio { assert!(guest.get_total_memory().unwrap_or_default() > 3_840_000); + // Verify the VFIO device works before memory hotplug + guest.check_nvidia_gpu(); + guest.enable_memory_hotplug(); // Add RAM to the VM let desired_ram = 6 << 30; resize_command(&api_socket, None, Some(desired_ram), None, None); - thread::sleep(std::time::Duration::new(30, 0)); + assert!(wait_until(Duration::from_secs(5), || { + guest.get_total_memory().unwrap_or_default() > 5_760_000 + })); assert!(guest.get_total_memory().unwrap_or_default() > 5_760_000); - // Check the VFIO device works when RAM is increased to 6GiB - guest.check_nvidia_gpu(); + // Check the VFIO device works when RAM is increased to 6GiB. + // After guest memory hotplug, the VMM must refresh VFIO/iommufd DMA + // mappings for the passthrough GPU. 
+ assert!(wait_until(Duration::from_secs(10), || guest.check_nvidia_gpu())); }); let _ = child.kill(); @@ -11951,24 +8868,34 @@ mod vfio { #[test] fn test_nvidia_card_memory_hotplug_acpi() { - test_nvidia_card_memory_hotplug("acpi"); + test_nvidia_card_memory_hotplug("acpi", false); } #[test] fn test_nvidia_card_memory_hotplug_virtio_mem() { - test_nvidia_card_memory_hotplug("virtio-mem"); + test_nvidia_card_memory_hotplug("virtio-mem", false); } #[test] - fn test_nvidia_card_pci_hotplug() { + fn test_iommufd_nvidia_card_memory_hotplug_acpi() { + test_nvidia_card_memory_hotplug("acpi", true); + } + + #[test] + fn test_iommufd_nvidia_card_memory_hotplug_virtio_mem() { + test_nvidia_card_memory_hotplug("virtio-mem", true); + } + + fn test_nvidia_card_pci_hotplug_common(iommufd: bool) { let disk_config = UbuntuDiskConfig::new(JAMMY_VFIO_IMAGE_NAME.to_string()); let guest = Guest::new(Box::new(disk_config)); let api_socket = temp_api_path(&guest.tmp_dir); let mut child = GuestCommand::new(&guest) .args(["--cpus", "boot=4"]) - .args(["--memory", "size=4G"]) + .args(["--memory", "size=1G"]) .args(["--kernel", fw_path(FwType::RustHypervisorFirmware).as_str()]) + .args(["--platform", &platform_cfg(iommufd)]) .args(["--api-socket", &api_socket]) .default_disks() .default_net() @@ -11980,7 +8907,7 @@ mod vfio { guest.wait_vm_boot().unwrap(); // Hotplug the card to the VM - let (cmd_success, cmd_output) = remote_command_w_output( + let (cmd_success, cmd_output, _) = remote_command_w_output( &api_socket, "add-device", Some(format!("id=vfio0,path={NVIDIA_VFIO_DEVICE}").as_str()), @@ -11991,10 +8918,8 @@ mod vfio { .contains("{\"id\":\"vfio0\",\"bdf\":\"0000:00:06.0\"}") ); - thread::sleep(std::time::Duration::new(10, 0)); - // Check the VFIO device works after hotplug - guest.check_nvidia_gpu(); + assert!(wait_until(Duration::from_secs(10), || guest.check_nvidia_gpu())); }); let _ = child.kill(); @@ -12004,14 +8929,24 @@ mod vfio { } #[test] - fn test_nvidia_card_reboot() { + 
fn test_nvidia_card_pci_hotplug() { + test_nvidia_card_pci_hotplug_common(false); + } + + #[test] + fn test_iommufd_nvidia_card_pci_hotplug() { + test_nvidia_card_pci_hotplug_common(true); + } + + fn test_nvidia_card_reboot_common(iommufd: bool) { let disk_config = UbuntuDiskConfig::new(JAMMY_VFIO_IMAGE_NAME.to_string()); let guest = Guest::new(Box::new(disk_config)); let api_socket = temp_api_path(&guest.tmp_dir); let mut child = GuestCommand::new(&guest) .args(["--cpus", "boot=4"]) - .args(["--memory", "size=4G"]) + .args(["--memory", "size=1G"]) + .args(["--platform", &platform_cfg(iommufd)]) .args(["--kernel", fw_path(FwType::RustHypervisorFirmware).as_str()]) .args([ "--device", @@ -12028,12 +8963,12 @@ mod vfio { guest.wait_vm_boot().unwrap(); // Check the VFIO device works after boot - guest.check_nvidia_gpu(); + assert!(guest.check_nvidia_gpu()); guest.reboot_linux(0); // Check the VFIO device works after reboot - guest.check_nvidia_gpu(); + assert!(guest.check_nvidia_gpu()); }); let _ = child.kill(); @@ -12043,20 +8978,31 @@ mod vfio { } #[test] - fn test_nvidia_card_iommu_address_width() { + fn test_nvidia_card_reboot() { + test_nvidia_card_reboot_common(false); + } + + #[test] + fn test_iommufd_nvidia_card_reboot() { + test_nvidia_card_reboot_common(true); + } + + fn test_nvidia_card_iommu_address_width_common(iommufd: bool) { let disk_config = UbuntuDiskConfig::new(JAMMY_VFIO_IMAGE_NAME.to_string()); let guest = Guest::new(Box::new(disk_config)); let api_socket = temp_api_path(&guest.tmp_dir); + let platform = format!( + "num_pci_segments=2,iommu_segments=1,iommu_address_width=42,{}", + platform_cfg(iommufd) + ); + let mut child = GuestCommand::new(&guest) .args(["--cpus", "boot=4"]) - .args(["--memory", "size=4G"]) + .args(["--memory", "size=1G"]) .args(["--kernel", fw_path(FwType::RustHypervisorFirmware).as_str()]) .args(["--device", format!("path={NVIDIA_VFIO_DEVICE}").as_str()]) - .args([ - "--platform", - 
"num_pci_segments=2,iommu_segments=1,iommu_address_width=42", - ]) + .args(["--platform", &platform]) .args(["--api-socket", &api_socket]) .default_disks() .default_net() @@ -12073,6 +9019,9 @@ mod vfio { .unwrap() .contains("input address: 42 bits") ); + + // Check the VFIO device works after boot + guest.check_nvidia_gpu(); }); let _ = child.kill(); @@ -12082,29 +9031,34 @@ mod vfio { } #[test] - fn test_nvidia_guest_numa_generic_initiator() { + fn test_nvidia_card_iommu_address_width() { + test_nvidia_card_iommu_address_width_common(false); + } + + #[test] + fn test_iommufd_nvidia_card_iommu_address_width() { + test_nvidia_card_iommu_address_width_common(true); + } + + fn test_nvidia_guest_numa_generic_initiator_common(iommufd: bool) { // Skip test if VFIO device is not available or not ready if !std::path::Path::new(NVIDIA_VFIO_DEVICE).exists() { - println!("SKIPPED: VFIO device {} not found", NVIDIA_VFIO_DEVICE); + println!("SKIPPED: VFIO device {NVIDIA_VFIO_DEVICE} not found"); return; } // Check if device is bound to vfio-pci driver - let driver_path = format!("{}/driver", NVIDIA_VFIO_DEVICE); + let driver_path = format!("{NVIDIA_VFIO_DEVICE}/driver"); if let Ok(driver) = std::fs::read_link(&driver_path) { let driver_name = driver.file_name().unwrap_or_default().to_string_lossy(); if driver_name != "vfio-pci" { println!( - "SKIPPED: VFIO device {} bound to {}, not vfio-pci", - NVIDIA_VFIO_DEVICE, driver_name + "SKIPPED: VFIO device {NVIDIA_VFIO_DEVICE} bound to {driver_name}, not vfio-pci" ); return; } } else { - println!( - "SKIPPED: VFIO device {} not bound to any driver", - NVIDIA_VFIO_DEVICE - ); + println!("SKIPPED: VFIO device {NVIDIA_VFIO_DEVICE} not bound to any driver"); return; } @@ -12123,9 +9077,10 @@ mod vfio { "guest_numa_id=1,cpus=[2-3],distances=[0@20,2@30],memory_zones=mem1", "guest_numa_id=2,device_id=vfio0,distances=[0@25,1@30]", ]) + .args(["--platform", &platform_cfg(iommufd)]) .args([ "--device", - &format!("id=vfio0,path={},iommu=on", 
NVIDIA_VFIO_DEVICE), + &format!("id=vfio0,path={NVIDIA_VFIO_DEVICE},iommu=on"), ]) .args(["--kernel", fw_path(FwType::RustHypervisorFirmware).as_str()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) @@ -12193,11 +9148,25 @@ mod vfio { kill_child(&mut child); let output = child.wait_with_output().unwrap(); - handle_child_output(r, &output); + handle_child_output(r, &output); + } + + #[test] + fn test_nvidia_guest_numa_generic_initiator() { + test_nvidia_guest_numa_generic_initiator_common(false); + } + + #[test] + fn test_iommufd_nvidia_guest_numa_generic_initiator() { + test_nvidia_guest_numa_generic_initiator_common(true); } } mod live_migration { + use std::num::NonZeroU32; + + use vmm::api::TimeoutStrategy; + use crate::*; pub fn start_live_migration( @@ -12221,17 +9190,16 @@ mod live_migration { thread::sleep(std::time::Duration::new(1, 0)); // Start to send migration from the source VM - let mut args = [ + let args = [ format!("--api-socket={}", &src_api_socket), "send-migration".to_string(), - format! {"unix:{migration_socket}"}, + format!( + "destination_url=unix:{migration_socket},local={}", + if local { "on" } else { "off" } + ), ] .to_vec(); - if local { - args.insert(2, "--local".to_string()); - } - let mut send_migration = Command::new(clh_command("ch-remote")) .args(&args) .stderr(Stdio::piped()) @@ -12355,9 +9323,9 @@ mod live_migration { ); let memory_param: &[&str] = if local { - &["--memory", "size=4G,shared=on"] + &["--memory", "size=1500M,shared=on"] } else { - &["--memory", "size=4G"] + &["--memory", "size=1500M"] }; let boot_vcpus = 2; @@ -12413,7 +9381,7 @@ mod live_migration { assert_eq!(guest.get_cpu_count().unwrap_or_default(), boot_vcpus); // Check the guest RAM - assert!(guest.get_total_memory().unwrap_or_default() > 3_840_000); + assert!(guest.get_total_memory().unwrap_or_default() > 1_400_000); // Check the guest virtio-devices, e.g. 
block, rng, console, and net guest.check_devices_common(None, Some(&console_text), Some(&pmem_path)); @@ -12428,7 +9396,9 @@ mod live_migration { "remove-device", Some(net_id), )); - thread::sleep(std::time::Duration::new(10, 0)); + assert!(wait_until(Duration::from_secs(10), || { + guest.wait_for_ssh(Duration::from_secs(1)).is_err() + })); // Plug the virtio-net device again assert!(remote_command( @@ -12436,7 +9406,7 @@ mod live_migration { "add-net", Some(net_params.as_str()), )); - thread::sleep(std::time::Duration::new(10, 0)); + guest.wait_for_ssh(Duration::from_secs(10)).unwrap(); } // Start the live-migration @@ -12480,7 +9450,7 @@ mod live_migration { let r = std::panic::catch_unwind(|| { // Perform same checks to validate VM has been properly migrated assert_eq!(guest.get_cpu_count().unwrap_or_default(), boot_vcpus); - assert!(guest.get_total_memory().unwrap_or_default() > 3_840_000); + assert!(guest.get_total_memory().unwrap_or_default() > 1_400_000); guest.check_devices_common(None, Some(&console_text), Some(&pmem_path)); }); @@ -12580,11 +9550,16 @@ mod live_migration { assert!(guest.get_total_memory().unwrap_or_default() > 3_840_000); // Increase the guest RAM resize_command(&src_api_socket, None, Some(6 << 30), None, None); - thread::sleep(std::time::Duration::new(5, 0)); + assert!(wait_until(Duration::from_secs(30), || { + guest.get_total_memory().unwrap_or_default() > 5_760_000 + })); assert!(guest.get_total_memory().unwrap_or_default() > 5_760_000); // Use balloon to remove RAM from the VM resize_command(&src_api_socket, None, None, Some(1 << 30), None); - thread::sleep(std::time::Duration::new(5, 0)); + assert!(wait_until(Duration::from_secs(5), || { + let total_memory = guest.get_total_memory().unwrap_or_default(); + total_memory > 4_800_000 && total_memory < 5_760_000 + })); let total_memory = guest.get_total_memory().unwrap_or_default(); assert!(total_memory > 4_800_000); assert!(total_memory < 5_760_000); @@ -12602,7 +9577,9 @@ mod 
live_migration { "remove-device", Some(net_id), )); - thread::sleep(std::time::Duration::new(10, 0)); + assert!(wait_until(Duration::from_secs(10), || { + guest.wait_for_ssh(Duration::from_secs(1)).is_err() + })); // Plug the virtio-net device again assert!(remote_command( @@ -12610,7 +9587,7 @@ mod live_migration { "add-net", Some(net_params.as_str()), )); - thread::sleep(std::time::Duration::new(10, 0)); + guest.wait_for_ssh(Duration::from_secs(10)).unwrap(); } // Start the live-migration @@ -12820,7 +9797,9 @@ mod live_migration { "remove-device", Some(net_id), )); - thread::sleep(std::time::Duration::new(10, 0)); + assert!(wait_until(Duration::from_secs(10), || { + guest.wait_for_ssh(Duration::from_secs(1)).is_err() + })); // Plug the virtio-net device again assert!(remote_command( @@ -12828,7 +9807,7 @@ mod live_migration { "add-net", Some(net_params.as_str()), )); - thread::sleep(std::time::Duration::new(10, 0)); + guest.wait_for_ssh(Duration::from_secs(10)).unwrap(); } // Start the live-migration @@ -12945,9 +9924,9 @@ mod live_migration { ); let memory_param: &[&str] = if local { - &["--memory", "size=4G,shared=on"] + &["--memory", "size=1500M,shared=on"] } else { - &["--memory", "size=4G"] + &["--memory", "size=1500M"] }; let boot_vcpus = 2; @@ -13003,7 +9982,7 @@ mod live_migration { // Check the number of vCPUs assert_eq!(guest.get_cpu_count().unwrap_or_default(), boot_vcpus); // Check the guest RAM - assert!(guest.get_total_memory().unwrap_or_default() > 3_840_000); + assert!(guest.get_total_memory().unwrap_or_default() > 1_400_000); // Check the guest virtio-devices, e.g. 
block, rng, console, and net guest.check_devices_common(None, Some(&console_text), Some(&pmem_path)); // x86_64: Following what's done in the `test_snapshot_restore`, we need @@ -13016,7 +9995,9 @@ mod live_migration { "remove-device", Some(net_id), )); - thread::sleep(std::time::Duration::new(10, 0)); + assert!(wait_until(Duration::from_secs(10), || { + guest.wait_for_ssh(Duration::from_secs(1)).is_err() + })); // Plug the virtio-net device again assert!(remote_command( @@ -13024,7 +10005,7 @@ mod live_migration { "add-net", Some(net_params.as_str()), )); - thread::sleep(std::time::Duration::new(10, 0)); + guest.wait_for_ssh(Duration::from_secs(10)).unwrap(); } // Enable watchdog and ensure its functional @@ -13088,7 +10069,7 @@ mod live_migration { let r = std::panic::catch_unwind(|| { // Perform same checks to validate VM has been properly migrated assert_eq!(guest.get_cpu_count().unwrap_or_default(), boot_vcpus); - assert!(guest.get_total_memory().unwrap_or_default() > 3_840_000); + assert!(guest.get_total_memory().unwrap_or_default() > 1_400_000); guest.check_devices_common(None, Some(&console_text), Some(&pmem_path)); @@ -13263,7 +10244,7 @@ mod live_migration { "--cpus", format!("boot={boot_vcpus},max={max_vcpus}").as_str(), ]) - .args(["--memory", "size=4G,shared=on"]) + .args(["--memory", "size=1500M,shared=on"]) .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .default_disks() @@ -13295,7 +10276,7 @@ mod live_migration { assert_eq!(guest.get_cpu_count().unwrap_or_default(), boot_vcpus); // Check the guest RAM - assert!(guest.get_total_memory().unwrap_or_default() > 3_840_000); + assert!(guest.get_total_memory().unwrap_or_default() > 1_400_000); // Check Landlock is enabled by hot-plugging a disk. 
assert!(!remote_command( @@ -13345,7 +10326,7 @@ mod live_migration { let r = std::panic::catch_unwind(|| { // Perform same checks to validate VM has been properly migrated assert_eq!(guest.get_cpu_count().unwrap_or_default(), boot_vcpus); - assert!(guest.get_total_memory().unwrap_or_default() > 3_840_000); + assert!(guest.get_total_memory().unwrap_or_default() > 1_400_000); }); // Check Landlock is enabled on destination VM by hot-plugging a disk. @@ -13370,7 +10351,11 @@ mod live_migration { .port() } - fn start_live_migration_tcp(src_api_socket: &str, dest_api_socket: &str) -> bool { + fn start_live_migration_tcp( + src_api_socket: &str, + dest_api_socket: &str, + connections: NonZeroU32, + ) -> bool { // Get an available TCP port let migration_port = get_available_port(); let host_ip = "127.0.0.1"; @@ -13392,11 +10377,14 @@ mod live_migration { thread::sleep(Duration::from_secs(1)); // Start the 'send-migration' command on the source + let connections = connections.get(); let mut send_migration = Command::new(clh_command("ch-remote")) .args([ &format!("--api-socket={src_api_socket}"), "send-migration", - &format!("tcp:{host_ip}:{migration_port}"), + &format!( + "destination_url=tcp:{host_ip}:{migration_port},connections={connections}" + ), ]) .stdin(Stdio::null()) .stderr(Stdio::piped()) @@ -13404,95 +10392,367 @@ mod live_migration { .spawn() .unwrap(); - // Check if the 'send-migration' command executed successfully - let send_success = if let Some(status) = send_migration - .wait_timeout(Duration::from_secs(60)) - .unwrap() - { - status.success() - } else { - false - }; + // Check if the 'send-migration' command executed successfully + let send_success = if let Some(status) = send_migration + .wait_timeout(Duration::from_secs(60)) + .unwrap() + { + status.success() + } else { + false + }; + + if !send_success { + let _ = send_migration.kill(); + let output = send_migration.wait_with_output().unwrap(); + eprintln!( + "\n\n==== Start 'send_migration' output 
====\n\n---stdout---\n{}\n\n---stderr---\n{}\n\n==== End 'send_migration' output ====\n\n", + String::from_utf8_lossy(&output.stdout), + String::from_utf8_lossy(&output.stderr) + ); + } + + // Check if the 'receive-migration' command executed successfully + let receive_success = if let Some(status) = receive_migration + .wait_timeout(Duration::from_secs(60)) + .unwrap() + { + status.success() + } else { + false + }; + + if !receive_success { + let _ = receive_migration.kill(); + let output = receive_migration.wait_with_output().unwrap(); + eprintln!( + "\n\n==== Start 'receive_migration' output ====\n\n---stdout---\n{}\n\n---stderr---\n{}\n\n==== End 'receive_migration' output ====\n\n", + String::from_utf8_lossy(&output.stdout), + String::from_utf8_lossy(&output.stderr) + ); + } + + send_success && receive_success + } + + fn _test_live_migration_tcp(connections: NonZeroU32) { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)); + let kernel_path = direct_kernel_boot_path(); + let console_text = String::from("On a branch floating down river a cricket, singing."); + let net_id = "net123"; + let net_params = format!( + "id={},tap=,mac={},ip={},mask=255.255.255.128", + net_id, guest.network.guest_mac0, guest.network.host_ip0 + ); + let memory_param: &[&str] = &["--memory", "size=1500M,shared=on"]; + let boot_vcpus = 2; + let max_vcpus = 4; + let pmem_temp_file = TempFile::new().unwrap(); + pmem_temp_file.as_file().set_len(128 << 20).unwrap(); + std::process::Command::new("mkfs.ext4") + .arg(pmem_temp_file.as_path()) + .output() + .expect("Expect creating disk image to succeed"); + let pmem_path = String::from("/dev/pmem0"); + + // Start the source VM + let src_vm_path = clh_command("cloud-hypervisor"); + let src_api_socket = temp_api_path(&guest.tmp_dir); + let mut src_vm_cmd = GuestCommand::new_with_binary_path(&guest, &src_vm_path); + src_vm_cmd + .args([ + "--cpus", + 
format!("boot={boot_vcpus},max={max_vcpus}").as_str(), + ]) + .args(memory_param) + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .default_disks() + .args(["--net", net_params.as_str()]) + .args(["--api-socket", &src_api_socket]) + .args([ + "--pmem", + format!( + "file={},discard_writes=on", + pmem_temp_file.as_path().to_str().unwrap(), + ) + .as_str(), + ]) + .capture_output(); + let mut src_child = src_vm_cmd.spawn().unwrap(); + + // Start the destination VM + let mut dest_api_socket = temp_api_path(&guest.tmp_dir); + dest_api_socket.push_str(".dest"); + let mut dest_child = GuestCommand::new(&guest) + .args(["--api-socket", &dest_api_socket]) + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + // Ensure the source VM is running normally + assert_eq!(guest.get_cpu_count().unwrap_or_default(), boot_vcpus); + assert!(guest.get_total_memory().unwrap_or_default() > 1_400_000); + guest.check_devices_common(None, Some(&console_text), Some(&pmem_path)); + + // On x86_64 architecture, remove and re-add the virtio-net device + #[cfg(target_arch = "x86_64")] + { + assert!(remote_command( + &src_api_socket, + "remove-device", + Some(net_id), + )); + assert!(wait_until(Duration::from_secs(10), || { + guest.wait_for_ssh(Duration::from_secs(1)).is_err() + })); + // Re-add the virtio-net device + assert!(remote_command( + &src_api_socket, + "add-net", + Some(net_params.as_str()), + )); + guest.wait_for_ssh(Duration::from_secs(10)).unwrap(); + } + // Start TCP live migration + assert!( + start_live_migration_tcp(&src_api_socket, &dest_api_socket, connections), + "Unsuccessful command: 'send-migration' or 'receive-migration'." 
+ ); + }); + + // Check and report any errors that occurred during live migration + if r.is_err() { + print_and_panic( + src_child, + dest_child, + None, + "Error occurred during live-migration", + ); + } + + // Check the source vm has been terminated successful (give it '3s' to settle) + thread::sleep(std::time::Duration::new(3, 0)); + if !src_child.try_wait().unwrap().is_some_and(|s| s.success()) { + print_and_panic( + src_child, + dest_child, + None, + "Source VM was not terminated successfully.", + ); + } + + // After live migration, ensure the destination VM is running normally + let r = std::panic::catch_unwind(|| { + // Perform the same checks to ensure the VM has migrated correctly + assert_eq!(guest.get_cpu_count().unwrap_or_default(), boot_vcpus); + assert!(guest.get_total_memory().unwrap_or_default() > 1_400_000); + guest.check_devices_common(None, Some(&console_text), Some(&pmem_path)); + }); + + // Clean up the destination VM and ensure it terminates properly + let _ = dest_child.kill(); + let dest_output = dest_child.wait_with_output().unwrap(); + handle_child_output(r, &dest_output); + + // Check if the expected `console_text` is present in the destination VM's output + let r = std::panic::catch_unwind(|| { + assert!(String::from_utf8_lossy(&dest_output.stdout).contains(&console_text)); + }); + handle_child_output(r, &dest_output); + } + + fn _test_live_migration_tcp_timeout(timeout_strategy: TimeoutStrategy) { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)); + let kernel_path = direct_kernel_boot_path(); + let net_id = "net1337"; + let net_params = format!( + "id={},tap=,mac={},ip={},mask=255.255.255.128", + net_id, guest.network.guest_mac0, guest.network.host_ip0 + ); + let memory_param: &[&str] = &["--memory", "size=1500M,shared=on"]; + let boot_vcpus = 2; + + let src_vm_path = clh_command("cloud-hypervisor"); + let src_api_socket = temp_api_path(&guest.tmp_dir); + let mut 
src_vm_cmd = GuestCommand::new_with_binary_path(&guest, &src_vm_path); + src_vm_cmd + .args(["--cpus", format!("boot={boot_vcpus}").as_str()]) + .args(memory_param) + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .default_disks() + .args(["--net", net_params.as_str()]) + .args(["--api-socket", &src_api_socket]) + .capture_output(); + let mut src_child = src_vm_cmd.spawn().unwrap(); + + let mut dest_api_socket = temp_api_path(&guest.tmp_dir); + dest_api_socket.push_str(".dest"); + let mut dest_child = GuestCommand::new(&guest) + .args(["--api-socket", &dest_api_socket]) + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { + guest.wait_vm_boot().unwrap(); + + assert_eq!(guest.get_cpu_count().unwrap_or_default(), boot_vcpus); + + // Start a memory stressor in the background to keep pages dirty, + // ensuring the precopy loop cannot converge within the 1s timeout. + guest + .ssh_command("nohup stress --vm 2 --vm-bytes 220M --vm-keep &>/dev/null &") + .unwrap(); + // Give stress a moment to actually start dirtying memory + thread::sleep(Duration::from_secs(3)); + + let migration_port = get_available_port(); + let host_ip = "127.0.0.1"; + + let mut receive_migration = Command::new(clh_command("ch-remote")) + .args([ + &format!("--api-socket={dest_api_socket}"), + "receive-migration", + &format!("tcp:0.0.0.0:{migration_port}"), + ]) + .stdin(Stdio::null()) + .stderr(Stdio::piped()) + .stdout(Stdio::piped()) + .spawn() + .unwrap(); + + thread::sleep(Duration::from_secs(1)); + + // Use a tight downtime budget (1ms) combined with a 1s timeout so the + // migration practically cannot converge regardless of strategy. 
+ let mut send_migration = Command::new(clh_command("ch-remote")) + .args([ + &format!("--api-socket={src_api_socket}"), + "send-migration", + &format!( + "destination_url=tcp:{host_ip}:{migration_port},downtime_ms=1,timeout_s=1,timeout_strategy={timeout_strategy:?}" + ), + ]) + .stdin(Stdio::null()) + .stderr(Stdio::piped()) + .stdout(Stdio::piped()) + .spawn() + .unwrap(); + + let send_status = send_migration + .wait_timeout(Duration::from_secs(60)) + .unwrap(); + let receive_status = receive_migration + .wait_timeout(Duration::from_secs(60)) + .unwrap(); + + // Clean up receive-migration regardless of its outcome + if receive_status.is_none() { + let _ = receive_migration.kill(); + } + + // Kill the stressor now that migration has completed or aborted, + // to reduce system load during post-migration checks. + let _ = guest.ssh_command("pkill -f 'stress --vm'"); + + match timeout_strategy { + TimeoutStrategy::Cancel => { + // With cancel strategy the send must fail and the source VM + // must keep running. 
+ let send_failed = match send_status { + Some(status) => !status.success(), + None => { + let _ = send_migration.kill(); + false + } + }; + assert!( + send_failed, + "send-migration should have failed due to 1s timeout with cancel strategy" + ); + + thread::sleep(Duration::from_secs(2)); + assert!( + src_child.try_wait().unwrap().is_none(), + "Source VM should still be running after a cancelled migration" + ); - if !send_success { - let _ = send_migration.kill(); - let output = send_migration.wait_with_output().unwrap(); - eprintln!( - "\n\n==== Start 'send_migration' output ====\n\n---stdout---\n{}\n\n---stderr---\n{}\n\n==== End 'send_migration' output ====\n\n", - String::from_utf8_lossy(&output.stdout), - String::from_utf8_lossy(&output.stderr) - ); - } + // Confirm the source VM is still responsive over SSH + assert_eq!(guest.get_cpu_count().unwrap_or_default(), boot_vcpus); + } + TimeoutStrategy::Ignore => { + // With Ignore strategy the send must succeed despite the timeout + // being reached, and the source VM must have terminated. 
+ let send_succeeded = match send_status { + Some(status) => status.success(), + None => { + let _ = send_migration.kill(); + false + } + }; + assert!( + send_succeeded, + "send-migration should have succeeded with timeout_strategy=ignore" + ); - // Check if the 'receive-migration' command executed successfully - let receive_success = if let Some(status) = receive_migration - .wait_timeout(Duration::from_secs(60)) - .unwrap() - { - status.success() - } else { - false - }; + thread::sleep(Duration::from_secs(3)); + assert!( + src_child.try_wait().unwrap().is_some(), + "Source VM should have terminated after a forced migration" + ); - if !receive_success { - let _ = receive_migration.kill(); - let output = receive_migration.wait_with_output().unwrap(); - eprintln!( - "\n\n==== Start 'receive_migration' output ====\n\n---stdout---\n{}\n\n---stderr---\n{}\n\n==== End 'receive_migration' output ====\n\n", - String::from_utf8_lossy(&output.stdout), - String::from_utf8_lossy(&output.stderr) - ); - } + // Confirm the VM is still responsive over SSH on the new host + assert_eq!(guest.get_cpu_count().unwrap_or_default(), boot_vcpus); + } + } + })); - send_success && receive_success + let _ = src_child.kill(); + let src_output = src_child.wait_with_output().unwrap(); + let _ = dest_child.kill(); + let _dest_output = dest_child.wait_with_output().unwrap(); + + handle_child_output(r, &src_output); } - fn _test_live_migration_tcp() { + fn _test_live_migration_virtio_fs(local: bool) { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); let guest = Guest::new(Box::new(disk_config)); let kernel_path = direct_kernel_boot_path(); - let console_text = String::from("On a branch floating down river a cricket, singing."); - let net_id = "net123"; - let net_params = format!( - "id={},tap=,mac={},ip={},mask=255.255.255.128", - net_id, guest.network.guest_mac0, guest.network.host_ip0 - ); - let memory_param: &[&str] = &["--memory", "size=4G,shared=on"]; - let boot_vcpus = 
2; - let max_vcpus = 4; - let pmem_temp_file = TempFile::new().unwrap(); - pmem_temp_file.as_file().set_len(128 << 20).unwrap(); - std::process::Command::new("mkfs.ext4") - .arg(pmem_temp_file.as_path()) - .output() - .expect("Expect creating disk image to succeed"); - let pmem_path = String::from("/dev/pmem0"); - // Start the source VM - let src_vm_path = clh_command("cloud-hypervisor"); + let mut workload_path = dirs::home_dir().unwrap(); + workload_path.push("workloads"); + let mut shared_dir = workload_path; + shared_dir.push("shared_dir"); + + let (daemon_child, virtiofsd_socket_path) = + prepare_virtiofsd(&guest.tmp_dir, shared_dir.to_str().unwrap()); + let src_api_socket = temp_api_path(&guest.tmp_dir); - let mut src_vm_cmd = GuestCommand::new_with_binary_path(&guest, &src_vm_path); - src_vm_cmd - .args([ - "--cpus", - format!("boot={boot_vcpus},max={max_vcpus}").as_str(), - ]) - .args(memory_param) + + // Start the source VM + let mut src_child = GuestCommand::new(&guest) + .args(["--api-socket", &src_api_socket]) + .args(["--cpus", "boot=2"]) + .args(["--memory", "size=512M,shared=on"]) .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .default_disks() - .args(["--net", net_params.as_str()]) - .args(["--api-socket", &src_api_socket]) + .default_net() .args([ - "--pmem", - format!( - "file={},discard_writes=on", - pmem_temp_file.as_path().to_str().unwrap(), - ) - .as_str(), + "--fs", + format!("socket={virtiofsd_socket_path},tag=myfs,num_queues=1,queue_size=1024") + .as_str(), ]) - .capture_output(); - let mut src_child = src_vm_cmd.spawn().unwrap(); + .capture_output() + .spawn() + .unwrap(); // Start the destination VM let mut dest_api_socket = temp_api_path(&guest.tmp_dir); @@ -13503,101 +10763,207 @@ mod live_migration { .spawn() .unwrap(); + // Spawn a thread that waits for the old virtiofsd to exit then + // starts a replacement. 
During migration the source saves + // DEVICE_STATE then disconnects, causing virtiofsd to exit. + // The destination needs a fresh virtiofsd to load DEVICE_STATE. + // We remove the socket file first so the destination cannot + // accidentally connect to the old instance. + let virtiofsd_socket_clone = virtiofsd_socket_path.clone(); + let shared_dir_str = shared_dir.to_str().unwrap().to_string(); + let (restart_tx, restart_rx) = std::sync::mpsc::channel(); + let _monitor = thread::spawn(move || { + let mut child = daemon_child; + let _ = child.wait(); + let mut path = dirs::home_dir().unwrap(); + path.push("workloads"); + path.push("virtiofsd"); + let new_child = Command::new(path) + .args(["--shared-dir", &shared_dir_str]) + .args(["--socket-path", &virtiofsd_socket_clone]) + .args(["--cache", "never"]) + .args(["--tag", "myfs"]) + .spawn() + .unwrap(); + wait_for_virtiofsd_socket(&virtiofsd_socket_clone); + let _ = restart_tx.send(new_child); + }); + let r = std::panic::catch_unwind(|| { guest.wait_vm_boot().unwrap(); - // Ensure the source VM is running normally - assert_eq!(guest.get_cpu_count().unwrap_or_default(), boot_vcpus); - assert!(guest.get_total_memory().unwrap_or_default() > 3_840_000); - guest.check_devices_common(None, Some(&console_text), Some(&pmem_path)); - // On x86_64 architecture, remove and re-add the virtio-net device - #[cfg(target_arch = "x86_64")] - { - assert!(remote_command( - &src_api_socket, - "remove-device", - Some(net_id), - )); - thread::sleep(Duration::new(10, 0)); - // Re-add the virtio-net device - assert!(remote_command( - &src_api_socket, - "add-net", - Some(net_params.as_str()), - )); - thread::sleep(Duration::new(10, 0)); - } - // Start TCP live migration + // Mount virtiofs and verify it works + guest + .ssh_command("mkdir -p mount_dir && sudo mount -t virtiofs myfs mount_dir/") + .unwrap(); + + // Write a test file through virtiofs before migration + guest + .ssh_command( + "sudo bash -c 'echo pre_migration_data > 
mount_dir/migration_test_file'", + ) + .unwrap(); + + // Verify the file is accessible + assert_eq!( + guest + .ssh_command("cat mount_dir/migration_test_file") + .unwrap() + .trim(), + "pre_migration_data" + ); + + let migration_socket = String::from( + guest + .tmp_dir + .as_path() + .join("live-migration.sock") + .to_str() + .unwrap(), + ); + + // Remove the socket so the destination cannot connect to + // the old virtiofsd (which is still running). The source's + // existing connection uses an already-accepted fd. + let _ = std::fs::remove_file(&virtiofsd_socket_path); + assert!( - start_live_migration_tcp(&src_api_socket, &dest_api_socket), + start_live_migration(&migration_socket, &src_api_socket, &dest_api_socket, local), "Unsuccessful command: 'send-migration' or 'receive-migration'." ); }); - // Check and report any errors that occurred during live migration + // Check and report any errors occurred during the live-migration if r.is_err() { print_and_panic( src_child, dest_child, None, - "Error occurred during live-migration", + "Error occurred during live-migration with virtio-fs", ); } - // Check the source vm has been terminated successful (give it '3s' to settle) - thread::sleep(std::time::Duration::new(3, 0)); + // Check the source vm has been terminated successfully (give it '3s' to settle) + thread::sleep(Duration::from_secs(3)); if !src_child.try_wait().unwrap().is_some_and(|s| s.success()) { print_and_panic( src_child, dest_child, None, - "Source VM was not terminated successfully.", + "source VM was not terminated successfully.", ); } - // After live migration, ensure the destination VM is running normally + // Post live-migration checks let r = std::panic::catch_unwind(|| { - // Perform the same checks to ensure the VM has migrated correctly - assert_eq!(guest.get_cpu_count().unwrap_or_default(), boot_vcpus); - assert!(guest.get_total_memory().unwrap_or_default() > 3_840_000); - guest.check_devices_common(None, Some(&console_text), 
Some(&pmem_path)); + // Verify virtiofs still works after migration + // Read the file written before migration + assert_eq!( + guest + .ssh_command("cat mount_dir/migration_test_file") + .unwrap() + .trim(), + "pre_migration_data" + ); + + // Write a new file after migration + guest + .ssh_command( + "sudo bash -c 'echo post_migration_data > mount_dir/post_migration_file'", + ) + .unwrap(); + + // Verify the new file exists on the host + let post_content = + std::fs::read_to_string(shared_dir.join("post_migration_file")).unwrap(); + assert_eq!(post_content.trim(), "post_migration_data"); }); - // Clean up the destination VM and ensure it terminates properly + // Clean up let _ = dest_child.kill(); let dest_output = dest_child.wait_with_output().unwrap(); - handle_child_output(r, &dest_output); + if let Ok(mut new_daemon) = restart_rx.try_recv() { + let _ = new_daemon.kill(); + let _ = new_daemon.wait(); + } + let _ = std::fs::remove_file(shared_dir.join("migration_test_file")); + let _ = std::fs::remove_file(shared_dir.join("post_migration_file")); - // Check if the expected `console_text` is present in the destination VM's output - let r = std::panic::catch_unwind(|| { - assert!(String::from_utf8_lossy(&dest_output.stdout).contains(&console_text)); - }); handle_child_output(r, &dest_output); } mod live_migration_parallel { + use vmm::api::TimeoutStrategy; + use super::*; #[test] + #[cfg_attr( + all(feature = "mshv", target_arch = "aarch64"), + ignore = "live migration not yet supported on mshv arm64" + )] fn test_live_migration_basic() { _test_live_migration(false, false); } #[test] + #[cfg_attr( + all(feature = "mshv", target_arch = "aarch64"), + ignore = "live migration not yet supported on mshv arm64" + )] fn test_live_migration_local() { _test_live_migration(false, true); } #[test] + #[cfg_attr( + all(feature = "mshv", target_arch = "aarch64"), + ignore = "live migration not yet supported on mshv arm64" + )] fn test_live_migration_tcp() { - 
_test_live_migration_tcp(); + _test_live_migration_tcp(NonZeroU32::new(1).unwrap()); + } + + #[test] + #[cfg_attr( + all(feature = "mshv", target_arch = "aarch64"), + ignore = "live migration not yet supported on mshv arm64" + )] + fn test_live_migration_tcp_parallel_connections() { + _test_live_migration_tcp(NonZeroU32::new(8).unwrap()); + } + + #[test] + #[cfg_attr( + all(feature = "mshv", target_arch = "aarch64"), + ignore = "live migration not yet supported on mshv arm64" + )] + fn test_live_migration_tcp_timeout_cancel() { + _test_live_migration_tcp_timeout(TimeoutStrategy::Cancel); } #[test] + #[cfg_attr( + all(feature = "mshv", target_arch = "aarch64"), + ignore = "live migration not yet supported on mshv arm64" + )] + fn test_live_migration_tcp_timeout_ignore() { + _test_live_migration_tcp_timeout(TimeoutStrategy::Ignore); + } + + #[test] + #[cfg_attr( + all(feature = "mshv", target_arch = "aarch64"), + ignore = "live migration not yet supported on mshv arm64" + )] fn test_live_migration_watchdog() { _test_live_migration_watchdog(false, false); } #[test] + #[cfg_attr( + all(feature = "mshv", target_arch = "aarch64"), + ignore = "live migration not yet supported on mshv arm64" + )] fn test_live_migration_watchdog_local() { _test_live_migration_watchdog(false, true); } @@ -13635,14 +11001,34 @@ mod live_migration { mod live_migration_sequential { use super::*; - // NUMA & balloon live migration tests are large so run sequentially + // NUMA, balloon, and virtio-fs live migration tests run sequentially + + #[test] + #[cfg(not(feature = "mshv"))] + fn test_live_migration_virtio_fs() { + _test_live_migration_virtio_fs(false); + } + + #[test] + #[cfg(not(feature = "mshv"))] + fn test_live_migration_virtio_fs_local() { + _test_live_migration_virtio_fs(true); + } #[test] + #[cfg_attr( + all(feature = "mshv", target_arch = "aarch64"), + ignore = "live migration not yet supported on mshv arm64" + )] fn test_live_migration_balloon() { 
_test_live_migration_balloon(false, false); } #[test] + #[cfg_attr( + all(feature = "mshv", target_arch = "aarch64"), + ignore = "live migration not yet supported on mshv arm64" + )] fn test_live_migration_balloon_local() { _test_live_migration_balloon(false, true); } @@ -13730,8 +11116,8 @@ mod aarch64_acpi { let guest = Guest::new(disk_config); let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) + .default_cpus() + .default_memory() .args(["--kernel", edk2_path().to_str().unwrap()]) .default_disks() .default_net() @@ -13777,7 +11163,11 @@ mod aarch64_acpi { #[test] fn test_power_button_acpi() { - _test_power_button(true); + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = GuestFactory::new_regular_guest_factory() + .create_guest(Box::new(disk_config)) + .with_kernel_path(edk2_path().to_str().unwrap()); + _test_power_button(&guest); } #[test] @@ -13789,6 +11179,10 @@ mod aarch64_acpi { mod rate_limiter { use super::*; + const NET_RATE_LIMITER_RUNTIME: u32 = 20; + const BLOCK_RATE_LIMITER_RUNTIME: u32 = 20; + const BLOCK_RATE_LIMITER_RAMP_TIME: u32 = 5; + // Check if the 'measured' rate is within the expected 'difference' (in percentage) // compared to given 'limit' rate. 
fn check_rate_limit(measured: f64, limit: f64, difference: f64) -> bool { @@ -13812,15 +11206,14 @@ mod rate_limiter { let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); let guest = Guest::new(Box::new(disk_config)); - let test_timeout = 10; let num_queues = 2; let queue_size = 256; - let bw_size = 10485760_u64; // bytes - let bw_refill_time = 100; // ms + let bw_size = 104857600_u64; // bytes + let bw_refill_time = 1000; // ms let limit_bps = (bw_size * 8 * 1000) as f64 / bw_refill_time as f64; let net_params = format!( - "tap=,mac={},ip={},mask=255.255.255.128,num_queues={},queue_size={},bw_size={},bw_refill_time={}", + "tap=,mac={},ip={},mask=255.255.255.128,num_queues={},queue_size={},bw_size={},bw_one_time_burst=0,bw_refill_time={}", guest.network.guest_mac0, guest.network.host_ip0, num_queues, @@ -13831,7 +11224,7 @@ mod rate_limiter { let mut child = GuestCommand::new(&guest) .args(["--cpus", &format!("boot={}", num_queues / 2)]) - .args(["--memory", "size=4G"]) + .args(["--memory", "size=1G"]) .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .default_disks() @@ -13842,9 +11235,14 @@ mod rate_limiter { let r = std::panic::catch_unwind(|| { guest.wait_vm_boot().unwrap(); - let measured_bps = - measure_virtio_net_throughput(test_timeout, num_queues / 2, &guest, rx, true) - .unwrap(); + let measured_bps = measure_virtio_net_throughput( + NET_RATE_LIMITER_RUNTIME, + num_queues / 2, + &guest, + rx, + true, + ) + .unwrap(); assert!(check_rate_limit(measured_bps, limit_bps, 0.1)); }); @@ -13864,15 +11262,14 @@ mod rate_limiter { } fn _test_rate_limiter_block(bandwidth: bool, num_queues: u32) { - let test_timeout = 10; let fio_ops = FioOps::RandRW; let bw_size = if bandwidth { - 10485760_u64 // bytes + 104857600_u64 // bytes } else { - 100_u64 // I/O + 1000_u64 // I/O }; - let bw_refill_time = 100; // ms + let bw_refill_time = 1000; // ms let limit_rate = (bw_size * 1000) as f64 / 
bw_refill_time as f64; let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); @@ -13893,17 +11290,17 @@ mod rate_limiter { let test_blk_params = if bandwidth { format!( - "path={blk_rate_limiter_test_img},num_queues={num_queues},bw_size={bw_size},bw_refill_time={bw_refill_time}" + "path={blk_rate_limiter_test_img},num_queues={num_queues},bw_size={bw_size},bw_one_time_burst=0,bw_refill_time={bw_refill_time},image_type=raw" ) } else { format!( - "path={blk_rate_limiter_test_img},num_queues={num_queues},ops_size={bw_size},ops_refill_time={bw_refill_time}" + "path={blk_rate_limiter_test_img},num_queues={num_queues},ops_size={bw_size},ops_one_time_burst=0,ops_refill_time={bw_refill_time},image_type=raw" ) }; let mut child = GuestCommand::new(&guest) .args(["--cpus", &format!("boot={num_queues}")]) - .args(["--memory", "size=4G"]) + .args(["--memory", "size=1G"]) .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .args([ @@ -13932,7 +11329,8 @@ mod rate_limiter { let fio_command = format!( "sudo fio --filename=/dev/vdc --name=test --output-format=json \ --direct=1 --bs=4k --ioengine=io_uring --iodepth=64 \ - --rw={fio_ops} --runtime={test_timeout} --numjobs={num_queues}" + --rw={fio_ops} --runtime={BLOCK_RATE_LIMITER_RUNTIME} \ + --ramp_time={BLOCK_RATE_LIMITER_RAMP_TIME} --numjobs={num_queues}", ); let output = guest.ssh_command(&fio_command).unwrap(); @@ -13951,15 +11349,14 @@ mod rate_limiter { } fn _test_rate_limiter_group_block(bandwidth: bool, num_queues: u32, num_disks: u32) { - let test_timeout = 10; let fio_ops = FioOps::RandRW; let bw_size = if bandwidth { - 10485760_u64 // bytes + 104857600_u64 // bytes } else { - 100_u64 // I/O + 1000_u64 // I/O }; - let bw_refill_time = 100; // ms + let bw_refill_time = 1000; // ms let limit_rate = (bw_size * 1000) as f64 / bw_refill_time as f64; let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); @@ -13968,9 +11365,13 @@ mod 
rate_limiter { let test_img_dir = TempDir::new_with_prefix("/var/tmp/ch").unwrap(); let rate_limit_group_arg = if bandwidth { - format!("id=group0,bw_size={bw_size},bw_refill_time={bw_refill_time}") + format!( + "id=group0,bw_size={bw_size},bw_one_time_burst=0,bw_refill_time={bw_refill_time}" + ) } else { - format!("id=group0,ops_size={bw_size},ops_refill_time={bw_refill_time}") + format!( + "id=group0,ops_size={bw_size},ops_one_time_burst=0,ops_refill_time={bw_refill_time}" + ) }; let mut disk_args = vec![ @@ -14003,13 +11404,13 @@ mod rate_limiter { ); disk_args.push(format!( - "path={test_img_path},num_queues={num_queues},rate_limit_group=group0" + "path={test_img_path},num_queues={num_queues},rate_limit_group=group0,image_type=raw" )); } let mut child = GuestCommand::new(&guest) .args(["--cpus", &format!("boot={}", num_queues * num_disks)]) - .args(["--memory", "size=4G"]) + .args(["--memory", "size=1G"]) .args(["--kernel", direct_kernel_boot_path().to_str().unwrap()]) .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) .args(["--rate-limit-group", &rate_limit_group_arg]) @@ -14026,7 +11427,8 @@ mod rate_limiter { let mut fio_command = format!( "sudo fio --name=global --output-format=json \ --direct=1 --bs=4k --ioengine=io_uring --iodepth=64 \ - --rw={fio_ops} --runtime={test_timeout} --numjobs={num_queues}" + --rw={fio_ops} --runtime={BLOCK_RATE_LIMITER_RUNTIME} \ + --ramp_time={BLOCK_RATE_LIMITER_RAMP_TIME} --numjobs={num_queues}", ); // Generate additional argument for each disk: @@ -14105,7 +11507,7 @@ mod fw_cfg { std::fs::write(&test_file, "test-file-content").unwrap(); cmd.args(["--cpus", "boot=4"]) - .args(["--memory", "size=512M"]) + .default_memory() .args(["--kernel", kernel_path.to_str().unwrap()]) .args(["--cmdline", cmd_line]) .default_disks() @@ -14138,19 +11540,45 @@ mod fw_cfg { handle_child_output(r, &output); } -} -#[cfg(all(feature = "sev_snp", target_arch = "x86_64"))] -mod common_cvm { - use vm_memory::GuestAddress; - - use crate::*; #[test] 
- fn test_focal_simple_launch() { - let disk_config = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); - let mut guest = Guest::new(Box::new(disk_config)); - guest.vm_type = GuestVmType::Confidential; - guest.boot_timeout = DEFAULT_CVM_TCP_LISTENER_TIMEOUT; - _test_simple_launch(&guest) + #[cfg_attr(feature = "mshv", ignore = "See #7434")] + fn test_fw_cfg_string() { + let disk_config = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(disk_config)); + let mut cmd = GuestCommand::new(&guest); + + let kernel_path = direct_kernel_boot_path(); + let cmd_line = DIRECT_KERNEL_BOOT_CMDLINE; + + cmd.args(["--cpus", "boot=4"]) + .default_memory() + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args(["--cmdline", cmd_line]) + .default_disks() + .default_net() + .args([ + "--fw-cfg-config", + "initramfs=off,items=[name=opt/org.test/test-string,string=hello-from-vmm]", + ]) + .capture_output(); + + let mut child = cmd.spawn().unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot().unwrap(); + thread::sleep(std::time::Duration::new(3, 0)); + let result = guest + .ssh_command( + "sudo cat /sys/firmware/qemu_fw_cfg/by_name/opt/org.test/test-string/raw", + ) + .unwrap(); + assert_eq!(result, "hello-from-vmm"); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); } } diff --git a/cloud-hypervisor/tests/integration_cvm.rs b/cloud-hypervisor/tests/integration_cvm.rs new file mode 100644 index 0000000000..039f8840ff --- /dev/null +++ b/cloud-hypervisor/tests/integration_cvm.rs @@ -0,0 +1,327 @@ +// Copyright 2026 The Cloud Hypervisor Authors. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 +// +#![cfg(any(devcli_testenv, clippy))] +#![allow(clippy::undocumented_unsafe_blocks)] +// When enabling the `mshv` feature, we skip quite some tests and +// hence have known dead-code. 
This annotation silences dead-code +// related warnings for our quality workflow to pass. +#![allow(dead_code)] +mod common; + +#[cfg(all(feature = "sev_snp", target_arch = "x86_64"))] +mod common_cvm { + use block::ImageType; + use common::tests_wrappers::*; + use common::utils::*; + use test_infra::*; + const NUM_PCI_SEGMENTS: u16 = 8; + + use super::*; + macro_rules! basic_cvm_guest { + ($image_name:expr) => {{ + let disk_config = UbuntuDiskConfig::new($image_name.to_string()); + GuestFactory::new_confidential_guest_factory().create_guest(Box::new(disk_config)) + }}; + } + + #[test] + fn test_focal_simple_launch() { + let guest = basic_cvm_guest!(FOCAL_IMAGE_NAME); + + _test_simple_launch(&guest); + } + + #[test] + fn test_api_http_create_boot() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME).with_cpu(4); + let target_api = TargetApi::new_http_api(&guest.tmp_dir); + _test_api_create_boot(&target_api, &guest); + } + + #[test] + fn test_api_http_shutdown() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME).with_cpu(4); + + let target_api = TargetApi::new_http_api(&guest.tmp_dir); + _test_api_shutdown(&target_api, &guest); + } + + #[test] + fn test_api_http_delete() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); + let target_api = TargetApi::new_http_api(&guest.tmp_dir); + _test_api_delete(&target_api, &guest); + } + + #[test] + fn test_power_button() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); + _test_power_button(&guest); + } + + #[test] + fn test_virtio_vsock() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); + _test_virtio_vsock(&guest, false); + } + + #[test] + fn test_multi_cpu() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); + _test_multi_cpu(&guest); + } + + #[test] + fn test_cpu_affinity() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME).with_cpu(2); + _test_cpu_affinity(&guest); + } + + #[test] + fn test_virtio_queue_affinity() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME).with_cpu(4); + _test_virtio_queue_affinity(&guest); 
+ } + + #[test] + fn test_pci_msi() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); + _test_pci_msi(&guest); + } + + #[test] + fn test_virtio_net_ctrl_queue() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); + _test_virtio_net_ctrl_queue(&guest); + } + + #[test] + fn test_pci_multiple_segments() { + // Use 8 segments to test the multiple segment support since it's more than the default 6 + // supported by Linux + // IGVM file used by Sev-Snp Guest now support up to 8 segments, so we can use 8 segments for testing. + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); + _test_pci_multiple_segments(&guest, NUM_PCI_SEGMENTS, 5); + } + + #[test] + fn test_direct_kernel_boot() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); + _test_direct_kernel_boot(&guest); + } + + #[test] + fn test_virtio_block_io_uring() { + let guest = make_virtio_block_guest( + &GuestFactory::new_confidential_guest_factory(), + FOCAL_IMAGE_NAME, + ); + _test_virtio_block(&guest, false, true, false, false, ImageType::Raw); + } + + #[test] + fn test_virtio_block_aio() { + let guest = make_virtio_block_guest( + &GuestFactory::new_confidential_guest_factory(), + FOCAL_IMAGE_NAME, + ); + _test_virtio_block(&guest, true, false, false, false, ImageType::Raw); + } + + #[test] + fn test_virtio_block_sync() { + let guest = make_virtio_block_guest( + &GuestFactory::new_confidential_guest_factory(), + FOCAL_IMAGE_NAME, + ); + _test_virtio_block(&guest, true, true, false, false, ImageType::Raw); + } + + #[test] + fn test_virtio_block_qcow2() { + let guest = make_virtio_block_guest( + &GuestFactory::new_confidential_guest_factory(), + JAMMY_IMAGE_NAME_QCOW2, + ); + _test_virtio_block(&guest, false, false, true, false, ImageType::Qcow2); + } + + #[test] + fn test_virtio_block_qcow2_zlib() { + let guest = make_virtio_block_guest( + &GuestFactory::new_confidential_guest_factory(), + JAMMY_IMAGE_NAME_QCOW2_ZLIB, + ); + _test_virtio_block(&guest, false, false, true, false, ImageType::Qcow2); + } + + #[test] + 
fn test_virtio_block_qcow2_zstd() { + let guest = make_virtio_block_guest( + &GuestFactory::new_confidential_guest_factory(), + JAMMY_IMAGE_NAME_QCOW2_ZSTD, + ); + _test_virtio_block(&guest, false, false, true, false, ImageType::Qcow2); + } + + #[test] + fn test_virtio_block_qcow2_backing_zstd_file() { + let guest = make_virtio_block_guest( + &GuestFactory::new_confidential_guest_factory(), + JAMMY_IMAGE_NAME_QCOW2_BACKING_ZSTD_FILE, + ); + + _test_virtio_block(&guest, false, false, true, true, ImageType::Qcow2); + } + + #[test] + fn test_virtio_block_qcow2_backing_uncompressed_file() { + let guest = make_virtio_block_guest( + &GuestFactory::new_confidential_guest_factory(), + JAMMY_IMAGE_NAME_QCOW2_BACKING_UNCOMPRESSED_FILE, + ); + + _test_virtio_block(&guest, false, false, true, true, ImageType::Qcow2); + } + + #[test] + fn test_virtio_block_qcow2_backing_raw_file() { + let guest = make_virtio_block_guest( + &GuestFactory::new_confidential_guest_factory(), + JAMMY_IMAGE_NAME_QCOW2_BACKING_RAW_FILE, + ); + _test_virtio_block(&guest, false, false, true, true, ImageType::Qcow2); + } + + #[test] + fn test_virtio_block_dynamic_vhdx_expand() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); + _test_virtio_block_dynamic_vhdx_expand(&guest); + } + + #[test] + fn test_split_irqchip() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); + _test_split_irqchip(&guest); + } + + #[test] + fn test_dmi_uuid() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); + _test_dmi_uuid(&guest); + } + + #[test] + fn test_dmi_oem_strings() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); + _test_dmi_oem_strings(&guest); + } + + #[test] + fn test_multiple_network_interfaces() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); + _test_multiple_network_interfaces(&guest); + } + + #[test] + fn test_serial_off() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); + _test_serial_off(&guest); + } + + #[test] + fn test_virtio_console() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); + 
_test_virtio_console(&guest); + } + + #[test] + fn test_console_file() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); + _test_console_file(&guest); + } + + #[test] + fn test_direct_kernel_boot_noacpi() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); + _test_direct_kernel_boot_noacpi(&guest); + } + + #[test] + fn test_pci_bar_reprogramming() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); + _test_pci_bar_reprogramming(&guest); + } + + #[test] + fn test_memory_overhead() { + let guest_memory_size_kb: u32 = 512 * 1024; + let guest = + basic_cvm_guest!(JAMMY_IMAGE_NAME).with_memory(&format!("{guest_memory_size_kb}K")); + _test_memory_overhead(&guest, guest_memory_size_kb); + } + + #[test] + fn test_landlock() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); + _test_landlock(&guest); + } + + #[test] + fn test_disk_hotplug() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); + _test_disk_hotplug(&guest, false); + } + + #[test] + fn test_net_hotplug() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); + _test_net_hotplug(&guest, NUM_PCI_SEGMENTS, None); + } + + #[test] + fn test_counters() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); + _test_counters(&guest); + } + + #[test] + fn test_watchdog() { + let guest = basic_cvm_guest!(FOCAL_IMAGE_NAME); + _test_watchdog(&guest); + } + + #[test] + fn test_pvpanic() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME); + _test_pvpanic(&guest); + } + + #[test] + fn test_tap_from_fd() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME).with_cpu(2); + _test_tap_from_fd(&guest); + } + + #[test] + fn test_macvtap() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME).with_cpu(2); + _test_macvtap(&guest, false, "guestmacvtap0", "hostmacvtap0"); + } + + #[test] + fn test_macvtap_hotplug() { + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME).with_cpu(2); + _test_macvtap(&guest, true, "guestmacvtap1", "hostmacvtap1"); + } + + #[test] + fn test_vdpa_block() { + assert!(exec_host_command_status("lsmod | grep 
vdpa_sim_blk").success()); + + let guest = basic_cvm_guest!(JAMMY_IMAGE_NAME).with_cpu(2); + _test_vdpa_block(&guest); + } +} diff --git a/devices/Cargo.toml b/devices/Cargo.toml index 0414f13d2a..afdc3403bf 100644 --- a/devices/Cargo.toml +++ b/devices/Cargo.toml @@ -2,6 +2,7 @@ authors = ["The Chromium OS Authors"] edition.workspace = true name = "devices" +rust-version.workspace = true version = "0.1.0" [dependencies] @@ -20,7 +21,7 @@ linux-loader = { workspace = true, features = [ "pe", ], optional = true } log = { workspace = true } -num_enum = "0.7.5" +num_enum = "0.7.6" pci = { path = "../pci" } serde = { workspace = true, features = ["derive"] } thiserror = { workspace = true } @@ -34,7 +35,7 @@ vm-memory = { workspace = true, features = [ ] } vm-migration = { path = "../vm-migration" } vmm-sys-util = { workspace = true } -zerocopy = { version = "0.8.39", features = [ +zerocopy = { version = "0.8.48", features = [ "alloc", "derive", ], optional = true } diff --git a/devices/src/acpi.rs b/devices/src/acpi.rs index 69bcb80d76..a9c86aa18e 100644 --- a/devices/src/acpi.rs +++ b/devices/src/acpi.rs @@ -21,7 +21,7 @@ pub const GED_DEVICE_ACPI_SIZE: usize = 0x1; /// A device for handling ACPI shutdown and reboot pub struct AcpiShutdownDevice { - exit_evt: EventFd, + guest_exit_evt: EventFd, reset_evt: EventFd, vcpus_kill_signalled: Arc, } @@ -29,12 +29,12 @@ pub struct AcpiShutdownDevice { impl AcpiShutdownDevice { /// Constructs a device that will signal the given event when the guest requests it. 
pub fn new( - exit_evt: EventFd, + guest_exit_evt: EventFd, reset_evt: EventFd, vcpus_kill_signalled: Arc, ) -> AcpiShutdownDevice { AcpiShutdownDevice { - exit_evt, + guest_exit_evt, reset_evt, vcpus_kill_signalled, } @@ -68,7 +68,7 @@ impl BusDevice for AcpiShutdownDevice { const SLEEP_VALUE_BIT: u8 = 2; if data[0] == (S5_SLEEP_VALUE << SLEEP_VALUE_BIT) | (1 << SLEEP_STATUS_EN_BIT) { info!("ACPI Shutdown signalled"); - if let Err(e) = self.exit_evt.write(1) { + if let Err(e) = self.guest_exit_evt.write(1) { error!("Error triggering ACPI shutdown event: {e}"); } // Spin until we are sure the reset_evt has been handled and that when diff --git a/devices/src/ivshmem.rs b/devices/src/ivshmem.rs index fff48a72c6..932e0d9eba 100644 --- a/devices/src/ivshmem.rs +++ b/devices/src/ivshmem.rs @@ -217,7 +217,7 @@ impl BusDevice for IvshmemDevice { impl PciDevice for IvshmemDevice { fn allocate_bars( &mut self, - _allocator: &Arc>, + _allocator: &mut SystemAllocator, mmio32_allocator: &mut AddressAllocator, mmio64_allocator: &mut AddressAllocator, resources: Option>, @@ -382,6 +382,10 @@ impl PciDevice for IvshmemDevice { Ok(()) } + fn restore_bar_addr(&mut self, params: &BarReprogrammingParams) { + self.configuration.restore_bar_addr(params); + } + fn as_any_mut(&mut self) -> &mut dyn Any { self } diff --git a/devices/src/legacy/fw_cfg.rs b/devices/src/legacy/fw_cfg.rs index f33179d831..876ceaaf3c 100644 --- a/devices/src/legacy/fw_cfg.rs +++ b/devices/src/legacy/fw_cfg.rs @@ -649,6 +649,18 @@ impl FwCfg { let kernel_start = bp.text_offset; #[cfg(target_arch = "x86_64")] let kernel_start = (bp.hdr.setup_sects as usize + 1) * 512; + + #[cfg(target_arch = "x86_64")] + if kernel_start <= buffer.len() { + buffer.truncate(kernel_start); + } else { + buffer.resize(kernel_start, 0); + file.read_exact_at( + &mut buffer[size_of::()..], + size_of::() as u64, + )?; + } + self.known_items[FW_CFG_SETUP_SIZE as usize] = FwCfgContent::U32(buffer.len() as u32); 
self.known_items[FW_CFG_SETUP_DATA as usize] = FwCfgContent::Bytes(buffer); self.known_items[FW_CFG_KERNEL_SIZE as usize] = @@ -897,6 +909,32 @@ mod unit_tests { } } + #[test] + fn test_string_item() { + let gm = GuestMemoryAtomic::new( + GuestMemoryMmap::from_ranges(&[(GuestAddress(0), RAM_64BIT_START.0 as usize)]).unwrap(), + ); + + let mut fw_cfg = FwCfg::new(gm); + + // Simulate OVMF X-PciMmio64Mb string item for GPU CC passthrough + let item = FwCfgItem { + name: "opt/ovmf/X-PciMmio64Mb".to_owned(), + content: FwCfgContent::Bytes("262144".as_bytes().to_vec()), + }; + fw_cfg.add_item(item).unwrap(); + + let expected = b"262144"; + let mut data = vec![0u8]; + + // Select the first file item (FW_CFG_FILE_FIRST = 0x20) + fw_cfg.write(0, SELECTOR_OFFSET, &[FW_CFG_FILE_FIRST as u8, 0]); + for &byte in expected.iter() { + fw_cfg.read(0, DATA_OFFSET, &mut data); + assert_eq!(data[0], byte); + } + } + #[test] fn test_dma() { let code = [ diff --git a/devices/src/pvmemcontrol.rs b/devices/src/pvmemcontrol.rs index 06e0e24923..171fdf1544 100644 --- a/devices/src/pvmemcontrol.rs +++ b/devices/src/pvmemcontrol.rs @@ -5,7 +5,7 @@ use std::collections::HashMap; use std::ffi::CString; -use std::sync::{Arc, Barrier, Mutex, RwLock}; +use std::sync::{Arc, Barrier, RwLock}; use std::{io, result}; use log::{debug, warn}; @@ -712,6 +712,10 @@ impl PciDevice for PvmemcontrolPciDevice { self.configuration.read_config_register(reg_idx) } + fn restore_bar_addr(&mut self, params: &BarReprogrammingParams) { + self.configuration.restore_bar_addr(params); + } + fn as_any_mut(&mut self) -> &mut dyn std::any::Any { self } @@ -722,7 +726,7 @@ impl PciDevice for PvmemcontrolPciDevice { fn allocate_bars( &mut self, - _allocator: &Arc>, + _allocator: &mut SystemAllocator, mmio32_allocator: &mut AddressAllocator, _mmio64_allocator: &mut AddressAllocator, resources: Option>, diff --git a/devices/src/pvpanic.rs b/devices/src/pvpanic.rs index 0451da1a05..3b9c9d5a80 100644 --- 
a/devices/src/pvpanic.rs +++ b/devices/src/pvpanic.rs @@ -5,7 +5,7 @@ use std::any::Any; use std::result; -use std::sync::{Arc, Barrier, Mutex}; +use std::sync::{Arc, Barrier}; use anyhow::anyhow; use event_monitor::event; @@ -174,7 +174,7 @@ impl PciDevice for PvPanicDevice { fn allocate_bars( &mut self, - _allocator: &Arc>, + _allocator: &mut SystemAllocator, mmio32_allocator: &mut AddressAllocator, _mmio64_allocator: &mut AddressAllocator, resources: Option>, @@ -231,6 +231,10 @@ impl PciDevice for PvPanicDevice { Ok(()) } + fn restore_bar_addr(&mut self, params: &BarReprogrammingParams) { + self.configuration.restore_bar_addr(params); + } + fn read_bar(&mut self, _base: u64, _offset: u64, data: &mut [u8]) { data[0] = self.events; } diff --git a/docs/api.md b/docs/api.md index 8f7a10642d..cea3f31812 100644 --- a/docs/api.md +++ b/docs/api.md @@ -72,36 +72,37 @@ The Cloud Hypervisor API exposes the following actions through its endpoints: ##### Virtual Machine (VM) Actions | Action | Endpoint | Request Body | Response Body | Prerequisites | -| ---------------------------------- | ----------------------- | ------------------------------- | ------------------------ | ------------------------------------------------------ | -| Create the VM | `/vm.create` | `/schemas/VmConfig` | N/A | The VM is not created yet | -| Delete the VM | `/vm.delete` | N/A | N/A | N/A | -| Boot the VM | `/vm.boot` | N/A | N/A | The VM is created but not booted | -| Shut the VM down | `/vm.shutdown` | N/A | N/A | The VM is booted | -| Reboot the VM | `/vm.reboot` | N/A | N/A | The VM is booted | -| Trigger power button of the VM | `/vm.power-button` | N/A | N/A | The VM is booted | -| Pause the VM | `/vm.pause` | N/A | N/A | The VM is booted | -| Resume the VM | `/vm.resume` | N/A | N/A | The VM is paused | -| Take a snapshot of the VM | `/vm.snapshot` | `/schemas/VmSnapshotConfig` | N/A | The VM is paused | -| Perform a coredump of the VM* | `/vm.coredump` | `/schemas/VmCoredumpData` | N/A 
| The VM is paused | -| Restore the VM from a snapshot | `/vm.restore` | `/schemas/RestoreConfig` | N/A | The VM is created but not booted | -| Add/remove CPUs to/from the VM | `/vm.resize` | `/schemas/VmResize` | N/A | The VM is booted | -| Add/remove memory from the VM | `/vm.resize` | `/schemas/VmResize` | N/A | The VM is booted | -| Resize a disk attached to the VM | `/vm.resize-disk` | `/schemas/VmResizeDisk` | N/A | The VM is created | -| Add/remove memory from a zone | `/vm.resize-zone` | `/schemas/VmResizeZone` | N/A | The VM is booted | -| Dump the VM information | `/vm.info` | N/A | `/schemas/VmInfo` | The VM is created | -| Add VFIO PCI device to the VM | `/vm.add-device` | `/schemas/VmAddDevice` | `/schemas/PciDeviceInfo` | The VM is booted | -| Add disk device to the VM | `/vm.add-disk` | `/schemas/DiskConfig` | `/schemas/PciDeviceInfo` | The VM is booted | -| Add fs device to the VM | `/vm.add-fs` | `/schemas/FsConfig` | `/schemas/PciDeviceInfo` | The VM is booted | -| Add pmem device to the VM | `/vm.add-pmem` | `/schemas/PmemConfig` | `/schemas/PciDeviceInfo` | The VM is booted | -| Add network device to the VM | `/vm.add-net` | `/schemas/NetConfig` | `/schemas/PciDeviceInfo` | The VM is booted | -| Add userspace PCI device to the VM | `/vm.add-user-device` | `/schemas/VmAddUserDevice` | `/schemas/PciDeviceInfo` | The VM is booted | -| Add vdpa device to the VM | `/vm.add-vdpa` | `/schemas/VdpaConfig` | `/schemas/PciDeviceInfo` | The VM is booted | -| Add vsock device to the VM | `/vm.add-vsock` | `/schemas/VsockConfig` | `/schemas/PciDeviceInfo` | The VM is booted | -| Remove device from the VM | `/vm.remove-device` | `/schemas/VmRemoveDevice` | N/A | The VM is booted | -| Dump the VM counters | `/vm.counters` | N/A | `/schemas/VmCounters` | The VM is booted | -| Inject an NMI | `/vm.nmi` | N/A | N/A | The VM is booted | -| Prepare to receive a migration | `/vm.receive-migration` | `/schemas/ReceiveMigrationData` | N/A | N/A | -| Start to send 
migration to target | `/vm.send-migration` | `/schemas/SendMigrationData` | N/A | The VM is booted and (shared mem or hugepages enabled) | +| --------------------------------------- | ---------------------------- | --------------------------------- | ------------------------ | ------------------------------------------------------ | +| Create the VM | `/vm.create` | `/schemas/VmConfig` | N/A | The VM is not created yet | +| Delete the VM | `/vm.delete` | N/A | N/A | N/A | +| Boot the VM | `/vm.boot` | N/A | N/A | The VM is created but not booted | +| Shut the VM down | `/vm.shutdown` | N/A | N/A | The VM is booted | +| Reboot the VM | `/vm.reboot` | N/A | N/A | The VM is booted | +| Trigger power button of the VM | `/vm.power-button` | N/A | N/A | The VM is booted | +| Pause the VM | `/vm.pause` | N/A | N/A | The VM is booted | +| Resume the VM | `/vm.resume` | N/A | N/A | The VM is paused | +| Take a snapshot of the VM | `/vm.snapshot` | `/schemas/VmSnapshotConfig` | N/A | The VM is paused | +| Perform a coredump of the VM* | `/vm.coredump` | `/schemas/VmCoredumpData` | N/A | The VM is paused | +| Restore the VM from a snapshot | `/vm.restore` | `/schemas/RestoreConfig` | N/A | The VM is created but not booted | +| Add/remove CPUs to/from the VM | `/vm.resize` | `/schemas/VmResize` | N/A | The VM is booted | +| Add/remove memory from the VM | `/vm.resize` | `/schemas/VmResize` | N/A | The VM is booted | +| Resize a disk attached to the VM | `/vm.resize-disk` | `/schemas/VmResizeDisk` | N/A | The VM is created | +| Add/remove memory from a zone | `/vm.resize-zone` | `/schemas/VmResizeZone` | N/A | The VM is booted | +| Dump the VM information | `/vm.info` | N/A | `/schemas/VmInfo` | The VM is created | +| Add VFIO PCI device to the VM | `/vm.add-device` | `/schemas/VmAddDevice` | `/schemas/PciDeviceInfo` | The VM is booted | +| Add disk device to the VM | `/vm.add-disk` | `/schemas/DiskConfig` | `/schemas/PciDeviceInfo` | The VM is booted | +| Add fs device to the 
VM | `/vm.add-fs` | `/schemas/FsConfig` | `/schemas/PciDeviceInfo` | The VM is booted | +| Add generic vhost-user device to the VM | `/vm.add-generic-vhost-user` | `/schemas/GenericVhostUserConfig` | `/schemas/PciDeviceInfo` | The VM is booted | +| Add pmem device to the VM | `/vm.add-pmem` | `/schemas/PmemConfig` | `/schemas/PciDeviceInfo` | The VM is booted | +| Add network device to the VM | `/vm.add-net` | `/schemas/NetConfig` | `/schemas/PciDeviceInfo` | The VM is booted | +| Add userspace PCI device to the VM | `/vm.add-user-device` | `/schemas/VmAddUserDevice` | `/schemas/PciDeviceInfo` | The VM is booted | +| Add vdpa device to the VM | `/vm.add-vdpa` | `/schemas/VdpaConfig` | `/schemas/PciDeviceInfo` | The VM is booted | +| Add vsock device to the VM | `/vm.add-vsock` | `/schemas/VsockConfig` | `/schemas/PciDeviceInfo` | The VM is booted | +| Remove device from the VM | `/vm.remove-device` | `/schemas/VmRemoveDevice` | N/A | The VM is booted | +| Dump the VM counters | `/vm.counters` | N/A | `/schemas/VmCounters` | The VM is booted | +| Inject an NMI | `/vm.nmi` | N/A | N/A | The VM is booted | +| Prepare to receive a migration | `/vm.receive-migration` | `/schemas/ReceiveMigrationData` | N/A | N/A | +| Start to send migration to target | `/vm.send-migration` | `/schemas/SendMigrationData` | N/A | The VM is booted and (shared mem or hugepages enabled) | * The `vmcoredump` action is available exclusively for the `x86_64` architecture and can be executed only when the `guest_debug` feature is diff --git a/docs/cpu.md b/docs/cpu.md index 6a55942691..8ed247c909 100644 --- a/docs/cpu.md +++ b/docs/cpu.md @@ -19,11 +19,12 @@ struct CpusConfig { affinity: Option>, features: CpuFeatures, nested: bool, + core_scheduling: CoreScheduling, } ``` ``` ---cpus boot=,max=,topology=:::,kvm_hyperv=on|off,max_phys_bits=,affinity=,features=,nested=on|off +--cpus 
boot=,max=,topology=:::,kvm_hyperv=on|off,max_phys_bits=,affinity=,features=,nested=on|off,core_scheduling=vm|vcpu|off ``` ### `boot` @@ -221,3 +222,34 @@ _Example_ ``` --cpus nested=on ``` + +### `core_scheduling` + +Core scheduling mode for vCPU threads. + +This option controls Linux core scheduling (`PR_SCHED_CORE`) for vCPU threads, +which prevents untrusted tasks from sharing SMT siblings. This mitigates +side-channel attacks (e.g. MDS, L1TF) between vCPU threads. + +Three modes are available: + +- `vm` (default): All vCPU threads share a single core scheduling cookie. + vCPUs may be co-scheduled on SMT siblings of the same core, providing + better performance while still isolating VM threads from host tasks. +- `vcpu`: Each vCPU thread gets its own unique cookie. No two vCPUs can + share SMT siblings, providing the strongest isolation between vCPUs at + the cost of performance. +- `off`: No core scheduling is applied. + +On kernels older than 5.14 (which lack `PR_SCHED_CORE` support), the +option silently has no effect. + +_Example_ + +``` +--cpus boot=2,core_scheduling=vm +``` + +In this example, both vCPUs will share the same core scheduling cookie, +allowing them to be co-scheduled on SMT siblings while preventing host +threads from sharing those siblings. diff --git a/docs/device_model.md b/docs/device_model.md index c072dc2eb6..e915c47eec 100644 --- a/docs/device_model.md +++ b/docs/device_model.md @@ -90,8 +90,9 @@ feature is enabled by default. For all virtio devices listed below, only `virtio-pci` transport layer is supported. Cloud Hypervisor supports multiple PCI segments, and users can -append `,pci_segment=` to the device flag in the Cloud -Hypervisor command line to assign devices to a specific PCI segment. +append `,pci_segment=` or `,pci_device_id=` to +the device flag in the Cloud Hypervisor command line to assign devices to a specific +PCI segment or into a specific device slot. ### virtio-block @@ -201,6 +202,24 @@ networking device (e.g. 
DPDK) into the VMM as their virtio network backend. This device is always built-in, and it is enabled when `vhost_user=true` and `socket` are provided to the `--net` parameter. +### vhost-user-generic + +This is a generic vhost-user device. The main use case is to provide a +vhost-user device that Cloud Hypervisor doesn't support natively. However, +there is nothing preventing its use for devices that Cloud Hypervisor does +support. For instance, the tag of a virtio-fs device can be set on the +virtiofsd command line, whereas the built-in virtio-fs support +requires the tag to be set in Cloud Hypervisor's command line. + +If the backend negotiates the `VHOST_USER_PROTOCOL_F_CONFIG` feature, +all configuration space access will be handled by it. Otherwise, +writes will be ignored and reads will return 0xFF. Cloud Hypervisor +warns if this happens. + +This device is always built-in, and it is enabled when the +`--generic-vhost-user` flag is passed. +See [the generic vhost-user documentation](generic-vhost-user.md) for more details. + ## VFIO VFIO (Virtual Function I/O) is a kernel framework that exposes direct device diff --git a/docs/disk_locking.md b/docs/disk_locking.md new file mode 100644 index 0000000000..adb8956a08 --- /dev/null +++ b/docs/disk_locking.md @@ -0,0 +1,61 @@ +# Disk Image Locking + +Cloud Hypervisor places an advisory lock on each disk image opened via +`--disk` to prevent multiple instances from concurrently accessing the +same file. This avoids potential data corruption from overlapping writes. +Locks are advisory and require cooperating processes; a non-cooperating +process can still open and write to a locked file. Locking is host-local +and does not enforce coordination across multiple hosts. + +If the backing file resides on network storage, the storage system must +correctly translate or propagate OFD (Open File Description) locks across +the network to ensure that advisory locking semantics are preserved in a +multi-host environment. 
In the case of Linux, OFD locks are translated +into NFS locks by the NFS driver. + +The implementation uses Open File Description (OFD) locks (`F_OFD_SETLK`) +rather than traditional POSIX locks (`F_SETLK`). OFD locks are only +released when the last file descriptor referencing the open file +description is closed, preventing accidental early release. + +## Lock Granularity + +The `lock_granularity` parameter controls how the lock is placed on the +disk image: + +``` +--disk path=/foo.img,lock_granularity=byte-range +--disk path=/bar.img,lock_granularity=full +``` + +### `byte-range` (default) + +Locks the byte range `[0, physical_file_size)`. The physical file size +is evaluated once at startup; if the file grows after the lock is +acquired, the newly appended region is not covered by the lock. + +The file is protected against concurrent access by other instances of +Cloud Hypervisor. That's the only thing we can guarantee. + +#### Fallback to full + +One caveat is that if the physical size of the disk image cannot be +determined at startup (e.g. with certain vhost-user backends), Cloud +Hypervisor falls back to a whole-file lock regardless of the +`lock_granularity` setting, as a byte-range lock cannot be safely +computed without knowing the physical file size. + +### `full` + +Locks the entire file using the OFD whole-file semantic (`l_start=0`, +`l_len=0`). This may be needed in environments that depend on whole-file +lock semantics. Note that on some network storage backends, whole-file +OFD locks may be treated as mandatory rather than advisory, which can +cause external tools to fail when accessing the disk image. Lock +behavior may also vary across network filesystem implementations. + +## Disk Resizing + +Cloud Hypervisor supports live disk resizing. Currently, byte-range +locks are not updated. However, as a part of the file is still locked, +no new Cloud Hypervisor instance can open the disk image. 
diff --git a/docs/fw_cfg.md b/docs/fw_cfg.md index 73f10a7808..76e6951f45 100644 --- a/docs/fw_cfg.md +++ b/docs/fw_cfg.md @@ -39,9 +39,10 @@ The `fw_cfg` device is configured using the `--fw-cfg-config` command-line optio * `cmdline=on|off`: (Default: `on`) Whether to add the kernel command line (specified by `--cmdline`) to `fw_cfg`. * `initramfs=on|off`: (Default: `on`) Whether to add the initramfs image (specified by `--initramfs`) to `fw_cfg`. * `acpi_table=on|off`: (Default: `on`) Whether to add generated ACPI tables to `fw_cfg`. -* `items=[... : ...]`: A list of custom key-value pairs to be exposed via `fw_cfg`. +* `items=[... : ...]`: A list of custom key-value pairs to be exposed via `fw_cfg`. Multiple items are separated by `:`. * `name=`: The path under which the item will appear in the guest's sysfs (e.g., `opt/org.example/my-data`). - * `file=`: The path to the file on the host whose content will be provided to the guest for this item. + * `file=`: The path to a file on the host whose content will be provided to the guest for this item. + * `string=`: An inline string value to provide to the guest for this item. Each item must have exactly one of `file` or `string`, not both. **Example Usage:** @@ -57,7 +58,19 @@ The `fw_cfg` device is configured using the `--fw-cfg-config` command-line optio ``` In the guest, `/tmp/guest_setup.txt` from the host will be accessible at `/sys/firmware/qemu_fw_cfg/by_name/opt/org.mycorp/setup_info/raw`. -2. **Disabling `fw_cfg` explicitly:** +2. **Inline string items (e.g., OVMF MMIO64 configuration for GPU passthrough):** + + ```bash + cloud-hypervisor \ + --firmware /path/to/OVMF.fd \ + --disk path=/path/to/rootfs.img \ + --device path=/sys/bus/pci/devices/0000:41:00.0 \ + --fw-cfg-config items=[name=opt/ovmf/X-PciMmio64Mb,string=262144] \ + ... + ``` + The string `262144` is passed directly to the guest as the content of `opt/ovmf/X-PciMmio64Mb`. + +3. 
**Disabling `fw_cfg` explicitly:** ```bash cloud-hypervisor \ diff --git a/docs/generic-vhost-user.md b/docs/generic-vhost-user.md new file mode 100644 index 0000000000..6af813e28c --- /dev/null +++ b/docs/generic-vhost-user.md @@ -0,0 +1,76 @@ +# How to use generic vhost-user devices + +## What is a generic vhost-user device? + +Cloud Hypervisor deliberately does not have support for all types of virtio devices. +For instance, it does not natively support sound or media. + +However, the vhost-user protocol does not require the frontend to have separate +code for each type of vhost-user device. This allows writing a *generic* frontend +that supports almost all of them. + +Any vhost-user device that only uses supported protocol messages is +expected to work. It can (and often will) be of a type that Cloud +Hypervisor does not know about. It can even be of a type that is +not standardized. + +Virtio-GPU is known to *not* work. The version implemented in QEMU +requires `VHOST_USER_GPU_SET_SOCKET`, which is standard but will +never be implemented by Cloud Hypervisor. Other versions require +messages that have not been standardized. In the future, these +versions might be supported. + +## Examples + +virtiofsd meets these requirements if the `--tag` argument is passed. +Therefore, generic vhost-user can be used as an alternative to the built-in +virtio-fs support. See [fs.md](fs.md) for how to build the virtiofs daemon. 
+ +To use generic vhost-user with virtiofsd, use a command line argument +similar to this: + +```bash +/path/to/virtiofsd \ + --tag=myfs \ + --log-level=debug \ + "--socket-path=$path_to_virtiofsd_socket" \ + "--shared-dir=$path_to_shared_directory" \ + "${other_virtiofsd_options[@]}" & + +/path/to/cloud-hypervisor \ + --cpus boot=1 \ + --memory size=1G,shared=on \ + --disk path=your-linux-image.iso \ + --kernel vmlinux \ + --cmdline "console=hvc0 root=/dev/vda1 rw" \ + --generic-vhost-user "socket=\"${path_to_virtiofsd_socket//\"/\"\"}\",virtio_id=26,queue_sizes=[512,512]" \ + "${other_cloud_hypervisor_options[@]}" +``` + +26 is the ID for a virtio-fs device. The IDs for other devices are defined +by the VIRTIO specification. The odd-looking variable expansion escapes +any double quotes in the socket path. It is also possible to provide +the name that is defined by the virtio specification, so `virtio_id=fs` +will also work. + +Inside the guest, you can mount the virtio-fs device with + +```bash +mkdir mount_dir +mount -t virtiofs -- myfs mount_dir/ +``` + +## Limitations + +Cloud Hypervisor does not save, restore, or migrate the PCI configuration +space of a generic vhost-user device. The backend can do it itself, but if +it does not these features will not work. + +Cloud Hypervisor cannot validate the number or size of the queues. Some +guest drivers do not validate these and will crash if they are wrong. +Notably, at least some versions of Linux will crash if one creates a +virtio-fs device (id 26) with only one queue. + +If any access to configuration space fails, Cloud Hypervisor will panic +instead of injecting an exception into the guest. It is unclear what +correct behavior is in this case. diff --git a/docs/live_migration.md b/docs/live_migration.md index ac5bf93f75..81eed06665 100644 --- a/docs/live_migration.md +++ b/docs/live_migration.md @@ -3,8 +3,9 @@ This document gives examples of how to use the live migration support in Cloud Hypervisor: -1. 
local migration - migrating a VM from one Cloud Hypervisor instance to another on the same machine; -1. remote migration - migrating a VM between two machines; +1. **Local Migration**: Migrating a VM from one Cloud Hypervisor instance to another on the same machine; also called + UNIX socket migration. +1. **Remote Migration** (TCP Migration): migrating a VM between two TCP/IP hosts. > :warning: These examples place sockets in /tmp. This is done for > simplicity and should not be done in production. @@ -28,7 +29,8 @@ Launch the destination VM from the same directory (on the host machine): $ target/release/cloud-hypervisor --api-socket=/tmp/api2 ``` -Get ready for receiving migration for the destination VM (on the host machine): +Get ready for receiving migration for the destination VM (on the host +machine): ```console $ target/release/ch-remote --api-socket=/tmp/api2 receive-migration unix:/tmp/sock @@ -37,14 +39,16 @@ $ target/release/ch-remote --api-socket=/tmp/api2 receive-migration unix:/tmp/so Start to send migration for the source VM (on the host machine): ```console -$ target/release/ch-remote --api-socket=/tmp/api1 send-migration --local unix:/tmp/sock +$ target/release/ch-remote --api-socket=/tmp/api1 send-migration destination_url=unix:/tmp/sock,local=true ``` When the above commands completed, the source VM should be successfully migrated to the destination VM. Now the destination VM is running while the source VM is terminated gracefully. -## Remote Migration +## Remote Migration (TCP Migration) + +_Hint: For developing purposes, same-host TCP migrations are also supported._ In this example, we will migrate a VM from one machine (`src`) to another (`dst`) across the network. To keep it simple, we will use a @@ -171,7 +175,13 @@ After a few seconds the VM should be up and you can interact with it. 
Initiate the Migration over TCP: ```console -src $ ch-remote --api-socket=/tmp/api send-migration tcp:{dst}:{port} +src $ ch-remote --api-socket=/tmp/api send-migration destination_url=tcp:{dst}:{port} +``` + +With migration parameters: + +```console +src $ ch-remote --api-socket=/tmp/api send-migration destination_url=tcp:{dst}:{port},downtime_ms=200,timeout_s=3600,timeout_strategy=cancel ``` > Replace {dst}:{port} with the actual IP address and port of your destination host. @@ -180,3 +190,24 @@ After completing the above commands, the source VM will be migrated to the destination host and continue running there. The source VM instance will terminate normally. All ongoing processes and connections within the VM should remain intact after the migration. + +#### Migration Parameters + +Cloud Hypervisor supports additional parameters to control the +migration process. Via the API or `ch-remote`, you may specify: + +- `downtime_ms `: \ + The maximum downtime the migration aims for, in milliseconds. + Defaults to `300ms`. +- `timeout_s `: \ + The timeout for the migration (maximum total duration), in seconds. + Defaults to `3600s` (one hour). +- `timeout_strategy ` (`[cancel, ignore]`): \ + The strategy to apply when the migration timeout is reached. + Cancel will abort the migration and keep the VM running on the source. + Ignore will proceed with the migration regardless of the downtime requirement. + Defaults to `cancel`. +- `connections `: \ + The number of parallel TCP connections to use for migration. + Must be between `1` and `128`. Defaults to `1`. + Multiple connections are not supported with local UNIX-socket migration. 
diff --git a/docs/memory.md b/docs/memory.md index 75179e0e07..fb42e89374 100644 --- a/docs/memory.md +++ b/docs/memory.md @@ -214,11 +214,12 @@ struct MemoryZoneConfig { hotplug_size: Option, hotplugged_size: Option, prefault: bool, + mergeable: bool, } ``` ``` ---memory-zone User defined memory zone parameters "size=,file=,shared=on|off,hugepages=on|off,hugepage_size=,host_numa_node=,id=,hotplug_size=,hotplugged_size=,prefault=on|off" +--memory-zone User defined memory zone parameters "size=,file=,shared=on|off,hugepages=on|off,hugepage_size=,host_numa_node=,id=,hotplug_size=,hotplugged_size=,prefault=on|off,mergeable=on|off" ``` This parameter expects one or more occurrences, allowing for a list of memory @@ -422,6 +423,34 @@ _Example_ --memory-zone id=mem0,size=1G,prefault=on ``` +### `mergeable` + +Specifies if the pages from this memory zone must be marked as _mergeable_, +enabling Kernel Same-page Merging (KSM) for this zone. + +This is the per-zone equivalent of the top-level `--memory mergeable=on` option. +It allows KSM to be enabled selectively — for example, enabling it only on a +hotplug zone while leaving boot memory unaffected: + +``` +--memory size=2G,mergeable=off +--memory-zone id=hotplug,size=0,hotplug_size=8G,mergeable=on +``` + +For KSM to have any effect, the host kernel must have KSM enabled: +``` +echo 1 > /sys/kernel/mm/ksm/run +``` + +By default this option is turned off. + +_Example_ + +``` +--memory size=0 +--memory-zone id=mem0,size=1G,mergeable=on +``` + ## NUMA settings `NumaConfig` or what is known as `--numa` from the CLI perspective has been diff --git a/docs/snapshot_restore.md b/docs/snapshot_restore.md index df7248805e..567f77a9a6 100644 --- a/docs/snapshot_restore.md +++ b/docs/snapshot_restore.md @@ -90,9 +90,40 @@ start using it. 
./ch-remote --api-socket=/tmp/cloud-hypervisor.sock resume ``` +Alternatively, the `resume` option can be used to automatically resume the VM +after restore completes: + +```bash +./cloud-hypervisor \ + --api-socket /tmp/cloud-hypervisor.sock \ + --restore source_url=file:///home/foo/snapshot,resume=true +``` + At this point, the VM is fully restored and is identical to the VM which was snapshot earlier. +Restore also supports selecting how guest memory is populated: + +```bash +./cloud-hypervisor \ + --api-socket /tmp/cloud-hypervisor.sock \ + --restore source_url=file:///home/foo/snapshot,memory_restore_mode=ondemand +``` + +If `memory_restore_mode` is omitted, Cloud Hypervisor uses the eager-copy +restore path (`copy`). + +With `memory_restore_mode=ondemand`, restore uses `userfaultfd` to fault snapshot +pages in on first access instead of copying the full `memory-ranges` file into +guest RAM before restore completes. This mode is strict: if Cloud Hypervisor +cannot enable the `userfaultfd` restore path, restore fails instead of falling +back to `copy`. + +Current constraints for `memory_restore_mode=ondemand`: + +- `prefault=on` is not supported +- the snapshot memory ranges must be page-aligned + ## Restore a VM with new Net FDs For a VM created with FDs explicitly passed to NetConfig, a set of valid FDs need to be provided along with the VM restore command in the following syntax: diff --git a/docs/vdpa.md b/docs/vdpa.md index c1aa34c571..1e171670ad 100644 --- a/docs/vdpa.md +++ b/docs/vdpa.md @@ -32,11 +32,12 @@ struct VdpaConfig { iommu: bool, id: Option, pci_segment: u16, + pci_device_id: Option } ``` ``` ---vdpa vDPA device "path=,num_queues=,iommu=on|off,id=,pci_segment=" +--vdpa vDPA device "path=,num_queues=,iommu=on|off,id=,pci_segment=,pci_device_id=" ``` ### `path` @@ -96,6 +97,21 @@ _Example_ --vdpa path=/dev/vhost-vdpa-0,pci_segment=1 ``` +### `pci_device_id` + +PCI device ID to assign to the vDPA device on its PCI bus. 
+ +This parameter is optional. If not specified, a device ID is automatically +allocated. + +Value is an unsigned integer in the range 1-31. + +_Example_ + +``` +--vdpa path=/dev/vhost-vdpa-0,pci_device_id=5 +``` + ## Example with vDPA block simulator The vDPA framework provides a simulator with both `virtio-block` and @@ -146,10 +162,10 @@ The `virtio-block` device backed by the vDPA simulator can be found as ``` cloud@cloud:~$ lsblk NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT -nullb0 252:0 0 250G 0 disk -vda 254:0 0 2.2G 0 disk +nullb0 252:0 0 250G 0 disk +vda 254:0 0 2.2G 0 disk ├─vda1 254:1 0 2.1G 0 part / -├─vda14 254:14 0 4M 0 part +├─vda14 254:14 0 4M 0 part └─vda15 254:15 0 106M 0 part /boot/efi vdb 254:16 0 128M 0 disk ``` diff --git a/event_monitor/Cargo.toml b/event_monitor/Cargo.toml index 18ac2567c3..ee52a7fd3d 100644 --- a/event_monitor/Cargo.toml +++ b/event_monitor/Cargo.toml @@ -2,6 +2,7 @@ authors = ["The Cloud Hypervisor Authors"] edition.workspace = true name = "event_monitor" +rust-version.workspace = true version = "0.1.0" [dependencies] diff --git a/fuzz/Cargo.lock b/fuzz/Cargo.lock index 37e0f2aa2d..7906fd5cd4 100644 --- a/fuzz/Cargo.lock +++ b/fuzz/Cargo.lock @@ -19,9 +19,9 @@ checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" [[package]] name = "anstream" -version = "0.6.21" +version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" +checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d" dependencies = [ "anstyle", "anstyle-parse", @@ -34,15 +34,15 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.13" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" +checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" [[package]] name = "anstyle-parse" 
-version = "0.2.7" +version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" +checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e" dependencies = [ "utf8parse", ] @@ -69,9 +69,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.101" +version = "1.0.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f0e0fee31ef5ed1ba1316088939cea399010ed7731dba877ed44aeb407a75ea" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" [[package]] name = "arbitrary" @@ -81,9 +81,9 @@ checksum = "c3d036a3c4ab069c7b410a2ce876bd74808d2d0888a82667669f8e783a898bf1" [[package]] name = "arc-swap" -version = "1.8.2" +version = "1.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9f3647c145568cec02c42054e07bdf9a5a698e15b466fb2341bfc393cd24aa5" +checksum = "6a3a1fd6f75306b68087b831f025c712524bcb19aad54e557b1129cfa0a2b207" dependencies = [ "rustversion", ] @@ -100,7 +100,7 @@ dependencies = [ "linux-loader", "log", "serde", - "thiserror 2.0.18", + "thiserror", "uuid", "vm-fdt", "vm-memory", @@ -149,7 +149,7 @@ dependencies = [ "remain", "serde", "smallvec", - "thiserror 2.0.18", + "thiserror", "uuid", "virtio-bindings", "virtio-queue", @@ -161,9 +161,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.19.1" +version = "3.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5dd9dc738b7a8311c7ade152424974d8115f2cdad61e8dab8dac9f2362298510" +checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" [[package]] name = "byteorder" @@ -173,9 +173,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "cc" -version = "1.2.56" +version = "1.2.59" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"aebf35691d1bfb0ac386a69bac2fde4dd276fb618cf8bf4f5318fe285e821bb2" +checksum = "b7a4d3ec6524d28a329fc53654bbadc9bdd7b0431f5d65f1a56ffb28a1ee5283" dependencies = [ "find-msvc-tools", "jobserver", @@ -189,20 +189,31 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" +[[package]] +name = "chacha20" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601" +dependencies = [ + "cfg-if", + "cpufeatures", + "rand_core", +] + [[package]] name = "clap" -version = "4.5.59" +version = "4.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5caf74d17c3aec5495110c34cc3f78644bfa89af6c8993ed4de2790e49b6499" +checksum = "b193af5b67834b676abd72466a96c1024e6a6ad978a1f484bd90b85c94041351" dependencies = [ "clap_builder", ] [[package]] name = "clap_builder" -version = "4.5.59" +version = "4.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "370daa45065b80218950227371916a1633217ae42b2715b2287b606dcd618e24" +checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" dependencies = [ "anstream", "anstyle", @@ -212,9 +223,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "1.0.0" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a822ea5bc7590f9d40f1ba12c0dc3c2760f3482c6984db1573ad11031420831" +checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" [[package]] name = "cloud-hypervisor-fuzz" @@ -244,9 +255,9 @@ dependencies = [ [[package]] name = "colorchoice" -version = "1.0.4" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" +checksum = 
"1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" [[package]] name = "concat-idents" @@ -258,6 +269,15 @@ dependencies = [ "syn", ] +[[package]] +name = "cpufeatures" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201" +dependencies = [ + "libc", +] + [[package]] name = "crc-any" version = "2.5.0" @@ -278,9 +298,9 @@ dependencies = [ [[package]] name = "darling" -version = "0.21.3" +version = "0.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cdf337090841a411e2a7f3deb9187445851f91b309c0c0a29e05f74a00a48c0" +checksum = "25ae13da2f202d56bd7f91c25fba009e7717a1e4a1cc98a76d844b65ae912e9d" dependencies = [ "darling_core", "darling_macro", @@ -288,11 +308,10 @@ dependencies = [ [[package]] name = "darling_core" -version = "0.21.3" +version = "0.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1247195ecd7e3c85f83c8d2a366e4210d588e802133e1e355180a9870b517ea4" +checksum = "9865a50f7c335f53564bb694ef660825eb8610e0a53d3e11bf1b0d3df31e03b0" dependencies = [ - "fnv", "ident_case", "proc-macro2", "quote", @@ -302,9 +321,9 @@ dependencies = [ [[package]] name = "darling_macro" -version = "0.21.3" +version = "0.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d38308df82d1080de0afee5d069fa14b0326a88c14f15c5ccda35b4a6c414c81" +checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d" dependencies = [ "darling_core", "quote", @@ -333,7 +352,7 @@ dependencies = [ "num_enum", "pci", "serde", - "thiserror 2.0.18", + "thiserror", "tpm", "vm-allocator", "vm-device", @@ -448,12 +467,6 @@ dependencies = [ "spin", ] -[[package]] -name = "fnv" -version = "1.0.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" - [[package]] name = "foldhash" 
version = "0.1.5" @@ -474,23 +487,23 @@ checksum = "c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893" [[package]] name = "gdbstub" -version = "0.7.9" +version = "0.7.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bf845b08f7c2ef3b5ad19f80779d43ae20d278652b91bb80adda65baf2d8ed6" +checksum = "5bafc7e33650ab9f05dcc16325f05d56b8d10393114e31a19a353b86fa60cfe7" dependencies = [ "bitflags 2.11.0", "cfg-if", "log", "managed", "num-traits", - "paste", + "pastey", ] [[package]] name = "gdbstub_arch" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22dde0e1b68787036ccedd0b1ff6f953527a0e807e571fbe898975203027278f" +checksum = "6c02bfe7bd65f42bcda751456869dfa1eb2bd1c36e309b9ec27f4888d41cf258" dependencies = [ "gdbstub", "num-traits", @@ -517,19 +530,20 @@ checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" dependencies = [ "cfg-if", "libc", - "r-efi", + "r-efi 5.3.0", "wasip2", ] [[package]] name = "getrandom" -version = "0.4.1" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "139ef39800118c7683f2fd3c98c1b23c09ae076556b435f8e9064ae108aaeeec" +checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" dependencies = [ "cfg-if", "libc", - "r-efi", + "r-efi 6.0.0", + "rand_core", "wasip2", "wasip3", ] @@ -545,9 +559,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.16.1" +version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51" [[package]] name = "heck" @@ -575,7 +589,7 @@ dependencies = [ "serde", "serde_json", "serde_with", - "thiserror 2.0.18", + "thiserror", "vfio-ioctls", "vm-memory", "vmm-sys-util", @@ -605,12 +619,12 @@ checksum = 
"b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" [[package]] name = "indexmap" -version = "2.13.0" +version = "2.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017" +checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" dependencies = [ "equivalent", - "hashbrown 0.16.1", + "hashbrown 0.17.0", "serde", "serde_core", ] @@ -632,9 +646,9 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.17" +version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" [[package]] name = "jobserver" @@ -648,9 +662,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.77" +version = "0.3.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f" +checksum = "b49715b7073f385ba4bc528e5747d02e66cb39c6146efb66b781f131f0fb399c" dependencies = [ "once_cell", "wasm-bindgen", @@ -658,9 +672,9 @@ dependencies = [ [[package]] name = "kvm-bindings" -version = "0.12.1" +version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a537873e15e8daabb416667e606d9b0abc2a8fb9a45bd5853b888ae0ead82f9" +checksum = "4b3c06ff73c7ce03e780887ec2389d62d2a2a9ddf471ab05c2ff69207cd3f3b4" dependencies = [ "serde", "vmm-sys-util", @@ -669,9 +683,9 @@ dependencies = [ [[package]] name = "kvm-ioctls" -version = "0.22.1" +version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c8f7370330b4f57981e300fa39b02088f2f2a5c2d0f1f994e8090589619c56d" +checksum = "333f77a20344a448f3f70664918135fddeb804e938f28a99d685bd92926e0b19" dependencies = [ "bitflags 2.11.0", "kvm-bindings", @@ -687,7 +701,7 @@ checksum = 
"49fefd6652c57d68aaa32544a4c0e642929725bdc1fd929367cdeb673ab81088" dependencies = [ "enumflags2", "libc", - "thiserror 2.0.18", + "thiserror", ] [[package]] @@ -704,9 +718,9 @@ checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" [[package]] name = "libc" -version = "0.2.182" +version = "0.2.184" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6800badb6cb2082ffd7b6a67e6125bb39f18782f793520caee8cb8846be06112" +checksum = "48f5d2a454e16a5ea0f4ced81bd44e4cfc7bd3a507b61887c99fd3538b28e4af" [[package]] name = "libfuzzer-sys" @@ -720,9 +734,9 @@ dependencies = [ [[package]] name = "linux-loader" -version = "0.13.1" +version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53802c0b111faf302a16fa20a2e3a33bd0eab408f60fc34cbfe052f6b153791e" +checksum = "de72cb02c55ecffcf75fe78295926f872eb6eb0a58d629c58a8c324dc26380f6" dependencies = [ "vm-memory", ] @@ -757,7 +771,7 @@ checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" [[package]] name = "micro_http" version = "0.1.0" -source = "git+https://github.com/firecracker-microvm/micro-http?branch=main#3248ceeae41461d034624b582d5d358cd6e6f89f" +source = "git+https://github.com/firecracker-microvm/micro-http?branch=main#5c2254d6cf4f32a668d0d8e57ba20bebad9d4fba" dependencies = [ "libc", "vmm-sys-util", @@ -775,9 +789,9 @@ dependencies = [ [[package]] name = "mshv-bindings" -version = "0.6.7" +version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3cbfd4f32d185152003679339751839da77c17e18fa8882a11051a236f841426" +checksum = "a94fc3871dd23738188e5bc76a1d1a5930ebcaf9308c560a7274aa62b1770594" dependencies = [ "libc", "num_enum", @@ -787,25 +801,17 @@ dependencies = [ "zerocopy", ] -[[package]] -name = "net_gen" -version = "0.1.0" -dependencies = [ - "vmm-sys-util", -] - [[package]] name = "net_util" version = "0.1.0" dependencies = [ "epoll", - "getrandom 0.4.1", + "getrandom 
0.4.2", "libc", "log", - "net_gen", "rate_limiter", "serde", - "thiserror 2.0.18", + "thiserror", "virtio-bindings", "virtio-queue", "vm-memory", @@ -824,9 +830,9 @@ dependencies = [ [[package]] name = "num_enum" -version = "0.7.5" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1207a7e20ad57b847bbddc6776b968420d38292bbfe2089accff5e19e82454c" +checksum = "5d0bca838442ec211fa11de3a8b0e0e8f3a4522575b5c4c06ed722e005036f26" dependencies = [ "num_enum_derive", "rustversion", @@ -834,9 +840,9 @@ dependencies = [ [[package]] name = "num_enum_derive" -version = "0.7.5" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff32365de1b6743cb203b710788263c44a03de03802daf96092f2da4fe6ba4d7" +checksum = "680998035259dcfcafe653688bf2aa6d3e2dc05e98be6ab46afb089dc84f1df8" dependencies = [ "proc-macro-crate", "proc-macro2", @@ -846,9 +852,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.21.3" +version = "1.21.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" +checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" [[package]] name = "once_cell_polyfill" @@ -880,14 +886,14 @@ dependencies = [ name = "option_parser" version = "0.1.0" dependencies = [ - "thiserror 2.0.18", + "thiserror", ] [[package]] -name = "paste" -version = "1.0.15" +name = "pastey" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" +checksum = "b867cad97c0791bbd3aaa6472142568c6c9e8f71937e98379f584cfb0cf35bec" [[package]] name = "pci" @@ -899,7 +905,7 @@ dependencies = [ "libc", "log", "serde", - "thiserror 2.0.18", + "thiserror", "vfio-bindings", "vfio-ioctls", "vfio_user", @@ -916,15 +922,6 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" -[[package]] -name = "ppv-lite86" -version = "0.2.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" -dependencies = [ - "zerocopy", -] - [[package]] name = "prettyplease" version = "0.2.37" @@ -937,9 +934,9 @@ dependencies = [ [[package]] name = "proc-macro-crate" -version = "3.4.0" +version = "3.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "219cb19e96be00ab2e37d6e299658a0cfa83e52429179969b0f0121b4ac46983" +checksum = "e67ba7e9b2b56446f1d419b1d807906278ffa1a658a8a5d8a39dcb1f5a78614f" dependencies = [ "toml_edit", ] @@ -955,9 +952,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.44" +version = "1.0.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21b2ebcf727b7760c461f091f9f0f539b77b8e87f2fd88131e7f1b433b3cece4" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" dependencies = [ "proc-macro2", ] @@ -969,33 +966,27 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" [[package]] -name = "rand" -version = "0.9.2" +name = "r-efi" +version = "6.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" -dependencies = [ - "rand_chacha", - "rand_core", -] +checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" [[package]] -name = "rand_chacha" -version = "0.9.0" +name = "rand" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +checksum = "bc266eb313df6c5c09c1c7b1fbe2510961e5bcd3add930c1e31f7ed9da0feff8" dependencies = [ - "ppv-lite86", + "chacha20", + "getrandom 0.4.2", 
"rand_core", ] [[package]] name = "rand_core" -version = "0.9.5" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" -dependencies = [ - "getrandom 0.3.4", -] +checksum = "0c8d0fd677905edcbeedbf2edb6494d676f0e98d54d5cf9bda0b061cb8fb8aba" [[package]] name = "rate_limiter" @@ -1004,7 +995,7 @@ dependencies = [ "epoll", "libc", "log", - "thiserror 2.0.18", + "thiserror", "vmm-sys-util", ] @@ -1042,9 +1033,9 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.27" +version = "1.0.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" +checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd" [[package]] name = "serde" @@ -1091,9 +1082,9 @@ dependencies = [ [[package]] name = "serde_with" -version = "3.16.1" +version = "3.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fa237f2807440d238e0364a218270b98f767a00d3dada77b1c53ae88940e2e7" +checksum = "dd5414fad8e6907dbdd5bc441a50ae8d6e26151a03b1de04d89a5576de61d01f" dependencies = [ "serde_core", "serde_with_macros", @@ -1101,9 +1092,9 @@ dependencies = [ [[package]] name = "serde_with_macros" -version = "3.16.1" +version = "3.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52a8e3ca0ca629121f70ab50f95249e5a6f925cc0f6ffe8256c45b728875706c" +checksum = "d3db8978e608f1fe7357e211969fd9abdcae80bac1ba7a3369bb7eb6b404eb65" dependencies = [ "darling", "proc-macro2", @@ -1123,9 +1114,9 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "signal-hook" -version = "0.4.3" +version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b57709da74f9ff9f4a27dce9526eec25ca8407c45a7887243b031a58935fb8e" +checksum = 
"b2a0c28ca5908dbdbcd52e6fdaa00358ab88637f8ab33e1f188dd510eb44b53d" dependencies = [ "libc", "signal-hook-registry", @@ -1143,9 +1134,9 @@ dependencies = [ [[package]] name = "simd-adler32" -version = "0.3.8" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2" +checksum = "703d5c7ef118737c72f1af64ad2f6f8c5e1921f818cdcb97b8fe6fc69bf66214" [[package]] name = "smallvec" @@ -1170,42 +1161,22 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] name = "syn" -version = "2.0.116" +version = "2.0.117" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3df424c70518695237746f84cede799c9c58fcb37450d7b23716568cc8bc69cb" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] -[[package]] -name = "thiserror" -version = "1.0.64" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d50af8abc119fb8bb6dbabcfa89656f46f84aa0ac7688088608076ad2b459a84" -dependencies = [ - "thiserror-impl 1.0.64", -] - [[package]] name = "thiserror" version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" dependencies = [ - "thiserror-impl 2.0.18", -] - -[[package]] -name = "thiserror-impl" -version = "1.0.64" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08904e7672f5eb876eaaf87e0ce17857500934f4981c4a0ab2b4aa98baac7fc3" -dependencies = [ - "proc-macro2", - "quote", - "syn", + "thiserror-impl", ] [[package]] @@ -1221,18 +1192,18 @@ dependencies = [ [[package]] name = "toml_datetime" -version = "0.7.5+spec-1.1.0" +version = "1.1.1+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92e1cfed4a3038bc5a127e35a2d360f145e1f4b971b551a2ba5fd7aedf7e1347" 
+checksum = "3165f65f62e28e0115a00b2ebdd37eb6f3b641855f9d636d3cd4103767159ad7" dependencies = [ "serde_core", ] [[package]] name = "toml_edit" -version = "0.23.10+spec-1.0.0" +version = "0.25.11+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84c8b9f757e028cee9fa244aea147aab2a9ec09d5325a9b01e0a49730c2b5269" +checksum = "0b59c4d22ed448339746c59b905d24568fcbb3ab65a500494f7b8c3e97739f2b" dependencies = [ "indexmap", "toml_datetime", @@ -1242,9 +1213,9 @@ dependencies = [ [[package]] name = "toml_parser" -version = "1.0.9+spec-1.1.0" +version = "1.1.2+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "702d4415e08923e7e1ef96cd5727c0dfed80b4d2fa25db9647fe5eb6f7c5a4c4" +checksum = "a2abe9b86193656635d2411dc43050282ca48aa31c2451210f4202550afb7526" dependencies = [ "winnow", ] @@ -1256,8 +1227,7 @@ dependencies = [ "anyhow", "libc", "log", - "net_gen", - "thiserror 2.0.18", + "thiserror", "vmm-sys-util", ] @@ -1291,11 +1261,11 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.21.0" +version = "1.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b672338555252d43fd2240c714dc444b8c6fb0a5c5335e65a07bba7742735ddb" +checksum = "5ac8b6f42ead25368cf5b098aeb3dc8a1a2c05a3eee8a9a1a68c640edbfc79d9" dependencies = [ - "getrandom 0.4.1", + "getrandom 0.4.2", "js-sys", "rand", "wasm-bindgen", @@ -1303,25 +1273,25 @@ dependencies = [ [[package]] name = "vfio-bindings" -version = "0.6.0" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "698c66a4522a31ab407a410a59c9660da036178e4fe3f371825cd6aad7d46837" +checksum = "188dac3057a0cbc94470085204c84b82ff7ec5dac629a514323cd133d1f9abe0" dependencies = [ "vmm-sys-util", ] [[package]] name = "vfio-ioctls" -version = "0.5.1" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"7af7e8d49719333e5eb52209417f26695c9ab2b117a82596a63a44947f97c5d6" +checksum = "d4b1d98dff7f0d219278e406323e7eda4d426447bd203c7828189baf0d8c07b7" dependencies = [ "byteorder", "kvm-bindings", "kvm-ioctls", "libc", "log", - "thiserror 2.0.18", + "thiserror", "vfio-bindings", "vm-memory", "vmm-sys-util", @@ -1329,9 +1299,9 @@ dependencies = [ [[package]] name = "vfio_user" -version = "0.1.1" +version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8db5bc783aad75202ad4cbcdc5e893cff1dd8fa24a1bcdb4de8998d3c4d169a" +checksum = "731c2582dd43f4f174ab47b4c933a1a9bb872d9d1b7f54c5867e12dbc1491b75" dependencies = [ "bitflags 2.11.0", "libc", @@ -1339,7 +1309,7 @@ dependencies = [ "serde", "serde_derive", "serde_json", - "thiserror 2.0.18", + "thiserror", "vfio-bindings", "vm-memory", "vmm-sys-util", @@ -1347,9 +1317,9 @@ dependencies = [ [[package]] name = "vhost" -version = "0.14.0" +version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a4dcad85a129d97d5d4b2f3c47a4affdeedd76bdcd02094bcb5d9b76cac2d05" +checksum = "ee90657203a8644e9a0860a0db6a7887d8ef0c7bc09fc22dfa4ae75df65bac86" dependencies = [ "bitflags 2.11.0", "libc", @@ -1360,9 +1330,9 @@ dependencies = [ [[package]] name = "virtio-bindings" -version = "0.2.6" +version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "804f498a26d5a63be7bbb8bdcd3869c3f286c4c4a17108905276454da0caf8cb" +checksum = "091f1f09cfbf2a78563b562e7a949465cce1aef63b6065645188d995162f8868" [[package]] name = "virtio-devices" @@ -1373,6 +1343,7 @@ dependencies = [ "byteorder", "epoll", "event_monitor", + "hypervisor", "libc", "log", "net_util", @@ -1382,7 +1353,7 @@ dependencies = [ "serde", "serde_with", "serial_buffer", - "thiserror 2.0.18", + "thiserror", "vhost", "virtio-bindings", "virtio-queue", @@ -1396,10 +1367,11 @@ dependencies = [ [[package]] name = "virtio-queue" -version = "0.16.0" +version = "0.17.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb0479158f863e59323771a1f684d843962f76960b86fecfec2bfa9c8f0f9180" +checksum = "e358084f32ed165fddb41d98ff1b7ff3c08b9611d8d6114a1b422e2e85688baf" dependencies = [ + "libc", "log", "virtio-bindings", "vm-memory", @@ -1421,7 +1393,7 @@ version = "0.1.0" dependencies = [ "hypervisor", "serde", - "thiserror 2.0.18", + "thiserror", "vfio-ioctls", "vm-memory", "vmm-sys-util", @@ -1435,13 +1407,13 @@ checksum = "7e21282841a059bb62627ce8441c491f09603622cd5a21c43bfedc85a2952f23" [[package]] name = "vm-memory" -version = "0.16.2" +version = "0.17.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fd5e56d48353c5f54ef50bd158a0452fc82f5383da840f7b8efc31695dd3b9d" +checksum = "f39348a049689cabd3377cdd9182bf526ec76a6f823b79903896452e9d7a7380" dependencies = [ "arc-swap", "libc", - "thiserror 1.0.64", + "thiserror", "winapi", ] @@ -1453,7 +1425,7 @@ dependencies = [ "itertools", "serde", "serde_json", - "thiserror 2.0.18", + "thiserror", "vm-memory", ] @@ -1497,7 +1469,7 @@ dependencies = [ "serde_json", "serial_buffer", "signal-hook", - "thiserror 2.0.18", + "thiserror", "tracer", "uuid", "vfio-ioctls", @@ -1516,9 +1488,9 @@ dependencies = [ [[package]] name = "vmm-sys-util" -version = "0.14.0" +version = "0.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d21f366bf22bfba3e868349978766a965cbe628c323d58e026be80b8357ab789" +checksum = "506c62fdf617a5176827c2f9afbcf1be155b03a9b4bf9617a60dbc07e3a1642f" dependencies = [ "bitflags 1.3.2", "libc", @@ -1552,35 +1524,22 @@ dependencies = [ [[package]] name = "wasm-bindgen" -version = "0.2.100" +version = "0.2.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" +checksum = "6532f9a5c1ece3798cb1c2cfdba640b9b3ba884f5db45973a6f442510a87d38e" dependencies = [ "cfg-if", "once_cell", "rustversion", 
"wasm-bindgen-macro", -] - -[[package]] -name = "wasm-bindgen-backend" -version = "0.2.100" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" -dependencies = [ - "bumpalo", - "log", - "proc-macro2", - "quote", - "syn", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-macro" -version = "0.2.100" +version = "0.2.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" +checksum = "18a2d50fcf105fb33bb15f00e7a77b772945a2ee45dcf454961fd843e74c18e6" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -1588,22 +1547,22 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.100" +version = "0.2.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" +checksum = "03ce4caeaac547cdf713d280eda22a730824dd11e6b8c3ca9e42247b25c631e3" dependencies = [ + "bumpalo", "proc-macro2", "quote", "syn", - "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.100" +version = "0.2.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" +checksum = "75a326b8c223ee17883a4251907455a2431acc2791c98c26279376490c378c16" dependencies = [ "unicode-ident", ] @@ -1681,9 +1640,9 @@ dependencies = [ [[package]] name = "winnow" -version = "0.7.14" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a5364e9d77fcdeeaa6062ced926ee3381faa2ee02d3eb83a5c27a8825540829" +checksum = "09dac053f1cd375980747450bfc7250c264eaae0583872e845c0c7cd578872b5" dependencies = [ "memchr", ] @@ -1778,18 +1737,18 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.8.39" +version = "0.8.48" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "db6d35d663eadb6c932438e763b262fe1a70987f9ae936e60158176d710cae4a" +checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.39" +version = "0.8.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4122cd3169e94605190e77839c9a40d40ed048d305bfdc146e7df40ab0f3e517" +checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4" dependencies = [ "proc-macro2", "quote", diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index 888928af06..83dead0c9e 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -19,23 +19,23 @@ pvmemcontrol = [] arbitrary = "1.4.2" block = { path = "../block" } devices = { path = "../devices" } -epoll = "4.3.3" +epoll = "4.4.0" hypervisor = { path = "../hypervisor", features = ["mshv_emulator"] } -libc = "0.2.182" +libc = "0.2.184" libfuzzer-sys = "0.4.12" -linux-loader = { version = "0.13.1", features = ["bzimage", "elf", "pe"] } +linux-loader = { version = "0.13.2", features = ["bzimage", "elf", "pe"] } micro_http = { git = "https://github.com/firecracker-microvm/micro-http", branch = "main" } -mshv-bindings = "0.6.6" +mshv-bindings = "0.6.8" net_util = { path = "../net_util" } seccompiler = "0.5.0" virtio-devices = { path = "../virtio-devices" } -virtio-queue = "0.16.0" +virtio-queue = "0.17.0" vm-device = { path = "../vm-device" } -vm-memory = "0.16.0" +vm-memory = "0.17.1" vm-migration = { path = "../vm-migration" } vm-virtio = { path = "../vm-virtio" } vmm = { path = "../vmm", features = ["guest_debug"] } -vmm-sys-util = "0.14.0" +vmm-sys-util = "0.15.0" # Prevent this from interfering with workspaces [workspace] diff --git a/fuzz/fuzz_targets/balloon.rs b/fuzz/fuzz_targets/balloon.rs index b745cf6187..20260c5469 100644 --- a/fuzz/fuzz_targets/balloon.rs +++ b/fuzz/fuzz_targets/balloon.rs @@ -49,6 +49,7 @@ 
fuzz_target!(|bytes: &[u8]| -> Corpus { BALLOON_SIZE, true, true, + false, SeccompAction::Allow, EventFd::new(EFD_NONBLOCK).unwrap(), None, @@ -95,15 +96,16 @@ fuzz_target!(|bytes: &[u8]| -> Corpus { reporting_queue_evt.write(1).unwrap(); balloon - .activate( - guest_memory, - Arc::new(NoopVirtioInterrupt {}), - vec![ + .activate(virtio_devices::ActivationContext { + mem: guest_memory, + interrupt_cb: Arc::new(NoopVirtioInterrupt {}), + queues: vec![ (0, inflate_q, inflate_evt), (1, deflate_q, deflate_evt), (2, reporting_q, reporting_evt), ], - ) + device_status: Arc::new(std::sync::atomic::AtomicU8::new(0)), + }) .ok(); // Wait for the events to finish and balloon device worker thread to return @@ -118,6 +120,15 @@ impl VirtioInterrupt for NoopVirtioInterrupt { fn trigger(&self, _int_type: VirtioInterruptType) -> std::result::Result<(), std::io::Error> { Ok(()) } + + fn set_notifier( + &self, + _interrupt: u32, + _eventfd: Option, + _vm: &dyn hypervisor::Vm, + ) -> std::io::Result<()> { + unimplemented!() + } } macro_rules! 
align { diff --git a/fuzz/fuzz_targets/block.rs b/fuzz/fuzz_targets/block.rs index 7d1fbdf38f..abddc27b41 100644 --- a/fuzz/fuzz_targets/block.rs +++ b/fuzz/fuzz_targets/block.rs @@ -15,7 +15,8 @@ use std::path::PathBuf; use std::sync::Arc; use std::{ffi, io}; -use block::async_io::DiskFile; +use block::disk_file::DiskBackend; +use block::fcntl::LockGranularityChoice; use block::raw_sync::RawFileDiskSync; use libfuzzer_sys::{fuzz_target, Corpus}; use seccompiler::SeccompAction; @@ -51,11 +52,10 @@ fuzz_target!(|bytes: &[u8]| -> Corpus { // Create a virtio-block device backed by a synchronous raw file let shm = memfd_create(&ffi::CString::new("fuzz").unwrap(), 0).unwrap(); let disk_file: File = unsafe { File::from_raw_fd(shm) }; - let qcow_disk = Box::new(RawFileDiskSync::new(disk_file)) as Box; let queue_affinity = BTreeMap::new(); let mut block = Block::new( "tmp".to_owned(), - qcow_disk, + DiskBackend::Next(Box::new(RawFileDiskSync::new(disk_file))), PathBuf::from(""), false, false, @@ -69,6 +69,7 @@ fuzz_target!(|bytes: &[u8]| -> Corpus { queue_affinity, true, false, + LockGranularityChoice::default(), ) .unwrap(); @@ -89,11 +90,12 @@ fuzz_target!(|bytes: &[u8]| -> Corpus { queue_evt.write(1).unwrap(); block - .activate( - guest_memory, - Arc::new(NoopVirtioInterrupt {}), - vec![(0, q, evt)], - ) + .activate(virtio_devices::ActivationContext { + mem: guest_memory, + interrupt_cb: Arc::new(NoopVirtioInterrupt {}), + queues: vec![(0, q, evt)], + device_status: Arc::new(std::sync::atomic::AtomicU8::new(0)), + }) .ok(); // Wait for the events to finish and block device worker thread to return @@ -118,6 +120,15 @@ impl VirtioInterrupt for NoopVirtioInterrupt { fn trigger(&self, _int_type: VirtioInterruptType) -> std::result::Result<(), std::io::Error> { Ok(()) } + + fn set_notifier( + &self, + _interrupt: u32, + _eventfd: Option, + _vm: &dyn hypervisor::Vm, + ) -> std::io::Result<()> { + unimplemented!() + } } fn setup_virt_queue(bytes: &[u8; QUEUE_DATA_SIZE]) -> 
Queue { diff --git a/fuzz/fuzz_targets/console.rs b/fuzz/fuzz_targets/console.rs index 4b3a49df91..a335a96027 100644 --- a/fuzz/fuzz_targets/console.rs +++ b/fuzz/fuzz_targets/console.rs @@ -128,11 +128,12 @@ fuzz_target!(|bytes: &[u8]| -> Corpus { pipe_tx.write_all(console_input_bytes).unwrap(); // To use fuzzed data; console - .activate( - guest_memory, - Arc::new(NoopVirtioInterrupt {}), - vec![(0, input_queue, input_evt), (1, output_queue, output_evt)], - ) + .activate(virtio_devices::ActivationContext { + mem: guest_memory, + interrupt_cb: Arc::new(NoopVirtioInterrupt {}), + queues: vec![(0, input_queue, input_evt), (1, output_queue, output_evt)], + device_status: Arc::new(std::sync::atomic::AtomicU8::new(0)), + }) .unwrap(); // Wait for the events to finish and console device worker thread to return @@ -147,6 +148,15 @@ impl VirtioInterrupt for NoopVirtioInterrupt { fn trigger(&self, _int_type: VirtioInterruptType) -> std::result::Result<(), std::io::Error> { Ok(()) } + + fn set_notifier( + &self, + _interrupt: u32, + _eventfd: Option, + _vm: &dyn hypervisor::Vm, + ) -> std::io::Result<()> { + unimplemented!() + } } fn setup_virt_queues(bytes: &[&[u8; QUEUE_DATA_SIZE]], base_addr: u64) -> Vec { diff --git a/fuzz/fuzz_targets/http_api.rs b/fuzz/fuzz_targets/http_api.rs index ee9dd62f18..b7128a1678 100644 --- a/fuzz/fuzz_targets/http_api.rs +++ b/fuzz/fuzz_targets/http_api.rs @@ -137,6 +137,7 @@ impl RequestHandler for StubApiRequestHandler { affinity: None, features: CpuFeatures::default(), nested: true, + core_scheduling: CoreScheduling::default(), }, memory: MemoryConfig { size: 536_870_912, @@ -168,6 +169,7 @@ impl RequestHandler for StubApiRequestHandler { }, balloon: None, fs: None, + generic_vhost_user: None, pmem: None, serial: ConsoleConfig { file: None, @@ -254,6 +256,13 @@ impl RequestHandler for StubApiRequestHandler { Ok(None) } + fn vm_add_generic_vhost_user( + &mut self, + _: GenericVhostUserConfig, + ) -> Result>, VmError> { + Ok(None) + } + fn 
vm_add_pmem(&mut self, _: PmemConfig) -> Result>, VmError> { Ok(None) } diff --git a/fuzz/fuzz_targets/iommu.rs b/fuzz/fuzz_targets/iommu.rs index 8c9f26b262..11600a36a7 100644 --- a/fuzz/fuzz_targets/iommu.rs +++ b/fuzz/fuzz_targets/iommu.rs @@ -107,14 +107,15 @@ fuzz_target!(|bytes: &[u8]| -> Corpus { request_queue_evt.write(1).unwrap(); iommu - .activate( - guest_memory, - Arc::new(NoopVirtioInterrupt {}), - vec![ + .activate(virtio_devices::ActivationContext { + mem: guest_memory, + interrupt_cb: Arc::new(NoopVirtioInterrupt {}), + queues: vec![ (0, request_queue, request_evt), (0, _event_queue, _event_evt), ], - ) + device_status: Arc::new(std::sync::atomic::AtomicU8::new(0)), + }) .ok(); // Wait for the events to finish and vIOMMU device worker thread to return @@ -129,6 +130,15 @@ impl VirtioInterrupt for NoopVirtioInterrupt { fn trigger(&self, _int_type: VirtioInterruptType) -> std::result::Result<(), std::io::Error> { Ok(()) } + + fn set_notifier( + &self, + _interrupt: u32, + _eventfd: Option, + _vm: &dyn hypervisor::Vm, + ) -> std::io::Result<()> { + unimplemented!() + } } fn setup_virt_queue(bytes: &[u8; QUEUE_DATA_SIZE]) -> Queue { diff --git a/fuzz/fuzz_targets/mem.rs b/fuzz/fuzz_targets/mem.rs index 57fc9a91dd..e430e195aa 100644 --- a/fuzz/fuzz_targets/mem.rs +++ b/fuzz/fuzz_targets/mem.rs @@ -105,11 +105,12 @@ fuzz_target!(|bytes: &[u8]| -> Corpus { queue_evt.write(1).unwrap(); virtio_mem - .activate( - guest_memory, - Arc::new(NoopVirtioInterrupt {}), - vec![(0, q, evt)], - ) + .activate(virtio_devices::ActivationContext { + mem: guest_memory, + interrupt_cb: Arc::new(NoopVirtioInterrupt {}), + queues: vec![(0, q, evt)], + device_status: Arc::new(std::sync::atomic::AtomicU8::new(0)), + }) .ok(); // Wait for the events to finish and virtio-mem device worker thread to return @@ -124,6 +125,15 @@ impl VirtioInterrupt for NoopVirtioInterrupt { fn trigger(&self, _int_type: VirtioInterruptType) -> std::result::Result<(), std::io::Error> { Ok(()) } + + fn 
set_notifier( + &self, + _interrupt: u32, + _eventfd: Option, + _vm: &dyn hypervisor::Vm, + ) -> std::io::Result<()> { + unimplemented!() + } } // Create a dummy virtio-mem device for fuzzing purpose only diff --git a/fuzz/fuzz_targets/net.rs b/fuzz/fuzz_targets/net.rs index 30968d2a47..55c98bdcfc 100644 --- a/fuzz/fuzz_targets/net.rs +++ b/fuzz/fuzz_targets/net.rs @@ -64,7 +64,7 @@ fuzz_target!(|bytes: &[u8]| -> Corpus { } let (dummy_tap_frontend, dummy_tap_backend) = create_socketpair().unwrap(); - let if_name = "fuzzer_tap_name".as_bytes().to_vec(); + let if_name = "fuzzer_tap_name"; let tap = net_util::Tap::new_for_fuzzing(dummy_tap_frontend, if_name); let mut net = virtio_devices::Net::new_with_tap( @@ -143,11 +143,12 @@ fuzz_target!(|bytes: &[u8]| -> Corpus { input_queue_evt.write(1).unwrap(); output_queue_evt.write(1).unwrap(); - net.activate( - guest_memory, - Arc::new(NoopVirtioInterrupt {}), - vec![(0, input_queue, input_evt), (1, output_queue, output_evt)], - ) + net.activate(virtio_devices::ActivationContext { + mem: guest_memory, + interrupt_cb: Arc::new(NoopVirtioInterrupt {}), + queues: vec![(0, input_queue, input_evt), (1, output_queue, output_evt)], + device_status: Arc::new(std::sync::atomic::AtomicU8::new(0)), + }) .unwrap(); // Wait for the events to finish and net device worker thread to return @@ -165,6 +166,15 @@ impl VirtioInterrupt for NoopVirtioInterrupt { fn trigger(&self, _int_type: VirtioInterruptType) -> std::result::Result<(), std::io::Error> { Ok(()) } + + fn set_notifier( + &self, + _interrupt: u32, + _eventfd: Option, + _vm: &dyn hypervisor::Vm, + ) -> std::io::Result<()> { + unimplemented!() + } } fn setup_virt_queues(bytes: &[&[u8; QUEUE_DATA_SIZE]], base_addr: u64) -> Vec { diff --git a/fuzz/fuzz_targets/pmem.rs b/fuzz/fuzz_targets/pmem.rs index a8fcb7a774..37eabf86cd 100644 --- a/fuzz/fuzz_targets/pmem.rs +++ b/fuzz/fuzz_targets/pmem.rs @@ -61,11 +61,12 @@ fuzz_target!(|bytes: &[u8]| -> Corpus { // Kick the 'queue' event before 
activate the pmem device queue_evt.write(1).unwrap(); - pmem.activate( - guest_memory, - Arc::new(NoopVirtioInterrupt {}), - vec![(0, q, evt)], - ) + pmem.activate(virtio_devices::ActivationContext { + mem: guest_memory, + interrupt_cb: Arc::new(NoopVirtioInterrupt {}), + queues: vec![(0, q, evt)], + device_status: Arc::new(std::sync::atomic::AtomicU8::new(0)), + }) .ok(); // Wait for the events to finish and pmem device worker thread to return @@ -94,6 +95,15 @@ impl VirtioInterrupt for NoopVirtioInterrupt { fn trigger(&self, _int_type: VirtioInterruptType) -> std::result::Result<(), std::io::Error> { Ok(()) } + + fn set_notifier( + &self, + _interrupt: u32, + _eventfd: Option, + _vm: &dyn hypervisor::Vm, + ) -> std::io::Result<()> { + unimplemented!() + } } // Create a dummy virtio-pmem device for fuzzing purpose only diff --git a/fuzz/fuzz_targets/rng.rs b/fuzz/fuzz_targets/rng.rs index 8d5ffe35b3..c3029f33b4 100644 --- a/fuzz/fuzz_targets/rng.rs +++ b/fuzz/fuzz_targets/rng.rs @@ -99,11 +99,12 @@ fuzz_target!(|bytes: &[u8]| -> Corpus { // Kick the 'queue' event before activate the rng device queue_evt.write(1).unwrap(); - rng.activate( - guest_memory, - Arc::new(NoopVirtioInterrupt {}), - vec![(0, q, evt)], - ) + rng.activate(virtio_devices::ActivationContext { + mem: guest_memory, + interrupt_cb: Arc::new(NoopVirtioInterrupt {}), + queues: vec![(0, q, evt)], + device_status: Arc::new(std::sync::atomic::AtomicU8::new(0)), + }) .ok(); // Wait for the events to finish and rng device worker thread to return @@ -118,6 +119,15 @@ impl VirtioInterrupt for NoopVirtioInterrupt { fn trigger(&self, _int_type: VirtioInterruptType) -> std::result::Result<(), std::io::Error> { Ok(()) } + + fn set_notifier( + &self, + _interrupt: u32, + _eventfd: Option, + _vm: &dyn hypervisor::Vm, + ) -> std::io::Result<()> { + unimplemented!() + } } fn setup_virt_queue(bytes: &[u8; QUEUE_DATA_SIZE]) -> Queue { diff --git a/fuzz/fuzz_targets/vsock.rs b/fuzz/fuzz_targets/vsock.rs index 
144b8b4057..559f2ec138 100644 --- a/fuzz/fuzz_targets/vsock.rs +++ b/fuzz/fuzz_targets/vsock.rs @@ -108,11 +108,12 @@ fuzz_target!(|bytes: &[u8]| -> Corpus { .unwrap(); vsock - .activate( - guest_memory, - Arc::new(NoopVirtioInterrupt {}), - vec![(0, q, evt)], - ) + .activate(virtio_devices::ActivationContext { + mem: guest_memory, + interrupt_cb: Arc::new(NoopVirtioInterrupt {}), + queues: vec![(0, q, evt)], + device_status: Arc::new(std::sync::atomic::AtomicU8::new(0)), + }) .ok(); // Wait for the events to finish and vsock device worker thread to return @@ -127,6 +128,15 @@ impl VirtioInterrupt for NoopVirtioInterrupt { fn trigger(&self, _int_type: VirtioInterruptType) -> std::result::Result<(), std::io::Error> { Ok(()) } + + fn set_notifier( + &self, + _interrupt: u32, + _eventfd: Option, + _vm: &dyn hypervisor::Vm, + ) -> std::io::Result<()> { + unimplemented!() + } } fn setup_virt_queue(bytes: &[u8; QUEUE_DATA_SIZE]) -> Queue { diff --git a/fuzz/fuzz_targets/watchdog.rs b/fuzz/fuzz_targets/watchdog.rs index f203a228f9..60f4afab55 100644 --- a/fuzz/fuzz_targets/watchdog.rs +++ b/fuzz/fuzz_targets/watchdog.rs @@ -64,11 +64,12 @@ fuzz_target!(|bytes: &[u8]| -> Corpus { queue_evt.write(1).unwrap(); watchdog - .activate( - guest_memory, - Arc::new(NoopVirtioInterrupt {}), - vec![(0, q, evt)], - ) + .activate(virtio_devices::ActivationContext { + mem: guest_memory, + interrupt_cb: Arc::new(NoopVirtioInterrupt {}), + queues: vec![(0, q, evt)], + device_status: Arc::new(std::sync::atomic::AtomicU8::new(0)), + }) .ok(); // Wait for the events to finish and watchdog device worker thread to return @@ -83,6 +84,15 @@ impl VirtioInterrupt for NoopVirtioInterrupt { fn trigger(&self, _int_type: VirtioInterruptType) -> std::result::Result<(), std::io::Error> { Ok(()) } + + fn set_notifier( + &self, + _interrupt: u32, + _eventfd: Option, + _vm: &dyn hypervisor::Vm, + ) -> std::io::Result<()> { + unimplemented!() + } } fn setup_virt_queue(bytes: &[u8; QUEUE_DATA_SIZE]) -> 
Queue { diff --git a/hypervisor/Cargo.toml b/hypervisor/Cargo.toml index e18f9ba390..19a9ca794d 100644 --- a/hypervisor/Cargo.toml +++ b/hypervisor/Cargo.toml @@ -3,6 +3,7 @@ authors = ["Microsoft Authors"] edition.workspace = true license = "Apache-2.0 OR BSD-3-Clause" name = "hypervisor" +rust-version.workspace = true version = "0.1.0" [features] @@ -14,7 +15,7 @@ tdx = [] [dependencies] anyhow = { workspace = true } -arc-swap = "1.8.2" +arc-swap = "1.9.1" bitfield-struct = "0.12.0" byteorder = { workspace = true } cfg-if = { workspace = true } diff --git a/hypervisor/src/arch/x86/mod.rs b/hypervisor/src/arch/x86/mod.rs index 78e4d7cc5d..45f820cde7 100644 --- a/hypervisor/src/arch/x86/mod.rs +++ b/hypervisor/src/arch/x86/mod.rs @@ -28,6 +28,36 @@ pub mod msr_index; // MTRR constants pub const MTRR_ENABLE: u64 = 0x800; // IA32_MTRR_DEF_TYPE MSR: E (MTRRs enabled) flag, bit 11 pub const MTRR_MEM_TYPE_WB: u64 = 0x6; +pub const MTRR_MSR_INDICES: [u32; 28] = [ + msr_index::MSR_MTRRdefType, + msr_index::MSR_IA32_MTRR_PHYSBASE0, + msr_index::MSR_IA32_MTRR_PHYSMASK0, + msr_index::MSR_IA32_MTRR_PHYSBASE1, + msr_index::MSR_IA32_MTRR_PHYSMASK1, + msr_index::MSR_IA32_MTRR_PHYSBASE2, + msr_index::MSR_IA32_MTRR_PHYSMASK2, + msr_index::MSR_IA32_MTRR_PHYSBASE3, + msr_index::MSR_IA32_MTRR_PHYSMASK3, + msr_index::MSR_IA32_MTRR_PHYSBASE4, + msr_index::MSR_IA32_MTRR_PHYSMASK4, + msr_index::MSR_IA32_MTRR_PHYSBASE5, + msr_index::MSR_IA32_MTRR_PHYSMASK5, + msr_index::MSR_IA32_MTRR_PHYSBASE6, + msr_index::MSR_IA32_MTRR_PHYSMASK6, + msr_index::MSR_IA32_MTRR_PHYSBASE7, + msr_index::MSR_IA32_MTRR_PHYSMASK7, + msr_index::MSR_MTRRfix64K_00000, + msr_index::MSR_MTRRfix16K_80000, + msr_index::MSR_MTRRfix16K_A0000, + msr_index::MSR_MTRRfix4K_C0000, + msr_index::MSR_MTRRfix4K_C8000, + msr_index::MSR_MTRRfix4K_D0000, + msr_index::MSR_MTRRfix4K_D8000, + msr_index::MSR_MTRRfix4K_E0000, + msr_index::MSR_MTRRfix4K_E8000, + msr_index::MSR_MTRRfix4K_F0000, + msr_index::MSR_MTRRfix4K_F8000, +]; // 
IOAPIC pins pub const NUM_IOAPIC_PINS: usize = 24; diff --git a/hypervisor/src/arch/x86/msr_index.rs b/hypervisor/src/arch/x86/msr_index.rs index 810fe08b9a..607ee3b2c1 100644 --- a/hypervisor/src/arch/x86/msr_index.rs +++ b/hypervisor/src/arch/x86/msr_index.rs @@ -85,6 +85,22 @@ pub const MSR_IA32_RTIT_ADDR3_B: ::std::os::raw::c_uint = 0x00000587; pub const MSR_IA32_RTIT_CR3_MATCH: ::std::os::raw::c_uint = 0x00000572; pub const MSR_IA32_RTIT_OUTPUT_BASE: ::std::os::raw::c_uint = 0x00000560; pub const MSR_IA32_RTIT_OUTPUT_MASK: ::std::os::raw::c_uint = 0x00000561; +pub const MSR_IA32_MTRR_PHYSBASE0: ::std::os::raw::c_uint = 0x00000200; +pub const MSR_IA32_MTRR_PHYSMASK0: ::std::os::raw::c_uint = 0x00000201; +pub const MSR_IA32_MTRR_PHYSBASE1: ::std::os::raw::c_uint = 0x00000202; +pub const MSR_IA32_MTRR_PHYSMASK1: ::std::os::raw::c_uint = 0x00000203; +pub const MSR_IA32_MTRR_PHYSBASE2: ::std::os::raw::c_uint = 0x00000204; +pub const MSR_IA32_MTRR_PHYSMASK2: ::std::os::raw::c_uint = 0x00000205; +pub const MSR_IA32_MTRR_PHYSBASE3: ::std::os::raw::c_uint = 0x00000206; +pub const MSR_IA32_MTRR_PHYSMASK3: ::std::os::raw::c_uint = 0x00000207; +pub const MSR_IA32_MTRR_PHYSBASE4: ::std::os::raw::c_uint = 0x00000208; +pub const MSR_IA32_MTRR_PHYSMASK4: ::std::os::raw::c_uint = 0x00000209; +pub const MSR_IA32_MTRR_PHYSBASE5: ::std::os::raw::c_uint = 0x0000020a; +pub const MSR_IA32_MTRR_PHYSMASK5: ::std::os::raw::c_uint = 0x0000020b; +pub const MSR_IA32_MTRR_PHYSBASE6: ::std::os::raw::c_uint = 0x0000020c; +pub const MSR_IA32_MTRR_PHYSMASK6: ::std::os::raw::c_uint = 0x0000020d; +pub const MSR_IA32_MTRR_PHYSBASE7: ::std::os::raw::c_uint = 0x0000020e; +pub const MSR_IA32_MTRR_PHYSMASK7: ::std::os::raw::c_uint = 0x0000020f; pub const MSR_MTRRfix64K_00000: ::std::os::raw::c_uint = 0x00000250; pub const MSR_MTRRfix16K_80000: ::std::os::raw::c_uint = 0x00000258; pub const MSR_MTRRfix16K_A0000: ::std::os::raw::c_uint = 0x00000259; diff --git a/hypervisor/src/cpu.rs 
b/hypervisor/src/cpu.rs index 4bc348a98d..044c81a2e8 100644 --- a/hypervisor/src/cpu.rs +++ b/hypervisor/src/cpu.rs @@ -587,10 +587,15 @@ pub trait Vcpu: Send + Sync { ) -> Result<[u32; 4]> { unimplemented!() } - #[cfg(feature = "mshv")] - fn set_sev_control_register(&self, _reg: u64) -> Result<()> { + #[cfg(feature = "sev_snp")] + fn set_sev_control_register(&self, _vmsa_pfn: u64) -> Result<()> { + unimplemented!() + } + #[cfg(feature = "sev_snp")] + fn setup_sev_snp_regs(&self, _vmsa: igvm::snp_defs::SevVmsa) -> Result<()> { unimplemented!() } + /// /// Sets the value of GIC redistributor address /// diff --git a/hypervisor/src/hypervisor.rs b/hypervisor/src/hypervisor.rs index a25f8a9bf7..05852a230f 100644 --- a/hypervisor/src/hypervisor.rs +++ b/hypervisor/src/hypervisor.rs @@ -96,6 +96,11 @@ pub enum HypervisorError { #[cfg(target_arch = "x86_64")] #[error("Failed to enable AMX tile state components")] CouldNotEnableAmxStateComponents(#[source] crate::arch::x86::AmxGuestSupportError), + /// + /// Failed to retrieve SEV-SNP capabilities + /// + #[error("Failed to retrieve SEV-SNP capabilities:{0}")] + SevSnpCapabilities(#[source] anyhow::Error), } /// diff --git a/hypervisor/src/kvm/mod.rs b/hypervisor/src/kvm/mod.rs index 259009151e..2f7137d55a 100644 --- a/hypervisor/src/kvm/mod.rs +++ b/hypervisor/src/kvm/mod.rs @@ -14,7 +14,10 @@ use std::any::Any; use std::collections::HashMap; #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] use std::mem::offset_of; -#[cfg(feature = "tdx")] +#[cfg(feature = "sev_snp")] +use std::os::fd::FromRawFd; +use std::os::fd::OwnedFd; +#[cfg(any(feature = "sev_snp", feature = "tdx"))] use std::os::unix::io::AsRawFd; #[cfg(feature = "tdx")] use std::os::unix::io::RawFd; @@ -26,9 +29,14 @@ use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::{Arc, RwLock}; use anyhow::anyhow; +#[cfg(feature = "sev_snp")] +use kvm_bindings::kvm_create_guest_memfd; use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd}; +#[cfg(feature = 
"sev_snp")] +use log::debug; #[cfg(target_arch = "x86_64")] use log::warn; +use vmm_sys_util::errno; use vmm_sys_util::eventfd::EventFd; #[cfg(target_arch = "aarch64")] @@ -50,8 +58,6 @@ pub use crate::riscv64::{ }; #[cfg(target_arch = "riscv64")] use crate::riscv64_reg_id; -use crate::vm::{self, InterruptSourceConfig, VmOps}; -use crate::{HypervisorType, HypervisorVmConfig, cpu, hypervisor}; // x86_64 dependencies #[cfg(target_arch = "x86_64")] pub mod x86_64; @@ -70,9 +76,15 @@ pub use x86_64::{CpuId, ExtendedControlRegisters, MsrEntries, VcpuKvmState}; use crate::ClockData; #[cfg(target_arch = "x86_64")] use crate::arch::x86::{ - CpuIdEntry, FpuState, LapicState, MsrEntry, NUM_IOAPIC_PINS, SpecialRegisters, XsaveState, + CpuIdEntry, FpuState, LapicState, MTRR_MSR_INDICES, MsrEntry, NUM_IOAPIC_PINS, + SpecialRegisters, XsaveState, +}; +use crate::{ + CpuState, HypervisorType, HypervisorVmConfig, InterruptSourceConfig, IoEventAddress, + IrqRoutingEntry, MpState, StandardRegisters, USER_MEMORY_REGION_GUEST_MEMFD, + USER_MEMORY_REGION_LOG_DIRTY, USER_MEMORY_REGION_READ, USER_MEMORY_REGION_WRITE, + UserMemoryRegion, VmOps, cpu, hypervisor, vm, }; -use crate::{CpuState, IoEventAddress, IrqRoutingEntry, MpState, StandardRegisters}; // aarch64 dependencies #[cfg(target_arch = "aarch64")] pub mod aarch64; @@ -82,6 +94,8 @@ pub mod riscv64; #[cfg(target_arch = "aarch64")] use std::mem; +#[cfg(target_arch = "x86_64")] +use kvm_bindings::KVM_X86_DEFAULT_VM; /// /// Export generically-named wrappers of kvm-bindings for Unix-based platforms /// @@ -90,11 +104,12 @@ pub use kvm_bindings::kvm_vcpu_events as VcpuEvents; #[cfg(target_arch = "x86_64")] use kvm_bindings::nested::KvmNestedStateBuffer; pub use kvm_bindings::{ - KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, KVM_IRQ_ROUTING_IRQCHIP, KVM_IRQ_ROUTING_MSI, - KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID, kvm_clock_data, - kvm_create_device, kvm_create_device as CreateDevice, kvm_device_attr as DeviceAttr, 
- kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_guest_debug, kvm_irq_routing, kvm_irq_routing_entry, - kvm_mp_state, kvm_run, kvm_userspace_memory_region, + self, KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, KVM_IRQ_ROUTING_IRQCHIP, + KVM_IRQ_ROUTING_MSI, KVM_MEM_GUEST_MEMFD, KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, + KVM_MSI_VALID_DEVID, kvm_clock_data, kvm_create_device, kvm_create_device as CreateDevice, + kvm_device_attr as DeviceAttr, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_guest_debug, + kvm_irq_routing, kvm_irq_routing_entry, kvm_mp_state, kvm_run, kvm_userspace_memory_region, + kvm_userspace_memory_region2, }; #[cfg(target_arch = "aarch64")] use kvm_bindings::{ @@ -106,17 +121,16 @@ use kvm_bindings::{ #[cfg(target_arch = "riscv64")] use kvm_bindings::{KVM_REG_RISCV_CORE, kvm_riscv_core}; #[cfg(feature = "tdx")] -use kvm_bindings::{KVM_X86_DEFAULT_VM, KVM_X86_SW_PROTECTED_VM, KVMIO, kvm_run__bindgen_ty_1}; +use kvm_bindings::{KVM_X86_SW_PROTECTED_VM, KVMIO, kvm_run__bindgen_ty_1}; #[cfg(target_arch = "x86_64")] use kvm_bindings::{Xsave as xsave2, kvm_xsave2}; -pub use kvm_ioctls::{Cap, Kvm, VcpuExit}; +pub use kvm_ioctls::{self, Cap, Kvm, VcpuExit}; use thiserror::Error; use vfio_ioctls::VfioDeviceFd; #[cfg(target_arch = "x86_64")] use vmm_sys_util::{fam::FamStruct, ioctl_io_nr}; #[cfg(feature = "tdx")] use vmm_sys_util::{ioctl::ioctl_with_val, ioctl_iowr_nr}; -pub use {kvm_bindings, kvm_ioctls}; #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] use crate::RegList; @@ -128,6 +142,64 @@ use crate::kvm::x86_64::XsaveStateError; #[cfg(target_arch = "x86_64")] ioctl_io_nr!(KVM_NMI, kvm_bindings::KVMIO, 0x9a); +#[cfg(feature = "sev_snp")] +use igvm_defs::PAGE_SIZE_4K; +#[cfg(feature = "sev_snp")] +use kvm_bindings::{ + KVM_MEMORY_ATTRIBUTE_PRIVATE, KVM_X86_SNP_VM, kvm_memory_attributes, kvm_segment as Segment, +}; +use vm_memory::GuestAddress; +#[cfg(feature = "sev_snp")] +use x86_64::sev; + +// Hardcoded GPA of a bootloader and VMSA page for KVM +// 
TODO: Derive these from the IGVM file's PageData/SnpVpContext directives +// instead of using fixed constants, to support arbitrary bootloader layouts. +pub const BOOTLOADER_START: GuestAddress = GuestAddress(0xffc0_0000); +pub const BOOTLOADER_SIZE: usize = 0x40_0000; // 4 MiB +pub const KVM_VMSA_PAGE_ADDRESS: GuestAddress = GuestAddress(0xffff_ffff_f000); +pub const KVM_VMSA_PAGE_SIZE: usize = 0x1000; // 4 KiB + +#[cfg(feature = "sev_snp")] +#[bitfield_struct::bitfield(u32)] +#[derive(PartialEq, Eq)] +/// AMD VMCB segment attributes +/// linux/arch/x86/include/asm/svm.h +pub struct SegAccess { + #[bits(4)] + pub seg_type: u8, + pub s_code_data: bool, + #[bits(2)] + pub priv_level: u8, + pub present: bool, + pub available: bool, + pub l_64bit: bool, + pub db_size_32: bool, + pub granularity: bool, + #[bits(20)] + _reserved: u32, +} + +#[cfg(feature = "sev_snp")] +fn make_segment(sev_selector: igvm::snp_defs::SevSelector) -> Segment { + let flags = SegAccess::from_bits(sev_selector.attrib.into()); + Segment { + base: sev_selector.base, + limit: sev_selector.limit, + selector: sev_selector.selector, + type_: flags.seg_type(), + s: flags.s_code_data() as u8, + dpl: flags.priv_level(), + present: flags.present() as u8, + avl: flags.available() as u8, + db: flags.db_size_32() as u8, + g: flags.granularity() as u8, + l: flags.l_64bit() as u8, + unusable: 0, + ..Default::default() + } +} + #[cfg(feature = "tdx")] const KVM_EXIT_TDX: u32 = 50; #[cfg(feature = "tdx")] @@ -238,6 +310,61 @@ pub struct KvmTdxExitVmcall { pub out_rdx: u64, } +impl From for UserMemoryRegion { + fn from(region: kvm_userspace_memory_region2) -> Self { + let mut flags = USER_MEMORY_REGION_READ; + if region.flags & KVM_MEM_READONLY == 0 { + flags |= USER_MEMORY_REGION_WRITE; + } + if region.flags & KVM_MEM_LOG_DIRTY_PAGES != 0 { + flags |= USER_MEMORY_REGION_LOG_DIRTY; + } + if region.flags & KVM_MEM_GUEST_MEMFD != 0 { + flags |= USER_MEMORY_REGION_GUEST_MEMFD; + } + + UserMemoryRegion { + slot: 
region.slot, + guest_phys_addr: region.guest_phys_addr, + memory_size: region.memory_size, + userspace_addr: region.userspace_addr, + flags, + guest_memfd: Some(region.guest_memfd), + guest_memfd_offset: Some(region.guest_memfd_offset), + } + } +} + +impl From for kvm_userspace_memory_region2 { + fn from(region: UserMemoryRegion) -> Self { + assert!( + region.flags & USER_MEMORY_REGION_READ != 0, + "KVM mapped memory is always readable" + ); + + let mut flags = 0; + if region.flags & USER_MEMORY_REGION_WRITE == 0 { + flags |= KVM_MEM_READONLY; + } + if region.flags & USER_MEMORY_REGION_LOG_DIRTY != 0 { + flags |= KVM_MEM_LOG_DIRTY_PAGES; + } + if region.flags & USER_MEMORY_REGION_GUEST_MEMFD != 0 { + flags |= KVM_MEM_GUEST_MEMFD; + } + + kvm_userspace_memory_region2 { + slot: region.slot, + guest_phys_addr: region.guest_phys_addr, + memory_size: region.memory_size, + userspace_addr: region.userspace_addr, + flags, + guest_memfd: region.guest_memfd.unwrap_or(0), + guest_memfd_offset: region.guest_memfd_offset.unwrap_or(0), + ..Default::default() + } + } +} impl From for MpState { fn from(s: kvm_mp_state) -> Self { MpState::Kvm(s) @@ -424,6 +551,9 @@ struct KvmDirtyLogSlot { guest_phys_addr: u64, memory_size: u64, userspace_addr: u64, + // Following fields are used by kvm_userspace_memory_region2. + guest_memfd_offset: u64, + guest_memfd: u32, } /// Wrapper over KVM VM ioctls. @@ -431,7 +561,10 @@ pub struct KvmVm { fd: Arc, #[cfg(target_arch = "x86_64")] msrs: Vec, - dirty_log_slots: Arc>>, + #[cfg(all(feature = "sev_snp", target_arch = "x86_64"))] + sev_fd: Option, + dirty_log_slots: RwLock>, + guest_memfds: Option>>, } impl KvmVm { @@ -494,6 +627,47 @@ impl KvmVm { fn translate_msi_ext_dest_id(address_lo: u32, address_hi: u32) -> (u32, u32) { (address_lo, address_hi) } + + /// Set user memory region to use guest_memfd when available. 
+ /// guest_memfd is available on host linux kernel v6.8+ + /// + /// # Safety + /// + /// `region.userspace_addr` must point to `region.memory_size` bytes of + /// memory that will stay mapped until the slot is removed via + /// `remove_user_memory_region`. The memory region must + /// be uniquely owned by the caller, as mapping it into the guest + /// effectively creates a long-lived mutable reference. + unsafe fn set_user_memory_region( + &self, + region: kvm_userspace_memory_region2, + ) -> Result<(), errno::Error> { + if self.guest_memfds.is_some() { + // SAFETY: Safe as the caller guarantees that region is safe to map + // the guest and is non-overlapping. + unsafe { self.fd.set_user_memory_region2(region) } + } else { + // SAFETY: Safe because guest regions are guaranteed not to overlap. + unsafe { + self.fd.set_user_memory_region(kvm_userspace_memory_region { + slot: region.slot, + guest_phys_addr: region.guest_phys_addr, + userspace_addr: region.userspace_addr, + flags: region.flags, + memory_size: region.memory_size, + }) + } + } + } + + /// Get flag for kvm_userspace_memory_region based on memfd support. 
+ fn get_kvm_userspace_memory_region_flag(&self, flag: u32) -> u32 { + flag | if self.guest_memfds.is_some() { + KVM_MEM_GUEST_MEMFD + } else { + 0 + } + } } /// Implementation of Vm trait for KVM @@ -509,6 +683,72 @@ impl KvmVm { /// let vm = hypervisor.create_vm(HypervisorVmConfig::default()).expect("new VM fd creation failed"); /// ``` impl vm::Vm for KvmVm { + #[cfg(all(feature = "sev_snp", target_arch = "x86_64"))] + fn sev_snp_init(&self, guest_policy: igvm_defs::SnpPolicy) -> vm::Result<()> { + self.sev_fd + .as_ref() + .unwrap() + .launch_start(&self.fd, guest_policy) + .map_err(|e| vm::HypervisorVmError::InitializeSevSnp(e.into())) + } + + #[cfg(all(feature = "sev_snp", target_arch = "x86_64"))] + fn import_isolated_pages( + &self, + page_type: u32, + page_size: u32, + // host page frame numbers + pfns: &[u64], + uaddrs: &[u64], + ) -> vm::Result<()> { + if pfns.is_empty() { + return Ok(()); + } + assert_eq!(pfns.len(), uaddrs.len()); + // VMSA pages are not supported by launch_update + // https://elixir.bootlin.com/linux/v6.11/source/arch/x86/kvm/svm/sev.c#L2377 + if page_type == sev::SNP_PAGE_TYPE_VMSA { + return Ok(()); + } + for i in 0..pfns.len() { + self.fd + .set_memory_attributes(kvm_memory_attributes { + address: pfns[i] << sev::GPA_METADATA_SHIFT_OFFSET, + size: page_size as u64, + attributes: kvm_bindings::KVM_MEMORY_ATTRIBUTE_PRIVATE as u64, + // Flags must be zero o/w error (flags aren't being used here yet) + flags: 0, + }) + .map_err(|e| vm::HypervisorVmError::ImportIsolatedPages(e.into()))?; + self.sev_fd + .as_ref() + .unwrap() + .launch_update(&self.fd, uaddrs[i], page_size as u64, pfns[i], page_type) + .map_err(|e| vm::HypervisorVmError::ImportIsolatedPages(e.into()))?; + } + + Ok(()) + } + + #[cfg(all(feature = "sev_snp", target_arch = "x86_64"))] + fn complete_isolated_import( + &self, + snp_id_block: igvm_defs::IGVM_VHS_SNP_ID_BLOCK, + host_data: [u8; 32], + id_block_enabled: u8, + ) -> vm::Result<()> { + self.sev_fd + .as_ref() + 
.unwrap() + .launch_finish( + &self.fd, + host_data, + id_block_enabled, + snp_id_block.author_key_enabled, + ) + .map_err(|e| vm::HypervisorVmError::CompleteIsolatedImport(e.into())) + } + #[cfg(target_arch = "x86_64")] /// /// Sets the address of the one-page region in the VM's address space. @@ -589,6 +829,8 @@ impl vm::Vm for KvmVm { hyperv_synic: AtomicBool::new(false), #[cfg(target_arch = "x86_64")] xsave_size, + #[cfg(feature = "sev_snp")] + vm_fd: self.fd.clone(), }; Ok(Box::new(vcpu)) } @@ -759,14 +1001,43 @@ impl vm::Vm for KvmVm { const _: () = assert!(core::mem::size_of::() <= core::mem::size_of::()); - let mut region = kvm_userspace_memory_region { + // Create a per-region guest_memfd when supported. + // Each region gets its own fd sized exactly to memory_size + #[cfg(feature = "sev_snp")] + let guest_memfd = if let Some(memfds) = &self.guest_memfds { + // SAFETY: Safe because guest regions are guaranteed not to overlap. + let fd = unsafe { + OwnedFd::from_raw_fd( + self.fd + .create_guest_memfd(kvm_create_guest_memfd { + size: memory_size as u64, + ..Default::default() + }) + .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into()))?, + ) + }; + let raw_fd = fd.as_raw_fd() as u32; + memfds.write().unwrap().insert(slot, fd); + raw_fd + } else { + 0 + }; + #[cfg(not(feature = "sev_snp"))] + let guest_memfd = 0; + + let mut region = kvm_userspace_memory_region2 { slot, + flags: self.get_kvm_userspace_memory_region_flag(flags), guest_phys_addr, memory_size: memory_size as u64, userspace_addr: userspace_addr as usize as u64, - flags, + #[cfg(not(target_arch = "riscv64"))] + guest_memfd, + // Each guest_memfd is per-region and sized to memory_size, + // so the region's data always starts at offset 0. 
+ guest_memfd_offset: 0, + ..Default::default() }; - if (region.flags & KVM_MEM_LOG_DIRTY_PAGES) != 0 { if (region.flags & KVM_MEM_READONLY) != 0 { return Err(vm::HypervisorVmError::CreateUserMemory(anyhow!( @@ -782,20 +1053,34 @@ impl vm::Vm for KvmVm { guest_phys_addr: region.guest_phys_addr, memory_size: region.memory_size, userspace_addr: region.userspace_addr, + guest_memfd_offset: region.guest_memfd_offset, + guest_memfd: region.guest_memfd, }, ); // Always create guest physical memory region without `KVM_MEM_LOG_DIRTY_PAGES`. // For regions that need this flag, dirty pages log will be turned on in `start_dirty_log`. - region.flags = 0; + region.flags = self.get_kvm_userspace_memory_region_flag(0); } // SAFETY: Safe because caller promised this is safe. unsafe { + self.set_user_memory_region(region) + .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into()))?; + } + + #[cfg(feature = "sev_snp")] + if self.guest_memfds.is_some() { self.fd - .set_user_memory_region(region) - .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into())) + .set_memory_attributes(kvm_memory_attributes { + address: region.guest_phys_addr, + size: region.memory_size, + attributes: KVM_MEMORY_ATTRIBUTE_PRIVATE as u64, + flags: 0, + }) + .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into()))?; } + Ok(()) } /// Removes a guest physical memory region. @@ -823,12 +1108,13 @@ impl vm::Vm for KvmVm { const _: () = assert!(core::mem::size_of::() <= core::mem::size_of::()); - let mut region = kvm_userspace_memory_region { + let mut region = kvm_userspace_memory_region2 { slot, guest_phys_addr, memory_size: memory_size as u64, userspace_addr: userspace_addr as usize as u64, flags, + ..Default::default() }; // Remove the corresponding entry from "self.dirty_log_slots" if needed @@ -838,10 +1124,16 @@ impl vm::Vm for KvmVm { region.memory_size = 0; // SAFETY: Safe because caller promised this is safe. 
unsafe { - self.fd - .set_user_memory_region(region) - .map_err(|e| vm::HypervisorVmError::RemoveUserMemory(e.into())) + self.set_user_memory_region(region) + .map_err(|e| vm::HypervisorVmError::RemoveUserMemory(e.into()))?; } + + // Close the per-region guest_memfd if one was created for this slot + if let Some(memfds) = &self.guest_memfds { + memfds.write().unwrap().remove(&slot); + } + + Ok(()) } /// @@ -932,17 +1224,19 @@ impl vm::Vm for KvmVm { fn start_dirty_log(&self) -> vm::Result<()> { let dirty_log_slots = self.dirty_log_slots.read().unwrap(); for (_, s) in dirty_log_slots.iter() { - let region = kvm_userspace_memory_region { + let region = kvm_userspace_memory_region2 { slot: s.slot, guest_phys_addr: s.guest_phys_addr, memory_size: s.memory_size, userspace_addr: s.userspace_addr, - flags: KVM_MEM_LOG_DIRTY_PAGES, + flags: self.get_kvm_userspace_memory_region_flag(KVM_MEM_LOG_DIRTY_PAGES), + guest_memfd: s.guest_memfd, + guest_memfd_offset: s.guest_memfd_offset, + ..Default::default() }; // SAFETY: Safe because guest regions are guaranteed not to overlap. unsafe { - self.fd - .set_user_memory_region(region) + self.set_user_memory_region(region) .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?; } } @@ -956,17 +1250,19 @@ impl vm::Vm for KvmVm { fn stop_dirty_log(&self) -> vm::Result<()> { let dirty_log_slots = self.dirty_log_slots.read().unwrap(); for (_, s) in dirty_log_slots.iter() { - let region = kvm_userspace_memory_region { + let region = kvm_userspace_memory_region2 { slot: s.slot, guest_phys_addr: s.guest_phys_addr, memory_size: s.memory_size, userspace_addr: s.userspace_addr, - flags: 0, + flags: self.get_kvm_userspace_memory_region_flag(0), + guest_memfd: s.guest_memfd, + guest_memfd_offset: s.guest_memfd_offset, + ..Default::default() }; // SAFETY: Safe because guest regions are guaranteed not to overlap. 
unsafe { - self.fd - .set_user_memory_region(region) + self.set_user_memory_region(region) .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?; } } @@ -1129,9 +1425,21 @@ impl KvmHypervisor { /// Retrieve the list of MSRs supported by the hypervisor. /// fn get_msr_list(&self) -> hypervisor::Result { - self.kvm + let mut indices = self + .kvm .get_msr_index_list() - .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into())) + .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into()))? + .as_slice() + .to_vec(); + + // KVM_GET_MSR_INDEX_LIST does not include MTRR MSRs, but firmware may update them before an early boot snapshot. + indices.extend(MTRR_MSR_INDICES); + + let mut msr_list = MsrList::new(indices.len()) + .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into()))?; + msr_list.as_mut_slice().copy_from_slice(&indices); + + Ok(msr_list) } } @@ -1216,11 +1524,19 @@ impl hypervisor::Hypervisor for KvmHypervisor { vm_type = self.kvm.get_host_ipa_limit().try_into().unwrap(); } - #[cfg(feature = "tdx")] - if _config.tdx_enabled { - vm_type = KVM_X86_SW_PROTECTED_VM.into(); - } else { + #[cfg(target_arch = "x86_64")] + { vm_type = KVM_X86_DEFAULT_VM.into(); + + #[cfg(feature = "sev_snp")] + if _config.sev_snp_enabled { + vm_type = KVM_X86_SNP_VM.into(); + } + + #[cfg(feature = "tdx")] + if _config.tdx_enabled { + vm_type = KVM_X86_SW_PROTECTED_VM.into(); + } } loop { @@ -1239,13 +1555,11 @@ impl hypervisor::Hypervisor for KvmHypervisor { break; } - let vm_fd = Arc::new(fd); - #[cfg(target_arch = "x86_64")] { let msr_list = self.get_msr_list()?; let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize; - let mut msrs: Vec = vec![ + let mut msrs = vec![ MsrEntry { ..Default::default() }; @@ -1256,18 +1570,52 @@ impl hypervisor::Hypervisor for KvmHypervisor { msrs[pos].index = *index; } + #[allow(unused_mut)] + let mut guest_memfds = None; + #[cfg(feature = "sev_snp")] + if _config.sev_snp_enabled && fd.check_extension(Cap::GuestMemfd) { + 
guest_memfds = Some(RwLock::new(HashMap::new())); + } + + #[cfg(feature = "sev_snp")] + let sev_fd = { + let sev_snp_enabled = vm_type == KVM_X86_SNP_VM as u64; + if sev_snp_enabled { + let mask = self.kvm.check_extension_int(crate::kvm::Cap::ExitHypercall); + let cap = kvm_bindings::kvm_enable_cap { + cap: kvm_bindings::KVM_CAP_EXIT_HYPERCALL, + args: [mask as _, 0, 0, 0], + ..Default::default() + }; + fd.enable_cap(&cap) + .map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?; + let sev_dev = x86_64::sev::SevFd::new("/dev/sev") + .map_err(|e| hypervisor::HypervisorError::SevSnpCapabilities(e.into()))?; + sev_dev + .init2(&fd, _config.vmsa_features) + .map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?; + Some(sev_dev) + } else { + None + } + }; + Ok(Arc::new(KvmVm { - fd: vm_fd, + fd: Arc::new(fd), msrs, - dirty_log_slots: Arc::new(RwLock::new(HashMap::new())), + dirty_log_slots: RwLock::new(HashMap::new()), + #[cfg(feature = "sev_snp")] + sev_fd, + guest_memfds, })) } #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] { Ok(Arc::new(KvmVm { - fd: vm_fd, - dirty_log_slots: Arc::new(RwLock::new(HashMap::new())), + fd: Arc::new(fd), + dirty_log_slots: RwLock::new(HashMap::new()), + guest_memfds: None, })) } } @@ -1352,6 +1700,8 @@ pub struct KvmVcpu { hyperv_synic: AtomicBool, #[cfg(target_arch = "x86_64")] xsave_size: i32, + #[cfg(feature = "sev_snp")] + vm_fd: Arc, } /// Implementation of Vcpu trait for KVM @@ -2011,6 +2361,81 @@ impl cpu::Vcpu for KvmVcpu { #[cfg(feature = "tdx")] VcpuExit::Unsupported(KVM_EXIT_TDX) => Ok(cpu::VmExit::Tdx), VcpuExit::Debug(_) => Ok(cpu::VmExit::Debug), + #[cfg(feature = "sev_snp")] + VcpuExit::Hypercall(hypercall) => { + // https://docs.kernel.org/virt/kvm/x86/hypercalls.html#kvm-hc-map-gpa-range + const KVM_HC_MAP_GPA_RANGE: u64 = 12; + // 4th bit of attributes argument is encrypted page bit + match hypercall.nr { + KVM_HC_MAP_GPA_RANGE => { + // guest physical address of start page + let address = 
hypercall.args[0]; + // num pages to map from start address + let num_pages = hypercall.args[1]; + // bits[0-3] = page size encoding + // bits[4] = 1 if private, 0 if shared + // bits[5-63] = zero + let attributes = hypercall.args[2]; + // TODO: Add 2mb page support + let size = num_pages * PAGE_SIZE_4K; + // bit 4 = private attribute encoding + const PRIVATE_ENCODING_BITMASK: u64 = 0b10000; + debug!( + "KVM_HC_MAP_GPA_RANGE: address={address:#x}, pages={num_pages}, attributes={attributes:#x}" + ); + let set_private_attr = if attributes & PRIVATE_ENCODING_BITMASK > 0 { + KVM_MEMORY_ATTRIBUTE_PRIVATE as u64 + } else { + // the only attribute available is private, o/w 0 + // https://docs.kernel.org/virt/kvm/api.html#kvm-set-memory-attributes + 0u64 + }; + let mem_attributes = kvm_memory_attributes { + address, + size, + attributes: set_private_attr, + ..Default::default() + }; + self.vm_fd + .set_memory_attributes(mem_attributes) + .map(|_| cpu::VmExit::Ignore) + .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into())) + } + _ => Ok(cpu::VmExit::Ignore), + } + } + + #[cfg(feature = "sev_snp")] + VcpuExit::MemoryFault { flags, gpa, size } => { + debug!("VcpuExit::MemoryFault: flags={flags:#x}, gpa={gpa:#x}, size={size:#x}"); + + const KVM_MEMORY_EXIT_FLAG_PRIVATE: u64 = + kvm_bindings::KVM_MEMORY_EXIT_FLAG_PRIVATE as u64; + + if flags & !KVM_MEMORY_EXIT_FLAG_PRIVATE != 0 { + return Err(cpu::HypervisorCpuError::RunVcpu(anyhow!( + "VcpuExit::MemoryFault: unknown flags {flags:#x}" + ))); + } + + let attributes = if flags & KVM_MEMORY_EXIT_FLAG_PRIVATE != 0 { + KVM_MEMORY_ATTRIBUTE_PRIVATE as u64 + } else { + // the only attribute available is private, o/w 0 + // https://docs.kernel.org/virt/kvm/api.html#kvm-set-memory-attributes + 0u64 + }; + + self.vm_fd + .set_memory_attributes(kvm_memory_attributes { + address: gpa, + size, + attributes, + flags: 0, + }) + .map(|_| cpu::VmExit::Ignore) + .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into())) + } r => 
Err(cpu::HypervisorCpuError::RunVcpu(anyhow!( "Unexpected exit reason on vcpu run: {r:?}" @@ -2864,6 +3289,81 @@ impl cpu::Vcpu for KvmVcpu { Ok(_) => Ok(()), } } + + #[cfg(feature = "sev_snp")] + fn set_sev_control_register(&self, _vmsa_pfn: u64) -> cpu::Result<()> { + Ok(()) + } + + #[cfg(feature = "sev_snp")] + fn setup_sev_snp_regs(&self, vmsa: igvm::snp_defs::SevVmsa) -> cpu::Result<()> { + let mut sregs = self + .fd + .get_sregs() + .map_err(|e: kvm_ioctls::Error| cpu::HypervisorCpuError::GetSpecialRegs(e.into()))?; + sregs.cs = make_segment(vmsa.cs); + sregs.ds = make_segment(vmsa.ds); + sregs.es = make_segment(vmsa.es); + sregs.fs = make_segment(vmsa.fs); + sregs.gs = make_segment(vmsa.gs); + sregs.ss = make_segment(vmsa.ss); + sregs.tr = make_segment(vmsa.tr); + sregs.ldt = make_segment(vmsa.ldtr); + + sregs.cr0 = vmsa.cr0; + sregs.cr4 = vmsa.cr4; + sregs.cr3 = vmsa.cr3; + sregs.efer = vmsa.efer; + + sregs.idt.base = vmsa.idtr.base; + sregs.idt.limit = vmsa + .idtr + .limit + .try_into() + .map_err(|e: std::num::TryFromIntError| { + cpu::HypervisorCpuError::SetSpecialRegs(anyhow!(e)) + })?; + sregs.gdt.base = vmsa.gdtr.base; + sregs.gdt.limit = vmsa + .gdtr + .limit + .try_into() + .map_err(|e: std::num::TryFromIntError| { + cpu::HypervisorCpuError::SetSpecialRegs(anyhow!(e)) + })?; + self.fd + .set_sregs(&sregs) + .map_err(|e: kvm_ioctls::Error| cpu::HypervisorCpuError::SetSpecialRegs(e.into()))?; + + let mut regs = self + .fd + .get_regs() + .map_err(|e: kvm_ioctls::Error| cpu::HypervisorCpuError::GetRegister(e.into()))?; + regs.rip = vmsa.rip; + regs.rdx = vmsa.rdx; + regs.rflags = vmsa.rflags; + regs.rsp = vmsa.rsp; + regs.rax = vmsa.rax; + regs.rbx = vmsa.rbx; + regs.rcx = vmsa.rcx; + regs.rbp = vmsa.rbp; + regs.rsi = vmsa.rsi; + regs.rdi = vmsa.rdi; + regs.r8 = vmsa.r8; + regs.r9 = vmsa.r9; + regs.r10 = vmsa.r10; + regs.r11 = vmsa.r11; + regs.r12 = vmsa.r12; + regs.r13 = vmsa.r13; + regs.r14 = vmsa.r14; + regs.r15 = vmsa.r15; + + self.fd + 
.set_regs(®s) + .map_err(|e: kvm_ioctls::Error| cpu::HypervisorCpuError::SetRegister(e.into()))?; + + Ok(()) + } } impl KvmVcpu { @@ -2994,7 +3494,7 @@ impl KvmVcpu { let maybe_size = self .fd - .get_nested_state(&mut buffer) + .nested_state(&mut buffer) .map_err(|e| cpu::HypervisorCpuError::GetNestedState(e.into()))?; if let Some(_size) = maybe_size { diff --git a/hypervisor/src/kvm/x86_64/mod.rs b/hypervisor/src/kvm/x86_64/mod.rs index e338346c3f..62185fd84e 100644 --- a/hypervisor/src/kvm/x86_64/mod.rs +++ b/hypervisor/src/kvm/x86_64/mod.rs @@ -31,6 +31,9 @@ use crate::arch::x86::{ }; use crate::kvm::{Cap, Kvm, KvmError, KvmResult}; +#[cfg(feature = "sev_snp")] +pub(crate) mod sev; + /// /// Check KVM extension for Linux /// diff --git a/hypervisor/src/kvm/x86_64/sev.rs b/hypervisor/src/kvm/x86_64/sev.rs new file mode 100644 index 0000000000..6249468fef --- /dev/null +++ b/hypervisor/src/kvm/x86_64/sev.rs @@ -0,0 +1,200 @@ +// Copyright 2025 Google LLC. +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::fs::OpenOptions; +use std::os::fd::{AsRawFd, OwnedFd}; +use std::os::unix::fs::OpenOptionsExt; +use std::path::Path; + +use igvm_defs::SnpPolicy; +use kvm_bindings::kvm_sev_cmd; +use kvm_ioctls::VmFd; +use log::{debug, error, info}; +use vmm_sys_util::errno; + +pub(crate) type Result = std::result::Result; + +// KVM SEV command IDs — linux/include/uapi/linux/kvm.h +const KVM_SEV_INIT2: u32 = 22; +const KVM_SEV_SNP_LAUNCH_START: u32 = 100; +const KVM_SEV_SNP_LAUNCH_UPDATE: u32 = 101; +const KVM_SEV_SNP_LAUNCH_FINISH: u32 = 102; +// SNP_LAUNCH_UPDATE page types — linux/arch/x86/include/uapi/asm/sev-guest.h +pub const SNP_PAGE_TYPE_VMSA: u32 = 2; + +// See AMD Spec Section 8.17 — SNP_LAUNCH_UPDATE +// The last 12 bits are metadata about the guest context +// https://docs.amd.com/v/u/en-US/56860_PUB_1.58_SEV_SNP +pub const GPA_METADATA_SHIFT_OFFSET: u32 = 12; + +// SNP in VMSA - linux/arch/x86/include/asm/svm.h +const SVM_SEV_FEAT_SNP_ACTIVE: u64 = 1 << 0; + 
+fn sev_op(vm: &VmFd, sev_cmd: &mut kvm_sev_cmd, name: &str) -> Result<()> { + let ret = vm.encrypt_op_sev(sev_cmd); + if ret.is_err() { + error!("{name} op failed. error code: 0x{:x}", sev_cmd.error); + } + ret +} + +#[derive(Debug)] +pub struct SevFd { + pub fd: OwnedFd, +} + +// These ioctl structs must match the kernel layout exactly. +// Layouts from linux/arch/x86/include/uapi/asm/kvm.h + +#[repr(C, packed)] +#[derive(Debug, Copy, Clone, Default)] +pub(crate) struct KvmSevInit { + pub vmsa_features: u64, + pub flags: u32, + pub ghcb_version: u16, + pub pad1: u16, + pub pad2: [u32; 8], +} + +#[repr(C, packed)] +#[derive(Debug, Copy, Clone, Default)] +pub(crate) struct KvmSevSnpLaunchStart { + pub policy: u64, + pub gosvw: [u8; 16], + pub flags: u16, + pub pad0: [u8; 6], + pub pad1: [u64; 4], +} + +#[repr(C, packed)] +#[derive(Debug, Copy, Clone, Default)] +pub(crate) struct KvmSevSnpLaunchUpdate { + pub gfn_start: u64, + pub uaddr: u64, + pub len: u64, + pub type_: u8, + pub pad0: u8, + pub flags: u16, + pub pad1: u32, + pub pad2: [u64; 4], +} + +#[repr(C, packed)] +#[derive(Debug, Copy, Clone, Default)] +pub(crate) struct KvmSevSnpLaunchFinish { + pub id_block_uaddr: u64, + pub id_auth_uaddr: u64, + pub id_block_en: u8, + pub auth_key_en: u8, + pub vcek_disabled: u8, + pub host_data: [u8; 32], + pub pad0: [u8; 3], + // must be zero https://elixir.bootlin.com/linux/v6.11/source/arch/x86/kvm/svm/sev.c#L2506 + pub flags: u16, + pub pad1: [u64; 4], +} + +impl SevFd { + pub(crate) fn new(sev_path: impl AsRef) -> Result { + let file = OpenOptions::new() + .read(true) + .write(true) + .custom_flags(libc::O_CLOEXEC) + .open(sev_path.as_ref()) + .map_err(|e| errno::Error::new(e.raw_os_error().unwrap_or(libc::EINVAL)))?; + Ok(SevFd { + fd: OwnedFd::from(file), + }) + } + + pub(crate) fn init2(&self, vm: &VmFd, vmsa_features: u64) -> Result<()> { + // Clear the SNP bit, KVM sets it directly + let vmsa_features = vmsa_features & !SVM_SEV_FEAT_SNP_ACTIVE; + + // TODO: 
Query KVM for supported VMSA features before calling init2 + if vmsa_features != 0 { + info!("SEV-SNP: requesting vmsa_features: {vmsa_features:#x}"); + } + + let mut init = KvmSevInit { + vmsa_features, + ..Default::default() + }; + let mut sev_cmd = kvm_sev_cmd { + id: KVM_SEV_INIT2, + data: &mut init as *mut KvmSevInit as _, + sev_fd: self.fd.as_raw_fd() as _, + ..Default::default() + }; + sev_op(vm, &mut sev_cmd, "KVM_SEV_INIT2") + } + + pub(crate) fn launch_start(&self, vm: &VmFd, guest_policy: SnpPolicy) -> Result<()> { + // See AMD Spec Section 4.3 - Guest Policy + // Bit 17 is reserved and has to be one. + // https://docs.amd.com/v/u/en-US/56860_PUB_1.58_SEV_SNP + let mut start: KvmSevSnpLaunchStart = KvmSevSnpLaunchStart { + policy: guest_policy.into_bits(), + ..Default::default() + }; + let mut sev_cmd = kvm_sev_cmd { + id: KVM_SEV_SNP_LAUNCH_START, + data: &mut start as *mut KvmSevSnpLaunchStart as _, + sev_fd: self.fd.as_raw_fd() as _, + ..Default::default() + }; + sev_op(vm, &mut sev_cmd, "KVM_SEV_SNP_LAUNCH_START") + } + + pub(crate) fn launch_update( + &self, + vm: &VmFd, + // host virtual address + hva: u64, + size: u64, + // guest frame number + gfn_start: u64, + page_type: u32, + ) -> Result<()> { + let mut update = KvmSevSnpLaunchUpdate { + gfn_start, + uaddr: hva, + len: size, + type_: page_type as u8, + ..Default::default() + }; + let mut sev_cmd = kvm_sev_cmd { + id: KVM_SEV_SNP_LAUNCH_UPDATE, + data: &mut update as *mut KvmSevSnpLaunchUpdate as _, + sev_fd: self.fd.as_raw_fd() as _, + ..Default::default() + }; + sev_op(vm, &mut sev_cmd, "KVM_SEV_SNP_LAUNCH_UPDATE") + } + + pub(crate) fn launch_finish( + &self, + vm: &VmFd, + host_data: [u8; 32], + id_block_en: u8, + auth_key_en: u8, + ) -> Result<()> { + let mut finish = KvmSevSnpLaunchFinish { + host_data, + id_block_en, + auth_key_en, + ..Default::default() + }; + let mut sev_cmd = kvm_sev_cmd { + id: KVM_SEV_SNP_LAUNCH_FINISH, + data: &mut finish as *mut KvmSevSnpLaunchFinish as _, + 
sev_fd: self.fd.as_raw_fd() as _, + ..Default::default() + }; + let flags = finish.flags; + debug!("Calling KVM_SEV_SNP_LAUNCH_FINISH, flags: {flags}"); + sev_op(vm, &mut sev_cmd, "KVM_SEV_SNP_LAUNCH_FINISH") + } +} diff --git a/hypervisor/src/lib.rs b/hypervisor/src/lib.rs index 3d919e45ce..f224e7217c 100644 --- a/hypervisor/src/lib.rs +++ b/hypervisor/src/lib.rs @@ -64,7 +64,7 @@ pub use vm::{ pub use crate::hypervisor::{Hypervisor, HypervisorError}; -#[derive(Debug, Copy, Clone)] +#[derive(Debug, Copy, Clone, PartialEq)] pub enum HypervisorType { #[cfg(feature = "kvm")] Kvm, @@ -118,6 +118,26 @@ pub fn vec_with_array_field(count: usize) -> Vec { vec_with_size_in_bytes(vec_size_bytes) } +/// User memory region structure +#[derive(Debug, Default, Eq, PartialEq)] +pub struct UserMemoryRegion { + pub slot: u32, + pub guest_phys_addr: u64, + pub memory_size: u64, + pub userspace_addr: u64, + pub flags: u32, + pub guest_memfd: Option, + pub guest_memfd_offset: Option, +} + +/// Flags for user memory region +pub const USER_MEMORY_REGION_READ: u32 = 1; +pub const USER_MEMORY_REGION_WRITE: u32 = 1 << 1; +pub const USER_MEMORY_REGION_EXECUTE: u32 = 1 << 2; +pub const USER_MEMORY_REGION_LOG_DIRTY: u32 = 1 << 3; +pub const USER_MEMORY_REGION_ADJUSTABLE: u32 = 1 << 4; +pub const USER_MEMORY_REGION_GUEST_MEMFD: u32 = 1 << 5; + #[derive(Debug)] pub enum MpState { #[cfg(feature = "kvm")] @@ -170,6 +190,8 @@ pub struct HypervisorVmConfig { pub sev_snp_enabled: bool, #[cfg(feature = "sev_snp")] pub mem_size: u64, + #[cfg(feature = "sev_snp")] + pub vmsa_features: u64, pub nested: bool, pub smt_enabled: bool, } diff --git a/hypervisor/src/mshv/mod.rs b/hypervisor/src/mshv/mod.rs index 613c4dc77e..a61f2e44ef 100644 --- a/hypervisor/src/mshv/mod.rs +++ b/hypervisor/src/mshv/mod.rs @@ -13,6 +13,8 @@ use anyhow::anyhow; #[cfg(target_arch = "x86_64")] use arc_swap::ArcSwap; #[cfg(feature = "sev_snp")] +use log::error; +#[cfg(feature = "sev_snp")] use log::info; use log::{debug, warn}; 
use mshv_bindings::*; @@ -56,7 +58,7 @@ pub use aarch64::VcpuMshvState; #[cfg(target_arch = "aarch64")] use aarch64::gic::{BASE_SPI_IRQ, MshvGicV2M}; #[cfg(feature = "sev_snp")] -use igvm_defs::IGVM_VHS_SNP_ID_BLOCK; +use igvm_defs::{IGVM_VHS_SNP_ID_BLOCK, SnpPolicy}; #[cfg(feature = "sev_snp")] use snp_constants::*; use vmm_sys_util::eventfd::EventFd; @@ -85,6 +87,12 @@ use crate::{CpuState, IoEventAddress, IrqRoutingEntry, MpState}; pub const PAGE_SHIFT: usize = 12; +// SVM exit codes not yet defined in mshv-bindings (AMD APM Vol 2, Table 15-7) +#[cfg(feature = "sev_snp")] +const SVM_EXITCODE_CPUID: u32 = 0x72; +#[cfg(feature = "sev_snp")] +const SVM_EXITCODE_MSR: u32 = 0x7c; + #[cfg(target_arch = "x86_64")] impl From for ClockData { fn from(d: MshvClockData) -> Self { @@ -346,7 +354,7 @@ impl hypervisor::Hypervisor for MshvHypervisor { Ok(Arc::new(MshvVm { fd: vm_fd, msrs: ArcSwap::new(Vec::::new().into()), - dirty_log_slots: Arc::new(RwLock::new(HashMap::new())), + dirty_log_slots: RwLock::new(HashMap::new()), #[cfg(feature = "sev_snp")] sev_snp_enabled: mshv_vm_type == VmType::Snp, #[cfg(feature = "sev_snp")] @@ -364,7 +372,7 @@ impl hypervisor::Hypervisor for MshvHypervisor { { Ok(Arc::new(MshvVm { fd: vm_fd, - dirty_log_slots: Arc::new(RwLock::new(HashMap::new())), + dirty_log_slots: RwLock::new(HashMap::new()), })) } } @@ -1199,11 +1207,104 @@ impl cpu::Vcpu for MshvVcpu { // Clear the SW_EXIT_INFO1 register to indicate no error self.clear_swexit_info1()?; } + SVM_EXITCODE_CPUID => { + // SAFETY: Accessing fields from the mapped GHCB page + let cpuid_fn = unsafe { (*ghcb).rax } as u32; + // SAFETY: Accessing fields from the mapped GHCB page + let cpuid_idx = unsafe { (*ghcb).rcx } as u32; + // SAFETY: Accessing fields from the mapped GHCB page + let xcr0 = unsafe { (*ghcb).xfem }; + // SAFETY: Accessing fields from the mapped GHCB page + let xss = unsafe { (*ghcb).xss }; + debug!("GHCB CPUID: fn=0x{cpuid_fn:x} idx=0x{cpuid_idx:x}"); + + let cpuid_result = 
self + .fd + .get_cpuid_values(cpuid_fn, cpuid_idx, xcr0, xss) + .unwrap_or([0u32; 4]); + + set_svm_field_u64_ptr!(ghcb, rax, cpuid_result[0] as u64); + set_svm_field_u64_ptr!(ghcb, rbx, cpuid_result[1] as u64); + set_svm_field_u64_ptr!(ghcb, rcx, cpuid_result[2] as u64); + set_svm_field_u64_ptr!(ghcb, rdx, cpuid_result[3] as u64); + + self.clear_swexit_info1()?; + } + SVM_EXITCODE_MSR => { + let exit_info1 = + info.__bindgen_anon_2.__bindgen_anon_1.sw_exit_info1; + // SAFETY: Accessing fields from the mapped GHCB page + let msr_index = unsafe { (*ghcb).rcx } as u32; + let is_write = exit_info1 & 1 != 0; + + if is_write { + // SAFETY: Accessing fields from the mapped GHCB page + let msr_lo = unsafe { (*ghcb).rax } as u32; + // SAFETY: Accessing fields from the mapped GHCB page + let msr_hi = unsafe { (*ghcb).rdx } as u32; + let msr_val = ((msr_hi as u64) << 32) | (msr_lo as u64); + debug!( + "GHCB MSR WRITE: index=0x{msr_index:x} val=0x{msr_val:x}" + ); + let entry = msr_entry { + index: msr_index, + data: msr_val, + ..Default::default() + }; + let msr_entries = MsrEntries::from_entries(&[entry]) + .map_err(|e| { + cpu::HypervisorCpuError::RunVcpu(e.into()) + })?; + self.fd.set_msrs(&msr_entries).map_err(|e| { + cpu::HypervisorCpuError::RunVcpu(e.into()) + })?; + } else { + let entry = msr_entry { + index: msr_index, + ..Default::default() + }; + let mut msr_entries = MsrEntries::from_entries(&[entry]) + .map_err(|e| { + cpu::HypervisorCpuError::RunVcpu(e.into()) + })?; + self.fd.get_msrs(&mut msr_entries).map_err(|e| { + cpu::HypervisorCpuError::RunVcpu(e.into()) + })?; + let msr_slice = msr_entries.as_slice(); + if msr_slice.is_empty() { + return Err(cpu::HypervisorCpuError::RunVcpu(anyhow!( + "get_msrs returned no entries for index 0x{msr_index:x}" + ))); + } + let msr_val = msr_slice[0].data; + debug!( + "GHCB MSR READ: index=0x{msr_index:x} val=0x{msr_val:x}" + ); + set_svm_field_u64_ptr!(ghcb, rax, msr_val & 0xFFFFFFFF); + set_svm_field_u64_ptr!(ghcb, rdx, 
msr_val >> 32); + } + + self.clear_swexit_info1()?; + } _ => { panic!("GHCB_INFO_NORMAL: Unhandled exit code: {exit_code:0x}") } } } + GHCB_INFO_SHUTDOWN_REQUEST => { + let ghcb_msr_val = { info.ghcb_msr }; + let reason_set = (ghcb_msr_val >> 12) & 0xf; + let reason_val = (ghcb_msr_val >> 16) & 0xff; + error!( + "GHCB_MSR_TERM_REQ: Guest terminated! \ + ghcb_msr=0x{ghcb_msr_val:x}, \ + reason_set={reason_set}, reason_val={reason_val}" + ); + return Err(cpu::HypervisorCpuError::RunVcpu(anyhow!( + "Guest requested termination via GHCB_MSR_TERM_REQ \ + (reason_set={reason_set}, reason_val={reason_val})" + ))); + } _ => panic!("Unsupported VMGEXIT operation: {ghcb_op:0x}"), } @@ -1716,7 +1817,7 @@ pub struct MshvVm { fd: Arc, #[cfg(target_arch = "x86_64")] msrs: ArcSwap>, - dirty_log_slots: Arc>>, + dirty_log_slots: RwLock>, #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, #[cfg(feature = "sev_snp")] @@ -2153,7 +2254,7 @@ impl vm::Vm for MshvVm { /// Initialize the SEV-SNP VM #[cfg(feature = "sev_snp")] - fn sev_snp_init(&self) -> vm::Result<()> { + fn sev_snp_init(&self, _guest_policy: SnpPolicy) -> vm::Result<()> { self.fd .set_partition_property( hv_partition_property_code_HV_PARTITION_PROPERTY_ISOLATION_STATE, @@ -2171,6 +2272,7 @@ impl vm::Vm for MshvVm { page_type: u32, page_size: u32, pages: &[u64], + _uaddrs: &[u64], ) -> vm::Result<()> { debug_assert!(page_size == hv_isolated_page_size_HV_ISOLATED_PAGE_SIZE_4KB); if pages.is_empty() { diff --git a/hypervisor/src/vm.rs b/hypervisor/src/vm.rs index 9d7e60a8be..6d3a4a4ae5 100644 --- a/hypervisor/src/vm.rs +++ b/hypervisor/src/vm.rs @@ -17,6 +17,8 @@ use std::sync::Mutex; #[cfg(feature = "sev_snp")] use igvm_defs::IGVM_VHS_SNP_ID_BLOCK; +#[cfg(feature = "sev_snp")] +use igvm_defs::SnpPolicy; use thiserror::Error; use vmm_sys_util::eventfd::EventFd; @@ -392,9 +394,7 @@ pub trait Vm: Send + Sync + Any { fn get_dirty_log(&self, slot: u32, base_gpa: u64, memory_size: u64) -> Result>; #[cfg(feature = "sev_snp")] /// 
Initialize SEV-SNP on this VM - fn sev_snp_init(&self) -> Result<()> { - unimplemented!() - } + fn sev_snp_init(&self, guest_policy: SnpPolicy) -> Result<()>; #[cfg(feature = "tdx")] /// Initialize TDX on this VM fn tdx_init(&self, _cpuid: &[CpuIdEntry], _max_vcpus: u32) -> Result<()> { @@ -429,6 +429,7 @@ pub trait Vm: Send + Sync + Any { _page_type: u32, _page_size: u32, _pages: &[u64], + _uaddrs: &[u64], ) -> Result<()> { unimplemented!() } diff --git a/net_gen/Cargo.toml b/net_gen/Cargo.toml deleted file mode 100644 index a99c7c995d..0000000000 --- a/net_gen/Cargo.toml +++ /dev/null @@ -1,12 +0,0 @@ -[package] -authors = ["The Chromium OS Authors"] -edition = "2021" -#edition.workspace = true -name = "net_gen" -version = "0.1.0" - -[dependencies] -vmm-sys-util = { workspace = true } - -[lints] -workspace = true diff --git a/net_gen/src/if_tun.rs b/net_gen/src/if_tun.rs deleted file mode 100644 index ab9f327b94..0000000000 --- a/net_gen/src/if_tun.rs +++ /dev/null @@ -1,327 +0,0 @@ -// Copyright © 2021 Intel Corporation -// -// SPDX-License-Identifier: Apache-2.0 -// - -// bindgen /usr/include/linux/if_tun.h --no-layout-tests - -/* automatically generated by rust-bindgen 0.58.1 */ - -#[repr(C)] -#[derive(Default)] -pub struct __IncompleteArrayField(::std::marker::PhantomData, [T; 0]); -#[allow(clippy::missing_safety_doc)] -impl __IncompleteArrayField { - #[inline] - pub const fn new() -> Self { - __IncompleteArrayField(::std::marker::PhantomData, []) - } - #[inline] - pub fn as_ptr(&self) -> *const T { - self as *const _ as *const T - } - #[inline] - pub fn as_mut_ptr(&mut self) -> *mut T { - self as *mut _ as *mut T - } - #[inline] - pub unsafe fn as_slice(&self, len: usize) -> &[T] { - ::std::slice::from_raw_parts(self.as_ptr(), len) - } - #[inline] - pub unsafe fn as_mut_slice(&mut self, len: usize) -> &mut [T] { - ::std::slice::from_raw_parts_mut(self.as_mut_ptr(), len) - } -} -impl ::std::fmt::Debug for __IncompleteArrayField { - fn fmt(&self, fmt: &mut 
::std::fmt::Formatter<'_>) -> ::std::fmt::Result { - fmt.write_str("__IncompleteArrayField") - } -} -pub const __BITS_PER_LONG: u32 = 64; -pub const __FD_SETSIZE: u32 = 1024; -pub const ETH_ALEN: u32 = 6; -pub const ETH_TLEN: u32 = 2; -pub const ETH_HLEN: u32 = 14; -pub const ETH_ZLEN: u32 = 60; -pub const ETH_DATA_LEN: u32 = 1500; -pub const ETH_FRAME_LEN: u32 = 1514; -pub const ETH_FCS_LEN: u32 = 4; -pub const ETH_MIN_MTU: u32 = 68; -pub const ETH_MAX_MTU: u32 = 65535; -pub const ETH_P_LOOP: u32 = 96; -pub const ETH_P_PUP: u32 = 512; -pub const ETH_P_PUPAT: u32 = 513; -pub const ETH_P_TSN: u32 = 8944; -pub const ETH_P_ERSPAN2: u32 = 8939; -pub const ETH_P_IP: u32 = 2048; -pub const ETH_P_X25: u32 = 2053; -pub const ETH_P_ARP: u32 = 2054; -pub const ETH_P_BPQ: u32 = 2303; -pub const ETH_P_IEEEPUP: u32 = 2560; -pub const ETH_P_IEEEPUPAT: u32 = 2561; -pub const ETH_P_BATMAN: u32 = 17157; -pub const ETH_P_DEC: u32 = 24576; -pub const ETH_P_DNA_DL: u32 = 24577; -pub const ETH_P_DNA_RC: u32 = 24578; -pub const ETH_P_DNA_RT: u32 = 24579; -pub const ETH_P_LAT: u32 = 24580; -pub const ETH_P_DIAG: u32 = 24581; -pub const ETH_P_CUST: u32 = 24582; -pub const ETH_P_SCA: u32 = 24583; -pub const ETH_P_TEB: u32 = 25944; -pub const ETH_P_RARP: u32 = 32821; -pub const ETH_P_ATALK: u32 = 32923; -pub const ETH_P_AARP: u32 = 33011; -pub const ETH_P_8021Q: u32 = 33024; -pub const ETH_P_ERSPAN: u32 = 35006; -pub const ETH_P_IPX: u32 = 33079; -pub const ETH_P_IPV6: u32 = 34525; -pub const ETH_P_PAUSE: u32 = 34824; -pub const ETH_P_SLOW: u32 = 34825; -pub const ETH_P_WCCP: u32 = 34878; -pub const ETH_P_MPLS_UC: u32 = 34887; -pub const ETH_P_MPLS_MC: u32 = 34888; -pub const ETH_P_ATMMPOA: u32 = 34892; -pub const ETH_P_PPP_DISC: u32 = 34915; -pub const ETH_P_PPP_SES: u32 = 34916; -pub const ETH_P_LINK_CTL: u32 = 34924; -pub const ETH_P_ATMFATE: u32 = 34948; -pub const ETH_P_PAE: u32 = 34958; -pub const ETH_P_AOE: u32 = 34978; -pub const ETH_P_8021AD: u32 = 34984; -pub const ETH_P_802_EX1: 
u32 = 34997; -pub const ETH_P_PREAUTH: u32 = 35015; -pub const ETH_P_TIPC: u32 = 35018; -pub const ETH_P_LLDP: u32 = 35020; -pub const ETH_P_MRP: u32 = 35043; -pub const ETH_P_MACSEC: u32 = 35045; -pub const ETH_P_8021AH: u32 = 35047; -pub const ETH_P_MVRP: u32 = 35061; -pub const ETH_P_1588: u32 = 35063; -pub const ETH_P_NCSI: u32 = 35064; -pub const ETH_P_PRP: u32 = 35067; -pub const ETH_P_FCOE: u32 = 35078; -pub const ETH_P_IBOE: u32 = 35093; -pub const ETH_P_TDLS: u32 = 35085; -pub const ETH_P_FIP: u32 = 35092; -pub const ETH_P_80221: u32 = 35095; -pub const ETH_P_HSR: u32 = 35119; -pub const ETH_P_NSH: u32 = 35151; -pub const ETH_P_LOOPBACK: u32 = 36864; -pub const ETH_P_QINQ1: u32 = 37120; -pub const ETH_P_QINQ2: u32 = 37376; -pub const ETH_P_QINQ3: u32 = 37632; -pub const ETH_P_EDSA: u32 = 56026; -pub const ETH_P_DSA_8021Q: u32 = 56027; -pub const ETH_P_IFE: u32 = 60734; -pub const ETH_P_AF_IUCV: u32 = 64507; -pub const ETH_P_802_3_MIN: u32 = 1536; -pub const ETH_P_802_3: u32 = 1; -pub const ETH_P_AX25: u32 = 2; -pub const ETH_P_ALL: u32 = 3; -pub const ETH_P_802_2: u32 = 4; -pub const ETH_P_SNAP: u32 = 5; -pub const ETH_P_DDCMP: u32 = 6; -pub const ETH_P_WAN_PPP: u32 = 7; -pub const ETH_P_PPP_MP: u32 = 8; -pub const ETH_P_LOCALTALK: u32 = 9; -pub const ETH_P_CAN: u32 = 12; -pub const ETH_P_CANFD: u32 = 13; -pub const ETH_P_PPPTALK: u32 = 16; -pub const ETH_P_TR_802_2: u32 = 17; -pub const ETH_P_MOBITEX: u32 = 21; -pub const ETH_P_CONTROL: u32 = 22; -pub const ETH_P_IRDA: u32 = 23; -pub const ETH_P_ECONET: u32 = 24; -pub const ETH_P_HDLC: u32 = 25; -pub const ETH_P_ARCNET: u32 = 26; -pub const ETH_P_DSA: u32 = 27; -pub const ETH_P_TRAILER: u32 = 28; -pub const ETH_P_PHONET: u32 = 245; -pub const ETH_P_IEEE802154: u32 = 246; -pub const ETH_P_CAIF: u32 = 247; -pub const ETH_P_XDSA: u32 = 248; -pub const ETH_P_MAP: u32 = 249; -pub const __UAPI_DEF_ETHHDR: u32 = 1; -pub const BPF_LD: u32 = 0; -pub const BPF_LDX: u32 = 1; -pub const BPF_ST: u32 = 2; -pub const 
BPF_STX: u32 = 3; -pub const BPF_ALU: u32 = 4; -pub const BPF_JMP: u32 = 5; -pub const BPF_RET: u32 = 6; -pub const BPF_MISC: u32 = 7; -pub const BPF_W: u32 = 0; -pub const BPF_H: u32 = 8; -pub const BPF_B: u32 = 16; -pub const BPF_IMM: u32 = 0; -pub const BPF_ABS: u32 = 32; -pub const BPF_IND: u32 = 64; -pub const BPF_MEM: u32 = 96; -pub const BPF_LEN: u32 = 128; -pub const BPF_MSH: u32 = 160; -pub const BPF_ADD: u32 = 0; -pub const BPF_SUB: u32 = 16; -pub const BPF_MUL: u32 = 32; -pub const BPF_DIV: u32 = 48; -pub const BPF_OR: u32 = 64; -pub const BPF_AND: u32 = 80; -pub const BPF_LSH: u32 = 96; -pub const BPF_RSH: u32 = 112; -pub const BPF_NEG: u32 = 128; -pub const BPF_MOD: u32 = 144; -pub const BPF_XOR: u32 = 160; -pub const BPF_JA: u32 = 0; -pub const BPF_JEQ: u32 = 16; -pub const BPF_JGT: u32 = 32; -pub const BPF_JGE: u32 = 48; -pub const BPF_JSET: u32 = 64; -pub const BPF_K: u32 = 0; -pub const BPF_X: u32 = 8; -pub const BPF_MAXINSNS: u32 = 4096; -pub const BPF_MAJOR_VERSION: u32 = 1; -pub const BPF_MINOR_VERSION: u32 = 1; -pub const BPF_A: u32 = 16; -pub const BPF_TAX: u32 = 0; -pub const BPF_TXA: u32 = 128; -pub const BPF_MEMWORDS: u32 = 16; -pub const SKF_AD_OFF: i32 = -4096; -pub const SKF_AD_PROTOCOL: u32 = 0; -pub const SKF_AD_PKTTYPE: u32 = 4; -pub const SKF_AD_IFINDEX: u32 = 8; -pub const SKF_AD_NLATTR: u32 = 12; -pub const SKF_AD_NLATTR_NEST: u32 = 16; -pub const SKF_AD_MARK: u32 = 20; -pub const SKF_AD_QUEUE: u32 = 24; -pub const SKF_AD_HATYPE: u32 = 28; -pub const SKF_AD_RXHASH: u32 = 32; -pub const SKF_AD_CPU: u32 = 36; -pub const SKF_AD_ALU_XOR_X: u32 = 40; -pub const SKF_AD_VLAN_TAG: u32 = 44; -pub const SKF_AD_VLAN_TAG_PRESENT: u32 = 48; -pub const SKF_AD_PAY_OFFSET: u32 = 52; -pub const SKF_AD_RANDOM: u32 = 56; -pub const SKF_AD_VLAN_TPID: u32 = 60; -pub const SKF_AD_MAX: u32 = 64; -pub const SKF_NET_OFF: i32 = -1048576; -pub const SKF_LL_OFF: i32 = -2097152; -pub const BPF_NET_OFF: i32 = -1048576; -pub const BPF_LL_OFF: i32 = -2097152; 
-pub const TUN_READQ_SIZE: u32 = 500; -pub const TUN_TYPE_MASK: u32 = 15; -pub const IFF_TUN: u32 = 1; -pub const IFF_TAP: u32 = 2; -pub const IFF_NAPI: u32 = 16; -pub const IFF_NAPI_FRAGS: u32 = 32; -pub const IFF_NO_PI: u32 = 4096; -pub const IFF_ONE_QUEUE: u32 = 8192; -pub const IFF_VNET_HDR: u32 = 16384; -pub const IFF_TUN_EXCL: u32 = 32768; -pub const IFF_MULTI_QUEUE: u32 = 256; -pub const IFF_ATTACH_QUEUE: u32 = 512; -pub const IFF_DETACH_QUEUE: u32 = 1024; -pub const IFF_PERSIST: u32 = 2048; -pub const IFF_NOFILTER: u32 = 4096; -pub const TUN_TX_TIMESTAMP: u32 = 1; -pub const TUN_F_CSUM: u32 = 1; -pub const TUN_F_TSO4: u32 = 2; -pub const TUN_F_TSO6: u32 = 4; -pub const TUN_F_TSO_ECN: u32 = 8; -pub const TUN_F_UFO: u32 = 16; -pub const TUN_PKT_STRIP: u32 = 1; -pub const TUN_FLT_ALLMULTI: u32 = 1; -pub type __s8 = ::std::os::raw::c_schar; -pub type __u8 = ::std::os::raw::c_uchar; -pub type __s16 = ::std::os::raw::c_short; -pub type __u16 = ::std::os::raw::c_ushort; -pub type __s32 = ::std::os::raw::c_int; -pub type __u32 = ::std::os::raw::c_uint; -pub type __s64 = ::std::os::raw::c_longlong; -pub type __u64 = ::std::os::raw::c_ulonglong; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct __kernel_fd_set { - pub fds_bits: [::std::os::raw::c_ulong; 16usize], -} -pub type __kernel_sighandler_t = - ::std::option::Option; -pub type __kernel_key_t = ::std::os::raw::c_int; -pub type __kernel_mqd_t = ::std::os::raw::c_int; -pub type __kernel_old_uid_t = ::std::os::raw::c_ushort; -pub type __kernel_old_gid_t = ::std::os::raw::c_ushort; -pub type __kernel_old_dev_t = ::std::os::raw::c_ulong; -pub type __kernel_long_t = ::std::os::raw::c_long; -pub type __kernel_ulong_t = ::std::os::raw::c_ulong; -pub type __kernel_ino_t = __kernel_ulong_t; -pub type __kernel_mode_t = ::std::os::raw::c_uint; -pub type __kernel_pid_t = ::std::os::raw::c_int; -pub type __kernel_ipc_pid_t = ::std::os::raw::c_int; -pub type __kernel_uid_t = ::std::os::raw::c_uint; -pub type 
__kernel_gid_t = ::std::os::raw::c_uint; -pub type __kernel_suseconds_t = __kernel_long_t; -pub type __kernel_daddr_t = ::std::os::raw::c_int; -pub type __kernel_uid32_t = ::std::os::raw::c_uint; -pub type __kernel_gid32_t = ::std::os::raw::c_uint; -pub type __kernel_size_t = __kernel_ulong_t; -pub type __kernel_ssize_t = __kernel_long_t; -pub type __kernel_ptrdiff_t = __kernel_long_t; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct __kernel_fsid_t { - pub val: [::std::os::raw::c_int; 2usize], -} -pub type __kernel_off_t = __kernel_long_t; -pub type __kernel_loff_t = ::std::os::raw::c_longlong; -pub type __kernel_old_time_t = __kernel_long_t; -pub type __kernel_time_t = __kernel_long_t; -pub type __kernel_time64_t = ::std::os::raw::c_longlong; -pub type __kernel_clock_t = __kernel_long_t; -pub type __kernel_timer_t = ::std::os::raw::c_int; -pub type __kernel_clockid_t = ::std::os::raw::c_int; -pub type __kernel_caddr_t = *mut ::std::os::raw::c_char; -pub type __kernel_uid16_t = ::std::os::raw::c_ushort; -pub type __kernel_gid16_t = ::std::os::raw::c_ushort; -pub type __le16 = __u16; -pub type __be16 = __u16; -pub type __le32 = __u32; -pub type __be32 = __u32; -pub type __le64 = __u64; -pub type __be64 = __u64; -pub type __sum16 = __u16; -pub type __wsum = __u32; -pub type __poll_t = ::std::os::raw::c_uint; -#[repr(C, packed)] -#[derive(Debug, Copy, Clone)] -pub struct ethhdr { - pub h_dest: [::std::os::raw::c_uchar; 6usize], - pub h_source: [::std::os::raw::c_uchar; 6usize], - pub h_proto: __be16, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct sock_filter { - pub code: __u16, - pub jt: __u8, - pub jf: __u8, - pub k: __u32, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct sock_fprog { - pub len: ::std::os::raw::c_ushort, - pub filter: *mut sock_filter, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct tun_pi { - pub flags: __u16, - pub proto: __be16, -} -#[repr(C)] -#[derive(Debug)] -pub struct tun_filter { - pub flags: __u16, - 
pub count: __u16, - pub addr: __IncompleteArrayField<[__u8; 6usize]>, -} diff --git a/net_gen/src/iff.rs b/net_gen/src/iff.rs deleted file mode 100644 index 974e01b42e..0000000000 --- a/net_gen/src/iff.rs +++ /dev/null @@ -1,1228 +0,0 @@ -// Copyright © 2021 Intel Corporation -// -// SPDX-License-Identifier: Apache-2.0 -// - -// bindgen /usr/include/linux/if.h --no-layout-tests - -/* automatically generated by rust-bindgen 0.58.1 */ - -#[repr(C)] -#[derive(Default)] -pub struct __IncompleteArrayField(::std::marker::PhantomData, [T; 0]); -#[allow(clippy::missing_safety_doc)] -impl __IncompleteArrayField { - #[inline] - pub const fn new() -> Self { - __IncompleteArrayField(::std::marker::PhantomData, []) - } - #[inline] - pub fn as_ptr(&self) -> *const T { - self as *const _ as *const T - } - #[inline] - pub fn as_mut_ptr(&mut self) -> *mut T { - self as *mut _ as *mut T - } - #[inline] - pub unsafe fn as_slice(&self, len: usize) -> &[T] { - ::std::slice::from_raw_parts(self.as_ptr(), len) - } - #[inline] - pub unsafe fn as_mut_slice(&mut self, len: usize) -> &mut [T] { - ::std::slice::from_raw_parts_mut(self.as_mut_ptr(), len) - } -} -impl ::std::fmt::Debug for __IncompleteArrayField { - fn fmt(&self, fmt: &mut ::std::fmt::Formatter<'_>) -> ::std::fmt::Result { - fmt.write_str("__IncompleteArrayField") - } -} -pub const __UAPI_DEF_IF_IFCONF: u32 = 1; -pub const __UAPI_DEF_IF_IFMAP: u32 = 1; -pub const __UAPI_DEF_IF_IFNAMSIZ: u32 = 1; -pub const __UAPI_DEF_IF_IFREQ: u32 = 1; -pub const __UAPI_DEF_IF_NET_DEVICE_FLAGS: u32 = 1; -pub const __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO: u32 = 1; -pub const __UAPI_DEF_IN_ADDR: u32 = 1; -pub const __UAPI_DEF_IN_IPPROTO: u32 = 1; -pub const __UAPI_DEF_IN_PKTINFO: u32 = 1; -pub const __UAPI_DEF_IP_MREQ: u32 = 1; -pub const __UAPI_DEF_SOCKADDR_IN: u32 = 1; -pub const __UAPI_DEF_IN_CLASS: u32 = 1; -pub const __UAPI_DEF_IN6_ADDR: u32 = 1; -pub const __UAPI_DEF_IN6_ADDR_ALT: u32 = 1; -pub const __UAPI_DEF_SOCKADDR_IN6: 
u32 = 1; -pub const __UAPI_DEF_IPV6_MREQ: u32 = 1; -pub const __UAPI_DEF_IPPROTO_V6: u32 = 1; -pub const __UAPI_DEF_IPV6_OPTIONS: u32 = 1; -pub const __UAPI_DEF_IN6_PKTINFO: u32 = 1; -pub const __UAPI_DEF_IP6_MTUINFO: u32 = 1; -pub const __UAPI_DEF_SOCKADDR_IPX: u32 = 1; -pub const __UAPI_DEF_IPX_ROUTE_DEFINITION: u32 = 1; -pub const __UAPI_DEF_IPX_INTERFACE_DEFINITION: u32 = 1; -pub const __UAPI_DEF_IPX_CONFIG_DATA: u32 = 1; -pub const __UAPI_DEF_IPX_ROUTE_DEF: u32 = 1; -pub const __UAPI_DEF_XATTR: u32 = 1; -pub const __BITS_PER_LONG: u32 = 64; -pub const __FD_SETSIZE: u32 = 1024; -pub const _K_SS_MAXSIZE: u32 = 128; -pub const _SYS_SOCKET_H: u32 = 1; -pub const _FEATURES_H: u32 = 1; -pub const _DEFAULT_SOURCE: u32 = 1; -pub const __GLIBC_USE_ISOC2X: u32 = 0; -pub const __USE_ISOC11: u32 = 1; -pub const __USE_ISOC99: u32 = 1; -pub const __USE_ISOC95: u32 = 1; -pub const __USE_POSIX_IMPLICITLY: u32 = 1; -pub const _POSIX_SOURCE: u32 = 1; -pub const _POSIX_C_SOURCE: u32 = 200809; -pub const __USE_POSIX: u32 = 1; -pub const __USE_POSIX2: u32 = 1; -pub const __USE_POSIX199309: u32 = 1; -pub const __USE_POSIX199506: u32 = 1; -pub const __USE_XOPEN2K: u32 = 1; -pub const __USE_XOPEN2K8: u32 = 1; -pub const _ATFILE_SOURCE: u32 = 1; -pub const __USE_MISC: u32 = 1; -pub const __USE_ATFILE: u32 = 1; -pub const __USE_FORTIFY_LEVEL: u32 = 0; -pub const __GLIBC_USE_DEPRECATED_GETS: u32 = 0; -pub const __GLIBC_USE_DEPRECATED_SCANF: u32 = 0; -pub const _STDC_PREDEF_H: u32 = 1; -pub const __STDC_IEC_559__: u32 = 1; -pub const __STDC_IEC_559_COMPLEX__: u32 = 1; -pub const __STDC_ISO_10646__: u32 = 201706; -pub const __GNU_LIBRARY__: u32 = 6; -pub const __GLIBC__: u32 = 2; -pub const __GLIBC_MINOR__: u32 = 32; -pub const _SYS_CDEFS_H: u32 = 1; -pub const __glibc_c99_flexarr_available: u32 = 1; -pub const __WORDSIZE: u32 = 64; -pub const __WORDSIZE_TIME64_COMPAT32: u32 = 1; -pub const __SYSCALL_WORDSIZE: u32 = 64; -pub const __LDOUBLE_REDIRECTS_TO_FLOAT128_ABI: u32 = 0; -pub const 
__HAVE_GENERIC_SELECTION: u32 = 1; -pub const __iovec_defined: u32 = 1; -pub const _SYS_TYPES_H: u32 = 1; -pub const _BITS_TYPES_H: u32 = 1; -pub const __TIMESIZE: u32 = 64; -pub const _BITS_TYPESIZES_H: u32 = 1; -pub const __OFF_T_MATCHES_OFF64_T: u32 = 1; -pub const __INO_T_MATCHES_INO64_T: u32 = 1; -pub const __RLIM_T_MATCHES_RLIM64_T: u32 = 1; -pub const __STATFS_MATCHES_STATFS64: u32 = 1; -pub const __KERNEL_OLD_TIMEVAL_MATCHES_TIMEVAL64: u32 = 1; -pub const _BITS_TIME64_H: u32 = 1; -pub const __clock_t_defined: u32 = 1; -pub const __clockid_t_defined: u32 = 1; -pub const __time_t_defined: u32 = 1; -pub const __timer_t_defined: u32 = 1; -pub const _BITS_STDINT_INTN_H: u32 = 1; -pub const __BIT_TYPES_DEFINED__: u32 = 1; -pub const _ENDIAN_H: u32 = 1; -pub const _BITS_ENDIAN_H: u32 = 1; -pub const __LITTLE_ENDIAN: u32 = 1234; -pub const __BIG_ENDIAN: u32 = 4321; -pub const __PDP_ENDIAN: u32 = 3412; -pub const _BITS_ENDIANNESS_H: u32 = 1; -pub const __BYTE_ORDER: u32 = 1234; -pub const __FLOAT_WORD_ORDER: u32 = 1234; -pub const LITTLE_ENDIAN: u32 = 1234; -pub const BIG_ENDIAN: u32 = 4321; -pub const PDP_ENDIAN: u32 = 3412; -pub const BYTE_ORDER: u32 = 1234; -pub const _BITS_BYTESWAP_H: u32 = 1; -pub const _BITS_UINTN_IDENTITY_H: u32 = 1; -pub const _SYS_SELECT_H: u32 = 1; -pub const __sigset_t_defined: u32 = 1; -pub const __timeval_defined: u32 = 1; -pub const _STRUCT_TIMESPEC: u32 = 1; -pub const FD_SETSIZE: u32 = 1024; -pub const _BITS_PTHREADTYPES_COMMON_H: u32 = 1; -pub const _THREAD_SHARED_TYPES_H: u32 = 1; -pub const _BITS_PTHREADTYPES_ARCH_H: u32 = 1; -pub const __SIZEOF_PTHREAD_MUTEX_T: u32 = 40; -pub const __SIZEOF_PTHREAD_ATTR_T: u32 = 56; -pub const __SIZEOF_PTHREAD_RWLOCK_T: u32 = 56; -pub const __SIZEOF_PTHREAD_BARRIER_T: u32 = 32; -pub const __SIZEOF_PTHREAD_MUTEXATTR_T: u32 = 4; -pub const __SIZEOF_PTHREAD_COND_T: u32 = 48; -pub const __SIZEOF_PTHREAD_CONDATTR_T: u32 = 4; -pub const __SIZEOF_PTHREAD_RWLOCKATTR_T: u32 = 8; -pub const 
__SIZEOF_PTHREAD_BARRIERATTR_T: u32 = 4; -pub const _THREAD_MUTEX_INTERNAL_H: u32 = 1; -pub const __PTHREAD_MUTEX_HAVE_PREV: u32 = 1; -pub const __have_pthread_attr_t: u32 = 1; -pub const PF_UNSPEC: u32 = 0; -pub const PF_LOCAL: u32 = 1; -pub const PF_UNIX: u32 = 1; -pub const PF_FILE: u32 = 1; -pub const PF_INET: u32 = 2; -pub const PF_AX25: u32 = 3; -pub const PF_IPX: u32 = 4; -pub const PF_APPLETALK: u32 = 5; -pub const PF_NETROM: u32 = 6; -pub const PF_BRIDGE: u32 = 7; -pub const PF_ATMPVC: u32 = 8; -pub const PF_X25: u32 = 9; -pub const PF_INET6: u32 = 10; -pub const PF_ROSE: u32 = 11; -pub const PF_DECnet: u32 = 12; -pub const PF_NETBEUI: u32 = 13; -pub const PF_SECURITY: u32 = 14; -pub const PF_KEY: u32 = 15; -pub const PF_NETLINK: u32 = 16; -pub const PF_ROUTE: u32 = 16; -pub const PF_PACKET: u32 = 17; -pub const PF_ASH: u32 = 18; -pub const PF_ECONET: u32 = 19; -pub const PF_ATMSVC: u32 = 20; -pub const PF_RDS: u32 = 21; -pub const PF_SNA: u32 = 22; -pub const PF_IRDA: u32 = 23; -pub const PF_PPPOX: u32 = 24; -pub const PF_WANPIPE: u32 = 25; -pub const PF_LLC: u32 = 26; -pub const PF_IB: u32 = 27; -pub const PF_MPLS: u32 = 28; -pub const PF_CAN: u32 = 29; -pub const PF_TIPC: u32 = 30; -pub const PF_BLUETOOTH: u32 = 31; -pub const PF_IUCV: u32 = 32; -pub const PF_RXRPC: u32 = 33; -pub const PF_ISDN: u32 = 34; -pub const PF_PHONET: u32 = 35; -pub const PF_IEEE802154: u32 = 36; -pub const PF_CAIF: u32 = 37; -pub const PF_ALG: u32 = 38; -pub const PF_NFC: u32 = 39; -pub const PF_VSOCK: u32 = 40; -pub const PF_KCM: u32 = 41; -pub const PF_QIPCRTR: u32 = 42; -pub const PF_SMC: u32 = 43; -pub const PF_XDP: u32 = 44; -pub const PF_MAX: u32 = 45; -pub const AF_UNSPEC: u32 = 0; -pub const AF_LOCAL: u32 = 1; -pub const AF_UNIX: u32 = 1; -pub const AF_FILE: u32 = 1; -pub const AF_INET: u32 = 2; -pub const AF_AX25: u32 = 3; -pub const AF_IPX: u32 = 4; -pub const AF_APPLETALK: u32 = 5; -pub const AF_NETROM: u32 = 6; -pub const AF_BRIDGE: u32 = 7; -pub const AF_ATMPVC: 
u32 = 8; -pub const AF_X25: u32 = 9; -pub const AF_INET6: u32 = 10; -pub const AF_ROSE: u32 = 11; -pub const AF_DECnet: u32 = 12; -pub const AF_NETBEUI: u32 = 13; -pub const AF_SECURITY: u32 = 14; -pub const AF_KEY: u32 = 15; -pub const AF_NETLINK: u32 = 16; -pub const AF_ROUTE: u32 = 16; -pub const AF_PACKET: u32 = 17; -pub const AF_ASH: u32 = 18; -pub const AF_ECONET: u32 = 19; -pub const AF_ATMSVC: u32 = 20; -pub const AF_RDS: u32 = 21; -pub const AF_SNA: u32 = 22; -pub const AF_IRDA: u32 = 23; -pub const AF_PPPOX: u32 = 24; -pub const AF_WANPIPE: u32 = 25; -pub const AF_LLC: u32 = 26; -pub const AF_IB: u32 = 27; -pub const AF_MPLS: u32 = 28; -pub const AF_CAN: u32 = 29; -pub const AF_TIPC: u32 = 30; -pub const AF_BLUETOOTH: u32 = 31; -pub const AF_IUCV: u32 = 32; -pub const AF_RXRPC: u32 = 33; -pub const AF_ISDN: u32 = 34; -pub const AF_PHONET: u32 = 35; -pub const AF_IEEE802154: u32 = 36; -pub const AF_CAIF: u32 = 37; -pub const AF_ALG: u32 = 38; -pub const AF_NFC: u32 = 39; -pub const AF_VSOCK: u32 = 40; -pub const AF_KCM: u32 = 41; -pub const AF_QIPCRTR: u32 = 42; -pub const AF_SMC: u32 = 43; -pub const AF_XDP: u32 = 44; -pub const AF_MAX: u32 = 45; -pub const SOL_RAW: u32 = 255; -pub const SOL_DECNET: u32 = 261; -pub const SOL_X25: u32 = 262; -pub const SOL_PACKET: u32 = 263; -pub const SOL_ATM: u32 = 264; -pub const SOL_AAL: u32 = 265; -pub const SOL_IRDA: u32 = 266; -pub const SOL_NETBEUI: u32 = 267; -pub const SOL_LLC: u32 = 268; -pub const SOL_DCCP: u32 = 269; -pub const SOL_NETLINK: u32 = 270; -pub const SOL_TIPC: u32 = 271; -pub const SOL_RXRPC: u32 = 272; -pub const SOL_PPPOL2TP: u32 = 273; -pub const SOL_BLUETOOTH: u32 = 274; -pub const SOL_PNPIPE: u32 = 275; -pub const SOL_RDS: u32 = 276; -pub const SOL_IUCV: u32 = 277; -pub const SOL_CAIF: u32 = 278; -pub const SOL_ALG: u32 = 279; -pub const SOL_NFC: u32 = 280; -pub const SOL_KCM: u32 = 281; -pub const SOL_TLS: u32 = 282; -pub const SOL_XDP: u32 = 283; -pub const SOMAXCONN: u32 = 4096; -pub const 
_BITS_SOCKADDR_H: u32 = 1; -pub const _SS_SIZE: u32 = 128; -pub const FIOSETOWN: u32 = 35073; -pub const SIOCSPGRP: u32 = 35074; -pub const FIOGETOWN: u32 = 35075; -pub const SIOCGPGRP: u32 = 35076; -pub const SIOCATMARK: u32 = 35077; -pub const SIOCGSTAMP_OLD: u32 = 35078; -pub const SIOCGSTAMPNS_OLD: u32 = 35079; -pub const SOL_SOCKET: u32 = 1; -pub const SO_DEBUG: u32 = 1; -pub const SO_REUSEADDR: u32 = 2; -pub const SO_TYPE: u32 = 3; -pub const SO_ERROR: u32 = 4; -pub const SO_DONTROUTE: u32 = 5; -pub const SO_BROADCAST: u32 = 6; -pub const SO_SNDBUF: u32 = 7; -pub const SO_RCVBUF: u32 = 8; -pub const SO_SNDBUFFORCE: u32 = 32; -pub const SO_RCVBUFFORCE: u32 = 33; -pub const SO_KEEPALIVE: u32 = 9; -pub const SO_OOBINLINE: u32 = 10; -pub const SO_NO_CHECK: u32 = 11; -pub const SO_PRIORITY: u32 = 12; -pub const SO_LINGER: u32 = 13; -pub const SO_BSDCOMPAT: u32 = 14; -pub const SO_REUSEPORT: u32 = 15; -pub const SO_PASSCRED: u32 = 16; -pub const SO_PEERCRED: u32 = 17; -pub const SO_RCVLOWAT: u32 = 18; -pub const SO_SNDLOWAT: u32 = 19; -pub const SO_RCVTIMEO_OLD: u32 = 20; -pub const SO_SNDTIMEO_OLD: u32 = 21; -pub const SO_SECURITY_AUTHENTICATION: u32 = 22; -pub const SO_SECURITY_ENCRYPTION_TRANSPORT: u32 = 23; -pub const SO_SECURITY_ENCRYPTION_NETWORK: u32 = 24; -pub const SO_BINDTODEVICE: u32 = 25; -pub const SO_ATTACH_FILTER: u32 = 26; -pub const SO_DETACH_FILTER: u32 = 27; -pub const SO_GET_FILTER: u32 = 26; -pub const SO_PEERNAME: u32 = 28; -pub const SO_ACCEPTCONN: u32 = 30; -pub const SO_PEERSEC: u32 = 31; -pub const SO_PASSSEC: u32 = 34; -pub const SO_MARK: u32 = 36; -pub const SO_PROTOCOL: u32 = 38; -pub const SO_DOMAIN: u32 = 39; -pub const SO_RXQ_OVFL: u32 = 40; -pub const SO_WIFI_STATUS: u32 = 41; -pub const SCM_WIFI_STATUS: u32 = 41; -pub const SO_PEEK_OFF: u32 = 42; -pub const SO_NOFCS: u32 = 43; -pub const SO_LOCK_FILTER: u32 = 44; -pub const SO_SELECT_ERR_QUEUE: u32 = 45; -pub const SO_BUSY_POLL: u32 = 46; -pub const SO_MAX_PACING_RATE: u32 = 47; 
-pub const SO_BPF_EXTENSIONS: u32 = 48; -pub const SO_INCOMING_CPU: u32 = 49; -pub const SO_ATTACH_BPF: u32 = 50; -pub const SO_DETACH_BPF: u32 = 27; -pub const SO_ATTACH_REUSEPORT_CBPF: u32 = 51; -pub const SO_ATTACH_REUSEPORT_EBPF: u32 = 52; -pub const SO_CNX_ADVICE: u32 = 53; -pub const SCM_TIMESTAMPING_OPT_STATS: u32 = 54; -pub const SO_MEMINFO: u32 = 55; -pub const SO_INCOMING_NAPI_ID: u32 = 56; -pub const SO_COOKIE: u32 = 57; -pub const SCM_TIMESTAMPING_PKTINFO: u32 = 58; -pub const SO_PEERGROUPS: u32 = 59; -pub const SO_ZEROCOPY: u32 = 60; -pub const SO_TXTIME: u32 = 61; -pub const SCM_TXTIME: u32 = 61; -pub const SO_BINDTOIFINDEX: u32 = 62; -pub const SO_TIMESTAMP_OLD: u32 = 29; -pub const SO_TIMESTAMPNS_OLD: u32 = 35; -pub const SO_TIMESTAMPING_OLD: u32 = 37; -pub const SO_TIMESTAMP_NEW: u32 = 63; -pub const SO_TIMESTAMPNS_NEW: u32 = 64; -pub const SO_TIMESTAMPING_NEW: u32 = 65; -pub const SO_RCVTIMEO_NEW: u32 = 66; -pub const SO_SNDTIMEO_NEW: u32 = 67; -pub const SO_DETACH_REUSEPORT_BPF: u32 = 68; -pub const SO_TIMESTAMP: u32 = 29; -pub const SO_TIMESTAMPNS: u32 = 35; -pub const SO_TIMESTAMPING: u32 = 37; -pub const SO_RCVTIMEO: u32 = 20; -pub const SO_SNDTIMEO: u32 = 21; -pub const SCM_TIMESTAMP: u32 = 29; -pub const SCM_TIMESTAMPNS: u32 = 35; -pub const SCM_TIMESTAMPING: u32 = 37; -pub const __osockaddr_defined: u32 = 1; -pub const IFNAMSIZ: u32 = 16; -pub const IFALIASZ: u32 = 256; -pub const ALTIFNAMSIZ: u32 = 128; -pub const GENERIC_HDLC_VERSION: u32 = 4; -pub const CLOCK_DEFAULT: u32 = 0; -pub const CLOCK_EXT: u32 = 1; -pub const CLOCK_INT: u32 = 2; -pub const CLOCK_TXINT: u32 = 3; -pub const CLOCK_TXFROMRX: u32 = 4; -pub const ENCODING_DEFAULT: u32 = 0; -pub const ENCODING_NRZ: u32 = 1; -pub const ENCODING_NRZI: u32 = 2; -pub const ENCODING_FM_MARK: u32 = 3; -pub const ENCODING_FM_SPACE: u32 = 4; -pub const ENCODING_MANCHESTER: u32 = 5; -pub const PARITY_DEFAULT: u32 = 0; -pub const PARITY_NONE: u32 = 1; -pub const PARITY_CRC16_PR0: u32 = 2; -pub 
const PARITY_CRC16_PR1: u32 = 3; -pub const PARITY_CRC16_PR0_CCITT: u32 = 4; -pub const PARITY_CRC16_PR1_CCITT: u32 = 5; -pub const PARITY_CRC32_PR0_CCITT: u32 = 6; -pub const PARITY_CRC32_PR1_CCITT: u32 = 7; -pub const LMI_DEFAULT: u32 = 0; -pub const LMI_NONE: u32 = 1; -pub const LMI_ANSI: u32 = 2; -pub const LMI_CCITT: u32 = 3; -pub const LMI_CISCO: u32 = 4; -pub const IF_GET_IFACE: u32 = 1; -pub const IF_GET_PROTO: u32 = 2; -pub const IF_IFACE_V35: u32 = 4096; -pub const IF_IFACE_V24: u32 = 4097; -pub const IF_IFACE_X21: u32 = 4098; -pub const IF_IFACE_T1: u32 = 4099; -pub const IF_IFACE_E1: u32 = 4100; -pub const IF_IFACE_SYNC_SERIAL: u32 = 4101; -pub const IF_IFACE_X21D: u32 = 4102; -pub const IF_PROTO_HDLC: u32 = 8192; -pub const IF_PROTO_PPP: u32 = 8193; -pub const IF_PROTO_CISCO: u32 = 8194; -pub const IF_PROTO_FR: u32 = 8195; -pub const IF_PROTO_FR_ADD_PVC: u32 = 8196; -pub const IF_PROTO_FR_DEL_PVC: u32 = 8197; -pub const IF_PROTO_X25: u32 = 8198; -pub const IF_PROTO_HDLC_ETH: u32 = 8199; -pub const IF_PROTO_FR_ADD_ETH_PVC: u32 = 8200; -pub const IF_PROTO_FR_DEL_ETH_PVC: u32 = 8201; -pub const IF_PROTO_FR_PVC: u32 = 8202; -pub const IF_PROTO_FR_ETH_PVC: u32 = 8203; -pub const IF_PROTO_RAW: u32 = 8204; -pub const IFHWADDRLEN: u32 = 6; -pub type __s8 = ::std::os::raw::c_schar; -pub type __u8 = ::std::os::raw::c_uchar; -pub type __s16 = ::std::os::raw::c_short; -pub type __u16 = ::std::os::raw::c_ushort; -pub type __s32 = ::std::os::raw::c_int; -pub type __u32 = ::std::os::raw::c_uint; -pub type __s64 = ::std::os::raw::c_longlong; -pub type __u64 = ::std::os::raw::c_ulonglong; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct __kernel_fd_set { - pub fds_bits: [::std::os::raw::c_ulong; 16usize], -} -pub type __kernel_sighandler_t = - ::std::option::Option; -pub type __kernel_key_t = ::std::os::raw::c_int; -pub type __kernel_mqd_t = ::std::os::raw::c_int; -pub type __kernel_old_uid_t = ::std::os::raw::c_ushort; -pub type __kernel_old_gid_t = 
::std::os::raw::c_ushort; -pub type __kernel_old_dev_t = ::std::os::raw::c_ulong; -pub type __kernel_long_t = ::std::os::raw::c_long; -pub type __kernel_ulong_t = ::std::os::raw::c_ulong; -pub type __kernel_ino_t = __kernel_ulong_t; -pub type __kernel_mode_t = ::std::os::raw::c_uint; -pub type __kernel_pid_t = ::std::os::raw::c_int; -pub type __kernel_ipc_pid_t = ::std::os::raw::c_int; -pub type __kernel_uid_t = ::std::os::raw::c_uint; -pub type __kernel_gid_t = ::std::os::raw::c_uint; -pub type __kernel_suseconds_t = __kernel_long_t; -pub type __kernel_daddr_t = ::std::os::raw::c_int; -pub type __kernel_uid32_t = ::std::os::raw::c_uint; -pub type __kernel_gid32_t = ::std::os::raw::c_uint; -pub type __kernel_size_t = __kernel_ulong_t; -pub type __kernel_ssize_t = __kernel_long_t; -pub type __kernel_ptrdiff_t = __kernel_long_t; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct __kernel_fsid_t { - pub val: [::std::os::raw::c_int; 2usize], -} -pub type __kernel_off_t = __kernel_long_t; -pub type __kernel_loff_t = ::std::os::raw::c_longlong; -pub type __kernel_old_time_t = __kernel_long_t; -pub type __kernel_time_t = __kernel_long_t; -pub type __kernel_time64_t = ::std::os::raw::c_longlong; -pub type __kernel_clock_t = __kernel_long_t; -pub type __kernel_timer_t = ::std::os::raw::c_int; -pub type __kernel_clockid_t = ::std::os::raw::c_int; -pub type __kernel_caddr_t = *mut ::std::os::raw::c_char; -pub type __kernel_uid16_t = ::std::os::raw::c_ushort; -pub type __kernel_gid16_t = ::std::os::raw::c_ushort; -pub type __le16 = __u16; -pub type __be16 = __u16; -pub type __le32 = __u32; -pub type __be32 = __u32; -pub type __le64 = __u64; -pub type __be64 = __u64; -pub type __sum16 = __u16; -pub type __wsum = __u32; -pub type __poll_t = ::std::os::raw::c_uint; -pub type __kernel_sa_family_t = ::std::os::raw::c_ushort; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct __kernel_sockaddr_storage { - pub __bindgen_anon_1: __kernel_sockaddr_storage__bindgen_ty_1, -} -#[repr(C)] 
-#[derive(Copy, Clone)] -pub union __kernel_sockaddr_storage__bindgen_ty_1 { - pub __bindgen_anon_1: __kernel_sockaddr_storage__bindgen_ty_1__bindgen_ty_1, - pub __align: *mut ::std::os::raw::c_void, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct __kernel_sockaddr_storage__bindgen_ty_1__bindgen_ty_1 { - pub ss_family: __kernel_sa_family_t, - pub __data: [::std::os::raw::c_char; 126usize], -} -pub type size_t = ::std::os::raw::c_ulong; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct iovec { - pub iov_base: *mut ::std::os::raw::c_void, - pub iov_len: size_t, -} -pub type __u_char = ::std::os::raw::c_uchar; -pub type __u_short = ::std::os::raw::c_ushort; -pub type __u_int = ::std::os::raw::c_uint; -pub type __u_long = ::std::os::raw::c_ulong; -pub type __int8_t = ::std::os::raw::c_schar; -pub type __uint8_t = ::std::os::raw::c_uchar; -pub type __int16_t = ::std::os::raw::c_short; -pub type __uint16_t = ::std::os::raw::c_ushort; -pub type __int32_t = ::std::os::raw::c_int; -pub type __uint32_t = ::std::os::raw::c_uint; -pub type __int64_t = ::std::os::raw::c_long; -pub type __uint64_t = ::std::os::raw::c_ulong; -pub type __int_least8_t = __int8_t; -pub type __uint_least8_t = __uint8_t; -pub type __int_least16_t = __int16_t; -pub type __uint_least16_t = __uint16_t; -pub type __int_least32_t = __int32_t; -pub type __uint_least32_t = __uint32_t; -pub type __int_least64_t = __int64_t; -pub type __uint_least64_t = __uint64_t; -pub type __quad_t = ::std::os::raw::c_long; -pub type __u_quad_t = ::std::os::raw::c_ulong; -pub type __intmax_t = ::std::os::raw::c_long; -pub type __uintmax_t = ::std::os::raw::c_ulong; -pub type __dev_t = ::std::os::raw::c_ulong; -pub type __uid_t = ::std::os::raw::c_uint; -pub type __gid_t = ::std::os::raw::c_uint; -pub type __ino_t = ::std::os::raw::c_ulong; -pub type __ino64_t = ::std::os::raw::c_ulong; -pub type __mode_t = ::std::os::raw::c_uint; -pub type __nlink_t = ::std::os::raw::c_ulong; -pub type __off_t = 
::std::os::raw::c_long; -pub type __off64_t = ::std::os::raw::c_long; -pub type __pid_t = ::std::os::raw::c_int; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct __fsid_t { - pub __val: [::std::os::raw::c_int; 2usize], -} -pub type __clock_t = ::std::os::raw::c_long; -pub type __rlim_t = ::std::os::raw::c_ulong; -pub type __rlim64_t = ::std::os::raw::c_ulong; -pub type __id_t = ::std::os::raw::c_uint; -pub type __time_t = ::std::os::raw::c_long; -pub type __useconds_t = ::std::os::raw::c_uint; -pub type __suseconds_t = ::std::os::raw::c_long; -pub type __suseconds64_t = ::std::os::raw::c_long; -pub type __daddr_t = ::std::os::raw::c_int; -pub type __key_t = ::std::os::raw::c_int; -pub type __clockid_t = ::std::os::raw::c_int; -pub type __timer_t = *mut ::std::os::raw::c_void; -pub type __blksize_t = ::std::os::raw::c_long; -pub type __blkcnt_t = ::std::os::raw::c_long; -pub type __blkcnt64_t = ::std::os::raw::c_long; -pub type __fsblkcnt_t = ::std::os::raw::c_ulong; -pub type __fsblkcnt64_t = ::std::os::raw::c_ulong; -pub type __fsfilcnt_t = ::std::os::raw::c_ulong; -pub type __fsfilcnt64_t = ::std::os::raw::c_ulong; -pub type __fsword_t = ::std::os::raw::c_long; -pub type __ssize_t = ::std::os::raw::c_long; -pub type __syscall_slong_t = ::std::os::raw::c_long; -pub type __syscall_ulong_t = ::std::os::raw::c_ulong; -pub type __loff_t = __off64_t; -pub type __caddr_t = *mut ::std::os::raw::c_char; -pub type __intptr_t = ::std::os::raw::c_long; -pub type __socklen_t = ::std::os::raw::c_uint; -pub type __sig_atomic_t = ::std::os::raw::c_int; -pub type u_char = __u_char; -pub type u_short = __u_short; -pub type u_int = __u_int; -pub type u_long = __u_long; -pub type quad_t = __quad_t; -pub type u_quad_t = __u_quad_t; -pub type fsid_t = __fsid_t; -pub type loff_t = __loff_t; -pub type ino_t = __ino_t; -pub type dev_t = __dev_t; -pub type gid_t = __gid_t; -pub type mode_t = __mode_t; -pub type nlink_t = __nlink_t; -pub type uid_t = __uid_t; -pub type off_t = 
__off_t; -pub type pid_t = __pid_t; -pub type id_t = __id_t; -pub type ssize_t = __ssize_t; -pub type daddr_t = __daddr_t; -pub type caddr_t = __caddr_t; -pub type key_t = __key_t; -pub type clock_t = __clock_t; -pub type clockid_t = __clockid_t; -pub type time_t = __time_t; -pub type timer_t = __timer_t; -pub type ulong = ::std::os::raw::c_ulong; -pub type ushort = ::std::os::raw::c_ushort; -pub type uint = ::std::os::raw::c_uint; -pub type u_int8_t = __uint8_t; -pub type u_int16_t = __uint16_t; -pub type u_int32_t = __uint32_t; -pub type u_int64_t = __uint64_t; -pub type register_t = ::std::os::raw::c_long; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct __sigset_t { - pub __val: [::std::os::raw::c_ulong; 16usize], -} -pub type sigset_t = __sigset_t; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct timeval { - pub tv_sec: __time_t, - pub tv_usec: __suseconds_t, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct timespec { - pub tv_sec: __time_t, - pub tv_nsec: __syscall_slong_t, -} -pub type suseconds_t = __suseconds_t; -pub type __fd_mask = ::std::os::raw::c_long; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct fd_set { - pub __fds_bits: [__fd_mask; 16usize], -} -pub type fd_mask = __fd_mask; -extern "C" { - pub fn select( - __nfds: ::std::os::raw::c_int, - __readfds: *mut fd_set, - __writefds: *mut fd_set, - __exceptfds: *mut fd_set, - __timeout: *mut timeval, - ) -> ::std::os::raw::c_int; -} -extern "C" { - pub fn pselect( - __nfds: ::std::os::raw::c_int, - __readfds: *mut fd_set, - __writefds: *mut fd_set, - __exceptfds: *mut fd_set, - __timeout: *const timespec, - __sigmask: *const __sigset_t, - ) -> ::std::os::raw::c_int; -} -pub type blksize_t = __blksize_t; -pub type blkcnt_t = __blkcnt_t; -pub type fsblkcnt_t = __fsblkcnt_t; -pub type fsfilcnt_t = __fsfilcnt_t; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct __pthread_internal_list { - pub __prev: *mut __pthread_internal_list, - pub __next: *mut __pthread_internal_list, 
-} -pub type __pthread_list_t = __pthread_internal_list; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct __pthread_internal_slist { - pub __next: *mut __pthread_internal_slist, -} -pub type __pthread_slist_t = __pthread_internal_slist; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct __pthread_mutex_s { - pub __lock: ::std::os::raw::c_int, - pub __count: ::std::os::raw::c_uint, - pub __owner: ::std::os::raw::c_int, - pub __nusers: ::std::os::raw::c_uint, - pub __kind: ::std::os::raw::c_int, - pub __spins: ::std::os::raw::c_short, - pub __elision: ::std::os::raw::c_short, - pub __list: __pthread_list_t, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct __pthread_rwlock_arch_t { - pub __readers: ::std::os::raw::c_uint, - pub __writers: ::std::os::raw::c_uint, - pub __wrphase_futex: ::std::os::raw::c_uint, - pub __writers_futex: ::std::os::raw::c_uint, - pub __pad3: ::std::os::raw::c_uint, - pub __pad4: ::std::os::raw::c_uint, - pub __cur_writer: ::std::os::raw::c_int, - pub __shared: ::std::os::raw::c_int, - pub __rwelision: ::std::os::raw::c_schar, - pub __pad1: [::std::os::raw::c_uchar; 7usize], - pub __pad2: ::std::os::raw::c_ulong, - pub __flags: ::std::os::raw::c_uint, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct __pthread_cond_s { - pub __bindgen_anon_1: __pthread_cond_s__bindgen_ty_1, - pub __bindgen_anon_2: __pthread_cond_s__bindgen_ty_2, - pub __g_refs: [::std::os::raw::c_uint; 2usize], - pub __g_size: [::std::os::raw::c_uint; 2usize], - pub __g1_orig_size: ::std::os::raw::c_uint, - pub __wrefs: ::std::os::raw::c_uint, - pub __g_signals: [::std::os::raw::c_uint; 2usize], -} -#[repr(C)] -#[derive(Copy, Clone)] -pub union __pthread_cond_s__bindgen_ty_1 { - pub __wseq: ::std::os::raw::c_ulonglong, - pub __wseq32: __pthread_cond_s__bindgen_ty_1__bindgen_ty_1, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct __pthread_cond_s__bindgen_ty_1__bindgen_ty_1 { - pub __low: ::std::os::raw::c_uint, - pub __high: ::std::os::raw::c_uint, 
-} -#[repr(C)] -#[derive(Copy, Clone)] -pub union __pthread_cond_s__bindgen_ty_2 { - pub __g1_start: ::std::os::raw::c_ulonglong, - pub __g1_start32: __pthread_cond_s__bindgen_ty_2__bindgen_ty_1, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct __pthread_cond_s__bindgen_ty_2__bindgen_ty_1 { - pub __low: ::std::os::raw::c_uint, - pub __high: ::std::os::raw::c_uint, -} -pub type __tss_t = ::std::os::raw::c_uint; -pub type __thrd_t = ::std::os::raw::c_ulong; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct __once_flag { - pub __data: ::std::os::raw::c_int, -} -pub type pthread_t = ::std::os::raw::c_ulong; -#[repr(C)] -#[derive(Copy, Clone)] -pub union pthread_mutexattr_t { - pub __size: [::std::os::raw::c_char; 4usize], - pub __align: ::std::os::raw::c_int, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub union pthread_condattr_t { - pub __size: [::std::os::raw::c_char; 4usize], - pub __align: ::std::os::raw::c_int, -} -pub type pthread_key_t = ::std::os::raw::c_uint; -pub type pthread_once_t = ::std::os::raw::c_int; -#[repr(C)] -#[derive(Copy, Clone)] -pub union pthread_attr_t { - pub __size: [::std::os::raw::c_char; 56usize], - pub __align: ::std::os::raw::c_long, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub union pthread_mutex_t { - pub __data: __pthread_mutex_s, - pub __size: [::std::os::raw::c_char; 40usize], - pub __align: ::std::os::raw::c_long, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub union pthread_cond_t { - pub __data: __pthread_cond_s, - pub __size: [::std::os::raw::c_char; 48usize], - pub __align: ::std::os::raw::c_longlong, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub union pthread_rwlock_t { - pub __data: __pthread_rwlock_arch_t, - pub __size: [::std::os::raw::c_char; 56usize], - pub __align: ::std::os::raw::c_long, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub union pthread_rwlockattr_t { - pub __size: [::std::os::raw::c_char; 8usize], - pub __align: ::std::os::raw::c_long, -} -pub type pthread_spinlock_t = ::std::os::raw::c_int; -#[repr(C)] 
-#[derive(Copy, Clone)] -pub union pthread_barrier_t { - pub __size: [::std::os::raw::c_char; 32usize], - pub __align: ::std::os::raw::c_long, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub union pthread_barrierattr_t { - pub __size: [::std::os::raw::c_char; 4usize], - pub __align: ::std::os::raw::c_int, -} -pub type socklen_t = __socklen_t; -pub const __socket_type_SOCK_STREAM: __socket_type = 1; -pub const __socket_type_SOCK_DGRAM: __socket_type = 2; -pub const __socket_type_SOCK_RAW: __socket_type = 3; -pub const __socket_type_SOCK_RDM: __socket_type = 4; -pub const __socket_type_SOCK_SEQPACKET: __socket_type = 5; -pub const __socket_type_SOCK_DCCP: __socket_type = 6; -pub const __socket_type_SOCK_PACKET: __socket_type = 10; -pub const __socket_type_SOCK_CLOEXEC: __socket_type = 524288; -pub const __socket_type_SOCK_NONBLOCK: __socket_type = 2048; -pub type __socket_type = ::std::os::raw::c_uint; -pub type sa_family_t = ::std::os::raw::c_ushort; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct sockaddr { - pub sa_family: sa_family_t, - pub sa_data: [::std::os::raw::c_uchar; 14usize], -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct sockaddr_storage { - pub ss_family: sa_family_t, - pub __ss_padding: [::std::os::raw::c_char; 118usize], - pub __ss_align: ::std::os::raw::c_ulong, -} -pub const MSG_OOB: ::std::os::raw::c_uint = 1; -pub const MSG_PEEK: ::std::os::raw::c_uint = 2; -pub const MSG_DONTROUTE: ::std::os::raw::c_uint = 4; -pub const MSG_CTRUNC: ::std::os::raw::c_uint = 8; -pub const MSG_PROXY: ::std::os::raw::c_uint = 16; -pub const MSG_TRUNC: ::std::os::raw::c_uint = 32; -pub const MSG_DONTWAIT: ::std::os::raw::c_uint = 64; -pub const MSG_EOR: ::std::os::raw::c_uint = 128; -pub const MSG_WAITALL: ::std::os::raw::c_uint = 256; -pub const MSG_FIN: ::std::os::raw::c_uint = 512; -pub const MSG_SYN: ::std::os::raw::c_uint = 1024; -pub const MSG_CONFIRM: ::std::os::raw::c_uint = 2048; -pub const MSG_RST: ::std::os::raw::c_uint = 4096; -pub const 
MSG_ERRQUEUE: ::std::os::raw::c_uint = 8192; -pub const MSG_NOSIGNAL: ::std::os::raw::c_uint = 16384; -pub const MSG_MORE: ::std::os::raw::c_uint = 32768; -pub const MSG_WAITFORONE: ::std::os::raw::c_uint = 65536; -pub const MSG_BATCH: ::std::os::raw::c_uint = 262144; -pub const MSG_ZEROCOPY: ::std::os::raw::c_uint = 67108864; -pub const MSG_FASTOPEN: ::std::os::raw::c_uint = 536870912; -pub const MSG_CMSG_CLOEXEC: ::std::os::raw::c_uint = 1073741824; -pub type _bindgen_ty_1 = ::std::os::raw::c_uint; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct msghdr { - pub msg_name: *mut ::std::os::raw::c_void, - pub msg_namelen: socklen_t, - pub msg_iov: *mut iovec, - pub msg_iovlen: size_t, - pub msg_control: *mut ::std::os::raw::c_void, - pub msg_controllen: size_t, - pub msg_flags: ::std::os::raw::c_int, -} -#[repr(C)] -#[derive(Debug)] -pub struct cmsghdr { - pub cmsg_len: size_t, - pub cmsg_level: ::std::os::raw::c_int, - pub cmsg_type: ::std::os::raw::c_int, - pub __cmsg_data: __IncompleteArrayField<::std::os::raw::c_uchar>, -} -extern "C" { - pub fn __cmsg_nxthdr(__mhdr: *mut msghdr, __cmsg: *mut cmsghdr) -> *mut cmsghdr; -} -pub const SCM_RIGHTS: ::std::os::raw::c_uint = 1; -pub type _bindgen_ty_2 = ::std::os::raw::c_uint; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct linger { - pub l_onoff: ::std::os::raw::c_int, - pub l_linger: ::std::os::raw::c_int, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct osockaddr { - pub sa_family: ::std::os::raw::c_ushort, - pub sa_data: [::std::os::raw::c_uchar; 14usize], -} -pub const SHUT_RD: ::std::os::raw::c_uint = 0; -pub const SHUT_WR: ::std::os::raw::c_uint = 1; -pub const SHUT_RDWR: ::std::os::raw::c_uint = 2; -pub type _bindgen_ty_3 = ::std::os::raw::c_uint; -extern "C" { - pub fn socket( - __domain: ::std::os::raw::c_int, - __type: ::std::os::raw::c_int, - __protocol: ::std::os::raw::c_int, - ) -> ::std::os::raw::c_int; -} -extern "C" { - pub fn socketpair( - __domain: ::std::os::raw::c_int, - 
__type: ::std::os::raw::c_int, - __protocol: ::std::os::raw::c_int, - __fds: *mut ::std::os::raw::c_int, - ) -> ::std::os::raw::c_int; -} -extern "C" { - pub fn bind( - __fd: ::std::os::raw::c_int, - __addr: *const sockaddr, - __len: socklen_t, - ) -> ::std::os::raw::c_int; -} -extern "C" { - pub fn getsockname( - __fd: ::std::os::raw::c_int, - __addr: *mut sockaddr, - __len: *mut socklen_t, - ) -> ::std::os::raw::c_int; -} -extern "C" { - pub fn connect( - __fd: ::std::os::raw::c_int, - __addr: *const sockaddr, - __len: socklen_t, - ) -> ::std::os::raw::c_int; -} -extern "C" { - pub fn getpeername( - __fd: ::std::os::raw::c_int, - __addr: *mut sockaddr, - __len: *mut socklen_t, - ) -> ::std::os::raw::c_int; -} -extern "C" { - pub fn send( - __fd: ::std::os::raw::c_int, - __buf: *const ::std::os::raw::c_void, - __n: size_t, - __flags: ::std::os::raw::c_int, - ) -> ssize_t; -} -extern "C" { - pub fn recv( - __fd: ::std::os::raw::c_int, - __buf: *mut ::std::os::raw::c_void, - __n: size_t, - __flags: ::std::os::raw::c_int, - ) -> ssize_t; -} -extern "C" { - pub fn sendto( - __fd: ::std::os::raw::c_int, - __buf: *const ::std::os::raw::c_void, - __n: size_t, - __flags: ::std::os::raw::c_int, - __addr: *const sockaddr, - __addr_len: socklen_t, - ) -> ssize_t; -} -extern "C" { - pub fn recvfrom( - __fd: ::std::os::raw::c_int, - __buf: *mut ::std::os::raw::c_void, - __n: size_t, - __flags: ::std::os::raw::c_int, - __addr: *mut sockaddr, - __addr_len: *mut socklen_t, - ) -> ssize_t; -} -extern "C" { - pub fn sendmsg( - __fd: ::std::os::raw::c_int, - __message: *const msghdr, - __flags: ::std::os::raw::c_int, - ) -> ssize_t; -} -extern "C" { - pub fn recvmsg( - __fd: ::std::os::raw::c_int, - __message: *mut msghdr, - __flags: ::std::os::raw::c_int, - ) -> ssize_t; -} -extern "C" { - pub fn getsockopt( - __fd: ::std::os::raw::c_int, - __level: ::std::os::raw::c_int, - __optname: ::std::os::raw::c_int, - __optval: *mut ::std::os::raw::c_void, - __optlen: *mut socklen_t, - ) -> 
::std::os::raw::c_int; -} -extern "C" { - pub fn setsockopt( - __fd: ::std::os::raw::c_int, - __level: ::std::os::raw::c_int, - __optname: ::std::os::raw::c_int, - __optval: *const ::std::os::raw::c_void, - __optlen: socklen_t, - ) -> ::std::os::raw::c_int; -} -extern "C" { - pub fn listen(__fd: ::std::os::raw::c_int, __n: ::std::os::raw::c_int) - -> ::std::os::raw::c_int; -} -extern "C" { - pub fn accept( - __fd: ::std::os::raw::c_int, - __addr: *mut sockaddr, - __addr_len: *mut socklen_t, - ) -> ::std::os::raw::c_int; -} -extern "C" { - pub fn shutdown( - __fd: ::std::os::raw::c_int, - __how: ::std::os::raw::c_int, - ) -> ::std::os::raw::c_int; -} -extern "C" { - pub fn sockatmark(__fd: ::std::os::raw::c_int) -> ::std::os::raw::c_int; -} -extern "C" { - pub fn isfdtype( - __fd: ::std::os::raw::c_int, - __fdtype: ::std::os::raw::c_int, - ) -> ::std::os::raw::c_int; -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct sync_serial_settings { - pub clock_rate: ::std::os::raw::c_uint, - pub clock_type: ::std::os::raw::c_uint, - pub loopback: ::std::os::raw::c_ushort, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct te1_settings { - pub clock_rate: ::std::os::raw::c_uint, - pub clock_type: ::std::os::raw::c_uint, - pub loopback: ::std::os::raw::c_ushort, - pub slot_map: ::std::os::raw::c_uint, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct raw_hdlc_proto { - pub encoding: ::std::os::raw::c_ushort, - pub parity: ::std::os::raw::c_ushort, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct fr_proto { - pub t391: ::std::os::raw::c_uint, - pub t392: ::std::os::raw::c_uint, - pub n391: ::std::os::raw::c_uint, - pub n392: ::std::os::raw::c_uint, - pub n393: ::std::os::raw::c_uint, - pub lmi: ::std::os::raw::c_ushort, - pub dce: ::std::os::raw::c_ushort, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct fr_proto_pvc { - pub dlci: ::std::os::raw::c_uint, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct fr_proto_pvc_info { - pub 
dlci: ::std::os::raw::c_uint, - pub master: [::std::os::raw::c_char; 16usize], -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct cisco_proto { - pub interval: ::std::os::raw::c_uint, - pub timeout: ::std::os::raw::c_uint, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct x25_hdlc_proto { - pub dce: ::std::os::raw::c_ushort, - pub modulo: ::std::os::raw::c_uint, - pub window: ::std::os::raw::c_uint, - pub t1: ::std::os::raw::c_uint, - pub t2: ::std::os::raw::c_uint, - pub n2: ::std::os::raw::c_uint, -} -pub const net_device_flags_IFF_UP: net_device_flags = 1; -pub const net_device_flags_IFF_BROADCAST: net_device_flags = 2; -pub const net_device_flags_IFF_DEBUG: net_device_flags = 4; -pub const net_device_flags_IFF_LOOPBACK: net_device_flags = 8; -pub const net_device_flags_IFF_POINTOPOINT: net_device_flags = 16; -pub const net_device_flags_IFF_NOTRAILERS: net_device_flags = 32; -pub const net_device_flags_IFF_RUNNING: net_device_flags = 64; -pub const net_device_flags_IFF_NOARP: net_device_flags = 128; -pub const net_device_flags_IFF_PROMISC: net_device_flags = 256; -pub const net_device_flags_IFF_ALLMULTI: net_device_flags = 512; -pub const net_device_flags_IFF_MASTER: net_device_flags = 1024; -pub const net_device_flags_IFF_SLAVE: net_device_flags = 2048; -pub const net_device_flags_IFF_MULTICAST: net_device_flags = 4096; -pub const net_device_flags_IFF_PORTSEL: net_device_flags = 8192; -pub const net_device_flags_IFF_AUTOMEDIA: net_device_flags = 16384; -pub const net_device_flags_IFF_DYNAMIC: net_device_flags = 32768; -pub const net_device_flags_IFF_LOWER_UP: net_device_flags = 65536; -pub const net_device_flags_IFF_DORMANT: net_device_flags = 131072; -pub const net_device_flags_IFF_ECHO: net_device_flags = 262144; -#[doc = " enum net_device_flags - &struct net_device flags"] -#[doc = ""] -#[doc = " These are the &struct net_device flags, they can be set by drivers, the"] -#[doc = " kernel and some can be triggered by userspace. 
Userspace can query and"] -#[doc = " set these flags using userspace utilities but there is also a sysfs"] -#[doc = " entry available for all dev flags which can be queried and set. These flags"] -#[doc = " are shared for all types of net_devices. The sysfs entries are available"] -#[doc = " via /sys/class/net//flags. Flags which can be toggled through sysfs"] -#[doc = " are annotated below, note that only a few flags can be toggled and some"] -#[doc = " other flags are always preserved from the original net_device flags"] -#[doc = " even if you try to set them via sysfs. Flags which are always preserved"] -#[doc = " are kept under the flag grouping @IFF_VOLATILE. Flags which are __volatile__"] -#[doc = " are annotated below as such."] -#[doc = ""] -#[doc = " You should have a pretty good reason to be extending these flags."] -#[doc = ""] -#[doc = " @IFF_UP: interface is up. Can be toggled through sysfs."] -#[doc = " @IFF_BROADCAST: broadcast address valid. Volatile."] -#[doc = " @IFF_DEBUG: turn on debugging. Can be toggled through sysfs."] -#[doc = " @IFF_LOOPBACK: is a loopback net. Volatile."] -#[doc = " @IFF_POINTOPOINT: interface is has p-p link. Volatile."] -#[doc = " @IFF_NOTRAILERS: avoid use of trailers. Can be toggled through sysfs."] -#[doc = "\tVolatile."] -#[doc = " @IFF_RUNNING: interface RFC2863 OPER_UP. Volatile."] -#[doc = " @IFF_NOARP: no ARP protocol. Can be toggled through sysfs. Volatile."] -#[doc = " @IFF_PROMISC: receive all packets. Can be toggled through sysfs."] -#[doc = " @IFF_ALLMULTI: receive all multicast packets. Can be toggled through"] -#[doc = "\tsysfs."] -#[doc = " @IFF_MASTER: master of a load balancer. Volatile."] -#[doc = " @IFF_SLAVE: slave of a load balancer. Volatile."] -#[doc = " @IFF_MULTICAST: Supports multicast. Can be toggled through sysfs."] -#[doc = " @IFF_PORTSEL: can set media type. Can be toggled through sysfs."] -#[doc = " @IFF_AUTOMEDIA: auto media select active. 
Can be toggled through sysfs."] -#[doc = " @IFF_DYNAMIC: dialup device with changing addresses. Can be toggled"] -#[doc = "\tthrough sysfs."] -#[doc = " @IFF_LOWER_UP: driver signals L1 up. Volatile."] -#[doc = " @IFF_DORMANT: driver signals dormant. Volatile."] -#[doc = " @IFF_ECHO: echo sent packets. Volatile."] -pub type net_device_flags = ::std::os::raw::c_uint; -pub const IF_OPER_UNKNOWN: ::std::os::raw::c_uint = 0; -pub const IF_OPER_NOTPRESENT: ::std::os::raw::c_uint = 1; -pub const IF_OPER_DOWN: ::std::os::raw::c_uint = 2; -pub const IF_OPER_LOWERLAYERDOWN: ::std::os::raw::c_uint = 3; -pub const IF_OPER_TESTING: ::std::os::raw::c_uint = 4; -pub const IF_OPER_DORMANT: ::std::os::raw::c_uint = 5; -pub const IF_OPER_UP: ::std::os::raw::c_uint = 6; -pub type _bindgen_ty_4 = ::std::os::raw::c_uint; -pub const IF_LINK_MODE_DEFAULT: ::std::os::raw::c_uint = 0; -pub const IF_LINK_MODE_DORMANT: ::std::os::raw::c_uint = 1; -pub const IF_LINK_MODE_TESTING: ::std::os::raw::c_uint = 2; -pub type _bindgen_ty_5 = ::std::os::raw::c_uint; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct ifmap { - pub mem_start: ::std::os::raw::c_ulong, - pub mem_end: ::std::os::raw::c_ulong, - pub base_addr: ::std::os::raw::c_ushort, - pub irq: ::std::os::raw::c_uchar, - pub dma: ::std::os::raw::c_uchar, - pub port: ::std::os::raw::c_uchar, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct if_settings { - pub type_: ::std::os::raw::c_uint, - pub size: ::std::os::raw::c_uint, - pub ifs_ifsu: if_settings__bindgen_ty_1, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub union if_settings__bindgen_ty_1 { - pub raw_hdlc: *mut raw_hdlc_proto, - pub cisco: *mut cisco_proto, - pub fr: *mut fr_proto, - pub fr_pvc: *mut fr_proto_pvc, - pub fr_pvc_info: *mut fr_proto_pvc_info, - pub x25: *mut x25_hdlc_proto, - pub sync: *mut sync_serial_settings, - pub te1: *mut te1_settings, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct ifreq { - pub ifr_ifrn: ifreq__bindgen_ty_1, - pub ifr_ifru: 
ifreq__bindgen_ty_2, -} - -impl Default for ifreq { - fn default() -> Self { - // SAFETY: all zeros is a valid pattern for this data type - unsafe { std::mem::zeroed() } - } -} -#[repr(C)] -#[derive(Copy, Clone)] -pub union ifreq__bindgen_ty_1 { - pub ifrn_name: [::std::os::raw::c_uchar; 16usize], -} -#[repr(C)] -#[derive(Copy, Clone)] -pub union ifreq__bindgen_ty_2 { - pub ifru_addr: sockaddr, - pub ifru_dstaddr: sockaddr, - pub ifru_broadaddr: sockaddr, - pub ifru_netmask: sockaddr, - pub ifru_hwaddr: sockaddr, - pub ifru_flags: ::std::os::raw::c_short, - pub ifru_ivalue: ::std::os::raw::c_int, - pub ifru_mtu: ::std::os::raw::c_int, - pub ifru_map: ifmap, - pub ifru_slave: [::std::os::raw::c_uchar; 16usize], - pub ifru_newname: [::std::os::raw::c_uchar; 16usize], - pub ifru_data: *mut ::std::os::raw::c_void, - pub ifru_settings: if_settings, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct ifconf { - pub ifc_len: ::std::os::raw::c_int, - pub ifc_ifcu: ifconf__bindgen_ty_1, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub union ifconf__bindgen_ty_1 { - pub ifcu_buf: *mut ::std::os::raw::c_char, - pub ifcu_req: *mut ifreq, -} diff --git a/net_gen/src/inn.rs b/net_gen/src/inn.rs deleted file mode 100644 index f7a4e508a4..0000000000 --- a/net_gen/src/inn.rs +++ /dev/null @@ -1,294 +0,0 @@ -// Copyright © 2021 Intel Corporation -// -// SPDX-License-Identifier: Apache-2.0 - -// bindgen /usr/include/linux/in.h --no-layout-tests - -/* automatically generated by rust-bindgen 0.58.1 */ - -pub const __BITS_PER_LONG: u32 = 64; -pub const __FD_SETSIZE: u32 = 1024; -pub const __UAPI_DEF_IF_IFCONF: u32 = 1; -pub const __UAPI_DEF_IF_IFMAP: u32 = 1; -pub const __UAPI_DEF_IF_IFNAMSIZ: u32 = 1; -pub const __UAPI_DEF_IF_IFREQ: u32 = 1; -pub const __UAPI_DEF_IF_NET_DEVICE_FLAGS: u32 = 1; -pub const __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO: u32 = 1; -pub const __UAPI_DEF_IN_ADDR: u32 = 1; -pub const __UAPI_DEF_IN_IPPROTO: u32 = 1; -pub const __UAPI_DEF_IN_PKTINFO: u32 = 1; 
-pub const __UAPI_DEF_IP_MREQ: u32 = 1; -pub const __UAPI_DEF_SOCKADDR_IN: u32 = 1; -pub const __UAPI_DEF_IN_CLASS: u32 = 1; -pub const __UAPI_DEF_IN6_ADDR: u32 = 1; -pub const __UAPI_DEF_IN6_ADDR_ALT: u32 = 1; -pub const __UAPI_DEF_SOCKADDR_IN6: u32 = 1; -pub const __UAPI_DEF_IPV6_MREQ: u32 = 1; -pub const __UAPI_DEF_IPPROTO_V6: u32 = 1; -pub const __UAPI_DEF_IPV6_OPTIONS: u32 = 1; -pub const __UAPI_DEF_IN6_PKTINFO: u32 = 1; -pub const __UAPI_DEF_IP6_MTUINFO: u32 = 1; -pub const __UAPI_DEF_SOCKADDR_IPX: u32 = 1; -pub const __UAPI_DEF_IPX_ROUTE_DEFINITION: u32 = 1; -pub const __UAPI_DEF_IPX_INTERFACE_DEFINITION: u32 = 1; -pub const __UAPI_DEF_IPX_CONFIG_DATA: u32 = 1; -pub const __UAPI_DEF_IPX_ROUTE_DEF: u32 = 1; -pub const __UAPI_DEF_XATTR: u32 = 1; -pub const _K_SS_MAXSIZE: u32 = 128; -pub const IP_TOS: u32 = 1; -pub const IP_TTL: u32 = 2; -pub const IP_HDRINCL: u32 = 3; -pub const IP_OPTIONS: u32 = 4; -pub const IP_ROUTER_ALERT: u32 = 5; -pub const IP_RECVOPTS: u32 = 6; -pub const IP_RETOPTS: u32 = 7; -pub const IP_PKTINFO: u32 = 8; -pub const IP_PKTOPTIONS: u32 = 9; -pub const IP_MTU_DISCOVER: u32 = 10; -pub const IP_RECVERR: u32 = 11; -pub const IP_RECVTTL: u32 = 12; -pub const IP_RECVTOS: u32 = 13; -pub const IP_MTU: u32 = 14; -pub const IP_FREEBIND: u32 = 15; -pub const IP_IPSEC_POLICY: u32 = 16; -pub const IP_XFRM_POLICY: u32 = 17; -pub const IP_PASSSEC: u32 = 18; -pub const IP_TRANSPARENT: u32 = 19; -pub const IP_RECVRETOPTS: u32 = 7; -pub const IP_ORIGDSTADDR: u32 = 20; -pub const IP_RECVORIGDSTADDR: u32 = 20; -pub const IP_MINTTL: u32 = 21; -pub const IP_NODEFRAG: u32 = 22; -pub const IP_CHECKSUM: u32 = 23; -pub const IP_BIND_ADDRESS_NO_PORT: u32 = 24; -pub const IP_RECVFRAGSIZE: u32 = 25; -pub const IP_PMTUDISC_DONT: u32 = 0; -pub const IP_PMTUDISC_WANT: u32 = 1; -pub const IP_PMTUDISC_DO: u32 = 2; -pub const IP_PMTUDISC_PROBE: u32 = 3; -pub const IP_PMTUDISC_INTERFACE: u32 = 4; -pub const IP_PMTUDISC_OMIT: u32 = 5; -pub const IP_MULTICAST_IF: u32 = 32; 
-pub const IP_MULTICAST_TTL: u32 = 33; -pub const IP_MULTICAST_LOOP: u32 = 34; -pub const IP_ADD_MEMBERSHIP: u32 = 35; -pub const IP_DROP_MEMBERSHIP: u32 = 36; -pub const IP_UNBLOCK_SOURCE: u32 = 37; -pub const IP_BLOCK_SOURCE: u32 = 38; -pub const IP_ADD_SOURCE_MEMBERSHIP: u32 = 39; -pub const IP_DROP_SOURCE_MEMBERSHIP: u32 = 40; -pub const IP_MSFILTER: u32 = 41; -pub const MCAST_JOIN_GROUP: u32 = 42; -pub const MCAST_BLOCK_SOURCE: u32 = 43; -pub const MCAST_UNBLOCK_SOURCE: u32 = 44; -pub const MCAST_LEAVE_GROUP: u32 = 45; -pub const MCAST_JOIN_SOURCE_GROUP: u32 = 46; -pub const MCAST_LEAVE_SOURCE_GROUP: u32 = 47; -pub const MCAST_MSFILTER: u32 = 48; -pub const IP_MULTICAST_ALL: u32 = 49; -pub const IP_UNICAST_IF: u32 = 50; -pub const MCAST_EXCLUDE: u32 = 0; -pub const MCAST_INCLUDE: u32 = 1; -pub const IP_DEFAULT_MULTICAST_TTL: u32 = 1; -pub const IP_DEFAULT_MULTICAST_LOOP: u32 = 1; -pub const __SOCK_SIZE__: u32 = 16; -pub const IN_CLASSA_NET: u32 = 4278190080; -pub const IN_CLASSA_NSHIFT: u32 = 24; -pub const IN_CLASSA_HOST: u32 = 16777215; -pub const IN_CLASSA_MAX: u32 = 128; -pub const IN_CLASSB_NET: u32 = 4294901760; -pub const IN_CLASSB_NSHIFT: u32 = 16; -pub const IN_CLASSB_HOST: u32 = 65535; -pub const IN_CLASSB_MAX: u32 = 65536; -pub const IN_CLASSC_NET: u32 = 4294967040; -pub const IN_CLASSC_NSHIFT: u32 = 8; -pub const IN_CLASSC_HOST: u32 = 255; -pub const IN_MULTICAST_NET: u32 = 3758096384; -pub const IN_CLASSE_NET: u32 = 4294967295; -pub const IN_CLASSE_NSHIFT: u32 = 0; -pub const IN_LOOPBACKNET: u32 = 127; -pub const INADDR_LOOPBACK: u32 = 2130706433; -pub const INADDR_UNSPEC_GROUP: u32 = 3758096384; -pub const INADDR_ALLHOSTS_GROUP: u32 = 3758096385; -pub const INADDR_ALLRTRS_GROUP: u32 = 3758096386; -pub const INADDR_ALLSNOOPERS_GROUP: u32 = 3758096490; -pub const INADDR_MAX_LOCAL_GROUP: u32 = 3758096639; -pub const __LITTLE_ENDIAN: u32 = 1234; -pub type __s8 = ::std::os::raw::c_schar; -pub type __u8 = ::std::os::raw::c_uchar; -pub type __s16 = 
::std::os::raw::c_short; -pub type __u16 = ::std::os::raw::c_ushort; -pub type __s32 = ::std::os::raw::c_int; -pub type __u32 = ::std::os::raw::c_uint; -pub type __s64 = ::std::os::raw::c_longlong; -pub type __u64 = ::std::os::raw::c_ulonglong; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct __kernel_fd_set { - pub fds_bits: [::std::os::raw::c_ulong; 16usize], -} -pub type __kernel_sighandler_t = - ::std::option::Option; -pub type __kernel_key_t = ::std::os::raw::c_int; -pub type __kernel_mqd_t = ::std::os::raw::c_int; -pub type __kernel_old_uid_t = ::std::os::raw::c_ushort; -pub type __kernel_old_gid_t = ::std::os::raw::c_ushort; -pub type __kernel_old_dev_t = ::std::os::raw::c_ulong; -pub type __kernel_long_t = ::std::os::raw::c_long; -pub type __kernel_ulong_t = ::std::os::raw::c_ulong; -pub type __kernel_ino_t = __kernel_ulong_t; -pub type __kernel_mode_t = ::std::os::raw::c_uint; -pub type __kernel_pid_t = ::std::os::raw::c_int; -pub type __kernel_ipc_pid_t = ::std::os::raw::c_int; -pub type __kernel_uid_t = ::std::os::raw::c_uint; -pub type __kernel_gid_t = ::std::os::raw::c_uint; -pub type __kernel_suseconds_t = __kernel_long_t; -pub type __kernel_daddr_t = ::std::os::raw::c_int; -pub type __kernel_uid32_t = ::std::os::raw::c_uint; -pub type __kernel_gid32_t = ::std::os::raw::c_uint; -pub type __kernel_size_t = __kernel_ulong_t; -pub type __kernel_ssize_t = __kernel_long_t; -pub type __kernel_ptrdiff_t = __kernel_long_t; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct __kernel_fsid_t { - pub val: [::std::os::raw::c_int; 2usize], -} -pub type __kernel_off_t = __kernel_long_t; -pub type __kernel_loff_t = ::std::os::raw::c_longlong; -pub type __kernel_old_time_t = __kernel_long_t; -pub type __kernel_time_t = __kernel_long_t; -pub type __kernel_time64_t = ::std::os::raw::c_longlong; -pub type __kernel_clock_t = __kernel_long_t; -pub type __kernel_timer_t = ::std::os::raw::c_int; -pub type __kernel_clockid_t = ::std::os::raw::c_int; -pub type 
__kernel_caddr_t = *mut ::std::os::raw::c_char; -pub type __kernel_uid16_t = ::std::os::raw::c_ushort; -pub type __kernel_gid16_t = ::std::os::raw::c_ushort; -pub type __le16 = __u16; -pub type __be16 = __u16; -pub type __le32 = __u32; -pub type __be32 = __u32; -pub type __le64 = __u64; -pub type __be64 = __u64; -pub type __sum16 = __u16; -pub type __wsum = __u32; -pub type __poll_t = ::std::os::raw::c_uint; -pub type __kernel_sa_family_t = ::std::os::raw::c_ushort; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct __kernel_sockaddr_storage { - pub __bindgen_anon_1: __kernel_sockaddr_storage__bindgen_ty_1, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub union __kernel_sockaddr_storage__bindgen_ty_1 { - pub __bindgen_anon_1: __kernel_sockaddr_storage__bindgen_ty_1__bindgen_ty_1, - pub __align: *mut ::std::os::raw::c_void, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct __kernel_sockaddr_storage__bindgen_ty_1__bindgen_ty_1 { - pub ss_family: __kernel_sa_family_t, - pub __data: [::std::os::raw::c_char; 126usize], -} -pub const IPPROTO_IP: ::std::os::raw::c_uint = 0; -pub const IPPROTO_ICMP: ::std::os::raw::c_uint = 1; -pub const IPPROTO_IGMP: ::std::os::raw::c_uint = 2; -pub const IPPROTO_IPIP: ::std::os::raw::c_uint = 4; -pub const IPPROTO_TCP: ::std::os::raw::c_uint = 6; -pub const IPPROTO_EGP: ::std::os::raw::c_uint = 8; -pub const IPPROTO_PUP: ::std::os::raw::c_uint = 12; -pub const IPPROTO_UDP: ::std::os::raw::c_uint = 17; -pub const IPPROTO_IDP: ::std::os::raw::c_uint = 22; -pub const IPPROTO_TP: ::std::os::raw::c_uint = 29; -pub const IPPROTO_DCCP: ::std::os::raw::c_uint = 33; -pub const IPPROTO_IPV6: ::std::os::raw::c_uint = 41; -pub const IPPROTO_RSVP: ::std::os::raw::c_uint = 46; -pub const IPPROTO_GRE: ::std::os::raw::c_uint = 47; -pub const IPPROTO_ESP: ::std::os::raw::c_uint = 50; -pub const IPPROTO_AH: ::std::os::raw::c_uint = 51; -pub const IPPROTO_MTP: ::std::os::raw::c_uint = 92; -pub const IPPROTO_BEETPH: ::std::os::raw::c_uint = 94; -pub const 
IPPROTO_ENCAP: ::std::os::raw::c_uint = 98; -pub const IPPROTO_PIM: ::std::os::raw::c_uint = 103; -pub const IPPROTO_COMP: ::std::os::raw::c_uint = 108; -pub const IPPROTO_SCTP: ::std::os::raw::c_uint = 132; -pub const IPPROTO_UDPLITE: ::std::os::raw::c_uint = 136; -pub const IPPROTO_MPLS: ::std::os::raw::c_uint = 137; -pub const IPPROTO_ETHERNET: ::std::os::raw::c_uint = 143; -pub const IPPROTO_RAW: ::std::os::raw::c_uint = 255; -pub const IPPROTO_MPTCP: ::std::os::raw::c_uint = 262; -pub const IPPROTO_MAX: ::std::os::raw::c_uint = 263; -pub type _bindgen_ty_1 = ::std::os::raw::c_uint; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct in_addr { - pub s_addr: __be32, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct ip_mreq { - pub imr_multiaddr: in_addr, - pub imr_interface: in_addr, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct ip_mreqn { - pub imr_multiaddr: in_addr, - pub imr_address: in_addr, - pub imr_ifindex: ::std::os::raw::c_int, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct ip_mreq_source { - pub imr_multiaddr: __be32, - pub imr_interface: __be32, - pub imr_sourceaddr: __be32, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct ip_msfilter { - pub imsf_multiaddr: __be32, - pub imsf_interface: __be32, - pub imsf_fmode: __u32, - pub imsf_numsrc: __u32, - pub imsf_slist: [__be32; 1usize], -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct group_req { - pub gr_interface: __u32, - pub gr_group: __kernel_sockaddr_storage, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct group_source_req { - pub gsr_interface: __u32, - pub gsr_group: __kernel_sockaddr_storage, - pub gsr_source: __kernel_sockaddr_storage, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct group_filter { - pub gf_interface: __u32, - pub gf_group: __kernel_sockaddr_storage, - pub gf_fmode: __u32, - pub gf_numsrc: __u32, - pub gf_slist: [__kernel_sockaddr_storage; 1usize], -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct in_pktinfo { - pub 
ipi_ifindex: ::std::os::raw::c_int, - pub ipi_spec_dst: in_addr, - pub ipi_addr: in_addr, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct sockaddr_in { - pub sin_family: __kernel_sa_family_t, - pub sin_port: __be16, - pub sin_addr: in_addr, - pub __pad: [::std::os::raw::c_uchar; 8usize], -} diff --git a/net_gen/src/ipv6.rs b/net_gen/src/ipv6.rs deleted file mode 100644 index 65d9349ec3..0000000000 --- a/net_gen/src/ipv6.rs +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright © 2025 Cloud Hypervisor Authors -// -// SPDX-License-Identifier: Apache-2.0 - -// bindgen /usr/include/linux/ipv6.h --no-layout-tests --constified-enum '*' --allowlist-type 'sockaddr_in6|in6_ifreq' - -/* automatically generated by rust-bindgen 0.71.1 */ - -pub type __u8 = ::std::os::raw::c_uchar; -pub type __u16 = ::std::os::raw::c_ushort; -pub type __u32 = ::std::os::raw::c_uint; -pub type __be16 = __u16; -pub type __be32 = __u32; -#[repr(C)] -#[derive(Copy, Clone)] -pub struct in6_addr { - pub in6_u: in6_addr__bindgen_ty_1, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub union in6_addr__bindgen_ty_1 { - pub u6_addr8: [__u8; 16usize], - pub u6_addr16: [__be16; 8usize], - pub u6_addr32: [__be32; 4usize], -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct sockaddr_in6 { - pub sin6_family: ::std::os::raw::c_ushort, - pub sin6_port: __be16, - pub sin6_flowinfo: __be32, - pub sin6_addr: in6_addr, - pub sin6_scope_id: __u32, -} -#[repr(C)] -#[derive(Copy, Clone)] -pub struct in6_ifreq { - pub ifr6_addr: in6_addr, - pub ifr6_prefixlen: __u32, - pub ifr6_ifindex: ::std::os::raw::c_int, -} diff --git a/net_gen/src/lib.rs b/net_gen/src/lib.rs deleted file mode 100644 index 91a5c8c15d..0000000000 --- a/net_gen/src/lib.rs +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright TUNTAP, 2017 The Chromium OS Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the THIRD-PARTY file. 
-// -// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause - -#![allow(non_upper_case_globals)] -#![allow(non_camel_case_types)] -#![allow(non_snake_case)] - -// generated with bindgen /usr/include/linux/if.h --no-unstable-rust -// --constified-enum '*' --with-derive-default -- -D __UAPI_DEF_IF_IFNAMSIZ -D -// __UAPI_DEF_IF_NET_DEVICE_FLAGS -D __UAPI_DEF_IF_IFREQ -D __UAPI_DEF_IF_IFMAP -// Name is "iff" to avoid conflicting with "if" keyword. -// Generated against Linux 4.11 to include fix "uapi: fix linux/if.h userspace -// compilation errors". -// Manual fixup of ifrn_name to be of type c_uchar instead of c_char. -pub mod iff; -// generated with bindgen /usr/include/linux/if_tun.h --no-unstable-rust -// --constified-enum '*' --with-derive-default -pub mod if_tun; -// generated with bindgen /usr/include/linux/in.h --no-unstable-rust -// --constified-enum '*' --with-derive-default -// Name is "inn" to avoid conflicting with "in" keyword. -pub mod inn; -// generated with bindgen /usr/include/linux/ipv6.h --no-layout-tests --constified-enum '*' -// --allowlist-type 'sockaddr_in6|in6_ifreq' -pub mod ipv6; -// generated with bindgen /usr/include/linux/sockios.h --no-unstable-rust -// --constified-enum '*' --with-derive-default -pub mod sockios; -pub use if_tun::{ - sock_fprog, IFF_MULTI_QUEUE, IFF_NO_PI, IFF_TAP, IFF_VNET_HDR, TUN_F_CSUM, TUN_F_TSO4, - TUN_F_TSO6, TUN_F_TSO_ECN, TUN_F_UFO, -}; -pub use iff::{ifreq, net_device_flags_IFF_UP, setsockopt, sockaddr, AF_INET}; -pub use inn::sockaddr_in; -pub use ipv6::{in6_ifreq, sockaddr_in6}; -use vmm_sys_util::{ioctl_ior_nr, ioctl_iow_nr}; - -pub const TUNTAP: ::std::os::raw::c_uint = 84; - -ioctl_iow_nr!(TUNSETNOCSUM, TUNTAP, 200, ::std::os::raw::c_int); -ioctl_iow_nr!(TUNSETDEBUG, TUNTAP, 201, ::std::os::raw::c_int); -ioctl_iow_nr!(TUNSETIFF, TUNTAP, 202, ::std::os::raw::c_int); -ioctl_iow_nr!(TUNSETPERSIST, TUNTAP, 203, ::std::os::raw::c_int); -ioctl_iow_nr!(TUNSETOWNER, TUNTAP, 204, ::std::os::raw::c_int); 
-ioctl_iow_nr!(TUNSETLINK, TUNTAP, 205, ::std::os::raw::c_int); -ioctl_iow_nr!(TUNSETGROUP, TUNTAP, 206, ::std::os::raw::c_int); -ioctl_ior_nr!(TUNGETFEATURES, TUNTAP, 207, ::std::os::raw::c_uint); -ioctl_iow_nr!(TUNSETOFFLOAD, TUNTAP, 208, ::std::os::raw::c_uint); -ioctl_iow_nr!(TUNSETTXFILTER, TUNTAP, 209, ::std::os::raw::c_uint); -ioctl_ior_nr!(TUNGETIFF, TUNTAP, 210, ::std::os::raw::c_uint); -ioctl_ior_nr!(TUNGETSNDBUF, TUNTAP, 211, ::std::os::raw::c_int); -ioctl_iow_nr!(TUNSETSNDBUF, TUNTAP, 212, ::std::os::raw::c_int); -ioctl_iow_nr!(TUNATTACHFILTER, TUNTAP, 213, sock_fprog); -ioctl_iow_nr!(TUNDETACHFILTER, TUNTAP, 214, sock_fprog); -ioctl_ior_nr!(TUNGETVNETHDRSZ, TUNTAP, 215, ::std::os::raw::c_int); -ioctl_iow_nr!(TUNSETVNETHDRSZ, TUNTAP, 216, ::std::os::raw::c_int); -ioctl_iow_nr!(TUNSETQUEUE, TUNTAP, 217, ::std::os::raw::c_int); -ioctl_iow_nr!(TUNSETIFINDEX, TUNTAP, 218, ::std::os::raw::c_uint); -ioctl_ior_nr!(TUNGETFILTER, TUNTAP, 219, sock_fprog); -ioctl_iow_nr!(TUNSETVNETLE, TUNTAP, 220, ::std::os::raw::c_int); -ioctl_ior_nr!(TUNGETVNETLE, TUNTAP, 221, ::std::os::raw::c_int); -ioctl_iow_nr!(TUNSETVNETBE, TUNTAP, 222, ::std::os::raw::c_int); -ioctl_ior_nr!(TUNGETVNETBE, TUNTAP, 223, ::std::os::raw::c_int); diff --git a/net_gen/src/sockios.rs b/net_gen/src/sockios.rs deleted file mode 100644 index ad3dee7c8a..0000000000 --- a/net_gen/src/sockios.rs +++ /dev/null @@ -1,95 +0,0 @@ -// Copyright © 2021 Intel Corporation -// -// SPDX-License-Identifier: Apache-2.0 -// - -// bindgen /usr/include/linux/sockios.h --no-layout-tests - -/* automatically generated by rust-bindgen 0.58.1 */ - -pub const __BITS_PER_LONG: u32 = 64; -pub const FIOSETOWN: u32 = 35073; -pub const SIOCSPGRP: u32 = 35074; -pub const FIOGETOWN: u32 = 35075; -pub const SIOCGPGRP: u32 = 35076; -pub const SIOCATMARK: u32 = 35077; -pub const SIOCGSTAMP_OLD: u32 = 35078; -pub const SIOCGSTAMPNS_OLD: u32 = 35079; -pub const SOCK_IOC_TYPE: u32 = 137; -pub const SIOCGSTAMP: u32 = 35078; -pub const 
SIOCGSTAMPNS: u32 = 35079; -pub const SIOCADDRT: u32 = 35083; -pub const SIOCDELRT: u32 = 35084; -pub const SIOCRTMSG: u32 = 35085; -pub const SIOCGIFNAME: u32 = 35088; -pub const SIOCSIFLINK: u32 = 35089; -pub const SIOCGIFCONF: u32 = 35090; -pub const SIOCGIFFLAGS: u32 = 35091; -pub const SIOCSIFFLAGS: u32 = 35092; -pub const SIOCGIFADDR: u32 = 35093; -pub const SIOCSIFADDR: u32 = 35094; -pub const SIOCGIFDSTADDR: u32 = 35095; -pub const SIOCSIFDSTADDR: u32 = 35096; -pub const SIOCGIFBRDADDR: u32 = 35097; -pub const SIOCSIFBRDADDR: u32 = 35098; -pub const SIOCGIFNETMASK: u32 = 35099; -pub const SIOCSIFNETMASK: u32 = 35100; -pub const SIOCGIFMETRIC: u32 = 35101; -pub const SIOCSIFMETRIC: u32 = 35102; -pub const SIOCGIFMEM: u32 = 35103; -pub const SIOCSIFMEM: u32 = 35104; -pub const SIOCGIFMTU: u32 = 35105; -pub const SIOCSIFMTU: u32 = 35106; -pub const SIOCSIFNAME: u32 = 35107; -pub const SIOCSIFHWADDR: u32 = 35108; -pub const SIOCGIFENCAP: u32 = 35109; -pub const SIOCSIFENCAP: u32 = 35110; -pub const SIOCGIFHWADDR: u32 = 35111; -pub const SIOCGIFSLAVE: u32 = 35113; -pub const SIOCSIFSLAVE: u32 = 35120; -pub const SIOCADDMULTI: u32 = 35121; -pub const SIOCDELMULTI: u32 = 35122; -pub const SIOCGIFINDEX: u32 = 35123; -pub const SIOGIFINDEX: u32 = 35123; -pub const SIOCSIFPFLAGS: u32 = 35124; -pub const SIOCGIFPFLAGS: u32 = 35125; -pub const SIOCDIFADDR: u32 = 35126; -pub const SIOCSIFHWBROADCAST: u32 = 35127; -pub const SIOCGIFCOUNT: u32 = 35128; -pub const SIOCGIFBR: u32 = 35136; -pub const SIOCSIFBR: u32 = 35137; -pub const SIOCGIFTXQLEN: u32 = 35138; -pub const SIOCSIFTXQLEN: u32 = 35139; -pub const SIOCETHTOOL: u32 = 35142; -pub const SIOCGMIIPHY: u32 = 35143; -pub const SIOCGMIIREG: u32 = 35144; -pub const SIOCSMIIREG: u32 = 35145; -pub const SIOCWANDEV: u32 = 35146; -pub const SIOCOUTQNSD: u32 = 35147; -pub const SIOCGSKNS: u32 = 35148; -pub const SIOCDARP: u32 = 35155; -pub const SIOCGARP: u32 = 35156; -pub const SIOCSARP: u32 = 35157; -pub const SIOCDRARP: 
u32 = 35168; -pub const SIOCGRARP: u32 = 35169; -pub const SIOCSRARP: u32 = 35170; -pub const SIOCGIFMAP: u32 = 35184; -pub const SIOCSIFMAP: u32 = 35185; -pub const SIOCADDDLCI: u32 = 35200; -pub const SIOCDELDLCI: u32 = 35201; -pub const SIOCGIFVLAN: u32 = 35202; -pub const SIOCSIFVLAN: u32 = 35203; -pub const SIOCBONDENSLAVE: u32 = 35216; -pub const SIOCBONDRELEASE: u32 = 35217; -pub const SIOCBONDSETHWADDR: u32 = 35218; -pub const SIOCBONDSLAVEINFOQUERY: u32 = 35219; -pub const SIOCBONDINFOQUERY: u32 = 35220; -pub const SIOCBONDCHANGEACTIVE: u32 = 35221; -pub const SIOCBRADDBR: u32 = 35232; -pub const SIOCBRDELBR: u32 = 35233; -pub const SIOCBRADDIF: u32 = 35234; -pub const SIOCBRDELIF: u32 = 35235; -pub const SIOCSHWTSTAMP: u32 = 35248; -pub const SIOCGHWTSTAMP: u32 = 35249; -pub const SIOCDEVPRIVATE: u32 = 35312; -pub const SIOCPROTOPRIVATE: u32 = 35296; diff --git a/net_util/Cargo.toml b/net_util/Cargo.toml index e9a8f5badf..03a40defce 100644 --- a/net_util/Cargo.toml +++ b/net_util/Cargo.toml @@ -2,14 +2,14 @@ authors = ["The Chromium OS Authors"] edition.workspace = true name = "net_util" +rust-version.workspace = true version = "0.1.0" [dependencies] epoll = { workspace = true } -getrandom = "0.4.1" +getrandom = "0.4.2" libc = { workspace = true } log = { workspace = true } -net_gen = { path = "../net_gen" } rate_limiter = { path = "../rate_limiter" } serde = { workspace = true, features = ["derive"] } thiserror = { workspace = true } diff --git a/net_util/src/ctrl_queue.rs b/net_util/src/ctrl_queue.rs index 284b6ec4e6..b14b380364 100644 --- a/net_util/src/ctrl_queue.rs +++ b/net_util/src/ctrl_queue.rs @@ -2,12 +2,16 @@ // // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause -use log::{error, info, warn}; +use log::{debug, error, info, warn}; use thiserror::Error; use virtio_bindings::virtio_net::{ - VIRTIO_NET_CTRL_GUEST_OFFLOADS, VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET, VIRTIO_NET_CTRL_MQ, - VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX, 
VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN, - VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, VIRTIO_NET_ERR, VIRTIO_NET_OK, + VIRTIO_NET_CTRL_ANNOUNCE, VIRTIO_NET_CTRL_ANNOUNCE_ACK, VIRTIO_NET_CTRL_GUEST_OFFLOADS, + VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET, VIRTIO_NET_CTRL_MQ, VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX, + VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN, VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, VIRTIO_NET_CTRL_RX, + VIRTIO_NET_CTRL_RX_ALLMULTI, VIRTIO_NET_CTRL_RX_ALLUNI, VIRTIO_NET_CTRL_RX_NOBCAST, + VIRTIO_NET_CTRL_RX_NOMULTI, VIRTIO_NET_CTRL_RX_NOUNI, VIRTIO_NET_CTRL_RX_PROMISC, + VIRTIO_NET_CTRL_VLAN, VIRTIO_NET_CTRL_VLAN_ADD, VIRTIO_NET_CTRL_VLAN_DEL, VIRTIO_NET_ERR, + VIRTIO_NET_OK, }; use virtio_queue::{Queue, QueueT}; use vm_memory::{ByteValued, Bytes, GuestMemoryError}; @@ -53,6 +57,26 @@ pub struct ControlHeader { // SAFETY: ControlHeader only contains a series of integers unsafe impl ByteValued for ControlHeader {} +fn is_tolerated_ctrl_command(ctrl_hdr: ControlHeader) -> bool { + match u32::from(ctrl_hdr.class) { + VIRTIO_NET_CTRL_RX => matches!( + u32::from(ctrl_hdr.cmd), + VIRTIO_NET_CTRL_RX_PROMISC + | VIRTIO_NET_CTRL_RX_ALLMULTI + | VIRTIO_NET_CTRL_RX_ALLUNI + | VIRTIO_NET_CTRL_RX_NOMULTI + | VIRTIO_NET_CTRL_RX_NOUNI + | VIRTIO_NET_CTRL_RX_NOBCAST + ), + VIRTIO_NET_CTRL_VLAN => matches!( + u32::from(ctrl_hdr.cmd), + VIRTIO_NET_CTRL_VLAN_ADD | VIRTIO_NET_CTRL_VLAN_DEL + ), + VIRTIO_NET_CTRL_ANNOUNCE => u32::from(ctrl_hdr.cmd) == VIRTIO_NET_CTRL_ANNOUNCE_ACK, + _ => false, + } +} + pub struct CtrlQueue { pub taps: Vec, } @@ -76,14 +100,16 @@ impl CtrlQueue { .read_obj( ctrl_desc .addr() - .translate_gva(access_platform, ctrl_desc.len() as usize), + .translate_gva(access_platform, ctrl_desc.len() as usize) + .map_err(|e| Error::GuestMemory(GuestMemoryError::IOError(e)))?, ) .map_err(Error::GuestMemory)?; let data_desc = desc_chain.next().ok_or(Error::NoDataDescriptor)?; let data_desc_addr = data_desc .addr() - .translate_gva(access_platform, data_desc.len() as usize); + .translate_gva(access_platform, 
data_desc.len() as usize) + .map_err(|e| Error::GuestMemory(GuestMemoryError::IOError(e)))?; let status_desc = desc_chain.next().ok_or(Error::NoStatusDescriptor)?; @@ -128,6 +154,10 @@ impl CtrlQueue { false } } + _ if is_tolerated_ctrl_command(ctrl_hdr) => { + debug!("Ignoring unsupported but tolerated control command {ctrl_hdr:?}"); + true + } _ => { warn!("Unsupported command {ctrl_hdr:?}"); false @@ -140,10 +170,13 @@ impl CtrlQueue { if ok { VIRTIO_NET_OK } else { VIRTIO_NET_ERR } as u8, status_desc .addr() - .translate_gva(access_platform, status_desc.len() as usize), + .translate_gva(access_platform, status_desc.len() as usize) + .map_err(|e| Error::GuestMemory(GuestMemoryError::IOError(e)))?, ) .map_err(Error::GuestMemory)?; - let len = ctrl_desc.len() + data_desc.len() + status_desc.len(); + // Per virtio spec 2.6.8, used_len is the number of bytes written + // to device-writable descriptors. Only the status byte is written. + let len = status_desc.len(); queue .add_used(desc_chain.memory(), desc_chain.head_index(), len) diff --git a/net_util/src/lib.rs b/net_util/src/lib.rs index 6cf5791507..7152c1676f 100644 --- a/net_util/src/lib.rs +++ b/net_util/src/lib.rs @@ -59,15 +59,15 @@ unsafe impl ByteValued for VirtioNetConfig {} /// Create a sockaddr_in from an IPv4 address, and expose it as /// an opaque sockaddr suitable for usage by socket ioctls. -fn create_sockaddr(ip_addr: net::Ipv4Addr) -> net_gen::sockaddr { - // IPv4 addresses big-endian (network order), but Ipv4Addr will give us - // a view of those bytes directly so we can avoid any endian trickiness. 
- let addr_in = net_gen::sockaddr_in { - sin_family: net_gen::AF_INET as u16, +fn create_sockaddr(ip_addr: net::Ipv4Addr) -> libc::sockaddr { + let addr_in = libc::sockaddr_in { + sin_family: libc::AF_INET as u16, sin_port: 0, - // SAFETY: ip_addr can be safely transmute to in_addr - sin_addr: unsafe { mem::transmute::<[u8; 4], net_gen::inn::in_addr>(ip_addr.octets()) }, - __pad: [0; 8usize], + sin_addr: libc::in_addr { + // Use network byte order (big endian). + s_addr: ip_addr.to_bits().to_be(), + }, + sin_zero: [0; 8], }; // SAFETY: addr_in can be safely transmute to sockaddr @@ -167,19 +167,19 @@ pub fn build_net_config_space_with_mq( pub fn virtio_features_to_tap_offload(features: u64) -> c_uint { let mut tap_offloads: c_uint = 0; if features & (1 << VIRTIO_NET_F_GUEST_CSUM) != 0 { - tap_offloads |= net_gen::TUN_F_CSUM; + tap_offloads |= libc::TUN_F_CSUM; } if features & (1 << VIRTIO_NET_F_GUEST_TSO4) != 0 { - tap_offloads |= net_gen::TUN_F_TSO4; + tap_offloads |= libc::TUN_F_TSO4; } if features & (1 << VIRTIO_NET_F_GUEST_TSO6) != 0 { - tap_offloads |= net_gen::TUN_F_TSO6; + tap_offloads |= libc::TUN_F_TSO6; } if features & (1 << VIRTIO_NET_F_GUEST_ECN) != 0 { - tap_offloads |= net_gen::TUN_F_TSO_ECN; + tap_offloads |= libc::TUN_F_TSO_ECN; } if features & (1 << VIRTIO_NET_F_GUEST_UFO) != 0 { - tap_offloads |= net_gen::TUN_F_UFO; + tap_offloads |= libc::TUN_F_UFO; } tap_offloads @@ -194,7 +194,7 @@ mod unit_tests { let addr: net::Ipv4Addr = "10.0.0.1".parse().unwrap(); let sockaddr = create_sockaddr(addr); - assert_eq!(sockaddr.sa_family, net_gen::AF_INET as u16); + assert_eq!(sockaddr.sa_family, libc::AF_INET as u16); let data = &sockaddr.sa_data[..]; diff --git a/net_util/src/open_tap.rs b/net_util/src/open_tap.rs index 39d4285df3..a5168d22a0 100644 --- a/net_util/src/open_tap.rs +++ b/net_util/src/open_tap.rs @@ -48,11 +48,11 @@ fn check_mq_support(if_name: &Option<&str>, queue_pairs: usize) -> Result<()> { return Ok(()); } let tun_flags_str = 
fs::read_to_string(path).map_err(Error::ReadSysfsTunFlags)?; - let tun_flags = u32::from_str_radix(tun_flags_str.trim().trim_start_matches("0x"), 16) + let tun_flags = i32::from_str_radix(tun_flags_str.trim().trim_start_matches("0x"), 16) .map_err(Error::ConvertHexStringToInt)?; - if (tun_flags & net_gen::IFF_MULTI_QUEUE != 0) && !mq { + if (tun_flags & libc::IFF_MULTI_QUEUE != 0) && !mq { return Err(Error::MultiQueueNoDeviceSupport); - } else if (tun_flags & net_gen::IFF_MULTI_QUEUE == 0) && mq { + } else if (tun_flags & libc::IFF_MULTI_QUEUE == 0) && mq { return Err(Error::MultiQueueNoTapSupport); } } diff --git a/net_util/src/queue_pair.rs b/net_util/src/queue_pair.rs index 86a1c758dc..a569031815 100644 --- a/net_util/src/queue_pair.rs +++ b/net_util/src/queue_pair.rs @@ -51,7 +51,13 @@ impl TxVirtio { let mut retry_write = false; let mut rate_limit_reached = false; - while let Some(mut desc_chain) = queue.pop_descriptor_chain(mem) { + loop { + let mut iter = queue + .iter(mem) + .map_err(NetQueuePairError::QueueIteratorFailed)?; + let Some(mut desc_chain) = iter.next() else { + break; + }; if rate_limit_reached { queue.go_to_previous_position(); break; @@ -63,7 +69,10 @@ impl TxVirtio { while let Some(desc) = next_desc { let desc_addr = desc .addr() - .translate_gva(access_platform, desc.len() as usize); + .translate_gva(access_platform, desc.len() as usize) + .map_err(|e| { + NetQueuePairError::GuestMemory(vm_memory::GuestMemoryError::IOError(e)) + })?; if !desc.is_write_only() && desc.len() > 0 { let buf = desc_chain .memory() @@ -180,7 +189,13 @@ impl RxVirtio { let mut exhausted_descs = true; let mut rate_limit_reached = false; - while let Some(mut desc_chain) = queue.pop_descriptor_chain(mem) { + loop { + let mut iter = queue + .iter(mem) + .map_err(NetQueuePairError::QueueIteratorFailed)?; + let Some(mut desc_chain) = iter.next() else { + break; + }; if rate_limit_reached { exhausted_descs = false; queue.go_to_previous_position(); @@ -195,7 +210,10 @@ 
impl RxVirtio { .memory() .checked_offset( desc.addr() - .translate_gva(access_platform, desc.len() as usize), + .translate_gva(access_platform, desc.len() as usize) + .map_err(|e| { + NetQueuePairError::GuestMemory(vm_memory::GuestMemoryError::IOError(e)) + })?, 10, ) .ok_or(NetQueuePairError::DescriptorInvalidHeader)?; @@ -205,7 +223,10 @@ impl RxVirtio { while let Some(desc) = next_desc { let desc_addr = desc .addr() - .translate_gva(access_platform, desc.len() as usize); + .translate_gva(access_platform, desc.len() as usize) + .map_err(|e| { + NetQueuePairError::GuestMemory(vm_memory::GuestMemoryError::IOError(e)) + })?; if desc.is_write_only() && desc.len() > 0 { let buf = desc_chain .memory() diff --git a/net_util/src/tap.rs b/net_util/src/tap.rs index 36f0e2ba33..43b42a4a2e 100644 --- a/net_util/src/tap.rs +++ b/net_util/src/tap.rs @@ -5,12 +5,14 @@ // Use of this source code is governed by a BSD-style license that can be // found in the THIRD-PARTY file. +use std::ffi::{CStr, CString}; use std::fs::File; use std::io::{Error as IoError, Read, Result as IoResult, Write}; use std::net::{IpAddr, Ipv6Addr}; use std::os::raw::*; use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; +use libc::{__c_anonymous_ifr_ifru, ifreq}; use thiserror::Error; use vmm_sys_util::ioctl::{ioctl_with_mut_ref, ioctl_with_ref, ioctl_with_val}; @@ -43,6 +45,8 @@ pub enum Error { NetUtil(#[source] NetUtilError), #[error("Interface name too long (max length is {MAX_INTERFACE_NAME_LEN}): {0}")] IfnameTooLong(String), + #[error("Interface name contains interior NUL byte: {0:?}")] + IfnameContainsNUL(String), #[error("Invalid interface name (does it exist?): {0}")] InvalidIfname(String), #[error("Error parsing MAC data")] @@ -62,7 +66,8 @@ pub type Result = ::std::result::Result; #[derive(Debug)] pub struct Tap { tap_file: File, - if_name: Vec, + /// The name does not exceed [`MAX_INTERFACE_NAME_LEN`] bytes excluding the NUL byte. 
+ if_name: CString, } impl PartialEq for Tap { @@ -80,23 +85,6 @@ impl std::clone::Clone for Tap { } } -// Returns a byte vector representing the contents of a null terminated C string which -// contains if_name. -fn build_terminated_if_name(if_name: &str) -> Result> { - // Convert the string slice to bytes, and shadow the variable, - // since we no longer need the &str version. - let bytes = if_name.as_bytes(); - - if bytes.len() > MAX_INTERFACE_NAME_LEN { - return Err(Error::IfnameTooLong(if_name.to_string())); - } - - let mut terminated_if_name = vec![b'\0'; bytes.len() + 1]; - terminated_if_name[..bytes.len()].copy_from_slice(bytes); - - Ok(terminated_if_name) -} - fn ipv6_mask_to_prefix(mask: Ipv6Addr) -> Result { let mask = mask.segments(); let mut iter = mask.iter(); @@ -169,7 +157,12 @@ impl Tap { } pub fn open_named(if_name: &str, num_queue_pairs: usize, flags: Option) -> Result { - let terminated_if_name = build_terminated_if_name(if_name)?; + if if_name.len() > MAX_INTERFACE_NAME_LEN { + return Err(Error::IfnameTooLong(if_name.to_string())); + } + + let terminated_if_name = + CString::new(if_name).map_err(|_| Error::IfnameContainsNUL(if_name.to_string()))?; // SAFETY: FFI call let fd = unsafe { @@ -192,42 +185,48 @@ impl Tap { // value. let mut features = 0; // SAFETY: IOCTL with correct arguments - let ret = unsafe { ioctl_with_mut_ref(&tuntap, net_gen::TUNGETFEATURES(), &mut features) }; + let ret = + unsafe { ioctl_with_mut_ref(&tuntap, libc::TUNGETFEATURES as c_ulong, &mut features) }; if ret < 0 { return Err(Error::GetFeatures(IoError::last_os_error())); } // Check if the user parameters match the kernel support for MQ - if (features & net_gen::IFF_MULTI_QUEUE == 0) && num_queue_pairs > 1 { + if (features & libc::IFF_MULTI_QUEUE == 0) && num_queue_pairs > 1 { return Err(Error::MultiQueueKernelSupport); } - // This is pretty messy because of the unions used by ifreq. 
Since we - // don't call as_mut on the same union field more than once, this block - // is safe. - let mut ifreq: net_gen::ifreq = Default::default(); - // SAFETY: see the comment above. - unsafe { - let ifrn_name = ifreq.ifr_ifrn.ifrn_name.as_mut(); - let name_slice = &mut ifrn_name[..terminated_if_name.len()]; - name_slice.copy_from_slice(terminated_if_name.as_slice()); - ifreq.ifr_ifru.ifru_flags = - (net_gen::IFF_TAP | net_gen::IFF_NO_PI | net_gen::IFF_VNET_HDR) as c_short; - if num_queue_pairs > 1 { - ifreq.ifr_ifru.ifru_flags |= net_gen::IFF_MULTI_QUEUE as c_short; - } + let mut ifru_flags = (libc::IFF_TAP | libc::IFF_NO_PI | libc::IFF_VNET_HDR) as c_short; + if num_queue_pairs > 1 { + ifru_flags |= libc::IFF_MULTI_QUEUE as c_short; } + let mut ifreq = libc::ifreq { + ifr_name: [0; libc::IFNAMSIZ], + ifr_ifru: __c_anonymous_ifr_ifru { ifru_flags }, + }; + + // Convert and copy bytes to `ifr_name` buffer. + // `terminated_if_name` will fit into `ifr_name` since we enforce the length limit + // above. + ifreq + .ifr_name + .iter_mut() + .zip(terminated_if_name.as_bytes_with_nul()) + .for_each(|(ifr_name_char, terminated_if_name_byte)| { + *ifr_name_char = *terminated_if_name_byte as c_char; + }); + // SAFETY: ioctl is safe since we call it with a valid tap fd and check the return // value. - let ret = unsafe { ioctl_with_mut_ref(&tuntap, net_gen::TUNSETIFF(), &mut ifreq) }; + let ret = unsafe { ioctl_with_mut_ref(&tuntap, libc::TUNSETIFF as c_ulong, &mut ifreq) }; if ret < 0 { return Err(Error::ConfigureTap(IoError::last_os_error())); } - // SAFETY: only the name is accessed, and it's cloned out. - let mut if_name = unsafe { ifreq.ifr_ifrn.ifrn_name }.to_vec(); - if_name.truncate(terminated_if_name.len() - 1); + // SAFETY: `ifreq.ifr_name` is set by the `ioctl_with_mut_ref` call and we checked the + // return code, so the name must be a valid `CStr`. 
+ let if_name = unsafe { CStr::from_ptr(ifreq.ifr_name.as_ptr()) }.to_owned(); Ok(Tap { tap_file: tuntap, if_name, @@ -254,27 +253,36 @@ impl Tap { // SAFETY: fd is a tap fd let tap_file = unsafe { File::from_raw_fd(fd) }; - let mut ifreq: net_gen::ifreq = Default::default(); + let mut ifreq: libc::ifreq = ifreq { + ifr_name: [0; libc::IFNAMSIZ], + ifr_ifru: __c_anonymous_ifr_ifru { ifru_flags: 0 }, + }; // Get current config including name // SAFETY: IOCTL with correct arguments - unsafe { Self::ioctl_with_mut_ref(&tap_file, net_gen::TUNGETIFF(), &mut ifreq)? }; - - // SAFETY: We only access one field of the ifru union - let if_name = unsafe { ifreq.ifr_ifrn.ifrn_name }.to_vec(); + unsafe { Self::ioctl_with_mut_ref(&tap_file, libc::TUNGETIFF as c_ulong, &mut ifreq)? }; + + let if_name = { + let ifr_ptr = ifreq.ifr_name.as_ptr(); + // SAFETY: The `ifr_name` field of the union is a valid, nul-terminated C string since it + // was just set by the ioctl call, and we checked for errors. + // We immediately convert the `CStr` to the owned `CString, so the memory of the union field + // is not accessed or mutated during the lifetime of the `Cstr`. + unsafe { CStr::from_ptr(ifr_ptr).to_owned() } + }; // Try and update flags. Depending on how the tap was created (macvtap // or via open_named()) this might return -EEXIST so we just ignore that. 
// SAFETY: access union fields unsafe { ifreq.ifr_ifru.ifru_flags = - (net_gen::IFF_TAP | net_gen::IFF_NO_PI | net_gen::IFF_VNET_HDR) as c_short; + (libc::IFF_TAP | libc::IFF_NO_PI | libc::IFF_VNET_HDR) as c_short; if num_queue_pairs > 1 { - ifreq.ifr_ifru.ifru_flags |= net_gen::IFF_MULTI_QUEUE as c_short; + ifreq.ifr_ifru.ifru_flags |= libc::IFF_MULTI_QUEUE as c_short; } } // SAFETY: IOCTL with correct arguments - let ret = unsafe { ioctl_with_mut_ref(&tap_file, net_gen::TUNSETIFF(), &mut ifreq) }; + let ret = unsafe { ioctl_with_mut_ref(&tap_file, libc::TUNSETIFF as c_ulong, &mut ifreq) }; if ret < 0 && IoError::last_os_error().raw_os_error().unwrap() != libc::EEXIST { return Err(Error::ConfigureTap(IoError::last_os_error())); } @@ -300,7 +308,7 @@ impl Tap { // SAFETY: ioctl is safe. Called with a valid sock fd, and we check the return. unsafe { - Self::ioctl_with_ref(&sock, net_gen::sockios::SIOCSIFADDR as c_ulong, &ifreq)?; + Self::ioctl_with_ref(&sock, libc::SIOCSIFADDR as c_ulong, &ifreq)?; } if let Some(IpAddr::V4(mask)) = netmask { @@ -308,11 +316,7 @@ impl Tap { // SAFETY: ioctl is safe. Called with a valid sock fd, and we check the return. unsafe { - Self::ioctl_with_ref( - &sock, - net_gen::sockios::SIOCSIFNETMASK as c_ulong, - &ifreq, - )?; + Self::ioctl_with_ref(&sock, libc::SIOCSIFNETMASK as c_ulong, &ifreq)?; } } @@ -322,18 +326,14 @@ impl Tap { let ifindex = { // SAFETY: ioctl is safe. Called with a valid sock fd, and we check the return. 
unsafe { - Self::ioctl_with_ref( - &sock, - net_gen::sockios::SIOCGIFINDEX as c_ulong, - &ifreq, - )?; + Self::ioctl_with_ref(&sock, libc::SIOCGIFINDEX as c_ulong, &ifreq)?; } // SAFETY: ifru_ivalue contains the ifindex and is set by the previous ioctl unsafe { - match ifreq.ifr_ifru.ifru_ivalue { + match ifreq.ifr_ifru.ifru_ifindex { 0 => { - let name = String::from_utf8_lossy(&self.if_name).to_string(); + let name = self.if_name.to_string_lossy().to_string(); return Err(Error::InvalidIfname(name)); } i => i, @@ -347,19 +347,17 @@ impl Tap { None => 0, }; - let ifreq = net_gen::in6_ifreq { + let ifreq = libc::in6_ifreq { // SAFETY: addr can be safely transmuted to in6_addr ifr6_addr: unsafe { - std::mem::transmute::<[u8; 16], net_gen::ipv6::in6_addr>(addr.octets()) + std::mem::transmute::<[u8; 16], libc::in6_addr>(addr.octets()) }, ifr6_prefixlen: prefixlen as u32, ifr6_ifindex: ifindex, }; // SAFETY: ioctl is safe. Called with a valid sock fd, and we check the return. - unsafe { - Self::ioctl_with_ref(&sock, net_gen::sockios::SIOCSIFADDR as c_ulong, &ifreq) - } + unsafe { Self::ioctl_with_ref(&sock, libc::SIOCSIFADDR as c_ulong, &ifreq) } } } } @@ -380,18 +378,18 @@ impl Tap { let mut ifreq = self.get_ifreq(); // SAFETY: ioctl is safe. Called with a valid sock fd, and we check the return. - unsafe { Self::ioctl_with_ref(&sock, net_gen::sockios::SIOCGIFHWADDR as c_ulong, &ifreq)? }; + unsafe { Self::ioctl_with_ref(&sock, libc::SIOCGIFHWADDR as c_ulong, &ifreq)? }; // SAFETY: We only access one field of the ifru union unsafe { let ifru_hwaddr = &mut ifreq.ifr_ifru.ifru_hwaddr; for (i, v) in addr.get_bytes().iter().enumerate() { - ifru_hwaddr.sa_data[i] = *v as c_uchar; + ifru_hwaddr.sa_data[i] = *v as c_char; } } // SAFETY: ioctl is safe. Called with a valid sock fd, and we check the return. 
- unsafe { Self::ioctl_with_ref(&sock, net_gen::sockios::SIOCSIFHWADDR as c_ulong, &ifreq) } + unsafe { Self::ioctl_with_ref(&sock, libc::SIOCSIFHWADDR as c_ulong, &ifreq) } } /// Get mac addr for tap interface. @@ -401,12 +399,21 @@ impl Tap { let ifreq = self.get_ifreq(); // SAFETY: ioctl is safe. Called with a valid sock fd, and we check the return. - unsafe { Self::ioctl_with_ref(&sock, net_gen::sockios::SIOCGIFHWADDR as c_ulong, &ifreq)? }; - - // SAFETY: We only access one field of the ifru union - let addr = unsafe { - MacAddr::from_bytes(&ifreq.ifr_ifru.ifru_hwaddr.sa_data[0..MAC_ADDR_LEN]) - .map_err(Error::MacParsing)? + unsafe { Self::ioctl_with_ref(&sock, libc::SIOCGIFHWADDR as c_ulong, &ifreq)? }; + + let addr = { + let bytes: Vec = + // SAFETY: The `ioctl_with_ref` ensures accessing `ifru_hwaddr` is valid. + unsafe { ifreq.ifr_ifru.ifru_hwaddr.sa_data[0..MAC_ADDR_LEN].iter() } + .map(|byte| { + // On some architectures, `c_char` is already a `u8`. + #[allow(clippy::unnecessary_cast)] + { + *byte as u8 + } + }) + .collect(); + MacAddr::from_bytes(&bytes).map_err(Error::MacParsing)? }; Ok(addr) } @@ -418,7 +425,7 @@ impl Tap { let ifreq = self.get_ifreq(); // SAFETY: ioctl is safe. Called with a valid sock fd, and we check the return. - unsafe { Self::ioctl_with_ref(&sock, net_gen::sockios::SIOCGIFMTU as c_ulong, &ifreq)? }; + unsafe { Self::ioctl_with_ref(&sock, libc::SIOCGIFMTU as c_ulong, &ifreq)? }; // SAFETY: access a union field let mtu = unsafe { ifreq.ifr_ifru.ifru_mtu }; @@ -439,13 +446,19 @@ impl Tap { ifreq.ifr_ifru.ifru_mtu = mtu; // SAFETY: ioctl is safe. Called with a valid sock fd, and we check the return. - unsafe { Self::ioctl_with_ref(&sock, net_gen::sockios::SIOCSIFMTU as c_ulong, &ifreq) } + unsafe { Self::ioctl_with_ref(&sock, libc::SIOCSIFMTU as c_ulong, &ifreq) } } /// Set the offload flags for the tap interface. pub fn set_offload(&self, flags: c_uint) -> Result<()> { // SAFETY: ioctl is safe. 
Called with a valid tap fd, and we check the return. - unsafe { Self::ioctl_with_val(&self.tap_file, net_gen::TUNSETOFFLOAD(), flags as c_ulong) } + unsafe { + Self::ioctl_with_val( + &self.tap_file, + libc::TUNSETOFFLOAD as c_ulong, + flags as c_ulong, + ) + } } /// Enable the tap interface. @@ -455,48 +468,44 @@ impl Tap { let mut ifreq = self.get_ifreq(); // SAFETY: IOCTL with correct arguments - unsafe { Self::ioctl_with_ref(&sock, net_gen::sockios::SIOCGIFFLAGS as c_ulong, &ifreq)? }; + unsafe { Self::ioctl_with_ref(&sock, libc::SIOCGIFFLAGS as c_ulong, &ifreq)? }; // If TAP device is already up don't try and enable it // SAFETY: access a union field let ifru_flags = unsafe { ifreq.ifr_ifru.ifru_flags }; - if ifru_flags & net_gen::net_device_flags_IFF_UP as i16 - == net_gen::net_device_flags_IFF_UP as i16 - { + if ifru_flags & libc::IFF_UP as i16 == libc::IFF_UP as i16 { return Ok(()); } - ifreq.ifr_ifru.ifru_flags = net_gen::net_device_flags_IFF_UP as i16; + ifreq.ifr_ifru.ifru_flags = libc::IFF_UP as i16; // SAFETY: ioctl is safe. Called with a valid sock fd, and we check the return. - unsafe { Self::ioctl_with_ref(&sock, net_gen::sockios::SIOCSIFFLAGS as c_ulong, &ifreq) } + unsafe { Self::ioctl_with_ref(&sock, libc::SIOCSIFFLAGS as c_ulong, &ifreq) } } /// Set the size of the vnet hdr. pub fn set_vnet_hdr_size(&self, size: c_int) -> Result<()> { // SAFETY: ioctl is safe. Called with a valid tap fd, and we check the return. - unsafe { Self::ioctl_with_ref(&self.tap_file, net_gen::TUNSETVNETHDRSZ(), &size) } + unsafe { Self::ioctl_with_ref(&self.tap_file, libc::TUNSETVNETHDRSZ as c_ulong, &size) } } - fn get_ifreq(&self) -> net_gen::ifreq { - let mut ifreq: net_gen::ifreq = Default::default(); - - // This sets the name of the interface, which is the only entry - // in a single-field union. - // SAFETY: access union fields and we're sure the copy is okay. 
- unsafe { - let ifrn_name = ifreq.ifr_ifrn.ifrn_name.as_mut(); - let name_slice = &mut ifrn_name[..self.if_name.len()]; - name_slice.copy_from_slice(&self.if_name); - } + fn get_ifreq(&self) -> libc::ifreq { + let mut ifreq: libc::ifreq = libc::ifreq { + ifr_name: [0; libc::IFNAMSIZ], + ifr_ifru: __c_anonymous_ifr_ifru { ifru_flags: 0 }, + }; + // Convert and copy bytes to `ifr_name` buffer. + // `self.if_name` will fit into `ifr_name` since we enforce the length when setting it. ifreq - } + .ifr_name + .iter_mut() + .zip(self.if_name.as_bytes_with_nul()) + .for_each(|(ifr_name_char, terminated_if_name_byte)| { + *ifr_name_char = *terminated_if_name_byte as c_char; + }); - /// Returns the raw bytes of the interface name, which may or may not be - /// valid UTF-8. - pub fn if_name_as_bytes(&self) -> &[u8] { - &self.if_name + ifreq } /// Returns the interface name as a string, truncated at the first NUL byte @@ -509,19 +518,20 @@ impl Tap { /// thus valid UTF-8. Also, self-generated interface names form CHV are /// also always created from Rust strings, thus valid UTF-8. pub fn if_name_as_str(&self) -> &str { - // All bytes until first NUL. - let nul_terminated = self - .if_name_as_bytes() - .split(|&b| b == 0) - .next() - .unwrap_or(&[]); - // Panicking here is fine, see function documentation. 
- std::str::from_utf8(nul_terminated).expect("Tap interface name should be valid UTF-8") + std::str::from_utf8(self.if_name.as_bytes()) + .expect("Tap interface name should be valid UTF-8") } #[cfg(fuzzing)] - pub fn new_for_fuzzing(tap_file: File, if_name: Vec) -> Self { + pub fn new_for_fuzzing(tap_file: File, if_name: &str) -> Self { + if if_name.len() > MAX_INTERFACE_NAME_LEN { + panic!("provided name longer than `MAX_INTERFACE_NAME_LEN`") + } + + let if_name = CString::new(if_name) + .map_err(|_| Error::IfnameContainsNUL(if_name.to_string())) + .unwrap(); Tap { tap_file, if_name } } } @@ -663,7 +673,7 @@ mod unit_tests { } // Sends a test packet on the interface named "ifname". - fn pnet_send_packet(ifname: String) { + fn pnet_send_packet(ifname: &str) { let payload = DATA_STRING.as_bytes(); // eth hdr + ip hdr + udp hdr + payload len @@ -682,7 +692,7 @@ mod unit_tests { // interface, an object that can be used to send Ethernet frames, and a receiver of // Ethernet frames arriving at the specified interface. fn pnet_get_mac_tx_rx( - ifname: String, + ifname: &str, ) -> (MacAddr, Box, Box) { let interface_name_matches = |iface: &NetworkInterface| iface.name == ifname; @@ -778,7 +788,7 @@ mod unit_tests { tap.enable().unwrap(); // Send a packet to the interface. We expect to be able to receive it on the associated fd. 
- pnet_send_packet(tap.if_name_as_str().to_owned()); + pnet_send_packet(tap.if_name_as_str()); let mut buf = [0u8; 4096]; @@ -836,7 +846,7 @@ mod unit_tests { tap.set_ip_addr(ip_addr, Some(netmask)).unwrap(); tap.enable().unwrap(); - let (mac, _, mut rx) = pnet_get_mac_tx_rx(tap.if_name_as_str().to_owned()); + let (mac, _, mut rx) = pnet_get_mac_tx_rx(tap.if_name_as_str()); let payload = DATA_STRING.as_bytes(); diff --git a/option_parser/Cargo.toml b/option_parser/Cargo.toml index 3d76690b41..54c77e296b 100644 --- a/option_parser/Cargo.toml +++ b/option_parser/Cargo.toml @@ -2,6 +2,7 @@ authors = ["The Cloud Hypervisor Authors"] edition.workspace = true name = "option_parser" +rust-version.workspace = true version = "0.1.0" [dependencies] diff --git a/option_parser/src/lib.rs b/option_parser/src/lib.rs index 1722da39f2..6be4bcbb1a 100644 --- a/option_parser/src/lib.rs +++ b/option_parser/src/lib.rs @@ -3,7 +3,31 @@ // SPDX-License-Identifier: Apache-2.0 // +//! A parser for comma-separated `key=value` option strings. +//! +//! This crate provides [`OptionParser`], which parses strings of the form +//! `"key1=value1,key2=value2,..."` into a set of named options that can then +//! be retrieved and converted to various types. +//! +//! Values may be quoted with `"` to embed commas and other special characters, +//! and brackets `[` `]` are tracked so that list-valued options like +//! `topology=[1,2,3]` are not split at inner commas. +//! +//! # Example +//! +//! ``` +//! use option_parser::OptionParser; +//! +//! let mut parser = OptionParser::new(); +//! parser.add("size").add("mergeable"); +//! parser.parse("size=128M,mergeable=on").unwrap(); +//! +//! assert_eq!(parser.get("size"), Some("128M".to_owned())); +//! assert_eq!(parser.get("mergeable"), Some("on".to_owned())); +//! 
``` + use std::collections::HashMap; +use std::fmt::{Display, Write}; use std::num::ParseIntError; use std::str::FromStr; @@ -26,6 +50,12 @@ mod private_trait { } use private_trait::Parseable; +/// A parser for comma-separated `key=value` option strings. +/// +/// Options must be registered with [`add`](Self::add) or +/// [`add_valueless`](Self::add_valueless) before parsing. After calling +/// [`parse`](Self::parse), values can be retrieved with [`get`](Self::get) +/// or converted to a specific type with [`convert`](Self::convert). #[derive(Default)] pub struct OptionParser { options: HashMap, @@ -36,14 +66,19 @@ struct OptionParserValue { requires_value: bool, } +/// Errors returned when parsing or converting options. #[derive(Debug, Error)] pub enum OptionParserError { + /// An option name was not previously registered with [`OptionParser::add`]. #[error("unknown option: {0}")] UnknownOption(String), - #[error("unknown option: {0}")] + /// The input string has invalid syntax (unbalanced quotes/brackets, missing `=`). + #[error("invalid syntax: {0}")] InvalidSyntax(String), + /// A value could not be converted to the requested type. #[error("unable to convert {1} for {0}")] Conversion(String /* field */, String /* value */), + /// A value was syntactically valid but semantically wrong. #[error("invalid value: {0}")] InvalidValue(String), } @@ -86,13 +121,14 @@ fn split_commas(s: &str) -> OptionParserResult> { } impl OptionParser { + /// Creates an empty `OptionParser` with no registered options. 
pub fn new() -> Self { Self { options: HashMap::new(), } } - pub fn parse(&mut self, input: &str) -> OptionParserResult<()> { + fn parse_inner(&mut self, input: &str, ignore_unknown: bool) -> OptionParserResult<()> { if input.trim().is_empty() { return Ok(()); } @@ -100,7 +136,11 @@ impl OptionParser { for option in split_commas(input)?.iter() { let parts: Vec<&str> = option.splitn(2, '=').collect(); match self.options.get_mut(parts[0]) { - None => return Err(OptionParserError::UnknownOption(parts[0].to_owned())), + None => { + if !ignore_unknown { + return Err(OptionParserError::UnknownOption(parts[0].to_owned())); + } + } Some(value) => { if value.requires_value { if parts.len() != 2 { @@ -117,6 +157,30 @@ impl OptionParser { Ok(()) } + /// Parses a comma-separated `key=value` string, updating registered options. + /// + /// Returns an error if the input contains an unknown option name, has + /// unbalanced quotes or brackets, or a value-requiring option lacks `=`. + pub fn parse(&mut self, input: &str) -> OptionParserResult<()> { + self.parse_inner(input, false) + } + + /// Like [`parse`](Self::parse), but silently ignores unknown option names. + /// + /// This is useful when multiple parsers share the same input string and + /// each only cares about a subset of the options. + pub fn parse_subset(&mut self, input: &str) -> OptionParserResult<()> { + self.parse_inner(input, true) + } + + /// Registers a named option that requires a value (i.e. `key=value`). + /// + /// Option names must not contain `"`, `[`, `]`, `=`, or `,`. + /// Returns `&mut Self` for chaining. + /// + /// # Panics + /// + /// Panics if the option name contains a forbidden character. pub fn add(&mut self, option: &str) -> &mut Self { // Check that option=value has balanced // quotes and brackets iff value does. @@ -135,6 +199,21 @@ impl OptionParser { self } + /// Registers multiple value-requiring options at once. 
+ /// + /// Equivalent to calling [`add`](Self::add) for each element in the slice. + pub fn add_all(&mut self, options: &[&str]) -> &mut Self { + for option in options { + self.add(option); + } + + self + } + + /// Registers a flag-style option that does not take a value. + /// + /// When this option appears in the input string (without `=`), it is + /// marked as set. Use [`is_set`](Self::is_set) to query it. pub fn add_valueless(&mut self, option: &str) -> &mut Self { self.options.insert( option.to_owned(), @@ -147,6 +226,10 @@ impl OptionParser { self } + /// Returns the raw string value of an option, or `None` if the option was + /// not set or if its value is an empty string (e.g. `key=`). + /// + /// Surrounding double-quotes in the value are removed. pub fn get(&self, option: &str) -> Option { self.options .get(option) @@ -160,6 +243,9 @@ impl OptionParser { }) } + /// Returns `true` if the option was present in the parsed input. + /// + /// This works for both value-requiring and valueless options. pub fn is_set(&self, option: &str) -> bool { self.options .get(option) @@ -167,6 +253,14 @@ impl OptionParser { .is_some() } + /// Retrieves and converts an option value to type `T`. + /// + /// Returns `Ok(None)` if the option was not set or its value is empty. + /// Returns `Err` if the value cannot be converted to `T`. + /// + /// `T` can be any type that implements `FromStr` (e.g. `u32`, `String`), + /// or one of this crate's types such as [`Toggle`], [`IntegerList`], + /// [`Tuple`], or [`StringList`]. pub fn convert(&self, option: &str) -> OptionParserResult> { match self.options.get(option).and_then(|v| v.value.as_ref()) { None => Ok(None), @@ -183,6 +277,9 @@ impl OptionParser { } } +/// A boolean-like value that accepts `"on"`, `"true"`, `"off"`, `"false"`, or `""`. +/// +/// An empty string is treated as `false`. 
pub struct Toggle(pub bool); #[derive(Error, Debug)] @@ -206,6 +303,10 @@ impl Parseable for Toggle { } } +/// A byte size parsed from a human-readable string with optional `K`, `M`, or `G` suffix. +/// +/// The suffix is binary (1K = 1024, 1M = 1048576, 1G = 1073741824). +/// A bare integer is treated as bytes. pub struct ByteSized(pub u64); #[derive(Error, Debug)] @@ -238,8 +339,26 @@ impl FromStr for ByteSized { } } +/// A list of integers parsed from a bracket-enclosed, comma-separated string. +/// +/// Ranges are supported with `-`: `"[0,2-4,6]"` produces `[0, 2, 3, 4, 6]`. pub struct IntegerList(pub Vec); +impl Display for IntegerList { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_char('[')?; + let mut iter = self.0.iter(); + if let Some(first) = iter.next() { + first.fmt(f)?; + for i in iter { + f.write_char(',')?; + i.fmt(f)?; + } + } + f.write_char(']') + } +} + #[derive(Error, Debug)] pub enum IntegerListParseError { #[error("invalid value: {0}")] @@ -288,7 +407,11 @@ impl Parseable for IntegerList { } } +/// Types that can appear as the second element of a [`Tuple`] pair. +/// +/// Implemented for `u64`, `Vec`, `Vec`, and `Vec`. pub trait TupleValue { + /// Parses the value portion of a `key@value` tuple element. fn parse_value(input: &str) -> Result where Self: Sized; @@ -330,6 +453,10 @@ impl TupleValue for Vec { } } +/// A list of `key@value` pairs parsed from a bracket-enclosed string. +/// +/// The format is `[key1@value1,key2@value2,...]` where `@` separates each +/// pair's elements. `S` is the key type and `T` is the value type. #[derive(PartialEq, Eq, Debug)] pub struct Tuple(pub Vec<(S, T)>); @@ -385,6 +512,9 @@ impl Parseable for Tuple { } } +/// A list of strings parsed from a bracket-enclosed, comma-separated string. +/// +/// The format is `[str1,str2,...]`. Brackets are optional. 
#[derive(Default)] pub struct StringList(pub Vec); @@ -458,11 +588,13 @@ mod unit_tests { assert_eq!(split_commas("\"\"").unwrap(), vec!["\"\""]); parser.parse("size=128M,hanging_param").unwrap_err(); - parser - .parse("size=128M,too_many_equals=foo=bar") - .unwrap_err(); parser.parse("size=128M,file=/dev/shm").unwrap_err(); + // Equals signs within a value are fine (splitn(2, '=') keeps them) + parser.add("extra"); + parser.parse("extra=foo=bar").unwrap(); + assert_eq!(parser.get("extra"), Some("foo=bar".to_owned())); + parser.parse("size=128M").unwrap(); assert_eq!(parser.get("size"), Some("128M".to_owned())); assert!(!parser.is_set("mergeable")); @@ -517,4 +649,234 @@ mod unit_tests { fn check_dequote() { assert_eq!(dequote("a\u{3b2}\"a\"\"\""), "a\u{3b2}a\""); } + + #[test] + fn test_empty_input() { + let mut parser = OptionParser::new(); + parser.add("foo"); + parser.parse("").unwrap(); + parser.parse(" ").unwrap(); + assert!(!parser.is_set("foo")); + } + + #[test] + fn test_parse_subset_ignores_unknown() { + let mut parser = OptionParser::new(); + parser.add("known"); + parser.parse_subset("known=val,unknown=other").unwrap(); + assert_eq!(parser.get("known"), Some("val".to_owned())); + assert!(!parser.is_set("unknown")); + } + + #[test] + fn test_add_all() { + let mut parser = OptionParser::new(); + parser.add_all(&["a", "b", "c"]); + parser.parse("a=1,b=2,c=3").unwrap(); + assert_eq!(parser.get("a"), Some("1".to_owned())); + assert_eq!(parser.get("b"), Some("2".to_owned())); + assert_eq!(parser.get("c"), Some("3".to_owned())); + } + + #[test] + fn test_add_valueless() { + let mut parser = OptionParser::new(); + parser.add_valueless("readonly"); + parser.add("path"); + parser.parse("path=/dev/sda,readonly").unwrap(); + assert!(parser.is_set("readonly")); + assert_eq!(parser.get("readonly"), None); + assert_eq!(parser.get("path"), Some("/dev/sda".to_owned())); + } + + #[test] + fn test_convert_integer() { + let mut parser = OptionParser::new(); + 
parser.add("count"); + parser.parse("count=42").unwrap(); + assert_eq!(parser.convert::("count").unwrap(), Some(42)); + assert_eq!(parser.convert::("count").unwrap(), Some(42)); + } + + #[test] + fn test_convert_unset_returns_none() { + let mut parser = OptionParser::new(); + parser.add("count"); + assert_eq!(parser.convert::("count").unwrap(), None); + } + + #[test] + fn test_convert_invalid_returns_error() { + let mut parser = OptionParser::new(); + parser.add("count"); + parser.parse("count=notanumber").unwrap(); + parser.convert::("count").unwrap_err(); + } + + #[test] + fn test_toggle() { + for (input, expected) in [ + ("on", true), + ("off", false), + ("true", true), + ("false", false), + ("ON", true), + ("OFF", false), + ("True", true), + ("False", false), + ] { + let mut parser = OptionParser::new(); + parser.add("flag"); + parser.parse(&format!("flag={input}")).unwrap(); + let toggle = parser.convert::("flag").unwrap().unwrap(); + assert_eq!(toggle.0, expected, "Toggle({input}) should be {expected}"); + } + } + + #[test] + fn test_toggle_invalid() { + let mut parser = OptionParser::new(); + parser.add("flag"); + parser.parse("flag=maybe").unwrap(); + assert!(parser.convert::("flag").is_err()); + } + + #[test] + fn test_byte_sized() { + let cases = [ + ("1024", 1024u64), + ("1K", 1024), + ("2M", 2 * 1024 * 1024), + ("4G", 4 * 1024 * 1024 * 1024), + ("0K", 0), + ]; + for (input, expected) in cases { + let mut parser = OptionParser::new(); + parser.add("size"); + parser.parse(&format!("size={input}")).unwrap(); + let bs = parser.convert::("size").unwrap().unwrap(); + assert_eq!(bs.0, expected, "ByteSized({input}) should be {expected}"); + } + } + + #[test] + fn test_byte_sized_invalid() { + assert!("xyzK".parse::().is_err()); + assert!("".parse::().is_err()); + } + + #[test] + fn test_integer_list_single_values() { + let list = IntegerList::from_str("[1,3,5]").unwrap(); + assert_eq!(list.0, vec![1, 3, 5]); + } + + #[test] + fn test_integer_list_ranges() { + 
let list = IntegerList::from_str("[0,2-4,7]").unwrap(); + assert_eq!(list.0, vec![0, 2, 3, 4, 7]); + } + + #[test] + fn test_integer_list_invalid_range() { + assert!(IntegerList::from_str("[5-3]").is_err()); + assert!(IntegerList::from_str("[5-5]").is_err()); + } + + #[test] + fn test_integer_list_too_many_dashes() { + assert!(IntegerList::from_str("[1-2-3]").is_err()); + } + + #[test] + fn test_integer_list_display() { + let list = IntegerList(vec![1, 2, 3]); + assert_eq!(format!("{list}"), "[1,2,3]"); + + let empty = IntegerList(vec![]); + assert_eq!(format!("{empty}"), "[]"); + + let single = IntegerList(vec![42]); + assert_eq!(format!("{single}"), "[42]"); + } + + #[test] + fn test_string_list() { + let list = StringList::from_str("[foo,bar,baz]").unwrap(); + assert_eq!(list.0, vec!["foo", "bar", "baz"]); + } + + #[test] + fn test_string_list_no_brackets() { + let list = StringList::from_str("foo,bar").unwrap(); + assert_eq!(list.0, vec!["foo", "bar"]); + } + + #[test] + fn test_tuple_single_pair() { + let t = Tuple::::from_str("[foo@42]").unwrap(); + assert_eq!(t, Tuple(vec![("foo".to_owned(), 42)])); + } + + #[test] + fn test_tuple_multiple_pairs() { + let t = Tuple::>::from_str("[a@[1,2],b@[3,4]]").unwrap(); + assert_eq!( + t, + Tuple(vec![ + ("a".to_owned(), vec![1, 2]), + ("b".to_owned(), vec![3, 4]), + ]) + ); + } + + #[test] + fn test_tuple_missing_at_separator() { + Tuple::::from_str("[foo42]").unwrap_err(); + } + + #[test] + fn test_tuple_missing_brackets() { + Tuple::::from_str("foo@42").unwrap_err(); + } + + #[test] + fn test_split_commas_unbalanced_bracket() { + split_commas("[a,b").unwrap_err(); + split_commas("a]").unwrap_err(); + } + + #[test] + fn test_split_commas_unbalanced_quote() { + split_commas("\"abc").unwrap_err(); + } + + #[test] + fn test_quoted_value_with_commas() { + let mut parser = OptionParser::new(); + parser.add("cmd"); + parser.parse("cmd=\"a,b,c\"").unwrap(); + assert_eq!(parser.get("cmd"), Some("a,b,c".to_owned())); + } + + 
#[test] + #[should_panic(expected = "forbidden character")] + fn test_add_option_with_equals() { + let mut parser = OptionParser::new(); + parser.add("bad=name"); + } + + #[test] + #[should_panic(expected = "forbidden character")] + fn test_add_option_with_comma() { + let mut parser = OptionParser::new(); + parser.add("bad,name"); + } + + #[test] + #[should_panic(expected = "forbidden character")] + fn test_add_option_with_bracket() { + let mut parser = OptionParser::new(); + parser.add("bad[name"); + } } diff --git a/pci/Cargo.toml b/pci/Cargo.toml index 760baae03d..c1b69f9854 100644 --- a/pci/Cargo.toml +++ b/pci/Cargo.toml @@ -2,6 +2,7 @@ authors = ["Samuel Ortiz "] edition.workspace = true name = "pci" +rust-version.workspace = true version = "0.1.0" [features] diff --git a/pci/src/bus.rs b/pci/src/bus.rs index eaae23a4d8..89efd2ed68 100644 --- a/pci/src/bus.rs +++ b/pci/src/bus.rs @@ -10,7 +10,7 @@ use std::ops::DerefMut; use std::sync::{Arc, Barrier, Mutex}; use byteorder::{ByteOrder, LittleEndian}; -use log::error; +use log::warn; use thiserror::Error; use vm_device::{Bus, BusDevice, BusDeviceSync}; @@ -20,9 +20,13 @@ use crate::configuration::{ }; use crate::device::{BarReprogrammingParams, DeviceRelocation, Error as PciDeviceError, PciDevice}; +/// Denotes the PCI device ID of a bus' root bridge device. +pub const PCI_ROOT_DEVICE_ID: u8 = 0; +/// Denotes the maximum number of PCI devices allowed on a bus. 32 per PCI spec. +pub const NUM_DEVICE_IDS: u8 = 32; + const VENDOR_ID_INTEL: u16 = 0x8086; const DEVICE_ID_INTEL_VIRT_PCIE_HOST: u16 = 0x0d57; -const NUM_DEVICE_IDS: usize = 32; /// Errors for device manager. #[derive(Error, Debug)] @@ -43,10 +47,10 @@ pub enum PciRootError { #[error("Could not find an available device slot on the PCI bus")] NoPciDeviceSlotAvailable, /// Invalid PCI device identifier provided. 
- #[error("Invalid PCI device identifier provided")] + #[error("Invalid PCI device identifier provided: {0}")] InvalidPciDeviceSlot(usize), /// Valid PCI device identifier but already used. - #[error("Valid PCI device identifier but already used")] + #[error("Valid PCI device identifier but already used: {0}")] AlreadyInUsePciDeviceSlot(usize), } pub type Result = std::result::Result; @@ -110,21 +114,28 @@ impl PciDevice for PciRoot { } } +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +enum DeviceIdState { + Free, + Reserved, + Allocated, +} + pub struct PciBus { /// Devices attached to this bus. /// Device 0 is host bridge. - devices: HashMap>>, + devices: HashMap>>, device_reloc: Arc, - device_ids: Vec, + device_ids: [DeviceIdState; NUM_DEVICE_IDS as usize], } impl PciBus { pub fn new(pci_root: PciRoot, device_reloc: Arc) -> Self { - let mut devices: HashMap>> = HashMap::new(); - let mut device_ids: Vec = vec![false; NUM_DEVICE_IDS]; + let mut devices: HashMap>> = HashMap::new(); + let mut device_ids = [DeviceIdState::Free; NUM_DEVICE_IDS as usize]; - devices.insert(0, Arc::new(Mutex::new(pci_root))); - device_ids[0] = true; + devices.insert(PCI_ROOT_DEVICE_ID, Arc::new(Mutex::new(pci_root))); + device_ids[PCI_ROOT_DEVICE_ID as usize] = DeviceIdState::Allocated; PciBus { devices, @@ -158,7 +169,7 @@ impl PciBus { Ok(()) } - pub fn add_device(&mut self, device_id: u32, device: Arc>) -> Result<()> { + pub fn add_device(&mut self, device_id: u8, device: Arc>) -> Result<()> { self.devices.insert(device_id, device); Ok(()) } @@ -168,36 +179,83 @@ impl PciBus { Ok(()) } - pub fn next_device_id(&mut self) -> Result { - for (idx, device_id) in self.device_ids.iter_mut().enumerate() { - if !(*device_id) { - *device_id = true; - return Ok(idx as u32); + /// Reserves a PCI device ID on the bus, marking it as in-use so + /// that automatic allocation will not use it. + /// + /// - `id`: Preferred ID to reserve on the bus. 
+ /// + /// ## Errors + /// + /// * Returns [`PciRootError::AlreadyInUsePciDeviceSlot`] if the + /// slot is already reserved or allocated. + /// * Returns [`PciRootError::InvalidPciDeviceSlot`] if the slot + /// exceeds [`NUM_DEVICE_IDS`]. + pub fn reserve_device_id(&mut self, id: u8) -> Result { + let idx = id as usize; + if idx < NUM_DEVICE_IDS as usize { + if self.device_ids[idx] == DeviceIdState::Free { + self.device_ids[idx] = DeviceIdState::Reserved; + Ok(id) + } else { + Err(PciRootError::AlreadyInUsePciDeviceSlot(idx)) } + } else { + Err(PciRootError::InvalidPciDeviceSlot(idx)) } - - Err(PciRootError::NoPciDeviceSlotAvailable) } - pub fn get_device_id(&mut self, id: usize) -> Result<()> { - if id < NUM_DEVICE_IDS { - if self.device_ids[id] { - Err(PciRootError::AlreadyInUsePciDeviceSlot(id)) + /// Allocates a PCI device ID on the bus. + /// + /// - `id`: ID to allocate on the bus. If [`None`], the next free + /// device ID on the bus is allocated, else the ID given is + /// allocated + /// + /// ## Errors + /// + /// * Returns [`PciRootError::AlreadyInUsePciDeviceSlot`] in case + /// the ID requested is already allocated. + /// * Returns [`PciRootError::InvalidPciDeviceSlot`] in case the + /// requested ID exceeds the maximum number of devices allowed per + /// bus (see [`NUM_DEVICE_IDS`]). + /// * If `id` is [`None`]: Returns + /// [`PciRootError::NoPciDeviceSlotAvailable`] if no free device + /// slot is available on the bus. 
+ pub fn allocate_device_id(&mut self, id: Option) -> Result { + if let Some(idx) = id.map(|i| i as usize) { + if idx < NUM_DEVICE_IDS as usize { + if self.device_ids[idx] == DeviceIdState::Allocated { + Err(PciRootError::AlreadyInUsePciDeviceSlot(idx)) + } else { + self.device_ids[idx] = DeviceIdState::Allocated; + Ok(idx as u8) + } } else { - self.device_ids[id] = true; - Ok(()) + Err(PciRootError::InvalidPciDeviceSlot(idx)) } } else { - Err(PciRootError::InvalidPciDeviceSlot(id)) + for (idx, device_id) in self.device_ids.iter_mut().enumerate() { + if *device_id == DeviceIdState::Free { + *device_id = DeviceIdState::Allocated; + return Ok(idx as u8); + } + } + Err(PciRootError::NoPciDeviceSlotAvailable) } } - pub fn put_device_id(&mut self, id: usize) -> Result<()> { + /// Frees a PCI device ID on the bus. + /// + /// - `id`: ID to free on the bus. + /// + /// ## Errors + /// * Returns [`PciRootError::InvalidPciDeviceSlot`] if the slot + /// exceeds [`NUM_DEVICE_IDS`]. + pub fn free_device_id(&mut self, id: u8) -> Result<()> { if id < NUM_DEVICE_IDS { - self.device_ids[id] = false; + self.device_ids[id as usize] = DeviceIdState::Free; Ok(()) } else { - Err(PciRootError::InvalidPciDeviceSlot(id)) + Err(PciRootError::InvalidPciDeviceSlot(id as usize)) } } } @@ -240,7 +298,7 @@ impl PciConfigIo { .lock() .unwrap() .devices - .get(&(device as u32)) + .get(&(device as u8)) .map_or(0xffff_ffff, |d| { d.lock().unwrap().read_config_register(register) }) @@ -265,7 +323,7 @@ impl PciConfigIo { } let pci_bus = self.pci_bus.as_ref().lock().unwrap(); - if let Some(d) = pci_bus.devices.get(&(device as u32)) { + if let Some(d) = pci_bus.devices.get(&(device as u8)) { let mut device = d.lock().unwrap(); // Update the register value @@ -280,10 +338,15 @@ impl PciConfigIo { device.deref_mut(), params.region_type, ) { - error!( - "Failed moving device BAR: {}: 0x{:x}->0x{:x}(0x{:x})", + warn!( + "Failed moving device BAR: {}: 0x{:x}->0x{:x}(0x{:x}), keeping old BAR", e, 
params.old_base, params.new_base, params.len ); + // Rollback: the config register was already updated to + // new_base by detect_bar_reprogramming(). Restore it by + // writing back the old address so device state stays + // consistent with the MMIO bus mapping. + device.restore_bar_addr(params); } } @@ -371,7 +434,7 @@ impl PciConfigMmio { .lock() .unwrap() .devices - .get(&(device as u32)) + .get(&(device as u8)) .map_or(0xffff_ffff, |d| { d.lock().unwrap().read_config_register(register) }) @@ -390,7 +453,7 @@ impl PciConfigMmio { } let pci_bus = self.pci_bus.lock().unwrap(); - if let Some(d) = pci_bus.devices.get(&(device as u32)) { + if let Some(d) = pci_bus.devices.get(&(device as u8)) { let mut device = d.lock().unwrap(); // Update the register value @@ -405,10 +468,11 @@ impl PciConfigMmio { device.deref_mut(), params.region_type, ) { - error!( - "Failed moving device BAR: {}: 0x{:x}->0x{:x}(0x{:x})", + warn!( + "Failed moving device BAR: {}: 0x{:x}->0x{:x}(0x{:x}), keeping old BAR", e, params.old_base, params.new_base, params.len ); + device.restore_bar_addr(params); } } } @@ -486,3 +550,122 @@ fn parse_io_config_address(config_address: u32) -> (usize, usize, usize, usize) shift_and_mask(config_address, REGISTER_NUMBER_OFFSET, REGISTER_NUMBER_MASK), ) } + +#[cfg(test)] +mod unit_tests { + use std::error::Error; + use std::result::Result; + + use super::*; + + #[derive(Debug)] + /// Helper struct that mocks the implementation of DeviceRelocation + struct MockDeviceRelocation; + + impl DeviceRelocation for MockDeviceRelocation { + fn move_bar( + &self, + _old_base: u64, + _new_base: u64, + _len: u64, + _pci_dev: &mut dyn PciDevice, + _region_type: PciBarRegionType, + ) -> Result<(), std::io::Error> { + Ok(()) + } + } + + fn setup_bus() -> PciBus { + let pci_root = PciRoot::new(None); + let mock_device_reloc = Arc::new(MockDeviceRelocation {}); + PciBus::new(pci_root, mock_device_reloc) + } + + #[test] + // Test to acquire all IDs that can be acquired + fn 
allocate_device_id_next_free() { + // The first address is occupied by the root + let mut bus = setup_bus(); + for expected_id in 1..NUM_DEVICE_IDS { + assert_eq!(expected_id, bus.allocate_device_id(None).unwrap()); + } + } + + #[test] + // Test that requesting specific ID work + fn allocate_device_id_request_id() -> Result<(), Box> { + // The first address is occupied by the root + let mut bus = setup_bus(); + let max_id = NUM_DEVICE_IDS - 1; + assert_eq!(0x01_u8, bus.allocate_device_id(Some(0x01))?); + assert_eq!(0x10_u8, bus.allocate_device_id(Some(0x10))?); + assert_eq!(max_id, bus.allocate_device_id(Some(max_id))?); + Ok(()) + } + + #[test] + // Test that reserved IDs are skipped by automatic allocation + fn allocate_device_id_fills_gaps() -> Result<(), Box> { + // The first address is occupied by the root + let mut bus = setup_bus(); + bus.reserve_device_id(0x01)?; + bus.reserve_device_id(0x03)?; + bus.reserve_device_id(0x06)?; + assert_eq!(0x02_u8, bus.allocate_device_id(None)?); + assert_eq!(0x04_u8, bus.allocate_device_id(None)?); + assert_eq!(0x05_u8, bus.allocate_device_id(None)?); + assert_eq!(0x07_u8, bus.allocate_device_id(None)?); + Ok(()) + } + + #[test] + // Test that reserving the same ID twice fails + fn reserve_device_id_twice_fails() -> Result<(), Box> { + let mut bus = setup_bus(); + let max_id = NUM_DEVICE_IDS - 1; + bus.reserve_device_id(max_id)?; + let result = bus.reserve_device_id(max_id); + assert!(matches!( + result, + Err(PciRootError::AlreadyInUsePciDeviceSlot(x)) if x == usize::from(max_id), + )); + Ok(()) + } + + #[test] + // Test that allocating a previously reserved ID succeeds (idempotent) + fn allocate_device_id_after_reserve() -> Result<(), Box> { + let mut bus = setup_bus(); + bus.reserve_device_id(0x10)?; + assert_eq!(0x10_u8, bus.allocate_device_id(Some(0x10))?); + Ok(()) + } + + #[test] + // Test to request an invalid ID + fn allocate_device_id_request_invalid_id_fails() -> Result<(), Box> { + let mut bus = setup_bus(); + 
let max_id = NUM_DEVICE_IDS + 1; + let result = bus.allocate_device_id(Some(max_id)); + assert!(matches!( + result, + Err(PciRootError::InvalidPciDeviceSlot(x)) if x == usize::from(max_id), + )); + Ok(()) + } + + #[test] + // Test to acquire an ID when all IDs were already acquired + fn allocate_device_id_none_left() { + // The first address is occupied by the root + let mut bus = setup_bus(); + for expected_id in 1..NUM_DEVICE_IDS { + assert_eq!(expected_id, bus.allocate_device_id(None).unwrap()); + } + let result = bus.allocate_device_id(None); + assert!(matches!( + result, + Err(PciRootError::NoPciDeviceSlotAvailable), + )); + } +} diff --git a/pci/src/configuration.rs b/pci/src/configuration.rs index 4dbd04f122..2a905e19bc 100644 --- a/pci/src/configuration.rs +++ b/pci/src/configuration.rs @@ -422,6 +422,9 @@ pub struct PciConfigurationState { rom_bar_used: bool, last_capability: Option<(usize, usize)>, msix_cap_reg_idx: Option, + // Preserve deferred BAR moves across snapshot and restore. + #[serde(default)] + pending_bar_reprogram: Vec, } /// Contains the configuration space of a PCI node. 
@@ -557,6 +560,7 @@ impl PciConfiguration { rom_bar_used, last_capability, msix_cap_reg_idx, + pending_bar_reprogram, ) = if let Some(state) = state { ( state.registers.try_into().unwrap(), @@ -567,6 +571,7 @@ impl PciConfiguration { state.rom_bar_used, state.last_capability, state.msix_cap_reg_idx, + state.pending_bar_reprogram, ) } else { let mut registers = [0u32; NUM_CONFIGURATION_REGISTERS]; @@ -606,6 +611,7 @@ impl PciConfiguration { false, None, None, + Vec::new(), ) }; @@ -619,7 +625,7 @@ impl PciConfiguration { last_capability, msix_cap_reg_idx, msix_config, - pending_bar_reprogram: Vec::new(), + pending_bar_reprogram, } } @@ -633,6 +639,7 @@ impl PciConfiguration { rom_bar_used: self.rom_bar_used, last_capability: self.last_capability, msix_cap_reg_idx: self.msix_cap_reg_idx, + pending_bar_reprogram: self.pending_bar_reprogram.clone(), } } @@ -1086,6 +1093,55 @@ impl PciConfiguration { pub(crate) fn clear_pending_bar_reprogram(&mut self) { self.pending_bar_reprogram = Vec::new(); } + + /// Restore BAR address after a failed move. This undoes the premature + /// address update in detect_bar_reprogramming() so that config space + /// stays consistent with the actual MMIO mapping. + pub fn restore_bar_addr(&mut self, params: &BarReprogrammingParams) { + match params.region_type { + PciBarRegionType::Memory64BitRegion => { + // 64-bit BAR spans two slots: bars[i] (low, type Memory64BitRegion) + // and bars[i+1] (high, type None). Mirror detect_bar_reprogramming + // by matching the combined address and restoring both halves. 
+ for i in 0..NUM_BAR_REGS - 1 { + if self.bars[i].r#type != Some(PciBarRegionType::Memory64BitRegion) { + continue; + } + let low_mask = self.writable_bits[BAR0_REG + i]; + let high_mask = self.writable_bits[BAR0_REG + i + 1]; + let current = (u64::from(self.bars[i + 1].addr & high_mask) << 32) + | u64::from(self.bars[i].addr & low_mask); + if current == params.new_base { + let old_low = params.old_base as u32; + let old_high = (params.old_base >> 32) as u32; + self.bars[i].addr = old_low; + self.bars[i + 1].addr = old_high; + self.registers[BAR0_REG + i] = + (self.registers[BAR0_REG + i] & !low_mask) | (old_low & low_mask); + self.registers[BAR0_REG + i + 1] = (self.registers[BAR0_REG + i + 1] + & !high_mask) + | (old_high & high_mask); + return; + } + } + } + _ => { + // 32-bit Memory or IO BAR + for i in 0..NUM_BAR_REGS { + let mask = self.writable_bits[BAR0_REG + i]; + if self.bars[i].r#type == Some(params.region_type) + && u64::from(self.bars[i].addr & mask) == params.new_base + { + let old = params.old_base as u32; + self.bars[i].addr = old; + self.registers[BAR0_REG + i] = + (self.registers[BAR0_REG + i] & !mask) | (old & mask); + return; + } + } + } + } + } } impl Pausable for PciConfiguration {} diff --git a/pci/src/device.rs b/pci/src/device.rs index 3c5b3315f8..482e15e404 100644 --- a/pci/src/device.rs +++ b/pci/src/device.rs @@ -5,9 +5,10 @@ // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause use std::any::Any; -use std::sync::{Arc, Barrier, Mutex}; +use std::sync::{Arc, Barrier}; use std::{io, result}; +use serde::{Deserialize, Serialize}; use thiserror::Error; use vm_allocator::{AddressAllocator, SystemAllocator}; use vm_device::Resource; @@ -35,7 +36,7 @@ pub enum Error { } pub type Result = std::result::Result; -#[derive(Clone, Copy, Debug)] +#[derive(Clone, Copy, Debug, Serialize, Deserialize)] pub struct BarReprogrammingParams { pub old_base: u64, pub new_base: u64, @@ -48,7 +49,7 @@ pub trait PciDevice: Send { /// returns an address. 
Returns a Vec of (GuestAddress, GuestUsize) tuples. fn allocate_bars( &mut self, - _allocator: &Arc>, + _allocator: &mut SystemAllocator, _mmio32_allocator: &mut AddressAllocator, _mmio64_allocator: &mut AddressAllocator, _resources: Option>, @@ -92,6 +93,10 @@ pub trait PciDevice: Send { fn move_bar(&mut self, _old_base: u64, _new_base: u64) -> result::Result<(), io::Error> { Ok(()) } + /// Restore BAR address in config space after a failed move_bar. + /// This rolls back the address update made by detect_bar_reprogramming() + /// so that the config register stays consistent with the MMIO bus mapping. + fn restore_bar_addr(&mut self, _params: &BarReprogrammingParams) {} /// Provides a mutable reference to the Any trait. This is useful to let /// the caller have access to the underlying type behind the trait. fn as_any_mut(&mut self) -> &mut dyn Any; diff --git a/pci/src/lib.rs b/pci/src/lib.rs index 5ab87cf19d..c5bba16d29 100644 --- a/pci/src/lib.rs +++ b/pci/src/lib.rs @@ -21,7 +21,9 @@ use std::str::FromStr; use serde::de::Visitor; -pub use self::bus::{PciBus, PciConfigIo, PciConfigMmio, PciRoot, PciRootError}; +pub use self::bus::{ + NUM_DEVICE_IDS, PCI_ROOT_DEVICE_ID, PciBus, PciConfigIo, PciConfigMmio, PciRoot, PciRootError, +}; pub use self::configuration::{ PCI_CONFIGURATION_ID, PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciCapability, PciCapabilityId, PciClassCode, PciConfiguration, PciExpressCapabilityId, PciHeaderType, @@ -32,7 +34,10 @@ pub use self::device::{ BarReprogrammingParams, DeviceRelocation, Error as PciDeviceError, PciDevice, }; pub use self::msi::{MsiCap, MsiConfig, msi_num_enabled_vectors}; -pub use self::msix::{MSIX_CONFIG_ID, MSIX_TABLE_ENTRY_SIZE, MsixCap, MsixConfig, MsixTableEntry}; +pub use self::msix::{ + MSIX_CONFIG_ID, MSIX_TABLE_ENTRY_SIZE, MaybeMutInterruptSourceGroup, MsixCap, MsixConfig, + MsixTableEntry, +}; pub use self::vfio::{MmioRegion, VfioDmaMapping, VfioPciDevice, VfioPciError}; pub use 
self::vfio_user::{VfioUserDmaMapping, VfioUserPciDevice, VfioUserPciDeviceError}; diff --git a/pci/src/msix.rs b/pci/src/msix.rs index 9bc5e63f3a..49b379b02a 100644 --- a/pci/src/msix.rs +++ b/pci/src/msix.rs @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause // -use std::sync::Arc; +use std::sync::{Arc, Mutex}; use std::{io, result}; use byteorder::{ByteOrder, LittleEndian}; @@ -15,6 +15,7 @@ use vm_device::interrupt::{ }; use vm_memory::ByteValued; use vm_migration::{MigratableError, Pausable, Snapshot, Snapshottable}; +use vmm_sys_util::eventfd::EventFd; use crate::{PciCapability, PciCapabilityId}; @@ -72,11 +73,66 @@ pub struct MsixConfigState { enabled: bool, } +#[derive(Clone)] +pub enum MaybeMutInterruptSourceGroup { + Immutable(Arc), + Mutable(Arc>), +} + +macro_rules! impl_method { + ($( + fn $i: ident(&self $(,$index:ident : $InterruptIndex:ty)*$(,)?) -> $r: ty; + )*) => { + $( + fn $i(&self $(,$index: $InterruptIndex)*) -> $r { + match self { + Self::Immutable(source) => source.$i($($index),*), + Self::Mutable(source) => source.lock().unwrap().$i($($index),*), + } + } + )* + }; +} + +impl InterruptSourceGroup for MaybeMutInterruptSourceGroup { + impl_method! { + fn trigger(&self, index: InterruptIndex) -> vm_device::interrupt::Result<()>; + + fn notifier(&self, index: InterruptIndex) -> Option; + + fn update( + &self, + index: InterruptIndex, + config: InterruptSourceConfig, + masked: bool, + set_gsi: bool, + ) -> vm_device::interrupt::Result<()>; + + fn set_gsi(&self) -> vm_device::interrupt::Result<()>; + } +} + +impl MaybeMutInterruptSourceGroup { + pub fn set_notifier( + &self, + index: InterruptIndex, + eventfd: Option, + vm: &dyn hypervisor::Vm, + ) -> std::io::Result<()> { + match self { + Self::Immutable(_) => panic!( + "Attempted to set a notifier of an immutable source. You must mark your device as needing a mutable source by having sets_irqfd() return true." 
+ ), + Self::Mutable(source) => source.lock().unwrap().set_notifier(index, eventfd, vm), + } + } +} + pub struct MsixConfig { pub table_entries: Vec, pub pba_entries: Vec, pub devid: u32, - interrupt_source_group: Arc, + interrupt_source_group: MaybeMutInterruptSourceGroup, masked: bool, enabled: bool, } @@ -84,7 +140,7 @@ pub struct MsixConfig { impl MsixConfig { pub fn new( msix_vectors: u16, - interrupt_source_group: Arc, + interrupt_source_group: MaybeMutInterruptSourceGroup, devid: u32, state: Option, ) -> result::Result { diff --git a/pci/src/vfio.rs b/pci/src/vfio.rs index 9e8e7e3163..e0c9110a86 100644 --- a/pci/src/vfio.rs +++ b/pci/src/vfio.rs @@ -19,11 +19,10 @@ use log::{error, info}; use serde::{Deserialize, Serialize}; use thiserror::Error; use vfio_bindings::bindings::vfio::*; -use vfio_ioctls::{ - VfioContainer, VfioDevice, VfioIrq, VfioRegionInfoCap, VfioRegionSparseMmapArea, -}; +use vfio_ioctls::{VfioDevice, VfioIrq, VfioOps, VfioRegionInfoCap, VfioRegionSparseMmapArea}; use vm_allocator::page_size::{ - align_page_size_down, align_page_size_up, is_4k_aligned, is_4k_multiple, is_page_size_aligned, + align_page_size_down, align_page_size_up, get_page_size, is_4k_aligned, is_4k_multiple, + is_page_size_aligned, }; use vm_allocator::{AddressAllocator, MemorySlotAllocator, SystemAllocator}; use vm_device::dma_mapping::ExternalDmaMapping; @@ -37,7 +36,7 @@ use vmm_sys_util::eventfd::EventFd; use crate::mmap::MmapRegion; use crate::msi::{MSI_CONFIG_ID, MsiConfigState}; -use crate::msix::MsixConfigState; +use crate::msix::{MaybeMutInterruptSourceGroup, MsixConfigState}; use crate::{ BarReprogrammingParams, MSIX_CONFIG_ID, MSIX_TABLE_ENTRY_SIZE, MsiCap, MsiConfig, MsixCap, MsixConfig, PCI_CONFIGURATION_ID, PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, @@ -159,7 +158,14 @@ impl VfioMsix { // Update "Message Control" word if offset == 2 && data.len() == 2 { - self.bar.set_msg_ctl(LittleEndian::read_u16(data)); + let data = 
LittleEndian::read_u16(data); + self.bar.set_msg_ctl(data); + self.cap.set_msg_ctl(data); + } else if offset == 0 && data.len() == 4 { + // Some guests update MSI-X control through the dword config write path. + let data = (LittleEndian::read_u32(data) >> 16) as u16; + self.bar.set_msg_ctl(data); + self.cap.set_msg_ctl(data); } let new_enabled = self.bar.enabled(); @@ -604,7 +610,7 @@ impl VfioCommon { #[allow(unused_variables)] pub(crate) fn allocate_bars( &mut self, - allocator: &Arc>, + allocator: &mut SystemAllocator, mmio32_allocator: &mut AddressAllocator, mmio64_allocator: &mut AddressAllocator, resources: Option<&[Resource]>, @@ -743,8 +749,6 @@ impl VfioCommon { PciBarRegionType::IoRegion => { // The address needs to be 4 bytes aligned. allocator - .lock() - .unwrap() .allocate_io_addresses(restored_bar_addr, region_size, Some(0x4)) .ok_or(PciDeviceError::IoAllocationFailed(region_size))? } @@ -863,7 +867,7 @@ impl VfioCommon { let msix_config = MsixConfig::new( msix_cap.table_size(), - interrupt_source_group.clone(), + MaybeMutInterruptSourceGroup::Immutable(interrupt_source_group.clone()), bdf.into(), state, ) @@ -1468,9 +1472,12 @@ pub struct VfioPciDevice { id: String, vm: Arc, device: Arc, - container: Arc, + vfio_ops: Arc, common: VfioCommon, iommu_attached: bool, + // Whether to map VFIO device MMIO BARs into the host IOMMU address space. + // Required for peer-to-peer DMA between VFIO devices. 
+ p2p_dma: bool, memory_slot_allocator: MemorySlotAllocator, bdf: PciBdf, device_path: PathBuf, @@ -1483,10 +1490,11 @@ impl VfioPciDevice { id: String, vm: Arc, device: VfioDevice, - container: Arc, + vfio_ops: Arc, msi_interrupt_manager: Arc>, legacy_interrupt_group: Option>, iommu_attached: bool, + p2p_dma: bool, bdf: PciBdf, memory_slot_allocator: MemorySlotAllocator, snapshot: Option<&Snapshot>, @@ -1512,9 +1520,10 @@ impl VfioPciDevice { id, vm, device, - container, + vfio_ops, common, iommu_attached, + p2p_dma, memory_slot_allocator, bdf, device_path, @@ -1564,13 +1573,33 @@ impl VfioPciDevice { let (offset, size) = msix.cap.table_range(); let offset = align_page_size_down(offset); let size = align_page_size_up(size); - inter_ranges.insert(offset, size); + // MSI-X mmap region safety: when a device has a non page + // aligned MSI-X offset, fixup_msix_region() relocates MSI-X + // to the upper half of an enlarged virtual BAR, causing the + // offsets in msix.cap to exceed the physical BAR size. This + // check skips carving a hole, preventing invalid offsets from + // reaching the mmap path. With no holes, + // generate_sparse_areas() returns a single sparse region + // covering the entire physical BAR. The relocated MSI-X in + // the virtual BAR remains trapped because its upper half has + // no mmap backing. Exposing the physical MSI-X region through + // mmap is safe when the kernel advertises + // VFIO_REGION_INFO_CAP_MSIX_MAPPABLE. When MSI-X offsets are + // already page aligned, fixup_msix_region() does not relocate + // and this check is satisfied, so a hole is carved at the + // intended offset as before. + if offset < region_size { + inter_ranges.insert(offset, size); + } } if region_index == msix.cap.pba_bir() { let (offset, size) = msix.cap.pba_range(); let offset = align_page_size_down(offset); let size = align_page_size_up(size); - inter_ranges.insert(offset, size); + // See MSI-X mmap safety comment above. 
+ if offset < region_size { + inter_ranges.insert(offset, size); + } } } @@ -1658,9 +1687,40 @@ impl VfioPciDevice { self.common.interrupt.msix.as_ref(), )?; + let page_size = get_page_size(); for area in sparse_areas.iter() { + // KVM_SET_USER_MEMORY_REGION requires memory_size to be a + // multiple of the host page size. On aarch64 with 64K pages + // a device BAR can be smaller than a page (e.g. 16K NVMe + // BAR). + // + // The kernel only sets VFIO_REGION_INFO_FLAG_MMAP on sub-page + // BARs after verifying the physical BAR start is page-aligned + // and reserving the rest of the page. Expansion is only safe + // at offset 0 where the kernel reservation applies. + // + // fixup_msix_region() ensures MSI-X relocation at >= page_size + // offset, so the expanded mmap cannot overlap the trap region. + let mmap_len = if area.size < page_size { + if area.offset != 0 { + error!( + "BAR {}: sub-page sparse area at non-zero offset 0x{:x} \ + cannot be safely expanded to page size", + region.index, area.offset, + ); + return Err(VfioPciError::MmapArea); + } + info!( + "BAR {}: expanding sub-page sparse area mmap from 0x{:x} to \ + page size 0x{:x}", + region.index, area.size, page_size, + ); + page_size + } else { + area.size + }; let mapping = match MmapRegion::mmap( - area.size, + mmap_len, prot, fd, mmap_offset, @@ -1671,7 +1731,7 @@ impl VfioPciDevice { error!( "Could not mmap sparse area (offset = 0x{:x}, size = 0x{:x}): {}", mmap_offset, - area.size, + mmap_len, std::io::Error::last_os_error() ); return Err(VfioPciError::MmapArea); @@ -1699,7 +1759,9 @@ impl VfioPciDevice { } .map_err(VfioPciError::CreateUserMemoryRegion)?; - if !self.iommu_attached { + // Map the MMIO BAR into the host IOMMU address space via VfioOps + // Only needed if p2p_dma is enabled. + if !self.iommu_attached && self.p2p_dma { // vfio_dma_map should be unsafe but isn't. 
#[allow(unused_unsafe)] // SAFETY: MmapRegion invariants guarantee that @@ -1707,12 +1769,10 @@ impl VfioPciDevice { // user_memory_region.mapping.len() bytes of // valid memory that will only be unmapped with munmap(). unsafe { - self.container.vfio_dma_map( + self.vfio_ops.vfio_dma_map( user_memory_region.start, - user_memory_region.mapping.len().try_into().unwrap(), - (user_memory_region.mapping.addr() as usize) - .try_into() - .unwrap(), + user_memory_region.mapping.len(), + user_memory_region.mapping.addr(), ) } .map_err(|e| VfioPciError::DmaMap(e, self.device_path.clone(), self.bdf))?; @@ -1730,15 +1790,17 @@ impl VfioPciDevice { for user_memory_region in region.user_memory_regions.drain(..) { let len = user_memory_region.mapping.len(); let host_addr = user_memory_region.mapping.addr(); - // Unmap from vfio container + // Unmap MMIO region from the host IOMMU address space via VfioOps + // Only needed if p2p_dma is enabled. if !self.iommu_attached + && self.p2p_dma && let Err(e) = self - .container - .vfio_dma_unmap(user_memory_region.start, len.try_into().unwrap()) + .vfio_ops + .vfio_dma_unmap(user_memory_region.start, len) .map_err(|e| VfioPciError::DmaUnmap(e, self.device_path.clone(), self.bdf)) { error!( - "Could not unmap mmio region from vfio container: \ + "Could not unmap MMIO region from the host IOMMU address space: \ iova 0x{:x}, size 0x{:x}: {}, ", user_memory_region.start, len, e ); @@ -1836,7 +1898,7 @@ const PCI_ROM_EXP_BAR_INDEX: usize = 12; impl PciDevice for VfioPciDevice { fn allocate_bars( &mut self, - allocator: &Arc>, + allocator: &mut SystemAllocator, mmio32_allocator: &mut AddressAllocator, mmio64_allocator: &mut AddressAllocator, resources: Option>, @@ -1888,17 +1950,19 @@ impl PciDevice for VfioPciDevice { for user_memory_region in region.user_memory_regions.iter_mut() { let len = user_memory_region.mapping.len(); let host_addr = user_memory_region.mapping.addr(); - // Unmap the old MMIO region from vfio container + // Unmap the 
old MMIO region from the host IOMMU address space via VfioOps + // Only needed if p2p_dma is enabled. if !self.iommu_attached + && self.p2p_dma && let Err(e) = self - .container - .vfio_dma_unmap(user_memory_region.start, len.try_into().unwrap()) + .vfio_ops + .vfio_dma_unmap(user_memory_region.start, len) .map_err(|e| { VfioPciError::DmaUnmap(e, self.device_path.clone(), self.bdf) }) { error!( - "Could not unmap mmio region from vfio container: \ + "Could not unmap MMIO region from the host IOMMU address space: \ iova 0x{:x}, size 0x{:x}: {}, ", user_memory_region.start, len, e ); @@ -1942,24 +2006,22 @@ iova 0x{:x}, size 0x{:x}: {}, ", } .map_err(io::Error::other)?; - // Map the moved mmio region to vfio container - if !self.iommu_attached { + // Map the moved MMIO region into the host IOMMU address space via VfioOps + // Only needed if p2p_dma is enabled. + if !self.iommu_attached && self.p2p_dma { // vfio_dma_map is unsound and ought to be marked as unsafe #[allow(unused_unsafe)] // SAFETY: MmapRegion invariants guarantee that // host_addr points to len bytes of // valid memory that will only be unmapped with munmap(). 
unsafe { - self.container.vfio_dma_map( - user_memory_region.start, - len.try_into().unwrap(), - (host_addr as usize).try_into().unwrap(), - ) + self.vfio_ops + .vfio_dma_map(user_memory_region.start, len, host_addr) } .map_err(|e| VfioPciError::DmaMap(e, self.device_path.clone(), self.bdf)) .map_err(|e| { io::Error::other(format!( - "Could not map mmio region to vfio container: \ + "Could not map MMIO region into the host IOMMU address space: \ iova 0x{:x}, size 0x{:x}: {}, ", user_memory_region.start, len, e )) @@ -1972,6 +2034,10 @@ iova 0x{:x}, size 0x{:x}: {}, ", Ok(()) } + fn restore_bar_addr(&mut self, params: &BarReprogrammingParams) { + self.common.configuration.restore_bar_addr(params); + } + fn as_any_mut(&mut self) -> &mut dyn Any { self } @@ -2003,9 +2069,9 @@ impl Migratable for VfioPciDevice {} /// This structure implements the ExternalDmaMapping trait. It is meant to /// be used when the caller tries to provide a way to update the mappings -/// associated with a specific VFIO container. +/// associated with a specific VfioOps instance. pub struct VfioDmaMapping { - container: Arc, + vfio_ops: Arc, memory: Arc, mmio_regions: Arc>>, } @@ -2013,16 +2079,16 @@ pub struct VfioDmaMapping { impl VfioDmaMapping { /// Create a DmaMapping object. /// # Parameters - /// * `container`: VFIO container object. + /// * `vfio_ops`: VfioOps instance. /// * `memory`: guest memory to mmap. /// * `mmio_regions`: mmio_regions to mmap. 
pub fn new( - container: Arc, + vfio_ops: Arc, memory: Arc, mmio_regions: Arc>>, ) -> Self { VfioDmaMapping { - container, + vfio_ops, memory, mmio_regions, } @@ -2069,24 +2135,22 @@ impl ExternalDmaMapping for VfioDmaMapping std::result::Result<(), io::Error> { - self.container.vfio_dma_unmap(iova, size).map_err(|e| { - io::Error::other(format!( - "failed to unmap memory for VFIO container, \ + self.vfio_ops + .vfio_dma_unmap(iova, size as usize) + .map_err(|e| { + io::Error::other(format!( + "failed to unmap memory from the host IOMMU address space, \ iova 0x{iova:x}, size 0x{size:x}: {e:?}" - )) - }) + )) + }) } } diff --git a/pci/src/vfio_user.rs b/pci/src/vfio_user.rs index 248f87272a..27c7dc0405 100644 --- a/pci/src/vfio_user.rs +++ b/pci/src/vfio_user.rs @@ -391,7 +391,7 @@ impl Vfio for VfioUserClientWrapper { impl PciDevice for VfioUserPciDevice { fn allocate_bars( &mut self, - allocator: &Arc>, + allocator: &mut SystemAllocator, mmio32_allocator: &mut AddressAllocator, mmio64_allocator: &mut AddressAllocator, resources: Option>, @@ -414,6 +414,10 @@ impl PciDevice for VfioUserPciDevice { .free_bars(allocator, mmio32_allocator, mmio64_allocator) } + fn restore_bar_addr(&mut self, params: &BarReprogrammingParams) { + self.common.configuration.restore_bar_addr(params); + } + fn as_any_mut(&mut self) -> &mut dyn Any { self } diff --git a/performance-metrics/Cargo.toml b/performance-metrics/Cargo.toml index 472f1159b3..776a46fc2c 100644 --- a/performance-metrics/Cargo.toml +++ b/performance-metrics/Cargo.toml @@ -5,12 +5,15 @@ name = "performance-metrics" version = "0.1.0" [dependencies] +block = { path = "../block", features = ["io_uring"] } clap = { workspace = true, features = ["wrap_help"] } dirs = { workspace = true } +libc = { workspace = true } serde = { workspace = true, features = ["derive", "rc"] } serde_json = { workspace = true } test_infra = { path = "../test_infra" } thiserror = { workspace = true } +vmm-sys-util = { workspace = true } [lints] 
workspace = true diff --git a/performance-metrics/src/main.rs b/performance-metrics/src/main.rs index c48e5a906d..5192a4406b 100644 --- a/performance-metrics/src/main.rs +++ b/performance-metrics/src/main.rs @@ -4,7 +4,9 @@ // // Custom harness to run performance tests +mod micro_bench_block; mod performance_tests; +mod util; use std::process::Command; use std::sync::Arc; @@ -26,6 +28,14 @@ enum Error { TestFailed, } +#[derive(Deserialize, Serialize)] +enum TestStatus { + #[serde(rename = "PASSED")] + Passed, + #[serde(rename = "FAILED")] + Failed, +} + #[derive(Deserialize, Serialize)] pub struct PerformanceTestResult { name: String, @@ -33,6 +43,31 @@ pub struct PerformanceTestResult { std_dev: f64, max: f64, min: f64, + status: TestStatus, +} + +impl PerformanceTestResult { + fn passed(name: &str, mean: f64, std_dev: f64, max: f64, min: f64) -> Self { + Self { + name: name.to_string(), + mean, + std_dev, + max, + min, + status: TestStatus::Passed, + } + } + + fn failed(name: &str) -> Self { + Self { + name: name.to_string(), + mean: 0.0, + std_dev: 0.0, + max: 0.0, + min: 0.0, + status: TestStatus::Failed, + } + } } #[derive(Deserialize, Serialize)] @@ -179,6 +214,7 @@ pub struct PerformanceTestControl { net_control: Option<(bool, bool)>, // First bool is for RX(true)/TX(false), second bool is for bandwidth or PPS block_control: Option, num_boot_vcpus: Option, + num_ops: Option, // Workload size for micro benchmarks } impl fmt::Display for PerformanceTestControl { @@ -203,6 +239,9 @@ impl fmt::Display for PerformanceTestControl { o.fio_ops, o.bandwidth, o.test_file ); } + if let Some(o) = self.num_ops { + output = format!("{output}, num_ops = {o}"); + } write!(f, "{output}") } @@ -219,6 +258,7 @@ impl PerformanceTestControl { net_control: None, block_control: None, num_boot_vcpus: Some(1), + num_ops: None, } } } @@ -235,6 +275,13 @@ struct PerformanceTest { impl PerformanceTest { pub fn run(&self, overrides: &PerformanceTestOverrides) -> PerformanceTestResult { 
+ if self.control.num_ops.is_some() && !self.name.starts_with("micro_") { + eprintln!( + "Warning: num_ops is set on '{}' but has no effect on non micro benchmarks", + self.name + ); + } + // Run warmup iterations if configured (results discarded) for _ in 0..self.control.warmup_iterations { if let Some(test_timeout) = overrides.test_timeout { @@ -266,13 +313,7 @@ impl PerformanceTest { let max = (self.unit_adjuster)(metrics.clone().into_iter().reduce(f64::max).unwrap()); let min = (self.unit_adjuster)(metrics.clone().into_iter().reduce(f64::min).unwrap()); - PerformanceTestResult { - name: self.name.to_string(), - mean, - std_dev, - max, - min, - } + PerformanceTestResult::passed(self.name, mean, std_dev, max, min) } // Calculate the timeout for each test @@ -323,6 +364,10 @@ mod adjuster { v * 1000.0 } + pub fn s_to_us(v: f64) -> f64 { + v * 1_000_000.0 + } + pub fn bps_to_gbps(v: f64) -> f64 { v / (1_000_000_000_f64) } @@ -333,7 +378,7 @@ mod adjuster { } } -const TEST_LIST: [PerformanceTest; 60] = [ +const TEST_LIST: [PerformanceTest; 100] = [ PerformanceTest { name: "boot_time_ms", func_ptr: performance_boot_time, @@ -1184,6 +1229,486 @@ const TEST_LIST: [PerformanceTest; 60] = [ }, unit_adjuster: adjuster::Bps_to_MiBps, }, + PerformanceTest { + name: "micro_block_raw_aio_drain_128_us", + func_ptr: micro_bench_block::micro_bench_aio_drain, + control: PerformanceTestControl { + test_timeout: 5, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(128), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_raw_aio_drain_256_us", + func_ptr: micro_bench_block::micro_bench_aio_drain, + control: PerformanceTestControl { + test_timeout: 5, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(256), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_read_128_us", + func_ptr: 
micro_bench_block::micro_bench_qcow_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(128), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_read_256_us", + func_ptr: micro_bench_block::micro_bench_qcow_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(256), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_random_read_128_us", + func_ptr: micro_bench_block::micro_bench_qcow_random_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(128), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_random_read_256_us", + func_ptr: micro_bench_block::micro_bench_qcow_random_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(256), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_write_128_us", + func_ptr: micro_bench_block::micro_bench_qcow_write, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(128), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_write_256_us", + func_ptr: micro_bench_block::micro_bench_qcow_write, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(256), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_punch_hole_128_us", + func_ptr: 
micro_bench_block::micro_bench_qcow_punch_hole, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(128), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_punch_hole_256_us", + func_ptr: micro_bench_block::micro_bench_qcow_punch_hole, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(256), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_fsync_128_us", + func_ptr: micro_bench_block::micro_bench_qcow_fsync, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(128), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_fsync_256_us", + func_ptr: micro_bench_block::micro_bench_qcow_fsync, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(256), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_backing_read_128_us", + func_ptr: micro_bench_block::micro_bench_qcow_backing_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(128), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_backing_read_256_us", + func_ptr: micro_bench_block::micro_bench_qcow_backing_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(256), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_cow_write_128_us", + func_ptr: 
micro_bench_block::micro_bench_qcow_cow_write, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(128), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_cow_write_256_us", + func_ptr: micro_bench_block::micro_bench_qcow_cow_write, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(256), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_compressed_read_128_us", + func_ptr: micro_bench_block::micro_bench_qcow_compressed_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(128), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_compressed_read_256_us", + func_ptr: micro_bench_block::micro_bench_qcow_compressed_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(256), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_multi_cluster_read_128_us", + func_ptr: micro_bench_block::micro_bench_qcow_multi_cluster_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(128), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_multi_cluster_read_256_us", + func_ptr: micro_bench_block::micro_bench_qcow_multi_cluster_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(256), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest 
{ + name: "micro_block_qcow_l2_cache_miss_128_us", + func_ptr: micro_bench_block::micro_bench_qcow_l2_cache_miss, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(128), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_l2_cache_miss_256_us", + func_ptr: micro_bench_block::micro_bench_qcow_l2_cache_miss, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(256), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_async_read_128_us", + func_ptr: micro_bench_block::micro_bench_qcow_async_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(128), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_async_read_256_us", + func_ptr: micro_bench_block::micro_bench_qcow_async_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(256), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_batch_read_128_us", + func_ptr: micro_bench_block::micro_bench_qcow_batch_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(128), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_batch_read_256_us", + func_ptr: micro_bench_block::micro_bench_qcow_batch_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(256), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, 
+ }, + PerformanceTest { + name: "micro_block_qcow_async_random_read_128_us", + func_ptr: micro_bench_block::micro_bench_qcow_async_random_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(128), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_async_random_read_256_us", + func_ptr: micro_bench_block::micro_bench_qcow_async_random_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(256), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_async_multi_cluster_read_128_us", + func_ptr: micro_bench_block::micro_bench_qcow_async_multi_cluster_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(128), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_async_multi_cluster_read_256_us", + func_ptr: micro_bench_block::micro_bench_qcow_async_multi_cluster_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(256), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_async_backing_read_128_us", + func_ptr: micro_bench_block::micro_bench_qcow_async_backing_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(128), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_async_backing_read_256_us", + func_ptr: micro_bench_block::micro_bench_qcow_async_backing_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 
20, + warmup_iterations: 5, + num_ops: Some(256), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_async_compressed_read_128_us", + func_ptr: micro_bench_block::micro_bench_qcow_async_compressed_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(128), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_async_compressed_read_256_us", + func_ptr: micro_bench_block::micro_bench_qcow_async_compressed_read, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(256), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_async_write_128_us", + func_ptr: micro_bench_block::micro_bench_qcow_async_write, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(128), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_async_write_256_us", + func_ptr: micro_bench_block::micro_bench_qcow_async_write, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(256), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_async_l2_cache_miss_128_us", + func_ptr: micro_bench_block::micro_bench_qcow_async_l2_cache_miss, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(128), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_async_l2_cache_miss_256_us", + func_ptr: 
micro_bench_block::micro_bench_qcow_async_l2_cache_miss, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(256), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_batch_write_128_us", + func_ptr: micro_bench_block::micro_bench_qcow_batch_write, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(128), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, + PerformanceTest { + name: "micro_block_qcow_batch_write_256_us", + func_ptr: micro_bench_block::micro_bench_qcow_batch_write, + control: PerformanceTestControl { + test_timeout: 10, + test_iterations: 20, + warmup_iterations: 5, + num_ops: Some(256), + ..PerformanceTestControl::default() + }, + unit_adjuster: adjuster::s_to_us, + }, ]; fn run_test_with_timeout( @@ -1214,7 +1739,6 @@ fn run_test_with_timeout( let _ = sender.send(output); }); - // Todo: Need to cleanup/kill all hanging child processes let test_timeout = test.calc_timeout(&test_iterations, &test_timeout); receiver .recv_timeout(Duration::from_secs(test_timeout)) @@ -1223,10 +1747,27 @@ fn run_test_with_timeout( "[Error] Test '{}' time-out after {} seconds", test.name, test_timeout ); + cleanup_stale_processes(); Error::TestTimeout })? 
} +fn cleanup_stale_processes() { + // "cloud-hyperviso" - process name truncated to 15 chars by the kernel + for proc in &["cloud-hyperviso", "iperf3", "ethr"] { + let _ = Command::new("pkill").args(["-9", proc]).status(); + } + thread::sleep(Duration::from_secs(2)); +} + +fn settle_host() { + let _ = Command::new("sync").status(); + let _ = Command::new("bash") + .args(["-c", "echo 3 > /proc/sys/vm/drop_caches"]) + .status(); + thread::sleep(Duration::from_secs(1)); +} + fn date() -> String { let output = test_infra::exec_host_command_output("date"); String::from_utf8_lossy(&output.stdout).trim().to_string() @@ -1244,6 +1785,13 @@ fn main() { .num_args(1) .required(false), ) + .arg( + Arg::new("test-exclude") + .long("test-exclude") + .help("Exclude metrics tests matching the provided keywords") + .num_args(1) + .required(false), + ) .arg( Arg::new("list-tests") .long("list-tests") @@ -1252,6 +1800,14 @@ fn main() { .action(ArgAction::SetTrue) .required(false), ) + .arg( + Arg::new("continue-on-failure") + .long("continue-on-failure") + .help("Continue running remaining tests after a test failure") + .num_args(0) + .action(ArgAction::SetTrue) + .required(false), + ) .arg( Arg::new("report-file") .long("report-file") @@ -1289,19 +1845,31 @@ fn main() { .filter(|t| !(cfg!(target_arch = "aarch64") && t.name == "virtio_net_latency_us")) .collect(); + let test_filter = match cmd_arguments.get_many::("test-filter") { + Some(s) => s.collect(), + None => Vec::new(), + }; + + let test_exclude = match cmd_arguments.get_many::("test-exclude") { + Some(s) => s.collect(), + None => Vec::new(), + }; + + // Determine which tests will actually run. 
+ let tests_to_run: Vec<&&PerformanceTest> = test_list + .iter() + .filter(|t| test_filter.is_empty() || test_filter.iter().any(|&s| t.name.contains(s))) + .filter(|t| !test_exclude.iter().any(|&s| t.name.contains(s))) + .collect(); + if cmd_arguments.get_flag("list-tests") { - for test in test_list.iter() { + for test in tests_to_run.iter() { println!("\"{}\" ({})", test.name, test.control); } return; } - let test_filter = match cmd_arguments.get_many::("test-filter") { - Some(s) => s.collect(), - None => Vec::new(), - }; - // Run performance tests sequentially and report results (in both readable/json format) let mut metrics_report: MetricsReport = Default::default(); @@ -1323,15 +1891,31 @@ fn main() { .unwrap_or_default(), }); - init_tests(&overrides); + // Skip heavy VM level init/cleanup when only micro benchmarks are selected. + let needs_vm_tests = tests_to_run.iter().any(|t| !t.name.starts_with("micro_")); - for test in test_list.iter() { - if test_filter.is_empty() || test_filter.iter().any(|&s| test.name.contains(s)) { - match run_test_with_timeout(test, &overrides) { - Ok(r) => { - metrics_report.results.push(r); - } - Err(e) => { + if needs_vm_tests { + init_tests(&overrides); + } + + let continue_on_failure = cmd_arguments.get_flag("continue-on-failure"); + let mut has_failure = false; + + for test in tests_to_run { + settle_host(); + match run_test_with_timeout(test, &overrides) { + Ok(r) => { + metrics_report.results.push(r); + } + Err(e) => { + if continue_on_failure { + eprintln!("Test '{}' failed: '{e:?}'. 
Continuing.", test.name); + has_failure = true; + metrics_report + .results + .push(PerformanceTestResult::failed(test.name)); + cleanup_stale_processes(); + } else { eprintln!("Aborting test due to error: '{e:?}'"); std::process::exit(1); } @@ -1339,7 +1923,9 @@ fn main() { } } - cleanup_tests(); + if needs_vm_tests { + cleanup_tests(); + } let mut report_file: Box = if let Some(file) = cmd_arguments.get_one::("report-file") { @@ -1366,4 +1952,8 @@ fn main() { std::process::exit(1); }) .unwrap(); + + if has_failure { + std::process::exit(1); + } } diff --git a/performance-metrics/src/micro_bench_block.rs b/performance-metrics/src/micro_bench_block.rs new file mode 100644 index 0000000000..aab20308c1 --- /dev/null +++ b/performance-metrics/src/micro_bench_block.rs @@ -0,0 +1,593 @@ +// Copyright 2026 The Cloud Hypervisor Authors. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +//! In process micro benchmarks for block layer internals. +//! +//! These run without booting a VM and measure hot path operations +//! (e.g. AIO completion draining) at the syscall level. + +use std::os::unix::io::AsRawFd; +use std::time::Instant; + +use block::async_io::AsyncIo; +use block::disk_file::AsyncDiskFile; +use block::raw_async_aio::RawFileAsyncAio; +use block::{BatchRequest, RequestType}; + +use crate::PerformanceTestControl; +use crate::util::{ + self, BLOCK_SIZE, L2_ENTRIES_PER_TABLE, QCOW_CLUSTER_SIZE, deterministic_permutation, + drain_async_completions, drain_completions, read_iovec, submit_reads, submit_writes, + write_iovec, +}; + +/// Submit num_ops AIO writes, wait for them all to land, then time +/// how long it takes to drain every completion via next_completed_request(). +/// +/// Returns the drain wall clock time in seconds. 
+pub fn micro_bench_aio_drain(control: &PerformanceTestControl) -> f64 { + let num_ops = control.num_ops.expect("num_ops required") as usize; + let tmp = util::sized_tempfile(num_ops); + let fd = tmp.as_file().as_raw_fd(); + let mut aio = RawFileAsyncAio::new(fd, num_ops as u32).expect("failed to create AIO context"); + + let buf = vec![0xA5u8; BLOCK_SIZE as usize]; + + // Submit all writes. + for i in 0..num_ops { + let iovec = libc::iovec { + iov_base: buf.as_ptr() as *mut _, + iov_len: buf.len(), + }; + aio.write_vectored((i as u64 * BLOCK_SIZE) as libc::off_t, &[iovec], i as u64) + .expect("write_vectored failed"); + } + + // Wait until the eventfd signals that completions are available. + util::wait_for_eventfd(aio.notifier()); + + // Drain all completions and measure. + let start = Instant::now(); + let mut drained = 0usize; + while drained < num_ops { + if aio.next_completed_request().is_some() { + drained += 1; + } + } + start.elapsed().as_secs_f64() +} + +/// Read num_ops clusters from a prepopulated qcow2 image through the +/// QcowSync async_io path and time the total read_vectored wall clock. +/// +/// This exercises the hot read path: L2 lookup via map_clusters_for_read, +/// pread64 for allocated data, and iovec scatter. +/// +/// Returns the total read wall clock time in seconds. +pub fn micro_bench_qcow_read(control: &PerformanceTestControl) -> f64 { + let num_ops = control.num_ops.expect("num_ops required") as usize; + let (_tmp, disk) = util::qcow_tempfile(num_ops); + let mut async_io = disk.new_async_io(1).expect("new_async_io failed"); + + let mut buf = vec![0u8; QCOW_CLUSTER_SIZE as usize]; + let iovec = read_iovec(&mut buf); + + let start = Instant::now(); + submit_reads(async_io.as_mut(), num_ops, QCOW_CLUSTER_SIZE, &[iovec]); + let elapsed = start.elapsed().as_secs_f64(); + + // Drain completions so Drop is clean. 
+ drain_completions(async_io.as_mut(), num_ops); + + elapsed +} + +/// Read num_ops clusters from a prepopulated qcow2 image in random order. +/// +/// Unlike micro_bench_qcow_read which reads sequentially, this shuffles +/// the cluster indices to exercise L2 cache miss and eviction behaviour +/// under random access patterns. +/// +/// Returns the total read wall clock time in seconds. +pub fn micro_bench_qcow_random_read(control: &PerformanceTestControl) -> f64 { + let num_ops = control.num_ops.expect("num_ops required") as usize; + let (_tmp, disk) = util::qcow_tempfile(num_ops); + let mut async_io = disk.new_async_io(1).expect("new_async_io failed"); + + let indices = deterministic_permutation(num_ops); + + let mut buf = vec![0u8; QCOW_CLUSTER_SIZE as usize]; + let iovec = read_iovec(&mut buf); + + let start = Instant::now(); + for (seq, &cluster_idx) in indices.iter().enumerate() { + async_io + .read_vectored( + (cluster_idx as u64 * QCOW_CLUSTER_SIZE) as libc::off_t, + &[iovec], + seq as u64, + ) + .expect("read_vectored failed"); + } + let elapsed = start.elapsed().as_secs_f64(); + + drain_completions(async_io.as_mut(), num_ops); + + elapsed +} + +/// Write num_ops clusters into an empty qcow2 image through the +/// QcowSync async_io path and time the total write_vectored wall clock. +/// +/// This exercises the write allocation path: map_cluster_for_write +/// allocates a new cluster and bumps refcounts, then pwrite_all writes +/// the data. +/// +/// Returns the total write wall clock time in seconds. 
+pub fn micro_bench_qcow_write(control: &PerformanceTestControl) -> f64 { + let num_ops = control.num_ops.expect("num_ops required") as usize; + let (_tmp, disk) = util::empty_qcow_tempfile(num_ops); + let mut async_io = disk.new_async_io(1).expect("new_async_io failed"); + + let buf = vec![0xA5u8; QCOW_CLUSTER_SIZE as usize]; + let iovec = write_iovec(&buf); + + let start = Instant::now(); + submit_writes(async_io.as_mut(), num_ops, QCOW_CLUSTER_SIZE, &[iovec]); + let elapsed = start.elapsed().as_secs_f64(); + + // Drain completions so Drop is clean. + drain_completions(async_io.as_mut(), num_ops); + + elapsed +} + +/// Punch holes for num_ops clusters in a prepopulated qcow2 image through +/// the QcowSync async_io path and time the total punch_hole wall clock. +/// +/// This exercises the discard path: deallocate_bytes decrements refcounts, +/// frees clusters and issues fallocate punch_hole on the host file. +/// +/// Returns the total punch_hole wall clock time in seconds. +pub fn micro_bench_qcow_punch_hole(control: &PerformanceTestControl) -> f64 { + let num_ops = control.num_ops.expect("num_ops required") as usize; + let (_tmp, disk) = util::qcow_tempfile(num_ops); + let mut async_io = disk.new_async_io(1).expect("new_async_io failed"); + + let start = Instant::now(); + for i in 0..num_ops { + async_io + .punch_hole(i as u64 * QCOW_CLUSTER_SIZE, QCOW_CLUSTER_SIZE, i as u64) + .expect("punch_hole failed"); + } + let elapsed = start.elapsed().as_secs_f64(); + + // Drain completions so Drop is clean. + drain_completions(async_io.as_mut(), num_ops); + + elapsed +} + +/// Write num_ops clusters into an empty qcow2 image to dirty L2 and +/// refcount metadata, then time a single fsync that flushes all dirty +/// tables to disk. +/// +/// This isolates the metadata flush cost which scales with the number +/// of dirty L2 table entries and refcount blocks. +/// +/// Returns the fsync wall clock time in seconds. 
+pub fn micro_bench_qcow_fsync(control: &PerformanceTestControl) -> f64 { + let num_ops = control.num_ops.expect("num_ops required") as usize; + let (_tmp, disk) = util::empty_qcow_tempfile(num_ops); + let mut async_io = disk.new_async_io(1).expect("new_async_io failed"); + + // Write num_ops clusters to dirty L2 and refcount metadata. + let buf = vec![0xA5u8; QCOW_CLUSTER_SIZE as usize]; + let iovec = write_iovec(&buf); + submit_writes(async_io.as_mut(), num_ops, QCOW_CLUSTER_SIZE, &[iovec]); + // Drain write completions. + drain_completions(async_io.as_mut(), num_ops); + + // Time the flush. + let start = Instant::now(); + async_io.fsync(Some(num_ops as u64)).expect("fsync failed"); + let elapsed = start.elapsed().as_secs_f64(); + + drain_completions(async_io.as_mut(), 1); + + elapsed +} + +/// Read num_ops clusters from a QCOW2 overlay whose data lives entirely +/// in a raw backing file. +/// +/// This exercises the backing file read path: L2 lookup finds no +/// allocated cluster and falls through to the backing file for every +/// read. +/// +/// Returns the total read wall clock time in seconds. +pub fn micro_bench_qcow_backing_read(control: &PerformanceTestControl) -> f64 { + let num_ops = control.num_ops.expect("num_ops required") as usize; + let (_backing, _overlay, disk) = util::qcow_overlay_tempfile(num_ops); + let mut async_io = disk.new_async_io(1).expect("new_async_io failed"); + + let mut buf = vec![0u8; QCOW_CLUSTER_SIZE as usize]; + let iovec = read_iovec(&mut buf); + + let start = Instant::now(); + submit_reads(async_io.as_mut(), num_ops, QCOW_CLUSTER_SIZE, &[iovec]); + let elapsed = start.elapsed().as_secs_f64(); + + drain_completions(async_io.as_mut(), num_ops); + + elapsed +} + +/// Write num_ops clusters into a QCOW2 overlay backed by a raw file. +/// +/// Each write triggers copy-on-write: the overlay must allocate a new +/// cluster, update L2 and refcount tables, then write the data. 
This +/// measures the COW allocation overhead compared to writing into an +/// empty image (no backing read needed since we overwrite the full +/// cluster). +/// +/// Returns the total write wall clock time in seconds. +pub fn micro_bench_qcow_cow_write(control: &PerformanceTestControl) -> f64 { + let num_ops = control.num_ops.expect("num_ops required") as usize; + let (_backing, _overlay, disk) = util::qcow_overlay_tempfile(num_ops); + let mut async_io = disk.new_async_io(1).expect("new_async_io failed"); + + let buf = vec![0xBBu8; QCOW_CLUSTER_SIZE as usize]; + let iovec = write_iovec(&buf); + + let start = Instant::now(); + submit_writes(async_io.as_mut(), num_ops, QCOW_CLUSTER_SIZE, &[iovec]); + let elapsed = start.elapsed().as_secs_f64(); + + drain_completions(async_io.as_mut(), num_ops); + + elapsed +} + +/// Read num_ops clusters from a zlib compressed QCOW2 image. +/// +/// Every cluster is stored compressed, so each read triggers +/// decompression. This isolates the decompression overhead from +/// the normal allocated-cluster read path. +/// +/// Returns the total read wall clock time in seconds. +pub fn micro_bench_qcow_compressed_read(control: &PerformanceTestControl) -> f64 { + let num_ops = control.num_ops.expect("num_ops required") as usize; + let (_tmp, disk) = util::compressed_qcow_tempfile(num_ops); + let mut async_io = disk.new_async_io(1).expect("new_async_io failed"); + + let mut buf = vec![0u8; QCOW_CLUSTER_SIZE as usize]; + let iovec = read_iovec(&mut buf); + + let start = Instant::now(); + submit_reads(async_io.as_mut(), num_ops, QCOW_CLUSTER_SIZE, &[iovec]); + let elapsed = start.elapsed().as_secs_f64(); + + drain_completions(async_io.as_mut(), num_ops); + + elapsed +} + +/// Issue large multicluster reads from a prepopulated QCOW2 image. +/// +/// Each read_vectored call spans `CLUSTERS_PER_READ` contiguous clusters +/// (8 x 64 KiB = 512 KiB). 
This exercises the mapping coalesce path +/// where multiple L2 entries are merged into fewer host I/O operations. +/// `num_ops` is the total number of clusters; reads are issued in +/// chunks of CLUSTERS_PER_READ. +/// +/// Returns the total read wall clock time in seconds. +pub fn micro_bench_qcow_multi_cluster_read(control: &PerformanceTestControl) -> f64 { + const CLUSTERS_PER_READ: usize = 8; + + let num_ops = control.num_ops.expect("num_ops required") as usize; + let (_tmp, disk) = util::qcow_tempfile(num_ops); + let mut async_io = disk.new_async_io(1).expect("new_async_io failed"); + + let read_size = CLUSTERS_PER_READ * QCOW_CLUSTER_SIZE as usize; + let mut buf = vec![0u8; read_size]; + let iovec = read_iovec(&mut buf); + + let num_reads = num_ops / CLUSTERS_PER_READ; + let start = Instant::now(); + submit_reads(async_io.as_mut(), num_reads, read_size as u64, &[iovec]); + let elapsed = start.elapsed().as_secs_f64(); + + drain_completions(async_io.as_mut(), num_reads); + + elapsed +} + +/// Read one cluster from each of num_ops distinct L2 tables in a +/// sparsely allocated QCOW2 image. +/// +/// The clusters are spaced L2_ENTRIES_PER_TABLE apart so every read +/// touches a different L2 table. With num_ops exceeding the L2 cache +/// capacity (100 entries), this forces eviction on nearly every read +/// and measures the cold L2 cache miss overhead. +/// +/// Returns the total read wall clock time in seconds. 
+pub fn micro_bench_qcow_l2_cache_miss(control: &PerformanceTestControl) -> f64 { + let num_ops = control.num_ops.expect("num_ops required") as usize; + let (_tmp, disk) = util::sparse_qcow_tempfile(num_ops); + let mut async_io = disk.new_async_io(1).expect("new_async_io failed"); + + let mut buf = vec![0u8; QCOW_CLUSTER_SIZE as usize]; + let iovec = read_iovec(&mut buf); + + let stride = L2_ENTRIES_PER_TABLE as u64 * QCOW_CLUSTER_SIZE; + let start = Instant::now(); + submit_reads(async_io.as_mut(), num_ops, stride, &[iovec]); + let elapsed = start.elapsed().as_secs_f64(); + + drain_completions(async_io.as_mut(), num_ops); + + elapsed +} + +/// Read num_ops clusters from a prepopulated qcow2 image through the +/// QcowAsync io_uring path and time the total wall clock. +/// +/// Unlike micro_bench_qcow_read which uses QcowDiskSync (blocking), +/// this uses QcowDiskAsync where single-allocated-cluster reads go +/// through io_uring for true asynchronous completion. +/// +/// Returns the total read wall clock time in seconds. +pub fn micro_bench_qcow_async_read(control: &PerformanceTestControl) -> f64 { + let num_ops = control.num_ops.expect("num_ops required") as usize; + let (_tmp, disk) = util::qcow_async_tempfile(num_ops); + let mut async_io = disk + .new_async_io(num_ops as u32) + .expect("new_async_io failed"); + + let mut buf = vec![0u8; QCOW_CLUSTER_SIZE as usize]; + let iovec = read_iovec(&mut buf); + + let start = Instant::now(); + submit_reads(async_io.as_mut(), num_ops, QCOW_CLUSTER_SIZE, &[iovec]); + + // Drain all io_uring completions before stopping the clock. + drain_async_completions(async_io.as_mut(), num_ops); + start.elapsed().as_secs_f64() +} + +/// Measure QCOW2 batch read submission via io_uring. +/// +/// Builds a batch of `num_ops` read requests and submits them all at once +/// through `submit_batch_requests`, which packs multiple SQEs into a single +/// io_uring submission. Returns the total wall clock time in seconds. 
+pub fn micro_bench_qcow_batch_read(control: &PerformanceTestControl) -> f64 { + let num_ops = control.num_ops.expect("num_ops required") as usize; + let (_tmp, disk) = util::qcow_async_tempfile(num_ops); + let mut async_io = disk + .new_async_io(num_ops as u32) + .expect("new_async_io failed"); + + let mut buf = vec![0u8; num_ops * QCOW_CLUSTER_SIZE as usize]; + + let batch: Vec = (0..num_ops) + .map(|i| { + let slice = + &mut buf[i * QCOW_CLUSTER_SIZE as usize..(i + 1) * QCOW_CLUSTER_SIZE as usize]; + BatchRequest { + offset: (i as u64 * QCOW_CLUSTER_SIZE) as libc::off_t, + iovecs: vec![libc::iovec { + iov_base: slice.as_mut_ptr() as *mut libc::c_void, + iov_len: QCOW_CLUSTER_SIZE as usize, + }] + .into(), + user_data: i as u64, + request_type: RequestType::In, + } + }) + .collect(); + + let start = Instant::now(); + async_io + .submit_batch_requests(&batch) + .expect("submit_batch_requests failed"); + + // Drain all io_uring completions before stopping the clock. + drain_async_completions(async_io.as_mut(), num_ops); + start.elapsed().as_secs_f64() +} + +/// Read num_ops clusters from a prepopulated QCOW2 image in random order +/// through the QcowAsync io_uring path. +/// +/// Returns the total read wall clock time in seconds. 
+pub fn micro_bench_qcow_async_random_read(control: &PerformanceTestControl) -> f64 { + let num_ops = control.num_ops.expect("num_ops required") as usize; + let (_tmp, disk) = util::qcow_async_tempfile(num_ops); + let mut async_io = disk + .new_async_io(num_ops as u32) + .expect("new_async_io failed"); + + let indices = deterministic_permutation(num_ops); + + let mut buf = vec![0u8; QCOW_CLUSTER_SIZE as usize]; + let iovec = read_iovec(&mut buf); + + let start = Instant::now(); + for (seq, &cluster_idx) in indices.iter().enumerate() { + async_io + .read_vectored( + (cluster_idx as u64 * QCOW_CLUSTER_SIZE) as libc::off_t, + &[iovec], + seq as u64, + ) + .expect("read_vectored failed"); + } + + drain_async_completions(async_io.as_mut(), num_ops); + start.elapsed().as_secs_f64() +} + +/// Issue large multi-cluster reads from a prepopulated QCOW2 image +/// through the QcowAsync io_uring path. +/// +/// Each read spans 8 contiguous clusters (512 KiB). With coalesced +/// mappings, this can hit the io_uring fast path for a single Readv. +/// +/// Returns the total read wall clock time in seconds. +pub fn micro_bench_qcow_async_multi_cluster_read(control: &PerformanceTestControl) -> f64 { + const CLUSTERS_PER_READ: usize = 8; + + let num_ops = control.num_ops.expect("num_ops required") as usize; + let (_tmp, disk) = util::qcow_async_tempfile(num_ops); + let mut async_io = disk + .new_async_io(num_ops as u32) + .expect("new_async_io failed"); + + let read_size = CLUSTERS_PER_READ * QCOW_CLUSTER_SIZE as usize; + let mut buf = vec![0u8; read_size]; + let iovec = read_iovec(&mut buf); + + let num_reads = num_ops / CLUSTERS_PER_READ; + let start = Instant::now(); + submit_reads(async_io.as_mut(), num_reads, read_size as u64, &[iovec]); + + drain_async_completions(async_io.as_mut(), num_reads); + start.elapsed().as_secs_f64() +} + +/// Read num_ops clusters from a QCOW2 overlay backed by a raw file +/// through the QcowAsync io_uring path. 
+/// +/// All reads fall through to the backing file (sync fallback in +/// QcowAsync since the mapping is not a single allocated cluster). +/// +/// Returns the total read wall clock time in seconds. +pub fn micro_bench_qcow_async_backing_read(control: &PerformanceTestControl) -> f64 { + let num_ops = control.num_ops.expect("num_ops required") as usize; + let (_backing, _overlay, disk) = util::qcow_async_overlay_tempfile(num_ops); + let mut async_io = disk + .new_async_io(num_ops as u32) + .expect("new_async_io failed"); + + let mut buf = vec![0u8; QCOW_CLUSTER_SIZE as usize]; + let iovec = read_iovec(&mut buf); + + let start = Instant::now(); + submit_reads(async_io.as_mut(), num_ops, QCOW_CLUSTER_SIZE, &[iovec]); + + drain_async_completions(async_io.as_mut(), num_ops); + start.elapsed().as_secs_f64() +} + +/// Compressed clusters take the sync fallback in QcowAsync since they +/// require decompression. This measures decompression overhead through +/// the async code path. +/// +/// Returns the total read wall clock time in seconds. +pub fn micro_bench_qcow_async_compressed_read(control: &PerformanceTestControl) -> f64 { + let num_ops = control.num_ops.expect("num_ops required") as usize; + let (_tmp, disk) = util::compressed_qcow_async_tempfile(num_ops); + let mut async_io = disk + .new_async_io(num_ops as u32) + .expect("new_async_io failed"); + + let mut buf = vec![0u8; QCOW_CLUSTER_SIZE as usize]; + let iovec = read_iovec(&mut buf); + + let start = Instant::now(); + submit_reads(async_io.as_mut(), num_ops, QCOW_CLUSTER_SIZE, &[iovec]); + + drain_async_completions(async_io.as_mut(), num_ops); + start.elapsed().as_secs_f64() +} + +/// Write num_ops clusters into an empty QCOW2 image through the +/// QcowAsync io_uring path. +/// +/// Writes in QcowAsync are synchronous (COW metadata allocation must +/// complete before the host offset is known), so this measures the +/// write path overhead through the async code path. 
+/// +/// Returns the total write wall clock time in seconds. +pub fn micro_bench_qcow_async_write(control: &PerformanceTestControl) -> f64 { + let num_ops = control.num_ops.expect("num_ops required") as usize; + let (_tmp, disk) = util::empty_qcow_async_tempfile(num_ops); + let mut async_io = disk + .new_async_io(num_ops as u32) + .expect("new_async_io failed"); + + let buf = vec![0xA5u8; QCOW_CLUSTER_SIZE as usize]; + let iovec = write_iovec(&buf); + + let start = Instant::now(); + submit_writes(async_io.as_mut(), num_ops, QCOW_CLUSTER_SIZE, &[iovec]); + + drain_async_completions(async_io.as_mut(), num_ops); + start.elapsed().as_secs_f64() +} + +/// Read one cluster from each of num_ops distinct L2 tables in a sparse +/// QCOW2 image through the QcowAsync io_uring path. +/// +/// Returns the total read wall clock time in seconds. +pub fn micro_bench_qcow_async_l2_cache_miss(control: &PerformanceTestControl) -> f64 { + let num_ops = control.num_ops.expect("num_ops required") as usize; + let (_tmp, disk) = util::sparse_qcow_async_tempfile(num_ops); + let mut async_io = disk + .new_async_io(num_ops as u32) + .expect("new_async_io failed"); + + let mut buf = vec![0u8; QCOW_CLUSTER_SIZE as usize]; + let iovec = read_iovec(&mut buf); + + let stride = L2_ENTRIES_PER_TABLE as u64 * QCOW_CLUSTER_SIZE; + let start = Instant::now(); + submit_reads(async_io.as_mut(), num_ops, stride, &[iovec]); + + drain_async_completions(async_io.as_mut(), num_ops); + start.elapsed().as_secs_f64() +} + +/// Measure QCOW2 batch write submission via io_uring. +/// +/// Builds a batch of num_ops write requests and submits them all at once +/// through submit_batch_requests. Writes in QcowAsync are synchronous +/// (COW path), so this measures whether batching reduces per-request +/// overhead compared to individual write_vectored calls. +/// +/// Returns the total wall clock time in seconds. 
+pub fn micro_bench_qcow_batch_write(control: &PerformanceTestControl) -> f64 {
+    let num_ops = control.num_ops.expect("num_ops required") as usize;
+    let (_tmp, disk) = util::empty_qcow_async_tempfile(num_ops);
+    let mut async_io = disk
+        .new_async_io(num_ops as u32)
+        .expect("new_async_io failed");
+
+    let buf = vec![0xA5u8; num_ops * QCOW_CLUSTER_SIZE as usize];
+
+    let batch: Vec<BatchRequest> = (0..num_ops)
+        .map(|i| {
+            let slice = &buf[i * QCOW_CLUSTER_SIZE as usize..(i + 1) * QCOW_CLUSTER_SIZE as usize];
+            BatchRequest {
+                offset: (i as u64 * QCOW_CLUSTER_SIZE) as libc::off_t,
+                iovecs: vec![libc::iovec {
+                    iov_base: slice.as_ptr() as *mut libc::c_void,
+                    iov_len: QCOW_CLUSTER_SIZE as usize,
+                }]
+                .into(),
+                user_data: i as u64,
+                request_type: RequestType::Out,
+            }
+        })
+        .collect();
+
+    let start = Instant::now();
+    async_io
+        .submit_batch_requests(&batch)
+        .expect("submit_batch_requests failed");
+
+    drain_async_completions(async_io.as_mut(), num_ops);
+    start.elapsed().as_secs_f64()
+}
diff --git a/performance-metrics/src/performance_tests.rs b/performance-metrics/src/performance_tests.rs
index b959fce5c3..813963481b 100644
--- a/performance-metrics/src/performance_tests.rs
+++ b/performance-metrics/src/performance_tests.rs
@@ -5,7 +5,6 @@
 
 // Performance tests
 
-use std::path::PathBuf;
 use std::time::Duration;
 use std::{fs, thread};
 
@@ -14,8 +13,6 @@
 use thiserror::Error;
 
 use crate::{ImageFormat, PerformanceTestControl, PerformanceTestOverrides, mean};
 
-#[cfg(target_arch = "x86_64")]
-pub const FOCAL_IMAGE_NAME: &str = "focal-server-cloudimg-amd64-custom-20210609-0.raw";
 #[cfg(target_arch = "aarch64")]
 pub const FOCAL_IMAGE_NAME: &str = "focal-server-cloudimg-arm64-custom-20210929-0-update-tool.raw";
@@ -127,45 +124,6 @@ fn performance_test_new_guest(disk_config: Box<dyn DiskConfig>) -> Guest {
     Guest::new_from_ip_range(disk_config, "172.19", 0)
 }
 
-const DIRECT_KERNEL_BOOT_CMDLINE: &str =
-    "root=/dev/vda1 console=hvc0 rw systemd.journald.forward_to_console=1";
-
-// Creates the
path for direct kernel boot and return the path. -// For x86_64, this function returns the vmlinux kernel path. -// For AArch64, this function returns the PE kernel path. -fn direct_kernel_boot_path() -> PathBuf { - let mut workload_path = dirs::home_dir().unwrap(); - workload_path.push("workloads"); - - let mut kernel_path = workload_path; - #[cfg(target_arch = "x86_64")] - kernel_path.push("vmlinux-x86_64"); - #[cfg(target_arch = "aarch64")] - kernel_path.push("Image-arm64"); - - kernel_path -} - -fn remote_command(api_socket: &str, command: &str, arg: Option<&str>) -> bool { - let mut cmd = std::process::Command::new(clh_command("ch-remote")); - cmd.args([&format!("--api-socket={api_socket}"), command]); - - if let Some(arg) = arg { - cmd.arg(arg); - } - let output = cmd.output().unwrap(); - if output.status.success() { - true - } else { - eprintln!( - "Error running ch-remote command: {:?}\nstderr: {}", - &cmd, - String::from_utf8_lossy(&output.stderr) - ); - false - } -} - pub fn performance_net_throughput(control: &PerformanceTestControl) -> f64 { let test_timeout = control.test_timeout; let (rx, bandwidth) = control.net_control.unwrap(); @@ -440,8 +398,18 @@ pub fn performance_block_io(control: &PerformanceTestControl) -> f64 { let mut test_disk_arg = format!("path={test_file},queue_size={queue_size},num_queues={num_queues}"); - if test_file == OVERLAY_WITH_QCOW2_BACKING || test_file == OVERLAY_WITH_RAW_BACKING { - test_disk_arg.push_str(",backing_files=on"); + if test_file == OVERLAY_WITH_QCOW2_BACKING + || test_file == OVERLAY_WITH_RAW_BACKING + || test_file == QCOW2_UNCOMPRESSED_IMG + || test_file == QCOW2_ZLIB_IMG + || test_file == QCOW2_ZSTD_IMG + { + test_disk_arg.push_str(",image_type=qcow2"); + if test_file == OVERLAY_WITH_QCOW2_BACKING || test_file == OVERLAY_WITH_RAW_BACKING { + test_disk_arg.push_str(",backing_files=on"); + } + } else if test_file == BLK_IO_TEST_IMG { + test_disk_arg.push_str(",image_type=raw"); } let mut child = 
GuestCommand::new(&guest) diff --git a/performance-metrics/src/util.rs b/performance-metrics/src/util.rs new file mode 100644 index 0000000000..3f68fdd7eb --- /dev/null +++ b/performance-metrics/src/util.rs @@ -0,0 +1,330 @@ +// Copyright 2026 The Cloud Hypervisor Authors. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +//! Shared benchmark helpers. + +use std::fs::File; +use std::io::{ErrorKind, Seek, SeekFrom, Write}; +use std::os::unix::fs::FileExt; +use std::process::Command; +use std::thread; +use std::time::Duration; + +use block::async_io::AsyncIo; +use block::qcow::{BackingFileConfig, ImageType, QcowFile, RawFile}; +use block::qcow_async::QcowDiskAsync; +use block::qcow_sync::QcowDiskSync; +use vmm_sys_util::eventfd::EventFd; +use vmm_sys_util::tempfile::TempFile; + +pub const BLOCK_SIZE: u64 = 4096; +pub const QCOW_CLUSTER_SIZE: u64 = 65536; + +/// Create a temporary file pre sized to hold `num_blocks` blocks. +pub fn sized_tempfile(num_blocks: usize) -> TempFile { + let tmp = TempFile::new().expect("failed to create tempfile"); + tmp.as_file() + .set_len(BLOCK_SIZE * num_blocks as u64) + .expect("failed to set file length"); + tmp +} + +/// Create a QCOW2 image with `num_clusters` allocated clusters and return +/// the tempfile handle. +/// +/// Each cluster is default QCOW2 cluster size of 64 KiB. The image is +/// created via `QcowFile::new` then populated with writes so that the +/// clusters are actually allocated in the L2 / refcount tables. 
+fn create_qcow_tempfile(num_clusters: usize) -> TempFile { + let tmp = TempFile::new().expect("failed to create tempfile"); + let virtual_size = QCOW_CLUSTER_SIZE * num_clusters as u64; + let raw = RawFile::new(tmp.as_file().try_clone().unwrap(), false); + let mut qcow = QcowFile::new(raw, 3, virtual_size, true).expect("failed to create QCOW2 file"); + let buf = vec![0xA5u8; QCOW_CLUSTER_SIZE as usize]; + for i in 0..num_clusters { + qcow.seek(SeekFrom::Start(i as u64 * QCOW_CLUSTER_SIZE)) + .expect("seek failed"); + qcow.write_all(&buf).expect("write failed"); + } + qcow.flush().expect("flush failed"); + tmp +} + +/// Create a QCOW2 image with `num_clusters` allocated clusters opened +/// via `QcowDiskSync` (blocking I/O backend). +pub fn qcow_tempfile(num_clusters: usize) -> (TempFile, QcowDiskSync) { + let tmp = create_qcow_tempfile(num_clusters); + let disk = QcowDiskSync::new(tmp.as_file().try_clone().unwrap(), false, false, true) + .expect("failed to open QCOW2 via QcowDiskSync"); + (tmp, disk) +} + +/// Create a QCOW2 image with `num_clusters` allocated clusters opened +/// via `QcowDiskAsync` (io_uring backend). +pub fn qcow_async_tempfile(num_clusters: usize) -> (TempFile, QcowDiskAsync) { + let tmp = create_qcow_tempfile(num_clusters); + let disk = QcowDiskAsync::new(tmp.as_file().try_clone().unwrap(), false, false, true) + .expect("failed to open QCOW2 via QcowDiskAsync"); + (tmp, disk) +} + +/// Drain `count` completions from a synchronous async_io backend. +pub fn drain_completions(async_io: &mut dyn AsyncIo, count: usize) { + for _ in 0..count { + async_io.next_completed_request(); + } +} + +/// Build an iovec suitable for a read into `buf`. +pub fn read_iovec(buf: &mut [u8]) -> libc::iovec { + libc::iovec { + iov_base: buf.as_mut_ptr() as *mut libc::c_void, + iov_len: buf.len(), + } +} + +/// Build an iovec suitable for a write from `buf`. 
+pub fn write_iovec(buf: &[u8]) -> libc::iovec {
+    libc::iovec {
+        iov_base: buf.as_ptr() as *mut libc::c_void,
+        iov_len: buf.len(),
+    }
+}
+
+/// Build a deterministic pseudo-random permutation of `[0, n)`.
+///
+/// Uses a Fisher-Yates shuffle seeded by `DefaultHasher` so the
+/// permutation is identical across runs.
+pub fn deterministic_permutation(n: usize) -> Vec<usize> {
+    use std::collections::hash_map::DefaultHasher;
+    use std::hash::{Hash, Hasher};
+
+    let mut indices: Vec<usize> = (0..n).collect();
+    for i in (1..n).rev() {
+        let mut h = DefaultHasher::new();
+        i.hash(&mut h);
+        let j = h.finish() as usize % (i + 1);
+        indices.swap(i, j);
+    }
+    indices
+}
+
+/// Submit `count` sequential read_vectored calls at `stride`-byte intervals.
+pub fn submit_reads(async_io: &mut dyn AsyncIo, count: usize, stride: u64, iovec: &[libc::iovec]) {
+    for i in 0..count {
+        async_io
+            .read_vectored((i as u64 * stride) as libc::off_t, iovec, i as u64)
+            .expect("read_vectored failed");
+    }
+}
+
+/// Submit `count` sequential write_vectored calls at `stride`-byte intervals.
+pub fn submit_writes(async_io: &mut dyn AsyncIo, count: usize, stride: u64, iovec: &[libc::iovec]) {
+    for i in 0..count {
+        async_io
+            .write_vectored((i as u64 * stride) as libc::off_t, iovec, i as u64)
+            .expect("write_vectored failed");
+    }
+}
+
+/// Drain `count` completions from an asynchronous I/O backend that delivers
+/// results via eventfd notification (e.g. io_uring).
+pub fn drain_async_completions(async_io: &mut dyn AsyncIo, count: usize) {
+    let mut drained = 0usize;
+    while drained < count {
+        wait_for_eventfd(async_io.notifier());
+        while async_io.next_completed_request().is_some() {
+            drained += 1;
+        }
+    }
+}
+
+/// Create an empty QCOW2 image sized for `num_clusters` clusters.
+/// No data clusters are allocated.
+fn create_empty_qcow_tempfile(num_clusters: usize) -> TempFile { + let tmp = TempFile::new().expect("failed to create tempfile"); + let virtual_size = QCOW_CLUSTER_SIZE * num_clusters as u64; + let raw = RawFile::new(tmp.as_file().try_clone().unwrap(), false); + QcowFile::new(raw, 3, virtual_size, true).expect("failed to create qcow2 file"); + tmp +} + +/// Empty QCOW2 opened via QcowDiskSync. +pub fn empty_qcow_tempfile(num_clusters: usize) -> (TempFile, QcowDiskSync) { + let tmp = create_empty_qcow_tempfile(num_clusters); + let disk = QcowDiskSync::new(tmp.as_file().try_clone().unwrap(), false, false, true) + .expect("failed to open qcow2 via QcowDiskSync"); + (tmp, disk) +} + +/// Empty QCOW2 opened via QcowDiskAsync. +pub fn empty_qcow_async_tempfile(num_clusters: usize) -> (TempFile, QcowDiskAsync) { + let tmp = create_empty_qcow_tempfile(num_clusters); + let disk = QcowDiskAsync::new(tmp.as_file().try_clone().unwrap(), false, false, true) + .expect("failed to open qcow2 via QcowDiskAsync"); + (tmp, disk) +} + +/// Create a QCOW2 overlay backed by a raw file with `num_clusters` +/// pre-populated clusters. Returns (backing_tempfile, overlay_tempfile). 
+fn create_overlay_tempfiles(num_clusters: usize) -> (TempFile, TempFile) { + let virtual_size = QCOW_CLUSTER_SIZE * num_clusters as u64; + + let backing = TempFile::new().expect("failed to create backing tempfile"); + { + let f = backing.as_file(); + f.set_len(virtual_size).expect("set_len failed"); + let buf = vec![0xA5u8; QCOW_CLUSTER_SIZE as usize]; + for i in 0..num_clusters { + f.write_at(&buf, i as u64 * QCOW_CLUSTER_SIZE) + .expect("write_at failed"); + } + } + + let overlay = TempFile::new().expect("failed to create overlay tempfile"); + { + let raw = RawFile::new(overlay.as_file().try_clone().unwrap(), false); + let backing_config = BackingFileConfig { + path: backing.as_path().to_str().unwrap().to_string(), + format: Some(ImageType::Raw), + }; + QcowFile::new_from_backing(raw, 3, virtual_size, &backing_config, true) + .expect("failed to create overlay qcow2"); + } + + (backing, overlay) +} + +/// QCOW2 overlay with raw backing opened via QcowDiskSync. +pub fn qcow_overlay_tempfile(num_clusters: usize) -> (TempFile, TempFile, QcowDiskSync) { + let (backing, overlay) = create_overlay_tempfiles(num_clusters); + let disk = QcowDiskSync::new(overlay.as_file().try_clone().unwrap(), false, true, true) + .expect("failed to open overlay qcow2 via QcowDiskSync"); + (backing, overlay, disk) +} + +/// QCOW2 overlay with raw backing opened via QcowDiskAsync. +pub fn qcow_async_overlay_tempfile(num_clusters: usize) -> (TempFile, TempFile, QcowDiskAsync) { + let (backing, overlay) = create_overlay_tempfiles(num_clusters); + let disk = QcowDiskAsync::new(overlay.as_file().try_clone().unwrap(), false, true, true) + .expect("failed to open overlay qcow2 via QcowDiskAsync"); + (backing, overlay, disk) +} + +/// Create a zlib compressed QCOW2 image with `num_clusters` clusters +/// via `qemu-img convert -c`. 
+fn create_compressed_qcow_tempfile(num_clusters: usize) -> TempFile { + let virtual_size = QCOW_CLUSTER_SIZE * num_clusters as u64; + + let raw_tmp = TempFile::new().expect("failed to create raw tempfile"); + { + let f = raw_tmp.as_file(); + f.set_len(virtual_size).expect("set_len failed"); + let buf = vec![0xA5u8; QCOW_CLUSTER_SIZE as usize]; + for i in 0..num_clusters { + f.write_at(&buf, i as u64 * QCOW_CLUSTER_SIZE) + .expect("write_at failed"); + } + } + + let qcow_tmp = TempFile::new().expect("failed to create qcow2 tempfile"); + let qcow_path = qcow_tmp.as_path().to_str().unwrap().to_string(); + let raw_path = raw_tmp.as_path().to_str().unwrap().to_string(); + let status = Command::new("qemu-img") + .args([ + "convert", + "-f", + "raw", + "-O", + "qcow2", + "-c", + "-o", + "compression_type=zlib", + &raw_path, + &qcow_path, + ]) + .status() + .expect("failed to run qemu-img"); + assert!(status.success(), "qemu-img convert failed"); + + qcow_tmp +} + +/// Compressed QCOW2 opened via QcowDiskSync. +pub fn compressed_qcow_tempfile(num_clusters: usize) -> (TempFile, QcowDiskSync) { + let tmp = create_compressed_qcow_tempfile(num_clusters); + let path = tmp.as_path().to_str().unwrap().to_string(); + let disk = QcowDiskSync::new( + File::open(&path).expect("failed to open compressed qcow2"), + false, + false, + true, + ) + .expect("failed to open compressed qcow2 via QcowDiskSync"); + (tmp, disk) +} + +/// Compressed QCOW2 opened via QcowDiskAsync. 
+pub fn compressed_qcow_async_tempfile(num_clusters: usize) -> (TempFile, QcowDiskAsync) { + let tmp = create_compressed_qcow_tempfile(num_clusters); + let path = tmp.as_path().to_str().unwrap().to_string(); + let disk = QcowDiskAsync::new( + File::open(&path).expect("failed to open compressed qcow2"), + false, + false, + true, + ) + .expect("failed to open compressed qcow2 via QcowDiskAsync"); + (tmp, disk) +} + +/// Number of data clusters covered by a single L2 table (64 KiB cluster, +/// 8-byte entries -> 8192 entries per L2 table). +pub const L2_ENTRIES_PER_TABLE: usize = QCOW_CLUSTER_SIZE as usize / 8; + +/// Create a sparse QCOW2 image with one allocated cluster per L2 table, +/// spanning `num_l2_tables` L2 tables. +fn create_sparse_qcow_tempfile(num_l2_tables: usize) -> TempFile { + let virtual_size = QCOW_CLUSTER_SIZE * (num_l2_tables as u64 * L2_ENTRIES_PER_TABLE as u64); + let tmp = TempFile::new().expect("failed to create tempfile"); + let raw = RawFile::new(tmp.as_file().try_clone().unwrap(), false); + let mut qcow = QcowFile::new(raw, 3, virtual_size, true).expect("failed to create qcow2 file"); + let buf = vec![0xA5u8; QCOW_CLUSTER_SIZE as usize]; + for i in 0..num_l2_tables { + let offset = i as u64 * L2_ENTRIES_PER_TABLE as u64 * QCOW_CLUSTER_SIZE; + qcow.seek(SeekFrom::Start(offset)).expect("seek failed"); + qcow.write_all(&buf).expect("write failed"); + } + qcow.flush().expect("flush failed"); + tmp +} + +/// Sparse QCOW2 opened via QcowDiskSync. +pub fn sparse_qcow_tempfile(num_l2_tables: usize) -> (TempFile, QcowDiskSync) { + let tmp = create_sparse_qcow_tempfile(num_l2_tables); + let disk = QcowDiskSync::new(tmp.as_file().try_clone().unwrap(), false, false, true) + .expect("failed to open qcow2 via QcowDiskSync"); + (tmp, disk) +} + +/// Sparse QCOW2 opened via QcowDiskAsync. 
+pub fn sparse_qcow_async_tempfile(num_l2_tables: usize) -> (TempFile, QcowDiskAsync) { + let tmp = create_sparse_qcow_tempfile(num_l2_tables); + let disk = QcowDiskAsync::new(tmp.as_file().try_clone().unwrap(), false, false, true) + .expect("failed to open qcow2 via QcowDiskAsync"); + (tmp, disk) +} + +/// Spin and wait until the given eventfd becomes readable. +pub fn wait_for_eventfd(notifier: &EventFd) { + loop { + match notifier.read() { + Ok(_) => return, + Err(e) if e.kind() == ErrorKind::WouldBlock => { + thread::sleep(Duration::from_micros(50)); + } + Err(e) => panic!("eventfd read failed: {e}"), + } + } +} diff --git a/rate_limiter/Cargo.toml b/rate_limiter/Cargo.toml index 206ec7b7f8..2d32e8a25b 100644 --- a/rate_limiter/Cargo.toml +++ b/rate_limiter/Cargo.toml @@ -1,6 +1,7 @@ [package] edition.workspace = true name = "rate_limiter" +rust-version.workspace = true version = "0.1.0" [dependencies] diff --git a/scripts/dev_cli.sh b/scripts/dev_cli.sh index e537c499c2..3b94513216 100755 --- a/scripts/dev_cli.sh +++ b/scripts/dev_cli.sh @@ -53,6 +53,46 @@ CARGO_TARGET_DIR="${CLH_BUILD_DIR}/cargo_target" # Let tests know that the special environment is set up. RUSTFLAGS="${RUSTFLAGS} --cfg devcli_testenv" +# Container name used for cleanup on signal. The PID makes it unique per +# invocation so parallel runs do not collide. +CLH_CTR_NAME="clh-dev-$$" + +# PID of the docker run process launched by run_container(). +CLH_CTR_PID="" + +# Cleanup handler: kill the running container (if any) and all child +# processes, then exit. +cleanup() { + echo "[$CLI_NAME] Caught signal, terminating..." 
+ # Disable the trap to prevent recursion + trap - INT TERM + # Kill the Docker/Podman container by name + $DOCKER_RUNTIME kill "$CLH_CTR_NAME" 2>/dev/null + $DOCKER_RUNTIME kill "${CLH_CTR_NAME}-fix" 2>/dev/null + # Kill the docker run process tracked by run_container() + [ -n "$CLH_CTR_PID" ] && kill -TERM "$CLH_CTR_PID" 2>/dev/null + # Kill any remaining child processes + pkill -TERM -P $$ 2>/dev/null + wait 2>/dev/null + exit 1 +} + +trap cleanup INT TERM + +# Run a command in the background and wait for it. Bash defers trap +# handling while a foreground process is running, which makes Ctrl+C +# unresponsive during long-running container commands (wget, qemu-img, +# cargo build, etc). By backgrounding the command and using `wait`, +# the trap fires immediately when a signal arrives. +run_container() { + "$@" & + CLH_CTR_PID=$! + wait $CLH_CTR_PID + local rc=$? + CLH_CTR_PID="" + return $rc +} + # Send a decorated message to stdout, followed by a new line # say() { @@ -146,6 +186,7 @@ fix_dir_perms() { # Yes, running Docker to get elevated privileges, just to chown some files # is a dirty hack. $DOCKER_RUNTIME run \ + --name "${CLH_CTR_NAME}-fix" \ --workdir "$CTR_CLH_ROOT_DIR" \ --rm \ --volume /dev:/dev \ @@ -165,7 +206,7 @@ process_volumes_args() { return fi exported_volumes="" - arr_vols=("${arg_vols//#/ }") + IFS='#' read -ra arr_vols <<<"$arg_vols" for var in "${arr_vols[@]}"; do dev=$(echo "$var" | cut -d ':' -f 1) if [[ ! 
-e "$dev" ]]; then @@ -314,7 +355,8 @@ cmd_build() { rustflags="$rustflags -C link-args=-Wl,-Bstatic -C link-args=-lc" fi - $DOCKER_RUNTIME run \ + run_container "$DOCKER_RUNTIME" run \ + --name "$CLH_CTR_NAME" \ --user "$(id -u):$(id -g)" \ --workdir "$CTR_CLH_ROOT_DIR" \ --rm \ @@ -335,7 +377,8 @@ cmd_clean() { ensure_build_dir ensure_latest_ctr - $DOCKER_RUNTIME run \ + run_container "$DOCKER_RUNTIME" run \ + --name "$CLH_CTR_NAME" \ --user "$(id -u):$(id -g)" \ --workdir "$CTR_CLH_ROOT_DIR" \ --rm \ @@ -432,12 +475,14 @@ cmd_tests() { if [[ "$unit" = true ]]; then say "Running unit tests for $target..." - $DOCKER_RUNTIME run \ + run_container "$DOCKER_RUNTIME" run \ + --name "$CLH_CTR_NAME" \ --workdir "$CTR_CLH_ROOT_DIR" \ --rm \ --device $exported_device \ --device /dev/net/tun \ --cap-add net_admin \ + --security-opt seccomp=unconfined \ --volume "$CLH_ROOT_DIR:$CTR_CLH_ROOT_DIR" \ ${exported_volumes:+$exported_volumes} \ --env BUILD_TARGET="$target" \ @@ -450,7 +495,8 @@ cmd_tests() { if [ "$integration" = true ]; then say "Running integration tests for $target..." - $DOCKER_RUNTIME run \ + run_container "$DOCKER_RUNTIME" run \ + --name "$CLH_CTR_NAME" \ --workdir "$CTR_CLH_ROOT_DIR" \ --rm \ --privileged \ @@ -476,7 +522,8 @@ cmd_tests() { mkdir -p "$DEST_IGVM_FILES_PATH" copy_igvm_files "$SRC_IGVM_FILES_PATH" "$DEST_IGVM_FILES_PATH" say "Running CVM integration tests for $target..." - $DOCKER_RUNTIME run \ + run_container "$DOCKER_RUNTIME" run \ + --name "$CLH_CTR_NAME" \ --workdir "$CTR_CLH_ROOT_DIR" \ --rm \ --privileged \ @@ -501,7 +548,8 @@ cmd_tests() { if [ "$integration_vfio" = true ]; then say "Running VFIO integration tests for $target..." - $DOCKER_RUNTIME run \ + run_container "$DOCKER_RUNTIME" run \ + --name "$CLH_CTR_NAME" \ --workdir "$CTR_CLH_ROOT_DIR" \ --rm \ --privileged \ @@ -524,7 +572,8 @@ cmd_tests() { if [ "$integration_windows" = true ]; then say "Running Windows integration tests for $target..." 
- $DOCKER_RUNTIME run \ + run_container "$DOCKER_RUNTIME" run \ + --name "$CLH_CTR_NAME" \ --workdir "$CTR_CLH_ROOT_DIR" \ --rm \ --privileged \ @@ -547,7 +596,8 @@ cmd_tests() { if [ "$integration_live_migration" = true ]; then say "Running 'live migration' integration tests for $target..." - $DOCKER_RUNTIME run \ + run_container "$DOCKER_RUNTIME" run \ + --name "$CLH_CTR_NAME" \ --workdir "$CTR_CLH_ROOT_DIR" \ --rm \ --privileged \ @@ -572,7 +622,8 @@ cmd_tests() { if [ "$integration_rate_limiter" = true ]; then say "Running 'rate limiter' integration tests for $target..." - $DOCKER_RUNTIME run \ + run_container "$DOCKER_RUNTIME" run \ + --name "$CLH_CTR_NAME" \ --workdir "$CTR_CLH_ROOT_DIR" \ --rm \ --privileged \ @@ -595,7 +646,8 @@ cmd_tests() { if [ "$metrics" = true ]; then say "Generating performance metrics for $target..." - $DOCKER_RUNTIME run \ + run_container "$DOCKER_RUNTIME" run \ + --name "$CLH_CTR_NAME" \ --workdir "$CTR_CLH_ROOT_DIR" \ --rm \ --privileged \ @@ -619,7 +671,8 @@ cmd_tests() { if [ "$coverage" = true ]; then say "Generating code coverage information for $target..." - $DOCKER_RUNTIME run \ + run_container "$DOCKER_RUNTIME" run \ + --name "$CLH_CTR_NAME" \ --workdir "$CTR_CLH_ROOT_DIR" \ --rm \ --privileged \ @@ -706,10 +759,22 @@ cmd_shell() { ensure_build_dir ensure_latest_ctr process_volumes_args - say_warn "Starting a privileged shell prompt as root ..." - say_warn "WARNING: Your $CLH_ROOT_DIR folder will be bind-mounted in the container under $CTR_CLH_ROOT_DIR" + + # Remaining args after -- are passed as a command to bash -c. + # With no args, an interactive shell is started. + tty_args="-ti" + shell_args=() + if [ $# -gt 0 ]; then + tty_args="" + shell_args+=("-c" "$*") + else + say_warn "Starting a privileged shell prompt as root ..." 
+ say_warn "WARNING: Your $CLH_ROOT_DIR folder will be bind-mounted in the container under $CTR_CLH_ROOT_DIR" + fi + $DOCKER_RUNTIME run \ - -ti \ + --name "$CLH_CTR_NAME" \ + $tty_args \ --workdir "$CTR_CLH_ROOT_DIR" \ --rm \ --privileged \ @@ -723,7 +788,8 @@ cmd_shell() { --volume "$CLH_INTEGRATION_WORKLOADS:$CTR_CLH_INTEGRATION_WORKLOADS" \ --env USER="root" \ --entrypoint bash \ - "$CTR_IMAGE" + "$CTR_IMAGE" \ + "${shell_args[@]}" fix_dir_perms $? } diff --git a/scripts/gitlint/rules/TitleStartsWithComponent.py b/scripts/gitlint/rules/TitleStartsWithComponent.py index 1310e45ccd..a25172a629 100644 --- a/scripts/gitlint/rules/TitleStartsWithComponent.py +++ b/scripts/gitlint/rules/TitleStartsWithComponent.py @@ -46,7 +46,6 @@ def validate(self, line, _commit): 'hypervisor', 'main', 'misc', - 'net_gen', 'net_util', 'openapi', 'option_parser', diff --git a/scripts/prepare_vdpa.sh b/scripts/prepare_vdpa.sh index 4a99daaf7b..601e61dbcf 100755 --- a/scripts/prepare_vdpa.sh +++ b/scripts/prepare_vdpa.sh @@ -1,35 +1,73 @@ #!/usr/bin/env bash set -x -sudo apt install -y libncurses-dev gawk flex bison openssl libssl-dev dkms libelf-dev libudev-dev libpci-dev libiberty-dev autoconf git make dpkg-dev libmnl-dev pkg-config iproute2 -sudo sed -i -- 's/# deb-src/deb-src/g' /etc/apt/sources.list -sudo apt update -apt-get source linux-image-unsigned-"$(uname -r)" -pushd linux-azure*/drivers/vdpa/vdpa_sim/ || exit -# REUSE-IgnoreStart -cat <<'EOF' >Makefile +build_install_vdpa_sim_modules_ubuntu() { + sudo apt install -y libncurses-dev gawk flex bison openssl libssl-dev dkms libelf-dev libudev-dev libpci-dev libiberty-dev autoconf git make dpkg-dev libmnl-dev pkg-config iproute2 + sudo sed -i -- 's/# deb-src/deb-src/g' /etc/apt/sources.list + sudo apt update + apt-get source linux-image-unsigned-"$(uname -r)" + pushd linux-azure*/drivers/vdpa/vdpa_sim/ || exit + # REUSE-IgnoreStart + cat <<'EOF' >Makefile # SPDX-License-Identifier: GPL-2.0 obj-m += vdpa_sim.o obj-m += 
vdpa_sim_net.o
 obj-m += vdpa_sim_blk.o
 EOF
-# REUSE-IgnoreEnd
-make -C /lib/modules/"$(uname -r)"/build M="$PWD"
-sudo make -C /lib/modules/"$(uname -r)"/build M="$PWD" modules_install
-popd || exit
-sudo depmod -a
-sudo modprobe vdpa
-sudo modprobe vhost_vdpa
-sudo modprobe vdpa_sim
-sudo modprobe vdpa_sim_blk
-sudo modprobe vdpa_sim_net
-# Create /dev/vhost-vdpa-0
-sudo vdpa dev add name vdpa-blk1 mgmtdev vdpasim_blk
-# Create /dev/vhost-vdpa-1
-sudo vdpa dev add name vdpa-blk2 mgmtdev vdpasim_blk
-# Create /dev/vhost-vdpa-2
-sudo vdpa dev add name vdpa-net1 mgmtdev vdpasim_net
-sudo chmod 660 /dev/vhost-vdpa-0
-sudo chmod 660 /dev/vhost-vdpa-1
-sudo chmod 660 /dev/vhost-vdpa-2
-vdpa dev show -jp
+    # REUSE-IgnoreEnd
+    make -C /lib/modules/"$(uname -r)"/build M="$PWD"
+    sudo make -C /lib/modules/"$(uname -r)"/build M="$PWD" modules_install
+    popd || exit
+    sudo depmod -a
+}
+
+check_vdpa_sim_modules() {
+    for module in $MODULES; do
+        modinfo "$module" || {
+            echo "Module $module is not installed. Please build and install it first."
+            exit 1
+        }
+    done
+}
+
+modprobe_modules() {
+    for module in $MODULES; do
+        sudo modprobe "$module" || {
+            echo "Failed to load module $module. Please check if it is installed correctly."
+            exit 1
+        }
+    done
+}
+
+prepare_vdpa() {
+    # Create /dev/vhost-vdpa-0
+    sudo vdpa dev add name vdpa-blk1 mgmtdev vdpasim_blk
+    # Create /dev/vhost-vdpa-1
+    sudo vdpa dev add name vdpa-blk2 mgmtdev vdpasim_blk
+    # Create /dev/vhost-vdpa-2
+    sudo vdpa dev add name vdpa-net1 mgmtdev vdpasim_net
+    sudo chmod 660 /dev/vhost-vdpa-0
+    sudo chmod 660 /dev/vhost-vdpa-1
+    sudo chmod 660 /dev/vhost-vdpa-2
+    vdpa dev show -jp
+}
+
+MODULES="vdpa vhost_vdpa vdpa_sim vdpa_sim_blk vdpa_sim_net"
+DISTRO_NAME="ubuntu"
+if [[ -f /etc/lsb-release ]]; then
+    DISTRO_NAME=$(grep DISTRIB_ID /etc/lsb-release | cut -d '=' -f 2)
+    # Converts the value of the DISTRO_NAME variable to lowercase letters.
+    DISTRO_NAME=$(echo "$DISTRO_NAME" | tr '[:upper:]' '[:lower:]')
+    echo "Distribution Name: $DISTRO_NAME"
+fi
+
+if [[ "$DISTRO_NAME" == "ubuntu" ]]; then
+    build_install_vdpa_sim_modules_ubuntu
+fi
+# For other distros, we assume the modules are already built and installed
+# For Azure Linux, the modules are included in the kernel and should be available by default
+check_vdpa_sim_modules
+
+modprobe_modules
+
+prepare_vdpa
diff --git a/scripts/run_integration_tests_aarch64.sh b/scripts/run_integration_tests_aarch64.sh
index 68500028bc..5a1dce52b9 100755
--- a/scripts/run_integration_tests_aarch64.sh
+++ b/scripts/run_integration_tests_aarch64.sh
@@ -13,7 +13,7 @@ build_virtiofsd() {
     VIRTIOFSD_DIR="$WORKLOADS_DIR/virtiofsd_build"
     VIRTIOFSD_REPO="https://gitlab.com/virtio-fs/virtiofsd.git"
 
-    checkout_repo "$VIRTIOFSD_DIR" "$VIRTIOFSD_REPO" v1.13.3 "bbf82173682a3e48083771a0a23331e5c23b4924"
+    checkout_repo "$VIRTIOFSD_DIR" "$VIRTIOFSD_REPO" main "0f5865629dc995a3e9d5a73b4eb45bb91740bccb"
 
     if [ ! -f "$VIRTIOFSD_DIR/.built" ]; then
         pushd "$VIRTIOFSD_DIR" || exit
@@ -26,7 +26,7 @@
 }
 
 update_workloads() {
-    cp scripts/sha1sums-aarch64 "$WORKLOADS_DIR"
+    cp scripts/sha1sums-aarch64-common "$WORKLOADS_DIR"
 
     FOCAL_OS_RAW_IMAGE_NAME="focal-server-cloudimg-arm64-custom-20210929-0.raw"
     FOCAL_OS_RAW_IMAGE_DOWNLOAD_URL="https://ch-images.azureedge.net/$FOCAL_OS_RAW_IMAGE_NAME"
@@ -136,9 +136,12 @@
         popd || exit
     fi
 
+    # Download aarch64 ovmf
+    download_aarch64_ovmf
+
     pushd "$WORKLOADS_DIR" || exit
 
-    if ! sha1sum sha1sums-aarch64 --check; then
+    if ! sha1sum sha1sums-aarch64-common --check; then
         echo "sha1sum validation of images failed, remove invalid images to fix the issue."
exit 1 fi @@ -202,9 +205,6 @@ update_workloads() { echo "foo" >"$SHARED_DIR/file1" echo "bar" >"$SHARED_DIR/file3" || exit 1 fi - - # Checkout and build EDK2 - build_edk2 } process_common_args "$@" @@ -285,7 +285,7 @@ fi # Run tests on dbus_api if [ $RES -eq 0 ]; then - cargo build --features "dbus_api" --all --release --target "$BUILD_TARGET" + cargo build --features "mshv,dbus_api" --all --release --target "$BUILD_TARGET" export RUST_BACKTRACE=1 # integration tests now do not reply on build feature "dbus_api" time cargo nextest run $test_features --retries 3 --no-fail-fast --no-tests=pass --test-threads=$(($(nproc) / 4)) "dbus_api::$test_filter" -- ${test_binary_args[*]} @@ -294,14 +294,14 @@ fi # Run tests on fw_cfg if [ $RES -eq 0 ]; then - cargo build --features "fw_cfg" --all --release --target "$BUILD_TARGET" + cargo build --features "mshv,fw_cfg" --all --release --target "$BUILD_TARGET" export RUST_BACKTRACE=1 time cargo nextest run $test_features --retries 3 --no-fail-fast --no-tests=pass --test-threads=$(($(nproc) / 4)) "fw_cfg::$test_filter" -- ${test_binary_args[*]} RES=$? 
fi if [ $RES -eq 0 ]; then - cargo build --features "ivshmem" --all --release --target "$BUILD_TARGET" + cargo build --features "mshv,ivshmem" --all --release --target "$BUILD_TARGET" export RUST_BACKTRACE=1 time cargo nextest run $test_features --retries 3 --no-fail-fast --no-tests=pass --test-threads=$(($(nproc) / 4)) "ivshmem::$test_filter" -- ${test_binary_args[*]} diff --git a/scripts/run_integration_tests_cvm.sh b/scripts/run_integration_tests_cvm.sh index edf543fc88..37e2cc74e7 100755 --- a/scripts/run_integration_tests_cvm.sh +++ b/scripts/run_integration_tests_cvm.sh @@ -27,7 +27,7 @@ popd || exit cargo build --features $build_features --all --release --target "$BUILD_TARGET" export RUST_BACKTRACE=1 -cargo nextest run $test_features "common_cvm::$test_filter" -- ${test_binary_args[*]} +time cargo nextest run $test_features --retries 3 --no-fail-fast --no-tests=pass --test-threads=$(($(nproc) / 4)) "common_cvm::$test_filter" -- ${test_binary_args[*]} RES=$? exit $RES diff --git a/scripts/run_integration_tests_vfio.sh b/scripts/run_integration_tests_vfio.sh index b32afe5a23..3f1b267787 100755 --- a/scripts/run_integration_tests_vfio.sh +++ b/scripts/run_integration_tests_vfio.sh @@ -30,7 +30,14 @@ cargo build --features mshv --all --release --target "$BUILD_TARGET" export RUST_BACKTRACE=1 export RUSTFLAGS="$RUSTFLAGS" -time cargo nextest run --no-tests=pass --test-threads=1 "vfio::test_nvidia" -- ${test_binary_args[*]} +# Run VFIO tests using legacy vfio interface with container/group +time cargo nextest run --retries 3 --no-tests=pass --test-threads=1 "vfio::test_nvidia" -- ${test_binary_args[*]} RES=$? +# Run VFIO tests using vfio cdev interface backed by iommufd +if [ $RES -eq 0 ]; then + time cargo nextest run --retries 3 --no-tests=pass --test-threads=1 "vfio::test_iommufd" -- ${test_binary_args[*]} + RES=$? 
+fi + exit $RES diff --git a/scripts/run_integration_tests_windows_aarch64.sh b/scripts/run_integration_tests_windows_aarch64.sh index 8f12a2740a..69537d7769 100755 --- a/scripts/run_integration_tests_windows_aarch64.sh +++ b/scripts/run_integration_tests_windows_aarch64.sh @@ -18,9 +18,9 @@ fi WIN_IMAGE_BASENAME="windows-11-iot-enterprise-aarch64.raw" WIN_IMAGE_FILE="$WORKLOADS_DIR/$WIN_IMAGE_BASENAME" -# Checkout and build EDK2 +# Download aarch64 OVMF OVMF_FW="$WORKLOADS_DIR/CLOUDHV_EFI.fd" -build_edk2 +download_aarch64_ovmf # Check if the images are present if [[ ! -f ${WIN_IMAGE_FILE} || ! -f ${OVMF_FW} ]]; then @@ -44,7 +44,7 @@ cargo build --all --release --target "$BUILD_TARGET" # Only run with 1 thread to avoid tests interfering with one another because # Windows has a static IP configured -time cargo nextest run --no-tests=pass "windows::$test_filter" --target "$BUILD_TARGET" -- ${test_binary_args[*]} +time cargo nextest run --retries 3 --no-tests=pass "windows::$test_filter" --target "$BUILD_TARGET" -- ${test_binary_args[*]} RES=$? 
dmsetup remove_all -f diff --git a/scripts/run_integration_tests_windows_x86_64.sh b/scripts/run_integration_tests_windows_x86_64.sh index d8f6861497..6b358c5c28 100755 --- a/scripts/run_integration_tests_windows_x86_64.sh +++ b/scripts/run_integration_tests_windows_x86_64.sh @@ -13,11 +13,12 @@ test_features="" if [ "$hypervisor" = "mshv" ]; then test_features="--features mshv" fi -WIN_IMAGE_FILE="/root/workloads/windows-server-2022-amd64-2.raw" +WIN_IMAGE_FILE="/root/workloads/windows-server-2025-amd64-1.raw" WORKLOADS_DIR="/root/workloads" -download_ovmf +# Download amd64 ovmf +download_amd64_ovmf CFLAGS="" if [[ "${BUILD_TARGET}" == "x86_64-unknown-linux-musl" ]]; then @@ -47,7 +48,7 @@ export RUSTFLAGS="$RUSTFLAGS" # Only run with 1 thread to avoid tests interfering with one another because # Windows has a static IP configured -time cargo nextest run --no-tests=pass $test_features "windows::$test_filter" --target "$BUILD_TARGET" -- ${test_binary_args[*]} +time cargo nextest run --retries 3 --no-tests=pass $test_features "windows::$test_filter" --target "$BUILD_TARGET" -- ${test_binary_args[*]} RES=$? dmsetup remove_all -f diff --git a/scripts/run_integration_tests_x86_64.sh b/scripts/run_integration_tests_x86_64.sh index 80ac279349..f12fa9aaaa 100755 --- a/scripts/run_integration_tests_x86_64.sh +++ b/scripts/run_integration_tests_x86_64.sh @@ -25,7 +25,7 @@ if [ ! -f "$WORKLOADS_DIR/hypervisor-fw" ]; then fi if [ ! -f "$WORKLOADS_DIR/CLOUDHV.fd" ]; then - download_ovmf + download_amd64_ovmf fi download_x86_guest_images @@ -124,7 +124,7 @@ if [ ! 
-f "$VIRTIOFSD" ]; then pushd "$WORKLOADS_DIR" || exit git clone "https://gitlab.com/virtio-fs/virtiofsd.git" $VIRTIOFSD_DIR pushd $VIRTIOFSD_DIR || exit - git checkout v1.13.3 + git checkout 0f5865629dc995a3e9d5a73b4eb45bb91740bccb time cargo build --release cp target/release/virtiofsd "$VIRTIOFSD" || exit 1 popd || exit diff --git a/scripts/run_metrics.sh b/scripts/run_metrics.sh index f70ccf1cde..88c207b322 100755 --- a/scripts/run_metrics.sh +++ b/scripts/run_metrics.sh @@ -108,6 +108,10 @@ if [ -n "$test_filter" ]; then test_binary_args+=("--test-filter $test_filter") fi +if [ -n "$test_exclude" ]; then + test_binary_args+=("--test-exclude $test_exclude") +fi + # Ensure that git commands can be run in this directory (for metrics report) git config --global --add safe.directory "$PWD" diff --git a/scripts/sha1sums-aarch64 b/scripts/sha1sums-aarch64-common similarity index 88% rename from scripts/sha1sums-aarch64 rename to scripts/sha1sums-aarch64-common index 4585509712..d955f98bec 100644 --- a/scripts/sha1sums-aarch64 +++ b/scripts/sha1sums-aarch64-common @@ -3,3 +3,4 @@ e4addb6e212a298144f9eb0eb6e36019d013f0e7 alpine-minirootfs-aarch64.tar.gz 9953b31bb1923cdd8d91b1b7cc9ad3a9be1e0a59 focal-server-cloudimg-arm64-custom-20210929-0.raw 7118f4d4cad18c8357bc2ad9824a50f9a82a860a jammy-server-cloudimg-arm64-custom-20220329-0.qcow2 1f2b71be43b8f748f01306c4454e5c921343faa4 jammy-server-cloudimg-arm64-custom-20220329-0.raw +ce3656987f9e4238ef8afbd65fca219460c1f767 CLOUDHV_EFI.fd diff --git a/scripts/sha1sums-x86_64 b/scripts/sha1sums-x86_64 index c49f00b266..e719bcc316 100644 --- a/scripts/sha1sums-x86_64 +++ b/scripts/sha1sums-x86_64 @@ -1,3 +1,3 @@ d4a44acc6014d5f83dea1c625c43d677a95fa75f alpine-minirootfs-x86_64.tar.gz 540ac358429305d7aa94e15363665d1c9d845982 hypervisor-fw -4e96fd0914a44005d40707b2b0c7e829e4086bd5 CLOUDHV.fd +fb2e6834cc482c80a45766f6dcf12474f4fcb74e CLOUDHV.fd diff --git a/scripts/test-util.sh b/scripts/test-util.sh index 8958439330..3ba2474a49 
100644 --- a/scripts/test-util.sh +++ b/scripts/test-util.sh @@ -122,6 +122,10 @@ process_common_args() { shift test_filter="$1" ;; + "--test-exclude") + shift + test_exclude="$1" + ;; "--build-guest-kernel") build_kernel=true ;; @@ -194,8 +198,8 @@ prepare_linux() { fi } -download_ovmf() { - OVMF_FW_TAG="ch-a54f262b09" +download_amd64_ovmf() { + OVMF_FW_TAG="ch-1e1b96f126" OVMF_FW_URL="https://github.com/cloud-hypervisor/edk2/releases/download/$OVMF_FW_TAG/CLOUDHV.fd" OVMF_FW="$WORKLOADS_DIR/CLOUDHV.fd" pushd "$WORKLOADS_DIR" || exit @@ -204,6 +208,16 @@ download_ovmf() { popd || exit } +download_aarch64_ovmf() { + OVMF_FW_TAG="ch-1e1b96f126" + OVMF_FW_URL="https://github.com/cloud-hypervisor/edk2/releases/download/$OVMF_FW_TAG/CLOUDHV_EFI.fd" + OVMF_FW="$WORKLOADS_DIR/CLOUDHV_EFI.fd" + pushd "$WORKLOADS_DIR" || exit + rm -f "$OVMF_FW" + download_with_retries $OVMF_FW_URL || exit 1 + popd || exit +} + # Function to mount image partition, execute commands, and cleanup. # Arguments: $1: Image file path, $2: Mount directory, $3+: Commands to execute. 
mount_and_exec() { diff --git a/serial_buffer/Cargo.toml b/serial_buffer/Cargo.toml index 767c8a97ff..89766d5e86 100644 --- a/serial_buffer/Cargo.toml +++ b/serial_buffer/Cargo.toml @@ -2,6 +2,7 @@ authors = ["The Cloud Hypervisor Authors"] edition.workspace = true name = "serial_buffer" +rust-version.workspace = true version = "0.1.0" [lints] diff --git a/test_infra/Cargo.toml b/test_infra/Cargo.toml index 6c53e9ca0f..b5854521ad 100644 --- a/test_infra/Cargo.toml +++ b/test_infra/Cargo.toml @@ -2,6 +2,7 @@ authors = ["The Cloud Hypervisor Authors"] edition.workspace = true name = "test_infra" +rust-version.workspace = true version = "0.1.0" [dependencies] diff --git a/test_infra/src/lib.rs b/test_infra/src/lib.rs index 3ab550870e..42acca1905 100644 --- a/test_infra/src/lib.rs +++ b/test_infra/src/lib.rs @@ -15,7 +15,7 @@ use std::os::unix::io::{AsRawFd, FromRawFd}; use std::path::{Path, PathBuf}; use std::process::{Child, Command, ExitStatus, Output, Stdio}; use std::str::FromStr; -use std::time::Duration; +use std::time::{Duration, Instant}; use std::{env, fmt, fs, io, thread}; use rand::Rng; @@ -57,6 +57,44 @@ pub enum Error { WaitTimeout(#[source] WaitTimeoutError), } +/// Polls a boolean condition until it becomes true or the timeout expires. +pub fn wait_until(timeout: Duration, mut condition: F) -> bool +where + F: FnMut() -> bool, +{ + const INTERVAL: Duration = Duration::from_millis(50); + let start = Instant::now(); + + loop { + if condition() { + return true; + } + + if start.elapsed() >= timeout { + return false; + } + + thread::sleep(INTERVAL); + } +} + +/// Retries an operation until it returns `Ok` or the timeout expires. 
+pub fn wait_until_succeeds(timeout: Duration, mut operation: F) -> Result +where + F: FnMut() -> Result, +{ + const INTERVAL: Duration = Duration::from_millis(50); + let start = Instant::now(); + + loop { + match operation() { + Ok(result) => return Ok(result), + Err(err) if start.elapsed() >= timeout => return Err(err), + Err(_) => thread::sleep(INTERVAL), + } + } +} + pub struct GuestNetworkConfig { pub guest_ip0: String, pub host_ip0: String, @@ -76,7 +114,7 @@ pub struct GuestNetworkConfig { pub const DEFAULT_TCP_LISTENER_MESSAGE: &str = "booted"; pub const DEFAULT_TCP_LISTENER_PORT: u16 = 8000; pub const DEFAULT_TCP_LISTENER_TIMEOUT: u32 = 120; -pub const DEFAULT_CVM_TCP_LISTENER_TIMEOUT: u32 = 120; +pub const DEFAULT_CVM_TCP_LISTENER_TIMEOUT: u32 = 140; #[derive(Error, Debug)] pub enum WaitForBootError { @@ -188,6 +226,9 @@ pub trait DiskConfig { fn prepare_files(&mut self, tmp_dir: &TempDir, network: &GuestNetworkConfig); fn prepare_cloudinit(&self, tmp_dir: &TempDir, network: &GuestNetworkConfig) -> String; fn disk(&self, disk_type: DiskType) -> Option; + fn qcow2_disk(&self) -> Option { + None + } } #[derive(Clone)] @@ -210,6 +251,7 @@ impl UbuntuDiskConfig { pub struct WindowsDiskConfig { image_name: String, osdisk_path: String, + osdisk_qcow2_path: String, loopback_device: String, windows_snapshot_cow: String, windows_snapshot: String, @@ -220,6 +262,7 @@ impl WindowsDiskConfig { WindowsDiskConfig { image_name, osdisk_path: String::new(), + osdisk_qcow2_path: String::new(), loopback_device: String::new(), windows_snapshot_cow: String::new(), windows_snapshot: String::new(), @@ -248,6 +291,10 @@ impl Drop for WindowsDiskConfig { .args(["-d", self.loopback_device.as_str()]) .output() .expect("Expect removing loopback device to succeed"); + + if !self.osdisk_qcow2_path.is_empty() { + let _ = fs::remove_file(&self.osdisk_qcow2_path); + } } } @@ -413,7 +460,7 @@ impl DiskConfig for WindowsDiskConfig { let mut osdisk_path = workload_path; 
osdisk_path.push(&self.image_name); - let osdisk_blk_size = fs::metadata(osdisk_path) + let osdisk_blk_size = fs::metadata(&osdisk_path) .expect("Expect retrieving Windows image metadata") .len() >> 9; @@ -492,6 +539,27 @@ impl DiskConfig for WindowsDiskConfig { self.osdisk_path = format!("/dev/mapper/{windows_snapshot}"); self.windows_snapshot_cow = windows_snapshot_cow; self.windows_snapshot = windows_snapshot; + + // Create a qcow2 overlay backed by the raw image. + let mut workload_path = dirs::home_dir().unwrap(); + workload_path.push("workloads"); + let qcow2_name = format!("windows-qcow2-{}.qcow2", random_extension.to_str().unwrap()); + let qcow2_path = workload_path.join(&qcow2_name); + let output = Command::new("qemu-img") + .args([ + "create", + "-f", + "qcow2", + "-b", + osdisk_path.to_str().unwrap(), + "-F", + "raw", + qcow2_path.to_str().unwrap(), + ]) + .output() + .expect("Expect creating qcow2 overlay to succeed"); + assert!(output.status.success(), "qemu-img create failed"); + self.osdisk_qcow2_path = qcow2_path.to_str().unwrap().to_string(); } fn disk(&self, disk_type: DiskType) -> Option { @@ -500,6 +568,10 @@ impl DiskConfig for WindowsDiskConfig { DiskType::CloudInit => None, } } + + fn qcow2_disk(&self) -> Option { + Some(self.osdisk_qcow2_path.clone()) + } } pub fn rate_limited_copy, Q: AsRef>(from: P, to: Q) -> io::Result { @@ -618,6 +690,25 @@ pub enum SshCommandError { WaitEof(#[source] ssh2::Error), } +#[derive(Error, Debug)] +pub enum WaitForSshError { + #[error("timed out after {timeout:?} waiting for ssh command {command:?} on {ip}: {source}")] + Timeout { + command: String, + ip: String, + timeout: Duration, + #[source] + source: SshCommandError, + }, +} + +pub fn default_guest_auth() -> PasswordAuth { + PasswordAuth { + username: String::from("cloud"), + password: String::from("cloud123"), + } +} + fn scp_to_guest_with_auth( path: &Path, remote_path: &Path, @@ -707,50 +798,67 @@ pub fn scp_to_guest( ) } +/// Executes a command on a 
remote host via SSH using password authentication. +/// Returns the stdout output on success, or an [`SshCommandError`] on any +/// connection, authentication, or execution failure. pub fn ssh_command_ip_with_auth( command: &str, auth: &PasswordAuth, ip: &str, - retries: u8, - timeout: u8, + timeout: Option, ) -> Result { let mut s = String::new(); + let tcp = TcpStream::connect(format!("{ip}:22")).map_err(SshCommandError::Connection)?; + let mut sess = Session::new().unwrap(); + sess.set_tcp_stream(tcp); + if let Some(timeout) = timeout { + sess.set_timeout(timeout.as_millis() as u32); + } + sess.handshake().map_err(SshCommandError::Handshake)?; + sess.userauth_password(&auth.username, &auth.password) + .map_err(SshCommandError::Authentication)?; + assert!(sess.authenticated()); + let mut channel = sess + .channel_session() + .map_err(SshCommandError::ChannelSession)?; + channel.exec(command).map_err(SshCommandError::Command)?; + // Intentionally ignore these results here as their failure + // does not precipitate a repeat + let _ = channel.read_to_string(&mut s); + let _ = channel.close(); + let _ = channel.wait_close(); + let status = channel.exit_status().map_err(SshCommandError::ExitStatus)?; + if status != 0 { + Err(SshCommandError::NonZeroExitStatus(status)) + } else { + Ok(s) + } +} +/// Executes a command on a remote host via SSH using password authentication, +/// retrying on failure with linear backoff. +/// +/// Delegates each attempt to [`ssh_command_ip_with_auth`]. After the +/// *n*-th consecutive failure the function sleeps for `timeout_s * n` seconds +/// before the next attempt. Once `retries` attempts are exhausted the command +/// output and error are printed to stderr and the last error is returned. +/// +/// Note that `timeout_s` is not a per-attempt deadline — individual connection +/// and I/O operations may block for as long as the OS or SSH layer allows. 
+// TODO since we have we probably want to migrate every single invocation to a +// more graceful combination of wait_until() and ssh_command_ip_with_auth(). +pub fn ssh_command_ip_with_auth_retry( + command: &str, + auth: &PasswordAuth, + ip: &str, + retries: u8, + // Base unit for the inter-retry sleep duration, in seconds. + timeout_s: u8, +) -> Result { let mut counter = 0; loop { - let mut closure = || -> Result<(), SshCommandError> { - let tcp = - TcpStream::connect(format!("{ip}:22")).map_err(SshCommandError::Connection)?; - let mut sess = Session::new().unwrap(); - sess.set_tcp_stream(tcp); - sess.handshake().map_err(SshCommandError::Handshake)?; - - sess.userauth_password(&auth.username, &auth.password) - .map_err(SshCommandError::Authentication)?; - assert!(sess.authenticated()); - - let mut channel = sess - .channel_session() - .map_err(SshCommandError::ChannelSession)?; - channel.exec(command).map_err(SshCommandError::Command)?; - - // Intentionally ignore these results here as their failure - // does not precipitate a repeat - let _ = channel.read_to_string(&mut s); - let _ = channel.close(); - let _ = channel.wait_close(); - - let status = channel.exit_status().map_err(SshCommandError::ExitStatus)?; - - if status != 0 { - Err(SshCommandError::NonZeroExitStatus(status)) - } else { - Ok(()) - } - }; - - match closure() { - Ok(_) => break, + match ssh_command_ip_with_auth(command, auth, ip, None) { + Ok(s) => return Ok(s), Err(e) => { counter += 1; if counter >= retries { @@ -759,27 +867,28 @@ pub fn ssh_command_ip_with_auth( command=\"{command}\"\n\ auth=\"{auth:#?}\"\n\ ip=\"{ip}\"\n\ - output=\"{s}\"\n\ error=\"{e:?}\"\n\ - \n==== End ssh command outout ====\n\n" + \n==== End ssh command output ====\n\n" ); - return Err(e); } } } - thread::sleep(std::time::Duration::new((timeout * counter).into(), 0)); + thread::sleep(std::time::Duration::new((timeout_s * counter).into(), 0)); } - Ok(s) } +/// Executes a command on a remote host via SSH using password 
authentication, +/// retrying on failure with linear backoff. +/// +/// Wrapper around [`ssh_command_ip_with_auth_retry`]. pub fn ssh_command_ip( command: &str, ip: &str, retries: u8, timeout: u8, ) -> Result { - ssh_command_ip_with_auth( + ssh_command_ip_with_auth_retry( command, &PasswordAuth { username: String::from("cloud"), @@ -791,6 +900,24 @@ pub fn ssh_command_ip( ) } +/// Waits until SSH to the guest becomes available. +pub fn wait_for_ssh( + command: &str, + auth: &PasswordAuth, + ip: &str, + timeout: Duration, +) -> Result { + wait_until_succeeds(timeout, || { + ssh_command_ip_with_auth(command, auth, ip, Some(timeout)) + }) + .map_err(|source| WaitForSshError::Timeout { + command: command.to_string(), + ip: ip.to_string(), + timeout, + source, + }) +} + pub fn exec_host_command_with_retries(command: &str, retries: u32, interval: Duration) -> bool { for _ in 0..retries { let s = exec_host_command_output(command).status; @@ -883,6 +1010,28 @@ pub fn kill_child(child: &mut Child) { } } +#[derive(Debug)] +pub struct MetaEvent { + pub event: String, + pub device_id: Option, +} + +impl MetaEvent { + pub fn match_with_json_event(&self, v: &serde_json::Value) -> bool { + let mut matched = false; + if v["event"].as_str().unwrap() == self.event { + if let Some(device_id) = &self.device_id { + if v["properties"]["id"].as_str().unwrap() == device_id { + matched = true; + } + } else { + matched = true; + } + } + matched + } +} + pub const PIPE_SIZE: i32 = 32 << 20; pub struct Guest { @@ -894,6 +1043,9 @@ pub struct Guest { pub kernel_path: Option, pub kernel_cmdline: Option, pub console_type: Option, + pub num_cpu: u32, + pub nested: bool, + pub mem_size_str: String, } // Return the next id that can be used for this guest. 
This is stored in a @@ -960,9 +1112,12 @@ impl Guest { network, vm_type: GuestVmType::Regular, boot_timeout: DEFAULT_TCP_LISTENER_TIMEOUT, - kernel_path: None, - kernel_cmdline: None, + kernel_path: direct_kernel_boot_path().to_str().map(String::from), + kernel_cmdline: Some(DIRECT_KERNEL_BOOT_CMDLINE.to_string()), console_type: None, + num_cpu: 1u32, + nested: true, + mem_size_str: "512M".to_string(), } } @@ -970,6 +1125,31 @@ impl Guest { Self::new_from_ip_range(disk_config, "192.168", next_guest_id()) } + pub fn with_cpu(mut self, count: u32) -> Self { + self.num_cpu = count; + self + } + + pub fn with_memory(mut self, mem_size: &str) -> Self { + self.mem_size_str = mem_size.to_string(); + self + } + + pub fn with_nested(mut self, nested: bool) -> Self { + self.nested = nested; + self + } + + pub fn with_kernel_path(mut self, kernel_path: &str) -> Self { + self.kernel_path = Some(kernel_path.to_string()); + self + } + + pub fn with_kernel(mut self, kernel: String) -> Self { + self.kernel_path = Some(kernel); + self + } + pub fn default_net_string(&self) -> String { format!( "tap=,mac={},ip={},mask=255.255.255.128", @@ -1040,17 +1220,93 @@ impl Guest { ) } - pub fn api_create_body(&self, cpu_count: u8, kernel_path: &str, kernel_cmd: &str) -> String { - format! {"{{\"cpus\":{{\"boot_vcpus\":{},\"max_vcpus\":{}}},\"payload\":{{\"kernel\":\"{}\",\"cmdline\": \"{}\"}},\"net\":[{{\"ip\":\"{}\", \"mask\":\"255.255.255.0\", \"mac\":\"{}\"}}], \"disks\":[{{\"path\":\"{}\"}}, {{\"path\":\"{}\"}}]}}", - cpu_count, - cpu_count, - kernel_path, - kernel_cmd, - self.network.host_ip0, - self.network.guest_mac0, - self.disk_config.disk(DiskType::OperatingSystem).unwrap().as_str(), - self.disk_config.disk(DiskType::CloudInit).unwrap().as_str(), + /// Waits until SSH to the guest becomes available using the + /// [default guest authentication] and the default guest IP. 
+ /// + /// [default guest authentication]: default_guest_auth + pub fn wait_for_ssh(&self, timeout: Duration) -> Result<(), WaitForSshError> { + wait_for_ssh( + "true", + &default_guest_auth(), + &self.network.guest_ip0, + timeout, + ) + .map(|_| ()) + } + + /// Waits until the provided command succeeds via SSH on the guest using the + /// [default guest authentication] and the default guest IP. + /// + /// [default guest authentication]: default_guest_auth + pub fn wait_for_ssh_command( + &self, + command: &str, + timeout: Duration, + ) -> Result { + wait_for_ssh( + command, + &default_guest_auth(), + &self.network.guest_ip0, + timeout, + ) + } + + /// Waits until the guest's SSH port is no longer reachable, indicating + /// the guest has probably shutdown. + pub fn wait_for_ssh_unresponsive(&self, timeout: Duration) -> bool { + let addr = format!("{}:22", self.network.guest_ip0) + .parse::() + .unwrap(); + wait_until(timeout, || { + std::net::TcpStream::connect_timeout(&addr, Duration::from_secs(2)).is_err() + }) + } + + pub fn api_create_body(&self) -> String { + let mut body = serde_json::json!({ + "cpus": { + "boot_vcpus": self.num_cpu, + "max_vcpus": self.num_cpu, + }, + "net": [ + { + "ip": self.network.host_ip0, + "mask": "255.255.255.0", + "mac": self.network.guest_mac0, + } + ], + "disks": [ + { + "path": self.disk_config.disk(DiskType::OperatingSystem).unwrap(), + }, + { + "path": self.disk_config.disk(DiskType::CloudInit).unwrap(), + } + ] + }); + + if !self.nested { + body["cpus"]["nested"] = serde_json::json!(false); + } + + if self.vm_type == GuestVmType::Confidential { + body["platform"] = serde_json::json!({"sev_snp": true}); + body["payload"] = serde_json::json!({ + "igvm": direct_igvm_boot_path(Some("hvc0")) + .unwrap() + .to_str() + .unwrap(), + "cmdline": self.kernel_cmdline.as_deref().unwrap(), + "host_data": generate_host_data(), + }); + } else { + body["payload"] = serde_json::json!({ + "kernel": self.kernel_path.as_deref().unwrap(), + 
"cmdline": self.kernel_cmdline.as_deref().unwrap(), + }); } + + body.to_string() } pub fn get_cpu_count(&self) -> Result { @@ -1228,12 +1484,25 @@ impl Guest { } #[cfg(target_arch = "x86_64")] - pub fn check_nvidia_gpu(&self) { - assert!( - self.ssh_command("nvidia-smi") - .unwrap() - .contains("NVIDIA L40S") + pub fn check_nvidia_gpu(&self) -> bool { + let output = self.ssh_command("nvidia-smi").unwrap(); + + if output.contains("NVIDIA L40S") { + return true; + } + + let dmesg = self + .ssh_command("sudo dmesg") + .unwrap_or_else(|e| format!("Failed to get dmesg: {e:?}")); + + eprintln!( + "\n\n==== Guest dmesg (nvidia-smi check failed) ====\n\n\ + {dmesg}\n\ + \n==== End guest dmesg ====\n\n" ); + eprintln!("nvidia-smi output did not contain 'NVIDIA L40S': {output}"); + + false } pub fn reboot_linux(&self, current_reboot_count: u32) { @@ -1320,6 +1589,138 @@ impl Guest { assert_eq!(self.ssh_command("sudo umount /mnt").unwrap(), ""); } } + + pub fn get_expected_seq_events_for_simple_launch(&self) -> Vec { + let mut out_evt = vec![ + MetaEvent { + event: "starting".to_string(), + device_id: None, + }, + MetaEvent { + event: "booting".to_string(), + device_id: None, + }, + MetaEvent { + event: "booted".to_string(), + device_id: None, + }, + MetaEvent { + event: "activated".to_string(), + device_id: Some("_disk0".to_string()), + }, + ]; + // For confidential VM, reset of the device does not trigger a VMM exit, or + // It is handled in the PSP + // so we won't receive the "reset" event for disk0. 
+ if self.vm_type != GuestVmType::Confidential { + out_evt.push(MetaEvent { + event: "reset".to_string(), + device_id: Some("_disk0".to_string()), + }); + } + out_evt + } + + pub fn default_cpus_string(&self) -> String { + format!( + "boot={}{}", + self.num_cpu, + if self.nested { "" } else { ",nested=off" } + ) + } + + pub fn default_cpus_with_affinity_string(&self) -> String { + format!( + "boot={},affinity=[0@[0,2],1@[1,3]]{}", + self.num_cpu, + if self.nested { "" } else { ",nested=off" } + ) + } + + pub fn default_memory_string(&self) -> String { + format!("size={}", self.mem_size_str) + } + + pub fn validate_cpu_count(&self, expected_cpu_count: Option) { + let cpu = match expected_cpu_count { + Some(count) => count, + None => self.num_cpu, + }; + assert_eq!(self.get_cpu_count().unwrap_or_default(), cpu); + } + + fn get_expected_memory(&self) -> Option { + // For confidential VMs, the memory available to the guest is less than + // the memory assigned to the VM, as some of it is reserved for the PSP + // and bounce buffers. + // So we return the expected available memory for confidential VMs here. + let memory = match self.mem_size_str.as_str() { + "512M" => { + if self.vm_type == GuestVmType::Confidential { + 407_000 + } else { + 480_000 + } + } + "1G" => { + if self.vm_type == GuestVmType::Confidential { + 920_000 + } else { + 960_000 + } + } + // More to be added if more memory sizes are used in the tests + _ => panic!("Unsupported memory size: {}", self.mem_size_str), + }; + Some(memory) + } + + pub fn validate_memory(&self, expected_memory: Option) { + let memory = expected_memory + .or_else(|| self.get_expected_memory()) + .unwrap_or_default(); + + assert!(self.get_total_memory().unwrap_or_default() > memory); + } +} + +// A factory for creating guests with different configurations. The factory is initialized +// with a GuestVmType, and created guests will have the same GuestVmType as the factory. 
+// This allows creation of guests with different configurations (e.g. regular vs confidential) +// without specifying the GuestVmType each time. +// Based on the VmType, the default timeout for waiting for the VM to boot is also set, +// which is used in the wait_vm_boot() method of the Guest struct. Additionally, nested +// virtualization is disabled by default for confidential VMs, as it is not supported. +pub struct GuestFactory { + vm_type: GuestVmType, + boot_timeout: u32, + nested: bool, +} + +impl GuestFactory { + pub fn new_regular_guest_factory() -> Self { + Self { + vm_type: GuestVmType::Regular, + boot_timeout: DEFAULT_TCP_LISTENER_TIMEOUT, + nested: true, + } + } + + pub fn new_confidential_guest_factory() -> Self { + Self { + vm_type: GuestVmType::Confidential, + boot_timeout: DEFAULT_CVM_TCP_LISTENER_TIMEOUT, + nested: false, + } + } + + pub fn create_guest(&self, disk_config: Box) -> Guest { + let mut guest = Guest::new(disk_config); + guest.vm_type = self.vm_type; + guest.boot_timeout = self.boot_timeout; + guest.nested = self.nested; + guest + } } #[derive(Default)] @@ -1448,35 +1849,31 @@ impl<'a> GuestCommand<'a> { } pub fn default_disks(&mut self) -> &mut Self { - if self.guest.disk_config.disk(DiskType::CloudInit).is_some() { + self.default_disks_inner(true) + } + + pub fn default_disks_sparse_off(&mut self) -> &mut Self { + self.default_disks_inner(false) + } + + fn default_disks_inner(&mut self, sparse: bool) -> &mut Self { + let sparse_opt = if sparse { "" } else { ",sparse=off" }; + let os_disk = format!( + "path={}{}", + self.guest + .disk_config + .disk(DiskType::OperatingSystem) + .unwrap(), + sparse_opt + ); + if let Some(cloud_init) = self.guest.disk_config.disk(DiskType::CloudInit) { self.args([ "--disk", - format!( - "path={}", - self.guest - .disk_config - .disk(DiskType::OperatingSystem) - .unwrap() - ) - .as_str(), - format!( - "path={}", - self.guest.disk_config.disk(DiskType::CloudInit).unwrap() - ) - .as_str(), + 
os_disk.as_str(), + format!("path={cloud_init}").as_str(), ]) } else { - self.args([ - "--disk", - format!( - "path={}", - self.guest - .disk_config - .disk(DiskType::OperatingSystem) - .unwrap() - ) - .as_str(), - ]) + self.args(["--disk", os_disk.as_str()]) } } @@ -1484,7 +1881,7 @@ impl<'a> GuestCommand<'a> { self.args(["--net", self.guest.default_net_string().as_str()]) } - pub fn default_kernel_cmdline(&mut self) -> &mut Self { + pub fn default_kernel_cmdline_with_platform(&mut self, platform: Option<&str>) -> &mut Self { if self.guest.vm_type == GuestVmType::Confidential { let console_str = if let Some(c) = &self.guest.console_type { c.as_str() @@ -1493,19 +1890,57 @@ impl<'a> GuestCommand<'a> { }; let igvm = direct_igvm_boot_path(Some(console_str)) .expect("IGVM boot file not found for console type: {console_str}"); - self.command.args(["--igvm", igvm.to_str().unwrap()]); + self.command.args([ + "--igvm", + igvm.to_str().expect("IGVM path is not valid UTF-8"), + ]); self.command .args(["--host-data", generate_host_data().as_str()]); - self.command.args(["--platform", "sev_snp=on"]); + self.command.args([ + "--platform", + &format!( + "{}sev_snp=on", + if let Some(p) = platform { + format!("{p},") + } else { + String::new() + } + ), + ]); } else if let Some(kernel) = &self.guest.kernel_path { self.command.args(["--kernel", kernel.as_str()]); if let Some(cmdline) = &self.guest.kernel_cmdline { self.command.args(["--cmdline", cmdline]); } + if let Some(platform_arg) = platform { + self.command.args(["--platform", platform_arg]); + } } self } + + pub fn default_kernel_cmdline(&mut self) -> &mut Self { + self.default_kernel_cmdline_with_platform(None) + } + + pub fn default_cpus(&mut self) -> &mut Self { + self.args(["--cpus", self.guest.default_cpus_string().as_str()]) + } + + pub fn default_cpus_with_affinity(&mut self) -> &mut Self { + // Only support cpu affinity for 2 VCPUs for now, + // as it is only used in a test that validates cpu affinity is applied 
correctly. + assert_eq!(self.guest.num_cpu, 2); + self.args([ + "--cpus", + self.guest.default_cpus_with_affinity_string().as_str(), + ]) + } + + pub fn default_memory(&mut self) -> &mut Self { + self.args(["--memory", self.guest.default_memory_string().as_str()]) + } } /// Returns the absolute path into the workspaces target directory to locate the desired @@ -1523,6 +1958,42 @@ pub fn clh_command(cmd: &str) -> String { String::from(full_path.to_str().unwrap()) } +pub fn remote_command(api_socket: &str, command: &str, arg: Option<&str>) -> bool { + let mut cmd = Command::new(clh_command("ch-remote")); + cmd.args([&format!("--api-socket={api_socket}"), command]); + + if let Some(arg) = arg { + cmd.arg(arg); + } + + let output = cmd.output().unwrap(); + if output.status.success() { + true + } else { + eprintln!("Error running ch-remote command: {:?}", &cmd); + let stderr = String::from_utf8_lossy(&output.stderr); + eprintln!("stderr: {stderr}"); + false + } +} + +pub fn remote_command_w_output( + api_socket: &str, + command: &str, + arg: Option<&str>, +) -> (bool, Vec /* stdout */, Vec /* stderr */) { + let mut cmd = Command::new(clh_command("ch-remote")); + cmd.args([&format!("--api-socket={api_socket}"), command]); + + if let Some(arg) = arg { + cmd.arg(arg); + } + + let output = cmd.output().expect("Failed to launch ch-remote"); + + (output.status.success(), output.stdout, output.stderr) +} + pub fn parse_iperf3_output(output: &[u8], sender: bool, bandwidth: bool) -> Result { std::panic::catch_unwind(|| { let s = String::from_utf8_lossy(output); @@ -1933,3 +2404,91 @@ fn generate_host_data() -> String { rand::rng().fill_bytes(&mut bytes); bytes.iter().map(|b| format!("{b:02x}")).collect() } + +// Creates the path for direct kernel boot and return the path. +// For x86_64, this function returns the vmlinux kernel path. +// For AArch64, this function returns the PE kernel path. 
+pub fn direct_kernel_boot_path() -> PathBuf { + let mut workload_path = dirs::home_dir().unwrap(); + workload_path.push("workloads"); + + let mut kernel_path = workload_path; + #[cfg(target_arch = "x86_64")] + kernel_path.push("vmlinux-x86_64"); + #[cfg(target_arch = "aarch64")] + kernel_path.push("Image-arm64"); + + kernel_path +} + +pub fn edk2_path() -> PathBuf { + let mut workload_path = dirs::home_dir().unwrap(); + workload_path.push("workloads"); + let mut edk2_path = workload_path; + edk2_path.push(OVMF_NAME); + + edk2_path +} + +pub const DIRECT_KERNEL_BOOT_CMDLINE: &str = + "root=/dev/vda1 console=hvc0 rw systemd.journald.forward_to_console=1"; + +pub const CONSOLE_TEST_STRING: &str = "Started OpenBSD Secure Shell server"; + +// Constant taken from the VMM crate. +pub const MAX_NUM_PCI_SEGMENTS: u16 = 96; + +#[cfg(target_arch = "x86_64")] +pub mod x86_64 { + pub const FOCAL_IMAGE_NAME: &str = "focal-server-cloudimg-amd64-custom-20210609-0.raw"; + pub const JAMMY_VFIO_IMAGE_NAME: &str = + "jammy-server-cloudimg-amd64-custom-vfio-20241012-0.raw"; + pub const FOCAL_IMAGE_NAME_VHD: &str = "focal-server-cloudimg-amd64-custom-20210609-0.vhd"; + pub const FOCAL_IMAGE_NAME_VHDX: &str = "focal-server-cloudimg-amd64-custom-20210609-0.vhdx"; + pub const JAMMY_IMAGE_NAME: &str = "jammy-server-cloudimg-amd64-custom-20241017-0.raw"; + pub const JAMMY_IMAGE_NAME_QCOW2: &str = "jammy-server-cloudimg-amd64-custom-20241017-0.qcow2"; + pub const JAMMY_IMAGE_NAME_QCOW2_ZLIB: &str = + "jammy-server-cloudimg-amd64-custom-20241017-0-zlib.qcow2"; + pub const JAMMY_IMAGE_NAME_QCOW2_ZSTD: &str = + "jammy-server-cloudimg-amd64-custom-20241017-0-zstd.qcow2"; + pub const JAMMY_IMAGE_NAME_QCOW2_BACKING_ZSTD_FILE: &str = + "jammy-server-cloudimg-amd64-custom-20241017-0-backing-zstd.qcow2"; + pub const JAMMY_IMAGE_NAME_QCOW2_BACKING_UNCOMPRESSED_FILE: &str = + "jammy-server-cloudimg-amd64-custom-20241017-0-backing-uncompressed.qcow2"; + pub const JAMMY_IMAGE_NAME_QCOW2_BACKING_RAW_FILE: 
&str = + "jammy-server-cloudimg-amd64-custom-20241017-0-backing-raw.qcow2"; + pub const WINDOWS_IMAGE_NAME: &str = "windows-server-2025-amd64-1.raw"; + pub const OVMF_NAME: &str = "CLOUDHV.fd"; + pub const GREP_SERIAL_IRQ_CMD: &str = "grep -c 'IO-APIC.*ttyS0' /proc/interrupts || true"; +} + +#[cfg(target_arch = "x86_64")] +pub use x86_64::*; + +#[cfg(target_arch = "aarch64")] +pub mod aarch64 { + pub const FOCAL_IMAGE_NAME: &str = "focal-server-cloudimg-arm64-custom-20210929-0.raw"; + pub const FOCAL_IMAGE_UPDATE_KERNEL_NAME: &str = + "focal-server-cloudimg-arm64-custom-20210929-0-update-kernel.raw"; + pub const FOCAL_IMAGE_NAME_VHD: &str = "focal-server-cloudimg-arm64-custom-20210929-0.vhd"; + pub const FOCAL_IMAGE_NAME_VHDX: &str = "focal-server-cloudimg-arm64-custom-20210929-0.vhdx"; + pub const JAMMY_IMAGE_NAME: &str = "jammy-server-cloudimg-arm64-custom-20220329-0.raw"; + pub const JAMMY_IMAGE_NAME_QCOW2: &str = "jammy-server-cloudimg-arm64-custom-20220329-0.qcow2"; + pub const JAMMY_IMAGE_NAME_QCOW2_ZLIB: &str = + "jammy-server-cloudimg-arm64-custom-20220329-0-zlib.qcow2"; + pub const JAMMY_IMAGE_NAME_QCOW2_ZSTD: &str = + "jammy-server-cloudimg-arm64-custom-20220329-0-zstd.qcow2"; + pub const JAMMY_IMAGE_NAME_QCOW2_BACKING_ZSTD_FILE: &str = + "jammy-server-cloudimg-arm64-custom-20220329-0-backing-zstd.qcow2"; + pub const JAMMY_IMAGE_NAME_QCOW2_BACKING_UNCOMPRESSED_FILE: &str = + "jammy-server-cloudimg-arm64-custom-20220329-0-backing-uncompressed.qcow2"; + pub const JAMMY_IMAGE_NAME_QCOW2_BACKING_RAW_FILE: &str = + "jammy-server-cloudimg-arm64-custom-20220329-0-backing-raw.qcow2"; + pub const WINDOWS_IMAGE_NAME: &str = "windows-11-iot-enterprise-aarch64.raw"; + pub const OVMF_NAME: &str = "CLOUDHV_EFI.fd"; + pub const GREP_SERIAL_IRQ_CMD: &str = "grep -c 'GICv3.*uart-pl011' /proc/interrupts || true"; + pub const GREP_PMU_IRQ_CMD: &str = "grep -c 'GICv3.*arm-pmu' /proc/interrupts || true"; +} + +#[cfg(target_arch = "aarch64")] +pub use aarch64::*; diff --git 
a/tpm/Cargo.toml b/tpm/Cargo.toml index 82dc8f79be..dd87f99371 100644 --- a/tpm/Cargo.toml +++ b/tpm/Cargo.toml @@ -3,13 +3,13 @@ authors = ["Microsoft Authors"] edition = "2021" license = "Apache-2.0" name = "tpm" +rust-version.workspace = true version = "0.1.0" [dependencies] anyhow = { workspace = true } libc = { workspace = true } log = { workspace = true } -net_gen = { path = "../net_gen" } thiserror = { workspace = true } vmm-sys-util = { workspace = true } diff --git a/tpm/src/emulator.rs b/tpm/src/emulator.rs index b27a069aa0..0ffd3a62e4 100644 --- a/tpm/src/emulator.rs +++ b/tpm/src/emulator.rs @@ -150,16 +150,16 @@ impl Emulator { // SAFETY: FFI calls and return value of the unsafe call is checked unsafe { - let tv = net_gen::iff::timeval { + let tv = libc::timeval { tv_sec: 0, tv_usec: 100000, // Set recv timeout to 100ms }; - let ret = net_gen::setsockopt( + let ret = libc::setsockopt( fds[0], - net_gen::iff::SOL_SOCKET as i32, - net_gen::iff::SO_RCVTIMEO as i32, + libc::SOL_SOCKET, + libc::SO_RCVTIMEO, &tv as *const _ as *const libc::c_void, - std::mem::size_of::() as u32, + std::mem::size_of::() as u32, ); if ret == -1 { return Err(Error::PrepareDataFd(anyhow!( diff --git a/tracer/Cargo.toml b/tracer/Cargo.toml index 1ac9f4e393..64f3399902 100644 --- a/tracer/Cargo.toml +++ b/tracer/Cargo.toml @@ -2,6 +2,7 @@ authors = ["The Cloud Hypervisor Authors"] edition.workspace = true name = "tracer" +rust-version.workspace = true version = "0.1.0" [dependencies] diff --git a/vhost_user_block/src/lib.rs b/vhost_user_block/src/lib.rs index 13668e1cdb..9b0e429ee4 100644 --- a/vhost_user_block/src/lib.rs +++ b/vhost_user_block/src/lib.rs @@ -12,6 +12,7 @@ use std::fs::{File, OpenOptions}; use std::io::{Read, Seek, SeekFrom, Write}; use std::ops::{Deref, DerefMut}; use std::os::unix::fs::OpenOptionsExt; +use std::os::unix::io::{FromRawFd, IntoRawFd}; use std::path::PathBuf; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::{Arc, Mutex, RwLock, 
RwLockWriteGuard}; @@ -34,6 +35,7 @@ use virtio_bindings::virtio_ring::VIRTIO_RING_F_EVENT_IDX; use virtio_queue::QueueT; use vm_memory::{ByteValued, Bytes, GuestAddressSpace, GuestMemoryAtomic}; use vmm_sys_util::epoll::EventSet; +use vmm_sys_util::event::{EventConsumer, EventNotifier}; use vmm_sys_util::eventfd::EventFd; type GuestMemoryMmap = vm_memory::GuestMemoryMmap; @@ -423,15 +425,15 @@ impl VhostUserBackendMut for VhostUserBlkBackend { Ok(()) } - fn exit_event(&self, thread_index: usize) -> Option { - Some( - self.threads[thread_index] - .lock() - .unwrap() - .kill_evt - .try_clone() - .unwrap(), - ) + fn exit_event(&self, thread_index: usize) -> Option<(EventConsumer, EventNotifier)> { + let kill_evt = &self.threads[thread_index].lock().unwrap().kill_evt; + // SAFETY: kill_evt is a valid eventfd + unsafe { + Some(( + EventConsumer::from_raw_fd(kill_evt.try_clone().unwrap().into_raw_fd()), + EventNotifier::from_raw_fd(kill_evt.try_clone().unwrap().into_raw_fd()), + )) + } } fn queues_per_thread(&self) -> Vec { @@ -533,14 +535,14 @@ pub fn start_block_backend(backend_command: &str) { debug!("blk_backend is created!\n"); - let listener = Listener::new(&backend_config.socket, true).unwrap(); + let mut listener = Listener::new(&backend_config.socket, true).unwrap(); let name = "vhost-user-blk-backend"; let mut blk_daemon = VhostUserDaemon::new(name.to_string(), blk_backend.clone(), mem).unwrap(); debug!("blk_daemon is created!\n"); - if let Err(e) = blk_daemon.start(listener) { + if let Err(e) = blk_daemon.start(&mut listener) { error!("Failed to start daemon for vhost-user-block with error: {e:?}\n"); process::exit(1); } diff --git a/vhost_user_net/src/lib.rs b/vhost_user_net/src/lib.rs index 0e89a763a8..254058abd3 100644 --- a/vhost_user_net/src/lib.rs +++ b/vhost_user_net/src/lib.rs @@ -8,7 +8,7 @@ use std::net::{IpAddr, Ipv4Addr}; use std::ops::Deref; -use std::os::unix::io::{AsRawFd, RawFd}; +use std::os::unix::io::{AsRawFd, FromRawFd, IntoRawFd, RawFd}; 
use std::sync::{Arc, Mutex, RwLock}; use std::{io, process}; @@ -27,6 +27,7 @@ use virtio_bindings::virtio_config::{VIRTIO_F_NOTIFY_ON_EMPTY, VIRTIO_F_VERSION_ use virtio_bindings::virtio_net::*; use vm_memory::{GuestAddressSpace, GuestMemoryAtomic}; use vmm_sys_util::epoll::EventSet; +use vmm_sys_util::event::{EventConsumer, EventNotifier}; use vmm_sys_util::eventfd::EventFd; type GuestMemoryMmap = vm_memory::GuestMemoryMmap; @@ -249,15 +250,15 @@ impl VhostUserBackendMut for VhostUserNetBackend { Ok(()) } - fn exit_event(&self, thread_index: usize) -> Option { - Some( - self.threads[thread_index] - .lock() - .unwrap() - .kill_evt - .try_clone() - .unwrap(), - ) + fn exit_event(&self, thread_index: usize) -> Option<(EventConsumer, EventNotifier)> { + let kill_evt = &self.threads[thread_index].lock().unwrap().kill_evt; + // SAFETY: kill_evt is a valid eventfd + unsafe { + Some(( + EventConsumer::from_raw_fd(kill_evt.try_clone().unwrap().into_raw_fd()), + EventNotifier::from_raw_fd(kill_evt.try_clone().unwrap().into_raw_fd()), + )) + } } fn queues_per_thread(&self) -> Vec { @@ -394,7 +395,7 @@ pub fn start_net_backend(backend_command: &str) { if let Err(e) = if backend_config.client { net_daemon.start_client(&backend_config.socket) } else { - net_daemon.start(Listener::new(&backend_config.socket, true).unwrap()) + net_daemon.start(&mut Listener::new(&backend_config.socket, true).unwrap()) } { error!("failed to start daemon for vhost-user-net with error: {e:?}"); process::exit(1); diff --git a/virtio-devices/Cargo.toml b/virtio-devices/Cargo.toml index 5cbfe145f4..d2658eeeca 100644 --- a/virtio-devices/Cargo.toml +++ b/virtio-devices/Cargo.toml @@ -2,6 +2,7 @@ authors = ["The Cloud Hypervisor Authors"] edition.workspace = true name = "virtio-devices" +rust-version.workspace = true version = "0.1.0" [features] @@ -16,6 +17,7 @@ block = { path = "../block" } byteorder = { workspace = true } epoll = { workspace = true } event_monitor = { path = "../event_monitor" } 
+hypervisor = { path = "../hypervisor" } libc = { workspace = true } log = { workspace = true } mshv-ioctls = { workspace = true, optional = true } diff --git a/virtio-devices/src/balloon.rs b/virtio-devices/src/balloon.rs index 3db6832617..f9db09bd33 100644 --- a/virtio-devices/src/balloon.rs +++ b/virtio-devices/src/balloon.rs @@ -34,14 +34,15 @@ use vm_memory::{ GuestMemoryError, GuestMemoryRegion, }; use vm_migration::{Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable}; +use vm_virtio::{AccessPlatform, Translatable}; use vmm_sys_util::eventfd::EventFd; use crate::seccomp_filters::Thread; use crate::thread_helper::spawn_virtio_thread; use crate::{ ActivateResult, EPOLL_HELPER_EVENT_LAST, EpollHelper, EpollHelperError, EpollHelperHandler, - GuestMemoryMmap, VIRTIO_F_VERSION_1, VirtioCommon, VirtioDevice, VirtioDeviceType, - VirtioInterrupt, VirtioInterruptType, + GuestMemoryMmap, VIRTIO_F_ACCESS_PLATFORM, VIRTIO_F_VERSION_1, VirtioCommon, VirtioDevice, + VirtioDeviceType, VirtioInterrupt, VirtioInterruptType, }; const QUEUE_SIZE: u16 = 128; @@ -160,6 +161,7 @@ struct BalloonEpollHandler { kill_evt: EventFd, pause_evt: EventFd, pbp: Option, + access_platform: Option>, } impl BalloonEpollHandler { @@ -277,7 +279,12 @@ impl BalloonEpollHandler { let mut offset = 0u64; while offset < desc.len() as u64 { - let addr = desc.addr().checked_add(offset).unwrap(); + let addr = desc + .addr() + .checked_add(offset) + .unwrap() + .translate_gva(self.access_platform.as_deref(), data_chunk_size) + .map_err(|e| Error::GuestMemory(GuestMemoryError::IOError(e)))?; let pfn: u32 = desc_chain .memory() .read_obj(addr) @@ -324,7 +331,11 @@ impl BalloonEpollHandler { let mut descs_len = 0; while let Some(desc) = desc_chain.next() { descs_len += desc.len(); - Self::release_memory_range(desc_chain.memory(), desc.addr(), desc.len() as usize)?; + let addr = desc + .addr() + .translate_gva(self.access_platform.as_deref(), desc.len() as usize) + .map_err(|e| 
Error::GuestMemory(GuestMemoryError::IOError(e)))?; + Self::release_memory_range(desc_chain.memory(), addr, desc.len() as usize)?; } self.queues[queue_index] @@ -437,11 +448,13 @@ pub struct Balloon { impl Balloon { // Create a new virtio-balloon. + #[allow(clippy::too_many_arguments)] pub fn new( id: String, size: u64, deflate_on_oom: bool, free_page_reporting: bool, + access_platform_enabled: bool, seccomp_action: SeccompAction, exit_evt: EventFd, state: Option, @@ -464,6 +477,9 @@ impl Balloon { if free_page_reporting { avail_features |= 1u64 << VIRTIO_BALLOON_F_REPORTING; } + if access_platform_enabled { + avail_features |= 1u64 << VIRTIO_F_ACCESS_PLATFORM; + } let config = VirtioBalloonConfig { num_pages: (size >> VIRTIO_BALLOON_PFN_SHIFT) as u32, @@ -590,12 +606,13 @@ impl VirtioDevice for Balloon { } } - fn activate( - &mut self, - mem: GuestMemoryAtomic, - interrupt_cb: Arc, - mut queues: Vec<(usize, Queue, EventFd)>, - ) -> ActivateResult { + fn activate(&mut self, context: crate::device::ActivationContext) -> ActivateResult { + let crate::device::ActivationContext { + mem, + interrupt_cb, + mut queues, + .. 
+ } = context; self.common.activate(&queues, interrupt_cb.clone())?; let (kill_evt, pause_evt) = self.common.dup_eventfds(); @@ -627,6 +644,7 @@ impl VirtioDevice for Balloon { kill_evt, pause_evt, pbp: None, + access_platform: self.common.access_platform(), }; let paused = self.common.paused.clone(); @@ -647,6 +665,14 @@ impl VirtioDevice for Balloon { Ok(()) } + fn set_access_platform(&mut self, access_platform: Arc) { + self.common.set_access_platform(access_platform); + } + + fn access_platform(&self) -> Option> { + self.common.access_platform() + } + fn reset(&mut self) -> Option> { let result = self.common.reset(); event!("virtio-device", "reset", "id", &self.id); diff --git a/virtio-devices/src/block.rs b/virtio-devices/src/block.rs index 0d2b1fb271..2e28c60da5 100644 --- a/virtio-devices/src/block.rs +++ b/virtio-devices/src/block.rs @@ -8,20 +8,24 @@ // // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause +use std::cmp::max; use std::collections::{BTreeMap, HashMap, VecDeque}; use std::num::Wrapping; use std::ops::Deref; use std::os::unix::io::AsRawFd; use std::path::PathBuf; -use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicU8, AtomicU64, Ordering}; use std::sync::{Arc, Barrier}; use std::{io, result}; use anyhow::anyhow; -use block::async_io::{AsyncIo, AsyncIoError, DiskFile, DiskFileError}; -use block::fcntl::{LockError, LockGranularity, LockType, get_lock_state}; +use block::async_io::{AsyncIo, AsyncIoError}; +use block::disk_file::DiskBackend; +use block::error::BlockError; +use block::fcntl::{LockError, LockGranularity, LockGranularityChoice, LockType, get_lock_state}; use block::{ - ExecuteAsync, ExecuteError, Request, RequestType, VirtioBlockConfig, build_serial, fcntl, + ExecuteAsync, ExecuteError, MAX_DISCARD_WRITE_ZEROES_SEG, Request, RequestType, + VirtioBlockConfig, build_serial, fcntl, }; use event_monitor::event; use log::{debug, error, info, warn}; @@ -104,7 +108,7 @@ pub enum Error { 
#[error("Failed signal config interrupt")] ConfigChange(#[source] io::Error), #[error("Disk resize failed")] - DiskResize(#[source] DiskFileError), + DiskResize(#[source] BlockError), } pub type Result = result::Result; @@ -161,6 +165,7 @@ struct BlockEpollHandler { host_cpus: Option>, acked_features: u64, disable_sector0_writes: bool, + device_status: Arc, } fn has_feature(features: u64, feature_flag: u64) -> bool { @@ -168,6 +173,10 @@ fn has_feature(features: u64, feature_flag: u64) -> bool { } impl BlockEpollHandler { + fn needs_reset(&self) -> bool { + (self.device_status.load(Ordering::Acquire) & crate::DEVICE_NEEDS_RESET as u8) != 0 + } + fn check_request( features: u64, request: &Request, @@ -182,22 +191,64 @@ impl BlockEpollHandler { // For virtio spec compliance // "A device MUST set the status byte to VIRTIO_BLK_S_IOERR for a write request // if the VIRTIO_BLK_F_RO feature if offered, and MUST NOT write any data." + warn!( + "Rejecting block request {request_type:?}: device is read-only (VIRTIO_BLK_F_RO negotiated)" + ); return Err(ExecuteError::ReadOnly); } if request_type == RequestType::Out && disable_sector0_writes && request.sector == 0 { + warn!( + "Attempting to write to sector 0 on a raw disk without specifying image_type=raw" + ); return Err(ExecuteError::ReadOnly); } Ok(()) } + fn handle_queue_iterator_error(&mut self, err: &virtio_queue::Error) { + // The guest submitted a corrupted VirtQ request, and the error + // was logged during queue processing. We cannot just ignore the + // error, as the guest could continue spamming the VMM with bad + // requests, triggering excessive error logging. So we mark + // the device "NEEDS_RESET", effectively stopping all request + // processing (see self.needs_reset() usage) until the guest + // resets and reactivates the device. + + warn!( + "Corrupted request detected (virtqueue error: {err:?}). \ +Setting device status to 'NEEDS_RESET' and stopping processing queues until reset." 
+ ); + + self.device_status + .fetch_or(crate::DEVICE_NEEDS_RESET as u8, Ordering::SeqCst); + + // Let the guest know that the device status has changed. + if let Err(e) = self.interrupt_cb.trigger(VirtioInterruptType::Config) { + error!("Failed to signal config interrupt: {e:?}"); + } + } + fn process_queue_submit(&mut self) -> Result<()> { + if self.needs_reset() { + return Ok(()); + } let queue = &mut self.queue; let mut batch_requests = Vec::new(); let mut batch_inflight_requests = Vec::new(); - while let Some(mut desc_chain) = queue.pop_descriptor_chain(self.mem.memory()) { + loop { + let mut desc_chain = match queue.iter(self.mem.memory()) { + Ok(mut iter) => match iter.next() { + Some(c) => c, + None => break, + }, + Err(err) => { + self.handle_queue_iterator_error(&err); + return Ok(()); + } + }; let mut request = Request::parse(&mut desc_chain, self.access_platform.as_deref()) .map_err(Error::RequestParsing)?; @@ -289,7 +340,7 @@ impl BlockEpollHandler { Ok(_) => VIRTIO_BLK_S_OK, Err(e) => { warn!("Request failed: {request:x?} {e:?}"); - VIRTIO_BLK_S_IOERR + e.status() as u32 } }; @@ -380,6 +431,9 @@ impl BlockEpollHandler { } fn process_queue_complete(&mut self) -> Result<()> { + if self.needs_reset() { + return Ok(()); + } let mem = self.mem.memory(); let mut read_bytes = Wrapping(0); let mut write_bytes = Wrapping(0); @@ -607,17 +661,14 @@ impl EpollHelperHandler for BlockEpollHandler { )) })?; + self.try_signal_used_queue()?; + let rate_limit_reached = self.rate_limiter.as_ref().is_some_and(|r| r.is_blocked()); // Process the queue only when the rate limit is not reached if !rate_limit_reached { - self.process_queue_submit().map_err(|e| { - EpollHelperError::HandleEvent(anyhow!( - "Failed to process queue (submit): {e:?}" - )) - })?; + self.process_queue_submit_and_signal()?; } - self.try_signal_used_queue()?; } RATE_LIMITER_EVENT => { if let Some(rate_limiter) = &mut self.rate_limiter { @@ -650,7 +701,7 @@ impl EpollHelperHandler for BlockEpollHandler 
{ pub struct Block { common: VirtioCommon, id: String, - disk_image: Box, + disk_image: DiskBackend, disk_path: PathBuf, disk_nsectors: Arc, config: VirtioBlockConfig, @@ -662,6 +713,8 @@ pub struct Block { serial: Vec, queue_affinity: BTreeMap>, disable_sector0_writes: bool, + lock_granularity_choice: LockGranularityChoice, + device_status: Arc, } #[derive(Serialize, Deserialize)] @@ -678,10 +731,10 @@ impl Block { #[allow(clippy::too_many_arguments)] pub fn new( id: String, - mut disk_image: Box, + mut disk_image: DiskBackend, disk_path: PathBuf, read_only: bool, - iommu: bool, + access_platform_enabled: bool, num_queues: usize, queue_size: u16, serial: Option, @@ -692,6 +745,7 @@ impl Block { queue_affinity: BTreeMap>, sparse: bool, disable_sector0_writes: bool, + lock_granularity: LockGranularityChoice, ) -> io::Result { let (disk_nsectors, avail_features, acked_features, config, paused) = if let Some(state) = state { @@ -724,20 +778,23 @@ impl Block { | (1u64 << VIRTIO_RING_F_INDIRECT_DESC); // When backend supports sparse operations: - // - Always advertise WRITE_ZEROES - // - Advertise DISCARD only if sparse=true OR format supports marking - // clusters as zero without deallocating + // - Always advertise WRITE_ZEROES (safe for all drivers) + // - Advertise DISCARD only when sparse=true, since DISCARD + // deallocates space via punch_hole and should require + // explicit user opt in. 
+ let mut discard_supported = false; if disk_image.supports_sparse_operations() { avail_features |= 1u64 << VIRTIO_BLK_F_WRITE_ZEROES; - if sparse || disk_image.supports_zero_flag() { + if sparse { avail_features |= 1u64 << VIRTIO_BLK_F_DISCARD; + discard_supported = true; } } else if sparse { warn!("sparse=on requested but backend does not support sparse operations"); } - if iommu { - avail_features |= 1u64 << VIRTIO_F_IOMMU_PLATFORM; + if access_platform_enabled { + avail_features |= 1u64 << VIRTIO_F_ACCESS_PLATFORM; } if read_only { @@ -773,6 +830,17 @@ impl Block { ..Default::default() }; + if avail_features & (1u64 << VIRTIO_BLK_F_WRITE_ZEROES) != 0 { + config.max_write_zeroes_sectors = u32::MAX; + config.max_write_zeroes_seg = MAX_DISCARD_WRITE_ZEROES_SEG; + config.write_zeroes_may_unmap = if discard_supported { 1 } else { 0 }; + } + if avail_features & (1u64 << VIRTIO_BLK_F_DISCARD) != 0 { + config.max_discard_sectors = u32::MAX; + config.max_discard_seg = MAX_DISCARD_WRITE_ZEROES_SEG; + config.discard_sector_alignment = (logical_block_size / SECTOR_SIZE) as u32; + } + if num_queues > 1 { avail_features |= 1u64 << VIRTIO_BLK_F_MQ; config.num_queues = num_queues as u16; @@ -807,6 +875,8 @@ impl Block { serial, queue_affinity, disable_sector0_writes, + lock_granularity_choice: lock_granularity, + device_status: Arc::new(AtomicU8::new(0)), }) } @@ -815,23 +885,32 @@ impl Block { } /// Returns the granularity for the advisory lock for this disk. - // TODO In future, we could add a `lock_granularity=` configuration to the CLI. - // For now, we stick to QEMU behavior. 
fn lock_granularity(&mut self) -> LockGranularity { - self.disk_image.physical_size().map_or_else( - // use a safe fallback - |e| { - let fallback = LockGranularity::WholeFile; - warn!( - "Can't get disk size for id={},path={}, falling back to {:?}: error: {e}", - self.id, - self.disk_path.display(), - fallback - ); - fallback - }, - |size| LockGranularity::ByteRange(0, size), - ) + match self.lock_granularity_choice { + LockGranularityChoice::Full => LockGranularity::WholeFile, + LockGranularityChoice::ByteRange => { + // Byte range lock covering [0, max(logical, physical)) + // logical > physical for sparse files, physical > logical + // for small dense files due to filesystem block rounding. + let logical = self.disk_image.logical_size(); + let physical = self.disk_image.physical_size(); + match (logical, physical) { + (Ok(l), Ok(p)) => LockGranularity::ByteRange(0, max(l, p)), + (Ok(l), Err(_)) => LockGranularity::ByteRange(0, l), + (Err(_), Ok(p)) => LockGranularity::ByteRange(0, p), + (Err(e), Err(_)) => { + let fallback = LockGranularity::WholeFile; + warn!( + "Can't get disk size for id={},path={}, falling back to {:?}: error: {e}", + self.id, + self.disk_path.display(), + fallback + ); + fallback + } + } + } + } } /// Tries to set an advisory lock for the corresponding disk image. @@ -895,24 +974,24 @@ impl Block { } } - fn update_writeback(&mut self) { - // Use writeback from config if VIRTIO_BLK_F_CONFIG_WCE - let writeback = if self.common.feature_acked(VIRTIO_BLK_F_CONFIG_WCE.into()) { - self.config.writeback == 1 - } else { - // Else check if VIRTIO_BLK_F_FLUSH negotiated - self.common.feature_acked(VIRTIO_BLK_F_FLUSH.into()) - }; + /// The virtio v1.2 spec says "If VIRTIO_BLK_F_CONFIG_WCE was not + /// negotiated but VIRTIO_BLK_F_FLUSH was, the driver SHOULD assume + /// presence of a writeback cache." It also says "If + /// VIRTIO_BLK_F_CONFIG_WCE is negotiated but VIRTIO_BLK_F_FLUSH is not, + /// the device MUST initialize writeback to 0." 
+ fn is_writeback_enabled(&self, desired: bool) -> bool { + let flush = self.common.feature_acked(VIRTIO_BLK_F_FLUSH.into()); + let wce = self.common.feature_acked(VIRTIO_BLK_F_CONFIG_WCE.into()); + if wce { flush && desired } else { flush } + } + fn set_writeback_mode(&mut self, enabled: bool) { + self.config.writeback = enabled as u8; + self.writeback.store(enabled, Ordering::Release); info!( "Changing cache mode to {}", - if writeback { - "writeback" - } else { - "writethrough" - } + if enabled { "writeback" } else { "writethrough" } ); - self.writeback.store(writeback, Ordering::Release); } pub fn resize(&mut self, new_size: u64) -> Result<()> { @@ -994,16 +1073,18 @@ impl VirtioDevice for Block { return; } - self.config.writeback = data[0]; - self.update_writeback(); + let writeback = self.is_writeback_enabled(data[0] == 1); + self.set_writeback_mode(writeback); } - fn activate( - &mut self, - mem: GuestMemoryAtomic, - interrupt_cb: Arc, - mut queues: Vec<(usize, Queue, EventFd)>, - ) -> ActivateResult { + fn activate(&mut self, context: crate::device::ActivationContext) -> ActivateResult { + let crate::device::ActivationContext { + mem, + interrupt_cb, + mut queues, + device_status, + } = context; + self.device_status = device_status; // See if the guest didn't ack the device being read-only. // If so, warn and pretend it did. let original_acked_features = self.common.acked_features; @@ -1013,7 +1094,11 @@ impl VirtioDevice for Block { } self.common.activate(&queues, interrupt_cb.clone())?; - self.update_writeback(); + // Recompute the barrier size from the queues that are actually activated. 
+ self.common.paused_sync = Some(Arc::new(Barrier::new(queues.len() + 1))); + + let writeback = self.is_writeback_enabled(self.config.writeback == 1); + self.set_writeback_mode(writeback); let mut epoll_threads = Vec::new(); let event_idx = self.common.feature_acked(VIRTIO_RING_F_EVENT_IDX.into()); @@ -1055,10 +1140,11 @@ impl VirtioDevice for Block { .map(|r| r.new_handle()) .transpose() .unwrap(), - access_platform: self.common.access_platform.clone(), + access_platform: self.common.access_platform(), host_cpus: self.queue_affinity.get(&queue_idx).cloned(), acked_features: self.common.acked_features, disable_sector0_writes: self.disable_sector0_writes, + device_status: self.device_status.clone(), }; let paused = self.common.paused.clone(); @@ -1082,6 +1168,7 @@ impl VirtioDevice for Block { fn reset(&mut self) -> Option> { let result = self.common.reset(); + self.set_writeback_mode(true); event!("virtio-device", "reset", "id", &self.id); result } @@ -1136,6 +1223,10 @@ impl VirtioDevice for Block { fn set_access_platform(&mut self, access_platform: Arc) { self.common.set_access_platform(access_platform); } + + fn access_platform(&self) -> Option> { + self.common.access_platform() + } } impl Pausable for Block { diff --git a/virtio-devices/src/console.rs b/virtio-devices/src/console.rs index c8a9f08a02..0a4cf65cff 100644 --- a/virtio-devices/src/console.rs +++ b/virtio-devices/src/console.rs @@ -18,14 +18,14 @@ use serde::{Deserialize, Serialize}; use serial_buffer::SerialBuffer; use thiserror::Error; use virtio_queue::{Queue, QueueT}; -use vm_memory::{ByteValued, Bytes, GuestAddressSpace, GuestMemory, GuestMemoryAtomic}; +use vm_memory::{ByteValued, Bytes, GuestAddressSpace, GuestMemoryAtomic}; use vm_migration::{Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable}; use vm_virtio::{AccessPlatform, Translatable}; use vmm_sys_util::eventfd::EventFd; use super::{ ActivateResult, EPOLL_HELPER_EVENT_LAST, EpollHelper, EpollHelperError, 
EpollHelperHandler, - Error as DeviceError, VIRTIO_F_IOMMU_PLATFORM, VIRTIO_F_VERSION_1, VirtioCommon, VirtioDevice, + Error as DeviceError, VIRTIO_F_ACCESS_PLATFORM, VIRTIO_F_VERSION_1, VirtioCommon, VirtioDevice, VirtioDeviceType, VirtioInterruptType, }; use crate::seccomp_filters::Thread; @@ -51,8 +51,6 @@ const VIRTIO_CONSOLE_F_SIZE: u64 = 0; #[derive(Error, Debug)] enum Error { - #[error("Descriptor chain too short")] - DescriptorChainTooShort, #[error("Failed to read from guest memory")] GuestMemoryRead(#[source] vm_memory::guest_memory::Error), #[error("Failed to write to guest memory")] @@ -210,21 +208,31 @@ impl ConsoleEpollHandler { } while let Some(mut desc_chain) = recv_queue.pop_descriptor_chain(self.mem.memory()) { - let desc = desc_chain.next().ok_or(Error::DescriptorChainTooShort)?; - let len = cmp::min(desc.len(), in_buffer.len() as u32); - let source_slice = in_buffer.drain(..len as usize).collect::>(); - - desc_chain - .memory() - .write_slice( - &source_slice[..], - desc.addr() - .translate_gva(self.access_platform.as_deref(), desc.len() as usize), - ) - .map_err(Error::GuestMemoryWrite)?; + let mut total_len = 0; + while let Some(desc) = desc_chain.next() { + if in_buffer.is_empty() { + break; + } + let len = cmp::min(desc.len(), in_buffer.len() as u32); + let source_slice = in_buffer.drain(..len as usize).collect::>(); + + desc_chain + .memory() + .write_slice( + &source_slice[..], + desc.addr() + .translate_gva(self.access_platform.as_deref(), desc.len() as usize) + .map_err(|e| { + Error::GuestMemoryWrite(vm_memory::GuestMemoryError::IOError(e)) + })?, + ) + .map_err(Error::GuestMemoryWrite)?; + + total_len += len; + } recv_queue - .add_used(desc_chain.memory(), desc_chain.head_index(), len) + .add_used(desc_chain.memory(), desc_chain.head_index(), total_len) .map_err(Error::QueueAddUsed)?; used_descs = true; @@ -248,28 +256,35 @@ impl ConsoleEpollHandler { let mut used_descs = false; while let Some(mut desc_chain) = 
trans_queue.pop_descriptor_chain(self.mem.memory()) { - let desc = desc_chain.next().ok_or(Error::DescriptorChainTooShort)?; - if let Some(out) = &mut self.out { - let mut buf: Vec = Vec::new(); - desc_chain - .memory() - .write_volatile_to( - desc.addr() - .translate_gva(self.access_platform.as_deref(), desc.len() as usize), - &mut buf, - desc.len() as usize, - ) - .map_err(Error::GuestMemoryRead)?; - - out.write_all(&buf).map_err(Error::OutputWriteAll)?; - out.flush().map_err(Error::OutputFlush)?; + while let Some(desc) = desc_chain.next() { + if let Some(out) = &mut self.out { + let mut buf: Vec = Vec::new(); + desc_chain + .memory() + .write_volatile_to( + desc.addr() + .translate_gva(self.access_platform.as_deref(), desc.len() as usize) + .map_err(|e| { + Error::GuestMemoryRead(vm_memory::GuestMemoryError::IOError(e)) + })?, + &mut buf, + desc.len() as usize, + ) + .map_err(Error::GuestMemoryRead)?; + + out.write_all(&buf).map_err(Error::OutputWriteAll)?; + } } trans_queue - .add_used(desc_chain.memory(), desc_chain.head_index(), desc.len()) + .add_used(desc_chain.memory(), desc_chain.head_index(), 0) .map_err(Error::QueueAddUsed)?; used_descs = true; } + if used_descs && let Some(out) = &mut self.out { + out.flush().map_err(Error::OutputFlush)?; + } + Ok(used_descs) } @@ -526,11 +541,7 @@ impl ConsoleResizer { if let Some(tty) = self.tty.as_ref() { let (cols, rows) = get_win_size(tty); self.config.lock().unwrap().update_console_size(cols, rows); - if self - .acked_features - .fetch_and(1u64 << VIRTIO_CONSOLE_F_SIZE, Ordering::AcqRel) - != 0 - { + if self.acked_features.load(Ordering::Acquire) & (1u64 << VIRTIO_CONSOLE_F_SIZE) != 0 { // Send the interrupt to the driver let _ = self.config_evt.write(1); } @@ -591,7 +602,7 @@ impl Console { id: String, endpoint: Endpoint, resize_pipe: Option, - iommu: bool, + access_platform_enabled: bool, seccomp_action: SeccompAction, exit_evt: EventFd, state: Option, @@ -608,8 +619,8 @@ impl Console { ) } else { let mut 
avail_features = (1u64 << VIRTIO_F_VERSION_1) | (1u64 << VIRTIO_CONSOLE_F_SIZE); - if iommu { - avail_features |= 1u64 << VIRTIO_F_IOMMU_PLATFORM; + if access_platform_enabled { + avail_features |= 1u64 << VIRTIO_F_ACCESS_PLATFORM; } ( @@ -703,12 +714,13 @@ impl VirtioDevice for Console { self.read_config_from_slice(self.config.lock().unwrap().as_slice(), offset, data); } - fn activate( - &mut self, - mem: GuestMemoryAtomic, - interrupt_cb: Arc, - mut queues: Vec<(usize, Queue, EventFd)>, - ) -> ActivateResult { + fn activate(&mut self, context: crate::device::ActivationContext) -> ActivateResult { + let crate::device::ActivationContext { + mem, + interrupt_cb, + mut queues, + .. + } = context; self.common.activate(&queues, interrupt_cb.clone())?; self.resizer .acked_features @@ -739,7 +751,7 @@ impl VirtioDevice for Console { self.resize_pipe.as_ref().map(|p| p.try_clone().unwrap()), kill_evt, pause_evt, - self.common.access_platform.clone(), + self.common.access_platform(), ); let paused = self.common.paused.clone(); @@ -770,6 +782,10 @@ impl VirtioDevice for Console { fn set_access_platform(&mut self, access_platform: Arc) { self.common.set_access_platform(access_platform); } + + fn access_platform(&self) -> Option> { + self.common.access_platform() + } } impl Pausable for Console { diff --git a/virtio-devices/src/device.rs b/virtio-devices/src/device.rs index f0ed28f517..89e1ee2eaf 100644 --- a/virtio-devices/src/device.rs +++ b/virtio-devices/src/device.rs @@ -9,12 +9,14 @@ use std::collections::HashMap; use std::io::Write; use std::num::Wrapping; -use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicU8, Ordering}; use std::sync::{Arc, Barrier}; use std::thread; +use anyhow::anyhow; use libc::EFD_NONBLOCK; use log::{error, info, warn}; +use virtio_bindings::virtio_config::VIRTIO_F_ACCESS_PLATFORM; use virtio_queue::Queue; use vm_device::UserspaceMapping; use vm_memory::{GuestAddress, GuestMemoryAtomic}; @@ -37,6 +39,12 @@ 
pub trait VirtioInterrupt: Send + Sync { fn notifier(&self, _int_type: VirtioInterruptType) -> Option { None } + fn set_notifier( + &self, + int_type: u32, + notifier: Option, + vm: &dyn hypervisor::Vm, + ) -> std::io::Result<()>; } #[derive(Clone)] @@ -53,6 +61,13 @@ pub struct VirtioSharedMemoryList { pub region_list: Vec, } +pub struct ActivationContext { + pub mem: GuestMemoryAtomic, + pub interrupt_cb: Arc, + pub queues: Vec<(usize, Queue, EventFd)>, + pub device_status: Arc, +} + /// Trait for virtio devices to be driven by a virtio transport. /// /// The lifecycle of a virtio device is to be moved to a virtio transport, which will then query the @@ -67,6 +82,18 @@ pub trait VirtioDevice: Send { /// The maximum size of each queue that this device supports. fn queue_max_sizes(&self) -> &[u16]; + /// Whether the device needs to register extra irqfds at runtime + /// from external sources. + /// The default is false. If this is true, locking is required for + /// most operations involving interrupts (but not for sending) + /// interrupts from external irqfds). + /// + /// If the device claims to not need to register irqfds, but + /// attempts to do so, a panic will ensue. + fn interrupt_source_mutable(&self) -> bool { + false + } + /// The set of feature bits that this device supports. fn features(&self) -> u64 { 0 @@ -94,12 +121,7 @@ pub trait VirtioDevice: Send { } /// Activates this device for real usage. - fn activate( - &mut self, - mem: GuestMemoryAtomic, - interrupt_evt: Arc, - queues: Vec<(usize, Queue, EventFd)>, - ) -> ActivateResult; + fn activate(&mut self, context: ActivationContext) -> ActivateResult; /// Optionally deactivates this device and returns ownership of the guest memory map, interrupt /// event, and queue events. @@ -166,6 +188,12 @@ pub trait VirtioDevice: Send { /// Set the access platform trait to let the device perform address /// translations if needed. 
fn set_access_platform(&mut self, _access_platform: Arc) {} + + /// Returns the access platform only if VIRTIO_F_ACCESS_PLATFORM was + /// negotiated with the guest. + fn access_platform(&self) -> Option> { + None + } } /// Trait to define address translation for devices managed by virtio-iommu @@ -195,6 +223,7 @@ pub struct VirtioCommon { pub paused_sync: Option>, pub epoll_threads: Option>>, pub queue_sizes: Vec, + pub queue_evts: Vec, pub device_type: u32, pub min_queues: u16, pub access_platform: Option>, @@ -232,6 +261,16 @@ impl VirtioCommon { return Err(ActivateError::BadActivate); } + self.queue_evts = queues + .iter() + .map(|(_, _, queue_evt)| { + queue_evt.try_clone().map_err(|e| { + error!("failed cloning queue EventFd: {e}"); + ActivateError::BadActivate + }) + }) + .collect::, _>>()?; + let kill_evt = EventFd::new(EFD_NONBLOCK).map_err(|e| { error!("failed creating kill EventFd: {e}"); ActivateError::BadActivate @@ -252,6 +291,8 @@ impl VirtioCommon { } pub fn reset(&mut self) -> Option> { + self.queue_evts.clear(); + // We first must resume the virtio thread if it was paused. if self.pause_evt.take().is_some() { self.resume().ok()?; @@ -298,6 +339,15 @@ impl VirtioCommon { // requires the addresses held by the descriptors to be translated. self.avail_features &= !(1 << VIRTIO_F_RING_INDIRECT_DESC); } + + /// Returns the access platform only if the feature has been acked. + pub fn access_platform(&self) -> Option> { + if self.feature_acked(VIRTIO_F_ACCESS_PLATFORM as u64) { + self.access_platform.clone() + } else { + None + } + } } impl Pausable for VirtioCommon { @@ -306,7 +356,13 @@ impl Pausable for VirtioCommon { "Pausing virtio-{}", VirtioDeviceType::from(self.device_type) ); - self.paused.store(true, Ordering::SeqCst); + + // If already paused, return early to avoid deadlock waiting on barrier + // for worker threads that are already parked. 
+ if self.paused.swap(true, Ordering::SeqCst) { + return Ok(()); + } + if let Some(pause_evt) = &self.pause_evt { pause_evt .write(1) @@ -335,6 +391,25 @@ impl Pausable for VirtioCommon { } } + // Signal each activated queue eventfd so workers process restored queues + // that may already contain pending requests. + for queue_evt in &self.queue_evts { + queue_evt.write(1).map_err(|e| { + MigratableError::Resume(anyhow!( + "Could not notify restored virtio worker on resume: {e}" + )) + })?; + } + + // Also trigger interrupts into the guest to wake up the driver to avoid a "livelock" + if let Some(interrupt_cb) = &self.interrupt_cb { + for i in 0..self.queue_evts.len() { + interrupt_cb + .trigger(crate::VirtioInterruptType::Queue(i as u16)) + .ok(); + } + } + Ok(()) } } diff --git a/virtio-devices/src/iommu.rs b/virtio-devices/src/iommu.rs index f4812b04fb..513d510b56 100644 --- a/virtio-devices/src/iommu.rs +++ b/virtio-devices/src/iommu.rs @@ -98,7 +98,7 @@ struct VirtioIommuConfig { domain_range: VirtioIommuRange32, probe_size: u32, bypass: u8, - _reserved: [u8; 7], + _reserved: [u8; 3], } /// Virtio IOMMU request type @@ -1075,12 +1075,13 @@ impl VirtioDevice for Iommu { self.update_bypass(); } - fn activate( - &mut self, - mem: GuestMemoryAtomic, - interrupt_cb: Arc, - mut queues: Vec<(usize, Queue, EventFd)>, - ) -> ActivateResult { + fn activate(&mut self, context: crate::device::ActivationContext) -> ActivateResult { + let crate::device::ActivationContext { + mem, + interrupt_cb, + mut queues, + .. 
+ } = context; self.common.activate(&queues, interrupt_cb.clone())?; let (kill_evt, pause_evt) = self.common.dup_eventfds(); diff --git a/virtio-devices/src/lib.rs b/virtio-devices/src/lib.rs index da4f1c91be..d57673ad34 100644 --- a/virtio-devices/src/lib.rs +++ b/virtio-devices/src/lib.rs @@ -42,8 +42,8 @@ pub use self::balloon::Balloon; pub use self::block::{Block, BlockState}; pub use self::console::{Console, ConsoleResizer, Endpoint}; pub use self::device::{ - DmaRemapping, VirtioCommon, VirtioDevice, VirtioInterrupt, VirtioInterruptType, - VirtioSharedMemoryList, + ActivationContext, DmaRemapping, VirtioCommon, VirtioDevice, VirtioInterrupt, + VirtioInterruptType, VirtioSharedMemoryList, }; pub use self::epoll_helper::{ EPOLL_HELPER_EVENT_LAST, EpollHelper, EpollHelperError, EpollHelperHandler, @@ -66,12 +66,13 @@ const DEVICE_ACKNOWLEDGE: u32 = 0x01; const DEVICE_DRIVER: u32 = 0x02; const DEVICE_DRIVER_OK: u32 = 0x04; const DEVICE_FEATURES_OK: u32 = 0x08; +const DEVICE_NEEDS_RESET: u32 = 0x40; const DEVICE_FAILED: u32 = 0x80; const VIRTIO_F_RING_INDIRECT_DESC: u32 = 28; const VIRTIO_F_RING_EVENT_IDX: u32 = 29; const VIRTIO_F_VERSION_1: u32 = 32; -const VIRTIO_F_IOMMU_PLATFORM: u32 = 33; +const VIRTIO_F_ACCESS_PLATFORM: u32 = 33; const VIRTIO_F_IN_ORDER: u32 = 35; const VIRTIO_F_ORDER_PLATFORM: u32 = 36; #[allow(dead_code)] @@ -167,9 +168,7 @@ pub fn get_host_address_range( if mem.check_range(addr, size) { let slice = mem.get_slice(addr, size).unwrap(); assert!(slice.len() >= size); - // TODO: return a VolatileSlice and fix all callers. 
- #[allow(deprecated)] - Some(slice.as_ptr()) + Some(slice.ptr_guard_mut().as_ptr()) } else { None } diff --git a/virtio-devices/src/mem.rs b/virtio-devices/src/mem.rs index 936fdbe42a..067100164e 100644 --- a/virtio-devices/src/mem.rs +++ b/virtio-devices/src/mem.rs @@ -834,6 +834,10 @@ impl Mem { }) } + pub fn plugged_size(&self) -> u64 { + self.config.lock().unwrap().plugged_size + } + pub fn resize(&mut self, size: u64) -> result::Result<(), Error> { let mut config = self.config.lock().unwrap(); config.resize(size).map_err(|e| { @@ -950,12 +954,13 @@ impl VirtioDevice for Mem { self.read_config_from_slice(self.config.lock().unwrap().as_slice(), offset, data); } - fn activate( - &mut self, - mem: GuestMemoryAtomic, - interrupt_cb: Arc, - mut queues: Vec<(usize, Queue, EventFd)>, - ) -> ActivateResult { + fn activate(&mut self, context: crate::device::ActivationContext) -> ActivateResult { + let crate::device::ActivationContext { + mem, + interrupt_cb, + mut queues, + .. + } = context; self.common.activate(&queues, interrupt_cb.clone())?; let (kill_evt, pause_evt) = self.common.dup_eventfds(); diff --git a/virtio-devices/src/net.rs b/virtio-devices/src/net.rs index 8eee661341..d7e1d1f361 100644 --- a/virtio-devices/src/net.rs +++ b/virtio-devices/src/net.rs @@ -10,13 +10,13 @@ use std::net::IpAddr; use std::num::Wrapping; use std::ops::Deref; use std::os::unix::io::{AsRawFd, RawFd}; -use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicU8, Ordering}; use std::sync::{Arc, Barrier}; use std::{result, thread}; use anyhow::anyhow; use event_monitor::event; -use log::{debug, error, info}; +use log::{debug, error, info, warn}; #[cfg(not(fuzzing))] use net_util::virtio_features_to_tap_offload; use net_util::{ @@ -174,11 +174,7 @@ struct NetEpollHandler { queue_index_base: u16, queue_pair: (Queue, Queue), queue_evt_pair: (EventFd, EventFd), - // Always generate interrupts until the driver has signalled to the device. 
- // This mitigates a problem with interrupts from tap events being "lost" upon - // a restore as the vCPU thread isn't ready to handle the interrupt. This causes - // issues when combined with VIRTIO_RING_F_EVENT_IDX interrupt suppression. - driver_awake: bool, + device_status: Arc, } impl NetEpollHandler { @@ -192,6 +188,9 @@ impl NetEpollHandler { } fn handle_rx_event(&mut self) -> result::Result<(), DeviceError> { + if self.needs_reset() { + return Ok(()); + } let queue_evt = &self.queue_evt_pair.0; if let Err(e) = queue_evt.read() { error!("Failed to get rx queue event: {e:?}"); @@ -220,13 +219,43 @@ impl NetEpollHandler { Ok(()) } + fn handle_queue_iterator_error(&mut self, err: &virtio_queue::Error) { + // The guest submitted a corrupted VirtQ request, and the error + // was logged during queue processing. We cannot just ignore the + // error, as the guest could continue spamming the VMM with bad + // requests, triggering excessive error logging. So we mark + // the device "NEEDS_RESET", effectively stopping all request + // processing (see self.needs_reset() usage) until the guest + // resets and reactivates the device. + + warn!( + "Corrupted request detected (virtqueue error: {err:?}). \ +Setting device status to 'NEEDS_RESET' and stopping processing queues until reset." + ); + + self.device_status + .fetch_or(crate::DEVICE_NEEDS_RESET as u8, Ordering::SeqCst); + + // Let the guest know that the device status has changed. + if let Err(e) = self.interrupt_cb.trigger(VirtioInterruptType::Config) { + error!("Failed to signal config interrupt: {e:?}"); + } + } + fn process_tx(&mut self) -> result::Result<(), DeviceError> { - if self + if self.needs_reset() { + return Ok(()); + } + let res = self .net - .process_tx(&self.mem.memory(), &mut self.queue_pair.1) - .map_err(DeviceError::NetQueuePair)? 
- || !self.driver_awake - { + .process_tx(&self.mem.memory(), &mut self.queue_pair.1); + + if let Err(net_util::NetQueuePairError::QueueIteratorFailed(err)) = res { + self.handle_queue_iterator_error(&err); + return Ok(()); + } + + if res.map_err(DeviceError::NetQueuePair)? { self.signal_used_queue(self.queue_index_base + 1)?; debug!("Signalling TX queue"); } else { @@ -250,12 +279,19 @@ impl NetEpollHandler { } fn handle_rx_tap_event(&mut self) -> result::Result<(), DeviceError> { - if self + if self.needs_reset() { + return Ok(()); + } + let res = self .net - .process_rx(&self.mem.memory(), &mut self.queue_pair.0) - .map_err(DeviceError::NetQueuePair)? - || !self.driver_awake - { + .process_rx(&self.mem.memory(), &mut self.queue_pair.0); + + if let Err(net_util::NetQueuePairError::QueueIteratorFailed(err)) = res { + self.handle_queue_iterator_error(&err); + return Ok(()); + } + + if res.map_err(DeviceError::NetQueuePair)? { self.signal_used_queue(self.queue_index_base)?; debug!("Signalling RX queue"); } else { @@ -305,6 +341,10 @@ impl NetEpollHandler { Ok(()) } + + fn needs_reset(&self) -> bool { + (self.device_status.load(Ordering::Acquire) & crate::DEVICE_NEEDS_RESET as u8) != 0 + } } impl EpollHelperHandler for NetEpollHandler { @@ -316,7 +356,6 @@ impl EpollHelperHandler for NetEpollHandler { let ev_type = event.data as u16; match ev_type { RX_QUEUE_EVENT => { - self.driver_awake = true; self.handle_rx_event().map_err(|e| { EpollHelperError::HandleEvent(anyhow!("Error processing RX queue: {e:?}")) })?; @@ -326,7 +365,6 @@ impl EpollHelperHandler for NetEpollHandler { if let Err(e) = queue_evt.read() { error!("Failed to get tx queue event: {e:?}"); } - self.driver_awake = true; self.handle_tx_event().map_err(|e| { EpollHelperError::HandleEvent(anyhow!("Error processing TX queue: {e:?}")) })?; @@ -383,8 +421,6 @@ impl EpollHelperHandler for NetEpollHandler { "Error from 'rate_limiter.event_handler()': {e:?}" )) })?; - - self.driver_awake = true; 
self.process_tx().map_err(|e| { EpollHelperError::HandleEvent(anyhow!("Error processing TX queue: {e:?}")) })?; @@ -414,6 +450,7 @@ pub struct Net { seccomp_action: SeccompAction, rate_limiter_config: Option, exit_evt: EventFd, + device_status: Arc, } #[derive(Serialize, Deserialize)] @@ -431,7 +468,7 @@ impl Net { id: String, taps: Vec, guest_mac: Option, - iommu: bool, + access_platform_enabled: bool, num_queues: usize, queue_size: u16, seccomp_action: SeccompAction, @@ -462,8 +499,8 @@ impl Net { | (1 << VIRTIO_RING_F_EVENT_IDX) | (1 << VIRTIO_F_VERSION_1); - if iommu { - avail_features |= 1u64 << VIRTIO_F_IOMMU_PLATFORM; + if access_platform_enabled { + avail_features |= 1u64 << VIRTIO_F_ACCESS_PLATFORM; } // Configure TSO/UFO features when hardware checksum offload is enabled. @@ -535,6 +572,7 @@ impl Net { seccomp_action, rate_limiter_config, exit_evt, + device_status: Arc::new(AtomicU8::new(0)), }) } @@ -549,7 +587,7 @@ impl Net { guest_mac: Option, host_mac: &mut Option, mtu: Option, - iommu: bool, + access_platform_enabled: bool, num_queues: usize, queue_size: u16, seccomp_action: SeccompAction, @@ -575,7 +613,7 @@ impl Net { id, taps, guest_mac, - iommu, + access_platform_enabled, num_queues, queue_size, seccomp_action, @@ -594,7 +632,7 @@ impl Net { fds: &[RawFd], guest_mac: Option, mtu: Option, - iommu: bool, + access_platform_enabled: bool, queue_size: u16, seccomp_action: SeccompAction, rate_limiter_config: Option, @@ -628,7 +666,7 @@ impl Net { id, taps, guest_mac, - iommu, + access_platform_enabled, num_queue_pairs * 2, queue_size, seccomp_action, @@ -693,17 +731,27 @@ impl VirtioDevice for Net { self.read_config_from_slice(self.config.as_slice(), offset, data); } - fn activate( - &mut self, - mem: GuestMemoryAtomic, - interrupt_cb: Arc, - mut queues: Vec<(usize, Queue, EventFd)>, - ) -> ActivateResult { + fn activate(&mut self, context: crate::device::ActivationContext) -> ActivateResult { + let crate::device::ActivationContext { + mem, + 
interrupt_cb, + mut queues, + device_status, + } = context; + self.device_status = device_status; self.common.activate(&queues, interrupt_cb.clone())?; let num_queues = queues.len(); let event_idx = self.common.feature_acked(VIRTIO_RING_F_EVENT_IDX.into()); - if self.common.feature_acked(VIRTIO_NET_F_CTRL_VQ.into()) && !num_queues.is_multiple_of(2) { + + // Recompute the barrier size from the queues that are actually activated. + let has_ctrl_queue = + self.common.feature_acked(VIRTIO_NET_F_CTRL_VQ.into()) && !num_queues.is_multiple_of(2); + let ctrl_threads = if has_ctrl_queue { 1 } else { 0 }; + let qp_threads = (num_queues - ctrl_threads) / 2; + self.common.paused_sync = Some(Arc::new(Barrier::new(1 + qp_threads + ctrl_threads))); + + if has_ctrl_queue { let ctrl_queue_index = num_queues - 1; let (_, mut ctrl_queue, ctrl_queue_evt) = queues.remove(ctrl_queue_index); @@ -717,16 +765,12 @@ impl VirtioDevice for Net { ctrl_q: CtrlQueue::new(self.taps.clone()), queue: ctrl_queue, queue_evt: ctrl_queue_evt, - access_platform: self.common.access_platform.clone(), + access_platform: self.common.access_platform(), queue_index: ctrl_queue_index as u16, interrupt_cb: interrupt_cb.clone(), }; let paused = self.common.paused.clone(); - // Let's update the barrier as we need 1 for each RX/TX pair + - // 1 for the control queue + 1 for the main thread signalling - // the pause. 
- self.common.paused_sync = Some(Arc::new(Barrier::new(self.taps.len() + 2))); let paused_sync = self.common.paused_sync.clone(); let mut epoll_threads = Vec::new(); @@ -793,7 +837,7 @@ impl VirtioDevice for Net { rx_desc_avail: false, rx_rate_limiter, tx_rate_limiter, - access_platform: self.common.access_platform.clone(), + access_platform: self.common.access_platform(), }, mem: mem.clone(), queue_index_base: (i * 2) as u16, @@ -802,7 +846,7 @@ impl VirtioDevice for Net { interrupt_cb: interrupt_cb.clone(), kill_evt, pause_evt, - driver_awake: false, + device_status: self.device_status.clone(), }; let paused = self.common.paused.clone(); @@ -856,6 +900,10 @@ impl VirtioDevice for Net { fn set_access_platform(&mut self, access_platform: Arc) { self.common.set_access_platform(access_platform); } + + fn access_platform(&self) -> Option> { + self.common.access_platform() + } } impl Pausable for Net { diff --git a/virtio-devices/src/pmem.rs b/virtio-devices/src/pmem.rs index 549b62fd96..94f2716b64 100644 --- a/virtio-devices/src/pmem.rs +++ b/virtio-devices/src/pmem.rs @@ -31,7 +31,7 @@ use vmm_sys_util::eventfd::EventFd; use super::{ ActivateError, ActivateResult, EPOLL_HELPER_EVENT_LAST, EpollHelper, EpollHelperError, - EpollHelperHandler, Error as DeviceError, VIRTIO_F_IOMMU_PLATFORM, VIRTIO_F_VERSION_1, + EpollHelperHandler, Error as DeviceError, VIRTIO_F_ACCESS_PLATFORM, VIRTIO_F_VERSION_1, VirtioCommon, VirtioDevice, VirtioDeviceType, }; use crate::seccomp_filters::Thread; @@ -123,7 +123,8 @@ impl Request { .memory() .read_obj( desc.addr() - .translate_gva(access_platform, desc.len() as usize), + .translate_gva(access_platform, desc.len() as usize) + .map_err(|e| Error::GuestMemory(GuestMemoryError::IOError(e)))?, ) .map_err(Error::GuestMemory)?; @@ -147,7 +148,8 @@ impl Request { type_: request_type, status_addr: status_desc .addr() - .translate_gva(access_platform, status_desc.len() as usize), + .translate_gva(access_platform, status_desc.len() as usize) + 
.map_err(|e| Error::GuestMemory(GuestMemoryError::IOError(e)))?, }) } } @@ -285,7 +287,7 @@ impl Pmem { disk: File, addr: GuestAddress, mapping: UserspaceMapping, - iommu: bool, + access_platform_enabled: bool, seccomp_action: SeccompAction, exit_evt: EventFd, state: Option, @@ -306,8 +308,8 @@ impl Pmem { let mut avail_features = 1u64 << VIRTIO_F_VERSION_1; - if iommu { - avail_features |= 1u64 << VIRTIO_F_IOMMU_PLATFORM; + if access_platform_enabled { + avail_features |= 1u64 << VIRTIO_F_ACCESS_PLATFORM; } (avail_features, 0, config, false) }; @@ -377,12 +379,13 @@ impl VirtioDevice for Pmem { self.read_config_from_slice(self.config.as_slice(), offset, data); } - fn activate( - &mut self, - mem: GuestMemoryAtomic, - interrupt_cb: Arc, - mut queues: Vec<(usize, Queue, EventFd)>, - ) -> ActivateResult { + fn activate(&mut self, context: crate::device::ActivationContext) -> ActivateResult { + let crate::device::ActivationContext { + mem, + interrupt_cb, + mut queues, + .. + } = context; self.common.activate(&queues, interrupt_cb.clone())?; let (kill_evt, pause_evt) = self.common.dup_eventfds(); if let Some(disk) = self.disk.as_ref() { @@ -401,7 +404,7 @@ impl VirtioDevice for Pmem { queue_evt, kill_evt, pause_evt, - access_platform: self.common.access_platform.clone(), + access_platform: self.common.access_platform(), }; let paused = self.common.paused.clone(); @@ -438,6 +441,10 @@ impl VirtioDevice for Pmem { fn set_access_platform(&mut self, access_platform: Arc) { self.common.set_access_platform(access_platform); } + + fn access_platform(&self) -> Option> { + self.common.access_platform() + } } impl Pausable for Pmem { diff --git a/virtio-devices/src/rng.rs b/virtio-devices/src/rng.rs index 2f980d4d8b..8d11a3d7b6 100644 --- a/virtio-devices/src/rng.rs +++ b/virtio-devices/src/rng.rs @@ -17,14 +17,14 @@ use seccompiler::SeccompAction; use serde::{Deserialize, Serialize}; use thiserror::Error; use virtio_queue::{Queue, QueueT}; -use vm_memory::{GuestAddressSpace, 
GuestMemory, GuestMemoryAtomic}; +use vm_memory::{Bytes, GuestAddressSpace, GuestMemoryAtomic}; use vm_migration::{Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable}; use vm_virtio::{AccessPlatform, Translatable}; use vmm_sys_util::eventfd::EventFd; use super::{ ActivateError, ActivateResult, EPOLL_HELPER_EVENT_LAST, EpollHelper, EpollHelperError, - EpollHelperHandler, Error as DeviceError, VIRTIO_F_IOMMU_PLATFORM, VIRTIO_F_VERSION_1, + EpollHelperHandler, Error as DeviceError, VIRTIO_F_ACCESS_PLATFORM, VIRTIO_F_VERSION_1, VirtioCommon, VirtioDevice, VirtioDeviceType, }; use crate::seccomp_filters::Thread; @@ -78,7 +78,10 @@ impl RngEpollHandler { .memory() .read_volatile_from( desc.addr() - .translate_gva(self.access_platform.as_deref(), desc.len() as usize), + .translate_gva(self.access_platform.as_deref(), desc.len() as usize) + .map_err(|e| { + Error::GuestMemoryWrite(vm_memory::GuestMemoryError::IOError(e)) + })?, &mut self.random_file, desc.len() as usize, ) @@ -166,7 +169,7 @@ impl Rng { pub fn new( id: String, path: &str, - iommu: bool, + access_platform_enabled: bool, seccomp_action: SeccompAction, exit_evt: EventFd, state: Option, @@ -179,8 +182,8 @@ impl Rng { } else { let mut avail_features = 1u64 << VIRTIO_F_VERSION_1; - if iommu { - avail_features |= 1u64 << VIRTIO_F_IOMMU_PLATFORM; + if access_platform_enabled { + avail_features |= 1u64 << VIRTIO_F_ACCESS_PLATFORM; } (avail_features, 0, false) @@ -244,12 +247,13 @@ impl VirtioDevice for Rng { self.common.ack_features(value); } - fn activate( - &mut self, - mem: GuestMemoryAtomic, - interrupt_cb: Arc, - mut queues: Vec<(usize, Queue, EventFd)>, - ) -> ActivateResult { + fn activate(&mut self, context: crate::device::ActivationContext) -> ActivateResult { + let crate::device::ActivationContext { + mem, + interrupt_cb, + mut queues, + .. 
+ } = context; self.common.activate(&queues, interrupt_cb.clone())?; let (kill_evt, pause_evt) = self.common.dup_eventfds(); @@ -269,7 +273,7 @@ impl VirtioDevice for Rng { queue_evt, kill_evt, pause_evt, - access_platform: self.common.access_platform.clone(), + access_platform: self.common.access_platform(), }; let paused = self.common.paused.clone(); @@ -301,6 +305,10 @@ impl VirtioDevice for Rng { fn set_access_platform(&mut self, access_platform: Arc) { self.common.set_access_platform(access_platform); } + + fn access_platform(&self) -> Option> { + self.common.access_platform() + } } impl Pausable for Rng { diff --git a/virtio-devices/src/seccomp_filters.rs b/virtio-devices/src/seccomp_filters.rs index 5afd056a6b..37c444999a 100644 --- a/virtio-devices/src/seccomp_filters.rs +++ b/virtio-devices/src/seccomp_filters.rs @@ -24,6 +24,7 @@ pub enum Thread { VirtioRng, VirtioVhostBlock, VirtioVhostFs, + VirtioGenericVhostUser, VirtioVhostNet, VirtioVhostNetCtl, VirtioVsock, @@ -52,6 +53,10 @@ macro_rules! or { const VFIO_IOMMU_MAP_DMA: u64 = 0x3b71; const VFIO_IOMMU_UNMAP_DMA: u64 = 0x3b72; +// See include/uapi/linux/iommufd.h in the kernel code. 
+const IOMMU_IOAS_MAP: u64 = 0x3b85; +const IOMMU_IOAS_UNMAP: u64 = 0x3b86; + #[cfg(feature = "sev_snp")] fn mshv_sev_snp_ioctl_seccomp_rule() -> SeccompRule { and![ @@ -82,6 +87,8 @@ fn create_virtio_iommu_ioctl_seccomp_rule() -> Vec { or![ and![Cond::new(1, ArgLen::Dword, Eq, VFIO_IOMMU_MAP_DMA).unwrap()], and![Cond::new(1, ArgLen::Dword, Eq, VFIO_IOMMU_UNMAP_DMA).unwrap()], + and![Cond::new(1, ArgLen::Dword, Eq, IOMMU_IOAS_MAP).unwrap()], + and![Cond::new(1, ArgLen::Dword, Eq, IOMMU_IOAS_UNMAP).unwrap()], ] } @@ -89,6 +96,8 @@ fn create_virtio_mem_ioctl_seccomp_rule() -> Vec { or![ and![Cond::new(1, ArgLen::Dword, Eq, VFIO_IOMMU_MAP_DMA).unwrap()], and![Cond::new(1, ArgLen::Dword, Eq, VFIO_IOMMU_UNMAP_DMA).unwrap()], + and![Cond::new(1, ArgLen::Dword, Eq, IOMMU_IOAS_MAP).unwrap()], + and![Cond::new(1, ArgLen::Dword, Eq, IOMMU_IOAS_UNMAP).unwrap()], ] } @@ -145,11 +154,11 @@ fn virtio_mem_thread_rules() -> Vec<(i64, Vec)> { fn virtio_net_thread_rules() -> Vec<(i64, Vec)> { vec![ + #[cfg(feature = "sev_snp")] + (libc::SYS_ioctl, create_mshv_sev_snp_ioctl_seccomp_rule()), (libc::SYS_readv, vec![]), (libc::SYS_timerfd_settime, vec![]), (libc::SYS_writev, vec![]), - #[cfg(feature = "sev_snp")] - (libc::SYS_ioctl, create_mshv_sev_snp_ioctl_seccomp_rule()), ] } @@ -192,6 +201,20 @@ fn virtio_vhost_fs_thread_rules() -> Vec<(i64, Vec)> { ] } +fn virtio_generic_vhost_user_thread_rules() -> Vec<(i64, Vec)> { + vec![ + (libc::SYS_clock_nanosleep, vec![]), + (libc::SYS_connect, vec![]), + (libc::SYS_nanosleep, vec![]), + (libc::SYS_pread64, vec![]), + (libc::SYS_pwrite64, vec![]), + (libc::SYS_recvmsg, vec![]), + (libc::SYS_sendmsg, vec![]), + (libc::SYS_sendto, vec![]), + (libc::SYS_socket, vec![]), + ] +} + fn virtio_vhost_net_ctl_thread_rules() -> Vec<(i64, Vec)> { vec![] } @@ -239,14 +262,11 @@ fn virtio_vsock_thread_rules() -> Vec<(i64, Vec)> { vec![ (libc::SYS_accept4, vec![]), (libc::SYS_connect, vec![]), + (libc::SYS_fcntl, vec![]), (libc::SYS_ioctl, 
create_vsock_ioctl_seccomp_rule()), (libc::SYS_recvfrom, vec![]), (libc::SYS_sendto, vec![]), (libc::SYS_socket, vec![]), - // If debug_assertions is enabled, closing a file first checks - // whether the FD is valid with fcntl. - #[cfg(debug_assertions)] - (libc::SYS_fcntl, vec![]), ] } @@ -271,6 +291,7 @@ fn get_seccomp_rules(thread_type: Thread) -> Vec<(i64, Vec)> { Thread::VirtioRng => virtio_rng_thread_rules(), Thread::VirtioVhostBlock => virtio_vhost_block_thread_rules(), Thread::VirtioVhostFs => virtio_vhost_fs_thread_rules(), + Thread::VirtioGenericVhostUser => virtio_generic_vhost_user_thread_rules(), Thread::VirtioVhostNet => virtio_vhost_net_thread_rules(), Thread::VirtioVhostNetCtl => virtio_vhost_net_ctl_thread_rules(), Thread::VirtioVsock => virtio_vsock_thread_rules(), @@ -292,6 +313,7 @@ fn virtio_thread_common() -> Vec<(i64, Vec)> { #[cfg(target_arch = "x86_64")] (libc::SYS_epoll_wait, vec![]), (libc::SYS_exit, vec![]), + (libc::SYS_fcntl, vec![]), (libc::SYS_futex, vec![]), (libc::SYS_gettid, vec![]), (libc::SYS_madvise, vec![]), @@ -305,8 +327,6 @@ fn virtio_thread_common() -> Vec<(i64, Vec)> { (libc::SYS_rt_sigreturn, vec![]), (libc::SYS_sigaltstack, vec![]), (libc::SYS_write, vec![]), - #[cfg(debug_assertions)] - (libc::SYS_fcntl, vec![]), ] } diff --git a/virtio-devices/src/transport/pci_common_config.rs b/virtio-devices/src/transport/pci_common_config.rs index 5a7b5f57a4..379622bd97 100644 --- a/virtio-devices/src/transport/pci_common_config.rs +++ b/virtio-devices/src/transport/pci_common_config.rs @@ -6,7 +6,7 @@ // // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause -use std::sync::atomic::{AtomicU16, Ordering}; +use std::sync::atomic::{AtomicU8, AtomicU16, Ordering}; use std::sync::{Arc, Mutex}; use byteorder::{ByteOrder, LittleEndian}; @@ -14,8 +14,8 @@ use log::{debug, error, warn}; use serde::{Deserialize, Serialize}; use virtio_queue::{Queue, QueueT}; use vm_migration::{MigratableError, Pausable, Snapshot, Snapshottable}; -use 
vm_virtio::AccessPlatform; +use super::pci_device::VIRTQ_MSI_NO_VECTOR; use crate::VirtioDevice; pub const VIRTIO_PCI_COMMON_CONFIG_ID: &str = "virtio_pci_common_config"; @@ -124,8 +124,8 @@ pub fn get_vring_size(t: VringType, queue_size: u16) -> u64 { /// le64 queue_avail; // 0x28 // read-write /// le64 queue_used; // 0x30 // read-write pub struct VirtioPciCommonConfig { - pub access_platform: Option>, - pub driver_status: u8, + pub device: Arc>, + pub driver_status: Arc, pub config_generation: u8, pub device_feature_select: u32, pub driver_feature_select: u32, @@ -135,13 +135,10 @@ pub struct VirtioPciCommonConfig { } impl VirtioPciCommonConfig { - pub fn new( - state: VirtioPciCommonConfigState, - access_platform: Option>, - ) -> Self { + pub fn new(state: VirtioPciCommonConfigState, device: Arc>) -> Self { VirtioPciCommonConfig { - access_platform, - driver_status: state.driver_status, + device, + driver_status: Arc::new(AtomicU8::new(state.driver_status)), config_generation: state.config_generation, device_feature_select: state.device_feature_select, driver_feature_select: state.driver_feature_select, @@ -153,7 +150,7 @@ impl VirtioPciCommonConfig { fn state(&self) -> VirtioPciCommonConfigState { VirtioPciCommonConfigState { - driver_status: self.driver_status, + driver_status: self.driver_status.load(Ordering::Acquire), config_generation: self.config_generation, device_feature_select: self.device_feature_select, driver_feature_select: self.driver_feature_select, @@ -163,13 +160,7 @@ impl VirtioPciCommonConfig { } } - pub fn read( - &mut self, - offset: u64, - data: &mut [u8], - queues: &[Queue], - device: Arc>, - ) { + pub fn read(&mut self, offset: u64, data: &mut [u8], queues: &[Queue]) { assert!(data.len() <= 8); match data.len() { @@ -182,7 +173,7 @@ impl VirtioPciCommonConfig { LittleEndian::write_u16(data, v); } 4 => { - let v = self.read_common_config_dword(offset, device); + let v = self.read_common_config_dword(offset); LittleEndian::write_u32(data, 
v); } 8 => { @@ -193,26 +184,14 @@ impl VirtioPciCommonConfig { } } - #[allow(clippy::needless_pass_by_value)] - pub fn write( - &mut self, - offset: u64, - data: &[u8], - queues: &mut [Queue], - device: Arc>, - ) { + pub fn write(&mut self, offset: u64, data: &[u8], queues: &mut [Queue]) { assert!(data.len() <= 8); match data.len() { 1 => self.write_common_config_byte(offset, data[0]), 2 => self.write_common_config_word(offset, LittleEndian::read_u16(data), queues), 4 => { - self.write_common_config_dword( - offset, - LittleEndian::read_u32(data), - queues, - device, - ); + self.write_common_config_dword(offset, LittleEndian::read_u32(data), queues); } 8 => self.write_common_config_qword(offset, LittleEndian::read_u64(data), queues), _ => error!("invalid data length for virtio write: len {}", data.len()), @@ -223,7 +202,7 @@ impl VirtioPciCommonConfig { debug!("read_common_config_byte: offset 0x{offset:x}"); // The driver is only allowed to do aligned, properly sized access. match offset { - 0x14 => self.driver_status, + 0x14 => self.driver_status.load(Ordering::Acquire), 0x15 => self.config_generation, _ => { warn!("invalid virtio config byte read: 0x{offset:x}"); @@ -235,7 +214,7 @@ impl VirtioPciCommonConfig { fn write_common_config_byte(&mut self, offset: u64, value: u8) { debug!("write_common_config_byte: offset 0x{offset:x}"); match offset { - 0x14 => self.driver_status = value, + 0x14 => self.driver_status.store(value, Ordering::Release), _ => { warn!("invalid virtio config byte write: 0x{offset:x}"); } @@ -249,7 +228,13 @@ impl VirtioPciCommonConfig { 0x12 => queues.len() as u16, // num_queues 0x16 => self.queue_select, 0x18 => self.with_queue(queues, |q| q.size()).unwrap_or(0), - 0x1a => self.msix_queues.lock().unwrap()[self.queue_select as usize], + 0x1a => self + .msix_queues + .lock() + .unwrap() + .get(usize::from(self.queue_select)) + .copied() + .unwrap_or(VIRTQ_MSI_NO_VECTOR), 0x1c => u16::from(self.with_queue(queues, |q| 
q.ready()).unwrap_or(false)), 0x1e => self.queue_select, // notify_off _ => { @@ -265,21 +250,55 @@ impl VirtioPciCommonConfig { 0x10 => self.msix_config.store(value, Ordering::Release), 0x16 => self.queue_select = value, 0x18 => self.with_queue_mut(queues, |q| q.set_size(value)), - 0x1a => self.msix_queues.lock().unwrap()[self.queue_select as usize] = value, + 0x1a => { + if let Some(entry) = self + .msix_queues + .lock() + .unwrap() + .get_mut(usize::from(self.queue_select)) + { + *entry = value; + } + } 0x1c => self.with_queue_mut(queues, |q| { let ready = value == 1; q.set_ready(ready); - // Translate address of descriptor table and vrings. - if ready && let Some(access_platform) = &self.access_platform { - let desc_table = access_platform + let access_platform = if ready { + self.device.lock().unwrap().access_platform() + } else { + None + }; + if let Some(access_platform) = access_platform { + let desc_table = match access_platform .translate_gva(q.desc_table(), get_vring_size(VringType::Desc, q.size())) - .unwrap(); - let avail_ring = access_platform + { + Ok(addr) => addr, + Err(e) => { + error!("Failed to translate desc_table GVA: {e}"); + q.set_ready(false); + return; + } + }; + let avail_ring = match access_platform .translate_gva(q.avail_ring(), get_vring_size(VringType::Avail, q.size())) - .unwrap(); - let used_ring = access_platform + { + Ok(addr) => addr, + Err(e) => { + error!("Failed to translate avail_ring GVA: {e}"); + q.set_ready(false); + return; + } + }; + let used_ring = match access_platform .translate_gva(q.used_ring(), get_vring_size(VringType::Used, q.size())) - .unwrap(); + { + Ok(addr) => addr, + Err(e) => { + error!("Failed to translate used_ring GVA: {e}"); + q.set_ready(false); + return; + } + }; q.set_desc_table_address( Some((desc_table & 0xffff_ffff) as u32), Some((desc_table >> 32) as u32), @@ -300,15 +319,12 @@ impl VirtioPciCommonConfig { } } - #[allow(clippy::needless_pass_by_value)] - fn read_common_config_dword(&self, 
offset: u64, device: Arc>) -> u32 { + fn read_common_config_dword(&self, offset: u64) -> u32 { debug!("read_common_config_dword: offset 0x{offset:x}"); match offset { 0x00 => self.device_feature_select, 0x04 => { - let locked_device = device.lock().unwrap(); - // Only 64 bits of features (2 pages) are defined for now, so limit - // device_feature_select to avoid shifting by 64 or more bits. + let locked_device = self.device.lock().unwrap(); if self.device_feature_select < 2 { (locked_device.features() >> (self.device_feature_select * 32)) as u32 } else { @@ -323,14 +339,7 @@ impl VirtioPciCommonConfig { } } - #[allow(clippy::needless_pass_by_value)] - fn write_common_config_dword( - &mut self, - offset: u64, - value: u32, - queues: &mut [Queue], - device: Arc>, - ) { + fn write_common_config_dword(&mut self, offset: u64, value: u32, queues: &mut [Queue]) { debug!("write_common_config_dword: offset 0x{offset:x}"); match offset { @@ -338,7 +347,7 @@ impl VirtioPciCommonConfig { 0x08 => self.driver_feature_select = value, 0x0c => { if self.driver_feature_select < 2 { - let mut locked_device = device.lock().unwrap(); + let mut locked_device = self.device.lock().unwrap(); locked_device .ack_features(u64::from(value) << (self.driver_feature_select * 32)); } @@ -404,11 +413,8 @@ impl Snapshottable for VirtioPciCommonConfig { #[cfg(test)] mod unit_tests { - use vm_memory::GuestMemoryAtomic; - use vmm_sys_util::eventfd::EventFd; - use super::*; - use crate::{ActivateResult, GuestMemoryMmap, VirtioInterrupt}; + use crate::{ActivateResult, ActivationContext}; struct DummyDevice(u32); const QUEUE_SIZE: u16 = 256; @@ -421,12 +427,7 @@ mod unit_tests { fn queue_max_sizes(&self) -> &[u16] { QUEUE_SIZES } - fn activate( - &mut self, - _mem: GuestMemoryAtomic, - _interrupt_evt: Arc, - _queues: Vec<(usize, Queue, EventFd)>, - ) -> ActivateResult { + fn activate(&mut self, _context: ActivationContext) -> ActivateResult { Ok(()) } @@ -443,9 +444,10 @@ mod unit_tests { #[test] fn 
write_base_regs() { + let dev: Arc> = Arc::new(Mutex::new(DummyDevice(0))); let mut regs = VirtioPciCommonConfig { - access_platform: None, - driver_status: 0xaa, + device: dev.clone(), + driver_status: Arc::new(AtomicU8::new(0xaa)), config_generation: 0x55, device_feature_select: 0x0, driver_feature_select: 0x0, @@ -454,42 +456,69 @@ mod unit_tests { msix_queues: Arc::new(Mutex::new(vec![0; 3])), }; - let dev = Arc::new(Mutex::new(DummyDevice(0))); let mut queues = Vec::new(); // Can set all bits of driver_status. - regs.write(0x14, &[0x55], &mut queues, dev.clone()); + regs.write(0x14, &[0x55], &mut queues); let mut read_back = vec![0x00]; - regs.read(0x14, &mut read_back, &queues, dev.clone()); + regs.read(0x14, &mut read_back, &queues); assert_eq!(read_back[0], 0x55); // The config generation register is read only. - regs.write(0x15, &[0xaa], &mut queues, dev.clone()); + regs.write(0x15, &[0xaa], &mut queues); let mut read_back = vec![0x00]; - regs.read(0x15, &mut read_back, &queues, dev.clone()); + regs.read(0x15, &mut read_back, &queues); assert_eq!(read_back[0], 0x55); // Device features is read-only and passed through from the device. - regs.write(0x04, &[0, 0, 0, 0], &mut queues, dev.clone()); + regs.write(0x04, &[0, 0, 0, 0], &mut queues); let mut read_back = vec![0, 0, 0, 0]; - regs.read(0x04, &mut read_back, &queues, dev.clone()); + regs.read(0x04, &mut read_back, &queues); assert_eq!(LittleEndian::read_u32(&read_back), DUMMY_FEATURES as u32); // Feature select registers are read/write. 
- regs.write(0x00, &[1, 2, 3, 4], &mut queues, dev.clone()); + regs.write(0x00, &[1, 2, 3, 4], &mut queues); let mut read_back = vec![0, 0, 0, 0]; - regs.read(0x00, &mut read_back, &queues, dev.clone()); + regs.read(0x00, &mut read_back, &queues); assert_eq!(LittleEndian::read_u32(&read_back), 0x0403_0201); - regs.write(0x08, &[1, 2, 3, 4], &mut queues, dev.clone()); + regs.write(0x08, &[1, 2, 3, 4], &mut queues); let mut read_back = vec![0, 0, 0, 0]; - regs.read(0x08, &mut read_back, &queues, dev.clone()); + regs.read(0x08, &mut read_back, &queues); assert_eq!(LittleEndian::read_u32(&read_back), 0x0403_0201); // 'queue_select' can be read and written. - regs.write(0x16, &[0xaa, 0x55], &mut queues, dev.clone()); + regs.write(0x16, &[0xaa, 0x55], &mut queues); let mut read_back = vec![0x00, 0x00]; - regs.read(0x16, &mut read_back, &queues, dev); + regs.read(0x16, &mut read_back, &queues); assert_eq!(read_back[0], 0xaa); assert_eq!(read_back[1], 0x55); } + + #[test] + fn oob_queue_select_does_not_panic() { + let dev: Arc> = Arc::new(Mutex::new(DummyDevice(0))); + let mut regs = VirtioPciCommonConfig { + device: dev.clone(), + driver_status: Arc::new(AtomicU8::new(0)), + config_generation: 0, + device_feature_select: 0, + driver_feature_select: 0, + queue_select: 0, + msix_config: Arc::new(AtomicU16::new(0)), + msix_queues: Arc::new(Mutex::new(vec![0; 1])), + }; + + let mut queues = vec![Queue::new(256).unwrap()]; + + // Set queue_select to an out-of-bounds value. + regs.write(0x16, &[0xFF, 0xFF], &mut queues); + + // Read queue_msix_vector — must not panic, should return VIRTQ_MSI_NO_VECTOR. + let mut read_back = vec![0x00, 0x00]; + regs.read(0x1a, &mut read_back, &queues); + assert_eq!(LittleEndian::read_u16(&read_back), VIRTQ_MSI_NO_VECTOR); + + // Write queue_msix_vector — must not panic. 
+ regs.write(0x1a, &[0xAB, 0xCD], &mut queues); + } } diff --git a/virtio-devices/src/transport/pci_device.rs b/virtio-devices/src/transport/pci_device.rs index 408611e29a..98abb04935 100644 --- a/virtio-devices/src/transport/pci_device.rs +++ b/virtio-devices/src/transport/pci_device.rs @@ -10,16 +10,17 @@ use std::any::Any; use std::cmp; use std::io::Write; use std::ops::Deref; -use std::sync::atomic::{AtomicBool, AtomicU16, AtomicUsize, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicU8, AtomicU16, AtomicUsize, Ordering}; use std::sync::{Arc, Barrier, Mutex}; use anyhow::anyhow; use libc::EFD_NONBLOCK; use log::{error, info}; use pci::{ - BarReprogrammingParams, MsixCap, MsixConfig, PciBarConfiguration, PciBarRegionType, - PciCapability, PciCapabilityId, PciClassCode, PciConfiguration, PciDevice, PciDeviceError, - PciHeaderType, PciMassStorageSubclass, PciNetworkControllerSubclass, PciSubclass, + BarReprogrammingParams, MaybeMutInterruptSourceGroup, MsixCap, MsixConfig, PciBarConfiguration, + PciBarRegionType, PciCapability, PciCapabilityId, PciClassCode, PciConfiguration, PciDevice, + PciDeviceError, PciHeaderType, PciMassStorageSubclass, PciNetworkControllerSubclass, + PciSubclass, }; use serde::{Deserialize, Serialize}; use thiserror::Error; @@ -44,7 +45,7 @@ use crate::{ }; /// Vector value used to disable MSI for a queue. -const VIRTQ_MSI_NO_VECTOR: u16 = 0xffff; +pub(super) const VIRTQ_MSI_NO_VECTOR: u16 = 0xffff; enum PciCapabilityType { Common = 1, @@ -231,28 +232,46 @@ impl PciSubclass for PciVirtioSubclass { } } +/// Max number of virtio queues Cloud Hypervisor supports. +/// This is set by the current size of the notification BAR. +const MAX_QUEUES: u64 = 0x400; + +// Automatically compute the position of the next entry in the BAR. +// This handles alignment properly and is much less error-prone than +// manual calculation. 
+const fn next_bar_addr_align(offset: u64, size: u64, align: u64) -> u64 { + assert!(align >= 0x2000, "too small alignment for structure in BAR"); + assert!(align.is_power_of_two(), "alignment must be a power of 2"); + (offset + size).next_multiple_of(align) +} +// Same as next_bar_addr_align(), but with the default alignment (8K). +const fn next_bar_addr(offset: u64, size: u64) -> u64 { + next_bar_addr_align(offset, size, 0x2000) +} + // Allocate one bar for the structs pointed to by the capability structures. // As per the PCI specification, because the same BAR shares MSI-X and non // MSI-X structures, it is recommended to use 8KiB alignment for all those // structures. const COMMON_CONFIG_BAR_OFFSET: u64 = 0x0000; const COMMON_CONFIG_SIZE: u64 = 56; -const ISR_CONFIG_BAR_OFFSET: u64 = 0x2000; +const ISR_CONFIG_BAR_OFFSET: u64 = next_bar_addr(COMMON_CONFIG_BAR_OFFSET, COMMON_CONFIG_SIZE); const ISR_CONFIG_SIZE: u64 = 1; -const DEVICE_CONFIG_BAR_OFFSET: u64 = 0x4000; +const DEVICE_CONFIG_BAR_OFFSET: u64 = next_bar_addr(ISR_CONFIG_BAR_OFFSET, ISR_CONFIG_SIZE); const DEVICE_CONFIG_SIZE: u64 = 0x1000; -const NOTIFICATION_BAR_OFFSET: u64 = 0x6000; -const NOTIFICATION_SIZE: u64 = 0x1000; -const MSIX_TABLE_BAR_OFFSET: u64 = 0x8000; +const NOTIFICATION_BAR_OFFSET: u64 = next_bar_addr(DEVICE_CONFIG_BAR_OFFSET, DEVICE_CONFIG_SIZE); +const NOTIFICATION_SIZE: u64 = MAX_QUEUES * NOTIFY_OFF_MULTIPLIER as u64; +const MSIX_TABLE_BAR_OFFSET: u64 = next_bar_addr(NOTIFICATION_BAR_OFFSET, NOTIFICATION_SIZE); + // The size is 256KiB because the table can hold up to 2048 entries, with each // entry being 128 bits (4 DWORDS). const MSIX_TABLE_SIZE: u64 = 0x40000; -const MSIX_PBA_BAR_OFFSET: u64 = 0x48000; +const MSIX_PBA_BAR_OFFSET: u64 = next_bar_addr(MSIX_TABLE_BAR_OFFSET, MSIX_TABLE_SIZE); // The size is 2KiB because the Pending Bit Array has one bit per vector and it // can support up to 2048 vectors. const MSIX_PBA_SIZE: u64 = 0x800; // The BAR size must be a power of 2. 
-const CAPABILITY_BAR_SIZE: u64 = 0x80000; +const CAPABILITY_BAR_SIZE: u64 = (MSIX_PBA_BAR_OFFSET + MSIX_PBA_SIZE).next_power_of_two(); const VIRTIO_COMMON_BAR_INDEX: usize = 0; const VIRTIO_SHM_BAR_INDEX: usize = 2; @@ -288,15 +307,18 @@ pub struct VirtioPciDeviceActivator { queues: Option>, barrier: Option>, id: String, + status: Arc, } impl VirtioPciDeviceActivator { - pub fn activate(&mut self) -> ActivateResult { - self.device.lock().unwrap().activate( - self.memory.take().unwrap(), - self.interrupt.take().unwrap(), - self.queues.take().unwrap(), - )?; + pub fn activate(mut self) -> ActivateResult { + let mut locked_device = self.device.lock().unwrap(); + locked_device.activate(crate::device::ActivationContext { + mem: self.memory.take().unwrap(), + interrupt_cb: self.interrupt.take().unwrap(), + queues: self.queues.take().unwrap(), + device_status: self.status, + })?; self.device_activated.store(true, Ordering::SeqCst); if let Some(barrier) = self.barrier.take() { @@ -338,7 +360,7 @@ pub struct VirtioPciDevice { // PCI interrupts. 
interrupt_status: Arc, virtio_interrupt: Option>, - interrupt_source_group: Arc, + interrupt_source_group: MaybeMutInterruptSourceGroup, // virtio queues queues: Vec, @@ -382,7 +404,7 @@ impl VirtioPciDevice { memory: GuestMemoryAtomic, device: Arc>, msix_num: u16, - access_platform: Option>, + access_platform: Option<&Arc>, interrupt_manager: &dyn InterruptManager, pci_device_bdf: u32, activate_evt: EventFd, @@ -400,7 +422,7 @@ impl VirtioPciDevice { } let num_queues = locked_device.queue_max_sizes().len(); - if let Some(access_platform) = &access_platform { + if let Some(access_platform) = access_platform { locked_device.set_access_platform(access_platform.clone()); } @@ -412,17 +434,26 @@ impl VirtioPciDevice { let pci_device_id = VIRTIO_PCI_DEVICE_ID_BASE + locked_device.device_type() as u16; - let interrupt_source_group = interrupt_manager - .create_group(MsiIrqGroupConfig { + let interrupt_source_group: MaybeMutInterruptSourceGroup = { + let config = MsiIrqGroupConfig { base: 0, count: msix_num as InterruptIndex, + }; + (if locked_device.interrupt_source_mutable() { + interrupt_manager + .create_group_mut(config) + .map(MaybeMutInterruptSourceGroup::Mutable) + } else { + interrupt_manager + .create_group(config) + .map(MaybeMutInterruptSourceGroup::Immutable) }) .map_err(|e| { VirtioPciDeviceError::CreateVirtioPciDevice(anyhow!( "Failed creating MSI interrupt group: {e}" )) - })?; - + })? 
+ }; let msix_state = vm_migration::state_from_id(snapshot, pci::MSIX_CONFIG_ID).map_err(|e| { VirtioPciDeviceError::CreateVirtioPciDevice(anyhow!( @@ -431,14 +462,11 @@ impl VirtioPciDevice { })?; let (msix_config, msix_config_clone) = if msix_num > 0 { + let interrupt_source_group: MaybeMutInterruptSourceGroup = + interrupt_source_group.clone(); let msix_config = Arc::new(Mutex::new( - MsixConfig::new( - msix_num, - interrupt_source_group.clone(), - pci_device_bdf, - msix_state, - ) - .unwrap(), + MsixConfig::new(msix_num, interrupt_source_group, pci_device_bdf, msix_state) + .unwrap(), )); let msix_config_clone = msix_config.clone(); (Some(msix_config), Some(msix_config_clone)) @@ -490,7 +518,7 @@ impl VirtioPciDevice { })?; let common_config = if let Some(common_config_state) = common_config_state { - VirtioPciCommonConfig::new(common_config_state, access_platform) + VirtioPciCommonConfig::new(common_config_state, device.clone()) } else { VirtioPciCommonConfig::new( VirtioPciCommonConfigState { @@ -502,7 +530,7 @@ impl VirtioPciDevice { msix_config: VIRTQ_MSI_NO_VECTOR, msix_queues: vec![VIRTQ_MSI_NO_VECTOR; num_queues], }, - access_platform, + device.clone(), ) }; @@ -577,7 +605,7 @@ impl VirtioPciDevice { memory, settings_bar: 0, use_64bit_bar, - interrupt_source_group, + interrupt_source_group: interrupt_source_group.clone(), cap_pci_cfg_info, bar_regions: vec![], activate_evt, @@ -641,13 +669,13 @@ impl VirtioPciDevice { fn is_driver_ready(&self) -> bool { let ready_bits = (DEVICE_ACKNOWLEDGE | DEVICE_DRIVER | DEVICE_DRIVER_OK | DEVICE_FEATURES_OK) as u8; - self.common_config.driver_status == ready_bits - && self.common_config.driver_status & DEVICE_FAILED as u8 == 0 + let driver_status = self.common_config.driver_status.load(Ordering::SeqCst); + driver_status == ready_bits && (driver_status & DEVICE_FAILED as u8) == 0 } /// Determines if the driver has requested the device (re)init / reset itself fn is_driver_init(&self) -> bool { - 
self.common_config.driver_status == DEVICE_INIT as u8 + self.common_config.driver_status.load(Ordering::SeqCst) == DEVICE_INIT as u8 } pub fn config_bar_addr(&self) -> u64 { @@ -801,6 +829,7 @@ impl VirtioPciDevice { device_activated: self.device_activated.clone(), barrier, id: self.id.clone(), + status: self.common_config.driver_status.clone(), } } @@ -833,7 +862,7 @@ pub struct VirtioInterruptMsix { msix_config: Arc>, config_vector: Arc, queues_vectors: Arc>>, - interrupt_source_group: Arc, + interrupt_source_group: MaybeMutInterruptSourceGroup, } impl VirtioInterruptMsix { @@ -841,7 +870,7 @@ impl VirtioInterruptMsix { msix_config: Arc>, config_vector: Arc, queues_vectors: Arc>>, - interrupt_source_group: Arc, + interrupt_source_group: MaybeMutInterruptSourceGroup, ) -> Self { VirtioInterruptMsix { msix_config, @@ -892,6 +921,16 @@ impl VirtioInterrupt for VirtioInterruptMsix { self.interrupt_source_group .notifier(vector as InterruptIndex) } + + fn set_notifier( + &self, + interrupt: u32, + eventfd: Option, + vm: &dyn hypervisor::Vm, + ) -> std::io::Result<()> { + self.interrupt_source_group + .set_notifier(interrupt, eventfd, vm) + } } impl PciDevice for VirtioPciDevice { @@ -939,7 +978,7 @@ impl PciDevice for VirtioPciDevice { fn allocate_bars( &mut self, - _allocator: &Arc>, + _allocator: &mut SystemAllocator, mmio32_allocator: &mut AddressAllocator, mmio64_allocator: &mut AddressAllocator, resources: Option>, @@ -1094,14 +1133,16 @@ impl PciDevice for VirtioPciDevice { Ok(()) } + fn restore_bar_addr(&mut self, params: &BarReprogrammingParams) { + self.configuration.restore_bar_addr(params); + } + fn read_bar(&mut self, _base: u64, offset: u64, data: &mut [u8]) { match offset { - o if o < COMMON_CONFIG_BAR_OFFSET + COMMON_CONFIG_SIZE => self.common_config.read( - o - COMMON_CONFIG_BAR_OFFSET, - data, - &self.queues, - self.device.clone(), - ), + o if o < COMMON_CONFIG_BAR_OFFSET + COMMON_CONFIG_SIZE => { + self.common_config + .read(o - 
COMMON_CONFIG_BAR_OFFSET, data, &self.queues); + } o if (ISR_CONFIG_BAR_OFFSET..ISR_CONFIG_BAR_OFFSET + ISR_CONFIG_SIZE).contains(&o) => { if let Some(v) = data.get_mut(0) { // Reading this register resets it to 0. @@ -1140,13 +1181,12 @@ impl PciDevice for VirtioPciDevice { } fn write_bar(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option> { + let initial_ready = self.is_driver_ready(); match offset { - o if o < COMMON_CONFIG_BAR_OFFSET + COMMON_CONFIG_SIZE => self.common_config.write( - o - COMMON_CONFIG_BAR_OFFSET, - data, - &mut self.queues, - self.device.clone(), - ), + o if o < COMMON_CONFIG_BAR_OFFSET + COMMON_CONFIG_SIZE => { + self.common_config + .write(o - COMMON_CONFIG_BAR_OFFSET, data, &mut self.queues); + } o if (ISR_CONFIG_BAR_OFFSET..ISR_CONFIG_BAR_OFFSET + ISR_CONFIG_SIZE).contains(&o) => { if let Some(v) = data.first() { self.interrupt_status @@ -1191,8 +1231,8 @@ impl PciDevice for VirtioPciDevice { _ => (), } - // Try and activate the device if the driver status has changed - if self.needs_activation() { + // Try and activate the device if the driver status has changed (from unready to ready) + if !initial_ready && self.needs_activation() { let barrier = Arc::new(Barrier::new(2)); let activator = self.prepare_activator(Some(barrier.clone())); self.pending_activations.lock().unwrap().push(activator); @@ -1219,7 +1259,9 @@ impl PciDevice for VirtioPciDevice { self.common_config.queue_select = 0; } else { error!("Attempt to reset device when not implemented in underlying device"); - self.common_config.driver_status = crate::DEVICE_FAILED as u8; + self.common_config + .driver_status + .store(crate::DEVICE_FAILED as u8, Ordering::SeqCst); } } diff --git a/virtio-devices/src/vdpa.rs b/virtio-devices/src/vdpa.rs index 725f215c77..f9bf7a39de 100644 --- a/virtio-devices/src/vdpa.rs +++ b/virtio-devices/src/vdpa.rs @@ -28,7 +28,7 @@ use vmm_sys_util::eventfd::EventFd; use crate::{ ActivateError, ActivateResult, DEVICE_ACKNOWLEDGE, DEVICE_DRIVER, 
DEVICE_DRIVER_OK, - DEVICE_FEATURES_OK, GuestMemoryMmap, VIRTIO_F_IOMMU_PLATFORM, VirtioCommon, VirtioDevice, + DEVICE_FEATURES_OK, GuestMemoryMmap, VIRTIO_F_ACCESS_PLATFORM, VirtioCommon, VirtioDevice, VirtioInterrupt, VirtioInterruptType, get_host_address_range, }; @@ -88,6 +88,8 @@ pub enum Error { SetVringKick(#[source] vhost::Error), #[error("Failed to set vring size")] SetVringNum(#[source] vhost::Error), + #[error("Failed to translate address")] + TranslateAddress(#[source] std::io::Error), } pub type Result = std::result::Result; @@ -164,7 +166,7 @@ impl Vdpa { let iova_range = vhost.get_iova_range().map_err(Error::GetIovaRange)?; - if avail_features & (1u64 << VIRTIO_F_IOMMU_PLATFORM) == 0 { + if avail_features & (1u64 << VIRTIO_F_ACCESS_PLATFORM) == 0 { return Err(Error::MissingAccessPlatformVirtioFeature); } @@ -217,7 +219,7 @@ impl Vdpa { fn activate_vdpa( &mut self, - mem: &GuestMemoryMmap, + _mem: &GuestMemoryMmap, virtio_interrupt: &dyn VirtioInterrupt, queues: &[(usize, Queue, EventFd)], ) -> Result<()> { @@ -246,18 +248,27 @@ impl Vdpa { queue_max_size, queue_size, flags: 0u32, - desc_table_addr: queue.desc_table().translate_gpa( - self.common.access_platform.as_deref(), - queue_size as usize * std::mem::size_of::(), - ), - used_ring_addr: queue.used_ring().translate_gpa( - self.common.access_platform.as_deref(), - 4 + queue_size as usize * 8, - ), - avail_ring_addr: queue.avail_ring().translate_gpa( - self.common.access_platform.as_deref(), - 4 + queue_size as usize * 2, - ), + desc_table_addr: queue + .desc_table() + .translate_gpa( + self.common.access_platform().as_deref(), + queue_size as usize * std::mem::size_of::(), + ) + .map_err(Error::TranslateAddress)?, + used_ring_addr: queue + .used_ring() + .translate_gpa( + self.common.access_platform().as_deref(), + 4 + queue_size as usize * 8, + ) + .map_err(Error::TranslateAddress)?, + avail_ring_addr: queue + .avail_ring() + .translate_gpa( + self.common.access_platform().as_deref(), + 4 + 
queue_size as usize * 2, + ) + .map_err(Error::TranslateAddress)?, log_addr: None, }; @@ -269,13 +280,7 @@ impl Vdpa { self.vhost .as_ref() .unwrap() - .set_vring_base( - *queue_index, - queue - .avail_idx(mem, Ordering::Acquire) - .map_err(Error::GetAvailableIndex)? - .0, - ) + .set_vring_base(*queue_index, 0) .map_err(Error::SetVringBase)?; if let Some(eventfd) = @@ -428,12 +433,13 @@ impl VirtioDevice for Vdpa { } } - fn activate( - &mut self, - mem: GuestMemoryAtomic, - virtio_interrupt: Arc, - queues: Vec<(usize, Queue, EventFd)>, - ) -> ActivateResult { + fn activate(&mut self, context: crate::device::ActivationContext) -> ActivateResult { + let crate::device::ActivationContext { + mem, + interrupt_cb: virtio_interrupt, + queues, + .. + } = context; self.activate_vdpa(&mem.memory(), virtio_interrupt.as_ref(), &queues) .map_err(ActivateError::ActivateVdpa)?; @@ -459,6 +465,10 @@ impl VirtioDevice for Vdpa { fn set_access_platform(&mut self, access_platform: Arc) { self.common.set_access_platform(access_platform); } + + fn access_platform(&self) -> Option> { + self.common.access_platform() + } } impl Pausable for Vdpa { diff --git a/virtio-devices/src/vhost_user/blk.rs b/virtio-devices/src/vhost_user/blk.rs index d26350c91a..2526b36f6b 100644 --- a/virtio-devices/src/vhost_user/blk.rs +++ b/virtio-devices/src/vhost_user/blk.rs @@ -3,13 +3,12 @@ use std::sync::atomic::AtomicBool; use std::sync::{Arc, Barrier, Mutex}; -use std::{mem, result, thread}; +use std::{mem, result}; use block::VirtioBlockConfig; use event_monitor::event; use log::{error, info}; use seccompiler::SeccompAction; -use serde::{Deserialize, Serialize}; use vhost::vhost_user::message::{ VhostUserConfigFlags, VhostUserProtocolFeatures, VhostUserVirtioFeatures, }; @@ -19,7 +18,6 @@ use virtio_bindings::virtio_blk::{ VIRTIO_BLK_F_GEOMETRY, VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_RO, VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_WRITE_ZEROES, }; -use virtio_queue::Queue; 
use vm_memory::{ByteValued, GuestMemoryAtomic}; use vm_migration::protocol::MemoryRangeTable; use vm_migration::{Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable}; @@ -30,33 +28,24 @@ use super::vu_common_ctrl::{VhostUserConfig, VhostUserHandle}; use super::{DEFAULT_VIRTIO_FEATURES, Error, Result}; use crate::seccomp_filters::Thread; use crate::thread_helper::spawn_virtio_thread; -use crate::vhost_user::VhostUserCommon; -use crate::{GuestMemoryMmap, GuestRegionMmap, VIRTIO_F_IOMMU_PLATFORM, VirtioInterrupt}; +use crate::vhost_user::{VhostUserCommon, VhostUserState}; +use crate::{GuestMemoryMmap, GuestRegionMmap, VIRTIO_F_ACCESS_PLATFORM, VirtioInterrupt}; const DEFAULT_QUEUE_NUMBER: usize = 1; -#[derive(Serialize, Deserialize)] -pub struct State { - pub avail_features: u64, - pub acked_features: u64, - pub config: VirtioBlockConfig, - pub acked_protocol_features: u64, - pub vu_num_queues: usize, -} +pub type State = VhostUserState; struct BackendReqHandler {} impl VhostUserFrontendReqHandler for BackendReqHandler {} pub struct Blk { - common: VirtioCommon, vu_common: VhostUserCommon, id: String, config: VirtioBlockConfig, guest_memory: Option>, - epoll_thread: Option>, seccomp_action: SeccompAction, exit_evt: EventFd, - iommu: bool, + access_platform_enabled: bool, } impl Blk { @@ -66,7 +55,7 @@ impl Blk { vu_cfg: VhostUserConfig, seccomp_action: SeccompAction, exit_evt: EventFd, - iommu: bool, + access_platform_enabled: bool, state: Option, ) -> Result { let num_queues = vu_cfg.num_queues; @@ -81,6 +70,7 @@ impl Blk { vu_num_queues, config, paused, + vring_bases, ) = if let Some(state) = state { info!("Restoring vhost-user-block {id}"); @@ -89,6 +79,8 @@ impl Blk { state.acked_protocol_features, )?; + vu.restore_state(&state)?; + ( state.avail_features, state.acked_features, @@ -96,6 +88,7 @@ impl Blk { state.vu_num_queues, state.config, true, + state.vring_bases, ) } else { // Filling device and vring features VMM supports. 
@@ -120,7 +113,8 @@ impl Blk { | VhostUserProtocolFeatures::CONFIGURE_MEM_SLOTS | VhostUserProtocolFeatures::REPLY_ACK | VhostUserProtocolFeatures::INFLIGHT_SHMFD - | VhostUserProtocolFeatures::LOG_SHMFD; + | VhostUserProtocolFeatures::LOG_SHMFD + | VhostUserProtocolFeatures::DEVICE_STATE; let (acked_features, acked_protocol_features) = vu.negotiate_features_vhost_user(avail_features, avail_protocol_features)?; @@ -169,83 +163,68 @@ impl Blk { num_queues, config, false, + None, ) }; Ok(Blk { - common: VirtioCommon { - device_type: VirtioDeviceType::Block as u32, - queue_sizes: vec![vu_cfg.queue_size; num_queues], - avail_features, - acked_features, - paused_sync: Some(Arc::new(Barrier::new(2))), - min_queues: DEFAULT_QUEUE_NUMBER as u16, - paused: Arc::new(AtomicBool::new(paused)), - ..Default::default() - }, vu_common: VhostUserCommon { + virtio_common: VirtioCommon { + device_type: VirtioDeviceType::Block as u32, + queue_sizes: vec![vu_cfg.queue_size; num_queues], + avail_features, + acked_features, + paused_sync: Some(Arc::new(Barrier::new(2))), + min_queues: DEFAULT_QUEUE_NUMBER as u16, + paused: Arc::new(AtomicBool::new(paused)), + ..Default::default() + }, vu: Some(Arc::new(Mutex::new(vu))), acked_protocol_features, socket_path: vu_cfg.socket, vu_num_queues, + vring_bases, ..Default::default() }, id, config, guest_memory: None, - epoll_thread: None, seccomp_action, exit_evt, - iommu, + access_platform_enabled, }) } - fn state(&self) -> State { - State { - avail_features: self.common.avail_features, - acked_features: self.common.acked_features, - config: self.config, - acked_protocol_features: self.vu_common.acked_protocol_features, - vu_num_queues: self.vu_common.vu_num_queues, - } + fn state(&self) -> std::result::Result { + self.vu_common.state(self.config) } } impl Drop for Blk { fn drop(&mut self) { - if let Some(kill_evt) = self.common.kill_evt.take() - && let Err(e) = kill_evt.write(1) - { - error!("failed to kill vhost-user-blk: {e:?}"); - } - 
self.common.wait_for_epoll_threads(); - if let Some(thread) = self.epoll_thread.take() - && let Err(e) = thread.join() - { - error!("Error joining thread: {e:?}"); - } + self.vu_common.shutdown(); } } impl VirtioDevice for Blk { fn device_type(&self) -> u32 { - self.common.device_type + self.vu_common.virtio_common.device_type } fn queue_max_sizes(&self) -> &[u16] { - &self.common.queue_sizes + &self.vu_common.virtio_common.queue_sizes } fn features(&self) -> u64 { - let mut features = self.common.avail_features; - if self.iommu { - features |= 1u64 << VIRTIO_F_IOMMU_PLATFORM; + let mut features = self.vu_common.virtio_common.avail_features; + if self.access_platform_enabled { + features |= 1u64 << VIRTIO_F_ACCESS_PLATFORM; } features } fn ack_features(&mut self, value: u64) { - self.common.ack_features(value); + self.vu_common.virtio_common.ack_features(value); } fn read_config(&self, offset: u64, data: &mut [u8]) { @@ -275,37 +254,44 @@ impl VirtioDevice for Blk { .set_config(offset as u32, VhostUserConfigFlags::WRITABLE, data) .map_err(Error::VhostUserSetConfig) { - error!("Failed setting vhost-user-blk configuration: {e:?}"); + error!( + "Failed setting vhost-user-blk configuration for socket {} at offset 0x{offset:x} with length {}: {e:?}", + self.vu_common.socket_path, + data.len() + ); } } - fn activate( - &mut self, - mem: GuestMemoryAtomic, - interrupt_cb: Arc, - queues: Vec<(usize, Queue, EventFd)>, - ) -> ActivateResult { - self.common.activate(&queues, interrupt_cb.clone())?; + fn activate(&mut self, context: crate::device::ActivationContext) -> ActivateResult { + let crate::device::ActivationContext { + mem, + interrupt_cb, + queues, + .. + } = context; + self.vu_common + .virtio_common + .activate(&queues, interrupt_cb.clone())?; self.guest_memory = Some(mem.clone()); let backend_req_handler: Option> = None; // Run a dedicated thread for handling potential reconnections with // the backend. 
- let (kill_evt, pause_evt) = self.common.dup_eventfds(); + let (kill_evt, pause_evt) = self.vu_common.virtio_common.dup_eventfds(); let mut handler = self.vu_common.activate( mem, &queues, interrupt_cb, - self.common.acked_features, + self.vu_common.virtio_common.acked_features, backend_req_handler, kill_evt, pause_evt, )?; - let paused = self.common.paused.clone(); - let paused_sync = self.common.paused_sync.clone(); + let paused = self.vu_common.virtio_common.paused.clone(); + let paused_sync = self.vu_common.virtio_common.paused_sync.clone(); let mut epoll_threads = Vec::new(); @@ -317,25 +303,28 @@ impl VirtioDevice for Blk { &self.exit_evt, move || handler.run(&paused, paused_sync.as_ref().unwrap()), )?; - self.epoll_thread = Some(epoll_threads.remove(0)); + self.vu_common.epoll_thread = Some(epoll_threads.remove(0)); Ok(()) } fn reset(&mut self) -> Option> { // We first must resume the virtio thread if it was paused. - if self.common.pause_evt.take().is_some() { - self.common.resume().ok()?; + if self.vu_common.virtio_common.pause_evt.take().is_some() { + self.vu_common.virtio_common.resume().ok()?; } if let Some(vu) = &self.vu_common.vu && let Err(e) = vu.lock().unwrap().reset_vhost_user() { - error!("Failed to reset vhost-user daemon: {e:?}"); + error!( + "Failed to reset vhost-user daemon for socket {}: {e:?}", + self.vu_common.socket_path + ); return None; } - if let Some(kill_evt) = self.common.kill_evt.take() { + if let Some(kill_evt) = self.vu_common.virtio_common.kill_evt.take() { // Ignore the result because there is nothing we can do about it. 
let _ = kill_evt.write(1); } @@ -343,7 +332,7 @@ impl VirtioDevice for Blk { event!("virtio-device", "reset", "id", &self.id); // Return the interrupt - Some(self.common.interrupt_cb.take().unwrap()) + Some(self.vu_common.virtio_common.interrupt_cb.take().unwrap()) } fn shutdown(&mut self) { @@ -361,13 +350,13 @@ impl VirtioDevice for Blk { impl Pausable for Blk { fn pause(&mut self) -> result::Result<(), MigratableError> { self.vu_common.pause()?; - self.common.pause() + self.vu_common.virtio_common.pause() } fn resume(&mut self) -> result::Result<(), MigratableError> { - self.common.resume()?; + self.vu_common.virtio_common.resume()?; - if let Some(epoll_thread) = &self.epoll_thread { + if let Some(epoll_thread) = &self.vu_common.epoll_thread { epoll_thread.thread().unpark(); } @@ -381,7 +370,7 @@ impl Snapshottable for Blk { } fn snapshot(&mut self) -> std::result::Result { - self.vu_common.snapshot(&self.state()) + self.vu_common.snapshot(&self.state()?) } } impl Transportable for Blk {} @@ -404,7 +393,6 @@ impl Migratable for Blk { } fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> { - self.vu_common - .complete_migration(self.common.kill_evt.take()) + self.vu_common.complete_migration() } } diff --git a/virtio-devices/src/vhost_user/fs.rs b/virtio-devices/src/vhost_user/fs.rs index d0005af90f..c062edf19b 100644 --- a/virtio-devices/src/vhost_user/fs.rs +++ b/virtio-devices/src/vhost_user/fs.rs @@ -1,9 +1,9 @@ // Copyright 2019 Intel Corporation. All Rights Reserved. 
// SPDX-License-Identifier: Apache-2.0 +use std::result; use std::sync::atomic::AtomicBool; use std::sync::{Arc, Barrier, Mutex}; -use std::{result, thread}; use event_monitor::event; use log::{error, info}; @@ -12,7 +12,6 @@ use serde::{Deserialize, Serialize}; use serde_with::{Bytes, serde_as}; use vhost::vhost_user::message::{VhostUserProtocolFeatures, VhostUserVirtioFeatures}; use vhost::vhost_user::{FrontendReqHandler, VhostUserFrontend, VhostUserFrontendReqHandler}; -use virtio_queue::Queue; use vm_device::UserspaceMapping; use vm_memory::{ByteValued, GuestMemoryAtomic}; use vm_migration::protocol::MemoryRangeTable; @@ -23,24 +22,16 @@ use super::vu_common_ctrl::VhostUserHandle; use super::{DEFAULT_VIRTIO_FEATURES, Error, Result}; use crate::seccomp_filters::Thread; use crate::thread_helper::spawn_virtio_thread; -use crate::vhost_user::VhostUserCommon; +use crate::vhost_user::{VhostUserCommon, VhostUserState}; use crate::{ - ActivateResult, GuestMemoryMmap, GuestRegionMmap, MmapRegion, VIRTIO_F_IOMMU_PLATFORM, + ActivateResult, GuestMemoryMmap, GuestRegionMmap, MmapRegion, VIRTIO_F_ACCESS_PLATFORM, VirtioCommon, VirtioDevice, VirtioDeviceType, VirtioInterrupt, VirtioSharedMemoryList, }; const NUM_QUEUE_OFFSET: usize = 1; const DEFAULT_QUEUE_NUMBER: usize = 2; -#[derive(Serialize, Deserialize)] -pub struct State { - pub avail_features: u64, - pub acked_features: u64, - pub config: VirtioFsConfig, - pub acked_protocol_features: u64, - pub vu_num_queues: usize, - pub backend_req_support: bool, -} +pub type State = VhostUserState; struct BackendReqHandler {} impl VhostUserFrontendReqHandler for BackendReqHandler {} @@ -68,7 +59,6 @@ impl Default for VirtioFsConfig { unsafe impl ByteValued for VirtioFsConfig {} pub struct Fs { - common: VirtioCommon, vu_common: VhostUserCommon, id: String, config: VirtioFsConfig, @@ -77,9 +67,8 @@ pub struct Fs { cache: Option<(VirtioSharedMemoryList, MmapRegion)>, seccomp_action: SeccompAction, guest_memory: Option>, - 
epoll_thread: Option>, exit_evt: EventFd, - iommu: bool, + access_platform_enabled: bool, } impl Fs { @@ -94,7 +83,7 @@ impl Fs { cache: Option<(VirtioSharedMemoryList, MmapRegion)>, seccomp_action: SeccompAction, exit_evt: EventFd, - iommu: bool, + access_platform_enabled: bool, state: Option, ) -> Result { // Calculate the actual number of queues needed. @@ -110,6 +99,7 @@ impl Fs { vu_num_queues, config, paused, + vring_bases, ) = if let Some(state) = state { info!("Restoring vhost-user-fs {id}"); @@ -118,6 +108,8 @@ impl Fs { state.acked_protocol_features, )?; + vu.restore_state(&state)?; + ( state.avail_features, state.acked_features, @@ -125,6 +117,7 @@ impl Fs { state.vu_num_queues, state.config, true, + state.vring_bases, ) } else { // Filling device and vring features VMM supports. @@ -134,7 +127,8 @@ impl Fs { | VhostUserProtocolFeatures::CONFIGURE_MEM_SLOTS | VhostUserProtocolFeatures::REPLY_ACK | VhostUserProtocolFeatures::INFLIGHT_SHMFD - | VhostUserProtocolFeatures::LOG_SHMFD; + | VhostUserProtocolFeatures::LOG_SHMFD + | VhostUserProtocolFeatures::DEVICE_STATE; let (acked_features, acked_protocol_features) = vu.negotiate_features_vhost_user(avail_features, avail_protocol_features)?; @@ -177,25 +171,27 @@ impl Fs { num_queues, config, false, + None, ) }; Ok(Fs { - common: VirtioCommon { - device_type: VirtioDeviceType::Fs as u32, - avail_features, - acked_features, - queue_sizes: vec![queue_size; num_queues], - paused_sync: Some(Arc::new(Barrier::new(2))), - min_queues: 1, - paused: Arc::new(AtomicBool::new(paused)), - ..Default::default() - }, vu_common: VhostUserCommon { + virtio_common: VirtioCommon { + device_type: VirtioDeviceType::Fs as u32, + avail_features, + acked_features, + queue_sizes: vec![queue_size; num_queues], + paused_sync: Some(Arc::new(Barrier::new(2))), + min_queues: 1, + paused: Arc::new(AtomicBool::new(paused)), + ..Default::default() + }, vu: Some(Arc::new(Mutex::new(vu))), acked_protocol_features, socket_path: path.to_string(), 
vu_num_queues, + vring_bases, ..Default::default() }, id, @@ -203,90 +199,76 @@ impl Fs { cache, seccomp_action, guest_memory: None, - epoll_thread: None, exit_evt, - iommu, + access_platform_enabled, }) } - fn state(&self) -> State { - State { - avail_features: self.common.avail_features, - acked_features: self.common.acked_features, - config: self.config, - acked_protocol_features: self.vu_common.acked_protocol_features, - vu_num_queues: self.vu_common.vu_num_queues, - backend_req_support: false, - } + fn state(&self) -> std::result::Result { + self.vu_common.state(self.config) } } impl Drop for Fs { fn drop(&mut self) { - if let Some(kill_evt) = self.common.kill_evt.take() { - // Ignore the result because there is nothing we can do about it. - let _ = kill_evt.write(1); - } - self.common.wait_for_epoll_threads(); - if let Some(thread) = self.epoll_thread.take() - && let Err(e) = thread.join() - { - error!("Error joining thread: {e:?}"); - } + self.vu_common.shutdown(); } } impl VirtioDevice for Fs { fn device_type(&self) -> u32 { - self.common.device_type + self.vu_common.virtio_common.device_type } fn queue_max_sizes(&self) -> &[u16] { - &self.common.queue_sizes + &self.vu_common.virtio_common.queue_sizes } fn features(&self) -> u64 { - let mut features = self.common.avail_features; - if self.iommu { - features |= 1u64 << VIRTIO_F_IOMMU_PLATFORM; + let mut features = self.vu_common.virtio_common.avail_features; + if self.access_platform_enabled { + features |= 1u64 << VIRTIO_F_ACCESS_PLATFORM; } features } fn ack_features(&mut self, value: u64) { - self.common.ack_features(value); + self.vu_common.virtio_common.ack_features(value); } fn read_config(&self, offset: u64, data: &mut [u8]) { self.read_config_from_slice(self.config.as_slice(), offset, data); } - fn activate( - &mut self, - mem: GuestMemoryAtomic, - interrupt_cb: Arc, - queues: Vec<(usize, Queue, EventFd)>, - ) -> ActivateResult { - self.common.activate(&queues, interrupt_cb.clone())?; + fn 
activate(&mut self, context: crate::device::ActivationContext) -> ActivateResult { + let crate::device::ActivationContext { + mem, + interrupt_cb, + queues, + .. + } = context; + self.vu_common + .virtio_common + .activate(&queues, interrupt_cb.clone())?; self.guest_memory = Some(mem.clone()); let backend_req_handler: Option> = None; // Run a dedicated thread for handling potential reconnections with // the backend. - let (kill_evt, pause_evt) = self.common.dup_eventfds(); + let (kill_evt, pause_evt) = self.vu_common.virtio_common.dup_eventfds(); let mut handler = self.vu_common.activate( mem, &queues, interrupt_cb, - self.common.acked_features, + self.vu_common.virtio_common.acked_features, backend_req_handler, kill_evt, pause_evt, )?; - let paused = self.common.paused.clone(); - let paused_sync = self.common.paused_sync.clone(); + let paused = self.vu_common.virtio_common.paused.clone(); + let paused_sync = self.vu_common.virtio_common.paused_sync.clone(); let mut epoll_threads = Vec::new(); spawn_virtio_thread( @@ -297,7 +279,7 @@ impl VirtioDevice for Fs { &self.exit_evt, move || handler.run(&paused, paused_sync.as_ref().unwrap()), )?; - self.epoll_thread = Some(epoll_threads.remove(0)); + self.vu_common.epoll_thread = Some(epoll_threads.remove(0)); event!("virtio-device", "activated", "id", &self.id); Ok(()) @@ -305,18 +287,21 @@ impl VirtioDevice for Fs { fn reset(&mut self) -> Option> { // We first must resume the virtio thread if it was paused. 
- if self.common.pause_evt.take().is_some() { - self.common.resume().ok()?; + if self.vu_common.virtio_common.pause_evt.take().is_some() { + self.vu_common.virtio_common.resume().ok()?; } if let Some(vu) = &self.vu_common.vu && let Err(e) = vu.lock().unwrap().reset_vhost_user() { - error!("Failed to reset vhost-user daemon: {e:?}"); + error!( + "Failed to reset vhost-user daemon for socket {}: {e:?}", + self.vu_common.socket_path + ); return None; } - if let Some(kill_evt) = self.common.kill_evt.take() { + if let Some(kill_evt) = self.vu_common.virtio_common.kill_evt.take() { // Ignore the result because there is nothing we can do about it. let _ = kill_evt.write(1); } @@ -324,7 +309,7 @@ impl VirtioDevice for Fs { event!("virtio-device", "reset", "id", &self.id); // Return the interrupt - Some(self.common.interrupt_cb.take().unwrap()) + Some(self.vu_common.virtio_common.interrupt_cb.take().unwrap()) } fn shutdown(&mut self) { @@ -372,13 +357,13 @@ impl VirtioDevice for Fs { impl Pausable for Fs { fn pause(&mut self) -> result::Result<(), MigratableError> { self.vu_common.pause()?; - self.common.pause() + self.vu_common.virtio_common.pause() } fn resume(&mut self) -> result::Result<(), MigratableError> { - self.common.resume()?; + self.vu_common.virtio_common.resume()?; - if let Some(epoll_thread) = &self.epoll_thread { + if let Some(epoll_thread) = &self.vu_common.epoll_thread { epoll_thread.thread().unpark(); } @@ -392,7 +377,7 @@ impl Snapshottable for Fs { } fn snapshot(&mut self) -> std::result::Result { - self.vu_common.snapshot(&self.state()) + self.vu_common.snapshot(&self.state()?) 
} } impl Transportable for Fs {} @@ -415,7 +400,6 @@ impl Migratable for Fs { } fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> { - self.vu_common - .complete_migration(self.common.kill_evt.take()) + self.vu_common.complete_migration() } } diff --git a/virtio-devices/src/vhost_user/generic_vhost_user.rs b/virtio-devices/src/vhost_user/generic_vhost_user.rs new file mode 100644 index 0000000000..dda891e912 --- /dev/null +++ b/virtio-devices/src/vhost_user/generic_vhost_user.rs @@ -0,0 +1,428 @@ +// Copyright 2019 Intel Corporation. All Rights Reserved. +// Copyright 2025 Demi Marie Obenour. +// SPDX-License-Identifier: Apache-2.0 + +use std::result; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::{Arc, Barrier, Mutex}; + +use event_monitor::event; +use log::{error, info, warn}; +use seccompiler::SeccompAction; +use vhost::vhost_user::message::{ + VhostUserConfigFlags, VhostUserProtocolFeatures, VhostUserVirtioFeatures, +}; +use vhost::vhost_user::{FrontendReqHandler, VhostUserFrontend, VhostUserFrontendReqHandler}; +use vm_device::UserspaceMapping; +use vm_memory::GuestMemoryAtomic; +use vm_migration::protocol::MemoryRangeTable; +use vm_migration::{Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable}; +use vmm_sys_util::eventfd::EventFd; + +use super::vu_common_ctrl::VhostUserHandle; +use super::{Error, Result}; +use crate::seccomp_filters::Thread; +use crate::thread_helper::spawn_virtio_thread; +use crate::vhost_user::{VhostUserCommon, VhostUserState}; +use crate::{ + ActivateResult, GuestMemoryMmap, GuestRegionMmap, MmapRegion, VIRTIO_F_ACCESS_PLATFORM, + VirtioCommon, VirtioDevice, VirtioInterrupt, VirtioSharedMemoryList, +}; + +pub type State = VhostUserState<()>; + +struct BackendReqHandler {} +impl VhostUserFrontendReqHandler for BackendReqHandler {} +pub struct GenericVhostUser { + vu_common: VhostUserCommon, + id: String, + // Hold ownership of the memory that is allocated for the device 
+ // which will be automatically dropped when the device is dropped + cache: Option<(VirtioSharedMemoryList, MmapRegion)>, + seccomp_action: SeccompAction, + guest_memory: Option>, + exit_evt: EventFd, + access_platform_enabled: bool, + cfg_warning: AtomicBool, +} + +impl GenericVhostUser { + /// Create a new generic vhost-user device. + #[allow(clippy::too_many_arguments)] + pub fn new( + id: String, + path: &str, + request_queue_sizes: Vec, + device_type: u32, + cache: Option<(VirtioSharedMemoryList, MmapRegion)>, + seccomp_action: SeccompAction, + exit_evt: EventFd, + access_platform_enabled: bool, + state: Option, + ) -> Result { + // Calculate the actual number of queues needed. + let num_queues = request_queue_sizes.len(); + + // Connect to the vhost-user socket. + let mut vu = VhostUserHandle::connect_vhost_user(false, path, num_queues as u64, false)?; + + let ( + avail_features, + acked_features, + acked_protocol_features, + vu_num_queues, + paused, + vring_bases, + ) = if let Some(state) = state { + info!("Restoring generic vhost-user {id}"); + vu.set_protocol_features_vhost_user( + state.acked_features, + state.acked_protocol_features, + )?; + + vu.restore_state(&state)?; + + ( + state.avail_features, + state.acked_features, + state.acked_protocol_features, + state.vu_num_queues, + true, + state.vring_bases, + ) + } else { + let avail_protocol_features = VhostUserProtocolFeatures::CONFIG + | VhostUserProtocolFeatures::MQ + | VhostUserProtocolFeatures::CONFIGURE_MEM_SLOTS + | VhostUserProtocolFeatures::REPLY_ACK + | VhostUserProtocolFeatures::INFLIGHT_SHMFD + | VhostUserProtocolFeatures::LOG_SHMFD + | VhostUserProtocolFeatures::DEVICE_STATE; + + let avail_features = super::DEFAULT_VIRTIO_FEATURES; + + let (acked_features, acked_protocol_features) = + vu.negotiate_features_vhost_user(avail_features, avail_protocol_features)?; + + let backend_num_queues = + if acked_protocol_features & VhostUserProtocolFeatures::MQ.bits() != 0 { + vu.socket_handle() + 
.get_queue_num() + .map_err(Error::VhostUserGetQueueMaxNum)? as usize + } else { + num_queues + }; + + if num_queues > backend_num_queues { + error!( + "generic vhost-user requested too many queues ({num_queues}) \ +since the backend only supports {backend_num_queues}\n", + ); + return Err(Error::BadQueueNum); + } + + ( + acked_features, + // If part of the available features that have been acked, the + // PROTOCOL_FEATURES bit must be already set through the VIRTIO + // acked features as we know the guest would never ack it, thus + // the feature would be lost. + acked_features & VhostUserVirtioFeatures::PROTOCOL_FEATURES.bits(), + acked_protocol_features, + num_queues, + false, + None, + ) + }; + + Ok(GenericVhostUser { + vu_common: VhostUserCommon { + virtio_common: VirtioCommon { + device_type, + avail_features, + acked_features, + queue_sizes: request_queue_sizes, + paused_sync: Some(Arc::new(Barrier::new(2))), + min_queues: 1, + paused: Arc::new(AtomicBool::new(paused)), + ..Default::default() + }, + vu: Some(Arc::new(Mutex::new(vu))), + acked_protocol_features, + socket_path: path.to_string(), + vu_num_queues, + vring_bases, + ..Default::default() + }, + id, + cache, + seccomp_action, + guest_memory: None, + exit_evt, + access_platform_enabled, + cfg_warning: AtomicBool::new(false), + }) + } + + fn state(&self) -> std::result::Result { + self.vu_common.state(()) + } + + #[cold] + #[inline(never)] + fn warn_no_config_access(&self) { + if self + .cfg_warning + .compare_exchange(false, true, Ordering::SeqCst, Ordering::SeqCst) + .is_ok() + { + warn!( + "Attempt to read config space, but backend does not support config \ +space access. Reads will return 0xFF and writes will be ignored." 
+ ); + } + } +} + +impl Drop for GenericVhostUser { + fn drop(&mut self) { + self.vu_common.shutdown(); + } +} + +impl VirtioDevice for GenericVhostUser { + fn device_type(&self) -> u32 { + self.vu_common.virtio_common.device_type + } + + fn queue_max_sizes(&self) -> &[u16] { + &self.vu_common.virtio_common.queue_sizes + } + + fn features(&self) -> u64 { + let mut features = self.vu_common.virtio_common.avail_features; + if self.access_platform_enabled { + features |= 1u64 << VIRTIO_F_ACCESS_PLATFORM; + } + features + } + + fn ack_features(&mut self, value: u64) { + self.vu_common.virtio_common.ack_features(value); + } + + fn read_config(&self, offset: u64, data: &mut [u8]) { + if (VhostUserProtocolFeatures::CONFIG.bits() & self.vu_common.acked_protocol_features) == 0 + { + self.warn_no_config_access(); + + data.fill(0xFF); + return; + } + if let Err(e) = self + .vu_common + .vu + .as_ref() + .unwrap() + .lock() + .unwrap() + .socket_handle() + .get_config( + offset.try_into().unwrap(), + data.len().try_into().unwrap(), + VhostUserConfigFlags::empty(), + data, + ) + .map(|(_, config)| data.copy_from_slice(&config)) + { + panic!("Failed getting generic vhost-user configuration: {e}"); + } + } + + fn write_config(&mut self, offset: u64, data: &[u8]) { + if (VhostUserProtocolFeatures::CONFIG.bits() & self.vu_common.acked_protocol_features) == 0 + { + self.warn_no_config_access(); + return; + } + if let Err(e) = self + .vu_common + .vu + .as_ref() + .unwrap() + .lock() + .unwrap() + .socket_handle() + .set_config( + offset.try_into().unwrap(), + VhostUserConfigFlags::WRITABLE, + data, + ) + { + panic!("Failed setting generic vhost-user configuration: {e}"); + } + } + + fn activate(&mut self, context: crate::device::ActivationContext) -> ActivateResult { + let crate::device::ActivationContext { + mem, + interrupt_cb, + queues, + .. 
+ } = context; + self.vu_common + .virtio_common + .activate(&queues, interrupt_cb.clone())?; + self.guest_memory = Some(mem.clone()); + + let backend_req_handler: Option> = None; + // Run a dedicated thread for handling potential reconnections with + // the backend. + let (kill_evt, pause_evt) = self.vu_common.virtio_common.dup_eventfds(); + + let mut handler = self.vu_common.activate( + mem, + &queues, + interrupt_cb, + self.vu_common.virtio_common.acked_features, + backend_req_handler, + kill_evt, + pause_evt, + )?; + + let paused = self.vu_common.virtio_common.paused.clone(); + let paused_sync = self.vu_common.virtio_common.paused_sync.clone(); + + let mut epoll_threads = Vec::new(); + spawn_virtio_thread( + &self.id, + &self.seccomp_action, + Thread::VirtioGenericVhostUser, + &mut epoll_threads, + &self.exit_evt, + move || handler.run(&paused, paused_sync.as_ref().unwrap()), + )?; + self.vu_common.epoll_thread = Some(epoll_threads.remove(0)); + + event!("virtio-device", "activated", "id", &self.id); + Ok(()) + } + + fn reset(&mut self) -> Option> { + // We first must resume the virtio thread if it was paused. + if self.vu_common.virtio_common.pause_evt.take().is_some() { + self.vu_common.virtio_common.resume().ok()?; + } + + if let Some(vu) = &self.vu_common.vu + && let Err(e) = vu.lock().unwrap().reset_vhost_user() + { + error!( + "Failed to reset vhost-user daemon for socket {}: {e:?}", + self.vu_common.socket_path + ); + return None; + } + + if let Some(kill_evt) = self.vu_common.virtio_common.kill_evt.take() { + // Ignore the result because there is nothing we can do about it. 
+ let _ = kill_evt.write(1); + } + + event!("virtio-device", "reset", "id", &self.id); + + // Return the interrupt + Some(self.vu_common.virtio_common.interrupt_cb.take().unwrap()) + } + + fn shutdown(&mut self) { + self.vu_common.shutdown(); + } + + fn get_shm_regions(&self) -> Option { + self.cache.as_ref().map(|cache| cache.0.clone()) + } + + fn set_shm_regions( + &mut self, + shm_regions: VirtioSharedMemoryList, + ) -> std::result::Result<(), crate::Error> { + if let Some(cache) = self.cache.as_mut() { + cache.0 = shm_regions; + Ok(()) + } else { + Err(crate::Error::SetShmRegionsNotSupported) + } + } + + fn add_memory_region( + &mut self, + region: &Arc, + ) -> std::result::Result<(), crate::Error> { + self.vu_common.add_memory_region(&self.guest_memory, region) + } + + fn userspace_mappings(&self) -> Vec { + let mut mappings = Vec::new(); + if let Some(cache) = self.cache.as_ref() { + mappings.push(UserspaceMapping { + mem_slot: cache.0.mem_slot, + addr: cache.0.addr, + mapping: cache.0.mapping.clone(), + mergeable: false, + }); + } + + mappings + } +} + +impl Pausable for GenericVhostUser { + fn pause(&mut self) -> result::Result<(), MigratableError> { + self.vu_common.pause()?; + self.vu_common.virtio_common.pause() + } + + fn resume(&mut self) -> result::Result<(), MigratableError> { + self.vu_common.virtio_common.resume()?; + + if let Some(epoll_thread) = &self.vu_common.epoll_thread { + epoll_thread.thread().unpark(); + } + + self.vu_common.resume() + } +} + +impl Snapshottable for GenericVhostUser { + fn id(&self) -> String { + self.id.clone() + } + + fn snapshot(&mut self) -> std::result::Result { + self.vu_common.snapshot(&self.state()?) 
+ } +} +impl Transportable for GenericVhostUser {} + +impl Migratable for GenericVhostUser { + fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> { + self.vu_common.start_dirty_log(&self.guest_memory) + } + + fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> { + self.vu_common.stop_dirty_log() + } + + fn dirty_log(&mut self) -> std::result::Result { + self.vu_common.dirty_log(&self.guest_memory) + } + + fn start_migration(&mut self) -> std::result::Result<(), MigratableError> { + self.vu_common.start_migration() + } + + fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> { + self.vu_common.complete_migration() + } +} diff --git a/virtio-devices/src/vhost_user/mod.rs b/virtio-devices/src/vhost_user/mod.rs index 7e2c162cb9..abca12c058 100644 --- a/virtio-devices/src/vhost_user/mod.rs +++ b/virtio-devices/src/vhost_user/mod.rs @@ -1,11 +1,11 @@ // Copyright 2019 Intel Corporation. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 -use std::io; use std::ops::Deref; use std::os::unix::io::AsRawFd; -use std::sync::atomic::AtomicBool; +use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::{Arc, Barrier, Mutex}; +use std::{io, thread}; use anyhow::anyhow; use log::error; @@ -17,8 +17,9 @@ use vhost::vhost_user::message::{ }; use vhost::vhost_user::{FrontendReqHandler, VhostUserFrontendReqHandler}; use virtio_queue::{Error as QueueError, Queue}; +use vm_memory::guest_memory::Error as MmapError; use vm_memory::mmap::MmapRegionError; -use vm_memory::{Address, Error as MmapError, GuestAddressSpace, GuestMemory, GuestMemoryAtomic}; +use vm_memory::{Address, GuestAddressSpace, GuestMemory, GuestMemoryAtomic}; use vm_migration::protocol::MemoryRangeTable; use vm_migration::{MigratableError, Snapshot}; use vmm_sys_util::eventfd::EventFd; @@ -28,16 +29,18 @@ use crate::{ ActivateError, EPOLL_HELPER_EVENT_LAST, EpollHelper, EpollHelperError, EpollHelperHandler, GuestMemoryMmap, GuestRegionMmap, 
VIRTIO_F_IN_ORDER, VIRTIO_F_NOTIFICATION_DATA, VIRTIO_F_ORDER_PLATFORM, VIRTIO_F_RING_EVENT_IDX, VIRTIO_F_RING_INDIRECT_DESC, - VIRTIO_F_VERSION_1, VirtioInterrupt, + VIRTIO_F_VERSION_1, VirtioCommon, VirtioInterrupt, }; pub mod blk; pub mod fs; +pub mod generic_vhost_user; pub mod net; pub mod vu_common_ctrl; pub use self::blk::Blk; pub use self::fs::*; +pub use self::generic_vhost_user::GenericVhostUser; pub use self::net::Net; pub use self::vu_common_ctrl::VhostUserConfig; @@ -145,6 +148,16 @@ pub enum Error { NewMmapRegion(#[source] MmapRegionError), #[error("Could not find the shm log region")] MissingShmLogRegion, + #[error("Failed setting device state fd")] + VhostUserSetDeviceStateFd(#[source] VhostError), + #[error("Failed checking device state")] + VhostUserCheckDeviceState(#[source] VhostError), + #[error("Failed saving/restoring backend state")] + SaveRestoreBackendState(#[source] io::Error), + #[error("Vring bases count ({0}) does not match queue count ({1})")] + VringBasesCountMismatch(usize, usize), + #[error("Backend state and vring bases must both be present or both be absent")] + InconsistentBackendState, } type Result = std::result::Result; @@ -154,7 +167,8 @@ pub const DEFAULT_VIRTIO_FEATURES: u64 = (1 << VIRTIO_F_RING_INDIRECT_DESC) | (1 << VIRTIO_F_IN_ORDER) | (1 << VIRTIO_F_ORDER_PLATFORM) | (1 << VIRTIO_F_NOTIFICATION_DATA) - | VhostUserVirtioFeatures::PROTOCOL_FEATURES.bits(); + | VhostUserVirtioFeatures::PROTOCOL_FEATURES.bits() + | VhostUserVirtioFeatures::LOG_ALL.bits(); const HUP_CONNECTION_EVENT: u16 = EPOLL_HELPER_EVENT_LAST + 1; const BACKEND_REQ_EVENT: u16 = EPOLL_HELPER_EVENT_LAST + 2; @@ -217,7 +231,8 @@ impl VhostUserEpollHandler { ) .map_err(|e| { EpollHelperError::IoError(std::io::Error::other(format!( - "failed connecting vhost-user backend {e:?}" + "failed connecting vhost-user backend for socket {}: {e:?}", + self.socket_path ))) })?; @@ -268,7 +283,8 @@ impl EpollHelperHandler for VhostUserEpollHandle HUP_CONNECTION_EVENT 
=> { self.reconnect(helper).map_err(|e| { EpollHelperError::HandleEvent(anyhow!( - "failed to reconnect vhost-user backend: {e:?}" + "failed to reconnect vhost-user backend for socket {}: {e:?}", + self.socket_path )) })?; } @@ -292,14 +308,47 @@ impl EpollHelperHandler for VhostUserEpollHandle } } +/// Common snapshot state for all vhost-user device types. +/// +/// Generic over `C` which is the device-specific config type +/// (e.g. VirtioBlockConfig, VirtioFsConfig, VirtioNetConfig). +/// Devices without a config type use `()`. +#[derive(Default, Serialize, Deserialize)] +pub struct VhostUserState { + pub avail_features: u64, + pub acked_features: u64, + pub config: C, + pub acked_protocol_features: u64, + pub vu_num_queues: usize, + #[serde(default)] + pub backend_req_support: bool, + #[serde(default)] + pub vring_bases: Option>, + #[serde(default)] + pub backend_state: Option>, +} + +impl VhostUserState { + pub fn validate(&self) -> Result<()> { + if self.backend_state.is_some() != self.vring_bases.is_some() { + return Err(Error::InconsistentBackendState); + } + Ok(()) + } +} + #[derive(Default)] pub struct VhostUserCommon { + pub virtio_common: VirtioCommon, pub vu: Option>>, pub acked_protocol_features: u64, pub socket_path: String, pub vu_num_queues: usize, pub migration_started: bool, pub server: bool, + pub interrupt_cb: Option>, + pub vring_bases: Option>, + pub epoll_thread: Option>, } impl VhostUserCommon { @@ -323,7 +372,7 @@ impl VhostUserCommon { }; if self.vu.is_none() { - error!("Missing vhost-user handle"); + error!("Missing vhost-user handle for socket {}", self.socket_path); return Err(ActivateError::BadActivate); } let vu = self.vu.as_ref().unwrap(); @@ -331,6 +380,7 @@ impl VhostUserCommon { .iter() .map(|(i, q, e)| (*i, vm_virtio::clone_queue(q), e.try_clone().unwrap())) .collect::>(); + let vring_bases = self.vring_bases.take(); vu.lock() .unwrap() .setup_vhost_user( @@ -340,9 +390,12 @@ impl VhostUserCommon { acked_features, 
&backend_req_handler, inflight.as_mut(), + vring_bases.as_deref(), ) .map_err(ActivateError::VhostUserSetup)?; + self.interrupt_cb = Some(interrupt_cb.clone()); + Ok(VhostUserEpollHandler { vu: vu.clone(), mem, @@ -375,6 +428,22 @@ impl VhostUserCommon { } pub fn shutdown(&mut self) { + // Signal the epoll thread to exit, unpause it (it may be parked + // if the VM was paused for migration), then wait for it to finish. + // This ensures the thread drops its Arc, fully + // closing the vhost-user socket so the backend can accept a new + // connection from the destination. + if let Some(kill_evt) = self.virtio_common.kill_evt.take() { + let _ = kill_evt.write(1); + } + self.virtio_common.paused.store(false, Ordering::SeqCst); + if let Some(t) = self.epoll_thread.as_ref() { + t.thread().unpark(); + } + if let Some(t) = self.epoll_thread.take() { + let _ = t.join(); + } + // Remove socket path if needed if self.server { let _ = std::fs::remove_file(&self.socket_path); @@ -423,15 +492,48 @@ impl VhostUserCommon { if let Some(vu) = &self.vu { vu.lock().unwrap().resume_vhost_user().map_err(|e| { MigratableError::Resume(anyhow!("Error resuming vhost-user backend: {e:?}")) - }) - } else { - Ok(()) + })?; + } + if let Some(interrupt_cb) = &self.interrupt_cb { + for i in 0..self.vu_num_queues { + interrupt_cb + .trigger(crate::VirtioInterruptType::Queue(i as u16)) + .ok(); + } + } + Ok(()) + } + + pub fn state( + &self, + config: C, + ) -> std::result::Result, MigratableError> { + let mut state = VhostUserState { + avail_features: self.virtio_common.avail_features, + acked_features: self.virtio_common.acked_features, + config, + acked_protocol_features: self.acked_protocol_features, + vu_num_queues: self.vu_num_queues, + ..Default::default() + }; + + if let Some(vu) = &self.vu { + let mut vu_locked = vu.lock().unwrap(); + if vu_locked.supports_device_state() { + let (backend_state, vring_bases) = vu_locked.save_backend_state().map_err(|e| { + 
MigratableError::Snapshot(anyhow!("Failed saving backend state: {e:?}")) + })?; + state.backend_state = Some(backend_state); + state.vring_bases = Some(vring_bases); + } } + + Ok(state) } - pub fn snapshot<'a, T>(&mut self, state: &T) -> std::result::Result + pub fn snapshot(&mut self, state: &T) -> std::result::Result where - T: Serialize + Deserialize<'a>, + T: Serialize, { let snapshot = Snapshot::new_from_state(state)?; @@ -504,15 +606,12 @@ impl VhostUserCommon { Ok(()) } - pub fn complete_migration( - &mut self, - kill_evt: Option, - ) -> std::result::Result<(), MigratableError> { + pub fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> { self.migration_started = false; // Make sure the device thread is killed in order to prevent from // reconnections to the socket. - if let Some(kill_evt) = kill_evt { + if let Some(kill_evt) = self.virtio_common.kill_evt.take() { kill_evt.write(1).map_err(|e| { MigratableError::CompleteMigration(anyhow!( "Error killing vhost-user thread: {e:?}" diff --git a/virtio-devices/src/vhost_user/net.rs b/virtio-devices/src/vhost_user/net.rs index e0a71c7342..e5f9eda7d8 100644 --- a/virtio-devices/src/vhost_user/net.rs +++ b/virtio-devices/src/vhost_user/net.rs @@ -9,7 +9,6 @@ use event_monitor::event; use log::{error, info}; use net_util::{CtrlQueue, MacAddr, VirtioNetConfig, build_net_config_space}; use seccompiler::SeccompAction; -use serde::{Deserialize, Serialize}; use vhost::vhost_user::message::{VhostUserProtocolFeatures, VhostUserVirtioFeatures}; use vhost::vhost_user::{FrontendReqHandler, VhostUserFrontend, VhostUserFrontendReqHandler}; use virtio_bindings::virtio_net::{ @@ -19,7 +18,7 @@ use virtio_bindings::virtio_net::{ VIRTIO_NET_F_MAC, VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_MTU, }; use virtio_bindings::virtio_ring::VIRTIO_RING_F_EVENT_IDX; -use virtio_queue::{Queue, QueueT}; +use virtio_queue::QueueT; use vm_memory::{ByteValued, GuestMemoryAtomic}; use vm_migration::protocol::MemoryRangeTable; use 
vm_migration::{Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable}; @@ -28,37 +27,28 @@ use vmm_sys_util::eventfd::EventFd; use crate::seccomp_filters::Thread; use crate::thread_helper::spawn_virtio_thread; use crate::vhost_user::vu_common_ctrl::{VhostUserConfig, VhostUserHandle}; -use crate::vhost_user::{DEFAULT_VIRTIO_FEATURES, Error, Result, VhostUserCommon}; +use crate::vhost_user::{DEFAULT_VIRTIO_FEATURES, Error, Result, VhostUserCommon, VhostUserState}; use crate::{ - ActivateResult, GuestMemoryMmap, GuestRegionMmap, NetCtrlEpollHandler, VIRTIO_F_IOMMU_PLATFORM, - VirtioCommon, VirtioDevice, VirtioDeviceType, VirtioInterrupt, + ActivateResult, GuestMemoryMmap, GuestRegionMmap, NetCtrlEpollHandler, + VIRTIO_F_ACCESS_PLATFORM, VirtioCommon, VirtioDevice, VirtioDeviceType, VirtioInterrupt, }; const DEFAULT_QUEUE_NUMBER: usize = 2; -#[derive(Serialize, Deserialize)] -pub struct State { - pub avail_features: u64, - pub acked_features: u64, - pub config: VirtioNetConfig, - pub acked_protocol_features: u64, - pub vu_num_queues: usize, -} +pub type State = VhostUserState; struct BackendReqHandler {} impl VhostUserFrontendReqHandler for BackendReqHandler {} pub struct Net { - common: VirtioCommon, vu_common: VhostUserCommon, id: String, config: VirtioNetConfig, guest_memory: Option>, ctrl_queue_epoll_thread: Option>, - epoll_thread: Option>, seccomp_action: SeccompAction, exit_evt: EventFd, - iommu: bool, + access_platform_enabled: bool, } impl Net { @@ -72,7 +62,7 @@ impl Net { server: bool, seccomp_action: SeccompAction, exit_evt: EventFd, - iommu: bool, + access_platform_enabled: bool, state: Option, offload_tso: bool, offload_ufo: bool, @@ -90,6 +80,7 @@ impl Net { vu_num_queues, config, paused, + vring_bases, ) = if let Some(state) = state { info!("Restoring vhost-user-net {id}"); @@ -103,6 +94,8 @@ impl Net { state.acked_protocol_features, )?; + vu.restore_state(&state)?; + // If the control queue feature has been negotiated, let's // 
increase the number of queues. if state.acked_features & (1 << VIRTIO_NET_F_CTRL_VQ) != 0 { @@ -116,6 +109,7 @@ impl Net { state.vu_num_queues, state.config, true, + state.vring_bases, ) } else { // Filling device and vring features VMM supports. @@ -152,7 +146,8 @@ impl Net { | VhostUserProtocolFeatures::CONFIGURE_MEM_SLOTS | VhostUserProtocolFeatures::REPLY_ACK | VhostUserProtocolFeatures::INFLIGHT_SHMFD - | VhostUserProtocolFeatures::LOG_SHMFD; + | VhostUserProtocolFeatures::LOG_SHMFD + | VhostUserProtocolFeatures::DEVICE_STATE; let (mut acked_features, acked_protocol_features) = vu.negotiate_features_vhost_user(avail_features, avail_protocol_features)?; @@ -195,65 +190,48 @@ impl Net { vu_num_queues, config, false, + None, ) }; Ok(Net { id, - common: VirtioCommon { - device_type: VirtioDeviceType::Net as u32, - queue_sizes: vec![vu_cfg.queue_size; num_queues], - avail_features, - acked_features, - paused_sync: Some(Arc::new(Barrier::new(2))), - min_queues: DEFAULT_QUEUE_NUMBER as u16, - paused: Arc::new(AtomicBool::new(paused)), - ..Default::default() - }, vu_common: VhostUserCommon { + virtio_common: VirtioCommon { + device_type: VirtioDeviceType::Net as u32, + queue_sizes: vec![vu_cfg.queue_size; num_queues], + avail_features, + acked_features, + paused_sync: Some(Arc::new(Barrier::new(2))), + min_queues: DEFAULT_QUEUE_NUMBER as u16, + paused: Arc::new(AtomicBool::new(paused)), + ..Default::default() + }, vu: Some(Arc::new(Mutex::new(vu))), acked_protocol_features, socket_path: vu_cfg.socket, vu_num_queues, server, + vring_bases, ..Default::default() }, config, guest_memory: None, ctrl_queue_epoll_thread: None, - epoll_thread: None, seccomp_action, exit_evt, - iommu, + access_platform_enabled, }) } - fn state(&self) -> State { - State { - avail_features: self.common.avail_features, - acked_features: self.common.acked_features, - config: self.config, - acked_protocol_features: self.vu_common.acked_protocol_features, - vu_num_queues: 
self.vu_common.vu_num_queues, - } + fn state(&self) -> std::result::Result { + self.vu_common.state(self.config) } } impl Drop for Net { fn drop(&mut self) { - if let Some(kill_evt) = self.common.kill_evt.take() - && let Err(e) = kill_evt.write(1) - { - error!("failed to kill vhost-user-net: {e:?}"); - } - - self.common.wait_for_epoll_threads(); - - if let Some(thread) = self.epoll_thread.take() - && let Err(e) = thread.join() - { - error!("Error joining thread: {e:?}"); - } + self.vu_common.shutdown(); if let Some(thread) = self.ctrl_queue_epoll_thread.take() && let Err(e) = thread.join() @@ -265,47 +243,58 @@ impl Drop for Net { impl VirtioDevice for Net { fn device_type(&self) -> u32 { - self.common.device_type + self.vu_common.virtio_common.device_type } fn queue_max_sizes(&self) -> &[u16] { - &self.common.queue_sizes + &self.vu_common.virtio_common.queue_sizes } fn features(&self) -> u64 { - let mut features = self.common.avail_features; - if self.iommu { - features |= 1u64 << VIRTIO_F_IOMMU_PLATFORM; + let mut features = self.vu_common.virtio_common.avail_features; + if self.access_platform_enabled { + features |= 1u64 << VIRTIO_F_ACCESS_PLATFORM; } features } fn ack_features(&mut self, value: u64) { - self.common.ack_features(value); + self.vu_common.virtio_common.ack_features(value); } fn read_config(&self, offset: u64, data: &mut [u8]) { self.read_config_from_slice(self.config.as_slice(), offset, data); } - fn activate( - &mut self, - mem: GuestMemoryAtomic, - interrupt_cb: Arc, - mut queues: Vec<(usize, Queue, EventFd)>, - ) -> ActivateResult { - self.common.activate(&queues, interrupt_cb.clone())?; + fn activate(&mut self, context: crate::device::ActivationContext) -> ActivateResult { + let crate::device::ActivationContext { + mem, + interrupt_cb, + mut queues, + .. 
+ } = context; + self.vu_common + .virtio_common + .activate(&queues, interrupt_cb.clone())?; self.guest_memory = Some(mem.clone()); let num_queues = queues.len(); - let event_idx = self.common.feature_acked(VIRTIO_RING_F_EVENT_IDX.into()); - if self.common.feature_acked(VIRTIO_NET_F_CTRL_VQ.into()) && !num_queues.is_multiple_of(2) { + let event_idx = self + .vu_common + .virtio_common + .feature_acked(VIRTIO_RING_F_EVENT_IDX.into()); + if self + .vu_common + .virtio_common + .feature_acked(VIRTIO_NET_F_CTRL_VQ.into()) + && !num_queues.is_multiple_of(2) + { let ctrl_queue_index = num_queues - 1; let (_, mut ctrl_queue, ctrl_queue_evt) = queues.remove(ctrl_queue_index); ctrl_queue.set_event_idx(event_idx); - let (kill_evt, pause_evt) = self.common.dup_eventfds(); + let (kill_evt, pause_evt) = self.vu_common.virtio_common.dup_eventfds(); let mut ctrl_handler = NetCtrlEpollHandler { mem: mem.clone(), @@ -319,12 +308,12 @@ impl VirtioDevice for Net { queue_index: ctrl_queue_index as u16, }; - let paused = self.common.paused.clone(); + let paused = self.vu_common.virtio_common.paused.clone(); // Let's update the barrier as we need 1 for the control queue // thread + 1 for the common vhost-user thread + 1 for the main // thread signalling the pause. - self.common.paused_sync = Some(Arc::new(Barrier::new(3))); - let paused_sync = self.common.paused_sync.clone(); + self.vu_common.virtio_common.paused_sync = Some(Arc::new(Barrier::new(3))); + let paused_sync = self.vu_common.virtio_common.paused_sync.clone(); let mut epoll_threads = Vec::new(); spawn_virtio_thread( @@ -342,11 +331,12 @@ impl VirtioDevice for Net { // The backend acknowledged features must not contain VIRTIO_NET_F_MAC // since we don't expect the backend to handle it. 
- let backend_acked_features = self.common.acked_features & !(1 << VIRTIO_NET_F_MAC); + let backend_acked_features = + self.vu_common.virtio_common.acked_features & !(1 << VIRTIO_NET_F_MAC); // Run a dedicated thread for handling potential reconnections with // the backend. - let (kill_evt, pause_evt) = self.common.dup_eventfds(); + let (kill_evt, pause_evt) = self.vu_common.virtio_common.dup_eventfds(); let mut handler = self.vu_common.activate( mem, @@ -358,8 +348,8 @@ impl VirtioDevice for Net { pause_evt, )?; - let paused = self.common.paused.clone(); - let paused_sync = self.common.paused_sync.clone(); + let paused = self.vu_common.virtio_common.paused.clone(); + let paused_sync = self.vu_common.virtio_common.paused_sync.clone(); let mut epoll_threads = Vec::new(); spawn_virtio_thread( @@ -370,25 +360,28 @@ impl VirtioDevice for Net { &self.exit_evt, move || handler.run(&paused, paused_sync.as_ref().unwrap()), )?; - self.epoll_thread = Some(epoll_threads.remove(0)); + self.vu_common.epoll_thread = Some(epoll_threads.remove(0)); Ok(()) } fn reset(&mut self) -> Option> { // We first must resume the virtio thread if it was paused. - if self.common.pause_evt.take().is_some() { - self.common.resume().ok()?; + if self.vu_common.virtio_common.pause_evt.take().is_some() { + self.vu_common.virtio_common.resume().ok()?; } if let Some(vu) = &self.vu_common.vu && let Err(e) = vu.lock().unwrap().reset_vhost_user() { - error!("Failed to reset vhost-user daemon: {e:?}"); + error!( + "Failed to reset vhost-user daemon for socket {}: {e:?}", + self.vu_common.socket_path + ); return None; } - if let Some(kill_evt) = self.common.kill_evt.take() { + if let Some(kill_evt) = self.vu_common.virtio_common.kill_evt.take() { // Ignore the result because there is nothing we can do about it. 
let _ = kill_evt.write(1); } @@ -396,7 +389,7 @@ impl VirtioDevice for Net { event!("virtio-device", "reset", "id", &self.id); // Return the interrupt - Some(self.common.interrupt_cb.take().unwrap()) + Some(self.vu_common.virtio_common.interrupt_cb.take().unwrap()) } fn shutdown(&mut self) { @@ -414,13 +407,13 @@ impl VirtioDevice for Net { impl Pausable for Net { fn pause(&mut self) -> result::Result<(), MigratableError> { self.vu_common.pause()?; - self.common.pause() + self.vu_common.virtio_common.pause() } fn resume(&mut self) -> result::Result<(), MigratableError> { - self.common.resume()?; + self.vu_common.virtio_common.resume()?; - if let Some(epoll_thread) = &self.epoll_thread { + if let Some(epoll_thread) = &self.vu_common.epoll_thread { epoll_thread.thread().unpark(); } @@ -438,7 +431,7 @@ impl Snapshottable for Net { } fn snapshot(&mut self) -> std::result::Result { - self.vu_common.snapshot(&self.state()) + self.vu_common.snapshot(&self.state()?) } } impl Transportable for Net {} @@ -461,7 +454,6 @@ impl Migratable for Net { } fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> { - self.vu_common - .complete_migration(self.common.kill_evt.take()) + self.vu_common.complete_migration() } } diff --git a/virtio-devices/src/vhost_user/vu_common_ctrl.rs b/virtio-devices/src/vhost_user/vu_common_ctrl.rs index 264635149c..5ad9425c5a 100644 --- a/virtio-devices/src/vhost_user/vu_common_ctrl.rs +++ b/virtio-devices/src/vhost_user/vu_common_ctrl.rs @@ -3,17 +3,19 @@ use std::ffi; use std::fs::File; -use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; -use std::os::unix::net::UnixListener; +use std::io::{Read, Write}; +use std::os::unix::io::{AsRawFd, FromRawFd, OwnedFd, RawFd}; +use std::os::unix::net::{UnixListener, UnixStream}; use std::sync::Arc; use std::sync::atomic::Ordering; use std::thread::sleep; use std::time::{Duration, Instant}; use log::{error, info}; -use vhost::vhost_kern::vhost_binding::{VHOST_F_LOG_ALL, VHOST_VRING_F_LOG}; 
+use vhost::vhost_kern::vhost_binding::VHOST_VRING_F_LOG; use vhost::vhost_user::message::{ - VhostUserHeaderFlag, VhostUserInflight, VhostUserProtocolFeatures, VhostUserVirtioFeatures, + VhostTransferStateDirection, VhostTransferStatePhase, VhostUserHeaderFlag, VhostUserInflight, + VhostUserProtocolFeatures, VhostUserVirtioFeatures, }; use vhost::vhost_user::{ Frontend, FrontendReqHandler, VhostUserFrontend, VhostUserFrontendReqHandler, @@ -21,13 +23,12 @@ use vhost::vhost_user::{ use vhost::{VhostBackend, VhostUserDirtyLogRegion, VhostUserMemoryRegionInfo, VringConfigData}; use virtio_queue::desc::RawDescriptor; use virtio_queue::{Queue, QueueT}; -use vm_memory::{ - Address, Error as MmapError, FileOffset, GuestAddress, GuestMemory, GuestMemoryRegion, -}; +use vm_memory::guest_memory::Error as MmapError; +use vm_memory::{Address, FileOffset, GuestAddress, GuestMemory, GuestMemoryRegion}; use vm_migration::protocol::MemoryRangeTable; use vmm_sys_util::eventfd::EventFd; -use super::{Error, Result}; +use super::{Error, Result, VhostUserState}; use crate::vhost_user::Inflight; use crate::{ GuestMemoryMmap, GuestRegionMmap, MmapRegion, VirtioInterrupt, VirtioInterruptType, @@ -55,6 +56,7 @@ pub struct VhostUserHandle { vu: Frontend, ready: bool, supports_migration: bool, + supports_device_state: bool, shm_log: Option>, acked_features: u64, vrings_info: Option>, @@ -67,7 +69,11 @@ impl VhostUserHandle { for region in mem.iter() { let (mmap_handle, mmap_offset) = match region.file_offset() { Some(_file_offset) => (_file_offset.file().as_raw_fd(), _file_offset.start()), - None => return Err(Error::VhostUserMemoryRegion(MmapError::NoMemoryRegion)), + None => { + return Err(Error::VhostUserMemoryRegion( + MmapError::InvalidGuestAddress(region.start_addr()), + )); + } }; let vhost_user_net_reg = VhostUserMemoryRegionInfo { @@ -147,7 +153,7 @@ impl VhostUserHandle { self.vu.set_hdr_flags(VhostUserHeaderFlag::NEED_REPLY); } - self.update_supports_migration(acked_features, 
acked_protocol_features.bits()); + self.update_supported_features(acked_features, acked_protocol_features.bits()); Ok((acked_features, acked_protocol_features.bits())) } @@ -161,7 +167,14 @@ impl VhostUserHandle { acked_features: u64, backend_req_handler: &Option>, inflight: Option<&mut Inflight>, + vring_bases: Option<&[u64]>, ) -> Result<()> { + if let Some(bases) = &vring_bases + && bases.len() != queues.len() + { + return Err(Error::VringBasesCountMismatch(bases.len(), queues.len())); + } + self.vu .set_features(acked_features) .map_err(Error::VhostUserSetFeatures)?; @@ -204,7 +217,7 @@ impl VhostUserHandle { } let mut vrings_info = Vec::new(); - for (queue_index, queue, queue_evt) in queues.iter() { + for (i, (queue_index, queue, queue_evt)) in queues.iter().enumerate() { let actual_size: usize = queue.size().into(); let config_data = VringConfigData { @@ -244,14 +257,16 @@ impl VhostUserHandle { self.vu .set_vring_addr(*queue_index, &config_data) .map_err(Error::VhostUserSetVringAddr)?; + let base = if let Some(bases) = vring_bases { + bases[i] as u16 + } else { + queue + .avail_idx(mem, Ordering::Acquire) + .map_err(Error::GetAvailableIndex)? + .0 + }; self.vu - .set_vring_base( - *queue_index, - queue - .avail_idx(mem, Ordering::Acquire) - .map_err(Error::GetAvailableIndex)? 
- .0, - ) + .set_vring_base(*queue_index, base) .map_err(Error::VhostUserSetVringBase)?; if let Some(eventfd) = @@ -331,7 +346,7 @@ impl VhostUserHandle { } } - self.update_supports_migration(acked_features, acked_protocol_features); + self.update_supported_features(acked_features, acked_protocol_features); Ok(()) } @@ -356,6 +371,7 @@ impl VhostUserHandle { acked_features, backend_req_handler, inflight, + None, ) } @@ -379,6 +395,7 @@ impl VhostUserHandle { vu: Frontend::from_stream(stream, num_queues), ready: false, supports_migration: false, + supports_device_state: false, shm_log: None, acked_features: 0, vrings_info: None, @@ -395,6 +412,7 @@ impl VhostUserHandle { vu: m, ready: false, supports_migration: false, + supports_device_state: false, shm_log: None, acked_features: 0, vrings_info: None, @@ -410,7 +428,9 @@ impl VhostUserHandle { } }; - error!("Failed connecting the backend after trying for 1 minute: {err:?}"); + error!( + "Failed connecting the backend after trying for 1 minute for socket {socket_path}: {err:?}" + ); Err(Error::VhostUserConnect) } } @@ -435,12 +455,102 @@ impl VhostUserHandle { Ok(()) } - fn update_supports_migration(&mut self, acked_features: u64, acked_protocol_features: u64) { - if (acked_features & u64::from(vhost::vhost_kern::vhost_binding::VHOST_F_LOG_ALL) != 0) - && (acked_protocol_features & VhostUserProtocolFeatures::LOG_SHMFD.bits() != 0) + fn update_supported_features(&mut self, acked_features: u64, acked_protocol_features: u64) { + self.supports_migration = acked_features & VhostUserVirtioFeatures::LOG_ALL.bits() != 0 + && acked_protocol_features & VhostUserProtocolFeatures::LOG_SHMFD.bits() != 0; + self.supports_device_state = + acked_protocol_features & VhostUserProtocolFeatures::DEVICE_STATE.bits() != 0; + } + + pub fn supports_device_state(&self) -> bool { + self.supports_device_state + } + + /// Save backend device state via the SET_DEVICE_STATE_FD protocol. 
+ /// Returns the opaque state blob and per-queue vring base indices. + pub fn save_backend_state(&mut self) -> Result<(Vec, Vec)> { + // GET_VRING_BASE for each queue to stop the backend and capture indices + let mut vring_bases = Vec::new(); + for queue_index in &self.queue_indexes { + let base = self + .vu + .get_vring_base(*queue_index) + .map_err(Error::VhostUserGetVringBase)?; + vring_bases.push(base as u64); + } + + // The backend considers the vrings stopped after GET_VRING_BASE. + self.ready = false; + + let (local, remote) = UnixStream::pair().map_err(Error::SaveRestoreBackendState)?; + + let mut read_file: File = match self + .vu + .set_device_state_fd( + VhostTransferStateDirection::SAVE, + VhostTransferStatePhase::STOPPED, + remote.into(), + ) + .map_err(Error::VhostUserSetDeviceStateFd)? + { + Some(file) => file, + None => OwnedFd::from(local).into(), + }; + + // Read all state from the socket + let mut state = Vec::new(); + read_file + .read_to_end(&mut state) + .map_err(Error::SaveRestoreBackendState)?; + + // Verify the transfer succeeded + self.vu + .check_device_state() + .map_err(Error::VhostUserCheckDeviceState)?; + + Ok((state, vring_bases)) + } + + pub fn restore_state(&mut self, state: &VhostUserState) -> Result<()> { + state.validate()?; + if let Some(backend_state) = &state.backend_state { + self.restore_backend_state(backend_state)?; + } + Ok(()) + } + + /// Restore backend device state via the SET_DEVICE_STATE_FD protocol. + /// Sends the saved opaque state blob to the backend via a socket. 
+ pub fn restore_backend_state(&mut self, state: &[u8]) -> Result<()> { + let (local, remote) = UnixStream::pair().map_err(Error::SaveRestoreBackendState)?; + + // Explicit scope to close the write end and signal EOF to the backend { - self.supports_migration = true; + let mut write_file: File = match self + .vu + .set_device_state_fd( + VhostTransferStateDirection::LOAD, + VhostTransferStatePhase::STOPPED, + remote.into(), + ) + .map_err(Error::VhostUserSetDeviceStateFd)? + { + Some(file) => file, + None => OwnedFd::from(local).into(), + }; + + // Write the saved state to the socket + write_file + .write_all(state) + .map_err(Error::SaveRestoreBackendState)?; } + + // Verify the transfer succeeded + self.vu + .check_device_state() + .map_err(Error::VhostUserCheckDeviceState)?; + + Ok(()) } fn update_log_base(&mut self, last_ram_addr: u64) -> Result>> { @@ -532,7 +642,7 @@ impl VhostUserHandle { self.update_log_base(last_ram_addr)?; // Enable VHOST_F_LOG_ALL feature - let features = self.acked_features | (1 << VHOST_F_LOG_ALL); + let features = self.acked_features | VhostUserVirtioFeatures::LOG_ALL.bits(); self.vu .set_features(features) .map_err(Error::VhostUserSetFeatures)?; @@ -562,6 +672,10 @@ impl VhostUserHandle { } pub fn dirty_log(&mut self, last_ram_addr: u64) -> Result { + if !self.supports_migration { + return Err(Error::MigrationNotSupported); + } + // The log region is updated by creating a new region that is sent to // the backend. This ensures the backend stops logging to the previous // region. 
The previous region is returned and processed to create the diff --git a/virtio-devices/src/vsock/device.rs b/virtio-devices/src/vsock/device.rs index 27a0af1ff2..c20288c2dc 100644 --- a/virtio-devices/src/vsock/device.rs +++ b/virtio-devices/src/vsock/device.rs @@ -50,7 +50,7 @@ use crate::seccomp_filters::Thread; use crate::thread_helper::spawn_virtio_thread; use crate::{ ActivateResult, EPOLL_HELPER_EVENT_LAST, EpollHelper, EpollHelperError, EpollHelperHandler, - Error as DeviceError, GuestMemoryMmap, VIRTIO_F_IN_ORDER, VIRTIO_F_IOMMU_PLATFORM, + Error as DeviceError, GuestMemoryMmap, VIRTIO_F_ACCESS_PLATFORM, VIRTIO_F_IN_ORDER, VIRTIO_F_VERSION_1, VirtioCommon, VirtioDevice, VirtioDeviceType, VirtioInterrupt, VirtioInterruptType, }; @@ -323,6 +323,8 @@ pub struct Vsock { pub struct VsockState { pub avail_features: u64, pub acked_features: u64, + #[serde(default)] + pub connections: Vec<(u32, u32)>, } impl Vsock @@ -336,20 +338,23 @@ where id: String, cid: u32, path: PathBuf, - backend: B, - iommu: bool, + mut backend: B, + access_platform_enabled: bool, seccomp_action: SeccompAction, exit_evt: EventFd, state: Option, ) -> io::Result> { let (avail_features, acked_features, paused) = if let Some(state) = state { info!("Restoring virtio-vsock {id}"); + // Instead of letting the guest connection hang/timeout, proactively let + // the guest know the connection is gone. 
+ backend.queue_rst_for_connections(state.connections.clone()); (state.avail_features, state.acked_features, true) } else { let mut avail_features = (1u64 << VIRTIO_F_VERSION_1) | (1u64 << VIRTIO_F_IN_ORDER); - if iommu { - avail_features |= 1u64 << VIRTIO_F_IOMMU_PLATFORM; + if access_platform_enabled { + avail_features |= 1u64 << VIRTIO_F_ACCESS_PLATFORM; } (avail_features, 0, false) }; @@ -378,6 +383,7 @@ where VsockState { avail_features: self.common.avail_features, acked_features: self.common.acked_features, + connections: self.backend.read().unwrap().connections(), } } @@ -435,12 +441,13 @@ where } } - fn activate( - &mut self, - mem: GuestMemoryAtomic, - interrupt_cb: Arc, - queues: Vec<(usize, Queue, EventFd)>, - ) -> ActivateResult { + fn activate(&mut self, context: crate::device::ActivationContext) -> ActivateResult { + let crate::device::ActivationContext { + mem, + interrupt_cb, + queues, + .. + } = context; self.common.activate(&queues, interrupt_cb.clone())?; let (kill_evt, pause_evt) = self.common.dup_eventfds(); @@ -459,7 +466,7 @@ where pause_evt, interrupt_cb, backend: self.backend.clone(), - access_platform: self.common.access_platform.clone(), + access_platform: self.common.access_platform(), }; let paused = self.common.paused.clone(); @@ -494,6 +501,10 @@ where fn set_access_platform(&mut self, access_platform: Arc) { self.common.set_access_platform(access_platform); } + + fn access_platform(&self) -> Option> { + self.common.access_platform() + } } impl Pausable for Vsock @@ -593,9 +604,12 @@ mod unit_tests { let memory = GuestMemoryAtomic::new(ctx.mem.clone()); // Test a bad activation. 
- let bad_activate = - ctx.device - .activate(memory.clone(), Arc::new(NoopVirtioInterrupt {}), Vec::new()); + let bad_activate = ctx.device.activate(crate::device::ActivationContext { + mem: memory.clone(), + interrupt_cb: Arc::new(NoopVirtioInterrupt {}), + queues: Vec::new(), + device_status: Arc::new(std::sync::atomic::AtomicU8::new(0)), + }); match bad_activate { Err(ActivateError::BadActivate) => (), other => panic!("{other:?}"), @@ -603,10 +617,10 @@ mod unit_tests { // Test a correct activation. ctx.device - .activate( - memory, - Arc::new(NoopVirtioInterrupt {}), - vec![ + .activate(crate::device::ActivationContext { + mem: memory, + interrupt_cb: Arc::new(NoopVirtioInterrupt {}), + queues: vec![ ( 0, Queue::new(256).unwrap(), @@ -623,7 +637,8 @@ mod unit_tests { EventFd::new(EFD_NONBLOCK).unwrap(), ), ], - ) + device_status: Arc::new(std::sync::atomic::AtomicU8::new(0)), + }) .unwrap(); } diff --git a/virtio-devices/src/vsock/mod.rs b/virtio-devices/src/vsock/mod.rs index 7895587855..cc1ef1ad23 100644 --- a/virtio-devices/src/vsock/mod.rs +++ b/virtio-devices/src/vsock/mod.rs @@ -158,7 +158,12 @@ pub trait VsockChannel { /// It that needs to be sendable through a mpsc channel (the latter due to how `vmm::EpollContext` works). /// Currently, the only implementation we have is `crate::virtio::unix::muxer::VsockMuxer`, which /// translates guest-side vsock connections to host-side Unix domain socket connections. 
-pub trait VsockBackend: VsockChannel + VsockEpollListener + Send {} +pub trait VsockBackend: VsockChannel + VsockEpollListener + Send { + fn connections(&self) -> Vec<(u32, u32)> { + Vec::new() + } + fn queue_rst_for_connections(&mut self, _conns: Vec<(u32, u32)>) {} +} #[cfg(any(test, fuzzing))] pub mod unit_tests { @@ -188,6 +193,15 @@ pub mod unit_tests { ) -> std::result::Result<(), std::io::Error> { Ok(()) } + + fn set_notifier( + &self, + _interrupt: u32, + _eventfd: Option, + _vm: &dyn hypervisor::Vm, + ) -> std::io::Result<()> { + unimplemented!() + } } pub struct TestBackend { diff --git a/virtio-devices/src/vsock/packet.rs b/virtio-devices/src/vsock/packet.rs index 57218a5b87..e6b4c5afbb 100644 --- a/virtio-devices/src/vsock/packet.rs +++ b/virtio-devices/src/vsock/packet.rs @@ -142,7 +142,8 @@ impl VsockPacket { let guest_hdr_addr = head .addr() - .translate_gva(access_platform, VSOCK_PKT_HDR_SIZE); + .translate_gva(access_platform, VSOCK_PKT_HDR_SIZE) + .map_err(|_| VsockError::GuestMemory)?; // To avoid TOCTOU issues when reading/writing the VSock packet header in guest memory, // we need to copy the content of the header in the VMM's memory. @@ -178,8 +179,9 @@ impl VsockPacket { desc_chain.memory(), head.addr() .checked_add(VSOCK_PKT_HDR_SIZE as u64) - .unwrap() - .translate_gva(access_platform, buf_size), + .ok_or(VsockError::GuestMemory)? 
+ .translate_gva(access_platform, buf_size) + .map_err(|_| VsockError::GuestMemory)?, buf_size, ) .ok_or(VsockError::GuestMemory)?; @@ -214,7 +216,10 @@ impl VsockPacket { let desc_len = desc.len() as usize; if desc_len > 0 && offset < total_len { let to_copy = std::cmp::min(desc_len, total_len - offset); - let desc_addr = desc.addr().translate_gva(access_platform, desc_len); + let desc_addr = desc + .addr() + .translate_gva(access_platform, desc_len) + .map_err(|_| VsockError::GuestMemory)?; desc_chain .memory() .read_slice(&mut owned[offset..offset + to_copy], desc_addr) @@ -242,7 +247,10 @@ impl VsockPacket { let buf_size = buf_desc.len() as usize; let buf_ptr = get_host_address_range( desc_chain.memory(), - buf_desc.addr().translate_gva(access_platform, buf_size), + buf_desc + .addr() + .translate_gva(access_platform, buf_size) + .map_err(|_| VsockError::GuestMemory)?, buf_size, ) .ok_or(VsockError::GuestMemory)?; @@ -283,7 +291,8 @@ impl VsockPacket { let guest_hdr_addr = head .addr() - .translate_gva(access_platform, VSOCK_PKT_HDR_SIZE); + .translate_gva(access_platform, VSOCK_PKT_HDR_SIZE) + .map_err(|_| VsockError::GuestMemory)?; // To avoid TOCTOU issues when reading/writing the VSock packet header in guest memory, // we need to copy the content of the header in the VMM's memory. @@ -313,7 +322,10 @@ impl VsockPacket { buf: Some(PacketBuffer::Borrowed { ptr: get_host_address_range( desc_chain.memory(), - buf_desc.addr().translate_gva(access_platform, buf_size), + buf_desc + .addr() + .translate_gva(access_platform, buf_size) + .map_err(|_| VsockError::GuestMemory)?, buf_size, ) .ok_or(VsockError::GuestMemory)?, @@ -330,8 +342,9 @@ impl VsockPacket { desc_chain.memory(), head.addr() .checked_add(VSOCK_PKT_HDR_SIZE as u64) - .unwrap() - .translate_gva(access_platform, buf_size), + .ok_or(VsockError::GuestMemory)? 
+ .translate_gva(access_platform, buf_size) + .map_err(|_| VsockError::GuestMemory)?, buf_size, ) .ok_or(VsockError::GuestMemory)?, diff --git a/virtio-devices/src/vsock/unix/muxer.rs b/virtio-devices/src/vsock/unix/muxer.rs index edce5b1e03..499c68eac1 100644 --- a/virtio-devices/src/vsock/unix/muxer.rs +++ b/virtio-devices/src/vsock/unix/muxer.rs @@ -345,7 +345,23 @@ impl VsockEpollListener for VsockMuxer { } } -impl VsockBackend for VsockMuxer {} +impl VsockBackend for VsockMuxer { + fn connections(&self) -> Vec<(u32, u32)> { + self.conn_map + .keys() + .map(|k| (k.local_port, k.peer_port)) + .collect() + } + + fn queue_rst_for_connections(&mut self, conns: Vec<(u32, u32)>) { + for (local_port, peer_port) in conns { + self.rxq.push(MuxerRx::RstPkt { + local_port, + peer_port, + }); + } + } +} impl VsockMuxer { /// Muxer constructor. @@ -409,7 +425,7 @@ impl VsockMuxer { // If we're already maxed-out on connections, we'll just accept and // immediately discard this potentially new one. warn!("vsock: connection limit reached; refusing new host connection"); - self.host_sock.accept().map(|_| 0).unwrap_or(0); + let _ = self.host_sock.accept(); return; } self.host_sock diff --git a/virtio-devices/src/watchdog.rs b/virtio-devices/src/watchdog.rs index 6b9f7cc0ac..742a2e0241 100644 --- a/virtio-devices/src/watchdog.rs +++ b/virtio-devices/src/watchdog.rs @@ -326,12 +326,13 @@ impl VirtioDevice for Watchdog { self.common.ack_features(value); } - fn activate( - &mut self, - mem: GuestMemoryAtomic, - interrupt_cb: Arc, - mut queues: Vec<(usize, Queue, EventFd)>, - ) -> ActivateResult { + fn activate(&mut self, context: crate::device::ActivationContext) -> ActivateResult { + let crate::device::ActivationContext { + mem, + interrupt_cb, + mut queues, + .. 
+ } = context; self.common.activate(&queues, interrupt_cb.clone())?; let (kill_evt, pause_evt) = self.common.dup_eventfds(); diff --git a/vm-allocator/Cargo.toml b/vm-allocator/Cargo.toml index a4996d6dc3..3826479313 100644 --- a/vm-allocator/Cargo.toml +++ b/vm-allocator/Cargo.toml @@ -2,6 +2,7 @@ authors = ["The Chromium OS Authors"] edition.workspace = true name = "vm-allocator" +rust-version.workspace = true version = "0.1.0" [features] diff --git a/vm-device/Cargo.toml b/vm-device/Cargo.toml index a57ea57f5b..358ffc7435 100644 --- a/vm-device/Cargo.toml +++ b/vm-device/Cargo.toml @@ -2,6 +2,7 @@ authors = ["The Cloud Hypervisor Authors"] edition.workspace = true name = "vm-device" +rust-version.workspace = true version = "0.1.0" [features] diff --git a/vm-device/src/bus.rs b/vm-device/src/bus.rs index 2897ac303e..eacca24987 100644 --- a/vm-device/src/bus.rs +++ b/vm-device/src/bus.rs @@ -147,6 +147,9 @@ impl Bus { None } + /// Inserts a bus device into the bus. + /// + /// The bus will only hold a weak reference to the object. #[allow(clippy::needless_pass_by_value)] pub fn insert(&self, device: Arc, base: u64, len: u64) -> Result<()> { if len == 0 { diff --git a/vm-device/src/interrupt/mod.rs b/vm-device/src/interrupt/mod.rs index 342cbe0631..e9b0180d29 100644 --- a/vm-device/src/interrupt/mod.rs +++ b/vm-device/src/interrupt/mod.rs @@ -57,7 +57,8 @@ //! * The virtual device backend requests the interrupt manager to create an interrupt group //! according to guest configuration information -use std::sync::Arc; +use std::io::{Error, ErrorKind}; +use std::sync::{Arc, Mutex}; pub use hypervisor::{InterruptSourceConfig, LegacyIrqSourceConfig, MsiIrqSourceConfig}; use vmm_sys_util::eventfd::EventFd; @@ -107,6 +108,30 @@ pub trait InterruptManager: Send + Sync { /// * count: number of Interrupt Sources to be managed by the group object. 
fn create_group(&self, config: Self::GroupConfig) -> Result>; + /// Create an [InterruptSourceGroup](trait.InterruptSourceGroup.html) object to manage + /// interrupt sources for a virtual device + /// + /// An [InterruptSourceGroup](trait.InterruptSourceGroup.html) object manages all interrupt + /// sources of the same type for a virtual device. + /// + /// This is the same as [`Self::create_group`], except that the returned + /// [`InterruptSourceGroup`] allows setting the irqfd used as notifier via + /// [`InterruptSourceGroup::set_notifier`]. + /// + /// # Arguments + /// * interrupt_type: type of interrupt source. + /// * base: base Interrupt Source ID to be managed by the group object. + /// * count: number of Interrupt Sources to be managed by the group object. + fn create_group_mut( + &self, + _config: Self::GroupConfig, + ) -> Result>> { + Err(Error::new( + ErrorKind::Unsupported, + "setting notifiers not supported", + )) + } + /// Destroy an [InterruptSourceGroup](trait.InterruptSourceGroup.html) object created by /// [create_group()](trait.InterruptManager.html#tymethod.create_group). /// @@ -137,7 +162,7 @@ pub trait InterruptSourceGroup: Send + Sync { /// Returns an interrupt notifier from this interrupt. /// /// An interrupt notifier allows for external components and processes - /// to inject interrupts into a guest, by writing to the file returned + /// to inject interrupts into a guest, by writing to the [`EventFd`] returned /// by this method. #[allow(unused_variables)] fn notifier(&self, index: InterruptIndex) -> Option; @@ -159,4 +184,17 @@ pub trait InterruptSourceGroup: Send + Sync { /// Set the interrupt group GSI routing table. fn set_gsi(&self) -> Result<()>; + + /// Sets the [`EventFd`] used to trigger interrupts. 
+ fn set_notifier( + &mut self, + _index: InterruptIndex, + _eventfd: Option, + _vm: &dyn hypervisor::Vm, + ) -> Result<()> { + Err(Error::new( + ErrorKind::Unsupported, + "setting notifiers not supported", + )) + } } diff --git a/vm-migration/Cargo.toml b/vm-migration/Cargo.toml index b17475065c..66b4e4f6a9 100644 --- a/vm-migration/Cargo.toml +++ b/vm-migration/Cargo.toml @@ -2,6 +2,7 @@ authors = ["The Cloud Hypervisor Authors"] edition.workspace = true name = "vm-migration" +rust-version.workspace = true version = "0.1.0" [dependencies] diff --git a/vm-migration/src/context.rs b/vm-migration/src/context.rs new file mode 100644 index 0000000000..21801c0290 --- /dev/null +++ b/vm-migration/src/context.rs @@ -0,0 +1,693 @@ +// Copyright © 2026 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 +// + +//! Module for context and metrics of migrations. +//! +//! Main exports: +//! - [`OngoingMigrationContext`] +//! - [`CompletedMigrationContext`] +//! - [`MemoryMigrationContext`] + +use std::fmt; +use std::fmt::{Display, Formatter}; +use std::time::{Duration, Instant}; + +use thiserror::Error; + +use crate::protocol::MemoryRangeTable; + +/// Metrics of the VM downtime during a migration. +/// +/// By downtime, we mean the time between the VM pause() and the corresponding +/// resume() on the destination. This downtime covers the time when the vCPUs +/// didn't execute a single instruction. The network downtime might be longer +/// and is not covered by this type. +/// +/// This metric is only relevant for the migration of running VMs. +#[derive(Debug, PartialEq)] +pub struct DowntimeContext { + /// The effective downtime Cloud Hypervisor observed (from the migration sender). + /// + /// This is roughly the sum of all the other durations. + pub effective_downtime: Duration, + /// The time of the final memory iteration. + pub final_memory_iteration_dur: Duration, + /// The time needed to aggregate the final VM state (i.e., snapshotting it). 
+ pub state_dur: Duration, + /// The time needed to send the final VM state including deserializing it on + /// the destination + pub send_state_dur: Duration, + /// The time of the completion request. This includes resuming the VM (if it + /// was running before the migration). + pub complete_dur: Duration, +} + +impl Display for DowntimeContext { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!( + f, + // Caution: This format is specifically crafted for the VMM log + "{}ms (final_iter:{}ms state:{}ms send_state:{}ms complete:{}ms)", + self.effective_downtime.as_millis(), + self.final_memory_iteration_dur.as_millis(), + self.state_dur.as_millis(), + self.send_state_dur.as_millis(), + self.complete_dur.as_millis() + ) + } +} + +/// The internal metrics of a completed migration. +/// +/// The properties of this type help to investigate timings of the migration, +/// with specific focus on the VM downtime. +/// +/// This type is static once it was created and should not change. +#[derive(Debug, PartialEq)] +pub struct CompletedMigrationContext { + /// Total duration of the migration. + pub migration_dur: Duration, + pub downtime_ctx: DowntimeContext, + /// The finalized context of the memory migration. + pub memory_ctx: MemoryMigrationContext, +} + +impl CompletedMigrationContext { + fn new( + migration_dur: Duration, + effective_downtime: Duration, + state_dur: Duration, + send_state_dur: Duration, + complete_dur: Duration, + memory_ctx: MemoryMigrationContext, + ) -> Self { + Self { + migration_dur, + downtime_ctx: DowntimeContext { + effective_downtime, + final_memory_iteration_dur: memory_ctx.iteration_duration.unwrap_or_default(), + state_dur, + send_state_dur, + complete_dur, + }, + memory_ctx, + } + } +} + +/// Error returned when the migration context is advanced in an invalid order. +#[derive(Clone, Copy, Debug, Eq, Error, PartialEq)] +pub enum MigrationContextError { + /// The memory migration context was not finalized before transition. 
+ #[error("memory migration context should be finalized before pausing the VM")] + MemoryContextNotFinalized, + /// The transition to `VmPaused` was attempted from an invalid state. + #[error("memory migration should only advance from the Begin state")] + InvalidVmPausedTransition, + /// Finalization was attempted before memory migration completed. + #[error("migration should only finalize after memory migration completed")] + InvalidFinalizeTransition, +} + +/// Holds context and metrics about the current ongoing migration. +/// +/// This is a state-machine to properly reflect the intermediate states and +/// their properties. This machine does not have a `Completed` variant in favor +/// of [`CompletedMigrationContext`], which is easier to work with. +#[derive(Debug, PartialEq)] +pub enum OngoingMigrationContext { + /// Migration started. + Begin { + /// Begin of the migration. + migration_begin: Instant, + }, + /// VM memory fully transferred to the destination and the VM is paused. + VmPaused { + /// Begin of the migration. + migration_begin: Instant, + /// Downtime begin of the migration. + downtime_begin: Instant, + /// The finalized context of the memory migration. + finalized_memory_ctx: MemoryMigrationContext, + }, +} + +impl OngoingMigrationContext { + /// Creates a new context. + pub fn new() -> Self { + Self::Begin { + migration_begin: Instant::now(), + } + } + + /// Marks the memory migration as completed and records when downtime + /// started. The VM is now in paused state. 
+ pub fn set_vm_paused( + &mut self, + downtime_begin: Instant, + finalized_memory_ctx: MemoryMigrationContext, + ) -> Result<(), MigrationContextError> { + if finalized_memory_ctx.migration_duration.is_none() { + return Err(MigrationContextError::MemoryContextNotFinalized); + } + let migration_begin = match self { + Self::Begin { migration_begin } => *migration_begin, + _ => return Err(MigrationContextError::InvalidVmPausedTransition), + }; + *self = Self::VmPaused { + migration_begin, + downtime_begin, + finalized_memory_ctx, + }; + Ok(()) + } + + /// Finalizes the metrics and returns a [`CompletedMigrationContext`]. + /// + /// This should be called right after the completed migration was + /// acknowledged by the receiver. From now on, the metrics are considered + /// finalized and should not be modified. They can be stored for further + /// analysis. + /// + /// # Arguments + /// - `state_dur`: The time needed to aggregate the final VM state (i.e., + /// snapshotting it). + /// - `send_state_dur`: The time needed to send the final VM state + /// including deserializing it on the destination. + /// - `complete_dur`: The time of the completion request. This includes + /// resuming the VM (if it was running before the migration). + pub fn finalize( + self, + state_dur: Duration, + send_state_dur: Duration, + complete_dur: Duration, + ) -> Result { + let (migration_begin, downtime_begin, finalized_memory_ctx) = match self { + Self::VmPaused { + migration_begin, + downtime_begin, + finalized_memory_ctx, + } => (migration_begin, downtime_begin, finalized_memory_ctx), + _ => return Err(MigrationContextError::InvalidFinalizeTransition), + }; + + Ok(CompletedMigrationContext::new( + migration_begin.elapsed(), + downtime_begin.elapsed(), + state_dur, + send_state_dur, + complete_dur, + finalized_memory_ctx, + )) + } +} + +impl Default for OngoingMigrationContext { + fn default() -> Self { + Self::new() + } +} + +/// Internal metrics for the precopy migration phase. 
+/// +/// The context aggregates runtime statistics such as iteration count, +/// transferred bytes, durations, bandwidth, and estimated downtime. +/// These metrics allow the migration logic to make decisions based on +/// observed runtime behavior, for example terminating further iterations +/// once the expected downtime falls below a configured threshold. +/// +/// The structure is updated both between iterations and during an +/// iteration so that it always reflects the most recent state. +#[derive(Debug, PartialEq)] +pub struct MemoryMigrationContext { + /// Current iteration: 0 initial total transmission, >0 delta transmission. + pub iteration: usize, + /// Total bytes sent across all iterations. + total_sent_bytes: u64, + /// Total bytes to send in the current iteration. + pub current_iteration_total_bytes: u64, + /// The currently measured bandwidth. + /// + /// This is updated (at least) after each completed iteration. + bandwidth_bytes_per_second: f64, + /// Calculated downtime in milliseconds regarding the current bandwidth and + /// the remaining memory. + /// + /// This is only `None` for iteration 0. + /// + /// Please note that this ignores any additional migration overhead and + /// only looks at the memory transfer itself. + pub estimated_downtime: Option, + /// Begin of the memory migration. + pub migration_begin: Instant, + /// Duration of the memory migration. + /// + /// This is only `None` until the last iteration is finished. + migration_duration: Option, + /// Begin of the current iteration. + iteration_begin: Instant, + /// Duration of the current iteration. + /// + /// This includes the transmission, all logging, and update of any metrics. + /// + /// This is only `None` for iteration 0. + pub iteration_duration: Option, + /// Begin of the current transfer. + transfer_begin: Instant, + /// Duration of the current transfer. + /// + /// This is only `None` for iteration 0. 
+ pub transfer_duration: Option, +} + +impl MemoryMigrationContext { + /// Creates a new context. + /// + /// Please note that you should create this struct right before the precopy + /// memory migration starts, as the field `migration_begin` is set to + /// [`Instant::now`]. + pub fn new() -> Self { + Self { + iteration: 0, + total_sent_bytes: 0, + current_iteration_total_bytes: 0, + bandwidth_bytes_per_second: 0.0, + estimated_downtime: None, + migration_begin: Instant::now(), + migration_duration: None, + // Will be updated soon -> so this value is never read + iteration_begin: Instant::now(), + iteration_duration: None, + // Will be updated soon -> so this value is never read + transfer_begin: Instant::now(), + transfer_duration: None, + } + } + + /// Returns an empty finalized block. + /// + /// This can be used if no memory was transferred (e.g., local migration). + pub fn empty_finalized() -> Self { + let mut this = Self::new(); + this.finalize(); + this + } + + /// Updates the metrics right before the transfer over the wire. + /// + /// Supposed to be called once per precopy memory iteration. + /// + /// This helps to feed the "is converged?" with fresh metrics to + /// potentially stop the precopy phase. + pub fn update_metrics_before_transfer( + &mut self, + iteration_begin: Instant, + iteration_table: &MemoryRangeTable, + ) { + self.iteration_begin = iteration_begin; + self.current_iteration_total_bytes = iteration_table.effective_size(); + self.estimated_downtime = if self.current_iteration_total_bytes == 0 { + Some(Duration::ZERO) + } else if self.bandwidth_bytes_per_second == 0.0 { + // Only happens on the very first iteration + None + } else { + let calculated_downtime_s = + self.current_iteration_total_bytes as f64 / (self.bandwidth_bytes_per_second); + Some(Duration::from_secs_f64(calculated_downtime_s)) + } + } + + /// Updates the metrics right after the transfer over the wire. + /// + /// Supposed to be called once per precopy memory iteration. 
+ /// + /// This updates the bandwidth and ensures that + /// [`Self::update_metrics_before_transfer`] operates on fresh metrics on + /// the new iteration. + /// + /// # Panics + /// + /// If the transfer duration is longer than the iteration duration, this + /// function panics. This can never happen with real-world data but in + /// artificial unit test scenarios. + pub fn update_metrics_after_transfer( + &mut self, + transfer_begin: Instant, + transfer_duration: Duration, + ) { + self.transfer_begin = transfer_begin; + self.transfer_duration = Some(transfer_duration); + self.total_sent_bytes += self.current_iteration_total_bytes; + self.bandwidth_bytes_per_second = + Self::calculate_bandwidth(self.current_iteration_total_bytes, transfer_duration); + + // We might have a few operations after that before the loop starts + // (e.g., logging) again, but practically, this is negligible for this + // metric. + self.iteration_duration = Some(self.iteration_begin.elapsed()); + + // Catch programming errors: + // unwrap is fine as both values are set by now + assert!( + self.iteration_duration.unwrap() >= self.transfer_duration.unwrap(), + "iteration_duration must be larger than transfer_duration: {}ms < {}ms", + self.iteration_duration.unwrap().as_millis(), + self.transfer_duration.unwrap().as_millis(), + ); + } + + /// Finalizes the metrics. + /// + /// From now on, the metrics are considered finalized and should not be + /// modified. They can be stored for further analysis. + #[inline] + pub fn finalize(&mut self) { + // Any overhead from the function call is negligible. + self.migration_duration = Some(self.migration_begin.elapsed()); + } + + /// Returns the average bandwidth over the whole duration of the migration. + #[inline] + pub fn average_bandwidth(&self) -> f64 { + Self::calculate_bandwidth(self.total_sent_bytes, self.migration_begin.elapsed()) + } + + /// Calculates the bandwidth in bytes per second. 
+ /// + /// Returns `0.0` if the duration is zero to avoid division by zero. + #[inline] + fn calculate_bandwidth(bytes: u64, duration: Duration) -> f64 { + if duration == Duration::ZERO { + 0.0 + } else { + bytes as f64 / duration.as_secs_f64() + } + } + + /// Calculates the overhead of an iteration. + /// + /// This is the additional time next to the transfer time and includes + /// fetching and parsing the dirty log, for example. + fn iteration_overhead(&self) -> Duration { + self.iteration_duration + .and_then(|iter| { + self.transfer_duration.map(|tr| { + // This is guaranteed by update_metrics_after_transfer() + assert!(iter >= tr); + iter - tr + }) + }) + .unwrap_or_default() + } +} + +impl Default for MemoryMigrationContext { + fn default() -> Self { + Self::new() + } +} + +// The display format must be a compact one-liner to enable concise log messages per iteration. +impl Display for MemoryMigrationContext { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let curr_mib = self.current_iteration_total_bytes.div_ceil(1024 * 1024); + let total_mib = self.total_sent_bytes.div_ceil(1024 * 1024); + + // Current bandwidth in MiB/s + let curr_bw_mib_s = self.bandwidth_bytes_per_second / 1024.0 / 1024.0; + + // Time elapsed since memory migration start. + let elapsed = self + .migration_duration + .unwrap_or_else(|| Instant::now() - self.migration_begin) + .as_secs_f64(); + + // Internally, this again evaluates `self.migration_begin.elapsed()` + // but this is negligible. 
+ let avg_bw_mib_s = self.average_bandwidth() / 1024.0 / 1024.0; + + // Transfer duration and iteration overhead + let transfer_s = self.transfer_duration.map_or(0.0, |d| d.as_secs_f64()); + let iteration_overhead_ms = self.iteration_overhead().as_millis(); + + let est_downtime_ms = self.estimated_downtime.map_or(0, |d| d.as_millis()); + + write!( + f, + "iter={} \ + curr={curr_mib}MiB \ + total={total_mib}MiB \ + bw={curr_bw_mib_s:.2}MiB/s \ + transfer={transfer_s:.2}s \ + overhead={iteration_overhead_ms}ms \ + est_downtime={est_downtime_ms}ms \ + elapsed={elapsed:.2}s \ + avg_bw={avg_bw_mib_s:.2}MiB/s", + self.iteration, + ) + } +} + +#[cfg(test)] +mod unit_tests { + use super::*; + + /// Tests for [`CompletedMigrationContext`] and [`OngoingMigrationContext`]. + mod migration_ctx_tests { + use super::*; + + #[test] + fn memory_migrated_and_vm_paused_records_transition() { + let mut ctx = OngoingMigrationContext::new(); + let downtime_begin = Instant::now(); + + let mut memory_ctx = MemoryMigrationContext::new(); + memory_ctx.finalize(); + + ctx.set_vm_paused(downtime_begin, memory_ctx) + .expect("migration context should transition to VmPaused after memory migration"); + + assert!(matches!( + ctx, + OngoingMigrationContext::VmPaused { + downtime_begin: recorded_downtime_begin, + .. 
+ } if recorded_downtime_begin == downtime_begin + )); + } + + #[test] + fn finalize_returns_completed_context() { + let mut ctx = OngoingMigrationContext::new(); + let downtime_begin = Instant::now() - Duration::from_millis(10); + + let mut memory_ctx = MemoryMigrationContext::new(); + memory_ctx.finalize(); + + ctx.set_vm_paused(downtime_begin, memory_ctx) + .expect("migration context should transition to VmPaused after memory migration"); + + let completed = ctx + .finalize( + Duration::from_millis(1), + Duration::from_millis(2), + Duration::from_millis(3), + ) + .expect("migration context should finalize after memory migration completed"); + + assert_eq!(completed.downtime_ctx.state_dur, Duration::from_millis(1)); + assert_eq!( + completed.downtime_ctx.send_state_dur, + Duration::from_millis(2) + ); + assert_eq!( + completed.downtime_ctx.complete_dur, + Duration::from_millis(3) + ); + assert!(completed.downtime_ctx.effective_downtime >= Duration::from_millis(10)); + assert!(completed.migration_dur > Duration::ZERO); + assert!(completed.memory_ctx.migration_duration.is_some()); + } + + #[test] + fn finalize_errors_before_memory_migration_completed() { + let err = OngoingMigrationContext::new() + .finalize(Duration::ZERO, Duration::ZERO, Duration::ZERO) + .unwrap_err(); + + assert_eq!(err, MigrationContextError::InvalidFinalizeTransition); + } + } + + /// Tests for [`MemoryMigrationContext`]. + mod memory_migration_ctx_tests { + use std::time::{Duration, Instant}; + + use super::*; + use crate::protocol::MemoryRange; + + fn make_table(bytes: u64) -> MemoryRangeTable { + let mut table = MemoryRangeTable::default(); + if bytes > 0 { + table.push(MemoryRange { + gpa: 0, + length: bytes, + }); + } + table + } + + /// A controlled migration scenario with fixed timing offsets. 
+ /// + /// ```text + /// migration_begin + /// + 1.0s -> iteration_begin + /// + 1.1s -> transfer_begin + /// + 2.0s -> transfer ends (transfer_duration = 0.9s) + /// + 2.1s -> iteration ends (iteration_duration = 1.1s, overhead = 0.2s) + /// ``` + struct Scenario { + migration_begin: Instant, + iteration_begin: Instant, + transfer_begin: Instant, + transfer_duration: Duration, + } + + impl Scenario { + /// We use a fixed point in the past so all offsets are in the past too, + /// meaning elapsed() calls in the code under test will be >= our durations. + const FIXPOINT_PAST: Duration = Duration::from_secs(10); + + fn new() -> Self { + // Use a fixed point in the past so all offsets are in the past too, + // meaning elapsed() calls in the code under test will be >= our durations. + let migration_begin = Instant::now() - Self::FIXPOINT_PAST; + Self { + migration_begin, + iteration_begin: migration_begin + Duration::from_millis(1000), + transfer_begin: migration_begin + Duration::from_millis(1100), + transfer_duration: Duration::from_millis(900), + } + } + + fn make_ctx(&self) -> MemoryMigrationContext { + let mut ctx = MemoryMigrationContext::new(); + // Override migration_begin with our controlled value. 
+ ctx.migration_begin = self.migration_begin; + ctx + } + } + + #[test] + fn before_transfer_updates_begin_and_bytes() { + let s = Scenario::new(); + let mut ctx = s.make_ctx(); + + ctx.update_metrics_before_transfer(s.iteration_begin, &make_table(4096)); + + assert_eq!(ctx.iteration_begin, s.iteration_begin); + assert_eq!(ctx.current_iteration_total_bytes, 4096); + } + + #[test] + fn before_transfer_estimated_downtime() { + let s = Scenario::new(); + let mut ctx = s.make_ctx(); + + // Empty table -> zero downtime regardless of bandwidth + ctx.update_metrics_before_transfer(s.iteration_begin, &make_table(0)); + assert_eq!(ctx.estimated_downtime, Some(Duration::ZERO)); + + // No bandwidth yet -> None + ctx.update_metrics_before_transfer(s.iteration_begin, &make_table(1024)); + assert_eq!(ctx.estimated_downtime, None); + + // 1024 B/s, 1024 bytes -> 1s + ctx.bandwidth_bytes_per_second = 1024.0; + ctx.update_metrics_before_transfer(s.iteration_begin, &make_table(1024)); + assert_eq!(ctx.estimated_downtime, Some(Duration::from_secs(1))); + } + + #[test] + fn after_transfer_updates_timing_and_bandwidth() { + let s = Scenario::new(); + let mut ctx = s.make_ctx(); + + ctx.update_metrics_before_transfer(s.iteration_begin, &make_table(1024)); + ctx.update_metrics_after_transfer(s.transfer_begin, s.transfer_duration); + + assert_eq!(ctx.transfer_begin, s.transfer_begin); + assert_eq!(ctx.transfer_duration, Some(s.transfer_duration)); + // 1024 bytes / 0.9s + assert_eq!(ctx.bandwidth_bytes_per_second, 1024.0 / 0.9); + // iteration_duration = time from iteration_begin until now (>= transfer_duration) + assert!(ctx.iteration_duration.unwrap() >= s.transfer_duration); + // Zero transfer_duration -> bandwidth is 0.0, no division by zero + ctx.update_metrics_before_transfer(s.iteration_begin, &make_table(1024)); + ctx.update_metrics_after_transfer(s.transfer_begin, Duration::ZERO); + assert_eq!(ctx.bandwidth_bytes_per_second, 0.0); + + // Check finalize() sets migration duration + 
assert_eq!(ctx.migration_duration, None); + ctx.finalize(); + assert!(matches!(ctx.migration_duration, Some(d) if d >= Scenario::FIXPOINT_PAST)); + } + + #[test] + fn two_iterations_accumulate_bytes_and_feed_downtime_estimate() { + let s = Scenario::new(); + let mut ctx = s.make_ctx(); + + // Iteration 0: no bandwidth yet -> downtime is None + ctx.update_metrics_before_transfer(s.iteration_begin, &make_table(1024)); + assert_eq!(ctx.estimated_downtime, None); + ctx.update_metrics_after_transfer(s.transfer_begin, s.transfer_duration); + assert_eq!(ctx.total_sent_bytes, 1024); + + // Iteration 1: bandwidth now known -> downtime is Some + ctx.update_metrics_before_transfer(s.iteration_begin, &make_table(2048)); + assert!(ctx.estimated_downtime.is_some()); + ctx.update_metrics_after_transfer(s.transfer_begin, s.transfer_duration); + assert_eq!(ctx.total_sent_bytes, 1024 + 2048); + + // Check finalize() sets migration duration + assert_eq!(ctx.migration_duration, None); + ctx.finalize(); + assert!(matches!(ctx.migration_duration, Some(d) if d >= Scenario::FIXPOINT_PAST)); + } + + #[test] + /// The display format is specifically crafted to be very insightful in logs. + /// Therefore, we have a dedicated test for that format. 
+ fn display_format() { + let s = Scenario::new(); + let mut ctx = s.make_ctx(); + + // Iteration 0: 1 MiB in 1s + ctx.update_metrics_before_transfer(s.iteration_begin, &make_table(1024 * 1024)); + ctx.update_metrics_after_transfer(s.transfer_begin, Duration::from_secs(1)); + ctx.iteration += 1; + + // Iteration 1: 512 KiB in 1s; fix migration_duration for deterministic elapsed/avg_bw + ctx.update_metrics_before_transfer(s.iteration_begin, &make_table(512 * 1024)); + ctx.update_metrics_after_transfer(s.transfer_begin, Duration::from_secs(1)); + + ctx.migration_duration = Some(Duration::from_secs(2)); + let out = ctx.to_string(); + + assert_eq!( + out, + "iter=1 curr=1MiB total=2MiB bw=0.50MiB/s transfer=1.00s overhead=8000ms est_downtime=500ms elapsed=2.00s avg_bw=0.15MiB/s" + ); + + // Should change elapsed() time! + // Since this is at least 10s, we never face timing issues in CI! + ctx.finalize(); + let out2 = ctx.to_string(); + assert_ne!(out2, out, "elapsed time should have changed! 
is={out2}"); + } + } +} diff --git a/vm-migration/src/lib.rs b/vm-migration/src/lib.rs index 921ae5b3db..0faedf2858 100644 --- a/vm-migration/src/lib.rs +++ b/vm-migration/src/lib.rs @@ -4,14 +4,51 @@ // use anyhow::anyhow; +pub use context::{ + CompletedMigrationContext, DowntimeContext, MemoryMigrationContext, MigrationContextError, + OngoingMigrationContext, +}; use serde::{Deserialize, Serialize}; use thiserror::Error; use crate::protocol::MemoryRangeTable; mod bitpos_iterator; +mod context; pub mod protocol; +#[derive(Error, Debug)] +pub enum UffdError { + #[error("Snapshot ranges are not page-aligned")] + UnalignedRanges, + + #[error("Failed to create userfaultfd")] + Create(#[source] std::io::Error), + + #[error("Cannot translate GPA {gpa:#x} to host address")] + GpaTranslation { gpa: u64 }, + + #[error("Failed to register region at {addr:#x}+{len:#x}")] + Register { + addr: u64, + len: u64, + #[source] + source: std::io::Error, + }, + + #[error("Region at {addr:#x}+{len:#x} missing COPY/WAKE support")] + MissingIoctlSupport { addr: u64, len: u64 }, + + #[error("Failed to spawn handler thread")] + SpawnThread(#[source] std::io::Error), + + #[error("Handler terminated before startup completed")] + HandlerStartup, + + #[error("Handler failed after startup")] + HandlerFailed(#[source] std::io::Error), +} + #[derive(Error, Debug)] pub enum MigratableError { #[error("Failed to pause migratable component")] @@ -32,6 +69,9 @@ pub enum MigratableError { #[error("Failed to receive migratable component snapshot")] MigrateReceive(#[source] anyhow::Error), + #[error("On-demand restore failed")] + OnDemandRestore(#[source] UffdError), + #[error("Socket error")] MigrateSocket(#[source] std::io::Error), diff --git a/vm-migration/src/protocol.rs b/vm-migration/src/protocol.rs index 3ae226ece2..f927d7a36e 100644 --- a/vm-migration/src/protocol.rs +++ b/vm-migration/src/protocol.rs @@ -115,6 +115,7 @@ pub enum Command { Config, State, Memory, + /// Finalizes the migration and 
resumes the VM on the guest. Complete, Abandon, MemoryFd, @@ -272,12 +273,99 @@ pub struct MemoryRange { pub length: u64, } -#[derive(Clone, Default, Serialize, Deserialize)] +/// A set of guest-memory ranges to transfer as one migration payload. +#[derive(Clone, Default, Debug, Serialize, Deserialize)] pub struct MemoryRangeTable { data: Vec, } +/// Iterator returned by [`MemoryRangeTable::partition`]. +/// +/// Each item contains at most `chunk_size` bytes. A range may be split across +/// multiple items. +/// +/// The iterator may reorder ranges for efficiency, so callers must not rely on +/// the order in which chunks or ranges are yielded. +#[derive(Clone, Default, Debug)] +struct MemoryRangeTableIterator { + chunk_size: u64, + data: Vec, +} + +impl MemoryRangeTableIterator { + /// Create an iterator that partitions `table` into chunks of at most + /// `chunk_size` bytes. + pub fn new(table: MemoryRangeTable, chunk_size: u64) -> Self { + MemoryRangeTableIterator { + chunk_size, + data: table.data, + } + } +} + +impl Iterator for MemoryRangeTableIterator { + type Item = MemoryRangeTable; + + /// Return the next memory range in the table, making sure that + /// the returned range is not larger than `chunk_size`. + /// + /// **Note**: Do not rely on the order of the ranges returned by this + /// iterator. This allows for a more efficient implementation. + fn next(&mut self) -> Option { + let mut ranges: Vec = vec![]; + let mut ranges_size: u64 = 0; + + loop { + assert!(ranges_size <= self.chunk_size); + + if ranges_size == self.chunk_size || self.data.is_empty() { + break; + } + + if let Some(range) = self.data.pop() { + let next_range: MemoryRange = if ranges_size + range.length > self.chunk_size { + // How many bytes we need to put back into the table. 
+ let leftover_bytes = ranges_size + range.length - self.chunk_size; + assert!(leftover_bytes <= range.length); + let returned_bytes = range.length - leftover_bytes; + assert!(returned_bytes <= range.length); + assert_eq!(leftover_bytes + returned_bytes, range.length); + + self.data.push(MemoryRange { + gpa: range.gpa, + length: leftover_bytes, + }); + MemoryRange { + gpa: range.gpa + leftover_bytes, + length: returned_bytes, + } + } else { + range + }; + + ranges_size += next_range.length; + ranges.push(next_range); + } + } + + if ranges.is_empty() { + None + } else { + Some(MemoryRangeTable { data: ranges }) + } + } +} + impl MemoryRangeTable { + pub fn ranges(&self) -> &[MemoryRange] { + &self.data + } + + /// Partitions the table into chunks of at most `chunk_size` bytes. + pub fn partition(self, chunk_size: u64) -> impl Iterator { + MemoryRangeTableIterator::new(self, chunk_size) + } + /// Converts an iterator over a dirty bitmap into an iterator of dirty /// [`MemoryRange`]s, merging consecutive dirty pages into contiguous ranges. /// @@ -332,19 +420,19 @@ impl MemoryRangeTable { pub fn read_from(fd: &mut dyn Read, length: u64) -> Result { assert!((length as usize).is_multiple_of(size_of::())); - let mut data: Vec = Vec::new(); - data.resize_with( - length as usize / (std::mem::size_of::()), - Default::default, - ); - // SAFETY: the slice is constructed with the correct arguments - fd.read_exact(unsafe { - std::slice::from_raw_parts_mut( - data.as_ptr() as *mut MemoryRange as *mut u8, - length as usize, - ) - }) - .map_err(MigratableError::MigrateSocket)?; + let mut data: Vec = + vec![MemoryRange::default(); length as usize / size_of::()]; + + // SAFETY: The pointer points to the just created vector data. + // `MemoryRange` can be read from and written to bytes since it's `[repr(C)]`. + // The vector data was initialized with `length as usize / size_of::()` valid + // `MemoryRange`s so the memory is valid for `length` bytes. 
+ // During the lifetime of the slice, neither the backing vector nor the pointed to memory are accessed. + let data_slice_bytes = + unsafe { std::slice::from_raw_parts_mut(data.as_mut_ptr().cast(), length as usize) }; + + fd.read_exact(data_slice_bytes) + .map_err(MigratableError::MigrateSocket)?; Ok(Self { data }) } @@ -376,6 +464,11 @@ impl MemoryRangeTable { } Self { data } } + + /// Returns the effective size in bytes. + pub fn effective_size(&self) -> u64 { + self.data.iter().map(|r| r.length).sum() + } } #[cfg(test)] @@ -408,4 +501,144 @@ mod unit_tests { ] ); } + + #[test] + fn test_memory_range_table_partition() { + // We start the test similar as the one above, but with a input that is simpler to parse for + // developers. + let input = [0b11_0011_0011_0011]; + + let start_gpa = 0x1000; + let page_size = 0x1000; + + let table = MemoryRangeTable::from_dirty_bitmap(input, start_gpa, page_size); + let expected_regions = [ + MemoryRange { + gpa: start_gpa, + length: page_size * 2, + }, + MemoryRange { + gpa: start_gpa + 4 * page_size, + length: page_size * 2, + }, + MemoryRange { + gpa: start_gpa + 8 * page_size, + length: page_size * 2, + }, + MemoryRange { + gpa: start_gpa + 12 * page_size, + length: page_size * 2, + }, + ]; + assert_eq!(table.regions(), &expected_regions); + + // In the first test, we expect to see the exact same result as above, as we use the length + // of every region (which is fixed!). + { + let chunks = table + .clone() + .partition(page_size * 2) + .map(|table| table.data) + .collect::>(); + + // The implementation currently returns the ranges in reverse order. + // For better testability, we reverse it. 
+ let chunks = chunks + .into_iter() + .map(|vec| vec.into_iter().rev().collect::>()) + .rev() + .collect::>(); + + assert_eq!( + chunks, + &[ + [expected_regions[0].clone()].to_vec(), + [expected_regions[1].clone()].to_vec(), + [expected_regions[2].clone()].to_vec(), + [expected_regions[3].clone()].to_vec(), + ] + ); + } + + // Next, we have a more sophisticated test with a chunk size of 5 pages. + { + let chunks = table + .clone() + .partition(page_size * 5) + .map(|table| table.data) + .collect::>(); + + // The implementation currently returns the ranges in reverse order. + // For better testability, we reverse it. + let chunks = chunks + .into_iter() + .map(|vec| vec.into_iter().rev().collect::>()) + .rev() + .collect::>(); + + assert_eq!( + chunks, + &[ + vec![ + MemoryRange { + gpa: start_gpa, + length: 2 * page_size + }, + MemoryRange { + gpa: start_gpa + 4 * page_size, + length: page_size + } + ], + vec![ + MemoryRange { + gpa: start_gpa + 5 * page_size, + length: page_size + }, + MemoryRange { + gpa: start_gpa + 8 * page_size, + length: 2 * page_size + }, + MemoryRange { + gpa: start_gpa + 12 * page_size, + length: 2 * page_size + } + ] + ] + ); + } + } + + #[test] + fn test_memory_range_table_partition_uneven_split() { + // Three consecutive dirty pages produce one 3-page range, which lets + // us test an uneven 1+2 page split while using the same helper as the + // other partition tests above. + let input = [0b111]; + let start_gpa = 0x1000; + let page_size = 0x1000; + + let table = MemoryRangeTable::from_dirty_bitmap(input, start_gpa, page_size); + + let chunks = table + .partition(page_size * 2) + .map(|table| table.data) + .collect::>(); + + // The implementation currently returns ranges in reverse order. 
+ let chunks = chunks.into_iter().rev().collect::>(); + + assert_eq!( + chunks, + &[ + vec![MemoryRange { + gpa: start_gpa, + length: page_size, + }], + vec![MemoryRange { + gpa: start_gpa + page_size, + length: page_size * 2, + }], + ] + ); + } } diff --git a/vm-virtio/Cargo.toml b/vm-virtio/Cargo.toml index 228f552416..de90b209d0 100644 --- a/vm-virtio/Cargo.toml +++ b/vm-virtio/Cargo.toml @@ -2,6 +2,7 @@ authors = ["The Cloud Hypervisor Authors"] edition.workspace = true name = "vm-virtio" +rust-version.workspace = true version = "0.1.0" [features] diff --git a/vm-virtio/src/lib.rs b/vm-virtio/src/lib.rs index b7f5370702..fbd94b2b72 100644 --- a/vm-virtio/src/lib.rs +++ b/vm-virtio/src/lib.rs @@ -39,7 +39,7 @@ pub enum VirtioDeviceType { Mem = 24, Fs = 26, Pmem = 27, - Watchdog = 35, // Temporary until official number allocated + Watchdog = 35, Unknown = 0xFF, } @@ -101,32 +101,60 @@ pub trait AccessPlatform: Send + Sync + Debug { } pub trait Translatable { - fn translate_gva(&self, access_platform: Option<&dyn AccessPlatform>, len: usize) -> Self; - fn translate_gpa(&self, access_platform: Option<&dyn AccessPlatform>, len: usize) -> Self; + fn translate_gva( + &self, + access_platform: Option<&dyn AccessPlatform>, + len: usize, + ) -> std::result::Result + where + Self: Sized; + fn translate_gpa( + &self, + access_platform: Option<&dyn AccessPlatform>, + len: usize, + ) -> std::result::Result + where + Self: Sized; } impl Translatable for GuestAddress { - fn translate_gva(&self, access_platform: Option<&dyn AccessPlatform>, len: usize) -> Self { - GuestAddress(self.0.translate_gva(access_platform, len)) + fn translate_gva( + &self, + access_platform: Option<&dyn AccessPlatform>, + len: usize, + ) -> std::result::Result { + Ok(GuestAddress(self.0.translate_gva(access_platform, len)?)) } - fn translate_gpa(&self, access_platform: Option<&dyn AccessPlatform>, len: usize) -> Self { - GuestAddress(self.0.translate_gpa(access_platform, len)) + fn translate_gpa( + 
&self, + access_platform: Option<&dyn AccessPlatform>, + len: usize, + ) -> std::result::Result { + Ok(GuestAddress(self.0.translate_gpa(access_platform, len)?)) } } impl Translatable for u64 { - fn translate_gva(&self, access_platform: Option<&dyn AccessPlatform>, len: usize) -> Self { + fn translate_gva( + &self, + access_platform: Option<&dyn AccessPlatform>, + len: usize, + ) -> std::result::Result { if let Some(access_platform) = access_platform { - access_platform.translate_gva(*self, len as u64).unwrap() + access_platform.translate_gva(*self, len as u64) } else { - *self + Ok(*self) } } - fn translate_gpa(&self, access_platform: Option<&dyn AccessPlatform>, len: usize) -> Self { + fn translate_gpa( + &self, + access_platform: Option<&dyn AccessPlatform>, + len: usize, + ) -> std::result::Result { if let Some(access_platform) = access_platform { - access_platform.translate_gpa(*self, len as u64).unwrap() + access_platform.translate_gpa(*self, len as u64) } else { - *self + Ok(*self) } } } diff --git a/vmm/Cargo.toml b/vmm/Cargo.toml index 4cfa4ed3a1..7c8354a0cb 100644 --- a/vmm/Cargo.toml +++ b/vmm/Cargo.toml @@ -2,6 +2,7 @@ authors = ["The Cloud Hypervisor Authors"] edition.workspace = true name = "vmm" +rust-version.workspace = true version = "0.1.0" [features] @@ -16,8 +17,10 @@ ivshmem = ["devices/ivshmem"] kvm = [ "arch/kvm", "hypervisor/kvm", + "iommufd-ioctls", "pci/kvm", "vfio-ioctls/kvm", + "vfio-ioctls/vfio_cdev", "virtio-devices/kvm", "vm-device/kvm", ] @@ -29,7 +32,12 @@ mshv = [ "vm-device/mshv", ] pvmemcontrol = ["devices/pvmemcontrol"] -sev_snp = ["arch/sev_snp", "hypervisor/sev_snp", "virtio-devices/sev_snp"] +sev_snp = [ + "arch/sev_snp", + "hypervisor/sev_snp", + "igvm_defs", + "virtio-devices/sev_snp", +] tdx = ["arch/tdx", "hypervisor/tdx"] tracing = ["tracer/tracing"] @@ -48,12 +56,13 @@ epoll = { workspace = true } event_monitor = { path = "../event_monitor" } flume = { workspace = true } futures = { version = "0.3.32", optional = true } 
-gdbstub = { version = "0.7.9", optional = true } -gdbstub_arch = { version = "0.3.2", optional = true } +gdbstub = { version = "0.7.10", optional = true } +gdbstub_arch = { version = "0.3.3", optional = true } hex = { version = "0.4.3", optional = true } hypervisor = { path = "../hypervisor" } igvm = { workspace = true, optional = true } igvm_defs = { workspace = true, optional = true } +iommufd-ioctls = { workspace = true, optional = true } landlock = "0.4.4" libc = { workspace = true } linux-loader = { workspace = true, features = ["bzimage", "elf", "pe"] } @@ -91,7 +100,7 @@ vm-memory = { workspace = true, features = [ vm-migration = { path = "../vm-migration" } vm-virtio = { path = "../vm-virtio" } vmm-sys-util = { workspace = true, features = ["with-serde"] } -zbus = { version = "5.13.2", optional = true } +zbus = { version = "5.14.0", optional = true } zerocopy = { workspace = true, features = ["alloc", "derive"] } [lints] diff --git a/vmm/src/acpi.rs b/vmm/src/acpi.rs index 8f46b20ddb..ac05306bcf 100644 --- a/vmm/src/acpi.rs +++ b/vmm/src/acpi.rs @@ -2,7 +2,6 @@ // // SPDX-License-Identifier: Apache-2.0 // -use std::sync::{Arc, Mutex}; use std::time::Instant; use acpi_tables::Aml; @@ -192,7 +191,7 @@ bitflags! 
{ impl MemoryAffinity { fn from_region( - region: &Arc, + region: &GuestRegionMmap, proximity_domain: u32, flags: MemAffinityFlags, ) -> Self { @@ -258,9 +257,9 @@ struct ViotPciRangeNode { } pub fn create_dsdt_table( - device_manager: &Arc>, - cpu_manager: &Arc>, - memory_manager: &Arc>, + device_manager: &DeviceManager, + cpu_manager: &CpuManager, + memory_manager: &MemoryManager, ) -> Sdt { trace_scoped!("create_dsdt_table"); // DSDT @@ -268,9 +267,9 @@ pub fn create_dsdt_table( let mut bytes = Vec::new(); - device_manager.lock().unwrap().to_aml_bytes(&mut bytes); - cpu_manager.lock().unwrap().to_aml_bytes(&mut bytes); - memory_manager.lock().unwrap().to_aml_bytes(&mut bytes); + device_manager.to_aml_bytes(&mut bytes); + cpu_manager.to_aml_bytes(&mut bytes); + memory_manager.to_aml_bytes(&mut bytes); dsdt.append_slice(&bytes); dsdt @@ -278,14 +277,13 @@ pub fn create_dsdt_table( const FACP_DSDT_OFFSET: usize = 140; -fn create_facp_table(dsdt_offset: GuestAddress, device_manager: &Arc>) -> Sdt { +fn create_facp_table(dsdt_offset: GuestAddress, device_manager: &DeviceManager) -> Sdt { trace_scoped!("create_facp_table"); // Revision 6 of the ACPI FADT table is 276 bytes long let mut facp = Sdt::new(*b"FACP", 276, 6, *b"CLOUDH", *b"CHFACP ", 1); { - let device_manager = device_manager.lock().unwrap(); if let Some(address) = device_manager.acpi_platform_addresses().reset_reg_address { // RESET_REG facp.write(116, address); @@ -369,7 +367,7 @@ fn create_tpm2_table() -> Sdt { fn create_srat_table( numa_nodes: &NumaNodes, - device_manager: &Arc>, + device_manager: &DeviceManager, #[cfg(target_arch = "x86_64")] topology: Option<(u16, u16, u16, u16)>, ) -> Sdt { let mut srat = Sdt::new(*b"SRAT", 36, 3, *b"CLOUDH", *b"CHSRAT ", 1); @@ -381,7 +379,6 @@ fn create_srat_table( assert_eq!(std::mem::size_of::(), 40); // Confirm struct size matches ACPI 6.6 spec assert_eq!(std::mem::size_of::(), 32); - let dm = device_manager.lock().unwrap(); for (node_id, node) in 
numa_nodes.iter() { let proximity_domain = *node_id; @@ -436,7 +433,7 @@ fn create_srat_table( // Add Generic Initiator Affinity structures for device-only NUMA nodes if let Some(device_id) = &node.device_id { // Resolve device_id to guest BDF - if let Some(bdf) = dm.get_device_bdf(device_id) { + if let Some(bdf) = device_manager.get_device_bdf(device_id) { srat.append(GenericInitiatorAffinity::from_pci_bdf( bdf, proximity_domain, @@ -852,9 +849,9 @@ fn create_viot_table(iommu_bdf: &PciBdf, devices_bdf: &[PciBdf]) -> Sdt { // * `Vec` contains a list of table pointers stored in XSDT. fn create_acpi_tables_internal( dsdt_addr: GuestAddress, - device_manager: &Arc>, - cpu_manager: &Arc>, - memory_manager: &Arc>, + device_manager: &DeviceManager, + cpu_manager: &CpuManager, + memory_manager: &MemoryManager, numa_nodes: &NumaNodes, tpm_enabled: bool, ) -> (Rsdp, Vec, Vec) { @@ -876,15 +873,13 @@ fn create_acpi_tables_internal( // MADT #[cfg(target_arch = "aarch64")] let vgic = device_manager - .lock() - .unwrap() .get_interrupt_controller() .unwrap() .lock() .unwrap() .get_vgic() .unwrap(); - let madt = cpu_manager.lock().unwrap().create_madt( + let madt = cpu_manager.create_madt( #[cfg(target_arch = "aarch64")] vgic, ); @@ -897,7 +892,7 @@ fn create_acpi_tables_internal( // PPTT #[cfg(target_arch = "aarch64")] { - let pptt = cpu_manager.lock().unwrap().create_pptt(); + let pptt = cpu_manager.create_pptt(); let pptt_addr = prev_tbl_addr.checked_add(prev_tbl_len).unwrap(); tables_bytes.extend_from_slice(pptt.as_slice()); xsdt_table_pointers.push(pptt_addr.0); @@ -917,7 +912,7 @@ fn create_acpi_tables_internal( } // MCFG - let mcfg = create_mcfg_table(device_manager.lock().unwrap().pci_segments()); + let mcfg = create_mcfg_table(device_manager.pci_segments()); let mcfg_addr = prev_tbl_addr.checked_add(prev_tbl_len).unwrap(); tables_bytes.extend_from_slice(mcfg.as_slice()); xsdt_table_pointers.push(mcfg_addr.0); @@ -928,16 +923,12 @@ fn create_acpi_tables_internal( 
#[cfg(target_arch = "aarch64")] { let is_serial_on = device_manager - .lock() - .unwrap() .get_device_info() .clone() .contains_key(&(DeviceType::Serial, DeviceType::Serial.to_string())); let serial_device_addr = arch::layout::LEGACY_SERIAL_MAPPED_IO_START.raw_value(); let serial_device_irq = if is_serial_on { device_manager - .lock() - .unwrap() .get_device_info() .clone() .get(&(DeviceType::Serial, DeviceType::Serial.to_string())) @@ -979,7 +970,7 @@ fn create_acpi_tables_internal( // Only created if the NUMA nodes list is not empty. if !numa_nodes.is_empty() { #[cfg(target_arch = "x86_64")] - let topology = cpu_manager.lock().unwrap().get_vcpu_topology(); + let topology = cpu_manager.get_vcpu_topology(); // SRAT let srat = create_srat_table( numa_nodes, @@ -1003,7 +994,7 @@ fn create_acpi_tables_internal( #[cfg(target_arch = "aarch64")] { - let iort = create_iort_table(device_manager.lock().unwrap().pci_segments()); + let iort = create_iort_table(device_manager.pci_segments()); let iort_addr = prev_tbl_addr.checked_add(prev_tbl_len).unwrap(); tables_bytes.extend_from_slice(iort.as_slice()); xsdt_table_pointers.push(iort_addr.0); @@ -1012,8 +1003,7 @@ fn create_acpi_tables_internal( } // VIOT - if let Some((iommu_bdf, devices_bdf)) = device_manager.lock().unwrap().iommu_attached_devices() - { + if let Some((iommu_bdf, devices_bdf)) = device_manager.iommu_attached_devices() { let viot = create_viot_table(iommu_bdf, devices_bdf); let viot_addr = prev_tbl_addr.checked_add(prev_tbl_len).unwrap(); @@ -1040,9 +1030,9 @@ fn create_acpi_tables_internal( #[cfg(feature = "fw_cfg")] pub fn create_acpi_tables_for_fw_cfg( - device_manager: &Arc>, - cpu_manager: &Arc>, - memory_manager: &Arc>, + device_manager: &DeviceManager, + cpu_manager: &CpuManager, + memory_manager: &MemoryManager, numa_nodes: &NumaNodes, tpm_enabled: bool, ) -> Result<(), crate::vm::Error> { @@ -1087,8 +1077,6 @@ pub fn create_acpi_tables_for_fw_cfg( checksums.push(xsdt_checksum); device_manager - 
.lock() - .unwrap() .fw_cfg() .expect("fw_cfg must be present") .lock() @@ -1099,9 +1087,9 @@ pub fn create_acpi_tables_for_fw_cfg( pub fn create_acpi_tables( guest_mem: &GuestMemoryMmap, - device_manager: &Arc>, - cpu_manager: &Arc>, - memory_manager: &Arc>, + device_manager: &DeviceManager, + cpu_manager: &CpuManager, + memory_manager: &MemoryManager, numa_nodes: &NumaNodes, tpm_enabled: bool, ) -> GuestAddress { @@ -1139,9 +1127,9 @@ pub fn create_acpi_tables( #[cfg(feature = "tdx")] pub fn create_acpi_tables_tdx( - device_manager: &Arc>, - cpu_manager: &Arc>, - memory_manager: &Arc>, + device_manager: &DeviceManager, + cpu_manager: &CpuManager, + memory_manager: &MemoryManager, numa_nodes: &NumaNodes, ) -> Vec { // DSDT @@ -1155,18 +1143,16 @@ pub fn create_acpi_tables_tdx( tables.push(create_facp_table(GuestAddress(0), device_manager)); // MADT - tables.push(cpu_manager.lock().unwrap().create_madt()); + tables.push(cpu_manager.create_madt()); // MCFG - tables.push(create_mcfg_table( - device_manager.lock().unwrap().pci_segments(), - )); + tables.push(create_mcfg_table(device_manager.pci_segments())); // SRAT and SLIT // Only created if the NUMA nodes list is not empty. 
if !numa_nodes.is_empty() { #[cfg(target_arch = "x86_64")] - let topology = cpu_manager.lock().unwrap().get_vcpu_topology(); + let topology = cpu_manager.get_vcpu_topology(); // SRAT tables.push(create_srat_table( @@ -1181,8 +1167,7 @@ pub fn create_acpi_tables_tdx( } // VIOT - if let Some((iommu_bdf, devices_bdf)) = device_manager.lock().unwrap().iommu_attached_devices() - { + if let Some((iommu_bdf, devices_bdf)) = device_manager.iommu_attached_devices() { tables.push(create_viot_table(iommu_bdf, devices_bdf)); } diff --git a/vmm/src/api/dbus/mod.rs b/vmm/src/api/dbus/mod.rs index 6f75fb5cda..ae39feb7d7 100644 --- a/vmm/src/api/dbus/mod.rs +++ b/vmm/src/api/dbus/mod.rs @@ -22,10 +22,10 @@ use super::{ApiAction, ApiRequest}; #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] use crate::api::VmCoredump; use crate::api::{ - AddDisk, Body, VmAddDevice, VmAddFs, VmAddNet, VmAddPmem, VmAddUserDevice, VmAddVdpa, - VmAddVsock, VmBoot, VmCounters, VmCreate, VmDelete, VmInfo, VmPause, VmPowerButton, VmReboot, - VmReceiveMigration, VmRemoveDevice, VmResize, VmResizeZone, VmRestore, VmResume, - VmSendMigration, VmShutdown, VmSnapshot, VmmPing, VmmShutdown, + AddDisk, Body, VmAddDevice, VmAddFs, VmAddGenericVhostUser, VmAddNet, VmAddPmem, + VmAddUserDevice, VmAddVdpa, VmAddVsock, VmBoot, VmCounters, VmCreate, VmDelete, VmInfo, + VmPause, VmPowerButton, VmReboot, VmReceiveMigration, VmRemoveDevice, VmResize, VmResizeZone, + VmRestore, VmResume, VmSendMigration, VmShutdown, VmSnapshot, VmmPing, VmmShutdown, }; use crate::seccomp_filters::{Thread, get_seccomp_filter}; use crate::{Error as VmmError, NetConfig, Result as VmmResult, VmConfig}; @@ -144,6 +144,16 @@ impl DBusApi { self.vm_action(&VmAddFs, fs_config).await } + async fn vm_add_generic_vhost_user( + &self, + generic_vhost_user_config: String, + ) -> Result> { + let generic_vhost_user_config = + serde_json::from_str(&generic_vhost_user_config).map_err(api_error)?; + self.vm_action(&VmAddGenericVhostUser, 
generic_vhost_user_config) + .await + } + async fn vm_add_net(&self, net_config: String) -> Result> { let mut net_config: NetConfig = serde_json::from_str(&net_config).map_err(api_error)?; if net_config.fds.is_some() { diff --git a/vmm/src/api/http/http_endpoint.rs b/vmm/src/api/http/http_endpoint.rs index e463a20819..92b53ac68e 100644 --- a/vmm/src/api/http/http_endpoint.rs +++ b/vmm/src/api/http/http_endpoint.rs @@ -45,10 +45,11 @@ use crate::api::VmCoredump; use crate::api::http::http_endpoint::fds_helper::{attach_fds_to_cfg, attach_fds_to_cfgs}; use crate::api::http::{EndpointHandler, HttpError, error_response}; use crate::api::{ - AddDisk, ApiAction, ApiError, ApiRequest, NetConfig, VmAddDevice, VmAddFs, VmAddNet, VmAddPmem, - VmAddUserDevice, VmAddVdpa, VmAddVsock, VmBoot, VmConfig, VmCounters, VmDelete, VmNmi, VmPause, - VmPowerButton, VmReboot, VmReceiveMigration, VmRemoveDevice, VmResize, VmResizeDisk, - VmResizeZone, VmRestore, VmResume, VmSendMigration, VmShutdown, VmSnapshot, + AddDisk, ApiAction, ApiError, ApiRequest, NetConfig, VmAddDevice, VmAddFs, + VmAddGenericVhostUser, VmAddNet, VmAddPmem, VmAddUserDevice, VmAddVdpa, VmAddVsock, VmBoot, + VmConfig, VmCounters, VmDelete, VmNmi, VmPause, VmPowerButton, VmReboot, VmReceiveMigration, + VmRemoveDevice, VmResize, VmResizeDisk, VmResizeZone, VmRestore, VmResume, VmSendMigration, + VmShutdown, VmSnapshot, }; use crate::config::RestoreConfig; use crate::cpu::Error as CpuError; @@ -121,7 +122,7 @@ mod fds_helper { impl ConfigWithFDs for NetConfig { fn id(&self) -> Option<&str> { - self.id.as_deref() + self.pci_common.id.as_deref() } fn fds_from_http_body(&self) -> Option<&[RawFd]> { @@ -419,6 +420,7 @@ vm_action_put_handler!(VmNmi); vm_action_put_handler_body!(VmAddDevice); vm_action_put_handler_body!(AddDisk); vm_action_put_handler_body!(VmAddFs); +vm_action_put_handler_body!(VmAddGenericVhostUser); vm_action_put_handler_body!(VmAddPmem); vm_action_put_handler_body!(VmAddVdpa); 
vm_action_put_handler_body!(VmAddVsock); diff --git a/vmm/src/api/http/mod.rs b/vmm/src/api/http/mod.rs index 2aa52e8e37..3461c08af9 100644 --- a/vmm/src/api/http/mod.rs +++ b/vmm/src/api/http/mod.rs @@ -28,10 +28,10 @@ use self::http_endpoint::{VmActionHandler, VmCreate, VmInfo, VmmPing, VmmShutdow #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] use crate::api::VmCoredump; use crate::api::{ - AddDisk, ApiError, ApiRequest, VmAddDevice, VmAddFs, VmAddNet, VmAddPmem, VmAddUserDevice, - VmAddVdpa, VmAddVsock, VmBoot, VmCounters, VmDelete, VmNmi, VmPause, VmPowerButton, VmReboot, - VmReceiveMigration, VmRemoveDevice, VmResize, VmResizeDisk, VmResizeZone, VmRestore, VmResume, - VmSendMigration, VmShutdown, VmSnapshot, + AddDisk, ApiError, ApiRequest, VmAddDevice, VmAddFs, VmAddGenericVhostUser, VmAddNet, + VmAddPmem, VmAddUserDevice, VmAddVdpa, VmAddVsock, VmBoot, VmCounters, VmDelete, VmNmi, + VmPause, VmPowerButton, VmReboot, VmReceiveMigration, VmRemoveDevice, VmResize, VmResizeDisk, + VmResizeZone, VmRestore, VmResume, VmSendMigration, VmShutdown, VmSnapshot, }; use crate::landlock::Landlock; use crate::seccomp_filters::{Thread, get_seccomp_filter}; @@ -196,6 +196,10 @@ pub static HTTP_ROUTES: LazyLock = LazyLock::new(|| { endpoint!("/vm.add-fs"), Box::new(VmActionHandler::new(&VmAddFs)), ); + r.routes.insert( + endpoint!("/vm.add-generic-vhost-user"), + Box::new(VmActionHandler::new(&VmAddGenericVhostUser)), + ); r.routes.insert( endpoint!("/vm.add-net"), Box::new(VmActionHandler::new(&VmAddNet)), diff --git a/vmm/src/api/mod.rs b/vmm/src/api/mod.rs index 12ca6b9877..e4ee7235ad 100644 --- a/vmm/src/api/mod.rs +++ b/vmm/src/api/mod.rs @@ -34,10 +34,14 @@ pub mod dbus; pub mod http; use std::io; +use std::num::{NonZeroU32, NonZeroU64}; +use std::str::FromStr; use std::sync::mpsc::{RecvError, SendError, Sender, channel}; +use std::time::Duration; use log::info; use micro_http::Body; +use option_parser::{OptionParser, OptionParserError, Toggle}; use 
serde::{Deserialize, Serialize}; use thiserror::Error; use vm_migration::MigratableError; @@ -49,10 +53,11 @@ pub use self::http::{start_http_fd_thread, start_http_path_thread}; use crate::Error as VmmError; use crate::config::RestoreConfig; use crate::device_tree::DeviceTree; +use crate::migration_transport::MAX_MIGRATION_CONNECTIONS; use crate::vm::{Error as VmError, VmState}; use crate::vm_config::{ - DeviceConfig, DiskConfig, FsConfig, NetConfig, PmemConfig, UserDeviceConfig, VdpaConfig, - VmConfig, VsockConfig, + DeviceConfig, DiskConfig, FsConfig, GenericVhostUserConfig, NetConfig, PmemConfig, + UserDeviceConfig, VdpaConfig, VmConfig, VsockConfig, }; /// API errors are sent back from the VMM API server through the ApiResponse. @@ -170,6 +175,10 @@ pub enum ApiError { #[error("The fs could not be added to the VM")] VmAddFs(#[source] VmError), + /// The generic vhost-user device could not be added to the VM. + #[error("The generic vhost-user device could not be added to the VM")] + VmAddGenericVhostUser(#[source] VmError), + /// The pmem device could not be added to the VM. #[error("The pmem device could not be added to the VM")] VmAddPmem(#[source] VmError), @@ -262,13 +271,220 @@ pub struct VmReceiveMigrationData { pub receiver_url: String, } -#[derive(Clone, Deserialize, Serialize, Default, Debug)] +#[derive(Copy, Clone, Default, Deserialize, Serialize, Debug, PartialEq, Eq)] +/// The migration timeout strategy. +/// +/// This strategy describes the behavior of the migration when the target +/// downtime can't be reached in the given timeout. +pub enum TimeoutStrategy { + #[default] + /// Cancel the migration and keep the VM running on the source. + Cancel, + /// Ignore the timeout and migrate anyway. 
+ Ignore, +} + +impl FromStr for TimeoutStrategy { + type Err = String; + fn from_str(s: &str) -> Result { + match s.to_lowercase().as_str() { + "cancel" => Ok(TimeoutStrategy::Cancel), + "ignore" => Ok(TimeoutStrategy::Ignore), + _ => Err(format!("Invalid timeout strategy: {s}")), + } + } +} + +#[derive(Debug, Error)] +pub enum VmSendMigrationConfigError { + #[error("Error parsing send migration parameters")] + ParseError(#[source] OptionParserError), + + #[error("Error validating send migration parameters")] + ValidationError(String), +} + +/// Configuration for an outgoing migration. +#[derive(Clone, Deserialize, Serialize, Debug)] +#[cfg_attr(test, derive(PartialEq))] pub struct VmSendMigrationData { - /// URL to migrate the VM to + /// Migration destination, e.g. `tcp::` or `unix:/path/to/socket`. pub destination_url: String, /// Send memory across socket without copying #[serde(default)] pub local: bool, + /// The maximum downtime the migration aims for. + /// + /// Usually, on the order of a few hundred milliseconds. + #[serde(default = "VmSendMigrationData::default_downtime_ms")] + downtime_ms: NonZeroU64, + /// The timeout for the migration, i.e., the maximum duration. + #[serde(default = "VmSendMigrationData::default_timeout_s")] + timeout_s: NonZeroU64, + /// The timeout strategy for the migration. + #[serde(default)] + pub timeout_strategy: TimeoutStrategy, + + /// The number of parallel TCP connections for migration. + /// + /// Must be between 1 and `MAX_MIGRATION_CONNECTIONS` inclusive. + #[serde(default = "VmSendMigrationData::default_connections")] + pub connections: NonZeroU32, +} + +impl VmSendMigrationData { + pub const SYNTAX: &'static str = "VM send migration parameters \ + \"destination_url=[,local=on|off,\ + downtime_ms=,timeout_s=,\ + timeout_strategy=cancel|ignore,connections=]\""; + + // Same as QEMU. 
+ pub const DEFAULT_DOWNTIME: Duration = Duration::from_millis(300); + pub const DEFAULT_TIMEOUT: Duration = Duration::from_secs(60 * 60 /* one hour */); + + fn default_downtime_ms() -> NonZeroU64 { + let ms_u64 = u64::try_from(Self::DEFAULT_DOWNTIME.as_millis()).unwrap(); + NonZeroU64::new(ms_u64).unwrap() + } + + fn default_timeout_s() -> NonZeroU64 { + NonZeroU64::new(Self::DEFAULT_TIMEOUT.as_secs()).unwrap() + } + + // Use a single connection as default for backward compatibility. + fn default_connections() -> NonZeroU32 { + NonZeroU32::new(1).unwrap() + } + + pub fn parse(migration: &str) -> Result { + let mut parser = OptionParser::new(); + parser + .add("destination_url") + .add("local") + .add("downtime_ms") + .add("timeout_s") + .add("timeout_strategy") + .add("connections"); + parser + .parse(migration) + .map_err(VmSendMigrationConfigError::ParseError)?; + + let destination_url = parser.get("destination_url").ok_or_else(|| { + VmSendMigrationConfigError::ParseError(OptionParserError::InvalidSyntax( + "destination_url is required".to_string(), + )) + })?; + let local = parser + .convert::("local") + .map_err(VmSendMigrationConfigError::ParseError)? + .unwrap_or(Toggle(false)) + .0; + let downtime_ms = match parser + .convert::("downtime_ms") + .map_err(VmSendMigrationConfigError::ParseError)? + { + Some(v) => NonZeroU64::new(v).ok_or_else(|| { + VmSendMigrationConfigError::ParseError(OptionParserError::InvalidValue( + "downtime_ms must be non-zero".to_string(), + )) + })?, + None => Self::default_downtime_ms(), + }; + let timeout_s = match parser + .convert::("timeout_s") + .map_err(VmSendMigrationConfigError::ParseError)? + { + Some(v) => NonZeroU64::new(v).ok_or_else(|| { + VmSendMigrationConfigError::ParseError(OptionParserError::InvalidValue( + "timeout_s must be non-zero".to_string(), + )) + })?, + None => Self::default_timeout_s(), + }; + let timeout_strategy = parser + .convert("timeout_strategy") + .map_err(VmSendMigrationConfigError::ParseError)? 
+ .unwrap_or_default(); + let connections = match parser + .convert::("connections") + .map_err(VmSendMigrationConfigError::ParseError)? + { + Some(v) => NonZeroU32::new(v).ok_or_else(|| { + VmSendMigrationConfigError::ParseError(OptionParserError::InvalidValue( + "connections must be non-zero".to_string(), + )) + })?, + None => Self::default_connections(), + }; + + let data = Self { + destination_url, + local, + downtime_ms, + timeout_s, + timeout_strategy, + connections, + }; + + data.validate()?; + + Ok(data) + } + + pub fn downtime(&self) -> Duration { + Duration::from_millis(self.downtime_ms.get()) + } + + pub fn timeout(&self) -> Duration { + Duration::from_secs(self.timeout_s.get()) + } + + pub fn validate(&self) -> Result<(), VmSendMigrationConfigError> { + match self.destination_url.as_str() { + url if url + .strip_prefix("tcp:") + .is_some_and(|addr| !addr.is_empty()) => {} + url if url + .strip_prefix("unix:") + .is_some_and(|path| !path.is_empty()) => + { + if self.connections.get() > 1 { + return Err(VmSendMigrationConfigError::ValidationError( + "UNIX sockets and connections option cannot be used at the same time." + .to_string(), + )); + } + } + _ => { + return Err(VmSendMigrationConfigError::ValidationError( + "destination_url must use tcp:: or unix:.".to_string(), + )); + } + } + + if self.connections.get() > MAX_MIGRATION_CONNECTIONS { + return Err(VmSendMigrationConfigError::ValidationError(format!( + "connections must not exceed {MAX_MIGRATION_CONNECTIONS}." + ))); + } + + if self.local { + if !self.destination_url.starts_with("unix:") { + return Err(VmSendMigrationConfigError::ValidationError( + "local option is only supported with UNIX sockets.".to_string(), + )); + } + + if self.connections.get() > 1 { + return Err(VmSendMigrationConfigError::ValidationError( + "local option and connections option cannot be used at the same time." 
+ .to_string(), + )); + } + } + + Ok(()) + } } pub enum ApiResponsePayload { @@ -340,6 +556,11 @@ pub trait RequestHandler { fn vm_add_fs(&mut self, fs_cfg: FsConfig) -> Result>, VmError>; + fn vm_add_generic_vhost_user( + &mut self, + fs_cfg: GenericVhostUserConfig, + ) -> Result>, VmError>; + fn vm_add_pmem(&mut self, pmem_cfg: PmemConfig) -> Result>, VmError>; fn vm_add_net(&mut self, net_cfg: NetConfig) -> Result>, VmError>; @@ -539,6 +760,43 @@ impl ApiAction for VmAddFs { } } +pub struct VmAddGenericVhostUser; + +impl ApiAction for VmAddGenericVhostUser { + type RequestBody = GenericVhostUserConfig; + type ResponseBody = Option; + + fn request( + &self, + config: Self::RequestBody, + response_sender: Sender, + ) -> ApiRequest { + Box::new(move |vmm| { + info!("API request event: VmAddGenericVhostUser {config:?}"); + + let response = vmm + .vm_add_generic_vhost_user(config) + .map_err(ApiError::VmAddGenericVhostUser) + .map(ApiResponsePayload::VmAction); + + response_sender + .send(response) + .map_err(VmmError::ApiResponseSend)?; + + Ok(false) + }) + } + + fn send( + &self, + api_evt: EventFd, + api_sender: Sender, + data: Self::RequestBody, + ) -> ApiResult { + get_response_body(self, api_evt, api_sender, data) + } +} + pub struct VmAddPmem; impl ApiAction for VmAddPmem { @@ -1495,3 +1753,108 @@ impl ApiAction for VmNmi { get_response_body(self, api_evt, api_sender, data) } } + +#[cfg(test)] +mod unit_tests { + use super::*; + + #[test] + fn test_vm_send_migration_data_parse() { + // Fully specified + let data = VmSendMigrationData::parse( + "destination_url=unix:/tmp/migrate.sock,local=on,downtime_ms=200,timeout_s=3600,timeout_strategy=cancel" + ).expect("valid migration string should parse"); + assert_eq!(data.destination_url, "unix:/tmp/migrate.sock"); + assert!(data.local); + assert_eq!(data.downtime_ms.get(), 200); + assert_eq!(data.timeout_s.get(), 3600); + assert_eq!(data.timeout_strategy, TimeoutStrategy::Cancel); + assert_eq!(data.connections.get(), 
1); + + // Defaults applied when optional fields are omitted + let data = VmSendMigrationData::parse("destination_url=tcp:192.168.1.1:8080") + .expect("minimal migration string should parse"); + assert_eq!(data.destination_url, "tcp:192.168.1.1:8080"); + assert!(!data.local); + assert_eq!(data.downtime_ms, VmSendMigrationData::default_downtime_ms()); + assert_eq!(data.timeout_s, VmSendMigrationData::default_timeout_s()); + assert_eq!(data.timeout_strategy, TimeoutStrategy::default()); + assert_eq!(data.connections, VmSendMigrationData::default_connections()); + + // Missing destination_url is an error + VmSendMigrationData::parse("local=on,downtime_ms=200").unwrap_err(); + + // Zero downtime_ms is rejected + let _data = + VmSendMigrationData::parse("destination_url=tcp:192.168.1.1:8080,downtime_ms=0") + .expect_err("zero downtime_ms should be rejected"); + + // Zero timeout_s is rejected + let _data = VmSendMigrationData::parse("destination_url=unix:/tmp/sock,timeout_s=0") + .expect_err("zero timeout_s should be rejected"); + + // Zero connections is rejected + let _data = + VmSendMigrationData::parse("destination_url=tcp:192.168.1.1:8080,connections=0") + .expect_err("zero connections should be rejected"); + + // Excessive numbers of parallel connections are rejected + let _data = + VmSendMigrationData::parse("destination_url=tcp:192.168.1.1:8080,connections=129") + .expect_err("too many connections should be rejected"); + + // Unknown option is an error + VmSendMigrationData::parse("destination_url=unix:/tmp/sock,unknown_field=foo").unwrap_err(); + + // Invalid toggle value is an error + VmSendMigrationData::parse("destination_url=unix:/tmp/sock,local=yes").unwrap_err(); + + // Timeout strategy + let _data = VmSendMigrationData::parse( + "destination_url=tcp:192.168.1.1:8080,timeout_strategy=invalid", + ) + .expect_err("invalid timeout strategy should be rejected"); + + // Invalid destination URL scheme is rejected + 
VmSendMigrationData::parse("destination_url=file:///tmp/migration").unwrap_err(); + + // Local migration requires a UNIX socket destination + VmSendMigrationData::parse("destination_url=tcp:192.168.1.1:8080,local=yes").unwrap_err(); + + // Local migration cannot use multiple connections + VmSendMigrationData::parse("destination_url=unix:/tmp/sock,local=yes,connections=2") + .unwrap_err(); + + // Happy path with some defaults + let data = + VmSendMigrationData::parse("destination_url=tcp:192.168.1.1:8080,downtime_ms=150") + .unwrap(); + assert_eq!( + data, + VmSendMigrationData { + destination_url: "tcp:192.168.1.1:8080".to_string(), + local: false, + downtime_ms: NonZeroU64::new(150).unwrap(), + timeout_s: VmSendMigrationData::default_timeout_s(), + timeout_strategy: Default::default(), + connections: VmSendMigrationData::default_connections(), + } + ); + + // Happy path, fully specified + let data = + VmSendMigrationData::parse("destination_url=tcp:192.168.1.1:8080,downtime_ms=150,timeout_s=900,timeout_strategy=ignore,connections=4") + .unwrap(); + assert_eq!( + data, + VmSendMigrationData { + destination_url: "tcp:192.168.1.1:8080".to_string(), + local: false, + downtime_ms: NonZeroU64::new(150).unwrap(), + timeout_s: NonZeroU64::new(900).unwrap(), + timeout_strategy: TimeoutStrategy::Ignore, + connections: NonZeroU32::new(4).unwrap(), + } + ); + } +} diff --git a/vmm/src/api/openapi/cloud-hypervisor.yaml b/vmm/src/api/openapi/cloud-hypervisor.yaml index 629a6800d1..422660d07a 100644 --- a/vmm/src/api/openapi/cloud-hypervisor.yaml +++ b/vmm/src/api/openapi/cloud-hypervisor.yaml @@ -277,6 +277,28 @@ paths: 500: description: The new device could not be added to the VM instance. 
+ /vm.add-generic-vhost-user: + put: + summary: Add a new generic vhost-user device to the VM + requestBody: + description: The details of the new generic vhost-user device + content: + application/json: + schema: + $ref: "#/components/schemas/GenericVhostUserConfig" + required: true + responses: + 200: + description: The new device was successfully added to the VM instance. + content: + application/json: + schema: + $ref: "#/components/schemas/PciDeviceInfo" + 204: + description: The new device was successfully (cold) added to the VM instance. + 500: + description: The new device could not be added to the VM instance. + /vm.add-pmem: put: summary: Add a new pmem device to the VM @@ -498,6 +520,10 @@ components: type: string description: Virtual Machine Monitor information + VmState: + type: string + enum: [Created, Running, Shutdown, Paused] + VmInfo: required: - config @@ -507,8 +533,8 @@ components: config: $ref: "#/components/schemas/VmConfig" state: - type: string - enum: [Created, Running, Shutdown, Paused] + $ref: "#/components/schemas/VmState" + memory_actual_size: type: integer format: int64 @@ -603,6 +629,10 @@ components: type: array items: $ref: "#/components/schemas/FsConfig" + generic-vhost-user: + type: array + items: + $ref: "#/components/schemas/GenericVhostUserConfig" pmem: type: array items: @@ -684,6 +714,11 @@ components: packages: type: integer + CoreSchedulingMode: + type: string + enum: ["Vm", "Vcpu", "Off"] + default: "Vm" + CpusConfig: required: - boot_vcpus @@ -712,6 +747,9 @@ components: $ref: "#/components/schemas/CpuAffinity" features: $ref: "#/components/schemas/CpuFeatures" + core_scheduling: + $ref: "#/components/schemas/CoreSchedulingMode" + PciSegmentConfig: required: @@ -756,6 +794,12 @@ components: sev_snp: type: boolean default: false + iommufd: + type: boolean + default: false + vfio_p2p_dma: + type: boolean + default: true MemoryZoneConfig: required: @@ -901,6 +945,15 @@ components: items: type: integer + ImageType: + type: 
string + enum: ["FixedVhd", "Qcow2", "Raw", "Vhdx", "Unknown"] + + LockGranularity: + type: string + enum: [ByteRange, Full] + default: ByteRange + DiskConfig: type: object properties: @@ -931,6 +984,9 @@ components: pci_segment: type: integer format: int16 + pci_device_id: + type: integer + format: uint8 id: type: string serial: @@ -948,9 +1004,9 @@ components: type: boolean default: true image_type: - type: string - enum: [FixedVhd, Qcow2, Raw, Vhdx, Unknown] - + $ref: "#/components/schemas/ImageType" + lock_granularity: + $ref: "#/components/schemas/LockGranularity" NetConfig: type: object @@ -993,6 +1049,9 @@ components: pci_segment: type: integer format: int16 + pci_device_id: + type: integer + format: uint8 rate_limiter_config: $ref: "#/components/schemas/RateLimiterConfig" offload_tso: @@ -1054,9 +1113,35 @@ components: pci_segment: type: integer format: int16 + pci_device_id: + type: integer + format: uint8 id: type: string + GenericVhostUserConfig: + required: + - queue_sizes + - socket + - tag + - virtio_id + type: object + properties: + socket: + type: string + queue_size: + type: array + items: + type: uint16 + pci_segment: + type: integer + format: int16 + pci_device_id: + type: integer + format: uint8 + virtio_id: + type: uint32 + PmemConfig: required: - file @@ -1076,9 +1161,16 @@ components: pci_segment: type: integer format: int16 + pci_device_id: + type: integer + format: uint8 id: type: string + ConsoleMode: + type: string + enum: ["Off", "Pty", "Tty", "File", "Socket", "Null"] + ConsoleConfig: required: - mode @@ -1089,8 +1181,7 @@ components: socket: type: string mode: - type: string - enum: ["Off", "Pty", "Tty", "File", "Socket", "Null"] + $ref: "#/components/schemas/ConsoleMode" iommu: type: boolean default: false @@ -1103,8 +1194,7 @@ components: file: type: string mode: - type: string - enum: ["Off", "Pty", "Tty", "File", "Null"] + $ref: "#/components/schemas/ConsoleMode" iobase: type: integer @@ -1121,6 +1211,9 @@ components: pci_segment: 
type: integer format: int16 + pci_device_id: + type: integer + format: uint8 id: type: string x_nv_gpudirect_clique: @@ -1151,6 +1244,9 @@ components: pci_segment: type: integer format: int16 + pci_device_id: + type: integer + format: uint8 id: type: string @@ -1174,6 +1270,9 @@ components: pci_segment: type: integer format: int16 + pci_device_id: + type: integer + format: uint8 id: type: string @@ -1273,6 +1372,11 @@ components: destination_url: type: string + MemoryRestoreMode: + type: string + enum: [Copy, OnDemand] + default: Copy + RestoreConfig: required: - source_url @@ -1282,6 +1386,10 @@ components: type: string prefault: type: boolean + memory_restore_mode: + $ref: "#/components/schemas/MemoryRestoreMode" + resume: + type: boolean ReceiveMigrationData: required: @@ -1291,6 +1399,15 @@ components: receiver_url: type: string + TimeoutStrategy: + type: string + enum: ["Cancel", "Ignore"] + default: "Cancel" + description: > + The strategy to apply when the migration timeout is reached. + Cancel will abort the migration and keep the VM running on the source. + Ignore will proceed with the migration regardless of the downtime requirement. + SendMigrationData: required: - destination_url @@ -1300,6 +1417,34 @@ components: type: string local: type: boolean + downtime_ms: + type: integer + format: int64 + minimum: 1 + default: 300 + description: > + The maximum downtime the migration aims for, in milliseconds. + Defaults to 300ms. + timeout_s: + type: integer + format: int64 + minimum: 1 + default: 3600 + description: > + The timeout for the migration (maximum total duration), in seconds. + Defaults to 3600s (one hour). + timeout_strategy: + $ref: "#/components/schemas/TimeoutStrategy" + connections: + type: integer + format: int64 + default: 1 + minimum: 1 + maximum: 128 + description: > + The number of parallel TCP connections to use for migration. + Must be between 1 and 128. Multiple connections are not supported + with local UNIX-socket migration. 
VmAddUserDevice: required: @@ -1308,6 +1453,12 @@ components: properties: socket: type: string + pci_segment: + type: integer + format: int16 + pci_device_id: + type: integer + format: uint8 LandlockConfig: required: diff --git a/vmm/src/config.rs b/vmm/src/config.rs index a4339c27b8..7efda7c05a 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -9,6 +9,7 @@ use std::fs; use std::path::PathBuf; use std::result; use std::str::FromStr; +use std::sync::LazyLock; use block::ImageType; use clap::ArgMatches; @@ -16,9 +17,11 @@ use log::{debug, warn}; use option_parser::{ ByteSized, IntegerList, OptionParser, OptionParserError, StringList, Toggle, Tuple, }; +use pci::NUM_DEVICE_IDS; use serde::{Deserialize, Serialize}; use thiserror::Error; use virtio_bindings::virtio_blk::VIRTIO_BLK_ID_BYTES; +use virtio_bindings::virtio_ids::*; use virtio_devices::block::MINIMUM_BLOCK_QUEUE_SIZE; use virtio_devices::vhost_user::VIRTIO_FS_TAG_LEN; use virtio_devices::{RateLimiterConfig, TokenBucketConfig}; @@ -46,6 +49,34 @@ pub enum Error { /// Filesystem socket is missing #[error("Error parsing --fs: socket missing")] ParseFsSockMissing, + /// Generic vhost-user virtio ID is invalid + #[error( + "Error parsing --generic-vhost-user: virtio ID {0:?} invalid (leading zeros or unknown string)" + )] + ParseGenericVhostUserVirtioIdInvalid(String), + /// Generic vhost-user virtio ID is unsupported + #[error( + "Error parsing --generic-vhost-user: device with virtio ID {0:?} cannot be implemented via vhost-user" + )] + ParseGenericVhostUserVirtioIdUnsupported(String), + /// Generic vhost-user socket is missing + #[error("Error parsing --generic-vhost-user: socket missing")] + ParseGenericVhostUserSockMissing, + /// Generic vhost-user number of queues is missing + #[error("Error parsing --generic-vhost-user: number of queues missing")] + ParseGenericVhostUserNumResponseQueuesMissing, + /// Generic vhost-user virtio ID is missing + #[error("Error parsing --generic-vhost-user: virtio ID 
missing")] + ParseGenericVhostUserVirtioIdMissing, + /// Generic vhost-user available features is missing + #[error("Error parsing --generic-vhost-user: available features missing")] + ParseGenericVhostUserAvailFeaturesMissing, + /// Generic vhost-user queue size is too large + #[error("Error parsing --generic-vhost-user: queue size {0} is {1}, but limit is 65535")] + ParseGenericVhostUserQueueSizeTooLarge(usize, u64), + /// Generic vhost-user queue size missing + #[error("Error parsing --generic-vhost-user: queue size missing")] + ParseGenericVhostUserQueueSizeMissing, /// Missing persistent memory file parameter. #[error("Error parsing --pmem: file missing")] ParsePmemFileMissing, @@ -94,6 +125,9 @@ pub enum Error { /// Error parsing persistent memory parameters #[error("Error parsing --pmem")] ParsePersistentMemory(#[source] OptionParserError), + /// Error parsing generic vhost-user parameters + #[error("Error parsing --generic-vhost-user")] + ParseGenericVhostUser(#[source] OptionParserError), /// Failed parsing console #[error("Error parsing --console")] ParseConsole(#[source] OptionParserError), @@ -176,6 +210,8 @@ pub enum Error { /// Failed Parsing FwCfgItem config #[error("Error parsing --fw-cfg-config items")] ParseFwCfgItem(#[source] OptionParserError), + #[error("Error parsing common PCI device config")] + ParsePciDeviceCommonConfig(#[source] OptionParserError), } #[derive(Debug, PartialEq, Eq, Error)] @@ -282,11 +318,9 @@ pub enum ValidationError { /// On a IOMMU segment but not behind IOMMU #[error("Device is on an IOMMU PCI segment ({0}) but not placed behind IOMMU")] OnIommuSegment(u16), - // On a IOMMU segment but IOMMU not supported - #[error( - "Device is on an IOMMU PCI segment ({0}) but does not support being placed behind IOMMU" - )] - IommuNotSupportedOnSegment(u16), + /// GPUDirect clique requires P2P DMA + #[error("Device with x_nv_gpudirect_clique requires vfio_p2p_dma=on")] + GpuDirectCliqueRequiresP2pDma, // Identifier is not unique 
#[error("Identifier {0} is not unique")] IdentifierNotUnique(String), @@ -318,12 +352,18 @@ pub enum ValidationError { #[cfg(feature = "sev_snp")] #[error("Invalid host data format")] InvalidHostData, + #[cfg(all(feature = "sev_snp", feature = "igvm"))] + #[error("SEV-SNP requires an IGVM payload (--payload igvm=)")] + SevSnpRequiresIgvm, /// Restore expects all net ids that have fds #[error("Net id {0} is associated with FDs and is required")] RestoreMissingRequiredNetId(String), /// Number of FDs passed during Restore are incorrect to the NetConfig #[error("Number of Net FDs passed for '{0}' during Restore: {1}. Expected: {2}")] RestoreNetFdCountMismatch(String, usize, usize), + /// Prefault cannot be combined with on-demand restore + #[error("'prefault' cannot be combined with 'memory_restore_mode=ondemand'")] + InvalidRestorePrefaultWithOnDemand, /// Path provided in landlock-rules doesn't exist #[error("Path {0:?} provided in landlock-rules does not exist")] LandlockPathDoesNotExist(PathBuf), @@ -333,18 +373,6 @@ pub enum ValidationError { /// Invalid block device serial length #[error("Block device serial length ({0}) exceeds maximum allowed length ({1})")] InvalidSerialLength(usize, usize), - #[cfg(feature = "fw_cfg")] - /// FwCfg missing kernel - #[error("Error --fw-cfg-config: missing --kernel")] - FwCfgMissingKernel, - #[cfg(feature = "fw_cfg")] - /// FwCfg missing cmdline - #[error("Error --fw-cfg-config: missing --cmdline")] - FwCfgMissingCmdline, - #[cfg(feature = "fw_cfg")] - /// FwCfg missing initramfs - #[error("Error --fw-cfg-config: missing --initramfs")] - FwCfgMissingInitramfs, #[cfg(feature = "ivshmem")] /// Invalid Ivshmem input size #[error("Invalid ivshmem input size")] @@ -366,6 +394,13 @@ pub enum ValidationError { /// Invalid NUMA Configuration #[error("NUMA Configuration is invalid")] InvalidNumaConfig(String), + /// The supplied PCI ID was greater then the max. 
supported number + /// of devices per Bus + #[error("Given PCI device ID ({0}) is out of the supported range of 0..{NUM_DEVICE_IDS}")] + InvalidPciDeviceId(u8), + /// The supplied PCI ID is reserved + #[error("Given PCI device ID ({0}) is reserved")] + ReservedPciDeviceId(u8), } type ValidationResult = std::result::Result; @@ -378,6 +413,21 @@ pub fn add_to_config(items: &mut Option>, item: T) { } } +/// Check that the PCI device supplied is neither out of range nor does +/// it use any reserved device ID. +fn validate_pci_device_id(device_id: u8) -> ValidationResult<()> { + if device_id >= pci::NUM_DEVICE_IDS { + // Check the given ID is not out of range + return Err(ValidationError::InvalidPciDeviceId(device_id)); + } else if device_id == pci::PCI_ROOT_DEVICE_ID { + // Check the ID isn't any reserved one. Currently, only the device ID + // for the root device is reserved. + return Err(ValidationError::ReservedPciDeviceId(device_id)); + } + + Ok(()) +} + pub type Result = result::Result; pub struct VmParams<'a> { @@ -394,6 +444,7 @@ pub struct VmParams<'a> { pub rng: &'a str, pub balloon: Option<&'a str>, pub fs: Option>, + pub generic_vhost_user: Option>, pub pmem: Option>, pub serial: &'a str, pub console: &'a str, @@ -455,6 +506,9 @@ impl<'a> VmParams<'a> { let fs: Option> = args .get_many::("fs") .map(|x| x.map(|y| y as &str).collect()); + let generic_vhost_user: Option> = args + .get_many::("generic-vhost-user") + .map(|x| x.map(|y| y as &str).collect()); let pmem: Option> = args .get_many::("pmem") .map(|x| x.map(|y| y as &str).collect()); @@ -509,6 +563,7 @@ impl<'a> VmParams<'a> { rng, balloon, fs, + generic_vhost_user, pmem, serial, console, @@ -559,6 +614,23 @@ impl FromStr for HotplugMethod { } } +pub enum ParseCoreSchedulingError { + InvalidValue(String), +} + +impl FromStr for CoreScheduling { + type Err = ParseCoreSchedulingError; + + fn from_str(s: &str) -> std::result::Result { + match s.to_lowercase().as_str() { + "vm" => Ok(CoreScheduling::Vm), + 
"vcpu" => Ok(CoreScheduling::Vcpu), + "off" => Ok(CoreScheduling::Off), + _ => Err(ParseCoreSchedulingError::InvalidValue(s.to_owned())), + } + } +} + pub enum CpuTopologyParseError { InvalidValue(String), } @@ -603,7 +675,8 @@ impl CpusConfig { .add("max_phys_bits") .add("affinity") .add("features") - .add("nested"); + .add("nested") + .add("core_scheduling"); parser.parse(cpus).map_err(Error::ParseCpus)?; let boot_vcpus: u32 = parser @@ -662,14 +735,11 @@ impl CpusConfig { .map_err(Error::ParseCpus)? .is_none_or(|toggle| toggle.0); - // Nested virtualization is always turned on for aarch64 and riscv64 - // TODO: revisit this when nested support can be turned of on these architectures - #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] - if !nested { - return Err(Error::ParseCpus(OptionParserError::InvalidValue( - "nested=off is not supported on aarch64 and riscv64 architectures".to_string(), - ))); - } + let core_scheduling = parser + .convert("core_scheduling") + .map_err(Error::ParseCpus)? 
+ .unwrap_or(CoreScheduling::Vm); + Ok(CpusConfig { boot_vcpus, max_vcpus, @@ -679,6 +749,7 @@ impl CpusConfig { affinity, features, nested, + core_scheduling, }) } } @@ -742,6 +813,31 @@ impl PciSegmentConfig { } impl PlatformConfig { + pub fn syntax() -> &'static str { + static SYNTAX: LazyLock = LazyLock::new(|| { + let mut syntax = "Platform configuration parameters \ + \"num_pci_segments=,iommu_segments=,\ + iommu_address_width=,serial_number=,\ + uuid=,oem_strings=,iommufd=on|off,\ + vfio_p2p_dma=on|off" + .to_string(); + + if cfg!(feature = "tdx") { + syntax.push_str(",tdx=on|off"); + } + + if cfg!(feature = "sev_snp") { + syntax.push_str(",sev_snp=on|off"); + } + + syntax.push('"'); + + syntax + }); + + &SYNTAX + } + pub fn parse(platform: &str) -> Result { let mut parser = OptionParser::new(); parser @@ -750,7 +846,9 @@ impl PlatformConfig { .add("iommu_address_width") .add("serial_number") .add("uuid") - .add("oem_strings"); + .add("oem_strings") + .add("iommufd") + .add("vfio_p2p_dma"); #[cfg(feature = "tdx")] parser.add("tdx"); #[cfg(feature = "sev_snp")] @@ -777,6 +875,16 @@ impl PlatformConfig { .convert::("oem_strings") .map_err(Error::ParsePlatform)? .map(|v| v.0); + let iommufd = parser + .convert::("iommufd") + .map_err(Error::ParsePlatform)? + .unwrap_or(Toggle(false)) + .0; + let vfio_p2p_dma = parser + .convert::("vfio_p2p_dma") + .map_err(Error::ParsePlatform)? 
+ .unwrap_or(Toggle(true)) + .0; #[cfg(feature = "tdx")] let tdx = parser .convert::("tdx") @@ -796,6 +904,8 @@ impl PlatformConfig { serial_number, uuid, oem_strings, + iommufd, + vfio_p2p_dma, #[cfg(feature = "tdx")] tdx, #[cfg(feature = "sev_snp")] @@ -907,7 +1017,8 @@ impl MemoryConfig { .add("host_numa_node") .add("hotplug_size") .add("hotplugged_size") - .add("prefault"); + .add("prefault") + .add("mergeable"); parser.parse(memory_zone).map_err(Error::ParseMemoryZone)?; let id = parser.get("id").ok_or(Error::ParseMemoryZoneIdMissing)?; @@ -948,6 +1059,11 @@ impl MemoryConfig { .map_err(Error::ParseMemoryZone)? .unwrap_or(Toggle(false)) .0; + let mergeable = parser + .convert::("mergeable") + .map_err(Error::ParseMemoryZone)? + .unwrap_or(Toggle(mergeable)) + .0; zones.push(MemoryZoneConfig { id, @@ -960,6 +1076,7 @@ impl MemoryConfig { hotplug_size, hotplugged_size, prefault, + mergeable, }); } Some(zones) @@ -983,21 +1100,24 @@ impl MemoryConfig { } pub fn total_size(&self) -> u64 { - let mut size = self.size; - if let Some(hotplugged_size) = self.hotplugged_size { - size += hotplugged_size; - } - - if let Some(zones) = &self.zones { - for zone in zones.iter() { - size += zone.size; - if let Some(hotplugged_size) = zone.hotplugged_size { - size += hotplugged_size; - } - } - } + self.size + + self + .zones + .iter() + .flatten() + .map(|zone| zone.size) + .sum::() + + self.hotplugged_size() + } - size + pub fn hotplugged_size(&self) -> u64 { + self.hotplugged_size.unwrap_or(0) + + self + .zones + .iter() + .flatten() + .filter_map(|zone| zone.hotplugged_size) + .sum::() } } @@ -1088,6 +1208,63 @@ impl RateLimiterGroupConfig { } } +impl PciDeviceCommonConfig { + const OPTIONS: &[&str] = &["id", "pci_segment", "pci_device_id"]; + const OPTIONS_IOMMU: &[&str] = &["id", "iommu", "pci_segment", "pci_device_id"]; + + pub fn parse(input: &str) -> Result { + let mut parser = OptionParser::new(); + + parser.add_all(Self::OPTIONS_IOMMU); + + parser + 
.parse_subset(input) + .map_err(Error::ParsePciDeviceCommonConfig)?; + + let id = parser.get("id"); + let iommu = parser + .convert::("iommu") + .map_err(Error::ParsePciDeviceCommonConfig)? + .unwrap_or(Toggle(false)) + .0; + let pci_segment = parser + .convert("pci_segment") + .map_err(Error::ParsePciDeviceCommonConfig)? + .unwrap_or_default(); + let pci_device_id = parser + .convert::("pci_device_id") + .map_err(Error::ParsePciDeviceCommonConfig)?; + + Ok(Self { + id, + iommu, + pci_segment, + pci_device_id, + }) + } + + pub fn validate(&self, vm_config: &VmConfig) -> ValidationResult<()> { + if let Some(platform_config) = vm_config.platform.as_ref() { + if self.pci_segment >= platform_config.num_pci_segments { + return Err(ValidationError::InvalidPciSegment(self.pci_segment)); + } + + if let Some(iommu_segments) = platform_config.iommu_segments.as_ref() + && iommu_segments.contains(&self.pci_segment) + && !self.iommu + { + return Err(ValidationError::OnIommuSegment(self.pci_segment)); + } + } + + if let Some(device_id) = self.pci_device_id { + validate_pci_device_id(device_id)?; + } + + Ok(()) + } +} + impl DiskConfig { pub const SYNTAX: &'static str = "Disk parameters \ \"path=,readonly=on|off,direct=on|off,iommu=on|off,\ @@ -1095,10 +1272,11 @@ impl DiskConfig { vhost_user=on|off,socket=,\ bw_size=,bw_one_time_burst=,bw_refill_time=,\ ops_size=,ops_one_time_burst=,ops_refill_time=,\ - id=,pci_segment=,rate_limit_group=,\ + id=,pci_segment=,pci_device_id=,\ + rate_limit_group=,\ queue_affinity=,\ serial=,backing_files=on|off,sparse=on|off,\ - image_type="; + image_type=,lock_granularity=byte-range|full"; pub fn parse(disk: &str) -> Result { let mut parser = OptionParser::new(); @@ -1106,7 +1284,6 @@ impl DiskConfig { .add("path") .add("readonly") .add("direct") - .add("iommu") .add("queue_size") .add("num_queues") .add("vhost_user") @@ -1117,16 +1294,16 @@ impl DiskConfig { .add("ops_size") .add("ops_one_time_burst") .add("ops_refill_time") - .add("id") 
.add("_disable_io_uring") .add("_disable_aio") - .add("pci_segment") .add("serial") .add("rate_limit_group") .add("queue_affinity") .add("backing_files") .add("sparse") - .add("image_type"); + .add("image_type") + .add("lock_granularity") + .add_all(PciDeviceCommonConfig::OPTIONS_IOMMU); parser.parse(disk).map_err(Error::ParseDisk)?; @@ -1141,11 +1318,6 @@ impl DiskConfig { .map_err(Error::ParseDisk)? .unwrap_or(Toggle(false)) .0; - let iommu = parser - .convert::("iommu") - .map_err(Error::ParseDisk)? - .unwrap_or(Toggle(false)) - .0; let queue_size = parser .convert("queue_size") .map_err(Error::ParseDisk)? @@ -1160,7 +1332,6 @@ impl DiskConfig { .unwrap_or(Toggle(false)) .0; let vhost_socket = parser.get("socket"); - let id = parser.get("id"); let disable_io_uring = parser .convert::("_disable_io_uring") .map_err(Error::ParseDisk)? @@ -1171,10 +1342,6 @@ impl DiskConfig { .map_err(Error::ParseDisk)? .unwrap_or(Toggle(false)) .0; - let pci_segment = parser - .convert("pci_segment") - .map_err(Error::ParseDisk)? - .unwrap_or_default(); let rate_limit_group = parser.get("rate_limit_group"); let bw_size = parser .convert("bw_size") @@ -1228,6 +1395,11 @@ impl DiskConfig { ImageType::Unknown }; + let lock_granularity = parser + .convert::("lock_granularity") + .map_err(Error::ParseDisk)? 
+ .unwrap_or_default(); + let bw_tb_config = if bw_size != 0 && bw_refill_time != 0 { Some(TokenBucketConfig { size: bw_size, @@ -1260,30 +1432,33 @@ impl DiskConfig { .unwrap_or_else(|| Toggle(default_diskconfig_sparse())) .0; + let pci_common = PciDeviceCommonConfig::parse(disk)?; + Ok(DiskConfig { + pci_common, path, readonly, direct, - iommu, num_queues, queue_size, vhost_user, vhost_socket, rate_limit_group, rate_limiter_config, - id, disable_io_uring, disable_aio, - pci_segment, serial, queue_affinity, backing_files, sparse, image_type, + lock_granularity, }) } pub fn validate(&self, vm_config: &VmConfig) -> ValidationResult<()> { + self.pci_common.validate(vm_config)?; + if self.num_queues > vm_config.cpus.boot_vcpus as usize { return Err(ValidationError::TooManyQueues( self.num_queues, @@ -1295,23 +1470,10 @@ impl DiskConfig { return Err(ValidationError::InvalidQueueSize(self.queue_size)); } - if self.vhost_user && self.iommu { + if self.vhost_user && self.pci_common.iommu { return Err(ValidationError::IommuNotSupported); } - if let Some(platform_config) = vm_config.platform.as_ref() { - if self.pci_segment >= platform_config.num_pci_segments { - return Err(ValidationError::InvalidPciSegment(self.pci_segment)); - } - - if let Some(iommu_segments) = platform_config.iommu_segments.as_ref() - && iommu_segments.contains(&self.pci_segment) - && !self.iommu - { - return Err(ValidationError::OnIommuSegment(self.pci_segment)); - } - } - if self.rate_limiter_config.is_some() && self.rate_limit_group.is_some() { return Err(ValidationError::InvalidRateLimiterGroup); } @@ -1353,7 +1515,8 @@ impl NetConfig { num_queues=,queue_size=,id=,\ vhost_user=,socket=,vhost_mode=client|server,\ bw_size=,bw_one_time_burst=,bw_refill_time=,\ - ops_size=,ops_one_time_burst=,ops_refill_time=,pci_segment=\ + ops_size=,ops_one_time_burst=,ops_refill_time=,\ + pci_segment=,pci_device_id=,\ offload_tso=on|off,offload_ufo=on|off,offload_csum=on|off\""; pub fn parse(net: &str) -> Result { 
@@ -1369,13 +1532,11 @@ impl NetConfig { .add("offload_ufo") .add("offload_csum") .add("mtu") - .add("iommu") .add("queue_size") .add("num_queues") .add("vhost_user") .add("socket") .add("vhost_mode") - .add("id") .add("fd") .add("bw_size") .add("bw_one_time_burst") @@ -1383,7 +1544,7 @@ impl NetConfig { .add("ops_size") .add("ops_one_time_burst") .add("ops_refill_time") - .add("pci_segment"); + .add_all(PciDeviceCommonConfig::OPTIONS_IOMMU); parser.parse(net).map_err(Error::ParseNetwork)?; let tap = parser.get("tap"); @@ -1411,11 +1572,6 @@ impl NetConfig { .unwrap_or(Toggle(true)) .0; let mtu = parser.convert("mtu").map_err(Error::ParseNetwork)?; - let iommu = parser - .convert::("iommu") - .map_err(Error::ParseNetwork)? - .unwrap_or(Toggle(false)) - .0; let queue_size = parser .convert("queue_size") .map_err(Error::ParseNetwork)? @@ -1434,15 +1590,10 @@ impl NetConfig { .convert("vhost_mode") .map_err(Error::ParseNetwork)? .unwrap_or_default(); - let id = parser.get("id"); let fds = parser .convert::("fd") .map_err(Error::ParseNetwork)? .map(|v| v.0.iter().map(|e| *e as i32).collect()); - let pci_segment = parser - .convert("pci_segment") - .map_err(Error::ParseNetwork)? - .unwrap_or_default(); let bw_size = parser .convert("bw_size") .map_err(Error::ParseNetwork)? 
@@ -1494,23 +1645,23 @@ impl NetConfig { None }; + let pci_common = PciDeviceCommonConfig::parse(net)?; + let config = NetConfig { + pci_common, tap, ip, mask, mac, host_mac, mtu, - iommu, num_queues, queue_size, vhost_user, vhost_socket, vhost_mode, - id, fds, rate_limiter_config, - pci_segment, offload_tso, offload_ufo, offload_csum, @@ -1519,6 +1670,8 @@ impl NetConfig { } pub fn validate(&self, vm_config: &VmConfig) -> ValidationResult<()> { + self.pci_common.validate(vm_config)?; + if self.num_queues < 2 { return Err(ValidationError::VnetQueueLowerThan2(self.num_queues)); } @@ -1546,23 +1699,10 @@ impl NetConfig { )); } - if self.vhost_user && self.iommu { + if self.vhost_user && self.pci_common.iommu { return Err(ValidationError::IommuNotSupported); } - if let Some(platform_config) = vm_config.platform.as_ref() { - if self.pci_segment >= platform_config.num_pci_segments { - return Err(ValidationError::InvalidPciSegment(self.pci_segment)); - } - - if let Some(iommu_segments) = platform_config.iommu_segments.as_ref() - && iommu_segments.contains(&self.pci_segment) - && !self.iommu - { - return Err(ValidationError::OnIommuSegment(self.pci_segment)); - } - } - if let Some(mtu) = self.mtu && mtu < virtio_devices::net::MIN_MTU { @@ -1642,10 +1782,137 @@ impl BalloonConfig { } } +impl GenericVhostUserConfig { + pub const SYNTAX: &'static str = "generic vhost-user parameters \ + \"virtio_id=,\ + socket=,\ + queue_sizes=,\ + id=,pci_segment=,pci_device_id=\""; + + pub fn parse(vhost_user: &str) -> Result { + let mut parser = OptionParser::new(); + parser + .add("virtio_id") + .add("queue_sizes") + .add("socket") + .add_all(PciDeviceCommonConfig::OPTIONS); + parser + .parse(vhost_user) + .map_err(Error::ParseGenericVhostUser)?; + + let socket = parser + .get("socket") + .ok_or(Error::ParseGenericVhostUserSockMissing)?; + + let IntegerList(queue_sizes) = parser + .convert("queue_sizes") + .map_err(Error::ParseGenericVhostUser)? 
+ .ok_or(Error::ParseGenericVhostUserQueueSizeMissing)?; + let device_type_str = parser + .convert::("virtio_id") + .map_err(Error::ParseGenericVhostUser)? + .ok_or(Error::ParseGenericVhostUserVirtioIdMissing)?; + let device_type = match device_type_str.as_bytes() { + b"net" => VIRTIO_ID_NET, + b"block" => VIRTIO_ID_BLOCK, + b"console" => VIRTIO_ID_CONSOLE, + b"rng" => VIRTIO_ID_RNG, + b"balloon" => VIRTIO_ID_BALLOON, + b"iomem" => VIRTIO_ID_IOMEM, + b"rpmsg" => VIRTIO_ID_RPMSG, + b"scsi" => VIRTIO_ID_SCSI, + b"9p" => VIRTIO_ID_9P, + b"mac80211_wlan" => VIRTIO_ID_MAC80211_WLAN, + b"rproc_serial" => VIRTIO_ID_RPROC_SERIAL, + b"caif" => VIRTIO_ID_CAIF, + b"memory_balloon" => VIRTIO_ID_MEMORY_BALLOON, + b"gpu" => VIRTIO_ID_GPU, + b"clock" => VIRTIO_ID_CLOCK, + b"input" => VIRTIO_ID_INPUT, + b"vsock" => VIRTIO_ID_VSOCK, + b"crypto" => VIRTIO_ID_CRYPTO, + b"signal_dist" => VIRTIO_ID_SIGNAL_DIST, + b"pstore" => VIRTIO_ID_PSTORE, + b"iommu" => VIRTIO_ID_IOMMU, + b"mem" => VIRTIO_ID_MEM, + b"sound" => VIRTIO_ID_SOUND, + b"fs" => VIRTIO_ID_FS, + b"pmem" => VIRTIO_ID_PMEM, + b"rpmb" => VIRTIO_ID_RPMB, + b"mac80211_hwsim" => VIRTIO_ID_MAC80211_HWSIM, + b"video_encoder" => VIRTIO_ID_VIDEO_ENCODER, + b"video_decoder" => VIRTIO_ID_VIDEO_DECODER, + b"scmi" => VIRTIO_ID_SCMI, + b"nitro_sec_mod" => VIRTIO_ID_NITRO_SEC_MOD, + b"i2c" => VIRTIO_ID_I2C_ADAPTER, + b"watchdog" => VIRTIO_ID_WATCHDOG, + b"can" => VIRTIO_ID_CAN, + b"dmabuf" => VIRTIO_ID_DMABUF, + b"param_serv" => VIRTIO_ID_PARAM_SERV, + b"audio_policy" => VIRTIO_ID_AUDIO_POLICY, + b"bt" => VIRTIO_ID_BT, + b"gpio" => VIRTIO_ID_GPIO, + b"rdma" => 42, + b"camera" => 43, + b"ism" => 44, + b"spi" => 45, + b"tee" => 46, + b"cpu_balloon" => 47, + b"media" => 48, + b"usb" => 49, + [b'1'..=b'9', ..] 
=> match device_type_str.parse() { + Ok(id) => id, + Err(_) => return Err(Error::ParseGenericVhostUserVirtioIdInvalid(device_type_str)), + }, + _ => return Err(Error::ParseGenericVhostUserVirtioIdInvalid(device_type_str)), + }; + match device_type { + // vhost-user devices of these types definitely cannot work. + // Cloud Hypervisor needs to know if an IOMMU exists so that it + // can perform address translation, and a vhost-user device has + // no supported way to reset the guest. + VIRTIO_ID_WATCHDOG | VIRTIO_ID_IOMMU => { + return Err(Error::ParseGenericVhostUserVirtioIdUnsupported( + device_type_str, + )); + } + _ => {} + } + let pci_common = PciDeviceCommonConfig::parse(vhost_user)?; + let mut converted_queue_sizes: Vec = Vec::new(); + for (offset, &queue_size) in queue_sizes.iter().enumerate() { + match queue_size.try_into() { + Err(_) => { + return Err(Error::ParseGenericVhostUserQueueSizeTooLarge( + offset, queue_size, + )); + } + Ok(queue_size) => converted_queue_sizes.push(queue_size), + } + } + + Ok(GenericVhostUserConfig { + pci_common, + socket: socket.into(), + device_type, + queue_sizes: converted_queue_sizes, + }) + } + + pub fn validate(&self, vm_config: &VmConfig) -> ValidationResult<()> { + if self.pci_common.iommu { + return Err(ValidationError::IommuNotSupported); + } + + self.pci_common.validate(vm_config) + } +} + impl FsConfig { pub const SYNTAX: &'static str = "virtio-fs parameters \ \"tag=,socket=,num_queues=,\ - queue_size=,id=,pci_segment=\""; + queue_size=,id=,\ + pci_segment=,pci_device_id=\""; pub fn parse(fs: &str) -> Result { let mut parser = OptionParser::new(); @@ -1654,8 +1921,7 @@ impl FsConfig { .add("queue_size") .add("num_queues") .add("socket") - .add("id") - .add("pci_segment"); + .add_all(PciDeviceCommonConfig::OPTIONS); parser.parse(fs).map_err(Error::ParseFileSystem)?; let tag = parser.get("tag").ok_or(Error::ParseFsTagMissing)?; @@ -1673,20 +1939,14 @@ impl FsConfig { .map_err(Error::ParseFileSystem)? 
.unwrap_or_else(default_fsconfig_num_queues); - let id = parser.get("id"); - - let pci_segment = parser - .convert("pci_segment") - .map_err(Error::ParseFileSystem)? - .unwrap_or_default(); + let pci_common = PciDeviceCommonConfig::parse(fs)?; Ok(FsConfig { + pci_common, tag, socket, num_queues, queue_size, - id, - pci_segment, }) } @@ -1698,21 +1958,11 @@ impl FsConfig { )); } - if let Some(platform_config) = vm_config.platform.as_ref() { - if self.pci_segment >= platform_config.num_pci_segments { - return Err(ValidationError::InvalidPciSegment(self.pci_segment)); - } - - if let Some(iommu_segments) = platform_config.iommu_segments.as_ref() - && iommu_segments.contains(&self.pci_segment) - { - return Err(ValidationError::IommuNotSupportedOnSegment( - self.pci_segment, - )); - } + if self.pci_common.iommu { + return Err(ValidationError::IommuNotSupported); } - Ok(()) + self.pci_common.validate(vm_config) } } @@ -1720,7 +1970,7 @@ impl FsConfig { impl FwCfgConfig { pub const SYNTAX: &'static str = "Boot params to pass to FW CFG device \ \"e820=on|off,kernel=on|off,cmdline=on|off,initramfs=on|off,acpi_table=on|off, \ - items=[name0=,file0=:name1=,file1=]\""; + items=[name=,file=:name=,string=]\""; pub fn parse(fw_cfg_config: &str) -> Result { let mut parser = OptionParser::new(); parser @@ -1776,15 +2026,23 @@ impl FwCfgConfig { items, }) } - pub fn validate(&self, vm_config: &VmConfig) -> ValidationResult<()> { - let payload = vm_config.payload.as_ref().unwrap(); + pub fn validate(&self, payload: &PayloadConfig) -> std::result::Result<(), PayloadConfigError> { if self.kernel && payload.kernel.is_none() { - return Err(ValidationError::FwCfgMissingKernel); + return Err(PayloadConfigError::FwCfgMissingKernel); } else if self.cmdline && payload.cmdline.is_none() { - return Err(ValidationError::FwCfgMissingCmdline); + return Err(PayloadConfigError::FwCfgMissingCmdline); } else if self.initramfs && payload.initramfs.is_none() { - return 
Err(ValidationError::FwCfgMissingInitramfs); + return Err(PayloadConfigError::FwCfgMissingInitramfs); + } + + if let Some(items) = &self.items { + for item in &items.item_list { + if item.file.is_some() == item.string.is_some() { + return Err(PayloadConfigError::FwCfgInvalidItem(item.name.clone())); + } + } } + Ok(()) } } @@ -1793,7 +2051,7 @@ impl FwCfgConfig { impl FwCfgItem { pub fn parse(fw_cfg: &str) -> Result { let mut parser = OptionParser::new(); - parser.add("name").add("file"); + parser.add("name").add("file").add("string"); parser.parse(fw_cfg).map_err(Error::ParseFwCfgItem)?; let name = @@ -1802,78 +2060,49 @@ impl FwCfgItem { .ok_or(Error::ParseFwCfgItem(OptionParserError::InvalidValue( "missing FwCfgItem name".to_string(), )))?; - let file = parser - .get("file") - .map(PathBuf::from) - .ok_or(Error::ParseFwCfgItem(OptionParserError::InvalidValue( - "missing FwCfgItem file path".to_string(), - )))?; - Ok(FwCfgItem { name, file }) + let file = parser.get("file").map(PathBuf::from); + let string = parser.get("string"); + Ok(FwCfgItem { name, file, string }) } } impl PmemConfig { pub const SYNTAX: &'static str = "Persistent memory parameters \ \"file=,size=,iommu=on|off,\ - discard_writes=on|off,id=,pci_segment=\""; + discard_writes=on|off,id=,\ + pci_segment=,pci_device_id=\""; pub fn parse(pmem: &str) -> Result { let mut parser = OptionParser::new(); parser .add("size") .add("file") - .add("iommu") .add("discard_writes") - .add("id") - .add("pci_segment"); + .add_all(PciDeviceCommonConfig::OPTIONS_IOMMU); parser.parse(pmem).map_err(Error::ParsePersistentMemory)?; + let pci_common = PciDeviceCommonConfig::parse(pmem)?; let file = PathBuf::from(parser.get("file").ok_or(Error::ParsePmemFileMissing)?); let size = parser .convert::("size") .map_err(Error::ParsePersistentMemory)? .map(|v| v.0); - let iommu = parser - .convert::("iommu") - .map_err(Error::ParsePersistentMemory)? 
- .unwrap_or(Toggle(false)) - .0; let discard_writes = parser .convert::("discard_writes") .map_err(Error::ParsePersistentMemory)? .unwrap_or(Toggle(false)) .0; - let id = parser.get("id"); - let pci_segment = parser - .convert("pci_segment") - .map_err(Error::ParsePersistentMemory)? - .unwrap_or_default(); Ok(PmemConfig { + pci_common, file, size, - iommu, discard_writes, - id, - pci_segment, }) } pub fn validate(&self, vm_config: &VmConfig) -> ValidationResult<()> { - if let Some(platform_config) = vm_config.platform.as_ref() { - if self.pci_segment >= platform_config.num_pci_segments { - return Err(ValidationError::InvalidPciSegment(self.pci_segment)); - } - - if let Some(iommu_segments) = platform_config.iommu_segments.as_ref() - && iommu_segments.contains(&self.pci_segment) - && !self.iommu - { - return Err(ValidationError::OnIommuSegment(self.pci_segment)); - } - } - - Ok(()) + self.pci_common.validate(vm_config) } } @@ -1985,55 +2214,40 @@ impl DebugConsoleConfig { } impl DeviceConfig { - pub const SYNTAX: &'static str = "Direct device assignment parameters \"path=,iommu=on|off,id=,pci_segment=\""; + pub const SYNTAX: &'static str = "Direct device assignment parameters \ + \"path=,iommu=on|off,id=,\ + pci_segment=,pci_device_id=\""; pub fn parse(device: &str) -> Result { let mut parser = OptionParser::new(); parser .add("path") - .add("id") - .add("iommu") - .add("pci_segment") + .add_all(PciDeviceCommonConfig::OPTIONS_IOMMU) .add("x_nv_gpudirect_clique"); parser.parse(device).map_err(Error::ParseDevice)?; + let pci_common = PciDeviceCommonConfig::parse(device)?; let path = parser .get("path") .map(PathBuf::from) .ok_or(Error::ParseDevicePathMissing)?; - let iommu = parser - .convert::("iommu") - .map_err(Error::ParseDevice)? - .unwrap_or(Toggle(false)) - .0; - let id = parser.get("id"); - let pci_segment = parser - .convert::("pci_segment") - .map_err(Error::ParseDevice)? 
- .unwrap_or_default(); let x_nv_gpudirect_clique = parser .convert::("x_nv_gpudirect_clique") .map_err(Error::ParseDevice)?; Ok(DeviceConfig { + pci_common, path, - iommu, - id, - pci_segment, x_nv_gpudirect_clique, }) } pub fn validate(&self, vm_config: &VmConfig) -> ValidationResult<()> { - if let Some(platform_config) = vm_config.platform.as_ref() { - if self.pci_segment >= platform_config.num_pci_segments { - return Err(ValidationError::InvalidPciSegment(self.pci_segment)); - } + self.pci_common.validate(vm_config)?; - if let Some(iommu_segments) = platform_config.iommu_segments.as_ref() - && iommu_segments.contains(&self.pci_segment) - && !self.iommu - { - return Err(ValidationError::OnIommuSegment(self.pci_segment)); + if self.x_nv_gpudirect_clique.is_some() { + let vfio_p2p_dma = vm_config.platform.as_ref().is_none_or(|p| p.vfio_p2p_dma); + if !vfio_p2p_dma { + return Err(ValidationError::GpuDirectCliqueRequiresP2pDma); } } @@ -2042,65 +2256,46 @@ impl DeviceConfig { } impl UserDeviceConfig { - pub const SYNTAX: &'static str = - "Userspace device socket=,id=,pci_segment=\""; + pub const SYNTAX: &'static str = "Userspace device socket=,id=,\ + pci_segment=,pci_device_id=\""; pub fn parse(user_device: &str) -> Result { let mut parser = OptionParser::new(); - parser.add("socket").add("id").add("pci_segment"); + parser.add("socket").add_all(PciDeviceCommonConfig::OPTIONS); parser.parse(user_device).map_err(Error::ParseUserDevice)?; + let pci_common = PciDeviceCommonConfig::parse(user_device)?; let socket = parser .get("socket") .map(PathBuf::from) .ok_or(Error::ParseUserDeviceSocketMissing)?; - let id = parser.get("id"); - let pci_segment = parser - .convert::("pci_segment") - .map_err(Error::ParseUserDevice)? 
- .unwrap_or_default(); - Ok(UserDeviceConfig { - socket, - id, - pci_segment, - }) + Ok(UserDeviceConfig { pci_common, socket }) } pub fn validate(&self, vm_config: &VmConfig) -> ValidationResult<()> { - if let Some(platform_config) = vm_config.platform.as_ref() { - if self.pci_segment >= platform_config.num_pci_segments { - return Err(ValidationError::InvalidPciSegment(self.pci_segment)); - } - - if let Some(iommu_segments) = platform_config.iommu_segments.as_ref() - && iommu_segments.contains(&self.pci_segment) - { - return Err(ValidationError::IommuNotSupportedOnSegment( - self.pci_segment, - )); - } + if self.pci_common.iommu { + return Err(ValidationError::IommuNotSupported); } - Ok(()) + self.pci_common.validate(vm_config) } } impl VdpaConfig { pub const SYNTAX: &'static str = "vDPA device \ \"path=,num_queues=,iommu=on|off,\ - id=,pci_segment=\""; + id=,pci_segment=,pci_device_id=\""; pub fn parse(vdpa: &str) -> Result { let mut parser = OptionParser::new(); parser .add("path") .add("num_queues") - .add("iommu") - .add("id") - .add("pci_segment"); + .add_all(PciDeviceCommonConfig::OPTIONS_IOMMU); parser.parse(vdpa).map_err(Error::ParseVdpa)?; + let pci_common = PciDeviceCommonConfig::parse(vdpa)?; let path = parser .get("path") .map(PathBuf::from) @@ -2109,101 +2304,51 @@ impl VdpaConfig { .convert("num_queues") .map_err(Error::ParseVdpa)? .unwrap_or_else(default_vdpaconfig_num_queues); - let iommu = parser - .convert::("iommu") - .map_err(Error::ParseVdpa)? - .unwrap_or(Toggle(false)) - .0; - let id = parser.get("id"); - let pci_segment = parser - .convert("pci_segment") - .map_err(Error::ParseVdpa)? 
- .unwrap_or_default(); Ok(VdpaConfig { + pci_common, path, num_queues, - iommu, - id, - pci_segment, }) } pub fn validate(&self, vm_config: &VmConfig) -> ValidationResult<()> { - if let Some(platform_config) = vm_config.platform.as_ref() { - if self.pci_segment >= platform_config.num_pci_segments { - return Err(ValidationError::InvalidPciSegment(self.pci_segment)); - } - - if let Some(iommu_segments) = platform_config.iommu_segments.as_ref() - && iommu_segments.contains(&self.pci_segment) - && !self.iommu - { - return Err(ValidationError::OnIommuSegment(self.pci_segment)); - } - } - - Ok(()) + self.pci_common.validate(vm_config) } } impl VsockConfig { pub const SYNTAX: &'static str = "Virtio VSOCK parameters \ - \"cid=,socket=,iommu=on|off,id=,pci_segment=\""; + \"cid=,socket=,iommu=on|off,id=,\ + pci_segment=,pci_device_id=\""; pub fn parse(vsock: &str) -> Result { let mut parser = OptionParser::new(); parser .add("socket") .add("cid") - .add("iommu") - .add("id") - .add("pci_segment"); + .add_all(PciDeviceCommonConfig::OPTIONS_IOMMU); parser.parse(vsock).map_err(Error::ParseVsock)?; + let pci_common = PciDeviceCommonConfig::parse(vsock)?; let socket = parser .get("socket") .map(PathBuf::from) .ok_or(Error::ParseVsockSockMissing)?; - let iommu = parser - .convert::("iommu") - .map_err(Error::ParseVsock)? - .unwrap_or(Toggle(false)) - .0; let cid = parser .convert("cid") .map_err(Error::ParseVsock)? .ok_or(Error::ParseVsockCidMissing)?; - let id = parser.get("id"); - let pci_segment = parser - .convert("pci_segment") - .map_err(Error::ParseVsock)? 
- .unwrap_or_default(); Ok(VsockConfig { + pci_common, cid, socket, - iommu, - id, - pci_segment, }) } pub fn validate(&self, vm_config: &VmConfig) -> ValidationResult<()> { - if let Some(platform_config) = vm_config.platform.as_ref() { - if self.pci_segment >= platform_config.num_pci_segments { - return Err(ValidationError::InvalidPciSegment(self.pci_segment)); - } - - if let Some(iommu_segments) = platform_config.iommu_segments.as_ref() - && iommu_segments.contains(&self.pci_segment) - && !self.iommu - { - return Err(ValidationError::OnIommuSegment(self.pci_segment)); - } - } - - Ok(()) + self.pci_common.validate(vm_config) } } @@ -2362,27 +2507,65 @@ where } } +#[derive(Clone, Copy, Debug, PartialEq, Eq, Deserialize, Serialize, Default)] +pub enum MemoryRestoreMode { + /// Restore by eagerly copying the snapshot into guest RAM before resume. + #[default] + Copy, + /// Restore lazily by faulting snapshot pages into guest RAM on demand. + OnDemand, +} + +#[derive(Debug, Error)] +pub enum MemoryRestoreModeParseError { + #[error("Invalid value: {0}")] + InvalidValue(String), +} + +impl FromStr for MemoryRestoreMode { + type Err = MemoryRestoreModeParseError; + + fn from_str(s: &str) -> result::Result { + match s.to_lowercase().as_str() { + "copy" => Ok(Self::Copy), + "ondemand" => Ok(Self::OnDemand), + _ => Err(MemoryRestoreModeParseError::InvalidValue(s.to_owned())), + } + } +} + #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize, Default)] pub struct RestoreConfig { pub source_url: PathBuf, #[serde(default)] pub prefault: bool, #[serde(default)] + pub memory_restore_mode: MemoryRestoreMode, + #[serde(default)] pub net_fds: Option>, + #[serde(default)] + pub resume: bool, } impl RestoreConfig { pub const SYNTAX: &'static str = "Restore from a VM snapshot. 
\ - \nRestore parameters \"source_url=,prefault=on|off,\ - net_fds=\" \ + \nRestore parameters \"source_url=,prefault=on|off,memory_restore_mode=copy|ondemand,\ + net_fds=,resume=true|false\" \ \n`source_url` should be a valid URL (e.g file:///foo/bar or tcp://192.168.1.10/foo) \ - \n`prefault` brings memory pages in when enabled (disabled by default) \ + \n`prefault` controls eager prefaulting for the copy-based restore path (disabled by default) \ + \n`memory_restore_mode=copy` preserves the existing eager read-copy restore behavior, while `memory_restore_mode=ondemand` enables lazy demand paging and fails restore if userfaultfd support is unavailable \ \n`net_fds` is a list of net ids with new file descriptors. \ - Only net devices backed by FDs directly are needed as input."; + Only net devices backed by FDs directly are needed as input.\ + \n `resume` controls whether the VM will be directly resumed after restore "; pub fn parse(restore: &str) -> Result { let mut parser = OptionParser::new(); - parser.add("source_url").add("prefault").add("net_fds"); + parser + .add("source_url") + .add("prefault") + .add("memory_restore_mode") + .add("net_fds") + .add("resume"); parser.parse(restore).map_err(Error::ParseRestore)?; let source_url = parser @@ -2394,6 +2577,10 @@ impl RestoreConfig { .map_err(Error::ParseRestore)? .unwrap_or(Toggle(false)) .0; + let memory_restore_mode = parser + .convert::("memory_restore_mode") + .map_err(Error::ParseRestore)? + .unwrap_or_default(); let net_fds = parser .convert::>>("net_fds") .map_err(Error::ParseRestore)? @@ -2406,11 +2593,18 @@ impl RestoreConfig { }) .collect() }); + let resume = parser + .convert::("resume") + .map_err(Error::ParseRestore)? + .unwrap_or(Toggle(false)) + .0; Ok(RestoreConfig { source_url, prefault, + memory_restore_mode, net_fds, + resume, }) } @@ -2418,6 +2612,10 @@ impl RestoreConfig { // corresponding 'RestoreNetConfig' with a matched 'id' and expected // number of FDs. 
pub fn validate(&self, vm_config: &VmConfig) -> ValidationResult<()> { + if self.memory_restore_mode == MemoryRestoreMode::OnDemand && self.prefault { + return Err(ValidationError::InvalidRestorePrefaultWithOnDemand); + } + let mut restored_net_with_fds = HashMap::new(); for n in self.net_fds.iter().flatten() { assert_eq!( @@ -2433,6 +2631,7 @@ impl RestoreConfig { for net_fds in vm_config.net.iter().flatten() { if let Some(expected_fds) = &net_fds.fds { let expected_id = net_fds + .pci_common .id .as_ref() .expect("Invalid 'NetConfig' with empty 'id' for VM restore."); @@ -2619,12 +2818,25 @@ impl VmConfig { #[cfg(feature = "sev_snp")] { - let host_data_opt = &self.payload.as_ref().unwrap().host_data; - - if let Some(host_data) = host_data_opt - && host_data.len() != 64 - { - return Err(ValidationError::InvalidHostData); + let sev_snp_enabled = self.platform.as_ref().is_some_and(|p| p.sev_snp); + if sev_snp_enabled { + let host_data_opt = &self.payload.as_ref().unwrap().host_data; + if let Some(host_data) = host_data_opt + && host_data.len() != 64 + { + return Err(ValidationError::InvalidHostData); + } + // KVM SEV-SNP requires an IGVM payload to initialise the VMSA. + // Without IGVM the vCPU register state is undefined and VM entry fails. 
+ #[cfg(feature = "igvm")] + if self + .payload + .as_ref() + .and_then(|p| p.igvm.as_ref()) + .is_none() + { + return Err(ValidationError::SevSnpRequiresIgvm); + } } } // The 'conflict' check is introduced in commit 24438e0390d3 @@ -2709,9 +2921,9 @@ impl VmConfig { } disk.validate(self)?; - self.iommu |= disk.iommu; + self.iommu |= disk.pci_common.iommu; - Self::validate_identifier(&mut id_list, &disk.id)?; + Self::validate_identifier(&mut id_list, &disk.pci_common.id)?; } } @@ -2721,9 +2933,9 @@ impl VmConfig { return Err(ValidationError::VhostUserRequiresSharedMemory); } net.validate(self)?; - self.iommu |= net.iommu; + self.iommu |= net.pci_common.iommu; - Self::validate_identifier(&mut id_list, &net.id)?; + Self::validate_identifier(&mut id_list, &net.pci_common.id)?; } } @@ -2734,16 +2946,27 @@ impl VmConfig { for fs in fses { fs.validate(self)?; - Self::validate_identifier(&mut id_list, &fs.id)?; + Self::validate_identifier(&mut id_list, &fs.pci_common.id)?; + } + } + + if let Some(generic_vhost_user_devices) = &self.generic_vhost_user { + if !generic_vhost_user_devices.is_empty() && !self.backed_by_shared_memory() { + return Err(ValidationError::VhostUserRequiresSharedMemory); + } + for generic_vhost_user_device in generic_vhost_user_devices { + generic_vhost_user_device.validate(self)?; + + Self::validate_identifier(&mut id_list, &generic_vhost_user_device.pci_common.id)?; } } if let Some(pmems) = &self.pmem { for pmem in pmems { pmem.validate(self)?; - self.iommu |= pmem.iommu; + self.iommu |= pmem.pci_common.iommu; - Self::validate_identifier(&mut id_list, &pmem.id)?; + Self::validate_identifier(&mut id_list, &pmem.pci_common.id)?; } } @@ -2793,16 +3016,16 @@ impl VmConfig { for user_device in user_devices { user_device.validate(self)?; - Self::validate_identifier(&mut id_list, &user_device.id)?; + Self::validate_identifier(&mut id_list, &user_device.pci_common.id)?; } } if let Some(vdpa_devices) = &self.vdpa { for vdpa_device in vdpa_devices { 
vdpa_device.validate(self)?; - self.iommu |= vdpa_device.iommu; + self.iommu |= vdpa_device.pci_common.iommu; - Self::validate_identifier(&mut id_list, &vdpa_device.id)?; + Self::validate_identifier(&mut id_list, &vdpa_device.pci_common.id)?; } } @@ -2839,17 +3062,17 @@ impl VmConfig { } device.validate(self)?; - self.iommu |= device.iommu; + self.iommu |= device.pci_common.iommu; - Self::validate_identifier(&mut id_list, &device.id)?; + Self::validate_identifier(&mut id_list, &device.pci_common.id)?; } } if let Some(vsock) = &self.vsock { vsock.validate(self)?; - self.iommu |= vsock.iommu; + self.iommu |= vsock.pci_common.iommu; - Self::validate_identifier(&mut id_list, &vsock.id)?; + Self::validate_identifier(&mut id_list, &vsock.pci_common.id)?; } let num_pci_segments = match &self.platform { @@ -2991,6 +3214,15 @@ impl VmConfig { fs = Some(fs_config_list); } + let mut generic_vhost_user: Option> = None; + if let Some(generic_vhost_user_list) = &vm_params.generic_vhost_user { + let mut generic_vhost_user_config_list = Vec::new(); + for item in generic_vhost_user_list.iter() { + generic_vhost_user_config_list.push(GenericVhostUserConfig::parse(item)?); + } + generic_vhost_user = Some(generic_vhost_user_config_list); + } + let mut pmem: Option> = None; if let Some(pmem_list) = &vm_params.pmem { let mut pmem_config_list = Vec::new(); @@ -3126,6 +3358,7 @@ impl VmConfig { net, rng, balloon, + generic_vhost_user, fs, pmem, serial, @@ -3163,55 +3396,63 @@ impl VmConfig { // Remove if VFIO device if let Some(devices) = self.devices.as_mut() { let len = devices.len(); - devices.retain(|dev| dev.id.as_ref().map(|id| id.as_ref()) != Some(id)); + devices.retain(|dev| dev.pci_common.id.as_ref().map(|id| id.as_ref()) != Some(id)); removed |= devices.len() != len; } // Remove if VFIO user device if let Some(user_devices) = self.user_devices.as_mut() { let len = user_devices.len(); - user_devices.retain(|dev| dev.id.as_ref().map(|id| id.as_ref()) != Some(id)); + 
user_devices.retain(|dev| dev.pci_common.id.as_ref().map(|id| id.as_ref()) != Some(id)); removed |= user_devices.len() != len; } // Remove if disk device if let Some(disks) = self.disks.as_mut() { let len = disks.len(); - disks.retain(|dev| dev.id.as_ref().map(|id| id.as_ref()) != Some(id)); + disks.retain(|dev| dev.pci_common.id.as_ref().map(|id| id.as_ref()) != Some(id)); removed |= disks.len() != len; } // Remove if fs device if let Some(fs) = self.fs.as_mut() { let len = fs.len(); - fs.retain(|dev| dev.id.as_ref().map(|id| id.as_ref()) != Some(id)); + fs.retain(|dev| dev.pci_common.id.as_ref().map(|id| id.as_ref()) != Some(id)); removed |= fs.len() != len; } + // Remove if generic vhost-user device + if let Some(generic_vhost_user) = self.generic_vhost_user.as_mut() { + let len = generic_vhost_user.len(); + generic_vhost_user + .retain(|dev| dev.pci_common.id.as_ref().map(|id| id.as_ref()) != Some(id)); + removed |= generic_vhost_user.len() != len; + } + // Remove if net device if let Some(net) = self.net.as_mut() { let len = net.len(); - net.retain(|dev| dev.id.as_ref().map(|id| id.as_ref()) != Some(id)); + net.retain(|dev| dev.pci_common.id.as_ref().map(|id| id.as_ref()) != Some(id)); removed |= net.len() != len; } // Remove if pmem device if let Some(pmem) = self.pmem.as_mut() { let len = pmem.len(); - pmem.retain(|dev| dev.id.as_ref().map(|id| id.as_ref()) != Some(id)); + pmem.retain(|dev| dev.pci_common.id.as_ref().map(|id| id.as_ref()) != Some(id)); removed |= pmem.len() != len; } // Remove if vDPA device if let Some(vdpa) = self.vdpa.as_mut() { let len = vdpa.len(); - vdpa.retain(|dev| dev.id.as_ref().map(|id| id.as_ref()) != Some(id)); + vdpa.retain(|dev| dev.pci_common.id.as_ref().map(|id| id.as_ref()) != Some(id)); removed |= vdpa.len() != len; } // Remove if vsock device if let Some(vsock) = self.vsock.as_ref() - && vsock.id.as_ref().map(|id| id.as_ref()) == Some(id) + && vsock.pci_common.id.as_ref().map(|id| id.as_ref()) == Some(id) { self.vsock = 
None; removed = true; @@ -3260,6 +3501,7 @@ impl Clone for VmConfig { #[cfg(feature = "pvmemcontrol")] pvmemcontrol: self.pvmemcontrol.clone(), fs: self.fs.clone(), + generic_vhost_user: self.generic_vhost_user.clone(), pmem: self.pmem.clone(), serial: self.serial.clone(), console: self.console.clone(), @@ -3371,6 +3613,123 @@ mod unit_tests { }, ); + // Test core_scheduling parsing + assert_eq!( + CpusConfig::parse("boot=1,core_scheduling=vm")?, + CpusConfig { + boot_vcpus: 1, + max_vcpus: 1, + core_scheduling: CoreScheduling::Vm, + ..Default::default() + } + ); + assert_eq!( + CpusConfig::parse("boot=1,core_scheduling=vcpu")?, + CpusConfig { + boot_vcpus: 1, + max_vcpus: 1, + core_scheduling: CoreScheduling::Vcpu, + ..Default::default() + } + ); + assert_eq!( + CpusConfig::parse("boot=1,core_scheduling=off")?, + CpusConfig { + boot_vcpus: 1, + max_vcpus: 1, + core_scheduling: CoreScheduling::Off, + ..Default::default() + } + ); + // Default (no core_scheduling specified) should be Vm + assert_eq!( + CpusConfig::parse("boot=1")?.core_scheduling, + CoreScheduling::Vm + ); + // Invalid value should error + CpusConfig::parse("boot=1,core_scheduling=invalid").unwrap_err(); + + Ok(()) + } + + #[test] + fn test_mem_zone_parsing() -> Result<()> { + // mergeable defaults to false + assert_eq!( + MemoryConfig::parse("size=0", Some(vec!["id=mem0,size=1G"]))?, + MemoryConfig { + size: 0, + zones: Some(vec![MemoryZoneConfig { + id: "mem0".to_string(), + size: 1 << 30, + ..Default::default() + }]), + ..Default::default() + } + ); + // mergeable=on + assert_eq!( + MemoryConfig::parse("size=0", Some(vec!["id=mem0,size=1G,mergeable=on"]))?, + MemoryConfig { + size: 0, + zones: Some(vec![MemoryZoneConfig { + id: "mem0".to_string(), + size: 1 << 30, + mergeable: true, + ..Default::default() + }]), + ..Default::default() + } + ); + // mergeable=off is explicit false + assert_eq!( + MemoryConfig::parse("size=0", Some(vec!["id=mem0,size=1G,mergeable=off"]))?, + MemoryConfig { + size: 
0, + zones: Some(vec![MemoryZoneConfig { + id: "mem0".to_string(), + size: 1 << 30, + mergeable: false, + ..Default::default() + }]), + ..Default::default() + } + ); + // per-zone mergeable independent of global mergeable + assert_eq!( + MemoryConfig::parse( + "size=1G,mergeable=off", + Some(vec!["id=hotplug,size=0,hotplug_size=4G,mergeable=on"]) + )?, + MemoryConfig { + size: 1 << 30, + mergeable: false, + hotplug_method: HotplugMethod::Acpi, + zones: Some(vec![MemoryZoneConfig { + id: "hotplug".to_string(), + size: 0, + hotplug_size: Some(4 << 30), + mergeable: true, + ..Default::default() + }]), + ..Default::default() + } + ); + // global mergeable=on inherited by zone with no explicit mergeable + assert_eq!( + MemoryConfig::parse("size=0,mergeable=on", Some(vec!["id=mem0,size=1G"]))?, + MemoryConfig { + size: 0, + mergeable: true, + zones: Some(vec![MemoryZoneConfig { + id: "mem0".to_string(), + size: 1 << 30, + mergeable: true, + ..Default::default() + }]), + ..Default::default() + } + ); Ok(()) } @@ -3513,25 +3872,24 @@ mod unit_tests { fn disk_fixture() -> DiskConfig { DiskConfig { + pci_common: PciDeviceCommonConfig::default(), path: Some(PathBuf::from("/path/to_file")), readonly: false, direct: false, - iommu: false, num_queues: 1, queue_size: 128, vhost_user: false, vhost_socket: None, - id: None, disable_io_uring: false, disable_aio: false, rate_limit_group: None, rate_limiter_config: None, - pci_segment: 0, serial: None, queue_affinity: None, backing_files: false, sparse: true, image_type: ImageType::Unknown, + lock_granularity: LockGranularityChoice::default(), } } @@ -3544,7 +3902,10 @@ mod unit_tests { assert_eq!( DiskConfig::parse("path=/path/to_file,id=mydisk0")?, DiskConfig { - id: Some("mydisk0".to_owned()), + pci_common: PciDeviceCommonConfig { + id: Some("mydisk0".to_owned()), + ..Default::default() + }, ..disk_fixture() } ); @@ -3561,14 +3922,20 @@ mod unit_tests { assert_eq!( DiskConfig::parse("path=/path/to_file,iommu=on")?, DiskConfig { - 
iommu: true, + pci_common: PciDeviceCommonConfig { + iommu: true, + ..Default::default() + }, ..disk_fixture() } ); assert_eq!( DiskConfig::parse("path=/path/to_file,iommu=on,queue_size=256")?, DiskConfig { - iommu: true, + pci_common: PciDeviceCommonConfig { + iommu: true, + ..Default::default() + }, queue_size: 256, ..disk_fixture() } @@ -3576,7 +3943,10 @@ mod unit_tests { assert_eq!( DiskConfig::parse("path=/path/to_file,iommu=on,queue_size=256,num_queues=4")?, DiskConfig { - iommu: true, + pci_common: PciDeviceCommonConfig { + iommu: true, + ..Default::default() + }, queue_size: 256, num_queues: 4, ..disk_fixture() @@ -3603,6 +3973,20 @@ mod unit_tests { ..disk_fixture() } ); + assert_eq!( + DiskConfig::parse("path=/path/to_file,lock_granularity=full")?, + DiskConfig { + lock_granularity: LockGranularityChoice::Full, + ..disk_fixture() + } + ); + assert_eq!( + DiskConfig::parse("path=/path/to_file,lock_granularity=byte-range")?, + DiskConfig { + lock_granularity: LockGranularityChoice::ByteRange, + ..disk_fixture() + } + ); assert_eq!( DiskConfig::parse("path=/path/to_file,queue_affinity=[0@[1],1@[2],2@[3,4],3@[5-8]]")?, DiskConfig { @@ -3632,22 +4016,20 @@ mod unit_tests { fn net_fixture() -> NetConfig { NetConfig { + pci_common: PciDeviceCommonConfig::default(), tap: None, ip: None, mask: None, mac: MacAddr::parse_str("de:ad:be:ef:12:34").unwrap(), host_mac: Some(MacAddr::parse_str("12:34:de:ad:be:ef").unwrap()), mtu: None, - iommu: false, num_queues: 2, queue_size: 256, vhost_user: false, vhost_socket: None, vhost_mode: VhostMode::Client, - id: None, fds: None, rate_limiter_config: None, - pci_segment: 0, offload_tso: true, offload_ufo: true, offload_csum: true, @@ -3665,7 +4047,10 @@ mod unit_tests { assert_eq!( NetConfig::parse("mac=de:ad:be:ef:12:34,host_mac=12:34:de:ad:be:ef,id=mynet0")?, NetConfig { - id: Some("mynet0".to_owned()), + pci_common: PciDeviceCommonConfig { + id: Some("mynet0".to_owned()), + ..Default::default() + }, ..net_fixture() } ); @@ 
-3698,9 +4083,12 @@ mod unit_tests { "mac=de:ad:be:ef:12:34,host_mac=12:34:de:ad:be:ef,num_queues=4,queue_size=1024,iommu=on" )?, NetConfig { + pci_common: PciDeviceCommonConfig { + iommu: true, + ..Default::default() + }, num_queues: 4, queue_size: 1024, - iommu: true, ..net_fixture() } ); @@ -3756,12 +4144,11 @@ mod unit_tests { fn fs_fixture() -> FsConfig { FsConfig { + pci_common: PciDeviceCommonConfig::default(), socket: PathBuf::from("/tmp/sock"), tag: "mytag".to_owned(), num_queues: 1, queue_size: 1024, - id: None, - pci_segment: 0, } } @@ -3784,14 +4171,102 @@ mod unit_tests { Ok(()) } + #[track_caller] + #[allow(clippy::too_many_arguments)] + fn make_vhost_user_config( + socket: &str, + virtio_id: u64, + id: &str, + pci_segment: u64, + queue_sizes: &IntegerList, + ) { + assert!(!socket.contains(",[]\n\r\0\"")); + assert!(!id.contains(",[]\n\r\0\"")); + let config = GenericVhostUserConfig::parse(&format!( + "virtio_id={virtio_id},socket=\"{socket}\",\ +id=\"{id}\",pci_segment={pci_segment},queue_sizes={queue_sizes}" + )); + if pci_segment <= u16::MAX.into() + && virtio_id <= u32::MAX.into() + && virtio_id != u64::from(VIRTIO_ID_BALLOON) + && virtio_id != u64::from(VIRTIO_ID_WATCHDOG) + && virtio_id != u64::from(VIRTIO_ID_IOMMU) + && queue_sizes.0.iter().all(|&f| f <= u16::MAX.into()) + { + assert_eq!( + config.unwrap(), + GenericVhostUserConfig { + pci_common: PciDeviceCommonConfig { + id: Some(id.to_owned()), + pci_segment: u16::try_from(pci_segment).unwrap(), + ..Default::default() + }, + socket: socket.into(), + device_type: u32::try_from(virtio_id).unwrap(), + queue_sizes: queue_sizes + .0 + .iter() + .map(|&f| u16::try_from(f).unwrap()) + .collect(), + } + ); + } else { + config.unwrap_err(); + } + } + + #[test] + fn test_parse_vhost_user() -> Result<()> { + // all parameters must be supplied, except pci_segment + GenericVhostUserConfig::parse("").unwrap_err(); + GenericVhostUserConfig::parse("virtio_id=1").unwrap_err(); + 
GenericVhostUserConfig::parse("queue_size=1").unwrap_err(); + GenericVhostUserConfig::parse("socket=/tmp/sock").unwrap_err(); + GenericVhostUserConfig::parse("id=1").unwrap_err(); + make_vhost_user_config( + "/dev/null/doesnotexist", + 100, + "Something", + 10, + &IntegerList(vec![u16::MAX.into(), 20u16.into()]), + ); + make_vhost_user_config( + "/dev/null/doesnotexist", + 100, + "Something", + 10, + &IntegerList(vec![u16::MAX.into()]), + ); + make_vhost_user_config( + "/dev/null/doesnotexist", + u64::from(u32::MAX) + 1, + "Something", + 10, + &IntegerList(vec![20u64]), + ); + make_vhost_user_config( + "/dev/null/doesnotexist", + u64::from(u32::MAX) + 1, + "Something", + 10, + &IntegerList(vec![20u64]), + ); + make_vhost_user_config( + "/dev/null/doesnotexist", + u64::from(u32::MAX) + 1, + "Something", + 10, + &IntegerList(vec![20u64]), + ); + Ok(()) + } + fn pmem_fixture() -> PmemConfig { PmemConfig { + pci_common: PciDeviceCommonConfig::default(), file: PathBuf::from("/tmp/pmem"), size: Some(128 << 20), - iommu: false, discard_writes: false, - id: None, - pci_segment: 0, } } @@ -3807,15 +4282,21 @@ mod unit_tests { assert_eq!( PmemConfig::parse("file=/tmp/pmem,size=128M,id=mypmem0")?, PmemConfig { - id: Some("mypmem0".to_owned()), + pci_common: PciDeviceCommonConfig { + id: Some("mypmem0".to_owned()), + ..Default::default() + }, ..pmem_fixture() } ); assert_eq!( PmemConfig::parse("file=/tmp/pmem,size=128M,iommu=on,discard_writes=on")?, PmemConfig { + pci_common: PciDeviceCommonConfig { + iommu: true, + ..Default::default() + }, discard_writes: true, - iommu: true, ..pmem_fixture() } ); @@ -3904,10 +4385,8 @@ mod unit_tests { fn device_fixture() -> DeviceConfig { DeviceConfig { + pci_common: PciDeviceCommonConfig::default(), path: PathBuf::from("/path/to/device"), - id: None, - iommu: false, - pci_segment: 0, x_nv_gpudirect_clique: None, } } @@ -3924,7 +4403,10 @@ mod unit_tests { assert_eq!( DeviceConfig::parse("path=/path/to/device,iommu=on")?, DeviceConfig { - 
iommu: true, + pci_common: PciDeviceCommonConfig { + iommu: true, + ..Default::default() + }, ..device_fixture() } ); @@ -3932,8 +4414,11 @@ mod unit_tests { assert_eq!( DeviceConfig::parse("path=/path/to/device,iommu=on,id=mydevice0")?, DeviceConfig { - id: Some("mydevice0".to_owned()), - iommu: true, + pci_common: PciDeviceCommonConfig { + id: Some("mydevice0".to_owned()), + iommu: true, + ..Default::default() + }, ..device_fixture() } ); @@ -3943,11 +4428,9 @@ mod unit_tests { fn vdpa_fixture() -> VdpaConfig { VdpaConfig { + pci_common: PciDeviceCommonConfig::default(), path: PathBuf::from("/dev/vhost-vdpa"), num_queues: 1, - iommu: false, - id: None, - pci_segment: 0, } } @@ -3959,8 +4442,11 @@ mod unit_tests { assert_eq!( VdpaConfig::parse("path=/dev/vhost-vdpa,num_queues=2,id=my_vdpa")?, VdpaConfig { + pci_common: PciDeviceCommonConfig { + id: Some("my_vdpa".to_owned()), + ..Default::default() + }, num_queues: 2, - id: Some("my_vdpa".to_owned()), ..vdpa_fixture() } ); @@ -3987,21 +4473,20 @@ mod unit_tests { assert_eq!( VsockConfig::parse("socket=/tmp/sock,cid=3")?, VsockConfig { + pci_common: PciDeviceCommonConfig::default(), cid: 3, socket: PathBuf::from("/tmp/sock"), - iommu: false, - id: None, - pci_segment: 0, } ); assert_eq!( VsockConfig::parse("socket=/tmp/sock,cid=3,iommu=on")?, VsockConfig { + pci_common: PciDeviceCommonConfig { + iommu: true, + ..Default::default() + }, cid: 3, socket: PathBuf::from("/tmp/sock"), - iommu: true, - id: None, - pci_segment: 0, } ); Ok(()) @@ -4129,7 +4614,9 @@ mod unit_tests { RestoreConfig { source_url: PathBuf::from("/path/to/snapshot"), prefault: false, + memory_restore_mode: MemoryRestoreMode::Copy, net_fds: None, + resume: false, } ); assert_eq!( @@ -4139,6 +4626,7 @@ mod unit_tests { RestoreConfig { source_url: PathBuf::from("/path/to/snapshot"), prefault: false, + memory_restore_mode: MemoryRestoreMode::Copy, net_fds: Some(vec![ RestoredNetConfig { id: "net0".to_string(), @@ -4151,13 +4639,53 @@ mod unit_tests { 
fds: Some(vec![5, 6, 7, 8]), } ]), + resume: false, + } + ); + assert_eq!( + RestoreConfig::parse("source_url=/path/to/snapshot,memory_restore_mode=ondemand")?, + RestoreConfig { + source_url: PathBuf::from("/path/to/snapshot"), + prefault: false, + memory_restore_mode: MemoryRestoreMode::OnDemand, + net_fds: None, + resume: false, + } + ); + assert_eq!( + RestoreConfig::parse("source_url=/path/to/snapshot,resume=on")?, + RestoreConfig { + source_url: PathBuf::from("/path/to/snapshot"), + prefault: false, + memory_restore_mode: MemoryRestoreMode::Copy, + net_fds: None, + resume: true, } ); // Parsing should fail as source_url is a required field RestoreConfig::parse("prefault=off").unwrap_err(); + RestoreConfig::parse("source_url=/path/to/snapshot,memory_restore_mode=bogus").unwrap_err(); Ok(()) } + #[test] + fn test_restore_config_serde() { + assert_eq!( + serde_json::from_str::(r#"{"source_url":"/path/to/snapshot"}"#) + .unwrap() + .memory_restore_mode, + MemoryRestoreMode::Copy + ); + assert_eq!( + serde_json::from_str::( + r#"{"source_url":"/path/to/snapshot","memory_restore_mode":"OnDemand"}"# + ) + .unwrap() + .memory_restore_mode, + MemoryRestoreMode::OnDemand + ); + } + #[test] fn test_restore_config_validation() { // interested in only VmConfig.net, so set rest to default values @@ -4168,6 +4696,7 @@ mod unit_tests { rate_limit_groups: None, disks: None, rng: RngConfig::default(), + generic_vhost_user: None, balloon: None, fs: None, pmem: None, @@ -4193,19 +4722,28 @@ mod unit_tests { preserved_fds: None, net: Some(vec![ NetConfig { - id: Some("net0".to_owned()), + pci_common: PciDeviceCommonConfig { + id: Some("net0".to_owned()), + ..Default::default() + }, num_queues: 2, fds: Some(vec![-1, -1, -1, -1]), ..net_fixture() }, NetConfig { - id: Some("net1".to_owned()), + pci_common: PciDeviceCommonConfig { + id: Some("net1".to_owned()), + ..Default::default() + }, num_queues: 1, fds: Some(vec![-1, -1]), ..net_fixture() }, NetConfig { - id: 
Some("net2".to_owned()), + pci_common: PciDeviceCommonConfig { + id: Some("net2".to_owned()), + ..Default::default() + }, fds: None, ..net_fixture() }, @@ -4219,6 +4757,7 @@ mod unit_tests { let valid_config = RestoreConfig { source_url: PathBuf::from("/path/to/snapshot"), prefault: false, + memory_restore_mode: MemoryRestoreMode::Copy, net_fds: Some(vec![ RestoredNetConfig { id: "net0".to_string(), @@ -4231,6 +4770,7 @@ mod unit_tests { fds: Some(vec![7, 8]), }, ]), + resume: false, }; valid_config.validate(&snapshot_vm_config).unwrap(); @@ -4293,14 +4833,31 @@ mod unit_tests { let another_valid_config = RestoreConfig { source_url: PathBuf::from("/path/to/snapshot"), prefault: false, + memory_restore_mode: MemoryRestoreMode::Copy, net_fds: None, + resume: false, }; snapshot_vm_config.net = Some(vec![NetConfig { - id: Some("net2".to_owned()), + pci_common: PciDeviceCommonConfig { + id: Some("net2".to_owned()), + ..Default::default() + }, fds: None, ..net_fixture() }]); another_valid_config.validate(&snapshot_vm_config).unwrap(); + + let invalid_restore_mode = RestoreConfig { + source_url: PathBuf::from("/path/to/snapshot"), + prefault: true, + memory_restore_mode: MemoryRestoreMode::OnDemand, + net_fds: None, + resume: false, + }; + assert_eq!( + invalid_restore_mode.validate(&snapshot_vm_config), + Err(ValidationError::InvalidRestorePrefaultWithOnDemand) + ); } fn platform_fixture() -> PlatformConfig { @@ -4311,6 +4868,8 @@ mod unit_tests { serial_number: None, uuid: None, oem_strings: None, + iommufd: false, + vfio_p2p_dma: default_platformconfig_vfio_p2p_dma(), #[cfg(feature = "tdx")] tdx: false, #[cfg(feature = "sev_snp")] @@ -4373,6 +4932,7 @@ mod unit_tests { }, balloon: None, fs: None, + generic_vhost_user: None, pmem: None, serial: ConsoleConfig { file: None, @@ -4426,6 +4986,33 @@ mod unit_tests { )) ); + #[cfg(feature = "fw_cfg")] + { + let mut invalid_config = valid_config.clone(); + if let Some(payload) = invalid_config.payload.as_mut() { + 
payload.fw_cfg_config = Some(FwCfgConfig { + e820: true, + kernel: false, + cmdline: false, + initramfs: false, + acpi_tables: true, + items: Some(FwCfgItemList { + item_list: vec![FwCfgItem { + name: "opt/org.test/invalid".to_string(), + file: None, + string: None, + }], + }), + }); + } + assert_eq!( + invalid_config.validate(), + Err(ValidationError::PayloadError( + PayloadConfigError::FwCfgInvalidItem("opt/org.test/invalid".to_string()) + )) + ); + } + let mut invalid_config = valid_config.clone(); invalid_config.serial.mode = ConsoleOutputMode::File; invalid_config.serial.file = None; @@ -4649,8 +5236,11 @@ mod unit_tests { ..platform_fixture() }); still_valid_config.disks = Some(vec![DiskConfig { - iommu: true, - pci_segment: 1, + pci_common: PciDeviceCommonConfig { + iommu: true, + pci_segment: 1, + ..Default::default() + }, ..disk_fixture() }]); still_valid_config.validate().unwrap(); @@ -4661,8 +5251,11 @@ mod unit_tests { ..platform_fixture() }); still_valid_config.net = Some(vec![NetConfig { - iommu: true, - pci_segment: 1, + pci_common: PciDeviceCommonConfig { + iommu: true, + pci_segment: 1, + ..Default::default() + }, ..net_fixture() }]); still_valid_config.validate().unwrap(); @@ -4673,8 +5266,11 @@ mod unit_tests { ..platform_fixture() }); still_valid_config.pmem = Some(vec![PmemConfig { - iommu: true, - pci_segment: 1, + pci_common: PciDeviceCommonConfig { + iommu: true, + pci_segment: 1, + ..Default::default() + }, ..pmem_fixture() }]); still_valid_config.validate().unwrap(); @@ -4685,8 +5281,11 @@ mod unit_tests { ..platform_fixture() }); still_valid_config.devices = Some(vec![DeviceConfig { - iommu: true, - pci_segment: 1, + pci_common: PciDeviceCommonConfig { + iommu: true, + pci_segment: 1, + ..Default::default() + }, ..device_fixture() }]); still_valid_config.validate().unwrap(); @@ -4697,11 +5296,13 @@ mod unit_tests { ..platform_fixture() }); still_valid_config.vsock = Some(VsockConfig { + pci_common: PciDeviceCommonConfig { + iommu: true, + 
pci_segment: 1, + ..Default::default() + }, cid: 3, socket: PathBuf::new(), - id: None, - iommu: true, - pci_segment: 1, }); still_valid_config.validate().unwrap(); @@ -4711,8 +5312,11 @@ mod unit_tests { ..platform_fixture() }); invalid_config.disks = Some(vec![DiskConfig { - iommu: false, - pci_segment: 1, + pci_common: PciDeviceCommonConfig { + iommu: false, + pci_segment: 1, + ..Default::default() + }, ..disk_fixture() }]); assert_eq!( @@ -4726,8 +5330,11 @@ mod unit_tests { ..platform_fixture() }); invalid_config.net = Some(vec![NetConfig { - iommu: false, - pci_segment: 1, + pci_common: PciDeviceCommonConfig { + iommu: false, + pci_segment: 1, + ..Default::default() + }, ..net_fixture() }]); assert_eq!( @@ -4742,8 +5349,10 @@ mod unit_tests { ..platform_fixture() }); invalid_config.pmem = Some(vec![PmemConfig { - iommu: false, - pci_segment: 1, + pci_common: PciDeviceCommonConfig { + pci_segment: 1, + ..Default::default() + }, ..pmem_fixture() }]); assert_eq!( @@ -4758,8 +5367,10 @@ mod unit_tests { ..platform_fixture() }); invalid_config.devices = Some(vec![DeviceConfig { - iommu: false, - pci_segment: 1, + pci_common: PciDeviceCommonConfig { + pci_segment: 1, + ..Default::default() + }, ..device_fixture() }]); assert_eq!( @@ -4773,11 +5384,12 @@ mod unit_tests { ..platform_fixture() }); invalid_config.vsock = Some(VsockConfig { + pci_common: PciDeviceCommonConfig { + pci_segment: 1, + ..Default::default() + }, cid: 3, socket: PathBuf::new(), - id: None, - iommu: false, - pci_segment: 1, }); assert_eq!( invalid_config.validate(), @@ -4791,13 +5403,15 @@ mod unit_tests { ..platform_fixture() }); invalid_config.user_devices = Some(vec![UserDeviceConfig { - pci_segment: 1, + pci_common: PciDeviceCommonConfig { + pci_segment: 1, + ..Default::default() + }, socket: PathBuf::new(), - id: None, }]); assert_eq!( invalid_config.validate(), - Err(ValidationError::IommuNotSupportedOnSegment(1)) + Err(ValidationError::OnIommuSegment(1)) ); let mut invalid_config = 
valid_config.clone(); @@ -4806,7 +5420,10 @@ mod unit_tests { ..platform_fixture() }); invalid_config.vdpa = Some(vec![VdpaConfig { - pci_segment: 1, + pci_common: PciDeviceCommonConfig { + pci_segment: 1, + ..Default::default() + }, ..vdpa_fixture() }]); assert_eq!( @@ -4821,12 +5438,15 @@ mod unit_tests { ..platform_fixture() }); invalid_config.fs = Some(vec![FsConfig { - pci_segment: 1, + pci_common: PciDeviceCommonConfig { + pci_segment: 1, + ..Default::default() + }, ..fs_fixture() }]); assert_eq!( invalid_config.validate(), - Err(ValidationError::IommuNotSupportedOnSegment(1)) + Err(ValidationError::OnIommuSegment(1)) ); let mut invalid_config = valid_config.clone(); @@ -5050,7 +5670,42 @@ mod unit_tests { config_with_invalid_host_data.validate().unwrap_err(); } - let mut still_valid_config = valid_config; + // x_nv_gpudirect_clique with vfio_p2p_dma=off should fail + let mut invalid_config = valid_config.clone(); + invalid_config.platform = Some(PlatformConfig { + vfio_p2p_dma: false, + ..platform_fixture() + }); + invalid_config.devices = Some(vec![DeviceConfig { + x_nv_gpudirect_clique: Some(0), + ..device_fixture() + }]); + assert_eq!( + invalid_config.validate(), + Err(ValidationError::GpuDirectCliqueRequiresP2pDma) + ); + + // x_nv_gpudirect_clique with vfio_p2p_dma=on should pass + let mut still_valid_config = valid_config.clone(); + still_valid_config.platform = Some(PlatformConfig { + vfio_p2p_dma: true, + ..platform_fixture() + }); + still_valid_config.devices = Some(vec![DeviceConfig { + x_nv_gpudirect_clique: Some(0), + ..device_fixture() + }]); + still_valid_config.validate().unwrap(); + + // x_nv_gpudirect_clique with no platform config (default p2p_dma=on) should pass + let mut still_valid_config = valid_config.clone(); + still_valid_config.devices = Some(vec![DeviceConfig { + x_nv_gpudirect_clique: Some(0), + ..device_fixture() + }]); + still_valid_config.validate().unwrap(); + + let mut still_valid_config = valid_config.clone(); // SAFETY: 
Safe as the file was just opened let fd1 = unsafe { libc::dup(File::open("/dev/null").unwrap().as_raw_fd()) }; // SAFETY: Safe as the file was just opened @@ -5060,6 +5715,45 @@ mod unit_tests { still_valid_config.add_preserved_fds(vec![fd1, fd2]); } let _still_valid_config = still_valid_config.clone(); + + // Valid BDF test + let mut still_valid_config = valid_config.clone(); + still_valid_config.disks = Some(vec![DiskConfig { + pci_common: PciDeviceCommonConfig { + pci_device_id: Some(8), + ..Default::default() + }, + ..disk_fixture() + }]); + still_valid_config.validate().unwrap(); + // Invalid BDF - Same ID as Root device + let mut invalid_config = valid_config.clone(); + invalid_config.disks = Some(vec![DiskConfig { + pci_common: PciDeviceCommonConfig { + pci_device_id: Some(pci::PCI_ROOT_DEVICE_ID), + ..Default::default() + }, + ..disk_fixture() + }]); + assert_eq!( + invalid_config.validate(), + Err(ValidationError::ReservedPciDeviceId( + pci::PCI_ROOT_DEVICE_ID + )) + ); + // Invalid BDF - Out of range + let mut invalid_config = valid_config.clone(); + invalid_config.disks = Some(vec![DiskConfig { + pci_common: PciDeviceCommonConfig { + pci_device_id: Some(pci::NUM_DEVICE_IDS + 1), + ..Default::default() + }, + ..disk_fixture() + }]); + assert_eq!( + invalid_config.validate(), + Err(ValidationError::InvalidPciDeviceId(pci::NUM_DEVICE_IDS + 1)) + ); } #[test] fn test_landlock_parsing() -> Result<()> { @@ -5085,7 +5779,7 @@ mod unit_tests { // Missing closing bracket FwCfgConfig::parse("items=[name=opt/org.test/fw_cfg_test_item,file=/tmp/fw_cfg_test_item") .unwrap_err(); - // Single Item + // Single file Item assert_eq!( FwCfgConfig::parse( "items=[name=opt/org.test/fw_cfg_test_item,file=/tmp/fw_cfg_test_item]" @@ -5094,13 +5788,14 @@ mod unit_tests { items: Some(FwCfgItemList { item_list: vec![FwCfgItem { name: "opt/org.test/fw_cfg_test_item".to_string(), - file: PathBuf::from("/tmp/fw_cfg_test_item"), + file: Some(PathBuf::from("/tmp/fw_cfg_test_item")), + 
string: None, }] }), ..Default::default() }, ); - // Multiple Items + // Multiple file Items assert_eq!( FwCfgConfig::parse( "items=[name=opt/org.test/fw_cfg_test_item,file=/tmp/fw_cfg_test_item:name=opt/org.test/fw_cfg_test_item2,file=/tmp/fw_cfg_test_item2]" @@ -5110,17 +5805,72 @@ mod unit_tests { item_list: vec![ FwCfgItem { name: "opt/org.test/fw_cfg_test_item".to_string(), - file: PathBuf::from("/tmp/fw_cfg_test_item"), + file: Some(PathBuf::from("/tmp/fw_cfg_test_item")), + string: None, }, FwCfgItem { name: "opt/org.test/fw_cfg_test_item2".to_string(), - file: PathBuf::from("/tmp/fw_cfg_test_item2"), + file: Some(PathBuf::from("/tmp/fw_cfg_test_item2")), + string: None, } ] }), ..Default::default() }, ); + // Single string Item (for OVMF MMIO64 config, GPU CC passthrough, etc.) + assert_eq!( + FwCfgConfig::parse("items=[name=opt/ovmf/X-PciMmio64Mb,string=262144]")?, + FwCfgConfig { + items: Some(FwCfgItemList { + item_list: vec![FwCfgItem { + name: "opt/ovmf/X-PciMmio64Mb".to_string(), + file: None, + string: Some("262144".to_string()), + }] + }), + ..Default::default() + }, + ); + // Mixed file and string Items + assert_eq!( + FwCfgConfig::parse( + "items=[name=opt/org.test/fw_cfg_test_item,file=/tmp/fw_cfg_test_item:name=opt/ovmf/X-PciMmio64Mb,string=262144]" + )?, + FwCfgConfig { + items: Some(FwCfgItemList { + item_list: vec![ + FwCfgItem { + name: "opt/org.test/fw_cfg_test_item".to_string(), + file: Some(PathBuf::from("/tmp/fw_cfg_test_item")), + string: None, + }, + FwCfgItem { + name: "opt/ovmf/X-PciMmio64Mb".to_string(), + file: None, + string: Some("262144".to_string()), + } + ] + }), + ..Default::default() + }, + ); + // Missing both file and string parses OK but fails validation + let missing_content = + FwCfgConfig::parse("items=[name=opt/org.test/missing_content]").unwrap(); + assert_eq!( + missing_content.items.as_ref().unwrap().item_list[0].file, + None + ); + assert_eq!( + missing_content.items.as_ref().unwrap().item_list[0].string, + None + 
); + // Both file and string parses OK but fails validation + let both = FwCfgConfig::parse("items=[name=opt/org.test/both,file=/tmp/test,string=test]") + .unwrap(); + assert!(both.items.as_ref().unwrap().item_list[0].file.is_some()); + assert!(both.items.as_ref().unwrap().item_list[0].string.is_some()); Ok(()) } } diff --git a/vmm/src/console_devices.rs b/vmm/src/console_devices.rs index 76655d6c16..32cba7b780 100644 --- a/vmm/src/console_devices.rs +++ b/vmm/src/console_devices.rs @@ -56,7 +56,7 @@ pub enum ConsoleDeviceError { type ConsoleDeviceResult = result::Result; #[derive(Clone)] -pub enum ConsoleOutput { +pub enum ConsoleTransport { File(Arc), Pty(Arc), Tty(Arc), @@ -67,10 +67,10 @@ pub enum ConsoleOutput { #[derive(Clone)] pub struct ConsoleInfo { - pub console_main_fd: ConsoleOutput, - pub serial_main_fd: ConsoleOutput, + pub console: ConsoleTransport, + pub serial: ConsoleTransport, #[cfg(target_arch = "x86_64")] - pub debug_main_fd: ConsoleOutput, + pub debug: ConsoleTransport, } fn modify_mode( @@ -181,11 +181,11 @@ pub(crate) fn pre_create_console_devices(vmm: &mut Vmm) -> ConsoleDeviceResult { let file = File::create(vmconfig.console.file.as_ref().unwrap()) .map_err(ConsoleDeviceError::CreateConsoleDevice)?; - ConsoleOutput::File(Arc::new(file)) + ConsoleTransport::File(Arc::new(file)) } ConsoleOutputMode::Pty => { let (main_fd, sub_fd, path) = @@ -200,7 +200,7 @@ pub(crate) fn pre_create_console_devices(vmm: &mut Vmm) -> ConsoleDeviceResult { // Duplicating the file descriptors like this is needed as otherwise @@ -222,26 +222,26 @@ pub(crate) fn pre_create_console_devices(vmm: &mut Vmm) -> ConsoleDeviceResult { return Err(ConsoleDeviceError::NoSocketOptionSupportForConsoleDevice); } - ConsoleOutputMode::Null => ConsoleOutput::Null, - ConsoleOutputMode::Off => ConsoleOutput::Off, + ConsoleOutputMode::Null => ConsoleTransport::Null, + ConsoleOutputMode::Off => ConsoleTransport::Off, }, - serial_main_fd: match vmconfig.serial.mode { + serial: match 
vmconfig.serial.mode { ConsoleOutputMode::File => { let file = File::create(vmconfig.serial.file.as_ref().unwrap()) .map_err(ConsoleDeviceError::CreateConsoleDevice)?; - ConsoleOutput::File(Arc::new(file)) + ConsoleTransport::File(Arc::new(file)) } ConsoleOutputMode::Pty => { let (main_fd, sub_fd, path) = create_pty().map_err(ConsoleDeviceError::CreateConsoleDevice)?; set_raw_mode(&sub_fd.as_raw_fd(), &mut original_termios_opt)?; vmconfig.serial.file = Some(path.clone()); - ConsoleOutput::Pty(Arc::new(main_fd)) + ConsoleTransport::Pty(Arc::new(main_fd)) } ConsoleOutputMode::Tty => { // During vm_shutdown, when serial device is closed, FD#2(STDOUT) @@ -257,41 +257,41 @@ pub(crate) fn pre_create_console_devices(vmm: &mut Vmm) -> ConsoleDeviceResult { let listener = UnixListener::bind(vmconfig.serial.socket.as_ref().unwrap()) .map_err(ConsoleDeviceError::CreateConsoleDevice)?; - ConsoleOutput::Socket(Arc::new(listener)) + ConsoleTransport::Socket(Arc::new(listener)) } - ConsoleOutputMode::Null => ConsoleOutput::Null, - ConsoleOutputMode::Off => ConsoleOutput::Off, + ConsoleOutputMode::Null => ConsoleTransport::Null, + ConsoleOutputMode::Off => ConsoleTransport::Off, }, #[cfg(target_arch = "x86_64")] - debug_main_fd: match vmconfig.debug_console.mode { + debug: match vmconfig.debug_console.mode { ConsoleOutputMode::File => { let file = File::create(vmconfig.debug_console.file.as_ref().unwrap()) .map_err(ConsoleDeviceError::CreateConsoleDevice)?; - ConsoleOutput::File(Arc::new(file)) + ConsoleTransport::File(Arc::new(file)) } ConsoleOutputMode::Pty => { let (main_fd, sub_fd, path) = create_pty().map_err(ConsoleDeviceError::CreateConsoleDevice)?; set_raw_mode(&sub_fd.as_raw_fd(), &mut original_termios_opt)?; vmconfig.debug_console.file = Some(path.clone()); - ConsoleOutput::Pty(Arc::new(main_fd)) + ConsoleTransport::Pty(Arc::new(main_fd)) } ConsoleOutputMode::Tty => { let out = dup_stdout().map_err(|e| ConsoleDeviceError::CreateConsoleDevice(e.into()))?; 
set_raw_mode(&out, &mut original_termios_opt)?; - ConsoleOutput::Tty(Arc::new(out)) + ConsoleTransport::Tty(Arc::new(out)) } ConsoleOutputMode::Socket => { return Err(ConsoleDeviceError::NoSocketOptionSupportForConsoleDevice); } - ConsoleOutputMode::Null => ConsoleOutput::Null, - ConsoleOutputMode::Off => ConsoleOutput::Off, + ConsoleOutputMode::Null => ConsoleTransport::Null, + ConsoleOutputMode::Off => ConsoleTransport::Off, }, }; diff --git a/vmm/src/cpu.rs b/vmm/src/cpu.rs index bba78e642c..cb445cda15 100644 --- a/vmm/src/cpu.rs +++ b/vmm/src/cpu.rs @@ -17,7 +17,7 @@ use std::io::Write; #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] use std::mem::size_of; use std::os::unix::thread::JoinHandleExt; -use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicI32, Ordering}; use std::sync::{Arc, Barrier, Mutex}; use std::{cmp, io, result, thread}; @@ -92,7 +92,7 @@ use crate::gdb::{Debuggable, DebuggableError, get_raw_tid}; use crate::seccomp_filters::{Thread, get_seccomp_filter}; #[cfg(target_arch = "x86_64")] use crate::vm::physical_bits; -use crate::vm_config::CpusConfig; +use crate::vm_config::{CoreScheduling, CpusConfig}; use crate::{CPU_MANAGER_SNAPSHOT_ID, GuestMemoryMmap}; #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] @@ -212,6 +212,9 @@ pub enum Error { #[cfg(feature = "sev_snp")] #[error("Failed to set sev control register")] SetSevControlRegister(#[source] hypervisor::HypervisorCpuError), + #[cfg(feature = "sev_snp")] + #[error("Failed to set up SEV-SNP vCPU registers")] + SetupSevSnpRegs(#[source] hypervisor::HypervisorCpuError), #[cfg(target_arch = "x86_64")] #[error("Failed to inject NMI")] @@ -220,9 +223,95 @@ pub enum Error { #[cfg(feature = "mshv")] #[error("Failed to set partition property")] SetPartitionProperty(#[source] anyhow::Error), + + #[error("Error enabling core scheduling")] + CoreScheduling(#[source] io::Error), } pub type Result = result::Result; +const PR_SCHED_CORE: 
libc::c_int = 62; +const PR_SCHED_CORE_GET: libc::c_int = 0; +const PR_SCHED_CORE_CREATE: libc::c_int = 1; +const PR_SCHED_CORE_SHARE_FROM: libc::c_int = 3; +const PR_SCHED_CORE_SCOPE_THREAD: libc::c_int = 0; + +/// Create a new unique core scheduling cookie for the current thread. +/// Silently succeeds on kernels that don't support PR_SCHED_CORE. +fn core_scheduling_create() -> Result<()> { + // SAFETY: prctl with PR_SCHED_CORE_CREATE on the current thread (pid=0). + // All arguments are valid constants. We check the return value. + let ret = unsafe { + libc::prctl( + PR_SCHED_CORE, + PR_SCHED_CORE_CREATE, + 0, + PR_SCHED_CORE_SCOPE_THREAD, + 0, + ) + }; + if ret == -1 { + let err = io::Error::last_os_error(); + // EINVAL: kernel < 5.14 where PR_SCHED_CORE is unknown. + // ENODEV: CONFIG_SCHED_CORE is enabled but SMT is not present/enabled, + // so core scheduling is not applicable. + // Both mean core scheduling is unavailable; silently ignore. + match err.raw_os_error() { + Some(libc::EINVAL) => { + warn!("Kernel lacks CONFIG_SCHED_CORE support - no SMT isolation"); + } + Some(libc::ENODEV) => {} + _ => return Err(Error::CoreScheduling(err)), + } + } + Ok(()) +} + +/// Copy the core scheduling cookie from the thread identified by `tid` +/// to the current thread, placing both in the same scheduling group. +/// Silently succeeds on kernels that don't support PR_SCHED_CORE. +fn core_scheduling_share_from(tid: i32) -> Result<()> { + // SAFETY: prctl with PR_SCHED_CORE_SHARE_FROM targeting tid. + // All arguments are valid. We check the return value. + let ret = unsafe { + libc::prctl( + PR_SCHED_CORE, + PR_SCHED_CORE_SHARE_FROM, + tid, + PR_SCHED_CORE_SCOPE_THREAD, + 0, + ) + }; + if ret == -1 { + let err = io::Error::last_os_error(); + match err.raw_os_error() { + Some(libc::EINVAL) | Some(libc::ENODEV) => {} + _ => return Err(Error::CoreScheduling(err)), + } + } + Ok(()) +} + +/// Read the core scheduling cookie of the current thread. 
+/// Returns 0 if no cookie is set or the kernel doesn't support PR_SCHED_CORE. +fn core_scheduling_cookie() -> u64 { + let mut cookie: u64 = 0; + // SAFETY: PR_SCHED_CORE_GET with pid=0 reads the current thread's cookie + // into the provided pointer. We pass a valid mutable reference. + let ret = unsafe { + libc::prctl( + PR_SCHED_CORE, + PR_SCHED_CORE_GET, + 0, + PR_SCHED_CORE_SCOPE_THREAD, + &mut cookie as *mut u64, + ) + }; + if ret == -1 { + return 0; + } + cookie +} + #[cfg(target_arch = "x86_64")] #[allow(dead_code)] #[repr(C, packed)] @@ -460,6 +549,7 @@ impl Vcpu { #[cfg(target_arch = "x86_64")] kvm_hyperv: bool, #[cfg(target_arch = "x86_64")] topology: (u16, u16, u16, u16), #[cfg(target_arch = "x86_64")] nested: bool, + #[cfg(feature = "igvm")] igvm_enabled: bool, ) -> Result<()> { #[cfg(target_arch = "aarch64")] { @@ -472,17 +562,32 @@ impl Vcpu { .map_err(Error::VcpuConfiguration)?; info!("Configuring vCPU: cpu_id = {}", self.id); #[cfg(target_arch = "x86_64")] - arch::configure_vcpu( - self.vcpu.as_ref(), - self.id, - boot_setup, - cpuid, - kvm_hyperv, - self.vendor, - topology, - nested, - ) - .map_err(Error::VcpuConfiguration)?; + { + // When IGVM is enabled, skip standard register setup here — the IGVM + // loader populates vCPU registers from the VMSA via set_sev_control_register + // (currently KVM-specific; MSHV handles this through its own import path). + // igvm_enabled is kept as an explicit flag rather than derived from sev_snp + // state because IGVM could theoretically be used independently of SEV-SNP. + cfg_if::cfg_if! 
{ + if #[cfg(feature = "igvm")] { + let setup_registers = !igvm_enabled; + } else { + let setup_registers = true; + } + } + arch::configure_vcpu( + self.vcpu.as_ref(), + self.id, + boot_setup, + cpuid, + kvm_hyperv, + self.vendor, + topology, + nested, + setup_registers, + ) + .map_err(Error::VcpuConfiguration)?; + } Ok(()) } @@ -542,6 +647,13 @@ impl Vcpu { .map_err(Error::SetSevControlRegister) } + #[cfg(feature = "sev_snp")] + pub fn setup_sev_snp_regs(&self, vmsa: igvm::snp_defs::SevVmsa) -> Result<()> { + self.vcpu + .setup_sev_snp_regs(vmsa) + .map_err(Error::SetupSevSnpRegs) + } + /// /// Sets the vCPU's GIC redistributor base address. /// @@ -596,8 +708,8 @@ pub struct CpuManager { reset_evt: EventFd, #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, - vcpu_states: Vec, - selected_cpu: u32, + // Shared with AcpiCpuHotplugController + vcpu_states: Arc>>, vcpus: Vec>>, seccomp_action: SeccompAction, vm_ops: Arc, @@ -609,96 +721,53 @@ pub struct CpuManager { hypervisor: Arc, #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, + // State of the core scheduling group leader election (VM mode). + core_scheduling_group_leader: Arc, + #[cfg(feature = "igvm")] + igvm_enabled: bool, } -const CPU_ENABLE_FLAG: usize = 0; -const CPU_INSERTING_FLAG: usize = 1; -const CPU_REMOVING_FLAG: usize = 2; -const CPU_EJECT_FLAG: usize = 3; - -const CPU_STATUS_OFFSET: u64 = 4; -const CPU_SELECTION_OFFSET: u64 = 0; - -impl BusDevice for CpuManager { - fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { - // The Linux kernel, quite reasonably, doesn't zero the memory it gives us. 
- data.fill(0); - - match offset { - CPU_SELECTION_OFFSET => { - assert!(data.len() >= core::mem::size_of::()); - data[0..core::mem::size_of::()] - .copy_from_slice(&self.selected_cpu.to_le_bytes()); - } - CPU_STATUS_OFFSET => { - if self.selected_cpu < self.max_vcpus() { - let state = &self.vcpu_states[usize::try_from(self.selected_cpu).unwrap()]; - if state.active() { - data[0] |= 1 << CPU_ENABLE_FLAG; - } - if state.inserting { - data[0] |= 1 << CPU_INSERTING_FLAG; - } - if state.removing { - data[0] |= 1 << CPU_REMOVING_FLAG; - } - } else { - warn!("Out of range vCPU id: {}", self.selected_cpu); - } - } - _ => { - warn!("Unexpected offset for accessing CPU manager device: {offset:#}"); - } - } - } +/// State of the core scheduling group leader election for VM-wide cookie +/// sharing. +/// +/// The value will be in an `AtomicI32`. Positive values represent a leader +/// TID (cookie ready). +#[repr(i32)] +enum CoreSchedulingLeader { + /// No leader elected yet. + Initial = 0, + /// A leader has been elected and is creating the cookie. + Elected = -1, + /// The leader failed to create the cookie. 
+ Error = -2, +} - fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option> { - match offset { - CPU_SELECTION_OFFSET => { - assert!(data.len() >= core::mem::size_of::()); - self.selected_cpu = - u32::from_le_bytes(data[0..core::mem::size_of::()].try_into().unwrap()); - } - CPU_STATUS_OFFSET => { - if self.selected_cpu < self.max_vcpus() { - let state = &mut self.vcpu_states[usize::try_from(self.selected_cpu).unwrap()]; - // The ACPI code writes back a 1 to acknowledge the insertion - if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG) - && state.inserting - { - state.inserting = false; - } - // Ditto for removal - if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG) - && state.removing - { - state.removing = false; - } - // Trigger removal of vCPU - if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG - && let Err(e) = self.remove_vcpu(self.selected_cpu) - { - error!("Error removing vCPU: {e:?}"); - } - } else { - warn!("Out of range vCPU id: {}", self.selected_cpu); - } - } - _ => { - warn!("Unexpected offset for accessing CPU manager device: {offset:#}"); - } +impl TryFrom for CoreSchedulingLeader { + type Error = (); + /// Convert from the raw `i32` (from the `AtomicI32`) value. + /// Quirky: Returns `Ok(state)` for known sentinel values, or `Err(())` for + /// a positive TID (cookie ready). + fn try_from(value: i32) -> result::Result { + match value { + 0 => Ok(Self::Initial), + -1 => Ok(Self::Elected), + -2 => Ok(Self::Error), + _ => Err(()), } - None } } +/// Management structure for a vCPU (thread). #[derive(Default)] struct VcpuState { inserting: bool, removing: bool, pending_removal: Arc, + /// Handle to the vCPU thread. handle: Option>, + /// Instructs the thread to exit the run-vCPU loop. kill: Arc, + /// Used to ACK interruption from the run vCPU loop to the CPU Manager. vcpu_run_interrupted: Arc, /// Used to ACK state changes from the run vCPU loop to the CPU Manager. 
paused: Arc, @@ -713,6 +782,13 @@ impl VcpuState { /// /// Please call [`Self::wait_until_signal_acknowledged`] afterward to block /// until the vCPU thread has acknowledged the signal. + /// + /// If the thread is in KVM_RUN (or MSHV_RUN_VP or equivalent), this kicks + /// the thread out of kernel space. If the thread is in user-space, the + /// thread will just handle the event eventually. If the thread is in + /// user-space but about to enter kernel-space, the user-space signal + /// handler will make sure that the next kernel entry of the given + /// vCPU thread immediately exits to handle the event in user-space. fn signal_thread(&self) { if let Some(handle) = self.handle.as_ref() { // SAFETY: FFI call with correct arguments @@ -778,6 +854,7 @@ impl CpuManager { #[cfg(feature = "tdx")] tdx_enabled: bool, numa_nodes: &NumaNodes, #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, + #[cfg(feature = "igvm")] igvm_enabled: bool, ) -> Result>> { if config.max_vcpus > hypervisor.get_max_vcpus() { return Err(Error::MaximumVcpusExceeded( @@ -789,6 +866,7 @@ impl CpuManager { let max_vcpus = usize::try_from(config.max_vcpus).unwrap(); let mut vcpu_states = Vec::with_capacity(max_vcpus); vcpu_states.resize_with(max_vcpus, VcpuState::default); + let vcpu_states = Arc::new(Mutex::new(vcpu_states)); let hypervisor_type = hypervisor.hypervisor_type(); #[cfg(target_arch = "x86_64")] let cpu_vendor = hypervisor.get_cpu_vendor(); @@ -840,7 +918,6 @@ impl CpuManager { reset_evt, #[cfg(feature = "guest_debug")] vm_debug_evt, - selected_cpu: 0, vcpus: Vec::with_capacity(max_vcpus), seccomp_action, vm_ops, @@ -851,6 +928,11 @@ impl CpuManager { hypervisor, #[cfg(feature = "sev_snp")] sev_snp_enabled, + core_scheduling_group_leader: Arc::new(AtomicI32::new( + CoreSchedulingLeader::Initial as i32, + )), + #[cfg(feature = "igvm")] + igvm_enabled, }))) } @@ -929,8 +1011,10 @@ impl CpuManager { vcpu: &mut Vcpu, boot_setup: Option<(EntryPoint, &GuestMemoryAtomic)>, ) -> Result<()> { - 
#[cfg(feature = "sev_snp")] - if self.sev_snp_enabled { + #[cfg(all(feature = "sev_snp", feature = "mshv"))] + if self.sev_snp_enabled + && self.hypervisor.hypervisor_type() == hypervisor::HypervisorType::Mshv + { if let Some((kernel_entry_point, _)) = boot_setup { vcpu.set_sev_control_register( kernel_entry_point.entry_addr.0 / crate::igvm::HV_PAGE_SIZE, @@ -971,6 +1055,8 @@ impl CpuManager { self.config.kvm_hyperv, topology, self.config.nested, + #[cfg(feature = "igvm")] + self.igvm_enabled, )?; #[cfg(target_arch = "aarch64")] @@ -1055,14 +1141,14 @@ impl CpuManager { let vcpus_pause_signalled = self.vcpus_pause_signalled.clone(); let vcpus_kick_signalled = self.vcpus_kick_signalled.clone(); - let vcpu_kill = self.vcpu_states[usize::try_from(vcpu_id).unwrap()] - .kill - .clone(); - let vcpu_run_interrupted = self.vcpu_states[usize::try_from(vcpu_id).unwrap()] + let mut vcpu_states = self.vcpu_states.lock().unwrap(); + + let vcpu_kill = vcpu_states[usize::try_from(vcpu_id).unwrap()].kill.clone(); + let vcpu_run_interrupted = vcpu_states[usize::try_from(vcpu_id).unwrap()] .vcpu_run_interrupted .clone(); let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone(); - let vcpu_paused = self.vcpu_states[usize::try_from(vcpu_id).unwrap()] + let vcpu_paused = vcpu_states[usize::try_from(vcpu_id).unwrap()] .paused .clone(); @@ -1079,6 +1165,9 @@ impl CpuManager { cpuset }); + let core_scheduling = self.config.core_scheduling; + let core_scheduling_group_leader = self.core_scheduling_group_leader.clone(); + // Retrieve seccomp filter for vcpu thread let vcpu_seccomp_filter = get_seccomp_filter( &self.seccomp_action, @@ -1117,6 +1206,67 @@ impl CpuManager { } } + // Set up core scheduling before seccomp locks down prctl. 
+ match core_scheduling { + CoreScheduling::Vcpu => { + // Each vCPU gets its own unique cookie + if let Err(e) = core_scheduling_create() { + error!( + "Failed to enable core scheduling for vCPU {vcpu_id}: {e:?}" + ); + return; + } + } + CoreScheduling::Vm => { + // First vCPU creates a cookie; all others share from it. + // SAFETY: gettid() is always safe to call. + let my_tid = unsafe { libc::gettid() }; + if core_scheduling_group_leader + .compare_exchange(CoreSchedulingLeader::Initial as i32, CoreSchedulingLeader::Elected as i32, Ordering::AcqRel, Ordering::Acquire) + .is_ok() + { + // We are the group leader — create the cookie + if let Err(e) = core_scheduling_create() { + error!( + "Failed to create core scheduling cookie: {e:?}" + ); + // This will force the loop in the other threads to break out + core_scheduling_group_leader.store(CoreSchedulingLeader::Error as i32, Ordering::Release); + return; + } + // Signal that the cookie is ready by storing real TID + core_scheduling_group_leader + .store(my_tid, Ordering::Release); + } else { + // Wait for the leader to finish creating the cookie + let leader_tid = loop { + let v = core_scheduling_group_leader.load(Ordering::Acquire); + match CoreSchedulingLeader::try_from(v) { + Ok(CoreSchedulingLeader::Error) => return, + Ok(CoreSchedulingLeader::Initial | + CoreSchedulingLeader::Elected) => std::hint::spin_loop(), + Err(()) => break v, + } + }; + if let Err(e) = core_scheduling_share_from(leader_tid) { + error!( + "Failed to share core scheduling cookie \ + to vCPU {vcpu_id}: {e:?}" + ); + return; + } + } + } + CoreScheduling::Off => {} + } + + if core_scheduling != CoreScheduling::Off { + info!( + "vCPU {vcpu_id}: core scheduling cookie = {:#x}", + core_scheduling_cookie() + ); + } + // Apply seccomp filter for vcpu thread. 
if !vcpu_seccomp_filter.is_empty() && let Err(e) = apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter) @@ -1285,8 +1435,8 @@ impl CpuManager { // On hot plug calls into this function entry_point is None. It is for // those hotplug CPU additions that we need to set the inserting flag. - self.vcpu_states[usize::try_from(vcpu_id).unwrap()].handle = handle; - self.vcpu_states[usize::try_from(vcpu_id).unwrap()].inserting = inserting; + vcpu_states[usize::try_from(vcpu_id).unwrap()].handle = handle; + vcpu_states[usize::try_from(vcpu_id).unwrap()].inserting = inserting; Ok(()) } @@ -1330,17 +1480,20 @@ impl CpuManager { } fn mark_vcpus_for_removal(&mut self, desired_vcpus: u32) { + let mut vcpu_states = self.vcpu_states.lock().unwrap(); + let present_vcpus = Self::active_vcpus(&vcpu_states); + // Mark vCPUs for removal, actual removal happens on ejection - for cpu_id in desired_vcpus..self.present_vcpus() { - self.vcpu_states[usize::try_from(cpu_id).unwrap()].removing = true; - self.vcpu_states[usize::try_from(cpu_id).unwrap()] + for cpu_id in desired_vcpus..present_vcpus { + vcpu_states[usize::try_from(cpu_id).unwrap()].removing = true; + vcpu_states[usize::try_from(cpu_id).unwrap()] .pending_removal .store(true, Ordering::SeqCst); } } pub fn check_pending_removed_vcpu(&mut self) -> bool { - for state in self.vcpu_states.iter() { + for state in self.vcpu_states.lock().unwrap().iter() { if state.active() && state.pending_removal.load(Ordering::SeqCst) { return true; } @@ -1348,22 +1501,6 @@ impl CpuManager { false } - fn remove_vcpu(&mut self, cpu_id: u32) -> Result<()> { - info!("Removing vCPU: cpu_id = {cpu_id}"); - let state = &mut self.vcpu_states[usize::try_from(cpu_id).unwrap()]; - state.kill.store(true, Ordering::SeqCst); - state.signal_thread(); - state.wait_until_signal_acknowledged()?; - state.join_thread()?; - state.handle = None; - - // Once the thread has exited, clear the "kill" so that it can reused - state.kill.store(false, 
Ordering::SeqCst); - state.pending_removal.store(false, Ordering::SeqCst); - - Ok(()) - } - pub fn create_boot_vcpus( &mut self, snapshot: Option<&Snapshot>, @@ -1441,17 +1578,20 @@ impl CpuManager { } } - /// Signal to the spawned threads (vCPUs and console signal handler). + /// Signals all vCPU threads and waits for them to ACK the interruption. /// - /// For the vCPU threads this will interrupt the KVM_RUN ioctl() allowing - /// the loop to check the shared state booleans. + /// Calls [`VcpuState::signal_thread`] and + /// [`VcpuState::wait_until_signal_acknowledged`] for each vCPU. fn signal_vcpus(&mut self) -> Result<()> { + // Holding the lock for the whole operation is correct: + let vcpu_states = self.vcpu_states.lock().unwrap(); + // Splitting this into two loops reduced the time to pause many vCPUs // massively. Example: 254 vCPUs. >254ms -> ~4ms. - for state in self.vcpu_states.iter() { + for state in vcpu_states.iter() { state.signal_thread(); } - for state in self.vcpu_states.iter() { + for state in vcpu_states.iter() { state.wait_until_signal_acknowledged()?; } @@ -1466,14 +1606,14 @@ impl CpuManager { self.vcpus_pause_signalled.store(false, Ordering::SeqCst); // Unpark all the VCPU threads. - for state in self.vcpu_states.iter() { + for state in self.vcpu_states.lock().unwrap().iter() { state.unpark_thread(); } self.signal_vcpus()?; // Wait for all the threads to finish. This removes the state from the vector. - for mut state in self.vcpu_states.drain(..) { + for mut state in self.vcpu_states.lock().unwrap().drain(..) { state.join_thread()?; } @@ -1506,8 +1646,15 @@ impl CpuManager { self.cpuid.clone() } + /// Locks the vCPU states and calls [`Self::active_vcpus`]. fn present_vcpus(&self) -> u32 { - self.vcpu_states + let lock = self.vcpu_states.lock().unwrap(); + Self::active_vcpus(&lock) + } + + /// Counts the number of active vCPUs (running vCPU threads). 
+ fn active_vcpus(vcpu_states: &[VcpuState]) -> u32 { + vcpu_states .iter() .fold(0, |acc, state| acc + state.active() as u32) } @@ -2062,7 +2209,7 @@ impl CpuManager { &self.vcpus_kill_signalled } - #[cfg(feature = "igvm")] + #[cfg(all(feature = "igvm", feature = "mshv"))] pub(crate) fn get_cpuid_leaf( &self, cpu_id: u8, @@ -2085,6 +2232,11 @@ impl CpuManager { self.sev_snp_enabled } + #[cfg(feature = "igvm")] + pub(crate) fn hypervisor_type(&self) -> hypervisor::HypervisorType { + self.hypervisor.hypervisor_type() + } + pub(crate) fn nmi(&mut self) -> Result<()> { self.vcpus_kick_signalled.store(true, Ordering::SeqCst); self.signal_vcpus()?; @@ -2452,21 +2604,21 @@ impl Pausable for CpuManager { self.signal_vcpus() .map_err(|e| MigratableError::Pause(anyhow!("Error signalling vCPUs: {e}")))?; + // Notify all guests (including Hyper-V / Windows) that the clock was + // paused. KVM_KVMCLOCK_CTRL updates internal KVM state that affects + // both pvclock (Linux) and the Hyper-V TSC reference page, so it must + // be called unconditionally. #[cfg(all(feature = "kvm", target_arch = "x86_64"))] for vcpu in self.vcpus.iter() { let vcpu = vcpu.lock().unwrap(); - if !self.config.kvm_hyperv { - vcpu.vcpu.notify_guest_clock_paused().map_err(|e| { - MigratableError::Pause(anyhow!( - "Could not notify guest it has been paused {e:?}" - )) - })?; - } + vcpu.vcpu.notify_guest_clock_paused().map_err(|e| { + MigratableError::Pause(anyhow!("Could not notify guest it has been paused {e:?}")) + })?; } // The vCPU thread will change its paused state before parking, wait here for each // activated vCPU change their state to ensure they have parked. - for state in self.vcpu_states.iter() { + for state in self.vcpu_states.lock().unwrap().iter() { if state.active() { // wait for vCPU to update state while !state.paused.load(Ordering::SeqCst) { @@ -2484,16 +2636,18 @@ impl Pausable for CpuManager { // their run vCPU loop. 
self.vcpus_pause_signalled.store(false, Ordering::SeqCst); + let vcpu_states = self.vcpu_states.lock().unwrap(); + // Unpark all the vCPU threads. // Step 1/2: signal each thread { - for state in self.vcpu_states.iter() { + for state in vcpu_states.iter() { state.unpark_thread(); } } // Step 2/2: wait for state ACK { - for state in self.vcpu_states.iter() { + for state in vcpu_states.iter() { // wait for vCPU to update state while state.paused.load(Ordering::SeqCst) { // To avoid a priority inversion with the vCPU thread @@ -2577,7 +2731,7 @@ impl Debuggable for CpuManager { ]; // GDB exposes 32-bit eflags instead of 64-bit rflags. - // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml + // https://sourceware.org/git/?p=binutils-gdb.git;a=blob;f=gdb/features/i386/64bit-core.xml let eflags = gregs.get_rflags() as u32; let rip = gregs.get_rip(); @@ -2995,6 +3149,132 @@ impl CpuElf64Writable for CpuManager { } } +/// MMIO-accessible controller for handling ACPI hotplug and unplug events. +/// +/// Shares state about the vCPUs with the [`CpuManager`]. +pub struct AcpiCpuHotplugController { + /// The currently selected CPU by the guest. + selected_cpu: u32, + /// Shared vCPU state with [`CpuManager`]. + vcpu_states: Arc<Mutex<Vec<VcpuState>>>, + /// Maximum number of vCPUS of the VM. + max_vcpus: u32, +} + +impl AcpiCpuHotplugController { + const CPU_ENABLE_FLAG: usize = 0; + const CPU_INSERTING_FLAG: usize = 1; + const CPU_REMOVING_FLAG: usize = 2; + const CPU_EJECT_FLAG: usize = 3; + + const CPU_SELECTION_OFFSET: u64 = 0; + const CPU_STATUS_OFFSET: u64 = 4; + + /// Creates a new [`AcpiCpuHotplugController`]. + pub fn new(cpu_manager: &CpuManager) -> AcpiCpuHotplugController { + Self { + max_vcpus: cpu_manager.config.max_vcpus, + selected_cpu: 0, + vcpu_states: cpu_manager.vcpu_states.clone(), + } + } + + /// Removes a vCPU from the guest. + /// + /// The corresponding vCPU thread will be gracefully stopped and joined. 
+ fn remove_vcpu(cpu_id: u32, state: &mut VcpuState) -> Result<()> { + info!("Removing vCPU: cpu_id = {cpu_id}"); + state.kill.store(true, Ordering::SeqCst); + state.signal_thread(); + state.wait_until_signal_acknowledged()?; + state.join_thread()?; + state.handle = None; + + // Once the thread has exited, clear the "kill" so that it can reused + state.kill.store(false, Ordering::SeqCst); + state.pending_removal.store(false, Ordering::SeqCst); + + Ok(()) + } +} + +impl BusDevice for AcpiCpuHotplugController { + fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { + // The Linux kernel, quite reasonably, doesn't zero the memory it gives us. + data.fill(0); + let vcpu_states = self.vcpu_states.lock().unwrap(); + + match offset { + Self::CPU_SELECTION_OFFSET => { + assert!(data.len() >= core::mem::size_of::<u32>()); + data[0..core::mem::size_of::<u32>()] + .copy_from_slice(&self.selected_cpu.to_le_bytes()); + } + Self::CPU_STATUS_OFFSET => { + if self.selected_cpu < self.max_vcpus { + let state = &vcpu_states[usize::try_from(self.selected_cpu).unwrap()]; + if state.active() { + data[0] |= 1 << Self::CPU_ENABLE_FLAG; + } + if state.inserting { + data[0] |= 1 << Self::CPU_INSERTING_FLAG; + } + if state.removing { + data[0] |= 1 << Self::CPU_REMOVING_FLAG; + } + } else { + warn!("Out of range vCPU id: {}", self.selected_cpu); + } + } + _ => { + warn!("Unexpected offset for accessing CPU manager device: {offset:#}"); + } + } + } + + fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> { + match offset { + Self::CPU_SELECTION_OFFSET => { + assert!(data.len() >= core::mem::size_of::<u32>()); + self.selected_cpu = + u32::from_le_bytes(data[0..core::mem::size_of::<u32>()].try_into().unwrap()); + } + Self::CPU_STATUS_OFFSET => { + if self.selected_cpu < self.max_vcpus { + // This structure is not shared with the vCPU thread, therefore, holding the + // lock for the entire function doesn't cause any deadlock. 
+ let mut vcpu_states = self.vcpu_states.lock().unwrap(); + let state = &mut vcpu_states[usize::try_from(self.selected_cpu).unwrap()]; + // The ACPI code writes back a 1 to acknowledge the insertion + if (data[0] & (1 << Self::CPU_INSERTING_FLAG) == 1 << Self::CPU_INSERTING_FLAG) + && state.inserting + { + state.inserting = false; + } + // Ditto for removal + if (data[0] & (1 << Self::CPU_REMOVING_FLAG) == 1 << Self::CPU_REMOVING_FLAG) + && state.removing + { + state.removing = false; + } + // Trigger removal of vCPU: + if data[0] & (1 << Self::CPU_EJECT_FLAG) == 1 << Self::CPU_EJECT_FLAG + && let Err(e) = Self::remove_vcpu(self.selected_cpu, state) + { + error!("Error removing vCPU: {e:?}"); + } + } else { + warn!("Out of range vCPU id: {}", self.selected_cpu); + } + } + _ => { + warn!("Unexpected offset for accessing CPU manager device: {offset:#}"); + } + } + None + } +} + #[cfg(all(feature = "kvm", target_arch = "x86_64"))] #[cfg(test)] mod unit_tests { diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index a7d3254c3f..0a263e7c42 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -32,15 +32,18 @@ use arch::layout::{APIC_START, IOAPIC_SIZE, IOAPIC_START}; #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] use arch::{DeviceType, MmioDeviceInfo}; use arch::{NumaNodes, layout}; -use block::async_io::DiskFile; +use block::disk_file::DiskBackend; +use block::error::BlockError; use block::fixed_vhd_sync::FixedVhdDiskSync; +#[cfg(feature = "io_uring")] +use block::qcow_async::QcowDiskAsync; use block::qcow_sync::QcowDiskSync; use block::raw_async_aio::RawFileDiskAio; use block::raw_sync::RawFileDiskSync; use block::vhdx_sync::VhdxDiskSync; use block::{ ImageType, block_aio_is_supported, block_io_uring_is_supported, detect_image_type, - preallocate_disk, qcow, vhdx, + open_disk_image, preallocate_disk, }; #[cfg(feature = "io_uring")] use block::{fixed_vhd_async::FixedVhdDiskAsync, raw_async::RawFileDisk}; @@ -75,6 
+78,8 @@ use event_monitor::event; use hypervisor::IoEventAddress; #[cfg(target_arch = "aarch64")] use hypervisor::arch::aarch64::regs::AARCH64_PMU_IRQ; +#[cfg(feature = "kvm")] +use iommufd_ioctls::IommuFd; use libc::{ MAP_NORESERVE, MAP_PRIVATE, MAP_SHARED, O_TMPFILE, PROT_READ, PROT_WRITE, TCSANOW, tcsetattr, termios, @@ -89,7 +94,9 @@ use seccompiler::SeccompAction; use serde::{Deserialize, Serialize}; use thiserror::Error; use tracer::trace_scoped; -use vfio_ioctls::{VfioContainer, VfioDevice, VfioDeviceFd}; +#[cfg(feature = "kvm")] +use vfio_ioctls::VfioIommufd; +use vfio_ioctls::{VfioContainer, VfioDevice, VfioDeviceFd, VfioOps}; use virtio_devices::transport::{VirtioPciDevice, VirtioPciDeviceActivator, VirtioTransport}; use virtio_devices::vhost_user::VhostUserConfig; use virtio_devices::{ @@ -116,8 +123,8 @@ use vm_migration::{ use vm_virtio::{AccessPlatform, VirtioDeviceType}; use vmm_sys_util::eventfd::EventFd; -use crate::console_devices::{ConsoleDeviceError, ConsoleInfo, ConsoleOutput}; -use crate::cpu::{CPU_MANAGER_ACPI_SIZE, CpuManager}; +use crate::console_devices::{ConsoleDeviceError, ConsoleInfo, ConsoleTransport}; +use crate::cpu::{AcpiCpuHotplugController, CPU_MANAGER_ACPI_SIZE, CpuManager}; use crate::device_tree::{DeviceNode, DeviceTree}; use crate::interrupt::{LegacyUserspaceInterruptManager, MsiInterruptManager}; use crate::memory_manager::{Error as MemoryManagerError, MEMORY_MANAGER_ACPI_SIZE, MemoryManager}; @@ -127,8 +134,8 @@ use crate::serial_manager::{Error as SerialManagerError, SerialManager}; use crate::vm_config::IvshmemConfig; use crate::vm_config::{ ConsoleOutputMode, DEFAULT_IOMMU_ADDRESS_WIDTH_BITS, DEFAULT_PCI_SEGMENT_APERTURE_WEIGHT, - DeviceConfig, DiskConfig, FsConfig, NetConfig, PmemConfig, UserDeviceConfig, VdpaConfig, - VhostMode, VmConfig, VsockConfig, + DeviceConfig, DiskConfig, FsConfig, GenericVhostUserConfig, NetConfig, PciDeviceCommonConfig, + PmemConfig, UserDeviceConfig, VdpaConfig, VhostMode, VmConfig, 
VsockConfig, }; use crate::{DEVICE_MANAGER_SNAPSHOT_ID, GuestRegionMmap, PciDeviceInfo, device_node}; @@ -158,6 +165,7 @@ const IVSHMEM_DEVICE_NAME: &str = "__ivshmem"; const DISK_DEVICE_NAME_PREFIX: &str = "_disk"; const FS_DEVICE_NAME_PREFIX: &str = "_fs"; const NET_DEVICE_NAME_PREFIX: &str = "_net"; +const GENERIC_VHOST_USER_DEVICE_NAME_PREFIX: &str = "_generic_vhost_user"; const PMEM_DEVICE_NAME_PREFIX: &str = "_pmem"; const VDPA_DEVICE_NAME_PREFIX: &str = "_vdpa"; const VSOCK_DEVICE_NAME_PREFIX: &str = "_vsock"; @@ -175,7 +183,7 @@ pub enum DeviceManagerError { /// Cannot open disk path #[error("Cannot open disk path")] - Disk(#[source] io::Error), + Disk(#[source] BlockError), /// Cannot create vhost-user-net device #[error("Cannot create vhost-user-net device")] @@ -197,6 +205,10 @@ pub enum DeviceManagerError { #[error("Cannot create virtio-rng device")] CreateVirtioRng(#[source] io::Error), + /// Cannot create generic vhost-user device + #[error("Cannot create generic vhost-user device")] + CreateGenericVhostUser(#[source] virtio_devices::vhost_user::Error), + /// Cannot create virtio-fs device #[error("Cannot create virtio-fs device")] CreateVirtioFs(#[source] virtio_devices::vhost_user::Error), @@ -205,6 +217,10 @@ pub enum DeviceManagerError { #[error("Virtio-fs device was created without a socket")] NoVirtioFsSock, + /// Generic vhost-user device was created without a socket. 
+ #[error("Generic vhost-user device was created without a socket")] + NoGenericVhostUserSock, + /// Cannot create vhost-user-blk device #[error("Cannot create vhost-user-blk device")] CreateVhostUserBlk(#[source] virtio_devices::vhost_user::Error), @@ -256,11 +272,7 @@ pub enum DeviceManagerError { /// Failed to parse disk image format #[error("Failed to parse disk image format")] - DetectImageType(#[source] io::Error), - - /// Cannot open qcow disk path - #[error("Cannot open qcow disk path")] - QcowDeviceCreate(#[source] qcow::Error), + DetectImageType(#[source] BlockError), /// Cannot create serial manager #[error("Cannot create serial manager")] @@ -355,6 +367,15 @@ pub enum DeviceManagerError { #[error("Error getting pty peer")] GetPtyPeer(#[source] vmm_sys_util::errno::Error), + /// Cannot create iommufd + #[cfg(feature = "kvm")] + #[error("Cannot create iommufd")] + IommufdCreate(#[source] iommufd_ioctls::IommufdError), + + /// iommufd is not supported + #[error("iommufd is not supported without the kvm feature")] + IommufdNotSupported, + /// Cannot create a VFIO device #[error("Cannot create a VFIO device")] VfioCreate(#[source] vfio_ioctls::VfioError), @@ -473,15 +494,15 @@ pub enum DeviceManagerError { /// Failed to find an available PCI device ID. #[error("Failed to find an available PCI device ID")] - NextPciDeviceId(#[source] pci::PciRootError), + AllocatePciDeviceId(#[source] pci::PciRootError), /// Could not reserve the PCI device ID. #[error("Could not reserve the PCI device ID")] - GetPciDeviceId(#[source] pci::PciRootError), + ReservePciDeviceId(#[source] pci::PciRootError), - /// Could not give the PCI device ID back. - #[error("Could not give the PCI device ID back")] - PutPciDeviceId(#[source] pci::PciRootError), + /// Could not free the PCI device ID. 
+ #[error("Could not free PCI device ID")] + FreePciDeviceId(#[source] pci::PciRootError), /// No disk path was specified when one was expected #[error("No disk path was specified when one was expected")] @@ -562,19 +583,23 @@ pub enum DeviceManagerError { /// Failed to create FixedVhdDiskAsync #[error("Failed to create FixedVhdDiskAsync")] - CreateFixedVhdDiskAsync(#[source] io::Error), + CreateFixedVhdDiskAsync(#[source] BlockError), /// Failed to create FixedVhdDiskSync #[error("Failed to create FixedVhdDiskSync")] - CreateFixedVhdDiskSync(#[source] io::Error), + CreateFixedVhdDiskSync(#[source] BlockError), /// Failed to create QcowDiskSync #[error("Failed to create QcowDiskSync")] - CreateQcowDiskSync(#[source] qcow::Error), + CreateQcowDiskSync(#[source] BlockError), + + /// Failed to create QcowDiskAsync + #[error("Failed to create QcowDiskAsync")] + CreateQcowDiskAsync(#[source] BlockError), /// Failed to create FixedVhdxDiskSync #[error("Failed to create FixedVhdxDiskSync")] - CreateFixedVhdxDiskSync(#[source] vhdx::VhdxError), + CreateFixedVhdxDiskSync(#[source] BlockError), /// Failed to add DMA mapping handler to virtio-mem device. #[error("Failed to add DMA mapping handler to virtio-mem device")] @@ -632,6 +657,10 @@ pub enum DeviceManagerError { #[error("Invalid identifier: {0}")] InvalidIdentifier(String), + /// vfio-user socket path already in use by another user device. 
+ #[error("vfio-user socket path already in use: {0:?}")] + UserDeviceSocketInUse(std::path::PathBuf), + /// Error activating virtio device #[error("Error activating virtio device")] VirtioActivate(#[source] ActivateError), @@ -734,15 +763,10 @@ impl DeviceRelocation for AddressManager { ) -> std::result::Result<(), std::io::Error> { match region_type { PciBarRegionType::IoRegion => { + let mut sys_allocator = self.allocator.lock().unwrap(); // Update system allocator - self.allocator - .lock() - .unwrap() - .free_io_addresses(GuestAddress(old_base), len as GuestUsize); - - self.allocator - .lock() - .unwrap() + sys_allocator.free_io_addresses(GuestAddress(old_base), len as GuestUsize); + sys_allocator .allocate_io_addresses(Some(GuestAddress(new_base)), len as GuestUsize, None) .ok_or_else(|| io::Error::other("failed allocating new IO range"))?; @@ -752,26 +776,22 @@ impl DeviceRelocation for AddressManager { .map_err(io::Error::other)?; } PciBarRegionType::Memory32BitRegion | PciBarRegionType::Memory64BitRegion => { - let allocators = if region_type == PciBarRegionType::Memory32BitRegion { + let pci_mmio_allocators = if region_type == PciBarRegionType::Memory32BitRegion { &self.pci_mmio32_allocators } else { &self.pci_mmio64_allocators }; - // Find the specific allocator that this BAR was allocated from and use it for new one - for allocator in allocators { - let allocator_base = allocator.lock().unwrap().base(); - let allocator_end = allocator.lock().unwrap().end(); + // Find the specific allocator that this BAR was allocated from and use it for a new one + for pci_mmio_allocator_mutex in pci_mmio_allocators { + let mut pci_mmio_allocator = pci_mmio_allocator_mutex.lock().unwrap(); - if old_base >= allocator_base.0 && old_base <= allocator_end.0 { - allocator - .lock() - .unwrap() - .free(GuestAddress(old_base), len as GuestUsize); + if old_base >= pci_mmio_allocator.base().0 + && old_base <= pci_mmio_allocator.end().0 + { + 
pci_mmio_allocator.free(GuestAddress(old_base), len as GuestUsize); - allocator - .lock() - .unwrap() + pci_mmio_allocator .allocate(Some(GuestAddress(new_base)), len as GuestUsize, Some(len)) .ok_or_else(|| io::Error::other("failed allocating new MMIO range"))?; @@ -916,12 +936,19 @@ pub enum PciDeviceHandle { #[derive(Clone)] struct MetaVirtioDevice { virtio_device: Arc>, - iommu: bool, - id: String, - pci_segment: u16, + pci_common: PciDeviceCommonConfig, dma_handler: Option>, } +impl MetaVirtioDevice { + fn id(&self) -> &str { + self.pci_common + .id + .as_deref() + .expect("ID should have been assigned before use") + } +} + #[derive(Default)] pub struct AcpiPlatformAddresses { pub pm_timer_address: Option, @@ -930,26 +957,26 @@ pub struct AcpiPlatformAddresses { pub sleep_status_reg_address: Option, } -#[cfg(all(feature = "mshv", feature = "sev_snp"))] +#[cfg(feature = "sev_snp")] struct SevSnpPageAccessProxy { vm: Arc, } -#[cfg(all(feature = "mshv", feature = "sev_snp"))] +#[cfg(feature = "sev_snp")] impl std::fmt::Debug for SevSnpPageAccessProxy { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "SNP Page access proxy") } } -#[cfg(all(feature = "mshv", feature = "sev_snp"))] +#[cfg(feature = "sev_snp")] impl SevSnpPageAccessProxy { fn new(vm: Arc) -> SevSnpPageAccessProxy { SevSnpPageAccessProxy { vm } } } -#[cfg(all(feature = "mshv", feature = "sev_snp"))] +#[cfg(feature = "sev_snp")] impl AccessPlatform for SevSnpPageAccessProxy { fn translate_gpa(&self, base: u64, _size: u64) -> std::result::Result { Ok(base) @@ -1003,6 +1030,10 @@ pub struct DeviceManager { // CPU Manager cpu_manager: Arc>, + /// Owned version needed to keep the bus device alive (the bus only holds + /// a weak reference). 
+ _acpi_cpu_hotplug_controller: Arc>, + // The virtio devices on the system virtio_devices: Vec, @@ -1031,10 +1062,10 @@ pub struct DeviceManager { // Passthrough device handle passthrough_device: Option, - // VFIO container - // Only one container can be created, therefore it is stored as part of the + // VFIO operation instance + // Only one can be created, therefore it is stored as part of the // DeviceManager to be reused. - vfio_container: Option>, + vfio_ops: Option>, // Paravirtualized IOMMU iommu_device: Option>>, @@ -1053,6 +1084,7 @@ pub struct DeviceManager { // Exit event exit_evt: EventFd, reset_evt: EventFd, + guest_exit_evt: EventFd, #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] id_to_dev_info: HashMap<(DeviceType, String), MmioDeviceInfo>, @@ -1094,8 +1126,8 @@ pub struct DeviceManager { // pvpanic device pvpanic_device: Option>>, - // Flag to force setting the iommu on virtio devices - force_iommu: bool, + // Force VIRTIO_F_ACCESS_PLATFORM on all virtio devices (e.g. for TDX/SEV-SNP) + force_access_platform: bool, // io_uring availability if detected io_uring_supported: Option, @@ -1130,6 +1162,8 @@ pub struct DeviceManager { ivshmem_device: Option>>, } +/// Create per-PCI-segment MMIO allocators over the range `[start, end]`. +/// Both `start` and `end` are inclusive addresses. fn create_mmio_allocators( start: u64, end: u64, @@ -1147,7 +1181,15 @@ fn create_mmio_allocators( for segment_id in 0..num_pci_segments as u64 { let weight = weights[segment_id as usize] as u64; let mmio_start = start + i * pci_segment_mmio_size; - let mmio_size = pci_segment_mmio_size * weight; + let is_last = segment_id == num_pci_segments as u64 - 1; + // Give the last segment all remaining space so no addresses + // near the top of the physical address space are lost to + // alignment truncation. 
+ let mmio_size = if is_last { + end - mmio_start + 1 + } else { + pci_segment_mmio_size * weight + }; let allocator = Arc::new(Mutex::new( AddressAllocator::new(GuestAddress(mmio_start), mmio_size).unwrap(), )); @@ -1158,6 +1200,14 @@ fn create_mmio_allocators( mmio_allocators } +fn use_64bit_bar_for_virtio_device( + device_type: u32, + pci_segment_id: u16, + is_hotplug: bool, +) -> bool { + pci_segment_id > 0 || device_type != VirtioDeviceType::Block as u32 || is_hotplug +} + impl DeviceManager { #[allow(clippy::too_many_arguments)] pub fn new( @@ -1169,10 +1219,11 @@ impl DeviceManager { cpu_manager: Arc>, exit_evt: EventFd, reset_evt: EventFd, + guest_exit_evt: EventFd, seccomp_action: SeccompAction, numa_nodes: NumaNodes, activate_evt: &EventFd, - force_iommu: bool, + force_access_platform: bool, boot_id_list: BTreeSet, #[cfg(not(target_arch = "riscv64"))] timestamp: Instant, snapshot: Option<&Snapshot>, @@ -1208,7 +1259,8 @@ impl DeviceManager { } let start_of_mmio32_area = layout::MEM_32BIT_DEVICES_START.0; - let end_of_mmio32_area = layout::MEM_32BIT_DEVICES_START.0 + layout::MEM_32BIT_DEVICES_SIZE; + let end_of_mmio32_area = + layout::MEM_32BIT_DEVICES_START.0 + layout::MEM_32BIT_DEVICES_SIZE - 1; let pci_mmio32_allocators = create_mmio_allocators( start_of_mmio32_area, end_of_mmio32_area, @@ -1290,6 +1342,10 @@ impl DeviceManager { )?); } + let acpi_cpu_hotplug_controller = + AcpiCpuHotplugController::new(&cpu_manager.lock().unwrap()); + let acpi_cpu_hotplug_controller = Arc::new(Mutex::new(acpi_cpu_hotplug_controller)); + if dynamic { let acpi_address = address_manager .allocator @@ -1301,7 +1357,7 @@ impl DeviceManager { address_manager .mmio_bus .insert( - cpu_manager.clone(), + acpi_cpu_hotplug_controller.clone(), acpi_address.0, CPU_MANAGER_ACPI_SIZE as u64, ) @@ -1352,7 +1408,7 @@ impl DeviceManager { msi_interrupt_manager, legacy_interrupt_manager: None, passthrough_device: None, - vfio_container: None, + vfio_ops: None, iommu_device: None, 
iommu_mapping: None, iommu_attached_devices: None, @@ -1360,6 +1416,7 @@ impl DeviceManager { device_tree, exit_evt, reset_evt, + guest_exit_evt, #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] id_to_dev_info: HashMap::new(), seccomp_action, @@ -1380,7 +1437,7 @@ impl DeviceManager { #[cfg(feature = "pvmemcontrol")] pvmemcontrol_devices: None, pvpanic_device: None, - force_iommu, + force_access_platform, io_uring_supported: None, aio_supported: None, boot_id_list, @@ -1395,6 +1452,7 @@ impl DeviceManager { fw_cfg: None, #[cfg(feature = "ivshmem")] ivshmem_device: None, + _acpi_cpu_hotplug_controller: acpi_cpu_hotplug_controller, }; let device_manager = Arc::new(Mutex::new(device_manager)); @@ -1473,7 +1531,7 @@ impl DeviceManager { self.reset_evt .try_clone() .map_err(DeviceManagerError::EventFd)?, - self.exit_evt + self.guest_exit_evt .try_clone() .map_err(DeviceManagerError::EventFd)?, )?; @@ -1647,25 +1705,32 @@ impl DeviceManager { let mut iommu_attached_devices = Vec::new(); { + // Reserve all explicit PCI device IDs before any device creation + // so that they won't be picked for dynamic allocation. 
+ self.reserve_explicit_device_ids()?; + for handle in self.virtio_devices.clone() { - let mapping: Option> = if handle.iommu { + let mapping: Option> = if handle.pci_common.iommu { self.iommu_mapping.clone() } else { None }; + let id = handle.id().to_owned(); let dev_id = self.add_virtio_pci_device( handle.virtio_device, &mapping, - &handle.id, - handle.pci_segment, + &id, + handle.pci_common.pci_segment, + false, handle.dma_handler, + handle.pci_common.pci_device_id, )?; // Track device BDF for Generic Initiator support - self.device_id_to_bdf.insert(handle.id.clone(), dev_id); + self.device_id_to_bdf.insert(id, dev_id); - if handle.iommu { + if handle.pci_common.iommu { iommu_attached_devices.push(dev_id); } } @@ -1691,7 +1756,15 @@ impl DeviceManager { } if let Some(iommu_device) = iommu_device { - let dev_id = self.add_virtio_pci_device(iommu_device, &None, &iommu_id, 0, None)?; + let dev_id = self.add_virtio_pci_device( + iommu_device, + &None, + &iommu_id, + 0, + false, + None, + None, + )?; self.iommu_attached_devices = Some((dev_id, iommu_attached_devices)); } } @@ -1759,7 +1832,7 @@ impl DeviceManager { } #[cfg(target_arch = "aarch64")] - pub fn get_interrupt_controller(&mut self) -> Option<&Arc>> { + pub fn get_interrupt_controller(&self) -> Option<&Arc>> { self.interrupt_controller.as_ref() } @@ -1796,7 +1869,7 @@ impl DeviceManager { } #[cfg(target_arch = "riscv64")] - pub fn get_interrupt_controller(&mut self) -> Option<&Arc>> { + pub fn get_interrupt_controller(&self) -> Option<&Arc>> { self.interrupt_controller.as_ref() } @@ -1844,7 +1917,7 @@ impl DeviceManager { &mut self, interrupt_manager: &dyn InterruptManager, reset_evt: EventFd, - exit_evt: EventFd, + guest_exit_evt: EventFd, ) -> DeviceManagerResult>>> { let vcpus_kill_signalled = self .cpu_manager @@ -1853,7 +1926,7 @@ impl DeviceManager { .vcpus_kill_signalled() .clone(); let shutdown_device = Arc::new(Mutex::new(devices::AcpiShutdownDevice::new( - exit_evt, + guest_exit_evt, reset_evt, 
vcpus_kill_signalled, ))); @@ -2338,17 +2411,17 @@ impl DeviceManager { fn add_virtio_console_device( &mut self, - console_fd: ConsoleOutput, + transport: ConsoleTransport, resize_pipe: Option>, ) -> DeviceManagerResult>> { let console_config = self.config.lock().unwrap().console.clone(); - let endpoint = match console_fd { - ConsoleOutput::File(file) => Endpoint::File(file), - ConsoleOutput::Pty(file) => { + let endpoint = match transport { + ConsoleTransport::File(file) => Endpoint::File(file), + ConsoleTransport::Pty(file) => { self.console_resize_pipe = resize_pipe; Endpoint::PtyPair(Arc::new(file.try_clone().unwrap()), file) } - ConsoleOutput::Tty(stdout) => { + ConsoleTransport::Tty(stdout) => { if stdout.is_terminal() { self.console_resize_pipe = resize_pipe; } @@ -2369,11 +2442,11 @@ impl DeviceManager { Endpoint::File(stdout) } } - ConsoleOutput::Socket(_) => { + ConsoleTransport::Socket(_) => { return Err(DeviceManagerError::NoSocketOptionSupportForConsoleDevice); } - ConsoleOutput::Null => Endpoint::Null, - ConsoleOutput::Off => return Ok(None), + ConsoleTransport::Null => Endpoint::Null, + ConsoleTransport::Off => return Ok(None), }; let id = String::from(CONSOLE_DEVICE_NAME); @@ -2383,7 +2456,7 @@ impl DeviceManager { self.console_resize_pipe .as_ref() .map(|p| p.try_clone().unwrap()), - self.force_iommu | console_config.iommu, + self.force_access_platform | console_config.iommu, self.seccomp_action.clone(), self.exit_evt .try_clone() @@ -2396,9 +2469,11 @@ impl DeviceManager { self.virtio_devices.push(MetaVirtioDevice { virtio_device: Arc::clone(&virtio_console_device) as Arc>, - iommu: console_config.iommu, - id: id.clone(), - pci_segment: 0, + pci_common: PciDeviceCommonConfig { + id: Some(id.clone()), + iommu: console_config.iommu, + ..Default::default() + }, dma_handler: None, }); @@ -2437,26 +2512,25 @@ impl DeviceManager { // SAFETY: console_info is Some, so it's safe to unwrap. 
let console_info = console_info.unwrap(); - let serial_writer: Option> = match console_info.serial_main_fd { - ConsoleOutput::File(ref file) | ConsoleOutput::Tty(ref file) => { + let serial_writer: Option> = match console_info.serial { + ConsoleTransport::File(ref file) | ConsoleTransport::Tty(ref file) => { Some(Box::new(Arc::clone(file))) } - ConsoleOutput::Off - | ConsoleOutput::Null - | ConsoleOutput::Pty(_) - | ConsoleOutput::Socket(_) => None, + ConsoleTransport::Off + | ConsoleTransport::Null + | ConsoleTransport::Pty(_) + | ConsoleTransport::Socket(_) => None, }; - if !matches!(console_info.serial_main_fd, ConsoleOutput::Off) { + if !matches!(console_info.serial, ConsoleTransport::Off) { let serial = self.add_serial_device(interrupt_manager, serial_writer)?; - self.serial_manager = match console_info.serial_main_fd { - ConsoleOutput::Pty(_) | ConsoleOutput::Tty(_) | ConsoleOutput::Socket(_) => { - let serial_manager = SerialManager::new( - serial, - console_info.serial_main_fd, - serial_config.socket, - ) - .map_err(DeviceManagerError::CreateSerialManager)?; + self.serial_manager = match console_info.serial { + ConsoleTransport::Pty(_) + | ConsoleTransport::Tty(_) + | ConsoleTransport::Socket(_) => { + let serial_manager = + SerialManager::new(serial, console_info.serial, serial_config.socket) + .map_err(DeviceManagerError::CreateSerialManager)?; if let Some(mut serial_manager) = serial_manager { serial_manager .start_thread( @@ -2476,21 +2550,20 @@ impl DeviceManager { #[cfg(target_arch = "x86_64")] { - let debug_console_writer: Option> = - match console_info.debug_main_fd { - ConsoleOutput::File(file) | ConsoleOutput::Tty(file) => Some(Box::new(file)), - ConsoleOutput::Off - | ConsoleOutput::Null - | ConsoleOutput::Pty(_) - | ConsoleOutput::Socket(_) => None, - }; + let debug_console_writer: Option> = match console_info.debug { + ConsoleTransport::File(file) | ConsoleTransport::Tty(file) => Some(Box::new(file)), + ConsoleTransport::Off + | 
ConsoleTransport::Null + | ConsoleTransport::Pty(_) + | ConsoleTransport::Socket(_) => None, + }; if let Some(writer) = debug_console_writer { let _ = self.add_debug_console_device(writer)?; } } let console_resizer = - self.add_virtio_console_device(console_info.console_main_fd, console_resize_pipe)?; + self.add_virtio_console_device(console_info.console, console_resize_pipe)?; Ok(Arc::new(Console { console_resizer })) } @@ -2554,6 +2627,9 @@ impl DeviceManager { self.make_virtio_net_devices()?; self.make_virtio_rng_devices()?; + // Add generic vhost-user if required + self.make_generic_vhost_user_devices()?; + // Add virtio-fs if required self.make_virtio_fs_devices()?; @@ -2614,12 +2690,13 @@ impl DeviceManager { disk_cfg: &mut DiskConfig, is_hotplug: bool, ) -> DeviceManagerResult { - let id = if let Some(id) = &disk_cfg.id { - id.clone() - } else { - let id = self.next_device_name(DISK_DEVICE_NAME_PREFIX)?; - disk_cfg.id = Some(id.clone()); - id + let id = match disk_cfg.pci_common.id.as_ref() { + Some(id) => id.clone(), + None => disk_cfg + .pci_common + .id + .insert(self.next_device_name(DISK_DEVICE_NAME_PREFIX)?) + .clone(), }; info!("Creating virtio-block device: {disk_cfg:?}"); @@ -2642,7 +2719,7 @@ impl DeviceManager { self.exit_evt .try_clone() .map_err(DeviceManagerError::EventFd)?, - self.force_iommu, + self.force_access_platform, state_from_id(self.snapshot.as_ref(), id.as_str()) .map_err(DeviceManagerError::RestoreGetState)?, ) { @@ -2665,15 +2742,12 @@ impl DeviceManager { options.custom_flags(libc::O_DIRECT); } // Open block device path - let mut file: File = options - .open( - disk_cfg - .path - .as_ref() - .ok_or(DeviceManagerError::NoDiskPath)? 
- .clone(), - ) - .map_err(DeviceManagerError::Disk)?; + let disk_path = disk_cfg + .path + .as_ref() + .ok_or(DeviceManagerError::NoDiskPath)?; + let mut file: File = + open_disk_image(disk_path, &options).map_err(DeviceManagerError::Disk)?; let detected_image_type = detect_image_type(&mut file).map_err(DeviceManagerError::DetectImageType)?; @@ -2726,17 +2800,17 @@ impl DeviceManager { unreachable!("Checked in if statement above"); #[cfg(feature = "io_uring")] { - Box::new( + DiskBackend::Next(Box::new( FixedVhdDiskAsync::new(file) .map_err(DeviceManagerError::CreateFixedVhdDiskAsync)?, - ) as Box + )) } } else { info!("Using synchronous fixed VHD disk file"); - Box::new( + DiskBackend::Next(Box::new( FixedVhdDiskSync::new(file) .map_err(DeviceManagerError::CreateFixedVhdDiskSync)?, - ) as Box + )) } } ImageType::Raw => { @@ -2760,34 +2834,64 @@ impl DeviceManager { unreachable!("Checked in if statement above"); #[cfg(feature = "io_uring")] { - Box::new(RawFileDisk::new(file)) as Box + DiskBackend::Next(Box::new(RawFileDisk::new(file))) } } else if !disk_cfg.disable_aio && self.aio_is_supported() { info!("Using asynchronous RAW disk file (aio)"); - Box::new(RawFileDiskAio::new(file)) as Box + DiskBackend::Next(Box::new(RawFileDiskAio::new(file))) } else { info!("Using synchronous RAW disk file"); - Box::new(RawFileDiskSync::new(file)) as Box + DiskBackend::Next(Box::new(RawFileDiskSync::new(file))) } } ImageType::Qcow2 => { - info!("Using synchronous QCOW2 disk file"); - Box::new( - QcowDiskSync::new( - file, - disk_cfg.direct, - disk_cfg.backing_files, - disk_cfg.sparse, - ) - .map_err(DeviceManagerError::CreateQcowDiskSync)?, - ) as Box + if cfg!(feature = "io_uring") + && !disk_cfg.disable_io_uring + && self.io_uring_is_supported() + { + info!("Using asynchronous QCOW2 disk file (io_uring)"); + + #[cfg(not(feature = "io_uring"))] + unreachable!("Checked in if statement above"); + #[cfg(feature = "io_uring")] + { + DiskBackend::Next(Box::new( + 
QcowDiskAsync::new( + file, + disk_cfg.direct, + disk_cfg.backing_files, + disk_cfg.sparse, + ) + .map_err(|e| match &disk_cfg.path { + Some(p) => e.with_path(p), + None => e, + }) + .map_err(DeviceManagerError::CreateQcowDiskAsync)?, + )) + } + } else { + info!("Using synchronous QCOW2 disk file"); + DiskBackend::Next(Box::new( + QcowDiskSync::new( + file, + disk_cfg.direct, + disk_cfg.backing_files, + disk_cfg.sparse, + ) + .map_err(|e| match &disk_cfg.path { + Some(p) => e.with_path(p), + None => e, + }) + .map_err(DeviceManagerError::CreateQcowDiskSync)?, + )) + } } ImageType::Vhdx => { info!("Using synchronous VHDX disk file"); - Box::new( + DiskBackend::Next(Box::new( VhdxDiskSync::new(file) .map_err(DeviceManagerError::CreateFixedVhdxDiskSync)?, - ) as Box + )) } ImageType::Unknown => unreachable!(), }; @@ -2799,7 +2903,7 @@ impl DeviceManager { let bw = rate_limiter_cfg.bandwidth.unwrap_or_default(); let ops = rate_limiter_cfg.ops.unwrap_or_default(); let mut rate_limit_group = RateLimiterGroup::new( - disk_cfg.id.as_ref().unwrap(), + disk_cfg.pci_common.id.as_ref().unwrap(), bw.size, bw.one_time_burst.unwrap_or(0), bw.refill_time, @@ -2842,7 +2946,7 @@ impl DeviceManager { .ok_or(DeviceManagerError::NoDiskPath)? 
.clone(), disk_cfg.readonly, - self.force_iommu | disk_cfg.iommu, + self.force_access_platform | disk_cfg.pci_common.iommu, disk_cfg.num_queues, disk_cfg.queue_size, disk_cfg.serial.clone(), @@ -2856,6 +2960,7 @@ impl DeviceManager { queue_affinity, disk_cfg.sparse, disable_sector0_writes, + disk_cfg.lock_granularity, ) .map_err(DeviceManagerError::CreateVirtioBlock)?; @@ -2889,9 +2994,7 @@ impl DeviceManager { Ok(MetaVirtioDevice { virtio_device, - iommu: disk_cfg.iommu, - id, - pci_segment: disk_cfg.pci_segment, + pci_common: disk_cfg.pci_common.clone(), dma_handler: None, }) } @@ -2913,12 +3016,13 @@ impl DeviceManager { &mut self, net_cfg: &mut NetConfig, ) -> DeviceManagerResult { - let id = if let Some(id) = &net_cfg.id { - id.clone() - } else { - let id = self.next_device_name(NET_DEVICE_NAME_PREFIX)?; - net_cfg.id = Some(id.clone()); - id + let id = match net_cfg.pci_common.id.as_ref() { + Some(id) => id.clone(), + None => net_cfg + .pci_common + .id + .insert(self.next_device_name(NET_DEVICE_NAME_PREFIX)?) 
+ .clone(), }; info!("Creating virtio-net device: {net_cfg:?}"); @@ -2944,7 +3048,7 @@ impl DeviceManager { self.exit_evt .try_clone() .map_err(DeviceManagerError::EventFd)?, - self.force_iommu, + self.force_access_platform, state_from_id(self.snapshot.as_ref(), id.as_str()) .map_err(DeviceManagerError::RestoreGetState)?, net_cfg.offload_tso, @@ -2975,7 +3079,7 @@ impl DeviceManager { Some(net_cfg.mac), &mut net_cfg.host_mac, net_cfg.mtu, - self.force_iommu | net_cfg.iommu, + self.force_access_platform | net_cfg.pci_common.iommu, net_cfg.num_queues, net_cfg.queue_size, self.seccomp_action.clone(), @@ -2996,7 +3100,7 @@ impl DeviceManager { fds, Some(net_cfg.mac), net_cfg.mtu, - self.force_iommu | net_cfg.iommu, + self.force_access_platform | net_cfg.pci_common.iommu, net_cfg.queue_size, self.seccomp_action.clone(), net_cfg.rate_limiter_config, @@ -3026,7 +3130,7 @@ impl DeviceManager { Some(net_cfg.mac), &mut net_cfg.host_mac, net_cfg.mtu, - self.force_iommu | net_cfg.iommu, + self.force_access_platform | net_cfg.pci_common.iommu, net_cfg.num_queues, net_cfg.queue_size, self.seccomp_action.clone(), @@ -3059,9 +3163,7 @@ impl DeviceManager { Ok(MetaVirtioDevice { virtio_device, - iommu: net_cfg.iommu, - id, - pci_segment: net_cfg.pci_segment, + pci_common: net_cfg.pci_common.clone(), dma_handler: None, }) } @@ -3091,7 +3193,7 @@ impl DeviceManager { virtio_devices::Rng::new( id.clone(), rng_path, - self.force_iommu | rng_config.iommu, + self.force_access_platform | rng_config.iommu, self.seccomp_action.clone(), self.exit_evt .try_clone() @@ -3104,9 +3206,11 @@ impl DeviceManager { self.virtio_devices.push(MetaVirtioDevice { virtio_device: Arc::clone(&virtio_rng_device) as Arc>, - iommu: rng_config.iommu, - id: id.clone(), - pci_segment: 0, + pci_common: PciDeviceCommonConfig { + id: Some(id.clone()), + iommu: rng_config.iommu, + ..Default::default() + }, dma_handler: None, }); @@ -3122,16 +3226,82 @@ impl DeviceManager { Ok(()) } + fn make_generic_vhost_user_device( 
+ &mut self, + generic_vhost_user_cfg: &mut GenericVhostUserConfig, + ) -> DeviceManagerResult { + let id = match generic_vhost_user_cfg.pci_common.id.as_ref() { + Some(id) => id.clone(), + None => generic_vhost_user_cfg + .pci_common + .id + .insert(self.next_device_name(GENERIC_VHOST_USER_DEVICE_NAME_PREFIX)?) + .clone(), + }; + + info!("Creating generic vhost-user device: {generic_vhost_user_cfg:?}"); + + let mut node = device_node!(id); + + if let Some(generic_vhost_user_socket) = generic_vhost_user_cfg.socket.to_str() { + let generic_vhost_user_device = Arc::new(Mutex::new( + virtio_devices::vhost_user::GenericVhostUser::new( + id.clone(), + generic_vhost_user_socket, + generic_vhost_user_cfg.queue_sizes.clone(), + generic_vhost_user_cfg.device_type, + None, + self.seccomp_action.clone(), + self.exit_evt + .try_clone() + .map_err(DeviceManagerError::EventFd)?, + self.force_access_platform, + state_from_id(self.snapshot.as_ref(), id.as_str()) + .map_err(DeviceManagerError::RestoreGetState)?, + ) + .map_err(DeviceManagerError::CreateGenericVhostUser)?, + )); + + // Update the device tree with the migratable device. 
+ node.migratable = + Some(Arc::clone(&generic_vhost_user_device) as Arc>); + self.device_tree.lock().unwrap().insert(id.clone(), node); + + Ok(MetaVirtioDevice { + virtio_device: Arc::clone(&generic_vhost_user_device) + as Arc>, + pci_common: generic_vhost_user_cfg.pci_common.clone(), + dma_handler: None, + }) + } else { + Err(DeviceManagerError::NoGenericVhostUserSock) + } + } + + fn make_generic_vhost_user_devices(&mut self) -> DeviceManagerResult<()> { + let mut generic_vhost_user_devices = self.config.lock().unwrap().generic_vhost_user.clone(); + if let Some(generic_vhost_user_list_cfg) = &mut generic_vhost_user_devices { + for generic_vhost_user_cfg in generic_vhost_user_list_cfg.iter_mut() { + let device = self.make_generic_vhost_user_device(generic_vhost_user_cfg)?; + self.virtio_devices.push(device); + } + } + self.config.lock().unwrap().generic_vhost_user = generic_vhost_user_devices; + + Ok(()) + } + fn make_virtio_fs_device( &mut self, fs_cfg: &mut FsConfig, ) -> DeviceManagerResult { - let id = if let Some(id) = &fs_cfg.id { - id.clone() - } else { - let id = self.next_device_name(FS_DEVICE_NAME_PREFIX)?; - fs_cfg.id = Some(id.clone()); - id + let id = match fs_cfg.pci_common.id.as_ref() { + Some(id) => id.clone(), + None => fs_cfg + .pci_common + .id + .insert(self.next_device_name(FS_DEVICE_NAME_PREFIX)?) 
+ .clone(), }; info!("Creating virtio-fs device: {fs_cfg:?}"); @@ -3151,7 +3321,7 @@ impl DeviceManager { self.exit_evt .try_clone() .map_err(DeviceManagerError::EventFd)?, - self.force_iommu, + self.force_access_platform, state_from_id(self.snapshot.as_ref(), id.as_str()) .map_err(DeviceManagerError::RestoreGetState)?, ) @@ -3165,9 +3335,7 @@ impl DeviceManager { Ok(MetaVirtioDevice { virtio_device: Arc::clone(&virtio_fs_device) as Arc>, - iommu: false, - id, - pci_segment: fs_cfg.pci_segment, + pci_common: fs_cfg.pci_common.clone(), dma_handler: None, }) } else { @@ -3192,12 +3360,13 @@ impl DeviceManager { &mut self, pmem_cfg: &mut PmemConfig, ) -> DeviceManagerResult { - let id = if let Some(id) = &pmem_cfg.id { - id.clone() - } else { - let id = self.next_device_name(PMEM_DEVICE_NAME_PREFIX)?; - pmem_cfg.id = Some(id.clone()); - id + let id = match pmem_cfg.pci_common.id.as_ref() { + Some(id) => id.clone(), + None => pmem_cfg + .pci_common + .id + .insert(self.next_device_name(PMEM_DEVICE_NAME_PREFIX)?) + .clone(), }; info!("Creating virtio-pmem device: {pmem_cfg:?}"); @@ -3268,7 +3437,7 @@ impl DeviceManager { let (region_base, region_size) = if let Some((base, size)) = region_range { // The memory needs to be 2MiB aligned in order to support // hugepages. - self.pci_segments[pmem_cfg.pci_segment as usize] + self.pci_segments[pmem_cfg.pci_common.pci_segment as usize] .mem64_allocator .lock() .unwrap() @@ -3283,7 +3452,7 @@ impl DeviceManager { } else { // The memory needs to be 2MiB aligned in order to support // hugepages. 
- let base = self.pci_segments[pmem_cfg.pci_segment as usize] + let base = self.pci_segments[pmem_cfg.pci_common.pci_segment as usize] .mem64_allocator .lock() .unwrap() @@ -3331,7 +3500,7 @@ impl DeviceManager { file, GuestAddress(region_base), mapping, - self.force_iommu | pmem_cfg.iommu, + self.force_access_platform | pmem_cfg.pci_common.iommu, self.seccomp_action.clone(), self.exit_evt .try_clone() @@ -3354,9 +3523,7 @@ impl DeviceManager { Ok(MetaVirtioDevice { virtio_device: Arc::clone(&virtio_pmem_device) as Arc>, - iommu: pmem_cfg.iommu, - id, - pci_segment: pmem_cfg.pci_segment, + pci_common: pmem_cfg.pci_common.clone(), dma_handler: None, }) } @@ -3379,12 +3546,13 @@ impl DeviceManager { &mut self, vsock_cfg: &mut VsockConfig, ) -> DeviceManagerResult { - let id = if let Some(id) = &vsock_cfg.id { - id.clone() - } else { - let id = self.next_device_name(VSOCK_DEVICE_NAME_PREFIX)?; - vsock_cfg.id = Some(id.clone()); - id + let id = match vsock_cfg.pci_common.id.as_ref() { + Some(id) => id.clone(), + None => vsock_cfg + .pci_common + .id + .insert(self.next_device_name(VSOCK_DEVICE_NAME_PREFIX)?) 
+ .clone(), }; info!("Creating virtio-vsock device: {vsock_cfg:?}"); @@ -3403,7 +3571,7 @@ impl DeviceManager { vsock_cfg.cid, vsock_cfg.socket.clone(), backend, - self.force_iommu | vsock_cfg.iommu, + self.force_access_platform | vsock_cfg.pci_common.iommu, self.seccomp_action.clone(), self.exit_evt .try_clone() @@ -3425,9 +3593,7 @@ impl DeviceManager { Ok(MetaVirtioDevice { virtio_device: Arc::clone(&vsock_device) as Arc>, - iommu: vsock_cfg.iommu, - id, - pci_segment: vsock_cfg.pci_segment, + pci_common: vsock_cfg.pci_common.clone(), dma_handler: None, }) } @@ -3481,9 +3647,10 @@ impl DeviceManager { self.virtio_devices.push(MetaVirtioDevice { virtio_device: Arc::clone(&virtio_mem_device) as Arc>, - iommu: false, - id: memory_zone_id.clone(), - pci_segment: 0, + pci_common: PciDeviceCommonConfig { + id: Some(memory_zone_id.clone()), + ..Default::default() + }, dma_handler: None, }); @@ -3511,7 +3678,7 @@ impl DeviceManager { let pci_segment_id = 0x0_u16; let (pci_segment_id, pci_device_bdf, resources) = - self.pci_resources(&id, pci_segment_id)?; + self.pci_resources(&id, pci_segment_id, None)?; info!("Creating pvmemcontrol device: id = {id}"); let (pvmemcontrol_pci_device, pvmemcontrol_bus_device) = @@ -3553,6 +3720,7 @@ impl DeviceManager { balloon_config.size, balloon_config.deflate_on_oom, balloon_config.free_page_reporting, + self.force_access_platform, self.seccomp_action.clone(), self.exit_evt .try_clone() @@ -3568,9 +3736,10 @@ impl DeviceManager { self.virtio_devices.push(MetaVirtioDevice { virtio_device: Arc::clone(&virtio_balloon_device) as Arc>, - iommu: false, - id: id.clone(), - pci_segment: 0, + pci_common: PciDeviceCommonConfig { + id: Some(id.clone()), + ..Default::default() + }, dma_handler: None, }); @@ -3607,9 +3776,10 @@ impl DeviceManager { self.virtio_devices.push(MetaVirtioDevice { virtio_device: Arc::clone(&virtio_watchdog_device) as Arc>, - iommu: false, - id: id.clone(), - pci_segment: 0, + pci_common: PciDeviceCommonConfig { + id: 
Some(id.clone()), + ..Default::default() + }, dma_handler: None, }); @@ -3625,12 +3795,13 @@ impl DeviceManager { &mut self, vdpa_cfg: &mut VdpaConfig, ) -> DeviceManagerResult { - let id = if let Some(id) = &vdpa_cfg.id { - id.clone() - } else { - let id = self.next_device_name(VDPA_DEVICE_NAME_PREFIX)?; - vdpa_cfg.id = Some(id.clone()); - id + let id = match vdpa_cfg.pci_common.id.as_ref() { + Some(id) => id.clone(), + None => vdpa_cfg + .pci_common + .id + .insert(self.next_device_name(VDPA_DEVICE_NAME_PREFIX)?) + .clone(), }; info!("Creating vDPA device: {vdpa_cfg:?}"); @@ -3665,9 +3836,7 @@ impl DeviceManager { Ok(MetaVirtioDevice { virtio_device: vdpa_device as Arc>, - iommu: vdpa_cfg.iommu, - id, - pci_segment: vdpa_cfg.pci_segment, + pci_common: vdpa_cfg.pci_common.clone(), dma_handler: Some(vdpa_mapping), }) } @@ -3727,7 +3896,7 @@ impl DeviceManager { self.add_vfio_device(device_cfg) } - fn create_vfio_container(&self) -> DeviceManagerResult> { + fn create_vfio_ops(&self) -> DeviceManagerResult> { let passthrough_device = self .passthrough_device .as_ref() @@ -3737,41 +3906,71 @@ impl DeviceManager { .try_clone() .map_err(DeviceManagerError::VfioCreate)?; - Ok(Arc::new( - VfioContainer::new(Some(Arc::new(dup))).map_err(DeviceManagerError::VfioCreate)?, - )) + let iommufd = self + .config + .lock() + .unwrap() + .platform + .as_ref() + .is_some_and(|p| p.iommufd); + + if iommufd { + #[cfg(feature = "kvm")] + { + info!("Using vfio cdev mode with iommufd."); + let iommufd = IommuFd::new().map_err(DeviceManagerError::IommufdCreate)?; + let vfio_iommufd = VfioIommufd::new(Arc::new(iommufd), None, Some(Arc::new(dup))) + .map_err(DeviceManagerError::VfioCreate)?; + Ok(Arc::new(vfio_iommufd)) + } + #[cfg(not(feature = "kvm"))] + Err(DeviceManagerError::IommufdNotSupported) + } else { + info!("Using vfio legacy mode with vfio container/group."); + Ok(Arc::new( + VfioContainer::new(Some(Arc::new(dup))).map_err(DeviceManagerError::VfioCreate)?, + )) + } } fn 
add_vfio_device( &mut self, device_cfg: &mut DeviceConfig, ) -> DeviceManagerResult<(PciBdf, String)> { - let vfio_name = if let Some(id) = &device_cfg.id { + let vfio_name = if let Some(id) = &device_cfg.pci_common.id { id.clone() } else { let id = self.next_device_name(VFIO_DEVICE_NAME_PREFIX)?; - device_cfg.id = Some(id.clone()); + device_cfg.pci_common.id = Some(id.clone()); id }; - let (pci_segment_id, pci_device_bdf, resources) = - self.pci_resources(&vfio_name, device_cfg.pci_segment)?; + let (pci_segment_id, pci_device_bdf, resources) = self.pci_resources( + &vfio_name, + device_cfg.pci_common.pci_segment, + device_cfg.pci_common.pci_device_id, + )?; let mut needs_dma_mapping = false; - // Here we create a new VFIO container for two reasons. Either this is - // the first VFIO device, meaning we need a new VFIO container, which - // will be shared with other VFIO devices. Or the new VFIO device is - // attached to a vIOMMU, meaning we must create a dedicated VFIO - // container. In the vIOMMU use case, we can't let all devices under - // the same VFIO container since we couldn't map/unmap memory for each - // device. That's simply because the map/unmap operations happen at the - // VFIO container level. - let vfio_container = if device_cfg.iommu { - let vfio_container = self.create_vfio_container()?; + // Here we create a new VfioOps for two reasons: + // 1) This is the first VFIO device, meaning we need a new VfioOps + // which will be shared with other VFIO devices. + // 2) The new VFIO device is attached to a vIOMMU, meaning we must + // create a dedicated VfioOps. In the vIOMMU use case, we can't + // let all devices share the same VfioOps since we couldn't + // map/unmap memory for each device independently. That's simply + // because the map/unmap operations happen at the VfioOps level. + // + // Note: this is a limitation of the legacy VFIO interface using + // container/group. 
The VFIO cdev and iommufd do not have such a + // limitation, and this will be revised once we have VFIO cdev and + // iommufd support. + let vfio_ops = if device_cfg.pci_common.iommu { + let vfio_ops = self.create_vfio_ops()?; let vfio_mapping = Arc::new(VfioDmaMapping::new( - Arc::clone(&vfio_container), + Arc::clone(&vfio_ops), Arc::new(self.memory_manager.lock().unwrap().guest_memory()), Arc::clone(&self.mmio_regions), )); @@ -3785,19 +3984,20 @@ impl DeviceManager { return Err(DeviceManagerError::MissingVirtualIommu); } - vfio_container - } else if let Some(vfio_container) = &self.vfio_container { - Arc::clone(vfio_container) + vfio_ops + } else if let Some(vfio_ops) = &self.vfio_ops { + Arc::clone(vfio_ops) } else { - let vfio_container = self.create_vfio_container()?; + let vfio_ops = self.create_vfio_ops()?; needs_dma_mapping = true; - self.vfio_container = Some(Arc::clone(&vfio_container)); + self.vfio_ops = Some(Arc::clone(&vfio_ops)); - vfio_container + vfio_ops }; - let vfio_device = VfioDevice::new(&device_cfg.path, Arc::clone(&vfio_container)) - .map_err(DeviceManagerError::VfioCreate)?; + let vfio_device = + VfioDevice::new(&device_cfg.path, Arc::clone(&vfio_ops) as Arc) + .map_err(DeviceManagerError::VfioCreate)?; if needs_dma_mapping { // Register DMA mapping in IOMMU. @@ -3811,10 +4011,10 @@ impl DeviceManager { // to len bytes of valid memory starting at as_ptr() // that will only be freed with munmap(). 
unsafe { - vfio_container.vfio_dma_map( + vfio_ops.vfio_dma_map( region.start_addr().raw_value(), - region.len(), - region.as_ptr() as u64, + region.len() as usize, + region.as_ptr(), ) } .map_err(DeviceManagerError::VfioDmaMap)?; @@ -3822,7 +4022,7 @@ impl DeviceManager { } let vfio_mapping = Arc::new(VfioDmaMapping::new( - Arc::clone(&vfio_container), + Arc::clone(&vfio_ops), Arc::new(self.memory_manager.lock().unwrap().guest_memory()), Arc::clone(&self.mmio_regions), )); @@ -3856,14 +4056,23 @@ impl DeviceManager { let memory_manager = self.memory_manager.clone(); + let vfio_p2p_dma = self + .config + .lock() + .unwrap() + .platform + .as_ref() + .is_none_or(|p| p.vfio_p2p_dma); + let vfio_pci_device = VfioPciDevice::new( vfio_name.clone(), self.address_manager.vm.clone(), vfio_device, - vfio_container, + vfio_ops, self.msi_interrupt_manager.clone(), legacy_interrupt_group, - device_cfg.iommu, + device_cfg.pci_common.iommu, + vfio_p2p_dma, pci_device_bdf, memory_manager.lock().unwrap().memory_slot_allocator(), vm_migration::snapshot_from_id(self.snapshot.as_ref(), vfio_name.as_str()), @@ -3923,7 +4132,7 @@ impl DeviceManager { .lock() .unwrap() .allocate_bars( - &self.address_manager.allocator, + &mut self.address_manager.allocator.lock().unwrap(), &mut self.pci_segments[segment_id as usize] .mem32_allocator .lock() @@ -3942,7 +4151,7 @@ impl DeviceManager { .unwrap(); pci_bus - .add_device(bdf.device() as u32, pci_device) + .add_device(bdf.device(), pci_device) .map_err(DeviceManagerError::AddPciDevice)?; self.bus_devices.push(Arc::clone(&bus_device)); @@ -3977,7 +4186,7 @@ impl DeviceManager { if let Some(device_list_cfg) = &mut devices { for device_cfg in device_list_cfg.iter_mut() { let (device_id, _) = self.add_passthrough_device(device_cfg)?; - if device_cfg.iommu && self.iommu_device.is_some() { + if device_cfg.pci_common.iommu && self.iommu_device.is_some() { iommu_attached_device_ids.push(device_id); } } @@ -3993,16 +4202,19 @@ impl DeviceManager { &mut 
self, device_cfg: &mut UserDeviceConfig, ) -> DeviceManagerResult<(PciBdf, String)> { - let vfio_user_name = if let Some(id) = &device_cfg.id { + let vfio_user_name = if let Some(id) = &device_cfg.pci_common.id { id.clone() } else { let id = self.next_device_name(VFIO_USER_DEVICE_NAME_PREFIX)?; - device_cfg.id = Some(id.clone()); + device_cfg.pci_common.id = Some(id.clone()); id }; - let (pci_segment_id, pci_device_bdf, resources) = - self.pci_resources(&vfio_user_name, device_cfg.pci_segment)?; + let (pci_segment_id, pci_device_bdf, resources) = self.pci_resources( + &vfio_user_name, + device_cfg.pci_common.pci_segment, + device_cfg.pci_common.pci_device_id, + )?; let legacy_interrupt_group = if let Some(legacy_interrupt_manager) = &self.legacy_interrupt_manager { @@ -4111,13 +4323,16 @@ impl DeviceManager { Ok(vec![]) } + #[allow(clippy::too_many_arguments)] fn add_virtio_pci_device( &mut self, virtio_device: Arc>, iommu_mapping: &Option>, virtio_device_id: &str, pci_segment_id: u16, + is_hotplug: bool, dma_handler: Option>, + pci_device_id: Option, ) -> DeviceManagerResult { let id = format!("{VIRTIO_PCI_DEVICE_NAME_PREFIX}-{virtio_device_id}"); @@ -4126,7 +4341,7 @@ impl DeviceManager { node.children = vec![virtio_device_id.to_string()]; let (pci_segment_id, pci_device_bdf, resources) = - self.pci_resources(&id, pci_segment_id)?; + self.pci_resources(&id, pci_segment_id, pci_device_id)?; // Update the existing virtio node by setting the parent. if let Some(node) = self.device_tree.lock().unwrap().get_mut(virtio_device_id) { @@ -4135,8 +4350,8 @@ impl DeviceManager { return Err(DeviceManagerError::MissingNode); } - // Allows support for one MSI-X vector per queue. It also adds 1 - // as we need to take into account the dedicated vector to notify + // Allows support for one MSI-X vector per interrupt needed by the device. + // It also adds 1 as we need to take into account the dedicated vector to notify // about a virtio config change. 
let msix_num = (virtio_device.lock().unwrap().queue_max_sizes().len() + 1) as u16; @@ -4209,17 +4424,16 @@ impl DeviceManager { memory, virtio_device, msix_num, - access_platform, + access_platform.as_ref(), self.msi_interrupt_manager.as_ref(), pci_device_bdf.into(), self.activate_evt .try_clone() .map_err(DeviceManagerError::EventFd)?, - // All device types *except* virtio block devices should be allocated a 64-bit bar - // The block devices should be given a 32-bit BAR so that they are easily accessible - // to firmware without requiring excessive identity mapping. - // The exception being if not on the default PCI segment. - pci_segment_id > 0 || device_type != VirtioDeviceType::Block as u32, + // Boot-time block devices stay in 32-bit BAR space so early firmware can access + // them without additional identity mapping. Hot-plugged block devices do not have + // that constraint and should use 64-bit BARs like the rest of the virtio devices. + use_64bit_bar_for_virtio_device(device_type, pci_segment_id, is_hotplug), dma_handler, self.pending_activations.clone(), vm_migration::snapshot_from_id(self.snapshot.as_ref(), id.as_str()), @@ -4263,7 +4477,7 @@ impl DeviceManager { info!("Creating pvpanic device {id}"); let (pci_segment_id, pci_device_bdf, resources) = - self.pci_resources(&id, pci_segment_id)?; + self.pci_resources(&id, pci_segment_id, None)?; let snapshot = snapshot_from_id(self.snapshot.as_ref(), id.as_str()); @@ -4301,7 +4515,7 @@ impl DeviceManager { info!("Creating ivshmem device {id}"); let (pci_segment_id, pci_device_bdf, resources) = - self.pci_resources(&id, pci_segment_id)?; + self.pci_resources(&id, pci_segment_id, None)?; let snapshot = snapshot_from_id(self.snapshot.as_ref(), id.as_str()); let ivshmem_ops = Arc::new(Mutex::new(IvshmemHandler { @@ -4342,10 +4556,42 @@ impl DeviceManager { Ok(Some(ivshmem_device)) } + fn reserve_explicit_device_ids(&self) -> DeviceManagerResult<()> { + for handle in &self.virtio_devices { + if let 
Some(device_id) = handle.pci_common.pci_device_id { + self.pci_segments[handle.pci_common.pci_segment as usize] + .reserve_device_id(device_id)?; + } + } + + let config = self.config.lock().unwrap(); + + if let Some(devices) = &config.devices { + for device_cfg in devices { + if let Some(device_id) = device_cfg.pci_common.pci_device_id { + self.pci_segments[device_cfg.pci_common.pci_segment as usize] + .reserve_device_id(device_id)?; + } + } + } + + if let Some(user_devices) = &config.user_devices { + for device_cfg in user_devices { + if let Some(device_id) = device_cfg.pci_common.pci_device_id { + self.pci_segments[device_cfg.pci_common.pci_segment as usize] + .reserve_device_id(device_id)?; + } + } + } + + Ok(()) + } + fn pci_resources( &self, id: &str, pci_segment_id: u16, + pci_device_id: Option, ) -> DeviceManagerResult<(u16, PciBdf, Option>)> { // Look for the id in the device tree. If it can be found, that means // the device is being restored, otherwise it's created from scratch. @@ -4367,12 +4613,13 @@ impl DeviceManager { .pci_bus .lock() .unwrap() - .get_device_id(pci_device_bdf.device() as usize) - .map_err(DeviceManagerError::GetPciDeviceId)?; + .allocate_device_id(Some(pci_device_bdf.device())) + .map_err(DeviceManagerError::AllocatePciDeviceId)?; (pci_segment_id, pci_device_bdf, resources) } else { - let pci_device_bdf = self.pci_segments[pci_segment_id as usize].next_device_bdf()?; + let pci_device_bdf = + self.pci_segments[pci_segment_id as usize].allocate_device_id(pci_device_id)?; (pci_segment_id, pci_device_bdf, None) }) @@ -4428,7 +4675,7 @@ impl DeviceManager { .map_err(DeviceManagerError::UpdateMemoryForVirtioDevice)?; if let Some(dma_handler) = &handle.dma_handler - && !handle.iommu + && !handle.pci_common.iommu { let gpa = new_region.start_addr().0; let size = new_region.len(); @@ -4439,17 +4686,17 @@ impl DeviceManager { } // Take care of updating the memory for VFIO PCI devices. 
- if let Some(vfio_container) = &self.vfio_container { + if let Some(vfio_ops) = &self.vfio_ops { // vfio_dma_map is unsound and ought to be marked as unsafe #[allow(unused_unsafe)] // SAFETY: GuestMemoryMmap guarantees that region points // to len bytes of valid memory starting at as_ptr() // that will only be freed with munmap(). unsafe { - vfio_container.vfio_dma_map( + vfio_ops.vfio_dma_map( new_region.start_addr().raw_value(), - new_region.len(), - new_region.as_ptr() as u64, + new_region.len() as usize, + new_region.as_ptr(), ) } .map_err(DeviceManagerError::UpdateMemoryForVfioPciDevice)?; @@ -4477,7 +4724,7 @@ impl DeviceManager { } pub fn activate_virtio_devices(&self) -> DeviceManagerResult<()> { - for mut activator in self.pending_activations.lock().unwrap().drain(..) { + for activator in self.pending_activations.lock().unwrap().drain(..) { activator .activate() .map_err(DeviceManagerError::VirtioActivate)?; @@ -4503,16 +4750,18 @@ impl DeviceManager { &mut self, device_cfg: &mut DeviceConfig, ) -> DeviceManagerResult { - self.validate_identifier(&device_cfg.id)?; + self.validate_identifier(&device_cfg.pci_common.id)?; - if device_cfg.iommu && !self.is_iommu_segment(device_cfg.pci_segment) { + if device_cfg.pci_common.iommu && !self.is_iommu_segment(device_cfg.pci_common.pci_segment) + { return Err(DeviceManagerError::InvalidIommuHotplug); } let (bdf, device_name) = self.add_passthrough_device(device_cfg)?; // Update the PCIU bitmap - self.pci_segments[device_cfg.pci_segment as usize].pci_devices_up |= 1 << bdf.device(); + self.pci_segments[device_cfg.pci_common.pci_segment as usize].pci_devices_up |= + 1 << bdf.device(); Ok(PciDeviceInfo { id: device_name, @@ -4524,12 +4773,24 @@ impl DeviceManager { &mut self, device_cfg: &mut UserDeviceConfig, ) -> DeviceManagerResult { - self.validate_identifier(&device_cfg.id)?; + self.validate_identifier(&device_cfg.pci_common.id)?; + + // Reject duplicate socket up-front: libvfio-user servers accept a + // single 
client, so a second Client::new() on the same socket blocks + // indefinitely in the handshake recvmsg() and hangs the VMM thread. + if let Some(existing) = &self.config.lock().unwrap().user_devices + && existing.iter().any(|d| d.socket == device_cfg.socket) + { + return Err(DeviceManagerError::UserDeviceSocketInUse( + device_cfg.socket.clone(), + )); + } let (bdf, device_name) = self.add_vfio_user_device(device_cfg)?; // Update the PCIU bitmap - self.pci_segments[device_cfg.pci_segment as usize].pci_devices_up |= 1 << bdf.device(); + self.pci_segments[device_cfg.pci_common.pci_segment as usize].pci_devices_up |= + 1 << bdf.device(); Ok(PciDeviceInfo { id: device_name, @@ -4610,7 +4871,7 @@ impl DeviceManager { let nets = config.net.as_deref_mut().unwrap(); let net_dev_cfg = nets .iter_mut() - .find(|net| net.id.as_deref() == Some(id)) + .find(|net| net.pci_common.id.as_deref() == Some(id)) // unwrap: the device could not have been removed without an ID .unwrap(); let fds = net_dev_cfg.fds.take().unwrap_or(Vec::new()); @@ -4649,8 +4910,8 @@ impl DeviceManager { .pci_bus .lock() .unwrap() - .put_device_id(device_id as usize) - .map_err(DeviceManagerError::PutPciDeviceId)?; + .free_device_id(device_id) + .map_err(DeviceManagerError::FreePciDeviceId)?; let (pci_device_handle, id) = { // Remove the device from the device tree along with its children. @@ -4690,7 +4951,7 @@ impl DeviceManager { let (pci_device, bus_device, virtio_device, remove_dma_handler) = match pci_device_handle { // VirtioMemMappingSource::Container cleanup is handled by - // cleanup_vfio_container when the last VFIO device is removed. + // cleanup_vfio_ops when the last VFIO device is removed. PciDeviceHandle::Vfio(vfio_pci_device) => { // Remove this device's MMIO regions from the DeviceManager's // mmio_regions list. We match on UserMemoryRegion slot numbers @@ -4864,24 +5125,28 @@ impl DeviceManager { // for instance. 
self.virtio_devices.push(handle.clone()); - let mapping: Option> = if handle.iommu { + let mapping: Option> = if handle.pci_common.iommu { self.iommu_mapping.clone() } else { None }; + let id = handle.id().to_owned(); let bdf = self.add_virtio_pci_device( handle.virtio_device, &mapping, - &handle.id, - handle.pci_segment, + &id, + handle.pci_common.pci_segment, + true, handle.dma_handler, + handle.pci_common.pci_device_id, )?; // Update the PCIU bitmap - self.pci_segments[handle.pci_segment as usize].pci_devices_up |= 1 << bdf.device(); + self.pci_segments[handle.pci_common.pci_segment as usize].pci_devices_up |= + 1 << bdf.device(); - Ok(PciDeviceInfo { id: handle.id, bdf }) + Ok(PciDeviceInfo { id, bdf }) } fn is_iommu_segment(&self, pci_segment_id: u16) -> bool { @@ -4901,9 +5166,9 @@ impl DeviceManager { } pub fn add_disk(&mut self, disk_cfg: &mut DiskConfig) -> DeviceManagerResult { - self.validate_identifier(&disk_cfg.id)?; + self.validate_identifier(&disk_cfg.pci_common.id)?; - if disk_cfg.iommu && !self.is_iommu_segment(disk_cfg.pci_segment) { + if disk_cfg.pci_common.iommu && !self.is_iommu_segment(disk_cfg.pci_common.pci_segment) { return Err(DeviceManagerError::InvalidIommuHotplug); } @@ -4912,16 +5177,26 @@ impl DeviceManager { } pub fn add_fs(&mut self, fs_cfg: &mut FsConfig) -> DeviceManagerResult { - self.validate_identifier(&fs_cfg.id)?; + self.validate_identifier(&fs_cfg.pci_common.id)?; let device = self.make_virtio_fs_device(fs_cfg)?; self.hotplug_virtio_pci_device(device) } + pub fn add_generic_vhost_user( + &mut self, + generic_vhost_user_cfg: &mut GenericVhostUserConfig, + ) -> DeviceManagerResult { + self.validate_identifier(&generic_vhost_user_cfg.pci_common.id)?; + + let device = self.make_generic_vhost_user_device(generic_vhost_user_cfg)?; + self.hotplug_virtio_pci_device(device) + } + pub fn add_pmem(&mut self, pmem_cfg: &mut PmemConfig) -> DeviceManagerResult { - self.validate_identifier(&pmem_cfg.id)?; + 
self.validate_identifier(&pmem_cfg.pci_common.id)?; - if pmem_cfg.iommu && !self.is_iommu_segment(pmem_cfg.pci_segment) { + if pmem_cfg.pci_common.iommu && !self.is_iommu_segment(pmem_cfg.pci_common.pci_segment) { return Err(DeviceManagerError::InvalidIommuHotplug); } @@ -4930,9 +5205,9 @@ impl DeviceManager { } pub fn add_net(&mut self, net_cfg: &mut NetConfig) -> DeviceManagerResult { - self.validate_identifier(&net_cfg.id)?; + self.validate_identifier(&net_cfg.pci_common.id)?; - if net_cfg.iommu && !self.is_iommu_segment(net_cfg.pci_segment) { + if net_cfg.pci_common.iommu && !self.is_iommu_segment(net_cfg.pci_common.pci_segment) { return Err(DeviceManagerError::InvalidIommuHotplug); } @@ -4941,9 +5216,9 @@ impl DeviceManager { } pub fn add_vdpa(&mut self, vdpa_cfg: &mut VdpaConfig) -> DeviceManagerResult { - self.validate_identifier(&vdpa_cfg.id)?; + self.validate_identifier(&vdpa_cfg.pci_common.id)?; - if vdpa_cfg.iommu && !self.is_iommu_segment(vdpa_cfg.pci_segment) { + if vdpa_cfg.pci_common.iommu && !self.is_iommu_segment(vdpa_cfg.pci_common.pci_segment) { return Err(DeviceManagerError::InvalidIommuHotplug); } @@ -4952,9 +5227,9 @@ impl DeviceManager { } pub fn add_vsock(&mut self, vsock_cfg: &mut VsockConfig) -> DeviceManagerResult { - self.validate_identifier(&vsock_cfg.id)?; + self.validate_identifier(&vsock_cfg.pci_common.id)?; - if vsock_cfg.iommu && !self.is_iommu_segment(vsock_cfg.pci_segment) { + if vsock_cfg.pci_common.iommu && !self.is_iommu_segment(vsock_cfg.pci_common.pci_segment) { return Err(DeviceManagerError::InvalidIommuHotplug); } @@ -4968,7 +5243,7 @@ impl DeviceManager { for handle in &self.virtio_devices { let virtio_device = handle.virtio_device.lock().unwrap(); if let Some(device_counters) = virtio_device.counters() { - counters.insert(handle.id.clone(), device_counters.clone()); + counters.insert(handle.id().to_owned(), device_counters.clone()); } } @@ -5070,11 +5345,11 @@ impl DeviceManager { &self.acpi_platform_addresses } - fn 
cleanup_vfio_container(&mut self) { - // Drop the 'vfio container' instance when "Self" is the only reference - if let Some(1) = self.vfio_container.as_ref().map(Arc::strong_count) { - debug!("Drop 'vfio container' given no active 'vfio devices'."); - self.vfio_container = None; + fn cleanup_vfio_ops(&mut self) { + // Drop the VfioOps instance when "Self" is the only reference + if let Some(1) = self.vfio_ops.as_ref().map(Arc::strong_count) { + debug!("Drop VfioOps given no active VFIO devices."); + self.vfio_ops = None; } } } @@ -5560,7 +5835,7 @@ impl BusDevice for DeviceManager { if let Err(e) = self.eject_device(self.selected_segment as u16, slot_id as u8) { error!("Failed ejecting device {slot_id}: {e:?}"); } - self.cleanup_vfio_container(); + self.cleanup_vfio_ops(); slot_bitmap &= !(1 << slot_id); } } @@ -5611,9 +5886,33 @@ impl Drop for DeviceManager { mod unit_tests { use super::*; + #[test] + fn test_hotplugged_block_devices_use_64bit_bars() { + assert!(!use_64bit_bar_for_virtio_device( + VirtioDeviceType::Block as u32, + 0, + false, + )); + assert!(use_64bit_bar_for_virtio_device( + VirtioDeviceType::Block as u32, + 0, + true, + )); + assert!(use_64bit_bar_for_virtio_device( + VirtioDeviceType::Net as u32, + 0, + false, + )); + assert!(use_64bit_bar_for_virtio_device( + VirtioDeviceType::Block as u32, + 1, + false, + )); + } + #[test] fn test_create_mmio_allocators() { - let res = create_mmio_allocators(0x100000, 0x400000, 1, &[1], 4 << 10); + let res = create_mmio_allocators(0x100000, 0x3fffff, 1, &[1], 4 << 10); assert_eq!(res.len(), 1); assert_eq!( res[0].lock().unwrap().base(), @@ -5624,7 +5923,7 @@ mod unit_tests { vm_memory::GuestAddress(0x3fffff) ); - let res = create_mmio_allocators(0x100000, 0x400000, 2, &[1, 1], 4 << 10); + let res = create_mmio_allocators(0x100000, 0x3fffff, 2, &[1, 1], 4 << 10); assert_eq!(res.len(), 2); assert_eq!( res[0].lock().unwrap().base(), @@ -5643,7 +5942,7 @@ mod unit_tests { vm_memory::GuestAddress(0x3fffff) ); - 
let res = create_mmio_allocators(0x100000, 0x400000, 2, &[2, 1], 4 << 10); + let res = create_mmio_allocators(0x100000, 0x3fffff, 2, &[2, 1], 4 << 10); assert_eq!(res.len(), 2); assert_eq!( res[0].lock().unwrap().base(), diff --git a/vmm/src/gdb.rs b/vmm/src/gdb.rs index fc24767d9c..82a5d63a9d 100644 --- a/vmm/src/gdb.rs +++ b/vmm/src/gdb.rs @@ -484,7 +484,7 @@ impl run_blocking::BlockingEventLoop for GdbEventLoop { } } - if conn.peek().map(|b| b.is_some()).unwrap_or(true) { + if conn.peek().map_or(true, |b| b.is_some()) { let byte = conn .read() .map_err(run_blocking::WaitForStopReasonError::Connection)?; diff --git a/vmm/src/igvm/igvm_loader.rs b/vmm/src/igvm/igvm_loader.rs index 4d454f8223..5cdb0d01b4 100644 --- a/vmm/src/igvm/igvm_loader.rs +++ b/vmm/src/igvm/igvm_loader.rs @@ -4,23 +4,32 @@ // use std::collections::HashMap; use std::ffi::CString; -use std::io::{Read, Seek, SeekFrom}; use std::mem::size_of; use std::sync::{Arc, Mutex}; +use hypervisor::HypervisorType; use igvm::snp_defs::SevVmsa; -use igvm::{IgvmDirectiveHeader, IgvmFile, IgvmPlatformHeader, IsolationType}; +use igvm::{IgvmDirectiveHeader, IgvmFile, IgvmPlatformHeader}; #[cfg(feature = "sev_snp")] use igvm_defs::{IGVM_VHS_MEMORY_MAP_ENTRY, MemoryMapEntryType}; use igvm_defs::{ IGVM_VHS_PARAMETER, IGVM_VHS_PARAMETER_INSERT, IgvmPageDataType, IgvmPlatformType, }; use log::debug; +#[cfg(all(feature = "kvm", feature = "sev_snp"))] +use log::error; #[cfg(feature = "sev_snp")] use log::info; +#[cfg(feature = "mshv")] use mshv_bindings::*; use thiserror::Error; +#[cfg(feature = "sev_snp")] +use vm_memory::{Bytes, GuestAddress, GuestAddressSpace, GuestMemory}; +#[cfg(all(feature = "kvm", feature = "sev_snp"))] +use vm_migration::Snapshottable; use zerocopy::IntoBytes; +#[cfg(feature = "sev_snp")] +use zerocopy::{FromBytes, FromZeros}; #[cfg(feature = "sev_snp")] use crate::GuestMemoryMmap; @@ -29,6 +38,36 @@ use crate::igvm::loader::Loader; use crate::igvm::{BootPageAcceptance, HV_PAGE_SIZE, 
IgvmLoadedInfo, StartupMemoryType}; use crate::memory_manager::{Error as MemoryManagerError, MemoryManager}; +#[cfg(feature = "sev_snp")] +const ISOLATED_PAGE_SHIFT: u32 = 12; +#[cfg(feature = "sev_snp")] +const SNP_CPUID_LIMIT: u32 = 64; +// see section 7.1 +// https://www.amd.com/content/dam/amd/en/documents/epyc-technical-docs/specifications/56860.pdf +#[cfg(feature = "sev_snp")] +#[repr(C)] +#[derive(Debug, Clone, PartialEq, Eq, IntoBytes, FromBytes)] +pub struct SnpCpuidFunc { + pub eax_in: u32, + pub ecx_in: u32, + pub xcr0_in: u64, + pub xss_in: u64, + pub eax: u32, + pub ebx: u32, + pub ecx: u32, + pub edx: u32, + pub reserved: u64, +} + +#[cfg(feature = "sev_snp")] +#[repr(C)] +#[derive(Debug, Clone, FromBytes, IntoBytes)] +pub struct SnpCpuidInfo { + pub count: u32, + pub _reserved1: u32, + pub _reserved2: u64, + pub entries: [SnpCpuidFunc; SNP_CPUID_LIMIT as usize], +} #[derive(Debug, Error)] pub enum Error { #[error("command line is not a valid C string")] @@ -51,6 +90,32 @@ pub enum Error { FailedToDecodeHostData(#[source] hex::FromHexError), #[error("Error allocating address space")] MemoryManager(MemoryManagerError), + #[error("IGVM file not provided")] + MissingIgvm, + #[error("Error applying VMSA to vCPU registers: {0}")] + SetVmsa(#[source] crate::cpu::Error), +} + +// KVM SNP page types — linux/arch/x86/include/uapi/asm/sev-guest.h +#[cfg(feature = "kvm")] +const KVM_SNP_PAGE_TYPE_NORMAL: u32 = 1; +#[cfg(feature = "kvm")] +const KVM_SNP_PAGE_TYPE_VMSA: u32 = 2; +#[cfg(feature = "kvm")] +const KVM_SNP_PAGE_TYPE_UNMEASURED: u32 = 4; +#[cfg(feature = "kvm")] +const KVM_SNP_PAGE_TYPE_SECRETS: u32 = 5; +#[cfg(feature = "kvm")] +const KVM_SNP_PAGE_TYPE_CPUID: u32 = 6; + +// Consolidated page type/size configuration per hypervisor. 
+struct PageTypeConfig { + isolated_page_size_4kb: u32, + normal: u32, + unmeasured: u32, + cpuid: u32, + secrets: u32, + vmsa: u32, } #[allow(dead_code)] @@ -128,22 +193,62 @@ fn import_parameter( Ok(()) } +/// +/// Extract sev_features from the boot CPU (vp_index 0) VMSA. +/// +#[cfg(feature = "sev_snp")] +pub fn extract_sev_features(igvm_file: &IgvmFile) -> u64 { + for header in igvm_file.directives() { + if let IgvmDirectiveHeader::SnpVpContext { vp_index, vmsa, .. } = header + && *vp_index == 0 + { + return vmsa.sev_features.into(); + } + } + 0 +} + /// /// Load the given IGVM file to guest memory. /// Right now it only supports SNP based isolation. /// We can boot legacy VM with an igvm file without /// any isolation. +/// +/// NOTE: KVM and MSHV have different page type values and CPUID/VMSA handling. +/// Hypervisor-specific code paths are gated by runtime type checks. A future +/// refactor could split these into separate KVM/MSHV loader implementations. #[allow(clippy::needless_pass_by_value)] pub fn load_igvm( - mut file: &std::fs::File, + igvm_file: IgvmFile, memory_manager: Arc>, cpu_manager: Arc>, cmdline: &str, #[cfg(feature = "sev_snp")] host_data: &Option, ) -> Result, Error> { + let hypervisor_type = cpu_manager.lock().unwrap().hypervisor_type(); + let page_types = match hypervisor_type { + #[cfg(feature = "mshv")] + HypervisorType::Mshv => PageTypeConfig { + isolated_page_size_4kb: mshv_bindings::hv_isolated_page_size_HV_ISOLATED_PAGE_SIZE_4KB, + normal: mshv_bindings::hv_isolated_page_type_HV_ISOLATED_PAGE_TYPE_NORMAL, + unmeasured: mshv_bindings::hv_isolated_page_type_HV_ISOLATED_PAGE_TYPE_UNMEASURED, + cpuid: mshv_bindings::hv_isolated_page_type_HV_ISOLATED_PAGE_TYPE_NORMAL, + secrets: mshv_bindings::hv_isolated_page_type_HV_ISOLATED_PAGE_TYPE_UNMEASURED, + vmsa: mshv_bindings::hv_isolated_page_type_HV_ISOLATED_PAGE_TYPE_VMSA, + }, + #[cfg(feature = "kvm")] + HypervisorType::Kvm => PageTypeConfig { + isolated_page_size_4kb: HV_PAGE_SIZE as 
u32, + normal: KVM_SNP_PAGE_TYPE_NORMAL, + unmeasured: KVM_SNP_PAGE_TYPE_UNMEASURED, + cpuid: KVM_SNP_PAGE_TYPE_CPUID, + secrets: KVM_SNP_PAGE_TYPE_SECRETS, + vmsa: KVM_SNP_PAGE_TYPE_VMSA, + }, + }; + let mut loaded_info: Box = Box::default(); let command_line = CString::new(cmdline).map_err(Error::InvalidCommandLine)?; - let mut file_contents = Vec::new(); let memory = memory_manager.lock().as_ref().unwrap().guest_memory(); let mut gpas: Vec = Vec::new(); let proc_count = cpu_manager.lock().unwrap().vcpus().len() as u32; @@ -156,12 +261,8 @@ pub fn load_igvm( .map_err(Error::FailedToDecodeHostData)?; } - file.seek(SeekFrom::Start(0)).map_err(Error::Igvm)?; - file.read_to_end(&mut file_contents).map_err(Error::Igvm)?; - - let igvm_file = IgvmFile::new_from_binary(&file_contents, Some(IsolationType::Snp)) - .map_err(Error::InvalidIgvmFile)?; - + #[cfg(feature = "sev_snp")] + let sev_snp_enabled = cpu_manager.lock().unwrap().sev_snp_enabled(); let mask = match &igvm_file.platforms()[0] { IgvmPlatformHeader::SupportedPlatform(info) => { debug_assert!(info.platform_type == IgvmPlatformType::SEV_SNP); @@ -194,15 +295,15 @@ pub fn load_igvm( if flags.unmeasured() { gpas.push(GpaPages { gpa: *gpa, - page_type: hv_isolated_page_type_HV_ISOLATED_PAGE_TYPE_UNMEASURED, - page_size: hv_isolated_page_size_HV_ISOLATED_PAGE_SIZE_4KB, + page_type: page_types.unmeasured, + page_size: page_types.isolated_page_size_4kb, }); BootPageAcceptance::ExclusiveUnmeasured } else { gpas.push(GpaPages { gpa: *gpa, - page_type: hv_isolated_page_type_HV_ISOLATED_PAGE_TYPE_NORMAL, - page_size: hv_isolated_page_size_HV_ISOLATED_PAGE_SIZE_4KB, + page_type: page_types.normal, + page_size: page_types.isolated_page_size_4kb, }); BootPageAcceptance::Exclusive } @@ -210,43 +311,46 @@ pub fn load_igvm( IgvmPageDataType::SECRETS => { gpas.push(GpaPages { gpa: *gpa, - page_type: hv_isolated_page_type_HV_ISOLATED_PAGE_TYPE_SECRETS, - page_size: hv_isolated_page_size_HV_ISOLATED_PAGE_SIZE_4KB, + page_type: 
page_types.secrets, + page_size: page_types.isolated_page_size_4kb, }); BootPageAcceptance::SecretsPage } IgvmPageDataType::CPUID_DATA => { - // SAFETY: CPUID is readonly - unsafe { - let cpuid_page_p: *mut hv_psp_cpuid_page = - data.as_ptr() as *mut hv_psp_cpuid_page; // as *mut hv_psp_cpuid_page; - let cpuid_page: &mut hv_psp_cpuid_page = &mut *cpuid_page_p; - for i in 0..cpuid_page.count { - let leaf = cpuid_page.cpuid_leaf_info[i as usize]; - let mut in_leaf = cpu_manager - .lock() - .unwrap() - .get_cpuid_leaf( - 0, - leaf.eax_in, - leaf.ecx_in, - leaf.xfem_in, - leaf.xss_in, - ) - .unwrap(); - if leaf.eax_in == 1 { - in_leaf[2] &= 0x7FFFFFFF; + #[cfg(feature = "mshv")] + if hypervisor_type == HypervisorType::Mshv { + // SAFETY: CPUID is readonly + unsafe { + let cpuid_page_p: *mut hv_psp_cpuid_page = + data.as_ptr() as *mut hv_psp_cpuid_page; // as *mut hv_psp_cpuid_page; + let cpuid_page: &mut hv_psp_cpuid_page = &mut *cpuid_page_p; + for i in 0..cpuid_page.count { + let leaf = cpuid_page.cpuid_leaf_info[i as usize]; + let mut in_leaf = cpu_manager + .lock() + .unwrap() + .get_cpuid_leaf( + 0, + leaf.eax_in, + leaf.ecx_in, + leaf.xfem_in, + leaf.xss_in, + ) + .unwrap(); + if leaf.eax_in == 1 { + in_leaf[2] &= 0x7FFFFFFF; + } + cpuid_page.cpuid_leaf_info[i as usize].eax_out = in_leaf[0]; + cpuid_page.cpuid_leaf_info[i as usize].ebx_out = in_leaf[1]; + cpuid_page.cpuid_leaf_info[i as usize].ecx_out = in_leaf[2]; + cpuid_page.cpuid_leaf_info[i as usize].edx_out = in_leaf[3]; } - cpuid_page.cpuid_leaf_info[i as usize].eax_out = in_leaf[0]; - cpuid_page.cpuid_leaf_info[i as usize].ebx_out = in_leaf[1]; - cpuid_page.cpuid_leaf_info[i as usize].ecx_out = in_leaf[2]; - cpuid_page.cpuid_leaf_info[i as usize].edx_out = in_leaf[3]; } } gpas.push(GpaPages { gpa: *gpa, - page_type: hv_isolated_page_type_HV_ISOLATED_PAGE_TYPE_CPUID, - page_size: hv_isolated_page_size_HV_ISOLATED_PAGE_SIZE_4KB, + page_type: page_types.cpuid, + page_size: page_types.isolated_page_size_4kb, 
}); BootPageAcceptance::CpuidPage } @@ -254,9 +358,69 @@ pub fn load_igvm( _ => todo!("unsupported IgvmPageDataType"), }; - loader - .import_pages(gpa / HV_PAGE_SIZE, 1, acceptance, data) - .map_err(Error::Loader)?; + #[allow(unused_mut)] + let mut imported_page = false; + #[cfg(all(feature = "kvm", feature = "sev_snp"))] + if hypervisor_type == HypervisorType::Kvm + && *data_type == IgvmPageDataType::CPUID_DATA + { + let mut new_cp = SnpCpuidInfo::new_zeroed(); + + let entries = cpu_manager.lock().unwrap().common_cpuid(); + let cp_count = std::cmp::min(SNP_CPUID_LIMIT as usize, entries.len()); + // TODO: Filter cpuid rather than truncate + for (i, entry) in entries.iter().enumerate().take(cp_count) { + new_cp.entries[i].eax_in = entry.function; + new_cp.entries[i].ecx_in = entry.index; + new_cp.entries[i].eax = entry.eax; + new_cp.entries[i].ebx = entry.ebx; + new_cp.entries[i].ecx = entry.ecx; + new_cp.entries[i].edx = entry.edx; + /* + * Guest kernels will calculate EBX themselves using the 0xD + * subfunctions corresponding to the individual XSAVE areas, so only + * encode the base XSAVE size in the initial leaves, corresponding + * to the initial XCR0=1 state. (https://tinyurl.com/qemu-cpuid) + */ + if new_cp.entries[i].eax_in == 0xd + && (new_cp.entries[i].ecx_in == 0x0 || new_cp.entries[i].ecx_in == 0x1) + { + new_cp.entries[i].ebx = 0x240; + new_cp.entries[i].xcr0_in = 1; + new_cp.entries[i].xss_in = 0; + } + + // KVM SNP launch may reject a CPUID page with bits it intends + // to sanitize internally. Pre-clearing the known unsafe bits keeps + // the CPUID page stable across launch updates. 
+ match (new_cp.entries[i].eax_in, new_cp.entries[i].ecx_in) { + (0x1, 0x0) => { + new_cp.entries[i].ecx &= !(1 << 24); + } + (0x7, 0x0) => { + new_cp.entries[i].ebx &= !0x2; + new_cp.entries[i].edx = 0; + } + (0x80000008, 0x0) => { + new_cp.entries[i].ebx &= !0x0200_0000; + } + (0x80000021, 0x0) => { + new_cp.entries[i].ecx = 0; + } + _ => {} + } + } + new_cp.count = cp_count as u32; + loader + .import_pages(gpa / HV_PAGE_SIZE, 1, acceptance, new_cp.as_mut_bytes()) + .map_err(Error::Loader)?; + imported_page = true; + } + if !imported_page { + loader + .import_pages(gpa / HV_PAGE_SIZE, 1, acceptance, data) + .map_err(Error::Loader)?; + } } IgvmDirectiveHeader::ParameterArea { number_of_bytes, @@ -288,16 +452,16 @@ pub fn load_igvm( IgvmDirectiveHeader::MmioRanges(_info) => { todo!("unsupported IgvmPageDataType"); } - IgvmDirectiveHeader::MemoryMap(_info) => { + IgvmDirectiveHeader::MemoryMap(_info) => + { #[cfg(feature = "sev_snp")] - { + if sev_snp_enabled { let guest_mem = memory_manager.lock().unwrap().boot_guest_memory(); let memory_map = generate_memory_map(&guest_mem)?; import_parameter(&mut parameter_areas, _info, memory_map.as_bytes())?; + } else { + todo!("Not implemented"); } - - #[cfg(not(feature = "sev_snp"))] - todo!("Not implemented"); } IgvmDirectiveHeader::CommandLine(info) => { import_parameter(&mut parameter_areas, info, command_line.as_bytes_with_nul())?; @@ -325,7 +489,7 @@ pub fn load_igvm( vmsa, } => { assert_eq!(gpa % HV_PAGE_SIZE, 0); - let mut data: [u8; 4096] = [0; 4096]; + let mut data: [u8; HV_PAGE_SIZE as usize] = [0; HV_PAGE_SIZE as usize]; let len = size_of::(); loaded_info.vmsa_gpa = *gpa; loaded_info.vmsa = **vmsa; @@ -337,10 +501,28 @@ pub fn load_igvm( .map_err(Error::Loader)?; } + // Set vCPU initial register state from VMSA before SNP_LAUNCH_FINISH + #[cfg(all(feature = "kvm", feature = "sev_snp"))] + if hypervisor_type == HypervisorType::Kvm { + let vcpus = cpu_manager.lock().unwrap().vcpus(); + for vcpu in vcpus { + let 
vcpu_locked = vcpu.lock().unwrap(); + let vcpu_id: u16 = vcpu_locked.id().parse().unwrap(); + if vcpu_id == *vp_index { + vcpu_locked + .setup_sev_snp_regs(loaded_info.vmsa) + .map_err(Error::SetVmsa)?; + vcpu_locked + .set_sev_control_register(0) + .map_err(Error::SetVmsa)?; + } + } + } + gpas.push(GpaPages { gpa: *gpa, - page_type: hv_isolated_page_type_HV_ISOLATED_PAGE_TYPE_VMSA, - page_size: hv_isolated_page_size_HV_ISOLATED_PAGE_SIZE_4KB, + page_type: page_types.vmsa, + page_size: page_types.isolated_page_size_4kb, }); } IgvmDirectiveHeader::SnpIdBlock { @@ -408,8 +590,8 @@ pub fn load_igvm( *area = ParameterAreaState::Inserted; gpas.push(GpaPages { gpa: *gpa, - page_type: hv_isolated_page_type_HV_ISOLATED_PAGE_TYPE_UNMEASURED, - page_size: hv_isolated_page_size_HV_ISOLATED_PAGE_SIZE_4KB, + page_type: page_types.unmeasured, + page_size: page_types.isolated_page_size_4kb, }); } IgvmDirectiveHeader::ErrorRange { .. } => { @@ -422,7 +604,7 @@ pub fn load_igvm( } #[cfg(feature = "sev_snp")] - { + if sev_snp_enabled { memory_manager .lock() .unwrap() @@ -433,7 +615,7 @@ pub fn load_igvm( let mut now = Instant::now(); // Sort the gpas to group them by the page type - gpas.sort_by(|a, b| a.gpa.cmp(&b.gpa)); + gpas.sort_by_key(|a| a.gpa); let gpas_grouped = gpas .iter() @@ -460,18 +642,62 @@ pub fn load_igvm( // of PFN for importing the isolated pages let pfns: Vec = group .iter() - .map(|gpa| gpa.gpa >> HV_HYP_PAGE_SHIFT) + .map(|gpa| gpa.gpa >> ISOLATED_PAGE_SHIFT) .collect(); - memory_manager + let guest_memory = memory_manager.lock().unwrap().guest_memory().memory(); + let uaddrs: Vec<_> = group + .iter() + .map(|gpa| { + let guest_region_mmap = guest_memory.to_region_addr(GuestAddress(gpa.gpa)); + let uaddr_base = guest_region_mmap.unwrap().0.as_ptr() as u64; + let uaddr_offset: u64 = guest_region_mmap.unwrap().1.0; + uaddr_base + uaddr_offset + }) + .collect(); + #[cfg(feature = "kvm")] + let page_type = group[0].page_type; + let mut new_cp = 
SnpCpuidInfo::new_zeroed(); + let _ = guest_memory.read(new_cp.as_mut_bytes(), GuestAddress(group[0].gpa)); + let import_result = memory_manager .lock() .unwrap() .vm .import_isolated_pages( group[0].page_type, - hv_isolated_page_size_HV_ISOLATED_PAGE_SIZE_4KB, + page_types.isolated_page_size_4kb, &pfns, + &uaddrs, ) - .map_err(Error::ImportIsolatedPages)?; + .map_err(Error::ImportIsolatedPages); + #[cfg(feature = "kvm")] + if hypervisor_type == HypervisorType::Kvm + && import_result.is_err() + && page_type == page_types.cpuid + { + // When we import the CPUID page, the firmware will change any cpuid fns that + // could lead to an insecure guest, we must then make sure to import the updated cpuid + // https://elixir.bootlin.com/linux/v6.11/source/arch/x86/kvm/svm/sev.c#L2322 + let mut updated_cp = SnpCpuidInfo::new_zeroed(); + let _ = guest_memory.read(updated_cp.as_mut_bytes(), GuestAddress(group[0].gpa)); + for (set, got) in std::iter::zip(new_cp.entries.iter(), updated_cp.entries.iter()) { + if set != got { + error!("Set cpuid fn: {set:#x?}, but firmware expects: {got:#x?}"); + } + } + memory_manager + .lock() + .unwrap() + .vm + .import_isolated_pages( + group[0].page_type, + page_types.isolated_page_size_4kb, + &pfns, + &uaddrs, + ) + .map_err(Error::ImportIsolatedPages)?; + continue; + } + import_result?; } info!( @@ -480,13 +706,23 @@ pub fn load_igvm( gpas.len() ); + let id_block_enabled = if hypervisor_type == HypervisorType::Mshv { + 1 + } else { + 0 + }; + now = Instant::now(); // Call Complete Isolated Import since we are done importing isolated pages memory_manager .lock() .unwrap() .vm - .complete_isolated_import(loaded_info.snp_id_block, host_data_contents, 1) + .complete_isolated_import( + loaded_info.snp_id_block, + host_data_contents, + id_block_enabled, + ) .map_err(Error::CompleteIsolatedImport)?; info!( diff --git a/vmm/src/igvm/mod.rs b/vmm/src/igvm/mod.rs index 62c32d4e89..ded102bd35 100644 --- a/vmm/src/igvm/mod.rs +++ b/vmm/src/igvm/mod.rs 
@@ -27,10 +27,19 @@ pub mod igvm_loader; mod loader; +use std::path::Path; + use igvm::snp_defs::SevVmsa; +use igvm::{IgvmFile, IsolationType}; use igvm_defs::IGVM_VHS_SNP_ID_BLOCK; use zerocopy::FromZeros; +pub fn parse_igvm(igvm_path: &Path) -> Result { + let file_contents = std::fs::read(igvm_path).map_err(igvm_loader::Error::Igvm)?; + IgvmFile::new_from_binary(&file_contents, Some(IsolationType::Snp)) + .map_err(igvm_loader::Error::InvalidIgvmFile) +} + #[derive(Debug, Clone)] pub struct IgvmLoadedInfo { pub gpas: Vec, diff --git a/vmm/src/interrupt.rs b/vmm/src/interrupt.rs index e42ba2f76b..f08aaab7fa 100644 --- a/vmm/src/interrupt.rs +++ b/vmm/src/interrupt.rs @@ -5,7 +5,6 @@ use std::collections::HashMap; use std::io; -use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::{Arc, Mutex}; use devices::interrupt_controller::InterruptController; @@ -18,77 +17,152 @@ use vm_device::interrupt::{ use vmm_sys_util::eventfd::EventFd; /// Reuse std::io::Result to simplify interoperability among crates. -pub type Result = std::io::Result; +type Result = std::io::Result; struct InterruptRoute { - gsi: u32, - irq_fd: EventFd, - registered: AtomicBool, + gsi: Option, + irq_fd: Option, + registered: bool, } impl InterruptRoute { - pub fn new(allocator: &mut SystemAllocator) -> Result { - let irq_fd = EventFd::new(libc::EFD_NONBLOCK)?; - let gsi = allocator - .allocate_gsi() - .ok_or_else(|| io::Error::other("Failed allocating new GSI"))?; + fn new() -> Result { + // The irq_fd must be created eagerly because external components + // (say, VFIO) need the fd at device initialization time via notifier(). 
+ Self::new_with_fd(Some(EventFd::new(libc::EFD_NONBLOCK)?)) + } + fn new_with_fd(irq_fd: Option) -> Result { Ok(InterruptRoute { - gsi, + gsi: None, irq_fd, - registered: AtomicBool::new(false), + registered: false, }) } - pub fn enable(&self, vm: &dyn hypervisor::Vm) -> Result<()> { - if !self.registered.load(Ordering::Acquire) { - vm.register_irqfd(&self.irq_fd, self.gsi) - .map_err(|e| io::Error::other(format!("Failed registering irq_fd: {e}")))?; + fn allocate_gsi(&mut self, allocator: &mut SystemAllocator) -> Result { + match self.gsi { + Some(existing) => Ok(existing), + None => { + let new_gsi = allocator + .allocate_gsi() + .ok_or_else(|| io::Error::other("Failed allocating new GSI"))?; + self.gsi = Some(new_gsi); + Ok(new_gsi) + } + } + } + + fn enable(&mut self, vm: &dyn hypervisor::Vm) -> Result<()> { + let gsi = match self.gsi { + Some(gsi) => gsi, + // Do nothing if no GSI was ever allocated for this route, which means the interrupt is still masked. + None => return Ok(()), + }; + + if !self.registered { + if let Some(ref irq_fd) = self.irq_fd { + vm.register_irqfd(irq_fd, gsi) + .map_err(|e| io::Error::other(format!("Failed registering irq_fd: {e}")))?; + } // Update internals to track the irq_fd as "registered". - self.registered.store(true, Ordering::Release); + self.registered = true; } Ok(()) } - pub fn disable(&self, vm: &dyn hypervisor::Vm) -> Result<()> { - if self.registered.load(Ordering::Acquire) { - vm.unregister_irqfd(&self.irq_fd, self.gsi) - .map_err(|e| io::Error::other(format!("Failed unregistering irq_fd: {e}")))?; + fn disable(&mut self, vm: &dyn hypervisor::Vm) -> Result<()> { + let gsi = match self.gsi { + Some(gsi) => gsi, + // Do nothing if no GSI was ever allocated for this route, which means the interrupt is still masked. 
+ None => return Ok(()), + }; + + if self.registered { + if let Some(ref irq_fd) = self.irq_fd { + vm.unregister_irqfd(irq_fd, gsi) + .map_err(|e| io::Error::other(format!("Failed unregistering irq_fd: {e}")))?; + } // Update internals to track the irq_fd as "unregistered". - self.registered.store(false, Ordering::Release); + self.registered = false; } Ok(()) } - pub fn trigger(&self) -> Result<()> { - self.irq_fd.write(1) + fn trigger(&mut self) -> Result<()> { + match self.irq_fd { + Some(ref fd) => fd.write(1), + None => Ok(()), + } } - pub fn notifier(&self) -> Option { + fn notifier(&mut self) -> Option { Some( self.irq_fd + .as_ref()? .try_clone() .expect("Failed cloning interrupt's EventFd"), ) } + + // This is currently not used, but the upcoming vhost-guest feature + // will use it. Use #[allow(dead_code)] to suppress a compiler + // warning. + #[allow(dead_code)] + fn set_notifier(&mut self, eventfd: Option, vm: &dyn hypervisor::Vm) -> Result<()> { + let old_irqfd = core::mem::replace(&mut self.irq_fd, eventfd); + if self.registered { + // A registered route must have a GSI allocated, since enable() + // only sets registered=true after using a valid GSI. + let gsi = self.gsi.expect("registered route has no GSI allocated"); + if let Some(ref irq_fd) = self.irq_fd { + vm.register_irqfd(irq_fd, gsi) + .map_err(|e| io::Error::other(format!("Failed registering irq_fd: {e}")))?; + } + // If the irqfd cannot be unregistered, what to do? Spin? + // Returning an error isn't helpful as the new irqfd is already registered. 
+ if let Some(old_irq_fd) = old_irqfd { + match vm.unregister_irqfd(&old_irq_fd, gsi) { + Ok(()) => {} + Err(e) => log::warn!("Failed unregistering old irqfd: {e}"), + } + } + } + Ok(()) + } } -pub struct RoutingEntry { +struct RoutingEntry { route: IrqRoutingEntry, masked: bool, } -pub struct MsiInterruptGroup { +struct MsiInterruptGroup { vm: Arc, gsi_msi_routes: Arc>>, - irq_routes: HashMap, + irq_routes: HashMap>, + allocator: Arc>, } impl MsiInterruptGroup { + fn new( + vm: Arc, + gsi_msi_routes: Arc>>, + irq_routes: HashMap>, + allocator: Arc>, + ) -> Self { + MsiInterruptGroup { + vm, + gsi_msi_routes, + irq_routes, + allocator, + } + } + fn set_gsi_routes(&self, routes: &HashMap) -> Result<()> { let mut entry_vec: Vec = Vec::new(); for (_, entry) in routes.iter() { @@ -105,24 +179,10 @@ impl MsiInterruptGroup { } } -impl MsiInterruptGroup { - fn new( - vm: Arc, - gsi_msi_routes: Arc>>, - irq_routes: HashMap, - ) -> Self { - MsiInterruptGroup { - vm, - gsi_msi_routes, - irq_routes, - } - } -} - impl InterruptSourceGroup for MsiInterruptGroup { fn enable(&self) -> Result<()> { for (_, route) in self.irq_routes.iter() { - route.enable(self.vm.as_ref())?; + route.lock().unwrap().enable(self.vm.as_ref())?; } Ok(()) @@ -130,7 +190,7 @@ impl InterruptSourceGroup for MsiInterruptGroup { fn disable(&self) -> Result<()> { for (_, route) in self.irq_routes.iter() { - route.disable(self.vm.as_ref())?; + route.lock().unwrap().disable(self.vm.as_ref())?; } Ok(()) @@ -138,7 +198,7 @@ impl InterruptSourceGroup for MsiInterruptGroup { fn trigger(&self, index: InterruptIndex) -> Result<()> { if let Some(route) = self.irq_routes.get(&index) { - return route.trigger(); + return route.lock().unwrap().trigger(); } Err(io::Error::other(format!( @@ -148,7 +208,7 @@ impl InterruptSourceGroup for MsiInterruptGroup { fn notifier(&self, index: InterruptIndex) -> Option { if let Some(route) = self.irq_routes.get(&index) { - return route.notifier(); + return 
route.lock().unwrap().notifier(); } None @@ -162,8 +222,21 @@ impl InterruptSourceGroup for MsiInterruptGroup { set_gsi: bool, ) -> Result<()> { if let Some(route) = self.irq_routes.get(&index) { + let mut route = route.lock().unwrap(); + let gsi = if masked { + match route.gsi { + Some(gsi) => gsi, + // No update needed if masked and no GSI was ever allocated + None => return Ok(()), + } + } else { + // Allocate a GSI when the interrupt vector is first unmasked + let mut allocator = self.allocator.lock().unwrap(); + route.allocate_gsi(&mut allocator)? + }; + let entry = RoutingEntry { - route: self.vm.make_routing_entry(route.gsi, &config), + route: self.vm.make_routing_entry(gsi, &config), masked, }; @@ -176,7 +249,7 @@ impl InterruptSourceGroup for MsiInterruptGroup { } let mut routes = self.gsi_msi_routes.lock().unwrap(); - routes.insert(route.gsi, entry); + routes.insert(gsi, entry); if set_gsi { self.set_gsi_routes(&routes)?; } @@ -200,9 +273,22 @@ impl InterruptSourceGroup for MsiInterruptGroup { let routes = self.gsi_msi_routes.lock().unwrap(); self.set_gsi_routes(&routes) } + + fn set_notifier( + &mut self, + index: InterruptIndex, + eventfd: Option, + vm: &dyn hypervisor::Vm, + ) -> Result<()> { + if let Some(route) = self.irq_routes.get(&index) { + return route.lock().unwrap().set_notifier(eventfd, vm); + } + + Ok(()) + } } -pub struct LegacyUserspaceInterruptGroup { +struct LegacyUserspaceInterruptGroup { ioapic: Arc>, irq: u32, } @@ -288,24 +374,52 @@ impl InterruptManager for LegacyUserspaceInterruptManager { } } +impl MsiInterruptManager { + fn create_group_raw( + &self, + config: ::GroupConfig, + ) -> Result { + let mut irq_routes: HashMap> = + HashMap::with_capacity(config.count as usize); + for i in config.base..config.base + config.count { + irq_routes.insert(i, Mutex::new(InterruptRoute::new()?)); + } + + Ok(MsiInterruptGroup::new( + self.vm.clone(), + self.gsi_msi_routes.clone(), + irq_routes, + self.allocator.clone(), + )) + } +} + impl 
InterruptManager for MsiInterruptManager { type GroupConfig = MsiIrqGroupConfig; fn create_group(&self, config: Self::GroupConfig) -> Result> { - let mut allocator = self.allocator.lock().unwrap(); - let mut irq_routes: HashMap = + let mut irq_routes: HashMap> = HashMap::with_capacity(config.count as usize); for i in config.base..config.base + config.count { - irq_routes.insert(i, InterruptRoute::new(&mut allocator)?); + irq_routes.insert(i, Mutex::new(InterruptRoute::new()?)); } Ok(Arc::new(MsiInterruptGroup::new( self.vm.clone(), self.gsi_msi_routes.clone(), irq_routes, + self.allocator.clone(), ))) } + fn create_group_mut( + &self, + config: Self::GroupConfig, + ) -> vm_device::interrupt::Result>> { + let r = self.create_group_raw(config)?; + Ok(Arc::new(Mutex::new(r))) + } + fn destroy_group(&self, _group: Arc) -> Result<()> { Ok(()) } diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index dcf6614b24..b0117d729c 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -6,18 +6,18 @@ use std::collections::HashMap; use std::fs::File; use std::io::{Read, Write, stdout}; -use std::net::{TcpListener, TcpStream}; use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; -use std::os::unix::net::{UnixListener, UnixStream}; use std::panic::AssertUnwindSafe; +#[cfg(feature = "guest_debug")] use std::path::PathBuf; use std::sync::mpsc::{Receiver, RecvError, SendError, Sender}; use std::sync::{Arc, Mutex}; +use std::time::Duration; #[cfg(not(target_arch = "riscv64"))] use std::time::Instant; use std::{io, result, thread}; -use anyhow::anyhow; +use anyhow::{Context, anyhow}; #[cfg(feature = "dbus_api")] use api::dbus::{DBusApiOptions, DBusApiShutdownChannels}; use api::http::HttpApiHandle; @@ -27,7 +27,7 @@ use console_devices::{ConsoleInfo, pre_create_console_devices}; use event_monitor::event; use landlock::LandlockError; use libc::{EFD_NONBLOCK, SIGINT, SIGTERM, TCSANOW, tcsetattr, termios}; -use log::{error, info, trace, warn}; +use log::{debug, error, info, trace, warn}; use 
memory_manager::MemoryManagerSnapshotData; use pci::PciBdf; use seccompiler::{SeccompAction, apply_filter}; @@ -36,19 +36,22 @@ use serde::{Deserialize, Serialize}; use signal_hook::iterator::{Handle, Signals}; use thiserror::Error; use tracer::trace_scoped; -use vm_memory::bitmap::{AtomicBitmap, BitmapSlice}; -use vm_memory::{ReadVolatile, VolatileMemoryError, VolatileSlice, WriteVolatile}; +use vm_memory::GuestMemoryAtomic; +use vm_memory::bitmap::AtomicBitmap; use vm_migration::protocol::*; -use vm_migration::{Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable}; +use vm_migration::{ + MemoryMigrationContext, Migratable, MigratableError, OngoingMigrationContext, Pausable, + Snapshot, Snapshottable, Transportable, +}; use vmm_sys_util::eventfd::EventFd; use vmm_sys_util::signal::unblock_signal; use vmm_sys_util::sock_ctrl_msg::ScmSocket; use crate::api::{ - ApiRequest, ApiResponse, RequestHandler, VmInfoResponse, VmReceiveMigrationData, - VmSendMigrationData, VmmPingResponse, + ApiRequest, ApiResponse, RequestHandler, TimeoutStrategy, VmInfoResponse, + VmReceiveMigrationData, VmSendMigrationData, VmmPingResponse, }; -use crate::config::{RestoreConfig, add_to_config}; +use crate::config::{MemoryRestoreMode, RestoreConfig, add_to_config}; #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] use crate::coredump::GuestDebuggable; use crate::landlock::Landlock; @@ -56,11 +59,14 @@ use crate::memory_manager::MemoryManager; #[cfg(all(feature = "kvm", target_arch = "x86_64"))] use crate::migration::get_vm_snapshot; use crate::migration::{recv_vm_config, recv_vm_state}; +use crate::migration_transport::{ + ReceiveAdditionalConnections, ReceiveListener, SendAdditionalConnections, SocketStream, +}; use crate::seccomp_filters::{Thread, get_seccomp_filter}; use crate::vm::{Error as VmError, Vm, VmState}; use crate::vm_config::{ - DeviceConfig, DiskConfig, FsConfig, NetConfig, PmemConfig, UserDeviceConfig, VdpaConfig, - VmConfig, VsockConfig, 
+ DeviceConfig, DiskConfig, FsConfig, GenericVhostUserConfig, NetConfig, PmemConfig, + UserDeviceConfig, VdpaConfig, VmConfig, VsockConfig, }; mod acpi; @@ -81,10 +87,14 @@ pub mod interrupt; pub mod landlock; pub mod memory_manager; pub mod migration; +pub mod migration_transport; mod pci_segment; pub mod seccomp_filters; mod serial_manager; mod sigwinch_listener; +mod sync_utils; +mod uffd; +mod userfaultfd; pub mod vm; pub mod vm_config; @@ -152,6 +162,10 @@ pub enum Error { #[error("Error rebooting VM")] VmReboot(#[source] VmError), + /// Cannot shut the VM down + #[error("Error shutting down VM")] + VmShutdown(#[source] VmError), + /// Cannot create VMM thread #[error("Error spawning VMM thread")] VmmThreadSpawn(#[source] io::Error), @@ -220,6 +234,8 @@ impl From<&VmConfig> for hypervisor::HypervisorVmConfig { sev_snp_enabled: _value.is_sev_snp_enabled(), #[cfg(feature = "sev_snp")] mem_size: _value.memory.total_size(), + #[cfg(feature = "sev_snp")] + vmsa_features: 0, nested: _value.cpus.nested, smt_enabled: _value .cpus @@ -240,6 +256,7 @@ pub enum EpollDispatch { Api = 2, ActivateVirtioDevices = 3, Debug = 4, + GuestExit = 5, Unknown, } @@ -252,94 +269,12 @@ impl From for EpollDispatch { 2 => Api, 3 => ActivateVirtioDevices, 4 => Debug, + 5 => GuestExit, _ => Unknown, } } } -enum SocketStream { - Unix(UnixStream), - Tcp(TcpStream), -} - -impl Read for SocketStream { - fn read(&mut self, buf: &mut [u8]) -> std::io::Result { - match self { - SocketStream::Unix(stream) => stream.read(buf), - SocketStream::Tcp(stream) => stream.read(buf), - } - } -} - -impl Write for SocketStream { - fn write(&mut self, buf: &[u8]) -> std::io::Result { - match self { - SocketStream::Unix(stream) => stream.write(buf), - SocketStream::Tcp(stream) => stream.write(buf), - } - } - - fn flush(&mut self) -> std::io::Result<()> { - match self { - SocketStream::Unix(stream) => stream.flush(), - SocketStream::Tcp(stream) => stream.flush(), - } - } -} - -impl AsRawFd for SocketStream { - 
fn as_raw_fd(&self) -> RawFd { - match self { - SocketStream::Unix(s) => s.as_raw_fd(), - SocketStream::Tcp(s) => s.as_raw_fd(), - } - } -} - -impl ReadVolatile for SocketStream { - fn read_volatile( - &mut self, - buf: &mut VolatileSlice, - ) -> std::result::Result { - match self { - SocketStream::Unix(s) => s.read_volatile(buf), - SocketStream::Tcp(s) => s.read_volatile(buf), - } - } - - fn read_exact_volatile( - &mut self, - buf: &mut VolatileSlice, - ) -> std::result::Result<(), VolatileMemoryError> { - match self { - SocketStream::Unix(s) => s.read_exact_volatile(buf), - SocketStream::Tcp(s) => s.read_exact_volatile(buf), - } - } -} - -impl WriteVolatile for SocketStream { - fn write_volatile( - &mut self, - buf: &VolatileSlice, - ) -> std::result::Result { - match self { - SocketStream::Unix(s) => s.write_volatile(buf), - SocketStream::Tcp(s) => s.write_volatile(buf), - } - } - - fn write_all_volatile( - &mut self, - buf: &VolatileSlice, - ) -> std::result::Result<(), VolatileMemoryError> { - match self { - SocketStream::Unix(s) => s.write_all_volatile(buf), - SocketStream::Tcp(s) => s.write_all_volatile(buf), - } - } -} - pub struct EpollContext { epoll_file: File, } @@ -520,6 +455,7 @@ pub fn start_vmm_thread( exit_event: EventFd, seccomp_action: &SeccompAction, hypervisor: Arc, + no_shutdown: bool, landlock_enable: bool, ) -> Result { #[cfg(feature = "guest_debug")] @@ -559,6 +495,7 @@ pub fn start_vmm_thread( vmm_seccomp_action, hypervisor, exit_event, + no_shutdown, )?; vmm.setup_signal_handler(landlock_enable)?; @@ -636,6 +573,17 @@ pub fn start_vmm_thread( }) } +/// Measures the time of the callback, in case it returns `Ok`. 
+fn measure_ok(f: F) -> result::Result<(T, Duration), E> +where + F: FnOnce() -> result::Result, +{ + let begin = Instant::now(); + let value = f()?; + let duration = begin.elapsed(); + Ok((value, duration)) +} + #[derive(Clone, Deserialize, Serialize)] struct VmMigrationConfig { vm_config: Arc>, @@ -670,6 +618,7 @@ pub struct Vmm { epoll: EpollContext, exit_evt: EventFd, reset_evt: EventFd, + guest_exit_evt: EventFd, api_evt: EventFd, #[cfg(feature = "guest_debug")] debug_evt: EventFd, @@ -686,8 +635,16 @@ pub struct Vmm { original_termios_opt: Arc>>, console_resize_pipe: Option>, console_info: Option, + no_shutdown: bool, } +/// Just a wrapper for the data that goes into +/// [`ReceiveMigrationState::Configured`] +struct ReceiveMigrationConfiguredData { + memory_manager: Arc>, + guest_memory: GuestMemoryAtomic, + connections: ReceiveAdditionalConnections, +} /// The receiver's state machine behind the migration protocol. enum ReceiveMigrationState { /// The connection is established and we haven't received any commands yet. @@ -699,11 +656,19 @@ enum ReceiveMigrationState { /// We received file descriptors for memory. This can only happen on UNIX domain sockets. MemoryFdsReceived(Vec<(u32, File)>), - /// We received the VM configuration. We keep the memory configuration around to populate guest memory. From this point on, the sender can start sending memory updates. - Configured(Arc>), + /// We received the VM configuration. We keep a direct reference to the guest memory + /// around to populate it without having to acquire a lock (which we would have to do + /// when accessing the memory through the memory manager). + /// + /// We keep the memory manager around to pass it into the next state. From this point + /// on, the sender can start sending memory updates. + Configured(ReceiveMigrationConfiguredData), /// Memory is populated and we received the state. The VM is ready to go. 
- StateReceived, + StateReceived { + /// The timestamp where the VMM started to receive the final state. + state_receive_begin: Instant, + }, /// The migration is successful. Completed, @@ -735,6 +700,7 @@ impl Vmm { for signal in signals.forever() { match signal { + #[allow(clippy::collapsible_match)] SIGTERM | SIGINT => { if exit_evt.write(1).is_err() { // Resetting the terminal is usually done as the VMM exits @@ -824,9 +790,11 @@ impl Vmm { seccomp_action: SeccompAction, hypervisor: Arc, exit_evt: EventFd, + no_shutdown: bool, ) -> Result { let mut epoll = EpollContext::new().map_err(Error::Epoll)?; let reset_evt = EventFd::new(EFD_NONBLOCK).map_err(Error::EventFdCreate)?; + let guest_exit_evt = EventFd::new(EFD_NONBLOCK).map_err(Error::EventFdCreate)?; let activate_evt = EventFd::new(EFD_NONBLOCK).map_err(Error::EventFdCreate)?; epoll @@ -837,6 +805,10 @@ impl Vmm { .add_event(&reset_evt, EpollDispatch::Reset) .map_err(Error::Epoll)?; + epoll + .add_event(&guest_exit_evt, EpollDispatch::GuestExit) + .map_err(Error::Epoll)?; + epoll .add_event(&activate_evt, EpollDispatch::ActivateVirtioDevices) .map_err(Error::Epoll)?; @@ -854,6 +826,7 @@ impl Vmm { epoll, exit_evt, reset_evt, + guest_exit_evt, api_evt, #[cfg(feature = "guest_debug")] debug_evt, @@ -870,6 +843,7 @@ impl Vmm { original_termios_opt: Arc::new(Mutex::new(None)), console_resize_pipe: None, console_info: None, + no_shutdown, }) } @@ -899,6 +873,7 @@ impl Vmm { fn vm_receive_migration_step( &mut self, socket: &mut SocketStream, + listener: &ReceiveListener, state: ReceiveMigrationState, req: &Request, _receive_data_migration: &VmReceiveMigrationData, @@ -914,10 +889,21 @@ impl Vmm { let mut configure_vm = |socket: &mut SocketStream, memory_files: HashMap| - -> std::result::Result>, MigratableError> { + -> std::result::Result { let memory_manager = self.vm_receive_config(req, socket, memory_files)?; - - Ok(memory_manager) + let guest_memory = memory_manager.lock().unwrap().guest_memory(); + // Create 
the additional-connection receiver even in the single-connection case. + // At this point the receiver does not know whether the sender will use extra TCP + // connections. If it does not, no worker connections are accepted and memory + // requests continue to arrive on the main connection. + let connections = listener + .try_clone() + .and_then(|l| ReceiveAdditionalConnections::new(l, guest_memory.clone()))?; + Ok(ReceiveMigrationConfiguredData { + memory_manager, + guest_memory, + connections, + }) }; let recv_memory_fd = |socket: &mut SocketStream, @@ -951,23 +937,67 @@ impl Vmm { } _ => invalid_command(), }, - Configured(memory_manager) => match req.command() { + Configured(mut config_data) => match req.command() { + // Memory commands use the main connection only in the single-connection case. + // When multiple TCP connections are configured, the worker connections carry + // all memory commands and the main connection is used only for control traffic. Command::Memory => { - self.vm_receive_memory(req, socket, &mut memory_manager.lock().unwrap())?; - Ok(Configured(memory_manager)) + migration_transport::receive_memory_ranges( + &config_data.guest_memory, + req, + socket, + ) + .inspect_err(|_| { + // connections.cleanup() already logs all errors that occurred in one of the + // threads. Furthermore, this path is only taken in the single-connection case, + // thus we do not expect any errors during this cleanup. The warning should + // reflect that. 
+ if let Err(e) = config_data.connections.cleanup() { + warn!( + "Unexpected error while cleaning up migration connections after a main-connection memory receive failure: {e}" + ); + } + })?; + Ok(Configured(config_data)) } Command::State => { - self.vm_receive_state(req, socket, memory_manager.clone())?; - Ok(StateReceived) + let state_receive_begin = Instant::now(); + config_data.connections.cleanup()?; + let (recv_state_dur, restore_vm_dur) = + self.vm_receive_state(req, socket, config_data.memory_manager)?; + debug!( + "Migration (incoming): recv_snapshot:{}ms restore:{}ms", + recv_state_dur.as_millis(), + restore_vm_dur.as_millis(), + ); + Ok(StateReceived { + state_receive_begin, + }) } _ => invalid_command(), }, - StateReceived => match req.command() { + StateReceived { + state_receive_begin, + } => match req.command() { Command::Complete => { // The unwrap is safe, because the state machine makes sure we called // vm_receive_state before, which creates the VM. let vm = self.vm.as_mut().unwrap(); - vm.resume()?; + let (_, resume_duration) = measure_ok(|| vm.resume())?; + debug!( + "Migration (incoming): resume:{}ms", + resume_duration.as_millis() + ); + // This logs the downtime without the final memory delta, so + // it does not reflect the actual downtime. While we could + // pass along the timestamp from when the VM was paused, + // that would rely on both VM hosts having synchronized + // clocks, which we cannot guarantee. For that reason, this + // is logged as debug! rather than info!. + debug!( + "Migration (incoming): Receiving final state and resuming the VM took {}ms", + state_receive_begin.elapsed().as_millis() + ); Ok(Completed) } _ => invalid_command(), @@ -1064,23 +1094,33 @@ impl Vmm { Ok(memory_manager) } + /// Receives the final VM state (devices, vCPUs) and restores the VM. + /// + /// Measures the time for each step. 
fn vm_receive_state( &mut self, req: &Request, socket: &mut T, mm: Arc>, - ) -> std::result::Result<(), MigratableError> + ) -> std::result::Result< + ( + Duration, /* state receive + deserialize */ + Duration, /* restoring */ + ), + MigratableError, + > where T: Read, { - // Read in state data - let mut data: Vec = Vec::new(); - data.resize_with(req.length() as usize, Default::default); - socket - .read_exact(&mut data) - .map_err(MigratableError::MigrateSocket)?; - let snapshot: Snapshot = serde_json::from_slice(&data).map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Error deserialising snapshot: {e}")) + let (snapshot, receive_duration): (Snapshot, Duration) = measure_ok(|| { + let mut data: Vec = Vec::new(); + data.resize_with(req.length() as usize, Default::default); + socket + .read_exact(&mut data) + .map_err(MigratableError::MigrateSocket)?; + serde_json::from_slice(&data).map_err(|e| { + MigratableError::MigrateReceive(anyhow!("Error deserialising snapshot: {e}")) + }) })?; let exit_evt = self.exit_evt.try_clone().map_err(|e| { @@ -1089,6 +1129,9 @@ impl Vmm { let reset_evt = self.reset_evt.try_clone().map_err(|e| { MigratableError::MigrateReceive(anyhow!("Error cloning reset EventFd: {e}")) })?; + let guest_exit_evt = self.guest_exit_evt.try_clone().map_err(|e| { + MigratableError::MigrateReceive(anyhow!("Error cloning guest exit EventFd: {e}")) + })?; #[cfg(feature = "guest_debug")] let debug_evt = self.vm_debug_evt.try_clone().map_err(|e| { MigratableError::MigrateReceive(anyhow!("Error cloning debug EventFd: {e}")) @@ -1097,162 +1140,262 @@ impl Vmm { MigratableError::MigrateReceive(anyhow!("Error cloning activate EventFd: {e}")) })?; - #[cfg(not(target_arch = "riscv64"))] - let timestamp = Instant::now(); - let hypervisor_vm = mm.lock().unwrap().vm.clone(); - let mut vm = Vm::new_from_memory_manager( - self.vm_config.clone().unwrap(), - mm, - hypervisor_vm, - exit_evt, - reset_evt, - #[cfg(feature = "guest_debug")] - debug_evt, - 
&self.seccomp_action, - self.hypervisor.clone(), - activate_evt, + let (vm, restore_duration) = measure_ok(|| { #[cfg(not(target_arch = "riscv64"))] - timestamp, - self.console_info.clone(), - self.console_resize_pipe.clone(), - Arc::clone(&self.original_termios_opt), - Some(&snapshot), - ) - .map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Error creating VM from snapshot: {e:?}")) - })?; + let timestamp = Instant::now(); + let hypervisor_vm = mm.lock().unwrap().vm.clone(); + + let mut vm = Vm::new_from_memory_manager( + self.vm_config.clone().unwrap(), + mm, + hypervisor_vm, + exit_evt, + reset_evt, + guest_exit_evt, + #[cfg(feature = "guest_debug")] + debug_evt, + &self.seccomp_action, + self.hypervisor.clone(), + activate_evt, + #[cfg(not(target_arch = "riscv64"))] + timestamp, + self.console_info.clone(), + self.console_resize_pipe.clone(), + Arc::clone(&self.original_termios_opt), + Some(&snapshot), + #[cfg(feature = "igvm")] + None, + ) + .map_err(|e| { + MigratableError::MigrateReceive(anyhow!("Error creating VM from snapshot: {e:?}")) + })?; - // Create VM - vm.restore().map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Failed restoring the Vm: {e}")) - })?; - self.vm = Some(vm); + // Create VM + vm.restore().map_err(|e| { + MigratableError::MigrateReceive(anyhow!("Failed restoring the Vm: {e}")) + })?; - Ok(()) - } + Ok(vm) + })?; - fn vm_receive_memory( - &mut self, - req: &Request, - socket: &mut T, - memory_manager: &mut MemoryManager, - ) -> std::result::Result<(), MigratableError> - where - T: Read + ReadVolatile, - { - // Read table - let table = MemoryRangeTable::read_from(socket, req.length())?; + self.vm = Some(vm); - // And then read the memory itself - memory_manager.receive_memory_regions(&table, socket)?; - Ok(()) + Ok((receive_duration, restore_duration)) } - fn socket_url_to_path(url: &str) -> result::Result { - url.strip_prefix("unix:") - .ok_or_else(|| { - MigratableError::MigrateSend(anyhow!("Could not extract path from 
URL: {url}")) - }) - .map(|s| s.into()) - } + /// Performs the initial memory transmission (iteration zero) plus a + /// variable number of memory iterations with the goal to eventually migrate + /// the VM in a reasonably small downtime. + /// + /// This returns as soon as the precopy migration indicates it is converged + /// (e.g., reasonably small downtime) is reached. + fn do_memory_iterations( + vm: &mut Vm, + socket: &mut SocketStream, + ctx: &mut MemoryMigrationContext, + is_converged: impl Fn(&MemoryMigrationContext) -> result::Result, + mem_send: &mut SendAdditionalConnections, + ) -> result::Result { + loop { + let iteration_begin = Instant::now(); + + let iteration_table = if ctx.iteration == 0 { + vm.memory_range_table()? + } else { + // TODO do this in a thread #7816 + vm.dirty_log()? + }; - fn send_migration_socket( - destination_url: &str, - ) -> std::result::Result { - if let Some(address) = destination_url.strip_prefix("tcp:") { - info!("Connecting to TCP socket at {address}"); + ctx.update_metrics_before_transfer(iteration_begin, &iteration_table); + if is_converged(ctx)? 
{ + debug!("Precopy converged: {ctx}"); + break Ok(iteration_table); + } - let socket = TcpStream::connect(address).map_err(|e| { - MigratableError::MigrateSend(anyhow!("Error connecting to TCP socket: {e}")) - })?; + // Send the current dirty pages + let transfer_begin = Instant::now(); + mem_send.send_memory(iteration_table, socket)?; + let transfer_duration = transfer_begin.elapsed(); + ctx.update_metrics_after_transfer(transfer_begin, transfer_duration); - Ok(SocketStream::Tcp(socket)) - } else { - let path = Vmm::socket_url_to_path(destination_url)?; - info!("Connecting to UNIX socket at {path:?}"); + // Log progress of the current iteration + debug!("Precopy: {ctx}"); - let socket = UnixStream::connect(&path).map_err(|e| { - MigratableError::MigrateSend(anyhow!("Error connecting to UNIX socket: {e}")) - })?; + // Enables management software (e.g., libvirt) to easily track forward progress. + event!( + "vm", + "migration-memory-iteration", + "id", + ctx.iteration.to_string() + ); - Ok(SocketStream::Unix(socket)) + // Increment iteration last: This way we ensure that the logging + // above matches the actual iteration. 
+ ctx.iteration += 1; } } - fn receive_migration_socket( - receiver_url: &str, - ) -> std::result::Result { - if let Some(address) = receiver_url.strip_prefix("tcp:") { - let listener = TcpListener::bind(address).map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Error binding to TCP socket: {e}")) - })?; - - let (socket, _addr) = listener.accept().map_err(|e| { - MigratableError::MigrateReceive(anyhow!( - "Error accepting connection on TCP socket: {e}" - )) - })?; - - Ok(SocketStream::Tcp(socket)) - } else { - let path = Vmm::socket_url_to_path(receiver_url)?; - let listener = UnixListener::bind(&path).map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Error binding to UNIX socket: {e}")) - })?; - - let (socket, _addr) = listener.accept().map_err(|e| { - MigratableError::MigrateReceive(anyhow!( - "Error accepting connection on UNIX socket: {e}" - )) - })?; - - // Remove the UNIX socket file after accepting the connection - std::fs::remove_file(&path).map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Error removing UNIX socket file: {e}")) - })?; - - Ok(SocketStream::Unix(socket)) + /// Checks whether the precopy memory migration has converged and it is safe + /// to proceed to the final (paused) memory iteration. + /// + /// Once this returns, the VM is expected to stop as soon as possible. + /// + /// Convergence is reached when any of the following criteria is met: + /// + /// 1. **No dirty pages remain** – the current iteration would transfer zero + /// bytes. + /// 2. **Downtime budget is met** – the estimated downtime for the final + /// (paused) iteration is within the caller-specified + /// [`VmSendMigrationData::downtime`] budget. + /// 3. **Timeout** – the precopy phase has been running for at least + /// [`VmSendMigrationData::timeout`]. 
The outcome depends on + /// [`VmSendMigrationData::timeout_strategy`]: + /// - [`TimeoutStrategy::Cancel`] – returns + /// - [`TimeoutStrategy::Ignore`] – the migration completes despite not + /// meeting the downtime budget. + /// [`MigratableError::MigrateSend`] so the caller can abort the + /// migration cleanly. + /// + /// # Returns + /// + /// * `Ok(true)` – convergence criterion met; the caller should stop precopy + /// iterations. + /// * `Ok(false)` – not yet converged; the caller should run another + /// dirty-page iteration. + /// * `Err(_)` – the timeout was reached and [`TimeoutStrategy::Cancel`] + /// is in effect. + fn is_precopy_converged( + ctx: &MemoryMigrationContext, + send_data_migration: &VmSendMigrationData, + ) -> result::Result { + if ctx.current_iteration_total_bytes == 0 { + debug!("Precopy: No more memory to transfer"); + return Ok(true); + } + + // We currently ignore the time required to transfer the final + // VM state (device state and vCPUs) and the time needed on the + // receiver to create the VM and initialize its data structures + // before execution can resume. + // + // Manual testing showed that migrating an idle VM on a modern + // AMD CPU (CHV release build) adds ~5 ms of overhead when + // scaling from 1 to 200 vCPUs. Given this small cost, we + // deliberately avoid additional heuristics to estimate the + // downtime more precisely - for now. Instead, we approximate + // the downtime just by the transfer time of the final memory + // delta. + if let Some(memory_downtime) = ctx.estimated_downtime + && memory_downtime <= send_data_migration.downtime() + { + debug!( + "Precopy: Target downtime can be met: {}ms <= {}ms", + memory_downtime.as_millis(), + send_data_migration.downtime().as_millis() + ); + return Ok(true); + } + + // We check the beginning of the precopy migration and not the overall migration, and + // this is fine: precopy takes the longest and the earlier steps are negligible. 
+ if ctx.migration_begin.elapsed() >= send_data_migration.timeout() { + return match send_data_migration.timeout_strategy { + TimeoutStrategy::Cancel => { + let msg = format!( + "Precopy: Timeout reached: {}s: migration didn't converge in time", + send_data_migration.timeout().as_secs() + ); + Err(MigratableError::MigrateSend(anyhow!("{msg}"))) + } + TimeoutStrategy::Ignore => { + info!( + "Precopy: Pausing VM, ignoring target downtime ({}ms) due to timeout ({}s): Estimated downtime: {}ms", + send_data_migration.downtime().as_millis(), + send_data_migration.timeout().as_secs(), + ctx.estimated_downtime + .unwrap_or(Duration::from_secs(0)) + .as_millis() + ); + Ok(true) + } + }; } + + Ok(false) } - // Returns true if there were dirty pages to send - fn vm_maybe_send_dirty_pages( + /// Performs the memory migration including multiple iterations. + /// + /// This includes: + /// - initial memory - VM is running + /// - multiple memory delta transmissions - VM is running + /// - final memory iteration - VM is paused + /// + /// Stores the [finalized] [`MemoryMigrationContext`] in the provided + /// [`OngoingMigrationContext`]. 
+ /// + /// [finalized]: MemoryMigrationContext::finalize + fn do_memory_migration( vm: &mut Vm, socket: &mut SocketStream, - ) -> result::Result { - // Send (dirty) memory table - let table = vm.dirty_log()?; - - // But if there are no regions go straight to pause - if table.regions().is_empty() { - return Ok(false); - } + send_data_migration: &VmSendMigrationData, + mem_send: &mut SendAdditionalConnections, + ctx: &mut OngoingMigrationContext, + ) -> result::Result<(), MigratableError> { + let mut mem_ctx = MemoryMigrationContext::new(); - Request::memory(table.length()).write_to(socket).unwrap(); - table.write_to(socket)?; - // And then the memory itself - vm.send_memory_regions(&table, socket)?; - Response::read_from(socket)?.ok_or_abandon( + vm.start_dirty_log()?; + let remaining = Self::do_memory_iterations( + vm, socket, - MigratableError::MigrateSend(anyhow!("Error during dirty memory migration")), + &mut mem_ctx, + // We bind send_data_migration to the callback + |ctx| Self::is_precopy_converged(ctx, send_data_migration), + mem_send, )?; + let downtime_begin = Instant::now(); + vm.pause()?; + + // Send last batch of dirty pages: final iteration + { + let iteration_begin = Instant::now(); - Ok(true) + let mut final_table = vm.dirty_log()?; + final_table.extend(remaining); + + mem_ctx.update_metrics_before_transfer(iteration_begin, &final_table); + let transfer_begin = Instant::now(); + mem_send.send_memory(final_table, socket)?; + let transfer_duration = transfer_begin.elapsed(); + mem_ctx.update_metrics_after_transfer(transfer_begin, transfer_duration); + mem_ctx.iteration += 1; + } + mem_ctx.finalize(); + info!("Precopy complete: {mem_ctx}"); + ctx.set_vm_paused(downtime_begin, mem_ctx) + .expect("migration context should transition to VmPaused after memory migration"); + + Ok(()) } + /// Performs a migration including all its phases. 
fn send_migration( vm: &mut Vm, #[cfg(all(feature = "kvm", target_arch = "x86_64"))] hypervisor: &dyn hypervisor::Hypervisor, send_data_migration: &VmSendMigrationData, ) -> result::Result<(), MigratableError> { + // State machine that is updated with more context as we progress. + let mut ctx = OngoingMigrationContext::new(); + // Set up the socket connection - let mut socket = Self::send_migration_socket(&send_data_migration.destination_url)?; + let mut socket = + migration_transport::send_migration_socket(&send_data_migration.destination_url)?; // Start the migration - Request::start().write_to(&mut socket)?; - Response::read_from(&mut socket)?.ok_or_abandon( + migration_transport::send_request_expect_ok( &mut socket, + Request::start(), MigratableError::MigrateSend(anyhow!("Error starting migration")), )?; @@ -1305,53 +1448,44 @@ impl Vmm { common_cpuid, memory_manager_data: vm.memory_manager_data(), }; - let config_data = serde_json::to_vec(&vm_migration_config).unwrap(); - Request::config(config_data.len() as u64).write_to(&mut socket)?; - socket - .write_all(&config_data) - .map_err(MigratableError::MigrateSocket)?; - Response::read_from(&mut socket)?.ok_or_abandon( - &mut socket, - MigratableError::MigrateSend(anyhow!("Error during config migration")), - )?; + migration_transport::send_config(&mut socket, &vm_migration_config)?; // Let every Migratable object know about the migration being started. 
vm.start_migration()?; if send_data_migration.local { // Now pause VM + let downtime_begin = Instant::now(); vm.pause()?; + ctx.set_vm_paused( + downtime_begin, + // No memory was transferred + MemoryMigrationContext::empty_finalized(), + ) + .expect("migration context should transition to VmPaused for local migration"); } else { - // Start logging dirty pages - vm.start_dirty_log()?; - - // Send memory table - let table = vm.memory_range_table()?; - Request::memory(table.length()) - .write_to(&mut socket) - .unwrap(); - table.write_to(&mut socket)?; - // And then the memory itself - vm.send_memory_regions(&table, &mut socket)?; - Response::read_from(&mut socket)?.ok_or_abandon( - &mut socket, - MigratableError::MigrateSend(anyhow!("Error during dirty memory migration")), + let mut mem_send = migration_transport::SendAdditionalConnections::new( + &send_data_migration.destination_url, + send_data_migration.connections, + &vm.guest_memory(), )?; - // Try at most 5 passes of dirty memory sending - const MAX_DIRTY_MIGRATIONS: usize = 5; - for i in 0..MAX_DIRTY_MIGRATIONS { - info!("Dirty memory migration {i} of {MAX_DIRTY_MIGRATIONS}"); - if !Self::vm_maybe_send_dirty_pages(vm, &mut socket)? { - break; + Self::do_memory_migration( + vm, + &mut socket, + send_data_migration, + &mut mem_send, + &mut ctx, + ) + .inspect_err(|_| { + // Calling cleanup multiple times is fine, thus here we just make sure + // that it is called. + if let Err(e) = mem_send.cleanup() { + warn!("Error cleaning up migration connections: {e}"); } - } - - // Now pause VM - vm.pause()?; + })?; - // Send last batch of dirty pages - Self::vm_maybe_send_dirty_pages(vm, &mut socket)?; + mem_send.cleanup()?; } // We release the locks early to enable locking them on the destination host. 
@@ -1360,31 +1494,39 @@ impl Vmm { .map_err(|e| MigratableError::UnlockError(anyhow!("{e}")))?; // Capture snapshot and send it - let vm_snapshot = vm.snapshot()?; - let snapshot_data = serde_json::to_vec(&vm_snapshot).unwrap(); - Request::state(snapshot_data.len() as u64).write_to(&mut socket)?; - socket - .write_all(&snapshot_data) - .map_err(MigratableError::MigrateSocket)?; - Response::read_from(&mut socket)?.ok_or_abandon( - &mut socket, - MigratableError::MigrateSend(anyhow!("Error during state migration")), - )?; - // Complete the migration - // At this step, the receiving VMM will acquire disk locks again. - Request::complete().write_to(&mut socket)?; - Response::read_from(&mut socket)?.ok_or_abandon( - &mut socket, - MigratableError::MigrateSend(anyhow!("Error completing migration")), - )?; + let (vm_snapshot, snapshot_duration) = measure_ok(|| vm.snapshot())?; + let (_, send_snapshot_duration) = + measure_ok(|| migration_transport::send_state(&mut socket, &vm_snapshot))?; + + // Complete the migration. + // When this returns, we know the VM was resumed (if it was running + // before the migration) and that the receiving VMM acquired disk + // locks again. 
+ let (_, complete_duration) = measure_ok(|| { + migration_transport::send_request_expect_ok( + &mut socket, + Request::complete(), + MigratableError::MigrateSend(anyhow!("Error completing migration")), + ) + })?; + + let ctx = ctx + .finalize(snapshot_duration, send_snapshot_duration, complete_duration) + .expect("migration context should finalize after memory migration completed"); + + info!( + "Migration completed after {:.1}s with a downtime of {}ms (goal was {}ms)", + ctx.migration_dur.as_secs_f32(), + ctx.downtime_ctx.effective_downtime.as_millis(), + send_data_migration.downtime().as_millis() + ); + debug!("Downtime breakdown: {}", ctx.downtime_ctx); // Stop logging dirty pages if !send_data_migration.local { vm.stop_dirty_log()?; } - info!("Migration complete"); - // Let every Migratable object know about the migration being complete vm.complete_migration() } @@ -1435,6 +1577,7 @@ impl Vmm { source_url: &str, vm_config: Arc>, prefault: bool, + memory_restore_mode: MemoryRestoreMode, ) -> std::result::Result<(), VmError> { let snapshot = recv_vm_state(source_url).map_err(VmError::Restore)?; #[cfg(all(feature = "kvm", target_arch = "x86_64"))] @@ -1452,6 +1595,10 @@ impl Vmm { let exit_evt = self.exit_evt.try_clone().map_err(VmError::EventFdClone)?; let reset_evt = self.reset_evt.try_clone().map_err(VmError::EventFdClone)?; + let guest_exit_evt = self + .guest_exit_evt + .try_clone() + .map_err(VmError::EventFdClone)?; #[cfg(feature = "guest_debug")] let debug_evt = self .vm_debug_evt @@ -1466,6 +1613,7 @@ impl Vmm { vm_config, exit_evt, reset_evt, + guest_exit_evt, #[cfg(feature = "guest_debug")] debug_evt, &self.seccomp_action, @@ -1477,6 +1625,7 @@ impl Vmm { Some(&snapshot), Some(source_url), Some(prefault), + Some(memory_restore_mode), )?; self.vm = Some(vm); @@ -1549,6 +1698,16 @@ impl Vmm { self.reset_evt.read().map_err(Error::EventFdRead)?; self.vm_reboot().map_err(Error::VmReboot)?; } + EpollDispatch::GuestExit => { + info!("VM guest exit event"); + 
self.guest_exit_evt.read().map_err(Error::EventFdRead)?; + if self.no_shutdown { + self.vm_shutdown().map_err(Error::VmShutdown)?; + } else { + self.vmm_shutdown().map_err(Error::VmmShutdown)?; + break 'outer; + } + } EpollDispatch::ActivateVirtioDevices => { if let Some(ref vm) = self.vm { let count = self.activate_evt.read().map_err(Error::EventFdRead)?; @@ -1657,6 +1816,10 @@ impl RequestHandler for Vmm { if self.vm.is_none() { let exit_evt = self.exit_evt.try_clone().map_err(VmError::EventFdClone)?; let reset_evt = self.reset_evt.try_clone().map_err(VmError::EventFdClone)?; + let guest_exit_evt = self + .guest_exit_evt + .try_clone() + .map_err(VmError::EventFdClone)?; #[cfg(feature = "guest_debug")] let vm_debug_evt = self .vm_debug_evt @@ -1672,6 +1835,7 @@ impl RequestHandler for Vmm { Arc::clone(vm_config), exit_evt, reset_evt, + guest_exit_evt, #[cfg(feature = "guest_debug")] vm_debug_evt, &self.seccomp_action, @@ -1683,6 +1847,7 @@ impl RequestHandler for Vmm { None, None, None, + None, )?; self.vm = Some(vm); @@ -1760,24 +1925,37 @@ impl RequestHandler for Vmm { for net in restored_nets.iter() { for net_config in vm_net_configs.iter_mut() { // update only if the net dev is backed by FDs - if net_config.id.as_ref() == Some(&net.id) && net_config.fds.is_some() { + if net_config.pci_common.id.as_ref() == Some(&net.id) + && net_config.fds.is_some() + { net_config.fds.clone_from(&net.fds); } } } } - self.vm_restore(source_url, vm_config, restore_cfg.prefault) - .map_err(|vm_restore_err| { - error!("VM Restore failed: {vm_restore_err:?}"); - - // Cleanup the VM being created while vm restore - if let Err(e) = self.vm_delete() { - return e; - } + self.vm_restore( + source_url, + vm_config, + restore_cfg.prefault, + restore_cfg.memory_restore_mode, + ) + .and_then(|()| { + if restore_cfg.resume { + self.vm_resume() + } else { + Ok(()) + } + }) + .map_err(|e| { + error!("VM Restore failed: {e:?}"); + if let Err(e) = self.vm_delete() { + return e; + } + e + })?; - 
vm_restore_err - }) + Ok(()) } #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] @@ -1823,6 +2001,10 @@ impl RequestHandler for Vmm { let exit_evt = self.exit_evt.try_clone().map_err(VmError::EventFdClone)?; let reset_evt = self.reset_evt.try_clone().map_err(VmError::EventFdClone)?; + let guest_exit_evt = self + .guest_exit_evt + .try_clone() + .map_err(VmError::EventFdClone)?; #[cfg(feature = "guest_debug")] let debug_evt = self .vm_debug_evt @@ -1848,6 +2030,7 @@ impl RequestHandler for Vmm { config, exit_evt, reset_evt, + guest_exit_evt, #[cfg(feature = "guest_debug")] debug_evt, &self.seccomp_action, @@ -1859,6 +2042,7 @@ impl RequestHandler for Vmm { None, None, None, + None, )?; // And we boot it @@ -1880,9 +2064,11 @@ impl RequestHandler for Vmm { }; let config = vm_config.lock().unwrap().clone(); - let mut memory_actual_size = config.memory.total_size(); + let mut memory_actual_size = + config.memory.total_size() - config.memory.hotplugged_size(); if let Some(vm) = &self.vm { - memory_actual_size -= vm.balloon_size(); + memory_actual_size = memory_actual_size.saturating_sub(vm.balloon_size()); + memory_actual_size += vm.virtio_mem_plugged_size(); } let device_tree = self @@ -2125,6 +2311,39 @@ impl RequestHandler for Vmm { } } + fn vm_add_generic_vhost_user( + &mut self, + generic_vhost_user_cfg: GenericVhostUserConfig, + ) -> result::Result>, VmError> { + self.vm_config.as_ref().ok_or(VmError::VmNotCreated)?; + + { + // Validate the configuration change in a cloned configuration + let mut config = self.vm_config.as_ref().unwrap().lock().unwrap().clone(); + add_to_config( + &mut config.generic_vhost_user, + generic_vhost_user_cfg.clone(), + ); + config.validate().map_err(VmError::ConfigValidation)?; + } + + if let Some(ref mut vm) = self.vm { + let info = vm + .add_generic_vhost_user(generic_vhost_user_cfg) + .inspect_err(|e| { + error!("Error when adding new generic vhost-user device to the VM: {e:?}"); + })?; + serde_json::to_vec(&info) + 
.map(Some) + .map_err(VmError::SerializeJson) + } else { + // Update VmConfig by adding the new device. + let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + add_to_config(&mut config.generic_vhost_user, generic_vhost_user_cfg); + Ok(None) + } + } + fn vm_add_pmem(&mut self, pmem_cfg: PmemConfig) -> result::Result>, VmError> { self.vm_config.as_ref().ok_or(VmError::VmNotCreated)?; @@ -2268,8 +2487,12 @@ impl RequestHandler for Vmm { receive_data_migration.receiver_url ); + let mut listener = + migration_transport::receive_migration_listener(&receive_data_migration.receiver_url)?; // Accept the connection and get the socket - let mut socket = Vmm::receive_migration_socket(&receive_data_migration.receiver_url)?; + let mut socket = listener.accept()?; + + event!("vm", "migration-receive-started"); let mut state = ReceiveMigrationState::Established; @@ -2279,6 +2502,7 @@ impl RequestHandler for Vmm { let (response, new_state) = match self.vm_receive_migration_step( &mut socket, + &listener, state, &req, &receive_data_migration, @@ -2300,8 +2524,11 @@ impl RequestHandler for Vmm { } if let ReceiveMigrationState::Aborted = state { + event!("vm", "migration-receive-failed"); self.vm = None; self.vm_config = None; + } else { + event!("vm", "migration-receive-finished"); } Ok(()) @@ -2311,9 +2538,18 @@ impl RequestHandler for Vmm { &mut self, send_data_migration: VmSendMigrationData, ) -> result::Result<(), MigratableError> { + send_data_migration + .validate() + .context("Invalid send migration configuration") + .map_err(MigratableError::MigrateSend)?; + info!( - "Sending migration: destination_url = {}, local = {}", - send_data_migration.destination_url, send_data_migration.local + "Sending migration: destination_url={},local={},downtime={}ms,timeout={}s,timeout_strategy={:?}", + send_data_migration.destination_url, + send_data_migration.local, + send_data_migration.downtime().as_millis(), + send_data_migration.timeout().as_secs(), + 
send_data_migration.timeout_strategy ); if !self @@ -2330,41 +2566,56 @@ impl RequestHandler for Vmm { ))); } - if let Some(vm) = self.vm.as_mut() { - Self::send_migration( - vm, - #[cfg(all(feature = "kvm", target_arch = "x86_64"))] - self.hypervisor.as_ref(), - &send_data_migration, - ) - .map_err(|migration_err| { - error!("Migration failed: {migration_err:?}"); - - // Stop logging dirty pages only for non-local migrations - if !send_data_migration.local - && let Err(e) = vm.stop_dirty_log() - { - return e; - } + let vm = self + .vm + .as_mut() + .ok_or_else(|| MigratableError::MigrateSend(anyhow!("VM is not running")))?; - if vm.get_state() == VmState::Paused - && let Err(e) = vm.resume() - { - return e; - } + // Only running VMs can be migrated: Future work can fix this to allow + // also the migration of paused VMs while preserving the state in success + // and error case. See #7815. + if vm.get_state() != VmState::Running { + return Err(MigratableError::MigrateSend(anyhow!( + "VM is not in running state: {:?}", + vm.get_state() + ))); + } - migration_err - })?; + event!("vm", "migration-started"); + Self::send_migration( + vm, + #[cfg(all(feature = "kvm", target_arch = "x86_64"))] + self.hypervisor.as_ref(), + &send_data_migration, + ) + .map_err(|migration_err| { + error!("Migration failed: {migration_err:?}"); + event!("vm", "migration-failed"); - // Shutdown the VM after the migration succeeded - self.exit_evt.write(1).map_err(|e| { - MigratableError::MigrateSend(anyhow!( - "Failed shutting down the VM after migration: {e:?}" - )) - }) - } else { - Err(MigratableError::MigrateSend(anyhow!("VM is not running"))) - } + // Stop logging dirty pages only for non-local migrations + if !send_data_migration.local + && let Err(e) = vm.stop_dirty_log() + { + return e; + } + + if vm.get_state() == VmState::Paused + && let Err(e) = vm.resume() + { + return e; + } + + migration_err + })?; + + event!("vm", "migration-finished"); + + // Shutdown the VM after the 
migration succeeded + self.exit_evt.write(1).map_err(|e| { + MigratableError::MigrateSend(anyhow!( + "Failed shutting down the VM after migration: {e:?}" + )) + }) } } @@ -2374,12 +2625,14 @@ const DEVICE_MANAGER_SNAPSHOT_ID: &str = "device-manager"; #[cfg(test)] mod unit_tests { + use std::path::PathBuf; + use super::*; #[cfg(target_arch = "x86_64")] use crate::vm_config::DebugConsoleConfig; use crate::vm_config::{ - ConsoleConfig, ConsoleOutputMode, CpuFeatures, CpusConfig, HotplugMethod, MemoryConfig, - PayloadConfig, RngConfig, + ConsoleConfig, ConsoleOutputMode, CoreScheduling, CpuFeatures, CpusConfig, HotplugMethod, + MemoryConfig, PayloadConfig, RngConfig, }; fn create_dummy_vmm() -> Vmm { @@ -2393,6 +2646,7 @@ mod unit_tests { SeccompAction::Allow, hypervisor::new().unwrap(), EventFd::new(EFD_NONBLOCK).unwrap(), + false, ) .unwrap() } @@ -2408,6 +2662,7 @@ mod unit_tests { affinity: None, features: CpuFeatures::default(), nested: true, + core_scheduling: CoreScheduling::default(), }, memory: MemoryConfig { size: 536_870_912, @@ -2443,6 +2698,7 @@ mod unit_tests { }, balloon: None, fs: None, + generic_vhost_user: None, pmem: None, serial: ConsoleConfig { file: None, @@ -2679,6 +2935,59 @@ mod unit_tests { ); } + #[test] + fn test_vmm_vm_cold_add_generic_vhost_user() { + let mut vmm = create_dummy_vmm(); + let generic_vhost_user_config = + GenericVhostUserConfig::parse("virtio_id=26,socket=/tmp/sock,queue_sizes=[1024]") + .unwrap(); + + assert!(matches!( + vmm.vm_add_generic_vhost_user(generic_vhost_user_config.clone()), + Err(VmError::VmNotCreated) + )); + + let _ = vmm.vm_create(create_dummy_vm_config()); + assert!( + vmm.vm_config + .as_ref() + .unwrap() + .lock() + .unwrap() + .generic_vhost_user + .is_none() + ); + + assert!( + vmm.vm_add_generic_vhost_user(generic_vhost_user_config.clone()) + .unwrap() + .is_none() + ); + assert_eq!( + vmm.vm_config + .as_ref() + .unwrap() + .lock() + .unwrap() + .generic_vhost_user + .clone() + .unwrap() + .len(), + 1 
+ ); + assert_eq!( + vmm.vm_config + .as_ref() + .unwrap() + .lock() + .unwrap() + .generic_vhost_user + .clone() + .unwrap()[0], + generic_vhost_user_config + ); + } + #[test] fn test_vmm_vm_cold_add_pmem() { let mut vmm = create_dummy_vmm(); diff --git a/vmm/src/memory_manager.rs b/vmm/src/memory_manager.rs index 03b7b8a837..a01949b0fc 100644 --- a/vmm/src/memory_manager.rs +++ b/vmm/src/memory_manager.rs @@ -7,13 +7,13 @@ use std::collections::BTreeMap; use std::collections::HashMap; use std::fs::{File, OpenOptions}; -use std::io::{self}; +use std::io::{self, Read as _, Seek, SeekFrom}; use std::ops::{BitAnd, Not, Sub}; -#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] -use std::os::fd::AsFd; +use std::os::fd::{AsFd, OwnedFd}; use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::mpsc::{self, Receiver, SyncSender}; use std::sync::{Arc, Barrier, Mutex}; use std::{ffi, result, thread}; @@ -35,24 +35,40 @@ use vm_allocator::GsiApic; use vm_allocator::{AddressAllocator, MemorySlotAllocator, SystemAllocator}; use vm_device::BusDevice; use vm_memory::bitmap::AtomicBitmap; -use vm_memory::guest_memory::FileOffset; +use vm_memory::guest_memory::{Error as MmapError, FileOffset}; use vm_memory::mmap::MmapRegionError; use vm_memory::{ - Address, Error as MmapError, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, - GuestMemoryError, GuestMemoryRegion, GuestUsize, MmapRegion, ReadVolatile, + Address, Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, + GuestMemoryError, GuestMemoryRegion, GuestUsize, MmapRegion, }; use vm_migration::protocol::{MemoryRange, MemoryRangeTable}; use vm_migration::{ Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable, Transportable, + UffdError, }; +use vmm_sys_util::eventfd::EventFd; +use crate::config::MemoryRestoreMode; #[cfg(all(target_arch = "x86_64", 
feature = "guest_debug"))] use crate::coredump::{ CoredumpMemoryRegion, CoredumpMemoryRegions, DumpState, GuestDebuggableError, }; use crate::migration::url_to_path; use crate::vm_config::{HotplugMethod, MemoryConfig, MemoryZoneConfig}; -use crate::{GuestMemoryMmap, GuestRegionMmap, MEMORY_MANAGER_SNAPSHOT_ID}; +use crate::{GuestMemoryMmap, GuestRegionMmap, MEMORY_MANAGER_SNAPSHOT_ID, uffd}; + +struct UffdHandler { + stop_event: EventFd, + result_rx: Receiver>, + handle: thread::JoinHandle<()>, +} + +struct UffdRange { + host_addr: u64, + length: u64, + file_offset: u64, + page_size: u64, +} pub const MEMORY_MANAGER_ACPI_SIZE: usize = 0x18; @@ -116,13 +132,27 @@ impl VirtioMemZone { } } -#[derive(Default)] pub struct MemoryZone { regions: Vec>, virtio_mem_zone: Option, + shared: bool, + hugepages: bool, + backing_page_size: u64, + mergeable: bool, } impl MemoryZone { + fn new(shared: bool, hugepages: bool, backing_page_size: u64, mergeable: bool) -> Self { + Self { + regions: Vec::new(), + virtio_mem_zone: None, + shared, + hugepages, + backing_page_size, + mergeable, + } + } + pub fn regions(&self) -> &Vec> { &self.regions } @@ -132,6 +162,21 @@ impl MemoryZone { pub fn virtio_mem_zone_mut(&mut self) -> Option<&mut VirtioMemZone> { self.virtio_mem_zone.as_mut() } + + fn backing_page_size_for_gpa(&self, gpa: u64) -> Option { + if self.regions.iter().any(|region| { + let start = region.start_addr().raw_value(); + gpa >= start && gpa < start + region.len() + }) { + return Some(self.backing_page_size); + } + + self.virtio_mem_zone.as_ref().and_then(|virtio_mem_zone| { + let start = virtio_mem_zone.region.start_addr().raw_value(); + (gpa >= start && gpa < start + virtio_mem_zone.region.len()) + .then_some(self.backing_page_size) + }) + } } pub type MemoryZones = HashMap; @@ -187,6 +232,7 @@ pub struct MemoryManager { // This is useful for getting the dirty pages as we need to know the // slots that the mapping is created in. 
guest_ram_mappings: Vec, + uffd_handler: Option, pub acpi_address: Option, #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] @@ -207,6 +253,10 @@ pub enum Error { #[error("Mmap backed guest memory error")] GuestMemory(#[source] MmapError), + /// Guest region collection error + #[error("Guest region collection error")] + GuestRegionCollection(#[source] vm_memory::GuestRegionCollectionError), + /// Failed to allocate a memory range. #[error("Failed to allocate a memory range")] MemoryRangeAllocation, @@ -350,6 +400,12 @@ pub enum Error { MisalignedMemorySize, } +impl From for Error { + fn from(e: UffdError) -> Self { + Error::Restore(MigratableError::OnDemandRestore(e)) + } +} + const ENABLE_FLAG: usize = 0; const INSERTING_FLAG: usize = 1; const REMOVING_FLAG: usize = 2; @@ -551,7 +607,10 @@ impl MemoryManager { } // Add zone id to the list of memory zones. - memory_zones.insert(zone.id.clone(), MemoryZone::default()); + memory_zones.insert( + zone.id.clone(), + MemoryZone::new(zone.shared, zone.hugepages, zone_align_size, zone.mergeable), + ); for ram_region in ram_regions.iter() { let mut ram_region_offset = 0; @@ -642,7 +701,15 @@ impl MemoryManager { ); return Err(Error::DuplicateZoneId); } - memory_zones.insert(zone.id.clone(), MemoryZone::default()); + memory_zones.insert( + zone.id.clone(), + MemoryZone::new( + zone.shared, + zone.hugepages, + zone_align_size, + zone.mergeable, + ), + ); } if ram_region_consumed { @@ -670,7 +737,16 @@ impl MemoryManager { let mut memory_zones = HashMap::new(); for zone_config in zones_config { - memory_zones.insert(zone_config.id.clone(), MemoryZone::default()); + let zone_page_size = memory_zone_get_align_size(zone_config)?; + memory_zones.insert( + zone_config.id.clone(), + MemoryZone::new( + zone_config.shared, + zone_config.hugepages, + zone_page_size, + zone_config.mergeable, + ), + ); } for guest_ram_mapping in guest_ram_mappings { @@ -760,6 +836,360 @@ impl MemoryManager { Ok(()) } + /// Restore guest 
memory using userfaultfd for lazy demand paging. + /// + /// Instead of reading the entire snapshot into guest RAM upfront (which + /// blocks restore for hundreds of milliseconds at multi-GB sizes), this + /// registers the guest memory regions with a userfaultfd. A background + /// thread handles page faults by reading the corresponding page from the + /// snapshot file and copying it into guest memory via `UFFDIO_COPY`. + /// + /// This preserves the original memory mapping type (anonymous or shared), + /// making it compatible with VFIO device passthrough and shared-memory + /// guest RAM. + /// + /// Fails the restore if UFFD setup cannot be completed successfully. + /// + /// The handler thread keeps the snapshot file open while lazy restore + /// is active. The file must remain available until the VM is shut down or + /// all faulted pages have been served. + fn restore_by_uffd( + &mut self, + file_path: &Path, + saved_regions: &MemoryRangeTable, + exit_evt: &EventFd, + ) -> Result<(), Error> { + if saved_regions.is_empty() { + return Ok(()); + } + + let guest_memory = self.guest_memory.memory(); + let required_uffd_features = self.required_uffd_features(); + + // SAFETY: FFI call. Trivially safe. 
+ let base_page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) } as u64; + + info!( + "UFFD restore: attempting demand-paged restore for {} region(s)", + saved_regions.regions().len() + ); + + if saved_regions + .regions() + .iter() + .any(|range| range.gpa % base_page_size != 0 || range.length % base_page_size != 0) + { + return Err(UffdError::UnalignedRanges.into()); + } + + let snapshot_file = File::open(file_path).map_err(Error::SnapshotOpen)?; + + let uffd_fd = uffd::create(required_uffd_features).map_err(UffdError::Create)?; + + let mut handler_ranges: Vec = Vec::new(); + let mut file_offset: u64 = 0; + + for range in saved_regions.regions() { + let host_addr = guest_memory + .get_host_address(GuestAddress(range.gpa)) + .map_err(|_| UffdError::GpaTranslation { gpa: range.gpa })? + as u64; + + let ioctls = uffd::register(uffd_fd.as_fd(), host_addr, range.length).map_err(|e| { + UffdError::Register { + addr: host_addr, + len: range.length, + source: e, + } + })?; + + if ioctls & crate::userfaultfd::UFFD_API_RANGE_IOCTLS_BASIC + != crate::userfaultfd::UFFD_API_RANGE_IOCTLS_BASIC + { + return Err(UffdError::MissingIoctlSupport { + addr: host_addr, + len: range.length, + } + .into()); + } + + let range_page_size = self + .memory_zones + .values() + .find_map(|zone| zone.backing_page_size_for_gpa(range.gpa)) + .unwrap_or(base_page_size); + + handler_ranges.push(UffdRange { + host_addr, + length: range.length, + file_offset, + page_size: range_page_size, + }); + + file_offset += range.length; + } + + info!( + "UFFD restore: registered {} region(s), {} total bytes, spawning handler", + handler_ranges.len(), + file_offset + ); + + let stop_event = EventFd::new(libc::EFD_NONBLOCK).map_err(Error::EventFdFail)?; + let thread_stop_event = stop_event.try_clone().map_err(Error::EventFdFail)?; + let thread_exit_evt = exit_evt.try_clone().map_err(Error::EventFdFail)?; + let (ready_tx, ready_rx) = mpsc::sync_channel(1); + let (result_tx, result_rx) = mpsc::sync_channel(1); 
+ let handle = thread::Builder::new() + .name("uffd-handler".to_string()) + .spawn(move || { + std::panic::catch_unwind(std::panic::AssertUnwindSafe(move || { + let max_page_size = handler_ranges + .iter() + .map(|r| r.page_size) + .max() + .unwrap_or(base_page_size); + let result = Self::uffd_handler_loop( + uffd_fd, + thread_stop_event, + snapshot_file, + &handler_ranges, + max_page_size, + &ready_tx, + ); + + if let Err(e) = &result { + error!("UFFD handler exited with error: {e}"); + } + + result_tx.send(result).ok(); + })) + .map_err(|_| { + error!("uffd-handler thread panicked"); + thread_exit_evt.write(1).ok(); + }) + .ok(); + }) + .map_err(UffdError::SpawnThread)?; + + if ready_rx.recv().is_err() { + handle.join().ok(); + return Err(UffdError::HandlerStartup.into()); + } + + if let Ok(Err(e)) = result_rx.try_recv() { + handle.join().ok(); + return Err(UffdError::HandlerFailed(e).into()); + } + + self.uffd_handler = Some(UffdHandler { + stop_event, + result_rx, + handle, + }); + + info!("UFFD restore: demand-paged restore enabled"); + + Ok(()) + } + + fn required_uffd_features(&self) -> u64 { + let mut features = 0u64; + if self.memory_zones.values().any(|z| z.shared && !z.hugepages) { + features |= crate::userfaultfd::UFFD_FEATURE_MISSING_SHMEM; + } + if self.memory_zones.values().any(|z| z.hugepages) { + features |= crate::userfaultfd::UFFD_FEATURE_MISSING_HUGETLBFS; + } + features + } + + fn stop_uffd_handler(&mut self) { + if let Some(uffd_handler) = self.uffd_handler.take() { + uffd_handler.stop_event.write(1).ok(); + uffd_handler.handle.join().ok(); + + match uffd_handler.result_rx.try_recv() { + Ok(Err(e)) => error!("UFFD handler terminated with error: {e}"), + Err(mpsc::TryRecvError::Disconnected) => { + warn!("UFFD handler terminated unexpectedly (possible panic)"); + } + _ => {} + } + } + } + + /// Poll the UFFD fd and serve page faults from the snapshot file. + /// + /// Runs until the fd is closed (EPOLLHUP) or an unrecoverable error occurs. 
+ /// Each fault triggers a seek + read from the snapshot file followed by a + /// `UFFDIO_COPY` to resolve the fault and wake the faulting thread. + #[allow(clippy::needless_pass_by_value)] + fn uffd_handler_loop( + uffd_fd: OwnedFd, + stop_event: EventFd, + mut snapshot_file: File, + ranges: &[UffdRange], + page_size: u64, + ready_tx: &SyncSender<()>, + ) -> Result<(), io::Error> { + let uffd_raw_fd = uffd_fd.as_raw_fd(); + let mut page_buf = vec![0u8; page_size as usize]; + + let total_pages: u64 = ranges.iter().map(|r| r.length.div_ceil(r.page_size)).sum(); + let mut pages_served: u64 = 0; + + const EVENT_STOP: u64 = 0; + const EVENT_UFFD: u64 = 1; + + let epoll_fd = epoll::create(true).map_err(io::Error::other)?; + // SAFETY: epoll_fd is valid and owned by this scope. + let _epoll_file = unsafe { File::from_raw_fd(epoll_fd) }; + + epoll::ctl( + epoll_fd, + epoll::ControlOptions::EPOLL_CTL_ADD, + stop_event.as_raw_fd(), + epoll::Event::new(epoll::Events::EPOLLIN, EVENT_STOP), + ) + .map_err(io::Error::other)?; + + epoll::ctl( + epoll_fd, + epoll::ControlOptions::EPOLL_CTL_ADD, + uffd_raw_fd, + epoll::Event::new(epoll::Events::EPOLLIN | epoll::Events::EPOLLHUP, EVENT_UFFD), + ) + .map_err(io::Error::other)?; + + ready_tx.send(()).ok(); + + let mut events = vec![epoll::Event::new(epoll::Events::empty(), 0); 2]; + loop { + let num_events = match epoll::wait(epoll_fd, -1, &mut events) { + Ok(n) => n, + Err(e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => return Err(e), + }; + + let mut got_uffd_data = false; + for event in events.iter().take(num_events) { + let token = event.data; + let evt_flags = event.events; + + if token == EVENT_STOP { + stop_event.read().ok(); + info!("UFFD handler: received stop event, exiting"); + return Ok(()); + } + + if token == EVENT_UFFD + && (evt_flags & epoll::Events::EPOLLHUP.bits()) != 0 + && (evt_flags & epoll::Events::EPOLLIN.bits()) == 0 + { + info!("UFFD handler: fd closed (EPOLLHUP), exiting"); + return 
Ok(()); + } + + if token == EVENT_UFFD && (evt_flags & epoll::Events::EPOLLIN.bits()) != 0 { + got_uffd_data = true; + } + } + + if !got_uffd_data { + continue; + } + + // SAFETY: UffdMsg is a plain repr(C) struct, safe to zero-init. + let mut msg: uffd::UffdMsg = unsafe { std::mem::zeroed() }; + // SAFETY: reading a uffd_msg-sized struct from the valid uffd fd. + let n = unsafe { + libc::read( + uffd_raw_fd, + &mut msg as *mut uffd::UffdMsg as *mut libc::c_void, + std::mem::size_of::(), + ) + }; + if n < 0 { + let err = io::Error::last_os_error(); + if err.kind() == io::ErrorKind::WouldBlock { + continue; + } + return Err(err); + } + if n == 0 { + info!("UFFD handler: EOF on fd, exiting"); + return Ok(()); + } + if n as usize != std::mem::size_of::() { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "Short read from userfaultfd", + )); + } + + if msg.event != crate::userfaultfd::UFFD_EVENT_PAGEFAULT { + continue; + } + + let fault_addr = msg.pf_address; + + let mut served = false; + for range in ranges { + // Round down to the page boundary containing the faulted address. 
+ let page_addr = fault_addr & !(range.page_size - 1); + if page_addr >= range.host_addr && page_addr < range.host_addr + range.length { + let offset_in_range = page_addr - range.host_addr; + let file_pos = range.file_offset + offset_in_range; + + snapshot_file.seek(SeekFrom::Start(file_pos))?; + snapshot_file.read_exact(&mut page_buf[..range.page_size as usize])?; + + loop { + match uffd::copy( + uffd_fd.as_fd(), + page_addr, + page_buf.as_ptr(), + range.page_size, + ) { + Ok(()) => { + pages_served += 1; + break; + } + Err(e) if e.raw_os_error() == Some(libc::EEXIST) => { + if let Err(e) = + uffd::wake(uffd_fd.as_fd(), page_addr, range.page_size) + { + warn!("UFFDIO_WAKE failed at {page_addr:#x}: {e}"); + } + break; + } + Err(e) if e.raw_os_error() == Some(libc::EAGAIN) => { + // The kernel can report a transient EAGAIN while the fault + // is being resolved; yield and retry instead of aborting restore. + thread::yield_now(); + } + Err(e) => return Err(e), + } + } + served = true; + break; + } + } + + if !served { + return Err(io::Error::other(format!( + "UFFD handler: fault at {fault_addr:#x} does not belong to any registered range", + ))); + } + + if pages_served == total_pages { + info!("UFFD handler: all {pages_served} pages served, exiting"); + return Ok(()); + } + } + } + fn validate_memory_config( config: &MemoryConfig, user_provided_zones: bool, @@ -877,6 +1307,7 @@ impl MemoryManager { hotplug_size: config.hotplug_size, hotplugged_size: config.hotplugged_size, prefault: config.prefault, + mergeable: config.mergeable, }]; Ok((config.size, zones, allow_mem_hotplug)) @@ -898,10 +1329,10 @@ impl MemoryManager { regions.push((virtio_mem_zone.region().clone(), true)); } - list.push((zone_id.clone(), regions)); + list.push((zone_id.clone(), regions, memory_zone.mergeable)); } - for (zone_id, regions) in list { + for (zone_id, regions, zone_mergeable) in list { for (region, virtio_mem) in regions { // SAFETY: guaranteed by GuestRegionMmap invariants let slot = 
unsafe { @@ -909,7 +1340,7 @@ impl MemoryManager { region.start_addr().raw_value(), region.len().try_into().unwrap(), region.as_ptr(), - self.mergeable, + zone_mergeable, false, self.log_dirty, ) @@ -954,9 +1385,9 @@ impl MemoryManager { Ok(()) } - #[cfg(target_arch = "aarch64")] + #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] pub fn add_uefi_flash(&mut self) -> Result<(), Error> { - // On AArch64, the UEFI binary requires a flash device at address 0. + // The UEFI binary requires a flash device at address 0. // 4 MiB memory is mapped to simulate the flash. let uefi_mem_slot = self.allocate_memory_slot(); let uefi_region = GuestRegionMmap::new( @@ -1034,7 +1465,7 @@ impl MemoryManager { config.thp, )?; let guest_memory = - GuestMemoryMmap::from_arc_regions(regions).map_err(Error::GuestMemory)?; + GuestMemoryMmap::from_arc_regions(regions).map_err(Error::GuestRegionCollection)?; let boot_guest_memory = guest_memory.clone(); ( GuestAddress(data.start_of_device_area), @@ -1071,8 +1502,8 @@ impl MemoryManager { let (mem_regions, mut memory_zones) = Self::create_memory_regions_from_zones(&ram_regions, &zones, prefault, config.thp)?; - let mut guest_memory = - GuestMemoryMmap::from_arc_regions(mem_regions).map_err(Error::GuestMemory)?; + let mut guest_memory = GuestMemoryMmap::from_arc_regions(mem_regions) + .map_err(Error::GuestRegionCollection)?; let boot_guest_memory = guest_memory.clone(); @@ -1120,7 +1551,7 @@ impl MemoryManager { guest_memory = guest_memory .insert_region(Arc::clone(®ion)) - .map_err(Error::GuestMemory)?; + .map_err(Error::GuestRegionCollection)?; let hotplugged_size = zone.hotplugged_size.unwrap_or(0); let region_size = region.len(); @@ -1227,6 +1658,7 @@ impl MemoryManager { snapshot_memory_ranges: MemoryRangeTable::default(), memory_zones, guest_ram_mappings: Vec::new(), + uffd_handler: None, acpi_address, log_dirty: dynamic, // Cannot log dirty pages on a TD arch_mem_regions, @@ -1240,13 +1672,16 @@ impl MemoryManager { 
Ok(Arc::new(Mutex::new(memory_manager))) } + #[allow(clippy::too_many_arguments)] pub fn new_from_snapshot( snapshot: &Snapshot, vm: Arc, config: &MemoryConfig, source_url: Option<&str>, prefault: bool, + memory_restore_mode: MemoryRestoreMode, phys_bits: u8, + exit_evt: &EventFd, ) -> Result>, Error> { if let Some(source_url) = source_url { let mut memory_file_path = url_to_path(source_url).map_err(Error::Restore)?; @@ -1266,9 +1701,17 @@ impl MemoryManager { Default::default(), )?; - mm.lock() - .unwrap() - .fill_saved_regions(memory_file_path, &mem_snapshot.memory_ranges)?; + if memory_restore_mode == MemoryRestoreMode::OnDemand { + mm.lock().unwrap().restore_by_uffd( + &memory_file_path, + &mem_snapshot.memory_ranges, + exit_evt, + )?; + } else { + mm.lock() + .unwrap() + .fill_saved_regions(memory_file_path, &mem_snapshot.memory_ranges)?; + } Ok(mm) } else { @@ -1535,9 +1978,9 @@ impl MemoryManager { thp, )?; - Ok(Arc::new( - GuestRegionMmap::new(r, start_addr).map_err(Error::GuestMemory)?, - )) + Ok(Arc::new(GuestRegionMmap::new(r, start_addr).ok_or( + Error::GuestMemory(MmapError::InvalidGuestAddress(start_addr)), + )?)) } // Duplicate of `memory_zone_get_align_size` that does not require a `zone` @@ -1598,7 +2041,7 @@ impl MemoryManager { .guest_memory .memory() .insert_region(region) - .map_err(Error::GuestMemory)?; + .map_err(Error::GuestRegionCollection)?; self.guest_memory.lock().unwrap().replace(guest_memory); Ok(()) @@ -1657,7 +2100,9 @@ impl MemoryManager { region.start_addr().0, region.len().try_into().unwrap(), region.as_ptr(), - self.mergeable, + self.memory_zones + .get(DEFAULT_MEMORY_ZONE) + .map_or(self.mergeable, |z| z.mergeable), false, self.log_dirty, ) @@ -1990,6 +2435,19 @@ impl MemoryManager { unsafe { (*stat.as_ptr()).st_nlink as usize > 0 } } + pub fn virtio_mem_plugged_size(&self) -> u64 { + self.memory_zones + .values() + .filter_map(|zone| { + zone.virtio_mem_zone + .as_ref()? 
+ .virtio_device + .as_ref() + .map(|dev| dev.lock().unwrap().plugged_size()) + }) + .sum() + } + pub fn memory_zones(&self) -> &MemoryZones { &self.memory_zones } @@ -2142,47 +2600,6 @@ impl MemoryManager { debug!("coredump total bytes {total_bytes}"); Ok(()) } - - pub fn receive_memory_regions( - &mut self, - ranges: &MemoryRangeTable, - fd: &mut F, - ) -> std::result::Result<(), MigratableError> - where - F: ReadVolatile, - { - let guest_memory = self.guest_memory(); - let mem = guest_memory.memory(); - - for range in ranges.regions() { - let mut offset: u64 = 0; - // Here we are manually handling the retry in case we can't the - // whole region at once because we can't use the implementation - // from vm-memory::GuestMemory of read_exact_from() as it is not - // following the correct behavior. For more info about this issue - // see: https://github.com/rust-vmm/vm-memory/issues/174 - loop { - let bytes_read = mem - .read_volatile_from( - GuestAddress(range.gpa + offset), - fd, - (range.length - offset) as usize, - ) - .map_err(|e| { - MigratableError::MigrateReceive(anyhow!( - "Error receiving memory from socket: {e}" - )) - })?; - offset += bytes_read as u64; - - if offset == range.length { - break; - } - } - } - - Ok(()) - } } struct MemoryNotify { @@ -2530,6 +2947,12 @@ impl Aml for MemoryManager { impl Pausable for MemoryManager {} +impl Drop for MemoryManager { + fn drop(&mut self) { + self.stop_uffd_handler(); + } +} + #[derive(Clone, Serialize, Deserialize)] pub struct MemoryManagerSnapshotData { memory_ranges: MemoryRangeTable, @@ -2673,11 +3096,11 @@ impl Migratable for MemoryManager { let sub_table = MemoryRangeTable::from_dirty_bitmap(dirty_bitmap, r.gpa, 4096); if sub_table.regions().is_empty() { - info!("Dirty Memory Range Table is empty"); + debug!("Dirty Memory Range Table is empty"); } else { - info!("Dirty Memory Range Table:"); + debug!("Dirty Memory Range Table:"); for range in sub_table.regions() { - info!("GPA: {:x} size: {} (KiB)", 
range.gpa, range.length / 1024); + debug!("GPA: {:x} size: {} (KiB)", range.gpa, range.length / 1024); } } diff --git a/vmm/src/migration_transport.rs b/vmm/src/migration_transport.rs new file mode 100644 index 0000000000..6440dc8fd6 --- /dev/null +++ b/vmm/src/migration_transport.rs @@ -0,0 +1,979 @@ +// Copyright © 2026 Contributors to the Cloud Hypervisor project +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::io::{self, ErrorKind, Read, Write}; +use std::net::{TcpListener, TcpStream}; +use std::num::NonZeroU32; +use std::os::fd::{AsFd, BorrowedFd}; +use std::os::unix::io::{AsRawFd, RawFd}; +use std::os::unix::net::{UnixListener, UnixStream}; +use std::path::PathBuf; +use std::result::Result; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::mpsc::{Receiver, Sender, SyncSender, TrySendError, channel, sync_channel}; +use std::sync::{Arc, Mutex}; +use std::thread; +use std::time::Duration; + +use anyhow::{Context, anyhow}; +use log::{debug, error, info, warn}; +use serde_json; +use vm_memory::bitmap::BitmapSlice; +use vm_memory::{ + Bytes, GuestAddress, GuestAddressSpace, GuestMemoryAtomic, ReadVolatile, VolatileMemoryError, + VolatileSlice, WriteVolatile, +}; +use vm_migration::protocol::{Command, MemoryRangeTable, Request, Response}; +use vm_migration::{MigratableError, Snapshot}; +use vmm_sys_util::eventfd::EventFd; + +use crate::sync_utils::Gate; +use crate::{GuestMemoryMmap, VmMigrationConfig}; + +/// Hard upper bound for migration worker connections on both the sender and +/// receiver side. +pub(crate) const MAX_MIGRATION_CONNECTIONS: u32 = 128; + +/// Transport-agnostic listener used to receive connections. +#[derive(Debug)] +pub(crate) enum ReceiveListener { + Tcp(TcpListener), + Unix(UnixListener), +} + +impl ReceiveListener { + /// Block until a connection is accepted. 
+ pub(crate) fn accept(&mut self) -> Result { + match self { + ReceiveListener::Tcp(listener) => listener + .accept() + .map(|(socket, _)| SocketStream::Tcp(socket)) + .context("Failed to accept TCP migration connection") + .map_err(MigratableError::MigrateReceive), + ReceiveListener::Unix(listener) => listener + .accept() + .map(|(socket, _)| SocketStream::Unix(socket)) + .context("Failed to accept Unix migration connection") + .map_err(MigratableError::MigrateReceive), + } + } + + /// Same as [`Self::accept`], but returns `None` if the abort event was signaled. + fn abortable_accept( + &mut self, + abort_event: &impl AsRawFd, + ) -> Result, MigratableError> { + if wait_for_readable(&self, abort_event) + .context("Error while waiting for socket to become readable") + .map_err(MigratableError::MigrateReceive)? + { + // The listener is readable; accept the connection. + Ok(Some(self.accept()?)) + } else { + // The abort event was signaled before any connection arrived. + Ok(None) + } + } + + /// Tries to clone a [`ReceiveListener`]. + pub(crate) fn try_clone(&self) -> Result { + match self { + ReceiveListener::Tcp(listener) => listener + .try_clone() + .map(ReceiveListener::Tcp) + .context("Failed to clone TCP listener") + .map_err(MigratableError::MigrateReceive), + ReceiveListener::Unix(listener) => listener + .try_clone() + .map(ReceiveListener::Unix) + .context("Failed to clone Unix listener") + .map_err(MigratableError::MigrateReceive), + } + } +} + +impl AsFd for ReceiveListener { + fn as_fd(&self) -> BorrowedFd<'_> { + match self { + ReceiveListener::Tcp(listener) => listener.as_fd(), + ReceiveListener::Unix(listener) => listener.as_fd(), + } + } +} + +/// Transport-agnostic stream used by the migration protocol. 
+pub(crate) enum SocketStream { + Unix(UnixStream), + Tcp(TcpStream), +} + +impl Read for SocketStream { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + match self { + SocketStream::Unix(stream) => stream.read(buf), + SocketStream::Tcp(stream) => stream.read(buf), + } + } +} + +impl Write for SocketStream { + fn write(&mut self, buf: &[u8]) -> io::Result { + match self { + SocketStream::Unix(stream) => stream.write(buf), + SocketStream::Tcp(stream) => stream.write(buf), + } + } + + fn flush(&mut self) -> io::Result<()> { + match self { + SocketStream::Unix(stream) => stream.flush(), + SocketStream::Tcp(stream) => stream.flush(), + } + } +} + +impl AsRawFd for SocketStream { + fn as_raw_fd(&self) -> RawFd { + match self { + SocketStream::Unix(s) => s.as_raw_fd(), + SocketStream::Tcp(s) => s.as_raw_fd(), + } + } +} + +impl AsFd for SocketStream { + fn as_fd(&self) -> BorrowedFd<'_> { + match self { + SocketStream::Unix(s) => s.as_fd(), + SocketStream::Tcp(s) => s.as_fd(), + } + } +} + +impl ReadVolatile for SocketStream { + fn read_volatile( + &mut self, + buf: &mut VolatileSlice, + ) -> Result { + match self { + SocketStream::Unix(s) => s.read_volatile(buf), + SocketStream::Tcp(s) => s.read_volatile(buf), + } + } + + fn read_exact_volatile( + &mut self, + buf: &mut VolatileSlice, + ) -> Result<(), VolatileMemoryError> { + match self { + SocketStream::Unix(s) => s.read_exact_volatile(buf), + SocketStream::Tcp(s) => s.read_exact_volatile(buf), + } + } +} + +impl WriteVolatile for SocketStream { + fn write_volatile( + &mut self, + buf: &VolatileSlice, + ) -> Result { + match self { + SocketStream::Unix(s) => s.write_volatile(buf), + SocketStream::Tcp(s) => s.write_volatile(buf), + } + } + + fn write_all_volatile( + &mut self, + buf: &VolatileSlice, + ) -> Result<(), VolatileMemoryError> { + match self { + SocketStream::Unix(s) => s.write_all_volatile(buf), + SocketStream::Tcp(s) => s.write_all_volatile(buf), + } + } +} + +// Wait for `fd` to become readable. 
In this case, we return true. In case +// `abort_event` was signaled, return false. +fn wait_for_readable(fd: &impl AsFd, abort_event: &impl AsRawFd) -> Result { + let fd = fd.as_fd().as_raw_fd(); + let abort_event = abort_event.as_raw_fd(); + + let mut poll_fds = [ + libc::pollfd { + fd: abort_event, + events: libc::POLLIN, + revents: 0, + }, + libc::pollfd { + fd, + events: libc::POLLIN, + revents: 0, + }, + ]; + + loop { + // SAFETY: This is safe, because the file descriptors are valid and the + // poll_fds array is properly initialized. + let ret = unsafe { libc::poll(poll_fds.as_mut_ptr(), poll_fds.len() as libc::nfds_t, -1) }; + + if ret >= 0 { + break; + } + + let err = io::Error::last_os_error(); + if err.raw_os_error() == Some(libc::EINTR) { + continue; + } + + return Err(err); + } + + if poll_fds[0].revents & libc::POLLIN != 0 { + return Ok(false); + } + + if poll_fds[1].revents & libc::POLLIN != 0 { + return Ok(true); + } + + Err(io::Error::other( + "Poll returned, but neither file descriptor is readable?", + )) +} + +/// Struct to keep track of additional connections for receiving VM migration data. +#[derive(Debug)] +pub(crate) struct ReceiveAdditionalConnections { + /// This thread accepts incoming connections and spawns a new worker for + /// each connection that handles receiving memory. + accept_thread: Option>>, + + /// This fd gets signaled when the migration stops, and will then stop + /// the [`Self::accept_thread`]. + terminate_fd: EventFd, +} + +impl ReceiveAdditionalConnections { + /// Starts a thread to accept incoming connections and handle them. These + /// additional connections are used to receive additional memory regions + /// during VM migration. 
+ pub(crate) fn new( + listener: ReceiveListener, + guest_memory: GuestMemoryAtomic, + ) -> Result { + let event_fd = EventFd::new(0) + .context("Error creating terminate fd") + .map_err(MigratableError::MigrateReceive)?; + + let terminate_fd = event_fd + .try_clone() + .context("Error cloning terminate fd") + .map_err(MigratableError::MigrateReceive)?; + + let accept_thread = thread::Builder::new() + .name("migrate-receive-accept-connections".to_owned()) + .spawn(move || Self::accept_connections(listener, &terminate_fd, &guest_memory)) + .context("Error creating connection accept thread") + .map_err(MigratableError::MigrateReceive)?; + + Ok(Self { + accept_thread: Some(accept_thread), + terminate_fd: event_fd, + }) + } + + fn accept_connections( + mut listener: ReceiveListener, + terminate_fd: &EventFd, + guest_memory: &GuestMemoryAtomic, + ) -> Result<(), MigratableError> { + let mut threads: Vec>> = Vec::new(); + let mut first_err = loop { + let socket = match listener.abortable_accept(terminate_fd) { + Ok(socket) => socket, + Err(e) => break Err(e), + }; + let Some(mut socket) = socket else { + break Ok(()); + }; + + if threads.len() >= MAX_MIGRATION_CONNECTIONS as usize { + break Err(MigratableError::MigrateReceive(anyhow!( + "Received more than {MAX_MIGRATION_CONNECTIONS} additional migration connections." 
+ ))); + } + + let guest_memory = guest_memory.clone(); + let terminate_fd = match terminate_fd + .try_clone() + .context("Error cloning terminate fd") + .map_err(MigratableError::MigrateReceive) + { + Ok(terminate_fd) => terminate_fd, + Err(e) => break Err(e), + }; + + match thread::Builder::new() + .name(format!("migrate-receive-memory-{}", threads.len()).to_owned()) + .spawn(move || { + Self::worker_receive_memory(&mut socket, &terminate_fd, &guest_memory) + }) { + Ok(t) => threads.push(t), + Err(e) => { + error!("Error spawning receive-memory thread: {e}"); + break Err(MigratableError::MigrateReceive( + anyhow!(e).context("Error spawning receive-memory thread"), + )); + } + } + }; + + if first_err.is_err() { + warn!("Signaling termination due to an error while accepting connections."); + let _ = terminate_fd.write(1); + } + + info!("Stopped accepting additional connections. Cleaning up threads."); + + // We only return the first error we encounter here. + for thread in threads { + let err = match thread.join() { + Ok(Ok(())) => None, + Ok(Err(e)) => Some(e), + Err(panic) => Some(MigratableError::MigrateReceive(anyhow!( + "receive-memory thread panicked: {panic:?}" + ))), + }; + + if let Some(e) = err { + warn!("Error in receive-memory thread: {e}"); + + if first_err.is_ok() { + first_err = Err(e); + } + } + } + + first_err + } + + // Handles a `Memory` request by writing its payload to the VM memory. + fn worker_receive_memory( + mut socket: &mut SocketStream, + terminate_fd: &EventFd, + guest_memory: &GuestMemoryAtomic, + ) -> Result<(), MigratableError> { + loop { + // We only check whether we should abort when waiting for a new request. If the + // sender stops sending data mid-request, we will hang forever. + if !wait_for_readable(socket, terminate_fd) + .context("Failed to poll fds") + .map_err(MigratableError::MigrateReceive)? 
+ { + info!("Got signal to tear down connection."); + return Ok(()); + } + + let req = match Request::read_from(&mut socket) { + Ok(req) => req, + Err(MigratableError::MigrateSocket(io_error)) + if io_error.kind() == ErrorKind::UnexpectedEof => + { + // EOF is only handled here while reading the next request + // header. Each memory chunk is fully received and acked + // before the worker loops back to Request::read_from(), so + // EOF at this point means the sender finished sending + // memory rather than dropping a chunk mid-transfer. + debug!( + "Connection closed by peer as expected (sender finished sending memory)" + ); + return Ok(()); + } + Err(e) => return Err(e), + }; + + if req.command() != Command::Memory { + error!( + "Dropping connection. Only Memory commands are allowed on additional connections." + ); + return Err(MigratableError::MigrateReceive(anyhow!( + "Received non memory command on migration receive worker: {:?}", + req.command() + ))); + } + + receive_memory_ranges(guest_memory, &req, socket)?; + Response::ok().write_to(socket)?; + } + } + + /// Signals to the worker threads that the migration is finished and joins them. + /// If any thread encountered an error, this error is returned by this function. + pub(crate) fn cleanup(&mut self) -> Result<(), MigratableError> { + self.terminate_fd + .write(1) + .context("Failed to signal termination to worker threads.") + .map_err(MigratableError::MigrateReceive)?; + let accept_thread = self + .accept_thread + .take() + .context("Error taking accept thread.") + .map_err(MigratableError::MigrateReceive)?; + accept_thread + .join() + .map_err(|panic| { + MigratableError::MigrateReceive(anyhow!( + "Accept connections thread panicked: {panic:?}" + )) + }) + .flatten() + } +} + +impl Drop for ReceiveAdditionalConnections { + fn drop(&mut self) { + if self.accept_thread.is_some() { + warn!( + "ReceiveAdditionalConnections was not cleaned up! 
Either cleanup() was never called (programming error) or it failed before completing." + ); + } + } +} + +/// The different kinds of messages we can send to memory sending threads. +#[derive(Debug)] +enum SendMemoryThreadMessage { + /// A chunk of memory that the thread should send to the receiving side of the + /// live migration. + Memory(MemoryRangeTable), + /// A synchronization point after each iteration of sending memory. That way the + /// main thread knows when all memory is sent and acknowledged. + Gate(Arc), + /// Sending memory is done and the threads are not needed anymore. + Disconnect, +} + +/// The different kinds of messages the main thread can receive from a memory +/// sending thread. +#[derive(Debug)] +enum SendMemoryThreadNotify { + /// A sending thread arrived at the gate. The main thread does not wait at the + /// gate, otherwise we could miss error messages. + Gate, + /// A sending thread encountered an error while sending memory. + Error, +} + +/// This struct keeps track of additional threads we use to send VM memory. +pub(crate) struct SendAdditionalConnections { + guest_memory: GuestMemoryAtomic, + threads: Vec>>, + /// Sender to all workers. The receiver is shared by all workers. + message_tx: SyncSender, + /// If an error occurs in one of the memory sending threads, the main thread signals + /// this using this flag. Only the main thread checks this variable, the worker + /// threads will be stopped during cleanup. + worker_error: Arc, + /// After the main thread sent all memory chunks to the sender threads, it waits + /// until one of the workers notifies it. Either because an error occurred, or + /// because they arrived at the gate. + notify_rx: Receiver, +} + +impl SendAdditionalConnections { + /// How many requests can be queued for each connection before the main + /// thread has to wait for workers to catch up. 
This bounded [`SyncChannel`] + /// provides backpressure, so send_chunk() re-checks worker_error promptly + /// instead of queueing all memory descriptors up front and only noticing + /// failures at the next gate synchronization point. + const BUFFERED_REQUESTS_PER_THREAD: usize = 64; + + /// The size of each chunk of memory to send. + /// + /// We want to make this large, because each chunk is acknowledged and we wait + /// for the ack before sending the next chunk. The challenge is that if it is + /// _too_ large, we become more sensitive to network issues, like packet drops + /// in individual connections, because large amounts of data can pool when + /// throughput on one connection is temporarily reduced. + /// + /// We can consider making this configurable, but a better network protocol that + /// doesn't require ACKs would be more efficient. + /// + /// The best-case throughput per connection can be estimated via: + /// chunk_size / (chunk_size / throughput_per_connection + round_trip_time) + /// + /// This chunk size together with eight connections is sufficient to saturate a 100G link. + const CHUNK_SIZE: u64 = 64 /* MiB */ << 20; + + pub(crate) fn new( + destination: &str, + connections: NonZeroU32, + guest_memory: &GuestMemoryAtomic, + ) -> Result { + let mut threads = Vec::new(); + let configured_connections = connections.get(); + let buffer_size = Self::BUFFERED_REQUESTS_PER_THREAD * configured_connections as usize; + let (message_tx, message_rx) = sync_channel::(buffer_size); + let worker_error = Arc::new(AtomicBool::new(false)); + let (notify_tx, notify_rx) = channel::(); + + // If one connection is configured, we don't have to create any additional threads. + // In this case the main thread does the sending. 
+ if configured_connections == 1 { + return Ok(Self { + guest_memory: guest_memory.clone(), + threads, + message_tx, + worker_error, + notify_rx, + }); + } + + let message_rx = Arc::new(Mutex::new(message_rx)); + // If we use multiple threads to send memory, the main thread only distributes + // the memory chunks to the workers, but does not send memory anymore. Thus in + // this case we create one additional thread for each connection. + for n in 0..configured_connections { + let mut socket = send_migration_socket(destination)?; + let guest_memory = guest_memory.clone(); + let message_rx = message_rx.clone(); + let worker_error = worker_error.clone(); + let notify_tx = notify_tx.clone(); + + let thread = thread::Builder::new() + .name(format!("migrate-send-memory-{n}")) + .spawn(move || { + Self::worker_send_memory( + &mut socket, + &guest_memory, + &message_rx, + &worker_error, + ¬ify_tx, + ) + }) + .inspect_err(|_| { + // If an error occurs here, we still do some light cleanup. + for _ in 0..threads.len() { + message_tx.send(SendMemoryThreadMessage::Disconnect).ok(); + } + threads.drain(..).for_each(|thread| { + thread.join().ok(); + }); + }) + .context("Error spawning send-memory thread") + .map_err(MigratableError::MigrateSend)?; + threads.push(thread); + } + + Ok(Self { + guest_memory: guest_memory.clone(), + threads, + message_tx, + worker_error, + notify_rx, + }) + } + + fn worker_send_memory( + socket: &mut SocketStream, + guest_memory: &GuestMemoryAtomic, + message_rx: &Mutex>, + worker_error: &AtomicBool, + notify_tx: &Sender, + ) -> Result<(), MigratableError> { + info!("Spawned thread to send VM memory."); + loop { + // Every memory sending thread receives messages from the main thread through this + // channel. The lock is necessary to synchronize the multiple consumers. If the + // workers are very quick, lock contention could become a performance issue. 
+ let message = message_rx + .lock() + .map_err(|_| MigratableError::MigrateSend(anyhow!("message_rx mutex is poisoned"))) + .inspect_err(|_| { + worker_error.store(true, Ordering::Relaxed); + // We ignore errors during error handling. + notify_tx.send(SendMemoryThreadNotify::Error).ok(); + })? + .recv() + .context("Error receiving message from main thread") + .map_err(MigratableError::MigrateSend) + .inspect_err(|_| { + worker_error.store(true, Ordering::Relaxed); + notify_tx.send(SendMemoryThreadNotify::Error).ok(); + })?; + match message { + SendMemoryThreadMessage::Memory(table) => { + send_memory_ranges(guest_memory, &table, socket) + .inspect_err(|_| { + worker_error.store(true, Ordering::Relaxed); + notify_tx.send(SendMemoryThreadNotify::Error).ok(); + }) + .context("Error sending memory to receiver side") + .map_err(MigratableError::MigrateSend)?; + } + SendMemoryThreadMessage::Gate(gate) => { + notify_tx + .send(SendMemoryThreadNotify::Gate) + .context("Error sending gate notification to main thread") + .map_err(MigratableError::MigrateSend) + .inspect_err(|_| { + // Sending via `notify_tx` just failed, so we don't try to send another + // message via it. + worker_error.store(true, Ordering::Relaxed); + })?; + gate.wait(); + } + SendMemoryThreadMessage::Disconnect => { + return Ok(()); + } + } + } + } + + /// Send memory via all connections that we have. `socket` is the original socket + /// that was used to connect to the destination. Returns Ok(true) if memory was + /// sent, Ok(false) if the given table was empty. + /// + /// When this function returns, all memory has been sent and acknowledged. + pub(crate) fn send_memory( + &mut self, + table: MemoryRangeTable, + socket: &mut SocketStream, + ) -> Result { + if table.regions().is_empty() { + return Ok(false); + } + + // If we use only one connection, we send the memory directly. 
+ if self.threads.is_empty() { + send_memory_ranges(&self.guest_memory, &table, socket)?; + return Ok(true); + } + + // The chunk size is chosen to be big enough so that even very fast links need some + // milliseconds to send it. + for chunk in table.partition(Self::CHUNK_SIZE) { + self.send_chunk(chunk)?; + } + + self.wait_for_pending_data()?; + Ok(true) + } + + fn send_chunk(&mut self, chunk: MemoryRangeTable) -> Result<(), MigratableError> { + let mut chunk = SendMemoryThreadMessage::Memory(chunk); + // [`Self::message_tx`] has a limited size, so we may have to retry sending the chunk + loop { + if self.worker_error.load(Ordering::Relaxed) { + return self.cleanup(); + } + + // Use try_send() so we can keep checking worker_error while the + // workers catch up. A blocking send() could wait forever if a + // worker failed and stopped making progress. + match self.message_tx.try_send(chunk) { + Ok(()) => { + return Ok(()); + } + Err(TrySendError::Full(unsent_chunk)) => { + // The channel is full. We wait for a short time and retry. + thread::sleep(Duration::from_millis(10)); + chunk = unsent_chunk; + } + Err(TrySendError::Disconnected(_)) => { + // The workers didn't disconnect for no reason, thus we do a cleanup. + return Err(self.cleanup().err().unwrap_or(MigratableError::MigrateSend( + anyhow!("All sending threads disconnected, but none returned an error?"), + ))); + } + } + } + } + + /// Wait until all data that is in-flight has actually been sent and acknowledged. + fn wait_for_pending_data(&mut self) -> Result<(), MigratableError> { + let gate = Arc::new(Gate::new()); + for _ in 0..self.threads.len() { + self.message_tx + .send(SendMemoryThreadMessage::Gate(gate.clone())) + .context("Error sending gate message to workers") + .map_err(MigratableError::MigrateSend)?; + } + + // We cannot simply wait at the gate, otherwise we might miss it when a sender + // thread encounters an error. 
Thus we wait for the workers to notify us that + // they arrived at the gate. + let mut seen_threads = 0; + loop { + match self + .notify_rx + .recv() + .context("Error receiving message from workers") + .map_err(MigratableError::MigrateSend)? + { + SendMemoryThreadNotify::Gate => { + seen_threads += 1; + if seen_threads == self.threads.len() { + gate.open(); + return Ok(()); + } + } + SendMemoryThreadNotify::Error => { + // If an error occurred in one of the worker threads, we open + // the gate to make sure that no thread hangs. After that, we + // receive the error from Self::cleanup() and return it. + gate.open(); + return self.cleanup(); + } + } + } + } + + /// Sends disconnect messages to all workers and joins them. + pub(crate) fn cleanup(&mut self) -> Result<(), MigratableError> { + // Send disconnect messages to all workers. + for _ in 0..self.threads.len() { + // All threads may have terminated, leading to a dropped receiver. Thus we ignore + // errors here. + self.message_tx + .try_send(SendMemoryThreadMessage::Disconnect) + .ok(); + } + + let mut first_err = Ok(()); + self.threads.drain(..).for_each(|thread| { + let err = match thread.join() { + Ok(Ok(())) => None, + Ok(Err(e)) => Some(e), + Err(panic) => Some(MigratableError::MigrateSend(anyhow!( + "send-memory thread panicked: {panic:?}" + ))), + }; + + if let Some(e) = err { + warn!("Error in send-memory thread: {e}"); + + if first_err.is_ok() { + first_err = Err(e); + } + } + }); + + first_err + } +} + +impl Drop for SendAdditionalConnections { + fn drop(&mut self) { + if !self.threads.is_empty() { + warn!( + "SendAdditionalConnections was not cleaned up! Either cleanup() was never called (programming error) or it failed before completing." + ); + } + } +} + +/// Extract a UNIX socket path from a "unix:" migration URL. 
+fn socket_url_to_path(url: &str) -> Result { + url.strip_prefix("unix:") + .ok_or_else(|| anyhow!("Could not extract path from URL: {url}")) + .map(|s| s.into()) +} + +/// Connect to a migration endpoint and return the established stream. +pub(crate) fn send_migration_socket( + destination_url: &str, +) -> Result { + if let Some(address) = destination_url.strip_prefix("tcp:") { + info!("Connecting to TCP socket at {address}"); + + let socket = TcpStream::connect(address).map_err(|e| { + MigratableError::MigrateSend(anyhow!("Error connecting to TCP socket: {e}")) + })?; + + Ok(SocketStream::Tcp(socket)) + } else { + let path = socket_url_to_path(destination_url).map_err(MigratableError::MigrateSend)?; + info!("Connecting to UNIX socket at {path:?}"); + + let socket = UnixStream::connect(&path).map_err(|e| { + MigratableError::MigrateSend(anyhow!("Error connecting to UNIX socket: {e}")) + })?; + + Ok(SocketStream::Unix(socket)) + } +} + +/// Bind a migration listener for the receiver side. +pub(crate) fn receive_migration_listener( + receiver_url: &str, +) -> Result { + if let Some(address) = receiver_url.strip_prefix("tcp:") { + TcpListener::bind(address) + .map(ReceiveListener::Tcp) + .context("Error binding to TCP socket") + .map_err(MigratableError::MigrateReceive) + } else { + let path = socket_url_to_path(receiver_url).map_err(MigratableError::MigrateReceive)?; + UnixListener::bind(&path) + .map(ReceiveListener::Unix) + .context("Error binding to UNIX socket") + .map_err(MigratableError::MigrateReceive) + } +} + +/// Read a response and return Ok(()) if it was a [`Response::Ok`]. +pub(crate) fn expect_ok_response( + socket: &mut SocketStream, + error: MigratableError, +) -> Result<(), MigratableError> { + Response::read_from(socket)? + .ok_or_abandon(socket, error) + .map(|_| ()) +} + +/// Send a request and validate that the peer responds with OK. 
+pub(crate) fn send_request_expect_ok( + socket: &mut SocketStream, + request: Request, + error: MigratableError, +) -> Result<(), MigratableError> { + request.write_to(socket)?; + expect_ok_response(socket, error) +} + +/// Serialize and send the VM configuration payload. +pub(crate) fn send_config( + socket: &mut SocketStream, + config: &VmMigrationConfig, +) -> Result<(), MigratableError> { + let config_data = serde_json::to_vec(config) + .context("Error serializing VM migration config") + .map_err(MigratableError::MigrateSend)?; + Request::config(config_data.len() as u64).write_to(socket)?; + socket + .write_all(&config_data) + .map_err(MigratableError::MigrateSocket)?; + expect_ok_response( + socket, + MigratableError::MigrateSend(anyhow!("Error during config migration")), + ) +} + +/// Serialize and send the VM snapshot payload. +pub(crate) fn send_state( + socket: &mut SocketStream, + snapshot: &Snapshot, +) -> Result<(), MigratableError> { + let snapshot_data = serde_json::to_vec(snapshot) + .context("Error serializing VM snapshot") + .map_err(MigratableError::MigrateSend)?; + Request::state(snapshot_data.len() as u64).write_to(socket)?; + socket + .write_all(&snapshot_data) + .map_err(MigratableError::MigrateSocket)?; + expect_ok_response( + socket, + MigratableError::MigrateSend(anyhow!("Error during state migration")), + ) +} + +/// Transmits the given [`MemoryRangeTable`] and the corresponding guest memory +/// content over the wire if there is at least one range. +/// +/// Sends a memory migration request, the range table, and the corresponding +/// guest memory range over the given socket. Waits for acknowledgment +/// from the destination. 
+pub(crate) fn send_memory_ranges( + guest_memory: &GuestMemoryAtomic, + ranges: &MemoryRangeTable, + socket: &mut SocketStream, +) -> Result<(), MigratableError> { + if ranges.regions().is_empty() { + return Ok(()); + } + + // Send the memory table + Request::memory(ranges.length()).write_to(socket)?; + ranges.write_to(socket)?; + + // And then the memory itself + let mem = guest_memory.memory(); + for range in ranges.regions() { + let mut offset: u64 = 0; + // Here we are manually handling the retry in case we can't read the + // whole region at once because we can't use the implementation + // from vm-memory::GuestMemory of write_all_to() as it is not + // following the correct behavior. For more info about this issue + // see: https://github.com/rust-vmm/vm-memory/issues/174 + loop { + let bytes_written = mem + .write_volatile_to( + GuestAddress(range.gpa + offset), + socket, + (range.length - offset) as usize, + ) + .map_err(|e| { + MigratableError::MigrateSend(anyhow!( + "Error transferring memory to socket: {e}" + )) + })?; + offset += bytes_written as u64; + + if offset == range.length { + break; + } + } + } + expect_ok_response( + socket, + MigratableError::MigrateSend(anyhow!("Error during dirty memory migration")), + ) +} + +/// Receive memory contents for the given range table into guest memory. 
+pub(crate) fn receive_memory_ranges( + guest_memory: &GuestMemoryAtomic, + req: &Request, + socket: &mut SocketStream, +) -> Result<(), MigratableError> { + debug_assert_eq!(req.command(), Command::Memory); + // Read the memory table + let ranges = MemoryRangeTable::read_from(socket, req.length())?; + + // And then the memory itself + let mem = guest_memory.memory(); + + for range in ranges.regions() { + let mut offset: u64 = 0; + // Here we are manually handling the retry in case we can't read the + // whole region at once because we can't use the implementation + // from vm-memory::GuestMemory of read_exact_from() as it is not + // following the correct behavior. For more info about this issue + // see: https://github.com/rust-vmm/vm-memory/issues/174 + loop { + let bytes_read = mem + .read_volatile_from( + GuestAddress(range.gpa + offset), + socket, + (range.length - offset) as usize, + ) + .map_err(|e| { + MigratableError::MigrateReceive(anyhow!( + "Error receiving memory from socket: {e}" + )) + })?; + offset += bytes_read as u64; + + if offset == range.length { + break; + } + } + } + + Ok(()) +} diff --git a/vmm/src/pci_segment.rs b/vmm/src/pci_segment.rs index b334ddb5d6..6a4f10aa70 100644 --- a/vmm/src/pci_segment.rs +++ b/vmm/src/pci_segment.rs @@ -105,7 +105,7 @@ impl PciSegment { }; info!( - "Adding PCI segment: id={}, PCI MMIO config address: 0x{:x}, mem32 area [0x{:x}-0x{:x}, mem64 area [0x{:x}-0x{:x}", + "Adding PCI segment: id={}, PCI MMIO config address: 0x{:x}, mem32 area [0x{:x}-0x{:x}], mem64 area [0x{:x}-0x{:x}]", segment.id, segment.mmio_config_address, segment.start_of_mem32_area, @@ -164,15 +164,33 @@ impl PciSegment { ) } - pub(crate) fn next_device_bdf(&self) -> DeviceManagerResult { + /// Reserves a device ID on this PCI segment, marking it as in-use + /// so that automatic allocation will not use it. 
+ pub(crate) fn reserve_device_id(&self, device_id: u8) -> DeviceManagerResult<()> { + self.pci_bus + .lock() + .unwrap() + .reserve_device_id(device_id) + .map_err(DeviceManagerError::ReservePciDeviceId)?; + Ok(()) + } + + /// Allocates a device's ID on this PCI segment. + /// + /// - `device_id`: Device ID to request for allocation + /// + /// ## Errors + /// * [`DeviceManagerError::AllocatePciDeviceId`] if device ID + /// allocation on the bus fails. + pub(crate) fn allocate_device_id(&self, device_id: Option) -> DeviceManagerResult { Ok(PciBdf::new( self.id, 0, self.pci_bus .lock() .unwrap() - .next_device_id() - .map_err(DeviceManagerError::NextPciDeviceId)? as u8, + .allocate_device_id(device_id) + .map_err(DeviceManagerError::AllocatePciDeviceId)?, 0, )) } @@ -202,6 +220,65 @@ impl PciSegment { Ok(()) } + + #[cfg(test)] + /// Creates a PciSegment without the need for an [`AddressManager`] + /// for testing purpose. + /// + /// An [`AddressManager`] would otherwise be required to create + /// [`PciBus`] instances. Instead, we use any struct that implements + /// [`DeviceRelocation`] to instantiate a [`PciBus`]. 
+ pub(crate) fn new_without_address_manager( + id: u16, + numa_node: u32, + mem32_allocator: Arc>, + mem64_allocator: Arc>, + pci_irq_slots: &[u8; 32], + device_reloc: &Arc, + ) -> DeviceManagerResult { + let pci_root = PciRoot::new(None); + let pci_bus = Arc::new(Mutex::new(PciBus::new(pci_root, device_reloc.clone()))); + + let pci_config_mmio = Arc::new(Mutex::new(PciConfigMmio::new(Arc::clone(&pci_bus)))); + let mmio_config_address = + layout::PCI_MMCONFIG_START.0 + layout::PCI_MMIO_CONFIG_SIZE_PER_SEGMENT * id as u64; + + let start_of_mem32_area = mem32_allocator.lock().unwrap().base().0; + let end_of_mem32_area = mem32_allocator.lock().unwrap().end().0; + + let start_of_mem64_area = mem64_allocator.lock().unwrap().base().0; + let end_of_mem64_area = mem64_allocator.lock().unwrap().end().0; + + let segment = PciSegment { + id, + pci_bus, + pci_config_mmio, + mmio_config_address, + proximity_domain: numa_node, + pci_devices_up: 0, + pci_devices_down: 0, + #[cfg(target_arch = "x86_64")] + pci_config_io: None, + mem32_allocator, + mem64_allocator, + start_of_mem32_area, + end_of_mem32_area, + start_of_mem64_area, + end_of_mem64_area, + pci_irq_slots: *pci_irq_slots, + }; + + info!( + "Adding PCI segment: id={}, PCI MMIO config address: 0x{:x}, mem32 area [0x{:x}-0x{:x}], mem64 area [0x{:x}-0x{:x}]", + segment.id, + segment.mmio_config_address, + segment.start_of_mem32_area, + segment.end_of_mem32_area, + segment.start_of_mem64_area, + segment.end_of_mem64_area + ); + Ok(segment) + } } struct PciDevSlot { @@ -360,7 +437,7 @@ impl Aml for PciSegment { pci_dsdt_inner_data.push(&adr); let seg = aml::Name::new("_SEG".into(), &self.id); pci_dsdt_inner_data.push(&seg); - let uid = aml::Name::new("_UID".into(), &aml::ZERO); + let uid = aml::Name::new("_UID".into(), &self.id); pci_dsdt_inner_data.push(&uid); let cca = aml::Name::new("_CCA".into(), &aml::ONE); pci_dsdt_inner_data.push(&cca); @@ -474,3 +551,101 @@ impl Aml for PciSegment { .to_aml_bytes(sink); } } + 
+#[cfg(test)] +mod unit_tests { + use std::result::Result; + + use vm_memory::GuestAddress; + + use super::*; + + #[derive(Debug)] + struct MockDeviceRelocation; + impl DeviceRelocation for MockDeviceRelocation { + fn move_bar( + &self, + _old_base: u64, + _new_base: u64, + _len: u64, + _pci_dev: &mut dyn pci::PciDevice, + _region_type: pci::PciBarRegionType, + ) -> Result<(), std::io::Error> { + Ok(()) + } + } + + fn setup() -> PciSegment { + let guest_addr = 0_u64; + let guest_size = 0x1000_usize; + let allocator_1 = Arc::new(Mutex::new( + AddressAllocator::new(GuestAddress(guest_addr), guest_size as u64).unwrap(), + )); + let allocator_2 = Arc::new(Mutex::new( + AddressAllocator::new(GuestAddress(guest_addr), guest_size as u64).unwrap(), + )); + let mock_device_reloc: Arc = Arc::new(MockDeviceRelocation {}); + let arr = [0_u8; 32]; + + PciSegment::new_without_address_manager( + 0, + 0, + allocator_1, + allocator_2, + &arr, + &mock_device_reloc, + ) + .unwrap() + } + + #[test] + // Test the default device ID for a segment with an empty bus (except for the root device). + fn allocate_device_id_default() { + // The first address is occupied by the root + let segment = setup(); + let bdf = segment.allocate_device_id(None).unwrap(); + assert_eq!(bdf.segment(), segment.id); + assert_eq!(bdf.bus(), 0); + assert_eq!(bdf.device(), 1); + assert_eq!(bdf.function(), 0); + } + + #[test] + // Test to acquire a specific device ID + fn allocate_device_id_fixed_device_id() { + // The first address is occupied by the root + let expect_device_id = 0x10_u8; + let segment = setup(); + let bdf = segment.allocate_device_id(Some(expect_device_id)).unwrap(); + assert_eq!(bdf.segment(), segment.id); + assert_eq!(bdf.bus(), 0); + assert_eq!(bdf.device(), expect_device_id); + assert_eq!(bdf.function(), 0); + } + + #[test] + // Test that reserving an already taken device ID fails and that + // allocating an out-of-range device ID fails. 
+ fn allocate_device_id_invalid_device_id() { + // The first address is occupied by the root + let already_taken_device_id = 0x0_u8; + let overflow_device_id = 0xff_u8; + let segment = setup(); + let bdf_res = segment.reserve_device_id(already_taken_device_id); + assert!(matches!( + bdf_res, + Err(DeviceManagerError::ReservePciDeviceId(e)) if matches!( + e, + pci::PciRootError::AlreadyInUsePciDeviceSlot(0x0) + ) + )); + let bdf_res = segment.allocate_device_id(Some(overflow_device_id)); + assert!(matches!( + bdf_res, + Err(DeviceManagerError::AllocatePciDeviceId(e)) if matches!( + e, + pci::PciRootError::InvalidPciDeviceSlot(0xff) + ) + )); + } +} diff --git a/vmm/src/seccomp_filters.rs b/vmm/src/seccomp_filters.rs index bb40b99f60..877ea907d2 100644 --- a/vmm/src/seccomp_filters.rs +++ b/vmm/src/seccomp_filters.rs @@ -25,6 +25,8 @@ use vhost::vhost_kern::vhost_binding::{ VHOST_VDPA_SET_STATUS, VHOST_VDPA_SET_VRING_ENABLE, VHOST_VDPA_SUSPEND, }; +use crate::userfaultfd::{UFFDIO_API, UFFDIO_COPY, UFFDIO_REGISTER, UFFDIO_WAKE}; + #[derive(Copy, Clone)] pub enum Thread { HttpApi, @@ -88,6 +90,9 @@ mod kvm { pub const KVM_HAS_DEVICE_ATTR: u64 = 0x4018_aee3; pub const KVM_SET_ONE_REG: u64 = 0x4010_aeac; pub const KVM_SET_USER_MEMORY_REGION: u64 = 0x4020_ae46; + pub const KVM_SET_USER_MEMORY_REGION2: u64 = 0x40a0_ae49; + pub const KVM_SET_MEMORY_ATTRIBUTES: u64 = 0x4020_aed2; + pub const KVM_CREATE_GUEST_MEMFD: u64 = 0xc040_aed4; pub const KVM_IRQFD: u64 = 0x4020_ae76; pub const KVM_IOEVENTFD: u64 = 0x4040_ae79; pub const KVM_SET_VCPU_EVENTS: u64 = 0x4040_aea0; @@ -106,11 +111,27 @@ mod kvm { pub const KVM_NMI: u64 = 0xae9a; pub const KVM_GET_NESTED_STATE: u64 = 3229658814; pub const KVM_SET_NESTED_STATE: u64 = 1082175167; + pub const KVM_SEV_SNP_LAUNCH_START: u64 = 0x4018_aeb4; + pub const KVM_SEV_SNP_LAUNCH_UPDATE: u64 = 0x8018_aeb5; + pub const KVM_SEV_SNP_LAUNCH_FINISH: u64 = 0x4008_aeb7; } -// Block device ioctls for sparse support probing (not exported by libc) 
+mod iommufd { + // See include/uapi/linux/iommufd.h in the kernel code. + pub const IOMMU_IOAS_ALLOC: u64 = 0x3b81; + pub const IOMMU_IOAS_MAP: u64 = 0x3b85; + pub const IOMMU_IOAS_UNMAP: u64 = 0x3b86; + + // See include/uapi/linux/vfio.h in the kernel code. + pub const VFIO_DEVICE_BIND_IOMMUFD: u64 = 0x3b76; + pub const VFIO_DEVICE_ATTACH_IOMMUFD_PT: u64 = 0x3b77; + pub const VFIO_DEVICE_DETACH_IOMMUFD_PT: u64 = 0x3b78; +} + +// Block device ioctls (not exported by libc) const BLKDISCARD: u64 = 0x1277; // _IO(0x12, 119) const BLKZEROOUT: u64 = 0x127f; // _IO(0x12, 127) +const BLKGETSIZE64: u64 = 0x80081272; // _IOR(0x12, 114, size_t) // MSHV IOCTL code. This is unstable until the kernel code has been declared stable. #[cfg(feature = "mshv")] @@ -237,10 +258,43 @@ fn create_vmm_ioctl_seccomp_rule_common_kvm() -> Result, Backen and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_ONE_REG)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_REGS)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_USER_MEMORY_REGION,)?], + and![Cond::new( + 1, + ArgLen::Dword, + Eq, + KVM_SET_USER_MEMORY_REGION2, + )?], + and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_MEMORY_ATTRIBUTES,)?], + and![Cond::new(1, ArgLen::Dword, Eq, KVM_CREATE_GUEST_MEMFD,)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_VCPU_EVENTS,)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_NMI)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_NESTED_STATE)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_NESTED_STATE)?], + and![Cond::new(1, ArgLen::Dword, Eq, KVM_SEV_SNP_LAUNCH_START)?], + and![Cond::new(1, ArgLen::Dword, Eq, KVM_SEV_SNP_LAUNCH_UPDATE)?], + and![Cond::new(1, ArgLen::Dword, Eq, KVM_SEV_SNP_LAUNCH_FINISH)?], + ]) +} + +fn create_vmm_ioctl_seccomp_rule_iommufd() -> Result, BackendError> { + use iommufd::*; + Ok(or![ + and![Cond::new(1, ArgLen::Dword, Eq, IOMMU_IOAS_ALLOC)?], + and![Cond::new(1, ArgLen::Dword, Eq, IOMMU_IOAS_MAP)?], + and![Cond::new(1, ArgLen::Dword, Eq, IOMMU_IOAS_UNMAP)?], + and![Cond::new(1, 
ArgLen::Dword, Eq, VFIO_DEVICE_BIND_IOMMUFD)?], + and![Cond::new( + 1, + ArgLen::Dword, + Eq, + VFIO_DEVICE_ATTACH_IOMMUFD_PT + )?], + and![Cond::new( + 1, + ArgLen::Dword, + Eq, + VFIO_DEVICE_DETACH_IOMMUFD_PT + )?], ]) } @@ -263,6 +317,7 @@ fn create_vmm_ioctl_seccomp_rule_common( and![Cond::new(1, ArgLen::Dword, Eq, BLKPBSZGET as _)?], and![Cond::new(1, ArgLen::Dword, Eq, BLKIOMIN as _)?], and![Cond::new(1, ArgLen::Dword, Eq, BLKIOOPT as _)?], + and![Cond::new(1, ArgLen::Dword, Eq, BLKGETSIZE64 as _)?], and![Cond::new(1, ArgLen::Dword, Eq, BLKDISCARD as _)?], and![Cond::new(1, ArgLen::Dword, Eq, BLKZEROOUT as _)?], and![Cond::new(1, ArgLen::Dword, Eq, FIOCLEX as _)?], @@ -362,12 +417,18 @@ fn create_vmm_ioctl_seccomp_rule_common( VHOST_VDPA_GET_CONFIG_SIZE() )?], and![Cond::new(1, ArgLen::Dword, Eq, VHOST_VDPA_SUSPEND())?], + and![Cond::new(1, ArgLen::Dword, Eq, UFFDIO_API)?], + and![Cond::new(1, ArgLen::Dword, Eq, UFFDIO_COPY)?], + and![Cond::new(1, ArgLen::Dword, Eq, UFFDIO_REGISTER)?], + and![Cond::new(1, ArgLen::Dword, Eq, UFFDIO_WAKE)?], ]; let hypervisor_rules = create_vmm_ioctl_seccomp_rule_hypervisor(hypervisor_type)?; - common_rules.extend(hypervisor_rules); + let iommufd_rules = create_vmm_ioctl_seccomp_rule_iommufd()?; + common_rules.extend(iommufd_rules); + Ok(common_rules) } @@ -495,6 +556,7 @@ fn signal_handler_thread_rules() -> Result)>, Backend (libc::SYS_close, vec![]), (libc::SYS_exit, vec![]), (libc::SYS_exit_group, vec![]), + (libc::SYS_fcntl, vec![]), (libc::SYS_futex, vec![]), (libc::SYS_ioctl, create_signal_handler_ioctl_seccomp_rule()?), (libc::SYS_landlock_create_ruleset, vec![]), @@ -511,8 +573,6 @@ fn signal_handler_thread_rules() -> Result)>, Backend (libc::SYS_sendto, vec![]), (libc::SYS_sigaltstack, vec![]), (libc::SYS_write, vec![]), - #[cfg(debug_assertions)] - (libc::SYS_fcntl, vec![]), ]) } @@ -528,7 +588,10 @@ fn pty_foreground_thread_rules() -> Result)>, Backend Ok(vec![ (libc::SYS_close, vec![]), (libc::SYS_exit_group, 
vec![]), + (libc::SYS_fcntl, vec![]), + (libc::SYS_getcwd, vec![]), (libc::SYS_getpgid, vec![]), + (libc::SYS_gettid, vec![]), #[cfg(target_arch = "x86_64")] (libc::SYS_getpgrp, vec![]), (libc::SYS_ioctl, create_pty_foreground_ioctl_seccomp_rule()?), @@ -543,12 +606,8 @@ fn pty_foreground_thread_rules() -> Result)>, Backend (libc::SYS_rt_sigreturn, vec![]), (libc::SYS_sched_yield, vec![]), (libc::SYS_setsid, vec![]), - (libc::SYS_gettid, vec![]), (libc::SYS_sigaltstack, vec![]), (libc::SYS_write, vec![]), - #[cfg(debug_assertions)] - (libc::SYS_fcntl, vec![]), - (libc::SYS_getcwd, vec![]), ]) } @@ -592,6 +651,7 @@ fn vmm_thread_rules( #[cfg(target_arch = "aarch64")] (libc::SYS_newfstatat, vec![]), (libc::SYS_futex, vec![]), + (libc::SYS_getcwd, vec![]), (libc::SYS_getdents64, vec![]), (libc::SYS_getpgid, vec![]), #[cfg(target_arch = "x86_64")] @@ -691,10 +751,10 @@ fn vmm_thread_rules( (libc::SYS_unlink, vec![]), #[cfg(target_arch = "aarch64")] (libc::SYS_unlinkat, vec![]), + (libc::SYS_userfaultfd, vec![]), (libc::SYS_wait4, vec![]), (libc::SYS_write, vec![]), (libc::SYS_writev, vec![]), - (libc::SYS_getcwd, vec![]), ]) } @@ -707,6 +767,14 @@ fn create_vcpu_ioctl_seccomp_rule_kvm() -> Result, BackendError and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_DEVICE_ATTR,)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_GSI_ROUTING,)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_USER_MEMORY_REGION,)?], + and![Cond::new( + 1, + ArgLen::Dword, + Eq, + KVM_SET_USER_MEMORY_REGION2, + )?], + and![Cond::new(1, ArgLen::Dword, Eq, KVM_CREATE_GUEST_MEMFD,)?], + and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_MEMORY_ATTRIBUTES,)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_RUN,)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_NMI)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_NESTED_STATE)?], @@ -757,6 +825,20 @@ fn create_vcpu_ioctl_seccomp_rule_hypervisor( } } +fn create_vcpu_ioctl_seccomp_rule_iommufd() -> Result, BackendError> { + use iommufd::*; + Ok(or![ + and![Cond::new(1, 
ArgLen::Dword, Eq, IOMMU_IOAS_MAP)?], + and![Cond::new(1, ArgLen::Dword, Eq, IOMMU_IOAS_UNMAP)?], + and![Cond::new( + 1, + ArgLen::Dword, + Eq, + VFIO_DEVICE_DETACH_IOMMUFD_PT + )?], + ]) +} + fn create_vcpu_ioctl_seccomp_rule( hypervisor_type: HypervisorType, ) -> Result, BackendError> { @@ -777,9 +859,11 @@ fn create_vcpu_ioctl_seccomp_rule( ]; let hypervisor_rules = create_vcpu_ioctl_seccomp_rule_hypervisor(hypervisor_type)?; - rules.extend(hypervisor_rules); + let iommufd_rules = create_vcpu_ioctl_seccomp_rule_iommufd()?; + rules.extend(iommufd_rules); + Ok(rules) } @@ -794,11 +878,13 @@ fn vcpu_thread_rules( (libc::SYS_dup, vec![]), (libc::SYS_exit, vec![]), (libc::SYS_epoll_ctl, vec![]), + (libc::SYS_fcntl, vec![]), (libc::SYS_fstat, vec![]), - (libc::SYS_gettid, vec![]), (libc::SYS_futex, vec![]), + (libc::SYS_getcwd, vec![]), (libc::SYS_getrandom, vec![]), (libc::SYS_getpid, vec![]), + (libc::SYS_gettid, vec![]), ( libc::SYS_ioctl, create_vcpu_ioctl_seccomp_rule(hypervisor_type)?, @@ -817,6 +903,10 @@ fn vcpu_thread_rules( (libc::SYS_pread64, vec![]), (libc::SYS_pwrite64, vec![]), (libc::SYS_read, vec![]), + #[cfg(target_arch = "x86_64")] + (libc::SYS_readlink, vec![]), + #[cfg(target_arch = "aarch64")] + (libc::SYS_readlinkat, vec![]), (libc::SYS_recvfrom, vec![]), (libc::SYS_recvmsg, vec![]), (libc::SYS_rt_sigaction, vec![]), @@ -827,6 +917,7 @@ fn vcpu_thread_rules( (libc::SYS_sendto, vec![]), (libc::SYS_shutdown, vec![]), (libc::SYS_sigaltstack, vec![]), + (libc::SYS_statx, vec![]), (libc::SYS_tgkill, vec![]), (libc::SYS_tkill, vec![]), #[cfg(target_arch = "x86_64")] @@ -835,8 +926,6 @@ fn vcpu_thread_rules( (libc::SYS_unlinkat, vec![]), (libc::SYS_write, vec![]), (libc::SYS_writev, vec![]), - (libc::SYS_fcntl, vec![]), - (libc::SYS_getcwd, vec![]), ]) } @@ -856,6 +945,7 @@ fn http_api_thread_rules() -> Result)>, BackendError> (libc::SYS_epoll_wait, vec![]), (libc::SYS_exit, vec![]), (libc::SYS_fcntl, vec![]), + (libc::SYS_getcwd, vec![]), 
(libc::SYS_gettid, vec![]), (libc::SYS_futex, vec![]), (libc::SYS_getrandom, vec![]), @@ -869,12 +959,11 @@ fn http_api_thread_rules() -> Result)>, BackendError> (libc::SYS_prctl, vec![]), (libc::SYS_recvfrom, vec![]), (libc::SYS_recvmsg, vec![]), + (libc::SYS_rt_sigprocmask, vec![]), (libc::SYS_sched_yield, vec![]), (libc::SYS_sendto, vec![]), (libc::SYS_sigaltstack, vec![]), (libc::SYS_write, vec![]), - (libc::SYS_rt_sigprocmask, vec![]), - (libc::SYS_getcwd, vec![]), ]) } @@ -892,7 +981,9 @@ fn dbus_api_thread_rules() -> Result)>, BackendError> (libc::SYS_epoll_ctl, vec![]), (libc::SYS_exit, vec![]), (libc::SYS_gettid, vec![]), + (libc::SYS_fcntl, vec![]), (libc::SYS_futex, vec![]), + (libc::SYS_getcwd, vec![]), (libc::SYS_getrandom, vec![]), (libc::SYS_madvise, vec![]), (libc::SYS_mmap, vec![]), @@ -908,7 +999,6 @@ fn dbus_api_thread_rules() -> Result)>, BackendError> (libc::SYS_set_robust_list, vec![]), (libc::SYS_sigaltstack, vec![]), (libc::SYS_write, vec![]), - (libc::SYS_getcwd, vec![]), ]) } @@ -916,6 +1006,7 @@ fn event_monitor_thread_rules() -> Result)>, BackendE Ok(vec![ (libc::SYS_brk, vec![]), (libc::SYS_close, vec![]), + (libc::SYS_getcwd, vec![]), (libc::SYS_gettid, vec![]), (libc::SYS_futex, vec![]), (libc::SYS_landlock_create_ruleset, vec![]), @@ -925,7 +1016,6 @@ fn event_monitor_thread_rules() -> Result)>, BackendE (libc::SYS_prctl, vec![]), (libc::SYS_sched_yield, vec![]), (libc::SYS_write, vec![]), - (libc::SYS_getcwd, vec![]), ]) } diff --git a/vmm/src/serial_manager.rs b/vmm/src/serial_manager.rs index 5f8de1874a..27f359bec6 100644 --- a/vmm/src/serial_manager.rs +++ b/vmm/src/serial_manager.rs @@ -6,6 +6,7 @@ use std::fs::File; use std::io::Read; use std::net::Shutdown; +use std::os::fd::OwnedFd; use std::os::unix::io::{AsRawFd, FromRawFd}; use std::os::unix::net::UnixStream; use std::panic::AssertUnwindSafe; @@ -24,7 +25,7 @@ use serial_buffer::SerialBuffer; use thiserror::Error; use vmm_sys_util::eventfd::EventFd; -use 
crate::console_devices::ConsoleOutput; +use crate::console_devices::ConsoleTransport; #[derive(Debug, Error)] pub enum Error { @@ -113,8 +114,8 @@ pub struct SerialManager { serial: Arc>, #[cfg(target_arch = "aarch64")] serial: Arc>, - epoll_file: File, - in_file: ConsoleOutput, + epoll_fd: OwnedFd, + transport: ConsoleTransport, kill_evt: EventFd, handle: Option>, pty_write_out: Option>, @@ -125,62 +126,63 @@ impl SerialManager { pub fn new( #[cfg(any(target_arch = "x86_64", target_arch = "riscv64"))] serial: Arc>, #[cfg(target_arch = "aarch64")] serial: Arc>, - mut output: ConsoleOutput, + mut transport: ConsoleTransport, socket: Option, ) -> Result> { + let epoll_fd = epoll::create(true).map_err(Error::Epoll)?; + let kill_evt = EventFd::new(EFD_NONBLOCK).map_err(Error::EventFd)?; + + epoll::ctl( + epoll_fd, + epoll::ControlOptions::EPOLL_CTL_ADD, + kill_evt.as_raw_fd(), + epoll::Event::new(epoll::Events::EPOLLIN, EpollDispatch::Kill as u64), + ) + .map_err(Error::Epoll)?; + let mut socket_path: Option = None; - let in_fd = match output { - ConsoleOutput::Pty(ref fd) => fd.as_raw_fd(), - ConsoleOutput::Tty(_) => { + let in_fd = match transport { + ConsoleTransport::Pty(ref fd) => fd.as_raw_fd(), + ConsoleTransport::Tty(_) // If running on an interactive TTY then accept input // SAFETY: trivially safe - if unsafe { libc::isatty(libc::STDIN_FILENO) == 1 } { - // SAFETY: STDIN_FILENO is a valid fd - let fd = unsafe { libc::dup(libc::STDIN_FILENO) }; - if fd == -1 { - return Err(Error::DupFd(std::io::Error::last_os_error())); - } - // SAFETY: fd is valid and owned by us - let stdin_clone = unsafe { File::from_raw_fd(fd) }; - // SAFETY: FFI calls with correct arguments - let ret = unsafe { - let mut flags = libc::fcntl(stdin_clone.as_raw_fd(), libc::F_GETFL); - flags |= libc::O_NONBLOCK; - libc::fcntl(stdin_clone.as_raw_fd(), libc::F_SETFL, flags) - }; - - if ret < 0 { - return Err(Error::SetNonBlocking(std::io::Error::last_os_error())); - } - - output = 
ConsoleOutput::Tty(Arc::new(stdin_clone)); - fd - } else { - return Ok(None); + if unsafe { libc::isatty(libc::STDIN_FILENO) == 1 } => + { + // SAFETY: STDIN_FILENO is a valid fd + let fd = unsafe { libc::dup(libc::STDIN_FILENO) }; + if fd == -1 { + return Err(Error::DupFd(std::io::Error::last_os_error())); } + // SAFETY: fd is valid and owned by us + let stdin_clone = unsafe { File::from_raw_fd(fd) }; + // SAFETY: FFI calls with correct arguments + let ret = unsafe { + let mut flags = libc::fcntl(stdin_clone.as_raw_fd(), libc::F_GETFL); + flags |= libc::O_NONBLOCK; + libc::fcntl(stdin_clone.as_raw_fd(), libc::F_SETFL, flags) + }; + + if ret < 0 { + return Err(Error::SetNonBlocking(std::io::Error::last_os_error())); + } + + transport = ConsoleTransport::Tty(Arc::new(stdin_clone)); + fd } - ConsoleOutput::Socket(ref fd) => { + ConsoleTransport::Tty(_) => { + return Ok(None); + } + ConsoleTransport::Socket(ref listener) => { if let Some(path_in_socket) = socket { socket_path = Some(path_in_socket.clone()); } - fd.as_raw_fd() + listener.as_raw_fd() } _ => return Ok(None), }; - let epoll_fd = epoll::create(true).map_err(Error::Epoll)?; - let kill_evt = EventFd::new(EFD_NONBLOCK).map_err(Error::EventFd)?; - - epoll::ctl( - epoll_fd, - epoll::ControlOptions::EPOLL_CTL_ADD, - kill_evt.as_raw_fd(), - epoll::Event::new(epoll::Events::EPOLLIN, EpollDispatch::Kill as u64), - ) - .map_err(Error::Epoll)?; - - let epoll_fd_data = if let ConsoleOutput::Socket(_) = output { + let in_event = if let ConsoleTransport::Socket(_) = transport { EpollDispatch::Socket } else { EpollDispatch::File @@ -190,12 +192,12 @@ impl SerialManager { epoll_fd, epoll::ControlOptions::EPOLL_CTL_ADD, in_fd, - epoll::Event::new(epoll::Events::EPOLLIN, epoll_fd_data as u64), + epoll::Event::new(epoll::Events::EPOLLIN, in_event as u64), ) .map_err(Error::Epoll)?; let mut pty_write_out = None; - if let ConsoleOutput::Pty(ref file) = output { + if let ConsoleTransport::Pty(ref file) = transport { let 
write_out = Arc::new(AtomicBool::new(false)); pty_write_out = Some(write_out.clone()); let writer = file.try_clone().map_err(Error::FileClone)?; @@ -207,14 +209,14 @@ impl SerialManager { .set_out(Some(Box::new(buffer))); } - // Use 'File' to enforce closing on 'epoll_fd' + // Use 'OwnedFd' to manage lifetime // SAFETY: epoll_fd is valid - let epoll_file = unsafe { File::from_raw_fd(epoll_fd) }; + let epoll_fd = unsafe { OwnedFd::from_raw_fd(epoll_fd) }; Ok(Some(SerialManager { serial, - epoll_file, - in_file: output, + epoll_fd, + transport, kill_evt, handle: None, pty_write_out, @@ -255,8 +257,8 @@ impl SerialManager { return Ok(()); } - let epoll_fd = self.epoll_file.as_raw_fd(); - let in_file = self.in_file.clone(); + let epoll_fd = self.epoll_fd.try_clone().map_err(Error::Epoll)?; + let transport = self.transport.clone(); let serial = self.serial.clone(); let pty_write_out = self.pty_write_out.clone(); let mut reader: Option = None; @@ -277,24 +279,25 @@ impl SerialManager { [epoll::Event::new(epoll::Events::empty(), 0); EPOLL_EVENTS_LEN]; loop { - let num_events = match epoll::wait(epoll_fd, timeout, &mut events[..]) { - Ok(res) => res, - Err(e) => { - if e.kind() == io::ErrorKind::Interrupted { - // It's well defined from the epoll_wait() syscall - // documentation that the epoll loop can be interrupted - // before any of the requested events occurred or the - // timeout expired. In both those cases, epoll_wait() - // returns an error of type EINTR, but this should not - // be considered as a regular error. Instead it is more - // appropriate to retry, by calling into epoll_wait(). - continue; + let num_events = + match epoll::wait(epoll_fd.as_raw_fd(), timeout, &mut events[..]) { + Ok(res) => res, + Err(e) => { + if e.kind() == io::ErrorKind::Interrupted { + // It's well defined from the epoll_wait() syscall + // documentation that the epoll loop can be interrupted + // before any of the requested events occurred or the + // timeout expired. 
In both those cases, epoll_wait() + // returns an error of type EINTR, but this should not + // be considered as a regular error. Instead it is more + // appropriate to retry, by calling into epoll_wait(). + continue; + } + return Err(Error::Epoll(e)); } - return Err(Error::Epoll(e)); - } - }; + }; - if matches!(in_file, ConsoleOutput::Pty(_)) && num_events == 0 { + if matches!(transport, ConsoleTransport::Pty(_)) && num_events == 0 { // This very specific case happens when the serial is connected // to a PTY. We know EPOLLHUP is always present when there's nothing // connected at the other end of the PTY. That's why getting no event @@ -319,7 +322,7 @@ impl SerialManager { .map_err(Error::AcceptConnection)?; } - let ConsoleOutput::Socket(ref listener) = in_file else { + let ConsoleTransport::Socket(ref listener) = transport else { unreachable!(); }; @@ -331,7 +334,7 @@ impl SerialManager { unix_stream.try_clone().map_err(Error::CloneUnixStream)?; epoll::ctl( - epoll_fd, + epoll_fd.as_raw_fd(), epoll::ControlOptions::EPOLL_CTL_ADD, unix_stream.as_raw_fd(), epoll::Event::new( @@ -347,8 +350,8 @@ impl SerialManager { EpollDispatch::File => { if event.events & libc::EPOLLIN as u32 != 0 { let mut input = [0u8; 64]; - let count = match &in_file { - ConsoleOutput::Socket(_) => { + let count = match &transport { + ConsoleTransport::Socket(_) => { if let Some(mut serial_reader) = reader.as_ref() { let count = serial_reader .read(&mut input) @@ -370,11 +373,10 @@ impl SerialManager { 0 } } - ConsoleOutput::Pty(file) | ConsoleOutput::Tty(file) => { - (&**file) - .read(&mut input) - .map_err(Error::ReadInput)? 
- } + ConsoleTransport::Pty(file) + | ConsoleTransport::Tty(file) => (&**file) + .read(&mut input) + .map_err(Error::ReadInput)?, _ => unreachable!(), }; @@ -431,7 +433,7 @@ impl Drop for SerialManager { if let Some(handle) = self.handle.take() { handle.join().ok(); } - if let ConsoleOutput::Socket(_) = self.in_file + if let ConsoleTransport::Socket(_) = self.transport && let Some(socket_path) = self.socket_path.as_ref() { std::fs::remove_file(socket_path.as_os_str()) diff --git a/vmm/src/sync_utils.rs b/vmm/src/sync_utils.rs new file mode 100644 index 0000000000..14517eac24 --- /dev/null +++ b/vmm/src/sync_utils.rs @@ -0,0 +1,127 @@ +// Copyright © 2026 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::sync::{Condvar, Mutex}; + +/// A single use abortable gate. The main thread will create the gate and pass +/// it to the memory sending threads. The main thread can always open the gate. +/// That way the main thread can also open the gate before all workers arrive +/// there, e.g. if one worker signals that an error occurred and thus cannot +/// continue. +#[derive(Debug)] +pub struct Gate { + /// True if the gate is open, false otherwise. + open: Mutex, + /// Used to notify waiting threads. + cv: Condvar, +} + +impl Gate { + pub fn new() -> Self { + Self { + open: Mutex::new(false), + cv: Condvar::new(), + } + } + + /// Wait at the gate. Only blocks if the gate is not opened. + pub fn wait(&self) { + let mut open = self.open.lock().unwrap(); + while !*open { + open = self.cv.wait(open).unwrap(); + } + } + + /// Open the gate, releasing all waiting threads. 
+ pub fn open(&self) { + let mut open = self.open.lock().unwrap(); + *open = true; + self.cv.notify_all(); + } +} + +#[cfg(test)] +mod tests { + use std::sync::{Arc, mpsc}; + use std::thread; + use std::time::Duration; + + use super::Gate; + + #[test] + fn gate_blocks_until_open() { + let gate = Arc::new(Gate::new()); + let (tx, rx) = mpsc::channel(); + + let gate_clone = gate.clone(); + thread::spawn(move || { + gate_clone.wait(); + tx.send(()).unwrap(); + }); + + // Give the thread time to block. + thread::sleep(Duration::from_millis(50)); + assert!(rx.try_recv().is_err()); + + gate.open(); + rx.recv_timeout(Duration::from_secs(1)).unwrap(); + } + + #[test] + fn gate_open_before_wait_is_non_blocking() { + let gate = Arc::new(Gate::new()); + gate.open(); + + let (tx, rx) = mpsc::channel(); + let gate_clone = gate.clone(); + thread::spawn(move || { + gate_clone.wait(); + tx.send(()).unwrap(); + }); + + rx.recv_timeout(Duration::from_secs(1)).unwrap(); + } + + #[test] + fn gate_releases_multiple_waiters() { + let gate = Arc::new(Gate::new()); + let (tx, rx) = mpsc::channel(); + + for _ in 0..4 { + let gate_clone = gate.clone(); + let tx = tx.clone(); + thread::spawn(move || { + gate_clone.wait(); + tx.send(()).unwrap(); + }); + } + + // Ensure nobody passed before open. 
+ thread::sleep(Duration::from_millis(50)); + assert!(rx.try_recv().is_err()); + + gate.open(); + + for _ in 0..4 { + rx.recv_timeout(Duration::from_secs(1)).unwrap(); + } + } + + #[test] + fn gate_open_is_idempotent() { + let gate = Arc::new(Gate::new()); + gate.open(); + gate.open(); + + let (tx, rx) = mpsc::channel(); + let gate_clone = gate.clone(); + thread::spawn(move || { + gate_clone.wait(); + tx.send(()).unwrap(); + }); + + rx.recv_timeout(Duration::from_secs(1)).unwrap(); + } +} diff --git a/vmm/src/uffd.rs b/vmm/src/uffd.rs new file mode 100644 index 0000000000..eb73f46592 --- /dev/null +++ b/vmm/src/uffd.rs @@ -0,0 +1,167 @@ +// Copyright © 2026 Cloud Hypervisor Authors +// +// SPDX-License-Identifier: Apache-2.0 + +//! Minimal userfaultfd bindings for demand-paged snapshot restore. +//! +//! Uses the `userfaultfd(2)` syscall (available since Linux 4.3) to create a +//! fault descriptor, then `UFFDIO_API` / `UFFDIO_REGISTER` / `UFFDIO_COPY` +//! ioctls to handle page faults from a background thread. +//! +//! Unlike an mmap(MAP_PRIVATE) overlay approach, UFFD does not replace the +//! original memory mapping, so it remains compatible with VFIO device +//! passthrough and shared-memory-backed guest RAM. + +use std::os::fd::{AsRawFd, BorrowedFd, FromRawFd, OwnedFd}; + +use crate::userfaultfd; + +#[repr(C)] +pub(crate) struct UffdioApi { + pub api: u64, + pub features: u64, + pub ioctls: u64, +} + +#[repr(C)] +pub(crate) struct UffdioRegister { + pub range_start: u64, + pub range_len: u64, + pub mode: u64, + pub ioctls: u64, +} + +#[repr(C)] +pub(crate) struct UffdioCopy { + pub dst: u64, + pub src: u64, + pub len: u64, + pub mode: u64, + pub copy: i64, +} + +/// Flat representation of `struct uffd_msg` (32 bytes). +/// +/// The kernel struct contains an 8-byte header followed by a 24-byte +/// union (`arg`). We only use the `arg.pagefault` variant, so the +/// union is flattened into its pagefault fields here. 
The trailing +/// 8 bytes (`arg.pagefault.feat` + padding) are unused. +#[repr(C)] +pub(crate) struct UffdMsg { + pub event: u8, + _reserved1: u8, + _reserved2: u16, + _reserved3: u32, + pub pf_flags: u64, + pub pf_address: u64, + _pad: [u8; 8], +} + +const _: () = assert!(std::mem::size_of::() == 32); + +/// Create a userfaultfd file descriptor and perform the API handshake. +pub(crate) fn create(required_features: u64) -> Result { + // SAFETY: `userfaultfd` syscall with O_CLOEXEC | O_NONBLOCK flags. + let fd = unsafe { libc::syscall(libc::SYS_userfaultfd, libc::O_CLOEXEC | libc::O_NONBLOCK) }; + if fd < 0 { + return Err(std::io::Error::last_os_error()); + } + // SAFETY: the syscall returned a valid fd above. + let fd = unsafe { OwnedFd::from_raw_fd(fd as std::os::unix::io::RawFd) }; + + let mut api = UffdioApi { + api: userfaultfd::UFFD_API, + features: required_features, + ioctls: 0, + }; + // SAFETY: `api` is a valid, correctly-sized struct for this ioctl. + let ret = unsafe { + libc::ioctl( + fd.as_raw_fd(), + userfaultfd::UFFDIO_API as libc::Ioctl, + &mut api, + ) + }; + if ret < 0 { + return Err(std::io::Error::last_os_error()); + } + + Ok(fd) +} + +/// Register a memory range for missing-page fault handling. +pub(crate) fn register(fd: BorrowedFd<'_>, addr: u64, len: u64) -> Result { + let mut reg = UffdioRegister { + range_start: addr, + range_len: len, + mode: userfaultfd::UFFDIO_REGISTER_MODE_MISSING, + ioctls: 0, + }; + // SAFETY: `reg` is a valid, correctly-sized struct for this ioctl. + let ret = unsafe { + libc::ioctl( + fd.as_raw_fd(), + userfaultfd::UFFDIO_REGISTER as libc::Ioctl, + &mut reg, + ) + }; + if ret < 0 { + return Err(std::io::Error::last_os_error()); + } + Ok(reg.ioctls) +} + +/// Resolve a page fault by copying data into the faulted address. 
+pub(crate) fn copy( + fd: BorrowedFd<'_>, + dst: u64, + src: *const u8, + len: u64, +) -> Result<(), std::io::Error> { + let mut cp = UffdioCopy { + dst, + src: src as u64, + len, + mode: 0, + copy: 0, + }; + // SAFETY: `cp` is a valid, correctly-sized struct for this ioctl. + let ret = unsafe { + libc::ioctl( + fd.as_raw_fd(), + userfaultfd::UFFDIO_COPY as libc::Ioctl, + &mut cp, + ) + }; + if ret < 0 { + return Err(std::io::Error::last_os_error()); + } + Ok(()) +} + +#[repr(C)] +struct UffdioRange { + start: u64, + len: u64, +} + +/// Wake threads waiting on a fault in the given range without copying data. +/// +/// Needed after UFFDIO_COPY returns EEXIST: the page was already resolved +/// by a concurrent fault, but any additional threads blocked on that page +/// may not have been woken. +pub(crate) fn wake(fd: BorrowedFd<'_>, addr: u64, len: u64) -> Result<(), std::io::Error> { + let mut range = UffdioRange { start: addr, len }; + // SAFETY: `range` is a valid, correctly-sized struct for this ioctl. + let ret = unsafe { + libc::ioctl( + fd.as_raw_fd(), + userfaultfd::UFFDIO_WAKE as libc::Ioctl, + &mut range, + ) + }; + if ret < 0 { + return Err(std::io::Error::last_os_error()); + } + Ok(()) +} diff --git a/vmm/src/userfaultfd.rs b/vmm/src/userfaultfd.rs new file mode 100644 index 0000000000..bbefb16bc6 --- /dev/null +++ b/vmm/src/userfaultfd.rs @@ -0,0 +1,37 @@ +// Copyright © 2026 Cloud Hypervisor Authors +// +// SPDX-License-Identifier: Apache-2.0 + +// See include/uapi/linux/userfaultfd.h in the kernel code. 
+pub const UFFDIO_API: u64 = 0xc018_aa3f; // _IOWR(0xAA, 0x3F, struct uffdio_api) +pub const UFFDIO_REGISTER: u64 = 0xc020_aa00; // _IOWR(0xAA, 0x00, struct uffdio_register) +pub const UFFDIO_COPY: u64 = 0xc028_aa03; // _IOWR(0xAA, 0x03, struct uffdio_copy) +pub const UFFDIO_WAKE: u64 = 0x8010_aa02; // _IOR(0xAA, 0x02, struct uffdio_range) + +// Validate ioctl encoding against the _IO{R,W,WR}(type, nr, size) formula so +// transposed direction bits or sizes are caught at compile time. +const fn ioctl_ioc(dir: u64, typ: u64, nr: u64, size: u64) -> u64 { + (dir << 30) | (size << 16) | (typ << 8) | nr +} +const IOC_READ: u64 = 2; +const IOC_READWRITE: u64 = 3; +const _: () = assert!(UFFDIO_API == ioctl_ioc(IOC_READWRITE, 0xAA, 0x3F, 24)); +const _: () = assert!(UFFDIO_REGISTER == ioctl_ioc(IOC_READWRITE, 0xAA, 0x00, 32)); +const _: () = assert!(UFFDIO_COPY == ioctl_ioc(IOC_READWRITE, 0xAA, 0x03, 40)); +const _: () = assert!(UFFDIO_WAKE == ioctl_ioc(IOC_READ, 0xAA, 0x02, 16)); + +// Seccomp compares these as Dword (u32); ensure they fit. 
+const _: () = assert!(UFFDIO_API <= u32::MAX as u64); +const _: () = assert!(UFFDIO_REGISTER <= u32::MAX as u64); +const _: () = assert!(UFFDIO_COPY <= u32::MAX as u64); +const _: () = assert!(UFFDIO_WAKE <= u32::MAX as u64); + +pub const UFFD_API: u64 = 0xAA; +pub const UFFDIO_REGISTER_MODE_MISSING: u64 = 1; +pub const UFFD_EVENT_PAGEFAULT: u8 = 0x12; +pub const UFFD_FEATURE_MISSING_HUGETLBFS: u64 = 1 << 4; +pub const UFFD_FEATURE_MISSING_SHMEM: u64 = 1 << 5; + +const _UFFDIO_COPY: u64 = 0x03; +const _UFFDIO_WAKE: u64 = 0x02; +pub const UFFD_API_RANGE_IOCTLS_BASIC: u64 = (1 << _UFFDIO_WAKE) | (1 << _UFFDIO_COPY); diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index db29072bed..cd118782e9 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -46,7 +46,15 @@ use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs; use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs; #[cfg(target_arch = "aarch64")] use hypervisor::arch::aarch64::regs::AARCH64_PMU_IRQ; +#[cfg(all(feature = "kvm", feature = "sev_snp"))] +use hypervisor::kvm::{ + BOOTLOADER_SIZE, BOOTLOADER_START, KVM_VMSA_PAGE_ADDRESS, KVM_VMSA_PAGE_SIZE, +}; use hypervisor::{HypervisorVmConfig, HypervisorVmError, VmOps}; +#[cfg(feature = "igvm")] +use igvm::IgvmFile; +#[cfg(feature = "sev_snp")] +use igvm_defs::SnpPolicy; use libc::{SIGWINCH, termios}; use linux_loader::cmdline::Cmdline; #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] @@ -65,10 +73,10 @@ use thiserror::Error; use tracer::trace_scoped; use vm_device::Bus; #[cfg(feature = "tdx")] +use vm_memory::GuestMemory; +#[cfg(feature = "tdx")] use vm_memory::{Address, ByteValued, GuestMemoryRegion, ReadVolatile}; -use vm_memory::{ - Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, WriteVolatile, -}; +use vm_memory::{Bytes, GuestAddress, GuestAddressSpace, GuestMemoryAtomic}; use vm_migration::protocol::{MemoryRangeTable, Request, Response}; use vm_migration::{ Migratable, MigratableError, Pausable, Snapshot, Snapshottable, 
Transportable, snapshot_from_id, @@ -76,7 +84,7 @@ use vm_migration::{ use vmm_sys_util::eventfd::EventFd; use vmm_sys_util::sock_ctrl_msg::ScmSocket; -use crate::config::{ValidationError, add_to_config}; +use crate::config::{MemoryRestoreMode, ValidationError, add_to_config}; use crate::console_devices::{ConsoleDeviceError, ConsoleInfo}; #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] use crate::coredump::{ @@ -100,8 +108,8 @@ use crate::migration::{SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE, url_to_path}; #[cfg(feature = "fw_cfg")] use crate::vm_config::FwCfgConfig; use crate::vm_config::{ - DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, NumaConfig, PayloadConfig, - PmemConfig, UserDeviceConfig, VdpaConfig, VmConfig, VsockConfig, + DeviceConfig, DiskConfig, FsConfig, GenericVhostUserConfig, HotplugMethod, NetConfig, + NumaConfig, PayloadConfig, PmemConfig, UserDeviceConfig, VdpaConfig, VmConfig, VsockConfig, }; use crate::{ CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, GuestMemoryMmap, @@ -129,7 +137,13 @@ pub enum Error { UefiLoad(#[source] arch::riscv64::uefi::Error), #[error("Cannot load the initramfs into memory")] - InitramfsLoad, + InitramfsLoad(#[source] std::io::Error), + + #[error("Cannot determine initramfs load address")] + InitramfsAddress(#[source] arch::Error), + + #[error("Cannot read initramfs into guest memory")] + InitramfsRead(#[source] vm_memory::GuestMemoryError), #[error("Cannot load the kernel command line in memory")] LoadCmdLine(#[source] linux_loader::loader::Error), @@ -325,16 +339,12 @@ pub enum Error { #[error("Error coredumping VM")] Coredump(#[source] GuestDebuggableError), - #[cfg(feature = "igvm")] - #[error("Cannot open igvm file")] - IgvmFile(#[source] io::Error), - #[cfg(feature = "igvm")] #[error("Cannot load the igvm into memory")] IgvmLoad(#[source] igvm_loader::Error), #[error("Error injecting NMI")] - ErrorNmi, + ErrorNmi(#[source] cpu::Error), #[error("Error resuming the VM")] 
ResumeVm(#[source] hypervisor::HypervisorVmError), @@ -528,6 +538,19 @@ pub struct Vm { impl Vm { pub const HANDLED_SIGNALS: [i32; 1] = [SIGWINCH]; + #[cfg(feature = "sev_snp")] + pub fn get_default_sev_snp_guest_policy() -> SnpPolicy { + SnpPolicy::new() + .with_abi_minor(0) + .with_abi_major(0) + // SMT permitted: allows the guest to run on an SMT-enabled host. + // This is the permissive default; future work can expose this as a + // configurable platform option. + .with_smt(1) + .with_reserved_must_be_one(1) + .with_migrate_ma(0) + } + #[allow(clippy::needless_pass_by_value)] #[allow(clippy::too_many_arguments)] pub fn new_from_memory_manager( @@ -536,6 +559,7 @@ impl Vm { vm: Arc, exit_evt: EventFd, reset_evt: EventFd, + guest_exit_evt: EventFd, #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, seccomp_action: &SeccompAction, hypervisor: Arc, @@ -545,6 +569,7 @@ impl Vm { console_resize_pipe: Option>, original_termios: Arc>>, snapshot: Option<&Snapshot>, + #[cfg(feature = "igvm")] igvm_file: Option, ) -> Result { trace_scoped!("Vm::new_from_memory_manager"); @@ -560,8 +585,8 @@ impl Vm { let numa_nodes = Self::create_numa_nodes(config.lock().unwrap().numa.as_deref(), &memory_manager)?; - // Determine if IOMMU should be forced based on confidential computing features - let force_iommu = Self::should_force_iommu(&config); + // Determine if VIRTIO_F_ACCESS_PLATFORM should be forced (e.g. 
for TDX/SEV-SNP) + let force_access_platform = Self::should_force_access_platform(&config); let stop_on_boot = Self::should_stop_on_boot(&config); @@ -604,15 +629,15 @@ impl Vm { cpu_manager.clone(), exit_evt.try_clone().map_err(Error::EventFdClone)?, reset_evt, + guest_exit_evt, seccomp_action.clone(), numa_nodes.clone(), &activate_evt, - force_iommu, + force_access_platform, boot_id_list, #[cfg(not(target_arch = "riscv64"))] timestamp, snapshot, - &config, )?; // Perform hypervisor-specific initialization @@ -627,6 +652,8 @@ impl Vm { console_resize_pipe.as_ref(), &original_termios, snapshot, + #[cfg(feature = "igvm")] + igvm_file, )?; // Load kernel and initramfs files @@ -687,8 +714,9 @@ impl Vm { }) } - /// Determine if IOMMU should be forced based on confidential computing features. - fn should_force_iommu(_config: &Arc>) -> bool { + /// Determine if VIRTIO_F_ACCESS_PLATFORM should be forced based on + /// confidential computing features. + fn should_force_access_platform(_config: &Arc>) -> bool { #[cfg(feature = "tdx")] if _config.lock().unwrap().is_tdx_enabled() { return true; @@ -730,6 +758,14 @@ impl Vm { let tdx_enabled = config.lock().unwrap().is_tdx_enabled(); #[cfg(feature = "sev_snp")] let sev_snp_enabled = config.lock().unwrap().is_sev_snp_enabled(); + #[cfg(feature = "igvm")] + let igvm_enabled = config + .lock() + .unwrap() + .payload + .as_ref() + .and_then(|p| p.igvm.as_ref()) + .is_some(); let cpus_config = config.lock().unwrap().cpus.clone(); let cpu_manager = cpu::CpuManager::new( @@ -747,6 +783,8 @@ impl Vm { numa_nodes, #[cfg(feature = "sev_snp")] sev_snp_enabled, + #[cfg(feature = "igvm")] + igvm_enabled, ) .map_err(Error::CpuManager)?; @@ -791,17 +829,17 @@ impl Vm { cpu_manager: Arc>, exit_evt: EventFd, reset_evt: EventFd, + guest_exit_evt: EventFd, seccomp_action: SeccompAction, numa_nodes: NumaNodes, activate_evt: &EventFd, - force_iommu: bool, + force_access_platform: bool, boot_id_list: BTreeSet, #[cfg(not(target_arch = "riscv64"))] 
timestamp: Instant, snapshot: Option<&Snapshot>, - _vm_config: &Arc>, ) -> Result>> { #[cfg(feature = "tdx")] - let dynamic = !_vm_config.lock().unwrap().is_tdx_enabled(); + let dynamic = !config.lock().unwrap().is_tdx_enabled(); #[cfg(not(feature = "tdx"))] let dynamic = true; @@ -814,10 +852,11 @@ impl Vm { cpu_manager, exit_evt, reset_evt, + guest_exit_evt, seccomp_action, numa_nodes, activate_evt, - force_iommu, + force_access_platform, boot_id_list, #[cfg(not(target_arch = "riscv64"))] timestamp, @@ -845,6 +884,7 @@ impl Vm { console_resize_pipe: Option<&Arc>, original_termios: &Arc>>, snapshot: Option<&Snapshot>, + #[cfg(feature = "igvm")] igvm_file: Option, ) -> Result>>> { #[cfg(feature = "mshv")] let is_mshv = matches!( @@ -879,6 +919,8 @@ impl Vm { console_resize_pipe, original_termios, snapshot, + #[cfg(feature = "igvm")] + igvm_file, ); } @@ -901,14 +943,6 @@ impl Vm { .allocate_address_space() .map_err(Error::MemoryManager)?; - // Add UEFI flash for aarch64 - #[cfg(target_arch = "aarch64")] - memory_manager - .lock() - .unwrap() - .add_uefi_flash() - .map_err(Error::MemoryManager)?; - // Load payload asynchronously let load_payload_handle = if snapshot.is_none() { Self::load_payload_async( @@ -916,8 +950,8 @@ impl Vm { config, #[cfg(feature = "igvm")] cpu_manager, - #[cfg(feature = "sev_snp")] - false, + #[cfg(feature = "igvm")] + igvm_file, )? 
} else { None @@ -962,6 +996,7 @@ impl Vm { console_resize_pipe: Option<&Arc>, original_termios: &Arc>>, snapshot: Option<&Snapshot>, + #[cfg(feature = "igvm")] igvm_file: Option, ) -> Result>>> { // Create boot vCPUs before SEV-SNP initialization cpu_manager @@ -971,7 +1006,8 @@ impl Vm { .map_err(Error::CpuManager)?; // Initialize SEV-SNP - transitions guest into secure state - vm.sev_snp_init().map_err(Error::InitializeSevSnpVm)?; + vm.sev_snp_init(Self::get_default_sev_snp_guest_policy()) + .map_err(Error::InitializeSevSnpVm)?; // Load payload for SEV-SNP (IGVM parser needs cpu_manager for cpuid) let load_payload_handle = if snapshot.is_none() { @@ -980,7 +1016,8 @@ impl Vm { config, #[cfg(feature = "igvm")] cpu_manager, - true, + #[cfg(feature = "igvm")] + igvm_file, )? } else { None @@ -1007,6 +1044,9 @@ impl Vm { ) .map_err(Error::DeviceManager)?; + #[cfg(feature = "fw_cfg")] + Self::create_fw_cfg_if_enabled(config, device_manager)?; + Ok(load_payload_handle) } @@ -1145,15 +1185,24 @@ impl Vm { initramfs_option = initramfs; } let mut fw_cfg_item_list_option: Option> = None; - if let Some(fw_cfg_files) = &fw_cfg_config.items { + if let Some(fw_cfg_items) = &fw_cfg_config.items { let mut fw_cfg_item_list = vec![]; - for fw_cfg_file in fw_cfg_files.item_list.clone() { - fw_cfg_item_list.push(FwCfgItem { - name: fw_cfg_file.name, - content: devices::legacy::fw_cfg::FwCfgContent::File( + for fw_cfg_item in fw_cfg_items.item_list.clone() { + let content = match (fw_cfg_item.string, fw_cfg_item.file) { + (Some(string_val), None) => { + devices::legacy::fw_cfg::FwCfgContent::Bytes(string_val.into_bytes()) + } + (None, Some(file_path)) => devices::legacy::fw_cfg::FwCfgContent::File( 0, - File::open(fw_cfg_file.file).map_err(Error::AddingFwCfgItem)?, + File::open(file_path).map_err(Error::AddingFwCfgItem)?, ), + _ => unreachable!( + "PayloadConfig::validate() ensures either 'file' or 'string' is present" + ), + }; + fw_cfg_item_list.push(FwCfgItem { + name: 
fw_cfg_item.name, + content, }); } fw_cfg_item_list_option = Some(fw_cfg_item_list); @@ -1257,6 +1306,7 @@ impl Vm { vm_config: Arc>, exit_evt: EventFd, reset_evt: EventFd, + guest_exit_evt: EventFd, #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, seccomp_action: &SeccompAction, hypervisor: Arc, @@ -1267,6 +1317,7 @@ impl Vm { snapshot: Option<&Snapshot>, source_url: Option<&str>, prefault: Option, + memory_restore_mode: Option, ) -> Result { trace_scoped!("Vm::new"); @@ -1280,10 +1331,28 @@ impl Vm { vm_config.lock().unwrap().is_tdx_enabled() }; - let vm = Self::create_hypervisor_vm( - hypervisor.as_ref(), - vm_config.as_ref().lock().unwrap().deref().into(), - )?; + #[cfg(feature = "igvm")] + let igvm_file = { + let config = vm_config.lock().unwrap(); + config + .payload + .as_ref() + .and_then(|p| p.igvm.as_ref()) + .map(|igvm_path| crate::igvm::parse_igvm(igvm_path)) + .transpose() + .map_err(Error::IgvmLoad)? + }; + + let vm = { + #[allow(unused_mut)] + let mut hv_config: hypervisor::HypervisorVmConfig = + vm_config.as_ref().lock().unwrap().deref().into(); + #[cfg(all(feature = "igvm", feature = "sev_snp"))] + if let Some(ref igvm) = igvm_file { + hv_config.vmsa_features = igvm_loader::extract_sev_features(igvm); + } + Self::create_hypervisor_vm(hypervisor.as_ref(), hv_config)? + }; #[cfg(all(feature = "kvm", target_arch = "x86_64"))] if vm_config.lock().unwrap().max_apic_id() > MAX_SUPPORTED_CPUS_LEGACY { @@ -1302,8 +1371,10 @@ impl Vm { vm.clone(), &vm_config.lock().unwrap().memory.clone(), source_url, - prefault.unwrap(), + prefault.unwrap_or(false), + memory_restore_mode.unwrap_or_default(), phys_bits, + &exit_evt, ) .map_err(Error::MemoryManager)? 
} else { @@ -1326,6 +1397,7 @@ impl Vm { vm, exit_evt, reset_evt, + guest_exit_evt, #[cfg(feature = "guest_debug")] vm_debug_evt, seccomp_action, @@ -1337,6 +1409,8 @@ impl Vm { console_resize_pipe, original_termios, snapshot, + #[cfg(feature = "igvm")] + igvm_file, ) } @@ -1363,18 +1437,18 @@ impl Vm { let initramfs = self.initramfs.as_mut().unwrap(); let size: usize = initramfs .seek(SeekFrom::End(0)) - .map_err(|_| Error::InitramfsLoad)? + .map_err(Error::InitramfsLoad)? .try_into() .unwrap(); - initramfs.rewind().map_err(|_| Error::InitramfsLoad)?; + initramfs.rewind().map_err(Error::InitramfsLoad)?; let address = - arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?; + arch::initramfs_load_addr(guest_mem, size).map_err(Error::InitramfsAddress)?; let address = GuestAddress(address); guest_mem .read_volatile_from(address, initramfs, size) - .map_err(|_| Error::InitramfsLoad)?; + .map_err(Error::InitramfsRead)?; info!("Initramfs loaded: address = 0x{:x}", address.0); Ok(arch::InitramfsConfig { address, size }) @@ -1404,7 +1478,11 @@ impl Vm { mut firmware: &File, memory_manager: Arc>, ) -> Result { - let uefi_flash = memory_manager.lock().as_ref().unwrap().uefi_flash(); + let mut memory_manager = memory_manager.lock().unwrap(); + memory_manager + .add_uefi_flash() + .map_err(Error::MemoryManager)?; + let uefi_flash = memory_manager.uefi_flash(); let mem = uefi_flash.memory(); arch::uefi::load_uefi(mem.deref(), arch::layout::UEFI_START, &mut firmware) .map_err(Error::UefiLoad)?; @@ -1448,16 +1526,33 @@ impl Vm { Ok(EntryPoint { entry_addr }) } + #[cfg(all(feature = "kvm", feature = "sev_snp"))] + fn reserve_bootloader_regions(memory_manager: &Arc>) -> Result<()> { + let mut mm = memory_manager.lock().unwrap(); + mm.add_ram_region(BOOTLOADER_START, BOOTLOADER_SIZE) + .map_err(Error::MemoryManager)?; + mm.add_ram_region(KVM_VMSA_PAGE_ADDRESS, KVM_VMSA_PAGE_SIZE) + .map_err(Error::MemoryManager)?; + Ok(()) + } + #[cfg(feature = "igvm")] 
#[allow(clippy::needless_pass_by_value)] fn load_igvm( - igvm: File, + igvm_file: IgvmFile, memory_manager: Arc>, cpu_manager: Arc>, #[cfg(feature = "sev_snp")] host_data: &Option, ) -> Result { + // Only reserve bootloader/VMSA regions for KVM + SEV-SNP; other hypervisors + // (e.g. MSHV) handle this through their own import path. + #[cfg(all(feature = "kvm", feature = "sev_snp"))] + if cpu_manager.lock().unwrap().sev_snp_enabled() { + Self::reserve_bootloader_regions(&memory_manager)?; + } + let res = igvm_loader::load_igvm( - &igvm, + igvm_file, memory_manager, cpu_manager.clone(), "", @@ -1547,19 +1642,21 @@ impl Vm { payload: &PayloadConfig, memory_manager: Arc>, #[cfg(feature = "igvm")] cpu_manager: Arc>, - #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, + #[cfg(feature = "igvm")] igvm_file: Option, ) -> Result { trace_scoped!("load_payload"); #[cfg(feature = "igvm")] { - if let Some(_igvm_file) = &payload.igvm { - let igvm = File::open(_igvm_file).map_err(Error::IgvmFile)?; - #[cfg(feature = "sev_snp")] - if sev_snp_enabled { - return Self::load_igvm(igvm, memory_manager, cpu_manager, &payload.host_data); - } - #[cfg(not(feature = "sev_snp"))] - return Self::load_igvm(igvm, memory_manager, cpu_manager); + if payload.igvm.is_some() { + let igvm_file = + igvm_file.ok_or(Error::IgvmLoad(igvm_loader::Error::MissingIgvm))?; + return Self::load_igvm( + igvm_file, + memory_manager, + cpu_manager, + #[cfg(feature = "sev_snp")] + &payload.host_data, + ); } } match (&payload.firmware, &payload.kernel) { @@ -1603,7 +1700,7 @@ impl Vm { memory_manager: &Arc>, config: &Arc>, #[cfg(feature = "igvm")] cpu_manager: &Arc>, - #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, + #[cfg(feature = "igvm")] igvm_file: Option, ) -> Result>>> { // Kernel with TDX is loaded in a different manner #[cfg(feature = "tdx")] @@ -1630,8 +1727,8 @@ impl Vm { memory_manager, #[cfg(feature = "igvm")] cpu_manager, - #[cfg(feature = "sev_snp")] - sev_snp_enabled, + #[cfg(feature = "igvm")] + 
igvm_file, ) }) .map_err(Error::KernelLoadThreadSpawn) @@ -1640,7 +1737,11 @@ impl Vm { } #[cfg(target_arch = "x86_64")] - fn configure_system(&mut self, rsdp_addr: GuestAddress, entry_addr: EntryPoint) -> Result<()> { + fn configure_system( + &mut self, + rsdp_addr: Option, + entry_addr: EntryPoint, + ) -> Result<()> { trace_scoped!("configure_system"); info!("Configuring system"); let mem = self.memory_manager.lock().unwrap().boot_guest_memory(); @@ -1651,7 +1752,6 @@ impl Vm { }; let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus(); - let rsdp_addr = Some(rsdp_addr); let serial_number = self .config @@ -1703,7 +1803,7 @@ impl Vm { #[cfg(target_arch = "aarch64")] fn configure_system( &mut self, - _rsdp_addr: GuestAddress, + _rsdp_addr: Option, _entry_addr: EntryPoint, ) -> Result<()> { let cmdline = Self::generate_cmdline( @@ -2136,6 +2236,33 @@ impl Vm { Ok(pci_device_info) } + pub fn add_generic_vhost_user( + &mut self, + mut generic_vhost_user_cfg: GenericVhostUserConfig, + ) -> Result { + let pci_device_info = self + .device_manager + .lock() + .unwrap() + .add_generic_vhost_user(&mut generic_vhost_user_cfg) + .map_err(Error::DeviceManager)?; + + // Update VmConfig by adding the new device. This is important to + // ensure the device would be created in case of a reboot. + { + let mut config = self.config.lock().unwrap(); + add_to_config(&mut config.generic_vhost_user, generic_vhost_user_cfg); + } + + self.device_manager + .lock() + .unwrap() + .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) + .map_err(Error::DeviceManager)?; + + Ok(pci_device_info) + } + pub fn add_pmem(&mut self, mut pmem_cfg: PmemConfig) -> Result { let pci_device_info = self .device_manager @@ -2486,9 +2613,9 @@ impl Vm { // Loop over the ACPI tables and copy them to the HOB. 
for acpi_table in crate::acpi::create_acpi_tables_tdx( - &self.device_manager, - &self.cpu_manager, - &self.memory_manager, + &self.device_manager.lock().unwrap(), + &self.cpu_manager.lock().unwrap(), + &self.memory_manager.lock().unwrap(), &self.numa_nodes, ) { hob.add_acpi_table(&mem, acpi_table.as_slice()) @@ -2548,9 +2675,9 @@ impl Vm { let tpm_enabled = self.config.lock().unwrap().tpm.is_some(); let rsdp_addr = crate::acpi::create_acpi_tables( &mem, - &self.device_manager, - &self.cpu_manager, - &self.memory_manager, + &self.device_manager.lock().unwrap(), + &self.cpu_manager.lock().unwrap(), + &self.memory_manager.lock().unwrap(), &self.numa_nodes, tpm_enabled, ); @@ -2615,9 +2742,9 @@ impl Vm { if fw_cfg_config.acpi_tables { let tpm_enabled = self.config.lock().unwrap().tpm.is_some(); crate::acpi::create_acpi_tables_for_fw_cfg( - &self.device_manager, - &self.cpu_manager, - &self.memory_manager, + &self.device_manager.lock().unwrap(), + &self.cpu_manager.lock().unwrap(), + &self.memory_manager.lock().unwrap(), &self.numa_nodes, tpm_enabled, )?; @@ -2713,16 +2840,11 @@ impl Vm { let rsdp_addr = self.create_acpi_tables(); #[cfg(not(target_arch = "riscv64"))] - { - #[cfg(not(any(feature = "sev_snp", feature = "tdx")))] - assert!(rsdp_addr.is_some()); - // Configure shared state based on loaded kernel - if let Some(rsdp_adr) = rsdp_addr { - entry_point - .map(|entry_point| self.configure_system(rsdp_adr, entry_point)) - .transpose()?; - } - } + // Configure shared state based on loaded kernel + entry_point + .map(|entry_point| self.configure_system(rsdp_addr, entry_point)) + .transpose()?; + #[cfg(target_arch = "riscv64")] self.configure_system().unwrap(); @@ -2794,6 +2916,14 @@ impl Vm { self.device_manager.lock().unwrap().balloon_size() } + /// Get the actual size of the virtio_mem regions + pub fn virtio_mem_plugged_size(&self) -> u64 { + self.memory_manager + .lock() + .unwrap() + .virtio_mem_plugged_size() + } + pub fn send_memory_fds( &mut self, socket: 
&mut UnixStream, @@ -2825,47 +2955,6 @@ impl Vm { Ok(()) } - pub fn send_memory_regions( - &mut self, - ranges: &MemoryRangeTable, - fd: &mut F, - ) -> std::result::Result<(), MigratableError> - where - F: WriteVolatile, - { - let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); - let mem = guest_memory.memory(); - - for range in ranges.regions() { - let mut offset: u64 = 0; - // Here we are manually handling the retry in case we can't the - // whole region at once because we can't use the implementation - // from vm-memory::GuestMemory of write_all_to() as it is not - // following the correct behavior. For more info about this issue - // see: https://github.com/rust-vmm/vm-memory/issues/174 - loop { - let bytes_written = mem - .write_volatile_to( - GuestAddress(range.gpa + offset), - fd, - (range.length - offset) as usize, - ) - .map_err(|e| { - MigratableError::MigrateSend(anyhow!( - "Error transferring memory to socket: {e}" - )) - })?; - offset += bytes_written as u64; - - if offset == range.length { - break; - } - } - } - - Ok(()) - } - pub fn memory_range_table(&self) -> std::result::Result { self.memory_manager .lock() @@ -2873,6 +2962,10 @@ impl Vm { .memory_range_table(false) } + pub fn guest_memory(&self) -> GuestMemoryAtomic { + self.memory_manager.lock().unwrap().guest_memory() + } + pub fn device_tree(&self) -> Arc> { self.device_manager.lock().unwrap().device_tree() } @@ -3031,7 +3124,7 @@ impl Vm { .lock() .unwrap() .nmi() - .map_err(|_| Error::ErrorNmi); + .map_err(Error::ErrorNmi); } } @@ -3081,7 +3174,8 @@ impl Pausable for Vm { .valid_transition(new_state) .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {e:?}")))?; - self.cpu_manager.lock().unwrap().resume()?; + // Restore KVM clock BEFORE vCPUs start running, so they see correct + // TSC/kvmclock from the first instruction after resume. 
#[cfg(target_arch = "x86_64")] { if let Some(clock) = &self.saved_clock { @@ -3098,6 +3192,7 @@ impl Pausable for Vm { } self.device_manager.lock().unwrap().resume()?; + self.cpu_manager.lock().unwrap().resume()?; // And we're back to the Running state. self.state = new_state; diff --git a/vmm/src/vm_config.rs b/vmm/src/vm_config.rs index 5de8c31452..b0fbb531a2 100644 --- a/vmm/src/vm_config.rs +++ b/vmm/src/vm_config.rs @@ -9,6 +9,7 @@ use std::str::FromStr; use std::{fs, result}; use block::ImageType; +pub use block::fcntl::LockGranularityChoice; use log::{debug, warn}; use net_util::MacAddr; use serde::{Deserialize, Serialize}; @@ -39,6 +40,14 @@ pub struct CpuFeatures { pub amx: bool, } +#[derive(Clone, Copy, Debug, PartialEq, Eq, Deserialize, Serialize, Default)] +pub enum CoreScheduling { + #[default] + Vm, // All vCPUs have the same cookie so can share a core + Vcpu, // Each vCPU has a unique cookie so can't share a core + Off, +} + #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] pub struct CpuTopology { pub threads_per_core: u16, @@ -72,6 +81,8 @@ pub struct CpusConfig { pub features: CpuFeatures, #[serde(default = "default_cpusconfig_nested")] pub nested: bool, + #[serde(default)] + pub core_scheduling: CoreScheduling, } pub const DEFAULT_VCPUS: u32 = 1; @@ -87,6 +98,7 @@ impl Default for CpusConfig { affinity: None, features: CpuFeatures::default(), nested: true, + core_scheduling: CoreScheduling::default(), } } } @@ -101,6 +113,10 @@ pub fn default_platformconfig_iommu_address_width_bits() -> u8 { DEFAULT_IOMMU_ADDRESS_WIDTH_BITS } +pub fn default_platformconfig_vfio_p2p_dma() -> bool { + true +} + #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] pub struct PlatformConfig { #[serde(default = "default_platformconfig_num_pci_segments")] @@ -121,6 +137,10 @@ pub struct PlatformConfig { #[cfg(feature = "sev_snp")] #[serde(default)] pub sev_snp: bool, + #[serde(default)] + pub iommufd: bool, + #[serde(default = 
"default_platformconfig_vfio_p2p_dma")] + pub vfio_p2p_dma: bool, } pub const DEFAULT_PCI_SEGMENT_APERTURE_WEIGHT: u32 = 1; @@ -139,7 +159,7 @@ pub struct PciSegmentConfig { pub mmio64_aperture_weight: u32, } -#[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] +#[derive(Clone, Debug, Default, PartialEq, Eq, Deserialize, Serialize)] pub struct MemoryZoneConfig { pub id: String, pub size: u64, @@ -159,6 +179,8 @@ pub struct MemoryZoneConfig { pub hotplugged_size: Option, #[serde(default)] pub prefault: bool, + #[serde(default)] + pub mergeable: bool, } impl ApplyLandlock for MemoryZoneConfig { @@ -251,15 +273,27 @@ pub struct VirtQueueAffinity { pub host_cpus: Vec, } +#[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize, Default)] +pub struct PciDeviceCommonConfig { + #[serde(default)] + pub id: Option, + #[serde(default, skip_serializing_if = "<&bool as std::ops::Not>::not")] + pub iommu: bool, + #[serde(default)] + pub pci_segment: u16, + #[serde(default)] + pub pci_device_id: Option, +} + #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] pub struct DiskConfig { + #[serde(flatten)] + pub pci_common: PciDeviceCommonConfig, pub path: Option, #[serde(default)] pub readonly: bool, #[serde(default)] pub direct: bool, - #[serde(default)] - pub iommu: bool, #[serde(default = "default_diskconfig_num_queues")] pub num_queues: usize, #[serde(default = "default_diskconfig_queue_size")] @@ -271,8 +305,6 @@ pub struct DiskConfig { pub rate_limit_group: Option, #[serde(default)] pub rate_limiter_config: Option, - #[serde(default)] - pub id: Option, // For testing use only. Not exposed in API. 
#[serde(default)] pub disable_io_uring: bool, @@ -280,8 +312,6 @@ pub struct DiskConfig { #[serde(default)] pub disable_aio: bool, #[serde(default)] - pub pci_segment: u16, - #[serde(default)] pub serial: Option, #[serde(default)] pub queue_affinity: Option>, @@ -291,6 +321,8 @@ pub struct DiskConfig { pub sparse: bool, #[serde(default)] pub image_type: ImageType, + #[serde(default)] + pub lock_granularity: LockGranularityChoice, } impl ApplyLandlock for DiskConfig { @@ -320,6 +352,8 @@ pub fn default_diskconfig_sparse() -> bool { #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] pub struct NetConfig { + #[serde(flatten)] + pub pci_common: PciDeviceCommonConfig, #[serde(default = "default_netconfig_tap")] pub tap: Option, pub ip: Option, @@ -330,8 +364,6 @@ pub struct NetConfig { pub host_mac: Option, #[serde(default)] pub mtu: Option, - #[serde(default)] - pub iommu: bool, #[serde(default = "default_netconfig_num_queues")] pub num_queues: usize, #[serde(default = "default_netconfig_queue_size")] @@ -341,8 +373,6 @@ pub struct NetConfig { pub vhost_socket: Option, #[serde(default)] pub vhost_mode: VhostMode, - #[serde(default)] - pub id: Option, // Special deserialize handling: // Therefore, we don't serialize FDs, and whatever value is here after // deserialization is invalid. 
@@ -353,8 +383,6 @@ pub struct NetConfig { pub fds: Option>, #[serde(default)] pub rate_limiter_config: Option, - #[serde(default)] - pub pci_segment: u16, #[serde(default = "default_netconfig_true")] pub offload_tso: bool, #[serde(default = "default_netconfig_true")] @@ -445,16 +473,14 @@ pub struct PvmemcontrolConfig {} #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] pub struct FsConfig { + #[serde(flatten)] + pub pci_common: PciDeviceCommonConfig, pub tag: String, pub socket: PathBuf, #[serde(default = "default_fsconfig_num_queues")] pub num_queues: usize, #[serde(default = "default_fsconfig_queue_size")] pub queue_size: u16, - #[serde(default)] - pub id: Option, - #[serde(default)] - pub pci_segment: u16, } pub fn default_fsconfig_num_queues() -> usize { @@ -472,19 +498,31 @@ impl ApplyLandlock for FsConfig { } } +#[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] +pub struct GenericVhostUserConfig { + #[serde(flatten)] + pub pci_common: PciDeviceCommonConfig, + pub socket: PathBuf, + pub queue_sizes: Vec, + pub device_type: u32, +} + +impl ApplyLandlock for GenericVhostUserConfig { + fn apply_landlock(&self, landlock: &mut Landlock) -> LandlockResult<()> { + landlock.add_rule_with_access(&self.socket, "rw")?; + Ok(()) + } +} + #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] pub struct PmemConfig { + #[serde(flatten)] + pub pci_common: PciDeviceCommonConfig, pub file: PathBuf, #[serde(default)] pub size: Option, #[serde(default)] - pub iommu: bool, - #[serde(default)] pub discard_writes: bool, - #[serde(default)] - pub id: Option, - #[serde(default)] - pub pci_segment: u16, } impl ApplyLandlock for PmemConfig { @@ -521,6 +559,10 @@ pub fn default_consoleconfig_file() -> Option { impl ApplyLandlock for ConsoleConfig { fn apply_landlock(&self, landlock: &mut Landlock) -> LandlockResult<()> { + if self.mode == ConsoleOutputMode::Pty { + landlock.add_rule_with_access(Path::new("/dev/pts"), "rw")?; + 
landlock.add_rule_with_access(Path::new("/dev/ptmx"), "rw")?; + } if let Some(file) = &self.file { landlock.add_rule_with_access(file, "rw")?; } @@ -554,6 +596,10 @@ impl Default for DebugConsoleConfig { #[cfg(target_arch = "x86_64")] impl ApplyLandlock for DebugConsoleConfig { fn apply_landlock(&self, landlock: &mut Landlock) -> LandlockResult<()> { + if self.mode == ConsoleOutputMode::Pty { + landlock.add_rule_with_access(Path::new("/dev/pts"), "rw")?; + landlock.add_rule_with_access(Path::new("/dev/ptmx"), "rw")?; + } if let Some(file) = &self.file { landlock.add_rule_with_access(file, "rw")?; } @@ -563,14 +609,10 @@ impl ApplyLandlock for DebugConsoleConfig { #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] pub struct DeviceConfig { + #[serde(flatten)] + pub pci_common: PciDeviceCommonConfig, pub path: PathBuf, #[serde(default)] - pub iommu: bool, - #[serde(default)] - pub id: Option, - #[serde(default)] - pub pci_segment: u16, - #[serde(default)] pub x_nv_gpudirect_clique: Option, } @@ -593,11 +635,9 @@ impl ApplyLandlock for DeviceConfig { #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] pub struct UserDeviceConfig { + #[serde(flatten)] + pub pci_common: PciDeviceCommonConfig, pub socket: PathBuf, - #[serde(default)] - pub id: Option, - #[serde(default)] - pub pci_segment: u16, } impl ApplyLandlock for UserDeviceConfig { @@ -609,15 +649,11 @@ impl ApplyLandlock for UserDeviceConfig { #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] pub struct VdpaConfig { + #[serde(flatten)] + pub pci_common: PciDeviceCommonConfig, pub path: PathBuf, #[serde(default = "default_vdpaconfig_num_queues")] pub num_queues: usize, - #[serde(default)] - pub iommu: bool, - #[serde(default)] - pub id: Option, - #[serde(default)] - pub pci_segment: u16, } pub fn default_vdpaconfig_num_queues() -> usize { @@ -633,14 +669,10 @@ impl ApplyLandlock for VdpaConfig { #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] pub struct VsockConfig 
{ + #[serde(flatten)] + pub pci_common: PciDeviceCommonConfig, pub cid: u32, pub socket: PathBuf, - #[serde(default)] - pub iommu: bool, - #[serde(default)] - pub id: Option, - #[serde(default)] - pub pci_segment: u16, } impl ApplyLandlock for VsockConfig { @@ -715,6 +747,24 @@ pub enum PayloadConfigError { /// Specifying a kernel or firmware is not supported when an igvm is provided. #[error("Specifying a kernel or firmware is not supported when an igvm is provided")] IgvmPlusOtherPayloads, + #[cfg(feature = "fw_cfg")] + /// FwCfg missing kernel + #[error("Error --fw-cfg-config: missing --kernel")] + FwCfgMissingKernel, + #[cfg(feature = "fw_cfg")] + /// FwCfg missing cmdline + #[error("Error --fw-cfg-config: missing --cmdline")] + FwCfgMissingCmdline, + #[cfg(feature = "fw_cfg")] + /// FwCfg missing initramfs + #[error("Error --fw-cfg-config: missing --initramfs")] + FwCfgMissingInitramfs, + #[cfg(feature = "fw_cfg")] + /// Invalid fw_cfg item content + #[error( + "Error --fw-cfg-config: invalid item '{0}' (exactly one of 'file' or 'string' is required)" + )] + FwCfgInvalidItem(String), } #[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] @@ -775,7 +825,9 @@ pub struct FwCfgItem { #[serde(default)] pub name: String, #[serde(default)] - pub file: PathBuf, + pub file: Option, + #[serde(default)] + pub string: Option, } #[cfg(feature = "fw_cfg")] @@ -817,7 +869,7 @@ impl PayloadConfig { #[cfg(feature = "igvm")] { if self.igvm.is_some() { - if self.firmware.is_some() || self.kernel.is_some() { + if self.firmware.is_some() { return Err(PayloadConfigError::IgvmPlusOtherPayloads); } return Ok(()); @@ -840,6 +892,11 @@ impl PayloadConfig { (None, None) => Err(PayloadConfigError::MissingBootitem), }?; + #[cfg(feature = "fw_cfg")] + if let Some(fw_cfg_config) = &self.fw_cfg_config { + fw_cfg_config.validate(self)?; + } + Ok(()) } } @@ -924,6 +981,7 @@ pub struct VmConfig { #[serde(default)] pub rng: RngConfig, pub balloon: Option, + pub generic_vhost_user: 
Option>, pub fs: Option>, pub pmem: Option>, #[serde(default = "default_serial")] @@ -1000,6 +1058,12 @@ impl VmConfig { } } + if let Some(generic_vhost_user_configs) = &self.generic_vhost_user { + for generic_vhost_user_config in generic_vhost_user_configs.iter() { + generic_vhost_user_config.apply_landlock(&mut landlock)?; + } + } + if let Some(pmem_configs) = &self.pmem { for pmem_config in pmem_configs.iter() { pmem_config.apply_landlock(&mut landlock)?;