qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH V3 07/42] vfio/container: preserve descriptors


From: Cédric Le Goater
Subject: Re: [PATCH V3 07/42] vfio/container: preserve descriptors
Date: Thu, 15 May 2025 14:59:04 +0200
User-agent: Mozilla Thunderbird

On 5/12/25 17:32, Steve Sistare wrote:
At vfio creation time, save the value of vfio container, group, and device
descriptors in CPR state.  On qemu restart, vfio_realize() finds and uses
the saved descriptors, and remembers the reused status for subsequent
patches.  The reused status is cleared when vmstate load finishes.

During reuse, device and iommu state is already configured, so operations
in vfio_realize that would modify the configuration, such as vfio ioctl's,
are skipped.  The result is that vfio_realize constructs qemu data
structures that reflect the current state of the device.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
---
  hw/vfio/container.c           | 65 ++++++++++++++++++++++++++++++++++++-------
  hw/vfio/cpr-legacy.c          | 46 ++++++++++++++++++++++++++++++
  include/hw/vfio/vfio-cpr.h    |  9 ++++++
  include/hw/vfio/vfio-device.h |  2 ++
  4 files changed, 112 insertions(+), 10 deletions(-)

diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 85c76da..278a220 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -31,6 +31,8 @@
  #include "system/reset.h"
  #include "trace.h"
  #include "qapi/error.h"
+#include "migration/cpr.h"
+#include "migration/blocker.h"
  #include "pci.h"
  #include "hw/vfio/vfio-container.h"
  #include "hw/vfio/vfio-cpr.h"
@@ -414,7 +416,7 @@ static bool vfio_set_iommu(int container_fd, int group_fd,
  }
static VFIOContainer *vfio_create_container(int fd, VFIOGroup *group,
-                                            Error **errp)
+                                            bool cpr_reused, Error **errp)
  {
      int iommu_type;
      const char *vioc_name;
@@ -425,7 +427,11 @@ static VFIOContainer *vfio_create_container(int fd, 
VFIOGroup *group,
          return NULL;
      }
- if (!vfio_set_iommu(fd, group->fd, &iommu_type, errp)) {
+    /*
+     * If container is reused, just set its type and skip the ioctls, as the
+     * container and group are already configured in the kernel.
+     */
+    if (!cpr_reused && !vfio_set_iommu(fd, group->fd, &iommu_type, errp)) {
          return NULL;
      }
@@ -433,6 +439,7 @@ static VFIOContainer *vfio_create_container(int fd, VFIOGroup *group, container = VFIO_IOMMU_LEGACY(object_new(vioc_name));
      container->fd = fd;
+    container->cpr.reused = cpr_reused;
      container->iommu_type = iommu_type;
      return container;
  }
@@ -584,7 +591,7 @@ static bool 
vfio_container_attach_discard_disable(VFIOContainer *container,
  }
static bool vfio_container_group_add(VFIOContainer *container, VFIOGroup *group,
-                                     Error **errp)
+                                     bool cpr_reused, Error **errp)
  {
      if (!vfio_container_attach_discard_disable(container, group, errp)) {
          return false;
@@ -592,6 +599,9 @@ static bool vfio_container_group_add(VFIOContainer 
*container, VFIOGroup *group,
      group->container = container;
      QLIST_INSERT_HEAD(&container->group_list, group, container_next);
      vfio_group_add_kvm_device(group);
+    if (!cpr_reused) {
+        cpr_save_fd("vfio_container_for_group", group->groupid, container->fd);
+    }

Could we avoid the test on cpr_reused always call cpr_save_fd() ?

      return true;
  }
@@ -601,6 +611,7 @@ static void vfio_container_group_del(VFIOContainer *container, VFIOGroup *group)
      group->container = NULL;
      vfio_group_del_kvm_device(group);
      vfio_ram_block_discard_disable(container, false);
+    cpr_delete_fd("vfio_container_for_group", group->groupid);
  }
static bool vfio_container_connect(VFIOGroup *group, AddressSpace *as,
@@ -613,17 +624,37 @@ static bool vfio_container_connect(VFIOGroup *group, 
AddressSpace *as,
      VFIOIOMMUClass *vioc = NULL;
      bool new_container = false;
      bool group_was_added = false;
+    bool cpr_reused;
space = vfio_address_space_get(as);
+    fd = cpr_find_fd("vfio_container_for_group", group->groupid);
+    cpr_reused = (fd > 0);


The code above is doing 2 things : it grabs a restored fd and
deduces from the fd value that the VM is doing are doing a CPR
reboot.

Instead of adding this cpr_reused flag, I would prefer to duplicate
the code into something like:

if (!cpr_reboot) {
   QLIST_FOREACH(bcontainer, &space->containers, next) {
        container = container_of(bcontainer, VFIOContainer, bcontainer);
        if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
            return vfio_container_group_add(container, group, errp);
        }
    }

    fd = qemu_open("/dev/vfio/vfio", O_RDWR, errp);
    if (fd < 0) {
        goto fail;
    }

    ret = ioctl(fd, VFIO_GET_API_VERSION);
    if (ret != VFIO_API_VERSION) {
        error_setg(errp, "supported vfio version: %d, "
                   "reported version: %d", VFIO_API_VERSION, ret);
        goto fail;
    }

    container = vfio_create_container(fd, group, errp);
} else {
   /* ... */
}



+    /*
+     * If the container is reused, then the group is already attached in the
+     * kernel.  If a container with matching fd is found, then update the
+     * userland group list and return.  If not, then after the loop, create
+     * the container struct and group list.
+     */
QLIST_FOREACH(bcontainer, &space->containers, next) {
          container = container_of(bcontainer, VFIOContainer, bcontainer);
-        if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
-            return vfio_container_group_add(container, group, errp);
+
+        if (cpr_reused) {
+            if (!vfio_cpr_container_match(container, group, &fd)) {

why do we need to modify fd ?

+                continue;
+            }
+        } else if (ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) 
{
+            continue;
          }
+        return vfio_container_group_add(container, group, cpr_reused, errp);
+    }
+
+    if (!cpr_reused) {
+        fd = qemu_open("/dev/vfio/vfio", O_RDWR, errp);
      }
- fd = qemu_open("/dev/vfio/vfio", O_RDWR, errp);
      if (fd < 0) {>           goto fail;
      }
@@ -635,7 +666,7 @@ static bool vfio_container_connect(VFIOGroup *group, 
AddressSpace *as,
          goto fail;
      }
- container = vfio_create_container(fd, group, errp);
+    container = vfio_create_container(fd, group, cpr_reused, errp);
      if (!container) {
          goto fail;
      }
@@ -655,7 +686,7 @@ static bool vfio_container_connect(VFIOGroup *group, 
AddressSpace *as,
vfio_address_space_insert(space, bcontainer); - if (!vfio_container_group_add(container, group, errp)) {
+    if (!vfio_container_group_add(container, group, cpr_reused, errp)) {
          goto fail;
      }
      group_was_added = true;
@@ -697,6 +728,7 @@ static void vfio_container_disconnect(VFIOGroup *group)
QLIST_REMOVE(group, container_next);
      group->container = NULL;
+    cpr_delete_fd("vfio_container_for_group", group->groupid);
/*
       * Explicitly release the listener first before unset container,
@@ -750,7 +782,7 @@ static VFIOGroup *vfio_group_get(int groupid, AddressSpace 
*as, Error **errp)
      group = g_malloc0(sizeof(*group));
snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
-    group->fd = qemu_open(path, O_RDWR, errp);
+    group->fd = cpr_open_fd(path, O_RDWR, "vfio_group", groupid, NULL, errp);
      if (group->fd < 0) {
          goto free_group_exit;
      }
@@ -782,6 +814,7 @@ static VFIOGroup *vfio_group_get(int groupid, AddressSpace 
*as, Error **errp)
      return group;
close_fd_exit:
+    cpr_delete_fd("vfio_group", groupid);
      close(group->fd);
free_group_exit:
@@ -803,6 +836,7 @@ static void vfio_group_put(VFIOGroup *group)
      vfio_container_disconnect(group);
      QLIST_REMOVE(group, next);
      trace_vfio_group_put(group->fd);
+    cpr_delete_fd("vfio_group", group->groupid);
      close(group->fd);
      g_free(group);
  }
@@ -812,8 +846,14 @@ static bool vfio_device_get(VFIOGroup *group, const char 
*name,
  {
      g_autofree struct vfio_device_info *info = NULL;
      int fd;
+    bool cpr_reused;
+
+    fd = cpr_find_fd(name, 0);
+    cpr_reused = (fd >= 0);
+    if (!cpr_reused) {
+        fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
+    }

Could we introduce an helper routine to open this file,  like we have
cpr_open_fd() ?


-    fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
      if (fd < 0) {
          error_setg_errno(errp, errno, "error getting device from group %d",
                           group->groupid);
@@ -857,6 +897,10 @@ static bool vfio_device_get(VFIOGroup *group, const char 
*name,
      vbasedev->group = group;
      QLIST_INSERT_HEAD(&group->device_list, vbasedev, next);
+ vbasedev->cpr.reused = cpr_reused;
+    if (!cpr_reused) {
+        cpr_save_fd(name, 0, fd);

Could we avoid the test on cpr_reused always call cpr_save_fd() ?

+    }
      trace_vfio_device_get(name, info->flags, info->num_regions, 
info->num_irqs);
return true;
@@ -870,6 +914,7 @@ static void vfio_device_put(VFIODevice *vbasedev)
      QLIST_REMOVE(vbasedev, next);
      vbasedev->group = NULL;
      trace_vfio_device_put(vbasedev->fd);
+    cpr_delete_fd(vbasedev->name, 0);
      close(vbasedev->fd);
  }
diff --git a/hw/vfio/cpr-legacy.c b/hw/vfio/cpr-legacy.c
index fac323c..638a8e0 100644
--- a/hw/vfio/cpr-legacy.c
+++ b/hw/vfio/cpr-legacy.c
@@ -10,6 +10,7 @@
  #include "qemu/osdep.h"
  #include "hw/vfio/vfio-container.h"
  #include "hw/vfio/vfio-cpr.h"
+#include "hw/vfio/vfio-device.h"
  #include "migration/blocker.h"
  #include "migration/cpr.h"
  #include "migration/migration.h"
@@ -31,10 +32,27 @@ static bool vfio_cpr_supported(VFIOContainer *container, 
Error **errp)
      }
  }
+static int vfio_container_post_load(void *opaque, int version_id)
+{
+    VFIOContainer *container = opaque;
+    VFIOGroup *group;
+    VFIODevice *vbasedev;
+
+    container->cpr.reused = false;
+
+    QLIST_FOREACH(group, &container->group_list, container_next) {
+        QLIST_FOREACH(vbasedev, &group->device_list, next) {
+            vbasedev->cpr.reused = false;
+        }
+    }
+    return 0;
+}
+
  static const VMStateDescription vfio_container_vmstate = {
      .name = "vfio-container",
      .version_id = 0,
      .minimum_version_id = 0,
+    .post_load = vfio_container_post_load,
      .needed = cpr_needed_for_reuse,
      .fields = (VMStateField[]) {
          VMSTATE_END_OF_LIST()
@@ -68,3 +86,31 @@ void vfio_legacy_cpr_unregister_container(VFIOContainer 
*container)
      migrate_del_blocker(&container->cpr.blocker);
      vmstate_unregister(NULL, &vfio_container_vmstate, container);
  }
+
+static bool same_device(int fd1, int fd2)
+{
+    struct stat st1, st2;
+
+    return !fstat(fd1, &st1) && !fstat(fd2, &st2) && st1.st_dev == st2.st_dev;
+}
+
+bool vfio_cpr_container_match(VFIOContainer *container, VFIOGroup *group,
+                              int *pfd)
+{
+    if (container->fd == *pfd) {
+        return true;
+    }
+    if (!same_device(container->fd, *pfd)) {
+        return false;
+    }
+    /*
+     * Same device, different fd.  This occurs when the container fd is
+     * cpr_save'd multiple times, once for each groupid, so SCM_RIGHTS
+     * produces duplicates.  De-dup it.
+     */
+    cpr_delete_fd("vfio_container_for_group", group->groupid);
+    close(*pfd);
+    cpr_save_fd("vfio_container_for_group", group->groupid, container->fd);
+    *pfd = container->fd;

I am not sure 'pfd' is used afterwards. Is it ?

+    return true;
+}
diff --git a/include/hw/vfio/vfio-cpr.h b/include/hw/vfio/vfio-cpr.h
index f864547..1c4f070 100644
--- a/include/hw/vfio/vfio-cpr.h
+++ b/include/hw/vfio/vfio-cpr.h
@@ -13,10 +13,16 @@
typedef struct VFIOContainerCPR {
      Error *blocker;
+    bool reused;
  } VFIOContainerCPR;
+typedef struct VFIODeviceCPR {
+    bool reused;
+} VFIODeviceCPR;
+
  struct VFIOContainer;
  struct VFIOContainerBase;
+struct VFIOGroup;
bool vfio_legacy_cpr_register_container(struct VFIOContainer *container,
                                          Error **errp);
@@ -29,4 +35,7 @@ bool vfio_cpr_register_container(struct VFIOContainerBase 
*bcontainer,
                                   Error **errp);
  void vfio_cpr_unregister_container(struct VFIOContainerBase *bcontainer);
+bool vfio_cpr_container_match(struct VFIOContainer *container,
+                              struct VFIOGroup *group, int *fd);
+
  #endif /* HW_VFIO_VFIO_CPR_H */
diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h
index 8bcb3c1..4e4d0b6 100644
--- a/include/hw/vfio/vfio-device.h
+++ b/include/hw/vfio/vfio-device.h
@@ -28,6 +28,7 @@
  #endif
  #include "system/system.h"
  #include "hw/vfio/vfio-container-base.h"
+#include "hw/vfio/vfio-cpr.h"
  #include "system/host_iommu_device.h"
  #include "system/iommufd.h"
@@ -84,6 +85,7 @@ typedef struct VFIODevice {
      VFIOIOASHwpt *hwpt;
      QLIST_ENTRY(VFIODevice) hwpt_next;
      struct vfio_region_info **reginfo;
+    VFIODeviceCPR cpr;
  } VFIODevice;
struct VFIODeviceOps {


Thanks,

C.





reply via email to

[Prev in Thread] Current Thread [Next in Thread]