Add PRE_COPY support for live migration.

This functionality may reduce the downtime upon STOP_COPY by letting
the target machine get some 'initial data' from the source while the
machine is still in its RUNNING state, and prepare itself ahead of
time for the final STOP_COPY data.

Signed-off-by: Yishai Hadas <yish...@nvidia.com>
---
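
As a reference for reviewers, below is a minimal userspace sketch of the
flow this patch enables, built on the generic VFIO migration uAPI from
<linux/vfio.h>. The helper names and the chunk forwarding are illustrative
only and are not part of this patch:

#include <errno.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/vfio.h>

/*
 * Move the device to a new migration state via the FEATURE ioctl; on
 * entry to PRE_COPY the kernel hands back the saving data_fd.
 */
static int set_mig_state(int device_fd, enum vfio_device_mig_state state)
{
	uint8_t buf[sizeof(struct vfio_device_feature) +
		    sizeof(struct vfio_device_feature_mig_state)] = {};
	struct vfio_device_feature *feature = (void *)buf;
	struct vfio_device_feature_mig_state *mig = (void *)feature->data;

	feature->argsz = sizeof(buf);
	feature->flags = VFIO_DEVICE_FEATURE_SET |
			 VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;
	mig->device_state = state;
	if (ioctl(device_fd, VFIO_DEVICE_FEATURE, feature))
		return -errno;
	return mig->data_fd;
}

/* Drain whatever PRE_COPY data is currently available on the saving fd. */
static int drain_precopy(int data_fd)
{
	struct vfio_precopy_info info = { .argsz = sizeof(info) };
	char chunk[4096];
	ssize_t n;

	/* initial_bytes/dirty_bytes hint how much is worth reading now */
	if (ioctl(data_fd, VFIO_MIG_GET_PRECOPY_INFO, &info))
		return -errno;

	while ((n = read(data_fd, chunk, sizeof(chunk))) > 0)
		; /* forward 'chunk' to the target side (elided) */

	if (n < 0 && errno == ENOMSG)
		return 0;	/* temporary end of stream; retry later */
	return n < 0 ? -errno : 0;
}

The expected sequence is set_mig_state(fd, VFIO_DEVICE_STATE_PRE_COPY)
while the guest is still running, repeated drain_precopy() calls, and only
then a transition to STOP_COPY for the final data; a read() failing with
errno == ENOMSG is the temporary end of stream introduced by this patch.
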
 drivers/vfio/pci/virtio/common.h  |   4 +
 drivers/vfio/pci/virtio/migrate.c | 234 +++++++++++++++++++++++++++++-
 2 files changed, 231 insertions(+), 7 deletions(-)

diff --git a/drivers/vfio/pci/virtio/common.h b/drivers/vfio/pci/virtio/common.h
index 3bdfb3ea1174..37796e1d70bc 100644
--- a/drivers/vfio/pci/virtio/common.h
+++ b/drivers/vfio/pci/virtio/common.h
@@ -10,6 +10,8 @@
 
 enum virtiovf_migf_state {
        VIRTIOVF_MIGF_STATE_ERROR = 1,
+       VIRTIOVF_MIGF_STATE_PRECOPY = 2,
+       VIRTIOVF_MIGF_STATE_COMPLETE = 3,
 };
 
 enum virtiovf_load_state {
@@ -57,6 +59,7 @@ struct virtiovf_migration_file {
        /* synchronize access to the file state */
        struct mutex lock;
        loff_t max_pos;
+       u64 pre_copy_initial_bytes;
        u64 record_size;
        u32 record_tag;
        u8 has_obj_id:1;
@@ -90,6 +93,7 @@ struct virtiovf_pci_core_device {
        /* protect migration state */
        struct mutex state_mutex;
        enum vfio_device_mig_state mig_state;
+       u16 num_pre_copy_calls;
        /* protect the reset_done flow */
        spinlock_t reset_lock;
        struct virtiovf_migration_file *resuming_migf;
diff --git a/drivers/vfio/pci/virtio/migrate.c b/drivers/vfio/pci/virtio/migrate.c
index 2a9614c2ef07..5ffcff3425c6 100644
--- a/drivers/vfio/pci/virtio/migrate.c
+++ b/drivers/vfio/pci/virtio/migrate.c
@@ -26,6 +26,12 @@
 /* Initial target buffer size */
 #define VIRTIOVF_TARGET_INITIAL_BUF_SIZE SZ_1M
 
+#define VIRTIOVF_MAX_PRE_COPY_CALLS 128
+
+static int
+virtiovf_read_device_context_chunk(struct virtiovf_migration_file *migf,
+                                  u32 ctx_size);
+
 static struct page *
 virtiovf_get_migration_page(struct virtiovf_data_buffer *buf,
                            unsigned long offset)
@@ -155,6 +161,41 @@ virtiovf_pci_free_obj_id(struct virtiovf_pci_core_device *virtvdev, u32 obj_id)
                        VIRTIO_RESOURCE_OBJ_DEV_PARTS, obj_id);
 }
 
+static struct virtiovf_data_buffer *
+virtiovf_get_data_buffer(struct virtiovf_migration_file *migf, size_t length)
+{
+       struct virtiovf_data_buffer *buf, *temp_buf;
+       struct list_head free_list;
+
+       INIT_LIST_HEAD(&free_list);
+
+       spin_lock_irq(&migf->list_lock);
+       list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) {
+               list_del_init(&buf->buf_elm);
+               if (buf->allocated_length >= length) {
+                       spin_unlock_irq(&migf->list_lock);
+                       goto found;
+               }
+               /*
+                * Prevent holding redundant buffers. Put them on a free
+                * list and free them at the end, outside the spin lock
+                * (&migf->list_lock), to minimize its hold time.
+                */
+               list_add(&buf->buf_elm, &free_list);
+       }
+       spin_unlock_irq(&migf->list_lock);
+       buf = virtiovf_alloc_data_buffer(migf, length);
+
+found:
+       while ((temp_buf = list_first_entry_or_null(&free_list,
+                               struct virtiovf_data_buffer, buf_elm))) {
+               list_del(&temp_buf->buf_elm);
+               virtiovf_free_data_buffer(temp_buf);
+       }
+
+       return buf;
+}
+
 static void virtiovf_clean_migf_resources(struct virtiovf_migration_file *migf)
 {
        struct virtiovf_data_buffer *entry;
@@ -217,6 +258,7 @@ static void virtiovf_state_mutex_unlock(struct virtiovf_pci_core_device *virtvde
                virtvdev->deferred_reset = false;
                spin_unlock(&virtvdev->reset_lock);
                virtvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
+               virtvdev->num_pre_copy_calls = 0;
                virtiovf_disable_fds(virtvdev);
                goto again;
        }
@@ -341,6 +383,7 @@ static ssize_t virtiovf_save_read(struct file *filp, char __user *buf, size_t le
 {
        struct virtiovf_migration_file *migf = filp->private_data;
        struct virtiovf_data_buffer *vhca_buf;
+       bool first_loop_call = true;
        bool end_of_data;
        ssize_t done = 0;
 
@@ -358,6 +401,19 @@ static ssize_t virtiovf_save_read(struct file *filp, char __user *buf, size_t le
                ssize_t count;
 
                vhca_buf = virtiovf_get_data_buff_from_pos(migf, *pos, &end_of_data);
+               if (first_loop_call) {
+                       first_loop_call = false;
+                       /* Temporary end of file as part of PRE_COPY */
+                       if (end_of_data && migf->state == VIRTIOVF_MIGF_STATE_PRECOPY) {
+                               done = -ENOMSG;
+                               goto out_unlock;
+                       }
+                       if (end_of_data && migf->state != VIRTIOVF_MIGF_STATE_COMPLETE) {
+                               done = -EINVAL;
+                               goto out_unlock;
+                       }
+               }
+
                if (end_of_data)
                        goto out_unlock;
 
@@ -379,9 +435,103 @@ static ssize_t virtiovf_save_read(struct file *filp, char __user *buf, size_t le
        return done;
 }
 
+static long virtiovf_precopy_ioctl(struct file *filp, unsigned int cmd,
+                                  unsigned long arg)
+{
+       struct virtiovf_migration_file *migf = filp->private_data;
+       struct virtiovf_pci_core_device *virtvdev = migf->virtvdev;
+       struct vfio_precopy_info info = {};
+       loff_t *pos = &filp->f_pos;
+       bool end_of_data = false;
+       unsigned long minsz;
+       u32 ctx_size;
+       int ret;
+
+       if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
+               return -ENOTTY;
+
+       minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);
+       if (copy_from_user(&info, (void __user *)arg, minsz))
+               return -EFAULT;
+
+       if (info.argsz < minsz)
+               return -EINVAL;
+
+       mutex_lock(&virtvdev->state_mutex);
+       if (virtvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
+           virtvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
+               ret = -EINVAL;
+               goto err_state_unlock;
+       }
+
+       virtvdev->num_pre_copy_calls++;
+       /*
+        * There is no PRE_COPY concept in the virtio spec, so prevent
+        * endless calls that may keep returning the same data.
+        */
+       if (virtvdev->num_pre_copy_calls > VIRTIOVF_MAX_PRE_COPY_CALLS) {
+               ret = 0;
+               goto done;
+       }
+
+       ret = virtio_pci_admin_dev_parts_metadata_get(virtvdev->core_device.pdev,
+                               VIRTIO_RESOURCE_OBJ_DEV_PARTS, migf->obj_id,
+                               VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE,
+                               &ctx_size);
+       if (ret)
+               goto err_state_unlock;
+
+       mutex_lock(&migf->lock);
+       if (migf->state == VIRTIOVF_MIGF_STATE_ERROR) {
+               ret = -ENODEV;
+               goto err_migf_unlock;
+       }
+
+       if (migf->pre_copy_initial_bytes > *pos) {
+               info.initial_bytes = migf->pre_copy_initial_bytes - *pos;
+       } else {
+               info.dirty_bytes = migf->max_pos - *pos;
+               if (!info.dirty_bytes)
+                       end_of_data = true;
+               info.dirty_bytes += ctx_size;
+       }
+
+       if (!end_of_data || !ctx_size) {
+               mutex_unlock(&migf->lock);
+               goto done;
+       }
+
+       mutex_unlock(&migf->lock);
+       /*
+        * We finished transferring the current state, and the device has
+        * further dirty state; read a fresh copy of it.
+        */
+       ret = virtiovf_read_device_context_chunk(migf, ctx_size);
+       if (ret)
+               /*
+                * The machine is running and the context size may still grow,
+                * so there is no reason to mark the device state as
+                * VIRTIOVF_MIGF_STATE_ERROR.
+                */
+               goto err_state_unlock;
+
+done:
+       virtiovf_state_mutex_unlock(virtvdev);
+       if (copy_to_user((void __user *)arg, &info, minsz))
+               return -EFAULT;
+       return 0;
+
+err_migf_unlock:
+       mutex_unlock(&migf->lock);
+err_state_unlock:
+       virtiovf_state_mutex_unlock(virtvdev);
+       return ret;
+}
+
 static const struct file_operations virtiovf_save_fops = {
        .owner = THIS_MODULE,
        .read = virtiovf_save_read,
+       .unlocked_ioctl = virtiovf_precopy_ioctl,
+       .compat_ioctl = compat_ptr_ioctl,
        .release = virtiovf_release_file,
 };
 
@@ -425,7 +575,7 @@ virtiovf_read_device_context_chunk(struct virtiovf_migration_file *migf,
        int nent;
        int ret;
 
-       buf = virtiovf_alloc_data_buffer(migf, ctx_size);
+       buf = virtiovf_get_data_buffer(migf, ctx_size);
        if (IS_ERR(buf))
                return PTR_ERR(buf);
 
@@ -460,7 +610,7 @@ virtiovf_read_device_context_chunk(struct virtiovf_migration_file *migf,
                goto out;
 
        buf->length = res_size;
-       header_buf = virtiovf_alloc_data_buffer(migf,
+       header_buf = virtiovf_get_data_buffer(migf,
                                sizeof(struct virtiovf_migration_header));
        if (IS_ERR(header_buf)) {
                ret = PTR_ERR(header_buf);
@@ -485,8 +635,43 @@ virtiovf_read_device_context_chunk(struct virtiovf_migration_file *migf,
        return ret;
 }
 
+static int
+virtiovf_pci_save_device_final_data(struct virtiovf_pci_core_device *virtvdev)
+{
+       struct virtiovf_migration_file *migf = virtvdev->saving_migf;
+       u32 ctx_size;
+       int ret;
+
+       if (migf->state == VIRTIOVF_MIGF_STATE_ERROR)
+               return -ENODEV;
+
+       ret = virtio_pci_admin_dev_parts_metadata_get(virtvdev->core_device.pdev,
+                               VIRTIO_RESOURCE_OBJ_DEV_PARTS, migf->obj_id,
+                               VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE,
+                               &ctx_size);
+       if (ret)
+               goto err;
+
+       if (!ctx_size) {
+               ret = -EINVAL;
+               goto err;
+       }
+
+       ret = virtiovf_read_device_context_chunk(migf, ctx_size);
+       if (ret)
+               goto err;
+
+       migf->state = VIRTIOVF_MIGF_STATE_COMPLETE;
+       return 0;
+
+err:
+       migf->state = VIRTIOVF_MIGF_STATE_ERROR;
+       return ret;
+}
+
 static struct virtiovf_migration_file *
-virtiovf_pci_save_device_data(struct virtiovf_pci_core_device *virtvdev)
+virtiovf_pci_save_device_data(struct virtiovf_pci_core_device *virtvdev,
+                             bool pre_copy)
 {
        struct virtiovf_migration_file *migf;
        u32 ctx_size;
@@ -536,6 +721,13 @@ virtiovf_pci_save_device_data(struct virtiovf_pci_core_device *virtvdev)
        if (ret)
                goto out_clean;
 
+       if (pre_copy) {
+               migf->pre_copy_initial_bytes = migf->max_pos;
+               migf->state = VIRTIOVF_MIGF_STATE_PRECOPY;
+       } else {
+               migf->state = VIRTIOVF_MIGF_STATE_COMPLETE;
+       }
+
        return migf;
 
 out_clean:
@@ -948,7 +1140,8 @@ virtiovf_pci_step_device_state_locked(struct virtiovf_pci_core_device *virtvdev,
                return NULL;
        }
 
-       if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
+       if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
+           (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
                ret = virtio_pci_admin_mode_set(virtvdev->core_device.pdev,
                                                BIT(VIRTIO_ADMIN_CMD_DEV_MODE_F_STOPPED));
                if (ret)
@@ -956,7 +1149,8 @@ virtiovf_pci_step_device_state_locked(struct virtiovf_pci_core_device *virtvdev,
                return NULL;
        }
 
-       if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) {
+       if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
+           (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
                ret = virtio_pci_admin_mode_set(virtvdev->core_device.pdev, 0);
                if (ret)
                        return ERR_PTR(ret);
@@ -966,7 +1160,7 @@ virtiovf_pci_step_device_state_locked(struct virtiovf_pci_core_device *virtvdev,
        if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
                struct virtiovf_migration_file *migf;
 
-               migf = virtiovf_pci_save_device_data(virtvdev);
+               migf = virtiovf_pci_save_device_data(virtvdev, false);
                if (IS_ERR(migf))
                        return ERR_CAST(migf);
                get_file(migf->filp);
@@ -974,6 +1168,13 @@ virtiovf_pci_step_device_state_locked(struct virtiovf_pci_core_device *virtvdev,
                return migf->filp;
        }
 
+       if ((cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
+           (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
+               virtvdev->num_pre_copy_calls = 0;
+               virtiovf_disable_fds(virtvdev);
+               return NULL;
+       }
+
        if (cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) {
                virtiovf_disable_fds(virtvdev);
                return NULL;
@@ -995,6 +1196,24 @@ virtiovf_pci_step_device_state_locked(struct virtiovf_pci_core_device *virtvdev,
                return NULL;
        }
 
+       if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
+           (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
+            new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
+               struct virtiovf_migration_file *migf;
+
+               migf = virtiovf_pci_save_device_data(virtvdev, true);
+               if (IS_ERR(migf))
+                       return ERR_CAST(migf);
+               get_file(migf->filp);
+               virtvdev->saving_migf = migf;
+               return migf->filp;
+       }
+
+       if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
+               ret = virtiovf_pci_save_device_final_data(virtvdev);
+               return ret ? ERR_PTR(ret) : NULL;
+       }
+
        /*
         * vfio_mig_get_next_state() does not use arcs other than the above
         */
@@ -1098,7 +1317,8 @@ void virtiovf_set_migratable(struct virtiovf_pci_core_device *virtvdev)
        spin_lock_init(&virtvdev->reset_lock);
        virtvdev->core_device.vdev.migration_flags =
                VFIO_MIGRATION_STOP_COPY |
-               VFIO_MIGRATION_P2P;
+               VFIO_MIGRATION_P2P |
+               VFIO_MIGRATION_PRE_COPY;
        virtvdev->core_device.vdev.mig_ops = &virtvdev_pci_mig_ops;
 }
 
-- 
2.27.0
