On Mon, 1 Aug 2016 10:14:06 +0800 Zhou Jie <zhoujie2...@cn.fujitsu.com> wrote:
> Hi, Alex > > On 2016/7/30 1:12, Alex Williamson wrote: > > On Tue, 19 Jul 2016 15:32:43 +0800 > > Zhou Jie <zhoujie2...@cn.fujitsu.com> wrote: > > > >> From: Chen Fan <chen.fan.f...@cn.fujitsu.com> > >> > >> During aer err occurs and resume do following to > >> protect device from being accessed. > >> 1. Make config space read only. > >> 2. Disable INTx/MSI Interrupt. > >> 3. Do nothing for bar regions. > >> > >> Signed-off-by: Zhou Jie <zhoujie2...@cn.fujitsu.com> > >> --- > >> drivers/vfio/pci/vfio_pci.c | 30 ++++++++++++++++++++++++++++++ > >> drivers/vfio/pci/vfio_pci_private.h | 2 ++ > >> include/uapi/linux/vfio.h | 2 ++ > >> 3 files changed, 34 insertions(+) > >> > >> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c > >> index 2d12b03..dd96b60 100644 > >> --- a/drivers/vfio/pci/vfio_pci.c > >> +++ b/drivers/vfio/pci/vfio_pci.c > >> @@ -318,6 +318,7 @@ static int vfio_pci_open(void *device_data) > >> return -ENODEV; > >> > >> mutex_lock(&driver_lock); > >> + init_completion(&vdev->aer_error_completion); > >> > >> if (!vdev->refcnt) { > >> ret = vfio_pci_enable(vdev); > >> @@ -571,6 +572,16 @@ static long vfio_pci_ioctl(void *device_data, > >> struct vfio_pci_device *vdev = device_data; > >> unsigned long minsz; > >> > >> + if (vdev->aer_error_in_progress && (cmd == VFIO_DEVICE_SET_IRQS || > >> + cmd == VFIO_DEVICE_RESET || cmd == VFIO_DEVICE_PCI_HOT_RESET)) { > >> + int ret; > >> + ret = wait_for_completion_interruptible( > >> + &vdev->aer_error_completion); > >> + if (ret) { > >> + return ret; > >> + } > > > > No brackets necessary. 
> > > >> + } > >> + > >> if (cmd == VFIO_DEVICE_GET_INFO) { > >> struct vfio_device_info info; > >> > >> @@ -587,6 +598,10 @@ static long vfio_pci_ioctl(void *device_data, > >> if (vdev->reset_works) > >> info.flags |= VFIO_DEVICE_FLAGS_RESET; > >> > >> + info.flags |= VFIO_DEVICE_FLAGS_AERPROCESS; > >> + if (vdev->aer_error_in_progress) > >> + info.flags |= VFIO_DEVICE_FLAGS_INAERPROCESS; > >> + > >> info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions; > >> info.num_irqs = VFIO_PCI_NUM_IRQS; > >> > >> @@ -996,6 +1011,14 @@ static ssize_t vfio_pci_rw(void *device_data, char > >> __user *buf, > >> > >> switch (index) { > >> case VFIO_PCI_CONFIG_REGION_INDEX: > >> + if (vdev->aer_error_in_progress && iswrite) { > >> + int ret; > >> + ret = wait_for_completion_interruptible( > >> + &vdev->aer_error_completion); > >> + if (ret) { > >> + return ret; > >> + } > >> + } > >> return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite); > >> > >> case VFIO_PCI_ROM_REGION_INDEX: > >> @@ -1226,6 +1249,10 @@ static pci_ers_result_t > >> vfio_pci_aer_err_detected(struct pci_dev *pdev, > >> > >> mutex_lock(&vdev->igate); > >> > >> + vdev->aer_error_in_progress = true; > >> + vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE | > >> + VFIO_IRQ_SET_ACTION_TRIGGER, > >> + vdev->irq_type, 0, 0, NULL); > >> if (vdev->err_trigger) > >> eventfd_signal(vdev->err_trigger, 1); > >> > >> @@ -1252,6 +1279,9 @@ static void vfio_pci_aer_resume(struct pci_dev *pdev) > >> } > >> > >> mutex_lock(&vdev->igate); > >> + > >> + vdev->aer_error_in_progress = false; > >> + complete_all(&vdev->aer_error_completion); > >> if (vdev->resume_trigger) > >> eventfd_signal(vdev->resume_trigger, 1); > >> > >> diff --git a/drivers/vfio/pci/vfio_pci_private.h > >> b/drivers/vfio/pci/vfio_pci_private.h > >> index 80d4ddd..2f151f5 100644 > >> --- a/drivers/vfio/pci/vfio_pci_private.h > >> +++ b/drivers/vfio/pci/vfio_pci_private.h > >> @@ -84,6 +84,8 @@ struct vfio_pci_device { > >> bool has_vga; > >> bool 
needs_reset; > >> bool nointx; > >> + bool aer_error_in_progress; > >> + struct completion aer_error_completion; > >> struct pci_saved_state *pci_saved_state; > >> int refcnt; > >> struct eventfd_ctx *err_trigger; > >> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h > >> index 34ab138..276ce50 100644 > >> --- a/include/uapi/linux/vfio.h > >> +++ b/include/uapi/linux/vfio.h > >> @@ -198,6 +198,8 @@ struct vfio_device_info { > >> #define VFIO_DEVICE_FLAGS_PCI (1 << 1) /* vfio-pci device */ > >> #define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2) /* vfio-platform device > >> */ > >> #define VFIO_DEVICE_FLAGS_AMBA (1 << 3) /* vfio-amba device */ > >> +#define VFIO_DEVICE_FLAGS_AERPROCESS (1 << 4) /* support aer error > >> progress */ > >> +#define VFIO_DEVICE_FLAGS_INAERPROCESS (1 << 5)/* status in aer error > >> progress */ > >> __u32 num_regions; /* Max region index + 1 */ > >> __u32 num_irqs; /* Max IRQ index + 1 */ > >> }; > > > > Clearly this has only been tested for a single instance of an AER error > > event and resume per device. Are the things you're intending to block > > actually blocked for subsequent events? Note how complete_all() fills > > the done field to let all current and future waiters go through and > > nowhere is there a call to reinit_completion() to drain that path. > > Thanks, > > > > Alex > > Do you mean this condition? > > For device 1: > error1 occurs ---- error1 resumes > error2 occurs ---- error2 resumes > error3 occurs ---- error3 resumes > > In current code, I do complete_all() when error1 resumes. > And this will unblock the device > when error2 and error3 are still being processed. So walk me through how this works. On vfio_pci_open() we call init_completion(), which sets aer_error_completion.done equal to zero (BTW, a user can open the device file descriptor multiple times, so there's already a bug here). Let's assume that an error occurs and the user stalls a single access on wait_for_completion_interruptible(). 
The bulk of this function happens here: static inline long __sched do_wait_for_common(struct completion *x, long (*action)(long), long timeout, int state) { if (!x->done) { DECLARE_WAITQUEUE(wait, current); __add_wait_queue_tail_exclusive(&x->wait, &wait); do { if (signal_pending_state(state, current)) { timeout = -ERESTARTSYS; break; } __set_current_state(state); spin_unlock_irq(&x->wait.lock); timeout = action(timeout); spin_lock_irq(&x->wait.lock); } while (!x->done && timeout); __remove_wait_queue(&x->wait, &wait); if (!x->done) return timeout; } x->done--; return timeout ?: 1; } So it waits within that do{}while loop for a completion, interruption, or timeout. Then we have: void complete_all(struct completion *x) { unsigned long flags; spin_lock_irqsave(&x->wait.lock, flags); x->done += UINT_MAX/2; __wake_up_locked(&x->wait, TASK_NORMAL, 0); spin_unlock_irqrestore(&x->wait.lock, flags); } So aer_error_completion.done gets incremented to let a couple billion completion waiters through... Show me how another call to wait_for_completion_interruptible() will ever block again within our lifetime when the actual wait of do_wait_for_common() is only entered when 'done' count is equal to zero. This seems to be why reinit_completion() exists, but it's not used here. Thanks, Alex