From: Michael Kelley <mhkli...@outlook.com>

vmbus_irq_set_affinity() may issue a MODIFYCHANNEL request to Hyper-V to
target a VMBus channel interrupt to a different CPU. While newer versions
of Hyper-V send a response to the guest when the change is complete,
vmbus_irq_set_affinity() does not wait for the response because it is
running with interrupts disabled. So Hyper-V may continue to direct
interrupts to the old CPU for a short window after vmbus_irq_set_affinity()
completes. This lag is not a problem during normal operation. But if
the old CPU is taken offline during that window, Hyper-V may drop
the interrupt because the synic in the target CPU is disabled. Dropping
the interrupt may cause the VMBus channel to hang.

To prevent this, wait for in-progress MODIFYCHANNEL requests when taking
a CPU offline. On newer versions of Hyper-V, completion can be confirmed
by waiting for the response sent by Hyper-V. But on older versions of
Hyper-V that don't send a response, wait a fixed interval of time that
empirically should be "long enough", as that's the best that can be done.

Signed-off-by: Michael Kelley <mhkli...@outlook.com>
---
 drivers/hv/channel.c      |  3 ++
 drivers/hv/channel_mgmt.c | 32 ++++--------------
 drivers/hv/hv.c           | 70 +++++++++++++++++++++++++++++++++++----
 drivers/hv/hyperv_vmbus.h |  8 +++++
 4 files changed, 81 insertions(+), 32 deletions(-)

diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
index b7920072e243..b7ee95373049 100644
--- a/drivers/hv/channel.c
+++ b/drivers/hv/channel.c
@@ -246,6 +246,9 @@ int vmbus_send_modifychannel(struct vmbus_channel *channel, u32 target_vp)
        ret = vmbus_post_msg(&msg, sizeof(msg), false);
        trace_vmbus_send_modifychannel(&msg, ret);
 
+       if (!ret)
+               vmbus_connection.modchan_sent++;
+
        return ret;
 }
 EXPORT_SYMBOL_GPL(vmbus_send_modifychannel);
diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index da42aaae994e..960a2f0367d8 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -1465,40 +1465,20 @@ static void vmbus_ongpadl_created(struct vmbus_channel_message_header *hdr)
  * vmbus_onmodifychannel_response - Modify Channel response handler.
  *
  * This is invoked when we received a response to our channel modify request.
- * Find the matching request, copy the response and signal the requesting thread.
+ * Increment the count of responses received. No locking is needed because
+ * responses are always received on the VMBUS_CONNECT_CPU.
  */
 static void vmbus_onmodifychannel_response(struct vmbus_channel_message_header *hdr)
 {
        struct vmbus_channel_modifychannel_response *response;
-       struct vmbus_channel_msginfo *msginfo;
-       unsigned long flags;
 
        response = (struct vmbus_channel_modifychannel_response *)hdr;
+       if (response->status)
+               pr_err("Error status %x in MODIFYCHANNEL response for relid %d\n",
+                       response->status, response->child_relid);
+       vmbus_connection.modchan_completed++;
 
        trace_vmbus_onmodifychannel_response(response);
-
-       /*
-        * Find the modify msg, copy the response and signal/unblock the wait event.
-        */
-       spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
-
-       list_for_each_entry(msginfo, &vmbus_connection.chn_msg_list, msglistentry) {
-               struct vmbus_channel_message_header *responseheader =
-                               (struct vmbus_channel_message_header *)msginfo->msg;
-
-               if (responseheader->msgtype == CHANNELMSG_MODIFYCHANNEL) {
-                       struct vmbus_channel_modifychannel *modifymsg;
-
-                       modifymsg = (struct vmbus_channel_modifychannel *)msginfo->msg;
-                       if (modifymsg->child_relid == response->child_relid) {
-                               memcpy(&msginfo->response.modify_response, response,
-                                      sizeof(*response));
-                               complete(&msginfo->waitevent);
-                               break;
-                       }
-               }
-       }
-       spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
 }
 
 /*
diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
index a8ad728354cb..76658dfc5008 100644
--- a/drivers/hv/hv.c
+++ b/drivers/hv/hv.c
@@ -401,6 +401,56 @@ void hv_synic_disable_regs(unsigned int cpu)
                disable_percpu_irq(vmbus_irq);
 }
 
+/* Give Hyper-V time to finish outstanding MODIFYCHANNEL requests. */
+static void hv_synic_wait_for_modifychannel(int cpu)
+{
+       int i = 5;
+       u64 base;
+
+       /*
+        * On VMBus versions where MODIFYCHANNEL doesn't send acks, just sleep
+        * for 20 milliseconds and hope that gives Hyper-V enough time to
+        * process them. Empirical data on recent server-class CPUs (both x86
+        * and arm64) shows that the Hyper-V response is typically received
+        * and processed in the guest within a few hundred microseconds. The
+        * 20 millisecond wait is somewhat arbitrary and intended to give
+        * plenty of time in case there are multiple MODIFYCHANNEL requests
+        * in progress and the host is busy. It's the best we can do.
+        */
+       if (vmbus_proto_version < VERSION_WIN10_V5_3) {
+               usleep_range(20000, 25000);
+               return;
+       }
+
+       /*
+        * Otherwise compare the current value of modchan_completed against
+        * modchan_sent. If some MODIFYCHANNEL requests have been sent that
+        * haven't completed, sleep 5 milliseconds and check again. If they
+        * still haven't completed after 5 sleeps, output a message and
+        * proceed anyway. Re-read the counts after the loop so a completion
+        * that arrives during the last sleep isn't reported as a timeout.
+        *
+        * Hyper-V guarantees to process MODIFYCHANNEL requests in the order
+        * they are received from the guest, so simply comparing the counts
+        * is sufficient.
+        *
+        * Note that this check may encompass MODIFYCHANNEL requests unrelated
+        * to the CPU going offline. But the only effect is to potentially wait
+        * a little longer than necessary. CPUs going offline and affinity
+        * changes that result in MODIFYCHANNEL are relatively rare and it's
+        * not worth the complexity to track them more precisely.
+        */
+       base = READ_ONCE(vmbus_connection.modchan_sent);
+       while (READ_ONCE(vmbus_connection.modchan_completed) < base && i) {
+               usleep_range(5000, 10000);
+               i--;
+       }
+
+       if (READ_ONCE(vmbus_connection.modchan_completed) < base)
+               pr_err("Timed out waiting for MODIFYCHANNEL. CPU %d sent %llu completed %llu\n",
+                       cpu, base, READ_ONCE(vmbus_connection.modchan_completed));
+}
+
 #define HV_MAX_TRIES 3
 /*
  * Scan the event flags page of 'this' CPU looking for any bit that is set.  If we find one
@@ -485,13 +535,21 @@ int hv_synic_cleanup(unsigned int cpu)
        /*
         * channel_found == false means that any channels that were previously
         * assigned to the CPU have been reassigned elsewhere with a call of
-        * vmbus_send_modifychannel().  Scan the event flags page looking for
-        * bits that are set and waiting with a timeout for vmbus_chan_sched()
-        * to process such bits.  If bits are still set after this operation
-        * and VMBus is connected, fail the CPU offlining operation.
+        * vmbus_send_modifychannel(). First wait until any MODIFYCHANNEL
+        * requests have been completed by Hyper-V, after which we know that
+        * no new bits in the event flags will be set. Then scan the event flags
+        * page looking for bits that are set and waiting with a timeout for
+        * vmbus_chan_sched() to process such bits.  If bits are still set
+        * after this operation, fail the CPU offlining operation.
         */
-       if (vmbus_proto_version >= VERSION_WIN10_V4_1 && hv_synic_event_pending())
-               return -EBUSY;
+       if (vmbus_proto_version >= VERSION_WIN10_V4_1) {
+               hv_synic_wait_for_modifychannel(cpu);
+               if (hv_synic_event_pending()) {
+                       pr_err("Events pending when trying to offline CPU %d\n",
+                                       cpu);
+                       return -EBUSY;
+               }
+       }
 
 always_cleanup:
        hv_stimer_legacy_cleanup(cpu);
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index bf35bb40c55e..571b2955b38e 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -264,6 +264,14 @@ struct vmbus_connection {
        struct irq_domain *vmbus_irq_domain;
        struct irq_chip vmbus_irq_chip;
 
+       /*
+        * VM-wide counts of MODIFYCHANNEL messages sent and completed.
+        * Used when taking a CPU offline to make sure the relevant
+        * MODIFYCHANNEL messages have been completed.
+        */
+       u64 modchan_sent;
+       u64 modchan_completed;
+
        /*
         * An offer message is handled first on the work_queue, and then
         * is further handled on handle_primary_chan_wq or
-- 
2.25.1


Reply via email to