On 21.10.19 11:51, Sergey Dyasli wrote:
Hello,

While testing pv-shim from a snapshot of staging 4.13 branch (with core-
scheduling patches applied), some sort of scheduling issues were uncovered
which usually leads to a guest lockup (sometimes with soft lockup messages
from Linux kernel).

This happens more frequently on SandyBridge CPUs. After enabling
CONFIG_DEBUG in pv-shim, the following assertions failed:

Null scheduler:

     Assertion 'lock == get_sched_res(i->res->master_cpu)->schedule_lock' 
failed at ...are/xen-dir/xen-root/xen/include/xen/sched-if.h:278
     (full crash log: https://paste.debian.net/1108861/ )

Credit1 scheduler:

     Assertion 'cpumask_cycle(cpu, unit->cpu_hard_affinity) == cpu' failed at 
sched_credit.c:383
     (full crash log: https://paste.debian.net/1108862/ )

I'm currently investigation those, but would appreciate any help or
suggestions.

Hmm, I think that pv_shim_cpu_up() shouldn't do the vcpu_wake(), but the
caller.

Does the attached patch help?


Juergen
>From 068ea0419d1a67e967b9431ed11e24b731cd357f Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgr...@suse.com>
Date: Mon, 21 Oct 2019 12:28:33 +0200
Subject: [PATCH] xen/shim: fix pv-shim cpu up/down

Calling vcpu_wake() from pv_shim_cpu_up() is wrong as it is not yet
sure that the correct scheduler has taken over the cpu. Doing the
call after continue_hypercall_on_cpu() is correct, so let
pv_shim_cpu_up() just online the cpu.

Adapt pv_shim_cpu_down() to be symmetric, even if that is not
required for correctness.

Reported-by: Sergey Dyasli <sergey.dya...@citrix.com>
Signed-off-by: Juergen Gross <jgr...@suse.com>
---
 xen/arch/x86/pv/shim.c | 16 ----------------
 xen/common/domain.c    | 31 ++++++++++++++++++-------------
 2 files changed, 18 insertions(+), 29 deletions(-)

diff --git a/xen/arch/x86/pv/shim.c b/xen/arch/x86/pv/shim.c
index 5edbcd9ac5..bc6a2f3eb9 100644
--- a/xen/arch/x86/pv/shim.c
+++ b/xen/arch/x86/pv/shim.c
@@ -815,16 +815,11 @@ long pv_shim_cpu_up(void *data)
 {
     struct vcpu *v = data;
     struct domain *d = v->domain;
-    bool wake;
 
     BUG_ON(smp_processor_id() != 0);
 
-    domain_lock(d);
     if ( !v->is_initialised )
-    {
         domain_unlock(d);
-        return -EINVAL;
-    }
 
     if ( !cpu_online(v->vcpu_id) )
     {
@@ -832,18 +827,12 @@ long pv_shim_cpu_up(void *data)
 
         if ( rc )
         {
-            domain_unlock(d);
             gprintk(XENLOG_ERR, "Failed to bring up CPU#%u: %ld\n",
                     v->vcpu_id, rc);
             return rc;
         }
     }
 
-    wake = test_and_clear_bit(_VPF_down, &v->pause_flags);
-    domain_unlock(d);
-    if ( wake )
-        vcpu_wake(v);
-
     return 0;
 }
 
@@ -852,11 +841,6 @@ long pv_shim_cpu_down(void *data)
     struct vcpu *v = data;
     long rc;
 
-    BUG_ON(smp_processor_id() != 0);
-
-    if ( !test_and_set_bit(_VPF_down, &v->pause_flags) )
-        vcpu_sleep_sync(v);
-
     if ( cpu_online(v->vcpu_id) )
     {
         rc = cpu_down_helper((void *)(unsigned long)v->vcpu_id);
diff --git a/xen/common/domain.c b/xen/common/domain.c
index 9c7360ed2a..e83657e310 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -1428,19 +1428,21 @@ long do_vcpu_op(int cmd, unsigned int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg)
         break;
 
     case VCPUOP_up:
-#ifdef CONFIG_X86
-        if ( pv_shim )
-            rc = continue_hypercall_on_cpu(0, pv_shim_cpu_up, v);
-        else
-#endif
         {
             bool wake = false;
 
             domain_lock(d);
-            if ( !v->is_initialised )
-                rc = -EINVAL;
-            else
-                wake = test_and_clear_bit(_VPF_down, &v->pause_flags);
+#ifdef CONFIG_X86
+            if ( pv_shim )
+                rc = continue_hypercall_on_cpu(0, pv_shim_cpu_up, v);
+#endif
+            if ( !rc )
+            {
+                if ( !v->is_initialised )
+                    rc = -EINVAL;
+                else
+                    wake = test_and_clear_bit(_VPF_down, &v->pause_flags);
+            }
             domain_unlock(d);
             if ( wake )
                 vcpu_wake(v);
@@ -1465,14 +1467,17 @@ long do_vcpu_op(int cmd, unsigned int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg)
         rc = 0;
         v = d->vcpu[vcpuid];
 
+#ifdef CONFIG_X86
+        BUG_ON(pv_shim && smp_processor_id() != 0);
+#endif
+
+        if ( !test_and_set_bit(_VPF_down, &v->pause_flags) )
+            vcpu_sleep_nosync(v);
+
 #ifdef CONFIG_X86
         if ( pv_shim )
             rc = continue_hypercall_on_cpu(0, pv_shim_cpu_down, v);
-        else
 #endif
-            if ( !test_and_set_bit(_VPF_down, &v->pause_flags) )
-                vcpu_sleep_nosync(v);
-
         break;
 
     case VCPUOP_is_up:
-- 
2.16.4

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Reply via email to