On Fri, Nov 14, 2014 at 05:59:23PM +0100, Sander Eikelenboom wrote:
> 
> Friday, November 14, 2014, 4:43:58 PM, you wrote:
> 
> >>>> On 14.11.14 at 16:20, <li...@eikelenboom.it> wrote:
> >> If it still helps i could try Andrews suggestion and try out with only 
> >> commit aeeea485 ..
> 
> > Yes, even if it's pretty certain it's the second of the commits, verifying
> > this would be helpful (or if the assumption is wrong, the pattern it's
> > dying with would change and hence perhaps provide further clues).
> 
> > Jan
> 
> 
> Ok with a revert of f6dd295 .. it survived cooking and eating a nice bowl of 
> pasta without a panic. So it would probably be indeed that specific commit.

Could you try running with these two patches while you enjoy an beer in the 
evening?

> 
> --
> Sander
> 
>From b8c267ea34663a9585e1aee9c09dede240b546f9 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.w...@oracle.com>
Date: Fri, 14 Nov 2014 12:15:26 -0500
Subject: [PATCH 1/2] dpci: Add ZOMBIE state.

Signed-off-by: Konrad Rzeszutek Wilk <konrad.w...@oracle.com>
---
 xen/drivers/passthrough/io.c | 29 ++++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/xen/drivers/passthrough/io.c b/xen/drivers/passthrough/io.c
index efc66dc..8e9141e 100644
--- a/xen/drivers/passthrough/io.c
+++ b/xen/drivers/passthrough/io.c
@@ -50,20 +50,25 @@ static DEFINE_PER_CPU(struct list_head, dpci_list);
 
 enum {
     STATE_SCHED,
-    STATE_RUN
+    STATE_RUN,
+    STATE_ZOMBIE
 };
 
 /*
  * This can be called multiple times, but the softirq is only raised once.
- * That is until the STATE_SCHED state has been cleared. The state can be
- * cleared by: the 'dpci_softirq' (when it has executed 'hvm_dirq_assist'),
- * or by 'pt_pirq_softirq_reset' (which will try to clear the state before
- * the softirq had a chance to run).
+ * That is until the STATE_SCHED and STATE_ZOMBIE state has been cleared. The
+ * STATE_SCHED and STATE_ZOMBIE state can be cleared by the 'dpci_softirq'
+ * (when it has executed 'hvm_dirq_assist'). The STATE_SCHED can be cleared
+ * by 'pt_pirq_softirq_reset' (which will try to clear the state before the
+ * softirq had a chance to run).
  */
 static void raise_softirq_for(struct hvm_pirq_dpci *pirq_dpci)
 {
     unsigned long flags;
 
+    if ( test_bit(STATE_ZOMBIE, &pirq_dpci->state) )
+        return;
+
     if ( test_and_set_bit(STATE_SCHED, &pirq_dpci->state) )
         return;
 
@@ -85,7 +90,7 @@ static void raise_softirq_for(struct hvm_pirq_dpci *pirq_dpci)
  */
 bool_t pt_pirq_softirq_active(struct hvm_pirq_dpci *pirq_dpci)
 {
-    if ( pirq_dpci->state & ((1 << STATE_RUN) | (1 << STATE_SCHED)) )
+    if ( pirq_dpci->state & ((1 << STATE_RUN) | (1 << STATE_SCHED) | (1 << 
STATE_ZOMBIE) ) )
         return 1;
 
     /*
@@ -109,7 +114,7 @@ static void pt_pirq_softirq_reset(struct hvm_pirq_dpci 
*pirq_dpci)
 
     ASSERT(spin_is_locked(&d->event_lock));
 
-    switch ( cmpxchg(&pirq_dpci->state, 1 << STATE_SCHED, 0) )
+    switch ( cmpxchg(&pirq_dpci->state, 1 << STATE_SCHED, 1 << STATE_ZOMBIE ) )
     {
     case (1 << STATE_SCHED):
         /*
@@ -120,6 +125,7 @@ static void pt_pirq_softirq_reset(struct hvm_pirq_dpci 
*pirq_dpci)
         /* fallthrough. */
     case (1 << STATE_RUN):
     case (1 << STATE_RUN) | (1 << STATE_SCHED):
+    case (1 << STATE_RUN) | (1 << STATE_SCHED) | (1 << STATE_ZOMBIE):
         /*
          * The reason it is OK to reset 'dom' when STATE_RUN bit is set is due
          * to a shortcut the 'dpci_softirq' implements. It stashes the 'dom'
@@ -779,6 +785,7 @@ unlock:
 static void dpci_softirq(void)
 {
     unsigned int cpu = smp_processor_id();
+    unsigned int reset = 0;
     LIST_HEAD(our_list);
 
     local_irq_disable();
@@ -805,7 +812,15 @@ static void dpci_softirq(void)
             hvm_dirq_assist(d, pirq_dpci);
             put_domain(d);
         }
+        else
+            reset = 1;
+
         clear_bit(STATE_RUN, &pirq_dpci->state);
+        if ( reset )
+        {
+            clear_bit(STATE_ZOMBIE, &pirq_dpci->state);
+            reset = 0;
+        }
     }
 }
 
-- 
1.9.3

>From a4171fa12583eabd126bc5b4c305f49b2fb2b515 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.w...@oracle.com>
Date: Fri, 14 Nov 2014 15:00:39 -0500
Subject: [PATCH 2/2] debug:

Signed-off-by: Konrad Rzeszutek Wilk <konrad.w...@oracle.com>
---
 xen/drivers/passthrough/io.c | 180 ++++++++++++++++++++++++++++++++++++++++++-
 xen/include/xen/hvm/irq.h    |   2 +
 2 files changed, 178 insertions(+), 4 deletions(-)

diff --git a/xen/drivers/passthrough/io.c b/xen/drivers/passthrough/io.c
index 8e9141e..23e5ed1 100644
--- a/xen/drivers/passthrough/io.c
+++ b/xen/drivers/passthrough/io.c
@@ -54,6 +54,117 @@ enum {
     STATE_ZOMBIE
 };
 
+struct _debug_f {
+    unsigned int domid;
+    unsigned long count;
+    s_time_t last;
+    struct list_head list;
+    unsigned int state;
+    struct hvm_pirq_dpci *dpci;
+};
+
+struct __debug {
+    struct _debug_f ok;
+    struct _debug_f poison;
+    struct _debug_f raise;
+    struct _debug_f reset;
+    struct _debug_f zombie_softirq;
+    struct _debug_f zombie_raise;
+};
+
+static DEFINE_PER_CPU(struct __debug, _d);
+
+void _record(struct _debug_f *d, struct hvm_pirq_dpci *pirq_dpci)
+{
+    if (pirq_dpci->dom)
+        d->domid = pirq_dpci->dom->domain_id;
+    else
+        d->domid = 0xdead;
+    d->count ++;
+    d->last = NOW();
+    d->list.next = pirq_dpci->softirq_list.next;
+    d->list.prev = pirq_dpci->softirq_list.prev;
+    d->state = pirq_dpci->state;
+    d->dpci = pirq_dpci;
+}
+
+enum {
+    Z_SOFTIRQ,
+    Z_RAISE,
+    ERR_POISON,
+    OK_SOFTIRQ,
+    OK_RAISE,
+    OK_RESET,
+};
+
+static void dump_record(struct _debug_f *d, unsigned int type)
+{
+    static const char *const names[] = {
+        [Z_SOFTIRQ]  = "Z-softirq ",
+        [Z_RAISE]    = "Z-raise   ",
+        [ERR_POISON] = "ERR-poison",
+        [OK_SOFTIRQ] = "OK-softirq",
+        [OK_RAISE]   = "OK-raise  ",
+        [OK_RESET]   = "OK-reset  ",
+    };
+#define LONG(x) [_HVM_IRQ_DPCI_##x] = #x
+    static const char *const names_flag[] = {
+        LONG(MACH_PCI_SHIFT),
+        LONG(MAPPED_SHIFT),
+        LONG(EOI_LATCH_SHIFT),
+        LONG(GUEST_PCI_SHIFT),
+        LONG(GUEST_MSI_SHIFT),
+    };
+#undef LONG
+    unsigned int i;
+    s_time_t now;
+
+    if ( d->domid == 0 )
+        return;
+
+    if ( type >= ARRAY_SIZE(names) )
+        BUG();
+
+    now = NOW();
+    printk("d%d %s %lumsec ago, state:%x, %ld count, [prev:%p, next:%p] ",
+           d->domid, names[type],
+           (unsigned long)((now - d->last) / MILLISECS(1)),
+            d->state, d->count, d->list.prev, d->list.next);
+
+    if ( d->dpci )
+    {
+        struct hvm_pirq_dpci *pirq_dpci = d->dpci;
+
+        for ( i = 0; i <= _HVM_IRQ_DPCI_GUEST_MSI_SHIFT; i++ )
+            if ( pirq_dpci->flags & 1 << _HVM_IRQ_DPCI_TRANSLATE_SHIFT )
+                printk("%s ", names_flag[i]);
+
+        printk(" PIRQ:%d", pirq_dpci->pirq);
+        if (pirq_dpci->line)
+            printk(" LINE: %d", pirq_dpci->line);
+    }
+    printk("\n");
+    memset(d, 0, sizeof(struct _debug_f));
+}
+
+static void dump_debug(unsigned char key)
+{
+    unsigned int cpu;
+
+    for_each_online_cpu ( cpu )
+    {
+        struct __debug *d;
+        d = &per_cpu(_d, cpu);
+
+        printk("CPU%02d: \n" ,cpu);
+        dump_record(&d->ok, OK_SOFTIRQ);
+        dump_record(&d->raise, OK_RAISE);
+        dump_record(&d->reset, OK_RESET);
+        dump_record(&d->poison, ERR_POISON);
+        dump_record(&d->zombie_softirq, Z_SOFTIRQ);
+        dump_record(&d->zombie_raise, Z_RAISE);
+    }
+}
 /*
  * This can be called multiple times, but the softirq is only raised once.
  * That is until the STATE_SCHED and STATE_ZOMBIE state has been cleared. The
@@ -65,13 +176,18 @@ enum {
 static void raise_softirq_for(struct hvm_pirq_dpci *pirq_dpci)
 {
     unsigned long flags;
+    struct __debug *d = &__get_cpu_var(_d);
 
     if ( test_bit(STATE_ZOMBIE, &pirq_dpci->state) )
+    {
+        _record(&d->zombie_raise, pirq_dpci);
         return;
-
+    }
     if ( test_and_set_bit(STATE_SCHED, &pirq_dpci->state) )
         return;
 
+    _record(&d->raise, pirq_dpci);
+
     get_knownalive_domain(pirq_dpci->dom);
 
     local_irq_save(flags);
@@ -111,9 +227,12 @@ bool_t pt_pirq_softirq_active(struct hvm_pirq_dpci 
*pirq_dpci)
 static void pt_pirq_softirq_reset(struct hvm_pirq_dpci *pirq_dpci)
 {
     struct domain *d = pirq_dpci->dom;
+    struct __debug *debug = &__get_cpu_var(_d);
 
     ASSERT(spin_is_locked(&d->event_lock));
 
+    _record(&debug->reset, pirq_dpci);
+
     switch ( cmpxchg(&pirq_dpci->state, 1 << STATE_SCHED, 1 << STATE_ZOMBIE ) )
     {
     case (1 << STATE_SCHED):
@@ -277,6 +396,7 @@ int pt_irq_create_bind(
              * As such on every 'pt_irq_create_bind' call we MUST set it.
              */
             pirq_dpci->dom = d;
+            pirq_dpci->pirq = pirq;
             /* bind after hvm_irq_dpci is setup to avoid race with irq 
handler*/
             rc = pirq_guest_bind(d->vcpu[0], info, 0);
             if ( rc == 0 && pt_irq_bind->u.msi.gtable )
@@ -291,6 +411,7 @@ int pt_irq_create_bind(
                      * to be scheduled but we must deal with the one that may 
be
                      * in the queue.
                      */
+                    pirq_dpci->line = __LINE__;
                     pt_pirq_softirq_reset(pirq_dpci);
                 }
             }
@@ -300,6 +421,7 @@ int pt_irq_create_bind(
                 pirq_dpci->gmsi.gvec = 0;
                 pirq_dpci->dom = NULL;
                 pirq_dpci->flags = 0;
+                pirq_dpci->pirq = -pirq;
                 pirq_cleanup_check(info, d);
                 spin_unlock(&d->event_lock);
                 return rc;
@@ -544,6 +666,7 @@ int pt_irq_destroy_bind(
          * See comment in pt_irq_create_bind's PT_IRQ_TYPE_MSI before the
          * call to pt_pirq_softirq_reset.
          */
+        pirq_dpci->line = __LINE__;
         pt_pirq_softirq_reset(pirq_dpci);
 
         pirq_cleanup_check(pirq, d);
@@ -778,6 +901,8 @@ unlock:
     spin_unlock(&d->event_lock);
 }
 
+#include <xen/console.h>
+
 /*
  * Note: 'pt_pirq_softirq_reset' can clear the STATE_SCHED before we get to
  * doing it. If that is the case we let 'pt_pirq_softirq_reset' do 
ref-counting.
@@ -787,6 +912,9 @@ static void dpci_softirq(void)
     unsigned int cpu = smp_processor_id();
     unsigned int reset = 0;
     LIST_HEAD(our_list);
+    struct __debug *debug;
+
+    debug = &per_cpu(_d, cpu);
 
     local_irq_disable();
     list_splice_init(&per_cpu(dpci_list, cpu), &our_list);
@@ -796,9 +924,22 @@ static void dpci_softirq(void)
     {
         struct hvm_pirq_dpci *pirq_dpci;
         struct domain *d;
+        struct list_head *entry;
 
         pirq_dpci = list_entry(our_list.next, struct hvm_pirq_dpci, 
softirq_list);
-        list_del(&pirq_dpci->softirq_list);
+        entry = &pirq_dpci->softirq_list;
+        if ( entry->next == LIST_POISON1 || entry->next == LIST_POISON2 ||
+             entry->prev == LIST_POISON2 || entry->prev == LIST_POISON2 )
+        {
+            _record(&debug->poison, pirq_dpci);
+            console_start_sync();
+            dump_debug((char)0);
+            console_end_sync();
+            domain_crash(pirq_dpci->dom);
+            break;
+        }
+        _record(&debug->ok, pirq_dpci);
+        list_del(entry);
 
         d = pirq_dpci->dom;
         smp_mb(); /* 'd' MUST be saved before we set/clear the bits. */
@@ -813,8 +954,10 @@ static void dpci_softirq(void)
             put_domain(d);
         }
         else
+        {
+            _record(&debug->zombie_softirq, pirq_dpci);
             reset = 1;
-
+        }
         clear_bit(STATE_RUN, &pirq_dpci->state);
         if ( reset )
         {
@@ -833,6 +976,7 @@ static int cpu_callback(
     {
     case CPU_UP_PREPARE:
         INIT_LIST_HEAD(&per_cpu(dpci_list, cpu));
+        memset(&per_cpu(_d, cpu), 0, sizeof(struct __debug));
         break;
     case CPU_UP_CANCELED:
     case CPU_DEAD:
@@ -854,15 +998,43 @@ static struct notifier_block cpu_nfb = {
     .notifier_call = cpu_callback,
 };
 
+#include <xen/keyhandler.h>
+
+static struct keyhandler dump_debug_keyhandler = {
+    .diagnostic = 1,
+    .u.fn = dump_debug,
+    .desc = "dpci debug stats"
+};
+static struct timer debug_timer;
+static s_time_t last_time = 0;
+static unsigned int debug_cnt = 0;
+
+static void debug_timer_fn(void *d)
+{
+    if ( ( debug_cnt ++ % 10 ) == 0 )
+        printk("--MARK--\n");
+
+    last_time = NOW();
+    set_timer(&debug_timer, last_time + SECONDS(1));
+}
+
 static int __init setup_dpci_softirq(void)
 {
     unsigned int cpu;
 
     for_each_online_cpu(cpu)
+    {
         INIT_LIST_HEAD(&per_cpu(dpci_list, cpu));
-
+        memset(&per_cpu(_d, cpu), 0, sizeof(struct __debug));
+    }
     open_softirq(HVM_DPCI_SOFTIRQ, dpci_softirq);
     register_cpu_notifier(&cpu_nfb);
+
+    init_timer(&debug_timer, debug_timer_fn, NULL, smp_processor_id());
+    last_time = NOW();
+    set_timer(&debug_timer, NOW() + SECONDS(1));
+    register_keyhandler('k', &dump_debug_keyhandler);
+
     return 0;
 }
 __initcall(setup_dpci_softirq);
diff --git a/xen/include/xen/hvm/irq.h b/xen/include/xen/hvm/irq.h
index 9709397..1fb1292 100644
--- a/xen/include/xen/hvm/irq.h
+++ b/xen/include/xen/hvm/irq.h
@@ -100,6 +100,8 @@ struct hvm_pirq_dpci {
     struct hvm_gmsi_info gmsi;
     struct timer timer;
     struct list_head softirq_list;
+    unsigned int pirq;
+    unsigned int line;
 };
 
 void pt_pirq_init(struct domain *, struct hvm_pirq_dpci *);
-- 
1.9.3

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

Reply via email to