Hi, I believe this issue is fixed by http://xenbits.xensource.com/xen-unstable.hg?rev/914304b3a3da
Without the patch (2.6.18.dfsg.1-18etch1) I was able to reproduce the problem by repeatedly starting 4 guest domains in parallel[0] in around 5-6 hours or 288 iterations (the first time I reproduced it, it took a couple of hours; I wasn't counting time or iterations that time). With the patch applied on top of the svn kernel (2.6.18.dfsg.1-21.0.hellion0) the test survived overnight, around 16 hours and 850 iterations. I've attached a version of this backported to the Etch kernel (some of the function names differ). I also attached the svn diff-of-a-diff against revision r11588, 2.6.18.dfsg.1-22. This issue is already fixed in the Lenny/Sid kernels, which use the paravirt_ops kernel port, which has completely reimplemented the event channel code. Cheers, Ian. [0] n=0 ; while : ; do date ; echo iteration $n ; xm create debian-x86_32p-1 & xm create debian-x86_32p-2 & xm create rhel44-x86_32p-1 kernel=/boot/vmlinuz-2.6.18.8-x86_32p-xenU ramdisk= & xm create rhel41-x86_32p-1 kernel=/boot/vmlinuz-2.6.18.8-x86_32p-xenU ramdisk= & sleep 1m ; xm dest debian-1 ; xm dest debian-2 ; xm dest rhel41-1 ; xm dest rhel44-1 ; sleep 5s ; n=$(($n + 1)) ; done -- Ian Campbell I haven't lost my mind; I know exactly where I left it.
Index: debian/patches/series/22-extra =================================================================== --- debian/patches/series/22-extra (revision 0) +++ debian/patches/series/22-extra (revision 0) @@ -0,0 +1 @@ ++ features/all/xen/evtchn-BUG.patch *_xen *_xen-vserver Index: debian/patches/features/all/xen/evtchn-BUG.patch =================================================================== --- debian/patches/features/all/xen/evtchn-BUG.patch (revision 0) +++ debian/patches/features/all/xen/evtchn-BUG.patch (revision 0) @@ -0,0 +1,128 @@ +# HG changeset patch +# User [EMAIL PROTECTED] +# Date 1169559560 0 +# Node ID 914304b3a3da6fc5bad12f742bae3893b53d20bc +# Parent ee7c422c5f7b79e0cc0ae5670af81b400a72357f +linux: Fix enable_irq() crash by removing a BUG_ON() assumption in our +event-channel retrigger() function. Also clean up bitmap usages. +Signed-off-by: Keir Fraser <[EMAIL PROTECTED]> + +Index: source-amd64-xen/drivers/xen/core/evtchn.c +=================================================================== +--- source-amd64-xen.orig/drivers/xen/core/evtchn.c 2008-06-03 21:11:44.000000000 +0100 ++++ source-amd64-xen/drivers/xen/core/evtchn.c 2008-06-03 21:21:24.000000000 +0100 +@@ -103,7 +103,7 @@ + static int irq_bindcount[NR_IRQS]; + + /* Bitmap indicating which PIRQs require Xen to be notified on unmask. 
*/ +-static unsigned long pirq_needs_eoi[NR_PIRQS/sizeof(unsigned long)]; ++static DECLARE_BITMAP(pirq_needs_eoi, NR_PIRQS); + + #ifdef CONFIG_SMP + +@@ -472,14 +472,19 @@ + rebind_irq_to_cpu(irq, tcpu); + } + +-static int retrigger(unsigned int irq) ++static int resend_irq_on_evtchn(unsigned int i) + { +- int evtchn = evtchn_from_irq(irq); ++ int masked, evtchn = evtchn_from_irq(i); + shared_info_t *s = HYPERVISOR_shared_info; ++ + if (!VALID_EVTCHN(evtchn)) + return 1; +- BUG_ON(!synch_test_bit(evtchn, &s->evtchn_mask[0])); +- synch_set_bit(evtchn, &s->evtchn_pending[0]); ++ ++ masked = synch_test_and_set_bit(evtchn, s->evtchn_mask); ++ synch_set_bit(evtchn, s->evtchn_pending); ++ if (!masked) ++ unmask_evtchn(evtchn); ++ + return 1; + } + +@@ -551,13 +556,13 @@ + #ifdef CONFIG_SMP + .set_affinity = set_affinity_irq, + #endif +- .retrigger = retrigger, ++ .retrigger = resend_irq_on_evtchn, + }; + + static inline void pirq_unmask_notify(int pirq) + { + struct physdev_eoi eoi = { .irq = pirq }; +- if (unlikely(test_bit(pirq, &pirq_needs_eoi[0]))) ++ if (unlikely(test_bit(pirq, pirq_needs_eoi))) + (void)HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi); + } + +@@ -679,7 +684,7 @@ + #ifdef CONFIG_SMP + .set_affinity = set_affinity_irq, + #endif +- .retrigger = retrigger, ++ .retrigger = resend_irq_on_evtchn, + }; + + int irq_ignore_unhandled(unsigned int irq) +@@ -693,16 +698,6 @@ + return !!(irq_status.flags & XENIRQSTAT_shared); + } + +-void resend_irq_on_evtchn(unsigned int i) +-{ +- int evtchn = evtchn_from_irq(i); +- shared_info_t *s = HYPERVISOR_shared_info; +- if (!VALID_EVTCHN(evtchn)) +- return; +- BUG_ON(!synch_test_bit(evtchn, &s->evtchn_mask[0])); +- synch_set_bit(evtchn, &s->evtchn_pending[0]); +-} +- + void notify_remote_via_irq(int irq) + { + int evtchn = evtchn_from_irq(irq); +@@ -715,7 +710,7 @@ + void mask_evtchn(int port) + { + shared_info_t *s = HYPERVISOR_shared_info; +- synch_set_bit(port, &s->evtchn_mask[0]); ++ synch_set_bit(port, s->evtchn_mask); + 
} + EXPORT_SYMBOL_GPL(mask_evtchn); + +@@ -734,14 +729,10 @@ + return; + } + +- synch_clear_bit(port, &s->evtchn_mask[0]); ++ synch_clear_bit(port, s->evtchn_mask); + +- /* +- * The following is basically the equivalent of 'hw_resend_irq'. Just +- * like a real IO-APIC we 'lose the interrupt edge' if the channel is +- * masked. +- */ +- if (synch_test_bit(port, &s->evtchn_pending[0]) && ++ /* Did we miss an interrupt 'edge'? Re-fire if so. */ ++ if (synch_test_bit(port, s->evtchn_pending) && + !synch_test_and_set_bit(port / BITS_PER_LONG, + &vcpu_info->evtchn_pending_sel)) + vcpu_info->evtchn_upcall_pending = 1; +Index: source-amd64-xen/include/xen/evtchn.h +=================================================================== +--- source-amd64-xen.orig/include/xen/evtchn.h 2008-06-03 21:11:48.000000000 +0100 ++++ source-amd64-xen/include/xen/evtchn.h 2008-06-03 21:18:30.000000000 +0100 +@@ -95,7 +95,7 @@ + static inline void clear_evtchn(int port) + { + shared_info_t *s = HYPERVISOR_shared_info; +- synch_clear_bit(port, &s->evtchn_pending[0]); ++ synch_clear_bit(port, s->evtchn_pending); + } + + static inline void notify_remote_via_evtchn(int port) Index: debian/changelog =================================================================== --- debian/changelog (revision 11588) +++ debian/changelog (working copy) @@ -1,7 +1,12 @@ linux-2.6 (2.6.18.dfsg.1-22) UNRELEASED; urgency=high + [ dann frazier ] * Merge in changes from 2.6.18.dfsg.1-18etch6 + [ Ian Campbell ] + * Backport http://xenbits.xensource.com/xen-unstable.hg?rev/914304b3a3da, + fixing kernel BUG at drivers/xen/core/evtchn.c:481 (closes: 410807). + -- dann frazier <[EMAIL PROTECTED]> Mon, 09 Jun 2008 01:21:13 -0600 linux-2.6 (2.6.18.dfsg.1-21) stable; urgency=high
# HG changeset patch # User [EMAIL PROTECTED] # Date 1169559560 0 # Node ID 914304b3a3da6fc5bad12f742bae3893b53d20bc # Parent ee7c422c5f7b79e0cc0ae5670af81b400a72357f linux: Fix enable_irq() crash by removing a BUG_ON() assumption in our event-channel retrigger() function. Also clean up bitmap usages. Signed-off-by: Keir Fraser <[EMAIL PROTECTED]> Index: source-amd64-xen/drivers/xen/core/evtchn.c =================================================================== --- source-amd64-xen.orig/drivers/xen/core/evtchn.c 2008-06-03 21:11:44.000000000 +0100 +++ source-amd64-xen/drivers/xen/core/evtchn.c 2008-06-03 21:21:24.000000000 +0100 @@ -103,7 +103,7 @@ static int irq_bindcount[NR_IRQS]; /* Bitmap indicating which PIRQs require Xen to be notified on unmask. */ -static unsigned long pirq_needs_eoi[NR_PIRQS/sizeof(unsigned long)]; +static DECLARE_BITMAP(pirq_needs_eoi, NR_PIRQS); #ifdef CONFIG_SMP @@ -472,14 +472,19 @@ rebind_irq_to_cpu(irq, tcpu); } -static int retrigger(unsigned int irq) +static int resend_irq_on_evtchn(unsigned int i) { - int evtchn = evtchn_from_irq(irq); + int masked, evtchn = evtchn_from_irq(i); shared_info_t *s = HYPERVISOR_shared_info; + if (!VALID_EVTCHN(evtchn)) return 1; - BUG_ON(!synch_test_bit(evtchn, &s->evtchn_mask[0])); - synch_set_bit(evtchn, &s->evtchn_pending[0]); + + masked = synch_test_and_set_bit(evtchn, s->evtchn_mask); + synch_set_bit(evtchn, s->evtchn_pending); + if (!masked) + unmask_evtchn(evtchn); + return 1; } @@ -551,13 +556,13 @@ #ifdef CONFIG_SMP .set_affinity = set_affinity_irq, #endif - .retrigger = retrigger, + .retrigger = resend_irq_on_evtchn, }; static inline void pirq_unmask_notify(int pirq) { struct physdev_eoi eoi = { .irq = pirq }; - if (unlikely(test_bit(pirq, &pirq_needs_eoi[0]))) + if (unlikely(test_bit(pirq, pirq_needs_eoi))) (void)HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi); } @@ -679,7 +684,7 @@ #ifdef CONFIG_SMP .set_affinity = set_affinity_irq, #endif - .retrigger = retrigger, + .retrigger = 
resend_irq_on_evtchn, }; int irq_ignore_unhandled(unsigned int irq) @@ -693,16 +698,6 @@ return !!(irq_status.flags & XENIRQSTAT_shared); } -void resend_irq_on_evtchn(unsigned int i) -{ - int evtchn = evtchn_from_irq(i); - shared_info_t *s = HYPERVISOR_shared_info; - if (!VALID_EVTCHN(evtchn)) - return; - BUG_ON(!synch_test_bit(evtchn, &s->evtchn_mask[0])); - synch_set_bit(evtchn, &s->evtchn_pending[0]); -} - void notify_remote_via_irq(int irq) { int evtchn = evtchn_from_irq(irq); @@ -715,7 +710,7 @@ void mask_evtchn(int port) { shared_info_t *s = HYPERVISOR_shared_info; - synch_set_bit(port, &s->evtchn_mask[0]); + synch_set_bit(port, s->evtchn_mask); } EXPORT_SYMBOL_GPL(mask_evtchn); @@ -734,14 +729,10 @@ return; } - synch_clear_bit(port, &s->evtchn_mask[0]); + synch_clear_bit(port, s->evtchn_mask); - /* - * The following is basically the equivalent of 'hw_resend_irq'. Just - * like a real IO-APIC we 'lose the interrupt edge' if the channel is - * masked. - */ - if (synch_test_bit(port, &s->evtchn_pending[0]) && + /* Did we miss an interrupt 'edge'? Re-fire if so. */ + if (synch_test_bit(port, s->evtchn_pending) && !synch_test_and_set_bit(port / BITS_PER_LONG, &vcpu_info->evtchn_pending_sel)) vcpu_info->evtchn_upcall_pending = 1; Index: source-amd64-xen/include/xen/evtchn.h =================================================================== --- source-amd64-xen.orig/include/xen/evtchn.h 2008-06-03 21:11:48.000000000 +0100 +++ source-amd64-xen/include/xen/evtchn.h 2008-06-03 21:18:30.000000000 +0100 @@ -95,7 +95,7 @@ static inline void clear_evtchn(int port) { shared_info_t *s = HYPERVISOR_shared_info; - synch_clear_bit(port, &s->evtchn_pending[0]); + synch_clear_bit(port, s->evtchn_pending); } static inline void notify_remote_via_evtchn(int port)
signature.asc
Description: This is a digitally signed message part