Package: linux-image-2.6.32-5-686-bigmem Version: 2.6.32-41squeeze2 Severity: important
We received several problem reports from our customers and also experienced the following bug when running said kernel with >=2 CPUs in a virtualization environbment (KVM and VMWare ESX). After some time the VM just stops (no ping, no console, no activity). Using gdb and KVMs gdbserver capability I was able to track it down to the following symptom: One thread would be stuck in flush_tlb_others_ipi() waiting for all other CPUs using a specifiy mm to signal they have flushed there TLB, while all other CPU-threads would be waiting for the pgd_lock to be freed. I have no easy test to reproduce the bug, but it usually happens to me when I run several pbuilder-builds in parallel. The more virtual CPUs the VM has, the easier to trigger: With 2 VCPUs it's hours, with 6 VCPUs it's usually less than 5 minutes. This got more prominent with git-commit 831d52bc153971b70e64eccfbed2b232394f22f8: x86, mm: avoid possible bogus tlb entries by clearing prev mm_cpumask after switching mm (Linus tree), and even worse with 4981d01eada5354d81c8929d5b2836829ba3df7b: x86: Flush TLB if PGD entry is changed in i386 PAE mode I found the issue to be fixed by a79e53d85683c6dd9f99c90511028adc2043031f: x86/mm: Fix pgd_lock deadlock That patch is already in the Debian patch set, but only applied for the xen flavour: features/all/xen/x86-mm-Fix-pgd_lock-deadlock.patch I thinks this patch should be applied to all flavours. It doesn't apply to the non-xen-flavour as is, because it depends on some other xen-related patch. The Patch was also back-ported to the OpenSUSE Kernel <http://kernel.opensuse.org/cgit/kernel-source/commit/?id=ac27c01aa880c65d17043ab87249c613ac4c3635>, but since the patch is trivial to backport, I'll attach my version as well. The patch should be forwarded to Upstream to be included into the upstream 2.6.32 longterm stable kernel as well. The full issue is tracked in our (German) Bugzilla: <https://forge.univention.org/bugzilla/show_bug.cgi?id=26661> Sincerely Philipp -- System Information: Debian Release: 5.0.1 Architecture: i386 (i686) Shell: /bin/sh linked to /bin/bash Kernel: Linux 2.6.32-ucs57-686-bigmem Locale: LANG=de_DE.UTF-8, LC_CTYPE=de_DE.UTF-8 (charmap=UTF-8)
Bug #26661: 686-bigmem VM deadlock --- /dev/null +++ linux-2.6.32-2.6.32/debian/patches/bugfix/x86/x86-mm-Fix-pgd_lock-deadlock.patch @@ -0,0 +1,217 @@ +It's forbidden to take the page_table_lock with the irq disabled +or if there's contention the IPIs (for tlb flushes) sent with +the page_table_lock held will never run leading to a deadlock. + +Nobody takes the pgd_lock from irq context so the _irqsave can be +removed. + +Signed-off-by: Andrea Arcangeli <aarca...@redhat.com> +Acked-by: Rik van Riel <r...@redhat.com> +Tested-by: Konrad Rzeszutek Wilk <konrad.w...@oracle.com> +Signed-off-by: Andrew Morton <a...@linux-foundation.org> +Cc: Peter Zijlstra <pet...@infradead.org> +Cc: Linus Torvalds <torva...@linux-foundation.org> +Cc: <sta...@kernel.org> +LKML-Reference: <201102162345.p1gnjmjm021...@imap1.linux-foundation.org> +Signed-off-by: Ingo Molnar <mi...@elte.hu> +Git-commit: a79e53d85683c6dd9f99c90511028adc2043031f +--- a/arch/x86/mm/fault.c ++++ b/arch/x86/mm/fault.c +@@ -223,15 +223,14 @@ void vmalloc_sync_all(void) + address >= TASK_SIZE && address < FIXADDR_TOP; + address += PMD_SIZE) { + +- unsigned long flags; + struct page *page; + +- spin_lock_irqsave(&pgd_lock, flags); ++ spin_lock(&pgd_lock); + list_for_each_entry(page, &pgd_list, lru) { + if (!vmalloc_sync_one(page_address(page), address)) + break; + } +- spin_unlock_irqrestore(&pgd_lock, flags); ++ spin_unlock(&pgd_lock); + } + } + +@@ -331,13 +330,12 @@ void vmalloc_sync_all(void) + address += PGDIR_SIZE) { + + const pgd_t *pgd_ref = pgd_offset_k(address); +- unsigned long flags; + struct page *page; + + if (pgd_none(*pgd_ref)) + continue; + +- spin_lock_irqsave(&pgd_lock, flags); ++ spin_lock(&pgd_lock); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + pgd = (pgd_t *)page_address(page) + pgd_index(address); +@@ -346,7 +344,7 @@ void vmalloc_sync_all(void) + else + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + } +- spin_unlock_irqrestore(&pgd_lock, flags); ++ spin_unlock(&pgd_lock); + } + } + +--- a/arch/x86/mm/pageattr.c ++++ b/arch/x86/mm/pageattr.c +@@ -56,12 +56,10 @@ static unsigned long direct_pages_count[ + + void update_page_count(int level, unsigned long pages) + { +- unsigned long flags; +- + /* Protect against CPA */ +- spin_lock_irqsave(&pgd_lock, flags); ++ spin_lock(&pgd_lock); + direct_pages_count[level] += pages; +- spin_unlock_irqrestore(&pgd_lock, flags); ++ spin_unlock(&pgd_lock); + } + + static void split_page_count(int level) +@@ -354,7 +352,7 @@ static int + try_preserve_large_page(pte_t *kpte, unsigned long address, + struct cpa_data *cpa) + { +- unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn; ++ unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn; + pte_t new_pte, old_pte, *tmp; + pgprot_t old_prot, new_prot; + int i, do_split = 1; +@@ -363,7 +361,7 @@ try_preserve_large_page(pte_t *kpte, uns + if (cpa->force_split) + return 1; + +- spin_lock_irqsave(&pgd_lock, flags); ++ spin_lock(&pgd_lock); + /* + * Check for races, another CPU might have split this page + * up already: +@@ -458,14 +456,14 @@ try_preserve_large_page(pte_t *kpte, uns + } + + out_unlock: +- spin_unlock_irqrestore(&pgd_lock, flags); ++ spin_unlock(&pgd_lock); + + return do_split; + } + + static int split_large_page(pte_t *kpte, unsigned long address) + { +- unsigned long flags, pfn, pfninc = 1; ++ unsigned long pfn, pfninc = 1; + unsigned int i, level; + pte_t *pbase, *tmp; + pgprot_t ref_prot; +@@ -479,7 +477,7 @@ static int split_large_page(pte_t *kpte, + if (!base) + return -ENOMEM; + +- spin_lock_irqsave(&pgd_lock, flags); ++ spin_lock(&pgd_lock); + /* + * Check for races, another CPU might have split this page + * up for us already: +@@ -551,7 +549,7 @@ out_unlock: + */ + if (base) + __free_page(base); +- spin_unlock_irqrestore(&pgd_lock, flags); ++ spin_unlock(&pgd_lock); + + return 0; + } +--- a/arch/x86/mm/pgtable.c ++++ b/arch/x86/mm/pgtable.c +@@ -110,14 +110,12 @@ static void pgd_ctor(pgd_t *pgd) + + static void pgd_dtor(pgd_t *pgd) + { +- unsigned long flags; /* can be called from interrupt context */ +- + if (SHARED_KERNEL_PMD) + return; + +- spin_lock_irqsave(&pgd_lock, flags); ++ spin_lock(&pgd_lock); + pgd_list_del(pgd); +- spin_unlock_irqrestore(&pgd_lock, flags); ++ spin_unlock(&pgd_lock); + } + + /* +@@ -248,7 +246,6 @@ pgd_t *pgd_alloc(struct mm_struct *mm) + { + pgd_t *pgd; + pmd_t *pmds[PREALLOCATED_PMDS]; +- unsigned long flags; + + pgd = (pgd_t *)__get_free_page(PGALLOC_GFP); + +@@ -268,12 +265,12 @@ pgd_t *pgd_alloc(struct mm_struct *mm) + * respect to anything walking the pgd_list, so that they + * never see a partially populated pgd. + */ +- spin_lock_irqsave(&pgd_lock, flags); ++ spin_lock(&pgd_lock); + + pgd_ctor(pgd); + pgd_prepopulate_pmd(mm, pgd, pmds); + +- spin_unlock_irqrestore(&pgd_lock, flags); ++ spin_unlock(&pgd_lock); + + return pgd; + +--- a/arch/x86/xen/mmu.c ++++ b/arch/x86/xen/mmu.c +@@ -988,10 +988,9 @@ static void xen_pgd_pin(struct mm_struct + */ + void xen_mm_pin_all(void) + { +- unsigned long flags; + struct page *page; + +- spin_lock_irqsave(&pgd_lock, flags); ++ spin_lock(&pgd_lock); + + list_for_each_entry(page, &pgd_list, lru) { + if (!PagePinned(page)) { +@@ -1000,7 +999,7 @@ void xen_mm_pin_all(void) + } + } + +- spin_unlock_irqrestore(&pgd_lock, flags); ++ spin_unlock(&pgd_lock); + } + + /* +@@ -1101,10 +1100,9 @@ static void xen_pgd_unpin(struct mm_stru + */ + void xen_mm_unpin_all(void) + { +- unsigned long flags; + struct page *page; + +- spin_lock_irqsave(&pgd_lock, flags); ++ spin_lock(&pgd_lock); + + list_for_each_entry(page, &pgd_list, lru) { + if (PageSavePinned(page)) { +@@ -1114,7 +1112,7 @@ void xen_mm_unpin_all(void) + } + } + +- spin_unlock_irqrestore(&pgd_lock, flags); ++ spin_unlock(&pgd_lock); + } + + void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) --- linux-2.6.32-2.6.32/debian/patches/series/37-extra +++ linux-2.6.32-2.6.32/debian/patches/series/37-extra @@ -12,6 +12,7 @@ + features/all/vserver/vs2.3.0.36.29.7.patch featureset=vserver + features/all/vserver/vserver-complete-fix-for-CVE-2010-4243.patch featureset=vserver +- bugfix/x86/x86-mm-Fix-pgd_lock-deadlock.patch featureset=xen + features/all/xen/pvops.patch featureset=xen + features/all/xen/xen-netfront-make-smartpoll-optional-and-default-off.patch featureset=xen + features/all/xen/xen-grant-table-do-not-truncate-machine-address-on-g.patch featureset=xen --- linux-2.6.32-2.6.32/debian/patches/series/38 +++ linux-2.6.32-2.6.32/debian/patches/series/38 @@ -5,3 +5,5 @@ + bugfix/x86/92_23258_kvm-clock-reset.patch + bugfix/x86/92_5f4e3f882731c65b5d64a2ff743fda96eaebb9ee.patch + bugfix/x86/92_7c4c0f4fd5c3e82234c0ab61c7e7ffdb8f3af07b.patch + ++ bugfix/x86/x86-mm-Fix-pgd_lock-deadlock.patch