On Mon, Feb 26, 2001 at 02:44:03AM -0600, Rico Tudor wrote:
> > Hypothesis #2: The bounce buffer code in the Linus tree is known to be
> > imperfect. Does 2.4.2ac3 do the same?
> > 
> No improvement.  (In fact, 2.4.2ac3 breaks 3ware IDE RAID support.)

The highmem changes in 2.4.2ac3 have a couple of bugs; the relevant one
can generate deadlocks (interrupts get re-enabled while io_request_lock is
held, so an I/O completion interrupt on the same CPU can try to take the
lock again and spin forever).

Even then the fix isn't complete: for example, it doesn't unplug the queue
while waiting for I/O completion (we wait for the emergency pool to be
refilled at I/O completion).
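
Concretely, the missing piece is to unplug the disk queues before going to
sleep, so the in-flight bounce I/O that refills the pool can actually
complete. That is what alloc_bounce_page()/alloc_bounce_bh() do in the patch
below; trimmed to the relevant lines, the retry path looks like this:

	/* we need to wait for I/O completion: unplug so it can finish */
	run_task_queue(&tq_disk);

	current->policy |= SCHED_YIELD;
	__set_current_state(TASK_RUNNING);
	schedule();
	goto repeat_alloc;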

Can you try this patch against official 2.4.2 and see if you can deadlock
again? If you can, please try SYSRQ+P and SYSRQ+T and send the resolved traces.

Thanks.

diff -urN -X /home/andrea/bin/dontdiff 2.4.2/arch/i386/config.in 2.4.2-bounce/arch/i386/config.in
--- 2.4.2/arch/i386/config.in   Sat Feb 10 02:34:03 2001
+++ 2.4.2-bounce/arch/i386/config.in    Mon Feb 26 17:22:27 2001
@@ -366,4 +366,7 @@
 
 #bool 'Debug kmalloc/kfree' CONFIG_DEBUG_MALLOC
 bool 'Magic SysRq key' CONFIG_MAGIC_SYSRQ
+if [ "$CONFIG_HIGHMEM" = "y" ]; then
+   bool 'Debug HIGHMEM on lowmem machines' CONFIG_HIGHMEM_DEBUG
+fi
 endmenu
diff -urN -X /home/andrea/bin/dontdiff 2.4.2/arch/i386/kernel/setup.c 2.4.2-bounce/arch/i386/kernel/setup.c
--- 2.4.2/arch/i386/kernel/setup.c      Thu Feb 22 03:44:53 2001
+++ 2.4.2-bounce/arch/i386/kernel/setup.c       Mon Feb 26 17:23:50 2001
@@ -648,7 +648,19 @@
  */
 #define VMALLOC_RESERVE        (unsigned long)(128 << 20)
 #define MAXMEM         (unsigned long)(-PAGE_OFFSET-VMALLOC_RESERVE)
+#ifdef CONFIG_HIGHMEM_DEBUG
+#define MAXMEM_PFN                             \
+({                                             \
+       int __max_pfn;                          \
+       if (max_pfn > PFN_DOWN(MAXMEM))         \
+            __max_pfn = PFN_DOWN(MAXMEM);      \
+       else                                    \
+            __max_pfn = max_pfn / 2;           \
+       __max_pfn;                              \
+})
+#else
 #define MAXMEM_PFN     PFN_DOWN(MAXMEM)
+#endif
 #define MAX_NONPAE_PFN (1 << 20)
 
        /*
diff -urN -X /home/andrea/bin/dontdiff 2.4.2/arch/i386/mm/fault.c 2.4.2-bounce/arch/i386/mm/fault.c
--- 2.4.2/arch/i386/mm/fault.c  Thu Feb 22 03:44:53 2001
+++ 2.4.2-bounce/arch/i386/mm/fault.c   Mon Feb 26 16:28:42 2001
@@ -326,23 +326,27 @@
                int offset = __pgd_offset(address);
                pgd_t *pgd, *pgd_k;
                pmd_t *pmd, *pmd_k;
+               static spinlock_t lazy_vmalloc_lock = SPIN_LOCK_UNLOCKED;
+               unsigned long flags;
 
                pgd = tsk->active_mm->pgd + offset;
                pgd_k = init_mm.pgd + offset;
 
-               if (!pgd_present(*pgd)) {
-                       if (!pgd_present(*pgd_k))
-                               goto bad_area_nosemaphore;
+               spin_lock_irqsave(&lazy_vmalloc_lock, flags);
+               if (!pgd_present(*pgd) && pgd_present(*pgd_k)) {
                        set_pgd(pgd, *pgd_k);
+                       spin_unlock_irqrestore(&lazy_vmalloc_lock, flags);
                        return;
                }
-
                pmd = pmd_offset(pgd, address);
                pmd_k = pmd_offset(pgd_k, address);
 
-               if (pmd_present(*pmd) || !pmd_present(*pmd_k))
-                       goto bad_area_nosemaphore;
-               set_pmd(pmd, *pmd_k);
-               return;
+               if (!pmd_present(*pmd) && pmd_present(*pmd_k)) {
+                       set_pmd(pmd, *pmd_k);
+                       spin_unlock_irqrestore(&lazy_vmalloc_lock, flags);
+                       return;
+               }
+               spin_unlock_irqrestore(&lazy_vmalloc_lock, flags);
+               goto bad_area_nosemaphore;
        }
 }
diff -urN -X /home/andrea/bin/dontdiff 2.4.2/arch/i386/mm/init.c 2.4.2-bounce/arch/i386/mm/init.c
--- 2.4.2/arch/i386/mm/init.c   Sat Feb 10 02:34:03 2001
+++ 2.4.2-bounce/arch/i386/mm/init.c    Mon Feb 26 16:28:42 2001
@@ -116,21 +116,13 @@
        pte_t *pte;
 
        pte = (pte_t *) __get_free_page(GFP_KERNEL);
-       if (pmd_none(*pmd)) {
-               if (pte) {
-                       clear_page(pte);
-                       set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte)));
-                       return pte + offset;
-               }
-               set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(get_bad_pte_table())));
-               return NULL;
+       if (pte) {
+               clear_page(pte);
+               set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte)));
+               return pte + offset;
        }
-       free_page((unsigned long)pte);
-       if (pmd_bad(*pmd)) {
-               __handle_bad_pmd_kernel(pmd);
-               return NULL;
-       }
-       return (pte_t *) pmd_page(*pmd) + offset;
+       set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(get_bad_pte_table())));
+       return NULL;
 }
 
 pte_t *get_pte_slow(pmd_t *pmd, unsigned long offset)
diff -urN -X /home/andrea/bin/dontdiff 2.4.2/fs/buffer.c 2.4.2-bounce/fs/buffer.c
--- 2.4.2/fs/buffer.c   Thu Feb 22 03:45:09 2001
+++ 2.4.2-bounce/fs/buffer.c    Mon Feb 26 03:15:55 2001
@@ -762,7 +762,12 @@
        balance_dirty(NODEV);
        if (free_shortage())
                page_launder(GFP_BUFFER, 0);
-       grow_buffers(size);
+       if (!grow_buffers(size)) {
+               wakeup_bdflush(1);
+               current->policy |= SCHED_YIELD;
+               __set_current_state(TASK_RUNNING);
+               schedule();
+       }
 }
 
 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
diff -urN -X /home/andrea/bin/dontdiff 2.4.2/include/asm-i386/pgalloc-3level.h 2.4.2-bounce/include/asm-i386/pgalloc-3level.h
--- 2.4.2/include/asm-i386/pgalloc-3level.h     Fri Dec  3 20:12:23 1999
+++ 2.4.2-bounce/include/asm-i386/pgalloc-3level.h      Mon Feb 26 16:28:42 2001
@@ -53,12 +53,9 @@
                if (!page)
                        page = get_pmd_slow();
                if (page) {
-                       if (pgd_none(*pgd)) {
-                               set_pgd(pgd, __pgd(1 + __pa(page)));
-                               __flush_tlb();
-                               return page + address;
-                       } else
-                               free_pmd_fast(page);
+                       set_pgd(pgd, __pgd(1 + __pa(page)));
+                       __flush_tlb();
+                       return page + address;
                } else
                        return NULL;
        }
diff -urN -X /home/andrea/bin/dontdiff 2.4.2/mm/highmem.c 2.4.2-bounce/mm/highmem.c
--- 2.4.2/mm/highmem.c  Thu Dec 14 22:34:14 2000
+++ 2.4.2-bounce/mm/highmem.c   Mon Feb 26 17:38:42 2001
@@ -159,6 +159,19 @@
        spin_unlock(&kmap_lock);
 }
 
+#define POOL_SIZE 32
+
+/*
+ * This lock gets no contention at all, normally.
+ */
+static spinlock_t emergency_lock = SPIN_LOCK_UNLOCKED;
+
+int nr_emergency_pages;
+static LIST_HEAD(emergency_pages);
+
+int nr_emergency_bhs;
+static LIST_HEAD(emergency_bhs);
+
 /*
  * Simple bounce buffer support for highmem pages.
  * This will be moved to the block layer in 2.5.
@@ -203,13 +216,67 @@
 
 static inline void bounce_end_io (struct buffer_head *bh, int uptodate)
 {
+       struct page *page;
        struct buffer_head *bh_orig = (struct buffer_head *)(bh->b_private);
+       unsigned long flags;
 
        bh_orig->b_end_io(bh_orig, uptodate);
-       __free_page(bh->b_page);
-       kmem_cache_free(bh_cachep, bh);
+
+       page = bh->b_page;
+
+       spin_lock_irqsave(&emergency_lock, flags);
+       if (nr_emergency_pages >= POOL_SIZE)
+               __free_page(page);
+       else {
+               /*
+                * We are abusing page->list to manage
+                * the highmem emergency pool:
+                */
+               list_add(&page->list, &emergency_pages);
+               nr_emergency_pages++;
+       }
+       
+       if (nr_emergency_bhs >= POOL_SIZE)
+               kmem_cache_free(bh_cachep, bh);
+       else {
+               /*
+                * Ditto in the bh case, here we abuse b_inode_buffers:
+                */
+               list_add(&bh->b_inode_buffers, &emergency_bhs);
+               nr_emergency_bhs++;
+       }
+       spin_unlock_irqrestore(&emergency_lock, flags);
 }
 
+static int init_emergency_pool(void)
+{
+       spin_lock_irq(&emergency_lock);
+       while (nr_emergency_pages < POOL_SIZE) {
+               struct page * page = alloc_page(GFP_ATOMIC);
+               if (!page) {
+                       printk("couldn't refill highmem emergency pages");
+                       break;
+               }
+               list_add(&page->list, &emergency_pages);
+               nr_emergency_pages++;
+       }
+       while (nr_emergency_bhs < POOL_SIZE) {
+               struct buffer_head * bh = kmem_cache_alloc(bh_cachep, SLAB_ATOMIC);
+               if (!bh) {
+                       printk("couldn't refill highmem emergency bhs");
+                       break;
+               }
+               list_add(&bh->b_inode_buffers, &emergency_bhs);
+               nr_emergency_bhs++;
+       }
+       spin_unlock_irq(&emergency_lock);
+       printk("allocated %d pages and %d bhs reserved for the highmem bounces\n",
+              nr_emergency_pages, nr_emergency_bhs);
+
+       return 0;
+}
+__initcall(init_emergency_pool);
+
 static void bounce_end_io_write (struct buffer_head *bh, int uptodate)
 {
        bounce_end_io(bh, uptodate);
@@ -224,6 +291,82 @@
        bounce_end_io(bh, uptodate);
 }
 
+struct page *alloc_bounce_page (void)
+{
+       struct list_head *tmp;
+       struct page *page;
+
+repeat_alloc:
+       page = alloc_page(GFP_BUFFER);
+       if (page)
+               return page;
+       /*
+        * No luck. First, kick the VM so it doesnt idle around while
+        * we are using up our emergency rations.
+        */
+       wakeup_bdflush(0);
+
+       /*
+        * Try to allocate from the emergency pool.
+        */
+       tmp = &emergency_pages;
+       spin_lock_irq(&emergency_lock);
+       if (!list_empty(tmp)) {
+               page = list_entry(tmp->next, struct page, list);
+               list_del(tmp->next);
+               nr_emergency_pages--;
+       }
+       spin_unlock_irq(&emergency_lock);
+       if (page)
+               return page;
+
+       /* we need to wait I/O completion */
+       run_task_queue(&tq_disk);
+
+       current->policy |= SCHED_YIELD;
+       __set_current_state(TASK_RUNNING);
+       schedule();
+       goto repeat_alloc;
+}
+
+struct buffer_head *alloc_bounce_bh (void)
+{
+       struct list_head *tmp;
+       struct buffer_head *bh;
+
+repeat_alloc:
+       bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER);
+       if (bh)
+               return bh;
+       /*
+        * No luck. First, kick the VM so it doesnt idle around while
+        * we are using up our emergency rations.
+        */
+       wakeup_bdflush(0);
+
+       /*
+        * Try to allocate from the emergency pool.
+        */
+       tmp = &emergency_bhs;
+       spin_lock_irq(&emergency_lock);
+       if (!list_empty(tmp)) {
+               bh = list_entry(tmp->next, struct buffer_head, b_inode_buffers);
+               list_del(tmp->next);
+               nr_emergency_bhs--;
+       }
+       spin_unlock_irq(&emergency_lock);
+       if (bh)
+               return bh;
+
+       /* we need to wait I/O completion */
+       run_task_queue(&tq_disk);
+
+       current->policy |= SCHED_YIELD;
+       __set_current_state(TASK_RUNNING);
+       schedule();
+       goto repeat_alloc;
+}
+
 struct buffer_head * create_bounce(int rw, struct buffer_head * bh_orig)
 {
        struct page *page;
@@ -232,24 +375,15 @@
        if (!PageHighMem(bh_orig->b_page))
                return bh_orig;
 
-repeat_bh:
-       bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER);
-       if (!bh) {
-               wakeup_bdflush(1);  /* Sets task->state to TASK_RUNNING */
-               goto repeat_bh;
-       }
+       bh = alloc_bounce_bh();
        /*
         * This is wasteful for 1k buffers, but this is a stopgap measure
         * and we are being ineffective anyway. This approach simplifies
         * things immensly. On boxes with more than 4GB RAM this should
         * not be an issue anyway.
         */
-repeat_page:
-       page = alloc_page(GFP_BUFFER);
-       if (!page) {
-               wakeup_bdflush(1);  /* Sets task->state to TASK_RUNNING */
-               goto repeat_page;
-       }
+       page = alloc_bounce_page();
+
        set_bh_page(bh, page, 0);
 
        bh->b_next = NULL;

Andrea