diff --git a/Makefile b/Makefile
index 947ff3c..b3806cb 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 6
 SUBLEVEL = 20
-EXTRAVERSION = .15
+EXTRAVERSION = .16
 NAME = Homicidal Dwarf Hamster
 
 # *DOCUMENTATION*
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 5e47683..9bf056e 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -367,10 +367,6 @@ ENTRY(system_call)
        CFI_ADJUST_CFA_OFFSET 4
        SAVE_ALL
        GET_THREAD_INFO(%ebp)
-       testl $TF_MASK,PT_EFLAGS(%esp)
-       jz no_singlestep
-       orl $_TIF_SINGLESTEP,TI_flags(%ebp)
-no_singlestep:
                                        # system call tracing in operation / emulation
        /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
        testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
@@ -385,6 +381,10 @@ syscall_exit:
                                        # setting need_resched or sigpending
                                        # between sampling and the iret
        TRACE_IRQS_OFF
+       testl $TF_MASK,PT_EFLAGS(%esp)  # If tracing set singlestep flag on exit
+       jz no_singlestep
+       orl $_TIF_SINGLESTEP,TI_flags(%ebp)
+no_singlestep:
        movl TI_flags(%ebp), %ecx
        testw $_TIF_ALLWORK_MASK, %cx   # current->work
        jne syscall_exit_work
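
The entry.S hunks above move the TF (trap flag) test from syscall entry
to syscall exit, so a debugger single-stepping over a sysenter
instruction still gets its trap when the syscall returns. A rough C
rendering of the relocated logic, with stand-in constants (the real
values live in the i386 headers; this is only an illustration, not the
assembly itself):

    /* TF is bit 8 of EFLAGS; the _TIF_SINGLESTEP value is a stand-in. */
    #define TF_MASK         0x00000100
    #define _TIF_SINGLESTEP (1 << 4)

    static void syscall_exit_singlestep_check(unsigned long eflags,
                                              unsigned long *ti_flags)
    {
            /* If userspace was single-stepping, request the debug
             * trap on the return path rather than at syscall entry. */
            if (eflags & TF_MASK)
                    *ti_flags |= _TIF_SINGLESTEP;
    }
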
diff --git a/arch/i386/oprofile/nmi_int.c b/arch/i386/oprofile/nmi_int.c
index 3700eef..be4a9a8 100644
--- a/arch/i386/oprofile/nmi_int.c
+++ b/arch/i386/oprofile/nmi_int.c
@@ -131,7 +131,6 @@ static void nmi_save_registers(void * dummy)
 {
        int cpu = smp_processor_id();
        struct op_msrs * msrs = &cpu_msrs[cpu];
-       model->fill_in_addresses(msrs);
        nmi_cpu_save_registers(msrs);
 }
 
@@ -195,6 +194,7 @@ static struct notifier_block profile_exceptions_nb = {
 static int nmi_setup(void)
 {
        int err=0;
+       int cpu;
 
        if (!allocate_msrs())
                return -ENOMEM;
@@ -207,6 +207,13 @@ static int nmi_setup(void)
        /* We need to serialize save and setup for HT because the subset
         * of msrs are distinct for save and setup operations
         */
+
+       /* Assume saved/restored counters are the same on all CPUs */
+       model->fill_in_addresses(&cpu_msrs[0]);
+       for_each_possible_cpu (cpu) {
+               if (cpu != 0)
+                       cpu_msrs[cpu] = cpu_msrs[0];
+       }
        on_each_cpu(nmi_save_registers, NULL, 0, 1);
        on_each_cpu(nmi_cpu_setup, NULL, 0, 1);
        nmi_enabled = 1;
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index f72e8e8..a84304e 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -177,6 +177,13 @@ static long restore_sigcontext(struct pt_regs *regs, sigset_t *set, int sig,
         */
        discard_lazy_cpu_state();
 
+       /*
+        * Force reload of FP/VEC.
+        * This has to be done before copying stuff into current->thread.fpr/vr
+        * for the reasons explained in the previous comment.
+        */
+       regs->msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1 | MSR_VEC);
+
        err |= __copy_from_user(&current->thread.fpr, &sc->fp_regs, FP_REGS_SIZE);
 
 #ifdef CONFIG_ALTIVEC
@@ -198,9 +205,6 @@ static long restore_sigcontext(struct pt_regs *regs, sigset_t *set, int sig,
                current->thread.vrsave = 0;
 #endif /* CONFIG_ALTIVEC */
 
-       /* Force reload of FP/VEC */
-       regs->msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1 | MSR_VEC);
-
        return err;
 }
 
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 2968b90..e67cc4f 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -72,6 +72,8 @@ void show_mem(void)
 
        for_each_online_pgdat(pgdat) {
                for (i = 0; i < pgdat->node_spanned_pages; ++i) {
+                       if (!pfn_valid(pgdat->node_start_pfn + i))
+                               continue;
                        page = pfn_to_page(pgdat->node_start_pfn + i);
                        total++;
                        if (PageReserved(page))
@@ -766,3 +768,9 @@ int in_gate_area_no_task(unsigned long addr)
 {
        return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
 }
+
+void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
+{
+       return __alloc_bootmem_core(pgdat->bdata, size,
+                       SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0);
+}
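
alloc_bootmem_high_node() passes 4GB as the "goal" argument of
__alloc_bootmem_core() (see the include/linux/bootmem.h hunk below),
which, if I read the bootmem parameters right, steers the allocation to
physical addresses at or above 4GB when possible, sparing low memory.
A hedged usage sketch, mirroring the mm/sparse.c hunk at the end of
this patch:

    struct page *map;

    /* Prefer placing the section's mem_map above 4GB ... */
    map = alloc_bootmem_high_node(NODE_DATA(nid),
                    sizeof(struct page) * PAGES_PER_SECTION);
    if (!map)
            /* ... and fall back to any node-local bootmem. */
            map = alloc_bootmem_node(NODE_DATA(nid),
                            sizeof(struct page) * PAGES_PER_SECTION);
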
diff --git a/drivers/char/cyclades.c b/drivers/char/cyclades.c
index 3ffa080..e4e0ccb 100644
--- a/drivers/char/cyclades.c
+++ b/drivers/char/cyclades.c
@@ -1102,6 +1102,7 @@ static void cyy_intr_chip(struct cyclades_card *cinfo, int chip,
 
                                if (data & info->ignore_status_mask) {
                                        info->icount.rx++;
+                                       spin_unlock(&cinfo->card_lock);
                                        return;
                                }
                                if (tty_buffer_request_room(tty, 1)) {
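
The cyclades fix is the classic lock-leak-on-early-return pattern: the
ignore path returned while still holding cinfo->card_lock. A
self-contained sketch of the shape of the bug and the fix, using a
pthread mutex as a stand-in for the driver's spinlock:

    #include <pthread.h>

    static pthread_mutex_t card_lock = PTHREAD_MUTEX_INITIALIZER;
    static int rx_count;

    static void handle_rx_char(int data, int ignore_mask)
    {
            pthread_mutex_lock(&card_lock);
            if (data & ignore_mask) {
                    rx_count++;
                    pthread_mutex_unlock(&card_lock); /* the fix */
                    return;
            }
            /* ... normal processing under the lock ... */
            pthread_mutex_unlock(&card_lock);
    }
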
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index cef1287..550ac72 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -255,19 +255,25 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long inde
 
 }
 
-static int write_sb_page(mddev_t *mddev, long offset, struct page *page, int wait)
+static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 {
        mdk_rdev_t *rdev;
        struct list_head *tmp;
+       mddev_t *mddev = bitmap->mddev;
 
        ITERATE_RDEV(mddev, rdev, tmp)
                if (test_bit(In_sync, &rdev->flags)
-                   && !test_bit(Faulty, &rdev->flags))
+                   && !test_bit(Faulty, &rdev->flags)) {
+                       int size = PAGE_SIZE;
+                       if (page->index == bitmap->file_pages-1)
+                               size = roundup(bitmap->last_page_size,
+                                              bdev_hardsect_size(rdev->bdev));
                        md_super_write(mddev, rdev,
-                                      (rdev->sb_offset<<1) + offset
+                                      (rdev->sb_offset<<1) + bitmap->offset
                                       + page->index * (PAGE_SIZE/512),
-                                      PAGE_SIZE,
+                                      size,
                                       page);
+               }
 
        if (wait)
                md_super_wait(mddev);
@@ -282,7 +288,7 @@ static int write_page(struct bitmap *bitmap, struct page *page, int wait)
        struct buffer_head *bh;
 
        if (bitmap->file == NULL)
-               return write_sb_page(bitmap->mddev, bitmap->offset, page, wait);
+               return write_sb_page(bitmap, page, wait);
 
        bh = page_buffers(page);
 
@@ -923,6 +929,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
                        }
 
                        bitmap->filemap[bitmap->file_pages++] = page;
+                       bitmap->last_page_size = count;
                }
                paddr = kmap_atomic(page, KM_USER0);
                if (bitmap->flags & BITMAP_HOSTENDIAN)
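
The write_sb_page() change writes out only the used part of the last
bitmap page, rounded up to the device's hardware sector size, instead
of a full PAGE_SIZE. The arithmetic is plain round-up division; a
self-contained illustration (512 is just an example sector size):

    #include <stdio.h>

    /* Same shape as the kernel's roundup() macro. */
    #define ROUNDUP(x, y) ((((x) + (y) - 1) / (y)) * (y))

    int main(void)
    {
            int last_page_size = 300; /* bytes used in the last page */
            int hardsect = 512;       /* device sector size */

            printf("%d\n", ROUNDUP(last_page_size, hardsect)); /* 512 */
            return 0;
    }
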
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 4c2471e..b9ff4e3 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -33,7 +33,6 @@
 struct crypt_io {
        struct dm_target *target;
        struct bio *base_bio;
-       struct bio *first_clone;
        struct work_struct work;
        atomic_t pending;
        int error;
@@ -107,6 +106,8 @@ struct crypt_config {
 
 static struct kmem_cache *_crypt_io_pool;
 
+static void clone_init(struct crypt_io *, struct bio *);
+
 /*
  * Different IV generation algorithms:
  *
@@ -378,25 +379,20 @@ static int crypt_convert(struct crypt_config *cc,
  * This should never violate the device limitations
  * May return a smaller bio when running out of pages
  */
-static struct bio *
-crypt_alloc_buffer(struct crypt_config *cc, unsigned int size,
-                   struct bio *base_bio, unsigned int *bio_vec_idx)
+static struct bio *crypt_alloc_buffer(struct crypt_io *io, unsigned int size,
+                                     unsigned int *bio_vec_idx)
 {
+       struct crypt_config *cc = io->target->private;
        struct bio *clone;
        unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
        gfp_t gfp_mask = GFP_NOIO | __GFP_HIGHMEM;
        unsigned int i;
 
-       if (base_bio) {
-               clone = bio_alloc_bioset(GFP_NOIO, base_bio->bi_max_vecs, cc->bs);
-               __bio_clone(clone, base_bio);
-       } else
-               clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs);
-
+       clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs);
        if (!clone)
                return NULL;
 
-       clone->bi_destructor = dm_crypt_bio_destructor;
+       clone_init(io, clone);
 
        /* if the last bio was not complete, continue where that one ended */
        clone->bi_idx = *bio_vec_idx;
@@ -495,9 +491,6 @@ static void dec_pending(struct crypt_io *io, int error)
        if (!atomic_dec_and_test(&io->pending))
                return;
 
-       if (io->first_clone)
-               bio_put(io->first_clone);
-
        bio_endio(io->base_bio, io->base_bio->bi_size, io->error);
 
        mempool_free(io, cc->io_pool);
@@ -562,6 +555,7 @@ static void clone_init(struct crypt_io *io, struct bio *clone)
        clone->bi_end_io  = crypt_endio;
        clone->bi_bdev    = cc->dev->bdev;
        clone->bi_rw      = io->base_bio->bi_rw;
+       clone->bi_destructor = dm_crypt_bio_destructor;
 }
 
 static void process_read(struct crypt_io *io)
@@ -585,7 +579,6 @@ static void process_read(struct crypt_io *io)
        }
 
        clone_init(io, clone);
-       clone->bi_destructor = dm_crypt_bio_destructor;
        clone->bi_idx = 0;
        clone->bi_vcnt = bio_segments(base_bio);
        clone->bi_size = base_bio->bi_size;
@@ -615,8 +608,7 @@ static void process_write(struct crypt_io *io)
         * so repeat the whole process until all the data can be handled.
         */
        while (remaining) {
-               clone = crypt_alloc_buffer(cc, base_bio->bi_size,
-                                          io->first_clone, &bvec_idx);
+               clone = crypt_alloc_buffer(io, base_bio->bi_size, &bvec_idx);
                if (unlikely(!clone)) {
                        dec_pending(io, -ENOMEM);
                        return;
@@ -631,31 +623,23 @@ static void process_write(struct crypt_io *io)
                        return;
                }
 
-               clone_init(io, clone);
                clone->bi_sector = cc->start + sector;
-
-               if (!io->first_clone) {
-                       /*
-                        * hold a reference to the first clone, because it
-                        * holds the bio_vec array and that can't be freed
-                        * before all other clones are released
-                        */
-                       bio_get(clone);
-                       io->first_clone = clone;
-               }
-
                remaining -= clone->bi_size;
                sector += bio_sectors(clone);
 
-               /* prevent bio_put of first_clone */
+               /* Grab another reference to the io struct
+                * before we kick off the request */
                if (remaining)
                        atomic_inc(&io->pending);
 
                generic_make_request(clone);
 
+               /* Do not reference clone after this - it
+                * may be gone already. */
+
                /* out of memory -> run queues */
                if (remaining)
-                       congestion_wait(bio_data_dir(clone), HZ/100);
+                       congestion_wait(WRITE, HZ/100);
        }
 }
 
@@ -954,10 +938,12 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
        struct crypt_config *cc = ti->private;
        struct crypt_io *io;
 
+       if (bio_barrier(bio))
+               return -EOPNOTSUPP;
+
        io = mempool_alloc(cc->io_pool, GFP_NOIO);
        io->target = ti;
        io->base_bio = bio;
-       io->first_clone = NULL;
        io->error = io->post_process = 0;
        atomic_set(&io->pending, 0);
        kcryptd_queue_io(io);
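
With first_clone gone, write completion relies purely on the pending
counter: every in-flight clone holds a reference on the crypt_io, and
the base bio completes only when the last one drops it. A minimal
sketch of that counting pattern with stand-in types (C11 atomics here,
not the kernel's atomic_t):

    #include <stdatomic.h>

    struct io {
            atomic_int pending;
            /* ... base bio, error status ... */
    };

    static void dec_pending(struct io *io)
    {
            /* fetch_sub returns the old value: only the caller that
             * drops the count from 1 to 0 finishes the base bio. */
            if (atomic_fetch_sub(&io->pending, 1) != 1)
                    return; /* clones still in flight */
            /* last reference: end the base bio and free io */
    }
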
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index dfe3214..2c404f7 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -415,7 +415,7 @@ static int raid0_make_request (request_queue_t *q, struct bio *bio)
        raid0_conf_t *conf = mddev_to_conf(mddev);
        struct strip_zone *zone;
        mdk_rdev_t *tmp_dev;
-       unsigned long chunk;
+       sector_t chunk;
        sector_t block, rsect;
        const int rw = bio_data_dir(bio);
 
@@ -470,7 +470,6 @@ static int raid0_make_request (request_queue_t *q, struct bio *bio)
 
                sector_div(x, zone->nb_dev);
                chunk = x;
-               BUG_ON(x != (sector_t)chunk);
 
                x = block >> chunksize_bits;
                tmp_dev = zone->dev[sector_div(x, zone->nb_dev)];
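
The raid0 change matters on 32-bit hosts with large-block-device
support, where sector_t is 64-bit but unsigned long is only 32-bit:
storing the chunk number in an unsigned long silently truncated it on
arrays past the 32-bit boundary, which the removed BUG_ON used to trip
over. A self-contained illustration of the truncation (assuming those
type widths):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t x = 0x100000123ULL;      /* chunk number > 2^32 */
            uint32_t old_chunk = (uint32_t)x; /* old code: truncates */
            uint64_t new_chunk = x;           /* new code: preserved */

            printf("%u vs %llu\n", old_chunk,
                   (unsigned long long)new_chunk);
            return 0;
    }
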
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 97ee870..b20c6e9 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1235,17 +1235,24 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
                        }
                r1_bio->read_disk = primary;
                for (i=0; i<mddev->raid_disks; i++)
-                       if (r1_bio->bios[i]->bi_end_io == end_sync_read &&
-                           test_bit(BIO_UPTODATE, &r1_bio->bios[i]->bi_flags)) {
+                       if (r1_bio->bios[i]->bi_end_io == end_sync_read) {
                                int j;
                                int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
                                struct bio *pbio = r1_bio->bios[primary];
                                struct bio *sbio = r1_bio->bios[i];
-                               for (j = vcnt; j-- ; )
-                                       if (memcmp(page_address(pbio->bi_io_vec[j].bv_page),
-                                                  page_address(sbio->bi_io_vec[j].bv_page),
-                                                  PAGE_SIZE))
-                                               break;
+
+                               if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) {
+                                       for (j = vcnt; j-- ; ) {
+                                               struct page *p, *s;
+                                               p = pbio->bi_io_vec[j].bv_page;
+                                               s = sbio->bi_io_vec[j].bv_page;
+                                               if (memcmp(page_address(p),
+                                                          page_address(s),
+                                                          PAGE_SIZE))
+                                                       break;
+                                       }
+                               } else
+                                       j = 0;
                                if (j >= 0)
                                        mddev->resync_mismatches += r1_bio->sectors;
                                if (j < 0 || test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 82249a6..9eb66c1 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1867,6 +1867,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
                        int d = r10_bio->devs[i].devnum;
                        bio = r10_bio->devs[i].bio;
                        bio->bi_end_io = NULL;
+                       clear_bit(BIO_UPTODATE, &bio->bi_flags);
                        if (conf->mirrors[d].rdev == NULL ||
                            test_bit(Faulty, &conf->mirrors[d].rdev->flags))
                                continue;
@@ -2037,6 +2038,11 @@ static int run(mddev_t *mddev)
        /* 'size' is now the number of chunks in the array */
        /* calculate "used chunks per device" in 'stride' */
        stride = size * conf->copies;
+
+       /* We need to round up when dividing by raid_disks to
+        * get the stride size.
+        */
+       stride += conf->raid_disks - 1;
        sector_div(stride, conf->raid_disks);
        mddev->size = stride  << (conf->chunk_shift-1);
 
diff --git a/drivers/media/video/saa7134/saa7134-tvaudio.c b/drivers/media/video/saa7134/saa7134-tvaudio.c
index dd759d6..36b3fa3 100644
--- a/drivers/media/video/saa7134/saa7134-tvaudio.c
+++ b/drivers/media/video/saa7134/saa7134-tvaudio.c
@@ -1006,7 +1006,7 @@ int saa7134_tvaudio_init2(struct saa7134_dev *dev)
 int saa7134_tvaudio_fini(struct saa7134_dev *dev)
 {
        /* shutdown tvaudio thread */
-       if (dev->thread.pid >= 0) {
+       if (dev->thread.pid > 0) {
                dev->thread.shutdown = 1;
                wake_up_interruptible(&dev->thread.wq);
                wait_for_completion(&dev->thread.exit);
diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c
index c6259c7..40bdcf9 100644
--- a/drivers/net/e1000/e1000_main.c
+++ b/drivers/net/e1000/e1000_main.c
@@ -1157,13 +1157,16 @@ e1000_probe(struct pci_dev *pdev,
            !e1000_check_mng_mode(&adapter->hw))
                e1000_get_hw_control(adapter);
 
-       strcpy(netdev->name, "eth%d");
-       if ((err = register_netdev(netdev)))
-               goto err_register;
-
        /* tell the stack to leave us alone until e1000_open() is called */
        netif_carrier_off(netdev);
        netif_stop_queue(netdev);
+#ifdef CONFIG_E1000_NAPI
+       netif_poll_disable(netdev);
+#endif
+
+       strcpy(netdev->name, "eth%d");
+       if ((err = register_netdev(netdev)))
+               goto err_register;
 
        DPRINTK(PROBE, INFO, "Intel(R) PRO/1000 Network Connection\n");
 
diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c
index 38e75cf..aec8c59 100644
--- a/drivers/net/sky2.c
+++ b/drivers/net/sky2.c
@@ -95,7 +95,7 @@ static int disable_msi = 0;
 module_param(disable_msi, int, 0);
 MODULE_PARM_DESC(disable_msi, "Disable Message Signaled Interrupt (MSI)");
 
-static int idle_timeout = 0;
+static int idle_timeout = 100;
 module_param(idle_timeout, int, 0);
 MODULE_PARM_DESC(idle_timeout, "Watchdog timer for lost interrupts (ms)");
 
@@ -2341,6 +2341,13 @@ static int sky2_poll(struct net_device *dev0, int *budget)
 
        work_done = sky2_status_intr(hw, work_limit);
        if (work_done < work_limit) {
+               /* Bug/Errata workaround?
+                * Need to kick the TX irq moderation timer.
+                */
+               if (sky2_read8(hw, STAT_TX_TIMER_CTRL) == TIM_START) {
+                       sky2_write8(hw, STAT_TX_TIMER_CTRL, TIM_STOP);
+                       sky2_write8(hw, STAT_TX_TIMER_CTRL, TIM_START);
+               }
                netif_rx_complete(dev0);
 
                sky2_read32(hw, B0_Y2_SP_LISR);
diff --git a/drivers/serial/mpsc.c b/drivers/serial/mpsc.c
index 3d2fcc5..64ed5ef 100644
--- a/drivers/serial/mpsc.c
+++ b/drivers/serial/mpsc.c
@@ -502,7 +502,8 @@ mpsc_sdma_intr_ack(struct mpsc_port_info *pi)
 
        if (pi->mirror_regs)
                pi->shared_regs->SDMA_INTR_CAUSE_m = 0;
-       writel(0, pi->shared_regs->sdma_intr_base + SDMA_INTR_CAUSE);
+       writeb(0x00, pi->shared_regs->sdma_intr_base + SDMA_INTR_CAUSE +
+              pi->port.line);
        return;
 }
 
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 2275f27..8f820e4 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -59,6 +59,7 @@ extern void *__alloc_bootmem_core(struct bootmem_data *bdata,
                                  unsigned long align,
                                  unsigned long goal,
                                  unsigned long limit);
+extern void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size);
 
 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
 extern void reserve_bootmem(unsigned long addr, unsigned long size);
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index d37f46a..f768e10 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2235,11 +2235,11 @@
 #define PCI_DEVICE_ID_INTEL_ICH8_5     0x283e
 #define PCI_DEVICE_ID_INTEL_ICH8_6     0x2850
 #define PCI_DEVICE_ID_INTEL_ICH9_0     0x2910
-#define PCI_DEVICE_ID_INTEL_ICH9_1     0x2911
+#define PCI_DEVICE_ID_INTEL_ICH9_1     0x2917
 #define PCI_DEVICE_ID_INTEL_ICH9_2     0x2912
 #define PCI_DEVICE_ID_INTEL_ICH9_3     0x2913
 #define PCI_DEVICE_ID_INTEL_ICH9_4     0x2914
-#define PCI_DEVICE_ID_INTEL_ICH9_5     0x2915
+#define PCI_DEVICE_ID_INTEL_ICH9_5     0x2919
 #define PCI_DEVICE_ID_INTEL_ICH9_6     0x2930
 #define PCI_DEVICE_ID_INTEL_82855PM_HB 0x3340
 #define PCI_DEVICE_ID_INTEL_82830_HB   0x3575
diff --git a/include/linux/raid/bitmap.h b/include/linux/raid/bitmap.h
index 6db9a4c..dd5a05d 100644
--- a/include/linux/raid/bitmap.h
+++ b/include/linux/raid/bitmap.h
@@ -232,6 +232,7 @@ struct bitmap {
        struct page **filemap; /* list of cache pages for the file */
        unsigned long *filemap_attr; /* attributes associated w/ filemap pages */
        unsigned long file_pages; /* number of pages in the file */
+       int last_page_size; /* bytes in the last page */
 
        unsigned long flags;
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4463735..7a0cc67 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1137,6 +1137,7 @@ static inline void put_task_struct(struct task_struct *t)
                                        /* Not implemented yet, only for 486*/
 #define PF_STARTING    0x00000002      /* being created */
 #define PF_EXITING     0x00000004      /* getting shut down */
+#define PF_EXITPIDONE  0x00000008      /* pi exit done on shut down */
 #define PF_FORKNOEXEC  0x00000040      /* forked but didn't exec */
 #define PF_SUPERPRIV   0x00000100      /* used super-user privileges */
 #define PF_DUMPCORE    0x00000200      /* dumped core */
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 2a7b38d..1a76bda 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -162,7 +162,7 @@ extern struct workqueue_struct *__create_workqueue(const char *name,
                                                    int singlethread,
                                                    int freezeable);
 #define create_workqueue(name) __create_workqueue((name), 0, 0)
-#define create_freezeable_workqueue(name) __create_workqueue((name), 0, 1)
+#define create_freezeable_workqueue(name) __create_workqueue((name), 1, 1)
 #define create_singlethread_workqueue(name) __create_workqueue((name), 1, 0)
 
 extern void destroy_workqueue(struct workqueue_struct *wq);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 9c8c232..5a75657 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -905,7 +905,7 @@ static void audit_update_watch(struct audit_parent *parent,
 
                /* If the update involves invalidating rules, do the inode-based
                 * filtering now, so we don't omit records. */
-               if (invalidating &&
+               if (invalidating && current->audit_context &&
                    audit_filter_inodes(current, current->audit_context) == AUDIT_RECORD_CONTEXT)
                        audit_set_auditable(current->audit_context);
 
diff --git a/kernel/exit.c b/kernel/exit.c
index fec12eb..d306845 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -883,13 +883,29 @@ fastcall NORET_TYPE void do_exit(long code)
        if (unlikely(tsk->flags & PF_EXITING)) {
                printk(KERN_ALERT
                        "Fixing recursive fault but reboot is needed!\n");
+               /*
+                * We can do this unlocked here. The futex code uses
+                * this flag just to verify whether the pi state
+                * cleanup has been done or not. In the worst case it
+                * loops once more. We pretend that the cleanup was
+                * done as there is no way to return. Either the
+                * OWNER_DIED bit is set by now or we push the blocked
+                * task into the wait-forever nirvana as well.
+                */
+               tsk->flags |= PF_EXITPIDONE;
                if (tsk->io_context)
                        exit_io_context();
                set_current_state(TASK_UNINTERRUPTIBLE);
                schedule();
        }
 
+       /*
+        * tsk->flags are checked in the futex code to protect against
+        * an exiting task cleaning up the robust pi futexes.
+        */
+       spin_lock_irq(&tsk->pi_lock);
        tsk->flags |= PF_EXITING;
+       spin_unlock_irq(&tsk->pi_lock);
 
        if (unlikely(in_atomic()))
                printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
@@ -956,6 +972,12 @@ fastcall NORET_TYPE void do_exit(long code)
         * Make sure we are holding no locks:
         */
        debug_check_no_locks_held(tsk);
+       /*
+        * We can do this unlocked here. The futex code uses this flag
+        * just to verify whether the pi state cleanup has been done
+        * or not. In the worst case it loops once more.
+        */
+       tsk->flags |= PF_EXITPIDONE;
 
        if (tsk->io_context)
                exit_io_context();
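
Taken together with the futex hunks below, the two PF_EXITPIDONE sites
define a small handshake: PF_EXITING alone means "exiting, pi cleanup
still pending", PF_EXITING plus PF_EXITPIDONE means "cleanup done". A
sketch of the check the futex side performs under p->pi_lock (flag
values copied from the sched.h hunk above):

    #include <errno.h>

    #define PF_EXITING    0x00000004
    #define PF_EXITPIDONE 0x00000008

    static int exiting_owner_status(unsigned long flags)
    {
            if (!(flags & PF_EXITING))
                    return 0;        /* owner still alive */
            return (flags & PF_EXITPIDONE)
                    ? -ESRCH         /* cleanup finished */
                    : -EAGAIN;       /* retry later */
    }
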
diff --git a/kernel/futex.c b/kernel/futex.c
index 1df411e..99dad33 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -390,18 +390,12 @@ static struct task_struct * futex_find_get_task(pid_t pid)
 
        rcu_read_lock();
        p = find_task_by_pid(pid);
-       if (!p)
-               goto out_unlock;
-       if ((current->euid != p->euid) && (current->euid != p->uid)) {
-               p = NULL;
-               goto out_unlock;
-       }
-       if (p->exit_state != 0) {
-               p = NULL;
-               goto out_unlock;
-       }
-       get_task_struct(p);
-out_unlock:
+
+       if (!p || ((current->euid != p->euid) && (current->euid != p->uid)))
+               p = ERR_PTR(-ESRCH);
+       else
+               get_task_struct(p);
+
        rcu_read_unlock();
 
        return p;
@@ -467,7 +461,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
        struct futex_q *this, *next;
        struct list_head *head;
        struct task_struct *p;
-       pid_t pid;
+       pid_t pid = uval & FUTEX_TID_MASK;
 
        head = &hb->chain;
 
@@ -485,6 +479,8 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
                                return -EINVAL;
 
                        WARN_ON(!atomic_read(&pi_state->refcount));
+                       WARN_ON(pid && pi_state->owner &&
+                               pi_state->owner->pid != pid);
 
                        atomic_inc(&pi_state->refcount);
                        me->pi_state = pi_state;
@@ -495,15 +491,33 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
 
        /*
         * We are the first waiter - try to look up the real owner and attach
-        * the new pi_state to it, but bail out when the owner died bit is set
-        * and TID = 0:
+        * the new pi_state to it, but bail out when TID = 0
         */
-       pid = uval & FUTEX_TID_MASK;
-       if (!pid && (uval & FUTEX_OWNER_DIED))
+       if (!pid)
                return -ESRCH;
        p = futex_find_get_task(pid);
-       if (!p)
-               return -ESRCH;
+       if (IS_ERR(p))
+               return PTR_ERR(p);
+
+       /*
+        * We need to look at the task state flags to figure out
+        * whether the task is exiting. To protect against the do_exit
+        * change of the task flags, we do this protected by
+        * p->pi_lock:
+        */
+       spin_lock_irq(&p->pi_lock);
+       if (unlikely(p->flags & PF_EXITING)) {
+               /*
+                * The task is on the way out. When PF_EXITPIDONE is
+                * set, we know that the task has finished the
+                * cleanup:
+                */
+               int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
+
+               spin_unlock_irq(&p->pi_lock);
+               put_task_struct(p);
+               return ret;
+       }
 
        pi_state = alloc_pi_state();
 
@@ -516,7 +530,6 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
        /* Store the key for possible exit cleanups: */
        pi_state->key = me->key;
 
-       spin_lock_irq(&p->pi_lock);
        WARN_ON(!list_empty(&pi_state->list));
        list_add(&pi_state->list, &p->pi_state_list);
        pi_state->owner = p;
@@ -583,15 +596,22 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
         * preserve the owner died bit.)
         */
        if (!(uval & FUTEX_OWNER_DIED)) {
+               int ret = 0;
+
                newval = FUTEX_WAITERS | new_owner->pid;
 
                pagefault_disable();
                curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
                pagefault_enable();
+
                if (curval == -EFAULT)
-                       return -EFAULT;
+                       ret = -EFAULT;
                if (curval != uval)
-                       return -EINVAL;
+                       ret = -EINVAL;
+               if (ret) {
+                       spin_unlock(&pi_state->pi_mutex.wait_lock);
+                       return ret;
+               }
        }
 
        spin_lock_irq(&pi_state->owner->pi_lock);
@@ -1149,6 +1169,7 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
        if (unlikely(ret != 0))
                goto out_release_sem;
 
+ retry_unlocked:
        hb = queue_lock(&q, -1, NULL);
 
  retry_locked:
@@ -1200,34 +1221,58 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
        ret = lookup_pi_state(uval, hb, &q);
 
        if (unlikely(ret)) {
-               /*
-                * There were no waiters and the owner task lookup
-                * failed. When the OWNER_DIED bit is set, then we
-                * know that this is a robust futex and we actually
-                * take the lock. This is safe as we are protected by
-                * the hash bucket lock. We also set the waiters bit
-                * unconditionally here, to simplify glibc handling of
-                * multiple tasks racing to acquire the lock and
-                * cleanup the problems which were left by the dead
-                * owner.
-                */
-               if (curval & FUTEX_OWNER_DIED) {
-                       uval = newval;
-                       newval = current->pid |
-                               FUTEX_OWNER_DIED | FUTEX_WAITERS;
+               switch (ret) {
 
-                       pagefault_disable();
-                       curval = futex_atomic_cmpxchg_inatomic(uaddr,
-                                                              uval, newval);
-                       pagefault_enable();
+               case -EAGAIN:
+                       /*
+                        * Task is exiting and we just wait for the
+                        * exit to complete.
+                        */
+                       queue_unlock(&q, hb);
+                       up_read(&curr->mm->mmap_sem);
+                       cond_resched();
+                       goto retry;
 
-                       if (unlikely(curval == -EFAULT))
+               case -ESRCH:
+                       /*
+                        * No owner found for this futex. Check if the
+                        * OWNER_DIED bit is set to figure out whether
+                        * this is a robust futex or not.
+                        */
+                       if (get_futex_value_locked(&curval, uaddr))
                                goto uaddr_faulted;
-                       if (unlikely(curval != uval))
-                               goto retry_locked;
-                       ret = 0;
+
+                       /*
+                        * There were no waiters and the owner task lookup
+                        * failed. When the OWNER_DIED bit is set, then we
+                        * know that this is a robust futex and we actually
+                        * take the lock. This is safe as we are protected by
+                        * the hash bucket lock. We also set the waiters bit
+                        * unconditionally here, to simplify glibc handling of
+                        * multiple tasks racing to acquire the lock and
+                        * cleanup the problems which were left by the dead
+                        * owner.
+                        */
+                       if (curval & FUTEX_OWNER_DIED) {
+                               uval = newval;
+                               newval = current->pid |
+                                       FUTEX_OWNER_DIED | FUTEX_WAITERS;
+
+                               pagefault_disable();
+                               curval = futex_atomic_cmpxchg_inatomic(uaddr,
+                                                                      uval,
+                                                                      newval);
+                               pagefault_enable();
+
+                               if (unlikely(curval == -EFAULT))
+                                       goto uaddr_faulted;
+                               if (unlikely(curval != uval))
+                                       goto retry_locked;
+                               ret = 0;
+                       }
+               default:
+                       goto out_unlock_release_sem;
                }
-               goto out_unlock_release_sem;
        }
 
        /*
@@ -1279,39 +1324,52 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
                list_add(&q.pi_state->list, &current->pi_state_list);
                spin_unlock_irq(&current->pi_lock);
 
-               /* Unqueue and drop the lock */
-               unqueue_me_pi(&q, hb);
-               up_read(&curr->mm->mmap_sem);
                /*
                 * We own it, so we have to replace the pending owner
-                * TID. This must be atomic as we have preserve the
+                * TID. This must be atomic as we have to preserve the
                 * owner died bit here.
                 */
-               ret = get_user(uval, uaddr);
+               ret = get_futex_value_locked(&uval, uaddr);
                while (!ret) {
                        newval = (uval & FUTEX_OWNER_DIED) | newtid;
+
+                       pagefault_disable();
                        curval = futex_atomic_cmpxchg_inatomic(uaddr,
                                                               uval, newval);
+                       pagefault_enable();
+
                        if (curval == -EFAULT)
                                ret = -EFAULT;
                        if (curval == uval)
                                break;
                        uval = curval;
                }
-       } else {
+       } else if (ret) {
                /*
                 * Catch the rare case, where the lock was released
                 * when we were on the way back before we locked
                 * the hash bucket.
                 */
-               if (ret && q.pi_state->owner == curr) {
-                       if (rt_mutex_trylock(&q.pi_state->pi_mutex))
-                               ret = 0;
+               if (q.pi_state->owner == curr &&
+                   rt_mutex_trylock(&q.pi_state->pi_mutex)) {
+                       ret = 0;
+               } else {
+                       /*
+                        * Paranoia check. If we did not take the lock
+                        * in the trylock above, then we should not be
+                        * the owner of the rtmutex, neither the real
+                        * nor the pending one:
+                        */
+                       if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr)
+                               printk(KERN_ERR "futex_lock_pi: ret = %d "
+                                      "pi-mutex: %p pi-state %p\n", ret,
+                                      q.pi_state->pi_mutex.owner,
+                                      q.pi_state->owner);
                }
-               /* Unqueue and drop the lock */
-               unqueue_me_pi(&q, hb);
-               up_read(&curr->mm->mmap_sem);
        }
+       /* Unqueue and drop the lock */
+       unqueue_me_pi(&q, hb);
+       up_read(&curr->mm->mmap_sem);
 
        if (!detect && ret == -EDEADLK && 0)
                force_sig(SIGKILL, current);
@@ -1331,16 +1389,18 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
         * non-atomically.  Therefore, if get_user below is not
         * enough, we need to handle the fault ourselves, while
         * still holding the mmap_sem.
+        *
+        * ... and hb->lock. :-) --ANK
         */
+       queue_unlock(&q, hb);
+
        if (attempt++) {
-               if (futex_handle_fault((unsigned long)uaddr, attempt)) {
-                       ret = -EFAULT;
-                       goto out_unlock_release_sem;
-               }
-               goto retry_locked;
+               ret = futex_handle_fault((unsigned long)uaddr, attempt);
+               if (ret)
+                       goto out_release_sem;
+               goto retry_unlocked;
        }
 
-       queue_unlock(&q, hb);
        up_read(&curr->mm->mmap_sem);
 
        ret = get_user(uval, uaddr);
@@ -1382,9 +1442,9 @@ retry:
                goto out;
 
        hb = hash_futex(&key);
+retry_unlocked:
        spin_lock(&hb->lock);
 
-retry_locked:
        /*
         * To avoid races, try to do the TID -> 0 atomic transition
         * again. If it succeeds then we can return without waking
@@ -1446,16 +1506,17 @@ pi_faulted:
         * non-atomically.  Therefore, if get_user below is not
         * enough, we need to handle the fault ourselves, while
         * still holding the mmap_sem.
+        *
+        * ... and hb->lock. :-) --ANK
         */
+       spin_unlock(&hb->lock);
+
        if (attempt++) {
-               if (futex_handle_fault((unsigned long)uaddr, attempt)) {
-                       ret = -EFAULT;
-                       goto out_unlock;
-               }
-               goto retry_locked;
+               ret = futex_handle_fault((unsigned long)uaddr, attempt);
+               if (ret)
+                       goto out;
+               goto retry_unlocked;
        }
-
-       spin_unlock(&hb->lock);
        up_read(&current->mm->mmap_sem);
 
        ret = get_user(uval, uaddr);
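
Both futex fault paths now follow the same discipline: never handle a
page fault while holding the hash-bucket lock. The lock is dropped
first and the whole sequence is retried from a label above the lock
acquisition. A sketch of that control flow with hypothetical stand-in
helpers (not the kernel functions):

    #include <stdbool.h>

    extern void hb_lock(void);
    extern void hb_unlock(void);
    extern bool user_access_faulted(void);
    extern int handle_fault(void);

    static int lock_pi_sketch(void)
    {
            int ret;
    retry_unlocked:
            hb_lock();
            if (user_access_faulted()) {
                    hb_unlock();         /* never fault while locked */
                    ret = handle_fault();
                    if (ret)
                            return ret;  /* genuine fault: bail out */
                    goto retry_unlocked; /* page is in: try again */
            }
            /* ... fast path under the lock ... */
            hb_unlock();
            return 0;
    }
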
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 4ab17da..dd5feae 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -212,6 +212,19 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
        if (!waiter || !waiter->task)
                goto out_unlock_pi;
 
+       /*
+        * Check the orig_waiter state. After we dropped the locks,
+        * the previous owner of the lock might have released the lock
+        * and made us the pending owner:
+        */
+       if (orig_waiter && !orig_waiter->task)
+               goto out_unlock_pi;
+
+       /*
+        * Drop out when the task has no waiters. Note that
+        * top_waiter can be NULL when we are in deboosting
+        * mode!
+        */
        if (top_waiter && (!task_has_pi_waiters(task) ||
                           top_waiter != task_top_pi_waiter(task)))
                goto out_unlock_pi;
@@ -659,9 +672,16 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
                         * all over without going into schedule to try
                         * to get the lock now:
                         */
-                       if (unlikely(!waiter.task))
+                       if (unlikely(!waiter.task)) {
+                               /*
+                                * Reset the return value. We might
+                                * have returned with -EDEADLK and the
+                                * owner released the lock while we
+                                * were walking the pi chain.
+                                */
+                               ret = 0;
                                continue;
-
+                       }
                        if (unlikely(ret))
                                break;
                }
diff --git a/kernel/sched.c b/kernel/sched.c
index 62db30c..907ab05 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2814,17 +2814,21 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
        unsigned long next_balance = jiffies + 60 *  HZ;
 
        for_each_domain(this_cpu, sd) {
-               if (sd->flags & SD_BALANCE_NEWIDLE) {
+               unsigned long interval;
+
+               if (!(sd->flags & SD_LOAD_BALANCE))
+                       continue;
+
+               if (sd->flags & SD_BALANCE_NEWIDLE)
                        /* If we've pulled tasks over stop searching: */
                        pulled_task = load_balance_newidle(this_cpu,
-                                                       this_rq, sd);
-                       if (time_after(next_balance,
-                                 sd->last_balance + sd->balance_interval))
-                               next_balance = sd->last_balance
-                                               + sd->balance_interval;
-                       if (pulled_task)
-                               break;
-               }
+                                                               this_rq, sd);
+
+               interval = msecs_to_jiffies(sd->balance_interval);
+               if (time_after(next_balance, sd->last_balance + interval))
+                       next_balance = sd->last_balance + interval;
+               if (pulled_task)
+                       break;
        }
        if (!pulled_task)
                /*
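
The reworked idle_balance() computes next_balance for every domain
with SD_LOAD_BALANCE set, not just the SD_BALANCE_NEWIDLE ones, and
converts balance_interval (milliseconds) with msecs_to_jiffies()
before comparing jiffies values. The comparison has to be
wraparound-safe; a self-contained sketch of the same idea as the
kernel's time_after():

    #include <stdio.h>

    #define time_after(a, b) ((long)(b) - (long)(a) < 0)

    int main(void)
    {
            unsigned long next_balance = 1000;
            unsigned long last_balance = 900, interval = 50;

            /* Pull next_balance earlier if this domain is due sooner. */
            if (time_after(next_balance, last_balance + interval))
                    next_balance = last_balance + interval;
            printf("%lu\n", next_balance); /* 950 */
            return 0;
    }
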
diff --git a/mm/rmap.c b/mm/rmap.c
index 7ce69c1..c30781c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -53,24 +53,6 @@
 
 struct kmem_cache *anon_vma_cachep;
 
-static inline void validate_anon_vma(struct vm_area_struct *find_vma)
-{
-#ifdef CONFIG_DEBUG_VM
-       struct anon_vma *anon_vma = find_vma->anon_vma;
-       struct vm_area_struct *vma;
-       unsigned int mapcount = 0;
-       int found = 0;
-
-       list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
-               mapcount++;
-               BUG_ON(mapcount > 100000);
-               if (vma == find_vma)
-                       found = 1;
-       }
-       BUG_ON(!found);
-#endif
-}
-
 /* This must be called under the mmap_sem. */
 int anon_vma_prepare(struct vm_area_struct *vma)
 {
@@ -121,10 +103,8 @@ void __anon_vma_link(struct vm_area_struct *vma)
 {
        struct anon_vma *anon_vma = vma->anon_vma;
 
-       if (anon_vma) {
+       if (anon_vma)
                list_add_tail(&vma->anon_vma_node, &anon_vma->head);
-               validate_anon_vma(vma);
-       }
 }
 
 void anon_vma_link(struct vm_area_struct *vma)
@@ -134,7 +114,6 @@ void anon_vma_link(struct vm_area_struct *vma)
        if (anon_vma) {
                spin_lock(&anon_vma->lock);
                list_add_tail(&vma->anon_vma_node, &anon_vma->head);
-               validate_anon_vma(vma);
                spin_unlock(&anon_vma->lock);
        }
 }
@@ -148,7 +127,6 @@ void anon_vma_unlink(struct vm_area_struct *vma)
                return;
 
        spin_lock(&anon_vma->lock);
-       validate_anon_vma(vma);
        list_del(&vma->anon_vma_node);
 
        /* We must garbage collect the anon_vma if it's empty */
diff --git a/mm/sparse.c b/mm/sparse.c
index ac26eb0..faa08e2 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -209,6 +209,12 @@ static int sparse_init_one_section(struct mem_section *ms,
        return 1;
 }
 
+__attribute__((weak))
+void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
+{
+       return NULL;
+}
+
 static struct page *sparse_early_mem_map_alloc(unsigned long pnum)
 {
        struct page *map;
@@ -219,6 +225,11 @@ static struct page *sparse_early_mem_map_alloc(unsigned long pnum)
        if (map)
                return map;
 
+       map = alloc_bootmem_high_node(NODE_DATA(nid),
+                       sizeof(struct page) * PAGES_PER_SECTION);
+       if (map)
+               return map;
+
        map = alloc_bootmem_node(NODE_DATA(nid),
                        sizeof(struct page) * PAGES_PER_SECTION);
        if (map)
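
The __attribute__((weak)) stub in mm/sparse.c is a link-time default:
any architecture that provides a real alloc_bootmem_high_node() (as
arch/x86_64/mm/init.c now does) overrides it automatically, and
everyone else gets the NULL-returning fallback. A minimal
self-contained illustration of the weak-symbol pattern:

    #include <stdio.h>

    /* Generic fallback; a non-weak definition in another object
     * file takes precedence at link time. */
    __attribute__((weak)) int get_value(void)
    {
            return 0;
    }

    int main(void)
    {
            printf("%d\n", get_value()); /* 0 unless overridden */
            return 0;
    }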