Package: linux-2.6
Version: 2.6.32-35
Tags: patch

We've now seen multiple crashes during periods of heavy I/O on amd64 machines running the 2.6.32-5-amd64 kernel from stock squeeze installs.
An example crash [0] yields a backtrace like this:

2011-06-26_12:46:14.63097 [62478.818625] divide error: 0000 [#1] SMP
2011-06-26_12:46:14.68003 [62478.822564] last sysfs file: /sys/devices/pci0000:00/0000:00:1e.0/0000:04:03.0/class
2011-06-26_12:46:14.68004 [62478.830287] CPU 0
2011-06-26_12:46:14.68005 [62478.832304] Modules linked in: rng_core btrfs zlib_deflate crc32c libcrc32c ufs qnx4 hfsplus hfs minix ntfs vfat msdos fat jfs xfs exportfs reiserfs ext4 jbd2 crc16 ext2 bridge stp kvm_intel kvm tun loop snd_pcm snd_timer snd soundcore snd_page_alloc dcdbas pcspkr psmouse serio_raw evdev button power_meter processor ext3 jbd mbcache sha256_generic aes_x86_64 aes_generic cbc dm_crypt dm_mod raid1 md_mod sd_mod crc_t10dif sg sr_mod cdrom ata_generic uhci_hcd mpt2sas ehci_hcd thermal ata_piix thermal_sys usbcore nls_base scsi_transport_sas libata scsi_mod bnx2 [last unloaded: scsi_wait_scan]
2011-06-26_12:46:14.68007 [62478.885126] Pid: 32653, comm: kvm Tainted: G W 2.6.32-5-amd64 #1 PowerEdge R410
2011-06-26_12:46:14.68008 [62478.893108] RIP: 0010:[<ffffffff81044d3a>]  [<ffffffff81044d3a>] find_busiest_group+0x3d0/0x876
2011-06-26_12:46:14.68009 [62478.901803] RSP: 0018:ffff8804c5a8ba68  EFLAGS: 00010046
2011-06-26_12:46:14.68010 [62478.907101] RAX: 0000000000000000 RBX: ffffffffffffffff RCX: ffffffff8103a601
2011-06-26_12:46:14.68012 [62478.914219] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000200
2011-06-26_12:46:14.68013 [62478.921334] RBP: ffff88044e40fd50 R08: 0000000000000000 R09: ffff88083e4400b0
2011-06-26_12:46:14.68014 [62478.928449] R10: ffff880298c3a8b8 R11: ffffffffa0253fb7 R12: 0000000000015780
2011-06-26_12:46:14.68014 [62478.935565] R13: 0000000000000000 R14: 0000000000000001 R15: ffff88083e440060
2011-06-26_12:46:14.68015 [62478.942683] FS:  00007f995a599700(0000) GS:ffff88044e400000(0000) knlGS:0000000000000000
2011-06-26_12:46:14.68016 [62478.950753] CS:  0010 DS: 002b ES: 002b CR0: 0000000080050033
2011-06-26_12:46:14.68017 [62478.956483] CR2: 00007f80157a6000 CR3: 0000000393285000 CR4: 00000000000026e0
2011-06-26_12:46:14.68018 [62478.963601] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
2011-06-26_12:46:14.68019 [62478.970716] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
2011-06-26_12:46:14.68020 [62478.977833] Process kvm (pid: 32653, threadinfo ffff8804c5a8a000, task ffff88083e67a350)
2011-06-26_12:46:14.68021 [62478.985901] Stack:
2011-06-26_12:46:14.68021 [62478.987907]  0000000000015788 0000000000015780 0000000000000008 0000000000015780
2011-06-26_12:46:14.68022 [62478.995142] <0> 0000000000015780 0000000000015780 ffffffff813cd8a8 ffffffff8106fde3
2011-06-26_12:46:14.68027 [62479.002834] <0> 0000000000000000 ffff88001d1e8e10 ffff88044e410108 ffff88044e40f9e0
2011-06-26_12:46:14.68027 [62479.010711] Call Trace:
2011-06-26_12:46:14.68028 [62479.013157]  [<ffffffff8106fde3>] ? tick_dev_program_event+0x2d/0x95
2011-06-26_12:46:14.68029 [62479.019496]  [<ffffffff81067b20>] ? __hrtimer_start_range_ns+0x22f/0x242
2011-06-26_12:46:14.68030 [62479.026183]  [<ffffffff812f9b40>] ? schedule+0x2bd/0x7cb
2011-06-26_12:46:14.68030 [62479.031491]  [<ffffffffa02856ec>] ? x86_emulate_insn+0x1f08/0x2fc4 [kvm]
2011-06-26_12:46:14.68031 [62479.038184]  [<ffffffffa026d858>] ? kvm_vcpu_block+0x94/0xb4 [kvm]
2011-06-26_12:46:14.68032 [62479.044349]  [<ffffffff81064bee>] ? autoremove_wake_function+0x0/0x2e
2011-06-26_12:46:14.68033 [62479.050781]  [<ffffffffa0278127>] ? kvm_arch_vcpu_ioctl_run+0x80b/0xa44 [kvm]
2011-06-26_12:46:14.68033 [62479.057901]  [<ffffffff8104a252>] ? try_to_wake_up+0x2a7/0x2b9
2011-06-26_12:46:14.68034 [62479.063719]  [<ffffffff8107188f>] ? wake_futex+0x31/0x4e
2011-06-26_12:46:14.68035 [62479.069024]  [<ffffffffa026a9d1>] ? kvm_vcpu_ioctl+0xf1/0x4e6 [kvm]
2011-06-26_12:46:14.68035 [62479.075275]  [<ffffffff81067b20>] ? __hrtimer_start_range_ns+0x22f/0x242
2011-06-26_12:46:14.68036 [62479.081964]  [<ffffffff810fa492>] ? vfs_ioctl+0x21/0x6c
2011-06-26_12:46:14.68038 [62479.087176]  [<ffffffff810fa9e0>] ? do_vfs_ioctl+0x48d/0x4cb
2011-06-26_12:46:14.68039 [62479.092822]  [<ffffffff81073c0a>] ? sys_futex+0x113/0x131
2011-06-26_12:46:14.68039 [62479.098210]  [<ffffffff81111451>] ? block_llseek+0x75/0x81
2011-06-26_12:46:14.68040 [62479.103681]  [<ffffffff810faa6f>] ? sys_ioctl+0x51/0x70
2011-06-26_12:46:14.68041 [62479.108894]  [<ffffffff81010b42>] ? system_call_fastpath+0x16/0x1b
2011-06-26_12:46:14.68042 [62479.115056] Code: bc 24 a0 01 00 00 00 74 10 48 8b 94 24 a0 01 00 00 c7 02 00 00 00 00 eb 65 41 8b 77 08 48 8b 84 24 38 01 00 00 31 d2 48 c1 e0 0a <48> f7 f6 48 8b b4 24 40 01 00 00 48 89 84 24 30 01 00 00 31 c0
2011-06-26_12:46:14.68042 [62479.134577] RIP  [<ffffffff81044d3a>] find_busiest_group+0x3d0/0x876
2011-06-26_12:46:14.68043 [62479.140927]  RSP <ffff8804c5a8ba68>
2011-06-26_12:46:14.68044 [62479.144752] ---[ end trace 62d8c362642de471 ]---
2011-06-26_12:46:14.68045 [62479.154150] divide error: 0000 [#2] SMP
2011-06-26_12:46:14.68045 [62479.158090] last sysfs file: /sys/devices/pci0000:00/0000:00:1e.0/0000:04:03.0/class
2011-06-26_12:46:14.68046 [62479.165813] CPU 2
2011-06-26_12:46:15.63168 [62479.167831] Modules linked in: rng_core btrfs zlib_deflate crc32c libcrc32c ufs qnx4 hfsplus hfs minix ntfs vfat msdos fat jfs xfs exportfs reiserfs ext4 jbd2 crc16 ext2 bridge stp kvm_intel kvm tun loop snd_pcm snd_timer snd soundcore snd_page_alloc dcdbas pcspkr psmouse serio_raw evdev button power_meter processor ext3 jbd mbcache sha256_generic aes_x86_64 aes_generic cbc dm_crypt dm_mod raid1 md_mod sd_mod crc_t10dif sg sr_mod cdrom ata_generic uhci_hcd mpt2sas ehci_hcd thermal ata_piix thermal_sys usbcore nls_base scsi_transport_sas libata scsi_mod bnx2 [last unloaded: scsi_wait_scan]
2011-06-26_12:46:15.63169 [62479.220649] Pid: 29476, comm: kvm Tainted: G D W 2.6.32-5-amd64 #1 PowerEdge R410
2011-06-26_12:46:15.63170 [62479.228632] RIP: 0010:[<ffffffff81044d3a>]  [<ffffffff81044d3a>] find_busiest_group+0x3d0/0x876
2011-06-26_12:46:15.63171 [62479.237323] RSP: 0018:ffff88014ac0da68  EFLAGS: 00010046
2011-06-26_12:46:15.63172 [62479.242621] RAX: 0000000000000000 RBX: ffffffffffffffff RCX: ffffffff8103a601
2011-06-26_12:46:15.63173 [62479.249737] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000200
2011-06-26_12:46:15.63175 [62479.256854] RBP: ffff88044e42fd50 R08: 0000000000000000 R09: ffff88083e4400b0
2011-06-26_12:46:15.63176 [62479.263970] R10: ffff88075e22d118 R11: ffffffffa0253fb7 R12: 0000000000015780
2011-06-26_12:46:15.63177 [62479.271086] R13: 0000000000000000 R14: 0000000000000000 R15: ffff88083e440060
2011-06-26_12:46:15.63177 [62479.278205] FS:  00007ff427c45700(0000) GS:ffff88044e420000(0000) knlGS:0000000000000000
2011-06-26_12:46:15.63181 [62479.286275] CS:  0010 DS: 002b ES: 002b CR0: 0000000080050033
2011-06-26_12:46:15.63182 [62479.292004] CR2: 00007fff1eaceee0 CR3: 00000001bc629000 CR4: 00000000000026e0
2011-06-26_12:46:15.63182 [62479.299120] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
2011-06-26_12:46:15.63183 [62479.306235] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
2011-06-26_12:46:15.63184 [62479.313351] Process kvm (pid: 29476, threadinfo ffff88014ac0c000, task ffff8803cb488710)
2011-06-26_12:46:15.63185 [62479.321419] Stack:
2011-06-26_12:46:15.63185 [62479.323424]  0000000000015788 0000000000015780 0000000000000008 0000000000015780
2011-06-26_12:46:15.63186 [62479.330659] <0> 0000000000015780 0000000000015780 ffff88075e22d0c0 ffffffffa027f406
2011-06-26_12:46:15.63187 [62479.338351] <0> 0000000000000000 ffff880799fa4410 ffff88044e430108 ffff88044e42f9e0
2011-06-26_12:46:15.63188 [62479.346228] Call Trace:
2011-06-26_12:46:15.63189 [62479.348679]  [<ffffffffa027f406>] ? paging64_walk_addr+0x175/0x41c [kvm]
2011-06-26_12:46:15.63190 [62479.355366]  [<ffffffff81067b20>] ? __hrtimer_start_range_ns+0x22f/0x242
2011-06-26_12:46:15.63191 [62479.362053]  [<ffffffff812f9b40>] ? schedule+0x2bd/0x7cb
2011-06-26_12:46:15.63191 [62479.367359]  [<ffffffffa02856ec>] ? x86_emulate_insn+0x1f08/0x2fc4 [kvm]
2011-06-26_12:46:15.63192 [62479.374051]  [<ffffffffa026d858>] ? kvm_vcpu_block+0x94/0xb4 [kvm]
2011-06-26_12:46:15.63193 [62479.380216]  [<ffffffff81064bee>] ? autoremove_wake_function+0x0/0x2e
2011-06-26_12:46:15.63193 [62479.386648]  [<ffffffffa0278127>] ? kvm_arch_vcpu_ioctl_run+0x80b/0xa44 [kvm]
2011-06-26_12:46:15.63194 [62479.393768]  [<ffffffff8104a252>] ? try_to_wake_up+0x2a7/0x2b9
2011-06-26_12:46:15.63195 [62479.399588]  [<ffffffff8101657d>] ? read_tsc+0xa/0x20
2011-06-26_12:46:15.63196 [62479.404632]  [<ffffffffa026a9d1>] ? kvm_vcpu_ioctl+0xf1/0x4e6 [kvm]
2011-06-26_12:46:15.63196 [62479.410886]  [<ffffffff8102419a>] ? lapic_next_event+0x18/0x1d
2011-06-26_12:46:15.63197 [62479.416705]  [<ffffffff8106fde3>] ? tick_dev_program_event+0x2d/0x95
2011-06-26_12:46:15.63198 [62479.423046]  [<ffffffff810fa492>] ? vfs_ioctl+0x21/0x6c
2011-06-26_12:46:15.63199 [62479.428259]  [<ffffffff810fa9e0>] ? do_vfs_ioctl+0x48d/0x4cb
2011-06-26_12:46:15.63199 [62479.433904]  [<ffffffff810641e1>] ? sys_timer_settime+0x233/0x283
2011-06-26_12:46:15.63201 [62479.439983]  [<ffffffff81063f9b>] ? common_timer_get+0xa9/0xbc
2011-06-26_12:46:15.63202 [62479.445801]  [<ffffffff810faa6f>] ? sys_ioctl+0x51/0x70
2011-06-26_12:46:15.63202 [62479.451014]  [<ffffffff81010b42>] ? system_call_fastpath+0x16/0x1b
2011-06-26_12:46:15.63203 [62479.457178] Code: bc 24 a0 01 00 00 00 74 10 48 8b 94 24 a0 01 00 00 c7 02 00 00 00 00 eb 65 41 8b 77 08 48 8b 84 24 38 01 00 00 31 d2 48 c1 e0 0a <48> f7 f6 48 8b b4 24 40 01 00 00 48 89 84 24 30 01 00 00 31 c0
2011-06-26_12:46:15.63204 [62479.476700] RIP  [<ffffffff81044d3a>] find_busiest_group+0x3d0/0x876
2011-06-26_12:46:15.63205 [62479.483048]  RSP <ffff88014ac0da68>
2011-06-26_12:46:15.63205 [62479.486527] ---[ end trace 62d8c362642de472 ]---

This appears to be related to the upstream kernel bug report:

  https://bugzilla.kernel.org/show_bug.cgi?id=16991

It looks like Ubuntu attempted to address the same bug in their
linux-ec2 package in March:

  https://bugs.launchpad.net/linux/+bug/614853

We've been running the attached patch (a simple workaround that guards
each of the relevant divisions against a zero divisor) on the Debian
packages in production for several weeks now (over a month on some
machines) and have not seen a recurrence of the problem.

I recommend this patch for inclusion in Debian's next bugfix release,
and I of course welcome feedback on it.

Regards,

        --dkg

[0] https://support.mayfirst.org/ticket/4423
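For reviewers: every hunk in the patch below applies the same
three-line transformation, sketched here in isolation with hypothetical
names (an illustration of the pattern, not kernel code). Note that the
faulting bytes in both oopses, <48> f7 f6, decode to "div %rsi", and
RSI is 0 in both register dumps, i.e. exactly such an unguarded
division:

    /*
     * Illustration only (hypothetical names): the shape of the change
     * applied at each division site in the patch below.
     */
    static unsigned long scale_load(unsigned long load, unsigned long scale,
                                    unsigned long power)
    {
            unsigned long avg_load = load * scale;

            /* Guard the divisor: cpu_power has been observed to reach 0
             * (upstream bug #16991), which makes the raw division fault.
             * When it is 0, keep the unscaled product rather than trap. */
            if (power)
                    avg_load /= power;

            return avg_load;
    }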
diff --git a/kernel/sched.c b/kernel/sched.c
index 2829d09..2856c54 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1577,7 +1577,9 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu,
 	 * shares_i = -----------------------------
 	 *               \Sum_j rq_weight_j
 	 */
-	shares = (sd_shares * rq_weight) / sd_rq_weight;
+	shares = (sd_shares * rq_weight);
+	if (sd_rq_weight)
+		shares /= sd_rq_weight;
 	shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
 
 	if (abs(shares - tg->se[cpu]->load.weight) >
@@ -3932,7 +3934,9 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	}
 
 	/* Adjust by relative CPU power of the group */
-	sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
+	sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE);
+	if (group->cpu_power)
+		sgs->avg_load /= group->cpu_power;
 
 	/*
 	 * Consider the group unbalanced when the imbalance is larger
@@ -4245,7 +4249,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	if (sds.this_load >= sds.max_load)
 		goto out_balanced;
 
-	sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
+	sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load);
+	if (sds.total_pwr)
+		sds.avg_load /= sds.total_pwr;
 
 	if (sds.this_load >= sds.avg_load)
 		goto out_balanced;
@@ -4323,7 +4329,9 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
 	 * the load can be moved away from the cpu that is potentially
 	 * running at a lower capacity.
 	 */
-	wl = (wl * SCHED_LOAD_SCALE) / power;
+	wl = (wl * SCHED_LOAD_SCALE);
+	if (power)
+		wl /= power;
 
 	if (wl > max_load) {
 		max_load = wl;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index d53c9c7..e9c7c86 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1354,7 +1354,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 	}
 
 	/* Adjust by relative CPU power of the group */
-	avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+	avg_load = (avg_load * SCHED_LOAD_SCALE);
+	if (group->cpu_power)
+		avg_load /= group->cpu_power;
 
 	if (local_group) {
 		this_load = avg_load;
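As a standalone illustration of why the guard matters (a userspace
sketch added for clarity, not part of the patch): on x86-64, an integer
division by zero raises the same divide error (#DE) that the kernel
trapped on above; in userspace it is delivered as SIGFPE. The GUARDED
macro here is hypothetical and simply stands in for the patch's zero
check:

    #include <stdio.h>
    #include <stdlib.h>

    int main(int argc, char **argv)
    {
            unsigned long load  = 1024;   /* stands in for group_load * SCHED_LOAD_SCALE */
            unsigned long power = (argc > 1) ? strtoul(argv[1], NULL, 10) : 0;

    #ifdef GUARDED
            /* the patch's workaround: divide only when the divisor is non-zero */
            if (power)
                    load /= power;
    #else
            load /= power;                /* power == 0 => divide error / SIGFPE */
    #endif
            printf("avg_load = %lu\n", load);
            return 0;
    }

Built without -DGUARDED and run with no argument, this dies on the
division, just as find_busiest_group does when a group's cpu_power is
0; built with -DGUARDED it falls through with the unscaled value, which
is the same trade-off the patch accepts: a briefly skewed load estimate
rather than a fatal oops.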