The function __acct_update_integrals() is called both from irq context and task context. This creates a race where irq context can advance tsk->acct_timexpd to a value larger than time, leading to a negative value, which causes a divide error. See commit 6d5b5acca9e5 ("Fix fixpoint divide exception in acct_update_integrals")
In 2012, __acct_update_integrals() was changed to get utime and stime as function parameters. This re-introduced the bug, because an irq can hit in-between the call to task_cputime() and where irqs actually get disabled. However, this race condition was originally reproduced on Hercules, and I have not seen any reports of it re-occurring since it was re-introduced 3 years ago. On the other hand, the irq disabling and re-enabling, which no longer even protects us against the race today, show up prominently in the perf profile of a program that makes a very large number of system calls in a short period of time, when nohz_full= (and context tracking) is enabled. This patch replaces the (now ineffective) irq blocking with a cheaper way to test for the race condition, and speeds up my microbenchmark with 10 million iterations, average of 5 runs, tiny stddev: run time system time vanilla 5.49s 2.08s patch 5.21s 1.92s Cc: Andy Lutomirsky <aml...@amacapital.com> Cc: Frederic Weisbecker <fweis...@redhat.com> Cc: Peter Zijlstra <pet...@infradead.org> Cc: Heiko Carstens <heiko.carst...@de.ibm.com> Signed-off-by: Rik van Riel <r...@redhat.com> --- V2: introduce signed_cputime_t to deal with 64 bit cputime_t on 32 bit architectures, and use READ_ONCE to ensure the value is always read atomically (Heiko Karstens) arch/powerpc/include/asm/cputime.h | 3 +++ arch/s390/include/asm/cputime.h | 3 +++ include/asm-generic/cputime_jiffies.h | 2 ++ include/asm-generic/cputime_nsecs.h | 3 +++ kernel/tsacct.c | 18 ++++++++++++------ 5 files changed, 23 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h index e2452550bcb1..e41b32f68a2c 100644 --- a/arch/powerpc/include/asm/cputime.h +++ b/arch/powerpc/include/asm/cputime.h @@ -32,6 +32,9 @@ static inline void setup_cputime_one_jiffy(void) { } typedef u64 __nocast cputime_t; typedef u64 __nocast cputime64_t; +typedef s64 signed_cputime_t; +typedef s64 signed_cputime64_t; + #define cmpxchg_cputime(ptr, old, new) cmpxchg(ptr, old, new) #ifdef __KERNEL__ diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h index 221b454c734a..2e8c268cc2a7 100644 --- a/arch/s390/include/asm/cputime.h +++ b/arch/s390/include/asm/cputime.h @@ -18,6 +18,9 @@ typedef unsigned long long __nocast cputime_t; typedef unsigned long long __nocast cputime64_t; +typedef signed long long signed_cputime_t; +typedef signed long long signed_cputime64_t; + #define cmpxchg_cputime(ptr, old, new) cmpxchg64(ptr, old, new) static inline unsigned long __div(unsigned long long n, unsigned long base) diff --git a/include/asm-generic/cputime_jiffies.h b/include/asm-generic/cputime_jiffies.h index fe386fc6e85e..b96b6a1b6c97 100644 --- a/include/asm-generic/cputime_jiffies.h +++ b/include/asm-generic/cputime_jiffies.h @@ -2,6 +2,7 @@ #define _ASM_GENERIC_CPUTIME_JIFFIES_H typedef unsigned long __nocast cputime_t; +typedef signed long signed_cputime_t; #define cmpxchg_cputime(ptr, old, new) cmpxchg(ptr, old, new) @@ -11,6 +12,7 @@ typedef unsigned long __nocast cputime_t; #define jiffies_to_cputime(__hz) (__force cputime_t)(__hz) typedef u64 __nocast cputime64_t; +typedef s64 signed_cputime_t; #define cputime64_to_jiffies64(__ct) (__force u64)(__ct) #define jiffies64_to_cputime64(__jif) (__force cputime64_t)(__jif) diff --git a/include/asm-generic/cputime_nsecs.h b/include/asm-generic/cputime_nsecs.h index 0419485891f2..c1ad2f90a4d9 100644 --- a/include/asm-generic/cputime_nsecs.h +++ b/include/asm-generic/cputime_nsecs.h @@ -21,6 +21,9 @@ typedef u64 __nocast cputime_t; typedef u64 __nocast cputime64_t; +typedef s64 signed_cputime_t; +typedef s64 signed_cputime64_t; + #define cmpxchg_cputime(ptr, old, new) cmpxchg64(ptr, old, new) #define cputime_one_jiffy jiffies_to_cputime(1) diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 975cb49e32bf..1bebe0339ac1 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -126,23 +126,29 @@ static void __acct_update_integrals(struct task_struct *tsk, if (likely(tsk->mm)) { cputime_t time, dtime; struct timeval value; - unsigned long flags; u64 delta; - local_irq_save(flags); time = stime + utime; - dtime = time - tsk->acct_timexpd; + dtime = time - READ_ONCE(tsk->acct_timexpd); + /* + * This code is called both from irq context and from + * task context. There is a race where irq context advances + * tsk->acct_timexpd to a value larger than time, creating + * a negative value. In that case, the irq has already + * updated the statistics. + */ + if (unlikely((signed_cputime_t)dtime <= 0)) + return; + jiffies_to_timeval(cputime_to_jiffies(dtime), &value); delta = value.tv_sec; delta = delta * USEC_PER_SEC + value.tv_usec; if (delta == 0) - goto out; + return; tsk->acct_timexpd = time; tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; - out: - local_irq_restore(flags); } } -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/