applied On Thu, Jul 14, 2016 at 04:00:19PM +0200, Fabian Grünbichler wrote: > see https://forum.proxmox.com/threads/ceph-kernel-4-4-8-bug.28196/ > and https://lkml.org/lkml/2016/3/17/570 for background > --- > Makefile | 1 + > ceph-scheduler-fix.patch | 137 > +++++++++++++++++++++++++++++++++++++++++++++++ > 2 files changed, 138 insertions(+) > create mode 100644 ceph-scheduler-fix.patch > > diff --git a/Makefile b/Makefile > index c2f9ae8..a29e062 100644 > --- a/Makefile > +++ b/Makefile > @@ -258,6 +258,7 @@ ${KERNEL_SRC}/README ${KERNEL_CFG_ORG}: ${KERNELSRCTAR} > cd ${KERNEL_SRC}; patch -p1 < > ../CVE-2016-4794-1-percpu-fix-synchronization-between-chunk-map_extend_.patch > cd ${KERNEL_SRC}; patch -p1 < > ../CVE-2016-4794-2-percpu-fix-synchronization-between-synchronous-map-e.patch > cd ${KERNEL_SRC}; patch -p1 < > ../CVE-2016-4470-KEYS-potential-uninitialized-variable.patch > + cd ${KERNEL_SRC}; patch -p1 < ../ceph-scheduler-fix.patch > sed -i ${KERNEL_SRC}/Makefile -e > 's/^EXTRAVERSION.*$$/EXTRAVERSION=${EXTRAVERSION}/' > touch $@ > > diff --git a/ceph-scheduler-fix.patch b/ceph-scheduler-fix.patch > new file mode 100644 > index 0000000..2466f82 > --- /dev/null > +++ b/ceph-scheduler-fix.patch > @@ -0,0 +1,137 @@ > +commit 8974189222159154c55f24ddad33e3613960521a > +Author: Peter Zijlstra <pet...@infradead.org> > +Date: Thu Jun 16 10:50:40 2016 +0200 > + > + sched/fair: Fix cfs_rq avg tracking underflow > + > + As per commit: > + > + b7fa30c9cc48 ("sched/fair: Fix post_init_entity_util_avg() > serialization") > + > + > the code generated from update_cfs_rq_load_avg(): > + > > + > if (atomic_long_read(&cfs_rq->removed_load_avg)) { > + > s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); > + > sa->load_avg = max_t(long, sa->load_avg - r, 0); > + > sa->load_sum = max_t(s64, sa->load_sum - r * > LOAD_AVG_MAX, 0); > + > removed_load = 1; > + > } > + > > + > turns into: > + > > + > ffffffff81087064: 49 8b 85 98 00 00 00 mov 0x98(%r13),%rax > + > ffffffff8108706b: 48 85 c0 test %rax,%rax > + > ffffffff8108706e: 74 40 je > ffffffff810870b0 <update_blocked_averages+0xc0> > + > ffffffff81087070: 4c 89 f8 mov %r15,%rax > + > ffffffff81087073: 49 87 85 98 00 00 00 xchg %rax,0x98(%r13) > + > ffffffff8108707a: 49 29 45 70 sub %rax,0x70(%r13) > + > ffffffff8108707e: 4c 89 f9 mov %r15,%rcx > + > ffffffff81087081: bb 01 00 00 00 mov $0x1,%ebx > + > ffffffff81087086: 49 83 7d 70 00 cmpq $0x0,0x70(%r13) > + > ffffffff8108708b: 49 0f 49 4d 70 cmovns 0x70(%r13),%rcx > + > > + > Which you'll note ends up with sa->load_avg -= r in memory at > + > ffffffff8108707a. > + > + So I _should_ have looked at other unserialized users of ->load_avg, > + but alas. Luckily nikbor reported a similar /0 from task_h_load() which > + instantly triggered recollection of this here problem. > + > + Aside from the intermediate value hitting memory and causing problems, > + there's another problem: the underflow detection relies on the signed > + bit. This reduces the effective width of the variables, IOW its > + effectively the same as having these variables be of signed type. > + > + This patch changes to a different means of unsigned underflow > + detection to not rely on the signed bit. This allows the variables to > + use the 'full' unsigned range. And it does so with explicit LOAD - > + STORE to ensure any intermediate value will never be visible in > + memory, allowing these unserialized loads. > + > + Note: GCC generates crap code for this, might warrant a look later. > + > + Note2: I say 'full' above, if we end up at U*_MAX we'll still explode; > + maybe we should do clamping on add too. > + > + Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org> > + Cc: Andrey Ryabinin <aryabi...@virtuozzo.com> > + Cc: Chris Wilson <ch...@chris-wilson.co.uk> > + Cc: Linus Torvalds <torva...@linux-foundation.org> > + Cc: Mike Galbraith <efa...@gmx.de> > + Cc: Peter Zijlstra <pet...@infradead.org> > + Cc: Thomas Gleixner <t...@linutronix.de> > + Cc: Yuyang Du <yuyang...@intel.com> > + Cc: bseg...@google.com > + Cc: ker...@kyup.com > + Cc: morten.rasmus...@arm.com > + Cc: p...@google.com > + Cc: steve.muc...@linaro.org > + Fixes: 9d89c257dfb9 ("sched/fair: Rewrite runnable load and utilization > average tracking") > + Link: > http://lkml.kernel.org/r/20160617091948.gj30...@twins.programming.kicks-ass.net > + Signed-off-by: Ingo Molnar <mi...@kernel.org> > + > +--- > + kernel/sched/fair.c | 33 +++++++++++++++++++++++++-------- > + 1 file changed, 25 insertions(+), 8 deletions(-) > + > +--- a/kernel/sched/fair.c > ++++ b/kernel/sched/fair.c > +@@ -2682,6 +2682,23 @@ static inline void update_tg_load_avg(st > + > + static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); > + > ++/* > ++ * Unsigned subtract and clamp on underflow. > ++ * > ++ * Explicitly do a load-store to ensure the intermediate value never hits > ++ * memory. This allows lockless observations without ever seeing the > negative > ++ * values. > ++ */ > ++#define sub_positive(_ptr, _val) do { \ > ++ typeof(_ptr) ptr = (_ptr); \ > ++ typeof(*ptr) val = (_val); \ > ++ typeof(*ptr) res, var = READ_ONCE(*ptr); \ > ++ res = var - val; \ > ++ if (res > var) \ > ++ res = 0; \ > ++ WRITE_ONCE(*ptr, res); \ > ++} while (0) > ++ > + /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ > + static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) > + { > +@@ -2690,15 +2707,15 @@ static inline int update_cfs_rq_load_avg > + > + if (atomic_long_read(&cfs_rq->removed_load_avg)) { > + s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); > +- sa->load_avg = max_t(long, sa->load_avg - r, 0); > +- sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); > ++ sub_positive(&sa->load_avg, r); > ++ sub_positive(&sa->load_sum, r * LOAD_AVG_MAX); > + removed = 1; > + } > + > + if (atomic_long_read(&cfs_rq->removed_util_avg)) { > + long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); > +- sa->util_avg = max_t(long, sa->util_avg - r, 0); > +- sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0); > ++ sub_positive(&sa->util_avg, r); > ++ sub_positive(&sa->util_sum, r * LOAD_AVG_MAX); > + } > + > + decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, > +@@ -2764,10 +2781,10 @@ static void detach_entity_load_avg(struc > + &se->avg, se->on_rq * > scale_load_down(se->load.weight), > + cfs_rq->curr == se, NULL); > + > +- cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - > se->avg.load_avg, 0); > +- cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - > se->avg.load_sum, 0); > +- cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - > se->avg.util_avg, 0); > +- cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - > se->avg.util_sum, 0); > ++ sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); > ++ sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum); > ++ sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); > ++ sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); > + } > + > + /* Add the load generated by se into cfs_rq's load average */ > -- > 2.1.4 > > > _______________________________________________ > pve-devel mailing list > pve-devel@pve.proxmox.com > http://pve.proxmox.com/cgi-bin/mailman/listinfo/pve-devel >
_______________________________________________ pve-devel mailing list pve-devel@pve.proxmox.com http://pve.proxmox.com/cgi-bin/mailman/listinfo/pve-devel