On Thu, Jun 09, 2016 at 03:07:50PM +0200, Peter Zijlstra wrote:
> Which given the lack of serialization, and the code generated from
> update_cfs_rq_load_avg() is entirely possible.
> 
>       if (atomic_long_read(&cfs_rq->removed_load_avg)) {
>               s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
>               sa->load_avg = max_t(long, sa->load_avg - r, 0);
>               sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
>               removed_load = 1;
>       }
> 
> turns into:
> 
> ffffffff81087064:       49 8b 85 98 00 00 00    mov    0x98(%r13),%rax
> ffffffff8108706b:       48 85 c0                test   %rax,%rax
> ffffffff8108706e:       74 40                   je     ffffffff810870b0 <update_blocked_averages+0xc0>
> ffffffff81087070:       4c 89 f8                mov    %r15,%rax
> ffffffff81087073:       49 87 85 98 00 00 00    xchg   %rax,0x98(%r13)
> ffffffff8108707a:       49 29 45 70             sub    %rax,0x70(%r13)
> ffffffff8108707e:       4c 89 f9                mov    %r15,%rcx
> ffffffff81087081:       bb 01 00 00 00          mov    $0x1,%ebx
> ffffffff81087086:       49 83 7d 70 00          cmpq   $0x0,0x70(%r13)
> ffffffff8108708b:       49 0f 49 4d 70          cmovns 0x70(%r13),%rcx
> 
> Which you'll note ends up with sa->load_avg -= r in memory at
> ffffffff8108707a.
> 
> Ludicrous code generation if you ask me; I'd have expected something
> like (note, r15 holds 0):
> 
>       mov     %r15, %rax
>       xchg    %rax, cfs_rq->removed_load_avg
>       mov     sa->load_avg, %rcx
>       sub     %rax, %rcx
>       cmovs   %r15, %rcx
>       mov     %rcx, sa->load_avg

So I _should_ have looked at other unserialized users of ->load_avg, but
alas. Luckily nikbor reported a similar divide-by-zero from task_h_load(),
which instantly triggered recollection of this here problem.
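
To make the window concrete, here is a toy userspace reduction (the names are
made up, this is obviously not the kernel code); because ->load_avg is
unsigned, the transient negative shows up to a lockless reader as a huge
value:

#include <stdio.h>

static unsigned long load_avg = 100;    /* stands in for sa->load_avg */

int main(void)
{
        unsigned long r = 150;          /* removed load larger than load_avg */

        load_avg -= r;                  /* the sub that hits memory first */
        printf("lockless reader here sees: %lu\n", load_avg);

        if ((long)load_avg < 0)         /* clamp only happens afterwards */
                load_avg = 0;
        printf("after the clamp:          %lu\n", load_avg);
        return 0;
}

Anything peeking at ->load_avg without rq->lock in that window computes with
garbage.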

Now, we really do not want to go grab rq->lock there, so I did the below,
which actually ends up generating the 'right' code as per the above:

    3133:       49 87 85 98 00 00 00    xchg   %rax,0x98(%r13)
    313a:       49 8b 4d 70             mov    0x70(%r13),%rcx
    313e:       bb 01 00 00 00          mov    $0x1,%ebx
    3143:       48 29 c1                sub    %rax,%rcx
    3146:       49 0f 48 cf             cmovs  %r15,%rcx
    314a:       48 69 c0 82 45 ff ff    imul   $0xffffffffffff4582,%rax,%rax
    3151:       49 89 4d 70             mov    %rcx,0x70(%r13)

This ensures the negative value never hits memory and allows the
unserialized use.
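
For completeness, the same pattern as a standalone toy; read_once()/write_once()
here are just volatile-cast stand-ins for the kernel's READ_ONCE()/WRITE_ONCE(),
and the function is a long-only version of the macro added below:

#include <stdio.h>

#define read_once(x)            (*(volatile typeof(x) *)&(x))
#define write_once(x, val)      (*(volatile typeof(x) *)&(x) = (val))

static unsigned long load_avg = 100;            /* stands in for sa->load_avg */

static void sub_positive_long(unsigned long *ptr, long val)
{
        long tmp = read_once(*ptr);             /* one load into a temporary */

        tmp -= val;
        if (tmp < 0)                            /* clamp in the temporary... */
                tmp = 0;
        write_once(*ptr, tmp);                  /* ...so only 0 or the clamped
                                                   result is ever stored */
}

int main(void)
{
        sub_positive_long(&load_avg, 150);
        printf("load_avg = %lu\n", load_avg);   /* prints 0, never the underflow */
        return 0;
}

A concurrent reader then only ever observes the old value or the clamped new
one, never the intermediate negative.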

Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org>
---
 kernel/sched/fair.c | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f75930bdd326..3fd3d903e6b6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2878,6 +2878,20 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
        }
 }
 
+/*
+ * Explicitly do a load-store to ensure the temporary value never hits memory.
+ * This allows lockless observations without ever seeing the negative values.
+ *
+ * Incidentally, this also generates much saner code for x86.
+ */
+#define sub_positive(type, ptr, val) do {                      \
+       type tmp = READ_ONCE(*ptr);                             \
+       tmp -= (val);                                           \
+       if (tmp < 0)                                            \
+               tmp = 0;                                        \
+       WRITE_ONCE(*ptr, tmp);                                  \
+} while (0)
+
 /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
 static inline int
 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
@@ -2887,15 +2901,15 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
 
        if (atomic_long_read(&cfs_rq->removed_load_avg)) {
                s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
-               sa->load_avg = max_t(long, sa->load_avg - r, 0);
-               sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
+               sub_positive(long, &sa->load_avg, r);
+               sub_positive(s64,  &sa->load_sum, r * LOAD_AVG_MAX);
                removed_load = 1;
        }
 
        if (atomic_long_read(&cfs_rq->removed_util_avg)) {
                long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
-               sa->util_avg = max_t(long, sa->util_avg - r, 0);
-               sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
+               sub_positive(long, &sa->util_avg, r);
+               sub_positive(s32,  &sa->util_sum, r * LOAD_AVG_MAX);
                removed_util = 1;
        }
 
@@ -2968,10 +2982,10 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
                          &se->avg, se->on_rq * scale_load_down(se->load.weight),
                          cfs_rq->curr == se, NULL);
 
-       cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
-       cfs_rq->avg.load_sum = max_t(s64,  cfs_rq->avg.load_sum - se->avg.load_sum, 0);
-       cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
-       cfs_rq->avg.util_sum = max_t(s32,  cfs_rq->avg.util_sum - se->avg.util_sum, 0);
+       sub_positive(long, &cfs_rq->avg.load_avg, se->avg.load_avg);
+       sub_positive(s64,  &cfs_rq->avg.load_sum, se->avg.load_sum);
+       sub_positive(long, &cfs_rq->avg.util_avg, se->avg.util_avg);
+       sub_positive(s32,  &cfs_rq->avg.util_sum, se->avg.util_sum);
 
        cfs_rq_util_change(cfs_rq);
 }
