Hi,

We found a concurrency bug in linux kernel 5.3.11. We were able to reproduce 
this bug in x86 when the kernel is compiled with non-standard GCC options and 
under specific thread interleavings. This bug causes a page fault that the 
kernel can’t handle due to an illegal memory access (“BUG: unable to handle 
page fault”). 

After some investigation, it seems the problem is caused by the use of the GCC 
extension for ternary operators (“Conditionals with Omitted Operands”) in the 
function __rht_ptr. 

The kernel seems to assume that the first operand is only evaluated once when 
the condition is true, but our experiments show that GCC actually evaluates 
twice the first operand causing two reads of the bkt variable (one to evaluate 
the condition and another to evaluate the implicit 2nd operand). Unfortunately 
under concurrency another thread can change the value of the variable 
in-between the two reads. 

In particular, if a) the condition evaluates to true during the first read, b) 
another thread changes the value that bkt is pointing to, which makes the 
condition false, c) the second (omitted) operand gets evaluated but is 
evaluated with a second read that returns a value inconsistent with the true 
condition.

Note that the GCC documentation mentions that the ternary operator with 
“Omitted Operands” should not cause side-effects to execute twice but there is 
no guarantee that non-volatile read memory operations are not executed twice, 
which we have also confirmed with GCC developers and you could find the 
information here. https://gcc.gnu.org/pipermail/gcc/2020-July/233018.html
 
------------------------------------------ 
Console output

[  109.796573] BUG: unable to handle page fault for address: ffffffe0
[  110.250775] #PF: supervisor read access in kernel mode
[  110.851490] #PF: error_code(0x0000) - not-present page
[  111.575747] *pde = 02248067 *pte = 00000000
[  112.170468] Oops: 0000 [#1] SMP
[  113.137733] CPU: 1 PID: 1799 Comm: ski-executor Not tainted 5.3.11 #1
[  113.936036] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007
[  114.567487] EIP: memcmp+0x9/0x32
[  115.636968] Code: 55 89 e5 57 56 8b 75 08 85 f6 74 11 bf 00 00 00 00 89 14 
f8 89 4c f8 04 47 39 f7 75 f4 5e 5f 5d c3 55 89 e5 56 53 85 c9 74 22 <0f> b6 18 
0f b6 32 29 f3 75 12 01 c1 40 42 39 c1 74 0a 0f b6 18 0f
[  118.578949] EAX: ffffffe0 EBX: 00000000 ECX: 00000004 EDX: ce4b5f3c
[  119.412369] ESI: c2076a9c EDI: ffffffe0 EBP: ce4b5f14 ESP: ce4b5f0c
[  120.380135] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00000202
[  121.241093] CR0: 80050033 CR2: ffffffe0 CR3: 0e272000 CR4: 00000690
[  122.258051] DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000
[  123.705309] DR6: 00000000 DR7: 00000000
[  124.875237] Call Trace:
[  126.062462]  ipcget+0xfa/0x26c
[  127.034312]  ksys_msgget+0x46/0x5d
[  127.667337]  sys_msgget+0x13/0x15
[  128.421597]  do_fast_syscall_32+0x99/0x285
[  129.166383]  entry_SYSENTER_32+0x9f/0xf2
[  130.204738] EIP: 0xb7fffaad
[  130.823257] Code: 8b 5d 08 e8 19 00 00 00 89 d3 eb e5 8b 04 24 c3 8b 0c 24 
c3 8b 1c 24 c3 8b 34 24 c3 8b 3c 24 c3 90 51 52 55 89 e5 0f 34 cd 80 <5d> 5a 59 
c3 90 90 90 90 8d 76 00 58 b8 77 00 00 00 cd 80 90 8d 76
[  133.839387] EAX: ffffffda EBX: 798e2635 ECX: 00000000 EDX: 00000000
[  134.921710] ESI: 00000000 EDI: 00000000 EBP: 00000000 ESP: bffff81c
[  136.039511] DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 007b EFLAGS: 00000296
[  137.326581] Modules linked in:
[  138.350381] CR2: 00000000ffffffe0
[  139.121664] ---[ end trace 5a43bf9a3ce51e57 ]---
[  139.987631] EIP: memcmp+0x9/0x32
[  140.203250] Code: 55 89 e5 57 56 8b 75 08 85 f6 74 11 bf 00 00 00 00 89 14 
f8 89 4c f8 04 47 39 f7 75 f4 5e 5f 5d c3 55 89 e5 56 53 85 c9 74 22 <0f> b6 18 
0f b6 32 29 f3 75 12 01 c1 40 42 39 c1 74 0a 0f b6 18 0f
[  140.553281] EAX: ffffffe0 EBX: 00000000 ECX: 00000004 EDX: ce4b5f3c
[  140.734438] ESI: c2076a9c EDI: ffffffe0 EBP: ce4b5f14 ESP: ce4b5f0c
[  140.858536] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00000202
[  140.959192] CR0: 80050033 CR2: ffffffe0 CR3: 0e272000 CR4: 00000690
[  141.052654] DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000
[  141.177823] DR6: 00000000 DR7: 00000000


------------------------------------------ 
Input and source code

This bug occurs when two syscalls, msget and msgctl, are invoked concurrently. 
Our analysis has located two lines of code, one of which will read a shared 
memory rhash_head __rcu object while the other writes to it. 

Writer:
starting from include/linux/rhashtable.h:399
    static inline void rht_assign_unlock(struct bucket_table *tbl,
        struct rhash_lock_head **bkt,
        struct rhash_head *obj)
    {
        struct rhash_head __rcu **p = (struct rhash_head __rcu **)bkt;

        if (rht_is_a_nulls(obj))
--->       obj = NULL;
        lock_map_release(&tbl->dep_map);
        rcu_assign_pointer(*p, obj);
        preempt_enable();
        __release(bitlock);
        local_bh_enable();
    }


Reader:
starting at include/linux/rhashtable.h:352
    static inline struct rhash_head rcu *rht_ptr(
        struct rhash_lock_head const **bkt)
{
        return (struct rhash_head __rcu *)
        ((unsigned long)*bkt & ~BIT(0) ?:
        (unsigned long)RHT_NULLS_MARKER(bkt));
}

return (struct rhash_head __rcu *)
--->       ((unsigned long)*bkt & ~BIT(0) ?:
        (unsigned long)RHT_NULLS_MARKER(bkt));

------------------------------------------ 
Thread interleaving

For presentation purposes, we convert the original code to explain the thread 
interleaving.

Original code:
return (struct rhash_head __rcu *)
       ((unsigned long)*bkt & ~BIT(0) ?:
        (unsigned long)RHT_NULLS_MARKER(bkt));

Converted code:
if ((unsigned long)*bkt & ~BIT(0)){
return  (struct rhash_head __rcu *)(unsigned long)*bkt & ~BIT(0);
}else{
return  (struct rhash_head __rcu *)(unsigned long)RHT_NULLS_MARKER(bkt);
}

Interleaving that triggers the bug:

CPU0 (msgctl)                   CPU1(msget)
…
if (rht_is_a_nulls(obj))
                                                …
                                                if ((unsigned long)*bkt & 
~BIT(0)){

obj = NULL;
                                                return  (struct rhash_head 
__rcu *)(unsigned long)*bkt & ~BIT(0);
                                                …
                                                rhashtable_compare(…)
                                                …
                                                return memcmp(ptr + 
ht->p.key_offset, arg->key, ht->p.key_len);
                                                [from rhashtable_compare at 
include/linux/rhashtable.h: 584]
                                                [page fault]

------------------------------------------ 
Kernel configuration and compilation

We were only able to reproduce this bug when the kernel is compiled with “-O1 
-fno-if-conversion -fno-if-conversion2 -fno-delayed-branch -fno-tree-fre 
-fno-tree-dominator-opts -fno-cprop-registers” instead of the default “-O2”. 
The compiler emits different instructions for the reader depending on the 
optimization level. In the buggy version, the writer has two read accesses to 
the bkt address but in the non-buggy version it only has one memory read 
operation.

It is unclear to us how this affects other architectures.


static inline struct rhash_head __rcu *__rht_ptr(
      struct rhash_lock_head *const *bkt)
  {
 return (struct rhash_head __rcu *)
      c11a36d4:   8b 45 e4                mov    -0x1c(%ebp),%eax
      c11a36d7:   83 c8 01                or     $0x1,%eax
      c11a36da:   89 45 e0                mov    %eax,-0x20(%ebp)
      c11a36dd:   89 75 d8                mov    %esi,-0x28(%ebp)
      c11a36e0:   8b 45 e4                mov    -0x1c(%ebp),%eax
---> c11a36e3:   f7 00 fe ff ff ff       testl  $0xfffffffe,(%eax)
      c11a36e9:   74 69                   je     c11a3754 
<bpf_offload_find_netdev+0x107>
---> c11a36eb:   8b 00                   mov    (%eax),%eax
      c11a36ed:   89 45 dc                mov    %eax,-0x24(%ebp)
      c11a36f0:   83 e0 fe                and    $0xfffffffe,%eax


static inline struct rhash_head __rcu *__rht_ptr(
     struct rhash_lock_head *const *bkt)
 {
     return (struct rhash_head __rcu *)
       c119b644:   8b 45 e4                mov    -0x1c(%ebp),%eax
       c119b647:   83 c8 01                or     $0x1,%eax
       c119b64a:   89 45 dc                mov    %eax,-0x24(%ebp)
       c119b64d:   89 75 d8                mov    %esi,-0x28(%ebp)
         ((unsigned long)*bkt & ~BIT(0) ?:
       c119b650:   8b 45 e4                mov    -0x1c(%ebp),%eax
  ---> c119b653:   8b 00                   mov    (%eax),%eax
        c119b655:   89 45 e0                mov    %eax,-0x20(%ebp)
      return (struct rhash_head __rcu *)
       c119b658:   83 e0 fe                and    $0xfffffffe,%eax
       c119b65b:   75 03                   jne    c119b660 
<bpf_offload_find_netdev+0xa3>
       c119b65d:   8b 45 dc                mov    -0x24(%ebp),%eax


Thanks,
Sishuai

Reply via email to