On 7/23/20 1:14 PM, Gong, Sishuai wrote:
> Hi,
> 
> We found a concurrency bug in linux kernel 5.3.11. We were able to reproduce 
> this bug in x86 when the kernel is compiled with non-standard GCC options and 
> under specific thread interleavings. This bug causes a page fault that the 
> kernel can’t handle due to an illegal memory access (“BUG: unable to handle 
> page fault”). 
> 
> After some investigation, it seems the problem is caused by the use of the 
> GCC extension for ternary operators (“Conditionals with Omitted Operands”) in 
> the function __rht_ptr. 
> 
> The kernel seems to assume that the first operand is only evaluated once when 
> the condition is true, but our experiments show that GCC actually evaluates 
> twice the first operand causing two reads of the bkt variable (one to 
> evaluate the condition and another to evaluate the implicit 2nd operand). 
> Unfortunately under concurrency another thread can change the value of the 
> variable in-between the two reads. 
> 
> In particular, if a) the condition evaluates to true during the first read, 
> b) another thread changes the value that bkt is pointing to, which makes the 
> condition false, c) the second (omitted) operand gets evaluated but is 
> evaluated with a second read that returns a value inconsistent with the true 
> condition.
> 
> Note that the GCC documentation mentions that the ternary operator with 
> “Omitted Operands” should not cause side-effects to execute twice but there 
> is no guarantee that non-volatile read memory operations are not executed 
> twice, which we have also confirmed with GCC developers and you could find 
> the information here. https://gcc.gnu.org/pipermail/gcc/2020-July/233018.html
>  
> ------------------------------------------ 
> Console output
> 
> [  109.796573] BUG: unable to handle page fault for address: ffffffe0
> [  110.250775] #PF: supervisor read access in kernel mode
> [  110.851490] #PF: error_code(0x0000) - not-present page
> [  111.575747] *pde = 02248067 *pte = 00000000
> [  112.170468] Oops: 0000 [#1] SMP
> [  113.137733] CPU: 1 PID: 1799 Comm: ski-executor Not tainted 5.3.11 #1
> [  113.936036] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007
> [  114.567487] EIP: memcmp+0x9/0x32
> [  115.636968] Code: 55 89 e5 57 56 8b 75 08 85 f6 74 11 bf 00 00 00 00 89 14 
> f8 89 4c f8 04 47 39 f7 75 f4 5e 5f 5d c3 55 89 e5 56 53 85 c9 74 22 <0f> b6 
> 18 0f b6 32 29 f3 75 12 01 c1 40 42 39 c1 74 0a 0f b6 18 0f
> [  118.578949] EAX: ffffffe0 EBX: 00000000 ECX: 00000004 EDX: ce4b5f3c
> [  119.412369] ESI: c2076a9c EDI: ffffffe0 EBP: ce4b5f14 ESP: ce4b5f0c
> [  120.380135] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00000202
> [  121.241093] CR0: 80050033 CR2: ffffffe0 CR3: 0e272000 CR4: 00000690
> [  122.258051] DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000
> [  123.705309] DR6: 00000000 DR7: 00000000
> [  124.875237] Call Trace:
> [  126.062462]  ipcget+0xfa/0x26c
> [  127.034312]  ksys_msgget+0x46/0x5d
> [  127.667337]  sys_msgget+0x13/0x15
> [  128.421597]  do_fast_syscall_32+0x99/0x285
> [  129.166383]  entry_SYSENTER_32+0x9f/0xf2
> [  130.204738] EIP: 0xb7fffaad
> [  130.823257] Code: 8b 5d 08 e8 19 00 00 00 89 d3 eb e5 8b 04 24 c3 8b 0c 24 
> c3 8b 1c 24 c3 8b 34 24 c3 8b 3c 24 c3 90 51 52 55 89 e5 0f 34 cd 80 <5d> 5a 
> 59 c3 90 90 90 90 8d 76 00 58 b8 77 00 00 00 cd 80 90 8d 76
> [  133.839387] EAX: ffffffda EBX: 798e2635 ECX: 00000000 EDX: 00000000
> [  134.921710] ESI: 00000000 EDI: 00000000 EBP: 00000000 ESP: bffff81c
> [  136.039511] DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 007b EFLAGS: 00000296
> [  137.326581] Modules linked in:
> [  138.350381] CR2: 00000000ffffffe0
> [  139.121664] ---[ end trace 5a43bf9a3ce51e57 ]---
> [  139.987631] EIP: memcmp+0x9/0x32
> [  140.203250] Code: 55 89 e5 57 56 8b 75 08 85 f6 74 11 bf 00 00 00 00 89 14 
> f8 89 4c f8 04 47 39 f7 75 f4 5e 5f 5d c3 55 89 e5 56 53 85 c9 74 22 <0f> b6 
> 18 0f b6 32 29 f3 75 12 01 c1 40 42 39 c1 74 0a 0f b6 18 0f
> [  140.553281] EAX: ffffffe0 EBX: 00000000 ECX: 00000004 EDX: ce4b5f3c
> [  140.734438] ESI: c2076a9c EDI: ffffffe0 EBP: ce4b5f14 ESP: ce4b5f0c
> [  140.858536] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00000202
> [  140.959192] CR0: 80050033 CR2: ffffffe0 CR3: 0e272000 CR4: 00000690
> [  141.052654] DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000
> [  141.177823] DR6: 00000000 DR7: 00000000
> 
> 
> ------------------------------------------ 
> Input and source code
> 
> This bug occurs when two syscalls, msget and msgctl, are invoked 
> concurrently. Our analysis has located two lines of code, one of which will 
> read a shared memory rhash_head __rcu object while the other writes to it. 
> 
> Writer:
> starting from include/linux/rhashtable.h:399
>     static inline void rht_assign_unlock(struct bucket_table *tbl,
>         struct rhash_lock_head **bkt,
>         struct rhash_head *obj)
>     {
>         struct rhash_head __rcu **p = (struct rhash_head __rcu **)bkt;
> 
>         if (rht_is_a_nulls(obj))
> --->       obj = NULL;
>         lock_map_release(&tbl->dep_map);
>         rcu_assign_pointer(*p, obj);
>         preempt_enable();
>         __release(bitlock);
>         local_bh_enable();
>     }
> 
> 
> Reader:
> starting at include/linux/rhashtable.h:352
>     static inline struct rhash_head rcu *rht_ptr(
>         struct rhash_lock_head const **bkt)
> {
>         return (struct rhash_head __rcu *)
>         ((unsigned long)*bkt & ~BIT(0) ?:
>         (unsigned long)RHT_NULLS_MARKER(bkt));
> }
> 
> return (struct rhash_head __rcu *)
> --->       ((unsigned long)*bkt & ~BIT(0) ?:
>         (unsigned long)RHT_NULLS_MARKER(bkt));
> 
> ------------------------------------------ 
> Thread interleaving
> 
> For presentation purposes, we convert the original code to explain the thread 
> interleaving.
> 
> Original code:
> return (struct rhash_head __rcu *)
>        ((unsigned long)*bkt & ~BIT(0) ?:
>         (unsigned long)RHT_NULLS_MARKER(bkt));
> 
> Converted code:
> if ((unsigned long)*bkt & ~BIT(0)){
> return  (struct rhash_head __rcu *)(unsigned long)*bkt & ~BIT(0);
> }else{
> return  (struct rhash_head __rcu *)(unsigned long)RHT_NULLS_MARKER(bkt);
> }
> 
> Interleaving that triggers the bug:
> 
> CPU0 (msgctl)                         CPU1(msget)
> …
> if (rht_is_a_nulls(obj))
>                                               …
>                                               if ((unsigned long)*bkt & 
> ~BIT(0)){
> 
> obj = NULL;
>                                               return  (struct rhash_head 
> __rcu *)(unsigned long)*bkt & ~BIT(0);
>                                               …
>                                               rhashtable_compare(…)
>                                               …
>                                               return memcmp(ptr + 
> ht->p.key_offset, arg->key, ht->p.key_len);
>                                               [from rhashtable_compare at 
> include/linux/rhashtable.h: 584]
>                                               [page fault]
> 
> ------------------------------------------ 
> Kernel configuration and compilation
> 
> We were only able to reproduce this bug when the kernel is compiled with “-O1 
> -fno-if-conversion -fno-if-conversion2 -fno-delayed-branch -fno-tree-fre 
> -fno-tree-dominator-opts -fno-cprop-registers” instead of the default “-O2”. 
> The compiler emits different instructions for the reader depending on the 
> optimization level. In the buggy version, the writer has two read accesses to 
> the bkt address but in the non-buggy version it only has one memory read 
> operation.
> 
> It is unclear to us how this affects other architectures.
> 
> 
> static inline struct rhash_head __rcu *__rht_ptr(
>       struct rhash_lock_head *const *bkt)
>   {
>  return (struct rhash_head __rcu *)
>       c11a36d4:   8b 45 e4                mov    -0x1c(%ebp),%eax
>       c11a36d7:   83 c8 01                or     $0x1,%eax
>       c11a36da:   89 45 e0                mov    %eax,-0x20(%ebp)
>       c11a36dd:   89 75 d8                mov    %esi,-0x28(%ebp)
>       c11a36e0:   8b 45 e4                mov    -0x1c(%ebp),%eax
> ---> c11a36e3:   f7 00 fe ff ff ff       testl  $0xfffffffe,(%eax)
>       c11a36e9:   74 69                   je     c11a3754 
> <bpf_offload_find_netdev+0x107>
> ---> c11a36eb:   8b 00                   mov    (%eax),%eax
>       c11a36ed:   89 45 dc                mov    %eax,-0x24(%ebp)
>       c11a36f0:   83 e0 fe                and    $0xfffffffe,%eax
> 
> 
> static inline struct rhash_head __rcu *__rht_ptr(
>      struct rhash_lock_head *const *bkt)
>  {
>      return (struct rhash_head __rcu *)
>        c119b644:   8b 45 e4                mov    -0x1c(%ebp),%eax
>        c119b647:   83 c8 01                or     $0x1,%eax
>        c119b64a:   89 45 dc                mov    %eax,-0x24(%ebp)
>        c119b64d:   89 75 d8                mov    %esi,-0x28(%ebp)
>          ((unsigned long)*bkt & ~BIT(0) ?:
>        c119b650:   8b 45 e4                mov    -0x1c(%ebp),%eax
>   ---> c119b653:   8b 00                   mov    (%eax),%eax
>         c119b655:   89 45 e0                mov    %eax,-0x20(%ebp)
>       return (struct rhash_head __rcu *)
>        c119b658:   83 e0 fe                and    $0xfffffffe,%eax
>        c119b65b:   75 03                   jne    c119b660 
> <bpf_offload_find_netdev+0xa3>
>        c119b65d:   8b 45 dc                mov    -0x24(%ebp),%eax
> 
> 
> Thanks,
> Sishuai
> 

Thanks for the report/analysis. 

READ_ONCE() should help here, can you test/submit an official patch ?

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 
d3432ee65de7684dbfb3cd6f04e207335db7f3bf..f9f88d67c8f7293b3aa9fdece5e70e51fa4859b5
 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -353,7 +353,7 @@ static inline struct rhash_head __rcu *__rht_ptr(
        struct rhash_lock_head *const *bkt)
 {
        return (struct rhash_head __rcu *)
-               ((unsigned long)*bkt & ~BIT(0) ?:
+               ((unsigned long)READ_ONCE(*bkt) & ~BIT(0) ?:
                 (unsigned long)RHT_NULLS_MARKER(bkt));
 }
 

Reply via email to