ldrumm added a comment.

In D151701#4380666 <https://reviews.llvm.org/D151701#4380666>, @yaxunl wrote:

> HIP did not add fetch/sub since fetch/sub x can be trivially implemented 
> through fetch/add -x and performance-wise equivalent.

There is existing isel for `global_atomic_sub` for RDNA targets which means we 
can avoid a subtraction. I also have a patch for the hip runtime ready to go 
that uses the this new builtin. It should shave off an extra instruction.

  __global__ void test_natural_sub(int *data, int rhs) {
    __hip_atomic_fetch_sub(data, rhs, __ATOMIC_RELAXED, 
__HIP_MEMORY_SCOPE_WAVEFRONT);
  }
  
  __global__ void test_kernel_neg_add_sub(int *data, int rhs) {
    __hip_atomic_fetch_add(data, -rhs, __ATOMIC_RELAXED, 
__HIP_MEMORY_SCOPE_WAVEFRONT);
  }

->

  0000000000000000 <_Z16test_natural_subPii>:                                   
                                                                                
          
          s_clause 0x1                                               // 
000000000000: BFA10001
          s_load_dword s2, s[4:5], 0x8                               // 
000000000004: F4000082 FA000008
          s_load_dwordx2 s[0:1], s[4:5], null                        // 
00000000000C: F4040002 FA000000
          v_mov_b32_e32 v0, 0                                        // 
000000000014: 7E000280
          s_waitcnt lgkmcnt(0)                                       // 
000000000018: BF8CC07F
          v_mov_b32_e32 v1, s2                                       // 
00000000001C: 7E020202
          global_atomic_sub v0, v1, s[0:1]                           // 
000000000020: DCCC8000 00000100
          s_endpgm                                                   // 
000000000028: BF810000         
  
  
  0000000000000100 <_Z23test_kernel_neg_add_subPii>:
          s_clause 0x1                                               // 
000000000100: BFA10001
          s_load_dword s2, s[4:5], 0x8                               // 
000000000104: F4000082 FA000008
          s_load_dwordx2 s[0:1], s[4:5], null                        // 
00000000010C: F4040002 FA000000
          v_mov_b32_e32 v0, 0                                        // 
000000000114: 7E000280
          s_waitcnt lgkmcnt(0)                                       // 
000000000118: BF8CC07F
          s_sub_i32 s2, 0, s2                                        // 
00000000011C: 81820280
          v_mov_b32_e32 v1, s2                                       // 
000000000120: 7E020202
          global_atomic_add v0, v1, s[0:1]                           // 
000000000124: DCC88000 00000100
          s_endpgm                                                   // 
00000000012C: BF810000

The backend has isel for this instruction, but the frontend will never generate 
it. I think this improves things


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D151701/new/

https://reviews.llvm.org/D151701

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to