[clang] [llvm] [NVPTX] Support __usAtomicCAS builtin (PR #99646)

Artem Belevich via cfe-commits Tue, 20 Aug 2024 10:44:35 -0700

================
@@ -0,0 +1,216 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --default-march nvptx64 --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_32 | FileCheck %s 
--check-prefixes=SM30,CHECK
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_32 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s 
--check-prefixes=SM70,CHECK
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | 
%ptxas-verify -arch=sm_70 %}
+
+; TODO: these are system scope, but are compiled to gpu scope..
+; TODO: these are seq_cst, but are compiled to relaxed..
+
+; CHECK-LABEL: relaxed_sys_i8
+define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
+; SM30-LABEL: relaxed_sys_i8(
+; SM30:       {
+; SM30-NEXT:    .reg .pred %p<3>;
+; SM30-NEXT:    .reg .b16 %rs<2>;
+; SM30-NEXT:    .reg .b32 %r<21>;
+; SM30-NEXT:    .reg .b64 %rd<3>;
+; SM30-EMPTY:
+; SM30-NEXT:  // %bb.0:
+; SM30-NEXT:    ld.param.u8 %rs1, [relaxed_sys_i8_param_2];
+; SM30-NEXT:    ld.param.u64 %rd2, [relaxed_sys_i8_param_0];
+; SM30-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM30-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM30-NEXT:    and.b32 %r10, %r9, 3;
+; SM30-NEXT:    shl.b32 %r1, %r10, 3;
+; SM30-NEXT:    mov.b32 %r11, 255;
+; SM30-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM30-NEXT:    not.b32 %r2, %r12;
+; SM30-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM30-NEXT:    and.b32 %r14, %r13, 255;
+; SM30-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM30-NEXT:    ld.param.u8 %r15, [relaxed_sys_i8_param_1];
+; SM30-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM30-NEXT:    ld.u32 %r16, [%rd1];
+; SM30-NEXT:    and.b32 %r20, %r16, %r2;
+; SM30-NEXT:  $L__BB0_1: // %partword.cmpxchg.loop
+; SM30-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM30-NEXT:    or.b32 %r17, %r20, %r3;
+; SM30-NEXT:    or.b32 %r18, %r20, %r4;
+; SM30-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM30-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM30-NEXT:    @%p1 bra $L__BB0_3;
+; SM30-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM30-NEXT:    // in Loop: Header=BB0_1 Depth=1
+; SM30-NEXT:    and.b32 %r8, %r7, %r2;
+; SM30-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM30-NEXT:    mov.u32 %r20, %r8;
+; SM30-NEXT:    @%p2 bra $L__BB0_1;
+; SM30-NEXT:  $L__BB0_3: // %partword.cmpxchg.end
+; SM30-NEXT:    st.param.b32 [func_retval0+0], %r13;
+; SM30-NEXT:    ret;
+;
+; SM70-LABEL: relaxed_sys_i8(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<17>;
+; SM70-NEXT:    .reg .b32 %r<3>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs9, [relaxed_sys_i8_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [relaxed_sys_i8_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -2;
+; SM70-NEXT:    ld.param.u8 %rs10, [relaxed_sys_i8_param_1];
+; SM70-NEXT:    and.b64 %rd3, %rd2, 1;
+; SM70-NEXT:    shl.b64 %rd4, %rd3, 3;
+; SM70-NEXT:    cvt.u32.u64 %r1, %rd4;
+; SM70-NEXT:    mov.u16 %rs11, 255;
+; SM70-NEXT:    shl.b16 %rs12, %rs11, %r1;
+; SM70-NEXT:    not.b16 %rs2, %rs12;
+; SM70-NEXT:    shl.b16 %rs3, %rs9, %r1;
+; SM70-NEXT:    shl.b16 %rs4, %rs10, %r1;
+; SM70-NEXT:    ld.u16 %rs13, [%rd1];
+; SM70-NEXT:    and.b16 %rs16, %rs13, %rs2;
+; SM70-NEXT:  $L__BB0_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b16 %rs14, %rs16, %rs3;
+; SM70-NEXT:    or.b16 %rs15, %rs16, %rs4;
+; SM70-NEXT:    atom.cas.b16 %rs7, [%rd1], %rs15, %rs14;
+; SM70-NEXT:    setp.eq.s16 %p1, %rs7, %rs15;
+; SM70-NEXT:    @%p1 bra $L__BB0_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB0_1 Depth=1
+; SM70-NEXT:    and.b16 %rs8, %rs7, %rs2;
+; SM70-NEXT:    setp.ne.s16 %p2, %rs16, %rs8;
+; SM70-NEXT:    mov.u16 %rs16, %rs8;
+; SM70-NEXT:    @%p2 bra $L__BB0_1;
+; SM70-NEXT:  $L__BB0_3: // %partword.cmpxchg.end
+; SM70-NEXT:    cvt.u32.u16 %r2, %rs9;
+; SM70-NEXT:    st.param.b32 [func_retval0+0], %r2;
+; SM70-NEXT:    ret;
+  %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst
----------------
Artem-B wrote:


> this is generating tests that I think are too precise and contain too much 
> unrelated CHECKs,

They look reasonably readable to me. The cases where we have to fall-back to a 
more complicated algorithm are expected to be more verbose. 

Granted, not all the parts are particularly interesting, and would be an 
unnecessary burden if the checks were done manually, but with the autogenerated 
checks, it's not an issue. On a positive side, being able to see that some IR 
may produce less efficient code than others is sometimes quite useful as it 
highlights the current inefficiencies and possible future optimization 
opportunities.


https://github.com/llvm/llvm-project/pull/99646
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [NVPTX] Support __usAtomicCAS builtin (PR #99646)

Reply via email to