https://github.com/vpykhtin updated https://github.com/llvm/llvm-project/pull/71556
>From f4ff530d27334a8ba8c986d321efd4ab751b6476 Mon Sep 17 00:00:00 2001 From: Valery Pykhtin <valery.pykh...@gmail.com> Date: Mon, 20 Nov 2023 15:22:16 +0100 Subject: [PATCH 1/2] run opt instcombine pass in the test --- .../AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll | 44 ++++++++++--------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll index 04a993eac82cd5e..bd319c6fc39ce62 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -global-isel=0 -mcpu=gfx1010 < %s | FileCheck %s --check-prefixes=CHECK,DAGISEL -; RUN: llc -march=amdgcn -global-isel=0 -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck %s --check-prefixes=CHECK,DAGISEL -; RUN: llc -march=amdgcn -global-isel -mcpu=gfx1010 < %s | FileCheck %s --check-prefixes=CHECK,GISEL -; RUN: llc -march=amdgcn -global-isel -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck %s --check-prefixes=CHECK,GISEL +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -passes=instcombine -o - < %s | llc -march=amdgcn -global-isel=0 -mcpu=gfx1010 - | FileCheck %s --check-prefixes=CHECK,DAGISEL +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -passes=instcombine -o - < %s | llc -march=amdgcn -global-isel=0 -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 - | FileCheck %s --check-prefixes=CHECK,DAGISEL +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -passes=instcombine -o - < %s | llc -march=amdgcn -global-isel -mcpu=gfx1010 - | FileCheck %s --check-prefixes=CHECK,GISEL +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -passes=instcombine -o - < %s | llc -march=amdgcn -global-isel -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 - | FileCheck %s --check-prefixes=CHECK,GISEL declare i64 @llvm.amdgcn.ballot.i64(i1) declare i64 @llvm.ctpop.i64(i64) @@ -40,12 +40,20 @@ define amdgpu_cs i64 @constant_true() { ; Test ballot of a non-comparison operation define amdgpu_cs i64 @non_compare(i32 %x) { -; CHECK-LABEL: non_compare: -; CHECK: ; %bb.0: -; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 -; CHECK-NEXT: s_mov_b32 s1, 0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 -; CHECK-NEXT: ; return to shader part epilog +; DAGISEL-LABEL: non_compare: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: v_and_b32_e32 v0, 1, v0 +; DAGISEL-NEXT: s_mov_b32 s1, 0 +; DAGISEL-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 +; DAGISEL-NEXT: ; return to shader part epilog +; +; GISEL-LABEL: non_compare: +; GISEL: ; %bb.0: +; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 +; GISEL-NEXT: s_mov_b32 s1, 0 +; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 +; GISEL-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 +; GISEL-NEXT: ; return to shader part epilog %trunc = trunc i32 %x to i1 %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %trunc) ret i64 %ballot @@ -65,17 +73,11 @@ define amdgpu_cs i64 @compare_ints(i32 %x, i32 %y) { } define amdgpu_cs i64 @compare_int_with_constant(i32 %x) { -; DAGISEL-LABEL: compare_int_with_constant: -; DAGISEL: ; %bb.0: -; DAGISEL-NEXT: v_cmp_lt_i32_e64 s0, 0x62, v0 -; DAGISEL-NEXT: s_mov_b32 s1, 0 -; DAGISEL-NEXT: ; return to shader part epilog -; -; GISEL-LABEL: compare_int_with_constant: -; GISEL: ; %bb.0: -; GISEL-NEXT: v_cmp_le_i32_e64 s0, 0x63, v0 -; GISEL-NEXT: s_mov_b32 s1, 0 -; GISEL-NEXT: ; return to shader part epilog +; CHECK-LABEL: compare_int_with_constant: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cmp_lt_i32_e64 s0, 0x62, v0 +; CHECK-NEXT: s_mov_b32 s1, 0 +; CHECK-NEXT: ; return to shader part epilog %cmp = icmp sge i32 %x, 99 %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp) ret i64 %ballot >From e97e128f39639ab278a83f6aec99b0d7a8753bd7 Mon Sep 17 00:00:00 2001 From: Valery Pykhtin <valery.pykh...@gmail.com> Date: Mon, 20 Nov 2023 15:29:31 +0100 Subject: [PATCH 2/2] add instcombine rule --- .../CodeGenOpenCL/builtins-amdgcn-wave32.cl | 8 +-- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 5 +- .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 12 ++++ .../AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll | 68 +++++++++---------- .../InstCombine/AMDGPU/amdgcn-intrinsics.ll | 6 +- 5 files changed, 54 insertions(+), 45 deletions(-) diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl index 43553131f63c549..a0e27ce22fe7d9c 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl @@ -24,13 +24,11 @@ void test_ballot_wave32_target_attr(global uint* out, int a, int b) } // CHECK-LABEL: @test_read_exec( -// CHECK: call i64 @llvm.amdgcn.ballot.i64(i1 true) +// CHECK: call i32 @llvm.amdgcn.ballot.i32(i1 true) void test_read_exec(global uint* out) { *out = __builtin_amdgcn_read_exec(); } -// CHECK: declare i64 @llvm.amdgcn.ballot.i64(i1) #[[$NOUNWIND_READONLY:[0-9]+]] - // CHECK-LABEL: @test_read_exec_lo( // CHECK: call i32 @llvm.amdgcn.ballot.i32(i1 true) void test_read_exec_lo(global uint* out) { @@ -38,9 +36,7 @@ void test_read_exec_lo(global uint* out) { } // CHECK-LABEL: @test_read_exec_hi( -// CHECK: call i64 @llvm.amdgcn.ballot.i64(i1 true) -// CHECK: lshr i64 [[A:%.*]], 32 -// CHECK: trunc i64 [[B:%.*]] to i32 +// CHECK: store i32 0, ptr addrspace(1) %out void test_read_exec_hi(global uint* out) { *out = __builtin_amdgcn_read_exec_hi(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index ead3f51d6acdc5a..a0ff1bd579b14b2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -2314,9 +2314,8 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) { SDValue VCMP = Cond->getOperand(0); auto CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get(); auto *CRHS = dyn_cast<ConstantSDNode>(Cond->getOperand(1)); - if ((CC == ISD::SETEQ || CC == ISD::SETNE) && CRHS && CRHS->isZero() && - // TODO: make condition below an assert after fixing ballot bitwidth. - VCMP.getValueType().getSizeInBits() == ST->getWavefrontSize()) { + if ((CC == ISD::SETEQ || CC == ISD::SETNE) && CRHS && CRHS->isZero()) { + assert(VCMP.getValueType().getSizeInBits() == ST->getWavefrontSize()); // %VCMP = i(WaveSize) AMDGPUISD::SETCC ... // %C = i1 ISD::SETCC %VCMP, 0, setne/seteq // BRCOND i1 %C, %BB diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 5296415ab4c36da..510f0a59719ecdb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -961,6 +961,18 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType())); } } + if (ST->isWave32() && II.getType()->getIntegerBitWidth() == 64) { + // %b64 = call i64 ballot.i64(...) + // => + // %b32 = call i32 ballot.i32(...) + // %b64 = zext i32 %b32 to i64 + Function *NewF = Intrinsic::getDeclaration( + II.getModule(), Intrinsic::amdgcn_ballot, {IC.Builder.getInt32Ty()}); + CallInst *NewCall = IC.Builder.CreateCall(NewF, {II.getArgOperand(0)}); + Value *CastedCall = IC.Builder.CreateZExtOrBitCast(NewCall, II.getType()); + CastedCall->takeName(&II); + return IC.replaceInstUsesWith(II, CastedCall); + } break; } case Intrinsic::amdgcn_wqm_vote: { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll index bd319c6fc39ce62..8647718d7a77b98 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll @@ -22,17 +22,11 @@ define amdgpu_cs i64 @constant_false() { ; Test ballot(1) define amdgpu_cs i64 @constant_true() { -; DAGISEL-LABEL: constant_true: -; DAGISEL: ; %bb.0: -; DAGISEL-NEXT: s_mov_b32 s0, exec_lo -; DAGISEL-NEXT: s_mov_b32 s1, exec_hi -; DAGISEL-NEXT: ; return to shader part epilog -; -; GISEL-LABEL: constant_true: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_mov_b32 s0, exec_lo -; GISEL-NEXT: s_mov_b32 s1, 0 -; GISEL-NEXT: ; return to shader part epilog +; CHECK-LABEL: constant_true: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_mov_b32 s0, exec_lo +; CHECK-NEXT: s_mov_b32 s1, 0 +; CHECK-NEXT: ; return to shader part epilog %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 1) ret i64 %ballot } @@ -97,9 +91,9 @@ define amdgpu_cs i64 @compare_floats(float %x, float %y) { define amdgpu_cs i64 @ctpop_of_ballot(float %x, float %y) { ; CHECK-LABEL: ctpop_of_ballot: ; CHECK: ; %bb.0: -; CHECK-NEXT: v_cmp_gt_f32_e64 s0, v0, v1 +; CHECK-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v1 ; CHECK-NEXT: s_mov_b32 s1, 0 -; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; CHECK-NEXT: s_bcnt1_i32_b32 s0, vcc_lo ; CHECK-NEXT: ; return to shader part epilog %cmp = fcmp ogt float %x, %y %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp) @@ -108,19 +102,30 @@ define amdgpu_cs i64 @ctpop_of_ballot(float %x, float %y) { } define amdgpu_cs i32 @branch_divergent_ballot64_ne_zero_compare(i32 %v) { -; CHECK-LABEL: branch_divergent_ballot64_ne_zero_compare: -; CHECK: ; %bb.0: -; CHECK-NEXT: v_cmp_gt_u32_e64 s0, 12, v0 -; CHECK-NEXT: s_mov_b32 s1, 0 -; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0 -; CHECK-NEXT: s_cbranch_scc1 .LBB7_2 -; CHECK-NEXT: ; %bb.1: ; %true -; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB7_3 -; CHECK-NEXT: .LBB7_2: ; %false -; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB7_3 -; CHECK-NEXT: .LBB7_3: +; DAGISEL-LABEL: branch_divergent_ballot64_ne_zero_compare: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0 +; DAGISEL-NEXT: s_cbranch_vccz .LBB7_2 +; DAGISEL-NEXT: ; %bb.1: ; %true +; DAGISEL-NEXT: s_mov_b32 s0, 42 +; DAGISEL-NEXT: s_branch .LBB7_3 +; DAGISEL-NEXT: .LBB7_2: ; %false +; DAGISEL-NEXT: s_mov_b32 s0, 33 +; DAGISEL-NEXT: s_branch .LBB7_3 +; DAGISEL-NEXT: .LBB7_3: +; +; GISEL-LABEL: branch_divergent_ballot64_ne_zero_compare: +; GISEL: ; %bb.0: +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0 +; GISEL-NEXT: s_cmp_eq_u32 vcc_lo, 0 +; GISEL-NEXT: s_cbranch_scc1 .LBB7_2 +; GISEL-NEXT: ; %bb.1: ; %true +; GISEL-NEXT: s_mov_b32 s0, 42 +; GISEL-NEXT: s_branch .LBB7_3 +; GISEL-NEXT: .LBB7_2: ; %false +; GISEL-NEXT: s_mov_b32 s0, 33 +; GISEL-NEXT: s_branch .LBB7_3 +; GISEL-NEXT: .LBB7_3: %c = icmp ult i32 %v, 12 %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) %ballot_ne_zero = icmp ne i64 %ballot, 0 @@ -136,12 +141,8 @@ define amdgpu_cs i32 @branch_divergent_ballot64_ne_zero_and(i32 %v1, i32 %v2) { ; DAGISEL: ; %bb.0: ; DAGISEL-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0 ; DAGISEL-NEXT: v_cmp_lt_u32_e64 s0, 34, v1 -; DAGISEL-NEXT: s_mov_b32 s1, 0 -; DAGISEL-NEXT: s_and_b32 s0, vcc_lo, s0 -; DAGISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; DAGISEL-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 -; DAGISEL-NEXT: s_cmp_eq_u64 s[0:1], 0 -; DAGISEL-NEXT: s_cbranch_scc1 .LBB8_2 +; DAGISEL-NEXT: s_and_b32 vcc_lo, vcc_lo, s0 +; DAGISEL-NEXT: s_cbranch_vccz .LBB8_2 ; DAGISEL-NEXT: ; %bb.1: ; %true ; DAGISEL-NEXT: s_mov_b32 s0, 42 ; DAGISEL-NEXT: s_branch .LBB8_3 @@ -154,9 +155,8 @@ define amdgpu_cs i32 @branch_divergent_ballot64_ne_zero_and(i32 %v1, i32 %v2) { ; GISEL: ; %bb.0: ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0 ; GISEL-NEXT: v_cmp_lt_u32_e64 s0, 34, v1 -; GISEL-NEXT: s_mov_b32 s1, 0 ; GISEL-NEXT: s_and_b32 s0, vcc_lo, s0 -; GISEL-NEXT: s_cmp_eq_u64 s[0:1], 0 +; GISEL-NEXT: s_cmp_eq_u32 s0, 0 ; GISEL-NEXT: s_cbranch_scc1 .LBB8_2 ; GISEL-NEXT: ; %bb.1: ; %true ; GISEL-NEXT: s_mov_b32 s0, 42 diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll index 804283cc20cd6a3..94c32e3cbe99f72 100644 --- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll @@ -2599,7 +2599,8 @@ declare i32 @llvm.amdgcn.ballot.i32(i1) nounwind readnone convergent define i64 @ballot_nocombine_64(i1 %i) { ; CHECK-LABEL: @ballot_nocombine_64( -; CHECK-NEXT: [[B:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[I:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[I:%.*]]) +; CHECK-NEXT: [[B:%.*]] = zext i32 [[TMP1]] to i64 ; CHECK-NEXT: ret i64 [[B]] ; %b = call i64 @llvm.amdgcn.ballot.i64(i1 %i) @@ -2616,7 +2617,8 @@ define i64 @ballot_zero_64() { define i64 @ballot_one_64() { ; CHECK-LABEL: @ballot_one_64( -; CHECK-NEXT: [[B:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 true) +; CHECK-NEXT: [[B:%.*]] = zext i32 [[TMP1]] to i64 ; CHECK-NEXT: ret i64 [[B]] ; %b = call i64 @llvm.amdgcn.ballot.i64(i1 1) _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits