https://github.com/vpykhtin updated https://github.com/llvm/llvm-project/pull/71556
>From 526c635b3f70fd779f0919c5c40acd017a0f800e Mon Sep 17 00:00:00 2001 From: Valery Pykhtin <valery.pykh...@gmail.com> Date: Mon, 20 Nov 2023 15:22:16 +0100 Subject: [PATCH] add instcombine rule --- .../CodeGenOpenCL/builtins-amdgcn-wave32.cl | 8 +- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 2 +- .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 12 ++ .../AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll | 110 ++++++++++++++++++ .../InstCombine/AMDGPU/amdgcn-intrinsics.ll | 6 +- 5 files changed, 129 insertions(+), 9 deletions(-) diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl index 43553131f63c549..a0e27ce22fe7d9c 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl @@ -24,13 +24,11 @@ void test_ballot_wave32_target_attr(global uint* out, int a, int b) } // CHECK-LABEL: @test_read_exec( -// CHECK: call i64 @llvm.amdgcn.ballot.i64(i1 true) +// CHECK: call i32 @llvm.amdgcn.ballot.i32(i1 true) void test_read_exec(global uint* out) { *out = __builtin_amdgcn_read_exec(); } -// CHECK: declare i64 @llvm.amdgcn.ballot.i64(i1) #[[$NOUNWIND_READONLY:[0-9]+]] - // CHECK-LABEL: @test_read_exec_lo( // CHECK: call i32 @llvm.amdgcn.ballot.i32(i1 true) void test_read_exec_lo(global uint* out) { @@ -38,9 +36,7 @@ void test_read_exec_lo(global uint* out) { } // CHECK-LABEL: @test_read_exec_hi( -// CHECK: call i64 @llvm.amdgcn.ballot.i64(i1 true) -// CHECK: lshr i64 [[A:%.*]], 32 -// CHECK: trunc i64 [[B:%.*]] to i32 +// CHECK: store i32 0, ptr addrspace(1) %out void test_read_exec_hi(global uint* out) { *out = __builtin_amdgcn_read_exec_hi(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index ead3f51d6acdc5a..e2ea1a2752af166 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -2315,7 +2315,7 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) { auto CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get(); auto *CRHS = dyn_cast<ConstantSDNode>(Cond->getOperand(1)); if ((CC == ISD::SETEQ || CC == ISD::SETNE) && CRHS && CRHS->isZero() && - // TODO: make condition below an assert after fixing ballot bitwidth. + // We may encounter ballot.i64 in wave32 mode on -O0. VCMP.getValueType().getSizeInBits() == ST->getWavefrontSize()) { // %VCMP = i(WaveSize) AMDGPUISD::SETCC ... // %C = i1 ISD::SETCC %VCMP, 0, setne/seteq diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 5296415ab4c36da..510f0a59719ecdb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -961,6 +961,18 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType())); } } + if (ST->isWave32() && II.getType()->getIntegerBitWidth() == 64) { + // %b64 = call i64 ballot.i64(...) + // => + // %b32 = call i32 ballot.i32(...) + // %b64 = zext i32 %b32 to i64 + Function *NewF = Intrinsic::getDeclaration( + II.getModule(), Intrinsic::amdgcn_ballot, {IC.Builder.getInt32Ty()}); + CallInst *NewCall = IC.Builder.CreateCall(NewF, {II.getArgOperand(0)}); + Value *CastedCall = IC.Builder.CreateZExtOrBitCast(NewCall, II.getType()); + CastedCall->takeName(&II); + return IC.replaceInstUsesWith(II, CastedCall); + } break; } case Intrinsic::amdgcn_wqm_vote: { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll index 04a993eac82cd5e..e5344b28edf2b59 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll @@ -3,6 +3,10 @@ ; RUN: llc -march=amdgcn -global-isel=0 -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck %s --check-prefixes=CHECK,DAGISEL ; RUN: llc -march=amdgcn -global-isel -mcpu=gfx1010 < %s | FileCheck %s --check-prefixes=CHECK,GISEL ; RUN: llc -march=amdgcn -global-isel -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck %s --check-prefixes=CHECK,GISEL +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -passes=instcombine -o - < %s | llc -march=amdgcn -global-isel=0 -mcpu=gfx1010 - | FileCheck %s --check-prefixes=CHECK-OPT,DAGISEL-OPT +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -passes=instcombine -o - < %s | llc -march=amdgcn -global-isel=0 -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 - | FileCheck %s --check-prefixes=CHECK-OPT,DAGISEL-OPT +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -passes=instcombine -o - < %s | llc -march=amdgcn -global-isel -mcpu=gfx1010 - | FileCheck %s --check-prefixes=CHECK-OPT,GISEL-OPT +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -passes=instcombine -o - < %s | llc -march=amdgcn -global-isel -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 - | FileCheck %s --check-prefixes=CHECK-OPT,GISEL-OPT declare i64 @llvm.amdgcn.ballot.i64(i1) declare i64 @llvm.ctpop.i64(i64) @@ -15,6 +19,12 @@ define amdgpu_cs i64 @constant_false() { ; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: s_mov_b32 s1, 0 ; CHECK-NEXT: ; return to shader part epilog +; +; CHECK-OPT-LABEL: constant_false: +; CHECK-OPT: ; %bb.0: +; CHECK-OPT-NEXT: s_mov_b32 s0, 0 +; CHECK-OPT-NEXT: s_mov_b32 s1, 0 +; CHECK-OPT-NEXT: ; return to shader part epilog %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 0) ret i64 %ballot } @@ -33,6 +43,12 @@ define amdgpu_cs i64 @constant_true() { ; GISEL-NEXT: s_mov_b32 s0, exec_lo ; GISEL-NEXT: s_mov_b32 s1, 0 ; GISEL-NEXT: ; return to shader part epilog +; +; CHECK-OPT-LABEL: constant_true: +; CHECK-OPT: ; %bb.0: +; CHECK-OPT-NEXT: s_mov_b32 s0, exec_lo +; CHECK-OPT-NEXT: s_mov_b32 s1, 0 +; CHECK-OPT-NEXT: ; return to shader part epilog %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 1) ret i64 %ballot } @@ -46,6 +62,21 @@ define amdgpu_cs i64 @non_compare(i32 %x) { ; CHECK-NEXT: s_mov_b32 s1, 0 ; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 ; CHECK-NEXT: ; return to shader part epilog +; +; DAGISEL-OPT-LABEL: non_compare: +; DAGISEL-OPT: ; %bb.0: +; DAGISEL-OPT-NEXT: v_and_b32_e32 v0, 1, v0 +; DAGISEL-OPT-NEXT: s_mov_b32 s1, 0 +; DAGISEL-OPT-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 +; DAGISEL-OPT-NEXT: ; return to shader part epilog +; +; GISEL-OPT-LABEL: non_compare: +; GISEL-OPT: ; %bb.0: +; GISEL-OPT-NEXT: v_and_b32_e32 v0, 1, v0 +; GISEL-OPT-NEXT: s_mov_b32 s1, 0 +; GISEL-OPT-NEXT: v_and_b32_e32 v0, 1, v0 +; GISEL-OPT-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 +; GISEL-OPT-NEXT: ; return to shader part epilog %trunc = trunc i32 %x to i1 %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %trunc) ret i64 %ballot @@ -59,6 +90,12 @@ define amdgpu_cs i64 @compare_ints(i32 %x, i32 %y) { ; CHECK-NEXT: v_cmp_eq_u32_e64 s0, v0, v1 ; CHECK-NEXT: s_mov_b32 s1, 0 ; CHECK-NEXT: ; return to shader part epilog +; +; CHECK-OPT-LABEL: compare_ints: +; CHECK-OPT: ; %bb.0: +; CHECK-OPT-NEXT: v_cmp_eq_u32_e64 s0, v0, v1 +; CHECK-OPT-NEXT: s_mov_b32 s1, 0 +; CHECK-OPT-NEXT: ; return to shader part epilog %cmp = icmp eq i32 %x, %y %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp) ret i64 %ballot @@ -76,6 +113,12 @@ define amdgpu_cs i64 @compare_int_with_constant(i32 %x) { ; GISEL-NEXT: v_cmp_le_i32_e64 s0, 0x63, v0 ; GISEL-NEXT: s_mov_b32 s1, 0 ; GISEL-NEXT: ; return to shader part epilog +; +; CHECK-OPT-LABEL: compare_int_with_constant: +; CHECK-OPT: ; %bb.0: +; CHECK-OPT-NEXT: v_cmp_lt_i32_e64 s0, 0x62, v0 +; CHECK-OPT-NEXT: s_mov_b32 s1, 0 +; CHECK-OPT-NEXT: ; return to shader part epilog %cmp = icmp sge i32 %x, 99 %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp) ret i64 %ballot @@ -87,6 +130,12 @@ define amdgpu_cs i64 @compare_floats(float %x, float %y) { ; CHECK-NEXT: v_cmp_gt_f32_e64 s0, v0, v1 ; CHECK-NEXT: s_mov_b32 s1, 0 ; CHECK-NEXT: ; return to shader part epilog +; +; CHECK-OPT-LABEL: compare_floats: +; CHECK-OPT: ; %bb.0: +; CHECK-OPT-NEXT: v_cmp_gt_f32_e64 s0, v0, v1 +; CHECK-OPT-NEXT: s_mov_b32 s1, 0 +; CHECK-OPT-NEXT: ; return to shader part epilog %cmp = fcmp ogt float %x, %y %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp) ret i64 %ballot @@ -99,6 +148,13 @@ define amdgpu_cs i64 @ctpop_of_ballot(float %x, float %y) { ; CHECK-NEXT: s_mov_b32 s1, 0 ; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; CHECK-NEXT: ; return to shader part epilog +; +; CHECK-OPT-LABEL: ctpop_of_ballot: +; CHECK-OPT: ; %bb.0: +; CHECK-OPT-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v1 +; CHECK-OPT-NEXT: s_mov_b32 s1, 0 +; CHECK-OPT-NEXT: s_bcnt1_i32_b32 s0, vcc_lo +; CHECK-OPT-NEXT: ; return to shader part epilog %cmp = fcmp ogt float %x, %y %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp) %bcnt = call i64 @llvm.ctpop.i64(i64 %ballot) @@ -119,6 +175,31 @@ define amdgpu_cs i32 @branch_divergent_ballot64_ne_zero_compare(i32 %v) { ; CHECK-NEXT: s_mov_b32 s0, 33 ; CHECK-NEXT: s_branch .LBB7_3 ; CHECK-NEXT: .LBB7_3: +; +; DAGISEL-OPT-LABEL: branch_divergent_ballot64_ne_zero_compare: +; DAGISEL-OPT: ; %bb.0: +; DAGISEL-OPT-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0 +; DAGISEL-OPT-NEXT: s_cbranch_vccz .LBB7_2 +; DAGISEL-OPT-NEXT: ; %bb.1: ; %true +; DAGISEL-OPT-NEXT: s_mov_b32 s0, 42 +; DAGISEL-OPT-NEXT: s_branch .LBB7_3 +; DAGISEL-OPT-NEXT: .LBB7_2: ; %false +; DAGISEL-OPT-NEXT: s_mov_b32 s0, 33 +; DAGISEL-OPT-NEXT: s_branch .LBB7_3 +; DAGISEL-OPT-NEXT: .LBB7_3: +; +; GISEL-OPT-LABEL: branch_divergent_ballot64_ne_zero_compare: +; GISEL-OPT: ; %bb.0: +; GISEL-OPT-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0 +; GISEL-OPT-NEXT: s_cmp_eq_u32 vcc_lo, 0 +; GISEL-OPT-NEXT: s_cbranch_scc1 .LBB7_2 +; GISEL-OPT-NEXT: ; %bb.1: ; %true +; GISEL-OPT-NEXT: s_mov_b32 s0, 42 +; GISEL-OPT-NEXT: s_branch .LBB7_3 +; GISEL-OPT-NEXT: .LBB7_2: ; %false +; GISEL-OPT-NEXT: s_mov_b32 s0, 33 +; GISEL-OPT-NEXT: s_branch .LBB7_3 +; GISEL-OPT-NEXT: .LBB7_3: %c = icmp ult i32 %v, 12 %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) %ballot_ne_zero = icmp ne i64 %ballot, 0 @@ -163,6 +244,35 @@ define amdgpu_cs i32 @branch_divergent_ballot64_ne_zero_and(i32 %v1, i32 %v2) { ; GISEL-NEXT: s_mov_b32 s0, 33 ; GISEL-NEXT: s_branch .LBB8_3 ; GISEL-NEXT: .LBB8_3: +; +; DAGISEL-OPT-LABEL: branch_divergent_ballot64_ne_zero_and: +; DAGISEL-OPT: ; %bb.0: +; DAGISEL-OPT-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0 +; DAGISEL-OPT-NEXT: v_cmp_lt_u32_e64 s0, 34, v1 +; DAGISEL-OPT-NEXT: s_and_b32 vcc_lo, vcc_lo, s0 +; DAGISEL-OPT-NEXT: s_cbranch_vccz .LBB8_2 +; DAGISEL-OPT-NEXT: ; %bb.1: ; %true +; DAGISEL-OPT-NEXT: s_mov_b32 s0, 42 +; DAGISEL-OPT-NEXT: s_branch .LBB8_3 +; DAGISEL-OPT-NEXT: .LBB8_2: ; %false +; DAGISEL-OPT-NEXT: s_mov_b32 s0, 33 +; DAGISEL-OPT-NEXT: s_branch .LBB8_3 +; DAGISEL-OPT-NEXT: .LBB8_3: +; +; GISEL-OPT-LABEL: branch_divergent_ballot64_ne_zero_and: +; GISEL-OPT: ; %bb.0: +; GISEL-OPT-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0 +; GISEL-OPT-NEXT: v_cmp_lt_u32_e64 s0, 34, v1 +; GISEL-OPT-NEXT: s_and_b32 s0, vcc_lo, s0 +; GISEL-OPT-NEXT: s_cmp_eq_u32 s0, 0 +; GISEL-OPT-NEXT: s_cbranch_scc1 .LBB8_2 +; GISEL-OPT-NEXT: ; %bb.1: ; %true +; GISEL-OPT-NEXT: s_mov_b32 s0, 42 +; GISEL-OPT-NEXT: s_branch .LBB8_3 +; GISEL-OPT-NEXT: .LBB8_2: ; %false +; GISEL-OPT-NEXT: s_mov_b32 s0, 33 +; GISEL-OPT-NEXT: s_branch .LBB8_3 +; GISEL-OPT-NEXT: .LBB8_3: %v1c = icmp ult i32 %v1, 12 %v2c = icmp ugt i32 %v2, 34 %c = and i1 %v1c, %v2c diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll index 804283cc20cd6a3..94c32e3cbe99f72 100644 --- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll @@ -2599,7 +2599,8 @@ declare i32 @llvm.amdgcn.ballot.i32(i1) nounwind readnone convergent define i64 @ballot_nocombine_64(i1 %i) { ; CHECK-LABEL: @ballot_nocombine_64( -; CHECK-NEXT: [[B:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[I:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[I:%.*]]) +; CHECK-NEXT: [[B:%.*]] = zext i32 [[TMP1]] to i64 ; CHECK-NEXT: ret i64 [[B]] ; %b = call i64 @llvm.amdgcn.ballot.i64(i1 %i) @@ -2616,7 +2617,8 @@ define i64 @ballot_zero_64() { define i64 @ballot_one_64() { ; CHECK-LABEL: @ballot_one_64( -; CHECK-NEXT: [[B:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 true) +; CHECK-NEXT: [[B:%.*]] = zext i32 [[TMP1]] to i64 ; CHECK-NEXT: ret i64 [[B]] ; %b = call i64 @llvm.amdgcn.ballot.i64(i1 1) _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits