llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-llvm-transforms Author: Florian Hahn (fhahn) <details> <summary>Changes</summary> Update createEdgeMask to create masks where the terminator in Src is a switch. We need to handle 2 separate cases: 1. Dst is not the default destination. Dst is reached if any of the cases with destination == Dst are taken. Join the conditions for each case where destination == Dst using a logical OR. 2. Dst is the default destination. Dst is reached if none of the cases with destination != Dst are taken. Join the conditions for each case where the destination is != Dst using a logical OR and negate it. Fixes https://github.com/llvm/llvm-project/issues/48188. --- Patch is 84.17 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/99808.diff 7 Files Affected: - (modified) clang/test/Frontend/optimization-remark-analysis.c (+1-1) - (modified) llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp (+5-5) - (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+35) - (modified) llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll (+623-24) - (modified) llvm/test/Transforms/LoopVectorize/no_switch.ll (+5-7) - (modified) llvm/test/Transforms/LoopVectorize/predicate-switch.ll (+196-4) - (modified) llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll (+55-5) ``````````diff diff --git a/clang/test/Frontend/optimization-remark-analysis.c b/clang/test/Frontend/optimization-remark-analysis.c index e43984942a6ef..9d8917265a320 100644 --- a/clang/test/Frontend/optimization-remark-analysis.c +++ b/clang/test/Frontend/optimization-remark-analysis.c @@ -1,7 +1,7 @@ // RUN: %clang -O1 -fvectorize -target x86_64-unknown-unknown -emit-llvm -Rpass-analysis -S %s -o - 2>&1 | FileCheck %s --check-prefix=RPASS // RUN: %clang -O1 -fvectorize -target x86_64-unknown-unknown -emit-llvm -S %s -o - 2>&1 | FileCheck %s -// RPASS: {{.*}}:12:5: remark: loop not vectorized: loop contains 
a switch statement +// RPASS-NOT: {{.*}}:12:5: remark: loop not vectorized // CHECK-NOT: remark: loop not vectorized: loop contains a switch statement double foo(int N, int *Array) { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index f54eebb2874ab..7f84455150093 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -1348,11 +1348,11 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { // Collect the blocks that need predication. for (BasicBlock *BB : TheLoop->blocks()) { // We don't support switch statements inside loops. - if (!isa<BranchInst>(BB->getTerminator())) { - reportVectorizationFailure("Loop contains a switch statement", - "loop contains a switch statement", - "LoopContainsSwitch", ORE, TheLoop, - BB->getTerminator()); + if (!isa<BranchInst, SwitchInst>(BB->getTerminator())) { + reportVectorizationFailure("Loop contains an unsupported termaintor", + "loop contains an unsupported terminator", + "LoopContainsUnsupportedTerminator", ORE, + TheLoop, BB->getTerminator()); return false; } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 6d28b8fabe42e..2530762e3e424 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7763,6 +7763,41 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { VPValue *SrcMask = getBlockInMask(Src); + if (auto *SI = dyn_cast<SwitchInst>(Src->getTerminator())) { + // Create mask where the terminator in Src is a switch. We need to handle 2 + // separate cases: + // 1. Dst is not the default desintation. Dst is reached if any of the cases + // with destination == Dst are taken. Join the conditions for each case + // where destination == Dst using a logical OR. + // 2. 
Dst is the default destination. Dst is reached if none of the cases + // with destination != Dst are taken. Join the conditions for each case + // where the destination is != Dst using a logical OR and negate it. + VPValue *Mask = nullptr; + VPValue *Cond = getVPValueOrAddLiveIn(SI->getCondition(), Plan); + bool IsDefault = SI->getDefaultDest() == Dst; + for (auto &C : SI->cases()) { + if (IsDefault) { + if (C.getCaseSuccessor() == Dst) + continue; + } else if (C.getCaseSuccessor() != Dst) + continue; + + VPValue *Eq = EdgeMaskCache.lookup({Src, C.getCaseSuccessor()}); + if (!Eq) { + VPValue *V = getVPValueOrAddLiveIn(C.getCaseValue(), Plan); + Eq = Builder.createICmp(CmpInst::ICMP_EQ, Cond, V); + } + if (Mask) + Mask = Builder.createOr(Mask, Eq); + else + Mask = Eq; + } + if (IsDefault) + Mask = Builder.createNot(Mask); + assert(Mask && "mask must be created"); + return EdgeMaskCache[Edge] = Mask; + } + // The terminator has to be a branch inst! BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); assert(BI && "Unexpected terminator found"); diff --git a/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll b/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll index b8ce3c40920a3..ff73a149c8e39 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll @@ -6,9 +6,43 @@ define void @switch_default_to_latch_common_dest(ptr %start, ptr %end) { ; IC1-LABEL: define void @switch_default_to_latch_common_dest( ; IC1-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0:[0-9]+]] { ; IC1-NEXT: [[ENTRY:.*]]: +; IC1-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; IC1-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; IC1-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -8 +; IC1-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] +; IC1-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3 +; IC1-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 +; IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 
[[TMP3]], 4 +; IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IC1: [[VECTOR_PH]]: +; IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 4 +; IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] +; IC1-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 8 +; IC1-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]] +; IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; IC1: [[VECTOR_BODY]]: +; IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC1-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 +; IC1-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 +; IC1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP5]] +; IC1-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0 +; IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP6]], align 1 +; IC1-NEXT: [[TMP7:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], <i64 -12, i64 -12, i64 -12, i64 -12> +; IC1-NEXT: [[TMP8:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], <i64 13, i64 13, i64 13, i64 13> +; IC1-NEXT: [[TMP9:%.*]] = or <4 x i1> [[TMP7]], [[TMP8]] +; IC1-NEXT: [[TMP10:%.*]] = or <4 x i1> [[TMP9]], [[TMP9]] +; IC1-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 42, i64 42, i64 42, i64 42>, ptr [[TMP6]], i32 1, <4 x i1> [[TMP10]]) +; IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; IC1-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IC1-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IC1: [[MIDDLE_BLOCK]]: +; IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IC1: [[SCALAR_PH]]: +; IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] ; IC1-NEXT: br label %[[LOOP_HEADER:.*]] ; IC1: [[LOOP_HEADER]]: -; IC1-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] 
] +; IC1-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; IC1-NEXT: [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1 ; IC1-NEXT: switch i64 [[L]], label %[[LOOP_LATCH]] [ ; IC1-NEXT: i64 -12, label %[[IF_THEN:.*]] @@ -20,16 +54,59 @@ define void @switch_default_to_latch_common_dest(ptr %start, ptr %end) { ; IC1: [[LOOP_LATCH]]: ; IC1-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1 ; IC1-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]] -; IC1-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]] +; IC1-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] ; IC1: [[EXIT]]: ; IC1-NEXT: ret void ; ; IC2-LABEL: define void @switch_default_to_latch_common_dest( ; IC2-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0:[0-9]+]] { ; IC2-NEXT: [[ENTRY:.*]]: +; IC2-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; IC2-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; IC2-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -8 +; IC2-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] +; IC2-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3 +; IC2-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 +; IC2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 8 +; IC2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IC2: [[VECTOR_PH]]: +; IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 8 +; IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] +; IC2-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 8 +; IC2-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]] +; IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; IC2: [[VECTOR_BODY]]: +; IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC2-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 +; IC2-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 +; IC2-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 32 +; IC2-NEXT: 
[[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP5]] +; IC2-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP6]] +; IC2-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0 +; IC2-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 4 +; IC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 1 +; IC2-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP8]], align 1 +; IC2-NEXT: [[TMP9:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], <i64 -12, i64 -12, i64 -12, i64 -12> +; IC2-NEXT: [[TMP10:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], <i64 -12, i64 -12, i64 -12, i64 -12> +; IC2-NEXT: [[TMP11:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], <i64 13, i64 13, i64 13, i64 13> +; IC2-NEXT: [[TMP12:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], <i64 13, i64 13, i64 13, i64 13> +; IC2-NEXT: [[TMP13:%.*]] = or <4 x i1> [[TMP9]], [[TMP11]] +; IC2-NEXT: [[TMP14:%.*]] = or <4 x i1> [[TMP10]], [[TMP12]] +; IC2-NEXT: [[TMP15:%.*]] = or <4 x i1> [[TMP13]], [[TMP13]] +; IC2-NEXT: [[TMP16:%.*]] = or <4 x i1> [[TMP14]], [[TMP14]] +; IC2-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 42, i64 42, i64 42, i64 42>, ptr [[TMP7]], i32 1, <4 x i1> [[TMP15]]) +; IC2-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 42, i64 42, i64 42, i64 42>, ptr [[TMP8]], i32 1, <4 x i1> [[TMP16]]) +; IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; IC2-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IC2-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IC2: [[MIDDLE_BLOCK]]: +; IC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; IC2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IC2: [[SCALAR_PH]]: +; IC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] ; IC2-NEXT: br label %[[LOOP_HEADER:.*]] ; IC2: [[LOOP_HEADER]]: -; IC2-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ 
[[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; IC2-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; IC2-NEXT: [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1 ; IC2-NEXT: switch i64 [[L]], label %[[LOOP_LATCH]] [ ; IC2-NEXT: i64 -12, label %[[IF_THEN:.*]] @@ -41,7 +118,7 @@ define void @switch_default_to_latch_common_dest(ptr %start, ptr %end) { ; IC2: [[LOOP_LATCH]]: ; IC2-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1 ; IC2-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]] -; IC2-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]] +; IC2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] ; IC2: [[EXIT]]: ; IC2-NEXT: ret void ; @@ -73,9 +150,48 @@ define void @switch_all_dests_distinct(ptr %start, ptr %end) { ; IC1-LABEL: define void @switch_all_dests_distinct( ; IC1-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0]] { ; IC1-NEXT: [[ENTRY:.*]]: +; IC1-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; IC1-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; IC1-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -8 +; IC1-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] +; IC1-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3 +; IC1-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 +; IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4 +; IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IC1: [[VECTOR_PH]]: +; IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 4 +; IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] +; IC1-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 8 +; IC1-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]] +; IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; IC1: [[VECTOR_BODY]]: +; IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC1-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 +; IC1-NEXT: [[TMP5:%.*]] = 
add i64 [[OFFSET_IDX]], 0 +; IC1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP5]] +; IC1-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0 +; IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP6]], align 1 +; IC1-NEXT: [[TMP7:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], zeroinitializer +; IC1-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 1, i64 1, i64 1, i64 1>, ptr [[TMP6]], i32 1, <4 x i1> [[TMP7]]) +; IC1-NEXT: [[TMP8:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], <i64 13, i64 13, i64 13, i64 13> +; IC1-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP6]], i32 1, <4 x i1> [[TMP8]]) +; IC1-NEXT: [[TMP9:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], <i64 -12, i64 -12, i64 -12, i64 -12> +; IC1-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 42, i64 42, i64 42, i64 42>, ptr [[TMP6]], i32 1, <4 x i1> [[TMP9]]) +; IC1-NEXT: [[TMP10:%.*]] = or <4 x i1> [[TMP9]], [[TMP8]] +; IC1-NEXT: [[TMP11:%.*]] = or <4 x i1> [[TMP10]], [[TMP7]] +; IC1-NEXT: [[TMP12:%.*]] = xor <4 x i1> [[TMP11]], <i1 true, i1 true, i1 true, i1 true> +; IC1-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 2, i64 2, i64 2, i64 2>, ptr [[TMP6]], i32 1, <4 x i1> [[TMP12]]) +; IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; IC1-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IC1-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; IC1: [[MIDDLE_BLOCK]]: +; IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IC1: [[SCALAR_PH]]: +; IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] ; IC1-NEXT: br label %[[LOOP_HEADER:.*]] ; IC1: [[LOOP_HEADER]]: -; IC1-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; IC1-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], 
%[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; IC1-NEXT: [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1 ; IC1-NEXT: switch i64 [[L]], label %[[DEFAULT:.*]] [ ; IC1-NEXT: i64 -12, label %[[IF_THEN_1:.*]] @@ -97,16 +213,69 @@ define void @switch_all_dests_distinct(ptr %start, ptr %end) { ; IC1: [[LOOP_LATCH]]: ; IC1-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1 ; IC1-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]] -; IC1-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]] +; IC1-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]] ; IC1: [[EXIT]]: ; IC1-NEXT: ret void ; ; IC2-LABEL: define void @switch_all_dests_distinct( ; IC2-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0]] { ; IC2-NEXT: [[ENTRY:.*]]: +; IC2-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; IC2-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; IC2-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -8 +; IC2-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] +; IC2-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3 +; IC2-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 +; IC2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 8 +; IC2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IC2: [[VECTOR_PH]]: +; IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 8 +; IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] +; IC2-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 8 +; IC2-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]] +; IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; IC2: [[VECTOR_BODY]]: +; IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC2-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 +; IC2-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 +; IC2-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 32 +; IC2-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP5]] +; IC2-NEXT: [[NEXT_GEP3:%.*]] = 
getelementptr i8, ptr [[START]], i64 [[TMP6]] +; IC2-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0 +; IC2-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 4 +; IC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 1 +; IC2-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP8]], align 1 +; IC2-NEXT: [[TMP9:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], zeroinitializer +; IC2-NEXT: [[TMP10:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], zeroinitializer +; IC2-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 1, i64 1, i64 1, i64 1>, ptr [[TMP7]], i32 1, <4 x i1> [[TMP9]]) +; IC2-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 1, i64 1, i64 1, i64 1>, ptr [[TMP8]], i32 1, <4 x i1> [[TMP10]]) +; IC2-NEXT: [[TMP11:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], <i64 13, i64 13, i64 13, i64 13> +; IC2-NEXT: [[TMP12:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], <i64 13, i64 13, i64 13, i64 13> +; IC2-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP7]], i32 1, <4 x i1> [[TMP11]]) +; IC2-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP8]], i32 1, <4 x i1> [[TMP12]]) +; IC2-NEXT: [[TMP13:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], <i64 -12, i64 -12, i64 -12, i64 -12> +; IC2-NEXT: [[TMP14:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], <i64 -12, i64 -12, i64 -12, i64 -12> +; IC2-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 42, i64 42, i64 42, i64 42>, ptr [[TMP7]], i32 1, <4 x i1> [[TMP13]]) +; IC2-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 42, i64 42, i64 42, i64 42>, ptr [[TMP8]], i32 1, <4 x i1> [[TMP14]]) +; IC2-NEXT: [[TMP15:%.*]] = or <4 x i1> [[TMP13]], [[TMP11]] +; IC2-NEXT: [[TMP16:%.*]] = or <4 x i1> [[TMP14]], [[TMP12]] +; IC2-NEXT: [[TMP17:%.*]] = or <4 x i1> [[TMP15]], [[TMP9]] +; IC2-NEXT: [[TMP18:%.*]] = or <4 x i1> [[TMP16]], [[TMP10]] +; IC2-NEXT: [[TMP19:%.*]] = xor <4 x i1> [[TMP17]], <i1 true, i1 true, i1 true, i1 true> +; IC2-NEXT: 
[[TMP20:%.*]] = xor <4 x i1> [[TMP18]], <i1 true, i1 true, i1 true, i1 true> +; IC2-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 2, i64 2, i64 2, i64 2>, ptr [[TMP7]], i32 1, <4 x i1> [[TMP19]]) +; IC2-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 2, i64 2, i64 2, i64 2>, ptr [[TMP8]], i32 1, <4 x i1> [[TMP20]]) +; IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; IC2-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IC2-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; IC2: [[MIDDLE_BLOCK]]: +; IC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; IC2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IC2: [[SCALAR_PH]]: +; IC2-NEXT: [[BC_RES... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/99808 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits