https://github.com/fhahn created https://github.com/llvm/llvm-project/pull/99808
Update createEdgeMask to create masks where the terminator in Src is a switch. We need to handle 2 separate cases: 1. Dst is not the default destination. Dst is reached if any of the cases with destination == Dst are taken. Join the conditions for each case where destination == Dst using a logical OR. 2. Dst is the default destination. Dst is reached if none of the cases with destination != Dst are taken. Join the conditions for each case where the destination is != Dst using a logical OR and negate it. Fixes https://github.com/llvm/llvm-project/issues/48188. >From 19d0aec67eb5f115c0b5349bbf2742154f66c0f1 Mon Sep 17 00:00:00 2001 From: Florian Hahn <f...@fhahn.com> Date: Sat, 20 Jul 2024 21:12:00 +0100 Subject: [PATCH] [LV] Support generating masks for switch terminators. Update createEdgeMask to create masks where the terminator in Src is a switch. We need to handle 2 separate cases: 1. Dst is not the default destination. Dst is reached if any of the cases with destination == Dst are taken. Join the conditions for each case where destination == Dst using a logical OR. 2. Dst is the default destination. Dst is reached if none of the cases with destination != Dst are taken. Join the conditions for each case where the destination is != Dst using a logical OR and negate it. Fixes https://github.com/llvm/llvm-project/issues/48188. 
--- .../Frontend/optimization-remark-analysis.c | 2 +- .../Vectorize/LoopVectorizationLegality.cpp | 10 +- .../Transforms/Vectorize/LoopVectorize.cpp | 35 + .../LoopVectorize/X86/predicate-switch.ll | 647 +++++++++++++++++- .../Transforms/LoopVectorize/no_switch.ll | 12 +- .../LoopVectorize/predicate-switch.ll | 200 +++++- .../X86/pr48844-br-to-switch-vectorization.ll | 60 +- 7 files changed, 920 insertions(+), 46 deletions(-) diff --git a/clang/test/Frontend/optimization-remark-analysis.c b/clang/test/Frontend/optimization-remark-analysis.c index e43984942a6ef..9d8917265a320 100644 --- a/clang/test/Frontend/optimization-remark-analysis.c +++ b/clang/test/Frontend/optimization-remark-analysis.c @@ -1,7 +1,7 @@ // RUN: %clang -O1 -fvectorize -target x86_64-unknown-unknown -emit-llvm -Rpass-analysis -S %s -o - 2>&1 | FileCheck %s --check-prefix=RPASS // RUN: %clang -O1 -fvectorize -target x86_64-unknown-unknown -emit-llvm -S %s -o - 2>&1 | FileCheck %s -// RPASS: {{.*}}:12:5: remark: loop not vectorized: loop contains a switch statement +// RPASS-NOT: {{.*}}:12:5: remark: loop not vectorized // CHECK-NOT: remark: loop not vectorized: loop contains a switch statement double foo(int N, int *Array) { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index f54eebb2874ab..7f84455150093 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -1348,11 +1348,11 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { // Collect the blocks that need predication. for (BasicBlock *BB : TheLoop->blocks()) { // We don't support switch statements inside loops. 
- if (!isa<BranchInst>(BB->getTerminator())) { - reportVectorizationFailure("Loop contains a switch statement", - "loop contains a switch statement", - "LoopContainsSwitch", ORE, TheLoop, - BB->getTerminator()); + if (!isa<BranchInst, SwitchInst>(BB->getTerminator())) { + reportVectorizationFailure("Loop contains an unsupported termaintor", + "loop contains an unsupported terminator", + "LoopContainsUnsupportedTerminator", ORE, + TheLoop, BB->getTerminator()); return false; } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 6d28b8fabe42e..2530762e3e424 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7763,6 +7763,41 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { VPValue *SrcMask = getBlockInMask(Src); + if (auto *SI = dyn_cast<SwitchInst>(Src->getTerminator())) { + // Create mask where the terminator in Src is a switch. We need to handle 2 + // separate cases: + // 1. Dst is not the default desintation. Dst is reached if any of the cases + // with destination == Dst are taken. Join the conditions for each case + // where destination == Dst using a logical OR. + // 2. Dst is the default destination. Dst is reached if none of the cases + // with destination != Dst are taken. Join the conditions for each case + // where the destination is != Dst using a logical OR and negate it. 
+ VPValue *Mask = nullptr; + VPValue *Cond = getVPValueOrAddLiveIn(SI->getCondition(), Plan); + bool IsDefault = SI->getDefaultDest() == Dst; + for (auto &C : SI->cases()) { + if (IsDefault) { + if (C.getCaseSuccessor() == Dst) + continue; + } else if (C.getCaseSuccessor() != Dst) + continue; + + VPValue *Eq = EdgeMaskCache.lookup({Src, C.getCaseSuccessor()}); + if (!Eq) { + VPValue *V = getVPValueOrAddLiveIn(C.getCaseValue(), Plan); + Eq = Builder.createICmp(CmpInst::ICMP_EQ, Cond, V); + } + if (Mask) + Mask = Builder.createOr(Mask, Eq); + else + Mask = Eq; + } + if (IsDefault) + Mask = Builder.createNot(Mask); + assert(Mask && "mask must be created"); + return EdgeMaskCache[Edge] = Mask; + } + // The terminator has to be a branch inst! BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); assert(BI && "Unexpected terminator found"); diff --git a/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll b/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll index b8ce3c40920a3..ff73a149c8e39 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll @@ -6,9 +6,43 @@ define void @switch_default_to_latch_common_dest(ptr %start, ptr %end) { ; IC1-LABEL: define void @switch_default_to_latch_common_dest( ; IC1-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0:[0-9]+]] { ; IC1-NEXT: [[ENTRY:.*]]: +; IC1-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; IC1-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; IC1-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -8 +; IC1-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] +; IC1-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3 +; IC1-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 +; IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4 +; IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IC1: [[VECTOR_PH]]: +; IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 4 +; IC1-NEXT: [[N_VEC:%.*]] = sub i64 
[[TMP3]], [[N_MOD_VF]] +; IC1-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 8 +; IC1-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]] +; IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; IC1: [[VECTOR_BODY]]: +; IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC1-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 +; IC1-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 +; IC1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP5]] +; IC1-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0 +; IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP6]], align 1 +; IC1-NEXT: [[TMP7:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], <i64 -12, i64 -12, i64 -12, i64 -12> +; IC1-NEXT: [[TMP8:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], <i64 13, i64 13, i64 13, i64 13> +; IC1-NEXT: [[TMP9:%.*]] = or <4 x i1> [[TMP7]], [[TMP8]] +; IC1-NEXT: [[TMP10:%.*]] = or <4 x i1> [[TMP9]], [[TMP9]] +; IC1-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 42, i64 42, i64 42, i64 42>, ptr [[TMP6]], i32 1, <4 x i1> [[TMP10]]) +; IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; IC1-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IC1-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IC1: [[MIDDLE_BLOCK]]: +; IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IC1: [[SCALAR_PH]]: +; IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] ; IC1-NEXT: br label %[[LOOP_HEADER:.*]] ; IC1: [[LOOP_HEADER]]: -; IC1-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; IC1-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; IC1-NEXT: [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1 ; IC1-NEXT: switch i64 [[L]], 
label %[[LOOP_LATCH]] [ ; IC1-NEXT: i64 -12, label %[[IF_THEN:.*]] @@ -20,16 +54,59 @@ define void @switch_default_to_latch_common_dest(ptr %start, ptr %end) { ; IC1: [[LOOP_LATCH]]: ; IC1-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1 ; IC1-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]] -; IC1-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]] +; IC1-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] ; IC1: [[EXIT]]: ; IC1-NEXT: ret void ; ; IC2-LABEL: define void @switch_default_to_latch_common_dest( ; IC2-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0:[0-9]+]] { ; IC2-NEXT: [[ENTRY:.*]]: +; IC2-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; IC2-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; IC2-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -8 +; IC2-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] +; IC2-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3 +; IC2-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 +; IC2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 8 +; IC2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IC2: [[VECTOR_PH]]: +; IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 8 +; IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] +; IC2-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 8 +; IC2-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]] +; IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; IC2: [[VECTOR_BODY]]: +; IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC2-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 +; IC2-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 +; IC2-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 32 +; IC2-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP5]] +; IC2-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP6]] +; IC2-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0 +; 
IC2-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 4 +; IC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 1 +; IC2-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP8]], align 1 +; IC2-NEXT: [[TMP9:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], <i64 -12, i64 -12, i64 -12, i64 -12> +; IC2-NEXT: [[TMP10:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], <i64 -12, i64 -12, i64 -12, i64 -12> +; IC2-NEXT: [[TMP11:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], <i64 13, i64 13, i64 13, i64 13> +; IC2-NEXT: [[TMP12:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], <i64 13, i64 13, i64 13, i64 13> +; IC2-NEXT: [[TMP13:%.*]] = or <4 x i1> [[TMP9]], [[TMP11]] +; IC2-NEXT: [[TMP14:%.*]] = or <4 x i1> [[TMP10]], [[TMP12]] +; IC2-NEXT: [[TMP15:%.*]] = or <4 x i1> [[TMP13]], [[TMP13]] +; IC2-NEXT: [[TMP16:%.*]] = or <4 x i1> [[TMP14]], [[TMP14]] +; IC2-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 42, i64 42, i64 42, i64 42>, ptr [[TMP7]], i32 1, <4 x i1> [[TMP15]]) +; IC2-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 42, i64 42, i64 42, i64 42>, ptr [[TMP8]], i32 1, <4 x i1> [[TMP16]]) +; IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; IC2-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IC2-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IC2: [[MIDDLE_BLOCK]]: +; IC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; IC2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IC2: [[SCALAR_PH]]: +; IC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] ; IC2-NEXT: br label %[[LOOP_HEADER:.*]] ; IC2: [[LOOP_HEADER]]: -; IC2-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; IC2-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; IC2-NEXT: [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1 ; 
IC2-NEXT: switch i64 [[L]], label %[[LOOP_LATCH]] [ ; IC2-NEXT: i64 -12, label %[[IF_THEN:.*]] @@ -41,7 +118,7 @@ define void @switch_default_to_latch_common_dest(ptr %start, ptr %end) { ; IC2: [[LOOP_LATCH]]: ; IC2-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1 ; IC2-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]] -; IC2-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]] +; IC2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] ; IC2: [[EXIT]]: ; IC2-NEXT: ret void ; @@ -73,9 +150,48 @@ define void @switch_all_dests_distinct(ptr %start, ptr %end) { ; IC1-LABEL: define void @switch_all_dests_distinct( ; IC1-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0]] { ; IC1-NEXT: [[ENTRY:.*]]: +; IC1-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; IC1-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; IC1-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -8 +; IC1-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] +; IC1-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3 +; IC1-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 +; IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4 +; IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IC1: [[VECTOR_PH]]: +; IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 4 +; IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] +; IC1-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 8 +; IC1-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]] +; IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; IC1: [[VECTOR_BODY]]: +; IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC1-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 +; IC1-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 +; IC1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP5]] +; IC1-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0 +; IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, 
ptr [[TMP6]], align 1 +; IC1-NEXT: [[TMP7:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], zeroinitializer +; IC1-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 1, i64 1, i64 1, i64 1>, ptr [[TMP6]], i32 1, <4 x i1> [[TMP7]]) +; IC1-NEXT: [[TMP8:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], <i64 13, i64 13, i64 13, i64 13> +; IC1-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP6]], i32 1, <4 x i1> [[TMP8]]) +; IC1-NEXT: [[TMP9:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], <i64 -12, i64 -12, i64 -12, i64 -12> +; IC1-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 42, i64 42, i64 42, i64 42>, ptr [[TMP6]], i32 1, <4 x i1> [[TMP9]]) +; IC1-NEXT: [[TMP10:%.*]] = or <4 x i1> [[TMP9]], [[TMP8]] +; IC1-NEXT: [[TMP11:%.*]] = or <4 x i1> [[TMP10]], [[TMP7]] +; IC1-NEXT: [[TMP12:%.*]] = xor <4 x i1> [[TMP11]], <i1 true, i1 true, i1 true, i1 true> +; IC1-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 2, i64 2, i64 2, i64 2>, ptr [[TMP6]], i32 1, <4 x i1> [[TMP12]]) +; IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; IC1-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IC1-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; IC1: [[MIDDLE_BLOCK]]: +; IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IC1: [[SCALAR_PH]]: +; IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] ; IC1-NEXT: br label %[[LOOP_HEADER:.*]] ; IC1: [[LOOP_HEADER]]: -; IC1-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; IC1-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; IC1-NEXT: [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1 ; IC1-NEXT: switch i64 [[L]], label %[[DEFAULT:.*]] [ ; IC1-NEXT: i64 -12, label %[[IF_THEN_1:.*]] @@ 
-97,16 +213,69 @@ define void @switch_all_dests_distinct(ptr %start, ptr %end) { ; IC1: [[LOOP_LATCH]]: ; IC1-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1 ; IC1-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]] -; IC1-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]] +; IC1-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]] ; IC1: [[EXIT]]: ; IC1-NEXT: ret void ; ; IC2-LABEL: define void @switch_all_dests_distinct( ; IC2-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0]] { ; IC2-NEXT: [[ENTRY:.*]]: +; IC2-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; IC2-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; IC2-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -8 +; IC2-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] +; IC2-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3 +; IC2-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 +; IC2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 8 +; IC2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IC2: [[VECTOR_PH]]: +; IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 8 +; IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] +; IC2-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 8 +; IC2-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]] +; IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; IC2: [[VECTOR_BODY]]: +; IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC2-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 +; IC2-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 +; IC2-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 32 +; IC2-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP5]] +; IC2-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP6]] +; IC2-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0 +; IC2-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 4 +; IC2-NEXT: [[WIDE_LOAD:%.*]] = 
load <4 x i64>, ptr [[TMP7]], align 1 +; IC2-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP8]], align 1 +; IC2-NEXT: [[TMP9:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], zeroinitializer +; IC2-NEXT: [[TMP10:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], zeroinitializer +; IC2-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 1, i64 1, i64 1, i64 1>, ptr [[TMP7]], i32 1, <4 x i1> [[TMP9]]) +; IC2-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 1, i64 1, i64 1, i64 1>, ptr [[TMP8]], i32 1, <4 x i1> [[TMP10]]) +; IC2-NEXT: [[TMP11:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], <i64 13, i64 13, i64 13, i64 13> +; IC2-NEXT: [[TMP12:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], <i64 13, i64 13, i64 13, i64 13> +; IC2-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP7]], i32 1, <4 x i1> [[TMP11]]) +; IC2-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP8]], i32 1, <4 x i1> [[TMP12]]) +; IC2-NEXT: [[TMP13:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], <i64 -12, i64 -12, i64 -12, i64 -12> +; IC2-NEXT: [[TMP14:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], <i64 -12, i64 -12, i64 -12, i64 -12> +; IC2-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 42, i64 42, i64 42, i64 42>, ptr [[TMP7]], i32 1, <4 x i1> [[TMP13]]) +; IC2-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 42, i64 42, i64 42, i64 42>, ptr [[TMP8]], i32 1, <4 x i1> [[TMP14]]) +; IC2-NEXT: [[TMP15:%.*]] = or <4 x i1> [[TMP13]], [[TMP11]] +; IC2-NEXT: [[TMP16:%.*]] = or <4 x i1> [[TMP14]], [[TMP12]] +; IC2-NEXT: [[TMP17:%.*]] = or <4 x i1> [[TMP15]], [[TMP9]] +; IC2-NEXT: [[TMP18:%.*]] = or <4 x i1> [[TMP16]], [[TMP10]] +; IC2-NEXT: [[TMP19:%.*]] = xor <4 x i1> [[TMP17]], <i1 true, i1 true, i1 true, i1 true> +; IC2-NEXT: [[TMP20:%.*]] = xor <4 x i1> [[TMP18]], <i1 true, i1 true, i1 true, i1 true> +; IC2-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 2, i64 2, i64 2, i64 2>, ptr [[TMP7]], i32 1, <4 x i1> [[TMP19]]) +; 
IC2-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 2, i64 2, i64 2, i64 2>, ptr [[TMP8]], i32 1, <4 x i1> [[TMP20]]) +; IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; IC2-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IC2-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; IC2: [[MIDDLE_BLOCK]]: +; IC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; IC2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IC2: [[SCALAR_PH]]: +; IC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] ; IC2-NEXT: br label %[[LOOP_HEADER:.*]] ; IC2: [[LOOP_HEADER]]: -; IC2-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; IC2-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; IC2-NEXT: [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1 ; IC2-NEXT: switch i64 [[L]], label %[[DEFAULT:.*]] [ ; IC2-NEXT: i64 -12, label %[[IF_THEN_1:.*]] @@ -128,7 +297,7 @@ define void @switch_all_dests_distinct(ptr %start, ptr %end) { ; IC2: [[LOOP_LATCH]]: ; IC2-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1 ; IC2-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]] -; IC2-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]] +; IC2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]] ; IC2: [[EXIT]]: ; IC2-NEXT: ret void ; @@ -174,9 +343,57 @@ define void @switch_multiple_common_dests(ptr %start, ptr %end) { ; IC1-LABEL: define void @switch_multiple_common_dests( ; IC1-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0]] { ; IC1-NEXT: [[ENTRY:.*]]: +; IC1-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; IC1-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; IC1-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -8 +; IC1-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], 
[[START2]] +; IC1-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3 +; IC1-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 +; IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4 +; IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IC1: [[VECTOR_PH]]: +; IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 4 +; IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] +; IC1-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 8 +; IC1-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]] +; IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; IC1: [[VECTOR_BODY]]: +; IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC1-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 +; IC1-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 +; IC1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP5]] +; IC1-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0 +; IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP6]], align 1 +; IC1-NEXT: [[TMP7:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], <i64 13, i64 13, i64 13, i64 13> +; IC1-NEXT: [[TMP8:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], <i64 14, i64 14, i64 14, i64 14> +; IC1-NEXT: [[TMP9:%.*]] = or <4 x i1> [[TMP7]], [[TMP8]] +; IC1-NEXT: [[TMP10:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], <i64 15, i64 15, i64 15, i64 15> +; IC1-NEXT: [[TMP11:%.*]] = or <4 x i1> [[TMP9]], [[TMP10]] +; IC1-NEXT: [[TMP12:%.*]] = or <4 x i1> [[TMP11]], [[TMP11]] +; IC1-NEXT: [[TMP13:%.*]] = or <4 x i1> [[TMP12]], [[TMP11]] +; IC1-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP6]], i32 1, <4 x i1> [[TMP13]]) +; IC1-NEXT: [[TMP14:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], <i64 -12, i64 -12, i64 -12, i64 -12> +; IC1-NEXT: [[TMP15:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], zeroinitializer +; IC1-NEXT: [[TMP16:%.*]] = or <4 x i1> [[TMP14]], [[TMP15]] +; IC1-NEXT: [[TMP17:%.*]] = or <4 x i1> [[TMP16]], [[TMP16]] +; IC1-NEXT: call void 
@llvm.masked.store.v4i64.p0(<4 x i64> <i64 42, i64 42, i64 42, i64 42>, ptr [[TMP6]], i32 1, <4 x i1> [[TMP17]]) +; IC1-NEXT: [[TMP18:%.*]] = or <4 x i1> [[TMP16]], [[TMP16]] +; IC1-NEXT: [[TMP19:%.*]] = or <4 x i1> [[TMP18]], [[TMP11]] +; IC1-NEXT: [[TMP20:%.*]] = or <4 x i1> [[TMP19]], [[TMP11]] +; IC1-NEXT: [[TMP21:%.*]] = or <4 x i1> [[TMP20]], [[TMP11]] +; IC1-NEXT: [[TMP22:%.*]] = xor <4 x i1> [[TMP21]], <i1 true, i1 true, i1 true, i1 true> +; IC1-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 2, i64 2, i64 2, i64 2>, ptr [[TMP6]], i32 1, <4 x i1> [[TMP22]]) +; IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; IC1-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IC1-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; IC1: [[MIDDLE_BLOCK]]: +; IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IC1: [[SCALAR_PH]]: +; IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] ; IC1-NEXT: br label %[[LOOP_HEADER:.*]] ; IC1: [[LOOP_HEADER]]: -; IC1-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; IC1-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; IC1-NEXT: [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1 ; IC1-NEXT: switch i64 [[L]], label %[[DEFAULT:.*]] [ ; IC1-NEXT: i64 -12, label %[[IF_THEN_1:.*]] @@ -197,16 +414,87 @@ define void @switch_multiple_common_dests(ptr %start, ptr %end) { ; IC1: [[LOOP_LATCH]]: ; IC1-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1 ; IC1-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]] -; IC1-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]] +; IC1-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP7:![0-9]+]] ; IC1: [[EXIT]]: ; 
IC1-NEXT: ret void ; ; IC2-LABEL: define void @switch_multiple_common_dests( ; IC2-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0]] { ; IC2-NEXT: [[ENTRY:.*]]: +; IC2-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; IC2-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; IC2-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -8 +; IC2-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] +; IC2-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3 +; IC2-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 +; IC2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 8 +; IC2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IC2: [[VECTOR_PH]]: +; IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 8 +; IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] +; IC2-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 8 +; IC2-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]] +; IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; IC2: [[VECTOR_BODY]]: +; IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC2-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 +; IC2-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 +; IC2-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 32 +; IC2-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP5]] +; IC2-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP6]] +; IC2-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0 +; IC2-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 4 +; IC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 1 +; IC2-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP8]], align 1 +; IC2-NEXT: [[TMP9:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], <i64 13, i64 13, i64 13, i64 13> +; IC2-NEXT: [[TMP10:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], <i64 13, i64 13, i64 13, i64 13> +; IC2-NEXT: [[TMP11:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], <i64 14, i64 14, i64 14, i64 14> +; IC2-NEXT: [[TMP12:%.*]] = icmp 
eq <4 x i64> [[WIDE_LOAD4]], <i64 14, i64 14, i64 14, i64 14> +; IC2-NEXT: [[TMP13:%.*]] = or <4 x i1> [[TMP9]], [[TMP11]] +; IC2-NEXT: [[TMP14:%.*]] = or <4 x i1> [[TMP10]], [[TMP12]] +; IC2-NEXT: [[TMP15:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], <i64 15, i64 15, i64 15, i64 15> +; IC2-NEXT: [[TMP16:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], <i64 15, i64 15, i64 15, i64 15> +; IC2-NEXT: [[TMP17:%.*]] = or <4 x i1> [[TMP13]], [[TMP15]] +; IC2-NEXT: [[TMP18:%.*]] = or <4 x i1> [[TMP14]], [[TMP16]] +; IC2-NEXT: [[TMP19:%.*]] = or <4 x i1> [[TMP17]], [[TMP17]] +; IC2-NEXT: [[TMP20:%.*]] = or <4 x i1> [[TMP18]], [[TMP18]] +; IC2-NEXT: [[TMP21:%.*]] = or <4 x i1> [[TMP19]], [[TMP17]] +; IC2-NEXT: [[TMP22:%.*]] = or <4 x i1> [[TMP20]], [[TMP18]] +; IC2-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP7]], i32 1, <4 x i1> [[TMP21]]) +; IC2-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP8]], i32 1, <4 x i1> [[TMP22]]) +; IC2-NEXT: [[TMP23:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], <i64 -12, i64 -12, i64 -12, i64 -12> +; IC2-NEXT: [[TMP24:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], <i64 -12, i64 -12, i64 -12, i64 -12> +; IC2-NEXT: [[TMP25:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], zeroinitializer +; IC2-NEXT: [[TMP26:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], zeroinitializer +; IC2-NEXT: [[TMP27:%.*]] = or <4 x i1> [[TMP23]], [[TMP25]] +; IC2-NEXT: [[TMP28:%.*]] = or <4 x i1> [[TMP24]], [[TMP26]] +; IC2-NEXT: [[TMP29:%.*]] = or <4 x i1> [[TMP27]], [[TMP27]] +; IC2-NEXT: [[TMP30:%.*]] = or <4 x i1> [[TMP28]], [[TMP28]] +; IC2-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 42, i64 42, i64 42, i64 42>, ptr [[TMP7]], i32 1, <4 x i1> [[TMP29]]) +; IC2-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 42, i64 42, i64 42, i64 42>, ptr [[TMP8]], i32 1, <4 x i1> [[TMP30]]) +; IC2-NEXT: [[TMP31:%.*]] = or <4 x i1> [[TMP27]], [[TMP27]] +; IC2-NEXT: [[TMP32:%.*]] = or <4 x i1> [[TMP28]], [[TMP28]] +; IC2-NEXT: 
[[TMP33:%.*]] = or <4 x i1> [[TMP31]], [[TMP17]] +; IC2-NEXT: [[TMP34:%.*]] = or <4 x i1> [[TMP32]], [[TMP18]] +; IC2-NEXT: [[TMP35:%.*]] = or <4 x i1> [[TMP33]], [[TMP17]] +; IC2-NEXT: [[TMP36:%.*]] = or <4 x i1> [[TMP34]], [[TMP18]] +; IC2-NEXT: [[TMP37:%.*]] = or <4 x i1> [[TMP35]], [[TMP17]] +; IC2-NEXT: [[TMP38:%.*]] = or <4 x i1> [[TMP36]], [[TMP18]] +; IC2-NEXT: [[TMP39:%.*]] = xor <4 x i1> [[TMP37]], <i1 true, i1 true, i1 true, i1 true> +; IC2-NEXT: [[TMP40:%.*]] = xor <4 x i1> [[TMP38]], <i1 true, i1 true, i1 true, i1 true> +; IC2-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 2, i64 2, i64 2, i64 2>, ptr [[TMP7]], i32 1, <4 x i1> [[TMP39]]) +; IC2-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 2, i64 2, i64 2, i64 2>, ptr [[TMP8]], i32 1, <4 x i1> [[TMP40]]) +; IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; IC2-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IC2-NEXT: br i1 [[TMP41]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; IC2: [[MIDDLE_BLOCK]]: +; IC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; IC2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IC2: [[SCALAR_PH]]: +; IC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] ; IC2-NEXT: br label %[[LOOP_HEADER:.*]] ; IC2: [[LOOP_HEADER]]: -; IC2-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; IC2-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; IC2-NEXT: [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1 ; IC2-NEXT: switch i64 [[L]], label %[[DEFAULT:.*]] [ ; IC2-NEXT: i64 -12, label %[[IF_THEN_1:.*]] @@ -227,7 +515,7 @@ define void @switch_multiple_common_dests(ptr %start, ptr %end) { ; IC2: [[LOOP_LATCH]]: ; IC2-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1 ; IC2-NEXT: [[EC:%.*]] = icmp eq 
ptr [[PTR_IV_NEXT]], [[END]] -; IC2-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]] +; IC2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP7:![0-9]+]] ; IC2: [[EXIT]]: ; IC2-NEXT: ret void ; @@ -270,9 +558,46 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) { ; IC1-LABEL: define void @switch4_default_common_dest_with_case( ; IC1-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0]] { ; IC1-NEXT: [[ENTRY:.*]]: +; IC1-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; IC1-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; IC1-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -8 +; IC1-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] +; IC1-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3 +; IC1-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 +; IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4 +; IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IC1: [[VECTOR_PH]]: +; IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 4 +; IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] +; IC1-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 8 +; IC1-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]] +; IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; IC1: [[VECTOR_BODY]]: +; IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC1-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 +; IC1-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 +; IC1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP5]] +; IC1-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0 +; IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP6]], align 1 +; IC1-NEXT: [[TMP7:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], <i64 13, i64 13, i64 13, i64 13> +; IC1-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP6]], i32 1, <4 x i1> [[TMP7]]) +; IC1-NEXT: [[TMP8:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], 
<i64 -12, i64 -12, i64 -12, i64 -12> +; IC1-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 42, i64 42, i64 42, i64 42>, ptr [[TMP6]], i32 1, <4 x i1> [[TMP8]]) +; IC1-NEXT: [[TMP9:%.*]] = or <4 x i1> [[TMP8]], [[TMP7]] +; IC1-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP9]], <i1 true, i1 true, i1 true, i1 true> +; IC1-NEXT: [[TMP11:%.*]] = or <4 x i1> [[TMP10]], [[TMP10]] +; IC1-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 2, i64 2, i64 2, i64 2>, ptr [[TMP6]], i32 1, <4 x i1> [[TMP11]]) +; IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; IC1-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IC1-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; IC1: [[MIDDLE_BLOCK]]: +; IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IC1: [[SCALAR_PH]]: +; IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] ; IC1-NEXT: br label %[[LOOP_HEADER:.*]] ; IC1: [[LOOP_HEADER]]: -; IC1-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; IC1-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; IC1-NEXT: [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1 ; IC1-NEXT: switch i64 [[L]], label %[[DEFAULT:.*]] [ ; IC1-NEXT: i64 -12, label %[[IF_THEN_1:.*]] @@ -291,16 +616,65 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) { ; IC1: [[LOOP_LATCH]]: ; IC1-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1 ; IC1-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]] -; IC1-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]] +; IC1-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP9:![0-9]+]] ; IC1: [[EXIT]]: ; IC1-NEXT: ret void ; ; IC2-LABEL: define void 
@switch4_default_common_dest_with_case( ; IC2-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0]] { ; IC2-NEXT: [[ENTRY:.*]]: +; IC2-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; IC2-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; IC2-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -8 +; IC2-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] +; IC2-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3 +; IC2-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 +; IC2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 8 +; IC2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IC2: [[VECTOR_PH]]: +; IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 8 +; IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] +; IC2-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 8 +; IC2-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]] +; IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; IC2: [[VECTOR_BODY]]: +; IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC2-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 +; IC2-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 +; IC2-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 32 +; IC2-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP5]] +; IC2-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP6]] +; IC2-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0 +; IC2-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 4 +; IC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 1 +; IC2-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP8]], align 1 +; IC2-NEXT: [[TMP9:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], <i64 13, i64 13, i64 13, i64 13> +; IC2-NEXT: [[TMP10:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], <i64 13, i64 13, i64 13, i64 13> +; IC2-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP7]], i32 1, <4 x i1> [[TMP9]]) +; IC2-NEXT: call void 
@llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP8]], i32 1, <4 x i1> [[TMP10]]) +; IC2-NEXT: [[TMP11:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], <i64 -12, i64 -12, i64 -12, i64 -12> +; IC2-NEXT: [[TMP12:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], <i64 -12, i64 -12, i64 -12, i64 -12> +; IC2-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 42, i64 42, i64 42, i64 42>, ptr [[TMP7]], i32 1, <4 x i1> [[TMP11]]) +; IC2-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 42, i64 42, i64 42, i64 42>, ptr [[TMP8]], i32 1, <4 x i1> [[TMP12]]) +; IC2-NEXT: [[TMP13:%.*]] = or <4 x i1> [[TMP11]], [[TMP9]] +; IC2-NEXT: [[TMP14:%.*]] = or <4 x i1> [[TMP12]], [[TMP10]] +; IC2-NEXT: [[TMP15:%.*]] = xor <4 x i1> [[TMP13]], <i1 true, i1 true, i1 true, i1 true> +; IC2-NEXT: [[TMP16:%.*]] = xor <4 x i1> [[TMP14]], <i1 true, i1 true, i1 true, i1 true> +; IC2-NEXT: [[TMP17:%.*]] = or <4 x i1> [[TMP15]], [[TMP15]] +; IC2-NEXT: [[TMP18:%.*]] = or <4 x i1> [[TMP16]], [[TMP16]] +; IC2-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 2, i64 2, i64 2, i64 2>, ptr [[TMP7]], i32 1, <4 x i1> [[TMP17]]) +; IC2-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 2, i64 2, i64 2, i64 2>, ptr [[TMP8]], i32 1, <4 x i1> [[TMP18]]) +; IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; IC2-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IC2-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; IC2: [[MIDDLE_BLOCK]]: +; IC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; IC2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IC2: [[SCALAR_PH]]: +; IC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] ; IC2-NEXT: br label %[[LOOP_HEADER:.*]] ; IC2: [[LOOP_HEADER]]: -; IC2-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; IC2-NEXT: [[PTR_IV:%.*]] = phi ptr [ 
[[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; IC2-NEXT: [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1 ; IC2-NEXT: switch i64 [[L]], label %[[DEFAULT:.*]] [ ; IC2-NEXT: i64 -12, label %[[IF_THEN_1:.*]] @@ -319,7 +693,7 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) { ; IC2: [[LOOP_LATCH]]: ; IC2-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1 ; IC2-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]] -; IC2-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]] +; IC2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP9:![0-9]+]] ; IC2: [[EXIT]]: ; IC2-NEXT: ret void ; @@ -360,9 +734,46 @@ define void @switch_under_br_default_common_dest_with_case(ptr %start, ptr %end, ; IC1-LABEL: define void @switch_under_br_default_common_dest_with_case( ; IC1-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i64 [[X:%.*]]) #[[ATTR0]] { ; IC1-NEXT: [[ENTRY:.*]]: +; IC1-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; IC1-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; IC1-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -8 +; IC1-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] +; IC1-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3 +; IC1-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 +; IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4 +; IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IC1: [[VECTOR_PH]]: +; IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 4 +; IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] +; IC1-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 8 +; IC1-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]] +; IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; IC1: [[VECTOR_BODY]]: +; IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC1-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 +; IC1-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 
0 +; IC1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP5]] +; IC1-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0 +; IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP6]], align 1 +; IC1-NEXT: [[TMP7:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], <i64 13, i64 13, i64 13, i64 13> +; IC1-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP6]], i32 1, <4 x i1> [[TMP7]]) +; IC1-NEXT: [[TMP8:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], <i64 -12, i64 -12, i64 -12, i64 -12> +; IC1-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 42, i64 42, i64 42, i64 42>, ptr [[TMP6]], i32 1, <4 x i1> [[TMP8]]) +; IC1-NEXT: [[TMP9:%.*]] = or <4 x i1> [[TMP8]], [[TMP7]] +; IC1-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP9]], <i1 true, i1 true, i1 true, i1 true> +; IC1-NEXT: [[TMP11:%.*]] = or <4 x i1> [[TMP10]], [[TMP10]] +; IC1-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 2, i64 2, i64 2, i64 2>, ptr [[TMP6]], i32 1, <4 x i1> [[TMP11]]) +; IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; IC1-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IC1-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; IC1: [[MIDDLE_BLOCK]]: +; IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IC1: [[SCALAR_PH]]: +; IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] ; IC1-NEXT: br label %[[LOOP_HEADER:.*]] ; IC1: [[LOOP_HEADER]]: -; IC1-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; IC1-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; IC1-NEXT: [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1 ; IC1-NEXT: [[C:%.*]] = icmp ule i64 [[L]], [[X]] ; IC1-NEXT: br i1 [[C]], label %[[THEN:.*]], label 
%[[LOOP_LATCH]] @@ -384,16 +795,65 @@ define void @switch_under_br_default_common_dest_with_case(ptr %start, ptr %end, ; IC1: [[LOOP_LATCH]]: ; IC1-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1 ; IC1-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]] -; IC1-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]] +; IC1-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP11:![0-9]+]] ; IC1: [[EXIT]]: ; IC1-NEXT: ret void ; ; IC2-LABEL: define void @switch_under_br_default_common_dest_with_case( ; IC2-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i64 [[X:%.*]]) #[[ATTR0]] { ; IC2-NEXT: [[ENTRY:.*]]: +; IC2-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; IC2-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; IC2-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -8 +; IC2-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] +; IC2-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3 +; IC2-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 +; IC2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 8 +; IC2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IC2: [[VECTOR_PH]]: +; IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 8 +; IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] +; IC2-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 8 +; IC2-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]] +; IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; IC2: [[VECTOR_BODY]]: +; IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC2-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 +; IC2-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 +; IC2-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 32 +; IC2-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP5]] +; IC2-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP6]] +; IC2-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0 +; IC2-NEXT: [[TMP8:%.*]] = 
getelementptr i64, ptr [[NEXT_GEP]], i32 4 +; IC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 1 +; IC2-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP8]], align 1 +; IC2-NEXT: [[TMP9:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], <i64 13, i64 13, i64 13, i64 13> +; IC2-NEXT: [[TMP10:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], <i64 13, i64 13, i64 13, i64 13> +; IC2-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP7]], i32 1, <4 x i1> [[TMP9]]) +; IC2-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP8]], i32 1, <4 x i1> [[TMP10]]) +; IC2-NEXT: [[TMP11:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], <i64 -12, i64 -12, i64 -12, i64 -12> +; IC2-NEXT: [[TMP12:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], <i64 -12, i64 -12, i64 -12, i64 -12> +; IC2-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 42, i64 42, i64 42, i64 42>, ptr [[TMP7]], i32 1, <4 x i1> [[TMP11]]) +; IC2-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 42, i64 42, i64 42, i64 42>, ptr [[TMP8]], i32 1, <4 x i1> [[TMP12]]) +; IC2-NEXT: [[TMP13:%.*]] = or <4 x i1> [[TMP11]], [[TMP9]] +; IC2-NEXT: [[TMP14:%.*]] = or <4 x i1> [[TMP12]], [[TMP10]] +; IC2-NEXT: [[TMP15:%.*]] = xor <4 x i1> [[TMP13]], <i1 true, i1 true, i1 true, i1 true> +; IC2-NEXT: [[TMP16:%.*]] = xor <4 x i1> [[TMP14]], <i1 true, i1 true, i1 true, i1 true> +; IC2-NEXT: [[TMP17:%.*]] = or <4 x i1> [[TMP15]], [[TMP15]] +; IC2-NEXT: [[TMP18:%.*]] = or <4 x i1> [[TMP16]], [[TMP16]] +; IC2-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 2, i64 2, i64 2, i64 2>, ptr [[TMP7]], i32 1, <4 x i1> [[TMP17]]) +; IC2-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 2, i64 2, i64 2, i64 2>, ptr [[TMP8]], i32 1, <4 x i1> [[TMP18]]) +; IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; IC2-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IC2-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop 
[[LOOP10:![0-9]+]] +; IC2: [[MIDDLE_BLOCK]]: +; IC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; IC2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IC2: [[SCALAR_PH]]: +; IC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] ; IC2-NEXT: br label %[[LOOP_HEADER:.*]] ; IC2: [[LOOP_HEADER]]: -; IC2-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; IC2-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; IC2-NEXT: [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1 ; IC2-NEXT: [[C:%.*]] = icmp ule i64 [[L]], [[X]] ; IC2-NEXT: br i1 [[C]], label %[[THEN:.*]], label %[[LOOP_LATCH]] @@ -415,7 +875,7 @@ define void @switch_under_br_default_common_dest_with_case(ptr %start, ptr %end, ; IC2: [[LOOP_LATCH]]: ; IC2-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1 ; IC2-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]] -; IC2-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]] +; IC2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP11:![0-9]+]] ; IC2: [[EXIT]]: ; IC2-NEXT: ret void ; @@ -460,9 +920,54 @@ define void @br_under_switch_default_common_dest_with_case(ptr %start, ptr %end, ; IC1-LABEL: define void @br_under_switch_default_common_dest_with_case( ; IC1-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i64 [[X:%.*]]) #[[ATTR0]] { ; IC1-NEXT: [[ENTRY:.*]]: +; IC1-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; IC1-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; IC1-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -8 +; IC1-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] +; IC1-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3 +; IC1-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 +; IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4 +; IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label 
%[[VECTOR_PH:.*]] +; IC1: [[VECTOR_PH]]: +; IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 4 +; IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] +; IC1-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 8 +; IC1-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]] +; IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[X]], i64 0 +; IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; IC1: [[VECTOR_BODY]]: +; IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC1-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 +; IC1-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 +; IC1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP5]] +; IC1-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0 +; IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP6]], align 1 +; IC1-NEXT: [[TMP7:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], <i64 -12, i64 -12, i64 -12, i64 -12> +; IC1-NEXT: [[TMP8:%.*]] = icmp ule <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; IC1-NEXT: [[TMP9:%.*]] = xor <4 x i1> [[TMP8]], <i1 true, i1 true, i1 true, i1 true> +; IC1-NEXT: [[TMP10:%.*]] = select <4 x i1> [[TMP7]], <4 x i1> [[TMP9]], <4 x i1> zeroinitializer +; IC1-NEXT: [[TMP11:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], <i64 13, i64 13, i64 13, i64 13> +; IC1-NEXT: [[TMP12:%.*]] = or <4 x i1> [[TMP10]], [[TMP11]] +; IC1-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP6]], i32 1, <4 x i1> [[TMP12]]) +; IC1-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP7]], <4 x i1> [[TMP8]], <4 x i1> zeroinitializer +; IC1-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 42, i64 42, i64 42, i64 42>, ptr [[TMP6]], i32 1, <4 x i1> [[TMP13]]) +; IC1-NEXT: [[TMP14:%.*]] = or <4 x i1> [[TMP7]], [[TMP11]] +; IC1-NEXT: [[TMP15:%.*]] = xor <4 x i1> [[TMP14]], 
<i1 true, i1 true, i1 true, i1 true> +; IC1-NEXT: [[TMP16:%.*]] = or <4 x i1> [[TMP13]], [[TMP15]] +; IC1-NEXT: [[TMP17:%.*]] = or <4 x i1> [[TMP16]], [[TMP15]] +; IC1-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 2, i64 2, i64 2, i64 2>, ptr [[TMP6]], i32 1, <4 x i1> [[TMP17]]) +; IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; IC1-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IC1-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; IC1: [[MIDDLE_BLOCK]]: +; IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IC1: [[SCALAR_PH]]: +; IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] ; IC1-NEXT: br label %[[LOOP_HEADER:.*]] ; IC1: [[LOOP_HEADER]]: -; IC1-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; IC1-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; IC1-NEXT: [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1 ; IC1-NEXT: switch i64 [[L]], label %[[DEFAULT:.*]] [ ; IC1-NEXT: i64 -12, label %[[IF_THEN_1:.*]] @@ -484,16 +989,79 @@ define void @br_under_switch_default_common_dest_with_case(ptr %start, ptr %end, ; IC1: [[LOOP_LATCH]]: ; IC1-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1 ; IC1-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]] -; IC1-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]] +; IC1-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP13:![0-9]+]] ; IC1: [[EXIT]]: ; IC1-NEXT: ret void ; ; IC2-LABEL: define void @br_under_switch_default_common_dest_with_case( ; IC2-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i64 [[X:%.*]]) #[[ATTR0]] { ; IC2-NEXT: [[ENTRY:.*]]: +; IC2-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; IC2-NEXT: 
[[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; IC2-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -8 +; IC2-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] +; IC2-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3 +; IC2-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 +; IC2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 8 +; IC2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IC2: [[VECTOR_PH]]: +; IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 8 +; IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] +; IC2-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 8 +; IC2-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]] +; IC2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[X]], i64 0 +; IC2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; IC2: [[VECTOR_BODY]]: +; IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC2-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 +; IC2-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 +; IC2-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 32 +; IC2-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP5]] +; IC2-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP6]] +; IC2-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0 +; IC2-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 4 +; IC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 1 +; IC2-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP8]], align 1 +; IC2-NEXT: [[TMP9:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], <i64 -12, i64 -12, i64 -12, i64 -12> +; IC2-NEXT: [[TMP10:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], <i64 -12, i64 -12, i64 -12, i64 -12> +; IC2-NEXT: [[TMP11:%.*]] = icmp ule <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; IC2-NEXT: [[TMP12:%.*]] = icmp ule <4 x i64> 
[[WIDE_LOAD4]], [[BROADCAST_SPLAT]] +; IC2-NEXT: [[TMP13:%.*]] = xor <4 x i1> [[TMP11]], <i1 true, i1 true, i1 true, i1 true> +; IC2-NEXT: [[TMP14:%.*]] = xor <4 x i1> [[TMP12]], <i1 true, i1 true, i1 true, i1 true> +; IC2-NEXT: [[TMP15:%.*]] = select <4 x i1> [[TMP9]], <4 x i1> [[TMP13]], <4 x i1> zeroinitializer +; IC2-NEXT: [[TMP16:%.*]] = select <4 x i1> [[TMP10]], <4 x i1> [[TMP14]], <4 x i1> zeroinitializer +; IC2-NEXT: [[TMP17:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], <i64 13, i64 13, i64 13, i64 13> +; IC2-NEXT: [[TMP18:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], <i64 13, i64 13, i64 13, i64 13> +; IC2-NEXT: [[TMP19:%.*]] = or <4 x i1> [[TMP15]], [[TMP17]] +; IC2-NEXT: [[TMP20:%.*]] = or <4 x i1> [[TMP16]], [[TMP18]] +; IC2-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP7]], i32 1, <4 x i1> [[TMP19]]) +; IC2-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP8]], i32 1, <4 x i1> [[TMP20]]) +; IC2-NEXT: [[TMP21:%.*]] = select <4 x i1> [[TMP9]], <4 x i1> [[TMP11]], <4 x i1> zeroinitializer +; IC2-NEXT: [[TMP22:%.*]] = select <4 x i1> [[TMP10]], <4 x i1> [[TMP12]], <4 x i1> zeroinitializer +; IC2-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 42, i64 42, i64 42, i64 42>, ptr [[TMP7]], i32 1, <4 x i1> [[TMP21]]) +; IC2-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 42, i64 42, i64 42, i64 42>, ptr [[TMP8]], i32 1, <4 x i1> [[TMP22]]) +; IC2-NEXT: [[TMP23:%.*]] = or <4 x i1> [[TMP9]], [[TMP17]] +; IC2-NEXT: [[TMP24:%.*]] = or <4 x i1> [[TMP10]], [[TMP18]] +; IC2-NEXT: [[TMP25:%.*]] = xor <4 x i1> [[TMP23]], <i1 true, i1 true, i1 true, i1 true> +; IC2-NEXT: [[TMP26:%.*]] = xor <4 x i1> [[TMP24]], <i1 true, i1 true, i1 true, i1 true> +; IC2-NEXT: [[TMP27:%.*]] = or <4 x i1> [[TMP21]], [[TMP25]] +; IC2-NEXT: [[TMP28:%.*]] = or <4 x i1> [[TMP22]], [[TMP26]] +; IC2-NEXT: [[TMP29:%.*]] = or <4 x i1> [[TMP27]], [[TMP25]] +; IC2-NEXT: [[TMP30:%.*]] = or <4 x i1> [[TMP28]], [[TMP26]] +; 
IC2-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 2, i64 2, i64 2, i64 2>, ptr [[TMP7]], i32 1, <4 x i1> [[TMP29]]) +; IC2-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 2, i64 2, i64 2, i64 2>, ptr [[TMP8]], i32 1, <4 x i1> [[TMP30]]) +; IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; IC2-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IC2-NEXT: br i1 [[TMP31]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; IC2: [[MIDDLE_BLOCK]]: +; IC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; IC2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IC2: [[SCALAR_PH]]: +; IC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] ; IC2-NEXT: br label %[[LOOP_HEADER:.*]] ; IC2: [[LOOP_HEADER]]: -; IC2-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; IC2-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; IC2-NEXT: [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1 ; IC2-NEXT: switch i64 [[L]], label %[[DEFAULT:.*]] [ ; IC2-NEXT: i64 -12, label %[[IF_THEN_1:.*]] @@ -515,7 +1083,7 @@ define void @br_under_switch_default_common_dest_with_case(ptr %start, ptr %end, ; IC2: [[LOOP_LATCH]]: ; IC2-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1 ; IC2-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]] -; IC2-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]] +; IC2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP13:![0-9]+]] ; IC2: [[EXIT]]: ; IC2-NEXT: ret void ; @@ -555,3 +1123,34 @@ loop.latch: exit: ret void } +;. 
+; IC1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; IC1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; IC1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; IC1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; IC1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; IC1: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; IC1: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; IC1: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; IC1: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; IC1: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; IC1: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; IC1: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} +; IC1: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} +; IC1: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} +;. +; IC2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; IC2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; IC2: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; IC2: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; IC2: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; IC2: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; IC2: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; IC2: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; IC2: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; IC2: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; IC2: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; IC2: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} +; IC2: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} +; IC2: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} +;. 
diff --git a/llvm/test/Transforms/LoopVectorize/no_switch.ll b/llvm/test/Transforms/LoopVectorize/no_switch.ll index c62826f9554e6..118a15e63fe99 100644 --- a/llvm/test/Transforms/LoopVectorize/no_switch.ll +++ b/llvm/test/Transforms/LoopVectorize/no_switch.ll @@ -2,18 +2,16 @@ ; RUN: opt < %s -passes=loop-vectorize,transform-warning -force-vector-width=1 -S 2>&1 | FileCheck %s -check-prefix=NOANALYSIS ; RUN: opt < %s -passes=loop-vectorize,transform-warning -force-vector-width=4 -pass-remarks-missed='loop-vectorize' -S 2>&1 | FileCheck %s -check-prefix=MOREINFO -; CHECK: remark: source.cpp:4:5: loop not vectorized: loop contains a switch statement -; CHECK: warning: source.cpp:4:5: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering +; CHECK-NOT: loop not vectorized: loop contains a switch statement +; CHECK-NOT: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering ; NOANALYSIS-NOT: remark: {{.*}} -; NOANALYSIS: warning: source.cpp:4:5: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering +; NOANALYSIS: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering -; MOREINFO: remark: source.cpp:4:5: loop not vectorized: loop contains a switch statement -; MOREINFO: remark: source.cpp:4:5: loop not vectorized (Force=true, Vector Width=4) -; MOREINFO: warning: source.cpp:4:5: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported 
transformation ordering +; MOREINFO-NOT: remark ; CHECK: _Z11test_switchPii -; CHECK-NOT: x i32> +; CHECK: vector.body: ; CHECK: ret target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Transforms/LoopVectorize/predicate-switch.ll b/llvm/test/Transforms/LoopVectorize/predicate-switch.ll index dba53c8717f59..1963a4d286360 100644 --- a/llvm/test/Transforms/LoopVectorize/predicate-switch.ll +++ b/llvm/test/Transforms/LoopVectorize/predicate-switch.ll @@ -6,9 +6,76 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) { ; IC1-LABEL: define void @switch4_default_common_dest_with_case( ; IC1-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) { ; IC1-NEXT: [[ENTRY:.*]]: +; IC1-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; IC1-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; IC1-NEXT: [[TMP0:%.*]] = sub i64 [[END1]], [[START2]] +; IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2 +; IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IC1: [[VECTOR_PH]]: +; IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 2 +; IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; IC1-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]] +; IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; IC1: [[VECTOR_BODY]]: +; IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE13:.*]] ] +; IC1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 +; IC1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 1 +; IC1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP1]] +; IC1-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP2]] +; IC1-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0 +; IC1-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP3]], align 1 +; IC1-NEXT: [[TMP4:%.*]] = icmp eq <2 x i8> [[WIDE_LOAD]], <i8 13, i8 13> +; IC1-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; IC1-NEXT: br i1 
[[TMP5]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; IC1: [[PRED_STORE_IF]]: +; IC1-NEXT: store i8 0, ptr [[NEXT_GEP]], align 1 +; IC1-NEXT: br label %[[PRED_STORE_CONTINUE]] +; IC1: [[PRED_STORE_CONTINUE]]: +; IC1-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; IC1-NEXT: br i1 [[TMP6]], label %[[PRED_STORE_IF4:.*]], label %[[PRED_STORE_CONTINUE5:.*]] +; IC1: [[PRED_STORE_IF4]]: +; IC1-NEXT: store i8 0, ptr [[NEXT_GEP3]], align 1 +; IC1-NEXT: br label %[[PRED_STORE_CONTINUE5]] +; IC1: [[PRED_STORE_CONTINUE5]]: +; IC1-NEXT: [[TMP7:%.*]] = icmp eq <2 x i8> [[WIDE_LOAD]], <i8 -12, i8 -12> +; IC1-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; IC1-NEXT: br i1 [[TMP8]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7:.*]] +; IC1: [[PRED_STORE_IF6]]: +; IC1-NEXT: store i8 42, ptr [[NEXT_GEP]], align 1 +; IC1-NEXT: br label %[[PRED_STORE_CONTINUE7]] +; IC1: [[PRED_STORE_CONTINUE7]]: +; IC1-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; IC1-NEXT: br i1 [[TMP9]], label %[[PRED_STORE_IF8:.*]], label %[[PRED_STORE_CONTINUE9:.*]] +; IC1: [[PRED_STORE_IF8]]: +; IC1-NEXT: store i8 42, ptr [[NEXT_GEP3]], align 1 +; IC1-NEXT: br label %[[PRED_STORE_CONTINUE9]] +; IC1: [[PRED_STORE_CONTINUE9]]: +; IC1-NEXT: [[TMP10:%.*]] = or <2 x i1> [[TMP7]], [[TMP4]] +; IC1-NEXT: [[TMP11:%.*]] = xor <2 x i1> [[TMP10]], <i1 true, i1 true> +; IC1-NEXT: [[TMP12:%.*]] = or <2 x i1> [[TMP11]], [[TMP11]] +; IC1-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP12]], i32 0 +; IC1-NEXT: br i1 [[TMP13]], label %[[PRED_STORE_IF10:.*]], label %[[PRED_STORE_CONTINUE11:.*]] +; IC1: [[PRED_STORE_IF10]]: +; IC1-NEXT: store i8 2, ptr [[NEXT_GEP]], align 1 +; IC1-NEXT: br label %[[PRED_STORE_CONTINUE11]] +; IC1: [[PRED_STORE_CONTINUE11]]: +; IC1-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP12]], i32 1 +; IC1-NEXT: br i1 [[TMP14]], label %[[PRED_STORE_IF12:.*]], label %[[PRED_STORE_CONTINUE13]] +; IC1: [[PRED_STORE_IF12]]: 
+; IC1-NEXT: store i8 2, ptr [[NEXT_GEP3]], align 1 +; IC1-NEXT: br label %[[PRED_STORE_CONTINUE13]] +; IC1: [[PRED_STORE_CONTINUE13]]: +; IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; IC1-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IC1-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IC1: [[MIDDLE_BLOCK]]: +; IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IC1: [[SCALAR_PH]]: +; IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] ; IC1-NEXT: br label %[[LOOP_HEADER:.*]] ; IC1: [[LOOP_HEADER]]: -; IC1-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; IC1-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; IC1-NEXT: [[L:%.*]] = load i8, ptr [[PTR_IV]], align 1 ; IC1-NEXT: switch i8 [[L]], label %[[DEFAULT:.*]] [ ; IC1-NEXT: i8 -12, label %[[IF_THEN_1:.*]] @@ -27,16 +94,130 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) { ; IC1: [[LOOP_LATCH]]: ; IC1-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i8, ptr [[PTR_IV]], i64 1 ; IC1-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]] -; IC1-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]] +; IC1-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] ; IC1: [[EXIT]]: ; IC1-NEXT: ret void ; ; IC2-LABEL: define void @switch4_default_common_dest_with_case( ; IC2-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) { ; IC2-NEXT: [[ENTRY:.*]]: +; IC2-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; IC2-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; IC2-NEXT: [[TMP0:%.*]] = sub i64 [[END1]], [[START2]] +; IC2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 +; IC2-NEXT: br i1 [[MIN_ITERS_CHECK]], label 
%[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IC2: [[VECTOR_PH]]: +; IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4 +; IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; IC2-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]] +; IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; IC2: [[VECTOR_BODY]]: +; IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE28:.*]] ] +; IC2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 +; IC2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 1 +; IC2-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 2 +; IC2-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 3 +; IC2-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP1]] +; IC2-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP2]] +; IC2-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP3]] +; IC2-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]] +; IC2-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0 +; IC2-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 2 +; IC2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP5]], align 1 +; IC2-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x i8>, ptr [[TMP6]], align 1 +; IC2-NEXT: [[TMP7:%.*]] = icmp eq <2 x i8> [[WIDE_LOAD]], <i8 13, i8 13> +; IC2-NEXT: [[TMP8:%.*]] = icmp eq <2 x i8> [[WIDE_LOAD6]], <i8 13, i8 13> +; IC2-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; IC2-NEXT: br i1 [[TMP9]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; IC2: [[PRED_STORE_IF]]: +; IC2-NEXT: store i8 0, ptr [[NEXT_GEP]], align 1 +; IC2-NEXT: br label %[[PRED_STORE_CONTINUE]] +; IC2: [[PRED_STORE_CONTINUE]]: +; IC2-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; IC2-NEXT: br i1 [[TMP10]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] +; IC2: [[PRED_STORE_IF7]]: +; IC2-NEXT: store i8 0, ptr [[NEXT_GEP3]], align 1 +; IC2-NEXT: br label %[[PRED_STORE_CONTINUE8]] +; IC2: 
[[PRED_STORE_CONTINUE8]]: +; IC2-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; IC2-NEXT: br i1 [[TMP11]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] +; IC2: [[PRED_STORE_IF9]]: +; IC2-NEXT: store i8 0, ptr [[NEXT_GEP4]], align 1 +; IC2-NEXT: br label %[[PRED_STORE_CONTINUE10]] +; IC2: [[PRED_STORE_CONTINUE10]]: +; IC2-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; IC2-NEXT: br i1 [[TMP12]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]] +; IC2: [[PRED_STORE_IF11]]: +; IC2-NEXT: store i8 0, ptr [[NEXT_GEP5]], align 1 +; IC2-NEXT: br label %[[PRED_STORE_CONTINUE12]] +; IC2: [[PRED_STORE_CONTINUE12]]: +; IC2-NEXT: [[TMP13:%.*]] = icmp eq <2 x i8> [[WIDE_LOAD]], <i8 -12, i8 -12> +; IC2-NEXT: [[TMP14:%.*]] = icmp eq <2 x i8> [[WIDE_LOAD6]], <i8 -12, i8 -12> +; IC2-NEXT: [[TMP15:%.*]] = extractelement <2 x i1> [[TMP13]], i32 0 +; IC2-NEXT: br i1 [[TMP15]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]] +; IC2: [[PRED_STORE_IF13]]: +; IC2-NEXT: store i8 42, ptr [[NEXT_GEP]], align 1 +; IC2-NEXT: br label %[[PRED_STORE_CONTINUE14]] +; IC2: [[PRED_STORE_CONTINUE14]]: +; IC2-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP13]], i32 1 +; IC2-NEXT: br i1 [[TMP16]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]] +; IC2: [[PRED_STORE_IF15]]: +; IC2-NEXT: store i8 42, ptr [[NEXT_GEP3]], align 1 +; IC2-NEXT: br label %[[PRED_STORE_CONTINUE16]] +; IC2: [[PRED_STORE_CONTINUE16]]: +; IC2-NEXT: [[TMP17:%.*]] = extractelement <2 x i1> [[TMP14]], i32 0 +; IC2-NEXT: br i1 [[TMP17]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]] +; IC2: [[PRED_STORE_IF17]]: +; IC2-NEXT: store i8 42, ptr [[NEXT_GEP4]], align 1 +; IC2-NEXT: br label %[[PRED_STORE_CONTINUE18]] +; IC2: [[PRED_STORE_CONTINUE18]]: +; IC2-NEXT: [[TMP18:%.*]] = extractelement <2 x i1> [[TMP14]], i32 1 +; IC2-NEXT: br i1 [[TMP18]], label %[[PRED_STORE_IF19:.*]], label 
%[[PRED_STORE_CONTINUE20:.*]] +; IC2: [[PRED_STORE_IF19]]: +; IC2-NEXT: store i8 42, ptr [[NEXT_GEP5]], align 1 +; IC2-NEXT: br label %[[PRED_STORE_CONTINUE20]] +; IC2: [[PRED_STORE_CONTINUE20]]: +; IC2-NEXT: [[TMP19:%.*]] = or <2 x i1> [[TMP13]], [[TMP7]] +; IC2-NEXT: [[TMP20:%.*]] = or <2 x i1> [[TMP14]], [[TMP8]] +; IC2-NEXT: [[TMP21:%.*]] = xor <2 x i1> [[TMP19]], <i1 true, i1 true> +; IC2-NEXT: [[TMP22:%.*]] = xor <2 x i1> [[TMP20]], <i1 true, i1 true> +; IC2-NEXT: [[TMP23:%.*]] = or <2 x i1> [[TMP21]], [[TMP21]] +; IC2-NEXT: [[TMP24:%.*]] = or <2 x i1> [[TMP22]], [[TMP22]] +; IC2-NEXT: [[TMP25:%.*]] = extractelement <2 x i1> [[TMP23]], i32 0 +; IC2-NEXT: br i1 [[TMP25]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]] +; IC2: [[PRED_STORE_IF21]]: +; IC2-NEXT: store i8 2, ptr [[NEXT_GEP]], align 1 +; IC2-NEXT: br label %[[PRED_STORE_CONTINUE22]] +; IC2: [[PRED_STORE_CONTINUE22]]: +; IC2-NEXT: [[TMP26:%.*]] = extractelement <2 x i1> [[TMP23]], i32 1 +; IC2-NEXT: br i1 [[TMP26]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]] +; IC2: [[PRED_STORE_IF23]]: +; IC2-NEXT: store i8 2, ptr [[NEXT_GEP3]], align 1 +; IC2-NEXT: br label %[[PRED_STORE_CONTINUE24]] +; IC2: [[PRED_STORE_CONTINUE24]]: +; IC2-NEXT: [[TMP27:%.*]] = extractelement <2 x i1> [[TMP24]], i32 0 +; IC2-NEXT: br i1 [[TMP27]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]] +; IC2: [[PRED_STORE_IF25]]: +; IC2-NEXT: store i8 2, ptr [[NEXT_GEP4]], align 1 +; IC2-NEXT: br label %[[PRED_STORE_CONTINUE26]] +; IC2: [[PRED_STORE_CONTINUE26]]: +; IC2-NEXT: [[TMP28:%.*]] = extractelement <2 x i1> [[TMP24]], i32 1 +; IC2-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28]] +; IC2: [[PRED_STORE_IF27]]: +; IC2-NEXT: store i8 2, ptr [[NEXT_GEP5]], align 1 +; IC2-NEXT: br label %[[PRED_STORE_CONTINUE28]] +; IC2: [[PRED_STORE_CONTINUE28]]: +; IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; IC2-NEXT: [[TMP29:%.*]] = icmp 
eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IC2-NEXT: br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IC2: [[MIDDLE_BLOCK]]: +; IC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; IC2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IC2: [[SCALAR_PH]]: +; IC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] ; IC2-NEXT: br label %[[LOOP_HEADER:.*]] ; IC2: [[LOOP_HEADER]]: -; IC2-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; IC2-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; IC2-NEXT: [[L:%.*]] = load i8, ptr [[PTR_IV]], align 1 ; IC2-NEXT: switch i8 [[L]], label %[[DEFAULT:.*]] [ ; IC2-NEXT: i8 -12, label %[[IF_THEN_1:.*]] @@ -55,7 +236,7 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) { ; IC2: [[LOOP_LATCH]]: ; IC2-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i8, ptr [[PTR_IV]], i64 1 ; IC2-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]] -; IC2-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]] +; IC2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] ; IC2: [[EXIT]]: ; IC2-NEXT: ret void ; @@ -91,3 +272,14 @@ loop.latch: exit: ret void } +;. +; IC1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; IC1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; IC1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; IC1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +;. +; IC2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; IC2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; IC2: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; IC2: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +;. 
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll index 40c42ffdfd107..26b8ee9896c05 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll @@ -11,13 +11,63 @@ define dso_local void @test(ptr %start, ptr %end) #0 { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[I11_NOT1:%.*]] = icmp eq ptr [[START:%.*]], [[END:%.*]] -; CHECK-NEXT: br i1 [[I11_NOT1]], label [[EXIT:%.*]], label [[BB12:%.*]] +; CHECK-NEXT: br i1 [[I11_NOT1]], label [[EXIT:%.*]], label [[BB12_PREHEADER:%.*]] +; CHECK: bb12.preheader: +; CHECK-NEXT: [[END3:%.*]] = ptrtoint ptr [[END]] to i64 +; CHECK-NEXT: [[START4:%.*]] = ptrtoint ptr [[START]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[END3]], -4 +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START4]] +; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 124 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[BB12_PREHEADER11:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP3]], 9223372036854775776 +; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[N_VEC]], 2 +; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 2 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 32 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 64 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], 
i64 96 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[NEXT_GEP]], align 4 +; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4 +; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <8 x i32>, ptr [[TMP6]], align 4 +; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <8 x i32>, ptr [[TMP7]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD]], <i32 -12, i32 -12, i32 -12, i32 -12, i32 -12, i32 -12, i32 -12, i32 -12> +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD8]], <i32 -12, i32 -12, i32 -12, i32 -12, i32 -12, i32 -12, i32 -12, i32 -12> +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD9]], <i32 -12, i32 -12, i32 -12, i32 -12, i32 -12, i32 -12, i32 -12, i32 -12> +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD10]], <i32 -12, i32 -12, i32 -12, i32 -12, i32 -12, i32 -12, i32 -12, i32 -12> +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD]], <i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13> +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD8]], <i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13> +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD9]], <i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13> +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD10]], <i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13> +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i1> [[TMP8]], [[TMP12]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i1> [[TMP9]], [[TMP13]] +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i1> [[TMP10]], [[TMP14]] +; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i1> [[TMP11]], [[TMP15]] +; CHECK-NEXT: tail call void @llvm.masked.store.v8i32.p0(<8 x i32> <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>, ptr [[NEXT_GEP]], i32 4, <8 x i1> [[TMP16]]) +; CHECK-NEXT: tail call void @llvm.masked.store.v8i32.p0(<8 x i32> <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>, ptr [[TMP5]], i32 4, <8 x i1> [[TMP17]]) +; CHECK-NEXT: 
tail call void @llvm.masked.store.v8i32.p0(<8 x i32> <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>, ptr [[TMP6]], i32 4, <8 x i1> [[TMP18]]) +; CHECK-NEXT: tail call void @llvm.masked.store.v8i32.p0(<8 x i32> <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>, ptr [[TMP7]], i32 4, <8 x i1> [[TMP19]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT]], label [[BB12_PREHEADER11]] +; CHECK: bb12.preheader11: +; CHECK-NEXT: [[PTR2_PH:%.*]] = phi ptr [ [[START]], [[BB12_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[BB12:%.*]] ; CHECK: bb12: -; CHECK-NEXT: [[PTR2:%.*]] = phi ptr [ [[PTR_NEXT:%.*]], [[LATCH:%.*]] ], [ [[START]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[PTR2:%.*]] = phi ptr [ [[PTR_NEXT:%.*]], [[LATCH:%.*]] ], [ [[PTR2_PH]], [[BB12_PREHEADER11]] ] ; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[PTR2]], align 4 ; CHECK-NEXT: switch i32 [[VAL]], label [[LATCH]] [ -; CHECK-NEXT: i32 -12, label [[STORE:%.*]] -; CHECK-NEXT: i32 13, label [[STORE]] +; CHECK-NEXT: i32 -12, label [[STORE:%.*]] +; CHECK-NEXT: i32 13, label [[STORE]] ; CHECK-NEXT: ] ; CHECK: store: ; CHECK-NEXT: store i32 42, ptr [[PTR2]], align 4 @@ -25,7 +75,7 @@ define dso_local void @test(ptr %start, ptr %end) #0 { ; CHECK: latch: ; CHECK-NEXT: [[PTR_NEXT]] = getelementptr inbounds i8, ptr [[PTR2]], i64 4 ; CHECK-NEXT: [[I11_NOT:%.*]] = icmp eq ptr [[PTR_NEXT]], [[END]] -; CHECK-NEXT: br i1 [[I11_NOT]], label [[EXIT]], label [[BB12]] +; CHECK-NEXT: br i1 [[I11_NOT]], label [[EXIT]], label [[BB12]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; _______________________________________________ cfe-commits 
mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits