Author: Roman Lebedev Date: 2020-10-04T11:53:50+03:00 New Revision: 03bd5198b6f7d9f49d72e6516d813a206f3b6d0d
URL: https://github.com/llvm/llvm-project/commit/03bd5198b6f7d9f49d72e6516d813a206f3b6d0d DIFF: https://github.com/llvm/llvm-project/commit/03bd5198b6f7d9f49d72e6516d813a206f3b6d0d.diff LOG: [OldPM] Pass manager: run SROA after (simple) loop unrolling I have stumbled into this pretty accidentally, when rewriting some spaghetti-like code into something more structured, which involved using some `std::array<>`s. And to my surprise, the `alloca`s remained, causing about `+160%` perf regression. https://llvm-compile-time-tracker.com/compare.php?from=bb6f4d32aac3eecb51909f4facc625219307ee68&to=d563e66f40f9d4d145cb2050e41cb961e2b37785&stat=instructions suggests that this has a geomean compile-time cost of `+0.08%`. Note that D68593 / cecc0d27ad58c0aed8ef9ed99bbf691e137a0f26 already did this change for NewPM, but left OldPM in a pessimized state. This fixes [[ https://bugs.llvm.org/show_bug.cgi?id=40011 | PR40011 ]], [[ https://bugs.llvm.org/show_bug.cgi?id=42794 | PR42794 ]] and probably some other reports. 
Reviewed By: nikic, xbolva00 Differential Revision: https://reviews.llvm.org/D87972 Added: Modified: clang/test/CodeGenCXX/union-tbaa2.cpp clang/test/Misc/loop-opt-setup.c llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp llvm/lib/Transforms/IPO/PassManagerBuilder.cpp llvm/test/Other/opt-O2-pipeline.ll llvm/test/Other/opt-O3-pipeline-enable-matrix.ll llvm/test/Other/opt-O3-pipeline.ll llvm/test/Other/opt-Os-pipeline.ll llvm/test/Transforms/PhaseOrdering/X86/SROA-after-loop-unrolling.ll Removed: ################################################################################ diff --git a/clang/test/CodeGenCXX/union-tbaa2.cpp b/clang/test/CodeGenCXX/union-tbaa2.cpp index 5d13ff1ad8d9..65872d4a98ae 100644 --- a/clang/test/CodeGenCXX/union-tbaa2.cpp +++ b/clang/test/CodeGenCXX/union-tbaa2.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 %s -O2 -fno-experimental-new-pass-manager -std=c++11 -triple x86_64-unknown-linux-gnu -target-cpu x86-64 -target-feature +sse4.2 -target-feature +avx -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 %s -O1 -std=c++11 -triple x86_64-unknown-linux-gnu -target-cpu x86-64 -target-feature +sse4.2 -target-feature +avx -emit-llvm -o - | FileCheck %s // Testcase from llvm.org/PR32056 diff --git a/clang/test/Misc/loop-opt-setup.c b/clang/test/Misc/loop-opt-setup.c index 868c716c6ed7..322f5e0e6d4a 100644 --- a/clang/test/Misc/loop-opt-setup.c +++ b/clang/test/Misc/loop-opt-setup.c @@ -1,5 +1,5 @@ -// RUN: %clang -O1 -fexperimental-new-pass-manager -fno-unroll-loops -S -o - %s -emit-llvm | FileCheck %s -check-prefix=CHECK-NEWPM -// RUN: %clang -O1 -fno-experimental-new-pass-manager -fno-unroll-loops -S -o - %s -emit-llvm | FileCheck %s -check-prefix=CHECK-OLDPM +// RUN: %clang -O1 -fno-unroll-loops -S -o - %s -emit-llvm | FileCheck %s + extern int a[16]; int b = 0; int foo(void) { @@ -9,10 +9,8 @@ int foo(void) { return b; } // Check br i1 to make sure that the loop is fully unrolled -// CHECK-LABEL-NEWPM: foo -// CHECK-NOT-NEWPM: br i1 -// 
CHECK-LABEL-OLDPM: foo -// CHECK-NOT-OLDPM: br i1 +// CHECK-LABEL: foo +// CHECK-NOT: br i1 void Helper() { const int *nodes[5]; @@ -26,17 +24,7 @@ void Helper() { } // Check br i1 to make sure the loop is gone, there will still be a label branch for the infinite loop. -// CHECK-LABEL-NEWPM: Helper -// CHECK-NEWPM: br label -// CHECK-NEWPM-NOT: br i1 -// CHECK-NEWPM: br label - -// The old pass manager doesn't remove the while loop so check for 5 load i32*. -// CHECK-LABEL-OLDPM: Helper -// CHECK-OLDPM: br label -// CHECK-OLDPM: load i32* -// CHECK-OLDPM: load i32* -// CHECK-OLDPM: load i32* -// CHECK-OLDPM: load i32* -// CHECK-OLDPM: load i32* -// CHECK-OLDPM: ret +// CHECK-LABEL: Helper +// CHECK: br label +// CHECK-NOT: br i1 +// CHECK: br label diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index ccc493640b29..043effc97f2b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -479,14 +479,6 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { if (EnableOpt) PM.add(createAMDGPUPromoteAllocaToVector()); }); - - Builder.addExtension( - PassManagerBuilder::EP_LoopOptimizerEnd, - [](const PassManagerBuilder &, legacy::PassManagerBase &PM) { - // Add SROA after loop unrolling as more promotable patterns are - // exposed after small loops are fully unrolled. - PM.add(createSROAPass()); - }); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index c63705a4ee94..088f1e25f3d1 100644 --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -459,6 +459,9 @@ void PassManagerBuilder::addFunctionSimplificationPasses( addExtensionsToPM(EP_LoopOptimizerEnd, MPM); // This ends the loop pass pipelines. 
+ // Break up allocas that may now be splittable after loop unrolling. + MPM.add(createSROAPass()); + if (OptLevel > 1) { MPM.add(createMergedLoadStoreMotionPass()); // Merge ld/st in diamonds MPM.add(NewGVN ? createNewGVNPass() diff --git a/llvm/test/Other/opt-O2-pipeline.ll b/llvm/test/Other/opt-O2-pipeline.ll index 58ed6b2a0820..967477da22bd 100644 --- a/llvm/test/Other/opt-O2-pipeline.ll +++ b/llvm/test/Other/opt-O2-pipeline.ll @@ -1,4 +1,4 @@ -; RUN: opt -enable-new-pm=0 -mtriple=x86_64-- -O2 -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck --check-prefixes=CHECK,%llvmcheckext %s +; RUN: opt -enable-new-pm=0 -mtriple=x86_64-- -O2 -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck --check-prefixes=CHECK,%llvmcheckext %s ; REQUIRES: asserts @@ -22,7 +22,7 @@ ; CHECK-NEXT: Target Library Information ; CHECK-NEXT: Target Transform Information ; Target Pass Configuration -; CHECK: Type-Based Alias Analysis +; CHECK: Type-Based Alias Analysis ; CHECK-NEXT: Scoped NoAlias Alias Analysis ; CHECK-NEXT: Assumption Cache Tracker ; CHECK-NEXT: Profile summary info @@ -134,6 +134,8 @@ ; CHECK-NEXT: Recognize loop idioms ; CHECK-NEXT: Delete dead loops ; CHECK-NEXT: Unroll loops +; CHECK-NEXT: SROA +; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: MergedLoadStoreMotion ; CHECK-NEXT: Phi Values Analysis ; CHECK-NEXT: Function Alias Analysis Results diff --git a/llvm/test/Other/opt-O3-pipeline-enable-matrix.ll b/llvm/test/Other/opt-O3-pipeline-enable-matrix.ll index 493957e865d4..3b8db87e8fb1 100644 --- a/llvm/test/Other/opt-O3-pipeline-enable-matrix.ll +++ b/llvm/test/Other/opt-O3-pipeline-enable-matrix.ll @@ -139,6 +139,8 @@ ; CHECK-NEXT: Recognize loop idioms ; CHECK-NEXT: Delete dead loops ; CHECK-NEXT: Unroll loops +; CHECK-NEXT: SROA +; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: MergedLoadStoreMotion ; CHECK-NEXT: Phi Values Analysis ; CHECK-NEXT: Function Alias Analysis Results diff --git a/llvm/test/Other/opt-O3-pipeline.ll 
b/llvm/test/Other/opt-O3-pipeline.ll index f674dabd5217..a53db61a93cf 100644 --- a/llvm/test/Other/opt-O3-pipeline.ll +++ b/llvm/test/Other/opt-O3-pipeline.ll @@ -139,6 +139,8 @@ ; CHECK-NEXT: Recognize loop idioms ; CHECK-NEXT: Delete dead loops ; CHECK-NEXT: Unroll loops +; CHECK-NEXT: SROA +; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: MergedLoadStoreMotion ; CHECK-NEXT: Phi Values Analysis ; CHECK-NEXT: Function Alias Analysis Results diff --git a/llvm/test/Other/opt-Os-pipeline.ll b/llvm/test/Other/opt-Os-pipeline.ll index 66df666a64c6..93c2d121255b 100644 --- a/llvm/test/Other/opt-Os-pipeline.ll +++ b/llvm/test/Other/opt-Os-pipeline.ll @@ -120,6 +120,8 @@ ; CHECK-NEXT: Recognize loop idioms ; CHECK-NEXT: Delete dead loops ; CHECK-NEXT: Unroll loops +; CHECK-NEXT: SROA +; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: MergedLoadStoreMotion ; CHECK-NEXT: Phi Values Analysis ; CHECK-NEXT: Function Alias Analysis Results diff --git a/llvm/test/Transforms/PhaseOrdering/X86/SROA-after-loop-unrolling.ll b/llvm/test/Transforms/PhaseOrdering/X86/SROA-after-loop-unrolling.ll index 8c8a80cbf7ff..22694901162c 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/SROA-after-loop-unrolling.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/SROA-after-loop-unrolling.ll @@ -22,55 +22,21 @@ target triple = "x86_64-unknown-linux-gnu" %"struct.std::array" = type { [6 x i32] } define dso_local void @_Z3fooi(i32 %cnt) { -; OLDPM-LABEL: @_Z3fooi( -; OLDPM-NEXT: entry: -; OLDPM-NEXT: [[ARR:%.*]] = alloca %"struct.std::array", align 16 -; OLDPM-NEXT: [[TMP0:%.*]] = bitcast %"struct.std::array"* [[ARR]] to i8* -; OLDPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 24, i8* nonnull [[TMP0]]) -; OLDPM-NEXT: [[ARRAYDECAY_I_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* [[ARR]], i64 0, i32 0, i64 0 -; OLDPM-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* [[ARR]], i64 0, i32 0, i64 1 -; 
OLDPM-NEXT: [[INCDEC_PTR_1:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* [[ARR]], i64 0, i32 0, i64 2 -; OLDPM-NEXT: [[INCDEC_PTR_2:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* [[ARR]], i64 0, i32 0, i64 3 -; OLDPM-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[CNT:%.*]], i32 0 -; OLDPM-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> zeroinitializer -; OLDPM-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[TMP2]], <i32 1, i32 2, i32 3, i32 4> -; OLDPM-NEXT: [[TMP4:%.*]] = bitcast %"struct.std::array"* [[ARR]] to <4 x i32>* -; OLDPM-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 16 -; OLDPM-NEXT: [[INCDEC_PTR_3:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* [[ARR]], i64 0, i32 0, i64 4 -; OLDPM-NEXT: [[INC_4:%.*]] = add nsw i32 [[CNT]], 5 -; OLDPM-NEXT: store i32 [[INC_4]], i32* [[INCDEC_PTR_3]], align 16 -; OLDPM-NEXT: [[INCDEC_PTR_4:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* [[ARR]], i64 0, i32 0, i64 5 -; OLDPM-NEXT: [[INC_5:%.*]] = add nsw i32 [[CNT]], 6 -; OLDPM-NEXT: store i32 [[INC_5]], i32* [[INCDEC_PTR_4]], align 4 -; OLDPM-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYDECAY_I_I_I]], align 16 -; OLDPM-NEXT: call void @_Z3usei(i32 [[TMP5]]) -; OLDPM-NEXT: [[TMP6:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 -; OLDPM-NEXT: call void @_Z3usei(i32 [[TMP6]]) -; OLDPM-NEXT: [[TMP7:%.*]] = load i32, i32* [[INCDEC_PTR_1]], align 8 -; OLDPM-NEXT: call void @_Z3usei(i32 [[TMP7]]) -; OLDPM-NEXT: [[TMP8:%.*]] = load i32, i32* [[INCDEC_PTR_2]], align 4 -; OLDPM-NEXT: call void @_Z3usei(i32 [[TMP8]]) -; OLDPM-NEXT: [[TMP9:%.*]] = load i32, i32* [[INCDEC_PTR_3]], align 16 -; OLDPM-NEXT: call void @_Z3usei(i32 [[TMP9]]) -; OLDPM-NEXT: call void @_Z3usei(i32 [[INC_5]]) -; OLDPM-NEXT: call void @llvm.lifetime.end.p0i8(i64 24, i8* nonnull [[TMP0]]) -; OLDPM-NEXT: ret void -; -; NEWPM-LABEL: @_Z3fooi( -; 
NEWPM-NEXT: entry: -; NEWPM-NEXT: [[INC:%.*]] = add nsw i32 [[CNT:%.*]], 1 -; NEWPM-NEXT: [[INC_1:%.*]] = add nsw i32 [[CNT]], 2 -; NEWPM-NEXT: [[INC_2:%.*]] = add nsw i32 [[CNT]], 3 -; NEWPM-NEXT: [[INC_3:%.*]] = add nsw i32 [[CNT]], 4 -; NEWPM-NEXT: [[INC_4:%.*]] = add nsw i32 [[CNT]], 5 -; NEWPM-NEXT: [[INC_5:%.*]] = add nsw i32 [[CNT]], 6 -; NEWPM-NEXT: call void @_Z3usei(i32 [[INC]]) -; NEWPM-NEXT: call void @_Z3usei(i32 [[INC_1]]) -; NEWPM-NEXT: call void @_Z3usei(i32 [[INC_2]]) -; NEWPM-NEXT: call void @_Z3usei(i32 [[INC_3]]) -; NEWPM-NEXT: call void @_Z3usei(i32 [[INC_4]]) -; NEWPM-NEXT: call void @_Z3usei(i32 [[INC_5]]) -; NEWPM-NEXT: ret void +; CHECK-LABEL: @_Z3fooi( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[CNT:%.*]], 1 +; CHECK-NEXT: [[INC_1:%.*]] = add nsw i32 [[CNT]], 2 +; CHECK-NEXT: [[INC_2:%.*]] = add nsw i32 [[CNT]], 3 +; CHECK-NEXT: [[INC_3:%.*]] = add nsw i32 [[CNT]], 4 +; CHECK-NEXT: [[INC_4:%.*]] = add nsw i32 [[CNT]], 5 +; CHECK-NEXT: [[INC_5:%.*]] = add nsw i32 [[CNT]], 6 +; CHECK-NEXT: call void @_Z3usei(i32 [[INC]]) +; CHECK-NEXT: call void @_Z3usei(i32 [[INC_1]]) +; CHECK-NEXT: call void @_Z3usei(i32 [[INC_2]]) +; CHECK-NEXT: call void @_Z3usei(i32 [[INC_3]]) +; CHECK-NEXT: call void @_Z3usei(i32 [[INC_4]]) +; CHECK-NEXT: call void @_Z3usei(i32 [[INC_5]]) +; CHECK-NEXT: ret void ; entry: %cnt.addr = alloca i32 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits