kmclaughlin updated this revision to Diff 252368.
kmclaughlin marked 3 inline comments as done.
kmclaughlin added a comment.

Use a SmallPtrSet instead of a SmallVector for storing the functions found by
runOnModule, so each function is recorded (and later visited) only once.
Add more comments to clarify the purpose of the pass and some of the negative
reinterpret tests.
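
For reference, a minimal sketch of the new collection pattern (illustrative
only; IntrinsicDecl stands in for one of the matching intrinsic declarations
that runOnModule finds while walking M.getFunctionList()):

  SmallPtrSet<Function *, 4> Functions;
  for (User *U : IntrinsicDecl->users())
    if (auto *Inst = dyn_cast<Instruction>(U))
      Functions.insert(Inst->getFunction()); // set semantics: one entry per function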


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D76078/new/

https://reviews.llvm.org/D76078

Files:
  llvm/lib/Target/AArch64/AArch64.h
  llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
  llvm/lib/Target/AArch64/CMakeLists.txt
  llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
  llvm/test/CodeGen/AArch64/O3-pipeline.ll
  llvm/test/CodeGen/AArch64/sve-intrinsic-opts-ptest.ll
  llvm/test/CodeGen/AArch64/sve-intrinsic-opts-reinterpret.ll

Index: llvm/test/CodeGen/AArch64/sve-intrinsic-opts-reinterpret.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve-intrinsic-opts-reinterpret.ll
@@ -0,0 +1,203 @@
+; RUN: opt -S -sve-intrinsic-opts -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck --check-prefix OPT %s
+
+define <vscale x 8 x i1> @reinterpret_test_h(<vscale x 8 x i1> %a) {
+; OPT-LABEL: @reinterpret_test_h(
+; OPT-NOT: convert
+; OPT: ret <vscale x 8 x i1> %a
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
+  %2 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %1)
+  ret <vscale x 8 x i1> %2
+}
+
+; Reinterprets are not redundant because the second reinterpret zeros the
+; lanes that don't exist within its input.
+define <vscale x 16 x i1> @reinterpret_test_h_rev(<vscale x 16 x i1> %a) {
+; OPT-LABEL: @reinterpret_test_h_rev(
+; OPT: %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %a)
+; OPT-NEXT: %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %1)
+; OPT-NEXT: ret <vscale x 16 x i1> %2
+  %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %a)
+  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %1)
+  ret <vscale x 16 x i1> %2
+}
+
+define <vscale x 4 x i1> @reinterpret_test_w(<vscale x 4 x i1> %a) {
+; OPT-LABEL: @reinterpret_test_w(
+; OPT-NOT: convert
+; OPT: ret <vscale x 4 x i1> %a
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
+  %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
+  ret <vscale x 4 x i1> %2
+}
+
+; Reinterprets are not redundant because the second reinterpret zeros the
+; lanes that don't exist within its input.
+define <vscale x 16 x i1> @reinterpret_test_w_rev(<vscale x 16 x i1> %a) {
+; OPT-LABEL: @reinterpret_test_w_rev(
+; OPT: %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %a)
+; OPT-NEXT: %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1)
+; OPT-NEXT: ret <vscale x 16 x i1> %2
+  %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %a)
+  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1)
+  ret <vscale x 16 x i1> %2
+}
+
+define <vscale x 2 x i1> @reinterpret_test_d(<vscale x 2 x i1> %a) {
+; OPT-LABEL: @reinterpret_test_d(
+; OPT-NOT: convert
+; OPT: ret <vscale x 2 x i1> %a
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
+  %2 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %1)
+  ret <vscale x 2 x i1> %2
+}
+
+; Reinterprets are not redundant because the second reinterpret zeros the
+; lanes that don't exist within its input.
+define <vscale x 16 x i1> @reinterpret_test_d_rev(<vscale x 16 x i1> %a) {
+; OPT-LABEL: @reinterpret_test_d_rev(
+; OPT: %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %a)
+; OPT-NEXT: %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %1)
+; OPT-NEXT: ret <vscale x 16 x i1> %2
+  %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %a)
+  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %1)
+  ret <vscale x 16 x i1> %2
+}
+
+define <vscale x 2 x i1> @reinterpret_reductions(i32 %cond, <vscale x 2 x i1> %a, <vscale x 2 x i1> %b, <vscale x 2 x i1> %c) {
+; OPT-LABEL: reinterpret_reductions
+; OPT-NOT: convert
+; OPT-NOT: phi <vscale x 16 x i1>
+; OPT: phi <vscale x 2 x i1> [ %a, %br_phi_a ], [ %b, %br_phi_b ], [ %c, %br_phi_c ]
+; OPT-NOT: convert
+; OPT: ret
+
+entry:
+  switch i32 %cond, label %br_phi_c [
+         i32 43, label %br_phi_a
+         i32 45, label %br_phi_b
+  ]
+
+br_phi_a:
+  %a1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
+  br label %join
+
+br_phi_b:
+  %b1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %b)
+  br label %join
+
+br_phi_c:
+  %c1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %c)
+  br label %join
+
+join:
+  %pg = phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b1, %br_phi_b ], [ %c1, %br_phi_c ]
+  %pg1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
+  ret <vscale x 2 x i1> %pg1
+}
+
+; No transform, because the reinterprets convert from different types (nxv2i1 & nxv4i1).
+; Since the incoming values of a phi must all have the same type, we cannot remove the reinterprets.
+define <vscale x 2 x i1> @reinterpret_reductions_1(i32 %cond, <vscale x 2 x i1> %a, <vscale x 4 x i1> %b, <vscale x 2 x i1> %c) {
+; OPT-LABEL: reinterpret_reductions_1
+; OPT: convert
+; OPT: phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b1, %br_phi_b ], [ %c1, %br_phi_c ]
+; OPT-NOT: phi <vscale x 2 x i1>
+; OPT: tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
+; OPT: ret
+
+entry:
+  switch i32 %cond, label %br_phi_c [
+         i32 43, label %br_phi_a
+         i32 45, label %br_phi_b
+  ]
+
+br_phi_a:
+  %a1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
+  br label %join
+
+br_phi_b:
+  %b1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %b)
+  br label %join
+
+br_phi_c:
+  %c1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %c)
+  br label %join
+
+join:
+  %pg = phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b1, %br_phi_b ], [ %c1, %br_phi_c ]
+  %pg1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
+  ret <vscale x 2 x i1> %pg1
+}
+
+; No transform. Similar to the test above, but here only two of the arguments need to
+; be converted to svbool.
+define <vscale x 2 x i1> @reinterpret_reductions_2(i32 %cond, <vscale x 2 x i1> %a, <vscale x 16 x i1> %b, <vscale x 2 x i1> %c) {
+; OPT-LABEL: reinterpret_reductions_2
+; OPT: convert
+; OPT: phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b, %br_phi_b ], [ %c1, %br_phi_c ]
+; OPT-NOT: phi <vscale x 2 x i1>
+; OPT: tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
+; OPT: ret
+
+entry:
+  switch i32 %cond, label %br_phi_c [
+         i32 43, label %br_phi_a
+         i32 45, label %br_phi_b
+  ]
+
+br_phi_a:
+  %a1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
+  br label %join
+
+br_phi_b:
+  br label %join
+
+br_phi_c:
+  %c1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %c)
+  br label %join
+
+join:
+  %pg = phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b, %br_phi_b ], [ %c1, %br_phi_c ]
+  %pg1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
+  ret <vscale x 2 x i1> %pg1
+}
+
+; Similar to reinterpret_reductions but the reinterprets remain because the
+; original phi cannot be removed (i.e. prefer reinterprets over multiple phis).
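+; (%pg has a second use here, the return, so it fails the hasOneUse check in
+; processPhiNode and cannot be erased.)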
+define <vscale x 16 x i1> @reinterpret_reductions3(i32 %cond, <vscale x 2 x i1> %a, <vscale x 2 x i1> %b, <vscale x 2 x i1> %c) {
+; OPT-LABEL: reinterpret_reductions3
+; OPT: phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b1, %br_phi_b ], [ %c1, %br_phi_c ]
+; OPT-NOT: phi <vscale x 2 x i1>
+; OPT: tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
+; OPT-NEXT: ret <vscale x 16 x i1> %pg
+
+entry:
+  switch i32 %cond, label %br_phi_c [
+         i32 43, label %br_phi_a
+         i32 45, label %br_phi_b
+  ]
+
+br_phi_a:
+  %a1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
+  br label %join
+
+br_phi_b:
+  %b1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %b)
+  br label %join
+
+br_phi_c:
+  %c1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %c)
+  br label %join
+
+join:
+  %pg = phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b1, %br_phi_b ], [ %c1, %br_phi_c ]
+  %pg1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
+  ret <vscale x 16 x i1> %pg
+}
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1>)
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1>)
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1>)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1>)
+declare <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1>)
+declare <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1>)
Index: llvm/test/CodeGen/AArch64/sve-intrinsic-opts-ptest.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve-intrinsic-opts-ptest.ll
@@ -0,0 +1,67 @@
+; RUN: opt -S -sve-intrinsic-opts -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck --check-prefix OPT %s
+
+define i1 @ptest_any1(<vscale x 2 x i1> %a) {
+; OPT-LABEL: ptest_any1
+; OPT: %mask = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 0)
+; OPT-NOT: convert
+; OPT-NEXT: %[[OUT:.*]] = call i1 @llvm.aarch64.sve.ptest.any.nxv2i1(<vscale x 2 x i1> %mask, <vscale x 2 x i1> %a)
+; OPT-NEXT: ret i1 %[[OUT]]
+  %mask = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 0)
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %mask)
+  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
+  %out = call i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %2)
+  ret i1 %out
+}
+
+; No transform because the ptest is using differently sized operands.
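+; (%mask is an nxv2i1 predicate while %a is nxv4i1, so a rewritten ptest would
+; have mismatched operand types.)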
+define i1 @ptest_any2(<vscale x 4 x i1> %a) {
+; OPT-LABEL: ptest_any2
+; OPT: %mask = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; OPT-NEXT: %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %mask)
+; OPT-NEXT: %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
+; OPT-NEXT: %out = call i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %2)
+  %mask = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %mask)
+  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
+  %out = call i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %2)
+  ret i1 %out
+}
+
+define i1 @ptest_first(<vscale x 4 x i1> %a) {
+; OPT-LABEL: ptest_first
+; OPT: %mask = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 0)
+; OPT-NOT: convert
+; OPT-NEXT: %[[OUT:.*]] = call i1 @llvm.aarch64.sve.ptest.first.nxv4i1(<vscale x 4 x i1> %mask, <vscale x 4 x i1> %a)
+; OPT-NEXT: ret i1 %[[OUT]]
+  %mask = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 0)
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %mask)
+  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
+  %out = call i1 @llvm.aarch64.sve.ptest.first.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %2)
+  ret i1 %out
+}
+
+define i1 @ptest_last(<vscale x 8 x i1> %a) {
+; OPT-LABEL: ptest_last
+; OPT: %mask = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 0)
+; OPT-NOT: convert
+; OPT-NEXT: %[[OUT:.*]] = call i1 @llvm.aarch64.sve.ptest.last.nxv8i1(<vscale x 8 x i1> %mask, <vscale x 8 x i1> %a)
+; OPT-NEXT: ret i1 %[[OUT]]
+  %mask = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 0)
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %mask)
+  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
+  %out = call i1 @llvm.aarch64.sve.ptest.last.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %2)
+  ret i1 %out
+}
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32)
+declare <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32)
+declare <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32)
+
+declare i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
+declare i1 @llvm.aarch64.sve.ptest.first.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
+declare i1 @llvm.aarch64.sve.ptest.last.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1>)
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1>)
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1>)
Index: llvm/test/CodeGen/AArch64/O3-pipeline.ll
===================================================================
--- llvm/test/CodeGen/AArch64/O3-pipeline.ll
+++ llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -17,6 +17,10 @@
 ; CHECK-NEXT:     Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT:     FunctionPass Manager
 ; CHECK-NEXT:       Expand Atomic instructions
+; CHECK-NEXT:     SVE intrinsics optimizations
+; CHECK-NEXT:       FunctionPass Manager
+; CHECK-NEXT:         Dominator Tree Construction
+; CHECK-NEXT:     FunctionPass Manager
 ; CHECK-NEXT:       Simplify the CFG
 ; CHECK-NEXT:       Dominator Tree Construction
 ; CHECK-NEXT:       Natural Loop Information
Index: llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
===================================================================
--- /dev/null
+++ llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
@@ -0,0 +1,277 @@
+//===----- SVEIntrinsicOpts - SVE ACLE Intrinsics Opts --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Performs general IR level optimizations on SVE intrinsics.
+//
+// The main goal of this pass is to remove unnecessary reinterpret
+// intrinsics (llvm.aarch64.sve.convert.[to|from].svbool), e.g:
+//
+//   %1 = @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
+//   %2 = @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
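+//
+// where both conversions can be removed and uses of %2 replaced with %a,
+// since %a already has the type that %2 converts back to.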
+//
+// This pass also looks for ptest intrinsics & phi instructions where the
+// operands are being needlessly converted to and from svbool_t.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsAArch64.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "sve-intrinsic-opts"
+
+namespace llvm {
+void initializeSVEIntrinsicOptsPass(PassRegistry &);
+}
+
+namespace {
+struct SVEIntrinsicOpts : public ModulePass {
+  static char ID; // Pass identification, replacement for typeid
+  SVEIntrinsicOpts() : ModulePass(ID) {
+    initializeSVEIntrinsicOptsPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnModule(Module &M) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+private:
+  static IntrinsicInst *isReinterpretFromSVBool(Value *V);
+  static IntrinsicInst *isReinterpretToSVBool(Value *V);
+
+  static bool optimizeIntrinsic(Instruction *I);
+
+  bool optimizeFunctions(SmallPtrSetImpl<Function *> &Functions);
+
+  static bool optimizeConvertFromSVBool(IntrinsicInst *I);
+  static bool optimizePTest(IntrinsicInst *I);
+
+  static bool processPhiNode(IntrinsicInst *I);
+};
+} // end anonymous namespace
+
+void SVEIntrinsicOpts::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<DominatorTreeWrapperPass>();
+  AU.setPreservesCFG();
+}
+
+char SVEIntrinsicOpts::ID = 0;
+static const char *name = "SVE intrinsics optimizations";
+INITIALIZE_PASS_BEGIN(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass);
+INITIALIZE_PASS_END(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false)
+
+namespace llvm {
+ModulePass *createSVEIntrinsicOptsPass() { return new SVEIntrinsicOpts(); }
+} // namespace llvm
+
+/// Returns V if it's a cast to <vscale x 16 x i1> (aka svbool_t), nullptr
+/// otherwise.
+IntrinsicInst *SVEIntrinsicOpts::isReinterpretToSVBool(Value *V) {
+  IntrinsicInst *I = dyn_cast<IntrinsicInst>(V);
+  if (!I)
+    return nullptr;
+
+  if (I->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
+    return nullptr;
+
+  return I;
+}
+
+/// Returns V if it's a cast from <vscale x 16 x i1> (aka svbool_t), nullptr otherwise.
+IntrinsicInst *SVEIntrinsicOpts::isReinterpretFromSVBool(Value *V) {
+  IntrinsicInst *I = dyn_cast<IntrinsicInst>(V);
+  if (!I)
+    return nullptr;
+
+  if (I->getIntrinsicID() != Intrinsic::aarch64_sve_convert_from_svbool)
+    return nullptr;
+
+  return I;
+}
+
+/// Remove redundant reinterpret casts around a phi node: if every incoming
+/// value of the phi is a convert.to.svbool from the required type, replace the
+/// phi with one of that narrower type and drop the conversions.
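+///
+/// For example (a sketch mirroring the reinterpret_reductions test):
+///   %a1 = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
+///   %pg = phi <vscale x 16 x i1> [ %a1, ... ], ...
+///   %pg1 = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
+/// becomes a phi of <vscale x 2 x i1> over the original narrow values, with
+/// the conversions erased.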
+bool SVEIntrinsicOpts::processPhiNode(IntrinsicInst *X) {
+
+  SmallVector<Instruction *, 32> Worklist;
+  auto RequiredType = X->getType();
+
+  auto *PN = dyn_cast<PHINode>(X->getArgOperand(0));
+  assert(PN && "Expected Phi Node!");
+
+  // Don't create a new Phi unless we can remove the old one.
+  if (!PN->hasOneUse())
+    return false;
+
+  for (Value *IncValPhi : PN->incoming_values()) {
+    auto *Reinterpret = isReinterpretToSVBool(IncValPhi);
+    if (!Reinterpret ||
+        RequiredType != Reinterpret->getArgOperand(0)->getType())
+      return false;
+  }
+
+  // Create the new Phi
+  LLVMContext &Ctx = PN->getContext();
+  IRBuilder<> Builder(Ctx);
+  Builder.SetInsertPoint(PN);
+  PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
+  Worklist.push_back(PN);
+
+  for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
+    auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
+    NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
+    Worklist.push_back(Reinterpret);
+  }
+
+  // Clean up the original phi node and any reinterprets that are now dead.
+  X->replaceAllUsesWith(NPN);
+  X->eraseFromParent();
+
+  for (auto &I : Worklist)
+    if (I->use_empty())
+      I->eraseFromParent();
+
+  return true;
+}
+
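+/// If both operands of a ptest intrinsic are convert.to.svbool calls whose
+/// sources share the same type, rewrite the ptest to operate on those sources
+/// directly and drop the conversions (see sve-intrinsic-opts-ptest.ll).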
+bool SVEIntrinsicOpts::optimizePTest(IntrinsicInst *I) {
+  IntrinsicInst *Op1 = dyn_cast<IntrinsicInst>(I->getArgOperand(0));
+  IntrinsicInst *Op2 = dyn_cast<IntrinsicInst>(I->getArgOperand(1));
+
+  if (Op1 && Op2 &&
+      Op1->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
+      Op2->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
+      Op1->getArgOperand(0)->getType() == Op2->getArgOperand(0)->getType()) {
+
+    Value *Ops[] = {Op1->getArgOperand(0), Op2->getArgOperand(0)};
+    Type *Tys[] = {Op1->getArgOperand(0)->getType()};
+    Module *M = I->getParent()->getParent()->getParent();
+
+    auto Fn = Intrinsic::getDeclaration(M, I->getIntrinsicID(), Tys);
+    auto CI = CallInst::Create(Fn, Ops, I->getName(), I);
+
+    I->replaceAllUsesWith(CI);
+    I->eraseFromParent();
+    if (Op1->use_empty())
+      Op1->eraseFromParent();
+    if (Op2->use_empty())
+      Op2->eraseFromParent();
+
+    return true;
+  }
+
+  return false;
+}
+
+bool SVEIntrinsicOpts::optimizeConvertFromSVBool(IntrinsicInst *I) {
+  assert(isReinterpretFromSVBool(I));
+
+  // If the reinterpret instruction operand is a PHI Node
+  if (isa<PHINode>(I->getArgOperand(0)))
+    return processPhiNode(I);
+
+  // If we have a reinterpret intrinsic I of type A which is converting from
+  // another reinterpret Y of type B, and the source type of Y is A, then we can
+  // elide away both reinterprets if there are no other users of Y.
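+  //
+  // For illustration (nxv4i1 is just one possible element count):
+  //   %Y = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %X)
+  //   %I = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %Y)
+  // Uses of %I are then replaced with %X.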
+  auto *Y = isReinterpretToSVBool(I->getArgOperand(0));
+  if (!Y)
+    return false;
+
+  Value *SourceVal = Y->getArgOperand(0);
+  if (I->getType() != SourceVal->getType())
+    return false;
+
+  I->replaceAllUsesWith(SourceVal);
+  I->eraseFromParent();
+  if (Y->use_empty())
+    Y->eraseFromParent();
+
+  return true;
+}
+
+bool SVEIntrinsicOpts::optimizeIntrinsic(Instruction *I) {
+  IntrinsicInst *IntrI = dyn_cast<IntrinsicInst>(I);
+  if (!IntrI)
+    return false;
+
+  switch (IntrI->getIntrinsicID()) {
+  case Intrinsic::aarch64_sve_convert_from_svbool:
+    return optimizeConvertFromSVBool(IntrI);
+  case Intrinsic::aarch64_sve_ptest_any:
+  case Intrinsic::aarch64_sve_ptest_first:
+  case Intrinsic::aarch64_sve_ptest_last:
+    return optimizePTest(IntrI);
+  default:
+    return false;
+  }
+
+  return true;
+}
+
+bool SVEIntrinsicOpts::optimizeFunctions(
+    SmallPtrSetImpl<Function *> &Functions) {
+  bool Changed = false;
+  for (auto *F : Functions) {
+    DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>(*F).getDomTree();
+
+    // Traverse the DT with an rpo walk so we see defs before uses, allowing
+    // simplification to be done incrementally.
+    BasicBlock *Root = DT->getRoot();
+    ReversePostOrderTraversal<BasicBlock *> RPOT(Root);
+    for (auto *BB : RPOT)
+      for (Instruction &I : make_early_inc_range(*BB))
+        Changed |= optimizeIntrinsic(&I);
+  }
+  return Changed;
+}
+
+bool SVEIntrinsicOpts::runOnModule(Module &M) {
+  bool Changed = false;
+  SmallPtrSet<Function *, 4> Functions;
+
+  // Check for SVE intrinsic declarations first so that we only iterate over
+  // relevant functions. Where an appropriate declaration is found, store the
+  // function(s) where it is used so we can target these only.
+  for (auto &F : M.getFunctionList()) {
+    if (!F.isDeclaration())
+      continue;
+
+    switch (F.getIntrinsicID()) {
+    case Intrinsic::aarch64_sve_convert_from_svbool:
+    case Intrinsic::aarch64_sve_ptest_any:
+    case Intrinsic::aarch64_sve_ptest_first:
+    case Intrinsic::aarch64_sve_ptest_last:
+      for (auto I = F.user_begin(), E = F.user_end(); I != E;) {
+        if (auto *Inst = dyn_cast<Instruction>(*I++))
+          Functions.insert(Inst->getFunction());
+      }
+      break;
+    default:
+      break;
+    }
+  }
+
+  if (!Functions.empty())
+    Changed |= optimizeFunctions(Functions);
+
+  return Changed;
+}
Index: llvm/lib/Target/AArch64/CMakeLists.txt
===================================================================
--- llvm/lib/Target/AArch64/CMakeLists.txt
+++ llvm/lib/Target/AArch64/CMakeLists.txt
@@ -64,6 +64,7 @@
   AArch64TargetMachine.cpp
   AArch64TargetObjectFile.cpp
   AArch64TargetTransformInfo.cpp
+  SVEIntrinsicOpts.cpp
   AArch64SIMDInstrOpt.cpp
 
   DEPENDS
Index: llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -146,6 +146,11 @@
     cl::desc("Enable GlobalISel at or below an opt level (-1 to disable)"),
     cl::init(0));
 
+static cl::opt<bool> EnableSVEIntrinsicOpts(
+    "aarch64-sve-intrinsic-opts", cl::Hidden,
+    cl::desc("Enable SVE intrinsic opts"),
+    cl::init(true));
+
 static cl::opt<bool> EnableFalkorHWPFFix("aarch64-enable-falkor-hwpf-fix",
                                          cl::init(true), cl::Hidden);
 
@@ -182,6 +187,7 @@
   initializeFalkorHWPFFixPass(*PR);
   initializeFalkorMarkStridedAccessesLegacyPass(*PR);
   initializeLDTLSCleanupPass(*PR);
+  initializeSVEIntrinsicOptsPass(*PR);
   initializeAArch64SpeculationHardeningPass(*PR);
   initializeAArch64StackTaggingPass(*PR);
   initializeAArch64StackTaggingPreRAPass(*PR);
@@ -434,6 +440,10 @@
   // ourselves.
   addPass(createAtomicExpandPass());
 
+  // Optimize or remove redundant SVE intrinsic calls (e.g. convert.to/from.svbool
+  // round trips) before instruction selection.
+  if (EnableSVEIntrinsicOpts && TM->getOptLevel() == CodeGenOpt::Aggressive)
+    addPass(createSVEIntrinsicOptsPass());
+
   // Cmpxchg instructions are often used with a subsequent comparison to
   // determine whether it succeeded. We can exploit existing control-flow in
   // ldrex/strex loops to simplify this, but it needs tidying up.
Index: llvm/lib/Target/AArch64/AArch64.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64.h
+++ llvm/lib/Target/AArch64/AArch64.h
@@ -52,6 +52,7 @@
 FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
 
 FunctionPass *createAArch64CollectLOHPass();
+ModulePass *createSVEIntrinsicOptsPass();
 InstructionSelector *
 createAArch64InstructionSelector(const AArch64TargetMachine &,
                                  AArch64Subtarget &, AArch64RegisterBankInfo &);
@@ -80,6 +81,7 @@
 void initializeFalkorHWPFFixPass(PassRegistry&);
 void initializeFalkorMarkStridedAccessesLegacyPass(PassRegistry&);
 void initializeLDTLSCleanupPass(PassRegistry&);
+void initializeSVEIntrinsicOptsPass(PassRegistry&);
 void initializeAArch64StackTaggingPass(PassRegistry&);
 void initializeAArch64StackTaggingPreRAPass(PassRegistry&);
 } // end namespace llvm