bsaleil created this revision.
bsaleil added reviewers: nemanjai, amyk, saghir, lei.
bsaleil added projects: LLVM, PowerPC.
Herald added subscribers: llvm-commits, cfe-commits, shchenz, kbarton, 
hiraditya.
Herald added a project: clang.
bsaleil requested review of this revision.

This patch adds the Clang builtins and LLVM intrinsics to load and store vector 
pairs.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D90799

Files:
  clang/include/clang/Basic/BuiltinsPPC.def
  clang/lib/CodeGen/CGBuiltin.cpp
  llvm/include/llvm/IR/IntrinsicsPowerPC.td
  llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
  llvm/lib/Target/PowerPC/PPCISelLowering.cpp
  llvm/lib/Target/PowerPC/PPCISelLowering.h
  llvm/lib/Target/PowerPC/PPCInstrInfo.td
  llvm/lib/Target/PowerPC/PPCInstrPrefix.td
  llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
  llvm/test/CodeGen/PowerPC/dform-pair-load-store.ll
  llvm/test/CodeGen/PowerPC/mma-intrinsics.ll

Index: llvm/test/CodeGen/PowerPC/mma-intrinsics.ll
===================================================================
--- llvm/test/CodeGen/PowerPC/mma-intrinsics.ll
+++ llvm/test/CodeGen/PowerPC/mma-intrinsics.ll
@@ -698,3 +698,307 @@
 
 declare <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1>, <256 x i1>, <16 x i8>, i32, i32)
 declare <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1>, <256 x i1>, <16 x i8>)
+
+; Function Attrs: nounwind
+define void @test_ldst_1(<256 x i1>* %vpp, <256 x i1>* %vp2) {
+; CHECK-LABEL: test_ldst_1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lxvp vsp0, 0(r3)
+; CHECK-NEXT:    stxvp vsp0, 0(r4)
+; CHECK-NEXT:    blr
+;
+; CHECK-BE-LABEL: test_ldst_1:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxvp vsp0, 0(r3)
+; CHECK-BE-NEXT:    stxvp vsp0, 0(r4)
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = bitcast <256 x i1>* %vpp to i8*
+  %1 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %0)
+  %2 = bitcast <256 x i1>* %vp2 to i8*
+  tail call void @llvm.ppc.mma.stxvp(<256 x i1> %1, i8* %2)
+  ret void
+}
+
+; Function Attrs: argmemonly nounwind readonly
+declare <256 x i1> @llvm.ppc.mma.lxvp(i8*)
+
+; Function Attrs: argmemonly nounwind writeonly
+declare void @llvm.ppc.mma.stxvp(<256 x i1>, i8*)
+
+; Function Attrs: nounwind
+define void @test_ldst_2(<256 x i1>* %vpp, i64 %offset, <256 x i1>* %vp2)  {
+; CHECK-LABEL: test_ldst_2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lxvpx vsp0, r3, r4
+; CHECK-NEXT:    stxvpx vsp0, r5, r4
+; CHECK-NEXT:    blr
+;
+; CHECK-BE-LABEL: test_ldst_2:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxvpx vsp0, r3, r4
+; CHECK-BE-NEXT:    stxvpx vsp0, r5, r4
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = bitcast <256 x i1>* %vpp to i8*
+  %1 = getelementptr i8, i8* %0, i64 %offset
+  %2 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %1)
+  %3 = bitcast <256 x i1>* %vp2 to i8*
+  %4 = getelementptr i8, i8* %3, i64 %offset
+  tail call void @llvm.ppc.mma.stxvp(<256 x i1> %2, i8* %4)
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @test_ldst_3(<256 x i1>* %vpp, <256 x i1>* %vp2)  {
+; CHECK-LABEL: test_ldst_3:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    plxvp vsp0, 18(r3), 0
+; CHECK-NEXT:    pstxvp vsp0, 18(r4), 0
+; CHECK-NEXT:    blr
+;
+; CHECK-BE-LABEL: test_ldst_3:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    plxvp vsp0, 18(r3), 0
+; CHECK-BE-NEXT:    pstxvp vsp0, 18(r4), 0
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = bitcast <256 x i1>* %vpp to i8*
+  %1 = getelementptr i8, i8* %0, i64 18
+  %2 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %1)
+  %3 = bitcast <256 x i1>* %vp2 to i8*
+  %4 = getelementptr i8, i8* %3, i64 18
+  tail call void @llvm.ppc.mma.stxvp(<256 x i1> %2, i8* %4)
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @test_ldst_4(<256 x i1>* %vpp, <256 x i1>* %vp2)  {
+; CHECK-LABEL: test_ldst_4:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    plxvp vsp0, 1(r3), 0
+; CHECK-NEXT:    pstxvp vsp0, 1(r4), 0
+; CHECK-NEXT:    blr
+;
+; CHECK-BE-LABEL: test_ldst_4:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    plxvp vsp0, 1(r3), 0
+; CHECK-BE-NEXT:    pstxvp vsp0, 1(r4), 0
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = bitcast <256 x i1>* %vpp to i8*
+  %1 = getelementptr i8, i8* %0, i64 1
+  %2 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %1)
+  %3 = bitcast <256 x i1>* %vp2 to i8*
+  %4 = getelementptr i8, i8* %3, i64 1
+  tail call void @llvm.ppc.mma.stxvp(<256 x i1> %2, i8* %4)
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @test_ldst_5(<256 x i1>* %vpp, <256 x i1>* %vp2)  {
+; CHECK-LABEL: test_ldst_5:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    plxvp vsp0, 42(r3), 0
+; CHECK-NEXT:    pstxvp vsp0, 42(r4), 0
+; CHECK-NEXT:    blr
+;
+; CHECK-BE-LABEL: test_ldst_5:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    plxvp vsp0, 42(r3), 0
+; CHECK-BE-NEXT:    pstxvp vsp0, 42(r4), 0
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = bitcast <256 x i1>* %vpp to i8*
+  %1 = getelementptr i8, i8* %0, i64 42
+  %2 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %1)
+  %3 = bitcast <256 x i1>* %vp2 to i8*
+  %4 = getelementptr i8, i8* %3, i64 42
+  tail call void @llvm.ppc.mma.stxvp(<256 x i1> %2, i8* %4)
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @test_ldst_6(<256 x i1>* %vpp, <256 x i1>* %vp2)  {
+; CHECK-LABEL: test_ldst_6:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lxvp vsp0, 4096(r3)
+; CHECK-NEXT:    stxvp vsp0, 4096(r4)
+; CHECK-NEXT:    blr
+;
+; CHECK-BE-LABEL: test_ldst_6:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxvp vsp0, 4096(r3)
+; CHECK-BE-NEXT:    stxvp vsp0, 4096(r4)
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = getelementptr <256 x i1>, <256 x i1>* %vpp, i64 128
+  %1 = bitcast <256 x i1>* %0 to i8*
+  %2 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %1)
+  %3 = getelementptr <256 x i1>, <256 x i1>* %vp2, i64 128
+  %4 = bitcast <256 x i1>* %3 to i8*
+  tail call void @llvm.ppc.mma.stxvp(<256 x i1> %2, i8* %4)
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @test_ldst_7(<256 x i1>* %vpp, <256 x i1>* %vp2)  {
+; FIXME: A prefixed load (plxvp) is expected here as the offset in this
+; test case is a constant that fits within 34-bits.
+; CHECK-LABEL: test_ldst_7:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    li r5, 0
+; CHECK-NEXT:    ori r5, r5, 32799
+; CHECK-NEXT:    lxvpx vsp0, r3, r5
+; CHECK-NEXT:    stxvpx vsp0, r4, r5
+; CHECK-NEXT:    blr
+;
+; CHECK-BE-LABEL: test_ldst_7:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    li r5, 0
+; CHECK-BE-NEXT:    ori r5, r5, 32799
+; CHECK-BE-NEXT:    lxvpx vsp0, r3, r5
+; CHECK-BE-NEXT:    stxvpx vsp0, r4, r5
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = bitcast <256 x i1>* %vpp to i8*
+  %1 = getelementptr i8, i8* %0, i64 32799
+  %2 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %1)
+  %3 = bitcast <256 x i1>* %vp2 to i8*
+  %4 = getelementptr i8, i8* %3, i64 32799
+  tail call void @llvm.ppc.mma.stxvp(<256 x i1> %2, i8* %4)
+  ret void
+}
+
+; Function Attrs: nofree nounwind
+define void @test_ldst_8(i8* nocapture readonly %vqp, <256 x i1>* %vpp, <16 x i8> %vc, i8* nocapture %resp)  {
+; CHECK-LABEL: test_ldst_8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lxv vs1, 32(r3)
+; CHECK-NEXT:    lxv vs0, 48(r3)
+; CHECK-NEXT:    lxv vs3, 0(r3)
+; CHECK-NEXT:    lxv vs2, 16(r3)
+; CHECK-NEXT:    plxvp vsp4, 8(r4), 0
+; CHECK-NEXT:    xxmtacc acc0
+; CHECK-NEXT:    pmxvf64gernn acc0, vsp4, v2, 0, 0
+; CHECK-NEXT:    xxmfacc acc0
+; CHECK-NEXT:    stxv vs0, 48(r7)
+; CHECK-NEXT:    stxv vs1, 32(r7)
+; CHECK-NEXT:    stxv vs2, 16(r7)
+; CHECK-NEXT:    stxv vs3, 0(r7)
+; CHECK-NEXT:    blr
+;
+; CHECK-BE-LABEL: test_ldst_8:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs1, 16(r3)
+; CHECK-BE-NEXT:    lxv vs0, 0(r3)
+; CHECK-BE-NEXT:    lxv vs3, 48(r3)
+; CHECK-BE-NEXT:    lxv vs2, 32(r3)
+; CHECK-BE-NEXT:    plxvp vsp4, 8(r4), 0
+; CHECK-BE-NEXT:    xxmtacc acc0
+; CHECK-BE-NEXT:    pmxvf64gernn acc0, vsp4, v2, 0, 0
+; CHECK-BE-NEXT:    xxmfacc acc0
+; CHECK-BE-NEXT:    stxv vs1, 16(r7)
+; CHECK-BE-NEXT:    stxv vs0, 0(r7)
+; CHECK-BE-NEXT:    stxv vs3, 48(r7)
+; CHECK-BE-NEXT:    stxv vs2, 32(r7)
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = bitcast i8* %vqp to <512 x i1>*
+  %1 = load <512 x i1>, <512 x i1>* %0, align 64
+  %2 = bitcast <256 x i1>* %vpp to i8*
+  %3 = getelementptr i8, i8* %2, i64 8
+  %4 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %3)
+  %5 = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1> %1, <256 x i1> %4, <16 x i8> %vc, i32 0, i32 0)
+  %6 = bitcast i8* %resp to <512 x i1>*
+  store <512 x i1> %5, <512 x i1>* %6, align 64
+  ret void
+}
+
+; Function Attrs: nofree nounwind
+define void @test_ldst_9(i8* nocapture readonly %vqp, <256 x i1>* %vpp, <16 x i8> %vc, i8* nocapture %resp)  {
+; CHECK-LABEL: test_ldst_9:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lxv vs1, 32(r3)
+; CHECK-NEXT:    lxv vs0, 48(r3)
+; CHECK-NEXT:    lxv vs3, 0(r3)
+; CHECK-NEXT:    lxv vs2, 16(r3)
+; CHECK-NEXT:    lxvp vsp4, 0(r4)
+; CHECK-NEXT:    xxmtacc acc0
+; CHECK-NEXT:    xvf64gernp acc0, vsp4, v2
+; CHECK-NEXT:    xxmfacc acc0
+; CHECK-NEXT:    stxv vs0, 48(r7)
+; CHECK-NEXT:    stxv vs1, 32(r7)
+; CHECK-NEXT:    stxv vs2, 16(r7)
+; CHECK-NEXT:    stxv vs3, 0(r7)
+; CHECK-NEXT:    blr
+;
+; CHECK-BE-LABEL: test_ldst_9:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs1, 16(r3)
+; CHECK-BE-NEXT:    lxv vs0, 0(r3)
+; CHECK-BE-NEXT:    lxv vs3, 48(r3)
+; CHECK-BE-NEXT:    lxv vs2, 32(r3)
+; CHECK-BE-NEXT:    lxvp vsp4, 0(r4)
+; CHECK-BE-NEXT:    xxmtacc acc0
+; CHECK-BE-NEXT:    xvf64gernp acc0, vsp4, v2
+; CHECK-BE-NEXT:    xxmfacc acc0
+; CHECK-BE-NEXT:    stxv vs1, 16(r7)
+; CHECK-BE-NEXT:    stxv vs0, 0(r7)
+; CHECK-BE-NEXT:    stxv vs3, 48(r7)
+; CHECK-BE-NEXT:    stxv vs2, 32(r7)
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = bitcast i8* %vqp to <512 x i1>*
+  %1 = load <512 x i1>, <512 x i1>* %0, align 64
+  %2 = bitcast <256 x i1>* %vpp to i8*
+  %3 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %2)
+  %4 = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> %1, <256 x i1> %3, <16 x i8> %vc)
+  %5 = bitcast i8* %resp to <512 x i1>*
+  store <512 x i1> %4, <512 x i1>* %5, align 64
+  ret void
+}
+
+; Function Attrs: nofree nounwind
+define void @test_ldst_10(i8* nocapture readonly %vqp, i64 %offs, <256 x i1>* %vpp, <16 x i8> %vc, i8* nocapture %resp)  {
+; CHECK-LABEL: test_ldst_10:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lxv vs1, 32(r3)
+; CHECK-NEXT:    lxv vs0, 48(r3)
+; CHECK-NEXT:    lxv vs3, 0(r3)
+; CHECK-NEXT:    lxv vs2, 16(r3)
+; CHECK-NEXT:    lxvp vsp4, 0(r5)
+; CHECK-NEXT:    xxmtacc acc0
+; CHECK-NEXT:    xvf64gernp acc0, vsp4, v2
+; CHECK-NEXT:    xxmfacc acc0
+; CHECK-NEXT:    stxv vs0, 48(r9)
+; CHECK-NEXT:    stxv vs1, 32(r9)
+; CHECK-NEXT:    stxv vs2, 16(r9)
+; CHECK-NEXT:    stxv vs3, 0(r9)
+; CHECK-NEXT:    blr
+;
+; CHECK-BE-LABEL: test_ldst_10:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs1, 16(r3)
+; CHECK-BE-NEXT:    lxv vs0, 0(r3)
+; CHECK-BE-NEXT:    lxv vs3, 48(r3)
+; CHECK-BE-NEXT:    lxv vs2, 32(r3)
+; CHECK-BE-NEXT:    lxvp vsp4, 0(r5)
+; CHECK-BE-NEXT:    xxmtacc acc0
+; CHECK-BE-NEXT:    xvf64gernp acc0, vsp4, v2
+; CHECK-BE-NEXT:    xxmfacc acc0
+; CHECK-BE-NEXT:    stxv vs1, 16(r9)
+; CHECK-BE-NEXT:    stxv vs0, 0(r9)
+; CHECK-BE-NEXT:    stxv vs3, 48(r9)
+; CHECK-BE-NEXT:    stxv vs2, 32(r9)
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = bitcast i8* %vqp to <512 x i1>*
+  %1 = load <512 x i1>, <512 x i1>* %0, align 64
+  %2 = bitcast <256 x i1>* %vpp to i8*
+  %3 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %2)
+  %4 = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> %1, <256 x i1> %3, <16 x i8> %vc)
+  %5 = bitcast i8* %resp to <512 x i1>*
+  store <512 x i1> %4, <512 x i1>* %5, align 64
+  ret void
+}
Index: llvm/test/CodeGen/PowerPC/dform-pair-load-store.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/PowerPC/dform-pair-load-store.ll
@@ -0,0 +1,58 @@
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:   -mcpu=pwr10 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-p:64:64-n32:64-v256:256:256-v512:512:512"
+
+declare <256 x i1> @llvm.ppc.mma.lxvp(i8*)
+declare void @llvm.ppc.mma.stxvp(<256 x i1>, i8*)
+define void @foo(i32 zeroext %n, <256 x i1>* %ptr, <256 x i1>* %ptr2) {
+; CHECK-LABEL: foo:
+; CHECK:  .LBB0_2: # %for.body
+; CHECK-NEXT:    #
+; CHECK:    lxvp
+; CHECK:    lxvp
+; CHECK:    lxvp
+; CHECK:    lxvp
+; CHECK:    stxvp
+; CHECK:    stxvp
+; CHECK:    stxvp
+; CHECK:    stxvp
+entry:
+  %cmp35.not = icmp eq i32 %n, 0
+  br i1 %cmp35.not, label %for.cond.cleanup, label %for.body.lr.ph
+
+for.body.lr.ph:
+  %0 = bitcast <256 x i1>* %ptr to i8*
+  %1 = bitcast <256 x i1>* %ptr2 to i8*
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+  %2 = getelementptr i8, i8* %0, i64 %indvars.iv
+  %3 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %2)
+  %add2 = add nuw nsw i64 %indvars.iv, 32
+  %4 = getelementptr i8, i8* %0, i64 %add2
+  %5 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %4)
+  %add4 = add nuw nsw i64 %indvars.iv, 64
+  %6 = getelementptr i8, i8* %0, i64 %add4
+  %7 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %6)
+  %add6 = add nuw nsw i64 %indvars.iv, 96
+  %8 = getelementptr i8, i8* %0, i64 %add6
+  %9 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %8)
+  %10 = getelementptr i8, i8* %1, i64 %indvars.iv
+  tail call void @llvm.ppc.mma.stxvp(<256 x i1> %3, i8* %10)
+  %11 = getelementptr i8, i8* %1, i64 %add2
+  tail call void @llvm.ppc.mma.stxvp(<256 x i1> %5, i8* %11)
+  %12 = getelementptr i8, i8* %1, i64 %add4
+  tail call void @llvm.ppc.mma.stxvp(<256 x i1> %7, i8* %12)
+  %13 = getelementptr i8, i8* %1, i64 %add6
+  tail call void @llvm.ppc.mma.stxvp(<256 x i1> %9, i8* %13)
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
Index: llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -1223,7 +1223,8 @@
   case Intrinsic::ppc_vsx_lxvd2x_be:
   case Intrinsic::ppc_vsx_lxvw4x_be:
   case Intrinsic::ppc_vsx_lxvl:
-  case Intrinsic::ppc_vsx_lxvll: {
+  case Intrinsic::ppc_vsx_lxvll:
+  case Intrinsic::ppc_mma_lxvp: {
     Info.PtrVal = Inst->getArgOperand(0);
     Info.ReadMem = true;
     Info.WriteMem = false;
@@ -1239,7 +1240,8 @@
   case Intrinsic::ppc_vsx_stxvd2x_be:
   case Intrinsic::ppc_vsx_stxvw4x_be:
   case Intrinsic::ppc_vsx_stxvl:
-  case Intrinsic::ppc_vsx_stxvll: {
+  case Intrinsic::ppc_vsx_stxvll:
+  case Intrinsic::ppc_mma_stxvp: {
     Info.PtrVal = Inst->getArgOperand(1);
     Info.ReadMem = false;
     Info.WriteMem = true;
Index: llvm/lib/Target/PowerPC/PPCInstrPrefix.td
===================================================================
--- llvm/lib/Target/PowerPC/PPCInstrPrefix.td
+++ llvm/lib/Target/PowerPC/PPCInstrPrefix.td
@@ -1654,6 +1654,24 @@
                                 "pstxvp $XTp, $D_RA", IIC_LdStLFD>;
 }
 
+let Predicates = [PairedVectorMemops] in {
+  // Intrinsics for Paired Vector Loads.
+  def : Pat<(v256i1 (int_ppc_mma_lxvp iaddrX16:$src)), (LXVP memrix16:$src)>;
+  def : Pat<(v256i1 (int_ppc_mma_lxvp xaddr:$src)), (LXVPX xaddr:$src)>;
+  let Predicates = [PairedVectorMemops, PrefixInstrs] in {
+    def : Pat<(v256i1 (int_ppc_mma_lxvp iaddrX34:$src)), (PLXVP memri34:$src)>;
+  }
+  // Intrinsics for Paired Vector Stores.
+  def : Pat<(int_ppc_mma_stxvp v256i1:$XSp, iaddrX16:$dst),
+            (STXVP $XSp, memrix16:$dst)>;
+  def : Pat<(int_ppc_mma_stxvp v256i1:$XSp, xaddr:$dst),
+            (STXVPX $XSp, xaddr:$dst)>;
+  let Predicates = [PairedVectorMemops, PrefixInstrs] in {
+    def : Pat<(int_ppc_mma_stxvp v256i1:$XSp, iaddrX34:$dst),
+              (PSTXVP $XSp, memri34:$dst)>;
+  }
+}
+
 // TODO: We have an added complexity of 500 here. This is only a temporary
 // solution to have tablegen consider these patterns first. The way we do
 // addressing for PowerPC is complex depending on available D form, X form, or
Index: llvm/lib/Target/PowerPC/PPCInstrInfo.td
===================================================================
--- llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -1031,11 +1031,13 @@
 // Define PowerPC specific addressing mode.
 
 // d-form
-def iaddr    : ComplexPattern<iPTR, 2, "SelectAddrImm",     [], []>;  // "stb"
+def iaddr    : ComplexPattern<iPTR, 2, "SelectAddrImm",     [], []>; // "stb"
 // ds-form
-def iaddrX4  : ComplexPattern<iPTR, 2, "SelectAddrImmX4",   [], []>;  // "std"
+def iaddrX4  : ComplexPattern<iPTR, 2, "SelectAddrImmX4",   [], []>; // "std"
 // dq-form
-def iaddrX16 : ComplexPattern<iPTR, 2, "SelectAddrImmX16",  [], []>;  // "stxv"
+def iaddrX16 : ComplexPattern<iPTR, 2, "SelectAddrImmX16",  [], []>; // "stxv"
+// 8LS:d-form
+def iaddrX34 : ComplexPattern<iPTR, 2, "SelectAddrImmX34",  [], []>; // "pstxvp"
 
 // Below forms are all x-form addressing mode, use three different ones so we
 // can make a accurate check for x-form instructions in ISEL.
Index: llvm/lib/Target/PowerPC/PPCISelLowering.h
===================================================================
--- llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -770,6 +770,8 @@
     bool SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base,
                              SelectionDAG &DAG,
                              MaybeAlign EncodingAlignment) const;
+    bool SelectAddressRegImm34(SDValue N, SDValue &Disp, SDValue &Base,
+                               SelectionDAG &DAG) const;
 
     /// SelectAddressRegRegOnly - Given the specified addressed, force it to be
     /// represented as an indexed [r+r] operation.
@@ -1325,6 +1327,8 @@
 
   bool isIntS16Immediate(SDNode *N, int16_t &Imm);
   bool isIntS16Immediate(SDValue Op, int16_t &Imm);
+  bool isIntS34Immediate(SDNode *N, int64_t &Imm);
+  bool isIntS34Immediate(SDValue Op, int64_t &Imm);
 
   bool convertToNonDenormSingle(APInt &ArgAPInt);
   bool convertToNonDenormSingle(APFloat &ArgAPFloat);
Index: llvm/lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -2399,6 +2399,20 @@
   return false;
 }
 
+/// isIntS34Immediate - This method tests if value of node given can be
+/// accurately represented as a sign extension from a 34-bit value.  If so,
+/// this returns true and the immediate.
+bool llvm::isIntS34Immediate(SDNode *N, int64_t &Imm) {
+  if (!isa<ConstantSDNode>(N))
+    return false;
+
+  Imm = (int64_t)cast<ConstantSDNode>(N)->getZExtValue();
+  return isInt<34>(Imm);
+}
+bool llvm::isIntS34Immediate(SDValue Op, int64_t &Imm) {
+  return isIntS34Immediate(Op.getNode(), Imm);
+}
+
 /// SelectAddressRegReg - Given the specified addressed, check to see if it
 /// can be represented as an indexed [r+r] operation.  Returns false if it
 /// can be more efficiently represented as [r+imm]. If \p EncodingAlignment is
@@ -2599,6 +2613,50 @@
   return true;      // [r+0]
 }
 
+/// Similar to the 16-bit case but for instructions that take a 34-bit
+/// displacement field (prefixed loads/stores).
+bool PPCTargetLowering::SelectAddressRegImm34(SDValue N, SDValue &Disp,
+                                              SDValue &Base,
+                                              SelectionDAG &DAG) const {
+  // Only on 64-bit targets.
+  if (N.getValueType() != MVT::i64)
+    return false;
+
+  SDLoc dl(N);
+  int64_t Imm = 0;
+  if (N.getOpcode() == ISD::ADD) {
+    if (isIntS34Immediate(N.getOperand(1), Imm)) {
+      Disp = DAG.getTargetConstant(Imm, dl, N.getValueType());
+      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0)))
+        Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
+      else
+        Base = N.getOperand(0);
+      return true;
+    }
+  } else if (N.getOpcode() == ISD::OR) {
+    if (isIntS34Immediate(N.getOperand(1), Imm)) {
+      // If this is an or of disjoint bitfields, we can codegen this as an add
+      // (for better address arithmetic) if the LHS and RHS of the OR are
+      // provably disjoint.
+      KnownBits LHSKnown = DAG.computeKnownBits(N.getOperand(0));
+
+      if ((LHSKnown.Zero.getZExtValue() | ~(uint64_t)Imm) == ~0ULL) {
+        if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0)))
+          Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
+        else
+          Base = N.getOperand(0);
+        Disp = DAG.getTargetConstant(Imm, dl, N.getValueType());
+        return true;
+      }
+    }
+  } else if (isIntS34Immediate(N, Imm)) { // If the address is a 34-bit const.
+    Disp = DAG.getTargetConstant(Imm, dl, N.getValueType());
+    Base = DAG.getRegister(PPC::ZERO8, N.getValueType());
+    return true;
+  }
+  return false;
+}
+
 /// SelectAddressRegRegOnly - Given the specified addressed, force it to be
 /// represented as an indexed [r+r] operation.
 bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base,
Index: llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -293,6 +293,13 @@
                                               Align(16));
     }
 
+    /// SelectAddrImmX34 - Returns true if the address N can be represented by
+    /// a base register plus a signed 34-bit displacement. Suitable for use by
+    /// PSTXVP and friends.
+    bool SelectAddrImmX34(SDValue N, SDValue &Disp, SDValue &Base) {
+      return PPCLowering->SelectAddressRegImm34(N, Disp, Base, *CurDAG);
+    }
+
     // Select an address into a single register.
     bool SelectAddr(SDValue N, SDValue &Base) {
       Base = N;
Index: llvm/include/llvm/IR/IntrinsicsPowerPC.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsPowerPC.td
+++ llvm/include/llvm/IR/IntrinsicsPowerPC.td
@@ -1422,6 +1422,14 @@
   def int_ppc_mma_xxsetaccz :
         Intrinsic<[llvm_v512i1_ty], [], [IntrNoMem]>;
 
+  def int_ppc_mma_lxvp :
+        Intrinsic<[llvm_v256i1_ty], [llvm_ptr_ty],
+                  [IntrReadMem, IntrArgMemOnly]>;
+
+  def int_ppc_mma_stxvp :
+        Intrinsic<[], [llvm_v256i1_ty, llvm_ptr_ty],
+                  [IntrWriteMem, IntrArgMemOnly]>;
+
   // MMA Reduced-Precision: Outer Product Intrinsic Definitions.
   defm int_ppc_mma_xvi4ger8 :
         PowerPC_MMA_ACC_PP_Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty]>;
Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -14776,6 +14776,19 @@
       break;
   #include "clang/Basic/BuiltinsPPC.def"
     }
+    if (BuiltinID == PPC::BI__builtin_mma_lxvp ||
+        BuiltinID == PPC::BI__builtin_mma_stxvp) {
+      if (BuiltinID == PPC::BI__builtin_mma_lxvp) {
+        Ops[1] = Builder.CreateBitCast(Ops[1], Int8PtrTy);
+        Ops[0] = Builder.CreateGEP(Ops[1], Ops[0]);
+      } else {
+        Ops[2] = Builder.CreateBitCast(Ops[2], Int8PtrTy);
+        Ops[1] = Builder.CreateGEP(Ops[2], Ops[1]);
+      }
+      Ops.pop_back();
+      llvm::Function *F = CGM.getIntrinsic(ID);
+      return Builder.CreateCall(F, Ops, "");
+    }
     SmallVector<Value*, 4> CallOps;
     if (Accumulate) {
       Address Addr = EmitPointerWithAlignment(E->getArg(0));
Index: clang/include/clang/Basic/BuiltinsPPC.def
===================================================================
--- clang/include/clang/Basic/BuiltinsPPC.def
+++ clang/include/clang/Basic/BuiltinsPPC.def
@@ -738,6 +738,8 @@
 MMA_BUILTIN(pmxvbf16ger2pn, "vW512*VVi15i15i3", true)
 MMA_BUILTIN(pmxvbf16ger2np, "vW512*VVi15i15i3", true)
 MMA_BUILTIN(pmxvbf16ger2nn, "vW512*VVi15i15i3", true)
+MMA_BUILTIN(lxvp, "W256SLLiW256C*", false)
+MMA_BUILTIN(stxvp, "vW256SLLiW256C*", false)
 
 // FIXME: Obviously incomplete.
 
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to the sender.