bsaleil updated this revision to Diff 294776.
bsaleil added a comment.
Use early exit in lowering functions and extend test case
Repository:
rG LLVM Github Monorepo
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D84968/new/
https://reviews.llvm.org/D84968
Files:
clang/lib/Basic/Targets/PPC.h
clang/test/CodeGen/target-data.c
llvm/lib/Target/PowerPC/PPCISelLowering.cpp
llvm/lib/Target/PowerPC/PPCISelLowering.h
llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
llvm/lib/Target/PowerPC/PPCInstrPrefix.td
llvm/lib/Target/PowerPC/PPCRegisterInfo.td
llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
llvm/test/CodeGen/PowerPC/mma-acc-memops.ll
Index: llvm/test/CodeGen/PowerPC/mma-acc-memops.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/PowerPC/mma-acc-memops.ll
@@ -0,0 +1,238 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=LE-PAIRED
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=BE-PAIRED
+
+@f = common local_unnamed_addr global <512 x i1> zeroinitializer, align 16
+@g = common local_unnamed_addr global <256 x i1> zeroinitializer, align 16
+
+define void @testLdSt(i64 %SrcIdx, i64 %DstIdx) {
+; LE-PAIRED-LABEL: testLdSt:
+; LE-PAIRED: # %bb.0: # %entry
+; LE-PAIRED-NEXT: plxv vs1, f@PCREL+96(0), 1
+; LE-PAIRED-NEXT: plxv vs0, f@PCREL+112(0), 1
+; LE-PAIRED-NEXT: plxv vs3, f@PCREL+64(0), 1
+; LE-PAIRED-NEXT: plxv vs2, f@PCREL+80(0), 1
+; LE-PAIRED-NEXT: xxmtacc acc0
+; LE-PAIRED-NEXT: xxmfacc acc0
+; LE-PAIRED-NEXT: pstxv vs0, f@PCREL+176(0), 1
+; LE-PAIRED-NEXT: pstxv vs1, f@PCREL+160(0), 1
+; LE-PAIRED-NEXT: pstxv vs2, f@PCREL+144(0), 1
+; LE-PAIRED-NEXT: pstxv vs3, f@PCREL+128(0), 1
+; LE-PAIRED-NEXT: blr
+;
+; BE-PAIRED-LABEL: testLdSt:
+; BE-PAIRED: # %bb.0: # %entry
+; BE-PAIRED-NEXT: addis r3, r2, .LC0@toc@ha
+; BE-PAIRED-NEXT: ld r3, .LC0@toc@l(r3)
+; BE-PAIRED-NEXT: lxv vs1, 80(r3)
+; BE-PAIRED-NEXT: lxv vs0, 64(r3)
+; BE-PAIRED-NEXT: lxv vs3, 112(r3)
+; BE-PAIRED-NEXT: lxv vs2, 96(r3)
+; BE-PAIRED-NEXT: xxmtacc acc0
+; BE-PAIRED-NEXT: xxmfacc acc0
+; BE-PAIRED-NEXT: stxv vs1, 144(r3)
+; BE-PAIRED-NEXT: stxv vs0, 128(r3)
+; BE-PAIRED-NEXT: stxv vs3, 176(r3)
+; BE-PAIRED-NEXT: stxv vs2, 160(r3)
+; BE-PAIRED-NEXT: blr
+entry:
+ %arrayidx = getelementptr inbounds <512 x i1>, <512 x i1>* @f, i64 1
+ %0 = load <512 x i1>, <512 x i1>* %arrayidx, align 64
+ %arrayidx1 = getelementptr inbounds <512 x i1>, <512 x i1>* @f, i64 2
+ store <512 x i1> %0, <512 x i1>* %arrayidx1, align 64
+ ret void
+}
+
+define void @testXLdSt(i64 %SrcIdx, i64 %DstIdx) {
+; LE-PAIRED-LABEL: testXLdSt:
+; LE-PAIRED: # %bb.0: # %entry
+; LE-PAIRED-NEXT: sldi r3, r3, 6
+; LE-PAIRED-NEXT: paddi r5, 0, f@PCREL, 1
+; LE-PAIRED-NEXT: add r6, r5, r3
+; LE-PAIRED-NEXT: lxv vs1, 32(r6)
+; LE-PAIRED-NEXT: lxv vs0, 48(r6)
+; LE-PAIRED-NEXT: lxvx vs3, r5, r3
+; LE-PAIRED-NEXT: lxv vs2, 16(r6)
+; LE-PAIRED-NEXT: sldi r3, r4, 6
+; LE-PAIRED-NEXT: xxmtacc acc0
+; LE-PAIRED-NEXT: xxmfacc acc0
+; LE-PAIRED-NEXT: stxvx vs3, r5, r3
+; LE-PAIRED-NEXT: add r3, r5, r3
+; LE-PAIRED-NEXT: stxv vs0, 48(r3)
+; LE-PAIRED-NEXT: stxv vs1, 32(r3)
+; LE-PAIRED-NEXT: stxv vs2, 16(r3)
+; LE-PAIRED-NEXT: blr
+;
+; BE-PAIRED-LABEL: testXLdSt:
+; BE-PAIRED: # %bb.0: # %entry
+; BE-PAIRED-NEXT: addis r5, r2, .LC0@toc@ha
+; BE-PAIRED-NEXT: sldi r3, r3, 6
+; BE-PAIRED-NEXT: ld r5, .LC0@toc@l(r5)
+; BE-PAIRED-NEXT: add r6, r5, r3
+; BE-PAIRED-NEXT: lxvx vs0, r5, r3
+; BE-PAIRED-NEXT: sldi r3, r4, 6
+; BE-PAIRED-NEXT: lxv vs1, 16(r6)
+; BE-PAIRED-NEXT: lxv vs3, 48(r6)
+; BE-PAIRED-NEXT: lxv vs2, 32(r6)
+; BE-PAIRED-NEXT: xxmtacc acc0
+; BE-PAIRED-NEXT: xxmfacc acc0
+; BE-PAIRED-NEXT: stxvx vs0, r5, r3
+; BE-PAIRED-NEXT: add r3, r5, r3
+; BE-PAIRED-NEXT: stxv vs1, 16(r3)
+; BE-PAIRED-NEXT: stxv vs3, 48(r3)
+; BE-PAIRED-NEXT: stxv vs2, 32(r3)
+; BE-PAIRED-NEXT: blr
+entry:
+ %arrayidx = getelementptr inbounds <512 x i1>, <512 x i1>* @f, i64 %SrcIdx
+ %0 = load <512 x i1>, <512 x i1>* %arrayidx, align 64
+ %arrayidx1 = getelementptr inbounds <512 x i1>, <512 x i1>* @f, i64 %DstIdx
+ store <512 x i1> %0, <512 x i1>* %arrayidx1, align 64
+ ret void
+}
+
+define void @testUnalignedLdSt() {
+; LE-PAIRED-LABEL: testUnalignedLdSt:
+; LE-PAIRED: # %bb.0: # %entry
+; LE-PAIRED-NEXT: plxv vs1, f@PCREL+43(0), 1
+; LE-PAIRED-NEXT: plxv vs0, f@PCREL+59(0), 1
+; LE-PAIRED-NEXT: plxv vs3, f@PCREL+11(0), 1
+; LE-PAIRED-NEXT: plxv vs2, f@PCREL+27(0), 1
+; LE-PAIRED-NEXT: xxmtacc acc0
+; LE-PAIRED-NEXT: xxmfacc acc0
+; LE-PAIRED-NEXT: pstxv vs0, f@PCREL+67(0), 1
+; LE-PAIRED-NEXT: pstxv vs1, f@PCREL+51(0), 1
+; LE-PAIRED-NEXT: pstxv vs2, f@PCREL+35(0), 1
+; LE-PAIRED-NEXT: pstxv vs3, f@PCREL+19(0), 1
+; LE-PAIRED-NEXT: blr
+;
+; BE-PAIRED-LABEL: testUnalignedLdSt:
+; BE-PAIRED: # %bb.0: # %entry
+; BE-PAIRED-NEXT: addis r3, r2, .LC0@toc@ha
+; BE-PAIRED-NEXT: li r4, 11
+; BE-PAIRED-NEXT: ld r3, .LC0@toc@l(r3)
+; BE-PAIRED-NEXT: lxvx vs0, r3, r4
+; BE-PAIRED-NEXT: li r4, 27
+; BE-PAIRED-NEXT: lxvx vs1, r3, r4
+; BE-PAIRED-NEXT: li r4, 43
+; BE-PAIRED-NEXT: lxvx vs2, r3, r4
+; BE-PAIRED-NEXT: li r4, 59
+; BE-PAIRED-NEXT: lxvx vs3, r3, r4
+; BE-PAIRED-NEXT: li r4, 35
+; BE-PAIRED-NEXT: xxmtacc acc0
+; BE-PAIRED-NEXT: xxmfacc acc0
+; BE-PAIRED-NEXT: stxvx vs1, r3, r4
+; BE-PAIRED-NEXT: li r4, 19
+; BE-PAIRED-NEXT: stxvx vs0, r3, r4
+; BE-PAIRED-NEXT: li r4, 67
+; BE-PAIRED-NEXT: stxvx vs3, r3, r4
+; BE-PAIRED-NEXT: li r4, 51
+; BE-PAIRED-NEXT: stxvx vs2, r3, r4
+; BE-PAIRED-NEXT: blr
+entry:
+ %0 = bitcast <512 x i1>* @f to i8*
+ %add.ptr = getelementptr inbounds i8, i8* %0, i64 11
+ %add.ptr1 = getelementptr inbounds i8, i8* %0, i64 19
+ %1 = bitcast i8* %add.ptr to <512 x i1>*
+ %2 = bitcast i8* %add.ptr1 to <512 x i1>*
+ %3 = load <512 x i1>, <512 x i1>* %1, align 64
+ store <512 x i1> %3, <512 x i1>* %2, align 64
+ ret void
+}
+
+define void @testLdStPair(i64 %SrcIdx, i64 %DstIdx) {
+; LE-PAIRED-LABEL: testLdStPair:
+; LE-PAIRED: # %bb.0: # %entry
+; LE-PAIRED-NEXT: plxv vs1, g@PCREL+32(0), 1
+; LE-PAIRED-NEXT: plxv vs0, g@PCREL+48(0), 1
+; LE-PAIRED-NEXT: pstxv vs1, g@PCREL+64(0), 1
+; LE-PAIRED-NEXT: pstxv vs0, g@PCREL+80(0), 1
+; LE-PAIRED-NEXT: blr
+;
+; BE-PAIRED-LABEL: testLdStPair:
+; BE-PAIRED: # %bb.0: # %entry
+; BE-PAIRED-NEXT: addis r3, r2, .LC1@toc@ha
+; BE-PAIRED-NEXT: ld r3, .LC1@toc@l(r3)
+; BE-PAIRED-NEXT: lxv vs1, 48(r3)
+; BE-PAIRED-NEXT: lxv vs0, 32(r3)
+; BE-PAIRED-NEXT: stxv vs1, 80(r3)
+; BE-PAIRED-NEXT: stxv vs0, 64(r3)
+; BE-PAIRED-NEXT: blr
+entry:
+ %arrayidx = getelementptr inbounds <256 x i1>, <256 x i1>* @g, i64 1
+ %0 = load <256 x i1>, <256 x i1>* %arrayidx, align 64
+ %arrayidx1 = getelementptr inbounds <256 x i1>, <256 x i1>* @g, i64 2
+ store <256 x i1> %0, <256 x i1>* %arrayidx1, align 64
+ ret void
+}
+
+define void @testXLdStPair(i64 %SrcIdx, i64 %DstIdx) {
+; LE-PAIRED-LABEL: testXLdStPair:
+; LE-PAIRED: # %bb.0: # %entry
+; LE-PAIRED-NEXT: sldi r3, r3, 5
+; LE-PAIRED-NEXT: paddi r5, 0, g@PCREL, 1
+; LE-PAIRED-NEXT: add r6, r5, r3
+; LE-PAIRED-NEXT: lxvx vs1, r5, r3
+; LE-PAIRED-NEXT: sldi r3, r4, 5
+; LE-PAIRED-NEXT: lxv vs0, 16(r6)
+; LE-PAIRED-NEXT: add r4, r5, r3
+; LE-PAIRED-NEXT: stxvx vs1, r5, r3
+; LE-PAIRED-NEXT: stxv vs0, 16(r4)
+; LE-PAIRED-NEXT: blr
+;
+; BE-PAIRED-LABEL: testXLdStPair:
+; BE-PAIRED: # %bb.0: # %entry
+; BE-PAIRED-NEXT: addis r5, r2, .LC1@toc@ha
+; BE-PAIRED-NEXT: sldi r3, r3, 5
+; BE-PAIRED-NEXT: ld r5, .LC1@toc@l(r5)
+; BE-PAIRED-NEXT: add r6, r5, r3
+; BE-PAIRED-NEXT: lxvx vs0, r5, r3
+; BE-PAIRED-NEXT: sldi r3, r4, 5
+; BE-PAIRED-NEXT: lxv vs1, 16(r6)
+; BE-PAIRED-NEXT: add r4, r5, r3
+; BE-PAIRED-NEXT: stxvx vs0, r5, r3
+; BE-PAIRED-NEXT: stxv vs1, 16(r4)
+; BE-PAIRED-NEXT: blr
+entry:
+ %arrayidx = getelementptr inbounds <256 x i1>, <256 x i1>* @g, i64 %SrcIdx
+ %0 = load <256 x i1>, <256 x i1>* %arrayidx, align 64
+ %arrayidx1 = getelementptr inbounds <256 x i1>, <256 x i1>* @g, i64 %DstIdx
+ store <256 x i1> %0, <256 x i1>* %arrayidx1, align 64
+ ret void
+}
+
+define void @testUnalignedLdStPair() {
+; LE-PAIRED-LABEL: testUnalignedLdStPair:
+; LE-PAIRED: # %bb.0: # %entry
+; LE-PAIRED-NEXT: plxv vs1, g@PCREL+11(0), 1
+; LE-PAIRED-NEXT: plxv vs0, g@PCREL+27(0), 1
+; LE-PAIRED-NEXT: pstxv vs1, g@PCREL+19(0), 1
+; LE-PAIRED-NEXT: pstxv vs0, g@PCREL+35(0), 1
+; LE-PAIRED-NEXT: blr
+;
+; BE-PAIRED-LABEL: testUnalignedLdStPair:
+; BE-PAIRED: # %bb.0: # %entry
+; BE-PAIRED-NEXT: addis r3, r2, .LC1@toc@ha
+; BE-PAIRED-NEXT: li r4, 11
+; BE-PAIRED-NEXT: ld r3, .LC1@toc@l(r3)
+; BE-PAIRED-NEXT: lxvx vs0, r3, r4
+; BE-PAIRED-NEXT: li r4, 27
+; BE-PAIRED-NEXT: lxvx vs1, r3, r4
+; BE-PAIRED-NEXT: li r4, 35
+; BE-PAIRED-NEXT: stxvx vs1, r3, r4
+; BE-PAIRED-NEXT: li r4, 19
+; BE-PAIRED-NEXT: stxvx vs0, r3, r4
+; BE-PAIRED-NEXT: blr
+entry:
+ %0 = bitcast <256 x i1>* @g to i8*
+ %add.ptr = getelementptr inbounds i8, i8* %0, i64 11
+ %add.ptr1 = getelementptr inbounds i8, i8* %0, i64 19
+ %1 = bitcast i8* %add.ptr to <256 x i1>*
+ %2 = bitcast i8* %add.ptr1 to <256 x i1>*
+ %3 = load <256 x i1>, <256 x i1>* %1, align 64
+ store <256 x i1> %3, <256 x i1>* %2, align 64
+ ret void
+}
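
For context, IR of this shape is roughly what clang produces for plain copies of the MMA types. The following C analogue is only a sketch, under the assumption that __vector_quad maps to v512i1 (the clang-side type mapping is not part of this patch, and the test IR above was reduced by hand):

  /* Hypothetical analogue of testLdSt; f[1] and f[2] correspond to the
     getelementptr indices 1 and 2 on the <512 x i1> pointer. */
  __vector_quad *f;
  void testLdSt(void) {
    f[2] = f[1];  /* 64-byte load at f+64, 64-byte store at f+128 */
  }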
Index: llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
===================================================================
--- llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -156,6 +156,13 @@
else
Ret += "-n32";
+ // Specify the vector alignment explicitly. For v256i1 and v512i1, the
+ // calculated alignment would be 256*alignment(i1) and 512*alignment(i1),
+ // which is 256 and 512 bytes, far more alignment than these types need.
+ if ((T.getArch() == Triple::ppc64le || T.getArch() == Triple::ppc64) &&
+ (T.isOSAIX() || T.isOSLinux()))
+ Ret += "-v256:256:256-v512:512:512";
+
return Ret;
}
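
For concreteness, the resulting string for powerpc64le-linux is the one checked by the clang test further down:

  e-m:e-i64:64-n32:64-v256:256:256-v512:512:512

i.e. v256i1 and v512i1 get 256-bit (32-byte) and 512-bit (64-byte) alignment, matching the size of the underlying VSX register pair and quad.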
Index: llvm/lib/Target/PowerPC/PPCRegisterInfo.td
===================================================================
--- llvm/lib/Target/PowerPC/PPCRegisterInfo.td
+++ llvm/lib/Target/PowerPC/PPCRegisterInfo.td
@@ -104,6 +104,15 @@
let SubRegs = subregs;
}
+// UACC - One of the 8 512-bit VSX accumulators prior to being primed.
+// Without using this register class, the register allocator has no way to
+// differentiate a primed accumulator from an unprimed accumulator.
+// This may result in invalid copies between primed and unprimed accumulators.
+class UACC<bits<3> num, string n, list<Register> subregs> : PPCReg<n> {
+ let HWEncoding{2-0} = num;
+ let SubRegs = subregs;
+}
+
// VSR Pairs - One of the 32 paired even-odd consecutive VSRs.
class VSRPair<bits<5> num, string n, list<Register> subregs> : PPCReg<n> {
let HWEncoding{4-0} = num;
@@ -420,6 +429,22 @@
let Size = 512;
}
+let SubRegIndices = [sub_pair0, sub_pair1] in {
+ def UACC0 : UACC<0, "acc0", [VSRp0, VSRp1]>, DwarfRegNum<[0, 0]>;
+ def UACC1 : UACC<1, "acc1", [VSRp2, VSRp3]>, DwarfRegNum<[0, 0]>;
+ def UACC2 : UACC<2, "acc2", [VSRp4, VSRp5]>, DwarfRegNum<[0, 0]>;
+ def UACC3 : UACC<3, "acc3", [VSRp6, VSRp7]>, DwarfRegNum<[0, 0]>;
+ def UACC4 : UACC<4, "acc4", [VSRp8, VSRp9]>, DwarfRegNum<[0, 0]>;
+ def UACC5 : UACC<5, "acc5", [VSRp10, VSRp11]>, DwarfRegNum<[0, 0]>;
+ def UACC6 : UACC<6, "acc6", [VSRp12, VSRp13]>, DwarfRegNum<[0, 0]>;
+ def UACC7 : UACC<7, "acc7", [VSRp14, VSRp15]>, DwarfRegNum<[0, 0]>;
+}
+def UACCRC : RegisterClass<"PPC", [v512i1], 128,
+ (add UACC0, UACC1, UACC2, UACC3,
+ UACC4, UACC5, UACC6, UACC7)> {
+ let Size = 512;
+}
+
// Allocate in the same order as the underlying VSX registers.
def VSRpRC :
RegisterClass<"PPC", [v256i1], 128,
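
To illustrate the UACC comment above, here is a rough MIR-flavoured sketch (not code from this patch) of how a v512i1 value flows between the two classes; BUILD_UACC is the only way to cross from UACCRC into ACCRC, so the allocator cannot silently coalesce a primed and an unprimed value:

  %0:uaccrc = INSERT_SUBREG ...        ; v512i1 built from two VSX pairs
  %1:accrc  = BUILD_UACC %0            ; explicit UACCRC -> ACCRC transfer
  %2:accrc  = XXMTACC %1               ; prime the accumulator

The corresponding selection patterns are in PPCInstrPrefix.td below.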
Index: llvm/lib/Target/PowerPC/PPCInstrPrefix.td
===================================================================
--- llvm/lib/Target/PowerPC/PPCInstrPrefix.td
+++ llvm/lib/Target/PowerPC/PPCInstrPrefix.td
@@ -5,12 +5,35 @@
def SDT_PPCSplat32 : SDTypeProfile<1, 3, [ SDTCisVT<0, v2i64>,
SDTCisVec<1>, SDTCisInt<2>, SDTCisInt<3>
]>;
+def SDT_PPCAccBuild : SDTypeProfile<1, 4, [
+ SDTCisVT<0, v512i1>, SDTCisVT<1, v4i32>, SDTCisVT<2, v4i32>,
+ SDTCisVT<3, v4i32>, SDTCisVT<4, v4i32>
+]>;
+def SDT_PPCPairBuild : SDTypeProfile<1, 2, [
+ SDTCisVT<0, v256i1>, SDTCisVT<1, v4i32>, SDTCisVT<2, v4i32>
+]>;
+def SDT_PPCAccExtractVsx : SDTypeProfile<1, 2, [
+ SDTCisVT<0, v4i32>, SDTCisVT<1, v512i1>, SDTCisInt<2>
+]>;
+def SDT_PPCPairExtractVsx : SDTypeProfile<1, 2, [
+ SDTCisVT<0, v4i32>, SDTCisVT<1, v256i1>, SDTCisInt<2>
+]>;
+def SDT_PPCxxmfacc : SDTypeProfile<1, 1, [
+ SDTCisVT<0, v512i1>, SDTCisVT<1, v512i1>
+]>;
//===----------------------------------------------------------------------===//
// ISA 3.1 specific PPCISD nodes.
//
def PPCxxsplti32dx : SDNode<"PPCISD::XXSPLTI32DX", SDT_PPCSplat32, []>;
+def PPCAccBuild : SDNode<"PPCISD::ACC_BUILD", SDT_PPCAccBuild, []>;
+def PPCPairBuild : SDNode<"PPCISD::PAIR_BUILD", SDT_PPCPairBuild, []>;
+def PPCAccExtractVsx : SDNode<"PPCISD::EXTRACT_VSX_REG", SDT_PPCAccExtractVsx,
+ []>;
+def PPCPairExtractVsx : SDNode<"PPCISD::EXTRACT_VSX_REG", SDT_PPCPairExtractVsx,
+ []>;
+def PPCxxmfacc : SDNode<"PPCISD::XXMFACC", SDT_PPCxxmfacc, []>;
//===----------------------------------------------------------------------===//
@@ -525,6 +548,16 @@
let ParserMatchClass = PPCRegVSRpRCAsmOperand;
}
+def PPCRegVSRpEvenRCAsmOperand : AsmOperandClass {
+ let Name = "RegVSRpEvenRC"; let PredicateMethod = "isVSRpEvenRegNumber";
+}
+
+def vsrpevenrc : RegisterOperand<VSRpRC> {
+ let ParserMatchClass = PPCRegVSRpEvenRCAsmOperand;
+ let EncoderMethod = "getVSRpEvenEncoding";
+ let DecoderMethod = "decodeVSRpEvenOperands";
+}
+
class DQForm_XTp5_RA17_MEM<bits<6> opcode, bits<4> xo, dag OOL, dag IOL,
string asmstr, InstrItinClass itin, list<dag> pattern>
: I<opcode, OOL, IOL, asmstr, itin> {
@@ -594,6 +627,10 @@
let ParserMatchClass = PPCRegACCRCAsmOperand;
}
+def uacc : RegisterOperand<UACCRC> {
+ let ParserMatchClass = PPCRegACCRCAsmOperand;
+}
+
// [PO AS XO2 XO]
class XForm_AT3<bits<6> opcode, bits<5> xo2, bits<10> xo, dag OOL, dag IOL,
string asmstr, InstrItinClass itin, list<dag> pattern>
@@ -774,6 +811,11 @@
XForm_AT3<31, 1, 177, (outs acc:$AT), (ins acc:$ATi), "xxmtacc $AT",
IIC_VecGeneral, []>, RegConstraint<"$ATi = $AT">,
NoEncode<"$ATi">;
+ def KILL_PAIR : PPCPostRAExpPseudo<(outs vsrprc:$XTp), (ins vsrprc:$XSp),
+ "#KILL_PAIR", []>,
+ RegConstraint<"$XTp = $XSp">;
+ def BUILD_UACC : PPCPostRAExpPseudo<(outs acc:$AT), (ins uacc:$AS),
+ "#BUILD_UACC $AT, $AS", []>;
// We define XXSETACCZ as rematerializable to undo CSE of that intrinsic in
// the backend. We avoid CSE here because it generates a copy of the acc
// register and this copy is more expensive than calling the intrinsic again.
@@ -784,6 +826,51 @@
}
}
+def Concats {
+ dag VecsToVecPair0 =
+ (v256i1 (INSERT_SUBREG
+ (INSERT_SUBREG (IMPLICIT_DEF), $vs0, sub_vsx1),
+ $vs1, sub_vsx0));
+ dag VecsToVecPair1 =
+ (v256i1 (INSERT_SUBREG
+ (INSERT_SUBREG (IMPLICIT_DEF), $vs2, sub_vsx1),
+ $vs3, sub_vsx0));
+ dag VecsToVecQuad =
+ (BUILD_UACC (INSERT_SUBREG
+ (INSERT_SUBREG (v512i1 (IMPLICIT_DEF)),
+ (KILL_PAIR VecsToVecPair0), sub_pair0),
+ (KILL_PAIR VecsToVecPair1), sub_pair1));
+}
+
+def Extracts {
+ dag Pair0 = (v256i1 (EXTRACT_SUBREG $v, sub_pair0));
+ dag Pair1 = (v256i1 (EXTRACT_SUBREG $v, sub_pair1));
+ dag Vec0 = (v4i32 (EXTRACT_SUBREG Pair0, sub_vsx0));
+ dag Vec1 = (v4i32 (EXTRACT_SUBREG Pair0, sub_vsx1));
+ dag Vec2 = (v4i32 (EXTRACT_SUBREG Pair1, sub_vsx0));
+ dag Vec3 = (v4i32 (EXTRACT_SUBREG Pair1, sub_vsx1));
+}
+
+let Predicates = [MMA] in {
+ def : Pat<(v512i1 (PPCAccBuild v4i32:$vs1, v4i32:$vs0, v4i32:$vs3, v4i32:$vs2)),
+ (XXMTACC Concats.VecsToVecQuad)>;
+ def : Pat<(v256i1 (PPCPairBuild v4i32:$vs1, v4i32:$vs0)),
+ Concats.VecsToVecPair0>;
+ def : Pat<(v512i1 (PPCxxmfacc v512i1:$AS)), (XXMFACC acc:$AS)>;
+ def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, (i64 0))),
+ Extracts.Vec0>;
+ def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, (i64 1))),
+ Extracts.Vec1>;
+ def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, (i64 2))),
+ Extracts.Vec2>;
+ def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, (i64 3))),
+ Extracts.Vec3>;
+ def : Pat<(v4i32 (PPCPairExtractVsx vsrpevenrc:$v, (i64 0))),
+ (v4i32 (EXTRACT_SUBREG $v, sub_vsx0))>;
+ def : Pat<(v4i32 (PPCPairExtractVsx vsrpevenrc:$v, (i64 1))),
+ (v4i32 (EXTRACT_SUBREG $v, sub_vsx1))>;
+}
+
let mayLoad = 1, mayStore = 0, Predicates = [PairedVectorMemops] in {
def LXVP : DQForm_XTp5_RA17_MEM<6, 0, (outs vsrprc:$XTp),
(ins memrix16:$DQ_RA), "lxvp $XTp, $DQ_RA",
Index: llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
===================================================================
--- llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -2465,6 +2465,31 @@
auto DL = MI.getDebugLoc();
switch (MI.getOpcode()) {
+ case PPC::BUILD_UACC: {
+ MCRegister ACC = MI.getOperand(0).getReg();
+ MCRegister UACC = MI.getOperand(1).getReg();
+ if (ACC - PPC::ACC0 != UACC - PPC::UACC0) {
+ MCRegister SrcVSR = PPC::VSL0 + (UACC - PPC::UACC0) * 4;
+ MCRegister DstVSR = PPC::VSL0 + (ACC - PPC::ACC0) * 4;
+ // FIXME: This can easily be improved to look up to the top of the MBB
+ // to see if the inputs are XXLOR's. If they are and SrcReg is killed,
+ // we can just re-target any such XXLOR's to DstVSR + offset.
+ for (int VecNo = 0; VecNo < 4; VecNo++)
+ BuildMI(MBB, MI, DL, get(PPC::XXLOR), DstVSR + VecNo)
+ .addReg(SrcVSR + VecNo)
+ .addReg(SrcVSR + VecNo);
+ }
+ // BUILD_UACC is expanded to 4 copies of the underlying VSX registers.
+ // So after building the 4 copies, we can replace the BUILD_UACC instruction
+ // with a NOP.
+ LLVM_FALLTHROUGH;
+ }
+ case PPC::KILL_PAIR: {
+ MI.setDesc(get(PPC::UNENCODED_NOP));
+ MI.RemoveOperand(1);
+ MI.RemoveOperand(0);
+ return true;
+ }
case TargetOpcode::LOAD_STACK_GUARD: {
assert(Subtarget.isTargetLinux() &&
"Only Linux target is expected to contain LOAD_STACK_GUARD");
Index: llvm/lib/Target/PowerPC/PPCISelLowering.h
===================================================================
--- llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -450,6 +450,21 @@
/// available. This is used with ADD_TLS to produce an add like PADDI.
TLS_LOCAL_EXEC_MAT_ADDR,
+ /// ACC_BUILD = Build an accumulator register from 4 VSX registers.
+ ACC_BUILD,
+
+ /// PAIR_BUILD = Build a vector pair register from 2 VSX registers.
+ PAIR_BUILD,
+
+ /// EXTRACT_VSX_REG = Extract one of the underlying VSX registers of
+ /// an accumulator or pair register. This node is needed because
+ /// EXTRACT_SUBVECTOR expects the input and output vectors to have the same
+ /// element type.
+ EXTRACT_VSX_REG,
+
+ /// XXMFACC = This corresponds to the xxmfacc instruction.
+ XXMFACC,
+
// Constrained conversion from floating point to int
STRICT_FCTIDZ = ISD::FIRST_TARGET_STRICTFP_OPCODE,
STRICT_FCTIWZ,
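
To spell out the EXTRACT_VSX_REG rationale: extracting the second 128-bit slice of an accumulator with the generic node would have to be written as

  (v128i1 (extract_subvector (v512i1 %acc), 128))

because EXTRACT_SUBVECTOR must preserve the i1 element type, whereas the lowering wants each slice as an ordinary VSX value (v16i8 or v4i32), which is exactly what

  (PPCISD::EXTRACT_VSX_REG %acc, 1)

produces.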
Index: llvm/lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1181,6 +1181,18 @@
}
}
+ if (Subtarget.pairedVectorMemops()) {
+ addRegisterClass(MVT::v256i1, &PPC::VSRpRCRegClass);
+ setOperationAction(ISD::LOAD, MVT::v256i1, Custom);
+ setOperationAction(ISD::STORE, MVT::v256i1, Custom);
+ }
+ if (Subtarget.hasMMA()) {
+ addRegisterClass(MVT::v512i1, &PPC::UACCRCRegClass);
+ setOperationAction(ISD::LOAD, MVT::v512i1, Custom);
+ setOperationAction(ISD::STORE, MVT::v512i1, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v512i1, Custom);
+ }
+
if (Subtarget.has64BitSupport())
setOperationAction(ISD::PREFETCH, MVT::Other, Legal);
@@ -1523,6 +1535,10 @@
return "PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR";
case PPCISD::TLS_LOCAL_EXEC_MAT_ADDR:
return "PPCISD::TLS_LOCAL_EXEC_MAT_ADDR";
+ case PPCISD::ACC_BUILD: return "PPCISD::ACC_BUILD";
+ case PPCISD::PAIR_BUILD: return "PPCISD::PAIR_BUILD";
+ case PPCISD::EXTRACT_VSX_REG: return "PPCISD::EXTRACT_VSX_REG";
+ case PPCISD::XXMFACC: return "PPCISD::XXMFACC";
case PPCISD::LD_SPLAT: return "PPCISD::LD_SPLAT";
case PPCISD::FNMSUB: return "PPCISD::FNMSUB";
case PPCISD::STRICT_FADDRTZ:
@@ -7824,6 +7840,8 @@
}
SDValue PPCTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+ if (Op.getValueType().isVector())
+ return LowerVectorLoad(Op, DAG);
assert(Op.getValueType() == MVT::i1 &&
"Custom lowering only for i1 loads");
@@ -7847,6 +7865,9 @@
}
SDValue PPCTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+ if (Op.getOperand(1).getValueType().isVector())
+ return LowerVectorStore(Op, DAG);
+
assert(Op.getOperand(1).getValueType() == MVT::i1 &&
"Custom lowering only for i1 stores");
@@ -10581,6 +10602,94 @@
return Op;
}
+SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ LoadSDNode *LN = cast<LoadSDNode>(Op.getNode());
+ SDValue LoadChain = LN->getChain();
+ SDValue BasePtr = LN->getBasePtr();
+ EVT VT = Op.getValueType();
+
+ if (VT != MVT::v256i1 && VT != MVT::v512i1)
+ return Op;
+
+ // Type v256i1 is used for pairs and v512i1 is used for accumulators.
+ // Here we create 2 or 4 v16i8 loads to load the pair or accumulator value
+ // into 2 or 4 VSX registers.
+ assert((VT != MVT::v512i1 || Subtarget.hasMMA()) &&
+ "Type unsupported without MMA");
+ assert((VT != MVT::v256i1 || Subtarget.pairedVectorMemops()) &&
+ "Type unsupported without paired vector support");
+ Align Alignment = LN->getAlign();
+ SmallVector<SDValue, 4> Loads;
+ SmallVector<SDValue, 4> LoadChains;
+ unsigned NumVecs = VT.getSizeInBits() / 128;
+ for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
+ SDValue Load =
+ DAG.getLoad(MVT::v16i8, dl, LoadChain, BasePtr,
+ LN->getPointerInfo().getWithOffset(Idx * 16),
+ commonAlignment(Alignment, Idx * 16),
+ LN->getMemOperand()->getFlags(), LN->getAAInfo());
+ BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+ DAG.getConstant(16, dl, BasePtr.getValueType()));
+ Loads.push_back(Load);
+ LoadChains.push_back(Load.getValue(1));
+ }
+ if (Subtarget.isLittleEndian()) {
+ std::reverse(Loads.begin(), Loads.end());
+ std::reverse(LoadChains.begin(), LoadChains.end());
+ }
+ SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
+ SDValue Value =
+ DAG.getNode(VT == MVT::v512i1 ? PPCISD::ACC_BUILD : PPCISD::PAIR_BUILD,
+ dl, VT, Loads);
+ SDValue RetOps[] = {Value, TF};
+ return DAG.getMergeValues(RetOps, dl);
+}
+
+SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ StoreSDNode *SN = cast<StoreSDNode>(Op.getNode());
+ SDValue StoreChain = SN->getChain();
+ SDValue BasePtr = SN->getBasePtr();
+ SDValue Value = SN->getValue();
+ EVT StoreVT = Value.getValueType();
+
+ if (StoreVT != MVT::v256i1 && StoreVT != MVT::v512i1)
+ return Op;
+
+ // Type v256i1 is used for pairs and v512i1 is used for accumulators.
+ // Here we create 2 or 4 v16i8 stores to store the underlying registers of
+ // the pair or accumulator individually.
+ assert((StoreVT != MVT::v512i1 || Subtarget.hasMMA()) &&
+ "Type unsupported without MMA");
+ assert((StoreVT != MVT::v256i1 || Subtarget.pairedVectorMemops()) &&
+ "Type unsupported without paired vector support");
+ Align Alignment = SN->getAlign();
+ SmallVector<SDValue, 4> Stores;
+ unsigned NumVecs = 2;
+ if (StoreVT == MVT::v512i1) {
+ Value = DAG.getNode(PPCISD::XXMFACC, dl, MVT::v512i1, Value);
+ NumVecs = 4;
+ }
+ for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
+ unsigned VecNum = Subtarget.isLittleEndian() ? NumVecs - 1 - Idx : Idx;
+ SDValue Elt = DAG.getNode(PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8, Value,
+ DAG.getConstant(VecNum, dl, MVT::i64));
+ SDValue Store =
+ DAG.getStore(StoreChain, dl, Elt, BasePtr,
+ SN->getPointerInfo().getWithOffset(Idx * 16),
+ commonAlignment(Alignment, Idx * 16),
+ SN->getMemOperand()->getFlags(), SN->getAAInfo());
+ BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+ DAG.getConstant(16, dl, BasePtr.getValueType()));
+ Stores.push_back(Store);
+ }
+ SDValue TF = DAG.getTokenFactor(dl, Stores);
+ return TF;
+}
+
SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
if (Op.getValueType() == MVT::v4i32) {
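
Tying the two lowerings back to mma-acc-memops.ll above: a v512i1 load becomes four adjacent v16i8 loads (offsets 0, 16, 32, 48, with the operand order reversed on little-endian) combined by ACC_BUILD, which the patterns select to the four plxv plus the xxmtacc in testLdSt; a v512i1 store first copies the accumulator out through XXMFACC and then extracts and stores the four registers, giving the xxmfacc plus four pstxv. A conceptual sketch of the store DAG on little-endian (not an actual -debug dump):

  t0 = PPCISD::XXMFACC t_acc
  t1 = PPCISD::EXTRACT_VSX_REG t0, 3   ; stored at offset 0
  t2 = PPCISD::EXTRACT_VSX_REG t0, 2   ; stored at offset 16
  t3 = PPCISD::EXTRACT_VSX_REG t0, 1   ; stored at offset 32
  t4 = PPCISD::EXTRACT_VSX_REG t0, 0   ; stored at offset 48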
Index: clang/test/CodeGen/target-data.c
===================================================================
--- clang/test/CodeGen/target-data.c
+++ clang/test/CodeGen/target-data.c
@@ -136,11 +136,27 @@
// RUN: %clang_cc1 -triple powerpc64-linux -o - -emit-llvm %s | \
// RUN: FileCheck %s -check-prefix=PPC64-LINUX
-// PPC64-LINUX: target datalayout = "E-m:e-i64:64-n32:64"
+// PPC64-LINUX: target datalayout = "E-m:e-i64:64-n32:64-v256:256:256-v512:512:512"
+
+// RUN: %clang_cc1 -triple powerpc64-linux -o - -emit-llvm -target-cpu future %s | \
+// RUN: FileCheck %s -check-prefix=PPC64-FUTURE
+// PPC64-FUTURE: target datalayout = "E-m:e-i64:64-n32:64-v256:256:256-v512:512:512"
+
+// RUN: %clang_cc1 -triple powerpc64-linux -o - -emit-llvm -target-cpu pwr10 %s | \
+// RUN: FileCheck %s -check-prefix=PPC64-P10
+// PPC64-P10: target datalayout = "E-m:e-i64:64-n32:64-v256:256:256-v512:512:512"
// RUN: %clang_cc1 -triple powerpc64le-linux -o - -emit-llvm %s | \
// RUN: FileCheck %s -check-prefix=PPC64LE-LINUX
-// PPC64LE-LINUX: target datalayout = "e-m:e-i64:64-n32:64"
+// PPC64LE-LINUX: target datalayout = "e-m:e-i64:64-n32:64-v256:256:256-v512:512:512"
+
+// RUN: %clang_cc1 -triple powerpc64le-linux -o - -emit-llvm -target-cpu future %s | \
+// RUN: FileCheck %s -check-prefix=PPC64LE-FUTURE
+// PPC64LE-FUTURE: target datalayout = "e-m:e-i64:64-n32:64-v256:256:256-v512:512:512"
+
+// RUN: %clang_cc1 -triple powerpc64le-linux -o - -emit-llvm -target-cpu pwr10 %s | \
+// RUN: FileCheck %s -check-prefix=PPC64LE-P10
+// PPC64LE-P10: target datalayout = "e-m:e-i64:64-n32:64-v256:256:256-v512:512:512"
// RUN: %clang_cc1 -triple nvptx-unknown -o - -emit-llvm %s | \
// RUN: FileCheck %s -check-prefix=NVPTX
Index: clang/lib/Basic/Targets/PPC.h
===================================================================
--- clang/lib/Basic/Targets/PPC.h
+++ clang/lib/Basic/Targets/PPC.h
@@ -404,19 +404,20 @@
LongWidth = LongAlign = PointerWidth = PointerAlign = 64;
IntMaxType = SignedLong;
Int64Type = SignedLong;
+ std::string DataLayout = "";
if (Triple.isOSAIX()) {
// TODO: Set appropriate ABI for AIX platform.
- resetDataLayout("E-m:a-i64:64-n32:64");
+ DataLayout = "E-m:a-i64:64-n32:64";
SuitableAlign = 64;
LongDoubleWidth = 64;
LongDoubleAlign = DoubleAlign = 32;
LongDoubleFormat = &llvm::APFloat::IEEEdouble();
} else if ((Triple.getArch() == llvm::Triple::ppc64le)) {
- resetDataLayout("e-m:e-i64:64-n32:64");
+ DataLayout = "e-m:e-i64:64-n32:64";
ABI = "elfv2";
} else {
- resetDataLayout("E-m:e-i64:64-n32:64");
+ DataLayout = "E-m:e-i64:64-n32:64";
ABI = "elfv1";
}
@@ -425,6 +426,10 @@
LongDoubleFormat = &llvm::APFloat::IEEEdouble();
}
+ if (Triple.isOSAIX() || Triple.isOSLinux())
+ DataLayout += "-v256:256:256-v512:512:512";
+ resetDataLayout(DataLayout);
+
// PPC64 supports atomics up to 8 bytes.
MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 64;
}