[PATCH] D96825: [AArch64] Adding Polynomial vadd Intrinsics support

Ryan Santhirarajan via Phabricator via cfe-commits Tue, 16 Feb 2021 17:43:12 -0800

rsanthir.quic created this revision.
rsanthir.quic added reviewers: t.p.northover, kevin.qin, labrinea, pbarrio, 
apazos.
Herald added subscribers: danielkiss, kristof.beyls.
rsanthir.quic requested review of this revision.
Herald added a project: clang.
Herald added a subscriber: cfe-commits.


This patch adds the following intrinsics:

  vadd_p8
  vadd_p16
  vadd_p64
  vaddq_p8
  vaddq_p16
  vaddq_p64
  vaddq_p128


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D96825

Files:
  clang/include/clang/Basic/arm_neon.td
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGen/aarch64-poly-add.c

Index: clang/test/CodeGen/aarch64-poly-add.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/aarch64-poly-add.c
@@ -0,0 +1,85 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
+// RUN: -disable-O0-optnone -ffp-contract=fast -emit-llvm -o - %s | opt -S -mem2reg \
+// RUN:  | FileCheck %s
+
+#include <arm_neon.h>
+
+// CHECK-LABEL: @test_vadd_p8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = xor <8 x i8> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT:    ret <8 x i8> [[TMP0]]
+//
+poly8x8_t test_vadd_p8(poly8x8_t a, poly8x8_t b) {
+  return vadd_p8 (a, b);
+}
+
+// CHECK-LABEL: @test_vadd_p16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = xor <8 x i8> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT:    ret <4 x i16> [[TMP3]]
+//
+poly16x4_t test_vadd_p16(poly16x4_t a, poly16x4_t b) {
+  return vadd_p16 (a, b);
+}
+
+// CHECK-LABEL: @test_vadd_p64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = xor <8 x i8> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64>
+// CHECK-NEXT:    ret <1 x i64> [[TMP3]]
+//
+poly64x1_t test_vadd_p64(poly64x1_t a, poly64x1_t b) {
+  return vadd_p64(a, b);
+}
+
+// CHECK-LABEL: @test_vaddq_p8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = xor <16 x i8> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT:    ret <16 x i8> [[TMP0]]
+//
+poly8x16_t test_vaddq_p8(poly8x16_t a, poly8x16_t b){
+  return vaddq_p8(a, b);
+}
+
+// CHECK-LABEL: @test_vaddq_p16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[B:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = xor <16 x i8> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-NEXT:    ret <8 x i16> [[TMP3]]
+//
+poly16x8_t test_vaddq_p16(poly16x8_t a, poly16x8_t b){
+  return vaddq_p16(a, b);
+}
+
+// CHECK-LABEL: @test_vaddq_p64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[B:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = xor <16 x i8> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK-NEXT:    ret <2 x i64> [[TMP3]]
+//
+poly64x2_t test_vaddq_p64(poly64x2_t a, poly64x2_t b){
+  return vaddq_p64(a, b);
+}
+
+// CHECK-LABEL: @test_vaddq_p128(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i128 [[B:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = xor <16 x i8> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT:    ret i128 [[TMP3]]
+//
+poly128_t test_vaddq_p128 (poly128_t a, poly128_t b){
+  return vaddq_p128(a, b);
+}
Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -5662,7 +5662,10 @@
   NEONMAP0(splatq_laneq_v),
   NEONMAP1(vabs_v, aarch64_neon_abs, 0),
   NEONMAP1(vabsq_v, aarch64_neon_abs, 0),
+  NEONMAP0(vadd_v),
   NEONMAP0(vaddhn_v),
+  NEONMAP0(vaddq_p128),
+  NEONMAP0(vaddq_v),
   NEONMAP1(vaesdq_v, aarch64_crypto_aesd, 0),
   NEONMAP1(vaeseq_v, aarch64_crypto_aese, 0),
   NEONMAP1(vaesimcq_v, aarch64_crypto_aesimc, 0),
@@ -6290,6 +6293,14 @@
     if (VTy->getElementType()->isFloatingPointTy())
       return EmitNeonCall(CGM.getIntrinsic(Intrinsic::fabs, Ty), Ops, "vabs");
     return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Ty), Ops, "vabs");
+  case NEON::BI__builtin_neon_vadd_v:
+  case NEON::BI__builtin_neon_vaddq_v: {
+    llvm::Type *VTy = llvm::FixedVectorType::get(Int8Ty, Quad ? 16 : 8);
+    Ops[0] = Builder.CreateBitCast(Ops[0], VTy);
+    Ops[1] = Builder.CreateBitCast(Ops[1], VTy);
+    Ops[0] =  Builder.CreateXor(Ops[0], Ops[1]);
+    return Builder.CreateBitCast(Ops[0], Ty);
+  }
   case NEON::BI__builtin_neon_vaddhn_v: {
     llvm::FixedVectorType *SrcTy =
         llvm::FixedVectorType::getExtendedElementVectorType(VTy);
@@ -9515,6 +9526,15 @@
   case NEON::BI__builtin_neon_vabsh_f16:
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::fabs, HalfTy), Ops, "vabs");
+  case NEON::BI__builtin_neon_vaddq_p128: {
+    llvm::Type *Ty = GetNeonType(this, NeonTypeFlags::Poly128);
+    Ops.push_back(EmitScalarExpr(E->getArg(1)));
+    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
+    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
+    Ops[0] =  Builder.CreateXor(Ops[0], Ops[1]);
+    llvm::Type *Int128Ty = llvm::Type::getIntNTy(getLLVMContext(), 128);
+    return Builder.CreateBitCast(Ops[0], Int128Ty);
+  }
   case NEON::BI__builtin_neon_vldrq_p128: {
     llvm::Type *Int128Ty = llvm::Type::getIntNTy(getLLVMContext(), 128);
     llvm::Type *Int128PTy = llvm::PointerType::get(Int128Ty, 0);
Index: clang/include/clang/Basic/arm_neon.td
===================================================================
--- clang/include/clang/Basic/arm_neon.td
+++ clang/include/clang/Basic/arm_neon.td
@@ -1136,6 +1136,8 @@
 def SHA256SU1 : SInst<"vsha256su1", "....", "QUi">;
 }
 
+def VADDP   : WInst<"vadd", "...", "PcPsPlQPcQPsQPlQPk">;
+
 ////////////////////////////////////////////////////////////////////////////////
 // Float -> Int conversions with explicit rounding mode

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D96825: [AArch64] Adding Polynomial vadd Intrinsics support

Reply via email to