[PATCH] D153128: [AArch64][RCPC3] Add Neon intrinsics for LDAP1 and STL1

Lucas Prates via Phabricator via cfe-commits Fri, 07 Jul 2023 04:32:24 -0700

This revision was landed with ongoing or failed builds.
This revision was automatically updated to reflect the committed changes.
Closed by commit rG2b7ac6260627: [AArch64][RCPC3] Add Neon intrinsics for LDAP1 
and STL1 (authored by pratlucas).


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D153128/new/

https://reviews.llvm.org/D153128

Files:
  clang/include/clang/Basic/arm_neon.td
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGen/aarch64-neon-ldst-one-rcpc3.c
  clang/utils/TableGen/NeonEmitter.cpp

Index: clang/utils/TableGen/NeonEmitter.cpp
===================================================================
--- clang/utils/TableGen/NeonEmitter.cpp
+++ clang/utils/TableGen/NeonEmitter.cpp
@@ -2086,12 +2086,13 @@
 
     std::string Name = Def->getName();
     // Omit type checking for the pointer arguments of vld1_lane, vld1_dup,
-    // and vst1_lane intrinsics.  Using a pointer to the vector element
-    // type with one of those operations causes codegen to select an aligned
-    // load/store instruction.  If you want an unaligned operation,
-    // the pointer argument needs to have less alignment than element type,
-    // so just accept any pointer type.
-    if (Name == "vld1_lane" || Name == "vld1_dup" || Name == "vst1_lane") {
+    // vst1_lane, vldap1_lane, and vstl1_lane intrinsics.  Using a pointer to
+    // the vector element type with one of those operations causes codegen to
+    // select an aligned load/store instruction.  If you want an unaligned
+    // operation, the pointer argument needs to have less alignment than element
+    // type, so just accept any pointer type.
+    if (Name == "vld1_lane" || Name == "vld1_dup" || Name == "vst1_lane" ||
+        Name == "vldap1_lane" || Name == "vstl1_lane") {
       PtrArgNum = -1;
       HasConstPtr = false;
     }
Index: clang/test/CodeGen/aarch64-neon-ldst-one-rcpc3.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/aarch64-neon-ldst-one-rcpc3.c
@@ -0,0 +1,201 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple aarch64-arm-none-eabi -target-feature +neon \
+// RUN:  -target-feature +rcpc3 -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: | opt -S -passes=mem2reg | FileCheck %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_neon.h>
+
+
+// CHECK-LABEL: @test_vldap1q_lane_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[B:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT:    [[TMP2:%.*]] = load atomic i64, ptr [[A:%.*]] acquire, align 8
+// CHECK-NEXT:    [[VLDAP1_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[TMP2]], i32 1
+// CHECK-NEXT:    ret <2 x i64> [[VLDAP1_LANE]]
+//
+uint64x2_t test_vldap1q_lane_u64(uint64_t  *a, uint64x2_t b) {
+  return vldap1q_lane_u64(a, b, 1);
+}
+
+// CHECK-LABEL: @test_vldap1q_lane_s64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[B:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT:    [[TMP2:%.*]] = load atomic i64, ptr [[A:%.*]] acquire, align 8
+// CHECK-NEXT:    [[VLDAP1_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[TMP2]], i32 1
+// CHECK-NEXT:    ret <2 x i64> [[VLDAP1_LANE]]
+//
+int64x2_t test_vldap1q_lane_s64(int64_t  *a, int64x2_t b) {
+  return vldap1q_lane_s64(a, b, 1);
+}
+
+// CHECK-LABEL: @test_vldap1q_lane_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[B:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK-NEXT:    [[TMP2:%.*]] = load atomic double, ptr [[A:%.*]] acquire, align 8
+// CHECK-NEXT:    [[VLDAP1_LANE:%.*]] = insertelement <2 x double> [[TMP1]], double [[TMP2]], i32 1
+// CHECK-NEXT:    ret <2 x double> [[VLDAP1_LANE]]
+//
+float64x2_t test_vldap1q_lane_f64(float64_t  *a, float64x2_t b) {
+  return vldap1q_lane_f64(a, b, 1);
+}
+
+// CHECK-LABEL: @test_vldap1q_lane_p64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[B:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT:    [[TMP2:%.*]] = load atomic i64, ptr [[A:%.*]] acquire, align 8
+// CHECK-NEXT:    [[VLDAP1_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[TMP2]], i32 1
+// CHECK-NEXT:    ret <2 x i64> [[VLDAP1_LANE]]
+//
+poly64x2_t test_vldap1q_lane_p64(poly64_t  *a, poly64x2_t b) {
+  return vldap1q_lane_p64(a, b, 1);
+}
+
+// CHECK-LABEL: @test_vldap1_lane_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT:    [[TMP2:%.*]] = load atomic i64, ptr [[A:%.*]] acquire, align 8
+// CHECK-NEXT:    [[VLDAP1_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 [[TMP2]], i32 0
+// CHECK-NEXT:    ret <1 x i64> [[VLDAP1_LANE]]
+//
+uint64x1_t test_vldap1_lane_u64(uint64_t  *a, uint64x1_t b) {
+  return vldap1_lane_u64(a, b, 0);
+}
+
+// CHECK-LABEL: @test_vldap1_lane_s64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT:    [[TMP2:%.*]] = load atomic i64, ptr [[A:%.*]] acquire, align 8
+// CHECK-NEXT:    [[VLDAP1_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 [[TMP2]], i32 0
+// CHECK-NEXT:    ret <1 x i64> [[VLDAP1_LANE]]
+//
+int64x1_t test_vldap1_lane_s64(int64_t  *a, int64x1_t b) {
+  return vldap1_lane_s64(a, b, 0);
+}
+
+// CHECK-LABEL: @test_vldap1_lane_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK-NEXT:    [[TMP2:%.*]] = load atomic double, ptr [[A:%.*]] acquire, align 8
+// CHECK-NEXT:    [[VLDAP1_LANE:%.*]] = insertelement <1 x double> [[TMP1]], double [[TMP2]], i32 0
+// CHECK-NEXT:    ret <1 x double> [[VLDAP1_LANE]]
+//
+float64x1_t test_vldap1_lane_f64(float64_t  *a, float64x1_t b) {
+  return vldap1_lane_f64(a, b, 0);
+}
+
+// CHECK-LABEL: @test_vldap1_lane_p64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT:    [[TMP2:%.*]] = load atomic i64, ptr [[A:%.*]] acquire, align 8
+// CHECK-NEXT:    [[VLDAP1_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 [[TMP2]], i32 0
+// CHECK-NEXT:    ret <1 x i64> [[VLDAP1_LANE]]
+//
+poly64x1_t test_vldap1_lane_p64(poly64_t  *a, poly64x1_t b) {
+  return vldap1_lane_p64(a, b, 0);
+}
+
+// CHECK-LABEL: @test_vstl1q_lane_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[B:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+// CHECK-NEXT:    store atomic i64 [[TMP2]], ptr [[A:%.*]] release, align 8
+// CHECK-NEXT:    ret void
+//
+void test_vstl1q_lane_u64(uint64_t  *a, uint64x2_t b) {
+  vstl1q_lane_u64(a, b, 1);
+}
+
+// CHECK-LABEL: @test_vstl1q_lane_s64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[B:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+// CHECK-NEXT:    store atomic i64 [[TMP2]], ptr [[A:%.*]] release, align 8
+// CHECK-NEXT:    ret void
+//
+void test_vstl1q_lane_s64(int64_t  *a, int64x2_t b) {
+  vstl1q_lane_s64(a, b, 1);
+}
+
+// CHECK-LABEL: @test_vstl1q_lane_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[B:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+// CHECK-NEXT:    store atomic double [[TMP2]], ptr [[A:%.*]] release, align 8
+// CHECK-NEXT:    ret void
+//
+void test_vstl1q_lane_f64(float64_t  *a, float64x2_t b) {
+  vstl1q_lane_f64(a, b, 1);
+}
+
+// CHECK-LABEL: @test_vstl1q_lane_p64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[B:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+// CHECK-NEXT:    store atomic i64 [[TMP2]], ptr [[A:%.*]] release, align 8
+// CHECK-NEXT:    ret void
+//
+void test_vstl1q_lane_p64(poly64_t  *a, poly64x2_t b) {
+  vstl1q_lane_p64(a, b, 1);
+}
+
+// CHECK-LABEL: @test_vstl1_lane_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT:    [[TMP2:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
+// CHECK-NEXT:    store atomic i64 [[TMP2]], ptr [[A:%.*]] release, align 8
+// CHECK-NEXT:    ret void
+//
+void test_vstl1_lane_u64(uint64_t  *a, uint64x1_t b) {
+  vstl1_lane_u64(a, b, 0);
+}
+
+// CHECK-LABEL: @test_vstl1_lane_s64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT:    [[TMP2:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
+// CHECK-NEXT:    store atomic i64 [[TMP2]], ptr [[A:%.*]] release, align 8
+// CHECK-NEXT:    ret void
+//
+void test_vstl1_lane_s64(int64_t  *a, int64x1_t b) {
+  vstl1_lane_s64(a, b, 0);
+}
+
+// CHECK-LABEL: @test_vstl1_lane_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK-NEXT:    [[TMP2:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
+// CHECK-NEXT:    store atomic double [[TMP2]], ptr [[A:%.*]] release, align 8
+// CHECK-NEXT:    ret void
+//
+void test_vstl1_lane_f64(float64_t  *a, float64x1_t b) {
+  vstl1_lane_f64(a, b, 0);
+}
+
+// CHECK-LABEL: @test_vstl1_lane_p64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT:    [[TMP2:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
+// CHECK-NEXT:    store atomic i64 [[TMP2]], ptr [[A:%.*]] release, align 8
+// CHECK-NEXT:    ret void
+//
+void test_vstl1_lane_p64(poly64_t  *a, poly64x1_t b) {
+  vstl1_lane_p64(a, b, 0);
+}
Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -6790,6 +6790,21 @@
   { NEON::BI__builtin_neon_vuzpq_f16, NEON::BI__builtin_neon_vuzpq_v, },
   { NEON::BI__builtin_neon_vzip_f16, NEON::BI__builtin_neon_vzip_v, },
   { NEON::BI__builtin_neon_vzipq_f16, NEON::BI__builtin_neon_vzipq_v, },
+  // The mangling rules cause us to have one ID for each type for vldap1(q)_lane
+  // and vstl1(q)_lane, but codegen is equivalent for all of them. Choose an
+  // arbitrary one to be handled as tha canonical variation.
+  { NEON::BI__builtin_neon_vldap1_lane_u64, NEON::BI__builtin_neon_vldap1_lane_s64 },
+  { NEON::BI__builtin_neon_vldap1_lane_f64, NEON::BI__builtin_neon_vldap1_lane_s64 },
+  { NEON::BI__builtin_neon_vldap1_lane_p64, NEON::BI__builtin_neon_vldap1_lane_s64 },
+  { NEON::BI__builtin_neon_vldap1q_lane_u64, NEON::BI__builtin_neon_vldap1q_lane_s64 },
+  { NEON::BI__builtin_neon_vldap1q_lane_f64, NEON::BI__builtin_neon_vldap1q_lane_s64 },
+  { NEON::BI__builtin_neon_vldap1q_lane_p64, NEON::BI__builtin_neon_vldap1q_lane_s64 },
+  { NEON::BI__builtin_neon_vstl1_lane_u64, NEON::BI__builtin_neon_vstl1_lane_s64 },
+  { NEON::BI__builtin_neon_vstl1_lane_f64, NEON::BI__builtin_neon_vstl1_lane_s64 },
+  { NEON::BI__builtin_neon_vstl1_lane_p64, NEON::BI__builtin_neon_vstl1_lane_s64 },
+  { NEON::BI__builtin_neon_vstl1q_lane_u64, NEON::BI__builtin_neon_vstl1q_lane_s64 },
+  { NEON::BI__builtin_neon_vstl1q_lane_f64, NEON::BI__builtin_neon_vstl1q_lane_s64 },
+  { NEON::BI__builtin_neon_vstl1q_lane_p64, NEON::BI__builtin_neon_vstl1q_lane_s64 },
 };
 
 #undef NEONMAP0
@@ -10596,6 +10611,10 @@
       case NEON::BI__builtin_neon_vst1q_v:
       case NEON::BI__builtin_neon_vst1_lane_v:
       case NEON::BI__builtin_neon_vst1q_lane_v:
+      case NEON::BI__builtin_neon_vldap1_lane_s64:
+      case NEON::BI__builtin_neon_vldap1q_lane_s64:
+      case NEON::BI__builtin_neon_vstl1_lane_s64:
+      case NEON::BI__builtin_neon_vstl1q_lane_s64:
         // Get the alignment for the argument in addition to the value;
         // we'll use it later.
         PtrOp0 = EmitPointerWithAlignment(E->getArg(0));
@@ -12194,6 +12213,17 @@
                                        PtrOp0.getAlignment());
     return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vld1_lane");
   }
+  case NEON::BI__builtin_neon_vldap1_lane_s64:
+  case NEON::BI__builtin_neon_vldap1q_lane_s64: {
+    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
+    Ty = llvm::PointerType::getUnqual(VTy->getElementType());
+    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
+    llvm::LoadInst *LI = Builder.CreateAlignedLoad(
+        VTy->getElementType(), Ops[0], PtrOp0.getAlignment());
+    LI->setAtomic(llvm::AtomicOrdering::Acquire);
+    Ops[0] = LI;
+    return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vldap1_lane");
+  }
   case NEON::BI__builtin_neon_vld1_dup_v:
   case NEON::BI__builtin_neon_vld1q_dup_v: {
     Value *V = PoisonValue::get(Ty);
@@ -12212,6 +12242,16 @@
     Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
     return Builder.CreateAlignedStore(Ops[1], Builder.CreateBitCast(Ops[0], Ty),
                                       PtrOp0.getAlignment());
+  case NEON::BI__builtin_neon_vstl1_lane_s64:
+  case NEON::BI__builtin_neon_vstl1q_lane_s64: {
+    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
+    Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2]);
+    Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
+    llvm::StoreInst *SI = Builder.CreateAlignedStore(
+        Ops[1], Builder.CreateBitCast(Ops[0], Ty), PtrOp0.getAlignment());
+    SI->setAtomic(llvm::AtomicOrdering::Release);
+    return SI;
+  }
   case NEON::BI__builtin_neon_vld2_v:
   case NEON::BI__builtin_neon_vld2q_v: {
     llvm::Type *PTy = llvm::PointerType::getUnqual(VTy);
Index: clang/include/clang/Basic/arm_neon.td
===================================================================
--- clang/include/clang/Basic/arm_neon.td
+++ clang/include/clang/Basic/arm_neon.td
@@ -2086,3 +2086,9 @@
         "csilUcUsUiUlhfdPcPsPlQcQsQiQlQUcQUsQUiQUlQhQfQdQPcQPsQPlQPk", "bQb">;
   }
 }
+
+// v8.9a/v9.4a LRCPC3 intrinsics
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "rcpc3" in {
+  def VLDAP1_LANE : WInst<"vldap1_lane", ".(c*!).I", "QUlQlUlldQdPlQPl">;
+  def VSTL1_LANE  : WInst<"vstl1_lane", "v*(.!)I", "QUlQlUlldQdPlQPl">;
+}

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D153128: [AArch64][RCPC3] Add Neon intrinsics for LDAP1 and STL1

Reply via email to