simon_tatham created this revision.
simon_tatham added reviewers: dmgreen, miyuki, MarkMurrayARM, ostannard.
Herald added subscribers: llvm-commits, cfe-commits, hiraditya, kristof.beyls.
Herald added projects: clang, LLVM.

These are in some sense the inverse of vmovl[bt]q: they take a vector
of wide elements and truncate each one to half its width. So they
write only half a vector's worth of output data, and therefore they
also take an 'inactive' parameter to provide the other half of the
output vector. vmovnb overwrites the even lanes of 'inactive' with
the narrowed values from the main input, and vmovnt overwrites the
odd lanes.
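
For concreteness, here is a scalar reference model of that lane
behaviour (an illustrative sketch only; the helper name is invented
and nothing like it appears in this patch):

    #include <stdint.h>
    #include <stddef.h>

    /* vmovnbq_u16: narrow each 16-bit lane of 'a' into the even byte
     * lanes of the result; odd byte lanes come from 'inactive'. */
    static void ref_vmovnb_u16(uint8_t dst[16], const uint8_t inactive[16],
                               const uint16_t a[8]) {
      for (size_t i = 0; i < 8; i++) {
        dst[2 * i]     = (uint8_t)a[i];       /* even lane: truncated input */
        dst[2 * i + 1] = inactive[2 * i + 1]; /* odd lane: preserved */
      }
    }

vmovnt is the same with the even/odd roles exchanged.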

LLVM already had codegen which selects these MVE instructions from IR
that takes two vectors of wide elements, or two vectors of narrow
ones. But in this case, we have one vector of each. So my clang
codegen strategy is to reinterpret the narrow 'inactive' vector as
the wide type, giving two wide vectors; the operation can then be
represented as a vector shuffle that interleaves lanes from both of
them, followed by truncating each lane of the double-length result
back to the narrow type. (For vmovnb, 'inactive' additionally needs a
vrev first, so that its preserved lanes end up in the right
positions.)
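
The IR tests below show exactly this shape; as a standalone
illustration, the vmovnt case looks roughly like the following,
written with Clang's generic vector extensions (a sketch: the
typedefs and function name are invented for illustration, and
little-endian lane order is assumed):

    #include <stdint.h>

    typedef int16_t v8i16 __attribute__((vector_size(16)));
    typedef int32_t v4i32 __attribute__((vector_size(16)));
    typedef int32_t v8i32 __attribute__((vector_size(32)));

    v8i16 vmovnt_shape(v8i16 inactive, v4i32 a) {
      v4i32 in = (v4i32)inactive;                 /* reinterpret narrow as wide */
      v8i32 zip = __builtin_shufflevector(in, a,  /* interleave the lanes */
                                          0, 4, 1, 5, 2, 6, 3, 7);
      return __builtin_convertvector(zip, v8i16); /* truncate each lane */
    }

This is the same bitcast/shufflevector/trunc sequence as in
test_vmovntq_s32 below.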

Even so, not all the cases I needed ended up being selected as a
single MVE instruction, so I've added a couple more ISel patterns
that spot combinations of the 'MVEvmovn' and 'ARMvrev32'/'ARMvrev16'
SDNodes which can be generated as a VMOVN instruction with its
operands swapped. (These arise from the vmovnb codegen, where the
vrev applied to 'inactive' survives into the DAG.)

This commit adds the unpredicated forms only.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D74337

Files:
  clang/include/clang/Basic/arm_mve.td
  clang/include/clang/Basic/arm_mve_defs.td
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGen/arm-mve-intrinsics/vmovn.c
  clang/utils/TableGen/MveEmitter.cpp
  llvm/lib/Target/ARM/ARMInstrMVE.td
  llvm/test/CodeGen/Thumb2/mve-intrinsics/vmovn.ll

Index: llvm/test/CodeGen/Thumb2/mve-intrinsics/vmovn.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/Thumb2/mve-intrinsics/vmovn.ll
@@ -0,0 +1,102 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -verify-machineinstrs -o - %s | FileCheck %s
+
+define arm_aapcs_vfpcc <16 x i8> @test_vmovnbq_s16(<16 x i8> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vmovnbq_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmovnb.i16 q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+  %1 = bitcast <16 x i8> %0 to <8 x i16>
+  %2 = shufflevector <8 x i16> %b, <8 x i16> %1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %3 = trunc <16 x i16> %2 to <16 x i8>
+  ret <16 x i8> %3
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vmovnbq_s32(<8 x i16> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vmovnbq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmovnb.i32 q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+  %1 = bitcast <8 x i16> %0 to <4 x i32>
+  %2 = shufflevector <4 x i32> %b, <4 x i32> %1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  %3 = trunc <8 x i32> %2 to <8 x i16>
+  ret <8 x i16> %3
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vmovnbq_u16(<16 x i8> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vmovnbq_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmovnb.i16 q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+  %1 = bitcast <16 x i8> %0 to <8 x i16>
+  %2 = shufflevector <8 x i16> %b, <8 x i16> %1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %3 = trunc <16 x i16> %2 to <16 x i8>
+  ret <16 x i8> %3
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vmovnbq_u32(<8 x i16> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vmovnbq_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmovnb.i32 q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+  %1 = bitcast <8 x i16> %0 to <4 x i32>
+  %2 = shufflevector <4 x i32> %b, <4 x i32> %1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  %3 = trunc <8 x i32> %2 to <8 x i16>
+  ret <8 x i16> %3
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vmovntq_s16(<16 x i8> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vmovntq_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmovnt.i16 q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast <16 x i8> %a to <8 x i16>
+  %1 = shufflevector <8 x i16> %0, <8 x i16> %b, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %2 = trunc <16 x i16> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vmovntq_s32(<8 x i16> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vmovntq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmovnt.i32 q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast <8 x i16> %a to <4 x i32>
+  %1 = shufflevector <4 x i32> %0, <4 x i32> %b, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  %2 = trunc <8 x i32> %1 to <8 x i16>
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vmovntq_u16(<16 x i8> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vmovntq_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmovnt.i16 q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast <16 x i8> %a to <8 x i16>
+  %1 = shufflevector <8 x i16> %0, <8 x i16> %b, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %2 = trunc <16 x i16> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vmovntq_u32(<8 x i16> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vmovntq_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmovnt.i32 q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast <8 x i16> %a to <4 x i32>
+  %1 = shufflevector <4 x i32> %0, <4 x i32> %b, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  %2 = trunc <8 x i32> %1 to <8 x i16>
+  ret <8 x i16> %2
+}
Index: llvm/lib/Target/ARM/ARMInstrMVE.td
===================================================================
--- llvm/lib/Target/ARM/ARMInstrMVE.td
+++ llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -4333,8 +4333,16 @@
             (v16i8 (MVE_VMOVNi16bh (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm)))>;
   def : Pat<(v16i8 (MVEvmovn (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm), (i32 1))),
             (v16i8 (MVE_VMOVNi16th (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm)))>;
+
+  def : Pat<(v8i16 (MVEvmovn (v8i16 MQPR:$Qm),
+                             (v8i16 (ARMvrev32 MQPR:$Qd_src)), (i32 1))),
+            (v8i16 (MVE_VMOVNi32bh (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>;
+  def : Pat<(v16i8 (MVEvmovn (v16i8 MQPR:$Qm),
+                             (v16i8 (ARMvrev16 MQPR:$Qd_src)), (i32 1))),
+            (v16i8 (MVE_VMOVNi16bh (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm)))>;
 }
 
+
 class MVE_VCVT_ff<string iname, string suffix, bit op, bit T,
                   list<dag> pattern=[]>
   : MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd), (ins MQPR:$Qd_src, MQPR:$Qm),
Index: clang/utils/TableGen/MveEmitter.cpp
===================================================================
--- clang/utils/TableGen/MveEmitter.cpp
+++ clang/utils/TableGen/MveEmitter.cpp
@@ -1188,6 +1188,18 @@
     } else {
       PrintFatalError("unsignedflag's argument should be a scalar type");
     }
+  } else if (Op->getName() == "bitsize") {
+    if (D->getNumArgs() != 1)
+      PrintFatalError("bitsize should have exactly one argument");
+    Record *TypeRec = cast<DefInit>(D->getArg(0))->getDef();
+    if (!TypeRec->isSubClassOf("Type"))
+      PrintFatalError("bitsize's argument should be a type");
+    if (const auto *ST = dyn_cast<ScalarType>(getType(TypeRec, Param))) {
+      return std::make_shared<IntLiteralResult>(getScalarType("u32"),
+                                                ST->sizeInBits());
+    } else {
+      PrintFatalError("bitsize's argument should be a scalar type");
+    }
   } else {
     std::vector<Result::Ptr> Args;
     for (unsigned i = 0, e = D->getNumArgs(); i < e; ++i)
Index: clang/test/CodeGen/arm-mve-intrinsics/vmovn.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/arm-mve-intrinsics/vmovn.c
@@ -0,0 +1,138 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+
+#include <arm_mve.h>
+
+// CHECK-LABEL: @test_vmovnbq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[TMP1]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-NEXT:    [[TMP3:%.*]] = trunc <16 x i16> [[TMP2]] to <16 x i8>
+// CHECK-NEXT:    ret <16 x i8> [[TMP3]]
+//
+int8x16_t test_vmovnbq_s16(int8x16_t a, int16x8_t b)
+{
+#ifdef POLYMORPHIC
+    return vmovnbq(a, b);
+#else /* POLYMORPHIC */
+    return vmovnbq_s16(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmovnbq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[TMP1]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+// CHECK-NEXT:    [[TMP3:%.*]] = trunc <8 x i32> [[TMP2]] to <8 x i16>
+// CHECK-NEXT:    ret <8 x i16> [[TMP3]]
+//
+int16x8_t test_vmovnbq_s32(int16x8_t a, int32x4_t b)
+{
+#ifdef POLYMORPHIC
+    return vmovnbq(a, b);
+#else /* POLYMORPHIC */
+    return vmovnbq_s32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmovnbq_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[TMP1]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-NEXT:    [[TMP3:%.*]] = trunc <16 x i16> [[TMP2]] to <16 x i8>
+// CHECK-NEXT:    ret <16 x i8> [[TMP3]]
+//
+uint8x16_t test_vmovnbq_u16(uint8x16_t a, uint16x8_t b)
+{
+#ifdef POLYMORPHIC
+    return vmovnbq(a, b);
+#else /* POLYMORPHIC */
+    return vmovnbq_u16(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmovnbq_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[TMP1]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+// CHECK-NEXT:    [[TMP3:%.*]] = trunc <8 x i32> [[TMP2]] to <8 x i16>
+// CHECK-NEXT:    ret <8 x i16> [[TMP3]]
+//
+uint16x8_t test_vmovnbq_u32(uint16x8_t a, uint32x4_t b)
+{
+#ifdef POLYMORPHIC
+    return vmovnbq(a, b);
+#else /* POLYMORPHIC */
+    return vmovnbq_u32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmovntq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to <8 x i16>
+// CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc <16 x i16> [[TMP1]] to <16 x i8>
+// CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+//
+int8x16_t test_vmovntq_s16(int8x16_t a, int16x8_t b)
+{
+#ifdef POLYMORPHIC
+    return vmovntq(a, b);
+#else /* POLYMORPHIC */
+    return vmovntq_s16(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmovntq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <4 x i32>
+// CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc <8 x i32> [[TMP1]] to <8 x i16>
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+int16x8_t test_vmovntq_s32(int16x8_t a, int32x4_t b)
+{
+#ifdef POLYMORPHIC
+    return vmovntq(a, b);
+#else /* POLYMORPHIC */
+    return vmovntq_s32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmovntq_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to <8 x i16>
+// CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc <16 x i16> [[TMP1]] to <16 x i8>
+// CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+//
+uint8x16_t test_vmovntq_u16(uint8x16_t a, uint16x8_t b)
+{
+#ifdef POLYMORPHIC
+    return vmovntq(a, b);
+#else /* POLYMORPHIC */
+    return vmovntq_u16(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmovntq_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <4 x i32>
+// CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc <8 x i32> [[TMP1]] to <8 x i16>
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+uint16x8_t test_vmovntq_u32(uint16x8_t a, uint32x4_t b)
+{
+#ifdef POLYMORPHIC
+    return vmovntq(a, b);
+#else /* POLYMORPHIC */
+    return vmovntq_u32(a, b);
+#endif /* POLYMORPHIC */
+}
+
Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -7069,6 +7069,19 @@
                                      Indices);
 }
 
+static llvm::Value *VectorZip(CGBuilderTy &Builder, llvm::Value *V0,
+                              llvm::Value *V1) {
+  // Make a shufflevector that interleaves two vectors element by element.
+  assert(V0->getType() == V1->getType() && "Can't zip different vector types");
+  SmallVector<uint32_t, 16> Indices;
+  unsigned InputElements = V0->getType()->getVectorNumElements();
+  for (unsigned i = 0; i < InputElements; i++) {
+    Indices.push_back(i);
+    Indices.push_back(i + InputElements);
+  }
+  return Builder.CreateShuffleVector(V0, V1, Indices);
+}
+
 template<unsigned HighBit, unsigned OtherBits>
 static llvm::Value *ARMMVEConstantSplat(CGBuilderTy &Builder, llvm::Type *VT) {
   // MVE-specific helper function to make a vector splat of a constant such as
Index: clang/include/clang/Basic/arm_mve_defs.td
===================================================================
--- clang/include/clang/Basic/arm_mve_defs.td
+++ clang/include/clang/Basic/arm_mve_defs.td
@@ -131,6 +131,7 @@
 def unzip: CGHelperFn<"VectorUnzip"> {
   let special_params = [IRBuilderIntParam<1, "bool">];
 }
+def zip: CGHelperFn<"VectorZip">;
 
 // Helper for making boolean flags in IR
 def i1: IRBuilderBase {
@@ -187,6 +188,10 @@
 // and 0 for a signed (or floating) one.
 def unsignedflag;
 
+// 'bitsize' also takes a scalar type, and expands into an integer
+// constant giving its size in bits.
+def bitsize;
+
 // If you put CustomCodegen<"foo"> in an intrinsic's codegen field, it
 // indicates that the IR generation for that intrinsic is done by handwritten
 // C++ and not autogenerated at all. The effect in the MVE builtin codegen
Index: clang/include/clang/Basic/arm_mve.td
===================================================================
--- clang/include/clang/Basic/arm_mve.td
+++ clang/include/clang/Basic/arm_mve.td
@@ -354,6 +354,14 @@
     (extend (unzip $a, 1), DblVector, (unsignedflag Scalar))>;
 }
 
+let params = [s16, u16, s32, u32] in {
+  def vmovnbq: Intrinsic<HalfVector, (args HalfVector:$inactive, Vector:$a),
+    (trunc (zip $a, (vreinterpret (vrev $inactive, (bitsize Scalar)), Vector)),
+           HalfVector)>;
+  def vmovntq: Intrinsic<HalfVector, (args HalfVector:$inactive, Vector:$a),
+    (trunc (zip (vreinterpret $inactive, Vector), $a), HalfVector)>;
+}
+
 defm : float_int_conversions<f32, u32, fptoui, uitofp>;
 defm : float_int_conversions<f16, u16, fptoui, uitofp>;
 defm : float_int_conversions<f32, s32, fptosi, sitofp>;