az created this revision.
az added a reviewer: SjoerdMeijer.
Herald added subscribers: kristof.beyls, javed.absar, mgorny, rengolin, aemerson.

ARMv8.2-A introduces half-precision floating-point data-processing
instructions. This patch adds the FP16 scalar intrinsics for this architecture,
as described in the ARM ACLE document. Only the front-end intrinsic work is
done here; some backend work related to instruction selection remains to be
done.

This work is a continuation of https://reviews.llvm.org/D32511 which addressed 
ARMv8.2-A vector intrinsics.
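
As an illustration (not part of this patch), code using the new header might
look like the following, assuming the fullfp16 target feature is enabled
(e.g. -march=armv8.2-a+fp16); the function name is invented for the example:

  #include <arm_fp16.h>

  // Multiply two half-precision scalars, then round the product to the
  // nearest integral value with ties to even (vrndnh_f16 maps to FRINTN).
  float16_t scale_and_round(float16_t a, float16_t b) {
    float16_t prod = vmulh_f16(a, b);
    return vrndnh_f16(prod);
  }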


https://reviews.llvm.org/D41792

Files:
  clang/include/clang/Basic/BuiltinsNEON.def
  clang/include/clang/Basic/CMakeLists.txt
  clang/include/clang/Basic/arm_fp16.td
  clang/lib/Basic/Targets/AArch64.cpp
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/Headers/CMakeLists.txt
  clang/lib/Headers/module.modulemap
  clang/lib/Sema/SemaChecking.cpp
  clang/test/CodeGen/aarch64-v8.2a-fp16-intrinsics.c
  clang/utils/TableGen/NeonEmitter.cpp
  clang/utils/TableGen/TableGen.cpp
  clang/utils/TableGen/TableGenBackends.h
  llvm/include/llvm/IR/IntrinsicsAArch64.td

Index: llvm/include/llvm/IR/IntrinsicsAArch64.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -146,6 +146,9 @@
   class AdvSIMD_CvtFPToFx_Intrinsic
     : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, llvm_i32_ty],
                 [IntrNoMem]>;
+
+  class AdvSIMD_1Arg_Intrinsic
+    : Intrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrNoMem]>;
 }
 
 // Arithmetic ops
@@ -244,7 +247,7 @@
   // Vector Max
   def int_aarch64_neon_smax : AdvSIMD_2VectorArg_Intrinsic;
   def int_aarch64_neon_umax : AdvSIMD_2VectorArg_Intrinsic;
-  def int_aarch64_neon_fmax : AdvSIMD_2VectorArg_Intrinsic;
+  def int_aarch64_neon_fmax : AdvSIMD_2FloatArg_Intrinsic;
   def int_aarch64_neon_fmaxnmp : AdvSIMD_2VectorArg_Intrinsic;
 
   // Vector Max Across Lanes
@@ -256,7 +259,7 @@
   // Vector Min
   def int_aarch64_neon_smin : AdvSIMD_2VectorArg_Intrinsic;
   def int_aarch64_neon_umin : AdvSIMD_2VectorArg_Intrinsic;
-  def int_aarch64_neon_fmin : AdvSIMD_2VectorArg_Intrinsic;
+  def int_aarch64_neon_fmin : AdvSIMD_2FloatArg_Intrinsic;
   def int_aarch64_neon_fminnmp : AdvSIMD_2VectorArg_Intrinsic;
 
   // Vector Min/Max Number
@@ -354,7 +357,7 @@
   def int_aarch64_neon_sqxtun : AdvSIMD_1VectorArg_Narrow_Intrinsic;
 
   // Vector Absolute Value
-  def int_aarch64_neon_abs : AdvSIMD_1IntArg_Intrinsic;
+  def int_aarch64_neon_abs : AdvSIMD_1Arg_Intrinsic;
 
   // Vector Saturating Absolute Value
   def int_aarch64_neon_sqabs : AdvSIMD_1IntArg_Intrinsic;
Index: clang/utils/TableGen/TableGenBackends.h
===================================================================
--- clang/utils/TableGen/TableGenBackends.h
+++ clang/utils/TableGen/TableGenBackends.h
@@ -65,6 +65,7 @@
 void EmitClangCommentCommandList(RecordKeeper &Records, raw_ostream &OS);
 
 void EmitNeon(RecordKeeper &Records, raw_ostream &OS);
+void EmitFP16(RecordKeeper &Records, raw_ostream &OS);
 void EmitNeonSema(RecordKeeper &Records, raw_ostream &OS);
 void EmitNeonTest(RecordKeeper &Records, raw_ostream &OS);
 void EmitNeon2(RecordKeeper &Records, raw_ostream &OS);
Index: clang/utils/TableGen/TableGen.cpp
===================================================================
--- clang/utils/TableGen/TableGen.cpp
+++ clang/utils/TableGen/TableGen.cpp
@@ -52,6 +52,7 @@
   GenClangCommentCommandInfo,
   GenClangCommentCommandList,
   GenArmNeon,
+  GenArmFP16,
   GenArmNeonSema,
   GenArmNeonTest,
   GenAttrDocs,
@@ -139,6 +140,7 @@
                    "Generate list of commands that are used in "
                    "documentation comments"),
         clEnumValN(GenArmNeon, "gen-arm-neon", "Generate arm_neon.h for clang"),
+        clEnumValN(GenArmFP16, "gen-arm-fp16", "Generate arm_fp16.h for clang"),
         clEnumValN(GenArmNeonSema, "gen-arm-neon-sema",
                    "Generate ARM NEON sema support for clang"),
         clEnumValN(GenArmNeonTest, "gen-arm-neon-test",
@@ -250,6 +252,9 @@
   case GenArmNeon:
     EmitNeon(Records, OS);
     break;
+  case GenArmFP16:
+    EmitFP16(Records, OS);
+    break;
   case GenArmNeonSema:
     EmitNeonSema(Records, OS);
     break;
Index: clang/utils/TableGen/NeonEmitter.cpp
===================================================================
--- clang/utils/TableGen/NeonEmitter.cpp
+++ clang/utils/TableGen/NeonEmitter.cpp
@@ -552,7 +552,11 @@
   // run - Emit arm_neon.h.inc
   void run(raw_ostream &o);
 
+  // runFP16 - Emit arm_fp16.h.inc
+  void runFP16(raw_ostream &o);
+
   // runHeader - Emit all the __builtin prototypes used in arm_neon.h
+  // and arm_fp16.h
   void runHeader(raw_ostream &o);
 
   // runTests - Emit tests for all the Neon intrinsics.
@@ -852,6 +856,35 @@
     NumVectors = 0;
     Float = true;
     break;
+  case 'Y':
+    Bitwidth = ElementBitwidth = 16;
+    NumVectors = 0;
+    Float = true;
+    break;
+  case 'I':
+    Bitwidth = ElementBitwidth = 32;
+    NumVectors = 0;
+    Float = false;
+    Signed = true;
+    break;
+  case 'L':
+    Bitwidth = ElementBitwidth = 64;
+    NumVectors = 0;
+    Float = false;
+    Signed = true;
+    break;
+  case 'U':
+    Bitwidth = ElementBitwidth = 32;
+    NumVectors = 0;
+    Float = false;
+    Signed = false;
+    break;
+  case 'O':
+    Bitwidth = ElementBitwidth = 64;
+    NumVectors = 0;
+    Float = false;
+    Signed = false;
+    break;
   case 'f':
     Float = true;
     ElementBitwidth = 32;
@@ -1010,7 +1043,7 @@
 }
 
 static bool isFloatingPointProtoModifier(char Mod) {
-  return Mod == 'F' || Mod == 'f' || Mod == 'H';
+  return Mod == 'F' || Mod == 'f' || Mod == 'H' || Mod == 'Y' || Mod == 'I';
 }
 
 std::string Intrinsic::getBuiltinTypeStr() {
@@ -2420,12 +2453,128 @@
   OS << "#endif /* __ARM_NEON_H */\n";
 }
 
+/// run - Read the records in arm_fp16.td and output arm_fp16.h.  arm_fp16.h
+/// is comprised of type definitions and function declarations.
+void NeonEmitter::runFP16(raw_ostream &OS) {
+  OS << "/*===---- arm_fp16.h - ARM FP16 intrinsics "
+        "------------------------------"
+        "---===\n"
+        " *\n"
+        " * Permission is hereby granted, free of charge, to any person "
+        "obtaining "
+        "a copy\n"
+        " * of this software and associated documentation files (the "
+        "\"Software\"),"
+        " to deal\n"
+        " * in the Software without restriction, including without limitation "
+        "the "
+        "rights\n"
+        " * to use, copy, modify, merge, publish, distribute, sublicense, "
+        "and/or sell\n"
+        " * copies of the Software, and to permit persons to whom the Software "
+        "is\n"
+        " * furnished to do so, subject to the following conditions:\n"
+        " *\n"
+        " * The above copyright notice and this permission notice shall be "
+        "included in\n"
+        " * all copies or substantial portions of the Software.\n"
+        " *\n"
+        " * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, "
+        "EXPRESS OR\n"
+        " * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF "
+        "MERCHANTABILITY,\n"
+        " * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT "
+        "SHALL THE\n"
+        " * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR "
+        "OTHER\n"
+        " * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, "
+        "ARISING FROM,\n"
+        " * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER "
+        "DEALINGS IN\n"
+        " * THE SOFTWARE.\n"
+        " *\n"
+        " *===-----------------------------------------------------------------"
+        "---"
+        "---===\n"
+        " */\n\n";
+
+  OS << "#ifndef __ARM_FP16_H\n";
+  OS << "#define __ARM_FP16_H\n\n";
+
+  OS << "#include <stdint.h>\n\n";
+
+  OS << "typedef __fp16 float16_t;\n";
+
+  OS << "#define __ai static inline __attribute__((__always_inline__, "
+        "__nodebug__))\n\n";
+
+  SmallVector<Intrinsic *, 128> Defs;
+  std::vector<Record *> RV = Records.getAllDerivedDefinitions("Inst");
+  for (auto *R : RV)
+    createIntrinsic(R, Defs);
+
+  for (auto *I : Defs)
+    I->indexBody();
+
+  std::stable_sort(
+      Defs.begin(), Defs.end(),
+      [](const Intrinsic *A, const Intrinsic *B) { return *A < *B; });
+
+  // Only emit a def when its requirements have been met.
+  // FIXME: This loop could be made faster, but it's fast enough for now.
+  bool MadeProgress = true;
+  std::string InGuard;
+  while (!Defs.empty() && MadeProgress) {
+    MadeProgress = false;
+
+    for (SmallVector<Intrinsic *, 128>::iterator I = Defs.begin();
+         I != Defs.end(); /*No step*/) {
+      bool DependenciesSatisfied = true;
+      for (auto *II : (*I)->getDependencies()) {
+        if (std::find(Defs.begin(), Defs.end(), II) != Defs.end())
+          DependenciesSatisfied = false;
+      }
+      if (!DependenciesSatisfied) {
+        // Try the next one.
+        ++I;
+        continue;
+      }
+
+      // Emit #endif/#if pair if needed.
+      if ((*I)->getGuard() != InGuard) {
+        if (!InGuard.empty())
+          OS << "#endif\n";
+        InGuard = (*I)->getGuard();
+        if (!InGuard.empty())
+          OS << "#if " << InGuard << "\n";
+      }
+
+      // Actually generate the intrinsic code.
+      OS << (*I)->generate();
+
+      MadeProgress = true;
+      I = Defs.erase(I);
+    }
+  }
+  assert(Defs.empty() && "Some requirements were not satisfied!");
+  if (!InGuard.empty())
+    OS << "#endif\n";
+
+  OS << "\n";
+  OS << "#undef __ai\n\n";
+  OS << "#endif /* __ARM_FP16_H */\n";
+}
+
 namespace clang {
 
 void EmitNeon(RecordKeeper &Records, raw_ostream &OS) {
   NeonEmitter(Records).run(OS);
 }
 
+void EmitFP16(RecordKeeper &Records, raw_ostream &OS) {
+  NeonEmitter(Records).runFP16(OS);
+}
+
 void EmitNeonSema(RecordKeeper &Records, raw_ostream &OS) {
   NeonEmitter(Records).runHeader(OS);
 }
Index: clang/test/CodeGen/aarch64-v8.2a-fp16-intrinsics.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/aarch64-v8.2a-fp16-intrinsics.c
@@ -0,0 +1,643 @@
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +fullfp16 \
+// RUN: -fallow-half-arguments-and-returns -S -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: | opt -S -mem2reg \
+// RUN: | FileCheck %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_fp16.h>
+
+// CHECK-LABEL: test_vabsh_f16
+// CHECK:  [[ABS:%.*]] =  call half @llvm.aarch64.neon.abs.f16(half %a)
+// CHECK:  ret half [[ABS]]
+float16_t test_vabsh_f16(float16_t a) {
+  return vabsh_f16(a);
+}
+
+// CHECK-LABEL: test_vceqzh_f16
+// CHECK:  [[TMP1:%.*]] = fcmp oeq half %a, 0xH0000
+// CHECK:  [[TMP2:%.*]] = sext i1 [[TMP1]] to i16
+// CHECK:  ret i16 [[TMP2]]
+uint16_t test_vceqzh_f16(float16_t a) {
+  return vceqzh_f16(a);
+}
+
+// CHECK-LABEL: test_vcgezh_f16
+// CHECK:  [[TMP1:%.*]] = fcmp oge half %a, 0xH0000
+// CHECK:  [[TMP2:%.*]] = sext i1 [[TMP1]] to i16
+// CHECK:  ret i16 [[TMP2]]
+uint16_t test_vcgezh_f16(float16_t a) {
+  return vcgezh_f16(a);
+}
+
+// CHECK-LABEL: test_vcgtzh_f16
+// CHECK:  [[TMP1:%.*]] = fcmp ogt half %a, 0xH0000
+// CHECK:  [[TMP2:%.*]] = sext i1 [[TMP1]] to i16
+// CHECK:  ret i16 [[TMP2]]
+uint16_t test_vcgtzh_f16(float16_t a) {
+  return vcgtzh_f16(a);
+}
+
+// CHECK-LABEL: test_vclezh_f16
+// CHECK:  [[TMP1:%.*]] = fcmp ole half %a, 0xH0000
+// CHECK:  [[TMP2:%.*]] = sext i1 [[TMP1]] to i16
+// CHECK:  ret i16 [[TMP2]]
+uint16_t test_vclezh_f16(float16_t a) {
+  return vclezh_f16(a);
+}
+
+// CHECK-LABEL: test_vcltzh_f16
+// CHECK:  [[TMP1:%.*]] = fcmp olt half %a, 0xH0000
+// CHECK:  [[TMP2:%.*]] = sext i1 [[TMP1]] to i16
+// CHECK:  ret i16 [[TMP2]]
+uint16_t test_vcltzh_f16(float16_t a) {
+  return vcltzh_f16(a);
+}
+
+// CHECK-LABEL: test_vcvth_f16_s16
+// CHECK:  [[VCVT:%.*]] = sitofp i16 %a to half
+// CHECK:  ret half [[VCVT]]
+float16_t test_vcvth_f16_s16 (int16_t a) {
+  return vcvth_f16_s16(a);
+}
+
+// CHECK-LABEL: test_vcvth_f16_s32
+// CHECK:  [[VCVT:%.*]] = sitofp i32 %a to half
+// CHECK:  ret half [[VCVT]]
+float16_t test_vcvth_f16_s32 (int32_t a) {
+  return vcvth_f16_s32(a);
+}
+
+// CHECK-LABEL: test_vcvth_f16_s64
+// CHECK:  [[VCVT:%.*]] = sitofp i64 %a to half
+// CHECK:  ret half [[VCVT]]
+float16_t test_vcvth_f16_s64 (int64_t a) {
+  return vcvth_f16_s64(a);
+}
+
+// CHECK-LABEL: test_vcvth_f16_u16
+// CHECK:  [[VCVT:%.*]] = uitofp i16 %a to half
+// CHECK:  ret half [[VCVT]]
+float16_t test_vcvth_f16_u16 (uint16_t a) {
+  return vcvth_f16_u16(a);
+}
+
+// CHECK-LABEL: test_vcvth_f16_u32
+// CHECK:  [[VCVT:%.*]] = uitofp i32 %a to half
+// CHECK:  ret half [[VCVT]]
+float16_t test_vcvth_f16_u32 (uint32_t a) {
+  return vcvth_f16_u32(a);
+}
+
+// CHECK-LABEL: test_vcvth_f16_u64
+// CHECK:  [[VCVT:%.*]] = uitofp i64 %a to half
+// CHECK:  ret half [[VCVT]]
+float16_t test_vcvth_f16_u64 (uint64_t a) {
+  return vcvth_f16_u64(a);
+}
+
+// CHECK-LABEL: test_vcvth_s16_f16
+// CHECK:  [[VCVT:%.*]] = fptosi half %a to i16
+// CHECK:  ret i16 [[VCVT]]
+int16_t test_vcvth_s16_f16 (float16_t a) {
+  return vcvth_s16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvth_s32_f16
+// CHECK:  [[VCVT:%.*]] = fptosi half %a to i32
+// CHECK:  ret i32 [[VCVT]]
+int32_t test_vcvth_s32_f16 (float16_t a) {
+  return vcvth_s32_f16(a);
+}
+
+// CHECK-LABEL: test_vcvth_s64_f16
+// CHECK:  [[VCVT:%.*]] = fptosi half %a to i64
+// CHECK:  ret i64 [[VCVT]]
+int64_t test_vcvth_s64_f16 (float16_t a) {
+  return vcvth_s64_f16(a);
+}
+
+// CHECK-LABEL: test_vcvth_u16_f16
+// CHECK:  [[VCVT:%.*]] = fptoui half %a to i16
+// CHECK:  ret i16 [[VCVT]]
+uint16_t test_vcvth_u16_f16 (float16_t a) {
+  return vcvth_u16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvth_u32_f16
+// CHECK:  [[VCVT:%.*]] = fptoui half %a to i32
+// CHECK:  ret i32 [[VCVT]]
+uint32_t test_vcvth_u32_f16 (float16_t a) {
+  return vcvth_u32_f16(a);
+}
+
+// CHECK-LABEL: test_vcvth_u64_f16
+// CHECK:  [[VCVT:%.*]] = fptoui half %a to i64
+// CHECK:  ret i64 [[VCVT]]
+uint64_t test_vcvth_u64_f16 (float16_t a) {
+  return vcvth_u64_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtah_s16_f16
+// CHECK: [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtas.i16.f16(half %a)
+// CHECK: ret i16 [[VCVT]]
+int16_t test_vcvtah_s16_f16 (float16_t a) {
+  return vcvtah_s16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtah_s32_f16
+// CHECK: [[VCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtas.i32.f16(half %a)
+// CHECK: ret i32 [[VCVT]]
+int32_t test_vcvtah_s32_f16 (float16_t a) {
+  return vcvtah_s32_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtah_s64_f16
+// CHECK: [[VCVT:%.*]] = call i64 @llvm.aarch64.neon.fcvtas.i64.f16(half %a)
+// CHECK: ret i64 [[VCVT]]
+int64_t test_vcvtah_s64_f16 (float16_t a) {
+  return vcvtah_s64_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtah_u16_f16
+// CHECK: [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtau.i16.f16(half %a)
+// CHECK: ret i16 [[VCVT]]
+uint16_t test_vcvtah_u16_f16 (float16_t a) {
+  return vcvtah_u16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtah_u32_f16
+// CHECK: [[VCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtau.i32.f16(half %a)
+// CHECK: ret i32 [[VCVT]]
+uint32_t test_vcvtah_u32_f16 (float16_t a) {
+  return vcvtah_u32_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtah_u64_f16
+// CHECK: [[VCVT:%.*]] = call i64 @llvm.aarch64.neon.fcvtau.i64.f16(half %a)
+// CHECK: ret i64 [[VCVT]]
+uint64_t test_vcvtah_u64_f16 (float16_t a) {
+  return vcvtah_u64_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtmh_s16_f16
+// CHECK: [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtms.i16.f16(half %a)
+// CHECK: ret i16 [[VCVT]]
+int16_t test_vcvtmh_s16_f16 (float16_t a) {
+  return vcvtmh_s16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtmh_s32_f16
+// CHECK: [[VCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtms.i32.f16(half %a)
+// CHECK: ret i32 [[VCVT]]
+int32_t test_vcvtmh_s32_f16 (float16_t a) {
+  return vcvtmh_s32_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtmh_s64_f16
+// CHECK: [[VCVT:%.*]] = call i64 @llvm.aarch64.neon.fcvtms.i64.f16(half %a)
+// CHECK: ret i64 [[VCVT]]
+int64_t test_vcvtmh_s64_f16 (float16_t a) {
+  return vcvtmh_s64_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtmh_u16_f16
+// CHECK: [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtmu.i16.f16(half %a)
+// CHECK: ret i16 [[VCVT]]
+uint16_t test_vcvtmh_u16_f16 (float16_t a) {
+  return vcvtmh_u16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtmh_u32_f16
+// CHECK: [[VCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtmu.i32.f16(half %a)
+// CHECK: ret i32 [[VCVT]]
+uint32_t test_vcvtmh_u32_f16 (float16_t a) {
+  return vcvtmh_u32_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtmh_u64_f16
+// CHECK: [[VCVT:%.*]] = call i64 @llvm.aarch64.neon.fcvtmu.i64.f16(half %a)
+// CHECK: ret i64 [[VCVT]]
+uint64_t test_vcvtmh_u64_f16 (float16_t a) {
+  return vcvtmh_u64_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtnh_s16_f16
+// CHECK: [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtns.i16.f16(half %a)
+// CHECK: ret i16 [[VCVT]]
+int16_t test_vcvtnh_s16_f16 (float16_t a) {
+  return vcvtnh_s16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtnh_s32_f16
+// CHECK: [[VCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtns.i32.f16(half %a)
+// CHECK: ret i32 [[VCVT]]
+int32_t test_vcvtnh_s32_f16 (float16_t a) {
+  return vcvtnh_s32_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtnh_s64_f16
+// CHECK: [[VCVT:%.*]] = call i64 @llvm.aarch64.neon.fcvtns.i64.f16(half %a)
+// CHECK: ret i64 [[VCVT]]
+int64_t test_vcvtnh_s64_f16 (float16_t a) {
+  return vcvtnh_s64_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtnh_u16_f16
+// CHECK: [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtnu.i16.f16(half %a)
+// CHECK: ret i16 [[VCVT]]
+uint16_t test_vcvtnh_u16_f16 (float16_t a) {
+  return vcvtnh_u16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtnh_u32_f16
+// CHECK: [[VCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtnu.i32.f16(half %a)
+// CHECK: ret i32 [[VCVT]]
+uint32_t test_vcvtnh_u32_f16 (float16_t a) {
+  return vcvtnh_u32_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtnh_u64_f16
+// CHECK: [[VCVT:%.*]] = call i64 @llvm.aarch64.neon.fcvtnu.i64.f16(half %a)
+// CHECK: ret i64 [[VCVT]]
+uint64_t test_vcvtnh_u64_f16 (float16_t a) {
+  return vcvtnh_u64_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtph_s16_f16
+// CHECK: [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtps.i16.f16(half %a)
+// CHECK: ret i16 [[VCVT]]
+int16_t test_vcvtph_s16_f16 (float16_t a) {
+  return vcvtph_s16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtph_s32_f16
+// CHECK: [[VCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtps.i32.f16(half %a)
+// CHECK: ret i32 [[VCVT]]
+int32_t test_vcvtph_s32_f16 (float16_t a) {
+  return vcvtph_s32_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtph_s64_f16
+// CHECK: [[VCVT:%.*]] = call i64 @llvm.aarch64.neon.fcvtps.i64.f16(half %a)
+// CHECK: ret i64 [[VCVT]]
+int64_t test_vcvtph_s64_f16 (float16_t a) {
+  return vcvtph_s64_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtph_u16_f16
+// CHECK: [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtpu.i16.f16(half %a)
+// CHECK: ret i16 [[VCVT]]
+uint16_t test_vcvtph_u16_f16 (float16_t a) {
+  return vcvtph_u16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtph_u32_f16
+// CHECK: [[VCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtpu.i32.f16(half %a)
+// CHECK: ret i32 [[VCVT]]
+uint32_t test_vcvtph_u32_f16 (float16_t a) {
+  return vcvtph_u32_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtph_u64_f16
+// CHECK: [[VCVT:%.*]] = call i64 @llvm.aarch64.neon.fcvtpu.i64.f16(half %a)
+// CHECK: ret i64 [[VCVT]]
+uint64_t test_vcvtph_u64_f16 (float16_t a) {
+  return vcvtph_u64_f16(a);
+}
+
+// CHECK-LABEL: test_vnegh_f16
+// CHECK: [[NEG:%.*]] = fsub half 0xH8000, %a
+// CHECK: ret half [[NEG]]
+float16_t test_vnegh_f16(float16_t a) {
+  return vnegh_f16(a);
+}
+
+// CHECK-LABEL: test_vrecpeh_f16
+// CHECK: [[VREC:%.*]] = call half @llvm.aarch64.neon.frecpe.f16(half %a)
+// CHECK: ret half [[VREC]]
+float16_t test_vrecpeh_f16(float16_t a) {
+  return vrecpeh_f16(a);
+}
+
+// CHECK-LABEL: test_vrecpxh_f16
+// CHECK: [[VREC:%.*]] = call half @llvm.aarch64.neon.frecpx.f16(half %a)
+// CHECK: ret half [[VREC]]
+float16_t test_vrecpxh_f16(float16_t a) {
+  return vrecpxh_f16(a);
+}
+
+// CHECK-LABEL: test_vrndh_f16
+// CHECK:  [[RND:%.*]] =  call half @llvm.trunc.f16(half %a)
+// CHECK:  ret half [[RND]]
+float16_t test_vrndh_f16(float16_t a) {
+  return vrndh_f16(a);
+}
+
+// CHECK-LABEL: test_vrndah_f16
+// CHECK:  [[RND:%.*]] =  call half @llvm.round.f16(half %a)
+// CHECK:  ret half [[RND]]
+float16_t test_vrndah_f16(float16_t a) {
+  return vrndah_f16(a);
+}
+
+// CHECK-LABEL: test_vrndih_f16
+// CHECK:  [[RND:%.*]] =  call half @llvm.nearbyint.f16(half %a)
+// CHECK:  ret half [[RND]]
+float16_t test_vrndih_f16(float16_t a) {
+  return vrndih_f16(a);
+}
+
+// CHECK-LABEL: test_vrndmh_f16
+// CHECK:  [[RND:%.*]] =  call half @llvm.floor.f16(half %a)
+// CHECK:  ret half [[RND]]
+float16_t test_vrndmh_f16(float16_t a) {
+  return vrndmh_f16(a);
+}
+
+// CHECK-LABEL: test_vrndnh_f16
+// CHECK:  [[RND:%.*]] =  call half @llvm.aarch64.neon.frintn.f16(half %a)
+// CHECK:  ret half [[RND]]
+float16_t test_vrndnh_f16(float16_t a) {
+  return vrndnh_f16(a);
+}
+
+// CHECK-LABEL: test_vrndph_f16
+// CHECK:  [[RND:%.*]] =  call half @llvm.ceil.f16(half %a)
+// CHECK:  ret half [[RND]]
+float16_t test_vrndph_f16(float16_t a) {
+  return vrndph_f16(a);
+}
+
+// CHECK-LABEL: test_vrndxh_f16
+// CHECK:  [[RND:%.*]] =  call half @llvm.rint.f16(half %a)
+// CHECK:  ret half [[RND]]
+float16_t test_vrndxh_f16(float16_t a) {
+  return vrndxh_f16(a);
+}
+
+// CHECK-LABEL: test_vrsqrteh_f16
+// CHECK:  [[RND:%.*]] = call half @llvm.aarch64.neon.frsqrte.f16(half %a)
+// CHECK:  ret half [[RND]]
+float16_t test_vrsqrteh_f16(float16_t a) {
+  return vrsqrteh_f16(a);
+}
+
+// CHECK-LABEL: test_vsqrth_f16
+// CHECK:  [[SQR:%.*]] = call half @llvm.sqrt.f16(half %a)
+// CHECK:  ret half [[SQR]]
+float16_t test_vsqrth_f16(float16_t a) {
+  return vsqrth_f16(a);
+}
+
+// CHECK-LABEL: test_vaddh_f16
+// CHECK:  [[ADD:%.*]] = fadd half %a, %b
+// CHECK:  ret half [[ADD]]
+float16_t test_vaddh_f16(float16_t a, float16_t b) {
+  return vaddh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vabdh_f16
+// CHECK:  [[ABD:%.*]] = call half @llvm.aarch64.sisd.fabd.f16(half %a, half %b)
+// CHECK:  ret half [[ABD]]
+float16_t test_vabdh_f16(float16_t a, float16_t b) {
+  return vabdh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vcageh_f16
+// CHECK:  [[ABS:%.*]] = call i16 @llvm.aarch64.neon.facge.i16.f16(half %a, half %b)
+// CHECK:  ret i16 [[ABS]]
+uint16_t test_vcageh_f16(float16_t a, float16_t b) {
+  return vcageh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vcagth_f16
+// CHECK:  [[ABS:%.*]] = call i16 @llvm.aarch64.neon.facgt.i16.f16(half %a, half %b)
+// CHECK:  ret i16 [[ABS]]
+uint16_t test_vcagth_f16(float16_t a, float16_t b) {
+  return vcagth_f16(a, b);
+}
+
+// CHECK-LABEL: test_vcaleh_f16
+// CHECK:  [[ABS:%.*]] = call i16 @llvm.aarch64.neon.facge.i16.f16(half %a, half %b)
+// CHECK:  ret i16 [[ABS]]
+uint16_t test_vcaleh_f16(float16_t a, float16_t b) {
+  return vcaleh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vcalth_f16
+// CHECK:  [[ABS:%.*]] = call i16 @llvm.aarch64.neon.facgt.i16.f16(half %a, half %b)
+// CHECK:  ret i16 [[ABS]]
+uint16_t test_vcalth_f16(float16_t a, float16_t b) {
+  return vcalth_f16(a, b);
+}
+
+// CHECK-LABEL: test_vceqh_f16
+// CHECK:  [[TMP1:%.*]] = fcmp oeq half %a, %b
+// CHECK:  [[TMP2:%.*]] = sext i1 [[TMP1]] to i16
+// CHECK:  ret i16 [[TMP2]]
+uint16_t test_vceqh_f16(float16_t a, float16_t b) {
+  return vceqh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vcgeh_f16
+// CHECK:  [[TMP1:%.*]] = fcmp oge half %a, %b
+// CHECK:  [[TMP2:%.*]] = sext i1 [[TMP1]] to i16
+// CHECK:  ret i16 [[TMP2]]
+uint16_t test_vcgeh_f16(float16_t a, float16_t b) {
+  return vcgeh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vcgth_f16
+// CHECK:  [[TMP1:%.*]] = fcmp ogt half %a, %b
+// CHECK:  [[TMP2:%.*]] = sext i1 [[TMP1]] to i16
+// CHECK:  ret i16 [[TMP2]]
+uint16_t test_vcgth_f16(float16_t a, float16_t b) {
+  return vcgth_f16(a, b);
+}
+
+// CHECK-LABEL: test_vcleh_f16
+// CHECK:  [[TMP1:%.*]] = fcmp ole half %a, %b
+// CHECK:  [[TMP2:%.*]] = sext i1 [[TMP1]] to i16
+// CHECK:  ret i16 [[TMP2]]
+uint16_t test_vcleh_f16(float16_t a, float16_t b) {
+  return vcleh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vclth_f16
+// CHECK:  [[TMP1:%.*]] = fcmp olt half %a, %b
+// CHECK:  [[TMP2:%.*]] = sext i1 [[TMP1]] to i16
+// CHECK:  ret i16 [[TMP2]]
+uint16_t test_vclth_f16(float16_t a, float16_t b) {
+  return vclth_f16(a, b);
+}
+
+// CHECK-LABEL: test_vcvth_n_f16_s16
+// CHECK:  [[CVT:%.*]] = call half @llvm.aarch64.neon.vcvtfxs2fp.f16.i16(i16 %a, i32 0)
+// CHECK:  ret half [[CVT]]
+float16_t test_vcvth_n_f16_s16(int16_t a) {
+  return vcvth_n_f16_s16(a, 0);
+}
+
+// CHECK-LABEL: test_vcvth_n_f16_s32
+// CHECK:  [[CVT:%.*]] = call half @llvm.aarch64.neon.vcvtfxs2fp.f16.i32(i32 %a, i32 0)
+// CHECK:  ret half [[CVT]]
+float16_t test_vcvth_n_f16_s32(int32_t a) {
+  return vcvth_n_f16_s32(a, 0);
+}
+
+// CHECK-LABEL: test_vcvth_n_f16_s64
+// CHECK:  [[CVT:%.*]] = call half @llvm.aarch64.neon.vcvtfxs2fp.f16.i64(i64 %a, i32 0)
+// CHECK:  ret half [[CVT]]
+float16_t test_vcvth_n_f16_s64(int64_t a) {
+  return vcvth_n_f16_s64(a, 0);
+}
+
+// CHECK-LABEL: test_vcvth_n_s16_f16
+// CHECK:  [[CVT:%.*]] = call i16 @llvm.aarch64.neon.vcvtfp2fxs.i16.f16(half %a, i32 0)
+// CHECK:  ret i16 [[CVT]]
+int16_t test_vcvth_n_s16_f16(float16_t a) {
+  return vcvth_n_s16_f16(a, 0);
+}
+
+// CHECK-LABEL: test_vcvth_n_s32_f16
+// CHECK:  [[CVT:%.*]] = call i32 @llvm.aarch64.neon.vcvtfp2fxs.i32.f16(half %a, i32 0)
+// CHECK:  ret i32 [[CVT]]
+int32_t test_vcvth_n_s32_f16(float16_t a) {
+  return vcvth_n_s32_f16(a, 0);
+}
+
+// CHECK-LABEL: test_vcvth_n_s64_f16
+// CHECK:  [[CVT:%.*]] = call i64 @llvm.aarch64.neon.vcvtfp2fxs.i64.f16(half %a, i32 0)
+// CHECK:  ret i64 [[CVT]]
+int64_t test_vcvth_n_s64_f16(float16_t a) {
+  return vcvth_n_s64_f16(a, 0);
+}
+
+// CHECK-LABEL: test_vcvth_n_f16_u16
+// CHECK:  [[CVT:%.*]] = call half @llvm.aarch64.neon.vcvtfxu2fp.f16.i16(i16 %a, i32 0)
+// CHECK:  ret half [[CVT]]
+float16_t test_vcvth_n_f16_u16(uint16_t a) {
+  return vcvth_n_f16_u16(a, 0);
+}
+
+// CHECK-LABEL: test_vcvth_n_f16_u32
+// CHECK:  [[CVT:%.*]] = call half @llvm.aarch64.neon.vcvtfxu2fp.f16.i32(i32 %a, i32 0)
+// CHECK:  ret half [[CVT]]
+float16_t test_vcvth_n_f16_u32(uint32_t a) {
+  return vcvth_n_f16_u32(a, 0);
+}
+
+// CHECK-LABEL: test_vcvth_n_f16_u64
+// CHECK:  [[CVT:%.*]] = call half @llvm.aarch64.neon.vcvtfxu2fp.f16.i64(i64 %a, i32 0)
+// CHECK:  ret half [[CVT]]
+float16_t test_vcvth_n_f16_u64(uint64_t a) {
+  return vcvth_n_f16_u64(a, 0);
+}
+
+// CHECK-LABEL: test_vcvth_n_u16_f16
+// CHECK:  [[CVT:%.*]] = call i16 @llvm.aarch64.neon.vcvtfp2fxu.i16.f16(half %a, i32 0)
+// CHECK:  ret i16 [[CVT]]
+uint16_t test_vcvth_n_u16_f16(float16_t a) {
+  return vcvth_n_u16_f16(a, 0);
+}
+
+// CHECK-LABEL: test_vcvth_n_u32_f16
+// CHECK:  [[CVT:%.*]] = call i32 @llvm.aarch64.neon.vcvtfp2fxu.i32.f16(half %a, i32 0)
+// CHECK:  ret i32 [[CVT]]
+uint32_t test_vcvth_n_u32_f16(float16_t a) {
+  return vcvth_n_u32_f16(a, 0);
+}
+
+// CHECK-LABEL: test_vcvth_n_u64_f16
+// CHECK:  [[CVT:%.*]] = call i64 @llvm.aarch64.neon.vcvtfp2fxu.i64.f16(half %a, i32 0)
+// CHECK:  ret i64 [[CVT]]
+uint64_t test_vcvth_n_u64_f16(float16_t a) {
+  return vcvth_n_u64_f16(a, 0);
+}
+
+// CHECK-LABEL: test_vdivh_f16
+// CHECK:  [[DIV:%.*]] = fdiv half %a, %b
+// CHECK:  ret half [[DIV]]
+float16_t test_vdivh_f16(float16_t a, float16_t b) {
+  return vdivh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vmaxh_f16
+// CHECK:  [[MAX:%.*]] = call half @llvm.aarch64.neon.fmax.f16(half %a, half %b)
+// CHECK:  ret half [[MAX]]
+float16_t test_vmaxh_f16(float16_t a, float16_t b) {
+  return vmaxh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vmaxnmh_f16
+// CHECK:  [[MAX:%.*]] = call half @llvm.aarch64.neon.fmaxnm.f16(half %a, half %b)
+// CHECK:  ret half [[MAX]]
+float16_t test_vmaxnmh_f16(float16_t a, float16_t b) {
+  return vmaxnmh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vminh_f16
+// CHECK:  [[MIN:%.*]] = call half @llvm.aarch64.neon.fmin.f16(half %a, half %b)
+// CHECK:  ret half [[MIN]]
+float16_t test_vminh_f16(float16_t a, float16_t b) {
+  return vminh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vminnmh_f16
+// CHECK:  [[MIN:%.*]] = call half @llvm.aarch64.neon.fminnm.f16(half %a, half %b)
+// CHECK:  ret half [[MIN]]
+float16_t test_vminnmh_f16(float16_t a, float16_t b) {
+  return vminnmh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vmulh_f16
+// CHECK:  [[MUL:%.*]] = fmul half %a, %b
+// CHECK:  ret half [[MUL]]
+float16_t test_vmulh_f16(float16_t a, float16_t b) {
+  return vmulh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vmulxh_f16
+// CHECK:  [[MUL:%.*]] = call half @llvm.aarch64.neon.fmulx.f16(half %a, half %b)
+// CHECK:  ret half [[MUL]]
+float16_t test_vmulxh_f16(float16_t a, float16_t b) {
+  return vmulxh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vrecpsh_f16
+// CHECK: [[RECPS:%.*]] = call half @llvm.aarch64.neon.frecps.f16(half %a, half %b)
+// CHECK: ret half [[RECPS]]
+float16_t test_vrecpsh_f16(float16_t a, float16_t b) {
+  return vrecpsh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vrsqrtsh_f16
+// CHECK:  [[RSQRTS:%.*]] = call half @llvm.aarch64.neon.frsqrts.f16(half %a, half %b)
+// CHECK:  ret half [[RSQRTS]]
+float16_t test_vrsqrtsh_f16(float16_t a, float16_t b) {
+  return vrsqrtsh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vsubh_f16
+// CHECK:  [[SUB:%.*]] = fsub half %a, %b
+// CHECK:  ret half [[SUB]]
+float16_t test_vsubh_f16(float16_t a, float16_t b) {
+  return vsubh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vfmah_f16
+// CHECK:  [[FMA:%.*]] = call half @llvm.fma.f16(half %b, half %c, half %a)
+// CHECK:  ret half [[FMA]]
+float16_t test_vfmah_f16(float16_t a, float16_t b, float16_t c) {
+  return vfmah_f16(a, b, c);
+}
+
+// CHECK-LABEL: test_vfmsh_f16
+// CHECK:  [[SUB:%.*]] = fsub half 0xH8000, %b
+// CHECK:  [[ADD:%.*]] = call half @llvm.fma.f16(half [[SUB]], half %c, half %a)
+// CHECK:  ret half [[ADD]]
+float16_t test_vfmsh_f16(float16_t a, float16_t b, float16_t c) {
+  return vfmsh_f16(a, b, c);
+}
+
Index: clang/lib/Sema/SemaChecking.cpp
===================================================================
--- clang/lib/Sema/SemaChecking.cpp
+++ clang/lib/Sema/SemaChecking.cpp
@@ -1353,6 +1353,7 @@
   switch (BuiltinID) {
 #define GET_NEON_OVERLOAD_CHECK
 #include "clang/Basic/arm_neon.inc"
+#include "clang/Basic/arm_fp16.inc"
 #undef GET_NEON_OVERLOAD_CHECK
   }
 
@@ -1404,6 +1405,7 @@
     return false;
 #define GET_NEON_IMMEDIATE_CHECK
 #include "clang/Basic/arm_neon.inc"
+#include "clang/Basic/arm_fp16.inc"
 #undef GET_NEON_IMMEDIATE_CHECK
   }
 
Index: clang/lib/Headers/module.modulemap
===================================================================
--- clang/lib/Headers/module.modulemap
+++ clang/lib/Headers/module.modulemap
@@ -38,6 +38,7 @@
     explicit module neon {
       requires neon
       header "arm_neon.h"
+      header "arm_fp16.h"
       export *
     }
   }
Index: clang/lib/Headers/CMakeLists.txt
===================================================================
--- clang/lib/Headers/CMakeLists.txt
+++ clang/lib/Headers/CMakeLists.txt
@@ -117,6 +117,9 @@
 # Generate arm_neon.h
 clang_tablegen(arm_neon.h -gen-arm-neon
   SOURCE ${CLANG_SOURCE_DIR}/include/clang/Basic/arm_neon.td)
+# Generate arm_fp16.h
+clang_tablegen(arm_fp16.h -gen-arm-fp16
+  SOURCE ${CLANG_SOURCE_DIR}/include/clang/Basic/arm_fp16.td)
 
 set(out_files)
 foreach( f ${files} ${cuda_wrapper_files} )
@@ -134,6 +137,11 @@
   COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_BINARY_DIR}/arm_neon.h ${output_dir}/arm_neon.h
   COMMENT "Copying clang's arm_neon.h...")
 list(APPEND out_files ${output_dir}/arm_neon.h)
+add_custom_command(OUTPUT ${output_dir}/arm_fp16.h
+  DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/arm_fp16.h
+  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_BINARY_DIR}/arm_fp16.h ${output_dir}/arm_fp16.h
+  COMMENT "Copying clang's arm_fp16.h...")
+list(APPEND out_files ${output_dir}/arm_fp16.h)
 
 add_custom_target(clang-headers ALL DEPENDS ${out_files})
 set_target_properties(clang-headers PROPERTIES FOLDER "Misc")
@@ -145,6 +153,12 @@
   DESTINATION lib${LLVM_LIBDIR_SUFFIX}/clang/${CLANG_VERSION}/include)
 
 install(
+  FILES ${files} ${CMAKE_CURRENT_BINARY_DIR}/arm_fp16.h
+  COMPONENT clang-headers
+  PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ
+  DESTINATION lib${LLVM_LIBDIR_SUFFIX}/clang/${CLANG_VERSION}/include)
+
+install(
   FILES ${cuda_wrapper_files}
   COMPONENT clang-headers
   PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ
Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -4099,6 +4099,54 @@
   NEONMAP1(vuqaddd_s64, aarch64_neon_suqadd, Add1ArgType),
   NEONMAP1(vuqaddh_s16, aarch64_neon_suqadd, Vectorize1ArgType | Use64BitVectors),
   NEONMAP1(vuqadds_s32, aarch64_neon_suqadd, Add1ArgType),
+  // FP16 scalar intrinsics go here.
+  NEONMAP1(vabdh_f16, aarch64_sisd_fabd, Add1ArgType),
+  NEONMAP1(vabsh_f16, aarch64_neon_abs, Add1ArgType),
+  NEONMAP1(vcageh_f16, aarch64_neon_facge, AddRetType | Add1ArgType),
+  NEONMAP1(vcagth_f16, aarch64_neon_facgt, AddRetType | Add1ArgType),
+  NEONMAP1(vcaleh_f16, aarch64_neon_facge, AddRetType | Add1ArgType),
+  NEONMAP1(vcalth_f16, aarch64_neon_facgt, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtah_s16_f16, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtah_s32_f16, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtah_s64_f16, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtah_u16_f16, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtah_u32_f16, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtah_u64_f16, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
+  NEONMAP1(vcvth_n_f16_s16, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
+  NEONMAP1(vcvth_n_f16_s32, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
+  NEONMAP1(vcvth_n_f16_s64, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
+  NEONMAP1(vcvth_n_f16_u16, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
+  NEONMAP1(vcvth_n_f16_u32, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
+  NEONMAP1(vcvth_n_f16_u64, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
+  NEONMAP1(vcvth_n_s16_f16, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
+  NEONMAP1(vcvth_n_s32_f16, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
+  NEONMAP1(vcvth_n_s64_f16, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
+  NEONMAP1(vcvth_n_u16_f16, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
+  NEONMAP1(vcvth_n_u32_f16, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
+  NEONMAP1(vcvth_n_u64_f16, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtmh_s16_f16, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtmh_s32_f16, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtmh_s64_f16, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtmh_u16_f16, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtmh_u32_f16, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtmh_u64_f16, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtnh_s16_f16, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtnh_s32_f16, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtnh_s64_f16, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtnh_u16_f16, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtnh_u32_f16, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtnh_u64_f16, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtph_s16_f16, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtph_s32_f16, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtph_s64_f16, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtph_u16_f16, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtph_u32_f16, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtph_u64_f16, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
+  NEONMAP1(vmulxh_f16, aarch64_neon_fmulx, Add1ArgType),
+  NEONMAP1(vrecpeh_f16, aarch64_neon_frecpe, Add1ArgType),
+  NEONMAP1(vrecpxh_f16, aarch64_neon_frecpx, Add1ArgType),
+  NEONMAP1(vrsqrteh_f16, aarch64_neon_frsqrte, Add1ArgType),
+  NEONMAP1(vrsqrtsh_f16, aarch64_neon_frsqrts, Add1ArgType),
 };
 
 #undef NEONMAP0
@@ -6123,6 +6171,58 @@
       return Builder.CreateUIToFP(Ops[0], FTy);
     return Builder.CreateSIToFP(Ops[0], FTy);
   }
+  case NEON::BI__builtin_neon_vcvth_f16_u16:
+  case NEON::BI__builtin_neon_vcvth_f16_u32:
+  case NEON::BI__builtin_neon_vcvth_f16_u64:
+    usgn = true;
+    // FALL THROUGH
+  case NEON::BI__builtin_neon_vcvth_f16_s16:
+  case NEON::BI__builtin_neon_vcvth_f16_s32:
+  case NEON::BI__builtin_neon_vcvth_f16_s64: {
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    llvm::Type *FTy = HalfTy;
+    llvm::Type *InTy;
+    if (Ops[0]->getType()->getPrimitiveSizeInBits() == 64)
+      InTy = Int64Ty;
+    else if (Ops[0]->getType()->getPrimitiveSizeInBits() == 32)
+      InTy = Int32Ty;
+    else
+      InTy = Int16Ty;
+    Ops[0] = Builder.CreateBitCast(Ops[0], InTy);
+    if (usgn)
+      return Builder.CreateUIToFP(Ops[0], FTy);
+    return Builder.CreateSIToFP(Ops[0], FTy);
+  }
+  case NEON::BI__builtin_neon_vcvth_u16_f16:
+    usgn = true;
+    // FALL THROUGH
+  case NEON::BI__builtin_neon_vcvth_s16_f16: {
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Ops[0] = Builder.CreateBitCast(Ops[0], HalfTy);
+    if (usgn)
+      return Builder.CreateFPToUI(Ops[0], Int16Ty);
+    return Builder.CreateFPToSI(Ops[0], Int16Ty);
+  }
+  case NEON::BI__builtin_neon_vcvth_u32_f16:
+    usgn = true;
+    // FALL THROUGH
+  case NEON::BI__builtin_neon_vcvth_s32_f16: {
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Ops[0] = Builder.CreateBitCast(Ops[0], HalfTy);
+    if (usgn)
+      return Builder.CreateFPToUI(Ops[0], Int32Ty);
+    return Builder.CreateFPToSI(Ops[0], Int32Ty);
+  }
+  case NEON::BI__builtin_neon_vcvth_u64_f16:
+    usgn = true;
+    // FALL THROUGH
+  case NEON::BI__builtin_neon_vcvth_s64_f16: {
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Ops[0] = Builder.CreateBitCast(Ops[0], HalfTy);
+    if (usgn)
+      return Builder.CreateFPToUI(Ops[0], Int64Ty);
+    return Builder.CreateFPToSI(Ops[0], Int64Ty);
+  }
   case NEON::BI__builtin_neon_vpaddd_s64: {
     llvm::Type *Ty = llvm::VectorType::get(Int64Ty, 2);
     Value *Vec = EmitScalarExpr(E->getArg(0));
@@ -6164,34 +6264,39 @@
   case NEON::BI__builtin_neon_vceqzd_s64:
   case NEON::BI__builtin_neon_vceqzd_f64:
   case NEON::BI__builtin_neon_vceqzs_f32:
+  case NEON::BI__builtin_neon_vceqzh_f16:
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     return EmitAArch64CompareBuiltinExpr(
         Ops[0], ConvertType(E->getCallReturnType(getContext())),
         ICmpInst::FCMP_OEQ, ICmpInst::ICMP_EQ, "vceqz");
   case NEON::BI__builtin_neon_vcgezd_s64:
   case NEON::BI__builtin_neon_vcgezd_f64:
   case NEON::BI__builtin_neon_vcgezs_f32:
+  case NEON::BI__builtin_neon_vcgezh_f16:
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     return EmitAArch64CompareBuiltinExpr(
         Ops[0], ConvertType(E->getCallReturnType(getContext())),
         ICmpInst::FCMP_OGE, ICmpInst::ICMP_SGE, "vcgez");
   case NEON::BI__builtin_neon_vclezd_s64:
   case NEON::BI__builtin_neon_vclezd_f64:
   case NEON::BI__builtin_neon_vclezs_f32:
+  case NEON::BI__builtin_neon_vclezh_f16:
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     return EmitAArch64CompareBuiltinExpr(
         Ops[0], ConvertType(E->getCallReturnType(getContext())),
         ICmpInst::FCMP_OLE, ICmpInst::ICMP_SLE, "vclez");
   case NEON::BI__builtin_neon_vcgtzd_s64:
   case NEON::BI__builtin_neon_vcgtzd_f64:
   case NEON::BI__builtin_neon_vcgtzs_f32:
+  case NEON::BI__builtin_neon_vcgtzh_f16:
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     return EmitAArch64CompareBuiltinExpr(
         Ops[0], ConvertType(E->getCallReturnType(getContext())),
         ICmpInst::FCMP_OGT, ICmpInst::ICMP_SGT, "vcgtz");
   case NEON::BI__builtin_neon_vcltzd_s64:
   case NEON::BI__builtin_neon_vcltzd_f64:
   case NEON::BI__builtin_neon_vcltzs_f32:
+  case NEON::BI__builtin_neon_vcltzh_f16:
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     return EmitAArch64CompareBuiltinExpr(
         Ops[0], ConvertType(E->getCallReturnType(getContext())),
@@ -6244,6 +6349,26 @@
     Ops[0] = Builder.CreateFCmp(P, Ops[0], Ops[1]);
     return Builder.CreateSExt(Ops[0], Int32Ty, "vcmpd");
   }
+  case NEON::BI__builtin_neon_vceqh_f16:
+  case NEON::BI__builtin_neon_vcleh_f16:
+  case NEON::BI__builtin_neon_vclth_f16:
+  case NEON::BI__builtin_neon_vcgeh_f16:
+  case NEON::BI__builtin_neon_vcgth_f16: {
+    llvm::CmpInst::Predicate P;
+    switch (BuiltinID) {
+    default: llvm_unreachable("missing builtin ID in switch!");
+    case NEON::BI__builtin_neon_vceqh_f16: P = llvm::FCmpInst::FCMP_OEQ; break;
+    case NEON::BI__builtin_neon_vcleh_f16: P = llvm::FCmpInst::FCMP_OLE; break;
+    case NEON::BI__builtin_neon_vclth_f16: P = llvm::FCmpInst::FCMP_OLT; break;
+    case NEON::BI__builtin_neon_vcgeh_f16: P = llvm::FCmpInst::FCMP_OGE; break;
+    case NEON::BI__builtin_neon_vcgth_f16: P = llvm::FCmpInst::FCMP_OGT; break;
+    }
+    Ops.push_back(EmitScalarExpr(E->getArg(1)));
+    Ops[0] = Builder.CreateBitCast(Ops[0], HalfTy);
+    Ops[1] = Builder.CreateBitCast(Ops[1], HalfTy);
+    Ops[0] = Builder.CreateFCmp(P, Ops[0], Ops[1]);
+    return Builder.CreateSExt(Ops[0], Int16Ty, "vcmpd");
+  }
   case NEON::BI__builtin_neon_vceqd_s64:
   case NEON::BI__builtin_neon_vceqd_u64:
   case NEON::BI__builtin_neon_vcgtd_s64:
@@ -6381,6 +6506,31 @@
         llvm::VectorType::get(DoubleTy, 2));
     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
                                         "vgetq_lane");
+  case NEON::BI__builtin_neon_vaddh_f16:
+    Ops.push_back(EmitScalarExpr(E->getArg(1)));
+    return Builder.CreateFAdd(Ops[0], Ops[1], "vaddh");
+  case NEON::BI__builtin_neon_vsubh_f16:
+    Ops.push_back(EmitScalarExpr(E->getArg(1)));
+    return Builder.CreateFSub(Ops[0], Ops[1], "vsubh");
+  case NEON::BI__builtin_neon_vmulh_f16:
+    Ops.push_back(EmitScalarExpr(E->getArg(1)));
+    return Builder.CreateFMul(Ops[0], Ops[1], "vmulh");
+  case NEON::BI__builtin_neon_vdivh_f16:
+    Ops.push_back(EmitScalarExpr(E->getArg(1)));
+    return Builder.CreateFDiv(Ops[0], Ops[1], "vdivh");
+  case NEON::BI__builtin_neon_vfmah_f16: {
+    Value *F = CGM.getIntrinsic(Intrinsic::fma, HalfTy);
+    // NEON intrinsic puts accumulator first, unlike the LLVM fma.
+    return Builder.CreateCall(F, {EmitScalarExpr(E->getArg(1)),
+                                  EmitScalarExpr(E->getArg(2)), Ops[0]});
+  }
+  case NEON::BI__builtin_neon_vfmsh_f16: {
+    Value *F = CGM.getIntrinsic(Intrinsic::fma, HalfTy);
+    Value *Zero = llvm::ConstantFP::getZeroValueForNegation(HalfTy);
+    Value *Sub = Builder.CreateFSub(Zero, EmitScalarExpr(E->getArg(1)), "vsubh");
+    // NEON intrinsic puts accumulator first, unlike the LLVM fma.
+    return Builder.CreateCall(F, {Sub, EmitScalarExpr(E->getArg(2)), Ops[0]});
+  }
   case NEON::BI__builtin_neon_vaddd_s64:
   case NEON::BI__builtin_neon_vaddd_u64:
     return Builder.CreateAdd(Ops[0], EmitScalarExpr(E->getArg(1)), "vaddd");
@@ -6655,12 +6805,22 @@
     Int = usgn ? Intrinsic::aarch64_neon_umax : Intrinsic::aarch64_neon_smax;
     if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmax;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmax");
+  case NEON::BI__builtin_neon_vmaxh_f16: {
+    Ops.push_back(EmitScalarExpr(E->getArg(1)));
+    Int = Intrinsic::aarch64_neon_fmax;
+    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmax");
+  }
   case NEON::BI__builtin_neon_vmin_v:
   case NEON::BI__builtin_neon_vminq_v:
     // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
     Int = usgn ? Intrinsic::aarch64_neon_umin : Intrinsic::aarch64_neon_smin;
     if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmin;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmin");
+  case NEON::BI__builtin_neon_vminh_f16: {
+    Ops.push_back(EmitScalarExpr(E->getArg(1)));
+    Int = Intrinsic::aarch64_neon_fmin;
+    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmin");
+  }
   case NEON::BI__builtin_neon_vabd_v:
   case NEON::BI__builtin_neon_vabdq_v:
     // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
@@ -6699,20 +6859,31 @@
   case NEON::BI__builtin_neon_vminnmq_v:
     Int = Intrinsic::aarch64_neon_fminnm;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vminnm");
+  case NEON::BI__builtin_neon_vminnmh_f16:
+    Ops.push_back(EmitScalarExpr(E->getArg(1)));
+    Int = Intrinsic::aarch64_neon_fminnm;
+    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vminnm");
   case NEON::BI__builtin_neon_vmaxnm_v:
   case NEON::BI__builtin_neon_vmaxnmq_v:
     Int = Intrinsic::aarch64_neon_fmaxnm;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmaxnm");
+  case NEON::BI__builtin_neon_vmaxnmh_f16:
+    Ops.push_back(EmitScalarExpr(E->getArg(1)));
+    Int = Intrinsic::aarch64_neon_fmaxnm;
+    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmaxnm");
   case NEON::BI__builtin_neon_vrecpss_f32: {
     Ops.push_back(EmitScalarExpr(E->getArg(1)));
     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, FloatTy),
                         Ops, "vrecps");
   }
-  case NEON::BI__builtin_neon_vrecpsd_f64: {
+  case NEON::BI__builtin_neon_vrecpsd_f64:
     Ops.push_back(EmitScalarExpr(E->getArg(1)));
     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, DoubleTy),
                         Ops, "vrecps");
-  }
+  case NEON::BI__builtin_neon_vrecpsh_f16:
+    Ops.push_back(EmitScalarExpr(E->getArg(1)));
+    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, HalfTy),
+                        Ops, "vrecps");
   case NEON::BI__builtin_neon_vqshrun_n_v:
     Int = Intrinsic::aarch64_neon_sqshrun;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrun_n");
@@ -6728,36 +6899,71 @@
   case NEON::BI__builtin_neon_vqrshrn_n_v:
     Int = usgn ? Intrinsic::aarch64_neon_uqrshrn : Intrinsic::aarch64_neon_sqrshrn;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrn_n");
+  case NEON::BI__builtin_neon_vrndah_f16: {
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Int = Intrinsic::round;
+    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrnda");
+  }
   case NEON::BI__builtin_neon_vrnda_v:
   case NEON::BI__builtin_neon_vrndaq_v: {
     Int = Intrinsic::round;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnda");
   }
+  case NEON::BI__builtin_neon_vrndih_f16: {
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Int = Intrinsic::nearbyint;
+    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndi");
+  }
   case NEON::BI__builtin_neon_vrndi_v:
   case NEON::BI__builtin_neon_vrndiq_v: {
     Int = Intrinsic::nearbyint;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndi");
   }
+  case NEON::BI__builtin_neon_vrndmh_f16: {
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Int = Intrinsic::floor;
+    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndm");
+  }
   case NEON::BI__builtin_neon_vrndm_v:
   case NEON::BI__builtin_neon_vrndmq_v: {
     Int = Intrinsic::floor;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndm");
   }
+  case NEON::BI__builtin_neon_vrndnh_f16: {
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Int = Intrinsic::aarch64_neon_frintn;
+    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndn");
+  }
   case NEON::BI__builtin_neon_vrndn_v:
   case NEON::BI__builtin_neon_vrndnq_v: {
     Int = Intrinsic::aarch64_neon_frintn;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndn");
   }
+  case NEON::BI__builtin_neon_vrndph_f16: {
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Int = Intrinsic::ceil;
+    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndp");
+  }
   case NEON::BI__builtin_neon_vrndp_v:
   case NEON::BI__builtin_neon_vrndpq_v: {
     Int = Intrinsic::ceil;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndp");
   }
+  case NEON::BI__builtin_neon_vrndxh_f16: {
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Int = Intrinsic::rint;
+    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndx");
+  }
   case NEON::BI__builtin_neon_vrndx_v:
   case NEON::BI__builtin_neon_vrndxq_v: {
     Int = Intrinsic::rint;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndx");
   }
+  case NEON::BI__builtin_neon_vrndh_f16: {
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Int = Intrinsic::trunc;
+    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndz");
+  }
   case NEON::BI__builtin_neon_vrnd_v:
   case NEON::BI__builtin_neon_vrndq_v: {
     Int = Intrinsic::trunc;
@@ -6906,6 +7112,8 @@
   }
   case NEON::BI__builtin_neon_vnegd_s64:
     return Builder.CreateNeg(EmitScalarExpr(E->getArg(0)), "vnegd");
+  case NEON::BI__builtin_neon_vnegh_f16:
+    return Builder.CreateFNeg(EmitScalarExpr(E->getArg(0)), "vnegh");
   case NEON::BI__builtin_neon_vpmaxnm_v:
   case NEON::BI__builtin_neon_vpmaxnmq_v: {
     Int = Intrinsic::aarch64_neon_fmaxnmp;
@@ -6916,6 +7124,11 @@
     Int = Intrinsic::aarch64_neon_fminnmp;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpminnm");
   }
+  case NEON::BI__builtin_neon_vsqrth_f16: {
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Int = Intrinsic::sqrt;
+    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vsqrt");
+  }
   case NEON::BI__builtin_neon_vsqrt_v:
   case NEON::BI__builtin_neon_vsqrtq_v: {
     Int = Intrinsic::sqrt;
Index: clang/lib/Basic/Targets/AArch64.cpp
===================================================================
--- clang/lib/Basic/Targets/AArch64.cpp
+++ clang/lib/Basic/Targets/AArch64.cpp
@@ -183,6 +183,8 @@
 
   if ((FPU & NeonMode) && HasFullFP16)
     Builder.defineMacro("__ARM_FEATURE_FP16_VECTOR_ARITHMETIC", "1");
+  if (HasFullFP16)
+    Builder.defineMacro("__ARM_FEATURE_FP16_SCALAR_ARITHMETIC", "1");
 
   switch (ArchKind) {
   default:
Index: clang/include/clang/Basic/arm_fp16.td
===================================================================
--- /dev/null
+++ clang/include/clang/Basic/arm_fp16.td
@@ -0,0 +1,173 @@
+//===--- arm_fp16.td - ARM FP16 compiler interface ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//  This file defines the TableGen definitions from which the ARM FP16 header
+//  file will be generated.
+//
+//===----------------------------------------------------------------------===//
+//
+// Each intrinsic is a subclass of the Inst class. An intrinsic can either
+// generate a __builtin_* call or it can expand to a set of generic operations.
+//
+// The operations are subclasses of Operation providing a list of DAGs, the
+// last of which is the return value. 
+//
+//===----------------------------------------------------------------------===//
+
+// The base Operation class. All operations must subclass this.
+class Operation<list<dag> ops=[]> {
+  list<dag> Ops = ops;
+  bit Unavailable = 0;
+}
+
+def OP_NONE  : Operation;
+
+//===----------------------------------------------------------------------===//
+// Instruction definitions
+//===----------------------------------------------------------------------===//
+// See arm_neon.td for more information
+
+// Every intrinsic subclasses Inst.
+class Inst <string n, string p, string t, Operation o> {
+  string Name = n;
+  string Prototype = p;
+  string Types = t;
+  string ArchGuard = "";
+
+  Operation Operation = o;
+  bit CartesianProductOfTypes = 0;
+  bit BigEndianSafe = 0;
+  bit isShift = 0;
+  bit isScalarShift = 0;
+  bit isVCVT_N = 0;
+}
+
+// The following instruction classes are implemented via builtins.
+// These declarations are used to generate Builtins.def:
+//
+// SInst: Instruction with signed/unsigned suffix (e.g., "s8", "u8", "p8")
+// IInst: Instruction with generic integer suffix (e.g., "i8")
+class SInst<string n, string p, string t> : Inst<n, p, t, OP_NONE> {}
+class IInst<string n, string p, string t> : Inst<n, p, t, OP_NONE> {}
+
+// ARMv8.2-A FP16 intrinsics.
+let ArchGuard = "defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) && defined(__aarch64__)" in {
+
+  // Negate
+  def VNEGSH  : SInst<"vneg", "ss", "Sh">;
+
+  // Reciprocal/Sqrt
+  def SCALAR_FRECPSH : IInst<"vrecps", "sss", "Sh">;
+  def FSQRTSH  : SInst<"vsqrt", "ss", "Sh">;
+  def SCALAR_FRSQRTSH: IInst<"vrsqrts", "sss", "Sh">;
+
+  // Reciprocal Estimate
+  def SCALAR_FRECPEH : IInst<"vrecpe", "ss", "Sh">;
+
+  // Reciprocal Exponent
+  def SCALAR_FRECPXH : IInst<"vrecpx", "ss", "Sh">;
+
+  // Reciprocal Square Root Estimate
+  def SCALAR_FRSQRTEH: IInst<"vrsqrte", "ss", "Sh">;
+
+  // Rounding 
+  def FRINTZ_S64H : SInst<"vrnd", "ss", "Sh">;
+  def FRINTA_S64H : SInst<"vrnda", "ss", "Sh">;
+  def FRINTI_S64H : SInst<"vrndi", "ss", "Sh">;
+  def FRINTM_S64H : SInst<"vrndm", "ss", "Sh">;
+  def FRINTN_S64H : SInst<"vrndn", "ss", "Sh">;
+  def FRINTP_S64H : SInst<"vrndp", "ss", "Sh">;
+  def FRINTX_S64H : SInst<"vrndx", "ss", "Sh">;
+
+  // Conversion 
+  def SCALAR_SCVTFSH   :  SInst<"vcvth_f16", "Ys", "silUsUiUl">;
+  def SCALAR_FCVTZSH   :  SInst<"vcvt_s16", "$s", "Sh">;
+  def SCALAR_FCVTZSH1  : SInst<"vcvt_s32", "Is", "Sh">;
+  def SCALAR_FCVTZSH2  : SInst<"vcvt_s64", "Ls", "Sh">;
+  def SCALAR_FCVTZUH   :  SInst<"vcvt_u16", "bs", "Sh">;
+  def SCALAR_FCVTZUH1  : SInst<"vcvt_u32", "Us", "Sh">;
+  def SCALAR_FCVTZUH2  : SInst<"vcvt_u64", "Os", "Sh">;
+  def SCALAR_FCVTASH   :  SInst<"vcvta_s16", "$s", "Sh">;
+  def SCALAR_FCVTASH1  : SInst<"vcvta_s32", "Is", "Sh">;
+  def SCALAR_FCVTASH2  : SInst<"vcvta_s64", "Ls", "Sh">;
+  def SCALAR_FCVTAUH   :  SInst<"vcvta_u16", "bs", "Sh">;
+  def SCALAR_FCVTAUH1  : SInst<"vcvta_u32", "Us", "Sh">;
+  def SCALAR_FCVTAUH2  : SInst<"vcvta_u64", "Os", "Sh">;
+  def SCALAR_FCVTMSH   :  SInst<"vcvtm_s16", "$s", "Sh">;
+  def SCALAR_FCVTMSH1  : SInst<"vcvtm_s32", "Is", "Sh">;
+  def SCALAR_FCVTMSH2  : SInst<"vcvtm_s64", "Ls", "Sh">;
+  def SCALAR_FCVTMUH   :  SInst<"vcvtm_u16", "bs", "Sh">;
+  def SCALAR_FCVTMUH1  : SInst<"vcvtm_u32", "Us", "Sh">;
+  def SCALAR_FCVTMUH2  : SInst<"vcvtm_u64", "Os", "Sh">;
+  def SCALAR_FCVTNSH   :  SInst<"vcvtn_s16", "$s", "Sh">;
+  def SCALAR_FCVTNSH1  : SInst<"vcvtn_s32", "Is", "Sh">;
+  def SCALAR_FCVTNSH2  : SInst<"vcvtn_s64", "Ls", "Sh">;
+  def SCALAR_FCVTNUH   :  SInst<"vcvtn_u16", "bs", "Sh">;
+  def SCALAR_FCVTNUH1  : SInst<"vcvtn_u32", "Us", "Sh">;
+  def SCALAR_FCVTNUH2  : SInst<"vcvtn_u64", "Os", "Sh">;
+  def SCALAR_FCVTPSH   :  SInst<"vcvtp_s16", "$s", "Sh">;
+  def SCALAR_FCVTPSH1  : SInst<"vcvtp_s32", "Is", "Sh">;
+  def SCALAR_FCVTPSH2  : SInst<"vcvtp_s64", "Ls", "Sh">;
+  def SCALAR_FCVTPUH   :  SInst<"vcvtp_u16", "bs", "Sh">;
+  def SCALAR_FCVTPUH1  : SInst<"vcvtp_u32", "Us", "Sh">;
+  def SCALAR_FCVTPUH2  : SInst<"vcvtp_u64", "Os", "Sh">;
+
+  def SCALAR_SCVTFSHO  : SInst<"vcvth_n_f16", "Ysi", "silUsUiUl">;
+  def SCALAR_FCVTZSHO  :  SInst<"vcvt_n_s16", "$si", "Sh">;
+  def SCALAR_FCVTZSH1O : SInst<"vcvt_n_s32", "Isi", "Sh">;
+  def SCALAR_FCVTZSH2O : SInst<"vcvt_n_s64", "Lsi", "Sh">;
+  def SCALAR_FCVTZUHO  :  SInst<"vcvt_n_u16", "bsi", "Sh">;
+  def SCALAR_FCVTZUH1O : SInst<"vcvt_n_u32", "Usi", "Sh">;
+  def SCALAR_FCVTZUH2O : SInst<"vcvt_n_u64", "Osi", "Sh">;
+
+  // Comparison
+  def SCALAR_CMEQRH : SInst<"vceq", "bss", "Sh">;
+  def SCALAR_CMEQZH : SInst<"vceqz", "bs", "Sh">;
+  def SCALAR_CMGERH : SInst<"vcge", "bss", "Sh">;
+  def SCALAR_CMGEZH : SInst<"vcgez", "bs", "Sh">;
+  def SCALAR_CMGTRH : SInst<"vcgt", "bss", "Sh">;
+  def SCALAR_CMGTZH : SInst<"vcgtz", "bs", "Sh">;
+  def SCALAR_CMLERH : SInst<"vcle", "bss", "Sh">;
+  def SCALAR_CMLEZH : SInst<"vclez", "bs", "Sh">;
+  def SCALAR_CMLTH  : SInst<"vclt", "bss", "Sh">;
+  def SCALAR_CMLTZH : SInst<"vcltz", "bs", "Sh">;
+
+  // Absolute Compare Mask Greater Than Or Equal
+  def SCALAR_FACGEH: IInst<"vcage", "bss", "Sh">;
+  def SCALAR_FACLEH: IInst<"vcale", "bss", "Sh">;
+
+  // Absolute Compare Mask Greater Than
+  def SCALAR_FACGT : IInst<"vcagt", "bss", "Sh">;
+  def SCALAR_FACLT : IInst<"vcalt", "bss", "Sh">;
+
+  // Scalar Absolute Value
+  def SCALAR_ABSH  : SInst<"vabs", "ss", "Sh">;
+
+  // Scalar Absolute Difference
+  def SCALAR_ABDH: IInst<"vabd", "sss", "Sh">;
+
+  // Add/Sub
+  def VADDSH       : SInst<"vadd", "sss", "Sh">;
+  def VSUBHS       : SInst<"vsub", "sss", "Sh">;
+
+  // Max/Min
+  def VMAXHS       : SInst<"vmax", "sss", "Sh">;
+  def VMINHS       : SInst<"vmin", "sss", "Sh">;
+  def FMAXNMHS     : SInst<"vmaxnm", "sss", "Sh">;
+  def FMINNMHS     : SInst<"vminnm", "sss", "Sh">;
+
+  // Multiplication/Division
+  def VMULHS       : SInst<"vmul", "sss", "Sh">;
+  def MULXHS       : SInst<"vmulx", "sss", "Sh">;
+  def FDIVHS       : SInst<"vdiv", "sss",  "Sh">;
+
+  // Vector fused multiply-add operations
+  def VFMAHS       : SInst<"vfma", "ssss", "Sh">;
+  def VFMSHS       : SInst<"vfms", "ssss", "Sh">;
+}
Index: clang/include/clang/Basic/CMakeLists.txt
===================================================================
--- clang/include/clang/Basic/CMakeLists.txt
+++ clang/include/clang/Basic/CMakeLists.txt
@@ -46,3 +46,7 @@
   -I ${CMAKE_CURRENT_SOURCE_DIR}/../../
   SOURCE arm_neon.td
   TARGET ClangARMNeon)
+clang_tablegen(arm_fp16.inc -gen-arm-neon-sema
+  -I ${CMAKE_CURRENT_SOURCE_DIR}/../../
+  SOURCE arm_fp16.td
+  TARGET ClangARMFP16)
Index: clang/include/clang/Basic/BuiltinsNEON.def
===================================================================
--- clang/include/clang/Basic/BuiltinsNEON.def
+++ clang/include/clang/Basic/BuiltinsNEON.def
@@ -16,6 +16,7 @@
 
 #define GET_NEON_BUILTINS
 #include "clang/Basic/arm_neon.inc"
+#include "clang/Basic/arm_fp16.inc"
 #undef GET_NEON_BUILTINS
 
 #undef BUILTIN