This patch adds support for loading vector 16-bit floating-point immediates (the V*HF modes) with a movi instruction. We reuse the existing code that looks for a repeating 8-bit pattern in the 64/128-bit value formed by concatenating the bit patterns of the vector's constant elements. This lets us load a variety of constants, since the movi encoding also provides an immediate left shift of up to 24 bits (in multiples of 8). A new testcase checks both that movi instructions are emitted and that the results are correct.
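To illustrate the idea (this example is not part of the patch, and the function name is purely illustrative): the IEEE half-precision encoding of 2.0 is 0x4000, so a V4HF vector of four 2.0s concatenates to the 64-bit pattern 0x4000400040004000, which the existing matcher recognises as the repeating byte 0x40 with a left shift of 8. A rough sketch of the code generation we would expect with this patch applied, assuming a toolchain where the arm_neon.h FP16 vector types are available:

    #include <arm_neon.h>

    /* All four lanes are 2.0; the FP16 bit pattern of 2.0 is 0x4000, so the
       replicated 64-bit pattern is 0x4000400040004000, i.e. the byte 0x40
       shifted left by 8 in every 16-bit lane.  */
    float16x4_t
    two_v4hf (void)
    {
      float16x4_t a = {2.0, 2.0, 2.0, 2.0};
      return a;
    }

    /* Expected code generation (roughly):
         movi    v0.4h, 0x40, lsl 8
         ret  */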
Tested on aarch64-none-elf and aarch64_be-none-elf, bootstrapped on aarch64-none-linux-gnu.

---

gcc/

2015-XX-XX  Bilyan Borisov  <bilyan.bori...@arm.com>

	* config/aarch64/aarch64.c (aarch64_simd_container_mode): Add
	HFmode cases.
	(aarch64_vect_float_const_representable_p): Update comment.
	(aarch64_simd_valid_immediate): Add support for V*HF arguments.
	(aarch64_output_simd_mov_immediate): Add check for HFmode.

gcc/testsuite/

2015-XX-XX  Bilyan Borisov  <bilyan.bori...@arm.com>

	* gcc.target/aarch64/fp16/f16_mov_immediate_simd_1.c: New.
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index ae4cfb336a827a63a6baadefcb5646a9dbfb7523..bb6fce0a829d634a7694710e8a8c9a1c3e841abd 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -10250,6 +10250,8 @@ aarch64_simd_container_mode (machine_mode mode, unsigned width)
             return V2DFmode;
           case SFmode:
             return V4SFmode;
+          case HFmode:
+            return V8HFmode;
           case SImode:
             return V4SImode;
           case HImode:
@@ -10266,6 +10268,8 @@ aarch64_simd_container_mode (machine_mode mode, unsigned width)
           {
           case SFmode:
             return V2SFmode;
+          case HFmode:
+            return V4HFmode;
           case SImode:
             return V2SImode;
           case HImode:
@@ -10469,7 +10473,12 @@ sizetochar (int size)
 /* Return true iff x is a uniform vector of floating-point
    constants, and the constant can be represented in
    quarter-precision form.  Note, as aarch64_float_const_representable
-   rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0.  */
+   rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0.
+   Also note that this won't ever be called for V*HFmode vectors,
+   since in aarch64_simd_valid_immediate () we check for the mode
+   and handle these vector types differently from other floating
+   point vector modes.  */
+
 static bool
 aarch64_vect_float_const_representable_p (rtx x)
 {
@@ -10505,7 +10514,10 @@ aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
   unsigned int invmask = inverse ? 0xff : 0;
   int eshift, emvn;

-  if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
+  /* Ignore V*HFmode vectors, they are handled below with the integer
+     code.  */
+  if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT
+      && GET_MODE_INNER (mode) != HFmode)
     {
       if (! (aarch64_simd_imm_zero_p (op, mode)
              || aarch64_vect_float_const_representable_p (op)))
@@ -10530,15 +10542,26 @@ aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
       rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
       unsigned HOST_WIDE_INT elpart;

-      gcc_assert (CONST_INT_P (el));
-      elpart = INTVAL (el);
+      if (CONST_INT_P (el))
+        elpart = INTVAL (el);
+      /* Convert HFmode vector element to bit pattern.  Logic below will catch
+         most common constants since for FP16 the sign and exponent are in the
+         top 6 bits and a movi with a left shift of 8 will catch all powers
+         of 2 that fit in a 16 bit floating point, and the 2 extra bits left
+         for the mantissa can cover some more non-power of 2 constants.  With
+         a 0 left shift, we can cover constants of the form 1.xxx since we have
+         8 bits only for the mantissa.  */
+      else if (CONST_DOUBLE_P (el) && GET_MODE_INNER (mode) == HFmode)
+        elpart =
+          real_to_target (NULL, CONST_DOUBLE_REAL_VALUE (el), HFmode);
+      else
+        gcc_unreachable ();

       for (unsigned int byte = 0; byte < innersize; byte++)
         {
           bytes[idx++] = (elpart & 0xff) ^ invmask;
           elpart >>= BITS_PER_UNIT;
         }
-
     }

   /* Sanity check.  */
@@ -11913,7 +11936,10 @@ aarch64_output_simd_mov_immediate (rtx const_vector,
   lane_count = width / info.element_width;

   mode = GET_MODE_INNER (mode);
-  if (GET_MODE_CLASS (mode) == MODE_FLOAT)
+  /* We handle HFmode vectors separately from the other floating point
+     vector modes.  See aarch64_simd_valid_immediate (), but in short
+     we use a movi instruction rather than a fmov.  */
+  if (GET_MODE_CLASS (mode) == MODE_FLOAT && mode != HFmode)
     {
       gcc_assert (info.shift == 0 && ! info.mvn);
       /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
diff --git a/gcc/testsuite/gcc.target/aarch64/fp16/f16_mov_immediate_simd_1.c b/gcc/testsuite/gcc.target/aarch64/fp16/f16_mov_immediate_simd_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..4533a888a43773a92be2f120f30353b7b23c9ab5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fp16/f16_mov_immediate_simd_1.c
@@ -0,0 +1,262 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps" } */
+
+#include "arm_neon.h"
+
+extern void abort ();
+
+#define VAL_4(val) \
+  val, val, val, val \
+
+#define VAL_8(val) \
+  val, val, val, val, val, val, val, val \
+
+
+#define CHECK_1(LEN, val) \
+  do \
+    { \
+      float16x##LEN##_t a = {VAL_##LEN (val)}; \
+      for (int i = 0; i < LEN; ++i) \
+        if (a[i] != val) \
+          abort (); \
+    } \
+  while (0) \
+
+void __attribute__ ((noinline))
+f1 ()
+{
+  CHECK_1 (4, -64.0);
+  CHECK_1 (4, -32.0);
+  CHECK_1 (4, -16.0);
+  CHECK_1 (4, -8.0);
+  CHECK_1 (4, -4.0);
+  CHECK_1 (4, -2.0);
+  CHECK_1 (4, -1.0);
+
+  CHECK_1 (4, 0.0);
+
+  CHECK_1 (4, 1.0);
+  CHECK_1 (4, 2.0);
+  CHECK_1 (4, 4.0);
+  CHECK_1 (4, 8.0);
+  CHECK_1 (4, 16.0);
+  CHECK_1 (4, 32.0);
+  CHECK_1 (4, 64.0);
+
+  CHECK_1 (8, -64.0);
+  CHECK_1 (8, -32.0);
+  CHECK_1 (8, -16.0);
+  CHECK_1 (8, -8.0);
+  CHECK_1 (8, -4.0);
+  CHECK_1 (8, -2.0);
+  CHECK_1 (8, -1.0);
+
+  CHECK_1 (8, 0.0);
+
+  CHECK_1 (8, 1.0);
+  CHECK_1 (8, 2.0);
+  CHECK_1 (8, 4.0);
+  CHECK_1 (8, 8.0);
+  CHECK_1 (8, 16.0);
+  CHECK_1 (8, 32.0);
+  CHECK_1 (8, 64.0);
+}
+
+#define CHECK_2(LEN, val1, val2) \
+  do \
+    { \
+      float16x##LEN##_t a = \
+        vcreate_f16 (__AARCH64_UINT64_C (val1)); \
+      for (int i = 0; i < LEN; ++i) \
+        if (vget_lane_f16 (a, i) != val2) \
+          abort (); \
+    } \
+  while (0) \
+
+void __attribute__ ((noinline))
+f2 ()
+{
+  CHECK_2 (4, 0xd400d400d400d400, -64.0);
+  CHECK_2 (4, 0xd000d000d000d000, -32.0);
+  CHECK_2 (4, 0xcc00cc00cc00cc00, -16.0);
+  CHECK_2 (4, 0xc800c800c800c800, -8.0);
+  CHECK_2 (4, 0xc400c400c400c400, -4.0);
+  CHECK_2 (4, 0xc000c000c000c000, -2.0);
+  CHECK_2 (4, 0xbc00bc00bc00bc00, -1.0);
+
+  CHECK_2 (4, 0, 0.0);
+
+  CHECK_2 (4, 0x3c003c003c003c00, 1.0);
+  CHECK_2 (4, 0x4000400040004000, 2.0);
+  CHECK_2 (4, 0x4400440044004400, 4.0);
+  CHECK_2 (4, 0x4800480048004800, 8.0);
+  CHECK_2 (4, 0x4c004c004c004c00, 16.0);
+  CHECK_2 (4, 0x5000500050005000, 32.0);
+  CHECK_2 (4, 0x5400540054005400, 64.0);
+}
+
+#define VGET_LANE_F16_4(a, i) \
+  vget_lane_f16 (a, i) \
+
+#define VGET_LANE_F16_8(a, i) \
+  vgetq_lane_f16 (a, i) \
+
+#define VLD1_F16_4(x) \
+  vld1_f16 (x) \
+
+#define VLD1_F16_8(x) \
+  vld1q_f16 (x) \
+
+#define CHECK_3(LEN, val) \
+  do \
+    { \
+      float16_t x[] = {VAL_##LEN (val)}; \
+      float16x##LEN##_t a = \
+        VLD1_F16_##LEN (x); \
+      for (int i = 0; i < LEN; ++i) \
+        if (VGET_LANE_F16_##LEN (a, i) != val) \
+          abort (); \
+    } \
+  while (0) \
+
+void __attribute__ ((noinline))
+f3 ()
+{
+  CHECK_3 (4, -64.0);
+  CHECK_3 (4, -32.0);
+  CHECK_3 (4, -16.0);
+  CHECK_3 (4, -8.0);
+  CHECK_3 (4, -4.0);
+  CHECK_3 (4, -2.0);
+  CHECK_3 (4, -1.0);
+
+  CHECK_3 (4, 0.0);
+
+  CHECK_3 (4, 1.0);
+  CHECK_3 (4, 2.0);
+  CHECK_3 (4, 4.0);
+  CHECK_3 (4, 8.0);
+  CHECK_3 (4, 16.0);
+  CHECK_3 (4, 32.0);
+  CHECK_3 (4, 64.0);
+
+  CHECK_3 (8, -64.0);
+  CHECK_3 (8, -32.0);
+  CHECK_3 (8, -16.0);
+  CHECK_3 (8, -8.0);
+  CHECK_3 (8, -4.0);
+  CHECK_3 (8, -2.0);
+  CHECK_3 (8, -1.0);
+
+  CHECK_3 (8, 0.0);
+
+  CHECK_3 (8, 1.0);
+  CHECK_3 (8, 2.0);
+  CHECK_3 (8, 4.0);
+  CHECK_3 (8, 8.0);
+  CHECK_3 (8, 16.0);
+  CHECK_3 (8, 32.0);
+  CHECK_3 (8, 64.0);
+}
+
+#define VLD1_DUP_F16_4(x) \
+  vld1_dup_f16 (x) \
+
+#define VLD1_DUP_F16_8(x) \
+  vld1q_dup_f16 (x) \
+
+
+#define CHECK_4(LEN, val) \
+  do \
+    { \
+      float16_t x = val; \
+      float16x##LEN##_t a = \
+        VLD1_DUP_F16_##LEN (&x); \
+      for (int i = 0; i < LEN; ++i) \
+        if (VGET_LANE_F16_##LEN (a, i) != val) \
+          abort (); \
+    } \
+  while (0) \
+
+void __attribute__ ((noinline))
+f4 ()
+{
+  CHECK_4 (4, -64.0);
+  CHECK_4 (4, -32.0);
+  CHECK_4 (4, -16.0);
+  CHECK_4 (4, -8.0);
+  CHECK_4 (4, -4.0);
+  CHECK_4 (4, -2.0);
+  CHECK_4 (4, -1.0);
+
+  CHECK_4 (4, 0.0);
+
+  CHECK_4 (4, 1.0);
+  CHECK_4 (4, 2.0);
+  CHECK_4 (4, 4.0);
+  CHECK_4 (4, 8.0);
+  CHECK_4 (4, 16.0);
+  CHECK_4 (4, 32.0);
+  CHECK_4 (4, 64.0);
+
+  CHECK_4 (8, -64.0);
+  CHECK_4 (8, -32.0);
+  CHECK_4 (8, -16.0);
+  CHECK_4 (8, -8.0);
+  CHECK_4 (8, -4.0);
+  CHECK_4 (8, -2.0);
+  CHECK_4 (8, -1.0);
+
+  CHECK_4 (8, 0.0);
+
+  CHECK_4 (8, 1.0);
+  CHECK_4 (8, 2.0);
+  CHECK_4 (8, 4.0);
+  CHECK_4 (8, 8.0);
+  CHECK_4 (8, 16.0);
+  CHECK_4 (8, 32.0);
+  CHECK_4 (8, 64.0);
+}
+
+int
+main ()
+{
+  f1 ();
+  f2 ();
+  f3 ();
+  f4 ();
+  return 0;
+}
+
+/* We are searching for 7 movi for each constant except zero.  The functions f1
+   (), f3 (), f4 () check for both V4HF and V8HF modes, while f2 () checks
+   only for V4HF, hence the 14 directive lines.  The constants are in hex,
+   and the list is here:
+   0xd4 -> -64
+   0xd0 -> -32
+   0xcc -> -16
+   etc...
+   0x4c -> 16
+   0x50 -> 32
+   0x54 -> 64.  */
+
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[48\]\[hH\], ?0xd4, ?lsl 8\n" 7 } } */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[48\]\[hH\], ?0xd0, ?lsl 8\n" 7 } } */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[48\]\[hH\], ?0xcc, ?lsl 8\n" 7 } } */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[48\]\[hH\], ?0xc8, ?lsl 8\n" 7 } } */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[48\]\[hH\], ?0xc4, ?lsl 8\n" 7 } } */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[48\]\[hH\], ?0xc0, ?lsl 8\n" 7 } } */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[48\]\[hH\], ?0xbc, ?lsl 8\n" 7 } } */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[48\]\[hH\], ?0x3c, ?lsl 8\n" 7 } } */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[48\]\[hH\], ?0x40, ?lsl 8\n" 7 } } */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[48\]\[hH\], ?0x44, ?lsl 8\n" 7 } } */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[48\]\[hH\], ?0x48, ?lsl 8\n" 7 } } */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[48\]\[hH\], ?0x4c, ?lsl 8\n" 7 } } */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[48\]\[hH\], ?0x50, ?lsl 8\n" 7 } } */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[48\]\[hH\], ?0x54, ?lsl 8\n" 7 } } */
+
+/* For the constant zero, the instruction emitted is a movi but with a different
+   size for the vector lane.  Also, since f2 () only tests V4HF, we have 1 less
+   case to check for V8HF.  V4HF mode emits v*.2s, V8HF emits v*.4s.  */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[2\]\[sS\], ?0\n" 4 } } */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[4\]\[sS\], ?0\n" 3 } } */