This patch adds support for loading vector 16-bit floating-point immediates (the V*HF modes) with a movi instruction. We reuse the existing code that looks for a repeating 8-bit pattern in the 64/128-bit value formed by concatenating the bit patterns of the vector's constant elements. This lets us load a variety of constants, since the movi encoding also provides an immediate left shift of up to 24 bits (in multiples of 8). A new testcase checks both that movi instructions are emitted and that the results are correct.
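To illustrate the idea (this example is not part of the patch, and the function name is purely illustrative): the IEEE half-precision encoding of 2.0 is 0x4000, so a V4HF vector of four 2.0s concatenates to the 64-bit pattern 0x4000400040004000, which the existing matcher recognises as the repeating byte 0x40 with a left shift of 8. A rough sketch of the code generation we would expect with this patch applied, assuming a toolchain where the arm_neon.h FP16 vector types are available:

    #include <arm_neon.h>

    /* All four lanes are 2.0; the FP16 bit pattern of 2.0 is 0x4000, so the
       replicated 64-bit pattern is 0x4000400040004000, i.e. the byte 0x40
       shifted left by 8 in every 16-bit lane.  */
    float16x4_t
    two_v4hf (void)
    {
      float16x4_t a = {2.0, 2.0, 2.0, 2.0};
      return a;
    }

    /* Expected code generation (roughly):
         movi    v0.4h, 0x40, lsl 8
         ret  */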
Tested on aarch64-none-elf and aarch64_be-none-elf, bootstrapped on aarch64-none-linux-gnu.

---

gcc/

2015-XX-XX  Bilyan Borisov  <bilyan.bori...@arm.com>

	* config/aarch64/aarch64.c (aarch64_simd_container_mode): Add
	HFmode cases.
	(aarch64_vect_float_const_representable_p): Update comment.
	(aarch64_simd_valid_immediate): Add support for V*HF arguments.
	(aarch64_output_simd_mov_immediate): Add check for HFmode.

gcc/testsuite/

2015-XX-XX  Bilyan Borisov  <bilyan.bori...@arm.com>

	* gcc.target/aarch64/fp16/f16_mov_immediate_simd_1.c: New.
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index ae4cfb336a827a63a6baadefcb5646a9dbfb7523..bb6fce0a829d634a7694710e8a8c9a1c3e841abd 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -10250,6 +10250,8 @@ aarch64_simd_container_mode (machine_mode mode, unsigned width)
             return V2DFmode;
           case SFmode:
             return V4SFmode;
+          case HFmode:
+            return V8HFmode;
           case SImode:
             return V4SImode;
           case HImode:
@@ -10266,6 +10268,8 @@ aarch64_simd_container_mode (machine_mode mode, unsigned width)
           {
           case SFmode:
             return V2SFmode;
+          case HFmode:
+            return V4HFmode;
           case SImode:
             return V2SImode;
           case HImode:
@@ -10469,7 +10473,12 @@ sizetochar (int size)
 /* Return true iff x is a uniform vector of floating-point
    constants, and the constant can be represented in
    quarter-precision form.  Note, as aarch64_float_const_representable
-   rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0.  */
+   rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0.
+   Also note that this won't ever be called for V*HFmode vectors,
+   since in aarch64_simd_valid_immediate () we check for the mode
+   and handle these vector types differently from other floating
+   point vector modes.  */
+
 static bool
 aarch64_vect_float_const_representable_p (rtx x)
 {
@@ -10505,7 +10514,10 @@ aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
   unsigned int invmask = inverse ? 0xff : 0;
   int eshift, emvn;

-  if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
+  /* Ignore V*HFmode vectors, they are handled below with the integer
+     code.  */
+  if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT
+      && GET_MODE_INNER (mode) != HFmode)
     {
       if (! (aarch64_simd_imm_zero_p (op, mode)
              || aarch64_vect_float_const_representable_p (op)))
@@ -10530,15 +10542,26 @@ aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
       rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
       unsigned HOST_WIDE_INT elpart;

-      gcc_assert (CONST_INT_P (el));
-      elpart = INTVAL (el);
+      if (CONST_INT_P (el))
+        elpart = INTVAL (el);
+      /* Convert HFmode vector element to bit pattern.  Logic below will catch
+         most common constants since for FP16 the sign and exponent are in the
+         top 6 bits and a movi with a left shift of 8 will catch all powers
+         of 2 that fit in a 16 bit floating point, and the 2 extra bits left
+         for the mantissa can cover some more non-power of 2 constants.  With
+         a 0 left shift, we can cover constants of the form 1.xxx since we have
+         8 bits only for the mantissa.  */
+      else if (CONST_DOUBLE_P (el) && GET_MODE_INNER (mode) == HFmode)
+        elpart =
+          real_to_target (NULL, CONST_DOUBLE_REAL_VALUE (el), HFmode);
+      else
+        gcc_unreachable ();

       for (unsigned int byte = 0; byte < innersize; byte++)
         {
           bytes[idx++] = (elpart & 0xff) ^ invmask;
           elpart >>= BITS_PER_UNIT;
         }
-
     }

   /* Sanity check.  */
@@ -11913,7 +11936,10 @@ aarch64_output_simd_mov_immediate (rtx const_vector,
   lane_count = width / info.element_width;

   mode = GET_MODE_INNER (mode);
-  if (GET_MODE_CLASS (mode) == MODE_FLOAT)
+  /* We handle HFmode vectors separately from the other floating point
+     vector modes.  See aarch64_simd_valid_immediate (), but in short
+     we use a movi instruction rather than a fmov.  */
+  if (GET_MODE_CLASS (mode) == MODE_FLOAT && mode != HFmode)
     {
       gcc_assert (info.shift == 0 && ! info.mvn);
       /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
diff --git a/gcc/testsuite/gcc.target/aarch64/fp16/f16_mov_immediate_simd_1.c b/gcc/testsuite/gcc.target/aarch64/fp16/f16_mov_immediate_simd_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..4533a888a43773a92be2f120f30353b7b23c9ab5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fp16/f16_mov_immediate_simd_1.c
@@ -0,0 +1,262 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps" } */
+
+#include "arm_neon.h"
+
+extern void abort ();
+
+#define VAL_4(val) \
+  val, val, val, val \
+
+#define VAL_8(val) \
+  val, val, val, val, val, val, val, val \
+
+
+#define CHECK_1(LEN, val) \
+  do \
+    { \
+      float16x##LEN##_t a = {VAL_##LEN (val)}; \
+      for (int i = 0; i < LEN; ++i) \
+        if (a[i] != val) \
+          abort (); \
+    } \
+  while (0) \
+
+void __attribute__ ((noinline))
+f1 ()
+{
+  CHECK_1 (4, -64.0);
+  CHECK_1 (4, -32.0);
+  CHECK_1 (4, -16.0);
+  CHECK_1 (4, -8.0);
+  CHECK_1 (4, -4.0);
+  CHECK_1 (4, -2.0);
+  CHECK_1 (4, -1.0);
+
+  CHECK_1 (4, 0.0);
+
+  CHECK_1 (4, 1.0);
+  CHECK_1 (4, 2.0);
+  CHECK_1 (4, 4.0);
+  CHECK_1 (4, 8.0);
+  CHECK_1 (4, 16.0);
+  CHECK_1 (4, 32.0);
+  CHECK_1 (4, 64.0);
+
+  CHECK_1 (8, -64.0);
+  CHECK_1 (8, -32.0);
+  CHECK_1 (8, -16.0);
+  CHECK_1 (8, -8.0);
+  CHECK_1 (8, -4.0);
+  CHECK_1 (8, -2.0);
+  CHECK_1 (8, -1.0);
+
+  CHECK_1 (8, 0.0);
+
+  CHECK_1 (8, 1.0);
+  CHECK_1 (8, 2.0);
+  CHECK_1 (8, 4.0);
+  CHECK_1 (8, 8.0);
+  CHECK_1 (8, 16.0);
+  CHECK_1 (8, 32.0);
+  CHECK_1 (8, 64.0);
+}
+
+#define CHECK_2(LEN, val1, val2) \
+  do \
+    { \
+      float16x##LEN##_t a = \
+        vcreate_f16 (__AARCH64_UINT64_C (val1)); \
+      for (int i = 0; i < LEN; ++i) \
+        if (vget_lane_f16 (a, i) != val2) \
+          abort (); \
+    } \
+  while (0) \
+
+void __attribute__ ((noinline))
+f2 ()
+{
+  CHECK_2 (4, 0xd400d400d400d400, -64.0);
+  CHECK_2 (4, 0xd000d000d000d000, -32.0);
+  CHECK_2 (4, 0xcc00cc00cc00cc00, -16.0);
+  CHECK_2 (4, 0xc800c800c800c800, -8.0);
+  CHECK_2 (4, 0xc400c400c400c400, -4.0);
+  CHECK_2 (4, 0xc000c000c000c000, -2.0);
+  CHECK_2 (4, 0xbc00bc00bc00bc00, -1.0);
+
+  CHECK_2 (4, 0, 0.0);
+
+  CHECK_2 (4, 0x3c003c003c003c00, 1.0);
+  CHECK_2 (4, 0x4000400040004000, 2.0);
+  CHECK_2 (4, 0x4400440044004400, 4.0);
+  CHECK_2 (4, 0x4800480048004800, 8.0);
+  CHECK_2 (4, 0x4c004c004c004c00, 16.0);
+  CHECK_2 (4, 0x5000500050005000, 32.0);
+  CHECK_2 (4, 0x5400540054005400, 64.0);
+}
+
+#define VGET_LANE_F16_4(a, i) \
+  vget_lane_f16 (a, i) \
+
+#define VGET_LANE_F16_8(a, i) \
+  vgetq_lane_f16 (a, i) \
+
+#define VLD1_F16_4(x) \
+  vld1_f16 (x) \
+
+#define VLD1_F16_8(x) \
+  vld1q_f16 (x) \
+
+#define CHECK_3(LEN, val) \
+  do \
+    { \
+      float16_t x[] = {VAL_##LEN (val)}; \
+      float16x##LEN##_t a = \
+        VLD1_F16_##LEN (x); \
+      for (int i = 0; i < LEN; ++i) \
+        if (VGET_LANE_F16_##LEN (a, i) != val) \
+          abort (); \
+    } \
+  while (0) \
+
+void __attribute__ ((noinline))
+f3 ()
+{
+  CHECK_3 (4, -64.0);
+  CHECK_3 (4, -32.0);
+  CHECK_3 (4, -16.0);
+  CHECK_3 (4, -8.0);
+  CHECK_3 (4, -4.0);
+  CHECK_3 (4, -2.0);
+  CHECK_3 (4, -1.0);
+
+  CHECK_3 (4, 0.0);
+
+  CHECK_3 (4, 1.0);
+  CHECK_3 (4, 2.0);
+  CHECK_3 (4, 4.0);
+  CHECK_3 (4, 8.0);
+  CHECK_3 (4, 16.0);
+  CHECK_3 (4, 32.0);
+  CHECK_3 (4, 64.0);
+
+  CHECK_3 (8, -64.0);
+  CHECK_3 (8, -32.0);
+  CHECK_3 (8, -16.0);
+  CHECK_3 (8, -8.0);
+  CHECK_3 (8, -4.0);
+  CHECK_3 (8, -2.0);
+  CHECK_3 (8, -1.0);
+
+  CHECK_3 (8, 0.0);
+
+  CHECK_3 (8, 1.0);
+  CHECK_3 (8, 2.0);
+  CHECK_3 (8, 4.0);
+  CHECK_3 (8, 8.0);
+  CHECK_3 (8, 16.0);
+  CHECK_3 (8, 32.0);
+  CHECK_3 (8, 64.0);
+}
+
+#define VLD1_DUP_F16_4(x) \
+  vld1_dup_f16 (x) \
+
+#define VLD1_DUP_F16_8(x) \
+  vld1q_dup_f16 (x) \
+
+
+#define CHECK_4(LEN, val) \
+  do \
+    { \
+      float16_t x = val; \
+      float16x##LEN##_t a = \
+        VLD1_DUP_F16_##LEN (&x); \
+      for (int i = 0; i < LEN; ++i) \
+        if (VGET_LANE_F16_##LEN (a, i) != val) \
+          abort (); \
+    } \
+  while (0) \
+
+void __attribute__ ((noinline))
+f4 ()
+{
+  CHECK_4 (4, -64.0);
+  CHECK_4 (4, -32.0);
+  CHECK_4 (4, -16.0);
+  CHECK_4 (4, -8.0);
+  CHECK_4 (4, -4.0);
+  CHECK_4 (4, -2.0);
+  CHECK_4 (4, -1.0);
+
+  CHECK_4 (4, 0.0);
+
+  CHECK_4 (4, 1.0);
+  CHECK_4 (4, 2.0);
+  CHECK_4 (4, 4.0);
+  CHECK_4 (4, 8.0);
+  CHECK_4 (4, 16.0);
+  CHECK_4 (4, 32.0);
+  CHECK_4 (4, 64.0);
+
+  CHECK_4 (8, -64.0);
+  CHECK_4 (8, -32.0);
+  CHECK_4 (8, -16.0);
+  CHECK_4 (8, -8.0);
+  CHECK_4 (8, -4.0);
+  CHECK_4 (8, -2.0);
+  CHECK_4 (8, -1.0);
+
+  CHECK_4 (8, 0.0);
+
+  CHECK_4 (8, 1.0);
+  CHECK_4 (8, 2.0);
+  CHECK_4 (8, 4.0);
+  CHECK_4 (8, 8.0);
+  CHECK_4 (8, 16.0);
+  CHECK_4 (8, 32.0);
+  CHECK_4 (8, 64.0);
+}
+
+int
+main ()
+{
+  f1 ();
+  f2 ();
+  f3 ();
+  f4 ();
+  return 0;
+}
+
+/* We are searching for 7 movi for each constant except zero.  The functions f1
+   (), f3 (), f4 () check for both V4HF and V8HF modes, while f2 () checks
+   only for V4HF, hence the 14 directive lines.  The constants are in hex,
+   and the list is here:
+   0xd4 -> -64
+   0xd0 -> -32
+   0xcc -> -16
+   etc...
+   0x4c -> 16
+   0x50 -> 32
+   0x54 -> 64.  */
+
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[48\]\[hH\], ?0xd4, ?lsl 8\n" 7 } } */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[48\]\[hH\], ?0xd0, ?lsl 8\n" 7 } } */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[48\]\[hH\], ?0xcc, ?lsl 8\n" 7 } } */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[48\]\[hH\], ?0xc8, ?lsl 8\n" 7 } } */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[48\]\[hH\], ?0xc4, ?lsl 8\n" 7 } } */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[48\]\[hH\], ?0xc0, ?lsl 8\n" 7 } } */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[48\]\[hH\], ?0xbc, ?lsl 8\n" 7 } } */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[48\]\[hH\], ?0x3c, ?lsl 8\n" 7 } } */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[48\]\[hH\], ?0x40, ?lsl 8\n" 7 } } */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[48\]\[hH\], ?0x44, ?lsl 8\n" 7 } } */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[48\]\[hH\], ?0x48, ?lsl 8\n" 7 } } */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[48\]\[hH\], ?0x4c, ?lsl 8\n" 7 } } */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[48\]\[hH\], ?0x50, ?lsl 8\n" 7 } } */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[48\]\[hH\], ?0x54, ?lsl 8\n" 7 } } */
+
+/* For the constant zero, the instruction emitted is a movi but with a different
+   size for the vector lane.  Also, since f2 () only tests V4HF, we have 1 less
+   case to check for V8HF.  V4HF mode emits v*.2s, V8HF emits v*.4s.  */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[2\]\[sS\], ?0\n" 4 } } */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[4\]\[sS\], ?0\n" 3 } } */