On 17/05/16 15:34, Matthew Wahab wrote:
> The ARMv8.2-A FP16 extension adds a number of instructions to support
> data movement for FP16 values. This patch adds these instructions to the
> backend, making them available to the compiler code generator.
This updates the expected output for the test added by the patch since gcc now generates ldrh/strh for some indexed loads/stores which were previously done with vld1/vst1. Tested the series for arm-none-linux-gnueabihf with native bootstrap and make check and for arm-none-eabi and armeb-none-eabi with make check on an ARMv8.2-A emulator. 2016-07-04 Matthew Wahab <matthew.wa...@arm.com> Jiong Wang <jiong.w...@arm.com> * config/arm/arm.c (coproc_secondary_reload_class): Make HFmode available when FP16 instructions are available. (output_move_vfp): Add support for 16-bit data moves. (arm_validize_comparison): Fix some white-space. Support HFmode by conversion to SFmode. * config/arm/arm.md (truncdfhf2): Fix a comment. (extendhfdf2): Likewise. (cstorehf4): New. (movsicc): Fix some white-space. (movhfcc): New. (movsfcc): Fix some white-space. (*cmovhf): New. * config/arm/vfp.md (*arm_movhi_vfp): Disable when VFP FP16 instructions are available. (*thumb2_movhi_vfp): Likewise. (*arm_movhi_fp16): New. (*thumb2_movhi_fp16): New. (*movhf_vfp_fp16): New. (*movhf_vfp_neon): Disable when VFP FP16 instructions are available. (*movhf_vfp): Likewise. (extendhfsf2): Enable when VFP FP16 instructions are available. (truncsfhf2): Enable when VFP FP16 instructions are available. testsuite/ 2016-07-04 Matthew Wahab <matthew.wa...@arm.com> * gcc.target/arm/armv8_2-fp16-move-1.c: New.
>From 0633bbb2f2d43a6994adaeb44898e18c304ee728 Mon Sep 17 00:00:00 2001 From: Matthew Wahab <matthew.wa...@arm.com> Date: Thu, 7 Apr 2016 13:35:04 +0100 Subject: [PATCH 07/17] [PATCH 7/17][ARM] Add FP16 data movement instructions. 2016-07-04 Matthew Wahab <matthew.wa...@arm.com> Jiong Wang <jiong.w...@arm.com> * config/arm/arm.c (coproc_secondary_reload_class): Make HFmode available when FP16 instructions are available. (output_move_vfp): Add support for 16-bit data moves. (arm_validize_comparison): Fix some white-space. Support HFmode by conversion to SFmode. * config/arm/arm.md (truncdfhf2): Fix a comment. (extendhfdf2): Likewise. (cstorehf4): New. (movsicc): Fix some white-space. (movhfcc): New. (movsfcc): Fix some white-space. (*cmovhf): New. * config/arm/vfp.md (*arm_movhi_vfp): Disable when VFP FP16 instructions are available. (*thumb2_movhi_vfp): Likewise. (*arm_movhi_fp16): New. (*thumb2_movhi_fp16): New. (*movhf_vfp_fp16): New. (*movhf_vfp_neon): Disable when VFP FP16 instructions are available. (*movhf_vfp): Likewise. (extendhfsf2): Enable when VFP FP16 instructions are available. (truncsfhf2): Enable when VFP FP16 instructions are available. testsuite/ 2016-07-04 Matthew Wahab <matthew.wa...@arm.com> * gcc.target/arm/armv8_2-fp16-move-1.c: New.
--- gcc/config/arm/arm.c | 16 +- gcc/config/arm/arm.md | 81 ++++++++- gcc/config/arm/vfp.md | 182 ++++++++++++++++++++- gcc/testsuite/gcc.target/arm/armv8_2-fp16-move-1.c | 165 +++++++++++++++++++ 4 files changed, 432 insertions(+), 12 deletions(-) create mode 100644 gcc/testsuite/gcc.target/arm/armv8_2-fp16-move-1.c diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index ce18f75..f07e2c1 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -13187,7 +13187,7 @@ coproc_secondary_reload_class (machine_mode mode, rtx x, bool wb) { if (mode == HFmode) { - if (!TARGET_NEON_FP16) + if (!TARGET_NEON_FP16 && !TARGET_VFP_FP16INST) return GENERAL_REGS; if (s_register_operand (x, mode) || neon_vector_mem_operand (x, 2, true)) return NO_REGS; @@ -18638,6 +18638,8 @@ output_move_vfp (rtx *operands) rtx reg, mem, addr, ops[2]; int load = REG_P (operands[0]); int dp = GET_MODE_SIZE (GET_MODE (operands[0])) == 8; + int sp = (!TARGET_VFP_FP16INST + || GET_MODE_SIZE (GET_MODE (operands[0])) == 4); int integer_p = GET_MODE_CLASS (GET_MODE (operands[0])) == MODE_INT; const char *templ; char buff[50]; @@ -18684,7 +18686,7 @@ output_move_vfp (rtx *operands) sprintf (buff, templ, load ? "ld" : "st", - dp ? "64" : "32", + dp ? "64" : sp ? "32" : "16", dp ? "P" : "", integer_p ? "\t%@ int" : ""); output_asm_insn (buff, ops); @@ -29326,7 +29328,7 @@ arm_validize_comparison (rtx *comparison, rtx * op1, rtx * op2) { enum rtx_code code = GET_CODE (*comparison); int code_int; - machine_mode mode = (GET_MODE (*op1) == VOIDmode) + machine_mode mode = (GET_MODE (*op1) == VOIDmode) ? GET_MODE (*op2) : GET_MODE (*op1); gcc_assert (GET_MODE (*op1) != VOIDmode || GET_MODE (*op2) != VOIDmode); @@ -29354,6 +29356,14 @@ arm_validize_comparison (rtx *comparison, rtx * op1, rtx * op2) *op2 = force_reg (mode, *op2); return true; + case HFmode: + if (!TARGET_VFP_FP16INST) + break; + /* FP16 comparisons are done in SF mode. 
*/ + mode = SFmode; + *op1 = convert_to_mode (mode, *op1, 1); + *op2 = convert_to_mode (mode, *op2, 1); + /* Fall through. */ case SFmode: case DFmode: if (!arm_float_compare_operand (*op1, mode)) diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md index 21af27c..6a980cd 100644 --- a/gcc/config/arm/arm.md +++ b/gcc/config/arm/arm.md @@ -4854,7 +4854,7 @@ "" ) -/* DFmode -> HFmode conversions have to go through SFmode. */ +;; DFmode to HFmode conversions have to go through SFmode. (define_expand "truncdfhf2" [(set (match_operand:HF 0 "general_operand" "") (float_truncate:HF @@ -5361,7 +5361,7 @@ "" ) -/* HFmode -> DFmode conversions have to go through SFmode. */ +;; HFmode -> DFmode conversions have to go through SFmode. (define_expand "extendhfdf2" [(set (match_operand:DF 0 "general_operand" "") (float_extend:DF (match_operand:HF 1 "general_operand" "")))] @@ -7366,6 +7366,24 @@ DONE; }") +(define_expand "cstorehf4" + [(set (match_operand:SI 0 "s_register_operand") + (match_operator:SI 1 "expandable_comparison_operator" + [(match_operand:HF 2 "s_register_operand") + (match_operand:HF 3 "arm_float_compare_operand")]))] + "TARGET_VFP_FP16INST" + { + if (!arm_validize_comparison (&operands[1], + &operands[2], + &operands[3])) + FAIL; + + emit_insn (gen_cstore_cc (operands[0], operands[1], + operands[2], operands[3])); + DONE; + } +) + (define_expand "cstoresf4" [(set (match_operand:SI 0 "s_register_operand" "") (match_operator:SI 1 "expandable_comparison_operator" @@ -7418,9 +7436,31 @@ rtx ccreg; if (!arm_validize_comparison (&operands[1], &XEXP (operands[1], 0), - &XEXP (operands[1], 1))) + &XEXP (operands[1], 1))) FAIL; - + + code = GET_CODE (operands[1]); + ccreg = arm_gen_compare_reg (code, XEXP (operands[1], 0), + XEXP (operands[1], 1), NULL_RTX); + operands[1] = gen_rtx_fmt_ee (code, VOIDmode, ccreg, const0_rtx); + }" +) + +(define_expand "movhfcc" + [(set (match_operand:HF 0 "s_register_operand") + (if_then_else:HF (match_operand 1 
"arm_cond_move_operator") + (match_operand:HF 2 "s_register_operand") + (match_operand:HF 3 "s_register_operand")))] + "TARGET_VFP_FP16INST" + " + { + enum rtx_code code = GET_CODE (operands[1]); + rtx ccreg; + + if (!arm_validize_comparison (&operands[1], &XEXP (operands[1], 0), + &XEXP (operands[1], 1))) + FAIL; + code = GET_CODE (operands[1]); ccreg = arm_gen_compare_reg (code, XEXP (operands[1], 0), XEXP (operands[1], 1), NULL_RTX); @@ -7439,7 +7479,7 @@ enum rtx_code code = GET_CODE (operands[1]); rtx ccreg; - if (!arm_validize_comparison (&operands[1], &XEXP (operands[1], 0), + if (!arm_validize_comparison (&operands[1], &XEXP (operands[1], 0), &XEXP (operands[1], 1))) FAIL; @@ -7504,6 +7544,37 @@ (set_attr "type" "fcsel")] ) +(define_insn "*cmovhf" + [(set (match_operand:HF 0 "s_register_operand" "=t") + (if_then_else:HF (match_operator 1 "arm_vsel_comparison_operator" + [(match_operand 2 "cc_register" "") (const_int 0)]) + (match_operand:HF 3 "s_register_operand" "t") + (match_operand:HF 4 "s_register_operand" "t")))] + "TARGET_VFP_FP16INST" + "* + { + enum arm_cond_code code = maybe_get_arm_condition_code (operands[1]); + switch (code) + { + case ARM_GE: + case ARM_GT: + case ARM_EQ: + case ARM_VS: + return \"vsel%d1.f16\\t%0, %3, %4\"; + case ARM_LT: + case ARM_LE: + case ARM_NE: + case ARM_VC: + return \"vsel%D1.f16\\t%0, %4, %3\"; + default: + gcc_unreachable (); + } + return \"\"; + }" + [(set_attr "conds" "use") + (set_attr "type" "fcsel")] +) + (define_insn_and_split "*movsicc_insn" [(set (match_operand:SI 0 "s_register_operand" "=r,r,r,r,r,r,r,r") (if_then_else:SI diff --git a/gcc/config/arm/vfp.md b/gcc/config/arm/vfp.md index d7c874a..b1c13fa 100644 --- a/gcc/config/arm/vfp.md +++ b/gcc/config/arm/vfp.md @@ -27,6 +27,7 @@ (match_operand:HI 1 "general_operand" "rIk, K, n, r, mi, r, *t, *t"))] "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP + && !TARGET_VFP_FP16INST && (register_operand (operands[0], HImode) || register_operand (operands[1], 
HImode))" { @@ -76,6 +77,7 @@ (match_operand:HI 1 "general_operand" "rk, I, Py, n, r, m, r, *t, *t"))] "TARGET_THUMB2 && TARGET_HARD_FLOAT && TARGET_VFP + && !TARGET_VFP_FP16INST && (register_operand (operands[0], HImode) || register_operand (operands[1], HImode))" { @@ -111,6 +113,99 @@ (set_attr "length" "2, 4, 2, 4, 4, 4, 4, 4, 4")] ) +;; Patterns for HI moves which provide more data transfer instructions when FP16 +;; instructions are available. +(define_insn "*arm_movhi_fp16" + [(set + (match_operand:HI 0 "nonimmediate_operand" + "=r, r, r, m, r, *t, r, *t") + (match_operand:HI 1 "general_operand" + "rIk, K, n, r, mi, r, *t, *t"))] + "TARGET_ARM && TARGET_VFP_FP16INST + && (register_operand (operands[0], HImode) + || register_operand (operands[1], HImode))" +{ + switch (which_alternative) + { + case 0: + return "mov%?\t%0, %1\t%@ movhi"; + case 1: + return "mvn%?\t%0, #%B1\t%@ movhi"; + case 2: + return "movw%?\t%0, %L1\t%@ movhi"; + case 3: + return "strh%?\t%1, %0\t%@ movhi"; + case 4: + return "ldrh%?\t%0, %1\t%@ movhi"; + case 5: + case 6: + return "vmov%?.f16\t%0, %1\t%@ int"; + case 7: + return "vmov%?.f32\t%0, %1\t%@ int"; + default: + gcc_unreachable (); + } +} + [(set_attr "predicable" "yes") + (set_attr_alternative "type" + [(if_then_else + (match_operand 1 "const_int_operand" "") + (const_string "mov_imm") + (const_string "mov_reg")) + (const_string "mvn_imm") + (const_string "mov_imm") + (const_string "store1") + (const_string "load1") + (const_string "f_mcr") + (const_string "f_mrc") + (const_string "fmov")]) + (set_attr "pool_range" "*, *, *, *, 256, *, *, *") + (set_attr "neg_pool_range" "*, *, *, *, 244, *, *, *") + (set_attr "length" "4")] +) + +(define_insn "*thumb2_movhi_fp16" + [(set + (match_operand:HI 0 "nonimmediate_operand" + "=rk, r, l, r, m, r, *t, r, *t") + (match_operand:HI 1 "general_operand" + "rk, I, Py, n, r, m, r, *t, *t"))] + "TARGET_THUMB2 && TARGET_VFP_FP16INST + && (register_operand (operands[0], HImode) + || 
register_operand (operands[1], HImode))" +{ + switch (which_alternative) + { + case 0: + case 1: + case 2: + return "mov%?\t%0, %1\t%@ movhi"; + case 3: + return "movw%?\t%0, %L1\t%@ movhi"; + case 4: + return "strh%?\t%1, %0\t%@ movhi"; + case 5: + return "ldrh%?\t%0, %1\t%@ movhi"; + case 6: + case 7: + return "vmov%?.f16\t%0, %1\t%@ int"; + case 8: + return "vmov%?.f32\t%0, %1\t%@ int"; + default: + gcc_unreachable (); + } +} + [(set_attr "predicable" "yes") + (set_attr "predicable_short_it" + "yes, no, yes, no, no, no, no, no, no") + (set_attr "type" + "mov_reg, mov_imm, mov_imm, mov_imm, store1, load1,\ + f_mcr, f_mrc, fmov") + (set_attr "pool_range" "*, *, *, *, *, 4094, *, *, *") + (set_attr "neg_pool_range" "*, *, *, *, *, 250, *, *, *") + (set_attr "length" "2, 4, 2, 4, 4, 4, 4, 4, 4")] +) + ;; SImode moves ;; ??? For now do not allow loading constants into vfp regs. This causes ;; problems because small constants get converted into adds. @@ -304,10 +399,87 @@ ) ;; HFmode moves + +(define_insn "*movhf_vfp_fp16" + [(set (match_operand:HF 0 "nonimmediate_operand" + "= r,m,t,r,t,r,t,t,Um,r") + (match_operand:HF 1 "general_operand" + " m,r,t,r,r,t,Dv,Um,t,F"))] + "TARGET_32BIT + && TARGET_VFP_FP16INST + && (s_register_operand (operands[0], HFmode) + || s_register_operand (operands[1], HFmode))" + { + switch (which_alternative) + { + case 0: /* ARM register from memory. */ + return \"ldrh%?\\t%0, %1\\t%@ __fp16\"; + case 1: /* Memory from ARM register. */ + return \"strh%?\\t%1, %0\\t%@ __fp16\"; + case 2: /* S register from S register. */ + return \"vmov\\t%0, %1\t%@ __fp16\"; + case 3: /* ARM register from ARM register. */ + return \"mov%?\\t%0, %1\\t%@ __fp16\"; + case 4: /* S register from ARM register. */ + case 5: /* ARM register from S register. */ + case 6: /* S register from immediate. */ + return \"vmov.f16\\t%0, %1\t%@ __fp16\"; + case 7: /* S register from memory. */ + return \"vld1.16\\t{%z0}, %A1\"; + case 8: /* Memory from S register. 
*/ + return \"vst1.16\\t{%z1}, %A0\"; + case 9: /* ARM register from constant. */ + { + long bits; + rtx ops[4]; + + bits = real_to_target (NULL, CONST_DOUBLE_REAL_VALUE (operands[1]), + HFmode); + ops[0] = operands[0]; + ops[1] = GEN_INT (bits); + ops[2] = GEN_INT (bits & 0xff00); + ops[3] = GEN_INT (bits & 0x00ff); + + if (arm_arch_thumb2) + output_asm_insn (\"movw\\t%0, %1\", ops); + else + output_asm_insn (\"mov\\t%0, %2\;orr\\t%0, %0, %3\", ops); + return \"\"; + } + default: + gcc_unreachable (); + } + } + [(set_attr "predicable" "yes, yes, no, yes, no, no, no, no, no, no") + (set_attr "predicable_short_it" "no, no, no, yes,\ + no, no, no, no,\ + no, no") + (set_attr_alternative "type" + [(const_string "load1") (const_string "store1") + (const_string "fmov") (const_string "mov_reg") + (const_string "f_mcr") (const_string "f_mrc") + (const_string "fconsts") (const_string "neon_load1_1reg") + (const_string "neon_store1_1reg") + (if_then_else (match_test "arm_arch_thumb2") + (const_string "mov_imm") + (const_string "multiple"))]) + (set_attr_alternative "length" + [(const_int 4) (const_int 4) + (const_int 4) (const_int 4) + (const_int 4) (const_int 4) + (const_int 4) (const_int 4) + (const_int 4) + (if_then_else (match_test "arm_arch_thumb2") + (const_int 4) + (const_int 8))])] +) + (define_insn "*movhf_vfp_neon" [(set (match_operand:HF 0 "nonimmediate_operand" "= t,Um,r,m,t,r,t,r,r") (match_operand:HF 1 "general_operand" " Um, t,m,r,t,r,r,t,F"))] - "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_NEON_FP16 + "TARGET_32BIT + && TARGET_HARD_FLOAT && TARGET_NEON_FP16 + && !TARGET_VFP_FP16INST && ( s_register_operand (operands[0], HFmode) || s_register_operand (operands[1], HFmode))" "* @@ -361,8 +533,10 @@ (define_insn "*movhf_vfp" [(set (match_operand:HF 0 "nonimmediate_operand" "=r,m,t,r,t,r,r") (match_operand:HF 1 "general_operand" " m,r,t,r,r,t,F"))] - "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP + "TARGET_32BIT + && TARGET_HARD_FLOAT && TARGET_VFP && 
!TARGET_NEON_FP16 + && !TARGET_VFP_FP16INST && ( s_register_operand (operands[0], HFmode) || s_register_operand (operands[1], HFmode))" "* @@ -1095,7 +1269,7 @@ (define_insn "extendhfsf2" [(set (match_operand:SF 0 "s_register_operand" "=t") (float_extend:SF (match_operand:HF 1 "s_register_operand" "t")))] - "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FP16" + "TARGET_32BIT && TARGET_HARD_FLOAT && (TARGET_FP16 || TARGET_VFP_FP16INST)" "vcvtb%?.f32.f16\\t%0, %1" [(set_attr "predicable" "yes") (set_attr "predicable_short_it" "no") @@ -1105,7 +1279,7 @@ (define_insn "truncsfhf2" [(set (match_operand:HF 0 "s_register_operand" "=t") (float_truncate:HF (match_operand:SF 1 "s_register_operand" "t")))] - "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FP16" + "TARGET_32BIT && TARGET_HARD_FLOAT && (TARGET_FP16 || TARGET_VFP_FP16INST)" "vcvtb%?.f16.f32\\t%0, %1" [(set_attr "predicable" "yes") (set_attr "predicable_short_it" "no") diff --git a/gcc/testsuite/gcc.target/arm/armv8_2-fp16-move-1.c b/gcc/testsuite/gcc.target/arm/armv8_2-fp16-move-1.c new file mode 100644 index 0000000..951da23 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/armv8_2-fp16-move-1.c @@ -0,0 +1,165 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_ok } */ +/* { dg-options "-O2" } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ + +__fp16 +test_load_1 (__fp16* a) +{ + return *a; +} + +__fp16 +test_load_2 (__fp16* a, int i) +{ + return a[i]; +} + +/* { dg-final { scan-assembler-times {vld1\.16\t\{d[0-9]+\[[0-9]+\]\}, \[r[0-9]\]+} 1 } } */ + +void +test_store_1 (__fp16* a, __fp16 b) +{ + *a = b; +} + +void +test_store_2 (__fp16* a, int i, __fp16 b) +{ + a[i] = b; +} + +/* { dg-final { scan-assembler-times {vst1\.16\t\{d[0-9]+\[[0-9]+\]\}, \[r[0-9]\]+} 1 } } */ + +__fp16 +test_load_store_1 (__fp16* a, int i, __fp16* b) +{ + a[i] = b[i]; +} + +__fp16 +test_load_store_2 (__fp16* a, int i, __fp16* b) +{ + a[i] = b[i + 2]; + return a[i]; +} +/* { dg-final { scan-assembler-times 
{ldrh\tr[0-9]+} 3 } } */ +/* { dg-final { scan-assembler-times {strh\tr[0-9]+} 3 } } */ + +__fp16 +test_select_1 (int sel, __fp16 a, __fp16 b) +{ + if (sel) + return a; + else + return b; +} + +__fp16 +test_select_2 (int sel, __fp16 a, __fp16 b) +{ + return sel ? a : b; +} + +__fp16 +test_select_3 (__fp16 a, __fp16 b, __fp16 c) +{ + return (a == b) ? b : c; +} + +__fp16 +test_select_4 (__fp16 a, __fp16 b, __fp16 c) +{ + return (a != b) ? b : c; +} + +__fp16 +test_select_5 (__fp16 a, __fp16 b, __fp16 c) +{ + return (a < b) ? b : c; +} + +__fp16 +test_select_6 (__fp16 a, __fp16 b, __fp16 c) +{ + return (a <= b) ? b : c; +} + +__fp16 +test_select_7 (__fp16 a, __fp16 b, __fp16 c) +{ + return (a > b) ? b : c; +} + +__fp16 +test_select_8 (__fp16 a, __fp16 b, __fp16 c) +{ + return (a >= b) ? b : c; +} + +/* { dg-final { scan-assembler-times {vseleq\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 4 } } */ +/* { dg-final { scan-assembler-times {vselgt\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 1 } } */ +/* { dg-final { scan-assembler-times {vselge\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 1 } } */ + +/* { dg-final { scan-assembler-times {vmov\.f16\ts[0-9]+, r[0-9]+} 5 } } */ +/* { dg-final { scan-assembler-times {vmov\.f16\tr[0-9]+, s[0-9]+} 5 } } */ + +int +test_compare_1 (__fp16 a, __fp16 b) +{ + if (a == b) + return -1; + else + return 0; +} + +int +test_compare_ (__fp16 a, __fp16 b) +{ + if (a != b) + return -1; + else + return 0; +} + +int +test_compare_2 (__fp16 a, __fp16 b) +{ + if (a > b) + return -1; + else + return 0; +} + +int +test_compare_3 (__fp16 a, __fp16 b) +{ + if (a >= b) + return -1; + else + return 0; +} + +int +test_compare_4 (__fp16 a, __fp16 b) +{ + if (a < b) + return -1; + else + return 0; +} + +int +test_compare_5 (__fp16 a, __fp16 b) +{ + if (a <= b) + return -1; + else + return 0; +} + +/* { dg-final { scan-assembler-not {vcmp\.f16} } } */ +/* { dg-final { scan-assembler-not {vcmpe\.f16} } } */ + +/* { dg-final { scan-assembler-times {vcmp\.f32} 4 } } */ +/* { dg-final { 
scan-assembler-times {vcmpe\.f32} 8 } } */ -- 2.1.4