On Tue, Oct 26, 2021 at 5:51 PM Hongyu Wang via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > Hi, > > For _Float16 type, add insn and expanders to optimize x / y to > x * rcp (y), and x / sqrt (y) to x * rsqrt (y). > As half-precision floats have only a minor precision difference between div and > mul * rcp, there is no need for Newton-Raphson approximation. > > Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,} and sde. > Ok for master? Ok. > > gcc/ChangeLog: > > * config/i386/i386.c (use_rsqrt_p): Add mode parameter, enable > HFmode rsqrt without TARGET_SSE_MATH. > (ix86_optab_supported_p): Refactor rint, adjust floor, ceil, > btrunc condition to be restricted by -ftrapping-math, adjust > use_rsqrt_p function call. > * config/i386/i386.md (rcphf2): New define_insn. > (rsqrthf2): Likewise. > * config/i386/sse.md (div<mode>3): Change VF2H to VF2. > (div<mode>3): New expander for HF mode. > (rsqrt<mode>2): Likewise. > (*avx512fp16_vmrcpv8hf2): New define_insn for rpad pass. > (*avx512fp16_vmrsqrtv8hf2): Likewise. > > gcc/testsuite/ChangeLog: > > * gcc.target/i386/avx512fp16-recip-1.c: New test. > * gcc.target/i386/avx512fp16-recip-2.c: Ditto. > * gcc.target/i386/pr102464.c: Add -fno-trapping-math. > --- > gcc/config/i386/i386.c | 29 +++--- > gcc/config/i386/i386.md | 44 ++++++++- > gcc/config/i386/sse.md | 63 +++++++++++- > .../gcc.target/i386/avx512fp16-recip-1.c | 43 ++++++++ > .../gcc.target/i386/avx512fp16-recip-2.c | 97 +++++++++++++++++++ > gcc/testsuite/gcc.target/i386/pr102464.c | 2 +- > 6 files changed, 258 insertions(+), 20 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-recip-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-recip-2.c > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > index 299e1ab2621..c5789365d3b 100644 > --- a/gcc/config/i386/i386.c > +++ b/gcc/config/i386/i386.c > @@ -18905,9 +18905,10 @@ ix86_vectorize_builtin_scatter (const_tree vectype, > 1.0/sqrt. 
*/ > > static bool > -use_rsqrt_p () > +use_rsqrt_p (machine_mode mode) > { > - return (TARGET_SSE && TARGET_SSE_MATH > + return ((mode == HFmode > + || (TARGET_SSE && TARGET_SSE_MATH)) > && flag_finite_math_only > && !flag_trapping_math > && flag_unsafe_math_optimizations); > @@ -23603,29 +23604,27 @@ ix86_optab_supported_p (int op, machine_mode mode1, > machine_mode, > return opt_type == OPTIMIZE_FOR_SPEED; > > case rint_optab: > - if (mode1 == HFmode) > - return true; > - else if (SSE_FLOAT_MODE_P (mode1) > - && TARGET_SSE_MATH > - && !flag_trapping_math > - && !TARGET_SSE4_1) > + if (SSE_FLOAT_MODE_P (mode1) > + && TARGET_SSE_MATH > + && !flag_trapping_math > + && !TARGET_SSE4_1 > + && mode1 != HFmode) > return opt_type == OPTIMIZE_FOR_SPEED; > return true; > > case floor_optab: > case ceil_optab: > case btrunc_optab: > - if (mode1 == HFmode) > - return true; > - else if (SSE_FLOAT_MODE_P (mode1) > - && TARGET_SSE_MATH > - && !flag_trapping_math > - && TARGET_SSE4_1) > + if (((SSE_FLOAT_MODE_P (mode1) > + && TARGET_SSE_MATH > + && TARGET_SSE4_1) > + || mode1 == HFmode) > + && !flag_trapping_math) > return true; > return opt_type == OPTIMIZE_FOR_SPEED; > > case rsqrt_optab: > - return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (); > + return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1); > > default: > return true; > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md > index e733a40fc90..11535df5425 100644 > --- a/gcc/config/i386/i386.md > +++ b/gcc/config/i386/i386.md > @@ -8417,11 +8417,27 @@ > (match_operand:XF 2 "register_operand")))] > "TARGET_80387") > > +/* There is no more precision loss than Newton-Rhapson approximation > + when using HFmode rcp/rsqrt, so do the transformation directly under > + TARGET_RECIP_DIV and fast-math. 
*/ > (define_expand "divhf3" > [(set (match_operand:HF 0 "register_operand") > (div:HF (match_operand:HF 1 "register_operand") > (match_operand:HF 2 "nonimmediate_operand")))] > - "TARGET_AVX512FP16") > + "TARGET_AVX512FP16" > +{ > + if (TARGET_RECIP_DIV > + && optimize_insn_for_speed_p () > + && flag_finite_math_only && !flag_trapping_math > + && flag_unsafe_math_optimizations) > + { > + rtx op = gen_reg_rtx (HFmode); > + operands[2] = force_reg (HFmode, operands[2]); > + emit_insn (gen_rcphf2 (op, operands[2])); > + emit_insn (gen_mulhf3 (operands[0], operands[1], op)); > + DONE; > + } > +}) > > (define_expand "div<mode>3" > [(set (match_operand:MODEF 0 "register_operand") > @@ -16973,6 +16989,19 @@ > ] > (symbol_ref "true")))]) > > +(define_insn "rcphf2" > + [(set (match_operand:HF 0 "register_operand" "=v,v") > + (unspec:HF [(match_operand:HF 1 "nonimmediate_operand" "v,m")] > + UNSPEC_RCP))] > + "TARGET_AVX512FP16" > + "@ > + vrcpsh\t{%d1, %0|%0, %d1} > + vrcpsh\t{%1, %d0|%d0, %1}" > + [(set_attr "type" "sse") > + (set_attr "prefix" "evex") > + (set_attr "mode" "HF") > + (set_attr "avx_partial_xmm_update" "false,true")]) > + > (define_insn "*fop_xf_1_i387" > [(set (match_operand:XF 0 "register_operand" "=f,f") > (match_operator:XF 3 "binary_fp_operator" > @@ -17230,6 +17259,19 @@ > DONE; > }) > > +(define_insn "rsqrthf2" > + [(set (match_operand:HF 0 "register_operand" "=v,v") > + (unspec:HF [(match_operand:HF 1 "nonimmediate_operand" "v,m")] > + UNSPEC_RSQRT))] > + "TARGET_AVX512FP16" > + "@ > + vrsqrtsh\t{%d1, %0|%0, %d1} > + vrsqrtsh\t{%1, %d0|%d0, %1}" > + [(set_attr "type" "sse") > + (set_attr "prefix" "evex") > + (set_attr "avx_partial_xmm_update" "false,true") > + (set_attr "mode" "HF")]) > + > (define_insn "sqrthf2" > [(set (match_operand:HF 0 "register_operand" "=v,v") > (sqrt:HF > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md > index 431236ab3a4..0d87aeb75a1 100644 > --- a/gcc/config/i386/sse.md > +++ b/gcc/config/i386/sse.md > @@ 
-2306,11 +2306,33 @@ > (set_attr "mode" "<ssescalarmode>")]) > > (define_expand "div<mode>3" > - [(set (match_operand:VF2H 0 "register_operand") > - (div:VF2H (match_operand:VF2H 1 "register_operand") > - (match_operand:VF2H 2 "vector_operand")))] > + [(set (match_operand:VF2 0 "register_operand") > + (div:VF2 (match_operand:VF2 1 "register_operand") > + (match_operand:VF2 2 "vector_operand")))] > "TARGET_SSE2") > > +(define_expand "div<mode>3" > + [(set (match_operand:VF_AVX512FP16VL 0 "register_operand") > + (div:VF_AVX512FP16VL > + (match_operand:VF_AVX512FP16VL 1 "register_operand") > + (match_operand:VF_AVX512FP16VL 2 "vector_operand")))] > + "TARGET_AVX512FP16" > +{ > + /* Transform HF vector div to vector mul/rcp. */ > + if (GET_MODE_INNER (<MODE>mode) == HFmode > + && TARGET_RECIP_VEC_DIV > + && optimize_insn_for_speed_p () > + && flag_finite_math_only && !flag_trapping_math > + && flag_unsafe_math_optimizations) > + { > + rtx op = gen_reg_rtx (<MODE>mode); > + operands[2] = force_reg (<MODE>mode, operands[2]); > + emit_insn (gen_avx512fp16_rcp<mode>2 (op, operands[2])); > + emit_insn (gen_mul<mode>3 (operands[0], operands[1], op)); > + DONE; > + } > +}) > + > (define_expand "div<mode>3" > [(set (match_operand:VF1 0 "register_operand") > (div:VF1 (match_operand:VF1 1 "register_operand") > @@ -2433,6 +2455,20 @@ > (set_attr "prefix" "evex") > (set_attr "mode" "HF")]) > > +(define_insn "*avx512fp16_vmrcpv8hf2" > + [(set (match_operand:V8HF 0 "register_operand" "=v") > + (vec_merge:V8HF > + (vec_duplicate:V8HF > + (unspec:HF [(match_operand:HF 1 "nonimmediate_operand" "vm")] > + UNSPEC_RCP)) > + (match_operand:V8HF 2 "register_operand" "v") > + (const_int 1)))] > + "TARGET_AVX512FP16" > + "vrcpsh\t{%1, %2, %0|%0, %2, %w1}" > + [(set_attr "type" "sse") > + (set_attr "prefix" "evex") > + (set_attr "mode" "HF")]) > + > (define_insn "<mask_codefor>rcp14<mode><mask_name>" > [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v") > (unspec:VF_AVX512VL > @@ 
-2558,6 +2594,13 @@ > DONE; > }) > > +(define_expand "rsqrt<mode>2" > + [(set (match_operand:VF_AVX512FP16VL 0 "register_operand") > + (unspec:VF_AVX512FP16VL > + [(match_operand:VF_AVX512FP16VL 1 "vector_operand")] > + UNSPEC_RSQRT))] > + "TARGET_AVX512FP16") > + > (define_insn "<sse>_rsqrt<mode>2" > [(set (match_operand:VF1_128_256 0 "register_operand" "=x") > (unspec:VF1_128_256 > @@ -2666,6 +2709,20 @@ > (set_attr "prefix" "evex") > (set_attr "mode" "HF")]) > > +(define_insn "*avx512fp16_vmrsqrtv8hf2" > + [(set (match_operand:V8HF 0 "register_operand" "=v") > + (vec_merge:V8HF > + (vec_duplicate:V8HF > + (unspec:HF [(match_operand:HF 1 "nonimmediate_operand" "vm")] > + UNSPEC_RSQRT)) > + (match_operand:V8HF 2 "register_operand" "v") > + (const_int 1)))] > + "TARGET_AVX512FP16" > + "vrsqrtsh\t{%1, %2, %0<mask_scalar_operand3>|%0<mask_scalar_operand3>, %2, > %w1}" > + [(set_attr "type" "sse") > + (set_attr "prefix" "evex") > + (set_attr "mode" "HF")]) > + > (define_expand "cond_<code><mode>" > [(set (match_operand:VFH 0 "register_operand") > (vec_merge:VFH > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-recip-1.c > b/gcc/testsuite/gcc.target/i386/avx512fp16-recip-1.c > new file mode 100644 > index 00000000000..bc7cbbc11b9 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-recip-1.c > @@ -0,0 +1,43 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -mavx512fp16 -mavx512vl -ffast-math" } */ > +/* { dg-final { scan-assembler "vrcpsh.*\n.*vmulsh" } } */ > +/* { dg-final { scan-assembler "vrcpph.*\n.*vmulph" } } */ > +/* { dg-final { scan-assembler "vrsqrtsh.*\n.*vmulsh" } } */ > +/* { dg-final { scan-assembler "vrsqrtph.*\n.*vmulph" } } */ > +/* { dg-final { scan-assembler-not "vsqrtsh" } } */ > +/* { dg-final { scan-assembler-not "vsqrtph" } } */ > +/* { dg-final { scan-assembler-not "vdivsh" } } */ > +/* { dg-final { scan-assembler-not "vdivph" } } */ > + > +#define FAST_ATTR \ > + __attribute__((noinline, noclone, optimize("fast-math"), 
target("recip"))) > + > +_Float16 FAST_ATTR > +scalar_hf_rcp_fast (_Float16 a, _Float16 b) > +{ > + return a / b; > +} > + > +_Float16 FAST_ATTR > +scalar_hf_rsqrt_fast (_Float16 a, _Float16 b) > +{ > + return a / __builtin_sqrtf16 (b); > +} > + > +void FAST_ATTR > +vector_hf_rcp_fast (_Float16 * restrict a, _Float16 * restrict b, > + _Float16 * restrict c, int n) > +{ > + int i; > + for (i = 0; i < n; i++) > + c[i] = a[i] / b[i]; > +} > + > +void FAST_ATTR > +vector_hf_rsqrt_fast (_Float16 * restrict a, _Float16 * restrict b, > + _Float16 * restrict c, int n) > +{ > + int i; > + for (i = 0; i < n; i++) > + c[i] = a[i] / __builtin_sqrtf16(b[i]); > +} > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-recip-2.c > b/gcc/testsuite/gcc.target/i386/avx512fp16-recip-2.c > new file mode 100644 > index 00000000000..ed7e0a2225f > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-recip-2.c > @@ -0,0 +1,97 @@ > +/* { dg-do run { target avx512fp16 } } */ > +/* { dg-options "-O3 -mavx512fp16 -mavx512vl -ffast-math" } */ > + > +static void recip_op_test (void); > +#define DO_TEST recip_op_test > +#define AVX512FP16 > +#define AVX512VL > +#include "avx512f-check.h" > +#include "avx512fp16-recip-1.c" > + > +_Float16 a[32], b[32], vexp[32], vref[32], sa, sb, sexp, sref; > + > +#define NO_FAST_ATTR \ > + __attribute__((noinline, noclone, \ > + optimize("fast-math,trapping-math"))) > + > +_Float16 NO_FAST_ATTR > +scalar_hf_rcp_no_fast (_Float16 a, _Float16 b) > +{ > + return a / b; > +} > + > +_Float16 NO_FAST_ATTR > +scalar_hf_rsqrt_no_fast (_Float16 a, _Float16 b) > +{ > + return a / __builtin_sqrtf16 (b); > +} > + > +void NO_FAST_ATTR > +vector_hf_rcp_no_fast (_Float16 * restrict a, _Float16 * restrict b, > + _Float16 * restrict c, int n) > +{ > + int i; > + for (i = 0; i < n; i++) > + c[i] = a[i] / b[i]; > +} > + > +void NO_FAST_ATTR > +vector_hf_rsqrt_no_fast (_Float16 * restrict a, _Float16 * restrict b, > + _Float16 * restrict c, int n) > +{ > + int i; > + 
for (i = 0; i < n; i++) > + c[i] = a[i] / __builtin_sqrtf16 (b[i]); > +} > + > +void init() > +{ > + int i; > + sa = 3.75; > + sb = 6.25; > + sexp = sref = 2.75; > + for (i = 0; i < 32; i++) > + { > + a[i] = i + 0.5; > + b[i] = i * 1.5; > + vexp[i] = vref[i] = 2.75 * i; > + } > +} > + > +int check_cond(void *a, void *b, int size) > +{ > + int i; > + unsigned short *pa = (unsigned short *)a, > + *pb = (unsigned short *)b; > + for (i = 0; i < size; i++) > + if (pa[i] != pb[i]) > + return 0; > + return 1; > +} > + > +static void recip_op_test() > +{ > + init (); > + sexp = scalar_hf_rcp_fast (sa, sb); > + sref = scalar_hf_rcp_no_fast (sa, sb); > + if (!check_cond (&sexp, &sref, 1)) > + abort (); > + > + init (); > + sexp = scalar_hf_rsqrt_fast (sa, sb); > + sref = scalar_hf_rsqrt_no_fast (sa, sb); > + if (!check_cond (&sexp, &sref, 1)) > + abort (); > + > + init (); > + vector_hf_rcp_fast (a, b, vexp, 32); > + vector_hf_rcp_no_fast (a, b, vref, 32); > + if (!check_cond (vexp, vref, 1)) > + abort (); > + > + init (); > + vector_hf_rsqrt_fast (a, b, vexp, 32); > + vector_hf_rsqrt_no_fast (a, b, vref, 32); > + if (!check_cond (vexp, vref, 1)) > + abort (); > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr102464.c > b/gcc/testsuite/gcc.target/i386/pr102464.c > index e3e060ee80b..7e1fbdccf02 100644 > --- a/gcc/testsuite/gcc.target/i386/pr102464.c > +++ b/gcc/testsuite/gcc.target/i386/pr102464.c > @@ -1,6 +1,6 @@ > /* PR target/102464. */ > /* { dg-do compile } */ > -/* { dg-options "-O2 -mavx512fp16" } */ > +/* { dg-options "-O2 -mavx512fp16 -fno-trapping-math" } */ > > #define FOO(FUNC,SUFFIX) \ > _Float16 \ > -- > 2.27.1 >
-- BR, Hongtao