On Tue, Jul 12, 2022 at 2:07 PM Richard Sandiford via Gcc-patches <gcc-patches@gcc.gnu.org> wrote:
>
> The PR is about the aarch64 port using an ACLE built-in function
> to vectorise a scalar function call, even though the ECF_* flags for
> the ACLE function didn't match the ECF_* flags for the scalar call.
>
> To some extent that kind of difference is inevitable, since the
> ACLE intrinsics are supposed to follow the behaviour of the
> underlying instruction as closely as possible. Also, using
> target-specific builtins has the drawback of limiting further
> gimple optimisation, since the gimple optimisers won't know what
> the function does.
>
> We handle several other maths functions, including round, floor
> and ceil, by defining directly-mapped internal functions that
> are linked to the associated built-in functions. This has two
> main advantages:
>
> - it means that, internally, we are not restricted to the set of
> scalar types that happen to have associated C/C++ functions
>
> - the functions (and thus the underlying optabs) extend naturally
> to vectors
>
> This patch takes the same approach for the remaining functions
> handled by aarch64_builtin_vectorized_function.
>
> Tested on aarch64-linux-gnu and x86_64-linux-gnu. OK to install?

OK.

Thanks,
Richard.

> Richard
>
>
> gcc/
> PR target/106253
> * predict.h (insn_optimization_type): Declare.
> * predict.cc (insn_optimization_type): New function.
> * internal-fn.def (IFN_ICEIL, IFN_IFLOOR, IFN_IRINT, IFN_IROUND)
> (IFN_LCEIL, IFN_LFLOOR, IFN_LRINT, IFN_LROUND, IFN_LLCEIL)
> (IFN_LLFLOOR, IFN_LLRINT, IFN_LLROUND): New internal functions.
> * internal-fn.cc (unary_convert_direct): New macro.
> (expand_convert_optab_fn): New function.
> (expand_unary_convert_optab_fn): New macro.
> (direct_unary_convert_optab_supported_p): Likewise.
> * optabs.cc (expand_sfix_optab): Pass insn_optimization_type to
> convert_optab_handler.
> * config/aarch64/aarch64-protos.h
> (aarch64_builtin_vectorized_function): Delete.
> * config/aarch64/aarch64-builtins.cc
> (aarch64_builtin_vectorized_function): Delete.
> * config/aarch64/aarch64.cc
> (TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION): Delete.
> * config/i386/i386.cc (ix86_optab_supported_p): Handle lround_optab.
> * config/i386/i386.md (lround<X87MODEF:mode><SWI248x:mode>2): Remove
> optimize_insn_for_size_p test.
>
> gcc/testsuite/
> PR target/106253
> * gcc.target/aarch64/vect_unary_1.c: Add tests for iroundf,
> llround, iceilf, llceil, ifloorf, llfloor, irintf and llrint.
> * gfortran.dg/vect/pr106253.f: New test.
> --- > gcc/config/aarch64/aarch64-builtins.cc | 83 ------------------- > gcc/config/aarch64/aarch64-protos.h | 1 - > gcc/config/aarch64/aarch64.cc | 4 - > gcc/config/i386/i386.cc | 1 + > gcc/config/i386/i386.md | 3 - > gcc/internal-fn.cc | 20 +++++ > gcc/internal-fn.def | 23 +++++ > gcc/optabs.cc | 3 +- > gcc/predict.cc | 11 +++ > gcc/predict.h | 1 + > .../gcc.target/aarch64/vect_unary_1.c | 65 ++++++++++++++- > gcc/testsuite/gfortran.dg/vect/pr106253.f | 35 ++++++++ > 12 files changed, 157 insertions(+), 93 deletions(-) > create mode 100644 gcc/testsuite/gfortran.dg/vect/pr106253.f > > diff --git a/gcc/config/aarch64/aarch64-builtins.cc > b/gcc/config/aarch64/aarch64-builtins.cc > index a486321e10f..adfddb8b215 100644 > --- a/gcc/config/aarch64/aarch64-builtins.cc > +++ b/gcc/config/aarch64/aarch64-builtins.cc > @@ -2555,89 +2555,6 @@ aarch64_general_expand_builtin (unsigned int fcode, > tree exp, rtx target, > gcc_unreachable (); > } > > -tree > -aarch64_builtin_vectorized_function (unsigned int fn, tree type_out, > - tree type_in) > -{ > - machine_mode in_mode, out_mode; > - > - if (TREE_CODE (type_out) != VECTOR_TYPE > - || TREE_CODE (type_in) != VECTOR_TYPE) > - return NULL_TREE; > - > - out_mode = TYPE_MODE (type_out); > - in_mode = TYPE_MODE (type_in); > - > -#undef AARCH64_CHECK_BUILTIN_MODE > -#define AARCH64_CHECK_BUILTIN_MODE(C, N) 1 > -#define AARCH64_FIND_FRINT_VARIANT(N) \ > - (AARCH64_CHECK_BUILTIN_MODE (2, D) \ > - ? aarch64_builtin_decls[AARCH64_SIMD_BUILTIN_UNOP_##N##v2df] \ > - : (AARCH64_CHECK_BUILTIN_MODE (4, S) \ > - ? aarch64_builtin_decls[AARCH64_SIMD_BUILTIN_UNOP_##N##v4sf] \ > - : (AARCH64_CHECK_BUILTIN_MODE (2, S) \ > - ? 
aarch64_builtin_decls[AARCH64_SIMD_BUILTIN_UNOP_##N##v2sf] \ > - : NULL_TREE))) > - switch (fn) > - { > -#undef AARCH64_CHECK_BUILTIN_MODE > -#define AARCH64_CHECK_BUILTIN_MODE(C, N) \ > - (out_mode == V##C##N##Imode && in_mode == V##C##N##Fmode) > - CASE_CFN_IFLOOR: > - CASE_CFN_LFLOOR: > - CASE_CFN_LLFLOOR: > - { > - enum aarch64_builtins builtin; > - if (AARCH64_CHECK_BUILTIN_MODE (2, D)) > - builtin = AARCH64_SIMD_BUILTIN_UNOP_lfloorv2dfv2di; > - else if (AARCH64_CHECK_BUILTIN_MODE (4, S)) > - builtin = AARCH64_SIMD_BUILTIN_UNOP_lfloorv4sfv4si; > - else if (AARCH64_CHECK_BUILTIN_MODE (2, S)) > - builtin = AARCH64_SIMD_BUILTIN_UNOP_lfloorv2sfv2si; > - else > - return NULL_TREE; > - > - return aarch64_builtin_decls[builtin]; > - } > - CASE_CFN_ICEIL: > - CASE_CFN_LCEIL: > - CASE_CFN_LLCEIL: > - { > - enum aarch64_builtins builtin; > - if (AARCH64_CHECK_BUILTIN_MODE (2, D)) > - builtin = AARCH64_SIMD_BUILTIN_UNOP_lceilv2dfv2di; > - else if (AARCH64_CHECK_BUILTIN_MODE (4, S)) > - builtin = AARCH64_SIMD_BUILTIN_UNOP_lceilv4sfv4si; > - else if (AARCH64_CHECK_BUILTIN_MODE (2, S)) > - builtin = AARCH64_SIMD_BUILTIN_UNOP_lceilv2sfv2si; > - else > - return NULL_TREE; > - > - return aarch64_builtin_decls[builtin]; > - } > - CASE_CFN_IROUND: > - CASE_CFN_LROUND: > - CASE_CFN_LLROUND: > - { > - enum aarch64_builtins builtin; > - if (AARCH64_CHECK_BUILTIN_MODE (2, D)) > - builtin = AARCH64_SIMD_BUILTIN_UNOP_lroundv2dfv2di; > - else if (AARCH64_CHECK_BUILTIN_MODE (4, S)) > - builtin = AARCH64_SIMD_BUILTIN_UNOP_lroundv4sfv4si; > - else if (AARCH64_CHECK_BUILTIN_MODE (2, S)) > - builtin = AARCH64_SIMD_BUILTIN_UNOP_lroundv2sfv2si; > - else > - return NULL_TREE; > - > - return aarch64_builtin_decls[builtin]; > - } > - default: > - return NULL_TREE; > - } > - > - return NULL_TREE; > -} > - > /* Return builtin for reciprocal square root. 
*/ > > tree > diff --git a/gcc/config/aarch64/aarch64-protos.h > b/gcc/config/aarch64/aarch64-protos.h > index dabd047d7ba..19c9d3cb179 100644 > --- a/gcc/config/aarch64/aarch64-protos.h > +++ b/gcc/config/aarch64/aarch64-protos.h > @@ -986,7 +986,6 @@ gimple *aarch64_general_gimple_fold_builtin (unsigned > int, gcall *, > rtx aarch64_general_expand_builtin (unsigned int, tree, rtx, int); > tree aarch64_general_builtin_decl (unsigned, bool); > tree aarch64_general_builtin_rsqrt (unsigned int); > -tree aarch64_builtin_vectorized_function (unsigned int, tree, tree); > void handle_arm_acle_h (void); > void handle_arm_neon_h (void); > > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > index d049f9a9819..25f4cbb466d 100644 > --- a/gcc/config/aarch64/aarch64.cc > +++ b/gcc/config/aarch64/aarch64.cc > @@ -27584,10 +27584,6 @@ aarch64_libgcc_floating_mode_supported_p > #undef TARGET_VECTORIZE_BUILTINS > #define TARGET_VECTORIZE_BUILTINS > > -#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION > -#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \ > - aarch64_builtin_vectorized_function > - > #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES > #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \ > aarch64_autovectorize_vector_modes > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc > index 95cb1e2ce70..3a3c7299eb4 100644 > --- a/gcc/config/i386/i386.cc > +++ b/gcc/config/i386/i386.cc > @@ -24004,6 +24004,7 @@ ix86_optab_supported_p (int op, machine_mode mode1, > machine_mode, > case ldexp_optab: > case scalb_optab: > case round_optab: > + case lround_optab: > return opt_type == OPTIMIZE_FOR_SPEED; > > case rint_optab: > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md > index 3b02d0cd567..bf29f444382 100644 > --- a/gcc/config/i386/i386.md > +++ b/gcc/config/i386/i386.md > @@ -19926,9 +19926,6 @@ (define_expand "lround<X87MODEF:mode><SWI248x:mode>2" > && ((<SWI248x:MODE>mode != DImode) || TARGET_64BIT) > && 
!flag_trapping_math && !flag_rounding_math)" > { > - if (optimize_insn_for_size_p ()) > - FAIL; > - > if (SSE_FLOAT_MODE_P (<X87MODEF:MODE>mode) && TARGET_SSE_MATH > && <SWI248x:MODE>mode != HImode > && ((<SWI248x:MODE>mode != DImode) || TARGET_64BIT) > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc > index d666ccccf67..28973d957fb 100644 > --- a/gcc/internal-fn.cc > +++ b/gcc/internal-fn.cc > @@ -120,6 +120,7 @@ init_internal_fns () > #define len_store_direct { 3, 3, false } > #define vec_set_direct { 3, 3, false } > #define unary_direct { 0, 0, true } > +#define unary_convert_direct { -1, 0, true } > #define binary_direct { 0, 0, true } > #define ternary_direct { 0, 0, true } > #define cond_unary_direct { 1, 1, true } > @@ -3679,6 +3680,19 @@ expand_while_optab_fn (internal_fn, gcall *stmt, > convert_optab optab) > emit_move_insn (lhs_rtx, ops[0].value); > } > > +/* Expand a call to a convert-like optab using the operands in STMT. > + FN has a single output operand and NARGS input operands. */ > + > +static void > +expand_convert_optab_fn (internal_fn fn, gcall *stmt, convert_optab optab, > + unsigned int nargs) > +{ > + tree_pair types = direct_internal_fn_types (fn, stmt); > + insn_code icode = convert_optab_handler (optab, TYPE_MODE (types.first), > + TYPE_MODE (types.second)); > + expand_fn_using_insn (stmt, icode, 1, nargs); > +} > + > /* Expanders for optabs that can use expand_direct_optab_fn. */ > > #define expand_unary_optab_fn(FN, STMT, OPTAB) \ > @@ -3711,6 +3725,11 @@ expand_while_optab_fn (internal_fn, gcall *stmt, > convert_optab optab) > #define expand_check_ptrs_optab_fn(FN, STMT, OPTAB) \ > expand_direct_optab_fn (FN, STMT, OPTAB, 4) > > +/* Expanders for optabs that can use expand_convert_optab_fn. 
*/ > + > +#define expand_unary_convert_optab_fn(FN, STMT, OPTAB) \ > + expand_convert_optab_fn (FN, STMT, OPTAB, 1) > + > /* RETURN_TYPE and ARGS are a return type and argument list that are > in principle compatible with FN (which satisfies direct_internal_fn_p). > Return the types that should be used to determine whether the > @@ -3783,6 +3802,7 @@ multi_vector_optab_supported_p (convert_optab optab, > tree_pair types, > } > > #define direct_unary_optab_supported_p direct_optab_supported_p > +#define direct_unary_convert_optab_supported_p convert_optab_supported_p > #define direct_binary_optab_supported_p direct_optab_supported_p > #define direct_ternary_optab_supported_p direct_optab_supported_p > #define direct_cond_unary_optab_supported_p direct_optab_supported_p > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def > index d2d550d3586..7c398baadc8 100644 > --- a/gcc/internal-fn.def > +++ b/gcc/internal-fn.def > @@ -61,6 +61,9 @@ along with GCC; see the file COPYING3. If not see > - binary: a normal binary optab, such as vec_interleave_lo_<mode> > - ternary: a normal ternary optab, such as fma<mode>4 > > + - unary_convert: a single-input conversion optab, such as > + lround<srcmode><dstmode>2. > + > - cond_binary: a conditional binary optab, such as cond_add<mode> > - cond_ternary: a conditional ternary optab, such as cond_fma_rev<mode> > > @@ -267,6 +270,26 @@ DEF_INTERNAL_FLT_FLOATN_FN (SQRT, ECF_CONST, sqrt, unary) > DEF_INTERNAL_FLT_FN (TAN, ECF_CONST, tan, unary) > DEF_INTERNAL_FLT_FN (TANH, ECF_CONST, tanh, unary) > > +/* Floating-point to integer conversions. > + > + ??? Here we preserve the I/L/LL prefix convention from the > + corresponding built-in functions, rather than make the internal > + functions polymorphic in both the argument and the return types. > + Perhaps an alternative would be to pass a zero of the required > + return type as a second parameter. 
*/ > +DEF_INTERNAL_FLT_FN (ICEIL, ECF_CONST, lceil, unary_convert) > +DEF_INTERNAL_FLT_FN (IFLOOR, ECF_CONST, lfloor, unary_convert) > +DEF_INTERNAL_FLT_FN (IRINT, ECF_CONST, lrint, unary_convert) > +DEF_INTERNAL_FLT_FN (IROUND, ECF_CONST, lround, unary_convert) > +DEF_INTERNAL_FLT_FN (LCEIL, ECF_CONST, lceil, unary_convert) > +DEF_INTERNAL_FLT_FN (LFLOOR, ECF_CONST, lfloor, unary_convert) > +DEF_INTERNAL_FLT_FN (LRINT, ECF_CONST, lrint, unary_convert) > +DEF_INTERNAL_FLT_FN (LROUND, ECF_CONST, lround, unary_convert) > +DEF_INTERNAL_FLT_FN (LLCEIL, ECF_CONST, lceil, unary_convert) > +DEF_INTERNAL_FLT_FN (LLFLOOR, ECF_CONST, lfloor, unary_convert) > +DEF_INTERNAL_FLT_FN (LLRINT, ECF_CONST, lrint, unary_convert) > +DEF_INTERNAL_FLT_FN (LLROUND, ECF_CONST, lround, unary_convert) > + > /* FP rounding. */ > DEF_INTERNAL_FLT_FLOATN_FN (CEIL, ECF_CONST, ceil, unary) > DEF_INTERNAL_FLT_FLOATN_FN (FLOOR, ECF_CONST, floor, unary) > diff --git a/gcc/optabs.cc b/gcc/optabs.cc > index a50dd798f2a..165f8d1fa22 100644 > --- a/gcc/optabs.cc > +++ b/gcc/optabs.cc > @@ -5828,7 +5828,8 @@ expand_sfix_optab (rtx to, rtx from, convert_optab tab) > FOR_EACH_MODE_FROM (fmode, GET_MODE (from)) > FOR_EACH_MODE_FROM (imode, GET_MODE (to)) > { > - icode = convert_optab_handler (tab, imode, fmode); > + icode = convert_optab_handler (tab, imode, fmode, > + insn_optimization_type ()); > if (icode != CODE_FOR_nothing) > { > rtx_insn *last = get_last_insn (); > diff --git a/gcc/predict.cc b/gcc/predict.cc > index b36caa3ae82..1bc7ab94454 100644 > --- a/gcc/predict.cc > +++ b/gcc/predict.cc > @@ -362,6 +362,17 @@ optimize_insn_for_speed_p (void) > return !optimize_insn_for_size_p (); > } > > +/* Return the optimization type that should be used for the current > + instruction. */ > + > +optimization_type > +insn_optimization_type () > +{ > + return (optimize_insn_for_speed_p () > + ? OPTIMIZE_FOR_SPEED > + : OPTIMIZE_FOR_SIZE); > +} > + > /* Return TRUE if LOOP should be optimized for size. 
*/ > > optimize_size_level > diff --git a/gcc/predict.h b/gcc/predict.h > index 864997498ec..25484373769 100644 > --- a/gcc/predict.h > +++ b/gcc/predict.h > @@ -68,6 +68,7 @@ extern enum optimize_size_level optimize_edge_for_size_p > (edge); > extern bool optimize_edge_for_speed_p (edge); > extern enum optimize_size_level optimize_insn_for_size_p (void); > extern bool optimize_insn_for_speed_p (void); > +extern optimization_type insn_optimization_type (); > extern enum optimize_size_level optimize_loop_for_size_p (class loop *); > extern bool optimize_loop_for_speed_p (class loop *); > extern bool optimize_loop_nest_for_speed_p (class loop *); > diff --git a/gcc/testsuite/gcc.target/aarch64/vect_unary_1.c > b/gcc/testsuite/gcc.target/aarch64/vect_unary_1.c > index 8516808becf..94d9af1a55d 100644 > --- a/gcc/testsuite/gcc.target/aarch64/vect_unary_1.c > +++ b/gcc/testsuite/gcc.target/aarch64/vect_unary_1.c > @@ -1,4 +1,4 @@ > -/* { dg-options "-O3 --save-temps" } */ > +/* { dg-options "-O3 -fno-math-errno --save-temps" } */ > /* { dg-final { check-function-bodies "**" "" "" } } */ > > #include <stdint.h> > @@ -184,3 +184,66 @@ TEST2 (int, ctz, int) > ** ret > */ > TEST4 (int, ctz, int) > + > +/* > +** test2_int_iroundf_float: > +** fcvtas v0.2s, v1.2s > +** ret > +*/ > +TEST2 (int, iroundf, float) > + > +/* > +** test2_int64_t_llround_double: > +** fcvtas v0.2d, v1.2d > +** ret > +*/ > +TEST2 (int64_t, llround, double) > + > +/* > +** test4_int_iroundf_float: > +** fcvtas v0.4s, v1.4s > +** ret > +*/ > +TEST4 (int, iroundf, float) > + > +/* > +** test2_int_ifloorf_float: > +** fcvtms v0.2s, v1.2s > +** ret > +*/ > +TEST2 (int, ifloorf, float) > + > +/* > +** test2_int64_t_llfloor_double: > +** fcvtms v0.2d, v1.2d > +** ret > +*/ > +TEST2 (int64_t, llfloor, double) > + > +/* > +** test4_int_ifloorf_float: > +** fcvtms v0.4s, v1.4s > +** ret > +*/ > +TEST4 (int, ifloorf, float) > + > +/* > +** test2_int_iceilf_float: > +** fcvtps v0.2s, v1.2s > +** ret > +*/ > +TEST2 
(int, iceilf, float) > + > +/* > +** test2_int64_t_llceil_double: > +** fcvtps v0.2d, v1.2d > +** ret > +*/ > +TEST2 (int64_t, llceil, double) > + > +/* > +** test4_int_iceilf_float: > +** fcvtps v0.4s, v1.4s > +** ret > +*/ > +TEST4 (int, iceilf, float) > diff --git a/gcc/testsuite/gfortran.dg/vect/pr106253.f > b/gcc/testsuite/gfortran.dg/vect/pr106253.f > new file mode 100644 > index 00000000000..1b6b7e892f2 > --- /dev/null > +++ b/gcc/testsuite/gfortran.dg/vect/pr106253.f > @@ -0,0 +1,35 @@ > +! { dg-do compile } > + > + SUBROUTINE DGEMV ( TRANS, M, N, ALPHA, A, LDA, X, INCX, & > + & BETA, Y, INCY ) > + LOGICAL LSAME > + IF ( .NOT.LSAME( TRANS, 'N' ).AND. & > + & .NOT.LSAME( TRANS, 'C' ) )THEN > + END IF > + END > + subroutine evlrnf (ptrs0t, nclsm, prnf0t) > + real, dimension (1:nclsm,1:nclsm), intent (in) :: ptrs0t > + real, dimension (1:nclsm,1:nclsm), intent (out):: prnf0t > + real, allocatable, dimension (:,:) :: utrsft ! probas up > + real, allocatable, dimension (:,:) :: dtrsft ! probas down > + real, allocatable, dimension (:,:) :: xwrkt ! matrice > + do icls = 1, nclsm > + do ival = ipic - 1, 1, -1 > + xwrkt = trs2a2 (ival, ipic, utrsft, dtrsft, ncls) > + enddo > + enddo > + contains > + function trs2a2 (j, k, u, d, m) > + real, dimension (1:m,1:m) :: trs2a2 ! resultat > + real, dimension (1:m,1:m) :: u, d ! matrices utrsft, dtrsft > + end function trs2a2 > + end > + program rnflow > + integer, parameter :: ncls = 256 ! nombre de classes > + integer, dimension (1:ncls,1:ncls) :: mrnftt ! matrice theorique > + real, dimension (1:ncls,1:ncls) :: ptrst ! matrice Markov > + real, dimension (1:ncls,1:ncls) :: prnft ! matrice Rainflow > + call evlrnf (ptrst, ncls, prnft) > + mrnftt = nint (real (nsim) * real (npic) * prnft) > + call cmpmat (mrnftt, mrnfst) > + end program rnflow > -- > 2.25.1 >