Hi,

As PR90332 shows, eliminating the scalar epilogue that peeling for gaps requires currently depends on the target providing a vec_init optab that builds a vector from two half-size vector modes. On Power we don't support vector modes like V8QI, so we can't provide optabs like vec_initv16qiv8qi; however, we can use an existing scalar mode of half the vector size, such as DI, to initialize the desired vector mode. This patch extends the existing support accordingly. As evaluated on Power9, it gives the expected 1.9% speedup on SPEC2017 525.x264_r.
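To illustrate the idea, here is a minimal GNU C sketch of what the generated code conceptually does on a target without V8QI but with DI (the function and variable names are mine for illustration; the vectorizer emits the equivalent GIMPLE, not this source):

typedef unsigned char v16qi __attribute__ ((vector_size (16)));
typedef unsigned long long v2di __attribute__ ((vector_size (16)));

/* Load only the low 8 bytes as a single DI, build a V2DI { lo, 0 },
   then view-convert it to the V16QI the rest of the loop expects.
   Nothing past p[7] is read, so no epilogue peeling for the gap.  */
v16qi
load_low_half (const unsigned char *p)
{
  unsigned long long lo;
  __builtin_memcpy (&lo, p, sizeof lo);  /* one DImode load */
  v2di tmp = { lo, 0 };                  /* vec_init of V2DI from two DIs */
  return (v16qi) tmp;                    /* VIEW_CONVERT_EXPR to V16QI */
}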
Bootstrapped/regtested on powerpc64le-linux-gnu (LE) P8 and P9. Is it OK for trunk?

BR,
Kewen

-----------

gcc/ChangeLog

2020-MM-DD  Kewen Lin  <li...@gcc.gnu.org>

	PR tree-optimization/90332
	* tree-vectorizer.h (struct _stmt_vec_info): Add half_mode field.
	(DR_GROUP_HALF_MODE): New macro.
	* tree-vect-stmts.c (get_half_mode_for_vector): New function.
	(get_group_load_store_type): Call get_half_mode_for_vector to check
	whether the target supports a half-size mode, and set
	DR_GROUP_HALF_MODE if so.
	(vectorizable_load): Build the appropriate vector type based on
	DR_GROUP_HALF_MODE.
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 2ca8e494680..24ec0d3759d 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -2220,6 +2220,52 @@ vect_get_store_rhs (stmt_vec_info stmt_info)
   gcc_unreachable ();
 }
 
+/* Function get_half_mode_for_vector.
+
+   If the target supports either of:
+     - a vector mode whose size is half that of the mode of VECTYPE and
+       whose element mode is the same as that of VECTYPE, where the mode
+       of VECTYPE can be initialized from two of the half-size vectors; or
+     - a scalar mode whose size is half that of the mode of VECTYPE, where
+       a two-element vector mode exists and can be initialized from two
+       of the scalars;
+   return true and save the half-size mode in HMODE.  Otherwise, return
+   false.
+
+   VECTYPE is the type of the given vector.  */
+
+static bool
+get_half_mode_for_vector (tree vectype, machine_mode *hmode)
+{
+  gcc_assert (VECTOR_TYPE_P (vectype));
+  machine_mode vec_mode = TYPE_MODE (vectype);
+  scalar_mode elmode = SCALAR_TYPE_MODE (TREE_TYPE (vectype));
+
+  /* Check whether a half-size vector mode is supported.  */
+  gcc_assert (GET_MODE_NUNITS (vec_mode).is_constant ());
+  poly_uint64 n_half_units = exact_div (GET_MODE_NUNITS (vec_mode), 2);
+  if (related_vector_mode (vec_mode, elmode, n_half_units).exists (hmode)
+      && convert_optab_handler (vec_init_optab, vec_mode, *hmode)
+	   != CODE_FOR_nothing)
+    return true;
+
+  /* Check whether a half-size scalar mode is supported.  */
+  poly_uint64 half_size = exact_div (GET_MODE_BITSIZE (vec_mode), 2);
+  opt_machine_mode smode
+    = mode_for_size (half_size, GET_MODE_CLASS (elmode), 0);
+  if (!smode.exists ())
+    return false;
+  *hmode = smode.require ();
+
+  machine_mode new_vec_mode;
+  if (related_vector_mode (vec_mode, as_a<scalar_mode> (*hmode), 2)
+	.exists (&new_vec_mode)
+      && convert_optab_handler (vec_init_optab, new_vec_mode, *hmode)
+	   != CODE_FOR_nothing)
+    return true;
+
+  return false;
+}
+
 /* A subroutine of get_load_store_type, with a subset of the same
    arguments.  Handle the case where STMT_INFO is part of a grouped
    load or store.
@@ -2290,33 +2336,36 @@ get_group_load_store_type (stmt_vec_info stmt_info, tree vectype, bool slp,
	 than the alignment boundary B.  Every vector access will
	 be a multiple of B and so we are guaranteed to access a
	 non-gap element in the same B-sized block.  */
+      machine_mode half_mode;
       if (overrun_p
	  && gap < (vect_known_alignment_in_bytes (first_dr_info)
		    / vect_get_scalar_dr_size (first_dr_info)))
-	overrun_p = false;
-
+	{
+	  overrun_p = false;
+	  if (known_eq (nunits, (group_size - gap) * 2)
+	      && known_eq (nunits, group_size)
+	      && get_half_mode_for_vector (vectype, &half_mode))
+	    DR_GROUP_HALF_MODE (first_stmt_info) = half_mode;
+	}
       /* If the gap splits the vector in half and the target
	 can do half-vector operations avoid the epilogue peeling
	 by simply loading half of the vector only.  Usually
	 the construction with an upper zero half will be elided.  */
       dr_alignment_support alignment_support_scheme;
-      scalar_mode elmode = SCALAR_TYPE_MODE (TREE_TYPE (vectype));
-      machine_mode vmode;
       if (overrun_p
	  && !masked_p
	  && (((alignment_support_scheme
-		  = vect_supportable_dr_alignment (first_dr_info, false)))
-	       == dr_aligned
+		= vect_supportable_dr_alignment (first_dr_info, false)))
+	      == dr_aligned
	      || alignment_support_scheme == dr_unaligned_supported)
	  && known_eq (nunits, (group_size - gap) * 2)
	  && known_eq (nunits, group_size)
	  && VECTOR_MODE_P (TYPE_MODE (vectype))
-	  && related_vector_mode (TYPE_MODE (vectype), elmode,
-				  group_size - gap).exists (&vmode)
-	  && (convert_optab_handler (vec_init_optab,
-				     TYPE_MODE (vectype), vmode)
-	      != CODE_FOR_nothing))
-	overrun_p = false;
+	  && get_half_mode_for_vector (vectype, &half_mode))
+	{
+	  DR_GROUP_HALF_MODE (first_stmt_info) = half_mode;
+	  overrun_p = false;
+	}
 
       if (overrun_p && !can_overrun_p)
	{
@@ -9541,6 +9590,7 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
	    else
	      {
		tree ltype = vectype;
+		machine_mode half_mode = VOIDmode;
		/* If there's no peeling for gaps but we have a gap
		   with slp loads then load the lower half of the
		   vector only.  See get_group_load_store_type for
@@ -9553,10 +9603,18 @@
			      (group_size
			       - DR_GROUP_GAP (first_stmt_info)) * 2)
		    && known_eq (nunits, group_size))
-		  ltype = build_vector_type (TREE_TYPE (vectype),
-					     (group_size
-					      - DR_GROUP_GAP
-						  (first_stmt_info)));
+		  {
+		    gcc_assert (DR_GROUP_HALF_MODE (first_stmt_info)
+				!= VOIDmode);
+		    half_mode = DR_GROUP_HALF_MODE (first_stmt_info);
+		    if (VECTOR_MODE_P (half_mode))
+		      ltype = build_vector_type (
+			TREE_TYPE (vectype),
+			(group_size - DR_GROUP_GAP (first_stmt_info)));
+		    else
+		      ltype
+			= lang_hooks.types.type_for_mode (half_mode, 1);
+		  }
		data_ref
		  = fold_build2 (MEM_REF, ltype, dataref_ptr,
				 dataref_offset
@@ -9584,10 +9642,21 @@
			CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
			CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
						build_zero_cst (ltype));
-			new_stmt
-			  = gimple_build_assign (vec_dest,
-						 build_constructor
-						   (vectype, v));
+			if (VECTOR_MODE_P (half_mode))
+			  new_stmt = gimple_build_assign (
+			    vec_dest, build_constructor (vectype, v));
+			else
+			  {
+			    tree new_vtype = build_vector_type (ltype, 2);
+			    tree new_vname = make_ssa_name (new_vtype);
+			    new_stmt = gimple_build_assign (
+			      new_vname, build_constructor (new_vtype, v));
+			    vect_finish_stmt_generation (stmt_info,
+							 new_stmt, gsi);
+			    new_stmt = gimple_build_assign (
+			      vec_dest, build1 (VIEW_CONVERT_EXPR, vectype,
+						new_vname));
+			  }
		      }
		  }
		break;
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index f7becb34ab4..6fcbeb653d7 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -1018,6 +1018,8 @@ public:
   /* For loads only, the gap from the previous load.  For consecutive loads, GAP
      is 1.  */
   unsigned int gap;
+  /* For loads only, the half-size mode used to avoid peeling for gaps.  */
+  machine_mode half_mode;
 
   /* The minimum negative dependence distance this stmt participates in
      or zero if none.  */
@@ -1227,6 +1229,8 @@ STMT_VINFO_BB_VINFO (stmt_vec_info stmt_vinfo)
   (gcc_checking_assert ((S)->dr_aux.dr), (S)->store_count)
 #define DR_GROUP_GAP(S) \
   (gcc_checking_assert ((S)->dr_aux.dr), (S)->gap)
+#define DR_GROUP_HALF_MODE(S) \
+  (gcc_checking_assert ((S)->dr_aux.dr), (S)->half_mode)
 #define REDUC_GROUP_FIRST_ELEMENT(S) \
   (gcc_checking_assert (!(S)->dr_aux.dr), (S)->first_element)
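For reference, a reduced loop of the shape this targets (illustrative only, not the testcase from the PR): each iteration reads only the low half of a 16-byte group, i.e. group_size 16 with gap 8 for V16QI, which satisfies the known_eq checks above. Previously the final group's full-vector load forced epilogue peeling; with this patch it can instead become a single DImode load feeding a { lo, 0 } constructor.

#define N 64

unsigned char a[N * 16];
unsigned char b[N * 8];

void
foo (void)
{
  for (int i = 0; i < N; i++)
    for (int j = 0; j < 8; j++)
      /* Grouped load: stride 16 elements, only 8 accessed (gap 8).  */
      b[i * 8 + j] = a[i * 16 + j];
}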