Hi,

Following the comments from Richard and Segher (thanks!), I've made some
changes by adding checks and extensions:
  *) Check whether the vector type is available on the target machine.
  *) Check whether the vector operation is available on the target machine.
  *) Extend the vector operation support to MULT/BIT_AND/BIT_IOR/BIT_XOR.
  *) Add more test cases for coverage.

Re-bootstrapped/re-regtested successfully on powerpc64le-linux-gnu.
Is it OK for GCC 10?
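
To make the intended effect concrete, here is the pr88497-1.c case from the
testsuite written out by hand in the shape reassoc1 is expected to produce
(source-level sketch only; the function name test_after is just illustrative,
and the pass itself works on GIMPLE; the dumps quoted at the end of this mail
show the actual before/after trees):

typedef double v2df __attribute__ ((vector_size (16)));

/* Hand-written equivalent of the undistributed form: one vector
   accumulation, then only two element extracts at the very end.  */
double
test_after (double accumulator, v2df arg1[], v2df arg2[])
{
  v2df t = arg1[0] * arg2[0];
  t += arg1[1] * arg2[1];
  t += arg1[2] * arg2[2];
  t += arg1[3] * arg2[3];
  return accumulator + t[0] + t[1];
}

Compared with the original source of pr88497-1.c below, this shape needs only
two BIT_FIELD_REFs instead of eight.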

gcc/ChangeLog

2019-03-12  Kewen Lin  <li...@gcc.gnu.org>

	PR target/88497
	* tree-ssa-reassoc.c (reassociate_bb): Swap the positions of
	GIMPLE_BINARY_RHS check and gimple_visited_p check, call
	new function undistribute_bitref_for_vector.
	(undistribute_bitref_for_vector): New function.
	(cleanup_vinfo_map): Likewise.
	(unsigned_cmp): Likewise.

gcc/testsuite/ChangeLog

2019-03-12  Kewen Lin  <li...@gcc.gnu.org>

	* gcc.dg/tree-ssa/pr88497-1.c: New test.
	* gcc.dg/tree-ssa/pr88497-2.c: Likewise.
	* gcc.dg/tree-ssa/pr88497-3.c: Likewise.
	* gcc.dg/tree-ssa/pr88497-4.c: Likewise.
	* gcc.dg/tree-ssa/pr88497-5.c: Likewise.
---
 gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c |  42 ++++
 gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c |  31 +++
 gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c |  31 +++
 gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c |  31 +++
 gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c |  31 +++
 gcc/tree-ssa-reassoc.c                    | 309 +++++++++++++++++++++++++++++-
 6 files changed, 470 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c
new file mode 100644
index 0000000..c87caff
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c
@@ -0,0 +1,42 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_double } */
+/* { dg-options "-O2 -ffast-math -fdump-tree-reassoc1" } */
+
+/* Test that reassoc can undistribute vector bit_field_ref summation.
+
+   arg1 and arg2 are two arrays whose elements are of type vector double.
+   Assuming:
+     A0 = arg1[0], A1 = arg1[1], A2 = arg1[2], A3 = arg1[3],
+     B0 = arg2[0], B1 = arg2[1], B2 = arg2[2], B3 = arg2[3],
+
+   then:
+     V0 = A0 * B0, V1 = A1 * B1, V2 = A2 * B2, V3 = A3 * B3,
+
+   and reassoc transforms
+
+     accumulator += V0[0] + V0[1] + V1[0] + V1[1] + V2[0] + V2[1] +
+                    V3[0] + V3[1];
+
+   into:
+
+     T = V0 + V1 + V2 + V3;
+     accumulator += T[0] + T[1];
+
+   Fewer bit_field_refs: only two for a vector of 128 or more bits.  */
+
+typedef double v2df __attribute__ ((vector_size (16)));
+double
+test (double accumulator, v2df arg1[], v2df arg2[])
+{
+  v2df temp;
+  temp = arg1[0] * arg2[0];
+  accumulator += temp[0] + temp[1];
+  temp = arg1[1] * arg2[1];
+  accumulator += temp[0] + temp[1];
+  temp = arg1[2] * arg2[2];
+  accumulator += temp[0] + temp[1];
+  temp = arg1[3] * arg2[3];
+  accumulator += temp[0] + temp[1];
+  return accumulator;
+}
+/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 2 "reassoc1" { target { powerpc*-*-* } } } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c
new file mode 100644
index 0000000..d1851ff
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_float } */
+/* { dg-options "-O2 -ffast-math -fdump-tree-reassoc1" } */
+
+/* Test that reassoc can undistribute vector bit_field_ref on multiplication.
+
+   v1, v2, v3 and v4 are of type vector float.
+
+   reassoc transforms
+
+     accumulator *= v1[0] * v1[1] * v1[2] * v1[3] *
+                    v2[0] * v2[1] * v2[2] * v2[3] *
+                    v3[0] * v3[1] * v3[2] * v3[3] *
+                    v4[0] * v4[1] * v4[2] * v4[3];
+
+   into:
+
+     T = v1 * v2 * v3 * v4;
+     accumulator *= T[0] * T[1] * T[2] * T[3];
+
+   Fewer bit_field_refs: only four for a vector of 128 or more bits.  */
+
+typedef float v4si __attribute__((vector_size(16)));
+float test(float accumulator, v4si v1, v4si v2, v4si v3, v4si v4) {
+  accumulator *= v1[0] * v1[1] * v1[2] * v1[3];
+  accumulator *= v2[0] * v2[1] * v2[2] * v2[3];
+  accumulator *= v3[0] * v3[1] * v3[2] * v3[3];
+  accumulator *= v4[0] * v4[1] * v4[2] * v4[3];
+  return accumulator;
+}
+/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c
new file mode 100644
index 0000000..e774d25
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-options "-O2 -fdump-tree-reassoc1" } */
+
+/* Test that reassoc can undistribute vector bit_field_ref on bitwise AND.
+
+   v1, v2, v3 and v4 are of type vector int.
+
+   reassoc transforms
+
+     accumulator &= v1[0] & v1[1] & v1[2] & v1[3] &
+                    v2[0] & v2[1] & v2[2] & v2[3] &
+                    v3[0] & v3[1] & v3[2] & v3[3] &
+                    v4[0] & v4[1] & v4[2] & v4[3];
+
+   into:
+
+     T = v1 & v2 & v3 & v4;
+     accumulator &= T[0] & T[1] & T[2] & T[3];
+
+   Fewer bit_field_refs: only four for a vector of 128 or more bits.  */
+
+typedef int v4si __attribute__((vector_size(16)));
+int test(int accumulator, v4si v1, v4si v2, v4si v3, v4si v4) {
+  accumulator &= v1[0] & v1[1] & v1[2] & v1[3];
+  accumulator &= v2[0] & v2[1] & v2[2] & v2[3];
+  accumulator &= v3[0] & v3[1] & v3[2] & v3[3];
+  accumulator &= v4[0] & v4[1] & v4[2] & v4[3];
+  return accumulator;
+}
+/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c
new file mode 100644
index 0000000..7b75404
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-options "-O2 -fdump-tree-reassoc1" } */
+
+/* Test that reassoc can undistribute vector bit_field_ref on bitwise IOR.
+
+   v1, v2, v3 and v4 are of type vector int.
+
+   reassoc transforms
+
+     accumulator |= v1[0] | v1[1] | v1[2] | v1[3] |
+                    v2[0] | v2[1] | v2[2] | v2[3] |
+                    v3[0] | v3[1] | v3[2] | v3[3] |
+                    v4[0] | v4[1] | v4[2] | v4[3];
+
+   into:
+
+     T = v1 | v2 | v3 | v4;
+     accumulator |= T[0] | T[1] | T[2] | T[3];
+
+   Fewer bit_field_refs: only four for a vector of 128 or more bits.  */
+
+typedef int v4si __attribute__((vector_size(16)));
+int test(int accumulator, v4si v1, v4si v2, v4si v3, v4si v4) {
+  accumulator |= v1[0] | v1[1] | v1[2] | v1[3];
+  accumulator |= v2[0] | v2[1] | v2[2] | v2[3];
+  accumulator |= v3[0] | v3[1] | v3[2] | v3[3];
+  accumulator |= v4[0] | v4[1] | v4[2] | v4[3];
+  return accumulator;
+}
+/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c
new file mode 100644
index 0000000..d069725
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-options "-O2 -ffast-math -fdump-tree-reassoc1" } */
+
+/* Test that reassoc can undistribute vector bit_field_ref on bitwise XOR.
+
+   v1, v2, v3 and v4 are of type vector int.
+
+   reassoc transforms
+
+     accumulator ^= v1[0] ^ v1[1] ^ v1[2] ^ v1[3] ^
+                    v2[0] ^ v2[1] ^ v2[2] ^ v2[3] ^
+                    v3[0] ^ v3[1] ^ v3[2] ^ v3[3] ^
+                    v4[0] ^ v4[1] ^ v4[2] ^ v4[3];
+
+   into:
+
+     T = v1 ^ v2 ^ v3 ^ v4;
+     accumulator ^= T[0] ^ T[1] ^ T[2] ^ T[3];
+
+   Fewer bit_field_refs: only four for a vector of 128 or more bits.  */
+
+typedef int v4si __attribute__((vector_size(16)));
+int test(int accumulator, v4si v1, v4si v2, v4si v3, v4si v4) {
+  accumulator ^= v1[0] ^ v1[1] ^ v1[2] ^ v1[3];
+  accumulator ^= v2[0] ^ v2[1] ^ v2[2] ^ v2[3];
+  accumulator ^= v3[0] ^ v3[1] ^ v3[2] ^ v3[3];
+  accumulator ^= v4[0] ^ v4[1] ^ v4[2] ^ v4[3];
+  return accumulator;
+}
+/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */
diff --git a/gcc/tree-ssa-reassoc.c b/gcc/tree-ssa-reassoc.c
index e1c4dfe..d755911 100644
--- a/gcc/tree-ssa-reassoc.c
+++ b/gcc/tree-ssa-reassoc.c
@@ -1772,6 +1772,298 @@ undistribute_ops_list (enum tree_code opcode,
   return changed;
 }
 
+/* Hold the information of one specific VECTOR_TYPE SSA_NAME.
+   - offsets: the different BIT_FIELD_REF offsets accessing the same VECTOR.
+   - ops_indexes: the index in the vec ops* for each relevant BIT_FIELD_REF.  */
+struct v_info
+{
+  auto_vec<unsigned HOST_WIDE_INT, 32> offsets;
+  auto_vec<unsigned, 32> ops_indexes;
+};
+
+typedef struct v_info *v_info_ptr;
+
+/* Comparison function for qsort on unsigned BIT_FIELD_REF offsets.  */
+static int
+unsigned_cmp (const void *p_i, const void *p_j)
+{
+  if (*(const unsigned *) p_i >= *(const unsigned *) p_j)
+    return 1;
+  else
+    return -1;
+}
+
+/* Clean up the hash map holding VECTOR information.  */
+static void
+cleanup_vinfo_map (hash_map<tree, v_info_ptr> &info_map)
+{
+  for (hash_map<tree, v_info_ptr>::iterator it = info_map.begin ();
+       it != info_map.end (); ++it)
+    {
+      v_info_ptr info = (*it).second;
+      delete info;
+      (*it).second = NULL;
+    }
+}
+
+/* Perform un-distribution of BIT_FIELD_REF on VECTOR_TYPE.
+     V1[0] + V1[1] + ... + V1[k] + V2[0] + V2[1] + ... + V2[k] + ... Vn[k]
+   is transformed to
+     Vs = (V1 + V2 + ... + Vn)
+     Vs[0] + Vs[1] + ... + Vs[k]
+
+   The basic steps are listed below:
+
+   1) Check the addition chain *OPS, looking for summands that come from a
+      bit_field_ref on a VECTOR type.  Put the information into v_info_map
+      for each satisfying summand, using the VECTOR SSA_NAME as the key.
+
+   2) For each key (VECTOR SSA_NAME), validate that all its BIT_FIELD_REFs
+      are contiguous and cover the whole VECTOR perfectly without any holes.
+      Obtain one VECTOR list which contains the candidates to be transformed.
+
+   3) Build the addition statements for all the VECTOR candidates and
+      generate BIT_FIELD_REFs accordingly.
+
+   TODO:
+   1) The current implementation requires all candidate VECTORs to have the
+      same VECTOR type, but it could be extended to handle different groups
+      per VECTOR type in the future if any profitable cases are found.
+   2) The current implementation requires the whole VECTOR to be covered,
+      but it could be extended to support partial coverage (accesses that
+      are adjacent but do not fill the whole VECTOR); that may need some
+      cost model to decide where the boundary lies.  */
+
+static bool
+undistribute_bitref_for_vector (enum tree_code opcode, vec<operand_entry *> *ops,
+                                struct loop *loop)
+{
+  if (ops->length () <= 1)
+    return false;
+
+  if (opcode != PLUS_EXPR && opcode != MULT_EXPR && opcode != BIT_XOR_EXPR
+      && opcode != BIT_IOR_EXPR && opcode != BIT_AND_EXPR)
+    return false;
+
+  hash_map<tree, v_info_ptr> v_info_map;
+  operand_entry *oe1;
+  unsigned i;
+
+  /* Find the summands that come from a VECTOR BIT_FIELD_REF in the addition
+     chain and record their information in the map.  */
+  FOR_EACH_VEC_ELT (*ops, i, oe1)
+    {
+      enum tree_code dcode;
+      gimple *oe1def;
+
+      if (TREE_CODE (oe1->op) != SSA_NAME)
+        continue;
+      oe1def = SSA_NAME_DEF_STMT (oe1->op);
+      if (!is_gimple_assign (oe1def))
+        continue;
+      dcode = gimple_assign_rhs_code (oe1def);
+      if (dcode != BIT_FIELD_REF || !is_reassociable_op (oe1def, dcode, loop))
+        continue;
+
+      tree rhs = gimple_op (oe1def, 1);
+      tree op0 = TREE_OPERAND (rhs, 0);
+      tree vec_type = TREE_TYPE (op0);
+
+      if (TREE_CODE (op0) != SSA_NAME || TREE_CODE (vec_type) != VECTOR_TYPE)
+        continue;
+
+      tree op1 = TREE_OPERAND (rhs, 1);
+      tree op2 = TREE_OPERAND (rhs, 2);
+
+      tree elem_type = TREE_TYPE (vec_type);
+      unsigned HOST_WIDE_INT size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type));
+      if (size != TREE_INT_CST_LOW (op1))
+        continue;
+
+      /* Ignore it if the target machine can't support this VECTOR type.  */
+      if (!VECTOR_MODE_P (TYPE_MODE (vec_type)))
+        continue;
+
+      v_info_ptr *info_ptr = v_info_map.get (op0);
+      if (info_ptr)
+        {
+          v_info_ptr info = *info_ptr;
+          info->offsets.safe_push (TREE_INT_CST_LOW (op2));
+          info->ops_indexes.safe_push (i);
+        }
+      else
+        {
+          v_info_ptr info = new v_info;
+          info->offsets.safe_push (TREE_INT_CST_LOW (op2));
+          info->ops_indexes.safe_push (i);
+          v_info_map.put (op0, info);
+        }
+    }
+
+  /* There must be at least two VECTORs to combine.  */
+  if (v_info_map.elements () <= 1)
+    {
+      cleanup_vinfo_map (v_info_map);
+      return false;
+    }
+
+  /* Use the first VECTOR and its information as the reference.
+     First validate it, that is:
+       1) the required VECTOR operation is supported on the target machine;
+       2) the sorted offsets are adjacent, with no holes;
+       3) they fill the whole VECTOR perfectly.  */
+  hash_map<tree, v_info_ptr>::iterator it = v_info_map.begin ();
+  tree ref_vec = (*it).first;
+  tree vec_type = TREE_TYPE (ref_vec);
+  tree elem_type = TREE_TYPE (vec_type);
+
+  /* Check whether the VECTOR operation is available on the target
+     machine.  */
+  optab op_tab = optab_for_tree_code (opcode, vec_type, optab_vector);
+  if (optab_handler (op_tab, TYPE_MODE (vec_type)) == CODE_FOR_nothing)
+    {
+      cleanup_vinfo_map (v_info_map);
+      return false;
+    }
+
+  v_info_ptr ref_info = (*it).second;
+  ref_info->offsets.qsort (unsigned_cmp);
+  unsigned HOST_WIDE_INT elem_size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type));
+  unsigned HOST_WIDE_INT curr;
+  unsigned HOST_WIDE_INT prev = ref_info->offsets[0];
+
+  /* Check that the sorted offsets are contiguous.  */
+  FOR_EACH_VEC_ELT_FROM (ref_info->offsets, i, curr, 1)
+    {
+      if (curr != (prev + elem_size))
+        {
+          cleanup_vinfo_map (v_info_map);
+          return false;
+        }
+      prev = curr;
+    }
+
+  /* Check whether the offsets fill the whole VECTOR.  */
+  if ((prev + elem_size) != TREE_INT_CST_LOW (TYPE_SIZE (TREE_TYPE (ref_vec))))
+    {
+      cleanup_vinfo_map (v_info_map);
+      return false;
+    }
+
+  auto_vec<tree> vectors (v_info_map.elements ());
+  vectors.quick_push (ref_vec);
+
+  /* Use ref_vec to filter the others.  */
+  for (++it; it != v_info_map.end (); ++it)
+    {
+      tree vec = (*it).first;
+      v_info_ptr info = (*it).second;
+      if (TREE_TYPE (ref_vec) != TREE_TYPE (vec))
+        continue;
+      if (ref_info->offsets.length () != info->offsets.length ())
+        continue;
+      bool same_offset = true;
+      info->offsets.qsort (unsigned_cmp);
+      for (unsigned i = 0; i < ref_info->offsets.length (); i++)
+        {
+          if (ref_info->offsets[i] != info->offsets[i])
+            {
+              same_offset = false;
+              break;
+            }
+        }
+      if (!same_offset)
+        continue;
+      vectors.quick_push (vec);
+    }
+
+  if (vectors.length () < 2)
+    {
+      cleanup_vinfo_map (v_info_map);
+      return false;
+    }
+
+  tree tr;
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    {
+      fprintf (dump_file, "The bit_field_ref vector list for undistribute: ");
+      FOR_EACH_VEC_ELT (vectors, i, tr)
+        {
+          print_generic_expr (dump_file, tr);
+          fprintf (dump_file, " ");
+        }
+      fprintf (dump_file, "\n");
+    }
+
+  /* Build the sum for all candidate VECTORs.  */
+  unsigned idx;
+  gimple *sum = NULL;
+  v_info_ptr info;
+  tree sum_vec = ref_vec;
+  FOR_EACH_VEC_ELT_FROM (vectors, i, tr, 1)
+    {
+      sum = build_and_add_sum (TREE_TYPE (ref_vec), sum_vec, tr, opcode);
+      info = *(v_info_map.get (tr));
+      unsigned j;
+      FOR_EACH_VEC_ELT (info->ops_indexes, j, idx)
+        {
+          gimple *def = SSA_NAME_DEF_STMT ((*ops)[idx]->op);
+          gimple_set_visited (def, true);
+          if (opcode == PLUS_EXPR || opcode == BIT_XOR_EXPR
+              || opcode == BIT_IOR_EXPR)
+            (*ops)[idx]->op = build_zero_cst (TREE_TYPE ((*ops)[idx]->op));
+          else if (opcode == MULT_EXPR)
+            (*ops)[idx]->op = build_one_cst (TREE_TYPE ((*ops)[idx]->op));
+          else
+            {
+              gcc_assert (opcode == BIT_AND_EXPR);
+              (*ops)[idx]->op
+                = build_all_ones_cst (TREE_TYPE ((*ops)[idx]->op));
+            }
+          (*ops)[idx]->rank = 0;
+        }
+      sum_vec = gimple_get_lhs (sum);
+      if (dump_file && (dump_flags & TDF_DETAILS))
+        {
+          fprintf (dump_file, "Generating addition -> ");
+          print_gimple_stmt (dump_file, sum, 0);
+        }
+    }
+
+  /* Referring to any VECTOR of the expected shape (here ref_vec), generate
+     the BIT_FIELD_REF statements accordingly.  */
+  info = *(v_info_map.get (ref_vec));
+  gcc_assert (sum);
+  FOR_EACH_VEC_ELT (info->ops_indexes, i, idx)
+    {
+      tree dst = make_ssa_name (elem_type);
+      gimple *gs
+        = gimple_build_assign (dst, BIT_FIELD_REF,
+                               build3 (BIT_FIELD_REF, elem_type, sum_vec,
+                                       TYPE_SIZE (elem_type),
+                                       bitsize_int (info->offsets[i])));
+      insert_stmt_after (gs, sum);
+      update_stmt (gs);
+      gimple *def = SSA_NAME_DEF_STMT ((*ops)[idx]->op);
+      gimple_set_visited (def, true);
+      (*ops)[idx]->op = gimple_assign_lhs (gs);
+      (*ops)[idx]->rank = get_rank ((*ops)[idx]->op);
+      if (dump_file && (dump_flags & TDF_DETAILS))
+        {
+          fprintf (dump_file, "Generating bit_field_ref -> ");
+          print_gimple_stmt (dump_file, gs, 0);
+        }
+    }
+
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    {
+      fprintf (dump_file, "undistributing bit_field_ref for vector done.\n");
+    }
+
+  cleanup_vinfo_map (v_info_map);
+
+  return true;
+}
+
 /* If OPCODE is BIT_IOR_EXPR or BIT_AND_EXPR and CURR is a comparison
    expression, examine the other OPS to see if any of them are comparisons
    of the same values, which we may be able to combine or eliminate.
@@ -5880,11 +6172,6 @@ reassociate_bb (basic_block bb)
           tree lhs, rhs1, rhs2;
           enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
 
-          /* If this is not a gimple binary expression, there is
-             nothing for us to do with it.  */
-          if (get_gimple_rhs_class (rhs_code) != GIMPLE_BINARY_RHS)
-            continue;
-
           /* If this was part of an already processed statement,
              we don't need to touch it again.  */
           if (gimple_visited_p (stmt))
@@ -5911,6 +6198,11 @@ reassociate_bb (basic_block bb)
               continue;
             }
 
+          /* If this is not a gimple binary expression, there is
+             nothing for us to do with it.  */
+          if (get_gimple_rhs_class (rhs_code) != GIMPLE_BINARY_RHS)
+            continue;
+
           lhs = gimple_assign_lhs (stmt);
           rhs1 = gimple_assign_rhs1 (stmt);
           rhs2 = gimple_assign_rhs2 (stmt);
@@ -5950,6 +6242,13 @@ reassociate_bb (basic_block bb)
                     optimize_ops_list (rhs_code, &ops);
                 }
 
+              if (undistribute_bitref_for_vector (rhs_code, &ops,
+                                                  loop_containing_stmt (stmt)))
+                {
+                  ops.qsort (sort_by_operand_rank);
+                  optimize_ops_list (rhs_code, &ops);
+                }
+
               if (rhs_code == PLUS_EXPR
                   && transform_add_to_multiply (&ops))
                 ops.qsort (sort_by_operand_rank);
-- 
2.7.4

on 2019/3/11 9:49 PM, Kewen.Lin wrote:
> on 2019/3/11 6:24 PM, Richard Biener wrote:
>> On Mon, 11 Mar 2019, Kewen.Lin wrote:
>>
>>> on 2019/3/8 6:57 PM, Richard Biener wrote:
>>>> On Fri, Mar 8, 2019 at 2:40 AM Kewen.Lin <li...@linux.ibm.com> wrote:
>>>>>
>>>>> Hi,
>>>>>
>>>>> As PR88497 (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88497),
>>>>> when we meet some code pattern like:
>>>>>
>>>>>    V1[0] + V1[1] + ... + V1[k] + V2[0] + ... + V2[k] + ... Vn[k]
>>>>>    // V1...Vn of VECTOR_TYPE
>>>>>
>>>>> We can teach reassoc to transform it to:
>>>>>
>>>>>    Vs = (V1 + V2 + ... + Vn)
>>>>>    Vs[0] + Vs[1] + ... + Vs[k]
>>>>>
>>>>> It saves addition and bit_field_ref operations and exposes more
>>>>> opportunities for downstream passes, I notice that even on one
>>>>> target doesn't support vector type and vector type gets expanded
>>>>> in veclower, it's still better to have it, since the generated
>>>>> sequence is more friendly for widening_mul.  (If one more time
>>>>> DCE after forwprop, it should be the same.  )
>>>>>
>>>>> Bootstrapped/regtested on powerpc64le-linux-gnu, ok for trunk?
>>>>
>>>> Hmm.  You are basically vectorizing the reduction partially here (note
>>>> basic-block vectorization doesn't handle reductions yet).  So I'm not sure
>>>> whether we should eventually just change op sort order to sort
>>>> v1[1] after v2[0] to sort v1[0] and v2[0] together.  That would be also
>>>> maybe
>>>> an intermediate step that makes implementing the vectorization part
>>>> "easier"?  I also imagine that the "vectorization" would be profitable even
>>>> if there's just two elements of the vectors summed and the real vectors are
>>>> larger.
>>>>
>>>
>>> Hi Richard,
>>>
>>> Sorry, I have no idea about basic-block vectorization (SLP?), did you
>>> suggest
>>> supporting this there rather than in reassoc?  Changing op sort order for
>>> its expected pattern?
>>
>> I was thinking you're smashing two things together here - one part
>> suitable for reassocation and one that's on the border.  The reassoc
>> pass already gathered quite some stuff that doesn't really belong there,
>> we should at least think twice adding to that.
>>
>
> Got it!  Since the discussion context of the PR is mainly about reassocation,
> I started to look from it.  :)
>
>>>> Note that the code you transform contains no vector operations (just
>>>> element extracts) and thus you need to check for target support before
>>>> creating vector add.
>>>>
>>>
>>> I had the same concern before.  But I thought that if this VECTOR type is
>>> valid
>>> on the target, the addition operation should be fundamental on the target,
>>> then
>>> it's ok; if it's invalid, then veclower will expand it into scalar type as
>>> well
>>> as the addition operation.  So it looks fine to guard it.  Does it make sense?
>>
>> But there's a reassoc phase after veclower.  Generally we avoid generating
>> vector code not supported by the target.  You are not checking whether
>> the vector type is valid on the target either btw.  The user might have
>> written an optimal scalar sequence for a non-natively supported vector
>> type (and veclower wouldn't touch the original scalar code) and veclower
>> generally makes a mess of things, likely worse than the original code.
>>
>
> Thanks for your explanation!  I'll update the code to check vector supported
> by
> target or not.
>
>>>
>>>> This is for GCC 10.  Also it doesn't fix PR88497, does it?
>>>>
>>>
>>> Yes, it's in low priority and for GCC10.  I think it does.  Did I miss
>>> something?
>>
>> Wasn't the original testcase in the PR for _vectorized_ code?  The
>> PR then got an additional testcase which you indeed fix.
>>
>
> The original case is actually optimal with -Ofast, but additional case
> isn't.  :)
>
>>> For comment 1 and comment 7 cases, trunk gcc generates the expected code
>>> with -Ofast.  There isn't any issues for the original loop.  But for the
>>> expanded code in comment 1 (manually expanded case), gcc can't generate
>>> better code with multiply-add.
>>>
>>> From comment 9 case (same as comment 1 expanded version):
>>>
>>> without this patch, optimized tree and final asm:
>>>
>>> <bb 2> [local count: 1073741824]:
>>> _1 = *arg2_26(D);
>>> _2 = *arg3_27(D);
>>> _3 = _1 * _2;
>>> _4 = BIT_FIELD_REF <_3, 64, 0>;
>>> _5 = BIT_FIELD_REF <_3, 64, 64>;
>>> _18 = _4 + _5;
>>> _7 = MEM[(__vector double *)arg2_26(D) + 16B];
>>> _8 = MEM[(__vector double *)arg3_27(D) + 16B];
>>> _9 = _7 * _8;
>>> _10 = BIT_FIELD_REF <_9, 64, 0>;
>>> _11 = BIT_FIELD_REF <_9, 64, 64>;
>>> _31 = _10 + _11;
>>> _29 = _18 + _31;
>>> _13 = MEM[(__vector double *)arg2_26(D) + 32B];
>>> _14 = MEM[(__vector double *)arg3_27(D) + 32B];
>>> _15 = _13 * _14;
>>> _16 = BIT_FIELD_REF <_15, 64, 0>;
>>> _17 = BIT_FIELD_REF <_15, 64, 64>;
>>> _6 = _16 + _17;
>>> _12 = _6 + accumulator_28(D);
>>> _37 = _12 + _29;
>>> _19 = MEM[(__vector double *)arg2_26(D) + 48B];
>>> _20 = MEM[(__vector double *)arg3_27(D) + 48B];
>>> _21 = _19 * _20;
>>> _22 = BIT_FIELD_REF <_21, 64, 0>;
>>> _23 = BIT_FIELD_REF <_21, 64, 64>;
>>> _24 = _22 + _23;
>>> accumulator_32 = _24 + _37;
>>> return accumulator_32;
>>>
>>> 0000000000000000 <foo>:
>>>    0: 01 00 e5 f4  lxv     vs7,0(r5)
>>>    4: 11 00 05 f4  lxv     vs0,16(r5)
>>>    8: 21 00 24 f5  lxv     vs9,32(r4)
>>>    c: 21 00 c5 f4  lxv     vs6,32(r5)
>>>   10: 01 00 44 f5  lxv     vs10,0(r4)
>>>   14: 11 00 64 f5  lxv     vs11,16(r4)
>>>   18: 31 00 05 f5  lxv     vs8,48(r5)
>>>   1c: 31 00 84 f5  lxv     vs12,48(r4)
>>>   20: 80 33 29 f1  xvmuldp vs9,vs9,vs6
>>>   24: 80 3b 4a f1  xvmuldp vs10,vs10,vs7
>>>   28: 80 03 6b f1  xvmuldp vs11,vs11,vs0
>>>   2c: 80 43 8c f1  xvmuldp vs12,vs12,vs8
>>>   30: 50 4b 09 f1  xxspltd vs8,vs9,1
>>>   34: 50 5b eb f0  xxspltd vs7,vs11,1
>>>   38: 50 53 0a f0  xxspltd vs0,vs10,1
>>>   3c: 2a 48 28 fd  fadd    f9,f8,f9
>>>   40: 2a 58 67 fd  fadd    f11,f7,f11
>>>   44: 2a 50 00 fc  fadd    f0,f0,f10
>>>   48: 50 63 4c f1  xxspltd vs10,vs12,1
>>>   4c: 2a 60 8a fd  fadd    f12,f10,f12
>>>   50: 2a 08 29 fd  fadd    f9,f9,f1
>>>   54: 2a 58 00 fc  fadd    f0,f0,f11
>>>   58: 2a 48 20 fc  fadd    f1,f0,f9
>>>   5c: 2a 08 2c fc  fadd    f1,f12,f1
>>>   60: 20 00 80 4e  blr
>>>
>>>
>>> with this patch, optimized tree and final asm:
>>>
>>> _1 = *arg2_26(D);
>>> _2 = *arg3_27(D);
>>> _7 = MEM[(__vector double *)arg2_26(D) + 16B];
>>> _8 = MEM[(__vector double *)arg3_27(D) + 16B];
>>> _9 = _7 * _8;
>>> _5 = .FMA (_1, _2, _9);
>>> _13 = MEM[(__vector double *)arg2_26(D) + 32B];
>>> _14 = MEM[(__vector double *)arg3_27(D) + 32B];
>>> _19 = MEM[(__vector double *)arg2_26(D) + 48B];
>>> _20 = MEM[(__vector double *)arg3_27(D) + 48B];
>>> _21 = _19 * _20;
>>> _10 = .FMA (_13, _14, _21);
>>> _41 = _5 + _10;
>>> _43 = BIT_FIELD_REF <_41, 64, 64>;
>>> _42 = BIT_FIELD_REF <_41, 64, 0>;
>>> _44 = _42 + _43;
>>> accumulator_32 = accumulator_28(D) + _44;
>>> return accumulator_32;
>>>
>>> 0000000000000000 <foo>:
>>>    0: 11 00 04 f4  lxv     vs0,16(r4)
>>>    4: 11 00 c5 f4  lxv     vs6,16(r5)
>>>    8: 31 00 44 f5  lxv     vs10,48(r4)
>>>    c: 31 00 e5 f4  lxv     vs7,48(r5)
>>>   10: 01 00 84 f5  lxv     vs12,0(r4)
>>>   14: 01 00 05 f5  lxv     vs8,0(r5)
>>>   18: 21 00 64 f5  lxv     vs11,32(r4)
>>>   1c: 21 00 25 f5  lxv     vs9,32(r5)
>>>   20: 80 33 00 f0  xvmuldp vs0,vs0,vs6
>>>   24: 80 3b 4a f1  xvmuldp vs10,vs10,vs7
>>>   28: 08 43 0c f0  xvmaddadp vs0,vs12,vs8
>>>   2c: 90 54 8a f1  xxlor   vs12,vs10,vs10
>>>   30: 08 4b 8b f1  xvmaddadp vs12,vs11,vs9
>>>   34: 00 63 00 f0  xvadddp vs0,vs0,vs12
>>>   38: 50 03 80 f1  xxspltd vs12,vs0,1
>>>   3c: 2a 00 0c fc  fadd    f0,f12,f0
>>>   40: 2a 08 20 fc  fadd    f1,f0,f1
>>>   44: 20 00 80 4e  blr
>>
>> OK, I see.
>>
>> Btw, your code should equally well work for multiplications and
>> bit operations, no?  So it would be nice to extend it for those.
>>
>
> Good point!  I'll extend it.
>
>> Please add the appropriate target support checks for the vector
>> operations.
>>
>
> OK.  Will fix it.
>
>
>> Richard.
>>
>>>> Richard.
>>>>