The following makes it possible to vectorize complex multiplication without unrolling by allowing mixed operations in an SLP node (plus/minus only, for now). We vectorize such a group by computing both the plus and the minus variant and then merging the two results with a permute.
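For illustration, a minimal scalar kernel in the spirit of PR37021 (a hypothetical reduction, not taken from the PR; the name cmulf is mine) showing the kind of mixed plus/minus group the SLP build previously rejected:

void cmulf (float * __restrict__ r, const float * __restrict__ a,
	    const float * __restrict__ b)
{
  /* Complex multiply: the real part needs a minus, the imaginary part
     a plus, so the two-element store group mixes the two operators.  */
  r[0] = a[0] * b[0] - a[1] * b[1];
  r[1] = a[0] * b[1] + a[1] * b[0];
}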
Something needs to be done in the backends to fully leverage this: on
x86_64, for example, the addsub patterns match vec_merge, but the
permutes only use vec_merge for V4SF, not for V2DF (see the double
sketch after the patch).  We have too many ways to express this kind of
permutation (vec_perm, vec_merge and also vec_select (vec_concat ())).

This is not a full enablement for vectorizing the loop in PR37021;
some pieces are still missing (I'm working step-by-step here,
resurrecting old patches).

Bootstrapped and tested on x86_64-unknown-linux-gnu, applied to trunk.

Richard.

2015-05-12  Richard Biener  <rguent...@suse.de>

	PR tree-optimization/37021
	* tree-vectorizer.h (struct _slp_tree): Add two_operators flag.
	(SLP_TREE_TWO_OPERATORS): New define.
	* tree-vect-slp.c (vect_create_new_slp_node): Initialize
	SLP_TREE_TWO_OPERATORS.
	(vect_build_slp_tree_1): Allow mixing plus and minus in an
	SLP node.
	(vect_build_slp_tree): Adjust.
	(vect_analyze_slp_cost_1): Likewise.
	(vect_schedule_slp_instance): Vectorize mixing plus/minus by
	emitting two vector stmts and mixing the results.

	* gcc.target/i386/vect-addsub.c: New testcase.

Index: gcc/tree-vect-slp.c
===================================================================
*** gcc/tree-vect-slp.c.orig	2015-05-08 13:12:21.449383105 +0200
--- gcc/tree-vect-slp.c	2015-05-12 09:55:03.431066278 +0200
*************** vect_create_new_slp_node (vec<gimple> sc
*** 160,165 ****
--- 160,166 ----
    SLP_TREE_VEC_STMTS (node).create (0);
    SLP_TREE_CHILDREN (node).create (nops);
    SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
+   SLP_TREE_TWO_OPERATORS (node) = false;
  
    return node;
  }
*************** static bool
*** 472,482 ****
  vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
  		       vec<gimple> stmts, unsigned int group_size,
  		       unsigned nops, unsigned int *max_nunits,
! 		       unsigned int vectorization_factor, bool *matches)
  {
    unsigned int i;
!   gimple stmt = stmts[0];
!   enum tree_code first_stmt_code = ERROR_MARK, rhs_code = ERROR_MARK;
    enum tree_code first_cond_code = ERROR_MARK;
    tree lhs;
    bool need_same_oprnds = false;
--- 473,486 ----
  vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
  		       vec<gimple> stmts, unsigned int group_size,
  		       unsigned nops, unsigned int *max_nunits,
! 		       unsigned int vectorization_factor, bool *matches,
! 		       bool *two_operators)
  {
    unsigned int i;
!   gimple first_stmt = stmts[0], stmt = stmts[0];
!   enum tree_code first_stmt_code = ERROR_MARK;
!   enum tree_code alt_stmt_code = ERROR_MARK;
!   enum tree_code rhs_code = ERROR_MARK;
    enum tree_code first_cond_code = ERROR_MARK;
    tree lhs;
    bool need_same_oprnds = false;
*************** vect_build_slp_tree_1 (loop_vec_info loo
*** 662,671 ****
--- 666,685 ----
        else
  	{
  	  if (first_stmt_code != rhs_code
+ 	      && alt_stmt_code == ERROR_MARK)
+ 	    alt_stmt_code = rhs_code;
+ 	  if (first_stmt_code != rhs_code
  	      && (first_stmt_code != IMAGPART_EXPR
  		  || rhs_code != REALPART_EXPR)
  	      && (first_stmt_code != REALPART_EXPR
  		  || rhs_code != IMAGPART_EXPR)
+ 	      /* Handle mismatches in plus/minus by computing both
+ 		 and merging the results.  */
+ 	      && !((first_stmt_code == PLUS_EXPR
+ 		    || first_stmt_code == MINUS_EXPR)
+ 		   && (alt_stmt_code == PLUS_EXPR
+ 		       || alt_stmt_code == MINUS_EXPR)
+ 		   && rhs_code == alt_stmt_code)
  	      && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
  		   && (first_stmt_code == ARRAY_REF
  		       || first_stmt_code == BIT_FIELD_REF
*************** vect_build_slp_tree_1 (loop_vec_info loo
*** 679,685 ****
  			       "Build SLP failed: different operation "
  			       "in stmt ");
  	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
! 	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
  	    }
  	  /* Mismatch.  */
  	  continue;
--- 693,702 ----
  			       "Build SLP failed: different operation "
  			       "in stmt ");
  	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
! 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
! 			       "original stmt ");
! 	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
! 				first_stmt, 0);
  	    }
  	  /* Mismatch.  */
  	  continue;
*************** vect_build_slp_tree_1 (loop_vec_info loo
*** 916,921 ****
--- 933,975 ----
      if (!matches[i])
        return false;
  
+   /* If we allowed a two-operation SLP node verify the target can cope
+      with the permute we are going to use.  */
+   if (alt_stmt_code != ERROR_MARK
+       && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
+     {
+       unsigned char *sel
+ 	= XALLOCAVEC (unsigned char, TYPE_VECTOR_SUBPARTS (vectype));
+       for (i = 0; i < TYPE_VECTOR_SUBPARTS (vectype); ++i)
+ 	{
+ 	  sel[i] = i;
+ 	  if (gimple_assign_rhs_code (stmts[i % group_size]) == alt_stmt_code)
+ 	    sel[i] += TYPE_VECTOR_SUBPARTS (vectype);
+ 	}
+       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+ 	{
+ 	  for (i = 0; i < group_size; ++i)
+ 	    if (gimple_assign_rhs_code (stmts[i]) == alt_stmt_code)
+ 	      {
+ 		matches[i] = false;
+ 		if (dump_enabled_p ())
+ 		  {
+ 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ 				     "Build SLP failed: different operation "
+ 				     "in stmt ");
+ 		    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
+ 				      stmts[i], 0);
+ 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ 				     "original stmt ");
+ 		    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
+ 				      first_stmt, 0);
+ 		  }
+ 	      }
+ 	  return false;
+ 	}
+       *two_operators = true;
+     }
+ 
    return true;
  }
*************** vect_build_slp_tree (loop_vec_info loop_
*** 952,961 ****
    else
      return false;
  
    if (!vect_build_slp_tree_1 (loop_vinfo, bb_vinfo,
  			      SLP_TREE_SCALAR_STMTS (*node), group_size, nops,
! 			      max_nunits, vectorization_factor, matches))
      return false;
  
    /* If the SLP node is a load, terminate the recursion.  */
    if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
--- 1006,1018 ----
    else
      return false;
  
+   bool two_operators = false;
    if (!vect_build_slp_tree_1 (loop_vinfo, bb_vinfo,
  			      SLP_TREE_SCALAR_STMTS (*node), group_size, nops,
! 			      max_nunits, vectorization_factor, matches,
! 			      &two_operators))
      return false;
+   SLP_TREE_TWO_OPERATORS (*node) = two_operators;
  
    /* If the SLP node is a load, terminate the recursion.  */
    if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
*************** vect_analyze_slp_cost_1 (loop_vec_info l
*** 1514,1521 ****
  	}
      }
    else
!     record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
! 		      stmt_info, 0, vect_body);
  
    /* Scan operands and account for prologue cost of constants/externals.
       ??? This over-estimates cost for multiple uses and should be
--- 1571,1587 ----
  	}
      }
    else
!     {
!       record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
! 			stmt_info, 0, vect_body);
!       if (SLP_TREE_TWO_OPERATORS (node))
! 	{
! 	  record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
! 			    stmt_info, 0, vect_body);
! 	  record_stmt_cost (body_cost_vec, ncopies_for_cost, vec_perm,
! 			    stmt_info, 0, vect_body);
! 	}
!     }
  
    /* Scan operands and account for prologue cost of constants/externals.
       ??? This over-estimates cost for multiple uses and should be
*************** vect_schedule_slp_instance (slp_tree nod
*** 3347,3352 ****
--- 3413,3486 ----
        STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
      }
  
+   /* Handle two-operation SLP nodes by vectorizing the group with
+      both operations and then performing a merge.  */
+   if (SLP_TREE_TWO_OPERATORS (node))
+     {
+       enum tree_code code0 = gimple_assign_rhs_code (stmt);
+       enum tree_code ocode;
+       gimple ostmt;
+       unsigned char *mask = XALLOCAVEC (unsigned char, group_size);
+       bool allsame = true;
+       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt)
+ 	if (gimple_assign_rhs_code (ostmt) != code0)
+ 	  {
+ 	    mask[i] = 1;
+ 	    allsame = false;
+ 	    ocode = gimple_assign_rhs_code (ostmt);
+ 	  }
+ 	else
+ 	  mask[i] = 0;
+       if (!allsame)
+ 	{
+ 	  vec<gimple> v0;
+ 	  vec<gimple> v1;
+ 	  unsigned j;
+ 	  tree tmask = NULL_TREE;
+ 	  vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
+ 	  v0 = SLP_TREE_VEC_STMTS (node).copy ();
+ 	  SLP_TREE_VEC_STMTS (node).truncate (0);
+ 	  gimple_assign_set_rhs_code (stmt, ocode);
+ 	  vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
+ 	  gimple_assign_set_rhs_code (stmt, code0);
+ 	  v1 = SLP_TREE_VEC_STMTS (node).copy ();
+ 	  SLP_TREE_VEC_STMTS (node).truncate (0);
+ 	  tree meltype = build_nonstandard_integer_type
+ 	      (GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (vectype))), 1);
+ 	  tree mvectype = get_same_sized_vectype (meltype, vectype);
+ 	  unsigned k = 0, l;
+ 	  for (j = 0; j < v0.length (); ++j)
+ 	    {
+ 	      tree *melts = XALLOCAVEC (tree, TYPE_VECTOR_SUBPARTS (vectype));
+ 	      for (l = 0; l < TYPE_VECTOR_SUBPARTS (vectype); ++l)
+ 		{
+ 		  if (k > group_size)
+ 		    k = 0;
+ 		  melts[l] = build_int_cst
+ 		      (meltype, mask[k++] * TYPE_VECTOR_SUBPARTS (vectype) + l);
+ 		}
+ 	      tmask = build_vector (mvectype, melts);
+ 
+ 	      /* ??? Not all targets support a VEC_PERM_EXPR with a
+ 		 constant mask that would translate to a vec_merge RTX
+ 		 (with their vec_perm_const_ok).  We can either not
+ 		 vectorize in that case or let veclower do its job.
+ 		 Unfortunately that isn't too great and at least for
+ 		 plus/minus we'd eventually like to match targets'
+ 		 vector addsub instructions.  */
+ 	      gimple vstmt;
+ 	      vstmt = gimple_build_assign (make_ssa_name (vectype),
+ 					   VEC_PERM_EXPR,
+ 					   gimple_assign_lhs (v0[j]),
+ 					   gimple_assign_lhs (v1[j]), tmask);
+ 	      vect_finish_stmt_generation (stmt, vstmt, &si);
+ 	      SLP_TREE_VEC_STMTS (node).quick_push (vstmt);
+ 	    }
+ 	  v0.release ();
+ 	  v1.release ();
+ 	  return false;
+ 	}
+     }
+ 
    is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
    return is_store;
  }
Index: gcc/testsuite/gcc.target/i386/vect-addsub.c
===================================================================
*** /dev/null	1970-01-01 00:00:00.000000000 +0000
--- gcc/testsuite/gcc.target/i386/vect-addsub.c	2015-05-11 13:56:07.958892513 +0200
***************
*** 0 ****
--- 1,22 ----
+ /* { dg-do compile } */
+ /* { dg-options "-O2 -ftree-vectorize -msse4 -mtune=generic" } */
+ 
+ /* We need SSE4 so the backend recognizes a { 0, 5, 2, 7 } constant
+    permutation as supported, as the vectorizer wants to generate
+ 
+    vect__6.10_24 = vect__3.6_20 - vect__5.9_23;
+    vect__6.11_25 = vect__3.6_20 + vect__5.9_23;
+    _26 = VEC_PERM_EXPR <vect__6.10_24, vect__6.11_25, { 0, 5, 2, 7 }>;
+ 
+    See also the ??? comment about using and/andn/or in expand_vec_perm_blend
+    for non-SSE4 targets.  */
+ 
+ void testf (float * __restrict__ p, float * __restrict q)
+ {
+   p[0] = p[0] - q[0];
+   p[1] = p[1] + q[1];
+   p[2] = p[2] - q[2];
+   p[3] = p[3] + q[3];
+ }
+ 
+ /* { dg-final { scan-assembler "addsubps" } } */
Index: gcc/tree-vectorizer.h
===================================================================
*** gcc/tree-vectorizer.h.orig	2015-04-28 09:59:32.259075685 +0200
--- gcc/tree-vectorizer.h	2015-05-11 13:29:04.889568537 +0200
*************** struct _slp_tree {
*** 111,116 ****
--- 111,118 ----
       scalar elements in one scalar iteration (GROUP_SIZE) multiplied by VF
       divided by vector size.  */
    unsigned int vec_stmts_size;
+   /* Whether the scalar computations use two different operators.  */
+   bool two_operators;
  };
*************** typedef struct _slp_instance {
*** 146,151 ****
--- 148,154 ----
  #define SLP_TREE_VEC_STMTS(S)              (S)->vec_stmts
  #define SLP_TREE_NUMBER_OF_VEC_STMTS(S)    (S)->vec_stmts_size
  #define SLP_TREE_LOAD_PERMUTATION(S)       (S)->load_permutation
+ #define SLP_TREE_TWO_OPERATORS(S)          (S)->two_operators
  
  /* This structure is used in creation of an SLP tree.  Each instance
     corresponds to the same operand in a group of scalar stmts in an SLP
     node.  */
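For completeness, the double sketch referred to above (mine, not part of the patch): given that the x86_64 permutes only use vec_merge for V4SF, the { 0, 3 } V2DF blend is not recognized yet, so this is not expected to produce addsubpd until the backend patterns are extended:

void testd (double * __restrict__ p, double * __restrict__ q)
{
  /* Same mixed minus/plus group; would need
     VEC_PERM_EXPR <v0, v1, { 0, 3 }> on V2DF to match addsubpd.  */
  p[0] = p[0] - q[0];
  p[1] = p[1] + q[1];
}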