Hi! This is a small improvement over the previous patch. The decision to use a whole vector left shift plus an optional VEC_COND_EXPR doesn't have to be a single binary choice for the whole scan, which contains several permutations. E.g. SSE2 can't do the { 0, 4, 5, 6 } permutation other than as a whole vector left shift, but it can do the { 0, 1, 4, 5 } permutation directly, and using the normal permutation wherever it is supported saves some instructions, especially if the initializer is not 0.
The following patch changes the code, so that it remembers what to do for each of the permutations. Bootstrapped/regtested on x86_64-linux and i686-linux, committed to trunk. 2019-06-20 Jakub Jelinek <ja...@redhat.com> * tree-vect-stmts.c (enum scan_store_kind): New type. (scan_store_can_perm_p): Change last argument from int * to vec<enum scan_store_kind> *, record precisely which permutations need whole vector left shift or that plus VEC_COND_EXPR. (vectorizable_scan_store): Adjust caller, use whole vector left shift and additional VEC_COND_EXPR only for those iterations that need it. --- gcc/tree-vect-stmts.c.jj 2019-06-19 11:58:53.161238429 +0200 +++ gcc/tree-vect-stmts.c 2019-06-19 12:40:50.675838267 +0200 @@ -6354,13 +6354,27 @@ scan_operand_equal_p (tree ref1, tree re } +enum scan_store_kind { + /* Normal permutation. */ + scan_store_kind_perm, + + /* Whole vector left shift permutation with zero init. */ + scan_store_kind_lshift_zero, + + /* Whole vector left shift permutation and VEC_COND_EXPR. */ + scan_store_kind_lshift_cond +}; + /* Function check_scan_store. Verify if we can perform the needed permutations or whole vector shifts. - Return -1 on failure, otherwise exact log2 of vectype's nunits. */ + Return -1 on failure, otherwise exact log2 of vectype's nunits. + USE_WHOLE_VECTOR is a vector of enum scan_store_kind which operation + to do at each step. 
*/ static int -scan_store_can_perm_p (tree vectype, tree init, int *use_whole_vector_p = NULL) +scan_store_can_perm_p (tree vectype, tree init, + vec<enum scan_store_kind> *use_whole_vector = NULL) { enum machine_mode vec_mode = TYPE_MODE (vectype); unsigned HOST_WIDE_INT nunits; @@ -6371,50 +6385,59 @@ scan_store_can_perm_p (tree vectype, tre return -1; int i; + enum scan_store_kind whole_vector_shift_kind = scan_store_kind_perm; for (i = 0; i <= units_log2; ++i) { unsigned HOST_WIDE_INT j, k; + enum scan_store_kind kind = scan_store_kind_perm; vec_perm_builder sel (nunits, nunits, 1); sel.quick_grow (nunits); - if (i == 0) + if (i == units_log2) { for (j = 0; j < nunits; ++j) sel[j] = nunits - 1; } else { - for (j = 0; j < (HOST_WIDE_INT_1U << (i - 1)); ++j) + for (j = 0; j < (HOST_WIDE_INT_1U << i); ++j) sel[j] = j; for (k = 0; j < nunits; ++j, ++k) sel[j] = nunits + k; } - vec_perm_indices indices (sel, i == 0 ? 1 : 2, nunits); + vec_perm_indices indices (sel, i == units_log2 ? 1 : 2, nunits); if (!can_vec_perm_const_p (vec_mode, indices)) - break; - } - - if (i == 0) - return -1; - - if (i <= units_log2) - { - if (optab_handler (vec_shl_optab, vec_mode) == CODE_FOR_nothing) - return -1; - int kind = 1; - /* Whole vector shifts shift in zeros, so if init is all zero constant, - there is no need to do anything further. */ - if ((TREE_CODE (init) != INTEGER_CST - && TREE_CODE (init) != REAL_CST) - || !initializer_zerop (init)) { - tree masktype = build_same_sized_truth_vector_type (vectype); - if (!expand_vec_cond_expr_p (vectype, masktype, VECTOR_CST)) + if (i == units_log2) return -1; - kind = 2; + + if (whole_vector_shift_kind == scan_store_kind_perm) + { + if (optab_handler (vec_shl_optab, vec_mode) == CODE_FOR_nothing) + return -1; + whole_vector_shift_kind = scan_store_kind_lshift_zero; + /* Whole vector shifts shift in zeros, so if init is all zero + constant, there is no need to do anything further. 
*/ + if ((TREE_CODE (init) != INTEGER_CST + && TREE_CODE (init) != REAL_CST) + || !initializer_zerop (init)) + { + tree masktype = build_same_sized_truth_vector_type (vectype); + if (!expand_vec_cond_expr_p (vectype, masktype, VECTOR_CST)) + return -1; + whole_vector_shift_kind = scan_store_kind_lshift_cond; + } + } + kind = whole_vector_shift_kind; + } + if (use_whole_vector) + { + if (kind != scan_store_kind_perm && use_whole_vector->is_empty ()) + use_whole_vector->safe_grow_cleared (i); + if (kind != scan_store_kind_perm || !use_whole_vector->is_empty ()) + use_whole_vector->safe_push (kind); } - if (use_whole_vector_p) - *use_whole_vector_p = kind; } + return units_log2; } @@ -6726,11 +6749,12 @@ vectorizable_scan_store (stmt_vec_info s unsigned HOST_WIDE_INT nunits; if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)) gcc_unreachable (); - int use_whole_vector_p = 0; - int units_log2 = scan_store_can_perm_p (vectype, *init, &use_whole_vector_p); + auto_vec<enum scan_store_kind, 16> use_whole_vector; + int units_log2 = scan_store_can_perm_p (vectype, *init, &use_whole_vector); gcc_assert (units_log2 > 0); auto_vec<tree, 16> perms; perms.quick_grow (units_log2 + 1); + tree zero_vec = NULL_TREE, masktype = NULL_TREE; for (int i = 0; i <= units_log2; ++i) { unsigned HOST_WIDE_INT j, k; @@ -6739,23 +6763,28 @@ vectorizable_scan_store (stmt_vec_info s if (i == units_log2) for (j = 0; j < nunits; ++j) sel[j] = nunits - 1; - else - { - for (j = 0; j < (HOST_WIDE_INT_1U << i); ++j) - sel[j] = j; - for (k = 0; j < nunits; ++j, ++k) - sel[j] = nunits + k; - } + else + { + for (j = 0; j < (HOST_WIDE_INT_1U << i); ++j) + sel[j] = j; + for (k = 0; j < nunits; ++j, ++k) + sel[j] = nunits + k; + } vec_perm_indices indices (sel, i == units_log2 ? 
1 : 2, nunits); - if (use_whole_vector_p && i < units_log2) - perms[i] = vect_gen_perm_mask_any (vectype, indices); + if (!use_whole_vector.is_empty () + && use_whole_vector[i] != scan_store_kind_perm) + { + if (zero_vec == NULL_TREE) + zero_vec = build_zero_cst (vectype); + if (masktype == NULL_TREE + && use_whole_vector[i] == scan_store_kind_lshift_cond) + masktype = build_same_sized_truth_vector_type (vectype); + perms[i] = vect_gen_perm_mask_any (vectype, indices); + } else perms[i] = vect_gen_perm_mask_checked (vectype, indices); } - tree zero_vec = use_whole_vector_p ? build_zero_cst (vectype) : NULL_TREE; - tree masktype = (use_whole_vector_p == 2 - ? build_same_sized_truth_vector_type (vectype) : NULL_TREE); stmt_vec_info prev_stmt_info = NULL; tree vec_oprnd1 = NULL_TREE; tree vec_oprnd2 = NULL_TREE; @@ -6788,7 +6817,10 @@ vectorizable_scan_store (stmt_vec_info s { tree new_temp = make_ssa_name (vectype); gimple *g = gimple_build_assign (new_temp, VEC_PERM_EXPR, - zero_vec ? zero_vec : vec_oprnd1, v, + (zero_vec + && (use_whole_vector[i] + != scan_store_kind_perm)) + ? zero_vec : vec_oprnd1, v, perms[i]); new_stmt_info = vect_finish_stmt_generation (stmt_info, g, gsi); if (prev_stmt_info == NULL) @@ -6797,7 +6829,7 @@ vectorizable_scan_store (stmt_vec_info s STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info; prev_stmt_info = new_stmt_info; - if (use_whole_vector_p == 2) + if (zero_vec && use_whole_vector[i] == scan_store_kind_lshift_cond) { /* Whole vector shift shifted in zero bits, but if *init is not initializer_zerop, we need to replace those elements Jakub