Joel Hutton <joel.hut...@arm.com> writes:
> Hi Richards,
>
> This patch adds support for the V8QI->V8HI case from widening vect
> patterns, as discussed, to target PR98772.

Thanks, the approach looks good to me.  Mostly just minor comments below.

> Bootstrapped and regression tested on aarch64.
>
> [aarch64][vect] Support V8QI->V8HI WIDEN_ patterns
>
> In the case where 8 out of every 16 elements are widened using a
> widening pattern and the next 8 are skipped, the patterns are not
> recognized.  This is because they are normally used in a pair, such as
> VEC_WIDEN_MINUS_HI/LO, to achieve a v16qi->v16hi conversion for example.
> This patch adds support for V8QI->V8HI patterns.
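For reference, the kind of loop the patch targets can be reduced to
something like the following (an illustration only; the actual
gcc.target/aarch64/pr98772.c test is not quoted in this thread).  Eight
8-bit elements are widened to 16 bits per iteration, so a single
V8QI->V8HI widen-minus suffices and no VEC_WIDEN_MINUS_HI/LO pair is
needed:

  #include <stdint.h>

  /* Only 8 narrow elements per iteration: half a V16QI input vector,
     but a full V8HI output vector.  */
  void
  widen_minus_8 (uint16_t *restrict d, const uint8_t *restrict a,
                 const uint8_t *restrict b)
  {
    for (int i = 0; i < 8; i++)
      d[i] = (uint16_t) a[i] - (uint16_t) b[i];
  }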
> gcc/ChangeLog:
>
>         PR tree-optimization/98772
>         * optabs-tree.c (supportable_convert_operation): Add case for
>         V8QI->V8HI.
>         * tree-vect-stmts.c (vect_create_vectorized_promotion_stmts): New
>         function to generate promotion stmts for V8QI->V8HI.
>         (vectorizable_conversion): Add case for V8QI->V8HI.
>
> gcc/testsuite/ChangeLog:
>
>         PR tree-optimization/98772
>         * gcc.target/aarch64/pr98772.c: New test.
>
> diff --git a/gcc/optabs-tree.c b/gcc/optabs-tree.c
> index c94073e3ed98f8c4cab65891f65dedebdb1ec274..b91ce3af6f0d4b3a62110bdb38f68ecc53765cad 100644
> --- a/gcc/optabs-tree.c
> +++ b/gcc/optabs-tree.c
> @@ -308,6 +308,40 @@ supportable_convert_operation (enum tree_code code,
>    if (!VECTOR_MODE_P (m1) || !VECTOR_MODE_P (m2))
>      return false;
>
> +  /* The case where a widening operation is not making use of the full
> +     width of the input vector, but is using the full width of the output
> +     vector.  Return the non-widened code, which will be used after the
> +     inputs are converted to the wide type.  */
> +  if ((code == WIDEN_MINUS_EXPR
> +      || code == WIDEN_PLUS_EXPR
> +      || code == WIDEN_MULT_EXPR
> +      || code == WIDEN_LSHIFT_EXPR)

Minor formatting nit, but the ||s should be indented one space more.

> +      && known_eq (TYPE_VECTOR_SUBPARTS (vectype_in),
> +                   TYPE_VECTOR_SUBPARTS (vectype_out)))
> +    {
> +      switch (code)
> +        {
> +        case WIDEN_LSHIFT_EXPR:
> +          *code1 = LSHIFT_EXPR;
> +          return true;
> +          break;
> +        case WIDEN_MINUS_EXPR:
> +          *code1 = MINUS_EXPR;
> +          return true;
> +          break;
> +        case WIDEN_PLUS_EXPR:
> +          *code1 = PLUS_EXPR;
> +          return true;
> +          break;
> +        case WIDEN_MULT_EXPR:
> +          *code1 = MULT_EXPR;
> +          return true;
> +          break;
> +        default:
> +          gcc_unreachable ();
> +        }
> +    }

Rather than return true, I think we should do something like:

  if (!supportable_convert_operation (NOP_EXPR, vectype_out, vectype_in,
                                      &dummy_code))
    return false;

  optab = optab_for_tree_code (*code1, vectype_out, optab_default);
  return (optab_handler (optab, TYPE_MODE (vectype_out))
          != CODE_FOR_nothing);

to make sure that the target really does support this.

AFAICT the caller always knows when it wants the “if” statement above
to be used.  What it's doing is a bit different from what
supportable_convert_operation normally does, so it might be better to
put it into a separate function that tests whether the target supports
the non-widening form of a widening operation.
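One possible shape for such a separate helper, assembled from the
fragments already quoted above (the name
supportable_half_widening_operation and its exact interface are
illustrative assumptions, not something the posted patch defines):

  /* Return true if the target supports the non-widening form of the
     widening operation CODE, where VECTYPE_IN and VECTYPE_OUT have the
     same number of elements.  On success, store the non-widening code
     in *CODE1.  */

  static bool
  supportable_half_widening_operation (enum tree_code code,
                                       tree vectype_out, tree vectype_in,
                                       enum tree_code *code1)
  {
    enum tree_code dummy_code;
    optab op;

    if (!VECTOR_MODE_P (TYPE_MODE (vectype_out))
        || !VECTOR_MODE_P (TYPE_MODE (vectype_in))
        || !known_eq (TYPE_VECTOR_SUBPARTS (vectype_in),
                      TYPE_VECTOR_SUBPARTS (vectype_out)))
      return false;

    switch (code)
      {
      case WIDEN_LSHIFT_EXPR:
        *code1 = LSHIFT_EXPR;
        break;
      case WIDEN_MINUS_EXPR:
        *code1 = MINUS_EXPR;
        break;
      case WIDEN_PLUS_EXPR:
        *code1 = PLUS_EXPR;
        break;
      case WIDEN_MULT_EXPR:
        *code1 = MULT_EXPR;
        break;
      default:
        return false;
      }

    /* The inputs must be widenable to the output type...  */
    if (!supportable_convert_operation (NOP_EXPR, vectype_out, vectype_in,
                                        &dummy_code))
      return false;

    /* ...and the non-widening operation must be supported on the wide
       mode.  */
    op = optab_for_tree_code (*code1, vectype_out, optab_default);
    return (optab_handler (op, TYPE_MODE (vectype_out)) != CODE_FOR_nothing);
  }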
> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> index f180ced312443ba1e698932d5e8362208690b3fc..b34b00f67ea67943dee7023ab9bfd19c1be5ccbe 100644
> --- a/gcc/tree-vect-stmts.c
> +++ b/gcc/tree-vect-stmts.c
> @@ -4545,6 +4545,72 @@ vect_create_vectorized_promotion_stmts (vec_info *vinfo,
>    *vec_oprnds0 = vec_tmp;
>  }
>
> +/* Create vectorized promotion stmts for widening stmts using only half the
> +   potential vector size for input.  */
> +static void
> +vect_create_vectorized_promotion_stmts (vec_info *vinfo,
> +                                        vec<tree> *vec_oprnds0,
> +                                        vec<tree> *vec_oprnds1,
> +                                        stmt_vec_info stmt_info, tree vec_dest,
> +                                        gimple_stmt_iterator *gsi,
> +                                        enum tree_code code1,
> +                                        int op_type)
> +{
> +  int i;
> +  tree vop0, vop1, new_tmp;
> +  gimple *new_stmt1;
> +  gimple *new_stmt2;
> +  gimple *new_stmt3;
> +  vec<tree> vec_tmp = vNULL;
> +
> +  vec_tmp.create (vec_oprnds0->length () * 2);

It looks like the * 2 isn't needed.

> +  FOR_EACH_VEC_ELT (*vec_oprnds0, i, vop0)
> +    {
> +      tree new_tmp1, new_tmp2, new_tmp3, out_type;
> +
> +      gcc_assert (op_type == binary_op);
> +      vop1 = (*vec_oprnds1)[i];
> +
> +      /* Widen the first vector input.  */
> +      out_type = TREE_TYPE (vec_dest);
> +      new_tmp1 = make_ssa_name (out_type);
> +      new_stmt1 = gimple_build_assign (new_tmp1, NOP_EXPR, vop0);
> +      vect_finish_stmt_generation (vinfo, stmt_info, new_stmt1, gsi);
> +      if (VECTOR_TYPE_P (TREE_TYPE (vop1)))
> +        {
> +          /* Widen the second vector input.  */
> +          new_tmp2 = make_ssa_name (out_type);
> +          new_stmt2 = gimple_build_assign (new_tmp2, NOP_EXPR, vop1);
> +          vect_finish_stmt_generation (vinfo, stmt_info, new_stmt2, gsi);
> +          /* Perform the operation with both vector inputs widened.  */
> +          new_stmt3 = gimple_build_assign (vec_dest, code1, new_tmp1, new_tmp2);
> +        }
> +      else
> +        {
> +          /* Perform the operation with the single vector input widened.  */
> +          new_stmt3 = gimple_build_assign (vec_dest, code1, new_tmp1, vop1);
> +        }
> +
> +      new_tmp3 = make_ssa_name (vec_dest, new_stmt3);
> +      gimple_assign_set_lhs (new_stmt3, new_tmp3);
> +      vect_finish_stmt_generation (vinfo, stmt_info, new_stmt3, gsi);
> +      if (is_gimple_call (new_stmt3))
> +        {
> +          new_tmp = gimple_call_lhs (new_stmt3);
> +        }
> +      else
> +        {
> +          new_tmp = gimple_assign_lhs (new_stmt3);
> +        }

The lhs is always new_tmp3, so it's not necessary to read it back.

> +
> +      /* Store the results for the next step.  */
> +      vec_tmp.quick_push (new_tmp);

FWIW, you could just assign to vec_oprnds[i] and not have vec_tmp,
but I don't know whether that's more or less confusing.  Either way's
fine with me.

> +    }
> +
> +  vec_oprnds0->release ();
> +  *vec_oprnds0 = vec_tmp;
> +}
>
>  /* Check if STMT_INFO performs a conversion operation that can be vectorized.
>     If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
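In source terms, the new function simply converts each input to the
wide type and then performs the normal (non-widening) operation.
Written with GNU C vector extensions for illustration (not part of the
patch):

  typedef unsigned char v8qi __attribute__ ((vector_size (8)));
  typedef unsigned short v8hi __attribute__ ((vector_size (16)));

  v8hi
  widen_sub (v8qi a, v8qi b)
  {
    v8hi wa = __builtin_convertvector (a, v8hi); /* NOP_EXPR: widen input 0.  */
    v8hi wb = __builtin_convertvector (b, v8hi); /* NOP_EXPR: widen input 1.  */
    return wa - wb;                              /* code1 (here MINUS_EXPR).  */
  }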
> @@ -4697,7 +4763,13 @@ vectorizable_conversion (vec_info *vinfo,
>    nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
>    nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
>    if (known_eq (nunits_out, nunits_in))
> -    modifier = NONE;
> +    if (code == WIDEN_MINUS_EXPR
> +        || code == WIDEN_PLUS_EXPR
> +        || code == WIDEN_LSHIFT_EXPR
> +        || code == WIDEN_MULT_EXPR)
> +        modifier = WIDEN;

Formatting nit: the last line should be indented by 6 spaces rather
than 8.

> +    else
> +      modifier = NONE;

> @@ -4743,9 +4815,21 @@ vectorizable_conversion (vec_info *vinfo,
>        return false;
>
>      case WIDEN:
> -      if (supportable_widening_operation (vinfo, code, stmt_info, vectype_out,
> -                                          vectype_in, &code1, &code2,
> -                                          &multi_step_cvt, &interm_types))
> +      if (known_eq (nunits_out, nunits_in)
> +          && (code == WIDEN_MINUS_EXPR
> +              || code == WIDEN_LSHIFT_EXPR
> +              || code == WIDEN_PLUS_EXPR
> +              || code == WIDEN_MULT_EXPR)
> +          && supportable_convert_operation (code, vectype_out, vectype_in,
> +                                            &code1))

Guess this is personal taste, sorry, since it's clearly right both ways,
but IMO it'd be better to drop the code test.  We can only get here with
nunits_out==nunits_in if we're converting a widening operation into a
non-widening operation.  If we do end up calling a separate function
(as per the comment above), then it would abort in a meaningful place
if something unexpected slips through.

> +        {
> +          gcc_assert (!(multi_step_cvt && op_type == binary_op));
> +          break;
> +        }
> +      else if (supportable_widening_operation (vinfo, code, stmt_info,
> +                                               vectype_out, vectype_in, &code1,
> +                                               &code2, &multi_step_cvt,
> +                                               &interm_types))
>        {
>          /* Binary widening operation can only be supported directly by the
>             architecture.  */

> @@ -4981,10 +5065,20 @@ vectorizable_conversion (vec_info *vinfo,
>            c1 = codecvt1;
>            c2 = codecvt2;
>          }
> -      vect_create_vectorized_promotion_stmts (vinfo, &vec_oprnds0,
> -                                              &vec_oprnds1, stmt_info,
> -                                              this_dest, gsi,
> -                                              c1, c2, op_type);
> +      if ((code == WIDEN_MINUS_EXPR
> +           || code == WIDEN_PLUS_EXPR
> +           || code == WIDEN_LSHIFT_EXPR
> +           || code == WIDEN_MULT_EXPR)
> +          && known_eq (nunits_in, nunits_out))

Same comment here about dropping the code tests.

Thanks,
Richard

> +        vect_create_vectorized_promotion_stmts (vinfo, &vec_oprnds0,
> +                                                &vec_oprnds1, stmt_info,
> +                                                this_dest, gsi,
> +                                                c1, op_type);
> +      else
> +        vect_create_vectorized_promotion_stmts (vinfo, &vec_oprnds0,
> +                                                &vec_oprnds1, stmt_info,
> +                                                this_dest, gsi,
> +                                                c1, c2, op_type);
>      }
>    FOR_EACH_VEC_ELT (vec_oprnds0, i, vop0)
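Taking the two suggestions above together, the call site in the last
hunk might simplify to something like this (a sketch that assumes the
separate-helper route is taken, so that only the listed WIDEN_* codes
can reach this point with equal element counts):

  if (known_eq (nunits_out, nunits_in))
    vect_create_vectorized_promotion_stmts (vinfo, &vec_oprnds0,
                                            &vec_oprnds1, stmt_info,
                                            this_dest, gsi,
                                            c1, op_type);
  else
    vect_create_vectorized_promotion_stmts (vinfo, &vec_oprnds0,
                                            &vec_oprnds1, stmt_info,
                                            this_dest, gsi,
                                            c1, c2, op_type);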