On Tue, Feb 18, 2025 at 10:27:46AM +0000, Richard Sandiford wrote:
> Thanks, this generally looks really good.  Some comments on top of
> Kyrill's, and Christophe's comment internally about -save-temps.
>
> Spencer Abson <spencer.ab...@arm.com> writes:
> > +/* Build and return a new VECTOR_CST that is the concatenation of
> > +   VEC_IN with itself.  */
> > +static tree
> > +aarch64_self_concat_vec_cst (tree vec_in)
> > +{
> > +  gcc_assert ((TREE_CODE (vec_in) == VECTOR_CST));
> > +  unsigned HOST_WIDE_INT nelts
> > +    = VECTOR_CST_NELTS (vec_in).to_constant ();
> > +
> > +  tree out_type = build_vector_type (TREE_TYPE (TREE_TYPE (vec_in)),
> > +                                     nelts * 2);
>
> It would be good to pass in the type that the caller wants.
> More about that below.

Yeah, I can see the advantage of that.
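Something like the below is roughly what I'm picturing now (untested
sketch; the parameter order and comment wording are just my first guess,
and it also folds in your later points about the duplicate shortcut and
the builder loop):

----------------------------------------------------------------------
/* Build and return a new VECTOR_CST of type OUT_TYPE that is the
   concatenation of VEC_IN with itself.  OUT_TYPE is supplied by the
   caller and must have twice as many elements as VEC_IN.  */
static tree
aarch64_self_concat_vec_cst (tree out_type, tree vec_in)
{
  gcc_assert (TREE_CODE (vec_in) == VECTOR_CST);
  unsigned HOST_WIDE_INT nelts
    = VECTOR_CST_NELTS (vec_in).to_constant ();

  /* Describe a duplicated sequence of NELTS elements, so only NELTS
     elements need to be pushed.  */
  tree_vector_builder vec_out (out_type, nelts, 1);
  for (unsigned i = 0; i < nelts; i++)
    vec_out.quick_push (VECTOR_CST_ELT (vec_in, i));

  return vec_out.build ();
}
----------------------------------------------------------------------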

> > +
> > +  /* Avoid decoding/encoding if the encoding won't change.  */
> > +  if (VECTOR_CST_DUPLICATE_P (vec_in))
> > +    {
> > +      tree vec_out = make_vector (exact_log2
> > +                                  (VECTOR_CST_NPATTERNS (vec_in)), 1);
> > +      unsigned int encoded_size
> > +        = vector_cst_encoded_nelts (vec_in) * sizeof (tree);
> > +
> > +      memcpy (VECTOR_CST_ENCODED_ELTS (vec_out),
> > +              VECTOR_CST_ENCODED_ELTS (vec_in), encoded_size);
> > +
> > +      TREE_TYPE (vec_out) = out_type;
> > +      return vec_out;
> > +    }
>
> I'm not sure this is worth it.  The approach below shouldn't be that
> much less efficient, since all the temporaries are generally on the
> stack.  Also:
>
> > +
> > +  tree_vector_builder vec_out (out_type, nelts, 1);
>
> This call rightly describes a duplicated sequence of NELTS elements so...
>
> > +  for (unsigned i = 0; i < nelts * 2; i++)
> > +    vec_out.quick_push (VECTOR_CST_ELT (vec_in, i % nelts));
>
> ...it should only be necessary to push nelts elements here.

Good point!

> > +
> > +  return vec_out.build ();
> > +}
> > +
> > +/* If the SSA_NAME_DEF_STMT of ARG is an assignment to a
> > +   BIT_FIELD_REF with SIZE and OFFSET, return the object of the
> > +   BIT_FIELD_REF.  Otherwise, return NULL_TREE.  */
> > +static tree
> > +aarch64_object_of_bfr (tree arg, unsigned HOST_WIDE_INT size,
> > +                       unsigned HOST_WIDE_INT offset)
> > +{
> > +  if (TREE_CODE (arg) != SSA_NAME)
> > +    return NULL_TREE;
> > +
> > +  gassign *stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (arg));
> > +
> > +  if (!stmt)
> > +    return NULL_TREE;
> > +
> > +  if (gimple_assign_rhs_code (stmt) != BIT_FIELD_REF)
> > +    return NULL_TREE;
> > +
> > +  tree bf_ref = gimple_assign_rhs1 (stmt);
> > +
> > +  if (bit_field_size (bf_ref).to_constant () != size
> > +      || bit_field_offset (bf_ref).to_constant () != offset)
> > +    return NULL_TREE;
> > +
> > +  return TREE_OPERAND (bf_ref, 0);
>
> I think this also needs to check that operand 0 of the BIT_FIELD_REF
> is a 128-bit vector.  A 64-bit reference at offset 64 could instead
> be into something else, such as a 256-bit vector.
>
> An example is:
>
> ----------------------------------------------------------------------
> #include <arm_neon.h>
>
> typedef int16_t int16x16_t __attribute__((vector_size(32)));
>
> int32x4_t
> f (int16x16_t foo)
> {
>   return vmovl_s16 ((int16x4_t) { foo[4], foo[5], foo[6], foo[7] });
> }
> ----------------------------------------------------------------------
>
> which triggers an ICE.
>
> Even if the argument is a 128-bit vector, it could be a 128-bit
> vector of a different type, such as in:
>
> ----------------------------------------------------------------------
> #include <arm_neon.h>
>
> int32x4_t
> f (int32x4_t foo)
> {
>   return vmovl_s16 (vget_high_s16 (vreinterpretq_s16_s32 (foo)));
> }
> ----------------------------------------------------------------------
>
> I think we should still accept this second case, but emit a VIEW_CONVERT_EXPR
> before the call to convert the argument to the right type.

Thanks for raising these, serious tunnel vision on my part...
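For the first case, I'm planning to have aarch64_object_of_bfr punt
unless operand 0 of the BIT_FIELD_REF is a 128-bit vector, along these
lines (untested sketch, reusing the TYPE_SIZE check from your suggestion
further down):

----------------------------------------------------------------------
  tree obj = TREE_OPERAND (bf_ref, 0);
  tree obj_type = TREE_TYPE (obj);

  /* Only accept a reference into a 128-bit vector.  References into
     wider objects (e.g. a 256-bit generic vector) are not candidates
     for this fold.  */
  if (!VECTOR_TYPE_P (obj_type)
      || wi::to_widest (TYPE_SIZE (obj_type)) != 128)
    return NULL_TREE;

  return obj;
----------------------------------------------------------------------

and then handle the second case with the VIEW_CONVERT_EXPR you describe.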

> > +}
> > +
> > +/* Prefer to use the highpart builtin when:
> > +
> > +   1) All lowpart arguments are references to the highparts of other
> > +   vectors.
> > +
> > +   2) For calls with two lowpart arguments, if either refers to a
> > +   vector highpart and the other is a VECTOR_CST.  We can copy the
> > +   VECTOR_CST to 128b in this case.  */
> > +static bool
> > +aarch64_fold_lo_call_to_hi (tree arg_0, tree arg_1, tree *out_0,
> > +                            tree *out_1)
> > +{
> > +  /* Punt until as late as possible:
> > +
> > +     1) By folding away BIT_FIELD_REFs we remove information about the
> > +     operands that may be useful to other optimizers.
> > +
> > +     2) For simplicity, we'd like the expression
> > +
> > +          x = BIT_FIELD_REF<a, 64, 64>
> > +
> > +     to imply that A is not a VECTOR_CST.  This assumption is unlikely
> > +     to hold before constant propagation/folding.  */
> > +  if (!(cfun->curr_properties & PROP_last_full_fold))
> > +    return false;
> > +
> > +  unsigned int offset = BYTES_BIG_ENDIAN ? 0 : 64;
> > +
> > +  tree hi_arg_0 = aarch64_object_of_bfr (arg_0, 64, offset);
> > +  tree hi_arg_1 = aarch64_object_of_bfr (arg_1, 64, offset);
> > +  if (!hi_arg_0)
> > +    {
> > +      if (!hi_arg_1 || TREE_CODE (arg_0) != VECTOR_CST)
> > +        return false;
> > +      hi_arg_0 = aarch64_self_concat_vec_cst (arg_0);
> > +    }
> > +  else if (!hi_arg_1)
> > +    {
> > +      if (TREE_CODE (arg_1) != VECTOR_CST)
> > +        return false;
> > +      hi_arg_1 = aarch64_self_concat_vec_cst (arg_1);
> > +    }
> > +
> > +  *out_0 = hi_arg_0;
> > +  *out_1 = hi_arg_1;
> > +  return true;
> > +}
> > +
> > +static bool
> > +aarch64_fold_lo_call_to_hi (tree arg_in, tree *out)
> > +{
> > +  if (!(cfun->curr_properties & PROP_last_full_fold))
> > +    return false;
> > +  unsigned int offset = BYTES_BIG_ENDIAN ? 0 : 64;
> > +
> > +  tree hi_arg = aarch64_object_of_bfr (arg_in, 64, offset);
> > +  if (!hi_arg)
> > +    return false;
> > +
> > +  *out = hi_arg;
> > +  return true;
> > +}
> > +
> > +#undef LO_HI_PAIR
> > +#define LO_HI_PAIR(A, B) case AARCH64_SIMD_BUILTIN_##A:
> > +
> >  /* Try to fold STMT, given that it's a call to the built-in function with
> >     subcode FCODE.  Return the new statement on success and null on
> >     failure.  */
> > @@ -5168,6 +5322,84 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt,
> >            }
> >          break;
> >        }
> > +      break;
> > +    UNOP_LONG_LH_PAIRS
> > +      {
> > +        tree builtin_hi = aarch64_get_highpart_builtin (fcode);
> > +        gcc_assert (nargs == 1 && builtin_hi != NULL_TREE);
> > +
> > +        tree hi_arg;
> > +        if (!aarch64_fold_lo_call_to_hi (args[0], &hi_arg))
> > +          break;
> > +        new_stmt = gimple_build_call (builtin_hi, 1, hi_arg);
> > +        gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
> > +      }
> > +      break;
> > +    BINOP_LONG_LH_PAIRS
> > +      {
> > +        tree builtin_hi = aarch64_get_highpart_builtin (fcode);
> > +        gcc_assert (nargs == 2 && builtin_hi != NULL_TREE);
> > +
> > +        tree hi_arg_0, hi_arg_1;
> > +        if (!aarch64_fold_lo_call_to_hi (args[0], args[1], &hi_arg_0,
> > +                                         &hi_arg_1))
> > +          break;
> > +        new_stmt = gimple_build_call (builtin_hi, 2,
> > +                                      hi_arg_0, hi_arg_1);
> > +        gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
> > +      }
> > +      break;
> > +    BINOP_LONG_N_LH_PAIRS
> > +      {
> > +        tree builtin_hi = aarch64_get_highpart_builtin (fcode);
> > +        gcc_assert (nargs == 2 && builtin_hi != NULL_TREE);
> > +
> > +        tree hi_arg;
> > +        if (!aarch64_fold_lo_call_to_hi (args[0], &hi_arg))
> > +          break;
> > +        new_stmt = gimple_build_call (builtin_hi, 2, hi_arg, args[1]);
> > +        gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
> > +      }
> > +      break;
> > +    TERNOP_LONG_LH_PAIRS
> > +      {
> > +        tree builtin_hi = aarch64_get_highpart_builtin (fcode);
> > +        gcc_assert (nargs == 3 && builtin_hi != NULL_TREE);
> > +
> > +        tree hi_arg_0, hi_arg_1;
> > +        if (!aarch64_fold_lo_call_to_hi (args[1], args[2], &hi_arg_0,
> > +                                         &hi_arg_1))
> > +          break;
> > +        new_stmt = gimple_build_call (builtin_hi, 3, args[0],
> > +                                      hi_arg_0, hi_arg_1);
> > +        gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
> > +      }
> > +      break;
> > +    TERNOP_LONG_N_LH_PAIRS
> > +      {
> > +        tree builtin_hi = aarch64_get_highpart_builtin (fcode);
> > +        gcc_assert (nargs == 3 && builtin_hi != NULL_TREE);
> > +
> > +        tree hi_arg;
> > +        if (!aarch64_fold_lo_call_to_hi (args[1], &hi_arg))
> > +          break;
> > +        new_stmt = gimple_build_call (builtin_hi, 3, args[0], hi_arg,
> > +                                      args[2]);
> > +        gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
> > +      }
> > +      break;
> > +    BINOP_WIDE_LH_PAIRS
> > +      {
> > +        tree builtin_hi = aarch64_get_highpart_builtin (fcode);
> > +        gcc_assert (nargs == 2 && builtin_hi != NULL_TREE);
> > +
> > +        tree hi_arg;
> > +        if (!aarch64_fold_lo_call_to_hi (args[1], &hi_arg))
> > +          break;
> > +        new_stmt = gimple_build_call (builtin_hi, 2, args[0], hi_arg);
> > +        gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
> > +      }
> > +      break;
>
> I was wondering if, rather than have separate case blocks for each
> style of function signature, we could instead derive the mapping
> programmatically.  For example (feel free to rework):
>
> ----------------------------------------------------------------------
> static gcall *
> aarch64_fold_lo_call_to_hi (unsigned int fcode, gcall *stmt)
> {
>   /* Punt until as late as possible:
>
>      1) By folding away BIT_FIELD_REFs we remove information about the
>      operands that may be useful to other optimizers.
>
>      2) For simplicity, we'd like the expression
>
>           x = BIT_FIELD_REF<a, 64, 64>
>
>      to imply that A is not a VECTOR_CST.  This assumption is unlikely
>      to hold before constant propagation/folding.  */
>   if (!(cfun->curr_properties & PROP_last_full_fold))
>     return nullptr;
>
>   unsigned int offset = BYTES_BIG_ENDIAN ? 0 : 64;
>
>   /* The arguments of the builtin pair differ only in the lo and hi arguments.
>      Find which arguments those are and build up a new list of arguments for
>      the hi builtin.  Defer converting 64-bit constants into 128-bit constants
>      until we know that we want to go ahead with the change.  */
>   tree builtin_hi = aarch64_get_highpart_builtin (fcode);
>   tree args_lo = TYPE_ARG_TYPES (gimple_call_fntype (stmt));
>   tree args_hi = TYPE_ARG_TYPES (TREE_TYPE (builtin_hi));
>   unsigned int argno = 0;
>   unsigned int num_var = 0;
>   auto_vec<tree, 8> new_args;
>   auto_vec<unsigned int, 2> const_argnos;
>   while (args_lo != void_list_node && args_hi != void_list_node)
>     {
>       tree arg_lo = TREE_VALUE (args_lo);
>       tree arg_hi = TREE_VALUE (args_hi);
>       tree new_arg = gimple_call_arg (stmt, argno);
>       if (!types_compatible_p (arg_lo, arg_hi))
>         {
>           gcc_assert (wi::to_widest (TYPE_SIZE (arg_lo)) == 64
>                       && wi::to_widest (TYPE_SIZE (arg_hi)) == 128);
>           if (tree hi_arg = aarch64_object_of_bfr (new_arg, 64, offset))
>             {
>               num_var += 1;
>               new_arg = hi_arg;
>             }
>           else if (TREE_CODE (new_arg) == VECTOR_CST)
>             const_argnos.safe_push (argno);
>           else
>             return nullptr;
>         }
>       new_args.safe_push (new_arg);
>       args_lo = TREE_CHAIN (args_lo);
>       args_hi = TREE_CHAIN (args_hi);
>       argno += 1;
>     }
>   gcc_assert (args_lo == void_list_node && args_hi == void_list_node);
>
>   if (num_var == 0)
>     return nullptr;
>
>   for (auto i : const_argnos)
>     new_args[i] = aarch64_self_concat_vec_cst (new_args[i]);
>
>   auto *new_stmt = gimple_build_call_vec (builtin_hi, new_args);
>   gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
>   return new_stmt;
> }
> ----------------------------------------------------------------------
>
> This seems to pass the tests in the patch.  It doesn't address my
> comments above, but one way of handling those would be to record the
> required function type in the main loop and create a VIEW_CONVERT_EXPR
> after the "for (auto i : const_argnos)" loop.
>
> Thanks,
> Richard

Thanks for all this - I really appreciate the detail.  I shied away from
an approach like the above because I knew NARGS was at most 3, but it
certainly scales better as a solution (thinking about the _lane
builtins), so perhaps it's the way to go.  Let me see if I can tie your
suggestions together.

Spencer
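
P.S. For the VIEW_CONVERT_EXPR fix-up after the "for (auto i :
const_argnos)" loop, this is roughly what I'd try.  It's completely
untested: want_type and gsi are placeholder names, and it assumes we
have a gimple_stmt_iterator available at this point so that the
conversion can be inserted before the new call:

----------------------------------------------------------------------
  /* WANT_TYPE is the 128-bit vector type that the highpart builtin
     expects for this argument; NEW_ARGS[I] is a 128-bit vector of a
     different type.  Reinterpret it rather than punting.  */
  tree vce = build1 (VIEW_CONVERT_EXPR, want_type, new_args[i]);
  tree tmp = make_ssa_name (want_type);
  gassign *conv_stmt = gimple_build_assign (tmp, vce);
  gsi_insert_before (gsi, conv_stmt, GSI_SAME_STMT);
  new_args[i] = tmp;
----------------------------------------------------------------------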