On Tue, Feb 18, 2025 at 10:27:46AM +0000, Richard Sandiford wrote:
> Thanks, this generally looks really good.  Some comments on top of
> Kyrill's, and Christophe's comment internally about -save-temps.
> 
> Spencer Abson <spencer.ab...@arm.com> writes:
> > +/* Build and return a new VECTOR_CST that is the concatenation of
> > +   VEC_IN with itself.  */
> > +static tree
> > +aarch64_self_concat_vec_cst (tree vec_in)
> > +{
> > +  gcc_assert (TREE_CODE (vec_in) == VECTOR_CST);
> > +  unsigned HOST_WIDE_INT nelts
> > +    = VECTOR_CST_NELTS (vec_in).to_constant ();
> > +
> > +  tree out_type = build_vector_type (TREE_TYPE (TREE_TYPE (vec_in)),
> > +                                nelts * 2);
> 
> It would be good to pass in the type that the caller wants.
> More about that below.

Yeah, I can see the advantage of that.
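
Something like this, perhaps?  (Just a sketch of the revised interface,
with the body simplified as you suggest further down.)

----------------------------------------------------------------------
/* Build and return a new VECTOR_CST of type OUT_TYPE that is the
   concatenation of VEC_IN with itself.  OUT_TYPE must have twice as
   many elements as the type of VEC_IN.  */
static tree
aarch64_self_concat_vec_cst (tree out_type, tree vec_in)
{
  gcc_assert (TREE_CODE (vec_in) == VECTOR_CST);
  unsigned HOST_WIDE_INT nelts
    = VECTOR_CST_NELTS (vec_in).to_constant ();
  gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (out_type), nelts * 2));
  ...
}
----------------------------------------------------------------------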

> 
> > +
> > +  /* Avoid decoding/encoding if the encoding won't change.  */
> > +  if (VECTOR_CST_DUPLICATE_P (vec_in))
> > +    {
> > +      tree vec_out
> > +        = make_vector (exact_log2 (VECTOR_CST_NPATTERNS (vec_in)), 1);
> > +      unsigned int encoded_size
> > +        = vector_cst_encoded_nelts (vec_in) * sizeof (tree);
> > +
> > +      memcpy (VECTOR_CST_ENCODED_ELTS (vec_out),
> > +              VECTOR_CST_ENCODED_ELTS (vec_in), encoded_size);
> > +
> > +      TREE_TYPE (vec_out) = out_type;
> > +      return vec_out;
> > +    }
> 
> I'm not sure this is worth it.  The approach below shouldn't be that
> much less efficient, since all the temporaries are generally on the
> stack.  Also:
> 
> > +
> > +  tree_vector_builder vec_out (out_type, nelts, 1);
> 
> This call rightly describes a duplicated sequence of NELTS elements so...
> 
> > +  for (unsigned i = 0; i < nelts * 2; i++)
> > +    vec_out.quick_push (VECTOR_CST_ELT (vec_in, i % nelts));
> 
> ...it should only be necessary to push nelts elements here.

Good point!
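
If I've followed the encoding machinery correctly, the body then
reduces to something like:

----------------------------------------------------------------------
  /* Describe NELTS patterns of one element each; the builder extends
     this duplicated sequence to fill OUT_TYPE.  */
  tree_vector_builder vec_out (out_type, nelts, 1);
  for (unsigned i = 0; i < nelts; i++)
    vec_out.quick_push (VECTOR_CST_ELT (vec_in, i));

  return vec_out.build ();
----------------------------------------------------------------------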

> 
> > +
> > +  return vec_out.build ();
> > +}
> > +
> > +/* If the SSA_NAME_DEF_STMT of ARG is an assignment to a
> > +   BIT_FIELD_REF with SIZE and OFFSET, return the object of the
> > +   BIT_FIELD_REF.  Otherwise, return NULL_TREE.  */
> > +static tree
> > +aarch64_object_of_bfr (tree arg, unsigned HOST_WIDE_INT size,
> > +                  unsigned HOST_WIDE_INT offset)
> > +{
> > +  if (TREE_CODE (arg) != SSA_NAME)
> > +    return NULL_TREE;
> > +
> > +  gassign *stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (arg));
> > +
> > +  if (!stmt)
> > +    return NULL_TREE;
> > +
> > +  if (gimple_assign_rhs_code (stmt) != BIT_FIELD_REF)
> > +    return NULL_TREE;
> > +
> > +  tree bf_ref = gimple_assign_rhs1 (stmt);
> > +
> > +  if (bit_field_size (bf_ref).to_constant () != size
> > +      || bit_field_offset (bf_ref).to_constant () != offset)
> > +    return NULL_TREE;
> > +
> > +  return TREE_OPERAND (bf_ref, 0);
> 
> I think this also needs to check that operand 0 of the BIT_FIELD_REF
> is a 128-bit vector.  A 64-bit reference at offset 64 could instead
> be into something else, such as a 256-bit vector.
> 
> An example is:
> 
> ----------------------------------------------------------------------
> #include <arm_neon.h>
> 
> typedef int16_t int16x16_t __attribute__((vector_size(32)));
> 
> int32x4_t
> f (int16x16_t foo)
> {
>   return vmovl_s16 ((int16x4_t) { foo[4], foo[5], foo[6], foo[7] });
> }
> ----------------------------------------------------------------------
> 
> which triggers an ICE.
> 
> Even if the argument is a 128-bit vector, it could be a 128-bit
> vector of a different type, such as in:
> 
> ----------------------------------------------------------------------
> #include <arm_neon.h>
> 
> int32x4_t
> f (int32x4_t foo)
> {
>   return vmovl_s16 (vget_high_s16 (vreinterpretq_s16_s32 (foo)));
> }
> ----------------------------------------------------------------------
> 
> I think we should still accept this second case, but emit a VIEW_CONVERT_EXPR
> before the call to convert the argument to the right type.
> 

Thanks for raising these, serious tunnel vision on my part...
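
For the first case, I think it's enough for aarch64_object_of_bfr to
punt when the object of the BIT_FIELD_REF is not a 128-bit vector.
Something like this at the end (a sketch, reusing your TYPE_SIZE
idiom from below):

----------------------------------------------------------------------
  tree obj = TREE_OPERAND (bf_ref, 0);

  /* A 64-bit BIT_FIELD_REF at OFFSET could otherwise be into
     something wider, such as a 256-bit vector.  */
  if (TREE_CODE (TREE_TYPE (obj)) != VECTOR_TYPE
      || wi::to_widest (TYPE_SIZE (TREE_TYPE (obj))) != 128)
    return NULL_TREE;

  return obj;
----------------------------------------------------------------------

For the second case, I'll have the caller emit the VIEW_CONVERT_EXPR
once we know the fold is going ahead (sketched at the end of this
reply).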

> > +}
> > +
> > +/*  Prefer to use the highpart builtin when:
> > +
> > +    1) All lowpart arguments are references to the highparts of other
> > +    vectors.
> > +
> > +    2) For calls with two lowpart arguments, one of them refers to
> > +    a vector highpart and the other is a VECTOR_CST.  We can widen
> > +    the VECTOR_CST to 128 bits in this case.  */
> > +static bool
> > +aarch64_fold_lo_call_to_hi (tree arg_0, tree arg_1, tree *out_0,
> > +                       tree *out_1)
> > +{
> > +  /* Punt until as late as possible:
> > +
> > +     1) By folding away BIT_FIELD_REFs we remove information about the
> > +     operands that may be useful to other optimizers.
> > +
> > +     2) For simplicity, we'd like the expression
> > +
> > +   x = BIT_FIELD_REF<a, 64, 64>
> > +
> > +     to imply that A is not a VECTOR_CST.  This assumption is unlikely
> > +     to hold before constant propagation/folding.  */
> > +  if (!(cfun->curr_properties & PROP_last_full_fold))
> > +    return false;
> > +
> > +  unsigned int offset = BYTES_BIG_ENDIAN ? 0 : 64;
> > +
> > +  tree hi_arg_0 = aarch64_object_of_bfr (arg_0, 64, offset);
> > +  tree hi_arg_1 = aarch64_object_of_bfr (arg_1, 64, offset);
> > +  if (!hi_arg_0)
> > +    {
> > +      if (!hi_arg_1 || TREE_CODE (arg_0) != VECTOR_CST)
> > +        return false;
> > +      hi_arg_0 = aarch64_self_concat_vec_cst (arg_0);
> > +    }
> > +  else if (!hi_arg_1)
> > +    {
> > +      if (TREE_CODE (arg_1) != VECTOR_CST)
> > +        return false;
> > +      hi_arg_1 = aarch64_self_concat_vec_cst (arg_1);
> > +    }
> > +
> > +  *out_0 = hi_arg_0;
> > +  *out_1 = hi_arg_1;
> > +  return true;
> > +}
> > +
> > +static bool
> > +aarch64_fold_lo_call_to_hi (tree arg_in, tree *out)
> > +{
> > +  if (!(cfun->curr_properties & PROP_last_full_fold))
> > +    return false;
> > +  unsigned int offset = BYTES_BIG_ENDIAN ? 0 : 64;
> > +
> > +  tree hi_arg = aarch64_object_of_bfr (arg_in, 64, offset);
> > +  if (!hi_arg)
> > +    return false;
> > +
> > +  *out = hi_arg;
> > +  return true;
> > +}
> > +
> > +#undef LO_HI_PAIR
> > +#define LO_HI_PAIR(A, B) case AARCH64_SIMD_BUILTIN_##A:
> > +
> >  /* Try to fold STMT, given that it's a call to the built-in function with
> >     subcode FCODE.  Return the new statement on success and null on
> >     failure.  */
> > @@ -5168,6 +5322,84 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt,
> >         }
> >       break;
> >     }
> > +      break;
> > +    UNOP_LONG_LH_PAIRS
> > +      {
> > +   tree builtin_hi = aarch64_get_highpart_builtin (fcode);
> > +   gcc_assert (nargs == 1 && builtin_hi != NULL_TREE);
> > +
> > +   tree hi_arg;
> > +   if (!aarch64_fold_lo_call_to_hi (args[0], &hi_arg))
> > +     break;
> > +   new_stmt = gimple_build_call (builtin_hi, 1, hi_arg);
> > +   gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
> > +      }
> > +    break;
> > +    BINOP_LONG_LH_PAIRS
> > +      {
> > +   tree builtin_hi = aarch64_get_highpart_builtin (fcode);
> > +   gcc_assert (nargs == 2 && builtin_hi != NULL_TREE);
> > +
> > +   tree hi_arg_0, hi_arg_1;
> > +   if (!aarch64_fold_lo_call_to_hi (args[0], args[1], &hi_arg_0,
> > +                                    &hi_arg_1))
> > +     break;
> > +   new_stmt = gimple_build_call (builtin_hi, 2,
> > +                                 hi_arg_0, hi_arg_1);
> > +   gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
> > +      }
> > +    break;
> > +    BINOP_LONG_N_LH_PAIRS
> > +      {
> > +   tree builtin_hi = aarch64_get_highpart_builtin (fcode);
> > +   gcc_assert (nargs == 2 && builtin_hi != NULL_TREE);
> > +
> > +   tree hi_arg;
> > +   if (!aarch64_fold_lo_call_to_hi (args[0], &hi_arg))
> > +     break;
> > +   new_stmt = gimple_build_call (builtin_hi, 2, hi_arg, args[1]);
> > +   gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
> > +      }
> > +    break;
> > +    TERNOP_LONG_LH_PAIRS
> > +      {
> > +   tree builtin_hi = aarch64_get_highpart_builtin (fcode);
> > +   gcc_assert (nargs == 3 && builtin_hi != NULL_TREE);
> > +
> > +   tree hi_arg_0, hi_arg_1;
> > +   if (!aarch64_fold_lo_call_to_hi (args[1], args[2], &hi_arg_0,
> > +                                    &hi_arg_1))
> > +     break;
> > +   new_stmt = gimple_build_call (builtin_hi, 3, args[0],
> > +                                 hi_arg_0, hi_arg_1);
> > +   gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
> > +      }
> > +    break;
> > +    TERNOP_LONG_N_LH_PAIRS
> > +      {
> > +   tree builtin_hi = aarch64_get_highpart_builtin (fcode);
> > +   gcc_assert (nargs == 3 && builtin_hi != NULL_TREE);
> > +
> > +   tree hi_arg;
> > +   if (!aarch64_fold_lo_call_to_hi (args[1], &hi_arg))
> > +     break;
> > +   new_stmt = gimple_build_call (builtin_hi, 3, args[0], hi_arg,
> > +                                 args[2]);
> > +   gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
> > +      }
> > +    break;
> > +    BINOP_WIDE_LH_PAIRS
> > +      {
> > +   tree builtin_hi = aarch64_get_highpart_builtin (fcode);
> > +   gcc_assert (nargs == 2 && builtin_hi != NULL_TREE);
> > +
> > +   tree hi_arg;
> > +   if (!aarch64_fold_lo_call_to_hi (args[1], &hi_arg))
> > +     break;
> > +   new_stmt = gimple_build_call (builtin_hi, 2, args[0], hi_arg);
> > +   gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
> > +      }
> > +    break;
> 
> I was wondering if, rather than have separate case blocks for each
> style of function signature, we could instead derive the mapping
> programmatically.  For example (feel free to rework):
> 
> ----------------------------------------------------------------------
> static gcall *
> aarch64_fold_lo_call_to_hi (unsigned int fcode, gcall *stmt)
> {
>   /* Punt until as late as possible:
> 
>      1) By folding away BIT_FIELD_REFs we remove information about the
>      operands that may be useful to other optimizers.
> 
>      2) For simplicity, we'd like the expression
> 
>       x = BIT_FIELD_REF<a, 64, 64>
> 
>      to imply that A is not a VECTOR_CST.  This assumption is unlikely
>      to hold before constant propagation/folding.  */
>   if (!(cfun->curr_properties & PROP_last_full_fold))
>     return nullptr;
> 
>   unsigned int offset = BYTES_BIG_ENDIAN ? 0 : 64;
> 
>   /* The arguments of the builtin pair differ only in the lo and hi arguments.
>      Find which arguments those are and build up a new list of arguments for
>      the hi builtin.  Defer converting 64-bit constants into 128-bit constants
>      until we know that we want to go ahead with the change.  */
>   tree builtin_hi = aarch64_get_highpart_builtin (fcode);
>   tree args_lo = TYPE_ARG_TYPES (gimple_call_fntype (stmt));
>   tree args_hi = TYPE_ARG_TYPES (TREE_TYPE (builtin_hi));
>   unsigned int argno = 0;
>   unsigned int num_var = 0;
>   auto_vec<tree, 8> new_args;
>   auto_vec<unsigned int, 2> const_argnos;
>   while (args_lo != void_list_node && args_hi != void_list_node)
>     {
>       tree arg_lo = TREE_VALUE (args_lo);
>       tree arg_hi = TREE_VALUE (args_hi);
>       tree new_arg = gimple_call_arg (stmt, argno);
>       if (!types_compatible_p (arg_lo, arg_hi))
>         {
>           gcc_assert (wi::to_widest (TYPE_SIZE (arg_lo)) == 64
>                       && wi::to_widest (TYPE_SIZE (arg_hi)) == 128);
>           if (tree hi_arg = aarch64_object_of_bfr (new_arg, 64, offset))
>             {
>               num_var += 1;
>               new_arg = hi_arg;
>             }
>           else if (TREE_CODE (new_arg) == VECTOR_CST)
>             const_argnos.safe_push (argno);
>           else
>             return nullptr;
>         }
>       new_args.safe_push (new_arg);
>       args_lo = TREE_CHAIN (args_lo);
>       args_hi = TREE_CHAIN (args_hi);
>       argno += 1;
>     }
>   gcc_assert (args_lo == void_list_node && args_hi == void_list_node);
> 
>   if (num_var == 0)
>     return nullptr;
> 
>   for (auto i : const_argnos)
>     new_args[i] = aarch64_self_concat_vec_cst (new_args[i]);
> 
>   auto *new_stmt = gimple_build_call_vec (builtin_hi, new_args);
>   gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
>   return new_stmt;
> }
> ----------------------------------------------------------------------
> 
> This seems to pass the tests in the patch.  It doesn't address my
> comments above, but one way of handling those would be to record the
> required function type in the main loop and create a VIEW_CONVERT_EXPR
> after the "for (auto i : const_argnos)" loop.
> 
> Thanks,
> Richard

Thanks for all this, I really appreciate the detail.

I shied away from the above as I knew NARGS was at most 3, but it certainly
scales better as a solution (thinking about _lane builtins), so perhaps it's
the way to go.

Let me see if I can tie your suggestions together.
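
For the VIEW_CONVERT_EXPR part, I'm imagining something like the
sketch below: the main loop would record (argno, required type) pairs
in a hypothetical vce_args list whenever the replacement argument's
vector type differs from the one the hi builtin expects, and then,
after the "for (auto i : const_argnos)" loop:

----------------------------------------------------------------------
  /* GIMPLE call arguments must be plain values, so emit a separate
     VIEW_CONVERT_EXPR statement before the call for each argument
     whose type needs changing.  Assumes the fold still has access to
     the statement iterator GSI.  */
  for (const auto &p : vce_args)
    {
      tree type = p.second;
      tree converted = make_ssa_name (type);
      gassign *conv
        = gimple_build_assign (converted,
                               build1 (VIEW_CONVERT_EXPR, type,
                                       new_args[p.first]));
      gsi_insert_before (gsi, conv, GSI_SAME_STMT);
      new_args[p.first] = converted;
    }
----------------------------------------------------------------------

(Completely untested, just to check I've understood the suggestion.)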

Spencer
