Thanks, this generally looks really good.  Some comments on top of
Kyrill's, and Christophe's internal comment about -save-temps.

Spencer Abson <spencer.ab...@arm.com> writes:
> +/* Build and return a new VECTOR_CST that is the concatenation of
> +   VEC_IN with itself.  */
> +static tree
> +aarch64_self_concat_vec_cst (tree vec_in)
> +{
> +  gcc_assert ((TREE_CODE (vec_in) == VECTOR_CST));
> +  unsigned HOST_WIDE_INT nelts
> +    = VECTOR_CST_NELTS (vec_in).to_constant ();
> +
> +  tree out_type = build_vector_type (TREE_TYPE (TREE_TYPE (vec_in)),
> +                                  nelts * 2);

It would be good to pass in the type that the caller wants.
More about that below.

> +
> +  /* Avoid decoding/encoding if the encoding won't change.  */
> +  if (VECTOR_CST_DUPLICATE_P (vec_in))
> +    {
> +      tree vec_out = make_vector (exact_log2
> +                              (VECTOR_CST_NPATTERNS (vec_in)), 1);
> +      unsigned int encoded_size
> +     = vector_cst_encoded_nelts (vec_in) * sizeof (tree);
> +
> +      memcpy (VECTOR_CST_ENCODED_ELTS (vec_out),
> +           VECTOR_CST_ENCODED_ELTS (vec_in), encoded_size);
> +
> +      TREE_TYPE (vec_out) = out_type;
> +      return vec_out;
> +    }

I'm not sure this is worth it.  The approach below shouldn't be that
much less efficient, since all the temporaries are generally on the
stack.  Also:

> +
> +  tree_vector_builder vec_out (out_type, nelts, 1);

This call already describes the result as a duplicated sequence of
NELTS elements, so...

> +  for (unsigned i = 0; i < nelts * 2; i++)
> +    vec_out.quick_push (VECTOR_CST_ELT (vec_in, i % nelts));

...it should only be necessary to push nelts elements here.
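
Concretely, and folding in the earlier comment about passing in the
type the caller wants, an untested sketch of the simplified helper
might look like this (the OUT_TYPE parameter is my addition):

----------------------------------------------------------------------
/* Build and return a VECTOR_CST of type OUT_TYPE that is the
   concatenation of VEC_IN with itself.  */
static tree
aarch64_self_concat_vec_cst (tree out_type, tree vec_in)
{
  gcc_assert (TREE_CODE (vec_in) == VECTOR_CST);
  unsigned HOST_WIDE_INT nelts
    = VECTOR_CST_NELTS (vec_in).to_constant ();

  /* NELTS patterns of one element each encode a vector that repeats
     every NELTS elements, so pushing the elements of VEC_IN once is
     enough to describe the self-concatenation.  */
  tree_vector_builder vec_out (out_type, nelts, 1);
  for (unsigned HOST_WIDE_INT i = 0; i < nelts; i++)
    vec_out.quick_push (VECTOR_CST_ELT (vec_in, i));

  return vec_out.build ();
}
----------------------------------------------------------------------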

> +
> +  return vec_out.build ();
> +}
> +
> +/* If the SSA_NAME_DEF_STMT of ARG is an assignment to a
> +   BIT_FIELD_REF with SIZE and OFFSET, return the object of the
> +   BIT_FIELD_REF.  Otherwise, return NULL_TREE.  */
> +static tree
> +aarch64_object_of_bfr (tree arg, unsigned HOST_WIDE_INT size,
> +                    unsigned HOST_WIDE_INT offset)
> +{
> +  if (TREE_CODE (arg) != SSA_NAME)
> +    return NULL_TREE;
> +
> +  gassign *stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (arg));
> +
> +  if (!stmt)
> +    return NULL_TREE;
> +
> +  if (gimple_assign_rhs_code (stmt) != BIT_FIELD_REF)
> +    return NULL_TREE;
> +
> +  tree bf_ref = gimple_assign_rhs1 (stmt);
> +
> +  if (bit_field_size (bf_ref).to_constant () != size
> +      || bit_field_offset (bf_ref).to_constant () != offset)
> +    return NULL_TREE;
> +
> +  return TREE_OPERAND (bf_ref, 0);

I think this also needs to check that operand 0 of the BIT_FIELD_REF
is a 128-bit vector.  A 64-bit reference at offset 64 could instead
be into something else, such as a 256-bit vector.

An example is:

----------------------------------------------------------------------
#include <arm_neon.h>

typedef int16_t int16x16_t __attribute__((vector_size(32)));

int32x4_t
f (int16x16_t foo)
{
  return vmovl_s16 ((int16x4_t) { foo[4], foo[5], foo[6], foo[7] });
}
----------------------------------------------------------------------

which triggers an ICE.

Even if the argument is a 128-bit vector, it could be a 128-bit
vector of a different type, such as in:

----------------------------------------------------------------------
#include <arm_neon.h>

int32x4_t
f (int32x4_t foo)
{
  return vmovl_s16 (vget_high_s16 (vreinterpretq_s16_s32 (foo)));
}
----------------------------------------------------------------------

I think we should still accept this second case, but emit a VIEW_CONVERT_EXPR
before the call to convert the argument to the right type.
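
For the first of those checks, something along these lines at the end
of the function might be enough (untested sketch; the second case
would additionally need the VIEW_CONVERT_EXPR handling mentioned
above):

----------------------------------------------------------------------
  tree vec = TREE_OPERAND (bf_ref, 0);
  tree vec_type = TREE_TYPE (vec);

  /* Only accept references into 128-bit vectors; a 64-bit reference
     at this offset could instead be into e.g. a 256-bit vector.  */
  if (!VECTOR_TYPE_P (vec_type)
      || wi::to_widest (TYPE_SIZE (vec_type)) != 128)
    return NULL_TREE;

  return vec;
----------------------------------------------------------------------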

> +}
> +
> +/*  Prefer to use the highpart builtin when:
> +
> +    1) All lowpart arguments are references to the highparts of other
> +    vectors.
> +
> +    2) For calls with two lowpart arguments, if either refers to a
> +    vector highpart and the other is a VECTOR_CST.  We can copy the
> +    VECTOR_CST to 128b in this case.  */
> +static bool
> +aarch64_fold_lo_call_to_hi (tree arg_0, tree arg_1, tree *out_0,
> +                         tree *out_1)
> +{
> +  /* Punt until as late as possible:
> +
> +     1) By folding away BIT_FIELD_REFs we remove information about the
> +     operands that may be useful to other optimizers.
> +
> +     2) For simplicity, we'd like the expression
> +
> +     x = BIT_FIELD_REF<a, 64, 64>
> +
> +     to imply that A is not a VECTOR_CST.  This assumption is unlikely
> +     to hold before constant propagation/folding.  */
> +  if (!(cfun->curr_properties & PROP_last_full_fold))
> +    return false;
> +
> +  unsigned int offset = BYTES_BIG_ENDIAN ? 0 : 64;
> +
> +  tree hi_arg_0 = aarch64_object_of_bfr (arg_0, 64, offset);
> +  tree hi_arg_1 = aarch64_object_of_bfr (arg_1, 64, offset);
> +  if (!hi_arg_0)
> +    {
> +      if (!hi_arg_1 || TREE_CODE (arg_0) != VECTOR_CST)
> +     return false;
> +      hi_arg_0 = aarch64_self_concat_vec_cst (arg_0);
> +    }
> +  else if (!hi_arg_1)
> +    {
> +      if (TREE_CODE (arg_1) != VECTOR_CST)
> +     return false;
> +      hi_arg_1 = aarch64_self_concat_vec_cst (arg_1);
> +    }
> +
> +  *out_0 = hi_arg_0;
> +  *out_1 = hi_arg_1;
> +  return true;
> +}
> +
> +static bool
> +aarch64_fold_lo_call_to_hi (tree arg_in, tree *out)
> +{
> +  if (!(cfun->curr_properties & PROP_last_full_fold))
> +    return false;
> +  unsigned int offset = BYTES_BIG_ENDIAN ? 0 : 64;
> +
> +  tree hi_arg = aarch64_object_of_bfr (arg_in, 64, offset);
> +  if (!hi_arg)
> +    return false;
> +
> +  *out = hi_arg;
> +  return true;
> +}
> +
> +#undef LO_HI_PAIR
> +#define LO_HI_PAIR(A, B) case AARCH64_SIMD_BUILTIN_##A:
> +
>  /* Try to fold STMT, given that it's a call to the built-in function with
>     subcode FCODE.  Return the new statement on success and null on
>     failure.  */
> @@ -5168,6 +5322,84 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt,
>           }
>         break;
>       }
> +      break;
> +    UNOP_LONG_LH_PAIRS
> +      {
> +     tree builtin_hi = aarch64_get_highpart_builtin (fcode);
> +     gcc_assert (nargs == 1 && builtin_hi != NULL_TREE);
> +
> +     tree hi_arg;
> +     if (!aarch64_fold_lo_call_to_hi (args[0], &hi_arg))
> +       break;
> +     new_stmt = gimple_build_call (builtin_hi, 1, hi_arg);
> +     gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
> +      }
> +    break;
> +    BINOP_LONG_LH_PAIRS
> +      {
> +     tree builtin_hi = aarch64_get_highpart_builtin (fcode);
> +     gcc_assert (nargs == 2 && builtin_hi != NULL_TREE);
> +
> +     tree hi_arg_0, hi_arg_1;
> +     if (!aarch64_fold_lo_call_to_hi (args[0], args[1], &hi_arg_0,
> +                                      &hi_arg_1))
> +       break;
> +     new_stmt = gimple_build_call (builtin_hi, 2,
> +                                   hi_arg_0, hi_arg_1);
> +     gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
> +      }
> +    break;
> +    BINOP_LONG_N_LH_PAIRS
> +      {
> +     tree builtin_hi = aarch64_get_highpart_builtin (fcode);
> +     gcc_assert (nargs == 2 && builtin_hi != NULL_TREE);
> +
> +     tree hi_arg;
> +     if (!aarch64_fold_lo_call_to_hi (args[0], &hi_arg))
> +       break;
> +     new_stmt = gimple_build_call (builtin_hi, 2, hi_arg, args[1]);
> +     gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
> +      }
> +    break;
> +    TERNOP_LONG_LH_PAIRS
> +      {
> +     tree builtin_hi = aarch64_get_highpart_builtin (fcode);
> +     gcc_assert (nargs == 3 && builtin_hi != NULL_TREE);
> +
> +     tree hi_arg_0, hi_arg_1;
> +     if (!aarch64_fold_lo_call_to_hi (args[1], args[2], &hi_arg_0,
> +                                      &hi_arg_1))
> +       break;
> +     new_stmt = gimple_build_call (builtin_hi, 3, args[0],
> +                                   hi_arg_0, hi_arg_1);
> +     gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
> +      }
> +    break;
> +    TERNOP_LONG_N_LH_PAIRS
> +      {
> +     tree builtin_hi = aarch64_get_highpart_builtin (fcode);
> +     gcc_assert (nargs == 3 && builtin_hi != NULL_TREE);
> +
> +     tree hi_arg;
> +     if (!aarch64_fold_lo_call_to_hi (args[1], &hi_arg))
> +       break;
> +     new_stmt = gimple_build_call (builtin_hi, 3, args[0], hi_arg,
> +                                   args[2]);
> +     gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
> +      }
> +    break;
> +    BINOP_WIDE_LH_PAIRS
> +      {
> +     tree builtin_hi = aarch64_get_highpart_builtin (fcode);
> +     gcc_assert (nargs == 2 && builtin_hi != NULL_TREE);
> +
> +     tree hi_arg;
> +     if (!aarch64_fold_lo_call_to_hi (args[1], &hi_arg))
> +       break;
> +     new_stmt = gimple_build_call (builtin_hi, 2, args[0], hi_arg);
> +     gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
> +      }
> +    break;

I was wondering if, rather than have separate case blocks for each
style of function signature, we could instead derive the mapping
programmatically.  For example (feel free to rework):

----------------------------------------------------------------------
static gcall *
aarch64_fold_lo_call_to_hi (unsigned int fcode, gcall *stmt)
{
  /* Punt until as late as possible:

     1) By folding away BIT_FIELD_REFs we remove information about the
     operands that may be useful to other optimizers.

     2) For simplicity, we'd like the expression

        x = BIT_FIELD_REF<a, 64, 64>

     to imply that A is not a VECTOR_CST.  This assumption is unlikely
     to hold before constant propagation/folding.  */
  if (!(cfun->curr_properties & PROP_last_full_fold))
    return nullptr;

  unsigned int offset = BYTES_BIG_ENDIAN ? 0 : 64;

  /* The arguments of the builtin pair differ only in the lo and hi arguments.
     Find which arguments those are and build up a new list of arguments for
     the hi builtin.  Defer converting 64-bit constants into 128-bit constants
     until we know that we want to go ahead with the change.  */
  tree builtin_hi = aarch64_get_highpart_builtin (fcode);
  tree args_lo = TYPE_ARG_TYPES (gimple_call_fntype (stmt));
  tree args_hi = TYPE_ARG_TYPES (TREE_TYPE (builtin_hi));
  unsigned int argno = 0;
  unsigned int num_var = 0;
  auto_vec<tree, 8> new_args;
  auto_vec<unsigned int, 2> const_argnos;
  while (args_lo != void_list_node && args_hi != void_list_node)
    {
      tree arg_lo = TREE_VALUE (args_lo);
      tree arg_hi = TREE_VALUE (args_hi);
      tree new_arg = gimple_call_arg (stmt, argno);
      if (!types_compatible_p (arg_lo, arg_hi))
        {
          gcc_assert (wi::to_widest (TYPE_SIZE (arg_lo)) == 64
                      && wi::to_widest (TYPE_SIZE (arg_hi)) == 128);
          if (tree hi_arg = aarch64_object_of_bfr (new_arg, 64, offset))
            {
              num_var += 1;
              new_arg = hi_arg;
            }
          else if (TREE_CODE (new_arg) == VECTOR_CST)
            const_argnos.safe_push (argno);
          else
            return nullptr;
        }
      new_args.safe_push (new_arg);
      args_lo = TREE_CHAIN (args_lo);
      args_hi = TREE_CHAIN (args_hi);
      argno += 1;
    }
  gcc_assert (args_lo == void_list_node && args_hi == void_list_node);

  if (num_var == 0)
    return nullptr;

  for (auto i : const_argnos)
    new_args[i] = aarch64_self_concat_vec_cst (new_args[i]);

  auto *new_stmt = gimple_build_call_vec (builtin_hi, new_args);
  gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
  return new_stmt;
}
----------------------------------------------------------------------

This seems to pass the tests in the patch.  It doesn't address my
comments above, but one way of handling those would be to record the
required function type in the main loop and create a VIEW_CONVERT_EXPR
after the "for (auto i : const_argnos)" loop.

Thanks,
Richard
