On Fri, 4 Aug 2023 at 20:36, Richard Sandiford
<richard.sandif...@arm.com> wrote:
>
> Full review this time, sorry for the skipping the tests earlier.
Thanks for the detailed review! Please find my responses inline below.
>
> Prathamesh Kulkarni <prathamesh.kulka...@linaro.org> writes:
> > diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
> > index 7e5494dfd39..680d0e54fd4 100644
> > --- a/gcc/fold-const.cc
> > +++ b/gcc/fold-const.cc
> > @@ -85,6 +85,10 @@ along with GCC; see the file COPYING3.  If not see
> >  #include "vec-perm-indices.h"
> >  #include "asan.h"
> >  #include "gimple-range.h"
> > +#include <algorithm>
>
> This should be included by defining INCLUDE_ALGORITHM instead.
Done. Just curious, why do we use this macro instead of directly
including <algorithm> ?
>
> > +#include "tree-pretty-print.h"
> > +#include "gimple-pretty-print.h"
> > +#include "print-tree.h"
>
> Are these still needed, or were they for debugging?
Just for debugging, removed.
>
> >
> >  /* Nonzero if we are folding constants inside an initializer or a C++
> >     manifestly-constant-evaluated context; zero otherwise.
> > @@ -10494,15 +10498,9 @@ fold_mult_zconjz (location_t loc, tree type, tree 
> > expr)
> >  static bool
> >  vec_cst_ctor_to_array (tree arg, unsigned int nelts, tree *elts)
> >  {
> > -  unsigned HOST_WIDE_INT i, nunits;
> > +  unsigned HOST_WIDE_INT i;
> >
> > -  if (TREE_CODE (arg) == VECTOR_CST
> > -      && VECTOR_CST_NELTS (arg).is_constant (&nunits))
> > -    {
> > -      for (i = 0; i < nunits; ++i)
> > -     elts[i] = VECTOR_CST_ELT (arg, i);
> > -    }
> > -  else if (TREE_CODE (arg) == CONSTRUCTOR)
> > +  if (TREE_CODE (arg) == CONSTRUCTOR)
> >      {
> >        constructor_elt *elt;
> >
> > @@ -10520,6 +10518,192 @@ vec_cst_ctor_to_array (tree arg, unsigned int 
> > nelts, tree *elts)
> >    return true;
> >  }
> >
> > +/* Helper routine for fold_vec_perm_cst to check if SEL is a suitable
> > +   mask for VLA vec_perm folding.
> > +   REASON if specified, will contain the reason why SEL is not suitable.
> > +   Used only for debugging and unit-testing.
> > +   VERBOSE if enabled is used for debugging output.  */
> > +
> > +static bool
> > +valid_mask_for_fold_vec_perm_cst_p (tree arg0, tree arg1,
> > +                                 const vec_perm_indices &sel,
> > +                                 const char **reason = NULL,
> > +                                 ATTRIBUTE_UNUSED bool verbose = false)
>
> Since verbose is no longer needed (good!), I think we should just remove it.
Done.
>
> > +{
> > +  unsigned sel_npatterns = sel.encoding ().npatterns ();
> > +  unsigned sel_nelts_per_pattern = sel.encoding ().nelts_per_pattern ();
> > +
> > +  if (!(pow2p_hwi (sel_npatterns)
> > +     && pow2p_hwi (VECTOR_CST_NPATTERNS (arg0))
> > +     && pow2p_hwi (VECTOR_CST_NPATTERNS (arg1))))
> > +    {
> > +      if (reason)
> > +     *reason = "npatterns is not power of 2";
> > +      return false;
> > +    }
> > +
> > +  /* We want to avoid cases where sel.length is not a multiple of 
> > npatterns.
> > +     For eg: sel.length = 2 + 2x, and sel npatterns = 4.  */
> > +  poly_uint64 esel;
> > +  if (!multiple_p (sel.length (), sel_npatterns, &esel))
> > +    {
> > +      if (reason)
> > +     *reason = "sel.length is not multiple of sel_npatterns";
> > +      return false;
> > +    }
> > +
> > +  if (sel_nelts_per_pattern < 3)
> > +    return true;
> > +
> > +  for (unsigned pattern = 0; pattern < sel_npatterns; pattern++)
> > +    {
> > +      poly_uint64 a1 = sel[pattern + sel_npatterns];
> > +      poly_uint64 a2 = sel[pattern + 2 * sel_npatterns];
> > +      HOST_WIDE_INT S;
>
> Trailing whitespace.  The convention is to use lowercase variable
> names, so please call this "step".
Fixed, thanks.
>
> > +      if (!poly_int64 (a2 - a1).is_constant (&S))
> > +     {
> > +       if (reason)
> > +         *reason = "step is not constant";
> > +       return false;
> > +     }
> > +      // FIXME: Punt on S < 0 for now, revisit later.
> > +      if (S < 0)
> > +     return false;
> > +      if (S == 0)
> > +     continue;
> > +
> > +      if (!pow2p_hwi (S))
> > +     {
> > +       if (reason)
> > +         *reason = "step is not power of 2";
> > +       return false;
> > +     }
> > +
> > +      /* Ensure that stepped sequence of the pattern selects elements
> > +      only from the same input vector if it's VLA.  */
>
> s/ if it's VLA//
Oops sorry, that was a relic of something else I was trying :)
Fixed, thanks.
>
> > +      uint64_t q1, qe;
> > +      poly_uint64 r1, re;
> > +      poly_uint64 ae = a1 + (esel - 2) * S;
> > +      poly_uint64 arg_len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
> > +
> > +      if (!(can_div_trunc_p (a1, arg_len, &q1, &r1)
> > +         && can_div_trunc_p (ae, arg_len, &qe, &re)
> > +         && q1 == qe))
> > +     {
> > +       if (reason)
> > +         *reason = "crossed input vectors";
> > +       return false;
> > +     }
> > +
>
> Probably worth a comment above the following code too:
>
>   /* Ensure that the stepped sequence always selects from the same
>      input pattern.  */
Done.
>
> > +      unsigned arg_npatterns
> > +     = ((q1 & 0) == 0) ? VECTOR_CST_NPATTERNS (arg0)
> > +                       : VECTOR_CST_NPATTERNS (arg1);
> > +
> > +      if (!multiple_p (S, arg_npatterns))
> > +     {
> > +       if (reason)
> > +         *reason = "S is not multiple of npatterns";
> > +       return false;
> > +     }
> > +    }
> > +
> > +  return true;
> > +}
> > +
> > +/* Try to fold permutation of ARG0 and ARG1 with SEL selector when
> > +   the input vectors are VECTOR_CST. Return NULL_TREE otherwise.
> > +   REASON and VERBOSE have same purpose as described in
> > +   valid_mask_for_fold_vec_perm_cst_p.
> > +
> > +   (1) If SEL is a suitable mask as determined by
> > +       valid_mask_for_fold_vec_perm_cst_p, then:
> > +       res_npatterns = max of npatterns between ARG0, ARG1, and SEL
> > +       res_nelts_per_pattern = max of nelts_per_pattern between
> > +                            ARG0, ARG1 and SEL.
> > +   (2) If SEL is not a suitable mask, and ARG0, ARG1 are VLS,
> > +       then:
> > +       res_npatterns = nelts in input vector.
>
> s/input vector/result vector/
Fixed, thanks.
>
> > +       res_nelts_per_pattern = 1.
> > +       This exception is made so that VLS ARG0, ARG1 and SEL work as 
> > before.  */
>
> Guess this is personal preference, but (1) and (2) seem more like
> implementation details, so I think they belong...
>
> > +
> > +static tree
> > +fold_vec_perm_cst (tree type, tree arg0, tree arg1, const vec_perm_indices 
> > &sel,
> > +                const char **reason = NULL, bool verbose = false)
> > +{
> > +  unsigned res_npatterns, res_nelts_per_pattern;
> > +  unsigned HOST_WIDE_INT res_nelts;
> > +
>
> ...here instead.
Done.
>
> > +  if (valid_mask_for_fold_vec_perm_cst_p (arg0, arg1, sel, reason, 
> > verbose))
> > +    {
> > +      res_npatterns
> > +     = std::max (VECTOR_CST_NPATTERNS (arg0),
> > +                 std::max (VECTOR_CST_NPATTERNS (arg1),
> > +                           sel.encoding ().npatterns ()));
> > +
> > +      res_nelts_per_pattern
> > +     = std::max (VECTOR_CST_NELTS_PER_PATTERN (arg0),
> > +                 std::max (VECTOR_CST_NELTS_PER_PATTERN (arg1),
> > +                           sel.encoding ().nelts_per_pattern ()));
> > +
> > +      res_nelts = res_npatterns * res_nelts_per_pattern;
> > +    }
> > +  else if (TYPE_VECTOR_SUBPARTS (type).is_constant (&res_nelts))
> > +    {
> > +      res_npatterns = res_nelts;
> > +      res_nelts_per_pattern = 1;
> > +    }
> > +  else
> > +    return NULL_TREE;
> > +
> > +  tree_vector_builder out_elts (type, res_npatterns, 
> > res_nelts_per_pattern);
> > +  for (unsigned i = 0; i < res_nelts; i++)
> > +    {
> > +      poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
> > +      uint64_t q;
> > +      poly_uint64 r;
> > +      unsigned HOST_WIDE_INT index;
> > +
> > +      unsigned HOST_WIDE_INT arg_nelts;
> > +      if (TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)).is_constant (&arg_nelts)
> > +       && known_ge (sel[i], poly_int64 (2 * arg_nelts)))
> > +     {
> > +       if (reason)
> > +         *reason = "out of bounds access";
> > +       return NULL_TREE;
> > +     }
>
> I don't think this is needed.  The selector indices wrap, and the code
> below should handle the wrapping correctly.
Removed, thanks.
>
> > +
> > +      /* Punt if sel[i] /trunc_div len cannot be determined,
> > +      because the input vector to be chosen will depend on
> > +      runtime vector length.
> > +      For example if len == 4 + 4x, and sel[i] == 4,
> > +      If len at runtime equals 4, we choose arg1[0].
> > +      For any other value of len > 4 at runtime, we choose arg0[4].
> > +      which makes the element choice dependent on runtime vector length.  
> > */
> > +      if (!can_div_trunc_p (sel[i], len, &q, &r))
> > +     {
> > +       if (reason)
> > +         *reason = "cannot divide selector element by arg len";
> > +       return NULL_TREE;
> > +     }
> > +
> > +      /* sel[i] % len will give the index of element in the chosen input
> > +      vector. For example if sel[i] == 5 + 4x and len == 4 + 4x,
> > +      we will choose arg1[1] since (5 + 4x) % (4 + 4x) == 1.  */
> > +      if (!r.is_constant (&index))
> > +     {
> > +       if (reason)
> > +         *reason = "remainder is not constant";
> > +       return NULL_TREE;
> > +     }
> > +
> > +      tree arg = ((q & 1) == 0) ? arg0 : arg1;
> > +      tree elem = vector_cst_elt (arg, index);
> > +      out_elts.quick_push (elem);
> > +    }
> > +
> > +  return out_elts.build ();
> > +}
> > +
> >  /* Attempt to fold vector permutation of ARG0 and ARG1 vectors using SEL
> >     selector.  Return the folded VECTOR_CST or CONSTRUCTOR if successful,
> >     NULL_TREE otherwise.  */
> > @@ -10529,43 +10713,40 @@ fold_vec_perm (tree type, tree arg0, tree arg1, 
> > const vec_perm_indices &sel)
> >  {
> >    unsigned int i;
> >    unsigned HOST_WIDE_INT nelts;
> > -  bool need_ctor = false;
> >
> > -  if (!sel.length ().is_constant (&nelts))
> > -    return NULL_TREE;
> > -  gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (type), nelts)
> > -           && known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)), nelts)
> > -           && known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg1)), nelts));
> > +  gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (type), sel.length ())
> > +           && known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)),
> > +                        TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg1))));
> > +
> >    if (TREE_TYPE (TREE_TYPE (arg0)) != TREE_TYPE (type)
> >        || TREE_TYPE (TREE_TYPE (arg1)) != TREE_TYPE (type))
> >      return NULL_TREE;
> >
> > +  if (TREE_CODE (arg0) == VECTOR_CST
> > +      && TREE_CODE (arg1) == VECTOR_CST)
> > +    return fold_vec_perm_cst (type, arg0, arg1, sel);
> > +
> > +  /* For fall back case, we want to ensure we have VLS vectors
> > +     with equal length.  */
> > +  if (!sel.length ().is_constant (&nelts))
> > +    return NULL_TREE;
> > +
> > +  gcc_assert (known_eq (sel.length (), TYPE_VECTOR_SUBPARTS (TREE_TYPE 
> > (arg0))));
>
> Nit: long line.
Fixed, thanks.
>
> >    tree *in_elts = XALLOCAVEC (tree, nelts * 2);
> >    if (!vec_cst_ctor_to_array (arg0, nelts, in_elts)
> >        || !vec_cst_ctor_to_array (arg1, nelts, in_elts + nelts))
> >      return NULL_TREE;
> >
> > -  tree_vector_builder out_elts (type, nelts, 1);
> > +  vec<constructor_elt, va_gc> *v;
> > +  vec_alloc (v, nelts);
> >    for (i = 0; i < nelts; i++)
> >      {
> >        HOST_WIDE_INT index;
> >        if (!sel[i].is_constant (&index))
> >       return NULL_TREE;
> > -      if (!CONSTANT_CLASS_P (in_elts[index]))
> > -     need_ctor = true;
> > -      out_elts.quick_push (unshare_expr (in_elts[index]));
> > -    }
> > -
> > -  if (need_ctor)
> > -    {
> > -      vec<constructor_elt, va_gc> *v;
> > -      vec_alloc (v, nelts);
> > -      for (i = 0; i < nelts; i++)
> > -     CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, out_elts[i]);
> > -      return build_constructor (type, v);
> > +      CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, in_elts[index]);
> >      }
> > -  else
> > -    return out_elts.build ();
> > +  return build_constructor (type, v);
> >  }
> >
> >  /* Try to fold a pointer difference of type TYPE two address expressions of
> > @@ -16892,6 +17073,508 @@ test_arithmetic_folding ()
> >                                  x);
> >  }
> >
> > +namespace test_fold_vec_perm_cst {
> > +
> > +static tree
> > +get_preferred_vectype (tree inner_type)
> > +{
> > +  scalar_int_mode int_mode = SCALAR_INT_TYPE_MODE (inner_type);
> > +  machine_mode vmode = targetm.vectorize.preferred_simd_mode (int_mode);
> > +  poly_uint64 nunits = GET_MODE_NUNITS (vmode);
> > +  return build_vector_type (inner_type, nunits);
> > +}
> > +
> > +static tree
> > +build_vec_cst_rand (tree inner_type, unsigned npatterns,
> > +                 unsigned nelts_per_pattern, int S = 0,
>
> Similar comment about lowercase variable names here.
>
> > +                 tree vectype = NULL_TREE)
> > +{
> > +  if (!vectype)
> > +    vectype = get_preferred_vectype (inner_type);
>
> I'm not sure how portable this is.  It looks like the tests rely on
> the integer_type_node vectors being 4 + 4x, but that isn't necessarily
> true on all VLA targets.
>
> Perhaps instead the tests could be classified based on the vector
> lengths that they assume.  Then we can iterate through the vector
> modes and call the appropriate function based on GET_MODE_NUNITS
> and GET_MODE_INNER.
I tried this approach in the attached patch.
Does it look OK ?
>
> > +  tree_vector_builder builder (vectype, npatterns, nelts_per_pattern);
> > +
> > +  // Fill a0 for each pattern
> > +  for (unsigned i = 0; i < npatterns; i++)
> > +    builder.quick_push (build_int_cst (inner_type, rand () % 100));
> > +
> > +  if (nelts_per_pattern == 1)
> > +    return builder.build ();
> > +
> > +  // Fill a1 for each pattern
> > +  for (unsigned i = 0; i < npatterns; i++)
> > +    builder.quick_push (build_int_cst (inner_type, rand () % 100));
> > +
> > +  if (nelts_per_pattern == 2)
> > +    return builder.build ();
> > +
> > +  for (unsigned i = npatterns * 2; i < npatterns * nelts_per_pattern; i++)
> > +    {
> > +      tree prev_elem = builder[i - npatterns];
> > +      int prev_elem_val = TREE_INT_CST_LOW (prev_elem);
> > +      int val = prev_elem_val + S;
> > +      builder.quick_push (build_int_cst (inner_type, val));
> > +    }
> > +
> > +  return builder.build ();
> > +}
> > +
> > +static void
> > +validate_res (unsigned npatterns, unsigned nelts_per_pattern,
> > +           tree res, tree *expected_res)
> > +{
> > +  ASSERT_TRUE (VECTOR_CST_NPATTERNS (res) == npatterns);
> > +  ASSERT_TRUE (VECTOR_CST_NELTS_PER_PATTERN (res) == nelts_per_pattern);
>
> I don't think this is safe when the inputs are randomised.  E.g. we
> could by chance end up with a vector of all zeros, which would have
> a single pattern and a single element per pattern, regardless of the
> shapes of the inputs.
>
> Given the way that vector_builder<T, Shape, Derived>::finalize
> canonicalises the encoding, it should be safe to use:
>
> * VECTOR_CST_NPATTERNS (res) <= npatterns
> * vector_cst_encoded_nelts (res) <= npatterns * nelts_per_pattern
>
> If we do that then...
>
> > +
> > +  for (unsigned i = 0; i < vector_cst_encoded_nelts (res); i++)
>
> ...this loop bound should be npatterns * nelts_per_pattern instead.
Ah indeed. Fixed, thanks.
>
> > +    ASSERT_TRUE (operand_equal_p (VECTOR_CST_ELT (res, i), 
> > expected_res[i], 0));
> > +}
> > +
> > +static void
> > +validate_res_vls (tree res, tree *expected_res, unsigned expected_nelts)
> > +{
> > +  ASSERT_TRUE (known_eq (VECTOR_CST_NELTS (res), expected_nelts));
> > +  for (unsigned i = 0; i < expected_nelts; i++)
> > +    ASSERT_TRUE (operand_equal_p (VECTOR_CST_ELT (res, i), 
> > expected_res[i], 0));
> > +}
> > +
> > +/* Verify VLA vec_perm folding.  */
> > +
> > +static void
> > +test_stepped ()
> > +{
> > +  /* Case 1: sel = {0, 1, 2, ...}
> > +     npatterns = 1, nelts_per_pattern = 3
> > +     expected res: { arg0[0], arg0[1], arg0[2], ... } */
> > +  {
> > +    tree arg0 = build_vec_cst_rand (integer_type_node, 1, 3, 2);
> > +    tree arg1 = build_vec_cst_rand (integer_type_node, 1, 3, 2);
> > +    poly_uint64 arg0_len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
> > +
> > +    vec_perm_builder builder (arg0_len, 1, 3);
> > +    builder.quick_push (0);
> > +    builder.quick_push (1);
> > +    builder.quick_push (2);
> > +
> > +    vec_perm_indices sel (builder, 2, arg0_len);
> > +    tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel);
> > +    tree expected_res[] = { vector_cst_elt (arg0, 0), vector_cst_elt 
> > (arg0, 1),
> > +                         vector_cst_elt (arg0, 2) };
> > +    validate_res (1, 3, res, expected_res);
> > +  }
> > +
> > +  /* Case 2: sel = {len, len + 1, len + 2, ... }
> > +     npatterns = 1, nelts_per_pattern = 3
> > +     FIXME: This should return
> > +     expected res: { op1[0], op1[1], op1[2], ... }
> > +     however it returns NULL_TREE.  */
>
> Looks like the comment is out of date.
Fixed, thanks.
>
> > +  {
> > +    tree arg0 = build_vec_cst_rand (integer_type_node, 1, 3, 2);
> > +    tree arg1 = build_vec_cst_rand (integer_type_node, 1, 3, 2);
> > +    poly_uint64 arg0_len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
> > +
> > +    vec_perm_builder builder (arg0_len, 1, 3);
> > +    builder.quick_push (arg0_len);
> > +    builder.quick_push (arg0_len + 1);
> > +    builder.quick_push (arg0_len + 2);
> > +
> > +    vec_perm_indices sel (builder, 2, arg0_len);
> > +    tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel, NULL, 
> > true);
> > +    tree expected_res[] = { vector_cst_elt (arg1, 0), vector_cst_elt 
> > (arg1, 1),
> > +                         vector_cst_elt (arg1, 2) };
> > +    validate_res (1, 3, res, expected_res);
> > +  }
> > +
> > +  /* Case 3: Leading element of arg1, stepped sequence: pattern 0 of arg0.
> > +     sel = {len, 0, 0, 0, 2, 0, ...}
> > +     npatterns = 2, nelts_per_pattern = 3.
> > +     Use extra pattern {0, ...} to lower number of elements per pattern.  
> > */
> > +  {
> > +    tree arg0 = build_vec_cst_rand (char_type_node, 1, 3, 2);
> > +    tree arg1 = build_vec_cst_rand (char_type_node, 1, 3, 2);
> > +    poly_uint64 arg0_len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
> > +
> > +    vec_perm_builder builder (arg0_len, 2, 3);
> > +    builder.quick_push (arg0_len);
> > +    int mask_elems[] = { 0, 0, 0, 2, 0 };
> > +    for (int i = 0; i < 5; i++)
> > +      builder.quick_push (mask_elems[i]);
>
> This leaves one of the elements unspecified.
Sorry, I didn't understand.
It first pushes len in:
builder.quick_push (arg0_len)
and then pushes the remaining indices in the loop:
for (int i = 0; i < 5; i++)
  builder.quick_push (mask_elems[i])
So overall, builder will have 6 elements: {len, 0, 0, 0, 2, 0}
>
> > +
> > +    vec_perm_indices sel (builder, 2, arg0_len);
> > +    const char *reason;
> > +    tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel, 
> > &reason);
> > +
> > +    tree expected_res[] = { vector_cst_elt (arg1, 0), vector_cst_elt 
> > (arg0, 0),
> > +                         vector_cst_elt (arg0, 0), vector_cst_elt (arg0, 
> > 0),
> > +                         vector_cst_elt (arg0, 2), vector_cst_elt (arg0, 0)
> > +                       };
> > +    validate_res (2, 3, res, expected_res);
> > +  }
> > +
> > +  /* Case 4:
> > +     sel = { len, 0, 2, ... } npatterns = 1, nelts_per_pattern = 3.
> > +     This should return NULL because we cross the input vectors.
> > +     Because,
> > +     arg0_len = 16 + 16x
> > +     a1 = 0
> > +     S = 2
> > +     esel = arg0_len / npatterns_sel = 16+16x/1 = 16 + 16x
> > +     ae = 0 + (esel - 2) * S
> > +     = 0 + (16 + 16x - 2) * 2
> > +     = 28 + 32x
> > +     a1 / arg0_len = 0 /trunc (16 + 16x) = 0
> > +     ae / arg0_len = (28 + 32x) /trunc (16 + 16x), which is not defined,
> > +     since 28/16 != 32/16.
> > +     So return NULL_TREE.  */
>
> The division should succeed now, so as the test says, the reason should
> instead be that ae is in the second input.
Fixed, thanks.
>
> > +  {
> > +    tree arg0 = build_vec_cst_rand (char_type_node, 1, 3, 2);
> > +    tree arg1 = build_vec_cst_rand (char_type_node, 1, 3, 2);
> > +    poly_uint64 arg0_len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
> > +
> > +    vec_perm_builder builder (arg0_len, 1, 3);
> > +    builder.quick_push (arg0_len);
> > +    builder.quick_push (0);
> > +    builder.quick_push (2);
> > +
> > +    vec_perm_indices sel (builder, 2, arg0_len);
> > +    const char *reason;
> > +    tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel, 
> > &reason, false);
> > +    gcc_assert (res == NULL_TREE);
> > +    gcc_assert (!strcmp (reason, "crossed input vectors"));
>
> The tests should use ASSERT_* macros rather than gcc_assert.
Fixed, thanks.
>
> > +  }
> > +
> > +  /* Case 5: Select elements from different patterns.
> > +     Should return NULL.  */
> > +  {
> > +    tree op0 = build_vec_cst_rand (char_type_node, 2, 3, 2);
> > +    tree op1 = build_vec_cst_rand (char_type_node, 2, 3, 2);
> > +    poly_uint64 op0_len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (op0));
> > +
> > +    vec_perm_builder builder (op0_len, 2, 3);
> > +    builder.quick_push (op0_len);
> > +    int mask_elems[] = { 0, 0, 0, 1, 0 };
> > +    for (int i = 0; i < 5; i++)
> > +      builder.quick_push (mask_elems[i]);
>
> Should be 6 elements here too.
>
> > +
> > +    vec_perm_indices sel (builder, 2, op0_len);
> > +    const char *reason;
> > +    tree res = fold_vec_perm_cst (TREE_TYPE (op0), op0, op1, sel, &reason, 
> > false);
> > +    gcc_assert (res == NULL_TREE);
> > +    gcc_assert (!strcmp (reason, "S is not multiple of npatterns"));
> > +  }
> > +
> > +  /* Case 6: Select pattern 0 of op0 and dup of op0[0]
> > +     op0, op1, sel: npatterns = 2, nelts_per_pattern = 3
> > +     sel = { 0, 0, 2, 0, 4, 0, ... }.
> > +
> > +     For pattern {0, 2, 4, ...}:
> > +     a1 = 2
> > +     len = 16 + 16x
> > +     S = 2
> > +     esel = len / npatterns_sel = (16 + 16x) / 2 = (8 + 8x)
> > +     ae = a1 + (esel - 2) * S
> > +     = 2 + (8 + 8x - 2) * 2
> > +     = 14 + 16x
> > +     a1 / arg0_len = 2 / (16 + 16x) = 0
> > +     ae / arg0_len = (14 + 16x) / (16 + 16x) = 0
> > +     So a1/arg0_len = ae/arg0_len = 0
> > +     Hence we select from first vector op0
> > +     S = 2, npatterns = 2.
> > +     Since S is multiple of npatterns(op0), we are selecting from
> > +     same pattern of op0.
> > +
> > +     For pattern {0, ...}, we are choosing { op0[0] ... }
> > +     So res will be combination of above patterns:
> > +     res: { op0[0], op0[0], op0[2], op0[0], op0[4], op0[0], ... }
> > +     with npatterns = 2, nelts_per_pattern = 3.  */
> > +  {
> > +    tree op0 = build_vec_cst_rand (char_type_node, 2, 3, 2);
> > +    tree op1 = build_vec_cst_rand (char_type_node, 2, 3, 2);
> > +    poly_uint64 op0_len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (op0));
> > +
> > +    vec_perm_builder builder (op0_len, 2, 3);
> > +    int mask_elems[] = { 0, 0, 2, 0, 4, 0 };
> > +    for (int i = 0; i < 6; i++)
> > +      builder.quick_push (mask_elems[i]);
> > +
> > +    vec_perm_indices sel (builder, 2, op0_len);
> > +    tree res = fold_vec_perm_cst (TREE_TYPE (op0), op0, op1, sel);
> > +    tree expected_res[] = { vector_cst_elt (op0, 0), vector_cst_elt (op0, 
> > 0),
> > +                         vector_cst_elt (op0, 2), vector_cst_elt (op0, 0),
> > +                         vector_cst_elt (op0, 4), vector_cst_elt (op0, 0) 
> > };
> > +    validate_res (2, 3, res, expected_res);
> > +  }
> > +
> > +  /* Case 7: sel_npatterns > op_npatterns;
> > +     op0, op1: npatterns = 2, nelts_per_pattern = 3
> > +     sel: { 0, 0, 1, len, 2, 0, 3, len, 4, 0, 5, len, ...},
> > +     with npatterns = 4, nelts_per_pattern = 3.  */
> > +  {
> > +    tree op0 = build_vec_cst_rand (char_type_node, 2, 3, 2);
> > +    tree op1 = build_vec_cst_rand (char_type_node, 2, 3, 2);
> > +    poly_uint64 op0_len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (op0));
> > +
> > +    vec_perm_builder builder(op0_len, 4, 3);
> > +    // -1 is used as place holder for poly_int_cst
> > +    int mask_elems[] = { 0, 0, 1, -1, 2, 0, 3, -1, 4, 0, 5, -1 };
> > +    for (int i = 0; i < 12; i++)
> > +      builder.quick_push ((mask_elems[i] == -1) ? op0_len : mask_elems[i]);
> > +
> > +    vec_perm_indices sel (builder, 2, op0_len);
> > +    tree res = fold_vec_perm_cst (TREE_TYPE (op0), op0, op1, sel);
> > +    tree expected_res[] = { vector_cst_elt (op0, 0), vector_cst_elt (op0, 
> > 0),
> > +                         vector_cst_elt (op0, 1), vector_cst_elt (op1, 0),
> > +                         vector_cst_elt (op0, 2), vector_cst_elt (op0, 0),
> > +                         vector_cst_elt (op0, 3), vector_cst_elt (op1, 0),
> > +                         vector_cst_elt (op0, 4), vector_cst_elt (op0, 0),
> > +                         vector_cst_elt (op0, 5), vector_cst_elt (op1, 0) 
> > };
> > +    validate_res (4, 3, res, expected_res);
> > +  }
> > +}
> > +
> > +static void
> > +test_dup ()
> > +{
> > +  /* Case 1: mask = {0, ...} */
> > +  {
> > +    tree arg0 = build_vec_cst_rand (integer_type_node, 2, 3, 1);
> > +    tree arg1 = build_vec_cst_rand (integer_type_node, 2, 3, 1);
> > +    poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
> > +
> > +    vec_perm_builder builder (len, 1, 1);
> > +    builder.quick_push (0);
> > +    vec_perm_indices sel (builder, 2, len);
> > +    tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel);
> > +
> > +    tree expected_res[] = { vector_cst_elt (res, 0) };
> > +    validate_res (1, 1, res, expected_res);
> > +  }
> > +
> > +  /* Case 2: mask = {len, ...} */
> > +  {
> > +    tree arg0 = build_vec_cst_rand (integer_type_node, 2, 3, 1);
> > +    tree arg1 = build_vec_cst_rand (integer_type_node, 2, 3, 1);
> > +    poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
> > +
> > +    vec_perm_builder builder (len, 1, 1);
> > +    builder.quick_push (len);
> > +    vec_perm_indices sel (builder, 2, len);
> > +    tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel);
> > +
> > +    tree expected_res[] = { vector_cst_elt (arg1, 0) };
> > +    validate_res (1, 1, res, expected_res);
> > +  }
> > +
> > +  /* Case 3: mask = { 0, len, ... } */
> > +  {
> > +    tree arg0 = build_vec_cst_rand (integer_type_node, 2, 3, 1);
> > +    tree arg1 = build_vec_cst_rand (integer_type_node, 2, 3, 1);
> > +    poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
> > +
> > +    vec_perm_builder builder (len, 2, 1);
> > +    builder.quick_push (0);
> > +    builder.quick_push (len);
> > +    vec_perm_indices sel (builder, 2, len);
> > +    tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel);
> > +
> > +    tree expected_res[] = { vector_cst_elt (arg0, 0), vector_cst_elt 
> > (arg1, 0) };
> > +    validate_res (2, 1, res, expected_res);
> > +  }
> > +
> > +  /* Case 4: mask = { 0, len, 1, len+1, ... } */
> > +  {
> > +    tree arg0 = build_vec_cst_rand (integer_type_node, 2, 3, 1);
> > +    tree arg1 = build_vec_cst_rand (integer_type_node, 2, 3, 1);
> > +    poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
> > +
> > +    vec_perm_builder builder (len, 2, 2);
> > +    builder.quick_push (0);
> > +    builder.quick_push (len);
> > +    builder.quick_push (1);
> > +    builder.quick_push (len + 1);
> > +    vec_perm_indices sel (builder, 2, len);
> > +    tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel);
> > +
> > +    tree expected_res[] = { vector_cst_elt (arg0, 0), vector_cst_elt 
> > (arg1, 0),
> > +                         vector_cst_elt (arg0, 1), vector_cst_elt (arg1, 1)
> > +                       };
> > +    validate_res (2, 2, res, expected_res);
> > +  }
> > +
> > +  /* Case 5: mask = { 0, len, 1, len+1, .... }
> > +     npatterns = 4, nelts_per_pattern = 1 */
> > +  {
> > +    tree arg0 = build_vec_cst_rand (integer_type_node, 2, 3, 1);
> > +    tree arg1 = build_vec_cst_rand (integer_type_node, 2, 3, 1);
> > +    poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
> > +
> > +    vec_perm_builder builder (len, 4, 1);
> > +    builder.quick_push (0);
> > +    builder.quick_push (len);
> > +    builder.quick_push (1);
> > +    builder.quick_push (len + 1);
> > +    vec_perm_indices sel (builder, 2, len);
> > +    tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel);
> > +
> > +    tree expected_res[] = { vector_cst_elt (arg0, 0), vector_cst_elt 
> > (arg1, 0),
> > +                         vector_cst_elt (arg0, 1), vector_cst_elt (arg1, 1)
> > +                       };
> > +    validate_res (4, 1, res, expected_res);
> > +  }
> > +
> > +  /* Case 6: mask = {0, 4, ...}
> > +     npatterns = 1, nelts_per_pattern = 2.
> > +     This should return NULL_TREE because the index 4 may choose
> > +     from either arg0 or arg1 depending on vector length.  */
> > +  {
> > +    tree arg0 = build_vec_cst_rand (integer_type_node, 2, 3, 1);
> > +    tree arg1 = build_vec_cst_rand (integer_type_node, 2, 3, 1);
> > +    poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
> > +
> > +    vec_perm_builder builder (len, 1, 2);
> > +    builder.quick_push (0);
> > +    builder.quick_push (4);
> > +    vec_perm_indices sel (builder, 2, len);
> > +    tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel);
> > +    ASSERT_TRUE (res == NULL_TREE);
> > +  }
> > +
> > +  /* Case 7: npatterns(arg0) = 4 > npatterns(sel) = 2
> > +     mask = {0, len, 1, len + 1, ...}
> > +     sel_npatterns = 2, sel_nelts_per_pattern = 2.  */
>
> This is a good test to have, but it doesn't seem to match the
> name of the containing function (test_dup).
Well since the selector has nelts_per_pattern = 2, ie, dup of a1, I
chose to put it in test_dup.
Anyway, the functions are now re-classified based on vector length in the patch.
>
> > +  {
> > +    tree arg0 = build_vec_cst_rand (integer_type_node, 2, 3, 1);
> > +    tree arg1 = build_vec_cst_rand (integer_type_node, 2, 3, 1);
> > +    poly_uint64 arg0_len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
> > +
> > +    vec_perm_builder builder (arg0_len, 2, 2);
> > +    builder.quick_push (0);
> > +    builder.quick_push (arg0_len);
> > +    builder.quick_push (1);
> > +    builder.quick_push (arg0_len + 1);
> > +    vec_perm_indices sel (builder, 2, arg0_len);
> > +    tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel);
> > +
> > +    tree expected_res[] = { vector_cst_elt (arg0, 0), vector_cst_elt 
> > (arg1, 0),
> > +                         vector_cst_elt (arg0, 1), vector_cst_elt (arg1, 1)
> > +                       };
> > +    validate_res (2, 2, res, expected_res);
> > +  }
> > +}
> > +
> > +static void
> > +test_mixed ()
> > +{
> > +  /* Case 1: op0, op1 -> VLS, sel -> VLA and selects from both input 
> > vectors.
> > +     In this case, we treat res_npatterns = nelts in input vector
> > +     and res_nelts_per_pattern = 1, and create a dup pattern.
> > +     sel = { 0, 4, 1, 5, ... }
> > +     res = { op0[0], op1[0], op0[1], op1[1], ...} // (4, 1)
> > +     res_npatterns = 4, res_nelts_per_pattern = 1.  */
> > +  {
> > +    tree arg_vectype = build_vector_type (integer_type_node, 4);
> > +    tree arg0 = build_vec_cst_rand (integer_type_node, 4, 1, 0, 
> > arg_vectype);
> > +    tree arg1 = build_vec_cst_rand (integer_type_node, 4, 1, 0, 
> > arg_vectype);
> > +
> > +    tree res_type = get_preferred_vectype (integer_type_node);
> > +    poly_uint64 res_len = TYPE_VECTOR_SUBPARTS (res_type);
> > +    vec_perm_builder builder (res_len, 4, 1);
> > +    builder.quick_push (0);
> > +    builder.quick_push (4);
> > +    builder.quick_push (1);
> > +    builder.quick_push (5);
> > +
> > +    vec_perm_indices sel (builder, 2, res_len);
> > +    tree res = fold_vec_perm_cst (res_type, arg0, arg1, sel);
> > +    tree expected_res[] = { vector_cst_elt (arg0, 0), vector_cst_elt 
> > (arg1, 0),
> > +                         vector_cst_elt (arg0, 1), vector_cst_elt (arg1, 1)
> > +                       };
> > +    validate_res (4, 1, res, expected_res);
> > +  }
> > +
> > +  /* Case 2: Same as Case 1, but sel contains an out of bounds index.
> > +     result should be NULL_TREE.  */
> > +  {
> > +    tree arg_vectype = build_vector_type (integer_type_node, 4);
> > +    tree arg0 = build_vec_cst_rand (integer_type_node, 4, 1, 0, 
> > arg_vectype);
> > +    tree arg1 = build_vec_cst_rand (integer_type_node, 4, 1, 0, 
> > arg_vectype);
> > +
> > +    tree res_type = get_preferred_vectype (integer_type_node);
> > +    poly_uint64 res_len = TYPE_VECTOR_SUBPARTS (res_type);
> > +    vec_perm_builder builder (res_len, 4, 1);
> > +    builder.quick_push (0);
> > +    builder.quick_push (8);
> > +    builder.quick_push (1);
> > +    builder.quick_push (5);
> > +
> > +    vec_perm_indices sel (builder, 2, res_len);
> > +    const char *reason;
> > +    tree res = fold_vec_perm_cst (res_type, arg0, arg1, sel, &reason);
> > +    gcc_assert (res == NULL_TREE);
> > +    gcc_assert (!strcmp (reason, "out of bounds access"));
> > +  }
> > +
> > +  /* Case 3: op0, op1 are VLA and sel is VLS.
> > +     op0, op1: VNx16QI with shape (2, 3)
> > +     sel = V4SI with values {0, 2, 4, 6}
> > +     res: V4SI with values { op0[0], op0[2], op0[4], op0[6] }.  */
> > +  {
> > +    tree arg0 = build_vec_cst_rand (char_type_node, 2, 3, 2);
> > +    tree arg1 = build_vec_cst_rand (char_type_node, 2, 3, 2);
> > +
> > +    poly_uint64 res_len = 4;
> > +    tree res_type = build_vector_type (char_type_node, res_len);
> > +    vec_perm_builder builder (res_len, 4, 1);
> > +    builder.quick_push (0);
> > +    builder.quick_push (2);
> > +    builder.quick_push (4);
> > +    builder.quick_push (6);
> > +
> > +    vec_perm_indices sel (builder, 2, res_len);
> > +    tree res = fold_vec_perm_cst (res_type, arg0, arg1, sel);
> > +
> > +    tree expected_res[] = { vector_cst_elt (arg0, 0), vector_cst_elt 
> > (arg0, 2),
> > +                         vector_cst_elt (arg0, 4), vector_cst_elt (arg0, 6)
> > +                       };
> > +    validate_res_vls (res, expected_res, 4);
> > +  }
> > +
> > +  /* Case 4: Same as case 4, but op0, op1 are VNx4SI with shape (2, 3) and 
> > step = 2
>
> Same as case 3?
Oops sorry, fixed.

The attached patch passes bootstrap+test on aarch64-linux-gnu with and
without SVE, and on x86_64-linux-gnu.

Thanks,
Prathamesh
>

> Thanks,
> Richard
>
> > +     sel = V4SI with values {0, 2, 4, 6}
> > +     In this case result should be NULL_TREE because we cross input vector
> > +     boundary at index 4.  */
> > +  {
> > +    tree arg0 = build_vec_cst_rand (integer_type_node, 2, 3, 2);
> > +    tree arg1 = build_vec_cst_rand (integer_type_node, 2, 3, 2);
> > +
> > +    poly_uint64 res_len = 4;
> > +    tree res_type = build_vector_type (char_type_node, res_len);
> > +    vec_perm_builder builder (res_len, 4, 1);
> > +    builder.quick_push (0);
> > +    builder.quick_push (2);
> > +    builder.quick_push (4);
> > +    builder.quick_push (6);
> > +
> > +    vec_perm_indices sel (builder, 2, res_len);
> > +    const char *reason;
> > +    tree res = fold_vec_perm_cst (res_type, arg0, arg1, sel, &reason);
> > +    gcc_assert (res == NULL_TREE);
> > +    gcc_assert (!strcmp (reason, "cannot divide selector element by arg 
> > len"));
> > +  }
> > +}
> > +
> > +static void
> > +test ()
> > +{
> > +  tree vectype = get_preferred_vectype (integer_type_node);
> > +  if (TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
> > +    return;
> > +
> > +  test_dup ();
> > +  test_stepped ();
> > +  test_mixed ();
> > +}
> > +};
> > +
> >  /* Verify that various binary operations on vectors are folded
> >     correctly.  */
> >
> > @@ -16943,6 +17626,7 @@ fold_const_cc_tests ()
> >    test_arithmetic_folding ();
> >    test_vector_folding ();
> >    test_vec_duplicate_folding ();
> > +  test_fold_vec_perm_cst::test ();
> >  }
> >
> >  } // namespace selftest
diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
index 7e5494dfd39..648ef5c647e 100644
--- a/gcc/fold-const.cc
+++ b/gcc/fold-const.cc
@@ -40,6 +40,7 @@ along with GCC; see the file COPYING3.  If not see
    gimple code, we need to handle GIMPLE tuples as well as their
    corresponding tree equivalents.  */
 
+#define INCLUDE_ALGORITHM
 #include "config.h"
 #include "system.h"
 #include "coretypes.h"
@@ -10494,15 +10495,9 @@ fold_mult_zconjz (location_t loc, tree type, tree expr)
 static bool
 vec_cst_ctor_to_array (tree arg, unsigned int nelts, tree *elts)
 {
-  unsigned HOST_WIDE_INT i, nunits;
+  unsigned HOST_WIDE_INT i;
 
-  if (TREE_CODE (arg) == VECTOR_CST
-      && VECTOR_CST_NELTS (arg).is_constant (&nunits))
-    {
-      for (i = 0; i < nunits; ++i)
-       elts[i] = VECTOR_CST_ELT (arg, i);
-    }
-  else if (TREE_CODE (arg) == CONSTRUCTOR)
+  if (TREE_CODE (arg) == CONSTRUCTOR)
     {
       constructor_elt *elt;
 
@@ -10520,6 +10515,182 @@ vec_cst_ctor_to_array (tree arg, unsigned int nelts, 
tree *elts)
   return true;
 }
 
+/* Helper routine for fold_vec_perm_cst to check if SEL is a suitable
+   mask for VLA vec_perm folding.
+   REASON if specified, will contain the reason why SEL is not suitable.
+   Used only for debugging and unit-testing.  */
+
+static bool
+valid_mask_for_fold_vec_perm_cst_p (tree arg0, tree arg1,
+                                   const vec_perm_indices &sel,
+                                   const char **reason = NULL)
+{
+  unsigned sel_npatterns = sel.encoding ().npatterns ();
+  unsigned sel_nelts_per_pattern = sel.encoding ().nelts_per_pattern ();
+
+  if (!(pow2p_hwi (sel_npatterns)
+       && pow2p_hwi (VECTOR_CST_NPATTERNS (arg0))
+       && pow2p_hwi (VECTOR_CST_NPATTERNS (arg1))))
+    {
+      if (reason)
+       *reason = "npatterns is not power of 2";
+      return false;
+    }
+
+  /* We want to avoid cases where sel.length is not a multiple of npatterns.
+     For eg: sel.length = 2 + 2x, and sel npatterns = 4.  */
+  poly_uint64 esel;
+  if (!multiple_p (sel.length (), sel_npatterns, &esel))
+    {
+      if (reason)
+       *reason = "sel.length is not multiple of sel_npatterns";
+      return false;
+    }
+
+  if (sel_nelts_per_pattern < 3)
+    return true;
+
+  for (unsigned pattern = 0; pattern < sel_npatterns; pattern++)
+    {
+      poly_uint64 a1 = sel[pattern + sel_npatterns];
+      poly_uint64 a2 = sel[pattern + 2 * sel_npatterns];
+      HOST_WIDE_INT step;
+      if (!poly_int64 (a2 - a1).is_constant (&step))
+       {
+         if (reason)
+           *reason = "step is not constant";
+         return false;
+       }
+      // FIXME: Punt on step < 0 for now, revisit later.
+      if (step < 0)
+       return false;
+      if (step == 0)
+       continue;
+
+      if (!pow2p_hwi (step))
+       {
+         if (reason)
+           *reason = "step is not power of 2";
+         return false;
+       }
+
+      /* Ensure that stepped sequence of the pattern selects elements
+        only from the same input vector.  */
+      uint64_t q1, qe;
+      poly_uint64 r1, re;
+      poly_uint64 ae = a1 + (esel - 2) * step;
+      poly_uint64 arg_len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
+
+      if (!(can_div_trunc_p (a1, arg_len, &q1, &r1)
+           && can_div_trunc_p (ae, arg_len, &qe, &re)
+           && q1 == qe))
+       {
+         if (reason)
+           *reason = "crossed input vectors";
+         return false;
+       }
+
+      /* Ensure that the stepped sequence always selects from the same
+        input pattern.  */
+      unsigned arg_npatterns
+       = ((q1 & 0) == 0) ? VECTOR_CST_NPATTERNS (arg0)
+                         : VECTOR_CST_NPATTERNS (arg1);
+
+      if (!multiple_p (step, arg_npatterns))
+       {
+         if (reason)
+           *reason = "step is not multiple of npatterns";
+         return false;
+       }
+    }
+
+  return true;
+}
+
+/* Try to fold permutation of ARG0 and ARG1 with SEL selector when
+   the input vectors are VECTOR_CST. Return NULL_TREE otherwise.
+   REASON has same purpose as described in
+   valid_mask_for_fold_vec_perm_cst_p.  */
+
+
+static tree
+fold_vec_perm_cst (tree type, tree arg0, tree arg1, const vec_perm_indices 
&sel,
+                  const char **reason = NULL)
+{
+  unsigned res_npatterns, res_nelts_per_pattern;
+  unsigned HOST_WIDE_INT res_nelts;
+
+  /* (1) If SEL is a suitable mask as determined by
+     valid_mask_for_fold_vec_perm_cst_p, then:
+     res_npatterns = max of npatterns between ARG0, ARG1, and SEL
+     res_nelts_per_pattern = max of nelts_per_pattern between
+                            ARG0, ARG1 and SEL.
+     (2) If SEL is not a suitable mask, and TYPE is VLS then:
+     res_npatterns = nelts in result vector.
+     res_nelts_per_pattern = 1.
+     This exception is made so that VLS ARG0, ARG1 and SEL work as before.  */
+  if (valid_mask_for_fold_vec_perm_cst_p (arg0, arg1, sel, reason))
+    {
+      res_npatterns
+       = std::max (VECTOR_CST_NPATTERNS (arg0),
+                   std::max (VECTOR_CST_NPATTERNS (arg1),
+                             sel.encoding ().npatterns ()));
+
+      res_nelts_per_pattern
+       = std::max (VECTOR_CST_NELTS_PER_PATTERN (arg0),
+                   std::max (VECTOR_CST_NELTS_PER_PATTERN (arg1),
+                             sel.encoding ().nelts_per_pattern ()));
+
+      res_nelts = res_npatterns * res_nelts_per_pattern;
+    }
+  else if (TYPE_VECTOR_SUBPARTS (type).is_constant (&res_nelts))
+    {
+      res_npatterns = res_nelts;
+      res_nelts_per_pattern = 1;
+    }
+  else
+    return NULL_TREE;
+
+  tree_vector_builder out_elts (type, res_npatterns, res_nelts_per_pattern);
+  for (unsigned i = 0; i < res_nelts; i++)
+    {
+      poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
+      uint64_t q;
+      poly_uint64 r;
+      unsigned HOST_WIDE_INT index;
+
+      /* Punt if sel[i] /trunc_div len cannot be determined,
+        because the input vector to be chosen will depend on
+        runtime vector length.
+        For example if len == 4 + 4x, and sel[i] == 4,
+        If len at runtime equals 4, we choose arg1[0].
+        For any other value of len > 4 at runtime, we choose arg0[4].
+        which makes the element choice dependent on runtime vector length.  */
+      if (!can_div_trunc_p (sel[i], len, &q, &r))
+       {
+         if (reason)
+           *reason = "cannot divide selector element by arg len";
+         return NULL_TREE;
+       }
+
+      /* sel[i] % len will give the index of element in the chosen input
+        vector. For example if sel[i] == 5 + 4x and len == 4 + 4x,
+        we will choose arg1[1] since (5 + 4x) % (4 + 4x) == 1.  */
+      if (!r.is_constant (&index))
+       {
+         if (reason)
+           *reason = "remainder is not constant";
+         return NULL_TREE;
+       }
+
+      tree arg = ((q & 1) == 0) ? arg0 : arg1;
+      tree elem = vector_cst_elt (arg, index);
+      out_elts.quick_push (elem);
+    }
+
+  return out_elts.build ();
+}
+
 /* Attempt to fold vector permutation of ARG0 and ARG1 vectors using SEL
    selector.  Return the folded VECTOR_CST or CONSTRUCTOR if successful,
    NULL_TREE otherwise.  */
@@ -10529,43 +10700,41 @@ fold_vec_perm (tree type, tree arg0, tree arg1, const 
vec_perm_indices &sel)
 {
   unsigned int i;
   unsigned HOST_WIDE_INT nelts;
-  bool need_ctor = false;
 
-  if (!sel.length ().is_constant (&nelts))
-    return NULL_TREE;
-  gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (type), nelts)
-             && known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)), nelts)
-             && known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg1)), nelts));
+  gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (type), sel.length ())
+             && known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)),
+                          TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg1))));
+
   if (TREE_TYPE (TREE_TYPE (arg0)) != TREE_TYPE (type)
       || TREE_TYPE (TREE_TYPE (arg1)) != TREE_TYPE (type))
     return NULL_TREE;
 
+  if (TREE_CODE (arg0) == VECTOR_CST
+      && TREE_CODE (arg1) == VECTOR_CST)
+    return fold_vec_perm_cst (type, arg0, arg1, sel);
+
+  /* For fall back case, we want to ensure we have VLS vectors
+     with equal length.  */
+  if (!sel.length ().is_constant (&nelts))
+    return NULL_TREE;
+
+  gcc_assert (known_eq (sel.length (),
+                       TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0))));
   tree *in_elts = XALLOCAVEC (tree, nelts * 2);
   if (!vec_cst_ctor_to_array (arg0, nelts, in_elts)
       || !vec_cst_ctor_to_array (arg1, nelts, in_elts + nelts))
     return NULL_TREE;
 
-  tree_vector_builder out_elts (type, nelts, 1);
+  vec<constructor_elt, va_gc> *v;
+  vec_alloc (v, nelts);
   for (i = 0; i < nelts; i++)
     {
       HOST_WIDE_INT index;
       if (!sel[i].is_constant (&index))
        return NULL_TREE;
-      if (!CONSTANT_CLASS_P (in_elts[index]))
-       need_ctor = true;
-      out_elts.quick_push (unshare_expr (in_elts[index]));
-    }
-
-  if (need_ctor)
-    {
-      vec<constructor_elt, va_gc> *v;
-      vec_alloc (v, nelts);
-      for (i = 0; i < nelts; i++)
-       CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, out_elts[i]);
-      return build_constructor (type, v);
+      CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, in_elts[index]);
     }
-  else
-    return out_elts.build ();
+  return build_constructor (type, v);
 }
 
 /* Try to fold a pointer difference of type TYPE two address expressions of
@@ -16892,6 +17061,554 @@ test_arithmetic_folding ()
                                   x);
 }
 
+namespace test_fold_vec_perm_cst {
+
+/* Build a VECTOR_CST corresponding to VMODE, and has
+   encoding given by NPATTERNS, NELTS_PER_PATTERN and STEP.
+   Fill it with randomized elements, using rand() % THRESHOLD.  */
+
+static tree
+build_vec_cst_rand (machine_mode vmode, unsigned npatterns,
+                   unsigned nelts_per_pattern,
+                   int step = 0, int threshold = 100)
+{
+  tree inner_type = lang_hooks.types.type_for_mode (GET_MODE_INNER (vmode), 1);
+  tree vectype = build_vector_type_for_mode (inner_type, vmode);
+  tree_vector_builder builder (vectype, npatterns, nelts_per_pattern);
+
+  // Fill a0 for each pattern
+  for (unsigned i = 0; i < npatterns; i++)
+    builder.quick_push (build_int_cst (inner_type, rand () % threshold));
+
+  if (nelts_per_pattern == 1)
+    return builder.build ();
+
+  // Fill a1 for each pattern
+  for (unsigned i = 0; i < npatterns; i++)
+    builder.quick_push (build_int_cst (inner_type, rand () % threshold));
+
+  if (nelts_per_pattern == 2)
+    return builder.build ();
+
+  for (unsigned i = npatterns * 2; i < npatterns * nelts_per_pattern; i++)
+    {
+      tree prev_elem = builder[i - npatterns];
+      int prev_elem_val = TREE_INT_CST_LOW (prev_elem);
+      int val = prev_elem_val + step;
+      builder.quick_push (build_int_cst (inner_type, val));
+    }
+
+  return builder.build ();
+}
+
+/* Validate result of VEC_PERM_EXPR folding for the unit-tests below,
+   when result is VLA.  */
+
+static void
+validate_res (unsigned npatterns, unsigned nelts_per_pattern,
+             tree res, tree *expected_res)
+{
+  /* Actual npatterns / nelts_per_pattern in res may be less than expected due
+     to canonicalization.  */
+  ASSERT_TRUE (VECTOR_CST_NPATTERNS (res) <= npatterns);
+  ASSERT_TRUE (VECTOR_CST_NELTS_PER_PATTERN (res) <= nelts_per_pattern);
+
+  for (unsigned i = 0; i < npatterns * nelts_per_pattern; i++)
+    ASSERT_TRUE (operand_equal_p (VECTOR_CST_ELT (res, i), expected_res[i], 
0));
+}
+
+/* Validate result of VEC_PERM_EXPR folding for the unit-tests below,
+   when the result is VLS.  */
+
+static void
+validate_res_vls (tree res, tree *expected_res, unsigned expected_nelts)
+{
+  ASSERT_TRUE (known_eq (VECTOR_CST_NELTS (res), expected_nelts));
+  for (unsigned i = 0; i < expected_nelts; i++)
+    ASSERT_TRUE (operand_equal_p (VECTOR_CST_ELT (res, i), expected_res[i], 
0));
+}
+
+/* Test cases where result and input vectors are VNx4SI  */
+
+static void
+test_vnx4si (machine_mode vmode)
+{
+  /* Case 1: mask = {0, ...} */
+  {
+    tree arg0 = build_vec_cst_rand (vmode, 2, 3, 1);
+    tree arg1 = build_vec_cst_rand (vmode, 2, 3, 1);
+    poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
+
+    vec_perm_builder builder (len, 1, 1);
+    builder.quick_push (0);
+    vec_perm_indices sel (builder, 2, len);
+    tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel);
+
+    tree expected_res[] = { vector_cst_elt (res, 0) };
+    validate_res (1, 1, res, expected_res);
+  }
+
+  /* Case 2: mask = {len, ...} */
+  {
+    tree arg0 = build_vec_cst_rand (vmode, 2, 3, 1);
+    tree arg1 = build_vec_cst_rand (vmode, 2, 3, 1);
+    poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
+
+    vec_perm_builder builder (len, 1, 1);
+    builder.quick_push (len);
+    vec_perm_indices sel (builder, 2, len);
+    tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel);
+
+    tree expected_res[] = { vector_cst_elt (arg1, 0) };
+    validate_res (1, 1, res, expected_res);
+  }
+
+  /* Case 3: mask = { 0, len, ... } */
+  {
+    tree arg0 = build_vec_cst_rand (vmode, 2, 3, 1);
+    tree arg1 = build_vec_cst_rand (vmode, 2, 3, 1);
+    poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
+
+    vec_perm_builder builder (len, 2, 1);
+    builder.quick_push (0);
+    builder.quick_push (len);
+    vec_perm_indices sel (builder, 2, len);
+    tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel);
+
+    tree expected_res[] = { vector_cst_elt (arg0, 0), vector_cst_elt (arg1, 0) 
};
+    validate_res (2, 1, res, expected_res);
+  }
+
+  /* Case 4: mask = { 0, len, 1, len+1, ... } */
+  {
+    tree arg0 = build_vec_cst_rand (vmode, 2, 3, 1);
+    tree arg1 = build_vec_cst_rand (vmode, 2, 3, 1);
+    poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
+
+    vec_perm_builder builder (len, 2, 2);
+    builder.quick_push (0);
+    builder.quick_push (len);
+    builder.quick_push (1);
+    builder.quick_push (len + 1);
+    vec_perm_indices sel (builder, 2, len);
+    tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel);
+
+    tree expected_res[] = { vector_cst_elt (arg0, 0), vector_cst_elt (arg1, 0),
+                           vector_cst_elt (arg0, 1), vector_cst_elt (arg1, 1)
+                         };
+    validate_res (2, 2, res, expected_res);
+  }
+
+  /* Case 5: mask = { 0, len, 1, len+1, .... }
+     npatterns = 4, nelts_per_pattern = 1 */
+  {
+    tree arg0 = build_vec_cst_rand (vmode, 2, 3, 1);
+    tree arg1 = build_vec_cst_rand (vmode, 2, 3, 1);
+    poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
+
+    vec_perm_builder builder (len, 4, 1);
+    builder.quick_push (0);
+    builder.quick_push (len);
+    builder.quick_push (1);
+    builder.quick_push (len + 1);
+    vec_perm_indices sel (builder, 2, len);
+    tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel);
+
+    tree expected_res[] = { vector_cst_elt (arg0, 0), vector_cst_elt (arg1, 0),
+                           vector_cst_elt (arg0, 1), vector_cst_elt (arg1, 1)
+                         };
+    validate_res (4, 1, res, expected_res);
+  }
+
+  /* Case 6: mask = {0, 4, ...}
+     npatterns = 1, nelts_per_pattern = 2.
+     This should return NULL_TREE because the index 4 may choose
+     from either arg0 or arg1 depending on vector length.  */
+  {
+    tree arg0 = build_vec_cst_rand (vmode, 2, 3, 1);
+    tree arg1 = build_vec_cst_rand (vmode, 2, 3, 1);
+    poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
+
+    vec_perm_builder builder (len, 1, 2);
+    builder.quick_push (0);
+    builder.quick_push (4);
+    vec_perm_indices sel (builder, 2, len);
+    tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel);
+    ASSERT_TRUE (res == NULL_TREE);
+  }
+
+  /* Case 7: npatterns(arg0) = 4 > npatterns(sel) = 2
+     mask = {0, len, 1, len + 1, ...}
+     sel_npatterns = 2, sel_nelts_per_pattern = 2.  */
+  {
+    tree arg0 = build_vec_cst_rand (vmode, 2, 3, 1);
+    tree arg1 = build_vec_cst_rand (vmode, 2, 3, 1);
+    poly_uint64 arg0_len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
+
+    vec_perm_builder builder (arg0_len, 2, 2);
+    builder.quick_push (0);
+    builder.quick_push (arg0_len);
+    builder.quick_push (1);
+    builder.quick_push (arg0_len + 1);
+    vec_perm_indices sel (builder, 2, arg0_len);
+    tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel);
+
+    tree expected_res[] = { vector_cst_elt (arg0, 0), vector_cst_elt (arg1, 0),
+                           vector_cst_elt (arg0, 1), vector_cst_elt (arg1, 1)
+                         };
+    validate_res (2, 2, res, expected_res);
+  }
+
+  /* Case 8: sel = {0, 1, 2, ...}
+     npatterns = 1, nelts_per_pattern = 3
+     expected res: { arg0[0], arg0[1], arg0[2], ... } */
+  {
+    tree arg0 = build_vec_cst_rand (vmode, 1, 3, 2);
+    tree arg1 = build_vec_cst_rand (vmode, 1, 3, 2);
+    poly_uint64 arg0_len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
+
+    vec_perm_builder builder (arg0_len, 1, 3);
+    builder.quick_push (0);
+    builder.quick_push (1);
+    builder.quick_push (2);
+
+    vec_perm_indices sel (builder, 2, arg0_len);
+    tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel);
+    tree expected_res[] = { vector_cst_elt (arg0, 0), vector_cst_elt (arg0, 1),
+                           vector_cst_elt (arg0, 2) };
+    validate_res (1, 3, res, expected_res);
+  }
+
+  /* Case 9: sel = {len, len + 1, len + 2, ... }
+     npatterns = 1, nelts_per_pattern = 3
+     expected res: { op1[0], op1[1], op1[2], ... }  */
+  {
+    tree arg0 = build_vec_cst_rand (vmode, 1, 3, 2);
+    tree arg1 = build_vec_cst_rand (vmode, 1, 3, 2);
+    poly_uint64 arg0_len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
+
+    vec_perm_builder builder (arg0_len, 1, 3);
+    builder.quick_push (arg0_len);
+    builder.quick_push (arg0_len + 1);
+    builder.quick_push (arg0_len + 2);
+
+    vec_perm_indices sel (builder, 2, arg0_len);
+    tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel, NULL);
+    tree expected_res[] = { vector_cst_elt (arg1, 0), vector_cst_elt (arg1, 1),
+                           vector_cst_elt (arg1, 2) };
+    validate_res (1, 3, res, expected_res);
+  }
+}
+
+/* Test cases where result and input vectors are VNx16QI  */
+
+static void
+test_vnx16qi (machine_mode vmode)
+{
+  /* Case 1: Leading element of arg1, stepped sequence: pattern 0 of arg0.
+     sel = {len, 0, 0, 0, 2, 0, ...}
+     npatterns = 2, nelts_per_pattern = 3.
+     Use extra pattern {0, ...} to lower number of elements per pattern.  */
+  {
+    tree arg0 = build_vec_cst_rand (vmode, 1, 3, 2);
+    tree arg1 = build_vec_cst_rand (vmode, 1, 3, 2);
+    poly_uint64 arg0_len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
+
+    vec_perm_builder builder (arg0_len, 2, 3);
+    builder.quick_push (arg0_len);
+    int mask_elems[] = { 0, 0, 0, 2, 0 };
+    for (int i = 0; i < 5; i++)
+      builder.quick_push (mask_elems[i]);
+
+    vec_perm_indices sel (builder, 2, arg0_len);
+    const char *reason;
+    tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel, &reason);
+
+    tree expected_res[] = { vector_cst_elt (arg1, 0), vector_cst_elt (arg0, 0),
+                           vector_cst_elt (arg0, 0), vector_cst_elt (arg0, 0),
+                           vector_cst_elt (arg0, 2), vector_cst_elt (arg0, 0)
+                         };
+    validate_res (2, 3, res, expected_res);
+  }
+
+  /* Case 2:
+     sel = { len, 0, 2, ... } npatterns = 1, nelts_per_pattern = 3.
+     This should return NULL because we cross the input vectors.
+     Because,
+     arg0_len = 16 + 16x
+     a1 = 0
+     S = 2
+     esel = arg0_len / npatterns_sel = 16+16x/1 = 16 + 16x
+     ae = 0 + (esel - 2) * S
+       = 0 + (16 + 16x - 2) * 2
+       = 28 + 32x
+     Let q1 = a1 / arg0_len = 0 /trunc (16 + 16x) = 0
+     Let qe = ae / arg0_len = (28 + 32x) /trunc (16 + 16x) = 1.
+     Since q1 != qe, we cross input vectors.
+     So return NULL_TREE.  */
+  {
+    tree arg0 = build_vec_cst_rand (vmode, 1, 3, 2);
+    tree arg1 = build_vec_cst_rand (vmode, 1, 3, 2);
+    poly_uint64 arg0_len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
+
+    vec_perm_builder builder (arg0_len, 1, 3);
+    builder.quick_push (arg0_len);
+    builder.quick_push (0);
+    builder.quick_push (2);
+
+    vec_perm_indices sel (builder, 2, arg0_len);
+    const char *reason;
+    tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel, &reason);
+    ASSERT_TRUE (res == NULL_TREE);
+    ASSERT_TRUE (!strcmp (reason, "crossed input vectors"));
+  }
+
+  /* Case 3: Select elements from different patterns.
+     Should return NULL.  */
+  {
+    tree op0 = build_vec_cst_rand (vmode, 2, 3, 2);
+    tree op1 = build_vec_cst_rand (vmode, 2, 3, 2);
+    poly_uint64 op0_len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (op0));
+
+    vec_perm_builder builder (op0_len, 2, 3);
+    builder.quick_push (op0_len);
+    int mask_elems[] = { 0, 0, 0, 1, 0 };
+    for (int i = 0; i < 5; i++)
+      builder.quick_push (mask_elems[i]);
+
+    vec_perm_indices sel (builder, 2, op0_len);
+    const char *reason;
+    tree res = fold_vec_perm_cst (TREE_TYPE (op0), op0, op1, sel, &reason);
+    ASSERT_TRUE (res == NULL_TREE);
+    ASSERT_TRUE (!strcmp (reason, "step is not multiple of npatterns"));
+  }
+
+  /* Case 4: Select pattern 0 of op0 and dup of op0[0]
+     op0, op1, sel: npatterns = 2, nelts_per_pattern = 3
+     sel = { 0, 0, 2, 0, 4, 0, ... }.
+
+     For pattern {0, 2, 4, ...}:
+     a1 = 2
+     len = 16 + 16x
+     S = 2
+     esel = len / npatterns_sel = (16 + 16x) / 2 = (8 + 8x)
+     ae = a1 + (esel - 2) * S
+       = 2 + (8 + 8x - 2) * 2
+       = 14 + 16x
+     a1 / arg0_len = 2 / (16 + 16x) = 0
+     ae / arg0_len = (14 + 16x) / (16 + 16x) = 0
+     So a1/arg0_len = ae/arg0_len = 0
+     Hence we select from first vector op0
+     S = 2, npatterns = 2.
+     Since S is multiple of npatterns(op0), we are selecting from
+     same pattern of op0.
+
+     For pattern {0, ...}, we are choosing { op0[0] ... }
+     So res will be combination of above patterns:
+     res: { op0[0], op0[0], op0[2], op0[0], op0[4], op0[0], ... }
+     with npatterns = 2, nelts_per_pattern = 3.  */
+  {
+    tree op0 = build_vec_cst_rand (vmode, 2, 3, 2);
+    tree op1 = build_vec_cst_rand (vmode, 2, 3, 2);
+    poly_uint64 op0_len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (op0));
+
+    vec_perm_builder builder (op0_len, 2, 3);
+    int mask_elems[] = { 0, 0, 2, 0, 4, 0 };
+    for (int i = 0; i < 6; i++)
+      builder.quick_push (mask_elems[i]);
+
+    vec_perm_indices sel (builder, 2, op0_len);
+    tree res = fold_vec_perm_cst (TREE_TYPE (op0), op0, op1, sel);
+    tree expected_res[] = { vector_cst_elt (op0, 0), vector_cst_elt (op0, 0),
+                           vector_cst_elt (op0, 2), vector_cst_elt (op0, 0),
+                           vector_cst_elt (op0, 4), vector_cst_elt (op0, 0) };
+    validate_res (2, 3, res, expected_res);
+  }
+
+  /* Case 7: sel_npatterns > op_npatterns;
+     op0, op1: npatterns = 2, nelts_per_pattern = 3
+     sel: { 0, 0, 1, len, 2, 0, 3, len, 4, 0, 5, len, ...},
+     with npatterns = 4, nelts_per_pattern = 3.  */
+  {
+    tree op0 = build_vec_cst_rand (vmode, 2, 3, 2);
+    tree op1 = build_vec_cst_rand (vmode, 2, 3, 2);
+    poly_uint64 op0_len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (op0));
+
+    vec_perm_builder builder(op0_len, 4, 3);
+    // -1 is used as place holder for poly_int_cst
+    int mask_elems[] = { 0, 0, 1, -1, 2, 0, 3, -1, 4, 0, 5, -1 };
+    for (int i = 0; i < 12; i++)
+      builder.quick_push ((mask_elems[i] == -1) ? op0_len : mask_elems[i]);
+
+    vec_perm_indices sel (builder, 2, op0_len);
+    tree res = fold_vec_perm_cst (TREE_TYPE (op0), op0, op1, sel);
+    tree expected_res[] = { vector_cst_elt (op0, 0), vector_cst_elt (op0, 0),
+                           vector_cst_elt (op0, 1), vector_cst_elt (op1, 0),
+                           vector_cst_elt (op0, 2), vector_cst_elt (op0, 0),
+                           vector_cst_elt (op0, 3), vector_cst_elt (op1, 0),
+                           vector_cst_elt (op0, 4), vector_cst_elt (op0, 0),
+                           vector_cst_elt (op0, 5), vector_cst_elt (op1, 0) };
+    validate_res (4, 3, res, expected_res);
+  }
+}
+
+/* Test cases where result is VNx4SI and input vectors are V4SI.  */
+
+static void
+test_vnx4si_v4si (machine_mode vnx4si_mode, machine_mode v4si_mode)
+{
+  /* Case 1:
+     sel = { 0, 4, 1, 5, ... }
+     res = { op0[0], op1[0], op0[1], op1[1], ...} // (4, 1)  */
+  {
+    tree arg0 = build_vec_cst_rand (v4si_mode, 4, 1, 0);
+    tree arg1 = build_vec_cst_rand (v4si_mode, 4, 1, 0);
+
+    tree inner_type
+      = lang_hooks.types.type_for_mode (GET_MODE_INNER (vnx4si_mode), 1);
+    tree res_type = build_vector_type_for_mode (inner_type, vnx4si_mode);
+
+    poly_uint64 res_len = TYPE_VECTOR_SUBPARTS (res_type);
+    vec_perm_builder builder (res_len, 4, 1);
+    builder.quick_push (0);
+    builder.quick_push (4);
+    builder.quick_push (1);
+    builder.quick_push (5);
+
+    vec_perm_indices sel (builder, 2, res_len);
+    tree res = fold_vec_perm_cst (res_type, arg0, arg1, sel);
+    tree expected_res[] = { vector_cst_elt (arg0, 0), vector_cst_elt (arg1, 0),
+                           vector_cst_elt (arg0, 1), vector_cst_elt (arg1, 1)
+                         };
+    validate_res (4, 1, res, expected_res);
+  }
+
+  /* Case 2: Same as case 1, but contains an out of bounds access which
+     should wrap around.
+     sel = {0, 8, 4, 12, ...} (4, 1)
+     res = { op0[0], op0[0], op1[0], op1[0], ... } (4, 1).  */
+  {
+    tree arg0 = build_vec_cst_rand (v4si_mode, 4, 1, 0);
+    tree arg1 = build_vec_cst_rand (v4si_mode, 4, 1, 0);
+
+    tree inner_type
+      = lang_hooks.types.type_for_mode (GET_MODE_INNER (vnx4si_mode), 1);
+    tree res_type = build_vector_type_for_mode (inner_type, vnx4si_mode);
+
+    poly_uint64 res_len = TYPE_VECTOR_SUBPARTS (res_type);
+    vec_perm_builder builder (res_len, 4, 1);
+    builder.quick_push (0);
+    builder.quick_push (8);
+    builder.quick_push (4);
+    builder.quick_push (12);
+
+    vec_perm_indices sel (builder, 2, res_len);
+    tree res = fold_vec_perm_cst (res_type, arg0, arg1, sel);
+    tree expected_res[] = { vector_cst_elt (arg0, 0), vector_cst_elt (arg0, 0),
+                           vector_cst_elt (arg1, 0), vector_cst_elt (arg1, 0)
+                         };
+    validate_res (4, 1, res, expected_res);
+  }
+}
+
+/* Test cases where result is V4SI and input vectors are VNx4SI.  */
+
+static void
+test_v4si_vnx4si (machine_mode v4si_mode, machine_mode vnx4si_mode)
+{
+  /* Case 1:
+     sel = { 0, 1, 2, 3}
+     res = { op0[0], op0[1], op0[2], op0[3] }.  */
+  {
+    tree arg0 = build_vec_cst_rand (vnx4si_mode, 4, 1);
+    tree arg1 = build_vec_cst_rand (vnx4si_mode, 4, 1);
+
+    tree inner_type
+      = lang_hooks.types.type_for_mode (GET_MODE_INNER (v4si_mode), 1);
+    tree res_type = build_vector_type_for_mode (inner_type, v4si_mode);
+
+    poly_uint64 res_len = TYPE_VECTOR_SUBPARTS (res_type);
+    vec_perm_builder builder (res_len, 4, 1);
+    for (int i = 0; i < 4; i++)
+      builder.quick_push (i);
+
+    vec_perm_indices sel (builder, 2, res_len);
+    tree res = fold_vec_perm_cst (res_type, arg0, arg1, sel);
+    tree expected_res[] = { vector_cst_elt (arg0, 0), vector_cst_elt (arg0, 1),
+                           vector_cst_elt (arg0, 2), vector_cst_elt (arg0, 3) 
};
+    validate_res_vls (res, expected_res, 4);
+  }
+
+  /* Case 2: Same as Case 1, but crossing input vector.
+     sel = {0, 2, 4, 6}
+     In this case,the index 4 is ambiguous since len = 4 + 4x.
+     If x = 0 at runtime, we choose op1[0]
+     For x > 0 at runtime, we choose op0[4]
+     Since we cannot determine, which vector to choose from during compile 
time,
+     should return NULL_TREE.  */
+  {
+    tree arg0 = build_vec_cst_rand (vnx4si_mode, 4, 1);
+    tree arg1 = build_vec_cst_rand (vnx4si_mode, 4, 1);
+
+    tree inner_type
+      = lang_hooks.types.type_for_mode (GET_MODE_INNER (v4si_mode), 1);
+    tree res_type = build_vector_type_for_mode (inner_type, v4si_mode);
+
+    poly_uint64 res_len = TYPE_VECTOR_SUBPARTS (res_type);
+    vec_perm_builder builder (res_len, 4, 1);
+    for (int i = 0; i < 8; i += 2)
+      builder.quick_push (i);
+
+    vec_perm_indices sel (builder, 2, res_len);
+    const char *reason;
+    tree res = fold_vec_perm_cst (res_type, arg0, arg1, sel, &reason);
+    ASSERT_TRUE (res == NULL_TREE);
+    ASSERT_TRUE (!strcmp (reason, "cannot divide selector element by arg 
len"));
+  }
+}
+
+/* Helper function to get a vector mode that has element
+   mode as INNER_MODE and GET_MODE_NUNITS equal to len.  */
+
+static machine_mode
+get_vmode (machine_mode inner_mode, poly_uint64 len)
+{
+  machine_mode vmode;
+  FOR_EACH_MODE_IN_CLASS (vmode, MODE_VECTOR_INT)
+    if (GET_MODE_INNER (vmode) == inner_mode
+       && known_eq (GET_MODE_NUNITS (vmode), len))
+      return vmode;
+  return E_VOIDmode;
+}
+
+/* Invoke tests for fold_vec_perm_cst.  */
+
+static void
+test ()
+{
+  /* Conditionally execute fold_vec_perm_cst tests, if target supports
+     VLA vectors. Use a compile time check so we avoid instantiating
+     poly_uint64 with N > 1 on targets that do not support VLA vectors.  */
+  if constexpr (poly_int_traits<poly_uint64>::num_coeffs > 1)
+    {
+      machine_mode vnx4si_mode = get_vmode (SImode, poly_uint64 (4, 4));
+      if (vnx4si_mode == E_VOIDmode)
+       return;
+      machine_mode vnx16qi_mode = get_vmode (QImode, poly_uint64 (16, 16));
+      if (vnx16qi_mode == E_VOIDmode)
+       return;
+      machine_mode v4si_mode = get_vmode (SImode, 4);
+      if (v4si_mode == E_VOIDmode)
+       return;
+
+      test_vnx4si (vnx4si_mode);
+      test_vnx16qi (vnx16qi_mode);
+      test_vnx4si_v4si (vnx4si_mode, v4si_mode);
+      test_v4si_vnx4si (v4si_mode, vnx4si_mode);
+    }
+}
+}; // end of test_fold_vec_perm_cst namespace
+
 /* Verify that various binary operations on vectors are folded
    correctly.  */
 
@@ -16943,6 +17660,7 @@ fold_const_cc_tests ()
   test_arithmetic_folding ();
   test_vector_folding ();
   test_vec_duplicate_folding ();
+  test_fold_vec_perm_cst::test ();
 }
 
 } // namespace selftest

Reply via email to