On Fri, Dec 2, 2011 at 8:26 PM, Jakub Jelinek <ja...@redhat.com> wrote:
> Hi!
>
> As I found during investigation of PR51387, e.g. on the attached testcase
> we generate pretty bad code (for f1 even with bigger N like 256 for avx2),
> because after vectorization cunroll unrolls the loops completely and we
> end up with lots of VEC_PACK_TRUNC_EXPR etc. expressions with VECTOR_CST
> arguments.  We don't fold them, thus we read lots of constants from memory
> and reshuffle them in lots of code.  This patch adds folding for these
> expressions; on this testcase we end up with the same number of constants
> loaded from memory, but with no need to reshuffle them.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

Ok.

Thanks,
Richard.

> 2011-12-02  Jakub Jelinek  <ja...@redhat.com>
>
>        * fold-const.c (fold_unary_loc): Fold VEC_UNPACK_LO_EXPR,
>        VEC_UNPACK_HI_EXPR, VEC_UNPACK_FLOAT_LO_EXPR and
>        VEC_UNPACK_FLOAT_HI_EXPR with VECTOR_CST argument.
>        (fold_binary_loc): Fold VEC_PACK_TRUNC_EXPR,
>        VEC_PACK_FIX_TRUNC_EXPR, VEC_WIDEN_MULT_LO_EXPR
>        and VEC_WIDEN_MULT_HI_EXPR with VECTOR_CST arguments.
>
>        * gcc.dg/vect/vect-122.c: New test.
>
> --- gcc/fold-const.c.jj 2011-12-02 01:52:26.000000000 +0100
> +++ gcc/fold-const.c    2011-12-02 17:43:09.246557524 +0100
> @@ -7651,6 +7651,8 @@ build_fold_addr_expr_loc (location_t loc
>   return build_fold_addr_expr_with_type_loc (loc, t, ptrtype);
>  }
>
> +static bool vec_cst_ctor_to_array (tree, tree *);
> +
>  /* Fold a unary expression of code CODE and type TYPE with operand
>    OP0.  Return the folded expression if folding is successful.
>    Otherwise, return NULL_TREE.  */
> @@ -8294,6 +8296,44 @@ fold_unary_loc (location_t loc, enum tre
>        }
>       return NULL_TREE;
>
> +    case VEC_UNPACK_LO_EXPR:
> +    case VEC_UNPACK_HI_EXPR:
> +    case VEC_UNPACK_FLOAT_LO_EXPR:
> +    case VEC_UNPACK_FLOAT_HI_EXPR:
> +      {
> +       unsigned int nelts = TYPE_VECTOR_SUBPARTS (type), i;
> +       tree *elts, vals = NULL_TREE;
> +       enum tree_code subcode;
> +
> +       gcc_assert (TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)) == nelts * 2);
> +       if (TREE_CODE (arg0) != VECTOR_CST)
> +         return NULL_TREE;
> +
> +       elts = XALLOCAVEC (tree, nelts * 2);
> +       if (!vec_cst_ctor_to_array (arg0, elts))
> +         return NULL_TREE;
> +
> +       if ((!BYTES_BIG_ENDIAN) ^ (code == VEC_UNPACK_LO_EXPR
> +                                  || code == VEC_UNPACK_FLOAT_LO_EXPR))
> +         elts += nelts;
> +
> +       if (code == VEC_UNPACK_LO_EXPR || code == VEC_UNPACK_HI_EXPR)
> +         subcode = NOP_EXPR;
> +       else
> +         subcode = FLOAT_EXPR;
> +
> +       for (i = 0; i < nelts; i++)
> +         {
> +           elts[i] = fold_convert_const (subcode, TREE_TYPE (type), elts[i]);
> +           if (elts[i] == NULL_TREE || !CONSTANT_CLASS_P (elts[i]))
> +             return NULL_TREE;
> +         }
> +
> +       for (i = 0; i < nelts; i++)
> +         vals = tree_cons (NULL_TREE, elts[nelts - i - 1], vals);
> +       return build_vector (type, vals);
> +      }
> +
>     default:
>       return NULL_TREE;
>     } /* switch (code) */
> @@ -13498,6 +13538,73 @@ fold_binary_loc (location_t loc,
>        }
>       return NULL_TREE;
>
> +    case VEC_PACK_TRUNC_EXPR:
> +    case VEC_PACK_FIX_TRUNC_EXPR:
> +      {
> +       unsigned int nelts = TYPE_VECTOR_SUBPARTS (type), i;
> +       tree *elts, vals = NULL_TREE;
> +
> +       gcc_assert (TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)) == nelts / 2
> +                   && TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg1)) == nelts / 2);
> +       if (TREE_CODE (arg0) != VECTOR_CST || TREE_CODE (arg1) != VECTOR_CST)
> +         return NULL_TREE;
> +
> +       elts = XALLOCAVEC (tree, nelts);
> +       if (!vec_cst_ctor_to_array (arg0, elts)
> +           || !vec_cst_ctor_to_array (arg1, elts + nelts / 2))
> +         return NULL_TREE;
> +
> +       for (i = 0; i < nelts; i++)
> +         {
> +           elts[i] = fold_convert_const (code == VEC_PACK_TRUNC_EXPR
> +                                         ? NOP_EXPR : FIX_TRUNC_EXPR,
> +                                         TREE_TYPE (type), elts[i]);
> +           if (elts[i] == NULL_TREE || !CONSTANT_CLASS_P (elts[i]))
> +             return NULL_TREE;
> +         }
> +
> +       for (i = 0; i < nelts; i++)
> +         vals = tree_cons (NULL_TREE, elts[nelts - i - 1], vals);
> +       return build_vector (type, vals);
> +      }
> +
> +    case VEC_WIDEN_MULT_LO_EXPR:
> +    case VEC_WIDEN_MULT_HI_EXPR:
> +      {
> +       unsigned int nelts = TYPE_VECTOR_SUBPARTS (type), i;
> +       tree *elts, vals = NULL_TREE;
> +
> +       gcc_assert (TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)) == nelts * 2
> +                   && TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg1)) == nelts * 2);
> +       if (TREE_CODE (arg0) != VECTOR_CST || TREE_CODE (arg1) != VECTOR_CST)
> +         return NULL_TREE;
> +
> +       elts = XALLOCAVEC (tree, nelts * 4);
> +       if (!vec_cst_ctor_to_array (arg0, elts)
> +           || !vec_cst_ctor_to_array (arg1, elts + nelts * 2))
> +         return NULL_TREE;
> +
> +       if ((!BYTES_BIG_ENDIAN) ^ (code == VEC_WIDEN_MULT_LO_EXPR))
> +         elts += nelts;
> +
> +       for (i = 0; i < nelts; i++)
> +         {
> +           elts[i] = fold_convert_const (NOP_EXPR, TREE_TYPE (type), elts[i]);
> +           elts[i + nelts * 2]
> +             = fold_convert_const (NOP_EXPR, TREE_TYPE (type),
> +                                   elts[i + nelts * 2]);
> +           if (elts[i] == NULL_TREE || elts[i + nelts * 2] == NULL_TREE)
> +             return NULL_TREE;
> +           elts[i] = const_binop (MULT_EXPR, elts[i], elts[i + nelts * 2]);
> +           if (elts[i] == NULL_TREE || !CONSTANT_CLASS_P (elts[i]))
> +             return NULL_TREE;
> +         }
> +
> +       for (i = 0; i < nelts; i++)
> +         vals = tree_cons (NULL_TREE, elts[nelts - i - 1], vals);
> +       return build_vector (type, vals);
> +      }
> +
>     default:
>       return NULL_TREE;
>     } /* switch (code) */
> --- gcc/testsuite/gcc.dg/vect/vect-122.c.jj     2011-12-02 17:48:27.182059637 +0100
> +++ gcc/testsuite/gcc.dg/vect/vect-122.c        2011-12-02 17:49:05.160880424 +0100
> @@ -0,0 +1,59 @@
> +#include "tree-vect.h"
> +
> +#ifndef N
> +#define N 64
> +#endif
> +
> +char a[N];
> +float b[N];
> +long long l[N], m[N];
> +
> +__attribute__((noinline, noclone)) int
> +f1 (void)
> +{
> +  int i;
> +  for (i = 0; i < N; i++)
> +    a[i] = i;
> +}
> +
> +__attribute__((noinline, noclone)) int
> +f2 (void)
> +{
> +  int i;
> +  for (i = 0; i < N; i++)
> +    b[i] = (double) i;
> +}
> +
> +__attribute__((noinline, noclone)) int
> +f3 (void)
> +{
> +  int i;
> +  for (i = 0; i < N; i++)
> +    l[i] = (long long) i * (i + 7);
> +}
> +
> +__attribute__((noinline, noclone)) int
> +f4 (void)
> +{
> +  int i;
> +  for (i = 0; i < N; i++)
> +    m[i] = (long long) i * 7;
> +}
> +
> +int
> +main ()
> +{
> +  int i;
> +
> +  check_vect ();
> +  f1 ();
> +  f2 ();
> +  f3 ();
> +  f4 ();
> +  for (i = 0; i < N; i++)
> +    if (a[i] != i || b[i] != i || l[i] != i * (i + 7LL) || m[i] != i * 7LL)
> +      abort ();
> +  return 0;
> +}
> +
> +/* { dg-final { cleanup-tree-dump "vect" } } */
>
>
>        Jakub

Reply via email to