On Fri, Dec 2, 2011 at 8:26 PM, Jakub Jelinek <ja...@redhat.com> wrote:
> Hi!
>
> As I found during investigation of PR51387, e.g. on the attached testcase
> we generate pretty bad code (for f1 even with bigger N like 256 for avx2),
> because after vectorization cunroll unrolls the loops completely and we
> end up with lots of VEC_PACK_TRUNC_EXPR etc. expressions with VECTOR_CST
> arguments. We don't fold them, thus we read lots of constants from memory
> and reshuffle them in lots of code. This patch adds folding for these
> expressions; on this testcase we end up with the same number of constants
> loaded from memory, but with no need to reshuffle them.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
Ok. Thanks, Richard. > 2011-12-02 Jakub Jelinek <ja...@redhat.com> > > * fold-const.c (fold_unary_loc): Fold VEC_UNPACK_LO_EXPR, > VEC_UNPACK_HI_EXPR, VEC_UNPACK_FLOAT_LO_EXPR and > VEC_UNPACK_FLOAT_HI_EXPR with VECTOR_CST argument. > (fold_binary_loc): Fold VEC_PACK_TRUNC_EXPR, > VEC_PACK_FIX_TRUNC_EXPR, VEC_WIDEN_MULT_LO_EXPR > and VEC_WIDEN_MULT_HI_EXPR with VECTOR_CST arguments. > > * gcc.dg/vect/vect-122.c: New test. > > --- gcc/fold-const.c.jj 2011-12-02 01:52:26.000000000 +0100 > +++ gcc/fold-const.c 2011-12-02 17:43:09.246557524 +0100 > @@ -7651,6 +7651,8 @@ build_fold_addr_expr_loc (location_t loc > return build_fold_addr_expr_with_type_loc (loc, t, ptrtype); > } > > +static bool vec_cst_ctor_to_array (tree, tree *); > + > /* Fold a unary expression of code CODE and type TYPE with operand > OP0. Return the folded expression if folding is successful. > Otherwise, return NULL_TREE. */ > @@ -8294,6 +8296,44 @@ fold_unary_loc (location_t loc, enum tre > } > return NULL_TREE; > > + case VEC_UNPACK_LO_EXPR: > + case VEC_UNPACK_HI_EXPR: > + case VEC_UNPACK_FLOAT_LO_EXPR: > + case VEC_UNPACK_FLOAT_HI_EXPR: > + { > + unsigned int nelts = TYPE_VECTOR_SUBPARTS (type), i; > + tree *elts, vals = NULL_TREE; > + enum tree_code subcode; > + > + gcc_assert (TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)) == nelts * 2); > + if (TREE_CODE (arg0) != VECTOR_CST) > + return NULL_TREE; > + > + elts = XALLOCAVEC (tree, nelts * 2); > + if (!vec_cst_ctor_to_array (arg0, elts)) > + return NULL_TREE; > + > + if ((!BYTES_BIG_ENDIAN) ^ (code == VEC_UNPACK_LO_EXPR > + || code == VEC_UNPACK_FLOAT_LO_EXPR)) > + elts += nelts; > + > + if (code == VEC_UNPACK_LO_EXPR || code == VEC_UNPACK_HI_EXPR) > + subcode = NOP_EXPR; > + else > + subcode = FLOAT_EXPR; > + > + for (i = 0; i < nelts; i++) > + { > + elts[i] = fold_convert_const (subcode, TREE_TYPE (type), elts[i]); > + if (elts[i] == NULL_TREE || !CONSTANT_CLASS_P (elts[i])) > + return NULL_TREE; > + } > + > + for (i = 0; i < nelts; i++) > + vals = 
tree_cons (NULL_TREE, elts[nelts - i - 1], vals); > + return build_vector (type, vals); > + } > + > default: > return NULL_TREE; > } /* switch (code) */ > @@ -13498,6 +13538,73 @@ fold_binary_loc (location_t loc, > } > return NULL_TREE; > > + case VEC_PACK_TRUNC_EXPR: > + case VEC_PACK_FIX_TRUNC_EXPR: > + { > + unsigned int nelts = TYPE_VECTOR_SUBPARTS (type), i; > + tree *elts, vals = NULL_TREE; > + > + gcc_assert (TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)) == nelts / 2 > + && TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg1)) == nelts / 2); > + if (TREE_CODE (arg0) != VECTOR_CST || TREE_CODE (arg1) != VECTOR_CST) > + return NULL_TREE; > + > + elts = XALLOCAVEC (tree, nelts); > + if (!vec_cst_ctor_to_array (arg0, elts) > + || !vec_cst_ctor_to_array (arg1, elts + nelts / 2)) > + return NULL_TREE; > + > + for (i = 0; i < nelts; i++) > + { > + elts[i] = fold_convert_const (code == VEC_PACK_TRUNC_EXPR > + ? NOP_EXPR : FIX_TRUNC_EXPR, > + TREE_TYPE (type), elts[i]); > + if (elts[i] == NULL_TREE || !CONSTANT_CLASS_P (elts[i])) > + return NULL_TREE; > + } > + > + for (i = 0; i < nelts; i++) > + vals = tree_cons (NULL_TREE, elts[nelts - i - 1], vals); > + return build_vector (type, vals); > + } > + > + case VEC_WIDEN_MULT_LO_EXPR: > + case VEC_WIDEN_MULT_HI_EXPR: > + { > + unsigned int nelts = TYPE_VECTOR_SUBPARTS (type), i; > + tree *elts, vals = NULL_TREE; > + > + gcc_assert (TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)) == nelts * 2 > + && TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg1)) == nelts * 2); > + if (TREE_CODE (arg0) != VECTOR_CST || TREE_CODE (arg1) != VECTOR_CST) > + return NULL_TREE; > + > + elts = XALLOCAVEC (tree, nelts * 4); > + if (!vec_cst_ctor_to_array (arg0, elts) > + || !vec_cst_ctor_to_array (arg1, elts + nelts * 2)) > + return NULL_TREE; > + > + if ((!BYTES_BIG_ENDIAN) ^ (code == VEC_WIDEN_MULT_LO_EXPR)) > + elts += nelts; > + > + for (i = 0; i < nelts; i++) > + { > + elts[i] = fold_convert_const (NOP_EXPR, TREE_TYPE (type), > elts[i]); > + elts[i + nelts * 2] > + = 
fold_convert_const (NOP_EXPR, TREE_TYPE (type), > + elts[i + nelts * 2]); > + if (elts[i] == NULL_TREE || elts[i + nelts * 2] == NULL_TREE) > + return NULL_TREE; > + elts[i] = const_binop (MULT_EXPR, elts[i], elts[i + nelts * 2]); > + if (elts[i] == NULL_TREE || !CONSTANT_CLASS_P (elts[i])) > + return NULL_TREE; > + } > + > + for (i = 0; i < nelts; i++) > + vals = tree_cons (NULL_TREE, elts[nelts - i - 1], vals); > + return build_vector (type, vals); > + } > + > default: > return NULL_TREE; > } /* switch (code) */ > --- gcc/testsuite/gcc.dg/vect/vect-122.c.jj 2011-12-02 17:48:27.182059637 > +0100 > +++ gcc/testsuite/gcc.dg/vect/vect-122.c 2011-12-02 17:49:05.160880424 > +0100 > @@ -0,0 +1,59 @@ > +#include "tree-vect.h" > + > +#ifndef N > +#define N 64 > +#endif > + > +char a[N]; > +float b[N]; > +long long l[N], m[N]; > + > +__attribute__((noinline, noclone)) int > +f1 (void) > +{ > + int i; > + for (i = 0; i < N; i++) > + a[i] = i; > +} > + > +__attribute__((noinline, noclone)) int > +f2 (void) > +{ > + int i; > + for (i = 0; i < N; i++) > + b[i] = (double) i; > +} > + > +__attribute__((noinline, noclone)) int > +f3 (void) > +{ > + int i; > + for (i = 0; i < N; i++) > + l[i] = (long long) i * (i + 7); > +} > + > +__attribute__((noinline, noclone)) int > +f4 (void) > +{ > + int i; > + for (i = 0; i < N; i++) > + m[i] = (long long) i * 7; > +} > + > +int > +main () > +{ > + int i; > + > + check_vect (); > + f1 (); > + f2 (); > + f3 (); > + f4 (); > + for (i = 0; i < N; i++) > + if (a[i] != i || b[i] != i || l[i] != i * (i + 7LL) || m[i] != i * 7LL) > + abort (); > + return 0; > +} > + > +/* { dg-final { cleanup-tree-dump "vect" } } */ > > > Jakub