On Thu, 26 Jun 2014, Jakub Jelinek wrote:

> On Thu, Jun 26, 2014 at 01:16:41PM +0200, Marc Glisse wrote:
> > > +	  if (compute_type == TREE_TYPE (type)
> > > +	      && !VECTOR_INTEGER_TYPE_P (TREE_TYPE (rhs2)))
> > > +	    {
> > > +	      optab oplv, opl, oprv, opr, opo;
> > > +	      oplv = optab_for_tree_code (LSHIFT_EXPR, type, optab_vector);
> > > +	      /* Right shift always has to be logical, no matter what
> > > +		 signedness type has.  */
> > > +	      oprv = vlshr_optab;
> > > +	      opo = optab_for_tree_code (BIT_IOR_EXPR, type, optab_default);
> > > +	      opl = optab_for_tree_code (LSHIFT_EXPR, type, optab_scalar);
> > > +	      oprv = lshr_optab;
> > > +	      opr = optab_for_tree_code (RSHIFT_EXPR, type, optab_scalar);
> >
> > Looks like there are some typos in there, you are assigning to oprv
> > twice.
>
> Oops, fixed thusly.
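A minimal scalar C sketch of the identity the lowering relies on
(illustrative only, not part of the patch).  The right shift must be
logical no matter the element type's signedness, which is why the fixed
code hardcodes vlshr_optab/lshr_optab instead of asking
optab_for_tree_code for RSHIFT_EXPR:

  #include <stdint.h>

  /* Rotate-left is a shift-left IOR'ed with a LOGICAL shift-right by
     the complementary amount; this is what the vector lowering emits
     per 32-bit lane.  */
  static inline uint32_t
  rotl32 (uint32_t x, unsigned int n)
  {
    n &= 31;			/* count is modulo the lane width */
    return (x << n) | (x >> ((32 - n) & 31));
  }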
Ok.

Thanks,
Richard.

> 2014-06-25  Jakub Jelinek  <ja...@redhat.com>
>
> 	PR tree-optimization/57233
> 	PR tree-optimization/61299
> 	* tree-vect-generic.c (get_compute_type, count_type_subparts): New
> 	functions.
> 	(expand_vector_operations_1): Use them.  If {L,R}ROTATE_EXPR
> 	would be lowered to scalar shifts, check if corresponding
> 	shifts and vector BIT_IOR_EXPR are supported and don't lower
> 	or lower just to narrower vector type in that case.
> 	* expmed.c (expand_shift_1): Fix up handling of vector
> 	shifts and rotates.
>
> 	* gcc.dg/pr57233.c: New test.
> 	* gcc.target/i386/pr57233.c: New test.
> 	* gcc.target/i386/sse2-pr57233.c: New test.
> 	* gcc.target/i386/avx-pr57233.c: New test.
> 	* gcc.target/i386/avx2-pr57233.c: New test.
> 	* gcc.target/i386/avx512f-pr57233.c: New test.
> 	* gcc.target/i386/xop-pr57233.c: New test.
>
> --- gcc/tree-vect-generic.c.jj	2014-06-26 11:00:00.477268305 +0200
> +++ gcc/tree-vect-generic.c	2014-06-26 13:33:33.024069715 +0200
> @@ -1334,15 +1334,67 @@ lower_vec_perm (gimple_stmt_iterator *gs
>      update_stmt (gsi_stmt (*gsi));
>  }
>
> +/* Return type in which CODE operation with optab OP can be
> +   computed.  */
> +
> +static tree
> +get_compute_type (enum tree_code code, optab op, tree type)
> +{
> +  /* For very wide vectors, try using a smaller vector mode.  */
> +  tree compute_type = type;
> +  if (op
> +      && (!VECTOR_MODE_P (TYPE_MODE (type))
> +	  || optab_handler (op, TYPE_MODE (type)) == CODE_FOR_nothing))
> +    {
> +      tree vector_compute_type
> +	= type_for_widest_vector_mode (TREE_TYPE (type), op);
> +      if (vector_compute_type != NULL_TREE
> +	  && (TYPE_VECTOR_SUBPARTS (vector_compute_type)
> +	      < TYPE_VECTOR_SUBPARTS (compute_type))
> +	  && (optab_handler (op, TYPE_MODE (vector_compute_type))
> +	      != CODE_FOR_nothing))
> +	compute_type = vector_compute_type;
> +    }
> +
> +  /* If we are breaking a BLKmode vector into smaller pieces,
> +     type_for_widest_vector_mode has already looked into the optab,
> +     so skip these checks.  */
> +  if (compute_type == type)
> +    {
> +      enum machine_mode compute_mode = TYPE_MODE (compute_type);
> +      if (VECTOR_MODE_P (compute_mode))
> +	{
> +	  if (op && optab_handler (op, compute_mode) != CODE_FOR_nothing)
> +	    return compute_type;
> +	  if (code == MULT_HIGHPART_EXPR
> +	      && can_mult_highpart_p (compute_mode,
> +				      TYPE_UNSIGNED (compute_type)))
> +	    return compute_type;
> +	}
> +      /* There is no operation in hardware, so fall back to scalars.  */
> +      compute_type = TREE_TYPE (type);
> +    }
> +
> +  return compute_type;
> +}
> +
> +/* Helper function of expand_vector_operations_1.  Return number of
> +   vector elements for vector types or 1 for other types.  */
> +
> +static inline int
> +count_type_subparts (tree type)
> +{
> +  return VECTOR_TYPE_P (type) ? TYPE_VECTOR_SUBPARTS (type) : 1;
> +}
> +
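To illustrate what get_compute_type decides (a sketch assuming an
SSE2-only x86_64 target; V16 is the same vector extension typedef the
new gcc.dg/pr57233.c test uses): a 64-byte vector has no machine mode
of its own, so the widest supported vector submode is chosen and the
statement is computed piecewise; only when no vector mode supports the
optab does the element type come back, which means full scalarization.

  typedef unsigned V16 __attribute__ ((vector_size (16 * sizeof (int))));

  V16
  add16 (V16 x, V16 y)
  {
    /* V16 is BLKmode with -msse2, so get_compute_type picks the
       V4SImode vector type and this becomes four 128-bit additions
       instead of sixteen scalar ones.  */
    return x + y;
  }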
>  /* Process one statement.  If we identify a vector operation, expand it.  */
>
>  static void
>  expand_vector_operations_1 (gimple_stmt_iterator *gsi)
>  {
>    gimple stmt = gsi_stmt (*gsi);
> -  tree lhs, rhs1, rhs2 = NULL, type, compute_type;
> +  tree lhs, rhs1, rhs2 = NULL, type, compute_type = NULL_TREE;
>    enum tree_code code;
> -  enum machine_mode compute_mode;
>    optab op = unknown_optab;
>    enum gimple_rhs_class rhs_class;
>    tree new_rhs;
> @@ -1455,11 +1507,76 @@ expand_vector_operations_1 (gimple_stmt_
> 	{
>            op = optab_for_tree_code (code, type, optab_scalar);
>
> +	  compute_type = get_compute_type (code, op, type);
> +	  if (compute_type == type)
> +	    return;
> 	  /* The rtl expander will expand vector/scalar as vector/vector
> -	     if necessary.  Don't bother converting the stmt here.  */
> -	  if (optab_handler (op, TYPE_MODE (type)) == CODE_FOR_nothing
> -	      && optab_handler (opv, TYPE_MODE (type)) != CODE_FOR_nothing)
> +	     if necessary.  Pick one with wider vector type.  */
> +	  tree compute_vtype = get_compute_type (code, opv, type);
> +	  if (count_type_subparts (compute_vtype)
> +	      > count_type_subparts (compute_type))
> +	    {
> +	      compute_type = compute_vtype;
> +	      op = opv;
> +	    }
> +	}
> +
> +      if (code == LROTATE_EXPR || code == RROTATE_EXPR)
> +	{
> +	  if (compute_type == NULL_TREE)
> +	    compute_type = get_compute_type (code, op, type);
> +	  if (compute_type == type)
> 	    return;
> +	  /* Before splitting vector rotates into scalar rotates,
> +	     see if we can't use vector shifts and BIT_IOR_EXPR
> +	     instead.  For vector by vector rotates we'd also
> +	     need to check BIT_AND_EXPR and NEGATE_EXPR, punt there
> +	     for now, fold doesn't seem to create such rotates anyway.  */
> +	  if (compute_type == TREE_TYPE (type)
> +	      && !VECTOR_INTEGER_TYPE_P (TREE_TYPE (rhs2)))
> +	    {
> +	      optab oplv = vashl_optab, opl = ashl_optab;
> +	      optab oprv = vlshr_optab, opr = lshr_optab, opo = ior_optab;
> +	      tree compute_lvtype = get_compute_type (LSHIFT_EXPR, oplv, type);
> +	      tree compute_rvtype = get_compute_type (RSHIFT_EXPR, oprv, type);
> +	      tree compute_otype = get_compute_type (BIT_IOR_EXPR, opo, type);
> +	      tree compute_ltype = get_compute_type (LSHIFT_EXPR, opl, type);
> +	      tree compute_rtype = get_compute_type (RSHIFT_EXPR, opr, type);
> +	      /* The rtl expander will expand vector/scalar as vector/vector
> +		 if necessary.  Pick one with wider vector type.  */
> +	      if (count_type_subparts (compute_lvtype)
> +		  > count_type_subparts (compute_ltype))
> +		{
> +		  compute_ltype = compute_lvtype;
> +		  opl = oplv;
> +		}
> +	      if (count_type_subparts (compute_rvtype)
> +		  > count_type_subparts (compute_rtype))
> +		{
> +		  compute_rtype = compute_rvtype;
> +		  opr = oprv;
> +		}
> +	      /* Pick the narrowest type from LSHIFT_EXPR, RSHIFT_EXPR and
> +		 BIT_IOR_EXPR.  */
> +	      compute_type = compute_ltype;
> +	      if (count_type_subparts (compute_type)
> +		  > count_type_subparts (compute_rtype))
> +		compute_type = compute_rtype;
> +	      if (count_type_subparts (compute_type)
> +		  > count_type_subparts (compute_otype))
> +		compute_type = compute_otype;
> +	      /* Verify all 3 operations can be performed in that type.  */
> +	      if (compute_type != TREE_TYPE (type))
> +		{
> +		  if (optab_handler (opl, TYPE_MODE (compute_type))
> +		      == CODE_FOR_nothing
> +		      || optab_handler (opr, TYPE_MODE (compute_type))
> +			 == CODE_FOR_nothing
> +		      || optab_handler (opo, TYPE_MODE (compute_type))
> +			 == CODE_FOR_nothing)
> +		    compute_type = TREE_TYPE (type);
> +		}
> +	    }
> 	}
>      }
>    else
> @@ -1473,38 +1590,10 @@ expand_vector_operations_1 (gimple_stmt_
>        && INTEGRAL_TYPE_P (TREE_TYPE (type)))
>      op = optab_for_tree_code (MINUS_EXPR, type, optab_default);
>
> -  /* For very wide vectors, try using a smaller vector mode.  */
> -  compute_type = type;
> -  if (!VECTOR_MODE_P (TYPE_MODE (type)) && op)
> -    {
> -      tree vector_compute_type
> -	= type_for_widest_vector_mode (TREE_TYPE (type), op);
> -      if (vector_compute_type != NULL_TREE
> -	  && (TYPE_VECTOR_SUBPARTS (vector_compute_type)
> -	      < TYPE_VECTOR_SUBPARTS (compute_type))
> -	  && (optab_handler (op, TYPE_MODE (vector_compute_type))
> -	      != CODE_FOR_nothing))
> -	compute_type = vector_compute_type;
> -    }
> -
> -  /* If we are breaking a BLKmode vector into smaller pieces,
> -     type_for_widest_vector_mode has already looked into the optab,
> -     so skip these checks.  */
> +  if (compute_type == NULL_TREE)
> +    compute_type = get_compute_type (code, op, type);
>    if (compute_type == type)
> -    {
> -      compute_mode = TYPE_MODE (compute_type);
> -      if (VECTOR_MODE_P (compute_mode))
> -	{
> -	  if (op && optab_handler (op, compute_mode) != CODE_FOR_nothing)
> -	    return;
> -	  if (code == MULT_HIGHPART_EXPR
> -	      && can_mult_highpart_p (compute_mode,
> -				      TYPE_UNSIGNED (compute_type)))
> -	    return;
> -	}
> -      /* There is no operation in hardware, so fall back to scalars.  */
> -      compute_type = TREE_TYPE (type);
> -    }
> +    return;
>
>    gcc_assert (code != VEC_LSHIFT_EXPR && code != VEC_RSHIFT_EXPR);
>    new_rhs = expand_vector_operation (gsi, type, compute_type, stmt, code);
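Concretely (a sketch using the same typedef as the new tests; assumes a
target such as plain AVX, which has vector shifts and vector IOR but no
vector rotate, unlike XOP's vprotd, hence the -mno-xop in the
scan-assembler test below): the point of the new block above is that
the rotate here stays a three-instruction vector sequence instead of
being split into per-element scalar rotates.

  typedef unsigned V4 __attribute__ ((vector_size (4 * sizeof (int))));

  V4
  rol2 (V4 x)
  {
    /* Recognized as a V4 rotate-left by 2; now lowered back to vector
       shift-left, logical vector shift-right and vector IOR, i.e. the
       vpslld/vpsrld/vpor triple the i386 test scans for.  */
    return (x << 2) | (x >> 30);
  }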
> --- gcc/expmed.c.jj	2014-06-26 11:00:00.034270690 +0200
> +++ gcc/expmed.c	2014-06-26 13:28:49.802581691 +0200
> @@ -2128,9 +2128,12 @@ expand_shift_1 (enum tree_code code, enu
>    optab lrotate_optab = rotl_optab;
>    optab rrotate_optab = rotr_optab;
>    enum machine_mode op1_mode;
> +  enum machine_mode scalar_mode = mode;
>    int attempt;
>    bool speed = optimize_insn_for_speed_p ();
>
> +  if (VECTOR_MODE_P (mode))
> +    scalar_mode = GET_MODE_INNER (mode);
>    op1 = amount;
>    op1_mode = GET_MODE (op1);
>
> @@ -2153,9 +2156,9 @@ expand_shift_1 (enum tree_code code, enu
>      {
>        if (CONST_INT_P (op1)
> 	  && ((unsigned HOST_WIDE_INT) INTVAL (op1) >=
> -	      (unsigned HOST_WIDE_INT) GET_MODE_BITSIZE (mode)))
> +	      (unsigned HOST_WIDE_INT) GET_MODE_BITSIZE (scalar_mode)))
> 	op1 = GEN_INT ((unsigned HOST_WIDE_INT) INTVAL (op1)
> -		       % GET_MODE_BITSIZE (mode));
> +		       % GET_MODE_BITSIZE (scalar_mode));
>        else if (GET_CODE (op1) == SUBREG
> 	       && subreg_lowpart_p (op1)
> 	       && SCALAR_INT_MODE_P (GET_MODE (SUBREG_REG (op1)))
> @@ -2169,10 +2172,10 @@ expand_shift_1 (enum tree_code code, enu
> 	 amount instead.  */
>        if (rotate
> 	  && CONST_INT_P (op1)
> -	  && IN_RANGE (INTVAL (op1), GET_MODE_BITSIZE (mode) / 2 + left,
> -		       GET_MODE_BITSIZE (mode) - 1))
> +	  && IN_RANGE (INTVAL (op1), GET_MODE_BITSIZE (scalar_mode) / 2 + left,
> +		       GET_MODE_BITSIZE (scalar_mode) - 1))
> 	{
> -	  op1 = GEN_INT (GET_MODE_BITSIZE (mode) - INTVAL (op1));
> +	  op1 = GEN_INT (GET_MODE_BITSIZE (scalar_mode) - INTVAL (op1));
> 	  left = !left;
> 	  code = left ? LROTATE_EXPR : RROTATE_EXPR;
> 	}
> @@ -2185,7 +2188,7 @@ expand_shift_1 (enum tree_code code, enu
>    if (code == LSHIFT_EXPR
>        && CONST_INT_P (op1)
>        && INTVAL (op1) > 0
> -      && INTVAL (op1) < GET_MODE_PRECISION (mode)
> +      && INTVAL (op1) < GET_MODE_PRECISION (scalar_mode)
>        && INTVAL (op1) < MAX_BITS_PER_WORD
>        && (shift_cost (speed, mode, INTVAL (op1))
> 	  > INTVAL (op1) * add_cost (speed, mode))
> @@ -2240,14 +2243,14 @@ expand_shift_1 (enum tree_code code, enu
> 	  if (op1 == const0_rtx)
> 	    return shifted;
> 	  else if (CONST_INT_P (op1))
> -	    other_amount = GEN_INT (GET_MODE_BITSIZE (mode)
> +	    other_amount = GEN_INT (GET_MODE_BITSIZE (scalar_mode)
> 				    - INTVAL (op1));
> 	  else
> 	    {
> 	      other_amount
> 		= simplify_gen_unary (NEG, GET_MODE (op1),
> 				      op1, GET_MODE (op1));
> -	      HOST_WIDE_INT mask = GET_MODE_PRECISION (mode) - 1;
> +	      HOST_WIDE_INT mask = GET_MODE_PRECISION (scalar_mode) - 1;
> 	      other_amount
> 		= simplify_gen_binary (AND, GET_MODE (op1), other_amount,
> 				       gen_int_mode (mask, GET_MODE (op1)));
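A worked check of why the element mode matters here (a self-contained
sketch with a hypothetical rotl32 helper, merely restating the
arithmetic): for a V4SImode rotate, GET_MODE_BITSIZE (mode) is 128
while each lane is 32 bits, so flipping "rotate left by 30" to the
opposite direction must subtract from 32, not from 128.

  #include <assert.h>
  #include <stdint.h>

  static uint32_t
  rotl32 (uint32_t x, unsigned int n)	/* hypothetical helper */
  {
    n &= 31;
    return (x << n) | (x >> ((32 - n) & 31));
  }

  int
  main (void)
  {
    uint32_t x = 0xdeadbeefu;
    /* rotl 30 == rotr 2 because 32 - 30 == 2; using the vector bitsize
       would yield a flipped count of 128 - 30 == 98, nonsense for a
       32-bit lane.  */
    assert (rotl32 (x, 30) == ((x >> 2) | (x << 30)));
    return 0;
  }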
> --- gcc/testsuite/gcc.target/i386/sse2-pr57233.c.jj	2014-06-26 13:28:49.802581691 +0200
> +++ gcc/testsuite/gcc.target/i386/sse2-pr57233.c	2014-06-26 13:28:49.802581691 +0200
> @@ -0,0 +1,16 @@
> +/* PR tree-optimization/57233 */
> +/* { dg-do run { target sse2 } } */
> +/* { dg-options "-O2 -msse2" } */
> +
> +#include "sse2-check.h"
> +
> +static void
> +sse2_test (void)
> +{
> +  do_main ();
> +}
> +
> +#undef main
> +#define main() do_main ()
> +
> +#include "../../gcc.dg/pr57233.c"
> --- gcc/testsuite/gcc.target/i386/pr57233.c.jj	2014-06-26 13:28:49.802581691 +0200
> +++ gcc/testsuite/gcc.target/i386/pr57233.c	2014-06-26 13:28:49.802581691 +0200
> @@ -0,0 +1,15 @@
> +/* PR tree-optimization/57233 */
> +/* { dg-do compile { target avx } } */
> +/* { dg-options "-O2 -mavx -mno-xop" } */
> +
> +typedef unsigned V4 __attribute__((vector_size(4 * sizeof (int))));
> +V4 a;
> +
> +__attribute__((noinline)) void
> +foo (void)
> +{
> +  a = (a << 2) | (a >> 30);
> +}
> +
> +/* { dg-final { scan-assembler "vpsrld\[^\n\r]*30" } } */
> +/* { dg-final { scan-assembler "vpslld\[^\n\r]*2" } } */
> --- gcc/testsuite/gcc.target/i386/xop-pr57233.c.jj	2014-06-26 13:28:49.802581691 +0200
> +++ gcc/testsuite/gcc.target/i386/xop-pr57233.c	2014-06-26 13:28:49.802581691 +0200
> @@ -0,0 +1,16 @@
> +/* PR tree-optimization/57233 */
> +/* { dg-do run { target xop } } */
> +/* { dg-options "-O2 -mxop" } */
> +
> +#include "xop-check.h"
> +
> +static void
> +xop_test (void)
> +{
> +  do_main ();
> +}
> +
> +#undef main
> +#define main() do_main ()
> +
> +#include "../../gcc.dg/pr57233.c"
> --- gcc/testsuite/gcc.target/i386/avx2-pr57233.c.jj	2014-06-26 13:28:49.803581572 +0200
> +++ gcc/testsuite/gcc.target/i386/avx2-pr57233.c	2014-06-26 13:28:49.802581691 +0200
> @@ -0,0 +1,16 @@
> +/* PR tree-optimization/57233 */
> +/* { dg-do run { target avx2 } } */
> +/* { dg-options "-O2 -mavx2" } */
> +
> +#include "avx2-check.h"
> +
> +static void
> +avx2_test (void)
> +{
> +  do_main ();
> +}
> +
> +#undef main
> +#define main() do_main ()
> +
> +#include "../../gcc.dg/pr57233.c"
> --- gcc/testsuite/gcc.target/i386/avx-pr57233.c.jj	2014-06-26 13:28:49.803581572 +0200
> +++ gcc/testsuite/gcc.target/i386/avx-pr57233.c	2014-06-26 13:28:49.803581572 +0200
> @@ -0,0 +1,16 @@
> +/* PR tree-optimization/57233 */
> +/* { dg-do run { target avx } } */
> +/* { dg-options "-O2 -mavx" } */
> +
> +#include "avx-check.h"
> +
> +static void
> +avx_test (void)
> +{
> +  do_main ();
> +}
> +
> +#undef main
> +#define main() do_main ()
> +
> +#include "../../gcc.dg/pr57233.c"
> --- gcc/testsuite/gcc.target/i386/avx512f-pr57233.c.jj	2014-06-26 13:28:49.803581572 +0200
> +++ gcc/testsuite/gcc.target/i386/avx512f-pr57233.c	2014-06-26 13:28:49.803581572 +0200
> @@ -0,0 +1,16 @@
> +/* PR tree-optimization/57233 */
> +/* { dg-do run { target avx512f } } */
> +/* { dg-options "-O2 -mavx512f" } */
> +
> +#include "avx512f-check.h"
> +
> +static void
> +avx512f_test (void)
> +{
> +  do_main ();
> +}
> +
> +#undef main
> +#define main() do_main ()
> +
> +#include "../../gcc.dg/pr57233.c"
> --- gcc/testsuite/gcc.dg/pr57233.c.jj	2014-06-26 13:28:49.803581572 +0200
> +++ gcc/testsuite/gcc.dg/pr57233.c	2014-06-26 13:28:49.803581572 +0200
> @@ -0,0 +1,171 @@
> +/* PR tree-optimization/57233 */
> +/* { dg-do run { target { ilp32 || lp64 } } } */
> +/* { dg-options "-O2" } */
> +
> +typedef unsigned V4 __attribute__((vector_size(4 * sizeof (int))));
> +typedef unsigned V8 __attribute__((vector_size(8 * sizeof (int))));
> +typedef unsigned V16 __attribute__((vector_size(16 * sizeof (int))));
> +V4 a, b, g;
> +V8 c, d, h;
> +V16 e, f, j;
> +
> +__attribute__((noinline)) void
> +f1 (void)
> +{
> +  a = (a << 2) | (a >> 30);
> +}
> +
> +__attribute__((noinline)) void
> +f2 (void)
> +{
> +  a = (a << 30) | (a >> 2);
> +}
> +
> +__attribute__((noinline)) void
> +f3 (void)
> +{
> +  a = (a << b) | (a >> (32 - b));
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f4 (int x)
> +{
> +  a = (a << x) | (a >> (32 - x));
> +}
> +
> +__attribute__((noinline)) void
> +f5 (void)
> +{
> +  c = (c << 2) | (c >> 30);
> +}
> +
> +__attribute__((noinline)) void
> +f6 (void)
> +{
> +  c = (c << 30) | (c >> 2);
> +}
> +
> +__attribute__((noinline)) void
> +f7 (void)
> +{
> +  c = (c << d) | (c >> (32 - d));
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f8 (int x)
> +{
> +  c = (c << x) | (c >> (32 - x));
> +}
> +
> +__attribute__((noinline)) void
> +f9 (void)
> +{
> +  e = (e << 2) | (e >> 30);
> +}
> +
> +__attribute__((noinline)) void
> +f10 (void)
> +{
> +  e = (e << 30) | (e >> 2);
> +}
> +
> +__attribute__((noinline)) void
> +f11 (void)
> +{
> +  e = (e << f) | (e >> (32 - f));
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f12 (int x)
> +{
> +  e = (e << x) | (e >> (32 - x));
> +}
> +
> +unsigned
> +r (void)
> +{
> +  static unsigned x = 0xdeadbeefU;
> +  static unsigned y = 0x12347654U;
> +  static unsigned z = 0x1a2b3c4dU;
> +  static unsigned w = 0x87654321U;
> +  unsigned t = x ^ (x << 11);
> +  x = y;
> +  y = z;
> +  z = w;
> +  w = w ^ (w >> 19) ^ t ^ (t >> 8);
> +  return w;
> +}
> +
> +void
> +init (unsigned int *p, int count, int mod)
> +{
> +  int i;
> +  for (i = 0; i < count; i++)
> +    {
> +      unsigned int v = r ();
> +      if (mod)
> +	v = (v % 31) + 1;
> +      p[i] = v;
> +    }
> +}
> +
> +void
> +check (unsigned int *p, unsigned int *q, int count, unsigned int *s, int ss)
> +{
> +  int i;
> +  for (i = 0; i < count; i++)
> +    {
> +      if (s)
> +	ss = s[i];
> +      if (p[i] != ((q[i] << ss) | (q[i] >> (32 - ss))))
> +	__builtin_abort ();
> +    }
> +}
> +
> +int
> +main ()
> +{
> +  init ((unsigned int *) &a, 4, 0);
> +  init ((unsigned int *) &b, 4, 1);
> +  init ((unsigned int *) &c, 8, 0);
> +  init ((unsigned int *) &d, 8, 1);
> +  init ((unsigned int *) &e, 16, 0);
> +  init ((unsigned int *) &f, 16, 1);
> +  g = a;
> +  h = c;
> +  j = e;
> +  f1 ();
> +  f5 ();
> +  f9 ();
> +  check ((unsigned int *) &a, (unsigned int *) &g, 4, 0, 2);
> +  check ((unsigned int *) &c, (unsigned int *) &h, 8, 0, 2);
> +  check ((unsigned int *) &e, (unsigned int *) &j, 16, 0, 2);
> +  g = a;
> +  h = c;
> +  j = e;
> +  f2 ();
> +  f6 ();
> +  f10 ();
> +  check ((unsigned int *) &a, (unsigned int *) &g, 4, 0, 30);
> +  check ((unsigned int *) &c, (unsigned int *) &h, 8, 0, 30);
> +  check ((unsigned int *) &e, (unsigned int *) &j, 16, 0, 30);
> +  g = a;
> +  h = c;
> +  j = e;
> +  f3 ();
> +  f7 ();
> +  f11 ();
> +  check ((unsigned int *) &a, (unsigned int *) &g, 4, (unsigned int *) &b, 0);
> +  check ((unsigned int *) &c, (unsigned int *) &h, 8, (unsigned int *) &d, 0);
> +  check ((unsigned int *) &e, (unsigned int *) &j, 16, (unsigned int *) &f, 0);
> +  g = a;
> +  h = c;
> +  j = e;
> +  f4 (5);
> +  f8 (5);
> +  f12 (5);
> +  check ((unsigned int *) &a, (unsigned int *) &g, 4, 0, 5);
> +  check ((unsigned int *) &c, (unsigned int *) &h, 8, 0, 5);
> +  check ((unsigned int *) &e, (unsigned int *) &j, 16, 0, 5);
> +  return 0;
> +}
>
> 	Jakub

-- 
Richard Biener <rguent...@suse.de>
SUSE / SUSE Labs
SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer