On Tue, Apr 7, 2020 at 12:51 AM Jakub Jelinek <ja...@redhat.com> wrote: > > Hi! > > The following testcase is miscompiled in 8.x, because emit_reduc_half is > prepared to handle for 512-bit modes only i equal to 512, 256, 128 and 64. > V32HImode also needs i equal to 32 and V64QImode i equal to 32 and 16, > but emit_reduc_half in that case performs a redundant permutation exactly > like i == 64. In 9+ the testcase works because Richard in r9-3393 > changed the reduc_* expanders so that they actually don't call > ix86_expand_reduc on 512-bit modes, but only 128-bit ones. > > The patch fixes emit_reduc_half to handle also i of 32 and 16 similarly to > how V32QImode/V16HImode are handled for AVX2. I think it shouldn't hurt > to fix the function even on the trunk and 9 branch even when nothing uses > it ATM. > > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk/9 and > primarily for 8.5 (obviously in that case s/i386-expand/i386/)? > > 2020-04-06 Jakub Jelinek <ja...@redhat.com> > > PR target/94500 > * config/i386/i386-expand.c (emit_reduc_half): For V{64QI,32HI}mode > handle i < 64 using avx512bw_lshrv4ti3. Formatting fixes. > > * gcc.target/i386/avx512bw-pr94500.c: New test.
OK everywhere. Thanks, Uros. > --- gcc/config/i386/i386-expand.c.jj 2020-03-29 19:26:31.748561262 +0200 > +++ gcc/config/i386/i386-expand.c 2020-04-06 17:18:44.906242980 +0200 > @@ -14891,43 +14891,51 @@ emit_reduc_half (rtx dest, rtx src, int > break; > case E_V64QImode: > case E_V32HImode: > + if (i < 64) > + { > + d = gen_reg_rtx (V4TImode); > + tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src), > + GEN_INT (i / 2)); > + break; > + } > + /* FALLTHRU */ > case E_V16SImode: > case E_V16SFmode: > case E_V8DImode: > case E_V8DFmode: > if (i > 128) > tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest), > - gen_lowpart (V16SImode, src), > - gen_lowpart (V16SImode, src), > - GEN_INT (0x4 + (i == 512 ? 4 : 0)), > - GEN_INT (0x5 + (i == 512 ? 4 : 0)), > - GEN_INT (0x6 + (i == 512 ? 4 : 0)), > - GEN_INT (0x7 + (i == 512 ? 4 : 0)), > - GEN_INT (0xC), GEN_INT (0xD), > - GEN_INT (0xE), GEN_INT (0xF), > - GEN_INT (0x10), GEN_INT (0x11), > - GEN_INT (0x12), GEN_INT (0x13), > - GEN_INT (0x14), GEN_INT (0x15), > - GEN_INT (0x16), GEN_INT (0x17)); > + gen_lowpart (V16SImode, src), > + gen_lowpart (V16SImode, src), > + GEN_INT (0x4 + (i == 512 ? 4 : 0)), > + GEN_INT (0x5 + (i == 512 ? 4 : 0)), > + GEN_INT (0x6 + (i == 512 ? 4 : 0)), > + GEN_INT (0x7 + (i == 512 ? 4 : 0)), > + GEN_INT (0xC), GEN_INT (0xD), > + GEN_INT (0xE), GEN_INT (0xF), > + GEN_INT (0x10), GEN_INT (0x11), > + GEN_INT (0x12), GEN_INT (0x13), > + GEN_INT (0x14), GEN_INT (0x15), > + GEN_INT (0x16), GEN_INT (0x17)); > else > tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest), > - gen_lowpart (V16SImode, src), > - GEN_INT (i == 128 ? 0x2 : 0x1), > - GEN_INT (0x3), > - GEN_INT (0x3), > - GEN_INT (0x3), > - GEN_INT (i == 128 ? 0x6 : 0x5), > - GEN_INT (0x7), > - GEN_INT (0x7), > - GEN_INT (0x7), > - GEN_INT (i == 128 ? 0xA : 0x9), > - GEN_INT (0xB), > - GEN_INT (0xB), > - GEN_INT (0xB), > - GEN_INT (i == 128 ? 
0xE : 0xD), > - GEN_INT (0xF), > - GEN_INT (0xF), > - GEN_INT (0xF)); > + gen_lowpart (V16SImode, src), > + GEN_INT (i == 128 ? 0x2 : 0x1), > + GEN_INT (0x3), > + GEN_INT (0x3), > + GEN_INT (0x3), > + GEN_INT (i == 128 ? 0x6 : 0x5), > + GEN_INT (0x7), > + GEN_INT (0x7), > + GEN_INT (0x7), > + GEN_INT (i == 128 ? 0xA : 0x9), > + GEN_INT (0xB), > + GEN_INT (0xB), > + GEN_INT (0xB), > + GEN_INT (i == 128 ? 0xE : 0xD), > + GEN_INT (0xF), > + GEN_INT (0xF), > + GEN_INT (0xF)); > break; > default: > gcc_unreachable (); > --- gcc/testsuite/gcc.target/i386/avx512bw-pr94500.c.jj 2020-04-06 > 17:24:42.246904934 +0200 > +++ gcc/testsuite/gcc.target/i386/avx512bw-pr94500.c 2020-04-06 > 17:26:03.721687840 +0200 > @@ -0,0 +1,28 @@ > +/* PR target/94500 */ > +/* { dg-do run { target avx512bw } } */ > +/* { dg-options "-O3 -mavx512bw -mprefer-vector-width=512" } */ > + > +#define AVX512BW > +#include "avx512f-helper.h" > + > +__attribute__((noipa)) signed char > +foo (signed char *p) > +{ > + signed char r = 0; > + int i; > + for (i = 0; i < 256; i++) > + if (p[i] > r) r = p[i]; > + return r; > +} > + > +signed char buf[256]; > + > +static void > +TEST (void) > +{ > + int i; > + for (i = 0; i < 256; i++) > + buf[i] = i - 128; > + if (foo (buf) != 127) > + abort (); > +} > > Jakub >