On Wed, Jun 26, 2019 at 10:11 AM Jakub Jelinek <ja...@redhat.com> wrote:
>
> Hi!
>
> The following testcase is miscompiled starting with my PR85480 change.
> While it is perfectly fine to use "xm" or "vm" constraints for the source
> operand when the other operand is "C", we rely on the AVX/AVX512 behavior
> that most 128-bit or 256-bit vector instructions clear the upper bits of
> the full 512-bit register; but if the source is in memory, we need to
> check whether it is known to be aligned or possibly misaligned, and emit
> the corresponding aligned or unaligned load, rather than always the
> aligned one.
>
> Fixed thusly, bootstrapped/regtested on x86_64-linux and i686-linux, ok
> for trunk and 9.2?
>
> Note, I have a follow-up patch to improve avx_vec_concat<mode>; it just
> needs testing.
>
> 2019-06-26  Jakub Jelinek  <ja...@redhat.com>
>
> 	PR target/90991
> 	* config/i386/sse.md
> 	(*<extract_type>_vinsert<shuffletype><extract_suf>_0): Use vmovupd,
> 	vmovups, vmovdqu, vmovdqu32 or vmovdqu64 instead of the aligned
> 	insns if operands[2] is misaligned_operand.
>
> 	* gcc.target/i386/avx512dq-pr90991-1.c: New test.
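For context, the failure mode boils down to the pattern below, a minimal
sketch distilled from the PR and the testcase later in this mail (the
function name insert_lo is illustrative only; compile with -O2 -mavx512dq):

    #include <x86intrin.h>

    __m512d
    insert_lo (const double *p)
    {
      /* Unaligned 128-bit load ...  */
      __m128d lo = _mm_loadu_pd (p);
      /* ... inserted at position 0 of an all-zero 512-bit vector.  The
         insn pattern prints a plain 128-bit move here and relies on
         AVX512 zeroing bits 128..511; before the fix it always printed
         the aligned "vmovapd", which faults whenever p is not 16-byte
         aligned.  With the fix it prints "vmovupd" instead.  */
      return _mm512_insertf64x2 (_mm512_set1_pd (0.0), lo, 0);
    }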
OK, this one looks obvious even to me.

Thanks, Uros.

> --- gcc/config/i386/sse.md.jj	2019-06-21 08:43:17.734263742 +0200
> +++ gcc/config/i386/sse.md	2019-06-25 22:36:12.955476294 +0200
> @@ -13744,15 +13744,29 @@ (define_insn "*<extract_type>_vinsert<sh
>    switch (<MODE>mode)
>      {
>      case E_V8DFmode:
> -      return "vmovapd\t{%2, %x0|%x0, %2}";
> +      if (misaligned_operand (operands[2], <ssequartermode>mode))
> +	return "vmovupd\t{%2, %x0|%x0, %2}";
> +      else
> +	return "vmovapd\t{%2, %x0|%x0, %2}";
>      case E_V16SFmode:
> -      return "vmovaps\t{%2, %x0|%x0, %2}";
> +      if (misaligned_operand (operands[2], <ssequartermode>mode))
> +	return "vmovups\t{%2, %x0|%x0, %2}";
> +      else
> +	return "vmovaps\t{%2, %x0|%x0, %2}";
>      case E_V8DImode:
> -      return which_alternative == 2 ? "vmovdqa64\t{%2, %x0|%x0, %2}"
> -				    : "vmovdqa\t{%2, %x0|%x0, %2}";
> +      if (misaligned_operand (operands[2], <ssequartermode>mode))
> +	return which_alternative == 2 ? "vmovdqu64\t{%2, %x0|%x0, %2}"
> +				      : "vmovdqu\t{%2, %x0|%x0, %2}";
> +      else
> +	return which_alternative == 2 ? "vmovdqa64\t{%2, %x0|%x0, %2}"
> +				      : "vmovdqa\t{%2, %x0|%x0, %2}";
>      case E_V16SImode:
> -      return which_alternative == 2 ? "vmovdqa32\t{%2, %x0|%x0, %2}"
> -				    : "vmovdqa\t{%2, %x0|%x0, %2}";
> +      if (misaligned_operand (operands[2], <ssequartermode>mode))
> +	return which_alternative == 2 ? "vmovdqu32\t{%2, %x0|%x0, %2}"
> +				      : "vmovdqu\t{%2, %x0|%x0, %2}";
> +      else
> +	return which_alternative == 2 ? "vmovdqa32\t{%2, %x0|%x0, %2}"
> +				      : "vmovdqa\t{%2, %x0|%x0, %2}";
>      default:
>        gcc_unreachable ();
>      }
> --- gcc/testsuite/gcc.target/i386/avx512dq-pr90991-1.c.jj	2019-06-25 23:17:44.831272146 +0200
> +++ gcc/testsuite/gcc.target/i386/avx512dq-pr90991-1.c	2019-06-25 23:27:27.732357035 +0200
> @@ -0,0 +1,47 @@
> +/* PR target/90991 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx512dq -masm=att" } */
> +/* { dg-final { scan-assembler-times "vmovaps\[ \t]\+\\(\[^\n\r]*\\), %xmm0" 1 } } */
> +/* { dg-final { scan-assembler-times "vmovapd\[ \t]\+\\(\[^\n\r]*\\), %xmm0" 1 } } */
> +/* { dg-final { scan-assembler-times "vmovdqa\[ \t]\+\\(\[^\n\r]*\\), %xmm0" 1 } } */
> +/* { dg-final { scan-assembler-times "vmovups\[ \t]\+\\(\[^\n\r]*\\), %xmm0" 1 } } */
> +/* { dg-final { scan-assembler-times "vmovupd\[ \t]\+\\(\[^\n\r]*\\), %xmm0" 1 } } */
> +/* { dg-final { scan-assembler-times "vmovdqu\[ \t]\+\\(\[^\n\r]*\\), %xmm0" 1 } } */
> +
> +#include <x86intrin.h>
> +
> +__m512
> +f1 (void *a)
> +{
> +  return _mm512_insertf32x4 (_mm512_set1_ps (0.0f), _mm_load_ps (a), 0);
> +}
> +
> +__m512d
> +f2 (void *a)
> +{
> +  return _mm512_insertf64x2 (_mm512_set1_pd (0.0), _mm_load_pd (a), 0);
> +}
> +
> +__m512i
> +f3 (void *a)
> +{
> +  return _mm512_inserti32x4 (_mm512_set1_epi32 (0), _mm_load_si128 (a), 0);
> +}
> +
> +__m512
> +f4 (void *a)
> +{
> +  return _mm512_insertf32x4 (_mm512_set1_ps (0.0f), _mm_loadu_ps (a), 0);
> +}
> +
> +__m512d
> +f5 (void *a)
> +{
> +  return _mm512_insertf64x2 (_mm512_set1_pd (0.0), _mm_loadu_pd (a), 0);
> +}
> +
> +__m512i
> +f6 (void *a)
> +{
> +  return _mm512_inserti32x4 (_mm512_set1_epi32 (0), _mm_loadu_si128 (a), 0);
> +}

	Jakub
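As a sketch of the user-visible impact (not part of the patch; the names
insert_unaligned and buf, and the SDE suggestion, are assumptions), the
following program demonstrates the wrong code: insert_unaligned is handed
a pointer that is only 8-byte aligned, so the aligned vmovapd the old code
emitted faults at run time, while the vmovupd emitted after the fix loads
buf[1] and buf[2] as expected.  It needs AVX512DQ hardware (or an emulator
such as Intel SDE) and -O2 -mavx512dq:

    #include <stdio.h>
    #include <x86intrin.h>

    __attribute__ ((noinline)) __m512d
    insert_unaligned (const double *p)
    {
      /* Same shape as f5 above: unaligned load into a zeroed vector.  */
      return _mm512_insertf64x2 (_mm512_set1_pd (0.0), _mm_loadu_pd (p), 0);
    }

    int
    main (void)
    {
      double buf[4] __attribute__ ((aligned (64)));
      buf[1] = 1.0;
      buf[2] = 2.0;
      /* &buf[1] is 8 mod 16, i.e. misaligned for a 16-byte vmovapd.  */
      __m512d v = insert_unaligned (&buf[1]);
      double out[8];
      _mm512_storeu_pd (out, v);
      printf ("%f %f\n", out[0], out[1]);  /* expect 1.000000 2.000000 */
      return 0;
    }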