Hi!

The following testcase is miscompiled starting with my PR85480 change.  While it is perfectly fine to use "xm" or "vm" constraints for the source operand when the other operand is "C" (we rely on the AVX/AVX512 behavior that most 128-bit or 256-bit vector instructions clear the upper bits of the 512-bit vector), when the source is in memory we need to check whether it might be misaligned and use the corresponding unaligned load in that case, rather than always emitting the aligned one.
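To make the failure concrete, here is a reduced sketch modelled on f5 in the new testcase below (the function name foo is mine).  Compiled with -O2 -mavx512dq, the memory operand of this insert-into-zero pattern used to be printed with the aligned vmovapd even though _mm_loadu_pd makes no alignment promise; with this change it is printed with vmovupd instead:

#include <x86intrin.h>

/* The pointer A may be arbitrarily aligned, so the 128-bit load feeding
   the insert into a zeroed 512-bit vector must be an unaligned one.  */
__m512d
foo (void *a)
{
  return _mm512_insertf64x2 (_mm512_set1_pd (0.0), _mm_loadu_pd (a), 0);
}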
Fixed thusly, bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk and 9.2?

Note, I have a follow-up patch to improve avx_vec_concat<mode>; I just need to test it.

2019-06-26  Jakub Jelinek  <ja...@redhat.com>

	PR target/90991
	* config/i386/sse.md
	(*<extract_type>_vinsert<shuffletype><extract_suf>_0): Use vmovupd,
	vmovups, vmovdqu, vmovdqu32 or vmovdqu64 instead of the aligned
	insns if operands[2] is misaligned_operand.

	* gcc.target/i386/avx512dq-pr90991-1.c: New test.

--- gcc/config/i386/sse.md.jj	2019-06-21 08:43:17.734263742 +0200
+++ gcc/config/i386/sse.md	2019-06-25 22:36:12.955476294 +0200
@@ -13744,15 +13744,29 @@ (define_insn "*<extract_type>_vinsert<sh
   switch (<MODE>mode)
     {
     case E_V8DFmode:
-      return "vmovapd\t{%2, %x0|%x0, %2}";
+      if (misaligned_operand (operands[2], <ssequartermode>mode))
+	return "vmovupd\t{%2, %x0|%x0, %2}";
+      else
+	return "vmovapd\t{%2, %x0|%x0, %2}";
     case E_V16SFmode:
-      return "vmovaps\t{%2, %x0|%x0, %2}";
+      if (misaligned_operand (operands[2], <ssequartermode>mode))
+	return "vmovups\t{%2, %x0|%x0, %2}";
+      else
+	return "vmovaps\t{%2, %x0|%x0, %2}";
     case E_V8DImode:
-      return which_alternative == 2 ? "vmovdqa64\t{%2, %x0|%x0, %2}"
-				    : "vmovdqa\t{%2, %x0|%x0, %2}";
+      if (misaligned_operand (operands[2], <ssequartermode>mode))
+	return which_alternative == 2 ? "vmovdqu64\t{%2, %x0|%x0, %2}"
+				      : "vmovdqu\t{%2, %x0|%x0, %2}";
+      else
+	return which_alternative == 2 ? "vmovdqa64\t{%2, %x0|%x0, %2}"
+				      : "vmovdqa\t{%2, %x0|%x0, %2}";
     case E_V16SImode:
-      return which_alternative == 2 ? "vmovdqa32\t{%2, %x0|%x0, %2}"
-				    : "vmovdqa\t{%2, %x0|%x0, %2}";
+      if (misaligned_operand (operands[2], <ssequartermode>mode))
+	return which_alternative == 2 ? "vmovdqu32\t{%2, %x0|%x0, %2}"
+				      : "vmovdqu\t{%2, %x0|%x0, %2}";
+      else
+	return which_alternative == 2 ? "vmovdqa32\t{%2, %x0|%x0, %2}"
+				      : "vmovdqa\t{%2, %x0|%x0, %2}";
     default:
       gcc_unreachable ();
     }
--- gcc/testsuite/gcc.target/i386/avx512dq-pr90991-1.c.jj	2019-06-25 23:17:44.831272146 +0200
+++ gcc/testsuite/gcc.target/i386/avx512dq-pr90991-1.c	2019-06-25 23:27:27.732357035 +0200
@@ -0,0 +1,47 @@
+/* PR target/90991 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512dq -masm=att" } */
+/* { dg-final { scan-assembler-times "vmovaps\[ \t]\+\\(\[^\n\r]*\\), %xmm0" 1 } } */
+/* { dg-final { scan-assembler-times "vmovapd\[ \t]\+\\(\[^\n\r]*\\), %xmm0" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqa\[ \t]\+\\(\[^\n\r]*\\), %xmm0" 1 } } */
+/* { dg-final { scan-assembler-times "vmovups\[ \t]\+\\(\[^\n\r]*\\), %xmm0" 1 } } */
+/* { dg-final { scan-assembler-times "vmovupd\[ \t]\+\\(\[^\n\r]*\\), %xmm0" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[ \t]\+\\(\[^\n\r]*\\), %xmm0" 1 } } */
+
+#include <x86intrin.h>
+
+__m512
+f1 (void *a)
+{
+  return _mm512_insertf32x4 (_mm512_set1_ps (0.0f), _mm_load_ps (a), 0);
+}
+
+__m512d
+f2 (void *a)
+{
+  return _mm512_insertf64x2 (_mm512_set1_pd (0.0), _mm_load_pd (a), 0);
+}
+
+__m512i
+f3 (void *a)
+{
+  return _mm512_inserti32x4 (_mm512_set1_epi32 (0), _mm_load_si128 (a), 0);
+}
+
+__m512
+f4 (void *a)
+{
+  return _mm512_insertf32x4 (_mm512_set1_ps (0.0f), _mm_loadu_ps (a), 0);
+}
+
+__m512d
+f5 (void *a)
+{
+  return _mm512_insertf64x2 (_mm512_set1_pd (0.0), _mm_loadu_pd (a), 0);
+}
+
+__m512i
+f6 (void *a)
+{
+  return _mm512_inserti32x4 (_mm512_set1_epi32 (0), _mm_loadu_si128 (a), 0);
+}

	Jakub