https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87718
--- Comment #2 from Uroš Bizjak <ubizjak at gmail dot com> ---

The following testcase:

--cut here--
typedef int V __attribute__((vector_size (8)));

void foo (int x, int y)
{
  register int a __asm ("xmm1");
  register int b __asm ("xmm2");
  register V c __asm ("xmm3");

  a = x;
  b = y;
  asm volatile ("" : "+v" (a), "+v" (b));
  c = (V) { a, b };
  asm volatile ("" : "+v" (c));
}
--cut here--

gets compiled with -O2 -mavx -mtune=intel to:

        vmovd   %edi, %xmm1
        vmovd   %esi, %xmm2
        vmovd   %xmm2, %eax
        vpinsrd $1, %eax, %xmm1, %xmm3
        ret

The relevant pattern is defined as:

(define_insn "*vec_concatv2si_sse4_1"
  [(set (match_operand:V2SI 0 "register_operand"
          "=Yr,*x, x, v,Yr,*x, v, v, *y,*y")
        (vec_concat:V2SI
          (match_operand:SI 1 "nonimmediate_operand"
          " 0, 0, x,Yv, 0, 0,Yv,rm, 0,rm")
          (match_operand:SI 2 "nonimm_or_0_operand"
          " rm,rm,rm,rm,Yr,*x,Yv, C,*ym, C")))]
  "TARGET_SSE4_1 && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
  "@
   pinsrd\t{$1, %2, %0|%0, %2, 1}
   pinsrd\t{$1, %2, %0|%0, %2, 1}
   vpinsrd\t{$1, %2, %1, %0|%0, %1, %2, 1}
   vpinsrd\t{$1, %2, %1, %0|%0, %1, %2, 1}
   punpckldq\t{%2, %0|%0, %2}
   punpckldq\t{%2, %0|%0, %2}
   vpunpckldq\t{%2, %1, %0|%0, %1, %2}
   %vmovd\t{%1, %0|%0, %1}
   punpckldq\t{%2, %0|%0, %2}
   movd\t{%1, %0|%0, %1}"

but for some reason the register allocator (RA) chooses alternative 2 (x<-x,rm) instead of alternative 6 (v<-Yv,Yv), although alternative 2 needs an extra reload from %xmm2 to %eax.