Hi! As mentioned in the PR, for vec_concatv2df and vec_concatv2di we have alternatives where the lower half is the low 64-bit part of an xmm register or a 64-bit memory operand and the upper half is zeroed, using movq/vmovq (or movq2dq), but for vec_concatv4sf and vec_concatv4si we don't have anything similar, although the operations do pretty much the same thing. I'm not advocating adding alternatives with GPR operands, as having V2SF or V2SI in a GPR is too weird, but for the cases where a single movq or vmovq instruction can move the low 64 bits and clear the upper 64 bits these patterns look useful to me.
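To make the intent concrete, here is a small standalone example (mine, not part of the patch; the function name is just illustrative) of the kind of code that should now be able to use a single (v)movq instead of an xorps + movlps pair when compiled with e.g. -O2 -msse2:

#include <xmmintrin.h>

/* Load 64 bits into the low half of an xmm register and zero the upper
   half; this is a vec_concat of a 64-bit memory operand with zeros,
   which a pattern like *vec_concatv4sf_0 can emit as a single (v)movq.  */
__m128
load_low_zero_high (const __m64 *p)
{
  return _mm_loadl_pi (_mm_setzero_ps (), p);
}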
I had to tweak the pr53759.c testcase because it started FAILing, but only
because the generated code changed:
-	vxorps	%xmm0, %xmm0, %xmm0
-	vmovlps	(%rsi), %xmm0, %xmm0
+	vmovq	(%rsi), %xmm0
 	vaddps	%xmm0, %xmm0, %xmm0
 	vmovaps	%xmm0, (%rdi)
 	ret
I've added a variant of that testcase that tests its original purpose by
using a register there that is not known to be all zeros.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2018-11-30  Jakub Jelinek  <ja...@redhat.com>

	PR target/88278
	* config/i386/sse.md (*vec_concatv4sf_0, *vec_concatv4si_0): New
	insns.

	* gcc.target/i386/pr88278.c: New test.
	* gcc.target/i386/pr53759.c: Don't expect vmovlps insn, expect vmovq
	instead.
	* gcc.target/i386/pr53759-2.c: New test.

--- gcc/config/i386/sse.md.jj	2018-11-30 21:36:22.277762263 +0100
+++ gcc/config/i386/sse.md	2018-11-30 21:38:02.261120768 +0100
@@ -7248,6 +7248,17 @@ (define_insn "*vec_concatv4sf"
    (set_attr "prefix" "orig,maybe_evex,orig,maybe_evex")
    (set_attr "mode" "V4SF,V4SF,V2SF,V2SF")])
 
+(define_insn "*vec_concatv4sf_0"
+  [(set (match_operand:V4SF 0 "register_operand"       "=v")
+	(vec_concat:V4SF
+	  (match_operand:V2SF 1 "nonimmediate_operand" "xm")
+	  (match_operand:V2SF 2 "const0_operand"       " C")))]
+  "TARGET_SSE2"
+  "%vmovq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "DF")])
+
 ;; Avoid combining registers from different units in a single alternative,
 ;; see comment above inline_secondary_memory_needed function in i386.c
 (define_insn "vec_set<mode>_0"
@@ -14409,6 +14420,23 @@ (define_insn "*vec_concatv4si"
    (set_attr "prefix" "orig,maybe_evex,orig,orig,maybe_evex")
    (set_attr "mode" "TI,TI,V4SF,V2SF,V2SF")])
 
+(define_insn "*vec_concatv4si_0"
+  [(set (match_operand:V4SI 0 "register_operand"       "=v,x")
+	(vec_concat:V4SI
+	  (match_operand:V2SI 1 "nonimmediate_operand" "vm,?!*y")
+	  (match_operand:V2SI 2 "const0_operand"       " C,C")))]
+  "TARGET_SSE2"
+  "@
+   %vmovq\t{%1, %0|%0, %1}
+   movq2dq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "maybe_vex,orig")
+   (set_attr "mode" "TI")
+   (set (attr "preferred_for_speed")
+     (if_then_else (eq_attr "alternative" "1")
+       (symbol_ref "TARGET_INTER_UNIT_MOVES_FROM_VEC")
+       (symbol_ref "true")))])
+
 ;; movd instead of movq is required to handle broken assemblers.
(define_insn "vec_concatv2di" [(set (match_operand:V2DI 0 "register_operand" --- gcc/testsuite/gcc.target/i386/pr88278.c.jj 2018-11-30 21:38:02.261120768 +0100 +++ gcc/testsuite/gcc.target/i386/pr88278.c 2018-11-30 21:38:02.261120768 +0100 @@ -0,0 +1,34 @@ +/* PR target/88278 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse2 -mno-sse3 -fgimple -masm=att" } */ +/* { dg-final { scan-assembler-times "movq\[ \t]+\\(" 2 } } */ +/* { dg-final { scan-assembler-not "punpcklqdq\[ \t]+" } } */ +/* { dg-final { scan-assembler-not "pxor\[ \t]+" } } */ + +typedef unsigned char v16qi __attribute__((vector_size(16))); +typedef unsigned char v8qi __attribute__((vector_size(8))); +typedef unsigned int v4si __attribute__((vector_size(16))); +typedef unsigned int v2si __attribute__((vector_size(8))); + +v16qi __GIMPLE foo (unsigned char *p) +{ + v8qi _2; + v16qi _3; + +bb_2: + _2 = __MEM <v8qi, 8> (p_1(D)); + _3 = _Literal (v16qi) { _2, _Literal (v8qi) { _Literal (unsigned char) 0, _Literal (unsigned char) 0, _Literal (unsigned char) 0, _Literal (unsigned char) 0, _Literal (unsigned char) 0, _Literal (unsigned char) 0, _Literal (unsigned char) 0 } }; + return _3; +} + + +v4si __GIMPLE bar (unsigned int *p) +{ + v2si _2; + v4si _3; + +bb_2: + _2 = __MEM <v2si, 32> (p_1(D)); + _3 = _Literal (v4si) { _2, _Literal (v2si) { 0u, 0u } }; + return _3; +} --- gcc/testsuite/gcc.target/i386/pr53759.c.jj 2016-05-22 12:20:04.184034591 +0200 +++ gcc/testsuite/gcc.target/i386/pr53759.c 2018-11-30 22:04:56.432806470 +0100 @@ -12,5 +12,6 @@ foo (__m128 *x, __m64 *y) *x = _mm_add_ps (b, b); } -/* { dg-final { scan-assembler "vmovlps\[ \\t\]" } } */ +/* { dg-final { scan-assembler "vmovq\[ \\t\]" } } */ +/* { dg-final { scan-assembler-not "vmovlps\[ \\t\]" } } */ /* { dg-final { scan-assembler-not "vshufps\[ \\t\]" } } */ --- gcc/testsuite/gcc.target/i386/pr53759-2.c.jj 2018-11-30 22:05:06.932657374 +0100 +++ gcc/testsuite/gcc.target/i386/pr53759-2.c 2018-11-30 22:05:42.568151361 +0100 @@ -0,0 +1,16 @@ +/* PR target/53759 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx" } */ + +#include <xmmintrin.h> + +void +foo (__m128 *x, __m64 *y) +{ + __m128 a = _mm_add_ps (x[1], x[2]); + __m128 b = _mm_loadl_pi (a, y); + *x = _mm_add_ps (b, b); +} + +/* { dg-final { scan-assembler "vmovlps\[ \\t\]" } } */ +/* { dg-final { scan-assembler-not "vshufps\[ \\t\]" } } */ Jakub