Implement vector sign_extend and zero_extend functionality for TARGET_SSE2
using the PUNPCKL?? family of instructions.  For example, the code for
zero-extend from V2SImode to V2DImode improves from:

        movd    %xmm0, %edx
        pshufd  $85, %xmm0, %xmm0
        movd    %xmm0, %eax
        movq    %rdx, (%rdi)
        movq    %rax, 8(%rdi)

to:
        pxor    %xmm1, %xmm1
        punpckldq       %xmm1, %xmm0
        movaps  %xmm0, (%rdi)

And the code for sign-extend from V2SImode to V2DImode improves from:

        movd    %xmm0, %edx
        pshufd  $85, %xmm0, %xmm0
        movd    %xmm0, %eax
        movslq  %edx, %rdx
        cltq
        movq    %rdx, (%rdi)
        movq    %rax, 8(%rdi)

to:
        pxor    %xmm1, %xmm1
        pcmpgtd %xmm0, %xmm1
        punpckldq       %xmm1, %xmm0
        movaps  %xmm0, (%rdi)

    PR target/111023

gcc/ChangeLog:

    * config/i386/i386-expand.cc (ix86_split_mmx_punpck):
    Also handle V2QImode.
    (ix86_expand_sse_extend): New function.
    * config/i386/i386-protos.h (ix86_expand_sse_extend): New prototype.
    * config/i386/mmx.md (<any_extend:insn>v4qiv4hi2): Enable for
    TARGET_SSE2.  Expand through ix86_expand_sse_extend for !TARGET_SSE4_1.
    (<any_extend:insn>v2hiv2si2): Ditto.
    (<any_extend:insn>v2qiv2hi2): Ditto.
    * config/i386/sse.md (<any_extend:insn>v8qiv8hi2): Ditto.
    (<any_extend:insn>v4hiv4si2): Ditto.
    (<any_extend:insn>v2siv2di2): Ditto.

gcc/testsuite/ChangeLog:

    * gcc.target/i386/pr111023-2.c: New test.
    * gcc.target/i386/pr111023-4b.c: New test.
    * gcc.target/i386/pr111023-8b.c: New test.
    * gcc.target/i386/pr111023.c: New test.

Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.

Uros.
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 85e30552d6f..460d496ef22 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -1124,8 +1124,9 @@ ix86_split_mmx_punpck (rtx operands[], bool high_p)
 
   switch (mode)
     {
-    case E_V4QImode:
     case E_V8QImode:
+    case E_V4QImode:
+    case E_V2QImode:
       sse_mode = V16QImode;
       double_sse_mode = V32QImode;
       mask = gen_rtx_PARALLEL (VOIDmode,
@@ -5636,7 +5637,43 @@ ix86_expand_vec_perm (rtx operands[])
     }
 }
 
-/* Unpack OP[1] into the next wider integer vector type.  UNSIGNED_P is
+/* Extend SRC into next wider integer vector type.  UNSIGNED_P is
+   true if we should do zero extension, else sign extension.  */
+
+void
+ix86_expand_sse_extend (rtx dest, rtx src, bool unsigned_p)
+{
+  machine_mode imode = GET_MODE (src);
+  rtx ops[3];
+
+  switch (imode)
+    {
+    case E_V8QImode:
+    case E_V4QImode:
+    case E_V2QImode:
+    case E_V4HImode:
+    case E_V2HImode:
+    case E_V2SImode:
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  ops[0] = gen_reg_rtx (imode);
+
+  ops[1] = force_reg (imode, src);
+
+  if (unsigned_p)
+    ops[2] = force_reg (imode, CONST0_RTX (imode));
+  else
+    ops[2] = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
+                                 src, pc_rtx, pc_rtx);
+
+  ix86_split_mmx_punpck (ops, false);
+  emit_move_insn (dest, lowpart_subreg (GET_MODE (dest), ops[0], imode));
+}
+
+/* Unpack SRC into the next wider integer vector type.  UNSIGNED_P is
    true if we should do zero extension, else sign extension.  HIGH_P is
    true if we want the N/2 high elements, else the low elements.  */
 
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index fc2f1f13b78..9ffb125fc2b 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -155,6 +155,7 @@ extern bool ix86_expand_mask_vec_cmp (rtx, enum rtx_code, rtx, rtx);
 extern bool ix86_expand_int_vec_cmp (rtx[]);
 extern bool ix86_expand_fp_vec_cmp (rtx[]);
 extern void ix86_expand_sse_movcc (rtx, rtx, rtx, rtx);
+extern void ix86_expand_sse_extend (rtx, rtx, bool);
 extern void ix86_expand_sse_unpack (rtx, rtx, bool, bool);
 extern void ix86_expand_fp_spaceship (rtx, rtx, rtx);
 extern bool ix86_expand_int_addcc (rtx[]);
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 170432a7128..ef578222945 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -3744,8 +3744,14 @@ (define_expand "<insn>v4qiv4hi2"
   [(set (match_operand:V4HI 0 "register_operand")
        (any_extend:V4HI
          (match_operand:V4QI 1 "register_operand")))]
-  "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
+  "TARGET_MMX_WITH_SSE"
 {
+  if (!TARGET_SSE4_1)
+    {
+      ix86_expand_sse_extend (operands[0], operands[1], <u_bool>);
+      DONE;
+    }
+
   rtx op1 = force_reg (V4QImode, operands[1]);
   op1 = lowpart_subreg (V8QImode, op1, V4QImode);
   emit_insn (gen_sse4_1_<code>v4qiv4hi2 (operands[0], op1));
@@ -3770,8 +3776,14 @@ (define_expand "<insn>v2hiv2si2"
   [(set (match_operand:V2SI 0 "register_operand")
        (any_extend:V2SI
          (match_operand:V2HI 1 "register_operand")))]
-  "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
+  "TARGET_MMX_WITH_SSE"
 {
+  if (!TARGET_SSE4_1)
+    {
+      ix86_expand_sse_extend (operands[0], operands[1], <u_bool>);
+      DONE;
+    }
+
   rtx op1 = force_reg (V2HImode, operands[1]);
   op1 = lowpart_subreg (V4HImode, op1, V2HImode);
   emit_insn (gen_sse4_1_<code>v2hiv2si2 (operands[0], op1));
@@ -3822,8 +3834,14 @@ (define_expand "<insn>v2qiv2hi2"
   [(set (match_operand:V2HI 0 "register_operand")
        (any_extend:V2HI
          (match_operand:V2QI 1 "register_operand")))]
-  "TARGET_SSE4_1"
+  "TARGET_SSE2"
 {
+  if (!TARGET_SSE4_1)
+    {
+      ix86_expand_sse_extend (operands[0], operands[1], <u_bool>);
+      DONE;
+    }
+
   rtx op1 = force_reg (V2QImode, operands[1]);
   op1 = lowpart_subreg (V4QImode, op1, V2QImode);
   emit_insn (gen_sse4_1_<code>v2qiv2hi2 (operands[0], op1));
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 6784a8c5369..87c3bf07020 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -22919,8 +22919,15 @@ (define_expand "<insn>v8qiv8hi2"
   [(set (match_operand:V8HI 0 "register_operand")
        (any_extend:V8HI
          (match_operand:V8QI 1 "nonimmediate_operand")))]
-  "TARGET_SSE4_1"
+  "TARGET_SSE4_1 || TARGET_MMX_WITH_SSE"
 {
+  if (!TARGET_SSE4_1)
+    {
+      rtx op1 = force_reg (V8QImode, operands[1]);
+      ix86_expand_sse_extend (operands[0], op1, <u_bool>);
+      DONE;
+    }
+
   if (!MEM_P (operands[1]))
     {
       rtx op1 = force_reg (V8QImode, operands[1]);
@@ -23229,8 +23236,15 @@ (define_expand "<insn>v4hiv4si2"
   [(set (match_operand:V4SI 0 "register_operand")
        (any_extend:V4SI
          (match_operand:V4HI 1 "nonimmediate_operand")))]
-  "TARGET_SSE4_1"
+  "TARGET_SSE4_1 || TARGET_MMX_WITH_SSE"
 {
+  if (!TARGET_SSE4_1)
+    {
+      rtx op1 = force_reg (V4HImode, operands[1]);
+      ix86_expand_sse_extend (operands[0], op1, <u_bool>);
+      DONE;
+    }
+
   if (!MEM_P (operands[1]))
     {
       rtx op1 = force_reg (V4HImode, operands[1]);
@@ -23828,8 +23842,15 @@ (define_expand "<insn>v2siv2di2"
   [(set (match_operand:V2DI 0 "register_operand")
        (any_extend:V2DI
          (match_operand:V2SI 1 "nonimmediate_operand")))]
-  "TARGET_SSE4_1"
+  "TARGET_SSE4_1 || TARGET_MMX_WITH_SSE"
 {
+  if (!TARGET_SSE4_1)
+    {
+      rtx op1 = force_reg (V2SImode, operands[1]);
+      ix86_expand_sse_extend (operands[0], op1, <u_bool>);
+      DONE;
+    }
+
   if (!MEM_P (operands[1]))
     {
       rtx op1 = force_reg (V2SImode, operands[1]);
diff --git a/gcc/testsuite/gcc.target/i386/pr111023-2.c b/gcc/testsuite/gcc.target/i386/pr111023-2.c
new file mode 100644
index 00000000000..6c69f947544
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr111023-2.c
@@ -0,0 +1,52 @@
+/* PR target/111023 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=icelake-server -ftree-vectorize -msse2 -mno-sse4.1" } */
+
+typedef char v16qi __attribute__((vector_size (16)));
+typedef short v8hi __attribute__((vector_size (16)));
+typedef int v4si __attribute__((vector_size (16)));
+typedef long long v2di __attribute__((vector_size (16)));
+
+void
+v8hi_v8qi (v8hi *dst, v16qi src)
+{
+  short tem[8];
+  tem[0] = src[0];
+  tem[1] = src[1];
+  tem[2] = src[2];
+  tem[3] = src[3];
+  tem[4] = src[4];
+  tem[5] = src[5];
+  tem[6] = src[6];
+  tem[7] = src[7];
+  dst[0] = *(v8hi *) tem;
+}
+
+/* { dg-final { scan-assembler "pcmpgtb" } } */
+/* { dg-final { scan-assembler "punpcklbw" } } */
+
+void
+v4si_v4hi (v4si *dst, v8hi src)
+{
+  int tem[4];
+  tem[0] = src[0];
+  tem[1] = src[1];
+  tem[2] = src[2];
+  tem[3] = src[3];
+  dst[0] = *(v4si *) tem;
+}
+
+/* { dg-final { scan-assembler "pcmpgtw" } } */
+/* { dg-final { scan-assembler "punpcklwd" } } */
+
+void
+v2di_v2si (v2di *dst, v4si src)
+{
+  long long tem[2];
+  tem[0] = src[0];
+  tem[1] = src[1];
+  dst[0] = *(v2di *) tem;
+}
+
+/* { dg-final { scan-assembler "pcmpgtd" } } */
+/* { dg-final { scan-assembler "punpckldq" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr111023-4b.c b/gcc/testsuite/gcc.target/i386/pr111023-4b.c
new file mode 100644
index 00000000000..061f6a18fff
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr111023-4b.c
@@ -0,0 +1,17 @@
+/* PR target/111023 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=icelake-server -ftree-vectorize -msse2 -mno-sse4.1" } */
+
+typedef unsigned char v4qi __attribute__((vector_size (4)));
+typedef unsigned short v2hi __attribute__((vector_size (4)));
+
+void
+v2hi_v2qi (v2hi *dst, v4qi src)
+{
+  unsigned short tem[2];
+  tem[0] = src[0];
+  tem[1] = src[1];
+  dst[0] = *(v2hi *) tem;
+}
+
+/* { dg-final { scan-assembler "punpcklbw" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr111023-8b.c b/gcc/testsuite/gcc.target/i386/pr111023-8b.c
new file mode 100644
index 00000000000..26c5e2785be
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr111023-8b.c
@@ -0,0 +1,31 @@
+/* PR target/111023 */
+/* { dg-do compile  { target { ! ia32 } } } */
+/* { dg-options "-O2 -mtune=icelake-server -ftree-vectorize -msse2 -mno-sse4.1" } */
+
+typedef unsigned char v8qi __attribute__((vector_size (8)));
+typedef unsigned short v4hi __attribute__((vector_size (8)));
+typedef unsigned int v2si __attribute__((vector_size (8)));
+
+void
+v4hi_v4qi (v4hi *dst, v8qi src)
+{
+  unsigned short tem[4];
+  tem[0] = src[0];
+  tem[1] = src[1];
+  tem[2] = src[2];
+  tem[3] = src[3];
+  dst[0] = *(v4hi *) tem;
+}
+
+/* { dg-final { scan-assembler "punpcklbw" } } */
+
+void
+v2si_v2hi (v2si *dst, v4hi src)
+{
+  unsigned int tem[2];
+  tem[0] = src[0];
+  tem[1] = src[1];
+  dst[0] = *(v2si *) tem;
+}
+
+/* { dg-final { scan-assembler "punpcklwd" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr111023.c b/gcc/testsuite/gcc.target/i386/pr111023.c
new file mode 100644
index 00000000000..6144c371f32
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr111023.c
@@ -0,0 +1,49 @@
+/* PR target/111023 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=icelake-server -ftree-vectorize -msse2 -mno-sse4.1" } */
+
+typedef unsigned char v16qi __attribute__((vector_size (16)));
+typedef unsigned short v8hi __attribute__((vector_size (16)));
+typedef unsigned int v4si __attribute__((vector_size (16)));
+typedef unsigned long long v2di __attribute__((vector_size (16)));
+
+void
+v8hi_v8qi (v8hi *dst, v16qi src)
+{
+  unsigned short tem[8];
+  tem[0] = src[0];
+  tem[1] = src[1];
+  tem[2] = src[2];
+  tem[3] = src[3];
+  tem[4] = src[4];
+  tem[5] = src[5];
+  tem[6] = src[6];
+  tem[7] = src[7];
+  dst[0] = *(v8hi *) tem;
+}
+
+/* { dg-final { scan-assembler "punpcklbw" } } */
+
+void
+v4si_v4hi (v4si *dst, v8hi src)
+{
+  unsigned int tem[4];
+  tem[0] = src[0];
+  tem[1] = src[1];
+  tem[2] = src[2];
+  tem[3] = src[3];
+  dst[0] = *(v4si *) tem;
+}
+
+/* { dg-final { scan-assembler "punpcklwd" } } */
+
+void
+v2di_v2si (v2di *dst, v4si src)
+{
+  unsigned long long tem[2];
+  tem[0] = src[0];
+  tem[1] = src[1];
+  dst[0] = *(v2di *) tem;
+}
+
+/* { dg-final { scan-assembler "punpckldq" } } */

Reply via email to