Implement vector sign_extend and zero_extend functionality for TARGET_SSE2 using the PUNPCKL?? family of instructions. For example, the code for zero-extend from V2SI to V2DImode improves from:

	movd	%xmm0, %edx
	pshufd	$85, %xmm0, %xmm0
	movd	%xmm0, %eax
	movq	%rdx, (%rdi)
	movq	%rax, 8(%rdi)

to:

	pxor	%xmm1, %xmm1
	punpckldq	%xmm1, %xmm0
	movaps	%xmm0, (%rdi)

And the code for sign-extend from V2SI to V2DImode from:

	movd	%xmm0, %edx
	pshufd	$85, %xmm0, %xmm0
	movd	%xmm0, %eax
	movslq	%edx, %rdx
	cltq
	movq	%rdx, (%rdi)
	movq	%rax, 8(%rdi)

to:

	pxor	%xmm1, %xmm1
	pcmpgtd	%xmm0, %xmm1
	punpckldq	%xmm1, %xmm0
	movaps	%xmm0, (%rdi)

	PR target/111023

gcc/ChangeLog:

	* config/i386/i386-expand.cc (ix86_split_mmx_punpck):
	Also handle V2QImode.
	(ix86_expand_sse_extend): New function.
	* config/i386/i386-protos.h (ix86_expand_sse_extend): New prototype.
	* config/i386/mmx.md (<any_extend:insn>v4qiv4hi2): Enable for
	TARGET_SSE2.  Expand through ix86_expand_sse_extend for
	!TARGET_SSE4_1.
	(<any_extend:insn>v2hiv2si2): Ditto.
	(<any_extend:insn>v2qiv2hi2): Ditto.
	* config/i386/sse.md (<any_extend:insn>v8qiv8hi2): Ditto.
	(<any_extend:insn>v4hiv4si2): Ditto.
	(<any_extend:insn>v2siv2di2): Ditto.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr111023-2.c: New test.
	* gcc.target/i386/pr111023-4b.c: New test.
	* gcc.target/i386/pr111023-8b.c: New test.
	* gcc.target/i386/pr111023.c: New test.

Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.

Uros.
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 85e30552d6f..460d496ef22 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -1124,8 +1124,9 @@ ix86_split_mmx_punpck (rtx operands[], bool high_p) switch (mode) { - case E_V4QImode: case E_V8QImode: + case E_V4QImode: + case E_V2QImode: sse_mode = V16QImode; double_sse_mode = V32QImode; mask = gen_rtx_PARALLEL (VOIDmode, @@ -5636,7 +5637,43 @@ ix86_expand_vec_perm (rtx operands[]) } } -/* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is +/* Extend SRC into next wider integer vector type. UNSIGNED_P is + true if we should do zero extension, else sign extension. */ + +void +ix86_expand_sse_extend (rtx dest, rtx src, bool unsigned_p) +{ + machine_mode imode = GET_MODE (src); + rtx ops[3]; + + switch (imode) + { + case E_V8QImode: + case E_V4QImode: + case E_V2QImode: + case E_V4HImode: + case E_V2HImode: + case E_V2SImode: + break; + default: + gcc_unreachable (); + } + + ops[0] = gen_reg_rtx (imode); + + ops[1] = force_reg (imode, src); + + if (unsigned_p) + ops[2] = force_reg (imode, CONST0_RTX (imode)); + else + ops[2] = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode), + src, pc_rtx, pc_rtx); + + ix86_split_mmx_punpck (ops, false); + emit_move_insn (dest, lowpart_subreg (GET_MODE (dest), ops[0], imode)); +} + +/* Unpack SRC into the next wider integer vector type. UNSIGNED_P is true if we should do zero extension, else sign extension. HIGH_P is true if we want the N/2 high elements, else the low elements. 
*/ diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index fc2f1f13b78..9ffb125fc2b 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -155,6 +155,7 @@ extern bool ix86_expand_mask_vec_cmp (rtx, enum rtx_code, rtx, rtx); extern bool ix86_expand_int_vec_cmp (rtx[]); extern bool ix86_expand_fp_vec_cmp (rtx[]); extern void ix86_expand_sse_movcc (rtx, rtx, rtx, rtx); +extern void ix86_expand_sse_extend (rtx, rtx, bool); extern void ix86_expand_sse_unpack (rtx, rtx, bool, bool); extern void ix86_expand_fp_spaceship (rtx, rtx, rtx); extern bool ix86_expand_int_addcc (rtx[]); diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 170432a7128..ef578222945 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -3744,8 +3744,14 @@ (define_expand "<insn>v4qiv4hi2" [(set (match_operand:V4HI 0 "register_operand") (any_extend:V4HI (match_operand:V4QI 1 "register_operand")))] - "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE" + "TARGET_MMX_WITH_SSE" { + if (!TARGET_SSE4_1) + { + ix86_expand_sse_extend (operands[0], operands[1], <u_bool>); + DONE; + } + rtx op1 = force_reg (V4QImode, operands[1]); op1 = lowpart_subreg (V8QImode, op1, V4QImode); emit_insn (gen_sse4_1_<code>v4qiv4hi2 (operands[0], op1)); @@ -3770,8 +3776,14 @@ (define_expand "<insn>v2hiv2si2" [(set (match_operand:V2SI 0 "register_operand") (any_extend:V2SI (match_operand:V2HI 1 "register_operand")))] - "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE" + "TARGET_MMX_WITH_SSE" { + if (!TARGET_SSE4_1) + { + ix86_expand_sse_extend (operands[0], operands[1], <u_bool>); + DONE; + } + rtx op1 = force_reg (V2HImode, operands[1]); op1 = lowpart_subreg (V4HImode, op1, V2HImode); emit_insn (gen_sse4_1_<code>v2hiv2si2 (operands[0], op1)); @@ -3822,8 +3834,14 @@ (define_expand "<insn>v2qiv2hi2" [(set (match_operand:V2HI 0 "register_operand") (any_extend:V2HI (match_operand:V2QI 1 "register_operand")))] - "TARGET_SSE4_1" + "TARGET_SSE2" { + if (!TARGET_SSE4_1) + { + 
ix86_expand_sse_extend (operands[0], operands[1], <u_bool>); + DONE; + } + rtx op1 = force_reg (V2QImode, operands[1]); op1 = lowpart_subreg (V4QImode, op1, V2QImode); emit_insn (gen_sse4_1_<code>v2qiv2hi2 (operands[0], op1)); diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 6784a8c5369..87c3bf07020 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -22919,8 +22919,15 @@ (define_expand "<insn>v8qiv8hi2" [(set (match_operand:V8HI 0 "register_operand") (any_extend:V8HI (match_operand:V8QI 1 "nonimmediate_operand")))] - "TARGET_SSE4_1" + "TARGET_SSE4_1 || TARGET_MMX_WITH_SSE" { + if (!TARGET_SSE4_1) + { + rtx op1 = force_reg (V8QImode, operands[1]); + ix86_expand_sse_extend (operands[0], op1, <u_bool>); + DONE; + } + if (!MEM_P (operands[1])) { rtx op1 = force_reg (V8QImode, operands[1]); @@ -23229,8 +23236,15 @@ (define_expand "<insn>v4hiv4si2" [(set (match_operand:V4SI 0 "register_operand") (any_extend:V4SI (match_operand:V4HI 1 "nonimmediate_operand")))] - "TARGET_SSE4_1" + "TARGET_SSE4_1 || TARGET_MMX_WITH_SSE" { + if (!TARGET_SSE4_1) + { + rtx op1 = force_reg (V4HImode, operands[1]); + ix86_expand_sse_extend (operands[0], op1, <u_bool>); + DONE; + } + if (!MEM_P (operands[1])) { rtx op1 = force_reg (V4HImode, operands[1]); @@ -23828,8 +23842,15 @@ (define_expand "<insn>v2siv2di2" [(set (match_operand:V2DI 0 "register_operand") (any_extend:V2DI (match_operand:V2SI 1 "nonimmediate_operand")))] - "TARGET_SSE4_1" + "TARGET_SSE4_1 || TARGET_MMX_WITH_SSE" { + if (!TARGET_SSE4_1) + { + rtx op1 = force_reg (V2SImode, operands[1]); + ix86_expand_sse_extend (operands[0], op1, <u_bool>); + DONE; + } + if (!MEM_P (operands[1])) { rtx op1 = force_reg (V2SImode, operands[1]); diff --git a/gcc/testsuite/gcc.target/i386/pr111023-2.c b/gcc/testsuite/gcc.target/i386/pr111023-2.c new file mode 100644 index 00000000000..6c69f947544 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr111023-2.c @@ -0,0 +1,52 @@ +/* PR target/111023 */ +/* { dg-do 
compile } */ +/* { dg-options "-O2 -mtune=icelake-server -ftree-vectorize -msse2 -mno-sse4.1" } */ + +typedef char v16qi __attribute__((vector_size (16))); +typedef short v8hi __attribute__((vector_size (16))); +typedef int v4si __attribute__((vector_size (16))); +typedef long long v2di __attribute__((vector_size (16))); + +void +v8hi_v8qi (v8hi *dst, v16qi src) +{ + short tem[8]; + tem[0] = src[0]; + tem[1] = src[1]; + tem[2] = src[2]; + tem[3] = src[3]; + tem[4] = src[4]; + tem[5] = src[5]; + tem[6] = src[6]; + tem[7] = src[7]; + dst[0] = *(v8hi *) tem; +} + +/* { dg-final { scan-assembler "pcmpgtb" } } */ +/* { dg-final { scan-assembler "punpcklbw" } } */ + +void +v4si_v4hi (v4si *dst, v8hi src) +{ + int tem[4]; + tem[0] = src[0]; + tem[1] = src[1]; + tem[2] = src[2]; + tem[3] = src[3]; + dst[0] = *(v4si *) tem; +} + +/* { dg-final { scan-assembler "pcmpgtw" } } */ +/* { dg-final { scan-assembler "punpcklwd" } } */ + +void +v2di_v2si (v2di *dst, v4si src) +{ + long long tem[2]; + tem[0] = src[0]; + tem[1] = src[1]; + dst[0] = *(v2di *) tem; +} + +/* { dg-final { scan-assembler "pcmpgtd" } } */ +/* { dg-final { scan-assembler "punpckldq" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr111023-4b.c b/gcc/testsuite/gcc.target/i386/pr111023-4b.c new file mode 100644 index 00000000000..061f6a18fff --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr111023-4b.c @@ -0,0 +1,17 @@ +/* PR target/111023 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -mtune=icelake-server -ftree-vectorize -msse2 -mno-sse4.1" } */ + +typedef unsigned char v4qi __attribute__((vector_size (4))); +typedef unsigned short v2hi __attribute__((vector_size (4))); + +void +v2hi_v2qi (v2hi *dst, v4qi src) +{ + unsigned short tem[2]; + tem[0] = src[0]; + tem[1] = src[1]; + dst[0] = *(v2hi *) tem; +} + +/* { dg-final { scan-assembler "punpcklbw" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr111023-8b.c b/gcc/testsuite/gcc.target/i386/pr111023-8b.c new file mode 100644 index 
00000000000..26c5e2785be --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr111023-8b.c @@ -0,0 +1,31 @@ +/* PR target/111023 */ +/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-options "-O2 -mtune=icelake-server -ftree-vectorize -msse2 -mno-sse4.1" } */ + +typedef unsigned char v8qi __attribute__((vector_size (8))); +typedef unsigned short v4hi __attribute__((vector_size (8))); +typedef unsigned int v2si __attribute__((vector_size (8))); + +void +v4hi_v4qi (v4hi *dst, v8qi src) +{ + unsigned short tem[4]; + tem[0] = src[0]; + tem[1] = src[1]; + tem[2] = src[2]; + tem[3] = src[3]; + dst[0] = *(v4hi *) tem; +} + +/* { dg-final { scan-assembler "punpcklbw" } } */ + +void +v2si_v2hi (v2si *dst, v4hi src) +{ + unsigned int tem[2]; + tem[0] = src[0]; + tem[1] = src[1]; + dst[0] = *(v2si *) tem; +} + +/* { dg-final { scan-assembler "punpcklwd" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr111023.c b/gcc/testsuite/gcc.target/i386/pr111023.c new file mode 100644 index 00000000000..6144c371f32 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr111023.c @@ -0,0 +1,49 @@ +/* PR target/111023 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -mtune=icelake-server -ftree-vectorize -msse2 -mno-sse4.1" } */ + +typedef unsigned char v16qi __attribute__((vector_size (16))); +typedef unsigned short v8hi __attribute__((vector_size (16))); +typedef unsigned int v4si __attribute__((vector_size (16))); +typedef unsigned long long v2di __attribute__((vector_size (16))); + +void +v8hi_v8qi (v8hi *dst, v16qi src) +{ + unsigned short tem[8]; + tem[0] = src[0]; + tem[1] = src[1]; + tem[2] = src[2]; + tem[3] = src[3]; + tem[4] = src[4]; + tem[5] = src[5]; + tem[6] = src[6]; + tem[7] = src[7]; + dst[0] = *(v8hi *) tem; +} + +/* { dg-final { scan-assembler "punpcklbw" } } */ + +void +v4si_v4hi (v4si *dst, v8hi src) +{ + unsigned int tem[4]; + tem[0] = src[0]; + tem[1] = src[1]; + tem[2] = src[2]; + tem[3] = src[3]; + dst[0] = *(v4si *) tem; +} + +/* { dg-final { scan-assembler 
"punpcklwd" } } */ + +void +v2di_v2si (v2di *dst, v4si src) +{ + unsigned long long tem[2]; + tem[0] = src[0]; + tem[1] = src[1]; + dst[0] = *(v2di *) tem; +} + +/* { dg-final { scan-assembler "punpckldq" } } */