This patch improves the code generated when moving a 128-bit value
in TImode, represented by two 64-bit registers, to V1TImode, which is a single SSE register. Currently, the simple move: typedef unsigned __int128 uv1ti __attribute__ ((__vector_size__ (16))); uv1ti foo(__int128 x) { return (uv1ti)x; } is always transferred via memory, as: foo: movq %rdi, -24(%rsp) movq %rsi, -16(%rsp) movdqa -24(%rsp), %xmm0 ret with this patch, we now generate (with -msse2): foo: movq %rdi, %xmm1 movq %rsi, %xmm2 punpcklqdq %xmm2, %xmm1 movdqa %xmm1, %xmm0 ret and with -mavx2: foo: vmovq %rdi, %xmm1 vpinsrq $1, %rsi, %xmm1, %xmm0 ret Even more dramatic is the improvement of zero extended transfers. uv1ti bar(unsigned char c) { return (uv1ti)(__int128)c; } Previously generated: bar: movq $0, -16(%rsp) movzbl %dil, %eax movq %rax, -24(%rsp) vmovdqa -24(%rsp), %xmm0 ret Now generates: bar: movzbl %dil, %edi movq %rdi, %xmm0 ret My first attempt at this functionality attempted to use a simple define_split: +;; Move TImode to V1TImode via V2DImode instead of memory. +(define_split + [(set (match_operand:V1TI 0 "register_operand") + (subreg:V1TI (match_operand:TI 1 "register_operand") 0))] + "TARGET_64BIT && TARGET_SSE2 && can_create_pseudo_p ()" + [(set (match_dup 2) (vec_concat:V2DI (match_dup 3) (match_dup 4))) + (set (match_dup 0) (subreg:V1TI (match_dup 2) 0))] +{ + operands[2] = gen_reg_rtx (V2DImode); + operands[3] = gen_lowpart (DImode, operands[1]); + operands[4] = gen_highpart (DImode, operands[1]); +}) + Unfortunately, this triggers very late during the compilation preventing some of the simplification's we'd like (in combine). For example the foo case above becomes: foo: movq %rsi, -16(%rsp) movq %rdi, %xmm0 movhps -16(%rsp), %xmm0 transferring half directly, and the other half via memory. And for the bar case above, GCC fails to appreciate that movq/vmovq clears the high bits, resulting in: bar: movzbl %dil, %eax xorl %edx, %edx vmovq %rax, %xmm1 vpinsrq $1, %rdx, %xmm1, %xmm0 ret Hence the solution (i.e. this patch) is to add a special case to ix86_expand_vector_move for TImode to V1TImode transfers. This patch has been tested on x86_64-pc-linux-gnu with make bootstrap and make -k check with no new failures. Ok for mainline? 2022-01-06 Roger Sayle <ro...@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-expand.c (ix86_expand_vector_move): Add special case for TImode to V1TImode moves, going via V2DImode. gcc/testsuite/ChangeLog * gcc.target/i386/sse2-v1ti-mov-1.c: New test case. * gcc.target/i386/sse2-v1ti-zext.c: New test case. Many thanks in advance, Roger --
diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 7013c20..47c4870 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -615,6 +615,23 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[]) return; } + /* Special case TImode to V1TImode conversions, via V2DI. */ + if (mode == V1TImode + && SUBREG_P (op1) + && GET_MODE (SUBREG_REG (op1)) == TImode + && TARGET_64BIT && TARGET_SSE + && can_create_pseudo_p ()) + { + rtx tmp = gen_reg_rtx (V2DImode); + rtx lo = gen_reg_rtx (DImode); + rtx hi = gen_reg_rtx (DImode); + emit_move_insn (lo, gen_lowpart (DImode, SUBREG_REG (op1))); + emit_move_insn (hi, gen_highpart (DImode, SUBREG_REG (op1))); + emit_insn (gen_vec_concatv2di (tmp, lo, hi)); + emit_move_insn (op0, gen_lowpart (V1TImode, tmp)); + return; + } + /* If operand0 is a hard register, make operand1 a pseudo. */ if (can_create_pseudo_p () && !ix86_hardreg_mov_ok (op0, op1)) diff --git a/gcc/testsuite/gcc.target/i386/sse2-v1ti-mov-1.c b/gcc/testsuite/gcc.target/i386/sse2-v1ti-mov-1.c new file mode 100644 index 0000000..a1ef7b7 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse2-v1ti-mov-1.c @@ -0,0 +1,11 @@ +/* { dg-do compile { target int128 } } */ +/* { dg-options "-O2 -msse2" } */ + +typedef unsigned __int128 uv1ti __attribute__ ((__vector_size__ (16))); + +uv1ti foo(__int128 x) +{ + return (uv1ti)x; +} + +/* { dg-final { scan-assembler-not "%\[er\]sp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/sse2-v1ti-zext.c b/gcc/testsuite/gcc.target/i386/sse2-v1ti-zext.c new file mode 100644 index 0000000..4dfc655 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse2-v1ti-zext.c @@ -0,0 +1,16 @@ +/* { dg-do compile { target int128 } } */ +/* { dg-options "-O2 -msse2" } */ + +typedef unsigned __int128 uv1ti __attribute__ ((__vector_size__ (16))); + +uv1ti extqi(unsigned char c) { return (uv1ti)(__int128)c; } +uv1ti exthi(unsigned short s) { return (uv1ti)(__int128)s; } +uv1ti extsi(unsigned int i) { return (uv1ti)(__int128)i; } +uv1ti extdi(unsigned long long l) { return (uv1ti)(__int128)l; } + +uv1ti pextqi(unsigned char *pc) { return (uv1ti)(__int128)(*pc); } +uv1ti pexthi(unsigned short *ps) { return (uv1ti)(__int128)(*ps); } +uv1ti pextsi(unsigned int *pi) { return (uv1ti)(__int128)(*pi); } +uv1ti pextdi(unsigned long long *pl) { return (uv1ti)(__int128)(*pl); } + +/* { dg-final { scan-assembler-not "%\[er\]sp" } } */