Hi Uros, Happy New Year. As requested here's a revised version of my patch to introduce a pattern for extendditi2, but implementing your suggestion to re-use the existing extendsidi2_1 splitters and peephole2 optimizations by using DWI/DWIH mode iterators.
This patch has been tested on x86_64-pc-linux-gnu with make bootstrap and make -k check, both with and without --target_board=unix{-m32}, with no new failures. Ok for mainline? 2023-01-01 Roger Sayle <ro...@nextmovesoftware.com> Uroš Bizjak <ubiz...@gmail.com> gcc/ChangeLog * config/i386/i386.md (extendditi2): New define_insn. (define_split): Use DWIH mode iterator to treat new extendditi2 identically to existing extendsidi2_1. (define_peephole2): Likewise. (define_peephole2): Likewise. (define_split): Likewise. gcc/testsuite/ChangeLog * gcc.target/i386/extendditi2-1.c: New test case. * gcc.target/i386/extendditi2-2.c: Likewise. Thanks in advance, Roger -- > -----Original Message----- > From: Uros Bizjak <ubiz...@gmail.com> > Sent: 28 December 2022 09:28 > To: Roger Sayle <ro...@nextmovesoftware.com> > Cc: GCC Patches <gcc-patches@gcc.gnu.org> > Subject: Re: [x86_64 PATCH] Add post-reload splitter for extendditi2. > > On Wed, Dec 28, 2022 at 1:32 AM Roger Sayle > <ro...@nextmovesoftware.com> wrote: > > > > > > This is another step towards a possible solution for PR 105137. > > This patch introduces a define_insn_and_split for extendditi2, that > > allows DImode to TImode sign-extension to be represented in the early > > RTL optimizers, before being split post-reload into the exact same > > idiom as currently produced by RTL expansion. > > Please see extendsidi2_1 insn pattern and follow-up splitters and > peephole2 patterns that do exactly what you want to achieve, but they are > currently handling only SImode to DImode on 32-bit targets. OTOH, these > patterns handle several more cases (e.g. split to the memory > output) and just have to be macroized with DWIH mode iterator to also handle > DImode to TImode on 64-bit targets. Probably, an extendsidi expander will have > to be slightly adjusted when macroized to signal middle end the availability > of > extendditi pattern. > > Following macroization, any possible follow-up optimizations and improvements > will then be automatically applied also to 32-bit targets. > > Uros. > > > > > Typically this produces the identical code, so the first new test > > case: > > __int128 foo(long long x) { return (__int128)x; } > > > > continues to generate: > > foo: movq %rdi, %rax > > cqto > > ret > > > > The "magic" is that this representation allows combine and the other > > RTL optimizers to do a better job. Hence, the second test case: > > > > __int128 foo(__int128 a, long long b) { > > a += ((__int128)b) << 70; > > return a; > > } > > > > which mainline with -O2 currently generates as: > > > > foo: movq %rsi, %rax > > movq %rdx, %rcx > > movq %rdi, %rsi > > salq $6, %rcx > > movq %rax, %rdi > > xorl %eax, %eax > > movq %rcx, %rdx > > addq %rsi, %rax > > adcq %rdi, %rdx > > ret > > > > with this patch now becomes: > > foo: movl $0, %eax > > salq $6, %rdx > > addq %rdi, %rax > > adcq %rsi, %rdx > > ret > > > > i.e. the same code for the signed and unsigned extension variants. > > > > This patch has been tested on x86_64-pc-linux-gnu with make bootstrap > > and make -k check, both with and without --target_board=unix{-m32}, > > with no new failures. Ok for mainline? > > > > 2022-12-28 Roger Sayle <ro...@nextmovesoftware.com> > > > > gcc/ChangeLog > > * config/i386/i386.md (extendditi2): New define_insn_and_split > > to split DImode to TImode sign-extension after reload. > > > > gcc/testsuite/ChangeLog > > * gcc.target/i386/extendditi2-1.c: New test case. > > * gcc.target/i386/extendditi2-2.c: Likewise. > > > > > > Thanks in advance, > > Roger > > -- > >
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index ca40c4f..890c4c8 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -4548,17 +4548,27 @@ "!TARGET_64BIT" "#") +(define_insn "extendditi2" + [(set (match_operand:TI 0 "nonimmediate_operand" "=*A,r,?r,?*o") + (sign_extend:TI (match_operand:DI 1 "register_operand" "0,0,r,r"))) + (clobber (reg:CC FLAGS_REG)) + (clobber (match_scratch:DI 2 "=X,X,X,&r"))] + "TARGET_64BIT" + "#") + ;; Split the memory case. If the source register doesn't die, it will stay ;; this way, if it does die, following peephole2s take care of it. (define_split - [(set (match_operand:DI 0 "memory_operand") - (sign_extend:DI (match_operand:SI 1 "register_operand"))) + [(set (match_operand:<DWI> 0 "memory_operand") + (sign_extend:<DWI> (match_operand:DWIH 1 "register_operand"))) (clobber (reg:CC FLAGS_REG)) - (clobber (match_operand:SI 2 "register_operand"))] + (clobber (match_operand:DWIH 2 "register_operand"))] "reload_completed" [(const_int 0)] { - split_double_mode (DImode, &operands[0], 1, &operands[3], &operands[4]); + rtx bits = GEN_INT (<MODE_SIZE> * BITS_PER_UNIT - 1); + + split_double_mode (<DWI>mode, &operands[0], 1, &operands[3], &operands[4]); emit_move_insn (operands[3], operands[1]); @@ -4567,12 +4577,12 @@ && REGNO (operands[1]) == AX_REG && REGNO (operands[2]) == DX_REG) { - emit_insn (gen_ashrsi3_cvt (operands[2], operands[1], GEN_INT (31))); + emit_insn (gen_ashr<mode>3_cvt (operands[2], operands[1], bits)); } else { emit_move_insn (operands[2], operands[1]); - emit_insn (gen_ashrsi3_cvt (operands[2], operands[2], GEN_INT (31))); + emit_insn (gen_ashr<mode>3_cvt (operands[2], operands[2], bits)); } emit_move_insn (operands[4], operands[2]); DONE; @@ -4581,69 +4591,79 @@ ;; Peepholes for the case where the source register does die, after ;; being split with the above splitter. (define_peephole2 - [(set (match_operand:SI 0 "memory_operand") - (match_operand:SI 1 "general_reg_operand")) - (set (match_operand:SI 2 "general_reg_operand") (match_dup 1)) + [(set (match_operand:DWIH 0 "memory_operand") + (match_operand:DWIH 1 "general_reg_operand")) + (set (match_operand:DWIH 2 "general_reg_operand") (match_dup 1)) (parallel [(set (match_dup 2) - (ashiftrt:SI (match_dup 2) (const_int 31))) + (ashiftrt:DWIH (match_dup 2) + (match_operand 4 "const_int_operand"))) (clobber (reg:CC FLAGS_REG))]) - (set (match_operand:SI 3 "memory_operand") (match_dup 2))] + (set (match_operand:DWIH 3 "memory_operand") (match_dup 2))] "REGNO (operands[1]) != REGNO (operands[2]) + && INTVAL (operands[4]) == (<MODE_SIZE> * BITS_PER_UNIT - 1) && peep2_reg_dead_p (2, operands[1]) && peep2_reg_dead_p (4, operands[2]) && !reg_mentioned_p (operands[2], operands[3])" [(set (match_dup 0) (match_dup 1)) - (parallel [(set (match_dup 1) (ashiftrt:SI (match_dup 1) (const_int 31))) + (parallel [(set (match_dup 1) (ashiftrt:DWIH (match_dup 1) (match_dup 4))) (clobber (reg:CC FLAGS_REG))]) (set (match_dup 3) (match_dup 1))]) (define_peephole2 - [(set (match_operand:SI 0 "memory_operand") - (match_operand:SI 1 "general_reg_operand")) - (parallel [(set (match_operand:SI 2 "general_reg_operand") - (ashiftrt:SI (match_dup 1) (const_int 31))) + [(set (match_operand:DWIH 0 "memory_operand") + (match_operand:DWIH 1 "general_reg_operand")) + (parallel [(set (match_operand:DWIH 2 "general_reg_operand") + (ashiftrt:DWIH (match_dup 1) + (match_operand 4 "const_int_operand"))) (clobber (reg:CC FLAGS_REG))]) - (set (match_operand:SI 3 "memory_operand") (match_dup 2))] + (set (match_operand:DWIH 3 "memory_operand") (match_dup 2))] "/* cltd is shorter than sarl $31, %eax */ !optimize_function_for_size_p (cfun) && REGNO (operands[1]) == AX_REG && REGNO (operands[2]) == DX_REG + && INTVAL (operands[4]) == (<MODE_SIZE> * BITS_PER_UNIT - 1) && peep2_reg_dead_p (2, operands[1]) && peep2_reg_dead_p (3, operands[2]) && !reg_mentioned_p (operands[2], operands[3])" [(set (match_dup 0) (match_dup 1)) - (parallel [(set (match_dup 1) (ashiftrt:SI (match_dup 1) (const_int 31))) + (parallel [(set (match_dup 1) (ashiftrt:DWIH (match_dup 1) (match_dup 4))) (clobber (reg:CC FLAGS_REG))]) (set (match_dup 3) (match_dup 1))]) ;; Extend to register case. Optimize case where source and destination ;; registers match and cases where we can use cltd. (define_split - [(set (match_operand:DI 0 "register_operand") - (sign_extend:DI (match_operand:SI 1 "register_operand"))) + [(set (match_operand:<DWI> 0 "register_operand") + (sign_extend:<DWI> (match_operand:DWIH 1 "register_operand"))) (clobber (reg:CC FLAGS_REG)) - (clobber (match_scratch:SI 2))] + (clobber (match_scratch:DWIH 2))] "reload_completed" [(const_int 0)] { - split_double_mode (DImode, &operands[0], 1, &operands[3], &operands[4]); + rtx bits = GEN_INT (<MODE_SIZE> * BITS_PER_UNIT - 1); + + split_double_mode (<DWI>mode, &operands[0], 1, &operands[3], &operands[4]); if (REGNO (operands[3]) != REGNO (operands[1])) emit_move_insn (operands[3], operands[1]); + rtx src = operands[1]; + if (REGNO (operands[3]) == AX_REG) + src = operands[3]; + /* Generate a cltd if possible and doing so it profitable. */ if ((optimize_function_for_size_p (cfun) || TARGET_USE_CLTD) - && REGNO (operands[3]) == AX_REG + && REGNO (src) == AX_REG && REGNO (operands[4]) == DX_REG) { - emit_insn (gen_ashrsi3_cvt (operands[4], operands[3], GEN_INT (31))); + emit_insn (gen_ashr<mode>3_cvt (operands[4], src, bits)); DONE; } if (REGNO (operands[4]) != REGNO (operands[1])) emit_move_insn (operands[4], operands[1]); - emit_insn (gen_ashrsi3_cvt (operands[4], operands[4], GEN_INT (31))); + emit_insn (gen_ashr<mode>3_cvt (operands[4], operands[4], bits)); DONE; }) diff --git a/gcc/testsuite/gcc.target/i386/extendditi2-1.c b/gcc/testsuite/gcc.target/i386/extendditi2-1.c new file mode 100644 index 0000000..efbad0e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/extendditi2-1.c @@ -0,0 +1,8 @@ +/* { dg-do compile { target int128 } } */ +/* { dg-options "-O2" } */ + +__int128 foo(long long x) +{ + return (__int128)x; +} +/* { dg-final { scan-assembler "cqt?o" } } */ diff --git a/gcc/testsuite/gcc.target/i386/extendditi2-2.c b/gcc/testsuite/gcc.target/i386/extendditi2-2.c new file mode 100644 index 0000000..dbfa6fb --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/extendditi2-2.c @@ -0,0 +1,13 @@ +/* { dg-do compile { target int128 } } */ +/* { dg-options "-O2" } */ + +__int128 foo(__int128 a, long long b) { + a += ((__int128)b) << 70; + return a; +} + +__int128 bar(__int128 a, unsigned long long b) { + a += ((__int128)b) << 70; + return a; +} +/* { dg-final { scan-assembler-not "movq" } } */