Uros, The following is the change compared to version 4 after fix all the issues you raised in the previous email.
Let me know if there is any other issue. Thanks. Qing --- gcc/config/i386/i386.c | 162 +++++++++++++++++---- .../gcc.target/i386/zero-scratch-regs-28.c | 16 ++ .../gcc.target/i386/zero-scratch-regs-29.c | 10 ++ .../gcc.target/i386/zero-scratch-regs-30.c | 11 ++ .../gcc.target/i386/zero-scratch-regs-31.c | 16 ++ 5 files changed, 188 insertions(+), 27 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/zero-scratch-regs-28.c create mode 100644 gcc/testsuite/gcc.target/i386/zero-scratch-regs-29.c create mode 100644 gcc/testsuite/gcc.target/i386/zero-scratch-regs-30.c create mode 100644 gcc/testsuite/gcc.target/i386/zero-scratch-regs-31.c diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index e66dcf0d587..e6c5001b11e 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -3554,17 +3554,17 @@ ix86_function_value_regno_p (const unsigned int regno) /* Check whether the register REGNO should be zeroed on X86. When ALL_SSE_ZEROED is true, all SSE registers have been zeroed together, no need to zero it again. - Stack registers (st0-st7) and mm0-mm7 are aliased with each other. - very hard to be zeroed individually, don't zero individual st or - mm registgers. */ + When NEED_ZERO_MMX is true, MMX registers should be cleared. */ static bool zero_call_used_regno_p (const unsigned int regno, - bool all_sse_zeroed) + bool all_sse_zeroed, + bool need_zero_mmx) { return GENERAL_REGNO_P (regno) || (!all_sse_zeroed && SSE_REGNO_P (regno)) - || MASK_REGNO_P (regno); + || MASK_REGNO_P (regno) + || (need_zero_mmx && MMX_REGNO_P (regno)); } /* Return the machine_mode that is used to zero register REGNO. */ @@ -3579,8 +3579,12 @@ zero_call_used_regno_mode (const unsigned int regno) return SImode; else if (SSE_REGNO_P (regno)) return V4SFmode; - else + else if (MASK_REGNO_P (regno)) return HImode; + else if (MMX_REGNO_P (regno)) + return V4HImode; + else + gcc_unreachable (); } /* Generate a rtx to zero all vector registers together if possible, @@ -3603,7 +3607,7 @@ zero_all_vector_registers (HARD_REG_SET need_zeroed_hardregs) return gen_avx_vzeroall (); } -/* Generate insns to zero all st/mm registers together. +/* Generate insns to zero all st registers together. Return true when zeroing instructions are generated. Assume the number of st registers that are zeroed is num_of_st, we will emit the following sequence to zero them together: @@ -3616,23 +3620,49 @@ zero_all_vector_registers (HARD_REG_SET need_zeroed_hardregs) ... fstp %%st(0); i.e., num_of_st fldz followed by num_of_st fstp to clear the stack - mark stack slots empty. */ + mark stack slots empty. + + How to compute the num_of_st? + There is no direct mapping from stack registers to hard register + numbers. If one stack register need to be cleared, we don't know + where in the stack the value remains. So, if any stack register + need to be cleared, the whole stack should be cleared. However, + x87 stack registers that hold the return value should be excluded. + x87 returns in the top (two for complex values) register, so + num_of_st should be 7/6 when x87 returns, otherwise it will be 8. */ + static bool -zero_all_st_mm_registers (HARD_REG_SET need_zeroed_hardregs) +zero_all_st_registers (HARD_REG_SET need_zeroed_hardregs) { unsigned int num_of_st = 0; for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) - if (STACK_REGNO_P (regno) - && TEST_HARD_REG_BIT (need_zeroed_hardregs, regno) - /* When the corresponding mm register also need to be cleared too. */ - && TEST_HARD_REG_BIT (need_zeroed_hardregs, - (regno - FIRST_STACK_REG + FIRST_MMX_REG))) - num_of_st++; + if ((STACK_REGNO_P (regno) || MMX_REGNO_P (regno)) + && TEST_HARD_REG_BIT (need_zeroed_hardregs, regno)) + { + num_of_st++; + break; + } if (num_of_st == 0) return false; + bool return_with_x87 = false; + return_with_x87 = (crtl->return_rtx + && (STACK_REG_P (crtl->return_rtx))); + + bool complex_return = false; + complex_return = (crtl->return_rtx + && COMPLEX_MODE_P (GET_MODE (crtl->return_rtx))); + + if (return_with_x87) + if (complex_return) + num_of_st = 6; + else + num_of_st = 7; + else + num_of_st = 8; + rtx st_reg = gen_rtx_REG (XFmode, FIRST_STACK_REG); for (unsigned int i = 0; i < num_of_st; i++) emit_insn (gen_rtx_SET (st_reg, CONST0_RTX (XFmode))); @@ -3646,6 +3676,43 @@ zero_all_st_mm_registers (HARD_REG_SET need_zeroed_hardregs) return true; } + +/* When the routine exit with MMX mode, if there is any ST registers + need to be zeroed, we should clear all MMX registers except the + one that holds the return value RET_MMX_REGNO. */ +static bool +zero_all_mm_registers (HARD_REG_SET need_zeroed_hardregs, + unsigned int ret_mmx_regno) +{ + bool need_zero_all_mm = false; + for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + if (STACK_REGNO_P (regno) + && TEST_HARD_REG_BIT (need_zeroed_hardregs, regno)) + { + need_zero_all_mm = true; + break; + } + + if (!need_zero_all_mm) + return false; + + rtx zero_mmx = NULL_RTX; + machine_mode mode = V4HImode; + for (unsigned int regno = FIRST_MMX_REG; regno <= LAST_MMX_REG; regno++) + if (regno != ret_mmx_regno) + { + rtx reg = gen_rtx_REG (mode, regno); + if (zero_mmx == NULL_RTX) + { + zero_mmx = reg; + emit_insn (gen_rtx_SET (reg, CONST0_RTX(mode))); + } + else + emit_move_insn (reg, zero_mmx); + } + return true; +} + /* TARGET_ZERO_CALL_USED_REGS. */ /* Generate a sequence of instructions that zero registers specified by NEED_ZEROED_HARDREGS. Return the ZEROED_HARDREGS that are actually @@ -3655,7 +3722,10 @@ ix86_zero_call_used_regs (HARD_REG_SET need_zeroed_hardregs) { HARD_REG_SET zeroed_hardregs; bool all_sse_zeroed = false; - bool st_zeroed = false; + bool all_st_zeroed = false; + bool all_mm_zeroed = false; + + CLEAR_HARD_REG_SET (zeroed_hardregs); /* first, let's see whether we can zero all vector registers together. */ rtx zero_all_vec_insn = zero_all_vector_registers (need_zeroed_hardregs); @@ -3665,38 +3735,67 @@ ix86_zero_call_used_regs (HARD_REG_SET need_zeroed_hardregs) all_sse_zeroed = true; } - /* then, let's see whether we can zero all st+mm registers togeter. */ - st_zeroed = zero_all_st_mm_registers (need_zeroed_hardregs); + /* mm/st registers are shared registers set, we should follow the following + rules to clear them: + MMX exit mode x87 exit mode + -------------|----------------------|--------------- + uses x87 reg | clear all MMX | clear all x87 + uses MMX reg | clear individual MMX | clear all x87 + x87 + MMX | clear all MMX | clear all x87 - /* Now, generate instructions to zero all the registers. */ + first, we should decide which mode (MMX mode or x87 mode) the function + exit with. */ - CLEAR_HARD_REG_SET (zeroed_hardregs); - if (st_zeroed) - SET_HARD_REG_BIT (zeroed_hardregs, FIRST_STACK_REG); + bool exit_with_mmx_mode = (crtl->return_rtx + && (MMX_REG_P (crtl->return_rtx))); + + if (!exit_with_mmx_mode) + /* x87 exit mode, we should zero all st registers together. */ + { + all_st_zeroed = zero_all_st_registers (need_zeroed_hardregs); + if (all_st_zeroed) + SET_HARD_REG_BIT (zeroed_hardregs, FIRST_STACK_REG); + } + else + /* MMX exit mode, check whether we can zero all mm registers. */ + { + unsigned int exit_mmx_regno = REGNO (crtl->return_rtx); + all_mm_zeroed = zero_all_mm_registers (need_zeroed_hardregs, + exit_mmx_regno); + if (all_mm_zeroed) + for (unsigned int regno = FIRST_MMX_REG; regno <= LAST_MMX_REG; regno++) + if (regno != exit_mmx_regno) + SET_HARD_REG_BIT (zeroed_hardregs, regno); + } + + /* Now, generate instructions to zero all the other registers. */ rtx zero_gpr = NULL_RTX; rtx zero_vector = NULL_RTX; rtx zero_mask = NULL_RTX; + rtx zero_mmx = NULL_RTX; for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) { if (!TEST_HARD_REG_BIT (need_zeroed_hardregs, regno)) continue; - if (!zero_call_used_regno_p (regno, all_sse_zeroed)) + if (!zero_call_used_regno_p (regno, all_sse_zeroed, + exit_with_mmx_mode && !all_mm_zeroed)) continue; SET_HARD_REG_BIT (zeroed_hardregs, regno); - rtx reg, tmp; + rtx reg, tmp, zero_rtx; machine_mode mode = zero_call_used_regno_mode (regno); reg = gen_rtx_REG (mode, regno); + zero_rtx = CONST0_RTX (mode); if (mode == SImode) if (zero_gpr == NULL_RTX) { zero_gpr = reg; - tmp = gen_rtx_SET (reg, const0_rtx); + tmp = gen_rtx_SET (reg, zero_rtx); if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ()) { rtx clob = gen_rtx_CLOBBER (VOIDmode, @@ -3714,7 +3813,7 @@ ix86_zero_call_used_regs (HARD_REG_SET need_zeroed_hardregs) if (zero_vector == NULL_RTX) { zero_vector = reg; - tmp = gen_rtx_SET (reg, const0_rtx); + tmp = gen_rtx_SET (reg, zero_rtx); emit_insn (tmp); } else @@ -3723,11 +3822,20 @@ ix86_zero_call_used_regs (HARD_REG_SET need_zeroed_hardregs) if (zero_mask == NULL_RTX) { zero_mask = reg; - tmp = gen_rtx_SET (reg, const0_rtx); + tmp = gen_rtx_SET (reg, zero_rtx); emit_insn (tmp); } else emit_move_insn (reg, zero_mask); + else if (mode == V4HImode) + if (zero_mmx == NULL_RTX) + { + zero_mmx = reg; + tmp = gen_rtx_SET (reg, zero_rtx); + emit_insn (tmp); + } + else + emit_move_insn (reg, zero_mmx); else gcc_unreachable (); } diff --git a/gcc/testsuite/gcc.target/i386/zero-scratch-regs-28.c b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-28.c new file mode 100644 index 00000000000..48b1f019a28 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-28.c @@ -0,0 +1,16 @@ +/* { dg-do compile { target *-*-linux* } } */ +/* { dg-options "-O2 -mmmx -fzero-call-used-regs=all" } */ +/* { dg-require-effective-target ia32 } */ + +__v2si ret_mmx (void) +{ + return (__v2si) { 123, 345 }; +} + +/* { dg-final { scan-assembler "pxor\[ \t\]*%mm1, %mm1" } } */ +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm2" } } */ +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm3" } } */ +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm4" } } */ +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm5" } } */ +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm6" } } */ +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm7" } } */ diff --git a/gcc/testsuite/gcc.target/i386/zero-scratch-regs-29.c b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-29.c new file mode 100644 index 00000000000..8b5e1cd1602 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-29.c @@ -0,0 +1,10 @@ +/* { dg-do compile { target *-*-linux* } } */ +/* { dg-options "-O2 -fzero-call-used-regs=all" } */ + +long double ret_x87 (void) +{ + return 1.1L; +} + +/* { dg-final { scan-assembler-times "fldz" 7 } } */ +/* { dg-final { scan-assembler-times "fstp" 7 } } */ diff --git a/gcc/testsuite/gcc.target/i386/zero-scratch-regs-30.c b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-30.c new file mode 100644 index 00000000000..e6fb4acf0fa --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-30.c @@ -0,0 +1,11 @@ +/* { dg-do compile { target *-*-linux* } } */ +/* { dg-options "-O2 -fzero-call-used-regs=all" } */ +/* { dg-require-effective-target lp64} */ + +_Complex long double ret_x87_cplx (void) +{ + return 1.1L + 1.2iL; +} + +/* { dg-final { scan-assembler-times "fldz" 6 } } */ +/* { dg-final { scan-assembler-times "fstp" 6 } } */ diff --git a/gcc/testsuite/gcc.target/i386/zero-scratch-regs-31.c b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-31.c new file mode 100644 index 00000000000..943508d1d26 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-31.c @@ -0,0 +1,16 @@ +/* { dg-do compile { target *-*-linux* } } */ +/* { dg-options "-O2 -mmmx -fzero-call-used-regs=all-arg" } */ +/* { dg-require-effective-target ia32 } */ + +__v2si ret_mmx (void) +{ + return (__v2si) { 123, 345 }; +} + +/* { dg-final { scan-assembler "pxor\[ \t\]*%mm1, %mm1" } } */ +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm2" } } */ +/* { dg-final { scan-assembler-not "movq\[ \t\]*%mm1, %mm3" } } */ +/* { dg-final { scan-assembler-not "movq\[ \t\]*%mm1, %mm4" } } */ +/* { dg-final { scan-assembler-not "movq\[ \t\]*%mm1, %mm5" } } */ +/* { dg-final { scan-assembler-not "movq\[ \t\]*%mm1, %mm6" } } */ +/* { dg-final { scan-assembler-not "movq\[ \t\]*%mm1, %mm7" } } */ -- 2.11.0 > On Oct 27, 2020, at 8:55 AM, Qing Zhao via Gcc-patches > <gcc-patches@gcc.gnu.org> wrote: > > > >> On Oct 27, 2020, at 3:09 AM, Uros Bizjak <ubiz...@gmail.com> wrote: >> >> On Tue, Oct 27, 2020 at 12:08 AM Qing Zhao <qing.z...@oracle.com >> <mailto:qing.z...@oracle.com>> wrote: >>> >>> Hi, Uros, >>> >>> Could you please check the change compared to the previous version for >>> i386.c as following: >>> Let me know any issue there. >> >> It looks that the combination when the function only touches MMX >> registers (so, no x87 register is touched) and exits in MMX mode is >> not handled in the optimal way. > > My current code should handle this in the expected way already, as following: > > > /* Then, decide which mode (MMX mode or x87 mode) the function exit with. > In order to decide whether we need to clear the MMX registers or the > stack registers. */ > > bool exit_with_mmx_mode = (crtl->return_rtx > && (GET_CODE (crtl->return_rtx) == REG) > && (MMX_REG_P (crtl->return_rtx))); > > /* then, let's see whether we can zero all st registers together. */ > if (!exit_with_mmx_mode) > all_st_zeroed = zero_all_st_registers (need_zeroed_hardregs); > /* Or should we zero all MMX registers. */ > else > { > unsigned int exit_mmx_regno = REGNO (crtl->return_rtx); > all_mm_zeroed = zero_all_mm_registers (need_zeroed_hardregs, > exit_mmx_regno); > } > > > “Zero_all_mm_registers” only zero all MM registers when any ST register need > to be cleared. Otherwise, it will not clear all MM registers. > And individual MM registers will be cleared in the regular loop as all other > registers. > >> In this case, MMX registers should be >> handled in the same way as XMM registers, where only used/arg/all regs >> can be cleared. >> >> MMX exit mode x87 exit mode >> -------------|----------------------|--------------- >> uses x87 reg | clear all MMX | clear all x87 >> uses MMX reg | clear individual MMX | clear all x87 >> x87 + MMX | clear all MMX | clear all x87 >> >> IOW, if x87 is used, we don't know where in the stack (or in which MMX >> "register") the value lies. But when the function uses only MMX >> registers and exits in MMX mode, we know which register was used, and >> we *can* access them individually. > > I will add the above table to the comment part of the implementation. >> >> Also, do we want to handle only arg/used registers? > > Yes. Arg/used register handling has been done in middle end. (In > gcc/function.c) as following: > > /* For each of the hard registers, check to see whether we should zero it if: > 1. it is a call-used-registers; > and 2. it is not a fixed-registers; > and 3. it is not live at the return of the routine; > and 4. it is general registor if gpr_only is true; > and 5. it is used in the routine if used_only is true; > and 6. it is a register that passes parameter if arg_only is true; > */ > > The register set that i386 backend gets already satisfied all the above > requirement. > >> x87 has no arg >> registers, so there is no need to clear anything. MMX has 3 argument >> registers for 32bit targets, and is possible to clear them >> individually when the function exits in MMX mode. > > The above information should already been covered by : > > if (arg_only && !FUNCTION_ARG_REGNO_P (regno)) > > Right? > > >> >> Please note review comments inline. >> >> Uros. >> >>> Thanks a lot. >>> >>> Qing >>> >>> --- >>> gcc/config/i386/i386.c | 136 >>> ++++++++++++++++++--- >>> .../gcc.target/i386/zero-scratch-regs-28.c | 17 +++ >>> .../gcc.target/i386/zero-scratch-regs-29.c | 11 ++ >>> .../gcc.target/i386/zero-scratch-regs-30.c | 11 ++ >>> 4 files changed, 155 insertions(+), 20 deletions(-) >>> create mode 100644 gcc/testsuite/gcc.target/i386/zero-scratch-regs-28.c >>> create mode 100644 gcc/testsuite/gcc.target/i386/zero-scratch-regs-29.c >>> create mode 100644 gcc/testsuite/gcc.target/i386/zero-scratch-regs-30.c >>> >>> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c >>> index e66dcf0d587..65f778112d9 100644 >>> --- a/gcc/config/i386/i386.c >>> +++ b/gcc/config/i386/i386.c >>> @@ -3554,17 +3554,17 @@ ix86_function_value_regno_p (const unsigned int >>> regno) >>> /* Check whether the register REGNO should be zeroed on X86. >>> When ALL_SSE_ZEROED is true, all SSE registers have been zeroed >>> together, no need to zero it again. >>> - Stack registers (st0-st7) and mm0-mm7 are aliased with each other. >>> - very hard to be zeroed individually, don't zero individual st or >>> - mm registgers. */ >>> + When NEED_ZERO_MMX is true, MMX registers should be cleared. */ >>> >>> static bool >>> zero_call_used_regno_p (const unsigned int regno, >>> - bool all_sse_zeroed) >>> + bool all_sse_zeroed, >>> + bool need_zero_mmx) >>> { >>> return GENERAL_REGNO_P (regno) >>> || (!all_sse_zeroed && SSE_REGNO_P (regno)) >>> - || MASK_REGNO_P (regno); >>> + || MASK_REGNO_P (regno) >>> + || (need_zero_mmx && MMX_REGNO_P (regno)); >>> } >>> >>> /* Return the machine_mode that is used to zero register REGNO. */ >>> @@ -3579,8 +3579,12 @@ zero_call_used_regno_mode (const unsigned int regno) >>> return SImode; >>> else if (SSE_REGNO_P (regno)) >>> return V4SFmode; >>> - else >>> + else if (MASK_REGNO_P (regno)) >>> return HImode; >>> + else if (MMX_REGNO_P (regno)) >>> + return DImode; >> >> Why DImode instead of V4HImode? > > I tried V4HImode, and V2SImode in the beginning, all failed during > compilation time with “unrecognized inns” error, so, I have to use “DImode”. > >> DImode is "natural" for integer >> registers, and we risk moves from integer to MMX regs. > > So, does this mean using DImode is not correct? >> >>> + else >>> + gcc_unreachable (); >>> } >>> >>> /* Generate a rtx to zero all vector registers together if possible, >>> @@ -3603,7 +3607,7 @@ zero_all_vector_registers (HARD_REG_SET >>> need_zeroed_hardregs) >>> return gen_avx_vzeroall (); >>> } >>> >>> -/* Generate insns to zero all st/mm registers together. >>> +/* Generate insns to zero all st registers together. >>> Return true when zeroing instructions are generated. >>> Assume the number of st registers that are zeroed is num_of_st, >>> we will emit the following sequence to zero them together: >>> @@ -3616,23 +3620,50 @@ zero_all_vector_registers (HARD_REG_SET >>> need_zeroed_hardregs) >>> ... >>> fstp %%st(0); >>> i.e., num_of_st fldz followed by num_of_st fstp to clear the stack >>> - mark stack slots empty. */ >>> + mark stack slots empty. >>> + >>> + How to compute the num_of_st? >>> + There is no direct mapping from stack registers to hard register >>> + numbers. If one stack register need to be cleared, we don't know >>> + where in the stack the value remains. So, if any stack register >>> + need to be cleared, the whole stack should be cleared. However, >>> + x87 stack registers that hold the return value should be excluded. >>> + x87 returns in the top (two for complex values) register, so >>> + num_of_st should be 7/6 when x87 returns, otherwise it will be 8. */ >>> + >>> >>> static bool >>> -zero_all_st_mm_registers (HARD_REG_SET need_zeroed_hardregs) >>> +zero_all_st_registers (HARD_REG_SET need_zeroed_hardregs) >>> { >>> unsigned int num_of_st = 0; >>> for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) >>> - if (STACK_REGNO_P (regno) >>> - && TEST_HARD_REG_BIT (need_zeroed_hardregs, regno) >>> - /* When the corresponding mm register also need to be cleared too. */ >>> - && TEST_HARD_REG_BIT (need_zeroed_hardregs, >>> - (regno - FIRST_STACK_REG + FIRST_MMX_REG))) >>> - num_of_st++; >>> + if ((STACK_REGNO_P (regno) || MMX_REGNO_P (regno)) >>> + && TEST_HARD_REG_BIT (need_zeroed_hardregs, regno)) >>> + { >>> + num_of_st++; >>> + break; >>> + } >>> >>> if (num_of_st == 0) >>> return false; >>> >>> + bool return_with_x87 = false; >>> + return_with_x87 = (crtl->return_rtx >>> + && (GET_CODE (crtl->return_rtx) == REG) >>> + && (STACK_REG_P (crtl->return_rtx))); >> >> STACK_REG_P already checks for REG, no need for separate check. > > Okay. > >> >>> + >>> + bool complex_return = false; >>> + complex_return = (crtl->return_rtx >>> + && COMPLEX_MODE_P (GET_MODE (crtl->return_rtx))); >>> + >>> + if (return_with_x87) >>> + if (complex_return) >>> + num_of_st = 6; >>> + else >>> + num_of_st = 7; >>> + else >>> + num_of_st = 8; >>> + >>> rtx st_reg = gen_rtx_REG (XFmode, FIRST_STACK_REG); >>> for (unsigned int i = 0; i < num_of_st; i++) >>> emit_insn (gen_rtx_SET (st_reg, CONST0_RTX (XFmode))); >>> @@ -3646,6 +3677,43 @@ zero_all_st_mm_registers (HARD_REG_SET >>> need_zeroed_hardregs) >>> return true; >>> } >>> >>> + >>> +/* When the routine exit with MMX mode, if there is any ST registers >>> + need to be zeroed, we should clear all MMX registers except the >>> + one that holds the return value RET_MMX_REGNO. */ >>> +static bool >>> +zero_all_mm_registers (HARD_REG_SET need_zeroed_hardregs, >>> + unsigned int ret_mmx_regno) >>> +{ >>> + bool need_zero_all_mm = false; >>> + for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) >>> + if (STACK_REGNO_P (regno) >>> + && TEST_HARD_REG_BIT (need_zeroed_hardregs, regno)) >>> + { >>> + need_zero_all_mm = true; >>> + break; >>> + } >>> + >>> + if (!need_zero_all_mm) >>> + return false; >>> + >>> + rtx zero_mmx = NULL_RTX; >>> + machine_mode mode = DImode; >>> + for (unsigned int regno = FIRST_MMX_REG; regno <= LAST_MMX_REG; regno++) >>> + if (regno != ret_mmx_regno) >>> + { >>> + rtx reg = gen_rtx_REG (mode, regno); >>> + if (zero_mmx == NULL_RTX) >>> + { >>> + zero_mmx = reg; >>> + emit_insn (gen_rtx_SET (reg, const0_rtx)); >> >> Use CONST0_RTX (mode), and you will be able to use V4HImode instead of >> DImode. > Will try this. >> >>> + } >>> + else >>> + emit_move_insn (reg, zero_mmx); >>> + } >>> + return true; >>> +} >>> + >>> /* TARGET_ZERO_CALL_USED_REGS. */ >>> /* Generate a sequence of instructions that zero registers specified by >>> NEED_ZEROED_HARDREGS. Return the ZEROED_HARDREGS that are actually >>> @@ -3655,7 +3723,8 @@ ix86_zero_call_used_regs (HARD_REG_SET >>> need_zeroed_hardregs) >>> { >>> HARD_REG_SET zeroed_hardregs; >>> bool all_sse_zeroed = false; >>> - bool st_zeroed = false; >>> + bool all_st_zeroed = false; >>> + bool all_mm_zeroed = false; >>> >>> /* first, let's see whether we can zero all vector registers together. */ >>> rtx zero_all_vec_insn = zero_all_vector_registers (need_zeroed_hardregs); >>> @@ -3665,24 +3734,42 @@ ix86_zero_call_used_regs (HARD_REG_SET >>> need_zeroed_hardregs) >>> all_sse_zeroed = true; >>> } >>> >>> - /* then, let's see whether we can zero all st+mm registers togeter. */ >>> - st_zeroed = zero_all_st_mm_registers (need_zeroed_hardregs); >>> + /* Then, decide which mode (MMX mode or x87 mode) the function exit with. >>> + In order to decide whether we need to clear the MMX registers or the >>> + stack registers. */ >>> + >>> + bool exit_with_mmx_mode = (crtl->return_rtx >>> + && (GET_CODE (crtl->return_rtx) == REG) >>> + && (MMX_REG_P (crtl->return_rtx))); >> >> MMX_REG_P also checks for REG internally. > > Okay, will update. >> >>> + >>> + /* then, let's see whether we can zero all st registers together. */ >>> + if (!exit_with_mmx_mode) >>> + all_st_zeroed = zero_all_st_registers (need_zeroed_hardregs); >>> + /* Or should we zero all MMX registers. */ >>> + else >>> + { >>> + unsigned int exit_mmx_regno = REGNO (crtl->return_rtx); >>> + all_mm_zeroed = zero_all_mm_registers (need_zeroed_hardregs, >>> + exit_mmx_regno); >>> + } >>> >>> /* Now, generate instructions to zero all the registers. */ >>> >>> CLEAR_HARD_REG_SET (zeroed_hardregs); >>> - if (st_zeroed) >>> + if (all_st_zeroed) >>> SET_HARD_REG_BIT (zeroed_hardregs, FIRST_STACK_REG); >>> >>> rtx zero_gpr = NULL_RTX; >>> rtx zero_vector = NULL_RTX; >>> rtx zero_mask = NULL_RTX; >>> + rtx zero_mmx = NULL_RTX; >>> >>> for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) >>> { >>> if (!TEST_HARD_REG_BIT (need_zeroed_hardregs, regno)) >>> continue; >>> - if (!zero_call_used_regno_p (regno, all_sse_zeroed)) >>> + if (!zero_call_used_regno_p (regno, all_sse_zeroed, >>> + exit_with_mmx_mode && !all_mm_zeroed)) >>> continue; >>> >>> SET_HARD_REG_BIT (zeroed_hardregs, regno); >>> @@ -3728,6 +3815,15 @@ ix86_zero_call_used_regs (HARD_REG_SET >>> need_zeroed_hardregs) >>> } >>> else >>> emit_move_insn (reg, zero_mask); >>> + else if (mode == DImode) >>> + if (zero_mmx == NULL_RTX) >>> + { >>> + zero_mmx = reg; >>> + tmp = gen_rtx_SET (reg, const0_rtx); >> >> CONST0_RTX (mode), and you will be able to use V4HImode. > > Okay, will try this. >> >>> + emit_insn (tmp); >>> + } >>> + else >>> + emit_move_insn (reg, zero_mmx); >>> else >>> gcc_unreachable (); >>> } >>> diff --git a/gcc/testsuite/gcc.target/i386/zero-scratch-regs-28.c >>> b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-28.c >>> new file mode 100644 >>> index 00000000000..61c0bb7a35c >>> --- /dev/null >>> +++ b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-28.c >>> @@ -0,0 +1,17 @@ >>> +/* { dg-do compile { target *-*-linux* } } */ >>> +/* { dg-options "-O2 -m32 -mmmx -fzero-call-used-regs=all" } */ >> >> -m32 should not be used explicitly. Use: >> >> /* { dg-require-effective-target ia32 } */ >> >> instead. >> >> Also, can we test -fzero-call-used-regs=used and >> -fzero-call-used-regs=arg with MMX regs? As said above, when function >> exits in MMX mode, and no x87 is touched, we can clear separate MMX >> registers. > > I will try to add these new testing. >> >>> + >>> +typedef int __v2si __attribute__ ((vector_size (8))); >>> + >>> +__v2si ret_mmx (void) >>> +{ >>> + return (__v2si) { 123, 345 }; >>> +} >>> + >>> +/* { dg-final { scan-assembler "pxor\[ \t\]*%mm1, %mm1" } } */ >>> +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm2" } } */ >>> +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm3" } } */ >>> +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm4" } } */ >>> +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm5" } } */ >>> +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm6" } } */ >>> +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm7" } } */ >>> diff --git a/gcc/testsuite/gcc.target/i386/zero-scratch-regs-29.c >>> b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-29.c >>> new file mode 100644 >>> index 00000000000..db636654e70 >>> --- /dev/null >>> +++ b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-29.c >>> @@ -0,0 +1,11 @@ >>> +/* { dg-do compile { target *-*-linux* } } */ >>> +/* { dg-options "-O2 -m32 -mmmx -fzero-call-used-regs=all" } */ >> >> No need for "-m32 -mmmx", this test works the same for 32bit and 64bit >> targets.. > > Will fix this. >> >>> +typedef int __v2si __attribute__ ((vector_size (8))); >> >> The above is not needed in this test. > > Okay. >>> + >>> +long double ret_x87 (void) >>> +{ >>> + return 1.1L; >>> +} >>> + >>> +/* { dg-final { scan-assembler-times "fldz" 7 } } */ >>> +/* { dg-final { scan-assembler-times "fstp" 7 } } */ >>> diff --git a/gcc/testsuite/gcc.target/i386/zero-scratch-regs-30.c >>> b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-30.c >>> new file mode 100644 >>> index 00000000000..7c20b569bfa >>> --- /dev/null >>> +++ b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-30.c >>> @@ -0,0 +1,11 @@ >>> +/* { dg-do compile { target *-*-linux* } } */ >>> +/* { dg-options "-O2 -fzero-call-used-regs=all" } */ >>> +typedef int __v2si __attribute__ ((vector_size (8))); >> >> The above line is not needed. > Okay. > >> >>> + >>> +_Complex long double ret_x87_cplx (void) >>> +{ >>> + return 1.1L + 1.2iL; >>> +} >>> + >>> +/* { dg-final { scan-assembler-times "fldz" 6 } } */ >>> +/* { dg-final { scan-assembler-times "fstp" 6 } } */ >> >> The above applies only to 64bit target. 32bit targets pass complex >> value via memory and should clear all 8 registers. > > Will fix this. > > Thanks. > > Qing >