On Tue, Oct 27, 2020 at 5:10 PM Qing Zhao <qing.z...@oracle.com> wrote: > > Uros, > > The following is the change compared to version 4 after fix all the issues > you raised in the previous email. > > Let me know if there is any other issue.
LGTM for x86 part, with a couple of small review comments inline. Thanks, Uros. > Thanks. > > Qing > > --- > gcc/config/i386/i386.c | 162 > +++++++++++++++++---- > .../gcc.target/i386/zero-scratch-regs-28.c | 16 ++ > .../gcc.target/i386/zero-scratch-regs-29.c | 10 ++ > .../gcc.target/i386/zero-scratch-regs-30.c | 11 ++ > .../gcc.target/i386/zero-scratch-regs-31.c | 16 ++ > 5 files changed, 188 insertions(+), 27 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/zero-scratch-regs-28.c > create mode 100644 gcc/testsuite/gcc.target/i386/zero-scratch-regs-29.c > create mode 100644 gcc/testsuite/gcc.target/i386/zero-scratchregs-30.c > create mode 100644 gcc/testsuite/gcc.target/i386/zero-scratch-regs-31.c > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > index e66dcf0d587..e6c5001b11e 100644 > --- a/gcc/config/i386/i386.c > +++ b/gcc/config/i386/i386.c > @@ -3554,17 +3554,17 @@ ix86_function_value_regno_p (const unsigned int regno) > /* Check whether the register REGNO should be zeroed on X86. > When ALL_SSE_ZEROED is true, all SSE registers have been zeroed > together, no need to zero it again. > - Stack registers (st0-st7) and mm0-mm7 are aliased with each other. > - very hard to be zeroed individually, don't zero individual st or > - mm registgers. */ > + When NEED_ZERO_MMX is true, MMX registers should be cleared. */ > > static bool > zero_call_used_regno_p (const unsigned int regno, > - bool all_sse_zeroed) > + bool all_sse_zeroed, > + bool need_zero_mmx) > { > return GENERAL_REGNO_P (regno) > || (!all_sse_zeroed && SSE_REGNO_P (regno)) > - || MASK_REGNO_P (regno); > + || MASK_REGNO_P (regno) > + || (need_zero_mmx && MMX_REGNO_P (regno)); > } > > /* Return the machine_mode that is used to zero register REGNO. */ > @@ -3579,8 +3579,12 @@ zero_call_used_regno_mode (const unsigned int regno) > return SImode; > else if (SSE_REGNO_P (regno)) > return V4SFmode; > - else > + else if (MASK_REGNO_P (regno)) > return HImode; > + else if (MMX_REGNO_P (regno)) > + return V4HImode; > + else > + gcc_unreachable (); > } > > /* Generate a rtx to zero all vector registers together if possible, > @@ -3603,7 +3607,7 @@ zero_all_vector_registers (HARD_REG_SET > need_zeroed_hardregs) > return gen_avx_vzeroall (); > } > > -/* Generate insns to zero all st/mm registers together. > +/* Generate insns to zero all st registers together. > Return true when zeroing instructions are generated. > Assume the number of st registers that are zeroed is num_of_st, > we will emit the following sequence to zero them together: > @@ -3616,23 +3620,49 @@ zero_all_vector_registers (HARD_REG_SET > need_zeroed_hardregs) > ... > fstp %%st(0); > i.e., num_of_st fldz followed by num_of_st fstp to clear the stack > - mark stack slots empty. */ > + mark stack slots empty. > + > + How to compute the num_of_st? > + There is no direct mapping from stack registers to hard register > + numbers. If one stack register need to be cleared, we don't know needs > + where in the stack the value remains. So, if any stack register > + need to be cleared, the whole stack should be cleared. However, needs > + x87 stack registers that hold the return value should be excluded. > + x87 returns in the top (two for complex values) register, so > + num_of_st should be 7/6 when x87 returns, otherwise it will be 8. */ > + > > static bool > -zero_all_st_mm_registers (HARD_REG_SET need_zeroed_hardregs) > +zero_all_st_registers (HARD_REG_SET need_zeroed_hardregs) > { > unsigned int num_of_st = 0; > for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) > - if (STACK_REGNO_P (regno) > - && TEST_HARD_REG_BIT (need_zeroed_hardregs, regno) > - /* When the corresponding mm register also need to be cleared too. */ > - && TEST_HARD_REG_BIT (need_zeroed_hardregs, > - (regno - FIRST_STACK_REG + FIRST_MMX_REG))) > - num_of_st++; > + if ((STACK_REGNO_P (regno) || MMX_REGNO_P (regno)) > + && TEST_HARD_REG_BIT (need_zeroed_hardregs, regno)) > + { > + num_of_st++; > + break; > + } > > if (num_of_st == 0) > return false; > > + bool return_with_x87 = false; > + return_with_x87 = (crtl->return_rtx > + && (STACK_REG_P (crtl->return_rtx))); > + > + bool complex_return = false; > + complex_return = (crtl->return_rtx > + && COMPLEX_MODE_P (GET_MODE (crtl->return_rtx))); > + > + if (return_with_x87) > + if (complex_return) > + num_of_st = 6; > + else > + num_of_st = 7; > + else > + num_of_st = 8; > + > rtx st_reg = gen_rtx_REG (XFmode, FIRST_STACK_REG); > for (unsigned int i = 0; i < num_of_st; i++) > emit_insn (gen_rtx_SET (st_reg, CONST0_RTX (XFmode))); > @@ -3646,6 +3676,43 @@ zero_all_st_mm_registers (HARD_REG_SET > need_zeroed_hardregs) > return true; > } > > + > +/* When the routine exit with MMX mode, if there is any ST registers ... exits in MMX mode, if any ST register needs ... > + need to be zeroed, we should clear all MMX registers except the > + one that holds the return value RET_MMX_REGNO. */ ... except the RET_MMX_REGNO that holds the return value. > +static bool > +zero_all_mm_registers (HARD_REG_SET need_zeroed_hardregs, > + unsigned int ret_mmx_regno) > +{ > + bool need_zero_all_mm = false; > + for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) > + if (STACK_REGNO_P (regno) > + && TEST_HARD_REG_BIT (need_zeroed_hardregs, regno)) > + { > + need_zero_all_mm = true; > + break; > + } > + > + if (!need_zero_all_mm) > + return false; > + > + rtx zero_mmx = NULL_RTX; > + machine_mode mode = V4HImode; > + for (unsigned int regno = FIRST_MMX_REG; regno <= LAST_MMX_REG; regno++) > + if (regno != ret_mmx_regno) > + { > + rtx reg = gen_rtx_REG (mode, regno); > + if (zero_mmx == NULL_RTX) > + { > + zero_mmx = reg; > + emit_insn (gen_rtx_SET (reg, CONST0_RTX(mode))); space after CONST0_RTX > + } > + else > + emit_move_insn (reg, zero_mmx); > + } > + return true; > +} > + > /* TARGET_ZERO_CALL_USED_REGS. */ > /* Generate a sequence of instructions that zero registers specified by > NEED_ZEROED_HARDREGS. Return the ZEROED_HARDREGS that are actually > @@ -3655,7 +3722,10 @@ ix86_zero_call_used_regs (HARD_REG_SET > need_zeroed_hardregs) > { > HARD_REG_SET zeroed_hardregs; > bool all_sse_zeroed = false; > - bool st_zeroed = false; > + bool all_st_zeroed = false; > + bool all_mm_zeroed = false; > + > + CLEAR_HARD_REG_SET (zeroed_hardregs); > > /* first, let's see whether we can zero all vector registers together. */ > rtx zero_all_vec_insn = zero_all_vector_registers (need_zeroed_hardregs); > @@ -3665,38 +3735,67 @@ ix86_zero_call_used_regs (HARD_REG_SET > need_zeroed_hardregs) > all_sse_zeroed = true; > } > > - /* then, let's see whether we can zero all st+mm registers togeter. */ > - st_zeroed = zero_all_st_mm_registers (need_zeroed_hardregs); > + /* mm/st registers are shared registers set, we should follow the following > + rules to clear them: > + MMX exit mode x87 exit mode > + -------------|----------------------|--------------- > + uses x87 reg | clear all MMX | clear all x87 > + uses MMX reg | clear individual MMX | clear all x87 > + x87 + MMX | clear all MMX | clear all x87 > > - /* Now, generate instructions to zero all the registers. */ > + first, we should decide which mode (MMX mode or x87 mode) the function > + exit with. */ > > - CLEAR_HARD_REG_SET (zeroed_hardregs); > - if (st_zeroed) > - SET_HARD_REG_BIT (zeroed_hardregs, FIRST_STACK_REG); > + bool exit_with_mmx_mode = (crtl->return_rtx > + && (MMX_REG_P (crtl->return_rtx))); > + > + if (!exit_with_mmx_mode) > + /* x87 exit mode, we should zero all st registers together. */ > + { > + all_st_zeroed = zero_all_st_registers (need_zeroed_hardregs); > + if (all_st_zeroed) > + SET_HARD_REG_BIT (zeroed_hardregs, FIRST_STACK_REG); > + } > + else > + /* MMX exit mode, check whether we can zero all mm registers. */ > + { > + unsigned int exit_mmx_regno = REGNO (crtl->return_rtx); > + all_mm_zeroed = zero_all_mm_registers (need_zeroed_hardregs, > + exit_mmx_regno); > + if (all_mm_zeroed) > + for (unsigned int regno = FIRST_MMX_REG; regno <= LAST_MMX_REG; > regno++) > + if (regno != exit_mmx_regno) > + SET_HARD_REG_BIT (zeroed_hardregs, regno); > + } > + > + /* Now, generate instructions to zero all the other registers. */ > > rtx zero_gpr = NULL_RTX; > rtx zero_vector = NULL_RTX; > rtx zero_mask = NULL_RTX; > + rtx zero_mmx = NULL_RTX; > > for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) > { > if (!TEST_HARD_REG_BIT (need_zeroed_hardregs, regno)) > continue; > - if (!zero_call_used_regno_p (regno, all_sse_zeroed)) > + if (!zero_call_used_regno_p (regno, all_sse_zeroed, > + exit_with_mmx_mode && !all_mm_zeroed)) > continue; > > SET_HARD_REG_BIT (zeroed_hardregs, regno); > > - rtx reg, tmp; > + rtx reg, tmp, zero_rtx; > machine_mode mode = zero_call_used_regno_mode (regno); > > reg = gen_rtx_REG (mode, regno); > + zero_rtx = CONST0_RTX (mode); > > if (mode == SImode) > if (zero_gpr == NULL_RTX) > { > zero_gpr = reg; > - tmp = gen_rtx_SET (reg, const0_rtx); > + tmp = gen_rtx_SET (reg, zero_rtx); > if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ()) > { > rtx clob = gen_rtx_CLOBBER (VOIDmode, > @@ -3714,7 +3813,7 @@ ix86_zero_call_used_regs (HARD_REG_SET > need_zeroed_hardregs) > if (zero_vector == NULL_RTX) > { > zero_vector = reg; > - tmp = gen_rtx_SET (reg, const0_rtx); > + tmp = gen_rtx_SET (reg, zero_rtx); > emit_insn (tmp); > } > else > @@ -3723,11 +3822,20 @@ ix86_zero_call_used_regs (HARD_REG_SET > need_zeroed_hardregs) > if (zero_mask == NULL_RTX) > { > zero_mask = reg; > - tmp = gen_rtx_SET (reg, const0_rtx); > + tmp = gen_rtx_SET (reg, zero_rtx); > emit_insn (tmp); > } > else > emit_move_insn (reg, zero_mask); > + else if (mode == V4HImode) > + if (zero_mmx == NULL_RTX) > + { > + zero_mmx = reg; > + tmp = gen_rtx_SET (reg, zero_rtx); > + emit_insn (tmp); > + } > + else > + emit_move_insn (reg, zero_mmx); > else > gcc_unreachable (); > } > diff --git a/gcc/testsuite/gcc.target/i386/zero-scratch-regs-28.c > b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-28.c > new file mode 100644 > index 00000000000..48b1f019a28 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-28.c > @@ -0,0 +1,16 @@ > +/* { dg-do compile { target *-*-linux* } } */ > +/* { dg-options "-O2 -mmmx -fzero-call-used-regs=all" } */ > +/* { dg-require-effective-target ia32 } */ > + > +__v2si ret_mmx (void) > +{ > + return (__v2si) { 123, 345 }; > +} > + > +/* { dg-final { scan-assembler "pxor\[ \t\]*%mm1, %mm1" } } */ > +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm2" } } */ > +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm3" } } */ > +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm4" } } */ > +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm5" } } */ > +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm6" } } */ > +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm7" } } */ > diff --git a/gcc/testsuite/gcc.target/i386/zero-scratch-regs-29.c > b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-29.c > new file mode 100644 > index 00000000000..8b5e1cd1602 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-29.c > @@ -0,0 +1,10 @@ > +/* { dg-do compile { target *-*-linux* } } */ > +/* { dg-options "-O2 -fzero-call-used-regs=all" } */ > + > +long double ret_x87 (void) > +{ > + return 1.1L; > +} > + > +/* { dg-final { scan-assembler-times "fldz" 7 } } */ > +/* { dg-final { scan-assembler-times "fstp" 7 } } */ > diff --git a/gcc/testsuite/gcc.target/i386/zero-scratch-regs-30.c > b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-30.c > new file mode 100644 > index 00000000000..e6fb4acf0fa > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-30.c > @@ -0,0 +1,11 @@ > +/* { dg-do compile { target *-*-linux* } } */ > +/* { dg-options "-O2 -fzero-call-used-regs=all" } */ > +/* { dg-require-effective-target lp64} */ Please remove the above line, we can get better test coverage by conditional scans below. > + > +_Complex long double ret_x87_cplx (void) > +{ > + return 1.1L + 1.2iL; > +} > + > +/* { dg-final { scan-assembler-times "fldz" 6 } } */ > +/* { dg-final { scan-assembler-times "fstp" 6 } } */ /* { dg-final { scan-assembler-times "fldz" 8 { target ia32 } } } */ /* { dg-final { scan-assembler-times "fstp" 8 { target ia32 } } } */ /* { dg-final { scan-assembler-times "fldz" 6 { target { ! ia32 } } } } */ /* { dg-final { scan-assembler-times "fstp" 6 { target { ! ia32 } } } } */ > diff --git a/gcc/testsuite/gcc.target/i386/zero-scratch-regs-31.c > b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-31.c > new file mode 100644 > index 00000000000..943508d1d26 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-31.c > @@ -0,0 +1,16 @@ > +/* { dg-do compile { target *-*-linux* } } */ > +/* { dg-options "-O2 -mmmx -fzero-call-used-regs=all-arg" } */ > +/* { dg-require-effective-target ia32 } */ > + > +__v2si ret_mmx (void) > +{ > + return (__v2si) { 123, 345 }; > +} > + > +/* { dg-final { scan-assembler "pxor\[ \t\]*%mm1, %mm1" } } */ > +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm2" } } */ > +/* { dg-final { scan-assembler-not "movq\[ \t\]*%mm1, %mm3" } } */ /* { dg-final { scan-assembler-not "movq\[ \t\]*%mm1, %mm\[34567\]" } } */ should achieve the same with one regexp. > +/* { dg-final { scan-assembler-not "movq\[ \t\]*%mm1, %mm4" } } */ > +/* { dg-final { scan-assembler-not "movq\[ \t\]*%mm1, %mm5" } } */ > +/* { dg-final { scan-assembler-not "movq\[ \t\]*%mm1, %mm6" } } */ > +/* { dg-final { scan-assembler-not "movq\[ \t\]*%mm1, %mm7" } } */ > -- > 2.11.0