> On Oct 27, 2020, at 12:34 PM, Uros Bizjak <ubiz...@gmail.com> wrote:
> 
> On Tue, Oct 27, 2020 at 5:10 PM Qing Zhao <qing.z...@oracle.com 
> <mailto:qing.z...@oracle.com>> wrote:
>> 
>> Uros,
>> 
>> The following is the change compared to version 4 after fix all the issues 
>> you raised in the previous email.
>> 
>> Let me know if there is any other issue.
> 
> LGTM for x86 part, with a couple of small review comments inline.

thanks.

I will fix those issues below and then include them in the next version 5.

Thanks a lot for all your help.

Qing

> 
> Thanks,
> Uros.
> 
>> Thanks.
>> 
>> Qing
>> 
>> ---
>> gcc/config/i386/i386.c                             | 162 
>> +++++++++++++++++----
>> .../gcc.target/i386/zero-scratch-regs-28.c         |  16 ++
>> .../gcc.target/i386/zero-scratch-regs-29.c         |  10 ++
>> .../gcc.target/i386/zero-scratch-regs-30.c         |  11 ++
>> .../gcc.target/i386/zero-scratch-regs-31.c         |  16 ++
>> 5 files changed, 188 insertions(+), 27 deletions(-)
>> create mode 100644 gcc/testsuite/gcc.target/i386/zero-scratch-regs-28.c
>> create mode 100644 gcc/testsuite/gcc.target/i386/zero-scratch-regs-29.c
>> create mode 100644 gcc/testsuite/gcc.target/i386/zero-scratchregs-30.c
>> create mode 100644 gcc/testsuite/gcc.target/i386/zero-scratch-regs-31.c
>> 
>> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
>> index e66dcf0d587..e6c5001b11e 100644
>> --- a/gcc/config/i386/i386.c
>> +++ b/gcc/config/i386/i386.c
>> @@ -3554,17 +3554,17 @@ ix86_function_value_regno_p (const unsigned int 
>> regno)
>> /* Check whether the register REGNO should be zeroed on X86.
>>    When ALL_SSE_ZEROED is true, all SSE registers have been zeroed
>>    together, no need to zero it again.
>> -   Stack registers (st0-st7) and mm0-mm7 are aliased with each other.
>> -   very hard to be zeroed individually, don't zero individual st or
>> -   mm registgers.  */
>> +   When NEED_ZERO_MMX is true, MMX registers should be cleared.  */
>> 
>> static bool
>> zero_call_used_regno_p (const unsigned int regno,
>> -                       bool all_sse_zeroed)
>> +                       bool all_sse_zeroed,
>> +                       bool need_zero_mmx)
>> {
>>   return GENERAL_REGNO_P (regno)
>>         || (!all_sse_zeroed && SSE_REGNO_P (regno))
>> -        || MASK_REGNO_P (regno);
>> +        || MASK_REGNO_P (regno)
>> +        || (need_zero_mmx && MMX_REGNO_P (regno));
>> }
>> 
>> /* Return the machine_mode that is used to zero register REGNO.  */
>> @@ -3579,8 +3579,12 @@ zero_call_used_regno_mode (const unsigned int regno)
>>     return SImode;
>>   else if (SSE_REGNO_P (regno))
>>     return V4SFmode;
>> -  else
>> +  else if (MASK_REGNO_P (regno))
>>     return HImode;
>> +  else if (MMX_REGNO_P (regno))
>> +    return V4HImode;
>> +  else
>> +    gcc_unreachable ();
>> }
>> 
>> /* Generate a rtx to zero all vector registers together if possible,
>> @@ -3603,7 +3607,7 @@ zero_all_vector_registers (HARD_REG_SET 
>> need_zeroed_hardregs)
>>   return gen_avx_vzeroall ();
>> }
>> 
>> -/* Generate insns to zero all st/mm registers together.
>> +/* Generate insns to zero all st registers together.
>>    Return true when zeroing instructions are generated.
>>    Assume the number of st registers that are zeroed is num_of_st,
>>    we will emit the following sequence to zero them together:
>> @@ -3616,23 +3620,49 @@ zero_all_vector_registers (HARD_REG_SET 
>> need_zeroed_hardregs)
>>                  ...
>>                  fstp %%st(0);
>>    i.e., num_of_st fldz followed by num_of_st fstp to clear the stack
>> -   mark stack slots empty.  */
>> +   mark stack slots empty.
>> +
>> +   How to compute the num_of_st?
>> +   There is no direct mapping from stack registers to hard register
>> +   numbers.  If one stack register need to be cleared, we don't know
> 
> needs
> 
>> +   where in the stack the value remains.  So, if any stack register
>> +   need to be cleared, the whole stack should be cleared.  However,
> 
> needs
> 
>> +   x87 stack registers that hold the return value should be excluded.
>> +   x87 returns in the top (two for complex values) register, so
>> +   num_of_st should be 7/6 when x87 returns, otherwise it will be 8.  */
>> +
>> 
>> static bool
>> -zero_all_st_mm_registers (HARD_REG_SET need_zeroed_hardregs)
>> +zero_all_st_registers (HARD_REG_SET need_zeroed_hardregs)
>> {
>>   unsigned int num_of_st = 0;
>>   for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
>> -    if (STACK_REGNO_P (regno)
>> -       && TEST_HARD_REG_BIT (need_zeroed_hardregs, regno)
>> -       /* When the corresponding mm register also need to be cleared too.  
>> */
>> -       && TEST_HARD_REG_BIT (need_zeroed_hardregs,
>> -                             (regno - FIRST_STACK_REG + FIRST_MMX_REG)))
>> -      num_of_st++;
>> +    if ((STACK_REGNO_P (regno) || MMX_REGNO_P (regno))
>> +       && TEST_HARD_REG_BIT (need_zeroed_hardregs, regno))
>> +      {
>> +       num_of_st++;
>> +       break;
>> +      }
>> 
>>   if (num_of_st == 0)
>>     return false;
>> 
>> +  bool return_with_x87 = false;
>> +  return_with_x87 = (crtl->return_rtx
>> +                    && (STACK_REG_P (crtl->return_rtx)));
>> +
>> +  bool complex_return = false;
>> +  complex_return = (crtl->return_rtx
>> +                   && COMPLEX_MODE_P (GET_MODE (crtl->return_rtx)));
>> +
>> +  if (return_with_x87)
>> +    if (complex_return)
>> +      num_of_st = 6;
>> +    else
>> +      num_of_st = 7;
>> +  else
>> +    num_of_st = 8;
>> +
>>   rtx st_reg = gen_rtx_REG (XFmode, FIRST_STACK_REG);
>>   for (unsigned int i = 0; i < num_of_st; i++)
>>     emit_insn (gen_rtx_SET (st_reg, CONST0_RTX (XFmode)));
>> @@ -3646,6 +3676,43 @@ zero_all_st_mm_registers (HARD_REG_SET 
>> need_zeroed_hardregs)
>>   return true;
>> }
>> 
>> +
>> +/* When the routine exit with MMX mode, if there is any ST registers
> 
> ... exits in MMX mode, if any ST register needs ...
> 
>> +   need to be zeroed, we should clear all MMX registers except the
>> +   one that holds the return value RET_MMX_REGNO.  */
> 
> ... except the RET_MMX_REGNO that holds the return value.
> 
>> +static bool
>> +zero_all_mm_registers (HARD_REG_SET need_zeroed_hardregs,
>> +                      unsigned int ret_mmx_regno)
>> +{
>> +  bool need_zero_all_mm = false;
>> +  for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
>> +    if (STACK_REGNO_P (regno)
>> +       && TEST_HARD_REG_BIT (need_zeroed_hardregs, regno))
>> +      {
>> +       need_zero_all_mm = true;
>> +       break;
>> +      }
>> +
>> +  if (!need_zero_all_mm)
>> +    return false;
>> +
>> +  rtx zero_mmx = NULL_RTX;
>> +  machine_mode mode = V4HImode;
>> +  for (unsigned int regno = FIRST_MMX_REG; regno <= LAST_MMX_REG; regno++)
>> +    if (regno != ret_mmx_regno)
>> +      {
>> +       rtx reg = gen_rtx_REG (mode, regno);
>> +       if (zero_mmx == NULL_RTX)
>> +         {
>> +           zero_mmx = reg;
>> +           emit_insn (gen_rtx_SET (reg, CONST0_RTX(mode)));
> 
> space after CONST0_RTX
> 
>> +         }
>> +       else
>> +         emit_move_insn (reg, zero_mmx);
>> +      }
>> +  return true;
>> +}
>> +
>> /* TARGET_ZERO_CALL_USED_REGS.  */
>> /* Generate a sequence of instructions that zero registers specified by
>>    NEED_ZEROED_HARDREGS.  Return the ZEROED_HARDREGS that are actually
>> @@ -3655,7 +3722,10 @@ ix86_zero_call_used_regs (HARD_REG_SET 
>> need_zeroed_hardregs)
>> {
>>   HARD_REG_SET zeroed_hardregs;
>>   bool all_sse_zeroed = false;
>> -  bool st_zeroed = false;
>> +  bool all_st_zeroed = false;
>> +  bool all_mm_zeroed = false;
>> +
>> +  CLEAR_HARD_REG_SET (zeroed_hardregs);
>> 
>>   /* first, let's see whether we can zero all vector registers together.  */
>>   rtx zero_all_vec_insn = zero_all_vector_registers (need_zeroed_hardregs);
>> @@ -3665,38 +3735,67 @@ ix86_zero_call_used_regs (HARD_REG_SET 
>> need_zeroed_hardregs)
>>       all_sse_zeroed = true;
>>     }
>> 
>> -  /* then, let's see whether we can zero all st+mm registers togeter.  */
>> -  st_zeroed = zero_all_st_mm_registers (need_zeroed_hardregs);
>> +  /* mm/st registers are shared registers set, we should follow the 
>> following
>> +     rules to clear them:
>> +                       MMX exit mode       x87 exit mode
>> +       -------------|----------------------|---------------
>> +       uses x87 reg | clear all MMX        | clear all x87
>> +       uses MMX reg | clear individual MMX | clear all x87
>> +       x87 + MMX    | clear all MMX        | clear all x87
>> 
>> -  /* Now, generate instructions to zero all the registers.  */
>> +     first, we should decide which mode (MMX mode or x87 mode) the function
>> +     exit with.  */
>> 
>> -  CLEAR_HARD_REG_SET (zeroed_hardregs);
>> -  if (st_zeroed)
>> -    SET_HARD_REG_BIT (zeroed_hardregs, FIRST_STACK_REG);
>> +  bool exit_with_mmx_mode = (crtl->return_rtx
>> +                            && (MMX_REG_P (crtl->return_rtx)));
>> +
>> +  if (!exit_with_mmx_mode)
>> +    /* x87 exit mode, we should zero all st registers together.  */
>> +    {
>> +      all_st_zeroed = zero_all_st_registers (need_zeroed_hardregs);
>> +      if (all_st_zeroed)
>> +       SET_HARD_REG_BIT (zeroed_hardregs, FIRST_STACK_REG);
>> +    }
>> +  else
>> +    /* MMX exit mode, check whether we can zero all mm registers.  */
>> +    {
>> +      unsigned int exit_mmx_regno = REGNO (crtl->return_rtx);
>> +      all_mm_zeroed = zero_all_mm_registers (need_zeroed_hardregs,
>> +                                            exit_mmx_regno);
>> +      if (all_mm_zeroed)
>> +       for (unsigned int regno = FIRST_MMX_REG; regno <= LAST_MMX_REG; 
>> regno++)
>> +         if (regno != exit_mmx_regno)
>> +           SET_HARD_REG_BIT (zeroed_hardregs, regno);
>> +    }
>> +
>> +  /* Now, generate instructions to zero all the other registers.  */
>> 
>>   rtx zero_gpr = NULL_RTX;
>>   rtx zero_vector = NULL_RTX;
>>   rtx zero_mask = NULL_RTX;
>> +  rtx zero_mmx = NULL_RTX;
>> 
>>   for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
>>     {
>>       if (!TEST_HARD_REG_BIT (need_zeroed_hardregs, regno))
>>        continue;
>> -      if (!zero_call_used_regno_p (regno, all_sse_zeroed))
>> +      if (!zero_call_used_regno_p (regno, all_sse_zeroed,
>> +                                  exit_with_mmx_mode && !all_mm_zeroed))
>>        continue;
>> 
>>       SET_HARD_REG_BIT (zeroed_hardregs, regno);
>> 
>> -      rtx reg, tmp;
>> +      rtx reg, tmp, zero_rtx;
>>       machine_mode mode = zero_call_used_regno_mode (regno);
>> 
>>       reg = gen_rtx_REG (mode, regno);
>> +      zero_rtx = CONST0_RTX (mode);
>> 
>>       if (mode == SImode)
>>        if (zero_gpr == NULL_RTX)
>>          {
>>            zero_gpr = reg;
>> -           tmp = gen_rtx_SET (reg, const0_rtx);
>> +           tmp = gen_rtx_SET (reg, zero_rtx);
>>            if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
>>              {
>>                rtx clob = gen_rtx_CLOBBER (VOIDmode,
>> @@ -3714,7 +3813,7 @@ ix86_zero_call_used_regs (HARD_REG_SET 
>> need_zeroed_hardregs)
>>        if (zero_vector == NULL_RTX)
>>          {
>>            zero_vector = reg;
>> -           tmp = gen_rtx_SET (reg, const0_rtx);
>> +           tmp = gen_rtx_SET (reg, zero_rtx);
>>            emit_insn (tmp);
>>          }
>>        else
>> @@ -3723,11 +3822,20 @@ ix86_zero_call_used_regs (HARD_REG_SET 
>> need_zeroed_hardregs)
>>        if (zero_mask == NULL_RTX)
>>          {
>>            zero_mask = reg;
>> -           tmp = gen_rtx_SET (reg, const0_rtx);
>> +           tmp = gen_rtx_SET (reg, zero_rtx);
>>            emit_insn (tmp);
>>          }
>>        else
>>          emit_move_insn (reg, zero_mask);
>> +      else if (mode == V4HImode)
>> +       if (zero_mmx == NULL_RTX)
>> +         {
>> +           zero_mmx = reg;
>> +           tmp = gen_rtx_SET (reg, zero_rtx);
>> +           emit_insn (tmp);
>> +         }
>> +       else
>> +         emit_move_insn (reg, zero_mmx);
>>       else
>>        gcc_unreachable ();
>>     }
>> diff --git a/gcc/testsuite/gcc.target/i386/zero-scratch-regs-28.c 
>> b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-28.c
>> new file mode 100644
>> index 00000000000..48b1f019a28
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-28.c
>> @@ -0,0 +1,16 @@
>> +/* { dg-do compile { target *-*-linux* } } */
>> +/* { dg-options "-O2 -mmmx -fzero-call-used-regs=all" } */
>> +/* { dg-require-effective-target ia32 } */
>> +
>> +__v2si ret_mmx (void)
>> +{
>> +  return (__v2si) { 123, 345 };
>> +}
>> +
>> +/* { dg-final { scan-assembler "pxor\[ \t\]*%mm1, %mm1" } } */
>> +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm2" } } */
>> +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm3" } } */
>> +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm4" } } */
>> +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm5" } } */
>> +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm6" } } */
>> +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm7" } } */
>> diff --git a/gcc/testsuite/gcc.target/i386/zero-scratch-regs-29.c 
>> b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-29.c
>> new file mode 100644
>> index 00000000000..8b5e1cd1602
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-29.c
>> @@ -0,0 +1,10 @@
>> +/* { dg-do compile { target *-*-linux* } } */
>> +/* { dg-options "-O2 -fzero-call-used-regs=all" } */
>> +
>> +long double ret_x87 (void)
>> +{
>> +  return 1.1L;
>> +}
>> +
>> +/* { dg-final { scan-assembler-times "fldz" 7 } } */
>> +/* { dg-final { scan-assembler-times "fstp" 7 } } */
>> diff --git a/gcc/testsuite/gcc.target/i386/zero-scratch-regs-30.c 
>> b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-30.c
>> new file mode 100644
>> index 00000000000..e6fb4acf0fa
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-30.c
>> @@ -0,0 +1,11 @@
>> +/* { dg-do compile { target *-*-linux* } } */
>> +/* { dg-options "-O2  -fzero-call-used-regs=all" } */
>> +/* { dg-require-effective-target lp64} */
> 
> Please remove the above line, we can get better test coverage by
> conditional scans below.
> 
>> +
>> +_Complex long double ret_x87_cplx (void)
>> +{
>> +  return 1.1L + 1.2iL;
>> +}
>> +
>> +/* { dg-final { scan-assembler-times "fldz" 6 } } */
>> +/* { dg-final { scan-assembler-times "fstp" 6 } } */
> 
> /* { dg-final { scan-assembler-times "fldz" 8 { target ia32 } } } */
> /* { dg-final { scan-assembler-times "fstp" 8 { target ia32 } } } */
> 
> /* { dg-final { scan-assembler-times "fldz" 6 { target { ! ia32 } } } } */
> /* { dg-final { scan-assembler-times "fstp" 6 { target { ! ia32 } } } } */
> 
>> diff --git a/gcc/testsuite/gcc.target/i386/zero-scratch-regs-31.c 
>> b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-31.c
>> new file mode 100644
>> index 00000000000..943508d1d26
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-31.c
>> @@ -0,0 +1,16 @@
>> +/* { dg-do compile { target *-*-linux* } } */
>> +/* { dg-options "-O2 -mmmx -fzero-call-used-regs=all-arg" } */
>> +/* { dg-require-effective-target ia32 } */
>> +
>> +__v2si ret_mmx (void)
>> +{
>> +  return (__v2si) { 123, 345 };
>> +}
>> +
>> +/* { dg-final { scan-assembler "pxor\[ \t\]*%mm1, %mm1" } } */
>> +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm2" } } */
>> +/* { dg-final { scan-assembler-not "movq\[ \t\]*%mm1, %mm3" } } */
> 
> /* { dg-final { scan-assembler-not "movq\[ \t\]*%mm1, %mm\[34567\]" } } */
> 
> should achieve the same with one regexp.
> 
>> +/* { dg-final { scan-assembler-not "movq\[ \t\]*%mm1, %mm4" } } */
>> +/* { dg-final { scan-assembler-not "movq\[ \t\]*%mm1, %mm5" } } */
>> +/* { dg-final { scan-assembler-not "movq\[ \t\]*%mm1, %mm6" } } */
>> +/* { dg-final { scan-assembler-not "movq\[ \t\]*%mm1, %mm7" } } */
>> --
>> 2.11.0

Reply via email to