luoxhu <luo...@linux.ibm.com> writes:
> Thanks, v5 is updated per your comments:
>  1. Move the const_rhs shift out of the loop;
>  2. Start the mode iteration from the smallest int mode for read_mode's size.
>
>
> This patch optimizes the following (it works for char/short/int/void*):
>
> 6: r119:TI=[r118:DI+0x10]
> 7: [r118:DI]=r119:TI
> 8: r121:DI=[r118:DI+0x8]
>
> =>
>
> 6: r119:TI=[r118:DI+0x10]
> 16: r122:DI=r119:TI#8
>
> The final asm is as below, with no partial load after the full store (stxv+ld):
>   ld 10,16(3)
>   mr 9,3
>   ld 3,24(3)
>   std 10,0(9)
>   std 3,8(9)
>   blr
>
> It can achieve a ~25% performance improvement for typical cases on
> Power9.  Bootstrapped and regression tested on Power9-LE.
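>
> A minimal C sketch of the kind of source that produces this pattern
> (a hypothetical reduction, not the committed pr71309.c testcase):
>
>   typedef struct { long a; long b; } pair_t;
>
>   long
>   copy_then_read (pair_t *p)
>   {
>     p[0] = p[1];    /* full 16-byte store of the TImode register */
>     return p[0].b;  /* was reloaded from memory at offset 8; now taken
>                        as a DImode subreg of the stored register */
>   }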
>
> For AArch64, one ldr is replaced by a mov with this patch:
>
> ldp     x2, x3, [x0, 16]
> stp     x2, x3, [x0]
> ldr     x0, [x0, 8]
>
> =>
>
> mov     x1, x0
> ldp     x2, x0, [x0, 16]
> stp     x2, x0, [x1]
>
> gcc/ChangeLog:
>
> 2020-08-03  Xionghu Luo  <luo...@linux.ibm.com>
>
>       PR rtl-optimization/71309
>       * dse.c (find_shift_sequence): Use a subreg shifted from the high
>       part of the stored register to avoid reloading from the address.
>
> gcc/testsuite/ChangeLog:
>
> 2020-08-03  Xionghu Luo  <luo...@linux.ibm.com>
>
>       PR rtl-optimization/71309
>       * gcc.target/powerpc/pr71309.c: New test.
> ---
>  gcc/dse.c                                  | 78 ++++++++++++++--------
>  gcc/testsuite/gcc.target/powerpc/pr71309.c | 33 +++++++++
>  2 files changed, 82 insertions(+), 29 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/pr71309.c
>
> diff --git a/gcc/dse.c b/gcc/dse.c
> index bbe792e48e8..b67b5c2ba35 100644
> --- a/gcc/dse.c
> +++ b/gcc/dse.c
> @@ -1720,6 +1720,35 @@ find_shift_sequence (poly_int64 access_size,
>    scalar_int_mode new_mode;
>    rtx read_reg = NULL;
>  
> +  /* If a constant was stored into memory, try to simplify it here,
> +     otherwise the cost of the shift might preclude this optimization
> +     e.g. at -Os, even when no actual shift will be needed.  */
> +  if (store_info->const_rhs)
> +    {
> +      auto new_mode = smallest_int_mode_for_size (access_size * BITS_PER_UNIT);
> +      auto byte = subreg_lowpart_offset (new_mode, store_mode);
> +      rtx ret
> +     = simplify_subreg (new_mode, store_info->const_rhs, store_mode, byte);
> +      if (ret && CONSTANT_P (ret))
> +     {
> +       rtx shift_rtx = gen_int_shift_amount (new_mode, shift);
> +       ret = simplify_const_binary_operation (LSHIFTRT, new_mode, ret,
> +                                              shift_rtx);
> +       if (ret && CONSTANT_P (ret))
> +         {
> +           byte = subreg_lowpart_offset (read_mode, new_mode);
> +           ret = simplify_subreg (read_mode, ret, new_mode, byte);
> +           if (ret && CONSTANT_P (ret)
> +               && (set_src_cost (ret, read_mode, speed)
> +                   <= COSTS_N_INSNS (1)))
> +             return ret;
> +         }
> +     }
> +    }
> +
> +  if (require_cst)
> +    return NULL_RTX;
> +
>    /* Some machines like the x86 have shift insns for each size of
>       operand.  Other machines like the ppc or the ia-64 may only have
>       shift insns that shift values within 32 or 64 bit registers.
> @@ -1729,7 +1758,7 @@ find_shift_sequence (poly_int64 access_size,
>  
>    opt_scalar_int_mode new_mode_iter;
>    FOR_EACH_MODE_FROM (new_mode_iter,
> -                   smallest_int_mode_for_size (access_size * BITS_PER_UNIT))
> +                   smallest_int_mode_for_size (GET_MODE_BITSIZE (read_mode)))
>      {
>        rtx target, new_reg, new_lhs;
>        rtx_insn *shift_seq, *insn;
> @@ -1739,34 +1768,6 @@ find_shift_sequence (poly_int64 access_size,
>        if (GET_MODE_BITSIZE (new_mode) > BITS_PER_WORD)
>       break;
>  
> -      /* If a constant was stored into memory, try to simplify it here,
> -      otherwise the cost of the shift might preclude this optimization
> -      e.g. at -Os, even when no actual shift will be needed.  */
> -      if (store_info->const_rhs)
> -     {
> -       poly_uint64 byte = subreg_lowpart_offset (new_mode, store_mode);
> -       rtx ret = simplify_subreg (new_mode, store_info->const_rhs,
> -                                  store_mode, byte);
> -       if (ret && CONSTANT_P (ret))
> -         {
> -           rtx shift_rtx = gen_int_shift_amount (new_mode, shift);
> -           ret = simplify_const_binary_operation (LSHIFTRT, new_mode,
> -                                                  ret, shift_rtx);
> -           if (ret && CONSTANT_P (ret))
> -             {
> -               byte = subreg_lowpart_offset (read_mode, new_mode);
> -               ret = simplify_subreg (read_mode, ret, new_mode, byte);
> -               if (ret && CONSTANT_P (ret)
> -                   && (set_src_cost (ret, read_mode, speed)
> -                       <= COSTS_N_INSNS (1)))
> -                 return ret;
> -             }
> -         }
> -     }
> -
> -      if (require_cst)
> -     return NULL_RTX;
> -
>        /* Try a wider mode if truncating the store mode to NEW_MODE
>        requires a real instruction.  */
>        if (maybe_lt (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode))
> @@ -1779,6 +1780,25 @@ find_shift_sequence (poly_int64 access_size,
>         && !targetm.modes_tieable_p (new_mode, store_mode))
>       continue;
>  
> +      if (multiple_p (GET_MODE_BITSIZE (new_mode), shift)

Like I mentioned before, this should be the other way around:

    multiple_p (shift, GET_MODE_BITSIZE (new_mode))

i.e. for the subreg to be valid, the shift amount must be a multiple
of the outer mode size in bits.
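
A hedged sketch of the guarded subreg path with the condition swapped
(illustrative only; the rest of the hunk is truncated in the quote above,
so helpers such as subreg_offset_from_lsb and extract_low_bits reflect my
reading of the surrounding dse.c code rather than the exact patch text):

    if (multiple_p (shift, GET_MODE_BITSIZE (new_mode))
        && known_le (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode)))
      {
        /* E.g. reading the high DImode half of a TImode store: shift is 64,
           a multiple of GET_MODE_BITSIZE (DImode).  */
        poly_uint64 offset
          = subreg_offset_from_lsb (new_mode, store_mode, shift);
        rtx rhs_subreg
          = simplify_gen_subreg (new_mode, store_info->rhs, store_mode, offset);
        if (rhs_subreg)
          {
            read_reg
              = extract_low_bits (read_mode, new_mode, copy_rtx (rhs_subreg));
            break;
          }
      }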

OK with that change, thanks, and sorry for the multiple review cycles.

Richard
