Hi,

On 2020/7/22 19:05, Richard Sandiford wrote:
> This wasn't really what I meant.  Using subregs is fine, but I was
> thinking of:
> 
>        /* Also try a wider mode if the necessary punning is either not
>        desirable or not possible.  */
>        if (!CONSTANT_P (store_info->rhs)
>         && !targetm.modes_tieable_p (new_mode, store_mode))
>       continue;
> 
>        if (multiple_p (shift, GET_MODE_BITSIZE (new_mode)))
>       {
>         /* Try to implement the shift using a subreg.  */
>         poly_int64 offset = subreg_offset_from_lsb (new_mode, store_mode,
>                                                     shift);
>         rhs_subreg = simplify_gen_subreg (new_mode, store_info->rhs,
>                                           store_mode, offset);
>         if (rhs_subreg)
>           {
>             ...
>             break;
>           }
>       }
> 
> where the rhs_subreg is from your original patch.
> 
> The multiple_p should be that way round: the shift needs to be a
> multiple of the new_mode for the subreg to be valid.
> 
> I think this should also avoid the BITS_PER_WORD problem.  On the
> other hand, I agree BITS_PER_UNIT isn't a very sensible limit if
> we're using subregs, so maybe moving it to after the multiple_p
> if block would still make sense.
> 

Thanks.  I took the rhs_subreg part back into the v3 patch and adjusted it a bit
based on your prototype; the shift needs to be passed as op1, since multiple_p
requires op0 >= op1.
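
To spell out the arithmetic behind that operand order (a stand-alone sketch,
not GCC code; the sizes are taken from the PR71309 case, where new_mode ends
up being TImode and the shift is 64 bits):

    /* Passing the shift as the second operand accepts the TImode case:
       128 is a multiple of 64, whereas 64 is not a multiple of 128.  */
    #include <stdio.h>

    int main (void)
    {
      int new_mode_bits = 128;  /* GET_MODE_BITSIZE (new_mode) for TImode   */
      int shift = 64;           /* distance of the read above the store lsb */

      printf ("shift is a multiple of new_mode: %d\n",
              shift % new_mode_bits == 0);      /* 0 */
      printf ("new_mode is a multiple of shift: %d\n",
              new_mode_bits % shift == 0);      /* 1 */
      return 0;
    }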

Then, since new_mode is still TImode (the same as store_mode),
subreg_offset_from_lsb returns 8 when shift is 64.  simplify_gen_subreg needs
an additional inner_mode (DImode), obtained from smallest_int_mode_for_size
(shift), to produce rhs_subreg; otherwise it returns NULL when new_mode equals
store_mode.
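
To make those numbers concrete (again a stand-alone sketch rather than GCC
code, assuming the little-endian layout of the test case):

    /* store_mode = TImode (128 bits) and the partial read wants the upper
       64 bits, so shift = 64.  smallest_int_mode_for_size (64) is DImode,
       and the byte offset of those bits from the lsb is 8, matching the
       r119:TI#8 subreg in the RTL example below.  */
    #include <stdio.h>

    int main (void)
    {
      int shift = 64;             /* bits above the store's lsb              */
      int offset = shift / 8;     /* byte offset used for the DImode subreg  */
      printf ("subreg byte offset = %d\n", offset);
      return 0;
    }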

Lastly, I moved the BITS_PER_WORD check after the multiple_p block, since the
loop still needs to generate shift_seq for the remaining cases. :)


[PATCH v3] dse: Remove partial load after full store for high part access [PR71309]


This patch optimizes the following sequence (it works for char/short/int/void*):

6: r119:TI=[r118:DI+0x10]
7: [r118:DI]=r119:TI
8: r121:DI=[r118:DI+0x8]

=>

6: r119:TI=[r118:DI+0x10]
16: r122:DI=r119:TI#8

The final assembly is as below, with no partial load after the full store (stxv+ld):
  ld 10,16(3)
  mr 9,3
  ld 3,24(3)
  std 10,0(9)
  std 3,8(9)
  blr

It achieves a ~25% performance improvement for typical cases on
Power9.  Bootstrapped and regression tested on Power9-LE.

For AArch64, one ldr is replaced by a mov with this patch:

ldp     x2, x3, [x0, 16]
stp     x2, x3, [x0]
ldr     x0, [x0, 8]

=>

mov     x1, x0
ldp     x2, x0, [x0, 16]
stp     x2, x0, [x1]

gcc/ChangeLog:

2020-07-22  Xionghu Luo  <luo...@linux.ibm.com>

        PR rtl-optimization/71309
        * dse.c (find_shift_sequence): Use subreg of shifted from high part
        register to avoid loading from address.

gcc/testsuite/ChangeLog:

2020-07-22  Xionghu Luo  <luo...@linux.ibm.com>

        PR rtl-optimization/71309
        * gcc.target/powerpc/pr71309.c: New test.
---
 gcc/dse.c                                  | 21 ++++++++++++--
 gcc/testsuite/gcc.target/powerpc/pr71309.c | 33 ++++++++++++++++++++++
 2 files changed, 52 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr71309.c

diff --git a/gcc/dse.c b/gcc/dse.c
index bbe792e48e8..aaa161237c3 100644
--- a/gcc/dse.c
+++ b/gcc/dse.c
@@ -1736,8 +1736,6 @@ find_shift_sequence (poly_int64 access_size,
       int cost;
 
       new_mode = new_mode_iter.require ();
-      if (GET_MODE_BITSIZE (new_mode) > BITS_PER_WORD)
-       break;
 
       /* If a constant was stored into memory, try to simplify it here,
         otherwise the cost of the shift might preclude this optimization
@@ -1779,6 +1777,25 @@ find_shift_sequence (poly_int64 access_size,
          && !targetm.modes_tieable_p (new_mode, store_mode))
        continue;
 
+      if (multiple_p (GET_MODE_BITSIZE (new_mode), shift))
+       {
+         /* Try to implement the shift using a subreg.  */
+         scalar_int_mode inner_mode = smallest_int_mode_for_size (shift);
+         poly_int64 offset
+           = subreg_offset_from_lsb (new_mode, store_mode, shift);
+         rtx rhs_subreg = simplify_gen_subreg (inner_mode, store_info->rhs,
+                                               store_mode, offset);
+         if (rhs_subreg)
+           {
+             read_reg = extract_low_bits (read_mode, inner_mode,
+                                          copy_rtx (rhs_subreg));
+             break;
+           }
+       }
+
+      if (GET_MODE_BITSIZE (new_mode) > BITS_PER_WORD)
+       break;
+
       new_reg = gen_reg_rtx (new_mode);
 
       start_sequence ();
diff --git a/gcc/testsuite/gcc.target/powerpc/pr71309.c b/gcc/testsuite/gcc.target/powerpc/pr71309.c
new file mode 100644
index 00000000000..94d727a8ed9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr71309.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-O2 -mdejagnu-cpu=power9" } */
+
+#define TYPE void*
+#define TYPE2 void*
+
+struct path {
+    TYPE2 mnt;
+    TYPE dentry;
+};
+
+struct nameidata {
+    struct path path;
+    struct path root;
+};
+
+__attribute__ ((noinline))
+TYPE foo(struct nameidata *nd)
+{
+  TYPE d;
+  TYPE2 d2;
+
+  nd->path = nd->root;
+  d = nd->path.dentry;
+  d2 = nd->path.mnt;
+  return d;
+}
+
+/* { dg-final { scan-assembler-not {\mlxv\M} } } */
+/* { dg-final { scan-assembler-not {\mstxv\M} } } */
+/* { dg-final { scan-assembler-times {\mld\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mstd\M} 2 } } */
-- 
2.27.0.90.geebb51ba8c


