Hi,

On 2020/7/22 19:05, Richard Sandiford wrote:
> This wasn't really what I meant.  Using subregs is fine, but I was
> thinking of:
>
>       /* Also try a wider mode if the necessary punning is either not
>          desirable or not possible.  */
>       if (!CONSTANT_P (store_info->rhs)
>           && !targetm.modes_tieable_p (new_mode, store_mode))
>         continue;
>
>       if (multiple_p (shift, GET_MODE_BITSIZE (new_mode)))
>         {
>           /* Try to implement the shift using a subreg.  */
>           poly_int64 offset = subreg_offset_from_lsb (new_mode, store_mode,
>                                                       shift);
>           rhs_subreg = simplify_gen_subreg (new_mode, store_info->rhs,
>                                             store_mode, offset);
>           if (rhs_subreg)
>             {
>               ...
>               break;
>             }
>         }
>
> where the rhs_subreg is from your original patch.
>
> The multiple_p should be that way round: the shift needs to be a
> multiple of the new_mode for the subreg to be valid.
>
> I think this should also avoid the BITS_PER_WORD problem.  On the
> other hand, I agree BITS_PER_UNIT isn't a very sensible limit if
> we're using subregs, so maybe moving it to after the multiple_p
> if block would still make sense.
Thanks.  I took that rhs_subreg part back for the v3 patch and updated
it a bit based on your prototype.  The shift is put in op1, since
multiple_p requires op0 >= op1 here: new_mode is still TImode, the same
as store_mode, and subreg_offset_from_lsb returns offset 8 when shift
is 64.  simplify_gen_subreg then needs an additional inner_mode
(DImode), generated from "smallest_int_mode_for_size (shift)", to build
rhs_subreg, because it returns NULL when the requested mode equals
store_mode.  Lastly, the BITS_PER_WORD check is moved after the
multiple_p block, as we still need to generate shift_seq for the other
cases. :)
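To make the offset arithmetic concrete, here is a minimal standalone
sketch, not the GCC implementation: offset_from_lsb is a hypothetical
stand-in for subreg_offset_from_lsb, and it assumes a little-endian
target.

#include <stdio.h>

/* Hypothetical stand-in for subreg_offset_from_lsb: the byte offset of
   an inner_bits-wide value whose least significant bit sits at bit LSB
   of an outer_bits-wide value.  On little-endian, byte 0 holds the
   lsb, so the offset is just the shift converted to bytes; big-endian
   would instead be (outer_bits - inner_bits - lsb) / 8.  */
static long
offset_from_lsb (long outer_bits, long inner_bits, long lsb)
{
  (void) outer_bits;
  (void) inner_bits;
  return lsb / 8;
}

int
main (void)
{
  /* store_mode is TImode (128 bits) and shift is 64, so inner_mode
     from smallest_int_mode_for_size (shift) is DImode (64 bits) and
     the subreg starts at byte 8.  */
  printf ("offset = %ld\n", offset_from_lsb (128, 64, 64));
  return 0;
}

With that offset, simplify_gen_subreg (DImode, rhs, TImode, 8) yields
the (subreg:DI (reg:TI 119) 8) that replaces the load, which shows up
as r119:TI#8 in insn 16 of the RTL example below.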
[PATCH v3] dse: Remove partial load after full store for high part access [PR71309]

This patch could optimize (works for char/short/int/void*):

6: r119:TI=[r118:DI+0x10]
7: [r118:DI]=r119:TI
8: r121:DI=[r118:DI+0x8]

=>

6: r119:TI=[r118:DI+0x10]
16: r122:DI=r119:TI#8

Final ASM will be as below, without partial load after full store
(stxv+ld):

  ld 10,16(3)
  mr 9,3
  ld 3,24(3)
  std 10,0(9)
  std 3,8(9)
  blr

It could achieve ~25% performance improvement for typical cases on
Power9.

Bootstrap and regression tested on Power9-LE.

For AArch64, one ldr is replaced by mov with this patch:

  ldp     x2, x3, [x0, 16]
  stp     x2, x3, [x0]
  ldr     x0, [x0, 8]

=>

  mov     x1, x0
  ldp     x2, x0, [x0, 16]
  stp     x2, x0, [x1]

gcc/ChangeLog:

2020-07-22  Xionghu Luo  <luo...@linux.ibm.com>

	PR rtl-optimization/71309
	* dse.c (find_shift_sequence): Use subreg of shifted from high part
	register to avoid loading from address.

gcc/testsuite/ChangeLog:

2020-07-22  Xionghu Luo  <luo...@linux.ibm.com>

	PR rtl-optimization/71309
	* gcc.target/powerpc/pr71309.c: New test.
---
 gcc/dse.c                                  | 21 ++++++++++++--
 gcc/testsuite/gcc.target/powerpc/pr71309.c | 33 ++++++++++++++++++++++
 2 files changed, 52 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr71309.c

diff --git a/gcc/dse.c b/gcc/dse.c
index bbe792e48e8..aaa161237c3 100644
--- a/gcc/dse.c
+++ b/gcc/dse.c
@@ -1736,8 +1736,6 @@ find_shift_sequence (poly_int64 access_size,
       int cost;
 
       new_mode = new_mode_iter.require ();
-      if (GET_MODE_BITSIZE (new_mode) > BITS_PER_WORD)
-	break;
 
       /* If a constant was stored into memory, try to simplify it
 	 here, otherwise the cost of the shift might preclude this
 	 optimization
@@ -1779,6 +1777,25 @@ find_shift_sequence (poly_int64 access_size,
 	  && !targetm.modes_tieable_p (new_mode, store_mode))
 	continue;
 
+      if (multiple_p (GET_MODE_BITSIZE (new_mode), shift))
+	{
+	  /* Try to implement the shift using a subreg.  */
+	  scalar_int_mode inner_mode = smallest_int_mode_for_size (shift);
+	  poly_int64 offset
+	    = subreg_offset_from_lsb (new_mode, store_mode, shift);
+	  rtx rhs_subreg = simplify_gen_subreg (inner_mode, store_info->rhs,
+						store_mode, offset);
+	  if (rhs_subreg)
+	    {
+	      read_reg = extract_low_bits (read_mode, inner_mode,
+					   copy_rtx (rhs_subreg));
+	      break;
+	    }
+	}
+
+      if (GET_MODE_BITSIZE (new_mode) > BITS_PER_WORD)
+	break;
+
       new_reg = gen_reg_rtx (new_mode);
 
       start_sequence ();
diff --git a/gcc/testsuite/gcc.target/powerpc/pr71309.c b/gcc/testsuite/gcc.target/powerpc/pr71309.c
new file mode 100644
index 00000000000..94d727a8ed9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr71309.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-O2 -mdejagnu-cpu=power9" } */
+
+#define TYPE void*
+#define TYPE2 void*
+
+struct path {
+  TYPE2 mnt;
+  TYPE dentry;
+};
+
+struct nameidata {
+  struct path path;
+  struct path root;
+};
+
+__attribute__ ((noinline))
+TYPE foo(struct nameidata *nd)
+{
+  TYPE d;
+  TYPE2 d2;
+
+  nd->path = nd->root;
+  d = nd->path.dentry;
+  d2 = nd->path.mnt;
+  return d;
+}
+
+/* { dg-final { scan-assembler-not {\mlxv\M} } } */
+/* { dg-final { scan-assembler-not {\mstxv\M} } } */
+/* { dg-final { scan-assembler-times {\mld\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mstd\M} 2 } } */
-- 
2.27.0.90.geebb51ba8c