Hi,

This is my attempt at reviving the old patch
https://gcc.gnu.org/pipermail/gcc-patches/2019-January/514632.html
I have followed up on Kyrill's comment upstream (at the link above) and I am
using the recommended option iii that he mentioned:

"1) Adjust the copy_limit to 256 bits after checking
    AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS in the tuning.
 2) Adjust aarch64_copy_one_block_and_progress_pointers to handle 256-bit
    moves, by iii:
    iii) Emit explicit V4SI (or any other 128-bit vector mode) pairs
    ldp/stps.  This wouldn't need any adjustments to MD patterns, but would
    make aarch64_copy_one_block_and_progress_pointers more complex as it
    would now have two paths, where one handles two adjacent memory
    addresses in one call."

With this patch, the following test

#define N 8
extern int src[N], dst[N];

void
foo (void)
{
  __builtin_memcpy (dst, src, N * sizeof (int));
}

which originally generated

foo:
        adrp    x1, src
        add     x1, x1, :lo12:src
        ldp     x4, x5, [x1]
        adrp    x0, dst
        add     x0, x0, :lo12:dst
        ldp     x2, x3, [x1, 16]
        stp     x4, x5, [x0]
        stp     x2, x3, [x0, 16]
        ret

now generates

foo:
        adrp    x1, src
        add     x1, x1, :lo12:src
        adrp    x0, dst
        add     x0, x0, :lo12:dst
        ldp     q1, q0, [x1]
        stp     q1, q0, [x0]
        ret
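The sizes in this test line up exactly with one Q-register pair:

  8 ints * 4 bytes = 32 bytes = 256 bits = 2 * 128-bit Q registers

so the whole copy collapses from two ldp/stp pairs of X registers into a
single ldp q / stp q pair.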
This gives about a 1.3% improvement on 523.xalancbmk_r in SPEC2017 and an
overall code size reduction on most SPEC2017 Int benchmarks on Neoverse N1,
due to more use of Q-register LDP/STP pairs.

Bootstrapped and regression tested on aarch64-none-linux-gnu.

Is this ok for trunk?

Thanks
Sudi

gcc/ChangeLog:

2020-07-23  Sudakshina Das  <sudi....@arm.com>
	    Kyrylo Tkachov  <kyrylo.tkac...@arm.com>

	* config/aarch64/aarch64.c (aarch64_gen_store_pair): Add case
	for E_V4SImode.
	(aarch64_gen_load_pair): Likewise.
	(aarch64_copy_one_block_and_progress_pointers): Handle 256-bit
	copy.
	(aarch64_expand_cpymem): Expand copy_limit to 256 bits where
	appropriate.

gcc/testsuite/ChangeLog:

2020-07-23  Sudakshina Das  <sudi....@arm.com>
	    Kyrylo Tkachov  <kyrylo.tkac...@arm.com>

	* gcc.target/aarch64/cpymem-q-reg_1.c: New test.
	* gcc.target/aarch64/large_struct_copy_2.c: Update for ldp q regs.

****************************** Attachment inlined ******************************

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 3fe1feaa80ccb0a287ee1c7ea1056e8f0a830532..a38ff39c4d5d53f056bbba3114ebaf8f0414c037 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -6920,6 +6920,9 @@ aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
     case E_TFmode:
       return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
 
+    case E_V4SImode:
+      return gen_vec_store_pairv4siv4si (mem1, reg1, mem2, reg2);
+
     default:
       gcc_unreachable ();
     }
@@ -6943,6 +6946,9 @@ aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
     case E_TFmode:
       return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
 
+    case E_V4SImode:
+      return gen_load_pairv4siv4si (reg1, mem1, reg2, mem2);
+
     default:
       gcc_unreachable ();
     }
@@ -21097,6 +21103,27 @@ static void
 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
 					      machine_mode mode)
 {
+  /* Handle 256-bit memcpy separately.  We do this by making 2 adjacent
+     memory address copies using V4SImode so that we can use Q registers.  */
+  if (known_eq (GET_MODE_BITSIZE (mode), 256))
+    {
+      mode = V4SImode;
+      rtx reg1 = gen_reg_rtx (mode);
+      rtx reg2 = gen_reg_rtx (mode);
+      /* "Cast" the pointers to the correct mode.  */
+      *src = adjust_address (*src, mode, 0);
+      *dst = adjust_address (*dst, mode, 0);
+      /* Emit the memcpy.  */
+      emit_insn (aarch64_gen_load_pair (mode, reg1, *src, reg2,
+					aarch64_progress_pointer (*src)));
+      emit_insn (aarch64_gen_store_pair (mode, *dst, reg1,
+					 aarch64_progress_pointer (*dst), reg2));
+      /* Move the pointers forward.  */
+      *src = aarch64_move_pointer (*src, 32);
+      *dst = aarch64_move_pointer (*dst, 32);
+      return;
+    }
+
   rtx reg = gen_reg_rtx (mode);
 
   /* "Cast" the pointers to the correct mode.  */
@@ -21150,9 +21177,12 @@ aarch64_expand_cpymem (rtx *operands)
   /* Convert n to bits to make the rest of the code simpler.  */
   n = n * BITS_PER_UNIT;
 
-  /* Maximum amount to copy in one go.  The AArch64 back-end has integer modes
-     larger than TImode, but we should not use them for loads/stores here.  */
-  const int copy_limit = GET_MODE_BITSIZE (TImode);
+  /* Maximum amount to copy in one go.  We allow 256-bit chunks based on the
+     AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter and TARGET_SIMD.  */
+  const int copy_limit = ((aarch64_tune_params.extra_tuning_flags
+			   & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
+			  || !TARGET_SIMD)
+			 ? GET_MODE_BITSIZE (TImode) : 256;
 
   while (n > 0)
     {
diff --git a/gcc/testsuite/gcc.target/aarch64/cpymem-q-reg_1.c b/gcc/testsuite/gcc.target/aarch64/cpymem-q-reg_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..df5f67e425bd6a52362bc912afbcd1ca3725d449
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/cpymem-q-reg_1.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#define N 8
+extern int src[N], dst[N];
+
+void
+foo (void)
+{
+  __builtin_memcpy (dst, src, N * sizeof (int));
+}
+
+/* { dg-final { scan-assembler {ldp\tq[0-9]*} } } */
+/* { dg-final { scan-assembler-not {ldp\tx[0-9]*} } } */
+/* { dg-final { scan-assembler {stp\tq[0-9]*} } } */
+/* { dg-final { scan-assembler-not {stp\tx[0-9]*} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/large_struct_copy_2.c b/gcc/testsuite/gcc.target/aarch64/large_struct_copy_2.c
index 565434244e8296749a99bebc4e095065945d825e..8ee0a9fae92f805573b763ec679f2a8045afb479 100644
--- a/gcc/testsuite/gcc.target/aarch64/large_struct_copy_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/large_struct_copy_2.c
@@ -16,11 +16,10 @@ struct S2 {
   struct S0 f3;
 };
 
-void fn1 () {
-  struct S2 b = {0, 1, 7, 4073709551611, 4, 8, 7};
+void fn1 (struct S2 b) {
   a = b.f3;
 }
 
-/* { dg-final { scan-assembler-times {ldp\s+x[0-9]+} 2 } } */
-/* { dg-final { scan-assembler-times {stp\s+x[0-9]+} 2 } } */
+/* { dg-final { scan-assembler-times {ldp\s+q[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {stp\s+q[0-9]+} 1 } } */
 /* { dg-final { scan-assembler-not {ld[1-3]} } } */
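For illustration only (a hypothetical standalone sketch, not part of the
patch, which ignores the back-end's overlapping-copy handling of trailing
bytes), the chunk-selection arithmetic with the raised copy_limit works out
roughly like this:

/* Simplified model of the chunk splitting in aarch64_expand_cpymem:
   repeatedly pick the largest power-of-two chunk that fits both the
   remaining size and copy_limit.  */
#include <stdio.h>

int
main (void)
{
  int n = 48 * 8;		/* A 48-byte copy, in bits.  */
  const int copy_limit = 256;	/* Q-register pairs allowed.  */

  while (n > 0)
    {
      int chunk = copy_limit;
      while (chunk > n)
	chunk /= 2;
      /* 256 bits maps to an ldp/stp of Q registers, 128 bits to a
	 single TImode move, and so on down.  */
      printf ("copy %d bits\n", chunk);
      n -= chunk;
    }
  return 0;
}

For the 48-byte case this prints "copy 256 bits" then "copy 128 bits", i.e.
one Q-register pair followed by one TImode chunk.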