On Tue, Dec 9, 2025 at 9:38 PM Andrew Pinski <[email protected]>
wrote:

> On Tue, Dec 9, 2025 at 8:55 AM Konstantinos Eleftheriou
> <[email protected]> wrote:
> >
> > From: kelefth <[email protected]>
> >
> > This patch enables the avoid-store-forwarding pass by default at O2 or
> > higher on AArch64.
>
> I am not 100% convinced this should be enabled specifically for
> aarch64. It seems like a generic issue, and looking into the testcases,
> all of them should be handled at the gimple level, much earlier than RTL.
>

We just don't have enough data to show that this would be beneficial on
other architectures. There has been some discussion about introducing a
cost function so that targets can decide for themselves (see the threads
of previous versions), but there was no clear decision, so we went with
this approach for now.

>
> >
> > The assembly patterns in `bitfield-bitint-abi-align16.c` and
> > `bitfield-bitint-abi-align8.c` have been updated to account for
> > the asf transformations.
> >
> > gcc/ChangeLog:
> >
> >         * common/config/aarch64/aarch64-common.cc: Enable asf at O2.
> >         * doc/invoke.texi: Document asf as an O2-enabled option for AArch64.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         * gcc.target/aarch64/bitfield-bitint-abi-align16.c:
> >         Modify testcases to account for the asf transformations.
> >         * gcc.target/aarch64/bitfield-bitint-abi-align8.c: Likewise.
> >         * gcc.target/aarch64/avoid-store-forwarding-6.c: New test.
> >
> > Co-Authored-By: Christoph Müllner <[email protected]>
> > Signed-off-by: Konstantinos Eleftheriou <[email protected]>
> > ---
> >
> > Changes in v5:
> > - Move `store_bit_field` destination register fix to `store_bit_field_1`.
> >
> > Changes in v4:
> > - Enable asf on AArch64 only.
> > - Remove `avoid_store_forwarding_reorder_cost_p` function.
> > - Remove `asf-default-cost-value` parameter.
> >
> > Changes in v3:
> > - Add `avoid_store_forwarding_reorder_cost_p` function.
> > - Add `asf-default-cost-value` parameter.
> > - Use `avoid_store_forwarding_reorder_cost_p` for each store in ASF.
> > - Update x86_64 testcases.
> > - Update assembly patterns in `bitfield-bitint-abi-align16.c` and
> > `bitfield-bitint-abi-align8.c`.
> > - Fix bug where the result of `store_bit_field` is not used in the
> > following instructions.
> >
> > Changes in v2:
> > - Update assembly patterns in `bitfield-bitint-abi-align16.c` and
> > `bitfield-bitint-abi-align8.c`.
> >
> > Changes in v1:
> > - Enable asf by default at O2 or higher.
> >
> >  gcc/common/config/aarch64/aarch64-common.cc   |  2 ++
> >  gcc/doc/invoke.texi                           |  2 +-
> >  .../aarch64/avoid-store-forwarding-6.c        | 29 +++++++++++++++++++
> >  .../aarch64/bitfield-bitint-abi-align16.c     | 25 +++++++++-------
> >  .../aarch64/bitfield-bitint-abi-align8.c      | 25 +++++++++-------
> >  5 files changed, 60 insertions(+), 23 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/avoid-store-forwarding-6.c
> >
> > diff --git a/gcc/common/config/aarch64/aarch64-common.cc b/gcc/common/config/aarch64/aarch64-common.cc
> > index 1488697c6ce4..c3cd3eca9b84 100644
> > --- a/gcc/common/config/aarch64/aarch64-common.cc
> > +++ b/gcc/common/config/aarch64/aarch64-common.cc
> > @@ -60,6 +60,8 @@ static const struct default_options aarch_option_optimization_table[] =
> >      /* Enable redundant extension instructions removal at -O2 and higher.  */
> >      { OPT_LEVELS_2_PLUS, OPT_free, NULL, 1 },
> >      { OPT_LEVELS_2_PLUS, OPT_mearly_ra_, NULL, AARCH64_EARLY_RA_ALL },
> > +    /* Enable store forwarding avoidance at -O2 and higher.  */
> > +    { OPT_LEVELS_2_PLUS, OPT_favoid_store_forwarding, NULL, 1 },
> >  #if (TARGET_DEFAULT_ASYNC_UNWIND_TABLES == 1)
> >      { OPT_LEVELS_ALL, OPT_fasynchronous_unwind_tables, NULL, 1 },
> >      { OPT_LEVELS_ALL, OPT_funwind_tables, NULL, 1},
> > diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> > index 2da2a27acd68..ef73a2b6302a 100644
> > --- a/gcc/doc/invoke.texi
> > +++ b/gcc/doc/invoke.texi
> > @@ -13500,7 +13500,7 @@ Many CPUs will stall for many cycles when a load partially depends on previous
> >  smaller stores.  This pass tries to detect such cases and avoid the penalty by
> >  changing the order of the load and store and then fixing up the loaded value.
> >
> > -Disabled by default.
> > +Enabled by default at @option{-O2} and higher on AArch64.
> >
> >  @opindex ffp-contract
> >  @item -ffp-contract=@var{style}
> > diff --git a/gcc/testsuite/gcc.target/aarch64/avoid-store-forwarding-6.c b/gcc/testsuite/gcc.target/aarch64/avoid-store-forwarding-6.c
> > new file mode 100644
> > index 000000000000..320fa5e101e6
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/avoid-store-forwarding-6.c
> > @@ -0,0 +1,29 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2" } */
> > +
> > +/* Same as avoid-store-forwarding-1.c but without -favoid-store-forwarding.  */
> > +
> > +typedef union {
> > +    char arr_8[8];
> > +    long long_value;
> > +} DataUnion;
> > +
> > +long ssll_1 (DataUnion *data, char x)
> > +{
> > +  data->arr_8[0] = x;
> > +  return data->long_value;
> > +}
> > +
> > +long ssll_2 (DataUnion *data, char x)
> > +{
> > +  data->arr_8[1] = x;
> > +  return data->long_value;
> > +}
> > +
> > +long ssll_3 (DataUnion *data, char x)
> > +{
> > +  data->arr_8[7] = x;
> > +  return data->long_value;
> > +}
> > +
> > +/* { dg-final { scan-assembler-times {ldr\tx[0-9]+, \[x[0-9]+\]\n\tstrb\tw[0-9]+, \[x[0-9]+(, \d+)?\]\n\tbfi\tx[0-9]+, x[0-9]+, \d+, \d+} 3 } } */
>
> Can you use check-function-bodies instead of scan-assembler-times?
>

Will do.
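
For reference, a rough sketch of what the check-function-bodies version
could look like, showing just the first function (the others would follow
the same pattern). The instruction sequence in the body is illustrative
only and would need to be adjusted to whatever the compiler actually emits:

/* { dg-do compile } */
/* { dg-options "-O2" } */
/* { dg-final { check-function-bodies "**" "" } } */

typedef union {
    char arr_8[8];
    long long_value;
} DataUnion;

/* Expected shape after ASF: the load is hoisted above the byte store,
   and the stored byte is merged back into the loaded value with bfi.  */
/*
** ssll_1:
**	ldr	(x[0-9]+), \[x0\]
**	strb	w1, \[x0\]
**	bfi	\1, x1, 0, 8
**	mov	x0, \1
**	ret
*/
long ssll_1 (DataUnion *data, char x)
{
  data->arr_8[0] = x;
  return data->long_value;
}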

> Also this seems like this optimization should be at the gimple level
> instead of the RTL level for the above case and even the bitfield
> case.
>
>
> > diff --git a/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align16.c b/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align16.c
> > index c29a230a7713..34f3d7f9653c 100644
> > --- a/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align16.c
> > +++ b/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align16.c
> > @@ -87,14 +87,15 @@
> >  **     sub     sp, sp, #16
> >  **     mov     (x[0-9]+), x0
> >  **     mov     w0, w1
> > +**     mov     x1, 0
> >  **     sbfx    x([0-9]+), \1, 0, 63
> >  **     mov     (w[0-9]+), 0
> >  **     bfi     \3, w\2, 0, 1
> >  **     and     x3, x\2, 9223372036854775807
> > -**     mov     x2, 0
> > -**     str     xzr, \[sp\]
> > +**     str     x1, \[sp\]
>
> This is worse code generation here, since x1 is zero. Why is it the
> case that x1 is set to 0 above and then stored?
>
> >  **     strb    \3, \[sp\]
> > -**     ldr     x1, \[sp\]
> > +**     bfi     x1, x2, 0, 8
>
> This is not good code generation at all. Shouldn't this just be `and
> x1, x2, 0xff`?
>
> More of the same below.
> I am not 100% sure this is a win either.
> Why is pass_rtl_avoid_store_forwarding right before RA instead of
> before combine, where the above optimizations could happen?
>

These issues are indeed fixed if we move the pass before 'combine'; it's
something we had already considered in the past (along with running the
pass twice). We were planning to send a separate patch about this later,
as this one is about the default enablement of the pass in its current
state. We can change it now, though.

Thanks,
Konstantinos

>
> Thanks,
> Andrew Pinski
>
>
> > +**     mov     x2, 0
> >  **     add     sp, sp, 16
> >  **     b       fp
> >  */
> > @@ -183,19 +184,20 @@
> >  **     sxtw    (x[0-9]+), w1
> >  **     mov     x0, \2
> >  **     and     x7, \2, 9223372036854775807
> > +**     mov     x2, 0
> >  **     mov     (w[0-9]+), 0
> >  **     bfi     \3, w\1, 0, 1
> >  **     strb    wzr, \[sp, 16\]
> >  **     mov     x6, x7
> >  **     mov     x5, x7
> >  **     mov     x4, x7
> > +**     mov     x1, x7
> > +**     str     x2, \[sp, 48\]
> > +**     strb    \3, \[sp, 48\]
> > +**     bfi     x2, x3, 0, 8
> > +**     stp     x7, x2, \[sp\]
> >  **     mov     x3, x7
> >  **     mov     x2, x7
> > -**     str     xzr, \[sp, 48\]
> > -**     strb    \3, \[sp, 48\]
> > -**     ldr     (x[0-9]+), \[sp, 48\]
> > -**     stp     x7, \4, \[sp\]
> > -**     mov     x1, x7
> >  **     bl      fp_stack
> >  **     sbfx    x0, x0, 0, 63
> >  **...
> > @@ -341,12 +343,13 @@
> >  **...
> >  **     mov     x([0-9]+), x0
> >  **     mov     w0, w1
> > +**     mov     x1, 0
> >  **     mov     (w[0-9]+), 0
> >  **     bfi     \2, w\1, 0, 1
> > -**     mov     x2, 0
> > -**     str     xzr, \[sp\]
> > +**     str     x1, \[sp\]
> >  **     strb    \2, \[sp\]
> > -**     ldr     x1, \[sp\]
> > +**     bfi     x1, x2, 0, 8
> > +**     mov     x2, 0
> >  **...
> >  **     b       fp_stdarg
> >  */
> > diff --git a/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align8.c b/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align8.c
> > index 13ffbf416cab..d9cefbabb80c 100644
> > --- a/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align8.c
> > +++ b/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align8.c
> > @@ -87,14 +87,15 @@
> >  **     sub     sp, sp, #16
> >  **     mov     (x[0-9]+), x0
> >  **     mov     w0, w1
> > +**     mov     x1, 0
> >  **     sbfx    x([0-9]+), \1, 0, 63
> >  **     mov     (w[0-9]+), 0
> >  **     bfi     \3, w\2, 0, 1
> >  **     and     x3, x\2, 9223372036854775807
> > -**     mov     x2, 0
> > -**     str     xzr, \[sp\]
> > +**     str     x1, \[sp\]
> >  **     strb    \3, \[sp\]
> > -**     ldr     x1, \[sp\]
> > +**     bfi     x1, x2, 0, 8
> > +**     mov     x2, 0
> >  **     add     sp, sp, 16
> >  **     b       fp
> >  */
> > @@ -183,19 +184,20 @@
> >  **     sxtw    (x[0-9]+), w1
> >  **     mov     x0, \2
> >  **     and     x7, \2, 9223372036854775807
> > +**     mov     x2, 0
> >  **     mov     (w[0-9]+), 0
> >  **     bfi     \3, w\1, 0, 1
> >  **     strb    wzr, \[sp, 16\]
> >  **     mov     x6, x7
> >  **     mov     x5, x7
> >  **     mov     x4, x7
> > +**     mov     x1, x7
> > +**     str     x2, \[sp, 48\]
> > +**     strb    \3, \[sp, 48\]
> > +**     bfi     x2, x3, 0, 8
> > +**     stp     x7, x2, \[sp\]
> >  **     mov     x3, x7
> >  **     mov     x2, x7
> > -**     str     xzr, \[sp, 48\]
> > -**     strb    \3, \[sp, 48\]
> > -**     ldr     (x[0-9]+), \[sp, 48\]
> > -**     stp     x7, \4, \[sp\]
> > -**     mov     x1, x7
> >  **     bl      fp_stack
> >  **     sbfx    x0, x0, 0, 63
> >  **...
> > @@ -343,12 +345,13 @@
> >  **...
> >  **     mov     x([0-9]+), x0
> >  **     mov     w0, w1
> > +**     mov     x1, 0
> >  **     mov     (w[0-9]+), 0
> >  **     bfi     \2, w\1, 0, 1
> > -**     mov     x2, 0
> > -**     str     xzr, \[sp\]
> > +**     str     x1, \[sp\]
> >  **     strb    \2, \[sp\]
> > -**     ldr     x1, \[sp\]
> > +**     bfi     x1, x2, 0, 8
> > +**     mov     x2, 0
> >  **...
> >  **     b       fp_stdarg
> >  */
> > --
> > 2.51.1
> >
>
