> -----Original Message-----
> From: Richard Sandiford <richard.sandif...@arm.com>
> Sent: Thursday, January 9, 2025 3:09 PM
> To: Tamar Christina <tamar.christ...@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <n...@arm.com>; Richard Earnshaw <richard.earns...@arm.com>; ktkac...@gcc.gnu.org
> Subject: Re: [PATCH]AArch64: Fix costing of emulated gathers/scatters [PR118188]
>
> Tamar Christina <tamar.christ...@arm.com> writes:
> >> > +	 After the final loads are done it issues a
> >> > +	 vec_construct to recreate the vector from the scalars.  For costing,
> >> > +	 when we see a vec_to_scalar on a stmt with VMAT_GATHER_SCATTER we
> >> > +	 are dealing with an emulated instruction and should adjust costing
> >> > +	 properly.  */
> >> > +      if (kind == vec_to_scalar
> >> > +	  && (m_vec_flags & VEC_ADVSIMD)
> >> > +	  && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER)
> >> > +	{
> >> > +	  auto dr = STMT_VINFO_DATA_REF (stmt_info);
> >> > +	  tree dr_ref = DR_REF (dr);
> >> > +	  /* Only really expect MEM_REF or ARRAY_REF here.  Ignore the rest.  */
> >> > +	  switch (TREE_CODE (dr_ref))
> >> > +	    {
> >> > +	    case MEM_REF:
> >> > +	    case ARRAY_REF:
> >> > +	    case TARGET_MEM_REF:
> >> > +	      {
> >> > +		tree offset = TREE_OPERAND (dr_ref, 1);
> >> > +		if (SSA_VAR_P (offset)
> >> > +		    && gimple_vuse (SSA_NAME_DEF_STMT (offset)))
> >> > +		  {
> >> > +		    if (STMT_VINFO_TYPE (stmt_info) == load_vec_info_type)
> >> > +		      ops->loads += count - 1;
> >> > +		    else
> >> > +		      /* Stores want to count both the index to array and
> >> > +			 data to array using vec_to_scalar.  However, we have
> >> > +			 index stores in Adv.SIMD and so we only want to
> >> > +			 adjust the index loads.  */
> >> > +		      ops->loads += count / 2;
> >> > +		    return;
> >> > +		  }
> >> > +		break;
> >> > +	      }
> >> > +	    default:
> >> > +	      break;
> >> > +	    }
> >> > +	}
> >>
> >> Operand 1 of MEM_REF and TARGET_MEM_REF are always constant, so the
> >> handling of those codes looks redundant.  Perhaps we should instead use:
> >>
> >>   while (handled_component_p (dr_ref))
> >>     {
> >>       if (TREE_CODE (dr_ref) == ARRAY_REF)
> >>         {
> >>           ...early return or break if SSA offset found...
> >>         }
> >>       dr_ref = TREE_OPERAND (dr_ref, 0);
> >>     }
> >>
> >> A gather load could reasonably be to COMPONENT_REFs or BIT_FIELD_REFs of
> >> an ARRAY_REF, so the ARRAY_REF might not be the outermost code.
> >>
> >
> > Ah, I tried to find their layouts but couldn't readily find one.
> > Interesting, I would have thought a scatter/gather on either of those
> > is more efficiently done through bit masking.
>
> It can happen in things like:
>
> #define iterations 100000
> #define LEN_1D 32000
>
> typedef struct { float f; } floats;
> float a[LEN_1D];
> floats b[LEN_1D][2];
>
> float
> s4115 (int *ip)
> {
>   float sum = 0.;
>   for (int i = 0; i < LEN_1D; i++)
>     {
>       sum += a[i] * b[ip[i]][0].f;
>     }
>   return sum;
> }
>
> where the outer component reference is the ".f" and the array reference
> is hidden inside.  This example also shows why I think we should only
> break on SSA offsets: the outer array reference has a constant index,
> but the inner one is variable and is loaded from memory.
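>
> (A rough sketch, from memory, of the GENERIC reference tree for that
> access; treat the exact operand layout as illustrative only:
>
>   COMPONENT_REF <.f>
>     ARRAY_REF <index 0>         constant index
>       ARRAY_REF <index ip[i]>   SSA name, defined by a load
>         b
>
> The handled_component_p walk therefore peels the COMPONENT_REF and the
> constant-index ARRAY_REF before it reaches the variable-index one.)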
Ah, that makes sense: the gather is of the bitfield itself, not of an
address computed from a bitfield.

> So:
>
> > Here's the updated version of the patch.
> >
> > Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> >     PR target/118188
> >     * config/aarch64/aarch64.cc (aarch64_vector_costs::count_ops): Adjust
> >     throughput of emulated gathers and scatters.
> >
> > gcc/testsuite/ChangeLog:
> >
> >     PR target/118188
> >     * gcc.target/aarch64/sve/gather_load_12.c: New test.
> >     * gcc.target/aarch64/sve/gather_load_13.c: New test.
> >     * gcc.target/aarch64/sve/gather_load_14.c: New test.
> >
> > -- inline copy of patch --
> >
> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> > index 3e700ed41e97a98dc844cad1c8a66a3555d82221..6a7abceda466f5569d743b0d3a0eed1bbeb79a2c 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -17378,6 +17378,46 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
> >        return;
> >      }
> >
> > +  /* Detect the case where we are using an emulated gather/scatter.  When
> > +     a target does not support gathers and scatters directly the
> > +     vectorizer emulates these by constructing an index vector and then
> > +     issuing an extraction for every lane in the vector.  If the index
> > +     vector is loaded from memory, the vector load and extractions are
> > +     subsequently lowered by veclower into a series of scalar index loads.
> > +     After the final loads are done it issues a vec_construct to recreate
> > +     the vector from the scalars.  For costing, when we see a
> > +     vec_to_scalar on a stmt with VMAT_GATHER_SCATTER we are dealing with
> > +     an emulated instruction and should adjust costing properly.  */
> > +  if (kind == vec_to_scalar
> > +      && (m_vec_flags & VEC_ADVSIMD)
> > +      && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER)
> > +    {
> > +      auto dr = STMT_VINFO_DATA_REF (stmt_info);
> > +      tree dr_ref = DR_REF (dr);
> > +      while (handled_component_p (dr_ref))
> > +        {
> > +          if (TREE_CODE (dr_ref) == ARRAY_REF)
> > +            {
> > +              tree offset = TREE_OPERAND (dr_ref, 1);
> > +              if (SSA_VAR_P (offset)
> > +                  && gimple_vuse (SSA_NAME_DEF_STMT (offset)))
> > +                {
> > +                  if (STMT_VINFO_TYPE (stmt_info) == load_vec_info_type)
> > +                    ops->loads += count - 1;
> > +                  else
> > +                    /* Stores want to count both the index to array and
> > +                       data to array using vec_to_scalar.  However, we
> > +                       have index stores in Adv.SIMD and so we only want
> > +                       to adjust the index loads.  */
> > +                    ops->loads += count / 2;
> > +                  return;
> > +                }
> > +              else
> > +                break;
>
> ...rather than break for all ARRAY_REFs, I think we should do:
>
>   if (TREE_CODE (dr_ref) == ARRAY_REF)
>     {
>       tree offset = TREE_OPERAND (dr_ref, 1);
>       if (SSA_VAR_P (offset))
>         {
>           if (gimple_vuse (SSA_NAME_DEF_STMT (offset)))
>             {
>               if (STMT_VINFO_TYPE (stmt_info) == load_vec_info_type)
>                 ops->loads += count - 1;
>               else
>                 /* Stores want to count both the index to array and
>                    data to array using vec_to_scalar.  However, we
>                    have index stores in Adv.SIMD and so we only want
>                    to adjust the index loads.  */
>                 ops->loads += count / 2;
>               return;
>             }
>           break;
>         }
>     }
>
> OK with that change if you agree, and if it works.  (I did check the
> above example locally, FWIW.)
>

Yes, that works for me.  Thanks for the review and the examples!
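
For the record, a minimal sketch of the store-side shape that the
count / 2 adjustment is aimed at (a made-up example in the style of the
new tests, not one of them):

#define LEN_1D 32000

int ip[LEN_1D];
float b[LEN_1D];

void
scatter_store (float *restrict x)
{
  /* Emulated scatter: indices are loaded from memory, so each lane
     needs a scalar index load before the lane store.  */
  for (int i = 0; i < LEN_1D; i++)
    b[ip[i]] = x[i];
}

Each lane gets a vec_to_scalar for both the index and the data, but
only the index extractions become scalar loads; the data lanes can be
stored directly with Adv.SIMD lane stores, hence adjusting only half
of count.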
Tamar

> Thanks,
> Richard
>
> > +            }
> > +          dr_ref = TREE_OPERAND (dr_ref, 0);
> > +        }
> > +    }
> >
> >    /* Count the basic operation cost associated with KIND.  */
> >    switch (kind)
> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/gather_load_12.c b/gcc/testsuite/gcc.target/aarch64/sve/gather_load_12.c
> > new file mode 100644
> > index 0000000000000000000000000000000000000000..d550f005d638f260973312fbccd05e2bb9aef1a7
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/gather_load_12.c
> > @@ -0,0 +1,20 @@
> > +/* { dg-do compile } */
> > +/* { dg-additional-options "-Ofast -mcpu=neoverse-v2" } */
> > +
> > +#define iterations 100000
> > +#define LEN_1D 32000
> > +
> > +float a[LEN_1D], b[LEN_1D];
> > +
> > +float
> > +s4115 (int *ip)
> > +{
> > +  float sum = 0.;
> > +  for (int i = 0; i < LEN_1D; i++)
> > +    {
> > +      sum += a[i] * b[ip[i]];
> > +    }
> > +  return sum;
> > +}
> > +
> > +/* { dg-final { scan-assembler {\s+ld1w\t} } } */
> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/gather_load_13.c b/gcc/testsuite/gcc.target/aarch64/sve/gather_load_13.c
> > new file mode 100644
> > index 0000000000000000000000000000000000000000..24da0646a75e41eae0c0dd6a2b8af58631ac94cb
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/gather_load_13.c
> > @@ -0,0 +1,20 @@
> > +/* { dg-do compile } */
> > +/* { dg-additional-options "-Ofast -mcpu=neoverse-v2" } */
> > +
> > +#define iterations 100000
> > +#define LEN_1D 32000
> > +
> > +float a[LEN_1D], b[LEN_1D];
> > +
> > +float
> > +s4115 (int *ip)
> > +{
> > +  float sum = 0.;
> > +  for (int i = 0; i < LEN_1D; i++)
> > +    {
> > +      sum += a[i] * b[ip[i] + 1];
> > +    }
> > +  return sum;
> > +}
> > +
> > +/* { dg-final { scan-assembler {\s+ld1w\t} { xfail *-*-* } } } */
> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/gather_load_14.c b/gcc/testsuite/gcc.target/aarch64/sve/gather_load_14.c
> > new file mode 100644
> > index 0000000000000000000000000000000000000000..77d06d2a0e0ee89e5e45730eebf9b4bebfa2aaed
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/gather_load_14.c
> > @@ -0,0 +1,20 @@
> > +/* { dg-do compile } */
> > +/* { dg-additional-options "-Ofast -mcpu=neoverse-v2" } */
> > +
> > +#define iterations 100000
> > +#define LEN_1D 32000
> > +
> > +float a[LEN_1D], b[LEN_1D];
> > +
> > +float
> > +s4115 (int *ip)
> > +{
> > +  float sum = 0.;
> > +  for (int i = 0; i < LEN_1D; i++)
> > +    {
> > +      sum += a[i] * b[ip[i]];
> > +    }
> > +  return sum;
> > +}
> > +
> > +/* { dg-final { scan-assembler-not {\s+st1w\t} } } */
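
(For reference: in a built GCC tree, the new tests can be run on their
own with something like

  make check-gcc RUNTESTFLAGS="aarch64-sve.exp=gather_load_1[234].c"

where aarch64-sve.exp is the DejaGnu driver for the directory the tests
live in.)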