Tamar Christina <tamar.christ...@arm.com> writes:
>> > +     After the final loads are done it issues a
>> > +     vec_construct to recreate the vector from the scalar.  For costing
>> > +     when we see a vec_to_scalar on a stmt with VMAT_GATHER_SCATTER we are
>> > +     dealing with an emulated instruction and should adjust costing
>> > +     properly.  */
>> > +  if (kind == vec_to_scalar
>> > +      && (m_vec_flags & VEC_ADVSIMD)
>> > +      && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER)
>> > +    {
>> > +      auto dr = STMT_VINFO_DATA_REF (stmt_info);
>> > +      tree dr_ref = DR_REF (dr);
>> > +      /* Only really expect MEM_REF or ARRAY_REF here.  Ignore the rest.  */
>> > +      switch (TREE_CODE (dr_ref))
>> > +        {
>> > +        case MEM_REF:
>> > +        case ARRAY_REF:
>> > +        case TARGET_MEM_REF:
>> > +          {
>> > +            tree offset = TREE_OPERAND (dr_ref, 1);
>> > +            if (SSA_VAR_P (offset)
>> > +                && gimple_vuse (SSA_NAME_DEF_STMT (offset)))
>> > +              {
>> > +                if (STMT_VINFO_TYPE (stmt_info) == load_vec_info_type)
>> > +                  ops->loads += count - 1;
>> > +                else
>> > +                  /* Stores want to count both the index to array and data to
>> > +                     array using vec_to_scalar.  However we have index stores in
>> > +                     Adv.SIMD and so we only want to adjust the index loads.  */
>> > +                  ops->loads += count / 2;
>> > +                return;
>> > +              }
>> > +            break;
>> > +          }
>> > +        default:
>> > +          break;
>> > +        }
>> > +    }
>>
>> Operand 1 of MEM_REF and TARGET_MEM_REF are always constant, so the
>> handling of those codes looks redundant.  Perhaps we should instead use:
>>
>>   while (handled_component_p (dr_ref))
>>     {
>>       if (TREE_CODE (dr_ref) == ARRAY_REF)
>>         {
>>           ...early return or break if SSA offset found...
>>         }
>>       dr_ref = TREE_OPERAND (dr_ref, 0);
>>     }
>>
>> A gather load could reasonably be to COMPONENT_REFs or BIT_FIELD_REFs of
>> an ARRAY_REF, so the ARRAY_REF might not be the outermost code.
>>
>
> Ah, I tried to find their layouts but couldn't readily find one.
> Interesting, I would have thought a scatter/gather on either of those is
> more efficiently done through bit masking.
It can happen in things like:

#define iterations 100000
#define LEN_1D 32000

typedef struct { float f; } floats;

float a[LEN_1D];
floats b[LEN_1D][2];

float
s4115 (int *ip)
{
  float sum = 0.;
  for (int i = 0; i < LEN_1D; i++)
    {
      sum += a[i] * b[ip[i]][0].f;
    }
  return sum;
}

where the outer component reference is the ".f" and the array reference is
hidden inside.

This example also shows why I think we should only break on SSA offsets:
the outer array reference has a constant index, but the inner one is
variable and is loaded from memory.  So:

> Here's updated version of patch.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
>         PR target/118188
>         * config/aarch64/aarch64.cc (aarch64_vector_costs::count_ops): Adjust
>         throughput of emulated gather and scatters.
>
> gcc/testsuite/ChangeLog:
>
>         PR target/118188
>         * gcc.target/aarch64/sve/gather_load_12.c: New test.
>         * gcc.target/aarch64/sve/gather_load_13.c: New test.
>         * gcc.target/aarch64/sve/gather_load_14.c: New test.
>
> -- inline copy of patch --
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 3e700ed41e97a98dc844cad1c8a66a3555d82221..6a7abceda466f5569d743b0d3a0eed1bbeb79a2c 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -17378,6 +17378,46 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
>        return;
>      }
>
> +  /* Detect the case where we are using an emulated gather/scatter.  When a
> +     target does not support gathers and scatters directly the vectorizer
> +     emulates these by constructing an index vector and then issuing an
> +     extraction for every lane in the vector.  If the index vector is loaded
> +     from memory, the vector load and extractions are subsequently lowered by
> +     veclower into a series of scalar index loads.  After the final loads are
> +     done it issues a vec_construct to recreate the vector from the scalar.  For
> +     costing when we see a vec_to_scalar on a stmt with VMAT_GATHER_SCATTER we
> +     are dealing with an emulated instruction and should adjust costing
> +     properly.  */
> +  if (kind == vec_to_scalar
> +      && (m_vec_flags & VEC_ADVSIMD)
> +      && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER)
> +    {
> +      auto dr = STMT_VINFO_DATA_REF (stmt_info);
> +      tree dr_ref = DR_REF (dr);
> +      while (handled_component_p (dr_ref))
> +        {
> +          if (TREE_CODE (dr_ref) == ARRAY_REF)
> +            {
> +              tree offset = TREE_OPERAND (dr_ref, 1);
> +              if (SSA_VAR_P (offset)
> +                  && gimple_vuse (SSA_NAME_DEF_STMT (offset)))
> +                {
> +                  if (STMT_VINFO_TYPE (stmt_info) == load_vec_info_type)
> +                    ops->loads += count - 1;
> +                  else
> +                    /* Stores want to count both the index to array and data to
> +                       array using vec_to_scalar.  However we have index stores
> +                       in Adv.SIMD and so we only want to adjust the index
> +                       loads.  */
> +                    ops->loads += count / 2;
> +                  return;
> +                }
> +              else
> +                break;

...rather than break for all ARRAY_REFs, I think we should do:

          if (TREE_CODE (dr_ref) == ARRAY_REF)
            {
              tree offset = TREE_OPERAND (dr_ref, 1);
              if (SSA_VAR_P (offset))
                {
                  if (gimple_vuse (SSA_NAME_DEF_STMT (offset)))
                    {
                      if (STMT_VINFO_TYPE (stmt_info) == load_vec_info_type)
                        ops->loads += count - 1;
                      else
                        /* Stores want to count both the index to array and
                           data to array using vec_to_scalar.  However we have
                           index stores in Adv.SIMD and so we only want to
                           adjust the index loads.  */
                        ops->loads += count / 2;
                      return;
                    }
                  break;
                }
            }

OK with that change if you agree, and if it works.
(I did check the above example locally FWIW.)

Thanks,
Richard

> +            }
> +          dr_ref = TREE_OPERAND (dr_ref, 0);
> +        }
> +    }
>
>    /* Count the basic operation cost associated with KIND.  */
>    switch (kind)
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/gather_load_12.c b/gcc/testsuite/gcc.target/aarch64/sve/gather_load_12.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..d550f005d638f260973312fbccd05e2bb9aef1a7
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/gather_load_12.c
> @@ -0,0 +1,20 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-Ofast -mcpu=neoverse-v2" } */
> +
> +#define iterations 100000
> +#define LEN_1D 32000
> +
> +float a[LEN_1D], b[LEN_1D];
> +
> +float
> +s4115 (int *ip)
> +{
> +  float sum = 0.;
> +  for (int i = 0; i < LEN_1D; i++)
> +    {
> +      sum += a[i] * b[ip[i]];
> +    }
> +  return sum;
> +}
> +
> +/* { dg-final { scan-assembler {\s+ld1w\t} } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/gather_load_13.c b/gcc/testsuite/gcc.target/aarch64/sve/gather_load_13.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..24da0646a75e41eae0c0dd6a2b8af58631ac94cb
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/gather_load_13.c
> @@ -0,0 +1,20 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-Ofast -mcpu=neoverse-v2" } */
> +
> +#define iterations 100000
> +#define LEN_1D 32000
> +
> +float a[LEN_1D], b[LEN_1D];
> +
> +float
> +s4115 (int *ip)
> +{
> +  float sum = 0.;
> +  for (int i = 0; i < LEN_1D; i++)
> +    {
> +      sum += a[i] * b[ip[i] + 1];
> +    }
> +  return sum;
> +}
> +
> +/* { dg-final { scan-assembler {\s+ld1w\t} { xfail *-*-* } } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/gather_load_14.c b/gcc/testsuite/gcc.target/aarch64/sve/gather_load_14.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..77d06d2a0e0ee89e5e45730eebf9b4bebfa2aaed
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/gather_load_14.c
> @@ -0,0 +1,20 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-Ofast -mcpu=neoverse-v2" } */
> +
> +#define iterations 100000
> +#define LEN_1D 32000
> +
> +float a[LEN_1D], b[LEN_1D];
> +
> +float
> +s4115 (int *ip)
> +{
> +  float sum = 0.;
> +  for (int i = 0; i < LEN_1D; i++)
> +    {
> +      sum += a[i] * b[ip[i]];
> +    }
> +  return sum;
> +}
> +
> +/* { dg-final { scan-assembler-not {\s+st1w\t} } } */
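
[Editor's note: for reference, the struct-member variant discussed above could
be written as a testcase in the same style as gather_load_12.c.  The sketch
below is only illustrative and is not part of the posted patch: the file name
is hypothetical, and the -mcpu option and scan pattern are copied from the
existing tests rather than verified against this case.]

/* Hypothetical gather_load_15.c, modelled on gather_load_12.c; not part of
   the posted patch.  */
/* { dg-do compile } */
/* { dg-additional-options "-Ofast -mcpu=neoverse-v2" } */

#define iterations 100000
#define LEN_1D 32000

typedef struct { float f; } floats;

float a[LEN_1D];
floats b[LEN_1D][2];

/* The gather index ip[i] is loaded from memory, but its ARRAY_REF sits under
   a constant-index ARRAY_REF and a COMPONENT_REF (".f"), so the cost hook
   only finds it by walking the handled components of the data reference.  */
float
s4115 (int *ip)
{
  float sum = 0.;
  for (int i = 0; i < LEN_1D; i++)
    {
      sum += a[i] * b[ip[i]][0].f;
    }
  return sum;
}

/* Scan pattern assumed to match gather_load_12.c; it would need to be checked
   against the actual code generation for the struct case.  */
/* { dg-final { scan-assembler {\s+ld1w\t} } } */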