Jennifer Schmitz <jschm...@nvidia.com> writes: > For the test case > int32_t foo (svint32_t x) > { > svbool_t pg = svpfalse (); > return svlastb_s32 (pg, x); > } > compiled with -O3 -mcpu=grace -msve-vector-bits=128, GCC produced: > foo: > pfalse p3.b > lastb w0, p3, z0.s > ret > when it could use a Neon lane extract instead: > foo: > umov w0, v0.s[3] > ret > > We implemented this optimization by guarding the emission of > pfalse+lastb in the pattern vec_extract<mode><Vel> by > known_gt (BYTES_PER_SVE_VECTOR, 16). Thus, for a last-extract operation > in 128-bit VLS, the pattern *vec_extract<mode><Vel>_v128 is used instead. > > The patch was bootstrapped and tested on aarch64-linux-gnu, no regression. > OK for mainline? > > Signed-off-by: Jennifer Schmitz <jschm...@nvidia.com> > > gcc/ > * config/aarch64/aarch64-sve.md (vec_extract<mode><Vel>): > Prevent the emission of pfalse+lastb for 128-bit VLS. > > gcc/testsuite/ > * gcc.target/aarch64/sve/extract_last_128.c: New test.
OK, thanks. Richard > --- > gcc/config/aarch64/aarch64-sve.md | 7 ++-- > .../gcc.target/aarch64/sve/extract_last_128.c | 33 +++++++++++++++++++ > 2 files changed, 37 insertions(+), 3 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/extract_last_128.c > > diff --git a/gcc/config/aarch64/aarch64-sve.md > b/gcc/config/aarch64/aarch64-sve.md > index 3dbd65986ec..824bd877e47 100644 > --- a/gcc/config/aarch64/aarch64-sve.md > +++ b/gcc/config/aarch64/aarch64-sve.md > @@ -2969,10 +2969,11 @@ > { > poly_int64 val; > if (poly_int_rtx_p (operands[2], &val) > - && known_eq (val, GET_MODE_NUNITS (<MODE>mode) - 1)) > + && known_eq (val, GET_MODE_NUNITS (<MODE>mode) - 1) > + && known_gt (BYTES_PER_SVE_VECTOR, 16)) > { > - /* The last element can be extracted with a LASTB and a false > - predicate. */ > + /* Extract the last element with a LASTB and a false predicate. > + Exclude 128-bit VLS to use *vec_extract<mode><Vel>_v128. */ > rtx sel = aarch64_pfalse_reg (<VPRED>mode); > emit_insn (gen_extract_last_<mode> (operands[0], sel, operands[1])); > DONE; > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/extract_last_128.c > b/gcc/testsuite/gcc.target/aarch64/sve/extract_last_128.c > new file mode 100644 > index 00000000000..71d3561ec60 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve/extract_last_128.c > @@ -0,0 +1,33 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -msve-vector-bits=128" } */ > + > +#include <arm_sve.h> > + > +#define TEST(TYPE, TY) \ > + TYPE exract_last_##TY (sv##TYPE x) \ > + { \ > + svbool_t pg = svpfalse (); \ > + return svlastb_##TY (pg, x); \ > + } > + > +TEST(bfloat16_t, bf16) > +TEST(float16_t, f16) > +TEST(float32_t, f32) > +TEST(float64_t, f64) > +TEST(int8_t, s8) > +TEST(int16_t, s16) > +TEST(int32_t, s32) > +TEST(int64_t, s64) > +TEST(uint8_t, u8) > +TEST(uint16_t, u16) > +TEST(uint32_t, u32) > +TEST(uint64_t, u64) > + > +/* { dg-final { scan-assembler-times {\tdup\th0, v0\.h\[7\]} 2 } } */ > +/* { dg-final { scan-assembler-times {\tdup\ts0, v0\.s\[3\]} 1 } } */ > +/* { dg-final { scan-assembler-times {\tdup\td0, v0\.d\[1\]} 1 } } */ > +/* { dg-final { scan-assembler-times {\tumov\tw0, v0\.h\[7\]} 2 } } */ > +/* { dg-final { scan-assembler-times {\tumov\tw0, v0\.b\[15\]} 2 } } */ > +/* { dg-final { scan-assembler-times {\tumov\tw0, v0\.s\[3\]} 2 } } */ > +/* { dg-final { scan-assembler-times {\tumov\tx0, v0\.d\[1\]} 2 } } */ > +/* { dg-final { scan-assembler-not "lastb" } } */ > \ No newline at end of file