Tamar Christina <tamar.christ...@arm.com> writes:
> Hi All,
>
> The following testcase
>
> #pragma GCC target ("+sve")
> extern char __attribute__ ((simd, const)) fn3 (int, short);
> void test_fn3 (float *a, float *b, double *c, int n)
> {
>   for (int i = 0; i < n; ++i)
>     a[i] = fn3 (b[i], c[i]);
> }
>
> at -Ofast ICEs because my previous patch only added support for combining 2
> partial SVE vectors into a bigger vector.  However, there can also be 4- and
> 8-piece subvectors.
>
> This patch fixes this by implementing the missing expansions.
>
> Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	PR target/96342
> 	PR middle-end/118272
> 	* config/aarch64/aarch64-sve.md (vec_init<mode><Vquad>,
> 	vec_initvnx16qivnx2qi): New.
> 	* config/aarch64/aarch64.cc (aarch64_sve_expand_vector_init_subvector):
> 	Rewrite to support arbitrary combinations.
> 	* config/aarch64/iterators.md (SVE_NO2E): Update to use SVE_NO4E.
> 	(SVE_NO4E, Vquad): New.
>
> gcc/testsuite/ChangeLog:
>
> 	PR target/96342
> 	PR middle-end/118272
> 	* gcc.target/aarch64/vect-simd-clone-3.c: New test.
>
> ---
>
> diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
> index 6659bb4fcab34699f22ff883825de1cd67108203..35f55bfacfc3238a8a7aa69015f36ba32981af59 100644
> --- a/gcc/config/aarch64/aarch64-sve.md
> +++ b/gcc/config/aarch64/aarch64-sve.md
> @@ -2839,6 +2839,7 @@ (define_expand "vec_init<mode><Vel>"
>    }
> )
>
> +;; Vector constructor combining two half vectors { a, b }
>  (define_expand "vec_init<mode><Vhalf>"
>    [(match_operand:SVE_NO2E 0 "register_operand")
>     (match_operand 1 "")]
> @@ -2849,6 +2850,28 @@ (define_expand "vec_init<mode><Vhalf>"
>    }
> )
>
> +;; Vector constructor combining four quarter vectors { a, b, c, d }
> +(define_expand "vec_init<mode><Vquad>"
> +  [(match_operand:SVE_NO4E 0 "register_operand")
> +   (match_operand 1 "")]
> +  "TARGET_SVE"
> +  {
> +    aarch64_sve_expand_vector_init_subvector (operands[0], operands[1]);
> +    DONE;
> +  }
> +)
> +
> +;; Vector constructor combining eight vectors { a, b, c, d, ... }
> +(define_expand "vec_initvnx16qivnx2qi"
> +  [(match_operand:VNx16QI 0 "register_operand")
> +   (match_operand 1 "")]
> +  "TARGET_SVE"
> +  {
> +    aarch64_sve_expand_vector_init_subvector (operands[0], operands[1]);
> +    DONE;
> +  }
> +)
> +
>  ;; Shift an SVE vector left and insert a scalar into element 0.
>  (define_insn "vec_shl_insert_<mode>"
>    [(set (match_operand:SVE_FULL 0 "register_operand")
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index cb9b155826d12b622ae0df1736e4b042d01cf56a..e062cc00d1a548290377382c98ea8f3bb9310513 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -24898,18 +24898,51 @@ aarch64_sve_expand_vector_init_subvector (rtx target, rtx vals)
>    machine_mode mode = GET_MODE (target);
>    int nelts = XVECLEN (vals, 0);
>
> -  gcc_assert (nelts == 2);
> +  gcc_assert (nelts % 2 == 0);
>
> -  rtx arg0 = XVECEXP (vals, 0, 0);
> -  rtx arg1 = XVECEXP (vals, 0, 1);
> -
> -  /* If we have two elements and are concatting vector.  */
> -  machine_mode elem_mode = GET_MODE (arg0);
> +  /* We must be concatenating vectors.  */
> +  machine_mode elem_mode = GET_MODE (XVECEXP (vals, 0, 0));
>    gcc_assert (VECTOR_MODE_P (elem_mode));
>
> -  arg0 = force_reg (elem_mode, arg0);
> -  arg1 = force_reg (elem_mode, arg1);
> -  emit_insn (gen_aarch64_pack_partial (mode, target, arg0, arg1));
> +  auto_vec<rtx> worklist;
> +  machine_mode wider_mode
> +    = related_vector_mode (elem_mode, GET_MODE_INNER (elem_mode),
> +			   GET_MODE_NUNITS (elem_mode) * 2).require ();
> +
> +  /* First create the first-level packs in the wider mode, which also lets
> +     us force the values to registers and put the elements in a more
> +     convenient data structure.  */
> +
> +  for (int i = 0; i < nelts; i += 2)
> +    {
> +      rtx arg0 = XVECEXP (vals, 0, i);
> +      rtx arg1 = XVECEXP (vals, 0, i + 1);
> +      arg0 = force_reg (elem_mode, arg0);
> +      arg1 = force_reg (elem_mode, arg1);
> +      rtx tmp = gen_reg_rtx (wider_mode);
> +      emit_insn (gen_aarch64_pack_partial (wider_mode, tmp, arg0, arg1));
> +      worklist.safe_push (tmp);
> +    }
> +
> +  /* Keep widening pairwise to have maximum throughput.  */
> +  while (worklist.length () > 1)
> +    {
> +      rtx arg0 = worklist.pop ();
> +      rtx arg1 = worklist.pop ();
> +      gcc_assert (GET_MODE (arg0) == GET_MODE (arg1));
> +
> +      wider_mode
> +	= related_vector_mode (wider_mode, GET_MODE_INNER (wider_mode),
> +			       GET_MODE_NUNITS (wider_mode) * 2).require ();
> +
> +      rtx tmp = gen_reg_rtx (wider_mode);
> +      emit_insn (gen_aarch64_pack_partial (wider_mode, tmp, arg0, arg1));
> +      worklist.safe_push (tmp);
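[For illustration, not part of the patch or review: a minimal standalone
simulation of the quoted pop/push loop that tracks only mode names, for
the VNx16QI-built-from-VNx2QI case.  The std::string mode labels and the
"wider" table are stand-ins for real rtx values and for the repeated
related_vector_mode doubling; the control flow mirrors the loop above.

  #include <cstdio>
  #include <string>
  #include <vector>

  int main ()
  {
    /* State after the first-level pack loop: the 8 VNx2QI inputs have
       been packed pairwise into 4 VNx4QI temporaries.  */
    std::vector<std::string> worklist (4, "VNx4QI");
    /* Stand-in for wider_mode, which doubles once per iteration.  */
    const char *wider[] = { "VNx8QI", "VNx16QI", "(no VNx32QI exists)" };
    for (int round = 0; worklist.size () > 1; round++)
      {
	/* Mirror worklist.pop () twice, then push the packed result.  */
	std::string arg0 = worklist.back (); worklist.pop_back ();
	std::string arg1 = worklist.back (); worklist.pop_back ();
	std::printf ("round %d: pack %s with %s into %s\n",
		     round, arg0.c_str (), arg1.c_str (), wider[round]);
	worklist.push_back (wider[round]);
      }
  }

Its output shows the second round already mixing modes, which is where
the gcc_assert in the quoted loop would fire:

  round 0: pack VNx4QI with VNx4QI into VNx8QI
  round 1: pack VNx8QI with VNx4QI into VNx16QI
  round 2: pack VNx16QI with VNx4QI into (no VNx32QI exists)]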
It looks like this might not work for the VNx16QI-from-VNx2QI case,
since we'd start with 4 VNx4QIs on the worklist, and after the first
iteration would have:

  VNx4QI VNx4QI VNx8QI

We'd then try to pack the VNx8QI and the second VNx4QI together.

How about instead doing something like:

  worklist.reserve (nelts);
  for (int i = 0; i < nelts; ++i)
    worklist.quick_push (force_reg (elem_mode, XVECEXP (vals, 0, i)));
  while (nelts > 2)
    {
      for (int i = 0; i < nelts; i += 2)
	{
	  ...read worklist[i] and worklist[i + 1]...
	  ...write back to worklist[i / 2]...
	}
      nelts /= 2;
    }
  emit_insn (gen_aarch64_pack_partial (mode, target,
				       worklist[0], worklist[1]));

[A filled-in trace of this scheme appears after the quoted patch below.]

Thanks,
Richard

> +    }
> +
> +  gcc_assert (wider_mode == mode);
> +  emit_move_insn (target, worklist.pop ());
> +
>    return;
>  }
>
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index 07b97547cb292e6d4dc1040173a5d5525fb268d5..60bad5c72bc667be19f8224af87ec5451b4b1a9a 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -140,9 +140,12 @@ (define_mode_iterator VQ_I [V16QI V8HI V4SI V2DI])
>  ;; VQ without 2 element modes.
>  (define_mode_iterator VQ_NO2E [V16QI V8HI V4SI V8HF V4SF V8BF])
>
> +;; SVE modes without 2 and 4 element modes.
> +(define_mode_iterator SVE_NO4E [VNx16QI VNx8QI VNx8HI VNx8HF VNx8BF])
> +
>  ;; SVE modes without 2 element modes.
> -(define_mode_iterator SVE_NO2E [VNx16QI VNx8QI VNx4QI VNx8HI VNx4HI VNx8HF
> -				VNx4HF VNx8BF VNx4BF VNx4SI VNx4SF])
> +(define_mode_iterator SVE_NO2E [SVE_NO4E VNx4QI VNx4HI VNx4HF VNx4BF VNx4SI
> +				VNx4SF])
>
>  ;; 2 element quad vector modes.
>  (define_mode_iterator VQ_2E [V2DI V2DF])
> @@ -1764,6 +1767,11 @@ (define_mode_attr Vhalf [(V8QI "v4qi") (V16QI "v8qi")
>  			 (VNx8BF "vnx4bf") (VNx4BF "vnx2bf")
>  			 (VNx4SI "vnx2si") (VNx4SF "vnx2sf")])
>
> +;; Quarter modes of all vector modes, in lower-case.
> +(define_mode_attr Vquad [(VNx16QI "vnx4qi") (VNx8QI "vnx2qi")
> +			 (VNx8HI "vnx2hi") (VNx8HF "vnx2hf")
> +			 (VNx8BF "vnx2bf")])
> +
>  ;; Single-element half modes of quad vector modes.
>  (define_mode_attr V1HALF [(V2DI "V1DI") (V2DF "V1DF")])
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-3.c b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-3.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..fb50da1f714ce929da0e35b7d9fead2d93476cb9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-3.c
> @@ -0,0 +1,27 @@
> +/* { dg-do compile } */
> +/* { dg-options "-std=c99" } */
> +/* { dg-additional-options "-O3 -march=armv8-a" } */
> +
> +#pragma GCC target ("+sve")
> +extern char __attribute__ ((simd, const)) fn3 (int, short);
> +void test_fn3 (float *a, float *b, double *c, int n)
> +{
> +  for (int i = 0; i < n; ++i)
> +    a[i] = fn3 (b[i], c[i]);
> +}
> +
> +/* { dg-final { scan-assembler {\s+_ZGVsMxvv_fn3\n} } } */
> +
> +extern char __attribute__ ((simd, const)) fn4 (int, char);
> +void test_fn4 (float *a, float *b, double *c, int n)
> +{
> +  for (int i = 0; i < n; ++i)
> +    a[i] = fn4 (b[i], c[i]);
> +}
> +
> +/* { dg-final { scan-assembler {\s+_ZGVsMxvv_fn4\n} } } */
> +
> +/* { dg-final { scan-assembler-times {\s+uzp1\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} 6 } } */
> +/* { dg-final { scan-assembler-times {\s+uzp1\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 16 } } */
> +/* { dg-final { scan-assembler-times {\s+uzp1\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 24 } } */
> +
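[The filled-in trace promised above, for illustration only: a standalone
simulation of the suggested in-place pairwise scheme, again over mode
names rather than real rtx values.  The widen() helper is a hypothetical
stand-in for the related_vector_mode doubling; the loop structure follows
the sketch in the review.

  #include <cstdio>
  #include <cstdlib>
  #include <string>
  #include <vector>

  /* Stand-in for doubling GET_MODE_NUNITS: "VNx4QI" -> "VNx8QI".  */
  static std::string widen (const std::string &mode)
  {
    int units = std::atoi (mode.c_str () + 3);	/* skip "VNx" */
    return "VNx" + std::to_string (units * 2) + "QI";
  }

  int main ()
  {
    int nelts = 8;	/* the vec_initvnx16qivnx2qi case */
    std::vector<std::string> worklist (nelts, "VNx2QI");
    while (nelts > 2)
      {
	/* Read worklist[i] and worklist[i + 1]; write the packed
	   result back to worklist[i / 2].  Both inputs of every pack
	   share a mode by construction.  */
	for (int i = 0; i < nelts; i += 2)
	  {
	    std::printf ("pack %s with %s\n", worklist[i].c_str (),
			 worklist[i + 1].c_str ());
	    worklist[i / 2] = widen (worklist[i]);
	  }
	nelts /= 2;
      }
    std::printf ("final: pack %s with %s into the target mode\n",
		 worklist[0].c_str (), worklist[1].c_str ());
  }

Unlike the LIFO worklist, every round here combines equal modes: the
8 VNx2QIs become 4 VNx4QIs, then 2 VNx8QIs, and the final pack produces
the VNx16QI target.]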