> -----Original Message-----
> From: Richard Sandiford <richard.sandif...@arm.com>
> Sent: Thursday, January 2, 2025 5:19 PM
> To: Tamar Christina <tamar.christ...@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <n...@arm.com>; Richard Earnshaw
> <richard.earns...@arm.com>; ktkac...@gcc.gnu.org
> Subject: Re: [PATCH]AArch64: Implement four and eight chunk VLA concats
> [PR118272]
>
> Tamar Christina <tamar.christ...@arm.com> writes:
> > Hi All,
> >
> > The following testcase
> >
> > #pragma GCC target ("+sve")
> > extern char __attribute__ ((simd, const)) fn3 (int, short);
> > void test_fn3 (float *a, float *b, double *c, int n)
> > {
> >   for (int i = 0; i < n; ++i)
> >     a[i] = fn3 (b[i], c[i]);
> > }
> >
> > ICEs at -Ofast because my previous patch only added support for
> > combining 2 partial SVE vectors into a bigger vector.  However, there
> > can also be 4- and 8-piece subvectors.
> >
> > This patch fixes this by implementing the missing expansions.
> >
> > Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > 	PR target/96342
> > 	PR middle-end/118272
> > 	* config/aarch64/aarch64-sve.md (vec_init<mode><Vquad>,
> > 	vec_initvnx16qivnx2qi): New.
> > 	* config/aarch64/aarch64.cc
> > 	(aarch64_sve_expand_vector_init_subvector): Rewrite to support
> > 	arbitrary combinations.
> > 	* config/aarch64/iterators.md (SVE_NO2E): Update to use SVE_NO4E.
> > 	(SVE_NO4E, Vquad): New.
> >
> > gcc/testsuite/ChangeLog:
> >
> > 	PR target/96342
> > 	PR middle-end/118272
> > 	* gcc.target/aarch64/vect-simd-clone-3.c: New test.
> >
> > ---
> >
> > diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
> > index 6659bb4fcab34699f22ff883825de1cd67108203..35f55bfacfc3238a8a7aa69015f36ba32981af59 100644
> > --- a/gcc/config/aarch64/aarch64-sve.md
> > +++ b/gcc/config/aarch64/aarch64-sve.md
> > @@ -2839,6 +2839,7 @@ (define_expand "vec_init<mode><Vel>"
> >    }
> >  )
> >
> > +;; Vector constructor combining two half vectors { a, b }
> >  (define_expand "vec_init<mode><Vhalf>"
> >    [(match_operand:SVE_NO2E 0 "register_operand")
> >     (match_operand 1 "")]
> > @@ -2849,6 +2850,28 @@ (define_expand "vec_init<mode><Vhalf>"
> >    }
> >  )
> >
> > +;; Vector constructor combining four quad vectors { a, b, c, d }
> > +(define_expand "vec_init<mode><Vquad>"
> > +  [(match_operand:SVE_NO4E 0 "register_operand")
> > +   (match_operand 1 "")]
> > +  "TARGET_SVE"
> > +  {
> > +    aarch64_sve_expand_vector_init_subvector (operands[0], operands[1]);
> > +    DONE;
> > +  }
> > +)
> > +
> > +;; Vector constructor combining eight vectors { a, b, c, d, ... }
> > +(define_expand "vec_initvnx16qivnx2qi"
> > +  [(match_operand:VNx16QI 0 "register_operand")
> > +   (match_operand 1 "")]
> > +  "TARGET_SVE"
> > +  {
> > +    aarch64_sve_expand_vector_init_subvector (operands[0], operands[1]);
> > +    DONE;
> > +  }
> > +)
> > +
> >  ;; Shift an SVE vector left and insert a scalar into element 0.
> >  (define_insn "vec_shl_insert_<mode>"
> >    [(set (match_operand:SVE_FULL 0 "register_operand")
> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> > index cb9b155826d12b622ae0df1736e4b042d01cf56a..e062cc00d1a548290377382c98ea8f3bb9310513 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -24898,18 +24898,51 @@ aarch64_sve_expand_vector_init_subvector (rtx target, rtx vals)
> >    machine_mode mode = GET_MODE (target);
> >    int nelts = XVECLEN (vals, 0);
> >
> > -  gcc_assert (nelts == 2);
> > +  gcc_assert (nelts % 2 == 0);
> >
> > -  rtx arg0 = XVECEXP (vals, 0, 0);
> > -  rtx arg1 = XVECEXP (vals, 0, 1);
> > -
> > -  /* If we have two elements and are concatting vector.  */
> > -  machine_mode elem_mode = GET_MODE (arg0);
> > +  /* We have to be concatting vectors.  */
> > +  machine_mode elem_mode = GET_MODE (XVECEXP (vals, 0, 0));
> >    gcc_assert (VECTOR_MODE_P (elem_mode));
> >
> > -  arg0 = force_reg (elem_mode, arg0);
> > -  arg1 = force_reg (elem_mode, arg1);
> > -  emit_insn (gen_aarch64_pack_partial (mode, target, arg0, arg1));
> > +  auto_vec<rtx> worklist;
> > +  machine_mode wider_mode
> > +    = related_vector_mode (elem_mode, GET_MODE_INNER (elem_mode),
> > +			   GET_MODE_NUNITS (elem_mode) * 2).require ();
> > +
> > +  /* First create the wider first-level packs, which also allows us to
> > +     force the values to registers and put the elements in a more
> > +     convenient data structure.  */
> > +
> > +  for (int i = 0; i < nelts; i += 2)
> > +    {
> > +      rtx arg0 = XVECEXP (vals, 0, i);
> > +      rtx arg1 = XVECEXP (vals, 0, i + 1);
> > +      arg0 = force_reg (elem_mode, arg0);
> > +      arg1 = force_reg (elem_mode, arg1);
> > +      rtx tmp = gen_reg_rtx (wider_mode);
> > +      emit_insn (gen_aarch64_pack_partial (wider_mode, tmp, arg0, arg1));
> > +      worklist.safe_push (tmp);
> > +    }
> > +
> > +  /* Keep widening pairwise to have maximum throughput.  */
> > +  while (worklist.length () > 1)
> > +    {
> > +      rtx arg0 = worklist.pop ();
> > +      rtx arg1 = worklist.pop ();
> > +      gcc_assert (GET_MODE (arg0) == GET_MODE (arg1));
> > +
> > +      wider_mode
> > +	= related_vector_mode (wider_mode, GET_MODE_INNER (wider_mode),
> > +			       GET_MODE_NUNITS (wider_mode) * 2).require ();
> > +
> > +      rtx tmp = gen_reg_rtx (wider_mode);
> > +      emit_insn (gen_aarch64_pack_partial (wider_mode, tmp, arg0, arg1));
> > +      worklist.safe_push (tmp);
>
> It looks like this might not work for VNx16QI->VNx2QI, since we'd start
> with 4 VNx8QIs on the worklist, and after the first iteration would have:
>
>   VNx8QI
>   VNx8QI
>   VNx4QI
>
> We'd then try to pack the second VNx8QI and the VNx4QI together.
Arg, I intended to use quick_insert (0, ... here.  But...

>
> How about instead doing something like:
>
>   worklist.reserve (nelts);
>   for (int i = 0; i < nelts; ++i)
>     worklist.quick_push (force_reg (elem_mode, XVECEXP (vals, 0, i)));
>
>   while (nelts > 2)
>     {
>       for (int i = 0; i < nelts; i += 2)
>         {
>           ...read worklist[i] and worklist[i + 1]...
>           ...write back to worklist[i / 2]...
>         }
>       nelts /= 2;
>     }
>
>   emit_insn (gen_aarch64_pack_partial (mode, target, worklist[0],
>                                        worklist[1]));
>

I think this is better since we avoid the memmove.  Apologies for the
bug ☹

I'll respin.

Thanks,
Tamar

> Thanks,
> Richard
>
> > +    }
> > +
> > +  gcc_assert (wider_mode == mode);
> > +  emit_move_insn (target, worklist.pop ());
> > +
> >    return;
> >  }
> >
> > diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> > index 07b97547cb292e6d4dc1040173a5d5525fb268d5..60bad5c72bc667be19f8224af87ec5451b4b1a9a 100644
> > --- a/gcc/config/aarch64/iterators.md
> > +++ b/gcc/config/aarch64/iterators.md
> > @@ -140,9 +140,12 @@ (define_mode_iterator VQ_I [V16QI V8HI V4SI V2DI])
> >  ;; VQ without 2 element modes.
> >  (define_mode_iterator VQ_NO2E [V16QI V8HI V4SI V8HF V4SF V8BF])
> >
> > +;; SVE modes without 2 and 4 element modes.
> > +(define_mode_iterator SVE_NO4E [VNx16QI VNx8QI VNx8HI VNx8HF VNx8BF])
> > +
> >  ;; SVE modes without 2 element modes.
> > -(define_mode_iterator SVE_NO2E [VNx16QI VNx8QI VNx4QI VNx8HI VNx4HI VNx8HF
> > -				VNx4HF VNx8BF VNx4BF VNx4SI VNx4SF])
> > +(define_mode_iterator SVE_NO2E [SVE_NO4E VNx4QI VNx4HI VNx4HF VNx4BF VNx4SI
> > +				VNx4SF])
> >
> >  ;; 2 element quad vector modes.
> >  (define_mode_iterator VQ_2E [V2DI V2DF])
> > @@ -1764,6 +1767,11 @@ (define_mode_attr Vhalf [(V8QI "v4qi") (V16QI "v8qi")
> > 			 (VNx8BF "vnx4bf") (VNx4BF "vnx2bf")
> > 			 (VNx4SI "vnx2si") (VNx4SF "vnx2sf")])
> >
> > +;; Quad modes of all vector modes, in lower-case.
> > +(define_mode_attr Vquad [(VNx16QI "vnx4qi") (VNx8QI "vnx2qi")
> > +			 (VNx8HI "vnx2hi") (VNx8HF "vnx2hf")
> > +			 (VNx8BF "vnx2bf")])
> > +
> >  ;; Single-element half modes of quad vector modes.
> >  (define_mode_attr V1HALF [(V2DI "V1DI") (V2DF "V1DF")])
> >
> > diff --git a/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-3.c b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-3.c
> > new file mode 100644
> > index 0000000000000000000000000000000000000000..fb50da1f714ce929da0e35b7d9fead2d93476cb9
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-3.c
> > @@ -0,0 +1,27 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-std=c99" } */
> > +/* { dg-additional-options "-O3 -march=armv8-a" } */
> > +
> > +#pragma GCC target ("+sve")
> > +extern char __attribute__ ((simd, const)) fn3 (int, short);
> > +void test_fn3 (float *a, float *b, double *c, int n)
> > +{
> > +  for (int i = 0; i < n; ++i)
> > +    a[i] = fn3 (b[i], c[i]);
> > +}
> > +
> > +/* { dg-final { scan-assembler {\s+_ZGVsMxvv_fn3\n} } } */
> > +
> > +extern char __attribute__ ((simd, const)) fn4 (int, char);
> > +void test_fn4 (float *a, float *b, double *c, int n)
> > +{
> > +  for (int i = 0; i < n; ++i)
> > +    a[i] = fn4 (b[i], c[i]);
> > +}
> > +
> > +/* { dg-final { scan-assembler {\s+_ZGVsMxvv_fn4\n} } } */
> > +
> > +/* { dg-final { scan-assembler-times {\s+uzp1\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} 6 } } */
> > +/* { dg-final { scan-assembler-times {\s+uzp1\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 16 } } */
> > +/* { dg-final { scan-assembler-times {\s+uzp1\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 24 } } */
> > +
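
For reference, below is a standalone sketch of the in-place pairwise
reduction Richard suggests above.  It is only a model, not GCC code: the
hypothetical pack () function stands in for emitting an
aarch64_pack_partial instruction, and plain strings stand in for rtx
values and machine modes, so only the halving logic is exercised.

  #include <cstdio>
  #include <string>
  #include <vector>

  /* Hypothetical stand-in for gen_aarch64_pack_partial: combine two
     subvectors into one value of the next wider partial mode.  */
  static std::string
  pack (const std::string &a, const std::string &b)
  {
    return "uzp1 (" + a + ", " + b + ")";
  }

  int
  main ()
  {
    /* Eight pieces, as in the VNx16QI-from-VNx2QI case that exposed
       the bug.  */
    int nelts = 8;
    std::vector<std::string> worklist;
    worklist.reserve (nelts);
    for (int i = 0; i < nelts; ++i)
      worklist.push_back ("v" + std::to_string (i));

    /* Halve the worklist in place.  Each pass only pairs values
       produced in the same round, so the operand widths always match;
       the pop-based loop in the original patch could pair a
       first-round result with a second-round one.  */
    while (nelts > 2)
      {
        for (int i = 0; i < nelts; i += 2)
          worklist[i / 2] = pack (worklist[i], worklist[i + 1]);
        nelts /= 2;
      }

    /* The final pack would write the full-width target.  */
    printf ("%s\n", pack (worklist[0], worklist[1]).c_str ());
    return 0;
  }

Running it on eight inputs prints a balanced three-level uzp1 tree,
which is the shape the "maximum throughput" comment in the patch aims
for: independent packs within a round can issue in parallel.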