> -----Original Message-----
> From: Richard Sandiford <richard.sandif...@arm.com>
> Sent: Thursday, January 2, 2025 5:19 PM
> To: Tamar Christina <tamar.christ...@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <n...@arm.com>; Richard Earnshaw
> <richard.earns...@arm.com>; ktkac...@gcc.gnu.org
> Subject: Re: [PATCH]AArch64: Implement four and eight chunk VLA concats
> [PR118272]
> 
> Tamar Christina <tamar.christ...@arm.com> writes:
> > Hi All,
> >
> > The following testcase
> >
> >   #pragma GCC target ("+sve")
> >   extern char __attribute__ ((simd, const)) fn3 (int, short);
> >   void test_fn3 (float *a, float *b, double *c, int n)
> >   {
> >     for (int i = 0; i < n; ++i)
> >       a[i] = fn3 (b[i], c[i]);
> >   }
> >
> > at -Ofast ICEs because my previous patch only added support for combining 2
> > partial SVE vectors into a bigger vector.  However, there can also be 4- and
> > 8-piece subvectors (for example, the vec_initvnx16qivnx2qi expander below has
> > to concatenate eight VNx2QI chunks into a VNx16QI).
> >
> > This patch fixes this by implementing the missing expansions.
> >
> > Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> >     PR target/96342
> >     PR middle-end/118272
> >     * config/aarch64/aarch64-sve.md (vec_init<mode><Vquad>,
> >     vec_initvnx16qivnx2qi): New.
> >     * config/aarch64/aarch64.cc (aarch64_sve_expand_vector_init_subvector):
> >     Rewrite to support arbitrary combinations.
> >     * config/aarch64/iterators.md (SVE_NO2E): Update to use SVE_NO4E.
> >     (SVE_NO4E, Vquad): New.
> >
> > gcc/testsuite/ChangeLog:
> >
> >     PR target/96342
> >     PR middle-end/118272
> >     * gcc.target/aarch64/vect-simd-clone-3.c: New test.
> >
> > ---
> >
> > diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
> > index 6659bb4fcab34699f22ff883825de1cd67108203..35f55bfacfc3238a8a7aa69015f36ba32981af59 100644
> > --- a/gcc/config/aarch64/aarch64-sve.md
> > +++ b/gcc/config/aarch64/aarch64-sve.md
> > @@ -2839,6 +2839,7 @@ (define_expand "vec_init<mode><Vel>"
> >    }
> >  )
> >
> > +;; Vector constructor combining two half vectors { a, b }
> >  (define_expand "vec_init<mode><Vhalf>"
> >    [(match_operand:SVE_NO2E 0 "register_operand")
> >     (match_operand 1 "")]
> > @@ -2849,6 +2850,28 @@ (define_expand "vec_init<mode><Vhalf>"
> >    }
> >  )
> >
> > +;; Vector constructor combining four quad vectors { a, b, c, d }
> > +(define_expand "vec_init<mode><Vquad>"
> > +  [(match_operand:SVE_NO4E 0 "register_operand")
> > +   (match_operand 1 "")]
> > +  "TARGET_SVE"
> > +  {
> > +    aarch64_sve_expand_vector_init_subvector (operands[0], operands[1]);
> > +    DONE;
> > +  }
> > +)
> > +
> > +;; Vector constructor combining eight vectors { a, b, c, d, ... }
> > +(define_expand "vec_initvnx16qivnx2qi"
> > +  [(match_operand:VNx16QI 0 "register_operand")
> > +   (match_operand 1 "")]
> > +  "TARGET_SVE"
> > +  {
> > +    aarch64_sve_expand_vector_init_subvector (operands[0], operands[1]);
> > +    DONE;
> > +  }
> > +)
> > +
> >  ;; Shift an SVE vector left and insert a scalar into element 0.
> >  (define_insn "vec_shl_insert_<mode>"
> >    [(set (match_operand:SVE_FULL 0 "register_operand")
> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> > index cb9b155826d12b622ae0df1736e4b042d01cf56a..e062cc00d1a548290377382c98ea8f3bb9310513 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -24898,18 +24898,51 @@ aarch64_sve_expand_vector_init_subvector (rtx target, rtx vals)
> >    machine_mode mode = GET_MODE (target);
> >    int nelts = XVECLEN (vals, 0);
> >
> > -  gcc_assert (nelts == 2);
> > +  gcc_assert (nelts % 2 == 0);
> >
> > -  rtx arg0 = XVECEXP (vals, 0, 0);
> > -  rtx arg1 = XVECEXP (vals, 0, 1);
> > -
> > -  /* If we have two elements and are concatting vector.  */
> > -  machine_mode elem_mode = GET_MODE (arg0);
> > +  /* We have to be concatenating vectors.  */
> > +  machine_mode elem_mode = GET_MODE (XVECEXP (vals, 0, 0));
> >    gcc_assert (VECTOR_MODE_P (elem_mode));
> >
> > -  arg0 = force_reg (elem_mode, arg0);
> > -  arg1 = force_reg (elem_mode, arg1);
> > -  emit_insn (gen_aarch64_pack_partial (mode, target, arg0, arg1));
> > +  auto_vec<rtx> worklist;
> > +  machine_mode wider_mode
> > +    = related_vector_mode (elem_mode, GET_MODE_INNER (elem_mode),
> > +                      GET_MODE_NUNITS (elem_mode) * 2).require ();
> > +
> > +  /* First create the wider first level pack, which also allows us to force
> > +     the values to registers and put the elements in a more convenient
> > +     data structure.  */
> > +
> > +  for (int i = 0; i < nelts; i += 2)
> > +    {
> > +      rtx arg0 = XVECEXP (vals, 0, i);
> > +      rtx arg1 = XVECEXP (vals, 0, i + 1);
> > +      arg0 = force_reg (elem_mode, arg0);
> > +      arg1 = force_reg (elem_mode, arg1);
> > +      rtx tmp = gen_reg_rtx (wider_mode);
> > +      emit_insn (gen_aarch64_pack_partial (wider_mode, tmp, arg0, arg1));
> > +      worklist.safe_push (tmp);
> > +    }
> > +
> > +  /* Keep widening pairwise to have maximum throughput.  */
> > +  while (worklist.length () > 1)
> > +    {
> > +      rtx arg0 = worklist.pop ();
> > +      rtx arg1 = worklist.pop ();
> > +      gcc_assert (GET_MODE (arg0) == GET_MODE (arg1));
> > +
> > +      wider_mode
> > +   = related_vector_mode (wider_mode, GET_MODE_INNER (wider_mode),
> > +                          GET_MODE_NUNITS (wider_mode) * 2).require ();
> > +
> > +      rtx tmp = gen_reg_rtx (wider_mode);
> > +      emit_insn (gen_aarch64_pack_partial (wider_mode, tmp, arg0, arg1));
> > +      worklist.safe_push (tmp);
> 
> It looks like this might not work for VNx16QI->VNx2QI, since we'd start with
> 4 VNx4QIs on the worklist, and after the first iteration would have:
> 
>    VNx4QI
>    VNx4QI
>    VNx8QI
> 
> We'd then try to pack the VNx8QI and the second VNx4QI together.

Argh, I intended to use quick_insert (0, ...) here.
But...

> 
> How about instead doing something like:
> 
>   worklist.reserve (nelts);
>   for (int i = 0; i < nelts; ++i)
>     worklist.quick_push (force_reg (elem_mode, XVECEXP (vals, 0, i)));
> 
>   while (nelts > 2)
>     {
>       for (int i = 0; i < nelts; i += 2)
>         {
>           ...read worklist[i] and worklist[i + 1]...
>           ...write back to worklist[i / 2]...
>         }
>       nelts /= 2;
>     }
> 
>   emit_insn (gen_aarch64_pack_partial (mode, target, worklist[0],
>                                      worklist[1]));
> 

I think this is better since we avoid the memmove.  Apologies for the bug ☹

I'll respin.
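
For the record, the respin will probably look roughly like the below
(untested sketch following your structure and reusing the helpers already
in the patch, so the details may still change):

  machine_mode mode = GET_MODE (target);
  machine_mode elem_mode = GET_MODE (XVECEXP (vals, 0, 0));
  int nelts = XVECLEN (vals, 0);
  gcc_assert (nelts % 2 == 0 && VECTOR_MODE_P (elem_mode));

  /* Force all subvectors into registers up front.  */
  auto_vec<rtx> worklist;
  worklist.reserve (nelts);
  for (int i = 0; i < nelts; ++i)
    worklist.quick_push (force_reg (elem_mode, XVECEXP (vals, 0, i)));

  /* Pack pairwise in place, halving the number of entries each round, so
     that every entry read in a round has the same mode.  */
  while (nelts > 2)
    {
      machine_mode wider_mode
        = related_vector_mode (elem_mode, GET_MODE_INNER (elem_mode),
                               GET_MODE_NUNITS (elem_mode) * 2).require ();
      for (int i = 0; i < nelts; i += 2)
        {
          rtx tmp = gen_reg_rtx (wider_mode);
          emit_insn (gen_aarch64_pack_partial (wider_mode, tmp,
                                               worklist[i], worklist[i + 1]));
          worklist[i / 2] = tmp;
        }
      nelts /= 2;
      elem_mode = wider_mode;
    }

  /* The final pair packs straight into TARGET.  */
  emit_insn (gen_aarch64_pack_partial (mode, target,
                                       worklist[0], worklist[1]));

That keeps every round working on entries of a single mode, so the mode
assert holds, and writing back in place avoids the memmove that
quick_insert (0, ...) would have caused.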

Thanks,
Tamar

> Thanks,
> Richard
> 
> 
> > +    }
> > +
> > +  gcc_assert (wider_mode == mode);
> > +  emit_move_insn (target, worklist.pop ());
> > +
> >    return;
> >  }
> >
> > diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> > index 07b97547cb292e6d4dc1040173a5d5525fb268d5..60bad5c72bc667be19f8224af87ec5451b4b1a9a 100644
> > --- a/gcc/config/aarch64/iterators.md
> > +++ b/gcc/config/aarch64/iterators.md
> > @@ -140,9 +140,12 @@ (define_mode_iterator VQ_I [V16QI V8HI V4SI V2DI])
> >  ;; VQ without 2 element modes.
> >  (define_mode_iterator VQ_NO2E [V16QI V8HI V4SI V8HF V4SF V8BF])
> >
> > +;; SVE modes without 2 and 4 element modes.
> > +(define_mode_iterator SVE_NO4E [VNx16QI VNx8QI VNx8HI VNx8HF VNx8BF])
> > +
> >  ;; SVE modes without 2 element modes.
> > -(define_mode_iterator SVE_NO2E [VNx16QI VNx8QI VNx4QI VNx8HI VNx4HI VNx8HF
> > -                           VNx4HF VNx8BF VNx4BF VNx4SI VNx4SF])
> > +(define_mode_iterator SVE_NO2E [SVE_NO4E VNx4QI VNx4HI VNx4HF VNx4BF VNx4SI
> > +                           VNx4SF])
> >
> >  ;; 2 element quad vector modes.
> >  (define_mode_iterator VQ_2E [V2DI V2DF])
> > @@ -1764,6 +1767,11 @@ (define_mode_attr Vhalf [(V8QI "v4qi")  (V16QI "v8qi")
> >                      (VNx8BF "vnx4bf")  (VNx4BF "vnx2bf")
> >                      (VNx4SI "vnx2si")  (VNx4SF "vnx2sf")])
> >
> > +;; Quad modes of all vector modes, in lower-case.
> > +(define_mode_attr Vquad [(VNx16QI "vnx4qi") (VNx8QI "vnx2qi")
> > +                    (VNx8HI "vnx2hi")  (VNx8HF "vnx2hf")
> > +                    (VNx8BF "vnx2bf")])
> > +
> >  ;; Single-element half modes of quad vector modes.
> >  (define_mode_attr V1HALF [(V2DI "V1DI")  (V2DF  "V1DF")])
> >
> > diff --git a/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-3.c b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-3.c
> > new file mode 100644
> > index 0000000000000000000000000000000000000000..fb50da1f714ce929da0e35b7d9fead2d93476cb9
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-3.c
> > @@ -0,0 +1,27 @@
> > +/* { dg-do compile }  */
> > +/* { dg-options "-std=c99" } */
> > +/* { dg-additional-options "-O3 -march=armv8-a" } */
> > +
> > +#pragma GCC target ("+sve")
> > +extern char __attribute__ ((simd, const)) fn3 (int, short);
> > +void test_fn3 (float *a, float *b, double *c, int n)
> > +{
> > +  for (int i = 0; i < n; ++i)
> > +    a[i] = fn3 (b[i], c[i]);
> > +}
> > +
> > +/* { dg-final { scan-assembler {\s+_ZGVsMxvv_fn3\n} } } */
> > +
> > +extern char __attribute__ ((simd, const)) fn4 (int, char);
> > +void test_fn4 (float *a, float *b, double *c, int n)
> > +{
> > +  for (int i = 0; i < n; ++i)
> > +    a[i] = fn4 (b[i], c[i]);
> > +}
> > +
> > +/* { dg-final { scan-assembler {\s+_ZGVsMxvv_fn4\n} } } */
> > +
> > +/* { dg-final { scan-assembler-times {\s+uzp1\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} 6 } } */
> > +/* { dg-final { scan-assembler-times {\s+uzp1\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 16 } } */
> > +/* { dg-final { scan-assembler-times {\s+uzp1\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 24 } } */
> > +
