Tamar Christina <tamar.christ...@arm.com> writes: >> > ;; 2 element quad vector modes. >> > (define_mode_iterator VQ_2E [V2DI V2DF]) >> > >> > @@ -1678,7 +1686,15 @@ (define_mode_attr VHALF [(V8QI "V4QI") (V16QI >> "V8QI") >> > (V2DI "DI") (V2SF "SF") >> > (V4SF "V2SF") (V4HF "V2HF") >> > (V8HF "V4HF") (V2DF "DF") >> > - (V8BF "V4BF")]) >> > + (V8BF "V4BF") >> > + (VNx16QI "VNx8QI") (VNx8QI "VNx4QI") >> > + (VNx4QI "VNx2QI") (VNx2QI "QI") >> > + (VNx8HI "VNx4HI") (VNx4HI "VNx2HI") (VNx2HI "HI") >> > + (VNx8HF "VNx4HF") (VNx4HF "VNx2HF") (VNx2HF "HF") >> > + (VNx8BF "VNx4BF") (VNx4BF "VNx2BF") (VNx2BF "BF") >> > + (VNx4SI "VNx2SI") (VNx2SI "SI") >> > + (VNx4SF "VNx2SF") (VNx2SF "SF") >> > + (VNx2DI "DI") (VNx2DF "DF")]) >> >> Are the x2 entries necessary, given that the new uses are restricted >> to NO2E? >> > > No, but I wanted to keep the symmetry with the Adv. SIMD modes. Since the > mode attributes don't really control the number of alternatives I thought it > would > be better to have the attributes be "fully" defined rather than only the > subset I use.
But these are variable-length modes, so DI is only half of VNx2DI for the minimum vector length. It's less than half for Neoverse V1 or A64FX. IMO it'd be better to leave them out for now and define them when needed, at which point the right choice would be more obvious. Thanks, Richard > > gcc/ChangeLog: > > PR target/96342 > * config/aarch64/aarch64-sve.md (vec_init<mode><Vhalf>): New. > (@aarch64_pack_partial<mode>): New. > * config/aarch64/aarch64.cc (aarch64_sve_expand_vector_init_subvector): > New. > * config/aarch64/iterators.md (SVE_NO2E): New. > (VHALF, Vhalf): Add SVE partial vectors. > > gcc/testsuite/ChangeLog: > > PR target/96342 > * gcc.target/aarch64/vect-simd-clone-2.c: New test. > > Bootstrapped Regtested on aarch64-none-linux-gnu, > arm-none-linux-gnueabihf, x86_64-pc-linux-gnu > -m32, -m64 and no issues. > > Ok for master? > > Thanks, > Tamar > > -- inline copy of patch -- > > diff --git a/gcc/config/aarch64/aarch64-sve.md > b/gcc/config/aarch64/aarch64-sve.md > index > a72ca2a500d394598268c6adfe717eed94a304b3..8ed4221dbe5c49db97b37f186365fa391900eadb > 100644 > --- a/gcc/config/aarch64/aarch64-sve.md > +++ b/gcc/config/aarch64/aarch64-sve.md > @@ -2839,6 +2839,16 @@ (define_expand "vec_init<mode><Vel>" > } > ) > > +(define_expand "vec_init<mode><Vhalf>" > + [(match_operand:SVE_NO2E 0 "register_operand") > + (match_operand 1 "")] > + "TARGET_SVE" > + { > + aarch64_sve_expand_vector_init (operands[0], operands[1]); > + DONE; > + } > +) > + > ;; Shift an SVE vector left and insert a scalar into element 0.a > (define_insn "vec_shl_insert_<mode>" > [(set (match_operand:SVE_FULL 0 "register_operand") > @@ -9289,6 +9299,19 @@ (define_insn "vec_pack_trunc_<Vwide>" > "uzp1\t%0.<Vetype>, %1.<Vetype>, %2.<Vetype>" > ) > > +;; Integer partial pack packing two partial SVE types into a single full SVE > +;; type of the same element type. Use UZP1 on the wider type, which discards > +;; the high part of each wide element.
This allows to concat SVE partial > types > +;; into a wider vector. > +(define_insn "@aarch64_pack_partial<mode>" > + [(set (match_operand:SVE_NO2E 0 "register_operand" "=w") > + (vec_concat:SVE_NO2E > + (match_operand:<VHALF> 1 "register_operand" "w") > + (match_operand:<VHALF> 2 "register_operand" "w")))] > + "TARGET_SVE" > + "uzp1\t%0.<Vctype>, %1.<Vctype>, %2.<Vctype>" > +) > + > ;; ------------------------------------------------------------------------- > ;; ---- [INT<-INT] Unpacks > ;; ------------------------------------------------------------------------- > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > index > de4c0a0783912b54ac35d7c818c24574b27a4ca0..40214e318f3c4e30e619d96073b253887c973efc > 100644 > --- a/gcc/config/aarch64/aarch64.cc > +++ b/gcc/config/aarch64/aarch64.cc > @@ -24859,6 +24859,17 @@ aarch64_sve_expand_vector_init (rtx target, rtx vals) > v.quick_push (XVECEXP (vals, 0, i)); > v.finalize (); > > + /* If we have two elements and are concatting vector. */ > + machine_mode elem_mode = GET_MODE (v.elt (0)); > + if (nelts == 2 && VECTOR_MODE_P (elem_mode)) > + { > + /* We've failed expansion using a dup. Try using a cheeky truncate. */ > + rtx arg0 = force_reg (elem_mode, v.elt(0)); > + rtx arg1 = force_reg (elem_mode, v.elt(1)); > + emit_insn (gen_aarch64_pack_partial (mode, target, arg0, arg1)); > + return; > + } > + > /* If neither sub-vectors of v could be initialized specially, > then use INSR to insert all elements from v into TARGET. > ??? This might not be optimal for vectors with large > @@ -24870,6 +24881,30 @@ aarch64_sve_expand_vector_init (rtx target, rtx vals) > aarch64_sve_expand_vector_init_insert_elems (target, v, nelts); > } > > +/* Initialize register TARGET from the two vector subelements in PARALLEL > + rtx VALS. 
*/ > + > +void > +aarch64_sve_expand_vector_init_subvector (rtx target, rtx vals) > +{ > + machine_mode mode = GET_MODE (target); > + int nelts = XVECLEN (vals, 0); > + > + gcc_assert (nelts == 2); > + > + rtx arg0 = XVECEXP (vals, 0, 0); > + rtx arg1 = XVECEXP (vals, 0, 1); > + > + /* If we have two elements and are concatting vector. */ > + machine_mode elem_mode = GET_MODE (arg0); > + gcc_assert (VECTOR_MODE_P (elem_mode)); > + > + arg0 = force_reg (elem_mode, arg0); > + arg1 = force_reg (elem_mode, arg1); > + emit_insn (gen_aarch64_pack_partial (mode, target, arg0, arg1)); > + return; > +} > + > /* Check whether VALUE is a vector constant in which every element > is either a power of 2 or a negated power of 2. If so, return > a constant vector of log2s, and flip CODE between PLUS and MINUS > diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md > index > 89c72b24aeb791adbbd3edfdb131478d52b248e6..09c2d24c4b8f1f39c27ea691f7cfe0b51bc4f788 > 100644 > --- a/gcc/config/aarch64/iterators.md > +++ b/gcc/config/aarch64/iterators.md > @@ -140,6 +140,10 @@ (define_mode_iterator VQ_I [V16QI V8HI V4SI V2DI]) > ;; VQ without 2 element modes. > (define_mode_iterator VQ_NO2E [V16QI V8HI V4SI V8HF V4SF V8BF]) > > +;; SVE modes without 2 element modes. > +(define_mode_iterator SVE_NO2E [VNx16QI VNx8QI VNx4QI VNx8HI VNx4HI VNx8HF > + VNx4HF VNx8BF VNx4BF VNx4SI VNx4SF]) > + > ;; 2 element quad vector modes. 
> (define_mode_iterator VQ_2E [V2DI V2DF]) > > @@ -1737,7 +1741,15 @@ (define_mode_attr VHALF [(V8QI "V4QI") (V16QI "V8QI") > (V2DI "DI") (V2SF "SF") > (V4SF "V2SF") (V4HF "V2HF") > (V8HF "V4HF") (V2DF "DF") > - (V8BF "V4BF")]) > + (V8BF "V4BF") > + (VNx16QI "VNx8QI") (VNx8QI "VNx4QI") > + (VNx4QI "VNx2QI") (VNx2QI "QI") > + (VNx8HI "VNx4HI") (VNx4HI "VNx2HI") (VNx2HI "HI") > + (VNx8HF "VNx4HF") (VNx4HF "VNx2HF") (VNx2HF "HF") > + (VNx8BF "VNx4BF") (VNx4BF "VNx2BF") (VNx2BF "BF") > + (VNx4SI "VNx2SI") (VNx2SI "SI") > + (VNx4SF "VNx2SF") (VNx2SF "SF") > + (VNx2DI "DI") (VNx2DF "DF")]) > > ;; Half modes of all vector modes, in lower-case. > (define_mode_attr Vhalf [(V8QI "v4qi") (V16QI "v8qi") > @@ -1745,7 +1757,15 @@ (define_mode_attr Vhalf [(V8QI "v4qi") (V16QI "v8qi") > (V8HF "v4hf") (V8BF "v4bf") > (V2SI "si") (V4SI "v2si") > (V2DI "di") (V2SF "sf") > - (V4SF "v2sf") (V2DF "df")]) > + (V4SF "v2sf") (V2DF "df") > + (VNx16QI "vnx8qi") (VNx8QI "vnx4qi") > + (VNx4QI "vnx2qi") (VNx2QI "qi") > + (VNx8HI "vnx4hi") (VNx4HI "vnx2hi") (VNx2HI "hi") > + (VNx8HF "vnx4hf") (VNx4HF "vnx2hf") (VNx2HF "hf") > + (VNx8BF "vnx4bf") (VNx4BF "vnx2bf") (VNx2BF "bf") > + (VNx4SI "vnx2si") (VNx2SI "si") > + (VNx4SF "vnx2sf") (VNx2SF "sf") > + (VNx2DI "di") (VNx2DF "df")]) > > ;; Single-element half modes of quad vector modes. 
> (define_mode_attr V1HALF [(V2DI "V1DI") (V2DF "V1DF")]) > diff --git a/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-2.c > b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-2.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..a25cae2708dd18cc91a7732f845419bbdb06c5c1 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-2.c > @@ -0,0 +1,13 @@ > +/* { dg-do compile } */ > +/* { dg-options "-std=c99" } */ > +/* { dg-additional-options "-O3 -march=armv8-a" } */ > + > +#pragma GCC target ("+sve") > +extern char __attribute__ ((simd, const)) fn3 (int, char); > +void test_fn3 (int *a, int *b, char *c, int n) > +{ > + for (int i = 0; i < n; ++i) > + a[i] = (int) (fn3 (b[i], c[i]) + c[i]); > +} > + > +/* { dg-final { scan-assembler {\s+_ZGVsMxvv_fn3\n} } } */