Sorry for the slow review. Dmitrij Pochepko <dmitrij.poche...@bell-sw.com> writes: > @@ -20074,6 +20076,83 @@ aarch64_evpc_trn (struct expand_vec_perm_d *d) > return true; > } > > +/* Try to re-encode the PERM constant so it use the bigger size up. > + This rewrites constants such as {0, 1, 4, 5}/V4SF to {0, 2}/V2DI. > + We retry with this new constant with the full suite of patterns. */ > +static bool > +aarch64_evpc_reencode (struct expand_vec_perm_d *d) > +{ > + expand_vec_perm_d newd; > + unsigned HOST_WIDE_INT nelt; > + > + if (d->vec_flags != VEC_ADVSIMD) > + return false; > + > + unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts (); > + for (unsigned int i = 0; i < encoded_nelts; ++i) > + if (!d->perm[i].is_constant ()) > + return false; > + > + /* to_constant is safe since this routine is specific to Advanced SIMD > + vectors. */ > + nelt = d->perm.length ().to_constant (); > + > + /* Get the new mode. Always twice the size of the inner > + and half the elements. */ > + machine_mode new_mode; > + switch (d->vmode) > + { > + /* 128bit vectors. */ > + case E_V4SFmode: > + case E_V4SImode: > + new_mode = V2DImode; > + break; > + case E_V8BFmode: > + case E_V8HFmode: > + case E_V8HImode: > + new_mode = V4SImode; > + break; > + case E_V16QImode: > + new_mode = V8HImode; > + break; > + /* 64bit vectors. */ > + case E_V4BFmode: > + case E_V4HFmode: > + case E_V4HImode: > + new_mode = V2SImode; > + break; > + case E_V8QImode: > + new_mode = V4HImode; > + break; > + default: > + return false; > + } > + > + newd.vmode = new_mode; > + newd.vec_flags = VEC_ADVSIMD; > + newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL; > + newd.op0 = d->op0 ? gen_lowpart (new_mode, d->op0) : NULL; > + newd.op1 = d->op1 ? 
gen_lowpart (new_mode, d->op1) : NULL; > + newd.testing_p = d->testing_p; > + newd.one_vector_p = d->one_vector_p; > + vec_perm_builder newpermconst; > + newpermconst.new_vector (nelt / 2, nelt / 2, 1); > + > + /* Convert the perm constant if we can. Require even, odd as the pairs. > */ > + for (unsigned int i = 0; i < nelt; i += 2) > + { > + unsigned int elt0 = d->perm[i].to_constant (); > + unsigned int elt1 = d->perm[i+1].to_constant (); > + if ((elt0 & 1) != 0 || elt0 + 1 != elt1) > + return false; > + newpermconst.quick_push (elt0 / 2); > + } > + newpermconst.finalize ();
I think it would be simpler to do it in this order:

- check for Advanced SIMD, bail out if not
- get the new mode, bail out if none
- calculate the permutation vector, bail out if not suitable
- set up the rest of “newd”

There would then only be one walk over d->perm rather than two,
and we'd only create the gen_lowparts when there's something to test.

The new mode can be calculated with something like:

  poly_uint64 vec_bits = GET_MODE_BITSIZE (d->vmode);
  unsigned int new_elt_bits = GET_MODE_UNIT_BITSIZE (d->vmode) * 2;
  auto new_elt_mode = int_mode_for_size (new_elt_bits, false).require ();
  machine_mode new_mode = aarch64_simd_container_mode (new_elt_mode, vec_bits);

“new_mode” will be “word_mode” on failure.

> diff --git a/gcc/testsuite/gcc.target/aarch64/vdup_n_3.c
> b/gcc/testsuite/gcc.target/aarch64/vdup_n_3.c
> new file mode 100644
> index 0000000..289604d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vdup_n_3.c
> @@ -0,0 +1,16 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +#define vector __attribute__((vector_size(4*sizeof(float))))
> +
> +/* These are both dups. */
> +vector float f(vector float a, vector float b)
> +{
> +  return __builtin_shuffle (a, a, (vector int){0, 1, 0, 1});
> +}
> +vector float f1(vector float a, vector float b)
> +{
> +  return __builtin_shuffle (a, a, (vector int){2, 3, 2, 3});
> +}
> +
> +/* { dg-final { scan-assembler-times "\[ \t\]*dup\[ \t\]+v\[0-9\]+\.2d" 2 }
> } */

The regexp would be easier to read if quoted using {…}, which requires
fewer backslashes.  Same for the other tests.

Thanks,
Richard