On Tue, 6 Dec 2022 at 07:01, Prathamesh Kulkarni
<prathamesh.kulka...@linaro.org> wrote:
>
> On Mon, 5 Dec 2022 at 16:50, Richard Sandiford
> <richard.sandif...@arm.com> wrote:
> >
> > Richard Sandiford via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> > > Prathamesh Kulkarni <prathamesh.kulka...@linaro.org> writes:
> > >> Hi,
> > >> For the following test-case:
> > >>
> > >> int16x8_t foo(int16_t x, int16_t y)
> > >> {
> > >>   return (int16x8_t) { x, y, x, y, x, y, x, y };
> > >> }
> > >>
> > >> Code gen at -O3:
> > >> foo:
> > >>         dup    v0.8h, w0
> > >>         ins    v0.h[1], w1
> > >>         ins    v0.h[3], w1
> > >>         ins    v0.h[5], w1
> > >>         ins    v0.h[7], w1
> > >>         ret
> > >>
> > >> For 16 elements, it results in 8 ins instructions which might not be
> > >> optimal perhaps.
> > >> I guess, the above code-gen would be equivalent to the following ?
> > >>         dup    v0.8h, w0
> > >>         dup    v1.8h, w1
> > >>         zip1   v0.8h, v0.8h, v1.8h
> > >>
> > >> I have attached patch to do the same, if number of elements >= 8,
> > >> which should be possibly better compared to current code-gen ?
> > >> Patch passes bootstrap+test on aarch64-linux-gnu.
> > >> Does the patch look OK ?
> > >>
> > >> Thanks,
> > >> Prathamesh
> > >>
> > >> diff --git a/gcc/config/aarch64/aarch64.cc
> > >> b/gcc/config/aarch64/aarch64.cc
> > >> index c91df6f5006..e5dea70e363 100644
> > >> --- a/gcc/config/aarch64/aarch64.cc
> > >> +++ b/gcc/config/aarch64/aarch64.cc
> > >> @@ -22028,6 +22028,39 @@ aarch64_expand_vector_init (rtx target, rtx
> > >> vals)
> > >>        return;
> > >>      }
> > >>
> > >> +  /* Check for interleaving case.
> > >> +     For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
> > >> +     Generate following code:
> > >> +     dup v0.h, x
> > >> +     dup v1.h, y
> > >> +     zip1 v0.h, v0.h, v1.h
> > >> +     for "large enough" initializer.  */
> > >> +
> > >> +  if (n_elts >= 8)
> > >> +    {
> > >> +      int i;
> > >> +      for (i = 2; i < n_elts; i++)
> > >> +        if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
> > >> +          break;
> > >> +
> > >> +      if (i == n_elts)
> > >> +        {
> > >> +          machine_mode mode = GET_MODE (target);
> > >> +          rtx dest[2];
> > >> +
> > >> +          for (int i = 0; i < 2; i++)
> > >> +            {
> > >> +              rtx x = copy_to_mode_reg (GET_MODE_INNER (mode), XVECEXP
> > >> (vals, 0, i));
> > >
> > > Formatting nit: long line.
> > >
> > >> +              dest[i] = gen_reg_rtx (mode);
> > >> +              aarch64_emit_move (dest[i], gen_vec_duplicate (mode, x));
> > >> +            }
> > >
> > > This could probably be written:
> > >
> > >     for (int i = 0; i < 2; i++)
> > >       {
> > >         rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
> > >         dest[i] = force_reg (GET_MODE_INNER (mode), x);
> >
> > Oops, I meant "mode" rather than "GET_MODE_INNER (mode)", sorry.
> Thanks, I have pushed the change in
> 769370f3e2e04823c8a621d8ffa756dd83ebf21e after running
> bootstrap+test on aarch64-linux-gnu.

Hi Richard,
I have attached a patch that extends the transform to the case where one
half is a dup and the other half is a set of constants.
For eg:

int8x16_t f(int8_t x)
{
  return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, x, 5, x, 6, x, 7, x, 8 };
}

code-gen trunk:
f:
        adrp    x1, .LC0
        ldr     q0, [x1, #:lo12:.LC0]
        ins     v0.b[0], w0
        ins     v0.b[2], w0
        ins     v0.b[4], w0
        ins     v0.b[6], w0
        ins     v0.b[8], w0
        ins     v0.b[10], w0
        ins     v0.b[12], w0
        ins     v0.b[14], w0
        ret

code-gen with patch:
f:
        dup     v0.16b, w0
        adrp    x0, .LC0
        ldr     q1, [x0, #:lo12:.LC0]
        zip1    v0.16b, v0.16b, v1.16b
        ret

Bootstrapped+tested on aarch64-linux-gnu.
Does it look OK?

Thanks,
Prathamesh
>
> Thanks,
> Prathamesh
> >
> > >       }
> > > which avoids forcing constant elements into a register before the
> > > duplication.
> > > OK with that change if it works.
> > > Thanks,
> > > Richard
> > >
> > >> +
> > >> +      rtvec v = gen_rtvec (2, dest[0], dest[1]);
> > >> +      emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
> > >> +      return;
> > >> +    }
> > >> +  }
> > >> +
> > >>   enum insn_code icode = optab_handler (vec_set_optab, mode);
> > >>   gcc_assert (icode != CODE_FOR_nothing);
> > >>
> > >> diff --git a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
> > >> b/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
> > >> new file mode 100644
> > >> index 00000000000..ee775048589
> > >> --- /dev/null
> > >> +++ b/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
> > >> @@ -0,0 +1,37 @@
> > >> +/* { dg-do compile } */
> > >> +/* { dg-options "-O3" } */
> > >> +/* { dg-final { check-function-bodies "**" "" "" } } */
> > >> +
> > >> +#include <arm_neon.h>
> > >> +
> > >> +/*
> > >> +** foo:
> > >> +**	...
> > >> +**	dup	v[0-9]+\.8h, w[0-9]+
> > >> +**	dup	v[0-9]+\.8h, w[0-9]+
> > >> +**	zip1	v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
> > >> +**	...
> > >> +**	ret
> > >> +*/
> > >> +
> > >> +int16x8_t foo(int16_t x, int y)
> > >> +{
> > >> +  int16x8_t v = (int16x8_t) {x, y, x, y, x, y, x, y};
> > >> +  return v;
> > >> +}
> > >> +
> > >> +/*
> > >> +** foo2:
> > >> +**	...
> > >> +**	dup	v[0-9]+\.8h, w[0-9]+
> > >> +**	movi	v[0-9]+\.8h, 0x1
> > >> +**	zip1	v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
> > >> +**	...
> > >> +**	ret
> > >> +*/
> > >> +
> > >> +int16x8_t foo2(int16_t x)
> > >> +{
> > >> +  int16x8_t v = (int16x8_t) {x, 1, x, 1, x, 1, x, 1};
> > >> +  return v;
> > >> +}
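For reference, here is a rough standalone sketch (not part of the patch, and
only assuming the standard arm_neon.h intrinsics vdupq_n_s8, vld1q_s8 and
vzip1q_s8) of the lowering the patch aims for. Since zip1 reads only the low
halves of its two source registers, only the first eight constants matter;
the upper half of the constant vector is a don't-care:

#include <arm_neon.h>

/* Hand-written equivalent of the expected code-gen for f above.
   vzip1q_s8 (a, b) yields { a[0], b[0], a[1], b[1], ..., a[7], b[7] }.  */
int8x16_t f_sketch (int8_t x)
{
  int8x16_t dup = vdupq_n_s8 (x);                 /* dup  v0.16b, w0 */
  static const int8_t cst[16] = { 1, 2, 3, 4, 5, 6, 7, 8,
                                  0, 0, 0, 0, 0, 0, 0, 0 };  /* upper half unused */
  int8x16_t consts = vld1q_s8 (cst);              /* ldr  q1, [...] */
  return vzip1q_s8 (dup, consts);                 /* zip1 v0.16b, v0.16b, v1.16b */
}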
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 9a79a9e7928..411e85f52a4 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -21984,6 +21984,54 @@ aarch64_simd_make_constant (rtx vals)
     return NULL_RTX;
 }
 
+/* Subroutine of aarch64_expand_vector_init.
+   Check if VALS has same element at every alternate position
+   from START_POS.  */
+
+static
+bool aarch64_init_interleaving_dup_p (rtx vals, int start_pos)
+{
+  for (int i = start_pos + 2; i < XVECLEN (vals, 0); i += 2)
+    if (!rtx_equal_p (XVECEXP (vals, 0, start_pos), XVECEXP (vals, 0, i)))
+      return false;
+  return true;
+}
+
+/* Subroutine of aarch64_expand_vector_init.
+   Check if every alternate element in VALS starting from START_POS
+   is a constant.  */
+
+static
+bool aarch64_init_interleaving_const_p (rtx vals, int start_pos)
+{
+  for (int i = start_pos; i < XVECLEN (vals, 0); i += 2)
+    if (!CONSTANT_P (XVECEXP (vals, 0, i)))
+      return false;
+  return true;
+}
+
+/* Subroutine of aarch64_expand_vector_init.
+   Copy all odd-numbered or even-numbered elements from VALS
+   depending on CONST_EVEN.
+   For eg if VALS is { x, 1, x, 2, x, 3, x, 4 }
+   return {1, 2, 3, 4, 1, 1, 1, 1}.
+   We are only interested in the first half {0 ... n_elts/2} since
+   that will be used by zip1 for merging.  Fill the second half
+   with an arbitrary value since it will be discarded.  */
+
+static
+rtx aarch64_init_interleaving_shift_init (rtx vals, bool const_even)
+{
+  int n_elts = XVECLEN (vals, 0);
+  rtvec vec = rtvec_alloc (n_elts);
+  int i;
+  for (i = 0; i < n_elts / 2; i++)
+    RTVEC_ELT (vec, i) = XVECEXP (vals, 0, (const_even) ? 2 * i : 2 * i + 1);
+  for (; i < n_elts; i++)
+    RTVEC_ELT (vec, i) = RTVEC_ELT (vec, 0);
+  return gen_rtx_CONST_VECTOR (GET_MODE (vals), vec);
+}
+
 /* Expand a vector initialisation sequence, such that TARGET is
    initialised to contain VALS.  */
 
@@ -22048,22 +22096,55 @@ aarch64_expand_vector_init (rtx target, rtx vals)
       return;
     }
 
-  /* Check for interleaving case.
-     For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
-     Generate following code:
-     dup v0.h, x
-     dup v1.h, y
-     zip1 v0.h, v0.h, v1.h
-     for "large enough" initializer.  */
+  /* Check for interleaving case for "large enough" initializer.
+     Currently we handle following cases:
+     (a) Even part is dup and odd part is const.
+     (b) Odd part is dup and even part is const.
+     (c) Both even and odd parts are dup.  */
 
   if (n_elts >= 8)
     {
-      int i;
-      for (i = 2; i < n_elts; i++)
-        if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
-          break;
+      bool even_dup = false, even_const = false;
+      bool odd_dup = false, odd_const = false;
+
+      even_dup = aarch64_init_interleaving_dup_p (vals, 0);
+      if (!even_dup)
+        even_const = aarch64_init_interleaving_const_p (vals, 0);
+
+      odd_dup = aarch64_init_interleaving_dup_p (vals, 1);
+      if (!odd_dup)
+        odd_const = aarch64_init_interleaving_const_p (vals, 1);
+
+      /* This case should already be handled above when all elements are constants.  */
+      gcc_assert (!(even_const && odd_const));
 
-      if (i == n_elts)
+      if (even_dup && odd_const)
+        {
+          rtx dup_reg = expand_vector_broadcast (mode, XVECEXP (vals, 0, 0));
+          dup_reg = force_reg (mode, dup_reg);
+
+          rtx const_reg = gen_reg_rtx (mode);
+          rtx const_vector = aarch64_init_interleaving_shift_init (vals, false);
+          aarch64_expand_vector_init (const_reg, const_vector);
+
+          rtvec v = gen_rtvec (2, dup_reg, const_reg);
+          emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
+          return;
+        }
+      else if (odd_dup && even_const)
+        {
+          rtx dup_reg = expand_vector_broadcast (mode, XVECEXP (vals, 0, 1));
+          dup_reg = force_reg (mode, dup_reg);
+
+          rtx const_reg = gen_reg_rtx (mode);
+          rtx const_vector = aarch64_init_interleaving_shift_init (vals, true);
+          aarch64_expand_vector_init (const_reg, const_vector);
+
+          rtvec v = gen_rtvec (2, const_reg, dup_reg);
+          emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
+          return;
+        }
+      else if (even_dup && odd_dup)
         {
           machine_mode mode = GET_MODE (target);
           rtx dest[2];
diff --git a/gcc/testsuite/gcc.target/aarch64/interleave-init-2.c b/gcc/testsuite/gcc.target/aarch64/interleave-init-2.c
new file mode 100644
index 00000000000..3ad06c00451
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/interleave-init-2.c
@@ -0,0 +1,51 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include "arm_neon.h"
+
+/*
+**foo:
+**	...
+**	dup	v[0-9]+\.8h, w[0-9]+
+**	adrp	x[0-9]+, .LC[0-9]+
+**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+**	zip1	v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
+**	...
+*/
+
+int16x8_t foo(int16_t x)
+{
+  return (int16x8_t) { x, 1, x, 2, x, 3, x, 4 };
+}
+
+
+/*
+**foo2:
+**	...
+**	dup	v[0-9]+\.8h, w[0-9]+
+**	adrp	x[0-9]+, .LC[0-9]+
+**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+**	zip1	v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
+**	...
+*/
+
+int16x8_t foo2(int16_t x)
+{
+  return (int16x8_t) { 1, x, 2, x, 3, x, 4, x };
+}
+
+/*
+**foo3:
+**	...
+**	dup	v[0-9]+\.8h, v[0-9]+\.h\[0\]
+**	adrp	x[0-9]+, .LC[0-9]+
+**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+**	zip1	v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
+**	...
+*/
+
+float16x8_t foo3(float16_t x)
+{
+  return (float16x8_t) { x, 1.0, x, 2.0, x, 3.0, x, 4.0 };
+}
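For completeness, the mirrored case handled by the odd_dup && even_const
branch is the same idea with the zip1 operands swapped. A similar
hand-written sketch (again not part of the patch, only assuming the standard
arm_neon.h intrinsics), matching foo2 from interleave-init-2.c:

#include <arm_neon.h>

/* Constants in the even positions, the dup in the odd positions, so the
   constant vector becomes the first zip1 operand.  */
int16x8_t foo2_sketch (int16_t x)
{
  int16x8_t dup = vdupq_n_s16 (x);
  static const int16_t cst[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };  /* upper half unused */
  int16x8_t consts = vld1q_s16 (cst);
  return vzip1q_s16 (consts, dup);   /* { 1, x, 2, x, 3, x, 4, x } */
}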