From https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64134, testcase
#define vector __attribute__((vector_size(16)))
float a; float b;
vector float fb(void) { return (vector float){ 0,0,b,a};}
currently produces (correct, but suboptimal):
fb:
fmov s0, wzr
adrp x1, b
adrp x0, a
sub sp, sp, #16
ldr w1, [x1, #:lo12:b]
ldr w0, [x0, #:lo12:a]
stp s0, s0, [sp]
stp w1, w0, [sp, 8]
ldr q0, [sp]
add sp, sp, 16
ret
with this patch:
fb:
adrp x1, b
movi v0.4s, 0
adrp x0, a
ldr s2, [x1, #:lo12:b]
ldr s1, [x0, #:lo12:a]
ins v0.s[2], v2.s[0]
ins v0.s[3], v1.s[0]
ret
The reason is that aarch64_expand_vector_init presently loads a constant and
then overwrites with 'ins' only if exactly one element of the vector is
variable; otherwise, it dumps the entire vector out to the stack (later changed
to STP) and then loads the whole vector in. This patch changes behaviour to load
constants and then 'ins' if at most half the elements are variable rather than
only one.
AFAICT this code path is only used for initialization of GCC vector extension
vectors, and there is already a special cases for all elements being the same
(e.g. the _dup_ instrinsics). So it doesn't feel worth introducing a 'cost
model'-type approach for this one use case (such would probably have to be based
on an assumption about success of STP pattern later anyway). Instead this is a
(relatively) simple heuristic improvement.
There is a possibility of using ld1 rather than ldr+ins, which *may* generate
further improvement (probably requiring adrp+add due to limited addressing modes
of ld1, however); this patch does not tackle that.
Tested on aarch64-none-elf.
gcc/ChangeLog:
PR target/64134
config/aarch64/aarch64.c (aarch64_expand_vector_init): Load constant
and overwrite variable parts if <= 1/2 the elements are variable.
gcc/testsuite/ChangeLog:
PR target/64134
gcc.target/aarch64/vec_init_1.c: New test.
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index cba3c1a4d42c7d543e0ed96a7b41fcd9c925f245..767b986e907fbede46b117a271d179b58406afca 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -8769,22 +8769,19 @@ aarch64_expand_vector_init (rtx target, rtx vals)
machine_mode mode = GET_MODE (target);
machine_mode inner_mode = GET_MODE_INNER (mode);
int n_elts = GET_MODE_NUNITS (mode);
- int n_var = 0, one_var = -1;
+ int n_var = 0;
+ rtx any_const = NULL_RTX;
bool all_same = true;
- rtx x, mem;
- int i;
- x = XVECEXP (vals, 0, 0);
- if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
- n_var = 1, one_var = 0;
-
- for (i = 1; i < n_elts; ++i)
+ for (int i = 0; i < n_elts; ++i)
{
- x = XVECEXP (vals, 0, i);
+ rtx x = XVECEXP (vals, 0, i);
if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
- ++n_var, one_var = i;
+ ++n_var;
+ else
+ any_const = x;
- if (!rtx_equal_p (x, XVECEXP (vals, 0, 0)))
+ if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
all_same = false;
}
@@ -8801,36 +8798,60 @@ aarch64_expand_vector_init (rtx target, rtx vals)
/* Splat a single non-constant element if we can. */
if (all_same)
{
- x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
+ rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
return;
}
- /* One field is non-constant. Load constant then overwrite varying
- field. This is more efficient than using the stack. */
- if (n_var == 1)
+ /* Half the fields (or less) are non-constant. Load constant then overwrite
+ varying fields. Hope that this is more efficient than using the stack. */
+ if (n_var <= n_elts/2)
{
rtx copy = copy_rtx (vals);
- rtx index = GEN_INT (one_var);
- enum insn_code icode;
- /* Load constant part of vector, substitute neighboring value for
- varying element. */
- XVECEXP (copy, 0, one_var) = XVECEXP (vals, 0, one_var ^ 1);
+ /* Load constant part of vector. We really don't care what goes into the
+ parts we will overwrite, but we're more likely to be able to load the
+ constant efficiently if it has fewer, larger, repeating parts
+ (see aarch64_simd_valid_immediate). */
+ for (int i = 0; i < n_elts; i++)
+ {
+ rtx x = XVECEXP (vals, 0, i);
+ if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
+ continue;
+ rtx subst = any_const;
+ for (int bit = n_elts / 2; bit > 0; bit /= 2)
+ {
+ /* Look in the copied vector, as more elements are const. */
+ rtx test = XVECEXP (copy, 0, i ^ bit);
+ if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
+ {
+ subst = test;
+ break;
+ }
+ }
+ XVECEXP (copy, 0, i) = subst;
+ }
aarch64_expand_vector_init (target, copy);
- /* Insert variable. */
- x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, one_var));
- icode = optab_handler (vec_set_optab, mode);
+ /* Insert variables. */
+ enum insn_code icode = optab_handler (vec_set_optab, mode);
gcc_assert (icode != CODE_FOR_nothing);
- emit_insn (GEN_FCN (icode) (target, x, index));
+
+ for (int i = 0; i < n_elts; i++)
+ {
+ rtx x = XVECEXP (vals, 0, i);
+ if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
+ continue;
+ x = copy_to_mode_reg (inner_mode, x);
+ emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
+ }
return;
}
/* Construct the vector in memory one field at a time
and load the whole vector. */
- mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
- for (i = 0; i < n_elts; i++)
+ rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
+ for (int i = 0; i < n_elts; i++)
emit_move_insn (adjust_address_nv (mem, inner_mode,
i * GET_MODE_SIZE (inner_mode)),
XVECEXP (vals, 0, i));
diff --git a/gcc/testsuite/gcc.target/aarch64/vec_init_1.c b/gcc/testsuite/gcc.target/aarch64/vec_init_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..64eaff2dadab9de94e5c80174b24e9b216478a54
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec_init_1.c
@@ -0,0 +1,34 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -fomit-frame-pointer --save-temps -fno-inline" } */
+
+extern void abort (void);
+
+typedef float float16x4_t __attribute__ ((vector_size ((16))));
+
+float a;
+float b;
+
+float16x4_t
+make_vector ()
+{
+ return (float16x4_t) { 0, 0, a, b };
+}
+
+int
+main (int argc, char **argv)
+{
+ a = 4.0;
+ b = 3.0;
+ float16x4_t vec = make_vector ();
+ if (vec[0] != 0 || vec[1] != 0 || vec[2] != a || vec[3] != b)
+ abort ();
+ return 0;
+}
+
+/* { dg-final { scan-assembler-times "ins\\t" 2 } } */
+/* What we want to check, is that make_vector does not stp the whole vector
+ to the stack. Unfortunately here we scan the body of main() too, which may
+ be a bit fragile - the test is currently passing only because of the option
+ -fomit-frame-pointer which avoids use of stp in the prologue to main(). */
+/* { dg-final { scan-assembler-not "stp\\t" } } */
+/* { dg-final { cleanup-saved-temps } } */