Hi Steve,
On 31/07/18 23:24, Steve Ellcey wrote:
Here is a new version of my patch to support the Aarch64 SIMD ABI [1]
in GCC. I think this is complete enough to be considered for check
in. I wrote a few new tests and put them in a new gcc.target/torture
directory so they would be run with multiple optimization options. I
also verified that there are no regressions in the GCC testsuite.
The significant difference between the standard ARM ABI and the SIMD
ABI is that in the normal ABI a callee saves only the lower 64 bits of
registers V8-V15, while in the SIMD ABI the callee must save all 128
bits of registers V8-V23.
As I mentioned in my RFC, I intend to (eventually) follow this patch
with two more, one to define the TARGET_SIMD_CLONE* macros and one to
improve the GCC register allocation/usage when calling SIMD
functions. Right now, a caller calling a SIMD function will save more
registers than it needs to because some of those registers will also be
saved by the callee.
Thanks for working on this!
A few comments on the patch inline.
Thanks,
Kyrill
Steve Ellcey
sell...@cavium.com
[1] https://developer.arm.com/products/software-development-tools/hpc/a
rm-compiler-for-hpc/vector-function-abi
Compiler ChangeLog:
2018-07-31 Steve Ellcey <sell...@cavium.com>
* config/aarch64/aarch64-protos.h (aarch64_use_simple_return_insn_p):
New prototype.
(aarch64_epilogue_uses): Ditto.
* config/aarch64/aarch64.c (aarch64_attribute_table): New array.
(aarch64_simd_decl_p): New function.
(aarch64_reg_save_mode): New function.
(aarch64_is_simd_call_p): New function.
(aarch64_function_ok_for_sibcall): Check for simd calls.
(aarch64_layout_frame): Check for simd function.
(aarch64_gen_storewb_pair): Handle E_TFmode.
(aarch64_push_regs): Use aarch64_reg_save_mode to get mode.
(aarch64_gen_loadwb_pair): Handle E_TFmode.
(aarch64_pop_regs): Use aarch64_reg_save_mode to get mode.
(aarch64_gen_store_pair): Handle E_TFmode.
(aarch64_gen_load_pair): Ditto.
(aarch64_save_callee_saves): Handle different mode sizes.
(aarch64_restore_callee_saves): Ditto.
(aarch64_components_for_bb): Check for simd function.
(aarch64_epilogue_uses): New function.
(aarch64_process_components): Ditto.
(aarch64_expand_prologue): Ditto.
(aarch64_expand_epilogue): Ditto.
(aarch64_expand_call): Ditto.
(TARGET_ATTRIBUTE_TABLE): New define.
* config/aarch64/aarch64.h (EPILOGUE_USES): Redefine.
(FP_SIMD_SAVED_REGNUM_P): New macro.
* config/aarch64/aarch64.md (V23_REGNUM): New constant.
(simple_return): New define_expand.
(load_pair_dw_tftf): New instruction.
(store_pair_dw_tftf): Ditto.
(loadwb_pair<TX:mode>_<P:mode>): Ditto.
(storewb_pair<TX:mode>_<P:mode>): Ditto.
Testsuite ChangeLog:
2018-07-31 Steve Ellcey <sell...@cavium.com>
* gcc.target/aarch64/torture/aarch64-torture.exp: New file.
* gcc.target/aarch64/torture/simd-abi-1.c: New test.
* gcc.target/aarch64/torture/simd-abi-2.c: Ditto.
* gcc.target/aarch64/torture/simd-abi-3.c: Ditto.
* gcc.target/aarch64/torture/simd-abi-4.c: Ditto.
gcc-vect-abi.patch
diff --git a/gcc/config/aarch64/aarch64-protos.h
b/gcc/config/aarch64/aarch64-protos.h
index af5db9c..99c962f 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -423,6 +423,7 @@ bool aarch64_split_dimode_const_store (rtx, rtx);
bool aarch64_symbolic_address_p (rtx);
bool aarch64_uimm12_shift (HOST_WIDE_INT);
bool aarch64_use_return_insn_p (void);
+bool aarch64_use_simple_return_insn_p (void);
const char *aarch64_mangle_builtin_type (const_tree);
const char *aarch64_output_casesi (rtx *);
@@ -507,6 +508,8 @@ void aarch64_split_simd_move (rtx, rtx);
/* Check for a legitimate floating point constant for FMOV. */
bool aarch64_float_const_representable_p (rtx);
+extern int aarch64_epilogue_uses (int);
+
#if defined (RTX_CODE)
void aarch64_gen_unlikely_cbranch (enum rtx_code, machine_mode cc_mode,
rtx label_ref);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index fa01475..9e6827a 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1027,6 +1027,15 @@ static const struct processor *selected_tune;
/* The current tuning set. */
struct tune_params aarch64_tune_params = generic_tunings;
+/* Table of machine attributes. */
+static const struct attribute_spec aarch64_attribute_table[] =
+{
+ /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
+ affects_type_identity, handler, exclude } */
+ { "aarch64_vector_pcs", 0, 0, true, false, false, false, NULL, NULL },
+ { NULL, 0, 0, false, false, false, false, NULL, NULL }
+};
+
#define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
/* An ISA extension in the co-processor and main instruction set space. */
@@ -1405,6 +1414,26 @@ aarch64_hard_regno_mode_ok (unsigned regno, machine_mode
mode)
return false;
}
+/* Return true if this is a definition of a vectorized simd function. */
+
+static bool
+aarch64_simd_decl_p (tree fndecl)
+{
+ if (lookup_attribute ("aarch64_vector_pcs", DECL_ATTRIBUTES (fndecl)) !=
NULL)
+ return true;
+ if (lookup_attribute ("simd", DECL_ATTRIBUTES (fndecl)) == NULL)
+ return false;
+ return (VECTOR_TYPE_P (TREE_TYPE (TREE_TYPE (fndecl))));
+}
+
+static
+machine_mode aarch64_reg_save_mode (tree fndecl, unsigned regno)
+{
+ return GP_REGNUM_P (regno)
+ ? E_DImode
+ : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
+}
+
Please add a function comment (and return type should go on the same line as
'static')
/* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
the lower 64 bits of a 128-bit register. Tell the compiler the callee
clobbers the top 64 bits when restoring the bottom 64 bits. */
@@ -3272,7 +3301,9 @@ static bool
aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
tree exp ATTRIBUTE_UNUSED)
{
- /* Currently, always true. */
+ if (aarch64_simd_decl_p (cfun->decl))
+ return false;
+
return true;
}
@@ -4035,6 +4066,7 @@ aarch64_layout_frame (void)
{
HOST_WIDE_INT offset = 0;
int regno, last_fp_reg = INVALID_REGNUM;
+ bool simd_function = aarch64_simd_decl_p (cfun->decl);
if (reload_completed && cfun->machine->frame.laid_out)
return;
@@ -4047,6 +4079,17 @@ aarch64_layout_frame (void)
cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
+ /* If this is a non-leaf simd function with calls we assume that
+ at least one of those calls is to a non-simd function and thus
+ we must save V8 to V23 in the prologue. */
+
+ if (simd_function && !crtl->is_leaf)
+ {
+ for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
+ if (FP_SIMD_SAVED_REGNUM_P (regno))
+ df_set_regs_ever_live (regno, true);
+ }
+
/* First mark all the registers that really need to be saved... */
for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
@@ -4069,7 +4112,8 @@ aarch64_layout_frame (void)
for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
if (df_regs_ever_live_p (regno)
- && !call_used_regs[regno])
+ && (!call_used_regs[regno]
+ || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
{
cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
last_fp_reg = regno;
@@ -4106,7 +4150,8 @@ aarch64_layout_frame (void)
{
/* If there is an alignment gap between integer and fp callee-saves,
allocate the last fp register to it if possible. */
- if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
+ if (regno == last_fp_reg && has_align_gap
+ && !simd_function && (offset & 8) == 0)
{
cfun->machine->frame.reg_offset[regno] = max_int_offset;
break;
@@ -4118,7 +4163,7 @@ aarch64_layout_frame (void)
else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
&& cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
cfun->machine->frame.wb_candidate2 = regno;
- offset += UNITS_PER_WORD;
+ offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
}
offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
@@ -4261,6 +4306,10 @@ aarch64_gen_storewb_pair (machine_mode mode, rtx base,
rtx reg, rtx reg2,
return gen_storewb_pairdf_di (base, base, reg, reg2,
GEN_INT (-adjustment),
GEN_INT (UNITS_PER_WORD - adjustment));
+ case E_TFmode:
+ return gen_storewb_pairtf_di (base, base, reg, reg2,
+ GEN_INT (-adjustment),
+ GEN_INT (UNITS_PER_VREG - adjustment));
default:
gcc_unreachable ();
}
@@ -4273,7 +4322,7 @@ static void
aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
{
rtx_insn *insn;
- machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
+ machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
if (regno2 == INVALID_REGNUM)
return aarch64_pushwb_single_reg (mode, regno1, adjustment);
@@ -4303,6 +4352,9 @@ aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx
reg, rtx reg2,
case E_DFmode:
return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
GEN_INT (UNITS_PER_WORD));
+ case E_TFmode:
+ return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
+ GEN_INT (UNITS_PER_VREG));
default:
gcc_unreachable ();
}
@@ -4316,7 +4368,7 @@ static void
aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
rtx *cfi_ops)
{
- machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
+ machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
rtx reg1 = gen_rtx_REG (mode, regno1);
*cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
@@ -4351,6 +4403,9 @@ aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx
reg1, rtx mem2,
case E_DFmode:
return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
+ case E_TFmode:
+ return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
+
default:
gcc_unreachable ();
}
@@ -4371,6 +4426,9 @@ aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx
mem1, rtx reg2,
case E_DFmode:
return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
+ case E_TFmode:
+ return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
+
default:
gcc_unreachable ();
}
@@ -4403,6 +4461,9 @@ aarch64_save_callee_saves (machine_mode mode, poly_int64
start_offset,
rtx_insn *insn;
unsigned regno;
unsigned regno2;
+ HOST_WIDE_INT mode_size;
+
+ gcc_assert (GET_MODE_SIZE (mode).is_constant(&mode_size));
for (regno = aarch64_next_callee_save (start, limit);
regno <= limit;
@@ -4428,7 +4489,7 @@ aarch64_save_callee_saves (machine_mode mode, poly_int64
start_offset,
if (regno2 <= limit
&& !cfun->machine->reg_is_wrapped_separately[regno2]
- && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
+ && ((cfun->machine->frame.reg_offset[regno] + mode_size)
== cfun->machine->frame.reg_offset[regno2]))
{
@@ -4469,6 +4530,9 @@ aarch64_restore_callee_saves (machine_mode mode,
unsigned regno;
unsigned regno2;
poly_int64 offset;
+ HOST_WIDE_INT mode_size;
+
+ gcc_assert (GET_MODE_SIZE (mode).is_constant(&mode_size));
I believe GCC can be compiled such that gcc_asserts are a no-op, so we want to
avoid depending
on side-effects in such calls. Better to make this into an if-statement and a
gcc_unreachable ().
for (regno = aarch64_next_callee_save (start, limit);
regno <= limit;
@@ -4492,7 +4556,7 @@ aarch64_restore_callee_saves (machine_mode mode,
if (regno2 <= limit
&& !cfun->machine->reg_is_wrapped_separately[regno2]
- && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
+ && ((cfun->machine->frame.reg_offset[regno] + mode_size)
== cfun->machine->frame.reg_offset[regno2]))
{
rtx reg2 = gen_rtx_REG (mode, regno2);
@@ -4629,13 +4693,15 @@ aarch64_components_for_bb (basic_block bb)
bitmap in = DF_LIVE_IN (bb);
bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
+ bool simd_function = aarch64_simd_decl_p (cfun->decl);
sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
bitmap_clear (components);
/* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
- if ((!call_used_regs[regno])
+ if ((!call_used_regs[regno]
+ || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
&& (bitmap_bit_p (in, regno)
|| bitmap_bit_p (gen, regno)
|| bitmap_bit_p (kill, regno)))
@@ -4707,8 +4773,10 @@ aarch64_process_components (sbitmap components, bool
prologue_p)
while (regno != last_regno)
{
/* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
- so DFmode for the vector registers is enough. */
- machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
+ so DFmode for the vector registers is enough. For simd functions
+ we want to save the entire register. */
+ machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
+
rtx reg = gen_rtx_REG (mode, regno);
poly_int64 offset = cfun->machine->frame.reg_offset[regno];
if (!frame_pointer_needed)
@@ -4737,6 +4805,7 @@ aarch64_process_components (sbitmap components, bool
prologue_p)
mergeable with the current one into a pair. */
if (!satisfies_constraint_Ump (mem)
|| GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
+ || (aarch64_simd_decl_p (cfun->decl) && (FP_REGNUM_P (regno)))
|| maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
GET_MODE_SIZE (mode)))
{
@@ -4808,6 +4877,22 @@ aarch64_set_handled_components (sbitmap components)
cfun->machine->reg_is_wrapped_separately[regno] = true;
}
+/* Say that the return address register is used by the epilogue, but only after
+ epilogue generation is complete. Note that in the case of sibcalls, the
+ values "used by the epilogue" are considered live at the start of the called
+ function. For SIMD functions we say that the FP regs that need to be saved
+ are used so that the restore code does not get removed by optimizations. */
+
I'm a bit lost here. Can you please make this comment clearer as to what value
this
function returns? I can see it's an implementation of the EPILOGUE_USES hook
but it's
not clear from this comment what the required logic is.
+int
+aarch64_epilogue_uses (int regno)
+{
+ if (epilogue_completed && (regno) == LR_REGNUM)
+ return 1;
+ if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
+ return 1;
+ return 0;
+}
+
/* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
is saved at BASE + OFFSET. */
@@ -4959,8 +5044,12 @@ aarch64_expand_prologue (void)
aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
callee_adjust != 0 || emit_frame_chain);
- aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
- callee_adjust != 0 || emit_frame_chain);
+ if (aarch64_simd_decl_p (cfun->decl))
+ aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
+ callee_adjust != 0 || emit_frame_chain);
+ else
+ aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
+ callee_adjust != 0 || emit_frame_chain);
aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed);
}
@@ -4983,6 +5072,18 @@ aarch64_use_return_insn_p (void)
return known_eq (cfun->machine->frame.frame_size, 0);
}
+bool
+aarch64_use_simple_return_insn_p (void)
+{
+ /* Avoid shrink-wrapping non-leaf simd functions. Doing so
+ causes GCC to lose the fp save/restores in the prologue and
+ epilogue. */
+
Please add a function comment. How is this used in shrink-wrapping?
Does the availability of the simple_return standard name guard that
optimisation?
+ if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
+ return false;
+
+ return true;
+}
/* Generate the epilogue instructions for returning from a function.
This is almost exactly the reverse of the prolog sequence, except
@@ -5041,8 +5142,12 @@ aarch64_expand_epilogue (bool for_sibcall)
aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
callee_adjust != 0, &cfi_ops);
- aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
- callee_adjust != 0, &cfi_ops);
+ if (aarch64_simd_decl_p (cfun->decl))
+ aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
+ callee_adjust != 0, &cfi_ops);
+ else
+ aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
+ callee_adjust != 0, &cfi_ops);
if (need_barrier_p)
emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
@@ -6287,7 +6392,7 @@ aarch64_fixed_condition_code_regs (unsigned int *p1,
unsigned int *p2)
void
aarch64_expand_call (rtx result, rtx mem, bool sibcall)
{
- rtx call, callee, tmp;
+ rtx call, call_insn, callee, tmp;
rtvec vec;
machine_mode mode;
@@ -6305,7 +6410,8 @@ aarch64_expand_call (rtx result, rtx mem, bool sibcall)
: !REG_P (callee))
XEXP (mem, 0) = force_reg (mode, callee);
- call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
+ call_insn = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
+ call = call_insn;
if (result != NULL_RTX)
call = gen_rtx_SET (result, call);
@@ -18210,6 +18316,9 @@ aarch64_libgcc_floating_mode_supported_p
#undef TARGET_SELECT_EARLY_REMAT_MODES
#define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
+#undef TARGET_ATTRIBUTE_TABLE
+#define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
+
#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index c121850..b995705 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -392,13 +392,7 @@ extern unsigned aarch64_architecture_version;
V_ALIASES(28), V_ALIASES(29), V_ALIASES(30), V_ALIASES(31) \
}
-/* Say that the return address register is used by the epilogue, but only after
- epilogue generation is complete. Note that in the case of sibcalls, the
- values "used by the epilogue" are considered live at the start of the called
- function. */
-
-#define EPILOGUE_USES(REGNO) \
- (epilogue_completed && (REGNO) == LR_REGNUM)
+#define EPILOGUE_USES(REGNO) aarch64_epilogue_uses (REGNO)
Please wrap the expansion of the macro in parentheses
/* EXIT_IGNORE_STACK should be nonzero if, when returning from a function,
the stack pointer does not matter. This is only true if the function
@@ -503,6 +497,8 @@ extern unsigned aarch64_architecture_version;
#define PR_LO_REGNUM_P(REGNO)\
(((unsigned) (REGNO - P0_REGNUM)) <= (P7_REGNUM - P0_REGNUM))
+#define FP_SIMD_SAVED_REGNUM_P(REGNO) \
+ (((unsigned) (REGNO - V8_REGNUM)) <= (V23_REGNUM - V8_REGNUM))
/* Register and constant classes. */
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index e9c16f9..40e78be 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -63,6 +63,7 @@
(V15_REGNUM 47)
(V16_REGNUM 48)
(V20_REGNUM 52)
+ (V23_REGNUM 55)
(V24_REGNUM 56)
(V28_REGNUM 60)
(V31_REGNUM 63)
@@ -677,7 +678,13 @@
""
)
-(define_insn "simple_return"
+(define_expand "simple_return"
+ [(simple_return)]
+ "aarch64_use_simple_return_insn_p ()"
+ ""
+)
+
+(define_insn "*simple_return"
[(simple_return)]
""
"ret"
@@ -1346,6 +1353,20 @@
(set_attr "fp" "*,yes")]
)
+(define_insn "load_pair_dw_tftf"
+ [(set (match_operand:TF 0 "register_operand" "=w")
+ (match_operand:TF 1 "aarch64_mem_pair_operand" "Ump"))
+ (set (match_operand:TF 2 "register_operand" "=w")
+ (match_operand:TF 3 "memory_operand" "m"))]
+ "rtx_equal_p (XEXP (operands[3], 0),
+ plus_constant (Pmode,
+ XEXP (operands[1], 0),
+ GET_MODE_SIZE (TFmode)))"
+ "ldp\\t%q0, %q2, %1"
+ [(set_attr "type" "neon_load1_2reg")
+ (set_attr "fp" "yes")]
+)
+
;; Operands 0 and 2 are tied together by the final condition; so we allow
;; fairly lax checking on the second memory operation.
(define_insn "store_pair_sw_<SX:mode><SX2:mode>"
@@ -1381,6 +1402,20 @@
(set_attr "fp" "*,yes")]
)
+(define_insn "store_pair_dw_tftf"
+ [(set (match_operand:TF 0 "aarch64_mem_pair_operand" "=Ump")
+ (match_operand:TF 1 "register_operand" "w"))
+ (set (match_operand:TF 2 "memory_operand" "=m")
+ (match_operand:TF 3 "register_operand" "w"))]
+ "rtx_equal_p (XEXP (operands[2], 0),
+ plus_constant (Pmode,
+ XEXP (operands[0], 0),
+ GET_MODE_SIZE (TFmode)))"
+ "stp\\t%q1, %q3, %0"
+ [(set_attr "type" "neon_store1_2reg")
+ (set_attr "fp" "yes")]
+)
+
I think instead of having these two patterns it would be better to extend the
existing
load_pair<VQ:mode><VQ2:mode> and vec_store_pair<VQ:mode><VQ2:mode> patterns.
You may need to define a new iterator that extends the VQ modes with TFmode.
;; Load pair with post-index writeback. This is primarily used in function
;; epilogues.
(define_insn "loadwb_pair<GPI:mode>_<P:mode>"
@@ -1413,6 +1448,21 @@
[(set_attr "type" "neon_load1_2reg")]
)
+(define_insn "loadwb_pair<TX:mode>_<P:mode>"
+ [(parallel
+ [(set (match_operand:P 0 "register_operand" "=k")
+ (plus:P (match_operand:P 1 "register_operand" "0")
+ (match_operand:P 4 "aarch64_mem_pair_offset" "n")))
+ (set (match_operand:TX 2 "register_operand" "=w")
+ (mem:TX (match_dup 1)))
+ (set (match_operand:TX 3 "register_operand" "=w")
+ (mem:TX (plus:P (match_dup 1)
+ (match_operand:P 5 "const_int_operand" "n"))))])]
+ "INTVAL (operands[5]) == GET_MODE_SIZE (<TX:MODE>mode)"
+ "ldp\\t%q2, %q3, [%1], %4"
+ [(set_attr "type" "neon_load1_2reg")]
+)
Please guard this on TARGET_SIMD. Also, I believe the correct type for this is
neon_ldp_q.
+
;; Store pair with pre-index writeback. This is primarily used in function
;; prologues.
(define_insn "storewb_pair<GPI:mode>_<P:mode>"
@@ -1447,6 +1497,22 @@
[(set_attr "type" "neon_store1_2reg<q>")]
)
+(define_insn "storewb_pair<TX:mode>_<P:mode>"
+ [(parallel
+ [(set (match_operand:P 0 "register_operand" "=&k")
+ (plus:P (match_operand:P 1 "register_operand" "0")
+ (match_operand:P 4 "aarch64_mem_pair_offset" "n")))
+ (set (mem:TX (plus:P (match_dup 0)
+ (match_dup 4)))
+ (match_operand:TX 2 "register_operand" "w"))
+ (set (mem:TX (plus:P (match_dup 0)
+ (match_operand:P 5 "const_int_operand" "n")))
+ (match_operand:TX 3 "register_operand" "w"))])]
+ "INTVAL (operands[5]) == INTVAL (operands[4]) + GET_MODE_SIZE
(<TX:MODE>mode)"
+ "stp\\t%q2, %q3, [%0, %4]!"
+ [(set_attr "type" "neon_store1_2reg<q>")]
+)
Likewise. The type here should be neon_stp_q.
+
;; -------------------------------------------------------------------
;; Sign/Zero extension
;; -------------------------------------------------------------------
gcc-vect-abi-test.patch
diff --git a/gcc/testsuite/gcc.target/aarch64/torture/aarch64-torture.exp
b/gcc/testsuite/gcc.target/aarch64/torture/aarch64-torture.exp
index e69de29..22f08ff 100644
--- a/gcc/testsuite/gcc.target/aarch64/torture/aarch64-torture.exp
+++ b/gcc/testsuite/gcc.target/aarch64/torture/aarch64-torture.exp
@@ -0,0 +1,41 @@
+# Copyright (C) 2018 Free Software Foundation, Inc.
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GCC; see the file COPYING3. If not see
+#<http://www.gnu.org/licenses/>.
+
+# GCC testsuite that uses the `gcc-dg.exp' driver, looping over
+# optimization options.
+
+# Exit immediately if this isn't a Aarch64 target.
+if { ![istarget aarch64*-*-*] } then {
+ return
+}
+
+# Load support procs.
+load_lib gcc-dg.exp
+
+# If a testcase doesn't have special options, use these.
+global DEFAULT_CFLAGS
+if ![info exists DEFAULT_CFLAGS] then {
+ set DEFAULT_CFLAGS " -ansi -pedantic-errors"
+}
+
+# Initialize `dg'.
+dg-init
+
+# Main loop.
+gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.\[cS\]]] ""
$DEFAULT_CFLAGS
+
I don't see this iterating over any options.
Did you mean to load c-torture.exp above?
+# All done.
+dg-finish
diff --git a/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-1.c
b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-1.c
index e69de29..e11580a 100644
--- a/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-1.c
+++ b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-1.c
@@ -0,0 +1,41 @@
+/* { dg-do compile } */
+
+void __attribute__ ((aarch64_vector_pcs))
+f (void)
+{
+ /* Clobber all fp/simd regs and verify that the correct ones are saved
+ and restored in the prologue and epilogue of a SIMD function. */
+ __asm__ __volatile__ ("" ::: "q0", "q1", "q2", "q3");
+ __asm__ __volatile__ ("" ::: "q4", "q5", "q6", "q7");
+ __asm__ __volatile__ ("" ::: "q8", "q9", "q10", "q11");
+ __asm__ __volatile__ ("" ::: "q12", "q13", "q14", "q15");
+ __asm__ __volatile__ ("" ::: "q16", "q17", "q18", "q19");
+ __asm__ __volatile__ ("" ::: "q20", "q21", "q22", "q23");
+ __asm__ __volatile__ ("" ::: "q24", "q25", "q26", "q27");
+ __asm__ __volatile__ ("" ::: "q28", "q29", "q30", "q31");
+}
+
+/* { dg-final { scan-assembler "\[ \t\]stp\tq8, q9" } } */
+/* { dg-final { scan-assembler "\[ \t\]stp\tq10, q11" } } */
+/* { dg-final { scan-assembler "\[ \t\]stp\tq12, q13" } } */
+/* { dg-final { scan-assembler "\[ \t\]stp\tq14, q15" } } */
+/* { dg-final { scan-assembler "\[ \t\]stp\tq16, q17" } } */
+/* { dg-final { scan-assembler "\[ \t\]stp\tq18, q19" } } */
+/* { dg-final { scan-assembler "\[ \t\]stp\tq20, q21" } } */
+/* { dg-final { scan-assembler "\[ \t\]stp\tq22, q23" } } */
+/* { dg-final { scan-assembler "\[ \t\]ldp\tq8, q9" } } */
+/* { dg-final { scan-assembler "\[ \t\]ldp\tq10, q11" } } */
+/* { dg-final { scan-assembler "\[ \t\]ldp\tq12, q13" } } */
+/* { dg-final { scan-assembler "\[ \t\]ldp\tq14, q15" } } */
+/* { dg-final { scan-assembler "\[ \t\]ldp\tq16, q17" } } */
+/* { dg-final { scan-assembler "\[ \t\]ldp\tq18, q19" } } */
+/* { dg-final { scan-assembler "\[ \t\]ldp\tq20, q21" } } */
+/* { dg-final { scan-assembler "\[ \t\]ldp\tq22, q23" } } */
+/* { dg-final { scan-assembler-not "\[ \t\]stp\tq\[034567\]" } } */
+/* { dg-final { scan-assembler-not "\[ \t\]ldp\tq\[034567\]" } } */
+/* { dg-final { scan-assembler-not "\[ \t\]stp\tq2\[456789\]" } } */
+/* { dg-final { scan-assembler-not "\[ \t\]ldp\tq2\[456789\]" } } */
+/* { dg-final { scan-assembler-not "\[ \t\]stp\td" } } */
+/* { dg-final { scan-assembler-not "\[ \t\]ldp\td" } } */
+/* { dg-final { scan-assembler-not "\[ \t\]str\t" } } */
+/* { dg-final { scan-assembler-not "\[ \t\]ldr\t" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-2.c
b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-2.c
index e69de29..ecc60d0 100644
--- a/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-2.c
+++ b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-2.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+
+void
+f (void)
+{
+ /* Clobber all fp/simd regs and verify that the correct ones are saved
+ and restored in the prologue and epilogue of a SIMD function. */
+ __asm__ __volatile__ ("" ::: "q0", "q1", "q2", "q3");
+ __asm__ __volatile__ ("" ::: "q4", "q5", "q6", "q7");
+ __asm__ __volatile__ ("" ::: "q8", "q9", "q10", "q11");
+ __asm__ __volatile__ ("" ::: "q12", "q13", "q14", "q15");
+ __asm__ __volatile__ ("" ::: "q16", "q17", "q18", "q19");
+ __asm__ __volatile__ ("" ::: "q20", "q21", "q22", "q23");
+ __asm__ __volatile__ ("" ::: "q24", "q25", "q26", "q27");
+ __asm__ __volatile__ ("" ::: "q28", "q29", "q30", "q31");
+}
+
+/* { dg-final { scan-assembler "\[ \t\]stp\td8, d9" } } */
+/* { dg-final { scan-assembler "\[ \t\]stp\td10, d11" } } */
+/* { dg-final { scan-assembler "\[ \t\]stp\td12, d13" } } */
+/* { dg-final { scan-assembler "\[ \t\]stp\td14, d15" } } */
+/* { dg-final { scan-assembler "\[ \t\]ldp\td8, d9" } } */
+/* { dg-final { scan-assembler "\[ \t\]ldp\td10, d11" } } */
+/* { dg-final { scan-assembler "\[ \t\]ldp\td12, d13" } } */
+/* { dg-final { scan-assembler "\[ \t\]ldp\td14, d15" } } */
+/* { dg-final { scan-assembler-not "\[ \t\]stp\tq" } } */
+/* { dg-final { scan-assembler-not "\[ \t\]ldp\tq" } } */
+/* { dg-final { scan-assembler-not "\[ \t\]stp\tq\[01234567\]" } } */
+/* { dg-final { scan-assembler-not "\[ \t\]ldp\tq\[01234567\]" } } */
+/* { dg-final { scan-assembler-not "\[ \t\]stp\tq1\[6789\]" } } */
+/* { dg-final { scan-assembler-not "\[ \t\]ldp\tq1\[6789\]" } } */
+/* { dg-final { scan-assembler-not "\[ \t\]str\t" } } */
+/* { dg-final { scan-assembler-not "\[ \t\]ldr\t" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-3.c
b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-3.c
index e69de29..d7926d3 100644
--- a/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-3.c
+++ b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-3.c
@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+
+extern void g (void);
+
+void __attribute__ ((aarch64_vector_pcs))
+f (void)
+{
+ g();
+}
+
+/* { dg-final { scan-assembler "\[ \t\]stp\tq8, q9" } } */
+/* { dg-final { scan-assembler "\[ \t\]stp\tq10, q11" } } */
+/* { dg-final { scan-assembler "\[ \t\]stp\tq12, q13" } } */
+/* { dg-final { scan-assembler "\[ \t\]stp\tq14, q15" } } */
+/* { dg-final { scan-assembler "\[ \t\]stp\tq16, q17" } } */
+/* { dg-final { scan-assembler "\[ \t\]stp\tq18, q19" } } */
+/* { dg-final { scan-assembler "\[ \t\]stp\tq20, q21" } } */
+/* { dg-final { scan-assembler "\[ \t\]stp\tq22, q23" } } */
+/* { dg-final { scan-assembler "\[ \t\]ldp\tq8, q9" } } */
+/* { dg-final { scan-assembler "\[ \t\]ldp\tq10, q11" } } */
+/* { dg-final { scan-assembler "\[ \t\]ldp\tq12, q13" } } */
+/* { dg-final { scan-assembler "\[ \t\]ldp\tq14, q15" } } */
+/* { dg-final { scan-assembler "\[ \t\]ldp\tq16, q17" } } */
+/* { dg-final { scan-assembler "\[ \t\]ldp\tq18, q19" } } */
+/* { dg-final { scan-assembler "\[ \t\]ldp\tq20, q21" } } */
+/* { dg-final { scan-assembler "\[ \t\]ldp\tq22, q23" } } */
+/* { dg-final { scan-assembler-not "\[ \t\]stp\tq\[034567\]" } } */
+/* { dg-final { scan-assembler-not "\[ \t\]ldp\tq\[034567\]" } } */
+/* { dg-final { scan-assembler-not "\[ \t\]stp\tq2\[456789\]" } } */
+/* { dg-final { scan-assembler-not "\[ \t\]ldp\tq2\[456789\]" } } */
+/* { dg-final { scan-assembler-not "\[ \t\]stp\td" } } */
+/* { dg-final { scan-assembler-not "\[ \t\]ldp\td" } } */
+/* { dg-final { scan-assembler-not "\[ \t\]str\t" } } */
+/* { dg-final { scan-assembler-not "\[ \t\]ldr\t" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-4.c
b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-4.c
index e69de29..e399690 100644
--- a/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-4.c
+++ b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-4.c
@@ -0,0 +1,34 @@
+/* dg-do run */
+/* { dg-additional-options "-std=c99" } */
+
+
+
+/* There is nothing special about the calculations here, this is just
+ a test that can be compiled and run. */
+
+extern void abort (void);
+
+__Float64x2_t __attribute__ ((noinline, aarch64_vector_pcs))
+foo(__Float64x2_t a, __Float64x2_t b, __Float64x2_t c,
+ __Float64x2_t d, __Float64x2_t e, __Float64x2_t f,
+ __Float64x2_t g, __Float64x2_t h, __Float64x2_t i)
+{
+ __Float64x2_t w, x, y, z;
+ w = a + b * c;
+ x = d + e * f;
+ y = g + h * i;
+ return w + x * y;
+}
+
+
+int main()
+{
+ __Float64x2_t a, b, c, d;
+ a = (__Float64x2_t) { 1.0, 2.0 };
+ b = (__Float64x2_t) { 3.0, 4.0 };
+ c = (__Float64x2_t) { 5.0, 6.0 };
+ d = foo (a, b, c, (a+b), (b+c), (a+c), (a-b), (b-c), (a-c)) + a + b + c;
+ if (d[0] != 337.0 || d[1] != 554.0)
+ abort ();
+ return 0;
+}