This patch uses unsigned max reductions to emulate aligned masked loads on AArch64.
It reduces the mask to a scalar that is nonzero if any mask element is true,
then uses that scalar to select between the real address and a scratchpad
address.
The idea is that if the vector load is aligned, it cannot cross a page
boundary and so cannot partially fault; it is therefore safe to load the
full vector from the address (and use only the active lanes) as long as
at least one mask element is true.
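Roughly, in scalar C terms, the transformation amounts to the sketch
below. This is only an illustration: the names VF, scratch and
emulated_masked_load are made up for the example, src stands for the
16-byte-aligned vectorized address, and the patch itself emits the
equivalent GIMPLE rather than C.

  #define VF 4

  /* Scratchpad the load is redirected to when no lane is active; only
     its alignment matters, its contents are never used.  */
  static int scratch[VF] __attribute__ ((aligned (16)));

  void
  emulated_masked_load (int *restrict dst, const int *restrict src,
                        const int mask[VF])
  {
    /* Step 1: reduce the mask to a scalar that is nonzero if any lane
       is active (the patch does this with a vector unsigned-max
       reduction).  */
    int anytrue = 0;
    for (int i = 0; i < VF; i++)
      anytrue |= mask[i];

    /* Step 2: select the real address if any lane is active, otherwise
       the scratchpad, so the full-width load below is always safe.  */
    const int *addr = anytrue ? src : scratch;

    /* Step 3: full aligned vector load; an aligned load cannot cross a
       page boundary, so it cannot partially fault.  */
    int tmp[VF];
    for (int i = 0; i < VF; i++)
      tmp[i] = addr[i];

    /* Step 4: only the active lanes of the loaded vector are
       consumed.  */
    for (int i = 0; i < VF; i++)
      if (mask[i])
        dst[i] = tmp[i];
  }

The scratchpad only matters in the all-false case; as soon as any lane
is active the real aligned address is loaded in full, which is why the
fallback is restricted to aligned accesses in
vect_supportable_dr_alignment below.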
The patch gave a 15% speed improvement on simple microbenchmarks.
Several spec2k6 benchmarks were affected by the patch: 400.perlbench,
403.gcc, 436.cactusADM, 454.calculix and 464.h264. However, the changes
had no measurable effect on their performance.
Regression-tested on x86_64-linux-gnu, aarch64-linux-gnu and
arm-linux-gnueabi.
Thanks,
Pawel
diff --git a/gcc/optabs-query.h b/gcc/optabs-query.h
index 73f2729..066d133 100644
--- a/gcc/optabs-query.h
+++ b/gcc/optabs-query.h
@@ -134,5 +134,6 @@ bool can_vec_mask_load_store_p (machine_mode, bool);
bool can_compare_and_swap_p (machine_mode, bool);
bool can_atomic_exchange_p (machine_mode, bool);
bool lshift_cheap_p (bool);
+bool supports_umax_reduction ();
#endif
diff --git a/gcc/optabs-query.c b/gcc/optabs-query.c
index 254089f..23a85a4 100644
--- a/gcc/optabs-query.c
+++ b/gcc/optabs-query.c
@@ -463,6 +463,21 @@ can_mult_highpart_p (machine_mode mode, bool uns_p)
return 0;
}
+/* Return true if target supports unsigned max reduction for any mode. */
+
+bool
+supports_umax_reduction ()
+{
+ machine_mode mode;
+
+ for (mode = MIN_MODE_VECTOR_INT; mode <= MAX_MODE_VECTOR_INT;
+ mode = (machine_mode) (mode + 1))
+ if (optab_handler (reduc_umax_scal_optab, mode) != CODE_FOR_nothing)
+ return true;
+
+ return false;
+}
+
/* Return true if target supports vector masked load/store for mode. */
bool
diff --git a/gcc/testsuite/gcc.dg/vect/vect-align-4.c b/gcc/testsuite/gcc.dg/vect/vect-align-4.c
new file mode 100644
index 0000000..98db8e3
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-align-4.c
@@ -0,0 +1,65 @@
+/* { dg-require-effective-target umax_reduction } */
+
+#define N 512
+#define K 32
+
+extern void abort (void) __attribute__((noreturn));
+
+int a[N] __attribute__ ((aligned (16)));
+int b[N] __attribute__ ((aligned (16)));
+int c[N] __attribute__ ((aligned (16)));
+
+__attribute__ ((noinline)) void
+init_arrays () {
+ int i;
+
+ for (i = 0; i < N / 4; ++i)
+ a[i] = K + 1;
+
+ for (i = N / 4; i < N / 2; ++i)
+ a[i] = (i % 2 == 0) ? K - 1 : K + 1;
+
+ for (i = N / 2; i < N; ++i)
+ a[i] = K - 1;
+
+ for (i = 0; i < N; ++i)
+ b[i] = i;
+}
+
+__attribute__ ((noinline)) void
+check_array () {
+ int i = 0;
+
+ for (i = 0; i < N / 4; ++i)
+ if (c[i] != a[i])
+ abort ();
+
+ for (i = N / 4; i < N / 2; ++i)
+ if (c[i] != ((i % 2 == 0) ? b[i] : a[i]))
+ abort ();
+
+ for (i = N / 2; i < N; ++i)
+ if (c[i] != b[i])
+ abort ();
+}
+
+__attribute__ ((noinline)) void
+main1 (int* bp) {
+ int i;
+
+ for (i = 0; i < N; ++i)
+ c[i] = a[i] < K ? bp[i] : a[i];
+
+ check_array ();
+}
+
+int main (void) {
+ init_arrays ();
+
+ main1 (b);
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/vect-align-5.c b/gcc/testsuite/gcc.dg/vect/vect-align-5.c
new file mode 100644
index 0000000..93bfaa1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-align-5.c
@@ -0,0 +1,65 @@
+/* { dg-require-effective-target umax_reduction } */
+
+#define N 512
+#define K 32
+
+extern void abort (void) __attribute__((noreturn));
+
+int a[N] __attribute__ ((aligned (16)));
+int b[N];
+int c[N] __attribute__ ((aligned (16)));
+
+__attribute__ ((noinline)) void
+init_arrays () {
+ int i;
+
+ for (i = 0; i < N / 4; ++i)
+ a[i] = K + 1;
+
+ for (i = N / 4; i < N / 2; ++i)
+ a[i] = (i % 2 == 0) ? K - 1 : K + 1;
+
+ for (i = N / 2; i < N; ++i)
+ a[i] = K - 1;
+
+ for (i = 0; i < N; ++i)
+ b[i] = i;
+}
+
+__attribute__ ((noinline)) void
+check_array () {
+ int i = 0;
+
+ for (i = 0; i < N / 4; ++i)
+ if (c[i] != a[i])
+ abort ();
+
+ for (i = N / 4; i < N / 2; ++i)
+ if (c[i] != ((i % 2 == 0) ? b[i] : a[i]))
+ abort ();
+
+ for (i = N / 2; i < N; ++i)
+ if (c[i] != b[i])
+ abort ();
+}
+
+__attribute__ ((noinline)) void
+main1 (int* bp) {
+ int i;
+
+ for (i = 0; i < N; ++i)
+ c[i] = a[i] < K ? bp[i] : a[i];
+
+ check_array ();
+}
+
+int main (void) {
+ init_arrays ();
+
+ main1 (b);
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index a465eb1..9b1c338 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -6449,3 +6449,14 @@ proc check_effective_target_comdat_group {} {
int (*fn) () = foo;
}]
}
+
+# Return 1 if the target supports unsigned max vector reduction.
+
+proc check_effective_target_umax_reduction { } {
+ if { [istarget aarch64*-*-*] } {
+ return 1;
+ } else {
+ return 0;
+ }
+}
+
diff --git a/gcc/tree-if-conv.c b/gcc/tree-if-conv.c
index 0987884..4f84705 100644
--- a/gcc/tree-if-conv.c
+++ b/gcc/tree-if-conv.c
@@ -811,7 +811,8 @@ ifcvt_can_use_mask_load_store (gimple stmt)
|| VECTOR_MODE_P (mode))
return false;
- if (can_vec_mask_load_store_p (mode, is_load))
+ if (can_vec_mask_load_store_p (mode, is_load)
+ || (is_load && supports_umax_reduction ()))
return true;
return false;
diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index 671e613..4f8c2c5 100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -5749,10 +5749,19 @@ vect_supportable_dr_alignment (struct data_reference *dr,
/* For now assume all conditional loads/stores support unaligned
access without any special code. */
if (is_gimple_call (stmt)
- && gimple_call_internal_p (stmt)
- && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
- || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
- return dr_unaligned_supported;
+ && gimple_call_internal_p (stmt))
+ {
+ if (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD)
+ return (can_vec_mask_load_store_p (mode, true)
+ ? dr_unaligned_supported
+ : dr_unaligned_unsupported);
+ else if (gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
+ {
+ gcc_checking_assert (can_vec_mask_load_store_p (
+ TYPE_MODE (TREE_TYPE (vectype)), false));
+ return dr_unaligned_supported;
+ }
+ }
if (loop_vinfo)
{
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index d4a436d..2a8c231 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1840,7 +1840,9 @@ vectorizable_mask_load_store (gimple stmt, gimple_stmt_iterator *gsi,
: DR_STEP (dr), size_zero_node) <= 0)
return false;
else if (!VECTOR_MODE_P (TYPE_MODE (vectype))
- || !can_vec_mask_load_store_p (TYPE_MODE (vectype), !is_store))
+ || !(can_vec_mask_load_store_p (TYPE_MODE (vectype), !is_store)
+ || (optab_handler (reduc_umax_scal_optab,
+ TYPE_MODE (vectype)) != CODE_FOR_nothing)))
return false;
if (TREE_CODE (mask) != SSA_NAME)
@@ -2140,12 +2142,43 @@ vectorizable_mask_load_store (gimple stmt, gimple_stmt_iterator *gsi,
misalign = DR_MISALIGNMENT (dr);
set_ptr_info_alignment (get_ptr_info (dataref_ptr), align,
misalign);
- new_stmt
- = gimple_build_call_internal (IFN_MASK_LOAD, 3, dataref_ptr,
- gimple_call_arg (stmt, 1),
- vec_mask);
- gimple_call_set_lhs (new_stmt, make_ssa_name (vec_dest));
- vect_finish_stmt_generation (stmt, new_stmt, gsi);
+
+ if (can_vec_mask_load_store_p (TYPE_MODE (vectype), !is_store))
+ {
+ new_stmt
+ = gimple_build_call_internal (IFN_MASK_LOAD, 3, dataref_ptr,
+ gimple_call_arg (stmt, 1),
+ vec_mask);
+ gimple_call_set_lhs (new_stmt, make_ssa_name (vec_dest));
+ vect_finish_stmt_generation (stmt, new_stmt, gsi);
+ }
+ else
+ {
+ tree anytrue = make_temp_ssa_name (TREE_TYPE (
+ TREE_TYPE (vec_mask)),
+ NULL, "_anytrue");
+ tree reduction = build1 (REDUC_MAX_EXPR, TREE_TYPE (anytrue),
+ vec_mask);
+ gimple anytrue_init = gimple_build_assign (anytrue, reduction);
+ vect_finish_stmt_generation (stmt, anytrue_init, gsi);
+
+ tree temp_addr = build1 (ADDR_EXPR, TREE_TYPE (dataref_ptr),
+ create_tmp_var (vectype, "safevec"));
+ tree vec_cond_expr = build3 (COND_EXPR, TREE_TYPE (dataref_ptr),
+ anytrue, dataref_ptr, temp_addr);
+
+ tree safeb = make_temp_ssa_name (TREE_TYPE (dataref_ptr),
+ NULL, "_safeb");
+ gimple safeb_init = gimple_build_assign (safeb, vec_cond_expr);
+ vect_finish_stmt_generation (stmt, safeb_init, gsi);
+
+ tree load = build2 (MEM_REF, vectype, safeb,
+ build_int_cst (ptr_type_node, 0));
+ new_stmt
+ = gimple_build_assign (make_ssa_name (vec_dest), load);
+ vect_finish_stmt_generation (stmt, new_stmt, gsi);
+ }
+
if (i == 0)
STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
else