[PATCH] tree-optimization/112736 - avoid overread with non-grouped SLP load

Richard Biener Tue, 12 Dec 2023 02:52:58 -0800

The following aovids over/under-read of storage when vectorizing
a non-grouped load with SLP.  Instead of forcing peeling for gaps
use a smaller load for the last vector which might access excess
elements.  This builds upon the existing optimization avoiding
peeling for gaps, generalizing it to all gap widths leaving a
power-of-two remaining number of elements (but it doesn't replace
or improve that particular case at this point).


I wonder if the poly relational compares I set up are good enough
to guarantee /* remain should now be > 0 and < nunits.  */.

There is existing test coverage that runs into /* DR will be unused.  */
always when the gap is wider than nunits.  Compared to the
existing gap == nunits/2 case this only adjusts the load that will
cause the overrun at the end, not every load.  Apart from the
poly relational compares it should reliably cover these cases but
I'll leave it for stage1 to remove.

Bootstrapped and tested on x86_64-unknown-linux-gnu, I've also
built and tested SPEC CPU 2017.

OK?

        PR tree-optimization/112736
        * tree-vect-stmts.cc (vectorizable_load): Extend optimization
        to avoid peeling for gaps to handle single-element non-groups
        we now allow with SLP.

        * gcc.dg/torture/pr112736.c: New testcase.
---
 gcc/testsuite/gcc.dg/torture/pr112736.c | 27 ++++++++
 gcc/tree-vect-stmts.cc                  | 86 ++++++++++++++++++++-----
 2 files changed, 96 insertions(+), 17 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/torture/pr112736.c

diff --git a/gcc/testsuite/gcc.dg/torture/pr112736.c 
b/gcc/testsuite/gcc.dg/torture/pr112736.c
new file mode 100644
index 00000000000..6abb56edba3
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/pr112736.c
@@ -0,0 +1,27 @@
+/* { dg-do run { target *-*-linux* *-*-gnu* *-*-uclinux* } } */
+
+#include <sys/mman.h>
+#include <unistd.h>
+
+int a, c[3][5];
+
+void __attribute__((noipa))
+fn1 (int * __restrict b)
+{
+  int e;
+  for (a = 2; a >= 0; a--)
+    for (e = 0; e < 4; e++)
+      c[a][e] = b[a];
+}
+
+int main()
+{
+  long pgsz = sysconf (_SC_PAGESIZE);
+  void *p = mmap (NULL, pgsz * 2, PROT_READ|PROT_WRITE,
+                  MAP_ANONYMOUS|MAP_PRIVATE, 0, 0);
+  if (p == MAP_FAILED)
+    return 0;
+  mprotect (p, pgsz, PROT_NONE);
+  fn1 (p + pgsz);
+  return 0;
+}
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 390c8472fd6..c03c4c08c9d 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -11465,26 +11465,70 @@ vectorizable_load (vec_info *vinfo,
                        if (new_vtype != NULL_TREE)
                          ltype = half_vtype;
                      }
+                   /* Try to use a single smaller load when we are about
+                      to load accesses excess elements compared to the
+                      unrolled scalar loop.
+                      ???  This should cover the above case as well.  */
+                   else if (known_gt ((vec_num * j + i + 1) * nunits,
+                                      (group_size * vf - gap)))
+                     {
+                       if (known_ge ((vec_num * j + i + 1) * nunits
+                                     - (group_size * vf - gap), nunits))
+                         /* DR will be unused.  */
+                         ltype = NULL_TREE;
+                       else if (alignment_support_scheme == dr_aligned)
+                         /* Aligned access to excess elements is OK if
+                            at least one element is accessed in the
+                            scalar loop.  */
+                         ;
+                       else
+                         {
+                           auto remain
+                             = ((group_size * vf - gap)
+                                - (vec_num * j + i) * nunits);
+                           /* remain should now be > 0 and < nunits.  */
+                           unsigned num;
+                           if (constant_multiple_p (nunits, remain, &num))
+                             {
+                               tree ptype;
+                               new_vtype
+                                 = vector_vector_composition_type (vectype,
+                                                                   num,
+                                                                   &ptype);
+                               if (new_vtype)
+                                 ltype = ptype;
+                             }
+                           /* Else use multiple loads or a masked load?  */
+                         }
+                     }
                    tree offset
                      = (dataref_offset ? dataref_offset
                                        : build_int_cst (ref_type, 0));
-                   if (ltype != vectype
-                       && memory_access_type == VMAT_CONTIGUOUS_REVERSE)
+                   if (!ltype)
+                     ;
+                   else if (ltype != vectype
+                            && memory_access_type == VMAT_CONTIGUOUS_REVERSE)
                      {
                        unsigned HOST_WIDE_INT gap_offset
-                         = gap * tree_to_uhwi (TYPE_SIZE_UNIT (elem_type));
+                         = (tree_to_uhwi (TYPE_SIZE_UNIT (vectype))
+                            - tree_to_uhwi (TYPE_SIZE_UNIT (ltype)));
                        tree gapcst = build_int_cst (ref_type, gap_offset);
                        offset = size_binop (PLUS_EXPR, offset, gapcst);
                      }
-                   data_ref
-                     = fold_build2 (MEM_REF, ltype, dataref_ptr, offset);
-                   if (alignment_support_scheme == dr_aligned)
-                     ;
-                   else
-                     TREE_TYPE (data_ref)
-                       = build_aligned_type (TREE_TYPE (data_ref),
-                                             align * BITS_PER_UNIT);
-                   if (ltype != vectype)
+                   if (ltype)
+                     {
+                       data_ref
+                         = fold_build2 (MEM_REF, ltype, dataref_ptr, offset);
+                       if (alignment_support_scheme == dr_aligned)
+                         ;
+                       else
+                         TREE_TYPE (data_ref)
+                           = build_aligned_type (TREE_TYPE (data_ref),
+                                                 align * BITS_PER_UNIT);
+                     }
+                   if (!ltype)
+                     data_ref = build_constructor (vectype, NULL);
+                   else if (ltype != vectype)
                      {
                        vect_copy_ref_info (data_ref,
                                            DR_REF (first_dr_info->dr));
@@ -11494,18 +11538,26 @@ vectorizable_load (vec_info *vinfo,
                                                     gsi);
                        data_ref = NULL;
                        vec<constructor_elt, va_gc> *v;
-                       vec_alloc (v, 2);
+                       unsigned num
+                         = (exact_div (tree_to_poly_uint64
+                                         (TYPE_SIZE_UNIT (vectype)),
+                                       tree_to_poly_uint64
+                                         (TYPE_SIZE_UNIT (ltype)))
+                            .to_constant ());
+                       vec_alloc (v, num);
                        if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
                          {
-                           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
-                                                   build_zero_cst (ltype));
+                           while (--num)
+                             CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
+                                                     build_zero_cst (ltype));
                            CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
                          }
                        else
                          {
                            CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
-                           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
-                                                   build_zero_cst (ltype));
+                           while (--num)
+                             CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
+                                                     build_zero_cst (ltype));
                          }
                        gcc_assert (new_vtype != NULL_TREE);
                        if (new_vtype == vectype)
-- 
2.35.3

[PATCH] tree-optimization/112736 - avoid overread with non-grouped SLP load

Reply via email to