The following addresses the fact that with loop masking (or regular
masked loads) we do not implement load shortening, yet we let masking
override the case where shortening is required for correctness.
Likewise, when we attempt to use loop masking to handle large trailing
gaps we cannot do so when this overrun case applies.
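
For illustration, a minimal sketch of the kind of access this is about
(names are made up, this is not the PR testcase itself): each iteration
uses only every other element, so the vector code loads full contiguous
chunks and the last load can run past the end of the array unless the
access is shortened or a scalar tail is peeled:

  /* Only sp[2*j] is used each iteration.  When the loop is vectorized
     with full-width contiguous loads, the final load covers elements
     past the last one actually used and can read off the end of the
     array backing sp.  */
  void
  foo (int *restrict dst, int *restrict sp, int n)
  {
    for (int j = 0; j < n; j++)
      dst[j] = sp[2 * j];
  }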

Bootstrapped and tested on x86_64-unknown-linux-gnu.  I'm going to
wait for the arm/RISC-V CI.

Thanks,
Richard.

        PR tree-optimization/115895
        * tree-vect-stmts.cc (get_group_load_store_type): When we
        might overrun because the group size is not a multiple of the
        vector size, do not use loop masking since that does not
        implement the required load shortening.

        * gcc.target/i386/vect-pr115895.c: New testcase.
---
 gcc/testsuite/gcc.target/i386/vect-pr115895.c | 65 +++++++++++++++++++
 gcc/tree-vect-stmts.cc                        | 24 +++++--
 2 files changed, 84 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/vect-pr115895.c

diff --git a/gcc/testsuite/gcc.target/i386/vect-pr115895.c b/gcc/testsuite/gcc.target/i386/vect-pr115895.c
new file mode 100644
index 00000000000..2246c66d37e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-pr115895.c
@@ -0,0 +1,65 @@
+/* For some targets we end up vectorizing the loop below such that a single
+   integer from `sp` is loaded into a 4-integer vector.
+   While the writes are all safe, without 2 scalar iterations being peeled into
+   the epilogue we would read past the end of the 31-integer array.  This
+   happens because we load a 4-integer chunk but only use the first integer and
+   increment by 2 integers at a time, hence the last load needs s[30-33] and
+   the penultimate load needs s[28-31].
+   This testcase ensures that we do not crash due to that behaviour.  */
+/* { dg-do run } */
+/* { dg-options "-std=gnu17 -O2 -ftree-vectorize -fno-vect-cost-model --param vect-partial-vector-usage=2 -mavx512bw -mprefer-vector-width=512" } */
+/* { dg-require-effective-target mmap } */
+#include <sys/mman.h>
+#include <stdio.h>
+
+#define MMAP_SIZE 0x20000
+#define ADDRESS 0x1122000000
+
+#define MB_BLOCK_SIZE 16
+#define VERT_PRED_16 0
+#define HOR_PRED_16 1
+#define DC_PRED_16 2
+int *sptr;
+extern void intrapred_luma_16x16();
+unsigned short mprr_2[5][16][16];
+void initialise_s(int *s) { }
+int main_1() {
+    void *s_mapping;
+    void *end_s;
+    s_mapping = mmap ((void *)ADDRESS, MMAP_SIZE, PROT_READ | PROT_WRITE,
+                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    if (s_mapping == MAP_FAILED)
+      {
+       perror ("mmap");
+       return 1;
+      }
+    end_s = (s_mapping + MMAP_SIZE);
+    sptr = (int*)(end_s - sizeof(int[31]));
+    intrapred_luma_16x16(sptr);
+    return 0;
+}
+
+void intrapred_luma_16x16(int * restrict sp) {
+    for (int j=0; j < MB_BLOCK_SIZE; j++)
+      {
+       mprr_2[VERT_PRED_16][j][0]=sp[j*2];
+       mprr_2[VERT_PRED_16][j][1]=sp[j*2];
+       mprr_2[VERT_PRED_16][j][2]=sp[j*2];
+       mprr_2[VERT_PRED_16][j][3]=sp[j*2];
+       mprr_2[VERT_PRED_16][j][4]=sp[j*2];
+       mprr_2[VERT_PRED_16][j][5]=sp[j*2];
+       mprr_2[VERT_PRED_16][j][6]=sp[j*2];
+       mprr_2[VERT_PRED_16][j][7]=sp[j*2];
+       mprr_2[VERT_PRED_16][j][8]=sp[j*2];
+       mprr_2[VERT_PRED_16][j][9]=sp[j*2];
+       mprr_2[VERT_PRED_16][j][10]=sp[j*2];
+       mprr_2[VERT_PRED_16][j][11]=sp[j*2];
+       mprr_2[VERT_PRED_16][j][12]=sp[j*2];
+       mprr_2[VERT_PRED_16][j][13]=sp[j*2];
+       mprr_2[VERT_PRED_16][j][14]=sp[j*2];
+       mprr_2[VERT_PRED_16][j][15]=sp[j*2];
+      }
+}
+
+#define DO_TEST main_1
+#include "avx512-check.h"
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 0c0f999d3e3..b5dd1a2e40f 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2216,13 +2216,14 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
 
             If there is a combination of the access not covering the full
             vector and a gap recorded then we may need to peel twice.  */
+         bool large_vector_overrun_p = false;
          if (loop_vinfo
              && (*memory_access_type == VMAT_CONTIGUOUS
                  || *memory_access_type == VMAT_CONTIGUOUS_REVERSE)
              && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
              && !multiple_p (group_size * LOOP_VINFO_VECT_FACTOR (loop_vinfo),
                              nunits))
-           overrun_p = true;
+           large_vector_overrun_p = overrun_p = true;
 
          /* If the gap splits the vector in half and the target
             can do half-vector operations avoid the epilogue peeling
@@ -2273,7 +2274,8 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
                 access and that is sufficiently small to be covered
                 by the single scalar iteration.  */
              unsigned HOST_WIDE_INT cnunits, cvf, cremain, cpart_size;
-             if (!nunits.is_constant (&cnunits)
+             if (masked_p
+                 || !nunits.is_constant (&cnunits)
                  || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&cvf)
                  || (((cremain = (group_size * cvf - gap) % cnunits), true)
                      && ((cpart_size = (1 << ceil_log2 (cremain))), true)
@@ -2282,9 +2284,11 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
                               (vectype, cnunits / cpart_size,
                                &half_vtype) == NULL_TREE)))
                {
-                 /* If all fails we can still resort to niter masking, so
-                    enforce the use of partial vectors.  */
-                 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
+                 /* If all fails we can still resort to niter masking unless
+                    the vectors used are too big, so enforce the use of
+                    partial vectors.  */
+                 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+                     && !large_vector_overrun_p)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -2302,6 +2306,16 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
                      return false;
                    }
                }
+             else if (large_vector_overrun_p)
+               {
+                 if (dump_enabled_p ())
+                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                    "can't operate on partial vectors because "
+                                    "only unmasked loads handle access "
+                                    "shortening required because of gaps at "
+                                    "the end of the access\n");
+                 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+               }
            }
        }
     }
-- 
2.43.0
