Richard Biener <rguent...@suse.de> writes:
> The following addresses the fact that with loop masking (or regular
> mask loads) we do not implement load shortening but we override
> the case where we need that for correctness.  Likewise when we
> attempt to use loop masking to handle large trailing gaps we cannot
> do so when there's this overrun case.
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu.  I'm going to
> wait for the arm/risc-v CI.
>
> Thanks,
> Richard.
>
>         PR tree-optimization/115895
>         * tree-vect-stmts.cc (get_group_load_store_type): When we
>         might overrun because the group size is not a multiple of the
>         vector size we cannot use loop masking since that does not
>         implement the required load shortening.
>
>         * gcc.target/i386/vect-pr115895.c: New testcase.
LGTM FWIW

Richard

> ---
>  gcc/testsuite/gcc.target/i386/vect-pr115895.c | 65 +++++++++++++++++++
>  gcc/tree-vect-stmts.cc                        | 24 +++++--
>  2 files changed, 84 insertions(+), 5 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/vect-pr115895.c
>
> diff --git a/gcc/testsuite/gcc.target/i386/vect-pr115895.c b/gcc/testsuite/gcc.target/i386/vect-pr115895.c
> new file mode 100644
> index 00000000000..2246c66d37e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/vect-pr115895.c
> @@ -0,0 +1,65 @@
> +/* For some targets we end up vectorizing the below loop such that the `sp`
> +   single integer is loaded into a 4 integer vector.
> +   While the writes are all safe, without 2 scalar loops being peeled into the
> +   epilogue we would read past the end of the 31 integer array.  This happens
> +   because we load a 4 integer chunk to only use the first integer and
> +   increment by 2 integers at a time, hence the last load needs s[30-33] and
> +   the penultimate load needs s[28-31].
> +   This testcase ensures that we do not crash due to that behaviour.  */
> +/* { dg-do run } */
> +/* { dg-options "-std=gnu17 -O2 -ftree-vectorize -fno-vect-cost-model --param vect-partial-vector-usage=2 -mavx512bw -mprefer-vector-width=512" } */
> +/* { dg-require-effective-target mmap } */
> +#include <sys/mman.h>
> +#include <stdio.h>
> +
> +#define MMAP_SIZE 0x20000
> +#define ADDRESS 0x1122000000
> +
> +#define MB_BLOCK_SIZE 16
> +#define VERT_PRED_16 0
> +#define HOR_PRED_16 1
> +#define DC_PRED_16 2
> +int *sptr;
> +extern void intrapred_luma_16x16();
> +unsigned short mprr_2[5][16][16];
> +void initialise_s(int *s) { }
> +int main_1() {
> +  void *s_mapping;
> +  void *end_s;
> +  s_mapping = mmap ((void *)ADDRESS, MMAP_SIZE, PROT_READ | PROT_WRITE,
> +                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> +  if (s_mapping == MAP_FAILED)
> +    {
> +      perror ("mmap");
> +      return 1;
> +    }
> +  end_s = (s_mapping + MMAP_SIZE);
> +  sptr = (int*)(end_s - sizeof(int[31]));
> +  intrapred_luma_16x16(sptr);
> +  return 0;
> +}
> +
> +void intrapred_luma_16x16(int * restrict sp) {
> +  for (int j=0; j < MB_BLOCK_SIZE; j++)
> +    {
> +      mprr_2[VERT_PRED_16][j][0]=sp[j*2];
> +      mprr_2[VERT_PRED_16][j][1]=sp[j*2];
> +      mprr_2[VERT_PRED_16][j][2]=sp[j*2];
> +      mprr_2[VERT_PRED_16][j][3]=sp[j*2];
> +      mprr_2[VERT_PRED_16][j][4]=sp[j*2];
> +      mprr_2[VERT_PRED_16][j][5]=sp[j*2];
> +      mprr_2[VERT_PRED_16][j][6]=sp[j*2];
> +      mprr_2[VERT_PRED_16][j][7]=sp[j*2];
> +      mprr_2[VERT_PRED_16][j][8]=sp[j*2];
> +      mprr_2[VERT_PRED_16][j][9]=sp[j*2];
> +      mprr_2[VERT_PRED_16][j][10]=sp[j*2];
> +      mprr_2[VERT_PRED_16][j][11]=sp[j*2];
> +      mprr_2[VERT_PRED_16][j][12]=sp[j*2];
> +      mprr_2[VERT_PRED_16][j][13]=sp[j*2];
> +      mprr_2[VERT_PRED_16][j][14]=sp[j*2];
> +      mprr_2[VERT_PRED_16][j][15]=sp[j*2];
> +    }
> +}
> +
> +#define DO_TEST main_1
> +#include "avx512-check.h"
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 0c0f999d3e3..b5dd1a2e40f 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -2216,13 +2216,14 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
>
>          If there is a combination of the access not covering the full
>          vector and a gap recorded then we may need to peel twice.  */
> +      bool large_vector_overrun_p = false;
>        if (loop_vinfo
>            && (*memory_access_type == VMAT_CONTIGUOUS
>                || *memory_access_type == VMAT_CONTIGUOUS_REVERSE)
>            && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
>            && !multiple_p (group_size * LOOP_VINFO_VECT_FACTOR (loop_vinfo),
>                            nunits))
> -        overrun_p = true;
> +        large_vector_overrun_p = overrun_p = true;
>
>        /* If the gap splits the vector in half and the target
>           can do half-vector operations avoid the epilogue peeling
> @@ -2273,7 +2274,8 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
>              access and that is sufficiently small to be covered
>              by the single scalar iteration.  */
>           unsigned HOST_WIDE_INT cnunits, cvf, cremain, cpart_size;
> -         if (!nunits.is_constant (&cnunits)
> +         if (masked_p
> +             || !nunits.is_constant (&cnunits)
>              || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&cvf)
>              || (((cremain = (group_size * cvf - gap) % cnunits), true)
>                  && ((cpart_size = (1 << ceil_log2 (cremain))), true)
> @@ -2282,9 +2284,11 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
>                       (vectype, cnunits / cpart_size,
>                        &half_vtype) == NULL_TREE)))
>             {
> -             /* If all fails we can still resort to niter masking, so
> -                enforce the use of partial vectors.  */
> -             if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
> +             /* If all fails we can still resort to niter masking unless
> +                the vectors used are too big, so enforce the use of
> +                partial vectors.  */
> +             if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
> +                 && !large_vector_overrun_p)
>                 {
>                   if (dump_enabled_p ())
>                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> @@ -2302,6 +2306,16 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
>                   return false;
>                 }
>             }
> +         else if (large_vector_overrun_p)
> +           {
> +             if (dump_enabled_p ())
> +               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                                "can't operate on partial vectors because "
> +                                "only unmasked loads handle access "
> +                                "shortening required because of gaps at "
> +                                "the end of the access\n");
> +             LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> +           }
>         }
>     }
>  }
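For reference, here is a reduced sketch of the access pattern the testcase
exercises (my own illustration, not part of the patch; the names s, out and
foo are made up, and the array bound and stride follow the comment in the
testcase above):

  /* Only element 0 of each stride-2 group is used, so the scalar loop
     never reads past s[30].  A full-width vector load of the group can
     read beyond the end of the array, and the masked-load path does not
     implement the load shortening that would avoid that overrun, which
     is what the patch addresses.  */
  int s[31];
  int out[16];

  void
  foo (void)
  {
    for (int j = 0; j < 16; j++)
      out[j] = s[j * 2];
  }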
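And just to make the shortening arithmetic in the hunk above concrete, with
purely illustrative numbers (not taken from the PR): for cnunits == 16,
cvf == 8, group_size == 2 and gap == 1 we get
cremain = (2 * 8 - 1) % 16 = 15 and cpart_size = 1 << ceil_log2 (15) = 16,
so the next power-of-two sized partial access is already a full vector,
whereas cremain == 8 would give cpart_size == 8, a half vector.  Either
way, the new masked_p and large_vector_overrun_p checks make sure we only
rely on this shortening on the unmasked path.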