commit:     4462db3ded9dc8bf07f321ac7d538e311e0b79ac
Author:     Sam James <sam <AT> gentoo <DOT> org>
AuthorDate: Mon Jul 21 14:01:49 2025 +0000
Commit:     Sam James <sam <AT> gentoo <DOT> org>
CommitDate: Mon Jul 21 14:01:49 2025 +0000
URL:        https://gitweb.gentoo.org/proj/gcc-patches.git/commit/?id=4462db3d

16.0.0: add vect fixes

Bug: https://bugs.gentoo.org/959698
Bug: https://gcc.gnu.org/PR121190
Bug: https://gcc.gnu.org/PR121020
Signed-off-by: Sam James <sam <AT> gentoo.org>

 ...sufficient-alignment-requirement-for-spec.patch | 181 +++++++++++++++++++++
 ...ssing-skip-vector-check-for-peeling-with-.patch | 142 ++++++++++++++++
 16.0.0/gentoo/README.history                       |   5 +
 3 files changed, 328 insertions(+)

diff --git a/16.0.0/gentoo/86_all_PR121190-vect-Fix-insufficient-alignment-requirement-for-spec.patch b/16.0.0/gentoo/86_all_PR121190-vect-Fix-insufficient-alignment-requirement-for-spec.patch
new file mode 100644
index 0000000..9a03cdd
--- /dev/null
+++ b/16.0.0/gentoo/86_all_PR121190-vect-Fix-insufficient-alignment-requirement-for-spec.patch
@@ -0,0 +1,181 @@
+https://bugs.gentoo.org/959698
+https://inbox.sourceware.org/gcc-patches/[email protected]/T/#u
+
+From e120f6cb794a4d104b37913c918aabe0ae6b2c64 Mon Sep 17 00:00:00 2001
+Message-ID: <e120f6cb794a4d104b37913c918aabe0ae6b2c64.1753106388.git....@gentoo.org>
+From: Pengfei Li <[email protected]>
+Date: Mon, 21 Jul 2025 11:02:32 +0000
+Subject: [PATCH 1/2] vect: Fix insufficient alignment requirement for
+ speculative loads [PR121190]
+
+This patch fixes a segmentation fault issue that can occur in vectorized
+loops with an early break. When GCC vectorizes such loops, it may insert
+a versioning check to ensure that data references (DRs) with speculative
+loads are aligned. The check normally requires DRs to be aligned to the
+vector mode size, which prevents generated vector load instructions from
+crossing page boundaries.
+
+However, this is not sufficient when a single scalar load is vectorized
+into multiple loads within the same iteration. In such cases, even if
+none of the vector loads crosses page boundaries, subsequent loads after
+the first one may still access memory beyond the current valid page.
+
+Consider the following loop as an example:
+
+       while (i < MAX_COMPARE) {
+         if (*(p + i) != *(q + i))
+           return i;
+         i++;
+       }
+
+When compiled with "-O3 -march=znver2" on x86, the vectorized loop may
+include instructions like:
+
+       vmovdqa (%rcx,%rax), %ymm0
+       vmovdqa 32(%rcx,%rax), %ymm1
+       vpcmpeqq (%rdx,%rax), %ymm0, %ymm0
+       vpcmpeqq 32(%rdx,%rax), %ymm1, %ymm1
+
+Note two speculative vector loads are generated for each DR (p and q).
+The first vmovdqa and vpcmpeqq are safe due to the vector size (32-byte)
+alignment, but the following ones (at offset 32) may not be safe because
+they could read from the beginning of the next memory page, potentially
+leading to segmentation faults.
+
+To avoid the issue, this patch increases the alignment requirement for
+speculative loads to DR_TARGET_ALIGNMENT. It ensures all vector loads in
+the same vector iteration access memory within the same page.
+
+This patch is bootstrapped and regression-tested on x86_64-linux-gnu,
+arm-linux-gnueabihf and aarch64-linux-gnu.
+
+       PR tree-optimization/121190
+
+gcc/ChangeLog:
+
+       * tree-vect-data-refs.cc (vect_enhance_data_refs_alignment):
+         Increase alignment requirement for speculative loads.
+
+gcc/testsuite/ChangeLog:
+
+       * gcc.dg/vect/vect-early-break_52.c: Update an unsafe test.
+       * gcc.dg/vect/vect-early-break_137-pr121190.c: New test.
+---
+ .../vect/vect-early-break_137-pr121190.c      | 60 +++++++++++++++++++
+ .../gcc.dg/vect/vect-early-break_52.c         |  2 +-
+ gcc/tree-vect-data-refs.cc                    | 15 ++++-
+ 3 files changed, 75 insertions(+), 2 deletions(-)
+ create mode 100644 gcc/testsuite/gcc.dg/vect/vect-early-break_137-pr121190.c
+
+diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_137-pr121190.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_137-pr121190.c
+new file mode 100644
+index 000000000000..da11146c578e
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_137-pr121190.c
+@@ -0,0 +1,60 @@
++/* PR tree-optimization/121190 */
++/* { dg-do run } */
++/* { dg-options "-O3" } */
++/* { dg-additional-options "-march=znver2" { target x86_64-*-* i?86-*-* } } */
++/* { dg-require-effective-target mmap } */
++/* { dg-require-effective-target vect_early_break } */
++
++#include <stdint.h>
++#include <string.h>
++#include <stdio.h>
++#include <sys/mman.h>
++#include <unistd.h>
++
++#define MAX_COMPARE 5000
++
++__attribute__((noipa))
++int diff (uint64_t *restrict p, uint64_t *restrict q)
++{
++  int i = 0;
++  while (i < MAX_COMPARE) {
++    if (*(p + i) != *(q + i))
++      return i;
++    i++;
++  }
++  return -1;
++}
++
++int main ()
++{
++  long pgsz = sysconf (_SC_PAGESIZE);
++  if (pgsz == -1) {
++    fprintf (stderr, "sysconf failed\n");
++    return 0;
++  }
++
++  /* Allocate 2 consecutive pages of memory and let p1 and p2 point to the
++     beginning of each.  */
++  void *mem = mmap (NULL, pgsz * 2, PROT_READ | PROT_WRITE,
++                  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
++  if (mem == MAP_FAILED) {
++    fprintf (stderr, "mmap failed\n");
++    return 0;
++  }
++  uint64_t *p1 = (uint64_t *) mem;
++  uint64_t *p2 = (uint64_t *) mem + pgsz / sizeof (uint64_t);
++
++  /* Fill the first page with zeros, except for its last 64 bits.  */
++  memset (p1, 0, pgsz);
++  *(p2 - 1) = -1;
++
++  /* Make the 2nd page not accessible.  */
++  mprotect (p2, pgsz, PROT_NONE);
++
++  /* Calls to diff should not read the 2nd page.  */
++  for (int i = 1; i <= 20; i++) {
++    if (diff (p2 - i, p1) != i - 1)
++      __builtin_abort ();
++  }
++}
++
+diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_52.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_52.c
+index 86a632f2a822..6abfcd6580e4 100644
+--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_52.c
++++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_52.c
+@@ -18,4 +18,4 @@ int main1 (short X)
+     }
+ }
+ 
+-/* { dg-final { scan-tree-dump "vectorized 1 loops in function" "vect" { target { ! "x86_64-*-* i?86-*-*" } } } } */
++/* { dg-final { scan-tree-dump "vectorized 1 loops in function" "vect" { target { ! "x86_64-*-* i?86-*-* arm*-*-*" } } } } */
+diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
+index a24ddfbc3841..24048086857f 100644
+--- a/gcc/tree-vect-data-refs.cc
++++ b/gcc/tree-vect-data-refs.cc
+@@ -2964,12 +2964,25 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
+                   break;
+                 }
+ 
++            /* Normally, we require DRs to be aligned to the vector mode size.
++               However, this is not sufficient when a statement involves safe
++               speculative read.  In such cases, a single scalar load can be
++               vectorized into multiple vector loads in one loop iteration.
++               Even if the first vector load is safe, subsequent loads might
++               still access an invalid memory page.  We increase the alignment
++               requirement to prevent this.  */
++            poly_uint64 required_align_size;
++            if (dr_safe_speculative_read_required (stmt_info))
++              required_align_size = DR_TARGET_ALIGNMENT (dr_info);
++            else
++              required_align_size = GET_MODE_SIZE (TYPE_MODE (vectype));
++
+             /* At present we don't support versioning for alignment
+                with variable VF, since there's no guarantee that the
+                VF is a power of two.  We could relax this if we added
+                a way of enforcing a power-of-two size.  */
+             unsigned HOST_WIDE_INT size;
+-            if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
++            if (!required_align_size.is_constant (&size))
+               {
+                 do_versioning = false;
+                 break;
+
+base-commit: b441d735c092f5d60c4a9c7167ed9153003d49fa
+-- 
+2.50.1
+

diff --git a/16.0.0/gentoo/87_all_PR121020-vect-Add-missing-skip-vector-check-for-peeling-with-.patch b/16.0.0/gentoo/87_all_PR121020-vect-Add-missing-skip-vector-check-for-peeling-with-.patch
new file mode 100644
index 0000000..5aae3a5
--- /dev/null
+++ b/16.0.0/gentoo/87_all_PR121020-vect-Add-missing-skip-vector-check-for-peeling-with-.patch
@@ -0,0 +1,142 @@
+https://bugs.gentoo.org/959698
+https://inbox.sourceware.org/gcc-patches/[email protected]/T/#u
+
+From f66323025c47ba09cee296a8a638cfe63d1bdad3 Mon Sep 17 00:00:00 2001
+Message-ID: <f66323025c47ba09cee296a8a638cfe63d1bdad3.1753106388.git....@gentoo.org>
+In-Reply-To: <e120f6cb794a4d104b37913c918aabe0ae6b2c64.1753106388.git....@gentoo.org>
+References: <e120f6cb794a4d104b37913c918aabe0ae6b2c64.1753106388.git....@gentoo.org>
+From: Pengfei Li <[email protected]>
+Date: Mon, 21 Jul 2025 11:06:42 +0000
+Subject: [PATCH 2/2] vect: Add missing skip-vector check for peeling with
+ versioning [PR121020]
+
+This fixes a miscompilation issue introduced by the enablement of
+combined loop peeling and versioning. A test case that reproduces the
+issue is included in the patch.
+
+When performing loop peeling, GCC usually inserts a skip-vector check.
+This ensures that after peeling, there are enough remaining iterations
+to enter the main vectorized loop. Previously, the check was omitted if
+loop versioning for alignment was applied. It was safe before because
+versioning and peeling for alignment were mutually exclusive.
+
+However, with combined peeling and versioning enabled, this is not safe
+any more. A loop may be peeled and versioned at the same time. Without
+the skip-vector check, the main vectorized loop can be entered even if
+its iteration count is zero. This can cause the loop to run many more
+iterations than needed, resulting in incorrect results.
+
+To fix this, the patch updates the condition of omitting the skip-vector
+check to when versioning is performed alone without peeling.
+
+This patch is bootstrapped and regression-tested on x86_64-linux-gnu,
+arm-linux-gnueabihf and aarch64-linux-gnu.
+
+       PR tree-optimization/121020
+
+gcc/ChangeLog:
+
+       * tree-vect-loop-manip.cc (vect_do_peeling): Update the
+         condition of omitting the skip-vector check.
+       * tree-vectorizer.h (LOOP_VINFO_USE_VERSIONING_WITHOUT_PEELING):
+         Add a helper macro.
+
+gcc/testsuite/ChangeLog:
+
+       * gcc.dg/vect/vect-early-break_138-pr121020.c: New test.
+---
+ .../vect/vect-early-break_138-pr121020.c      | 52 +++++++++++++++++++
+ gcc/tree-vect-loop-manip.cc                   |  2 +-
+ gcc/tree-vectorizer.h                         |  4 ++
+ 3 files changed, 57 insertions(+), 1 deletion(-)
+ create mode 100644 gcc/testsuite/gcc.dg/vect/vect-early-break_138-pr121020.c
+
+diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_138-pr121020.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_138-pr121020.c
+new file mode 100644
+index 000000000000..86661e445a83
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_138-pr121020.c
+@@ -0,0 +1,52 @@
++/* PR tree-optimization/121020 */
++/* { dg-do run } */
++/* { dg-options "-O3 --vect-cost-model=unlimited" } */
++/* { dg-additional-options "-march=znver2" { target x86_64-*-* i?86-*-* } } */
++/* { dg-require-effective-target mmap } */
++/* { dg-require-effective-target vect_early_break } */
++
++#include <stdint.h>
++#include <stdio.h>
++#include <sys/mman.h>
++#include <unistd.h>
++
++__attribute__((noipa))
++bool equal (uint64_t *restrict p, uint64_t *restrict q, int length)
++{
++  for (int i = 0; i < length; i++) {
++    if (*(p + i) != *(q + i))
++      return false;
++  }
++  return true;
++}
++
++int main ()
++{
++  long pgsz = sysconf (_SC_PAGESIZE);
++  if (pgsz == -1) {
++    fprintf (stderr, "sysconf failed\n");
++    return 0;
++  }
++
++  /* Allocate a whole page of memory.  */
++  void *mem = mmap (NULL, pgsz, PROT_READ | PROT_WRITE,
++                  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
++  if (mem == MAP_FAILED) {
++    fprintf (stderr, "mmap failed\n");
++    return 0;
++  }
++  uint64_t *p1 = (uint64_t *) mem;
++  uint64_t *p2 = (uint64_t *) mem + 32;
++
++  /* The first 16 elements pointed to by p1 and p2 are the same.  */
++  for (int i = 0; i < 32; i++) {
++    *(p1 + i) = 0;
++    *(p2 + i) = (i < 16 ? 0 : -1);
++  }
++
++  /* All calls to equal should return true.  */
++  for (int len = 0; len < 16; len++) {
++    if (!equal (p1 + 1, p2 + 1, len))
++      __builtin_abort();
++  }
++}
+diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
+index 2d01a4b0ed1c..7fcbc1ad2eb8 100644
+--- a/gcc/tree-vect-loop-manip.cc
++++ b/gcc/tree-vect-loop-manip.cc
+@@ -3295,7 +3295,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
+   bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+                     ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
+                                 bound_prolog + bound_epilog)
+-                    : (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
++                    : (!LOOP_VINFO_USE_VERSIONING_WITHOUT_PEELING (loop_vinfo)
+                        || vect_epilogues));
+ 
+   /* Epilog loop must be executed if the number of iterations for epilog
+diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
+index 80f8853733de..69428f1747cb 100644
+--- a/gcc/tree-vectorizer.h
++++ b/gcc/tree-vectorizer.h
+@@ -1168,6 +1168,10 @@ public:
+    || LOOP_REQUIRES_VERSIONING_FOR_NITERS (L)         \
+    || LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (L))
+ 
++#define LOOP_VINFO_USE_VERSIONING_WITHOUT_PEELING(L)  \
++  ((L)->may_misalign_stmts.length () > 0              \
++   && !LOOP_VINFO_ALLOW_MUTUAL_ALIGNMENT (L))
++
+ #define LOOP_VINFO_NITERS_KNOWN_P(L)          \
+   (tree_fits_shwi_p ((L)->num_iters) && tree_to_shwi ((L)->num_iters) > 0)
+ 
+-- 
+2.50.1
+

diff --git a/16.0.0/gentoo/README.history b/16.0.0/gentoo/README.history
index d0f985b..7020432 100644
--- a/16.0.0/gentoo/README.history
+++ b/16.0.0/gentoo/README.history
@@ -1,3 +1,8 @@
+8      ????
+
+       + 86_all_PR121190-vect-Fix-insufficient-alignment-requirement-for-spec.patch
+       + 87_all_PR121020-vect-Add-missing-skip-vector-check-for-peeling-with-.patch
+
 7      21 July 2025
 
        - 86_all_PR120881-x86-64-Add-enable-x86-64-mfentry.patch

Reply via email to