When generating a SUBREG from V16QI to V2HF, validate_subreg fails since
the V2HF size (4 bytes) is smaller than its natural size (word size).
Update remove_redundant_vector_load to skip instructions whose
destination mode size is smaller than its natural size.

gcc/

	PR target/120036
	* config/i386/i386-features.cc (remove_redundant_vector_load):
	Also skip if the mode size is smaller than its natural size.

gcc/testsuite/

	PR target/120036
	* g++.target/i386/pr120036.C: New test.

-- 
H.J.
From 6bfacf6014965d3ec498620dd9951efca9ad6015 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.to...@gmail.com>
Date: Thu, 1 May 2025 06:30:41 +0800
Subject: [PATCH] x86: Skip if the mode size is smaller than its natural size

When generating a SUBREG from V16QI to V2HF, validate_subreg fails since
the V2HF size (4 bytes) is smaller than its natural size (word size).
Update remove_redundant_vector_load to skip instructions whose
destination mode size is smaller than its natural size.

gcc/

	PR target/120036
	* config/i386/i386-features.cc (remove_redundant_vector_load):
	Also skip if the mode size is smaller than its natural size.

gcc/testsuite/

	PR target/120036
	* g++.target/i386/pr120036.C: New test.

Signed-off-by: H.J. Lu <hjl.to...@gmail.com>
---
 gcc/config/i386/i386-features.cc         |   7 +-
 gcc/testsuite/g++.target/i386/pr120036.C | 113 +++++++++++++++++++++++
 2 files changed, 118 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/g++.target/i386/pr120036.C

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index 31f3ee2ef17..8e12ca88f7a 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -3395,8 +3395,11 @@ remove_redundant_vector_load (void)
 
 	  rtx dest = SET_DEST (set);
 	  machine_mode mode = GET_MODE (dest);
-	  /* Skip non-vector instruction.  */
-	  if (!VECTOR_MODE_P (mode))
+	  /* Skip non-vector instruction.  Also skip if the mode size is
+	     smaller than its natural size to avoid validate_subreg
+	     failure.  */
+	  if (!VECTOR_MODE_P (mode)
+	      || GET_MODE_SIZE (mode) < ix86_regmode_natural_size (mode))
 	    continue;
 
 	  rtx src = SET_SRC (set);
diff --git a/gcc/testsuite/g++.target/i386/pr120036.C b/gcc/testsuite/g++.target/i386/pr120036.C
new file mode 100644
index 00000000000..a2fc24f1286
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr120036.C
@@ -0,0 +1,113 @@
+/* { dg-do compile { target fpic } } */
+/* { dg-options "-O2 -std=c++11 -march=sapphirerapids -fPIC" } */
+
+typedef _Float16 Native;
+struct float16_t
+{
+  Native native;
+  float16_t ();
+  float16_t (Native arg) : native (arg) {}
+  operator Native ();
+  float16_t
+  operator+ (float16_t rhs)
+  {
+    return native + rhs.native;
+  }
+  float16_t
+  operator* (float16_t)
+  {
+    return native * native;
+  }
+};
+template <int N> struct Simd
+{
+  static constexpr int kPrivateLanes = N;
+};
+template <int N> struct ClampNAndPow2
+{
+  using type = Simd<N>;
+};
+template <int kLimit> struct CappedTagChecker
+{
+  static constexpr int N = sizeof (int) ? kLimit : 0;
+  using type = typename ClampNAndPow2<N>::type;
+};
+template <typename, int kLimit, int>
+using CappedTag = typename CappedTagChecker<kLimit>::type;
+template <class D>
+int
+Lanes (D)
+{
+  return D::kPrivateLanes;
+}
+template <class D> int Zero (D);
+template <class D> using VFromD = decltype (Zero (D ()));
+struct Vec512
+{
+  __attribute__ ((__vector_size__ (16))) _Float16 raw;
+};
+Vec512 Zero (Simd<2>);
+template <class D> void ReduceSum (D, VFromD<D>);
+struct Dot
+{
+  template <int, class D, typename T>
+  static T
+  Compute (D d, T *pa, int num_elements)
+  {
+    T *pb;
+    int N = Lanes (d), i = 0;
+    if (__builtin_expect (num_elements < N, 0))
+      {
+        T sum0 = 0, sum1 = 0;
+        for (; i + 2 <= num_elements; i += 2)
+          {
+            float16_t __trans_tmp_6 = pa[i] * pb[i],
+                      __trans_tmp_5 = sum0 + __trans_tmp_6,
+                      __trans_tmp_8 = pa[i + 1] * pb[1],
+                      __trans_tmp_7 = sum1 + __trans_tmp_8;
+            sum0 = __trans_tmp_5;
+            sum1 = __trans_tmp_7;
+          }
+        float16_t __trans_tmp_9 = sum0 + sum1;
+        return __trans_tmp_9;
+      }
+    decltype (Zero (d)) sum0;
+    ReduceSum (d, sum0);
+    __builtin_trap ();
+  }
+};
+template <int kMul, class Test, int kPow2> struct ForeachCappedR
+{
+  static void
+  Do (int min_lanes, int max_lanes)
+  {
+    CappedTag<int, kMul, kPow2> d;
+    Test () (int (), d);
+    ForeachCappedR<kMul / 2, Test, kPow2>::Do (min_lanes, max_lanes);
+  }
+};
+template <class Test, int kPow2> struct ForeachCappedR<0, Test, kPow2>
+{
+  static void Do (int, int);
+};
+struct TestDot
+{
+  template <class T, class D>
+  void
+  operator() (T, D d)
+  {
+    int counts[]{ 1, 3 };
+    for (int num : counts)
+      {
+        float16_t a;
+        T __trans_tmp_4 = Dot::Compute<0> (d, &a, num);
+      }
+  }
+};
+int DotTest_TestAllDot_TestTestBody_max_lanes;
+void
+DotTest_TestAllDot_TestTestBody ()
+{
+  ForeachCappedR<64, TestDot, 0>::Do (
+      1, DotTest_TestAllDot_TestTestBody_max_lanes);
+}
-- 
2.49.0

Reply via email to