[AMD Official Use Only - AMD Internal Distribution Only]

> -----Original Message-----
> From: Jan Hubicka <hubi...@ucw.cz>
> Sent: Wednesday, May 28, 2025 3:52 PM
> To: Gorantla, Pranav <pranav.goran...@amd.com>
> Cc: gcc-patches@gcc.gnu.org; Kumar, Venkataramanan
> <venkataramanan.ku...@amd.com>; ubiz...@gmail.com
> Subject: Re: [PATCH] i386: Use Shuffles instead of shifts for Reduction in AMD
> znver4/5
>
> Caution: This message originated from an External Source. Use proper caution
> when opening attachments, clicking links, or responding.
>
>
> > gcc/ChangeLog:
> >
> >       * config/i386/i386-expand.cc (emit_reduc_half): Use shuffles to
> >       generate reduc half for V4SI, similar modes.
> >       * config/i386/i386.h (TARGET_SSE_REDUCTION_PREFER_PSHUF): New
> Macro.
> >       * config/i386/x86-tune.def
> (X86_TUNE_SSE_REDUCTION_PREFER_PSHUF):
> >       New tuning.
> >
> > gcc/testsuite/ChangeLog:
> >
> >       * gcc.target/i386/reduc-pshuf.c: New test.
> > ---
> >  gcc/config/i386/i386-expand.cc              | 28 ++++++++++++++++++---
> >  gcc/config/i386/i386.h                      |  2 ++
> >  gcc/config/i386/x86-tune.def                |  5 ++++
> >  gcc/testsuite/gcc.target/i386/reduc-pshuf.c | 14 +++++++++++
> >  4 files changed, 46 insertions(+), 3 deletions(-)  create mode 100644
> > gcc/testsuite/gcc.target/i386/reduc-pshuf.c
> >
> > diff --git a/gcc/config/i386/i386-expand.cc
> > b/gcc/config/i386/i386-expand.cc index 7fd03c88630..c7aec716a55 100644
> > --- a/gcc/config/i386/i386-expand.cc
> > +++ b/gcc/config/i386/i386-expand.cc
> > @@ -18724,9 +18724,31 @@ emit_reduc_half (rtx dest, rtx src, int i)
> >      case E_V8HFmode:
> >      case E_V4SImode:
> >      case E_V2DImode:
> > -      d = gen_reg_rtx (V1TImode);
> > -      tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
> > -                             GEN_INT (i / 2));
> > +      if (TARGET_SSE_REDUCTION_PREFER_PSHUF) {
> > +        if (i == 128) {
> > +          d = gen_reg_rtx(V4SImode);
> > +          tem = gen_sse2_pshufd_1(
> > +              d, force_reg(V4SImode, gen_lowpart(V4SImode, src)), 
> > GEN_INT(2),
> > +              GEN_INT(3), GEN_INT(2), GEN_INT(3));
> > +        } else if (i == 64) {
> > +          d = gen_reg_rtx(V4SImode);
> > +          tem = gen_sse2_pshufd_1(
> > +              d, force_reg(V4SImode, gen_lowpart(V4SImode, src)), 
> > GEN_INT(1),
> > +              GEN_INT(1), GEN_INT(1), GEN_INT(1));
> > +        } else if (i == 32) {
> > +          d = gen_reg_rtx(V8HImode);
> > +          tem = gen_sse2_pshuflw_1(
> > +              d, force_reg(V8HImode, gen_lowpart(V8HImode, src)), 
> > GEN_INT(1),
> > +              GEN_INT(1), GEN_INT(1), GEN_INT(1));
> > +        } else {
> > +          d = gen_reg_rtx(V1TImode);
> > +          tem =
> > +              gen_sse2_lshrv1ti3(d, gen_lowpart(V1TImode, src), GEN_INT(i 
> > / 2));
> > +        }
> > +      } else {
> > +        d = gen_reg_rtx(V1TImode);
> > +        tem = gen_sse2_lshrv1ti3(d, gen_lowpart(V1TImode, src),
> > + GEN_INT(i / 2));
>
> Instead of duplicating gen_sse2_lshrv1ti3 it is probably cleaner to simply
> break after each gen_sse2_pshuf* call and remove the else.
>
> OK with that change
> Honza
Updated the patch as suggested. Could you please commit the patch on my behalf,
as I don't have write permission?

Thanks
Pranav


>From 5070975a014dbfd0ca8ccb279a50d2266c2a6a18 Mon Sep 17 00:00:00 2001
From: Pranav Gorantla <pranav.goran...@amd.com>
Date: Wed, 28 May 2025 10:05:46 +0530
Subject: [PATCH v1] i386: Use Shuffles instead of shifts for Reduction in AMD
 znver4/5

On AMD znver4 and znver5 targets, vpshufd has a latency of 1 and a
throughput of 4 (2 on znver4), while vpsrldq has a latency of 2 and a
throughput of 2.  It is therefore better to generate shuffles instead of
shifts wherever possible.  This patch generates the appropriate shuffle
instruction to copy the upper half to the lower half, instead of a simple
right shift, during horizontal vector reduction.

gcc/ChangeLog:

        * config/i386/i386-expand.cc (emit_reduc_half): Use shuffles to
        generate reduc half for V4SI, similar modes.
        * config/i386/i386.h (TARGET_SSE_REDUCTION_PREFER_PSHUF): New Macro.
        * config/i386/x86-tune.def (X86_TUNE_SSE_REDUCTION_PREFER_PSHUF):
        New tuning.

gcc/testsuite/ChangeLog:

        * gcc.target/i386/reduc-pshuf.c: New test.
---
 gcc/config/i386/i386-expand.cc              | 29 ++++++++++++++++++++-
 gcc/config/i386/i386.h                      |  2 ++
 gcc/config/i386/x86-tune.def                |  5 ++++
 gcc/testsuite/gcc.target/i386/reduc-pshuf.c | 14 ++++++++++
 4 files changed, 49 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/reduc-pshuf.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 7fd03c88630..96333c3c18e 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -18724,9 +18724,36 @@ emit_reduc_half (rtx dest, rtx src, int i)
     case E_V8HFmode:
     case E_V4SImode:
     case E_V2DImode:
+      if (TARGET_SSE_REDUCTION_PREFER_PSHUF)
+        {
+          if (i == 128)
+            {
+              d = gen_reg_rtx (V4SImode);
+              tem = gen_sse2_pshufd_1 (
+                  d, force_reg (V4SImode, gen_lowpart (V4SImode, src)),
+                  GEN_INT (2), GEN_INT (3), GEN_INT (2), GEN_INT (3));
+              break;
+            }
+          else if (i == 64)
+            {
+              d = gen_reg_rtx (V4SImode);
+              tem = gen_sse2_pshufd_1 (
+                  d, force_reg (V4SImode, gen_lowpart (V4SImode, src)),
+                  GEN_INT (1), GEN_INT (1), GEN_INT (1), GEN_INT (1));
+              break;
+            }
+          else if (i == 32)
+            {
+              d = gen_reg_rtx (V8HImode);
+              tem = gen_sse2_pshuflw_1 (
+                  d, force_reg (V8HImode, gen_lowpart (V8HImode, src)),
+                  GEN_INT (1), GEN_INT (1), GEN_INT (1), GEN_INT (1));
+              break;
+            }
+        }
       d = gen_reg_rtx (V1TImode);
       tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
-                               GEN_INT (i / 2));
+                                GEN_INT (i / 2));
       break;
     case E_V8SFmode:
       if (i == 256)
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 5aa056ff553..ef1700da0e7 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -491,6 +491,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
        ix86_tune_features[X86_TUNE_SSE_MOVCC_USE_BLENDV]
 #define TARGET_ALIGN_TIGHT_LOOPS \
         ix86_tune_features[X86_TUNE_ALIGN_TIGHT_LOOPS]
+#define TARGET_SSE_REDUCTION_PREFER_PSHUF \
+   ix86_tune_features[X86_TUNE_SSE_REDUCTION_PREFER_PSHUF]


 /* Feature tests against the various architecture variations.  */
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index e6044c6032e..f7213de9c48 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -572,6 +572,11 @@ DEF_TUNE (X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD,
 DEF_TUNE (X86_TUNE_SSE_MOVCC_USE_BLENDV,
          "sse_movcc_use_blendv", ~m_CORE_ATOM)

+/* X86_TUNE_SSE_REDUCTION_PREFER_PSHUF: Prefer pshuf to reduce V16QI,
+   V8HI, V8HF, V4SI, V4FI, V2DI modes when lshr is costlier.  */
+DEF_TUNE (X86_TUNE_SSE_REDUCTION_PREFER_PSHUF,
+   "sse_reduction_prefer_pshuf", m_ZNVER4 | m_ZNVER5)
+
 /*****************************************************************************/
 /* AVX instruction selection tuning (some of SSE flags affects AVX, too)     */
 /*****************************************************************************/
diff --git a/gcc/testsuite/gcc.target/i386/reduc-pshuf.c 
b/gcc/testsuite/gcc.target/i386/reduc-pshuf.c
new file mode 100644
index 00000000000..26998afc14c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/reduc-pshuf.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=znver5 " } */
+
+#define N 32
+#define T short
+T foo(T *a) {
+T sum = 0;
+for ( int i = 0 ; i < N ; i++ )
+    sum += a[i];
+return sum;
+}
+
+/* { dg-final { scan-assembler-times "vpsrl" 0 } } */
+/* { dg-final { scan-assembler-times "vpshuf" 3 } } */
--
2.34.1

Attachment: v1-0001-i386-Use-Shuffles-instead-of-shifts-for-Reduction.patch
Description: v1-0001-i386-Use-Shuffles-instead-of-shifts-for-Reduction.patch

Reply via email to