On Tue, Mar 23, 2021 at 09:19:38AM +0100, Richard Biener wrote:
> On Tue, Mar 23, 2021 at 3:41 AM Hongyu Wang <wwwhhhyyy...@gmail.com> wrote:
> >
> > > Hongyue, please collect code size differences on SPEC CPU 2017 and
> > > eembc.
> >
> > Here is code size difference for this patch
> 
> Thanks, nothing too bad although slightly larger impacts than envisioned.
> 

PING.

OK for master branch?

Thanks.

H.J.
 ---
Simplify memcpy and memset inline strategies to avoid branches for
-mtune=generic:

1. With MOVE_RATIO and CLEAR_RATIO == 17, GCC will use integer/vector
   load and store for up to 16 * 16 (256) bytes when the data size is
   fixed and known.
2. Inline only if data size is known to be <= 256.
   a. Use "rep movsb/stosb" with simple code sequence if the data size
      is a constant.
   b. Use loop if data size is not a constant.
3. Use memcpy/memset libray function if data size is unknown or > 256.

With -mtune=generic -O2,

1. On Ice Lake processor,

Performance impacts on SPEC CPU 2017:

500.perlbench_r  0.51%
502.gcc_r        0.55%
505.mcf_r        0.38%
520.omnetpp_r   -0.74%
523.xalancbmk_r -0.35%
525.x264_r       2.99%
531.deepsjeng_r -0.17%
541.leela_r     -0.98%
548.exchange2_r  0.89%
557.xz_r         0.70%
Geomean          0.37%

503.bwaves_r     0.04%
507.cactuBSSN_r -0.01%
508.namd_r      -0.45%
510.parest_r    -0.09%
511.povray_r    -1.37%
519.lbm_r        0.00%
521.wrf_r       -2.56%
526.blender_r   -0.01%
527.cam4_r      -0.05%
538.imagick_r    0.36%
544.nab_r        0.08%
549.fotonik3d_r -0.06%
554.roms_r       0.05%
Geomean         -0.34%

Significant impacts on eembc benchmarks:

eembc/nnet_test      14.85%
eembc/mp2decoddata2  13.57%

2. On Cascadelake processor,

Performance impacts on SPEC CPU 2017:

500.perlbench_r -0.02%
502.gcc_r        0.10%
505.mcf_r       -1.14%
520.omnetpp_r   -0.22%
523.xalancbmk_r  0.21%
525.x264_r       0.94%
531.deepsjeng_r -0.37%
541.leela_r     -0.46%
548.exchange2_r -0.40%
557.xz_r         0.60%
Geomean         -0.08%

503.bwaves_r    -0.50%
507.cactuBSSN_r  0.05%
508.namd_r      -0.02%
510.parest_r     0.09%
511.povray_r    -1.35%
519.lbm_r        0.00%
521.wrf_r       -0.03%
526.blender_r   -0.83%
527.cam4_r       1.23%
538.imagick_r    0.97%
544.nab_r       -0.02%
549.fotonik3d_r -0.12%
554.roms_r       0.55%
Geomean          0.00%

Significant impacts on eembc benchmarks:

eembc/nnet_test      9.90%
eembc/mp2decoddata2  16.42%
eembc/textv2data3   -4.86%
eembc/qos            12.90%

3. On Znver3 processor,

Performance impacts on SPEC CPU 2017:

500.perlbench_r -0.96%
502.gcc_r       -1.06%
505.mcf_r       -0.01%
520.omnetpp_r   -1.45%
523.xalancbmk_r  2.89%
525.x264_r       4.98%
531.deepsjeng_r  0.18%
541.leela_r     -1.54%
548.exchange2_r -1.25%
557.xz_r        -0.01%
Geomean          0.16%

503.bwaves_r     0.04%
507.cactuBSSN_r  0.85%
508.namd_r      -0.13%
510.parest_r     0.39%
511.povray_r     0.00%
519.lbm_r        0.00%
521.wrf_r        0.28%
526.blender_r   -0.10%
527.cam4_r      -0.58%
538.imagick_r    0.69%
544.nab_r       -0.04%
549.fotonik3d_r -0.04%
554.roms_r       0.40%
Geomean          0.15%

Significant impacts on eembc benchmarks:

eembc/aifftr01       13.95%
eembc/idctrn01       8.41%
eembc/nnet_test      30.25%
eembc/mp2decoddata2  5.05%
eembc/textv2data3    6.43%
eembc/qos           -5.79%

Code size differences are:

SPEC CPU 2017

                  difference      w patch      w/o patch
500.perlbench_r     0.051%        1622637      1621805
502.gcc_r           0.039%        6930877      6928141
505.mcf_r           0.098%        16413        16397
520.omnetpp_r       0.083%        1327757      1326653
523.xalancbmk_r     0.001%        3575709      3575677
525.x264_r         -0.067%        769095       769607
531.deepsjeng_r     0.071%        67629        67581
541.leela_r        -3.062%        127629       131661
548.exchange2_r    -0.338%        66141        66365
557.xz_r            0.946%        128061       126861
503.bwaves_r        0.534%        33117        32941
507.cactuBSSN_r     0.004%        2993645      2993517
508.namd_r          0.006%        851677       851629
510.parest_r        0.488%        6741277      6708557
511.povray_r       -0.021%        849290       849466
521.wrf_r           0.022%        29682154     29675530
526.blender_r       0.054%        7544057      7540009
527.cam4_r          0.043%        6102234      6099594
538.imagick_r      -0.015%        1625770      1626010
544.nab_r           0.155%        155453       155213
549.fotonik3d_r     0.000%        351757       351757
554.roms_r          0.041%        735837       735533

eembc

aifftr01            0.762%        14813        14701
aiifft01            0.556%        14477        14397
idctrn01            0.101%        15853        15837
cjpeg-rose7-preset  0.114%        56125        56061
nnet_test          -0.848%        35549        35853
aes                 0.125%        38493        38445
cjpegv2data         0.108%        59213        59149
djpegv2data         0.025%        63821        63805
huffde             -0.104%        30621        30653
mp2decoddata       -0.047%        68285        68317
mp2enf32data1       0.018%        86925        86909
mp2enf32data2       0.018%        89357        89341
mp2enf32data3       0.018%        88253        88237
mp3playerfixeddata  0.103%        46877        46829
ip_pktcheckb1m      0.191%        25213        25165
nat                 0.527%        45757        45517
ospfv2              0.196%        24573        24525
routelookup         0.189%        25389        25341
tcpbulk             0.155%        30925        30877
textv2data          0.055%        29101        29085

gcc/

        * config/i386/x86-tune-costs.h (generic_memcpy): Updated.
        (generic_memset): Likewise.
        (generic_cost): Change CLEAR_RATIO to 17.
        * config/i386/x86-tune.def (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB):
        Add m_GENERIC.

gcc/testsuite/

        * gcc.target/i386/memcpy-strategy-12.c: New test.
        * gcc.target/i386/memcpy-strategy-13.c: Likewise.
        * gcc.target/i386/memset-strategy-10.c: Likewise.
        * gcc.target/i386/memset-strategy-11.c: Likewise.
        * gcc.target/i386/shrink_wrap_1.c: Also pass
        -mmemset-strategy=rep_8byte:-1:align.
        * gcc.target/i386/sw-1.c: Also pass -mstringop-strategy=rep_byte.
---
 gcc/config/i386/x86-tune-costs.h              | 31 ++++++++++++-------
 gcc/config/i386/x86-tune.def                  |  2 +-
 .../gcc.target/i386/memcpy-strategy-12.c      |  9 ++++++
 .../gcc.target/i386/memcpy-strategy-13.c      | 11 +++++++
 .../gcc.target/i386/memset-strategy-10.c      | 11 +++++++
 .../gcc.target/i386/memset-strategy-11.c      |  9 ++++++
 gcc/testsuite/gcc.target/i386/shrink_wrap_1.c |  2 +-
 gcc/testsuite/gcc.target/i386/sw-1.c          |  2 +-
 8 files changed, 63 insertions(+), 14 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-strategy-12.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-strategy-13.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-10.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-11.c

diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index ffe810f2bcb..30e7c3e4261 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -2844,19 +2844,28 @@ struct processor_costs intel_cost = {
   "16",                                        /* Func alignment.  */
 };
 
-/* Generic should produce code tuned for Core-i7 (and newer chips)
-   and btver1 (and newer chips).  */
+/* Generic should produce code tuned for Haswell (and newer chips)
+   and znver1 (and newer chips).  NB: rep_prefix_1_byte is used only
+   for known size.  */
 
 static stringop_algs generic_memcpy[2] = {
-  {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
-             {-1, libcall, false}}},
-  {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
-             {-1, libcall, false}}}};
+  {libcall,
+   {{256, rep_prefix_1_byte, true},
+    {256, loop, false},
+    {-1, libcall, false}}},
+  {libcall,
+   {{256, rep_prefix_1_byte, true},
+    {256, loop, false},
+    {-1, libcall, false}}}};
 static stringop_algs generic_memset[2] = {
-  {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
-             {-1, libcall, false}}},
-  {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
-             {-1, libcall, false}}}};
+  {libcall,
+   {{256, rep_prefix_1_byte, true},
+    {256, loop, false},
+    {-1, libcall, false}}},
+  {libcall,
+   {{256, rep_prefix_1_byte, true},
+    {256, loop, false},
+    {-1, libcall, false}}}};
 static const
 struct processor_costs generic_cost = {
   {
@@ -2913,7 +2922,7 @@ struct processor_costs generic_cost = {
   COSTS_N_INSNS (1),                   /* cost of movzx */
   8,                                   /* "large" insn */
   17,                                  /* MOVE_RATIO */
-  6,                                   /* CLEAR_RATIO */
+  17,                                  /* CLEAR_RATIO */
   {6, 6, 6},                           /* cost of loading integer registers
                                           in QImode, HImode and SImode.
                                           Relative to reg-reg move (2).  */
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 8f55da89c92..a9a023f33f5 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -273,7 +273,7 @@ DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", 
m_386 | m_P4_NOCONA)
    move/set sequences of bytes with known size.  */
 DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB,
          "prefer_known_rep_movsb_stosb",
-         m_SKYLAKE | m_ALDERLAKE | m_CORE_AVX512)
+         m_SKYLAKE | m_ALDERLAKE | m_CORE_AVX512 | m_GENERIC)
 
 /* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of
    compact prologues and epilogues by issuing a misaligned moves.  This
diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-12.c 
b/gcc/testsuite/gcc.target/i386/memcpy-strategy-12.c
new file mode 100644
index 00000000000..e9998b70ab2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-12.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=generic -mno-sse" } */
+/* { dg-final { scan-assembler "rep movsb" } } */
+
+void
+foo (char *dest, char *src)
+{
+  __builtin_memcpy (dest, src, 249);
+}
diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-13.c 
b/gcc/testsuite/gcc.target/i386/memcpy-strategy-13.c
new file mode 100644
index 00000000000..109bd675a51
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-13.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=generic -mno-avx" } */
+/* { dg-final { scan-assembler "jmp\tmemcpy" { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler "call\tmemcpy" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "rep movsb" } } */
+
+void
+foo (char *dest, char *src)
+{
+  __builtin_memcpy (dest, src, 257);
+}
diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-10.c 
b/gcc/testsuite/gcc.target/i386/memset-strategy-10.c
new file mode 100644
index 00000000000..685d6e5a5c2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-strategy-10.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=generic -mno-avx" } */
+/* { dg-final { scan-assembler "jmp\tmemset" { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler "call\tmemset" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "rep stosb" } } */
+
+void
+foo (char *dest)
+{
+  __builtin_memset (dest, 0, 257);
+}
diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-11.c 
b/gcc/testsuite/gcc.target/i386/memset-strategy-11.c
new file mode 100644
index 00000000000..61ee463a8cf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-strategy-11.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=generic -mno-sse" } */
+/* { dg-final { scan-assembler "rep stosb" } } */
+
+void
+foo (char *dest)
+{
+  __builtin_memset (dest, 0, 253);
+}
diff --git a/gcc/testsuite/gcc.target/i386/shrink_wrap_1.c 
b/gcc/testsuite/gcc.target/i386/shrink_wrap_1.c
index 94dadd6cdbd..44fe7d2836e 100644
--- a/gcc/testsuite/gcc.target/i386/shrink_wrap_1.c
+++ b/gcc/testsuite/gcc.target/i386/shrink_wrap_1.c
@@ -1,5 +1,5 @@
 /* { dg-do compile { target { ! ia32 } } } */
-/* { dg-options "-O2 -fdump-rtl-pro_and_epilogue" } */
+/* { dg-options "-O2 -mmemset-strategy=rep_8byte:-1:align 
-fdump-rtl-pro_and_epilogue" } */
 
 enum machine_mode
 {
diff --git a/gcc/testsuite/gcc.target/i386/sw-1.c 
b/gcc/testsuite/gcc.target/i386/sw-1.c
index a9c89fca4ec..234db0e67c2 100644
--- a/gcc/testsuite/gcc.target/i386/sw-1.c
+++ b/gcc/testsuite/gcc.target/i386/sw-1.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -mtune=generic -fshrink-wrap -fdump-rtl-pro_and_epilogue" 
} */
+/* { dg-options "-O2 -mtune=generic -mstringop-strategy=rep_byte -fshrink-wrap 
-fdump-rtl-pro_and_epilogue" } */
 /* { dg-additional-options "-mno-avx" { target ia32 } } */
 /* { dg-skip-if "No shrink-wrapping preformed" { x86_64-*-mingw* } } */
 
-- 
2.31.1

Reply via email to