Hi Segher,

Thanks for the review!

on 2021/8/12 11:10 PM, Segher Boessenkool wrote:
> Hi!
> 
> On Wed, Aug 11, 2021 at 02:56:11PM +0800, Kewen.Lin wrote:
>>      * config/rs6000/rs6000.c (rs6000_builtin_md_vectorized_function): Add
>>      support for some built-in functions vectorized on Power10.
> 
> Say which, not "some" please?
> 

Done.

>> +  machine_mode in_vmode = TYPE_MODE (type_in);
>> +  machine_mode out_vmode = TYPE_MODE (type_out);
>> +
>> +  /* Power10 supported vectorized built-in functions.  */
>> +  if (TARGET_POWER10
>> +      && in_vmode == out_vmode
>> +      && VECTOR_UNIT_ALTIVEC_OR_VSX_P (in_vmode))
>> +    {
>> +      machine_mode exp_mode = DImode;
>> +      machine_mode exp_vmode = V2DImode;
>> +      enum rs6000_builtins vname = RS6000_BUILTIN_COUNT;
> 
> "name"?  This should be "bif" or similar?
> 

Updated; the variable is now called "name".

>> +      switch (fn)
>> +    {
>> +    case MISC_BUILTIN_DIVWE:
>> +    case MISC_BUILTIN_DIVWEU:
>> +      exp_mode = SImode;
>> +      exp_vmode = V4SImode;
>> +      if (fn == MISC_BUILTIN_DIVWE)
>> +        vname = P10V_BUILTIN_DIVES_V4SI;
>> +      else
>> +        vname = P10V_BUILTIN_DIVEU_V4SI;
>> +      break;
>> +    case MISC_BUILTIN_DIVDE:
>> +    case MISC_BUILTIN_DIVDEU:
>> +      if (fn == MISC_BUILTIN_DIVDE)
>> +        vname = P10V_BUILTIN_DIVES_V2DI;
>> +      else
>> +        vname = P10V_BUILTIN_DIVEU_V2DI;
>> +      break;
> 
> All of the above should not be builtin functions really, they are all
> simple arithmetic :-(  They should not be UNSPECs either, on RTL level.
> They can and should be optimised in real code as well.  Oh well.
> 
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.c
>> @@ -0,0 +1,12 @@
>> +/* { dg-do compile } */
>> +/* { dg-require-effective-target lp64 } */
> 
> Please add a comment what this is needed for?  "We scan for dive*d" is
> enough, but without anything, it takes time to figure this out.
> 

Done, same for below requests on lp64 commentary.

>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-2.c
>> @@ -0,0 +1,53 @@
>> +/* { dg-do run } */
>> +/* { dg-require-effective-target lp64 } */
> 
> Same here.  I suppose this uses builtins that do not exist on 32-bit?
> 

Yes, the bifs whose test cases are guarded with lp64 are only supported
in 64-bit environments.

>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-run-1.c
>> @@ -0,0 +1,45 @@
>> +/* { dg-do run } */
>> +/* { dg-require-effective-target lp64 } */
> 
> And another.
> 
>> +#define CHECK(name)                                                           \
>> +  __attribute__ ((optimize (1))) void check_##name ()                         \
> 
> What is the attribute for, btw?  It seems fragile, but perhaps I do not
> understand the intention.
> 
> 

It's to stop the compiler from vectorizing the check functions, since the
point of the test is to compare the results of the scalar and vectorized
versions.

> Okay for trunk with whose lp64 things improved.  Thanks!
> 

Thanks, v2 is attached; it addresses Bill's and your comments.  :)


BR,
Kewen
-----
gcc/ChangeLog:

        * config/rs6000/rs6000.c (rs6000_builtin_md_vectorized_function): Add
        support for built-in functions MISC_BUILTIN_DIVWE, MISC_BUILTIN_DIVWEU,
        MISC_BUILTIN_DIVDE, MISC_BUILTIN_DIVDEU, P10_BUILTIN_CFUGED,
        P10_BUILTIN_CNTLZDM, P10_BUILTIN_CNTTZDM, P10_BUILTIN_PDEPD and
        P10_BUILTIN_PEXTD on Power10.
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 279f00cc648..a8b3175ed50 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -5785,6 +5785,59 @@ rs6000_builtin_md_vectorized_function (tree fndecl, tree type_out,
     default:
       break;
     }
+
+  machine_mode in_vmode = TYPE_MODE (type_in);
+  machine_mode out_vmode = TYPE_MODE (type_out);
+
+  /* Power10 supported vectorized built-in functions.  */
+  if (TARGET_POWER10
+      && in_vmode == out_vmode
+      && VECTOR_UNIT_ALTIVEC_OR_VSX_P (in_vmode))
+    {
+      machine_mode exp_mode = DImode;
+      machine_mode exp_vmode = V2DImode;
+      enum rs6000_builtins name;
+      switch (fn)
+       {
+       case MISC_BUILTIN_DIVWE:
+       case MISC_BUILTIN_DIVWEU:
+         exp_mode = SImode;
+         exp_vmode = V4SImode;
+         if (fn == MISC_BUILTIN_DIVWE)
+           name = P10V_BUILTIN_DIVES_V4SI;
+         else
+           name = P10V_BUILTIN_DIVEU_V4SI;
+         break;
+       case MISC_BUILTIN_DIVDE:
+       case MISC_BUILTIN_DIVDEU:
+         if (fn == MISC_BUILTIN_DIVDE)
+           name = P10V_BUILTIN_DIVES_V2DI;
+         else
+           name = P10V_BUILTIN_DIVEU_V2DI;
+         break;
+       case P10_BUILTIN_CFUGED:
+         name = P10V_BUILTIN_VCFUGED;
+         break;
+       case P10_BUILTIN_CNTLZDM:
+         name = P10V_BUILTIN_VCLZDM;
+         break;
+       case P10_BUILTIN_CNTTZDM:
+         name = P10V_BUILTIN_VCTZDM;
+         break;
+       case P10_BUILTIN_PDEPD:
+         name = P10V_BUILTIN_VPDEPD;
+         break;
+       case P10_BUILTIN_PEXTD:
+         name = P10V_BUILTIN_VPEXTD;
+         break;
+       default:
+         return NULL_TREE;
+       }
+
+      if (in_mode == exp_mode && in_vmode == exp_vmode)
+       return rs6000_builtin_decls[name];
+    }
+
   return NULL_TREE;
 }
 
diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.c b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.c
new file mode 100644
index 00000000000..84f1b0a88f2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model -fno-unroll-loops -fdump-tree-vect-details" } */
+
+/* Test if signed/unsigned int extended divisions get vectorized.  */
+
+#include "dive-vectorize-1.h"
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */
+/* { dg-final { scan-assembler-times {\mvdivesw\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mvdiveuw\M} 1 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.h b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.h
new file mode 100644
index 00000000000..119f637b46b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.h
@@ -0,0 +1,22 @@
+#define N 128
+
+typedef signed int si;
+typedef unsigned int ui;
+
+si si_a[N], si_b[N], si_c[N];
+ui ui_a[N], ui_b[N], ui_c[N];
+
+__attribute__ ((noipa)) void
+test_divwe ()
+{
+  for (int i = 0; i < N; i++)
+    si_c[i] = __builtin_divwe (si_a[i], si_b[i]);
+}
+
+__attribute__ ((noipa)) void
+test_divweu ()
+{
+  for (int i = 0; i < N; i++)
+    ui_c[i] = __builtin_divweu (ui_a[i], ui_b[i]);
+}
+
diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.c b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.c
new file mode 100644
index 00000000000..13d768d748c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* We scan for vdive*d which are only supported on 64-bit env.  */
+/* { dg-require-effective-target lp64 } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model -fno-unroll-loops -fdump-tree-vect-details" } */
+
+/* Test if signed/unsigned long long extended divisions get vectorized.  */
+
+#include "dive-vectorize-2.h"
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */
+/* { dg-final { scan-assembler-times {\mvdivesd\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mvdiveud\M} 1 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.h b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.h
new file mode 100644
index 00000000000..1cab56b2e0b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.h
@@ -0,0 +1,22 @@
+#define N 128
+
+typedef signed long long sLL;
+typedef unsigned long long uLL;
+
+sLL sll_a[N], sll_b[N], sll_c[N];
+uLL ull_a[N], ull_b[N], ull_c[N];
+
+__attribute__ ((noipa)) void
+test_divde ()
+{
+  for (int i = 0; i < N; i++)
+    sll_c[i] = __builtin_divde (sll_a[i], sll_b[i]);
+}
+
+__attribute__ ((noipa)) void
+test_divdeu ()
+{
+  for (int i = 0; i < N; i++)
+    ull_c[i] = __builtin_divdeu (ull_a[i], ull_b[i]);
+}
+
diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-1.c b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-1.c
new file mode 100644
index 00000000000..1d5cbaa9f9b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-1.c
@@ -0,0 +1,52 @@
+/* { dg-do run } */
+/* { dg-require-effective-target power10_hw } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model" } */
+
+#include "dive-vectorize-1.h"
+
+/* Check if test cases with signed/unsigned int extended division
+   vectorization run successfully.  */
+
+__attribute__ ((optimize (1))) void
+check_divwe ()
+{
+  test_divwe ();
+  for (int i = 0; i < N; i++)
+    {
+      si exp = __builtin_divwe (si_a[i], si_b[i]);
+      if (exp != si_c[i])
+       __builtin_abort ();
+    }
+}
+
+__attribute__ ((optimize (1))) void
+check_divweu ()
+{
+  test_divweu ();
+  for (int i = 0; i < N; i++)
+    {
+      ui exp = __builtin_divweu (ui_a[i], ui_b[i]);
+      if (exp != ui_c[i])
+       __builtin_abort ();
+    }
+}
+
+int
+main ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      si_a[i] = 0x10 * (i * 3 + 2);
+      si_b[i] = 0x7890 * (i * 3 + 1);
+      ui_a[i] = 0x234 * (i * 11 + 3) - 0xcd * (i * 5 - 7);
+      ui_b[i] = 0x6078 * (i * 7 + 3) + 0xef * (i * 7 - 11);
+      if (si_b[i] == 0 || ui_b[i] == 0)
+       __builtin_abort ();
+    }
+
+  check_divwe ();
+  check_divweu ();
+
+  return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-2.c b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-2.c
new file mode 100644
index 00000000000..d25111c5c1f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-2.c
@@ -0,0 +1,54 @@
+/* { dg-do run } */
+/* The checked bifs are only supported on 64-bit env.  */
+/* { dg-require-effective-target lp64 } */
+/* { dg-require-effective-target power10_hw } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model" } */
+
+#include "dive-vectorize-2.h"
+
+/* Check if test cases with signed/unsigned long long extended division
+   vectorization run successfully.  */
+
+__attribute__ ((optimize (1))) void
+check_divde ()
+{
+  test_divde ();
+  for (int i = 0; i < N; i++)
+    {
+      sLL exp = __builtin_divde (sll_a[i], sll_b[i]);
+      if (exp != sll_c[i])
+       __builtin_abort ();
+    }
+}
+
+__attribute__ ((optimize (1))) void
+check_divdeu ()
+{
+  test_divdeu ();
+  for (int i = 0; i < N; i++)
+    {
+      uLL exp = __builtin_divdeu (ull_a[i], ull_b[i]);
+      if (exp != ull_c[i])
+       __builtin_abort ();
+    }
+}
+
+int
+main ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      sll_a[i] = 0x102 * (i * 3 + 2);
+      sll_b[i] = 0x789ab * (i * 3 + 1);
+      ull_a[i] = 0x2345 * (i * 11 + 3) - 0xcd1 * (i * 5 - 7);
+      ull_b[i] = 0x6078e * (i * 7 + 3) + 0xefa * (i * 7 - 11);
+      if (sll_b[i] == 0 || ull_b[i] == 0)
+       __builtin_abort ();
+    }
+
+  check_divde ();
+  check_divdeu ();
+
+  return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.c b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.c
new file mode 100644
index 00000000000..fdbb9ebd61b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* What we scan for are only supported on 64-bit env.  */
+/* { dg-require-effective-target lp64 } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model -fno-unroll-loops -fdump-tree-vect-details" } */
+
+/* Test if some Power10 built-in functions get vectorized.  */
+
+#include "p10-bifs-vectorize-1.h"
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 5 "vect" } } */
+/* { dg-final { scan-assembler-times {\mvcfuged\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mvclzdm\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mvctzdm\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mvpdepd\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mvpextd\M} 1 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.h b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.h
new file mode 100644
index 00000000000..80b7aacf810
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.h
@@ -0,0 +1,40 @@
+#define N 32
+
+typedef unsigned long long uLL;
+uLL ull_a[N], ull_b[N], ull_c[N];
+
+__attribute__ ((noipa)) void
+test_cfuged ()
+{
+  for (int i = 0; i < N; i++)
+    ull_c[i] = __builtin_cfuged (ull_a[i], ull_b[i]);
+}
+
+__attribute__ ((noipa)) void
+test_cntlzdm ()
+{
+  for (int i = 0; i < N; i++)
+    ull_c[i] = __builtin_cntlzdm (ull_a[i], ull_b[i]);
+}
+
+__attribute__ ((noipa)) void
+test_cnttzdm ()
+{
+  for (int i = 0; i < N; i++)
+    ull_c[i] = __builtin_cnttzdm (ull_a[i], ull_b[i]);
+}
+
+__attribute__ ((noipa)) void
+test_pdepd ()
+{
+  for (int i = 0; i < N; i++)
+    ull_c[i] = __builtin_pdepd (ull_a[i], ull_b[i]);
+}
+
+__attribute__ ((noipa)) void
+test_pextd ()
+{
+  for (int i = 0; i < N; i++)
+    ull_c[i] = __builtin_pextd (ull_a[i], ull_b[i]);
+}
+
diff --git a/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-run-1.c b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-run-1.c
new file mode 100644
index 00000000000..fbaff14b5cd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-run-1.c
@@ -0,0 +1,46 @@
+/* { dg-do run } */
+/* The checked bifs are only supported on 64-bit env.  */
+/* { dg-require-effective-target lp64 } */
+/* { dg-require-effective-target power10_hw } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model" } */
+
+#include "p10-bifs-vectorize-1.h"
+
+/* Check if vectorized built-in functions run as expected.  */
+
+#define CHECK(name)                                                           \
+  __attribute__ ((optimize (1))) void check_##name ()                         \
+  {                                                                           \
+    test_##name ();                                                           \
+    for (int i = 0; i < N; i++)                                               \
+      {                                                                       \
+       uLL exp = __builtin_##name (ull_a[i], ull_b[i]);                      \
+       if (exp != ull_c[i])                                                  \
+         __builtin_abort ();                                                 \
+      }                                                                       \
+  }
+
+CHECK (cfuged)
+CHECK (cntlzdm)
+CHECK (cnttzdm)
+CHECK (pdepd)
+CHECK (pextd)
+
+int
+main ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      ull_a[i] = 0x789a * (i * 11 - 5) - 0xcd1 * (i * 5 - 7);
+      ull_b[i] = 0xfedc * (i * 7 + 3) + 0x467 * (i * 7 - 11);
+    }
+
+  check_cfuged ();
+  check_cntlzdm ();
+  check_cnttzdm ();
+  check_pdepd ();
+  check_pextd ();
+
+  return 0;
+}
+

Reply via email to