Hi Segher, Thanks for the review!
on 2021/8/12 下午11:10, Segher Boessenkool wrote: > Hi! > > On Wed, Aug 11, 2021 at 02:56:11PM +0800, Kewen.Lin wrote: >> * config/rs6000/rs6000.c (rs6000_builtin_md_vectorized_function): Add >> support for some built-in functions vectorized on Power10. > > Say which, not "some" please? > Done. >> + machine_mode in_vmode = TYPE_MODE (type_in); >> + machine_mode out_vmode = TYPE_MODE (type_out); >> + >> + /* Power10 supported vectorized built-in functions. */ >> + if (TARGET_POWER10 >> + && in_vmode == out_vmode >> + && VECTOR_UNIT_ALTIVEC_OR_VSX_P (in_vmode)) >> + { >> + machine_mode exp_mode = DImode; >> + machine_mode exp_vmode = V2DImode; >> + enum rs6000_builtins vname = RS6000_BUILTIN_COUNT; > > "name"? This should be "bif" or similar? > Updated with name. >> + switch (fn) >> + { >> + case MISC_BUILTIN_DIVWE: >> + case MISC_BUILTIN_DIVWEU: >> + exp_mode = SImode; >> + exp_vmode = V4SImode; >> + if (fn == MISC_BUILTIN_DIVWE) >> + vname = P10V_BUILTIN_DIVES_V4SI; >> + else >> + vname = P10V_BUILTIN_DIVEU_V4SI; >> + break; >> + case MISC_BUILTIN_DIVDE: >> + case MISC_BUILTIN_DIVDEU: >> + if (fn == MISC_BUILTIN_DIVDE) >> + vname = P10V_BUILTIN_DIVES_V2DI; >> + else >> + vname = P10V_BUILTIN_DIVEU_V2DI; >> + break; > > All of the above should not be builtin functions really, they are all > simple arithmetic :-( They should not be UNSPECs either, on RTL level. > They can and should be optimised in real code as well. Oh well. > >> --- /dev/null >> +++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.c >> @@ -0,0 +1,12 @@ >> +/* { dg-do compile } */ >> +/* { dg-require-effective-target lp64 } */ > > Please add a comment what this is needed for? "We scan for dive*d" is > enough, but without anything, it takes time to figure this out. > Done, same for below requests on lp64 commentary. 
>> --- /dev/null >> +++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-2.c >> @@ -0,0 +1,53 @@ >> +/* { dg-do run } */ >> +/* { dg-require-effective-target lp64 } */ > > Same here. I suppose this uses builtins that do not exist on 32-bit? > Yeah, the bifs used in the test cases guarded with lp64 are only supported in 64-bit environments. >> --- /dev/null >> +++ b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-run-1.c >> @@ -0,0 +1,45 @@ >> +/* { dg-do run } */ >> +/* { dg-require-effective-target lp64 } */ > > And another. > >> +#define CHECK(name) >> \ >> + __attribute__ ((optimize (1))) void check_##name () >> \ > > What is the attribute for, btw? It seems fragile, but perhaps I do not > understand the intention. > > It's to stop the compiler from vectorizing the check functions, since the point of the test is to compare the results of the scalar and vectorized versions. > Okay for trunk with whose lp64 things improved. Thanks! > Thanks, v2 is attached; it addresses Bill's and your comments. :) BR, Kewen ----- gcc/ChangeLog: * config/rs6000/rs6000.c (rs6000_builtin_md_vectorized_function): Add support for built-in functions MISC_BUILTIN_DIVWE, MISC_BUILTIN_DIVWEU, MISC_BUILTIN_DIVDE, MISC_BUILTIN_DIVDEU, P10_BUILTIN_CFUGED, P10_BUILTIN_CNTLZDM, P10_BUILTIN_CNTTZDM, P10_BUILTIN_PDEPD and P10_BUILTIN_PEXTD on Power10.
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 279f00cc648..a8b3175ed50 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -5785,6 +5785,59 @@ rs6000_builtin_md_vectorized_function (tree fndecl, tree type_out, default: break; } + + machine_mode in_vmode = TYPE_MODE (type_in); + machine_mode out_vmode = TYPE_MODE (type_out); + + /* Power10 supported vectorized built-in functions. */ + if (TARGET_POWER10 + && in_vmode == out_vmode + && VECTOR_UNIT_ALTIVEC_OR_VSX_P (in_vmode)) + { + machine_mode exp_mode = DImode; + machine_mode exp_vmode = V2DImode; + enum rs6000_builtins name; + switch (fn) + { + case MISC_BUILTIN_DIVWE: + case MISC_BUILTIN_DIVWEU: + exp_mode = SImode; + exp_vmode = V4SImode; + if (fn == MISC_BUILTIN_DIVWE) + name = P10V_BUILTIN_DIVES_V4SI; + else + name = P10V_BUILTIN_DIVEU_V4SI; + break; + case MISC_BUILTIN_DIVDE: + case MISC_BUILTIN_DIVDEU: + if (fn == MISC_BUILTIN_DIVDE) + name = P10V_BUILTIN_DIVES_V2DI; + else + name = P10V_BUILTIN_DIVEU_V2DI; + break; + case P10_BUILTIN_CFUGED: + name = P10V_BUILTIN_VCFUGED; + break; + case P10_BUILTIN_CNTLZDM: + name = P10V_BUILTIN_VCLZDM; + break; + case P10_BUILTIN_CNTTZDM: + name = P10V_BUILTIN_VCTZDM; + break; + case P10_BUILTIN_PDEPD: + name = P10V_BUILTIN_VPDEPD; + break; + case P10_BUILTIN_PEXTD: + name = P10V_BUILTIN_VPEXTD; + break; + default: + return NULL_TREE; + } + + if (in_mode == exp_mode && in_vmode == exp_vmode) + return rs6000_builtin_decls[name]; + } + return NULL_TREE; } diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.c b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.c new file mode 100644 index 00000000000..84f1b0a88f2 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target power10_ok } */ +/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model -fno-unroll-loops -fdump-tree-vect-details" } */ + +/* 
Test if signed/unsigned int extended divisions get vectorized. */ + +#include "dive-vectorize-1.h" + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */ +/* { dg-final { scan-assembler-times {\mvdivesw\M} 1 } } */ +/* { dg-final { scan-assembler-times {\mvdiveuw\M} 1 } } */ diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.h b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.h new file mode 100644 index 00000000000..119f637b46b --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.h @@ -0,0 +1,22 @@ +#define N 128 + +typedef signed int si; +typedef unsigned int ui; + +si si_a[N], si_b[N], si_c[N]; +ui ui_a[N], ui_b[N], ui_c[N]; + +__attribute__ ((noipa)) void +test_divwe () +{ + for (int i = 0; i < N; i++) + si_c[i] = __builtin_divwe (si_a[i], si_b[i]); +} + +__attribute__ ((noipa)) void +test_divweu () +{ + for (int i = 0; i < N; i++) + ui_c[i] = __builtin_divweu (ui_a[i], ui_b[i]); +} + diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.c b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.c new file mode 100644 index 00000000000..13d768d748c --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.c @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* We scan for vdive*d which are only supported on 64-bit env. */ +/* { dg-require-effective-target lp64 } */ +/* { dg-require-effective-target power10_ok } */ +/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model -fno-unroll-loops -fdump-tree-vect-details" } */ + +/* Test if signed/unsigned long long extended divisions get vectorized. 
*/ + +#include "dive-vectorize-2.h" + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */ +/* { dg-final { scan-assembler-times {\mvdivesd\M} 1 } } */ +/* { dg-final { scan-assembler-times {\mvdiveud\M} 1 } } */ diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.h b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.h new file mode 100644 index 00000000000..1cab56b2e0b --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.h @@ -0,0 +1,22 @@ +#define N 128 + +typedef signed long long sLL; +typedef unsigned long long uLL; + +sLL sll_a[N], sll_b[N], sll_c[N]; +uLL ull_a[N], ull_b[N], ull_c[N]; + +__attribute__ ((noipa)) void +test_divde () +{ + for (int i = 0; i < N; i++) + sll_c[i] = __builtin_divde (sll_a[i], sll_b[i]); +} + +__attribute__ ((noipa)) void +test_divdeu () +{ + for (int i = 0; i < N; i++) + ull_c[i] = __builtin_divdeu (ull_a[i], ull_b[i]); +} + diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-1.c b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-1.c new file mode 100644 index 00000000000..1d5cbaa9f9b --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-1.c @@ -0,0 +1,52 @@ +/* { dg-do run } */ +/* { dg-require-effective-target power10_hw } */ +/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model" } */ + +#include "dive-vectorize-1.h" + +/* Check if test cases with signed/unsigned int extended division + vectorization run successfully. 
*/ + +__attribute__ ((optimize (1))) void +check_divwe () +{ + test_divwe (); + for (int i = 0; i < N; i++) + { + si exp = __builtin_divwe (si_a[i], si_b[i]); + if (exp != si_c[i]) + __builtin_abort (); + } +} + +__attribute__ ((optimize (1))) void +check_divweu () +{ + test_divweu (); + for (int i = 0; i < N; i++) + { + ui exp = __builtin_divweu (ui_a[i], ui_b[i]); + if (exp != ui_c[i]) + __builtin_abort (); + } +} + +int +main () +{ + for (int i = 0; i < N; i++) + { + si_a[i] = 0x10 * (i * 3 + 2); + si_b[i] = 0x7890 * (i * 3 + 1); + ui_a[i] = 0x234 * (i * 11 + 3) - 0xcd * (i * 5 - 7); + ui_b[i] = 0x6078 * (i * 7 + 3) + 0xef * (i * 7 - 11); + if (si_b[i] == 0 || ui_b[i] == 0) + __builtin_abort (); + } + + check_divwe (); + check_divweu (); + + return 0; +} + diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-2.c b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-2.c new file mode 100644 index 00000000000..d25111c5c1f --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-2.c @@ -0,0 +1,54 @@ +/* { dg-do run } */ +/* The checked bifs are only supported on 64-bit env. */ +/* { dg-require-effective-target lp64 } */ +/* { dg-require-effective-target power10_hw } */ +/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model" } */ + +#include "dive-vectorize-2.h" + +/* Check if test cases with signed/unsigned long long extended division + vectorization run successfully. 
*/ + +__attribute__ ((optimize (1))) void +check_divde () +{ + test_divde (); + for (int i = 0; i < N; i++) + { + sLL exp = __builtin_divde (sll_a[i], sll_b[i]); + if (exp != sll_c[i]) + __builtin_abort (); + } +} + +__attribute__ ((optimize (1))) void +check_divdeu () +{ + test_divdeu (); + for (int i = 0; i < N; i++) + { + uLL exp = __builtin_divdeu (ull_a[i], ull_b[i]); + if (exp != ull_c[i]) + __builtin_abort (); + } +} + +int +main () +{ + for (int i = 0; i < N; i++) + { + sll_a[i] = 0x102 * (i * 3 + 2); + sll_b[i] = 0x789ab * (i * 3 + 1); + ull_a[i] = 0x2345 * (i * 11 + 3) - 0xcd1 * (i * 5 - 7); + ull_b[i] = 0x6078e * (i * 7 + 3) + 0xefa * (i * 7 - 11); + if (sll_b[i] == 0 || ull_b[i] == 0) + __builtin_abort (); + } + + check_divde (); + check_divdeu (); + + return 0; +} + diff --git a/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.c b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.c new file mode 100644 index 00000000000..fdbb9ebd61b --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* What we scan for are only supported on 64-bit env. */ +/* { dg-require-effective-target lp64 } */ +/* { dg-require-effective-target power10_ok } */ +/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model -fno-unroll-loops -fdump-tree-vect-details" } */ + +/* Test if some Power10 built-in functions get vectorized. 
*/ + +#include "p10-bifs-vectorize-1.h" + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 5 "vect" } } */ +/* { dg-final { scan-assembler-times {\mvcfuged\M} 1 } } */ +/* { dg-final { scan-assembler-times {\mvclzdm\M} 1 } } */ +/* { dg-final { scan-assembler-times {\mvctzdm\M} 1 } } */ +/* { dg-final { scan-assembler-times {\mvpdepd\M} 1 } } */ +/* { dg-final { scan-assembler-times {\mvpextd\M} 1 } } */ diff --git a/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.h b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.h new file mode 100644 index 00000000000..80b7aacf810 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.h @@ -0,0 +1,40 @@ +#define N 32 + +typedef unsigned long long uLL; +uLL ull_a[N], ull_b[N], ull_c[N]; + +__attribute__ ((noipa)) void +test_cfuged () +{ + for (int i = 0; i < N; i++) + ull_c[i] = __builtin_cfuged (ull_a[i], ull_b[i]); +} + +__attribute__ ((noipa)) void +test_cntlzdm () +{ + for (int i = 0; i < N; i++) + ull_c[i] = __builtin_cntlzdm (ull_a[i], ull_b[i]); +} + +__attribute__ ((noipa)) void +test_cnttzdm () +{ + for (int i = 0; i < N; i++) + ull_c[i] = __builtin_cnttzdm (ull_a[i], ull_b[i]); +} + +__attribute__ ((noipa)) void +test_pdepd () +{ + for (int i = 0; i < N; i++) + ull_c[i] = __builtin_pdepd (ull_a[i], ull_b[i]); +} + +__attribute__ ((noipa)) void +test_pextd () +{ + for (int i = 0; i < N; i++) + ull_c[i] = __builtin_pextd (ull_a[i], ull_b[i]); +} + diff --git a/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-run-1.c b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-run-1.c new file mode 100644 index 00000000000..fbaff14b5cd --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-run-1.c @@ -0,0 +1,46 @@ +/* { dg-do run } */ +/* The checked bifs are only supported on 64-bit env. 
*/ +/* { dg-require-effective-target lp64 } */ +/* { dg-require-effective-target power10_hw } */ +/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model" } */ + +#include "p10-bifs-vectorize-1.h" + +/* Check if vectorized built-in functions run expectedly. */ + +#define CHECK(name) \ + __attribute__ ((optimize (1))) void check_##name () \ + { \ + test_##name (); \ + for (int i = 0; i < N; i++) \ + { \ + uLL exp = __builtin_##name (ull_a[i], ull_b[i]); \ + if (exp != ull_c[i]) \ + __builtin_abort (); \ + } \ + } + +CHECK (cfuged) +CHECK (cntlzdm) +CHECK (cnttzdm) +CHECK (pdepd) +CHECK (pextd) + +int +main () +{ + for (int i = 0; i < N; i++) + { + ull_a[i] = 0x789a * (i * 11 - 5) - 0xcd1 * (i * 5 - 7); + ull_b[i] = 0xfedc * (i * 7 + 3) + 0x467 * (i * 7 - 11); + } + + check_cfuged (); + check_cntlzdm (); + check_cnttzdm (); + check_pdepd (); + check_pextd (); + + return 0; +} +