Ping. Please review. Thanks, Avinash
On Fri, 2025-11-28 at 23:23 +0530, Avinash Jayakar wrote: > From: Avinash Jayakar <[email protected]> > > Hi, > > Here is a small patch to do better codegen in powerpc for %, [fl]% > and > [cl]% operators. > Kindly review. Is this OK for trunk? > > Changes from v1: > - Added test cases to check the vectorization > Thanks and regards, > Avinash Jayakar > > The modulo operator and its floor/ceil variants %, [fl]% and [cl]% > would > not auto-vectorize on powerpc, because during vectorization the > vector > cost would be unfairly judged costlier than the scalar loop; i.e., the > scalar cost of the % operator would be considered 1 despite > it > generating 6 instructions in assembly, and thus the vectorizer pass at -O2 > would consider the scalar loop better and abandon vectorization. > > This patch adjusts the cost of the 3 operators as seen in the > assembly, > and thus generates faster-running code when these operators are seen > in > a loop. > > Suppose the source is > > for (int i=0; i<N; i++) > a[i] = a[i] % CONST; > > The inner basic block of the loop would emit the following asm: > .L2: > lwzu 10,4(4) > mulhw 8,10,5 > srawi 9,10,31 > srawi 8,8,3 > subf 9,9,8 > mulli 9,9,19 > subf 9,9,10 > stwu 9,4(3) > bdnz .L2 > blr > > After fine-tuning the cost, we see > .L2: > lxvd2x 45,0,3 > vmulosw 10,13,11 > vmulesw 0,13,11 > vsraw 1,13,9 > vmrgew 0,0,10 > vsraw 0,0,8 > vsubuwm 0,0,1 > vslw 1,0,12 > vadduwm 1,1,0 > vslw 1,1,12 > vsubuwm 0,1,0 > vsubuwm 13,13,0 > stxvd2x 45,0,3 > addi 3,3,16 > bdnz .L2 > blr > > Although the code size increases, the runtime performance is almost > 4x > better than the scalar code. > > 2025-11-28 Avinash Jayakar <[email protected]> > > PR 121700 > > gcc/ChangeLog: > > * config/rs6000/rs6000.cc > (rs6000_adjust_vect_cost_per_stmt): > Fine-grain adjustment of %, [fl]% and [cl]% ops. > > gcc/testsuite/ChangeLog: > > * gcc.target/powerpc/pr121700-ceil-mod.c: New test. > * gcc.target/powerpc/pr121700-floor-mod.c: New test. 
> * gcc.target/powerpc/pr121700-trunc-mod.c: New test. > * gcc.target/powerpc/pr121700.h: Test utility. > --- > gcc/config/rs6000/rs6000.cc | 7 +++ > .../gcc.target/powerpc/pr121700-ceil-mod.c | 16 ++++++ > .../gcc.target/powerpc/pr121700-floor-mod.c | 16 ++++++ > .../gcc.target/powerpc/pr121700-trunc-mod.c | 16 ++++++ > gcc/testsuite/gcc.target/powerpc/pr121700.h | 52 > +++++++++++++++++++ > 5 files changed, 107 insertions(+) > create mode 100644 gcc/testsuite/gcc.target/powerpc/pr121700-ceil- > mod.c > create mode 100644 gcc/testsuite/gcc.target/powerpc/pr121700-floor- > mod.c > create mode 100644 gcc/testsuite/gcc.target/powerpc/pr121700-trunc- > mod.c > create mode 100644 gcc/testsuite/gcc.target/powerpc/pr121700.h > > diff --git a/gcc/config/rs6000/rs6000.cc > b/gcc/config/rs6000/rs6000.cc > index 1d5cd25c0f0..181571f17ab 100644 > --- a/gcc/config/rs6000/rs6000.cc > +++ b/gcc/config/rs6000/rs6000.cc > @@ -5308,6 +5308,13 @@ rs6000_adjust_vect_cost_per_stmt (enum > vect_cost_for_stmt kind, > tree_code subcode = gimple_assign_rhs_code (stmt_info->stmt); > if (subcode == COND_EXPR) > return 2; > +/* For {FLOOR,TRUNC,CEIL}_MOD_EXPR, cost them a bit higher in order to > fairly > + compare the scalar and vector costs, since there is no optimal > scalar instruction > + that can evaluate these expressions with just 1 instruction. 
> Currently > + using the number of instructions generated for these > expressions.*/ > + else if (subcode == FLOOR_MOD_EXPR || subcode == > TRUNC_MOD_EXPR > + || subcode == CEIL_MOD_EXPR) > + return 6; > } > > return 0; > diff --git a/gcc/testsuite/gcc.target/powerpc/pr121700-ceil-mod.c > b/gcc/testsuite/gcc.target/powerpc/pr121700-ceil-mod.c > new file mode 100644 > index 00000000000..56ac1a48217 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/powerpc/pr121700-ceil-mod.c > @@ -0,0 +1,16 @@ > +/* { dg-options "-fgimple -fdump-tree-vect-details -O2 -mdejagnu- > cpu=power8" } */ > + > +#include "pr121700.h" > + > +TEST_FN(__CEIL_MOD, 19, mod) > + > +int main (void) > +{ > + int *a = (int*)&arr; > + init_arr(a, N); > + mod(a); > + > + return 0; > +} > + > +/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 1 > "vect" } } */ > diff --git a/gcc/testsuite/gcc.target/powerpc/pr121700-floor-mod.c > b/gcc/testsuite/gcc.target/powerpc/pr121700-floor-mod.c > new file mode 100644 > index 00000000000..9198773b210 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/powerpc/pr121700-floor-mod.c > @@ -0,0 +1,16 @@ > +/* { dg-options "-fgimple -fdump-tree-vect-details -O2 -mdejagnu- > cpu=power8" } */ > + > +#include "pr121700.h" > + > +TEST_FN(__FLOOR_MOD, 19, mod) > + > +int main (void) > +{ > + int *a = (int*)&arr; > + init_arr(a, N); > + mod(a); > + > + return 0; > +} > + > +/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 1 > "vect" } } */ > diff --git a/gcc/testsuite/gcc.target/powerpc/pr121700-trunc-mod.c > b/gcc/testsuite/gcc.target/powerpc/pr121700-trunc-mod.c > new file mode 100644 > index 00000000000..c1154b08a39 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/powerpc/pr121700-trunc-mod.c > @@ -0,0 +1,16 @@ > +/* { dg-options "-fgimple -fdump-tree-vect-details -O2 -mdejagnu- > cpu=power8" } */ > + > +#include "pr121700.h" > + > +TEST_FN(%, 19, mod) > + > +int main (void) > +{ > + int *a = (int*)&arr; > + init_arr(a, N); > + mod(a); > + > + 
return 0; > +} > + > +/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 1 > "vect" } } */ > diff --git a/gcc/testsuite/gcc.target/powerpc/pr121700.h > b/gcc/testsuite/gcc.target/powerpc/pr121700.h > new file mode 100644 > index 00000000000..1550f9f8f5f > --- /dev/null > +++ b/gcc/testsuite/gcc.target/powerpc/pr121700.h > @@ -0,0 +1,52 @@ > +#define TEST_FN(OP, CONST, NAME) \ > +__attribute__((noinline)) \ > +void __GIMPLE (ssa,guessed_local(10737416)) \ > +NAME (int * a) \ > +{ \ > + int i; \ > + long unsigned int _1; \ > + long unsigned int _2; \ > + int * _3; \ > + int _4; \ > + int _5; \ > + unsigned int _12; \ > + unsigned int _13; \ > + \ > + __BB(2,guessed_local(10737416)): \ > + goto __BB3(precise(134217728)); \ > + \ > + __BB(3,loop_header(1),guessed_local(1063004408)): \ > + i_14 = __PHI (__BB5: i_11, __BB2: 0); \ > + _13 = __PHI (__BB5: _12, __BB2: 1024u); \ > + _1 = (long unsigned int) i_14; \ > + _2 = _1 * 4ul; \ > + _3 = a_9(D) + _2; \ > + _4 = __MEM <int> (_3); \ > + _5 = _4 OP CONST; \ > + __MEM <int> (_3) = _5; \ > + i_11 = i_14 + 1; \ > + _12 = _13 - 1u; \ > + if (_12 != 0u) \ > + goto __BB5(guessed(132861994)); \ > + else \ > + goto __BB4(guessed(1355734)); \ > + \ > + __BB(5,guessed_local(1052266995)): \ > + goto __BB3(precise(134217728)); \ > + \ > + __BB(4,guessed_local(10737416)): \ > + return; \ > + \ > +} \ > + > + > + > +#define N 1024 > +int arr[N]; > +void init_arr (int *a, int n) > +{ > + #pragma GCC novector > + for (int i=0; i<n; i++) > + a[i] = i - n/2; > +} > +
