Ping. Please review.

Thanks,
Avinash

On Fri, 2025-11-28 at 23:23 +0530, Avinash Jayakar wrote:
> From: Avinash Jayakar <[email protected]>
> 
> Hi,
> 
> Here is a small patch to improve codegen on powerpc for the %, [fl]%
> and [cl]% operators.
> Kindly review. Is this OK for trunk?
> 
> Changes from v1:
> - Added test cases to check the vectorization
> Thanks and regards,
> Avinash Jayakar
> 
> The modulo operator and its floor/ceil variants (%, [fl]% and [cl]%)
> are not auto-vectorized on powerpc, because during vectorization the
> vector cost is unfairly judged to be higher than that of the scalar
> loop.  That is, the scalar cost of the % operator is taken to be 1
> even though it generates 6 instructions in the assembly, so at -O2
> the vectorizer considers the scalar loop cheaper and abandons
> vectorization.
> 
> This patch adjusts the cost of these three operators to match the
> number of instructions seen in the assembly, and thus generates
> faster-running code when these operators appear in a loop.
> 
> Suppose the source is
> 
> for (int i=0; i<N; i++)
>   a[i] = a[i] % CONST;
> 
> The inner basic block of the loop would emit the following asm:
> .L2:
>         lwzu 10,4(4)
>         mulhw 8,10,5
>         srawi 9,10,31
>         srawi 8,8,3
>         subf 9,9,8
>         mulli 9,9,19
>         subf 9,9,10
>         stwu 9,4(3)
>         bdnz .L2
>         blr
> 
> After fine tuning the cost we see
> .L2:
>         lxvd2x 45,0,3
>         vmulosw 10,13,11
>         vmulesw 0,13,11
>         vsraw 1,13,9
>         vmrgew 0,0,10
>         vsraw 0,0,8
>         vsubuwm 0,0,1
>         vslw 1,0,12
>         vadduwm 1,1,0
>         vslw 1,1,12
>         vsubuwm 0,1,0
>         vsubuwm 13,13,0
>         stxvd2x 45,0,3
>         addi 3,3,16
>         bdnz .L2
>         blr
> 
> Although the code size increases, the runtime performance is almost
> 4x better than that of the scalar code.
> 
> 2025-11-28  Avinash Jayakar  <[email protected]>
> 
>       PR 121700
> 
> gcc/ChangeLog:
> 
>       * config/rs6000/rs6000.cc
> (rs6000_adjust_vect_cost_per_stmt):
>       Fine-grain adjustment of %, [fl]% and [cl]% ops.
> 
> gcc/testsuite/ChangeLog:
> 
>       * gcc.target/powerpc/pr121700-ceil-mod.c: New test.
>       * gcc.target/powerpc/pr121700-floor-mod.c: New test.
>       * gcc.target/powerpc/pr121700-trunc-mod.c: New test.
>       * gcc.target/powerpc/pr121700.h: Test utility.
> ---
>  gcc/config/rs6000/rs6000.cc                   |  7 +++
>  .../gcc.target/powerpc/pr121700-ceil-mod.c    | 16 ++++++
>  .../gcc.target/powerpc/pr121700-floor-mod.c   | 16 ++++++
>  .../gcc.target/powerpc/pr121700-trunc-mod.c   | 16 ++++++
>  gcc/testsuite/gcc.target/powerpc/pr121700.h   | 52
> +++++++++++++++++++
>  5 files changed, 107 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/pr121700-ceil-
> mod.c
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/pr121700-floor-
> mod.c
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/pr121700-trunc-
> mod.c
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/pr121700.h
> 
> diff --git a/gcc/config/rs6000/rs6000.cc
> b/gcc/config/rs6000/rs6000.cc
> index 1d5cd25c0f0..181571f17ab 100644
> --- a/gcc/config/rs6000/rs6000.cc
> +++ b/gcc/config/rs6000/rs6000.cc
> @@ -5308,6 +5308,13 @@ rs6000_adjust_vect_cost_per_stmt (enum
> vect_cost_for_stmt kind,
>        tree_code subcode = gimple_assign_rhs_code (stmt_info->stmt);
>        if (subcode == COND_EXPR)
>       return 2;
> +/* For {FLOOR,TRUNC,CEIL}_MOD_EXPR, cost them a bit higher in order to
> fairly
> +   compare the scalar and vector costs, since there is no optimal
> scalar instruction
> +   that can evaluate these expressions with just 1 instruction.
> Currently
> +   using the number of instructions generated for these
> expressions.  */
> +      else if (subcode == FLOOR_MOD_EXPR || subcode ==
> TRUNC_MOD_EXPR
> +            || subcode == CEIL_MOD_EXPR)
> +     return 6;
>      }
>  
>    return 0;
> diff --git a/gcc/testsuite/gcc.target/powerpc/pr121700-ceil-mod.c
> b/gcc/testsuite/gcc.target/powerpc/pr121700-ceil-mod.c
> new file mode 100644
> index 00000000000..56ac1a48217
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr121700-ceil-mod.c
> @@ -0,0 +1,16 @@
> +/* { dg-options "-fgimple -fdump-tree-vect-details -O2 -mdejagnu-
> cpu=power8" } */
> +
> +#include "pr121700.h"
> +
> +TEST_FN(__CEIL_MOD, 19, mod)
> +
> +int main (void)
> +{
> +  int *a = (int*)&arr;
> +  init_arr(a, N);
> +  mod(a);
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 1
> "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/powerpc/pr121700-floor-mod.c
> b/gcc/testsuite/gcc.target/powerpc/pr121700-floor-mod.c
> new file mode 100644
> index 00000000000..9198773b210
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr121700-floor-mod.c
> @@ -0,0 +1,16 @@
> +/* { dg-options "-fgimple -fdump-tree-vect-details -O2 -mdejagnu-
> cpu=power8" } */
> +
> +#include "pr121700.h"
> +
> +TEST_FN(__FLOOR_MOD, 19, mod)
> +
> +int main (void)
> +{
> +  int *a = (int*)&arr;
> +  init_arr(a, N);
> +  mod(a);
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 1
> "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/powerpc/pr121700-trunc-mod.c
> b/gcc/testsuite/gcc.target/powerpc/pr121700-trunc-mod.c
> new file mode 100644
> index 00000000000..c1154b08a39
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr121700-trunc-mod.c
> @@ -0,0 +1,16 @@
> +/* { dg-options "-fgimple -fdump-tree-vect-details -O2 -mdejagnu-
> cpu=power8" } */
> +
> +#include "pr121700.h"
> +
> +TEST_FN(%, 19, mod)
> +
> +int main (void)
> +{
> +  int *a = (int*)&arr;
> +  init_arr(a, N);
> +  mod(a);
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 1
> "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/powerpc/pr121700.h
> b/gcc/testsuite/gcc.target/powerpc/pr121700.h
> new file mode 100644
> index 00000000000..1550f9f8f5f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr121700.h
> @@ -0,0 +1,52 @@
> +#define TEST_FN(OP, CONST, NAME) \
> +__attribute__((noinline)) \
> +void __GIMPLE (ssa,guessed_local(10737416)) \
> +NAME (int * a) \
> +{ \
> +  int i; \
> +  long unsigned int _1; \
> +  long unsigned int _2; \
> +  int * _3; \
> +  int _4; \
> +  int _5; \
> +  unsigned int _12; \
> +  unsigned int _13; \
> + \
> +  __BB(2,guessed_local(10737416)): \
> +  goto __BB3(precise(134217728)); \
> + \
> +  __BB(3,loop_header(1),guessed_local(1063004408)): \
> +  i_14 = __PHI (__BB5: i_11, __BB2: 0); \
> +  _13 = __PHI (__BB5: _12, __BB2: 1024u); \
> +  _1 = (long unsigned int) i_14; \
> +  _2 = _1 * 4ul; \
> +  _3 = a_9(D) + _2; \
> +  _4 = __MEM <int> (_3); \
> +  _5 = _4 OP CONST; \
> +  __MEM <int> (_3) = _5; \
> +  i_11 = i_14 + 1; \
> +  _12 = _13 - 1u; \
> +  if (_12 != 0u) \
> +    goto __BB5(guessed(132861994)); \
> +  else \
> +    goto __BB4(guessed(1355734)); \
> + \
> +  __BB(5,guessed_local(1052266995)): \
> +  goto __BB3(precise(134217728)); \
> + \
> +  __BB(4,guessed_local(10737416)): \
> +  return; \
> + \
> +} \
> +
> +
> +
> +#define N 1024
> +int arr[N];
> +void init_arr (int *a, int n)
> +{
> +  #pragma GCC novector
> +  for (int i=0; i<n; i++)
> +    a[i] = i - n/2;
> +}
> +

Reply via email to