I see your point that the backend can leverage conditional moves to if-convert the branch code.
> For instance see https://godbolt.org/z/fvrq3aq6K
> On ISAs with conditional operations the branch version gets ifconverted.
> On AArch64 we get:
>
> sat_add_u_1(unsigned int, unsigned int):
>         adds    w0, w0, w1
>         csinv   w0, w0, wzr, cc
>         ret
>
> so just 2 instructions, and also branchless.
>
> On x86_64 we get:
>
> sat_add_u_1(unsigned int, unsigned int):
>         add     edi, esi
>         mov     eax, -1
>         cmovnc  eax, edi
>         ret
>
> so 3 instructions but a dependency chain of 2, also branchless.
>
> This patch would regress both of these.

But the above Godbolt link may not be good evidence here, because both
x86_64 and aarch64 already implement usadd.  Thus, both forms go through
usadd<mode>.  For example, as below, sat_add_u_1 and sat_add_u_2 generate
almost the same code when the backend implements usadd.

#include <stdint.h>

#define T uint32_t

T sat_add_u_1 (T x, T y)
{
  return (T)(x + y) < x ? -1 : (x + y);
}

T sat_add_u_2 (T x, T y)
{
  return (x + y) | -((x + y) < x);
}

Things could become different with gcc 14.2 (which doesn't have the
.SAT_ADD GIMPLE IR); the x86_64 asm dump at -O3 is below.  But it looks
like there is no obvious difference here either.

sat_add_u_1(unsigned int, unsigned int):
        add     edi, esi
        mov     eax, -1
        cmovnc  eax, edi
        ret

sat_add_u_2(unsigned int, unsigned int):
        add     edi, esi
        sbb     eax, eax
        or      eax, edi
        ret

The same holds for the vector mode on x86_64 gcc 14.2, as below.

void vec_sat_add_u_1 (T *a, T *b, T *out, int n)
{
  for (int i = 0; i < n; i++)
    {
      T x = a[i];
      T y = b[i];
      out[i] = (T)(x + y) < x ? -1 : (x + y);
    }
}

void vec_sat_add_u_2 (T * __restrict a, T * __restrict b, T * __restrict out,
                      int n)
{
  for (int i = 0; i < n; i++)
    {
      T x = a[i];
      T y = b[i];
      out[i] = (x + y) | -((x + y) < x);
    }
}

vec_sat_add_u_1(unsigned int*, unsigned int*, unsigned int*, int):
        ....
.L15:
        movdqu  xmm0, XMMWORD PTR [rdi+rax]
        movdqu  xmm1, XMMWORD PTR [rsi+rax]
        paddd   xmm1, xmm0
        psubd   xmm0, xmm2
        movdqa  xmm3, xmm1
        psubd   xmm3, xmm2
        pcmpgtd xmm0, xmm3
        por     xmm0, xmm1
        movups  XMMWORD PTR [rdx+rax], xmm0
        add     rax, 16
        cmp     r8, rax
        jne     .L15
        mov     eax, ecx
        and     eax, -4
        mov     r8d, eax
        cmp     ecx, eax
        je      .L11
        sub     ecx, eax
        mov     r9d, ecx
        cmp     ecx, 1
        je      .L17
.L14:
        movq    xmm2, QWORD PTR .LC2[rip]
        mov     ecx, r8d
        movq    xmm0, QWORD PTR [rdi+rcx*4]
        movq    xmm1, QWORD PTR [rsi+rcx*4]
        paddd   xmm1, xmm0
        psubd   xmm0, xmm2
        movdqa  xmm3, xmm1
        psubd   xmm3, xmm2
        pcmpgtd xmm0, xmm3
        movdqa  xmm2, xmm0
        pandn   xmm2, xmm1
        por     xmm0, xmm2
        movq    QWORD PTR [rdx+rcx*4], xmm0
        ....

vec_sat_add_u_2(unsigned int*, unsigned int*, unsigned int*, int):
        ...
.L50:
        movdqu  xmm0, XMMWORD PTR [rdi+rax]
        movdqu  xmm1, XMMWORD PTR [rsi+rax]
        paddd   xmm1, xmm0
        psubd   xmm0, xmm2
        movdqa  xmm3, xmm1
        psubd   xmm3, xmm2
        pcmpgtd xmm0, xmm3
        por     xmm0, xmm1
        movups  XMMWORD PTR [rdx+rax], xmm0
        add     rax, 16
        cmp     rax, r8
        jne     .L50
        mov     eax, ecx
        and     eax, -4
        mov     r8d, eax
        cmp     ecx, eax
        je      .L64
.L49:
        sub     ecx, r8d
        cmp     ecx, 1
        je      .L52
        movq    xmm0, QWORD PTR [rdi+r8*4]
        movq    xmm1, QWORD PTR [rsi+r8*4]
        movq    xmm2, QWORD PTR .LC2[rip]
        paddd   xmm1, xmm0
        psubd   xmm0, xmm2
        movdqa  xmm3, xmm1
        psubd   xmm3, xmm2
        pcmpgtd xmm0, xmm3
        movdqa  xmm2, xmm0
        pandn   xmm2, xmm1
        por     xmm0, xmm2
        movq    QWORD PTR [rdx+r8*4], xmm0
        ...
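To double check that the two forms agree semantically (independent of which
one is cheaper), a minimal self-contained harness like the below can
enumerate the whole input space for uint8_t.  Note this is only my own
sketch for illustration, not part of the patch:

#include <assert.h>
#include <stdint.h>

/* Branch form: (x + y) < x ? -1 : (x + y).  */
static uint8_t
sat_add_branch (uint8_t x, uint8_t y)
{
  uint8_t sum = x + y;         /* Wraps modulo 256 on overflow.  */
  return sum < x ? 255 : sum;  /* Overflow iff the wrapped sum < x.  */
}

/* Branchless form: (x + y) | -((x + y) < x).  */
static uint8_t
sat_add_branchless (uint8_t x, uint8_t y)
{
  uint8_t sum = x + y;
  /* -(sum < x) is all-ones exactly when the addition overflowed,
     so the bit_ior saturates the result to 255.  */
  return sum | -(uint8_t) (sum < x);
}

int
main (void)
{
  /* Exhaustive check over all 65536 input pairs.  */
  for (unsigned x = 0; x < 256; x++)
    for (unsigned y = 0; y < 256; y++)
      assert (sat_add_branch (x, y) == sat_add_branchless (x, y));
  return 0;
}

Of course this only shows the two forms compute the same value; it says
nothing about which one is cheaper on a given uarch or after vectorization.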
Pan

-----Original Message-----
From: Tamar Christina <tamar.christ...@arm.com>
Sent: Thursday, November 7, 2024 7:43 PM
To: Li, Pan2 <pan2...@intel.com>; Richard Biener <richard.guent...@gmail.com>
Cc: gcc-patches@gcc.gnu.org; juzhe.zh...@rivai.ai; kito.ch...@gmail.com;
jeffreya...@gmail.com; rdapp....@gmail.com
Subject: RE: [PATCH v2 01/10] Match: Simplify branch form 4 of unsigned
SAT_ADD into branchless

> -----Original Message-----
> From: Li, Pan2 <pan2...@intel.com>
> Sent: Thursday, November 7, 2024 1:45 AM
> To: Tamar Christina <tamar.christ...@arm.com>; Richard Biener
> <richard.guent...@gmail.com>
> Cc: gcc-patches@gcc.gnu.org; juzhe.zh...@rivai.ai; kito.ch...@gmail.com;
> jeffreya...@gmail.com; rdapp....@gmail.com
> Subject: RE: [PATCH v2 01/10] Match: Simplify branch form 4 of unsigned
> SAT_ADD into branchless
>
> I see, thanks Tamar for the explanation.
>
> > The problem with the rewrite is that it pessimizes the code if the
> > saturating instructions are not recognized afterwards.
>
> The original idea is somehow independent of whether the backend supports
> IFN_SAT_* or not.  Given we have several forms of IFN_SAT_*, some of them
> cheap while others heavy from the perspective of gimple, it is possible
> to do some canonicalization here.
>
> > Additionally the branch version will get the benefit of branch prediction
> > when ran inside a loop.
>
> Not very sure if my understanding is correct, but the branchless version
> is preferred in most cases IMO if they have nearly the same count of
> stmts.  Not sure if that is still true during the vectorization.

I don't think this is true.  The branchless version must be either fewer
instructions or cheaper instructions.  The branches, especially in a loop,
become mostly free due to branch prediction.  Modern branch predictors are
really good at idioms like this.

That said, the other issue with doing this in gimple is that it interferes
with the RTL conditional execution pass.

For instance see https://godbolt.org/z/fvrq3aq6K

On ISAs with conditional operations the branch version gets ifconverted.
On AArch64 we get:

sat_add_u_1(unsigned int, unsigned int):
        adds    w0, w0, w1
        csinv   w0, w0, wzr, cc
        ret

so just 2 instructions, and also branchless.

On x86_64 we get:

sat_add_u_1(unsigned int, unsigned int):
        add     edi, esi
        mov     eax, -1
        cmovnc  eax, edi
        ret

so 3 instructions but a dependency chain of 2, also branchless.

This patch would regress both of these.

Thanks,
Tamar

>
> Pan
>
> -----Original Message-----
> From: Tamar Christina <tamar.christ...@arm.com>
> Sent: Thursday, November 7, 2024 12:25 AM
> To: Li, Pan2 <pan2...@intel.com>; Richard Biener <richard.guent...@gmail.com>
> Cc: gcc-patches@gcc.gnu.org; juzhe.zh...@rivai.ai; kito.ch...@gmail.com;
> jeffreya...@gmail.com; rdapp....@gmail.com
> Subject: RE: [PATCH v2 01/10] Match: Simplify branch form 4 of unsigned
> SAT_ADD into branchless
>
> > -----Original Message-----
> > From: Li, Pan2 <pan2...@intel.com>
> > Sent: Wednesday, November 6, 2024 1:31 PM
> > To: Richard Biener <richard.guent...@gmail.com>
> > Cc: gcc-patches@gcc.gnu.org; Tamar Christina <tamar.christ...@arm.com>;
> > juzhe.zh...@rivai.ai; kito.ch...@gmail.com; jeffreya...@gmail.com;
> > rdapp....@gmail.com
> > Subject: RE: [PATCH v2 01/10] Match: Simplify branch form 4 of unsigned
> > SAT_ADD into branchless
> >
> > Never mind and thanks Richard for comments.
> >
> > > Sorry for falling back in reviewing - it's not exactly clear the "cheap"
> > > form is cheaper.  When I count the number of gimple statements
> > > (sub-expressions) the original appears as 3 while the result looks to
> > > have 5.
> >
> > I may have a question about how we count the stmts, not sure if the
> > x <= _1 and then goto should be counted
>
> They should, because they both become instructions that need to be
> executed in the execution chain.
>
> > as 1 or 2 stmts in the below example.  I tried to count it by myself but
> > am not very sure the understanding is correct.
> >
> > Before this patch:
> >    1 │ uint8_t sat_add_u_1 (uint8_t x, uint8_t y)
> >    2 │ {
> >    3 │   uint8_t D.2809;
> >    4 │
> >    5 │   _1 = x + y;                                      // 1
> >    6 │   if (x <= _1) goto <D.2810>; else goto <D.2811>;  // 2 for x <= _1, 3 for goto ???
> >    7 │   <D.2810>:
> >    8 │   D.2809 = x + y;                                  // 4 (taken)
> >    9 │   goto <D.2812>;                                   // 5 (taken) for goto ???
> >   10 │   <D.2811>:
> >   11 │   D.2809 = 255;                                    // 4 (not taken)
> >   12 │   <D.2812>:
> >   13 │   return D.2809;                                   // 6 for taken, 5 for not taken
> >   14 │ }
> >
> > After this patch:
> >    1 │ uint8_t sat_add_u_1 (uint8_t x, uint8_t y)
> >    2 │ {
> >    3 │   uint8_t D.2809;
> >    4 │
> >    5 │   _1 = x + y;               // 1
> >    6 │   _2 = x + y;               // 2
> >    7 │   _3 = x > _2;              // 3
> >    8 │   _4 = (unsigned char) _3;  // 4
> >    9 │   _5 = -_4;                 // 5
> >   10 │   D.2809 = _1 | _5;         // 6
> >   11 │   return D.2809;            // 7
> >   12 │ }
> >
> > > The catch is of course that the original might involve control flow
> > > and a PHI.
>
> _1 and _2 are counted as one, as they'll be CSE'd.
> After the patch your dependency chain is 5 instructions, as in,
> to create the result you must execute 5 sequential instructions.
>
>    1 │ uint8_t sat_add_u_1 (uint8_t x, uint8_t y)
>    2 │ {
>    3 │   uint8_t D.2809;
>    4 │
>    5 │   _1 = x + y;
>    6 │   _2 = x + y;               // 1
>    7 │   _3 = x > _2;              // 2
>    8 │   _4 = (unsigned char) _3;  // 3
>    9 │   _5 = -_4;                 // 4
>   10 │   D.2809 = _1 | _5;         // 5
>   11 │   return D.2809;
>   12 │ }
>
> Before your change you had 3, because you always do
> compare + branch + set.
>
> The problem with the rewrite is that it pessimizes the code if the
> saturating instructions are not recognized afterwards.
>
> Note this is also the case for vector codegen.  The original code in
> vector would be
>
> 1. duplicate 255
> 1. x + y
> 2. compare
> 3. select
>
> The first two instructions are independent and execute in parallel, so
> the original vector codegen also had a dependency chain of 3.
>
> Additionally the branch version will get the benefit of branch prediction
> when ran inside a loop.
>
> >
> > > The pattern will apply during late PHI-OPT or during GENERIC folding
> > > done by the frontends or gimplification - you scan the gimplification
> > > dump below, so having it apply there means it will influence inlining
> > > heuristics for example.  I wonder which variant is considered larger
> > > by its heuristic.
> >
> > > What do others think of the early canonicalization of these to
> > > straight-line code?
> >
> > Makes sense, we could find one form that most of us consider to be the
> > "cheapest".
>
> I would disagree with Richard here in that this would be a pessimistic
> codegen for targets that don't later implement the IFN_SAT.
>
> Thanks,
> Tamar
>
> >
> > Pan
> >
> > -----Original Message-----
> > From: Richard Biener <richard.guent...@gmail.com>
> > Sent: Wednesday, November 6, 2024 8:43 PM
> > To: Li, Pan2 <pan2...@intel.com>
> > Cc: gcc-patches@gcc.gnu.org; tamar.christ...@arm.com; juzhe.zh...@rivai.ai;
> > kito.ch...@gmail.com; jeffreya...@gmail.com; rdapp....@gmail.com
> > Subject: Re: [PATCH v2 01/10] Match: Simplify branch form 4 of unsigned
> > SAT_ADD into branchless
> >
> > On Thu, Oct 31, 2024 at 7:29 AM <pan2...@intel.com> wrote:
> > >
> > > From: Pan Li <pan2...@intel.com>
> > >
> > > There are sorts of forms for the unsigned SAT_ADD.  Some of them are
> > > complicated while others are cheap.  This patch would like to simplify
> > > the complicated form into the cheap ones.  For example as below:
> > >
> > > From the form 4 (branch):
> > >   SAT_U_ADD = (X + Y) < x ? -1 : (X + Y).
> > >
> > > To (branchless):
> > >   SAT_U_ADD = (X + Y) | - ((X + Y) < X).
> > >
> > > #define T uint8_t
> > >
> > > T sat_add_u_1 (T x, T y)
> > > {
> > >   return (T)(x + y) < x ? -1 : (x + y);
> > > }
> > >
> > > Before this patch:
> > >    1 │ uint8_t sat_add_u_1 (uint8_t x, uint8_t y)
> > >    2 │ {
> > >    3 │   uint8_t D.2809;
> > >    4 │
> > >    5 │   _1 = x + y;
> > >    6 │   if (x <= _1) goto <D.2810>; else goto <D.2811>;
> > >    7 │   <D.2810>:
> > >    8 │   D.2809 = x + y;
> > >    9 │   goto <D.2812>;
> > >   10 │   <D.2811>:
> > >   11 │   D.2809 = 255;
> > >   12 │   <D.2812>:
> > >   13 │   return D.2809;
> > >   14 │ }
> > >
> > > After this patch:
> > >    1 │ uint8_t sat_add_u_1 (uint8_t x, uint8_t y)
> > >    2 │ {
> > >    3 │   uint8_t D.2809;
> > >    4 │
> > >    5 │   _1 = x + y;
> > >    6 │   _2 = x + y;
> > >    7 │   _3 = x > _2;
> > >    8 │   _4 = (unsigned char) _3;
> > >    9 │   _5 = -_4;
> > >   10 │   D.2809 = _1 | _5;
> > >   11 │   return D.2809;
> > >   12 │ }
> > >
> > > The below test suites are passed for this patch.
> > > * The rv64gcv fully regression test.
> > > * The x86 bootstrap test.
> > > * The x86 fully regression test.
> > >
> > > gcc/ChangeLog:
> > >
> > >         * match.pd: Remove unsigned branch form 4 for SAT_ADD, and
> > >         add simplify to branchless instead.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > >         * gcc.dg/sat_arith_simplify.h: New test.
> > >         * gcc.dg/sat_u_add-simplify-2-u16.c: New test.
> > >         * gcc.dg/sat_u_add-simplify-2-u32.c: New test.
> > >         * gcc.dg/sat_u_add-simplify-2-u64.c: New test.
> > >         * gcc.dg/sat_u_add-simplify-2-u8.c: New test.
> > >
> > > Signed-off-by: Pan Li <pan2...@intel.com>
> > > ---
> > >  gcc/match.pd                                  | 23 +++++++++----------
> > >  gcc/testsuite/gcc.dg/sat_arith_simplify.h     | 10 ++++++++
> > >  .../gcc.dg/sat_u_add-simplify-2-u16.c         | 11 +++++++++
> > >  .../gcc.dg/sat_u_add-simplify-2-u32.c         | 11 +++++++++
> > >  .../gcc.dg/sat_u_add-simplify-2-u64.c         | 11 +++++++++
> > >  .../gcc.dg/sat_u_add-simplify-2-u8.c          | 11 +++++++++
> > >  6 files changed, 65 insertions(+), 12 deletions(-)
> > >  create mode 100644 gcc/testsuite/gcc.dg/sat_arith_simplify.h
> > >  create mode 100644 gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u16.c
> > >  create mode 100644 gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u32.c
> > >  create mode 100644 gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u64.c
> > >  create mode 100644 gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u8.c
> > >
> > > diff --git a/gcc/match.pd b/gcc/match.pd
> > > index c851ac56e37..8425d7c4f20 100644
> > > --- a/gcc/match.pd
> > > +++ b/gcc/match.pd
> > > @@ -3146,18 +3146,17 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> > >  (match (unsigned_integer_sat_add @0 @1)
> > >   (bit_ior:c (usadd_left_part_2 @0 @1) (usadd_right_part_2 @0 @1)))
> > >
> > > -/* Simplify SAT_U_ADD to the cheap form
> > > -   From: SAT_U_ADD = (X + Y) >= x ? (X + Y) : -1.
> > > -   To:   SAT_U_ADD = (X + Y) | - ((X + Y) < X).  */
> > > -(simplify (cond (ge (plus:c@2 @0 @1) @0) @2 integer_minus_onep)
> > > - (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
> > > -      && types_match (type, @0, @1))
> > > -  (bit_ior @2 (negate (convert (lt @2 @0))))))
> > > -
> > > -/* Unsigned saturation add, case 4 (branch with lt):
> > > -   SAT_U_ADD = (X + Y) < x ? -1 : (X + Y).  */
> > > -(match (unsigned_integer_sat_add @0 @1)
> > > - (cond^ (lt (usadd_left_part_1@2 @0 @1) @0) integer_minus_onep @2))
> > > +/* Simplify sorts of SAT_U_ADD forms to the cheap one.
> > > +   The cheap form: SAT_U_ADD = (X + Y) | - ((X + Y) < X).  */
> > > +(if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type))
> > > + /* From SAT_U_ADD = (X + Y) >= x ? (X + Y) : -1.  */
> > > + (simplify (cond (ge (plus:c@2 @0 @1) @0) @2 integer_minus_onep)
> > > +  (if (types_match (type, @0, @1))
> > > +   (bit_ior (plus@2 @0 @1) (negate (convert (lt @2 @0))))))
> > > + /* From SAT_U_ADD = (X + Y) < x ? -1 : (X + Y).  */
> > > + (simplify (cond (lt (plus:c@2 @0 @1) @0) integer_minus_onep @2)
> > > +  (if (types_match (type, @0, @1))
> > > +   (bit_ior (plus@2 @0 @1) (negate (convert (lt @2 @0)))))))
> >
> > Sorry for falling back in reviewing - it's not exactly clear the "cheap"
> > form is cheaper.  When I count the number of gimple statements
> > (sub-expressions) the original appears as 3 while the result looks to
> > have 5.  The catch is of course that the original might involve control
> > flow and a PHI.
> >
> > The pattern will apply during late PHI-OPT or during GENERIC folding
> > done by the frontends or gimplification - you scan the gimplification
> > dump below, so having it apply there means it will influence inlining
> > heuristics for example.  I wonder which variant is considered larger by
> > its heuristic.
> >
> > What do others think of the early canonicalization of these to
> > straight-line code?
> >
> > Thanks,
> > Richard.
> >
> > >  /* Unsigned saturation add, case 5 (branch with eq .ADD_OVERFLOW):
> > >     SAT_U_ADD = REALPART_EXPR <.ADD_OVERFLOW> == 0 ? .ADD_OVERFLOW : -1.  */
> > > diff --git a/gcc/testsuite/gcc.dg/sat_arith_simplify.h b/gcc/testsuite/gcc.dg/sat_arith_simplify.h
> > > new file mode 100644
> > > index 00000000000..46ac00426b2
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.dg/sat_arith_simplify.h
> > > @@ -0,0 +1,10 @@
> > > +#ifndef HAVE_DEFINED_SAT_ARITH_SIMPLIFY_H
> > > +#define HAVE_DEFINED_SAT_ARITH_SIMPLIFY_H
> > > +
> > > +#define DEF_SAT_U_ADD_2(T)                 \
> > > +T sat_u_add_##T##_2 (T x, T y)             \
> > > +{                                          \
> > > +  return (T)(x + y) < x ? -1 : (x + y);    \
> > > +}
> > > +
> > > +#endif
> > > diff --git a/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u16.c b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u16.c
> > > new file mode 100644
> > > index 00000000000..b170b35096c
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u16.c
> > > @@ -0,0 +1,11 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O2 -fdump-tree-gimple-details" } */
> > > +
> > > +#include <stdint.h>
> > > +#include "sat_arith_simplify.h"
> > > +
> > > +DEF_SAT_U_ADD_2 (uint16_t)
> > > +
> > > +/* { dg-final { scan-tree-dump-not " if " "gimple" } } */
> > > +/* { dg-final { scan-tree-dump-not " else " "gimple" } } */
> > > +/* { dg-final { scan-tree-dump-not " goto " "gimple" } } */
> > > diff --git a/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u32.c b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u32.c
> > > new file mode 100644
> > > index 00000000000..8830ed7b878
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u32.c
> > > @@ -0,0 +1,11 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O2 -fdump-tree-gimple-details" } */
> > > +
> > > +#include <stdint.h>
> > > +#include "sat_arith_simplify.h"
> > > +
> > > +DEF_SAT_U_ADD_2 (uint32_t)
> > > +
> > > +/* { dg-final { scan-tree-dump-not " if " "gimple" } } */
> > > +/* { dg-final { scan-tree-dump-not " else " "gimple" } } */
> > > +/* { dg-final { scan-tree-dump-not " goto " "gimple" } } */
> > > diff --git a/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u64.c b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u64.c
> > > new file mode 100644
> > > index 00000000000..76c4d4bddaa
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u64.c
> > > @@ -0,0 +1,11 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O2 -fdump-tree-gimple-details" } */
> > > +
> > > +#include <stdint.h>
> > > +#include "sat_arith_simplify.h"
> > > +
> > > +DEF_SAT_U_ADD_2 (uint64_t)
> > > +
> > > +/* { dg-final { scan-tree-dump-not " if " "gimple" } } */
> > > +/* { dg-final { scan-tree-dump-not " else " "gimple" } } */
> > > +/* { dg-final { scan-tree-dump-not " goto " "gimple" } } */
> > > diff --git a/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u8.c b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u8.c
> > > new file mode 100644
> > > index 00000000000..b034b8eedb1
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u8.c
> > > @@ -0,0 +1,11 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O2 -fdump-tree-gimple-details" } */
> > > +
> > > +#include <stdint.h>
> > > +#include "sat_arith_simplify.h"
> > > +
> > > +DEF_SAT_U_ADD_2 (uint8_t)
> > > +
> > > +/* { dg-final { scan-tree-dump-not " if " "gimple" } } */
> > > +/* { dg-final { scan-tree-dump-not " else " "gimple" } } */
> > > +/* { dg-final { scan-tree-dump-not " goto " "gimple" } } */
> > > --
> > > 2.43.0
> > >