On Fri, Nov 8, 2024 at 12:34 AM Li, Pan2 <pan2...@intel.com> wrote:
>
> Thanks Tamar and Jeff for comments.
>
> > I'm not sure it's that simple.  It'll depend on the micro-architecture.
> > So things like strength of the branch predictors, how fetch blocks are
> > handled (can you have embedded not-taken branches, short-forward-branch
> > optimizations, etc).
>
> > After:
> >
> > .L.sat_add_u_1(unsigned int, unsigned int):
> >          add 4,3,4
> >          rldicl 9,4,0,32
> >          subf 3,3,9
> >          sradi 3,3,63
> >          or 3,3,4
> >          rldicl 3,3,0,32
> >          blr
> >
> > and before
> >
> > .L.sat_add_u_1(unsigned int, unsigned int):
> >          add 4,3,4
> >          cmplw 0,4,3
> >          bge 0,.L2
> >          li 4,-1
> > .L2:
> >          rldicl 3,4,0,32
> >          blr
>
> I am not familiar with branch prediction, but the branch should be 50% taken
> and 50% not-taken given the range of sat add inputs.  Isn't that the worst case
> for branch prediction?  I mean, if we call it 100 times with a taken, not-taken,
> taken, not-taken... sequence, will the branch version still be faster?
> Feel free to correct me if I'm wrong.
>
> Back to the 16 forms of sat add below, is there any suggestion as to which one
> or two form(s) may be cheaper than the others from the perspective of the gimple
> IR, independent of whether the backend implements SAT_ADD or not?
>
> #define DEF_SAT_U_ADD_1(T)               \
> T sat_u_add_##T##_1 (T x, T y)           \
> {                                        \
>   return (T)(x + y) >= x ? (x + y) : -1; \
> }
>
> #define DEF_SAT_U_ADD_2(T)              \
> T sat_u_add_##T##_2 (T x, T y)          \
> {                                       \
>   return (T)(x + y) < x ? -1 : (x + y); \
> }
>
> #define DEF_SAT_U_ADD_3(T)               \
> T sat_u_add_##T##_3 (T x, T y)           \
> {                                        \
>   return x <= (T)(x + y) ? (x + y) : -1; \
> }
>
> #define DEF_SAT_U_ADD_4(T)              \
> T sat_u_add_##T##_4 (T x, T y)          \
> {                                       \
>   return x > (T)(x + y) ? -1 : (x + y); \
> }
>
> #define DEF_SAT_U_ADD_5(T)      \
> T sat_u_add_##T##_5 (T x, T y)  \
> {                               \
>   if ((T)(x + y) >= x)          \
>     return x + y;               \
>   else                          \
>     return -1;                  \
> }
>
> #define DEF_SAT_U_ADD_6(T)      \
> T sat_u_add_##T##_6 (T x, T y)  \
> {                               \
>   if ((T)(x + y) < x)           \
>     return -1;                  \
>   else                          \
>     return x + y;               \
> }
>
> #define DEF_SAT_U_ADD_7(T)      \
> T sat_u_add_##T##_7 (T x, T y)  \
> {                               \
>   if (x <= (T)(x + y))          \
>     return x + y;               \
>   else                          \
>     return -1;                  \
> }
>
> #define DEF_SAT_U_ADD_8(T)      \
> T sat_u_add_##T##_8 (T x, T y)  \
> {                               \
>   if (x > (T)(x + y))           \
>     return -1;                  \
>   else                          \
>     return x + y;               \
> }
>
> #define DEF_SAT_U_ADD_9(T)                                     \
> T sat_u_add_##T##_9 (T x, T y)                                 \
> {                                                              \
>   T ret;                                                       \
>   return __builtin_add_overflow (x, y, &ret) == 0 ? ret : - 1; \
> }
>
> #define DEF_SAT_U_ADD_10(T)                                \
> T sat_u_add_##T##_10 (T x, T y)                            \
> {                                                          \
>   T ret;                                                   \
>   return !__builtin_add_overflow (x, y, &ret) ? ret : - 1; \
> }
>
> #define DEF_SAT_U_ADD_11(T)                     \
> T sat_u_add_##T##_11 (T x, T y)                 \
> {                                               \
>   T ret;                                        \
>   if (__builtin_add_overflow (x, y, &ret) == 0) \
>     return ret;                                 \
>   else                                          \
>     return -1;                                  \
> }
>
> #define DEF_SAT_U_ADD_12(T)                 \
> T sat_u_add_##T##_12 (T x, T y)             \
> {                                           \
>   T ret;                                    \
>   if (!__builtin_add_overflow (x, y, &ret)) \
>     return ret;                             \
>   else                                      \
>     return -1;                              \
> }
>
> #define DEF_SAT_U_ADD_13(T)                                   \
> T sat_u_add_##T##_13 (T x, T y)                               \
> {                                                             \
>   T ret;                                                      \
>   return __builtin_add_overflow (x, y, &ret) != 0 ? -1 : ret; \
> }
>
> #define DEF_SAT_U_ADD_14(T)                              \
> T sat_u_add_##T##_14 (T x, T y)                          \
> {                                                        \
>   T ret;                                                 \
>   return __builtin_add_overflow (x, y, &ret) ? -1 : ret; \
> }
>
> #define DEF_SAT_U_ADD_15(T)                     \
> T sat_u_add_##T##_15 (T x, T y)                 \
> {                                               \
>   T ret;                                        \
>   if (__builtin_add_overflow (x, y, &ret) != 0) \
>     return -1;                                  \
>   else                                          \
>     return ret;                                 \
> }
>
> #define DEF_SAT_U_ADD_16(T)                \
> T sat_u_add_##T##_16 (T x, T y)            \
> {                                          \
>   T ret;                                   \
>   if (__builtin_add_overflow (x, y, &ret)) \
>     return -1;                             \
>   else                                     \
>     return ret;                            \
> }
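
For concreteness, a minimal sketch of what one of the quoted forms expands to
when instantiated (uint32_t as T is purely an illustrative choice):

#include <stdint.h>

/* DEF_SAT_U_ADD_1 (uint32_t) expands to roughly this: the addition wraps
   on overflow, so a wrapped sum that is still >= x means no overflow
   happened; otherwise saturate to the type maximum.  */
uint32_t sat_u_add_uint32_t_1 (uint32_t x, uint32_t y)
{
  return (uint32_t) (x + y) >= x ? (x + y) : (uint32_t) -1;
}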

So for the late .SAT_* introduction in the widen_mul pass, the
__builtin_add_overflow variants make sense to use iff the target has
optabs for those.  The IL for the builtin variant should appear cheap
as well:

  _6 = .ADD_OVERFLOW (x_4(D), y_5(D));
  _2 = IMAGPART_EXPR <_6>;
  if (_2 != 0)
    goto <bb 4>; [21.72%]
  else
    goto <bb 3>; [78.28%]

  <bb 3> [local count: 840525096]:
  _1 = REALPART_EXPR <_6>;

  <bb 4> [local count: 1073741824]:
  # _3 = PHI <65535(2), _1(3)>

It's possibly also the best canonicalization for the overflow check,
though I fear code generation for targets that do not support
__builtin_*_overflow might end up with two branches.

That said, I'd avoid canonicalizing this via match.pd, given that it will
inevitably if-convert.  Instead I'd see it as a way to provide a generic
.SAT_* expansion, though one could say we should then simply implement a
fallback expansion for the internal function.  It's also not necessarily
your responsibility to implement this, since risc-v has .SAT_* expanders,
as does x86.
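
A generic fallback could, for instance, emit the branchless form quoted
further down in this thread (sat_add_u_2); purely as a C-level sketch of
what such an expander would produce:

#include <stdint.h>

uint32_t sat_u_add_fallback_sketch (uint32_t x, uint32_t y)
{
  uint32_t sum = x + y;
  /* -(sum < x) is all-ones exactly when the addition wrapped, so the OR
     saturates the result to UINT32_MAX on overflow.  */
  return sum | -(uint32_t) (sum < x);
}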

Richard.

> Pan
>
> -----Original Message-----
> From: Jeff Law <jeffreya...@gmail.com>
> Sent: Friday, November 8, 2024 4:08 AM
> To: Tamar Christina <tamar.christ...@arm.com>; Li, Pan2 <pan2...@intel.com>; 
> Richard Biener <richard.guent...@gmail.com>
> Cc: gcc-patches@gcc.gnu.org; juzhe.zh...@rivai.ai; kito.ch...@gmail.com; 
> rdapp....@gmail.com
> Subject: Re: [PATCH v2 01/10] Match: Simplify branch form 4 of unsigned 
> SAT_ADD into branchless
>
>
>
> On 11/7/24 8:07 AM, Tamar Christina wrote:
> >
> >> -----Original Message-----
> >> From: Li, Pan2 <pan2...@intel.com>
> >> Sent: Thursday, November 7, 2024 12:57 PM
> >> To: Tamar Christina <tamar.christ...@arm.com>; Richard Biener
> >> <richard.guent...@gmail.com>
> >> Cc: gcc-patches@gcc.gnu.org; juzhe.zh...@rivai.ai; kito.ch...@gmail.com;
> >> jeffreya...@gmail.com; rdapp....@gmail.com
> >> Subject: RE: [PATCH v2 01/10] Match: Simplify branch form 4 of unsigned
> >> SAT_ADD into branchless
> >>
> >> I see your point that the backend can leverage a conditional move to emit
> >> the code for the branch.
> >>
> >>> For instance see https://godbolt.org/z/fvrq3aq6K
> >>> On ISAs with conditional operations the branch version gets ifconverted.
> >>> On AArch64 we get:
> >>> sat_add_u_1(unsigned int, unsigned int):
> >>>          adds    w0, w0, w1
> >>>          csinv   w0, w0, wzr, cc
> >>>          ret
> >>> so just 2 instructions, and also branchless. On x86_64 we get:
> >>> sat_add_u_1(unsigned int, unsigned int):
> >>>          add     edi, esi
> >>>          mov     eax, -1
> >>>          cmovnc  eax, edi
> >>>          ret
> >>> so 3 instructions but a dependency chain of 2.
> >>> also branchless.  This patch would regress both of these.
> >>
> >> But the above Godbolt may not be good evidence, because both x86_64 and
> >> aarch64 already implement usadd.  Thus, they all go through usadd<QImode>.
> >> For example, as below, sat_add_u_1 and sat_add_u_2 are almost the same when
> >> the backend implements usadd.
> >>
> >> #include <stdint.h>
> >>
> >> #define T uint32_t
> >>
> >>    T sat_add_u_1 (T x, T y)
> >>    {
> >>      return (T)(x + y) < x ? -1 : (x + y);
> >>    }
> >>
> >>    T sat_add_u_2 (T x, T y)
> >>    {
> >>      return (x + y) | -((x + y) < x);
> >>    }
> >>
> >> It becomes different when taking gcc 14.2 (which doesn’t have the .SAT_ADD
> >> GIMPLE IR): x86_64 then gives the below asm dump at -O3.  There looks to be
> >> no obvious difference here.
> >>
> >> sat_add_u_1(unsigned int, unsigned int):
> >>          add     edi, esi
> >>          mov     eax, -1
> >>          cmovnc  eax, edi
> >>          ret
> >>
> >> sat_add_u_2(unsigned int, unsigned int):
> >>          add     edi, esi
> >>          sbb     eax, eax
> >>          or      eax, edi
> >>          ret
> >>
> >
> > Because CE is able to recognize the idiom and turn it back into a conditional move.
> > Pick a target that doesn't have conditional instructions, like PowerPC
> > https://godbolt.org/z/4bTv18WMv
> >
> > You'll see that this canonicalization has made codegen worse.
> >
> > After:
> >
> > .L.sat_add_u_1(unsigned int, unsigned int):
> >          add 4,3,4
> >          rldicl 9,4,0,32
> >          subf 3,3,9
> >          sradi 3,3,63
> >          or 3,3,4
> >          rldicl 3,3,0,32
> >          blr
> >
> > and before
> >
> > .L.sat_add_u_1(unsigned int, unsigned int):
> >          add 4,3,4
> >          cmplw 0,4,3
> >          bge 0,.L2
> >          li 4,-1
> > .L2:
> >          rldicl 3,4,0,32
> >          blr
> >
> > It means it now always has to execute 6 instructions, whereas before it was
> > 4 or 5 depending on the direction of the branch.  So for those architectures,
> > it's always slower.
> I'm not sure it's that simple.  It'll depend on the micro-architecture.
> So things like strength of the branch predictors, how fetch blocks are
> handled (can you have embedded not-taken branches, short-forward-branch
> optimizations, etc).
>
> Jeff
