>> 2017-11-17 Kito Cheng <kito.ch...@gmail.com> >> >> * longlong.h [__riscv] (__umulsidi3): Define. >> [__riscv] (umul_ppmm) Likewise. >> [__riscv] (__muluw3) Likewise. > > > Apparently the point of this is that by defining __mulsi3/__muldi3 as an > extended asm, we get better register allocation, and hence better > performance. It would be nice if this info was provided when a patch was > submitted, so that we don't have to figure it out ourselves.
I prefer to write more comments in the code instead of the ChangeLog, so I added more comments in __muluw3. Btw, a long time ago, Vladimir told me[1] that it should contain what is done, but not why it is done. [1] https://gcc.gnu.org/ml/gcc-patches/2014-06/msg00604.html > I see one minor issue though > >> + (w0) = __muluw3 (__ll_lowpart (__x1), __ll_B) \ >> + + __ll_lowpart (__x0); \ > > > The multiply here is just a shift and should be written as a shift by > (W_TYPE_SIZE / 2). > > You copied this code from the default definition which uses *, and assumes > that the compiler will optimize the multiply into a shift. However, because > __muluw3 expands into a call to __mulsi3/__muldi3 for a risc-v part with no > multiply, the optimization will not take place here, and you end up with one > extra unnecessary multiply operation. You are right, I guess I just substituted all the multiplications before; thank you for pointing this out :) Updated patch: 2017-11-24 Kito Cheng <kito.ch...@gmail.com> * longlong.h [__riscv] (__umulsidi3): Define. [__riscv] (umul_ppmm): Likewise. [__riscv] (__muluw3): Likewise. --- include/longlong.h | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/include/longlong.h b/include/longlong.h index 9d3ab21..b76fa41 100644 --- a/include/longlong.h +++ b/include/longlong.h @@ -1086,6 +1086,56 @@ extern UDItype __umulsidi3 (USItype, USItype); } while (0) #endif +#if defined(__riscv) +#ifdef __riscv_mul +#define __umulsidi3(u,v) ((UDWtype)(UWtype)(u) * (UWtype)(v)) +#define __muluw3(a, b) ((UWtype)(a) * (UWtype)(b)) +#else +#if __riscv_xlen == 32 + #define MULUW3 "call __mulsi3" +#elif __riscv_xlen == 64 + #define MULUW3 "call __muldi3" +#else +#error unsupport xlen +#endif /* __riscv_xlen */ +/* We rely on the fact that MULUW3 doesn't clobber the t-registers. + It can give a better register allocation result. 
*/ +#define __muluw3(a, b) \ + ({ \ + register UWtype __op0 asm ("a0") = a; \ + register UWtype __op1 asm ("a1") = b; \ + asm volatile (MULUW3 \ + : "+r" (__op0), "+r" (__op1) \ + : \ + : "ra", "a2", "a3"); \ + __op0; \ + }) +#endif /* __riscv_mul */ +#define umul_ppmm(w1, w0, u, v) \ + do { \ + UWtype __x0, __x1, __x2, __x3; \ + UHWtype __ul, __vl, __uh, __vh; \ + \ + __ul = __ll_lowpart (u); \ + __uh = __ll_highpart (u); \ + __vl = __ll_lowpart (v); \ + __vh = __ll_highpart (v); \ + \ + __x0 = __muluw3 (__ul, __vl); \ + __x1 = __muluw3 (__ul, __vh); \ + __x2 = __muluw3 (__uh, __vl); \ + __x3 = __muluw3 (__uh, __vh); \ + \ + __x1 += __ll_highpart (__x0);/* this can't give carry */ \ + __x1 += __x2; /* but this indeed can */ \ + if (__x1 < __x2) /* did we get it? */ \ + __x3 += __ll_B; /* yes, add it in the proper pos. */ \ + \ + (w1) = __x3 + __ll_highpart (__x1); \ + (w0) = __ll_lowpart (__x1) * __ll_B + __ll_lowpart (__x0); \ + } while (0) +#endif /* __riscv */ + #if defined(__sh__) && W_TYPE_SIZE == 32 #ifndef __sh1__ #define umul_ppmm(w1, w0, u, v) \ -- 2.7.4