On 3/14/20 4:12 PM, LIU Zhiwei wrote:
I am not sure whether I get it. In my opinion, the code should be modified like this:
/*
 * Averaging add, 8-bit, round-to-nearest-up: (a + b) >> 1, adding back
 * the single bit shifted out.  The int16_t intermediate cannot overflow
 * for 8-bit operands, and the rounded result always fits in int8_t.
 */
static inline int8_t aadd8_rnu(CPURISCVState *env, int8_t a, int8_t b)
{
    int16_t sum = (int16_t)a + (int16_t)b;
    return (int8_t)((sum >> 1) + (sum & 0x1));
}
/*
 * Averaging add, 8-bit, round-to-nearest-even: round up only when the
 * shifted-out bit is set AND the kept low bit is set (i.e. the halfway
 * case resolves toward the even result).
 */
static inline int8_t aadd8_rne(CPURISCVState *env, int8_t a, int8_t b)
{
    int16_t sum = (int16_t)a + (int16_t)b;
    int16_t half = sum >> 1;
    if ((sum & 0x3) == 0x3) {
        half += 1;
    }
    return (int8_t)half;
}
/*
 * Averaging add, 8-bit, round-down (toward minus infinity): the
 * arithmetic right shift already truncates toward -inf, so no rounding
 * increment is needed.
 */
static inline int8_t aadd8_rdn(CPURISCVState *env, int8_t a, int8_t b)
{
    return (int8_t)(((int16_t)a + (int16_t)b) >> 1);
}
/*
 * Averaging add, 8-bit, round-to-odd (jam): if a bit was shifted out and
 * the kept low bit is clear, bump the result so it becomes odd; if the
 * result is already odd, leave it alone.
 */
static inline int8_t aadd8_rod(CPURISCVState *env, int8_t a, int8_t b)
{
    int16_t sum = (int16_t)a + (int16_t)b;
    int16_t half = sum >> 1;
    if ((sum & 0x3) == 0x1) {
        half += 1;
    }
    return (int8_t)half;
}
/*
 * Instantiate one byte-element vaadd.vv op per rounding mode
 * (rnu/rne/rdn/rod).  NOTE(review): RVVCALL and OPIVV2_ENV are defined
 * elsewhere; presumably each expands the scalar function into a
 * per-element op with the OPIVV2_ENV calling convention — confirm.
 */
RVVCALL(OPIVV2_ENV, vaadd_vv_b_rnu, OP_SSS_B, H1, H1, H1, aadd8_rnu)
RVVCALL(OPIVV2_ENV, vaadd_vv_b_rne, OP_SSS_B, H1, H1, H1, aadd8_rne)
RVVCALL(OPIVV2_ENV, vaadd_vv_b_rdn, OP_SSS_B, H1, H1, H1, aadd8_rdn)
RVVCALL(OPIVV2_ENV, vaadd_vv_b_rod, OP_SSS_B, H1, H1, H1, aadd8_rod)
/*
 * Generic vector-vector element loop, threading CPURISCVState through to
 * the per-element op so it can read env->vxrm.  Elements masked off by v0
 * (when vm == 0) are skipped; tail elements past vl are cleared by clearfn.
 *
 * Fix: the original called clear_fn(...) — the *typedef* name — instead of
 * the parameter clearfn, which does not compile.  Also replaced the
 * obscure "if (i != 0)" guard with the equivalent, clearer "vl != 0"
 * (the loop always exits with i == vl).
 */
void do_vext_vv_env(void *vd, void *v0, void *vs1,
                    void *vs2, CPURISCVState *env, uint32_t desc,
                    uint32_t esz, uint32_t dsz,
                    opivv2_fn *fn, clear_fn *clearfn)
{
    uint32_t vlmax = vext_maxsz(desc) / esz;
    uint32_t mlen = vext_mlen(desc);
    uint32_t vm = vext_vm(desc);
    uint32_t vl = env->vl;
    uint32_t i;

    for (i = 0; i < vl; i++) {
        if (!vm && !vext_elem_mask(v0, mlen, i)) {
            continue;
        }
        fn(vd, vs1, vs2, i, env);
    }
    /* Only clear the tail when at least one element was in range. */
    if (vl != 0) {
        clearfn(vd, vl, vl * dsz, vlmax * dsz);
    }
}
/*
 * Emit a helper that dispatches on env->vxrm to one of four
 * per-rounding-mode op functions (suffixes _rnu/_rne/_rdn/_rod).
 *
 * Fixes to the original: the fns[4] initializer was missing its
 * terminating ';', and "return <void expression>;" in a void function is
 * a C constraint violation (C11 6.8.6.4) — the call stands alone now.
 */
#define GEN_VEXT_VV_ENV(NAME, ESZ, DSZ, CLEAR_FN)             \
void HELPER(NAME)(void *vd, void *v0, void *vs1,              \
                  void *vs2, CPURISCVState *env,              \
                  uint32_t desc)                              \
{                                                             \
    static opivv2_fn * const fns[4] = {                       \
        NAME##_rnu, NAME##_rne,                               \
        NAME##_rdn, NAME##_rod                                \
    };                                                        \
    do_vext_vv_env(vd, v0, vs1, vs2, env, desc,               \
                   ESZ, DSZ, fns[env->vxrm],                  \
                   CLEAR_FN);                                 \
}
Is that right?
While that does look good for this case, there are many other uses of
get_round(), and it may not be quite as simple there.
My suggestion was
/*
 * Averaging add for up-to-32-bit signed elements: (a + b) >> 1, rounded
 * according to vxrm via get_round().  The 64-bit intermediate cannot
 * overflow for 32-bit operands.
 */
static inline int32_t aadd32(int vxrm, int32_t a, int32_t b)
{
    int64_t sum = (int64_t)a + b;
    return (int32_t)((sum >> 1) + get_round(vxrm, sum, 1));
}
/*
 * Averaging add for 64-bit elements.  The sum cannot be widened, so the
 * true bit 64 is reconstructed from the signed-overflow condition.
 *
 * Fix: "a + b" on int64_t can overflow, which is undefined behavior in
 * standard C; perform the addition in unsigned arithmetic (identical bit
 * pattern, well-defined wraparound) and convert back.
 */
static inline int64_t aadd64(int vxrm, int64_t a, int64_t b)
{
    int64_t res = (int64_t)((uint64_t)a + (uint64_t)b);
    uint8_t round = get_round(vxrm, res, 1);
    /* Overflow occurred iff a and b agree in sign but res does not. */
    int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
    /* With signed overflow, bit 64 is the inverse of bit 63. */
    return ((res >> 1) ^ over) + round;
}
/*
 * Wire the scalar ops into per-element-width helpers.  The 8/16/32-bit
 * variants all share aadd32, whose 64-bit intermediate is wide enough for
 * any of them; only the 64-bit variant needs aadd64's overflow handling.
 * NOTE(review): RVVCALL / OPIVV2_RM / OP_SSS_* / H* are defined elsewhere.
 */
RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
/*
 * Inner element loop: apply fn to every active element, passing the
 * (ideally compile-time-constant) rounding mode vxrm straight through.
 * When vm == 0, elements whose mask bit in v0 is clear are skipped.
 */
static inline void
vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
             uint32_t vl, uint32_t vm, uint32_t mlen, int vxrm,
             opivv2_rm_fn *fn)
{
    uint32_t i;

    for (i = 0; i < vl; i++) {
        if (vm || vext_elem_mask(v0, mlen, i)) {
            fn(vd, vs1, vs2, i, vxrm);
        }
    }
}
/*
 * Outer dispatcher: switch on env->vxrm and call vext_vv_rm_1 with a
 * *constant* rounding mode in each arm, so the constant propagates down
 * the inline call chain into get_round() and the rounding expressions
 * fold at compile time.  Tail elements past vl are then cleared.
 *
 * Fix: the original called clear_fn(...) — the typedef name — instead of
 * the parameter clearfn, which does not compile.
 */
static inline void
vext_vv_rm_2(void *vd, void *v0, void *vs1,
             void *vs2, CPURISCVState *env, uint32_t desc,
             uint32_t esz, uint32_t dsz,
             opivv2_rm_fn *fn, clear_fn *clearfn)
{
    uint32_t vlmax = vext_maxsz(desc) / esz;
    uint32_t mlen = vext_mlen(desc);
    uint32_t vm = vext_vm(desc);
    uint32_t vl = env->vl;

    if (vl == 0) {
        return;
    }
    switch (env->vxrm) {
    case 0: /* rnu */
        vext_vv_rm_1(vd, v0, vs1, vs2,
                     vl, vm, mlen, 0, fn);
        break;
    case 1: /* rne */
        vext_vv_rm_1(vd, v0, vs1, vs2,
                     vl, vm, mlen, 1, fn);
        break;
    case 2: /* rdn */
        vext_vv_rm_1(vd, v0, vs1, vs2,
                     vl, vm, mlen, 2, fn);
        break;
    default: /* rod */
        vext_vv_rm_1(vd, v0, vs1, vs2,
                     vl, vm, mlen, 3, fn);
        break;
    }
    clearfn(vd, vl, vl * dsz, vlmax * dsz);
}
From vext_vv_rm_2, a constant is passed down through all of the inline functions, so
that a constant arrives in get_round() at the bottom of the call chain. At
that point all of the expressions get folded by the compiler, and we *should*
get very similar generated code to what you have above.