On 3/14/20 4:12 PM, LIU Zhiwei wrote:
I am not sure whether I get it. In my opinion, the code should be modified like this:
/*
 * Averaging add, 8-bit, round-to-nearest-up: (a + b) >> 1, adding back
 * the single bit shifted out.  The int16_t intermediate cannot overflow
 * for 8-bit operands, and the rounded result always fits in int8_t.
 */
static inline int8_t aadd8_rnu(CPURISCVState *env, int8_t a, int8_t b)
{
    int16_t sum = (int16_t)a + (int16_t)b;
    return (int8_t)((sum >> 1) + (sum & 0x1));
}
/*
 * Averaging add, 8-bit, round-to-nearest-even: round up only when the
 * shifted-out bit is set AND the kept low bit is set (i.e. the halfway
 * case resolves toward the even result).
 */
static inline int8_t aadd8_rne(CPURISCVState *env, int8_t a, int8_t b)
{
    int16_t sum = (int16_t)a + (int16_t)b;
    int16_t half = sum >> 1;
    if ((sum & 0x3) == 0x3) {
        half += 1;
    }
    return (int8_t)half;
}
/*
 * Averaging add, 8-bit, round-down (toward minus infinity): the
 * arithmetic right shift already truncates toward -inf, so no rounding
 * increment is needed.
 */
static inline int8_t aadd8_rdn(CPURISCVState *env, int8_t a, int8_t b)
{
    return (int8_t)(((int16_t)a + (int16_t)b) >> 1);
}
/*
 * Averaging add, 8-bit, round-to-odd (jam): if a bit was shifted out and
 * the kept low bit is clear, bump the result so it becomes odd; if the
 * result is already odd, leave it alone.
 */
static inline int8_t aadd8_rod(CPURISCVState *env, int8_t a, int8_t b)
{
    int16_t sum = (int16_t)a + (int16_t)b;
    int16_t half = sum >> 1;
    if ((sum & 0x3) == 0x1) {
        half += 1;
    }
    return (int8_t)half;
}
/*
 * Instantiate one byte-element vaadd.vv op per rounding mode
 * (rnu/rne/rdn/rod).  NOTE(review): RVVCALL and OPIVV2_ENV are defined
 * elsewhere; presumably each expands the scalar function into a
 * per-element op with the OPIVV2_ENV calling convention — confirm.
 */
RVVCALL(OPIVV2_ENV, vaadd_vv_b_rnu, OP_SSS_B, H1, H1, H1, aadd8_rnu)
RVVCALL(OPIVV2_ENV, vaadd_vv_b_rne, OP_SSS_B, H1, H1, H1, aadd8_rne)
RVVCALL(OPIVV2_ENV, vaadd_vv_b_rdn, OP_SSS_B, H1, H1, H1, aadd8_rdn)
RVVCALL(OPIVV2_ENV, vaadd_vv_b_rod, OP_SSS_B, H1, H1, H1, aadd8_rod)
/*
 * Generic vector-vector element loop, threading CPURISCVState through to
 * the per-element op so it can read env->vxrm.  Elements masked off by v0
 * (when vm == 0) are skipped; tail elements past vl are cleared by clearfn.
 *
 * Fix: the original called clear_fn(...) — the *typedef* name — instead of
 * the parameter clearfn, which does not compile.  Also replaced the
 * obscure "if (i != 0)" guard with the equivalent, clearer "vl != 0"
 * (the loop always exits with i == vl).
 */
void do_vext_vv_env(void *vd, void *v0, void *vs1,
                    void *vs2, CPURISCVState *env, uint32_t desc,
                    uint32_t esz, uint32_t dsz,
                    opivv2_fn *fn, clear_fn *clearfn)
{
    uint32_t vlmax = vext_maxsz(desc) / esz;
    uint32_t mlen = vext_mlen(desc);
    uint32_t vm = vext_vm(desc);
    uint32_t vl = env->vl;
    uint32_t i;

    for (i = 0; i < vl; i++) {
        if (!vm && !vext_elem_mask(v0, mlen, i)) {
            continue;
        }
        fn(vd, vs1, vs2, i, env);
    }
    /* Only clear the tail when at least one element was in range. */
    if (vl != 0) {
        clearfn(vd, vl, vl * dsz, vlmax * dsz);
    }
}
/*
 * Emit a helper that dispatches on env->vxrm to one of four
 * per-rounding-mode op functions (suffixes _rnu/_rne/_rdn/_rod).
 *
 * Fixes to the original: the fns[4] initializer was missing its
 * terminating ';', and "return <void expression>;" in a void function is
 * a C constraint violation (C11 6.8.6.4) — the call stands alone now.
 */
#define GEN_VEXT_VV_ENV(NAME, ESZ, DSZ, CLEAR_FN)             \
void HELPER(NAME)(void *vd, void *v0, void *vs1,              \
                  void *vs2, CPURISCVState *env,              \
                  uint32_t desc)                              \
{                                                             \
    static opivv2_fn * const fns[4] = {                       \
        NAME##_rnu, NAME##_rne,                               \
        NAME##_rdn, NAME##_rod                                \
    };                                                        \
    do_vext_vv_env(vd, v0, vs1, vs2, env, desc,               \
                   ESZ, DSZ, fns[env->vxrm],                  \
                   CLEAR_FN);                                 \
}
Is that right?
While that does look good for this case, there are many other uses of
get_round(), and it may not be quite as simple there.
My suggestion was
/*
 * Averaging add for up-to-32-bit signed elements: (a + b) >> 1, rounded
 * according to vxrm via get_round().  The 64-bit intermediate cannot
 * overflow for 32-bit operands.
 */
static inline int32_t aadd32(int vxrm, int32_t a, int32_t b)
{
    int64_t sum = (int64_t)a + b;
    return (int32_t)((sum >> 1) + get_round(vxrm, sum, 1));
}
/*
 * Averaging add for 64-bit elements.  The sum cannot be widened, so the
 * true bit 64 is reconstructed from the signed-overflow condition.
 *
 * Fix: "a + b" on int64_t can overflow, which is undefined behavior in
 * standard C; perform the addition in unsigned arithmetic (identical bit
 * pattern, well-defined wraparound) and convert back.
 */
static inline int64_t aadd64(int vxrm, int64_t a, int64_t b)
{
    int64_t res = (int64_t)((uint64_t)a + (uint64_t)b);
    uint8_t round = get_round(vxrm, res, 1);
    /* Overflow occurred iff a and b agree in sign but res does not. */
    int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
    /* With signed overflow, bit 64 is the inverse of bit 63. */
    return ((res >> 1) ^ over) + round;
}
/*
 * Wire the scalar ops into per-element-width helpers.  The 8/16/32-bit
 * variants all share aadd32, whose 64-bit intermediate is wide enough for
 * any of them; only the 64-bit variant needs aadd64's overflow handling.
 * NOTE(review): RVVCALL / OPIVV2_RM / OP_SSS_* / H* are defined elsewhere.
 */
RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
/*
 * Inner element loop: apply fn to every active element, passing the
 * (ideally compile-time-constant) rounding mode vxrm straight through.
 * When vm == 0, elements whose mask bit in v0 is clear are skipped.
 */
static inline void
vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
             uint32_t vl, uint32_t vm, uint32_t mlen, int vxrm,
             opivv2_rm_fn *fn)
{
    uint32_t i;

    for (i = 0; i < vl; i++) {
        if (vm || vext_elem_mask(v0, mlen, i)) {
            fn(vd, vs1, vs2, i, vxrm);
        }
    }
}
/*
 * Outer dispatcher: switch on env->vxrm and call vext_vv_rm_1 with a
 * *constant* rounding mode in each arm, so the constant propagates down
 * the inline call chain into get_round() and the rounding expressions
 * fold at compile time.  Tail elements past vl are then cleared.
 *
 * Fix: the original called clear_fn(...) — the typedef name — instead of
 * the parameter clearfn, which does not compile.
 */
static inline void
vext_vv_rm_2(void *vd, void *v0, void *vs1,
             void *vs2, CPURISCVState *env, uint32_t desc,
             uint32_t esz, uint32_t dsz,
             opivv2_rm_fn *fn, clear_fn *clearfn)
{
    uint32_t vlmax = vext_maxsz(desc) / esz;
    uint32_t mlen = vext_mlen(desc);
    uint32_t vm = vext_vm(desc);
    uint32_t vl = env->vl;

    if (vl == 0) {
        return;
    }
    switch (env->vxrm) {
    case 0: /* rnu */
        vext_vv_rm_1(vd, v0, vs1, vs2,
                     vl, vm, mlen, 0, fn);
        break;
    case 1: /* rne */
        vext_vv_rm_1(vd, v0, vs1, vs2,
                     vl, vm, mlen, 1, fn);
        break;
    case 2: /* rdn */
        vext_vv_rm_1(vd, v0, vs1, vs2,
                     vl, vm, mlen, 2, fn);
        break;
    default: /* rod */
        vext_vv_rm_1(vd, v0, vs1, vs2,
                     vl, vm, mlen, 3, fn);
        break;
    }
    clearfn(vd, vl, vl * dsz, vlmax * dsz);
}
From vext_vv_rm_2, a constant is passed down through all of the inline functions, so
that a constant arrives in get_round() at the bottom of the call chain. At
that point all of the expressions get folded by the compiler, and we *should*
get very similar generated code to what you have above.