Hi, Richard. On 07/23/2021 09:29 AM, Richard Henderson wrote: > On 7/20/21 11:53 PM, Song Gao wrote: >> This patch implement fixed point bit instruction translation. >> >> This includes: >> - EXT.W.{B/H} >> - CL{O/Z}.{W/D}, CT{O/Z}.{W/D} >> - BYTEPICK.{W/D} >> - REVB.{2H/4H/2W/D} >> - REVH.{2W/D} >> - BITREV.{4B/8B}, BITREV.{W/D} >> - BSTRINS.{W/D}, BSTRPICK.{W/D} >> - MASKEQZ, MASKNEZ >> >> Signed-off-by: Song Gao <gaos...@loongson.cn> >> --- >> target/loongarch/helper.h | 10 + >> target/loongarch/insns.decode | 45 +++ >> target/loongarch/op_helper.c | 119 ++++++++ >> target/loongarch/trans.inc.c | 665 >> ++++++++++++++++++++++++++++++++++++++++++ >> 4 files changed, 839 insertions(+) >> >> diff --git a/target/loongarch/helper.h b/target/loongarch/helper.h >> index 6c7e19b..bbbcc26 100644 >> --- a/target/loongarch/helper.h >> +++ b/target/loongarch/helper.h >> @@ -8,3 +8,13 @@ >> DEF_HELPER_3(raise_exception_err, noreturn, env, i32, int) >> DEF_HELPER_2(raise_exception, noreturn, env, i32) >> + >> +DEF_HELPER_2(cto_w, tl, env, tl) >> +DEF_HELPER_2(ctz_w, tl, env, tl) >> +DEF_HELPER_2(cto_d, tl, env, tl) >> +DEF_HELPER_2(ctz_d, tl, env, tl) > > The count leading and trailing zero operations are built into tcg. Count > leading and trailing one simply needs a NOT operation to convert it to zero. >
My understanding is this: cto -> NOT operation (tcg_gen_not_tl) -> ctz, is that right? >> +DEF_HELPER_2(bitrev_w, tl, env, tl) >> +DEF_HELPER_2(bitrev_d, tl, env, tl) > > These should use TCG_CALL_NO_RWG_SE. > >> +target_ulong helper_bitrev_w(CPULoongArchState *env, target_ulong rj) >> +{ >> + int32_t v = (int32_t)rj; >> + const int SIZE = 32; >> + uint8_t bytes[SIZE]; >> + >> + int i; >> + for (i = 0; i < SIZE; i++) { >> + bytes[i] = v & 0x1; >> + v = v >> 1; >> + } >> + /* v == 0 */ >> + for (i = 0; i < SIZE; i++) { >> + v = v | ((uint32_t)bytes[i] << (SIZE - 1 - i)); >> + } >> + >> + return (target_ulong)(int32_t)v; >> +} > > return (int32_t)revbit32(rj); > > OK. >> +target_ulong helper_bitrev_d(CPULoongArchState *env, target_ulong rj) >> +{ >> + uint64_t v = rj; >> + const int SIZE = 64; >> + uint8_t bytes[SIZE]; >> + >> + int i; >> + for (i = 0; i < SIZE; i++) { >> + bytes[i] = v & 0x1; >> + v = v >> 1; >> + } >> + /* v == 0 */ >> + for (i = 0; i < SIZE; i++) { >> + v = v | ((uint64_t)bytes[i] << (SIZE - 1 - i)); >> + } >> + >> + return (target_ulong)v; >> +} > > return revbit64(rj); > OK. >> +static inline target_ulong bitswap(target_ulong v) >> +{ >> + v = ((v >> 1) & (target_ulong)0x5555555555555555ULL) | >> + ((v & (target_ulong)0x5555555555555555ULL) << 1); >> + v = ((v >> 2) & (target_ulong)0x3333333333333333ULL) | >> + ((v & (target_ulong)0x3333333333333333ULL) << 2); >> + v = ((v >> 4) & (target_ulong)0x0F0F0F0F0F0F0F0FULL) | >> + ((v & (target_ulong)0x0F0F0F0F0F0F0F0FULL) << 4); >> + return v; >> +} >> + >> +target_ulong helper_loongarch_dbitswap(target_ulong rj) >> +{ >> + return bitswap(rj); >> +} >> + >> +target_ulong helper_loongarch_bitswap(target_ulong rt) >> +{ >> + return (int32_t)bitswap(rt); >> +} > > I assume these are for the bitrev.4b and bitrev.8b insns? > It would be better to name them correctly. > > Yes. 
>> +/* Fixed point bit operation instruction translation */ >> +static bool trans_ext_w_h(DisasContext *ctx, arg_ext_w_h *a) >> +{ >> + TCGv t0; >> + TCGv Rd = cpu_gpr[a->rd]; >> + >> + if (a->rd == 0) { >> + /* Nop */ >> + return true; >> + } >> + >> + t0 = get_gpr(a->rj); >> + >> + tcg_gen_ext16s_tl(Rd, t0); > > Again, you should have a common routine for handling these unary operations. > OK. >> +static bool trans_clo_w(DisasContext *ctx, arg_clo_w *a) >> +{ >> + TCGv Rd = cpu_gpr[a->rd]; >> + >> + if (a->rd == 0) { >> + /* Nop */ >> + return true; >> + } >> + >> + gen_load_gpr(Rd, a->rj); >> + >> + tcg_gen_not_tl(Rd, Rd); >> + tcg_gen_ext32u_tl(Rd, Rd); >> + tcg_gen_clzi_tl(Rd, Rd, TARGET_LONG_BITS); >> + tcg_gen_subi_tl(Rd, Rd, TARGET_LONG_BITS - 32); > > So, you're actually using the tcg builtins here, and the helper you created > isn't used. > Yes. >> +static bool trans_cto_w(DisasContext *ctx, arg_cto_w *a) >> +{ >> + TCGv t0; >> + TCGv Rd = cpu_gpr[a->rd]; >> + >> + if (a->rd == 0) { >> + /* Nop */ >> + return true; >> + } >> + >> + t0 = tcg_temp_new(); >> + gen_load_gpr(t0, a->rj); >> + >> + gen_helper_cto_w(Rd, cpu_env, t0); > > Here you should have used the tcg builtin. > OK. >> +static bool trans_ctz_w(DisasContext *ctx, arg_ctz_w *a) >> +{ >> + TCGv t0; >> + TCGv Rd = cpu_gpr[a->rd]; >> + >> + if (a->rd == 0) { >> + /* Nop */ >> + return true; >> + } >> + >> + t0 = tcg_temp_new(); >> + gen_load_gpr(t0, a->rj); >> + >> + gen_helper_ctz_w(Rd, cpu_env, t0); > > Likewise. 
> >> +static bool trans_revb_2w(DisasContext *ctx, arg_revb_2w *a) >> +{ >> + TCGv_i64 t0, t1, t2; >> + TCGv Rd = cpu_gpr[a->rd]; >> + >> + if (a->rd == 0) { >> + /* Nop */ >> + return true; >> + } >> + >> + t0 = tcg_temp_new_i64(); >> + t1 = tcg_temp_new_i64(); >> + t2 = get_gpr(a->rj); >> + >> + gen_load_gpr(t0, a->rd); >> + >> + tcg_gen_ext32u_i64(t1, t2); >> + tcg_gen_bswap32_i64(t0, t1); >> + tcg_gen_shri_i64(t1, t2, 32); >> + tcg_gen_bswap32_i64(t1, t1); >> + tcg_gen_concat32_i64(Rd, t0, t1); > > tcg_gen_bswap64_i64(Rd, Rj) > tcg_gen_rotri_i64(Rd, Rd, 32); > OK. >> +static bool trans_bytepick_d(DisasContext *ctx, arg_bytepick_d *a) >> +{ >> + TCGv t0; >> + TCGv Rd = cpu_gpr[a->rd]; >> + >> + if (a->rd == 0) { >> + /* Nop */ >> + return true; >> + } >> + >> + t0 = tcg_temp_new(); >> + >> + check_loongarch_64(ctx); >> + if (a->sa3 == 0 || ((a->sa3) * 8) == 64) { >> + if (a->sa3 == 0) { >> + gen_load_gpr(t0, a->rk); >> + } else { >> + gen_load_gpr(t0, a->rj); >> + } >> + tcg_gen_mov_tl(Rd, t0); >> + } else { >> + TCGv t1 = tcg_temp_new(); >> + >> + gen_load_gpr(t0, a->rk); >> + gen_load_gpr(t1, a->rj); >> + >> + tcg_gen_shli_tl(t0, t0, ((a->sa3) * 8)); >> + tcg_gen_shri_tl(t1, t1, 64 - ((a->sa3) * 8)); >> + tcg_gen_or_tl(Rd, t1, t0); >> + >> + tcg_temp_free(t1); >> + } > > tcg_gen_extract2_i64(Rd, Rk, Rj, a->sa3 * 8); > OK. Thank you for your kind help. Thanks, Song Gao.