From: Mateja Marjanovic <mateja.marjano...@rt-rk.com> Optimize set of MSA instructions ILVOD, using directly tcg registers and performing logic on them instead of using helpers.
instr || before || after ====================================== ilvod.b || 117.50 ms || 24.99 ms ilvod.h || 93.16 ms || 24.27 ms ilvod.w || 119.90 ms || 24.81 ms ilvod.d || 43.01 ms || 20.69 ms Signed-off-by: Mateja Marjanovic <mateja.marjano...@rt-rk.com> --- target/mips/helper.h | 1 - target/mips/msa_helper.c | 7 --- target/mips/translate.c | 111 ++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 110 insertions(+), 9 deletions(-) diff --git a/target/mips/helper.h b/target/mips/helper.h index a6d687e..d162836 100644 --- a/target/mips/helper.h +++ b/target/mips/helper.h @@ -865,7 +865,6 @@ DEF_HELPER_5(msa_pckod_df, void, env, i32, i32, i32, i32) DEF_HELPER_5(msa_ilvl_df, void, env, i32, i32, i32, i32) DEF_HELPER_5(msa_ilvr_df, void, env, i32, i32, i32, i32) DEF_HELPER_5(msa_ilvev_df, void, env, i32, i32, i32, i32) -DEF_HELPER_5(msa_ilvod_df, void, env, i32, i32, i32, i32) DEF_HELPER_5(msa_vshf_df, void, env, i32, i32, i32, i32) DEF_HELPER_5(msa_srar_df, void, env, i32, i32, i32, i32) DEF_HELPER_5(msa_srlr_df, void, env, i32, i32, i32, i32) diff --git a/target/mips/msa_helper.c b/target/mips/msa_helper.c index c74e3cd..9e52a31 100644 --- a/target/mips/msa_helper.c +++ b/target/mips/msa_helper.c @@ -1206,13 +1206,6 @@ MSA_FN_DF(ilvr_df) MSA_FN_DF(ilvev_df) #undef MSA_DO -#define MSA_DO(DF) \ - do { \ - pwx->DF[2*i] = pwt->DF[2*i+1]; \ - pwx->DF[2*i+1] = pws->DF[2*i+1]; \ - } while (0) -MSA_FN_DF(ilvod_df) -#undef MSA_DO #undef MSA_LOOP_COND #define MSA_LOOP_COND(DF) \ diff --git a/target/mips/translate.c b/target/mips/translate.c index 5848f7a..10c5c55 100644 --- a/target/mips/translate.c +++ b/target/mips/translate.c @@ -28884,6 +28884,100 @@ static void gen_msa_bit(CPUMIPSState *env, DisasContext *ctx) tcg_temp_free_i32(tws); } +/* + * [MSA] ILVOD.B wd, ws, wt + * + * Vector Interleave Odd (byte data elements) + * + */ +static inline void gen_ilvod_b(CPUMIPSState *env, uint32_t wd, + uint32_t ws, uint32_t wt) +{ + TCGv_i64 t0 = tcg_temp_new_i64(); + TCGv_i64 t1 = tcg_temp_new_i64(); + const uint64_t mask = 0xff00ff00ff00ff00ULL; + + tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask); + tcg_gen_shri_i64(t1, t1, 8); + tcg_gen_andi_i64(t0, msa_wr_d[ws * 2], mask); + tcg_gen_or_i64(msa_wr_d[wd * 2], t1, t0); + + tcg_gen_andi_i64(t1, msa_wr_d[wt * 2 + 1], mask); + tcg_gen_shri_i64(t1, t1, 8); + tcg_gen_andi_i64(t0, msa_wr_d[ws * 2 + 1], mask); + tcg_gen_or_i64(msa_wr_d[wd * 2 + 1], t1, t0); + + tcg_temp_free_i64(t0); + tcg_temp_free_i64(t1); +} + +/* + * [MSA] ILVOD.H wd, ws, wt + * + * Vector Interleave Odd (halfword data elements) + * + */ +static inline void gen_ilvod_h(CPUMIPSState *env, uint32_t wd, + uint32_t ws, uint32_t wt) +{ + TCGv_i64 t0 = tcg_temp_new_i64(); + TCGv_i64 t1 = tcg_temp_new_i64(); + const uint64_t mask = 0xffff0000ffff0000ULL; + + tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask); + tcg_gen_shri_i64(t1, t1, 16); + tcg_gen_andi_i64(t0, msa_wr_d[ws * 2], mask); + tcg_gen_or_i64(msa_wr_d[wd * 2], t1, t0); + + tcg_gen_andi_i64(t1, msa_wr_d[wt * 2 + 1], mask); + tcg_gen_shri_i64(t1, t1, 16); + tcg_gen_andi_i64(t0, msa_wr_d[ws * 2 + 1], mask); + tcg_gen_or_i64(msa_wr_d[wd * 2 + 1], t1, t0); + + tcg_temp_free_i64(t0); + tcg_temp_free_i64(t1); +} + +/* + * [MSA] ILVOD.W wd, ws, wt + * + * Vector Interleave Odd (word data elements) + * + */ +static inline void gen_ilvod_w(CPUMIPSState *env, uint32_t wd, + uint32_t ws, uint32_t wt) +{ + TCGv_i64 t0 = tcg_temp_new_i64(); + TCGv_i64 t1 = tcg_temp_new_i64(); + const uint64_t mask = 0xffffffff00000000ULL; + + tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask); + tcg_gen_shri_i64(t1, t1, 32); + tcg_gen_andi_i64(t0, msa_wr_d[ws * 2], mask); + tcg_gen_or_i64(msa_wr_d[wd * 2], t1, t0); + + tcg_gen_andi_i64(t1, msa_wr_d[wt * 2 + 1], mask); + tcg_gen_shri_i64(t1, t1, 32); + tcg_gen_andi_i64(t0, msa_wr_d[ws * 2 + 1], mask); + tcg_gen_or_i64(msa_wr_d[wd * 2 + 1], t1, t0); + + tcg_temp_free_i64(t0); + tcg_temp_free_i64(t1); +} + +/* + * [MSA] ILVOD.D wd, ws, wt + * + * Vector Interleave Odd (double data elements) + * + */ +static inline void gen_ilvod_d(CPUMIPSState *env, uint32_t wd, + uint32_t ws, uint32_t wt) +{ + tcg_gen_mov_i64(msa_wr_d[wd * 2], msa_wr_d[wt * 2 + 1]); + tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], msa_wr_d[ws * 2 + 1]); +} + static void gen_msa_3r(CPUMIPSState *env, DisasContext *ctx) { #define MASK_MSA_3R(op) (MASK_MSA_MINOR(op) | (op & (0x7 << 23))) @@ -29055,7 +29149,22 @@ static void gen_msa_3r(CPUMIPSState *env, DisasContext *ctx) gen_helper_msa_mod_u_df(cpu_env, tdf, twd, tws, twt); break; case OPC_ILVOD_df: - gen_helper_msa_ilvod_df(cpu_env, tdf, twd, tws, twt); + switch (df) { + case DF_BYTE: + gen_ilvod_b(env, wd, ws, wt); + break; + case DF_HALF: + gen_ilvod_h(env, wd, ws, wt); + break; + case DF_WORD: + gen_ilvod_w(env, wd, ws, wt); + break; + case DF_DOUBLE: + gen_ilvod_d(env, wd, ws, wt); + break; + default: + assert(0); + } break; case OPC_DOTP_S_df: -- 2.7.4